#%%
import os
import re
import textwrap

import pandas as pd
from lxml import etree
import matplotlib.pyplot as plt
import mwxml  # pip install mwxml

# UPDATE THIS PATH TO YOUR OWN FILE
f = r"D:\OneDrive\OneDrive - UNSW\Code\wiki-stats\WRL+Wiki-20250519003348.xml"

OUTPUT_DIR = 'csv'

PROJECTS_TEAM = [  # Ordered by first entry
    'Anderson',
    'Robert',
    'Brett',
    'Conrad',
    'Matt',
    'Duncan',
    'Ianc',
    'William',
    'Laurent',
    'Alexandra',
    'Wendy',
    'Alessio',
    'Luis',
    'Grantley',
    'Beatrice',
    'Sarah',
    'Rossm',
    'ROBERT',
    'Steven',
    'Wendyth',
    'Melissa',
    'Andy',
    'Michael',
    'Msa',
    'Jamie',
    'Toms',
    'Sam',
    'Larry',
    'Annakg',
    'Hamish',
    'Francois',
    'Annab',
    'Erica',
    'Coral',
    'Priom',
    'Barry',
    'Nathan',
    'Chrisd',
    'Andrewf',
    'Joshuas',
    'Daniel',
    'Danh',
    'Duncanr',
    'Robertt',
    'Chrisdu',
    'Brettm',
    'Mathieud',
    'Ianco',
    'Larryp',
    'Grantleys',
    'Aliceh',
    'Mattb',
    'Tobyt',
    'Benm',
    'Jamess',
    'Douga',
    'Gabil',
    'Francoisf',
    'Lluna',
]


def main(xml_name):
    revisions = []
    with open(xml_name, 'rb') as f:
        pages = mwxml.Dump.from_file(f)
        print("Processing pages...")
        for page in pages:
            title = page.title
            for revision in page:
                timestamp = str(revision.timestamp)
                contributor = revision.user.text
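                # Assumption: every revision in this export has a visible
                # contributor, so no None check is made on revision.user here.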
                # One record per revision; the 'user'/'page' keys are inferred
                # from the DataFrame columns used further below
                revisions.append({
                    'user': contributor,
                    'page': title,
                    'date': timestamp
                })

    df = pd.DataFrame(revisions)
    df.index = pd.to_datetime(df['date'])

    # Get the latest year from the dataset
    LATEST_YEAR = df.index.year.max()
    # Get script directory
    SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
    # Define figure output directory
    FIGURE_DIR = os.path.join(SCRIPT_DIR, f"{LATEST_YEAR}_Figures")
    os.makedirs(FIGURE_DIR, exist_ok=True)

    # Convert to local time, and round to the nearest day
    df = df.tz_convert('Australia/Sydney')
    df = df.sort_index()
    df['date'] = df.index.date

    # Drop duplicate entries (ignore edits for same user/same page/same day)
    df = df.drop_duplicates()
    df = df.drop(columns=['date'])
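    # After the drop_duplicates()/drop('date') above, each remaining row is one
    # (user, page, day) combination, so every count below is "days on which a
    # user edited a page" rather than a raw revision count.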

    ################################################################################
    # Initialize DataFrame to track page edits by year
    page_edits_by_year = pd.DataFrame()

    # Only include non-academic users
    df = df[df['user'].isin(PROJECTS_TEAM)]

    # Get all unique pages
    all_pages = df['page'].unique()

    # Get list of years
    years = df.index.year.unique()

    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # Save per-year page and user edit counts as CSV
    for year in years:
        idx = df.index.year == year
        pages = df[idx].groupby('page').count()
        pages = pages.rename(columns={'user': 'edits'})
        pages = pages.sort_values('edits', ascending=False)
        users = df[idx].groupby('user').count()
        users = users.rename(columns={'page': 'edits'})
        users = users.sort_values('edits', ascending=False)
        pages.to_csv(f'{OUTPUT_DIR}/{year}-pages.csv')
        users.to_csv(f'{OUTPUT_DIR}/{year}-users.csv')

    for year in years:
        # Filter data to only include rows from the current year
        year_df = df[df.index.year == year]
        # Count each page's contributions in the current year
        page_counts = year_df['page'].value_counts().reindex(all_pages, fill_value=0)
        # Add the page counts for this year as a new column
        page_edits_by_year[year] = page_counts

    page_edits_by_year = page_edits_by_year.replace(0, pd.NA)

    # Sort pages based on the most recent year's edit counts in descending order
    most_recent_year = years.max()
    page_edits_by_year = page_edits_by_year.sort_values(by=most_recent_year, ascending=False)
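    # sort_values() places pd.NA last by default (na_position='last'), so pages
    # with no edits in the most recent year fall to the bottom of the table.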

    ################################################################################
    summary = []
    user_edits_by_year = pd.DataFrame()

    # Get all unique users across all years
    all_users = df['user'].unique()

    for year in years:
        # Filter the DataFrame to only include rows from the current year
        year_df = df[df.index.year == year]
        # Count each user's contributions in the current year
        user_counts = year_df['user'].value_counts().reindex(all_users, fill_value=0)

        # # Count how many unique pages each user edited (matching page logic)
        # user_counts = (
        #     year_df.groupby('user')['page']
        #     .nunique()
        #     .reindex(all_users, fill_value=0)
        # )

        # Add the user counts for this year as a new column
        user_edits_by_year[year] = user_counts

        # Count pages and active users for summary
        pages = year_df['page'].nunique()  # Count of unique pages edited in the year
        active_users = user_counts[user_counts > 0].count()  # Count of users with edits in this year

        summary.append({
            'year': year,
            'page edits': pages,
            'active users': active_users
        })

    # Convert summary to DataFrame
    summary = pd.DataFrame(summary)
    # print("Summary DataFrame before setting index:", summary)
    if 'year' in summary.columns:
        summary = summary.set_index('year')
    else:
        raise KeyError("The 'year' column is missing from the summary DataFrame.")

    user_edits_by_year = user_edits_by_year.replace(0, pd.NA)

    # Sort users based on edits in the most recent year
    most_recent_year = years.max()
    user_edits_by_year = user_edits_by_year.sort_values(by=most_recent_year, ascending=False)
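    # user_edits_by_year now holds one row per user and one column per year;
    # values count distinct user/page/day events, with pd.NA marking years in
    # which that user made no edits.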

    # Save user edits by year as CSV
    user_edits_by_year.to_csv(f'{OUTPUT_DIR}/user_edits_by_year.csv')

    print("Creating summary plot...")
    fig, ax = plt.subplots(2,
                           1,
                           figsize=(6, 4),
                           sharex=True,
                           gridspec_kw={'hspace': 0.5})
    summary[['page edits']].plot.bar(ax=ax[0], legend=False)
    summary[['active users']].plot.bar(ax=ax[1], legend=False)

    j = 0
    for i, row in summary.iterrows():
        ax[0].annotate(row['page edits'],
                       xy=(j, row['page edits']),
                       xytext=(0, 6),
                       textcoords='offset points',  # assumed: offsets the label 6 points above each bar
                       fontsize=8)
        j += 1

    ax[0].set_title('Page edits', fontsize=10, y=0.9)
    ax[1].set_title('Active users', fontsize=10, y=0.9)
    ax[1].set_xlabel('')
    ax[0].set_ylabel('Count', labelpad=10)
    ax[1].set_ylabel('Count', labelpad=10)

    for a in ax.ravel():
        a.spines['top'].set_visible(False)
        a.spines['right'].set_visible(False)

    png_name = os.path.join(FIGURE_DIR, 'summary_bar_chart.png')
    plt.savefig(png_name, bbox_inches='tight', dpi=300)
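    # Note (suggestion): when several dump files are processed in one run,
    # calling plt.close(fig) here would keep figures from accumulating in memory.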

    # ------------------------------------------------------------------
    print("Creating user edits table...")

    # Select last 5 years
    latest_5_years = sorted(years)[-5:]
    user_table = user_edits_by_year[latest_5_years].copy()

    # Drop users with all NaNs (i.e., no activity in these 5 years)
    user_table = user_table.dropna(how='all')
    user_table = user_table.fillna(0).astype(int)

    # Sort by latest year, then previous years
    user_table = user_table.sort_values(by=latest_5_years[::-1], ascending=False)

    # Keep only top 13 users
    user_table = user_table.head(13)

    # Reset index so 'user' becomes a column
    user_table = user_table.reset_index()
    user_table.columns = ['User'] + [str(year) for year in latest_5_years]

    # Plot table
    fig, ax = plt.subplots(figsize=(10, 4))
    ax.axis('off')
    tbl = ax.table(cellText=user_table.values,
                   colLabels=user_table.columns,
                   cellLoc='center',
                   loc='center')
    tbl.auto_set_font_size(False)
    tbl.set_fontsize(9)
    tbl.scale(1, 1.5)

    # Make column header text bold
    for col in range(len(user_table.columns)):
        header_cell = tbl[(0, col)]
        header_cell.set_text_props(weight='bold')

    # Save figure
    table_png_name = os.path.join(FIGURE_DIR, 'top_users_table.png')
    plt.savefig(table_png_name, bbox_inches='tight', dpi=300)

    # -------------------------
    # Page Edits Table (Last 3 Years)
    # -------------------------
    print("Creating page edits table...")

    latest_3_years = sorted(years)[-3:]
    page_table = page_edits_by_year[latest_3_years].fillna(0).astype(int).copy()

    # Sort pages by latest year, then previous years
    page_table = page_table.sort_values(by=latest_3_years[::-1], ascending=False)

    # Keep only top 20 pages
    page_table = page_table.head(20)

    # Reset index so 'page' becomes a column
    page_table = page_table.reset_index()
    page_table.columns = ['Page'] + [str(year) for year in latest_3_years]

    # Define a max character width per line (adjust as needed)
    WRAP_WIDTH = 50

    # Wrap page titles
    page_table['Page'] = page_table['Page'].apply(
        lambda title: '\n'.join(textwrap.wrap(title, width=WRAP_WIDTH))
    )

    # Plot table
    fig, ax = plt.subplots(figsize=(10, 4))
    ax.axis('off')
    tbl = ax.table(cellText=page_table.values,
                   colLabels=page_table.columns,
                   cellLoc='center',
                   loc='center')

    # Dynamically set column widths
    num_cols = len(page_table.columns)
    col_widths = [0.40] + [0.60 / (num_cols - 1)] * (num_cols - 1)  # 40% for "Page", rest split evenly
    for i, width in enumerate(col_widths):
        for row in range(len(page_table) + 1):  # +1 includes header row
            cell = tbl[(row, i)]
            cell.set_width(width)

    tbl.auto_set_font_size(False)
    tbl.set_fontsize(9)
    tbl.scale(1, 1.5)

    # Make column header text bold
    for col in range(len(page_table.columns)):
        header_cell = tbl[(0, col)]
        header_cell.set_text_props(weight='bold')

    # Adjust row heights to allow wrapped text to be visible
    num_rows = len(page_table) + 1  # include header
    row_height = 1.0 / num_rows
    for row in range(num_rows):
        for col in range(len(page_table.columns)):
            tbl[(row, col)].set_height(row_height * 2)  # tweak multiplier as needed

    # Save figure
    page_table_png_name = os.path.join(FIGURE_DIR, 'top_pages_table.png')
    plt.savefig(page_table_png_name, bbox_inches='tight', dpi=300)


if __name__ == '__main__':
    xml_names = [f for f in os.listdir('.') if f.endswith('.xml')]
    for xml_name in xml_names:
        main(xml_name)

# %%
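
# %%
# Optional quick check (assumed usage, not part of the processing above):
# run a single export directly, e.g. the file configured in `f`, instead of
# every .xml file in the working directory.
# main(f)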