From 7a410bfde8c94eae03208de169f68c308596ad97 Mon Sep 17 00:00:00 2001
From: Jonathan Chan <z5079346@ad.unsw.edu.au>
Date: Mon, 19 May 2025 12:25:50 +1000
Subject: [PATCH] Updated by JWC following WRL wiki migration to AWS WRL Web
 Services

---
 README.md     |  22 +--
 wiki_stats.py | 439 +++++++++++++++++++++++++++++++-------------------
 2 files changed, 287 insertions(+), 174 deletions(-)

diff --git a/README.md b/README.md
index 99595f1..189772f 100644
--- a/README.md
+++ b/README.md
@@ -1,21 +1,25 @@
 # wiki-usage
 
-Generate usage stats on a mediawiki site.
+Generate usage stats on a MediaWiki site, specifically outputting the following three figures:
+1. Bar chart showing page edits and active users per year
+2. Table showing contributions per user over the past 5 years
+3. Table showing page edits over the past 3 years
 
-## Usage
+Note: 'page edits' in the bar chart counts only the number of unique pages edited. For example, if multiple users make multiple edits to a page in a given year, that page still adds just 1 to the tally. Conversely, the user contributions table counts every unique user/page/day combination. For example, if a user edits the same page on three different days, their contribution tally is 3.
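+
+The difference can be shown with a minimal pandas sketch (the page and user names below are made up):
+
+```python
+import pandas as pd
+
+# Hypothetical history: one user edits the same page on three different days
+revisions = pd.DataFrame({
+    'page': ['Flume manual', 'Flume manual', 'Flume manual'],
+    'user': ['Alice', 'Alice', 'Alice'],
+    'date': ['2024-01-01', '2024-02-01', '2024-03-01'],
+})
+
+# Bar chart tally: unique pages edited -> 1
+print(revisions['page'].nunique())
+
+# Contributions tally: one count per unique user/page/day -> 3
+print(len(revisions.drop_duplicates(['user', 'page', 'date'])))
+```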
 
-1.  Get list of all wiki pages with the script `wiki_pages.py`
+## Usage
 
-2.  Go to this wiki page:  
-    <http://wiki.wrl.unsw.edu.au/index.php/Special:Export>
+1.  Go to this wiki page:  
+    <https://wiki.wrl.unsw.edu.au/index.php?title=Special:AllPages>
 
-3.  Paste the list of page titles.
-4.  Uncheck the box 'Include only the current revision, not the full history'.
-5.  Click export.
+2.  Highlight all the page titles, then copy and paste them into the "Add pages manually" text box at <https://wiki.wrl.unsw.edu.au/index.php/Special:Export>. Note that you will need to click 'Next page' and repeat the copy/paste to capture ALL of the pages.
+3.  Uncheck the box 'Include only the current revision, not the full history' and check the box 'Save as file'.
+4.  Click export.
 
     ![](docs/export.png)
 
-6.  Move the downloaded xml file into the same folder as `wiki-stats.py`, and run the script.
+5.  Move the downloaded xml file into the same folder as `wiki_stats.py`.
+6.  Update the filepath to this .xml file in the script (and, if needed, the list of project engineers), then run it. Figures will be output in a folder called `<YEAR>_Figures`.
 
 ## Sample output
 
diff --git a/wiki_stats.py b/wiki_stats.py
index 78e6b65..a0a8a90 100644
--- a/wiki_stats.py
+++ b/wiki_stats.py
@@ -1,175 +1,284 @@
+
+#%%
 import os
-import re
 import pandas as pd
-from lxml import etree
 import matplotlib.pyplot as plt
 import mwxml  # pip install mwxml
 
+# UPDATE THIS PATH TO YOUR OWN FILE
+f = r"D:\OneDrive\OneDrive - UNSW\Code\wiki-stats\WRL+Wiki-20250519003348.xml"
+
 OUTPUT_DIR = 'csv'
-PROJECTS_TEAM = [  # Oderered by first entry
-    'Anderson',
-    'Robert',
-    'Brett',
-    'Conrad',
-    'Matt',
-    'Duncan',
-    'Ianc',
-    'William',
-    'Laurent',
-    'Alexandra',
-    'Wendy',
-    'Alessio',
-    'Luis',
-    'Grantley',
-    'Beatrice',
-    'Sarah',
-    'Rossm',
-    'ROBERT',
-    'Steven',
-    'Wendyth',
-    'Melissa',
-    'Andy',
-    'Michael',
-    'Msa',
-    'Jamie',
-    'Toms',
-    'Sam',
-    'Larry',
-    'Annakg',
-    'Hamish',
-    'Francois',
-    'Annab',
-    'Erica',
-    'Coral',
-    'Priom',
-    'Barry',
-    'Nathan',
-    'Chrisd',
-    'Andrewf',
-    'Joshuas',
-    'Daniel',
-    'Danh',
-    'Duncanr',
-    'Robertt',
-    'Chrisdu',
-    'Brettm',
-    'Mathieud',
-    'Ianco',
-    'Larryp',
-    'Grantleys',
-    'Aliceh',
-    'Mattb',
-    'Tobyt',
-    'Benm',
-    'Jamess',
-    'Douga',
-    'Gabil',
-    'Francoisf',
-    'Lluna',
-]
-
-
-def main(xml_name):
-    revisions = []
-    with open(xml_name, 'rb') as f:
-        pages = mwxml.Dump.from_file(f)
-        for page in pages:
-            title = page.title
-            for revision in page:
-                timestamp = str(revision.timestamp)
-                contributor = revision.user.text
-
-                revisions.append({
-                    'page': title,
-                    'user': contributor,
-                    'date': timestamp
-                })
-
-    df = pd.DataFrame(revisions)
-    df.index = pd.to_datetime(df['date'])
-
-    # Convert to local time, and round to nearest day
-    df = df.tz_convert('Australia/Sydney')
-    df = df.sort_index()
-    df['date'] = df.index.date
-
-    # Drop duplicate entries (ignore edits for same user/same page/same day)
-    df = df.drop_duplicates()
-    df = df.drop(columns=['date'])
-
-    # Only include non-academic users
-    df = df[df['user'].isin(PROJECTS_TEAM)]
-
-    # Get list of years
-    years = df.index.year.unique()
-
-    os.makedirs(OUTPUT_DIR, exist_ok=True)
-
-    summary = []
-    for year in years:
-        idx = df.index.year == year
-        pages = df[idx].groupby('page').count()
-        pages = pages.rename(columns={'user': 'edits'})
-        pages = pages.sort_values('edits', ascending=False)
-
-        users = df[idx].groupby('user').count()
-        users = users.rename(columns={'page': 'edits'})
-        users = users.sort_values('edits', ascending=False)
-
-        pages.to_csv(f'{OUTPUT_DIR}/{year}-pages.csv')
-        users.to_csv(f'{OUTPUT_DIR}/{year}-users.csv')
-
-        summary.append({
-            'year': year,
-            'page edits': pages.shape[0],
-            'active users': users.shape[0]
+
+pages = mwxml.Dump.from_file(open(f, 'rb'))  # pass a file object; mwxml streams pages from it
+print("Processing pages...")
+revisions = []
+for page in pages:
+    title = page.title
+
+    for revision in page:
+        timestamp = str(revision.timestamp)
+        contributor = revision.user.text
+
+        revisions.append({
+            'page': title,
+            'user': contributor,
+            'date': timestamp
         })
 
-    summary = pd.DataFrame(summary)
+
+df = pd.DataFrame(revisions)
+df.index = pd.to_datetime(df['date'])
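+# Dump timestamps are ISO 8601 UTC strings ending in 'Z', so the index is
+# timezone-aware and tz_convert below works without a prior tz_localize.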
+
+
+# Get the latest year from the dataset
+LATEST_YEAR = df.index.year.max()
+
+# Get script directory
+SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+
+# Define figure output directory
+FIGURE_DIR = os.path.join(SCRIPT_DIR, f"{LATEST_YEAR}_Figures")
+os.makedirs(FIGURE_DIR, exist_ok=True)
+
+
+# Convert to local time, and round to nearest day
+df = df.tz_convert('Australia/Sydney')
+df = df.sort_index()
+df['date'] = df.index.date
+
+# Drop duplicate entries (ignore repeat edits by the same user to the same page
+# on the same day): drop_duplicates() compares only the page/user/day columns,
+# not the timestamp index.
+df = df.drop_duplicates()
+df = df.drop(columns=['date'])
+
+# Get list of years
+years = df.index.year.unique()
+
+os.makedirs(OUTPUT_DIR, exist_ok=True)
+
+################################################################################
+# Initialize DataFrame to track page edits by year
+page_edits_by_year = pd.DataFrame()
+
+# Get all unique pages
+all_pages = df['page'].unique()
+
+for year in years:
+    # Filter data to only include rows from the current year
+    year_df = df[df.index.year == year]
+
+    # Count each page's contributions in the current year
+    page_counts = year_df['page'].value_counts().reindex(all_pages, fill_value=0)
+
+    # Add the page counts for this year as a new column
+    page_edits_by_year[year] = page_counts
+
+page_edits_by_year = page_edits_by_year.replace(0, pd.NA)
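+# Zeros become <NA> so pages with no edits in a year are treated as missing and
+# sort below any real counts.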
+
+# Sort pages based on the most recent year's edit counts in descending order
+most_recent_year = years.max()
+page_edits_by_year = page_edits_by_year.sort_values(by=most_recent_year, ascending=False)
+
+################################################################################
+
+summary = []
+user_edits_by_year = pd.DataFrame()
+
+# Get all unique users across all years
+all_users = df['user'].unique()
+
+for year in years:
+    # Filter the DataFrame to only include rows from the current year
+    year_df = df[df.index.year == year]
+
+    # Count each user's contributions in the current year
+    user_counts = year_df['user'].value_counts().reindex(all_users, fill_value=0)
+
+    # # Count how many unique pages each user edited (matching page logic)
+    # user_counts = (
+    #     year_df.groupby('user')['page']
+    #     .nunique()
+    #     .reindex(all_users, fill_value=0)
+    # )
+
+    # Add the user counts for this year as a new column
+    user_edits_by_year[year] = user_counts
+
+    # Count pages and active users for summary
+    pages = year_df['page'].nunique()  # Count of unique pages edited in the year
+    active_users = user_counts[user_counts > 0].count()  # Count users with edits in this year
+    
+    summary.append({
+        'year': year,
+        'page edits': pages,
+        'active users': active_users
+    })
+
+# Convert summary to DataFrame
+summary = pd.DataFrame(summary)
+#print("Summary DataFrame before setting index:", summary)
+
+if 'year' in summary.columns:
     summary = summary.set_index('year')
+else:
+    raise KeyError("The 'year' column is missing from the summary DataFrame.")
+
+
+user_edits_by_year = user_edits_by_year.replace(0, pd.NA)
+
+# Sort users based on edits in the most recent year
+most_recent_year = years.max()
+user_edits_by_year = user_edits_by_year.sort_values(by=most_recent_year, ascending=False)
+
+# Save user edits by year as CSV
+user_edits_by_year.to_csv(f'{OUTPUT_DIR}/user_edits_by_year.csv')
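+# <NA> cells are written as empty fields by to_csv, so years in which a user
+# made no edits appear blank in the CSV.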
+
+print("Creating summary plot...")
+fig, ax = plt.subplots(2,
+                       1,
+                       figsize=(6, 4),
+                       sharex=True,
+                       gridspec_kw={'hspace': 0.5})
+
+summary[['page edits']].plot.bar(ax=ax[0], legend=False)
+summary[['active users']].plot.bar(ax=ax[1], legend=False)
+
+for j, (_, row) in enumerate(summary.iterrows()):
+    ax[0].annotate(row['page edits'],
+                   xy=(j, row['page edits']),
+                   xytext=(0, 6),
+                   textcoords='offset pixels',
+                   ha='center',
+                   fontsize=8)
+
+    ax[1].annotate(row['active users'],
+                   xy=(j, row['active users']),
+                   xytext=(0, 6),
+                   textcoords='offset pixels',
+                   ha='center',
+                   fontsize=8)
+
+ax[0].set_title('Page edits', fontsize=10, y=0.9)
+ax[1].set_title('Active users', fontsize=10, y=0.9)
+
+ax[1].set_xlabel('')
+ax[0].set_ylabel('Count', labelpad=10)
+ax[1].set_ylabel('Count', labelpad=10)
+
+for a in ax.ravel():
+    a.spines['top'].set_visible(False)
+    a.spines['right'].set_visible(False)
+
+png_name = os.path.join(FIGURE_DIR, 'summary_bar_chart.png')
+
+
+plt.savefig(png_name, bbox_inches='tight', dpi=300)
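+# Optionally, plt.close(fig) here frees the summary figure when the whole
+# script is run non-interactively.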
+
+
+#------------------------------------------------------------------
+print("Creating user edits table...")
+# Select last 5 years
+latest_5_years = sorted(years)[-5:]
+
+# Drop users with no activity in these 5 years, then show zeros for their
+# remaining inactive years
+user_table = user_edits_by_year[latest_5_years].dropna(how='all').fillna(0).astype(int)
+
+# Sort by latest year, then previous years
+user_table = user_table.sort_values(by=latest_5_years[::-1], ascending=False)
+
+# Keep only top 13 users
+user_table = user_table.head(13)
+
+# Reset index so 'user' becomes a column
+user_table = user_table.reset_index()
+user_table.columns = ['User'] + [str(year) for year in latest_5_years]
+
+# Plot table
+fig, ax = plt.subplots(figsize=(10, 4))
+ax.axis('off')
+tbl = ax.table(cellText=user_table.values,
+               colLabels=user_table.columns,
+               cellLoc='center',
+               loc='center')
+
+tbl.auto_set_font_size(False)
+tbl.set_fontsize(9)
+tbl.scale(1, 1.5)
+
+# Make column header text bold
+for col in range(len(user_table.columns)):
+    header_cell = tbl[(0, col)]
+    header_cell.set_text_props(weight='bold')
+
+# Save figure
+table_png_name = os.path.join(FIGURE_DIR, 'top_users_table.png')
+plt.savefig(table_png_name, bbox_inches='tight', dpi=300)
+
+# -------------------------
+# Page Edits Table (Last 3 Years)
+# -------------------------
+print("Creating page edits table...")
+import textwrap
+
+latest_3_years = sorted(years)[-3:]
+page_table = page_edits_by_year[latest_3_years].fillna(0).astype(int).copy()
+
+# Sort pages by latest year, then previous years
+page_table = page_table.sort_values(by=latest_3_years[::-1], ascending=False)
+
+# Keep only top 20 pages
+page_table = page_table.head(20)
+
+# Reset index so 'page' becomes a column
+page_table = page_table.reset_index()
+page_table.columns = ['Page'] + [str(year) for year in latest_3_years]
+
+# Define a max character width per line (adjust as needed)
+WRAP_WIDTH = 50
+
+# Wrap page titles
+page_table['Page'] = page_table['Page'].apply(
+    lambda title: '\n'.join(textwrap.wrap(title, width=WRAP_WIDTH))
+)
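+# textwrap.wrap() above breaks long titles into lines of at most WRAP_WIDTH
+# characters; the newlines render as wrapped text in the 'Page' column below.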
+
+# Plot table
+fig, ax = plt.subplots(figsize=(10, 4))
+ax.axis('off')
+tbl = ax.table(cellText=page_table.values,
+               colLabels=page_table.columns,
+               cellLoc='center',
+               loc='center')
+
+# Dynamically set column widths
+num_cols = len(page_table.columns)
+col_widths = [0.40] + [0.60 / (num_cols - 1)] * (num_cols - 1)  # 40% for "Page", rest split evenly
+
+for i, width in enumerate(col_widths):
+    for row in range(len(page_table) + 1):  # +1 includes header row
+        cell = tbl[(row, i)]
+        cell.set_width(width)
+
+tbl.auto_set_font_size(False)
+tbl.set_fontsize(9)
+tbl.scale(1, 1.5)
+
+# Make column header text bold
+for col in range(len(page_table.columns)):
+    header_cell = tbl[(0, col)]
+    header_cell.set_text_props(weight='bold')
+
+# Adjust row heights to allow wrapped text to be visible
+num_rows = len(page_table) + 1  # include header
+row_height = 1.0 / num_rows
+for row in range(num_rows):
+    for col in range(len(page_table.columns)):
+        tbl[(row, col)].set_height(row_height * 2)  # tweak multiplier as needed
+
+# Save figure
+page_table_png_name = os.path.join(FIGURE_DIR, 'top_pages_table.png')
+plt.savefig(page_table_png_name, bbox_inches='tight', dpi=300)
 
-    fig, ax = plt.subplots(2,
-                           1,
-                           figsize=(6, 4),
-                           sharex=True,
-                           gridspec_kw={'hspace': 0.5})
-
-    summary[['page edits']].plot.bar(ax=ax[0], legend=False)
-    summary[['active users']].plot.bar(ax=ax[1], legend=False)
-
-    j = 0
-    for i, row in summary.iterrows():
-        ax[0].annotate(row['page edits'],
-                       xy=(j, row['page edits']),
-                       xytext=(0, 6),
-                       textcoords='offset pixels',
-                       ha='center',
-                       fontsize=8)
-
-        ax[1].annotate(row['active users'],
-                       xy=(j, row['active users']),
-                       xytext=(0, 6),
-                       textcoords='offset pixels',
-                       ha='center',
-                       fontsize=8)
-        j += 1
-
-    ax[0].set_title('Page edits', fontsize=10, y=0.9)
-    ax[1].set_title('Active users', fontsize=10, y=0.9)
-
-    ax[1].set_xlabel('')
-    ax[0].set_ylabel('Count', labelpad=10)
-    ax[1].set_ylabel('Count', labelpad=10)
-
-    for a in ax.ravel():
-        a.spines['top'].set_visible(False)
-        a.spines['right'].set_visible(False)
-        png_name = xml_name.replace('.xml', '.png')
-
-    plt.savefig(png_name, bbox_inches='tight', dpi=300)
-
-
-if __name__ == '__main__':
-    xml_names = [f for f in os.listdir('.') if f.endswith('.xml')]
-    for xml_name in xml_names:
-        main(xml_name)
+# %%