From 7a410bfde8c94eae03208de169f68c308596ad97 Mon Sep 17 00:00:00 2001
From: Jonathan Chan <z5079346@ad.unsw.edu.au>
Date: Mon, 19 May 2025 12:25:50 +1000
Subject: [PATCH] Updated by JWC following WRL wiki migration to AWS WRL Web Services

---
 README.md     |  22 +--
 wiki_stats.py | 439 +++++++++++++++++++++++++++++++-------------------
 2 files changed, 287 insertions(+), 174 deletions(-)

diff --git a/README.md b/README.md
index 99595f1..189772f 100644
--- a/README.md
+++ b/README.md
@@ -1,21 +1,25 @@
 # wiki-usage
 
-Generate usage stats on a mediawiki site.
+Generate usage stats on a mediawiki site, specifically outputting the following 3 figures:
+1. Bar chart showing page edits and active users per year
+2. Table showing user contributions per user over the past 5 years
+3. Table showing page edits over the past 3 years
 
-## Usage
+Note: 'page edits' in the bar chart counts only the number of unique pages that have been edited. For example, if multiple users make multiple edits to a page, that page still adds just 1 to the page edit tally. In contrast, the user contributions table counts one edit per user, per page, per day. For example, if a user makes 3 edits to the same page on three different days, that user's contribution tally will be 3.
 
-1. Get list of all wiki pages with the script `wiki_pages.py`
+## Usage
 
-2. Go to this wiki page:
-   <http://wiki.wrl.unsw.edu.au/index.php/Special:Export>
+1. Go to this wiki page:
+   <https://wiki.wrl.unsw.edu.au/index.php?title=Special:AllPages>
 
-3. Paste the list of page titles.
-4. Uncheck the box 'Include only the current revision, not the full history'.
-5. Click export.
+2. Highlight all the pages, then copy/paste the list of page titles into the "Add pages manually" text box here: <https://wiki.wrl.unsw.edu.au/index.php/Special:Export>. Note that you'll need to click 'Next page' to copy/paste ALL of the pages.
+3. Uncheck the box 'Include only the current revision, not the full history' and check the box 'Save as file'.
+4. Click 'Export'.
 
-6. Move the downloaded xml file into the same folder as `wiki-stats.py`, and run the script.
+5. Move the downloaded .xml file into the same folder as `wiki_stats.py`.
+6. Update the filepath to this .xml file in the script. You may also need to update the list of project engineers. Then run the script.
+Figures will be output in a folder called `<YEAR>_Figures`
 
 ## Sample output
 
diff --git a/wiki_stats.py b/wiki_stats.py
index 78e6b65..a0a8a90 100644
--- a/wiki_stats.py
+++ b/wiki_stats.py
@@ -1,175 +1,284 @@
+
+#%%
 import os
-import re
 import pandas as pd
-from lxml import etree
 import matplotlib.pyplot as plt
 import mwxml  # pip install mwxml
 
+# UPDATE THIS PATH TO YOUR OWN FILE
+f = r"D:\OneDrive\OneDrive - UNSW\Code\wiki-stats\WRL+Wiki-20250519003348.xml"
+
 OUTPUT_DIR = 'csv'
 
-PROJECTS_TEAM = [  # Oderered by first entry
-    'Anderson',
-    'Robert',
-    'Brett',
-    'Conrad',
-    'Matt',
-    'Duncan',
-    'Ianc',
-    'William',
-    'Laurent',
-    'Alexandra',
-    'Wendy',
-    'Alessio',
-    'Luis',
-    'Grantley',
-    'Beatrice',
-    'Sarah',
-    'Rossm',
-    'ROBERT',
-    'Steven',
-    'Wendyth',
-    'Melissa',
-    'Andy',
-    'Michael',
-    'Msa',
-    'Jamie',
-    'Toms',
-    'Sam',
-    'Larry',
-    'Annakg',
-    'Hamish',
-    'Francois',
-    'Annab',
-    'Erica',
-    'Coral',
-    'Priom',
-    'Barry',
-    'Nathan',
-    'Chrisd',
-    'Andrewf',
-    'Joshuas',
-    'Daniel',
-    'Danh',
-    'Duncanr',
-    'Robertt',
-    'Chrisdu',
-    'Brettm',
-    'Mathieud',
-    'Ianco',
-    'Larryp',
-    'Grantleys',
-    'Aliceh',
-    'Mattb',
-    'Tobyt',
-    'Benm',
-    'Jamess',
-    'Douga',
-    'Gabil',
-    'Francoisf',
-    'Lluna',
-]
-
-
-def main(xml_name):
-    revisions = []
-    with open(xml_name, 'rb') as f:
-        pages = mwxml.Dump.from_file(f)
-        for page in pages:
-            title = page.title
-            for revision in page:
-                timestamp = str(revision.timestamp)
-                contributor = revision.user.text
-
-                revisions.append({
-                    'page': title,
-                    'user': contributor,
-                    'date': timestamp
-                })
-
-    df = pd.DataFrame(revisions)
-    df.index = pd.to_datetime(df['date'])
-
-    # Convert to local time, and round to nearest day
-    df = df.tz_convert('Australia/Sydney')
-    df = df.sort_index()
-    df['date'] = df.index.date
-
-    # Drop duplicate entries (ignore edits for same user/same page/same day)
-    df = df.drop_duplicates()
-    df = df.drop(columns=['date'])
-
-    # Only include non-academic users
-    df = df[df['user'].isin(PROJECTS_TEAM)]
-
-    # Get list of years
-    years = df.index.year.unique()
-
-    os.makedirs(OUTPUT_DIR, exist_ok=True)
-
-    summary = []
-    for year in years:
-        idx = df.index.year == year
-        pages = df[idx].groupby('page').count()
-        pages = pages.rename(columns={'user': 'edits'})
-        pages = pages.sort_values('edits', ascending=False)
-
-        users = df[idx].groupby('user').count()
-        users = users.rename(columns={'page': 'edits'})
-        users = users.sort_values('edits', ascending=False)
-
-        pages.to_csv(f'{OUTPUT_DIR}/{year}-pages.csv')
-        users.to_csv(f'{OUTPUT_DIR}/{year}-users.csv')
-
-        summary.append({
-            'year': year,
-            'page edits': pages.shape[0],
-            'active users': users.shape[0]
+
+pages = mwxml.Dump.from_file(f)
+print("Processing pages...")
+revisions = []
+for page in pages:
+    title = page.title
+
+    for revision in page:
+        timestamp = str(revision.timestamp)
+        contributor = revision.user.text
+
+        revisions.append({
+            'page': title,
+            'user': contributor,
+            'date': timestamp
         })
-    summary = pd.DataFrame(summary)
+
+df = pd.DataFrame(revisions)
+df.index = pd.to_datetime(df['date'])
+
+
+# Get the latest year from the dataset
+LATEST_YEAR = df.index.year.max()
+
+# Get script directory
+SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+
+# Define figure output directory
+FIGURE_DIR = os.path.join(SCRIPT_DIR, f"{LATEST_YEAR}_Figures")
+os.makedirs(FIGURE_DIR, exist_ok=True)
+
+
+# Convert to local time, and round to nearest day
+df = df.tz_convert('Australia/Sydney')
+df = df.sort_index()
+df['date'] = df.index.date
+
+# Drop duplicate entries (ignore edits for same user/same page/same day)
+df = df.drop_duplicates()
+df = df.drop(columns=['date'])
+
+# Get list of years
+years = df.index.year.unique()
+
+os.makedirs(OUTPUT_DIR, exist_ok=True)
+
+################################################################################
+# Initialize DataFrame to track page edits by year
+page_edits_by_year = pd.DataFrame()
+
+# Get all unique pages
+all_pages = df['page'].unique()
+
+for year in years:
+    # Filter data to only include rows from the current year
+    year_df = df[df.index.year == year]
+
+    # Count each page's contributions in the current year
+    page_counts = year_df['page'].value_counts().reindex(all_pages, fill_value=0)
+
+    # Add the page counts for this year as a new column
+    page_edits_by_year[year] = page_counts
+
+page_edits_by_year = page_edits_by_year.replace(0, pd.NA)
+
+# Sort pages based on the most recent year's edit counts in descending order
+most_recent_year = years.max()
+page_edits_by_year = page_edits_by_year.sort_values(by=most_recent_year, ascending=False)
+
+################################################################################
+
+summary = []
+user_edits_by_year = pd.DataFrame()
+
+# Get all unique users across all years
+all_users = df['user'].unique()
+
+for year in years:
+    # Filter the DataFrame to only include rows from the current year
+    year_df = df[df.index.year == year]
+
+    # Count each user's contributions in the current year
+    user_counts = year_df['user'].value_counts().reindex(all_users, fill_value=0)
+
+    # # Count how many unique pages each user edited (matching page logic)
+    # user_counts = (
+    #     year_df.groupby('user')['page']
+    #     .nunique()
+    #     .reindex(all_users, fill_value=0)
+    # )
+
+    # Add the user counts for this year as a new column
+    user_edits_by_year[year] = user_counts
+
+    # Count pages and active users for summary
+    pages = year_df['page'].nunique()  # Count of unique pages edited in the year
+    active_users = user_counts[user_counts > 0].count()  # Count users with edits in this year
+
+    summary.append({
+        'year': year,
+        'page edits': pages,
+        'active users': active_users
+    })
+
+# Convert summary to DataFrame
+summary = pd.DataFrame(summary)
+#print("Summary DataFrame before setting index:", summary)
+
+if 'year' in summary.columns:
     summary = summary.set_index('year')
+else:
+    raise KeyError("The 'year' column is missing from the summary DataFrame.")
+
+
+user_edits_by_year = user_edits_by_year.replace(0, pd.NA)
+
+# Sort users based on edits in the most recent year
+most_recent_year = years.max()
+user_edits_by_year = user_edits_by_year.sort_values(by=most_recent_year, ascending=False)
+
+# Save user edits by year as CSV
+user_edits_by_year.to_csv(f'{OUTPUT_DIR}/user_edits_by_year.csv')
+
+print("Creating summary plot...")
+fig, ax = plt.subplots(2,
+                       1,
+                       figsize=(6, 4),
+                       sharex=True,
+                       gridspec_kw={'hspace': 0.5})
+
+summary[['page edits']].plot.bar(ax=ax[0], legend=False)
+summary[['active users']].plot.bar(ax=ax[1], legend=False)
+
+j = 0
+for i, row in summary.iterrows():
+    ax[0].annotate(row['page edits'],
+                   xy=(j, row['page edits']),
+                   xytext=(0, 6),
+                   textcoords='offset pixels',
+                   ha='center',
+                   fontsize=8)
+
+    ax[1].annotate(row['active users'],
+                   xy=(j, row['active users']),
+                   xytext=(0, 6),
+                   textcoords='offset pixels',
+                   ha='center',
+                   fontsize=8)
+    j += 1
+
+ax[0].set_title('Page edits', fontsize=10, y=0.9)
+ax[1].set_title('Active users', fontsize=10, y=0.9)
+
+ax[1].set_xlabel('')
+ax[0].set_ylabel('Count', labelpad=10)
+ax[1].set_ylabel('Count', labelpad=10)
+
+for a in ax.ravel():
+    a.spines['top'].set_visible(False)
+    a.spines['right'].set_visible(False)
+
+# Save the summary bar chart into the figures folder
+png_name = os.path.join(FIGURE_DIR, 'summary_bar_chart.png')
+
+plt.savefig(png_name, bbox_inches='tight', dpi=300)
+
+
+#------------------------------------------------------------------
+print("Creating user edits table...")
+# Select last 5 years
+latest_5_years = sorted(years)[-5:]
+user_table = user_edits_by_year[latest_5_years].copy()
+
+# Drop users with all NaNs (i.e., no activity in these 5 years), then fill gaps with 0
+user_table = user_table.dropna(how='all').fillna(0).astype(int)
+
+# Sort by latest year, then previous years
+user_table = user_table.sort_values(by=latest_5_years[::-1], ascending=False)
+
+# Keep only top 13 users
+user_table = user_table.head(13)
+
+# Reset index so 'user' becomes a column
+user_table = user_table.reset_index()
+user_table.columns = ['User'] + [str(year) for year in latest_5_years]
+
+# Plot table
+fig, ax = plt.subplots(figsize=(10, 4))
+ax.axis('off')
+tbl = ax.table(cellText=user_table.values,
+               colLabels=user_table.columns,
+               cellLoc='center',
+               loc='center')
+
+tbl.auto_set_font_size(False)
+tbl.set_fontsize(9)
+tbl.scale(1, 1.5)
+
+# Make column header text bold
+for col in range(len(user_table.columns)):
+    header_cell = tbl[(0, col)]
+    header_cell.set_text_props(weight='bold')
+
+# Save figure
+table_png_name = os.path.join(FIGURE_DIR, 'top_users_table.png')
+plt.savefig(table_png_name, bbox_inches='tight', dpi=300)
+
+# -------------------------
+# Page Edits Table (Last 3 Years)
+# -------------------------
+print("Creating page edits table...")
+import textwrap
+
+latest_3_years = sorted(years)[-3:]
+page_table = page_edits_by_year[latest_3_years].fillna(0).astype(int).copy()
+
+# Sort pages by latest year, then previous years
+page_table = page_table.sort_values(by=latest_3_years[::-1], ascending=False)
+
+# Keep only top 20 pages
+page_table = page_table.head(20)
+
+# Reset index so 'page' becomes a column
+page_table = page_table.reset_index()
+page_table.columns = ['Page'] + [str(year) for year in latest_3_years]
+
+# Define a max character width per line (adjust as needed)
+WRAP_WIDTH = 50
+
+# Wrap page titles
+page_table['Page'] = page_table['Page'].apply(
+    lambda title: '\n'.join(textwrap.wrap(title, width=WRAP_WIDTH))
+)
+
+# Plot table
+fig, ax = plt.subplots(figsize=(10, 4))
+ax.axis('off')
+tbl = ax.table(cellText=page_table.values,
+               colLabels=page_table.columns,
+               cellLoc='center',
+               loc='center')
+
+# Dynamically set column widths
+num_cols = len(page_table.columns)
+col_widths = [0.40] + [0.60 / (num_cols - 1)] * (num_cols - 1)  # 40% for "Page", rest split evenly
+
+for i, width in enumerate(col_widths):
+    for row in range(len(page_table) + 1):  # +1 includes header row
+        cell = tbl[(row, i)]
+        cell.set_width(width)
+
+tbl.auto_set_font_size(False)
+tbl.set_fontsize(9)
+tbl.scale(1, 1.5)
+
+# Make column header text bold
+for col in range(len(page_table.columns)):
+    header_cell = tbl[(0, col)]
+    header_cell.set_text_props(weight='bold')
+
+# Adjust row heights to allow wrapped text to be visible
+num_rows = len(page_table) + 1  # include header
+row_height = 1.0 / num_rows
+for row in range(num_rows):
+    for col in range(len(page_table.columns)):
+        tbl[(row, col)].set_height(row_height * 2)  # tweak multiplier as needed
+
+# Save figure
+page_table_png_name = os.path.join(FIGURE_DIR, 'top_pages_table.png')
+plt.savefig(page_table_png_name, bbox_inches='tight', dpi=300)
-    fig, ax = plt.subplots(2,
-                           1,
-                           figsize=(6, 4),
-                           sharex=True,
-                           gridspec_kw={'hspace': 0.5})
-
-    summary[['page edits']].plot.bar(ax=ax[0], legend=False)
-    summary[['active users']].plot.bar(ax=ax[1], legend=False)
-
-    j = 0
-    for i, row in summary.iterrows():
-        ax[0].annotate(row['page edits'],
-                       xy=(j, row['page edits']),
-                       xytext=(0, 6),
-                       textcoords='offset pixels',
-                       ha='center',
-                       fontsize=8)
-
-        ax[1].annotate(row['active users'],
-                       xy=(j, row['active users']),
-                       xytext=(0, 6),
-                       textcoords='offset pixels',
-                       ha='center',
-                       fontsize=8)
-        j += 1
-
-    ax[0].set_title('Page edits', fontsize=10, y=0.9)
-    ax[1].set_title('Active users', fontsize=10, y=0.9)
-
-    ax[1].set_xlabel('')
-    ax[0].set_ylabel('Count', labelpad=10)
-    ax[1].set_ylabel('Count', labelpad=10)
-
-    for a in ax.ravel():
-        a.spines['top'].set_visible(False)
-        a.spines['right'].set_visible(False)
-    png_name = xml_name.replace('.xml', '.png')
-
-    plt.savefig(png_name, bbox_inches='tight', dpi=300)
-
-
-if __name__ == '__main__':
-    xml_names = [f for f in os.listdir('.') if f.endswith('.xml')]
-    for xml_name in xml_names:
-        main(xml_name)
+# %%
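A minimal Python/pandas sketch of the counting rule described in the README note above: after converting timestamps to Sydney time, at most one contribution is kept per user, per page, per day, while the summary bar chart counts unique pages per year. The page titles and timestamps below are made up for illustration; only the column names and pandas steps mirror wiki_stats.py.

import pandas as pd

# Hypothetical revision data; the real script builds this from the mwxml dump.
revisions = pd.DataFrame({
    'page': ['Field gear', 'Field gear', 'Field gear', 'Boat safety'],
    'user': ['Annab', 'Annab', 'Annab', 'Toms'],
    'date': ['2024-03-01T00:10:00Z', '2024-03-01T05:00:00Z',
             '2024-03-02T00:30:00Z', '2024-03-02T01:00:00Z'],
})

# Same steps as wiki_stats.py: index on the timestamp, convert to local time,
# reduce each timestamp to a calendar date, then drop duplicate rows.
revisions.index = pd.to_datetime(revisions['date'])
revisions = revisions.tz_convert('Australia/Sydney')
revisions['date'] = revisions.index.date
deduped = revisions.drop_duplicates()

# Annab's two same-day edits to 'Field gear' collapse into one row,
# but her edit on the following day counts again.
print(deduped['user'].value_counts())  # user contributions: Annab 2, Toms 1
print(deduped['page'].nunique())       # unique pages edited: 2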