Updated by JWC following WRL wiki migration to AWS WRL Web Services

7 months ago · 7a410bfde8
parent 83f62d2820
commit 7a410bfde8
2 changed files with 287 additions and 174 deletions
--- a/README.md
+++ b/README.md
@ -1,21 +1,25 @@
 # wiki-usage
-Generate usage stats on a mediawiki site.
+Generate usage stats on a mediawiki site, specifically outputting the following 3 figures:
 1. Bar chart showing page edits and active users per year
 2. Table showing user contributions per user over the past 5 years
 3. Table showing page edits over the past 3 years
-## Usage
+Note: 'page edits' in the bar chart only counts the number of unique pages that have been edited. For example, if multiple users make multiple edits to a page, that page still adds just 1 to the page edit tally. Conversely, the user contributions table counts every unique edit to a page per day. For example, if a user edits the same page on three different days, the user contribution tally will be 3.
-1.  Get list of all wiki pages with the script `wiki_pages.py`
+## Usage
-2.  Go to this wiki page:  
+1.  Go to this wiki page:  
-    <http://wiki.wrl.unsw.edu.au/index.php/Special:Export>
+    <https://wiki.wrl.unsw.edu.au/index.php?title=Special:AllPages>
-3.  Paste the list of page titles.
+2.  Highlight all the pages, then copy/paste the list of page titles into the "Add pages manually" text box here: <https://wiki.wrl.unsw.edu.au/index.php/Special:Export>. Note: you'll need to click "Next page" and repeat the copy/paste to capture ALL of the pages.
-4.  Uncheck the box 'Include only the current revision, not the full history'.
+3.  Uncheck the box 'Include only the current revision, not the full history' and check the box 'Save as file'
-5.  Click export.
+4.  Click export.
    ![](docs/export.png)
-6.  Move the downloaded xml file into the same folder as `wiki-stats.py`, and run the script.
+5. Move the downloaded xml file into the same folder as `wiki_stats.py`.
 6. Update the filepath to this .xml in the script. You may also need to update the list of project engineers. Then run it. Figures will be output in a folder called `<YEAR>_Figures`
 ## Sample output
--- a/wiki_stats.py
+++ b/wiki_stats.py
@ -1,175 +1,284 @@
 #%%
 import os
 import re
 import pandas as pd
 from lxml import etree
 import matplotlib.pyplot as plt
 import mwxml  # pip install mwxml
 # UPDATE THIS PATH TO YOUR OWN FILE
 # Location of the exported wiki XML file that the script reads.
 f = r"D:\OneDrive\OneDrive - UNSW\Code\wiki-stats\WRL+Wiki-20250519003348.xml"
 # Folder (relative to the working directory) that receives the CSV output.
 OUTPUT_DIR = 'csv'
-PROJECTS_TEAM = [  # Oderered by first entry
+
-    'Anderson',
+pages = mwxml.Dump.from_file(f)
-    'Robert',
+print("Processing pages...")
-    'Brett',
+revisions = []
-    'Conrad',
+for page in pages:
-    'Matt',
+    title = page.title
-    'Duncan',
+
-    'Ianc',
+    for revision in page:
-    'William',
+        timestamp = str(revision.timestamp)
-    'Laurent',
+        contributor = revision.user.text
-    'Alexandra',
+
-    'Wendy',
+        revisions.append({
-    'Alessio',
+            'page': title,
-    'Luis',
+            'user': contributor,
-    'Grantley',
+            'date': timestamp
    'Beatrice',
    'Sarah',
    'Rossm',
    'ROBERT',
    'Steven',
    'Wendyth',
    'Melissa',
    'Andy',
    'Michael',
    'Msa',
    'Jamie',
    'Toms',
    'Sam',
    'Larry',
    'Annakg',
    'Hamish',
    'Francois',
    'Annab',
    'Erica',
    'Coral',
    'Priom',
    'Barry',
    'Nathan',
    'Chrisd',
    'Andrewf',
    'Joshuas',
    'Daniel',
    'Danh',
    'Duncanr',
    'Robertt',
    'Chrisdu',
    'Brettm',
    'Mathieud',
    'Ianco',
    'Larryp',
    'Grantleys',
    'Aliceh',
    'Mattb',
    'Tobyt',
    'Benm',
    'Jamess',
    'Douga',
    'Gabil',
    'Francoisf',
    'Lluna',
 ]
 def main(xml_name):
    """Collect per-year edit statistics from one wiki XML export.

    Every revision in the dump is reduced to (page, user, timestamp),
    de-duplicated to one row per user/page/local day, restricted to the
    users in ``PROJECTS_TEAM`` and then counted per year. Two CSV files per
    year are written under ``OUTPUT_DIR``.

    Args:
        xml_name: Path of the XML dump; it is opened in binary mode.
    """
    revisions = []
    with open(xml_name, 'rb') as f:
        pages = mwxml.Dump.from_file(f)
        for page in pages:
            title = page.title
            for revision in page:
                # Keep only the fields needed for counting.
                timestamp = str(revision.timestamp)
                # NOTE(review): assumes every revision has a user object;
                # confirm how mwxml reports revisions without one.
                contributor = revision.user.text
                revisions.append({
                    'page': title,
                    'user': contributor,
                    'date': timestamp
                })
    df = pd.DataFrame(revisions)
    # Index by the parsed timestamp so rows can be selected by year below.
    df.index = pd.to_datetime(df['date'])
    # Convert to local time; the calendar date is taken from the converted
    # index below, so "same day" means the same day in Sydney.
    df = df.tz_convert('Australia/Sydney')
    df = df.sort_index()
    df['date'] = df.index.date
    # Drop duplicate entries (ignore edits for same user/same page/same day)
    df = df.drop_duplicates()
    # The date column was only needed for the de-duplication above.
    df = df.drop(columns=['date'])
    # Only include non-academic users
    df = df[df['user'].isin(PROJECTS_TEAM)]
    # Get list of years
    years = df.index.year.unique()
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    summary = []
    for year in years:
        # Boolean mask selecting this year's rows.
        idx = df.index.year == year
        # One row per page; 'edits' is the number of remaining rows
        # (user/day combinations) for that page.
        pages = df[idx].groupby('page').count()
        pages = pages.rename(columns={'user': 'edits'})
        pages = pages.sort_values('edits', ascending=False)
        # One row per user; 'edits' is the number of remaining rows
        # (page/day combinations) for that user.
        users = df[idx].groupby('user').count()
        users = users.rename(columns={'page': 'edits'})
        users = users.sort_values('edits', ascending=False)
        pages.to_csv(f'{OUTPUT_DIR}/{year}-pages.csv')
        users.to_csv(f'{OUTPUT_DIR}/{year}-users.csv')
        # shape[0] is the number of distinct pages / users seen this year.
        summary.append({
            'year': year,
            'page edits': pages.shape[0],
            'active users': users.shape[0]
        })
-    summary = pd.DataFrame(summary)
+
 # Build one row per revision, indexed by the parsed revision timestamp.
 df = pd.DataFrame(revisions)
 df.index = pd.to_datetime(df['date'])
 # Convert to local time BEFORE anything is grouped by day or year, so edits
 # made close to midnight / New Year are attributed to the Sydney date.
 df = df.tz_convert('Australia/Sydney')
 df = df.sort_index()
 # Replace the raw timestamp string with the local calendar date; it is only
 # needed as a de-duplication key.
 df['date'] = df.index.date
 # Drop duplicate entries (ignore edits for same user/same page/same day)
 df = df.drop_duplicates()
 df = df.drop(columns=['date'])
 # Get list of years
 years = df.index.year.unique()
 # Get the latest year from the dataset. It is read from the local-time years
 # so the figure folder name always agrees with years.max() used further down
 # (reading it from the UTC index could be one year behind).
 LATEST_YEAR = years.max()
 # Get script directory
 SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
 # Define figure output directory
 FIGURE_DIR = os.path.join(SCRIPT_DIR, f"{LATEST_YEAR}_Figures")
 os.makedirs(FIGURE_DIR, exist_ok=True)
 os.makedirs(OUTPUT_DIR, exist_ok=True)
 ################################################################################
 # Build a pages x years table of edit counts, one column per year.
 page_edits_by_year = pd.DataFrame()
 # Every page that appears anywhere in the data; used as the common row index.
 all_pages = df['page'].unique()
 for year in years:
    # Rows whose (local) timestamp falls in this year
    year_df = df.loc[df.index.year == year]
    # value_counts only lists pages edited this year ...
    tallies = year_df['page'].value_counts()
    # ... so pad the remaining pages with 0 to keep every column aligned
    page_counts = tallies.reindex(all_pages, fill_value=0)
    page_edits_by_year[year] = page_counts
 # Blank out the zeros, then list the pages with the most edits in the most
 # recent year first.
 most_recent_year = years.max()
 page_edits_by_year = (
    page_edits_by_year
    .replace(0, pd.NA)
    .sort_values(by=most_recent_year, ascending=False)
 )
 ################################################################################
 # Build a users x years table of contribution counts and, alongside it, one
 # summary row per year (unique pages edited, users with at least one edit).
 summary_rows = []
 user_edits_by_year = pd.DataFrame()
 # Every user that appears anywhere in the data; used as the common row index.
 all_users = df['user'].unique()
 for year in years:
    # Rows whose (local) timestamp falls in this year
    year_df = df.loc[df.index.year == year]
    # Number of rows per user this year, padded with 0 for users who made no
    # edits so that every yearly column shares the same index.
    user_counts = year_df['user'].value_counts().reindex(all_users, fill_value=0)
    user_edits_by_year[year] = user_counts
    # Summary figures for the bar chart
    pages = year_df['page'].nunique()  # unique pages edited in the year
    active_users = user_counts[user_counts > 0].count()  # users with edits in the year
    summary_rows.append({
        'year': year,
        'page edits': pages,
        'active users': active_users
    })
 # Convert summary to DataFrame indexed by year; fail loudly if no rows were
 # collected and the 'year' column therefore does not exist.
 summary = pd.DataFrame(summary_rows)
 if 'year' not in summary.columns:
    raise KeyError("The 'year' column is missing from the summary DataFrame.")
 summary = summary.set_index('year')
 # Blank out the zeros, then list the users with the most edits in the most
 # recent year first.
 most_recent_year = years.max()
 user_edits_by_year = (
    user_edits_by_year
    .replace(0, pd.NA)
    .sort_values(by=most_recent_year, ascending=False)
 )
 # Save user edits by year as CSV
 user_edits_by_year.to_csv(f'{OUTPUT_DIR}/user_edits_by_year.csv')
 print("Creating summary plot...")
 # Two stacked bar charts sharing the year axis: page edits on top, active
 # users underneath.
 fig, ax = plt.subplots(2,
                        1,
                        figsize=(6, 4),
                        sharex=True,
                        gridspec_kw={'hspace': 0.5})
 summary[['page edits']].plot.bar(ax=ax[0], legend=False)
 summary[['active users']].plot.bar(ax=ax[1], legend=False)
 # Write each bar's value just above it. Bar number j (in the row order of
 # `summary`) is annotated at x = j on both charts.
 for j, (_, row) in enumerate(summary.iterrows()):
    for a, column in zip(ax, ('page edits', 'active users')):
        a.annotate(str(row[column]),
                   xy=(j, row[column]),
                   xytext=(0, 6),
                   textcoords='offset pixels',
                   ha='center',
                   fontsize=8)
 ax[0].set_title('Page edits', fontsize=10, y=0.9)
 ax[1].set_title('Active users', fontsize=10, y=0.9)
 ax[1].set_xlabel('')
 ax[0].set_ylabel('Count', labelpad=10)
 ax[1].set_ylabel('Count', labelpad=10)
 # Remove the top and right frame lines from both charts.
 for a in ax.ravel():
    a.spines['top'].set_visible(False)
    a.spines['right'].set_visible(False)
 # The output name does not depend on the axes, so it is set once here rather
 # than inside the loop above.
 png_name = os.path.join(FIGURE_DIR, 'summary_bar_chart.png')
 plt.savefig(png_name, bbox_inches='tight', dpi=300)
 #------------------------------------------------------------------
 print("Creating user edits table...")
 # Select last 5 years
 latest_5_years = sorted(years)[-5:]
 # Missing counts become 0 so the table can show plain integers.
 user_table = user_edits_by_year[latest_5_years].fillna(0).astype(int).copy()
 # Drop users with no activity in these 5 years. The missing values have
 # already been filled with 0 above, so inactivity has to be tested against 0
 # (dropna would never remove anything at this point).
 user_table = user_table[(user_table > 0).any(axis=1)]
 # Sort by latest year, then previous years
 user_table = user_table.sort_values(by=latest_5_years[::-1], ascending=False)
 # Keep only top 13 users
 user_table = user_table.head(13)
 # Reset index so 'user' becomes a column
 user_table = user_table.reset_index()
 user_table.columns = ['User'] + [str(year) for year in latest_5_years]
 # Plot table on an otherwise empty figure
 fig, ax = plt.subplots(figsize=(10, 4))
 ax.axis('off')
 tbl = ax.table(cellText=user_table.values,
                colLabels=user_table.columns,
                cellLoc='center',
                loc='center')
 tbl.auto_set_font_size(False)
 tbl.set_fontsize(9)
 tbl.scale(1, 1.5)
 # Make column header text bold (row 0 of the table holds the column labels)
 for col in range(user_table.shape[1]):
    tbl[(0, col)].set_text_props(weight='bold')
 # Save figure
 table_png_name = os.path.join(FIGURE_DIR, 'top_users_table.png')
 plt.savefig(table_png_name, bbox_inches='tight', dpi=300)
 # -------------------------
 # Page Edits Table (Last 3 Years)
 # -------------------------
 print("Creating page edits table...")
 import textwrap
 latest_3_years = sorted(years)[-3:]
 # Integer counts for the last three years, ordered by the latest year first
 # and then by the earlier years, cut down to the 20 leading pages. The page
 # title moves from the index into the first column.
 page_table = (
    page_edits_by_year[latest_3_years]
    .fillna(0)
    .astype(int)
    .copy()
    .sort_values(by=latest_3_years[::-1], ascending=False)
    .head(20)
    .reset_index()
 )
 page_table.columns = ['Page'] + [str(year) for year in latest_3_years]
 # Maximum number of characters per line of a page title (adjust as needed)
 WRAP_WIDTH = 50
 # Break long page titles over several lines
 page_table['Page'] = page_table['Page'].apply(
    lambda title: textwrap.fill(title, width=WRAP_WIDTH)
 )
 # Draw the table on an otherwise empty figure
 fig, ax = plt.subplots(figsize=(10, 4))
 ax.axis('off')
 tbl = ax.table(cellText=page_table.values,
                colLabels=page_table.columns,
                cellLoc='center',
                loc='center')
 num_cols = len(page_table.columns)
 num_rows = len(page_table) + 1  # data rows plus the header row
 # Column widths: 40% for "Page", the remaining 60% shared by the year columns
 col_widths = [0.40] + [0.60 / (num_cols - 1)] * (num_cols - 1)
 for col, width in enumerate(col_widths):
    for row in range(num_rows):
        tbl[(row, col)].set_width(width)
 tbl.auto_set_font_size(False)
 tbl.set_fontsize(9)
 tbl.scale(1, 1.5)
 # Bold header text (row 0 holds the column labels)
 for col in range(num_cols):
    tbl[(0, col)].set_text_props(weight='bold')
 # Set the row heights after scaling so that wrapped titles stay visible;
 # the factor of 2 is a tweakable multiplier.
 row_height = 1.0 / num_rows
 for row in range(num_rows):
    for col in range(num_cols):
        tbl[(row, col)].set_height(row_height * 2)
 # Save figure
 page_table_png_name = os.path.join(FIGURE_DIR, 'top_pages_table.png')
 plt.savefig(page_table_png_name, bbox_inches='tight', dpi=300)
-    fig, ax = plt.subplots(2,
+# %%
                           1,
                           figsize=(6, 4),
                           sharex=True,
                           gridspec_kw={'hspace': 0.5})
    summary[['page edits']].plot.bar(ax=ax[0], legend=False)
    summary[['active users']].plot.bar(ax=ax[1], legend=False)
    j = 0
    for i, row in summary.iterrows():
        ax[0].annotate(row['page edits'],
                       xy=(j, row['page edits']),
                       xytext=(0, 6),
                       textcoords='offset pixels',
                       ha='center',
                       fontsize=8)
        ax[1].annotate(row['active users'],
                       xy=(j, row['active users']),
                       xytext=(0, 6),
                       textcoords='offset pixels',
                       ha='center',
                       fontsize=8)
        j += 1
    ax[0].set_title('Page edits', fontsize=10, y=0.9)
    ax[1].set_title('Active users', fontsize=10, y=0.9)
    ax[1].set_xlabel('')
    ax[0].set_ylabel('Count', labelpad=10)
    ax[1].set_ylabel('Count', labelpad=10)
    for a in ax.ravel():
        a.spines['top'].set_visible(False)
        a.spines['right'].set_visible(False)
        png_name = xml_name.replace('.xml', '.png')
    plt.savefig(png_name, bbox_inches='tight', dpi=300)
 if __name__ == '__main__':
    # Process every XML export found in the current working directory.
    xml_names = list(filter(lambda name: name.endswith('.xml'), os.listdir('.')))
    for xml_name in xml_names:
        main(xml_name)