Updated by JWC following WRL wiki migration to AWS WRL Web Services

master
Jonathan Chan 2 weeks ago
parent 83f62d2820
commit 7a410bfde8

@ -1,21 +1,25 @@
# wiki-usage
Generate usage stats on a mediawiki site.
Generate usage stats on a mediawiki site, specifically outputting the following 3 figures.
1. Bar chart showing page edits and active users per year
2. Table showing user contributions per user over the past 5 years
3. Table showing page edits over the past 3 years
## Usage
Note: 'page edits' in the bar chart only shows the number of unique pages that have been edited. For example, if multiple users make multiple edits to a page, that page edit tally will still just be 1. Conversely, the user contributions table counts one edit per user, per page, per day. For example, if a user makes 3 edits to the same page on three different days, the user contribution tally will be 3.
1. Get list of all wiki pages with the script `wiki_pages.py`
## Usage
2. Go to this wiki page:
<http://wiki.wrl.unsw.edu.au/index.php/Special:Export>
1. Go to this wiki page:
<https://wiki.wrl.unsw.edu.au/index.php?title=Special:AllPages>
3. Paste the list of page titles.
4. Uncheck the box 'Include only the current revision, not the full history'.
5. Click export.
2. Highlight all the pages then copy/paste the list of page titles in the "Add pages manually" text box here: https://wiki.wrl.unsw.edu.au/index.php/Special:Export. Note, you'll need to click Next Page to copy/paste ALL of the pages.
3. Uncheck the box 'Include only the current revision, not the full history' and check the box 'Save as file'
4. Click export.
![](docs/export.png)
6. Move the downloaded xml file into the same folder as `wiki-stats.py`, and run the script.
5. Move the downloaded xml file into the same folder as `wiki-stats.py`.
6. Update the filepath to this .xml in the script. You may also need to update the list of project engineers. Then run it. Figures will be output in a folder called `<YEAR>_Figures`
## Sample output

@ -1,175 +1,284 @@
#%%
import os
import re
import pandas as pd
from lxml import etree
import matplotlib.pyplot as plt
import mwxml # pip install mwxml
# UPDATE THIS PATH TO YOUR OWN FILE
# Path to the MediaWiki XML export (the file downloaded from Special:Export)
# that the script reads revisions from.
f = r"D:\OneDrive\OneDrive - UNSW\Code\wiki-stats\WRL+Wiki-20250519003348.xml"
# Directory that receives the CSV output; it is created with os.makedirs
# before any CSV file is written.
OUTPUT_DIR = 'csv'
# Wiki usernames of the projects team. Used with df['user'].isin(PROJECTS_TEAM)
# to keep only revisions made by non-academic users.
# NOTE(review): several names appear in more than one spelling (e.g. 'Robert',
# 'ROBERT', 'Robertt') -- presumably distinct wiki accounts; confirm before
# de-duplicating.
PROJECTS_TEAM = [ # Ordered by first entry
'Anderson',
'Robert',
'Brett',
'Conrad',
'Matt',
'Duncan',
'Ianc',
'William',
'Laurent',
'Alexandra',
'Wendy',
'Alessio',
'Luis',
'Grantley',
'Beatrice',
'Sarah',
'Rossm',
'ROBERT',
'Steven',
'Wendyth',
'Melissa',
'Andy',
'Michael',
'Msa',
'Jamie',
'Toms',
'Sam',
'Larry',
'Annakg',
'Hamish',
'Francois',
'Annab',
'Erica',
'Coral',
'Priom',
'Barry',
'Nathan',
'Chrisd',
'Andrewf',
'Joshuas',
'Daniel',
'Danh',
'Duncanr',
'Robertt',
'Chrisdu',
'Brettm',
'Mathieud',
'Ianco',
'Larryp',
'Grantleys',
'Aliceh',
'Mattb',
'Tobyt',
'Benm',
'Jamess',
'Douga',
'Gabil',
'Francoisf',
'Lluna',
]
def main(xml_name):
revisions = []
with open(xml_name, 'rb') as f:
pages = mwxml.Dump.from_file(f)
for page in pages:
title = page.title
for revision in page:
timestamp = str(revision.timestamp)
contributor = revision.user.text
revisions.append({
'page': title,
'user': contributor,
'date': timestamp
})
df = pd.DataFrame(revisions)
df.index = pd.to_datetime(df['date'])
# Convert to local time, and round to nearest day
df = df.tz_convert('Australia/Sydney')
df = df.sort_index()
df['date'] = df.index.date
# Drop duplicate entries (ignore edits for same user/same page/same day)
df = df.drop_duplicates()
df = df.drop(columns=['date'])
# Only include non-academic users
df = df[df['user'].isin(PROJECTS_TEAM)]
# Get list of years
years = df.index.year.unique()
os.makedirs(OUTPUT_DIR, exist_ok=True)
summary = []
for year in years:
idx = df.index.year == year
pages = df[idx].groupby('page').count()
pages = pages.rename(columns={'user': 'edits'})
pages = pages.sort_values('edits', ascending=False)
users = df[idx].groupby('user').count()
users = users.rename(columns={'page': 'edits'})
users = users.sort_values('edits', ascending=False)
pages.to_csv(f'{OUTPUT_DIR}/{year}-pages.csv')
users.to_csv(f'{OUTPUT_DIR}/{year}-users.csv')
summary.append({
'year': year,
'page edits': pages.shape[0],
'active users': users.shape[0]
pages = mwxml.Dump.from_file(f)
print("Processing pages...")
revisions = []
for page in pages:
title = page.title
for revision in page:
timestamp = str(revision.timestamp)
contributor = revision.user.text
revisions.append({
'page': title,
'user': contributor,
'date': timestamp
})
summary = pd.DataFrame(summary)
df = pd.DataFrame(revisions)
df.index = pd.to_datetime(df['date'])
# Get the latest year from the dataset
LATEST_YEAR = df.index.year.max()
# Get script directory
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
# Define figure output directory
FIGURE_DIR = os.path.join(SCRIPT_DIR, f"{LATEST_YEAR}_Figures")
os.makedirs(FIGURE_DIR, exist_ok=True)
# Convert to local time, and round to nearest day
df = df.tz_convert('Australia/Sydney')
df = df.sort_index()
df['date'] = df.index.date
# Drop duplicate entries (ignore edits for same user/same page/same day)
df = df.drop_duplicates()
df = df.drop(columns=['date'])
# Get list of years
years = df.index.year.unique()
os.makedirs(OUTPUT_DIR, exist_ok=True)
################################################################################
# Initialize DataFrame to track page edits by year
page_edits_by_year = pd.DataFrame()
# Get all unique pages
all_pages = df['page'].unique()
for year in years:
# Filter data to only include rows from the current year
year_df = df[df.index.year == year]
# Count each page's contributions in the current year
page_counts = year_df['page'].value_counts().reindex(all_pages, fill_value=0)
# Add the page counts for this year as a new column
page_edits_by_year[year] = page_counts
page_edits_by_year = page_edits_by_year.replace(0, pd.NA)
# Sort pages based on the most recent year's edit counts in descending order
most_recent_year = years.max()
page_edits_by_year = page_edits_by_year.sort_values(by=most_recent_year, ascending=False)
################################################################################
summary = []
user_edits_by_year = pd.DataFrame()
# Get all unique users across all years
all_users = df['user'].unique()
for year in years:
# Filter the DataFrame to only include rows from the current year
year_df = df[df.index.year == year]
# Count each user's contributions in the current year
user_counts = year_df['user'].value_counts().reindex(all_users, fill_value=0)
# # Count how many unique pages each user edited (matching page logic)
# user_counts = (
# year_df.groupby('user')['page']
# .nunique()
# .reindex(all_users, fill_value=0)
# )
# Add the user counts for this year as a new column
user_edits_by_year[year] = user_counts
# Count pages and active users for summary
pages = year_df['page'].nunique() # Count of unique pages edited in the year
active_users = user_counts[user_counts > 0].count() # Count users with edits in this year
summary.append({
'year': year,
'page edits': pages,
'active users': active_users
})
# Convert summary to DataFrame
summary = pd.DataFrame(summary)
#print("Summary DataFrame before setting index:", summary)
if 'year' in summary.columns:
summary = summary.set_index('year')
else:
raise KeyError("The 'year' column is missing from the summary DataFrame.")
user_edits_by_year = user_edits_by_year.replace(0, pd.NA)
# Sort users based on edits in the most recent year
most_recent_year = years.max()
user_edits_by_year = user_edits_by_year.sort_values(by=most_recent_year, ascending=False)
# Save user edits by year as CSV
user_edits_by_year.to_csv(f'{OUTPUT_DIR}/user_edits_by_year.csv')
print("Creating summary plot...")
fig, ax = plt.subplots(2,
1,
figsize=(6, 4),
sharex=True,
gridspec_kw={'hspace': 0.5})
summary[['page edits']].plot.bar(ax=ax[0], legend=False)
summary[['active users']].plot.bar(ax=ax[1], legend=False)
j = 0
for i, row in summary.iterrows():
ax[0].annotate(row['page edits'],
xy=(j, row['page edits']),
xytext=(0, 6),
textcoords='offset pixels',
ha='center',
fontsize=8)
ax[1].annotate(row['active users'],
xy=(j, row['active users']),
xytext=(0, 6),
textcoords='offset pixels',
ha='center',
fontsize=8)
j += 1
ax[0].set_title('Page edits', fontsize=10, y=0.9)
ax[1].set_title('Active users', fontsize=10, y=0.9)
ax[1].set_xlabel('')
ax[0].set_ylabel('Count', labelpad=10)
ax[1].set_ylabel('Count', labelpad=10)
for a in ax.ravel():
a.spines['top'].set_visible(False)
a.spines['right'].set_visible(False)
#png_name = f.replace('.xml', '.png')
png_name = os.path.join(FIGURE_DIR, 'summary_bar_chart.png')
plt.savefig(png_name, bbox_inches='tight', dpi=300)
#------------------------------------------------------------------
print("Creating user edits table...")
# Select last 5 years
latest_5_years = sorted(years)[-5:]
user_table = user_edits_by_year[latest_5_years].fillna(0).astype(int).copy()
# Sort by latest year, then previous years
user_table = user_table.sort_values(by=latest_5_years[::-1], ascending=False)
# Drop users with all NaNs (i.e., no activity in these 5 years)
user_table = user_table.dropna(how='all')
# Keep only top 13 users
user_table = user_table.head(13)
# Reset index so 'user' becomes a column
user_table = user_table.reset_index()
user_table.columns = ['User'] + [str(year) for year in latest_5_years]
# Plot table
fig, ax = plt.subplots(figsize=(10, 4))
ax.axis('off')
tbl = ax.table(cellText=user_table.values,
colLabels=user_table.columns,
cellLoc='center',
loc='center')
tbl.auto_set_font_size(False)
tbl.set_fontsize(9)
tbl.scale(1, 1.5)
# Make column header text bold
for col in range(len(user_table.columns)):
header_cell = tbl[(0, col)]
header_cell.set_text_props(weight='bold')
# Save figure
table_png_name = os.path.join(FIGURE_DIR, 'top_users_table.png')
plt.savefig(table_png_name, bbox_inches='tight', dpi=300)
# -------------------------
# Page Edits Table (Last 3 Years)
# -------------------------
print("Creating page edits table...")
import textwrap
latest_3_years = sorted(years)[-3:]
page_table = page_edits_by_year[latest_3_years].fillna(0).astype(int).copy()
# Sort pages by latest year, then previous years
page_table = page_table.sort_values(by=latest_3_years[::-1], ascending=False)
# Keep only top 20 pages
page_table = page_table.head(20)
# Reset index so 'page' becomes a column
page_table = page_table.reset_index()
page_table.columns = ['Page'] + [str(year) for year in latest_3_years]
# Define a max character width per line (adjust as needed)
WRAP_WIDTH = 50
# Wrap page titles
page_table['Page'] = page_table['Page'].apply(
lambda title: '\n'.join(textwrap.wrap(title, width=WRAP_WIDTH))
)
# Plot table
fig, ax = plt.subplots(figsize=(10, 4))
ax.axis('off')
tbl = ax.table(cellText=page_table.values,
colLabels=page_table.columns,
cellLoc='center',
loc='center')
# Dynamically set column widths
num_cols = len(page_table.columns)
col_widths = [0.40] + [0.60 / (num_cols - 1)] * (num_cols - 1) # 45% for "Page", rest split
for i, width in enumerate(col_widths):
for row in range(len(page_table) + 1): # +1 includes header row
cell = tbl[(row, i)]
cell.set_width(width)
tbl.auto_set_font_size(False)
tbl.set_fontsize(9)
tbl.scale(1, 1.5)
# Make column header text bold
for col in range(len(page_table.columns)):
header_cell = tbl[(0, col)]
header_cell.set_text_props(weight='bold')
# Adjust row heights to allow wrapped text to be visible
num_rows = len(page_table) + 1 # include header
row_height = 1.0 / num_rows
for row in range(num_rows):
for col in range(len(page_table.columns)):
tbl[(row, col)].set_height(row_height * 2) # tweak multiplier as needed
# Save figure
page_table_png_name = os.path.join(FIGURE_DIR, 'top_pages_table.png')
plt.savefig(page_table_png_name, bbox_inches='tight', dpi=300)
fig, ax = plt.subplots(2,
1,
figsize=(6, 4),
sharex=True,
gridspec_kw={'hspace': 0.5})
summary[['page edits']].plot.bar(ax=ax[0], legend=False)
summary[['active users']].plot.bar(ax=ax[1], legend=False)
j = 0
for i, row in summary.iterrows():
ax[0].annotate(row['page edits'],
xy=(j, row['page edits']),
xytext=(0, 6),
textcoords='offset pixels',
ha='center',
fontsize=8)
ax[1].annotate(row['active users'],
xy=(j, row['active users']),
xytext=(0, 6),
textcoords='offset pixels',
ha='center',
fontsize=8)
j += 1
ax[0].set_title('Page edits', fontsize=10, y=0.9)
ax[1].set_title('Active users', fontsize=10, y=0.9)
ax[1].set_xlabel('')
ax[0].set_ylabel('Count', labelpad=10)
ax[1].set_ylabel('Count', labelpad=10)
for a in ax.ravel():
a.spines['top'].set_visible(False)
a.spines['right'].set_visible(False)
png_name = xml_name.replace('.xml', '.png')
plt.savefig(png_name, bbox_inches='tight', dpi=300)
if __name__ == '__main__':
xml_names = [f for f in os.listdir('.') if f.endswith('.xml')]
for xml_name in xml_names:
main(xml_name)
# %%

Loading…
Cancel
Save