Updated by JWC following WRL wiki migration to AWS WRL Web Services
parent
83f62d2820
commit
7a410bfde8
@ -1,175 +1,284 @@
|
||||
|
||||
#%%
|
||||
import os
|
||||
import re
|
||||
import pandas as pd
|
||||
from lxml import etree
|
||||
import matplotlib.pyplot as plt
|
||||
import mwxml # pip install mwxml
|
||||
|
||||
# UPDATE THIS PATH TO YOUR OWN FILE
|
||||
f = r"D:\OneDrive\OneDrive - UNSW\Code\wiki-stats\WRL+Wiki-20250519003348.xml"
|
||||
|
||||
OUTPUT_DIR = 'csv'
|
||||
PROJECTS_TEAM = [ # Ordered by first entry
|
||||
'Anderson',
|
||||
'Robert',
|
||||
'Brett',
|
||||
'Conrad',
|
||||
'Matt',
|
||||
'Duncan',
|
||||
'Ianc',
|
||||
'William',
|
||||
'Laurent',
|
||||
'Alexandra',
|
||||
'Wendy',
|
||||
'Alessio',
|
||||
'Luis',
|
||||
'Grantley',
|
||||
'Beatrice',
|
||||
'Sarah',
|
||||
'Rossm',
|
||||
'ROBERT',
|
||||
'Steven',
|
||||
'Wendyth',
|
||||
'Melissa',
|
||||
'Andy',
|
||||
'Michael',
|
||||
'Msa',
|
||||
'Jamie',
|
||||
'Toms',
|
||||
'Sam',
|
||||
'Larry',
|
||||
'Annakg',
|
||||
'Hamish',
|
||||
'Francois',
|
||||
'Annab',
|
||||
'Erica',
|
||||
'Coral',
|
||||
'Priom',
|
||||
'Barry',
|
||||
'Nathan',
|
||||
'Chrisd',
|
||||
'Andrewf',
|
||||
'Joshuas',
|
||||
'Daniel',
|
||||
'Danh',
|
||||
'Duncanr',
|
||||
'Robertt',
|
||||
'Chrisdu',
|
||||
'Brettm',
|
||||
'Mathieud',
|
||||
'Ianco',
|
||||
'Larryp',
|
||||
'Grantleys',
|
||||
'Aliceh',
|
||||
'Mattb',
|
||||
'Tobyt',
|
||||
'Benm',
|
||||
'Jamess',
|
||||
'Douga',
|
||||
'Gabil',
|
||||
'Francoisf',
|
||||
'Lluna',
|
||||
]
|
||||
|
||||
|
||||
def main(xml_name):
|
||||
revisions = []
|
||||
with open(xml_name, 'rb') as f:
|
||||
pages = mwxml.Dump.from_file(f)
|
||||
for page in pages:
|
||||
title = page.title
|
||||
for revision in page:
|
||||
timestamp = str(revision.timestamp)
|
||||
contributor = revision.user.text
|
||||
|
||||
revisions.append({
|
||||
'page': title,
|
||||
'user': contributor,
|
||||
'date': timestamp
|
||||
})
|
||||
|
||||
df = pd.DataFrame(revisions)
|
||||
df.index = pd.to_datetime(df['date'])
|
||||
|
||||
# Convert to local time, and round to nearest day
|
||||
df = df.tz_convert('Australia/Sydney')
|
||||
df = df.sort_index()
|
||||
df['date'] = df.index.date
|
||||
|
||||
# Drop duplicate entries (ignore edits for same user/same page/same day)
|
||||
df = df.drop_duplicates()
|
||||
df = df.drop(columns=['date'])
|
||||
|
||||
# Only include non-academic users
|
||||
df = df[df['user'].isin(PROJECTS_TEAM)]
|
||||
|
||||
# Get list of years
|
||||
years = df.index.year.unique()
|
||||
|
||||
os.makedirs(OUTPUT_DIR, exist_ok=True)
|
||||
|
||||
summary = []
|
||||
for year in years:
|
||||
idx = df.index.year == year
|
||||
pages = df[idx].groupby('page').count()
|
||||
pages = pages.rename(columns={'user': 'edits'})
|
||||
pages = pages.sort_values('edits', ascending=False)
|
||||
|
||||
users = df[idx].groupby('user').count()
|
||||
users = users.rename(columns={'page': 'edits'})
|
||||
users = users.sort_values('edits', ascending=False)
|
||||
|
||||
pages.to_csv(f'{OUTPUT_DIR}/{year}-pages.csv')
|
||||
users.to_csv(f'{OUTPUT_DIR}/{year}-users.csv')
|
||||
|
||||
summary.append({
|
||||
'year': year,
|
||||
'page edits': pages.shape[0],
|
||||
'active users': users.shape[0]
|
||||
|
||||
# Stream the MediaWiki XML dump and record one row per revision.
# The dump is opened explicitly (binary mode) so the file handle is closed
# as soon as all revisions have been read.
with open(f, 'rb') as dump_file:
    pages = mwxml.Dump.from_file(dump_file)
    print("Processing pages...")
    revisions = []
    for page in pages:
        title = page.title

        for revision in page:
            timestamp = str(revision.timestamp)

            # A revision whose contributor was deleted/suppressed has no
            # user object; record it without a user name instead of
            # crashing on `.text`.
            user = revision.user
            contributor = user.text if user is not None else None

            revisions.append({
                'page': title,
                'user': contributor,
                'date': timestamp,
            })
|
||||
|
||||
summary = pd.DataFrame(summary)
|
||||
|
||||
# Build the revision table, indexed by each revision's timestamp.
df = pd.DataFrame(revisions)
df.index = pd.to_datetime(df['date'])

# Convert to local (Sydney) time and keep only the calendar date of each
# revision, so that same-day edits can be recognised as duplicates.
df = df.tz_convert('Australia/Sydney')
df = df.sort_index()
df['date'] = df.index.date

# Drop duplicate entries (ignore edits for same user/same page/same day)
df = df.drop_duplicates()
df = df.drop(columns=['date'])

# Get list of years
years = df.index.year.unique()

# Get the latest year from the dataset. This is taken AFTER the time-zone
# conversion so that it always agrees with `years` (an edit made late on
# 31 December UTC already belongs to the next year in Sydney).
LATEST_YEAR = df.index.year.max()

# Get script directory. `__file__` is not defined in some interactive
# sessions (e.g. when this cell is pasted into a console), so fall back to
# the current working directory in that case.
try:
    SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
except NameError:
    SCRIPT_DIR = os.getcwd()

# Define figure output directory
FIGURE_DIR = os.path.join(SCRIPT_DIR, f"{LATEST_YEAR}_Figures")
os.makedirs(FIGURE_DIR, exist_ok=True)

os.makedirs(OUTPUT_DIR, exist_ok=True)
|
||||
|
||||
################################################################################
|
||||
# Table of edit counts: one row per page, one column per year.
page_edits_by_year = pd.DataFrame()

# Every page title that appears anywhere in the dataset
all_pages = df['page'].unique()

for year in years:
    # Restrict to the revisions made during this year
    in_year = df.index.year == year
    year_df = df[in_year]

    # Tally the rows per page; pages without any row this year get 0
    page_counts = year_df['page'].value_counts()
    page_counts = page_counts.reindex(all_pages, fill_value=0)

    # One column per year
    page_edits_by_year[year] = page_counts

# Represent "no edits" as a missing value instead of 0
page_edits_by_year = page_edits_by_year.replace(0, pd.NA)

# Order pages by their count in the most recent year, largest first
most_recent_year = years.max()
page_edits_by_year = page_edits_by_year.sort_values(
    by=most_recent_year,
    ascending=False,
)
|
||||
|
||||
################################################################################
|
||||
|
||||
# Per-year summary rows, and a table of edit counts with one row per user
# and one column per year.
summary = []
user_edits_by_year = pd.DataFrame()

# Every user name that appears anywhere in the dataset
all_users = df['user'].unique()

for year in years:
    # Restrict to the revisions made during this year
    year_df = df[df.index.year == year]

    # Tally the rows per user; users without any row this year get 0.
    # (An alternative metric, unique pages per user, would be
    # year_df.groupby('user')['page'].nunique() -- not used here.)
    user_counts = year_df['user'].value_counts()
    user_counts = user_counts.reindex(all_users, fill_value=0)

    # One column per year
    user_edits_by_year[year] = user_counts

    # Summary figures: number of distinct pages edited in the year, and
    # number of users with at least one edit in the year
    pages = year_df['page'].nunique()
    active_users = user_counts[user_counts > 0].count()

    summary.append({
        'year': year,
        'page edits': pages,
        'active users': active_users,
    })

# Convert summary to DataFrame indexed by year
summary = pd.DataFrame(summary)

if 'year' not in summary.columns:
    raise KeyError("The 'year' column is missing from the summary DataFrame.")
summary = summary.set_index('year')

# Represent "no edits" as a missing value instead of 0
user_edits_by_year = user_edits_by_year.replace(0, pd.NA)

# Order users by their count in the most recent year, largest first
most_recent_year = years.max()
user_edits_by_year = user_edits_by_year.sort_values(
    by=most_recent_year,
    ascending=False,
)

# Save user edits by year as CSV
user_edits_by_year.to_csv(f'{OUTPUT_DIR}/user_edits_by_year.csv')
|
||||
|
||||
print("Creating summary plot...")

# Two stacked bar charts sharing the year axis: one per summary column
fig, ax = plt.subplots(
    nrows=2,
    ncols=1,
    figsize=(6, 4),
    sharex=True,
    gridspec_kw={'hspace': 0.5},
)

# (summary column, panel title) for the upper and lower panel
panels = (
    ('page edits', 'Page edits'),
    ('active users', 'Active users'),
)

for axis, (column, _) in zip(ax, panels):
    summary[[column]].plot.bar(ax=axis, legend=False)

# Write each bar's value just above it; bars sit at x = 0, 1, 2, ...
for position, (_, row) in enumerate(summary.iterrows()):
    for axis, (column, _) in zip(ax, panels):
        axis.annotate(
            row[column],
            xy=(position, row[column]),
            xytext=(0, 6),
            textcoords='offset pixels',
            ha='center',
            fontsize=8,
        )

for axis, (_, heading) in zip(ax, panels):
    axis.set_title(heading, fontsize=10, y=0.9)
    axis.set_ylabel('Count', labelpad=10)

    # Hide the top and right frame lines
    axis.spines['top'].set_visible(False)
    axis.spines['right'].set_visible(False)

ax[1].set_xlabel('')

png_name = os.path.join(FIGURE_DIR, 'summary_bar_chart.png')
plt.savefig(png_name, bbox_inches='tight', dpi=300)
|
||||
|
||||
|
||||
#------------------------------------------------------------------
|
||||
print("Creating user edits table...")

# Select last 5 years (fewer if the dataset covers fewer years)
latest_5_years = sorted(years)[-5:]

# Missing values mean "no edits"; show them as 0 in the table
user_table = user_edits_by_year[latest_5_years].fillna(0).astype(int)

# Drop users with no activity in these years. This has to test for zeros:
# the missing values were already replaced by 0 above, so a dropna() at
# this point would never remove anything.
user_table = user_table[(user_table != 0).any(axis=1)]

# Sort by latest year, then previous years
user_table = user_table.sort_values(by=latest_5_years[::-1], ascending=False)

# Keep only top 13 users
user_table = user_table.head(13)

# Reset index so 'user' becomes a column
user_table = user_table.reset_index()
user_table.columns = ['User'] + [str(year) for year in latest_5_years]

# Plot table on an otherwise empty figure
fig, ax = plt.subplots(figsize=(10, 4))
ax.axis('off')
tbl = ax.table(
    cellText=user_table.values,
    colLabels=user_table.columns,
    cellLoc='center',
    loc='center',
)

tbl.auto_set_font_size(False)
tbl.set_fontsize(9)
tbl.scale(1, 1.5)

# Make column header text bold (row 0 of the table is the header row)
for col in range(len(user_table.columns)):
    tbl[(0, col)].set_text_props(weight='bold')

# Save figure
table_png_name = os.path.join(FIGURE_DIR, 'top_users_table.png')
plt.savefig(table_png_name, bbox_inches='tight', dpi=300)
|
||||
|
||||
# -------------------------
# Page Edits Table (Last 3 Years)
# -------------------------
print("Creating page edits table...")
import textwrap

latest_3_years = sorted(years)[-3:]

# Missing values mean "no edits"; show them as 0 in the table
page_table = page_edits_by_year[latest_3_years].fillna(0).astype(int)

# Sort pages by latest year, then previous years, and keep the top 20
page_table = page_table.sort_values(by=latest_3_years[::-1], ascending=False)
page_table = page_table.head(20)

# Reset index so 'page' becomes a column
page_table = page_table.reset_index()
page_table.columns = ['Page'] + [str(year) for year in latest_3_years]

# Define a max character width per line (adjust as needed)
WRAP_WIDTH = 50

# Wrap long page titles over several lines
page_table['Page'] = page_table['Page'].apply(
    lambda title: textwrap.fill(title, width=WRAP_WIDTH)
)

# Plot table on an otherwise empty figure
fig, ax = plt.subplots(figsize=(10, 4))
ax.axis('off')
tbl = ax.table(
    cellText=page_table.values,
    colLabels=page_table.columns,
    cellLoc='center',
    loc='center',
)

# Column widths: 40% for "Page", the remaining 60% split evenly between
# the year columns
num_cols = len(page_table.columns)
col_widths = [0.40] + [0.60 / (num_cols - 1)] * (num_cols - 1)

# Number of table rows, including the header row (row 0)
num_rows = len(page_table) + 1

for row in range(num_rows):
    for i, width in enumerate(col_widths):
        tbl[(row, i)].set_width(width)

tbl.auto_set_font_size(False)
tbl.set_fontsize(9)
tbl.scale(1, 1.5)

# Make column header text bold
for col in range(num_cols):
    tbl[(0, col)].set_text_props(weight='bold')

# Give every row twice its even share of the height so that wrapped titles
# stay visible (tweak the multiplier as needed)
row_height = 1.0 / num_rows
for row in range(num_rows):
    for col in range(num_cols):
        tbl[(row, col)].set_height(row_height * 2)

# Save figure
page_table_png_name = os.path.join(FIGURE_DIR, 'top_pages_table.png')
plt.savefig(page_table_png_name, bbox_inches='tight', dpi=300)
|
||||
|
||||
fig, ax = plt.subplots(2,
|
||||
1,
|
||||
figsize=(6, 4),
|
||||
sharex=True,
|
||||
gridspec_kw={'hspace': 0.5})
|
||||
|
||||
summary[['page edits']].plot.bar(ax=ax[0], legend=False)
|
||||
summary[['active users']].plot.bar(ax=ax[1], legend=False)
|
||||
|
||||
j = 0
|
||||
for i, row in summary.iterrows():
|
||||
ax[0].annotate(row['page edits'],
|
||||
xy=(j, row['page edits']),
|
||||
xytext=(0, 6),
|
||||
textcoords='offset pixels',
|
||||
ha='center',
|
||||
fontsize=8)
|
||||
|
||||
ax[1].annotate(row['active users'],
|
||||
xy=(j, row['active users']),
|
||||
xytext=(0, 6),
|
||||
textcoords='offset pixels',
|
||||
ha='center',
|
||||
fontsize=8)
|
||||
j += 1
|
||||
|
||||
ax[0].set_title('Page edits', fontsize=10, y=0.9)
|
||||
ax[1].set_title('Active users', fontsize=10, y=0.9)
|
||||
|
||||
ax[1].set_xlabel('')
|
||||
ax[0].set_ylabel('Count', labelpad=10)
|
||||
ax[1].set_ylabel('Count', labelpad=10)
|
||||
|
||||
for a in ax.ravel():
|
||||
a.spines['top'].set_visible(False)
|
||||
a.spines['right'].set_visible(False)
|
||||
png_name = xml_name.replace('.xml', '.png')
|
||||
|
||||
plt.savefig(png_name, bbox_inches='tight', dpi=300)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Run the analysis on every XML file in the current working directory
    xml_names = list(filter(lambda name: name.endswith('.xml'), os.listdir('.')))
    for xml_name in xml_names:
        main(xml_name)
|
||||
# %%
|
||||
|
Loading…
Reference in New Issue