Updated by JWC following WRL wiki migration to AWS WRL Web Services
parent
83f62d2820
commit
7a410bfde8
@ -1,175 +1,284 @@
|
|||||||
|
|
||||||
|
#%%
|
||||||
import os
|
import os
|
||||||
import re
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from lxml import etree
|
|
||||||
import matplotlib.pyplot as plt
|
import matplotlib.pyplot as plt
|
||||||
import mwxml # pip install mwxml
|
import mwxml # pip install mwxml
|
||||||
|
|
||||||
|
# UPDATE THIS PATH TO YOUR OWN FILE
|
||||||
|
f = r"D:\OneDrive\OneDrive - UNSW\Code\wiki-stats\WRL+Wiki-20250519003348.xml"
|
||||||
|
|
||||||
OUTPUT_DIR = 'csv'
|
OUTPUT_DIR = 'csv'
|
||||||
PROJECTS_TEAM = [ # Ordered by first entry
|
|
||||||
'Anderson',
|
pages = mwxml.Dump.from_file(f)
|
||||||
'Robert',
|
print("Processing pages...")
|
||||||
'Brett',
|
revisions = []
|
||||||
'Conrad',
|
for page in pages:
|
||||||
'Matt',
|
title = page.title
|
||||||
'Duncan',
|
|
||||||
'Ianc',
|
for revision in page:
|
||||||
'William',
|
timestamp = str(revision.timestamp)
|
||||||
'Laurent',
|
contributor = revision.user.text
|
||||||
'Alexandra',
|
|
||||||
'Wendy',
|
revisions.append({
|
||||||
'Alessio',
|
'page': title,
|
||||||
'Luis',
|
'user': contributor,
|
||||||
'Grantley',
|
'date': timestamp
|
||||||
'Beatrice',
|
|
||||||
'Sarah',
|
|
||||||
'Rossm',
|
|
||||||
'ROBERT',
|
|
||||||
'Steven',
|
|
||||||
'Wendyth',
|
|
||||||
'Melissa',
|
|
||||||
'Andy',
|
|
||||||
'Michael',
|
|
||||||
'Msa',
|
|
||||||
'Jamie',
|
|
||||||
'Toms',
|
|
||||||
'Sam',
|
|
||||||
'Larry',
|
|
||||||
'Annakg',
|
|
||||||
'Hamish',
|
|
||||||
'Francois',
|
|
||||||
'Annab',
|
|
||||||
'Erica',
|
|
||||||
'Coral',
|
|
||||||
'Priom',
|
|
||||||
'Barry',
|
|
||||||
'Nathan',
|
|
||||||
'Chrisd',
|
|
||||||
'Andrewf',
|
|
||||||
'Joshuas',
|
|
||||||
'Daniel',
|
|
||||||
'Danh',
|
|
||||||
'Duncanr',
|
|
||||||
'Robertt',
|
|
||||||
'Chrisdu',
|
|
||||||
'Brettm',
|
|
||||||
'Mathieud',
|
|
||||||
'Ianco',
|
|
||||||
'Larryp',
|
|
||||||
'Grantleys',
|
|
||||||
'Aliceh',
|
|
||||||
'Mattb',
|
|
||||||
'Tobyt',
|
|
||||||
'Benm',
|
|
||||||
'Jamess',
|
|
||||||
'Douga',
|
|
||||||
'Gabil',
|
|
||||||
'Francoisf',
|
|
||||||
'Lluna',
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
def main(xml_name):
|
|
||||||
revisions = []
|
|
||||||
with open(xml_name, 'rb') as f:
|
|
||||||
pages = mwxml.Dump.from_file(f)
|
|
||||||
for page in pages:
|
|
||||||
title = page.title
|
|
||||||
for revision in page:
|
|
||||||
timestamp = str(revision.timestamp)
|
|
||||||
contributor = revision.user.text
|
|
||||||
|
|
||||||
revisions.append({
|
|
||||||
'page': title,
|
|
||||||
'user': contributor,
|
|
||||||
'date': timestamp
|
|
||||||
})
|
|
||||||
|
|
||||||
df = pd.DataFrame(revisions)
|
|
||||||
df.index = pd.to_datetime(df['date'])
|
|
||||||
|
|
||||||
# Convert to local time, and keep only the calendar date (used for de-duplication below)
|
|
||||||
df = df.tz_convert('Australia/Sydney')
|
|
||||||
df = df.sort_index()
|
|
||||||
df['date'] = df.index.date
|
|
||||||
|
|
||||||
# Drop duplicate entries (ignore edits for same user/same page/same day)
|
|
||||||
df = df.drop_duplicates()
|
|
||||||
df = df.drop(columns=['date'])
|
|
||||||
|
|
||||||
# Only include non-academic users
|
|
||||||
df = df[df['user'].isin(PROJECTS_TEAM)]
|
|
||||||
|
|
||||||
# Get list of years
|
|
||||||
years = df.index.year.unique()
|
|
||||||
|
|
||||||
os.makedirs(OUTPUT_DIR, exist_ok=True)
|
|
||||||
|
|
||||||
summary = []
|
|
||||||
for year in years:
|
|
||||||
idx = df.index.year == year
|
|
||||||
pages = df[idx].groupby('page').count()
|
|
||||||
pages = pages.rename(columns={'user': 'edits'})
|
|
||||||
pages = pages.sort_values('edits', ascending=False)
|
|
||||||
|
|
||||||
users = df[idx].groupby('user').count()
|
|
||||||
users = users.rename(columns={'page': 'edits'})
|
|
||||||
users = users.sort_values('edits', ascending=False)
|
|
||||||
|
|
||||||
pages.to_csv(f'{OUTPUT_DIR}/{year}-pages.csv')
|
|
||||||
users.to_csv(f'{OUTPUT_DIR}/{year}-users.csv')
|
|
||||||
|
|
||||||
summary.append({
|
|
||||||
'year': year,
|
|
||||||
'page edits': pages.shape[0],
|
|
||||||
'active users': users.shape[0]
|
|
||||||
})
|
})
|
||||||
|
|
||||||
summary = pd.DataFrame(summary)
|
|
||||||
|
df = pd.DataFrame(revisions)
|
||||||
|
df.index = pd.to_datetime(df['date'])
|
||||||
|
|
||||||
|
|
||||||
|
# Get the latest year from the dataset
|
||||||
|
LATEST_YEAR = df.index.year.max()
|
||||||
|
|
||||||
|
# Get script directory
|
||||||
|
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
|
||||||
|
|
||||||
|
# Define figure output directory
|
||||||
|
FIGURE_DIR = os.path.join(SCRIPT_DIR, f"{LATEST_YEAR}_Figures")
|
||||||
|
os.makedirs(FIGURE_DIR, exist_ok=True)
|
||||||
|
|
||||||
|
|
||||||
|
# Convert to local time, and keep only the calendar date (used for de-duplication below)
|
||||||
|
df = df.tz_convert('Australia/Sydney')
|
||||||
|
df = df.sort_index()
|
||||||
|
df['date'] = df.index.date
|
||||||
|
|
||||||
|
# Drop duplicate entries (ignore edits for same user/same page/same day)
|
||||||
|
df = df.drop_duplicates()
|
||||||
|
df = df.drop(columns=['date'])
|
||||||
|
|
||||||
|
# Get list of years
|
||||||
|
years = df.index.year.unique()
|
||||||
|
|
||||||
|
os.makedirs(OUTPUT_DIR, exist_ok=True)
|
||||||
|
|
||||||
|
################################################################################
|
||||||
|
# Initialize DataFrame to track page edits by year
|
||||||
|
page_edits_by_year = pd.DataFrame()
|
||||||
|
|
||||||
|
# Get all unique pages
|
||||||
|
all_pages = df['page'].unique()
|
||||||
|
|
||||||
|
for year in years:
|
||||||
|
# Filter data to only include rows from the current year
|
||||||
|
year_df = df[df.index.year == year]
|
||||||
|
|
||||||
|
# Count each page's contributions in the current year
|
||||||
|
page_counts = year_df['page'].value_counts().reindex(all_pages, fill_value=0)
|
||||||
|
|
||||||
|
# Add the page counts for this year as a new column
|
||||||
|
page_edits_by_year[year] = page_counts
|
||||||
|
|
||||||
|
page_edits_by_year = page_edits_by_year.replace(0, pd.NA)
|
||||||
|
|
||||||
|
# Sort pages based on the most recent year's edit counts in descending order
|
||||||
|
most_recent_year = years.max()
|
||||||
|
page_edits_by_year = page_edits_by_year.sort_values(by=most_recent_year, ascending=False)
|
||||||
|
|
||||||
|
################################################################################
|
||||||
|
|
||||||
|
summary = []
|
||||||
|
user_edits_by_year = pd.DataFrame()
|
||||||
|
|
||||||
|
# Get all unique users across all years
|
||||||
|
all_users = df['user'].unique()
|
||||||
|
|
||||||
|
for year in years:
|
||||||
|
# Filter the DataFrame to only include rows from the current year
|
||||||
|
year_df = df[df.index.year == year]
|
||||||
|
|
||||||
|
# Count each user's contributions in the current year
|
||||||
|
user_counts = year_df['user'].value_counts().reindex(all_users, fill_value=0)
|
||||||
|
|
||||||
|
# # Count how many unique pages each user edited (matching page logic)
|
||||||
|
# user_counts = (
|
||||||
|
# year_df.groupby('user')['page']
|
||||||
|
# .nunique()
|
||||||
|
# .reindex(all_users, fill_value=0)
|
||||||
|
# )
|
||||||
|
|
||||||
|
# Add the user counts for this year as a new column
|
||||||
|
user_edits_by_year[year] = user_counts
|
||||||
|
|
||||||
|
# Count pages and active users for summary
|
||||||
|
pages = year_df['page'].nunique() # Count of unique pages edited in the year
|
||||||
|
active_users = user_counts[user_counts > 0].count() # Count users with edits in this year
|
||||||
|
|
||||||
|
summary.append({
|
||||||
|
'year': year,
|
||||||
|
'page edits': pages,
|
||||||
|
'active users': active_users
|
||||||
|
})
|
||||||
|
|
||||||
|
# Convert summary to DataFrame
|
||||||
|
summary = pd.DataFrame(summary)
|
||||||
|
#print("Summary DataFrame before setting index:", summary)
|
||||||
|
|
||||||
|
if 'year' in summary.columns:
|
||||||
summary = summary.set_index('year')
|
summary = summary.set_index('year')
|
||||||
|
else:
|
||||||
|
raise KeyError("The 'year' column is missing from the summary DataFrame.")
|
||||||
|
|
||||||
|
|
||||||
|
user_edits_by_year = user_edits_by_year.replace(0, pd.NA)
|
||||||
|
|
||||||
|
# Sort users based on edits in the most recent year
|
||||||
|
most_recent_year = years.max()
|
||||||
|
user_edits_by_year = user_edits_by_year.sort_values(by=most_recent_year, ascending=False)
|
||||||
|
|
||||||
|
# Save user edits by year as CSV
|
||||||
|
user_edits_by_year.to_csv(f'{OUTPUT_DIR}/user_edits_by_year.csv')
|
||||||
|
|
||||||
|
print("Creating summary plot...")
|
||||||
|
fig, ax = plt.subplots(2,
|
||||||
|
1,
|
||||||
|
figsize=(6, 4),
|
||||||
|
sharex=True,
|
||||||
|
gridspec_kw={'hspace': 0.5})
|
||||||
|
|
||||||
|
summary[['page edits']].plot.bar(ax=ax[0], legend=False)
|
||||||
|
summary[['active users']].plot.bar(ax=ax[1], legend=False)
|
||||||
|
|
||||||
|
j = 0
|
||||||
|
for i, row in summary.iterrows():
|
||||||
|
ax[0].annotate(row['page edits'],
|
||||||
|
xy=(j, row['page edits']),
|
||||||
|
xytext=(0, 6),
|
||||||
|
textcoords='offset pixels',
|
||||||
|
ha='center',
|
||||||
|
fontsize=8)
|
||||||
|
|
||||||
|
ax[1].annotate(row['active users'],
|
||||||
|
xy=(j, row['active users']),
|
||||||
|
xytext=(0, 6),
|
||||||
|
textcoords='offset pixels',
|
||||||
|
ha='center',
|
||||||
|
fontsize=8)
|
||||||
|
j += 1
|
||||||
|
|
||||||
|
ax[0].set_title('Page edits', fontsize=10, y=0.9)
|
||||||
|
ax[1].set_title('Active users', fontsize=10, y=0.9)
|
||||||
|
|
||||||
|
ax[1].set_xlabel('')
|
||||||
|
ax[0].set_ylabel('Count', labelpad=10)
|
||||||
|
ax[1].set_ylabel('Count', labelpad=10)
|
||||||
|
|
||||||
|
for a in ax.ravel():
|
||||||
|
a.spines['top'].set_visible(False)
|
||||||
|
a.spines['right'].set_visible(False)
|
||||||
|
#png_name = f.replace('.xml', '.png')
|
||||||
|
png_name = os.path.join(FIGURE_DIR, 'summary_bar_chart.png')
|
||||||
|
|
||||||
|
|
||||||
|
plt.savefig(png_name, bbox_inches='tight', dpi=300)
|
||||||
|
|
||||||
|
|
||||||
|
#------------------------------------------------------------------
|
||||||
|
print("Creating user edits table...")
|
||||||
|
# Select last 5 years
|
||||||
|
latest_5_years = sorted(years)[-5:]
|
||||||
|
user_table = user_edits_by_year[latest_5_years].fillna(0).astype(int).copy()
|
||||||
|
|
||||||
|
# Sort by latest year, then previous years
|
||||||
|
user_table = user_table.sort_values(by=latest_5_years[::-1], ascending=False)
|
||||||
|
|
||||||
|
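# Intended to drop users with no activity in these 5 years. NOTE(review): NaNs were already replaced with 0 by fillna(0) above, so this dropna is a no-op — confirm whether all-zero rows should be filtered instead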
|
||||||
|
user_table = user_table.dropna(how='all')
|
||||||
|
|
||||||
|
# Keep only top 13 users
|
||||||
|
user_table = user_table.head(13)
|
||||||
|
|
||||||
|
# Reset index so 'user' becomes a column
|
||||||
|
user_table = user_table.reset_index()
|
||||||
|
user_table.columns = ['User'] + [str(year) for year in latest_5_years]
|
||||||
|
|
||||||
|
# Plot table
|
||||||
|
fig, ax = plt.subplots(figsize=(10, 4))
|
||||||
|
ax.axis('off')
|
||||||
|
tbl = ax.table(cellText=user_table.values,
|
||||||
|
colLabels=user_table.columns,
|
||||||
|
cellLoc='center',
|
||||||
|
loc='center')
|
||||||
|
|
||||||
|
tbl.auto_set_font_size(False)
|
||||||
|
tbl.set_fontsize(9)
|
||||||
|
tbl.scale(1, 1.5)
|
||||||
|
|
||||||
|
# Make column header text bold
|
||||||
|
for col in range(len(user_table.columns)):
|
||||||
|
header_cell = tbl[(0, col)]
|
||||||
|
header_cell.set_text_props(weight='bold')
|
||||||
|
|
||||||
|
# Save figure
|
||||||
|
table_png_name = os.path.join(FIGURE_DIR, 'top_users_table.png')
|
||||||
|
plt.savefig(table_png_name, bbox_inches='tight', dpi=300)
|
||||||
|
|
||||||
|
# -------------------------
|
||||||
|
# Page Edits Table (Last 3 Years)
|
||||||
|
# -------------------------
|
||||||
|
print("Creating page edits table...")
|
||||||
|
import textwrap
|
||||||
|
|
||||||
|
latest_3_years = sorted(years)[-3:]
|
||||||
|
page_table = page_edits_by_year[latest_3_years].fillna(0).astype(int).copy()
|
||||||
|
|
||||||
|
# Sort pages by latest year, then previous years
|
||||||
|
page_table = page_table.sort_values(by=latest_3_years[::-1], ascending=False)
|
||||||
|
|
||||||
|
# Keep only top 20 pages
|
||||||
|
page_table = page_table.head(20)
|
||||||
|
|
||||||
|
# Reset index so 'page' becomes a column
|
||||||
|
page_table = page_table.reset_index()
|
||||||
|
page_table.columns = ['Page'] + [str(year) for year in latest_3_years]
|
||||||
|
|
||||||
|
# Define a max character width per line (adjust as needed)
|
||||||
|
WRAP_WIDTH = 50
|
||||||
|
|
||||||
|
# Wrap page titles
|
||||||
|
page_table['Page'] = page_table['Page'].apply(
|
||||||
|
lambda title: '\n'.join(textwrap.wrap(title, width=WRAP_WIDTH))
|
||||||
|
)
|
||||||
|
|
||||||
|
# Plot table
|
||||||
|
fig, ax = plt.subplots(figsize=(10, 4))
|
||||||
|
ax.axis('off')
|
||||||
|
tbl = ax.table(cellText=page_table.values,
|
||||||
|
colLabels=page_table.columns,
|
||||||
|
cellLoc='center',
|
||||||
|
loc='center')
|
||||||
|
|
||||||
|
# Dynamically set column widths
|
||||||
|
num_cols = len(page_table.columns)
|
||||||
|
col_widths = [0.40] + [0.60 / (num_cols - 1)] * (num_cols - 1) # 40% for "Page", remaining 60% split evenly
|
||||||
|
|
||||||
|
for i, width in enumerate(col_widths):
|
||||||
|
for row in range(len(page_table) + 1): # +1 includes header row
|
||||||
|
cell = tbl[(row, i)]
|
||||||
|
cell.set_width(width)
|
||||||
|
|
||||||
|
tbl.auto_set_font_size(False)
|
||||||
|
tbl.set_fontsize(9)
|
||||||
|
tbl.scale(1, 1.5)
|
||||||
|
|
||||||
|
# Make column header text bold
|
||||||
|
for col in range(len(page_table.columns)):
|
||||||
|
header_cell = tbl[(0, col)]
|
||||||
|
header_cell.set_text_props(weight='bold')
|
||||||
|
|
||||||
|
# Adjust row heights to allow wrapped text to be visible
|
||||||
|
num_rows = len(page_table) + 1 # include header
|
||||||
|
row_height = 1.0 / num_rows
|
||||||
|
for row in range(num_rows):
|
||||||
|
for col in range(len(page_table.columns)):
|
||||||
|
tbl[(row, col)].set_height(row_height * 2) # tweak multiplier as needed
|
||||||
|
|
||||||
|
# Save figure
|
||||||
|
page_table_png_name = os.path.join(FIGURE_DIR, 'top_pages_table.png')
|
||||||
|
plt.savefig(page_table_png_name, bbox_inches='tight', dpi=300)
|
||||||
|
|
||||||
fig, ax = plt.subplots(2,
|
# %%
|
||||||
1,
|
|
||||||
figsize=(6, 4),
|
|
||||||
sharex=True,
|
|
||||||
gridspec_kw={'hspace': 0.5})
|
|
||||||
|
|
||||||
summary[['page edits']].plot.bar(ax=ax[0], legend=False)
|
|
||||||
summary[['active users']].plot.bar(ax=ax[1], legend=False)
|
|
||||||
|
|
||||||
j = 0
|
|
||||||
for i, row in summary.iterrows():
|
|
||||||
ax[0].annotate(row['page edits'],
|
|
||||||
xy=(j, row['page edits']),
|
|
||||||
xytext=(0, 6),
|
|
||||||
textcoords='offset pixels',
|
|
||||||
ha='center',
|
|
||||||
fontsize=8)
|
|
||||||
|
|
||||||
ax[1].annotate(row['active users'],
|
|
||||||
xy=(j, row['active users']),
|
|
||||||
xytext=(0, 6),
|
|
||||||
textcoords='offset pixels',
|
|
||||||
ha='center',
|
|
||||||
fontsize=8)
|
|
||||||
j += 1
|
|
||||||
|
|
||||||
ax[0].set_title('Page edits', fontsize=10, y=0.9)
|
|
||||||
ax[1].set_title('Active users', fontsize=10, y=0.9)
|
|
||||||
|
|
||||||
ax[1].set_xlabel('')
|
|
||||||
ax[0].set_ylabel('Count', labelpad=10)
|
|
||||||
ax[1].set_ylabel('Count', labelpad=10)
|
|
||||||
|
|
||||||
for a in ax.ravel():
|
|
||||||
a.spines['top'].set_visible(False)
|
|
||||||
a.spines['right'].set_visible(False)
|
|
||||||
png_name = xml_name.replace('.xml', '.png')
|
|
||||||
|
|
||||||
plt.savefig(png_name, bbox_inches='tight', dpi=300)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
xml_names = [f for f in os.listdir('.') if f.endswith('.xml')]
|
|
||||||
for xml_name in xml_names:
|
|
||||||
main(xml_name)
|
|
||||||
|
Loading…
Reference in New Issue