You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

285 lines
8.3 KiB
Python

#%%
import os
import pandas as pd
import matplotlib.pyplot as plt
import mwxml # pip install mwxml
# UPDATE THIS PATH TO YOUR OWN FILE
f = r"D:\OneDrive\OneDrive - UNSW\Code\wiki-stats\WRL+Wiki-20250519003348.xml"
# Folder that receives the CSV output (a relative path, so it is resolved
# against the current working directory).
OUTPUT_DIR = 'csv'

# Iterating the dump yields pages; iterating a page yields its revisions.
pages = mwxml.Dump.from_file(f)
print("Processing pages...")

# Collect one record per revision: page title, contributor name and the
# revision timestamp as a string.
revisions = []
for page in pages:
    title = page.title
    for revision in page:
        timestamp = str(revision.timestamp)
        # ``revision.user`` can be None (e.g. when the contributor has been
        # hidden in the export). Record such revisions with a missing user
        # instead of crashing on ``None.text``; pandas then treats the name
        # as NaN and leaves it out of the per-user counts.
        user = revision.user
        contributor = user.text if user is not None else None
        revisions.append({
            'page': title,
            'user': contributor,
            'date': timestamp,
        })
df = pd.DataFrame(revisions)
if df.empty:
    # Without any revisions there is no 'date' column to index on; fail with
    # a clear message rather than an opaque KeyError further down.
    raise ValueError("No revisions were found in the dump; nothing to analyse.")

# Index by revision timestamp. ``utc=True`` guarantees a timezone-aware
# index, which ``tz_convert`` below requires.
df.index = pd.to_datetime(df['date'], utc=True)

# Convert to local time and order chronologically
df = df.tz_convert('Australia/Sydney')
df = df.sort_index()

# Get the latest year from the dataset. This is taken AFTER the timezone
# conversion so that it always agrees with ``years`` below (an edit made
# late on 31 December UTC already belongs to the next year in Sydney).
LATEST_YEAR = df.index.year.max()

# Get script directory; fall back to the working directory when the code is
# run in an interactive session where ``__file__`` is not defined.
try:
    SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
except NameError:
    SCRIPT_DIR = os.getcwd()

# Define figure output directory
FIGURE_DIR = os.path.join(SCRIPT_DIR, f"{LATEST_YEAR}_Figures")
os.makedirs(FIGURE_DIR, exist_ok=True)

# Round to the local calendar day, then drop duplicate entries
# (ignore edits for same user/same page/same day). The helper 'date' column
# is only needed for the comparison and is removed again afterwards.
df['date'] = df.index.date
df = df.drop_duplicates()
df = df.drop(columns=['date'])

# Get list of years
years = df.index.year.unique()
os.makedirs(OUTPUT_DIR, exist_ok=True)
################################################################################
# Yearly edit counts per page: one row per page, one column per year.
# Each row of ``df`` is one user/page/day combination (duplicates were
# dropped earlier), so a "count" here is the number of such rows.
page_edits_by_year = pd.DataFrame()
all_pages = df['page'].unique()

for year in years:
    # Tally this year's rows per page; pages untouched that year get 0 so
    # that every column covers the full list of pages.
    in_year = df.index.year == year
    page_counts = df.loc[in_year, 'page'].value_counts()
    page_edits_by_year[year] = page_counts.reindex(all_pages, fill_value=0)

# Represent "no edits" as a missing value instead of 0
page_edits_by_year = page_edits_by_year.replace(0, pd.NA)

# Busiest pages of the most recent year come first
most_recent_year = years.max()
page_edits_by_year = page_edits_by_year.sort_values(
    by=most_recent_year,
    ascending=False,
)
################################################################################
# Yearly edit counts per user (rows = users, columns = years) together with a
# per-year summary of how many distinct pages were edited and how many users
# were active.
summary = []
user_edits_by_year = pd.DataFrame()
all_users = df['user'].unique()

for year in years:
    year_df = df.loc[df.index.year == year]

    # Rows per user in this year; users without any rows get 0.
    # (Alternative metric, not used: distinct pages per user via
    # ``year_df.groupby('user')['page'].nunique()``.)
    user_counts = year_df['user'].value_counts().reindex(all_users, fill_value=0)
    user_edits_by_year[year] = user_counts

    summary.append({
        'year': year,
        # Number of unique pages edited in the year
        'page edits': year_df['page'].nunique(),
        # Number of users with at least one edit in the year
        'active users': user_counts[user_counts > 0].count(),
    })

# Turn the list of per-year records into a table indexed by year
summary = pd.DataFrame(summary)
if 'year' not in summary.columns:
    raise KeyError("The 'year' column is missing from the summary DataFrame.")
summary = summary.set_index('year')

# Represent "no edits" as a missing value instead of 0
user_edits_by_year = user_edits_by_year.replace(0, pd.NA)

# Most active users of the most recent year come first
most_recent_year = years.max()
user_edits_by_year = user_edits_by_year.sort_values(
    by=most_recent_year,
    ascending=False,
)

# Write the per-user table to disk
user_edits_by_year.to_csv(f'{OUTPUT_DIR}/user_edits_by_year.csv')
print("Creating summary plot...")
# Two stacked bar charts sharing the year axis: pages edited (top) and
# active users (bottom).
fig, ax = plt.subplots(
    2,
    1,
    figsize=(6, 4),
    sharex=True,
    gridspec_kw={'hspace': 0.5},
)

# (summary column, panel title) for the top and bottom axes respectively
panels = [('page edits', 'Page edits'), ('active users', 'Active users')]

for axis, (column, _) in zip(ax, panels):
    summary[[column]].plot.bar(ax=axis, legend=False)

# Write each value just above its bar. The running row number is used as the
# x coordinate of the bar for that year.
for position, (_, row) in enumerate(summary.iterrows()):
    for axis, (column, _) in zip(ax, panels):
        axis.annotate(
            row[column],
            xy=(position, row[column]),
            xytext=(0, 6),
            textcoords='offset pixels',
            ha='center',
            fontsize=8,
        )

for axis, (_, heading) in zip(ax, panels):
    axis.set_title(heading, fontsize=10, y=0.9)

ax[1].set_xlabel('')

for axis in ax:
    axis.set_ylabel('Count', labelpad=10)
    # Remove the top/right frame lines
    axis.spines['top'].set_visible(False)
    axis.spines['right'].set_visible(False)

# Save figure
png_name = os.path.join(FIGURE_DIR, 'summary_bar_chart.png')
plt.savefig(png_name, bbox_inches='tight', dpi=300)
#------------------------------------------------------------------
print("Creating user edits table...")

# Select last 5 years (fewer if the data covers a shorter period)
latest_5_years = sorted(years)[-5:]

# Missing values mean "no edits"; show them as 0 in the table
user_table = user_edits_by_year[latest_5_years].fillna(0).astype(int).copy()

# Drop users with no activity in these years. This has to test for a positive
# count: the missing values were already replaced by 0 above, so a
# ``dropna(how='all')`` at this point would never remove anything.
user_table = user_table[(user_table > 0).any(axis=1)]

# Sort by latest year, then previous years
user_table = user_table.sort_values(by=latest_5_years[::-1], ascending=False)

# Keep only top 13 users
user_table = user_table.head(13)

# Reset index so 'user' becomes a column
user_table = user_table.reset_index()
user_table.columns = ['User'] + [str(year) for year in latest_5_years]

# Plot table
fig, ax = plt.subplots(figsize=(10, 4))
ax.axis('off')
tbl = ax.table(cellText=user_table.values,
               colLabels=user_table.columns,
               cellLoc='center',
               loc='center')
tbl.auto_set_font_size(False)
tbl.set_fontsize(9)
tbl.scale(1, 1.5)

# Make column header text bold (row 0 of the table holds the column labels)
for col in range(len(user_table.columns)):
    header_cell = tbl[(0, col)]
    header_cell.set_text_props(weight='bold')

# Save figure
table_png_name = os.path.join(FIGURE_DIR, 'top_users_table.png')
plt.savefig(table_png_name, bbox_inches='tight', dpi=300)
# -------------------------
# Page Edits Table (Last 3 Years)
# -------------------------
print("Creating page edits table...")
import textwrap

latest_3_years = sorted(years)[-3:]

# Missing -> 0, order by the latest year first (earlier years break ties),
# keep the top 20 pages and turn the page index into an ordinary column.
page_table = (
    page_edits_by_year[latest_3_years]
    .fillna(0)
    .astype(int)
    .copy()
    .sort_values(by=latest_3_years[::-1], ascending=False)
    .head(20)
    .reset_index()
)
page_table.columns = ['Page'] + [str(year) for year in latest_3_years]

# Maximum number of characters per line of a page title (adjust as needed)
WRAP_WIDTH = 50


def _wrap_title(title):
    """Break a page title into lines of at most WRAP_WIDTH characters."""
    return '\n'.join(textwrap.wrap(title, width=WRAP_WIDTH))


page_table['Page'] = page_table['Page'].apply(_wrap_title)

# Plot table
fig, ax = plt.subplots(figsize=(10, 4))
ax.axis('off')
tbl = ax.table(cellText=page_table.values,
               colLabels=page_table.columns,
               cellLoc='center',
               loc='center')

num_cols = len(page_table.columns)
num_rows = len(page_table) + 1  # data rows plus the header row

# Column widths: 40% for "Page", the remaining 60% shared equally by the
# year columns.
col_widths = [0.40] + [0.60 / (num_cols - 1)] * (num_cols - 1)
for col, width in enumerate(col_widths):
    for row in range(num_rows):
        tbl[(row, col)].set_width(width)

tbl.auto_set_font_size(False)
tbl.set_fontsize(9)
tbl.scale(1, 1.5)

# Make column header text bold (row 0 of the table holds the column labels)
for col in range(num_cols):
    tbl[(0, col)].set_text_props(weight='bold')

# Give every cell twice an even share of the height so that wrapped titles
# remain visible (tweak the multiplier as needed)
row_height = 1.0 / num_rows
for row in range(num_rows):
    for col in range(num_cols):
        tbl[(row, col)].set_height(row_height * 2)

# Save figure
page_table_png_name = os.path.join(FIGURE_DIR, 'top_pages_table.png')
plt.savefig(page_table_png_name, bbox_inches='tight', dpi=300)
# %%