You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
285 lines
8.3 KiB
Python
285 lines
8.3 KiB
Python
|
|
#%%
|
|
import os
|
|
import pandas as pd
|
|
import matplotlib.pyplot as plt
|
|
import mwxml # pip install mwxml
|
|
|
|
# UPDATE THIS PATH TO YOUR OWN FILE
# Location of the MediaWiki XML export to analyse.
f = r"D:\OneDrive\OneDrive - UNSW\Code\wiki-stats\WRL+Wiki-20250519003348.xml"

# Directory (relative to the current working directory) for CSV output.
OUTPUT_DIR = 'csv'

pages = mwxml.Dump.from_file(f)
print("Processing pages...")

# Collect one record per revision: page title, contributor name and timestamp.
revisions = []
for page in pages:
    title = page.title

    for revision in page:
        timestamp = str(revision.timestamp)

        # ``revision.user`` may be None (NOTE(review): per the mwxml docs this
        # happens e.g. for suppressed contributors -- confirm). Guard the
        # attribute access so one such revision does not abort the whole run;
        # the revision is kept with a missing user name.
        revision_user = revision.user
        contributor = revision_user.text if revision_user is not None else None

        revisions.append({
            'page': title,
            'user': contributor,
            'date': timestamp
        })

df = pd.DataFrame(revisions)

# Index the revisions by their parsed timestamp.
df.index = pd.to_datetime(df['date'])
|
|
|
|
|
|
# Get the latest year from the dataset.
# The index has not been converted to Sydney time yet at this point, while the
# yearly statistics further down are computed on Sydney-local timestamps.
# Convert here as well so the figure folder is named after the same "latest
# year" that the statistics use (edits around New Year could otherwise land in
# a different year).
LATEST_YEAR = df.index.tz_convert('Australia/Sydney').year.max()

# Get script directory
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))

# Define figure output directory: "<latest year>_Figures" next to this script.
FIGURE_DIR = os.path.join(SCRIPT_DIR, f"{LATEST_YEAR}_Figures")
os.makedirs(FIGURE_DIR, exist_ok=True)
|
|
|
|
|
|
# Express the timestamps in Sydney local time and order them chronologically.
df = df.tz_convert('Australia/Sydney').sort_index()

# Collapse repeated edits: with the 'date' column temporarily set to the local
# calendar date, rows that share page, user and date are duplicates, and only
# the first of each group is kept. The helper column is removed afterwards.
df = (
    df.assign(date=df.index.date)
    .drop_duplicates()
    .drop(columns=['date'])
)

# Calendar years present in the (de-duplicated) data.
years = df.index.year.unique()

# Make sure the CSV output directory exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)
|
|
|
|
################################################################################
# Table of edit counts: one row per page, one column per year.
page_edits_by_year = pd.DataFrame()

# Every page that appears anywhere in the data, so each yearly column covers
# the same set of rows.
all_pages = df['page'].unique()

for year in years:
    # Rows belonging to this year only.
    in_year = df.index.year == year
    year_df = df[in_year]

    # Occurrences of each page this year; pages without any get 0.
    page_counts = year_df['page'].value_counts()
    page_counts = page_counts.reindex(all_pages, fill_value=0)

    page_edits_by_year[year] = page_counts

# Represent "no edits" as a missing value rather than 0.
page_edits_by_year = page_edits_by_year.replace(0, pd.NA)

# Order pages by their count in the most recent year, largest first.
most_recent_year = years.max()
page_edits_by_year = page_edits_by_year.sort_values(
    by=most_recent_year,
    ascending=False,
)
|
|
|
|
################################################################################
# Per-user edit counts by year, plus a per-year summary (number of distinct
# pages edited and number of users with at least one edit).

summary = []
user_edits_by_year = pd.DataFrame()

# Get all unique users across all years
all_users = df['user'].unique()

for year in years:
    # Filter the DataFrame to only include rows from the current year
    year_df = df[df.index.year == year]

    # Count each user's contributions in the current year (0 if none).
    # NOTE: an alternative metric -- the number of distinct pages each user
    # edited -- would be
    #     year_df.groupby('user')['page'].nunique().reindex(all_users, fill_value=0)
    user_counts = year_df['user'].value_counts().reindex(all_users, fill_value=0)

    # Add the user counts for this year as a new column
    user_edits_by_year[year] = user_counts

    # Count pages and active users for summary. A dedicated name is used so
    # the module-level ``pages`` (the mwxml dump) is not rebound to an int.
    pages_edited = year_df['page'].nunique()  # unique pages edited in the year
    active_users = user_counts[user_counts > 0].count()  # users with edits this year

    summary.append({
        'year': year,
        'page edits': pages_edited,
        'active users': active_users
    })

# Convert summary to DataFrame
summary = pd.DataFrame(summary)

# With no years at all the frame has no columns; fail with a clear message.
if 'year' in summary.columns:
    summary = summary.set_index('year')
else:
    raise KeyError("The 'year' column is missing from the summary DataFrame.")

# Represent "no edits" as a missing value rather than 0.
user_edits_by_year = user_edits_by_year.replace(0, pd.NA)

# Sort users based on edits in the most recent year
most_recent_year = years.max()
user_edits_by_year = user_edits_by_year.sort_values(by=most_recent_year, ascending=False)

# Save user edits by year as CSV (path built with os.path.join, like the
# figure paths elsewhere in this script).
user_edits_by_year.to_csv(os.path.join(OUTPUT_DIR, 'user_edits_by_year.csv'))
|
|
|
|
print("Creating summary plot...")

# Two stacked bar charts sharing the x axis: page edits above, active users below.
fig, ax = plt.subplots(
    2,
    1,
    figsize=(6, 4),
    sharex=True,
    gridspec_kw={'hspace': 0.5},
)

summary[['page edits']].plot.bar(ax=ax[0], legend=False)
summary[['active users']].plot.bar(ax=ax[1], legend=False)

# Write each value just above its bar; the row number is used as the bar's
# x position.
label_style = dict(
    xytext=(0, 6),
    textcoords='offset pixels',
    ha='center',
    fontsize=8,
)
for j, (i, row) in enumerate(summary.iterrows()):
    ax[0].annotate(row['page edits'], xy=(j, row['page edits']), **label_style)
    ax[1].annotate(row['active users'], xy=(j, row['active users']), **label_style)

# Titles, axis labels and frame styling for both panels.
for a, heading in zip(ax.ravel(), ('Page edits', 'Active users')):
    a.set_title(heading, fontsize=10, y=0.9)
    a.set_ylabel('Count', labelpad=10)
    a.spines['top'].set_visible(False)
    a.spines['right'].set_visible(False)

ax[1].set_xlabel('')

# Write the chart into the figure directory.
png_name = os.path.join(FIGURE_DIR, 'summary_bar_chart.png')
plt.savefig(png_name, bbox_inches='tight', dpi=300)
|
|
|
|
|
|
#------------------------------------------------------------------
# Top users table: edit counts per user for the most recent (up to) 5 years,
# rendered as a matplotlib table and saved as a PNG.
print("Creating user edits table...")

# Select last 5 years
latest_5_years = sorted(years)[-5:]
user_table = user_edits_by_year[latest_5_years].fillna(0).astype(int)

# Drop users with no activity in these 5 years. Missing counts were replaced
# by 0 on the line above, so inactive users are all-zero rows (not all-NaN);
# dropna(how='all') could never match them, hence the explicit zero test.
has_activity = (user_table != 0).any(axis=1)
user_table = user_table.loc[has_activity]

# Sort by latest year, then previous years
user_table = user_table.sort_values(by=latest_5_years[::-1], ascending=False)

# Keep only top 13 users
user_table = user_table.head(13)

# Reset index so 'user' becomes a column
user_table = user_table.reset_index()
user_table.columns = ['User'] + [str(year) for year in latest_5_years]

# Plot table
fig, ax = plt.subplots(figsize=(10, 4))
ax.axis('off')
tbl = ax.table(cellText=user_table.values,
               colLabels=user_table.columns,
               cellLoc='center',
               loc='center')

# Fixed font size, with rows stretched vertically for readability.
tbl.auto_set_font_size(False)
tbl.set_fontsize(9)
tbl.scale(1, 1.5)

# Make column header text bold (row 0 holds the column labels)
for col in range(len(user_table.columns)):
    header_cell = tbl[(0, col)]
    header_cell.set_text_props(weight='bold')

# Save figure
table_png_name = os.path.join(FIGURE_DIR, 'top_users_table.png')
plt.savefig(table_png_name, bbox_inches='tight', dpi=300)
|
|
|
|
# -------------------------
# Page Edits Table (Last 3 Years)
# -------------------------
# Edit counts per page for the most recent (up to) 3 years, limited to the
# first 20 pages after sorting, rendered as a matplotlib table and saved as PNG.
print("Creating page edits table...")
import textwrap

latest_3_years = sorted(years)[-3:]

# Missing counts become 0; order by the newest year first, then older years;
# keep the first 20 rows and turn the page index into a regular column.
page_table = (
    page_edits_by_year[latest_3_years]
    .fillna(0)
    .astype(int)
    .copy()
    .sort_values(by=latest_3_years[::-1], ascending=False)
    .head(20)
    .reset_index()
)
page_table.columns = ['Page'] + list(map(str, latest_3_years))

# Maximum number of characters per line of a page title (adjust as needed)
WRAP_WIDTH = 50

# Break long page titles over several lines
page_table['Page'] = [
    '\n'.join(textwrap.wrap(title, width=WRAP_WIDTH))
    for title in page_table['Page']
]

# Draw the table on an otherwise empty figure
fig, ax = plt.subplots(figsize=(10, 4))
ax.axis('off')
tbl = ax.table(
    cellText=page_table.values,
    colLabels=page_table.columns,
    cellLoc='center',
    loc='center',
)

num_cols = len(page_table.columns)
num_rows = len(page_table) + 1  # data rows plus the header row

# Column widths: 40% for "Page", the remaining 60% shared equally by the
# year columns.
col_widths = [0.40] + [0.60 / (num_cols - 1)] * (num_cols - 1)
for col_idx, width in enumerate(col_widths):
    for row_idx in range(num_rows):
        tbl[(row_idx, col_idx)].set_width(width)

tbl.auto_set_font_size(False)
tbl.set_fontsize(9)
tbl.scale(1, 1.5)

# Bold text for the header row (row 0)
for col_idx in range(num_cols):
    tbl[(0, col_idx)].set_text_props(weight='bold')

# Give every cell twice its even share of the height so wrapped titles stay
# visible (tweak the multiplier as needed). This is applied after scale(), so
# these heights are the final ones.
row_height = 1.0 / num_rows
for row_idx in range(num_rows):
    for col_idx in range(num_cols):
        tbl[(row_idx, col_idx)].set_height(row_height * 2)

# Save figure
page_table_png_name = os.path.join(FIGURE_DIR, 'top_pages_table.png')
plt.savefig(page_table_png_name, bbox_inches='tight', dpi=300)
|
|
|
|
# %%
|