#%%
"""Summarise yearly editing activity from a MediaWiki XML dump.

The script reads every revision in the dump, collapses repeated edits by the
same user to the same page on the same (Sydney-local) day into one entry, and
then produces:

* ``csv/user_edits_by_year.csv`` - per-user edit counts for every year,
* ``<latest year>_Figures/summary_bar_chart.png`` - pages edited and active
  users per year,
* ``<latest year>_Figures/top_users_table.png`` - top 13 users, last 5 years,
* ``<latest year>_Figures/top_pages_table.png`` - top 20 pages, last 3 years.
"""
import os
import textwrap

import matplotlib.pyplot as plt
import mwxml  # pip install mwxml
import pandas as pd

# UPDATE THIS PATH TO YOUR OWN FILE
f = r"D:\OneDrive\OneDrive - UNSW\Code\wiki-stats\WRL+Wiki-20250519003348.xml"
OUTPUT_DIR = 'csv'

# Placeholder used when a revision carries no contributor information
# (mwxml can report ``revision.user`` as None, e.g. for suppressed users).
# NOTE: this placeholder is counted as a single pseudo-user in the statistics.
UNKNOWN_USER = '(unknown)'

dump = mwxml.Dump.from_file(f)

print("Processing pages...")
revisions = []
for page in dump:
    title = page.title
    for revision in page:
        user = revision.user
        # Guard against a missing contributor instead of crashing on `.text`.
        if user is not None and user.text:
            contributor = user.text
        else:
            contributor = UNKNOWN_USER
        revisions.append({
            'page': title,
            'user': contributor,
            'date': str(revision.timestamp),
        })

if not revisions:
    raise ValueError(f"No revisions found in dump file: {f}")

df = pd.DataFrame(revisions)
df.index = pd.to_datetime(df['date'])

# Convert to local time, and round to nearest day
df = df.tz_convert('Australia/Sydney')
df = df.sort_index()
df['date'] = df.index.date

# Drop duplicate entries (ignore edits for same user/same page/same day)
df = df.drop_duplicates()
df = df.drop(columns=['date'])

# Get list of years (in local time)
years = df.index.year.unique()

# Get the latest year from the dataset. This is taken AFTER the timezone
# conversion so the figure directory name agrees with the years used in the
# tables and plots (an edit late on 31 Dec UTC is already next year in Sydney).
LATEST_YEAR = df.index.year.max()
most_recent_year = years.max()

# Get script directory; fall back to the working directory when the code is
# run as an interactive cell where `__file__` is not defined.
try:
    SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
except NameError:
    SCRIPT_DIR = os.getcwd()

# Define figure output directory
FIGURE_DIR = os.path.join(SCRIPT_DIR, f"{LATEST_YEAR}_Figures")
os.makedirs(FIGURE_DIR, exist_ok=True)

os.makedirs(OUTPUT_DIR, exist_ok=True)

################################################################################

# Initialize DataFrame to track page edits by year
page_edits_by_year = pd.DataFrame()

# Get all unique pages
all_pages = df['page'].unique()

for year in years:
    # Filter data to only include rows from the current year
    year_df = df[df.index.year == year]

    # Count each page's contributions in the current year
    page_counts = year_df['page'].value_counts().reindex(all_pages, fill_value=0)

    # Add the page counts for this year as a new column
    page_edits_by_year[year] = page_counts

# Blank out zeros so pages with no edits in a year show as missing
page_edits_by_year = page_edits_by_year.replace(0, pd.NA)

# Sort pages based on the most recent year's edit counts in descending order
page_edits_by_year = page_edits_by_year.sort_values(
    by=most_recent_year, ascending=False)

################################################################################

summary = []
user_edits_by_year = pd.DataFrame()

# Get all unique users across all years
all_users = df['user'].unique()

for year in years:
    # Filter the DataFrame to only include rows from the current year
    year_df = df[df.index.year == year]

    # Count each user's contributions in the current year
    user_counts = year_df['user'].value_counts().reindex(all_users, fill_value=0)

    # Add the user counts for this year as a new column
    user_edits_by_year[year] = user_counts

    # Count pages and active users for summary
    pages_edited = year_df['page'].nunique()  # unique pages edited in the year
    active_users = user_counts[user_counts > 0].count()  # users with edits this year

    summary.append({
        'year': year,
        'page edits': pages_edited,
        'active users': active_users,
    })

# Convert summary to DataFrame
summary = pd.DataFrame(summary)
if 'year' in summary.columns:
    summary = summary.set_index('year')
else:
    raise KeyError("The 'year' column is missing from the summary DataFrame.")

user_edits_by_year = user_edits_by_year.replace(0, pd.NA)

# Sort users based on edits in the most recent year
user_edits_by_year = user_edits_by_year.sort_values(
    by=most_recent_year, ascending=False)

# Save user edits by year as CSV
user_edits_by_year.to_csv(f'{OUTPUT_DIR}/user_edits_by_year.csv')

print("Creating summary plot...")

fig, ax = plt.subplots(2, 1, figsize=(6, 4), sharex=True,
                       gridspec_kw={'hspace': 0.5})

summary[['page edits']].plot.bar(ax=ax[0], legend=False)
summary[['active users']].plot.bar(ax=ax[1], legend=False)

# Label each bar with its value; bars sit at integer x positions 0, 1, 2, ...
for bar_pos, (_, row) in enumerate(summary.iterrows()):
    ax[0].annotate(row['page edits'],
                   xy=(bar_pos, row['page edits']),
                   xytext=(0, 6),
                   textcoords='offset pixels',
                   ha='center',
                   fontsize=8)
    ax[1].annotate(row['active users'],
                   xy=(bar_pos, row['active users']),
                   xytext=(0, 6),
                   textcoords='offset pixels',
                   ha='center',
                   fontsize=8)

ax[0].set_title('Page edits', fontsize=10, y=0.9)
ax[1].set_title('Active users', fontsize=10, y=0.9)
ax[1].set_xlabel('')
ax[0].set_ylabel('Count', labelpad=10)
ax[1].set_ylabel('Count', labelpad=10)

for a in ax.ravel():
    a.spines['top'].set_visible(False)
    a.spines['right'].set_visible(False)

png_name = os.path.join(FIGURE_DIR, 'summary_bar_chart.png')
fig.savefig(png_name, bbox_inches='tight', dpi=300)

#------------------------------------------------------------------
print("Creating user edits table...")

# Select last 5 years
latest_5_years = sorted(years)[-5:]
user_table = user_edits_by_year[latest_5_years].fillna(0).astype(int).copy()

# Drop users with no activity in these 5 years. Missing values were already
# filled with 0 above, so this has to test for all-zero rows (dropna would
# never remove anything at this point).
user_table = user_table[(user_table > 0).any(axis=1)]

# Sort by latest year, then previous years
user_table = user_table.sort_values(by=latest_5_years[::-1], ascending=False)

# Keep only top 13 users
user_table = user_table.head(13)

# Reset index so 'user' becomes a column
user_table = user_table.reset_index()
user_table.columns = ['User'] + [str(year) for year in latest_5_years]

# Plot table
fig, ax = plt.subplots(figsize=(10, 4))
ax.axis('off')
tbl = ax.table(cellText=user_table.values,
               colLabels=user_table.columns,
               cellLoc='center',
               loc='center')

tbl.auto_set_font_size(False)
tbl.set_fontsize(9)
tbl.scale(1, 1.5)

# Make column header text bold (row 0 of the table is the header row)
for col in range(len(user_table.columns)):
    tbl[(0, col)].set_text_props(weight='bold')

# Save figure
table_png_name = os.path.join(FIGURE_DIR, 'top_users_table.png')
fig.savefig(table_png_name, bbox_inches='tight', dpi=300)

# -------------------------
# Page Edits Table (Last 3 Years)
# -------------------------
print("Creating page edits table...")

latest_3_years = sorted(years)[-3:]
page_table = page_edits_by_year[latest_3_years].fillna(0).astype(int).copy()

# Sort pages by latest year, then previous years
page_table = page_table.sort_values(by=latest_3_years[::-1], ascending=False)

# Keep only top 20 pages
page_table = page_table.head(20)

# Reset index so 'page' becomes a column
page_table = page_table.reset_index()
page_table.columns = ['Page'] + [str(year) for year in latest_3_years]

# Define a max character width per line (adjust as needed)
WRAP_WIDTH = 50

# Wrap page titles
page_table['Page'] = page_table['Page'].apply(
    lambda title: '\n'.join(textwrap.wrap(title, width=WRAP_WIDTH))
)

# Plot table
fig, ax = plt.subplots(figsize=(10, 4))
ax.axis('off')
tbl = ax.table(cellText=page_table.values,
               colLabels=page_table.columns,
               cellLoc='center',
               loc='center')

num_cols = len(page_table.columns)
num_rows = len(page_table) + 1  # +1 includes header row

# Dynamically set column widths: 40% for "Page", rest split evenly
col_widths = [0.40] + [0.60 / (num_cols - 1)] * (num_cols - 1)
for col, width in enumerate(col_widths):
    for row in range(num_rows):
        tbl[(row, col)].set_width(width)

tbl.auto_set_font_size(False)
tbl.set_fontsize(9)
tbl.scale(1, 1.5)

# Make column header text bold (row 0 of the table is the header row)
for col in range(num_cols):
    tbl[(0, col)].set_text_props(weight='bold')

# Adjust row heights to allow wrapped text to be visible
row_height = 1.0 / num_rows
for row in range(num_rows):
    for col in range(num_cols):
        tbl[(row, col)].set_height(row_height * 2)  # tweak multiplier as needed

# Save figure
page_table_png_name = os.path.join(FIGURE_DIR, 'top_pages_table.png')
fig.savefig(page_table_png_name, bbox_inches='tight', dpi=300)

# %%