import os
import re

import matplotlib.pyplot as plt
import mwxml  # pip install mwxml
import pandas as pd
from lxml import etree

# Directory (relative to the working directory) that receives the yearly CSVs.
OUTPUT_DIR = 'csv'

# User names whose edits are counted; everyone else is filtered out in main().
PROJECTS_TEAM = [
    # Ordered by first entry
    'Anderson',
    'Robert',
    'Brett',
    'Conrad',
    'Matt',
    'Duncan',
    'Ianc',
    'William',
    'Laurent',
    'Alexandra',
    'Wendy',
    'Alessio',
    'Luis',
    'Grantley',
    'Beatrice',
    'Sarah',
    'Rossm',
    'ROBERT',
    'Steven',
    'Wendyth',
    'Melissa',
    'Andy',
    'Michael',
    'Msa',
    'Jamie',
    'Toms',
    'Sam',
    'Larry',
    'Annakg',
    'Hamish',
    'Francois',
    'Annab',
    'Erica',
    'Coral',
    'Priom',
    'Barry',
    'Nathan',
    'Chrisd',
    'Andrewf',
    'Joshuas',
    'Daniel',
    'Danh',
    'Duncanr',
    'Robertt',
    'Chrisdu',
    'Brettm',
    'Mathieud',
    'Ianco',
    'Larryp',
    'Grantleys',
    'Aliceh',
    'Mattb',
    'Tobyt',
    'Benm',
    'Jamess',
    'Douga',
    'Gabil',
    'Francoisf',
    'Lluna',
]


def _read_revisions(xml_name):
    """Return one ``{'page', 'user', 'date'}`` dict per revision in the dump."""
    records = []
    with open(xml_name, 'rb') as f:
        for page in mwxml.Dump.from_file(f):
            for revision in page:
                if revision.timestamp is None:
                    # A revision without a timestamp cannot be placed in a year.
                    continue
                # ``revision.user`` may be None (see the mwxml docs); such
                # revisions are kept here and dropped by the team filter later.
                user = revision.user
                records.append({
                    'page': page.title,
                    'user': user.text if user is not None else None,
                    'date': str(revision.timestamp),
                })
    return records


def _team_edits(records):
    """Build the de-duplicated, team-only edit table indexed by local time."""
    df = pd.DataFrame(records)
    # utc=True keeps 'Z'-suffixed stamps as UTC and makes naive stamps
    # tz-aware, so the tz_convert below cannot fail on a naive index.
    df.index = pd.to_datetime(df['date'], utc=True)

    # Convert to local time, then reduce each timestamp to its calendar date
    df = df.tz_convert('Australia/Sydney')
    df = df.sort_index()
    df['date'] = df.index.date

    # Drop duplicate entries (ignore edits for same user/same page/same day)
    df = df.drop_duplicates()
    df = df.drop(columns=['date'])

    # Only include non-academic users
    return df[df['user'].isin(PROJECTS_TEAM)]


def _plot_summary(summary, png_name):
    """Save a two-panel annotated bar chart of the yearly summary."""
    fig, ax = plt.subplots(2, 1, figsize=(6, 4), sharex=True,
                           gridspec_kw={'hspace': 0.5})
    summary[['page edits']].plot.bar(ax=ax[0], legend=False)
    summary[['active users']].plot.bar(ax=ax[1], legend=False)

    # Bars sit at integer x positions 0..n-1, so annotate by position
    # rather than by the year label.
    for pos, (_, row) in enumerate(summary.iterrows()):
        for axis, column in zip(ax, ('page edits', 'active users')):
            axis.annotate(row[column],
                          xy=(pos, row[column]),
                          xytext=(0, 6),
                          textcoords='offset pixels',
                          ha='center',
                          fontsize=8)

    ax[0].set_title('Page edits', fontsize=10, y=0.9)
    ax[1].set_title('Active users', fontsize=10, y=0.9)
    ax[1].set_xlabel('')
    ax[0].set_ylabel('Count', labelpad=10)
    ax[1].set_ylabel('Count', labelpad=10)

    for a in ax.ravel():
        a.spines['top'].set_visible(False)
        a.spines['right'].set_visible(False)

    fig.savefig(png_name, bbox_inches='tight', dpi=300)
    # Release the figure; main() may be called once per dump file.
    plt.close(fig)


def main(xml_name):
    """Summarise team edits in a MediaWiki XML dump.

    Writes ``<year>-pages.csv`` and ``<year>-users.csv`` into OUTPUT_DIR for
    every year that has team edits, and saves a bar chart next to the dump
    with the extension replaced by ``.png``.

    Args:
        xml_name: Path to the XML dump file.

    Returns:
        None. If the dump has no revisions, or none by PROJECTS_TEAM users,
        a message is printed and no files are written.
    """
    records = _read_revisions(xml_name)
    if not records:
        print(f'{xml_name}: no revisions found, skipping')
        return

    df = _team_edits(records)
    if df.empty:
        print(f'{xml_name}: no edits by project team members, skipping')
        return

    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # NOTE: the CSV names do not include the dump name, so processing several
    # dumps overwrites the files of earlier ones.
    summary = []
    for year in df.index.year.unique():
        in_year = df[df.index.year == year]

        # After dropping 'date' only 'page' and 'user' remain, so count()
        # leaves a single column that is renamed to 'edits'.
        page_counts = in_year.groupby('page').count()
        page_counts = page_counts.rename(columns={'user': 'edits'})
        page_counts = page_counts.sort_values('edits', ascending=False)

        user_counts = in_year.groupby('user').count()
        user_counts = user_counts.rename(columns={'page': 'edits'})
        user_counts = user_counts.sort_values('edits', ascending=False)

        page_counts.to_csv(f'{OUTPUT_DIR}/{year}-pages.csv')
        user_counts.to_csv(f'{OUTPUT_DIR}/{year}-users.csv')

        summary.append({
            'year': year,
            # Row counts: distinct pages edited / distinct users active.
            'page edits': page_counts.shape[0],
            'active users': user_counts.shape[0],
        })

    summary = pd.DataFrame(summary).set_index('year')

    # splitext only swaps the final extension; str.replace would touch every
    # '.xml' in the path and return the input path itself when there is none.
    png_name = os.path.splitext(xml_name)[0] + '.png'
    _plot_summary(summary, png_name)


if __name__ == '__main__':
    xml_names = [f for f in os.listdir('.') if f.endswith('.xml')]
    for xml_name in xml_names:
        main(xml_name)