|
|
|
import os
|
|
|
|
import re
|
|
|
|
import pandas as pd
|
|
|
|
from lxml import etree
|
|
|
|
import matplotlib.pyplot as plt
|
|
|
|
import mwxml # pip install mwxml
|
|
|
|
|
|
|
|
# Directory that receives the per-year CSV reports.
OUTPUT_DIR = 'csv'

# Wiki user names whose edits are counted.
# Ordered by first entry (presumably each member's first edit in the wiki
# history -- TODO confirm). Names are matched exactly, including case.
PROJECTS_TEAM = [
    'Anderson', 'Robert', 'Brett', 'Conrad', 'Matt', 'Duncan',
    'Ianc', 'William', 'Laurent', 'Alexandra', 'Wendy', 'Alessio',
    'Luis', 'Grantley', 'Beatrice', 'Sarah', 'Rossm', 'ROBERT',
    'Steven', 'Wendyth', 'Melissa', 'Andy', 'Michael', 'Msa',
    'Jamie', 'Toms', 'Sam', 'Larry', 'Annakg', 'Hamish',
    'Francois', 'Annab', 'Erica', 'Coral', 'Priom', 'Barry',
    'Nathan', 'Chrisd', 'Andrewf', 'Joshuas', 'Daniel', 'Danh',
    'Duncanr', 'Robertt', 'Chrisdu', 'Brettm', 'Mathieud', 'Ianco',
    'Larryp', 'Grantleys', 'Aliceh', 'Mattb', 'Tobyt', 'Benm',
    'Jamess', 'Douga', 'Gabil', 'Francoisf', 'Lluna',
]
|
|
|
|
|
|
|
|
|
|
|
|
def _load_revisions(xml_name):
    """Collect one record per revision from a MediaWiki XML dump.

    Args:
        xml_name: Path of the MediaWiki XML export to read.

    Returns:
        A list of dicts with the keys 'page' (page title), 'user'
        (contributor name) and 'date' (revision timestamp as a string).
    """
    revisions = []
    with open(xml_name, 'rb') as f:
        # The dump is iterated while the file is still open.
        # NOTE(review): assumes mwxml reads pages lazily from the handle.
        for page in mwxml.Dump.from_file(f):
            title = page.title
            for revision in page:
                # A deleted/suppressed contributor is reported as None
                # (per the mwxml docs); such a revision can never match a
                # team member, so skip it instead of crashing on `.text`.
                if revision.user is None:
                    continue
                revisions.append({
                    'page': title,
                    'user': revision.user.text,
                    'date': str(revision.timestamp),
                })
    return revisions


def _plot_summary(summary, png_name):
    """Draw the two-panel yearly bar chart and save it to *png_name*.

    Args:
        summary: DataFrame indexed by year with the columns 'page edits'
            and 'active users'.
        png_name: Path of the PNG file to write.
    """
    fig, ax = plt.subplots(2,
                           1,
                           figsize=(6, 4),
                           sharex=True,
                           gridspec_kw={'hspace': 0.5})
    try:
        panels = (('page edits', 'Page edits'),
                  ('active users', 'Active users'))
        for axis, (column, title) in zip(ax, panels):
            summary[[column]].plot.bar(ax=axis, legend=False)

            # Bars are drawn at positions 0..n-1, so the running position
            # (not the year itself) is the x coordinate of each label.
            for position, value in enumerate(summary[column]):
                axis.annotate(str(value),
                              xy=(position, value),
                              xytext=(0, 6),
                              textcoords='offset pixels',
                              ha='center',
                              fontsize=8)

            axis.set_title(title, fontsize=10, y=0.9)
            axis.set_ylabel('Count', labelpad=10)
            axis.spines['top'].set_visible(False)
            axis.spines['right'].set_visible(False)

        ax[1].set_xlabel('')
        fig.savefig(png_name, bbox_inches='tight', dpi=300)
    finally:
        # Release the figure; otherwise one figure per processed dump
        # stays alive in pyplot's global registry.
        plt.close(fig)


def main(xml_name):
    """Summarise project-team edits in one MediaWiki XML dump.

    For every year present in the data this writes
    ``<OUTPUT_DIR>/<year>-pages.csv`` and ``<OUTPUT_DIR>/<year>-users.csv``,
    then saves a bar chart of the yearly totals next to the dump, using the
    dump's file name with a ``.png`` extension.

    Repeated edits by the same user to the same page on the same local
    (Australia/Sydney) calendar day are counted once.

    NOTE(review): the CSV names do not include the dump name, so running
    this for several dumps overwrites the CSVs of earlier runs.

    Args:
        xml_name: Path of the MediaWiki XML export to analyse.
    """
    revisions = _load_revisions(xml_name)
    if not revisions:
        # An empty frame has no 'date' column; nothing to report.
        print(f'{xml_name}: no revisions found, skipping')
        return

    df = pd.DataFrame(revisions)
    # NOTE(review): tz_convert below needs timezone-aware timestamps;
    # assumes the dump's timestamp strings carry a UTC marker -- confirm.
    df.index = pd.to_datetime(df['date'])

    # Convert to local time and reduce each timestamp to its calendar day
    df = df.tz_convert('Australia/Sydney')
    df = df.sort_index()
    df['date'] = df.index.date

    # Drop duplicate entries (ignore edits for same user/same page/same day)
    df = df.drop_duplicates()
    df = df.drop(columns=['date'])

    # Keep only edits made by users listed in PROJECTS_TEAM
    df = df[df['user'].isin(PROJECTS_TEAM)]
    if df.empty:
        # Without any rows there are no years to summarise or plot.
        print(f'{xml_name}: no edits by project team members, skipping')
        return

    os.makedirs(OUTPUT_DIR, exist_ok=True)

    summary = []
    for year in df.index.year.unique():
        year_df = df[df.index.year == year]

        pages = year_df.groupby('page').count()
        pages = pages.rename(columns={'user': 'edits'})
        pages = pages.sort_values('edits', ascending=False)

        users = year_df.groupby('user').count()
        users = users.rename(columns={'page': 'edits'})
        users = users.sort_values('edits', ascending=False)

        pages.to_csv(os.path.join(OUTPUT_DIR, f'{year}-pages.csv'))
        users.to_csv(os.path.join(OUTPUT_DIR, f'{year}-users.csv'))

        summary.append({
            'year': year,
            # NOTE(review): one row per page, so this is the number of
            # distinct pages edited that year, not the total edit count.
            'page edits': len(pages),
            'active users': len(users),
        })

    summary = pd.DataFrame(summary).set_index('year')

    # Swap only the extension, leaving the rest of the path untouched.
    png_name = os.path.splitext(xml_name)[0] + '.png'
    _plot_summary(summary, png_name)
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Process every MediaWiki XML dump in the current working directory.
    # os.listdir() returns names in arbitrary order; sorting makes the
    # processing order reproducible. That matters because main() writes its
    # per-year CSVs to fixed names, so a later dump overwrites an earlier one.
    xml_names = sorted(f for f in os.listdir('.') if f.endswith('.xml'))
    for xml_name in xml_names:
        main(xml_name)
|