You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

178 lines
4.4 KiB
Python

4 years ago
import os
import re
import pandas as pd
from lxml import etree
import matplotlib.pyplot as plt
# Directory the per-year CSV summaries are written into (created on demand).
OUTPUT_DIR = 'csv'
# Wiki usernames treated as project-team members; edits by any other user
# are excluded from the summaries.
PROJECTS_TEAM = [ # Ordered by first entry
'Anderson',
'Robert',
'Brett',
'Conrad',
'Matt',
'Duncan',
'Ianc',
'William',
'Laurent',
'Alexandra',
'Wendy',
'Alessio',
'Luis',
'Grantley',
'Beatrice',
'Sarah',
'Rossm',
'ROBERT',
'Steven',
'Wendyth',
'Melissa',
'Andy',
'Michael',
'Msa',
'Jamie',
'Toms',
'Sam',
'Larry',
'Annakg',
'Hamish',
'Francois',
'Annab',
'Erica',
'Coral',
'Priom',
'Barry',
'Nathan',
'Chrisd',
'Andrewf',
'Joshuas',
'Daniel',
'Danh',
'Duncanr',
'Robertt',
'Chrisdu',
'Brettm',
'Mathieud',
'Ianco',
'Larryp',
'Grantleys',
'Aliceh',
'Mattb',
'Tobyt',
'Benm',
'Jamess',
'Douga',
'Gabil',
'Francoisf',
'Lluna',
]
def main(xml_name):
    """Summarise edit activity recorded in a MediaWiki XML dump.

    Parses ``xml_name`` (a MediaWiki export-0.3 dump), keeps at most one
    edit per user/page/day for users listed in PROJECTS_TEAM, writes
    per-year page and user edit counts as CSV files under OUTPUT_DIR,
    and saves a two-panel bar chart next to the dump (same name, ``.png``).
    """
    ns = {'mediawiki': 'http://www.mediawiki.org/xml/export-0.3/'}
    root = etree.parse(xml_name)
    revisions = []
    for page in root.xpath('//mediawiki:page', namespaces=ns):
        title = page.xpath('./mediawiki:title', namespaces=ns)[0].text
        for revision in page.xpath('./mediawiki:revision', namespaces=ns):
            timestamp = revision.xpath('./mediawiki:timestamp',
                                       namespaces=ns)[0].text
            # Select the contributor's first child (<username> or <ip>) by
            # element name instead of assuming <contributor> is the third
            # child of <revision> -- positional access breaks as soon as an
            # optional element appears before it in the dump.
            contributor = revision.xpath('./mediawiki:contributor/*',
                                         namespaces=ns)[0].text
            revisions.append({
                'page': title,
                'user': contributor,
                'date': str(timestamp),
            })
    df = pd.DataFrame(revisions)
    df['date'] = pd.to_datetime(df['date'])
    df = df.set_index('date').sort_index()
    # Convert to local time, and round to nearest day.
    df.index = df.index.tz_convert('Australia/Sydney')
    df.index = df.index.round('d')
    # Drop duplicate entries (ignore edits for same user/same page/same day).
    df = df.drop_duplicates()
    # Only include non-academic users.
    df = df[df['user'].isin(PROJECTS_TEAM)]
    years = df.index.year.unique()
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    summary = []
    for year in years:
        idx = df.index.year == year
        pages = df[idx].groupby('page').count()
        pages = pages.rename(columns={'user': 'edits'})
        pages = pages.sort_values('edits', ascending=False)
        users = df[idx].groupby('user').count()
        users = users.rename(columns={'page': 'edits'})
        users = users.sort_values('edits', ascending=False)
        pages.to_csv(f'{OUTPUT_DIR}/{year}-pages.csv')
        users.to_csv(f'{OUTPUT_DIR}/{year}-users.csv')
        # NOTE(review): 'page edits' is the number of *distinct pages
        # edited* that year (row count), not total edits -- kept as-is
        # for output compatibility.
        summary.append({
            'year': year,
            'page edits': pages.shape[0],
            'active users': users.shape[0],
        })
    summary = pd.DataFrame(summary)
    summary = summary.set_index('year')
    fig, ax = plt.subplots(2,
                           1,
                           figsize=(6, 4),
                           sharex=True,
                           gridspec_kw={'hspace': 0.5})
    summary[['page edits']].plot.bar(ax=ax[0], legend=False)
    summary[['active users']].plot.bar(ax=ax[1], legend=False)
    # Label each bar with its value, 6 px above the bar top.
    for j, (_, row) in enumerate(summary.iterrows()):
        ax[0].annotate(row['page edits'],
                       xy=(j, row['page edits']),
                       xytext=(0, 6),
                       textcoords='offset pixels',
                       ha='center',
                       fontsize=8)
        ax[1].annotate(row['active users'],
                       xy=(j, row['active users']),
                       xytext=(0, 6),
                       textcoords='offset pixels',
                       ha='center',
                       fontsize=8)
    ax[0].set_title('Page edits', fontsize=10, y=0.9)
    ax[1].set_title('Active users', fontsize=10, y=0.9)
    ax[1].set_xlabel('')
    ax[0].set_ylabel('Count', labelpad=10)
    ax[1].set_ylabel('Count', labelpad=10)
    for a in ax.ravel():
        a.spines['top'].set_visible(False)
        a.spines['right'].set_visible(False)
    png_name = xml_name.replace('.xml', '.png')
    plt.savefig(png_name, bbox_inches='tight', dpi=300)
    # Close the figure: main() is called once per dump, and unclosed
    # figures accumulate across calls (matplotlib keeps them alive).
    plt.close(fig)
if __name__ == '__main__':
    # Summarise every MediaWiki dump (*.xml) found in the working directory.
    for dump_name in (entry for entry in os.listdir('.')
                      if entry.endswith('.xml')):
        main(dump_name)