First commit

master
Dan Howe 4 years ago
commit d020979867

README.md
@@ -0,0 +1,22 @@
# wiki-usage
Generate usage statistics for a MediaWiki site.
## Usage
1. Get a list of all wiki pages with the script `wiki_pages.py`.
2. Go to this wiki page:
<http://wiki.wrl.unsw.edu.au/index.php/Special:Export>
3. Paste the list of page titles.
4. Uncheck the box 'Include only the current revision, not the full history'.
5. Click 'Export' (or script steps 2-5 as sketched after this list).
![](docs/export.png)
6. Move the downloaded XML file into the same folder as `wiki-stats.py`, and run the script.
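
Steps 2-5 can also be scripted. The sketch below is a minimal, untested example that assumes the wiki accepts a POST to `Special:Export` with a newline-separated `pages` form field (standard MediaWiki form behaviour) and the same HTTP auth used by `wiki_pages.py`:

```python
import requests

# Hedged sketch of steps 2-5: POST the page titles saved by wiki_pages.py
# straight to Special:Export. Omitting the 'curonly' checkbox field
# requests the full revision history (step 4 above).
with open('pages.txt') as f:
    titles = f.read()

response = requests.post(
    'http://wiki.wrl.unsw.edu.au/index.php/Special:Export',
    data={'pages': titles},
    auth=('username', 'password'),  # replace with real credentials
)
with open('export.xml', 'wb') as f:
    f.write(response.content)
```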
## Sample output
![](docs/stats.png)

Binary image file added (not shown): 77 KiB.

wiki_pages.py
@@ -0,0 +1,19 @@
import io
import requests
import getpass
from lxml import html
# Get login details
username = input('Username: ')
password = getpass.getpass('Password: ')
# Get list of all pages
url = 'http://wiki.wrl.unsw.edu.au/index.php'
page = requests.get(url + '/Special:Allpages', auth=(username, password))
tree = html.parse(io.BytesIO(page.content))
# Save page names
elements = tree.xpath('*//td/a')[1:]
with open('pages.txt', 'w') as f:
    for e in elements:
        f.write(e.text + '\n')
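
Scraping `Special:Allpages` returns only the first screen of results on larger wikis. If the site's `api.php` endpoint is enabled, a more robust alternative is to page through `list=allpages`. This is a sketch that assumes the modern `continue` pagination style and the same HTTP auth:

```python
import requests

# Hedged alternative to scraping Special:Allpages: page through the
# MediaWiki API's list=allpages. Assumes api.php is enabled and uses
# the modern 'continue' pagination style.
url = 'http://wiki.wrl.unsw.edu.au/api.php'
params = {'action': 'query', 'list': 'allpages',
          'aplimit': '500', 'format': 'json'}
titles = []
while True:
    data = requests.get(url, params=params,
                        auth=('username', 'password')).json()
    titles += [p['title'] for p in data['query']['allpages']]
    if 'continue' not in data:
        break
    params.update(data['continue'])  # carries the apcontinue token

with open('pages.txt', 'w') as f:
    f.write('\n'.join(titles) + '\n')
```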

wiki-stats.py
@@ -0,0 +1,178 @@
import os
import re
import pandas as pd
from lxml import etree
import matplotlib.pyplot as plt
OUTPUT_DIR = 'csv'
PROJECTS_TEAM = [  # Ordered by first entry
'Anderson',
'Robert',
'Brett',
'Conrad',
'Matt',
'Duncan',
'Ianc',
'William',
'Laurent',
'Alexandra',
'Wendy',
'Alessio',
'Luis',
'Grantley',
'Beatrice',
'Sarah',
'Rossm',
'ROBERT',
'Steven',
'Wendyth',
'Melissa',
'Andy',
'Michael',
'Msa',
'Jamie',
'Toms',
'Sam',
'Larry',
'Annakg',
'Hamish',
'Francois',
'Annab',
'Erica',
'Coral',
'Priom',
'Barry',
'Nathan',
'Chrisd',
'Andrewf',
'Joshuas',
'Daniel',
'Danh',
'Duncanr',
'Robertt',
'Chrisdu',
'Brettm',
'Mathieud',
'Ianco',
'Larryp',
'Grantleys',
'Aliceh',
'Mattb',
'Tobyt',
'Benm',
'Jamess',
'Douga',
'Gabil',
'Francoisf',
'Lluna',
]
# Example export filename; main() is driven per-file from __main__ below.
xml_name = 'WRL-20200930161350.xml'
def main(xml_name):
    ns = {'mediawiki': 'http://www.mediawiki.org/xml/export-0.3/'}
    root = etree.parse(xml_name)
    pages = root.xpath('//mediawiki:page', namespaces=ns)
    revisions = []
    for page in pages:
        title = page.xpath('./mediawiki:title', namespaces=ns)[0].text
        for revision in page.xpath('./mediawiki:revision', namespaces=ns):
            timestamp = revision.xpath('./mediawiki:timestamp',
                                       namespaces=ns)[0].text
            # Take the contributor's <username> (or <ip>) via an explicit
            # xpath rather than positional getchildren() indexing, which
            # breaks if the element order inside <revision> varies.
            contributor = revision.xpath('./mediawiki:contributor/*',
                                         namespaces=ns)[0].text
            revisions.append({
                'page': title,
                'user': contributor,
                'date': str(timestamp)
            })
    df = pd.DataFrame(revisions)
    df['date'] = pd.to_datetime(df['date'])
    df = df.set_index('date').sort_index()
    # Convert to local time, and round to nearest day
    df.index = df.index.tz_convert('Australia/Sydney')
    df.index = df.index.round('d')
    # Drop duplicate entries (ignore edits for same user/same page/same day)
    df = df.drop_duplicates()
    # Only include non-academic users
    df = df[df['user'].isin(PROJECTS_TEAM)]
    # Get list of years
    years = df.index.year.unique()
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    summary = []
    for year in years:
        idx = df.index.year == year
        pages = df[idx].groupby('page').count()
        pages = pages.rename(columns={'user': 'edits'})
        pages = pages.sort_values('edits', ascending=False)
        users = df[idx].groupby('user').count()
        users = users.rename(columns={'page': 'edits'})
        users = users.sort_values('edits', ascending=False)
        pages.to_csv(f'{OUTPUT_DIR}/{year}-pages.csv')
        users.to_csv(f'{OUTPUT_DIR}/{year}-users.csv')
        summary.append({
            'year': year,
            'page edits': pages.shape[0],   # distinct pages edited this year
            'active users': users.shape[0]  # distinct users active this year
        })
    summary = pd.DataFrame(summary)
    summary = summary.set_index('year')
    fig, ax = plt.subplots(2,
                           1,
                           figsize=(6, 4),
                           sharex=True,
                           gridspec_kw={'hspace': 0.5})
    summary[['page edits']].plot.bar(ax=ax[0], legend=False)
    summary[['active users']].plot.bar(ax=ax[1], legend=False)
    # Annotate each bar with its value
    for j, (year, row) in enumerate(summary.iterrows()):
        ax[0].annotate(row['page edits'],
                       xy=(j, row['page edits']),
                       xytext=(0, 6),
                       textcoords='offset pixels',
                       ha='center',
                       fontsize=8)
        ax[1].annotate(row['active users'],
                       xy=(j, row['active users']),
                       xytext=(0, 6),
                       textcoords='offset pixels',
                       ha='center',
                       fontsize=8)
    ax[0].set_title('Page edits', fontsize=10, y=0.9)
    ax[1].set_title('Active users', fontsize=10, y=0.9)
    ax[1].set_xlabel('')
    ax[0].set_ylabel('Count', labelpad=10)
    ax[1].set_ylabel('Count', labelpad=10)
    for a in ax.ravel():
        a.spines['top'].set_visible(False)
        a.spines['right'].set_visible(False)
    png_name = xml_name.replace('.xml', '.png')
    plt.savefig(png_name, bbox_inches='tight', dpi=300)


if __name__ == '__main__':
    xml_names = [f for f in os.listdir('.') if f.endswith('.xml')]
    for xml_name in xml_names:
        main(xml_name)
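
For reference, the xpath expressions in `main()` assume the export-0.3 schema, roughly the shape shown in this trimmed, hypothetical sample:

```python
from lxml import etree

# Trimmed, hypothetical export-0.3 document illustrating the structure
# main() walks: each <page> has a <title> and one or more <revision>
# elements holding a <timestamp> and a <contributor>.
sample = b'''<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.3/">
  <page>
    <title>Main Page</title>
    <revision>
      <timestamp>2020-09-30T06:13:50Z</timestamp>
      <contributor><username>Danh</username></contributor>
    </revision>
  </page>
</mediawiki>'''

ns = {'mediawiki': 'http://www.mediawiki.org/xml/export-0.3/'}
root = etree.fromstring(sample)
page = root.xpath('//mediawiki:page', namespaces=ns)[0]
print(page.xpath('./mediawiki:title', namespaces=ns)[0].text)  # Main Page
```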