First commit
commit d020979867
README.md
@@ -0,0 +1,22 @@
# wiki-usage

Generate usage stats for a MediaWiki site.

## Usage

1. Get a list of all wiki pages with the script `wiki_pages.py`.

2. Go to this wiki page:
   <http://wiki.wrl.unsw.edu.au/index.php/Special:Export>

3. Paste the list of page titles.

4. Uncheck the box 'Include only the current revision, not the full history'.

5. Click export.

   ![export page](image)

6. Move the downloaded XML file into the same folder as `wiki-stats.py`, and run the script. A quick sanity check on the export is sketched below.
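
Before running the full script, it can help to confirm the download parses at all. A minimal sketch, assuming the export-0.3 schema that `wiki-stats.py` expects; the file name is just an example of the dated `WRL-*.xml` name the export produces:

```python
# Minimal sketch: confirm the exported XML parses, and count its contents.
# Assumes the export-0.3 schema used by wiki-stats.py; file name is an example.
from lxml import etree

ns = {'mw': 'http://www.mediawiki.org/xml/export-0.3/'}
root = etree.parse('WRL-20200930161350.xml')

pages = root.xpath('//mw:page', namespaces=ns)
revisions = root.xpath('//mw:revision', namespaces=ns)
print(f'{len(pages)} pages, {len(revisions)} revisions')
```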

## Sample output

![sample output](image)
Binary file not shown (image, 77 KiB).
wiki_pages.py
@@ -0,0 +1,19 @@
import io
import getpass

import requests
from lxml import html

# Get login details
username = input('Username: ')
password = getpass.getpass('Password: ')

# Get list of all pages
url = 'http://wiki.wrl.unsw.edu.au/index.php'
page = requests.get(url + '/Special:Allpages', auth=(username, password))
tree = html.parse(io.BytesIO(page.content))

# Save page names, one per line ([1:] skips the first matched link)
elements = tree.xpath('*//td/a')[1:]
with open('pages.txt', 'w') as f:
    for e in elements:
        f.write(e.text + '\n')
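
# pages.txt ends up with one page title per line, ready to paste into the
# wiki's Special:Export form (steps 2-3 of the README).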

wiki-stats.py
@@ -0,0 +1,178 @@
import os

import matplotlib.pyplot as plt
import pandas as pd
from lxml import etree

OUTPUT_DIR = 'csv'
PROJECTS_TEAM = [  # Ordered by first entry
    'Anderson',
    'Robert',
    'Brett',
    'Conrad',
    'Matt',
    'Duncan',
    'Ianc',
    'William',
    'Laurent',
    'Alexandra',
    'Wendy',
    'Alessio',
    'Luis',
    'Grantley',
    'Beatrice',
    'Sarah',
    'Rossm',
    'ROBERT',
    'Steven',
    'Wendyth',
    'Melissa',
    'Andy',
    'Michael',
    'Msa',
    'Jamie',
    'Toms',
    'Sam',
    'Larry',
    'Annakg',
    'Hamish',
    'Francois',
    'Annab',
    'Erica',
    'Coral',
    'Priom',
    'Barry',
    'Nathan',
    'Chrisd',
    'Andrewf',
    'Joshuas',
    'Daniel',
    'Danh',
    'Duncanr',
    'Robertt',
    'Chrisdu',
    'Brettm',
    'Mathieud',
    'Ianco',
    'Larryp',
    'Grantleys',
    'Aliceh',
    'Mattb',
    'Tobyt',
    'Benm',
    'Jamess',
    'Douga',
    'Gabil',
    'Francoisf',
    'Lluna',
]
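
# Note: the isin() filter in main() matches these names exactly and
# case-sensitively, so variants such as 'Robert' and 'ROBERT' must be
# listed separately.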


xml_name = 'WRL-20200930161350.xml'  # example export name (unused; __main__ scans for .xml files)


def main(xml_name):
    ns = {'mediawiki': 'http://www.mediawiki.org/xml/export-0.3/'}
    root = etree.parse(xml_name)
    pages = root.xpath('//mediawiki:page', namespaces=ns)
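
    # Each <page> element in the dump looks roughly like this (abridged):
    #   <page>
    #     <title>...</title>
    #     <revision>
    #       <timestamp>2020-09-30T06:13:50Z</timestamp>
    #       <contributor><username>...</username></contributor>
    #     </revision>
    #     ...
    #   </page>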

    revisions = []
    for page in pages:
        title = page.xpath('./mediawiki:title', namespaces=ns)[0].text
        for revision in page.xpath('./mediawiki:revision', namespaces=ns):
            timestamp = revision.xpath('./mediawiki:timestamp',
                                       namespaces=ns)[0].text
            # The contributor holds either a <username> or an <ip> child
            contributor = revision.xpath('./mediawiki:contributor/*',
                                         namespaces=ns)[0].text

            revisions.append({
                'page': title,
                'user': contributor,
                'date': str(timestamp)
            })

    df = pd.DataFrame(revisions)
    df['date'] = pd.to_datetime(df['date'])
    df = df.set_index('date').sort_index()

    # Convert to local time, and round to nearest day
    df.index = df.index.tz_convert('Australia/Sydney')
    df.index = df.index.round('d')

    # Drop duplicate entries (ignore repeat edits by the same user to the
    # same page on the same day). The rounded date has to take part in the
    # duplicate check, so pull the index in as a column first.
    df = df.reset_index().drop_duplicates().set_index('date')

    # Only include non-academic users
    df = df[df['user'].isin(PROJECTS_TEAM)]

    # Get list of years
    years = df.index.year.unique()

    os.makedirs(OUTPUT_DIR, exist_ok=True)

    summary = []
    for year in years:
        idx = df.index.year == year

        # Per-page and per-user edit counts for the year
        pages = df[idx].groupby('page').count()
        pages = pages.rename(columns={'user': 'edits'})
        pages = pages.sort_values('edits', ascending=False)

        users = df[idx].groupby('user').count()
        users = users.rename(columns={'page': 'edits'})
        users = users.sort_values('edits', ascending=False)

        pages.to_csv(f'{OUTPUT_DIR}/{year}-pages.csv')
        users.to_csv(f'{OUTPUT_DIR}/{year}-users.csv')
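
        # Each CSV has a single 'edits' column, indexed by page title or
        # username respectively, sorted from most to least edited.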

        summary.append({
            'year': year,
            'page edits': pages.shape[0],   # number of distinct pages edited
            'active users': users.shape[0]  # number of distinct active users
        })

    summary = pd.DataFrame(summary)
    summary = summary.set_index('year')

    fig, ax = plt.subplots(2,
                           1,
                           figsize=(6, 4),
                           sharex=True,
                           gridspec_kw={'hspace': 0.5})

    summary[['page edits']].plot.bar(ax=ax[0], legend=False)
    summary[['active users']].plot.bar(ax=ax[1], legend=False)

    # Label each bar with its value
    for j, (_, row) in enumerate(summary.iterrows()):
        ax[0].annotate(row['page edits'],
                       xy=(j, row['page edits']),
                       xytext=(0, 6),
                       textcoords='offset pixels',
                       ha='center',
                       fontsize=8)

        ax[1].annotate(row['active users'],
                       xy=(j, row['active users']),
                       xytext=(0, 6),
                       textcoords='offset pixels',
                       ha='center',
                       fontsize=8)

    ax[0].set_title('Page edits', fontsize=10, y=0.9)
    ax[1].set_title('Active users', fontsize=10, y=0.9)

    ax[1].set_xlabel('')
    ax[0].set_ylabel('Count', labelpad=10)
    ax[1].set_ylabel('Count', labelpad=10)

    for a in ax.ravel():
        a.spines['top'].set_visible(False)
        a.spines['right'].set_visible(False)

    png_name = xml_name.replace('.xml', '.png')

    plt.savefig(png_name, bbox_inches='tight', dpi=300)
    plt.close(fig)  # main() runs once per export file, so release each figure


if __name__ == '__main__':
    xml_names = [f for f in os.listdir('.') if f.endswith('.xml')]
    for xml_name in xml_names:
        main(xml_name)