First commit

5 years ago · d020979867
commit d020979867
4 changed files with 219 additions and 0 deletions
--- a/README.md
+++ b/README.md
@ -0,0 +1,22 @@
 # wiki-usage
 Generate usage stats on a mediawiki site.
 ## Usage
 1.  Get a list of all wiki pages by running the script `wiki_pages.py` (the page titles are saved to `pages.txt`).
 2.  Go to this wiki page:  
    <http://wiki.wrl.unsw.edu.au/index.php/Special:Export>
 3.  Paste the list of page titles.
 4.  Uncheck the box 'Include only the current revision, not the full history'.
 5.  Click export.
 ![](docs/export.png)
 6.  Move the downloaded XML file into the same folder as `wiki_stats.py`, and run the script.
 ## Sample output
 ![](docs/stats.png)
--- a/docs/stats.png
+++ b/docs/stats.png
--- a/wiki_pages.py
+++ b/wiki_pages.py
@ -0,0 +1,19 @@
 import io
 import requests
 import getpass
 from lxml import html
 # Get login details (the password is read without echoing it)
 username = input('Username: ')
 password = getpass.getpass('Password: ')

 # Get list of all pages
 url = 'http://wiki.wrl.unsw.edu.au/index.php'
 page = requests.get(url + '/Special:Allpages',
                     auth=(username, password),
                     timeout=30)  # requests waits forever without a timeout
 # Fail loudly on a bad login or server error instead of scraping the error
 # page and writing its links to pages.txt as if they were page titles.
 page.raise_for_status()
 tree = html.parse(io.BytesIO(page.content))

 # Save page names, one per line.
 # The first matching link is skipped (presumably it is not a page title --
 # TODO confirm against the Special:Allpages markup).
 elements = tree.xpath('*//td/a')[1:]
 # Explicit UTF-8 so titles with non-ASCII characters can always be written,
 # whatever the platform's default encoding is.
 with open('pages.txt', 'w', encoding='utf-8') as f:
     # Links with no direct text (e.text is None) are skipped; concatenating
     # None with '\n' would raise a TypeError.
     f.writelines(e.text + '\n' for e in elements if e.text)
--- a/wiki_stats.py
+++ b/wiki_stats.py
@ -0,0 +1,178 @@
 import os
 import re
 import pandas as pd
 from lxml import etree
 import matplotlib.pyplot as plt
 # Directory that main() writes the per-year CSV summaries into.
 OUTPUT_DIR = 'csv'
 # Wiki usernames whose edits are included in the statistics; main() discards
 # revisions by any user not listed here. The membership test is
 # case-sensitive, so 'Robert' and 'ROBERT' are distinct entries.
 PROJECTS_TEAM = [  # Ordered by first entry
    'Anderson',
    'Robert',
    'Brett',
    'Conrad',
    'Matt',
    'Duncan',
    'Ianc',
    'William',
    'Laurent',
    'Alexandra',
    'Wendy',
    'Alessio',
    'Luis',
    'Grantley',
    'Beatrice',
    'Sarah',
    'Rossm',
    'ROBERT',
    'Steven',
    'Wendyth',
    'Melissa',
    'Andy',
    'Michael',
    'Msa',
    'Jamie',
    'Toms',
    'Sam',
    'Larry',
    'Annakg',
    'Hamish',
    'Francois',
    'Annab',
    'Erica',
    'Coral',
    'Priom',
    'Barry',
    'Nathan',
    'Chrisd',
    'Andrewf',
    'Joshuas',
    'Daniel',
    'Danh',
    'Duncanr',
    'Robertt',
    'Chrisdu',
    'Brettm',
    'Mathieud',
    'Ianco',
    'Larryp',
    'Grantleys',
    'Aliceh',
    'Mattb',
    'Tobyt',
    'Benm',
    'Jamess',
    'Douga',
    'Gabil',
    'Francoisf',
    'Lluna',
 ]
 # NOTE(review): this module-level value looks unused -- main() takes xml_name
 # as a parameter and the __main__ loop rebinds the name for every XML file
 # found in the current directory. Confirm before removing.
 xml_name = 'WRL-20200930161350.xml'
 def main(xml_name):
     """Summarise yearly wiki activity from a MediaWiki XML export.

     Reads every revision in the export, keeps at most one edit per
     user/page/local day, restricts the data to users in PROJECTS_TEAM and
     then, for each year:

     * writes ``<OUTPUT_DIR>/<year>-pages.csv`` (edits per page) and
       ``<OUTPUT_DIR>/<year>-users.csv`` (edits per user);
     * adds the number of edited pages and of active users to a summary
       that is plotted as two bar charts.

     The figure is saved next to the export, with the file extension
     replaced by ``.png``.

     Args:
         xml_name: Path of the XML file produced by Special:Export (full
             history, export schema version 0.3).
     """
     ns = {'mediawiki': 'http://www.mediawiki.org/xml/export-0.3/'}
     root = etree.parse(xml_name)

     revisions = []
     for page in root.xpath('//mediawiki:page', namespaces=ns):
         title = page.xpath('./mediawiki:title', namespaces=ns)[0].text
         for revision in page.xpath('./mediawiki:revision', namespaces=ns):
             timestamp = revision.xpath('./mediawiki:timestamp',
                                        namespaces=ns)[0].text
             # Look the contributor up by tag name instead of by child
             # position, and skip revisions whose <contributor> is missing
             # or empty (they cannot be attributed to anyone).
             contributor = revision.find('mediawiki:contributor',
                                         namespaces=ns)
             if contributor is None or len(contributor) == 0:
                 continue
             revisions.append({
                 'page': title,
                 # First child of <contributor>, as in the original lookup
                 'user': contributor[0].text,
                 'date': str(timestamp)
             })

     if not revisions:
         print(f'{xml_name}: no revisions found, nothing to do')
         return

     df = pd.DataFrame(revisions)

     # Convert to local time and truncate to the calendar day. Rounding to
     # the *nearest* day would move afternoon edits to the following day
     # (and 31 December afternoon edits into the following year).
     local_time = pd.to_datetime(df['date'], utc=True)
     local_time = local_time.dt.tz_convert('Australia/Sydney')
     df['date'] = local_time.dt.normalize()

     # Drop duplicate entries (ignore edits for same user/same page/same day).
     # This must happen while 'date' is still a column: drop_duplicates()
     # ignores the index, so de-duplicating after set_index('date') would keep
     # only one edit per user/page over the whole history.
     df = df.drop_duplicates()

     # Only include non-academic users
     df = df[df['user'].isin(PROJECTS_TEAM)]
     if df.empty:
         print(f'{xml_name}: no revisions by PROJECTS_TEAM users')
         return

     df = df.set_index('date').sort_index()

     # Get list of years
     years = df.index.year.unique()

     os.makedirs(OUTPUT_DIR, exist_ok=True)

     summary = []
     for year in years:
         in_year = df[df.index.year == year]

         page_edits = in_year.groupby('page').count()
         page_edits = page_edits.rename(columns={'user': 'edits'})
         page_edits = page_edits.sort_values('edits', ascending=False)

         user_edits = in_year.groupby('user').count()
         user_edits = user_edits.rename(columns={'page': 'edits'})
         user_edits = user_edits.sort_values('edits', ascending=False)

         page_edits.to_csv(f'{OUTPUT_DIR}/{year}-pages.csv')
         user_edits.to_csv(f'{OUTPUT_DIR}/{year}-users.csv')

         summary.append({
             'year': year,
             'page edits': page_edits.shape[0],
             'active users': user_edits.shape[0]
         })

     summary = pd.DataFrame(summary)
     summary = summary.set_index('year')

     fig, ax = plt.subplots(2,
                            1,
                            figsize=(6, 4),
                            sharex=True,
                            gridspec_kw={'hspace': 0.5})

     columns = ('page edits', 'active users')
     for axis, column in zip(ax, columns):
         summary[[column]].plot.bar(ax=axis, legend=False)

     # Bars are drawn at integer positions 0..n-1, so the row position (not
     # the year) is the x coordinate of each value label.
     for position, (_, row) in enumerate(summary.iterrows()):
         for axis, column in zip(ax, columns):
             axis.annotate(row[column],
                           xy=(position, row[column]),
                           xytext=(0, 6),
                           textcoords='offset pixels',
                           ha='center',
                           fontsize=8)

     ax[0].set_title('Page edits', fontsize=10, y=0.9)
     ax[1].set_title('Active users', fontsize=10, y=0.9)

     ax[1].set_xlabel('')
     ax[0].set_ylabel('Count', labelpad=10)
     ax[1].set_ylabel('Count', labelpad=10)

     for a in ax.ravel():
         a.spines['top'].set_visible(False)
         a.spines['right'].set_visible(False)

     # Swap only the extension; str.replace would also rewrite any other
     # '.xml' occurring inside the file name.
     png_name = os.path.splitext(xml_name)[0] + '.png'
     fig.savefig(png_name, bbox_inches='tight', dpi=300)
     # Release the figure: main() is called once per export file and pyplot
     # keeps every figure alive until it is closed.
     plt.close(fig)
 if __name__ == '__main__':
     # Build the statistics for every XML export in the working directory.
     for xml_name in filter(lambda entry: entry.endswith('.xml'),
                            os.listdir('.')):
         main(xml_name)