First commit
commit d020979867
README.md
@@ -0,0 +1,22 @@
# wiki-usage

Generate usage stats for a MediaWiki site.

## Usage

1. Get a list of all wiki pages with the script `wiki_pages.py`.

2. Go to this wiki page:
   <http://wiki.wrl.unsw.edu.au/index.php/Special:Export>

3. Paste the list of page titles.

4. Uncheck the box 'Include only the current revision, not the full history'.

5. Click export.

   ![export page](image)

6. Move the downloaded XML file into the same folder as `wiki-stats.py`, and run the script. A quick sanity check on the export is sketched below.
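
Before running the full script, it can help to confirm the download parses at all. A minimal sketch, assuming the export-0.3 schema that `wiki-stats.py` expects; the file name is just an example of the dated `WRL-*.xml` name the export produces:

```python
# Minimal sketch: confirm the exported XML parses, and count its contents.
# Assumes the export-0.3 schema used by wiki-stats.py; file name is an example.
from lxml import etree

ns = {'mw': 'http://www.mediawiki.org/xml/export-0.3/'}
root = etree.parse('WRL-20200930161350.xml')

pages = root.xpath('//mw:page', namespaces=ns)
revisions = root.xpath('//mw:revision', namespaces=ns)
print(f'{len(pages)} pages, {len(revisions)} revisions')
```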

## Sample output

![sample output](image)
Binary file not shown (image, 77 KiB).
wiki_pages.py
@@ -0,0 +1,19 @@
import io
import getpass

import requests
from lxml import html

# Get login details
username = input('Username: ')
password = getpass.getpass('Password: ')

# Get list of all pages
url = 'http://wiki.wrl.unsw.edu.au/index.php'
page = requests.get(url + '/Special:Allpages', auth=(username, password))
tree = html.parse(io.BytesIO(page.content))

# Save page names, one per line ([1:] skips the first matched link)
elements = tree.xpath('*//td/a')[1:]
with open('pages.txt', 'w') as f:
    for e in elements:
        f.write(e.text + '\n')
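
# pages.txt ends up with one page title per line, ready to paste into the
# wiki's Special:Export form (steps 2-3 of the README).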

wiki-stats.py
@@ -0,0 +1,178 @@
import os

import matplotlib.pyplot as plt
import pandas as pd
from lxml import etree

OUTPUT_DIR = 'csv'
PROJECTS_TEAM = [  # Ordered by first entry
    'Anderson',
    'Robert',
    'Brett',
    'Conrad',
    'Matt',
    'Duncan',
    'Ianc',
    'William',
    'Laurent',
    'Alexandra',
    'Wendy',
    'Alessio',
    'Luis',
    'Grantley',
    'Beatrice',
    'Sarah',
    'Rossm',
    'ROBERT',
    'Steven',
    'Wendyth',
    'Melissa',
    'Andy',
    'Michael',
    'Msa',
    'Jamie',
    'Toms',
    'Sam',
    'Larry',
    'Annakg',
    'Hamish',
    'Francois',
    'Annab',
    'Erica',
    'Coral',
    'Priom',
    'Barry',
    'Nathan',
    'Chrisd',
    'Andrewf',
    'Joshuas',
    'Daniel',
    'Danh',
    'Duncanr',
    'Robertt',
    'Chrisdu',
    'Brettm',
    'Mathieud',
    'Ianco',
    'Larryp',
    'Grantleys',
    'Aliceh',
    'Mattb',
    'Tobyt',
    'Benm',
    'Jamess',
    'Douga',
    'Gabil',
    'Francoisf',
    'Lluna',
]
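
# Note: the isin() filter in main() matches these names exactly and
# case-sensitively, so variants such as 'Robert' and 'ROBERT' must be
# listed separately.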


xml_name = 'WRL-20200930161350.xml'  # example export name (unused; __main__ scans for .xml files)


def main(xml_name):
    ns = {'mediawiki': 'http://www.mediawiki.org/xml/export-0.3/'}
    root = etree.parse(xml_name)
    pages = root.xpath('//mediawiki:page', namespaces=ns)
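
    # Each <page> element in the dump looks roughly like this (abridged):
    #   <page>
    #     <title>...</title>
    #     <revision>
    #       <timestamp>2020-09-30T06:13:50Z</timestamp>
    #       <contributor><username>...</username></contributor>
    #     </revision>
    #     ...
    #   </page>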

    revisions = []
    for page in pages:
        title = page.xpath('./mediawiki:title', namespaces=ns)[0].text
        for revision in page.xpath('./mediawiki:revision', namespaces=ns):
            timestamp = revision.xpath('./mediawiki:timestamp',
                                       namespaces=ns)[0].text
            # The contributor holds either a <username> or an <ip> child
            contributor = revision.xpath('./mediawiki:contributor/*',
                                         namespaces=ns)[0].text

            revisions.append({
                'page': title,
                'user': contributor,
                'date': str(timestamp)
            })

    df = pd.DataFrame(revisions)
    df['date'] = pd.to_datetime(df['date'])
    df = df.set_index('date').sort_index()

    # Convert to local time, and round to nearest day
    df.index = df.index.tz_convert('Australia/Sydney')
    df.index = df.index.round('d')

    # Drop duplicate entries (ignore repeat edits by the same user to the
    # same page on the same day). The rounded date has to take part in the
    # duplicate check, so pull the index in as a column first.
    df = df.reset_index().drop_duplicates().set_index('date')

    # Only include non-academic users
    df = df[df['user'].isin(PROJECTS_TEAM)]

    # Get list of years
    years = df.index.year.unique()

    os.makedirs(OUTPUT_DIR, exist_ok=True)

    summary = []
    for year in years:
        idx = df.index.year == year

        # Per-page and per-user edit counts for the year
        pages = df[idx].groupby('page').count()
        pages = pages.rename(columns={'user': 'edits'})
        pages = pages.sort_values('edits', ascending=False)

        users = df[idx].groupby('user').count()
        users = users.rename(columns={'page': 'edits'})
        users = users.sort_values('edits', ascending=False)

        pages.to_csv(f'{OUTPUT_DIR}/{year}-pages.csv')
        users.to_csv(f'{OUTPUT_DIR}/{year}-users.csv')
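
        # Each CSV has a single 'edits' column, indexed by page title or
        # username respectively, sorted from most to least edited.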

        summary.append({
            'year': year,
            'page edits': pages.shape[0],   # number of distinct pages edited
            'active users': users.shape[0]  # number of distinct active users
        })

    summary = pd.DataFrame(summary)
    summary = summary.set_index('year')

    fig, ax = plt.subplots(2,
                           1,
                           figsize=(6, 4),
                           sharex=True,
                           gridspec_kw={'hspace': 0.5})

    summary[['page edits']].plot.bar(ax=ax[0], legend=False)
    summary[['active users']].plot.bar(ax=ax[1], legend=False)

    # Label each bar with its value
    for j, (_, row) in enumerate(summary.iterrows()):
        ax[0].annotate(row['page edits'],
                       xy=(j, row['page edits']),
                       xytext=(0, 6),
                       textcoords='offset pixels',
                       ha='center',
                       fontsize=8)

        ax[1].annotate(row['active users'],
                       xy=(j, row['active users']),
                       xytext=(0, 6),
                       textcoords='offset pixels',
                       ha='center',
                       fontsize=8)

    ax[0].set_title('Page edits', fontsize=10, y=0.9)
    ax[1].set_title('Active users', fontsize=10, y=0.9)

    ax[1].set_xlabel('')
    ax[0].set_ylabel('Count', labelpad=10)
    ax[1].set_ylabel('Count', labelpad=10)

    for a in ax.ravel():
        a.spines['top'].set_visible(False)
        a.spines['right'].set_visible(False)

    png_name = xml_name.replace('.xml', '.png')

    plt.savefig(png_name, bbox_inches='tight', dpi=300)
    plt.close(fig)  # main() runs once per export file, so release each figure


if __name__ == '__main__':
    xml_names = [f for f in os.listdir('.') if f.endswith('.xml')]
    for xml_name in xml_names:
        main(xml_name)