From 5e4c5e0361186dbf39f5be5ca60a1edfec6dcea1 Mon Sep 17 00:00:00 2001 From: Dan Howe Date: Thu, 1 Oct 2020 18:33:10 +0200 Subject: [PATCH] Parse xml file with mwxml package --- wiki_stats.py | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/wiki_stats.py b/wiki_stats.py index 90af9fa..85796b3 100644 --- a/wiki_stats.py +++ b/wiki_stats.py @@ -3,6 +3,7 @@ import re import pandas as pd from lxml import etree import matplotlib.pyplot as plt +import mwxml # pip install mwxml OUTPUT_DIR = 'csv' PROJECTS_TEAM = [ # Oderered by first entry @@ -68,25 +69,21 @@ PROJECTS_TEAM = [ # Oderered by first entry ] - def main(xml_name): - ns = {'mediawiki': 'http://www.mediawiki.org/xml/export-0.3/'} - root = etree.parse(xml_name) - pages = root.xpath('//mediawiki:page', namespaces=ns) - revisions = [] - for page in pages: - title = page.xpath('./mediawiki:title', namespaces=ns)[0].text - for revision in page.xpath('./mediawiki:revision', namespaces=ns): - timestamp = revision.xpath('./mediawiki:timestamp', - namespaces=ns)[0].text - contributor = revision.getchildren()[2].getchildren()[0].text - - revisions.append({ - 'page': title, - 'user': contributor, - 'date': str(timestamp) - }) + with open(xml_name, 'rb') as f: + pages = mwxml.Dump.from_file(f) + for page in pages: + title = page.title + for revision in page: + timestamp = str(revision.timestamp) + contributor = revision.user.text + + revisions.append({ + 'page': title, + 'user': contributor, + 'date': timestamp + }) df = pd.DataFrame(revisions) df['date'] = pd.to_datetime(df['date'])