Parse xml file with mwxml package

master
Dan Howe 4 years ago
parent 1e595e2dff
commit 5e4c5e0361

@ -3,6 +3,7 @@ import re
import pandas as pd import pandas as pd
from lxml import etree from lxml import etree
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import mwxml # pip install mwxml
OUTPUT_DIR = 'csv' OUTPUT_DIR = 'csv'
PROJECTS_TEAM = [ # Ordered by first entry PROJECTS_TEAM = [ # Ordered by first entry
@ -68,24 +69,20 @@ PROJECTS_TEAM = [ # Ordered by first entry
] ]
def main(xml_name): def main(xml_name):
ns = {'mediawiki': 'http://www.mediawiki.org/xml/export-0.3/'}
root = etree.parse(xml_name)
pages = root.xpath('//mediawiki:page', namespaces=ns)
revisions = [] revisions = []
with open(xml_name, 'rb') as f:
pages = mwxml.Dump.from_file(f)
for page in pages: for page in pages:
title = page.xpath('./mediawiki:title', namespaces=ns)[0].text title = page.title
for revision in page.xpath('./mediawiki:revision', namespaces=ns): for revision in page:
timestamp = revision.xpath('./mediawiki:timestamp', timestamp = str(revision.timestamp)
namespaces=ns)[0].text contributor = revision.user.text
contributor = revision.getchildren()[2].getchildren()[0].text
revisions.append({ revisions.append({
'page': title, 'page': title,
'user': contributor, 'user': contributor,
'date': str(timestamp) 'date': timestamp
}) })
df = pd.DataFrame(revisions) df = pd.DataFrame(revisions)

Loading…
Cancel
Save