Parse xml file with mwxml package

master
Dan Howe 4 years ago
parent 1e595e2dff
commit 5e4c5e0361

@ -3,6 +3,7 @@ import re
import pandas as pd
from lxml import etree
import matplotlib.pyplot as plt
import mwxml # pip install mwxml
OUTPUT_DIR = 'csv'
PROJECTS_TEAM = [ # Ordered by first entry
@ -68,25 +69,21 @@ PROJECTS_TEAM = [ # Ordered by first entry
]
def main(xml_name):
ns = {'mediawiki': 'http://www.mediawiki.org/xml/export-0.3/'}
root = etree.parse(xml_name)
pages = root.xpath('//mediawiki:page', namespaces=ns)
revisions = []
for page in pages:
title = page.xpath('./mediawiki:title', namespaces=ns)[0].text
for revision in page.xpath('./mediawiki:revision', namespaces=ns):
timestamp = revision.xpath('./mediawiki:timestamp',
namespaces=ns)[0].text
contributor = revision.getchildren()[2].getchildren()[0].text
revisions.append({
'page': title,
'user': contributor,
'date': str(timestamp)
})
with open(xml_name, 'rb') as f:
pages = mwxml.Dump.from_file(f)
for page in pages:
title = page.title
for revision in page:
timestamp = str(revision.timestamp)
contributor = revision.user.text
revisions.append({
'page': title,
'user': contributor,
'date': timestamp
})
df = pd.DataFrame(revisions)
df['date'] = pd.to_datetime(df['date'])

Loading…
Cancel
Save