@@ -3,6 +3,7 @@ import re
 import pandas as pd
 from lxml import etree
 import matplotlib.pyplot as plt
+import mwxml  # pip install mwxml
 
 OUTPUT_DIR = 'csv'
 PROJECTS_TEAM = [  # Ordered by first entry
@@ -68,21 +69,22 @@ PROJECTS_TEAM = [  # Ordered by first entry
 ]
 
 
 def main(xml_name):
-    ns = {'mediawiki': 'http://www.mediawiki.org/xml/export-0.3/'}
-    root = etree.parse(xml_name)
-    pages = root.xpath('//mediawiki:page', namespaces=ns)
     revisions = []
-    for page in pages:
-        title = page.xpath('./mediawiki:title', namespaces=ns)[0].text
-        for revision in page.xpath('./mediawiki:revision', namespaces=ns):
-            timestamp = revision.xpath('./mediawiki:timestamp',
-                                       namespaces=ns)[0].text
-            contributor = revision.getchildren()[2].getchildren()[0].text
-            revisions.append({
-                'page': title,
-                'user': contributor,
-                'date': str(timestamp)
-            })
+    with open(xml_name, 'rb') as f:
+        pages = mwxml.Dump.from_file(f)
+        # mwxml yields pages and revisions lazily, so the loops must run
+        # while the file is still open
+        for page in pages:
+            title = page.title
+            for revision in page:
+                timestamp = str(revision.timestamp)
+                # revision.user is None when the contributor is suppressed
+                contributor = revision.user.text if revision.user else None
+                revisions.append({
+                    'page': title,
+                    'user': contributor,
+                    'date': timestamp
+                })
 
     df = pd.DataFrame(revisions)
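
For reference, a standalone sketch of the mwxml pattern the new code relies
on (the 'dump.xml' filename is illustrative): Dump.from_file() yields Page
objects, and iterating a Page yields its Revision objects.

    import mwxml

    with open('dump.xml', 'rb') as f:
        # keep the file open: pages and revisions are parsed on demand
        for page in mwxml.Dump.from_file(f):
            for revision in page:
                print(page.title,
                      str(revision.timestamp),
                      revision.user.text if revision.user else None)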