From 83f62d2820a24ef0a49de8c473178629037d8d6c Mon Sep 17 00:00:00 2001 From: Dan Howe Date: Thu, 1 Oct 2020 18:34:26 +0200 Subject: [PATCH] Fix bug where dates were not included when identifying duplicates --- wiki_stats.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/wiki_stats.py b/wiki_stats.py index 85796b3..78e6b65 100644 --- a/wiki_stats.py +++ b/wiki_stats.py @@ -86,15 +86,16 @@ def main(xml_name): }) df = pd.DataFrame(revisions) - df['date'] = pd.to_datetime(df['date']) - df = df.set_index('date').sort_index() + df.index = pd.to_datetime(df['date']) # Convert to local time, and round to nearest day - df.index = df.index.tz_convert('Australia/Sydney') - df.index = df.index.round('d') + df = df.tz_convert('Australia/Sydney') + df = df.sort_index() + df['date'] = df.index.date # Drop duplicate entries (ignore edits for same user/same page/same day) df = df.drop_duplicates() + df = df.drop(columns=['date']) # Only include non-academic users df = df[df['user'].isin(PROJECTS_TEAM)]