Fix bug where dates were not included when identifying duplicates

master
Dan Howe 4 years ago
parent 5e4c5e0361
commit 83f62d2820

@ -86,15 +86,16 @@ def main(xml_name):
})
df = pd.DataFrame(revisions)
df['date'] = pd.to_datetime(df['date'])
df = df.set_index('date').sort_index()
df.index = pd.to_datetime(df['date'])
# Convert to local time, and round to nearest day
df.index = df.index.tz_convert('Australia/Sydney')
df.index = df.index.round('d')
df = df.tz_convert('Australia/Sydney')
df = df.sort_index()
df['date'] = df.index.date
# Drop duplicate entries (ignore edits for same user/same page/same day)
df = df.drop_duplicates()
df = df.drop(columns=['date'])
# Only include non-academic users
df = df[df['user'].isin(PROJECTS_TEAM)]

Loading…
Cancel
Save