Downsample if required

master
Dan Howe 6 years ago
parent 870da28f00
commit c081c9f69d

@ -249,6 +249,12 @@ def extract_records(input_dir, output_dir, clean_up=False):
# Prepare output directory
os.makedirs(output_dir, exist_ok=True)
# Create master dataframe
periods = ['all', 'weekly']
master = {}
for period in periods:
master[period] = pd.DataFrame()
for zip_name in tqdm(zip_names):
# Skip duplicate downloads
if re.search('\([0-9]+\)', zip_name):
@ -278,13 +284,8 @@ def extract_records(input_dir, output_dir, clean_up=False):
bore_id = re.search('^\S+', meta).group()
site, hole, pipe = bore_id.split('.')
# Add bore specifics to dataframe
df['Site'] = site
df['Hole'] = hole
df['Pipe'] = pipe
df['Lat'] = lat
df['Lon'] = lon
df['Elev'] = elev
# FIXME: detect basin automatically
basin_id = 'MB'
# Rename columns
df = df.rename(
@ -296,23 +297,47 @@ def extract_records(input_dir, output_dir, clean_up=False):
# Select output columns
df = df[[
'Site',
'Hole',
'Pipe',
'Date time',
'Below Measuring Point',
'Above Sea Level',
'Lat',
'Lon',
'Elev',
]]
# Get csv name from zip archive
csv_name = os.path.join(output_dir, zip_name.replace('.zip', '.csv'))
# Set date index for resampling
df.index = df['Date time']
# Export to csv
df.to_csv(csv_name, float_format='%0.3f')
# Append to master dataframe
for period in periods:
if period == 'weekly':
# Resample to weekly timestamps
df = df.resample('1w').mean()
df['Date time'] = df.index
# Add bore specifics to dataframe
df['Site'] = site
df['Hole'] = hole
df['Pipe'] = pipe
df['Lat'] = lat
df['Lon'] = lon
df['Elev'] = elev
df['Basin'] = basin_id
master[period] = pd.concat([master[period], df])
if clean_up:
# Remove original zip archive
os.remove(os.path.join(input_dir, zip_name))
for period in periods:
# Set column order
master[period] = master[period][[
'Date time', 'Basin', 'Site', 'Hole', 'Pipe',
'Below Measuring Point', 'Above Sea Level', 'Lat', 'Lon', 'Elev'
]]
# Get latest date from dataframe
latest_date = master[period]['Date time'].iloc[-1].strftime('%Y-%m-%d')
csv_name = os.path.join(
output_dir, '{}-{}-{}.csv'.format(basin_id, latest_date, period))
# Export to csv
master[period].to_csv(csv_name, index=False, float_format='%0.3f')

Loading…
Cancel
Save