diff --git a/waternsw_grabber/waternsw_grabber.py b/waternsw_grabber/waternsw_grabber.py index 0575048..96577be 100644 --- a/waternsw_grabber/waternsw_grabber.py +++ b/waternsw_grabber/waternsw_grabber.py @@ -249,6 +249,12 @@ def extract_records(input_dir, output_dir, clean_up=False): # Prepare output directory os.makedirs(output_dir, exist_ok=True) + # Create master dataframe + periods = ['all', 'weekly'] + master = {} + for period in periods: + master[period] = pd.DataFrame() + for zip_name in tqdm(zip_names): # Skip duplicate downloads if re.search('\([0-9]+\)', zip_name): @@ -278,13 +284,8 @@ def extract_records(input_dir, output_dir, clean_up=False): bore_id = re.search('^\S+', meta).group() site, hole, pipe = bore_id.split('.') - # Add bore specifics to dataframe - df['Site'] = site - df['Hole'] = hole - df['Pipe'] = pipe - df['Lat'] = lat - df['Lon'] = lon - df['Elev'] = elev + # FIXME: detect basin automatically + basin_id = 'MB' # Rename columns df = df.rename( @@ -296,23 +297,47 @@ def extract_records(input_dir, output_dir, clean_up=False): # Select output columns df = df[[ - 'Site', - 'Hole', - 'Pipe', 'Date time', 'Below Measuring Point', 'Above Sea Level', - 'Lat', - 'Lon', - 'Elev', ]] - # Get csv name from zip archive - csv_name = os.path.join(output_dir, zip_name.replace('.zip', '.csv')) + # Set date index for resampling + df.index = df['Date time'] - # Export to csv - df.to_csv(csv_name, float_format='%0.3f') + # Append to master dataframe + for period in periods: + if period == 'weekly': + # Resample to weekly timestamps + df = df.resample('1w').mean() + df['Date time'] = df.index + + # Add bore specifics to dataframe + df['Site'] = site + df['Hole'] = hole + df['Pipe'] = pipe + df['Lat'] = lat + df['Lon'] = lon + df['Elev'] = elev + df['Basin'] = basin_id + + master[period] = pd.concat([master[period], df]) if clean_up: # Remove original zip archive os.remove(os.path.join(input_dir, zip_name)) + + for period in periods: + # Set column order + master[period] = master[period][[ + 'Date time', 'Basin', 'Site', 'Hole', 'Pipe', + 'Below Measuring Point', 'Above Sea Level', 'Lat', 'Lon', 'Elev' + ]] + + # Get latest date from dataframe + latest_date = master[period]['Date time'].iloc[-1].strftime('%Y-%m-%d') + csv_name = os.path.join( + output_dir, '{}-{}-{}.csv'.format(basin_id, latest_date, period)) + + # Export to csv + master[period].to_csv(csv_name, index=False, float_format='%0.3f')