From 511de588aac718a4722ae15f21ae1205bdf60b90 Mon Sep 17 00:00:00 2001 From: Dan Howe Date: Wed, 27 Feb 2019 15:45:53 +1100 Subject: [PATCH] Add basin code and bore details to output --- waternsw_grabber/waternsw_grabber.py | 60 +++++++++++++++------------- 1 file changed, 33 insertions(+), 27 deletions(-) diff --git a/waternsw_grabber/waternsw_grabber.py b/waternsw_grabber/waternsw_grabber.py index 8883996..b926f9e 100644 --- a/waternsw_grabber/waternsw_grabber.py +++ b/waternsw_grabber/waternsw_grabber.py @@ -363,6 +363,9 @@ def extract_definitions(input_dir, output_dir): variables.to_csv(csv_name_v) qualities.to_csv(csv_name_q) + sites = sites[~sites.index.duplicated(keep='first')] + return sites + def extract_records(input_dir, output_dir, clean_up=False): """Extract downloaded bore records. @@ -374,26 +377,14 @@ def extract_records(input_dir, output_dir, clean_up=False): """ # Update definition tables - extract_definitions(input_dir, output_dir) + sites = extract_definitions(input_dir, output_dir) - # Get basin info for telemetered site data - csv_name = os.path.join( - os.path.dirname(os.path.dirname(__file__)), 'data', - 'telemetered-sites.csv') - basins = pd.read_csv(csv_name, index_col=0) + # Keep unique basin codes + basin_codes = sites['Basin code'].unique() # Find zip files zip_names = [f for f in os.listdir(input_dir) if f.endswith('.zip')] - # Get basin IDs for all zip files - basin_codes = [] - for zip_name in zip_names: - bore_id = os.path.splitext(zip_name)[0].replace('cf', '') - basin_codes.append(basins.loc[bore_id, 'Basin code']) - - # Keep uniue basin codes - basin_codes = list(set(basin_codes)) - # Prepare output directory os.makedirs(output_dir, exist_ok=True) @@ -441,13 +432,15 @@ def extract_records(input_dir, output_dir, clean_up=False): parse_dates=['Date'], index_col=['Date'], compression='zip', - dayfirst=True, - nrows=100) + dayfirst=True) - # FIXME: convert quality codes to integers + # Convert quality codes to integers + for col in df.columns: + if 'Quality' in col: + df[col] = df[col].astype(int) # Update column names - df.columns = columns + ['Metadata'] + df.columns = columns + ['Metadata'] # Get bore specifics meta = df['Metadata'].iloc[1] @@ -456,21 +449,31 @@ def extract_records(input_dir, output_dir, clean_up=False): df = df.drop(columns='Metadata') # Get basin ID - basin_code = basins.loc[bore_id, 'Basin code'] + basin_code = sites.loc[bore_id, 'Basin code'] - # Append to master dataframe + # Resample if necessary for period in periods: if period == 'daily': # Resample to daily timestamps df = df.resample('1d').mean() - # FIXME: add bore IDs elif period == 'weekly': # Resample to weekly timestamps df = df.resample('1w').mean() - # FIXME: add bore IDs - master[basin_code][period] = pd.concat([master[basin_code][period], df]) + # Add specific borehole details + df['Site'] = sites.loc[bore_id, 'Site'] + df['Hole'] = sites.loc[bore_id, 'Hole'] + df['Pipe'] = sites.loc[bore_id, 'Pipe'] + df['Basin'] = sites.loc[bore_id, 'Basin code'] + df = df[['Site', 'Hole', 'Pipe', 'Basin'] + columns] + + # Remove empty rows + df = df.dropna() + + # Add to master dataframe + master[basin_code][period] = pd.concat( + [master[basin_code][period], df]) if clean_up: # Remove original zip archive @@ -479,9 +482,12 @@ def extract_records(input_dir, output_dir, clean_up=False): for basin_code in basin_codes: for period in periods: # Get latest date from dataframe - latest_date = master[basin_code][period].index[-1].strftime('%Y-%m-%d') + latest_date = master[basin_code][period].index[-1].strftime( + '%Y-%m-%d') csv_name = os.path.join( - output_dir, '{}-{}-{}.csv'.format(basin_code, latest_date, period)) + output_dir, '{}-{}-{}.csv'.format(basin_code, latest_date, + period)) # Export to csv - master[basin_code][period].to_csv(csv_name, index=True, float_format='%0.3f') + master[basin_code][period].to_csv( + csv_name, index=True, float_format='%0.3f')