diff --git a/waternsw_grabber/waternsw_grabber.py b/waternsw_grabber/waternsw_grabber.py
index 19a4fa6..8883996 100644
--- a/waternsw_grabber/waternsw_grabber.py
+++ b/waternsw_grabber/waternsw_grabber.py
@@ -262,7 +262,7 @@ def extract_definitions(input_dir, output_dir):
     # Prepare output directory
     os.makedirs(output_dir, exist_ok=True)
 
-    for zip_name in tqdm(zip_names):
+    for zip_name in zip_names:
         # Skip duplicate downloads
         if re.search(r'\([0-9]+\)', zip_name):
             continue
@@ -373,6 +373,9 @@ def extract_records(input_dir, output_dir, clean_up=False):
         clean_up: delete original zip archive after extracting it
     """
 
+    # Update definition tables
+    extract_definitions(input_dir, output_dir)
+
     # Get basin info for telemetered site data
     csv_name = os.path.join(
         os.path.dirname(os.path.dirname(__file__)), 'data',
@@ -382,14 +385,25 @@ def extract_records(input_dir, output_dir, clean_up=False):
     # Find zip files
     zip_names = [f for f in os.listdir(input_dir) if f.endswith('.zip')]
 
+    # Get basin IDs for all zip files
+    basin_codes = []
+    for zip_name in zip_names:
+        bore_id = os.path.splitext(zip_name)[0].replace('cf', '')
+        basin_codes.append(basins.loc[bore_id, 'Basin code'])
+
+    # Keep unique basin codes
+    basin_codes = list(set(basin_codes))
+
     # Prepare output directory
     os.makedirs(output_dir, exist_ok=True)
 
     # Create master dataframe
     periods = ['all', 'daily', 'weekly']
     master = {}
-    for period in periods:
-        master[period] = pd.DataFrame()
+    for basin_code in basin_codes:
+        master[basin_code] = {}
+        for period in periods:
+            master[basin_code][period] = pd.DataFrame()
 
     for zip_name in tqdm(zip_names):
         # Skip duplicate downloads
@@ -425,44 +439,49 @@ def extract_records(input_dir, output_dir, clean_up=False):
             header=2,
             skiprows=[3],
             parse_dates=['Date'],
+            index_col=['Date'],
             compression='zip',
-            dayfirst=True)
+            dayfirst=True,
+            nrows=100)
+
+        # FIXME: convert quality codes to integers
 
         # Update column names
-        df.columns = ['Date time'] + columns + ['Metadata']
+        df.columns = columns + ['Metadata']
 
         # Get bore specifics
         meta = df['Metadata'].iloc[1]
         bore_id = re.search(r'^\S+', meta).group()
         site, hole, pipe = bore_id.split('.')
-        df.drop(columns='Metadata')
+        df = df.drop(columns='Metadata')
 
-        # Set date index for resampling
-        df.index = df['Date time']
+        # Get basin ID
+        basin_code = basins.loc[bore_id, 'Basin code']
 
         # Append to master dataframe
         for period in periods:
             if period == 'daily':
                 # Resample to daily timestamps
                 df = df.resample('1d').mean()
-                df['Date time'] = df.index
+                # FIXME: add bore IDs
             elif period == 'weekly':
                 # Resample to weekly timestamps
                 df = df.resample('1w').mean()
-                df['Date time'] = df.index
+                # FIXME: add bore IDs
 
-            master[period] = pd.concat([master[period], df])
+            master[basin_code][period] = pd.concat([master[basin_code][period], df])
 
         if clean_up:
            # Remove original zip archive
            os.remove(os.path.join(input_dir, zip_name))
 
-    for period in periods:
-        # Get latest date from dataframe
-        latest_date = master[period]['Date time'].iloc[-1].strftime('%Y-%m-%d')
-        csv_name = os.path.join(
-            output_dir, '{}-{}-{}.csv'.format(basin_id, latest_date, period))
+    for basin_code in basin_codes:
+        for period in periods:
+            # Get latest date from dataframe
+            latest_date = master[basin_code][period].index[-1].strftime('%Y-%m-%d')
+            csv_name = os.path.join(
+                output_dir, '{}-{}-{}.csv'.format(basin_code, latest_date, period))
 
-        # Export to csv
-        master[period].to_csv(csv_name, index=False, float_format='%0.3f')
+            # Export to csv
+            master[basin_code][period].to_csv(csv_name, index=True,
+                                              float_format='%0.3f')
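
For context, a minimal sketch (not part of the patch) of the data layout this change introduces: master becomes a dict keyed by basin code, each entry holding one DataFrame per resampling period, and each (basin code, period) pair is written to its own CSV named <basin code>-<latest date>-<period>.csv. The basin codes and water-level values below are hypothetical placeholders.

    import pandas as pd

    # Hypothetical basin codes and the periods used by extract_records()
    basin_codes = ['401', '410']  # placeholder values, for illustration only
    periods = ['all', 'daily', 'weekly']

    # Master structure after the patch: one DataFrame per (basin code, period)
    master = {code: {period: pd.DataFrame() for period in periods}
              for code in basin_codes}

    # Toy records for one basin, date-indexed as the patched read_csv() call does
    df = pd.DataFrame({'Water level': [1.2, 1.3, 1.1]},
                      index=pd.to_datetime(['2019-01-01', '2019-01-02',
                                            '2019-01-09']))
    master['401']['all'] = pd.concat([master['401']['all'], df])
    master['401']['daily'] = df.resample('1d').mean()
    master['401']['weekly'] = df.resample('1w').mean()

    # Export scheme used by the patch: <basin code>-<latest date>-<period>.csv
    for code in basin_codes:
        for period in periods:
            frame = master[code][period]
            if frame.empty:
                continue
            latest_date = frame.index[-1].strftime('%Y-%m-%d')
            frame.to_csv('{}-{}-{}.csv'.format(code, latest_date, period),
                         index=True, float_format='%0.3f')

One practical effect of this grouping: the export loop no longer references basin_id, which in the old code appears to have leaked out of the per-zip loop and would only ever name the last basin processed; each basin now gets its own correctly named output files.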