Get basin ID codes automatically

Branch: master
Author: Dan Howe, 6 years ago
parent 3b0c1e9747
commit 6f66bde339

@@ -262,7 +262,7 @@ def extract_definitions(input_dir, output_dir):
     # Prepare output directory
     os.makedirs(output_dir, exist_ok=True)
-    for zip_name in tqdm(zip_names):
+    for zip_name in zip_names:
         # Skip duplicate downloads
         if re.search(r'\([0-9]+\)', zip_name):
             continue
@@ -373,6 +373,9 @@ def extract_records(input_dir, output_dir, clean_up=False):
         clean_up: delete original zip archive after extracting it
     """
+    # Update definition tables
+    extract_definitions(input_dir, output_dir)
     # Get basin info for telemetered site data
     csv_name = os.path.join(
         os.path.dirname(os.path.dirname(__file__)), 'data',
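The basins table read from this CSV (its file name falls outside the hunk) is used below purely as a lookup from bore ID to basin code. A minimal sketch of the shape that lookup assumes, using made-up bore IDs and basin codes rather than the real definition data:

import pandas as pd

# Made-up bore IDs and basin codes; the later hunks only rely on a
# bore-ID index and a 'Basin code' column being present
basins = pd.DataFrame(
    {'Basin code': ['N10', 'N10', 'N12']},
    index=['GW036872.1.1', 'GW036873.1.1', 'GW041001.1.2'])

print(basins.loc['GW036872.1.1', 'Basin code'])  # -> N10
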
@@ -382,14 +385,25 @@ def extract_records(input_dir, output_dir, clean_up=False):
     # Find zip files
     zip_names = [f for f in os.listdir(input_dir) if f.endswith('.zip')]
+    # Get basin IDs for all zip files
+    basin_codes = []
+    for zip_name in zip_names:
+        bore_id = os.path.splitext(zip_name)[0].replace('cf', '')
+        basin_codes.append(basins.loc[bore_id, 'Basin code'])
+    # Keep unique basin codes
+    basin_codes = list(set(basin_codes))
     # Prepare output directory
     os.makedirs(output_dir, exist_ok=True)
     # Create master dataframe
     periods = ['all', 'daily', 'weekly']
     master = {}
-    for period in periods:
-        master[period] = pd.DataFrame()
+    for basin_code in basin_codes:
+        master[basin_code] = {}
+        for period in periods:
+            master[basin_code][period] = pd.DataFrame()
     for zip_name in tqdm(zip_names):
         # Skip duplicate downloads
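For reference, the bore ID fed into that basins lookup comes straight from each download's file name: the extension is stripped and the leading 'cf' prefix removed. A quick sketch with a hypothetical file name:

import os

zip_name = 'cfGW036872.1.1.zip'  # hypothetical download name
bore_id = os.path.splitext(zip_name)[0].replace('cf', '')
print(bore_id)  # -> GW036872.1.1
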
@@ -425,44 +439,49 @@ def extract_records(input_dir, output_dir, clean_up=False):
             header=2,
             skiprows=[3],
             parse_dates=['Date'],
+            index_col=['Date'],
             compression='zip',
-            dayfirst=True)
+            dayfirst=True,
+            nrows=100)
+        # FIXME: convert quality codes to integers
         # Update column names
-        df.columns = ['Date time'] + columns + ['Metadata']
+        df.columns = columns + ['Metadata']
         # Get bore specifics
         meta = df['Metadata'].iloc[1]
         bore_id = re.search(r'^\S+', meta).group()
         site, hole, pipe = bore_id.split('.')
-        df.drop(columns='Metadata')
-        # Set date index for resampling
-        df.index = df['Date time']
+        df = df.drop(columns='Metadata')
+        # Get basin ID
+        basin_code = basins.loc[bore_id, 'Basin code']
         # Append to master dataframe
         for period in periods:
             if period == 'daily':
                 # Resample to daily timestamps
                 df = df.resample('1d').mean()
-                df['Date time'] = df.index
+                # FIXME: add bore IDs
             elif period == 'weekly':
                 # Resample to weekly timestamps
                 df = df.resample('1w').mean()
-                df['Date time'] = df.index
-            master[period] = pd.concat([master[period], df])
+                # FIXME: add bore IDs
+            master[basin_code][period] = pd.concat([master[basin_code][period], df])
         if clean_up:
             # Remove original zip archive
             os.remove(os.path.join(input_dir, zip_name))
-    for period in periods:
-        # Get latest date from dataframe
-        latest_date = master[period]['Date time'].iloc[-1].strftime('%Y-%m-%d')
-        csv_name = os.path.join(
-            output_dir, '{}-{}-{}.csv'.format(basin_id, latest_date, period))
-        # Export to csv
-        master[period].to_csv(csv_name, index=False, float_format='%0.3f')
+    for basin_code in basin_codes:
+        for period in periods:
+            # Get latest date from dataframe
+            latest_date = master[basin_code][period].index[-1].strftime('%Y-%m-%d')
+            csv_name = os.path.join(
+                output_dir, '{}-{}-{}.csv'.format(basin_code, latest_date, period))
+            # Export to csv
+            master[basin_code][period].to_csv(csv_name, index=True, float_format='%0.3f')
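With index_col=['Date'] the records now carry a DatetimeIndex from the moment they are read, so the daily and weekly resampling and the final to_csv(..., index=True) all work off the index rather than a separate 'Date time' column. A self-contained sketch of that pattern on synthetic data (not the real bore records):

import pandas as pd

# Synthetic hourly series standing in for one bore's record
idx = pd.date_range('2019-01-01', periods=72, freq='h')
df = pd.DataFrame({'Level': range(72)}, index=idx)

daily = df.resample('1d').mean()    # one row per calendar day
weekly = df.resample('1w').mean()   # one row per week

daily.to_csv('daily.csv', index=True, float_format='%0.3f')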
