|
|
|
@ -262,7 +262,7 @@ def extract_definitions(input_dir, output_dir):
|
|
|
|
|
# Prepare output directory
|
|
|
|
|
os.makedirs(output_dir, exist_ok=True)
|
|
|
|
|
|
|
|
|
|
for zip_name in tqdm(zip_names):
|
|
|
|
|
for zip_name in zip_names:
|
|
|
|
|
# Skip duplicate downloads
|
|
|
|
|
if re.search(r'\([0-9]+\)', zip_name):
|
|
|
|
|
continue
|
|
|
|
@ -373,6 +373,9 @@ def extract_records(input_dir, output_dir, clean_up=False):
|
|
|
|
|
clean_up: delete original zip archive after extracting it
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
# Update definition tables
|
|
|
|
|
extract_definitions(input_dir, output_dir)
|
|
|
|
|
|
|
|
|
|
# Get basin info for telemetered site data
|
|
|
|
|
csv_name = os.path.join(
|
|
|
|
|
os.path.dirname(os.path.dirname(__file__)), 'data',
|
|
|
|
@ -382,14 +385,25 @@ def extract_records(input_dir, output_dir, clean_up=False):
|
|
|
|
|
# Find zip files
|
|
|
|
|
zip_names = [f for f in os.listdir(input_dir) if f.endswith('.zip')]
|
|
|
|
|
|
|
|
|
|
# Get basin IDs for all zip files
|
|
|
|
|
basin_codes = []
|
|
|
|
|
for zip_name in zip_names:
|
|
|
|
|
bore_id = os.path.splitext(zip_name)[0].replace('cf', '')
|
|
|
|
|
basin_codes.append(basins.loc[bore_id, 'Basin code'])
|
|
|
|
|
|
|
|
|
|
# Keep unique basin codes
|
|
|
|
|
basin_codes = list(set(basin_codes))
|
|
|
|
|
|
|
|
|
|
# Prepare output directory
|
|
|
|
|
os.makedirs(output_dir, exist_ok=True)
|
|
|
|
|
|
|
|
|
|
# Create master dataframe
|
|
|
|
|
periods = ['all', 'daily', 'weekly']
|
|
|
|
|
master = {}
|
|
|
|
|
for basin_code in basin_codes:
|
|
|
|
|
master[basin_code] = {}
|
|
|
|
|
for period in periods:
|
|
|
|
|
master[period] = pd.DataFrame()
|
|
|
|
|
master[basin_code][period] = pd.DataFrame()
|
|
|
|
|
|
|
|
|
|
for zip_name in tqdm(zip_names):
|
|
|
|
|
# Skip duplicate downloads
|
|
|
|
@ -425,44 +439,49 @@ def extract_records(input_dir, output_dir, clean_up=False):
|
|
|
|
|
header=2,
|
|
|
|
|
skiprows=[3],
|
|
|
|
|
parse_dates=['Date'],
|
|
|
|
|
index_col=['Date'],
|
|
|
|
|
compression='zip',
|
|
|
|
|
dayfirst=True)
|
|
|
|
|
dayfirst=True,
|
|
|
|
|
nrows=100)
|
|
|
|
|
|
|
|
|
|
# FIXME: convert quality codes to integers
|
|
|
|
|
|
|
|
|
|
# Update column names
|
|
|
|
|
df.columns = ['Date time'] + columns + ['Metadata']
|
|
|
|
|
df.columns = columns + ['Metadata']
|
|
|
|
|
|
|
|
|
|
# Get bore specifics
|
|
|
|
|
meta = df['Metadata'].iloc[1]
|
|
|
|
|
bore_id = re.search(r'^\S+', meta).group()
|
|
|
|
|
site, hole, pipe = bore_id.split('.')
|
|
|
|
|
df.drop(columns='Metadata')
|
|
|
|
|
df = df.drop(columns='Metadata')
|
|
|
|
|
|
|
|
|
|
# Set date index for resampling
|
|
|
|
|
df.index = df['Date time']
|
|
|
|
|
# Get basin ID
|
|
|
|
|
basin_code = basins.loc[bore_id, 'Basin code']
|
|
|
|
|
|
|
|
|
|
# Append to master dataframe
|
|
|
|
|
for period in periods:
|
|
|
|
|
if period == 'daily':
|
|
|
|
|
# Resample to daily timestamps
|
|
|
|
|
df = df.resample('1d').mean()
|
|
|
|
|
df['Date time'] = df.index
|
|
|
|
|
# FIXME: add bore IDs
|
|
|
|
|
|
|
|
|
|
elif period == 'weekly':
|
|
|
|
|
# Resample to weekly timestamps
|
|
|
|
|
df = df.resample('1w').mean()
|
|
|
|
|
df['Date time'] = df.index
|
|
|
|
|
# FIXME: add bore IDs
|
|
|
|
|
|
|
|
|
|
master[period] = pd.concat([master[period], df])
|
|
|
|
|
master[basin_code][period] = pd.concat([master[basin_code][period], df])
|
|
|
|
|
|
|
|
|
|
if clean_up:
|
|
|
|
|
# Remove original zip archive
|
|
|
|
|
os.remove(os.path.join(input_dir, zip_name))
|
|
|
|
|
|
|
|
|
|
for basin_code in basin_codes:
|
|
|
|
|
for period in periods:
|
|
|
|
|
# Get latest date from dataframe
|
|
|
|
|
latest_date = master[period]['Date time'].iloc[-1].strftime('%Y-%m-%d')
|
|
|
|
|
latest_date = master[basin_code][period].index[-1].strftime('%Y-%m-%d')
|
|
|
|
|
csv_name = os.path.join(
|
|
|
|
|
output_dir, '{}-{}-{}.csv'.format(basin_id, latest_date, period))
|
|
|
|
|
output_dir, '{}-{}-{}.csv'.format(basin_code, latest_date, period))
|
|
|
|
|
|
|
|
|
|
# Export to csv
|
|
|
|
|
master[period].to_csv(csv_name, index=False, float_format='%0.3f')
|
|
|
|
|
master[basin_code][period].to_csv(csv_name, index=True, float_format='%0.3f')
|
|
|
|
|