|
|
|
@ -242,6 +242,128 @@ def telemetered_bore_downloader(bore_ids, start_date, end_date, download_dir):
|
|
|
|
|
driver.quit()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def extract_definitions(input_dir, output_dir):
    """Extract variable and quality metadata from bore records.

    For every zip archive in ``input_dir`` the first 100 data rows are read
    and the last column is scanned for the 'Sites:', 'Variables:' and
    'Qualities:' sections. The parsed entries are merged into 'sites.csv',
    'variables.csv' and 'qualities.csv' inside ``output_dir``; the three
    files are rewritten after each archive.

    Rows are keyed on their index (bore ID for sites, integer code for
    variables and qualities). When a key already exists in a saved table,
    the freshly parsed row replaces the saved one.

    Args:
        input_dir: path to downloaded zip archives
        output_dir: path to save csv files

    Raises:
        IndexError: if an archive has no entry under 'Sites:'.
        AttributeError: if the site entry does not match the expected
            'Lat:', 'Long:', 'Elev:' layout (a regex search returns None).
        KeyError: if the bore ID cannot be looked up in the master site
            table ('data/telemetered-sites.csv').
    """
    # Get telemetered site data
    csv_name = os.path.join(
        os.path.dirname(os.path.dirname(__file__)), 'data',
        'telemetered-sites.csv')
    master = pd.read_csv(csv_name, index_col=0)

    # Find zip files
    zip_names = [f for f in os.listdir(input_dir) if f.endswith('.zip')]

    # Prepare output directory
    os.makedirs(output_dir, exist_ok=True)

    # Output tables are the same for every archive
    csv_name_s = os.path.join(output_dir, 'sites.csv')
    csv_name_v = os.path.join(output_dir, 'variables.csv')
    csv_name_q = os.path.join(output_dir, 'qualities.csv')

    keys = ['Sites:', 'Variables:', 'Qualities:']

    for zip_name in tqdm(zip_names):
        # Skip duplicate downloads
        if re.search(r'\([0-9]+\)', zip_name):
            continue

        zip_path = os.path.join(input_dir, zip_name)

        # Rename '.part' file if zip was not correctly downloaded
        if os.path.getsize(zip_path) == 0:
            shutil.move(zip_path + '.part', zip_path)

        # Read csv file inside zip archive
        df = pd.read_csv(
            zip_path,
            header=2,
            skiprows=[3],
            parse_dates=['Date'],
            compression='zip',
            dayfirst=True,
            nrows=100)

        # Extract metadata from last column
        meta = {k: [] for k in keys}
        # Reset per archive so entries can never be filed under a section
        # header left over from the previous archive
        section = None
        for line in df.iloc[:, -1]:
            # Skip blank cells (read either as NaN or as whitespace)
            if not isinstance(line, str) or not line.strip():
                continue
            if line in keys:
                section = line
            elif section is not None:
                meta[section].append(line)

        # Get bore specifics
        site_data = meta['Sites:'][0]
        lat = float(re.search(r'(?<=Lat:)\S+', site_data).group())
        lon = float(re.search(r'(?<=Long:)\S+', site_data).group())
        elev = float(re.search(r'(?<=Elev:).+(?=m)', site_data).group())
        address = re.search(r'(?<=\d\.\d\.\d - ).+(?=\sLat)',
                            site_data).group()
        bore_id = re.search(r'^\S+', site_data).group()
        site, hole, pipe = bore_id.split('.')

        sites = pd.DataFrame()
        sites['ID'] = [bore_id]
        sites['Site'] = [site]
        sites['Hole'] = [hole]
        sites['Pipe'] = [pipe]
        sites['Lat'] = [lat]
        sites['Lon'] = [lon]
        sites['Elev'] = [elev]
        sites['Address'] = [address]
        sites = sites.set_index('ID')

        # Get basin from master site dataframe
        sites['Basin name'] = master.loc[sites.index, 'Basin name']
        sites['Basin code'] = master.loc[sites.index, 'Basin code']

        # Build variable and quality definitions
        variables = _definitions_table(meta['Variables:'])
        qualities = _definitions_table(meta['Qualities:'])

        # Update existing values and export updated tables
        _merge_with_existing(sites, csv_name_s).to_csv(csv_name_s)
        _merge_with_existing(variables, csv_name_v).to_csv(csv_name_v)
        _merge_with_existing(qualities, csv_name_q).to_csv(csv_name_q)


def _definitions_table(entries):
    """Build a table indexed by integer 'Code' from 'code - text' strings."""
    # Passing the column names up front keeps an empty section from failing
    table = pd.DataFrame(
        [entry.split(' - ', 1) for entry in entries],
        columns=['Code', 'Description'])
    table['Code'] = table['Code'].astype(int)
    return table.set_index('Code')


def _merge_with_existing(table, csv_path):
    """Merge freshly parsed rows with the rows already saved at csv_path."""
    try:
        existing = pd.read_csv(csv_path, index_col=0)
    except FileNotFoundError:
        merged = table
    else:
        # Fresh rows first, so they win over saved rows with the same key
        merged = pd.concat([table, existing])

    # Deduplicate on the index rather than on column values: values change
    # dtype in a CSV round trip (e.g. '1' is read back as 1), so comparing
    # columns would miss duplicates
    merged = merged[~merged.index.duplicated(keep='first')]
    return merged.sort_index()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def extract_records(input_dir, output_dir, clean_up=False):
|
|
|
|
|
"""Extract downloaded bore records.
|
|
|
|
|
|
|
|
|
@ -274,6 +396,23 @@ def extract_records(input_dir, output_dir, clean_up=False):
|
|
|
|
|
os.path.join(input_dir, zip_name) + '.part',
|
|
|
|
|
os.path.join(input_dir, zip_name))
|
|
|
|
|
|
|
|
|
|
# Read header
|
|
|
|
|
header = pd.read_csv(
|
|
|
|
|
os.path.join(input_dir, zip_name), compression='zip', nrows=3)
|
|
|
|
|
|
|
|
|
|
# Remove comments
|
|
|
|
|
header = header.iloc[:, 1:-1].T
|
|
|
|
|
|
|
|
|
|
# Apply product codes to all columns
|
|
|
|
|
header.iloc[1::2, 0] = header.iloc[::2, 0].values
|
|
|
|
|
header[0] = header[0].astype(float).astype(int).astype(str)
|
|
|
|
|
|
|
|
|
|
# Move quality label
|
|
|
|
|
header.iloc[1::2, 1] = header.iloc[1::2, 2]
|
|
|
|
|
|
|
|
|
|
# Combine labels
|
|
|
|
|
columns = [' '.join(c) for c in header.iloc[:, :-1].values]
|
|
|
|
|
|
|
|
|
|
# Read csv file inside zip archive
|
|
|
|
|
df = pd.read_csv(
|
|
|
|
|
os.path.join(input_dir, zip_name),
|
|
|
|
@ -283,32 +422,14 @@ def extract_records(input_dir, output_dir, clean_up=False):
|
|
|
|
|
compression='zip',
|
|
|
|
|
dayfirst=True)
|
|
|
|
|
|
|
|
|
|
# Update column names
|
|
|
|
|
df.columns = ['Date time'] + columns + ['Metadata']
|
|
|
|
|
|
|
|
|
|
# Get bore specifics
|
|
|
|
|
meta = df.iloc[1, -1]
|
|
|
|
|
lat = float(re.search(r'(?<=Lat:)\S+', meta).group())
|
|
|
|
|
lon = float(re.search(r'(?<=Long:)\S+', meta).group())
|
|
|
|
|
elev = float(re.search(r'(?<=Elev:).+(?=m)', meta).group())
|
|
|
|
|
address = re.search(r'(?<=\d\.\d\.\d - ).+(?=\sLat)', meta).group()
|
|
|
|
|
meta = df['Metadata'].iloc[1]
|
|
|
|
|
bore_id = re.search(r'^\S+', meta).group()
|
|
|
|
|
site, hole, pipe = bore_id.split('.')
|
|
|
|
|
|
|
|
|
|
# FIXME: detect basin automatically
|
|
|
|
|
basin_id = 'MB'
|
|
|
|
|
|
|
|
|
|
# Rename columns
|
|
|
|
|
df = df.rename(
|
|
|
|
|
columns={
|
|
|
|
|
'Date': 'Date time',
|
|
|
|
|
'Bore level below MP': 'Below Measuring Point',
|
|
|
|
|
'GW Level - m AHD': 'Above Sea Level'
|
|
|
|
|
})
|
|
|
|
|
|
|
|
|
|
# Select output columns
|
|
|
|
|
df = df[[
|
|
|
|
|
'Date time',
|
|
|
|
|
'Below Measuring Point',
|
|
|
|
|
'Above Sea Level',
|
|
|
|
|
]]
|
|
|
|
|
df.drop(columns='Metadata')
|
|
|
|
|
|
|
|
|
|
# Set date index for resampling
|
|
|
|
|
df.index = df['Date time']
|
|
|
|
@ -325,15 +446,6 @@ def extract_records(input_dir, output_dir, clean_up=False):
|
|
|
|
|
df = df.resample('1w').mean()
|
|
|
|
|
df['Date time'] = df.index
|
|
|
|
|
|
|
|
|
|
# Add bore specifics to dataframe
|
|
|
|
|
df['Site'] = site
|
|
|
|
|
df['Hole'] = hole
|
|
|
|
|
df['Pipe'] = pipe
|
|
|
|
|
df['Lat'] = lat
|
|
|
|
|
df['Lon'] = lon
|
|
|
|
|
df['Elev'] = elev
|
|
|
|
|
df['Basin'] = basin_id
|
|
|
|
|
|
|
|
|
|
master[period] = pd.concat([master[period], df])
|
|
|
|
|
|
|
|
|
|
if clean_up:
|
|
|
|
@ -341,12 +453,6 @@ def extract_records(input_dir, output_dir, clean_up=False):
|
|
|
|
|
os.remove(os.path.join(input_dir, zip_name))
|
|
|
|
|
|
|
|
|
|
for period in periods:
|
|
|
|
|
# Set column order
|
|
|
|
|
master[period] = master[period][[
|
|
|
|
|
'Date time', 'Basin', 'Site', 'Hole', 'Pipe',
|
|
|
|
|
'Below Measuring Point', 'Above Sea Level', 'Lat', 'Lon', 'Elev'
|
|
|
|
|
]]
|
|
|
|
|
|
|
|
|
|
# Get latest date from dataframe
|
|
|
|
|
latest_date = master[period]['Date time'].iloc[-1].strftime('%Y-%m-%d')
|
|
|
|
|
csv_name = os.path.join(
|
|
|
|
|