|
|
|
@ -297,6 +297,9 @@ def telemetered_bore_downloader(basin_name,
|
|
|
|
|
else:
|
|
|
|
|
os.remove(log_name)
|
|
|
|
|
|
|
|
|
|
# Wait for downloads to finish
|
|
|
|
|
time.sleep(10)
|
|
|
|
|
|
|
|
|
|
# Close browser
|
|
|
|
|
driver.quit()
|
|
|
|
|
|
|
|
|
@ -312,37 +315,41 @@ def get_basins():
|
|
|
|
|
return basins
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def extract_definitions(input_dir, output_dir):
|
|
|
|
|
def extract_definitions(basin_name, download_dir):
|
|
|
|
|
"""Extract variable and quality metadata from bore records.
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
input_dir: path to downloaded zip archives
|
|
|
|
|
output_dir: path to save csv files
|
|
|
|
|
basin_name: basin name (string)
|
|
|
|
|
download_dir: parent download directory; zip archives are read from its `basin_name` subdirectory (created if missing)
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
# Get basin info for telemetered site data
|
|
|
|
|
basins = get_basins()
|
|
|
|
|
|
|
|
|
|
# Find zip files
|
|
|
|
|
zip_names = [f for f in os.listdir(input_dir) if f.endswith('.zip')]
|
|
|
|
|
|
|
|
|
|
# Prepare output directory
|
|
|
|
|
output_dir = os.path.join(download_dir, basin_name)
|
|
|
|
|
os.makedirs(output_dir, exist_ok=True)
|
|
|
|
|
|
|
|
|
|
# Find zip files
|
|
|
|
|
zip_names = [f for f in os.listdir(output_dir) if f.endswith('.zip')]
|
|
|
|
|
|
|
|
|
|
if not zip_names:
|
|
|
|
|
raise ValueError('No zip files found')
|
|
|
|
|
|
|
|
|
|
for zip_name in zip_names:
|
|
|
|
|
# Skip duplicate downloads
|
|
|
|
|
if re.search(r'\([0-9]+\)', zip_name):
|
|
|
|
|
continue
|
|
|
|
|
|
|
|
|
|
# Rename '.part' file if zip was not correctly downloaded
|
|
|
|
|
if os.path.getsize(os.path.join(input_dir, zip_name)) == 0:
|
|
|
|
|
if os.path.getsize(os.path.join(output_dir, zip_name)) == 0:
|
|
|
|
|
shutil.move(
|
|
|
|
|
os.path.join(input_dir, zip_name) + '.part',
|
|
|
|
|
os.path.join(input_dir, zip_name))
|
|
|
|
|
os.path.join(output_dir, zip_name) + '.part',
|
|
|
|
|
os.path.join(output_dir, zip_name))
|
|
|
|
|
|
|
|
|
|
# Read csv file inside zip archive
|
|
|
|
|
df = pd.read_csv(
|
|
|
|
|
os.path.join(input_dir, zip_name),
|
|
|
|
|
os.path.join(output_dir, zip_name),
|
|
|
|
|
header=2,
|
|
|
|
|
skiprows=[3],
|
|
|
|
|
parse_dates=['Date'],
|
|
|
|
@ -388,15 +395,10 @@ def extract_definitions(input_dir, output_dir):
|
|
|
|
|
sites = sites.set_index('ID')
|
|
|
|
|
|
|
|
|
|
# Get basin from master site dataframe
|
|
|
|
|
try:
|
|
|
|
|
sites['Basin name'] = basins.loc[sites.index, 'Basin name']
|
|
|
|
|
sites['Basin code'] = basins.loc[sites.index, 'Basin code']
|
|
|
|
|
except ValueError:
|
|
|
|
|
# FIXME: Some bores have duplicate IDs!
|
|
|
|
|
# Get basin name from input directory
|
|
|
|
|
sites['Basin name'] = input_dir
|
|
|
|
|
basin_idx = basins['Basin name'] == input_dir
|
|
|
|
|
sites['Basin code'] = basins.loc[basin_idx, 'Basin code'].values[0]
|
|
|
|
|
codes = basins.groupby('Basin name').first()['Basin code']
|
|
|
|
|
basin_code = codes[basin_name]
|
|
|
|
|
sites['Basin name'] = basin_name
|
|
|
|
|
sites['Basin code'] = basin_code
|
|
|
|
|
|
|
|
|
|
# Save variable definitions
|
|
|
|
|
variables = pd.DataFrame(
|
|
|
|
@ -447,23 +449,24 @@ def extract_definitions(input_dir, output_dir):
|
|
|
|
|
return sites
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def extract_records(input_dir, output_dir, clean_up=False):
|
|
|
|
|
def extract_records(basin_name, download_dir, clean_up=False):
|
|
|
|
|
"""Extract downloaded bore records.
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
input_dir: path to downloaded zip archives
|
|
|
|
|
output_dir: path to save csv files
|
|
|
|
|
clean_up: delete original zip archive after extracting it
|
|
|
|
|
basin_name: basin name (string)
|
|
|
|
|
download_dir: parent download directory; zip archives are read from its `basin_name` subdirectory (created if missing)
|
|
|
|
|
clean_up: delete original zip archive after extracting it
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
# Update definition tables
|
|
|
|
|
sites = extract_definitions(input_dir, output_dir)
|
|
|
|
|
sites = extract_definitions(basin_name, download_dir)
|
|
|
|
|
|
|
|
|
|
# Keep unique basin codes
|
|
|
|
|
basin_codes = sites['Basin code'].unique()
|
|
|
|
|
|
|
|
|
|
# Find zip files
|
|
|
|
|
zip_names = [f for f in os.listdir(input_dir) if f.endswith('.zip')]
|
|
|
|
|
output_dir = os.path.join(download_dir, basin_name)
|
|
|
|
|
zip_names = [f for f in os.listdir(output_dir) if f.endswith('.zip')]
|
|
|
|
|
|
|
|
|
|
# Prepare output directory
|
|
|
|
|
os.makedirs(output_dir, exist_ok=True)
|
|
|
|
@ -482,14 +485,14 @@ def extract_records(input_dir, output_dir, clean_up=False):
|
|
|
|
|
continue
|
|
|
|
|
|
|
|
|
|
# Rename '.part' file if zip was not correctly downloaded
|
|
|
|
|
if os.path.getsize(os.path.join(input_dir, zip_name)) == 0:
|
|
|
|
|
if os.path.getsize(os.path.join(output_dir, zip_name)) == 0:
|
|
|
|
|
shutil.move(
|
|
|
|
|
os.path.join(input_dir, zip_name) + '.part',
|
|
|
|
|
os.path.join(input_dir, zip_name))
|
|
|
|
|
os.path.join(output_dir, zip_name) + '.part',
|
|
|
|
|
os.path.join(output_dir, zip_name))
|
|
|
|
|
|
|
|
|
|
# Read header
|
|
|
|
|
header = pd.read_csv(
|
|
|
|
|
os.path.join(input_dir, zip_name), compression='zip', nrows=3)
|
|
|
|
|
os.path.join(output_dir, zip_name), compression='zip', nrows=3)
|
|
|
|
|
|
|
|
|
|
# Remove comments
|
|
|
|
|
header = header.iloc[:, 1:-1].T
|
|
|
|
@ -506,7 +509,7 @@ def extract_records(input_dir, output_dir, clean_up=False):
|
|
|
|
|
|
|
|
|
|
# Read csv file inside zip archive
|
|
|
|
|
df = pd.read_csv(
|
|
|
|
|
os.path.join(input_dir, zip_name),
|
|
|
|
|
os.path.join(output_dir, zip_name),
|
|
|
|
|
header=2,
|
|
|
|
|
skiprows=[3],
|
|
|
|
|
parse_dates=['Date'],
|
|
|
|
@ -569,7 +572,7 @@ def extract_records(input_dir, output_dir, clean_up=False):
|
|
|
|
|
|
|
|
|
|
if clean_up:
|
|
|
|
|
# Remove original zip archive
|
|
|
|
|
os.remove(os.path.join(input_dir, zip_name))
|
|
|
|
|
os.remove(os.path.join(output_dir, zip_name))
|
|
|
|
|
|
|
|
|
|
for basin_code in basin_codes:
|
|
|
|
|
for period in periods:
|
|
|
|
|