Download bores by basin name

master
Dan Howe 6 years ago
parent 2092571b01
commit 4529dbdc28

@ -297,6 +297,9 @@ def telemetered_bore_downloader(basin_name,
else: else:
os.remove(log_name) os.remove(log_name)
# Wait for downloads to finish
time.sleep(10)
# Close browser # Close browser
driver.quit() driver.quit()
@ -312,37 +315,41 @@ def get_basins():
return basins return basins
def extract_definitions(input_dir, output_dir): def extract_definitions(basin_name, download_dir):
"""Extract variable and quality metadata from bore records. """Extract variable and quality metadata from bore records.
Args: Args:
input_dir: path to downloaded zip archives basin_name: basin name (string)
output_dir: path to save csv files download_dir: parent download directory; zip archives are read from download_dir/basin_name
""" """
# Get basin info for telemetered site data # Get basin info for telemetered site data
basins = get_basins() basins = get_basins()
# Find zip files
zip_names = [f for f in os.listdir(input_dir) if f.endswith('.zip')]
# Prepare output directory # Prepare output directory
output_dir = os.path.join(download_dir, basin_name)
os.makedirs(output_dir, exist_ok=True) os.makedirs(output_dir, exist_ok=True)
# Find zip files
zip_names = [f for f in os.listdir(output_dir) if f.endswith('.zip')]
if not zip_names:
raise ValueError('No zip files found')
for zip_name in zip_names: for zip_name in zip_names:
# Skip duplicate downloads # Skip duplicate downloads
if re.search(r'\([0-9]+\)', zip_name): if re.search(r'\([0-9]+\)', zip_name):
continue continue
# Rename '.part' file if zip was not correctly downloaded # Rename '.part' file if zip was not correctly downloaded
if os.path.getsize(os.path.join(input_dir, zip_name)) == 0: if os.path.getsize(os.path.join(output_dir, zip_name)) == 0:
shutil.move( shutil.move(
os.path.join(input_dir, zip_name) + '.part', os.path.join(output_dir, zip_name) + '.part',
os.path.join(input_dir, zip_name)) os.path.join(output_dir, zip_name))
# Read csv file inside zip archive # Read csv file inside zip archive
df = pd.read_csv( df = pd.read_csv(
os.path.join(input_dir, zip_name), os.path.join(output_dir, zip_name),
header=2, header=2,
skiprows=[3], skiprows=[3],
parse_dates=['Date'], parse_dates=['Date'],
@ -388,15 +395,10 @@ def extract_definitions(input_dir, output_dir):
sites = sites.set_index('ID') sites = sites.set_index('ID')
# Get basin from master site dataframe # Get basin from master site dataframe
try: codes = basins.groupby('Basin name').first()['Basin code']
sites['Basin name'] = basins.loc[sites.index, 'Basin name'] basin_code = codes[basin_name]
sites['Basin code'] = basins.loc[sites.index, 'Basin code'] sites['Basin name'] = basin_name
except ValueError: sites['Basin code'] = basin_code
# FIXME: Some bores have duplicate IDs!
# Get basin name from input directory
sites['Basin name'] = input_dir
basin_idx = basins['Basin name'] == input_dir
sites['Basin code'] = basins.loc[basin_idx, 'Basin code'].values[0]
# Save variable definitions # Save variable definitions
variables = pd.DataFrame( variables = pd.DataFrame(
@ -447,23 +449,24 @@ def extract_definitions(input_dir, output_dir):
return sites return sites
def extract_records(input_dir, output_dir, clean_up=False): def extract_records(basin_name, download_dir, clean_up=False):
"""Extract downloaded bore records. """Extract downloaded bore records.
Args: Args:
input_dir: path to downloaded zip archives basin_name: basin name (string)
output_dir: path to save csv files download_dir: parent download directory; zip archives are read from download_dir/basin_name
clean_up: delete original zip archive after extracting it clean_up: delete original zip archive after extracting it
""" """
# Update definition tables # Update definition tables
sites = extract_definitions(input_dir, output_dir) sites = extract_definitions(basin_name, download_dir)
# Keep unique basin codes # Keep unique basin codes
basin_codes = sites['Basin code'].unique() basin_codes = sites['Basin code'].unique()
# Find zip files # Find zip files
zip_names = [f for f in os.listdir(input_dir) if f.endswith('.zip')] output_dir = os.path.join(download_dir, basin_name)
zip_names = [f for f in os.listdir(output_dir) if f.endswith('.zip')]
# Prepare output directory # Prepare output directory
os.makedirs(output_dir, exist_ok=True) os.makedirs(output_dir, exist_ok=True)
@ -482,14 +485,14 @@ def extract_records(input_dir, output_dir, clean_up=False):
continue continue
# Rename '.part' file if zip was not correctly downloaded # Rename '.part' file if zip was not correctly downloaded
if os.path.getsize(os.path.join(input_dir, zip_name)) == 0: if os.path.getsize(os.path.join(output_dir, zip_name)) == 0:
shutil.move( shutil.move(
os.path.join(input_dir, zip_name) + '.part', os.path.join(output_dir, zip_name) + '.part',
os.path.join(input_dir, zip_name)) os.path.join(output_dir, zip_name))
# Read header # Read header
header = pd.read_csv( header = pd.read_csv(
os.path.join(input_dir, zip_name), compression='zip', nrows=3) os.path.join(output_dir, zip_name), compression='zip', nrows=3)
# Remove comments # Remove comments
header = header.iloc[:, 1:-1].T header = header.iloc[:, 1:-1].T
@ -506,7 +509,7 @@ def extract_records(input_dir, output_dir, clean_up=False):
# Read csv file inside zip archive # Read csv file inside zip archive
df = pd.read_csv( df = pd.read_csv(
os.path.join(input_dir, zip_name), os.path.join(output_dir, zip_name),
header=2, header=2,
skiprows=[3], skiprows=[3],
parse_dates=['Date'], parse_dates=['Date'],
@ -569,7 +572,7 @@ def extract_records(input_dir, output_dir, clean_up=False):
if clean_up: if clean_up:
# Remove original zip archive # Remove original zip archive
os.remove(os.path.join(input_dir, zip_name)) os.remove(os.path.join(output_dir, zip_name))
for basin_code in basin_codes: for basin_code in basin_codes:
for period in periods: for period in periods:

Loading…
Cancel
Save