From 4529dbdc28d08b5c5951e6a7415eb9b5e8709cb5 Mon Sep 17 00:00:00 2001 From: Dan Howe Date: Mon, 18 Mar 2019 17:17:07 +1100 Subject: [PATCH] Download bores by basin name --- waternsw_grabber/waternsw_grabber.py | 65 +++++++++++++++------------- 1 file changed, 34 insertions(+), 31 deletions(-) diff --git a/waternsw_grabber/waternsw_grabber.py b/waternsw_grabber/waternsw_grabber.py index 5a65df2..b8bdc50 100644 --- a/waternsw_grabber/waternsw_grabber.py +++ b/waternsw_grabber/waternsw_grabber.py @@ -297,6 +297,9 @@ def telemetered_bore_downloader(basin_name, else: os.remove(log_name) + # Wait for downloads to finish + time.sleep(10) + # Close browser driver.quit() @@ -312,37 +315,41 @@ def get_basins(): return basins -def extract_definitions(input_dir, output_dir): +def extract_definitions(basin_name, download_dir): """Extract variable and quality metadata from bore records. Args: - input_dir: path to downloaded zip archives - output_dir: path to save csv files + basin_name: basin name (string) + download_dir: path to downloaded zip archives """ # Get basin info for telemetered site data basins = get_basins() - # Find zip files - zip_names = [f for f in os.listdir(input_dir) if f.endswith('.zip')] - # Prepare output directory + output_dir = os.path.join(download_dir, basin_name) os.makedirs(output_dir, exist_ok=True) + # Find zip files + zip_names = [f for f in os.listdir(output_dir) if f.endswith('.zip')] + + if not zip_names: + raise ValueError('No zip files found') + for zip_name in zip_names: # Skip duplicate downloads if re.search(r'\([0-9]+\)', zip_name): continue # Rename '.part' file if zip was not correctly downloaded - if os.path.getsize(os.path.join(input_dir, zip_name)) == 0: + if os.path.getsize(os.path.join(output_dir, zip_name)) == 0: shutil.move( - os.path.join(input_dir, zip_name) + '.part', - os.path.join(input_dir, zip_name)) + os.path.join(output_dir, zip_name) + '.part', + os.path.join(output_dir, zip_name)) # Read csv file inside zip archive df = pd.read_csv( - os.path.join(input_dir, zip_name), + os.path.join(output_dir, zip_name), header=2, skiprows=[3], parse_dates=['Date'], @@ -388,15 +395,10 @@ def extract_definitions(input_dir, output_dir): sites = sites.set_index('ID') # Get basin from master site dataframe - try: - sites['Basin name'] = basins.loc[sites.index, 'Basin name'] - sites['Basin code'] = basins.loc[sites.index, 'Basin code'] - except ValueError: - # FIXME: Some bores have duplicate IDs! - # Get basin name from input directory - sites['Basin name'] = input_dir - basin_idx = basins['Basin name'] == input_dir - sites['Basin code'] = basins.loc[basin_idx, 'Basin code'].values[0] + codes = basins.groupby('Basin name').first()['Basin code'] + basin_code = codes[basin_name] + sites['Basin name'] = basin_name + sites['Basin code'] = basin_code # Save variable definitions variables = pd.DataFrame( @@ -447,23 +449,24 @@ def extract_definitions(input_dir, output_dir): return sites -def extract_records(input_dir, output_dir, clean_up=False): +def extract_records(basin_name, download_dir, clean_up=False): """Extract downloaded bore records. Args: - input_dir: path to downloaded zip archives - output_dir: path to save csv files - clean_up: delete original zip archive after extracting it + basin_name: basin name (string) + download_dir: path to downloaded zip archives + clean_up: delete original zip archive after extracting it """ # Update definition tables - sites = extract_definitions(input_dir, output_dir) + sites = extract_definitions(basin_name, download_dir) # Keep unique basin codes basin_codes = sites['Basin code'].unique() # Find zip files - zip_names = [f for f in os.listdir(input_dir) if f.endswith('.zip')] + output_dir = os.path.join(download_dir, basin_name) + zip_names = [f for f in os.listdir(output_dir) if f.endswith('.zip')] # Prepare output directory os.makedirs(output_dir, exist_ok=True) @@ -482,14 +485,14 @@ def extract_records(input_dir, output_dir, clean_up=False): continue # Rename '.part' file if zip was not correctly downloaded - if os.path.getsize(os.path.join(input_dir, zip_name)) == 0: + if os.path.getsize(os.path.join(output_dir, zip_name)) == 0: shutil.move( - os.path.join(input_dir, zip_name) + '.part', - os.path.join(input_dir, zip_name)) + os.path.join(output_dir, zip_name) + '.part', + os.path.join(output_dir, zip_name)) # Read header header = pd.read_csv( - os.path.join(input_dir, zip_name), compression='zip', nrows=3) + os.path.join(output_dir, zip_name), compression='zip', nrows=3) # Remove comments header = header.iloc[:, 1:-1].T @@ -506,7 +509,7 @@ def extract_records(input_dir, output_dir, clean_up=False): # Read csv file inside zip archive df = pd.read_csv( - os.path.join(input_dir, zip_name), + os.path.join(output_dir, zip_name), header=2, skiprows=[3], parse_dates=['Date'], @@ -569,7 +572,7 @@ def extract_records(input_dir, output_dir, clean_up=False): if clean_up: # Remove original zip archive - os.remove(os.path.join(input_dir, zip_name)) + os.remove(os.path.join(output_dir, zip_name)) for basin_code in basin_codes: for period in periods: