def extract_records(input_dir, output_dir):
    """Extract downloaded bore records.

    Every zip archive found in ``input_dir`` is read with pandas, the bore
    specifics parsed from the metadata cell are added as columns, and the
    selected columns are written to ``output_dir`` as a csv file named
    after the archive.

    Args:
        input_dir: path to downloaded zip archives
        output_dir: path to save csv files

    Raises:
        AttributeError: if the metadata cell lacks one of the expected
            'Lat:', 'Long:' or 'Elev:' fields.
        ValueError: if a parsed value is not numeric, or the bore id is not
            made of exactly three dot-separated parts.
    """

    # Find zip files
    zip_names = [f for f in os.listdir(input_dir) if f.endswith('.zip')]

    # Prepare output directory
    os.makedirs(output_dir, exist_ok=True)

    for zip_name in tqdm(zip_names):
        # Skip duplicate downloads (names containing '(<digits>)')
        if re.search(r'\([0-9]+\)', zip_name):
            continue

        # Use '.part' file if zip was not correctly downloaded
        source_name = zip_name
        if os.path.getsize(os.path.join(input_dir, zip_name)) == 0:
            source_name = zip_name + '.part'

        # Read csv file inside zip archive
        df = pd.read_csv(
            os.path.join(input_dir, source_name),
            header=2,
            skiprows=[3],
            parse_dates=['Date'],
            compression='zip',
            dayfirst=True)

        # Get bore specifics from the metadata string, which is read from
        # the last column of the second data row
        meta = df.iloc[1, -1]
        lat = float(re.search(r'(?<=Lat:)\S+', meta).group())
        lon = float(re.search(r'(?<=Long:)\S+', meta).group())
        # Non-greedy so that only the text up to the first 'm' after
        # 'Elev:' is captured, even if more text follows
        elev = float(re.search(r'(?<=Elev:).+?(?=m)', meta).group())
        bore_id = re.search(r'^\S+', meta).group()
        site, hole, pipe = bore_id.split('.')

        # Add bore specifics to dataframe
        df['Site'] = site
        df['Hole'] = hole
        df['Pipe'] = pipe
        df['Lat'] = lat
        df['Lon'] = lon
        df['Elev'] = elev

        # Rename columns
        df = df.rename(
            columns={
                'Date': 'Date time',
                'Bore level below MP': 'Below Measuring Point',
                'GW Level - m AHD': 'Above Sea Level'
            })

        # Select output columns
        df = df[[
            'Site',
            'Hole',
            'Pipe',
            'Date time',
            'Below Measuring Point',
            'Above Sea Level',
            'Lat',
            'Lon',
            'Elev',
        ]]

        # Get csv name from zip archive ('name.zip' -> 'name.csv')
        csv_name = os.path.join(
            output_dir,
            os.path.splitext(zip_name)[0] + '.csv')

        # Export to csv
        df.to_csv(csv_name, float_format='%0.3f')