From e62c23b7f049426b1b78b2b639f7be5e2243780d Mon Sep 17 00:00:00 2001
From: Dan Howe
Date: Mon, 25 Feb 2019 13:50:46 +1100
Subject: [PATCH] Add extract_definitions() function

---
Review notes (not part of the commit message):
 - Saved qualities were appended to 'variables' rather than 'qualities';
   all three tables are now merged through _merge_with_existing().
 - Saved rows are de-duplicated on their index, because
   DataFrame.drop_duplicates() ignores the index.
 - DataFrame.append() is replaced by pd.concat().
 - Metadata cells that precede the first section heading are skipped
   instead of raising UnboundLocalError.
 - df.drop() in extract_records() returned a copy that was discarded; the
   result is now assigned.
 - Hunk offsets and the diffstat are updated for the 14 extra added lines.
   The blob ids on the 'index' line still refer to the original patch.
 - Indentation was reconstructed from a whitespace-collapsed copy; check
   the context lines against the target file before applying.

 waternsw_grabber/waternsw_grabber.py | 196 ++++++++++++++++++++++-----
 1 file changed, 158 insertions(+), 38 deletions(-)

diff --git a/waternsw_grabber/waternsw_grabber.py b/waternsw_grabber/waternsw_grabber.py
index 672bbd4..9c2b443 100644
--- a/waternsw_grabber/waternsw_grabber.py
+++ b/waternsw_grabber/waternsw_grabber.py
@@ -242,6 +242,142 @@ def telemetered_bore_downloader(bore_ids, start_date, end_date, download_dir):
     driver.quit()
 
 
+def _merge_with_existing(table, csv_name):
+    """Merge a table with the rows already saved in a csv file.
+
+    Args:
+        table: dataframe holding the newly extracted rows
+        csv_name: path to csv file written by a previous run (may not exist)
+
+    Returns:
+        table itself if csv_name does not exist, otherwise the combined
+        rows, sorted by index, with one row kept per index value
+    """
+    try:
+        existing = pd.read_csv(csv_name, index_col=0)
+    except FileNotFoundError:
+        return table
+
+    combined = pd.concat([table, existing])
+
+    # drop_duplicates() ignores the index, so compare index values instead.
+    # 'table' comes first, so new rows replace saved rows with the same index.
+    combined = combined[~combined.index.duplicated(keep='first')]
+    return combined.sort_index()
+
+
+def extract_definitions(input_dir, output_dir):
+    """Extract site, variable and quality metadata from bore records.
+
+    The definitions found in each zip archive are merged into 'sites.csv',
+    'variables.csv' and 'qualities.csv' in output_dir.
+
+    Args:
+        input_dir: path to downloaded zip archives
+        output_dir: path to save csv files
+    """
+
+    # Get telemetered site data
+    csv_name = os.path.join(
+        os.path.dirname(os.path.dirname(__file__)), 'data',
+        'telemetered-sites.csv')
+    master = pd.read_csv(csv_name, index_col=0)
+
+    # Find zip files
+    zip_names = [f for f in os.listdir(input_dir) if f.endswith('.zip')]
+
+    # Prepare output directory
+    os.makedirs(output_dir, exist_ok=True)
+
+    for zip_name in tqdm(zip_names):
+        # Skip duplicate downloads
+        if re.search(r'\([0-9]+\)', zip_name):
+            continue
+
+        # Rename '.part' file if zip was not correctly downloaded
+        if os.path.getsize(os.path.join(input_dir, zip_name)) == 0:
+            shutil.move(
+                os.path.join(input_dir, zip_name) + '.part',
+                os.path.join(input_dir, zip_name))
+
+        # Read csv file inside zip archive
+        df = pd.read_csv(
+            os.path.join(input_dir, zip_name),
+            header=2,
+            skiprows=[3],
+            parse_dates=['Date'],
+            compression='zip',
+            dayfirst=True,
+            nrows=100)
+
+        # Extract metadata from last column
+        keys = ['Sites:', 'Variables:', 'Qualities:']
+        meta = {k: [] for k in keys}
+        var = None
+        for line in df.iloc[:, -1]:
+            if line in keys:
+                # Start of a new section
+                var = line
+            elif line == ' ' or var is None:
+                # Skip blank cells and anything before the first section
+                continue
+            else:
+                meta[var].append(line)
+
+        # Get bore specifics
+        site_data = meta['Sites:'][0]
+        lat = float(re.search(r'(?<=Lat:)\S+', site_data).group())
+        lon = float(re.search(r'(?<=Long:)\S+', site_data).group())
+        elev = float(re.search(r'(?<=Elev:).+(?=m)', site_data).group())
+        address = re.search(r'(?<=\d\.\d\.\d - ).+(?=\sLat)',
+                            site_data).group()
+        bore_id = re.search(r'^\S+', site_data).group()
+        site, hole, pipe = bore_id.split('.')
+
+        sites = pd.DataFrame()
+        sites['ID'] = [bore_id]
+        sites['Site'] = [site]
+        sites['Hole'] = [hole]
+        sites['Pipe'] = [pipe]
+        sites['Lat'] = [lat]
+        sites['Lon'] = [lon]
+        sites['Elev'] = [elev]
+        sites['Address'] = [address]
+        sites = sites.set_index('ID')
+
+        # Get basin from master site dataframe
+        sites['Basin name'] = master.loc[sites.index, 'Basin name']
+        sites['Basin code'] = master.loc[sites.index, 'Basin code']
+
+        # Save variable definitions
+        variables = pd.DataFrame(
+            [v.split(' - ', 1) for v in meta['Variables:']])
+        variables.columns = ['Code', 'Description']
+        variables['Code'] = variables['Code'].astype(int)
+        variables = variables.set_index('Code')
+
+        # Save quality definitions
+        qualities = pd.DataFrame(
+            [q.split(' - ', 1) for q in meta['Qualities:']])
+        qualities.columns = ['Code', 'Description']
+        qualities['Code'] = qualities['Code'].astype(int)
+        qualities = qualities.set_index('Code')
+
+        # Update existing values
+        csv_name_s = os.path.join(output_dir, 'sites.csv')
+        csv_name_v = os.path.join(output_dir, 'variables.csv')
+        csv_name_q = os.path.join(output_dir, 'qualities.csv')
+
+        sites = _merge_with_existing(sites, csv_name_s)
+        variables = _merge_with_existing(variables, csv_name_v)
+        qualities = _merge_with_existing(qualities, csv_name_q)
+
+        # Export updated tables
+        sites.to_csv(csv_name_s)
+        variables.to_csv(csv_name_v)
+        qualities.to_csv(csv_name_q)
+
+
 def extract_records(input_dir, output_dir, clean_up=False):
     """Extract downloaded bore records.
 
@@ -274,6 +410,23 @@ def extract_records(input_dir, output_dir, clean_up=False):
                 os.path.join(input_dir, zip_name) + '.part',
                 os.path.join(input_dir, zip_name))
 
+        # Read header
+        header = pd.read_csv(
+            os.path.join(input_dir, zip_name), compression='zip', nrows=3)
+
+        # Remove comments
+        header = header.iloc[:, 1:-1].T
+
+        # Apply product codes to all columns
+        header.iloc[1::2, 0] = header.iloc[::2, 0].values
+        header[0] = header[0].astype(float).astype(int).astype(str)
+
+        # Move quality label
+        header.iloc[1::2, 1] = header.iloc[1::2, 2]
+
+        # Combine labels
+        columns = [' '.join(c) for c in header.iloc[:, :-1].values]
+
         # Read csv file inside zip archive
         df = pd.read_csv(
             os.path.join(input_dir, zip_name),
@@ -283,32 +436,14 @@ def extract_records(input_dir, output_dir, clean_up=False):
             compression='zip',
             dayfirst=True)
 
+        # Update column names
+        df.columns = ['Date time'] + columns + ['Metadata']
+
         # Get bore specifics
-        meta = df.iloc[1, -1]
-        lat = float(re.search(r'(?<=Lat:)\S+', meta).group())
-        lon = float(re.search(r'(?<=Long:)\S+', meta).group())
-        elev = float(re.search(r'(?<=Elev:).+(?=m)', meta).group())
-        address = re.search(r'(?<=\d\.\d\.\d - ).+(?=\sLat)', meta).group()
+        meta = df['Metadata'].iloc[1]
         bore_id = re.search(r'^\S+', meta).group()
         site, hole, pipe = bore_id.split('.')
-
-        # FIXME: detect basin automatically
-        basin_id = 'MB'
-
-        # Rename columns
-        df = df.rename(
-            columns={
-                'Date': 'Date time',
-                'Bore level below MP': 'Below Measuring Point',
-                'GW Level - m AHD': 'Above Sea Level'
-            })
-
-        # Select output columns
-        df = df[[
-            'Date time',
-            'Below Measuring Point',
-            'Above Sea Level',
-        ]]
+        df = df.drop(columns='Metadata')
 
         # Set date index for resampling
         df.index = df['Date time']
@@ -325,15 +460,6 @@ def extract_records(input_dir, output_dir, clean_up=False):
                 df = df.resample('1w').mean()
                 df['Date time'] = df.index
 
-            # Add bore specifics to dataframe
-            df['Site'] = site
-            df['Hole'] = hole
-            df['Pipe'] = pipe
-            df['Lat'] = lat
-            df['Lon'] = lon
-            df['Elev'] = elev
-            df['Basin'] = basin_id
-
             master[period] = pd.concat([master[period], df])
 
         if clean_up:
@@ -341,12 +467,6 @@ def extract_records(input_dir, output_dir, clean_up=False):
             os.remove(os.path.join(input_dir, zip_name))
 
     for period in periods:
-        # Set column order
-        master[period] = master[period][[
-            'Date time', 'Basin', 'Site', 'Hole', 'Pipe',
-            'Below Measuring Point', 'Above Sea Level', 'Lat', 'Lon', 'Elev'
-        ]]
-
         # Get latest date from dataframe
         latest_date = master[period]['Date time'].iloc[-1].strftime('%Y-%m-%d')
         csv_name = os.path.join(