|
|
|
@ -262,7 +262,7 @@ def extract_definitions(input_dir, output_dir):
|
|
|
|
|
# Prepare output directory
|
|
|
|
|
os.makedirs(output_dir, exist_ok=True)
|
|
|
|
|
|
|
|
|
|
for zip_name in tqdm(zip_names):
|
|
|
|
|
for zip_name in zip_names:
|
|
|
|
|
# Skip duplicate downloads
|
|
|
|
|
if re.search(r'\([0-9]+\)', zip_name):
|
|
|
|
|
continue
|
|
|
|
@ -373,6 +373,9 @@ def extract_records(input_dir, output_dir, clean_up=False):
|
|
|
|
|
clean_up: delete original zip archive after extracting it
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
# Update definition tables
|
|
|
|
|
extract_definitions(input_dir, output_dir)
|
|
|
|
|
|
|
|
|
|
# Get basin info for telemetered site data
|
|
|
|
|
csv_name = os.path.join(
|
|
|
|
|
os.path.dirname(os.path.dirname(__file__)), 'data',
|
|
|
|
@ -382,14 +385,25 @@ def extract_records(input_dir, output_dir, clean_up=False):
|
|
|
|
|
# Find zip files
|
|
|
|
|
zip_names = [f for f in os.listdir(input_dir) if f.endswith('.zip')]
|
|
|
|
|
|
|
|
|
|
# Get basin IDs for all zip files
|
|
|
|
|
basin_codes = []
|
|
|
|
|
for zip_name in zip_names:
|
|
|
|
|
bore_id = os.path.splitext(zip_name)[0].replace('cf', '')
|
|
|
|
|
basin_codes.append(basins.loc[bore_id, 'Basin code'])
|
|
|
|
|
|
|
|
|
|
# Keep unique basin codes
|
|
|
|
|
basin_codes = list(set(basin_codes))
|
|
|
|
|
|
|
|
|
|
# Prepare output directory
|
|
|
|
|
os.makedirs(output_dir, exist_ok=True)
|
|
|
|
|
|
|
|
|
|
# Create master dataframe
|
|
|
|
|
periods = ['all', 'daily', 'weekly']
|
|
|
|
|
master = {}
|
|
|
|
|
for period in periods:
|
|
|
|
|
master[period] = pd.DataFrame()
|
|
|
|
|
for basin_code in basin_codes:
|
|
|
|
|
master[basin_code] = {}
|
|
|
|
|
for period in periods:
|
|
|
|
|
master[basin_code][period] = pd.DataFrame()
|
|
|
|
|
|
|
|
|
|
for zip_name in tqdm(zip_names):
|
|
|
|
|
# Skip duplicate downloads
|
|
|
|
@ -425,44 +439,49 @@ def extract_records(input_dir, output_dir, clean_up=False):
|
|
|
|
|
header=2,
|
|
|
|
|
skiprows=[3],
|
|
|
|
|
parse_dates=['Date'],
|
|
|
|
|
index_col=['Date'],
|
|
|
|
|
compression='zip',
|
|
|
|
|
dayfirst=True)
|
|
|
|
|
dayfirst=True,
|
|
|
|
|
nrows=100)
|
|
|
|
|
|
|
|
|
|
# FIXME: convert quality codes to integers
|
|
|
|
|
|
|
|
|
|
# Update column names
|
|
|
|
|
df.columns = ['Date time'] + columns + ['Metadata']
|
|
|
|
|
df.columns = columns + ['Metadata']
|
|
|
|
|
|
|
|
|
|
# Get bore specifics
|
|
|
|
|
meta = df['Metadata'].iloc[1]
|
|
|
|
|
bore_id = re.search(r'^\S+', meta).group()
|
|
|
|
|
site, hole, pipe = bore_id.split('.')
|
|
|
|
|
df.drop(columns='Metadata')
|
|
|
|
|
df = df.drop(columns='Metadata')
|
|
|
|
|
|
|
|
|
|
# Set date index for resampling
|
|
|
|
|
df.index = df['Date time']
|
|
|
|
|
# Get basin ID
|
|
|
|
|
basin_code = basins.loc[bore_id, 'Basin code']
|
|
|
|
|
|
|
|
|
|
# Append to master dataframe
|
|
|
|
|
for period in periods:
|
|
|
|
|
if period == 'daily':
|
|
|
|
|
# Resample to daily timestamps
|
|
|
|
|
df = df.resample('1d').mean()
|
|
|
|
|
df['Date time'] = df.index
|
|
|
|
|
# FIXME: add bore IDs
|
|
|
|
|
|
|
|
|
|
elif period == 'weekly':
|
|
|
|
|
# Resample to weekly timestamps
|
|
|
|
|
df = df.resample('1w').mean()
|
|
|
|
|
df['Date time'] = df.index
|
|
|
|
|
# FIXME: add bore IDs
|
|
|
|
|
|
|
|
|
|
master[period] = pd.concat([master[period], df])
|
|
|
|
|
master[basin_code][period] = pd.concat([master[basin_code][period], df])
|
|
|
|
|
|
|
|
|
|
if clean_up:
|
|
|
|
|
# Remove original zip archive
|
|
|
|
|
os.remove(os.path.join(input_dir, zip_name))
|
|
|
|
|
|
|
|
|
|
for period in periods:
|
|
|
|
|
# Get latest date from dataframe
|
|
|
|
|
latest_date = master[period]['Date time'].iloc[-1].strftime('%Y-%m-%d')
|
|
|
|
|
csv_name = os.path.join(
|
|
|
|
|
output_dir, '{}-{}-{}.csv'.format(basin_id, latest_date, period))
|
|
|
|
|
for basin_code in basin_codes:
|
|
|
|
|
for period in periods:
|
|
|
|
|
# Get latest date from dataframe
|
|
|
|
|
latest_date = master[basin_code][period].index[-1].strftime('%Y-%m-%d')
|
|
|
|
|
csv_name = os.path.join(
|
|
|
|
|
output_dir, '{}-{}-{}.csv'.format(basin_code, latest_date, period))
|
|
|
|
|
|
|
|
|
|
# Export to csv
|
|
|
|
|
master[period].to_csv(csv_name, index=False, float_format='%0.3f')
|
|
|
|
|
# Export to csv
|
|
|
|
|
master[basin_code][period].to_csv(csv_name, index=True, float_format='%0.3f')
|
|
|
|
|