# waternsw-grabber/waternsw_grabber/waternsw_grabber.py

"""waternsw_grabber.py
Download bore records from the WaterNSW data portal.
"""

import os
import re
import time
import shutil
import logging
import warnings
import requests
import pandas as pd
from tqdm import tqdm
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    TimeoutException, StaleElementReferenceException, NoSuchElementException)


def has_admin():
    """Report whether the current user has admin rights.

    On Windows the check is whether the 'temp' folder under the system root
    can be listed. On other systems the process must have 'SUDO_USER' in its
    environment and an effective user ID of 0.
    https://stackoverflow.com/questions/2946746

    Returns:
        True if the user has admin rights, otherwise False
    """
    if os.name != 'nt':
        return 'SUDO_USER' in os.environ and os.geteuid() == 0

    try:
        # Check if C:/Windows/temp is readable for current user
        system_temp = os.path.join(os.environ.get('systemroot'), 'temp')
        os.listdir(system_temp)
    except PermissionError:
        return False
    return True


def wait_for_element(driver, by, x, timeout=180):
    """Block until an element matching the locator is present on the page.

    Args:
        driver:   selenium webdriver object
        by:       locator strategy (e.g. By.ID)
        x:        locator string
        timeout:  maximum wait time (seconds)

    Raises:
        TimeoutException if element does not load within timeout period
    """
    locator = (by, x)
    condition = EC.presence_of_element_located(locator)
    waiter = WebDriverWait(driver, timeout)
    waiter.until(condition)


def get_telemetered_bore(driver, bore_id, start_date, end_date):
    """Download single record from telemetered bore.

    Opens the WaterNSW real-time data page, loads the requested bore,
    switches to the outputs frame, selects a custom period with 'Download'
    output at 'All points' interval, requests the output and dismisses the
    popup that follows.

    Args:
        driver:      selenium webdriver object
        bore_id:     bore ID (string)
        start_date:  start date (string in YYYY-MM-DD format)
        end_date:    end date (string in YYYY-MM-DD format)

    Raises:
        ValueError if the page reports that the site has no record or no
            variables data, or if the expected form controls are missing
        TimeoutException if a page element or the results text does not
            load in time
    """
    # Parse dates first so that bad input fails before the browser is driven
    start_date = pd.to_datetime(start_date)
    end_date = pd.to_datetime(end_date)

    url = 'https://realtimedata.waternsw.com.au/water.stm'
    driver.get(url)

    driver.switch_to.default_content()
    webhyd = driver.find_element(By.ID, 'webhyd')
    driver.switch_to.frame(webhyd)

    # Load site specific page. The bore ID is handed over as a script
    # argument so that it is never interpreted as javascript source.
    driver.execute_script("go(arguments[0],'gw', 1)", str(bore_id))

    # Wait for results frame to load
    wait_for_element(driver, By.ID, 'gwgwlf_org')
    driver.switch_to.frame('gwgwlf_org')

    # Wait until body text of iframe has loaded, giving up after the same
    # 180 s used as the default element timeout in this module
    deadline = time.monotonic() + 180
    body_text = None
    while not body_text:
        if time.monotonic() > deadline:
            raise TimeoutException(
                'Results text did not load for site {}'.format(bore_id))
        try:
            # Get contents of body text
            body_text = driver.find_element(By.XPATH, '//*/body').text
        except (StaleElementReferenceException, NoSuchElementException):
            # Frame is still being (re)built; try again
            pass
        time.sleep(0.5)

    # Detect if bore record does not exist
    if 'No SITE record found for site' in body_text:
        raise ValueError('No SITE record found for site {}'.format(bore_id))
    elif 'No variables data found for this site.' in body_text:
        raise ValueError('No variables data found for site {}'.format(bore_id))

    # Wait for navigation tabs
    wait_for_element(driver, By.XPATH, '//*[@id="tabstext"]')

    # Activate outputs tab, and wait for 'Get Output' button
    driver.execute_script("menuloc.display_frame('gw', 'gwcf_org', '1')")
    driver.switch_to.parent_frame()
    wait_for_element(driver, By.ID, 'gwgwcf_org')
    driver.switch_to.frame('gwgwcf_org')
    wait_for_element(driver, By.ID, 'submit')

    # Get output select controls, identified by the label of their first
    # option
    roles = {'All data': 'period', 'Plot': 'output', 'Annual': 'interval'}
    controls = {}
    for select in driver.find_elements(By.XPATH, '//*/select'):
        s = Select(select)
        options = s.options
        if not options:
            continue
        label = options[0].get_attribute('label')
        if label in roles:
            controls[roles[label]] = s

    missing = sorted(set(roles.values()) - set(controls))
    if missing:
        # ValueError is logged per bore by telemetered_bore_downloader()
        raise ValueError('Could not find {} dropdown(s) for site {}'.format(
            ', '.join(missing), bore_id))

    # Change period dropdown to 'Custom'
    controls['period'].select_by_visible_text('Custom')

    # Get date input fields
    fields = driver.find_elements(By.XPATH, '//*[starts-with(@id,"cdate")]')
    if len(fields) < 2:
        raise ValueError(
            'Could not find date input fields for site {}'.format(bore_id))

    # Update fields with specified dates (first field takes the start date,
    # second field takes the end date)
    for field, date in zip(fields, [start_date, end_date]):
        field.clear()
        field.send_keys(date.strftime('%H:%M_%d/%m/%Y'))

    # Set output dropdown to 'Download'
    controls['output'].select_by_visible_text('Download')

    # Set interval dropdown to 'All points'
    controls['interval'].select_by_visible_text('All points')

    # Make sure 'Groundwater Level - AHD' is selected as an output
    try:
        checkbox = driver.find_element(
            By.XPATH, '//*/input[contains(@name, "sel__110.00_115.00")]')
        if not checkbox.is_selected():
            checkbox.click()
    except NoSuchElementException:
        # This output is not offered for every bore
        pass

    # Download data
    driver.execute_script("get_output()")
    driver.execute_script("hide_object('confirm');co(level,tab,1)")

    # Close popup
    wait_for_element(
        driver,
        By.XPATH,
        "//div[contains(@class, 'lity-container')]",
        timeout=60)
    webdriver.ActionChains(driver).send_keys(Keys.ESCAPE).perform()


def open_browser(download_dir):
    """Start an automated Firefox instance that saves downloads silently.

    Args:
        download_dir:  path to where downloaded files will be saved

    Returns:
        A selenium web browser object
    """

    # Firefox needs an absolute path for its download directory
    target_dir = os.path.abspath(download_dir)

    # File types that are saved without asking the user
    silent_types = ','.join([
        'application/zip',
        'application/octet-stream',
        'application/x-zip-compressed',
        'multipart/x-zip',
    ])

    # Preferences for silently downloading files to the specified folder
    preferences = [
        ('browser.download.folderList', 2),
        ('browser.download.manager.showWhenStarting', False),
        ('browser.download.dir', target_dir),
        ('browser.helperApps.neverAsk.saveToDisk', silent_types),
    ]

    profile = webdriver.FirefoxProfile()
    for name, value in preferences:
        profile.set_preference(name, value)

    # Create download folder if it does not exist
    os.makedirs(target_dir, exist_ok=True)

    # Open browser
    return webdriver.Firefox(firefox_profile=profile)


def telemetered_bore_downloader(bore_ids, start_date, end_date, download_dir):
    """Download multiple records from telemetered bore.

    Each bore is requested in turn through a single browser session. A
    ValueError or TimeoutException for an individual bore is not raised;
    it is written to 'errors.log' inside download_dir and the next bore is
    attempted. When the run finishes a warning is issued if anything was
    logged, otherwise the log file is removed.

    Args:
        bore_ids:      bore ID values (array-like)
        start_date:    start date (string YYYY-MM-DD format)
        end_date:      end date (string YYYY-MM-DD format)
        download_dir:  path to where downloaded files will be saved
    """
    driver = open_browser(download_dir)

    # Set up log File
    log_name = os.path.join(download_dir, 'errors.log')
    failed = []

    try:
        logging.basicConfig(filename=log_name, level=logging.ERROR)

        # Download bore logs
        pbar = tqdm(bore_ids)
        for bore_id in pbar:
            # Progress bar descriptions must be strings
            pbar.set_description(str(bore_id))
            try:
                get_telemetered_bore(driver, bore_id, start_date, end_date)
            except ValueError as e:
                failed.append(bore_id)
                logging.error(e)
            except TimeoutException:
                failed.append(bore_id)
                e = 'Request timed out on {}. Try again later?'.format(bore_id)
                logging.error(e)
    finally:
        # Always close the browser, even if an unexpected error escapes
        driver.quit()

    # Tidy up console after tqdm
    print('\n')

    # Stop logging
    logging.shutdown()
    try:
        with open(log_name, 'r') as f:
            log_data = f.read()
    except FileNotFoundError:
        # basicConfig() does nothing if the root logger was already
        # configured, in which case the log file is never created
        log_data = ''

    # Check contents of log file
    if log_data or failed:
        warnings.warn(
            'Some files failed to download. See log for details.',
            stacklevel=2)
    elif os.path.exists(log_name):
        os.remove(log_name)


def _definition_table(entries):
    """Turn 'code - description' strings into a table indexed by code."""
    table = pd.DataFrame(
        [entry.split(' - ', 1) for entry in entries],
        columns=['Code', 'Description'])
    table['Code'] = table['Code'].astype(int)
    return table.set_index('Code')


def _merge_with_saved(table, csv_name):
    """Combine freshly parsed rows with a previously exported csv, if any.

    Fresh rows come first, so they win when an index value appears in both.
    """
    try:
        saved = pd.read_csv(csv_name, index_col=0)
    except FileNotFoundError:
        merged = table
    else:
        merged = pd.concat([table, saved])

    # De-duplicate on the index (bore ID or code) rather than on the
    # column values, so that distinct codes sharing a description survive
    merged = merged[~merged.index.duplicated(keep='first')]
    return merged.sort_index()


def extract_definitions(input_dir, output_dir):
    """Extract variable and quality metadata from bore records.

    Reads the first 100 data rows of every zip archive in input_dir, parses
    the 'Sites:', 'Variables:' and 'Qualities:' sections found in the last
    column, and merges them into 'sites.csv', 'variables.csv' and
    'qualities.csv' inside output_dir. Basin names and codes are looked up
    in 'data/telemetered-sites.csv' next to this module.

    Args:
        input_dir:   path to downloaded zip archives
        output_dir:  path to save csv files

    Returns:
        Dataframe of sites indexed by bore ID (one row per bore). It is
        empty when input_dir holds no usable archive.

    Raises:
        ValueError if an archive contains no 'Sites:' entry
    """

    # Get basin info for telemetered site data
    csv_name = os.path.join(
        os.path.dirname(__file__), 'data', 'telemetered-sites.csv')
    basins = pd.read_csv(csv_name, index_col=0)

    # Find zip files
    zip_names = [f for f in os.listdir(input_dir) if f.endswith('.zip')]

    # Prepare output directory
    os.makedirs(output_dir, exist_ok=True)

    # Output tables
    csv_name_s = os.path.join(output_dir, 'sites.csv')
    csv_name_v = os.path.join(output_dir, 'variables.csv')
    csv_name_q = os.path.join(output_dir, 'qualities.csv')

    # Returned unchanged if no archive is processed
    sites = pd.DataFrame(
        columns=['Site', 'Hole', 'Pipe', 'Lat', 'Lon', 'Elev', 'Address',
                 'Basin name', 'Basin code'],
        index=pd.Index([], name='ID'))

    keys = ['Sites:', 'Variables:', 'Qualities:']

    for zip_name in zip_names:
        # Skip duplicate downloads
        if re.search(r'\([0-9]+\)', zip_name):
            continue

        zip_path = os.path.join(input_dir, zip_name)

        # Rename '.part' file if zip was not correctly downloaded
        if os.path.getsize(zip_path) == 0:
            shutil.move(zip_path + '.part', zip_path)

        # Read csv file inside zip archive
        df = pd.read_csv(
            zip_path,
            header=2,
            skiprows=[3],
            parse_dates=['Date'],
            compression='zip',
            dayfirst=True,
            nrows=100)

        # Extract metadata from last column
        meta = {k: [] for k in keys}
        section = None
        for line in df.iloc[:, -1]:
            # Skip padding (blank strings, or NaN where the cell is empty)
            if not isinstance(line, str) or not line.strip():
                continue
            if line in keys:
                section = line
            elif section is not None:
                meta[section].append(line)

        if not meta['Sites:']:
            raise ValueError(
                'No site metadata found in {}'.format(zip_name))

        # Get bore specifics
        site_data = meta['Sites:'][0]
        lat = float(re.search(r'(?<=Lat:)\S+', site_data).group())
        lon = float(re.search(r'(?<=Long:)\S+', site_data).group())
        elev = float(re.search(r'(?<=Elev:).+(?=m)', site_data).group())
        address = re.search(r'(?<=\d\.\d\.\d - ).+(?=\sLat)',
                            site_data).group()
        bore_id = re.search(r'^\S+', site_data).group()
        site, hole, pipe = bore_id.split('.')

        sites = pd.DataFrame(
            {
                'Site': [site],
                'Hole': [hole],
                'Pipe': [pipe],
                'Lat': [lat],
                'Lon': [lon],
                'Elev': [elev],
                'Address': [address],
            },
            index=pd.Index([bore_id], name='ID'))

        # Get basin from master site dataframe
        sites['Basin name'] = basins.loc[sites.index, 'Basin name']
        sites['Basin code'] = basins.loc[sites.index, 'Basin code']

        # Parse variable and quality definitions
        variables = _definition_table(meta['Variables:'])
        qualities = _definition_table(meta['Qualities:'])

        # Update existing values
        sites = _merge_with_saved(sites, csv_name_s)
        variables = _merge_with_saved(variables, csv_name_v)
        qualities = _merge_with_saved(qualities, csv_name_q)

        # Export updated tables
        sites.to_csv(csv_name_s)
        variables.to_csv(csv_name_v)
        qualities.to_csv(csv_name_q)

    sites = sites[~sites.index.duplicated(keep='first')]
    return sites


def extract_records(input_dir, output_dir, clean_up=False):
    """Extract downloaded bore records.

    Every zip archive in input_dir is read, labelled with its bore and
    basin details, and collected per basin code at three resolutions:
    'all' (as downloaded), 'daily' (daily means of the 'all' records) and
    'weekly' (weekly means of the daily records). One csv file per basin
    code and resolution is written to output_dir, named
    '<basin code>-<latest date>-<period>.csv'. Basin/period combinations
    without any records are not exported.

    Args:
        input_dir:   path to downloaded zip archives
        output_dir:  path to save csv files
        clean_up:    delete original zip archive after extracting it
    """

    # Update definition tables
    sites = extract_definitions(input_dir, output_dir)

    # Keep unique basin codes
    basin_codes = sites['Basin code'].unique()

    # Find zip files
    zip_names = [f for f in os.listdir(input_dir) if f.endswith('.zip')]

    # Prepare output directory
    os.makedirs(output_dir, exist_ok=True)

    # Collect frames per basin and period; they are joined once at the end
    # instead of re-copying a growing dataframe for every archive
    periods = ['all', 'daily', 'weekly']
    master = {
        basin_code: {period: [] for period in periods}
        for basin_code in basin_codes
    }

    for zip_name in tqdm(zip_names):
        # Skip duplicate downloads
        if re.search(r'\([0-9]+\)', zip_name):
            continue

        zip_path = os.path.join(input_dir, zip_name)

        # Rename '.part' file if zip was not correctly downloaded
        if os.path.getsize(zip_path) == 0:
            shutil.move(zip_path + '.part', zip_path)

        # Read header
        header = pd.read_csv(zip_path, compression='zip', nrows=3)

        # Remove comments
        header = header.iloc[:, 1:-1].T

        # Apply product codes to all columns
        header.iloc[1::2, 0] = header.iloc[::2, 0].values
        header[0] = header[0].astype(float).astype(int).astype(str)

        # Move quality label
        header.iloc[1::2, 1] = header.iloc[1::2, 2]

        # Combine labels
        columns = [' '.join(c) for c in header.iloc[:, :-1].values]

        # Read csv file inside zip archive
        df = pd.read_csv(
            zip_path,
            header=2,
            skiprows=[3],
            parse_dates=['Date'],
            index_col=['Date'],
            compression='zip',
            dayfirst=True)

        # Convert quality codes to integers
        for col in df.columns:
            if 'Quality' in col:
                df[col] = df[col].astype(int)

        # Update column names
        df.columns = columns + ['Metadata']

        # Get bore specifics (second metadata row starts with the bore ID)
        meta = df['Metadata'].iloc[1]
        bore_id = re.search(r'^\S+', meta).group()
        df = df.drop(columns='Metadata')

        # Get basin ID
        basin_code = sites.loc[bore_id, 'Basin code']

        # Resample if necessary. Only the measurement columns are averaged:
        # the bore detail columns added below hold text, which cannot be
        # averaged, and are simply re-attached after each resampling step.
        for period in periods:
            if period == 'daily':
                # Resample to daily timestamps
                df = df[columns].resample('1D').mean()

            elif period == 'weekly':
                # Resample to weekly timestamps
                df = df[columns].resample('1W').mean()

            # Add specific borehole details
            df = df.assign(
                Site=sites.loc[bore_id, 'Site'],
                Hole=sites.loc[bore_id, 'Hole'],
                Pipe=sites.loc[bore_id, 'Pipe'],
                Basin=sites.loc[bore_id, 'Basin code'])
            df = df[['Site', 'Hole', 'Pipe', 'Basin'] + columns]

            # Remove empty rows
            df = df.dropna()

            # Add to master collection
            master[basin_code][period].append(df)

        if clean_up:
            # Remove original zip archive
            os.remove(zip_path)

    for basin_code in basin_codes:
        for period in periods:
            frames = master[basin_code][period]
            if not frames:
                continue

            combined = pd.concat(frames)
            if combined.empty:
                # Nothing to export, and no date to name the file with
                continue

            # Get latest date from dataframe (records of several bores are
            # stacked, so the last row is not necessarily the latest)
            latest_date = combined.index.max().strftime('%Y-%m-%d')
            csv_name = os.path.join(
                output_dir, '{}-{}-{}.csv'.format(basin_code, latest_date,
                                                  period))

            # Export to csv
            combined.to_csv(csv_name, index=True, float_format='%0.3f')
Add setup.py 6 years ago			`"""waternsw_grabber.py`
			`Download bore records from the WaterNSW data portal.`
			`"""`

Initial commit 6 years ago			`import os`
			`import re`
Handle invalid bore IDs gracefully 6 years ago			`import time`
Fix broken downloaded '.part' files 6 years ago			`import shutil`
Create log for failed downloads 6 years ago			`import logging`
Initial commit 6 years ago			`import warnings`
Fix broken downloaded '.part' files 6 years ago			`import requests`
Initial commit 6 years ago			`import pandas as pd`
Add telemetered_bore_downloader() function 6 years ago			`from tqdm import tqdm`
Initial commit 6 years ago			`from selenium import webdriver`
			`from selenium.webdriver.common.by import By`
			`from selenium.webdriver.common.keys import Keys`
			`from selenium.webdriver.support.ui import WebDriverWait, Select`
			`from selenium.webdriver.support import expected_conditions as EC`
Handle invalid bore IDs gracefully 6 years ago			`from selenium.common.exceptions import (`
			`TimeoutException, StaleElementReferenceException, NoSuchElementException)`
Initial commit 6 years ago

			`def has_admin():`
			`"""Check if current user has admin rights.`
			`https://stackoverflow.com/questions/2946746`
			`"""`
			`if os.name == 'nt':`
			`try:`
			`# Check if C:/Windows/temp is readable for current user`
			`os.listdir(os.path.join(os.environ.get('systemroot'), 'temp'))`
			`except PermissionError:`
			`return False`
			`else:`
			`return True`
			`else:`
			`if 'SUDO_USER' in os.environ and os.geteuid() == 0:`
			`return True`
			`else:`
			`return False`


Increase timeout interval to 3 minutes 6 years ago			`def wait_for_element(driver, by, x, timeout=180):`
Raise TimeoutException when wait_for_element() fails 6 years ago			`"""Wait for element on page to load.`

			`Args:`
			`driver: selenium webdriver object`
			`by: locator strategy (e.g. By.ID)`
			`x: locator string`
			`timeout: maximum wait time (seconds)`

			`Raises`
			`TimeoutException if element does not load within timeout period`
			`"""`
Move TimeoutException handler to telemetered_bore_downloader() function 6 years ago			`element_present = EC.presence_of_element_located((by, x))`
			`WebDriverWait(driver, timeout).until(element_present)`
Initial commit 6 years ago

			`def get_telemetered_bore(driver, bore_id, start_date, end_date):`
Update docstrings 6 years ago			`"""Download single record from telemetered bore.`
Add setup.py 6 years ago
			`Args:`
			`driver: selenium webdriver object`
Update docstrings 6 years ago			`bore_id: bore ID (string)`
Update docstrings 6 years ago			`start_date: start date (string in YYYY-MM-DD format)`
			`end_date: end date (string in YYYY-MM-DD format)`
Add setup.py 6 years ago			`"""`

Handle invalid bore IDs gracefully 6 years ago			`url = 'https://realtimedata.waternsw.com.au/water.stm'`
			`driver.get(url)`

Initial commit 6 years ago			`driver.switch_to.default_content()`
			`webhyd = driver.find_element_by_id('webhyd')`
			`driver.switch_to.frame(webhyd)`

			`# Load site specific page`
			`driver.execute_script("go('{}','gw', 1)".format(bore_id))`

			`# Wait for results frame to load`
Use consistent function for waits 6 years ago			`wait_for_element(driver, By.ID, 'gwgwlf_org')`
			`driver.switch_to.frame('gwgwlf_org')`
Handle invalid bore IDs gracefully 6 years ago
			`# Wait until body text of iframe has loaded`
			`body_text = None`
			`while not body_text:`
			`try:`
			`# Get contents of body text`
			`body_text = driver.find_element_by_xpath('//*/body').text`
			`except (StaleElementReferenceException, NoSuchElementException):`
			`pass`
			`time.sleep(0.5)`

			`# Detect if bore record does not exist`
Handle errors for sites with no data 6 years ago			`if 'No SITE record found for site' in body_text:`
Handle invalid bore IDs gracefully 6 years ago			`raise ValueError('No SITE record found for site {}'.format(bore_id))`
Handle errors for sites with no data 6 years ago			`elif 'No variables data found for this site.' in body_text:`
			`raise ValueError('No variables data found for site {}'.format(bore_id))`
Handle invalid bore IDs gracefully 6 years ago
			`# Wait for navigation tabs`
Initial commit 6 years ago			`wait_for_element(driver, By.XPATH, '//*[@id="tabstext"]')`

			`# Activate outputs tab, and wait for 'Get Output' button`
Add missing quotes around javascript function call 6 years ago			`driver.execute_script("menuloc.display_frame('gw', 'gwcf_org', '1')")`
Initial commit 6 years ago			`driver.switch_to.parent_frame()`
Use consistent function for waits 6 years ago			`wait_for_element(driver, By.ID, 'gwgwcf_org')`
			`driver.switch_to.frame('gwgwcf_org')`
Initial commit 6 years ago			`wait_for_element(driver, By.ID, 'submit')`

			`# Get output select controls`
			`selects = driver.find_elements_by_xpath('//*/select')`
			`for select in selects:`
			`s = Select(select)`
			`label = s.options[0].get_attribute('label')`
			`if label == 'All data':`
			`period = s`
			`elif label == 'Plot':`
			`output = s`
			`elif label == 'Annual':`
			`interval = s`

			`# Change period dropdown to 'Custom'`
			`period.select_by_visible_text('Custom')`

			`# Get date input fields`
			`fields = driver.find_elements_by_xpath('//*[starts-with(@id,"cdate")]')`

			`# Parse dates`
			`start_date = pd.to_datetime(start_date)`
			`end_date = pd.to_datetime(end_date)`

			`# Update fields with specified dates`
			`for field, date in zip(fields, [start_date, end_date]):`
			`field.clear()`
			`field.send_keys(pd.datetime.strftime(date, '%H:%M_%d/%m/%Y'))`

			`# Set output dropdown to 'Download'`
			`output.select_by_visible_text('Download')`

			`# Set interval dropdown to 'All points'`
			`interval.select_by_visible_text('All points')`

			`# Make sure 'Groundwater Level - AHD' is selected as an output`
Handle errors where GW level ( m AHD) is not available 6 years ago			`try:`
			`checkbox = driver.find_element_by_xpath(`
			`'//*/input[contains(@name, "sel__110.00_115.00")]')`
			`if not checkbox.get_attribute('selected'):`
			`checkbox.click()`
			`except NoSuchElementException:`
			`pass`
Initial commit 6 years ago
			`# Download data`
			`driver.execute_script("get_output()")`
			`driver.execute_script("hide_object('confirm');co(level,tab,1)")`

			`# Close popup`
Handle invalid bore IDs gracefully 6 years ago			`wait_for_element(`
			`driver,`
			`By.XPATH,`
			`"//div[contains(@class, 'lity-container')]",`
Increase timeout duration to 60 s 6 years ago			`timeout=60)`
Initial commit 6 years ago			`webdriver.ActionChains(driver).send_keys(Keys.ESCAPE).perform()`


Use Firefox, and specify download location 6 years ago			`def open_browser(download_dir):`
Update docstrings 6 years ago			`"""Opens an automated Firefox browser instance.`

			`Args:`
			`download_dir: path to where downloaded files will be saved`

			`Returns:`
			`A selenium web browser object`
			`"""`

Use absolute paths for Firefox download directory 6 years ago			`# Make download directory absolute`
			`download_dir = os.path.abspath(download_dir)`

Don't warn if user is not an administrator 6 years ago			`# Set up Firefox to silently download files to specified folder`
Use Firefox, and specify download location 6 years ago			`profile = webdriver.FirefoxProfile()`
			`profile.set_preference('browser.download.folderList', 2)`
			`profile.set_preference('browser.download.manager.showWhenStarting', False)`
			`profile.set_preference('browser.download.dir', download_dir)`
			`profile.set_preference('browser.helperApps.neverAsk.saveToDisk',`
			`('application/zip,'`
			`'application/octet-stream,'`
			`'application/x-zip-compressed,'`
			`'multipart/x-zip'))`

Don't warn if user is not an administrator 6 years ago			`# Create download folder if it does not exist`
			`os.makedirs(download_dir, exist_ok=True)`

Initial commit 6 years ago			`# Open browser`
Use Firefox, and specify download location 6 years ago			`driver = webdriver.Firefox(firefox_profile=profile)`
Initial commit 6 years ago
			`return driver`
Add telemetered_bore_downloader() function 6 years ago

			`def telemetered_bore_downloader(bore_ids, start_date, end_date, download_dir):`
Update docstrings 6 years ago			`"""Download multiple records from telemetered bore.`

			`Args:`
			`bore_ids: bore ID values (array-like)`
Update docstrings 6 years ago			`start_date: start date (string YYYY-MM-DD format)`
			`end_date: end date (string YYYY-MM-DD format)`
Update docstrings 6 years ago			`download_dir: path to where downloaded files will be saved`

			`Raises:`
			`ValueError when bore ID is invalid`
			`"""`
Add telemetered_bore_downloader() function 6 years ago			`driver = open_browser(download_dir)`

Create log for failed downloads 6 years ago			`# Set up log File`
			`log_name = os.path.join(download_dir, 'errors.log')`
			`logging.basicConfig(filename=log_name, level=logging.ERROR)`

Add telemetered_bore_downloader() function 6 years ago			`# Download bore logs`
			`pbar = tqdm(bore_ids)`
			`for bore_id in pbar:`
			`pbar.set_description(bore_id)`
Handle invalid bore IDs gracefully 6 years ago			`try:`
			`get_telemetered_bore(driver, bore_id, start_date, end_date)`
Move TimeoutException handler to telemetered_bore_downloader() function 6 years ago			`except ValueError as e:`
			`logging.error(e)`
			`except TimeoutException:`
			`e = 'Request timed out on {}. Try again later?'.format(bore_id)`
Handle errors for sites with no data 6 years ago			`logging.error(e)`
Create log for failed downloads 6 years ago
			`# Tidy up console after tqdm`
			`print('\n')`

			`# Stop logging`
			`logging.shutdown()`
			`with open(log_name, 'r') as f:`
			`log_data = f.read()`

			`# Check contents of log file`
			`if log_data:`
			`warnings.warn(`
			`'Some files failed to download. See log for details.',`
			`stacklevel=2)`
			`else:`
			`os.remove(log_name)`
Add telemetered_bore_downloader() function 6 years ago
			`driver.quit()`
Add extract_records() function 6 years ago

Add extract_definitions() function 6 years ago			`def extract_definitions(input_dir, output_dir):`
			`"""Extract variable and quality metadata from bore records.`

			`Args:`
			`input_dir: path to downloaded zip archives`
			`output_dir: path to save csv files`
			`"""`

Change basin dataframe name 6 years ago			`# Get basin info for telemetered site data`
Add extract_definitions() function 6 years ago			`csv_name = os.path.join(`
Fix relative path 6 years ago			`os.path.dirname(__file__), 'data', 'telemetered-sites.csv')`
Change basin dataframe name 6 years ago			`basins = pd.read_csv(csv_name, index_col=0)`
Add extract_definitions() function 6 years ago
			`# Find zip files`
			`zip_names = [f for f in os.listdir(input_dir) if f.endswith('.zip')]`

			`# Prepare output directory`
			`os.makedirs(output_dir, exist_ok=True)`

Get basin ID codes automatically 6 years ago			`for zip_name in zip_names:`
Add extract_definitions() function 6 years ago			`# Skip duplicate downloads`
			`if re.search(r'\([0-9]+\)', zip_name):`
			`continue`

			`# Rename '.part' file if zip was not correctly downloaded`
			`if os.path.getsize(os.path.join(input_dir, zip_name)) == 0:`
			`shutil.move(`
			`os.path.join(input_dir, zip_name) + '.part',`
			`os.path.join(input_dir, zip_name))`

			`# Read csv file inside zip archive`
			`df = pd.read_csv(`
			`os.path.join(input_dir, zip_name),`
			`header=2,`
			`skiprows=[3],`
			`parse_dates=['Date'],`
			`compression='zip',`
			`dayfirst=True,`
			`nrows=100)`

			`# Extract metadata from last column`
			`keys = ['Sites:', 'Variables:', 'Qualities:']`
			`meta = {k: [] for k in keys}`
			`for i, row in df.iterrows():`
			`line = row.values[-1]`
			`if line in keys:`
			`header = True`
			`var = line`
			`elif line == ' ':`
			`continue`
			`else:`
			`meta[var].append(line)`

			`# Get bore specifics`
			`site_data = meta['Sites:'][0]`
			`lat = float(re.search(r'(?<=Lat:)\S+', site_data).group())`
			`lon = float(re.search(r'(?<=Long:)\S+', site_data).group())`
			`elev = float(re.search(r'(?<=Elev:).+(?=m)', site_data).group())`
			`address = re.search(r'(?<=\d\.\d\.\d - ).+(?=\sLat)',`
			`site_data).group()`
			`bore_id = re.search(r'^\S+', site_data).group()`
			`site, hole, pipe = bore_id.split('.')`

			`sites = pd.DataFrame()`
			`sites['ID'] = [bore_id]`
			`sites['Site'] = [site]`
			`sites['Hole'] = [hole]`
			`sites['Pipe'] = [pipe]`
			`sites['Lat'] = [lat]`
			`sites['Lon'] = [lon]`
			`sites['Elev'] = [elev]`
			`sites['Address'] = [address]`
			`sites = sites.set_index('ID')`

			`# Get basin from master site dataframe`
Change basin dataframe name 6 years ago			`sites['Basin name'] = basins.loc[sites.index, 'Basin name']`
			`sites['Basin code'] = basins.loc[sites.index, 'Basin code']`
Add extract_definitions() function 6 years ago
			`# Save variable definitions`
			`variables = pd.DataFrame(`
			`[v.split(' - ', 1) for v in meta['Variables:']])`
			`variables.columns = ['Code', 'Description']`
			`variables['Code'] = variables['Code'].astype(int)`
			`variables = variables.set_index('Code')`

			`# Save quality definitions`
			`qualities = pd.DataFrame(`
			`[q.split(' - ', 1) for q in meta['Qualities:']])`
			`qualities.columns = ['Code', 'Description']`
			`qualities['Code'] = qualities['Code'].astype(int)`
			`qualities = qualities.set_index('Code')`

			`# Update existing values`
			`csv_name_s = os.path.join(output_dir, 'sites.csv')`
			`csv_name_v = os.path.join(output_dir, 'variables.csv')`
			`csv_name_q = os.path.join(output_dir, 'qualities.csv')`

			`try:`
			`sites = sites.append(pd.read_csv(csv_name_s, index_col=0))`
			`sites = sites.drop_duplicates().sort_index()`
			`except FileNotFoundError:`
			`pass`

			`try:`
			`variables = variables.append(pd.read_csv(csv_name_v, index_col=0))`
			`variables = variables.drop_duplicates().sort_index()`
			`except FileNotFoundError:`
			`pass`

			`try:`
			`variables = variables.append(pd.read_csv(csv_name_q, index_col=0))`
			`qualities = qualities.drop_duplicates().sort_index()`
			`except FileNotFoundError:`
			`pass`

			`# Export updated tables`
			`sites.to_csv(csv_name_s)`
			`variables.to_csv(csv_name_v)`
			`qualities.to_csv(csv_name_q)`

Add basin code and bore details to output 6 years ago			`sites = sites[~sites.index.duplicated(keep='first')]`
			`return sites`

Add extract_definitions() function 6 years ago
def extract_records(input_dir, output_dir, clean_up=False):
    """Extract downloaded bore records.

    Every zip archive in 'input_dir' is read, labelled with the details of its
    bore (looked up in the site definitions table) and appended to a table for
    its basin. Three tables are kept per basin: 'all' (timestamps as
    downloaded), 'daily' and 'weekly'. The daily table holds daily means of
    the 'all' rows, and the weekly table holds weekly means of the daily rows.
    Each non-empty table is written to
    '<output_dir>/<basin code>-<latest date>-<period>.csv'.

    Args:
        input_dir:  path to downloaded zip archives
        output_dir: path to save csv files
        clean_up:   delete original zip archive after extracting it

    Raises:
        KeyError if the bore ID of a record is not in the site definitions
    """

    # Update definition tables
    sites = extract_definitions(input_dir, output_dir)

    # Keep unique basin codes
    basin_codes = sites['Basin code'].unique()

    # Find zip files
    zip_names = [f for f in os.listdir(input_dir) if f.endswith('.zip')]

    # Prepare output directory
    os.makedirs(output_dir, exist_ok=True)

    # Resampling rule for each period (None keeps the original timestamps).
    # The uppercase aliases are the canonical spelling; newer pandas releases
    # deprecate the lowercase forms.
    periods = ['all', 'daily', 'weekly']
    rules = {'all': None, 'daily': '1D', 'weekly': '1W'}
    detail_names = ['Site', 'Hole', 'Pipe', 'Basin']

    # Create master dataframe
    master = {}
    for basin_code in basin_codes:
        master[basin_code] = {}
        for period in periods:
            master[basin_code][period] = pd.DataFrame()

    for zip_name in tqdm(zip_names):
        # Skip duplicate downloads
        if re.search(r'\([0-9]+\)', zip_name):
            continue

        zip_path = os.path.join(input_dir, zip_name)

        # Rename '.part' file if zip was not correctly downloaded
        if os.path.getsize(zip_path) == 0:
            shutil.move(zip_path + '.part', zip_path)

        # Read header
        header = pd.read_csv(zip_path, compression='zip', nrows=3)

        # Remove comments
        header = header.iloc[:, 1:-1].T

        # Apply product codes to all columns
        header.iloc[1::2, 0] = header.iloc[::2, 0].values
        header[0] = header[0].astype(float).astype(int).astype(str)

        # Move quality label
        header.iloc[1::2, 1] = header.iloc[1::2, 2]

        # Combine labels
        columns = [' '.join(c) for c in header.iloc[:, :-1].values]

        # Read csv file inside zip archive
        df = pd.read_csv(
            zip_path,
            header=2,
            skiprows=[3],
            parse_dates=['Date'],
            index_col=['Date'],
            compression='zip',
            dayfirst=True)

        # Convert quality codes to integers
        for col in df.columns:
            if 'Quality' in col:
                df[col] = df[col].astype(int)

        # Update column names
        df.columns = columns + ['Metadata']

        # Get bore ID (first whitespace-delimited token of the metadata)
        meta = df['Metadata'].iloc[1]
        bore_id = re.search(r'^\S+', meta).group()

        # Get basin ID
        basin_code = sites.loc[bore_id, 'Basin code']

        # Look up the borehole details once per record
        bore_details = {
            'Site': sites.loc[bore_id, 'Site'],
            'Hole': sites.loc[bore_id, 'Hole'],
            'Pipe': sites.loc[bore_id, 'Pipe'],
            'Basin': basin_code,
        }

        # Only the measurement columns are carried from one period to the
        # next, so the bore details (which may be text) are never averaged.
        data = df.drop(columns='Metadata')

        # Resample if necessary
        for period in periods:
            rule = rules[period]
            if rule is not None:
                data = data.resample(rule).mean()

            # Add specific borehole details
            labelled = data.assign(**bore_details)
            labelled = labelled[detail_names + columns]

            # Remove empty rows
            labelled = labelled.dropna()

            # Add to master dataframe
            master[basin_code][period] = pd.concat(
                [master[basin_code][period], labelled])

            # The next (coarser) period is derived from these cleaned rows
            data = labelled[columns]

        if clean_up:
            # Remove original zip archive
            os.remove(zip_path)

    for basin_code in basin_codes:
        for period in periods:
            records = master[basin_code][period]

            # A basin listed in the definitions may have no records in this
            # batch; there is nothing to export and no date for the file name.
            if records.empty:
                logging.warning(
                    'No %s records for basin %s: skipping export',
                    period, basin_code)
                continue

            # Get latest date from dataframe. Rows are grouped by bore, not
            # sorted by date, so the last row is not necessarily the latest.
            latest_date = records.index.max().strftime('%Y-%m-%d')
            csv_name = os.path.join(
                output_dir, '{}-{}-{}.csv'.format(basin_code, latest_date,
                                                  period))

            # Export to csv
            records.to_csv(csv_name, index=True, float_format='%0.3f')