diff --git a/bom-wind/README.md b/bom-wind/README.md
new file mode 100644
index 0000000..3d7bfa1
--- /dev/null
+++ b/bom-wind/README.md
@@ -0,0 +1,6 @@
+# bom_wind.py
+
+Download the latest BOM time series data and update the local copy.
+
+BOM does not allow direct HTTP requests to data files, so a requests session
+is opened on the product page before the data file is fetched.
diff --git a/bom-wind/bom_wind.py b/bom-wind/bom_wind.py
new file mode 100644
index 0000000..6dc1284
--- /dev/null
+++ b/bom-wind/bom_wind.py
@@ -0,0 +1,87 @@
+"""Download the latest BOM time series data and update the local copy.
+
+BOM does not allow direct HTTP requests to data files, so a requests session
+is opened on the product page before the data file is fetched.
+
+D. Howe
+2019-09-04
+"""
+import os
+import json
+import requests
+import pandas as pd
+from lxml import html
+
+# Set base URL
+bom_url = 'http://www.bom.gov.au/'
+
+# Set product URL for North Head
+product_url = 'products/IDN60901/IDN60901.95768.shtml'
+
+# Set output directory
+output_dir = 'bom'
+
+# Set output column names
+cols = [
+    'wind_dir',
+    'wind_spd_kmh',
+    'gust_kmh',
+]
+
+
+def update_master(output_dir, csv_name, df):
+    """Update master csv time series.
+
+    Args:
+        output_dir (str): path to time series directory
+        csv_name (str): name of time series file
+        df (dataframe): dataframe with datetime index
+
+    Returns:
+        None
+    """
+    try:
+        # Load local master table if it exists
+        master = pd.read_csv(os.path.join(output_dir, csv_name),
+                             index_col=0,
+                             parse_dates=True)
+
+        # Only include timestamps that do not already exist
+        df = df[~df.index.isin(master.index)]
+
+        # Append new observations to master
+        master = pd.concat([master, df])
+
+    except FileNotFoundError:
+        # Create new master table if none exists
+        master = df
+
+    # Export master table, creating the output directory if needed
+    os.makedirs(output_dir, exist_ok=True)
+    master.to_csv(os.path.join(output_dir, csv_name))
+
+
+# Open new session with BOM website
+with requests.Session() as session:
+    # Load product page
+    page = session.get(bom_url + product_url)
+    tree = html.fromstring(page.content)
+
+    # Find and download JSON data
+    json_url = tree.xpath('//*[@id="content"]/p[4]/a')[0].get('href')
+    json_data = json.loads(session.get(bom_url + json_url).content)
+
+# Extract file base name
+csv_name = json_url.split('/')[-1].replace('.json', '.csv')
+
+# Create dataframe
+df = pd.DataFrame(json_data['observations']['data'])
+
+# Set local time as index
+df.index = pd.to_datetime(df['local_date_time_full'])
+
+# Extract columns of interest
+df = df[cols]
+
+# Update master table
+update_master(output_dir, csv_name, df)
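
The positional XPath `//*[@id="content"]/p[4]/a` only works while the observations link stays in the fourth paragraph of the product page. A minimal sketch of a less position-dependent lookup, assuming the first anchor with a `.json` href on the page is the observations file:

```python
import requests
from lxml import html

bom_url = 'http://www.bom.gov.au/'
product_url = 'products/IDN60901/IDN60901.95768.shtml'

with requests.Session() as session:
    # Visit the product page first so the JSON request happens inside the
    # same session, as in bom_wind.py.
    page = session.get(bom_url + product_url)
    tree = html.fromstring(page.content)

    # Select anchors by their '.json' href rather than by paragraph position.
    # Assumes the first such link is the observations file.
    json_url = tree.xpath('//a[contains(@href, ".json")]/@href')[0]
    json_data = session.get(bom_url + json_url).json()
```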
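
After at least one successful run, the accumulated series can be inspected with pandas. A usage sketch; the file name shown is an assumption, derived from the product's JSON resource name the same way the script derives it:

```python
import pandas as pd

# Read the master time series written by bom_wind.py. The name
# 'IDN60901.95768.csv' is assumed here; the script derives it from the
# basename of the JSON link it finds on the product page.
master = pd.read_csv('bom/IDN60901.95768.csv', index_col=0, parse_dates=True)

# Latest wind direction, speed and gust observations (speeds in km/h).
print(master.tail())
```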