"""Download latest time series data from BOM website and update local copy. BOM does not allow direct http requests to data files, so a requests session is opened on the product page before accessing the resource. D. Howe 2019-09-04 """ import os import json import requests import pandas as pd from lxml import html # Set base URL bom_url = 'http://www.bom.gov.au/' # Set product URL for North Head product_url = 'products/IDN60901/IDN60901.95768.shtml' # Set output directory output_dir = 'bom' # Set output column names cols = [ 'wind_dir', 'wind_spd_kmh', 'gust_kmh', ] def update_master(output_dir, csv_name, df): """Update master csv time series. Args: output_dir (str): path to time series directory csv_name (str): name of time series file df (dataframe): dataframe with datetime index Returns: None """ try: # Load local master table if it exists master = pd.read_csv(os.path.join(output_dir, csv_name), index_col=0, parse_dates=True) # Only include timestamps that do not already exist df = df[~df.index.isin(master.index)] # Update master master = master.append(df) except FileNotFoundError: # Create new master table if none exists master = df # Export master table master.to_csv(os.path.join(output_dir, csv_name)) # Open new session with BOM website with requests.session() as session: # Load product page page = session.get(bom_url + product_url) tree = html.fromstring(page.content) # Find and download JSON data json_url = tree.xpath('//*[@id="content"]/p[4]/a')[0].get('href') json_data = json.loads(session.get(bom_url + json_url).content) # Extract file base name csv_name = json_url.split('/')[-1].replace('.json', '.csv') # Create dataframe df = pd.DataFrame(json_data['observations']['data']) # Set local time as index df.index = pd.to_datetime(df['local_date_time_full']) df = df.sort_index() # Extract columns of interest df = df[cols] # Update master table update_master(output_dir, csv_name, df)