"""Download latest time series data from BOM website and update local copy.

BOM does not allow direct http requests to data files unless it believes the
request comes from a browser.

For example:

>>> pd.read_csv(url)    # This does not work
HTTPError: HTTP Error 403: Forbidden

>>> requests.get(url)   # This works!

D. Howe
2019-09-04
"""
import json
import os

import pandas as pd
import requests

# Set json product URL for North Head
# http://www.bom.gov.au/products/IDN60901/IDN60901.95768.shtml
json_url = 'http://www.bom.gov.au/fwo/IDN60901/IDN60901.95768.json'

# Set output directory
output_dir = 'bom'

# Set output column names
cols = [
    'wind_dir',
    'wind_spd_kmh',
    'gust_kmh',
]


def update_master(output_dir: str, csv_name: str, df: pd.DataFrame) -> None:
    """Update master csv time series.

    Rows of ``df`` whose index values are not already present in the master
    csv are appended to it. If the master csv does not exist yet, ``df``
    itself becomes the new master. The result is written back to
    ``output_dir/csv_name``.

    Args:
        output_dir (str): path to time series directory (created if missing)
        csv_name (str):   name of time series file
        df (dataframe):   dataframe with datetime index

    Returns:
        None
    """
    csv_path = os.path.join(output_dir, csv_name)

    # Keep the try body to the one call that signals "no master yet", so a
    # FileNotFoundError raised anywhere else is not silently absorbed.
    try:
        master = pd.read_csv(csv_path, index_col=0, parse_dates=True)
    except FileNotFoundError:
        # Create new master table if none exists
        master = df
    else:
        # Only include timestamps that do not already exist
        new_rows = df[~df.index.isin(master.index)]

        # DataFrame.append was removed in pandas 2.0; concat is the
        # supported way to stack the new rows under the existing ones.
        master = pd.concat([master, new_rows])

    # Make sure the target directory exists before the first export.
    # An empty output_dir means "current directory" for os.path.join, and
    # os.makedirs('') would raise, so skip it in that case.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Export master table
    master.to_csv(csv_path)


# Download JSON data
# requests has no default timeout, so set one to avoid hanging forever, and
# fail with a clear HTTP error instead of a JSON decode error on e.g. 403.
response = requests.get(json_url, timeout=30)
response.raise_for_status()
json_data = json.loads(response.content)

# Create dataframe
df = pd.DataFrame(json_data['observations']['data'])

# Set local time as index
df.index = pd.to_datetime(df['local_date_time_full'])
df = df.sort_index()

# Extract columns of interest
df = df[cols]

# Update master table
csv_name = json_url.split('/')[-1].replace('.json', '.csv')
update_master(output_dir, csv_name, df)