diff --git a/bom-wind/bom_wind.py b/bom-wind/bom_wind.py
index d88f7e8..200877d 100644
--- a/bom-wind/bom_wind.py
+++ b/bom-wind/bom_wind.py
@@ -1,7 +1,13 @@
 """Download latest time series data from BOM website and update local copy.
 
-BOM does not allow direct http requests to data files, so a requests session
-is opened on the product page before accessing the resource.
+BOM does not allow direct http requests to data files unless it believes the
+request comes from a browser. For example:
+
+>>> pd.read_csv(url) # This does not work
+HTTPError: HTTP Error 403: Forbidden
+
+>>> requests.get(url) # This works!
+
 
 D. Howe
 2019-09-04
@@ -10,13 +16,10 @@ import os
 import json
 import requests
 import pandas as pd
-from lxml import html
 
-# Set base URL
-bom_url = 'http://www.bom.gov.au/'
-
-# Set product URL for North Head
-product_url = 'products/IDN60901/IDN60901.95768.shtml'
+# Set json product URL for North Head
+# http://www.bom.gov.au/products/IDN60901/IDN60901.95768.shtml
+json_url = 'http://www.bom.gov.au/fwo/IDN60901/IDN60901.95768.json'
 
 # Set output directory
 output_dir = 'bom'
@@ -60,18 +63,8 @@ def update_master(output_dir, csv_name, df):
     master.to_csv(os.path.join(output_dir, csv_name))
 
 
-# Open new session with BOM website
-with requests.session() as session:
-    # Load product page
-    page = session.get(bom_url + product_url)
-    tree = html.fromstring(page.content)
-
-    # Find and download JSON data
-    json_url = tree.xpath('//*[@id="content"]/p[4]/a')[0].get('href')
-    json_data = json.loads(session.get(bom_url + json_url).content)
-
-# Extract file base name
-csv_name = json_url.split('/')[-1].replace('.json', '.csv')
+# Download JSON data
+json_data = json.loads(requests.get(json_url).content)
 
 # Create dataframe
 df = pd.DataFrame(json_data['observations']['data'])
@@ -84,4 +77,5 @@ df = df.sort_index()
 df = df[cols]
 
 # Update master table
+csv_name = json_url.split('/')[-1].replace('.json', '.csv')
 update_master(output_dir, csv_name, df)