|
|
@ -1,7 +1,13 @@
|
|
|
|
"""Download latest time series data from BOM website and update local copy.
|
|
|
|
"""Download latest time series data from BOM website and update local copy.
|
|
|
|
|
|
|
|
|
|
|
|
BOM does not allow direct http requests to data files, so a requests session
|
|
|
|
BOM does not allow direct http requests to data files unless it believes the
|
|
|
|
is opened on the product page before accessing the resource.
|
|
|
|
request comes from a browser. For example:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
>>> pd.read_csv(url) # This does not work
|
|
|
|
|
|
|
|
HTTPError: HTTP Error 403: Forbidden
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
>>> requests.get(url) # This works!
|
|
|
|
|
|
|
|
<Response [200]>
|
|
|
|
|
|
|
|
|
|
|
|
D. Howe
|
|
|
|
D. Howe
|
|
|
|
2019-09-04
|
|
|
|
2019-09-04
|
|
|
@ -10,13 +16,10 @@ import os
|
|
|
|
import json
|
|
|
|
import json
|
|
|
|
import requests
|
|
|
|
import requests
|
|
|
|
import pandas as pd
|
|
|
|
import pandas as pd
|
|
|
|
from lxml import html
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Set base URL
|
|
|
|
# Set json product URL for North Head
|
|
|
|
bom_url = 'http://www.bom.gov.au/'
|
|
|
|
# http://www.bom.gov.au/products/IDN60901/IDN60901.95768.shtml
|
|
|
|
|
|
|
|
json_url = 'http://www.bom.gov.au/fwo/IDN60901/IDN60901.95768.json'
|
|
|
|
# Set product URL for North Head
|
|
|
|
|
|
|
|
product_url = 'products/IDN60901/IDN60901.95768.shtml'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Set output directory
|
|
|
|
# Set output directory
|
|
|
|
output_dir = 'bom'
|
|
|
|
output_dir = 'bom'
|
|
|
@ -60,18 +63,8 @@ def update_master(output_dir, csv_name, df):
|
|
|
|
master.to_csv(os.path.join(output_dir, csv_name))
|
|
|
|
master.to_csv(os.path.join(output_dir, csv_name))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Open new session with BOM website
|
|
|
|
# Download JSON data
|
|
|
|
with requests.session() as session:
|
|
|
|
json_data = json.loads(requests.get(json_url).content)
|
|
|
|
# Load product page
|
|
|
|
|
|
|
|
page = session.get(bom_url + product_url)
|
|
|
|
|
|
|
|
tree = html.fromstring(page.content)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Find and download JSON data
|
|
|
|
|
|
|
|
json_url = tree.xpath('//*[@id="content"]/p[4]/a')[0].get('href')
|
|
|
|
|
|
|
|
json_data = json.loads(session.get(bom_url + json_url).content)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Extract file base name
|
|
|
|
|
|
|
|
csv_name = json_url.split('/')[-1].replace('.json', '.csv')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Create dataframe
|
|
|
|
# Create dataframe
|
|
|
|
df = pd.DataFrame(json_data['observations']['data'])
|
|
|
|
df = pd.DataFrame(json_data['observations']['data'])
|
|
|
@ -84,4 +77,5 @@ df = df.sort_index()
|
|
|
|
df = df[cols]
|
|
|
|
df = df[cols]
|
|
|
|
|
|
|
|
|
|
|
|
# Update master table
|
|
|
|
# Update master table
|
|
|
|
|
|
|
|
csv_name = json_url.split('/')[-1].replace('.json', '.csv')
|
|
|
|
update_master(output_dir, csv_name, df)
|
|
|
|
update_master(output_dir, csv_name, df)
|
|
|
|