Simplify HTTP request

master
Dan Howe 5 years ago
parent a5c035081b
commit 2e976bd120

@ -1,7 +1,13 @@
"""Download latest time series data from BOM website and update local copy. """Download latest time series data from BOM website and update local copy.
BOM does not allow direct HTTP requests to data files, so a requests session BOM does not allow direct HTTP requests to data files unless it believes the
is opened on the product page before accessing the resource. request comes from a browser. For example:
>>> pd.read_csv(url) # This does not work
HTTPError: HTTP Error 403: Forbidden
>>> requests.get(url) # This works!
<Response [200]>
D. Howe D. Howe
2019-09-04 2019-09-04
@ -10,13 +16,10 @@ import os
import json import json
import requests import requests
import pandas as pd import pandas as pd
from lxml import html
# Set base URL # Set json product URL for North Head
bom_url = 'http://www.bom.gov.au/' # http://www.bom.gov.au/products/IDN60901/IDN60901.95768.shtml
json_url = 'http://www.bom.gov.au/fwo/IDN60901/IDN60901.95768.json'
# Set product URL for North Head
product_url = 'products/IDN60901/IDN60901.95768.shtml'
# Set output directory # Set output directory
output_dir = 'bom' output_dir = 'bom'
@ -60,18 +63,8 @@ def update_master(output_dir, csv_name, df):
master.to_csv(os.path.join(output_dir, csv_name)) master.to_csv(os.path.join(output_dir, csv_name))
# Open new session with BOM website # Download JSON data
with requests.session() as session: json_data = json.loads(requests.get(json_url).content)
# Load product page
page = session.get(bom_url + product_url)
tree = html.fromstring(page.content)
# Find and download JSON data
json_url = tree.xpath('//*[@id="content"]/p[4]/a')[0].get('href')
json_data = json.loads(session.get(bom_url + json_url).content)
# Extract file base name
csv_name = json_url.split('/')[-1].replace('.json', '.csv')
# Create dataframe # Create dataframe
df = pd.DataFrame(json_data['observations']['data']) df = pd.DataFrame(json_data['observations']['data'])
@ -84,4 +77,5 @@ df = df.sort_index()
df = df[cols] df = df[cols]
# Update master table # Update master table
csv_name = json_url.split('/')[-1].replace('.json', '.csv')
update_master(output_dir, csv_name, df) update_master(output_dir, csv_name, df)

Loading…
Cancel
Save