From 2e976bd1203494600020d3318513e79762e8a7a4 Mon Sep 17 00:00:00 2001
From: Dan Howe <d.howe@wrl.unsw.edu.au>
Date: Wed, 4 Sep 2019 15:13:29 +1000
Subject: [PATCH] Simplify http request

---
 bom-wind/bom_wind.py | 34 ++++++++++++++--------------------
 1 file changed, 14 insertions(+), 20 deletions(-)
diff --git a/bom-wind/bom_wind.py b/bom-wind/bom_wind.py
index d88f7e8..200877d 100644
--- a/bom-wind/bom_wind.py
+++ b/bom-wind/bom_wind.py
@@ -1,7 +1,13 @@
 """Download latest time series data from BOM website and update local copy.
 
-BOM does not allow direct http requests to data files, so a requests session
-is opened on the product page before accessing the resource.
+BOM does not allow direct HTTP requests to data files unless it believes the
+request comes from a browser. For example:
+
+>>> pd.read_csv(url)  # This does not work
+HTTPError: HTTP Error 403: Forbidden
+
+>>> requests.get(url)  # This works!
+<Response [200]>
 
 D. Howe
 2019-09-04
@@ -10,13 +16,10 @@ import os
 import json
 import requests
 import pandas as pd
-from lxml import html
 
-# Set base URL
-bom_url = 'http://www.bom.gov.au/'
-
-# Set product URL for North Head
-product_url = 'products/IDN60901/IDN60901.95768.shtml'
+# Set JSON data URL for North Head. The corresponding product page is:
+# http://www.bom.gov.au/products/IDN60901/IDN60901.95768.shtml
+json_url = 'http://www.bom.gov.au/fwo/IDN60901/IDN60901.95768.json'
 
 # Set output directory
 output_dir = 'bom'
@@ -60,18 +63,8 @@ def update_master(output_dir, csv_name, df):
     master.to_csv(os.path.join(output_dir, csv_name))
 
 
-# Open new session with BOM website
-with requests.session() as session:
-    # Load product page
-    page = session.get(bom_url + product_url)
-    tree = html.fromstring(page.content)
-
-    # Find and download JSON data
-    json_url = tree.xpath('//*[@id="content"]/p[4]/a')[0].get('href')
-    json_data = json.loads(session.get(bom_url + json_url).content)
-
-# Extract file base name
-csv_name = json_url.split('/')[-1].replace('.json', '.csv')
+# Download JSON data (time out instead of hanging if BOM is unresponsive)
+json_data = json.loads(requests.get(json_url, timeout=30).content)
 
 # Create dataframe
 df = pd.DataFrame(json_data['observations']['data'])
@@ -84,4 +77,5 @@ df = df.sort_index()
 df = df[cols]
 
 # Update master table
+csv_name = os.path.splitext(os.path.basename(json_url))[0] + '.csv'
 update_master(output_dir, csv_name, df)