You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

87 lines
2.1 KiB
Python

"""Download the latest time series data from the BOM website and update the local copy.

BOM does not allow direct HTTP requests to its data files, so a requests
session is opened on the product page before the data resource is accessed.
D. Howe
2019-09-04
"""
import os
import json
import requests
import pandas as pd
from lxml import html
# Root of the BOM website; the product and data paths below are appended to it
bom_url = 'http://www.bom.gov.au/'

# Product page for North Head, relative to bom_url
product_url = 'products/IDN60901/IDN60901.95768.shtml'

# Directory that receives the time series csv
output_dir = 'bom'

# Observation fields kept as columns of the output table
cols = ['wind_dir', 'wind_spd_kmh', 'gust_kmh']
def update_master(output_dir, csv_name, df):
    """Update master csv time series.

    Rows of ``df`` whose timestamps are not yet present in the master table
    are appended to it. If no master file exists yet, ``df`` becomes the new
    master. The result is written to ``output_dir/csv_name``.

    Args:
        output_dir (str): path to time series directory (created if missing)
        csv_name (str): name of time series file
        df (dataframe): dataframe with datetime index

    Returns:
        None
    """
    csv_path = os.path.join(output_dir, csv_name)
    try:
        # Load local master table if it exists
        master = pd.read_csv(csv_path, index_col=0, parse_dates=True)
    except FileNotFoundError:
        # Create new master table if none exists
        master = df
    else:
        # Only include timestamps that do not already exist
        new_rows = df[~df.index.isin(master.index)]
        if not new_rows.empty:
            # DataFrame.append was removed in pandas 2.0; pd.concat is the
            # supported equivalent and also works on older versions
            master = pd.concat([master, new_rows])
    # to_csv does not create missing directories, so make sure the target
    # exists first (an empty output_dir means the current directory)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    # Export master table
    master.to_csv(csv_path)
# Seconds to wait on each HTTP request before giving up; without a timeout
# requests can block indefinitely on an unresponsive server
request_timeout = 30

# Open new session with BOM website. The product page is loaded first and the
# data file is fetched through the same session, because BOM does not allow
# direct requests to the data files.
with requests.session() as session:
    # Load product page
    page = session.get(bom_url + product_url, timeout=request_timeout)
    # Stop on an HTTP error instead of parsing an error page
    page.raise_for_status()
    tree = html.fromstring(page.content)
    # Find the link to the JSON data on the product page
    json_links = tree.xpath('//*[@id="content"]/p[4]/a')
    if not json_links:
        # Same exception type as indexing the empty result, but with context
        raise IndexError('JSON data link not found on ' + bom_url + product_url)
    json_url = json_links[0].get('href')
    # Download JSON data
    response = session.get(bom_url + json_url, timeout=request_timeout)
    response.raise_for_status()
    json_data = json.loads(response.content)

# Extract file base name
csv_name = json_url.split('/')[-1].replace('.json', '.csv')
# Create dataframe
df = pd.DataFrame(json_data['observations']['data'])
# Set local time as index
df.index = pd.to_datetime(df['local_date_time_full'])
# Extract columns of interest
df = df[cols]
# Update master table
update_master(output_dir, csv_name, df)