"""Download latest time series data from BOM website and update local copy.
BOM does not allow direct http requests to data files, so a requests session
is opened on the product page before accessing the resource.
D. Howe
2019-09-04
"""
import os
import json
import requests
import pandas as pd
from lxml import html

# Set base URL
bom_url = 'http://www.bom.gov.au/'

# Set product URL for North Head
product_url = 'products/IDN60901/IDN60901.95768.shtml'

# Set output directory
output_dir = 'bom'

# Set output column names
cols = [
    'wind_dir',
    'wind_spd_kmh',
    'gust_kmh',
]


def update_master(output_dir, csv_name, df):
    """Update master csv time series.

    Args:
        output_dir (str): path to time series directory
        csv_name (str): name of time series file
        df (dataframe): dataframe with datetime index

    Returns:
        None
    """
    try:
        # Load local master table if it exists
        master = pd.read_csv(os.path.join(output_dir, csv_name),
                             index_col=0,
                             parse_dates=True)
        # Only include timestamps that do not already exist
        df = df[~df.index.isin(master.index)]
        # Update master (pd.concat replaces DataFrame.append,
        # which was removed in pandas 2.0)
        master = pd.concat([master, df])
    except FileNotFoundError:
        # Create new master table if none exists
        master = df

    # Export master table (ensure the output directory exists first)
    os.makedirs(output_dir, exist_ok=True)
    master.to_csv(os.path.join(output_dir, csv_name))
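
# Example call (file name shown is hypothetical, for illustration only):
# merge a dataframe of freshly downloaded observations into the master copy:
#     update_master('bom', 'IDN60901.95768.csv', df)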

# Open new session with BOM website
with requests.Session() as session:
    # Load product page
    page = session.get(bom_url + product_url)
    tree = html.fromstring(page.content)

    # Find and download JSON data
    json_url = tree.xpath('//*[@id="content"]/p[4]/a')[0].get('href')
    json_data = json.loads(session.get(bom_url + json_url).content)
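
# The downloaded payload is expected to look roughly like the following
# (structure inferred from the keys accessed below; no other fields are
# assumed):
#     {"observations": {"data": [{"local_date_time_full": ...,
#                                 "wind_dir": ..., "wind_spd_kmh": ...,
#                                 "gust_kmh": ...}, ...]}}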

# Extract file base name
csv_name = json_url.split('/')[-1].replace('.json', '.csv')

# Create dataframe
df = pd.DataFrame(json_data['observations']['data'])

# Set local time as index ('local_date_time_full' is a timestamp string,
# e.g. '20190904160000')
df.index = pd.to_datetime(df['local_date_time_full'])
df = df.sort_index()

# Extract columns of interest
df = df[cols]

# Update master table
update_master(output_dir, csv_name, df)
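
# Note: the BOM observations feed only holds recent data (roughly the last few
# days), which is why update_master() appends only unseen timestamps to a
# persistent master table. Running this script on a schedule (e.g. a daily
# cron job) keeps the local copy continuous; scheduling is a suggestion, not
# part of the original script.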