"""Get latest observations from Port Authority of NSW and update local copy.

Station IDs are below:

    02: Offshore (Directional) Wave
    03: Bombora (Directional) Wave
    04: Captain Cook Channel (SG) Wave
    05: Kurnell (SG) Wave
    06: Molineaux Point Wind
    07: Sydney Airport (Main Runway BOM) Wind
    08: Brotherson Emergency Response Jetty Tide
    09: Caltex (Directional) Current
    12: Western Wedding Cake Wind
    13: Fort Denison (Sth end BOM) Wind
    14: Overseas Passenger Terminal Wind
    15: Glebe Island Wind
    16: Fort Denison-Primary (Nth end) Tide
    17: Fort Denison-Secondary (Vegapuls64) Tide
    18: Circular Quay ADCP Current
    19: Balls Head Current
    22: Twofold Bay - Munganno Point Wave
    23: Twofold Bay - Multipurpose Wharf Wind
    24: Breakwater Wharf Wind
    27: Middle Wall (Vegapulse WL61) Tide
    28: Goodwood (Vegapulse WL61) Tide
"""
import datetime
import os
import re

import pandas as pd

# Set station as Fort Denison tide
stn_id = 16
output_dir = 'csv'

# Main page listing the latest observation for every station
url = 'http://wavewindtide.portauthoritynsw.com.au/'

# Splits a reading such as '1.25m' into its numeric part and trailing unit
_READING_RE = re.compile(r'(\D+)$')


def update_master(output_dir, csv_name, df):
    """Update master csv time series.

    Rows of ``df`` whose timestamps are already present in the master file
    are ignored; the remaining rows are added to the end of the file. The
    output directory and the master file are created if they do not exist.

    Args:
        output_dir (str): path to time series directory
        csv_name (str): name of time series file
        df (dataframe): dataframe with datetime index

    Returns:
        None
    """
    csv_path = os.path.join(output_dir, csv_name)

    try:
        # Load local master table if it exists
        master = pd.read_csv(csv_path, index_col=0, parse_dates=True)
    except FileNotFoundError:
        # Create new master table if none exists
        master = df
    else:
        # Only include timestamps that do not already exist
        new_rows = df[~df.index.isin(master.index)]

        # DataFrame.append was removed in pandas 2.0, so use pd.concat.
        # Skip the call when nothing is new: concatenating an empty frame
        # can change column dtypes.
        if not new_rows.empty:
            master = pd.concat([master, new_rows])

    # to_csv does not create missing directories ('' means the current dir)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Export master table
    master.to_csv(csv_path)


def parse_measurements(summary):
    """Parse a station summary string into named values.

    The summary is split on ':' and read in groups of three fields: the
    first field of each group is the parameter name and the second is the
    reading (a number followed by its unit). The third field is ignored.

    Args:
        summary (str): text of the station summary element

    Returns:
        dict: maps 'parameter (unit)' column names to float values
    """
    fields = summary.split(':')
    names = fields[::3]

    # Each reading splits into [number, unit, ''] at the trailing non-digits
    readings = [_READING_RE.split(field) for field in fields[1::3]]

    return {
        f'{name} ({reading[1]})': float(reading[0])
        for name, reading in zip(names, readings)
    }


def parse_timestamp(text):
    """Parse the record date string of a station into a datetime.

    The time is the four digits following 'at ', and the date is the text
    after the first comma, in '%d %b %Y' format.

    Args:
        text (str): text of the station record date element

    Returns:
        datetime.datetime: observation time (naive, as published)

    Raises:
        ValueError: if the time or date cannot be found or parsed
    """
    clock = re.search(r'at ([0-9]{4})', text)
    parts = text.split(',')
    if clock is None or len(parts) < 2:
        raise ValueError(f'Unrecognised record date: {text!r}')

    day = parts[1].strip()
    return datetime.datetime.strptime(day + clock.group(1), '%d %b %Y%H%M')


def fetch_station(station_id, page_url=url, timeout=30):
    """Download the main page and extract the text fields of one station.

    Args:
        station_id (int): station number (see module docstring)
        page_url (str): address of the observations page
        timeout (float): seconds to wait for the server before giving up

    Returns:
        tuple: (record date text, summary text, title text)

    Raises:
        requests.RequestException: if the page cannot be downloaded
        KeyError: if the station elements are not found in the page
    """
    # Imported here so that the parsing and csv helpers can be imported
    # without the scraping dependencies being installed.
    import requests
    from lxml import html

    # Get main page
    page = requests.get(page_url, timeout=timeout)
    page.raise_for_status()
    tree = html.fromstring(page.content)

    # Get elements from selected station
    t_raw = tree.get_element_by_id(
        f'MainContent_ctl{station_id:02}_lblRecordDate')
    meas = tree.get_element_by_id(
        f'MainContent_ctl{station_id:02}_lblSummary')
    description = tree.get_element_by_id(
        f'MainContent_ctl{station_id:02}_lblTitle')

    return t_raw.text, meas.text, description.text


def main():
    """Fetch the latest observation for ``stn_id`` and update its csv."""
    record_date, summary, title = fetch_station(stn_id)

    # Create dataframe with a single row for the latest observation
    df = pd.DataFrame(parse_measurements(summary),
                      index=[parse_timestamp(record_date)])
    df.index.name = 'datetime'

    # Update master dataframe
    csv_name = title + '.csv'
    update_master(output_dir, csv_name, df)


if __name__ == '__main__':
    main()