Initial commit

master
Chris Leaman 6 years ago
commit 85fa9e55c2

.gitignore vendored (+11 lines)

@@ -0,0 +1,11 @@
# Jupyter NB Checkpoints
.ipynb_checkpoints/
# exclude data from source control by default
/data/
# Pycharm
.idea
# DotEnv configuration
.env

Makefile

@@ -0,0 +1,45 @@
#################################################################################
# PROJECT RULES #
#################################################################################
.PHONY: mat-to-csv
mat-to-csv: ##@data Converts raw .mat files to .csv for python
	cd ./src/data/ && python mat_to_csv.py

.PHONY: sites-csv-to-shp
sites-csv-to-shp: ##@data Converts sites.csv to sites.shp for QGIS
	cd ./src/data && python csv_to_shp.py
#################################################################################
# Self Documenting Commands #
#################################################################################
.DEFAULT_GOAL := help
.PHONY: help
# Refer to https://gist.github.com/prwhite/8168133
#COLORS
GREEN := $(shell tput -Txterm setaf 2)
WHITE := $(shell tput -Txterm setaf 7)
YELLOW := $(shell tput -Txterm setaf 3)
RESET := $(shell tput -Txterm sgr0)
# Add the following 'help' target to your Makefile
# And add help text after each target name starting with '\#\#'
# A category can be added with @category
HELP_FUN = \
%help; \
while(<>) { push @{$$help{$$2 // 'options'}}, [$$1, $$3] if /^([a-zA-Z\-]+)\s*:.*\#\#(?:@([a-zA-Z\-]+))?\s(.*)$$/ }; \
print "usage: make [target]\n\n"; \
for (sort keys %help) { \
print "${WHITE}$$_:${RESET}\n"; \
for (@{$$help{$$_}}) { \
$$sep = " " x (32 - length $$_->[0]); \
print " ${YELLOW}$$_->[0]${RESET}$$sep${GREEN}$$_->[1]${RESET}\n"; \
}; \
print "\n"; }
help: ##@other Show this help.
@perl -e '$(HELP_FUN)' $(MAKEFILE_LIST)
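For reference, with the targets above, `make help` should print something like the following (terminal colours omitted; the spacing comes from the 32-character padding in HELP_FUN):

    usage: make [target]

    data:
      mat-to-csv                      Converts raw .mat files to .csv for python
      sites-csv-to-shp                Converts sites.csv to sites.shp for QGIS

    other:
      help                            Show this help.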

README.md

@@ -0,0 +1,21 @@
# 2016 Narrabeen Storm EWS Performance
This repository investigates whether the storm impacts (as defined by Sallenger, 2000) of the June 2016 Narrabeen
storm could have been forecast in advance.
## Repository and analysis format
This repository follows the [Cookiecutter Data Science](https://drivendata.github.io/cookiecutter-data-science/)
structure where possible. The analysis is done in Python (see the `/src/` folder), with some interactive, exploratory notebooks located in `/notebooks`.
## Where to start?
Check out the Jupyter notebook `./notebooks/01_exploration.ipynb`, which has an example of how to import the data and some interactive widgets.
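If you prefer working from a plain Python script, the following is a minimal sketch of loading the interim data from the repository root (it assumes the interim `.csv` files have already been generated by `make mat-to-csv`; the site id `NARRA0001` is purely illustrative):

```python
import os
import pandas as pd

data_folder = './data/interim'

# Each file is indexed the same way it is written by src/data/mat_to_csv.py
df_waves = pd.read_csv(os.path.join(data_folder, 'waves.csv'), index_col=[0, 1])  # (site_id, datetime)
df_tides = pd.read_csv(os.path.join(data_folder, 'tides.csv'), index_col=[0, 1])  # (site_id, datetime)
df_profiles = pd.read_csv(os.path.join(data_folder, 'profiles.csv'), index_col=[0, 1, 2])  # (site_id, profile_type, x)
df_sites = pd.read_csv(os.path.join(data_folder, 'sites.csv'), index_col=[0])  # site_id

# Select the prestorm cross section at a single site
prestorm_profile = df_profiles.loc[('NARRA0001', 'prestorm')]
```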
## Available data
Raw, interim and processed data used in this analysis is kept in the `/data/` folder.
- `/data/raw/processed_shorelines`: This data was received from Tom Beuzen in October 2018. It consists of pre- and post-storm profiles at 100 m intervals along beaches ranging from Dee Why to Nambucca. Profiles are based on raw aerial LIDAR and were processed by Mitch Harley. Tides and waves (10 m contour and reverse-shoaled deepwater) for each individual 100 m section are also provided.
- `/data/raw/raw_lidar`: This is the raw pre- and post-storm aerial LIDAR captured for the June 2016 storm. The `.las` files are the raw files, which have been processed into `.tiff` files using `PDAL`. Note that these files have not been corrected for systematic errors, so actual elevations should be taken from the `processed_shorelines` folder. Obtained in November 2018 from Mitch Harley from the black external HDD labeled "UNSW LIDAR".
- `/data/raw/profile_features`: Dune toe and crest locations based on prestorm LIDAR. Refer to `/notebooks/qgis.qgz`, which shows how they were manually extracted. Note that the shapefiles only show the location (lat/lon) of the dune crest and toe; for actual elevations, these locations need to be related to the processed shorelines. A sketch of reading these files follows this list.
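Here is a hedged sketch of reading the dune crest shapefile into shapely geometries (it mirrors the `shapes_from_shp` helper included in this commit):

```python
import fiona
from shapely.geometry import shape

# Each feature in the shapefile becomes a shapely geometry
with fiona.open('./data/raw/profile_features/dune_crests.shp') as src:
    dune_crests = [shape(feat['geometry']) for feat in src]
```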
## Notebooks
- `/notebooks/01_exploration.ipynb`: Shows how to import processed shorelines, waves and tides. An interactive widget plots the location and cross sections.
- `/notebooks/qgis.qgz`: A QGIS file which is used to explore the aerial LIDAR data in `/data/raw/raw_lidar`. By examining the pre-storm LIDAR, dune crest and dune toe lines are manually extracted. These are stored in `/data/raw/profile_features/`.

File diff suppressed because one or more lines are too long

Binary file not shown.

@@ -0,0 +1,13 @@
"""
Loads the interim .csv data files produced by mat_to_csv.py into pandas dataframes.
"""
import os

import pandas as pd


def main():
    data_folder = './data/interim'

    # Indexes match those set in src/data/mat_to_csv.py
    df_waves = pd.read_csv(os.path.join(data_folder, 'waves.csv'), index_col=[0, 1])
    df_tides = pd.read_csv(os.path.join(data_folder, 'tides.csv'), index_col=[0, 1])
    df_profiles = pd.read_csv(os.path.join(data_folder, 'profiles.csv'), index_col=[0, 1, 2])
    df_sites = pd.read_csv(os.path.join(data_folder, 'sites.csv'), index_col=[0])


if __name__ == '__main__':
    main()

src/data/csv_to_shp.py

@@ -0,0 +1,38 @@
"""
Converts .csv files to .shape files
"""
from fiona.crs import from_epsg
import fiona
from shapely.geometry import Point, mapping
from fiona import collection
import pandas as pd
import os
def sites_csv_to_shp(input_csv='.\data\interim\sites.csv', output_shp='.\data\interim\sites.shp'):
"""
Converts our dataframe of sites to .shp to load in QGis
:param input_csv:
:param output_shp:
:return:
"""
df_sites = pd.read_csv(input_csv, index_col=[0])
schema = {
'geometry': 'Point',
'properties': {
'beach': 'str',
'site_id': 'str'
}
}
with fiona.open(output_shp, 'w', crs=from_epsg(4326), driver='ESRI Shapefile', schema=schema) as output:
for index, row in df_sites.iterrows():
point = Point(row['lon'], row['lat'])
prop = {
'beach': row['beach'],
'site_id': index,
}
output.write({'geometry': mapping(point), 'properties': prop})
if __name__ == '__main__':
sites_csv_to_shp()

src/data/mat_to_csv.py

@@ -0,0 +1,180 @@
"""
Converts raw .mat files into a flattened .csv structure which can be imported into python pandas.
"""
import logging.config
from datetime import datetime, timedelta
import pandas as pd
from mat4py import loadmat
logging.config.fileConfig('../logging.conf', disable_existing_loggers=False)
logger = logging.getLogger(__name__)
def parse_waves(waves_mat):
"""
Parses the raw waves.mat file and returns a pandas dataframe
:param waves_mat:
:return:
"""
logger.info('Parsing %s', waves_mat)
mat_data = loadmat(waves_mat)['data']
rows = []
for i in range(0, len(mat_data['site'])):
for j in range(0, len(mat_data['dates'][i])):
rows.append({
'beach': mat_data['site'][i],
'lon': mat_data['lon'][i],
'lat': mat_data['lat'][i],
'datetime': matlab_datenum_to_datetime(mat_data['dates'][i][j][0]),
'Hs': mat_data['H'][i][j][0],
'Hs0': mat_data['Ho'][i][j][0],
'Tp': mat_data['T'][i][j][0],
'dir': mat_data['D'][i][j][0],
'E': mat_data['E'][i][j][0],
'P': mat_data['P'][i][j][0],
'Exs': mat_data['Exs'][i][j][0],
'Pxs': mat_data['Pxs'][i][j][0],
})
df = pd.DataFrame(rows)
df['datetime'] = df['datetime'].dt.round('1s')
return df
def parse_tides(tides_mat):
"""
Parses the raw tides.mat file and returns a pandas dataframe
:param tides_mat:
:return:
"""
logger.info('Parsing %s', tides_mat)
mat_data = loadmat(tides_mat)['data']
rows = []
for i in range(0, len(mat_data['site'])):
for j in range(0, len(mat_data['time'])):
rows.append({
'beach': mat_data['site'][i][0],
'lon': mat_data['lons'][i][0],
'lat': mat_data['lats'][i][0],
'datetime': matlab_datenum_to_datetime(mat_data['time'][j][0]),
'tide': mat_data['tide'][i][j]
})
df = pd.DataFrame(rows)
df['datetime'] = df['datetime'].dt.round('1s')
return df
def parse_profiles(profiles_mat):
"""
Parses the raw profiles.mat file and returns a pandas dataframe
    :param profiles_mat:
:return:
"""
logger.info('Parsing %s', profiles_mat)
mat_data = loadmat(profiles_mat)['data']
rows = []
for i in range(0, len(mat_data['site'])):
for j in range(0, len(mat_data['pfx'][i])):
for profile_type in ['prestorm', 'poststorm']:
if profile_type == 'prestorm':
z = mat_data['pf1'][i][j][0]
if profile_type == 'poststorm':
z = mat_data['pf2'][i][j][0]
rows.append({
'beach': mat_data['site'][i],
'lon': mat_data['lon'][i],
'lat': mat_data['lat'][i],
'profile_type': profile_type,
'x': mat_data['pfx'][i][j][0],
'z': z,
})
df = pd.DataFrame(rows)
return df
def matlab_datenum_to_datetime(matlab_datenum):
# https://stackoverflow.com/a/13965852
return datetime.fromordinal(int(matlab_datenum)) + timedelta(days=matlab_datenum % 1) - timedelta(
days=366)
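# Worked example (value derived from the definition above):
#   matlab_datenum_to_datetime(736482.5) -> datetime(2016, 6, 1, 12, 0)
# MATLAB datenums count days from 00-Jan-0000 while python ordinals count from
# 01-Jan-0001, hence the 366 day correction.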
def get_unique_sites(dfs, cols=['beach', 'lat', 'lon']):
"""
Generates a dataframe of unique sites based on beach names, lats and lons. Creates a unique site ID for each.
:param dfs:
:param cols:
:return:
"""
rows = []
df_all = pd.concat([df[cols] for df in dfs])
beach_groups = df_all.groupby(['beach'])
for beach_name, beach_group in beach_groups:
site_groups = beach_group.groupby(['lat', 'lon'])
        site_no = 1
        for site_name, site_group in site_groups:
            site = '{}{:04d}'.format(beach_name, site_no)
            rows.append({'site_id': site,
                         'lat': site_name[0],
                         'lon': site_name[1],
                         'beach': beach_name})
            site_no += 1
df = pd.DataFrame(rows)
return df
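# For example (beach name purely illustrative): a beach 'NARRA' with three
# unique lat/lon pairs yields site_ids 'NARRA0001', 'NARRA0002' and 'NARRA0003'.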
def replace_unique_sites(df, df_sites, cols=['beach', 'lat', 'lon']):
"""
Replaces beach/lat/lon columns with the unique site_id
    :param df:
    :param df_sites:
    :param cols:
:return:
"""
df_merged = df.merge(df_sites, on=cols)
# Check that all our records have a unique site identifier
n_unmatched = len(df) - len(df_merged)
if n_unmatched > 0:
        logger.warning('%d of %d records could not be matched with a unique site', n_unmatched, len(df))
df_merged = df_merged.drop(columns=cols)
return df_merged
def main():
    df_waves = parse_waves(waves_mat='../../data/raw/waves.mat')
    df_tides = parse_tides(tides_mat='../../data/raw/tides.mat')
    df_profiles = parse_profiles(profiles_mat='../../data/raw/profiles.mat')

    logger.info('Identifying unique sites')
    df_sites = get_unique_sites(dfs=[df_waves, df_tides, df_profiles])
    df_waves = replace_unique_sites(df_waves, df_sites)
    df_tides = replace_unique_sites(df_tides, df_sites)
    df_profiles = replace_unique_sites(df_profiles, df_sites)
logger.info('Setting pandas index')
df_profiles.set_index(['site_id', 'profile_type', 'x'], inplace=True)
df_waves.set_index(['site_id', 'datetime'], inplace=True)
df_tides.set_index(['site_id', 'datetime'], inplace=True)
df_sites.set_index(['site_id'], inplace=True)
logger.info('Outputting .csv files')
df_profiles.to_csv('../../data/interim/profiles.csv')
df_tides.to_csv('../../data/interim/tides.csv')
df_waves.to_csv('../../data/interim/waves.csv')
df_sites.to_csv('../../data/interim/sites.csv')
logger.info('Done!')
if __name__ == '__main__':
main()

@@ -0,0 +1,76 @@
import pandas as pd
import os
import fiona
from shapely.geometry import LineString, Point
from shapely.geometry import shape
from shapely.ops import transform
import pyproj
from functools import partial
import numpy as np
def shapes_from_shp(shp_file):
"""
Parses a shape file and returns a list of shapely shapes
:param shp_file:
:return:
"""
shapes = []
for feat in fiona.open(shp_file):
shapes.append(shape(feat['geometry']))
return shapes
def convert_coord_systems(g1, in_coord_system='EPSG:4326', out_coord_system='EPSG:28356'):
    """
    Converts coordinates from one coordinate system to another. Needed because shapefiles are usually defined in
    lat/lon but should be converted to a projected coordinate system to calculate distances in metres.
    https://gis.stackexchange.com/a/127432
    :param g1: shapely geometry to reproject
    :param in_coord_system: Default is lat/lon WGS84
    :param out_coord_system: Default is GDA94 / MGA Zone 56 for the NSW coastline
    :return:
    """
project = partial(
pyproj.transform,
pyproj.Proj(init=in_coord_system), # source coordinate system
pyproj.Proj(init=out_coord_system)) # destination coordinate system
g2 = transform(project, g1) # apply projection
return g2
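# Usage sketch (illustrative): reproject a WGS84 lon/lat point so that shapely
# .distance() calls return metres rather than degrees:
#   point_mga = convert_coord_systems(Point(lon, lat))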
def distance_to_intersection(lat, lon, orientation, line_strings):
    """
    Returns the distance at which a line drawn from a lat/lon at an orientation intersects a line string
    :param lat:
    :param lon:
    :param orientation: Angle of the cross-shore profile line facing towards the land, in degrees, measured
    clockwise positive from true north.
    :param line_strings:
    :return:
    """
    start_point = Point(lon, lat)
    start_point = convert_coord_systems(start_point)

    distance = 1000  # m, look up to 1000 m landward for an intersection

    # The orientation is a compass bearing (clockwise from true north), so the
    # x component uses sin and the y component uses cos.
    new_point = Point(start_point.x + distance * np.sin(np.deg2rad(orientation)),
                      start_point.y + distance * np.cos(np.deg2rad(orientation)))
    profile_line = LineString([start_point, new_point])
# Check whether profile_line intersects with any lines in line_string
for line_string in line_strings:
intersection_points = profile_line.intersection(line_string)
if not intersection_points.is_empty:
return intersection_points.distance(start_point)
return None
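# Usage sketch (values illustrative): for an east-facing beach the landward
# direction is roughly west (bearing 270 degrees), so
#   distance_to_intersection(-33.71, 151.30, 270, dune_crest_line_strings)
# returns the distance in metres to the first crossed line, or None if nothing
# intersects within 1000 m.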
def get_sites_dune_crest_toe():
data_folder = './data/interim'
    df_sites = pd.read_csv(os.path.join(data_folder, 'sites.csv'), index_col=[0])

    # Import the dune crest shapefile and reproject it so distances are in metres
    for f in ['./data/raw/profile_features/dune_crests.shp']:
shapes = shapes_from_shp(f)
shapes = [convert_coord_systems(x) for x in shapes]
# Iterate through each site

src/logging.conf

@@ -0,0 +1,27 @@
[loggers]
keys=root, matplotlib
[handlers]
keys=consoleHandler
[formatters]
keys=simpleFormatter
[logger_root]
level=DEBUG
handlers=consoleHandler
[logger_matplotlib]
level=WARNING
handlers=consoleHandler
qualname=matplotlib
[handler_consoleHandler]
class=StreamHandler
level=DEBUG
formatter=simpleFormatter
args=(sys.stdout,)
[formatter_simpleFormatter]
format=%(asctime)s %(name)-17s %(levelname)-8s %(message)s
datefmt=%a, %d %b %Y %H:%M:%S
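For reference, simpleFormatter renders a record from mat_to_csv.py (run as a script, so the logger name is `__main__`) along these lines; the timestamp is illustrative:

    Mon, 05 Nov 2018 09:30:00 __main__          INFO     Parsing ../../data/raw/waves.mat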