From 851393ecd9ef855ddce6c5fdfffb298ffd0cd835 Mon Sep 17 00:00:00 2001
From: Chris Leaman
Date: Tue, 27 Nov 2018 14:53:32 +1100
Subject: [PATCH] Parse new profiles.mat from Mitch

Mitch has updated profiles.mat to include more information, such as the time
of LIDAR acquisition, profile flags and recovery profiles. This commit updates
the parsing code to read the new structure.
---
 .env                                   |   4 +-
 src/analysis/forecast_twl.py           |   4 +-
 src/analysis/observed_storm_impacts.py |  44 +++--
 src/data/__init__.py                   |   0
 src/data/csv_to_shp.py                 |  15 +-
 src/data/parse_mat.py                  | 259 ++++++++++++++-----------
 src/data/profile_features.py           |  34 ++--
 src/logging.yaml                       |  50 +++++
 src/utils.py                           |  16 ++
 9 files changed, 271 insertions(+), 155 deletions(-)
 create mode 100644 src/data/__init__.py
 create mode 100644 src/logging.yaml
 create mode 100644 src/utils.py

diff --git a/.env b/.env
index de5e5c4..8a4468b 100644
--- a/.env
+++ b/.env
@@ -10,11 +10,9 @@ MATLAB_PATH="C:/Program Files/MATLAB/R2016b/bin/win64/MATLAB.exe"
 # total water level.
 MULTIPROCESS_THREADS=2
 
-# GDAL_DATA="C://Users//z5189959//Desktop//nsw-2016-storm-impact//.venv//Library//share//gdal"
-
 # The settings below should be left as is unless you know what you're doing.
 # Need to set pythonpath so that relative imports can be properly used in with pipenv
 # Refer to https://stackoverflow.com/q/52986500 and https://stackoverflow.com/a/49797761
-PYTHONPATH=${PWD}
\ No newline at end of file
+# PYTHONPATH=${PWD}
\ No newline at end of file
diff --git a/src/analysis/forecast_twl.py b/src/analysis/forecast_twl.py
index e15df6e..ddc93a6 100644
--- a/src/analysis/forecast_twl.py
+++ b/src/analysis/forecast_twl.py
@@ -7,8 +7,7 @@ import numpy.ma as ma
 import pandas as pd
 from scipy import stats
-
-from src.analysis import runup_models
+import runup_models
 
 logging.config.fileConfig("./src/logging.conf", disable_existing_loggers=False)
 logger = logging.getLogger(__name__)
@@ -266,6 +265,7 @@ def crossings(profile_x, profile_z, constant_z):
     z = np.subtract(profile_z, constant_z)
 
     # Find all indices right before any crossing.
+    # TODO Sometimes this can give a runtime warning https://stackoverflow.com/a/36489085
     indicies = np.where(z[:-1] * z[1:] < 0)[0]
 
     # Use linear interpolation to find intersample crossings.
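
Note on the TODO added above: if the warning is numpy's usual "invalid value encountered" RuntimeWarning from comparing NaNs, one possible way to quieten it locally (illustrative only, not something this patch does) is:

import numpy as np

profile_z = np.array([2.0, np.nan, 1.0, -0.5, -1.0])
z = np.subtract(profile_z, 0.0)

# NaN comparisons never evaluate True, so suppressing the warning does not change which
# crossings are found; the crossing between samples 2 and 3 is still returned.
with np.errstate(invalid="ignore"):
    indices = np.where(z[:-1] * z[1:] < 0)[0]  # array([2])
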
diff --git a/src/analysis/observed_storm_impacts.py b/src/analysis/observed_storm_impacts.py
index 0aadbbe..80bdbda 100644
--- a/src/analysis/observed_storm_impacts.py
+++ b/src/analysis/observed_storm_impacts.py
@@ -48,6 +48,11 @@ def volume_change(df_profiles, df_profile_features, zone):
         if np.isnan(prestorm_dune_toe_x):
             prestorm_dune_toe_x = prestorm_dune_crest_x
 
+        # If either the prestorm or poststorm profile is missing, skip this site and continue
+        profile_lengths = [len(df_site.query("profile_type == '{}'".format(x))) for x in ['prestorm', 'poststorm']]
+        if any([length == 0 for length in profile_lengths]):
+            continue
+
         # Find last x coordinate where we have both prestorm and poststorm measurements. If we don't do this,
         # the prestorm and poststorm values are going to be calculated over different lengths.
         df_zone = df_site.dropna(subset=["z"])
@@ -128,25 +133,26 @@
     return df_observed_impacts
 
 
-if __name__ == "__main__":
-    logger.info("Importing existing data")
-    data_folder = "./data/interim"
-    df_profiles = pd.read_csv(os.path.join(data_folder, "profiles.csv"), index_col=[0, 1, 2])
-    df_profile_features = pd.read_csv(os.path.join(data_folder, "profile_features.csv"), index_col=[0])
-
-    logger.info("Creating new dataframe for observed impacts")
-    df_observed_impacts = pd.DataFrame(index=df_profile_features.index)
-
-    logger.info("Getting pre/post storm volumes")
-    df_swash_vol_changes = volume_change(df_profiles, df_profile_features, zone="swash")
-    df_dune_face_vol_changes = volume_change(df_profiles, df_profile_features, zone="dune_face")
-    df_observed_impacts = df_observed_impacts.join([df_swash_vol_changes, df_dune_face_vol_changes])
-
-    # Classify regime based on volume changes
-    df_observed_impacts = storm_regime(df_observed_impacts)
-
-    # Save dataframe to csv
-    df_observed_impacts.to_csv(os.path.join(data_folder, "impacts_observed.csv"))
+#
+# if __name__ == "__main__":
+#     logger.info("Importing existing data")
+#     data_folder = "./data/interim"
+#     df_profiles = pd.read_csv(os.path.join(data_folder, "profiles.csv"), index_col=[0, 1, 2])
+#     df_profile_features = pd.read_csv(os.path.join(data_folder, "profile_features.csv"), index_col=[0])
+#
+#     logger.info("Creating new dataframe for observed impacts")
+#     df_observed_impacts = pd.DataFrame(index=df_profile_features.index)
+#
+#     logger.info("Getting pre/post storm volumes")
+#     df_swash_vol_changes = volume_change(df_profiles, df_profile_features, zone="swash")
+#     df_dune_face_vol_changes = volume_change(df_profiles, df_profile_features, zone="dune_face")
+#     df_observed_impacts = df_observed_impacts.join([df_swash_vol_changes, df_dune_face_vol_changes])
+#
+#     # Classify regime based on volume changes
+#     df_observed_impacts = storm_regime(df_observed_impacts)
+#
+#     # Save dataframe to csv
+#     df_observed_impacts.to_csv(os.path.join(data_folder, "impacts_observed.csv"))
 
 
 @click.command()
diff --git a/src/data/__init__.py b/src/data/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/data/csv_to_shp.py b/src/data/csv_to_shp.py
index 39525fa..a5438c2 100644
--- a/src/data/csv_to_shp.py
+++ b/src/data/csv_to_shp.py
@@ -2,15 +2,20 @@
 Converts .csv files to .shape files
 """
+import os
+import sys
+
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
 import click
 import fiona
 import pandas as pd
 from fiona.crs import from_epsg
 from shapely.geometry import Point, mapping
-import logging.config
 
-logging.config.fileConfig("./src/logging.conf", disable_existing_loggers=False)
-logger = logging.getLogger(__name__)
+from utils import setup_logging
+
+logger = setup_logging()
 
 
 @click.command()
@@ -25,11 +30,11 @@ def sites_csv_to_shp(input_csv, output_shp):
     """
     logger.info("Converting %s to %s", input_csv, output_shp)
     df_sites = pd.read_csv(input_csv, index_col=[0])
-
+    logger.info(os.environ.get("GDAL_DATA", None))
     schema = {"geometry": "Point", "properties": {"beach": "str", "site_id": "str"}}
     with fiona.open(output_shp, "w", crs=from_epsg(4326), driver="ESRI Shapefile", schema=schema) as output:
         for index, row in df_sites.iterrows():
-            point = Point(row["lon"], row["lat"])
+            point = Point(row["x_200_lon"], row["x_200_lat"])
             prop = {"beach": row["beach"], "site_id": index}
             output.write({"geometry": mapping(point), "properties": prop})
     logger.info("Done!")
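
The scripts under src/data now share the same bootstrap pattern; a minimal sketch of it, combining the imports above with the setup_logging() helper added in src/utils.py at the end of this patch (illustrative, not part of the diff):

import os
import sys

# Make src/ importable when the script is run directly from src/data/.
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from utils import setup_logging

# setup_logging() loads ./src/logging.yaml if present, otherwise falls back to logging.basicConfig().
logger = setup_logging()
logger.info("Logging configured")
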
diff --git a/src/data/parse_mat.py b/src/data/parse_mat.py
index d6f7cb2..de17dfc 100644
--- a/src/data/parse_mat.py
+++ b/src/data/parse_mat.py
@@ -2,15 +2,25 @@
 Converts raw .mat files into a flattened .csv structure which can be imported into python pandas.
 """
-import logging.config
+import os
+import sys
+
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
 from datetime import datetime, timedelta
+import math
+
+
 import click
+import numpy as np
 import pandas as pd
 from mat4py import loadmat
-import numpy as np
+from shapely.geometry import Point
+
+from profile_features import convert_coord_systems
+from utils import setup_logging
 
-logging.config.fileConfig("./src/logging.conf", disable_existing_loggers=False)
-logger = logging.getLogger(__name__)
+logger = setup_logging()
 
 
 def parse_orientations(orientations_mat):
@@ -134,7 +144,7 @@ def parse_tides(tides_mat):
     return df
 
 
-def parse_profiles(profiles_mat):
+def parse_profiles_and_sites(profiles_mat):
     """
     Parses the raw profiles.mat file and returns a pandas dataframe
     :param tides_mat:
@@ -142,39 +152,108 @@
     """
     logger.info("Parsing %s", profiles_mat)
     mat_data = loadmat(profiles_mat)["data"]
-    rows = []
-    for i in range(0, len(mat_data["site"])):
-        for j in range(0, len(mat_data["pfx"][i])):
-            for profile_type in ["prestorm", "poststorm"]:
-
-                if profile_type == "prestorm":
-                    z = mat_data["pf1"][i][j][0]
-                if profile_type == "poststorm":
-                    z = mat_data["pf2"][i][j][0]
-
-                rows.append(
-                    {
-                        "beach": mat_data["site"][i],
-                        "lon": mat_data["lon"][i],
-                        "lat": mat_data["lat"][i],
-                        "profile_type": profile_type,
-                        "x": mat_data["pfx"][i][j][0],
-                        "z": z,
-                    }
-                )
+    profile_rows = []
+    site_rows = []
+    site_counter = 0
+
+    for i, site in enumerate(mat_data["site"]):
+
+        # Give each site a unique id
+        if len(site_rows) == 0 or site_rows[-1]["beach"] != site:
+            site_counter = 1
+        else:
+            site_counter += 1
+        site_id = "{}{:04d}".format(site, site_counter)
+
+        # Initialize the latitude and longitude of the x=200 m location
+        x_200_lat = np.nan
+        x_200_lon = np.nan
+
+        # Want to calculate the orientation
+        orientation = {}
+
+        for x, lat, lon, z_prestorm, z_poststorm, easting, northing in zip(
+            mat_data["x"][i],
+            mat_data["lats"][i],
+            mat_data["lons"][i],
+            mat_data["Zpre"][i],
+            mat_data["Zpost"][i],
+            mat_data["eastings"][i],
+            mat_data["northings"][i],
+        ):
+
+            # Only extract pre and post storm profiles
+            for j, profile_type in enumerate(["prestorm", "poststorm"]):
+
+                if mat_data["isgood"][i][j] == 1:
+                    land_lim = mat_data["landlims"][i][j]
+                    survey_datetime = matlab_datenum_to_datetime(mat_data["surveydates"][i][j])
+
+                    if profile_type == "prestorm":
+                        z = z_prestorm
+                    else:
+                        z = z_poststorm
+
+                    # Keep a record of where the center of the profile is located, and the locations of the land
+                    # and sea ends
+
+                    # TODO: This code isn't very transferable. What if we don't have lat/lons at 200 m? Relook at this.
+                    if x[0] == 200:
+                        x_200_lat = lat[0]
+                        x_200_lon = lon[0]
+                    elif x[0] == 0:
+                        orientation["land_easting"] = easting[0]
+                        orientation["land_northing"] = northing[0]
+                    elif x[0] == 400:
+                        orientation["sea_easting"] = easting[0]
+                        orientation["sea_northing"] = northing[0]
+
+                    profile_rows.append(
+                        {
+                            "site_id": site_id,
+                            "lon": lon[0],
+                            "lat": lat[0],
+                            "profile_type": profile_type,
+                            "x": x[0],
+                            "z": z[0],
+                            "land_lim": land_lim,
+                            "survey_datetime": survey_datetime,
+                        }
+                    )
+
+        orientation = math.degrees(
+            math.atan2(
+                orientation["land_northing"] - orientation["sea_northing"],
+                orientation["land_easting"] - orientation["sea_easting"],
+            )
+        )
+        site_rows.append(
+            {
+                "site_id": site_id,
+                "beach": site,
+                "lat": x_200_lat,
+                "lon": x_200_lon,
+                "orientation": orientation,
+                "profile_x_lat_lon": 200,
+            }
+        )
 
-    df = pd.DataFrame(rows)
-    return df
+    df_profiles = pd.DataFrame(profile_rows)
+    df_sites = pd.DataFrame(site_rows)
+
+    logger.info("Parsed profiles and sites")
+    return df_profiles, df_sites
 
 
 def remove_zeros(df_profiles):
     """
     When parsing the pre/post storm profiles, the end of some profiles have constant values of zero. Let's change
     these to NaNs for consistancy. Didn't use pandas fillnan because 0 may still be a valid value.
-    :param df:
+    :param df_profiles:
     :return:
     """
+    logger.info("Removing zeros from end of profiles")
     df_profiles = df_profiles.sort_index()
     groups = df_profiles.groupby(level=["site_id", "profile_type"])
     for key, _ in groups:
@@ -185,6 +264,7 @@
         df_profile = df_profiles[idx_site]
         x_last_ele = df_profile[df_profile.z != 0].index.get_level_values("x")[-1]
         df_profiles.loc[idx_site & (df_profiles.index.get_level_values("x") > x_last_ele), "z"] = np.nan
+    logger.info("Removed zeros from end of profiles")
 
     return df_profiles
 
@@ -198,31 +278,7 @@ def matlab_datenum_to_datetime(matlab_datenum):
     return datetime.fromordinal(int(matlab_datenum)) + timedelta(days=matlab_datenum % 1) - timedelta(days=366)
 
 
-def get_unique_sites(dfs, cols=["beach", "lat", "lon"]):
-    """
-    Generates a dataframe of unique sites based on beach names, lats and lons. Creates a unique site ID for each.
-    :param dfs:
-    :param cols:
-    :return:
-    """
-
-    rows = []
-    df_all = pd.concat([df[cols] for df in dfs])
-    beach_groups = df_all.groupby(["beach"])
-    for beach_name, beach_group in beach_groups:
-        site_groups = beach_group.groupby(["lat", "lon"])
-        siteNo = 1
-        for site_name, site_group in site_groups:
-            site = "{}{:04d}".format(beach_name, siteNo)
-            rows.append({"site_id": site, "lat": site_name[0], "lon": site_name[1], "beach": beach_name})
-            siteNo += 1
-
-    df = pd.DataFrame(rows)
-
-    return df
-
-
-def replace_unique_sites(df, df_sites, cols=["lat", "lon"]):
+def replace_unique_sites(df, df_sites):
     """
     Replaces beach/lat/lon columns with the unique site_id
     :param dfs:
     :return:
     """
     # Make the sites index a column, so it can be merged into df
     df_sites["site_id"] = df_sites.index.get_level_values("site_id")
 
-    # Merging on a float can lead to subtle bugs. Lets convert lat/lons to integers and merge on that instead
-    precision = 8
-    df_sites["lat_int"] = np.round(df_sites["lat"] * 10 ** precision).astype(np.int64)
-    df_sites["lon_int"] = np.round(df_sites["lon"] * 10 ** precision).astype(np.int64)
-    df["lat_int"] = np.round(df["lat"] * 10 ** precision).astype(np.int64)
-    df["lon_int"] = np.round(df["lon"] * 10 ** precision).astype(np.int64)
+    # Create eastings and northings so we can calculate distances
+    site_points = [convert_coord_systems(Point(lon, lat)).xy for lon, lat in zip(df_sites["lon"], df_sites["lat"])]
+    df_sites["easting"] = [x[0][0] for x in site_points]
+    df_sites["northing"] = [x[1][0] for x in site_points]
 
-    df_merged = df.merge(df_sites, on=["lat_int", "lon_int"])
+    # Process each unique combination of lat/lons in groups
+    groups = df.groupby(["lat", "lon"])
+    for (lat, lon), df_group in groups:
 
-    # Check that all our records have a unique site identifier
-    n_unmatched = len(df) - len(df_merged)
-    if n_unmatched > 0:
-        logger.warning("Not all records (%d of %d) matched with a unique site", n_unmatched, len(df))
-
-    df_merged = df_merged.drop(
-        columns=[
-            "lat_x",
-            "lon_x",
-            "lat_int",
-            "lon_int",
-            "beach_y",
-            "beach_x",
-            "lat_y",
-            "lon_y",
-            "orientation",
-            "profile_x_lat_lon",
-        ]
-    )
+        # Calculate distances from each point to each site and determine closest site
+        easting, northing = [x[0] for x in convert_coord_systems(Point(lon, lat)).xy]
+        distances_to_sites = np.sqrt((df_sites["easting"] - easting) ** 2 + (df_sites["northing"] - northing) ** 2)
+        min_distance = distances_to_sites.min()
+        closest_site = distances_to_sites.idxmin()
 
-    return df_merged
+        # Do some logging so we can check later.
+        if min_distance > 1:
+            logger.warning("Closest site to (%.4f,%.4f) is %s (%.2f m away)", lat, lon, closest_site, min_distance)
+        else:
+            logger.info("Closest site to (%.4f,%.4f) is %s (%.2f m away)", lat, lon, closest_site, min_distance)
+
+        # Assign site_id based on closest site
+        df.loc[df_group.index, "site_id"] = closest_site
 
-@click.command(short_help="create sites.csv")
-@click.option("--waves-mat", required=True, help=".mat file containing wave records")
-@click.option("--tides-mat", required=True, help=".mat file containing tide records")
-@click.option("--profiles-mat", required=True, help=".mat file containing beach profiles")
-@click.option("--orientations-mat", required=True, help=".mat file containing orientation of beach profiles")
-@click.option("--output-file", required=True, help="where to save sites.csv")
-def create_sites_csv(waves_mat, tides_mat, profiles_mat, orientations_mat, output_file):
-    logger.info("Creating %s", output_file)
-    df_waves = parse_waves(waves_mat=waves_mat)
-    df_tides = parse_tides(tides_mat=tides_mat)
-    df_profiles = parse_profiles(profiles_mat=profiles_mat)
-    df_orientations = parse_orientations(orientations_mat=orientations_mat)
-    df_sites = get_unique_sites(dfs=[df_waves, df_tides, df_profiles])
-    df_sites = combine_sites_and_orientaions(df_sites, df_orientations)
-    df_sites = specify_lat_lon_profile_center(df_sites)
-    df_sites.set_index(["site_id"], inplace=True)
-    df_sites.to_csv(output_file)
-    logger.info("Created %s", output_file)
+    nan_count = df.site_id.isna().sum()
+    if nan_count > 0:
+        logger.warning("Not all records (%d of %d) matched with a unique site", nan_count, len(df))
+
+    df = df.drop(columns=["lat", "lon", "beach"])
+
+    return df
 
 
 @click.command(short_help="create waves.csv")
@@ -301,17 +338,22 @@
 @click.command(short_help="create profiles.csv")
 @click.option("--profiles-mat", required=True, help=".mat file containing beach profiles")
-@click.option("--sites-csv", required=True, help=".csv file description of cross section sites")
-@click.option("--output-file", required=True, help="where to save profiles.csv")
-def create_profiles_csv(profiles_mat, sites_csv, output_file):
-    logger.info("Creating %s", output_file)
-    df_profiles = parse_profiles(profiles_mat=profiles_mat)
-    df_sites = pd.read_csv(sites_csv, index_col=[0])
-    df_profiles = replace_unique_sites(df_profiles, df_sites)
+@click.option("--profiles-output-file", required=True, help="where to save profiles.csv")
+@click.option("--sites-output-file", required=True, help="where to save sites.csv")
+def create_sites_and_profiles_csv(profiles_mat, profiles_output_file, sites_output_file):
+    logger.info("Creating sites and profiles csvs")
+    df_profiles, df_sites = parse_profiles_and_sites(profiles_mat=profiles_mat)
     df_profiles.set_index(["site_id", "profile_type", "x"], inplace=True)
     df_profiles.sort_index(inplace=True)
-    df_profiles.to_csv(output_file)
-    logger.info("Created %s", output_file)
+    df_profiles = remove_zeros(df_profiles)
+
+    df_sites.set_index(["site_id"], inplace=True)
+    df_sites.sort_index(inplace=True)
+
+    df_profiles.to_csv(profiles_output_file)
+    logger.info("Created %s", profiles_output_file)
+    df_sites.to_csv(sites_output_file)
+    logger.info("Created %s", sites_output_file)
 
 
 @click.command(short_help="create profiles.csv")
@@ -335,8 +377,7 @@ def cli():
 
 
 if __name__ == "__main__":
-    cli.add_command(create_sites_csv)
     cli.add_command(create_waves_csv)
-    cli.add_command(create_profiles_csv)
+    cli.add_command(create_sites_and_profiles_csv)
     cli.add_command(create_tides_csv)
     cli()
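
For reference, the nearest-site matching introduced in replace_unique_sites() boils down to the sketch below; the site IDs, eastings and northings are made-up values, whereas in the patch they come from convert_coord_systems() (EPSG:28356):

import numpy as np
import pandas as pd

# Hypothetical sites in projected coordinates (metres), indexed by site_id.
df_sites = pd.DataFrame(
    {"easting": [342000.0, 342500.0], "northing": [6266000.0, 6266400.0]},
    index=pd.Index(["NARRA0001", "NARRA0002"], name="site_id"),
)

# A record's location, already converted from lat/lon to easting/northing.
easting, northing = 342010.0, 6266005.0

distances_to_sites = np.sqrt((df_sites["easting"] - easting) ** 2 + (df_sites["northing"] - northing) ** 2)
print(distances_to_sites.idxmin())         # NARRA0001
print(round(distances_to_sites.min(), 1))  # 11.2

Records that end up more than 1 m from their nearest site are logged as warnings, mirroring the check in the diff above.
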
diff --git a/src/data/profile_features.py b/src/data/profile_features.py
index 78ba406..cb63568 100644
--- a/src/data/profile_features.py
+++ b/src/data/profile_features.py
@@ -15,22 +15,6 @@
 logging.config.fileConfig("./src/logging.conf", disable_existing_loggers=False)
 logger = logging.getLogger(__name__)
 
-def shapes_from_shp(shp_file):
-    """
-    Parses a shape file and returns a list of shapely shapes, ids and properties
-    :param shp_file:
-    :return:
-    """
-    shapes = []
-    ids = []
-    properties = []
-    for feat in fiona.open(shp_file, "r"):
-        shapes.append(shape(feat["geometry"]))
-        ids.append(feat["id"])
-        properties.append(feat["properties"])
-    return shapes, ids, properties
-
-
 def convert_coord_systems(g1, in_coord_system="EPSG:4326", out_coord_system="EPSG:28356"):
     """
     Converts coordinates from one coordinates system to another. Needed because shapefiles are usually defined in
@@ -50,6 +34,22 @@
     return g2
 
 
+def shapes_from_shp(shp_file):
+    """
+    Parses a shape file and returns a list of shapely shapes, ids and properties
+    :param shp_file:
+    :return:
+    """
+    shapes = []
+    ids = []
+    properties = []
+    for feat in fiona.open(shp_file, "r"):
+        shapes.append(shape(feat["geometry"]))
+        ids.append(feat["id"])
+        properties.append(feat["properties"])
+    return shapes, ids, properties
+
+
 def distance_to_intersection(lat, lon, landward_orientation, beach, line_strings, line_properties):
     """
     Returns the distance at whjch a line drawn from a lat/lon at an orientation intersects a line stinrg
@@ -143,7 +143,7 @@ def parse_profile_features(df_sites, df_profiles, dune_crest_shp, dune_toe_shp):
         lambda row: beach_profile_elevation(row["{}_x".format(feat)], df_profiles, "prestorm", row.name), axis=1
     )
 
-    df_profile_features = df_profile_features.drop(columns=["beach", "lat", "lon", "orientation"])
+    df_profile_features = df_profile_features.drop(columns=["beach", "lat", "lon", "orientation", "profile_x_lat_lon"])
     return df_profile_features
 
 
diff --git a/src/logging.yaml b/src/logging.yaml
new file mode 100644
index 0000000..fc82b31
--- /dev/null
+++ b/src/logging.yaml
@@ -0,0 +1,50 @@
+---
+version: 1
+disable_existing_loggers: False
+formatters:
+  simple:
+    format: "%(asctime)s - %(filename)s - %(levelname)s - %(message)s"
+
+handlers:
+  console:
+    class: logging.StreamHandler
+    level: DEBUG
+    formatter: simple
+    stream: ext://sys.stdout
+
+  info_file_handler:
+    class: logging.handlers.RotatingFileHandler
+    level: INFO
+    formatter: simple
+    filename: info.log
+    maxBytes: 10485760 # 10MB
+    backupCount: 3
+    encoding: utf8
+
+  warning_file_handler:
+    class: logging.handlers.RotatingFileHandler
+    level: WARNING
+    formatter: simple
+    filename: warning.log
+    maxBytes: 10485760 # 10MB
+    backupCount: 3
+    encoding: utf8
+
+  error_file_handler:
+    class: logging.handlers.RotatingFileHandler
+    level: ERROR
+    formatter: simple
+    filename: error.log
+    maxBytes: 10485760 # 10MB
+    backupCount: 3
+    encoding: utf8
+
+loggers:
+  my_module:
+    level: ERROR
+    handlers: [console]
+    propagate: no
+
+root:
+  level: INFO
+  handlers: [console, info_file_handler, error_file_handler, warning_file_handler]
diff --git a/src/utils.py b/src/utils.py
new file mode 100644
index 0000000..0df1fc6
--- /dev/null
+++ b/src/utils.py
@@ -0,0 +1,16 @@
+import logging.config
+import os
+import yaml
+
+
+def setup_logging(path="./src/logging.yaml", default_level=logging.INFO):
+    """
+    Setup logging configuration
+    """
+    if os.path.exists(path):
+        with open(path, "rt") as f:
+            config = yaml.safe_load(f.read())
+        logging.config.dictConfig(config)
+    else:
+        logging.basicConfig(level=default_level)
+    return logging.getLogger(__name__)
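
A quick worked example of the matlab_datenum_to_datetime() helper used for the new survey_datetime column (the function body is copied from parse_mat.py above; the datenum value itself is illustrative):

from datetime import datetime, timedelta


def matlab_datenum_to_datetime(matlab_datenum):
    # MATLAB datenums count days from year 0, Python ordinals from year 1, hence the 366-day offset.
    return datetime.fromordinal(int(matlab_datenum)) + timedelta(days=matlab_datenum % 1) - timedelta(days=366)


print(matlab_datenum_to_datetime(736486.75))  # 2016-06-05 18:00:00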