"""
Converts raw .mat files into a flattened .csv structure which can be imported into Python pandas.
"""
import math
from datetime import datetime, timedelta
import click
import numpy as np
import pandas as pd
from mat4py import loadmat
from shapely.geometry import Point
from utils import convert_coord_systems
from logs import setup_logging
logger = setup_logging()
def parse_crest_toes(df_raw_features, df_profiles):
"""
Parses profile_features_chris_leaman.csv
:param profile_features_csv:
:return:
"""
# Puts profiles_features_csv into format expected by rest of analysis
6 years ago
df_crest_toes = df_raw_features.reset_index().melt(
id_vars=["site_id"],
value_vars=[
"prestorm_dune_crest_x",
"prestorm_dune_toe_x",
"poststorm_dune_crest_x",
"poststorm_dune_toe_x",
],
)
df_crest_toes["profile_type"] = df_crest_toes.variable.str.extract(
r"(prestorm|poststorm)"
)
df_crest_toes["point_type"] = df_crest_toes.variable.str.extract(
r"(dune_crest_x|dune_toe_x)"
)
df_crest_toes = df_crest_toes.drop(columns=["variable"])
df_crest_toes = df_crest_toes.sort_values("site_id")
df_crest_toes = df_crest_toes.set_index(["site_id", "profile_type", "point_type"])
df_crest_toes = df_crest_toes.unstack()
df_crest_toes.columns = df_crest_toes.columns.droplevel()
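    # df_crest_toes is now indexed by (site_id, profile_type), with one column for
    # dune_crest_x and one for dune_toe_x.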
# Now let's calculate the corresponding z elevations for each of our x coordinates
for site_id in df_crest_toes.index.get_level_values("site_id").unique():
logger.info("Calculating dune toe/crest z elevations for {}".format(site_id))
# Get profile for this site
idx = pd.IndexSlice
df_profile = df_profiles.loc[idx[site_id, :, :], :]
for param in ["prestorm", "poststorm"]:
for loc in ["crest", "toe"]:
# Get x value to find corresponding z value
x_val = df_crest_toes.loc[(site_id, param), "dune_{}_x".format(loc)]
if np.isnan(x_val):
df_crest_toes.loc[
(site_id, param), "dune_{}_z".format(loc)
] = np.nan
continue
                # Try to get the value from the other profile if we return a NaN or empty dataframe
df_z = df_profile.loc[idx[site_id, param, x_val], :]
if df_z.empty:
if param == "prestorm":
new_param = "poststorm"
elif param == "poststorm":
new_param = "prestorm"
z_val = df_profile.loc[idx[site_id, new_param, x_val], :].z
else:
z_val = df_z.z
# Put results back into merged dataframe
df_crest_toes.loc[(site_id, param), "dune_{}_z".format(loc)] = z_val
return df_crest_toes
def parse_dune_crest_toes(df_sites, crest_mat, toe_mat):
"""
:param df_sites:
:param crest_mat:
:param toe_mat:
:return:
"""
logger.info("Parsing dune crests and toes")
rows = []
crest_data = loadmat(crest_mat)
toe_data = loadmat(toe_mat)
for n, _ in enumerate(crest_data["xc1"]):
rows.extend(
[
{
"dune_crest_x": crest_data["xc1"][n],
"dune_crest_z": crest_data["zc1"][n],
"dune_toe_x": toe_data["xt1"][n],
"dune_toe_z": toe_data["zt1"][n],
"profile_type": "prestorm",
"site_no": n + 1,
},
{
"dune_crest_x": crest_data["xc2"][n],
"dune_crest_z": crest_data["zc2"][n],
"dune_toe_x": toe_data["xt2"][n],
"dune_toe_z": toe_data["zt2"][n],
"profile_type": "poststorm",
"site_no": n + 1,
},
]
)
df_profile_features = pd.DataFrame(rows)
# Want the site_id instead of the site_no, so merge in df_sites
df_sites.reset_index(inplace=True)
df_profile_features = df_sites[["site_no", "site_id"]].merge(
df_profile_features, how="outer", on=["site_no"]
)
df_profile_features.drop(columns=["site_no"], inplace=True)
df_profile_features.set_index(["site_id", "profile_type"], inplace=True)
df_profile_features.sort_index(inplace=True)
df_profile_features = df_profile_features.round(3)
return df_profile_features
def parse_waves(waves_mat):
"""
Parses the raw waves.mat file and returns a pandas dataframe
:param waves_mat:
:return:
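    Each row of the returned dataframe is a single wave record, with columns for the beach,
    lat/lon, datetime, and the wave parameters Hs, Hs0, Tp, dir, E, P, Exs and Pxs.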
"""
logger.info("Parsing %s", waves_mat)
mat_data = loadmat(waves_mat)["data"]
rows = []
for i in range(0, len(mat_data["site"])):
for j in range(0, len(mat_data["dates"][i])):
rows.append(
{
"beach": mat_data["site"][i],
"lon": mat_data["lon"][i],
"lat": mat_data["lat"][i],
"datetime": matlab_datenum_to_datetime(mat_data["dates"][i][j][0]),
"Hs": mat_data["H"][i][j][0],
"Hs0": mat_data["Ho"][i][j][0],
"Tp": mat_data["T"][i][j][0],
"dir": mat_data["D"][i][j][0],
"E": mat_data["E"][i][j][0],
"P": mat_data["P"][i][j][0],
"Exs": mat_data["Exs"][i][j][0],
"Pxs": mat_data["Pxs"][i][j][0],
}
)
df = pd.DataFrame(rows)
df["datetime"] = df["datetime"].dt.round("1s")
return df
def parse_tides(tides_mat):
"""
Parses the raw tides.mat file and returns a pandas dataframe
:param tides_mat:
:return:
"""
logger.info("Parsing %s", tides_mat)
mat_data = loadmat(tides_mat)["data"]
rows = []
for i in range(0, len(mat_data["site"])):
for j in range(0, len(mat_data["time"])):
rows.append(
{
"beach": mat_data["site"][i][0],
"lon": mat_data["lons"][i][0],
"lat": mat_data["lats"][i][0],
"datetime": matlab_datenum_to_datetime(mat_data["time"][j][0]),
"tide": mat_data["tide"][i][j],
}
)
df = pd.DataFrame(rows)
df["datetime"] = df["datetime"].dt.round("1s")
return df
def parse_profiles_and_sites(profiles_mat):
"""
Parses the raw profiles.mat file and returns a pandas dataframe
:param tides_mat:
:return:
"""
logger.info("Parsing %s", profiles_mat)
mat_data = loadmat(profiles_mat)["data"]
profile_rows = []
site_rows = []
site_counter = 0
    # Our z values can come from these columns, depending on the isgood flag.
    # Let's reorganise them into a list of lists.
z_names = ["Zpre", "Zpost", "Zrec1", "Zrec2", "Zrec3", "Zrec4"]
z_cols = [mat_data[col] for col in z_names]
z_sites = []
for cols in zip(*z_cols):
z_vals = []
for z_vector in zip(*cols):
z_vals.append([z[0] for z in z_vector])
z_sites.append(z_vals)
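    # z_sites[i][j][k] now holds the z value for site i, x coordinate j, taken from the
    # k-th survey column (Zpre, Zpost, Zrec1, ...).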
for i, site in enumerate(mat_data["site"]):
logger.debug("Processing site {} of {}".format(i + 1, len(mat_data["site"])))
# Give each site a unique id
if len(site_rows) == 0 or site_rows[-1]["beach"] != site:
site_counter = 1
else:
site_counter += 1
site_id = "{}{:04d}".format(site, site_counter)
        # Initialize the latitude and longitude of the x=200 m point on the profile
x_200_lat = np.nan
x_200_lon = np.nan
        # Want to calculate the orientation of the profile
orientation = {}
for x, lat, lon, z_site, easting, northing in zip(
mat_data["x"][i],
mat_data["lats"][i],
mat_data["lons"][i],
z_sites[i],
mat_data["eastings"][i],
mat_data["northings"][i],
):
profile_type = None
for j, is_good in enumerate([1] + mat_data["isgood"][i]):
                # Assumes the first profile is always good and is the prestorm profile
                if j == 0:
                    profile_type = "prestorm"
z = z_site[j]
land_lim = np.nan
# Skips bad profiles
elif is_good == 0:
continue
# Takes the first isgood profile as the post storm profile
else:
profile_type = "poststorm"
z = z_site[j]
land_lim = mat_data["landlims"][i][j]
survey_datetime = matlab_datenum_to_datetime(
mat_data["surveydates"][i][j]
)
                # Keep a record of where the center of the profile is located, and of the
                # land and sea ends of the profile.
                # TODO: This code isn't very transferable. What if we don't have lat/lons at x=200 m? Revisit this.
if x[0] == 200:
x_200_lat = lat[0]
x_200_lon = lon[0]
elif x[0] == 0:
orientation["land_easting"] = easting[0]
orientation["land_northing"] = northing[0]
elif x[0] == 400:
orientation["sea_easting"] = easting[0]
orientation["sea_northing"] = northing[0]
profile_rows.append(
{
"site_id": site_id,
"lon": lon[0],
"lat": lat[0],
"profile_type": profile_type,
"x": x[0],
"z": z,
"land_lim": land_lim,
"survey_datetime": survey_datetime,
}
)
# Stop looking at profiles if we've got our post-storm profile
if profile_type == "poststorm":
break
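        # atan2(d_northing, d_easting) between the land and sea ends gives the profile
        # orientation in degrees, measured counterclockwise from east.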
orientation = math.degrees(
math.atan2(
orientation["land_northing"] - orientation["sea_northing"],
orientation["land_easting"] - orientation["sea_easting"],
)
)
site_rows.append(
{
"site_id": site_id,
"site_no": i + 1,
"beach": site,
"lat": x_200_lat,
"lon": x_200_lon,
"orientation": orientation,
"profile_x_lat_lon": 200,
}
)
df_profiles = pd.DataFrame(profile_rows)
df_sites = pd.DataFrame(site_rows)
logger.info("Parsed profiles and sites")
return df_profiles, df_sites
def remove_zeros(df_profiles):
"""
    When parsing the pre/post storm profiles, the ends of some profiles have constant values of zero. Let's
    change these to NaNs for consistency. Didn't use pandas fillna because 0 may still be a valid value.
:param df_profiles:
:return:
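    For example, z values of [5.2, 3.1, 0.0, 0.0, 0.0] become [5.2, 3.1, 0.0, nan, nan]:
    everything seaward of the first zero is set to NaN.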
"""
logger.info("Removing zeros from end of profiles")
df_profiles = df_profiles.sort_index()
groups = df_profiles.groupby(level=["site_id", "profile_type"])
for key, _ in groups:
logger.debug("Removing zeros from {} profile at {}".format(key[1], key[0]))
idx_site = (df_profiles.index.get_level_values("site_id") == key[0]) & (
df_profiles.index.get_level_values("profile_type") == key[1]
)
df_profile = df_profiles[idx_site]
        x_zeros = df_profile[df_profile.z == 0].index.get_level_values("x")
        # Guard against profiles which contain no zero elevations at all
        if len(x_zeros) == 0:
            continue
        x_last_ele = x_zeros[0]
df_profiles.loc[
idx_site & (df_profiles.index.get_level_values("x") > x_last_ele), "z"
] = np.nan
logger.info("Removed zeros from end of profiles")
return df_profiles
def matlab_datenum_to_datetime(matlab_datenum):
"""
Adapted from https://stackoverflow.com/a/13965852
:param matlab_datenum:
:return:
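    Example (doctest-style; the expected value follows from this conversion, which offsets
    MATLAB's year-0-based datenums by 366 days):
    >>> matlab_datenum_to_datetime(737100.5)
    datetime.datetime(2018, 2, 9, 12, 0)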
"""
return (
datetime.fromordinal(int(matlab_datenum))
+ timedelta(days=matlab_datenum % 1)
- timedelta(days=366)
)
def replace_unique_sites(df, df_sites):
"""
Replaces beach/lat/lon columns with the unique site_id
:param dfs:
:param df_sites:
:return:
"""
# Make the sites index a column, so it can be merged into df
df_sites["site_id"] = df_sites.index.get_level_values("site_id")
# Create eastings and northings so we can calculate distances
site_points = [
convert_coord_systems(Point(lon, lat)).xy
for lon, lat in zip(df_sites["lon"], df_sites["lat"])
]
df_sites["easting"] = [x[0][0] for x in site_points]
df_sites["northing"] = [x[1][0] for x in site_points]
# Process each unique combination lat/lons in groups
groups = df.groupby(["lat", "lon"])
for (lat, lon), df_group in groups:
# Calculate distances from each point to each site and determine closest site
easting, northing = [x[0] for x in convert_coord_systems(Point(lon, lat)).xy]
distances_to_sites = np.sqrt(
(df_sites["easting"] - easting) ** 2
+ (df_sites["northing"] - northing) ** 2
)
min_distance = distances_to_sites.min()
closest_site = distances_to_sites.idxmin()
# Do some logging so we can check later.
        if min_distance > 1:
            log = logger.warning
        else:
            log = logger.info
        log(
            "Closest site to (%.4f,%.4f) is %s (%.2f m away)",
            lat,
            lon,
            closest_site,
            min_distance,
        )
# Assign site_id based on closest site
df.loc[df_group.index, "site_id"] = closest_site
nan_count = df.site_id.isna().sum()
if nan_count > 0:
logger.warning(
"Not all records (%d of %d) matched with a unique site", nan_count, len(df)
)
df = df.drop(columns=["lat", "lon", "beach"])
return df
@click.command(short_help="create waves.csv")
@click.option("--waves-mat", required=True, help=".mat file containing wave records")
@click.option(
"--sites-csv", required=True, help=".csv file description of cross section sites"
)
@click.option("--output-file", required=True, help="where to save waves.csv")
def create_waves_csv(waves_mat, sites_csv, output_file):
logger.info("Creating %s", output_file)
df_waves = parse_waves(waves_mat=waves_mat)
df_sites = pd.read_csv(sites_csv, index_col=[0])
df_waves = replace_unique_sites(df_waves, df_sites)
df_waves.set_index(["site_id", "datetime"], inplace=True)
df_waves.sort_index(inplace=True)
df_waves.to_csv(output_file)
logger.info("Created %s", output_file)
# @click.command(short_help="create profile_features.csv")
# @click.option("--crest-mat", required=True, help=".mat file containing wave records")
# @click.option("--toe-mat", required=True, help=".mat file containing wave records")
# @click.option("--sites-csv", required=True, help=".csv file description of cross section sites")
# @click.option("--output-file", required=True, help="where to save waves.csv")
# def create_profile_features(crest_mat, toe_mat, sites_csv, output_file):
# logger.info("Creating %s", output_file)
# df_sites = pd.read_csv(sites_csv, index_col=[0])
# df_profile_features = parse_dune_crest_toes(df_sites, crest_mat, toe_mat)
# df_profile_features.to_csv(output_file)
# logger.info("Created %s", output_file)
@click.command(short_help="create profile_features.csv")
@click.option(
    "--profile-features-csv",
    required=True,
    help=".csv file containing raw profile features",
)
@click.option("--profiles-csv", required=True, help=".csv file containing beach profiles")
@click.option("--output-file", required=True, help="where to save profile_features.csv")
def create_crest_toes(profile_features_csv, profiles_csv, output_file):
logger.info("Creating %s", output_file)
df_raw_features = pd.read_csv(profile_features_csv, index_col=[0])
df_profiles = pd.read_csv(profiles_csv, index_col=[0, 1, 2])
df_crest_toes = parse_crest_toes(df_raw_features, df_profiles)
df_crest_toes.to_csv(output_file, float_format="%.3f")
logger.info("Created %s", output_file)
@click.command(short_help="create profiles.csv")
6 years ago
@click.option(
"--profiles-mat", required=True, help=".mat file containing beach profiles"
)
@click.option(
"--profiles-output-file", required=True, help="where to save profiles.csv"
)
@click.option("--sites-output-file", required=True, help="where to save sites.csv")
def create_sites_and_profiles_csv(
profiles_mat, profiles_output_file, sites_output_file
):
logger.info("Creating sites and profiles csvs")
df_profiles, df_sites = parse_profiles_and_sites(profiles_mat=profiles_mat)
df_profiles.set_index(["site_id", "profile_type", "x"], inplace=True)
df_profiles.sort_index(inplace=True)
df_profiles = remove_zeros(df_profiles)
df_sites.set_index(["site_id"], inplace=True)
df_sites.sort_index(inplace=True)
df_profiles.to_csv(profiles_output_file)
logger.info("Created %s", profiles_output_file)
df_sites.to_csv(sites_output_file, float_format="%.3f")
logger.info("Created %s", sites_output_file)
@click.command(short_help="create tides.csv")
@click.option("--tides-mat", required=True, help=".mat file containing tides")
@click.option(
"--sites-csv", required=True, help=".csv file description of cross section sites"
)
@click.option("--output-file", required=True, help="where to save tides.csv")
def create_tides_csv(tides_mat, sites_csv, output_file):
logger.info("Creating %s", output_file)
df_tides = parse_tides(tides_mat=tides_mat)
df_sites = pd.read_csv(sites_csv, index_col=[0])
df_tides = replace_unique_sites(df_tides, df_sites)
df_tides.set_index(["site_id", "datetime"], inplace=True)
df_tides.sort_index(inplace=True)
df_tides.to_csv(output_file)
logger.info("Created %s", output_file)
@click.group()
def cli():
pass
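
# Example CLI usage (illustrative paths; assumes this module is saved as mat_to_csv.py,
# and note that click converts underscores in command names to dashes):
#   python mat_to_csv.py create-waves-csv \
#       --waves-mat ./data/raw/waves.mat \
#       --sites-csv ./data/interim/sites.csv \
#       --output-file ./data/interim/waves.csv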
if __name__ == "__main__":
cli.add_command(create_waves_csv)
cli.add_command(create_sites_and_profiles_csv)
cli.add_command(create_tides_csv)
cli()