@@ -2,15 +2,25 @@
 Converts raw .mat files into a flattened .csv structure which can be imported into python pandas.
 """
 import logging.config
+import os
+import sys
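+# Add the directory two levels up to the path so that profile_features and utils
+# (assumed to live there) can be imported when this file is run as a script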
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 from datetime import datetime, timedelta
+import math
 import click
+import numpy as np
 import pandas as pd
 from mat4py import loadmat
-import numpy as np
+from shapely.geometry import Point
+
+from profile_features import convert_coord_systems
+from utils import setup_logging

-logging.config.fileConfig("./src/logging.conf", disable_existing_loggers=False)
-logger = logging.getLogger(__name__)
+logger = setup_logging()


 def parse_orientations(orientations_mat):
@@ -134,7 +144,7 @@ def parse_tides(tides_mat):
     return df


-def parse_profiles(profiles_mat):
+def parse_profiles_and_sites(profiles_mat):
     """
     Parses the raw profiles.mat file and returns a pandas dataframe
     :param profiles_mat:
@@ -142,39 +152,108 @@ def parse_profiles(profiles_mat):
     :return:
     """
     logger.info("Parsing %s", profiles_mat)
     mat_data = loadmat(profiles_mat)["data"]
-    rows = []
-    for i in range(0, len(mat_data["site"])):
-        for j in range(0, len(mat_data["pfx"][i])):
-            for profile_type in ["prestorm", "poststorm"]:
+    profile_rows = []
+    site_rows = []
+    site_counter = 0
+
+    for i, site in enumerate(mat_data["site"]):
+
+        # Give each site a unique id
+        if len(site_rows) == 0 or site_rows[-1]["beach"] != site:
+            site_counter = 1
+        else:
+            site_counter += 1
+        site_id = "{}{:04d}".format(site, site_counter)
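+        # e.g. the second profile on a beach named "AVOCA" (illustrative name) gets site_id "AVOCA0002"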
+
+        # Initialize the latitude and longitude at the x=200 m point
+        x_200_lat = np.nan
+        x_200_lon = np.nan
+
+        # Want to calculate the orientation
+        orientation = {}
+
+        for x, lat, lon, z_prestorm, z_poststorm, easting, northing in zip(
+            mat_data["x"][i],
+            mat_data["lats"][i],
+            mat_data["lons"][i],
+            mat_data["Zpre"][i],
+            mat_data["Zpost"][i],
+            mat_data["eastings"][i],
+            mat_data["northings"][i],
+        ):
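+            # mat4py returns each value as a single-element list, hence the [0] indexing throughout this loop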
+            # Only extract pre and post storm profiles
+            for j, profile_type in enumerate(["prestorm", "poststorm"]):

                 if mat_data["isgood"][i][j] == 1:

                     land_lim = mat_data["landlims"][i][j]
                     survey_datetime = matlab_datenum_to_datetime(mat_data["surveydates"][i][j])

                     if profile_type == "prestorm":
-                        z = mat_data["pf1"][i][j][0]
-                    if profile_type == "poststorm":
-                        z = mat_data["pf2"][i][j][0]
-                    rows.append(
+                        z = z_prestorm
+                    else:
+                        z = z_poststorm
+
+                    # Keep a record of where the center of the profile is located, and the locations of the land
+                    # and sea ends
+                    # TODO: This code isn't very transferable. What if we don't have lat/lons at 200 m? Revisit this
+                    if x[0] == 200:
+                        x_200_lat = lat[0]
+                        x_200_lon = lon[0]
+                    elif x[0] == 0:
+                        orientation["land_easting"] = easting[0]
+                        orientation["land_northing"] = northing[0]
+                    elif x[0] == 400:
+                        orientation["sea_easting"] = easting[0]
+                        orientation["sea_northing"] = northing[0]
+
+                    profile_rows.append(
                         {
-                            "beach": mat_data["site"][i],
-                            "lon": mat_data["lon"][i],
-                            "lat": mat_data["lat"][i],
+                            "site_id": site_id,
+                            "lon": lon[0],
+                            "lat": lat[0],
                             "profile_type": profile_type,
-                            "x": mat_data["pfx"][i][j][0],
-                            "z": z,
+                            "x": x[0],
+                            "z": z[0],
                             "land_lim": land_lim,
                             "survey_datetime": survey_datetime,
                         }
                     )
-    df = pd.DataFrame(rows)
-    return df
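+
+        # Orientation is the bearing of the vector pointing from the seaward (x=400) end of the
+        # profile back to the landward (x=0) end, in degrees anticlockwise from east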
+        orientation = math.degrees(
+            math.atan2(
+                orientation["land_northing"] - orientation["sea_northing"],
+                orientation["land_easting"] - orientation["sea_easting"],
+            )
+        )
+
+        site_rows.append(
+            {
+                "site_id": site_id,
+                "beach": site,
+                "lat": x_200_lat,
+                "lon": x_200_lon,
+                "orientation": orientation,
+                "profile_x_lat_lon": 200,
+            }
+        )
+
+    df_profiles = pd.DataFrame(profile_rows)
+    df_sites = pd.DataFrame(site_rows)
+
+    logger.info("Parsed profiles and sites")
+    return df_profiles, df_sites


 def remove_zeros(df_profiles):
     """
     When parsing the pre/post storm profiles, the ends of some profiles have constant values of zero. Let's change
     these to NaNs for consistency. Didn't use pandas fillna because 0 may still be a valid value.
-    :param df:
+    :param df_profiles:
     :return:
     """
+    logger.info("Removing zeros from end of profiles")
     df_profiles = df_profiles.sort_index()
     groups = df_profiles.groupby(level=["site_id", "profile_type"])
     for key, _ in groups:
@@ -185,6 +264,7 @@ def remove_zeros(df_profiles):
         df_profile = df_profiles[idx_site]
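+        # The zeros form a constant run at the end of the profile; find the last genuinely non-zero point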
         x_last_ele = df_profile[df_profile.z != 0].index.get_level_values("x")[-1]
         df_profiles.loc[idx_site & (df_profiles.index.get_level_values("x") > x_last_ele), "z"] = np.nan
+    logger.info("Removed zeros from end of profiles")

     return df_profiles
@@ -198,31 +278,7 @@ def matlab_datenum_to_datetime(matlab_datenum):
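+    # MATLAB datenums count days from year 0 while Python ordinals start at year 1, so subtract the
+    # 366 days of the (proleptic) year 0 when converting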
     return datetime.fromordinal(int(matlab_datenum)) + timedelta(days=matlab_datenum % 1) - timedelta(days=366)


-def get_unique_sites(dfs, cols=["beach", "lat", "lon"]):
-    """
-    Generates a dataframe of unique sites based on beach names, lats and lons. Creates a unique site ID for each.
-    :param dfs:
-    :param cols:
-    :return:
-    """
-    rows = []
-    df_all = pd.concat([df[cols] for df in dfs])
-    beach_groups = df_all.groupby(["beach"])
-    for beach_name, beach_group in beach_groups:
-        site_groups = beach_group.groupby(["lat", "lon"])
-        siteNo = 1
-        for site_name, site_group in site_groups:
-            site = "{}{:04d}".format(beach_name, siteNo)
-            rows.append({"site_id": site, "lat": site_name[0], "lon": site_name[1], "beach": beach_name})
-            siteNo += 1
-    df = pd.DataFrame(rows)
-    return df
-
-
-def replace_unique_sites(df, df_sites, cols=["lat", "lon"]):
+def replace_unique_sites(df, df_sites):
     """
     Replaces beach/lat/lon columns with the unique site_id
     :param df:
@@ -232,56 +288,37 @@ def replace_unique_sites(df, df_sites, cols=["lat", "lon"]):
     # Make the sites index a column, so it can be merged into df
     df_sites["site_id"] = df_sites.index.get_level_values("site_id")

-    # Merging on a float can lead to subtle bugs. Let's convert lat/lons to integers and merge on that instead
-    precision = 8
-    df_sites["lat_int"] = np.round(df_sites["lat"] * 10 ** precision).astype(np.int64)
-    df_sites["lon_int"] = np.round(df_sites["lon"] * 10 ** precision).astype(np.int64)
-    df["lat_int"] = np.round(df["lat"] * 10 ** precision).astype(np.int64)
-    df["lon_int"] = np.round(df["lon"] * 10 ** precision).astype(np.int64)
+    # Create eastings and northings so we can calculate distances
+    site_points = [convert_coord_systems(Point(lon, lat)).xy for lon, lat in zip(df_sites["lon"], df_sites["lat"])]
+    df_sites["easting"] = [x[0][0] for x in site_points]
+    df_sites["northing"] = [x[1][0] for x in site_points]

-    df_merged = df.merge(df_sites, on=["lat_int", "lon_int"])
+    # Process each unique combination of lat/lons in groups
+    groups = df.groupby(["lat", "lon"])
+    for (lat, lon), df_group in groups:

-    # Check that all our records have a unique site identifier
-    n_unmatched = len(df) - len(df_merged)
-    if n_unmatched > 0:
-        logger.warning("Not all records (%d of %d) matched with a unique site", n_unmatched, len(df))
-    df_merged = df_merged.drop(
-        columns=[
-            "lat_x",
-            "lon_x",
-            "lat_int",
-            "lon_int",
-            "beach_y",
-            "beach_x",
-            "lat_y",
-            "lon_y",
-            "orientation",
-            "profile_x_lat_lon",
-        ]
-    )
+        # Calculate distances from each point to each site and determine the closest site
+        easting, northing = [x[0] for x in convert_coord_systems(Point(lon, lat)).xy]
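+        # A plain Euclidean distance works here since the points have been projected to eastings/northings in metres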
+        distances_to_sites = np.sqrt((df_sites["easting"] - easting) ** 2 + (df_sites["northing"] - northing) ** 2)
+        min_distance = distances_to_sites.min()
+        closest_site = distances_to_sites.idxmin()

-    return df_merged
+        # Do some logging so we can check later.
+        if min_distance > 1:
+            logger.warning("Closest site to (%.4f, %.4f) is %s (%.2f m away)", lat, lon, closest_site, min_distance)
+        else:
+            logger.info("Closest site to (%.4f, %.4f) is %s (%.2f m away)", lat, lon, closest_site, min_distance)

+        # Assign site_id based on closest site
+        df.loc[df_group.index, "site_id"] = closest_site
@click.command ( short_help = " create sites.csv " )
@click.option ( " --waves-mat " , required = True , help = " .mat file containing wave records " )
@click.option ( " --tides-mat " , required = True , help = " .mat file containing tide records " )
@click.option ( " --profiles-mat " , required = True , help = " .mat file containing beach profiles " )
@click.option ( " --orientations-mat " , required = True , help = " .mat file containing orientation of beach profiles " )
@click.option ( " --output-file " , required = True , help = " where to save sites.csv " )
def create_sites_csv ( waves_mat , tides_mat , profiles_mat , orientations_mat , output_file ) :
logger . info ( " Creating %s " , output_file )
df_waves = parse_waves ( waves_mat = waves_mat )
df_tides = parse_tides ( tides_mat = tides_mat )
df_profiles = parse_profiles ( profiles_mat = profiles_mat )
df_orientations = parse_orientations ( orientations_mat = orientations_mat )
df_sites = get_unique_sites ( dfs = [ df_waves , df_tides , df_profiles ] )
df_sites = combine_sites_and_orientaions ( df_sites , df_orientations )
df_sites = specify_lat_lon_profile_center ( df_sites )
df_sites . set_index ( [ " site_id " ] , inplace = True )
df_sites . to_csv ( output_file )
logger . info ( " Created %s " , output_file )
nan_count = df . site_id . isna ( ) . sum ( )
if nan_count > 0 :
logger . warning ( " Not all records ( %d of %d ) matched with a unique site " , nan_count , len ( df ) )
df = df . drop ( columns = [ " lat " , " lon " , " beach " ] )
return df
@click.command ( short_help = " create waves.csv " )
@ -301,17 +338,22 @@ def create_waves_csv(waves_mat, sites_csv, output_file):
@click.command ( short_help = " create profiles.csv " )
@click.option ( " --profiles-mat " , required = True , help = " .mat file containing beach profiles " )
@click.option ( " --sites-csv " , required = True , help = " .csv file description of cross section sites " )
@click.option ( " --output-file " , required = True , help = " where to save profiles.csv " )
def create_profiles_csv ( profiles_mat , sites_csv , output_file ) :
logger . info ( " Creating %s " , output_file )
df_profiles = parse_profiles ( profiles_mat = profiles_mat )
df_sites = pd . read_csv ( sites_csv , index_col = [ 0 ] )
df_profiles = replace_unique_sites ( df_profiles , df_sites )
@click.option ( " --profiles-output-file " , required = True , help = " where to save profiles.csv " )
@click.option ( " --sites-output-file " , required = True , help = " where to save sites.csv " )
def create_sites_and_profiles_csv ( profiles_mat , profiles_output_file , sites_output_file ) :
logger . info ( " Creating sites and profiles csvs " )
df_profiles , df_sites = parse_profiles_and_sites ( profiles_mat = profiles_mat )
df_profiles . set_index ( [ " site_id " , " profile_type " , " x " ] , inplace = True )
df_profiles . sort_index ( inplace = True )
df_profiles . to_csv ( output_file )
logger . info ( " Created %s " , output_file )
df_profiles = remove_zeros ( df_profiles )
df_sites . set_index ( [ " site_id " ] , inplace = True )
df_sites . sort_index ( inplace = True )
df_profiles . to_csv ( profiles_output_file )
logger . info ( " Created %s " , profiles_output_file )
df_sites . to_csv ( sites_output_file )
logger . info ( " Created %s " , sites_output_file )
@click.command ( short_help = " create profiles.csv " )
@ -335,8 +377,7 @@ def cli():
if __name__ == " __main__ " :
cli . add_command ( create_sites_csv )
cli . add_command ( create_waves_csv )
cli . add_command ( create_ profiles_csv)
cli . add_command ( create_ sites_and_ profiles_csv)
cli . add_command ( create_tides_csv )
cli ( )