@@ -2,15 +2,25 @@
Converts raw .mat files into a flattened .csv structure which can be imported into python pandas.
"""
import logging.config
import os
import sys

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from datetime import datetime, timedelta
import math

import click
import numpy as np
import pandas as pd
from mat4py import loadmat
from shapely.geometry import Point

from profile_features import convert_coord_systems
from utils import setup_logging

logger = setup_logging()

def parse_orientations(orientations_mat):
@@ -134,7 +144,7 @@ def parse_tides(tides_mat):
    return df


def parse_profiles_and_sites(profiles_mat):
"""
"""
Parses the raw profiles . mat file and returns a pandas dataframe
Parses the raw profiles . mat file and returns a pandas dataframe
: param tides_mat :
: param tides_mat :
@@ -142,39 +152,108 @@ def parse_profiles(profiles_mat):
"""
"""
logger . info ( " Parsing %s " , profiles_mat )
logger . info ( " Parsing %s " , profiles_mat )
mat_data = loadmat ( profiles_mat ) [ " data " ]
mat_data = loadmat ( profiles_mat ) [ " data " ]
    profile_rows = []
    site_rows = []
    site_counter = 0

    for i, site in enumerate(mat_data["site"]):

        # Give each site a unique id
        if len(site_rows) == 0 or site_rows[-1]["beach"] != site:
            site_counter = 1
        else:
            site_counter += 1
        site_id = "{}{:04d}".format(site, site_counter)

        # Initialize the latitude and longitude at x=200 m
        x_200_lat = np.nan
        x_200_lon = np.nan

        # We also want to calculate the orientation of each profile
        orientation = {}
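
        # Each per-site variable unpacked below appears to be a list of one-element
        # lists (one per cross-shore point), hence the [0] indexing further down.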
        for x, lat, lon, z_prestorm, z_poststorm, easting, northing in zip(
            mat_data["x"][i],
            mat_data["lats"][i],
            mat_data["lons"][i],
            mat_data["Zpre"][i],
            mat_data["Zpost"][i],
            mat_data["eastings"][i],
            mat_data["northings"][i],
        ):

            # Only extract pre and post storm profiles
            for j, profile_type in enumerate(["prestorm", "poststorm"]):
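
                # Process only surveys flagged as good in the .mat file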
if mat_data [ " isgood " ] [ i ] [ j ] == 1 :
land_lim = mat_data [ " landlims " ] [ i ] [ j ]
survey_datetime = matlab_datenum_to_datetime ( mat_data [ " surveydates " ] [ i ] [ j ] )

                    if profile_type == "prestorm":
                        z = z_prestorm
                    else:
                        z = z_poststorm

                    # Keep a record of where the center of the profile is located, and the locations of the land
                    # and sea
                    # TODO: This code isn't very transferable. What if we don't have lat/lons at 200 m? Relook at this
                    if x[0] == 200:
                        x_200_lat = lat[0]
                        x_200_lon = lon[0]
                    elif x[0] == 0:
                        orientation["land_easting"] = easting[0]
                        orientation["land_northing"] = northing[0]
                    elif x[0] == 400:
                        orientation["sea_easting"] = easting[0]
                        orientation["sea_northing"] = northing[0]

                    profile_rows.append(
                        {
                            "site_id": site_id,
                            "lon": lon[0],
                            "lat": lat[0],
                            "profile_type": profile_type,
                            "x": x[0],
                            "z": z[0],
                            "land_lim": land_lim,
                            "survey_datetime": survey_datetime,
                        }
                    )
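
        # The orientation is the angle, in degrees anti-clockwise from east, of the
        # vector pointing from the seaward (x=400 m) point to the landward (x=0 m) point.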
        orientation = math.degrees(
            math.atan2(
                orientation["land_northing"] - orientation["sea_northing"],
                orientation["land_easting"] - orientation["sea_easting"],
            )
        )
        site_rows.append(
            {
                "site_id": site_id,
                "beach": site,
                "lat": x_200_lat,
                "lon": x_200_lon,
                "orientation": orientation,
                "profile_x_lat_lon": 200,
            }
        )

    df_profiles = pd.DataFrame(profile_rows)
    df_sites = pd.DataFrame(site_rows)

    logger.info("Parsed profiles and sites")
    return df_profiles, df_sites


def remove_zeros(df_profiles):
    """
    When parsing the pre/post storm profiles, the ends of some profiles have constant values of zero. Let's change
    these to NaNs for consistency. Didn't use pandas fillna because 0 may still be a valid value.
    :param df_profiles:
    :return:
    """
logger . info ( " Removing zeros from end of profiles " )
df_profiles = df_profiles . sort_index ( )
df_profiles = df_profiles . sort_index ( )
groups = df_profiles . groupby ( level = [ " site_id " , " profile_type " ] )
groups = df_profiles . groupby ( level = [ " site_id " , " profile_type " ] )
for key , _ in groups :
for key , _ in groups :
@@ -185,6 +264,7 @@ def remove_zeros(df_profiles):
        df_profile = df_profiles[idx_site]
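        # Find the last cross-shore (x) location with a non-zero elevation; any zeros
        # beyond it are treated as padding and set to NaN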
        x_last_ele = df_profile[df_profile.z != 0].index.get_level_values("x")[-1]
        df_profiles.loc[idx_site & (df_profiles.index.get_level_values("x") > x_last_ele), "z"] = np.nan
    logger.info("Removed zeros from end of profiles")

    return df_profiles
@@ -198,31 +278,7 @@ def matlab_datenum_to_datetime(matlab_datenum):
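    # MATLAB datenums count days from the year 0 while Python ordinals count from the
    # year 1, hence the 366-day offset; the fractional part carries the time of day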
    return datetime.fromordinal(int(matlab_datenum)) + timedelta(days=matlab_datenum % 1) - timedelta(days=366)


def replace_unique_sites(df, df_sites):
"""
"""
Replaces beach / lat / lon columns with the unique site_id
Replaces beach / lat / lon columns with the unique site_id
: param dfs :
: param dfs :
@@ -232,56 +288,37 @@ def replace_unique_sites(df, df_sites, cols=["lat", "lon"]):
    # Make the sites index a column, so it can be merged into df
    df_sites["site_id"] = df_sites.index.get_level_values("site_id")

    # Create eastings and northings so we can calculate distances
    site_points = [convert_coord_systems(Point(lon, lat)).xy for lon, lat in zip(df_sites["lon"], df_sites["lat"])]
    df_sites["easting"] = [x[0][0] for x in site_points]
    df_sites["northing"] = [x[1][0] for x in site_points]

    # Process each unique combination of lat/lons in groups
    groups = df.groupby(["lat", "lon"])
    for (lat, lon), df_group in groups:

        # Calculate distances from each point to each site and determine the closest site
        easting, northing = [x[0] for x in convert_coord_systems(Point(lon, lat)).xy]
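        # Distances are straight-line (Euclidean) in the projected easting/northing
        # coordinate system, which is why the lat/lons are converted above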
        distances_to_sites = np.sqrt((df_sites["easting"] - easting) ** 2 + (df_sites["northing"] - northing) ** 2)
        min_distance = distances_to_sites.min()
        closest_site = distances_to_sites.idxmin()

        # Do some logging so we can check later.
        if min_distance > 1:
            logger.warning("Closest site to (%.4f, %.4f) is %s (%.2f m away)", lat, lon, closest_site, min_distance)
        else:
            logger.info("Closest site to (%.4f, %.4f) is %s (%.2f m away)", lat, lon, closest_site, min_distance)

        # Assign site_id based on closest site
        df.loc[df_group.index, "site_id"] = closest_site
@click.command ( short_help = " create sites.csv " )
nan_count = df . site_id . isna ( ) . sum ( )
@click.option ( " --waves-mat " , required = True , help = " .mat file containing wave records " )
if nan_count > 0 :
@click.option ( " --tides-mat " , required = True , help = " .mat file containing tide records " )
logger . warning ( " Not all records ( %d of %d ) matched with a unique site " , nan_count , len ( df ) )
@click.option ( " --profiles-mat " , required = True , help = " .mat file containing beach profiles " )
@click.option ( " --orientations-mat " , required = True , help = " .mat file containing orientation of beach profiles " )
df = df . drop ( columns = [ " lat " , " lon " , " beach " ] )
@click.option ( " --output-file " , required = True , help = " where to save sites.csv " )
def create_sites_csv ( waves_mat , tides_mat , profiles_mat , orientations_mat , output_file ) :
return df
logger . info ( " Creating %s " , output_file )
df_waves = parse_waves ( waves_mat = waves_mat )
df_tides = parse_tides ( tides_mat = tides_mat )
df_profiles = parse_profiles ( profiles_mat = profiles_mat )
df_orientations = parse_orientations ( orientations_mat = orientations_mat )
df_sites = get_unique_sites ( dfs = [ df_waves , df_tides , df_profiles ] )
df_sites = combine_sites_and_orientaions ( df_sites , df_orientations )
df_sites = specify_lat_lon_profile_center ( df_sites )
df_sites . set_index ( [ " site_id " ] , inplace = True )
df_sites . to_csv ( output_file )
logger . info ( " Created %s " , output_file )
@click.command ( short_help = " create waves.csv " )
@click.command ( short_help = " create waves.csv " )
@@ -301,17 +338,22 @@ def create_waves_csv(waves_mat, sites_csv, output_file):
@click.command ( short_help = " create profiles.csv " )
@click.command ( short_help = " create profiles.csv " )
@click.option ( " --profiles-mat " , required = True , help = " .mat file containing beach profiles " )
@click.option ( " --profiles-mat " , required = True , help = " .mat file containing beach profiles " )
@click.option ( " --sites-csv " , required = True , help = " .csv file description of cross section sites " )
@click.option ( " --profiles-output-file " , required = True , help = " where to save profiles.csv " )
@click.option ( " --output-file " , required = True , help = " where to save profiles.csv " )
@click.option ( " --sites-output-file " , required = True , help = " where to save sites.csv " )
def create_profiles_csv ( profiles_mat , sites_csv , output_file ) :
def create_sites_and_profiles_csv ( profiles_mat , profiles_output_file , sites_output_file ) :
logger . info ( " Creating %s " , output_file )
logger . info ( " Creating sites and profiles csvs " )
df_profiles = parse_profiles ( profiles_mat = profiles_mat )
df_profiles , df_sites = parse_profiles_and_sites ( profiles_mat = profiles_mat )
df_sites = pd . read_csv ( sites_csv , index_col = [ 0 ] )
df_profiles = replace_unique_sites ( df_profiles , df_sites )
    df_profiles.set_index(["site_id", "profile_type", "x"], inplace=True)
    df_profiles.sort_index(inplace=True)
    df_profiles = remove_zeros(df_profiles)

    df_sites.set_index(["site_id"], inplace=True)
    df_sites.sort_index(inplace=True)

    df_profiles.to_csv(profiles_output_file)
    logger.info("Created %s", profiles_output_file)
    df_sites.to_csv(sites_output_file)
    logger.info("Created %s", sites_output_file)
@click.command ( short_help = " create profiles.csv " )
@click.command ( short_help = " create profiles.csv " )
@@ -335,8 +377,7 @@ def cli():
if __name__ == "__main__":
    cli.add_command(create_waves_csv)
    cli.add_command(create_sites_and_profiles_csv)
    cli.add_command(create_tides_csv)
    cli()