def remove_zeros(df_profiles):
    """
    Replace the run of trailing zero elevations in each profile with NaN.

    When parsing the pre/post storm profiles, the end of some profiles is padded with constant zeros. These
    are changed to NaN for consistency. A blanket replacement of every zero is deliberately avoided because
    0 may still be a valid elevation in the middle of a profile; only points whose ``x`` lies beyond the last
    non-zero ``z`` of their own profile are touched. A profile with no non-zero ``z`` at all is treated as
    consisting entirely of padding and is set to NaN (previously this case raised an IndexError).

    :param df_profiles: DataFrame with a ``z`` column and an index containing the levels ``site_id``,
        ``profile_type`` and ``x``.
    :return: A copy of ``df_profiles`` sorted by index, with trailing zeros in ``z`` replaced by NaN. The
        input frame is not modified.
    """
    group_levels = ['site_id', 'profile_type']

    # sort_index returns a new frame, so the caller's data is left untouched.
    df_profiles = df_profiles.sort_index()
    if df_profiles.empty:
        return df_profiles

    x = pd.Series(df_profiles.index.get_level_values('x').to_numpy(), index=df_profiles.index)

    # NaN elevations compare as "not equal to zero", so they count as real data here.
    has_elevation = (df_profiles['z'] != 0).to_numpy()

    # Largest x carrying a non-zero elevation, broadcast back onto every row of the same profile.
    # Profiles without any non-zero elevation end up with NaN.
    x_last_ele = x.where(has_elevation).groupby(level=group_levels).transform('max')

    all_zero = x_last_ele.isna().to_numpy()
    if all_zero.any():
        logger.warning('%d profile points belong to profiles with no non-zero elevation; setting them to NaN',
                       int(all_zero.sum()))

    # Comparisons against NaN are False, so all-zero profiles are picked up by the second term only.
    trailing = (x.to_numpy() > x_last_ele.to_numpy()) | all_zero
    logger.debug('Replacing %d trailing zero elevations with NaN', int(trailing.sum()))

    # mask() upcasts to float where needed, so an integer z column is handled cleanly.
    df_profiles['z'] = df_profiles['z'].mask(trailing)
    return df_profiles