def remove_zeros(df_profiles):
    """
    Replace the zero padding at the end of each beach profile with NaN.

    In the raw pre/post storm profile data, the end of some cross-sections is padded with constant zero elevations.
    Those zeros are not measurements: if a zero elevation should be assumed at these locations, that belongs in the
    calculation that needs it, not in the raw data. A blanket replacement of zeros is not used because 0 may still
    be a valid elevation inside a profile, so only the values located after the last real elevation are changed.

    For every (site_id, profile_type) profile, rows whose ``x`` is greater than the ``x`` of the last elevation that
    is neither zero nor NaN get ``z = NaN``. A profile containing no such elevation at all is treated as being
    padding only and is set to NaN entirely.

    :param df_profiles: DataFrame with a ``z`` column and an index containing the levels ``site_id``,
        ``profile_type`` and ``x``.
    :return: New DataFrame sorted by index with the trailing zero elevations replaced by NaN. The input frame is
        left unmodified.
    """

    df_profiles = df_profiles.sort_index()
    groups = df_profiles.groupby(level=['site_id', 'profile_type'])

    # Loop-invariant arrays, computed once instead of once per profile.
    x = df_profiles.index.get_level_values('x').values
    z = df_profiles['z']
    # NaN compares unequal to zero, so it must be excluded explicitly or an existing NaN tail would be mistaken
    # for real data and hide the zeros in front of it.
    has_data = (z.notna() & (z != 0)).values

    is_padding = np.zeros(len(df_profiles), dtype=bool)
    for (site_id, profile_type), rows in groups.indices.items():
        logger.debug('Removing zeros from {} profile at {}'.format(profile_type, site_id))
        rows_with_data = rows[has_data[rows]]
        if rows_with_data.size:
            is_padding[rows] = x[rows] > x[rows_with_data].max()
        else:
            # No real elevation in this profile: everything is padding.
            is_padding[rows] = True

    # Single assignment; skipped when there is nothing to replace so the column is not touched needlessly.
    if is_padding.any():
        df_profiles.loc[is_padding, 'z'] = np.nan

    return df_profiles