"""This module contains all the functions needed for data analysis """ # Initial settings import numpy as np import matplotlib.pyplot as plt import matplotlib.patches as mpatches from matplotlib import gridspec import pdb import ee # other modules from osgeo import gdal, ogr, osr import scipy.interpolate as interpolate import scipy.stats as sstats # image processing modules import skimage.filters as filters import skimage.exposure as exposure import skimage.transform as transform import sklearn.decomposition as decomposition import skimage.measure as measure import skimage.morphology as morphology # machine learning modules from sklearn.cluster import KMeans from sklearn.neural_network import MLPClassifier from sklearn.externals import joblib import time # import own modules import functions.utils as utils def get_tide(dates_sds, dates_tide, tide_level): tide = [] for i in range(len(dates_sds)): dates_diff = np.abs(np.array([ (dates_sds[i] - _).total_seconds() for _ in dates_tide])) if np.min(dates_diff) <= 1800: # half-an-hour idx_closest = np.argmin(dates_diff) tide.append(tide_level[idx_closest]) else: tide.append(np.nan) tide = np.array(tide) return tide def remove_duplicates(output, satname): " removes duplicates from output structure, keep the one with less cloud cover or best georeferencing " dates = output['dates'] dates_str = [_.strftime('%Y%m%d') for _ in dates] dupl = utils.duplicates_dict(dates_str) if dupl: output_nodup = dict([]) idx_remove = [] if satname == 'L8' or satname == 'L5': for k,v in dupl.items(): idx1 = v[0] idx2 = v[1] c1 = output['metadata']['cloud_cover'][idx1] c2 = output['metadata']['cloud_cover'][idx2] g1 = output['metadata']['acc_georef'][idx1] g2 = output['metadata']['acc_georef'][idx2] if c1 < c2 - 0.01: idx_remove.append(idx2) elif g1 < g2 - 0.1: idx_remove.append(idx2) else: idx_remove.append(idx1) else: for k,v in dupl.items(): idx1 = v[0] idx2 = v[1] c1 = output['metadata']['cloud_cover'][idx1] c2 = output['metadata']['cloud_cover'][idx2] if c1 < c2 - 0.01: idx_remove.append(idx2) else: idx_remove.append(idx1) idx_remove = sorted(idx_remove) idx_all = np.linspace(0, len(dates_str)-1, len(dates_str)) idx_keep = list(np.where(~np.isin(idx_all,idx_remove))[0]) output_nodup['dates'] = [output['dates'][k] for k in idx_keep] output_nodup['shorelines'] = [output['shorelines'][k] for k in idx_keep] output_nodup['metadata'] = dict([]) for key in list(output['metadata'].keys()): output_nodup['metadata'][key] = [output['metadata'][key][k] for k in idx_keep] print(satname + ' : ' + str(len(idx_remove)) + ' duplicates') return output_nodup else: print(satname + ' : ' + 'no duplicates') return output def merge(output): " merges data from the different satellites " # stack all list together under one key output_all = {'dates':[], 'shorelines':[], 'metadata':{'filenames':[], 'satname':[], 'cloud_cover':[], 'acc_georef':[]}} for satname in list(output.keys()): output_all['dates'] = output_all['dates'] + output[satname]['dates'] output_all['shorelines'] = output_all['shorelines'] + output[satname]['shorelines'] for key in list(output[satname]['metadata'].keys()): output_all['metadata'][key] = output_all['metadata'][key] + output[satname]['metadata'][key] output_all_sorted = {'dates':[], 'shorelines':[], 'metadata':{'filenames':[], 'satname':[], 'cloud_cover':[], 'acc_georef':[]}} # sort the dates idx_sorted = sorted(range(len(output_all['dates'])), key=output_all['dates'].__getitem__) output_all_sorted['dates'] = [output_all['dates'][i] for i in idx_sorted] 
def create_transects(x0, y0, orientation, chainage_length):
    " creates shore-normal transects "
    transects = []
    for k in range(len(x0)):
        # orientation of cross-shore profile
        phi = (90 - orientation[k]) * np.pi / 180
        # create a vector using the chainage length
        x = np.linspace(0, chainage_length, chainage_length + 1)
        y = np.zeros(len(x))
        coords = np.zeros((len(x), 2))
        coords[:, 0] = x
        coords[:, 1] = y
        # translate and rotate the vector using the origin and orientation
        tf = transform.EuclideanTransform(rotation=phi, translation=(x0[k], y0[k]))
        coords_tf = tf(coords)
        transects.append(coords_tf)
    return transects

def calculate_chainage(sds, transects, orientation, along_dist):
    " intersects the SDS with each transect and computes the chainage position "
    chainage_mtx = np.zeros((len(sds), len(transects), 6))
    for i in range(len(sds)):
        sl = sds[i]
        for j in range(len(transects)):
            # compute rotation matrix
            X0 = transects[j][0, 0]
            Y0 = transects[j][0, 1]
            phi = (90 - orientation[j]) * np.pi / 180
            Mrot = np.array([[np.cos(phi), np.sin(phi)], [-np.sin(phi), np.cos(phi)]])
            # calculate point-to-line distance between the shoreline points and the profile
            p1 = np.array([X0, Y0])
            p2 = transects[j][-1, :]
            p3 = sl
            d = np.abs(np.cross(p2 - p1, p3 - p1) / np.linalg.norm(p2 - p1))
            idx_close = utils.find_indices(d, lambda e: e <= along_dist)
            # check whether there are SDS points around the profile
            if not idx_close:
                chainage_mtx[i, j, :] = np.tile(np.nan, (1, 6))
            else:
                # change of base to shore-normal coordinate system
                xy_close = np.array([sl[idx_close, 0], sl[idx_close, 1]]) - \
                           np.tile(np.array([[X0], [Y0]]), (1, len(sl[idx_close])))
                xy_rot = np.matmul(Mrot, xy_close)
                # put nan values if the chainage is negative (MAKE SURE TO PICK ORIGIN CORRECTLY)
                if np.any(xy_rot[0, :] < 0):
                    xy_rot[0, np.where(xy_rot[0, :] < 0)] = np.nan
                # compute mean, median, max, min and std of the chainage position
                n_points = len(xy_rot[0, :])
                mean_cross = np.nanmean(xy_rot[0, :])
                median_cross = np.nanmedian(xy_rot[0, :])
                max_cross = np.nanmax(xy_rot[0, :])
                min_cross = np.nanmin(xy_rot[0, :])
                std_cross = np.nanstd(xy_rot[0, :])
                if std_cross > 10:  # if large std, take the most seaward point
                    mean_cross = max_cross
                    median_cross = max_cross
                    min_cross = max_cross
                # store the statistics
                chainage_mtx[i, j, :] = np.array([mean_cross, median_cross, max_cross,
                                                  min_cross, n_points, std_cross])
    # format into dictionary
    chainage = dict([])
    chainage['mean'] = chainage_mtx[:, :, 0]
    chainage['median'] = chainage_mtx[:, :, 1]
    chainage['max'] = chainage_mtx[:, :, 2]
    chainage['min'] = chainage_mtx[:, :, 3]
    chainage['npoints'] = chainage_mtx[:, :, 4]
    chainage['std'] = chainage_mtx[:, :, 5]
    return chainage
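# Example usage (illustrative sketch; the origin and orientation values below
# are hypothetical and must be in the same projected coordinate system and
# units [m] as the shoreline points):
#
#   x0 = [342500, 342800]        # transect origins, eastings [m]
#   y0 = [6269000, 6269300]      # transect origins, northings [m]
#   orientation = [90, 95]       # shore-normal azimuths [deg]
#   transects = create_transects(x0, y0, orientation, chainage_length=500)
#   chainage = calculate_chainage(output_merged['shorelines'], transects,
#                                 orientation, along_dist=10)
#   # chainage['median'] is an (n_dates x n_transects) array of cross-shore
#   # distances from each transect origin; nan where no SDS points fell
#   # within along_dist of the transect.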
def compare_sds(dates_sds, chain_sds, topo_profiles, mod=0, mindays=5):
    """
    Compares the SDS with groundtruth data from topographic surveys / Argus shorelines

    KV WRL 2018

    Arguments:
    -----------
    dates_sds: list
        list of dates corresponding to each row in chain_sds
    chain_sds: np.ndarray
        array with the time-series of chainage for each transect (each transect is one column)
    topo_profiles: dict
        dict containing the dates and chainage of the groundtruth
    mod: 0 or 1
        0 for linear interpolation between the 2 closest surveys, 1 for nearest neighbour only
    mindays: int
        minimum number of days within which the data can be compared

    Returns:
    -----------
    stats: dict
        contains all the statistics of the comparison
    """
    # create 3 figures
    fig1 = plt.figure()
    gs1 = gridspec.GridSpec(chain_sds.shape[1], 1)
    fig2 = plt.figure()
    gs2 = gridspec.GridSpec(2, chain_sds.shape[1])
    fig3 = plt.figure()
    gs3 = gridspec.GridSpec(2, 1)

    dates_sds_num = np.array([_.toordinal() for _ in dates_sds])
    stats = dict([])
    data_fin = dict([])
    # for each transect, compare and plot the data
    for i in range(chain_sds.shape[1]):
        pfname = list(topo_profiles.keys())[i]
        stats[pfname] = dict([])
        data_fin[pfname] = dict([])
        dates_sur = topo_profiles[pfname]['dates']
        chain_sur = topo_profiles[pfname]['chainage']
        # convert to datenum
        dates_sur_num = np.array([_.toordinal() for _ in dates_sur])
        chain_sur_interp = []
        diff_days = []
        for j, satdate in enumerate(dates_sds_num):
            temp_diff = satdate - dates_sur_num
            if mod == 0:
                # select the measurements before and after the sat image date and interpolate
                if not np.any(temp_diff > 0):
                    # no survey before the sat image date, cannot interpolate
                    chain_sur_interp.append(np.nan)
                    diff_days.append(np.nan)
                    continue
                ind_before = np.where(temp_diff == temp_diff[temp_diff > 0][-1])[0]
                if ind_before == len(temp_diff) - 1:
                    # no survey after the sat image date, cannot interpolate
                    chain_sur_interp.append(np.nan)
                    diff_days.append(np.abs(satdate - dates_sur_num[ind_before])[0])
                    continue
                ind_after = np.where(temp_diff == temp_diff[temp_diff < 0][0])[0]
                tempx = np.zeros(2)
                tempx[0] = dates_sur_num[ind_before]
                tempx[1] = dates_sur_num[ind_after]
                tempy = np.zeros(2)
                tempy[0] = chain_sur[ind_before]
                tempy[1] = chain_sur[ind_after]
                diff_days.append(np.abs(np.max([satdate - tempx[0], satdate - tempx[1]])))
                # interpolate
                f = interpolate.interp1d(tempx, tempy)
                chain_sur_interp.append(f(satdate))
            elif mod == 1:
                # select the closest measurement
                idx_closest = utils.find_indices(np.abs(temp_diff),
                                                 lambda e: e == np.min(np.abs(temp_diff)))[0]
                diff_days.append(np.abs(satdate - dates_sur_num[idx_closest]))
                if diff_days[j] > mindays:
                    chain_sur_interp.append(np.nan)
                else:
                    chain_sur_interp.append(chain_sur[idx_closest])
        chain_sur_interp = np.array(chain_sur_interp)

        # remove nan values
        idx_sur_nan = ~np.isnan(chain_sur_interp)
        idx_sat_nan = ~np.isnan(chain_sds[:, i])
        idx_nan = np.logical_and(idx_sur_nan, idx_sat_nan)

        # groundtruth and sds
        chain_sur_fin = chain_sur_interp[idx_nan]
        chain_sds_fin = chain_sds[idx_nan, i]
        dates_fin = [k for (k, v) in zip(dates_sds, idx_nan) if v]

        # calculate statistics
        slope, intercept, rvalue, pvalue, std_err = sstats.linregress(chain_sur_fin, chain_sds_fin)
        R2 = rvalue ** 2
        correlation = np.corrcoef(chain_sur_fin, chain_sds_fin)[0, 1]
        diff_chain = chain_sur_fin - chain_sds_fin
        rmse = np.sqrt(np.nanmean((diff_chain) ** 2))
        mean = np.nanmean(diff_chain)
        std = np.nanstd(diff_chain)
        q90 = np.percentile(np.abs(diff_chain), 90)

        # store data
        stats[pfname]['rmse'] = rmse
        stats[pfname]['mean'] = mean
        stats[pfname]['std'] = std
        stats[pfname]['q90'] = q90
        stats[pfname]['diffdays'] = diff_days
        stats[pfname]['corr'] = correlation
        stats[pfname]['linfit'] = {'slope': slope, 'intercept': intercept, 'R2': R2, 'pvalue': pvalue}
        data_fin[pfname]['dates'] = dates_fin
        data_fin[pfname]['sds'] = chain_sds_fin
        data_fin[pfname]['survey'] = chain_sur_fin

        # make time-series plot
        plt.figure(fig1.number)
        fig1.add_subplot(gs1[i, 0])
        plt.plot(dates_sur, chain_sur, 'o-', color='C1', markersize=4, label='survey all')
        plt.plot(dates_fin, chain_sur_fin, 'o', color=[0.3, 0.3, 0.3], markersize=2, label='survey interp')
        plt.plot(dates_fin, chain_sds_fin, 'o--', color='b', markersize=4, label='SDS')
        plt.title(pfname, fontweight='bold')
        # plt.xlim([dates_sds[0], dates_sds[-1]])
        plt.ylabel('chainage [m]')

        # make scatter plot
        plt.figure(fig2.number)
        fig2.add_subplot(gs2[0, i])
        plt.axis('equal')
        plt.plot(chain_sur_fin, chain_sds_fin, 'ko', markersize=4, markerfacecolor='w', alpha=0.7)
        xmax = np.max([np.nanmax(chain_sds_fin), np.nanmax(chain_sur_fin)])
        xmin = np.min([np.nanmin(chain_sds_fin), np.nanmin(chain_sur_fin)])
        ymax = np.max([np.nanmax(chain_sds_fin), np.nanmax(chain_sur_fin)])
        ymin = np.min([np.nanmin(chain_sds_fin), np.nanmin(chain_sur_fin)])
        plt.plot([xmin, xmax], [ymin, ymax], 'k--')
        plt.plot([xmin, xmax], [xmin * slope + intercept, xmax * slope + intercept], 'b:')
        str_corr = ' y = %.2f x + %.2f\n R2 = %.2f' % (slope, intercept, R2)
        plt.text(xmin, ymax - 5, str_corr,
                 bbox=dict(facecolor=[0.7, 0.7, 0.7], alpha=0.5), horizontalalignment='left')
        plt.xlabel('chainage survey [m]')
        plt.ylabel('chainage satellite [m]')
        plt.title(pfname, fontweight='bold')

        # make error histogram
        fig2.add_subplot(gs2[1, i])
        binwidth = 3
        bins = np.arange(min(diff_chain), max(diff_chain) + binwidth, binwidth)
        density = plt.hist(diff_chain, bins=bins, density=True, color=[0.8, 0.8, 0.8], edgecolor='k')
        plt.xlim([-50, 50])
        plt.xlabel('error [m]')
        str_stats = ' rmse = %.1f\n mean = %.1f\n std = %.1f\n q90 = %.1f' % (rmse, mean, std, q90)
        plt.text(15, np.max(density[0]) - 0.015, str_stats,
                 bbox=dict(facecolor=[0.8, 0.8, 0.8], alpha=0.3), horizontalalignment='left', fontsize=10)

    fig1.set_size_inches(19.2, 9.28)
    fig1.set_tight_layout(True)
    fig2.set_size_inches(19.2, 9.28)
    fig2.set_tight_layout(True)
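    # Note: the section below pools the matched SDS/survey pairs from every
    # transect and recomputes the same error metrics (rmse, mean, std, q90)
    # and linear fit over the combined sample, stored under stats['all'].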
    # all transects together
    chain_sds_all = []
    chain_sur_all = []
    for i in range(chain_sds.shape[1]):
        pfname = list(topo_profiles.keys())[i]
        chain_sds_all = np.append(chain_sds_all, data_fin[pfname]['sds'])
        chain_sur_all = np.append(chain_sur_all, data_fin[pfname]['survey'])

    # calculate statistics
    slope, intercept, rvalue, pvalue, std_err = sstats.linregress(chain_sur_all, chain_sds_all)
    R2 = rvalue ** 2
    correlation = np.corrcoef(chain_sur_all, chain_sds_all)[0, 1]
    diff_chain_all = chain_sur_all - chain_sds_all
    rmse = np.sqrt(np.nanmean((diff_chain_all) ** 2))
    mean = np.nanmean(diff_chain_all)
    std = np.nanstd(diff_chain_all)
    q90 = np.percentile(np.abs(diff_chain_all), 90)

    stats['all'] = {'rmse': rmse, 'mean': mean, 'std': std, 'q90': q90, 'corr': correlation,
                    'linfit': {'slope': slope, 'intercept': intercept, 'R2': R2, 'pvalue': pvalue}}

    # make plot
    plt.figure(fig3.number)
    fig3.add_subplot(gs3[0, 0])
    plt.axis('equal')
    plt.plot(chain_sur_all, chain_sds_all, 'ko', markersize=4, markerfacecolor='w', alpha=0.7)
    xmax = np.max([np.nanmax(chain_sds_all), np.nanmax(chain_sur_all)])
    xmin = np.min([np.nanmin(chain_sds_all), np.nanmin(chain_sur_all)])
    ymax = np.max([np.nanmax(chain_sds_all), np.nanmax(chain_sur_all)])
    ymin = np.min([np.nanmin(chain_sds_all), np.nanmin(chain_sur_all)])
    plt.plot([xmin, xmax], [ymin, ymax], 'k--')
    plt.plot([xmin, xmax], [xmin * slope + intercept, xmax * slope + intercept], 'b:')
    str_corr = ' y = %.2f x + %.2f\n R2 = %.2f' % (slope, intercept, R2)
    plt.text(xmin, ymax - 5, str_corr,
             bbox=dict(facecolor=[0.7, 0.7, 0.7], alpha=0.5), horizontalalignment='left')
    plt.xlabel('chainage survey [m]')
    plt.ylabel('chainage satellite [m]')
    plt.title('all transects', fontweight='bold')

    fig3.add_subplot(gs3[1, 0])
    binwidth = 3
    bins = np.arange(min(diff_chain_all), max(diff_chain_all) + binwidth, binwidth)
    density = plt.hist(diff_chain_all, bins=bins, density=True, color=[0.8, 0.8, 0.8], edgecolor='k')
    plt.xlim([-50, 50])
    plt.xlabel('error [m]')
    str_stats = ' rmse = %.1f\n mean = %.1f\n std = %.1f\n q90 = %.1f' % (rmse, mean, std, q90)
    plt.text(15, np.max(density[0]) - 0.015, str_stats,
             bbox=dict(facecolor=[0.8, 0.8, 0.8], alpha=0.3), horizontalalignment='left', fontsize=10)
    fig3.set_size_inches(9.2, 9.28)
    fig3.set_tight_layout(True)

    return stats
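# Example usage (illustrative sketch; 'topo_profiles' is assumed to be a dict
# keyed by transect name, each entry holding matching 'dates' and 'chainage'
# lists from the groundtruth surveys):
#
#   stats = compare_sds(output_merged['dates'], chainage['median'],
#                       topo_profiles, mod=0, mindays=5)
#   print(stats['all']['rmse'], stats['all']['corr'])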