# CoastSat_WRL/functions/data_analysis.py

"""This module contains all the functions needed for data analysis """

# Initial settings
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from matplotlib import gridspec
import pdb
import ee

# other modules
from osgeo import gdal, ogr, osr
import scipy.interpolate as interpolate
import scipy.stats as sstats

# image processing modules
import skimage.filters as filters 
import skimage.exposure as exposure
import skimage.transform as transform
import sklearn.decomposition as decomposition
import skimage.measure as measure
import skimage.morphology as morphology

# machine learning modules
from sklearn.cluster import KMeans
from sklearn.neural_network import MLPClassifier
from sklearn.externals import joblib

import time

# import own modules
import functions.utils as utils

def get_tide(dates_sds, dates_tide, tide_level, max_diff=1800):
    """
    Match every satellite acquisition with the closest tide record.

    Arguments:
    -----------
        dates_sds: list
            datetime objects of the satellite-derived shorelines
        dates_tide: list
            datetime objects of the tide record; they are subtracted from
            dates_sds, so both must be either naive or timezone-aware
        tide_level: list or np.ndarray
            tide level for each entry of dates_tide
        max_diff: float
            largest accepted time difference, in seconds, between an
            acquisition and its closest tide record (default: half an hour)

    Returns:
    -----------
        tide: np.ndarray
            one value per entry of dates_sds; np.nan where no tide record
            lies within max_diff seconds (or when the tide record is empty)

    """
    # without any tide record nothing can be matched (np.min would raise)
    if len(dates_tide) == 0:
        return np.full(len(dates_sds), np.nan)

    tide = []
    for date in dates_sds:
        # absolute time difference to every tide record, in seconds
        gaps = np.abs(np.array([(date - t).total_seconds() for t in dates_tide]))
        idx_closest = np.argmin(gaps)
        if gaps[idx_closest] <= max_diff:
            tide.append(tide_level[idx_closest])
        else:
            tide.append(np.nan)

    return np.array(tide)

def remove_duplicates(output, satname):
    """
    Remove images acquired on the same day from the output structure.

    Within each group of same-day images a single image is kept. Two images
    are compared with the following rule: the first one is kept if its cloud
    cover is lower by more than 0.01 or, for 'L5' and 'L8' only, if its
    acc_georef value is lower by more than 0.1; otherwise the second one is
    kept. Groups with more than two images are reduced by applying the rule
    repeatedly, so that no duplicate is left behind.

    Arguments:
    -----------
        output: dict
            contains 'dates' and 'shorelines' (lists) and 'metadata' (dict of
            lists, must include 'cloud_cover' and, for L5/L8, 'acc_georef')
        satname: str
            name of the satellite mission, e.g. 'L5', 'L8'

    Returns:
    -----------
        output_nodup: dict
            new structure with the same keys and the duplicates removed, or
            the input structure itself when no duplicate is found

    """
    dates_str = [date.strftime('%Y%m%d') for date in output['dates']]
    # presumably maps each repeated date string to the list of its indices
    # -- verify against utils.duplicates_dict
    dupl = utils.duplicates_dict(dates_str)
    if not dupl:
        print(satname + ' : ' + 'no duplicates')
        return output

    metadata = output['metadata']
    cloud_cover = metadata['cloud_cover']
    # the georeferencing accuracy is only consulted for the Landsat missions
    acc_georef = metadata['acc_georef'] if satname in ('L8', 'L5') else None

    def keep_first(idx1, idx2):
        " True if image idx1 is preferred over image idx2 "
        if cloud_cover[idx1] < cloud_cover[idx2] - 0.01:
            return True
        if acc_georef is not None and acc_georef[idx1] < acc_georef[idx2] - 0.1:
            return True
        return False

    idx_remove = []
    for indices in dupl.values():
        kept = indices[0]
        for candidate in indices[1:]:
            if keep_first(kept, candidate):
                idx_remove.append(candidate)
            else:
                idx_remove.append(kept)
                kept = candidate

    idx_remove = sorted(idx_remove)
    removed = set(idx_remove)
    idx_keep = [k for k in range(len(dates_str)) if k not in removed]

    output_nodup = {
        'dates': [output['dates'][k] for k in idx_keep],
        'shorelines': [output['shorelines'][k] for k in idx_keep],
        'metadata': {key: [values[k] for k in idx_keep]
                     for key, values in metadata.items()},
    }
    print(satname + ' : ' + str(len(idx_remove)) + ' duplicates')
    return output_nodup
    
    
def merge(output):
    """
    Merge the outputs of the different satellites into a single structure
    sorted by date.

    Arguments:
    -----------
        output: dict
            one entry per satellite, each holding 'dates' and 'shorelines'
            (lists) and 'metadata' (dict of lists with the keys 'filenames',
            'satname', 'cloud_cover' and 'acc_georef')

    Returns:
    -----------
        output_all_sorted: dict
            same layout as a single-satellite entry, with the lists of all
            satellites concatenated and ordered by increasing date

    """
    meta_keys = ('filenames', 'satname', 'cloud_cover', 'acc_georef')

    # concatenate the lists of every satellite
    dates = []
    shorelines = []
    metadata = {key: [] for key in meta_keys}
    for sat_output in output.values():
        dates = dates + sat_output['dates']
        shorelines = shorelines + sat_output['shorelines']
        for key, values in sat_output['metadata'].items():
            metadata[key] = metadata[key] + values

    # ordering that sorts the dates; applied to every list alike
    order = sorted(range(len(dates)), key=lambda idx: dates[idx])

    return {
        'dates': [dates[idx] for idx in order],
        'shorelines': [shorelines[idx] for idx in order],
        'metadata': {key: [values[idx] for idx in order]
                     for key, values in metadata.items()},
    }

def create_transects(x0, y0, orientation, chainage_length):
    """
    Create straight transects starting at the given origins.

    Each transect is a line of points starting at (x0[k], y0[k]) and heading
    in the direction given by orientation[k]. The angle is converted with
    phi = 90 - orientation, so an orientation of 0 points along +y and an
    orientation of 90 points along +x.

    Arguments:
    -----------
        x0, y0: list or np.ndarray
            coordinates of the origin of each transect
        orientation: list or np.ndarray
            orientation of each transect in degrees
        chainage_length: int or float
            length of the transects; int(chainage_length) + 1 points are
            spread evenly between the origin and the end of the transect

    Returns:
    -----------
        transects: list of np.ndarray
            one (n_points, 2) array of x, y coordinates per transect

    """
    # np.linspace requires an integer number of samples
    n_points = int(chainage_length) + 1
    along = np.linspace(0, chainage_length, n_points)

    transects = []
    for k, x_origin in enumerate(x0):
        # orientation of cross-shore profile, counter-clockwise from +x
        phi = (90 - orientation[k]) * np.pi / 180
        # rotate the points (along, 0) by phi, then translate to the origin
        coords = np.column_stack((x_origin + along * np.cos(phi),
                                  y0[k] + along * np.sin(phi)))
        transects.append(coords)

    return transects

def calculate_chainage(sds, transects, orientation, along_dist):
    """
    Intersect the shorelines with the transects and compute statistics of the
    chainage (distance from the transect origin, along the transect).

    Arguments:
    -----------
        sds: list of np.ndarray
            shorelines, each an array with x in column 0 and y in column 1
        transects: list of np.ndarray
            transects; the first row is used as origin, the last row as end
        orientation: list or np.ndarray
            orientation of each transect in degrees (phi = 90 - orientation)
        along_dist: float
            shoreline points closer than this distance to the transect line
            are used for the statistics

    Returns:
    -----------
        chainage: dict
            keys 'mean', 'median', 'max', 'min', 'npoints' and 'std', each a
            (len(sds), len(transects)) array. Entries are np.nan where no
            shoreline point lies close to the transect.

    """
    chainage_mtx = np.zeros((len(sds), len(transects), 6))

    for i, sl in enumerate(sds):

        for j, transect in enumerate(transects):

            # compute rotation matrix
            X0 = transect[0, 0]
            Y0 = transect[0, 1]
            phi = (90 - orientation[j]) * np.pi / 180
            Mrot = np.array([[np.cos(phi), np.sin(phi)], [-np.sin(phi), np.cos(phi)]])

            # calculate point to line distance between shoreline points and profile
            p1 = np.array([X0, Y0])
            axis = transect[-1, :] - p1
            offsets = sl - p1
            # 2D cross product written out, as np.cross is deprecated for
            # 2-element vectors in recent NumPy versions
            d = np.abs((axis[0] * offsets[:, 1] - axis[1] * offsets[:, 0]) / np.linalg.norm(axis))
            idx_close = utils.find_indices(d, lambda e: e <= along_dist)

            # check if there are SDS points around the profile or not
            if not idx_close:
                chainage_mtx[i, j, :] = np.nan
                continue

            # change of base to shore-normal coordinate system
            xy_close = np.array([sl[idx_close, 0], sl[idx_close, 1]]) - p1.reshape(2, 1)
            cross_shore = np.matmul(Mrot, xy_close)[0, :]

            # put nan values if the chainage is negative (MAKE SURE TO PICK ORIGIN CORRECTLY)
            cross_shore[cross_shore < 0] = np.nan

            # number of nearby points, including those discarded as negative
            n_points = len(cross_shore)

            # nothing left to average: store nan statistics without calling
            # the nan-functions on an all-nan array (avoids RuntimeWarnings)
            if np.all(np.isnan(cross_shore)):
                chainage_mtx[i, j, :] = np.array([np.nan, np.nan, np.nan,
                                                  np.nan, n_points, np.nan])
                continue

            # compute mean, median max and std of chainage position
            mean_cross = np.nanmean(cross_shore)
            median_cross = np.nanmedian(cross_shore)
            max_cross = np.nanmax(cross_shore)
            min_cross = np.nanmin(cross_shore)
            std_cross = np.nanstd(cross_shore)

            if std_cross > 10:  # if large std, take the most seaward point
                mean_cross = max_cross
                median_cross = max_cross
                min_cross = max_cross

            # store the statistics
            chainage_mtx[i, j, :] = np.array([mean_cross, median_cross, max_cross,
                                              min_cross, n_points, std_cross])

    # format into dictionary
    keys = ('mean', 'median', 'max', 'min', 'npoints', 'std')
    chainage = {key: chainage_mtx[:, :, k] for k, key in enumerate(keys)}

    return chainage

def _compare_sds_interp(satdate, dates_sur_num, chain_sur):
    " survey chainage linearly interpolated at satdate, and gap in days to the survey used "
    temp_diff = satdate - dates_sur_num

    # a survey on the very same day is used directly
    same_day = np.flatnonzero(temp_diff == 0)
    if same_day.size > 0:
        return chain_sur[same_day[0]], 0

    # assumes the survey dates are in ascending order -- TODO confirm
    earlier = np.flatnonzero(temp_diff > 0)
    later = np.flatnonzero(temp_diff < 0)

    # the image is not bracketed by two surveys: nothing to interpolate
    if earlier.size == 0 or later.size == 0:
        if earlier.size > 0:
            gap = temp_diff[earlier[-1]]
        elif later.size > 0:
            gap = -temp_diff[later[0]]
        else:
            gap = np.nan
        return np.nan, gap

    idx_before = earlier[-1]
    idx_after = later[0]
    tempx = np.array([dates_sur_num[idx_before], dates_sur_num[idx_after]], dtype=float)
    tempy = np.array([chain_sur[idx_before], chain_sur[idx_after]], dtype=float)
    # number of days elapsed since the preceding survey
    gap = satdate - tempx[0]
    return np.interp(satdate, tempx, tempy), gap


def _compare_sds_stats(chain_sur, chain_sds):
    " error statistics and linear fit between survey and satellite chainages "
    slope, intercept, rvalue, pvalue, std_err = sstats.linregress(chain_sur, chain_sds)
    correlation = np.corrcoef(chain_sur, chain_sds)[0, 1]
    diff_chain = chain_sur - chain_sds
    return {'rmse': np.sqrt(np.nanmean((diff_chain)**2)),
            'mean': np.nanmean(diff_chain),
            'std': np.nanstd(diff_chain),
            'q90': np.percentile(np.abs(diff_chain), 90),
            'corr': correlation,
            'linfit': {'slope': slope, 'intercept': intercept,
                       'R2': rvalue**2, 'pvalue': pvalue}}


def _compare_sds_plot(fig, spec_scatter, spec_hist, chain_sur, chain_sds, summary, title):
    " scatter plot (with 1:1 line and linear fit) and error histogram on fig "
    slope = summary['linfit']['slope']
    intercept = summary['linfit']['intercept']
    R2 = summary['linfit']['R2']

    # make scatter plot
    plt.figure(fig.number)
    fig.add_subplot(spec_scatter)
    plt.axis('equal')
    plt.plot(chain_sur, chain_sds, 'ko', markersize=4, markerfacecolor='w', alpha=0.7)
    vmax = np.max([np.nanmax(chain_sds), np.nanmax(chain_sur)])
    vmin = np.min([np.nanmin(chain_sds), np.nanmin(chain_sur)])
    plt.plot([vmin, vmax], [vmin, vmax], 'k--')
    plt.plot([vmin, vmax], [vmin*slope + intercept, vmax*slope + intercept], 'b:')
    str_corr = ' y = %.2f x + %.2f\n R2 = %.2f' % (slope, intercept, R2)
    plt.text(vmin, vmax-5, str_corr, bbox=dict(facecolor=[0.7,0.7,0.7], alpha=0.5), horizontalalignment='left')
    plt.xlabel('chainage survey [m]')
    plt.ylabel('chainage satellite [m]')
    plt.title(title, fontweight='bold')

    # make histogram of the errors
    fig.add_subplot(spec_hist)
    diff_chain = chain_sur - chain_sds
    binwidth = 3
    bins = np.arange(min(diff_chain), max(diff_chain) + binwidth, binwidth)
    density = plt.hist(diff_chain, bins=bins, density=True, color=[0.8, 0.8, 0.8], edgecolor='k')
    plt.xlim([-50, 50])
    plt.xlabel('error [m]')
    str_stats = ' rmse = %.1f\n mean = %.1f\n std = %.1f\n q90 = %.1f' % (
            summary['rmse'], summary['mean'], summary['std'], summary['q90'])
    plt.text(15, np.max(density[0])-0.015, str_stats, bbox=dict(facecolor=[0.8,0.8,0.8], alpha=0.3), horizontalalignment='left', fontsize=10)


def compare_sds(dates_sds, chain_sds, topo_profiles, mod=0, mindays=5):
    """
    Compare sds with groundtruth data from topographic surveys / argus shorelines

    Three matplotlib figures are created: the time-series of each transect,
    a scatter plot and error histogram for each transect, and a scatter plot
    and error histogram for all the transects together.

    KV WRL 2018

    Arguments:
    -----------
        dates_sds: list
            list of dates corresponding to each row in chain_sds
        chain_sds: np.ndarray
            array with time series of chainage for each transect (each transect is one column)
        topo_profiles: dict
            dict containing the dates and chainage of the groundtruth; the
            i-th key is compared with the i-th column of chain_sds
        mod: 0 or 1
            0 for linear interpolation between 2 closest surveys, 1 for only nearest neighbour
        mindays: int
            only used when mod=1: the nearest survey is discarded if it is
            more than mindays days away from the satellite image

    Returns:
    -----------
        stats: dict
            contains all the statistics of the comparison, one entry per
            transect plus the entry 'all' for the transects combined

    Raises:
    -----------
        ValueError
            if mod is neither 0 nor 1

    """
    if mod not in (0, 1):
        raise ValueError('mod must be 0 (interpolation) or 1 (nearest neighbour)')

    n_transects = chain_sds.shape[1]
    profile_names = list(topo_profiles.keys())

    # create 3 figures
    fig1 = plt.figure()
    gs1 = gridspec.GridSpec(n_transects, 1)
    fig2 = plt.figure()
    gs2 = gridspec.GridSpec(2, n_transects)
    fig3 = plt.figure()
    gs3 = gridspec.GridSpec(2, 1)

    dates_sds_num = np.array([_.toordinal() for _ in dates_sds])
    stats = dict([])
    data_fin = dict([])

    # for each transect compare and plot the data
    for i in range(n_transects):

        pfname = profile_names[i]
        dates_sur = topo_profiles[pfname]['dates']
        chain_sur = topo_profiles[pfname]['chainage']

        # convert to datenum
        dates_sur_num = np.array([_.toordinal() for _ in dates_sur])

        chain_sur_interp = []
        diff_days = []

        for satdate in dates_sds_num:

            if mod == 0:
                # interpolate between the surveys before and after the image
                value, gap = _compare_sds_interp(satdate, dates_sur_num, chain_sur)
            else:
                # select the closest measurement
                idx_closest = np.argmin(np.abs(satdate - dates_sur_num))
                gap = np.abs(satdate - dates_sur_num[idx_closest])
                value = np.nan if gap > mindays else chain_sur[idx_closest]

            chain_sur_interp.append(value)
            diff_days.append(gap)

        chain_sur_interp = np.array(chain_sur_interp, dtype=float)

        # keep the dates where both survey and satellite values are available
        idx_valid = np.logical_and(~np.isnan(chain_sur_interp),
                                   ~np.isnan(chain_sds[:, i]))

        # groundtruth and sds
        chain_sur_fin = chain_sur_interp[idx_valid]
        chain_sds_fin = chain_sds[idx_valid, i]
        dates_fin = [k for (k, v) in zip(dates_sds, idx_valid) if v]

        # calculate statistics
        summary = _compare_sds_stats(chain_sur_fin, chain_sds_fin)

        # store data
        stats[pfname] = {key: summary[key] for key in ('rmse', 'mean', 'std', 'q90')}
        stats[pfname]['diffdays'] = diff_days
        stats[pfname]['corr'] = summary['corr']
        stats[pfname]['linfit'] = summary['linfit']

        data_fin[pfname] = {'dates': dates_fin,
                            'sds': chain_sds_fin,
                            'survey': chain_sur_fin}

        # make time-series plot
        plt.figure(fig1.number)
        fig1.add_subplot(gs1[i, 0])
        plt.plot(dates_sur, chain_sur, 'o-', color='C1', markersize=4, label='survey all')
        plt.plot(dates_fin, chain_sur_fin, 'o', color=[0.3, 0.3, 0.3], markersize=2, label='survey interp')
        plt.plot(dates_fin, chain_sds_fin, 'o--', color='b', markersize=4, label='SDS')
        plt.title(pfname, fontweight='bold')
        plt.ylabel('chainage [m]')

        # make scatter plot and histogram
        _compare_sds_plot(fig2, gs2[0, i], gs2[1, i],
                          chain_sur_fin, chain_sds_fin, summary, pfname)

    fig1.set_size_inches(19.2, 9.28)
    fig1.set_tight_layout(True)
    fig2.set_size_inches(19.2, 9.28)
    fig2.set_tight_layout(True)

    # all transects together
    chain_sds_all = np.concatenate([data_fin[profile_names[i]]['sds']
                                    for i in range(n_transects)])
    chain_sur_all = np.concatenate([data_fin[profile_names[i]]['survey']
                                    for i in range(n_transects)])

    # calculate statistics
    stats['all'] = _compare_sds_stats(chain_sur_all, chain_sds_all)

    # make plot (titled for the combined data, not for the last transect)
    _compare_sds_plot(fig3, gs3[0, 0], gs3[1, 0],
                      chain_sur_all, chain_sds_all, stats['all'], 'all transects')
    fig3.set_size_inches(9.2, 9.28)
    fig3.set_tight_layout(True)

    return stats
updated master added download_images and read_images scripts 7 years ago			`"""This module contains all the functions needed for data analysis """`

			`# Initial settings`
			`import numpy as np`
			`import matplotlib.pyplot as plt`
			`import matplotlib.patches as mpatches`
			`from matplotlib import gridspec`
			`import pdb`
			`import ee`

			`# other modules`
			`from osgeo import gdal, ogr, osr`
			`import scipy.interpolate as interpolate`
			`import scipy.stats as sstats`

			`# image processing modules`
			`import skimage.filters as filters`
			`import skimage.exposure as exposure`
			`import skimage.transform as transform`
			`import sklearn.decomposition as decomposition`
			`import skimage.measure as measure`
			`import skimage.morphology as morphology`

			`# machine learning modules`
			`from sklearn.cluster import KMeans`
			`from sklearn.neural_network import MLPClassifier`
			`from sklearn.externals import joblib`

			`import time`

			`# import own modules`
			`import functions.utils as utils`

			`def get_tide(dates_sds, dates_tide, tide_level):`

			`tide = []`
			`for i in range(len(dates_sds)):`
			`dates_diff = np.abs(np.array([ (dates_sds[i] - _).total_seconds() for _ in dates_tide]))`
			`if np.min(dates_diff) <= 1800: # half-an-hour`
			`idx_closest = np.argmin(dates_diff)`
			`tide.append(tide_level[idx_closest])`
			`else:`
			`tide.append(np.nan)`
			`tide = np.array(tide)`

			`return tide`

			`def remove_duplicates(output, satname):`
			`" removes duplicates from output structure, keep the one with less cloud cover or best georeferencing "`
			`dates = output['dates']`
			`dates_str = [_.strftime('%Y%m%d') for _ in dates]`
			`dupl = utils.duplicates_dict(dates_str)`
			`if dupl:`
			`output_nodup = dict([])`
			`idx_remove = []`
			`if satname == 'L8' or satname == 'L5':`
			`for k,v in dupl.items():`

			`idx1 = v[0]`
			`idx2 = v[1]`

			`c1 = output['metadata']['cloud_cover'][idx1]`
			`c2 = output['metadata']['cloud_cover'][idx2]`
			`g1 = output['metadata']['acc_georef'][idx1]`
			`g2 = output['metadata']['acc_georef'][idx2]`

			`if c1 < c2 - 0.01:`
			`idx_remove.append(idx2)`
			`elif g1 < g2 - 0.1:`
			`idx_remove.append(idx2)`
			`else:`
			`idx_remove.append(idx1)`

			`else:`
			`for k,v in dupl.items():`

			`idx1 = v[0]`
			`idx2 = v[1]`

			`c1 = output['metadata']['cloud_cover'][idx1]`
			`c2 = output['metadata']['cloud_cover'][idx2]`

			`if c1 < c2 - 0.01:`
			`idx_remove.append(idx2)`
			`else:`
			`idx_remove.append(idx1)`

			`idx_remove = sorted(idx_remove)`
			`idx_all = np.linspace(0, len(dates_str)-1, len(dates_str))`
			`idx_keep = list(np.where(~np.isin(idx_all,idx_remove))[0])`

			`output_nodup['dates'] = [output['dates'][k] for k in idx_keep]`
			`output_nodup['shorelines'] = [output['shorelines'][k] for k in idx_keep]`
			`output_nodup['metadata'] = dict([])`
			`for key in list(output['metadata'].keys()):`
			`output_nodup['metadata'][key] = [output['metadata'][key][k] for k in idx_keep]`
			`print(satname + ' : ' + str(len(idx_remove)) + ' duplicates')`
			`return output_nodup`

			`else:`
			`print(satname + ' : ' + 'no duplicates')`
			`return output`


			`def merge(output):`
			`" merges data from the different satellites "`

			`# stack all list together under one key`
			`output_all = {'dates':[], 'shorelines':[],`
			`'metadata':{'filenames':[], 'satname':[], 'cloud_cover':[], 'acc_georef':[]}}`
			`for satname in list(output.keys()):`
			`output_all['dates'] = output_all['dates'] + output[satname]['dates']`
			`output_all['shorelines'] = output_all['shorelines'] + output[satname]['shorelines']`
			`for key in list(output[satname]['metadata'].keys()):`
			`output_all['metadata'][key] = output_all['metadata'][key] + output[satname]['metadata'][key]`

			`output_all_sorted = {'dates':[], 'shorelines':[],`
			`'metadata':{'filenames':[], 'satname':[], 'cloud_cover':[], 'acc_georef':[]}}`
			`# sort the dates`
			`idx_sorted = sorted(range(len(output_all['dates'])), key=output_all['dates'].__getitem__)`
			`output_all_sorted['dates'] = [output_all['dates'][i] for i in idx_sorted]`
			`output_all_sorted['shorelines'] = [output_all['shorelines'][i] for i in idx_sorted]`
			`for key in list(output_all['metadata'].keys()):`
			`output_all_sorted['metadata'][key] = [output_all['metadata'][key][i] for i in idx_sorted]`

			`return output_all_sorted`

			`def create_transects(x0, y0, orientation, chainage_length):`
			`" creates shore-normal transects "`

			`transects = []`

			`for k in range(len(x0)):`

			`# orientation of cross-shore profile`
			`phi = (90 - orientation[k])*np.pi/180`

			`# create a vector using the chainage length`
			`x = np.linspace(0,chainage_length,chainage_length+1)`
			`y = np.zeros(len(x))`
			`coords = np.zeros((len(x),2))`
			`coords[:,0] = x`
			`coords[:,1] = y`

			`# translate and rotate the vector using the origin and orientation`
			`tf = transform.EuclideanTransform(rotation=phi, translation=(x0[k],y0[k]))`
			`coords_tf = tf(coords)`

			`transects.append(coords_tf)`

			`return transects`

			`def calculate_chainage(sds, transects, orientation, along_dist):`
			`" intersect SDS with transect and compute chainage position "`

			`chainage_mtx = np.zeros((len(sds),len(transects),6))`

			`for i in range(len(sds)):`

			`sl = sds[i]`

			`for j in range(len(transects)):`

			`# compute rotation matrix`
			`X0 = transects[j][0,0]`
			`Y0 = transects[j][0,1]`
			`phi = (90 - orientation[j])*np.pi/180`
			`Mrot = np.array([[np.cos(phi), np.sin(phi)],[-np.sin(phi), np.cos(phi)]])`

			`# calculate point to line distance between shoreline points and profile`
			`p1 = np.array([X0,Y0])`
			`p2 = transects[j][-1,:]`
			`p3 = sl`
			`d = np.abs(np.cross(p2-p1,p3-p1)/np.linalg.norm(p2-p1))`
			`idx_close = utils.find_indices(d, lambda e: e <= along_dist)`

			`# check if there are SDS points around the profile or not`
			`if not idx_close:`
			`chainage_mtx[i,j,:] = np.tile(np.nan,(1,6))`

			`else:`
			`# change of base to shore-normal coordinate system`
			`xy_close = np.array([sl[idx_close,0],sl[idx_close,1]]) - np.tile(np.array([[X0],[Y0]]), (1,len(sl[idx_close])))`
			`xy_rot = np.matmul(Mrot, xy_close)`

			`# put nan values if the chainage is negative (MAKE SURE TO PICK ORIGIN CORRECTLY)`
			`if np.any(xy_rot[0,:] < 0):`
			`xy_rot[0,np.where(xy_rot[0,:] < 0)] = np.nan`

			`# compute mean, median max and std of chainage position`
			`n_points = len(xy_rot[0,:])`
			`mean_cross = np.nanmean(xy_rot[0,:])`
			`median_cross = np.nanmedian(xy_rot[0,:])`
			`max_cross = np.nanmax(xy_rot[0,:])`
			`min_cross = np.nanmin(xy_rot[0,:])`
			`std_cross = np.nanstd(xy_rot[0,:])`

			`if std_cross > 10: # if large std, take the most seaward point`
			`mean_cross = max_cross`
			`median_cross = max_cross`
			`min_cross = max_cross`

			`# store the statistics`
			`chainage_mtx[i,j,:] = np.array([mean_cross, median_cross, max_cross,`
			`min_cross, n_points, std_cross])`

			`# format into dictionnary`
			`chainage = dict([])`
			`chainage['mean'] = chainage_mtx[:,:,0]`
			`chainage['median'] = chainage_mtx[:,:,1]`
			`chainage['max'] = chainage_mtx[:,:,2]`
			`chainage['min'] = chainage_mtx[:,:,3]`
			`chainage['npoints'] = chainage_mtx[:,:,4]`
			`chainage['std'] = chainage_mtx[:,:,5]`

			`return chainage`

			`def compare_sds(dates_sds, chain_sds, topo_profiles, mod=0, mindays=5):`
			`"""`
			`Compare sds with groundtruth data from topographic surveys / argus shorelines`

			`KV WRL 2018`

			`Arguments:`
			`-----------`
			`dates_sds: list`
			`list of dates corresponding to each row in chain_sds`
			`chain_sds: np.ndarray`
			`array with time series of chainage for each transect (each transect is one column)`
			`topo_profiles: dict`
			`dict containing the dates and chainage of the groundtruth`
			`mod: 0 or 1`
			`0 for linear interpolation between 2 closest surveys, 1 for only nearest neighbour`
			`min_days: int`
			`minimum number of days for which the data can be compared`

			`Returns: -----------`
			`stats: dict`
			`contains all the statistics of the comparison`

			`"""`

			`# create 3 figures`
			`fig1 = plt.figure()`
			`gs1 = gridspec.GridSpec(chain_sds.shape[1], 1)`
			`fig2 = plt.figure()`
			`gs2 = gridspec.GridSpec(2, chain_sds.shape[1])`
			`fig3 = plt.figure()`
			`gs3 = gridspec.GridSpec(2,1)`

			`dates_sds_num = np.array([_.toordinal() for _ in dates_sds])`
			`stats = dict([])`
			`data_fin = dict([])`

			`# for each transect compare and plot the data`
			`for i in range(chain_sds.shape[1]):`

			`pfname = list(topo_profiles.keys())[i]`
			`stats[pfname] = dict([])`
			`data_fin[pfname] = dict([])`

			`dates_sur = topo_profiles[pfname]['dates']`
			`chain_sur = topo_profiles[pfname]['chainage']`

			`# convert to datenum`
			`dates_sur_num = np.array([_.toordinal() for _ in dates_sur])`

			`chain_sur_interp = []`
			`diff_days = []`

			`for j, satdate in enumerate(dates_sds_num):`

			`temp_diff = satdate - dates_sur_num`

			`if mod==0:`
			`# select measurement before and after sat image date and interpolate`

			`ind_before = np.where(temp_diff == temp_diff[temp_diff > 0][-1])[0]`
			`if ind_before == len(temp_diff)-1:`
			`chain_sur_interp.append(np.nan)`
			`diff_days.append(np.abs(satdate-dates_sur_num[ind_before])[0])`
			`continue`
			`ind_after = np.where(temp_diff == temp_diff[temp_diff < 0][0])[0]`
			`tempx = np.zeros(2)`
			`tempx[0] = dates_sur_num[ind_before]`
			`tempx[1] = dates_sur_num[ind_after]`
			`tempy = np.zeros(2)`
			`tempy[0] = chain_sur[ind_before]`
			`tempy[1] = chain_sur[ind_after]`
			`diff_days.append(np.abs(np.max([satdate-tempx[0], satdate-tempx[1]])))`
			`# interpolate`
			`f = interpolate.interp1d(tempx, tempy)`
			`chain_sur_interp.append(f(satdate))`

			`elif mod==1:`
			`# select the closest measurement`

			`idx_closest = utils.find_indices(np.abs(temp_diff), lambda e: e == np.min(np.abs(temp_diff)))[0]`
			`diff_days.append(np.abs(satdate-dates_sur_num[idx_closest]))`
			`if diff_days[j] > mindays:`
			`chain_sur_interp.append(np.nan)`
			`else:`
			`chain_sur_interp.append(chain_sur[idx_closest])`

			`chain_sur_interp = np.array(chain_sur_interp)`

			`# remove nan values`
			`idx_sur_nan = ~np.isnan(chain_sur_interp)`
			`idx_sat_nan = ~np.isnan(chain_sds[:,i])`
			`idx_nan = np.logical_and(idx_sur_nan, idx_sat_nan)`

			`# groundtruth and sds`
			`chain_sur_fin = chain_sur_interp[idx_nan]`
			`chain_sds_fin = chain_sds[idx_nan,i]`
			`dates_fin = [k for (k, v) in zip(dates_sds, idx_nan) if v]`

			`# calculate statistics`
			`slope, intercept, rvalue, pvalue, std_err = sstats.linregress(chain_sur_fin, chain_sds_fin)`
			`R2 = rvalue**2`
			`correlation = np.corrcoef(chain_sur_fin, chain_sds_fin)[0,1]`
			`diff_chain = chain_sur_fin - chain_sds_fin`

			`rmse = np.sqrt(np.nanmean((diff_chain)**2))`
			`mean = np.nanmean(diff_chain)`
			`std = np.nanstd(diff_chain)`
			`q90 = np.percentile(np.abs(diff_chain), 90)`

			`# store data`
			`stats[pfname]['rmse'] = rmse`
			`stats[pfname]['mean'] = mean`
			`stats[pfname]['std'] = std`
			`stats[pfname]['q90'] = q90`
			`stats[pfname]['diffdays'] = diff_days`
			`stats[pfname]['corr'] = correlation`
			`stats[pfname]['linfit'] = {'slope':slope, 'intercept':intercept, 'R2':R2, 'pvalue':pvalue}`

			`data_fin[pfname]['dates'] = dates_fin`
			`data_fin[pfname]['sds'] = chain_sds_fin`
			`data_fin[pfname]['survey'] = chain_sur_fin`

			`# make time-series plot`
			`plt.figure(fig1.number)`
			`fig1.add_subplot(gs1[i,0])`
			`plt.plot(dates_sur, chain_sur, 'o-', color='C1', markersize=4, label='survey all')`
			`plt.plot(dates_fin, chain_sur_fin, 'o', color=[0.3, 0.3, 0.3], markersize=2, label='survey interp')`
			`plt.plot(dates_fin, chain_sds_fin, 'o--', color='b', markersize=4, label='SDS')`
			`plt.title(pfname, fontweight='bold')`
			`# plt.xlim([dates_sds[0], dates_sds[-1]])`
			`plt.ylabel('chainage [m]')`

			`# make scatter plot`
			`plt.figure(fig2.number)`
			`fig2.add_subplot(gs2[0,i])`
			`plt.axis('equal')`
			`plt.plot(chain_sur_fin, chain_sds_fin, 'ko', markersize=4, markerfacecolor='w', alpha=0.7)`
			`xmax = np.max([np.nanmax(chain_sds_fin),np.nanmax(chain_sur_fin)])`
			`xmin = np.min([np.nanmin(chain_sds_fin),np.nanmin(chain_sur_fin)])`
			`ymax = np.max([np.nanmax(chain_sds_fin),np.nanmax(chain_sur_fin)])`
			`ymin = np.min([np.nanmin(chain_sds_fin),np.nanmin(chain_sur_fin)])`
			`plt.plot([xmin, xmax], [ymin, ymax], 'k--')`
			`plt.plot([xmin, xmax], [xmin*slope + intercept, xmax*slope + intercept], 'b:')`
			`str_corr = ' y = %.2f x + %.2f\n R2 = %.2f' % (slope, intercept, R2)`
			`plt.text(xmin, ymax-5, str_corr, bbox=dict(facecolor=[0.7,0.7,0.7], alpha=0.5), horizontalalignment='left')`
			`plt.xlabel('chainage survey [m]')`
			`plt.ylabel('chainage satellite [m]')`
			`plt.title(pfname, fontweight='bold')`

			`fig2.add_subplot(gs2[1,i])`
			`binwidth = 3`
			`bins = np.arange(min(diff_chain), max(diff_chain) + binwidth, binwidth)`
			`density = plt.hist(diff_chain, bins=bins, density=True, color=[0.8, 0.8, 0.8], edgecolor='k')`
			`plt.xlim([-50, 50])`
			`plt.xlabel('error [m]')`
			`str_stats = ' rmse = %.1f\n mean = %.1f\n std = %.1f\n q90 = %.1f' % (rmse, mean, std, q90)`
			`plt.text(15, np.max(density[0])-0.015, str_stats, bbox=dict(facecolor=[0.8,0.8,0.8], alpha=0.3), horizontalalignment='left', fontsize=10)`

			`fig1.set_size_inches(19.2, 9.28)`
			`fig1.set_tight_layout(True)`
			`fig2.set_size_inches(19.2, 9.28)`
			`fig2.set_tight_layout(True)`

			`# all transects together`
			`chain_sds_all = []`
			`chain_sur_all = []`
			`for i in range(chain_sds.shape[1]):`
			`pfname = list(topo_profiles.keys())[i]`
			`chain_sds_all = np.append(chain_sds_all,data_fin[pfname]['sds'])`
			`chain_sur_all = np.append(chain_sur_all,data_fin[pfname]['survey'])`

			`# calculate statistics`
			`slope, intercept, rvalue, pvalue, std_err = sstats.linregress(chain_sur_all, chain_sds_all)`
			`R2 = rvalue**2`
			`correlation = np.corrcoef(chain_sur_all, chain_sds_all)[0,1]`
			`diff_chain_all = chain_sur_all - chain_sds_all`

			`rmse = np.sqrt(np.nanmean((diff_chain_all)**2))`
			`mean = np.nanmean(diff_chain_all)`
			`std = np.nanstd(diff_chain_all)`
			`q90 = np.percentile(np.abs(diff_chain_all), 90)`

			`stats['all'] = {'rmse':rmse,'mean':mean,'std':std,'q90':q90, 'corr':correlation,`
			`'linfit':{'slope':slope, 'intercept':intercept, 'R2':R2, 'pvalue':pvalue}}`

			`# make plot`
			`plt.figure(fig3.number)`
			`fig3.add_subplot(gs3[0,0])`
			`plt.axis('equal')`
			`plt.plot(chain_sur_all, chain_sds_all, 'ko', markersize=4, markerfacecolor='w', alpha=0.7)`
			`xmax = np.max([np.nanmax(chain_sds_all),np.nanmax(chain_sur_all)])`
			`xmin = np.min([np.nanmin(chain_sds_all),np.nanmin(chain_sur_all)])`
			`ymax = np.max([np.nanmax(chain_sds_all),np.nanmax(chain_sur_all)])`
			`ymin = np.min([np.nanmin(chain_sds_all),np.nanmin(chain_sur_all)])`
			`plt.plot([xmin, xmax], [ymin, ymax], 'k--')`
			`plt.plot([xmin, xmax], [xmin*slope + intercept, xmax*slope + intercept], 'b:')`
			`str_corr = ' y = %.2f x + %.2f\n R2 = %.2f' % (slope, intercept, R2)`
			`plt.text(xmin, ymax-5, str_corr, bbox=dict(facecolor=[0.7,0.7,0.7], alpha=0.5), horizontalalignment='left')`
			`plt.xlabel('chainage survey [m]')`
			`plt.ylabel('chainage satellite [m]')`
			`plt.title(pfname, fontweight='bold')`

			`fig3.add_subplot(gs3[1,0])`
			`binwidth = 3`
			`bins = np.arange(min(diff_chain_all), max(diff_chain_all) + binwidth, binwidth)`
			`density = plt.hist(diff_chain_all, bins=bins, density=True, color=[0.8, 0.8, 0.8], edgecolor='k')`
			`plt.xlim([-50, 50])`
			`plt.xlabel('error [m]')`
			`str_stats = ' rmse = %.1f\n mean = %.1f\n std = %.1f\n q90 = %.1f' % (rmse, mean, std, q90)`
			`plt.text(15, np.max(density[0])-0.015, str_stats, bbox=dict(facecolor=[0.8,0.8,0.8], alpha=0.3), horizontalalignment='left', fontsize=10)`
			`fig3.set_size_inches(9.2, 9.28)`
			`fig3.set_tight_layout(True)`

			`return stats`