#==========================================================#
#==========================================================#
# Download L5, L7, L8, S2 images of a given area
#==========================================================#
#==========================================================#


#==========================================================#
# Initial settings
#==========================================================#

import os
import numpy as np
import matplotlib.pyplot as plt
import pdb
import ee

# other modules
from osgeo import gdal, ogr, osr
from urllib.request import urlretrieve
import zipfile
from datetime import datetime
import pytz
import pickle

# import own modules
import functions.utils as utils
import functions.sds as sds

np.seterr(all='ignore') # raise/ignore divisions by 0 and nans
ee.Initialize()

#==========================================================#
# Location
#==========================================================#

## location (Narrabeen-Collaroy beach)
#polygon = [[[151.301454, -33.700754],
#            [151.311453, -33.702075],
#            [151.307237, -33.739761],
#            [151.294220, -33.736329],
#            [151.301454, -33.700754]]]

# location (Tairua beach)
sitename = 'TAIRUA'
polygon = [[[175.835574, -36.982022],
            [175.888220, -36.980680],
            [175.893527, -37.029610],
            [175.833444, -37.031767],
            [175.835574, -36.982022]]]

# initialise metadata dictionary (stores timestamps and georeferencing accuracy of each image)
metadata = dict([])

# create directories
try:
    os.makedirs(os.path.join(os.getcwd(), 'data', sitename))
except:
    print('directory already exists')

#%%
#==========================================================#
#==========================================================#
# L5
#==========================================================#
#==========================================================#

# define filenames for images
suffix = '.tif'
filepath = os.path.join(os.getcwd(), 'data', sitename, 'L5', '30m')
try:
    os.makedirs(filepath)
except:
    print('directory already exists')

#==========================================================#
# Select L5 collection
#==========================================================#

satname = 'L5'
input_col = ee.ImageCollection('LANDSAT/LT05/C01/T1_TOA')

# filter by location
flt_col = input_col.filterBounds(ee.Geometry.Polygon(polygon))
n_img = flt_col.size().getInfo()
print('Number of images covering ' + sitename, n_img)
im_all = flt_col.getInfo().get('features')

#==========================================================#
# Main loop through images
#==========================================================#

timestamps = []
acc_georef = []
all_names = []
for i in range(n_img):

    # find each image in ee database
    im = ee.Image(im_all[i].get('id'))

    im_dic = im.getInfo()
    im_bands = im_dic.get('bands')
    t = im_dic['properties']['system:time_start']
    im_timestamp = datetime.fromtimestamp(t/1000, tz=pytz.utc)
    timestamps.append(im_timestamp)
    im_date = im_timestamp.strftime('%Y-%m-%d-%H-%M-%S')
    im_epsg = int(im_dic['bands'][0]['crs'][5:])
    try:
        acc_georef.append(im_dic['properties']['GEOMETRIC_RMSE_MODEL'])
    except:
        acc_georef.append(12)
        print('No geometric rmse model property')

    # delete dimensions key from dictionary, otherwise the entire image is extracted
    for j in range(len(im_bands)): del im_bands[j]['dimensions']

    # bands for L5 (multispectral bands B1-B5 plus the BQA quality band)
    ms_bands = [im_bands[0], im_bands[1], im_bands[2], im_bands[3], im_bands[4], im_bands[7]]

    # filenames
    filename = im_date + '_' + satname + '_' + sitename + suffix

    print(i)
    if any(filename in _ for _ in all_names):
        filename = im_date + '_' + satname + '_' + sitename + '_dup' + suffix
    all_names.append(filename)

    local_data = sds.download_tif(im, polygon, ms_bands, filepath)
    os.rename(local_data, os.path.join(filepath, filename))

# sort timestamps and georef accuracy (downloaded images are sorted by date in directory)
timestamps_sorted = sorted(timestamps)
idx_sorted = sorted(range(len(timestamps)), key=timestamps.__getitem__)
acc_georef_sorted = [acc_georef[j] for j in idx_sorted]

metadata[satname] = {'dates':timestamps_sorted, 'acc_georef':acc_georef_sorted, 'epsg':im_epsg}
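# The index-sorting idiom above is reused after each collection: sorting
# range(len(x)) with key=x.__getitem__ is an argsort of x, which is then used to
# reorder the companion list. Illustrative sketch with made-up values:
#   vals = [3, 1, 2]
#   idx = sorted(range(len(vals)), key=vals.__getitem__)   # [1, 2, 0]
#   companion = ['c', 'a', 'b']
#   [companion[k] for k in idx]                            # ['a', 'b', 'c']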

#%%
#==========================================================#
#==========================================================#
# L7&L8
#==========================================================#
#==========================================================#

# define filenames for images
suffix = '.tif'
filepath = os.path.join(os.getcwd(), 'data', sitename, 'L7&L8')
filepath_pan = os.path.join(filepath, 'pan')
filepath_ms = os.path.join(filepath, 'ms')
try:
    os.makedirs(filepath_pan)
    os.makedirs(filepath_ms)
except:
    print('directory already exists')

#==========================================================#
# Select L7 collection
#==========================================================#

satname = 'L7'
input_col = ee.ImageCollection('LANDSAT/LE07/C01/T1_RT_TOA')

# filter by location
flt_col = input_col.filterBounds(ee.Geometry.Polygon(polygon))
n_img = flt_col.size().getInfo()
print('Number of images covering ' + sitename, n_img)
im_all = flt_col.getInfo().get('features')

#==========================================================#
# Main loop through images
#==========================================================#

timestamps = []
acc_georef = []
all_names = []
for i in range(n_img):

    # find each image in ee database
    im = ee.Image(im_all[i].get('id'))

    im_dic = im.getInfo()
    im_bands = im_dic.get('bands')
    t = im_dic['properties']['system:time_start']
    im_timestamp = datetime.fromtimestamp(t/1000, tz=pytz.utc)
    timestamps.append(im_timestamp)
    im_date = im_timestamp.strftime('%Y-%m-%d-%H-%M-%S')
    im_epsg = int(im_dic['bands'][0]['crs'][5:])
    try:
        acc_georef.append(im_dic['properties']['GEOMETRIC_RMSE_MODEL'])
    except:
        acc_georef.append(12)
        print('No geometric rmse model property')

    # delete dimensions key from dictionary, otherwise the entire image is extracted
    for j in range(len(im_bands)): del im_bands[j]['dimensions']

    # bands for L7 (pan band B8, multispectral bands B1-B5 plus the BQA quality band)
    pan_band = [im_bands[8]]
    ms_bands = [im_bands[0], im_bands[1], im_bands[2], im_bands[3], im_bands[4], im_bands[9]]

    # filenames
    filename_pan = im_date + '_' + satname + '_' + sitename + '_pan' + suffix
    filename_ms = im_date + '_' + satname + '_' + sitename + '_ms' + suffix

    print(i)
    if any(filename_pan in _ for _ in all_names):
        filename_pan = im_date + '_' + satname + '_' + sitename + '_pan' + '_dup' + suffix
        filename_ms = im_date + '_' + satname + '_' + sitename + '_ms' + '_dup' + suffix
    all_names.append(filename_pan)

    local_data_pan = sds.download_tif(im, polygon, pan_band, filepath_pan)
    os.rename(local_data_pan, os.path.join(filepath_pan, filename_pan))
    local_data_ms = sds.download_tif(im, polygon, ms_bands, filepath_ms)
    os.rename(local_data_ms, os.path.join(filepath_ms, filename_ms))

#==========================================================#
# Select L8 collection
#==========================================================#

satname = 'L8'
input_col = ee.ImageCollection('LANDSAT/LC08/C01/T1_RT_TOA')

# filter by location
flt_col = input_col.filterBounds(ee.Geometry.Polygon(polygon))
n_img = flt_col.size().getInfo()
print('Number of images covering ' + sitename, n_img)
im_all = flt_col.getInfo().get('features')

#==========================================================#
# Main loop through images
#==========================================================#

for i in range(n_img):

    # find each image in ee database
    im = ee.Image(im_all[i].get('id'))

    im_dic = im.getInfo()
    im_bands = im_dic.get('bands')
    t = im_dic['properties']['system:time_start']
    im_timestamp = datetime.fromtimestamp(t/1000, tz=pytz.utc)
    timestamps.append(im_timestamp)
    im_date = im_timestamp.strftime('%Y-%m-%d-%H-%M-%S')
    im_epsg = int(im_dic['bands'][0]['crs'][5:])
    try:
        acc_georef.append(im_dic['properties']['GEOMETRIC_RMSE_MODEL'])
    except:
        acc_georef.append(12)
        print('No geometric rmse model property')

    # delete dimensions key from dictionary, otherwise the entire image is extracted
    for j in range(len(im_bands)): del im_bands[j]['dimensions']

    # bands for L8 (pan band B8, multispectral bands B2-B6 plus the BQA quality band)
    pan_band = [im_bands[7]]
    ms_bands = [im_bands[1], im_bands[2], im_bands[3], im_bands[4], im_bands[5], im_bands[11]]

    # filenames
    filename_pan = im_date + '_' + satname + '_' + sitename + '_pan' + suffix
    filename_ms = im_date + '_' + satname + '_' + sitename + '_ms' + suffix

    print(i)
    if any(filename_pan in _ for _ in all_names):
        filename_pan = im_date + '_' + satname + '_' + sitename + '_pan' + '_dup' + suffix
        filename_ms = im_date + '_' + satname + '_' + sitename + '_ms' + '_dup' + suffix
    all_names.append(filename_pan)

    local_data_pan = sds.download_tif(im, polygon, pan_band, filepath_pan)
    os.rename(local_data_pan, os.path.join(filepath_pan, filename_pan))
    local_data_ms = sds.download_tif(im, polygon, ms_bands, filepath_ms)
    os.rename(local_data_ms, os.path.join(filepath_ms, filename_ms))


# sort timestamps and georef accuracy (downloaded images are sorted by date in directory)
timestamps_sorted = sorted(timestamps)
idx_sorted = sorted(range(len(timestamps)), key=timestamps.__getitem__)
acc_georef_sorted = [acc_georef[j] for j in idx_sorted]

metadata[satname] = {'dates':timestamps_sorted, 'acc_georef':acc_georef_sorted, 'epsg':im_epsg}

#%%
#==========================================================#
#==========================================================#
# S2
#==========================================================#
#==========================================================#

# define filenames for images
suffix = '.tif'
filepath = os.path.join(os.getcwd(), 'data', sitename, 'S2')
try:
    os.makedirs(os.path.join(filepath, '10m'))
    os.makedirs(os.path.join(filepath, '20m'))
    os.makedirs(os.path.join(filepath, '60m'))
except:
    print('directory already exists')

#==========================================================#
# Select S2 collection
#==========================================================#

satname = 'S2'
input_col = ee.ImageCollection('COPERNICUS/S2')

# filter by location
flt_col = input_col.filterBounds(ee.Geometry.Polygon(polygon))
n_img = flt_col.size().getInfo()
print('Number of images covering ' + sitename, n_img)
im_all = flt_col.getInfo().get('features')

#==========================================================#
# Main loop through images
#==========================================================#

timestamps = []
acc_georef = []
all_names = []
for i in range(n_img):

    # find each image in ee database
    im = ee.Image(im_all[i].get('id'))

    im_dic = im.getInfo()
    im_bands = im_dic.get('bands')
    t = im_dic['properties']['system:time_start']
    im_timestamp = datetime.fromtimestamp(t/1000, tz=pytz.utc)
    im_date = im_timestamp.strftime('%Y-%m-%d-%H-%M-%S')
    timestamps.append(im_timestamp)
    im_epsg = int(im_dic['bands'][0]['crs'][5:])
    try:
        if im_dic['properties']['GEOMETRIC_QUALITY_FLAG'] == 'PASSED':
            acc_georef.append(1)
        else:
            acc_georef.append(0)
    except:
        acc_georef.append(0)

    # delete dimensions key from dictionary, otherwise the entire image is extracted
    for j in range(len(im_bands)): del im_bands[j]['dimensions']

    # bands for S2 (10m: B2, B3, B4, B8 - 20m: B11/SWIR1 - 60m: QA60 cloud mask)
    bands10 = [im_bands[1], im_bands[2], im_bands[3], im_bands[7]]
    bands20 = [im_bands[11]]
    bands60 = [im_bands[15]]

    # filenames
    filename10 = im_date + '_' + satname + '_' + sitename + '_' + '10m' + suffix
    filename20 = im_date + '_' + satname + '_' + sitename + '_' + '20m' + suffix
    filename60 = im_date + '_' + satname + '_' + sitename + '_' + '60m' + suffix

    print(i)
    if any(filename10 in _ for _ in all_names):
        filename10 = im_date + '_' + satname + '_' + sitename + '_' + '10m' + '_dup' + suffix
        filename20 = im_date + '_' + satname + '_' + sitename + '_' + '20m' + '_dup' + suffix
        filename60 = im_date + '_' + satname + '_' + sitename + '_' + '60m' + '_dup' + suffix
    all_names.append(filename10)

    local_data = sds.download_tif(im, polygon, bands10, filepath)
    os.rename(local_data, os.path.join(filepath, '10m', filename10))

    local_data = sds.download_tif(im, polygon, bands20, filepath)
    os.rename(local_data, os.path.join(filepath, '20m', filename20))

    local_data = sds.download_tif(im, polygon, bands60, filepath)
    os.rename(local_data, os.path.join(filepath, '60m', filename60))

# sort timestamps and georef accuracy (downloaded images are sorted by date in directory)
timestamps_sorted = sorted(timestamps)
idx_sorted = sorted(range(len(timestamps)), key=timestamps.__getitem__)
acc_georef_sorted = [acc_georef[j] for j in idx_sorted]

metadata[satname] = {'dates':timestamps_sorted, 'acc_georef':acc_georef_sorted, 'epsg':im_epsg}


#%% save metadata

filepath = os.path.join(os.getcwd(), 'data', sitename)
with open(os.path.join(filepath, sitename + '_metadata' + '.pkl'), 'wb') as f:
    pickle.dump(metadata, f)
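# The metadata pickle is the hand-off to the processing scripts; a minimal
# sketch of reading it back (this mirrors what the shoreline extraction script
# does):
#   with open(os.path.join(filepath, sitename + '_metadata' + '.pkl'), 'rb') as f:
#       metadata = pickle.load(f)
#   metadata['L5']['dates']        # sorted timestamps
#   metadata['L5']['acc_georef']   # georeferencing accuracy per image
#   metadata['L5']['epsg']         # EPSG code of the image CRS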
"""This module contains all the functions needed for data analysis """

# Initial settings
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from matplotlib import gridspec
import pdb
import ee

# other modules
from osgeo import gdal, ogr, osr
import scipy.interpolate as interpolate
import scipy.stats as sstats

# image processing modules
import skimage.filters as filters
import skimage.exposure as exposure
import skimage.transform as transform
import sklearn.decomposition as decomposition
import skimage.measure as measure
import skimage.morphology as morphology

# machine learning modules
from sklearn.cluster import KMeans
from sklearn.neural_network import MLPClassifier
from sklearn.externals import joblib

import time

# import own modules
import functions.utils as utils

def get_tide(dates_sds, dates_tide, tide_level):
    " finds the tide level recorded closest in time to each SDS date (NaN if further than 30 minutes) "

    tide = []
    for i in range(len(dates_sds)):
        # time difference (in seconds) between the SDS date and every tide record
        dates_diff = np.abs(np.array([(dates_sds[i] - _).total_seconds() for _ in dates_tide]))
        if np.min(dates_diff) <= 1800: # half-an-hour
            idx_closest = np.argmin(dates_diff)
            tide.append(tide_level[idx_closest])
        else:
            tide.append(np.nan)
    tide = np.array(tide)

    return tide
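# A minimal usage sketch for get_tide, runnable as a self-test with hypothetical
# values (dates_sds/dates_tide are datetime objects, tide_level the matching
# water levels; records further than 30 minutes from an image date return NaN).
if __name__ == '__main__':
    from datetime import datetime, timedelta
    _t0 = datetime(2017, 1, 1, 10, 0)
    _dates_tide = [_t0 + timedelta(minutes=15*k) for k in range(4)]
    _tide_level = [0.1, 0.3, 0.5, 0.4]
    # closest record to 10:20 is the 10:15 one, so this prints [0.3]
    print(get_tide([_t0 + timedelta(minutes=20)], _dates_tide, _tide_level))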

def remove_duplicates(output, satname):
    " removes duplicates from the output structure, keeping the image with less cloud cover or better georeferencing "
    dates = output['dates']
    dates_str = [_.strftime('%Y%m%d') for _ in dates]
    dupl = utils.duplicates_dict(dates_str)
    if dupl:
        output_nodup = dict([])
        idx_remove = []
        if satname == 'L8' or satname == 'L5':
            for k,v in dupl.items():

                idx1 = v[0]
                idx2 = v[1]

                c1 = output['metadata']['cloud_cover'][idx1]
                c2 = output['metadata']['cloud_cover'][idx2]
                g1 = output['metadata']['acc_georef'][idx1]
                g2 = output['metadata']['acc_georef'][idx2]

                # keep the image with less cloud cover, or better georeferencing if the cloud covers are similar
                if c1 < c2 - 0.01:
                    idx_remove.append(idx2)
                elif g1 < g2 - 0.1:
                    idx_remove.append(idx2)
                else:
                    idx_remove.append(idx1)

        else:
            for k,v in dupl.items():

                idx1 = v[0]
                idx2 = v[1]

                c1 = output['metadata']['cloud_cover'][idx1]
                c2 = output['metadata']['cloud_cover'][idx2]

                # keep the image with less cloud cover
                if c1 < c2 - 0.01:
                    idx_remove.append(idx2)
                else:
                    idx_remove.append(idx1)

        idx_remove = sorted(idx_remove)
        idx_all = np.linspace(0, len(dates_str)-1, len(dates_str))
        idx_keep = list(np.where(~np.isin(idx_all, idx_remove))[0])

        output_nodup['dates'] = [output['dates'][k] for k in idx_keep]
        output_nodup['shorelines'] = [output['shorelines'][k] for k in idx_keep]
        output_nodup['metadata'] = dict([])
        for key in list(output['metadata'].keys()):
            output_nodup['metadata'][key] = [output['metadata'][key][k] for k in idx_keep]
        print(satname + ' : ' + str(len(idx_remove)) + ' duplicates')
        return output_nodup

    else:
        print(satname + ' : ' + 'no duplicates')
        return output


def merge(output):
    " merges the output data from the different satellites into a single, chronologically sorted structure "

    # stack all lists together under one key
    output_all = {'dates':[], 'shorelines':[],
                  'metadata':{'filenames':[], 'satname':[], 'cloud_cover':[], 'acc_georef':[]}}
    for satname in list(output.keys()):
        output_all['dates'] = output_all['dates'] + output[satname]['dates']
        output_all['shorelines'] = output_all['shorelines'] + output[satname]['shorelines']
        for key in list(output[satname]['metadata'].keys()):
            output_all['metadata'][key] = output_all['metadata'][key] + output[satname]['metadata'][key]

    output_all_sorted = {'dates':[], 'shorelines':[],
                         'metadata':{'filenames':[], 'satname':[], 'cloud_cover':[], 'acc_georef':[]}}
    # sort the dates
    idx_sorted = sorted(range(len(output_all['dates'])), key=output_all['dates'].__getitem__)
    output_all_sorted['dates'] = [output_all['dates'][i] for i in idx_sorted]
    output_all_sorted['shorelines'] = [output_all['shorelines'][i] for i in idx_sorted]
    for key in list(output_all['metadata'].keys()):
        output_all_sorted['metadata'][key] = [output_all['metadata'][key][i] for i in idx_sorted]

    return output_all_sorted

def create_transects(x0, y0, orientation, chainage_length):
    " creates shore-normal transects from origin points, orientations (degrees) and a chainage length (metres) "

    transects = []

    for k in range(len(x0)):

        # orientation of cross-shore profile
        phi = (90 - orientation[k])*np.pi/180

        # create a vector using the chainage length
        x = np.linspace(0, chainage_length, chainage_length+1)
        y = np.zeros(len(x))
        coords = np.zeros((len(x),2))
        coords[:,0] = x
        coords[:,1] = y

        # translate and rotate the vector using the origin and orientation
        tf = transform.EuclideanTransform(rotation=phi, translation=(x0[k], y0[k]))
        coords_tf = tf(coords)

        transects.append(coords_tf)

    return transects
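# A minimal usage sketch for create_transects, runnable as a self-test with
# hypothetical origins and orientations (projected coordinates in metres; an
# orientation of 90 degrees gives a transect pointing due east).
if __name__ == '__main__':
    _x0 = [342000, 342050]
    _y0 = [6266000, 6266100]
    _orientation = [90, 90]
    _transects = create_transects(_x0, _y0, _orientation, 200)
    # each transect has one point per metre of chainage: prints (201, 2)
    print(_transects[0].shape)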

def calculate_chainage(sds, transects, orientation, along_dist):
    " intersects each SDS with each transect and computes the chainage (cross-shore) position "

    chainage_mtx = np.zeros((len(sds), len(transects), 6))

    for i in range(len(sds)):

        sl = sds[i]

        for j in range(len(transects)):

            # compute rotation matrix
            X0 = transects[j][0,0]
            Y0 = transects[j][0,1]
            phi = (90 - orientation[j])*np.pi/180
            Mrot = np.array([[np.cos(phi), np.sin(phi)], [-np.sin(phi), np.cos(phi)]])

            # calculate point-to-line distance between shoreline points and the profile
            p1 = np.array([X0, Y0])
            p2 = transects[j][-1,:]
            p3 = sl
            d = np.abs(np.cross(p2-p1, p3-p1)/np.linalg.norm(p2-p1))
            idx_close = utils.find_indices(d, lambda e: e <= along_dist)

            # check if there are SDS points around the profile or not
            if not idx_close:
                chainage_mtx[i,j,:] = np.tile(np.nan, (1,6))

            else:
                # change of base to shore-normal coordinate system
                xy_close = np.array([sl[idx_close,0], sl[idx_close,1]]) - np.tile(np.array([[X0],[Y0]]), (1, len(sl[idx_close])))
                xy_rot = np.matmul(Mrot, xy_close)

                # put nan values if the chainage is negative (MAKE SURE TO PICK ORIGIN CORRECTLY)
                if np.any(xy_rot[0,:] < 0):
                    xy_rot[0, np.where(xy_rot[0,:] < 0)] = np.nan

                # compute mean, median, max, min and std of the chainage position
                n_points = len(xy_rot[0,:])
                mean_cross = np.nanmean(xy_rot[0,:])
                median_cross = np.nanmedian(xy_rot[0,:])
                max_cross = np.nanmax(xy_rot[0,:])
                min_cross = np.nanmin(xy_rot[0,:])
                std_cross = np.nanstd(xy_rot[0,:])

                if std_cross > 10: # if large std, take the most seaward point
                    mean_cross = max_cross
                    median_cross = max_cross
                    min_cross = max_cross

                # store the statistics
                chainage_mtx[i,j,:] = np.array([mean_cross, median_cross, max_cross,
                                                min_cross, n_points, std_cross])

    # format into dictionary
    chainage = dict([])
    chainage['mean'] = chainage_mtx[:,:,0]
    chainage['median'] = chainage_mtx[:,:,1]
    chainage['max'] = chainage_mtx[:,:,2]
    chainage['min'] = chainage_mtx[:,:,3]
    chainage['npoints'] = chainage_mtx[:,:,4]
    chainage['std'] = chainage_mtx[:,:,5]

    return chainage
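# Note on the change of base above: with phi = (90 - orientation)*pi/180,
# Mrot = [[cos(phi), sin(phi)], [-sin(phi), cos(phi)]] rotates the world
# (east, north) offsets of each shoreline point relative to the transect origin
# into the transect frame, so the first row of xy_rot is the cross-shore
# chainage along the transect and the second row is the along-shore offset.
# For example, with orientation = 90 (transect pointing east) phi = 0 and the
# chainage is simply the eastward distance from the origin.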

def compare_sds(dates_sds, chain_sds, topo_profiles, mod=0, mindays=5):
    """
    Compare SDS with groundtruth data from topographic surveys / Argus shorelines

    KV WRL 2018

    Arguments:
    -----------
        dates_sds: list
            list of dates corresponding to each row in chain_sds
        chain_sds: np.ndarray
            array with time series of chainage for each transect (each transect is one column)
        topo_profiles: dict
            dict containing the dates and chainage of the groundtruth
        mod: 0 or 1
            0 for linear interpolation between the 2 closest surveys, 1 for nearest neighbour only
        mindays: int
            minimum number of days for which the data can be compared

    Returns:
    -----------
        stats: dict
            contains all the statistics of the comparison

    """

    # create 3 figures
    fig1 = plt.figure()
    gs1 = gridspec.GridSpec(chain_sds.shape[1], 1)
    fig2 = plt.figure()
    gs2 = gridspec.GridSpec(2, chain_sds.shape[1])
    fig3 = plt.figure()
    gs3 = gridspec.GridSpec(2, 1)

    dates_sds_num = np.array([_.toordinal() for _ in dates_sds])
    stats = dict([])
    data_fin = dict([])

    # for each transect compare and plot the data
    for i in range(chain_sds.shape[1]):

        pfname = list(topo_profiles.keys())[i]
        stats[pfname] = dict([])
        data_fin[pfname] = dict([])

        dates_sur = topo_profiles[pfname]['dates']
        chain_sur = topo_profiles[pfname]['chainage']

        # convert to datenum
        dates_sur_num = np.array([_.toordinal() for _ in dates_sur])

        chain_sur_interp = []
        diff_days = []

        for j, satdate in enumerate(dates_sds_num):

            temp_diff = satdate - dates_sur_num

            if mod == 0:
                # select the surveys before and after the sat image date and interpolate

                ind_before = np.where(temp_diff == temp_diff[temp_diff > 0][-1])[0]
                if ind_before == len(temp_diff)-1:
                    chain_sur_interp.append(np.nan)
                    diff_days.append(np.abs(satdate-dates_sur_num[ind_before])[0])
                    continue
                ind_after = np.where(temp_diff == temp_diff[temp_diff < 0][0])[0]
                tempx = np.zeros(2)
                tempx[0] = dates_sur_num[ind_before]
                tempx[1] = dates_sur_num[ind_after]
                tempy = np.zeros(2)
                tempy[0] = chain_sur[ind_before]
                tempy[1] = chain_sur[ind_after]
                diff_days.append(np.abs(np.max([satdate-tempx[0], satdate-tempx[1]])))
                # interpolate
                f = interpolate.interp1d(tempx, tempy)
                chain_sur_interp.append(f(satdate))

            elif mod == 1:
                # select the closest survey

                idx_closest = utils.find_indices(np.abs(temp_diff), lambda e: e == np.min(np.abs(temp_diff)))[0]
                diff_days.append(np.abs(satdate-dates_sur_num[idx_closest]))
                if diff_days[j] > mindays:
                    chain_sur_interp.append(np.nan)
                else:
                    chain_sur_interp.append(chain_sur[idx_closest])

        chain_sur_interp = np.array(chain_sur_interp)

        # remove nan values
        idx_sur_nan = ~np.isnan(chain_sur_interp)
        idx_sat_nan = ~np.isnan(chain_sds[:,i])
        idx_nan = np.logical_and(idx_sur_nan, idx_sat_nan)

        # groundtruth and sds
        chain_sur_fin = chain_sur_interp[idx_nan]
        chain_sds_fin = chain_sds[idx_nan,i]
        dates_fin = [k for (k, v) in zip(dates_sds, idx_nan) if v]

        # calculate statistics
        slope, intercept, rvalue, pvalue, std_err = sstats.linregress(chain_sur_fin, chain_sds_fin)
        R2 = rvalue**2
        correlation = np.corrcoef(chain_sur_fin, chain_sds_fin)[0,1]
        diff_chain = chain_sur_fin - chain_sds_fin

        rmse = np.sqrt(np.nanmean((diff_chain)**2))
        mean = np.nanmean(diff_chain)
        std = np.nanstd(diff_chain)
        q90 = np.percentile(np.abs(diff_chain), 90)

        # store data
        stats[pfname]['rmse'] = rmse
        stats[pfname]['mean'] = mean
        stats[pfname]['std'] = std
        stats[pfname]['q90'] = q90
        stats[pfname]['diffdays'] = diff_days
        stats[pfname]['corr'] = correlation
        stats[pfname]['linfit'] = {'slope':slope, 'intercept':intercept, 'R2':R2, 'pvalue':pvalue}

        data_fin[pfname]['dates'] = dates_fin
        data_fin[pfname]['sds'] = chain_sds_fin
        data_fin[pfname]['survey'] = chain_sur_fin

        # make time-series plot
        plt.figure(fig1.number)
        fig1.add_subplot(gs1[i,0])
        plt.plot(dates_sur, chain_sur, 'o-', color='C1', markersize=4, label='survey all')
        plt.plot(dates_fin, chain_sur_fin, 'o', color=[0.3, 0.3, 0.3], markersize=2, label='survey interp')
        plt.plot(dates_fin, chain_sds_fin, 'o--', color='b', markersize=4, label='SDS')
        plt.title(pfname, fontweight='bold')
#        plt.xlim([dates_sds[0], dates_sds[-1]])
        plt.ylabel('chainage [m]')

        # make scatter plot
        plt.figure(fig2.number)
        fig2.add_subplot(gs2[0,i])
        plt.axis('equal')
        plt.plot(chain_sur_fin, chain_sds_fin, 'ko', markersize=4, markerfacecolor='w', alpha=0.7)
        xmax = np.max([np.nanmax(chain_sds_fin), np.nanmax(chain_sur_fin)])
        xmin = np.min([np.nanmin(chain_sds_fin), np.nanmin(chain_sur_fin)])
        ymax = np.max([np.nanmax(chain_sds_fin), np.nanmax(chain_sur_fin)])
        ymin = np.min([np.nanmin(chain_sds_fin), np.nanmin(chain_sur_fin)])
        plt.plot([xmin, xmax], [ymin, ymax], 'k--')
        plt.plot([xmin, xmax], [xmin*slope + intercept, xmax*slope + intercept], 'b:')
        str_corr = ' y = %.2f x + %.2f\n R2 = %.2f' % (slope, intercept, R2)
        plt.text(xmin, ymax-5, str_corr, bbox=dict(facecolor=[0.7,0.7,0.7], alpha=0.5), horizontalalignment='left')
        plt.xlabel('chainage survey [m]')
        plt.ylabel('chainage satellite [m]')
        plt.title(pfname, fontweight='bold')

        fig2.add_subplot(gs2[1,i])
        binwidth = 3
        bins = np.arange(min(diff_chain), max(diff_chain) + binwidth, binwidth)
        density = plt.hist(diff_chain, bins=bins, density=True, color=[0.8, 0.8, 0.8], edgecolor='k')
        plt.xlim([-50, 50])
        plt.xlabel('error [m]')
        str_stats = ' rmse = %.1f\n mean = %.1f\n std = %.1f\n q90 = %.1f' % (rmse, mean, std, q90)
        plt.text(15, np.max(density[0])-0.015, str_stats, bbox=dict(facecolor=[0.8,0.8,0.8], alpha=0.3), horizontalalignment='left', fontsize=10)

    fig1.set_size_inches(19.2, 9.28)
    fig1.set_tight_layout(True)
    fig2.set_size_inches(19.2, 9.28)
    fig2.set_tight_layout(True)

    # all transects together
    chain_sds_all = []
    chain_sur_all = []
    for i in range(chain_sds.shape[1]):
        pfname = list(topo_profiles.keys())[i]
        chain_sds_all = np.append(chain_sds_all, data_fin[pfname]['sds'])
        chain_sur_all = np.append(chain_sur_all, data_fin[pfname]['survey'])

    # calculate statistics
    slope, intercept, rvalue, pvalue, std_err = sstats.linregress(chain_sur_all, chain_sds_all)
    R2 = rvalue**2
    correlation = np.corrcoef(chain_sur_all, chain_sds_all)[0,1]
    diff_chain_all = chain_sur_all - chain_sds_all

    rmse = np.sqrt(np.nanmean((diff_chain_all)**2))
    mean = np.nanmean(diff_chain_all)
    std = np.nanstd(diff_chain_all)
    q90 = np.percentile(np.abs(diff_chain_all), 90)

    stats['all'] = {'rmse':rmse, 'mean':mean, 'std':std, 'q90':q90, 'corr':correlation,
                    'linfit':{'slope':slope, 'intercept':intercept, 'R2':R2, 'pvalue':pvalue}}

    # make plot
    plt.figure(fig3.number)
    fig3.add_subplot(gs3[0,0])
    plt.axis('equal')
    plt.plot(chain_sur_all, chain_sds_all, 'ko', markersize=4, markerfacecolor='w', alpha=0.7)
    xmax = np.max([np.nanmax(chain_sds_all), np.nanmax(chain_sur_all)])
    xmin = np.min([np.nanmin(chain_sds_all), np.nanmin(chain_sur_all)])
    ymax = np.max([np.nanmax(chain_sds_all), np.nanmax(chain_sur_all)])
    ymin = np.min([np.nanmin(chain_sds_all), np.nanmin(chain_sur_all)])
    plt.plot([xmin, xmax], [ymin, ymax], 'k--')
    plt.plot([xmin, xmax], [xmin*slope + intercept, xmax*slope + intercept], 'b:')
    str_corr = ' y = %.2f x + %.2f\n R2 = %.2f' % (slope, intercept, R2)
    plt.text(xmin, ymax-5, str_corr, bbox=dict(facecolor=[0.7,0.7,0.7], alpha=0.5), horizontalalignment='left')
    plt.xlabel('chainage survey [m]')
    plt.ylabel('chainage satellite [m]')
    plt.title('all transects', fontweight='bold')

    fig3.add_subplot(gs3[1,0])
    binwidth = 3
    bins = np.arange(min(diff_chain_all), max(diff_chain_all) + binwidth, binwidth)
    density = plt.hist(diff_chain_all, bins=bins, density=True, color=[0.8, 0.8, 0.8], edgecolor='k')
    plt.xlim([-50, 50])
    plt.xlabel('error [m]')
    str_stats = ' rmse = %.1f\n mean = %.1f\n std = %.1f\n q90 = %.1f' % (rmse, mean, std, q90)
    plt.text(15, np.max(density[0])-0.015, str_stats, bbox=dict(facecolor=[0.8,0.8,0.8], alpha=0.3), horizontalalignment='left', fontsize=10)
    fig3.set_size_inches(9.2, 9.28)
    fig3.set_tight_layout(True)

    return stats
#==========================================================#
#==========================================================#
# Extract shorelines from Landsat images
#==========================================================#
#==========================================================#


#==========================================================#
# Initial settings
#==========================================================#

import os
import numpy as np
import matplotlib.pyplot as plt
import ee
import pdb

# other modules
from osgeo import gdal, ogr, osr
import pickle
import matplotlib.cm as cm
from pylab import ginput
from shapely.geometry import LineString

# image processing modules
import skimage.filters as filters
import skimage.exposure as exposure
import skimage.transform as transform
import sklearn.decomposition as decomposition
import skimage.measure as measure
import skimage.morphology as morphology

# machine learning modules
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.externals import joblib

# import own modules
import functions.utils as utils
import functions.sds as sds

# some other settings
np.seterr(all='ignore') # raise/ignore divisions by 0 and nans
plt.rcParams['axes.grid'] = True
plt.rcParams['figure.max_open_warning'] = 100
ee.Initialize()

#==========================================================#
# Parameters
#==========================================================#

sitename = 'NARRA'

cloud_thresh = 0.7      # threshold for cloud cover
plot_bool = False       # if you want the plots
output_epsg = 28356     # GDA94 / MGA Zone 56
buffer_size = 7         # radius (in pixels) of disk for buffer (pixel classification)
min_beach_size = 20     # number of pixels in a beach (pixel classification)
dist_ref = 100          # maximum distance from reference point
min_length_wl = 200     # minimum length of shoreline LineString to be kept
manual_bool = True      # to manually check images


output = dict([])

#==========================================================#
# Metadata
#==========================================================#

filepath = os.path.join(os.getcwd(), 'data', sitename)
with open(os.path.join(filepath, sitename + '_metadata' + '.pkl'), 'rb') as f:
    metadata = pickle.load(f)
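# metadata is the dictionary saved by the download script: one entry per
# satellite (e.g. 'L5', 'L8', 'S2'), each holding the sorted image timestamps
# ('dates'), the georeferencing accuracy of each image ('acc_georef') and the
# EPSG code of the image coordinate system ('epsg').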


#%%
#==========================================================#
# Read S2 images
#==========================================================#

satname = 'S2'
dates = metadata[satname]['dates']
input_epsg = 32756 # metadata[satname]['epsg']

# path to images
filepath10 = os.path.join(os.getcwd(), 'data', sitename, satname, '10m')
filenames10 = os.listdir(filepath10)
filepath20 = os.path.join(os.getcwd(), 'data', sitename, satname, '20m')
filenames20 = os.listdir(filepath20)
filepath60 = os.path.join(os.getcwd(), 'data', sitename, satname, '60m')
filenames60 = os.listdir(filepath60)
if (not len(filenames10) == len(filenames20)) or (not len(filenames20) == len(filenames60)):
    raise Exception('error: not the same amount of files for 10, 20 and 60 m')
N = len(filenames10)

# initialise variables
cloud_cover_ts = []
acc_georef_ts = []
date_acquired_ts = []
filename_ts = []
satname_ts = []
timestamp = []
shorelines = []
idx_skipped = []

spacing = '=========================================================='
msg = ' %s\n %s\n %s' % (spacing, satname, spacing)
print(msg)

for i in range(N):

    # read 10m bands
    fn = os.path.join(filepath10, filenames10[i])
    data = gdal.Open(fn, gdal.GA_ReadOnly)
    georef = np.array(data.GetGeoTransform())
    bands = [data.GetRasterBand(k + 1).ReadAsArray() for k in range(data.RasterCount)]
    im10 = np.stack(bands, 2)
    im10 = im10/10000 # TOA scaled to 10000

    # if image is only zeros, skip it
    if sum(sum(sum(im10))) < 1:
        print('skip ' + str(i) + ' - no data')
        idx_skipped.append(i)
        continue

    nrows = im10.shape[0]
    ncols = im10.shape[1]

    # read 20m band (SWIR1)
    fn = os.path.join(filepath20, filenames20[i])
    data = gdal.Open(fn, gdal.GA_ReadOnly)
    bands = [data.GetRasterBand(k + 1).ReadAsArray() for k in range(data.RasterCount)]
    im20 = np.stack(bands, 2)
    im20 = im20[:,:,0]
    im20 = im20/10000 # TOA scaled to 10000
    im_swir = transform.resize(im20, (nrows, ncols), order=1, preserve_range=True, mode='constant')
    im_swir = np.expand_dims(im_swir, axis=2)

    # append down-sampled swir band to the 10m bands
    im_ms = np.append(im10, im_swir, axis=2)

    # read 60m band (QA)
    fn = os.path.join(filepath60, filenames60[i])
    data = gdal.Open(fn, gdal.GA_ReadOnly)
    bands = [data.GetRasterBand(k + 1).ReadAsArray() for k in range(data.RasterCount)]
    im60 = np.stack(bands, 2)
    im_qa = im60[:,:,0]
    cloud_mask = sds.create_cloud_mask(im_qa, satname, plot_bool)
    cloud_mask = transform.resize(cloud_mask, (nrows, ncols), order=0, preserve_range=True, mode='constant')
    # check if -inf or nan values on any band and add to cloud mask
    for k in range(im_ms.shape[2]):
        im_inf = np.isin(im_ms[:,:,k], -np.inf)
        im_nan = np.isnan(im_ms[:,:,k])
        cloud_mask = np.logical_or(np.logical_or(cloud_mask, im_inf), im_nan)

    # calculate cloud cover and, if above threshold, skip the image
    cloud_cover = sum(sum(cloud_mask.astype(int)))/(cloud_mask.shape[0]*cloud_mask.shape[1])
    if cloud_cover > cloud_thresh:
        print('skip ' + str(i) + ' - cloudy (' + str(np.round(cloud_cover*100).astype(int)) + '%)')
        idx_skipped.append(i)
        continue

    # rescale image intensity for display purposes
    im_display = sds.rescale_image_intensity(im_ms[:,:,[2,1,0]], cloud_mask, 99.9, False)

    # classify image in 4 classes (sand, whitewater, water, other) with NN classifier
    im_classif, im_labels = sds.classify_image_NN_nopan(im_ms, cloud_mask, min_beach_size, plot_bool)

    # if there aren't any sandy pixels
    if sum(sum(im_labels[:,:,0])) == 0:
        # use a global threshold (named contours_mwi so the code below works in both cases)
        im_ndwi = sds.nd_index(im_ms[:,:,4], im_ms[:,:,1], cloud_mask, plot_bool)
        contours_mwi = sds.find_wl_contours(im_ndwi, cloud_mask, plot_bool)
    else:
        # use a class-specific threshold
        contours_wi, contours_mwi = sds.find_wl_contours2(im_ms, im_labels, cloud_mask, buffer_size, plot_bool)

    # convert from pixels to world coordinates
    wl_coords = sds.convert_pix2world(contours_mwi, georef)
    # convert to output epsg spatial reference
    wl = sds.convert_epsg(wl_coords, input_epsg, output_epsg)

    # remove contour lines that have a perimeter < min_length_wl
    wl_good = []
    for l, wls in enumerate(wl):
        coords = [(wls[k,0], wls[k,1]) for k in range(len(wls))]
        a = LineString(coords) # shapely LineString structure
        if a.length >= min_length_wl:
            wl_good.append(wls)

    # format points and only select the ones close to the refpoints
    x_points = np.array([])
    y_points = np.array([])
    for k in range(len(wl_good)):
        x_points = np.append(x_points, wl_good[k][:,0])
        y_points = np.append(y_points, wl_good[k][:,1])
    wl_good = np.transpose(np.array([x_points, y_points]))
    temp = np.zeros((len(wl_good))).astype(bool)
    for k in range(len(refpoints)):
        temp = np.logical_or(np.linalg.norm(wl_good - refpoints[k,[0,1]], axis=1) < dist_ref, temp)
    wl_final = wl_good[temp]


    # plot output
    plt.figure()
    im = np.copy(im_display)
    colours = np.array([[1,128/255,0/255],[204/255,1,1],[0,0,204/255]])
    for k in range(0, im_labels.shape[2]):
        im[im_labels[:,:,k],0] = colours[k,0]
        im[im_labels[:,:,k],1] = colours[k,1]
        im[im_labels[:,:,k],2] = colours[k,2]
    plt.imshow(im)
    for k, contour in enumerate(contours_mwi): plt.plot(contour[:, 1], contour[:, 0], linewidth=2, color='k', linestyle='--')
    plt.title(satname + '     ' + metadata[satname]['dates'][i].strftime('%Y-%m-%d') + '      acc : ' + str(metadata[satname]['acc_georef'][i]) + ' m')
    plt.draw()

    pt_in = np.array(ginput(n=1, timeout=1000))
    plt.close()

    # if image is rejected, skip it
    if pt_in[0][1] > nrows/2:
        print('skip ' + str(i) + ' - rejected')
        idx_skipped.append(i)
        continue

    # if accepted, store the data
    cloud_cover_ts.append(cloud_cover)
    acc_georef_ts.append(metadata[satname]['acc_georef'][i])

    filename_ts.append(filenames10[i])
    satname_ts.append(satname)
    date_acquired_ts.append(filenames10[i][:10])

    timestamp.append(metadata[satname]['dates'][i])
    shorelines.append(wl_final)

# store in output structure
output[satname] = {'dates':timestamp, 'shorelines':shorelines, 'idx_skipped':idx_skipped,
      'metadata':{'filenames':filename_ts, 'satname':satname_ts, 'cloud_cover':cloud_cover_ts,
                  'acc_georef':acc_georef_ts}}
del idx_skipped


#%%
#==========================================================#
# Read L7&L8 images
#==========================================================#

satname = 'L8'
dates = metadata[satname]['dates']
input_epsg = 32656 # metadata[satname]['epsg']

# path to images
filepath_pan = os.path.join(os.getcwd(), 'data', sitename, 'L7&L8', 'pan')
filepath_ms = os.path.join(os.getcwd(), 'data', sitename, 'L7&L8', 'ms')
filenames_pan = os.listdir(filepath_pan)
filenames_ms = os.listdir(filepath_ms)
if (not len(filenames_pan) == len(filenames_ms)):
    raise ValueError('not the same number of files for pan and ms')
N = len(filenames_pan)
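# note: os.listdir() returns files in arbitrary order, so pairing pan and ms scenes by index
# assumes both folders list in the same order; sorting both lists (a hypothetical tweak, not
# part of the original script) would make the pairing explicit, e.g.:
#filenames_pan = sorted(filenames_pan)
#filenames_ms = sorted(filenames_ms)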

# initialise variables
cloud_cover_ts = []
acc_georef_ts = []
date_acquired_ts = []
filename_ts = []
satname_ts = []
timestamp = []
shorelines = []
idx_skipped = []


spacing = '=========================================================='
msg = ' %s\n %s\n %s' % (spacing, satname, spacing)
print(msg)

for i in range(N):

    # get satellite name
    sat = filenames_pan[i][20:22]

    # read pan image
    fn_pan = os.path.join(filepath_pan, filenames_pan[i])
    data = gdal.Open(fn_pan, gdal.GA_ReadOnly)
    georef = np.array(data.GetGeoTransform())
    bands = [data.GetRasterBand(k + 1).ReadAsArray() for k in range(data.RasterCount)]
    im_pan = np.stack(bands, 2)[:,:,0]
    nrows = im_pan.shape[0]
    ncols = im_pan.shape[1]

    # read ms image
    fn_ms = os.path.join(filepath_ms, filenames_ms[i])
    data = gdal.Open(fn_ms, gdal.GA_ReadOnly)
    bands = [data.GetRasterBand(k + 1).ReadAsArray() for k in range(data.RasterCount)]
    im_ms = np.stack(bands, 2)

    # cloud mask
    im_qa = im_ms[:,:,5]
    cloud_mask = sds.create_cloud_mask(im_qa, sat, plot_bool)
    cloud_mask = transform.resize(cloud_mask, (nrows, ncols), order=0, preserve_range=True, mode='constant').astype('bool_')
    # resize the image using bilinear interpolation (order 1)
    im_ms = im_ms[:,:,:5]
    im_ms = transform.resize(im_ms,(nrows, ncols), order=1, preserve_range=True, mode='constant')

    # check if -inf or nan values on any band and add to cloud mask
    for k in range(im_ms.shape[2]+1):
        if k == 5:
            im_inf = np.isin(im_pan, -np.inf)
            im_nan = np.isnan(im_pan)
        else:
            im_inf = np.isin(im_ms[:,:,k], -np.inf)
            im_nan = np.isnan(im_ms[:,:,k])
        cloud_mask = np.logical_or(np.logical_or(cloud_mask, im_inf), im_nan)

    # calculate cloud cover and skip image if above threshold
    cloud_cover = sum(sum(cloud_mask.astype(int)))/(cloud_mask.shape[0]*cloud_mask.shape[1])
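    # (equivalently, cloud_cover is the fraction of flagged pixels in the cloud mask,
    #  i.e. the same value as np.mean(cloud_mask))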
    if cloud_cover > cloud_thresh:
        print('skip ' + str(i) + ' - cloudy (' + str(np.round(cloud_cover*100).astype(int)) + '%)')
        idx_skipped.append(i)
        continue

    # Pansharpen image (different for L8 and L7)
    if sat == 'L7':
        # pansharpen (Green, Red, NIR) and downsample Blue and SWIR1
        im_ms_ps = sds.pansharpen(im_ms[:,:,[1,2,3]], im_pan, cloud_mask, plot_bool)
        im_ms_ps = np.append(im_ms[:,:,[0]], im_ms_ps, axis=2)
        im_ms_ps = np.append(im_ms_ps, im_ms[:,:,[4]], axis=2)
        im_display = sds.rescale_image_intensity(im_ms[:,:,[2,1,0]], cloud_mask, 99.9, False)
    elif sat == 'L8':
        # pansharpen RGB image and downsample NIR and SWIR1
        im_ms_ps = sds.pansharpen(im_ms[:,:,[0,1,2]], im_pan, cloud_mask, plot_bool)
        im_ms_ps = np.append(im_ms_ps, im_ms[:,:,[3,4]], axis=2)
        im_display = sds.rescale_image_intensity(im_ms_ps[:,:,[2,1,0]], cloud_mask, 99.9, False)

    # classify image in 4 classes (sand, whitewater, water, other) with NN classifier
    im_classif, im_labels = sds.classify_image_NN(im_ms_ps, im_pan, cloud_mask, min_beach_size, plot_bool)
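    # im_labels is a stack of boolean masks, one per labelled class (channel 0 corresponds to
    # sand, per the "sandy pixels" check below); it is used to colour the display image and is
    # passed to find_wl_contours2 when sand is present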

    # if there aren't any sandy pixels
    if sum(sum(im_labels[:,:,0])) == 0 :
        # use global threshold
        im_ndwi = sds.nd_index(im_ms_ps[:,:,4], im_ms_ps[:,:,1], cloud_mask, plot_bool)
        contours_mwi = sds.find_wl_contours(im_ndwi, cloud_mask, plot_bool) # these contours are used for the conversion below
    else:
        # use specific threshold
        contours_wi, contours_mwi = sds.find_wl_contours2(im_ms_ps, im_labels, cloud_mask, buffer_size, plot_bool)

    # convert from pixels to world coordinates
    wl_coords = sds.convert_pix2world(contours_mwi, georef)
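    # convert_pix2world maps image coordinates to world coordinates with the GDAL geotransform;
    # for a pixel at row r, column c the standard affine mapping is:
    #   x = georef[0] + c*georef[1] + r*georef[2]
    #   y = georef[3] + c*georef[4] + r*georef[5]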
    # convert to output epsg spatial reference
    wl = sds.convert_epsg(wl_coords, input_epsg, output_epsg)

    # remove contour lines that have a perimeter < min_length_wl
    wl_good = []
    for l, wls in enumerate(wl):
        coords = [(wls[k,0], wls[k,1]) for k in range(len(wls))]
        a = LineString(coords) # shapely LineString structure
        if a.length >= min_length_wl:
            wl_good.append(wls)

    # format points and only select the ones close to the refpoints
    x_points = np.array([])
    y_points = np.array([])
    for k in range(len(wl_good)):
        x_points = np.append(x_points,wl_good[k][:,0])
        y_points = np.append(y_points,wl_good[k][:,1])
    wl_good = np.transpose(np.array([x_points,y_points]))
    temp = np.zeros((len(wl_good))).astype(bool)
    for k in range(len(refpoints)):
        temp = np.logical_or(np.linalg.norm(wl_good - refpoints[k,[0,1]], axis=1) < dist_ref, temp)
    wl_final = wl_good[temp]   
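    # wl_final keeps only the shoreline points lying within dist_ref (in the output spatial
    # reference units) of at least one reference point, discarding contours detected away
    # from the area of interest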

    # plot output
    plt.figure()
    plt.subplot(121)
    im = np.copy(im_display)
    colours = np.array([[1,128/255,0/255],[204/255,1,1],[0,0,204/255]])
    for k in range(0,im_labels.shape[2]):
        im[im_labels[:,:,k],0] = colours[k,0]
        im[im_labels[:,:,k],1] = colours[k,1]
        im[im_labels[:,:,k],2] = colours[k,2]
    plt.imshow(im)
    for k,contour in enumerate(contours_mwi): plt.plot(contour[:, 1], contour[:, 0], linewidth=2, color='k', linestyle='--')
    plt.title(sat + '     ' + metadata[satname]['dates'][i].strftime('%Y-%m-%d') + '      acc : ' + str(metadata[satname]['acc_georef'][i]) + ' m' )

    pt_in = np.array(ginput(n=1, timeout=1000))
    plt.close()

    # if image is rejected, skip it
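    # (clicking on the top half of the figure keeps the image, clicking on the bottom half rejects it)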
    if pt_in[0][1] > nrows/2:
        print('skip ' + str(i) + ' - rejected')
        idx_skipped.append(i)
        continue

    # if accepted, store the data
    cloud_cover_ts.append(cloud_cover)
    acc_georef_ts.append(metadata[satname]['acc_georef'][i])

    filename_ts.append(filenames_pan[i])
    satname_ts.append(sat)
    date_acquired_ts.append(filenames_pan[i][:10])

    timestamp.append(metadata[satname]['dates'][i])
    shorelines.append(wl_final)

# store in output structure
output[satname] = {'dates':timestamp, 'shorelines':shorelines, 'idx_skipped':idx_skipped,
      'metadata':{'filenames':filename_ts, 'satname':satname_ts, 'cloud_cover':cloud_cover_ts,
                  'acc_georef':acc_georef_ts}}

del idx_skipped


#%%
#==========================================================#
# Read L5 images
#==========================================================#

satname = 'L5'
dates = metadata[satname]['dates']
input_epsg = 32656 # metadata[satname]['epsg']

# path to images
filepath_img = os.path.join(os.getcwd(), 'data', sitename, satname, '30m')
filenames = os.listdir(filepath_img)
N = len(filenames)

# initialise variables
cloud_cover_ts = []
acc_georef_ts = []
date_acquired_ts = []
filename_ts = []
satname_ts = []
timestamp = []
shorelines = []
idx_skipped = []


spacing = '=========================================================='
msg = ' %s\n %s\n %s' % (spacing, satname, spacing)
print(msg)

for i in range(N):

    # read ms image
    fn = os.path.join(filepath_img, filenames[i])
    data = gdal.Open(fn, gdal.GA_ReadOnly)
    georef = np.array(data.GetGeoTransform())
    bands = [data.GetRasterBand(k + 1).ReadAsArray() for k in range(data.RasterCount)]
    im_ms = np.stack(bands, 2)

    # resample to half the original pixel size (30 m -> 15 m, i.e. double the number of pixels)
    nrows = im_ms.shape[0]*2
    ncols = im_ms.shape[1]*2

    # cloud mask
    im_qa = im_ms[:,:,5]
    im_ms = im_ms[:,:,:-1]
    cloud_mask = sds.create_cloud_mask(im_qa, satname, plot_bool)
    cloud_mask = transform.resize(cloud_mask, (nrows, ncols), order=0, preserve_range=True, mode='constant').astype('bool_')

    # resize the image using bilinear interpolation (order 1)
    im_ms = transform.resize(im_ms,(nrows, ncols), order=1, preserve_range=True, mode='constant')

    # adjust georef vector (scale becomes 15m and origin is adjusted to the center of new corner pixel)
    georef[1] = 15
    georef[5] = -15
    georef[0] = georef[0] + 7.5
    georef[3] = georef[3] - 7.5
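    # (georef follows the GDAL geotransform convention:
    #  [top-left x, pixel width, row rotation, top-left y, column rotation, pixel height (negative)])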

    # check if -inf or nan values on any band and add to cloud mask
    for k in range(im_ms.shape[2]):
        im_inf = np.isin(im_ms[:,:,k], -np.inf)
        im_nan = np.isnan(im_ms[:,:,k])
        cloud_mask = np.logical_or(np.logical_or(cloud_mask, im_inf), im_nan)

    # calculate cloud cover and skip image if above threshold
    cloud_cover = sum(sum(cloud_mask.astype(int)))/(cloud_mask.shape[0]*cloud_mask.shape[1])
    if cloud_cover > cloud_thresh:
        print('skip ' + str(i) + ' - cloudy (' + str(np.round(cloud_cover*100).astype(int)) + '%)')
        idx_skipped.append(i)
        continue

    # rescale image intensity for display purposes
    im_display = sds.rescale_image_intensity(im_ms[:,:,[2,1,0]], cloud_mask, 99.9, False)

    # classify image in 4 classes (sand, whitewater, water, other) with NN classifier
    im_classif, im_labels = sds.classify_image_NN_nopan(im_ms, cloud_mask, min_beach_size, plot_bool)

    # if there aren't any sandy pixels
    if sum(sum(im_labels[:,:,0])) == 0 :
        # use global threshold
        im_ndwi = sds.nd_index(im_ms[:,:,4], im_ms[:,:,1], cloud_mask, plot_bool)
        contours_mwi = sds.find_wl_contours(im_ndwi, cloud_mask, plot_bool) # these contours are used for the conversion below
    else:
        # use specific threshold
        contours_wi, contours_mwi = sds.find_wl_contours2(im_ms, im_labels, cloud_mask, buffer_size, plot_bool)

    # convert from pixels to world coordinates
    wl_coords = sds.convert_pix2world(contours_mwi, georef)
    # convert to output epsg spatial reference
    wl = sds.convert_epsg(wl_coords, input_epsg, output_epsg)

    # remove contour lines that have a perimeter < min_length_wl
    wl_good = []
    for l, wls in enumerate(wl):
        coords = [(wls[k,0], wls[k,1]) for k in range(len(wls))]
        a = LineString(coords) # shapely LineString structure
        if a.length >= min_length_wl:
            wl_good.append(wls)

    # format points and only select the ones close to the refpoints
    x_points = np.array([])
    y_points = np.array([])
    for k in range(len(wl_good)):
        x_points = np.append(x_points,wl_good[k][:,0])
        y_points = np.append(y_points,wl_good[k][:,1])
    wl_good = np.transpose(np.array([x_points,y_points]))
    temp = np.zeros((len(wl_good))).astype(bool)
    for k in range(len(refpoints)):
        temp = np.logical_or(np.linalg.norm(wl_good - refpoints[k,[0,1]], axis=1) < dist_ref, temp)
    wl_final = wl_good[temp]

    # plot output
    plt.figure()
    plt.subplot(121)
    im = np.copy(im_display)
    colours = np.array([[1,128/255,0/255],[204/255,1,1],[0,0,204/255]])
    for k in range(0,im_labels.shape[2]):
        im[im_labels[:,:,k],0] = colours[k,0]
        im[im_labels[:,:,k],1] = colours[k,1]
        im[im_labels[:,:,k],2] = colours[k,2]
    plt.imshow(im)
    for k,contour in enumerate(contours_mwi): plt.plot(contour[:, 1], contour[:, 0], linewidth=2, color='k', linestyle='--')
    plt.title(satname + '     ' + metadata[satname]['dates'][i].strftime('%Y-%m-%d') + '      acc : ' + str(metadata[satname]['acc_georef'][i]) + ' m' )
    plt.subplot(122)
    plt.axis('equal')
    plt.axis('off')
    plt.plot(refpoints[:,0], refpoints[:,1], 'k.')
    plt.plot(wl_final[:,0], wl_final[:,1], 'r.')
    mng = plt.get_current_fig_manager()
    mng.window.showMaximized()
    plt.tight_layout()
    plt.draw()

    pt_in = np.array(ginput(n=1, timeout=1000))
    plt.close()

    # if image is rejected, skip it
    if pt_in[0][1] > nrows/2:
        print('skip ' + str(i) + ' - rejected')
        idx_skipped.append(i)
        continue

    # if accepted, store the data
    cloud_cover_ts.append(cloud_cover)
    acc_georef_ts.append(metadata[satname]['acc_georef'][i])

    filename_ts.append(filenames[i])
    satname_ts.append(satname)
    date_acquired_ts.append(filenames[i][:10])

    timestamp.append(metadata[satname]['dates'][i])
    shorelines.append(wl_final)

# store in output structure
output[satname] = {'dates':timestamp, 'shorelines':shorelines, 'idx_skipped':idx_skipped,
      'metadata':{'filenames':filename_ts, 'satname':satname_ts, 'cloud_cover':cloud_cover_ts,
                  'acc_georef':acc_georef_ts}}

del idx_skipped

#==========================================================#
#==========================================================#
#==========================================================#
#==========================================================#

#%%
# save output
with open(os.path.join(filepath, sitename + '_output' + '.pkl'), 'wb') as f:
    pickle.dump(output, f)
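
# a minimal sketch of how the saved output could be read back in a later session
# (assumes the same 'data/<sitename>' folder layout used above):
#with open(os.path.join(filepath, sitename + '_output' + '.pkl'), 'rb') as f:
#    output = pickle.load(f)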

# save idx_skipped
#idx_skipped = dict([])
#for satname in list(output.keys()):
#    idx_skipped[satname] = output[satname]['idx_skipped']
#with open(os.path.join(filepath, sitename + '_idxskipped' + '.pkl'), 'wb') as f:
#    pickle.dump(idx_skipped, f)

@ -1,382 +0,0 @@
# -*- coding: utf-8 -*-

#==========================================================#
# Compare Narrabeen SDS with 3D quadbike surveys
#==========================================================#

# Initial settings

import os
import numpy as np
import matplotlib.pyplot as plt
import pdb
import ee

import matplotlib.dates as mdates
import matplotlib.cm as cm
from datetime import datetime, timedelta
import pickle
import pytz
import scipy.io as sio
import scipy.interpolate as interpolate
import statsmodels.api as sm
import skimage.measure as measure

# my functions
import functions.utils as utils

# some settings
np.seterr(all='ignore') # raise/ignore divisions by 0 and nans
plt.rcParams['axes.grid'] = True
plt.rcParams['figure.max_open_warning'] = 100

au_tz = pytz.timezone('Australia/Sydney')

# load quadbike dates and convert from datenum to datetime
filename = os.path.join('data', 'quadbike', 'survey_dates.mat')
filepath = os.path.join(os.getcwd(), filename)
dates_quad = sio.loadmat(filepath)['dates'] # matrix containing year, month, day
dates_quad = [datetime(dates_quad[i,0], dates_quad[i,1], dates_quad[i,2],
                       tzinfo=au_tz) for i in range(dates_quad.shape[0])]

# load timestamps from satellite images
satname = 'L8'
sitename = 'NARRA'
filepath = os.path.join(os.getcwd(), 'data', satname, sitename)
with open(os.path.join(filepath, sitename + '_output_new' + '.pkl'), 'rb') as f:
    output = pickle.load(f)

dates_l8 = output['t']
# convert to AEST
dates_l8 = [_.astimezone(au_tz) for _ in dates_l8]
# remove duplicates
dates_l8_str = [_.strftime('%Y%m%d') for _ in dates_l8]
dupl = utils.duplicates_dict(dates_l8_str)
idx_remove = []
for k,v in dupl.items():

    idx1 = v[0]
    idx2 = v[1]

    c1 = output['cloud_cover'][idx1]
    c2 = output['cloud_cover'][idx2]
    g1 = output['acc_georef'][idx1]
    g2 = output['acc_georef'][idx2]

    if c1 < c2 - 0.01:
        idx_remove.append(idx2)
    elif g1 < g2 - 0.1:
        idx_remove.append(idx2)
    else:
        idx_remove.append(idx1)
idx_remove = sorted(idx_remove)
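# for each pair of images acquired on the same day, the scene with the lower cloud cover is kept;
# if cloud cover is effectively tied, the one with the better georeferencing accuracy wins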
idx_all = np.linspace(0,70,71)
idx_keep = list(np.where(~np.isin(idx_all,idx_remove))[0])
output['t'] = [output['t'][k] for k in idx_keep]
output['shorelines'] = [output['shorelines'][k] for k in idx_keep]
output['cloud_cover'] = [output['cloud_cover'][k] for k in idx_keep]
output['acc_georef'] = [output['acc_georef'][k] for k in idx_keep]
# convert to AEST
dates_l8 = output['t']
dates_l8 = [_.astimezone(au_tz) for _ in dates_l8]

# load wave data (already AEST)
filename = os.path.join('data', 'wave', 'SydneyProcessed.mat')
filepath = os.path.join(os.getcwd(), filename)
wave_data = sio.loadmat(filepath)
idx = utils.find_indices(wave_data['dates'][:,0], lambda e: e >= dates_l8[0].year and e <= dates_l8[-1].year)
hsig = np.array([wave_data['Hsig'][i][0] for i in idx])
wdir = np.array([wave_data['Wdir'][i][0] for i in idx])
dates_wave = [datetime(wave_data['dates'][i,0], wave_data['dates'][i,1],
                       wave_data['dates'][i,2], wave_data['dates'][i,3],
                       wave_data['dates'][i,4], wave_data['dates'][i,5],
                       tzinfo=au_tz) for i in idx]

# load tide data (already AEST)
filename = 'SydTideData.mat'
filepath = os.path.join(os.getcwd(), 'data', 'tide', filename)
tide_data = sio.loadmat(filepath)
idx = utils.find_indices(tide_data['dates'][:,0], lambda e: e >= dates_l8[0].year and e <= dates_l8[-1].year)
tide = np.array([tide_data['tide'][i][0] for i in idx])
dates_tide = [datetime(tide_data['dates'][i,0], tide_data['dates'][i,1],
                       tide_data['dates'][i,2], tide_data['dates'][i,3],
                       tide_data['dates'][i,4], tide_data['dates'][i,5],
                       tzinfo=au_tz) for i in idx]

#%% make a plot of all the dates with wave data
orange = [255/255,140/255,0]
blue = [0,191/255,255/255]
f = plt.figure()
months = mdates.MonthLocator()
month_fmt = mdates.DateFormatter('%b %Y')
days = mdates.DayLocator()
years = [2013,2014,2015,2016]
for k in range(len(years)):
    sel_year = years[k]
    ax = plt.subplot(4,1,k+1)
    idx_year = utils.find_indices(dates_wave, lambda e : e.year >= sel_year and e.year <= sel_year)
    plt.plot([dates_wave[i] for i in idx_year], [hsig[i] for i in idx_year], 'k-', linewidth=0.5)
    hsigmax = np.nanmax([hsig[i] for i in idx_year])
    cbool = True
    for j in range(len(dates_quad)):
        if dates_quad[j].year == sel_year:
            if cbool:
                plt.plot([dates_quad[j], dates_quad[j]], [0, hsigmax], color=orange, label='survey')
                cbool = False
            else:
                plt.plot([dates_quad[j], dates_quad[j]], [0, hsigmax], color=orange)
    cbool = True
    for j in range(len(dates_l8)):
        if dates_l8[j].year == sel_year:
            if cbool:
                plt.plot([dates_l8[j], dates_l8[j]], [0, hsigmax], color=blue, label='landsat8')
                cbool = False
            else:
                plt.plot([dates_l8[j], dates_l8[j]], [0, hsigmax], color=blue)
    if k == 3:
        plt.legend()
    plt.xlim((datetime(sel_year,1,1), datetime(sel_year,12,31, tzinfo=au_tz)))
    plt.ylim((0, hsigmax))
    plt.ylabel('Hs [m]')
    ax.xaxis.set_major_locator(months)
    ax.xaxis.set_major_formatter(month_fmt)
f.subplots_adjust(hspace=0.2)
plt.draw()

#%% calculate difference between dates (quad and sat)
diff_days = [ [(x - _).days for _ in dates_quad] for x in dates_l8]
max_diff = 14
idx_closest = [utils.find_indices(_, lambda e: abs(e) <= max_diff) for _ in diff_days]
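# idx_closest[i] lists, for satellite date i, the quadbike surveys acquired within max_diff (14) days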
# store in dates_diff dictionary
dates_diff = []
cloud_cover = []
for i in range(len(idx_closest)):
    if not idx_closest[i]:
        continue
    elif len(idx_closest[i]) > 1:
        idx_best = np.argmin(np.abs([diff_days[i][_] for _ in idx_closest[i]]))
        dates_temp = [dates_quad[_] for _ in idx_closest[i]]
        days_temp = [diff_days[i][_] for _ in idx_closest[i]]
        dates_diff.append({"date sat": dates_l8[i],
                           "date quad": dates_temp[idx_best],
                           "days diff": days_temp[idx_best]})
    else:
        dates_diff.append({"date sat": dates_l8[i],
                           "date quad": dates_quad[idx_closest[i][0]],
                           "days diff": diff_days[i][idx_closest[i][0]]
                           })
    # store cloud data
    cloud_cover.append(output['cloud_cover'][i])

# store wave data
wave_hsig = []
for i in range(len(dates_diff)):
    wave_hsig.append(hsig[np.argmin(np.abs(np.array([(dates_diff[i]['date sat'] - _).total_seconds() for _ in dates_wave])))])

# make a plot
plt.figure()
counter = 0
for i in range(len(dates_diff)):
    counter = counter + 1
    if dates_diff[i]['date quad'] > dates_diff[i]['date sat']:
        date_min = dates_diff[i]['date sat']
        date_max = dates_diff[i]['date quad']
        color1 = orange
        color2 = blue
    else:
        date_min = dates_diff[i]['date quad']
        date_max = dates_diff[i]['date sat']
        color1 = blue
        color2 = orange
    idx_t = utils.find_indices(dates_wave, lambda e : e >= date_min and e <= date_max)
    hsigmax = np.nanmax([hsig[i] for i in idx_t])
    hsigmin = np.nanmin([hsig[i] for i in idx_t])
    if counter > 9:
        counter = 1
        plt.figure()
    ax = plt.subplot(3,3,counter)
    plt.plot([dates_wave[i] for i in idx_t], [hsig[i] for i in idx_t], 'k-', linewidth=1.5)
    plt.plot([date_min, date_min], [0, 4.5], color=color2, label='survey')
    plt.plot([date_max, date_max], [0, 4.5], color=color1, label='landsat8')
    plt.ylabel('Hs [m]')
    ax.xaxis.set_major_locator(mdates.DayLocator(tz=au_tz))
    ax.xaxis.set_minor_locator(mdates.HourLocator(tz=au_tz))
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%d'))
    ax.xaxis.set_minor_locator(months)
    plt.title(dates_diff[i]['date sat'].strftime('%b %Y') + '   (' + str(abs(dates_diff[i]['days diff'])) + ' days)')
    plt.draw()
    plt.gcf().subplots_adjust(hspace=0.5)

# mean day difference
np.mean([ np.abs(_['days diff']) for _ in dates_diff])

#%% Compare shorelines in elevation

dist_buffer = 50 # buffer of points selected for interpolation

# load quadbike .mat files
foldername = os.path.join('data', 'quadbike', 'surveys3D')
folderpath = os.path.join(os.getcwd(), foldername)
filenames = os.listdir(folderpath)

# get the satellite shorelines
sl = output['shorelines']

# get dates from filenames
dates_quad = [datetime(int(_[6:10]), int(_[11:13]), int(_[14:16]), tzinfo= au_tz) for _ in filenames]

zav = []
ztide = []
sl_gt = []
for i in range(len(dates_diff)):

    sl_smooth = sl[i]

    # select closest 3D survey and load .mat file
    idx_closest = np.argmin(np.abs(np.array([(dates_diff[i]['date sat'] - _).days for _ in dates_quad])))
    survey3d = sio.loadmat(os.path.join(folderpath, filenames[idx_closest]))
    # reshape to a vector
    xs = survey3d['x'].reshape(survey3d['x'].shape[0] * survey3d['x'].shape[1])
    ys = survey3d['y'].reshape(survey3d['y'].shape[0] * survey3d['y'].shape[1])
    zs = survey3d['z'].reshape(survey3d['z'].shape[0] * survey3d['z'].shape[1])
    # remove nan values
    idx_nan = np.isnan(zs)
    xs = xs[~idx_nan]
    ys = ys[~idx_nan]
    zs = zs[~idx_nan]

    # find water level at the time the image was acquired
    idx_closest = np.argmin(np.abs(np.array([(dates_diff[i]['date sat'] - _).total_seconds() for _ in dates_tide])))
    tide_level = tide[idx_closest]
    ztide.append(tide_level)

    # find contour corresponding to the water level on 3D surface (if below minimum, add 0.05m increments)
    if tide_level < np.nanmin(survey3d['z']):
        tide_level = np.nanmin(survey3d['z'])
        sl_tide = measure.find_contours(survey3d['z'], tide_level)
        sl_tide = sl_tide[np.argmax(np.array([len(_) for _ in sl_tide]))]
        count = 0
        while len(sl_tide) < 900:
            count = count + 1
            tide_level = tide_level + 0.05*count
            sl_tide = measure.find_contours(survey3d['z'], tide_level)
            sl_tide = sl_tide[np.argmax(np.array([len(_) for _ in sl_tide]))]
        print('added ' + str(0.05*count) + ' m   -  contour with ' + str(len(sl_tide)) + ' points')
    else:
        sl_tide = measure.find_contours(survey3d['z'], tide_level)
        sl_tide = sl_tide[np.argmax(np.array([len(_) for _ in sl_tide]))]
    # remove nans
    if np.any(np.isnan(sl_tide)):
        index_nan = np.where(np.isnan(sl_tide))[0]
        sl_tide = np.delete(sl_tide, index_nan, axis=0)
    # get x,y coordinates
    xtide = [survey3d['x'][int(np.round(sl_tide[m,0])), int(np.round(sl_tide[m,1]))] for m in range(sl_tide.shape[0])]
    ytide = [survey3d['y'][int(np.round(sl_tide[m,0])), int(np.round(sl_tide[m,1]))] for m in range(sl_tide.shape[0])]
    sl_gt.append(np.transpose(np.array([np.array(xtide), np.array(ytide)])))
    # interpolate SDS on 3D surface to get elevation (point by point)
    zq = []
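    # for every SDS point, the survey points within dist_buffer are used to fit a local bivariate
    # spline (bisplrep) which is then evaluated at the SDS point to estimate its elevation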
    for j in range(sl_smooth.shape[0]):
        xq = sl_smooth[j,0]
        yq = sl_smooth[j,1]
        dist_q = np.linalg.norm(np.transpose(np.array([[xq - _ for _ in xs],[yq - _ for _ in ys]])), axis=1)
        idx_buffer = dist_q <= dist_buffer
        if sum(idx_buffer) > 0:
            tck = interpolate.bisplrep(xs[idx_buffer], ys[idx_buffer], zs[idx_buffer])
            zq.append(interpolate.bisplev(xq, yq, tck))

    zq = np.array(zq)
    plt.figure()
    plt.hist(zq, bins=100)
    plt.draw()
#        plt.figure()
#        plt.axis('equal')
#        plt.scatter(xs, ys, s=10, c=zs, marker='o', cmap=cm.get_cmap('jet'),
#                    label='quad data')
#        plt.plot(xs[idx_buffer], ys[idx_buffer], 'ko')
#        plt.plot(xq,yq,'ro')
#        plt.draw()

    # store the alongshore median elevation
    zav.append(np.median(utils.reject_outliers(zq, m=2)))
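    # zav[i] is the outlier-filtered median SDS elevation for this image; it is compared below
    # to ztide[i], the tide level measured at the acquisition time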

    # make plot
    red = [255/255, 0, 0]
    gray = [0.75, 0.75, 0.75]
    plt.figure()
    plt.subplot(121)
    plt.axis('equal')
    plt.scatter(xs, ys, s=10, c=zs, marker='o', cmap=cm.get_cmap('jet'),
            label='3D survey')
    plt.plot(xtide, ytide, '--', color=gray, linewidth=2.5, label='tide level contour')
    plt.plot(sl_smooth[:,0], sl_smooth[:,1], '-', color=red, linewidth=2.5, label='SDS')
#    plt.plot(sl[i][idx_beach,0], sl[i][idx_beach,1], 'w-', linewidth=2)
    plt.xlabel('Eastings [m]')
    plt.ylabel('Northings [m]')
    plt.title('Shoreline comparison')
    plt.colorbar(label='mAHD')
    plt.legend()
    plt.ylim((6266100, 6267000))
    plt.subplot(122)
    plt.plot(np.linspace(0,1,len(zq)), zq, 'ko-', markersize=5)
    plt.plot([0, 1], [zav[i], zav[i]], 'r-', label='median')
    plt.plot([0, 1], [ztide[i], ztide[i]], 'g--', label = 'measured tide')
    plt.xlabel('normalised alongshore distance')
    plt.ylabel('Elevation [mAHD]')
    plt.title('Alongshore SDS elevation')
    plt.legend()
    mng = plt.get_current_fig_manager()
    mng.window.showMaximized()
    plt.tight_layout()
    plt.draw()

    print(i)

#%% Calculate some error statistics
zav = np.array(zav)
ztide = np.array(ztide)

f = plt.figure()
plt.subplot(3,1,1)
plt.bar(np.linspace(1,len(zav),len(zav)), zav-ztide)
plt.ylabel('Error in z [m]')
plt.title('Elevation error')
plt.xticks([])
plt.draw()

plt.subplot(3,1,2)
plt.bar(np.linspace(1,len(zav),len(zav)), wave_hsig, color=orange)
plt.ylabel('Hsig [m]')
plt.xticks([])
plt.draw()

plt.subplot(3,1,3)
plt.bar(np.linspace(1,len(zav),len(zav)), np.array(cloud_cover)*100, color='g')
plt.ylabel('Cloud cover %')
plt.xlabel('comparison #')
plt.grid(False)
plt.grid(axis='y')
f.subplots_adjust(hspace=0)
plt.draw()

np.sqrt(np.mean((zav - ztide)**2))
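# root-mean-square error between the SDS elevations and the measured tide levels; assigning it
# to a variable (hypothetical tweak) would make the result visible outside an interactive run, e.g.:
#rmse_z = np.sqrt(np.mean((zav - ztide)**2))
#print('RMSE in z: ' + str(rmse_z) + ' m')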



#%% plot to show LOWESS smoothing
#i = 0
#idx_beach = [np.min(np.linalg.norm(sl[i][k,:] - narrabeach, axis=1)) < dist_thresh for k in range(sl[i].shape[0])]
#x = sl[i][idx_beach,0]
#y = sl[i][idx_beach,1]
#sl_smooth = lowess(x,y, frac=1./10, it = 10)
#
#plt.figure()
#plt.axis('equal')
#plt.scatter
#plt.plot(x,y,'bo', linewidth=2, label='original SDS')
#plt.plot(sl_smooth[:,1], sl_smooth[:,0], 'ro', linewidth=2, label='smoothed SDS')
#plt.legend()
#plt.xlabel('Eastings [m]')
#plt.ylabel('Northings [m]')
#plt.title('Local weighted scatterplot smoothing (LOWESS)')
#plt.draw()