updated master

added download_images and read_images scripts
7 years ago · 62f5c5330f
parent cd74f6c39c
commit 62f5c5330f
7 changed files with 2032 additions and 556 deletions
--- a/download_images.py
+++ b/download_images.py
@ -0,0 +1,390 @@
 #==========================================================#
 #==========================================================#
 # Download L5, L7, L8, S2 images of a given area 
 #==========================================================#
 #==========================================================#
 #==========================================================#
 # Initial settings
 #==========================================================#
 import os
 import numpy as np
 import matplotlib.pyplot as plt
 import pdb
 import ee
 # other modules
 from osgeo import gdal, ogr, osr
 from urllib.request import urlretrieve
 import zipfile
 from datetime import datetime
 import pytz
 import pickle
 # import own modules
 import functions.utils as utils
 import functions.sds as sds
 np.seterr(all='ignore') # raise/ignore divisions by 0 and nans
 ee.Initialize()
 #==========================================================#
 # Location
 #==========================================================#
 ## location (Narrabeen-Collaroy beach)
 #polygon = [[[151.301454, -33.700754],
 #               [151.311453, -33.702075], 
 #               [151.307237, -33.739761],
 #               [151.294220, -33.736329],
 #               [151.301454, -33.700754]]]; 
 # location (Tairua beach)
 sitename = 'TAIRUA'
 polygon = [[[175.835574, -36.982022],
               [175.888220, -36.980680], 
               [175.893527, -37.029610],
               [175.833444, -37.031767],
               [175.835574, -36.982022]]];
 # initialise metadata dictionnary (stores timestamps and georefencing accuracy of each image)       
 metadata = dict([])
 # create directories
 try:
    os.makedirs(os.path.join(os.getcwd(), 'data',sitename))
 except:
    print('directory already exists')
 #%%
 #==========================================================#
 #==========================================================#
 # L5
 #==========================================================#
 #==========================================================#
 # define filenames for images
 suffix = '.tif'
 filepath = os.path.join(os.getcwd(), 'data', sitename, 'L5', '30m')
 try:
    os.makedirs(filepath)
 except:
    print('directory already exists')
 #==========================================================#
 # Select L5 collection 
 #==========================================================#
 satname = 'L5'
 input_col = ee.ImageCollection('LANDSAT/LT05/C01/T1_TOA')
 # filter by location
 flt_col = input_col.filterBounds(ee.Geometry.Polygon(polygon))
 n_img = flt_col.size().getInfo()
 print('Number of images covering ' + sitename, n_img)
 im_all = flt_col.getInfo().get('features')
 #==========================================================#
 # Main loop trough images
 #==========================================================#
 timestamps = []
 acc_georef = []
 all_names = []
 for i in range(n_img):
    # find each image in ee database
    im = ee.Image(im_all[i].get('id'))
    im_dic = im.getInfo()
    im_bands = im_dic.get('bands')
    t = im_dic['properties']['system:time_start']
    im_timestamp = datetime.fromtimestamp(t/1000, tz=pytz.utc)
    timestamps.append(im_timestamp)
    im_date = im_timestamp.strftime('%Y-%m-%d-%H-%M-%S')
    im_epsg = int(im_dic['bands'][0]['crs'][5:])
    try:
        acc_georef.append(im_dic['properties']['GEOMETRIC_RMSE_MODEL'])
    except:
        acc_georef.append(12)   
        print('No geometric rmse model property')
    # delete dimensions key from dictionnary, otherwise the entire image is extracted
    for j in range(len(im_bands)): del im_bands[j]['dimensions']
    # bands for L5
    ms_bands = [im_bands[0], im_bands[1], im_bands[2], im_bands[3], im_bands[4], im_bands[7]]
    # filenames
    filename = im_date + '_' + satname + '_' + sitename + suffix
    print(i)
    if any(filename in _ for _ in all_names):
        filename = im_date + '_' + satname + '_' + sitename + '_dup' + suffix 
    all_names.append(filename)
    local_data = sds.download_tif(im, polygon, ms_bands, filepath)
    os.rename(local_data, os.path.join(filepath, filename))
 # sort timestamps and georef accuracy (dowloaded images are sorted by date in directory)
 timestamps_sorted = sorted(timestamps)
 idx_sorted = sorted(range(len(timestamps)), key=timestamps.__getitem__)
 acc_georef_sorted = [acc_georef[j] for j in idx_sorted]
 metadata[satname] = {'dates':timestamps_sorted, 'acc_georef':acc_georef_sorted, 'epsg':im_epsg}
 #%%
 #==========================================================#
 #==========================================================#
 # L7&L8
 #==========================================================#
 #==========================================================#
 # define filenames for images
 suffix = '.tif'
 filepath = os.path.join(os.getcwd(), 'data', sitename, 'L7&L8')
 filepath_pan = os.path.join(filepath, 'pan')
 filepath_ms = os.path.join(filepath, 'ms')
 try:
    os.makedirs(filepath_pan)
    os.makedirs(filepath_ms)
 except:
    print('directory already exists')           
 #==========================================================#
 # Select L7 collection 
 #==========================================================#
 satname = 'L7'   
 input_col = ee.ImageCollection('LANDSAT/LE07/C01/T1_RT_TOA')
 # filter by location
 flt_col = input_col.filterBounds(ee.Geometry.Polygon(polygon))
 n_img = flt_col.size().getInfo()
 print('Number of images covering ' + sitename, n_img)
 im_all = flt_col.getInfo().get('features')
 #==========================================================#
 # Main loop trough images
 #==========================================================#
 timestamps = []
 acc_georef = []
 all_names = []
 for i in range(n_img):
    # find each image in ee database
    im = ee.Image(im_all[i].get('id'))
    im_dic = im.getInfo()
    im_bands = im_dic.get('bands')
    t = im_dic['properties']['system:time_start']
    im_timestamp = datetime.fromtimestamp(t/1000, tz=pytz.utc)
    timestamps.append(im_timestamp)
    im_date = im_timestamp.strftime('%Y-%m-%d-%H-%M-%S')
    im_epsg = int(im_dic['bands'][0]['crs'][5:])
    try:
        acc_georef.append(im_dic['properties']['GEOMETRIC_RMSE_MODEL'])
    except:
        acc_georef.append(12)   
        print('No geometric rmse model property')
    # delete dimensions key from dictionnary, otherwise the entire image is extracted
    for j in range(len(im_bands)): del im_bands[j]['dimensions']
    # bands for L7
    pan_band = [im_bands[8]]
    ms_bands = [im_bands[0], im_bands[1], im_bands[2], im_bands[3], im_bands[4], im_bands[9]]
    # filenames
    filename_pan = im_date + '_' + satname + '_' + sitename + '_pan' + suffix
    filename_ms = im_date + '_' + satname + '_' + sitename + '_ms' + suffix
    print(i)
    if any(filename_pan in _ for _ in all_names):
        filename_pan = im_date + '_' + satname + '_' + sitename + '_pan' + '_dup' + suffix
        filename_ms = im_date + '_' + satname + '_' + sitename + '_ms' + '_dup' + suffix 
    all_names.append(filename_pan)
    local_data_pan = sds.download_tif(im, polygon, pan_band, filepath_pan)
    os.rename(local_data_pan, os.path.join(filepath_pan, filename_pan))
    local_data_ms = sds.download_tif(im, polygon, ms_bands, filepath_ms)
    os.rename(local_data_ms, os.path.join(filepath_ms, filename_ms))
 #==========================================================#
 # Select L8 collection 
 #==========================================================#
 satname = 'L8'
 input_col = ee.ImageCollection('LANDSAT/LC08/C01/T1_RT_TOA')
 # filter by location
 flt_col = input_col.filterBounds(ee.Geometry.Polygon(polygon))
 n_img = flt_col.size().getInfo()
 print('Number of images covering Narrabeen:', n_img)
 im_all = flt_col.getInfo().get('features')
 #==========================================================#
 # Main loop trough images
 #==========================================================#
 for i in range(n_img):
    # find each image in ee database
    im = ee.Image(im_all[i].get('id'))
    im_dic = im.getInfo()
    im_bands = im_dic.get('bands')
    t = im_dic['properties']['system:time_start']
    im_timestamp = datetime.fromtimestamp(t/1000, tz=pytz.utc)
    timestamps.append(im_timestamp)
    im_date = im_timestamp.strftime('%Y-%m-%d-%H-%M-%S')
    im_epsg = int(im_dic['bands'][0]['crs'][5:])
    try:
        acc_georef.append(im_dic['properties']['GEOMETRIC_RMSE_MODEL'])
    except:
        acc_georef.append(12)   
        print('No geometric rmse model property')
    # delete dimensions key from dictionnary, otherwise the entire image is extracted
    for j in range(len(im_bands)): del im_bands[j]['dimensions']
    # bands for L8    
    pan_band = [im_bands[7]]
    ms_bands = [im_bands[1], im_bands[2], im_bands[3], im_bands[4], im_bands[5], im_bands[11]]
    # filenames
    filename_pan = im_date + '_' + satname + '_' + sitename + '_pan' + suffix
    filename_ms = im_date + '_' + satname + '_' + sitename + '_ms' + suffix
    print(i)
    if any(filename_pan in _ for _ in all_names):
        filename_pan = im_date + '_' + satname + '_' + sitename + '_pan' + '_dup' + suffix
        filename_ms = im_date + '_' + satname + '_' + sitename + '_ms' + '_dup' + suffix 
    all_names.append(filename_pan)
    local_data_pan = sds.download_tif(im, polygon, pan_band, filepath_pan)
    os.rename(local_data_pan, os.path.join(filepath_pan, filename_pan))
    local_data_ms = sds.download_tif(im, polygon, ms_bands, filepath_ms)
    os.rename(local_data_ms, os.path.join(filepath_ms, filename_ms))
 # sort timestamps and georef accuracy (dowloaded images are sorted by date in directory)
 timestamps_sorted = sorted(timestamps)
 idx_sorted = sorted(range(len(timestamps)), key=timestamps.__getitem__)
 acc_georef_sorted = [acc_georef[j] for j in idx_sorted]
 metadata[satname] = {'dates':timestamps_sorted, 'acc_georef':acc_georef_sorted, 'epsg':im_epsg}
 #%%
 #==========================================================#
 #==========================================================#
 # S2
 #==========================================================#
 #==========================================================#
 # define filenames for images
 suffix = '.tif'
 filepath = os.path.join(os.getcwd(), 'data', sitename, 'S2')
 try:
    os.makedirs(os.path.join(filepath, '10m'))
    os.makedirs(os.path.join(filepath, '20m'))
    os.makedirs(os.path.join(filepath, '60m'))
 except:
    print('directory already exists') 
 #==========================================================#
 # Select L2 collection 
 #==========================================================#
 satname = 'S2'
 input_col = ee.ImageCollection('COPERNICUS/S2')
 # filter by location
 flt_col = input_col.filterBounds(ee.Geometry.Polygon(polygon))
 n_img = flt_col.size().getInfo()
 print('Number of images covering ' + sitename, n_img)
 im_all = flt_col.getInfo().get('features') 
 #==========================================================#
 # Main loop trough images
 #==========================================================#
 timestamps = []
 acc_georef = []
 all_names = []
 for i in range(n_img):
    # find each image in ee database
    im = ee.Image(im_all[i].get('id'))
    im_dic = im.getInfo()
    im_bands = im_dic.get('bands')
    t = im_dic['properties']['system:time_start']
    im_timestamp = datetime.fromtimestamp(t/1000, tz=pytz.utc)
    im_date = im_timestamp.strftime('%Y-%m-%d-%H-%M-%S')
    timestamps.append(im_timestamp)
    im_epsg = int(im_dic['bands'][0]['crs'][5:])
    try:
        if im_dic['properties']['GEOMETRIC_QUALITY_FLAG'] == 'PASSED':
            acc_georef.append(1)
        else:
            acc_georef.append(0)
    except:
        acc_georef.append(0)
    # delete dimensions key from dictionnary, otherwise the entire image is extracted
    for j in range(len(im_bands)): del im_bands[j]['dimensions']
    # bands for S2
    bands10 = [im_bands[1], im_bands[2], im_bands[3], im_bands[7]]
    bands20 = [im_bands[11]]
    bands60 = [im_bands[15]]
    # filenames
    filename10 = im_date + '_' + satname + '_' + sitename + '_' + '10m' + suffix
    filename20 = im_date + '_' + satname + '_' + sitename + '_' + '20m' + suffix
    filename60 = im_date + '_' + satname + '_' + sitename + '_' + '60m' + suffix
    print(i)
    if any(filename10 in _ for _ in all_names):
        filename10 = im_date + '_' + satname + '_' + sitename + '_' + '10m' + '_dup' + suffix
        filename20 = im_date + '_' + satname + '_' + sitename + '_' + '20m' + '_dup' + suffix
        filename60 = im_date + '_' + satname + '_' + sitename + '_' + '60m' + '_dup' + suffix
    all_names.append(filename10)
    local_data = sds.download_tif(im, polygon, bands10, filepath)
    os.rename(local_data, os.path.join(filepath, '10m', filename10))
    local_data = sds.download_tif(im, polygon, bands20, filepath)
    os.rename(local_data, os.path.join(filepath, '20m', filename20))
    local_data = sds.download_tif(im, polygon, bands60, filepath)
    os.rename(local_data, os.path.join(filepath, '60m', filename60))
 # sort timestamps and georef accuracy (dowloaded images are sorted by date in directory)
 timestamps_sorted = sorted(timestamps)
 idx_sorted = sorted(range(len(timestamps)), key=timestamps.__getitem__)
 acc_georef_sorted = [acc_georef[j] for j in idx_sorted]
 metadata[satname] = {'dates':timestamps_sorted, 'acc_georef':acc_georef_sorted, 'epsg':im_epsg}
 #%% save metadata
 filepath = os.path.join(os.getcwd(), 'data', sitename)
 with open(os.path.join(filepath, sitename + '_metadata' + '.pkl'), 'wb') as f:
    pickle.dump(metadata, f)  
--- a/functions/NeuralNet_classif_nopan.pkl
+++ b/functions/NeuralNet_classif_nopan.pkl
--- a/functions/data_analysis.py
+++ b/functions/data_analysis.py
@ -0,0 +1,432 @@
 """This module contains all the functions needed for data analysis """
 # Initial settings
 import numpy as np
 import matplotlib.pyplot as plt
 import matplotlib.patches as mpatches
 from matplotlib import gridspec
 import pdb
 import ee
 # other modules
 from osgeo import gdal, ogr, osr
 import scipy.interpolate as interpolate
 import scipy.stats as sstats
 # image processing modules
 import skimage.filters as filters 
 import skimage.exposure as exposure
 import skimage.transform as transform
 import sklearn.decomposition as decomposition
 import skimage.measure as measure
 import skimage.morphology as morphology
 # machine learning modules
 from sklearn.cluster import KMeans
 from sklearn.neural_network import MLPClassifier
 from sklearn.externals import joblib
 import time
 # import own modules
 import functions.utils as utils
 def get_tide(dates_sds, dates_tide, tide_level):
    tide = []
    for i in range(len(dates_sds)):
        dates_diff = np.abs(np.array([ (dates_sds[i] - _).total_seconds() for _ in dates_tide]))
        if np.min(dates_diff) <= 1800: # half-an-hour
            idx_closest = np.argmin(dates_diff)
            tide.append(tide_level[idx_closest])
        else:
            tide.append(np.nan)
    tide = np.array(tide)
    return tide
 def remove_duplicates(output, satname):
    " removes duplicates from output structure, keep the one with less cloud cover or best georeferencing "
    dates = output['dates']
    dates_str = [_.strftime('%Y%m%d') for _ in dates]
    dupl = utils.duplicates_dict(dates_str)
    if dupl:
        output_nodup = dict([])
        idx_remove = []
        if satname == 'L8' or satname == 'L5':
            for k,v in dupl.items():
                idx1 = v[0]
                idx2 = v[1]
                c1 = output['metadata']['cloud_cover'][idx1]
                c2 = output['metadata']['cloud_cover'][idx2]
                g1 = output['metadata']['acc_georef'][idx1]
                g2 = output['metadata']['acc_georef'][idx2]
                if c1 < c2 - 0.01:
                    idx_remove.append(idx2)
                elif g1 < g2 - 0.1:
                    idx_remove.append(idx2)
                else:
                    idx_remove.append(idx1)
        else:
            for k,v in dupl.items():
                idx1 = v[0]
                idx2 = v[1]
                c1 = output['metadata']['cloud_cover'][idx1]
                c2 = output['metadata']['cloud_cover'][idx2]
                if c1 < c2 - 0.01:
                    idx_remove.append(idx2)
                else:
                    idx_remove.append(idx1)
        idx_remove = sorted(idx_remove)
        idx_all = np.linspace(0, len(dates_str)-1, len(dates_str))
        idx_keep = list(np.where(~np.isin(idx_all,idx_remove))[0])        
        output_nodup['dates'] = [output['dates'][k] for k in idx_keep]
        output_nodup['shorelines'] = [output['shorelines'][k] for k in idx_keep]
        output_nodup['metadata'] = dict([])
        for key in list(output['metadata'].keys()):
            output_nodup['metadata'][key] = [output['metadata'][key][k] for k in idx_keep]
        print(satname + ' : ' + str(len(idx_remove)) + ' duplicates')
        return output_nodup
    else: 
        print(satname + ' : ' + 'no duplicates')
        return output
 def merge(output):
    " merges data from the different satellites "
    # stack all list together under one key
    output_all = {'dates':[], 'shorelines':[],
                  'metadata':{'filenames':[], 'satname':[], 'cloud_cover':[], 'acc_georef':[]}}
    for satname in list(output.keys()):
        output_all['dates'] = output_all['dates'] + output[satname]['dates']
        output_all['shorelines'] = output_all['shorelines'] + output[satname]['shorelines']
        for key in list(output[satname]['metadata'].keys()):
            output_all['metadata'][key]  = output_all['metadata'][key] + output[satname]['metadata'][key]   
    output_all_sorted = {'dates':[], 'shorelines':[],
                         'metadata':{'filenames':[], 'satname':[], 'cloud_cover':[], 'acc_georef':[]}}
    # sort the dates
    idx_sorted = sorted(range(len(output_all['dates'])), key=output_all['dates'].__getitem__)
    output_all_sorted['dates'] = [output_all['dates'][i] for i in idx_sorted]
    output_all_sorted['shorelines'] = [output_all['shorelines'][i] for i in idx_sorted]
    for key in list(output_all['metadata'].keys()):
        output_all_sorted['metadata'][key] = [output_all['metadata'][key][i] for i in idx_sorted]
    return output_all_sorted
 def create_transects(x0, y0, orientation, chainage_length):
    " creates shore-normal transects "
    transects = []
    for k in range(len(x0)):
        # orientation of cross-shore profile
        phi = (90 - orientation[k])*np.pi/180
        # create a vector using the chainage length
        x = np.linspace(0,chainage_length,chainage_length+1)
        y = np.zeros(len(x))
        coords = np.zeros((len(x),2))
        coords[:,0] = x
        coords[:,1] = y
        # translate and rotate the vector using the origin and orientation
        tf = transform.EuclideanTransform(rotation=phi, translation=(x0[k],y0[k]))
        coords_tf = tf(coords)
        transects.append(coords_tf)
    return transects
 def calculate_chainage(sds, transects, orientation, along_dist):
    " intersect SDS with transect and compute chainage position "
    chainage_mtx = np.zeros((len(sds),len(transects),6))
    for i in range(len(sds)):
        sl = sds[i]
        for j in range(len(transects)): 
            # compute rotation matrix
            X0 = transects[j][0,0]
            Y0 = transects[j][0,1]
            phi = (90 - orientation[j])*np.pi/180
            Mrot = np.array([[np.cos(phi), np.sin(phi)],[-np.sin(phi), np.cos(phi)]])
            # calculate point to line distance between shoreline points and profile
            p1 = np.array([X0,Y0])
            p2 = transects[j][-1,:]
            p3 = sl
            d = np.abs(np.cross(p2-p1,p3-p1)/np.linalg.norm(p2-p1))
            idx_close = utils.find_indices(d, lambda e: e <= along_dist)
            # check if there are SDS points around the profile or not 
            if not idx_close:
                chainage_mtx[i,j,:] = np.tile(np.nan,(1,6))
            else:
                # change of base to shore-normal coordinate system
                xy_close = np.array([sl[idx_close,0],sl[idx_close,1]]) - np.tile(np.array([[X0],[Y0]]), (1,len(sl[idx_close])))
                xy_rot = np.matmul(Mrot, xy_close)
                # put nan values if the chainage is negative (MAKE SURE TO PICK ORIGIN CORRECTLY)
                if np.any(xy_rot[0,:] < 0):
                    xy_rot[0,np.where(xy_rot[0,:] < 0)] = np.nan
                # compute mean, median max and std of chainage position
                n_points = len(xy_rot[0,:])
                mean_cross = np.nanmean(xy_rot[0,:])
                median_cross = np.nanmedian(xy_rot[0,:])
                max_cross = np.nanmax(xy_rot[0,:])
                min_cross = np.nanmin(xy_rot[0,:])
                std_cross = np.nanstd(xy_rot[0,:])
                if std_cross > 10: # if large std, take the most seaward point
                    mean_cross = max_cross
                    median_cross = max_cross
                    min_cross = max_cross
                # store the statistics
                chainage_mtx[i,j,:] = np.array([mean_cross, median_cross, max_cross,
                            min_cross, n_points, std_cross])   
    # format into dictionnary
    chainage = dict([])
    chainage['mean'] = chainage_mtx[:,:,0]
    chainage['median'] = chainage_mtx[:,:,1]
    chainage['max'] = chainage_mtx[:,:,2]
    chainage['min'] = chainage_mtx[:,:,3]
    chainage['npoints'] = chainage_mtx[:,:,4]
    chainage['std'] = chainage_mtx[:,:,5]
    return chainage
 def compare_sds(dates_sds, chain_sds, topo_profiles, mod=0, mindays=5):
    """
    Compare sds with groundtruth data from topographic surveys / argus shorelines
    KV WRL 2018
    Arguments:
    -----------
        dates_sds: list
            list of dates corresponding to each row in chain_sds
        chain_sds: np.ndarray
            array with time series of chainage for each transect (each transect is one column)
        topo_profiles: dict
            dict containing the dates and chainage of the groundtruth
        mod: 0 or 1
            0 for linear interpolation between 2 closest surveys, 1 for only nearest neighbour
        min_days: int
            minimum number of days for which the data can be compared     
    Returns:    -----------
        stats: dict
            contains all the statistics of the comparison
    """       
    # create 3 figures       
    fig1 = plt.figure()
    gs1 = gridspec.GridSpec(chain_sds.shape[1], 1)
    fig2 = plt.figure()
    gs2 = gridspec.GridSpec(2, chain_sds.shape[1])
    fig3 = plt.figure()
    gs3 = gridspec.GridSpec(2,1)
    dates_sds_num = np.array([_.toordinal() for _ in dates_sds])
    stats = dict([])
    data_fin = dict([])
    # for each transect compare and plot the data
    for i in range(chain_sds.shape[1]):
        pfname = list(topo_profiles.keys())[i]
        stats[pfname] = dict([])
        data_fin[pfname] = dict([])
        dates_sur = topo_profiles[pfname]['dates']
        chain_sur = topo_profiles[pfname]['chainage']
        # convert to datenum
        dates_sur_num = np.array([_.toordinal() for _ in dates_sur])
        chain_sur_interp = []
        diff_days = []
        for j, satdate in enumerate(dates_sds_num):
            temp_diff = satdate - dates_sur_num
            if mod==0:
                # select measurement before and after sat image date and interpolate
                ind_before = np.where(temp_diff == temp_diff[temp_diff > 0][-1])[0]     
                if ind_before == len(temp_diff)-1:
                    chain_sur_interp.append(np.nan)
                    diff_days.append(np.abs(satdate-dates_sur_num[ind_before])[0])
                    continue         
                ind_after = np.where(temp_diff == temp_diff[temp_diff < 0][0])[0]            
                tempx = np.zeros(2)
                tempx[0] = dates_sur_num[ind_before]
                tempx[1] = dates_sur_num[ind_after]
                tempy = np.zeros(2)
                tempy[0] = chain_sur[ind_before]
                tempy[1] = chain_sur[ind_after]
                diff_days.append(np.abs(np.max([satdate-tempx[0], satdate-tempx[1]])))                
                # interpolate
                f = interpolate.interp1d(tempx, tempy)
                chain_sur_interp.append(f(satdate))
            elif mod==1:
                # select the closest measurement
                idx_closest = utils.find_indices(np.abs(temp_diff), lambda e: e == np.min(np.abs(temp_diff)))[0]
                diff_days.append(np.abs(satdate-dates_sur_num[idx_closest]))
                if diff_days[j] > mindays:
                    chain_sur_interp.append(np.nan)
                else:
                    chain_sur_interp.append(chain_sur[idx_closest])
        chain_sur_interp = np.array(chain_sur_interp)
        # remove nan values
        idx_sur_nan = ~np.isnan(chain_sur_interp)
        idx_sat_nan = ~np.isnan(chain_sds[:,i])
        idx_nan = np.logical_and(idx_sur_nan, idx_sat_nan)
        # groundtruth and sds
        chain_sur_fin = chain_sur_interp[idx_nan]
        chain_sds_fin = chain_sds[idx_nan,i]
        dates_fin = [k for (k, v) in zip(dates_sds, idx_nan) if v]
        # calculate statistics
        slope, intercept, rvalue, pvalue, std_err = sstats.linregress(chain_sur_fin, chain_sds_fin) 
        R2 = rvalue**2
        correlation = np.corrcoef(chain_sur_fin, chain_sds_fin)[0,1]
        diff_chain = chain_sur_fin - chain_sds_fin
        rmse = np.sqrt(np.nanmean((diff_chain)**2))
        mean = np.nanmean(diff_chain)
        std = np.nanstd(diff_chain)
        q90 = np.percentile(np.abs(diff_chain), 90)
        # store data
        stats[pfname]['rmse'] = rmse
        stats[pfname]['mean'] = mean
        stats[pfname]['std'] = std
        stats[pfname]['q90'] = q90
        stats[pfname]['diffdays'] = diff_days
        stats[pfname]['corr'] = correlation
        stats[pfname]['linfit'] = {'slope':slope, 'intercept':intercept, 'R2':R2, 'pvalue':pvalue}
        data_fin[pfname]['dates'] = dates_fin
        data_fin[pfname]['sds'] = chain_sds_fin
        data_fin[pfname]['survey'] = chain_sur_fin
        # make time-series plot
        plt.figure(fig1.number)
        fig1.add_subplot(gs1[i,0])
        plt.plot(dates_sur, chain_sur, 'o-', color='C1', markersize=4, label='survey all')
        plt.plot(dates_fin, chain_sur_fin, 'o', color=[0.3, 0.3, 0.3], markersize=2, label='survey interp')
        plt.plot(dates_fin, chain_sds_fin, 'o--', color='b', markersize=4, label='SDS')
        plt.title(pfname, fontweight='bold')
 #        plt.xlim([dates_sds[0], dates_sds[-1]])
        plt.ylabel('chainage [m]')
        # make scatter plot
        plt.figure(fig2.number)
        fig2.add_subplot(gs2[0,i])
        plt.axis('equal')
        plt.plot(chain_sur_fin, chain_sds_fin, 'ko', markersize=4, markerfacecolor='w', alpha=0.7)
        xmax = np.max([np.nanmax(chain_sds_fin),np.nanmax(chain_sur_fin)])
        xmin = np.min([np.nanmin(chain_sds_fin),np.nanmin(chain_sur_fin)])
        ymax = np.max([np.nanmax(chain_sds_fin),np.nanmax(chain_sur_fin)])
        ymin = np.min([np.nanmin(chain_sds_fin),np.nanmin(chain_sur_fin)])
        plt.plot([xmin, xmax], [ymin, ymax], 'k--')
        plt.plot([xmin, xmax], [xmin*slope + intercept, xmax*slope + intercept], 'b:')
        str_corr = ' y = %.2f x + %.2f\n R2 = %.2f' % (slope, intercept, R2)
        plt.text(xmin, ymax-5, str_corr, bbox=dict(facecolor=[0.7,0.7,0.7], alpha=0.5), horizontalalignment='left')
        plt.xlabel('chainage survey [m]')
        plt.ylabel('chainage satellite [m]')
        plt.title(pfname, fontweight='bold')
        fig2.add_subplot(gs2[1,i])
        binwidth = 3
        bins = np.arange(min(diff_chain), max(diff_chain) + binwidth, binwidth)
        density = plt.hist(diff_chain, bins=bins, density=True, color=[0.8, 0.8, 0.8], edgecolor='k')
        plt.xlim([-50, 50])
        plt.xlabel('error [m]')
        str_stats = ' rmse = %.1f\n mean = %.1f\n std = %.1f\n q90 = %.1f' % (rmse, mean, std, q90) 
        plt.text(15, np.max(density[0])-0.015, str_stats, bbox=dict(facecolor=[0.8,0.8,0.8], alpha=0.3), horizontalalignment='left', fontsize=10)
    fig1.set_size_inches(19.2, 9.28)
    fig1.set_tight_layout(True)
    fig2.set_size_inches(19.2, 9.28)
    fig2.set_tight_layout(True)
    # all transects together
    chain_sds_all = []
    chain_sur_all = []
    for i in range(chain_sds.shape[1]):
        pfname = list(topo_profiles.keys())[i]
        chain_sds_all = np.append(chain_sds_all,data_fin[pfname]['sds'])
        chain_sur_all = np.append(chain_sur_all,data_fin[pfname]['survey'])
    # calculate statistics
    slope, intercept, rvalue, pvalue, std_err = sstats.linregress(chain_sur_all, chain_sds_all) 
    R2 = rvalue**2
    correlation = np.corrcoef(chain_sur_all, chain_sds_all)[0,1]
    diff_chain_all = chain_sur_all - chain_sds_all
    rmse = np.sqrt(np.nanmean((diff_chain_all)**2))
    mean = np.nanmean(diff_chain_all)
    std = np.nanstd(diff_chain_all)
    q90 = np.percentile(np.abs(diff_chain_all), 90)
    stats['all'] = {'rmse':rmse,'mean':mean,'std':std,'q90':q90, 'corr':correlation,
         'linfit':{'slope':slope, 'intercept':intercept, 'R2':R2, 'pvalue':pvalue}}
    # make plot
    plt.figure(fig3.number)
    fig3.add_subplot(gs3[0,0])
    plt.axis('equal')
    plt.plot(chain_sur_all, chain_sds_all, 'ko', markersize=4, markerfacecolor='w', alpha=0.7)
    xmax = np.max([np.nanmax(chain_sds_all),np.nanmax(chain_sur_all)])
    xmin = np.min([np.nanmin(chain_sds_all),np.nanmin(chain_sur_all)])
    ymax = np.max([np.nanmax(chain_sds_all),np.nanmax(chain_sur_all)])
    ymin = np.min([np.nanmin(chain_sds_all),np.nanmin(chain_sur_all)])
    plt.plot([xmin, xmax], [ymin, ymax], 'k--')
    plt.plot([xmin, xmax], [xmin*slope + intercept, xmax*slope + intercept], 'b:')
    str_corr = ' y = %.2f x + %.2f\n R2 = %.2f' % (slope, intercept, R2)
    plt.text(xmin, ymax-5, str_corr, bbox=dict(facecolor=[0.7,0.7,0.7], alpha=0.5), horizontalalignment='left')
    plt.xlabel('chainage survey [m]')
    plt.ylabel('chainage satellite [m]')
    plt.title(pfname, fontweight='bold')
    fig3.add_subplot(gs3[1,0])
    binwidth = 3
    bins = np.arange(min(diff_chain_all), max(diff_chain_all) + binwidth, binwidth)
    density = plt.hist(diff_chain_all, bins=bins, density=True, color=[0.8, 0.8, 0.8], edgecolor='k')
    plt.xlim([-50, 50])
    plt.xlabel('error [m]')
    str_stats = ' rmse = %.1f\n mean = %.1f\n std = %.1f\n q90 = %.1f' % (rmse, mean, std, q90) 
    plt.text(15, np.max(density[0])-0.015, str_stats, bbox=dict(facecolor=[0.8,0.8,0.8], alpha=0.3), horizontalalignment='left', fontsize=10)
    fig3.set_size_inches(9.2, 9.28)
    fig3.set_tight_layout(True)              
    return stats
--- a/functions/sds.py
+++ b/functions/sds.py
@ -5,11 +5,13 @@ Created on Thu Mar  1 11:20:35 2018
@author: z5030440
 """
-"""This script contains the functions needed for satellite derived shoreline (SDS) extraction"""
+"""This module contains all the functions needed for extracting satellite derived shoreline (SDS) """
 # Initial settings
 import numpy as np
 import matplotlib.pyplot as plt
 import matplotlib.patches as mpatches
 from matplotlib import gridspec
 import pdb
 import ee
@ -18,6 +20,7 @@ from osgeo import gdal, ogr, osr
 import tempfile
 from urllib.request import urlretrieve
 import zipfile
 import scipy.interpolate as interpolate
 # image processing modules
 import skimage.filters as filters 
@ -39,7 +42,7 @@ from functions.utils import *
 # Download from ee server function
-def download_tif(image, polygon, bandsId):
+def download_tif(image, polygon, bandsId, filepath):
    """downloads tif image (region and bands) from the ee server and stores it in a temp file"""
    url = ee.data.makeDownloadUrl(ee.data.getDownloadId({
        'image': image.serialize(),
@ -50,40 +53,7 @@ def download_tif(image, polygon, bandsId):
        }))
    local_zip, headers = urlretrieve(url)
    with zipfile.ZipFile(local_zip) as local_zipfile:
-        return local_zipfile.extract('data.tif', tempfile.mkdtemp())
+        return local_zipfile.extract('data.tif', filepath)
 def load_image(image, polygon, bandsId): 
    """
    Loads an ee.Image() as a np.array. e.Image() is retrieved from the EE database.
    The geographic area and bands to select can be specified
    KV WRL 2018
    Arguments:
    -----------
        image: ee.Image()
            image objec from the EE database
        polygon: list
            coordinates of the points creating a polygon. Each point is a list with 2 values
        bandsId: list
            bands to select, each band is a dictionnary in the list containing the following keys:
            crs, crs_transform, data_type and id. NOTE: you have to remove the key dimensions, otherwise
            the entire image is retrieved.
    Returns:
    -----------
        image_array : np.ndarray
            An array containing the image (2D if one band, otherwise 3D)
        georef : np.ndarray
            6 element vector containing the crs_parameters 
            [X_ul_corner Xscale Xshear Y_ul_corner Yshear Yscale]
    """
    local_tif_filename = download_tif(image, polygon, bandsId)
    dataset = gdal.Open(local_tif_filename, gdal.GA_ReadOnly)
    georef = np.array(dataset.GetGeoTransform())
    bands = [dataset.GetRasterBand(i + 1).ReadAsArray() for i in range(dataset.RasterCount)]
    return np.stack(bands, 2), georef
 def create_cloud_mask(im_qa, satname, plot_bool):
    """
@ -109,8 +79,10 @@ def create_cloud_mask(im_qa, satname, plot_bool):
    # convert QA bits
    if satname == 'L8':
        cloud_values = [2800, 2804, 2808, 2812, 6896, 6900, 6904, 6908]
-    elif satname == 'L7':
+    elif satname == 'L7' or satname == 'L5' or satname == 'L4':
        cloud_values = [752, 756, 760, 764]
    elif satname == 'S2':
        cloud_values = [1024, 2048] # 1024 = dense cloud, 2048 = cirrus clouds
    cloud_mask = np.isin(im_qa, cloud_values)
    # remove isolated cloud pixels (there are some in the swash and they cause problems)
@ -127,109 +99,6 @@ def create_cloud_mask(im_qa, satname, plot_bool):
    return cloud_mask
 def read_eeimage(im, polygon, sat_name, plot_bool):
    """
    Read an ee.Image() object and returns the panchromatic band, multispectral bands (B, G, R, NIR, SWIR)
    and a cloud mask. All outputs are at 15m resolution (bilinear interpolation for the multispectral bands)
    KV WRL 2018
    Arguments:
    -----------
        im: ee.Image()
            Image to read from the Google Earth Engine database
        plot_bool: boolean
            True if plot is wanted
    Returns:
    -----------
        im_pan: np.ndarray (2D)
            The panchromatic band (15m)
        im_ms: np.ndarray (3D)
            The multispectral bands interpolated at 15m
        im_cloud: np.ndarray (2D)
            The cloud mask at 15m
        crs_params: list
            EPSG code and affine transformation parameters
    """   
    im_dic = im.getInfo()
    # save metadata
    im_meta = im_dic.get('properties')
    meta = {'timestamp':im_meta['system:time_start'],
            'date_acquired':im_meta['DATE_ACQUIRED'],
            'geom_rmse_model':im_meta['GEOMETRIC_RMSE_MODEL'],
            'gcp_model':im_meta['GROUND_CONTROL_POINTS_MODEL'],
            'quality':im_meta['IMAGE_QUALITY_OLI'],
            'sun_azimuth':im_meta['SUN_AZIMUTH'],
            'sun_elevation':im_meta['SUN_ELEVATION']}
    im_bands = im_dic.get('bands')
    # delete dimensions key from dictionnary, otherwise the entire image is extracted
    for i in range(len(im_bands)): del im_bands[i]['dimensions']
    # load panchromatic band
    pan_band = [im_bands[7]]
    im_pan, crs_pan = load_image(im, polygon, pan_band)
    im_pan = im_pan[:,:,0]
    # load the multispectral bands (B2,B3,B4,B5,B6) = (blue,green,red,nir,swir1)
    ms_bands = [im_bands[1], im_bands[2], im_bands[3], im_bands[4], im_bands[5]]
    im_ms_30m, crs_ms = load_image(im, polygon, ms_bands)
    # create cloud mask
    qa_band = [im_bands[11]]
    im_qa, crs_qa = load_image(im, polygon, qa_band)
    im_qa = im_qa[:,:,0]
    im_cloud = create_cloud_mask(im_qa, sat_name, plot_bool)
    im_cloud = transform.resize(im_cloud, (im_pan.shape[0], im_pan.shape[1]),
                                order=0, preserve_range=True, mode='constant').astype('bool_')
    # resize the image using bilinear interpolation (order 1)
    im_ms = transform.resize(im_ms_30m,(im_pan.shape[0], im_pan.shape[1]),
                             order=1, preserve_range=True, mode='constant')
    # check if -inf values (means out of image) and add to cloud mask
    im_inf = np.isin(im_ms[:,:,0], -np.inf)
    im_nan = np.isnan(im_ms[:,:,0])
    im_cloud = np.logical_or(np.logical_or(im_cloud, im_inf), im_nan)
    # get the crs parameters for the image at 15m and 30m resolution
    crs = {'crs_15m':crs_pan, 'crs_30m':crs_ms, 'epsg_code':int(pan_band[0]['crs'][5:])}
    if plot_bool:
        # if there are -inf in the image, set them to 0 before plotting
        if sum(sum(np.isin(im_ms_30m[:,:,0], -np.inf).astype(int))) > 0:
            idx = np.isin(im_ms_30m[:,:,0], -np.inf)
            im_ms_30m[idx,0] = 0; im_ms_30m[idx,1] = 0; im_ms_30m[idx,2] = 0; 
            im_ms_30m[idx,3] = 0; im_ms_30m[idx,4] = 0
        plt.figure()
        plt.subplot(221)
        plt.imshow(im_pan, cmap='gray')
        plt.title('PANCHROMATIC')
        plt.subplot(222)
        plt.imshow(im_ms_30m[:,:,[2,1,0]])
        plt.title('RGB')
        plt.subplot(223)
        plt.imshow(im_ms_30m[:,:,3], cmap='gray')
        plt.title('NIR')
        plt.subplot(224)
        plt.imshow(im_ms_30m[:,:,4], cmap='gray')
        plt.title('SWIR')
        plt.show()
    return im_pan, im_ms, im_cloud, crs, meta
 def rescale_image_intensity(im, cloud_mask, prob_high, plot_bool):
    """
    Rescales the intensity of an image (multispectral or single band) by applying
@ -395,9 +264,28 @@ def pansharpen(im_ms, im_pan, cloud_mask, plot_bool):
    # apply PCA to RGB bands
    pca = decomposition.PCA()
    vec_pcs = pca.fit_transform(vec)
    # replace 1st PC with pan band (after matching histograms)
    vec_pan = im_pan.reshape(im_pan.shape[0] * im_pan.shape[1])
    vec_pan = vec_pan[~vec_mask]
 #    plt.figure()
 #    ax1 = plt.subplot(131)
 #    plt.imshow(im_pan, cmap='gray')
 #    plt.title('Pan band')
 #    plt.subplot(132, sharex=ax1, sharey=ax1)
 #    plt.imshow(vec_pcs[:,0].reshape(im_pan.shape[0],im_pan.shape[1]), cmap='gray')
 #    plt.title('PC1')
 #    plt.subplot(133, sharex=ax1, sharey=ax1)
 #    plt.imshow(hist_match(vec_pan, vec_pcs[:,0]).reshape(im_pan.shape[0],im_pan.shape[1]), cmap='gray')
 #    plt.title('Pan band histmatched')
 #
 #    plt.figure()
 #    plt.hist(hist_match(vec_pan, vec_pcs[:,0]), bins=300)
 #    plt.hist(vec_pcs[:,0], bins=300, alpha=0.5)   
 #    plt.hist(vec_pan, bins=300, alpha=0.5)   
 #    plt.draw()
    vec_pcs[:,0] = hist_match(vec_pan, vec_pcs[:,0])
    vec_ms_ps = pca.inverse_transform(vec_pcs)
@ -407,13 +295,14 @@ def pansharpen(im_ms, im_pan, cloud_mask, plot_bool):
    im_ms_ps = vec_ms_ps_full.reshape(im_ms.shape[0], im_ms.shape[1], im_ms.shape[2])
    if plot_bool:
        plt.figure()
        ax1 = plt.subplot(121)
-        plt.imshow(rescale_image_intensity(im_ms[:,:,[2,1,0]], cloud_mask, 100, False))
+        plt.imshow(rescale_image_intensity(im_ms[:,:,[2,1,0]], cloud_mask, 99.9, False))
        plt.axis('off')
        plt.title('Original')
        ax2 = plt.subplot(122, sharex=ax1, sharey=ax1)
-        plt.imshow(rescale_image_intensity(im_ms_ps[:,:,[2,1,0]], cloud_mask, 100, False))
+        plt.imshow(rescale_image_intensity(im_ms_ps[:,:,[2,1,0]], cloud_mask, 99.9, False))
        plt.axis('off')
        plt.title('Pansharpened')
        plt.show()
@ -458,9 +347,10 @@ def nd_index(im1, im2, cloud_mask, plot_bool):
    return im_nd
-def find_wl_contours(im_ndwi, cloud_mask, min_contour_points, plot_bool):
+def find_wl_contours(im_ndwi, cloud_mask, plot_bool):
    """
-    Computes normalised difference index on 2 images (2D), given a cloud mask (2D)
+    Finds the water line by thresholding the Normalized Difference Water Index and applying the Marching 
    Squares Algorithm
    KV WRL 2018
@ -470,8 +360,6 @@ def find_wl_contours(im_ndwi, cloud_mask, min_contour_points, plot_bool):
            Image (2D) with the NDWI (water index)
        cloud_mask: np.ndarray
            2D cloud mask with True where cloud pixels are
        min_contour_points: int
            minimum number of points in each contour line
        plot_bool: boolean
            True if plot is wanted
@ -489,16 +377,18 @@ def find_wl_contours(im_ndwi, cloud_mask, min_contour_points, plot_bool):
    t_otsu = filters.threshold_otsu(vec)
    # use Marching Squares algorithm to detect contours on ndwi image
    contours = measure.find_contours(im_ndwi, t_otsu)
-    # filter water lines
+    
-    contours_wl = []
+    # remove contour points that are nans
-    for i, contour in enumerate(contours):
+    contours_nonans = []
-        # remove contour points that are around clouds (nan values)
+    for k in range(len(contours)):
-        if np.any(np.isnan(contour)):
+        if np.any(np.isnan(contours[k])):
-            index_nan = np.where(np.isnan(contour))[0]
+            index_nan = np.where(np.isnan(contours[k]))[0]
-            contour = np.delete(contour, index_nan, axis=0) 
+            contours_temp = np.delete(contours[k], index_nan, axis=0)
-        # remove contours that have only few points (less than min_contour_points)
+            if len(contours_temp) > 1:
-        if contour.shape[0] > min_contour_points:
+                contours_nonans.append(contours_temp)
-            contours_wl.append(contour)
+        else:
            contours_nonans.append(contours[k])
    contours = contours_nonans
    if plot_bool:
        # plot otsu's histogram segmentation
@ -512,12 +402,12 @@ def find_wl_contours(im_ndwi, cloud_mask, min_contour_points, plot_bool):
        plt.figure()
        plt.imshow(im_ndwi, cmap='seismic')
        plt.colorbar()
-        for i,contour in enumerate(contours_wl): plt.plot(contour[:, 1], contour[:, 0], linewidth=3, color='k')
+        for i,contour in enumerate(contours): plt.plot(contour[:, 1], contour[:, 0], linewidth=3, color='k')
        plt.axis('image')
        plt.title('Detected water lines')
        plt.show()
-    return contours_wl
+    return contours
 def convert_pix2world(points, crs_vec):
    """
@ -563,6 +453,49 @@ def convert_pix2world(points, crs_vec):
    return points_converted
 def convert_world2pix(points, crs_vec):
    """
    Converts world projected coordinates (X,Y) to image coordinates (row,column)
    performing an affine transformation
    KV WRL 2018
    Arguments:
    -----------
        points: np.ndarray or list of np.ndarray
            array with 2 columns (rows first and columns second)
        crs_vec: np.ndarray
            vector of 6 elements [Xtr, Xscale, Xshear, Ytr, Yshear, Yscale]
    Returns:    -----------
        points_converted: np.ndarray or list of np.ndarray 
            converted coordinates, first columns with row and second column with column
    """
    # make affine transformation matrix
    aff_mat = np.array([[crs_vec[1], crs_vec[2], crs_vec[0]],
                       [crs_vec[4], crs_vec[5], crs_vec[3]],
                       [0, 0, 1]])
    # create affine transformation
    tform = transform.AffineTransform(aff_mat)
    if type(points) is list:
        points_converted = []
        # iterate over the list
        for i, arr in enumerate(points): 
            points_converted.append(tform.inverse(points))
    elif type(points) is np.ndarray:
        points_converted = tform.inverse(points)
    else:
        print('invalid input type')
        raise
    return points_converted
 def convert_epsg(points, epsg_in, epsg_out):
    """
    Converts from one spatial reference to another using the epsg codes
@ -676,7 +609,7 @@ def classify_sand_unsupervised(im_ms_ps, im_pan, cloud_mask, wl_pix, buffer_size
 #    im_sand = morphology.binary_dilation(im_sand, morphology.disk(1))
    if plot_bool:
-        im = np.copy(rescale_image_intensity(im_ms_ps[:,:,[2,1,0]], cloud_mask, 100, False))
+        im = np.copy(rescale_image_intensity(im_ms_ps[:,:,[2,1,0]], cloud_mask, 99.9, False))
        im[im_sand,0] = 0
        im[im_sand,1] = 0
        im[im_sand,2] = 1
@ -688,7 +621,7 @@ def classify_sand_unsupervised(im_ms_ps, im_pan, cloud_mask, wl_pix, buffer_size
    return im_sand
-def classify_image_NN(im_ms_ps, im_pan, cloud_mask, plot_bool):
+def classify_image_NN(im_ms_ps, im_pan, cloud_mask, min_beach_size, plot_bool):
    """
    Classifies every pixel in the image in one of 4 classes:
        - sand                                          --> label = 1
@ -714,8 +647,10 @@ def classify_image_NN(im_ms_ps, im_pan, cloud_mask, plot_bool):
            True if plot is wanted
    Returns:    -----------
-        im_labels: np.ndarray
+        im_classif: np.ndarray
-            2D binary image containing True where sand pixels are located
+            2D image containing labels
        im_labels: np.ndarray of booleans
            3D image containing a boolean image for each class (im_classif == label)
    """     
@ -743,16 +678,24 @@ def classify_image_NN(im_ms_ps, im_pan, cloud_mask, plot_bool):
    vec_classif = np.zeros((cloud_mask.shape[0]*cloud_mask.shape[1])) 
    vec_classif[~vec_mask] = labels
    im_classif = vec_classif.reshape((im_ms_ps.shape[0], im_ms_ps.shape[1]))
-#    im_classif = morphology.remove_small_objects(im_classif, min_size=min_beach_size, connectivity=2)
+
    # labels
    im_sand = im_classif == 1
    im_sand = morphology.remove_small_objects(im_sand, min_size=min_beach_size, connectivity=2)
    im_swash = im_classif == 2
    im_water = im_classif == 3
    im_labels = np.stack((im_sand,im_swash,im_water), axis=-1)  
    if plot_bool:
-        im_display = rescale_image_intensity(im_ms_ps[:,:,[2,1,0]], cloud_mask, 100, False)
+        # display on top of pansharpened RGB
        im_display = rescale_image_intensity(im_ms_ps[:,:,[2,1,0]], cloud_mask, 99.9, False)
        im = np.copy(im_display)
-        colours = np.array([[1,0,0],[1,1,0],[0,1,1],[0,0,1]])
+        # define colours for plot
-        for k in range(4):
+        colours = np.array([[1,128/255,0/255],[204/255,1,1],[0,0,204/255]])
-            im[im_classif == k,0] = colours[k,0]
+        for k in range(0,im_labels.shape[2]):
-            im[im_classif == k,1] = colours[k,1]
+            im[im_labels[:,:,k],0] = colours[k,0]
-            im[im_classif == k,2] = colours[k,2]
+            im[im_labels[:,:,k],1] = colours[k,1]
            im[im_labels[:,:,k],2] = colours[k,2]
        plt.figure()
        ax1 = plt.subplot(121)
@ -762,8 +705,463 @@ def classify_image_NN(im_ms_ps, im_pan, cloud_mask, plot_bool):
        ax2 = plt.subplot(122, sharex=ax1, sharey=ax1)
        plt.imshow(im)
        plt.axis('off')
-        plt.title('NN')  
+        plt.title('NN classifier')  
        mng = plt.get_current_fig_manager()                                         
        mng.window.showMaximized()
        plt.tight_layout()   
        plt.draw()
-    return im_classif
+    return im_classif, im_labels
 def classify_image_NN_nopan(im_ms_ps, cloud_mask, min_beach_size, plot_bool):
    """
    Classifies every pixel in the image in one of 4 classes:
        - sand                                          --> label = 1
        - whitewater (breaking waves and swash)         --> label = 2
        - water                                         --> label = 3
        - other (vegetation, buildings, rocks...)       --> label = 0
    The classifier is a Neural Network, trained with 7000 pixels for the class SAND and 1500 pixels for
    each of the other classes. This is because the class of interest for my application is SAND and I 
    wanted to minimize the classification error for that class
    KV WRL 2018
    Arguments:
    -----------
        im_ms_ps: np.ndarray
            Pansharpened RGB + downsampled NIR and SWIR
        im_pan:
            Panchromatic band
        cloud_mask: np.ndarray
            2D cloud mask with True where cloud pixels are
        plot_bool: boolean
            True if plot is wanted
    Returns:    -----------
        im_classif: np.ndarray
            2D image containing labels
        im_labels: np.ndarray of booleans
            3D image containing a boolean image for each class (im_classif == label)
    """     
    # load classifier
    clf = joblib.load('functions/NeuralNet_classif_nopan.pkl')
    # calculate features
    n_features = 9
    im_features = np.zeros((im_ms_ps.shape[0], im_ms_ps.shape[1], n_features))
    im_features[:,:,[0,1,2,3,4]] = im_ms_ps
    im_features[:,:,5] = nd_index(im_ms_ps[:,:,3], im_ms_ps[:,:,1], cloud_mask, False) # (NIR-G)
    im_features[:,:,6] = nd_index(im_ms_ps[:,:,3], im_ms_ps[:,:,2], cloud_mask, False) # ND(NIR-R)
    im_features[:,:,7] = nd_index(im_ms_ps[:,:,0], im_ms_ps[:,:,2], cloud_mask, False) # ND(B-R)
    im_features[:,:,8] = nd_index(im_ms_ps[:,:,4], im_ms_ps[:,:,1], cloud_mask, False) # ND(SWIR-G)
    # remove NaNs and clouds
    vec_features = im_features.reshape((im_ms_ps.shape[0] * im_ms_ps.shape[1], n_features))
    vec_cloud = cloud_mask.reshape(cloud_mask.shape[0]*cloud_mask.shape[1])
    vec_nan = np.any(np.isnan(vec_features), axis=1)
    vec_mask = np.logical_or(vec_cloud, vec_nan)
    vec_features = vec_features[~vec_mask, :]
    # predict with NN classifier
    labels = clf.predict(vec_features)
    # recompose image
    vec_classif = np.zeros((cloud_mask.shape[0]*cloud_mask.shape[1])) 
    vec_classif[~vec_mask] = labels
    im_classif = vec_classif.reshape((im_ms_ps.shape[0], im_ms_ps.shape[1]))
    # labels
    im_sand = im_classif == 1
    im_sand = morphology.remove_small_objects(im_sand, min_size=min_beach_size, connectivity=2)
    im_swash = im_classif == 2
    im_water = im_classif == 3
    im_labels = np.stack((im_sand,im_swash,im_water), axis=-1)  
    if plot_bool:
        # display on top of pansharpened RGB
        im_display = rescale_image_intensity(im_ms_ps[:,:,[2,1,0]], cloud_mask, 99.9, False)
        im = np.copy(im_display)
        # define colours for plot
        colours = np.array([[1,128/255,0/255],[204/255,1,1],[0,0,204/255]])
        for k in range(0,im_labels.shape[2]):
            im[im_labels[:,:,k],0] = colours[k,0]
            im[im_labels[:,:,k],1] = colours[k,1]
            im[im_labels[:,:,k],2] = colours[k,2]
        plt.figure()
        ax1 = plt.subplot(121)
        plt.imshow(im_display)
        plt.axis('off')
        plt.title('Image')
        ax2 = plt.subplot(122, sharex=ax1, sharey=ax1)
        plt.imshow(im)
        plt.axis('off')
        plt.title('NN classifier')  
        mng = plt.get_current_fig_manager()                                         
        mng.window.showMaximized()
        plt.tight_layout()   
        plt.draw()
    return im_classif, im_labels
 def find_wl_contours2(im_ms_ps, im_labels, cloud_mask, buffer_size, plot_bool):
    """
    New method for extracting shorelines (more robust)
    KV WRL 2018
    Arguments:
    -----------
        im_ms_ps: np.ndarray
            Pansharpened RGB + downsampled NIR and SWIR
        im_labels: np.ndarray
            3D image containing a boolean image for each class in the order (sand, swash, water)
        cloud_mask: np.ndarray
            2D cloud mask with True where cloud pixels are
        buffer_size: int
            size of the buffer around the sandy beach
        plot_bool: boolean
            True if plot is wanted
    Returns:    -----------
        contours_wi: list of np.arrays 
            contains the (row,column) coordinates of the contour lines extracted with the Water Index
        contours_mwi: list of np.arrays 
            contains the (row,column) coordinates of the contour lines extracted with the Modified Water Index
    """  
    nrows = cloud_mask.shape[0]
    ncols = cloud_mask.shape[1]
    im_display = rescale_image_intensity(im_ms_ps[:,:,[2,1,0]], cloud_mask, 99.9, False)
    # calculate Normalized Difference Modified Water Index (SWIR - G)
    im_mwi = nd_index(im_ms_ps[:,:,4], im_ms_ps[:,:,1], cloud_mask, False)
    # calculate Normalized Difference Modified Water Index (NIR - G)
    im_wi = nd_index(im_ms_ps[:,:,3], im_ms_ps[:,:,1], cloud_mask, False)
    # stack indices together
    im_ind = np.stack((im_wi, im_mwi), axis=-1)
    vec_ind = im_ind.reshape(nrows*ncols,2)
    # process labels
    vec_sand = im_labels[:,:,0].reshape(ncols*nrows)
    vec_swash = im_labels[:,:,1].reshape(ncols*nrows)
    vec_water = im_labels[:,:,2].reshape(ncols*nrows)
    # create a buffer around the sandy beach
    se = morphology.disk(buffer_size)
    im_buffer = morphology.binary_dilation(im_labels[:,:,0], se)
    vec_buffer = im_buffer.reshape(nrows*ncols)
    # select water/sand/swash pixels that are within the buffer
    int_water = vec_ind[np.logical_and(vec_buffer,vec_water),:]
    int_sand = vec_ind[np.logical_and(vec_buffer,vec_sand),:]
    int_swash = vec_ind[np.logical_and(vec_buffer,vec_swash),:]
    # threshold the sand/water intensities 
    int_all = np.append(int_water,int_sand, axis=0)
    t_mwi = filters.threshold_otsu(int_all[:,0])
    t_wi = filters.threshold_otsu(int_all[:,1])
    # find contour with MS algorithm
    im_wi_buffer = np.copy(im_wi)
    im_wi_buffer[~im_buffer] = np.nan
    im_mwi_buffer = np.copy(im_mwi)
    im_mwi_buffer[~im_buffer] = np.nan
    contours_wi = measure.find_contours(im_wi_buffer, t_wi)
    contours_mwi = measure.find_contours(im_mwi, t_mwi) # WARNING (on entire image)
    # remove contour points that are nans (around clouds)
    contours = contours_wi
    contours_nonans = []
    for k in range(len(contours)):
        if np.any(np.isnan(contours[k])):
            index_nan = np.where(np.isnan(contours[k]))[0]
            contours_temp = np.delete(contours[k], index_nan, axis=0)
            if len(contours_temp) > 1:
                contours_nonans.append(contours_temp)
        else:
            contours_nonans.append(contours[k])
    contours_wi = contours_nonans
    contours = contours_mwi
    contours_nonans = []
    for k in range(len(contours)):
        if np.any(np.isnan(contours[k])):
            index_nan = np.where(np.isnan(contours[k]))[0]
            contours_temp = np.delete(contours[k], index_nan, axis=0)
            if len(contours_temp) > 1:
                contours_nonans.append(contours_temp)
        else:
            contours_nonans.append(contours[k])
    contours_mwi = contours_nonans
    if plot_bool:
        im = np.copy(im_display)
        # define colours for plot
        colours = np.array([[1,128/255,0/255],[204/255,1,1],[0,0,204/255]])
        for k in range(0,im_labels.shape[2]):
            im[im_labels[:,:,k],0] = colours[k,0]
            im[im_labels[:,:,k],1] = colours[k,1]
            im[im_labels[:,:,k],2] = colours[k,2]
        fig = plt.figure()
        gs = gridspec.GridSpec(3, 3, height_ratios=[1, 1, 3])
        ax1 = fig.add_subplot(gs[0,:]) 
        vals = plt.hist(int_water[:,0], bins=100, label='water')
        plt.hist(int_sand[:,0], bins=100, alpha=0.5, label='sand')
        plt.hist(int_swash[:,0], bins=100, alpha=0.5, label='swash')
        plt.plot([t_wi, t_wi], [0, np.max(vals[0])], 'r-')
        plt.legend()
        plt.title('Water Index NIR-G')
        ax2 = fig.add_subplot(gs[1,:], sharex=ax1)
        vals = plt.hist(int_water[:,1], bins=100, label='water')
        plt.hist(int_sand[:,1], bins=100, alpha=0.5, label='sand')
        plt.hist(int_swash[:,1], bins=100, alpha=0.5, label='swash')
        plt.plot([t_mwi, t_mwi], [0, np.max(vals[0])], 'r-')
        plt.legend()
        plt.title('Modified Water Index SWIR-G')        
        ax3 = fig.add_subplot(gs[2,0])
        plt.imshow(im)
        for i,contour in enumerate(contours_mwi): plt.plot(contour[:, 1], contour[:, 0], linewidth=3, color='k')
        for i,contour in enumerate(contours_wi): plt.plot(contour[:, 1], contour[:, 0], linestyle='--', linewidth=1, color='w')
        plt.grid(False)
        plt.xticks([])
        plt.yticks([])
        ax4 = fig.add_subplot(gs[2,1], sharex=ax3, sharey=ax3)
        plt.imshow(im_display)
        for i,contour in enumerate(contours_mwi): plt.plot(contour[:, 1], contour[:, 0], linewidth=3, color='k')
        for i,contour in enumerate(contours_wi): plt.plot(contour[:, 1], contour[:, 0], linestyle='--', linewidth=1, color='w')
        plt.grid(False)     
        plt.xticks([])
        plt.yticks([])
        ax5 = fig.add_subplot(gs[2,2], sharex=ax3, sharey=ax3)
        plt.imshow(im_mwi, cmap='seismic')
        for i,contour in enumerate(contours_mwi): plt.plot(contour[:, 1], contour[:, 0], linewidth=3, color='k')
        for i,contour in enumerate(contours_wi): plt.plot(contour[:, 1], contour[:, 0], linestyle='--', linewidth=1, color='w')
        plt.grid(False)
        plt.xticks([])
        plt.yticks([])
 #        plt.gcf().set_size_inches(17.99,7.55)
        mng = plt.get_current_fig_manager()                                         
        mng.window.showMaximized()
        plt.gcf().set_tight_layout(True)
        plt.draw()
    return contours_wi, contours_mwi
 def compare_sds(dates_sds, chain_sds, topo_profiles, mod=0, mindays=5):
    """
    Compare sds with groundtruth data from topographic surveys / argus shorelines
    KV WRL 2018
    Arguments:
    -----------
        dates_sds: list
            list of dates corresponding to each row in chain_sds
        chain_sds: np.ndarray
            array with time series of chainage for each transect (each transect is one column)
        topo_profiles: dict
            dict containing the dates and chainage of the groundtruth
        mod: 0 or 1
            0 for linear interpolation between 2 closest surveys, 1 for only nearest neighbour
        min_days: int
            minimum number of days for which the data can be compared     
    Returns:    -----------
        stats: dict
            contains all the statistics of the comparison
    """       
    # create 3 figures       
    fig1 = plt.figure()
    gs1 = gridspec.GridSpec(chain_sds.shape[1], 1)
    fig2 = plt.figure()
    gs2 = gridspec.GridSpec(2, chain_sds.shape[1])
    fig3 = plt.figure()
    gs3 = gridspec.GridSpec(2,1)
    dates_sds_num = np.array([_.toordinal() for _ in dates_sds])
    stats = dict([])
    data_fin = dict([])
    # for each transect compare and plot the data
    for i in range(chain_sds.shape[1]):
        pfname = list(topo_profiles.keys())[i]
        stats[pfname] = dict([])
        data_fin[pfname] = dict([])
        dates_sur = topo_profiles[pfname]['dates']
        chain_sur = topo_profiles[pfname]['chainage']
        # convert to datenum
        dates_sur_num = np.array([_.toordinal() for _ in dates_sur])
        chain_sur_interp = []
        diff_days = []
        for j, satdate in enumerate(dates_sds_num):
            temp_diff = satdate - dates_sur_num
            if mod==0:
                # select measurement before and after sat image date and interpolate
                ind_before = np.where(temp_diff == temp_diff[temp_diff > 0][-1])[0]     
                if ind_before == len(temp_diff)-1:
                    chain_sur_interp.append(np.nan)
                    diff_days.append(np.abs(satdate-dates_sur_num[ind_before])[0])
                    continue         
                ind_after = np.where(temp_diff == temp_diff[temp_diff < 0][0])[0]            
                tempx = np.zeros(2)
                tempx[0] = dates_sur_num[ind_before]
                tempx[1] = dates_sur_num[ind_after]
                tempy = np.zeros(2)
                tempy[0] = chain_sur[ind_before]
                tempy[1] = chain_sur[ind_after]
                diff_days.append(np.abs(np.max([satdate-tempx[0], satdate-tempx[1]])))                
                # interpolate
                f = interpolate.interp1d(tempx, tempy)
                chain_sur_interp.append(f(satdate))
            elif mod==1:
                # select the closest measurement
                idx_closest = find_indices(np.abs(temp_diff), lambda e: e == np.min(np.abs(temp_diff)))[0]
                diff_days.append(np.abs(satdate-dates_sur_num[idx_closest]))
                if diff_days[j] > mindays:
                    chain_sur_interp.append(np.nan)
                else:
                    chain_sur_interp.append(chain_sur[idx_closest])
        chain_sur_interp = np.array(chain_sur_interp)
        # remove nan values
        idx_sur_nan = ~np.isnan(chain_sur_interp)
        idx_sat_nan = ~np.isnan(chain_sds[:,i])
        idx_nan = np.logical_and(idx_sur_nan, idx_sat_nan)
        # groundtruth and sds
        chain_sur_fin = chain_sur_interp[idx_nan]
        chain_sds_fin = chain_sds[idx_nan,i]
        dates_fin = [k for (k, v) in zip(dates_sds, idx_nan) if v]
        diff_chain = chain_sur_fin - chain_sds_fin
        # calculate statistics
        rmse = np.sqrt(np.nanmean((diff_chain)**2))
        mean = np.nanmean(diff_chain)
        std = np.nanstd(diff_chain)
        q90 = np.percentile(np.abs(diff_chain), 90)
        # store data
        stats[pfname]['rmse'] = rmse
        stats[pfname]['mean'] = mean
        stats[pfname]['std'] = std
        stats[pfname]['q90'] = q90
        stats[pfname]['diffdays'] = diff_days
        data_fin[pfname]['dates'] = dates_fin
        data_fin[pfname]['sds'] = chain_sds_fin
        data_fin[pfname]['survey'] = chain_sur_fin
        # make time-series plot
        plt.figure(fig1.number)
        ax = fig1.add_subplot(gs1[i,0])
        plt.plot(dates_sur, chain_sur, 'o-', color='C1', markersize=4, label='survey all')
        plt.plot(dates_fin, chain_sur_fin, 'o', color=[0.3, 0.3, 0.3], markersize=2, label='survey interp')
        plt.plot(dates_fin, chain_sds_fin, 'o--', color='b', markersize=4, label='SDS')
        plt.title(pfname, fontweight='bold')
        plt.xlim([dates_sds[0], dates_sds[-1]])
        plt.ylabel('chainage [m]')
        # make scatter plot
        plt.figure(fig2.number)
        ax1 = fig2.add_subplot(gs2[0,i])
        plt.axis('equal')
        plt.plot(chain_sur_fin, chain_sds_fin, 'ko', markersize=4, markerfacecolor='w', alpha=0.7)
        xmax = np.max([np.nanmax(chain_sds_fin),np.nanmax(chain_sur_fin)])
        xmin = np.min([np.nanmin(chain_sds_fin),np.nanmin(chain_sur_fin)])
        ymax = np.max([np.nanmax(chain_sds_fin),np.nanmax(chain_sur_fin)])
        ymin = np.min([np.nanmin(chain_sds_fin),np.nanmin(chain_sur_fin)])
        plt.plot([xmin, xmax], [ymin, ymax], 'r--')
        correlation = np.corrcoef(chain_sur_fin, chain_sds_fin)[0,1]
        str_corr = 'r = %.2f' % (correlation)
        plt.text(xmin, ymax, str_corr, bbox=dict(facecolor=[0.7,0.7,0.7], alpha=0.5), horizontalalignment='left')
        plt.xlabel('chainage survey [m]')
        plt.ylabel('chainage satellite [m]')
        plt.title(pfname, fontweight='bold')
        ax2 = fig2.add_subplot(gs2[1,i])
        binwidth = 3
        bins = np.arange(min(diff_chain), max(diff_chain) + binwidth, binwidth)
        density = plt.hist(diff_chain, bins=bins, density=True, color=[0.8, 0.8, 0.8], edgecolor='k')
        plt.xlim([-50, 50])
        plt.xlabel('error [m]')
        str_stats = ' rmse = %.1f\n mean = %.1f\n std = %.1f\n q90 = %.1f' % (rmse, mean, std, q90) 
        plt.text(15, np.max(density[0])-0.015, str_stats, bbox=dict(facecolor=[0.8,0.8,0.8], alpha=0.5), horizontalalignment='left', fontsize=10)
    fig1.set_size_inches(19.2, 9.28)
    fig1.set_tight_layout(True)
    fig2.set_size_inches(19.2, 9.28)
    fig2.set_tight_layout(True)
    # plot all the data together
    chain_sds_all = []
    chain_sur_all = []
    for i in range(chain_sds.shape[1]):
        pfname = list(topo_profiles.keys())[i]
        chain_sds_all = np.append(chain_sds_all,data_fin[pfname]['sds'])
        chain_sur_all = np.append(chain_sur_all,data_fin[pfname]['survey'])
    diff_chain_all = chain_sur_all - chain_sds_all
    # calculate statistics
    rmse = np.sqrt(np.nanmean((diff_chain_all)**2))
    mean = np.nanmean(diff_chain_all)
    std = np.nanstd(diff_chain_all)
    q90 = np.percentile(np.abs(diff_chain_all), 90)
    stats['all'] = {'rmse':rmse,'mean':mean,'std':std,'q90':q90}
    # make plot with all datapoints (from all the transects)
    plt.figure(fig3.number)
    ax1 = fig3.add_subplot(gs3[0,0])
    plt.axis('equal')
    plt.plot(chain_sur_all, chain_sds_all, 'ko', markersize=4, markerfacecolor='w', alpha=0.7)
    xmax = np.max([np.nanmax(chain_sds_all),np.nanmax(chain_sur_all)])
    xmin = np.min([np.nanmin(chain_sds_all),np.nanmin(chain_sur_all)])
    ymax = np.max([np.nanmax(chain_sds_all),np.nanmax(chain_sur_all)])
    ymin = np.min([np.nanmin(chain_sds_all),np.nanmin(chain_sur_all)])
    plt.plot([xmin, xmax], [ymin, ymax], 'r--')
    correlation = np.corrcoef(chain_sur_all, chain_sds_all)[0,1]
    str_corr = 'r = %.2f' % (correlation)
    plt.text(xmin, ymax, str_corr, bbox=dict(facecolor=[0.7,0.7,0.7], alpha=0.5), horizontalalignment='left')
    plt.xlabel('chainage survey [m]')
    plt.ylabel('chainage satellite [m]')
    plt.title(pfname, fontweight='bold')
    ax2 = fig3.add_subplot(gs3[1,0])
    binwidth = 3
    bins = np.arange(min(diff_chain_all), max(diff_chain_all) + binwidth, binwidth)
    density = plt.hist(diff_chain_all, bins=bins, density=True, color=[0.8, 0.8, 0.8], edgecolor='k')
    plt.xlim([-50, 50])
    plt.xlabel('error [m]')
    str_stats = ' rmse = %.1f\n mean = %.1f\n std = %.1f\n q90 = %.1f' % (rmse, mean, std, q90) 
    plt.text(15, np.max(density[0])-0.015, str_stats, bbox=dict(facecolor=[0.8,0.8,0.8], alpha=0.5), horizontalalignment='left', fontsize=10)
    fig3.set_size_inches(9.2, 9.28)
    fig3.set_tight_layout(True)               
    return stats
--- a/functions/utils.py
+++ b/functions/utils.py
@ -8,7 +8,9 @@ Contains all the utilities, convenience functions and small functions that do si
 """
 import matplotlib.pyplot as plt
 from datetime import datetime, timedelta
 import numpy as np
 import scipy.io as sio
 import pdb
@ -59,3 +61,50 @@ def find_indices(lst, condition):
 def reject_outliers(data, m=2):
    "rejects outliers in a numpy array"
    return data[abs(data - np.mean(data)) < m * np.std(data)]
 def duplicates_dict(lst):
    "return duplicates and indices"
    # nested function
    def duplicates(lst, item):
            return [i for i, x in enumerate(lst) if x == item]
    return dict((x, duplicates(lst, x)) for x in set(lst) if lst.count(x) > 1)
 def datenum2datetime(datenum):
    "convert datenum to datetime"
    #takes in datenum and outputs python datetime
    time = [datetime.fromordinal(int(dn)) + timedelta(days=float(dn)%1) - timedelta(days = 366) for dn in datenum]
    return time
 def loadmat(filename):
    '''
    this function should be called instead of direct spio.loadmat
    as it cures the problem of not properly recovering python dictionaries
    from mat files. It calls the function check keys to cure all entries
    which are still mat-objects
    '''
    data = sio.loadmat(filename, struct_as_record=False, squeeze_me=True)
    return _check_keys(data)
 def _check_keys(dict):
    '''
    checks if entries in dictionary are mat-objects. If yes
    todict is called to change them to nested dictionaries
    '''
    for key in dict:
        if isinstance(dict[key], sio.matlab.mio5_params.mat_struct):
            dict[key] = _todict(dict[key])
    return dict        
 def _todict(matobj):
    '''
    A recursive function which constructs from matobjects nested dictionaries
    '''
    dict = {}
    for strg in matobj._fieldnames:
        elem = matobj.__dict__[strg]
        if isinstance(elem, sio.matlab.mio5_params.mat_struct):
            dict[strg] = _todict(elem)
        else:
            dict[strg] = elem
    return dict
--- a/read_images.py
+++ b/read_images.py
@ -0,0 +1,589 @@
 #==========================================================#
 #==========================================================#
 # Extract shorelines from Landsat images
 #==========================================================#
 #==========================================================#
 #==========================================================#
 # Initial settings
 #==========================================================#
 import os
 import numpy as np
 import matplotlib.pyplot as plt
 import ee
 import pdb
 # other modules
 from osgeo import gdal, ogr, osr
 import pickle
 import matplotlib.cm as cm
 from pylab import ginput
 from shapely.geometry import LineString
 # image processing modules
 import skimage.filters as filters 
 import skimage.exposure as exposure
 import skimage.transform as transform
 import sklearn.decomposition as decomposition
 import skimage.measure as measure
 import skimage.morphology as morphology
 # machine learning modules
 from sklearn.model_selection import train_test_split
 from sklearn.neural_network import MLPClassifier
 from sklearn.preprocessing import StandardScaler, Normalizer 
 from sklearn.externals import joblib
 # import own modules
 import functions.utils as utils
 import functions.sds as sds
 # some other settings
 np.seterr(all='ignore') # raise/ignore divisions by 0 and nans
 plt.rcParams['axes.grid'] = True
 plt.rcParams['figure.max_open_warning'] = 100
 ee.Initialize()
 #==========================================================#
 # Parameters
 #==========================================================#
 sitename = 'NARRA'
 cloud_thresh = 0.7      # threshold for cloud cover
 plot_bool = False       # if you want the plots
 output_epsg = 28356     # GDA94 / MGA Zone 56
 buffer_size = 7         # radius (in pixels) of disk for buffer (pixel classification)
 min_beach_size = 20     # number of pixels in a beach (pixel classification)
 dist_ref = 100          # maximum distance from reference point
 min_length_wl = 200     # minimum length of shoreline LineString to be kept 
 manual_bool = True      # to manually check images
 output = dict([])
 #==========================================================#
 # Metadata
 #==========================================================#
 filepath = os.path.join(os.getcwd(), 'data', sitename)
 with open(os.path.join(filepath, sitename + '_metadata' + '.pkl'), 'rb') as f:
    metadata = pickle.load(f)
 #%%    
 #==========================================================#
 # Read S2 images
 #==========================================================#
 satname = 'S2'
 dates = metadata[satname]['dates']
 input_epsg = 32756 # metadata[satname]['epsg']
 # path to images
 filepath10 = os.path.join(os.getcwd(), 'data', sitename, satname, '10m')
 filenames10 = os.listdir(filepath10)
 filepath20 = os.path.join(os.getcwd(), 'data', sitename, satname, '20m')
 filenames20 = os.listdir(filepath20)
 filepath60 = os.path.join(os.getcwd(), 'data', sitename, satname, '60m')
 filenames60 = os.listdir(filepath60)
 if (not len(filenames10) == len(filenames20)) or (not len(filenames20) == len(filenames60)):
    raise 'error: not the same amount of files for 10, 20 and 60 m'
 N = len(filenames10)
 # initialise variables
 cloud_cover_ts = []
 acc_georef_ts = []
 date_acquired_ts = []
 filename_ts = []
 satname_ts = []
 timestamp = []
 shorelines = []
 idx_skipped = []
 spacing = '=========================================================='
 msg = ' %s\n %s\n %s' % (spacing, satname, spacing)
 print(msg)
 for i in range(N):
    # read 10m bands
    fn = os.path.join(filepath10, filenames10[i])
    data = gdal.Open(fn, gdal.GA_ReadOnly)
    georef = np.array(data.GetGeoTransform())
    bands = [data.GetRasterBand(k + 1).ReadAsArray() for k in range(data.RasterCount)]
    im10 = np.stack(bands, 2)
    im10 = im10/10000 # TOA scaled to 10000
    # if image is only zeros, skip it
    if sum(sum(sum(im10))) < 1:
        print('skip ' + str(i) + ' - no data')
        idx_skipped.append(i)
        continue        
    nrows = im10.shape[0]
    ncols = im10.shape[1]
    # read 20m band (SWIR1)
    fn = os.path.join(filepath20, filenames20[i])
    data = gdal.Open(fn, gdal.GA_ReadOnly)
    bands = [data.GetRasterBand(k + 1).ReadAsArray() for k in range(data.RasterCount)]
    im20 = np.stack(bands, 2)
    im20 = im20[:,:,0]
    im20 = im20/10000 # TOA scaled to 10000
    im_swir = transform.resize(im20, (nrows, ncols), order=1, preserve_range=True, mode='constant')
    im_swir = np.expand_dims(im_swir, axis=2)
    # append down-sampled swir band to the 10m bands
    im_ms = np.append(im10, im_swir, axis=2)
    # read 60m band (QA)
    fn = os.path.join(filepath60, filenames60[i])
    data = gdal.Open(fn, gdal.GA_ReadOnly)
    bands = [data.GetRasterBand(k + 1).ReadAsArray() for k in range(data.RasterCount)]
    im60 = np.stack(bands, 2)
    im_qa = im60[:,:,0]
    cloud_mask = sds.create_cloud_mask(im_qa, satname, plot_bool)
    cloud_mask = transform.resize(cloud_mask,(nrows, ncols), order=0, preserve_range=True, mode='constant')
    # check if -inf or nan values on any band and add to cloud mask
    for k in range(im_ms.shape[2]):   
            im_inf = np.isin(im_ms[:,:,k], -np.inf)
            im_nan = np.isnan(im_ms[:,:,k])
            cloud_mask = np.logical_or(np.logical_or(cloud_mask, im_inf), im_nan)
    # calculate cloud cover and if above threshold, skip it
    cloud_cover = sum(sum(cloud_mask.astype(int)))/(cloud_mask.shape[0]*cloud_mask.shape[1])
    if cloud_cover > cloud_thresh:
        print('skip ' + str(i) + ' - cloudy (' + str(np.round(cloud_cover*100).astype(int)) + '%)')
        idx_skipped.append(i)
        continue
    # rescale image intensity for display purposes
    im_display = sds.rescale_image_intensity(im_ms[:,:,[2,1,0]], cloud_mask, 99.9, False)
    # classify image in 4 classes (sand, whitewater, water, other) with NN classifier
    im_classif, im_labels = sds.classify_image_NN_nopan(im_ms, cloud_mask, min_beach_size, plot_bool)
    # if there aren't any sandy pixels
    if sum(sum(im_labels[:,:,0])) == 0 :
        # use global threshold
        im_ndwi = sds.nd_index(im_ms[:,:,4], im_ms[:,:,1], cloud_mask, plot_bool)
        contours = sds.find_wl_contours(im_ndwi, cloud_mask, plot_bool)
    else:
        # use specific threhsold
        contours_wi, contours_mwi = sds.find_wl_contours2(im_ms, im_labels, cloud_mask, buffer_size, plot_bool)
    # convert from pixels to world coordinates
    wl_coords = sds.convert_pix2world(contours_mwi, georef)
    # convert to output epsg spatial reference
    wl = sds.convert_epsg(wl_coords, input_epsg, output_epsg)
    # remove contour lines that have a perimeter < min_length_wl
    wl_good = []
    for l, wls in enumerate(wl):
        coords = [(wls[k,0], wls[k,1]) for k in range(len(wls))]
        a = LineString(coords) # shapely LineString structure
        if a.length >= min_length_wl:
            wl_good.append(wls)
    # format points and only select the ones close to the refpoints
    x_points = np.array([])
    y_points = np.array([])
    for k in range(len(wl_good)):
        x_points = np.append(x_points,wl_good[k][:,0])
        y_points = np.append(y_points,wl_good[k][:,1])
    wl_good = np.transpose(np.array([x_points,y_points]))
    temp = np.zeros((len(wl_good))).astype(bool)
    for k in range(len(refpoints)): 
        temp = np.logical_or(np.linalg.norm(wl_good - refpoints[k,[0,1]], axis=1) < dist_ref, temp) 
    wl_final = wl_good[temp]
    # plot output
    plt.figure()
    im = np.copy(im_display)
    colours = np.array([[1,128/255,0/255],[204/255,1,1],[0,0,204/255]])
    for k in range(0,im_labels.shape[2]):
        im[im_labels[:,:,k],0] = colours[k,0]
        im[im_labels[:,:,k],1] = colours[k,1]
        im[im_labels[:,:,k],2] = colours[k,2]
    plt.imshow(im)
    for k,contour in enumerate(contours_mwi): plt.plot(contour[:, 1], contour[:, 0], linewidth=2, color='k', linestyle='--')
    plt.title(satname + '     ' + metadata[satname]['dates'][i].strftime('%Y-%m-%d') + '      acc : ' + str(metadata[satname]['acc_georef'][i]) + ' m' )
    plt.draw()
    pt_in = np.array(ginput(n=1, timeout=1000))
    plt.close()
    # if image is rejected, skip it
    if pt_in[0][1] > nrows/2:
        print('skip ' + str(i) + ' - rejected')
        idx_skipped.append(i)
        continue
    # if accepted, store the data
    cloud_cover_ts.append(cloud_cover)
    acc_georef_ts.append(metadata[satname]['acc_georef'][i])
    filename_ts.append(filenames10[i])
    satname_ts.append(satname)
    date_acquired_ts.append(filenames10[i][:10])
    timestamp.append(metadata[satname]['dates'][i])
    shorelines.append(wl_final)
 # store in output structure
 output[satname] = {'dates':timestamp, 'shorelines':shorelines, 'idx_skipped':idx_skipped,
      'metadata':{'filenames':filename_ts, 'satname':satname_ts, 'cloud_cover':cloud_cover_ts,
                  'acc_georef':acc_georef_ts}}
 del idx_skipped
 #%%
 #==========================================================#
 # Read L7&L8 images
 #==========================================================#
 satname = 'L8'
 dates = metadata[satname]['dates']
 input_epsg = 32656 # metadata[satname]['epsg']
 # path to images
 filepath_pan = os.path.join(os.getcwd(), 'data', sitename, 'L7&L8', 'pan')
 filepath_ms = os.path.join(os.getcwd(), 'data', sitename, 'L7&L8', 'ms')
 filenames_pan = os.listdir(filepath_pan)
 filenames_ms = os.listdir(filepath_ms)
 if (not len(filenames_pan) == len(filenames_ms)):
    raise 'error: not the same amount of files for pan and ms'
 N = len(filenames_pan)
 # initialise variables
 cloud_cover_ts = []
 acc_georef_ts = []
 date_acquired_ts = []
 filename_ts = []
 satname_ts = []
 timestamp = []
 shorelines = []
 idx_skipped = []
 spacing = '=========================================================='
 msg = ' %s\n %s\n %s' % (spacing, satname, spacing)
 print(msg)
 for i in range(N):
    # get satellite name
    sat = filenames_pan[i][20:22]
    # read pan image
    fn_pan = os.path.join(filepath_pan, filenames_pan[i])
    data = gdal.Open(fn_pan, gdal.GA_ReadOnly)
    georef = np.array(data.GetGeoTransform())
    bands = [data.GetRasterBand(k + 1).ReadAsArray() for k in range(data.RasterCount)]
    im_pan = np.stack(bands, 2)[:,:,0]
    nrows = im_pan.shape[0]
    ncols = im_pan.shape[1]
    # read ms image
    fn_ms = os.path.join(filepath_ms, filenames_ms[i])
    data = gdal.Open(fn_ms, gdal.GA_ReadOnly)
    bands = [data.GetRasterBand(k + 1).ReadAsArray() for k in range(data.RasterCount)]
    im_ms = np.stack(bands, 2)
    # cloud mask
    im_qa = im_ms[:,:,5]
    cloud_mask = sds.create_cloud_mask(im_qa, sat, plot_bool)
    cloud_mask = transform.resize(cloud_mask, (nrows, ncols), order=0, preserve_range=True, mode='constant').astype('bool_')    
    # resize the image using bilinear interpolation (order 1)
    im_ms = im_ms[:,:,:5]
    im_ms = transform.resize(im_ms,(nrows, ncols), order=1, preserve_range=True, mode='constant')
    # check if -inf or nan values on any band and add to cloud mask
    for k in range(im_ms.shape[2]+1): 
        if k == 5:
            im_inf = np.isin(im_pan, -np.inf)
            im_nan = np.isnan(im_pan)        
        else:  
            im_inf = np.isin(im_ms[:,:,k], -np.inf)
            im_nan = np.isnan(im_ms[:,:,k])
        cloud_mask = np.logical_or(np.logical_or(cloud_mask, im_inf), im_nan)
    # calculate cloud cover and skip image if above threshold
    cloud_cover = sum(sum(cloud_mask.astype(int)))/(cloud_mask.shape[0]*cloud_mask.shape[1])
    if cloud_cover > cloud_thresh:
        print('skip ' + str(i) + ' - cloudy (' + str(np.round(cloud_cover*100).astype(int)) + '%)')
        idx_skipped.append(i)
        continue
    # Pansharpen image (different for L8 and L7)
    if sat == 'L7':
        # pansharpen (Green, Red, NIR) and downsample Blue and SWIR1
        im_ms_ps = sds.pansharpen(im_ms[:,:,[1,2,3]], im_pan, cloud_mask, plot_bool)
        im_ms_ps = np.append(im_ms[:,:,[0]], im_ms_ps, axis=2)
        im_ms_ps = np.append(im_ms_ps, im_ms[:,:,[4]], axis=2)
        im_display = sds.rescale_image_intensity(im_ms[:,:,[2,1,0]], cloud_mask, 99.9, False)
    elif sat == 'L8':
        # pansharpen RGB image and downsample NIR and SWIR1
        im_ms_ps = sds.pansharpen(im_ms[:,:,[0,1,2]], im_pan, cloud_mask, plot_bool)
        im_ms_ps = np.append(im_ms_ps, im_ms[:,:,[3,4]], axis=2) 
        im_display = sds.rescale_image_intensity(im_ms_ps[:,:,[2,1,0]], cloud_mask, 99.9, False)
    # classify image in 4 classes (sand, whitewater, water, other) with NN classifier
    im_classif, im_labels = sds.classify_image_NN(im_ms_ps, im_pan, cloud_mask, min_beach_size, plot_bool)
    # if there aren't any sandy pixels
    if sum(sum(im_labels[:,:,0])) == 0 :
        # use global threshold
        im_ndwi = sds.nd_index(im_ms_ps[:,:,4], im_ms_ps[:,:,1], cloud_mask, plot_bool)
        contours = sds.find_wl_contours(im_ndwi, cloud_mask, plot_bool)
    else:
        # use specific threhsold
        contours_wi, contours_mwi = sds.find_wl_contours2(im_ms_ps, im_labels, cloud_mask, buffer_size, plot_bool)
    # convert from pixels to world coordinates
    wl_coords = sds.convert_pix2world(contours_mwi, georef)
    # convert to output epsg spatial reference
    wl = sds.convert_epsg(wl_coords, input_epsg, output_epsg)
    # remove contour lines that have a perimeter < min_length_wl
    wl_good = []
    for l, wls in enumerate(wl):
        coords = [(wls[k,0], wls[k,1]) for k in range(len(wls))]
        a = LineString(coords) # shapely LineString structure
        if a.length >= min_length_wl:
            wl_good.append(wls)   
    # format points and only select the ones close to the refpoints
    x_points = np.array([])
    y_points = np.array([])
    for k in range(len(wl_good)):
        x_points = np.append(x_points,wl_good[k][:,0])
        y_points = np.append(y_points,wl_good[k][:,1])
    wl_good = np.transpose(np.array([x_points,y_points]))
    temp = np.zeros((len(wl_good))).astype(bool)
    for k in range(len(refpoints)): 
        temp = np.logical_or(np.linalg.norm(wl_good - refpoints[k,[0,1]], axis=1) < dist_ref, temp) 
    wl_final = wl_good[temp]   
    # plot output
    plt.figure()
    plt.subplot(121)
    im = np.copy(im_display)
    colours = np.array([[1,128/255,0/255],[204/255,1,1],[0,0,204/255]])
    for k in range(0,im_labels.shape[2]):
        im[im_labels[:,:,k],0] = colours[k,0]
        im[im_labels[:,:,k],1] = colours[k,1]
        im[im_labels[:,:,k],2] = colours[k,2]
    plt.imshow(im)
    for k,contour in enumerate(contours_mwi): plt.plot(contour[:, 1], contour[:, 0], linewidth=2, color='k', linestyle='--')
    plt.title(sat + '     ' + metadata[satname]['dates'][i].strftime('%Y-%m-%d') + '      acc : ' + str(metadata[satname]['acc_georef'][i]) + ' m' )
    pt_in = np.array(ginput(n=1, timeout=1000))
    plt.close()
    # if image is rejected, skip it
    if pt_in[0][1] > nrows/2:
        print('skip ' + str(i) + ' - rejected')
        idx_skipped.append(i)
        continue
    # if accepted, store the data
    cloud_cover_ts.append(cloud_cover)
    acc_georef_ts.append(metadata[satname]['acc_georef'][i])
    filename_ts.append(filenames_pan[i])
    satname_ts.append(sat)
    date_acquired_ts.append(filenames_pan[i][:10])
    timestamp.append(metadata[satname]['dates'][i])
    shorelines.append(wl_final)
 # store in output structure
 output[satname] = {'dates':timestamp, 'shorelines':shorelines, 'idx_skipped':idx_skipped,
      'metadata':{'filenames':filename_ts, 'satname':satname_ts, 'cloud_cover':cloud_cover_ts,
                  'acc_georef':acc_georef_ts}}    
 del idx_skipped
 #%%
 #==========================================================#
 # Read L5 images
 #==========================================================#
 satname = 'L5'
 dates = metadata[satname]['dates']
 input_epsg = 32656 # metadata[satname]['epsg']
 # path to images
 filepath_img = os.path.join(os.getcwd(), 'data', sitename, satname, '30m')
 filenames = os.listdir(filepath_img)
 N = len(filenames)
 # initialise variables
 cloud_cover_ts = []
 acc_georef_ts = []
 date_acquired_ts = []
 filename_ts = []
 satname_ts = []
 timestamp = []
 shorelines = []
 idx_skipped = []
 spacing = '=========================================================='
 msg = ' %s\n %s\n %s' % (spacing, satname, spacing)
 print(msg)
 for i in range(N):
    # read ms image
    fn = os.path.join(filepath_img, filenames[i])
    data = gdal.Open(fn, gdal.GA_ReadOnly)
    georef = np.array(data.GetGeoTransform())
    bands = [data.GetRasterBand(k + 1).ReadAsArray() for k in range(data.RasterCount)]
    im_ms = np.stack(bands, 2)
    # down-sample to half hte original pixel size
    nrows = im_ms.shape[0]*2
    ncols = im_ms.shape[1]*2
    # cloud mask
    im_qa = im_ms[:,:,5]
    im_ms = im_ms[:,:,:-1]
    cloud_mask = sds.create_cloud_mask(im_qa, satname, plot_bool)
    cloud_mask = transform.resize(cloud_mask, (nrows, ncols), order=0, preserve_range=True, mode='constant').astype('bool_')
    # resize the image using bilinear interpolation (order 1)
    im_ms = transform.resize(im_ms,(nrows, ncols), order=1, preserve_range=True, mode='constant')
    # adjust georef vector (scale becomes 15m and origin is adjusted to the center of new corner pixel)
    georef[1] = 15
    georef[5] = -15
    georef[0] = georef[0] + 7.5
    georef[3] = georef[3] - 7.5
    # check if -inf or nan values on any band and add to cloud mask
    for k in range(im_ms.shape[2]):   
        im_inf = np.isin(im_ms[:,:,k], -np.inf)
        im_nan = np.isnan(im_ms[:,:,k])
        cloud_mask = np.logical_or(np.logical_or(cloud_mask, im_inf), im_nan)
    # calculate cloud cover and skip image if above threshold
    cloud_cover = sum(sum(cloud_mask.astype(int)))/(cloud_mask.shape[0]*cloud_mask.shape[1])
    if cloud_cover > cloud_thresh:
        print('skip ' + str(i) + ' - cloudy (' + str(np.round(cloud_cover*100).astype(int)) + '%)')
        idx_skipped.append(i)
        continue
    # rescale image intensity for display purposes
    im_display = sds.rescale_image_intensity(im_ms[:,:,[2,1,0]], cloud_mask, 99.9, False)
    # classify image in 4 classes (sand, whitewater, water, other) with NN classifier
    im_classif, im_labels = sds.classify_image_NN_nopan(im_ms, cloud_mask, min_beach_size, plot_bool)
    # if there aren't any sandy pixels
    if sum(sum(im_labels[:,:,0])) == 0 :
        # use global threshold
        im_ndwi = sds.nd_index(im_ms[:,:,4], im_ms[:,:,1], cloud_mask, plot_bool)
        contours = sds.find_wl_contours(im_ndwi, cloud_mask, plot_bool)
    else:
        # use specific threhsold
        contours_wi, contours_mwi = sds.find_wl_contours2(im_ms, im_labels, cloud_mask, buffer_size, plot_bool)
    # convert from pixels to world coordinates
    wl_coords = sds.convert_pix2world(contours_mwi, georef)
    # convert to output epsg spatial reference
    wl = sds.convert_epsg(wl_coords, input_epsg, output_epsg)
    # remove contour lines that have a perimeter < min_length_wl
    wl_good = []
    for l, wls in enumerate(wl):
        coords = [(wls[k,0], wls[k,1]) for k in range(len(wls))]
        a = LineString(coords) # shapely LineString structure
        if a.length >= min_length_wl:
            wl_good.append(wls)
    # format points and only select the ones close to the refpoints
    x_points = np.array([])
    y_points = np.array([])
    for k in range(len(wl_good)):
        x_points = np.append(x_points,wl_good[k][:,0])
        y_points = np.append(y_points,wl_good[k][:,1])
    wl_good = np.transpose(np.array([x_points,y_points]))
    temp = np.zeros((len(wl_good))).astype(bool)
    for k in range(len(refpoints)): 
        temp = np.logical_or(np.linalg.norm(wl_good - refpoints[k,[0,1]], axis=1) < dist_ref, temp) 
    wl_final = wl_good[temp]
    # plot output
    plt.figure()
    plt.subplot(121)
    im = np.copy(im_display)
    colours = np.array([[1,128/255,0/255],[204/255,1,1],[0,0,204/255]])
    for k in range(0,im_labels.shape[2]):
        im[im_labels[:,:,k],0] = colours[k,0]
        im[im_labels[:,:,k],1] = colours[k,1]
        im[im_labels[:,:,k],2] = colours[k,2]
    plt.imshow(im)
    for k,contour in enumerate(contours_mwi): plt.plot(contour[:, 1], contour[:, 0], linewidth=2, color='k', linestyle='--')
    plt.title(satname + '     ' + metadata[satname]['dates'][i].strftime('%Y-%m-%d') + '      acc : ' + str(metadata[satname]['acc_georef'][i]) + ' m' )
    plt.subplot(122)
    plt.axis('equal')
    plt.axis('off')
    plt.plot(refpoints[:,0], refpoints[:,1], 'k.')
    plt.plot(wl_final[:,0], wl_final[:,1], 'r.')
    mng = plt.get_current_fig_manager()                                         
    mng.window.showMaximized()
    plt.tight_layout()
    plt.draw()
    pt_in = np.array(ginput(n=1, timeout=1000))
    plt.close()
    # if image is rejected, skip it
    if pt_in[0][1] > nrows/2:
        print('skip ' + str(i) + ' - rejected')
        idx_skipped.append(i)
        continue
    # if accepted, store the data
    cloud_cover_ts.append(cloud_cover)
    acc_georef_ts.append(metadata[satname]['acc_georef'][i])
    filename_ts.append(filenames[i])
    satname_ts.append(satname)
    date_acquired_ts.append(filenames[i][:10])
    timestamp.append(metadata[satname]['dates'][i])
    shorelines.append(wl_final)
 # store in output structure
 output[satname] = {'dates':timestamp, 'shorelines':shorelines, 'idx_skipped':idx_skipped,
      'metadata':{'filenames':filename_ts, 'satname':satname_ts, 'cloud_cover':cloud_cover_ts,
                  'acc_georef':acc_georef_ts}}    
 del idx_skipped
 #==========================================================#
 #==========================================================#
 #==========================================================#
 #==========================================================#
 #%%
 # save output
 with open(os.path.join(filepath, sitename + '_output' + '.pkl'), 'wb') as f:
    pickle.dump(output, f)
 # save idx_skipped
 #idx_skipped = dict([])
 #for satname in list(output.keys()):
 #    idx_skipped[satname] = output[satname]['idx_skipped']
 #with open(os.path.join(filepath, sitename + '_idxskipped' + '.pkl'), 'wb') as f:
 #    pickle.dump(idx_skipped, f)
--- a/sl_comparison_NARRA.py
+++ b/sl_comparison_NARRA.py
@ -1,382 +0,0 @@
 # -*- coding: utf-8 -*-
 #==========================================================#
 # Compare Narrabeen SDS with 3D quadbike surveys
 #==========================================================#
 # Initial settings
 import os
 import numpy as np
 import matplotlib.pyplot as plt
 import pdb
 import ee
 import matplotlib.dates as mdates
 import matplotlib.cm as cm
 from datetime import datetime, timedelta
 import pickle
 import pytz
 import scipy.io as sio
 import scipy.interpolate as interpolate
 import statsmodels.api as sm
 import skimage.measure as measure
 # my functions
 import functions.utils as utils
 # some settings
 np.seterr(all='ignore') # raise/ignore divisions by 0 and nans
 plt.rcParams['axes.grid'] = True
 plt.rcParams['figure.max_open_warning'] = 100
 au_tz = pytz.timezone('Australia/Sydney')
 # load quadbike dates and convert from datenum to datetime
 filename = 'data\quadbike\survey_dates.mat'
 filepath = os.path.join(os.getcwd(), filename)
 dates_quad = sio.loadmat(filepath)['dates'] # matrix containing year, month, day
 dates_quad = [datetime(dates_quad[i,0], dates_quad[i,1], dates_quad[i,2],
                       tzinfo=au_tz) for i in range(dates_quad.shape[0])]
 # load timestamps from satellite images
 satname = 'L8'
 sitename = 'NARRA'
 filepath = os.path.join(os.getcwd(), 'data', satname, sitename)
 with open(os.path.join(filepath, sitename + '_output_new' + '.pkl'), 'rb') as f:
    output = pickle.load(f)
 dates_l8 = output['t']
 # convert to AEST
 dates_l8 = [_.astimezone(au_tz) for _ in dates_l8]
 # remove duplicates
 dates_l8_str = [_.strftime('%Y%m%d') for _ in dates_l8]
 dupl = utils.duplicates_dict(dates_l8_str)
 idx_remove = []
 for k,v in dupl.items():
    idx1 = v[0]
    idx2 = v[1]
    c1 = output['cloud_cover'][idx1]
    c2 = output['cloud_cover'][idx2]
    g1 = output['acc_georef'][idx1]
    g2 = output['acc_georef'][idx2]
    if c1 < c2 - 0.01:
        idx_remove.append(idx2)
    elif g1 < g2 - 0.1:
        idx_remove.append(idx2)
    else:
        idx_remove.append(idx1)
 idx_remove = sorted(idx_remove)
 idx_all = np.linspace(0,70,71)
 idx_keep = list(np.where(~np.isin(idx_all,idx_remove))[0])
 output['t'] = [output['t'][k] for k in idx_keep]
 output['shorelines'] = [output['shorelines'][k] for k in idx_keep]
 output['cloud_cover'] = [output['cloud_cover'][k] for k in idx_keep]
 output['acc_georef'] = [output['acc_georef'][k] for k in idx_keep]
 # convert to AEST
 dates_l8 = output['t']
 dates_l8 = [_.astimezone(au_tz) for _ in dates_l8]
 # load wave data (already AEST)
 filename = 'data\wave\SydneyProcessed.mat'
 filepath = os.path.join(os.getcwd(), filename)
 wave_data = sio.loadmat(filepath)
 idx = utils.find_indices(wave_data['dates'][:,0], lambda e: e >= dates_l8[0].year and e <= dates_l8[-1].year)
 hsig = np.array([wave_data['Hsig'][i][0] for i in idx])
 wdir = np.array([wave_data['Wdir'][i][0] for i in idx])
 dates_wave = [datetime(wave_data['dates'][i,0], wave_data['dates'][i,1],
                       wave_data['dates'][i,2], wave_data['dates'][i,3],
                       wave_data['dates'][i,4], wave_data['dates'][i,5],
                       tzinfo=au_tz) for i in idx]
 # load tide data (already AEST)
 filename = 'SydTideData.mat'
 filepath = os.path.join(os.getcwd(), 'data', 'tide', filename)
 tide_data = sio.loadmat(filepath)
 idx = utils.find_indices(tide_data['dates'][:,0], lambda e: e >= dates_l8[0].year and e <= dates_l8[-1].year)
 tide = np.array([tide_data['tide'][i][0] for i in idx])
 dates_tide = [datetime(tide_data['dates'][i,0], tide_data['dates'][i,1],
                       tide_data['dates'][i,2], tide_data['dates'][i,3],
                       tide_data['dates'][i,4], tide_data['dates'][i,5],
                       tzinfo=au_tz) for i in idx]
 #%% make a plot of all the dates with wave data
 orange = [255/255,140/255,0]
 blue = [0,191/255,255/255]
 f = plt.figure()
 months = mdates.MonthLocator()
 month_fmt = mdates.DateFormatter('%b %Y')
 days = mdates.DayLocator()
 years = [2013,2014,2015,2016]
 for k in range(len(years)):
    sel_year = years[k]
    ax = plt.subplot(4,1,k+1)
    idx_year = utils.find_indices(dates_wave, lambda e : e.year >= sel_year and e.year <= sel_year)
    plt.plot([dates_wave[i] for i in idx_year], [hsig[i] for i in idx_year], 'k-', linewidth=0.5)
    hsigmax = np.nanmax([hsig[i] for i in idx_year])
    cbool = True
    for j in range(len(dates_quad)):
        if dates_quad[j].year == sel_year:
            if cbool:
                plt.plot([dates_quad[j], dates_quad[j]], [0, hsigmax], color=orange, label='survey')
                cbool = False
            else:
                plt.plot([dates_quad[j], dates_quad[j]], [0, hsigmax], color=orange)
    cbool = True
    for j in range(len(dates_l8)):
        if dates_l8[j].year == sel_year:
            if cbool:
                plt.plot([dates_l8[j], dates_l8[j]], [0, hsigmax], color=blue, label='landsat8')
                cbool = False
            else:
                plt.plot([dates_l8[j], dates_l8[j]], [0, hsigmax], color=blue)
    if k == 3:
        plt.legend()
    plt.xlim((datetime(sel_year,1,1), datetime(sel_year,12,31, tzinfo=au_tz)))
    plt.ylim((0, hsigmax))
    plt.ylabel('Hs [m]')
    ax.xaxis.set_major_locator = months
    ax.xaxis.set_major_formatter(month_fmt)
 f.subplots_adjust(hspace=0.2)
 plt.draw()
 #%% calculate difference between dates (quad and sat)
 diff_days = [ [(x - _).days for _ in dates_quad] for x in dates_l8]
 max_diff = 14
 idx_closest = [utils.find_indices(_, lambda e: abs(e) <= max_diff) for _ in diff_days]
 # store in dates_diff dictionnary
 dates_diff = []
 cloud_cover = []
 for i in range(len(idx_closest)):
    if not idx_closest[i]:
        continue
    elif len(idx_closest[i]) > 1:
        idx_best = np.argmin(np.abs([diff_days[i][_] for _ in idx_closest[i]]))
        dates_temp = [dates_quad[_] for _ in idx_closest[i]]
        days_temp = [diff_days[i][_] for _ in idx_closest[i]]
        dates_diff.append({"date sat": dates_l8[i],
                           "date quad": dates_temp[idx_best],
                           "days diff": days_temp[idx_best]})
    else:
        dates_diff.append({"date sat": dates_l8[i],
                           "date quad": dates_quad[idx_closest[i][0]],
                           "days diff": diff_days[i][idx_closest[i][0]]
                           }) 
    # store cloud data
    cloud_cover.append(output['cloud_cover'][i])
 # store wave data
 wave_hsig = []
 for i in range(len(dates_diff)):
    wave_hsig.append(hsig[np.argmin(np.abs(np.array([(dates_diff[i]['date sat'] - _).total_seconds() for _ in dates_wave])))])
 # make a plot
 plt.figure()
 counter = 0
 for i in range(len(dates_diff)):
    counter = counter + 1
    if dates_diff[i]['date quad'] > dates_diff[i]['date sat']:
        date_min = dates_diff[i]['date sat']
        date_max = dates_diff[i]['date quad']
        color1 = orange
        color2 = blue
    else:
        date_min = dates_diff[i]['date quad']
        date_max = dates_diff[i]['date sat']
        color1 = blue
        color2 = orange
    idx_t = utils.find_indices(dates_wave, lambda e : e >= date_min and e <= date_max)
    hsigmax = np.nanmax([hsig[i] for i in idx_t])
    hsigmin = np.nanmin([hsig[i] for i in idx_t])
    if counter > 9:
        counter = 1
        plt.figure()
    ax = plt.subplot(3,3,counter)
    plt.plot([dates_wave[i] for i in idx_t], [hsig[i] for i in idx_t], 'k-', linewidth=1.5)
    plt.plot([date_min, date_min], [0, 4.5], color=color2, label='survey')
    plt.plot([date_max, date_max], [0, 4.5], color=color1, label='landsat8')
    plt.ylabel('Hs [m]')
    ax.xaxis.set_major_locator(mdates.DayLocator(tz=au_tz))
    ax.xaxis.set_minor_locator(mdates.HourLocator(tz=au_tz))
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%d'))
    ax.xaxis.set_minor_locator(months)
    plt.title(dates_diff[i]['date sat'].strftime('%b %Y') + '   (' + str(abs(dates_diff[i]['days diff'])) + ' days)')
    plt.draw()
    plt.gcf().subplots_adjust(hspace=0.5)   
 # mean day difference
 np.mean([ np.abs(_['days diff']) for _ in dates_diff])
 #%% Compare shorelines in elevation
 dist_buffer = 50 # buffer of points selected for interpolation
 # load quadbike .mat files 
 foldername = 'data\quadbike\surveys3D'
 folderpath = os.path.join(os.getcwd(), foldername)
 filenames = os.listdir(folderpath)
 # get the satellite shorelines
 sl = output['shorelines']
 # get dates from filenames
 dates_quad = [datetime(int(_[6:10]), int(_[11:13]), int(_[14:16]), tzinfo= au_tz) for _ in filenames]
 zav = []
 ztide = []
 sl_gt = []
 for i in range(len(dates_diff)):
    sl_smooth = sl[i]
    # select closest 3D survey and load .mat file
    idx_closest = np.argmin(np.abs(np.array([(dates_diff[i]['date sat'] - _).days for _ in dates_quad])))
    survey3d = sio.loadmat(os.path.join(folderpath, filenames[idx_closest]))
    # reshape to a vector
    xs = survey3d['x'].reshape(survey3d['x'].shape[0] * survey3d['x'].shape[1])
    ys = survey3d['y'].reshape(survey3d['y'].shape[0] * survey3d['y'].shape[1])
    zs = survey3d['z'].reshape(survey3d['z'].shape[0] * survey3d['z'].shape[1])
    # remove nan values
    idx_nan = np.isnan(zs)
    xs = xs[~idx_nan]
    ys = ys[~idx_nan]
    zs = zs[~idx_nan]
    # find water level at the time the image was acquired
    idx_closest = np.argmin(np.abs(np.array([(dates_diff[i]['date sat'] - _).total_seconds() for _ in dates_tide])))
    tide_level = tide[idx_closest]
    ztide.append(tide_level)
    # find contour corresponding to the water level on 3D surface (if below minimum, add 0.05m increments)
    if tide_level < np.nanmin(survey3d['z']):
        tide_level = np.nanmin(survey3d['z'])
        sl_tide = measure.find_contours(survey3d['z'], tide_level)
        sl_tide = sl_tide[np.argmax(np.array([len(_) for _ in sl_tide]))]
        count = 0
        while len(sl_tide) < 900:
            count = count + 1
            tide_level = tide_level + 0.05*count
            sl_tide = measure.find_contours(survey3d['z'], tide_level)
            sl_tide = sl_tide[np.argmax(np.array([len(_) for _ in sl_tide]))]
        print('added ' + str(0.05*count) + ' cm   -  contour with ' + str(len(sl_tide)) + ' points')    
    else:
        sl_tide = measure.find_contours(survey3d['z'], tide_level)
        sl_tide = sl_tide[np.argmax(np.array([len(_) for _ in sl_tide]))]
    # remove nans
    if np.any(np.isnan(sl_tide)):
        index_nan = np.where(np.isnan(sl_tide))[0]
        sl_tide = np.delete(sl_tide, index_nan, axis=0) 
    # get x,y coordinates
    xtide = [survey3d['x'][int(np.round(sl_tide[m,0])), int(np.round(sl_tide[m,1]))] for m in range(sl_tide.shape[0])]
    ytide = [survey3d['y'][int(np.round(sl_tide[m,0])), int(np.round(sl_tide[m,1]))] for m in range(sl_tide.shape[0])]
    sl_gt.append(np.transpose(np.array([np.array(xtide), np.array(ytide)])))
    # interpolate SDS on 3D surface to get elevation (point by point)
    zq = []
    for j in range(sl_smooth.shape[0]):
        xq = sl_smooth[j,0]
        yq = sl_smooth[j,1]
        dist_q = np.linalg.norm(np.transpose(np.array([[xq - _ for _ in xs],[yq - _ for _ in ys]])), axis=1)
        idx_buffer = dist_q <= dist_buffer
        if sum(idx_buffer) > 0:
            tck = interpolate.bisplrep(xs[idx_buffer], ys[idx_buffer], zs[idx_buffer])
            zq.append(interpolate.bisplev(xq, yq, tck))
    zq = np.array(zq)
    plt.figure()
    plt.hist(zq, bins=100)
    plt.draw()
 #        plt.figure()
 #        plt.axis('equal')
 #        plt.scatter(xs, ys, s=10, c=zs, marker='o', cmap=cm.get_cmap('jet'),
 #                    label='quad data')
 #        plt.plot(xs[idx_buffer], ys[idx_buffer], 'ko')
 #        plt.plot(xq,yq,'ro')
 #        plt.draw()
    # store the alongshore median elevation
    zav.append(np.median(utils.reject_outliers(zq, m=2)))
    # make plot
    red = [255/255, 0, 0]
    gray = [0.75, 0.75, 0.75]
    plt.figure()
    plt.subplot(121)
    plt.axis('equal')
    plt.scatter(xs, ys, s=10, c=zs, marker='o', cmap=cm.get_cmap('jet'),
            label='3D survey')
    plt.plot(xtide, ytide, '--', color=gray, linewidth=2.5, label='tide level contour')
    plt.plot(sl_smooth[:,0], sl_smooth[:,1], '-', color=red, linewidth=2.5, label='SDS')
 #    plt.plot(sl[i][idx_beach,0], sl[i][idx_beach,1], 'w-', linewidth=2)
    plt.xlabel('Eastings [m]')
    plt.ylabel('Northings [m]')
    plt.title('Shoreline comparison')
    plt.colorbar(label='mAHD')
    plt.legend()
    plt.ylim((6266100, 6267000))
    plt.subplot(122)
    plt.plot(np.linspace(0,1,len(zq)), zq, 'ko-', markersize=5)
    plt.plot([0, 1], [zav[i], zav[i]], 'r-', label='median')
    plt.plot([0, 1], [ztide[i], ztide[i]], 'g--', label = 'measured tide')
    plt.xlabel('Northings [m]')
    plt.ylabel('Elevation [mAHD]')
    plt.title('Alongshore SDS elevation')
    plt.legend()
    mng = plt.get_current_fig_manager()                                         
    mng.window.showMaximized()
    plt.tight_layout()   
    plt.draw()
    print(i)
 #%% Calculate some error statistics
 zav = np.array(zav)
 ztide = np.array(ztide)
 f = plt.figure()
 plt.subplot(3,1,1)
 plt.bar(np.linspace(1,len(zav),len(zav)), zav-ztide)
 plt.ylabel('Error in z [m]')
 plt.title('Elevation error')
 plt.xticks([])
 plt.draw()
 plt.subplot(3,1,2)
 plt.bar(np.linspace(1,len(zav),len(zav)), wave_hsig, color=orange)
 plt.ylabel('Hsig [m]')
 plt.xticks([])
 plt.draw()
 plt.subplot(3,1,3)
 plt.bar(np.linspace(1,len(zav),len(zav)), np.array(cloud_cover)*100, color='g')
 plt.ylabel('Cloud cover %')
 plt.xlabel('comparison #')
 plt.grid(False)
 plt.grid(axis='y')
 f.subplots_adjust(hspace=0)
 plt.draw()
 np.sqrt(np.mean((zav - ztide)**2))
 #%% plot to show LOWESS smoothing
 #i = 0
 #idx_beach = [np.min(np.linalg.norm(sl[i][k,:] - narrabeach, axis=1)) < dist_thresh for k in range(sl[i].shape[0])]
 #x = sl[i][idx_beach,0]
 #y = sl[i][idx_beach,1]
 #sl_smooth = lowess(x,y, frac=1./10, it = 10)
 #
 #plt.figure()
 #plt.axis('equal')
 #plt.scatter
 #plt.plot(x,y,'bo', linewidth=2, label='original SDS')
 #plt.plot(sl_smooth[:,1], sl_smooth[:,0], 'ro', linewidth=2, label='smoothed SDS')
 #plt.legend()
 #plt.xlabel('Eastings [m]')
 #plt.ylabel('Northings [m]')
 #plt.title('Local weighted scatterplot smoothing (LOWESS)')
 #plt.draw()