From f9e8468015734024574b055a1b1ec5499d4a34fc Mon Sep 17 00:00:00 2001
From: Kilian Vos
Date: Sun, 30 Sep 2018 16:52:01 +1000
Subject: [PATCH] Delete 'functions/data_analysis.py'

---
 functions/data_analysis.py | 446 -------------------------------------
 1 file changed, 446 deletions(-)
 delete mode 100644 functions/data_analysis.py

diff --git a/functions/data_analysis.py b/functions/data_analysis.py
deleted file mode 100644
index 7ba36d5..0000000
--- a/functions/data_analysis.py
+++ /dev/null
@@ -1,446 +0,0 @@
-"""This module contains all the functions needed for data analysis"""
-
-# Initial settings
-import numpy as np
-import matplotlib.pyplot as plt
-import matplotlib.patches as mpatches
-from matplotlib import gridspec
-import pdb
-import ee
-
-# other modules
-from osgeo import gdal, ogr, osr
-import scipy.interpolate as interpolate
-import scipy.stats as sstats
-
-# image processing modules
-import skimage.filters as filters
-import skimage.exposure as exposure
-import skimage.transform as transform
-import sklearn.decomposition as decomposition
-import skimage.measure as measure
-import skimage.morphology as morphology
-
-# machine learning modules
-from sklearn.cluster import KMeans
-from sklearn.neural_network import MLPClassifier
-from sklearn.externals import joblib
-
-import time
-
-# import own modules
-import functions.utils as utils
-
-def get_tide(dates_sds, dates_tide, tide_level):
-    " finds the tide level closest in time (within half an hour) to each SDS date "
-    tide = []
-    for i in range(len(dates_sds)):
-        dates_diff = np.abs(np.array([(dates_sds[i] - _).total_seconds() for _ in dates_tide]))
-        if np.min(dates_diff) <= 1800: # half-an-hour
-            idx_closest = np.argmin(dates_diff)
-            tide.append(tide_level[idx_closest])
-        else:
-            tide.append(np.nan)
-    tide = np.array(tide)
-
-    return tide
-
-def remove_duplicates(output, satname):
-    " removes duplicates from the output structure, keeping the image with the lower cloud cover or the better georeferencing accuracy "
-    dates = output['dates']
-    dates_str = [_.strftime('%Y%m%d') for _ in dates]
-    dupl = utils.duplicates_dict(dates_str)
-    if dupl:
-        output_nodup = dict([])
-        idx_remove = []
-        if satname == 'L8' or satname == 'L5':
-            for k,v in dupl.items():
-
-                idx1 = v[0]
-                idx2 = v[1]
-
-                c1 = output['metadata']['cloud_cover'][idx1]
-                c2 = output['metadata']['cloud_cover'][idx2]
-                g1 = output['metadata']['acc_georef'][idx1]
-                g2 = output['metadata']['acc_georef'][idx2]
-
-                if c1 < c2 - 0.01:
-                    idx_remove.append(idx2)
-                elif g1 < g2 - 0.1:
-                    idx_remove.append(idx2)
-                else:
-                    idx_remove.append(idx1)
-
-        else:
-            for k,v in dupl.items():
-
-                idx1 = v[0]
-                idx2 = v[1]
-
-                c1 = output['metadata']['cloud_cover'][idx1]
-                c2 = output['metadata']['cloud_cover'][idx2]
-
-                if c1 < c2 - 0.01:
-                    idx_remove.append(idx2)
-                else:
-                    idx_remove.append(idx1)
-
-        idx_remove = sorted(idx_remove)
-        idx_all = np.linspace(0, len(dates_str)-1, len(dates_str))
-        idx_keep = list(np.where(~np.isin(idx_all,idx_remove))[0])
-
-        output_nodup['dates'] = [output['dates'][k] for k in idx_keep]
-        output_nodup['shorelines'] = [output['shorelines'][k] for k in idx_keep]
-        output_nodup['metadata'] = dict([])
-        for key in list(output['metadata'].keys()):
-            output_nodup['metadata'][key] = [output['metadata'][key][k] for k in idx_keep]
-        print(satname + ' : ' + str(len(idx_remove)) + ' duplicates')
-        return output_nodup
-
-    else:
-        print(satname + ' : ' + 'no duplicates')
-        return output
-
-
-def merge(output):
-    " merges data from the different satellites "
-
-    # stack all lists together under one key
-    output_all = {'dates':[], 'shorelines':[],
-                  'metadata':{'filenames':[], 'satname':[], 'cloud_cover':[], 'acc_georef':[]}}
-    for satname in list(output.keys()):
-        output_all['dates'] = output_all['dates'] + output[satname]['dates']
-        output_all['shorelines'] = output_all['shorelines'] + output[satname]['shorelines']
-        for key in list(output[satname]['metadata'].keys()):
-            output_all['metadata'][key] = output_all['metadata'][key] + output[satname]['metadata'][key]
-
-    output_all_sorted = {'dates':[], 'shorelines':[],
-                         'metadata':{'filenames':[], 'satname':[], 'cloud_cover':[], 'acc_georef':[]}}
-    # sort the dates
-    idx_sorted = sorted(range(len(output_all['dates'])), key=output_all['dates'].__getitem__)
-    output_all_sorted['dates'] = [output_all['dates'][i] for i in idx_sorted]
-    output_all_sorted['shorelines'] = [output_all['shorelines'][i] for i in idx_sorted]
-    for key in list(output_all['metadata'].keys()):
-        output_all_sorted['metadata'][key] = [output_all['metadata'][key][i] for i in idx_sorted]
-
-    return output_all_sorted
-
-def create_transects(x0, y0, orientation, chainage_length):
-    " creates shore-normal transects "
-
-    transects = []
-
-    for k in range(len(x0)):
-
-        # orientation of cross-shore profile
-        phi = (90 - orientation[k])*np.pi/180
-
-        # create a vector using the chainage length
-        x = np.linspace(0,chainage_length,chainage_length+1)
-        y = np.zeros(len(x))
-        coords = np.zeros((len(x),2))
-        coords[:,0] = x
-        coords[:,1] = y
-
-        # translate and rotate the vector using the origin and orientation
-        tf = transform.EuclideanTransform(rotation=phi, translation=(x0[k],y0[k]))
-        coords_tf = tf(coords)
-
-        transects.append(coords_tf)
-
-    return transects
-
-def calculate_chainage(sds, transects, orientation, along_dist):
-    " intersects the SDS with each transect and computes the chainage position "
-
-    chainage_mtx = np.zeros((len(sds),len(transects),6))
-
-    for i in range(len(sds)):
-
-        sl = sds[i]
-
-        for j in range(len(transects)):
-
-            # compute rotation matrix
-            X0 = transects[j][0,0]
-            Y0 = transects[j][0,1]
-            phi = (90 - orientation[j])*np.pi/180
-            Mrot = np.array([[np.cos(phi), np.sin(phi)],[-np.sin(phi), np.cos(phi)]])
-
-            # calculate point-to-line distance between the shoreline points and the profile
-            p1 = np.array([X0,Y0])
-            p2 = transects[j][-1,:]
-            p3 = sl
-            d = np.abs(np.cross(p2-p1,p3-p1)/np.linalg.norm(p2-p1))
-            idx_close = utils.find_indices(d, lambda e: e <= along_dist)
-
-            # check if there are SDS points around the profile or not
-            if not idx_close:
-                chainage_mtx[i,j,:] = np.tile(np.nan,(1,6))
-
-            else:
-                # change of base to shore-normal coordinate system
-                xy_close = np.array([sl[idx_close,0],sl[idx_close,1]]) - np.tile(np.array([[X0],[Y0]]), (1,len(sl[idx_close])))
-                xy_rot = np.matmul(Mrot, xy_close)
-
-                # put nan values if the chainage is negative (MAKE SURE TO PICK ORIGIN CORRECTLY)
-                if np.any(xy_rot[0,:] < 0):
-                    xy_rot[0,np.where(xy_rot[0,:] < 0)] = np.nan
-
-                # compute the mean, median, max, min and std of the chainage position
-                n_points = len(xy_rot[0,:])
-                mean_cross = np.nanmean(xy_rot[0,:])
-                median_cross = np.nanmedian(xy_rot[0,:])
-                max_cross = np.nanmax(xy_rot[0,:])
-                min_cross = np.nanmin(xy_rot[0,:])
-                std_cross = np.nanstd(xy_rot[0,:])
-
-                if std_cross > 10: # if large std, take the most seaward point
-                    mean_cross = max_cross
-                    median_cross = max_cross
-                    min_cross = max_cross
-#                    mean_cross = np.nan
-#                    median_cross = np.nan
-#                    min_cross = np.nan
-
-                # store the statistics
-                chainage_mtx[i,j,:] = np.array([mean_cross, median_cross, max_cross,
-                                                min_cross, n_points, std_cross])
-
-    # format into dictionary
-    chainage = dict([])
-    chainage['mean'] = chainage_mtx[:,:,0]
-    chainage['median'] = chainage_mtx[:,:,1]
-    chainage['max'] = chainage_mtx[:,:,2]
-    chainage['min'] = chainage_mtx[:,:,3]
-    chainage['npoints'] = chainage_mtx[:,:,4]
-    chainage['std'] = chainage_mtx[:,:,5]
-
-    return chainage
-
-def compare_sds(dates_sds, chain_sds, topo_profiles, mod=0, mindays=5):
-    """
-    Compares the SDS with groundtruth data from topographic surveys / Argus shorelines
-
-    KV WRL 2018
-
-    Arguments:
-    -----------
-    dates_sds: list
-        list of dates corresponding to each row in chain_sds
-    chain_sds: np.ndarray
-        array with the time series of chainage for each transect (each transect is one column)
-    topo_profiles: dict
-        dict containing the dates and chainage of the groundtruth
-    mod: 0 or 1
-        0 for linear interpolation between the 2 closest surveys, 1 for nearest neighbour only
-    mindays: int
-        minimum number of days for which the data can be compared
-
-    Returns:
-    -----------
-    stats: dict
-        contains all the statistics of the comparison
-
-    """
-
-    # create 3 figures
-    fig1 = plt.figure()
-    gs1 = gridspec.GridSpec(chain_sds.shape[1], 1)
-    axfig1 = []
-    fig2 = plt.figure()
-    gs2 = gridspec.GridSpec(2, chain_sds.shape[1])
-    axfig2 = []
-    fig3 = plt.figure()
-    gs3 = gridspec.GridSpec(2,1)
-    axfig3 = []
-
-    dates_sds_num = np.array([_.toordinal() for _ in dates_sds])
-    stats = dict([])
-    data_fin = dict([])
-
-    # for each transect, compare and plot the data
-    for i in range(chain_sds.shape[1]):
-
-        pfname = list(topo_profiles.keys())[i]
-        stats[pfname] = dict([])
-        data_fin[pfname] = dict([])
-
-        dates_sur = topo_profiles[pfname]['dates']
-        chain_sur = topo_profiles[pfname]['chainage']
-
-        # convert to datenum
-        dates_sur_num = np.array([_.toordinal() for _ in dates_sur])
-
-        chain_sur_interp = []
-        diff_days = []
-
-        for j, satdate in enumerate(dates_sds_num):
-
-            temp_diff = satdate - dates_sur_num
-
-            if mod==0:
-                # select the measurements before and after the sat image date and interpolate
-
-                ind_before = np.where(temp_diff == temp_diff[temp_diff > 0][-1])[0]
-                if ind_before == len(temp_diff)-1:
-                    chain_sur_interp.append(np.nan)
-                    diff_days.append(np.abs(satdate-dates_sur_num[ind_before])[0])
-                    continue
-                ind_after = np.where(temp_diff == temp_diff[temp_diff < 0][0])[0]
-                tempx = np.zeros(2)
-                tempx[0] = dates_sur_num[ind_before]
-                tempx[1] = dates_sur_num[ind_after]
-                tempy = np.zeros(2)
-                tempy[0] = chain_sur[ind_before]
-                tempy[1] = chain_sur[ind_after]
-                diff_days.append(np.abs(np.max([satdate-tempx[0], satdate-tempx[1]])))
-                # interpolate
-                f = interpolate.interp1d(tempx, tempy)
-                chain_sur_interp.append(f(satdate))
-
-            elif mod==1:
-                # select the closest measurement
-
-                idx_closest = utils.find_indices(np.abs(temp_diff), lambda e: e == np.min(np.abs(temp_diff)))[0]
-                diff_days.append(np.abs(satdate-dates_sur_num[idx_closest]))
-                if diff_days[j] > mindays:
-                    chain_sur_interp.append(np.nan)
-                else:
-                    chain_sur_interp.append(chain_sur[idx_closest])
-
-        chain_sur_interp = np.array(chain_sur_interp)
-
-        # remove nan values
-        idx_sur_nan = ~np.isnan(chain_sur_interp)
-        idx_sat_nan = ~np.isnan(chain_sds[:,i])
-        idx_nan = np.logical_and(idx_sur_nan, idx_sat_nan)
-
-        # groundtruth and sds
-        chain_sur_fin = chain_sur_interp[idx_nan]
-        chain_sds_fin = chain_sds[idx_nan,i]
-        dates_fin = [k for (k, v) in zip(dates_sds, idx_nan) if v]
-
-        # calculate statistics
-        slope, intercept, rvalue, pvalue, std_err = sstats.linregress(chain_sur_fin, chain_sds_fin)
-        R2 = rvalue**2
-        correlation = np.corrcoef(chain_sur_fin, chain_sds_fin)[0,1]
-        diff_chain = chain_sur_fin - chain_sds_fin
-
-        rmse = np.sqrt(np.nanmean((diff_chain)**2))
-        mean = np.nanmean(diff_chain)
-        std = np.nanstd(diff_chain)
-        q90 = np.percentile(np.abs(diff_chain), 90)
-
-        # store data
-        stats[pfname]['rmse'] = rmse
-        stats[pfname]['mean'] = mean
-        stats[pfname]['std'] = std
-        stats[pfname]['q90'] = q90
-        stats[pfname]['diffdays'] = diff_days
-        stats[pfname]['corr'] = correlation
-        stats[pfname]['linfit'] = {'slope':slope, 'intercept':intercept, 'R2':R2, 'pvalue':pvalue}
-
-        data_fin[pfname]['dates'] = dates_fin
-        data_fin[pfname]['sds'] = chain_sds_fin
-        data_fin[pfname]['survey'] = chain_sur_fin
-
-        # make time-series plot
-        plt.figure(fig1.number)
-        ax = fig1.add_subplot(gs1[i,0])
-        axfig1.append(ax)
-        plt.plot(dates_sur, chain_sur, '-', color='C1', markersize=2, label='survey data')
-#        plt.plot(dates_fin, chain_sur_fin, 'o', color=[0.3, 0.3, 0.3], markersize=2, label='survey interp')
-        plt.plot(dates_fin, chain_sds_fin, 'o--', color='C0', markersize=4, alpha=1, label='satellite data')
-        strtitle = '%s (correlation = %.2f)' % (pfname, correlation)
-        plt.title(strtitle, fontweight='bold')
-        plt.xlim([dates_sds[0], dates_sds[-1]])
-        plt.ylabel('cross-shore position [m]')
-        plt.legend()
-
-        # make scatter plot
-        plt.figure(fig2.number)
-        fig2.add_subplot(gs2[0,i])
-        plt.axis('equal')
-        plt.plot(chain_sur_fin, chain_sds_fin, 'ko', markersize=4, markerfacecolor='w', alpha=0.7)
-        xmax = np.max([np.nanmax(chain_sds_fin),np.nanmax(chain_sur_fin)])
-        xmin = np.min([np.nanmin(chain_sds_fin),np.nanmin(chain_sur_fin)])
-        ymax = np.max([np.nanmax(chain_sds_fin),np.nanmax(chain_sur_fin)])
-        ymin = np.min([np.nanmin(chain_sds_fin),np.nanmin(chain_sur_fin)])
-        plt.plot([xmin, xmax], [ymin, ymax], 'k--')
-        plt.plot([xmin, xmax], [xmin*slope + intercept, xmax*slope + intercept], 'r:')
-        str_corr = ' y = %.2f x + %.2f\n R2 = %.2f\n n = %d' % (slope, intercept, R2, len(diff_chain))
-        plt.text(xmin, 0.9*ymax, str_corr, bbox=dict(facecolor=[0.7,0.7,0.7], alpha=0.5), horizontalalignment='left')
-        plt.xlabel('chainage survey [m]')
-        plt.ylabel('chainage satellite [m]')
-        plt.title(pfname, fontweight='bold')
-
-        fig2.add_subplot(gs2[1,i])
-        binwidth = 3
-        bins = np.arange(min(diff_chain), max(diff_chain) + binwidth, binwidth)
-        density = plt.hist(diff_chain, bins=bins, density=True, color=[0.8, 0.8, 0.8], edgecolor='k')
-        plt.xlim([-50, 50])
-        plt.xlabel('error [m]')
-        str_stats = ' rmse = %.1f\n mean = %.1f\n std = %.1f\n q90 = %.1f' % (rmse, mean, std, q90)
-        plt.text(15, np.max(density[0])-0.015, str_stats, bbox=dict(facecolor=[0.8,0.8,0.8], alpha=0.3), horizontalalignment='left', fontsize=10)
-
-    fig1.set_size_inches(19.2, 9.28)
-    fig1.set_tight_layout(True)
-    fig2.set_size_inches(19.2, 9.28)
-    fig2.set_tight_layout(True)
-
-    # all transects together
-    chain_sds_all = []
-    chain_sur_all = []
-    for i in range(chain_sds.shape[1]):
-        pfname = list(topo_profiles.keys())[i]
-        chain_sds_all = np.append(chain_sds_all,data_fin[pfname]['sds'])
-        chain_sur_all = np.append(chain_sur_all,data_fin[pfname]['survey'])
-
-    # calculate statistics
-    slope, intercept, rvalue, pvalue, std_err = sstats.linregress(chain_sur_all, chain_sds_all)
-    R2 = rvalue**2
-    correlation = np.corrcoef(chain_sur_all, chain_sds_all)[0,1]
-    diff_chain_all = chain_sur_all - chain_sds_all
-
-    rmse = np.sqrt(np.nanmean((diff_chain_all)**2))
-    mean = np.nanmean(diff_chain_all)
-    std = np.nanstd(diff_chain_all)
-    q90 = np.percentile(np.abs(diff_chain_all), 90)
-
-    stats['all'] = {'rmse':rmse,'mean':mean,'std':std,'q90':q90, 'corr':correlation,
-                    'linfit':{'slope':slope, 'intercept':intercept, 'R2':R2, 'pvalue':pvalue}}
-
-    # make plot
-    plt.figure(fig3.number)
-    fig3.add_subplot(gs3[0,0])
-    plt.axis('equal')
-    plt.plot(chain_sur_all, chain_sds_all, 'ko', markersize=4, markerfacecolor='w', alpha=0.7)
-    xmax = np.max([np.nanmax(chain_sds_all),np.nanmax(chain_sur_all)])
-    xmin = np.min([np.nanmin(chain_sds_all),np.nanmin(chain_sur_all)])
-    ymax = np.max([np.nanmax(chain_sds_all),np.nanmax(chain_sur_all)])
-    ymin = np.min([np.nanmin(chain_sds_all),np.nanmin(chain_sur_all)])
-    plt.plot([xmin, xmax], [ymin, ymax], 'k--')
-    plt.plot([xmin, xmax], [xmin*slope + intercept, xmax*slope + intercept], 'r:')
-    str_corr = ' y = %.2f x + %.2f\n R2 = %.2f\n n = %d' % (slope, intercept, R2, len(diff_chain_all))
-    plt.text(xmin, 0.9*ymax, str_corr, bbox=dict(facecolor=[0.7,0.7,0.7], alpha=0.5), horizontalalignment='left')
-    plt.xlabel('chainage survey [m]')
-    plt.ylabel('chainage satellite [m]')
-    plt.title('all transects', fontweight='bold')
-
-    fig3.add_subplot(gs3[1,0])
-    binwidth = 3
-    bins = np.arange(min(diff_chain_all), max(diff_chain_all) + binwidth, binwidth)
-    density = plt.hist(diff_chain_all, bins=bins, density=True, color=[0.8, 0.8, 0.8], edgecolor='k')
-    plt.xlim([-50, 50])
-    plt.xlabel('error [m]')
-    plt.ylabel('pdf')
-    str_stats = ' rmse = %.1f\n mean = %.1f\n std = %.1f\n q90 = %.1f' % (rmse, mean, std, q90)
-    plt.text(15, np.max(density[0])-0.015, str_stats, bbox=dict(facecolor=[0.8,0.8,0.8], alpha=0.3), horizontalalignment='left', fontsize=10)
-    fig3.set_size_inches(9.2, 9.28)
-    fig3.set_tight_layout(True)
-
-#    for i in range(len(axfig1)):
-#        axfig1[i].set_ylim([0,150])   # Narrabeen data
-#        axfig1[i].set_ylim([25,110])  # Tairua data
-
-    return stats
\ No newline at end of file
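
For readers tracking what this deletion removes: remove_duplicates resolves two scenes acquired on the same day with a small tie-break. The sketch below is a hypothetical, self-contained restatement of that rule with invented metadata values; resolve_duplicate is not a function from this repository, just an illustration of the comparison the deleted code performs inline.

# Minimal sketch of the tie-break in remove_duplicates (toy values, hypothetical).
# Rule: keep the scene whose cloud cover is lower by more than 0.01; for L5/L8,
# fall back to georeferencing accuracy (lower acc_georef wins by more than 0.1);
# otherwise drop the first of the pair.
def resolve_duplicate(cloud, georef, idx1, idx2, use_georef=True):
    """Return the index to REMOVE, mirroring the deleted module's logic."""
    if cloud[idx1] < cloud[idx2] - 0.01:
        return idx2
    if use_georef and georef[idx1] < georef[idx2] - 0.1:
        return idx2
    return idx1

cloud_cover = [0.12, 0.12]   # effectively tied on cloud cover
acc_georef  = [5.3, 9.8]     # first scene is better georeferenced
print(resolve_duplicate(cloud_cover, acc_georef, 0, 1))  # -> 1 (keep scene 0)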
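
create_transects builds each shore-normal transect by rotating and translating a straight chainage vector with scikit-image's EuclideanTransform. Below is a minimal, self-contained sketch of that geometry for a single transect, assuming scikit-image is installed; the origin, orientation and length values are made up.

import numpy as np
from skimage import transform

# One transect: origin (x0, y0), orientation in degrees clockwise from north,
# chainage_length in metres (all values invented for illustration).
x0, y0, orientation, chainage_length = 342500.0, 6266000.0, 77.0, 100

# straight vector along +x, one point per metre of chainage
x = np.linspace(0, chainage_length, chainage_length + 1)
coords = np.column_stack([x, np.zeros_like(x)])

# rotate to the shore-normal direction and translate to the origin,
# as done in create_transects
phi = (90 - orientation) * np.pi / 180
tf = transform.EuclideanTransform(rotation=phi, translation=(x0, y0))
transect = tf(coords)

print(transect[0])    # starts at the origin (x0, y0)
print(transect[-1])   # far end, chainage_length metres away

The (90 - orientation) conversion maps a compass bearing (degrees clockwise from north) onto the counter-clockwise-from-east angle that the rotation expects.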
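
The core of calculate_chainage is a change of basis: shoreline points within along_dist of a transect are translated to the transect origin and rotated so that their first coordinate becomes cross-shore chainage. A numpy-only sketch with synthetic points (all values invented); the 2-D cross product in the point-to-line distance is written out explicitly, since np.cross on 2-D vectors is deprecated in recent numpy.

import numpy as np

# synthetic transect from p1 to p2 and a cloud of shoreline points (invented)
p1 = np.array([0.0, 0.0])         # transect origin
p2 = np.array([70.7, 70.7])       # seaward end (orientation 45 deg, 100 m long)
rng = np.random.default_rng(0)
sl = np.array([60.0, 55.0]) + rng.normal(0, 3, size=(20, 2))  # shoreline points

along_dist = 25  # keep points within 25 m alongshore of the transect

# point-to-line distance |cross(p2-p1, p3-p1)| / |p2-p1|, expanded for 2-D
v = p2 - p1
w = sl - p1
d = np.abs(v[0]*w[:, 1] - v[1]*w[:, 0]) / np.linalg.norm(v)
close = sl[d <= along_dist]

# rotate the close points into the transect frame; row 0 is then chainage
orientation = 45.0
phi = (90 - orientation) * np.pi / 180
Mrot = np.array([[np.cos(phi),  np.sin(phi)],
                 [-np.sin(phi), np.cos(phi)]])
xy_rot = Mrot @ (close - p1).T
chainage = np.nanmean(xy_rot[0, :])   # mean cross-shore position [m]
print(round(chainage, 1))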