geetools_VH/read_images.py

#==========================================================#
#==========================================================#
# Extract shorelines from Landsat images
#==========================================================#
#==========================================================#


#==========================================================#
# Initial settings
#==========================================================#

import os
import numpy as np
import matplotlib.pyplot as plt
import ee
import pdb

# other modules
from osgeo import gdal, ogr, osr
import pickle
import matplotlib.cm as cm
from pylab import ginput
from shapely.geometry import LineString

# image processing modules
import skimage.filters as filters
import skimage.exposure as exposure
import skimage.transform as transform
import sklearn.decomposition as decomposition
import skimage.measure as measure
import skimage.morphology as morphology

# machine learning modules
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.externals import joblib

# import own modules
import functions.utils as utils
import functions.sds as sds

# some other settings
np.seterr(all='ignore') # raise/ignore divisions by 0 and nans
plt.rcParams['axes.grid'] = True
plt.rcParams['figure.max_open_warning'] = 100
ee.Initialize()

#==========================================================#
# Parameters
#==========================================================#

sitename = 'NARRA'

cloud_thresh = 0.7      # threshold for cloud cover
plot_bool = False       # if you want the plots
output_epsg = 28356     # GDA94 / MGA Zone 56
buffer_size = 7         # radius (in pixels) of disk for buffer (pixel classification)
min_beach_size = 20     # number of pixels in a beach (pixel classification)
dist_ref = 100          # maximum distance from reference point
min_length_wl = 200     # minimum length of shoreline LineString to be kept
manual_bool = True      # to manually check images


output = dict([])

#==========================================================#
# Metadata
#==========================================================#

filepath = os.path.join(os.getcwd(), 'data', sitename)
with open(os.path.join(filepath, sitename + '_metadata' + '.pkl'), 'rb') as f:
    metadata = pickle.load(f)


#%%
#==========================================================#
# Read S2 images
#==========================================================#

satname = 'S2'
dates = metadata[satname]['dates']
input_epsg = 32756 # metadata[satname]['epsg']

# path to images
filepath10 = os.path.join(os.getcwd(), 'data', sitename, satname, '10m')
filenames10 = os.listdir(filepath10)
filepath20 = os.path.join(os.getcwd(), 'data', sitename, satname, '20m')
filenames20 = os.listdir(filepath20)
filepath60 = os.path.join(os.getcwd(), 'data', sitename, satname, '60m')
filenames60 = os.listdir(filepath60)
if (not len(filenames10) == len(filenames20)) or (not len(filenames20) == len(filenames60)):
    raise 'error: not the same amount of files for 10, 20 and 60 m'
N = len(filenames10)

# initialise variables
cloud_cover_ts = []
acc_georef_ts = []
date_acquired_ts = []
filename_ts = []
satname_ts = []
timestamp = []
shorelines = []
idx_skipped = []

spacing = '=========================================================='
msg = ' %s\n %s\n %s' % (spacing, satname, spacing)
print(msg)

for i in range(N):

    # read 10m bands
    fn = os.path.join(filepath10, filenames10[i])
    data = gdal.Open(fn, gdal.GA_ReadOnly)
    georef = np.array(data.GetGeoTransform())
    bands = [data.GetRasterBand(k + 1).ReadAsArray() for k in range(data.RasterCount)]
    im10 = np.stack(bands, 2)
    im10 = im10/10000 # TOA scaled to 10000

    # if image is only zeros, skip it
    if sum(sum(sum(im10))) < 1:
        print('skip ' + str(i) + ' - no data')
        idx_skipped.append(i)
        continue

    nrows = im10.shape[0]
    ncols = im10.shape[1]

    # read 20m band (SWIR1)
    fn = os.path.join(filepath20, filenames20[i])
    data = gdal.Open(fn, gdal.GA_ReadOnly)
    bands = [data.GetRasterBand(k + 1).ReadAsArray() for k in range(data.RasterCount)]
    im20 = np.stack(bands, 2)
    im20 = im20[:,:,0]
    im20 = im20/10000 # TOA scaled to 10000
    im_swir = transform.resize(im20, (nrows, ncols), order=1, preserve_range=True, mode='constant')
    im_swir = np.expand_dims(im_swir, axis=2)

    # append down-sampled swir band to the 10m bands
    im_ms = np.append(im10, im_swir, axis=2)

    # read 60m band (QA)
    fn = os.path.join(filepath60, filenames60[i])
    data = gdal.Open(fn, gdal.GA_ReadOnly)
    bands = [data.GetRasterBand(k + 1).ReadAsArray() for k in range(data.RasterCount)]
    im60 = np.stack(bands, 2)
    im_qa = im60[:,:,0]
    cloud_mask = sds.create_cloud_mask(im_qa, satname, plot_bool)
    cloud_mask = transform.resize(cloud_mask,(nrows, ncols), order=0, preserve_range=True, mode='constant')
    # check if -inf or nan values on any band and add to cloud mask
    for k in range(im_ms.shape[2]):
            im_inf = np.isin(im_ms[:,:,k], -np.inf)
            im_nan = np.isnan(im_ms[:,:,k])
            cloud_mask = np.logical_or(np.logical_or(cloud_mask, im_inf), im_nan)

    # calculate cloud cover and if above threshold, skip it
    cloud_cover = sum(sum(cloud_mask.astype(int)))/(cloud_mask.shape[0]*cloud_mask.shape[1])
    if cloud_cover > cloud_thresh:
        print('skip ' + str(i) + ' - cloudy (' + str(np.round(cloud_cover*100).astype(int)) + '%)')
        idx_skipped.append(i)
        continue

    # rescale image intensity for display purposes
    im_display = sds.rescale_image_intensity(im_ms[:,:,[2,1,0]], cloud_mask, 99.9, False)

    # classify image in 4 classes (sand, whitewater, water, other) with NN classifier
    im_classif, im_labels = sds.classify_image_NN_nopan(im_ms, cloud_mask, min_beach_size, plot_bool)

    # if there aren't any sandy pixels
    if sum(sum(im_labels[:,:,0])) == 0 :
        # use global threshold
        im_ndwi = sds.nd_index(im_ms[:,:,4], im_ms[:,:,1], cloud_mask, plot_bool)
        contours = sds.find_wl_contours(im_ndwi, cloud_mask, plot_bool)
    else:
        # use specific threhsold
        contours_wi, contours_mwi = sds.find_wl_contours2(im_ms, im_labels, cloud_mask, buffer_size, plot_bool)

    # convert from pixels to world coordinates
    wl_coords = sds.convert_pix2world(contours_mwi, georef)
    # convert to output epsg spatial reference
    wl = sds.convert_epsg(wl_coords, input_epsg, output_epsg)

    # remove contour lines that have a perimeter < min_length_wl
    wl_good = []
    for l, wls in enumerate(wl):
        coords = [(wls[k,0], wls[k,1]) for k in range(len(wls))]
        a = LineString(coords) # shapely LineString structure
        if a.length >= min_length_wl:
            wl_good.append(wls)

    # format points and only select the ones close to the refpoints
    x_points = np.array([])
    y_points = np.array([])
    for k in range(len(wl_good)):
        x_points = np.append(x_points,wl_good[k][:,0])
        y_points = np.append(y_points,wl_good[k][:,1])
    wl_good = np.transpose(np.array([x_points,y_points]))
    temp = np.zeros((len(wl_good))).astype(bool)
    for k in range(len(refpoints)):
        temp = np.logical_or(np.linalg.norm(wl_good - refpoints[k,[0,1]], axis=1) < dist_ref, temp)
    wl_final = wl_good[temp]


    # plot output
    plt.figure()
    im = np.copy(im_display)
    colours = np.array([[1,128/255,0/255],[204/255,1,1],[0,0,204/255]])
    for k in range(0,im_labels.shape[2]):
        im[im_labels[:,:,k],0] = colours[k,0]
        im[im_labels[:,:,k],1] = colours[k,1]
        im[im_labels[:,:,k],2] = colours[k,2]
    plt.imshow(im)
    for k,contour in enumerate(contours_mwi): plt.plot(contour[:, 1], contour[:, 0], linewidth=2, color='k', linestyle='--')
    plt.title(satname + '     ' + metadata[satname]['dates'][i].strftime('%Y-%m-%d') + '      acc : ' + str(metadata[satname]['acc_georef'][i]) + ' m' )
    plt.draw()

    pt_in = np.array(ginput(n=1, timeout=1000))
    plt.close()

    # if image is rejected, skip it
    if pt_in[0][1] > nrows/2:
        print('skip ' + str(i) + ' - rejected')
        idx_skipped.append(i)
        continue

    # if accepted, store the data
    cloud_cover_ts.append(cloud_cover)
    acc_georef_ts.append(metadata[satname]['acc_georef'][i])

    filename_ts.append(filenames10[i])
    satname_ts.append(satname)
    date_acquired_ts.append(filenames10[i][:10])

    timestamp.append(metadata[satname]['dates'][i])
    shorelines.append(wl_final)

# store in output structure
output[satname] = {'dates':timestamp, 'shorelines':shorelines, 'idx_skipped':idx_skipped,
      'metadata':{'filenames':filename_ts, 'satname':satname_ts, 'cloud_cover':cloud_cover_ts,
                  'acc_georef':acc_georef_ts}}
del idx_skipped


#%%
#==========================================================#
# Read L7&L8 images
#==========================================================#

satname = 'L8'
dates = metadata[satname]['dates']
input_epsg = 32656 # metadata[satname]['epsg']

# path to images
filepath_pan = os.path.join(os.getcwd(), 'data', sitename, 'L7&L8', 'pan')
filepath_ms = os.path.join(os.getcwd(), 'data', sitename, 'L7&L8', 'ms')
filenames_pan = os.listdir(filepath_pan)
filenames_ms = os.listdir(filepath_ms)
if (not len(filenames_pan) == len(filenames_ms)):
    raise 'error: not the same amount of files for pan and ms'
N = len(filenames_pan)

# initialise variables
cloud_cover_ts = []
acc_georef_ts = []
date_acquired_ts = []
filename_ts = []
satname_ts = []
timestamp = []
shorelines = []
idx_skipped = []


spacing = '=========================================================='
msg = ' %s\n %s\n %s' % (spacing, satname, spacing)
print(msg)

for i in range(N):

    # get satellite name
    sat = filenames_pan[i][20:22]

    # read pan image
    fn_pan = os.path.join(filepath_pan, filenames_pan[i])
    data = gdal.Open(fn_pan, gdal.GA_ReadOnly)
    georef = np.array(data.GetGeoTransform())
    bands = [data.GetRasterBand(k + 1).ReadAsArray() for k in range(data.RasterCount)]
    im_pan = np.stack(bands, 2)[:,:,0]
    nrows = im_pan.shape[0]
    ncols = im_pan.shape[1]

    # read ms image
    fn_ms = os.path.join(filepath_ms, filenames_ms[i])
    data = gdal.Open(fn_ms, gdal.GA_ReadOnly)
    bands = [data.GetRasterBand(k + 1).ReadAsArray() for k in range(data.RasterCount)]
    im_ms = np.stack(bands, 2)

    # cloud mask
    im_qa = im_ms[:,:,5]
    cloud_mask = sds.create_cloud_mask(im_qa, sat, plot_bool)
    cloud_mask = transform.resize(cloud_mask, (nrows, ncols), order=0, preserve_range=True, mode='constant').astype('bool_')
    # resize the image using bilinear interpolation (order 1)
    im_ms = im_ms[:,:,:5]
    im_ms = transform.resize(im_ms,(nrows, ncols), order=1, preserve_range=True, mode='constant')

    # check if -inf or nan values on any band and add to cloud mask
    for k in range(im_ms.shape[2]+1):
        if k == 5:
            im_inf = np.isin(im_pan, -np.inf)
            im_nan = np.isnan(im_pan)
        else:
            im_inf = np.isin(im_ms[:,:,k], -np.inf)
            im_nan = np.isnan(im_ms[:,:,k])
        cloud_mask = np.logical_or(np.logical_or(cloud_mask, im_inf), im_nan)

    # calculate cloud cover and skip image if above threshold
    cloud_cover = sum(sum(cloud_mask.astype(int)))/(cloud_mask.shape[0]*cloud_mask.shape[1])
    if cloud_cover > cloud_thresh:
        print('skip ' + str(i) + ' - cloudy (' + str(np.round(cloud_cover*100).astype(int)) + '%)')
        idx_skipped.append(i)
        continue

    # Pansharpen image (different for L8 and L7)
    if sat == 'L7':
        # pansharpen (Green, Red, NIR) and downsample Blue and SWIR1
        im_ms_ps = sds.pansharpen(im_ms[:,:,[1,2,3]], im_pan, cloud_mask, plot_bool)
        im_ms_ps = np.append(im_ms[:,:,[0]], im_ms_ps, axis=2)
        im_ms_ps = np.append(im_ms_ps, im_ms[:,:,[4]], axis=2)
        im_display = sds.rescale_image_intensity(im_ms[:,:,[2,1,0]], cloud_mask, 99.9, False)
    elif sat == 'L8':
        # pansharpen RGB image and downsample NIR and SWIR1
        im_ms_ps = sds.pansharpen(im_ms[:,:,[0,1,2]], im_pan, cloud_mask, plot_bool)
        im_ms_ps = np.append(im_ms_ps, im_ms[:,:,[3,4]], axis=2)
        im_display = sds.rescale_image_intensity(im_ms_ps[:,:,[2,1,0]], cloud_mask, 99.9, False)

    # classify image in 4 classes (sand, whitewater, water, other) with NN classifier
    im_classif, im_labels = sds.classify_image_NN(im_ms_ps, im_pan, cloud_mask, min_beach_size, plot_bool)

    # if there aren't any sandy pixels
    if sum(sum(im_labels[:,:,0])) == 0 :
        # use global threshold
        im_ndwi = sds.nd_index(im_ms_ps[:,:,4], im_ms_ps[:,:,1], cloud_mask, plot_bool)
        contours = sds.find_wl_contours(im_ndwi, cloud_mask, plot_bool)
    else:
        # use specific threhsold
        contours_wi, contours_mwi = sds.find_wl_contours2(im_ms_ps, im_labels, cloud_mask, buffer_size, plot_bool)

    # convert from pixels to world coordinates
    wl_coords = sds.convert_pix2world(contours_mwi, georef)
    # convert to output epsg spatial reference
    wl = sds.convert_epsg(wl_coords, input_epsg, output_epsg)

    # remove contour lines that have a perimeter < min_length_wl
    wl_good = []
    for l, wls in enumerate(wl):
        coords = [(wls[k,0], wls[k,1]) for k in range(len(wls))]
        a = LineString(coords) # shapely LineString structure
        if a.length >= min_length_wl:
            wl_good.append(wls)

    # format points and only select the ones close to the refpoints
    x_points = np.array([])
    y_points = np.array([])
    for k in range(len(wl_good)):
        x_points = np.append(x_points,wl_good[k][:,0])
        y_points = np.append(y_points,wl_good[k][:,1])
    wl_good = np.transpose(np.array([x_points,y_points]))
    temp = np.zeros((len(wl_good))).astype(bool)
    for k in range(len(refpoints)):
        temp = np.logical_or(np.linalg.norm(wl_good - refpoints[k,[0,1]], axis=1) < dist_ref, temp)
    wl_final = wl_good[temp]

    # plot output
    plt.figure()
    plt.subplot(121)
    im = np.copy(im_display)
    colours = np.array([[1,128/255,0/255],[204/255,1,1],[0,0,204/255]])
    for k in range(0,im_labels.shape[2]):
        im[im_labels[:,:,k],0] = colours[k,0]
        im[im_labels[:,:,k],1] = colours[k,1]
        im[im_labels[:,:,k],2] = colours[k,2]
    plt.imshow(im)
    for k,contour in enumerate(contours_mwi): plt.plot(contour[:, 1], contour[:, 0], linewidth=2, color='k', linestyle='--')
    plt.title(sat + '     ' + metadata[satname]['dates'][i].strftime('%Y-%m-%d') + '      acc : ' + str(metadata[satname]['acc_georef'][i]) + ' m' )

    pt_in = np.array(ginput(n=1, timeout=1000))
    plt.close()

    # if image is rejected, skip it
    if pt_in[0][1] > nrows/2:
        print('skip ' + str(i) + ' - rejected')
        idx_skipped.append(i)
        continue

    # if accepted, store the data
    cloud_cover_ts.append(cloud_cover)
    acc_georef_ts.append(metadata[satname]['acc_georef'][i])

    filename_ts.append(filenames_pan[i])
    satname_ts.append(sat)
    date_acquired_ts.append(filenames_pan[i][:10])

    timestamp.append(metadata[satname]['dates'][i])
    shorelines.append(wl_final)

# store in output structure
output[satname] = {'dates':timestamp, 'shorelines':shorelines, 'idx_skipped':idx_skipped,
      'metadata':{'filenames':filename_ts, 'satname':satname_ts, 'cloud_cover':cloud_cover_ts,
                  'acc_georef':acc_georef_ts}}

del idx_skipped


#%%
#==========================================================#
# Read L5 images
#==========================================================#

satname = 'L5'
dates = metadata[satname]['dates']
input_epsg = 32656 # metadata[satname]['epsg']

# path to images
filepath_img = os.path.join(os.getcwd(), 'data', sitename, satname, '30m')
filenames = os.listdir(filepath_img)
N = len(filenames)

# initialise variables
cloud_cover_ts = []
acc_georef_ts = []
date_acquired_ts = []
filename_ts = []
satname_ts = []
timestamp = []
shorelines = []
idx_skipped = []


spacing = '=========================================================='
msg = ' %s\n %s\n %s' % (spacing, satname, spacing)
print(msg)

for i in range(N):

    # read ms image
    fn = os.path.join(filepath_img, filenames[i])
    data = gdal.Open(fn, gdal.GA_ReadOnly)
    georef = np.array(data.GetGeoTransform())
    bands = [data.GetRasterBand(k + 1).ReadAsArray() for k in range(data.RasterCount)]
    im_ms = np.stack(bands, 2)

    # down-sample to half hte original pixel size
    nrows = im_ms.shape[0]*2
    ncols = im_ms.shape[1]*2

    # cloud mask
    im_qa = im_ms[:,:,5]
    im_ms = im_ms[:,:,:-1]
    cloud_mask = sds.create_cloud_mask(im_qa, satname, plot_bool)
    cloud_mask = transform.resize(cloud_mask, (nrows, ncols), order=0, preserve_range=True, mode='constant').astype('bool_')

    # resize the image using bilinear interpolation (order 1)
    im_ms = transform.resize(im_ms,(nrows, ncols), order=1, preserve_range=True, mode='constant')

    # adjust georef vector (scale becomes 15m and origin is adjusted to the center of new corner pixel)
    georef[1] = 15
    georef[5] = -15
    georef[0] = georef[0] + 7.5
    georef[3] = georef[3] - 7.5

    # check if -inf or nan values on any band and add to cloud mask
    for k in range(im_ms.shape[2]):
        im_inf = np.isin(im_ms[:,:,k], -np.inf)
        im_nan = np.isnan(im_ms[:,:,k])
        cloud_mask = np.logical_or(np.logical_or(cloud_mask, im_inf), im_nan)

    # calculate cloud cover and skip image if above threshold
    cloud_cover = sum(sum(cloud_mask.astype(int)))/(cloud_mask.shape[0]*cloud_mask.shape[1])
    if cloud_cover > cloud_thresh:
        print('skip ' + str(i) + ' - cloudy (' + str(np.round(cloud_cover*100).astype(int)) + '%)')
        idx_skipped.append(i)
        continue

    # rescale image intensity for display purposes
    im_display = sds.rescale_image_intensity(im_ms[:,:,[2,1,0]], cloud_mask, 99.9, False)

    # classify image in 4 classes (sand, whitewater, water, other) with NN classifier
    im_classif, im_labels = sds.classify_image_NN_nopan(im_ms, cloud_mask, min_beach_size, plot_bool)

    # if there aren't any sandy pixels
    if sum(sum(im_labels[:,:,0])) == 0 :
        # use global threshold
        im_ndwi = sds.nd_index(im_ms[:,:,4], im_ms[:,:,1], cloud_mask, plot_bool)
        contours = sds.find_wl_contours(im_ndwi, cloud_mask, plot_bool)
    else:
        # use specific threhsold
        contours_wi, contours_mwi = sds.find_wl_contours2(im_ms, im_labels, cloud_mask, buffer_size, plot_bool)

    # convert from pixels to world coordinates
    wl_coords = sds.convert_pix2world(contours_mwi, georef)
    # convert to output epsg spatial reference
    wl = sds.convert_epsg(wl_coords, input_epsg, output_epsg)

    # remove contour lines that have a perimeter < min_length_wl
    wl_good = []
    for l, wls in enumerate(wl):
        coords = [(wls[k,0], wls[k,1]) for k in range(len(wls))]
        a = LineString(coords) # shapely LineString structure
        if a.length >= min_length_wl:
            wl_good.append(wls)

    # format points and only select the ones close to the refpoints
    x_points = np.array([])
    y_points = np.array([])
    for k in range(len(wl_good)):
        x_points = np.append(x_points,wl_good[k][:,0])
        y_points = np.append(y_points,wl_good[k][:,1])
    wl_good = np.transpose(np.array([x_points,y_points]))
    temp = np.zeros((len(wl_good))).astype(bool)
    for k in range(len(refpoints)):
        temp = np.logical_or(np.linalg.norm(wl_good - refpoints[k,[0,1]], axis=1) < dist_ref, temp)
    wl_final = wl_good[temp]

    # plot output
    plt.figure()
    plt.subplot(121)
    im = np.copy(im_display)
    colours = np.array([[1,128/255,0/255],[204/255,1,1],[0,0,204/255]])
    for k in range(0,im_labels.shape[2]):
        im[im_labels[:,:,k],0] = colours[k,0]
        im[im_labels[:,:,k],1] = colours[k,1]
        im[im_labels[:,:,k],2] = colours[k,2]
    plt.imshow(im)
    for k,contour in enumerate(contours_mwi): plt.plot(contour[:, 1], contour[:, 0], linewidth=2, color='k', linestyle='--')
    plt.title(satname + '     ' + metadata[satname]['dates'][i].strftime('%Y-%m-%d') + '      acc : ' + str(metadata[satname]['acc_georef'][i]) + ' m' )
    plt.subplot(122)
    plt.axis('equal')
    plt.axis('off')
    plt.plot(refpoints[:,0], refpoints[:,1], 'k.')
    plt.plot(wl_final[:,0], wl_final[:,1], 'r.')
    mng = plt.get_current_fig_manager()
    mng.window.showMaximized()
    plt.tight_layout()
    plt.draw()

    pt_in = np.array(ginput(n=1, timeout=1000))
    plt.close()

    # if image is rejected, skip it
    if pt_in[0][1] > nrows/2:
        print('skip ' + str(i) + ' - rejected')
        idx_skipped.append(i)
        continue

    # if accepted, store the data
    cloud_cover_ts.append(cloud_cover)
    acc_georef_ts.append(metadata[satname]['acc_georef'][i])

    filename_ts.append(filenames[i])
    satname_ts.append(satname)
    date_acquired_ts.append(filenames[i][:10])

    timestamp.append(metadata[satname]['dates'][i])
    shorelines.append(wl_final)

# store in output structure
output[satname] = {'dates':timestamp, 'shorelines':shorelines, 'idx_skipped':idx_skipped,
      'metadata':{'filenames':filename_ts, 'satname':satname_ts, 'cloud_cover':cloud_cover_ts,
                  'acc_georef':acc_georef_ts}}

del idx_skipped

#==========================================================#
#==========================================================#
#==========================================================#
#==========================================================#

#%%
# save output
with open(os.path.join(filepath, sitename + '_output' + '.pkl'), 'wb') as f:
    pickle.dump(output, f)

# save idx_skipped
#idx_skipped = dict([])
#for satname in list(output.keys()):
#    idx_skipped[satname] = output[satname]['idx_skipped']
#with open(os.path.join(filepath, sitename + '_idxskipped' + '.pkl'), 'wb') as f:
#    pickle.dump(idx_skipped, f)