Compare commits


No commits in common. '3a1d87eaebb867a9579ad8ced347c59af3a55691' and '708606391f272c8cf2b2832611e0485c49cb121c' have entirely different histories.

@@ -53,6 +53,9 @@ pull-data: ##@data Copies data from data backup directory to ./data/
 # Command for activating our virtual environment and calling the CLI entry point
 PYTHON_CLI = activate ./.venv && python ./src/cli.py
 
+impacts: ./data/interim/impacts_forecasted_foreshore_slope_sto06.csv ./data/interim/impacts_forecasted_premean_slope_sto06.csv ./data/interim/impacts_observed.csv ##@products makes observed and forecasted impacts
+
 ### Parses raw matfiles
 ./data/raw/processed_shorelines/orientations.mat: ./data/raw/processed_shorelines/profiles.mat
@@ -91,6 +94,7 @@ PYTHON_CLI = activate ./.venv && python ./src/cli.py
 ### TWLs
+twls: ./data/interim/twl_foreshore_slope_sto06.csv
 twls: ./data/interim/twl_premean_slope_sto06.csv
 twls: ./data/interim/twl_premean_slope_hol86.csv
 twls: ./data/interim/twl_premean_slope_nie91.csv
@@ -316,6 +320,7 @@ twls: ./data/interim/twl_postintertidal_slope_pow18.csv
 ### IMPACTS
 impacts: ./data/interim/impacts_observed.csv
+impacts: ./data/interim/impacts_forecasted_foreshore_slope_sto06.csv
 impacts: ./data/interim/impacts_forecasted_premean_slope_sto06.csv
 impacts: ./data/interim/impacts_forecasted_premean_slope_hol86.csv
 impacts: ./data/interim/impacts_forecasted_premean_slope_nie91.csv

@@ -485,6 +485,20 @@
     "print('Done!')"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -493,6 +507,13 @@
     "Use sklearn metrics to generate classification reports for each forecasting model."
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -561,355 +582,6 @@
     "    df_for.storm_regime.astype(cat_type).cat.codes.values)\n",
     "    print('{}: {:.2f}'.format(model,m))"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from sklearn.metrics import confusion_matrix\n",
-    "# Check confusion matrix\n",
-    "for model in impacts['forecasted']:\n",
-    "    df_for = impacts['forecasted'][model]\n",
-    "    df_for.storm_regime = df_for.storm_regime.astype(cat_type)\n",
-    "\n",
-    "    m = sklearn.metrics.confusion_matrix(\n",
-    "        df_obs.storm_regime.astype(cat_type).cat.codes.values,\n",
-    "        df_for.storm_regime.astype(cat_type).cat.codes.values,\n",
-    "        labels=[0,1,2,3])\n",
-    "    print('{}\\n{}'.format(model,m))"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Create confusion matrix figure\n",
-    "From https://github.com/wcipriano/pretty-print-confusion-matrix/blob/master/confusion_matrix_pretty_print.py"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# -*- coding: utf-8 -*-\n",
-    "\"\"\"\n",
-    "plot a pretty confusion matrix with seaborn\n",
-    "Created on Mon Jun 25 14:17:37 2018\n",
-    "@author: Wagner Cipriano - wagnerbhbr - gmail - CEFETMG / MMC\n",
-    "REFerences:\n",
-    "  https://www.mathworks.com/help/nnet/ref/plotconfusion.html\n",
-    "  https://stackoverflow.com/questions/28200786/how-to-plot-scikit-learn-classification-report\n",
-    "  https://stackoverflow.com/questions/5821125/how-to-plot-confusion-matrix-with-string-axis-rather-than-integer-in-python\n",
-    "  https://www.programcreek.com/python/example/96197/seaborn.heatmap\n",
-    "  https://stackoverflow.com/questions/19233771/sklearn-plot-confusion-matrix-with-labels/31720054\n",
-    "  http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html#sphx-glr-auto-examples-model-selection-plot-confusion-matrix-py\n",
-    "\"\"\"\n",
-    "\n",
-    "#imports\n",
-    "from pandas import DataFrame\n",
-    "import numpy as np\n",
-    "import matplotlib.pyplot as plt\n",
-    "import matplotlib.font_manager as fm\n",
-    "from matplotlib.collections import QuadMesh\n",
-    "import seaborn as sn\n",
-    "\n",
-    "\n",
-    "def get_new_fig(fn, figsize=[9,9]):\n",
-    "    \"\"\" Init graphics \"\"\"\n",
-    "    fig1 = plt.figure(fn, figsize)\n",
-    "    ax1 = fig1.gca() #Get Current Axis\n",
-    "    ax1.cla() # clear existing plot\n",
-    "    return fig1, ax1\n",
-    "#\n",
-    "\n",
-    "def configcell_text_and_colors(array_df, lin, col, oText, facecolors, posi, fz, fmt, show_null_values=0):\n",
-    "    \"\"\"\n",
-    "    config cell text and colors\n",
-    "    and return text elements to add and to dell\n",
-    "    @TODO: use fmt\n",
-    "    \"\"\"\n",
-    "    text_add = []; text_del = [];\n",
-    "    cell_val = array_df[lin][col]\n",
-    "    tot_all = array_df[-1][-1]\n",
-    "    per = (float(cell_val) / tot_all) * 100\n",
-    "    curr_column = array_df[:,col]\n",
-    "    ccl = len(curr_column)\n",
-    "\n",
-    "    #last line and/or last column\n",
-    "    if(col == (ccl - 1)) or (lin == (ccl - 1)):\n",
-    "        #tots and percents\n",
-    "        if(cell_val != 0):\n",
-    "            if(col == ccl - 1) and (lin == ccl - 1):\n",
-    "                tot_rig = 0\n",
-    "                for i in range(array_df.shape[0] - 1):\n",
-    "                    tot_rig += array_df[i][i]\n",
-    "                per_ok = (float(tot_rig) / cell_val) * 100\n",
-    "            elif(col == ccl - 1):\n",
-    "                tot_rig = array_df[lin][lin]\n",
-    "                per_ok = (float(tot_rig) / cell_val) * 100\n",
-    "            elif(lin == ccl - 1):\n",
-    "                tot_rig = array_df[col][col]\n",
-    "                per_ok = (float(tot_rig) / cell_val) * 100\n",
-    "            per_err = 100 - per_ok\n",
-    "        else:\n",
-    "            per_ok = per_err = 0\n",
-    "\n",
-    "        per_ok_s = ['%.1f%%'%(per_ok), '100%'] [per_ok == 100]\n",
-    "\n",
-    "        #text to DEL\n",
-    "        text_del.append(oText)\n",
-    "\n",
-    "        #text to ADD\n",
-    "        font_prop = fm.FontProperties(weight='bold', size=fz)\n",
-    "        text_kwargs = dict(color='w', ha=\"center\", va=\"center\", gid='sum', fontproperties=font_prop)\n",
-    "        lis_txt = ['%d'%(cell_val), per_ok_s, '%.1f%%'%(per_err)]\n",
-    "        lis_kwa = [text_kwargs]\n",
-    "        dic = text_kwargs.copy(); dic['color'] = 'g'; lis_kwa.append(dic);\n",
-    "        dic = text_kwargs.copy(); dic['color'] = 'r'; lis_kwa.append(dic);\n",
-    "        lis_pos = [(oText._x, oText._y-0.3), (oText._x, oText._y), (oText._x, oText._y+0.3)]\n",
-    "        for i in range(len(lis_txt)):\n",
-    "            newText = dict(x=lis_pos[i][0], y=lis_pos[i][1], text=lis_txt[i], kw=lis_kwa[i])\n",
-    "            #print 'lin: %s, col: %s, newText: %s' %(lin, col, newText)\n",
-    "            text_add.append(newText)\n",
-    "        #print '\\n'\n",
-    "\n",
-    "        #set background color for sum cells (last line and last column)\n",
-    "        carr = [0.27, 0.30, 0.27, 1.0]\n",
-    "        if(col == ccl - 1) and (lin == ccl - 1):\n",
-    "            carr = [0.17, 0.20, 0.17, 1.0]\n",
-    "        facecolors[posi] = carr\n",
-    "\n",
-    "    else:\n",
-    "        if(per > 0):\n",
-    "            txt = '%s\\n%.1f%%' %(cell_val, per)\n",
-    "        else:\n",
-    "            if(show_null_values == 0):\n",
-    "                txt = ''\n",
-    "            elif(show_null_values == 1):\n",
-    "                txt = '0'\n",
-    "            else:\n",
-    "                txt = '0\\n0.0%'\n",
-    "        oText.set_text(txt)\n",
-    "\n",
-    "        #main diagonal\n",
-    "        if(col == lin):\n",
-    "            #set color of the textin the diagonal to white\n",
-    "            oText.set_color('w')\n",
-    "            # set background color in the diagonal to blue\n",
-    "            facecolors[posi] = [0.35, 0.8, 0.55, 1.0]\n",
-    "        else:\n",
-    "            oText.set_color('r')\n",
-    "\n",
-    "    return text_add, text_del\n",
-    "#\n",
-    "\n",
-    "def insert_totals(df_cm):\n",
-    "    \"\"\" insert total column and line (the last ones) \"\"\"\n",
-    "    sum_col = []\n",
-    "    for c in df_cm.columns:\n",
-    "        sum_col.append( df_cm[c].sum() )\n",
-    "    sum_lin = []\n",
-    "    for item_line in df_cm.iterrows():\n",
-    "        sum_lin.append( item_line[1].sum() )\n",
-    "    df_cm['sum_lin'] = sum_lin\n",
-    "    sum_col.append(np.sum(sum_lin))\n",
-    "    df_cm.loc['sum_col'] = sum_col\n",
-    "    #print ('\\ndf_cm:\\n', df_cm, '\\n\\b\\n')\n",
-    "#\n",
-    "\n",
-    "def pretty_plot_confusion_matrix(df_cm, annot=True, cmap=\"Oranges\", fmt='.2f', fz=11,\n",
-    "      lw=0.5, cbar=False, figsize=[8,8], show_null_values=0, pred_val_axis='y'):\n",
-    "    \"\"\"\n",
-    "      print conf matrix with default layout (like matlab)\n",
-    "      params:\n",
-    "        df_cm          dataframe (pandas) without totals\n",
-    "        annot          print text in each cell\n",
-    "        cmap           Oranges,Oranges_r,YlGnBu,Blues,RdBu, ... see:\n",
-    "        fz             fontsize\n",
-    "        lw             linewidth\n",
-    "        pred_val_axis  where to show the prediction values (x or y axis)\n",
-    "                       'col' or 'x': show predicted values in columns (x axis) instead lines\n",
-    "                       'lin' or 'y': show predicted values in lines (y axis)\n",
-    "    \"\"\"\n",
-    "    if(pred_val_axis in ('col', 'x')):\n",
-    "        xlbl = 'Predicted'\n",
-    "        ylbl = 'Actual'\n",
-    "    else:\n",
-    "        xlbl = 'Actual'\n",
-    "        ylbl = 'Predicted'\n",
-    "        df_cm = df_cm.T\n",
-    "\n",
-    "    # create \"Total\" column\n",
-    "    insert_totals(df_cm)\n",
-    "\n",
-    "    #this is for print allways in the same window\n",
-    "    fig, ax1 = get_new_fig('Conf matrix default', figsize)\n",
-    "\n",
-    "    #thanks for seaborn\n",
-    "    ax = sn.heatmap(df_cm, annot=annot, annot_kws={\"size\": fz}, linewidths=lw, ax=ax1,\n",
-    "                    cbar=cbar, cmap=cmap, linecolor='w', fmt=fmt)\n",
-    "\n",
-    "    #set ticklabels rotation\n",
-    "    ax.set_xticklabels(ax.get_xticklabels(), rotation = 45, fontsize = 10)\n",
-    "    ax.set_yticklabels(ax.get_yticklabels(), rotation = 25, fontsize = 10)\n",
-    "\n",
-    "    # Turn off all the ticks\n",
-    "    for t in ax.xaxis.get_major_ticks():\n",
-    "        t.tick1On = False\n",
-    "        t.tick2On = False\n",
-    "    for t in ax.yaxis.get_major_ticks():\n",
-    "        t.tick1On = False\n",
-    "        t.tick2On = False\n",
-    "\n",
-    "    #face colors list\n",
-    "    quadmesh = ax.findobj(QuadMesh)[0]\n",
-    "    facecolors = quadmesh.get_facecolors()\n",
-    "\n",
-    "    #iter in text elements\n",
-    "    array_df = np.array( df_cm.to_records(index=False).tolist() )\n",
-    "    text_add = []; text_del = [];\n",
-    "    posi = -1 #from left to right, bottom to top.\n",
-    "    for t in ax.collections[0].axes.texts: #ax.texts:\n",
-    "        pos = np.array( t.get_position()) - [0.5,0.5]\n",
-    "        lin = int(pos[1]); col = int(pos[0]);\n",
-    "        posi += 1\n",
-    "        #print ('>>> pos: %s, posi: %s, val: %s, txt: %s' %(pos, posi, array_df[lin][col], t.get_text()))\n",
-    "\n",
-    "        #set text\n",
-    "        txt_res = configcell_text_and_colors(array_df, lin, col, t, facecolors, posi, fz, fmt, show_null_values)\n",
-    "\n",
-    "        text_add.extend(txt_res[0])\n",
-    "        text_del.extend(txt_res[1])\n",
-    "\n",
-    "    #remove the old ones\n",
-    "    for item in text_del:\n",
-    "        item.remove()\n",
-    "    #append the new ones\n",
-    "    for item in text_add:\n",
-    "        ax.text(item['x'], item['y'], item['text'], **item['kw'])\n",
-    "\n",
-    "    #titles and legends\n",
-    "    ax.set_title('Confusion matrix')\n",
-    "    ax.set_xlabel(xlbl)\n",
-    "    ax.set_ylabel(ylbl)\n",
-    "    plt.tight_layout() #set layout slim\n",
-    "    plt.show()\n",
-    "    return fig\n",
-    "#\n",
-    "\n",
-    "def plot_confusion_matrix_from_data(y_test, predictions, columns=None, annot=True, cmap=\"Oranges\",\n",
-    "      fmt='.2f', fz=11, lw=0.5, cbar=False, figsize=[8,8], show_null_values=0, pred_val_axis='lin'):\n",
-    "    \"\"\"\n",
-    "    plot confusion matrix function with y_test (actual values) and predictions (predic),\n",
-    "    whitout a confusion matrix yet\n",
-    "    \"\"\"\n",
-    "    from sklearn.metrics import confusion_matrix\n",
-    "    from pandas import DataFrame\n",
-    "\n",
-    "    #data\n",
-    "    if(not columns):\n",
-    "        #labels axis integer:\n",
-    "        ##columns = range(1, len(np.unique(y_test))+1)\n",
-    "        #labels axis string:\n",
-    "        from string import ascii_uppercase\n",
-    "        columns = ['class %s' %(i) for i in list(ascii_uppercase)[0:len(np.unique(y_test))]]\n",
-    "\n",
-    "    confm = confusion_matrix(y_test, predictions)\n",
-    "    cmap = 'Oranges';\n",
-    "    fz = 11;\n",
-    "    figsize=[9,9];\n",
-    "    show_null_values = 2\n",
-    "    df_cm = DataFrame(confm, index=columns, columns=columns)\n",
-    "    pretty_plot_confusion_matrix(df_cm, fz=fz, cmap=cmap, figsize=figsize, show_null_values=show_null_values, pred_val_axis=pred_val_axis)\n",
-    "#\n",
-    "\n",
-    "\n",
-    "\n",
-    "#\n",
-    "#TEST functions\n",
-    "#\n",
-    "def _test_cm():\n",
-    "    #test function with confusion matrix done\n",
-    "    array = np.array( [[13, 0, 1, 0, 2, 0],\n",
-    "                       [ 0, 50, 2, 0, 10, 0],\n",
-    "                       [ 0, 13, 16, 0, 0, 3],\n",
-    "                       [ 0, 0, 0, 13, 1, 0],\n",
-    "                       [ 0, 40, 0, 1, 15, 0],\n",
-    "                       [ 0, 0, 0, 0, 0, 20]])\n",
-    "    #get pandas dataframe\n",
-    "    df_cm = DataFrame(array, index=range(1,7), columns=range(1,7))\n",
-    "    #colormap: see this and choose your more dear\n",
-    "    cmap = 'PuRd'\n",
-    "    pretty_plot_confusion_matrix(df_cm, cmap=cmap)\n",
-    "#\n",
-    "\n",
-    "def _test_data_class():\n",
-    "    \"\"\" test function with y_test (actual values) and predictions (predic) \"\"\"\n",
-    "    #data\n",
-    "    y_test = np.array([1,2,3,4,5, 1,2,3,4,5, 1,2,3,4,5, 1,2,3,4,5, 1,2,3,4,5, 1,2,3,4,5, 1,2,3,4,5, 1,2,3,4,5, 1,2,3,4,5, 1,2,3,4,5, 1,2,3,4,5, 1,2,3,4,5, 1,2,3,4,5, 1,2,3,4,5, 1,2,3,4,5, 1,2,3,4,5, 1,2,3,4,5, 1,2,3,4,5, 1,2,3,4,5, 1,2,3,4,5, 1,2,3,4,5, 1,2,3,4,5])\n",
-    "    predic = np.array([1,2,4,3,5, 1,2,4,3,5, 1,2,3,4,4, 1,4,3,4,5, 1,2,4,4,5, 1,2,4,4,5, 1,2,4,4,5, 1,2,4,4,5, 1,2,3,3,5, 1,2,3,3,5, 1,2,3,4,4, 1,2,3,4,1, 1,2,3,4,1, 1,2,3,4,1, 1,2,4,4,5, 1,2,4,4,5, 1,2,4,4,5, 1,2,4,4,5, 1,2,3,4,5, 1,2,3,4,5, 1,2,3,4,5, 1,2,3,4,5])\n",
-    "    \"\"\"\n",
-    "    Examples to validate output (confusion matrix plot)\n",
-    "        actual: 5 and prediction 1 >> 3\n",
-    "        actual: 2 and prediction 4 >> 1\n",
-    "        actual: 3 and prediction 4 >> 10\n",
-    "    \"\"\"\n",
-    "    columns = []\n",
-    "    annot = True;\n",
-    "    cmap = 'Oranges';\n",
-    "    fmt = '.2f'\n",
-    "    lw = 0.5\n",
-    "    cbar = False\n",
-    "    show_null_values = 2\n",
-    "    pred_val_axis = 'y'\n",
-    "    #size::\n",
-    "    fz = 12;\n",
-    "    figsize = [9,9];\n",
-    "    if(len(y_test) > 10):\n",
-    "        fz=9; figsize=[14,14];\n",
-    "    plot_confusion_matrix_from_data(y_test, predic, columns,\n",
-    "      annot, cmap, fmt, fz, lw, cbar, figsize, show_null_values, pred_val_axis)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# plot_confusion_matrix_from_data(y_test, predictions, columns=None, annot=True, cmap=\"Oranges\",\n",
-    "#      fmt='.2f', fz=11, lw=0.5, cbar=False, figsize=[8,8], show_null_values=0, pred_val_axis='lin'):\n",
-    "\n",
-    "matplotlib.rcParams['text.usetex'] = False\n",
-    "\n",
-    "forecast_model = 'postintertidal_slope_sto06'\n",
-    "\n",
-    "df_for = impacts['forecasted'][forecast_model]\n",
-    "df_for.storm_regime = df_for.storm_regime.astype(cat_type)\n",
-    "observed_regimes = df_obs.storm_regime.astype(cat_type).cat.codes.values\n",
-    "forecasted_regimes = df_for.storm_regime.astype(cat_type).cat.codes.values\n",
-    "\n",
-    "\n",
-    "confm = confusion_matrix(observed_regimes, forecasted_regimes,labels=[0,1,2,3])\n",
-    "labels=['swash','collision','overwash','inundation']\n",
-    "df_cm = DataFrame(confm, index=labels, columns=labels)\n",
-    "\n",
-    "fig = pretty_plot_confusion_matrix(df_cm, annot=True, cmap=\"Oranges\", fmt='.1f', fz=13,\n",
-    "      lw=0.1, cbar=False, figsize=[8,5], show_null_values=1, pred_val_axis='y')\n",
-    "\n",
-    "fig.savefig('11_confusion_matrix',dpi=600)"
-   ]
   }
  ],
  "metadata": {
@@ -940,14 +612,9 @@
    "title_cell": "Table of Contents",
    "title_sidebar": "Contents",
    "toc_cell": false,
-   "toc_position": {
-    "height": "calc(100% - 180px)",
-    "left": "10px",
-    "top": "150px",
-    "width": "286.391px"
-   },
+   "toc_position": {},
    "toc_section_display": true,
-   "toc_window_display": true
+   "toc_window_display": false
  },
  "varInspector": {
   "cols": {

@@ -365,244 +365,12 @@
     "f.savefig('11_change_in_slopes_accuracy_classes.png',dpi=600)"
    ]
   },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Swash zone volume change histogram"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "How much beach width change variation can we expect in the swash regime?"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "f, ax = plt.subplots(figsize=(5,4))\n",
-    "\n",
-    "sns.distplot(df.loc[df.storm_regime=='swash'].width_msl_change_pct.dropna(), \n",
-    "             kde=False);\n",
-    "\n",
-    "ax.set_title('Distribution of beach width change for swash regime')\n",
-    "ax.set_xlabel('$\\Delta$ beach width (%)')\n",
-    "ax.set_ylabel('Count')\n",
-    "\n",
-    "f.savefig('11_change_in_beach_width.png',dpi=600)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Split swash regime"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df.columns"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df['beta_Pxscum'] = df.beta_prestorm_mean * df.Ecum\n",
-    "df['neg_swash_vol_change'] = -df.swash_vol_change\n",
-    "\n",
-    "ax = sns.scatterplot(data=df.loc[df.storm_regime=='swash'],\n",
-    "                     x=\"beta_Pxscum\", \n",
-    "                     y='neg_swash_vol_change')"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": [
-    "corr = df.corr(method ='pearson') \n",
-    "sns.heatmap(corr, \n",
-    "            xticklabels=corr.columns,\n",
-    "            yticklabels=corr.columns,\n",
-    "            cmap='RdBu_r')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "len(corr.columns)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import scipy.stats as ss\n",
-    "#https://stackoverflow.com/a/24469099\n",
-    "\n",
-    "corr = df.corr(method ='pearson') \n",
-    "\n",
-    "n=len(corr.columns)\n",
-    "t=corr*np.sqrt((n-2)/(1-corr*corr))\n",
-    "\n",
-    "\n",
-    "pvals = ss.t.cdf(t, n-2)\n",
-    "\n",
-    "corr.values"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "code_folding": []
-   },
-   "outputs": [],
-   "source": [
-    "from matplotlib.patches import Ellipse\n",
-    "def corrplot(data, pvalues, labels):\n",
-    "    \"\"\"Creates a correlation plot of the passed data.\n",
-    "    The function returns the plot which can then be shown with\n",
-    "    plot.show(), saved to a file with plot.savefig(), or manipulated\n",
-    "    in any other standard matplotlib way.\n",
-    "    data is the correlation matrix, a 2-D numpy array containing\n",
-    "    the pairwise correlations between variables;\n",
-    "    pvalues is a matrix containing the pvalue for each corresponding\n",
-    "    correlation value; if none it is assumed to be the zero matrix\n",
-    "    labels is an array containing the variable names\n",
-    "    https://github.com/louridas/corrplot/blob/master/corrplot.py\n",
-    "    \"\"\"\n",
-    "\n",
-    "    plt.figure(1)\n",
-    "\n",
-    "    column_labels = labels\n",
-    "    row_labels = labels\n",
-    "    \n",
-    "    f = plt.figure(figsize=(8,8))\n",
-    "    ax = plt.subplot(1, 1, 1, aspect='equal')\n",
-    "\n",
-    "    width, height = data.shape\n",
-    "    num_cols, num_rows = width, height\n",
-    "\n",
-    "    if pvalues is None:\n",
-    "        pvalues = np.zeros([num_rows, num_cols])\n",
-    "    \n",
-    "    shrink = 0.9\n",
-    "\n",
-    "    poscm = cm.get_cmap('Blues')\n",
-    "    negcm = cm.get_cmap('Oranges')\n",
-    "\n",
-    "    for x in range(width):\n",
-    "        for y in range(height):\n",
-    "            d = data[x, y]\n",
-    "            c = pvalues[x, y]\n",
-    "            rotate = -45 if d > 0 else +45\n",
-    "            clrmap = poscm if d >= 0 else negcm\n",
-    "            d_abs = np.abs(d)\n",
-    "            ellipse = Ellipse((x, y),\n",
-    "                              width=1 * shrink,\n",
-    "                              height=(shrink - d_abs*shrink),\n",
-    "                              angle=rotate)\n",
-    "            ellipse.set_edgecolor('black')\n",
-    "            ellipse.set_facecolor(clrmap(d_abs))\n",
-    "            if c > 0.05:\n",
-    "                ellipse.set_linestyle('dotted')\n",
-    "                ellipse.set_alpha(0.5)\n",
-    "            ax.add_artist(ellipse)\n",
-    "\n",
-    "    ax.set_xlim(-1, num_cols)\n",
-    "    ax.set_ylim(-1, num_rows)\n",
-    "    \n",
-    "    ax.xaxis.tick_top()\n",
-    "    xtickslocs = np.arange(len(row_labels))\n",
-    "    ax.set_xticks(xtickslocs)\n",
-    "    ax.set_xticklabels(row_labels, rotation=30, fontsize='small', ha='left')\n",
-    "\n",
-    "    ax.invert_yaxis()\n",
-    "    ytickslocs = np.arange(len(row_labels))\n",
-    "    ax.set_yticks(ytickslocs)\n",
-    "    ax.set_yticklabels(column_labels, fontsize='small')\n",
-    "\n",
-    "    return plt\n",
-    "\n",
-    "import string\n",
-    "num_rows = 20\n",
-    "num_cols = num_rows\n",
-    "\n",
-    "min_length = 10\n",
-    "max_length = 20\n",
-    "\n",
-    "alnums = list(string.ascii_uppercase + string.digits)\n",
-    "labels = [''.join(np.random.choice(alnums,\n",
-    "                                   np.random.randint(min_length,\n",
-    "                                                     max_length)))\n",
-    "          for y in np.arange(num_rows)]\n",
-    "\n",
-    "\n",
-    "data = np.random.random([num_rows, num_cols])\n",
-    "\n",
-    "# data[np.random.choice(num_rows, num_rows / 2), :] *= -1\n",
-    "\n",
-    "np.fill_diagonal(data, 1)\n",
-    "\n",
-    "data_symm = (data + data.T) / 2\n",
-    "\n",
-    "# plot = corrplot(data_symm, None, labels)\n",
-    "plot = corrplot(corr.values, pvals, corr.columns.tolist())\n",
-    "plot.show()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from scipy import stats\n",
-    "\n",
-    "x_col = 'beta_prestorm_intertidal'\n",
-    "y_col = \"beta_diff_intertidal\"\n",
-    "# y_col = 'swash_vol_change'\n",
-    "# x_col = \"Pxscum\"\n",
-    "data = df.loc[df.storm_regime=='swash']\n",
-    "\n",
-    "slope, intercept, r_value, p_value, std_err = stats.linregress(data[x_col],data[y_col])\n",
-    "\n",
-    "ax = sns.regplot(data=data,\n",
-    "                 y=y_col, \n",
-    "                 x=x_col, marker=\"+\",fit_reg=True, scatter_kws={'linewidth':1},\n",
-    "                 line_kws={'label':\"y={0:.2f}x+{1:.2f}\\n(r2={2:.2f})\".format(slope,intercept,r_value**2)})\n",
-    "ax.legend()\n",
-    "\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# df.loc[(df.storm_regime=='swash')&(df.swash_vol_change<10)].sort_values(by=['Pxscum'],ascending=False)\n",
-    "df.loc[(df.swash_vol_change>200)].sort_values(by=['Pxscum'],ascending=False)"
-   ]
+   "source": []
   }
  ],
 "metadata": {

@@ -316,14 +316,7 @@ def slope_from_profile(
     """
     # Need all data to get the slope
-    # Check validity of profile arrays and return None if data is not good
-    for profile in [profile_x, profile_z]:
-        if all(np.isnan(profile)) or all(x is None for x in profile):
-            return None
-
-    # Check validity of elevation values
-    for ele in [top_elevation, btm_elevation]:
-        if np.isnan(ele) or ele is None:
-            return None
+    if any([x is None for x in [profile_x, profile_z, top_elevation, btm_elevation]]):
+        return None
 
     end_points = {"top": {"z": top_elevation}, "btm": {"z": btm_elevation}}
@@ -384,11 +377,6 @@ def slope_from_profile(
         True if end_points["top"]["x"] < pts < end_points["btm"]["x"] else False
         for pts in profile_x
     ]
-
-    # Need at least two points to do linear regression
-    if sum(profile_mask) < 2:
-        return None
-
     slope_x = np.array(profile_x)[profile_mask].tolist()
     slope_z = np.array(profile_z)[profile_mask].tolist()
     slope, _, _, _, _ = stats.linregress(slope_x, slope_z)
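Condensed, the simplified function now reads roughly as below. The name and the mask here are simplified stand-ins for slope_from_profile, not the repository's exact code; note that stats.linregress fails if fewer than two points survive the mask, which is what the deleted two-point guard used to prevent.

import numpy as np
from scipy import stats

def slope_between(profile_x, profile_z, top_x, btm_x):
    # Single consolidated validity check, as in the new version
    if any(v is None for v in [profile_x, profile_z, top_x, btm_x]):
        return None
    mask = [top_x < x < btm_x for x in profile_x]
    slope_x = np.array(profile_x)[mask]
    slope_z = np.array(profile_z)[mask]
    # Fails if the mask keeps fewer than two points (the removed guard returned None instead)
    slope, _, _, _, _ = stats.linregress(slope_x, slope_z)
    return slope

print(slope_between([0, 5, 10, 15], [4.0, 3.0, 2.0, 1.5], 0, 15))  # -0.2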

@@ -2,11 +2,7 @@ import click
 import numpy as np
 import pandas as pd
 from scipy.integrate import simps
-from scipy.signal import savgol_filter
-from scipy.interpolate import interp1d
-from numpy import ma as ma
-from tqdm import tqdm
 from logs import setup_logging
 from utils import crossings, get_i_or_default
 from analysis.forecast_twl import get_mean_slope, get_intertidal_slope
@@ -26,101 +22,149 @@ def return_first_or_nan(l):
     return l[0]
 
 
-def round_up_to_odd(f):
-    """
-    https://stackoverflow.com/a/31648815
-    :param f:
-    :return:
-    """
-    return int(np.ceil(f) // 2 * 2 + 1)
-
-
-def volume_change(df_profiles, df_profile_features):
+def volume_change(df_profiles, df_profile_features, zone):
     """
     Calculates how much volume change there is between prestorm and poststorm profiles.
     :param df_profiles:
     :param df_profile_features:
+    :param zone: Either 'swash' or 'dune_face'
     :return:
     """
-    logger.info("Calculating change in beach volume")
+    logger.info("Calculating change in beach volume in {} zone".format(zone))
     df_vol_changes = pd.DataFrame(
         index=df_profile_features.index.get_level_values("site_id").unique()
     )
-    df_profiles = df_profiles.dropna(subset=["z"])
     df_profiles = df_profiles.sort_index()
     sites = df_profiles.groupby(level=["site_id"])
 
-    for site_id, df_site in tqdm(sites):
-
-        params = {}
-
-        for zone in ["dune", "swash"]:
-            params[zone] = {}
-
-            for profile_type in ["prestorm", "poststorm"]:
-                params[zone][profile_type] = {}
-
-                # Define the edges of the swash and dunes where we want to calculate subaerial volume.
-                if zone == "swash":
-                    params[zone][profile_type]["x_min"] = df_profile_features.loc[
-                        (site_id, profile_type)
-                    ].dune_toe_x
-                    params[zone][profile_type]["x_max"] = max(
-                        df_profiles.loc[(site_id, profile_type)].index.get_level_values(
-                            "x"
-                        )
-                    )
-
-                    # For profiles with no Dlow value, we take Dhigh as the minimum value to calculate swash
-                    if np.isnan(params[zone][profile_type]["x_min"]):
-                        params[zone][profile_type]["x_min"] = df_profile_features.loc[
-                            (site_id, profile_type)
-                        ].dune_crest_x
-
-                elif zone == "dune":
-                    params[zone][profile_type]["x_min"] = df_profile_features.loc[
-                        (site_id, profile_type)
-                    ].dune_crest_x
-                    params[zone][profile_type]["x_max"] = df_profile_features.loc[
-                        (site_id, profile_type)
-                    ].dune_toe_x
-
-                    # For profiles with no Dlow value, the dune is undefined and we cannot calculate a dune volume.
-
-                # Calculate subaerial volume based on our x min and maxes
-                params[zone][profile_type]["subaerial_vol"] = beach_volume(
-                    x=df_profiles.loc[(site_id, profile_type)].index.get_level_values(
-                        "x"
-                    ),
-                    z=df_profiles.loc[(site_id, profile_type)].z.values,
-                    x_min=params[zone][profile_type]["x_min"],
-                    x_max=params[zone][profile_type]["x_max"],
-                )
-
-            # TODO Landward limit still needs to be incorporated
-
-            params[zone]["vol_change"] = (
-                params[zone]["poststorm"]["subaerial_vol"]
-                - params[zone]["prestorm"]["subaerial_vol"]
-            )
-            params[zone]["pct_change"] = (
-                params[zone]["vol_change"] / params[zone]["prestorm"]["subaerial_vol"]
-            )
-
-            df_vol_changes.loc[site_id, "prestorm_{}_vol".format(zone)] = params[zone][
-                "prestorm"
-            ]["subaerial_vol"]
-            df_vol_changes.loc[site_id, "poststorm_{}_vol".format(zone)] = params[zone][
-                "poststorm"
-            ]["subaerial_vol"]
-            df_vol_changes.loc[site_id, "{}_vol_change".format(zone)] = params[zone][
-                "vol_change"
-            ]
-            df_vol_changes.loc[site_id, "{}_pct_change".format(zone)] = params[zone][
-                "pct_change"
-            ]
+    for site_id, df_site in sites:
+        logger.debug(
+            "Calculating change in beach volume at {} in {} zone".format(site_id, zone)
+        )
+
+        prestorm_row = df_profile_features.loc[(site_id, "prestorm")]
+        prestorm_dune_toe_x = prestorm_row.dune_toe_x
+        prestorm_dune_crest_x = prestorm_row.dune_crest_x
+
+        # If no dune toe has been defined, Dlow = Dhigh. Refer to Sallenger (2000).
+        if np.isnan(prestorm_dune_toe_x):
+            prestorm_dune_toe_x = prestorm_dune_crest_x
+
+        # If no prestorm and poststorm profiles, skip site and continue
+        profile_lengths = [
+            len(df_site.xs(x, level="profile_type")) for x in ["prestorm", "poststorm"]
+        ]
+        if any([length == 0 for length in profile_lengths]):
+            continue
+
+        # Find the last x coordinate where we have both prestorm and poststorm measurements. If we don't do this,
+        # the prestorm and poststorm values are going to be calculated over different lengths.
+        df_zone = df_site.dropna(subset=["z"])
+        x_last_obs = min(
+            [
+                max(
+                    df_zone.query(
+                        "profile_type == '{}'".format(profile_type)
+                    ).index.get_level_values("x")
+                )
+                for profile_type in ["prestorm", "poststorm"]
+            ]
+        )
+        x_first_obs = max(
+            [
+                min(
+                    df_zone.query(
+                        "profile_type == '{}'".format(profile_type)
+                    ).index.get_level_values("x")
+                )
+                for profile_type in ["prestorm", "poststorm"]
+            ]
+        )
+
+        # Where we want to measure pre and post storm volume is dependent on the zone selected
+        if zone == "swash":
+            x_min = max(prestorm_dune_toe_x, x_first_obs)
+            x_max = x_last_obs
+        elif zone == "dune_face":
+            x_min = max(prestorm_dune_crest_x, x_first_obs)
+            x_max = min(prestorm_dune_toe_x, x_last_obs)
+        else:
+            logger.warning("Zone argument not properly specified. Please check")
+            x_min = None
+            x_max = None
+
+        # Now, compute the volume of sand between the x-coordinates x_min and x_max for both prestorm
+        # and poststorm profiles.
+        prestorm_vol = beach_volume(
+            x=df_zone.query("profile_type=='prestorm'").index.get_level_values("x"),
+            z=df_zone.query("profile_type=='prestorm'").z,
+            x_min=x_min,
+            x_max=x_max,
+        )
+        poststorm_vol = beach_volume(
+            x=df_zone.query("profile_type=='poststorm'").index.get_level_values("x"),
+            z=df_zone.query("profile_type=='poststorm'").z,
+            x_min=x_min,
+            x_max=x_max,
+        )
+
+        # Identify the x location where our pre and post storm profiles start to differ. This is so changes not
+        # due to the storm are not included when calculating volume.
+        df_prestorm = (
+            df_site.xs("prestorm", level="profile_type").z.rename("z_pre").to_frame()
+        )
+        df_poststorm = (
+            df_site.xs("poststorm", level="profile_type").z.rename("z_post").to_frame()
+        )
+        df_diff = df_prestorm.merge(df_poststorm, on=["site_id", "x"])
+        df_diff["z_diff"] = df_diff.z_pre - df_diff.z_post
+
+        # Find all locations where the difference in pre and post storm is zero. Take the most seaward location
+        # as the x location where our profiles are the same.
+        x_crossings = crossings(df_diff.index.get_level_values("x"), df_diff.z_diff, 0)
+        if len(x_crossings) != 0:
+            x_change_point = x_crossings[-1]
+        else:
+            x_change_point = np.nan
+
+        # # For debugging
+        # import matplotlib.pyplot as plt
+        # f,(ax1,ax2) = plt.subplots(2,1,sharex=True)
+        # ax1.plot(df_prestorm.index.get_level_values('x'), df_prestorm.z_pre,label='prestorm')
+        # ax1.plot(df_poststorm.index.get_level_values('x'), df_poststorm.z_post,label='poststorm')
+        # ax1.axvline(x_crossings[-1], color='red', linestyle='--', linewidth=0.5, label='Change point')
+        # ax1.legend()
+        # ax1.set_title(site_id)
+        # ax1.set_ylabel('elevation (m AHD)')
+        # ax2.plot(df_diff.index.get_level_values('x'), df_diff.z_diff)
+        # ax2.set_xlabel('x coordinate (m)')
+        # ax2.set_ylabel('elevation diff (m)')
+        # ax2.axvline(x_crossings[-1],color='red',linestyle='--',linewidth=0.5)
+        # plt.show()
+
+        diff_vol = beach_volume(
+            x=df_diff.index.get_level_values("x"),
+            z=df_diff.z_diff,
+            x_min=np.nanmax([x_min, x_change_point]),
+            x_max=np.nanmax([x_max, x_change_point]),
+        )
+
+        # Here, if we cannot calculate the difference volume, assume no volume change
+        if np.isnan(diff_vol):
+            diff_vol = 0
+
+        # Base pct change on diff volume
+        if diff_vol == 0:
+            pct_change = 0
+        else:
+            pct_change = diff_vol / prestorm_vol * 100
+
+        df_vol_changes.loc[site_id, "prestorm_{}_vol".format(zone)] = prestorm_vol
+        df_vol_changes.loc[site_id, "poststorm_{}_vol".format(zone)] = poststorm_vol
+        df_vol_changes.loc[site_id, "{}_vol_change".format(zone)] = diff_vol
+        df_vol_changes.loc[site_id, "{}_pct_change".format(zone)] = pct_change
 
     return df_vol_changes
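beach_volume and crossings are called throughout this hunk but defined elsewhere in the repository (crossings is imported from utils). These are plausible minimal versions consistent with the call sites, for orientation only, not the repository's actual implementations. Note that if x_min or x_max is nan, the comparisons below leave the mask empty and beach_volume returns nan, which matches the "assume no volume change" fallback above.

import numpy as np
from scipy.integrate import simps

def beach_volume(x, z, x_min, x_max):
    # Integrate elevation between x_min and x_max: volume per metre of beach
    x = np.asarray(x, dtype=float)
    z = np.asarray(z, dtype=float)
    mask = (x >= x_min) & (x <= x_max)
    if mask.sum() < 2:
        return np.nan
    return simps(z[mask], x[mask])

def crossings(x, z, z0):
    # x locations where the series z crosses the level z0, linearly interpolated
    x = np.asarray(x, dtype=float)
    z = np.asarray(z, dtype=float) - z0
    idx = np.where(np.diff(np.sign(z)) != 0)[0]
    return [x[i] - z[i] * (x[i + 1] - x[i]) / (z[i + 1] - z[i]) for i in idx]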
@@ -153,11 +197,11 @@ def storm_regime(df_observed_impacts):
     """
     logger.info("Getting observed storm regimes")
 
-    swash = (df_observed_impacts.dune_pct_change <= 2) & (
-        df_observed_impacts.dune_vol_change <= 3
+    swash = (df_observed_impacts.dune_face_pct_change <= 2) & (
+        df_observed_impacts.dune_face_vol_change <= 3
     )
-    collision = (df_observed_impacts.dune_pct_change >= 2) | (
-        df_observed_impacts.dune_vol_change > 3
+    collision = (df_observed_impacts.dune_face_pct_change >= 2) | (
+        df_observed_impacts.dune_face_vol_change > 3
     )
 
     df_observed_impacts.loc[swash, "storm_regime"] = "swash"
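The renamed columns make the thresholds read directly: at most a 2% dune-face change and at most 3 m^3/m of volume change counts as swash, anything larger as collision. A toy illustration with made-up numbers (since both conditions compare against 2, a site exactly on the boundary matches both masks and ends up labelled collision, because that label is assigned last):

import pandas as pd

df = pd.DataFrame(
    {
        "dune_face_pct_change": [0.5, 8.0],
        "dune_face_vol_change": [1.0, 12.0],
    },
    index=["site_a", "site_b"],
)
swash = (df.dune_face_pct_change <= 2) & (df.dune_face_vol_change <= 3)
collision = (df.dune_face_pct_change >= 2) | (df.dune_face_vol_change > 3)
df.loc[swash, "storm_regime"] = "swash"
df.loc[collision, "storm_regime"] = "collision"
print(df)  # site_a -> swash, site_b -> collision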
@@ -214,9 +258,13 @@ def create_observed_impacts(
     # TODO Review volume change with changing dune toe/crests
     logger.info("Getting pre/post storm volumes")
-    df_vol_changes = volume_change(df_profiles, df_profile_features)
-    df_observed_impacts = df_observed_impacts.join(df_vol_changes)
+    df_swash_vol_changes = volume_change(df_profiles, df_profile_features, zone="swash")
+    df_dune_face_vol_changes = volume_change(
+        df_profiles, df_profile_features, zone="dune_face"
+    )
+    df_observed_impacts = df_observed_impacts.join(
+        [df_swash_vol_changes, df_dune_face_vol_changes]
+    )
 
     # Classify regime based on volume changes
     df_observed_impacts = storm_regime(df_observed_impacts)
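DataFrame.join accepts a list of frames when their indices align, which is why both zone results can be attached in one call; a tiny stand-in example:

import pandas as pd

base = pd.DataFrame(index=["site_a", "site_b"])
swash = pd.DataFrame({"swash_vol_change": [1.2, 3.4]}, index=base.index)
dune = pd.DataFrame({"dune_face_vol_change": [0.0, 5.6]}, index=base.index)
print(base.join([swash, dune]))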
@@ -295,12 +343,8 @@ def create_observed_impacts(
         ele=0,
         col_name="width_msl_poststorm",
     )
-    df_width_msl_change_m = (df_width_msl_poststorm - df_width_msl_prestorm).rename(
-        "width_msl_change_m"
-    )
-    df_width_msl_change_pct = (
-        df_width_msl_change_m / df_width_msl_prestorm * 100
-    ).rename("width_msl_change_pct")
+    df_width_msl_change_m = (df_width_msl_poststorm - df_width_msl_prestorm).rename('width_msl_change_m')
+    df_width_msl_change_pct = (df_width_msl_change_m / df_width_msl_prestorm * 100).rename('width_msl_change_pct')
 
     # Join beach width change onto observed impacts dataframe
     df_observed_impacts = pd.concat(
