nsw-2016-storm-impact/notebooks/11_investigate.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Investigate "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Setup notebook\n",
    "Import our required packages and set default plotting options."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Enable autoreloading of our modules.\n",
    "# Most of the code will be located in the /src/ folder,\n",
    "# and then called from the notebook.\n",
    "%matplotlib inline\n",
    "%reload_ext autoreload\n",
    "# Mode 2 reloads all modules before every cell execution; a bare\n",
    "# `%autoreload` only reloads them once, at the moment this cell runs.\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from IPython.core.debugger import set_trace\n",
    "\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import os\n",
    "import decimal\n",
    "import plotly\n",
    "import plotly.graph_objs as go\n",
    "import plotly.plotly as py\n",
    "import plotly.tools as tls\n",
    "import plotly.figure_factory as ff\n",
    "from plotly import tools\n",
    "import plotly.io as pio\n",
    "from scipy import stats\n",
    "import math\n",
    "import matplotlib\n",
    "from matplotlib import cm\n",
    "import colorlover as cl\n",
    "from tqdm import tqdm_notebook\n",
    "from ipywidgets import widgets, Output\n",
    "from IPython.display import display, clear_output, Image, HTML\n",
    "from scipy import stats\n",
    "from sklearn.metrics import confusion_matrix\n",
    "import matplotlib.pyplot as plt\n",
    "from matplotlib.ticker import MultipleLocator\n",
    "from matplotlib.lines import Line2D\n",
    "from cycler import cycler\n",
    "from scipy.interpolate import interp1d\n",
    "from pandas.api.types import CategoricalDtype\n",
    "import seaborn as sns\n",
    "sns.set(style=\"white\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Matplotlib default settings\n",
    "plt.rcParams[\"figure.figsize\"] = (10, 6)\n",
    "plt.rcParams['axes.grid'] = True\n",
    "plt.rcParams['grid.alpha'] = 0.5\n",
    "plt.rcParams['grid.color'] = \"grey\"\n",
    "plt.rcParams['grid.linestyle'] = \"--\"\n",
    "\n",
    "# https://stackoverflow.com/a/20709149\n",
    "# matplotlib.rcParams['text.usetex'] = True\n",
    "\n",
    "# matplotlib >= 3.3 expects this rcParam to be a single string (lists are\n",
    "# deprecated and later rejected), so the preamble lines are joined with newlines.\n",
    "matplotlib.rcParams['text.latex.preamble'] = '\\n'.join([\n",
    "    r'\\usepackage{siunitx}',   # i need upright \\micro symbols, but you need...\n",
    "    r'\\sisetup{detect-all}',   # ...this to force siunitx to actually use your fonts\n",
    "    r'\\usepackage{helvet}',    # set the normal font here\n",
    "    r'\\usepackage{amsmath}',\n",
    "    r'\\usepackage{sansmath}',  # load up the sansmath so that math -> helvet\n",
    "    r'\\sansmath',              # <- tricky! -- gotta actually tell tex to use!\n",
    "])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Import data\n",
    "Import our data from the `./data/interim/` folder and load it into pandas dataframes. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def df_from_csv(csv, index_col, data_folder='../data/interim'):\n",
    "    \"\"\"Read `csv` from `data_folder` into a DataFrame, using `index_col` as its index.\n",
    "\n",
    "    Prints the file name before reading so progress is visible in the cell output.\n",
    "    \"\"\"\n",
    "    print(f'Importing {csv}')\n",
    "    csv_path = os.path.join(data_folder, csv)\n",
    "    return pd.read_csv(csv_path, index_col=index_col)\n",
    "\n",
    "df_waves = df_from_csv('waves.csv', index_col=[0, 1])\n",
    "df_tides = df_from_csv('tides.csv', index_col=[0, 1])\n",
    "df_profiles = df_from_csv('profiles.csv', index_col=[0, 1, 2])\n",
    "df_sites = df_from_csv('sites.csv', index_col=[0])\n",
    "df_sites_waves = df_from_csv('sites_waves.csv', index_col=[0])\n",
    "df_profile_features_crest_toes = df_from_csv('profile_features_crest_toes.csv', index_col=[0,1])\n",
    "\n",
    "# Note that the forecasted data sets should be in the same order for impacts and twls\n",
    "impacts = {\n",
    "    'forecasted': {\n",
    "    'postintertidal_slope_sto06': df_from_csv('impacts_forecasted_postintertidal_slope_sto06.csv', index_col=[0]),\n",
    "    'postmean_slope_sto06': df_from_csv('impacts_forecasted_postmean_slope_sto06.csv', index_col=[0]),\n",
    "    'preintertidal_slope_sto06': df_from_csv('impacts_forecasted_preintertidal_slope_sto06.csv', index_col=[0]),\n",
    "    'premean_slope_sto06': df_from_csv('impacts_forecasted_premean_slope_sto06.csv', index_col=[0]),\n",
    "        },\n",
    "    'observed': df_from_csv('impacts_observed.csv', index_col=[0])\n",
    "    }\n",
    "\n",
    "twls = {\n",
    "    'forecasted': {\n",
    "    'postintertidal_slope_sto06': df_from_csv('twl_postintertidal_slope_sto06.csv', index_col=[0,1]),\n",
    "    'postmean_slope_sto06': df_from_csv('twl_postmean_slope_sto06.csv', index_col=[0,1]),\n",
    "    'preintertidal_slope_sto06': df_from_csv('twl_preintertidal_slope_sto06.csv', index_col=[0,1]),\n",
    "    'premean_slope_sto06': df_from_csv('twl_premean_slope_sto06.csv', index_col=[0,1]),\n",
    "    }\n",
    "}\n",
    "print('Done!')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Gather data into one dataframe\n",
    "For plotting, gather all our data into one dataframe."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Which forecasted impacts dataframe should we use to assess prediction performance?\n",
    "df_selected_forecast = impacts['forecasted']['postintertidal_slope_sto06']\n",
    "\n",
    "# Create df with all our data\n",
    "df = impacts['observed'].merge(\n",
    "    df_sites_waves, left_index=True, right_index=True)\n",
    "\n",
    "# Join observed/forecasted regimes\n",
    "df_forecasted = df_selected_forecast.rename(\n",
    "    {'storm_regime': 'forecasted_regime'\n",
    "    }, axis='columns').forecasted_regime\n",
    "df = pd.concat([df, df_forecasted], axis=1)\n",
    "\n",
    "# Create new accuracy column which categorises each prediction\n",
    "df.loc[(df.storm_regime == 'swash') & (df.forecasted_regime == 'swash'), 'accuracy'] = 'correct swash'\n",
    "df.loc[(df.storm_regime == 'collision') & (df.forecasted_regime == 'collision'), 'accuracy'] = 'correct collision'\n",
    "df.loc[(df.storm_regime == 'swash') & (df.forecasted_regime == 'collision'), 'accuracy'] = 'overpredicted swash'\n",
    "df.loc[(df.storm_regime == 'collision') & (df.forecasted_regime == 'swash'), 'accuracy'] = 'underpredicted collision'\n",
    "\n",
    "print('df columns:\\n===')\n",
    "for col in sorted(df.columns):\n",
    "    print(col)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Create plots"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Variable pairplot, by observed storm impact\n",
    "Create pairplot of selected variables and look for relationships between each. Colors represent the different observed storm impact regimes."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "g = sns.pairplot(\n",
    "    data=df,\n",
    "    hue='storm_regime',\n",
    "    dropna=True,\n",
    "    palette={\n",
    "        'swash': 'blue',\n",
    "        'collision': 'orange',\n",
    "        'overwash': 'red'\n",
    "    },\n",
    "    plot_kws=dict(s=20, edgecolor=\"white\", linewidth=0.1, alpha=0.1),\n",
    "    vars=['beta_prestorm_mean',\n",
    "          'beta_poststorm_mean',\n",
    "          'beta_diff_mean',\n",
    "          'swash_pct_change',\n",
    "          'width_msl_change_m',\n",
    "          'width_msl_change_pct',\n",
    "          'Exscum'])\n",
    "g.savefig('11_pairplot_observed_impacts.png')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Variable pairplot, by observed/prediction class\n",
    "Create pairplot of selected variables and look for relationships between each. Colors represent the different observed/prediction classes."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "g = sns.pairplot(\n",
    "    data=df,\n",
    "    hue='accuracy',\n",
    "    dropna=True,\n",
    "    palette={\n",
    "        'correct swash': 'blue',\n",
    "        'correct collision': 'green',\n",
    "        'overpredicted swash': 'orange',\n",
    "        'underpredicted collision': 'red',\n",
    "    },\n",
    "    plot_kws=dict(s=20, edgecolor=\"white\", linewidth=0.1, alpha=0.1),\n",
    "    vars=['beta_prestorm_mean',\n",
    "          'beta_poststorm_mean',\n",
    "          'beta_diff_mean',\n",
    "          'swash_pct_change',\n",
    "          'width_msl_change_m',\n",
    "          'width_msl_change_pct',\n",
    "          'Exscum'])\n",
    "g.savefig('11_pairplot_accuracy_classes.png')\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Pre/post storm slope by observed/predicted class"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# First create a melted dataframe since our columns aren't exactly as they should be for plotting\n",
    "df_temp = df.copy()\n",
    "df_temp = df_temp.reset_index()\n",
    "\n",
    "df_melt = pd.melt(\n",
    "    df_temp,\n",
    "    id_vars=['site_id', 'accuracy'],\n",
    "    value_vars=['beta_prestorm_mean', 'beta_poststorm_mean'],\n",
    "    var_name='profile_type',\n",
    "    value_name='beta_mean')\n",
    "\n",
    "df_melt.loc[df_melt.profile_type == 'beta_prestorm_mean','profile_type'] = 'prestorm'\n",
    "df_melt.loc[df_melt.profile_type == 'beta_poststorm_mean','profile_type'] = 'poststorm'\n",
    "df_melt.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "f, ax = plt.subplots(figsize=(6,5))\n",
    "\n",
    "cats = ['correct swash', 'overpredicted swash','underpredicted collision','correct collision']\n",
    "\n",
    "# Box plot of mean slope for each accuracy class, split by pre/post storm profile\n",
    "sns.boxplot(\n",
    "    data=df_melt,\n",
    "    x=\"accuracy\",\n",
    "    y=\"beta_mean\",\n",
    "    hue=\"profile_type\",\n",
    "    order=cats\n",
    ")\n",
    "\n",
    "group_labels = [x.replace(' ','\\n') for x in cats]\n",
    "ax.set_xticklabels(group_labels)\n",
    "\n",
    "# Setup ticks and grid\n",
    "ax.xaxis.grid(True)\n",
    "major_ticks = np.arange(-1, 1, 0.05)\n",
    "minor_ticks = np.arange(-1, 1, 0.01)\n",
    "ax.set_yticks(major_ticks)\n",
    "ax.set_yticks(minor_ticks, minor=True)\n",
    "ax.grid(which='both')\n",
    "ax.grid(which='minor', alpha=0.3,linestyle='--')\n",
    "ax.grid(which='major', alpha=0.8,linestyle='-')\n",
    "\n",
    "ax.set_ylim([-0.02,0.3])\n",
    "\n",
    "f.savefig('11_prepost_slopes_accuracy_classes.png',dpi=600)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Change in slope by observed/predicted class"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "f, ax = plt.subplots(figsize=(6,5))\n",
    "\n",
    "cats = ['correct swash', 'overpredicted swash','underpredicted collision','correct collision']\n",
    "\n",
    "# Box plot of the change in mean slope for each accuracy class\n",
    "sns.boxplot(\n",
    "    data=df,\n",
    "    x=\"accuracy\",\n",
    "    y=\"beta_diff_mean\",\n",
    "    order=cats\n",
    ")\n",
    "\n",
    "group_labels = [x.replace(' ','\\n') for x in cats]\n",
    "ax.set_xticklabels(group_labels)\n",
    "\n",
    "# Setup ticks and grid\n",
    "ax.xaxis.grid(True)\n",
    "major_ticks = np.arange(-1, 1, 0.05)\n",
    "minor_ticks = np.arange(-1, 1, 0.01)\n",
    "ax.set_yticks(major_ticks)\n",
    "ax.set_yticks(minor_ticks, minor=True)\n",
    "ax.grid(which='both')\n",
    "ax.grid(which='minor', alpha=0.3,linestyle='--')\n",
    "ax.grid(which='major', alpha=0.8,linestyle='-')\n",
    "\n",
    "ax.set_ylim([-0.2,0.2])\n",
    "\n",
    "f.savefig('11_change_in_slopes_accuracy_classes.png',dpi=600)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Beach width change histogram (swash regime)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "How much variation in beach width change can we expect in the swash regime?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "f, ax = plt.subplots(figsize=(5,4))\n",
    "\n",
    "# Beach width change (%) for sites observed in the swash regime only\n",
    "swash_width_change = df.loc[df.storm_regime == 'swash', 'width_msl_change_pct'].dropna()\n",
    "\n",
    "# Draw onto the axes created above rather than relying on the implicit current axes\n",
    "sns.distplot(swash_width_change, kde=False, ax=ax)\n",
    "\n",
    "ax.set_title('Distribution of beach width change for swash regime')\n",
    "# Raw string: '\\D' is not a valid Python escape sequence\n",
    "ax.set_xlabel(r'$\\Delta$ beach width (%)')\n",
    "ax.set_ylabel('Count')\n",
    "\n",
    "f.savefig('11_change_in_beach_width.png',dpi=600)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Split swash regime"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['beta_Pxscum'] = df.beta_prestorm_mean * df.Ecum\n",
    "df['neg_swash_vol_change'] = -df.swash_vol_change\n",
    "\n",
    "ax = sns.scatterplot(data=df.loc[df.storm_regime=='swash'],\n",
    "                       x=\"beta_Pxscum\", \n",
    "                       y='neg_swash_vol_change')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "corr = df.corr(method ='pearson') \n",
    "sns.heatmap(corr, \n",
    "        xticklabels=corr.columns,\n",
    "        yticklabels=corr.columns,\n",
    "           cmap='RdBu_r')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len(corr.columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import scipy.stats as ss\n",
    "#https://stackoverflow.com/a/24469099\n",
    "\n",
    "corr = df.corr(method ='pearson') \n",
    "\n",
    "# Number of observations behind each pairwise correlation. pandas drops NaNs\n",
    "# pair by pair, so the sample size can differ between column pairs.\n",
    "# (This is the number of rows used, not the number of columns.)\n",
    "valid = df[corr.columns].notna().astype(int)\n",
    "n = valid.T.dot(valid).values\n",
    "dof = n - 2\n",
    "\n",
    "# t-statistic of each correlation coefficient. On the diagonal r == 1, so the\n",
    "# denominator is zero and t becomes inf (p-value 0); silence those warnings.\n",
    "r = corr.values\n",
    "with np.errstate(divide='ignore', invalid='ignore'):\n",
    "    t = r * np.sqrt(dof / np.clip(1 - r * r, 0, None))\n",
    "\n",
    "# Two-sided p-value: probability of a |t| at least this large under H0 (r == 0).\n",
    "# A one-sided cdf would give p ~ 1 for strong positive correlations.\n",
    "pvals = 2 * ss.t.sf(np.abs(t), dof)\n",
    "\n",
    "corr.values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "code_folding": []
   },
   "outputs": [],
   "source": [
    "from matplotlib.patches import Ellipse\n",
    "def corrplot(data, pvalues, labels):\n",
    "    \"\"\"Draw a correlation matrix as a grid of ellipses.\n",
    "\n",
    "    Each cell holds one ellipse. Positive correlations use the 'Blues'\n",
    "    colormap and negative ones 'Oranges', with the two signs tilted in\n",
    "    opposite directions; the larger the absolute correlation, the thinner\n",
    "    and darker the ellipse. Cells whose p-value exceeds 0.05 are drawn\n",
    "    dotted and semi-transparent. Cells with a NaN correlation are left empty.\n",
    "\n",
    "    Adapted from https://github.com/louridas/corrplot/blob/master/corrplot.py\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    data : 2-D numpy array\n",
    "        Pairwise correlations between the variables; entry [row, col] is\n",
    "        drawn at x=col, y=row with row 0 at the top.\n",
    "    pvalues : 2-D numpy array or None\n",
    "        p-value of each entry of `data`. If None, a zero matrix is used, so\n",
    "        every ellipse is drawn solid.\n",
    "    labels : list of str\n",
    "        Variable names, used for the tick labels of both axes.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    module\n",
    "        `matplotlib.pyplot`, so the result can be shown with `.show()` or\n",
    "        saved with `.savefig()`.\n",
    "    \"\"\"\n",
    "    num_rows, num_cols = data.shape\n",
    "\n",
    "    if pvalues is None:\n",
    "        pvalues = np.zeros([num_rows, num_cols])\n",
    "\n",
    "    # Create only the figure that is drawn into (no stray empty figure)\n",
    "    plt.figure(figsize=(8, 8))\n",
    "    ax = plt.subplot(1, 1, 1, aspect='equal')\n",
    "\n",
    "    # Fraction of a grid cell that the largest ellipse may occupy\n",
    "    shrink = 0.9\n",
    "\n",
    "    # plt.get_cmap is available on old and new matplotlib alike\n",
    "    # (matplotlib.cm.get_cmap was removed in matplotlib 3.9)\n",
    "    poscm = plt.get_cmap('Blues')\n",
    "    negcm = plt.get_cmap('Oranges')\n",
    "\n",
    "    for row in range(num_rows):\n",
    "        for col in range(num_cols):\n",
    "            d = data[row, col]\n",
    "            if np.isnan(d):\n",
    "                # Undefined correlation: nothing sensible to draw\n",
    "                continue\n",
    "            d_abs = np.abs(d)\n",
    "            rotate = -45 if d > 0 else +45\n",
    "            clrmap = poscm if d >= 0 else negcm\n",
    "            ellipse = Ellipse((col, row),\n",
    "                              width=1 * shrink,\n",
    "                              height=(shrink - d_abs*shrink),\n",
    "                              angle=rotate)\n",
    "            ellipse.set_edgecolor('black')\n",
    "            ellipse.set_facecolor(clrmap(d_abs))\n",
    "            if pvalues[row, col] > 0.05:\n",
    "                # Not significant at the 5% level: de-emphasise\n",
    "                ellipse.set_linestyle('dotted')\n",
    "                ellipse.set_alpha(0.5)\n",
    "            ax.add_artist(ellipse)\n",
    "\n",
    "    # Artists added with add_artist do not autoscale the axes, so set limits by hand\n",
    "    ax.set_xlim(-1, num_cols)\n",
    "    ax.set_ylim(-1, num_rows)\n",
    "\n",
    "    ax.xaxis.tick_top()\n",
    "    ax.set_xticks(np.arange(num_cols))\n",
    "    ax.set_xticklabels(labels, rotation=30, fontsize='small', ha='left')\n",
    "\n",
    "    # Put row 0 at the top so the layout reads like a matrix\n",
    "    ax.invert_yaxis()\n",
    "    ax.set_yticks(np.arange(num_rows))\n",
    "    ax.set_yticklabels(labels, fontsize='small')\n",
    "\n",
    "    return plt\n",
    "\n",
    "import string\n",
    "num_rows = 20\n",
    "num_cols = num_rows\n",
    "\n",
    "min_length = 10\n",
    "max_length = 20\n",
    "\n",
    "alnums = list(string.ascii_uppercase + string.digits)\n",
    "labels = [''.join(np.random.choice(alnums,\n",
    "                                   np.random.randint(min_length,\n",
    "                                                     max_length)))\n",
    "          for y in np.arange(num_rows)]\n",
    "\n",
    "\n",
    "data = np.random.random([num_rows, num_cols])\n",
    "\n",
    "# data[np.random.choice(num_rows, num_rows / 2), :] *= -1\n",
    "\n",
    "np.fill_diagonal(data, 1)\n",
    "\n",
    "data_symm = (data + data.T) / 2\n",
    "\n",
    "# plot = corrplot(data_symm, None, labels)\n",
    "plot = corrplot(corr.values, pvals, corr.columns.tolist())\n",
    "plot.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from scipy import stats\n",
    "\n",
    "x_col = 'beta_prestorm_intertidal'\n",
    "y_col = \"beta_diff_intertidal\"\n",
    "# y_col = 'swash_vol_change'\n",
    "# x_col = \"Pxscum\"\n",
    "\n",
    "# Keep only swash-regime sites with both variables present. linregress returns\n",
    "# NaN for every statistic if any input is NaN, whereas regplot silently drops\n",
    "# such rows, so without this the legend could show NaN next to a fitted line.\n",
    "data = df.loc[df.storm_regime=='swash'].dropna(subset=[x_col, y_col])\n",
    "\n",
    "slope, intercept, r_value, p_value, std_err = stats.linregress(data[x_col],data[y_col])\n",
    "\n",
    "ax = sns.regplot(data=data,\n",
    "                       y=y_col, \n",
    "                       x=x_col, marker=\"+\",fit_reg=True, scatter_kws={'linewidth':1},\n",
    "                line_kws={'label':\"y={0:.2f}x+{1:.2f}\\n(r2={2:.2f})\".format(slope,intercept,r_value**2)})\n",
    "ax.legend()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# df.loc[(df.storm_regime=='swash')&(df.swash_vol_change<10)].sort_values(by=['Pxscum'],ascending=False)\n",
    "df.loc[(df.swash_vol_change>200)].sort_values(by=['Pxscum'],ascending=False)"
   ]
  }
 ],
 "metadata": {
  "hide_input": false,
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.6"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {
    "height": "calc(100% - 180px)",
    "left": "10px",
    "top": "150px",
    "width": "223.594px"
   },
   "toc_section_display": true,
   "toc_window_display": true
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}