Improve performance of TWL forecasting function

Use faster pandas indexing instead of the `.query` function
develop
Chris Leaman 6 years ago
parent 25a26d9e46
commit a9a5e02933

@ -35,10 +35,11 @@ def forecast_twl(
# Estimate foreshore slope. Do the analysis per site_id. This is so we only have to query the x and z
# cross-section profiles once per site.
logger.info("Calculating beach slopes")
site_ids = df_twl.index.get_level_values("site_id").unique()
if slope == "foreshore":
logger.info("Calculating foreshore slopes")
# Process each site_id with a different process and combine results at the end
with Pool(processes=n_processes) as pool:
results = pool.starmap(
@ -48,35 +49,46 @@ def forecast_twl(
df_twl["beta"] = pd.concat(results)
elif slope == "mean":
df_temp = df_twl.join(
df_profile_features.query(
"profile_type=='{}'".format(profile_type)
).reset_index(level="profile_type"),
how="inner",
)
df_temp["mhw"] = 0.5
logger.info("Calculating mean (dune toe to MHW) slopes")
btm_z = 0.5 # m AHD
# When calculating mean slope, we go from the dune toe to mhw. However, in some profiles, the dune toe is not
# defined. In these cases, we should go to the dune crest
df_temp["top_elevation"] = df_temp["dune_toe_z"]
df_temp.loc[df_temp.dune_toe_z.isnull(), "top_elevation"] = df_temp.loc[
df_temp.dune_toe_z.isnull(), "dune_crest_z"
]
df_temp["top_x"] = df_temp["dune_toe_x"]
df_temp.loc[df_temp.dune_toe_x.isnull(), "top_x"] = df_temp.loc[
df_temp.dune_toe_x.isnull(), "dune_crest_x"
]
# defined. In these cases, we should go to the dune crest. Let's make a temporary dataframe which has this
# already calculated.
df_top_ele = df_profile_features.xs(profile_type, level="profile_type").copy()
df_top_ele.loc[:, "top_ele"] = df_top_ele.dune_toe_z
df_top_ele.loc[
df_top_ele.top_ele.isnull().values, "top_ele"
] = df_top_ele.dune_crest_z
n_no_top_ele = len(df_top_ele[df_top_ele.top_ele.isnull()].index)
if n_no_top_ele != 0:
logger.warning(
"{} sites do not have dune toes/crests to calculate mean slope".format(
n_no_top_ele
)
)
with Pool(processes=n_processes) as pool:
results = pool.starmap(
mean_slope_for_site_id,
[
(site_id, df_temp, df_profiles, "top_elevation", "top_x", "mhw")
for site_id in site_ids
],
df_slopes = (
df_profiles.xs(profile_type, level="profile_type")
.dropna(subset=["z"])
.groupby("site_id")
.apply(
lambda x: slope_from_profile(
profile_x=x.index.get_level_values("x").tolist(),
profile_z=x.z.tolist(),
top_elevation=df_top_ele.loc[x.index[0][0], :].top_ele,
btm_elevation=btm_z,
method="least_squares",
)
)
.rename("beta")
.to_frame()
)
# Merge calculated slopes onto each twl timestep
df_twl = df_twl.merge(df_slopes, left_index=True, right_index=True)
df_twl["beta"] = pd.concat(results)
elif slope == "intertidal":
logger.info("Calculating intertidal slopes")

Loading…
Cancel
Save