CoastsnapAuto/coastsnap/spotteron_batch_download.py

"""
Batch download of CoastSnap images from the Spotteron API for every site listed in
coastsnap_sites.csv.
"""

import datetime
import re
from pathlib import Path
from urllib.parse import urljoin
from os import path, makedirs, getcwd

import attr
import pytz
import requests
import pandas as pd
from loguru import logger
from timezonefinder import TimezoneFinder
from werkzeug.utils import secure_filename

# Directory one level above the current working directory; the site table
# (coastsnap_sites.csv) is looked up there.
code_images_dir = str(Path(getcwd()).parent)
coastsnap_sites_path = path.join(code_images_dir, "coastsnap_sites.csv")
# Loaded once, at import time. The batch loop at the bottom of this file reads the
# parent_directory, site_name, root_id and limit columns of this table.
coastsnap_sites = pd.read_csv(coastsnap_sites_path)

@attr.s()
class SpotteronImage:
    """
    Parses the dictionary from the Spotteron API into an object to make it easier to
    get the required parameters

    Attributes:
        raw_data: Spot dictionary from the API. The properties below read its "id"
            key and the "latitude", "longitude", "spotted_at", "image" and
            "spotted_by_name" entries under "attributes".
        site_name: Site label embedded in the output filename. May be falsy, in
            which case a shorter filename without the site name is produced.
    """

    raw_data = attr.ib()
    site_name = attr.ib()

    __img_url = "https://files.spotteron.com/images/spots/"

    # Shared TimezoneFinder, created lazily on first `tz` access. Building one per
    # lookup is wasteful because `tz` is hit several times for every image
    # (via dt / timestamp / output_filename).
    _tz_finder = None

    def exists(self, folder):
        """
        Check if image has already been downloaded

        Args:
            folder: Parent directory; the image is looked for in the sub-folder
                named after the year the photo was taken.

        Returns:
            True if the expected output file is present, otherwise False.
        """
        return Path(folder, str(self.dt.year), self.output_filename).is_file()

    def save(self, folder):
        """
        Download the image and write it to "<folder>/<year>/<output_filename>".

        Args:
            folder: Parent directory, e.g. ".../Images/alex/Processed". The year
                sub-folder (e.g. "2022") is appended here and created if missing.

        Nothing is written when the server answers with a status other than 200;
        a warning is logged instead.
        """

        # Concatenate year to parent dir, e.g. ".../alex/Processed" + "/2022"
        folder = path.join(folder, str(self.dt.year))
        makedirs(folder, exist_ok=True)

        # Concatenate filename to the year dir, e.g.
        # ".../alex/Processed/2022/1641158046.Mon.Jan.03_07_14_06.AEST.2022.alex.snap.Raymond_b.jpg"
        output_filepath = Path(folder, self.output_filename)

        logger.info(f"Downloading {output_filepath}")
        # The whole body is needed at once, so no streaming: the response is
        # always fully read, including on the failure path.
        response = requests.get(self.url)
        if response.status_code != 200:
            logger.warning(
                f"Could not download {self.url} (HTTP {response.status_code}). "
                f"{output_filepath} was not written"
            )
            return

        with open(output_filepath, "wb") as f:
            f.write(response.content)

    @property
    def id(self):
        """Spot id as given by the API."""
        return self.raw_data["id"]

    @property
    def lat(self):
        """Latitude of the spot."""
        return self.raw_data["attributes"]["latitude"]

    @property
    def lon(self):
        """Longitude of the spot."""
        return self.raw_data["attributes"]["longitude"]

    @property
    def tz(self):
        """
        Finds timezone based on lon/lat
        """
        finder = SpotteronImage._tz_finder
        if finder is None:
            finder = SpotteronImage._tz_finder = TimezoneFinder()
        return finder.timezone_at(lng=self.lon, lat=self.lat)

    @property
    def dt(self):
        """
        Parses 'spotted at' attributes and returns a timezone aware python datetime

        The "spotted_at" string is read as "%Y-%m-%d %H:%M:%S" and localized with
        the timezone found for the spot's coordinates.
        """
        spotted_at = self.raw_data["attributes"]["spotted_at"]
        spotted_dt = datetime.datetime.strptime(spotted_at, "%Y-%m-%d %H:%M:%S")
        return pytz.timezone(self.tz).localize(spotted_dt)

    @property
    def timestamp(self):
        """POSIX timestamp of `dt`."""
        return self.dt.timestamp()

    @property
    def url(self):
        """
        URL to download the image
        """
        img_name = f"{self.raw_data['attributes']['image']}.jpg"
        return urljoin(self.__img_url, img_name)

    @property
    def author(self):
        """Name of the uploader, made safe for use inside a filename."""
        author = self.raw_data["attributes"]["spotted_by_name"]

        # Sanitize author and remove spaces
        author = secure_filename(author)
        author = re.sub(r"\s+", "", author)
        return author

    @property
    def output_filename(self):
        """
        Define the name of the image depending on its properties. Optional site_name
        can be included.

        With a site name the result looks like
        "1641158046.Mon.Jan.03_07_14_06.AEST.2022.alex.snap.Raymond_b.jpg".
        """

        # Resolve the datetime once; every access to self.dt repeats the
        # timezone lookup and the parsing.
        dt = self.dt
        epoch = int(dt.timestamp())

        if self.site_name:
            return (
                f"{epoch}."
                f'{dt.strftime("%a.%b.%d_%H_%M_%S.")}{dt.tzname()}.{dt.strftime("%Y")}.'
                f"{self.site_name}.snap.{self.author}.jpg"
            )
        else:
            print("Please provide a site-name. Otherwise file names won't follow the reccomended naming convention")
            return (
                f"{epoch}."
                f'{dt.strftime("%a.%b.%d_%H_%M_%S.%z.%Y")}.{self.author}.jpg'
            )


@attr.s
class Spotteron:
    """
    Small client for the Spotteron spots API: pages through the spots of a station
    and saves their images.

    Refer to https://www.spotteron.com/docs/api/v2?topic_id=37&key=LDazWbK5n62lbNA4hRNHtLa6hkyqz6Tr
    for API documentation
    """

    api_url = "https://www.spotteron.com/api/v2/spots"

    def save_images(self, root_id, output_folder, site_name, limit, overwrite):
        """
        Page through the spots of a station and save their images.

        Args:
            root_id: Spotteron id of the station; passed to the API as a filter.
            output_folder: Parent directory for the images. SpotteronImage.save
                appends the year sub-folder itself.
            site_name: Site label embedded in the image filenames.
            limit: Stop once this many images have been saved.
            overwrite: When falsy, paging stops at the first image that is already
                on disk (unless it is a repeat of the image just before it).
                When truthy, existing files are downloaded again.

        NOTE(review): stopping at the first existing image presumes the API lists
        the newest spots first - confirm against the API documentation.
        """

        page = 1
        n_downloaded = 0
        previous_img = None  # image handled just before the current one, across pages
        finished = False

        while not finished:
            json_data = self.get_data(page=page, root_id=root_id)
            images = [
                SpotteronImage(raw_data=x, site_name=site_name)
                for x in json_data["data"]
            ]

            if not images:
                if page == 1:
                    logger.info("No images returned. Check correct root_id is supplied")
                else:
                    # An empty page after at least one full page just means
                    # there is nothing left to fetch.
                    logger.info("No further images returned. All available pages were read")
                break

            for img in images:
                # Remember the predecessor explicitly. Indexing with index-1
                # would wrap around to the last image of the page for the
                # first entry.
                prior_img, previous_img = previous_img, img

                if img.exists(output_folder) and not overwrite:

                    # Check for sequential duplicate images: the same photo
                    # uploaded twice yields the same filename, so the file
                    # exists only because its twin was handled a moment ago.
                    if (
                        prior_img is not None
                        and img.output_filename == prior_img.output_filename
                    ):
                        logger.info(f"Duplicate Spotteron upload: {img.output_filename}")
                        continue

                    logger.info("Existing images found. Stopping getting images")
                    finished = True
                    break

                # output_folder is the parent dir; the year folder is added in save()
                img.save(output_folder)
                n_downloaded += 1

                if n_downloaded >= limit:
                    logger.info(f"Downloaded limit of {limit} images. Stopping.")
                    finished = True
                    break

            page += 1

        logger.info("Download completed")

    @classmethod
    def get_data(cls, page, root_id=None):
        """
        Gets the json data for a particular topic_id and root_id. Returns a dictionary
        containing data returned by api.

        Args:
            page: Page number to request; each page holds up to 5 spots.
            root_id: Optional station id. When falsy, no root_id filter is sent.
        """

        # Defined by Spotteron for coastsnap stations
        topic_id = 37

        payload = {
            "filter[topic_id]": topic_id,
            "limit": 5,
            "page": page,
        }

        if root_id:
            payload["filter[root_id]"] = root_id

        r = requests.get(cls.api_url, params=payload)
        return r.json()


# @app.command()
# def from_spotteron(
#     root_id: int = typer.Argument(..., help="Spotteron id of Coastsnap station."),
#     output_folder: str = typer.Argument(..., help="Path to save images to."),
#     site_name: str = typer.Option(None, help="Add site to filename."),
#     limit: int = typer.Option(30, help="Max number of images to save."),
#     overwrite: bool = typer.Option(False, help="Overwrite downloaded images?"),
# ):
    """
    Downloads images from Spotteron API and saves to folder

    """
def from_spotteron(root_id, output_folder, site_name=None, limit=30, overwrite=False):
    """
    Downloads images from Spotteron API and saves to folder

    Args:
        root_id: Spotteron id of Coastsnap station.
        output_folder: Path to save images to.
        site_name: Add site to filename.
        limit: Max number of images to save.
        overwrite: Overwrite downloaded images?

    The defaults mirror the ones of the former command line signature.
    """
    spot = Spotteron()
    spot.save_images(root_id, output_folder, site_name, limit, overwrite)


# @app.command()
# def from_spotteron_batch(
#         overwrite: bool = typer.Option(False, help="Overwrite downloaded images?"),
#         ):
#     """
#     Downloads images from Spotteron API for all beaches specified in batch_download.csv
#     """

#     #all_beaches = pd.read_csv(r"C:\Users\z5079346\OneDrive - UNSW\Code\coastsnap\coastsnap\spotteron_batch_download\batch_download.csv")

#     # Retrieve Parent Directory in batch_download.csv
#     parent_directory = coastsnap_sites.parent_directory[0]
#     print(parent_directory)

#     for index, beach in coastsnap_sites.iterrows():

#         # Concatentate the parent directory, site name and 'Processed'
#         # to create the output site_path
#         site_name = beach.site_name
#         site_path = path.join(parent_directory, site_name, 'Processed')

#         # Download the images for a given site
#         logger.info(f"Downloading images for {beach.site_name}")
#         from_spotteron(beach.root_id, site_path, site_name, limit = beach.limit, overwrite = overwrite)

# if __name__ == "__main__":
#     app()

#overwrite: bool = typer.Option(False, help="Overwrite downloaded images?"),
#    ):
"""
Downloads images from Spotteron API for all beaches specified in coastsnap_sites.csv
"""

#all_beaches = pd.read_csv(r"C:\Users\z5079346\OneDrive - UNSW\Code\coastsnap\coastsnap\spotteron_batch_download\batch_download.csv")

# Batch driver: runs when this module is executed or imported.
# Retrieve the CoastSnap root directory from the first row of coastsnap_sites.csv;
# images are stored below its 'Images' sub-folder.
CoastSnap_directory = coastsnap_sites.parent_directory[0]
parent_directory = path.join(CoastSnap_directory, 'Images')

# One download run per row (site) of the table
for index, beach in coastsnap_sites.iterrows():

    # Concatenate the parent directory, site name and 'Processed'
    # to create the output site_path
    site_name = beach.site_name
    site_path = path.join(parent_directory, site_name, 'Processed')

    # Download the images for a given site. Existing images are never overwritten
    # here; the per-site download limit comes from the table.
    logger.info(f"Downloading images for {beach.site_name}")
    from_spotteron(beach.root_id, site_path, site_name, limit = beach.limit, overwrite = False)