# -*- coding: utf-8 -*-
"""
Created on Wed Jun 28 09:48:06 2023

@author: z5079346
"""

import os
import re
import pandas as pd
from datetime import datetime

base_dir = "C:/Users/z5079346/OneDrive - UNSW/CoastSnap/Images"

# Columns of the output table, in CSV order.
COLUMNS = ["Site", "Date", "Creator", "Processed", "Photoshop", "Registered", "Rectified", "FilePath"]

# Type-folder names that are tracked as boolean flag columns.
IMAGE_TYPES = ("Processed", "Photoshop", "Registered", "Rectified")

# Substrings (case-sensitive) that mark a file as a trail-camera image to ignore.
TRAILCAM_MARKERS = ("4Gtrailcam", "3gtrailcam", "ReconSC950", "swiftenduro", "SwiftEnduro4G")

# Compiled once instead of on every file.
YEAR_DIR_RE = re.compile(r'^\d{4}$')
SUFFIX_RE = re.compile(r'(_tagged|_registered)$')
# Trailing run of digits. For '_0009_1633055855' this yields '1633055855';
# the previous pattern '(_\d+)?(\d+)$' backtracked to a single digit there.
EPOCH_RE = re.compile(r'(\d+)$')


def is_trailcam_image(file):
    """Return True if the file name contains any trail-camera marker."""
    return any(marker in file for marker in TRAILCAM_MARKERS)


def split_site_and_type(root, base_dir):
    """Return ``(site, image_type)`` for a directory below ``base_dir``.

    The first two path components of ``root`` relative to ``base_dir`` are
    taken as the site and the image type. Working on the relative path keeps
    the result independent of how many separators ``base_dir`` itself
    contains and of the platform's separator.

    Returns None when ``root`` is fewer than two levels below ``base_dir``.
    """
    rel_parts = os.path.relpath(root, base_dir).split(os.sep)
    if len(rel_parts) < 2:
        return None
    return rel_parts[0], rel_parts[1]


def parse_image_filename(file):
    """Extract ``(date, creator)`` from an image file name.

    The extension and a trailing '_tagged' / '_registered' suffix are
    removed, then the name is split on '.'.

    * date: the trailing digits of the first field are read as an epoch
      timestamp and formatted as 'YYYY-MM-DD HH:MM:SS' in local time. If the
      value cannot be converted, the raw text is returned instead.
    * creator: the second-to-last dot-separated field, or '' when the name
      has fewer than two fields.
    """
    raw_filename = os.path.splitext(file)[0]
    clean_filename = SUFFIX_RE.sub('', raw_filename)
    name_parts = clean_filename.split('.')

    epoch_match = EPOCH_RE.search(name_parts[0])
    if epoch_match:
        epoch_time = epoch_match.group(1)
    else:
        print(f"Unexpected filename format: {file}.")
        epoch_time = name_parts[0]  # fall back to the first field as-is

    try:
        date = datetime.fromtimestamp(int(epoch_time)).strftime('%Y-%m-%d %H:%M:%S')
    except (ValueError, OverflowError, OSError):
        # fromtimestamp raises OverflowError/OSError (not only ValueError)
        # for values outside the platform's supported range.
        print(f"Unable to convert epoch time to date for file: {file}. Using raw epoch time as date.")
        date = epoch_time

    # NOTE(review): the original comment described this field as the
    # creator's name; confirm against the actual file naming convention.
    if len(name_parts) >= 2:
        creator = name_parts[-2]
    else:
        print(f"No creator field in filename: {file}.")
        creator = ""

    return date, creator


def build_image_index(base_dir):
    """Walk ``base_dir`` and build one table row per (site, date, creator).

    Only '.jpg' files that sit in a directory whose name is exactly four
    digits are considered, and trail-camera images are skipped. The site and
    image type come from the first two folders below ``base_dir``. Each row
    records which of the known image types were seen for that image and the
    path of the first file encountered.

    Type folders that are not in IMAGE_TYPES leave all flags False.

    Returns a pandas DataFrame with the columns listed in COLUMNS.
    """
    # Keyed by (site, date, creator): O(1) lookup per file instead of
    # masking and re-concatenating the whole DataFrame each time. Dicts keep
    # insertion order, so rows appear in first-seen order.
    rows = {}
    last_site = None

    for root, _dirs, files in os.walk(base_dir):
        # Both checks depend only on the directory, so do them once here.
        if not YEAR_DIR_RE.match(os.path.basename(root)):
            continue
        location = split_site_and_type(root, base_dir)
        if location is None:
            continue
        site, image_type = location

        for file in files:
            if not file.endswith(".jpg") or is_trailcam_image(file):
                continue

            # Print the site name once each time it changes.
            if site != last_site:
                print(f"Processing site: {site}")
                last_site = site

            date, creator = parse_image_filename(file)

            key = (site, date, creator)
            row = rows.get(key)
            if row is None:
                row = {"Site": site, "Date": date, "Creator": creator}
                row.update({name: False for name in IMAGE_TYPES})
                row["FilePath"] = os.path.join(root, file)
                rows[key] = row
            if image_type in IMAGE_TYPES:
                row[image_type] = True

    return pd.DataFrame(list(rows.values()), columns=COLUMNS)


df = build_image_index(base_dir)

# Save the DataFrame to a CSV file
df.to_csv("image_data.csv", index=False)