# -*- coding: utf-8 -*-
"""
Created on Wed Jun 28 09:48:06 2023

@author: z5079346

Build a CSV index of the CoastSnap images stored under ``base_dir``.

The script walks ``base_dir`` and considers every ``.jpg`` file whose parent
directory is named like a four-digit year. The first two path components
below ``base_dir`` are taken as the site and the image type. Each distinct
(site, date, creator) combination becomes one row; the boolean columns
record which of the known image types were seen for it. The result is
written to ``image_data.csv`` in the current working directory.
"""

import os
import re
import pandas as pd
from datetime import datetime

base_dir = "C:/Users/z5079346/OneDrive - UNSW/CoastSnap/Images"

# Column layout of the output table / CSV.
COLUMNS = ["Site", "Date", "Creator", "Processed", "Photoshop", "Registered", "Rectified", "FilePath"]

# Image-type folder names that have a boolean column of their own.
STATUS_COLUMNS = ("Processed", "Photoshop", "Registered", "Rectified")

# Files whose name contains any of these markers are trail-camera images
# and are ignored.
TRAILCAM_MARKERS = ('4Gtrailcam', '3gtrailcam', 'ReconSC950', 'swiftenduro', 'SwiftEnduro4G')

# Compiled once: these are applied to every directory / file visited.
YEAR_DIR_RE = re.compile(r'^\d{4}$')
SUFFIX_RE = re.compile(r'(_tagged|_registered)$')
# Trailing run of digits. The previous pattern, r'(_\d+)?(\d+)$', let its
# optional group swallow everything but the last digit, so a name such as
# '_0009_1633055855' produced the "epoch" '5'.
TRAILING_DIGITS_RE = re.compile(r'(\d+)$')


def parse_image_filename(file):
    """Extract the capture date and creator field from an image file name.

    Parameters
    ----------
    file : str
        File name including its extension.

    Returns
    -------
    tuple of (str, str)
        ``(date, creator)``. ``date`` is formatted ``'%Y-%m-%d %H:%M:%S'`` in
        the machine's local timezone, or the raw epoch text when it cannot be
        converted. ``creator`` is the second-to-last dot-separated field of
        the name, or ``''`` when the name has fewer than two fields.
    """
    raw_filename = os.path.splitext(file)[0]
    # Ignore '_tagged' / '_registered' suffixes so variants of the same image
    # produce the same fields.
    clean_filename = SUFFIX_RE.sub('', raw_filename)
    fields = clean_filename.split('.')

    # The first field may carry a prefix such as '_0009_' before the epoch.
    epoch_match = TRAILING_DIGITS_RE.search(fields[0])
    if epoch_match:
        epoch_time = epoch_match.group(1)
    else:
        print(f"Unexpected filename format: {file}.")
        epoch_time = fields[0]  # fall back to the first field as it is

    try:
        date = datetime.fromtimestamp(int(epoch_time)).strftime('%Y-%m-%d %H:%M:%S')
    except (ValueError, OverflowError, OSError):
        # int() rejects non-numeric text; fromtimestamp() raises OverflowError
        # or OSError for values outside the platform's supported range.
        print(f"Unable to convert epoch time to date for file: {file}. Using raw epoch time as date.")
        date = epoch_time

    # NOTE(review): the original comment said the creator is "the second last
    # part before 'snap'"; the code takes the second-to-last field of the
    # whole name. Confirm against the real file-name layout.
    if len(fields) >= 2:
        creator = fields[-2]
    else:
        print(f"No creator field in filename: {file}.")
        creator = ""

    return date, creator


def build_image_index(image_root):
    """Walk ``image_root`` and return one row per (site, date, creator).

    Parameters
    ----------
    image_root : str
        Directory expected to contain ``<site>/<type>/.../<year>/<file>.jpg``.

    Returns
    -------
    pandas.DataFrame
        Table with the columns listed in ``COLUMNS``. ``FilePath`` holds the
        path of the first file seen for the row.
    """
    # Keyed by (site, date, creator); dicts keep insertion order, so rows come
    # out in the order they were first seen. This replaces a full-table scan
    # plus pd.concat for every file.
    records = {}
    last_site = None

    for root, _dirs, files in os.walk(image_root):
        # Only directories named like a year hold images of interest.
        if not YEAR_DIR_RE.match(os.path.basename(root)):
            continue

        # Work relative to the walk root so the result does not depend on how
        # many components (or which separators) image_root itself contains.
        rel_parts = os.path.relpath(root, image_root).split(os.sep)
        if len(rel_parts) < 2:
            print(f"Skipping directory outside the site/type layout: {root}")
            continue
        site, image_type = rel_parts[0], rel_parts[1]

        for file in files:
            if not file.endswith(".jpg"):
                continue

            # IGNORE TRAILCAM IMAGES
            if any(marker in file for marker in TRAILCAM_MARKERS):
                continue

            # Print the site name once per run of files from the same site.
            if site != last_site:
                print(f"Processing site: {site}")
                last_site = site

            full_path = os.path.join(root, file)
            date, creator = parse_image_filename(file)

            key = (site, date, creator)
            record = records.get(key)
            if record is None:
                records[key] = {
                    "Site": site,
                    "Date": date,
                    "Creator": creator,
                    "Processed": image_type == "Processed",
                    "Photoshop": image_type == "Photoshop",
                    "Registered": image_type == "Registered",
                    "Rectified": image_type == "Rectified",
                    "FilePath": full_path,
                }
            elif image_type in STATUS_COLUMNS:
                # Only flag known types; an unknown type folder must not add
                # a new column to the table.
                record[image_type] = True

    return pd.DataFrame(list(records.values()), columns=COLUMNS)


df = build_image_index(base_dir)

# Save the DataFrame to a CSV file
df.to_csv("image_data.csv", index=False)