"""Build photo-submission statistics for every site listed in coastsnap_sites.csv.

For each site row in the CSV the script walks
``<parent_directory>/Images/<site_name>/Processed/<YYYY>/*.jpg``, parses the
submission time and user out of every filename, and writes into
``<parent_directory>/Images/<site_name>/Statistics``:

* ``statistics.xlsx`` with the sheets Summary, Hours, Days, Months, Years, Users
* ``total_count_plot.png``, ``hour_day_month_plot.png``, ``top_user_plot.png``

Finally ``<parent_directory>/Images/total_counts.xlsx`` receives one row per
site that had at least one counted photo.

NOTE(review): the parsing below assumes that, once every "_" is replaced by
".", a filename splits into fields where [1]=weekday, [2]=month abbreviation,
[3]=day, [4..6]=hour/minute/second, [8]=year, [9]=site and [11]=user -
confirm against the real naming convention.
"""

import os
import csv
import difflib
from collections import Counter
from datetime import datetime
from pathlib import Path

import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
import pandas as pd

# Column order of the per-site "Summary" sheet.
SUMMARY_COLUMNS = ['Site', 'Total count', 'Datetime', 'Year', 'Month', 'Date',
                   'Hour', 'Minute', 'Second', 'Day', 'Month name', 'User']
OVERALL_COLUMNS = ['Site', 'Root_ID', 'Total count']

# Calendar order used for the Days / Months sheets and bar charts.
DAY_ORDER = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday',
             'Saturday', 'Sunday']
MONTH_ORDER = ['January', 'February', 'March', 'April', 'May', 'June', 'July',
               'August', 'September', 'October', 'November', 'December']

# Layout of the timestamp string rebuilt from the filename fields.
TIMESTAMP_FORMAT = "%a.%b.%d.%H.%M.%S.%Y"

# Lower-cased usernames at least this similar are merged on the Users sheet.
USERNAME_SIMILARITY = 0.9

# Bar colours for the top-user chart, ordered bronze, silver, gold.
MEDAL_COLORS = ['#AD8A56', '#D7D7D7', '#AF9500']

# The sites CSV lives in the parent of the current working directory.
code_dir = str(Path(os.getcwd()).parent)
csv_path = os.path.join(code_dir, "coastsnap_sites.csv")
coastsnap_sites_csv = pd.read_csv(csv_path)

# Only the first row's parent directory is read; it is the data root for
# every site.
parent_directories = coastsnap_sites_csv.parent_directory[0]
images_root = os.path.join(parent_directories, "Images")

# One dict per site that ends up in total_counts.xlsx.
overall_rows = []

for _, row in coastsnap_sites_csv.iterrows():
    site_name = row['site_name']
    root_id = row['root_id']
    print(site_name)

    day_of_week_counts = Counter()
    month_counts = Counter()
    hour_counts = Counter()
    year_counts = Counter()
    username_counts = Counter()

    photo_directory = os.path.join(images_root, site_name, "Processed")

    # Rows are collected in a list and turned into a DataFrame once, rather
    # than concatenating a one-row frame per photo.
    photo_rows = []

    # Starts at -1 so the first .jpg encountered is skipped and not counted.
    total = -1

    # os.listdir() returns entries in arbitrary order. Sorting makes the
    # skipped "first" photo and the running total deterministic.
    # NOTE(review): the running total is only chronological if the filenames
    # sort chronologically - confirm against the naming convention.
    for year_directory in sorted(os.listdir(photo_directory)):
        # Only folders named as a four-digit year are scanned.
        if not year_directory.isdigit() or len(year_directory) != 4:
            continue

        year_path = os.path.join(photo_directory, year_directory)

        for filename in sorted(os.listdir(year_path)):
            if not filename.endswith(".jpg"):
                continue

            total += 1
            if total == 0:
                continue

            file_parts = filename.replace("_", ".").split(".")

            # NOTE(review): the counts use the last field before the
            # extension while the Summary sheet uses field 11; these differ
            # when a username itself contains "." or "_". Left as in the
            # original pending confirmation of the filename format.
            username = file_parts[-2]

            timestamp = ".".join(file_parts[1:7] + [file_parts[8]])
            timestamp_datetime = datetime.strptime(timestamp, TIMESTAMP_FORMAT)

            day_of_week_counts[timestamp_datetime.strftime("%A")] += 1
            month_counts[timestamp_datetime.strftime("%B")] += 1
            hour_counts[timestamp_datetime.hour] += 1
            year_counts[timestamp_datetime.year] += 1
            username_counts[username] += 1

            photo_rows.append({
                'Site': file_parts[9],
                'Datetime': timestamp_datetime,
                'Year': int(file_parts[8]),
                'Month': datetime.strptime(file_parts[2], '%b').month,
                'Date': int(file_parts[3]),
                'Hour': int(file_parts[4]),
                'Minute': int(file_parts[5]),
                'Second': int(file_parts[6]),
                'Month name': file_parts[2],
                'Day': file_parts[1],
                'User': file_parts[11],
                'Total count': total,
            })

    # Nothing to report for a site with no counted photos.
    if total < 1:
        continue

    df = pd.DataFrame(photo_rows, columns=SUMMARY_COLUMNS)

    # Top three users, stored in ascending order so the largest bar is drawn
    # last (at the top) in the horizontal bar chart.
    # NOTE(review): this uses the raw per-username counts, not the merged
    # similar-username counts computed for the Users sheet below.
    dftopuser = (
        pd.DataFrame(username_counts.most_common(3), columns=['User', 'Count'])
        .sort_values(by=['Count'], ascending=True)
        .reset_index(drop=True)
    )

    ###
    # Build the per-site tables
    ###
    dfhours = pd.DataFrame(
        list(hour_counts.items()), columns=['Hour', 'Count']
    ).sort_values(by=['Hour'])

    # Reindexing puts the rows in calendar order; days / months without any
    # photo become NaN rows.
    dfdays = pd.DataFrame(
        list(day_of_week_counts.items()), columns=['Day', 'Count']
    ).set_index('Day').reindex(DAY_ORDER)

    dfmonths = pd.DataFrame(
        list(month_counts.items()), columns=['Month', 'Count']
    ).set_index('Month').reindex(MONTH_ORDER)

    dfyears = pd.DataFrame(
        list(year_counts.items()), columns=['Year', 'Count']
    ).sort_values(by=['Year'])

    dfuser = pd.DataFrame(
        list(username_counts.items()), columns=['User', 'Count']
    ).sort_values(by=['Count'], ascending=False)

    # Merge usernames that are (nearly) identical ignoring case. Users are
    # visited from most to fewest photos, so the spelling of the most active
    # variant is the one that is kept.
    # Maps lower-cased username -> [spelling to report, summed count].
    combined_entries = {}
    for original_entry, value in zip(dfuser['User'], dfuser['Count']):
        entry = original_entry.lower()
        similar_entry = next(
            (key for key in combined_entries
             if difflib.SequenceMatcher(None, entry, key).ratio()
             >= USERNAME_SIMILARITY),
            None,
        )
        if similar_entry is not None:
            combined_entries[similar_entry][1] += value
            print(f"Combined entry: {original_entry} -> {combined_entries[similar_entry][0]}")
        else:
            combined_entries[entry] = [original_entry, value]

    dfuser = pd.DataFrame(list(combined_entries.values()), columns=['User', "Count"])

    ###
    # Save statistics to an excel spreadsheet
    ###
    statistics_path = os.path.join(images_root, site_name, "Statistics")
    os.makedirs(statistics_path, exist_ok=True)

    # The context manager closes the workbook even if a sheet fails to write.
    with pd.ExcelWriter(os.path.join(statistics_path, 'statistics.xlsx')) as writer:
        df.to_excel(writer, sheet_name='Summary', index=False)
        dfhours.to_excel(writer, sheet_name='Hours', index=False)
        dfdays.to_excel(writer, sheet_name='Days', index=True)
        dfmonths.to_excel(writer, sheet_name='Months', index=True)
        dfyears.to_excel(writer, sheet_name='Years', index=False)
        dfuser.to_excel(writer, sheet_name='Users', index=False)

    overall_rows.append({
        'Site': site_name,
        'Root_ID': root_id,
        'Total count': total,
    })

    ###
    # Plots
    ###
    # Checked before any figure is created so that skipping the site does
    # not leave an open figure behind.
    time_range = df['Datetime'].max() - df['Datetime'].min()
    if time_range == pd.Timedelta(days=0):
        continue

    # --- Cumulative submissions over time ---
    fig, ax = plt.subplots(figsize=(7, 4))
    fig.subplots_adjust(right=0.8)
    ax.plot(df['Datetime'], df['Total count'])

    # Fewer labelled months the longer the record is.
    if time_range < pd.Timedelta(days=365):
        major_months = [1, 3, 5, 7, 9, 11]
    elif time_range < pd.Timedelta(days=365 * 2):
        major_months = [1, 4, 7, 10]
    else:
        major_months = [1, 7]

    ax.xaxis.set_minor_locator(mdates.MonthLocator())
    ax.xaxis.set_major_locator(mdates.MonthLocator(bymonth=major_months))
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%b\n%Y'))

    ax.set_xlabel('Month')
    ax.set_ylabel('Cumulative count')
    ax.set_title('Cumulative submissions since installation')

    # Fractional days: the whole-day count is 0 for records shorter than one
    # day, which would divide by zero.
    span_days = time_range / pd.Timedelta(days=1)
    table_data = [
        ["Total number\nof images", total],
        ["Submissions\nper week", round(total * 7 / span_days, 1)],
    ]
    table = ax.table(cellText=table_data, colWidths=[0.2, 0.1], cellLoc='left',
                     loc='right', bbox=[1.05, 0.35, 0.4, 0.4])
    table.auto_set_font_size(False)
    table.set_fontsize(10)
    table.scale(1, 3)

    # NOTE(review): only the first row's value cell is touched, and through
    # a private attribute - confirm whether both rows were meant to be
    # centred.
    table.get_celld()[0, 1]._loc = 'center'

    # Leave room under the axes for the two-line tick labels.
    plt.subplots_adjust(bottom=0.2)
    fig.savefig(os.path.join(statistics_path, "total_count_plot.png"),
                bbox_inches='tight')
    plt.close(fig)

    # --- Submissions by hour, weekday and month ---
    fig, axs = plt.subplots(1, 3, figsize=(12, 4))

    hour_ticks = np.arange(0, 24, 3)
    axs[0].bar(dfhours["Hour"], dfhours["Count"])
    axs[0].set_xlabel("Time of Day")
    axs[0].set_ylabel("Counts")
    axs[0].set_title("Submissions by\ntime of the day")
    axs[0].set_xticks(hour_ticks)  # Major ticks every 3 hours
    axs[0].set_xticks(np.arange(0, 24, 1), minor=True)  # Minor ticks every hour
    axs[0].set_xticklabels([f"{h:02d}" for h in hour_ticks])

    axs[1].bar(dfdays.index, dfdays["Count"])
    axs[1].set_xlabel("Days of the Week")
    axs[1].set_ylabel("Counts")
    axs[1].set_title("Submissions by\nday of the week")
    axs[1].set_xticks(np.arange(7))
    axs[1].set_xticklabels(["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"])

    axs[2].bar(dfmonths.index, dfmonths["Count"])
    axs[2].set_xlabel("Months of the Year")
    axs[2].set_ylabel("Counts")
    axs[2].set_title("Submissions by\nmonth of the year")
    axs[2].set_xticks(np.arange(12))
    axs[2].set_xticklabels(["J", "F", "M", "A", "M", "J", "J", "A", "S", "O", "N", "D"])

    plt.tight_layout()
    fig.savefig(os.path.join(statistics_path, "hour_day_month_plot.png"))
    plt.close(fig)

    # --- Top users ---
    users = dftopuser['User']
    counts = dftopuser['Count']

    fig, ax = plt.subplots(figsize=(7, 2))

    # Bars are in ascending order, so take colours from the gold end of the
    # palette: the top user is gold even when there are fewer than three.
    ax.barh(users, counts, color=MEDAL_COLORS[-len(users):])

    # Names are written inside the bars, right-aligned just short of the end.
    label_offset = 0.02 * max(counts)
    for position, (user, count) in enumerate(zip(users, counts)):
        ax.text(count - label_offset, position, str(user), ha='right', va='center')

    ax.set_title('Top Users')
    ax.set_xlabel('Count')
    ax.yaxis.set_ticks([])

    # Extra bottom margin so the x-axis label is not cut off.
    plt.subplots_adjust(bottom=0.3)
    fig.savefig(os.path.join(statistics_path, "top_user_plot.png"))
    plt.close(fig)

# Save overall counts in the Images directory
dfoverall = pd.DataFrame(overall_rows, columns=OVERALL_COLUMNS)
with pd.ExcelWriter(os.path.join(images_root, 'total_counts.xlsx')) as writer:
    dfoverall.to_excel(writer, index=False)