"""Per-site CoastSnap submission statistics.

For every site listed in ``coastsnap_sites.csv`` (looked up in the parent of
the current working directory) this script scans
``<parent_directory>/Images/<site_name>/Processed/<YYYY>/*.jpg``, parses the
timestamp and user name encoded in each file name, and writes into
``<parent_directory>/Images/<site_name>/Statistics``:

* ``statistics.xlsx`` with the sheets Summary, Hours, Days, Months, Years
  and Users;
* ``total_count_plot.png``, ``hour_day_month_plot.png`` and
  ``top_user_plot.png``.

A cross-site summary is written to
``<parent_directory>/Images/total_counts.xlsx``.
"""

import csv
import difflib
import os
from collections import Counter
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd

# matplotlib is imported inside the plotting helpers so that the parsing and
# counting helpers can be imported without a plotting stack installed.

DAY_NAMES = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday',
             'Saturday', 'Sunday']
MONTH_NAMES = ['January', 'February', 'March', 'April', 'May', 'June', 'July',
               'August', 'September', 'October', 'November', 'December']

# Layout of the timestamp rebuilt from the file-name fields.
TIMESTAMP_FORMAT = "%a.%b.%d.%H.%M.%S.%Y"

# Column order of the per-site "Summary" sheet.
PHOTO_COLUMNS = ['Site', 'Total count', 'Datetime', 'Year', 'Month', 'Date',
                 'Hour', 'Minute', 'Second', 'Day', 'Month name', 'User']

# Minimum difflib similarity ratio for two user names to be merged.
SIMILARITY_THRESHOLD = 0.9


def parse_photo_filename(filename):
    """Split a processed-photo file name into a record and a user name.

    The name is split on both ``_`` and ``.`` and the fields are read by
    position: 1 = weekday abbreviation, 2 = month abbreviation,
    3 = day of month, 4/5/6 = hour/minute/second, 8 = year, 9 = site,
    11 = user.  Illustrative name (assumed CoastSnap convention):
    ``1571889360.Thu.Oct.24_14_56_00.AEST.2019.manly.snap.Username.jpg``.

    Args:
        filename: Base name of the photo file.

    Returns:
        ``(record, username)``.  ``record`` maps every ``PHOTO_COLUMNS`` key
        except ``'Total count'`` to its value; ``username`` is the
        second-to-last field and is the key used for per-user counts.

    Raises:
        IndexError: The name has fewer fields than expected.
        ValueError: The timestamp fields do not match ``TIMESTAMP_FORMAT``.
    """
    parts = filename.replace("_", ".").split(".")
    stamp = ".".join(parts[i] for i in (1, 2, 3, 4, 5, 6, 8))
    taken = datetime.strptime(stamp, TIMESTAMP_FORMAT)
    record = {
        'Site': parts[9],
        'Datetime': taken,
        'Year': taken.year,
        'Month': taken.month,
        'Date': taken.day,
        'Hour': taken.hour,
        'Minute': taken.minute,
        'Second': taken.second,
        'Month name': parts[2],
        'Day': parts[1],
        'User': parts[11],
    }
    # parts[-2] and parts[11] are the same field only when the user name
    # itself contains no "_" or ".".
    return record, parts[-2]


def combine_similar_usernames(username_counts, threshold=SIMILARITY_THRESHOLD):
    """Merge user names that differ only by case or by small typos.

    Names are visited in descending order of count.  Each lower-cased name is
    compared against the representatives chosen so far and is merged into the
    first one whose ``difflib.SequenceMatcher`` ratio is at least
    ``threshold``; otherwise it becomes a new representative.

    Args:
        username_counts: Mapping of user name to submission count.
        threshold: Similarity ratio in ``[0, 1]`` required to merge.

    Returns:
        List of ``(name, count)`` tuples in the order the representatives
        were first seen; ``name`` keeps the representative's original
        capitalisation.
    """
    ranked = sorted(username_counts.items(), key=lambda item: item[1],
                    reverse=True)
    merged = {}  # lower-cased representative -> [display name, summed count]
    for name, count in ranked:
        folded = name.lower()
        match = next(
            (key for key in merged
             if difflib.SequenceMatcher(None, folded, key).ratio() >= threshold),
            None,
        )
        if match is None:
            merged[folded] = [name, count]
        else:
            merged[match][1] += count
            print(f"Combined entry: {name} -> {merged[match][0]}")
    return [(display, count) for display, count in merged.values()]


def scan_site_photos(photo_directory):
    """Collect one record per processed photo below ``photo_directory``.

    Only sub-directories named as a four-digit year are visited and only
    ``.jpg`` files are considered.  The first photo found is not counted as a
    submission.  Files whose name cannot be parsed are reported and skipped.

    Args:
        photo_directory: Path of the site's ``Processed`` directory.

    Returns:
        ``(rows, username_counts)``: ``rows`` is a list of record dicts with
        a running ``'Total count'`` starting at 1; ``username_counts`` is a
        ``Counter`` of submissions per user name.
    """
    rows = []
    username_counts = Counter()
    first_skipped = False

    # os.listdir() order is arbitrary; sorting makes "first photo" and the
    # running count deterministic.  Presumably names start with a sortable
    # timestamp so this is also chronological -- TODO confirm.
    for year_directory in sorted(os.listdir(photo_directory)):
        if not (year_directory.isdigit() and len(year_directory) == 4):
            continue
        year_path = os.path.join(photo_directory, year_directory)

        for filename in sorted(os.listdir(year_path)):
            if not filename.endswith(".jpg"):
                continue
            if not first_skipped:
                first_skipped = True
                continue
            try:
                record, username = parse_photo_filename(filename)
            except (IndexError, ValueError) as err:
                print(f"Skipping unparseable file name {filename!r}: {err}")
                continue
            record['Total count'] = len(rows) + 1
            rows.append(record)
            username_counts[username] += 1

    return rows, username_counts


def ordered_counts(counts, label, order):
    """Return ``counts`` as a one-column frame indexed by ``order``.

    Categories in ``order`` that are missing from ``counts`` get NaN.
    """
    frame = pd.DataFrame(list(counts.items()), columns=[label, 'Count'])
    return frame.groupby([label]).sum().reindex(order)


def sorted_counts(counts, label):
    """Return ``counts`` as a ``[label, 'Count']`` frame sorted by ``label``."""
    frame = pd.DataFrame(list(counts.items()), columns=[label, 'Count'])
    return frame.sort_values(by=[label])


def write_site_workbook(path, photos, hours, days, months, years, users):
    """Write the per-site statistics sheets to the Excel file at ``path``."""
    # The context manager saves and closes the workbook even if a sheet fails.
    with pd.ExcelWriter(path) as writer:
        photos.to_excel(writer, sheet_name='Summary', index=False)
        hours.to_excel(writer, sheet_name='Hours', index=False)
        days.to_excel(writer, sheet_name='Days', index=True)
        months.to_excel(writer, sheet_name='Months', index=True)
        years.to_excel(writer, sheet_name='Years', index=False)
        users.to_excel(writer, sheet_name='Users', index=False)


def plot_cumulative_submissions(photos, total, time_range, out_path):
    """Save the cumulative-count plot with its summary table.

    Args:
        photos: Frame with ``'Datetime'`` and ``'Total count'`` columns.
        total: Number of counted photos.
        time_range: ``pd.Timedelta`` between first and last photo; must span
            at least one whole day (it is the divisor of the weekly rate).
        out_path: Destination PNG path.
    """
    import matplotlib.dates as mdates
    import matplotlib.pyplot as plt

    fig, ax = plt.subplots(figsize=(7, 4))
    fig.subplots_adjust(right=0.8)  # leave room for the table on the right
    ax.plot(photos['Datetime'], photos['Total count'])

    # Fewer labelled months the longer the record is.
    if time_range < pd.Timedelta(days=365):        # less than 1 year
        major_months = [1, 3, 5, 7, 9, 11]
    elif time_range < pd.Timedelta(days=365 * 2):  # between 1 and 2 years
        major_months = [1, 4, 7, 10]
    else:                                          # 2 years or longer
        major_months = [1, 7]
    ax.xaxis.set_minor_locator(mdates.MonthLocator())
    ax.xaxis.set_major_locator(mdates.MonthLocator(bymonth=major_months))
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%b\n%Y'))

    ax.set_xlabel('Month')
    ax.set_ylabel('Cumulative count')
    ax.set_title('Cumulative submissions since installation')

    table_data = [
        ["Total number\nof images", total],
        ["Submissions\nper week", round(total * 7 / time_range.days, 1)],
    ]
    table = ax.table(cellText=table_data, colWidths=[0.2, 0.1],
                     cellLoc='left', loc='right', bbox=[1.05, 0.35, 0.4, 0.4])
    table.auto_set_font_size(False)
    table.set_fontsize(10)
    table.scale(1, 3)
    # Only the first row's value cell is centred; this sets a private
    # matplotlib attribute.  NOTE(review): confirm the second row should
    # stay left-aligned.
    table.get_celld()[0, 1]._loc = 'center'

    fig.subplots_adjust(bottom=0.2)
    fig.savefig(out_path, bbox_inches='tight')
    plt.close(fig)


def plot_hour_day_month(hours, days, months, out_path):
    """Save the three side-by-side bar charts (hour, weekday, month)."""
    import matplotlib.pyplot as plt

    fig, (hour_ax, day_ax, month_ax) = plt.subplots(1, 3, figsize=(12, 4))

    hour_ax.bar(hours["Hour"], hours["Count"])
    hour_ax.set_xlabel("Time of Day")
    hour_ax.set_ylabel("Counts")
    hour_ax.set_title("Submissions by\ntime of the day")
    hour_ax.set_xticks(np.arange(0, 24, 3))              # major: every 3 h
    hour_ax.set_xticks(np.arange(0, 24, 1), minor=True)  # minor: every hour
    hour_ax.set_xticklabels([f"{h:02d}" for h in np.arange(0, 24, 3)])

    day_ax.bar(days.index, days["Count"])
    day_ax.set_xlabel("Days of the Week")
    day_ax.set_ylabel("Counts")
    day_ax.set_title("Submissions by\nday of the week")
    day_ax.set_xticks(np.arange(7))
    day_ax.set_xticklabels(["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"])

    month_ax.bar(months.index, months["Count"])
    month_ax.set_xlabel("Months of the Year")
    month_ax.set_ylabel("Counts")
    month_ax.set_title("Submissions by\nmonth of the year")
    month_ax.set_xticks(np.arange(12))
    month_ax.set_xticklabels(
        ["J", "F", "M", "A", "M", "J", "J", "A", "S", "O", "N", "D"])

    fig.tight_layout()
    fig.savefig(out_path)
    plt.close(fig)


def plot_top_users(top_users, out_path):
    """Save a horizontal bar chart of the top users.

    Args:
        top_users: Frame with ``'User'`` and ``'Count'`` columns, sorted by
            ascending count so the largest bar is drawn at the top.
        out_path: Destination PNG path.
    """
    import matplotlib.pyplot as plt
    import matplotlib.ticker as ticker

    users = top_users['User']
    counts = top_users['Count']

    fig, ax = plt.subplots(figsize=(7, 2))
    colors = ['#AD8A56', '#D7D7D7', '#AF9500']
    ax.barh(users, counts, color=colors)

    # Write each user name inside its bar, just left of the bar's end.
    label_offset = 0.02 * max(counts)
    for position, (user, count) in enumerate(zip(users, counts)):
        ax.text(count - label_offset, position, str(user),
                ha='right', va='center')

    ax.set_title('Top Users')
    ax.set_xlabel('Count')
    ax.yaxis.set_major_locator(ticker.NullLocator())  # no y-axis ticks

    fig.subplots_adjust(bottom=0.3)  # keep the x-axis label from being cut
    fig.savefig(out_path)
    plt.close(fig)


def process_site(site_name, root_id, parent_directory):
    """Write the statistics workbook and plots for one site.

    Args:
        site_name: Site folder name below ``<parent_directory>/Images``.
        root_id: Value copied into the overall summary row.
        parent_directory: Directory that contains the ``Images`` folder.

    Returns:
        Dict with ``'Site'``, ``'Root_ID'`` and ``'Total count'`` for the
        overall summary, or ``None`` when the site has no counted photos.
    """
    print(site_name)
    site_directory = os.path.join(parent_directory, "Images", site_name)
    photo_directory = os.path.join(site_directory, "Processed")
    if not os.path.isdir(photo_directory):
        print(f"No photo directory {photo_directory}; skipping site")
        return None

    rows, username_counts = scan_site_photos(photo_directory)
    total = len(rows)
    if total < 1:
        return None

    photos = pd.DataFrame(rows, columns=PHOTO_COLUMNS)
    hour_counts = Counter(row['Hour'] for row in rows)
    day_counts = Counter(row['Datetime'].strftime("%A") for row in rows)
    month_counts = Counter(row['Datetime'].strftime("%B") for row in rows)
    year_counts = Counter(row['Year'] for row in rows)

    # NOTE(review): the top three are taken from the raw counts, i.e. before
    # similar user names are merged for the "Users" sheet -- confirm intended.
    top_users = (
        pd.DataFrame(username_counts.most_common(3), columns=['User', 'Count'])
        .sort_values(by=['Count'], ascending=True)
        .reset_index(drop=True)
    )

    hours = sorted_counts(hour_counts, 'Hour')
    days = ordered_counts(day_counts, 'Day', DAY_NAMES)
    months = ordered_counts(month_counts, 'Month', MONTH_NAMES)
    years = sorted_counts(year_counts, 'Year')
    users = pd.DataFrame(combine_similar_usernames(username_counts),
                         columns=['User', 'Count'])

    statistics_path = os.path.join(site_directory, "Statistics")
    os.makedirs(statistics_path, exist_ok=True)
    write_site_workbook(os.path.join(statistics_path, "statistics.xlsx"),
                        photos, hours, days, months, years, users)

    summary = {'Site': site_name, 'Root_ID': root_id, 'Total count': total}

    # The weekly rate divides by whole days, so a record spanning less than
    # one day cannot be plotted; all plots are skipped for such a site.
    time_range = photos['Datetime'].max() - photos['Datetime'].min()
    if time_range < pd.Timedelta(days=1):
        print(f"{site_name}: photos span less than one day; plots skipped")
        return summary

    plot_cumulative_submissions(
        photos, total, time_range,
        os.path.join(statistics_path, "total_count_plot.png"))
    plot_hour_day_month(
        hours, days, months,
        os.path.join(statistics_path, "hour_day_month_plot.png"))
    plot_top_users(
        top_users, os.path.join(statistics_path, "top_user_plot.png"))
    return summary


def main():
    """Generate statistics for every site listed in ``coastsnap_sites.csv``."""
    # The CSV is expected in the parent of the current working directory.
    code_dir = str(Path(os.getcwd()).parent)
    csv_path = os.path.join(code_dir, "coastsnap_sites.csv")
    sites = pd.read_csv(csv_path)
    if sites.empty:
        print(f"No sites listed in {csv_path}")
        return

    # NOTE(review): the first row's parent directory is used for all sites.
    parent_directory = sites.parent_directory[0]

    summaries = []
    for _, row in sites.iterrows():
        summary = process_site(row['site_name'], row['root_id'],
                               parent_directory)
        if summary is not None:
            summaries.append(summary)

    overall = pd.DataFrame(summaries,
                           columns=['Site', 'Root_ID', 'Total count'])
    overall_path = os.path.join(parent_directory, "Images",
                                "total_counts.xlsx")
    with pd.ExcelWriter(overall_path) as writer:
        overall.to_excel(writer, index=False)


if __name__ == "__main__":
    main()