Added by Fred

3 years ago · 4406a74b3b
parent 1e1567ab14
commit 4406a74b3b
2 changed files with 356 additions and 0 deletions
--- a/coastsnap/user_statistics.py
+++ b/coastsnap/user_statistics.py
@ -0,0 +1,356 @@
+import os
+import csv
+import pandas as pd
+import numpy as np
+from datetime import datetime
+from pathlib import Path
+import matplotlib.pyplot as plt
+import matplotlib.dates as mdates
+import matplotlib.ticker as ticker
+
+import difflib
+
+
+
+# The site list (coastsnap_sites.csv) is read from the directory one level
+# above the current working directory.
+code_dir = str(Path.cwd().parent)
+csv_path = os.path.join(code_dir, "coastsnap_sites.csv")
+coastsnap_sites_csv = pd.read_csv(csv_path)
+
+# Only the first row's parent_directory is read; it is used as the root
+# folder for every site processed below.
+parent_directories = coastsnap_sites_csv['parent_directory'][0]
+
+# Per-site columns of the site list
+site_names = coastsnap_sites_csv['site_name']
+root_id = coastsnap_sites_csv['root_id']
+
+# Accumulates one row (site, root id, photo total) per processed site
+dfoverall = pd.DataFrame(columns=['Site', 'Root_ID', 'Total count'])
+
+
+def _merge_similar_usernames(username_counts, threshold=0.9):
+    """Merge the photo counts of usernames that are nearly identical.
+
+    Names are lower-cased and compared with difflib.SequenceMatcher, visiting
+    them from the highest to the lowest count. A name whose similarity ratio
+    to an already-seen name is at least ``threshold`` is added to that name's
+    group; the group keeps the spelling of the name that created it.
+
+    Args:
+        username_counts: dict mapping username -> number of photos.
+        threshold: minimum SequenceMatcher ratio for two names to be merged.
+
+    Returns:
+        DataFrame with columns 'User' and 'Count', one row per merged group,
+        in the order the groups were created.
+    """
+    ranked = pd.DataFrame(list(username_counts.items()), columns=['User', 'Count']).sort_values(by=['Count'], ascending=False)
+
+    merged = {}  # lower-cased name -> [display name, summed count]
+    for original_name, count in zip(ranked['User'], ranked['Count']):
+        key = original_name.lower()
+        match = next((known for known in merged if difflib.SequenceMatcher(None, key, known).ratio() >= threshold), None)
+
+        # Compare against None explicitly: an empty-string key is a valid
+        # match but is falsy.
+        if match is not None:
+            merged[match][1] += count
+            print(f"Combined entry: {original_name} -> {merged[match][0]}")
+        else:
+            merged[key] = [original_name, count]
+
+    return pd.DataFrame(list(merged.values()), columns=['User', 'Count'])
+
+
+# For every site in the site list: collect per-photo data from the filenames
+# in <parent>/Images/<site>/Processed/<year>/, write an Excel workbook of
+# statistics and save three summary plots in <parent>/Images/<site>/Statistics.
+for _, row in coastsnap_sites_csv.iterrows():
+    site_name = row['site_name']
+    root_id = row['root_id']
+
+    print(site_name)
+
+    # Photo counts keyed by weekday name, month name, hour, year and username
+    day_of_week_counts = {}
+    month_counts = {}
+    hour_counts = {}
+    year_counts = {}
+    username_counts = {}
+
+    # Construct the full path to the photo directory
+    photo_directory = os.path.join(parent_directories, "Images", site_name, "Processed")
+
+    # A site without a Processed folder should not abort the whole run
+    if not os.path.isdir(photo_directory):
+        print(f"No Processed directory found for {site_name}, skipping")
+        continue
+
+    summary_columns = ['Site','Total count','Datetime','Year','Month','Date','Hour','Minute','Second','Day','Month name','User']
+    photo_rows = []  # one dict per photo; turned into a DataFrame once, after the scan
+    total = -1
+
+    # os.listdir returns entries in arbitrary order; sorting makes the skipped
+    # first photo and the running count deterministic.
+    # NOTE(review): presumably the filenames start with a sortable timestamp,
+    # so sorted order is chronological - confirm.
+    for year_directory in sorted(os.listdir(photo_directory)):
+
+        # Only folders named as a 4-digit year are scanned
+        if not year_directory.isdigit() or len(year_directory) != 4:
+            continue
+
+        year_path = os.path.join(photo_directory, year_directory)
+
+        for filename in sorted(os.listdir(year_path)):
+            if not filename.endswith(".jpg"):
+                continue
+
+            # The first .jpg encountered becomes photo 0 and is left out of
+            # all statistics; counting starts at 1 with the second photo.
+            total += 1
+            if total == 0:
+                continue
+
+            # Split the filename on both "_" and "."
+            file_parts = filename.replace("_", ".").split(".")
+            # NOTE(review): the per-user counts take the second-to-last part,
+            # while the Summary sheet below takes part 11; these differ when
+            # the filename has more or fewer parts than expected - confirm
+            # which one is intended.
+            username = file_parts[-2]
+
+            # Parts 1-6 hold weekday, month, day, hour, minute, second and
+            # part 8 the year (part 7 is not used).
+            timestamp = ".".join(file_parts[1:7] + [file_parts[8]])
+            timestamp_datetime = datetime.strptime(timestamp, "%a.%b.%d.%H.%M.%S.%Y")
+
+            # Extract relevant information from the timestamp
+            day_of_week = timestamp_datetime.strftime("%A")
+            month = timestamp_datetime.strftime("%B")
+            hour = timestamp_datetime.hour
+            year = timestamp_datetime.year
+
+            # Update photo counts
+            day_of_week_counts[day_of_week] = day_of_week_counts.get(day_of_week, 0) + 1
+            month_counts[month] = month_counts.get(month, 0) + 1
+            hour_counts[hour] = hour_counts.get(hour, 0) + 1
+            year_counts[year] = year_counts.get(year, 0) + 1
+            username_counts[username] = username_counts.get(username, 0) + 1
+
+            photo_rows.append({
+                'Site': file_parts[9],
+                'Datetime': timestamp_datetime,
+                'Year': int(file_parts[8]),
+                'Month': timestamp_datetime.month,
+                'Date': int(file_parts[3]),
+                'Hour': int(file_parts[4]),
+                'Minute': int(file_parts[5]),
+                'Second': int(file_parts[6]),
+                'Month name': file_parts[2],
+                'Day': file_parts[1],
+                'User': file_parts[11],
+                'Total count': total,
+            })
+
+    # Nothing to report when the site has at most one photo
+    if total < 1:
+        continue
+
+    # Build the summary table in one go instead of concatenating per photo
+    df = pd.DataFrame(photo_rows, columns=summary_columns)
+
+    catday = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
+    catmonth = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
+
+    # Counts by hour of the day
+    dfhours = pd.DataFrame(list(hour_counts.items()), columns=['Hour', 'Count']).sort_values(by=['Hour'])
+
+    # Counts by day of the week, in calendar order (days without photos are NaN)
+    dfdays = pd.DataFrame(list(day_of_week_counts.items()), columns=['Day', 'Count'])
+    dfdays = dfdays.groupby(['Day']).sum().reindex(catday)
+
+    # Counts by month of the year, in calendar order (months without photos are NaN)
+    dfmonths = pd.DataFrame(list(month_counts.items()), columns=['Month', 'Count'])
+    dfmonths = dfmonths.groupby(['Month']).sum().reindex(catmonth)
+
+    # Counts by year
+    dfyears = pd.DataFrame(list(year_counts.items()), columns=['Year', 'Count']).sort_values(by=['Year'])
+
+    # Counts by user, with near-identical usernames merged
+    dfuser = _merge_similar_usernames(username_counts)
+
+    # The top three users are taken from the merged counts so that spelling
+    # variants of one username cannot occupy several places. Ascending order
+    # puts the top user on the uppermost bar of the horizontal bar plot.
+    dftopuser = (
+        dfuser.sort_values(by=['Count'], ascending=False, kind='stable')
+        .head(3)
+        .sort_values(by=['Count'], ascending=True, kind='stable')
+        .reset_index(drop=True)
+    )
+
+    ###
+    # Save statistics to an excel spreadsheet
+    ###
+
+    # Create the Statistics folder in the Images directory if it doesn't exist
+    statistics_path = os.path.join(parent_directories, "Images", site_name, "Statistics")
+    os.makedirs(statistics_path, exist_ok=True)
+
+    # os.path.join replaces the hard-coded '\statistics.xlsx' suffix (an
+    # invalid escape sequence and a Windows-only separator); the context
+    # manager closes the workbook even if writing a sheet fails.
+    with pd.ExcelWriter(os.path.join(statistics_path, 'statistics.xlsx')) as writer:
+        df.to_excel(writer, sheet_name='Summary', index=False)
+        dfhours.to_excel(writer, sheet_name='Hours', index=False)
+        dfdays.to_excel(writer, sheet_name='Days', index=True)
+        dfmonths.to_excel(writer, sheet_name='Months', index=True)
+        dfyears.to_excel(writer, sheet_name='Years', index=False)
+        dfuser.to_excel(writer, sheet_name='Users', index=False)
+
+    # Add this site's total to the overall statistics table
+    new_row_ov = pd.DataFrame([{
+        'Site': site_name,
+        'Root_ID': root_id,
+        'Total count': total,
+    }])
+    dfoverall = pd.concat([dfoverall, new_row_ov], ignore_index=True)
+
+    ###
+    # Plot 1: cumulative submissions
+    ###
+
+    time_range = df['Datetime'].max() - df['Datetime'].min()
+
+    # With no time span there is nothing to plot; as before, no plots are
+    # produced for the site. The check runs before any figure is created so
+    # that no open figure is left behind.
+    if time_range == pd.Timedelta(days=0):
+        continue
+
+    # Monthly minor ticks; the major ticks get sparser as the record gets longer
+    if time_range < pd.Timedelta(days=365):        # Less than 1 year
+        major_months = [1, 3, 5, 7, 9, 11]
+    elif time_range < pd.Timedelta(days=365 * 2):  # Between 1 and 2 years
+        major_months = [1, 4, 7, 10]
+    else:                                          # 2 years or longer
+        major_months = [1, 7]
+
+    fig, ax = plt.subplots(figsize=(7, 4))
+    fig.subplots_adjust(right=0.8)
+    ax.plot(df['Datetime'], df['Total count'])
+
+    # Set the x-axis tick locators and formatters
+    ax.xaxis.set_minor_locator(mdates.MonthLocator())
+    ax.xaxis.set_major_locator(mdates.MonthLocator(bymonth=major_months))
+    ax.xaxis.set_major_formatter(mdates.DateFormatter('%b\n%Y'))
+
+    # Add labels and title to the plot
+    ax.set_xlabel('Month')
+    ax.set_ylabel('Cumulative count')
+    ax.set_title('Cumulative submissions since installation')
+
+    # Fractional weeks are used because time_range.days is 0 for a record
+    # shorter than one day, which would cause a division by zero.
+    weeks_elapsed = time_range / pd.Timedelta(weeks=1)
+
+    # Add a table to the right of the plot
+    table_data = [
+        ["Total number\nof images", total],
+        ["Submissions\nper week", round(total / weeks_elapsed, 1)],
+    ]
+    table = ax.table(cellText=table_data, colWidths=[0.2, 0.1], cellLoc='left', loc='right', bbox=[1.05, 0.35, 0.4, 0.4])
+
+    # Adjust table properties
+    table.auto_set_font_size(False)
+    table.set_fontsize(10)
+    table.scale(1, 3)
+
+    # Centre the value in the first row.
+    # NOTE(review): only row 0 is centred (the loop was range(0, 1));
+    # confirm whether the second row was meant to be centred as well.
+    cells = table.properties()["celld"]
+    cells[0, 1]._loc = 'center'
+
+    # Adjust the plot layout to accommodate the table
+    fig.subplots_adjust(bottom=0.2)
+
+    fig.savefig(os.path.join(statistics_path, "total_count_plot.png"), bbox_inches='tight')
+    plt.close(fig)
+
+    ###
+    # Plot 2: submissions by hour, weekday and month (three subplots side by side)
+    ###
+
+    fig, axs = plt.subplots(1, 3, figsize=(12, 4))
+
+    # First subplot: Hourly Counts
+    axs[0].bar(dfhours["Hour"], dfhours["Count"])
+    axs[0].set_xlabel("Time of Day")
+    axs[0].set_ylabel("Counts")
+    axs[0].set_title("Submissions by\ntime of the day")
+    axs[0].set_xticks(np.arange(0, 24, 3))  # Major ticks every 3 hours
+    axs[0].set_xticks(np.arange(0, 24, 1), minor=True)  # Minor ticks every hour
+    axs[0].set_xticklabels([f"{h:02d}" for h in np.arange(0, 24, 3)])
+
+    # Second subplot: Daily Counts
+    axs[1].bar(dfdays.index, dfdays["Count"])
+    axs[1].set_xlabel("Days of the Week")
+    axs[1].set_ylabel("Counts")
+    axs[1].set_title("Submissions by\nday of the week")
+    axs[1].set_xticks(np.arange(7))
+    axs[1].set_xticklabels(["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"])
+
+    # Third subplot: Monthly Counts
+    axs[2].bar(dfmonths.index, dfmonths["Count"])
+    axs[2].set_xlabel("Months of the Year")
+    axs[2].set_ylabel("Counts")
+    axs[2].set_title("Submissions by\nmonth of the year")
+    axs[2].set_xticks(np.arange(12))
+    axs[2].set_xticklabels(["J", "F", "M", "A", "M", "J", "J", "A", "S", "O", "N", "D"])
+
+    # Adjust the spacing between the subplots
+    fig.tight_layout()
+
+    fig.savefig(os.path.join(statistics_path, "hour_day_month_plot.png"))
+    plt.close(fig)
+
+    ###
+    # Plot 3: top users
+    ###
+
+    users = dftopuser['User']
+    counts = dftopuser['Count']
+
+    fig, ax = plt.subplots(figsize=(7, 2))
+
+    # Bronze, silver, gold - in the same ascending order as the bars. Taking
+    # the colours from the end keeps the top user gold when the site has
+    # fewer than three users.
+    medal_colors = ['#AD8A56', '#D7D7D7', '#AF9500']
+    ax.barh(users, counts, color=medal_colors[-len(users):])
+
+    # Write each username inside its bar, just left of the bar's end
+    label_offset = 0.02 * max(counts)
+    for i, (user, count) in enumerate(zip(users, counts)):
+        ax.text(count - label_offset, i, str(user), ha='right', va='center')
+
+    # Set plot title and axis labels
+    ax.set_title('Top Users')
+    ax.set_xlabel('Count')
+    # Remove y-axis ticks (the names are already written on the bars)
+    ax.yaxis.set_ticks([])
+
+    # Adjust subplot parameters to avoid x-axis label cutoff
+    fig.subplots_adjust(bottom=0.3)
+
+    fig.savefig(os.path.join(statistics_path, "top_user_plot.png"))
+    plt.close(fig)
+
+    
+# Save the overall per-site counts as a workbook in the Images directory.
+# os.path.join replaces the hard-coded backslash so the path is also valid
+# outside Windows; the context manager closes the workbook even if writing
+# fails.
+total_counts_path = os.path.join(parent_directories, "Images", "total_counts.xlsx")
+with pd.ExcelWriter(total_counts_path) as writer:
+    dfoverall.to_excel(writer, index=False)
--- a/workflow.pptx
+++ b/workflow.pptx