You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
356 lines
13 KiB
Python
356 lines
13 KiB
Python
1 year ago
|
import os
|
||
|
import csv
|
||
|
import pandas as pd
|
||
|
import numpy as np
|
||
|
from datetime import datetime
|
||
|
from pathlib import Path
|
||
|
import matplotlib.pyplot as plt
|
||
|
import matplotlib.dates as mdates
|
||
|
import matplotlib.ticker as ticker
|
||
|
|
||
|
import difflib
|
||
|
|
||
|
|
||
|
|
||
|
# Provide the path to the CSV file in the parent directory
|
||
|
|
||
|
code_dir = str(Path(os.getcwd()).parent)
|
||
|
csv_path = os.path.join(code_dir, "coastsnap_sites.csv")
|
||
|
|
||
|
coastsnap_sites_csv = pd.read_csv(csv_path)
|
||
|
parent_directories = coastsnap_sites_csv.parent_directory[0]
|
||
|
|
||
|
# Extract site names and parent directories
|
||
|
site_names = coastsnap_sites_csv['site_name']
|
||
|
root_id = coastsnap_sites_csv['root_id']
|
||
|
|
||
|
dfoverall = pd.DataFrame(columns=['Site','Root_ID','Total count'])
|
||
|
|
||
|
|
||
|
# Iterate over site names and parent directories
|
||
|
for index, row in coastsnap_sites_csv.iterrows():
|
||
|
site_name = row['site_name']
|
||
|
root_id = row['root_id']
|
||
|
|
||
|
print(site_name)
|
||
|
# Create a dictionary to store photo counts
|
||
|
day_of_week_counts = {}
|
||
|
month_counts = {}
|
||
|
hour_counts = {}
|
||
|
year_counts = {}
|
||
|
username_counts = {}
|
||
|
|
||
|
# Construct the full path to the photo directory
|
||
|
photo_directory = os.path.join(parent_directories, "Images", site_name, "Processed")
|
||
|
|
||
|
df = pd.DataFrame(columns=['Site','Total count','Datetime','Year','Month','Date','Hour','Minute','Second','Day','Month name','User'])
|
||
|
total = -1
|
||
|
|
||
|
# Iterate over the years in the photo directory
|
||
|
for year_directory in os.listdir(photo_directory):
|
||
|
|
||
|
# Check if the folder name is in the 4 number year format
|
||
|
if not year_directory.isdigit() or len(year_directory) != 4:
|
||
|
continue # Skip this folder
|
||
|
|
||
|
# Construct the full path to the year directory
|
||
|
year_path = os.path.join(photo_directory, year_directory)
|
||
|
|
||
|
|
||
|
# Iterate over the files in the year directory
|
||
|
for filename in os.listdir(year_path):
|
||
|
if filename.endswith(".jpg"):
|
||
|
|
||
|
# Check that it isn't the first photo uploaded
|
||
|
total = total + 1
|
||
|
if total == 0:
|
||
|
continue
|
||
|
|
||
|
|
||
|
# Extract information from the filename
|
||
|
filename = filename.replace("_",".")
|
||
|
file_parts = filename.split(".")
|
||
|
username = file_parts[-2]
|
||
|
timestamp = ".".join([file_parts[1],file_parts[2],file_parts[3],file_parts[4],file_parts[5],file_parts[6],file_parts[8]])
|
||
|
|
||
|
# Parse the timestamp
|
||
|
date_format = "%a.%b.%d.%H.%M.%S.%Y"
|
||
|
timestamp_datetime = datetime.strptime(timestamp, date_format)
|
||
|
|
||
|
# Extract relevant information from the timestamp
|
||
|
day_of_week = timestamp_datetime.strftime("%A")
|
||
|
month = timestamp_datetime.strftime("%B")
|
||
|
hour = timestamp_datetime.hour
|
||
|
year = timestamp_datetime.year
|
||
|
|
||
|
# Update photo counts
|
||
|
day_of_week_counts[day_of_week] = day_of_week_counts.get(day_of_week, 0) + 1
|
||
|
month_counts[month] = month_counts.get(month, 0) + 1
|
||
|
hour_counts[hour] = hour_counts.get(hour, 0) + 1
|
||
|
year_counts[year] = year_counts.get(year, 0) + 1
|
||
|
|
||
|
# Update username counts
|
||
|
username_counts[username] = username_counts.get(username, 0) + 1
|
||
|
|
||
|
# Month string to number
|
||
|
monthnum = datetime.strptime(file_parts[2], '%b').month
|
||
|
|
||
|
|
||
|
|
||
|
column_map = {
|
||
|
'Site': file_parts[9],
|
||
|
'Datetime': timestamp_datetime,
|
||
|
'Year': int(file_parts[8]),
|
||
|
'Month': monthnum,
|
||
|
'Date': int(file_parts[3]),
|
||
|
'Hour': int(file_parts[4]),
|
||
|
'Minute': int(file_parts[5]),
|
||
|
'Second': int(file_parts[6]),
|
||
|
'Month name': file_parts[2],
|
||
|
'Day': file_parts[1],
|
||
|
'User': file_parts[11],
|
||
|
'Total count': total
|
||
|
}
|
||
|
|
||
|
new_row = pd.DataFrame([column_map])
|
||
|
|
||
|
# Add the new row to the existing DataFrame
|
||
|
df = pd.concat([df, new_row], ignore_index=True)
|
||
|
|
||
|
if total < 1:
|
||
|
continue
|
||
|
|
||
|
# Determine top three usernames
|
||
|
top_usernames = sorted(username_counts.items(), key=lambda x: x[1], reverse=True)[:3]
|
||
|
dftopuser = pd.DataFrame(top_usernames, columns=['User', 'Count']).sort_values(by=['Count'], ascending=True).reset_index(drop=True)
|
||
|
|
||
|
###
|
||
|
# Save statistics to an excel spreadsheet
|
||
|
###
|
||
|
|
||
|
# Creat Statistics folder in Images directory if it doesn't exist
|
||
|
statistics_path = os.path.join(parent_directories, "Images", site_name, "Statistics")
|
||
|
|
||
|
if not os.path.exists(statistics_path):
|
||
|
os.makedirs(statistics_path)
|
||
|
|
||
|
# Create an Excel writer object to save the statistics
|
||
|
writer = pd.ExcelWriter(statistics_path+'\statistics.xlsx')
|
||
|
|
||
|
# Create a summary sheet which includes all photo data from the site
|
||
|
df.to_excel(writer, sheet_name='Summary', index=False)
|
||
|
|
||
|
catday = [ 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
|
||
|
catmonth = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
|
||
|
|
||
|
# Create a sheet summarising the hours of the day counts
|
||
|
dfhours = pd.DataFrame(list(hour_counts.items()), columns=['Hour', 'Count']).sort_values(by=['Hour'])
|
||
|
dfhours.to_excel(writer, sheet_name='Hours', index=False)
|
||
|
|
||
|
#create a sheet summarising the days of the week counts
|
||
|
dfdays = pd.DataFrame(list(day_of_week_counts.items()), columns=['Day', 'Count'])
|
||
|
dfdays = dfdays.groupby(['Day']).sum().reindex(catday)
|
||
|
dfdays.to_excel(writer, sheet_name='Days', index=True)
|
||
|
|
||
|
#Create a sheet summarising the months of the year counts
|
||
|
dfmonths = pd.DataFrame(list(month_counts.items()), columns=['Month', 'Count'])
|
||
|
dfmonths = dfmonths.groupby(['Month']).sum().reindex(catmonth)
|
||
|
dfmonths.to_excel(writer, sheet_name='Months', index=True)
|
||
|
|
||
|
# Create a sheet summarising counts by year
|
||
|
dfyears = pd.DataFrame(list(year_counts.items()), columns=['Year', 'Count']).sort_values(by=['Year'])
|
||
|
dfyears.to_excel(writer, sheet_name='Years', index=False)
|
||
|
|
||
|
# Create a sheet summarising the counts by user
|
||
|
dfuser = pd.DataFrame(list(username_counts.items()), columns=['User', 'Count']).sort_values(by=['Count'], ascending=False)
|
||
|
|
||
|
### Combines similar username entries to avoid repeated top users
|
||
|
# Initialize a dictionary to store combined entries and their corresponding values
|
||
|
combined_entries = {}
|
||
|
|
||
|
# Create a copy of the column with the original capitalization
|
||
|
dfuser['User' + '_original'] = dfuser['User']
|
||
|
|
||
|
# Convert the column to lowercase for case-insensitive comparison
|
||
|
dfuser['User'] = dfuser['User'].str.lower()
|
||
|
|
||
|
# Iterate over each entry in the column
|
||
|
for entry, original_entry, value in zip(dfuser['User'], dfuser['User' + '_original'], dfuser["Count"]):
|
||
|
# Check if a similar entry already exists in the combined_entries dictionary
|
||
|
similar_entry = next((key for key in combined_entries if difflib.SequenceMatcher(None, entry, key).ratio() >= 0.9), None)
|
||
|
|
||
|
if similar_entry:
|
||
|
# If a similar entry exists, add the value to the existing entry
|
||
|
combined_entries[similar_entry][1] += value
|
||
|
print(f"Combined entry: {original_entry} -> {combined_entries[similar_entry][0]}")
|
||
|
else:
|
||
|
# If no similar entry exists, create a new entry in the dictionary
|
||
|
combined_entries[entry] = [original_entry, value]
|
||
|
|
||
|
# Restore the original capitalization in the combined_entries dictionary
|
||
|
for key, value in combined_entries.items():
|
||
|
value[0] = dfuser.loc[dfuser['User'] == key, 'User' + '_original'].values[0]
|
||
|
|
||
|
# Create a new dataframe with the combined entries and their summed values
|
||
|
dfuser = pd.DataFrame(list(combined_entries.values()), columns=['User', "Count"])
|
||
|
|
||
|
dfuser.to_excel(writer, sheet_name='Users', index=False)
|
||
|
|
||
|
writer.close()
|
||
|
###
|
||
|
|
||
|
|
||
|
# Create overall statistics sheet
|
||
|
|
||
|
column_map_ovr = {
|
||
|
'Site': site_name,
|
||
|
'Root_ID': root_id,
|
||
|
'Total count': total}
|
||
|
|
||
|
new_row_ov = pd.DataFrame([column_map_ovr])
|
||
|
|
||
|
# Add the new row to the existing DataFrame
|
||
|
dfoverall = pd.concat([dfoverall, new_row_ov], ignore_index=True)
|
||
|
|
||
|
|
||
|
|
||
|
# Create the plot for total counts
|
||
|
fig, ax = plt.subplots(figsize=(7,4))
|
||
|
fig.subplots_adjust(right=0.8)
|
||
|
ax.plot(df['Datetime'], df['Total count'])
|
||
|
|
||
|
|
||
|
time_range = df['Datetime'].max() - df['Datetime'].min()
|
||
|
|
||
|
if time_range == pd.Timedelta(days=0):
|
||
|
continue
|
||
|
if time_range < pd.Timedelta(days=365): # Less than 1 years
|
||
|
minor_locator = mdates.MonthLocator()
|
||
|
major_locator = mdates.MonthLocator(bymonth=[1, 3, 5, 7, 9, 11])
|
||
|
major_formatter = mdates.DateFormatter('%b\n%Y')
|
||
|
if time_range < pd.Timedelta(days=365*2) and time_range >= pd.Timedelta(days=365): # Between 1 and 2 years
|
||
|
minor_locator = mdates.MonthLocator()
|
||
|
minor_formatter = mdates.DateFormatter('%b')
|
||
|
major_locator = mdates.MonthLocator(bymonth=[1, 4, 7, 10])
|
||
|
major_formatter = mdates.DateFormatter('%b\n%Y')
|
||
|
if time_range >= pd.Timedelta(days=365*2): # Longer than 2 years
|
||
|
minor_locator = mdates.MonthLocator()
|
||
|
major_locator = mdates.MonthLocator(bymonth=[1, 7])
|
||
|
major_formatter = mdates.DateFormatter('%b\n%Y')
|
||
|
|
||
|
# Set the x-axis tick locators and formatters
|
||
|
ax.xaxis.set_minor_locator(minor_locator)
|
||
|
ax.xaxis.set_major_locator(major_locator)
|
||
|
ax.xaxis.set_major_formatter(major_formatter)
|
||
|
|
||
|
# Add labels and title to the plot
|
||
|
plt.xlabel('Month')
|
||
|
plt.ylabel('Cumulative count')
|
||
|
plt.title('Cumulative submissions since installation')
|
||
|
|
||
|
# Rotate x-axis labels if needed
|
||
|
#plt.xticks(rotation=45)
|
||
|
|
||
|
|
||
|
# Add a table to the plot
|
||
|
table_data = [
|
||
|
["Total number\nof images", total],
|
||
|
["Submissions\nper week", round(total * 7 / time_range.days,1)]]
|
||
|
table = ax.table(cellText=table_data, colWidths = [0.2,0.1],cellLoc='left',loc='right', bbox=[1.05, 0.35, 0.4, 0.4])
|
||
|
|
||
|
|
||
|
# Adjust table properties
|
||
|
table.auto_set_font_size(False)
|
||
|
table.set_fontsize(10)
|
||
|
table.scale(1, 3)
|
||
|
|
||
|
|
||
|
cells = table.properties()["celld"]
|
||
|
for i in range(0,1):
|
||
|
cells[i, 1]._loc = 'center'
|
||
|
|
||
|
|
||
|
# Adjust the plot layout to accommodate the table
|
||
|
plt.subplots_adjust(bottom=0.2)
|
||
|
|
||
|
# Save the plot to a temporary file
|
||
|
plt.savefig(statistics_path + "/total_count_plot.png", bbox_inches = 'tight')#, pad_inches = 0.5)
|
||
|
plt.close()
|
||
|
|
||
|
|
||
|
# Create 3 subplots for hours, days and months submissions
|
||
|
|
||
|
# Set up the figure and axes
|
||
|
fig, axs = plt.subplots(1, 3, figsize=(12, 4)) # Three subplots side by side
|
||
|
|
||
|
# First subplot: Hourly Counts
|
||
|
axs[0].bar(dfhours["Hour"], dfhours["Count"])
|
||
|
axs[0].set_xlabel("Time of Day")
|
||
|
axs[0].set_ylabel("Counts")
|
||
|
axs[0].set_title("Submissions by\ntime of the day")
|
||
|
axs[0].set_xticks(np.arange(0, 24, 3)) # Major ticks every hour
|
||
|
axs[0].set_xticks(np.arange(0, 24, 1), minor=True) # Minor ticks every 3 hours
|
||
|
axs[0].set_xticklabels([f"{h:02d}" for h in np.arange(0, 24, 3)])
|
||
|
|
||
|
# Second subplot: Daily Counts
|
||
|
axs[1].bar(dfdays.index, dfdays["Count"])
|
||
|
axs[1].set_xlabel("Days of the Week")
|
||
|
axs[1].set_ylabel("Counts")
|
||
|
axs[1].set_title("Submissions by\nday of the week")
|
||
|
axs[1].set_xticks(np.arange(7))
|
||
|
axs[1].set_xticklabels(["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"])
|
||
|
|
||
|
# Third subplot: Monthly Counts
|
||
|
axs[2].bar(dfmonths.index, dfmonths["Count"])
|
||
|
axs[2].set_xlabel("Months of the Year")
|
||
|
axs[2].set_ylabel("Counts")
|
||
|
axs[2].set_title("Submissions by\nmonth of the year")
|
||
|
axs[2].set_xticks(np.arange(12))
|
||
|
axs[2].set_xticklabels(["J", "F", "M", "A", "M", "J", "J", "A", "S", "O", "N", "D"])
|
||
|
|
||
|
# Adjust the spacing between the subplots
|
||
|
plt.tight_layout()
|
||
|
|
||
|
# Save the plot
|
||
|
plt.savefig(statistics_path + "/hour_day_month_plot.png")
|
||
|
plt.close()
|
||
|
|
||
|
|
||
|
# Create plot showing top users
|
||
|
|
||
|
# Extract data from the DataFrame
|
||
|
users = dftopuser['User']
|
||
|
counts = dftopuser['Count']
|
||
|
|
||
|
# Set the size of the plot
|
||
|
fig, ax = plt.subplots(figsize=(7, 2))
|
||
|
|
||
|
# Create the bar plot
|
||
|
colors = ['#AD8A56', '#D7D7D7', '#AF9500'] # Define a list of pale pastel colors
|
||
|
ax.barh(users, counts, color=colors)
|
||
|
|
||
|
# Add labels to the bars
|
||
|
|
||
|
label_offset = 0.02 * max(counts) # Offset for the text labels from the end of the bars
|
||
|
for i, user in enumerate(users):
|
||
|
ax.text(counts[i] - label_offset, i, str(user), ha='right', va='center')
|
||
|
|
||
|
# Set plot title and axis labels
|
||
|
ax.set_title('Top Users')
|
||
|
ax.set_xlabel('Count')
|
||
|
# Remove y-axis ticks
|
||
|
ax.yaxis.set_ticks([])
|
||
|
|
||
|
# Adjust subplot parameters to avoid x-axis label cutoff
|
||
|
plt.subplots_adjust(bottom=0.3)
|
||
|
|
||
|
# Save the plot
|
||
|
plt.savefig(statistics_path + "/top_user_plot.png")
|
||
|
plt.close()
|
||
|
|
||
|
|
||
|
# Save overall counts in the Images directory
|
||
|
writer = pd.ExcelWriter(os.path.join(parent_directories, "Images")+'\\total_counts.xlsx')
|
||
|
|
||
|
dfoverall.to_excel(writer, index=False)
|
||
|
writer.close()
|