You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

356 lines
13 KiB
Python

1 year ago
import os
import csv
import pandas as pd
import numpy as np
from datetime import datetime
from pathlib import Path
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.ticker as ticker
import difflib
# Provide the path to the CSV file in the parent directory
code_dir = str(Path(os.getcwd()).parent)
csv_path = os.path.join(code_dir, "coastsnap_sites.csv")
coastsnap_sites_csv = pd.read_csv(csv_path)
parent_directories = coastsnap_sites_csv.parent_directory[0]
# Extract site names and parent directories
site_names = coastsnap_sites_csv['site_name']
root_id = coastsnap_sites_csv['root_id']
dfoverall = pd.DataFrame(columns=['Site','Root_ID','Total count'])
# Iterate over site names and parent directories
for index, row in coastsnap_sites_csv.iterrows():
site_name = row['site_name']
root_id = row['root_id']
print(site_name)
# Create a dictionary to store photo counts
day_of_week_counts = {}
month_counts = {}
hour_counts = {}
year_counts = {}
username_counts = {}
# Construct the full path to the photo directory
photo_directory = os.path.join(parent_directories, "Images", site_name, "Processed")
df = pd.DataFrame(columns=['Site','Total count','Datetime','Year','Month','Date','Hour','Minute','Second','Day','Month name','User'])
total = -1
# Iterate over the years in the photo directory
for year_directory in os.listdir(photo_directory):
# Check if the folder name is in the 4 number year format
if not year_directory.isdigit() or len(year_directory) != 4:
continue # Skip this folder
# Construct the full path to the year directory
year_path = os.path.join(photo_directory, year_directory)
# Iterate over the files in the year directory
for filename in os.listdir(year_path):
if filename.endswith(".jpg"):
# Check that it isn't the first photo uploaded
total = total + 1
if total == 0:
continue
# Extract information from the filename
filename = filename.replace("_",".")
file_parts = filename.split(".")
username = file_parts[-2]
timestamp = ".".join([file_parts[1],file_parts[2],file_parts[3],file_parts[4],file_parts[5],file_parts[6],file_parts[8]])
# Parse the timestamp
date_format = "%a.%b.%d.%H.%M.%S.%Y"
timestamp_datetime = datetime.strptime(timestamp, date_format)
# Extract relevant information from the timestamp
day_of_week = timestamp_datetime.strftime("%A")
month = timestamp_datetime.strftime("%B")
hour = timestamp_datetime.hour
year = timestamp_datetime.year
# Update photo counts
day_of_week_counts[day_of_week] = day_of_week_counts.get(day_of_week, 0) + 1
month_counts[month] = month_counts.get(month, 0) + 1
hour_counts[hour] = hour_counts.get(hour, 0) + 1
year_counts[year] = year_counts.get(year, 0) + 1
# Update username counts
username_counts[username] = username_counts.get(username, 0) + 1
# Month string to number
monthnum = datetime.strptime(file_parts[2], '%b').month
column_map = {
'Site': file_parts[9],
'Datetime': timestamp_datetime,
'Year': int(file_parts[8]),
'Month': monthnum,
'Date': int(file_parts[3]),
'Hour': int(file_parts[4]),
'Minute': int(file_parts[5]),
'Second': int(file_parts[6]),
'Month name': file_parts[2],
'Day': file_parts[1],
'User': file_parts[11],
'Total count': total
}
new_row = pd.DataFrame([column_map])
# Add the new row to the existing DataFrame
df = pd.concat([df, new_row], ignore_index=True)
if total < 1:
continue
# Determine top three usernames
top_usernames = sorted(username_counts.items(), key=lambda x: x[1], reverse=True)[:3]
dftopuser = pd.DataFrame(top_usernames, columns=['User', 'Count']).sort_values(by=['Count'], ascending=True).reset_index(drop=True)
###
# Save statistics to an excel spreadsheet
###
# Creat Statistics folder in Images directory if it doesn't exist
statistics_path = os.path.join(parent_directories, "Images", site_name, "Statistics")
if not os.path.exists(statistics_path):
os.makedirs(statistics_path)
# Create an Excel writer object to save the statistics
writer = pd.ExcelWriter(statistics_path+'\statistics.xlsx')
# Create a summary sheet which includes all photo data from the site
df.to_excel(writer, sheet_name='Summary', index=False)
catday = [ 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
catmonth = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
# Create a sheet summarising the hours of the day counts
dfhours = pd.DataFrame(list(hour_counts.items()), columns=['Hour', 'Count']).sort_values(by=['Hour'])
dfhours.to_excel(writer, sheet_name='Hours', index=False)
#create a sheet summarising the days of the week counts
dfdays = pd.DataFrame(list(day_of_week_counts.items()), columns=['Day', 'Count'])
dfdays = dfdays.groupby(['Day']).sum().reindex(catday)
dfdays.to_excel(writer, sheet_name='Days', index=True)
#Create a sheet summarising the months of the year counts
dfmonths = pd.DataFrame(list(month_counts.items()), columns=['Month', 'Count'])
dfmonths = dfmonths.groupby(['Month']).sum().reindex(catmonth)
dfmonths.to_excel(writer, sheet_name='Months', index=True)
# Create a sheet summarising counts by year
dfyears = pd.DataFrame(list(year_counts.items()), columns=['Year', 'Count']).sort_values(by=['Year'])
dfyears.to_excel(writer, sheet_name='Years', index=False)
# Create a sheet summarising the counts by user
dfuser = pd.DataFrame(list(username_counts.items()), columns=['User', 'Count']).sort_values(by=['Count'], ascending=False)
### Combines similar username entries to avoid repeated top users
# Initialize a dictionary to store combined entries and their corresponding values
combined_entries = {}
# Create a copy of the column with the original capitalization
dfuser['User' + '_original'] = dfuser['User']
# Convert the column to lowercase for case-insensitive comparison
dfuser['User'] = dfuser['User'].str.lower()
# Iterate over each entry in the column
for entry, original_entry, value in zip(dfuser['User'], dfuser['User' + '_original'], dfuser["Count"]):
# Check if a similar entry already exists in the combined_entries dictionary
similar_entry = next((key for key in combined_entries if difflib.SequenceMatcher(None, entry, key).ratio() >= 0.9), None)
if similar_entry:
# If a similar entry exists, add the value to the existing entry
combined_entries[similar_entry][1] += value
print(f"Combined entry: {original_entry} -> {combined_entries[similar_entry][0]}")
else:
# If no similar entry exists, create a new entry in the dictionary
combined_entries[entry] = [original_entry, value]
# Restore the original capitalization in the combined_entries dictionary
for key, value in combined_entries.items():
value[0] = dfuser.loc[dfuser['User'] == key, 'User' + '_original'].values[0]
# Create a new dataframe with the combined entries and their summed values
dfuser = pd.DataFrame(list(combined_entries.values()), columns=['User', "Count"])
dfuser.to_excel(writer, sheet_name='Users', index=False)
writer.close()
###
# Create overall statistics sheet
column_map_ovr = {
'Site': site_name,
'Root_ID': root_id,
'Total count': total}
new_row_ov = pd.DataFrame([column_map_ovr])
# Add the new row to the existing DataFrame
dfoverall = pd.concat([dfoverall, new_row_ov], ignore_index=True)
# Create the plot for total counts
fig, ax = plt.subplots(figsize=(7,4))
fig.subplots_adjust(right=0.8)
ax.plot(df['Datetime'], df['Total count'])
time_range = df['Datetime'].max() - df['Datetime'].min()
if time_range == pd.Timedelta(days=0):
continue
if time_range < pd.Timedelta(days=365): # Less than 1 years
minor_locator = mdates.MonthLocator()
major_locator = mdates.MonthLocator(bymonth=[1, 3, 5, 7, 9, 11])
major_formatter = mdates.DateFormatter('%b\n%Y')
if time_range < pd.Timedelta(days=365*2) and time_range >= pd.Timedelta(days=365): # Between 1 and 2 years
minor_locator = mdates.MonthLocator()
minor_formatter = mdates.DateFormatter('%b')
major_locator = mdates.MonthLocator(bymonth=[1, 4, 7, 10])
major_formatter = mdates.DateFormatter('%b\n%Y')
if time_range >= pd.Timedelta(days=365*2): # Longer than 2 years
minor_locator = mdates.MonthLocator()
major_locator = mdates.MonthLocator(bymonth=[1, 7])
major_formatter = mdates.DateFormatter('%b\n%Y')
# Set the x-axis tick locators and formatters
ax.xaxis.set_minor_locator(minor_locator)
ax.xaxis.set_major_locator(major_locator)
ax.xaxis.set_major_formatter(major_formatter)
# Add labels and title to the plot
plt.xlabel('Month')
plt.ylabel('Cumulative count')
plt.title('Cumulative submissions since installation')
# Rotate x-axis labels if needed
#plt.xticks(rotation=45)
# Add a table to the plot
table_data = [
["Total number\nof images", total],
["Submissions\nper week", round(total * 7 / time_range.days,1)]]
table = ax.table(cellText=table_data, colWidths = [0.2,0.1],cellLoc='left',loc='right', bbox=[1.05, 0.35, 0.4, 0.4])
# Adjust table properties
table.auto_set_font_size(False)
table.set_fontsize(10)
table.scale(1, 3)
cells = table.properties()["celld"]
for i in range(0,1):
cells[i, 1]._loc = 'center'
# Adjust the plot layout to accommodate the table
plt.subplots_adjust(bottom=0.2)
# Save the plot to a temporary file
plt.savefig(statistics_path + "/total_count_plot.png", bbox_inches = 'tight')#, pad_inches = 0.5)
plt.close()
# Create 3 subplots for hours, days and months submissions
# Set up the figure and axes
fig, axs = plt.subplots(1, 3, figsize=(12, 4)) # Three subplots side by side
# First subplot: Hourly Counts
axs[0].bar(dfhours["Hour"], dfhours["Count"])
axs[0].set_xlabel("Time of Day")
axs[0].set_ylabel("Counts")
axs[0].set_title("Submissions by\ntime of the day")
axs[0].set_xticks(np.arange(0, 24, 3)) # Major ticks every hour
axs[0].set_xticks(np.arange(0, 24, 1), minor=True) # Minor ticks every 3 hours
axs[0].set_xticklabels([f"{h:02d}" for h in np.arange(0, 24, 3)])
# Second subplot: Daily Counts
axs[1].bar(dfdays.index, dfdays["Count"])
axs[1].set_xlabel("Days of the Week")
axs[1].set_ylabel("Counts")
axs[1].set_title("Submissions by\nday of the week")
axs[1].set_xticks(np.arange(7))
axs[1].set_xticklabels(["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"])
# Third subplot: Monthly Counts
axs[2].bar(dfmonths.index, dfmonths["Count"])
axs[2].set_xlabel("Months of the Year")
axs[2].set_ylabel("Counts")
axs[2].set_title("Submissions by\nmonth of the year")
axs[2].set_xticks(np.arange(12))
axs[2].set_xticklabels(["J", "F", "M", "A", "M", "J", "J", "A", "S", "O", "N", "D"])
# Adjust the spacing between the subplots
plt.tight_layout()
# Save the plot
plt.savefig(statistics_path + "/hour_day_month_plot.png")
plt.close()
# Create plot showing top users
# Extract data from the DataFrame
users = dftopuser['User']
counts = dftopuser['Count']
# Set the size of the plot
fig, ax = plt.subplots(figsize=(7, 2))
# Create the bar plot
colors = ['#AD8A56', '#D7D7D7', '#AF9500'] # Define a list of pale pastel colors
ax.barh(users, counts, color=colors)
# Add labels to the bars
label_offset = 0.02 * max(counts) # Offset for the text labels from the end of the bars
for i, user in enumerate(users):
ax.text(counts[i] - label_offset, i, str(user), ha='right', va='center')
# Set plot title and axis labels
ax.set_title('Top Users')
ax.set_xlabel('Count')
# Remove y-axis ticks
ax.yaxis.set_ticks([])
# Adjust subplot parameters to avoid x-axis label cutoff
plt.subplots_adjust(bottom=0.3)
# Save the plot
plt.savefig(statistics_path + "/top_user_plot.png")
plt.close()
# Save overall counts in the Images directory
writer = pd.ExcelWriter(os.path.join(parent_directories, "Images")+'\\total_counts.xlsx')
dfoverall.to_excel(writer, index=False)
writer.close()