Visualizing webpage visits using Webclipper plugin data

I wondered: why should only Google and Big Tech get to make decisions based on the pages that I visit? Can I do something with this data myself? Based on this premise, I created a Firefox plugin that saves every webpage you visit in a database — the Webclipper Firefox Plugin. Based on the …

I wondered: why should only Google and Big Tech get to make decisions based on the pages that I visit? Can I do something with this data myself?

Based on this premise, I created a Firefox plugin that saves every webpage you visit in a database — the Webclipper Firefox Plugin.

Using the data that I captured over the last 4 months, I created this first visualization to understand which domains I visit most and at what times I do most of my browsing.

I have not shared some of the deep-dive analysis, to protect my privacy, but I intend to feed this data to an agent that works as my personal assistant and helps me with my daily tasks.

Also, why am I on LinkedIn and ikea.com so much?

Here are some visualizations I was able to generate:

Alt text

Alt text

Alt text

# Load every page visit as a (page id, timestamp) record and build the
# DataFrame `df` that the plots below are based on.
data = PageAssociation.objects.all().values('website', 'date_associated')

import pandas as pd

df = pd.DataFrame.from_records(data)

# At this point `website` holds Page primary keys. Fetch every referenced Page
# in one bulk query instead of issuing two `Page.objects.get(...)` calls per
# row, and resolve each row by primary key (re-looking pages up by their
# `website` value would fail if two pages shared the same URL).
page_ids = df["website"].dropna().unique().tolist()
pages_by_id = Page.objects.in_bulk(page_ids)

# Rows with a missing or unknown page id get None in all three columns.
df['object'] = [pages_by_id.get(page_id) for page_id in df["website"]]
df["website"] = [
    page.website if page is not None else None for page in df['object']
]
df['website_domain'] = [
    page.website_domain if page is not None else None for page in df['object']
]

# Drop all visits whose `date_associated` falls on 2025-04-23.
excluded_day = pd.Timestamp('2025-04-23').date()
df = df[df['date_associated'].dt.date != excluded_day]
import matplotlib.pyplot as plt
# (bar chart by domains, top hourly slots, top urls)

# Bar chart of the ten most frequent values in `website_domain`.
domain_counts = df['website_domain'].value_counts().head(10)

plt.figure(figsize=(12, 6))
domains_ax = domain_counts.plot.bar()  # draws on the figure created above
domains_ax.set_title('Top Website Domains')
domains_ax.set_xlabel('Domain')
domains_ax.set_ylabel('Count')
plt.xticks(rotation=45)
plt.show()
# time series of daily page counts (plotted further below, after the heatmap)


# Heatmap of visit counts: one row per calendar date, one column per hour.
import seaborn as sns

visit_date = df['date_associated'].dt.date
visit_hour = df['date_associated'].dt.hour
# Count visits per (date, hour) pair, then pivot the hours into columns;
# hours with no visits on a given date are filled with 0.
heatmap_data = (
    df.groupby([visit_date, visit_hour])
    .size()
    .unstack(fill_value=0)
)

plt.figure(figsize=(12, 6))
heatmap_ax = sns.heatmap(heatmap_data, cmap='YlGnBu')
heatmap_ax.set_title('Heatmap of Visits by Hour and Day')
heatmap_ax.set_xlabel('Hour')
heatmap_ax.set_ylabel('Date')
plt.show()

# Bar chart of visits per calendar day, restricted to dates on or after
# the cutoff (today's date minus 30 days).
lookback = pd.Timedelta(days=30)
cutoff_date = (pd.Timestamp.now().normalize() - lookback).date()

visits_per_day = df.groupby(df['date_associated'].dt.date).size()
daily_counts = visits_per_day[visits_per_day.index >= cutoff_date]

plt.figure(figsize=(12, 6))
daily_ax = daily_counts.plot.bar()
daily_ax.set_title('Daily Page Counts (Last 30 Days)')
daily_ax.set_xlabel('Date')
daily_ax.set_ylabel('Count')
plt.xticks(rotation=90)
plt.show()

def _second_dot_segment(url):
    """Return the text between the first and second "." of *url*.

    Falsy values and strings that contain no "." are returned unchanged.
    """
    if not url:
        return url
    segments = url.split(".")
    if len(segments) > 1:
        return segments[1]
    return url


# NOTE(review): for a URL without a subdomain (e.g. "https://ikea.com/x") the
# second segment is "com/x", not the site name — confirm this is intended.
df["domains"] = df["website"].apply(_second_dot_segment)

# Break the visit timestamp into hour / day-of-month / month columns.
visit_time = df["date_associated"].dt
for part in ("hour", "day", "month"):
    df[part] = getattr(visit_time, part)
# (bar chart by domains, top hourly slots, top urls)
import matplotlib.pyplot as plt

# # Bar chart by domains
# domain_counts = df["domains"].value_counts()
# plt.figure(figsize=(12, 6))
# domain_counts.plot(kind="bar")
# plt.title("Top Domains")
# plt.xlabel("Domain")
# plt.ylabel("Count")
# plt.xticks(rotation=45)
# plt.show()

# # Bar chart by top hourly slots
# hour_counts = df["hour"].value_counts().sort_index()
# plt.figure(figsize=(12, 6))
# hour_counts.plot(kind="bar")
# plt.title("Top Hourly Slots")
# plt.xlabel("Hour")
# plt.ylabel("Count")
# plt.xticks(rotation=45)
# plt.show()

# # Bar chart by top URLs
# url_counts = df["website"].value_counts()
# plt.figure(figsize=(12, 6))
# url_counts.plot(kind="bar")
# plt.title("Top URLs")
# plt.xlabel("URL")
# plt.ylabel("Count")
# plt.xticks(rotation=45)
# plt.show()