import streamlit as st
import requests
import trafilatura
from sentence_transformers import SentenceTransformer, util
import numpy as np
import pandas as pd
import advertools as adv
from sklearn.cluster import KMeans
from collections import Counter
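# Usage note: run this script with Streamlit, e.g. `streamlit run app.py`
# (adjust the file name to wherever this script is saved).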
# Initialize session state variables
if 'urls' not in st.session_state:
    st.session_state.urls = []
if 'results' not in st.session_state:
    st.session_state.results = None
if 'processing_complete' not in st.session_state:
    st.session_state.processing_complete = False
# Title of the app
st.title("Site Focus Calculator")
st.write("A tool for calculating the site focus score of a website or a series of URLs.")
# Load the model
model = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")
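# Note (optional improvement, not part of the original flow): Streamlit reruns the whole
# script on every interaction, so the model weights are reloaded each time. Wrapping the
# load in a cached helper would avoid that, e.g.:
#   @st.cache_resource
#   def load_model():
#       return SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")
#   model = load_model()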
# Input fields for sitemap or list of URLs (separated by newlines)
sitemap_url = st.text_input("Enter your XML sitemap URL (optional)", st.session_state.get('sitemap_url', ""))
url_list_input = st.text_area("Enter a list of URLs (separated by newlines, optional)", st.session_state.get('url_list_input', ""))
# Store inputs in session state
if sitemap_url:
    st.session_state.sitemap_url = sitemap_url
if url_list_input:
    st.session_state.url_list_input = url_list_input
# Add a "Run" button to trigger the URL processing
if st.button("Run Analysis"):
    st.session_state.processing_complete = False
    urls = []
    if sitemap_url:
        st.write("Fetching URLs from the sitemap...")
        sitemap_df = adv.sitemap_to_df(sitemap_url)
        urls = sitemap_df['loc'].tolist()
        st.session_state.urls = urls  # Store URLs in session state
        st.write(f"Processing {len(urls)} URLs from sitemap.")
    elif url_list_input:
        urls = [url.strip() for url in url_list_input.split('\n') if url.strip()]
        st.session_state.urls = urls  # Store URLs in session state
        st.write(f"Processing {len(urls)} URLs from the input list.")
    else:
        st.warning("Please provide either a sitemap URL or a list of URLs.")
    # Function to get embeddings
    def get_embedding(text):
        """Generate embedding for the given text using the mxbai-embed-large-v1 model."""
        prompt = "Represent this sentence for searching relevant passages: " + text
        embedding = model.encode(prompt)
        return embedding
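    # Note: "Represent this sentence for searching relevant passages: " is the retrieval
    # *query* prompt suggested for mxbai-embed-large-v1; this script prepends it to the
    # page content itself, so every page is embedded in the same prompted space.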
    # Initialize lists to store embeddings and corresponding URLs
    embeddings = []
    valid_urls = []
    extracted_texts = []
    error_urls = []
    # Define headers with User-Agent
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/58.0.3029.110 Safari/537.3'
    }
    # Only process if URLs are provided
    if urls:
        st.write("Processing URLs...")
        for url in urls:
            try:
                response = requests.get(url, headers=headers, timeout=10)
                if response.status_code == 200:
                    html_content = response.text
                    extracted_text = trafilatura.extract(html_content)
                    if extracted_text:
                        embedding = get_embedding(extracted_text)
                        embeddings.append(embedding)
                        valid_urls.append(url)
                        extracted_texts.append(extracted_text)
                    else:
                        error_urls.append((url, "No content extracted"))
                else:
                    error_urls.append((url, f"Status code {response.status_code}"))
            except Exception as e:
                error_urls.append((url, f"Error: {str(e)}"))
        # Check if we have any valid embeddings
        if embeddings:
            # Stack embeddings into a single array
            embeddings_array = np.vstack(embeddings)
            # Compute the site embedding by averaging all embeddings
            site_embedding = np.mean(embeddings_array, axis=0)
            # Compute cosine similarity between each content embedding and the site embedding
            similarities = util.cos_sim(embeddings_array, site_embedding)
            similarities = similarities.numpy().flatten()
            # Calculate pairwise cosine similarities for the site focus score
            pairwise_similarities = []
            for i in range(len(embeddings_array)):
                for j in range(i + 1, len(embeddings_array)):
                    sim = util.cos_sim(embeddings_array[i], embeddings_array[j]).item()
                    pairwise_similarities.append(sim)
            # The site focus score is the average pairwise similarity
            if pairwise_similarities:
                site_focus_score = sum(pairwise_similarities) / len(pairwise_similarities)
            else:
                site_focus_score = 0.0
            st.write(f"Site Focus Score: {site_focus_score:.4f}")
            # Perform KMeans clustering if there are enough samples
            if len(embeddings_array) >= 2:
                try:
                    n_clusters = 2  # Adjust the number of clusters as needed
                    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
                    kmeans.fit(embeddings_array)
                    labels = kmeans.labels_
                    # Analyze cluster sizes
                    cluster_counts = Counter(labels)
                    # Assign a cluster-based score to each page based on cluster size
                    cluster_sizes = dict(cluster_counts)
                    page_cluster_scores = []
                    for label in labels:
                        score = cluster_sizes[label] / len(embeddings_array)  # Fraction of pages in the cluster
                        page_cluster_scores.append(score)
                    # Create a DataFrame with the desired columns
                    df = pd.DataFrame({
                        'URL': valid_urls,
                        'PageSiteSimilarity': similarities,
                        'ClusterLabel': labels,
                        'ClusterScore': page_cluster_scores
                    })
                    # Store results in session state
                    st.session_state.results = df
                    st.session_state.processing_complete = True
                    # Display the DataFrame
                    st.write("URL Analysis Results")
                    st.dataframe(df)
                    # Option to download the results as CSV
                    csv = df.to_csv(index=False)
                    st.download_button(label="Download data as CSV", data=csv, file_name='url_analysis_results.csv', mime='text/csv')
                except ValueError as ve:
                    st.error(f"KMeans error: {ve}. Try using a smaller number of clusters.")
            else:
                st.warning("Not enough URLs to perform clustering. Need at least 2 samples.")
        else:
            st.warning("No valid embeddings were generated.")
        # If there are any error URLs, show them
        if error_urls:
            st.write("The following URLs encountered errors and were not processed:")
            error_df = pd.DataFrame(error_urls, columns=["URL", "Error"])
            st.dataframe(error_df)
else:
    # Display results if processing is complete
    if st.session_state.processing_complete and st.session_state.results is not None:
        st.write("URL Analysis Results")
        st.dataframe(st.session_state.results)
        # Option to download the results as CSV
        csv = st.session_state.results.to_csv(index=False)
        st.download_button(label="Download data as CSV", data=csv, file_name='url_analysis_results.csv', mime='text/csv')
    st.info("Click 'Run Analysis' to start the process.")