import streamlit as st
import requests
import trafilatura
from sentence_transformers import SentenceTransformer, util
import numpy as np
import pandas as pd
import advertools as adv
from sklearn.cluster import KMeans
from collections import Counter
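# Assumed dependencies (the original listing does not pin them): streamlit,
# requests, trafilatura, sentence-transformers, advertools, scikit-learn,
# pandas, and numpy, installable via pip.
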
# Initialize session state variables
if 'urls' not in st.session_state:
    st.session_state.urls = []
if 'results' not in st.session_state:
    st.session_state.results = None
if 'processing_complete' not in st.session_state:
    st.session_state.processing_complete = False

# Title of the app
st.title("Site Focus Calculator")
st.write("A tool for calculating the site focus score of a website or a series of URLs.")

# Load the model once and cache it across Streamlit reruns, so it is not
# re-initialized on every widget interaction
@st.cache_resource
def load_model():
    return SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")

model = load_model()

# Input fields for a sitemap or a list of URLs (separated by newlines)
sitemap_url = st.text_input("Enter your XML sitemap URL (optional)", st.session_state.get('sitemap_url', ""))
url_list_input = st.text_area("Enter a list of URLs (separated by newlines, optional)", st.session_state.get('url_list_input', ""))

# Store inputs in session state
if sitemap_url:
    st.session_state.sitemap_url = sitemap_url
if url_list_input:
    st.session_state.url_list_input = url_list_input

# Add a "Run" button to trigger the URL processing | |
if st.button("Run Analysis"): | |
st.session_state.processing_complete = False | |
urls = [] | |
if sitemap_url: | |
st.write("Fetching URLs from the sitemap...") | |
sitemap_df = adv.sitemap_to_df(sitemap_url) | |
urls = sitemap_df['loc'].tolist() | |
st.session_state.urls = urls # Store URLs in session state | |
st.write(f"Processing {len(urls)} URLs from sitemap.") | |
elif url_list_input: | |
urls = [url.strip() for url in url_list_input.split('\n') if url.strip()] | |
st.session_state.urls = urls # Store URLs in session state | |
st.write(f"Processing {len(urls)} URLs from the input list.") | |
else: | |
st.warning("Please provide either a sitemap URL or a list of URLs.") | |
    # Function to get embeddings
    def get_embedding(text):
        """Generate an embedding for the given text using the mxbai-embed-large-v1 model."""
        prompt = "Represent this sentence for searching relevant passages: " + text
        embedding = model.encode(prompt)
        return embedding
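    # Note (an assumption based on the model card, not stated in the original app):
    # the "Represent this sentence..." prefix is mxbai-embed-large-v1's retrieval
    # *query* prompt; passages are typically encoded without it, e.g.:
    #   doc_embedding = model.encode("page text here")
    # The prefix is kept above to preserve the app's original behavior.
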
    # Initialize lists to store embeddings and corresponding URLs
    embeddings = []
    valid_urls = []
    extracted_texts = []
    error_urls = []

    # Define request headers with a browser User-Agent
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/58.0.3029.110 Safari/537.3'
    }

    # Only process if URLs were provided
    if urls:
        st.write("Processing URLs...")
        for url in urls:
            try:
                response = requests.get(url, headers=headers, timeout=10)
                if response.status_code == 200:
                    html_content = response.text
                    extracted_text = trafilatura.extract(html_content)
                    if extracted_text:
                        embedding = get_embedding(extracted_text)
                        embeddings.append(embedding)
                        valid_urls.append(url)
                        extracted_texts.append(extracted_text)
                    else:
                        error_urls.append((url, "No content extracted"))
                else:
                    error_urls.append((url, f"Status code {response.status_code}"))
            except Exception as e:
                error_urls.append((url, f"Error: {str(e)}"))

        # Check whether any valid embeddings were produced
        if embeddings:
            # Stack embeddings into a single array
            embeddings_array = np.vstack(embeddings)
            # Compute the site embedding by averaging all page embeddings
            site_embedding = np.mean(embeddings_array, axis=0)
            # Cosine similarity between each page embedding and the site embedding
            similarities = util.cos_sim(embeddings_array, site_embedding)
            similarities = similarities.numpy().flatten()

            # Pairwise cosine similarities between all pages, for the site focus score
            pairwise_similarities = []
            for i in range(len(embeddings_array)):
                for j in range(i + 1, len(embeddings_array)):
                    sim = util.cos_sim(embeddings_array[i], embeddings_array[j]).item()
                    pairwise_similarities.append(sim)

            # The site focus score is the average pairwise similarity
            if pairwise_similarities:
                site_focus_score = sum(pairwise_similarities) / len(pairwise_similarities)
            else:
                site_focus_score = 0.0
            st.write(f"Site Focus Score: {site_focus_score:.4f}")

            # Perform KMeans clustering if there are enough samples
            if len(embeddings_array) >= 2:
                try:
                    n_clusters = 2  # Adjust the number of clusters as needed
                    # n_init is set explicitly to keep behavior stable across scikit-learn versions
                    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
                    kmeans.fit(embeddings_array)
                    labels = kmeans.labels_

                    # Analyze cluster sizes
                    cluster_counts = Counter(labels)
                    # Score each page by the fraction of all pages in its cluster
                    cluster_sizes = dict(cluster_counts)
                    page_cluster_scores = []
                    for label in labels:
                        score = cluster_sizes[label] / len(embeddings_array)
                        page_cluster_scores.append(score)
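                    # For example, if 10 pages split into clusters of 7 and 3, pages in
                    # the larger cluster get a ClusterScore of 0.7 and the rest get 0.3.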

                    # Create a DataFrame with the desired columns
                    df = pd.DataFrame({
                        'URL': valid_urls,
                        'PageSiteSimilarity': similarities,
                        'ClusterLabel': labels,
                        'ClusterScore': page_cluster_scores
                    })

                    # Store results in session state
                    st.session_state.results = df
                    st.session_state.processing_complete = True

                    # Display the DataFrame
                    st.write("URL Analysis Results")
                    st.dataframe(df)

                    # Option to download the results as CSV
                    csv = df.to_csv(index=False)
                    st.download_button(label="Download data as CSV", data=csv, file_name='url_analysis_results.csv', mime='text/csv')
                except ValueError as ve:
                    st.error(f"KMeans error: {ve}. Try using a smaller number of clusters.")
            else:
                st.warning("Not enough URLs to perform clustering. Need at least 2 samples.")
        else:
            st.warning("No valid embeddings were generated.")

        # If any URLs encountered errors, show them
        if error_urls:
            st.write("The following URLs encountered errors and were not processed:")
            error_df = pd.DataFrame(error_urls, columns=["URL", "Error"])
            st.dataframe(error_df)
else:
    # Display persisted results if a previous run completed
    if st.session_state.processing_complete and st.session_state.results is not None:
        st.write("URL Analysis Results")
        st.dataframe(st.session_state.results)
        # Option to download the results as CSV
        csv = st.session_state.results.to_csv(index=False)
        st.download_button(label="Download data as CSV", data=csv, file_name='url_analysis_results.csv', mime='text/csv')
    else:
        st.info("Click 'Run Analysis' to start the process.")
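
# To run locally (assuming this file is saved as app.py, which the original
# listing does not state): streamlit run app.py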