import streamlit as st
import requests
import trafilatura
from sentence_transformers import SentenceTransformer, util
import numpy as np
import pandas as pd
import advertools as adv
from sklearn.cluster import KMeans
from collections import Counter
# Initialize session state variables
if 'urls' not in st.session_state:
    st.session_state.urls = []
if 'results' not in st.session_state:
    st.session_state.results = None
if 'processing_complete' not in st.session_state:
    st.session_state.processing_complete = False
# Title of the app
st.title("Site Focus Calculator")
st.write("A tool for calculating the site focus score of a website or a series of URLs.")
# Load the model
model = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")
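# Note: Streamlit reruns this whole script on every interaction, so the model is
# reloaded each time. Caching the load with an @st.cache_resource-decorated loader
# is a common way to keep a single copy in memory across reruns, e.g.:
#   @st.cache_resource
#   def load_model():
#       return SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")
#   model = load_model()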
# Input fields for sitemap or list of URLs (separated by newlines)
sitemap_url = st.text_input("Enter your XML sitemap URL (optional)", st.session_state.get('sitemap_url', ""))
url_list_input = st.text_area("Enter a list of URLs (separated by newlines, optional)", st.session_state.get('url_list_input', ""))
# Store inputs in session state
if sitemap_url:
    st.session_state.sitemap_url = sitemap_url
if url_list_input:
    st.session_state.url_list_input = url_list_input
# Add a "Run" button to trigger the URL processing
if st.button("Run Analysis"):
    st.session_state.processing_complete = False
    urls = []
    if sitemap_url:
        st.write("Fetching URLs from the sitemap...")
        sitemap_df = adv.sitemap_to_df(sitemap_url)
        urls = sitemap_df['loc'].tolist()
        st.session_state.urls = urls  # Store URLs in session state
        st.write(f"Processing {len(urls)} URLs from sitemap.")
    elif url_list_input:
        urls = [url.strip() for url in url_list_input.split('\n') if url.strip()]
        st.session_state.urls = urls  # Store URLs in session state
        st.write(f"Processing {len(urls)} URLs from the input list.")
    else:
        st.warning("Please provide either a sitemap URL or a list of URLs.")
    # Function to get embeddings
    def get_embedding(text):
        """Generate embedding for the given text using the mxbai-embed-large-v1 model."""
        prompt = "Represent this sentence for searching relevant passages: " + text
        embedding = model.encode(prompt)
        return embedding
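    # Note: the "Represent this sentence..." prefix is the retrieval query prompt from
    # the mxbai-embed-large-v1 model card; passages are typically embedded without a
    # prompt, but the prefix is harmless as long as it is applied to every page
    # consistently. model.encode() also accepts a list of texts, so all extracted page
    # texts could be embedded in one batched call rather than one call per URL.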
    # Initialize lists to store embeddings and corresponding URLs
    embeddings = []
    valid_urls = []
    extracted_texts = []
    error_urls = []
    # Define headers with User-Agent
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/58.0.3029.110 Safari/537.3'
    }
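    # A browser-like User-Agent is sent because some sites block or throttle requests
    # that identify themselves with the default python-requests User-Agent string.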
    # Only process if URLs are provided
    if urls:
        st.write("Processing URLs...")
        for url in urls:
            try:
                response = requests.get(url, headers=headers, timeout=10)
                if response.status_code == 200:
                    html_content = response.text
                    extracted_text = trafilatura.extract(html_content)
                    if extracted_text:
                        embedding = get_embedding(extracted_text)
                        embeddings.append(embedding)
                        valid_urls.append(url)
                        extracted_texts.append(extracted_text)
                    else:
                        error_urls.append((url, "No content extracted"))
                else:
                    error_urls.append((url, f"Status code {response.status_code}"))
            except Exception as e:
                error_urls.append((url, f"Error: {str(e)}"))
        # Check if we have any valid embeddings
        if embeddings:
            # Stack embeddings into a single array
            embeddings_array = np.vstack(embeddings)
            # Compute the site embedding by averaging all embeddings
            site_embedding = np.mean(embeddings_array, axis=0)
            # Compute cosine similarity between each content embedding and the site embedding
            similarities = util.cos_sim(embeddings_array, site_embedding)
            similarities = similarities.numpy().flatten()
            # Calculate pairwise cosine similarities for site focus score
            pairwise_similarities = []
            for i in range(len(embeddings_array)):
                for j in range(i + 1, len(embeddings_array)):
                    sim = util.cos_sim(embeddings_array[i], embeddings_array[j]).item()
                    pairwise_similarities.append(sim)
            # Calculate average pairwise similarity
            if pairwise_similarities:
                site_focus_score = sum(pairwise_similarities) / len(pairwise_similarities)
            else:
                site_focus_score = 0.0
            st.write(f"Site Focus Score: {site_focus_score:.4f}")
            # Perform KMeans clustering if there are enough samples
            if len(embeddings_array) >= 2:
                try:
                    n_clusters = 2  # Adjust the number of clusters as needed
                    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
                    kmeans.fit(embeddings_array)
                    labels = kmeans.labels_
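                    # Note: depending on the scikit-learn version, KMeans may emit a
                    # FutureWarning about the default value of n_init; passing
                    # n_init=10 (or n_init='auto') explicitly silences it.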
                    # Analyze cluster sizes
                    cluster_counts = Counter(labels)
                    # Assign a cluster-based score to each page based on cluster size
                    cluster_sizes = dict(cluster_counts)
                    page_cluster_scores = []
                    for label in labels:
                        score = cluster_sizes[label] / len(embeddings_array)  # Fraction of pages in the cluster
                        page_cluster_scores.append(score)
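                    # Example: if 8 of 10 pages fall into one cluster, each of those
                    # pages gets a ClusterScore of 0.8 and the remaining two get 0.2,
                    # so low scores can flag pages sitting in small, off-topic clusters.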
                    # Create a DataFrame with the desired columns
                    df = pd.DataFrame({
                        'URL': valid_urls,
                        'PageSiteSimilarity': similarities,
                        'ClusterLabel': labels,
                        'ClusterScore': page_cluster_scores
                    })
                    # Store results in session state
                    st.session_state.results = df
                    st.session_state.processing_complete = True
                    # Display the DataFrame
                    st.write("URL Analysis Results")
                    st.dataframe(df)
                    # Option to download the results as CSV
                    csv = df.to_csv(index=False)
                    st.download_button(label="Download data as CSV", data=csv, file_name='url_analysis_results.csv', mime='text/csv')
                except ValueError as ve:
                    st.error(f"KMeans error: {ve}. Try using a smaller number of clusters.")
            else:
                st.warning("Not enough URLs to perform clustering. Need at least 2 samples.")
        else:
            st.warning("No valid embeddings were generated.")
        # If there are any error URLs, show them
        if error_urls:
            st.write("The following URLs encountered errors and were not processed:")
            error_df = pd.DataFrame(error_urls, columns=["URL", "Error"])
            st.dataframe(error_df)
else:
    # Display results if processing is complete
    if st.session_state.processing_complete and st.session_state.results is not None:
        st.write("URL Analysis Results")
        st.dataframe(st.session_state.results)
        # Option to download the results as CSV
        csv = st.session_state.results.to_csv(index=False)
        st.download_button(label="Download data as CSV", data=csv, file_name='url_analysis_results.csv', mime='text/csv')
    st.info("Click 'Run Analysis' to start the process.")
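# Run locally with `streamlit run app.py`; the imports above imply the dependencies
# streamlit, requests, trafilatura, sentence-transformers, advertools, scikit-learn,
# pandas and numpy.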