import streamlit as st
import requests
import trafilatura
from sentence_transformers import SentenceTransformer, util
import numpy as np
import pandas as pd
import advertools as adv
from sklearn.cluster import KMeans
from collections import Counter

# Initialize session state so inputs and results persist across Streamlit reruns
if 'urls' not in st.session_state:
    st.session_state.urls = []
if 'results' not in st.session_state:
    st.session_state.results = None
if 'processing_complete' not in st.session_state:
    st.session_state.processing_complete = False

st.title("Site Focus Calculator")
st.write("A tool for calculating the site focus score of a website or a series of URLs.")

# Cache the embedding model so it loads once rather than on every rerun
@st.cache_resource
def load_model():
    return SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")

model = load_model()

sitemap_url = st.text_input("Enter your XML sitemap URL (optional)", st.session_state.get('sitemap_url', ""))
url_list_input = st.text_area("Enter a list of URLs (separated by newlines, optional)", st.session_state.get('url_list_input', ""))

# Remember the inputs so they survive reruns
if sitemap_url:
    st.session_state.sitemap_url = sitemap_url
if url_list_input:
    st.session_state.url_list_input = url_list_input

if st.button("Run Analysis"):
    st.session_state.processing_complete = False
    urls = []
    if sitemap_url:
        st.write("Fetching URLs from the sitemap...")
        sitemap_df = adv.sitemap_to_df(sitemap_url)
        urls = sitemap_df['loc'].tolist()
        st.session_state.urls = urls
        st.write(f"Processing {len(urls)} URLs from the sitemap.")
    elif url_list_input:
        urls = [url.strip() for url in url_list_input.split('\n') if url.strip()]
        st.session_state.urls = urls
        st.write(f"Processing {len(urls)} URLs from the input list.")
    else:
        st.warning("Please provide either a sitemap URL or a list of URLs.")

    def get_embedding(text):
        """Embed the given text with the mxbai-embed-large-v1 model."""
        # The model card recommends this prefix for retrieval queries; it is
        # applied uniformly here so that all pages are embedded consistently.
        prompt = "Represent this sentence for searching relevant passages: " + text
        embedding = model.encode(prompt)
        return embedding

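    # Note: SentenceTransformer truncates input beyond the model's maximum
    # sequence length (512 tokens for mxbai-embed-large-v1), so very long
    # pages are represented by their opening text only. If full-page coverage
    # matters, a rough chunk-and-average alternative (a sketch) would be:
    #
    #     def get_embedding_chunked(text, chunk_chars=2000):
    #         chunks = [text[i:i + chunk_chars] for i in range(0, len(text), chunk_chars)]
    #         return np.mean(model.encode(chunks), axis=0)
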
    # Accumulators for successfully processed pages and for failures
    embeddings = []
    valid_urls = []
    extracted_texts = []  # collected but not used further below; kept for reference
    error_urls = []

    # A browser-like User-Agent reduces the chance of simple bot blocking
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/58.0.3029.110 Safari/537.3'
    }

    if urls:
        st.write("Processing URLs...")
        for url in urls:
            try:
                response = requests.get(url, headers=headers, timeout=10)
                if response.status_code == 200:
                    html_content = response.text
                    # trafilatura strips boilerplate and returns the main text
                    extracted_text = trafilatura.extract(html_content)
                    if extracted_text:
                        embedding = get_embedding(extracted_text)
                        embeddings.append(embedding)
                        valid_urls.append(url)
                        extracted_texts.append(extracted_text)
                    else:
                        error_urls.append((url, "No content extracted"))
                else:
                    error_urls.append((url, f"Status code {response.status_code}"))
            except Exception as e:
                error_urls.append((url, f"Error: {str(e)}"))

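    # Note: pages are fetched sequentially with no delay between requests.
    # For large sitemaps it may be worth adding a short pause inside the
    # loop above to stay polite to the server, e.g. (a sketch):
    #
    #     import time
    #     time.sleep(0.5)  # between consecutive requests
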
    if embeddings:
        embeddings_array = np.vstack(embeddings)

        # The site-level embedding is the centroid of all page embeddings
        site_embedding = np.mean(embeddings_array, axis=0)

        # Cosine similarity of each page to the site centroid
        similarities = util.cos_sim(embeddings_array, site_embedding)
        similarities = similarities.numpy().flatten()

        # Site focus score: the mean cosine similarity over all unique page
        # pairs. Higher values indicate a tighter topical focus.
        pairwise_similarities = []
        for i in range(len(embeddings_array)):
            for j in range(i + 1, len(embeddings_array)):
                sim = util.cos_sim(embeddings_array[i], embeddings_array[j]).item()
                pairwise_similarities.append(sim)

        if pairwise_similarities:
            site_focus_score = sum(pairwise_similarities) / len(pairwise_similarities)
        else:
            site_focus_score = 0.0

        st.write(f"Site Focus Score: {site_focus_score:.4f}")

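        # Note: the nested loop above makes O(N^2) Python-level cos_sim calls.
        # An equivalent vectorized form (a sketch; assumes at least two pages)
        # computes the same score with one matrix operation:
        #
        #     sim_matrix = util.cos_sim(embeddings_array, embeddings_array).numpy()
        #     n = len(embeddings_array)
        #     site_focus_score = sim_matrix[np.triu_indices(n, k=1)].mean()
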
        if len(embeddings_array) >= 2:
            try:
                # Cluster pages into two groups to surface topical outliers
                n_clusters = 2
                kmeans = KMeans(n_clusters=n_clusters, random_state=42)
                kmeans.fit(embeddings_array)
                labels = kmeans.labels_

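                # n_clusters is fixed at 2 here. To let the data choose k
                # instead, one common heuristic (a sketch, using scikit-learn's
                # silhouette score; needs at least 3 pages) is:
                #
                #     from sklearn.metrics import silhouette_score
                #     candidate_ks = range(2, min(10, len(embeddings_array)))
                #     best_k = max(candidate_ks, key=lambda k: silhouette_score(
                #         embeddings_array,
                #         KMeans(n_clusters=k, random_state=42).fit_predict(embeddings_array)))
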
                # ClusterScore: the share of all pages that fall in each page's
                # cluster, so pages in small (outlier) clusters score low
                cluster_counts = Counter(labels)
                cluster_sizes = dict(cluster_counts)
                page_cluster_scores = []
                for label in labels:
                    score = cluster_sizes[label] / len(embeddings_array)
                    page_cluster_scores.append(score)

                df = pd.DataFrame({
                    'URL': valid_urls,
                    'PageSiteSimilarity': similarities,
                    'ClusterLabel': labels,
                    'ClusterScore': page_cluster_scores
                })

                st.session_state.results = df
                st.session_state.processing_complete = True

                st.write("URL Analysis Results")
                st.dataframe(df)

                csv = df.to_csv(index=False)
                st.download_button(label="Download data as CSV", data=csv, file_name='url_analysis_results.csv', mime='text/csv')
            except ValueError as ve:
                st.error(f"KMeans error: {ve}. Try using a smaller number of clusters.")

        else:
            st.warning("Not enough URLs to perform clustering. At least 2 samples are needed.")
    else:
        st.warning("No valid embeddings were generated.")

    if error_urls:
        st.write("The following URLs encountered errors and were not processed:")
        error_df = pd.DataFrame(error_urls, columns=["URL", "Error"])
        st.dataframe(error_df)

else:
    # Before (or between) runs, show any results from the previous analysis
    if st.session_state.processing_complete and st.session_state.results is not None:
        st.write("URL Analysis Results")
        st.dataframe(st.session_state.results)

        csv = st.session_state.results.to_csv(index=False)
        st.download_button(label="Download data as CSV", data=csv, file_name='url_analysis_results.csv', mime='text/csv')

    st.info("Click 'Run Analysis' to start the process.")
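
# To try the app locally (assuming this file is saved as site_focus.py):
#
#     pip install streamlit requests trafilatura sentence-transformers advertools scikit-learn
#     streamlit run site_focus.py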