import streamlit as st
import requests
import trafilatura
from sentence_transformers import SentenceTransformer, util
import numpy as np
import pandas as pd
import advertools as adv
from sklearn.cluster import KMeans
from collections import Counter

# Initialize session state variables
if 'urls' not in st.session_state:
    st.session_state.urls = []
if 'results' not in st.session_state:
    st.session_state.results = None
if 'processing_complete' not in st.session_state:
    st.session_state.processing_complete = False

# Title of the app
st.title("Site Focus Calculator")
st.write("A tool for calculating the site focus score of a website or a series of URLs.")

# Load the embedding model (cached so Streamlit reruns don't reload it)
@st.cache_resource
def load_model():
    return SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")

model = load_model()

# Input fields for sitemap or list of URLs (separated by newlines)
sitemap_url = st.text_input("Enter your XML sitemap URL (optional)", st.session_state.get('sitemap_url', ""))
url_list_input = st.text_area("Enter a list of URLs (separated by newlines, optional)", st.session_state.get('url_list_input', ""))

# Store inputs in session state
if sitemap_url:
    st.session_state.sitemap_url = sitemap_url
if url_list_input:
    st.session_state.url_list_input = url_list_input

# Add a "Run" button to trigger the URL processing
if st.button("Run Analysis"):
    st.session_state.processing_complete = False
    urls = []
    if sitemap_url:
        st.write("Fetching URLs from the sitemap...")
        sitemap_df = adv.sitemap_to_df(sitemap_url)
        urls = sitemap_df['loc'].tolist()
        st.session_state.urls = urls  # Store URLs in session state
        st.write(f"Processing {len(urls)} URLs from sitemap.")
    elif url_list_input:
        urls = [url.strip() for url in url_list_input.split('\n') if url.strip()]
        st.session_state.urls = urls  # Store URLs in session state
        st.write(f"Processing {len(urls)} URLs from the input list.")
    else:
        st.warning("Please provide either a sitemap URL or a list of URLs.")
        st.stop()

    # Function to get embeddings
    def get_embedding(text):
        """Generate embedding for the given text using the mxbai-embed-large-v1 model."""
        prompt = "Represent this sentence for searching relevant passages: " + text
        embedding = model.encode(prompt)
        return embedding

    # Initialize lists to store embeddings and corresponding URLs
    embeddings = []
    valid_urls = []
    extracted_texts = []
    error_urls = []

    # Request headers with a browser-like User-Agent (some sites block default Python clients)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/58.0.3029.110 Safari/537.3'
    }

    # Only process if URLs are provided
    if urls:
        st.write("Processing URLs...")
        for url in urls:
            try:
                response = requests.get(url, headers=headers, timeout=10)
                if response.status_code == 200:
                    html_content = response.text
                    extracted_text = trafilatura.extract(html_content)
                    if extracted_text:
                        embedding = get_embedding(extracted_text)
                        embeddings.append(embedding)
                        valid_urls.append(url)
                        extracted_texts.append(extracted_text)
                    else:
                        error_urls.append((url, "No content extracted"))
                else:
                    error_urls.append((url, f"Status code {response.status_code}"))
            except Exception as e:
                error_urls.append((url, f"Error: {str(e)}"))

    # Check if we have any valid embeddings
    if embeddings:
        # Stack embeddings into a single array
        embeddings_array = np.vstack(embeddings)

        # Compute the site embedding by averaging all embeddings
        site_embedding = np.mean(embeddings_array, axis=0)

        # Compute cosine similarity between each content embedding and the site embedding
        similarities = util.cos_sim(embeddings_array, site_embedding)
        similarities = similarities.numpy().flatten()
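        # The mean embedding acts as the site's topical centroid, so each value here
        # measures how close a page is to the site's overall topic; low values flag
        # potentially off-topic pages.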

        # Calculate pairwise cosine similarities for site focus score
        pairwise_similarities = []
        for i in range(len(embeddings_array)):
            for j in range(i+1, len(embeddings_array)):
                sim = util.cos_sim(embeddings_array[i], embeddings_array[j]).item()
                pairwise_similarities.append(sim)

        # Calculate average pairwise similarity
        if pairwise_similarities:
            site_focus_score = sum(pairwise_similarities) / len(pairwise_similarities)
        else:
            site_focus_score = 0.0
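        # A vectorized alternative (same result, avoids the O(n^2) Python loop):
        #   sim_matrix = util.cos_sim(embeddings_array, embeddings_array).numpy()
        #   upper = sim_matrix[np.triu_indices(len(embeddings_array), k=1)]
        #   site_focus_score = float(upper.mean()) if upper.size else 0.0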

        st.write(f"Site Focus Score: {site_focus_score:.4f}")

        # Perform KMeans clustering if there are enough samples
        if len(embeddings_array) >= 2:
            try:
                n_clusters = 2  # Adjust the number of clusters as needed
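                # Hedged alternative (not in the original): choose k by silhouette score
                # when the site has enough pages, e.g.
                #   from sklearn.metrics import silhouette_score
                #   candidate_ks = range(2, min(8, len(embeddings_array)))
                #   n_clusters = max(candidate_ks, key=lambda k: silhouette_score(
                #       embeddings_array,
                #       KMeans(n_clusters=k, random_state=42).fit_predict(embeddings_array)))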
                kmeans = KMeans(n_clusters=n_clusters, random_state=42)
                kmeans.fit(embeddings_array)
                labels = kmeans.labels_

                # Analyze cluster sizes
                cluster_counts = Counter(labels)

                # Assign a cluster-based score to each page based on cluster size
                cluster_sizes = dict(cluster_counts)
                page_cluster_scores = []
                for label in labels:
                    score = cluster_sizes[label] / len(embeddings_array)  # Fraction of pages in the cluster
                    page_cluster_scores.append(score)
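                # Interpretation: a ClusterScore near 1.0 means the page falls in the
                # dominant topical cluster; smaller values flag pages in minority clusters.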

                # Create a DataFrame with the desired columns
                df = pd.DataFrame({
                    'URL': valid_urls,
                    'PageSiteSimilarity': similarities,
                    'ClusterLabel': labels,
                    'ClusterScore': page_cluster_scores
                })

                # Store results in session state
                st.session_state.results = df
                st.session_state.processing_complete = True

                # Display the DataFrame
                st.write("URL Analysis Results")
                st.dataframe(df)

                # Option to download the results as CSV
                csv = df.to_csv(index=False)
                st.download_button(label="Download data as CSV", data=csv, file_name='url_analysis_results.csv', mime='text/csv')
            except ValueError as ve:
                st.error(f"KMeans error: {ve}. Try using a smaller number of clusters.")
        else:
            st.warning("Not enough URLs to perform clustering. Need at least 2 samples.")
    else:
        st.warning("No valid embeddings were generated.")

    # If there are any error URLs, show them
    if error_urls:
        st.write("The following URLs encountered errors and were not processed:")
        error_df = pd.DataFrame(error_urls, columns=["URL", "Error"])
        st.dataframe(error_df)
else:
    # Display stored results from a previous run, if any
    if st.session_state.processing_complete and st.session_state.results is not None:
        st.write("URL Analysis Results")
        st.dataframe(st.session_state.results)

        # Option to download the results as CSV
        csv = st.session_state.results.to_csv(index=False)
        st.download_button(label="Download data as CSV", data=csv, file_name='url_analysis_results.csv', mime='text/csv')
    else:
        st.info("Click 'Run Analysis' to start the process.")