ipullrank committed
Commit
5350119
·
1 Parent(s): e5b0b32

Adding to HF Spaces

Files changed (3)
  1. README.md +2 -13
  2. app.py +163 -0
  3. requirements.txt +11 -0
README.md CHANGED
@@ -1,13 +1,2 @@
- ---
- title: Site Focus Calculator
- emoji: 📊
- colorFrom: green
- colorTo: yellow
- sdk: streamlit
- sdk_version: 1.40.1
- app_file: app.py
- pinned: false
- license: apache-2.0
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # sitefocus-sl
+ Streamlit app for Site Focus Score
app.py ADDED
@@ -0,0 +1,163 @@
+ # Required libraries are installed via requirements.txt on Hugging Face Spaces
+ # (locally: pip install requests trafilatura sentence-transformers numpy torch tqdm scikit-learn pandas advertools streamlit)
+
+ import streamlit as st
+ import requests
+ import trafilatura
+ from sentence_transformers import SentenceTransformer, util
+ import numpy as np
+ import pandas as pd
+ import advertools as adv
+ from sklearn.cluster import KMeans
+ from collections import Counter
+
+ # Title of the app
+ st.title("Site Focus Calculator")
+ st.write("A tool for calculating the site focus score of a website or a series of URLs.")
+
+ # Load the embedding model
+ model = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")
+ # Alternatives:
+ # model = SentenceTransformer("nomic-ai/nomic-embed-text-v1.5", trust_remote_code=True)
+ # model = SentenceTransformer("mixedbread-ai/mxbai-embed-xsmall-v1")
+
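+ # A possible refinement (not in this commit): cache the model across Streamlit
+ # reruns so it is not reloaded on every interaction, e.g.
+ #   @st.cache_resource
+ #   def load_model():
+ #       return SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")
+ #   model = load_model()
+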
+ # Input fields for a sitemap or a list of URLs (separated by newlines)
+ sitemap_url = st.text_input("Enter your XML sitemap URL (optional)", "")
+ url_list_input = st.text_area("Enter a list of URLs (separated by newlines, optional)", "")
+
+ # A "Run" button triggers the URL processing
+ if st.button("Run Analysis"):
+     # Process either the sitemap or the URL list
+     urls = []
+     if sitemap_url:
+         st.write("Fetching URLs from the sitemap...")
+         # Read the sitemap and extract URLs using advertools
+         sitemap_df = adv.sitemap_to_df(sitemap_url)
+         urls = sitemap_df['loc'].tolist()
+         # urls = urls[:50]  # Limit to the first 50 URLs for testing purposes
+         st.write(f"Processing {len(urls)} URLs from sitemap.")
+     elif url_list_input:
+         # Parse the URL list from the input (newline-separated)
+         urls = [url.strip() for url in url_list_input.split('\n') if url.strip()]
+         st.write(f"Processing {len(urls)} URLs from the input list.")
+     else:
+         st.warning("Please provide either a sitemap URL or a list of URLs.")
+
+     # Function to get embeddings
+     def get_embedding(text):
+         """Generate an embedding for the given text using the mxbai-embed-large-v1 model."""
+         prompt = "Represent this sentence for searching relevant passages: " + text
+         embedding = model.encode(prompt)
+         return embedding
+
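+     # A possible refinement (not part of the code above): the "Represent this
+     # sentence..." prefix is the model's retrieval-query prompt; plain page text
+     # can also be embedded without it, and batching all texts in one call, e.g.
+     #   embeddings = model.encode(texts, show_progress_bar=True)
+     # is much faster than encoding one page at a time.
+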
+     # Initialize lists to store embeddings and corresponding URLs
+     embeddings = []
+     valid_urls = []
+     extracted_texts = []
+     error_urls = []
+
+     # Define request headers with a browser User-Agent
+     headers = {
+         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
+                       'AppleWebKit/537.36 (KHTML, like Gecko) '
+                       'Chrome/58.0.3029.110 Safari/537.3'
+     }
+
+     # Only process if URLs are provided
+     if urls:
+         st.write("Processing URLs...")
+         for url in urls:
+             try:
+                 response = requests.get(url, headers=headers, timeout=10)
+                 if response.status_code == 200:
+                     html_content = response.text
+                     extracted_text = trafilatura.extract(html_content)
+                     if extracted_text:
+                         embedding = get_embedding(extracted_text)
+                         embeddings.append(embedding)
+                         valid_urls.append(url)
+                         extracted_texts.append(extracted_text)
+                     else:
+                         error_urls.append((url, "No content extracted"))
+                 else:
+                     error_urls.append((url, f"Status code {response.status_code}"))
+             except Exception as e:
+                 error_urls.append((url, f"Error: {str(e)}"))
+
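+         # Note: pages are fetched sequentially; for large sitemaps, concurrent
+         # fetching (e.g. via concurrent.futures.ThreadPoolExecutor) would be a
+         # straightforward speedup, though it is not implemented here.
+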
+         # Check if we have any valid embeddings
+         if embeddings:
+             # Stack embeddings into a single array
+             embeddings_array = np.vstack(embeddings)
+
+             # Compute the site embedding by averaging all page embeddings
+             site_embedding = np.mean(embeddings_array, axis=0)
+
+             # Compute cosine similarity between each page embedding and the site embedding
+             similarities = util.cos_sim(embeddings_array, site_embedding)
+             similarities = similarities.numpy().flatten()
+
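+             # The site embedding is the centroid of all page embeddings, so
+             # PageSiteSimilarity measures how close each page sits to the site's
+             # overall topical center (higher = more on-topic for this site).
+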
+             # Calculate pairwise cosine similarities for the site focus score
+             pairwise_similarities = []
+             for i in range(len(embeddings_array)):
+                 for j in range(i + 1, len(embeddings_array)):
+                     sim = util.cos_sim(embeddings_array[i], embeddings_array[j]).item()
+                     pairwise_similarities.append(sim)
+
+             # The site focus score is the average pairwise similarity
+             if pairwise_similarities:
+                 site_focus_score = sum(pairwise_similarities) / len(pairwise_similarities)
+             else:
+                 site_focus_score = 0.0
+
+             st.write(f"Site Focus Score: {site_focus_score:.4f}")
+
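+             # An equivalent vectorized form (a sketch, assuming the same inputs):
+             #   sim_matrix = util.cos_sim(embeddings_array, embeddings_array).numpy()
+             #   iu = np.triu_indices(len(embeddings_array), k=1)
+             #   site_focus_score = float(sim_matrix[iu].mean())
+             # This avoids the O(n^2) Python loop and scales better to large sitemaps.
+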
+             # Perform KMeans clustering if there are enough samples
+             if len(embeddings_array) >= 2:
+                 try:
+                     n_clusters = 2  # Adjust the number of clusters as needed
+                     kmeans = KMeans(n_clusters=n_clusters, random_state=42)
+                     kmeans.fit(embeddings_array)
+                     labels = kmeans.labels_
+
+                     # Analyze cluster sizes
+                     cluster_counts = Counter(labels)
+
+                     # Assign each page a cluster-based score from its cluster's size
+                     cluster_sizes = dict(cluster_counts)
+                     page_cluster_scores = []
+                     for label in labels:
+                         score = cluster_sizes[label] / len(embeddings_array)  # Fraction of pages in the cluster
+                         page_cluster_scores.append(score)
+
+                     # Create a DataFrame with the desired columns
+                     df = pd.DataFrame({
+                         'URL': valid_urls,
+                         'PageSiteSimilarity': similarities,
+                         'ClusterLabel': labels,
+                         'ClusterScore': page_cluster_scores
+                     })
+
+                     # Display the DataFrame
+                     st.write("URL Analysis Results")
+                     st.dataframe(df)
+
+                     # Option to download the results as CSV
+                     csv = df.to_csv(index=False)
+                     st.download_button(label="Download data as CSV", data=csv, file_name='url_analysis_results.csv', mime='text/csv')
+                 except ValueError as ve:
+                     st.error(f"KMeans error: {ve}. Try using a smaller number of clusters.")
+             else:
+                 st.warning("Not enough URLs to perform clustering. Need at least 2 samples.")
+         else:
+             st.warning("No valid embeddings were generated.")
+
+     # If any URLs errored, show them
+     if error_urls:
+         st.write("The following URLs encountered errors and were not processed:")
+         error_df = pd.DataFrame(error_urls, columns=["URL", "Error"])
+         st.dataframe(error_df)
+ else:
+     st.info("Click 'Run Analysis' to start the process.")
requirements.txt ADDED
@@ -0,0 +1,11 @@
+ streamlit
+ requests
+ trafilatura
+ sentence-transformers
+ numpy
+ torch
+ tqdm
+ scikit-learn
+ pandas
+ advertools
+ einops