poemsforaphrodite
committed on
Upload folder using huggingface_hub
Browse files- .gitignore +1 -0
- README.md +11 -0
- app.py +383 -0
- links.txt +223 -0
- requirements.txt +8 -0
- scrape.py +104 -0
- upsert.py +128 -0
.gitignore
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
.env
|
README.md
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
title: RAG Chat
|
3 |
+
emoji: ⚡
|
4 |
+
colorFrom: purple
|
5 |
+
colorTo: blue
|
6 |
+
sdk: streamlit
|
7 |
+
sdk_version: 1.37.1
|
8 |
+
app_file: app.py
|
9 |
+
pinned: false
|
10 |
+
license: mit
|
11 |
+
---
|
app.py
ADDED
@@ -0,0 +1,383 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import streamlit as st
|
3 |
+
from openai import OpenAI
|
4 |
+
from PyPDF2 import PdfReader
|
5 |
+
import requests
|
6 |
+
from youtube_transcript_api import YouTubeTranscriptApi
|
7 |
+
from urllib.parse import urlparse, parse_qs
|
8 |
+
from pinecone import Pinecone
|
9 |
+
import uuid
|
10 |
+
from dotenv import load_dotenv
|
11 |
+
import time
|
12 |
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
13 |
+
from bs4 import BeautifulSoup
|
14 |
+
from selenium import webdriver
|
15 |
+
from selenium.webdriver.chrome.service import Service
|
16 |
+
from webdriver_manager.chrome import ChromeDriverManager
|
17 |
+
from selenium.webdriver.chrome.options import Options
|
18 |
+
import time
|
19 |
+
import re
|
20 |
+
from pymongo import MongoClient
|
21 |
+
from pymongo.errors import ConnectionFailure
|
22 |
+
from datetime import datetime
|
23 |
+
|
24 |
+
# Set page config at the very beginning
st.set_page_config(layout="wide")

# Load environment variables
load_dotenv()

# Set up OpenAI client. This name is used by get_embedding(),
# general_conversation() and chat_with_ai(), so nothing else may rebind it.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Set up Pinecone
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

index_name = "lyca"  # Your index name
index = pc.Index(index_name)

# Set up MongoDB connection.
# sim_swap_collection stays None whenever MongoDB is unavailable;
# process_sim_swap_workflow() checks for that before inserting.
sim_swap_collection = None
mongo_uri = os.getenv("MONGODB_URI")
if not mongo_uri:
    st.error("MONGODB_URI is not set. Please check your .env file.")
else:
    print(f"MONGO_URI loaded: {mongo_uri[:10]}...")  # Print only first 10 chars for security

    try:
        # Use a dedicated name: the original code reused `client` here, which
        # replaced the OpenAI client and broke every embeddings/chat call.
        mongo_client = MongoClient(mongo_uri, serverSelectionTimeoutMS=5000)
        mongo_client.server_info()  # This will raise an exception if the connection fails
        db = mongo_client['lyca']
        sim_swap_collection = db['sim_swap_requests']
    except ConnectionFailure:
        st.error("Failed to connect to MongoDB. Please check your connection and try again later.")
        sim_swap_collection = None
|
55 |
+
|
56 |
+
def get_embedding(text):
    """Return the embedding of ``text`` from the ``text-embedding-3-large`` model.

    The request goes through the module-level ``client``; the embedding of
    the first (and only) returned item is handed back to the caller.
    """
    embedding_response = client.embeddings.create(
        model="text-embedding-3-large",
        input=text,
    )
    first_item = embedding_response.data[0]
    return first_item.embedding
|
59 |
+
|
60 |
+
def process_pdf(file):
    """Extract the text of every page of a PDF, one newline after each page.

    ``file`` is passed straight to ``PdfReader``. The result is the
    concatenation of each page's extracted text followed by ``"\\n"``.
    """
    pdf = PdfReader(file)
    return "".join(page.extract_text() + "\n" for page in pdf.pages)
|
66 |
+
|
67 |
+
def process_web_link(url):
    """Render ``url`` in headless Chrome and return its visible text.

    Script and style elements are removed before the text is extracted, and
    blank fragments are dropped. On any failure the error is printed and a
    string of the form ``"Error processing <url>: <message>"`` is returned
    instead of raising (callers receive a string either way).
    """
    try:
        # Set up Selenium options
        chrome_options = Options()
        chrome_options.add_argument("--headless")  # Run in headless mode for performance
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")

        # Install the Chrome driver automatically using webdriver-manager
        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

        try:
            # Navigate to the URL
            driver.get(url)

            # Give the page some time to load fully
            time.sleep(3)

            # Extract the rendered page's content
            page_source = driver.page_source
        finally:
            # Always close the browser, even when navigation fails; otherwise
            # every failed URL leaves a Chrome process running.
            driver.quit()

        # Parse the page content using BeautifulSoup
        soup = BeautifulSoup(page_source, 'lxml')

        # Remove script and style elements
        for script in soup(["script", "style"]):
            script.decompose()

        # Get text
        text = soup.get_text()

        # Clean up the text
        # NOTE(review): splitting on a single space puts each word on its own
        # line; the usual form of this recipe splits on a double space.
        # Confirm which was intended before changing it.
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
        text = '\n'.join(chunk for chunk in chunks if chunk)

        return text
    except Exception as e:
        print(f"Error processing web link {url}: {str(e)}")
        return f"Error processing {url}: {str(e)}"
|
109 |
+
|
110 |
+
def process_youtube_link(url):
    """Return the transcript of the YouTube video at ``url`` as one string.

    The transcript entries' ``text`` fields are joined with single spaces.

    Raises:
        ValueError: if no video id can be extracted from ``url``.
    """
    video_id = extract_video_id(url)
    if not video_id:
        # Fail with a clear message instead of handing None to the transcript API.
        raise ValueError(f"Could not extract a YouTube video id from: {url}")
    transcript = YouTubeTranscriptApi.get_transcript(video_id)
    return " ".join(entry['text'] for entry in transcript)
|
114 |
+
|
115 |
+
def extract_video_id(url):
    """Extract the video id from a YouTube URL.

    Supported forms are ``youtu.be/<id>``, ``/watch?v=<id>``, ``/embed/<id>``,
    ``/v/<id>``, ``/shorts/<id>`` and ``/live/<id>`` on the youtube.com hosts
    listed below.

    Returns:
        The video id, or ``None`` when the URL is not a recognised YouTube
        URL or carries no id (e.g. ``/watch`` without a ``v`` parameter).
    """
    parsed_url = urlparse(url)
    host = parsed_url.hostname
    path = parsed_url.path

    if host == 'youtu.be':
        return path.lstrip('/').split('/')[0] or None

    if host in ('www.youtube.com', 'youtube.com', 'm.youtube.com', 'music.youtube.com'):
        if path == '/watch':
            # .get avoids a KeyError when the 'v' parameter is missing.
            return parse_qs(parsed_url.query).get('v', [None])[0]
        for prefix in ('/embed/', '/v/', '/shorts/', '/live/'):
            if path.startswith(prefix):
                return path[len(prefix):].split('/')[0] or None

    return None
|
127 |
+
|
128 |
+
def process_upload(upload_type, file_or_link, file_name=None):
    """Extract, chunk, embed and index one document in Pinecone.

    Args:
        upload_type: ``"PDF"``, ``"Web Link"`` or ``"YouTube Link"``.
        file_or_link: the PDF file object or the URL, depending on the type.
        file_name: display name for PDFs; defaults to ``"Uploaded PDF"``.

    Returns:
        A status message string. Unknown upload types and documents with no
        extractable text return a message without touching the index.
    """
    print(f"Starting process_upload for {upload_type}")
    doc_id = str(uuid.uuid4())
    print(f"Generated doc_id: {doc_id}")

    if upload_type == "PDF":
        content = process_pdf(file_or_link)
        doc_name = file_name or "Uploaded PDF"
    elif upload_type == "Web Link":
        content = process_web_link(file_or_link)
        doc_name = file_or_link
    elif upload_type == "YouTube Link":
        content = process_youtube_link(file_or_link)
        doc_name = f"YouTube: {file_or_link}"
    else:
        print("Invalid upload type")
        return "Invalid upload type"

    content_length = len(content)
    print(f"Content extracted, length: {content_length}")

    if not content.strip():
        # Nothing to embed: skip the embedding calls and the empty upsert.
        print("No content extracted; nothing to index")
        return f"No content could be extracted for {upload_type}. Document Name: {doc_name}"

    # Dynamically adjust chunk size based on content length
    if content_length < 10000:
        chunk_size = 1000
    elif content_length < 100000:
        chunk_size = 2000
    else:
        chunk_size = 4000
    print(f"Using chunk size: {chunk_size}")

    chunks = [content[i:i+chunk_size] for i in range(0, content_length, chunk_size)]

    # The progress bar is optional: callers such as load_lyca_mobile_data()
    # never create one, and a missing session-state attribute would raise.
    progress_bar = st.session_state.get("upload_progress")

    vectors = []
    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(process_chunk, chunk, doc_id, i, upload_type, doc_name) for i, chunk in enumerate(chunks)]

        for future in as_completed(futures):
            vectors.append(future.result())
            # Update progress
            if progress_bar is not None:
                progress_bar.progress(len(vectors) / len(chunks))

    print(f"Generated {len(vectors)} vectors")

    # Upsert in small batches so large documents stay under Pinecone's
    # per-request size limit.
    upsert_batch_size = 50
    for start in range(0, len(vectors), upsert_batch_size):
        index.upsert(vectors=vectors[start:start + upsert_batch_size])
    print("Vectors upserted to Pinecone")

    return f"Processing complete for {upload_type}. Document Name: {doc_name}"
|
176 |
+
|
177 |
+
def process_chunk(chunk, doc_id, i, upload_type, doc_name):
    """Embed one text chunk and package it as an ``(id, embedding, metadata)`` tuple.

    The id is ``"<doc_id>_<i>"``; the metadata carries the chunk text, the
    upload type, the document id and name, and the chunk index.
    """
    vector = get_embedding(chunk)
    vector_id = f"{doc_id}_{i}"
    metadata = {
        "text": chunk,
        "type": upload_type,
        "doc_id": doc_id,
        "doc_name": doc_name,
        "chunk_index": i,
    }
    return (vector_id, vector, metadata)
|
186 |
+
|
187 |
+
def get_relevant_context(query, top_k=5):
    """Query the index for ``query`` and return ``(context, matches)``.

    The matches are ordered by ``(doc_id, chunk_index)`` rather than by
    score, and ``context`` is their metadata text joined with newlines.
    """
    print(f"Getting relevant context for query: {query}")
    embedded_query = get_embedding(query)

    response = index.query(vector=embedded_query, top_k=top_k, include_metadata=True)
    matches = response['matches']
    print(f"Found {len(matches)} relevant results")

    def document_position(match):
        # Keeps chunks of the same document together and in reading order.
        meta = match['metadata']
        return (meta['doc_id'], meta['chunk_index'])

    ordered_matches = sorted(matches, key=document_position)

    context = "\n".join(match['metadata']['text'] for match in ordered_matches)
    return context, ordered_matches
|
199 |
+
|
200 |
+
def check_lyca_data_loaded():
    """Return True when the index reports at least one stored vector."""
    vector_count = index.describe_index_stats()['total_vector_count']
    return vector_count > 0
|
204 |
+
|
205 |
+
def load_lyca_mobile_data():
    """Index every URL listed in ``links.txt`` unless the index already has data.

    Blank lines are ignored and each distinct URL is processed once, in
    first-seen order, so repeated entries in the file do not produce
    duplicate vectors.

    Returns:
        A status message string.
    """
    if check_lyca_data_loaded():
        return "Lyca Mobile data is already loaded."

    # The context manager closes the file; the original left the handle open.
    with open('links.txt', 'r', encoding='utf-8') as links_file:
        stripped_lines = (line.strip() for line in links_file)
        # dict.fromkeys de-duplicates while preserving order.
        lyca_links = list(dict.fromkeys(link for link in stripped_lines if link))

    for link in lyca_links:
        process_upload("Web Link", link)
    return "Lyca Mobile data loaded into vector database"
|
213 |
+
|
214 |
+
def general_conversation(message):
    """Answer ``message`` with ``gpt-4o-mini`` using only the generic system prompt.

    No retrieved context is included; the text of the first completion
    choice is returned.
    """
    conversation = [
        {"role": "system", "content": "You are a helpful assistant for Lyca Mobile customers. If you don't know the answer, politely say so."},
        {"role": "user", "content": message},
    ]
    completion = client.chat.completions.create(model="gpt-4o-mini", messages=conversation)
    first_choice = completion.choices[0]
    return first_choice.message.content
|
223 |
+
|
224 |
+
def is_sim_swap_request(message):
    """Return True when ``message`` looks like a request to swap or replace a SIM.

    The message must mention the word "sim" together with at least one
    action word (swap, change, new, replace). Matching is case-insensitive
    and ignores punctuation, so "Replace my SIM, please." matches while
    "change to a new plan" (no "sim") does not.
    """
    action_keywords = {'swap', 'change', 'new', 'replace'}
    # Tokenise on alphanumeric runs so punctuation attached to a word
    # ("sim?", "SIM,") cannot hide a keyword.
    message_words = set(re.findall(r"[a-z0-9]+", message.lower()))
    return 'sim' in message_words and bool(action_keywords & message_words)


# Add a print statement for debugging
print(f"is_sim_swap_request result: {is_sim_swap_request('how to change my sim?')}")
|
233 |
+
|
234 |
+
def trigger_sim_swap_workflow():
    """Switch the session into the SIM swap workflow, starting at step 0."""
    session = st.session_state
    session.workflow = 'sim_swap'
    session.workflow_step = 0
|
237 |
+
|
238 |
+
def process_sim_swap_workflow():
    """Render the SIM swap form and store a completed submission in MongoDB.

    On submit, every field must be non-blank; otherwise a warning lists the
    missing fields and nothing is stored. A successful insert clears
    ``st.session_state.workflow``. When ``sim_swap_collection`` is ``None``
    (no database connection) an error is shown instead.
    """
    # Local import: the file's top-level import only brings in `datetime`.
    from datetime import timezone

    st.subheader("SIM Swap Request Form")

    with st.form("sim_swap_form"):
        full_name = st.text_input("Please enter your full name:")
        phone_number = st.text_input("Please enter your phone number:")
        email = st.text_input("Please enter your email address:")
        current_sim = st.text_input("Please enter your current SIM card number:")
        reason = st.text_area("Please enter the reason for SIM swap:")

        submitted = st.form_submit_button("Submit")

        if submitted:
            if sim_swap_collection is None:
                st.error("Unable to process your request due to a database connection issue. Please try again later.")
                return

            user_data = {
                "full_name": full_name.strip(),
                "phone_number": phone_number.strip(),
                "email": email.strip(),
                "current_sim": current_sim.strip(),
                "reason": reason.strip(),
            }

            # Reject blank submissions instead of storing an unusable request.
            missing_fields = [field for field, value in user_data.items() if not value]
            if missing_fields:
                st.warning("Please fill in all fields before submitting. Missing: " + ", ".join(missing_fields))
                return

            # Timezone-aware UTC: PyMongo stores naive datetimes as if they
            # were already UTC, so a naive local time would be recorded wrong.
            user_data["timestamp"] = datetime.now(timezone.utc)

            try:
                sim_swap_collection.insert_one(user_data)
                st.success("Thank you for providing your information. Your SIM swap request has been submitted and stored successfully.")
                st.session_state.workflow = None
            except Exception as e:
                st.error(f"An error occurred while storing your information: {str(e)}")
                st.warning("Please try submitting your request again. If the problem persists, please contact support.")
|
270 |
+
|
271 |
+
def chat_with_ai(message):
    """Answer ``message``, grounding the reply in retrieved context when relevant.

    When the best retrieved match scores at least 0.4, the retrieved context
    is passed to ``gpt-4o-mini`` and the matches are returned as sources.
    Otherwise the reply comes from ``general_conversation`` with no sources.

    Returns:
        ``(ai_response, sources)``. On any exception the error is printed and
        an apology message with an empty source list is returned.
    """
    min_relevance_score = 0.4
    try:
        # get_relevant_context embeds the query itself; the original made an
        # extra, unused embedding request here.
        context, results = get_relevant_context(message)

        # Results are ordered by document position, not by score, so
        # results[0] is not necessarily the best match. Use the maximum.
        best_score = max((result['score'] for result in results), default=0.0)

        if best_score >= min_relevance_score:
            messages = [
                {"role": "system", "content": "You are a helpful assistant for Lyca Mobile. Use the following information to answer the user's question, but don't mention the context directly in your response. If the information isn't in the context, say you don't know."},
                {"role": "system", "content": f"Context: {context}"},
                {"role": "user", "content": message}
            ]

            response = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=messages
            )

            ai_response = response.choices[0].message.content

            sources = [
                {
                    "doc_id": result['metadata']['doc_id'],
                    "doc_name": result['metadata']['doc_name'],
                    "chunk_index": result['metadata']['chunk_index'],
                    "text": result['metadata']['text'],
                    "type": result['metadata']['type'],
                    "score": result['score']
                }
                for result in results
            ]
        else:
            # Fallback to general conversation if no relevant context is found or similarity is low
            ai_response = general_conversation(message)
            sources = []

        return ai_response, sources
    except Exception as e:
        print(f"Error in chat_with_ai: {str(e)}")
        return "I'm sorry, but I encountered an error while processing your request. Please try again later.", []
|
310 |
+
|
311 |
+
def clear_database():
    """Issue a ``delete_all`` against the index and return a confirmation message."""
    print("Clearing database...")
    index.delete(delete_all=True)
    print("Database cleared")
    confirmation = "Database cleared successfully."
    return confirmation
|
316 |
+
|
317 |
+
# Streamlit UI
st.title("Lyca Mobile Assistant")

# Initialise per-session state on the first run only.
if 'workflow' not in st.session_state:
    st.session_state.workflow = None
    st.session_state.workflow_data = []
    st.session_state.workflow_step = 0

if 'chat_history' not in st.session_state:
    st.session_state.chat_history = []

# Two columns: chat on the left (wider), source details on the right.
col1, col2 = st.columns([2, 1])

with col1:
    st.header("Chat")

    if st.session_state.workflow == 'sim_swap':
        process_sim_swap_workflow()
    else:
        # Display chat history
        for message in st.session_state.chat_history:
            st.markdown(f"**{'You' if message['role'] == 'user' else 'AI'}:** {message['content']}")

        user_input = st.text_input("How can I assist you with Lyca Mobile today?")
        if st.button("Send"):
            if user_input:
                # Add debug print
                print(f"User input: {user_input}")
                is_swap_request = is_sim_swap_request(user_input)
                print(f"Is sim swap request: {is_swap_request}")

                if is_swap_request:
                    print("Triggering SIM swap workflow")
                    st.session_state.chat_history.append({"role": "user", "content": user_input})
                    st.session_state.chat_history.append({"role": "assistant", "content": "Certainly! I can help you with changing your SIM. Please fill out the following form to start the SIM swap process."})
                    trigger_sim_swap_workflow()
                    # Rerun immediately so the form is shown now; without this
                    # the page keeps the chat view until the next interaction.
                    st.rerun()
                else:
                    print("Proceeding with regular chat flow")
                    # Regular chat flow: show a progress bar while the answer is produced.
                    chat_progress = st.progress(0)
                    response, sources = chat_with_ai(user_input)
                    chat_progress.progress(1.0)

                    # Add to chat history
                    st.session_state.chat_history.append({"role": "user", "content": user_input})
                    st.session_state.chat_history.append({"role": "assistant", "content": response})

                    # Display the latest messages
                    st.markdown("**You:** " + user_input)
                    st.markdown("**AI:** " + response)

                    # Store sources in session state for display in col2
                    st.session_state.sources = sources
                    chat_progress.empty()
            else:
                st.warning("Please enter a question.")

with col2:
    st.header("Source Information")
    if 'sources' in st.session_state and st.session_state.sources:
        for i, source in enumerate(st.session_state.sources, 1):
            with st.expander(f"Source {i} - {source['type']} ({source['doc_name']})"):
                st.markdown(f"**Chunk Index:** {source['chunk_index']}")
                st.text(source['text'])
    else:
        st.info("Ask a question to see source information here.")
|
links.txt
ADDED
@@ -0,0 +1,223 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
https://www.lycamobile.us/faq/do-you-offer-4g-hspa
|
2 |
+
https://www.lycamobile.us/help/frequently-asked-question
|
3 |
+
https://www.lycamobile.us/help/contact-us
|
4 |
+
https://www.lycamobile.us/en/online-security
|
5 |
+
https://www.lycamobile.us/store-locator
|
6 |
+
https://www.lycamobile.us/international-credit/can-i-schedule-automatic-top-up-of-international-credit-for-my-lyca-mobile-account
|
7 |
+
https://www.lycamobile.us/faq/how-to-unlock-my-handset
|
8 |
+
https://www.lycamobile.us/activate-sim
|
9 |
+
https://www.lycamobile.us/faq/how-do-i-make-an-international-call-with-lyca-mobile
|
10 |
+
https://www.lycamobile.us/international-credit/how-can-i-top-up-international-credit-my-lyca-mobile-number
|
11 |
+
https://www.lycamobile.us/how-to/how-do-i-check-if-a-bundle-is-active-on-my-lycamobile
|
12 |
+
https://www.lycamobile.us/help-support
|
13 |
+
https://www.lycamobile.us/use-of-this-website
|
14 |
+
https://www.lycamobile.us/help/how-to-switch-to-lyca-mobile/
|
15 |
+
https://www.lycamobile.us/california-mts/
|
16 |
+
https://www.lycamobile.us/about-us
|
17 |
+
https://www.lycamobile.us/help/mobile-web-settings/
|
18 |
+
https://www.lycamobile.us/how-to/how-do-i-send-a-text-message-from-the-us-to-another-country-with-lyca-mobile
|
19 |
+
https://www.lycamobile.us/faq/how-do-i-activate-roaming-facility-using-lyca-mobile
|
20 |
+
https://www.lycamobile.us/how-to/how-to-activate-my-new-sim-and-prepay-plan
|
21 |
+
https://www.lycamobile.us/termscondition
|
22 |
+
https://www.lycamobile.us/how-to/how-do-i-make-an-international-call-with-lyca-mobile
|
23 |
+
https://www.lycamobile.us/registration
|
24 |
+
https://www.lycamobile.us/faq/how-much-does-it-cost-to-access-the-voicemail
|
25 |
+
https://www.lycamobile.us/help/wi-fi-calling-and-text/
|
26 |
+
https://www.lycamobile.us/how-to/how-to-retrieve-your-lyca-mobile-number
|
27 |
+
https://www.lycamobile.us/help/order
|
28 |
+
https://www.lycamobile.us/faq/how-do-i-set-up-auto-renewal
|
29 |
+
https://www.lycamobile.us/ios
|
30 |
+
https://www.lycamobile.us/faq/where-can-i-find-my-lyca-mobile-number
|
31 |
+
https://www.lycamobile.us/faq/can-i-send-premium-sms-and-make-premium-calls-using-lyca-mobile
|
32 |
+
https://www.lycamobile.us/plans/prepaid-phone-plans#best-value
|
33 |
+
https://www.lycamobile.us/how-to/how-long-would-it-take-to-swap-my-sim-to-a-plus-sim-card
|
34 |
+
https://www.lycamobile.us/en/
|
35 |
+
https://www.lycamobile.us/how-to/how-do-i-check-my-lycamobile-number-data-and-call
|
36 |
+
https://www.lycamobile.us/help/international-credit
|
37 |
+
https://www.lycamobile.us/plans/prepaid-phone-plans/refill-plans/
|
38 |
+
https://www.lycamobile.us/en/activate-sim/?utm_source=website&utm_medium=onpage&utm_campaign=JoinLyca-EGO_USA_ENG_WCO_GLP_BRND
|
39 |
+
https://www.lycamobile.us/joined-lyca/how-to-switch-to-lyca-mobile
|
40 |
+
https://www.lycamobile.us/about-to-join-lyca-mobile/what-information-do-i-need-to-provide
|
41 |
+
https://www.lycamobile.us/help/how-to
|
42 |
+
https://www.lycamobile.us/faq/i-have-not-used-my-lyca-mobile-for-a-while-and-it-has-now-stopped-working-why-is-this
|
43 |
+
https://www.lycamobile.us/en/lia-chat
|
44 |
+
https://www.lycamobile.us/2g_shutdown/
|
45 |
+
https://www.lycamobile.us/en/cookie-policy
|
46 |
+
https://www.lycamobile.us/help/data-add-on
|
47 |
+
https://www.lycamobile.us/faq/i-have-forgotten-my-pin-puk-code-where-can-i-find-it
|
48 |
+
https://www.lycamobile.us/joined-lyca/can-i-retain-my-current-mobile-number
|
49 |
+
https://www.lycamobile.us/help/esim
|
50 |
+
https://www.lycamobile.us/help/portin-status
|
51 |
+
https://www.lycamobile.us/faq/how-can-i-stop-receiving-unwanted-texts-or-spam
|
52 |
+
https://www.lycamobile.us/en/plans/prepay-plans/?utm_source=website&utm_medium=onpage&utm_campaign=JoinLyca-EGO_USA_ENG_WCO_GLP_BRND#best-value
|
53 |
+
https://www.lycamobile.us/international-credit/can-i-transfer-my-top-up-balance-to-another-customer
|
54 |
+
https://www.lycamobile.us/blog/en/
|
55 |
+
https://www.lycamobile.us/faq/what-is-my-lyca-mobile
|
56 |
+
https://www.lycamobile.us/faq/how-long-would-it-take-to-swap-my-sim-to-a-plus-sim-card
|
57 |
+
https://www.lycamobile.us/faq/do-i-have-to-sign-a-contract
|
58 |
+
https://www.lycamobile.us/plans/prepaid-phone-plans/refill-plans
|
59 |
+
https://www.lycamobile.us/plans/prepaid-phone-plans#30-days-plans
|
60 |
+
https://www.lycamobile.us/faq/does-lyca-mobile-charge-taxes-on-my-order
|
61 |
+
https://www.lycamobile.us/faq/how-do-i-send-a-text-message-from-the-us-to-another-country-with-lyca-mobile
|
62 |
+
https://www.lycamobile.us/already-joined-us/how-to-enable-auto-renewal
|
63 |
+
https://www.lycamobile.us/android
|
64 |
+
https://www.lycamobile.us/port-in/
|
65 |
+
https://www.lycamobile.us/become-a-retailer
|
66 |
+
https://www.lycamobile.us/joined-lyca/can-i-top-up-online-my-lyca-mobile
|
67 |
+
https://www.lycamobile.us/international-credit/how-can-i-check-my-balance-after-topping-up
|
68 |
+
https://www.lycamobile.us/how-to/how-to-check-balance-of-my-plan-allowance
|
69 |
+
https://www.lycamobile.us/california-billing-notice
|
70 |
+
https://www.lycamobile.us/en/freesim/?utm_source=website&utm_medium=onpage&utm_campaign=JoinLyca-EGO_USA_ENG_WCO_GLP_BRND
|
71 |
+
https://www.lycamobile.us/faq/are-there-any-monthly-or-hidden-charges
|
72 |
+
https://www.lycamobile.us/en/activate-sim/?utm_source=website&utm_medium=onpage&utm_campaign=HelpFAQ-EGO_USA_ENG_WCO_GLP_BRND
|
73 |
+
https://www.lycamobile.us/help/general-faq
|
74 |
+
https://www.lycamobile.us/faq/does-lyca-mobile-offer-group-porting/multi-subscription
|
75 |
+
https://www.lycamobile.us/international-credit/how-can-i-view-my-international-credit-top-up-history-and-transactions
|
76 |
+
https://www.lycamobile.us/port-in
|
77 |
+
https://www.lycamobile.us/help/frequently-asked-question/
|
78 |
+
https://www.lycamobile.us/help/renewal
|
79 |
+
https://www.lycamobile.us/faq/what-different-plans-do-you-offer
|
80 |
+
https://www.lycamobile.us/cheap_call/cheap-calls-to-india
|
81 |
+
https://www.lycamobile.us/help/data-allowance
|
82 |
+
https://www.lycamobile.us/help/4g-coverage-and-services
|
83 |
+
https://www.lycamobile.us/en/lia-chat/
|
84 |
+
https://www.lycamobile.us/en/registration/?utm_source=website&utm_medium=onpage&utm_campaign=HelpFAQ-EGO_USA_ENG_WCO_GLP_BRND
|
85 |
+
https://www.lycamobile.us/how-to/do-you-have-any-monthly-or-hidden-fees
|
86 |
+
https://www.lycamobile.us/about-to-join-lyca-mobile/do-i-need-to-sign-a-contract-for-a-prepay
|
87 |
+
https://www.lycamobile.us/en/?utm_source=website&utm_medium=onpage&utm_campaign=JoinedLyca-EGO_USA_ENG_WCO_GLP_BRND
|
88 |
+
https://www.lycamobile.us/quick-top-up
|
89 |
+
https://www.lycamobile.us/about-to-join-lyca-mobile/how-to-order-a-lyca-mobile-sim
|
90 |
+
https://www.lycamobile.us/joined-lyca/what-is-the-process-for-obtaining-my-lyca-mobile
|
91 |
+
https://www.lycamobile.us/how-to/how-to-check-lycamobile-internet-data-balance
|
92 |
+
https://www.lycamobile.us/faq/how-can-i-deactivate-my-voicemail-service
|
93 |
+
https://www.lycamobile.us/faq/is-this-website-accessible
|
94 |
+
https://www.lycamobile.us/how-to/how-can-i-deactivate-my-voicemail-service
|
95 |
+
https://www.lycamobile.us/faq/why-is-there-a-flashing-envelope-on-my-cell
|
96 |
+
https://www.lycamobile.us/help/joined-lyca
|
97 |
+
https://www.lycamobile.us/international-credit/what-do-i-do-if-i-face-issues-while-topping-up-international-credit-online
|
98 |
+
https://www.lycamobile.us/freesim
|
99 |
+
https://www.lycamobile.us/international-credit/is-there-a-minimum-top-up-amount-for-my-prepay-number
|
100 |
+
https://www.lycamobile.us/help-support/
|
101 |
+
https://www.lycamobile.us/help/about-to-join-lyca-mobile
|
102 |
+
https://www.lycamobile.us/become-a-retailer/
|
103 |
+
https://www.lycamobile.us/how-to/i-have-not-used-my-lycamobile-for-a-while-and-it-has-now-stopped-working-why-is-this
|
104 |
+
https://www.lycamobile.us/international-credit/what-does-international-credit-addon-mean
|
105 |
+
https://www.lycamobile.us/plans/prepaid-phone-plans#long-term-plans
|
106 |
+
https://www.lycamobile.us/en/activate-sim/
|
107 |
+
https://www.lycamobile.us/about-to-join-lyca-mobile/is-it-mandatory-to-order-lyca-mobile
|
108 |
+
https://www.lycamobile.us/rates/national
|
109 |
+
https://www.lycamobile.us/faq/how-do-i-contact-someone-if-i-have-a-problem
|
110 |
+
https://www.lycamobile.us/studentbeans/
|
111 |
+
https://www.lycamobile.us/privacy-policy
|
112 |
+
https://www.lycamobile.us/en/activate-sim/?utm_source=website&utm_medium=onpage&utm_campaign=JoinedLyca-EGO_USA_ENG_WCO_GLP_BRND
|
113 |
+
https://www.lycamobile.us/help/lycamobile.co.uk
|
114 |
+
https://www.lycamobile.us/activate-plan/
|
115 |
+
https://www.lycamobile.us/help/sms-notifications
|
116 |
+
https://www.lycamobile.us/joined-lyca/how-to-activate-your-pay-as-you-go-sim
|
117 |
+
https://www.lycamobile.us/help/how-to-switch-to-lyca-mobile
|
118 |
+
https://www.lycamobile.us/how-to/how-to-activate-mobile-internet-on-my-phone
|
119 |
+
https://www.lycamobile.us/help/General
|
120 |
+
https://www.lycamobile.us/faq/how-to-activate-my-sim-card
|
121 |
+
https://www.lycamobile.us/faq/my-sim-card-is-lost-stolen-how-do-i-prevent-someone-else-from-using-it
|
122 |
+
https://www.lycamobile.us/how-to/how-can-i-do-a-quick-recharge-or-refill-my-lyca-prepay-number-international-calling-credit
|
123 |
+
https://www.lycamobile.us/en/quick-top-up/?utm_source=website&utm_medium=onpage&utm_campaign=JoinedLyca-EGO_USA_ENG_WCO_GLP_BRND
|
124 |
+
https://www.lycamobile.us/plans/buy-a-additional-line
|
125 |
+
https://www.lycamobile.us/help/already-joined-us
|
126 |
+
https://www.lycamobile.us/plan-changes-update/
|
127 |
+
https://www.lycamobile.us/already-joined-us/how-to-manage-your-saved-credit-debit-cards
|
128 |
+
https://www.lycamobile.us/faq/which-mobile-handsets-can-i-use-with-lyca-mobile
|
129 |
+
https://www.lycamobile.us/plans/prepaid-phone-plans#best-value
|
130 |
+
https://www.lycamobile.us/help/contact-us
|
131 |
+
https://www.lycamobile.us/ios
|
132 |
+
https://www.lycamobile.us/help/how-to-switch-to-lyca-mobile
|
133 |
+
https://www.lycamobile.us/cheap_call/cheap-calls-to-india
|
134 |
+
https://www.lycamobile.us/store-locator
|
135 |
+
https://www.lycamobile.us/help/portin-status
|
136 |
+
https://www.lycamobile.us/quick-top-up
|
137 |
+
https://www.lycamobile.us/termscondition
|
138 |
+
https://www.lycamobile.us/activate-plan/
|
139 |
+
https://www.lycamobile.us/become-a-retailer/
|
140 |
+
https://www.lycamobile.us/activate-sim
|
141 |
+
https://www.lycamobile.us/help-support
|
142 |
+
https://www.lycamobile.us/help/mobile-web-settings/
|
143 |
+
https://www.lycamobile.us/en/cookie-policy
|
144 |
+
https://www.lycamobile.us/rates/national
|
145 |
+
https://www.lycamobile.us/plans/prepaid-phone-plans#30-days-plans
|
146 |
+
https://www.lycamobile.us/use-of-this-website
|
147 |
+
https://www.lycamobile.us/registration
|
148 |
+
https://www.lycamobile.us/plans/prepaid-phone-plans/refill-plans/
|
149 |
+
https://www.lycamobile.us/california-mts/
|
150 |
+
https://www.lycamobile.us/en/activate-sim/
|
151 |
+
https://www.lycamobile.us/en/lia-chat/
|
152 |
+
https://www.lycamobile.us/en/online-security
|
153 |
+
https://www.lycamobile.us/studentbeans/
|
154 |
+
https://www.lycamobile.us/california-billing-notice
|
155 |
+
https://www.lycamobile.us/become-a-retailer
|
156 |
+
https://www.lycamobile.us/plan-changes-update/
|
157 |
+
https://www.lycamobile.us/2g_shutdown/
|
158 |
+
https://www.lycamobile.us/android
|
159 |
+
https://www.lycamobile.us/port-in
|
160 |
+
https://www.lycamobile.us/en/lia-chat
|
161 |
+
https://www.lycamobile.us/help/frequently-asked-question
|
162 |
+
https://www.lycamobile.us/freesim
|
163 |
+
https://www.lycamobile.us/help/4g-coverage-and-services
|
164 |
+
https://www.lycamobile.us/plans/prepaid-phone-plans#long-term-plans
|
165 |
+
https://www.lycamobile.us/blog/en/
|
166 |
+
https://www.lycamobile.us/about-us
|
167 |
+
https://www.lycamobile.us/help/frequently-asked-question/
|
168 |
+
https://www.lycamobile.us/privacy-policy
|
169 |
+
https://www.lycamobile.us/become-a-retailer/
|
170 |
+
https://www.lycamobile.us/help/General
|
171 |
+
https://www.lycamobile.us/help/data-add-on
|
172 |
+
https://www.lycamobile.us/blog/en/
|
173 |
+
https://www.lycamobile.us/activate-sim
|
174 |
+
https://www.lycamobile.us/studentbeans/
|
175 |
+
https://www.lycamobile.us/use-of-this-website
|
176 |
+
https://www.lycamobile.us/help/portin-status
|
177 |
+
https://www.lycamobile.us/california-billing-notice
|
178 |
+
https://www.lycamobile.us/california-mts/
|
179 |
+
https://www.lycamobile.us/help/about-to-join-lyca-mobile
|
180 |
+
https://www.lycamobile.us/en/cookie-policy
|
181 |
+
https://www.lycamobile.us/help/mobile-web-settings/
|
182 |
+
https://www.lycamobile.us/quick-top-up
|
183 |
+
https://www.lycamobile.us/registration
|
184 |
+
https://www.lycamobile.us/about-to-join-lyca-mobile/do-i-need-to-sign-a-contract-for-a-prepay
|
185 |
+
https://www.lycamobile.us/help/contact-us
|
186 |
+
https://www.lycamobile.us/activate-plan/
|
187 |
+
https://www.lycamobile.us/help/data-allowance
|
188 |
+
https://www.lycamobile.us/about-to-join-lyca-mobile/how-to-order-a-lyca-mobile-sim
|
189 |
+
https://www.lycamobile.us/rates/national
|
190 |
+
https://www.lycamobile.us/help/international-credit
|
191 |
+
https://www.lycamobile.us/en/online-security
|
192 |
+
https://www.lycamobile.us/joined-lyca/can-i-top-up-online-my-lyca-mobile
|
193 |
+
https://www.lycamobile.us/android
|
194 |
+
https://www.lycamobile.us/about-us
|
195 |
+
https://www.lycamobile.us/ios
|
196 |
+
https://www.lycamobile.us/store-locator
|
197 |
+
https://www.lycamobile.us/help/how-to
|
198 |
+
https://www.lycamobile.us/help/order
|
199 |
+
https://www.lycamobile.us/plans/prepaid-phone-plans#best-value
|
200 |
+
https://www.lycamobile.us/help/general-faq
|
201 |
+
https://www.lycamobile.us/help/frequently-asked-question
|
202 |
+
https://www.lycamobile.us/help/already-joined-us
|
203 |
+
https://www.lycamobile.us/help/esim
|
204 |
+
https://www.lycamobile.us/help/joined-lyca
|
205 |
+
https://www.lycamobile.us/already-joined-us/how-to-enable-auto-renewal
|
206 |
+
https://www.lycamobile.us/help/sms-notifications
|
207 |
+
https://www.lycamobile.us/termscondition
|
208 |
+
https://www.lycamobile.us/help/how-to-switch-to-lyca-mobile
|
209 |
+
https://www.lycamobile.us/help/wi-fi-calling-and-text/
|
210 |
+
https://www.lycamobile.us/privacy-policy
|
211 |
+
https://www.lycamobile.us/help/renewal
|
212 |
+
https://www.lycamobile.us/joined-lyca/how-to-activate-your-pay-as-you-go-sim
|
213 |
+
https://www.lycamobile.us/en/
|
214 |
+
https://www.lycamobile.us/joined-lyca/can-i-retain-my-current-mobile-number
|
215 |
+
https://www.lycamobile.us/help-support
|
216 |
+
https://www.lycamobile.us/plan-changes-update/
|
217 |
+
https://www.lycamobile.us/about-to-join-lyca-mobile/is-it-mandatory-to-order-lyca-mobile
|
218 |
+
https://www.lycamobile.us/about-to-join-lyca-mobile/what-information-do-i-need-to-provide
|
219 |
+
https://www.lycamobile.us/joined-lyca/how-to-switch-to-lyca-mobile
|
220 |
+
https://www.lycamobile.us/help/4g-coverage-and-services
|
221 |
+
https://www.lycamobile.us/already-joined-us/how-to-manage-your-saved-credit-debit-cards
|
222 |
+
https://www.lycamobile.us/freesim
|
223 |
+
https://www.lycamobile.us/2g_shutdown/
|
requirements.txt
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
openai
|
2 |
+
pinecone-client
|
3 |
+
python-dotenv
|
4 |
+
beautifulsoup4
|
5 |
+
selenium
|
6 |
+
webdriver-manager
|
7 |
+
lxml
|
8 |
+
uuid
|
scrape.py
ADDED
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from selenium import webdriver
|
2 |
+
from selenium.webdriver.chrome.service import Service
|
3 |
+
from selenium.webdriver.chrome.options import Options
|
4 |
+
from webdriver_manager.chrome import ChromeDriverManager
|
5 |
+
from bs4 import BeautifulSoup
|
6 |
+
from urllib.parse import urljoin, urlparse
|
7 |
+
import time
|
8 |
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
9 |
+
import threading
|
10 |
+
|
11 |
+
# Lock that guards every read and write of the shared `visited` set
|
12 |
+
visited_lock = threading.Lock()
|
13 |
+
|
14 |
+
# URLs already discovered; a plain set, so touch it only while holding visited_lock
|
15 |
+
visited = set()
|
16 |
+
|
17 |
+
# Function to scrape links with depth control
|
18 |
+
def get_all_links(url, max_depth, current_depth=0):
    """Collect the not-yet-seen, same-domain links found on one page.

    The page is opened in headless Chrome through Selenium, given a fixed
    five seconds to load, and parsed with BeautifulSoup.  Every ``<a href>``
    is resolved against ``url``; a link is kept only when its network
    location equals that of ``url`` and it is not already in the module-level
    ``visited`` set.  Kept links are added to ``visited`` as a side effect.

    Args:
        url: Address of the page to load.
        max_depth: Deepest crawl level that is still scraped.
        current_depth: Level at which ``url`` is being scraped.

    Returns:
        A list of newly discovered absolute URLs (order unspecified).  The
        list is empty when ``current_depth`` exceeds ``max_depth`` or when
        loading/parsing fails; failures are printed, never raised.
    """
    if current_depth > max_depth:
        return []

    # Created inside the try block; tracked here so ``finally`` can tell
    # whether a browser was actually started.
    driver = None
    try:
        # Print the current URL being scraped
        print(f"Scraping: {url} at depth {current_depth}")

        # Set up Chrome options
        chrome_options = Options()
        chrome_options.add_argument("--headless")  # Run in headless mode

        # Set up the Chrome driver
        service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service, options=chrome_options)

        # Navigate to the URL
        driver.get(url)

        # Wait for the page to load (adjust the sleep time if needed)
        time.sleep(5)

        # Get the page source and parse it with BeautifulSoup
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # The page's own domain does not change per link, so compute it once.
        base_netloc = urlparse(url).netloc

        links = set()
        for a_tag in soup.find_all('a', href=True):
            full_url = urljoin(url, a_tag['href'])

            # Only include links from the same domain
            if urlparse(full_url).netloc != base_netloc:
                continue

            # Check-and-add must be atomic across worker threads
            with visited_lock:
                if full_url not in visited:
                    visited.add(full_url)
                    links.add(full_url)

        return list(links)

    except Exception as e:
        print(f"Error fetching the URL: {e}")
        return []

    finally:
        # Close the browser even when loading or parsing failed; otherwise
        # every failed page would leave a Chrome process running.
        if driver is not None:
            try:
                driver.quit()
            except Exception as e:
                print(f"Error closing the browser: {e}")
|
63 |
+
|
64 |
+
def scrape_recursive(urls, max_depth, current_depth, executor):
    """Crawl level by level, starting with ``urls`` and stopping at ``max_depth``.

    Each URL of the current level is submitted to ``executor`` as a
    ``get_all_links`` task; the links reported by that level become the URLs
    of the next level.

    Args:
        urls: Iterable of URLs to scrape at ``current_depth``.
        max_depth: Deepest level that is still scraped.
        current_depth: Level of the URLs in ``urls``.
        executor: Object whose ``submit`` schedules the page fetches.

    Returns:
        The set of all links discovered at this level and below, or an empty
        list when ``current_depth`` is already greater than ``max_depth``.
    """
    if current_depth > max_depth:
        return []

    discovered = set()
    frontier = urls
    depth = current_depth

    # One pass per crawl level; the links of a level feed the next one.
    while depth <= max_depth:
        pending = [
            executor.submit(get_all_links, page, max_depth, depth)
            for page in frontier
        ]

        level_links = set()
        for finished in as_completed(pending):
            try:
                level_links.update(finished.result())
            except Exception as e:
                print(f"Error in thread: {e}")

        discovered.update(level_links)
        frontier = level_links
        depth += 1

    return discovered
|
85 |
+
|
86 |
+
def main():
    """Prompt for a start URL and depth, crawl, and save the links to links.txt."""
    # Ask the user where to start and how deep to go
    start_url = input("Enter the URL to scrape: ")
    depth_limit = int(input("Enter the maximum depth: "))

    # Crawl using a pool of ten worker threads
    with ThreadPoolExecutor(max_workers=10) as pool:
        found = scrape_recursive([start_url], depth_limit, 0, pool)

    # Write one link per line
    with open("links.txt", "w") as out:
        out.writelines(f"{link}\n" for link in found)

    print(f"\nFound {len(found)} links on the page. Saved to links.txt.")
|
102 |
+
|
103 |
+
# Run the crawler only when this file is executed as a script, not on import.
if __name__ == "__main__":
|
104 |
+
main()
|
upsert.py
ADDED
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from openai import OpenAI
|
3 |
+
from pinecone import Pinecone, ServerlessSpec
|
4 |
+
import uuid
|
5 |
+
from dotenv import load_dotenv
|
6 |
+
from bs4 import BeautifulSoup
|
7 |
+
import requests
|
8 |
+
import time
|
9 |
+
import argparse
|
10 |
+
from playwright.sync_api import sync_playwright
|
11 |
+
|
12 |
+
# Load variables from a .env file into the environment before the keys are read
load_dotenv()
|
13 |
+
|
14 |
+
# Set up OpenAI client (key taken from the OPENAI_API_KEY environment variable)
|
15 |
+
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
|
16 |
+
|
17 |
+
# Set up Pinecone (key taken from the PINECONE_API_KEY environment variable)
|
18 |
+
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
|
19 |
+
|
20 |
+
index_name = "lyca" # Name of the Pinecone index that ensure_index_exists() opens or creates
|
21 |
+
|
22 |
+
def ensure_index_exists():
    """Return a handle to the Pinecone index, creating the index if needed.

    Existence is decided by looking ``index_name`` up in the list of indexes
    instead of catching whatever ``pc.Index`` raises, so unrelated failures
    (bad API key, network outage) surface as errors rather than triggering an
    index-creation attempt.

    Returns:
        The object returned by ``pc.Index(index_name)``.
    """
    if index_name in pc.list_indexes().names():
        print(f"Index '{index_name}' already exists.")
    else:
        print(f"Index '{index_name}' does not exist. Creating it now...")
        pc.create_index(
            name=index_name,
            dimension=3072,  # Dimension for text-embedding-3-large
            metric="cosine",
            spec=ServerlessSpec(
                cloud="aws",
                region="us-west-2",
            ),
        )
        print(f"Index '{index_name}' created successfully.")

    return pc.Index(index_name)
|
40 |
+
|
41 |
+
def get_embedding(text):
    """Embed ``text`` with the text-embedding-3-large model.

    Args:
        text: Value passed as ``input`` to the embeddings endpoint.

    Returns:
        The ``embedding`` attribute of the first item in the response data.
    """
    result = client.embeddings.create(
        model="text-embedding-3-large",
        input=text,
    )
    first_item = result.data[0]
    return first_item.embedding
|
44 |
+
|
45 |
+
def process_web_link(url):
    """Load a page in headless Chromium and reduce it to plain text.

    Args:
        url: Address of the page to load.

    Returns:
        The page text with ``<script>`` and ``<style>`` elements removed and
        each space-separated piece placed on its own line.  If anything
        fails, an ``"Error processing <url>: <reason>"`` string is returned
        instead of raising.
    """
    try:
        with sync_playwright() as playwright:
            chromium = playwright.chromium.launch(headless=True)
            tab = chromium.new_page()
            tab.goto(url)

            # Fixed five-second pause to let the content load
            time.sleep(5)

            html = tab.content()
            chromium.close()

        document = BeautifulSoup(html, 'lxml')

        # Script and style bodies are code, not page content
        for node in document(["script", "style"]):
            node.decompose()

        # Trim each line, split it on single spaces, keep the non-empty pieces
        pieces = []
        for raw_line in document.get_text().splitlines():
            for fragment in raw_line.strip().split(" "):
                fragment = fragment.strip()
                if fragment:
                    pieces.append(fragment)

        return '\n'.join(pieces)
    except Exception as e:
        print(f"Error processing web link {url}: {str(e)}")
        return f"Error processing {url}: {str(e)}"
|
79 |
+
|
80 |
+
def process_and_upsert_link(url, index):
    """Scrape ``url``, embed its text, and upsert one vector into ``index``.

    Pages that could not be scraped, or that yield no text, are skipped so
    that error messages and empty strings are never embedded and stored as if
    they were page content.

    Args:
        url: Page to process; also stored as the vector's ``doc_name``.
        index: Index handle whose ``upsert(vectors=...)`` stores the vector.
    """
    print(f"Processing {url}")
    content = process_web_link(url)

    # process_web_link reports failure by returning a message with exactly
    # this prefix instead of raising.
    if content.startswith(f"Error processing {url}: "):
        print(f"Skipping {url}: page could not be scraped")
        return

    # Only the first 5000 characters are embedded and stored.
    content = content[:5000]
    if not content.strip():
        print(f"Skipping {url}: no text content extracted")
        return

    content_length = len(content)
    print(f"Content extracted, length: {content_length}")

    doc_id = str(uuid.uuid4())
    embedding = get_embedding(content)
    vector = (doc_id, embedding, {
        "text": content,
        "type": "Web Link",
        "doc_id": doc_id,
        "doc_name": url,
        "chunk_index": 0
    })

    print(f"Generated vector for {url}")

    index.upsert(vectors=[vector])
    print(f"Vector upserted to Pinecone for {url}")
|
101 |
+
|
102 |
+
def clean_database(index):
    """Delete every vector in ``index``, reporting (not raising) any failure.

    Args:
        index: Index handle whose ``delete(delete_all=True)`` is called.
    """
    print("Cleaning the database...")
    try:
        index.delete(delete_all=True)
    except Exception as err:
        # Best effort: the caller carries on with whatever is in the index.
        print(f"Error cleaning database: {err}")
        print("Continuing with the script...")
    else:
        print("Database cleaned.")
|
110 |
+
|
111 |
+
def main():
    """Upsert every URL listed in links.txt into the Pinecone index.

    Pass ``--clean`` on the command line to delete all existing vectors
    first.  A failure on one link is reported and the run continues with the
    next link, so a single bad page or transient API error does not abort the
    whole batch.
    """
    parser = argparse.ArgumentParser(description="Process web links and upsert to Pinecone.")
    parser.add_argument("--clean", action="store_true", help="Clean the database before upserting")
    args = parser.parse_args()

    index = ensure_index_exists()

    if args.clean:
        clean_database(index)

    with open('links.txt', 'r') as file:
        links = [line.strip() for line in file if line.strip()]

    failed = []
    for link in links:
        try:
            process_and_upsert_link(link, index)
        except Exception as e:
            # Keep going: the remaining links are independent of this one.
            print(f"Error upserting {link}: {e}")
            failed.append(link)

    if failed:
        print(f"Finished with {len(failed)} of {len(links)} links failed.")
|
126 |
+
|
127 |
+
# Run the upsert job only when this file is executed as a script, not on import.
if __name__ == "__main__":
|
128 |
+
main()
|