import os
import re
import time
import uuid
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime
from urllib.parse import urlparse, parse_qs

import requests
import streamlit as st
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from openai import OpenAI
from pinecone import Pinecone
from PyPDF2 import PdfReader
from pymongo import MongoClient
from pymongo.errors import ConnectionFailure
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from youtube_transcript_api import YouTubeTranscriptApi
# Set page config at the very beginning
st.set_page_config(layout="wide")

# Load environment variables
load_dotenv()

# Set up OpenAI client
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Set up Pinecone
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
index_name = "lyca"  # Your index name
index = pc.Index(index_name)

# Set up MongoDB connection. Use a dedicated variable name so the OpenAI
# `client` defined above is not shadowed, and default the collection to None
# so later `sim_swap_collection is None` checks are safe in every branch.
sim_swap_collection = None
mongo_uri = os.getenv("MONGODB_URI")
if not mongo_uri:
    st.error("MONGODB_URI is not set. Please check your .env file.")
else:
    print(f"MONGODB_URI loaded: {mongo_uri[:10]}...")  # Print only first 10 chars for security
    try:
        mongo_client = MongoClient(mongo_uri, serverSelectionTimeoutMS=5000)
        mongo_client.server_info()  # Raises ConnectionFailure if the connection fails
        db = mongo_client['lyca']
        sim_swap_collection = db['sim_swap_requests']
    except ConnectionFailure:
        st.error("Failed to connect to MongoDB. Please check your connection and try again later.")
        sim_swap_collection = None
def get_embedding(text):
    response = client.embeddings.create(input=text, model="text-embedding-3-large")
    return response.data[0].embedding
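
# Note: "text-embedding-3-large" produces 3072-dimensional vectors by default,
# so the Pinecone index ("lyca") must have been created with a matching
# dimension for the upserts and queries below to succeed.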
def process_pdf(file):
    reader = PdfReader(file)
    text = ""
    for page in reader.pages:
        # extract_text() can return None for pages with no extractable text
        text += (page.extract_text() or "") + "\n"
    return text
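
# Note: PdfReader only extracts text embedded in the PDF; scanned or
# image-only pages come back empty and would need OCR, which is not
# handled here.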
def process_web_link(url):
    try:
        # Set up Selenium options
        chrome_options = Options()
        chrome_options.add_argument("--headless")  # Run in headless mode for performance
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        # Install the Chrome driver automatically using webdriver-manager
        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
        try:
            # Navigate to the URL and give the page some time to load fully
            driver.get(url)
            time.sleep(3)
            # Extract the rendered page's content
            page_source = driver.page_source
        finally:
            # Always close the browser, even if extraction fails
            driver.quit()
        # Parse the page content using BeautifulSoup
        soup = BeautifulSoup(page_source, 'lxml')
        # Remove script and style elements
        for script in soup(["script", "style"]):
            script.decompose()
        # Get text
        text = soup.get_text()
        # Clean up the text: strip each line, break phrases on double spaces,
        # and drop blank chunks
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        text = '\n'.join(chunk for chunk in chunks if chunk)
        return text
    except Exception as e:
        print(f"Error processing web link {url}: {str(e)}")
        return f"Error processing {url}: {str(e)}"
def process_youtube_link(url):
    video_id = extract_video_id(url)
    if video_id is None:
        return f"Error processing {url}: could not extract a video ID"
    transcript = YouTubeTranscriptApi.get_transcript(video_id)
    return " ".join([entry['text'] for entry in transcript])
def extract_video_id(url):
    parsed_url = urlparse(url)
    if parsed_url.hostname == 'youtu.be':
        return parsed_url.path[1:]
    if parsed_url.hostname in ('www.youtube.com', 'youtube.com'):
        if parsed_url.path == '/watch':
            return parse_qs(parsed_url.query)['v'][0]
        if parsed_url.path.startswith('/embed/'):
            return parsed_url.path.split('/')[2]
        if parsed_url.path.startswith('/v/'):
            return parsed_url.path.split('/')[2]
    return None
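
# URL forms handled by extract_video_id (VIDEO_ID is a placeholder):
#   https://youtu.be/VIDEO_ID
#   https://www.youtube.com/watch?v=VIDEO_ID
#   https://www.youtube.com/embed/VIDEO_ID
#   https://www.youtube.com/v/VIDEO_ID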
def process_upload(upload_type, file_or_link, file_name=None):
    print(f"Starting process_upload for {upload_type}")
    doc_id = str(uuid.uuid4())
    print(f"Generated doc_id: {doc_id}")
    if upload_type == "PDF":
        content = process_pdf(file_or_link)
        doc_name = file_name or "Uploaded PDF"
    elif upload_type == "Web Link":
        content = process_web_link(file_or_link)
        doc_name = file_or_link
    elif upload_type == "YouTube Link":
        content = process_youtube_link(file_or_link)
        doc_name = f"YouTube: {file_or_link}"
    else:
        print("Invalid upload type")
        return "Invalid upload type"
    content_length = len(content)
    print(f"Content extracted, length: {content_length}")
    # Dynamically adjust chunk size based on content length
    if content_length < 10000:
        chunk_size = 1000
    elif content_length < 100000:
        chunk_size = 2000
    else:
        chunk_size = 4000
    print(f"Using chunk size: {chunk_size}")
    chunks = [content[i:i + chunk_size] for i in range(0, content_length, chunk_size)]
    vectors = []
    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(process_chunk, chunk, doc_id, i, upload_type, doc_name) for i, chunk in enumerate(chunks)]
        for future in as_completed(futures):
            vectors.append(future.result())
            # Update progress only if the UI has set up a progress bar
            if 'upload_progress' in st.session_state:
                progress = len(vectors) / len(chunks)
                st.session_state.upload_progress.progress(progress)
    print(f"Generated {len(vectors)} vectors")
    index.upsert(vectors=vectors)
    print("Vectors upserted to Pinecone")
    return f"Processing complete for {upload_type}. Document Name: {doc_name}"
def process_chunk(chunk, doc_id, i, upload_type, doc_name):
    embedding = get_embedding(chunk)
    return (f"{doc_id}_{i}", embedding, {
        "text": chunk,
        "type": upload_type,
        "doc_id": doc_id,
        "doc_name": doc_name,
        "chunk_index": i
    })
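
# Each returned tuple follows Pinecone's (id, values, metadata) upsert format,
# so the list collected in process_upload can be passed directly to
# index.upsert(vectors=...).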
def get_relevant_context(query, top_k=5):
    print(f"Getting relevant context for query: {query}")
    query_embedding = get_embedding(query)
    search_results = index.query(vector=query_embedding, top_k=top_k, include_metadata=True)
    print(f"Found {len(search_results['matches'])} relevant results")
    # Sort results by doc_id and chunk_index to maintain document structure
    sorted_results = sorted(search_results['matches'], key=lambda x: (x['metadata']['doc_id'], x['metadata']['chunk_index']))
    context = "\n".join([result['metadata']['text'] for result in sorted_results])
    return context, sorted_results
def check_lyca_data_loaded():
    # Check if there are any vectors in the index
    stats = index.describe_index_stats()
    return stats['total_vector_count'] > 0
def load_lyca_mobile_data():
    if check_lyca_data_loaded():
        return "Lyca Mobile data is already loaded."
    # Use a context manager so the file handle is closed properly
    with open('links.txt', 'r') as f:
        lyca_links = [line.strip() for line in f]
    for link in lyca_links:
        process_upload("Web Link", link)
    return "Lyca Mobile data loaded into vector database"
def general_conversation(message):
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "You are a helpful assistant for Lyca Mobile customers. If you don't know the answer, politely say so."},
            {"role": "user", "content": message}
        ]
    )
    return response.choices[0].message.content
def is_sim_swap_request(message):
    sim_swap_keywords = {'sim', 'swap', 'change', 'new', 'replace'}
    # Remove a trailing question mark if present
    message = message.rstrip('?')
    message_words = set(message.lower().split())
    # A message counts as a SIM swap request if it contains at least two keywords
    return len(sim_swap_keywords.intersection(message_words)) >= 2

# Debug check on import: should print True
print(f"is_sim_swap_request result: {is_sim_swap_request('how to change my sim?')}")
def trigger_sim_swap_workflow():
    st.session_state.workflow = 'sim_swap'
    st.session_state.workflow_step = 0
def process_sim_swap_workflow():
    st.subheader("SIM Swap Request Form")
    with st.form("sim_swap_form"):
        full_name = st.text_input("Please enter your full name:")
        phone_number = st.text_input("Please enter your phone number:")
        email = st.text_input("Please enter your email address:")
        current_sim = st.text_input("Please enter your current SIM card number:")
        reason = st.text_area("Please enter the reason for SIM swap:")
        submitted = st.form_submit_button("Submit")
        if submitted:
            if sim_swap_collection is None:
                st.error("Unable to process your request due to a database connection issue. Please try again later.")
            else:
                user_data = {
                    "full_name": full_name,
                    "phone_number": phone_number,
                    "email": email,
                    "current_sim": current_sim,
                    "reason": reason,
                    "timestamp": datetime.now()
                }
                try:
                    sim_swap_collection.insert_one(user_data)
                    st.success("Thank you for providing your information. Your SIM swap request has been submitted and stored successfully.")
                    st.session_state.workflow = None
                except Exception as e:
                    st.error(f"An error occurred while storing your information: {str(e)}")
                    st.warning("Please try submitting your request again. If the problem persists, please contact support.")
def chat_with_ai(message):
    try:
        context, results = get_relevant_context(message)
        # Gate on the best similarity score across all matches; note that
        # `results` is sorted by document position, not by score
        if results and max(result['score'] for result in results) >= 0.4:
            messages = [
                {"role": "system", "content": "You are a helpful assistant for Lyca Mobile. Use the following information to answer the user's question, but don't mention the context directly in your response. If the information isn't in the context, say you don't know."},
                {"role": "system", "content": f"Context: {context}"},
                {"role": "user", "content": message}
            ]
            response = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=messages
            )
            ai_response = response.choices[0].message.content
            sources = [
                {
                    "doc_id": result['metadata']['doc_id'],
                    "doc_name": result['metadata']['doc_name'],
                    "chunk_index": result['metadata']['chunk_index'],
                    "text": result['metadata']['text'],
                    "type": result['metadata']['type'],
                    "score": result['score']
                }
                for result in results
            ]
        else:
            # Fall back to general conversation if no relevant context is found or similarity is low
            ai_response = general_conversation(message)
            sources = []
        return ai_response, sources
    except Exception as e:
        print(f"Error in chat_with_ai: {str(e)}")
        return "I'm sorry, but I encountered an error while processing your request. Please try again later.", []
def clear_database():
    print("Clearing database...")
    index.delete(delete_all=True)
    print("Database cleared")
    return "Database cleared successfully."
# Streamlit UI
st.title("Lyca Mobile Assistant")

if 'workflow' not in st.session_state:
    st.session_state.workflow = None
    st.session_state.workflow_data = []
    st.session_state.workflow_step = 0
if 'chat_history' not in st.session_state:
    st.session_state.chat_history = []
# Create two columns instead of three
col1, col2 = st.columns([2, 1])

with col1:
    st.header("Chat")
    if st.session_state.workflow == 'sim_swap':
        process_sim_swap_workflow()
    else:
        # Display chat history
        for message in st.session_state.chat_history:
            st.markdown(f"**{'You' if message['role'] == 'user' else 'AI'}:** {message['content']}")
        user_input = st.text_input("How can I assist you with Lyca Mobile today?")
        if st.button("Send"):
            if user_input:
                # Add debug print
                print(f"User input: {user_input}")
                is_swap_request = is_sim_swap_request(user_input)
                print(f"Is sim swap request: {is_swap_request}")
                if is_swap_request:
                    print("Triggering SIM swap workflow")
                    st.session_state.chat_history.append({"role": "user", "content": user_input})
                    st.session_state.chat_history.append({"role": "assistant", "content": "Certainly! I can help you with changing your SIM. Please fill out the following form to start the SIM swap process."})
                    st.session_state.workflow = 'sim_swap'
                else:
                    print("Proceeding with regular chat flow")
                    # Regular chat flow for non-SIM-swap requests
                    st.session_state.chat_progress = st.progress(0)
                    response, sources = chat_with_ai(user_input)
                    st.session_state.chat_progress.progress(1.0)
                    # Add to chat history
                    st.session_state.chat_history.append({"role": "user", "content": user_input})
                    st.session_state.chat_history.append({"role": "assistant", "content": response})
                    # Display the latest messages
                    st.markdown("**You:** " + user_input)
                    st.markdown("**AI:** " + response)
                    # Store sources in session state for display in col2
                    st.session_state.sources = sources
                    st.session_state.chat_progress.empty()
            else:
                st.warning("Please enter a question.")
with col2:
    st.header("Source Information")
    if 'sources' in st.session_state and st.session_state.sources:
        for i, source in enumerate(st.session_state.sources, 1):
            with st.expander(f"Source {i} - {source['type']} ({source['doc_name']})"):
                st.markdown(f"**Chunk Index:** {source['chunk_index']}")
                st.text(source['text'])
    else:
        st.info("Ask a question to see source information here.")