import requests
import os
import streamlit as st
import pickle
import time
import numpy as np
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import UnstructuredURLLoader
from langchain_groq import ChatGroq
from langchain_community.vectorstores import FAISS
from dotenv import load_dotenv

load_dotenv()  # take environment variables from .env (especially the GROQ_API_KEY)
st.title("RockyBot: News Research Tool π") | |
st.sidebar.title("News Article URLs") | |
urls = [] | |
for i in range(3): | |
url = st.sidebar.text_input(f"URL {i+1}") | |
urls.append(url) | |
process_url_clicked = st.sidebar.button("Process URLs") | |
file_path = "faiss_store_openai.pkl" | |
main_placeholder = st.empty() | |
llm = ChatGroq(model_name="llama-3.3-70b-versatile", temperature=0.9, max_tokens=500) | |
# Debugging: Check if a URL is accessible before trying to load it
def check_url(url):
    try:
        response = requests.get(url, timeout=10)  # timeout so unreachable URLs don't hang the app
        return response.status_code == 200
    except requests.RequestException:
        return False
if process_url_clicked:
    # Debugging: Verify URL accessibility
    valid_urls = []
    for url in urls:
        if check_url(url):
            valid_urls.append(url)
        else:
            main_placeholder.text(f"URL is not accessible: {url}")

    if not valid_urls:
        main_placeholder.text("None of the URLs are accessible.")
        st.stop()

    # Load data from URLs
    loader = UnstructuredURLLoader(urls=valid_urls)
    main_placeholder.text("Data Loading...Started...✅✅✅")
    try:
        data = loader.load()
    except Exception as e:
        main_placeholder.text(f"Error loading data: {e}")
        st.stop()
    # Split data into chunks
    text_splitter = RecursiveCharacterTextSplitter(
        separators=['\n\n', '\n', '.', ','],
        chunk_size=1000
    )
    main_placeholder.text("Text Splitter...Started...✅✅✅")
    docs = text_splitter.split_documents(data)

    # Debugging: Check if docs is empty
    if not docs:
        main_placeholder.text("No valid documents found! Please check the URLs.")
        st.stop()

    # Debugging: Check the content of docs
    for doc in docs:
        main_placeholder.text(f"Document content: {doc.page_content[:200]}")  # Show first 200 chars of each document
    # Create embeddings using HuggingFaceEmbeddings
    embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    main_placeholder.text("Embedding Vector Started Building...✅✅✅")

    # Generate embeddings (for debugging/inspection; the FAISS store below re-embeds the documents)
    embeddings = embedding_model.embed_documents([doc.page_content for doc in docs])

    # Debugging: Check if embeddings are generated
    if not embeddings:
        main_placeholder.text("No embeddings were generated! Check the embedding model or document content.")
        st.stop()

    # Check the size of embeddings
    main_placeholder.text(f"Generated {len(embeddings)} embeddings.")

    # Convert embeddings to a numpy array to inspect the shape
    embeddings_np = np.array(embeddings).astype(np.float32)
    main_placeholder.text(f"Shape of embeddings: {embeddings_np.shape}")
    # Build the FAISS vector store from the documents using the LangChain wrapper
    if len(embeddings) > 0:
        vectorstore_huggingface = FAISS.from_documents(docs, embedding_model)

        # Save the FAISS vector store to a pickle file
        with open(file_path, "wb") as f:
            pickle.dump(vectorstore_huggingface, f)
        time.sleep(2)
    else:
        main_placeholder.text("Embeddings could not be generated, skipping FAISS index creation.")
query = main_placeholder.text_input("Question: ") | |
if query: | |
if os.path.exists(file_path): | |
# Load the FAISS index from the pickle file | |
with open(file_path, "rb") as f: | |
vectorstore = pickle.load(f) | |
chain = RetrievalQAWithSourcesChain.from_llm(llm=llm, retriever=vectorstore.as_retriever()) | |
result = chain({"question": query}, return_only_outputs=True) | |
# Display the answer | |
st.header("Answer") | |
st.write(result["answer"]) | |
# Display sources, if available | |
sources = result.get("sources", "") | |
if sources: | |
st.subheader("Sources:") | |
sources_list = sources.split("\n") # Split the sources by newline | |
for source in sources_list: | |
st.write(source) | |