import os
import pickle
import time

import numpy as np
import requests
import streamlit as st
from dotenv import load_dotenv
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import UnstructuredURLLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_groq import ChatGroq

load_dotenv()  # take environment variables from .env (especially the Groq API key)
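# App flow: the sidebar collects up to three article URLs, the "Process URLs"
# button builds a FAISS vector store from those articles, and the question box
# below answers queries against that store, citing sources.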
st.title("RockyBot: News Research Tool 📈")
st.sidebar.title("News Article URLs")
urls = []
for i in range(3):
    url = st.sidebar.text_input(f"URL {i+1}")
    urls.append(url)
process_url_clicked = st.sidebar.button("Process URLs")
file_path = "faiss_store_openai.pkl"
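# Note: the file name says "openai", but the vector store below is built with HuggingFace embeddings.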
main_placeholder = st.empty()
llm = ChatGroq(model_name="llama-3.3-70b-versatile", temperature=0.9, max_tokens=500)
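# ChatGroq expects the Groq API key (GROQ_API_KEY) to be present in the environment,
# which load_dotenv() above provides from the .env file.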

# Debugging: check whether a URL is accessible before trying to load it
def check_url(url):
    try:
        response = requests.get(url, timeout=10)
        return response.status_code == 200
    except Exception:
        return False

if process_url_clicked:
    # Debugging: verify URL accessibility
    valid_urls = []
    for url in urls:
        if url and check_url(url):
            valid_urls.append(url)
        else:
            main_placeholder.text(f"URL is not accessible: {url}")

    if not valid_urls:
        main_placeholder.text("None of the URLs are accessible.")
        st.stop()

    # Load data from URLs
    loader = UnstructuredURLLoader(urls=valid_urls)
    main_placeholder.text("Data Loading...Started...✅✅✅")
    try:
        data = loader.load()
    except Exception as e:
        main_placeholder.text(f"Error loading data: {e}")
        st.stop()

    # Split data into chunks
    text_splitter = RecursiveCharacterTextSplitter(
        separators=['\n\n', '\n', '.', ','],
        chunk_size=1000
    )
    main_placeholder.text("Text Splitter...Started...✅✅✅")
    docs = text_splitter.split_documents(data)

    # Debugging: check if docs is empty
    if not docs:
        main_placeholder.text("No valid documents found! Please check the URLs.")
        st.stop()

    # Debugging: check the content of docs (the placeholder only keeps the last message)
    for doc in docs:
        main_placeholder.text(f"Document content: {doc.page_content[:200]}")  # show first 200 chars of each document

    # Create embeddings using HuggingFaceEmbeddings
    embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    main_placeholder.text("Embedding Vector Started Building...✅✅✅")

    # Generate embeddings
    embeddings = embedding_model.embed_documents([doc.page_content for doc in docs])

    # Debugging: check if embeddings were generated
    if not embeddings:
        main_placeholder.text("No embeddings were generated! Check the embedding model or document content.")
        st.stop()

    # Check the number of embeddings
    main_placeholder.text(f"Generated {len(embeddings)} embeddings.")

    # Convert embeddings to a numpy array to inspect their shape
    embeddings_np = np.array(embeddings).astype(np.float32)

    # Check the shape of embeddings
    main_placeholder.text(f"Shape of embeddings: {embeddings_np.shape}")

    # Create the FAISS vector store
    if len(embeddings) > 0:
        # Pair each chunk's text with its precomputed embedding and keep the
        # metadata (source URL) so the QA chain can cite sources later
        text_embedding_pairs = list(zip([doc.page_content for doc in docs], embeddings))
        metadatas = [doc.metadata for doc in docs]
        vectorstore_huggingface = FAISS.from_embeddings(
            text_embedding_pairs, embedding_model, metadatas=metadatas
        )

        # Save the FAISS vector store to a pickle file
        with open(file_path, "wb") as f:
            pickle.dump(vectorstore_huggingface, f)
        time.sleep(2)
    else:
        main_placeholder.text("Embeddings could not be generated, skipping FAISS index creation.")
query = main_placeholder.text_input("Question: ")
if query:
    if os.path.exists(file_path):
        # Load the FAISS vector store from the pickle file
        with open(file_path, "rb") as f:
            vectorstore = pickle.load(f)
        chain = RetrievalQAWithSourcesChain.from_llm(llm=llm, retriever=vectorstore.as_retriever())
        result = chain({"question": query}, return_only_outputs=True)

        # Display the answer
        st.header("Answer")
        st.write(result["answer"])

        # Display sources, if available
        sources = result.get("sources", "")
        if sources:
            st.subheader("Sources:")
            sources_list = sources.split("\n")  # split the sources by newline
            for source in sources_list:
                st.write(source)