import os
import pickle
import time

import requests
import streamlit as st
from bs4 import BeautifulSoup
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_groq import ChatGroq
st.title("RockyBot: News Research Tool π") | |
st.sidebar.title("News Article URLs") | |
# Collect URLs from user input | |
urls = [st.sidebar.text_input(f"URL {i+1}") for i in range(3)] | |
process_url_clicked = st.sidebar.button("Process URLs") | |
file_path = "faiss_store_openai.pkl" | |
main_placeholder = st.empty() | |
# Groq-hosted Llama 3 70B serves as the answering LLM
llm = ChatGroq(
    api_key=os.environ["GROQ_API_KEY"],  # raises KeyError if unset
    model_name="llama3-70b-8192",
)
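# GROQ_API_KEY must be provided in the environment before launch
# (on Hugging Face Spaces, typically as a repository secret).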
def fetch_web_content(url):
    """Fetch the visible text of a web page using BeautifulSoup."""
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        # separator=" " keeps text from adjacent tags from running together
        return soup.get_text(separator=" ", strip=True)
    except requests.RequestException as e:
        return f"Error fetching {url}: {e}"
if process_url_clicked:
    main_placeholder.text("Data Loading...Started...✅✅✅")
    # Fetch content from each non-empty URL
    data = [(url, fetch_web_content(url)) for url in urls if url.strip()]
    main_placeholder.text("Data Loading...Completed...✅✅✅")
    # Split the fetched text into chunks sized for retrieval
    text_splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", ".", ","],  # tried in order, coarsest first
        chunk_size=1000,
        chunk_overlap=200,  # explicit for clarity; this is the library default
    )
main_placeholder.text("Text Splitting...Started...β β β ") | |
docs = [] | |
for url, text in data: | |
split_docs = text_splitter.split_text(text) | |
docs.extend([Document(page_content=chunk, metadata={"source": url}) for chunk in split_docs]) | |
main_placeholder.text("Text Splitting...Completed...β β β ") | |
    # Embed the chunks and index them in a FAISS vector store
    embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vectorstore_huggingface = FAISS.from_documents(docs, embedding_model)
    main_placeholder.text("Building Embedding Vectors...✅✅✅")
    time.sleep(2)  # brief pause so the status message stays visible

    # Persist the vector store so later queries can reuse it
    with open(file_path, "wb") as f:
        pickle.dump(vectorstore_huggingface, f)
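    # Note: pickling works here, but recent langchain_community versions also
    # provide a dedicated persistence API that avoids pickle's portability and
    # security caveats, roughly:
    #   vectorstore_huggingface.save_local("faiss_store")
    #   FAISS.load_local("faiss_store", embedding_model,
    #                    allow_dangerous_deserialization=True)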
# User query input
query = st.text_input("Question: ")
if query:
    if os.path.exists(file_path):
        with open(file_path, "rb") as f:
            vectorstore = pickle.load(f)

        retriever = vectorstore.as_retriever()
        chain = RetrievalQAWithSourcesChain.from_llm(llm=llm, retriever=retriever)
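        # The chain embeds the question, retrieves the most similar chunks from
        # FAISS, and asks the LLM to synthesize an answer with source citations.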
        result = chain({"question": query}, return_only_outputs=True)
        # Display the answer
        st.header("Answer")
        st.write(result["answer"])

        # Display sources, if available
        sources = result.get("sources", "").strip()
        if sources:
            st.subheader("Sources:")
            for source in sources.split("\n"):
                st.write(source)
        else:
            st.write("No sources found.")
    else:
        st.warning("No processed data found. Enter URLs and click 'Process URLs' first.")