import os
import streamlit as st
import pickle
import time
import requests
from bs4 import BeautifulSoup
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_groq import ChatGroq
from langchain.schema import Document
st.title("RockyBot: News Research Tool π")
st.sidebar.title("News Article URLs")
# Collect URLs from user input
urls = [st.sidebar.text_input(f"URL {i+1}") for i in range(3)]
process_url_clicked = st.sidebar.button("Process URLs")
file_path = "faiss_store_openai.pkl"
main_placeholder = st.empty()
llm = ChatGroq(
    api_key=os.environ["GROQ_API_KEY"],  # This will raise a KeyError if unset
    model_name="llama3-70b-8192",
)
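
# A more forgiving startup is sketched below (an assumption, not part of the original
# script): check for the key up front and stop with a visible error instead of letting
# the bare os.environ lookup raise a KeyError.
# if "GROQ_API_KEY" not in os.environ:
#     st.error("GROQ_API_KEY is not set; add it to the Space secrets or your shell env.")
#     st.stop()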
def fetch_web_content(url):
    """Fetches text content from a given URL using BeautifulSoup."""
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        return soup.get_text()
    except Exception as e:
        return f"Error fetching {url}: {str(e)}"
if process_url_clicked:
    main_placeholder.text("Data Loading...Started...✅✅✅")
    # Fetch content from URLs
    data = [(url, fetch_web_content(url)) for url in urls if url.strip()]
    main_placeholder.text("Data Loading...Completed...✅✅✅")
    # Split data into chunks
    text_splitter = RecursiveCharacterTextSplitter(
        separators=['\n\n', '\n', '.', ','],
        chunk_size=1000
    )
    main_placeholder.text("Text Splitting...Started...✅✅✅")
    docs = []
    for url, text in data:
        split_docs = text_splitter.split_text(text)
        docs.extend([Document(page_content=chunk, metadata={"source": url}) for chunk in split_docs])
    main_placeholder.text("Text Splitting...Completed...✅✅✅")
    # Create embeddings and save to FAISS vector store
    embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vectorstore_huggingface = FAISS.from_documents(docs, embedding_model)
    main_placeholder.text("Embedding Vector Started Building...✅✅✅")
    time.sleep(2)

    # Save the vector store to a pickle file
    with open(file_path, "wb") as f:
        pickle.dump(vectorstore_huggingface, f)
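
    # Note: pickling wraps the FAISS index (a C++ object) and can break across library
    # versions. LangChain's FAISS wrapper also exposes save_local()/load_local(), which
    # is usually the more robust way to persist the store (mentioned as an alternative,
    # not what this app does):
    # vectorstore_huggingface.save_local("faiss_index")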
# User query input
query = st.text_input("Question: ")
if query:
    if os.path.exists(file_path):
        with open(file_path, "rb") as f:
            vectorstore = pickle.load(f)
            retriever = vectorstore.as_retriever()
            chain = RetrievalQAWithSourcesChain.from_llm(llm=llm, retriever=retriever)
            result = chain({"question": query}, return_only_outputs=True)

            # Display answer
            st.header("Answer")
            st.write(result["answer"])

            # Display sources, if available
            sources = result.get("sources", "").strip()
            if sources:
                st.subheader("Sources:")
                sources_list = sources.split("\n")
                for source in sources_list:
                    st.write(source)
            else:
                st.write("No sources found.")