import os
import streamlit as st
import pickle
import time
import requests
from bs4 import BeautifulSoup
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS, Chroma
from langchain_groq import ChatGroq
from dotenv import load_dotenv
from langchain.schema import Document
from langchain_openai import ChatOpenAI
load_dotenv() # Load environment variables from .env file
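# The .env file is assumed to define OPENAI_API_KEY (and GROQ_API_KEY if the
# ChatGroq model below is enabled); the LLM clients read these keys from the environment.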
st.title("RockyBot: News Research Tool 📈")
st.sidebar.title("News Article URLs")
# Collect URLs from user input
urls = [st.sidebar.text_input(f"URL {i+1}") for i in range(3)]
process_url_clicked = st.sidebar.button("Process URLs")
file_path = "faiss_store_openai.pkl"
main_placeholder = st.empty()
llm = ChatOpenAI(model_name="gpt-4o", temperature=0.9, max_tokens=500)  # gpt-4o is a chat model, so use the chat wrapper
#llm = ChatGroq(model_name="llama-3.3-70b-versatile", temperature=0.9, max_tokens=500)
def fetch_web_content(url):
    """Fetches text content from a given URL using BeautifulSoup."""
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        return soup.get_text()
    except Exception as e:
        return f"Error fetching {url}: {str(e)}"
if process_url_clicked:
    main_placeholder.text("Data Loading...Started...✅✅✅")
    # Fetch content from each non-empty URL
    valid_urls = [url for url in urls if url.strip()]
    data = [fetch_web_content(url) for url in valid_urls]
    main_placeholder.text("Data Loading...Completed...✅✅✅")
    # Split data into chunks
    text_splitter = RecursiveCharacterTextSplitter(
        separators=['\n\n', '\n', '.', ','],
        chunk_size=1000
    )
    main_placeholder.text("Text Splitting...Started...✅✅✅")
    # Wrap each page in a Document, keeping its URL as "source" metadata so that
    # RetrievalQAWithSourcesChain can attribute answers to their sources
    docs = [
        Document(page_content=text, metadata={"source": url})
        for text, url in zip(data, valid_urls)
    ]
    docs = text_splitter.split_documents(docs)
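    # A chunk_overlap argument can also be passed to RecursiveCharacterTextSplitter
    # if adjacent chunks should share some context across boundaries.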
    # Create embeddings and build a FAISS vector store
    embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    #vectorstore_huggingface = Chroma.from_documents(docs, embedding_model)
    vectorstore_huggingface = FAISS.from_documents(docs, embedding_model)
    main_placeholder.text("Embedding Vector Started Building...✅✅✅")
    time.sleep(2)
    # Save the vector store to a pickle file
    with open(file_path, "wb") as f:
        pickle.dump(vectorstore_huggingface, f)
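    # Note: depending on the installed faiss build, the raw index may not pickle
    # cleanly; FAISS.save_local() / FAISS.load_local() is the usual alternative.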
# User query input
query = st.text_input("Question: ")
if query:
    if os.path.exists(file_path):
        with open(file_path, "rb") as f:
            vectorstore = pickle.load(f)
        chain = RetrievalQAWithSourcesChain.from_llm(llm=llm, retriever=vectorstore.as_retriever())
        result = chain({"question": query}, return_only_outputs=True)
        # Display answer
        st.header("Answer")
        st.write(result["answer"])
        # Display sources, if available
        sources = result.get("sources", "")
        if sources:
            st.subheader("Sources:")
            sources_list = sources.split("\n")
            for source in sources_list:
                st.write(source)
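
# To launch the app locally: streamlit run <path-to-this-file>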