import os
import streamlit as st
import pickle
import time
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredURLLoader
from langchain_groq import ChatGroq
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.vectorstores import Chroma
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
load_dotenv()  # take environment variables from .env (especially the GROQ_API_KEY)
st.title("RockyBot: News Research Tool 📈")
st.sidebar.title("News Article URLs")
urls = []
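# Collect up to three article URLs from the sidebar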
for i in range(3):
    url = st.sidebar.text_input(f"URL {i+1}")
    urls.append(url)
process_url_clicked = st.sidebar.button("Process URLs")
file_path = "faiss_store_openai.pkl"
main_placeholder = st.empty()
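# Groq-hosted Llama 3.3 chat model used to answer questions; expects GROQ_API_KEY in the environment (loaded from .env above)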
llm = ChatGroq(model_name="llama-3.3-70b-versatile", temperature=0.9, max_tokens=500)
if process_url_clicked:
    # load data
    #loader = UnstructuredURLLoader(urls=urls)
    #main_placeholder.text("Data Loading...Started...✅✅✅")
    #data = loader.load()
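    # Instead of UnstructuredURLLoader, fetch each page directly and strip the HTML with BeautifulSoup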
    def fetch_web_content(url):
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")
            return soup.get_text()
        except Exception as e:
            return f"Error fetching {url}: {str(e)}"
    # Display status message
    main_placeholder.text("Data Loading...Started...✅✅✅")

    # Fetch content from each non-empty URL
    data = [fetch_web_content(url) for url in urls if url.strip()]

    # Display completion message
    main_placeholder.text("Data Loading...Completed...✅✅✅")
    # split data into chunks
    text_splitter = RecursiveCharacterTextSplitter(
        separators=['\n\n', '\n', '.', ','],
        chunk_size=1000
    )
    main_placeholder.text("Text Splitter...Started...✅✅✅")
    # data holds plain strings, so create_documents (not split_documents) is used
    docs = text_splitter.create_documents(data)
    # create embeddings and build the vector store
    embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    #vectorstore_huggingface = FAISS.from_documents(docs, embedding_model)
    vectorstore_huggingface = Chroma.from_documents(docs, embedding_model)
    main_placeholder.text("Embedding Vector Started Building...✅✅✅")
    time.sleep(2)

    # Save the vector store to a pickle file so it can be reloaded at query time
    with open(file_path, "wb") as f:
        pickle.dump(vectorstore_huggingface, f)
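    # Note: a Chroma store wraps a database client that may not be picklable; if this step fails,
    # the FAISS alternative (commented out above) is the more reliable store to serialize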
query = main_placeholder.text_input("Question: ")
if query:
    if os.path.exists(file_path):
        with open(file_path, "rb") as f:
            vectorstore = pickle.load(f)
            chain = RetrievalQAWithSourcesChain.from_llm(llm=llm, retriever=vectorstore.as_retriever())
            result = chain({"question": query}, return_only_outputs=True)
            # result will be a dictionary of this format --> {"answer": "", "sources": [] }
            st.header("Answer")
            st.write(result["answer"])

            # Display sources, if available
            sources = result.get("sources", "")
            if sources:
                st.subheader("Sources:")
                sources_list = sources.split("\n")  # Split the sources by newline
                for source in sources_list:
                    st.write(source)