import os
import pickle
import time

import requests
import streamlit as st
from bs4 import BeautifulSoup
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_groq import ChatGroq


from dotenv import load_dotenv
load_dotenv()  # take environment variables from .env (especially the GROQ_API_KEY used by ChatGroq)

st.title("RockyBot: News Research Tool 📈")
st.sidebar.title("News Article URLs")

urls = []
for i in range(3):
    url = st.sidebar.text_input(f"URL {i+1}")
    urls.append(url)

process_url_clicked = st.sidebar.button("Process URLs")
file_path = "faiss_store_openai.pkl"

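# main_placeholder is a single slot in the page, reused first for status
# messages during processing and later for the question input box.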
main_placeholder = st.empty()
llm = ChatGroq(model_name="llama-3.3-70b-versatile", temperature=0.9, max_tokens=500)

if process_url_clicked:
    # Fetch each URL with requests + BeautifulSoup and keep only the page text.
    # (UnstructuredURLLoader could be used here instead of a manual fetch.)
    def fetch_web_content(url):
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")
            return soup.get_text()
        except Exception as e:
            return f"Error fetching {url}: {str(e)}"

    main_placeholder.text("Data Loading...Started...✅✅✅")

    # Wrap each page in a Document and record its URL as the source so the
    # QA chain can cite where an answer came from.
    data = [
        Document(page_content=fetch_web_content(url), metadata={"source": url})
        for url in urls
        if url.strip()
    ]

    main_placeholder.text("Data Loading...Completed...✅✅✅")
    # split data
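    # The splitter tries paragraph, line, sentence, then clause boundaries and
    # caps each chunk at roughly 1000 characters.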
    text_splitter = RecursiveCharacterTextSplitter(
        separators=['\n\n', '\n', '.', ','],
        chunk_size=1000
    )
    main_placeholder.text("Text Splitter...Started...βœ…βœ…βœ…")
    docs = text_splitter.split_documents(data)
    # create embeddings and save it to FAISS index
    embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    # FAISS is used here because the store is pickled to disk below; a Chroma
    # store holds an open database connection and generally cannot be pickled.
    vectorstore_huggingface = FAISS.from_documents(docs, embedding_model)
    main_placeholder.text("Embedding Vector Started Building...✅✅✅")
    time.sleep(2)

    # Save the FAISS index to a pickle file (newer LangChain releases may
    # require FAISS.save_local / FAISS.load_local instead)
    with open(file_path, "wb") as f:
        pickle.dump(vectorstore_huggingface, f)

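# Once an index has been built and saved, load it back and answer questions
# against it with RetrievalQAWithSourcesChain.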
query = main_placeholder.text_input("Question: ")
if query:
    if os.path.exists(file_path):
        with open(file_path, "rb") as f:
            vectorstore = pickle.load(f)
            chain = RetrievalQAWithSourcesChain.from_llm(llm=llm, retriever=vectorstore.as_retriever())
            result = chain({"question": query}, return_only_outputs=True)
            # result will be a dictionary of this format --> {"answer": "", "sources": [] }
            st.header("Answer")
            st.write(result["answer"])

            # Display sources, if available
            sources = result.get("sources", "")
            if sources:
                st.subheader("Sources:")
                sources_list = sources.split("\n")  # Split the sources by newline
                for source in sources_list:
                    st.write(source)