Spaces:

Deaksh
/

research-tool

Sleeping

App Files Files Community

Deaksh committed on Feb 18

Commit

a471490

verified ·

1 Parent(s): bcc4e52

Update app.py

Browse files

Files changed (1) hide show

app.py +27 -9

app.py CHANGED Viewed

@@ -3,13 +3,12 @@ import streamlit as st
 import pickle
 import time
 from langchain_community.embeddings import HuggingFaceEmbeddings
-from langchain import OpenAI
 from langchain.chains import RetrievalQAWithSourcesChain
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.document_loaders import UnstructuredURLLoader
 from langchain_groq import ChatGroq
-from langchain.embeddings import OpenAIEmbeddings
 from langchain.vectorstores import FAISS
 from dotenv import load_dotenv
 load_dotenv()  # take environment variables from .env (especially openai api key)
@@ -29,35 +28,53 @@ main_placeholder = st.empty()
 llm = ChatGroq(model_name="llama-3.3-70b-versatile", temperature=0.9, max_tokens=500)
 if process_url_clicked:
-    # load data
     loader = UnstructuredURLLoader(urls=urls)
     main_placeholder.text("Data Loading...Started...✅✅✅")
     data = loader.load()
-    # split data
     text_splitter = RecursiveCharacterTextSplitter(
         separators=['\n\n', '\n', '.', ','],
         chunk_size=1000
     )
     main_placeholder.text("Text Splitter...Started...✅✅✅")
     docs = text_splitter.split_documents(data)
-    # create embeddings and save it to FAISS index
     embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
-    vectorstore_huggingface = FAISS.from_documents(docs, embedding_model)
     main_placeholder.text("Embedding Vector Started Building...✅✅✅")
-    time.sleep(2)
     # Save the FAISS index to a pickle file
     with open(file_path, "wb") as f:
         pickle.dump(vectorstore_huggingface, f)
 query = main_placeholder.text_input("Question: ")
 if query:
     if os.path.exists(file_path):
         with open(file_path, "rb") as f:
             vectorstore = pickle.load(f)
             chain = RetrievalQAWithSourcesChain.from_llm(llm=llm, retriever=vectorstore.as_retriever())
             result = chain({"question": query}, return_only_outputs=True)
-            # result will be a dictionary of this format --> {"answer": "", "sources": [] }
             st.header("Answer")
             st.write(result["answer"])
@@ -71,3 +88,4 @@ if query:

 import pickle
 import time
 from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain.chains import RetrievalQAWithSourcesChain
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.document_loaders import UnstructuredURLLoader
 from langchain_groq import ChatGroq
 from langchain.vectorstores import FAISS
+import numpy as np
 from dotenv import load_dotenv
 load_dotenv()  # take environment variables from .env (especially openai api key)
 llm = ChatGroq(model_name="llama-3.3-70b-versatile", temperature=0.9, max_tokens=500)
 if process_url_clicked:
+    # Load data from URLs
     loader = UnstructuredURLLoader(urls=urls)
     main_placeholder.text("Data Loading...Started...✅✅✅")
     data = loader.load()
+    # Split data into chunks
     text_splitter = RecursiveCharacterTextSplitter(
         separators=['\n\n', '\n', '.', ','],
         chunk_size=1000
     )
     main_placeholder.text("Text Splitter...Started...✅✅✅")
     docs = text_splitter.split_documents(data)
+    # Create embeddings using HuggingFaceEmbeddings
     embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
     main_placeholder.text("Embedding Vector Started Building...✅✅✅")
+    # Generate embeddings
+    embeddings = embedding_model.embed_documents([doc.page_content for doc in docs])
+    # Convert embeddings to numpy array (needed by FAISS)
+    embeddings_np = np.array(embeddings).astype(np.float32)
+    # Create FAISS index
+    dimension = len(embeddings[0])  # Embedding vector dimension
+    index = FAISS(dimension)
+    index.add(embeddings_np)  # Add embeddings to FAISS index
+    # Wrap FAISS index using LangChain FAISS wrapper
+    vectorstore_huggingface = FAISS(embedding_function=embedding_model, index=index)
     # Save the FAISS index to a pickle file
     with open(file_path, "wb") as f:
         pickle.dump(vectorstore_huggingface, f)
+    time.sleep(2)
 query = main_placeholder.text_input("Question: ")
 if query:
     if os.path.exists(file_path):
+        # Load the FAISS index from the pickle file
         with open(file_path, "rb") as f:
             vectorstore = pickle.load(f)
             chain = RetrievalQAWithSourcesChain.from_llm(llm=llm, retriever=vectorstore.as_retriever())
             result = chain({"question": query}, return_only_outputs=True)
+            # Display the answer
             st.header("Answer")
             st.write(result["answer"])