Deaksh committed on
Commit
2c7dad0
·
verified ·
1 Parent(s): b7c716e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +14 -76
app.py CHANGED
@@ -1,13 +1,14 @@
1
- import requests
2
  import os
3
  import streamlit as st
4
  import pickle
5
  import time
6
  from langchain_community.embeddings import HuggingFaceEmbeddings
 
7
  from langchain.chains import RetrievalQAWithSourcesChain
8
  from langchain.text_splitter import RecursiveCharacterTextSplitter
9
  from langchain.document_loaders import UnstructuredURLLoader
10
  from langchain_groq import ChatGroq
 
11
  from langchain.vectorstores import FAISS
12
 
13
  from dotenv import load_dotenv
@@ -27,100 +28,36 @@ file_path = "faiss_store_openai.pkl"
27
  main_placeholder = st.empty()
28
  llm = ChatGroq(model_name="llama-3.3-70b-versatile", temperature=0.9, max_tokens=500)
29
 
30
- # Debugging: Check if URLs are accessible
31
- def check_url(url):
32
- try:
33
- response = requests.get(url)
34
- if response.status_code == 200:
35
- return True
36
- else:
37
- return False
38
- except Exception as e:
39
- return False
40
-
41
  if process_url_clicked:
42
- # Debugging: Verify URL accessibility
43
- valid_urls = []
44
- for url in urls:
45
- if check_url(url):
46
- valid_urls.append(url)
47
- else:
48
- main_placeholder.text(f"URL is not accessible: {url}")
49
-
50
- if not valid_urls:
51
- main_placeholder.text("None of the URLs are accessible.")
52
-
53
- # Load data from URLs
54
- loader = UnstructuredURLLoader(urls=valid_urls)
55
  main_placeholder.text("Data Loading...Started...✅✅✅")
56
- try:
57
- data = loader.load()
58
- except Exception as e:
59
- main_placeholder.text(f"Error loading data: {e}")
60
-
61
- # Split data into chunks
62
  text_splitter = RecursiveCharacterTextSplitter(
63
  separators=['\n\n', '\n', '.', ','],
64
  chunk_size=1000
65
  )
66
  main_placeholder.text("Text Splitter...Started...✅✅✅")
67
  docs = text_splitter.split_documents(data)
68
-
69
- # Debugging: Check if docs is empty
70
- if not docs:
71
- main_placeholder.text("No valid documents found! Please check the URLs.")
72
-
73
- # Debugging: Check the content of docs
74
- for doc in docs:
75
- main_placeholder.text(f"Document content: {doc.page_content[:200]}") # Show first 200 chars of each document
76
-
77
- # Create embeddings using HuggingFaceEmbeddings
78
  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
 
79
  main_placeholder.text("Embedding Vector Started Building...✅✅✅")
80
-
81
- # Generate embeddings
82
- embeddings = embedding_model.embed_documents([doc.page_content for doc in docs])
83
-
84
- # Debugging: Check if embeddings are generated
85
- if not embeddings:
86
- main_placeholder.text("No embeddings were generated! Check the embedding model or document content.")
87
-
88
- # Check the size of embeddings
89
- main_placeholder.text(f"Generated {len(embeddings)} embeddings.")
90
 
91
- # Convert embeddings to numpy array (needed by FAISS)
92
- embeddings_np = np.array(embeddings).astype(np.float32)
93
-
94
- # Check the shape of embeddings
95
- main_placeholder.text(f"Shape of embeddings: {embeddings_np.shape}")
96
-
97
- # Create FAISS index
98
- if len(embeddings) > 0:
99
- dimension = len(embeddings[0]) # Embedding vector dimension
100
- index = FAISS(dimension)
101
- index.add(embeddings_np) # Add embeddings to FAISS index
102
-
103
- # Wrap FAISS index using LangChain FAISS wrapper
104
- vectorstore_huggingface = FAISS(embedding_function=embedding_model, index=index)
105
-
106
- # Save the FAISS index to a pickle file
107
- with open(file_path, "wb") as f:
108
- pickle.dump(vectorstore_huggingface, f)
109
-
110
- time.sleep(2)
111
- else:
112
- main_placeholder.text("Embeddings could not be generated, skipping FAISS index creation.")
113
 
114
  query = main_placeholder.text_input("Question: ")
115
  if query:
116
  if os.path.exists(file_path):
117
- # Load the FAISS index from the pickle file
118
  with open(file_path, "rb") as f:
119
  vectorstore = pickle.load(f)
120
  chain = RetrievalQAWithSourcesChain.from_llm(llm=llm, retriever=vectorstore.as_retriever())
121
  result = chain({"question": query}, return_only_outputs=True)
122
-
123
- # Display the answer
124
  st.header("Answer")
125
  st.write(result["answer"])
126
 
@@ -136,3 +73,4 @@ if query:
136
 
137
 
138
 
 
 
 
1
  import os
2
  import streamlit as st
3
  import pickle
4
  import time
5
  from langchain_community.embeddings import HuggingFaceEmbeddings
6
+ from langchain import OpenAI
7
  from langchain.chains import RetrievalQAWithSourcesChain
8
  from langchain.text_splitter import RecursiveCharacterTextSplitter
9
  from langchain.document_loaders import UnstructuredURLLoader
10
  from langchain_groq import ChatGroq
11
+ from langchain.embeddings import OpenAIEmbeddings
12
  from langchain.vectorstores import FAISS
13
 
14
  from dotenv import load_dotenv
 
28
  main_placeholder = st.empty()
29
  llm = ChatGroq(model_name="llama-3.3-70b-versatile", temperature=0.9, max_tokens=500)
30
 
 
 
 
 
 
 
 
 
 
 
 
31
  if process_url_clicked:
32
+ # load data
33
+ loader = UnstructuredURLLoader(urls=urls)
 
 
 
 
 
 
 
 
 
 
 
34
  main_placeholder.text("Data Loading...Started...✅✅✅")
35
+ data = loader.load()
36
+ # split data
 
 
 
 
37
  text_splitter = RecursiveCharacterTextSplitter(
38
  separators=['\n\n', '\n', '.', ','],
39
  chunk_size=1000
40
  )
41
  main_placeholder.text("Text Splitter...Started...✅✅✅")
42
  docs = text_splitter.split_documents(data)
43
+ # create embeddings and save it to FAISS index
 
 
 
 
 
 
 
 
 
44
  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
45
+ vectorstore_huggingface = FAISS.from_documents(docs, embedding_model)
46
  main_placeholder.text("Embedding Vector Started Building...✅✅✅")
47
+ time.sleep(2)
 
 
 
 
 
 
 
 
 
48
 
49
+ # Save the FAISS index to a pickle file
50
+ with open(file_path, "wb") as f:
51
+ pickle.dump(vectorstore_huggingface, f)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
 
53
  query = main_placeholder.text_input("Question: ")
54
  if query:
55
  if os.path.exists(file_path):
 
56
  with open(file_path, "rb") as f:
57
  vectorstore = pickle.load(f)
58
  chain = RetrievalQAWithSourcesChain.from_llm(llm=llm, retriever=vectorstore.as_retriever())
59
  result = chain({"question": query}, return_only_outputs=True)
60
+ # result will be a dictionary of this format --> {"answer": "", "sources": [] }
 
61
  st.header("Answer")
62
  st.write(result["answer"])
63
 
 
73
 
74
 
75
 
76
+