NHZ committed on
Commit 51163d3 · verified · 1 Parent(s): b3a469f

Update app.py

Files changed (1)
  1. app.py +43 -37
app.py CHANGED
@@ -1,17 +1,13 @@
 import os
 import streamlit as st
 import requests
-import PyPDF2
-from sentence_transformers import SentenceTransformer
-import faiss
-import nltk
 from groq import Groq
-
-# Ensure the punkt tokenizer is downloaded
-try:
-    nltk.data.find('tokenizers/punkt')
-except LookupError:
-    nltk.download('punkt')
+from langchain.chains import AnalyzeDocumentChain
+from langchain.prompts import PromptTemplate
+from langchain.document_loaders import TextLoader
+from langchain.vectorstores import FAISS
+from langchain.embeddings import HuggingFaceEmbeddings
+from sentence_transformers import SentenceTransformer

 # Initialize Groq client
 client = Groq(api_key=os.getenv("GROQ_API_KEY"))
@@ -34,41 +30,38 @@ def extract_text_from_pdf(pdf_url):
     os.remove("temp.pdf")
     return text

-# Function to chunk text
+# Function to chunk text manually
 def chunk_text(text, chunk_size=300):
-    sentences = nltk.sent_tokenize(text)
+    # Split text by spaces and process into chunks
+    words = text.split()
     chunks = []
     current_chunk = []
-    current_length = 0

-    for sentence in sentences:
-        current_length += len(sentence.split())
-        if current_length <= chunk_size:
-            current_chunk.append(sentence)
+    for word in words:
+        if len(current_chunk) + len(word.split()) <= chunk_size:
+            current_chunk.append(word)
         else:
             chunks.append(" ".join(current_chunk))
-            current_chunk = [sentence]
-            current_length = len(sentence.split())
+            current_chunk = [word]

     if current_chunk:
         chunks.append(" ".join(current_chunk))
     return chunks

-# Function to create embeddings and store them in FAISS
+# Function to create embeddings and store them in FAISS using Langchain
 def create_faiss_index(chunks):
-    model = SentenceTransformer("all-MiniLM-L6-v2")
-    embeddings = model.encode(chunks)
-    dimension = embeddings.shape[1]
-    index = faiss.IndexFlatL2(dimension)
-    index.add(embeddings)
-    return index, embeddings
-
-# Function to query FAISS
-def query_faiss(index, query, chunks, model):
-    query_vector = model.encode([query])
-    distances, indices = index.search(query_vector, k=3)
-    results = [chunks[i] for i in indices[0]]
-    return results
+    # Use SentenceTransformer for embeddings
+    embeddings_model = SentenceTransformer("all-MiniLM-L6-v2")
+    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
+
+    # Create FAISS vector store
+    doc_search = FAISS.from_texts(chunks, embeddings)
+    return doc_search
+
+# Function to query FAISS and retrieve relevant document chunks
+def query_faiss(doc_search, query):
+    results = doc_search.similarity_search(query, k=3)
+    return [result.page_content for result in results]

 # Main Streamlit App
 def main():
@@ -89,10 +82,9 @@ def main():
     if 'document_text' in st.session_state and "faiss_index" not in st.session_state:
         st.write("Processing document...")
         chunks = chunk_text(st.session_state['document_text'])
-        index, embeddings = create_faiss_index(chunks)
-        st.session_state['faiss_index'] = index
+        doc_search = create_faiss_index(chunks)
+        st.session_state['faiss_index'] = doc_search
         st.session_state['chunks'] = chunks
-        st.session_state['model'] = SentenceTransformer("all-MiniLM-L6-v2")
         st.success(f"Document processed into {len(chunks)} chunks!")

     # Query the Document
@@ -100,4 +92,18 @@ def main():
     st.header("Ask Questions")
     query = st.text_input("Enter your question here")
     if st.button("Query Document"):
-        results = query_faiss(st.session_state['faiss_index'],
+        results = query_faiss(st.session_state['faiss_index'], query)
+        st.write("### Results from Document:")
+        for i, result in enumerate(results):
+            st.write(f"**Result {i+1}:** {result}")
+
+        # Use Groq API for additional insights
+        chat_completion = client.chat.completions.create(
+            messages=[{"role": "user", "content": query}],
+            model="llama-3.3-70b-versatile",
+        )
+        st.write("### Insights from Groq-powered Model:")
+        st.write(chat_completion.choices[0].message.content)
+
+if __name__ == "__main__":
+    main()
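
For reference, a minimal self-contained sketch of the retrieval-plus-generation flow the updated helpers support, using the same LangChain FAISS wrapper and Groq SDK that appear in the diff. The sample text, the question, and the step that folds the retrieved chunks into the Groq prompt are illustrative assumptions (the committed app sends only the raw query to the model); it needs the sentence-transformers, faiss-cpu, langchain, and groq packages plus a GROQ_API_KEY environment variable.

import os
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from groq import Groq

# Hypothetical document text; in the app this comes from the uploaded PDF.
text = "FAISS is a library for efficient similarity search over dense vectors. " * 50

# Word-based chunking, mirroring chunk_text(chunk_size=300) from the diff.
words = text.split()
chunks = [" ".join(words[i:i + 300]) for i in range(0, len(words), 300)]

# Build the FAISS store the same way the new create_faiss_index does.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
doc_search = FAISS.from_texts(chunks, embeddings)

# Retrieve the top-3 chunks for a question, as the new query_faiss does.
query = "What is FAISS used for?"
context = [d.page_content for d in doc_search.similarity_search(query, k=3)]

# Assumption: fold the retrieved chunks into the Groq prompt; the committed
# code passes only the raw query to the model.
client = Groq(api_key=os.getenv("GROQ_API_KEY"))
completion = client.chat.completions.create(
    messages=[{
        "role": "user",
        "content": "Answer using this context:\n"
                   + "\n\n".join(context)
                   + "\n\nQuestion: " + query,
    }],
    model="llama-3.3-70b-versatile",
)
print(completion.choices[0].message.content)

FAISS.from_texts handles embedding and indexing in one call, which is why the separate faiss.IndexFlatL2 setup and the SentenceTransformer model stored in session state are no longer needed after this commit.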