NaimaAqeel committed on
Commit
d382509
·
verified ·
1 Parent(s): dc170ee

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +37 -89
app.py CHANGED
@@ -1,26 +1,26 @@
 
 
1
  import os
2
- import fitz
3
  from docx import Document
4
  from sentence_transformers import SentenceTransformer
5
  import faiss
6
  import numpy as np
7
  import pickle
8
- import gradio as gr
9
  from langchain_community.llms import HuggingFaceEndpoint
10
  from langchain_community.vectorstores import FAISS
11
  from langchain_community.embeddings import HuggingFaceEmbeddings
12
 
13
- # Function to extract text from a PDF file
14
- def extract_text_from_pdf(pdf_path):
15
- text = ""
16
- try:
17
- doc = fitz.open(pdf_path)
18
- for page_num in range(len(doc)):
19
- page = doc.load_page(page_num)
20
- text += page.get_text()
21
- except Exception as e:
22
- print(f"Error extracting text from PDF: {e}")
23
- return text
24
 
25
  # Function to extract text from a Word document
26
  def extract_text_from_docx(docx_path):
@@ -32,23 +32,6 @@ def extract_text_from_docx(docx_path):
32
  print(f"Error extracting text from DOCX: {e}")
33
  return text
34
 
35
- # Initialize the embedding model
36
- embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
37
-
38
- # Hugging Face API token
39
- api_token = os.getenv('HUGGINGFACEHUB_API_TOKEN')
40
- if not api_token:
41
- raise ValueError("HUGGINGFACEHUB_API_TOKEN environment variable is not set or invalid")
42
-
43
- # Initialize the HuggingFace LLM
44
- llm = HuggingFaceEndpoint(
45
- endpoint_url="https://api-inference.huggingface.co/models/gpt-3.5-turbo",
46
- model_kwargs={"api_key": api_token}
47
- )
48
-
49
- # Initialize the HuggingFace embeddings
50
- embedding = HuggingFaceEmbeddings()
51
-
52
  # Load or create FAISS index
53
  index_path = "faiss_index.pkl"
54
  document_texts_path = "document_texts.pkl"
@@ -81,22 +64,15 @@ def upload_files(files):
81
  try:
82
  for file in files:
83
  file_path = file.name # Get the file path from the NamedString object
84
- if file_path.endswith('.pdf'):
85
- text = extract_text_from_pdf(file_path)
86
- elif file_path.endswith('.docx'):
87
  text = extract_text_from_docx(file_path)
88
- else:
89
- return "Unsupported file format"
90
-
91
- print(f"Extracted text: {text[:100]}...") # Debug: Show the first 100 characters of the extracted text
92
 
93
- # Process the text and update FAISS index
94
- sentences = text.split("\n")
95
- sentences = [preprocess_text(sentence) for sentence in sentences if sentence.strip()]
96
- embeddings = embedding_model.encode(sentences)
97
- print(f"Embeddings shape: {embeddings.shape}") # Debug: Show the shape of the embeddings
98
- index.add(np.array(embeddings))
99
- document_texts.extend(sentences) # Store sentences for retrieval
100
 
101
  # Save the updated index and documents
102
  with open(index_path, "wb") as f:
@@ -111,49 +87,23 @@ def upload_files(files):
111
  print(f"Error processing files: {e}")
112
  return f"Error processing files: {e}"
113
 
114
- # Improved prompt template
115
- prompt_template = """
116
- You are a helpful assistant. Use the provided context to answer the question accurately.
117
- If the answer is not in the context, say "answer is not available in the context".
118
- Do not provide false information.
119
-
120
- Context:
121
- {context}
122
-
123
- Question:
124
- {question}
125
-
126
- Answer:
127
- """
128
-
129
  def query_text(text):
130
  try:
131
- print(f"Query text: {text}") # Debug: Show the query text
132
-
133
  # Encode the query text
134
  query_embedding = embedding_model.encode([text])
135
- print(f"Query embedding shape: {query_embedding.shape}") # Debug: Show the shape of the query embedding
136
 
137
  # Search the FAISS index
138
  D, I = index.search(np.array(query_embedding), k=5)
139
- print(f"Distances: {D}, Indices: {I}") # Debug: Show the distances and indices of the search results
140
 
141
  top_documents = []
142
  for idx in I[0]:
143
  if idx != -1 and idx < len(document_texts): # Ensure that a valid index is found
144
  top_documents.append(document_texts[idx]) # Append the actual sentences for the response
145
- else:
146
- print(f"Invalid index found: {idx}")
147
-
148
- # Remove duplicates and sort by relevance
149
- top_documents = list(dict.fromkeys(top_documents))
150
 
151
- # Join the top documents for the context
152
  context = "\n".join(top_documents)
 
153
 
154
- # Prepare the prompt
155
- prompt = prompt_template.format(context=context, question=text)
156
-
157
  # Query the LLM
158
  response = llm(prompt)
159
  return response
@@ -161,23 +111,21 @@ def query_text(text):
161
  print(f"Error querying text: {e}")
162
  return f"Error querying text: {e}"
163
 
164
- # Create Gradio interface
165
- with gr.Blocks() as demo:
166
- gr.Markdown("## Document Upload and Query System")
167
-
168
- with gr.Tab("Upload Files"):
169
- upload = gr.File(file_count="multiple", label="Upload PDF or DOCX files")
170
- upload_button = gr.Button("Upload")
171
- upload_output = gr.Textbox()
172
- upload_button.click(fn=upload_files, inputs=upload, outputs=upload_output)
173
-
174
- with gr.Tab("Query"):
175
- query = gr.Textbox(label="Enter your query")
176
- query_button = gr.Button("Search")
177
- query_output = gr.Textbox()
178
- query_button.click(fn=query_text, inputs=query, outputs=query_output)
179
-
180
- demo.launch()
181
 
182
 
183
 
 
1
+
2
+
3
  import os
 
4
  from docx import Document
5
  from sentence_transformers import SentenceTransformer
6
  import faiss
7
  import numpy as np
8
  import pickle
 
9
  from langchain_community.llms import HuggingFaceEndpoint
10
  from langchain_community.vectorstores import FAISS
11
  from langchain_community.embeddings import HuggingFaceEmbeddings
12
 
13
+ # Initialize the embedding model
14
+ embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
15
+
16
+ # Initialize the HuggingFace LLM
17
+ llm = HuggingFaceEndpoint(
18
+ endpoint_url="https://api-inference.huggingface.co/models/gpt2",
19
+ model_kwargs={"api_key": os.getenv('HUGGINGFACEHUB_API_TOKEN')}
20
+ )
21
+
22
+ # Initialize the HuggingFace embeddings
23
+ embedding = HuggingFaceEmbeddings()
24
 
25
  # Function to extract text from a Word document
26
  def extract_text_from_docx(docx_path):
 
32
  print(f"Error extracting text from DOCX: {e}")
33
  return text
34
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  # Load or create FAISS index
36
  index_path = "faiss_index.pkl"
37
  document_texts_path = "document_texts.pkl"
 
64
  try:
65
  for file in files:
66
  file_path = file.name # Get the file path from the NamedString object
67
+ if file_path.endswith('.docx'):
 
 
68
  text = extract_text_from_docx(file_path)
 
 
 
 
69
 
70
+ # Process the text and update FAISS index
71
+ sentences = text.split("\n")
72
+ sentences = [preprocess_text(sentence) for sentence in sentences if sentence.strip()]
73
+ embeddings = embedding_model.encode(sentences)
74
+ index.add(np.array(embeddings))
75
+ document_texts.extend(sentences) # Store sentences for retrieval
 
76
 
77
  # Save the updated index and documents
78
  with open(index_path, "wb") as f:
 
87
  print(f"Error processing files: {e}")
88
  return f"Error processing files: {e}"
89
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
  def query_text(text):
91
  try:
 
 
92
  # Encode the query text
93
  query_embedding = embedding_model.encode([text])
 
94
 
95
  # Search the FAISS index
96
  D, I = index.search(np.array(query_embedding), k=5)
 
97
 
98
  top_documents = []
99
  for idx in I[0]:
100
  if idx != -1 and idx < len(document_texts): # Ensure that a valid index is found
101
  top_documents.append(document_texts[idx]) # Append the actual sentences for the response
 
 
 
 
 
102
 
103
+ # Prepare the prompt
104
  context = "\n".join(top_documents)
105
+ prompt = f"Context:\n{context}\n\nQuestion:\n{text}\n\nAnswer:\n"
106
 
 
 
 
107
  # Query the LLM
108
  response = llm(prompt)
109
  return response
 
111
  print(f"Error querying text: {e}")
112
  return f"Error querying text: {e}"
113
 
114
+ # Sample Gradio integration (for illustration)
115
+ import gradio as gr
116
+
117
def main():
    """Build and launch the Gradio UI for uploading DOCX files and querying them.

    ``gr.Interface`` wraps a single callable, so ``upload_files`` and
    ``query_text`` (which take different inputs) cannot be passed together as
    a list. Each handler is wired to its own tab inside a ``gr.Blocks``
    layout instead.
    """
    with gr.Blocks(title="Document Upload and Query System") as demo:
        gr.Markdown("## Document Upload and Query System")
        gr.Markdown(
            "Upload DOCX files to build an index, then query for answers "
            "based on uploaded documents."
        )

        # Tab 1: feed uploaded files to the indexing handler.
        with gr.Tab("Upload Files"):
            upload = gr.File(file_count="multiple", label="Upload DOCX files")
            upload_button = gr.Button("Upload")
            upload_output = gr.Textbox(label="Upload status")
            upload_button.click(fn=upload_files, inputs=upload, outputs=upload_output)

        # Tab 2: send the question to the retrieval + LLM handler.
        with gr.Tab("Query"):
            query = gr.Textbox(label="Enter your query")
            query_button = gr.Button("Search")
            query_output = gr.Textbox(label="Answer")
            query_button.click(fn=query_text, inputs=query, outputs=query_output)

    demo.launch()
125
+
126
+ if __name__ == "__main__":
127
+ main()
128
+
 
 
129
 
130
 
131