Update app.py

app.py
CHANGED
@@ -1,26 +1,26 @@
+
+
 import os
-import fitz
 from docx import Document
 from sentence_transformers import SentenceTransformer
 import faiss
 import numpy as np
 import pickle
-import gradio as gr
 from langchain_community.llms import HuggingFaceEndpoint
 from langchain_community.vectorstores import FAISS
 from langchain_community.embeddings import HuggingFaceEmbeddings
 
-#
-
-
-
-
-
-
-
-
-
+# Initialize the embedding model
+embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
+
+# Initialize the HuggingFace LLM
+llm = HuggingFaceEndpoint(
+    endpoint_url="https://api-inference.huggingface.co/models/gpt2",
+    model_kwargs={"api_key": os.getenv('HUGGINGFACEHUB_API_TOKEN')}
+)
+
+# Initialize the HuggingFace embeddings
+embedding = HuggingFaceEmbeddings()
 
 # Function to extract text from a Word document
 def extract_text_from_docx(docx_path):
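Note on this hunk: the old endpoint pointed at gpt-3.5-turbo, an OpenAI model that the Hugging Face Inference API does not serve, so repointing at gpt2 at least targets a model that exists there (though gpt2 is a small base model and will not follow a Q&A prompt reliably). Passing the token as model_kwargs={"api_key": ...} is also not the documented form for langchain_community's HuggingFaceEndpoint, and some versions reject unexpected model kwargs. A minimal sketch of the documented parameters, assuming any hosted text-generation model stands in for gpt2:

import os
from langchain_community.llms import HuggingFaceEndpoint

llm = HuggingFaceEndpoint(
    repo_id="gpt2",  # any text-generation model hosted on the HF Inference API
    huggingfacehub_api_token=os.getenv("HUGGINGFACEHUB_API_TOKEN"),
    max_new_tokens=256,  # illustrative generation setting, not taken from the diff
)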
@@ -32,23 +32,6 @@ def extract_text_from_docx(docx_path):
         print(f"Error extracting text from DOCX: {e}")
     return text
 
-# Initialize the embedding model
-embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
-
-# Hugging Face API token
-api_token = os.getenv('HUGGINGFACEHUB_API_TOKEN')
-if not api_token:
-    raise ValueError("HUGGINGFACEHUB_API_TOKEN environment variable is not set or invalid")
-
-# Initialize the HuggingFace LLM
-llm = HuggingFaceEndpoint(
-    endpoint_url="https://api-inference.huggingface.co/models/gpt-3.5-turbo",
-    model_kwargs={"api_key": api_token}
-)
-
-# Initialize the HuggingFace embeddings
-embedding = HuggingFaceEmbeddings()
-
 # Load or create FAISS index
 index_path = "faiss_index.pkl"
 document_texts_path = "document_texts.pkl"
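Note on this hunk: moving the initialization to the top of the file is harmless, but the surviving context lines still persist the FAISS index to faiss_index.pkl via pickle. Raw FAISS indexes are SWIG-wrapped objects and generally cannot be pickled directly; faiss.write_index/read_index (or faiss.serialize_index, which returns a picklable numpy buffer) is the supported route. A load-or-create sketch under that assumption, with a .bin path standing in for the pickle file and 384 matching all-MiniLM-L6-v2's embedding width:

import os
import pickle
import faiss

index_path = "faiss_index.bin"            # assumed replacement for faiss_index.pkl
document_texts_path = "document_texts.pkl"

if os.path.exists(index_path) and os.path.exists(document_texts_path):
    index = faiss.read_index(index_path)  # supported FAISS persistence
    with open(document_texts_path, "rb") as f:
        document_texts = pickle.load(f)   # plain list, safe to pickle
else:
    index = faiss.IndexFlatL2(384)        # L2 index over 384-dim MiniLM vectors
    document_texts = []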
@@ -81,22 +64,15 @@ def upload_files(files):
     try:
         for file in files:
             file_path = file.name  # Get the file path from the NamedString object
-            if file_path.endswith('.pdf'):
-                text = extract_text_from_pdf(file_path)
-            elif file_path.endswith('.docx'):
+            if file_path.endswith('.docx'):
                 text = extract_text_from_docx(file_path)
-            else:
-                return "Unsupported file format"
-
-            print(f"Extracted text: {text[:100]}...")  # Debug: Show the first 100 characters of the extracted text
 
-
-
-
-
-
-
-            document_texts.extend(sentences)  # Store sentences for retrieval
+            # Process the text and update FAISS index
+            sentences = text.split("\n")
+            sentences = [preprocess_text(sentence) for sentence in sentences if sentence.strip()]
+            embeddings = embedding_model.encode(sentences)
+            index.add(np.array(embeddings))
+            document_texts.extend(sentences)  # Store sentences for retrieval
 
         # Save the updated index and documents
         with open(index_path, "wb") as f:
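Two things stand out in this hunk. First, with the PDF branch and the else guard both removed, any non-.docx upload leaves text unbound, and the resulting NameError only surfaces through the broad except below. Second, preprocess_text is called but never defined anywhere in the visible diff. A minimal stand-in for it, purely an assumption about the intended behavior (whitespace normalization):

import re

def preprocess_text(text: str) -> str:
    text = re.sub(r"\s+", " ", text)  # collapse runs of whitespace
    return text.strip()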
@@ -111,49 +87,23 @@ def upload_files(files):
         print(f"Error processing files: {e}")
         return f"Error processing files: {e}"
 
-# Improved prompt template
-prompt_template = """
-You are a helpful assistant. Use the provided context to answer the question accurately.
-If the answer is not in the context, say "answer is not available in the context".
-Do not provide false information.
-
-Context:
-{context}
-
-Question:
-{question}
-
-Answer:
-"""
-
 def query_text(text):
     try:
-        print(f"Query text: {text}")  # Debug: Show the query text
-
         # Encode the query text
         query_embedding = embedding_model.encode([text])
-        print(f"Query embedding shape: {query_embedding.shape}")  # Debug: Show the shape of the query embedding
 
         # Search the FAISS index
         D, I = index.search(np.array(query_embedding), k=5)
-        print(f"Distances: {D}, Indices: {I}")  # Debug: Show the distances and indices of the search results
 
         top_documents = []
         for idx in I[0]:
             if idx != -1 and idx < len(document_texts):  # Ensure that a valid index is found
                 top_documents.append(document_texts[idx])  # Append the actual sentences for the response
-            else:
-                print(f"Invalid index found: {idx}")
-
-        # Remove duplicates and sort by relevance
-        top_documents = list(dict.fromkeys(top_documents))
 
-        #
+        # Prepare the prompt
         context = "\n".join(top_documents)
+        prompt = f"Context:\n{context}\n\nQuestion:\n{text}\n\nAnswer:\n"
 
-        # Prepare the prompt
-        prompt = prompt_template.format(context=context, question=text)
-
         # Query the LLM
         response = llm(prompt)
         return response
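Note on this hunk: replacing the prompt template with an inline f-string also silently drops the grounding instructions ("answer is not available in the context", "do not provide false information") and the duplicate filtering over top_documents. If those behaviors are still wanted, a sketch that keeps the new inline style (calling llm(prompt) still works, though newer LangChain releases prefer llm.invoke(prompt)):

top_documents = list(dict.fromkeys(top_documents))  # dedupe while preserving order
context = "\n".join(top_documents)
prompt = (
    "Use the provided context to answer the question accurately. "
    'If the answer is not in the context, say "answer is not available in the context".\n\n'
    f"Context:\n{context}\n\nQuestion:\n{text}\n\nAnswer:\n"
)
response = llm.invoke(prompt)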
@@ -161,23 +111,21 @@ def query_text(text):
         print(f"Error querying text: {e}")
         return f"Error querying text: {e}"
 
-#
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-demo.launch()
+# Sample Gradio integration (for illustration)
+import gradio as gr
+
+def main():
+    gr.Interface(
+        [upload_files, query_text],
+        ["files", "text"],
+        ["text", "text"],
+        title="Document Upload and Query System",
+        description="Upload DOCX files to build an index, then query for answers based on uploaded documents.",
+    ).launch()
+
+if __name__ == "__main__":
+    main()
+
 
 
 
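Note on this hunk: in current Gradio releases, gr.Interface takes a single callable as fn, so passing [upload_files, query_text] with parallel lists of inputs and outputs raises at startup. One working arrangement, sketched here rather than taken from the Space, is one Interface per function under a TabbedInterface:

import gradio as gr

upload_ui = gr.Interface(
    fn=upload_files,
    inputs=gr.File(file_count="multiple"),  # multiple-file upload widget
    outputs="text",
    title="Upload documents",
)
query_ui = gr.Interface(
    fn=query_text,
    inputs="text",
    outputs="text",
    title="Query the index",
)

demo = gr.TabbedInterface([upload_ui, query_ui], ["Upload", "Query"])

if __name__ == "__main__":
    demo.launch()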