Spaces: Build error
Update app.py
app.py CHANGED

@@ -17,9 +17,17 @@ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
def extract_text_from_pdf(pdf_path):
    # ...

-# Function to extract text from a Word document (
+# Function to extract text from a Word document (fixed indentation)
def extract_text_from_docx(docx_path):
-
+    """Extracts text from a Word document."""
+    text = ""
+    try:
+        doc = Document(docx_path)
+        text = "\n".join([para.text for para in doc.paragraphs])
+    except Exception as e:
+        print(f"Error extracting text from DOCX: {e}")
+    return text
+

# Initialize the embedding model (same as before)
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
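
A note on this hunk: the new extract_text_from_docx calls Document(docx_path), which needs `from docx import Document` (the python-docx package) near the top of app.py, and the diff does not show whether that import exists; extract_text_from_pdf is elided above as `# ...`. For orientation only, a minimal sketch of both helpers, assuming python-docx and pypdf are the libraries in use (app.py may use different ones):

# Illustrative sketch only; the real extract_text_from_pdf in app.py is elided in the diff,
# and the library choices (pypdf, python-docx) are assumptions.
from pypdf import PdfReader
from docx import Document


def extract_text_from_pdf(pdf_path):
    """Concatenate the text of every page in a PDF."""
    reader = PdfReader(pdf_path)
    return "\n".join(page.extract_text() or "" for page in reader.pages)


def extract_text_from_docx(docx_path):
    """Concatenate the text of every paragraph in a Word document."""
    doc = Document(docx_path)
    return "\n".join(para.text for para in doc.paragraphs)
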
@@ -50,14 +58,19 @@ document_texts = []


def preprocess_text(text):
-    # ... (text preprocessing logic,
+    # ... (text preprocessing logic, e.g., sentence segmentation and optional stop word removal)


def upload_files(files):
    global index, document_texts
    try:
        for file_path in files:
-
+            if file_path.endswith('.pdf'):
+                text = extract_text_from_pdf(file_path)
+            elif file_path.endswith('.docx'):
+                text = extract_text_from_docx(file_path)
+            else:
+                return "Unsupported file format"

            # Preprocess text (call the new function)
            sentences = preprocess_text(text)
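
The body of preprocess_text is elided in both the old and the new version, so it is unclear what `sentences` contains downstream. As a point of reference, a minimal sketch of the sentence segmentation the updated comment describes, standard library only (illustrative, not the app's actual logic):

import re


def preprocess_text(text):
    # Naive sentence segmentation: split on '.', '!' or '?' followed by whitespace.
    # Stop-word removal, also mentioned in the comment, is deliberately left out here.
    sentences = re.split(r"(?<=[.!?])\s+", text)
    return [s.strip() for s in sentences if s.strip()]
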
@@ -66,48 +79,11 @@ def upload_files(files):
            embeddings = embedding_model.encode(sentences)
            index.add(np.array(embeddings))

-        # Save the updated index and documents
-        # ...
+        # Save the updated index and documents
        return "Files processed successfully"
    except Exception as e:
-        print(
-        return f"Error processing files: {e}"
-
-
-def query_text(text):
-    try:
-        # Preprocess query text
-        query_sentences = preprocess_text(text)
-        query_embeddings = embedding_model.encode(query_sentences)
-
-        # Retrieve relevant documents using FAISS
-        D, I = index.search(np.array(query_embeddings), k=5)
-        retrieved_docs = [document_texts[idx] for idx in I[0] if idx != -1]
-
-        # Retriever-Augmented Generation (RAG)
-        retriever_inputs = retriever_tokenizer(
-            text=retrieved_docs, return_tensors="pt", padding=True
-        )
-        retriever_outputs = retriever(**retriever_inputs)
-        retrieved_texts = retriever_tokenizer.batch_decode(retriever_outputs.logits)
-
-        # Generate response using retrieved information (as prompts/context)
-        generator_inputs = generator_tokenizer(
-            text=[text] + retrieved_texts, return_tensors="pt", padding=True
-        )
-        generator_outputs = generator(**generator_inputs)
-        response = generator_tokenizer.decode(generator_outputs.sequences[0], skip_special_tokens=True)
-
-        return response
-    except Exception as e:
-        print(f"Error querying text: {e}")
-        return f"Error querying text: {e}"
-
+        print(

-# Create Gradio interface
-with gr.Blocks() as demo:
-    # ... (rest of the Gradio interface definition)
-    query_button.click(fn=query_text, inputs



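
Two details on the new side of this last hunk. The except branch now ends at `print(` (possibly truncated in this view) and its `return f"Error processing files: {e}"` line is gone; if the file really stops the branch there, that alone would explain the build error reported at the top of the page. The retained comment `# Save the updated index and documents` also has no code behind it any more. A sketch of one way to do that persistence step, assuming FAISS is imported as faiss and using hypothetical file names:

import pickle

import faiss

# Hypothetical paths; the diff does not show where (or whether) app.py persists state.
INDEX_PATH = "index.faiss"
DOCS_PATH = "documents.pkl"


def save_state(index, document_texts):
    """Write the FAISS index and the raw document texts to disk."""
    faiss.write_index(index, INDEX_PATH)
    with open(DOCS_PATH, "wb") as f:
        pickle.dump(document_texts, f)

Calling something like save_state(index, document_texts) just before the success return, and restoring the two deleted error-handling lines, would make the comment true again.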
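
For context that the diff only shows in passing: index.add(...), index.search(...) and the `document_texts = []` in the hunk header imply a module-level FAISS index whose dimension matches the all-MiniLM-L6-v2 embeddings, but its construction is outside the changed lines. A sketch of a typical initialisation, assuming a flat L2 index (app.py may configure this differently):

import faiss
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# The index dimension must match the embedding size (384 for all-MiniLM-L6-v2).
embedding_dim = embedding_model.get_sentence_embedding_dimension()
index = faiss.IndexFlatL2(embedding_dim)
document_texts = []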
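
The deleted query_text ran the retrieved sentences back through a "retriever" model and decoded retriever_outputs.logits with a tokenizer, which does not produce usable text, and the commit removes the function without a replacement, so the query path is gone entirely. Purely as a sketch of the retrieval-plus-generation flow the deleted comments describe, reusing the module-level index, document_texts, embedding_model, generator and generator_tokenizer names the old code implies (a seq2seq model loaded via AutoModelForSeq2SeqLM):

import numpy as np


def query_text(text):
    """Retrieve the most similar stored sentences and generate an answer from them.

    Sketch only: index, document_texts, embedding_model, generator and
    generator_tokenizer are assumed to be defined at module level in app.py.
    """
    try:
        # Embed the query with the same model used at indexing time.
        query_embedding = embedding_model.encode([text])

        # Retrieve the k nearest stored sentences from FAISS.
        distances, indices = index.search(np.array(query_embedding), k=5)
        retrieved_docs = [document_texts[idx] for idx in indices[0] if idx != -1]

        # Feed the retrieved context plus the question to the seq2seq generator.
        prompt = "Context:\n" + "\n".join(retrieved_docs) + f"\n\nQuestion: {text}"
        inputs = generator_tokenizer(prompt, return_tensors="pt", truncation=True)
        output_ids = generator.generate(**inputs, max_new_tokens=200)
        return generator_tokenizer.decode(output_ids[0], skip_special_tokens=True)
    except Exception as e:
        print(f"Error querying text: {e}")
        return f"Error querying text: {e}"

Building a single prompt from the retrieved sentences avoids the old pattern of pushing the retrieved text through a second model before generation.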
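
Finally, the commit also deletes the Gradio wiring (`# Create Gradio interface`, `with gr.Blocks() as demo:` and the truncated `query_button.click(fn=query_text, inputs` line) without adding anything back, so app.py no longer defines an interface at all. A minimal sketch of the kind of Blocks layout those deleted lines imply; every component and variable name below is a guess rather than the app's original naming:

import gradio as gr

# Assumes upload_files and a (restored) query_text are defined above in app.py.
with gr.Blocks() as demo:
    # type="filepath" hands upload_files a list of path strings,
    # matching the file_path.endswith(...) checks in the diff.
    file_input = gr.File(label="Upload PDF / DOCX files", file_count="multiple", type="filepath")
    upload_button = gr.Button("Process files")
    upload_status = gr.Textbox(label="Status")

    query_input = gr.Textbox(label="Ask a question")
    query_button = gr.Button("Query")
    answer_output = gr.Textbox(label="Answer")

    upload_button.click(fn=upload_files, inputs=file_input, outputs=upload_status)
    query_button.click(fn=query_text, inputs=query_input, outputs=answer_output)

demo.launch()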