import os
import numpy as np
import faiss
import pickle
from sentence_transformers import SentenceTransformer
from transformers import pipeline
import gradio as gr
import fitz # PyMuPDF for PDFs
import docx # python-docx for Word files
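
# Overview: this script implements a small retrieval-augmented QA app. Uploaded
# PDF/DOCX files are split into chunks, embedded with a SentenceTransformer, and
# stored in a FAISS index; at question time the closest chunks are retrieved and
# passed as context to a seq2seq model that generates the answer.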
# Initialize global variables
index_path = "faiss_index.pkl"
document_texts_path = "document_texts.pkl"
# Load or initialize FAISS index and document chunks
if os.path.exists(index_path) and os.path.exists(document_texts_path):
    with open(index_path, "rb") as f:
        index = pickle.load(f)
    with open(document_texts_path, "rb") as f:
        document_texts = pickle.load(f)
else:
    # Use 384 dimensions to match the all-MiniLM-L6-v2 model
    dim = 384
    index = faiss.IndexFlatL2(dim)
    document_texts = []

# Load SentenceTransformer for embeddings
embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
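# Note: all-MiniLM-L6-v2 produces 384-dimensional vectors, matching the
# dimensionality of the FAISS index created above.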
# Initialize QA pipeline with a text generation model
qa_pipeline = pipeline("text2text-generation", model="google/flan-t5-small")

def extract_text_from_pdf(file_path):
    """Extract plain text from every page of a PDF using PyMuPDF."""
    doc = fitz.open(file_path)
    text = ""
    for page in doc:
        text += page.get_text()
    doc.close()
    return text

def extract_text_from_docx(file_path):
    """Extract text from a Word document, one paragraph per line."""
    doc = docx.Document(file_path)
    full_text = []
    for para in doc.paragraphs:
        full_text.append(para.text)
    return "\n".join(full_text)

def chunk_text(text, max_len=500):
    """Split text into chunks of at most max_len characters, preferring sentence boundaries."""
    import re
    sentences = re.split(r'(?<=[.!?]) +', text)
    chunks = []
    current_chunk = ""
    for sent in sentences:
        if len(current_chunk) + len(sent) + 1 <= max_len:
            current_chunk += sent + " "
        else:
            # Avoid appending empty chunks (e.g. when the very first sentence is oversized)
            if current_chunk.strip():
                chunks.append(current_chunk.strip())
            current_chunk = sent + " "
    if current_chunk.strip():
        chunks.append(current_chunk.strip())
    return chunks

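# Note on chunk_text: a single sentence longer than max_len is kept as one
# oversized chunk rather than being split mid-sentence.
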
def get_embeddings(texts, is_query=False):
    """Encode one or more texts into L2-normalized embedding vectors."""
    # is_query is accepted for API symmetry; queries and documents use the same encoder
    if isinstance(texts, str):
        texts = [texts]
    embeddings = embedder.encode(texts, convert_to_numpy=True, normalize_embeddings=True)
    return embeddings

def upload_document(file):
    """Extract, chunk, embed, and index an uploaded PDF or DOCX file."""
    global index, document_texts
    file_path = file.name  # Gradio exposes the uploaded file via a temp-file path
    ext = os.path.splitext(file_path)[-1].lower()
    try:
        if ext == ".pdf":
            text = extract_text_from_pdf(file_path)
        elif ext == ".docx":
            text = extract_text_from_docx(file_path)
        else:
            return "Unsupported file type. Please upload a PDF or DOCX file."
    except Exception as e:
        return f"Failed to extract text: {str(e)}"
    if not text.strip():
        return "Failed to extract any text from the document."
    chunks = chunk_text(text)
    embeddings = get_embeddings(chunks)
    # Wrap the FAISS index in an IDMap so new vectors can be added incrementally with explicit IDs
    if not isinstance(index, faiss.IndexIDMap):
        index = faiss.IndexIDMap(index)
    start_id = len(document_texts)
    ids = np.arange(start_id, start_id + len(chunks))
    index.add_with_ids(embeddings.astype("float32"), ids)
    document_texts.extend(chunks)
    # Persist the index and chunk texts so they survive restarts
    with open(index_path, "wb") as f:
        pickle.dump(index, f)
    with open(document_texts_path, "wb") as f:
        pickle.dump(document_texts, f)
    return f"Document uploaded and indexed successfully with {len(chunks)} chunks."

def generate_answer_from_file(query, top_k=5):
    """Retrieve the top_k most relevant chunks and generate an answer from them."""
    global index, document_texts
    if len(document_texts) == 0:
        return "No document uploaded yet. Please upload a PDF or DOCX file first."
    query_vec = get_embeddings(query, is_query=True).astype("float32")
    scores, indices = index.search(query_vec, top_k)
    # FAISS returns -1 for empty result slots, so keep only valid chunk indices
    retrieved_chunks = [document_texts[i] for i in indices[0] if 0 <= i < len(document_texts)]
    context = "\n\n".join(retrieved_chunks)
    prompt = (
        "You are a helpful assistant reading a document.\n\n"
        "Context:\n"
        f"{context}\n\n"
        f"Question: {query}\n"
        "Answer:"
    )
    # Generate deterministically (no sampling), capped at 256 tokens
    result = qa_pipeline(prompt, max_length=256, do_sample=False)[0]["generated_text"]
    return result.strip()

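# Note: flan-t5-small has a limited input window, so a very long retrieved
# context plus question may exceed what the model can attend to; reducing top_k
# or max_len in chunk_text is one way to keep prompts short.
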
with gr.Blocks() as demo:
    gr.Markdown("## Document Question Answering App\nUpload a PDF or DOCX file, then ask questions based on it.")
    with gr.Row():
        file_input = gr.File(label="Upload PDF or DOCX file", file_types=['.pdf', '.docx'])
        upload_btn = gr.Button("Upload & Index Document")
    upload_output = gr.Textbox(label="Upload Status", interactive=False)
    question = gr.Textbox(label="Enter your question here")
    answer = gr.Textbox(label="Answer", interactive=False)
    ask_btn = gr.Button("Ask")
    upload_btn.click(upload_document, inputs=file_input, outputs=upload_output)
    ask_btn.click(generate_answer_from_file, inputs=question, outputs=answer)

demo.launch()