import os
import re
import numpy as np
import faiss
import pickle
from sentence_transformers import SentenceTransformer
from transformers import pipeline
import gradio as gr
import fitz  # PyMuPDF for PDFs
import docx  # python-docx for Word files
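# Assumed environment: the imports above correspond to the pip packages
# numpy, faiss-cpu, sentence-transformers, transformers, gradio,
# PyMuPDF (imported as fitz) and python-docx (imported as docx).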

# Initialize global variables
index_path = "faiss_index.pkl"
document_texts_path = "document_texts.pkl"

# Load or initialize FAISS index and document chunks
if os.path.exists(index_path) and os.path.exists(document_texts_path):
    with open(index_path, "rb") as f:
        # The index is stored as a serialized byte buffer, since raw FAISS
        # (SWIG) index objects cannot be pickled directly
        index = faiss.deserialize_index(pickle.load(f))
    with open(document_texts_path, "rb") as f:
        document_texts = pickle.load(f)
else:
    # Use 384 dim for all-MiniLM-L6-v2 model
    dim = 384
    index = faiss.IndexFlatL2(dim)
    document_texts = []

# Load SentenceTransformer for embeddings
embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Initialize QA pipeline with a text generation model
qa_pipeline = pipeline("text2text-generation", model="google/flan-t5-small")
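# Note: flan-t5-small is a small (~80M parameter) encoder-decoder model whose
# tokenizer typically truncates inputs around 512 tokens, so very long
# retrieved contexts may be cut off; a larger variant such as flan-t5-base
# could be swapped in here if answer quality is a concern.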

def extract_text_from_pdf(file_path):
    doc = fitz.open(file_path)
    text = ""
    for page in doc:
        text += page.get_text()
    doc.close()
    return text

def extract_text_from_docx(file_path):
    doc = docx.Document(file_path)
    return "\n".join(para.text for para in doc.paragraphs)

def chunk_text(text, max_len=500):
    """Split text into chunks of max_len characters, trying to split at sentence boundaries."""
    sentences = re.split(r'(?<=[.!?]) +', text)
    chunks = []
    current_chunk = ""
    for sent in sentences:
        if len(current_chunk) + len(sent) + 1 <= max_len:
            current_chunk += sent + " "
        else:
            chunks.append(current_chunk.strip())
            current_chunk = sent + " "
    if current_chunk:
        chunks.append(current_chunk.strip())
    return chunks
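# Illustrative example (not executed): with max_len=40, the text
# "First sentence. Second sentence. A third one." is packed greedily into
# ["First sentence. Second sentence.", "A third one."]; note that a single
# sentence longer than max_len still ends up as one oversized chunk.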

def get_embeddings(texts, is_query=False):
    if isinstance(texts, str):
        texts = [texts]
    embeddings = embedder.encode(texts, convert_to_numpy=True, normalize_embeddings=True)
    return embeddings
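# Because the embeddings are L2-normalized above, squared L2 distance in the
# IndexFlatL2 index is monotonically related to cosine similarity
# (||a - b||^2 = 2 - 2*cos(a, b) for unit vectors), so nearest-neighbour
# search effectively ranks chunks by cosine similarity.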

def upload_document(file):
    global index, document_texts
    
    # gr.File may pass a file path string or a tempfile-like object with a
    # .name attribute, depending on the Gradio version; handle both cases
    file_path = file if isinstance(file, str) else file.name
    ext = os.path.splitext(file_path)[-1].lower()
    try:
        if ext == ".pdf":
            text = extract_text_from_pdf(file_path)
        elif ext == ".docx":
            text = extract_text_from_docx(file_path)
        else:
            return "Unsupported file type. Please upload a PDF or DOCX file."
    except Exception as e:
        return f"Failed to extract text: {str(e)}"

    if not text.strip():
        return "Failed to extract any text from the document."

    chunks = chunk_text(text)
    embeddings = get_embeddings(chunks)
    
    # Convert FAISS index to IDMap to allow adding new vectors incrementally
    if not isinstance(index, faiss.IndexIDMap):
        id_map = faiss.IndexIDMap(index)
        index = id_map
    
    start_id = len(document_texts)
    # FAISS expects 64-bit integer ids
    ids = np.arange(start_id, start_id + len(chunks), dtype="int64")
    
    index.add_with_ids(embeddings.astype('float32'), ids)
    document_texts.extend(chunks)

    # Persist the index and chunk texts (serialize the FAISS index first,
    # since raw FAISS/SWIG index objects cannot be pickled directly)
    with open(index_path, "wb") as f:
        pickle.dump(faiss.serialize_index(index), f)
    with open(document_texts_path, "wb") as f:
        pickle.dump(document_texts, f)

    return f"Document uploaded and indexed successfully with {len(chunks)} chunks."

def generate_answer_from_file(query, top_k=5):
    global index, document_texts
    
    if len(document_texts) == 0:
        return "No document uploaded yet. Please upload a PDF or DOCX file first."

    query_vec = get_embeddings(query, is_query=True).astype("float32")
    scores, indices = index.search(query_vec, top_k)
    # FAISS returns -1 for missing neighbours when fewer than top_k vectors
    # are indexed, so guard the lookup against invalid ids
    retrieved_chunks = [document_texts[i] for i in indices[0] if 0 <= i < len(document_texts)]

    context = "\n\n".join(retrieved_chunks)

    prompt = (
        "You are a helpful assistant reading a document.\n\n"
        "Context:\n"
        f"{context}\n\n"
        f"Question: {query}\n"
        "Answer:"
    )
    
    # Generate the answer deterministically, capped at 256 output tokens
    result = qa_pipeline(prompt, max_length=256, do_sample=False)[0]['generated_text']

    return result.strip()

with gr.Blocks() as demo:
    gr.Markdown("## Document Question Answering App\nUpload a PDF or DOCX file, then ask questions based on it.")
    
    with gr.Row():
        file_input = gr.File(label="Upload PDF or DOCX file", file_types=['.pdf', '.docx'])
        upload_btn = gr.Button("Upload & Index Document")
    
    upload_output = gr.Textbox(label="Upload Status", interactive=False)

    question = gr.Textbox(label="Enter your question here")
    answer = gr.Textbox(label="Answer", interactive=False)
    ask_btn = gr.Button("Ask")

    upload_btn.click(upload_document, inputs=file_input, outputs=upload_output)
    ask_btn.click(generate_answer_from_file, inputs=question, outputs=answer)

demo.launch()
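# Running this script (e.g. `python app.py`) serves the app locally on
# Gradio's default port (http://127.0.0.1:7860); demo.launch(share=True)
# would additionally create a temporary public link if one is needed.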