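# app.py: a minimal retrieval-augmented QA demo. Upload a PDF or DOCX, index it
# with MiniLM embeddings in FAISS, and answer questions with FLAN-T5.
# Dependencies (one possible set, unpinned):
#   pip install gradio transformers torch faiss-cpu pymupdf python-docx numpy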
import os
import pickle
import numpy as np
import gradio as gr
import fitz  # PyMuPDF
from docx import Document
from transformers import AutoModel, AutoTokenizer, pipeline
import faiss
import torch

# =============================================
# EMBEDDING MODEL SETUP
# =============================================
model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
embedding_model = AutoModel.from_pretrained(model_name)

def get_embeddings(texts):
    if isinstance(texts, str):
        texts = [texts]
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt", max_length=512)
    with torch.no_grad():
        outputs = embedding_model(**inputs)
    # Mean-pool over tokens (the pooling all-MiniLM-L6-v2 was trained with),
    # ignoring padding via the attention mask.
    mask = inputs["attention_mask"].unsqueeze(-1).float()
    embeddings = (outputs.last_hidden_state * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)
    embeddings = embeddings.cpu().numpy()
    # L2-normalize so the inner-product scores from IndexFlatIP are cosine similarities.
    return embeddings / np.maximum(np.linalg.norm(embeddings, axis=1, keepdims=True), 1e-12)
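# Example (shapes only): get_embeddings(["hello world"]) returns a (1, 384)
# float array with unit-length rows, ready for cosine search.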

# =============================================
# TEXT CHUNKING
# =============================================
def chunk_text(text, chunk_size=500, overlap=50):
    # Fixed-size character chunks with a small overlap, so a sentence cut at a
    # chunk boundary still appears intact in the neighboring chunk.
    if chunk_size <= overlap:
        raise ValueError("chunk_size must be greater than overlap")
    chunks = []
    start = 0
    while start < len(text):
        chunks.append(text[start:start + chunk_size])
        start += chunk_size - overlap
    return chunks
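# Example: a 1,200-character document yields three chunks, starting at offsets
# 0, 450, and 900 (step = chunk_size - overlap = 450).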

# =============================================
# FAISS INDEX SETUP
# =============================================
index_path = "faiss_index.pkl"
document_texts_path = "document_texts.pkl"
document_texts = []

embedding_dim = 384  # output dimension of all-MiniLM-L6-v2
if os.path.exists(index_path) and os.path.exists(document_texts_path):
    try:
        with open(index_path, "rb") as f:
            index = pickle.load(f)
        with open(document_texts_path, "rb") as f:
            document_texts = pickle.load(f)
    except Exception as e:
        print(f"Error loading index: {e}")
        index = faiss.IndexFlatIP(embedding_dim)
        document_texts = []  # reset both so the index and its texts stay in sync
else:
    index = faiss.IndexFlatIP(embedding_dim)
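# IndexFlatIP performs exact brute-force inner-product search. Because
# get_embeddings returns unit vectors, its scores are cosine similarities,
# and no training step is needed before calling index.add().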

# =============================================
# DOCUMENT PROCESSING
# =============================================
def extract_text_from_pdf(path):
    text = ""
    try:
        with fitz.open(path) as doc:  # context manager closes the file handle
            for page in doc:
                text += page.get_text()
    except Exception as e:
        print(f"PDF error: {e}")
    return text

def extract_text_from_docx(path):
    text = ""
    try:
        doc = Document(path)
        text = "\n".join([para.text for para in doc.paragraphs])
    except Exception as e:
        print(f"DOCX error: {e}")
    return text
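# Note: doc.paragraphs only covers body paragraphs; text inside tables,
# headers, and footers is not extracted this way.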

# =============================================
# UPLOAD AND INDEX FILE
# =============================================
def upload_document(file):
    ext = os.path.splitext(file.name)[-1].lower()
    if ext == ".pdf":
        text = extract_text_from_pdf(file.name)
    elif ext == ".docx":
        text = extract_text_from_docx(file.name)
    else:
        return "Unsupported file type. Please upload a .pdf or .docx file."

    if not text.strip():
        return "No text could be extracted from this file."

    chunks = chunk_text(text)
    chunk_embeddings = get_embeddings(chunks)
    index.add(np.array(chunk_embeddings, dtype="float32"))
    document_texts.extend(chunks)

    # Persist the index and its chunk texts together so they survive restarts.
    with open(index_path, "wb") as f:
        pickle.dump(index, f)
    with open(document_texts_path, "wb") as f:
        pickle.dump(document_texts, f)

    return "Document uploaded and indexed successfully."

# =============================================
# QA PIPELINE WITH FLAN-T5
# =============================================
qa_pipeline = pipeline("text2text-generation", model="google/flan-t5-base")

def generate_answer_from_file(query, top_k=3):
    if not document_texts:
        return "No documents indexed yet."

    query_vector = get_embeddings(query).astype("float32")
    scores, indices = index.search(query_vector, min(top_k, index.ntotal))
    # FAISS pads missing results with -1, so keep only valid positions.
    retrieved_chunks = [document_texts[i] for i in indices[0] if 0 <= i < len(document_texts)]
    context = " ".join(retrieved_chunks)

    prompt = f"Context: {context}\n\nQuestion: {query}\nAnswer:"
    result = qa_pipeline(prompt, max_length=200)[0]["generated_text"]
    return result
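# Caveat: flan-t5-base was trained on sequences up to 512 tokens. T5's relative
# position embeddings let longer prompts run, but answer quality tends to drop,
# so keep top_k * chunk_size modest.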

# =============================================
# GRADIO UI
# =============================================
upload_interface = gr.Interface(
    fn=upload_document,
    inputs=gr.File(file_types=[".pdf", ".docx"]),
    outputs="text",
    title="Upload Document",
    description="Upload a Word or PDF file to index it for question answering."
)

search_interface = gr.Interface(
    fn=generate_answer_from_file,
    inputs=gr.Textbox(placeholder="Ask a question about the uploaded document..."),
    outputs="text",
    title="Ask Your Document",
    description="Ask any question. The chatbot will read the document and answer like ChatGPT."
)

app = gr.TabbedInterface([upload_interface, search_interface], ["Upload", "Ask"])
app.launch()
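# Tip: app.launch(share=True) returns a temporary public URL, useful when
# running on a remote machine or inside a notebook.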