Now, can you explain how this code works? I want to understand it deeply.

import os
import pickle
import numpy as np
import gradio as gr
import fitz  # PyMuPDF
from docx import Document
from transformers import AutoModel, AutoTokenizer, pipeline
import faiss
import torch

# ===============================
# EMBEDDING MODEL (E5)
# ===============================
model_name = "intfloat/e5-small-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
embedding_model = AutoModel.from_pretrained(model_name)
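# e5-small-v2 produces 384-dimensional embeddings; this must match the
# embedding_dim used to build the FAISS index below.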

def get_embeddings(texts, is_query=False):
    if isinstance(texts, str):
        texts = [texts]
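    # E5 was trained with instruction prefixes: "query: " for search
    # queries and "passage: " for indexed text. Dropping the prefix
    # noticeably hurts retrieval quality.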
    prefix = "query: " if is_query else "passage: "
    texts = [prefix + t for t in texts]

    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt", max_length=512)
    with torch.no_grad():
        model_output = embedding_model(**inputs)

    # Mask-aware mean pooling over the token states (the pooling
    # e5-small-v2 was trained with), then L2-normalize so the
    # inner-product search in IndexFlatIP behaves like cosine similarity.
    mask = inputs["attention_mask"].unsqueeze(-1).float()
    embeddings = (model_output.last_hidden_state * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)
    embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
    return embeddings.cpu().numpy()


# ===============================
# TEXT CHUNKING
# ===============================
def chunk_text(text, chunk_size=800, overlap=100):
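    # Slide a fixed-size character window over the text; the overlap
    # means a sentence cut at one chunk boundary reappears intact in the
    # next chunk. With chunk_size=800 and overlap=100, windows start at
    # characters 0, 700, 1400, ...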
    chunks = []
    start = 0
    while start < len(text):
        end = min(len(text), start + chunk_size)
        chunks.append(text[start:end])
        if end == len(text):
            break  # final chunk reached the end; avoid emitting a duplicate tail
        start += chunk_size - overlap
    return chunks

# ===============================
# FAISS INDEX SETUP
# ===============================
index_path = "faiss_index.pkl"
document_texts_path = "document_texts.pkl"
document_texts = []
embedding_dim = 384
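
# IndexFlatIP is an exact (brute-force) inner-product index; with the
# L2-normalized embeddings above, inner product equals cosine similarity.
# The index and the chunk texts are pickled so the store survives restarts,
# and they must stay aligned: vector i in the index belongs to document_texts[i].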

if os.path.exists(index_path) and os.path.exists(document_texts_path):
    try:
        with open(index_path, "rb") as f:
            index = pickle.load(f)
        with open(document_texts_path, "rb") as f:
            document_texts = pickle.load(f)
    except Exception as e:
        print(f"Error loading index: {e}")
        index = faiss.IndexFlatIP(embedding_dim)
        document_texts = []  # keep the texts aligned with the rebuilt index
else:
    index = faiss.IndexFlatIP(embedding_dim)

# ===============================
# FILE EXTRACTORS
# ===============================
def extract_text_from_pdf(path):
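    # PyMuPDF (fitz) iterates the PDF page by page and concatenates each
    # page's plain text.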
    text = ""
    try:
        doc = fitz.open(path)
        for page in doc:
            text += page.get_text()
    except Exception as e:
        print(f"PDF error: {e}")
    return text

def extract_text_from_docx(path):
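    # python-docx exposes paragraph text only; text inside tables,
    # headers, and footers is not captured here.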
    text = ""
    try:
        doc = Document(path)
        text = "\n".join([para.text for para in doc.paragraphs])
    except Exception as e:
        print(f"DOCX error: {e}")
    return text

# ===============================
# UPLOAD HANDLER
# ===============================
def upload_document(file):
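    # Ingestion pipeline: extract raw text by file extension, split it
    # into overlapping chunks, embed every chunk, add the vectors to the
    # FAISS index, and persist both the index and the texts to disk.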
    ext = os.path.splitext(file.name)[-1].lower()
    if ext == ".pdf":
        text = extract_text_from_pdf(file.name)
    elif ext == ".docx":
        text = extract_text_from_docx(file.name)
    else:
        return "Unsupported file type."

    chunks = chunk_text(text)
    if not chunks:
        return "No text could be extracted from the file."

    chunk_embeddings = get_embeddings(chunks)
    index.add(chunk_embeddings.astype("float32"))
    document_texts.extend(chunks)

    with open(index_path, "wb") as f:
        pickle.dump(index, f)
    with open(document_texts_path, "wb") as f:
        pickle.dump(document_texts, f)

    return "Document uploaded and indexed successfully."

# ===============================
# GENERATION PIPELINE (FLAN-T5)
# ===============================
qa_pipeline = pipeline("text2text-generation", model="google/flan-t5-base")
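# flan-t5-base is an instruction-tuned encoder-decoder model; it serves as
# the generation half of this retrieval-augmented (RAG) setup and is
# prompted to answer only from the retrieved chunks.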

def generate_answer_from_file(query, top_k=10):
    if not document_texts:
        return "No documents indexed yet."

    # Embed the question with is_query=True so it gets the "query: "
    # prefix E5 expects for search queries.
    query_vector = get_embeddings(query, is_query=True).astype("float32")
    scores, indices = index.search(query_vector, k=top_k)
    # FAISS pads the result with -1 when fewer than top_k vectors are
    # indexed; skip those so we don't wrap around to document_texts[-1].
    retrieved_chunks = [document_texts[i] for i in indices[0] if i != -1]
    context = "\n\n".join(retrieved_chunks)

    print("\n--- Retrieved Context ---\n", context)  # Debugging print

    # Prompt engineering: a one-shot example shows the model the expected
    # answer style before the real context/question pair.
    prompt = (
        "You are a helpful assistant reading student notes or textbook passages.\n\n"
        "Based on the context provided, answer the question accurately and clearly.\n\n"
        "### Example\n"
        "Context:\nArtificial systems are created by people. These systems are designed to perform specific tasks, improve efficiency, and solve problems. Examples include knowledge systems, engineering systems, and social systems.\n\n"
        "Question: What is an Artificial System?\n"
        "Answer: Artificial systems are systems created by humans to perform specific tasks, improve efficiency, and solve problems. They include systems like knowledge systems, engineering systems, and social systems.\n\n"
        "### Now answer this\n"
        f"Context:\n{context}\n\n"
        f"Question: {query}\n"
        f"Answer:"
    )

    # do_sample=False means greedy decoding, so answers are deterministic;
    # max_length caps the generated answer at 512 tokens.
    result = qa_pipeline(prompt, max_length=512, do_sample=False)[0]['generated_text']
    return result.strip()

# ===============================
# GRADIO INTERFACES
# ===============================
upload_interface = gr.Interface(
    fn=upload_document,
    inputs=gr.File(file_types=[".pdf", ".docx"]),
    outputs="text",
    title="Upload Document",
    description="Upload your Word or PDF document for question answering."
)

search_interface = gr.Interface(
    fn=generate_answer_from_file,
    inputs=gr.Textbox(placeholder="Ask your question about the uploaded document..."),
    outputs="text",
    title="Ask the Document",
    description="Ask questions about the uploaded content. The chatbot will answer based on the document."
)

# Both tabs run in the same Python process, so the in-memory index updated
# in "Upload" is immediately visible to "Ask".
app = gr.TabbedInterface([upload_interface, search_interface], ["Upload", "Ask"])
app.launch()