import os
import pickle
import numpy as np
import gradio as gr
import fitz  # PyMuPDF
from docx import Document
from transformers import AutoModel, AutoTokenizer, pipeline
import faiss
import torch

# ===============================
# EMBEDDING MODEL SETUP
# ===============================
model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
embedding_model = AutoModel.from_pretrained(model_name)

def get_embeddings(texts):
    if isinstance(texts, str):
        texts = [texts]
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt", max_length=512)
    with torch.no_grad():
        outputs = embedding_model(**inputs)
    # Mean-pool the token embeddings, masking out padding. This matches the
    # pooling all-MiniLM-L6-v2 was trained with; taking only the [CLS] token
    # gives noticeably weaker sentence embeddings for this model.
    mask = inputs["attention_mask"].unsqueeze(-1).float()
    summed = (outputs.last_hidden_state * mask).sum(dim=1)
    counts = mask.sum(dim=1).clamp(min=1e-9)
    embeddings = (summed / counts).cpu().numpy()
    # Normalize to unit length so inner-product search equals cosine similarity
    embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
    return embeddings
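
# Illustrative sanity check (comment only, not executed; the 384-dim shape
# assumes all-MiniLM-L6-v2's hidden size):
#   vecs = get_embeddings(["hello world", "greetings"])
#   vecs.shape     # -> (2, 384)
#   vecs @ vecs.T  # -> cosine-similarity matrix, since rows are unit vectors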

# ===============================
# TEXT CHUNKING
# ===============================
def chunk_text(text, chunk_size=500, overlap=50):
    """Split text into overlapping character chunks."""
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")
    chunks = []
    start = 0
    while start < len(text):
        end = min(len(text), start + chunk_size)
        chunks.append(text[start:end])
        start += chunk_size - overlap  # advance, keeping `overlap` chars of shared context
    return chunks
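
# Example (illustrative): chunk_text("a" * 1200) produces chunks starting at
# offsets 0, 450 and 900, so consecutive chunks share 50 characters of context.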

# ===============================
# FAISS INDEX SETUP
# ===============================
index_path = "faiss_index.pkl"
document_texts_path = "document_texts.pkl"
document_texts = []
embedding_dim = 384  # For all-MiniLM-L6-v2
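
# IndexFlatIP performs exact inner-product search; because get_embeddings
# returns unit-normalized vectors, inner product here is exactly cosine
# similarity.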

if os.path.exists(index_path) and os.path.exists(document_texts_path):
    try:
        with open(index_path, "rb") as f:
            index = pickle.load(f)
        with open(document_texts_path, "rb") as f:
            document_texts = pickle.load(f)
    except Exception as e:
        print(f"Error loading index: {e}")
        # Fall back to a fresh index and reset the texts so the two stay in sync
        index = faiss.IndexFlatIP(embedding_dim)
        document_texts = []
else:
    index = faiss.IndexFlatIP(embedding_dim)

# ===============================
# FILE EXTRACTORS
# ===============================
def extract_text_from_pdf(path):
    text = ""
    try:
        with fitz.open(path) as doc:  # context manager closes the file handle
            for page in doc:
                text += page.get_text()
    except Exception as e:
        print(f"PDF error: {e}")
    return text

def extract_text_from_docx(path):
    text = ""
    try:
        doc = Document(path)
        text = "\n".join([para.text for para in doc.paragraphs])
    except Exception as e:
        print(f"DOCX error: {e}")
    return text

# ===============================
# UPLOAD HANDLER
# ===============================
def upload_document(file):
    ext = os.path.splitext(file.name)[-1].lower()
    if ext == ".pdf":
        text = extract_text_from_pdf(file.name)
    elif ext == ".docx":
        text = extract_text_from_docx(file.name)
    else:
        return "Unsupported file type."

    if not text.strip():
        return "No text could be extracted from the document."

    chunks = chunk_text(text)
    chunk_embeddings = get_embeddings(chunks)
    index.add(chunk_embeddings.astype("float32"))  # FAISS expects float32
    document_texts.extend(chunks)

    with open(index_path, "wb") as f:
        pickle.dump(index, f)
    with open(document_texts_path, "wb") as f:
        pickle.dump(document_texts, f)

    return "Document uploaded and indexed successfully."

# ===============================
# GENERATION PIPELINE (FLAN-T5)
# ===============================
qa_pipeline = pipeline("text2text-generation", model="google/flan-t5-base")
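# Note: flan-t5-base was trained with 512-token inputs, so a prompt built from
# seven 500-character chunks can exceed that; answer quality may degrade and
# the tokenizer may warn about sequence length on long documents.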

def generate_answer_from_file(query, top_k=7):
    if not document_texts:
        return "No documents indexed yet."

    query_vector = get_embeddings(query).astype("float32")
    scores, indices = index.search(query_vector, k=top_k)
    # FAISS pads the result with -1 when fewer than k vectors are indexed,
    # so filter to valid positions before looking up the chunks
    retrieved_chunks = [document_texts[i] for i in indices[0] if 0 <= i < len(document_texts)]
    context = "\n\n".join(retrieved_chunks)

    prompt = (
        "You are a helpful and precise assistant reading student notes or textbook passages.\n\n"
        "Based on the context provided, answer the question accurately and in detail using full sentences.\n\n"
        "### Example\n"
        "Context:\nArtificial systems are created by people. These systems are designed to perform specific tasks, improve efficiency, and solve problems. Examples include knowledge systems, engineering systems, and social systems.\n\n"
        "Question: What is an Artificial System?\n"
        "Answer: Artificial systems are systems created by humans to perform specific tasks, improve efficiency, and solve problems. They include systems such as knowledge systems, engineering systems, and social systems.\n\n"
        "### Now answer this\n"
        f"Context:\n{context}\n\n"
        f"Question: {query}\n"
        "Answer:\nPlease answer ONLY based on the context above without adding extra information."
    )

    result = qa_pipeline(prompt, max_length=700, do_sample=False)[0]['generated_text']
    return result.strip()
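
# Illustrative call (comment only; assumes a document was uploaded first):
#   generate_answer_from_file("What is an artificial system?")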

# ===============================
# GRADIO INTERFACES
# ===============================
upload_interface = gr.Interface(
    fn=upload_document,
    inputs=gr.File(file_types=[".pdf", ".docx"]),
    outputs="text",
    title="Upload Document",
    description="Upload your Word or PDF document for question answering."
)

search_interface = gr.Interface(
    fn=generate_answer_from_file,
    inputs=gr.Textbox(placeholder="Ask your question about the uploaded document..."),
    outputs="text",
    title="Ask the Document",
    description="Ask questions about the uploaded content. The chatbot will answer based on the document."
)

app = gr.TabbedInterface([upload_interface, search_interface], ["Upload", "Ask"])
app.launch()
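
# To run locally (assuming this file is saved as app.py; package names are the
# usual PyPI ones for the imports above):
#   pip install gradio faiss-cpu pymupdf python-docx transformers torch numpy
#   python app.py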