Spaces:
Runtime error
Runtime error
File size: 5,523 Bytes
f01a813 944d263 24d9947 834c71a 24d9947 d87413b 24d9947 145a282 24d9947 9502a66 6a2ef85 9502a66 6a2ef85 3ac4e4b 6a2ef85 3ac4e4b 6a2ef85 3ac4e4b 6a2ef85 f01a813 6a2ef85 5b2f320 f01a813 9502a66 d87413b 9502a66 2737463 d87413b 9502a66 d87413b 9502a66 24d9947 f01a813 24d9947 3ac4e4b 24d9947 3ac4e4b 56ec544 9502a66 d87413b 944d263 d87413b 24d9947 944d263 24d9947 944d263 834c71a d87413b 944d263 d87413b 944d263 a028e27 9502a66 a028e27 f01a813 a028e27 f01a813 a028e27 d87413b a028e27 d87413b f01a813 d87413b a028e27 d87413b a028e27 9502a66 b7ba413 9502a66 b7ba413 d87413b 2737463 a028e27 f01a813 d87413b 59386c9 a028e27 2737463 9502a66 2737463 59386c9 2737463 59386c9 9502a66 59386c9 2737463 9502a66 a028e27 2737463 9502a66 a028e27 d87413b 9502a66 a028e27 d87413b 9502a66 d87413b 9502a66 a028e27 d87413b f01a813 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 |
Now explain how this code works; I want to understand it deeply.
import os
import pickle
import numpy as np
import gradio as gr
import fitz # PyMuPDF
from docx import Document
from transformers import AutoModel, AutoTokenizer, pipeline
import faiss
import torch
# ===============================
# EMBEDDING MODEL (E5)
# ===============================
# Checkpoint identifier shared by the tokenizer and the encoder below.
model_name = "intfloat/e5-small-v2"
# Both objects are created once, when this module is executed, and are reused
# by get_embeddings() on every call.
tokenizer = AutoTokenizer.from_pretrained(model_name)
embedding_model = AutoModel.from_pretrained(model_name)
def get_embeddings(texts, is_query=False, batch_size=32):
    """Embed one or more texts with the module-level E5 encoder.

    Each text is prefixed with ``"query: "`` or ``"passage: "`` before
    encoding. Token states are mean-pooled over the non-padding positions and
    L2-normalised, so that an inner-product search ranks by cosine similarity.

    NOTE: vectors produced with a different pooling scheme are not comparable
    with these; an index built earlier with another scheme must be rebuilt.

    Args:
        texts: A single string or a list of strings.
        is_query: Use the query prefix instead of the passage prefix.
        batch_size: Number of texts encoded per forward pass, which bounds
            peak memory for long documents.

    Returns:
        A float32 ``numpy.ndarray`` with one row per input text.
    """
    if isinstance(texts, str):
        texts = [texts]
    prefix = "query: " if is_query else "passage: "
    prefixed = [prefix + t for t in texts]

    # The tokenizer rejects an empty batch; return an empty matrix instead.
    if not prefixed:
        return np.zeros((0, embedding_model.config.hidden_size), dtype="float32")

    batch_size = max(1, int(batch_size))
    batches = []
    for start in range(0, len(prefixed), batch_size):
        inputs = tokenizer(
            prefixed[start:start + batch_size],
            padding=True,
            truncation=True,
            return_tensors="pt",
            max_length=512,
        )
        with torch.no_grad():
            hidden = embedding_model(**inputs).last_hidden_state

        # Masked mean pooling: padding positions must not dilute the average.
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)
        pooled = torch.nn.functional.normalize(pooled, p=2, dim=1)
        batches.append(pooled.cpu().numpy())

    return np.concatenate(batches, axis=0)
# ===============================
# TEXT CHUNKING
# ===============================
def chunk_text(text, chunk_size=800, overlap=100):
    """Split *text* into overlapping, fixed-size character windows.

    Args:
        text: String to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by consecutive chunks; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        The chunks in document order; an empty list for empty text.

    Raises:
        ValueError: If the sizes would stop the window from advancing.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if not 0 <= overlap < chunk_size:
        # overlap >= chunk_size would give a zero or negative step and the
        # loop would never terminate.
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    step = chunk_size - overlap
    total = len(text)
    chunks = []
    for start in range(0, total, step):
        chunks.append(text[start:start + chunk_size])
        # Stop once a window reaches the end; a further window would only
        # repeat characters already contained in this one.
        if start + chunk_size >= total:
            break
    return chunks
# ===============================
# FAISS INDEX SETUP
# ===============================
# Files used to persist the vector index and the chunk texts between runs.
index_path = "faiss_index.pkl"
document_texts_path = "document_texts.pkl"
# document_texts[i] is the chunk whose vector is stored at row i of `index`.
document_texts = []
embedding_dim = 384

if os.path.exists(index_path) and os.path.exists(document_texts_path):
    try:
        # NOTE(security): pickle executes arbitrary code while loading. These
        # files must only ever be ones written by this application.
        with open(index_path, "rb") as f:
            loaded_index = pickle.load(f)
        with open(document_texts_path, "rb") as f:
            loaded_texts = pickle.load(f)

        # Rows and texts are matched purely by position, so a size or
        # dimension mismatch would make every later lookup wrong or raise.
        if loaded_index.d != embedding_dim:
            raise ValueError("saved index has an unexpected vector dimension")
        if loaded_index.ntotal != len(loaded_texts):
            raise ValueError("saved index and document texts are out of sync")

        index = loaded_index
        document_texts = list(loaded_texts)
    except Exception as e:
        # Best effort: fall back to an empty store rather than refuse to start.
        print(f"Error loading index: {e}")
        index = faiss.IndexFlatIP(embedding_dim)
        document_texts = []
else:
    index = faiss.IndexFlatIP(embedding_dim)
# ===============================
# FILE EXTRACTORS
# ===============================
def extract_text_from_pdf(path):
    """Return the concatenated text of every page of the PDF at *path*.

    Extraction is best effort: any error is printed and whatever text was
    collected before the failure (possibly the empty string) is returned.
    """
    pages = []
    try:
        # The context manager closes the document handle even on failure.
        with fitz.open(path) as doc:
            # Explicit loop so pages read before an error are still kept.
            for page in doc:
                pages.append(page.get_text())
    except Exception as e:
        print(f"PDF error: {e}")
    return "".join(pages)
def extract_text_from_docx(path):
    """Return the paragraphs of the DOCX file at *path*, joined by newlines.

    Any error while opening or reading the file is printed and an empty
    string is returned instead.
    """
    try:
        paragraphs = Document(path).paragraphs
        return "\n".join(paragraph.text for paragraph in paragraphs)
    except Exception as e:
        print(f"DOCX error: {e}")
        return ""
# ===============================
# UPLOAD HANDLER
# ===============================
def upload_document(file):
    """Extract, chunk, embed and index an uploaded PDF or DOCX file.

    Args:
        file: The value Gradio passes for the file input. Depending on the
            Gradio version/configuration this is either a path string or an
            object exposing the path as ``.name``; ``None`` when nothing was
            uploaded.

    Returns:
        A human-readable status message.
    """
    if file is None:
        return "No file provided."

    if isinstance(file, (str, os.PathLike)):
        path = os.fspath(file)
    else:
        path = file.name

    ext = os.path.splitext(path)[-1].lower()
    if ext == ".pdf":
        text = extract_text_from_pdf(path)
    elif ext == ".docx":
        text = extract_text_from_docx(path)
    else:
        return "Unsupported file type."

    # The extractors return "" on failure; embedding an empty chunk list
    # would fail inside the tokenizer, so stop here with a clear message.
    if not text.strip():
        return "No extractable text found in the document."

    chunks = chunk_text(text)
    chunk_embeddings = get_embeddings(chunks)
    index.add(np.ascontiguousarray(chunk_embeddings, dtype="float32"))
    # Keep texts in the same order as the rows just added to the index.
    document_texts.extend(chunks)

    with open(index_path, "wb") as f:
        pickle.dump(index, f)
    with open(document_texts_path, "wb") as f:
        pickle.dump(document_texts, f)
    return "Document uploaded and indexed successfully."
# ===============================
# GENERATION PIPELINE
# ===============================
# NOTE(review): the original file called `qa_pipeline` without ever defining
# it, and had a stray module-level statement here that used the undefined name
# `query` (NameError at import). The task and checkpoint below are an
# assumption chosen to match the prompt-in / 'generated_text'-out usage in
# generate_answer_from_file -- confirm the intended model.
qa_pipeline = pipeline("text2text-generation", model="google/flan-t5-base")


def generate_answer_from_file(query, top_k=10):
    """Answer *query* using the most similar indexed document chunks.

    Args:
        query: The user's question.
        top_k: Maximum number of chunks to retrieve as context.

    Returns:
        The generated answer, or a short status message when there is nothing
        to search or nothing to ask.
    """
    if not document_texts:
        return "No documents indexed yet."
    if not query or not query.strip():
        return "Please enter a question."

    # Queries must carry the "query: " prefix to match the passage vectors.
    query_vector = get_embeddings(query, is_query=True).astype("float32")

    # Never ask for more neighbours than exist: FAISS pads missing results
    # with -1, which would silently index the last chunk.
    k = max(1, min(int(top_k), len(document_texts)))
    _scores, indices = index.search(query_vector, k=k)
    retrieved_chunks = [
        document_texts[i] for i in indices[0] if 0 <= i < len(document_texts)
    ]
    if not retrieved_chunks:
        return "No relevant content found in the indexed documents."

    context = "\n\n".join(retrieved_chunks)
    print("\n--- Retrieved Context ---\n", context)  # Debugging print

    # One worked example followed by the real context and question.
    prompt = (
        "You are a helpful assistant reading student notes or textbook passages.\n\n"
        "Based on the context provided, answer the question accurately and clearly.\n\n"
        "### Example\n"
        "Context:\nArtificial systems are created by people. These systems are designed to perform specific tasks, improve efficiency, and solve problems. Examples include knowledge systems, engineering systems, and social systems.\n\n"
        "Question: What is an Artificial System?\n"
        "Answer: Artificial systems are systems created by humans to perform specific tasks, improve efficiency, and solve problems. They include systems like knowledge systems, engineering systems, and social systems.\n\n"
        "### Now answer this\n"
        f"Context:\n{context}\n\n"
        f"Question: {query}\n"
        "Answer:"
    )
    result = qa_pipeline(prompt, max_length=512, do_sample=False)[0]['generated_text']
    return result.strip()
# ===============================
# GRADIO INTERFACES
# ===============================
# "Upload" tab: pass one PDF/DOCX file to upload_document and show its
# status message.
upload_interface = gr.Interface(
    upload_document,
    gr.File(file_types=[".pdf", ".docx"]),
    "text",
    title="Upload Document",
    description="Upload your Word or PDF document for question answering.",
)

# "Ask" tab: pass the typed question to generate_answer_from_file and show
# the returned answer.
search_interface = gr.Interface(
    generate_answer_from_file,
    gr.Textbox(placeholder="Ask your question about the uploaded document..."),
    "text",
    title="Ask the Document",
    description="Ask questions about the uploaded content. The chatbot will answer based on the document.",
)

# Combine both screens into one tabbed app and start serving it.
app = gr.TabbedInterface(
    interface_list=[upload_interface, search_interface],
    tab_names=["Upload", "Ask"],
)
app.launch()
|