NaimaAqeel committed on
Commit
0a2bb75
·
verified ·
1 Parent(s): 072e16f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +53 -152
app.py CHANGED
@@ -1,164 +1,65 @@
1
- import os
2
- import pickle
3
- import numpy as np
4
  import gradio as gr
5
- import fitz # PyMuPDF
6
- from docx import Document
7
- from transformers import AutoModel, AutoTokenizer, pipeline
8
- import faiss
9
- import torch
10
-
11
- # ===============================
12
- # EMBEDDING MODEL (E5)
13
- # ===============================
14
model_name = "intfloat/e5-small-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
embedding_model = AutoModel.from_pretrained(model_name)


def get_embeddings(texts, is_query=False):
    """Embed one or more texts with the E5 model.

    Args:
        texts: A single string or a list of strings.
        is_query: When True the texts are prefixed with ``"query: "``,
            otherwise with ``"passage: "``.

    Returns:
        A NumPy array with one L2-normalized embedding row per input text.
        An empty input list yields an array with zero rows.

    NOTE: vectors produced here are not comparable with vectors produced by
    the previous CLS-token version, so an index persisted by that version
    should be rebuilt.
    """
    if isinstance(texts, str):
        texts = [texts]
    if not texts:
        # The tokenizer rejects an empty batch; return an empty matrix instead.
        return np.empty((0, embedding_model.config.hidden_size), dtype="float32")

    prefix = "query: " if is_query else "passage: "
    prefixed = [prefix + t for t in texts]

    inputs = tokenizer(
        prefixed, padding=True, truncation=True, return_tensors="pt", max_length=512
    )
    with torch.no_grad():
        model_output = embedding_model(**inputs)

    # Masked mean pooling: average token states while ignoring padding
    # positions, instead of taking only the first ([CLS]) token.
    hidden = model_output.last_hidden_state
    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
    pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)

    # Unit-length vectors make inner-product search equal to cosine similarity.
    pooled = torch.nn.functional.normalize(pooled, p=2, dim=1)
    return pooled.cpu().numpy()
30
-
31
-
32
- # ===============================
33
- # TEXT CHUNKING
34
- # ===============================
35
def chunk_text(text, chunk_size=800, overlap=100):
    """Split *text* into overlapping character windows.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by consecutive chunks; must be
            smaller than ``chunk_size``.

    Returns:
        A list of substrings. Each chunk starts ``chunk_size - overlap``
        characters after the previous one; the last chunk may be shorter.
        An empty *text* yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size`` (the window would never advance).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    step = chunk_size - overlap
    if step <= 0:
        # A non-positive step previously caused an endless loop.
        raise ValueError("overlap must be smaller than chunk_size")

    # Slicing clamps at the end of the string, so no explicit min() is needed.
    return [text[start:start + chunk_size] for start in range(0, len(text), step)]
43
-
44
- # ===============================
45
- # FAISS INDEX SETUP
46
- # ===============================
47
# Locations of the persisted index and of the chunk texts it refers to.
index_path = "faiss_index.pkl"
document_texts_path = "document_texts.pkl"
document_texts = []
embedding_dim = 384

index = None
if os.path.exists(index_path) and os.path.exists(document_texts_path):
    try:
        # SECURITY: unpickling can execute arbitrary code. Only load these
        # files if they were written by this application itself.
        with open(index_path, "rb") as f:
            loaded_index = pickle.load(f)
        with open(document_texts_path, "rb") as f:
            loaded_texts = pickle.load(f)

        # Search results are positions into document_texts, so both stores
        # must describe the same number of chunks.
        if loaded_index.ntotal != len(loaded_texts):
            raise ValueError(
                f"index has {loaded_index.ntotal} vectors but "
                f"{len(loaded_texts)} texts were stored"
            )
        index, document_texts = loaded_index, loaded_texts
    except Exception as e:
        print(f"Error loading index: {e}")

if index is None:
    # Nothing usable on disk: start with an empty inner-product index.
    index = faiss.IndexFlatIP(embedding_dim)
63
-
64
- # ===============================
65
- # FILE EXTRACTORS
66
- # ===============================
67
def extract_text_from_pdf(path):
    """Return the concatenated text of every page in the PDF at *path*.

    Errors are printed rather than raised. Text collected before an error is
    still returned, so a file that cannot be opened yields ``""``.
    """
    pages = []
    try:
        # The context manager closes the document even if a page fails.
        with fitz.open(path) as doc:
            for page in doc:
                pages.append(page.get_text())
    except Exception as e:
        print(f"PDF error: {e}")
    return "".join(pages)
76
 
77
def extract_text_from_docx(path):
    """Return the paragraphs of the DOCX file at *path*, joined by newlines.

    Any error is printed and results in an empty string.
    """
    try:
        paragraphs = Document(path).paragraphs
        return "\n".join(paragraph.text for paragraph in paragraphs)
    except Exception as err:
        print(f"DOCX error: {err}")
        return ""
85
-
86
- # ===============================
87
- # UPLOAD HANDLER
88
- # ===============================
89
def upload_document(file):
    """Extract, chunk, embed and index an uploaded PDF or DOCX file.

    Args:
        file: Uploaded file object; its ``name`` attribute is used as the
            path to read and to decide the file type by extension.

    Returns:
        A status message for the UI.
    """
    ext = os.path.splitext(file.name)[-1].lower()
    if ext == ".pdf":
        text = extract_text_from_pdf(file.name)
    elif ext == ".docx":
        text = extract_text_from_docx(file.name)
    else:
        return "Unsupported file type."

    # The extractors return "" on failure; without this guard an empty chunk
    # list would be passed on to the embedding and indexing steps.
    if not text.strip():
        return "No text could be extracted from the document."

    chunks = chunk_text(text)
    chunk_embeddings = get_embeddings(chunks)
    index.add(np.array(chunk_embeddings).astype('float32'))
    document_texts.extend(chunks)

    # Persist both stores together so they stay aligned across restarts.
    with open(index_path, "wb") as f:
        pickle.dump(index, f)
    with open(document_texts_path, "wb") as f:
        pickle.dump(document_texts, f)

    return "Document uploaded and indexed successfully."
109
-
110
-
111
- # ===============================
112
- # QA GENERATION PIPELINE
113
- # ===============================
114
# Initialize text generation pipeline (you can use a more powerful model if needed)
qa_pipeline = pipeline("text-generation", model="gpt2")


def generate_answer_from_file(query, top_k=10):
    """Answer *query* using the most similar indexed chunks as context.

    Args:
        query: The user's question.
        top_k: Maximum number of chunks to retrieve.

    Returns:
        The generated answer text (without the prompt), or a notice when no
        documents have been indexed.
    """
    if not document_texts:
        return "No documents indexed yet."

    query_vector = get_embeddings(query, is_query=True).astype("float32")

    # Asking for more neighbours than exist makes FAISS pad the result with
    # -1, which as a Python index would silently select the last chunk.
    k = max(1, min(top_k, len(document_texts)))
    scores, indices = index.search(query_vector, k=k)
    retrieved_chunks = [
        document_texts[i] for i in indices[0] if 0 <= i < len(document_texts)
    ]

    # Heuristic character budget so that prompt + answer stay inside GPT-2's
    # 1024-token window; tune if a different model is used.
    max_context_chars = 2000
    context = "\n\n".join(retrieved_chunks)[:max_context_chars]

    # Prompt for the model
    prompt = (
        "You are a helpful assistant reading student notes or textbook passages.\n\n"
        "Based on the context provided, answer the question accurately and clearly.\n\n"
        "### Example\n"
        "Context:\nArtificial systems are created by people. These systems are designed to perform specific tasks, improve efficiency, and solve problems. Examples include knowledge systems, engineering systems, and social systems.\n\n"
        "Question: What is an Artificial System?\n"
        "Answer: Artificial systems are systems created by humans to perform specific tasks, improve efficiency, and solve problems. They include systems like knowledge systems, engineering systems, and social systems.\n\n"
        "### Now answer this\n"
        f"Context:\n{context}\n\n"
        f"Question: {query}\n"
        "Answer:"
    )

    # max_new_tokens bounds only the continuation (max_length also counted
    # the prompt), and return_full_text=False drops the echoed prompt.
    result = qa_pipeline(
        prompt,
        max_new_tokens=128,
        do_sample=False,
        return_full_text=False,
    )[0]['generated_text']
    return result.strip()
142
 
 
 
143
 
144
- # ===============================
145
- # GRADIO INTERFACES
146
- # ===============================
147
# Tab 1: upload a document and add it to the index.
upload_interface = gr.Interface(
    title="Upload Document",
    description="Upload your Word or PDF document for question answering.",
    fn=upload_document,
    inputs=gr.File(file_types=[".pdf", ".docx"]),
    outputs="text",
)

# Tab 2: ask a question about the indexed content.
search_interface = gr.Interface(
    title="Ask the Document",
    description="Ask questions about the uploaded content. The chatbot will answer based on the document.",
    fn=generate_answer_from_file,
    inputs=gr.Textbox(placeholder="Ask your question about the uploaded document..."),
    outputs="text",
)

# Combine both interfaces into one tabbed app and start serving it.
app = gr.TabbedInterface(
    interface_list=[upload_interface, search_interface],
    tab_names=["Upload", "Ask"],
)
app.launch()
 
 
 
 
1
  import gradio as gr
2
+ from PyPDF2 import PdfReader
3
+ from transformers import pipeline
4
+
5
# Load QA pipeline (created once at module import and reused by answer_question)
qa_pipeline = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")
7
+
8
+ # Function to extract text from PDF
9
def extract_text_from_pdf(file):
    """Return the concatenated text of all pages of a PDF.

    Args:
        file: Raw PDF bytes (what the upload widget delivers when it is
            configured with ``type="binary"``), or anything ``PdfReader``
            accepts directly.

    Returns:
        The text of every page joined together; pages without extractable
        text contribute nothing.
    """
    import io  # local import so this block does not depend on file-level imports

    # Raw bytes are wrapped in a seekable in-memory stream before being
    # handed to PdfReader.
    if isinstance(file, (bytes, bytearray)):
        file = io.BytesIO(file)

    reader = PdfReader(file)
    return "".join(page.extract_text() or "" for page in reader.pages)
17
 
18
# Store context globally
document_context = {"text": ""}


# Function to set context from PDF or text
def set_context(pdf_file, text_input):
    """Store the document text that later questions are answered against.

    A PDF takes precedence over pasted text. The stored context is replaced
    only when usable text was obtained; otherwise the previous context is
    left untouched.

    Args:
        pdf_file: Uploaded PDF content, or a falsy value when none was given.
        text_input: Pasted text; may be empty or ``None``.

    Returns:
        A status message for the UI.
    """
    if pdf_file:
        try:
            extracted = extract_text_from_pdf(pdf_file)
        except Exception as e:
            # Report unreadable PDFs in the status box instead of raising.
            return f"Error reading PDF: {e}"
        if not extracted.strip():
            # e.g. image-only PDFs: do not claim success with empty context.
            return "No extractable text was found in the PDF."
        document_context["text"] = extracted
        return "PDF uploaded and processed successfully!"

    pasted = (text_input or "").strip()
    if pasted:
        document_context["text"] = pasted
        return "Text received and stored successfully!"
    return "Please upload a PDF or provide some text."
32
+
33
+ # Function to answer questions based on stored context
34
def answer_question(question):
    """Answer *question* from the globally stored document text.

    Returns a prompt to the user when no context or no question is present,
    the pipeline's ``"answer"`` field on success, or an error message if the
    pipeline call fails.
    """
    stored_text = document_context["text"]
    if not stored_text:
        return "Please upload a document or enter some text first."
    if question.strip() == "":
        return "Please enter a question."

    try:
        prediction = qa_pipeline(question=question, context=stored_text)
        return prediction["answer"]
    except Exception as err:
        return f"Error during QA: {err}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
 
46
# Gradio Interface
with gr.Blocks() as demo:
    gr.Markdown("# 📄 Ask Questions from a Document")
    gr.Markdown("Upload a PDF or paste some text, then ask questions about it!")

    with gr.Row():
        pdf_input = gr.File(label="Upload PDF (optional)", type="binary")
        text_input = gr.Textbox(label="Or paste text here", lines=8, placeholder="Paste your document text...")

    upload_btn = gr.Button("Submit Document")
    upload_output = gr.Textbox(label="Status", interactive=False)

    question_input = gr.Textbox(label="Ask a Question", placeholder="Type your question here...")
    ask_btn = gr.Button("Ask")
    answer_output = gr.Textbox(label="Answer", interactive=False)

    upload_btn.click(set_context, inputs=[pdf_input, text_input], outputs=upload_output)

    # Run the model only when the user commits the question (Enter or the
    # Ask button). The previous `change` listener fired on every keystroke
    # and ran inference on partial questions.
    question_input.submit(answer_question, inputs=question_input, outputs=answer_output)
    ask_btn.click(answer_question, inputs=question_input, outputs=answer_output)

demo.launch()
 
 
 
 
 
 
65