Update app.py

app.py CHANGED
@@ -1,188 +1,166 @@
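In summary: the update drops the hardcoded predefined-answer dictionary and NLTK sentence tokenization, replacing them with regex-based sentence chunking with overlap, a direct-definition lookup, embedding-based chunk retrieval, and an optional GPT-2 refinement pass; it also wires the question box to ask_question via question.submit.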
Old version (the lines the diff marks as removed; several spans are truncated in this view):

import gradio as gr
from PyPDF2 import PdfReader
import docx
import re
import torch
from sentence_transformers import SentenceTransformer, util
from transformers import pipeline
from nltk.tokenize import sent_tokenize
import nltk

# Download NLTK data (run once)
nltk.download('punkt')

# Load models
embedder = SentenceTransformer("all-MiniLM-L6-v2")
qa_pipeline = pipeline(
    ...,  # two argument lines truncated in the diff view
    device=0 if torch.cuda.is_available() else -1
)

# (comment truncated): dictionary of predefined answers; its name and first key are lost in the diff view
... = {
    ...: """
    - Composed of multiple components that work together
    - Has defined objectives/purpose
    - Operates within an environment
    - Components communicate with each other
    Example from document: A car is made up of an engine, wheels, brakes and other related items that work together.
    """,

    "types of a system": """
    The document clearly states there are two main types:
    1. Natural Systems - exist independently without human involvement (e.g., ecosystems)
    2. Artificial Systems - created by humans (e.g., computer systems, transportation systems)
    """,

    "what is an artificial system": """
    Artificial Systems are human-created systems designed for specific purposes. Key points:
    - Created by people to solve problems or perform tasks
    - Three main categories mentioned in the document:
      • Knowledge Systems (math, databases)
      • Engineering Systems (civil, mechanical)
      • Social Systems (governments, organizations)
    Example: A computer system processes data to perform tasks.
    """,

    "what is a natural system": """
    Natural Systems exist independently without human involvement. The document specifies:
    - Governed by natural laws and processes
    - Four subtypes:
      1. Physical (planets, atoms)
      2. Chemical (chemical reactions)
      3. Biological (living organisms)
      4. Psychological (human mind/behavior)
    Example: An ecosystem where species interact naturally.
    """,

    "components of a system": """
    The document describes system components as:
    1. Fundamental building blocks that work together
    2. Each component has a specific role
    3. Must communicate effectively
    4. Examples given:
       - In computers: CPU, memory, I/O devices
       - In cars: engine, wheels, brakes
    The exact components vary by system type.
    """
}

def extract_text(file):
    if file.name.endswith(".pdf"):
        text = ""
        for page in PdfReader(file).pages:
            text += page.extract_text() or ""
    elif file.name.endswith(".docx"):
        text = "\n".join([p.text for p in docx.Document(file).paragraphs])
    else:
        return ""

    text = re.sub(r'\s+', ' ', text)  # (comment truncated)
    text = re.sub(r'\[.*?\]', '', text)  # Remove [comments]
    return text.strip()

def chunk_text(text, chunk_size=500, overlap=...):  # overlap default truncated
    sentences = ...  # truncated; presumably sent_tokenize(text), given the nltk import
    chunks = []
    current_chunk = ""

    for ...:  # loop header truncated
        if len(current_chunk) + len(...):  # condition truncated
            current_chunk += ...
        else:
            chunks.append(current_chunk.strip())
            ...  # truncated

    if current_chunk:
        chunks.append(current_chunk.strip())
    return chunks

# Three helper definitions follow, but their bodies are truncated in the diff
# view. The call sites in ask_question below reference check_keywords(question)
# and get_relevant_chunks(text, question, embedder).
def ...:
    ...

def ...:  # body lost; its last visible line is:
    return " ".join(sentences).strip()

def ...:  # body lost; its last visible line is:
    return None

def ask_question(file, question, history):
    if not file:
        return "Please upload a file.", history

    # Check for predefined answers first
    predefined_answer = check_keywords(question)
    if predefined_answer:
        history.append((question, predefined_answer))
        return "", history

    text = extract_text(file)
    if not text:
        return "Could not extract text from the ..."  # message truncated

    chunks = get_relevant_chunks(text, question, embedder)
    if not chunks:
        return "No ..."  # message truncated

    ...  # the QA logic here is truncated in the diff view

    answer = "Sorry, I couldn't find a clear answer in the document."

    history.append((question, answer))
    return "", history

with gr.Blocks() as demo:
    gr.Markdown("## Enhanced Document QA ...")  # title truncated
    with gr.Row():
        file_input = gr.File(label="Upload PDF or ...")  # label truncated
    with gr.Row():
        chatbot = gr.Chatbot(height=400)
    with gr.Row():
        question = gr.Textbox(label="Ask your question", placeholder="Type your question here...")
    with gr.Row():
        gr.Button("👍"), gr.Button("👎")

    state = gr.State([])
    ...  # event wiring truncated in the diff view

demo.launch()
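The bodies of the truncated helpers above are not recoverable from the diff, but the call sites in ask_question pin down their names and signatures. A minimal sketch of what they plausibly looked like; only the names check_keywords and get_relevant_chunks are attested, and the bodies and the dictionary name PREDEFINED_ANSWERS are assumptions, not the author's code:

from sentence_transformers import util

# Hypothetical reconstruction; PREDEFINED_ANSWERS stands in for the truncated
# dictionary above, with placeholder values.
PREDEFINED_ANSWERS = {
    "what is a system": "A system is composed of multiple components ...",
    "types of a system": "There are two main types: natural and artificial.",
}

def check_keywords(question):
    """Return a canned answer when the question contains a known key."""
    q = question.lower().strip(" ?")
    for key, value in PREDEFINED_ANSWERS.items():
        if key in q:
            return value
    return None  # mirrors the 'return None' visible in the old listing

def get_relevant_chunks(text, question, embedder, top_k=3):
    """Rank document chunks by cosine similarity to the question."""
    chunks = chunk_text(text)  # chunk_text and embedder as defined in the old file
    if not chunks:
        return []
    emb_chunks = embedder.encode(chunks, convert_to_tensor=True)
    emb_question = embedder.encode(question, convert_to_tensor=True)
    scores = util.pytorch_cos_sim(emb_question, emb_chunks)[0]
    top = scores.topk(min(top_k, len(chunks)))
    return [chunks[i] for i in top.indices.tolist()]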
New version:

import gradio as gr
from PyPDF2 import PdfReader
import docx
from sentence_transformers import SentenceTransformer, util
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import re
import torch

# Load models
embedder = SentenceTransformer("all-MiniLM-L6-v2")
qa_pipeline = pipeline("question-answering",
                       model="distilbert-base-cased-distilled-squad",
                       device=0 if torch.cuda.is_available() else -1)

# Load GPT model (using GPT-2 as example - replace with GPT-3/4 if available)
gpt_tokenizer = AutoTokenizer.from_pretrained("gpt2")
gpt_model = AutoModelForCausalLM.from_pretrained("gpt2")
gpt_model.eval()

def extract_text(file):
    if file.name.endswith(".pdf"):
        text = "\n".join([page.extract_text() or "" for page in PdfReader(file).pages])
    elif file.name.endswith(".docx"):
        text = "\n".join([p.text for p in docx.Document(file).paragraphs])
    else:
        return ""
    # Clean up text
    text = re.sub(r'\s+', ' ', text)  # Replace multiple whitespace with single space
    return text.strip()

def chunk_text(text, chunk_size=500, overlap=100):
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
    chunks = []
    current_chunk = ""

    for sent in sentences:
        if len(current_chunk) + len(sent) < chunk_size:
            current_chunk += sent + " "
        else:
            chunks.append(current_chunk.strip())
            # Keep some overlap between chunks for context
            current_chunk = current_chunk[-overlap:] + sent + " "

    if current_chunk:
        chunks.append(current_chunk.strip())
    return chunks

def generate_with_gpt(prompt, max_new_tokens=150):
    inputs = gpt_tokenizer(prompt, return_tensors="pt")
    with torch.no_grad():
        outputs = gpt_model.generate(
            inputs.input_ids,
            max_new_tokens=max_new_tokens,  # bound the continuation length regardless of prompt length
            num_return_sequences=1,
            no_repeat_ngram_size=2,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            temperature=0.7,
            pad_token_id=gpt_tokenizer.eos_token_id  # GPT-2 has no pad token
        )
    # Decode only the newly generated tokens so the prompt is not echoed back
    return gpt_tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)

def refine_answer_with_gpt(context, question, initial_answer):
    prompt = f"""
Based on the following context, refine the answer to make it more clear and complete:

Context: {context}

Question: {question}

Initial Answer: {initial_answer}

Improved Answer:
"""
    return generate_with_gpt(prompt)

def extract_direct_definition(text, term):
    """Try to find a sentence that directly defines the term"""
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
    term = term.lower()

    candidates = []
    for sent in sentences:
        lower_sent = sent.lower()
        if term in lower_sent:
            if (" is " in lower_sent or " are " in lower_sent or
                    " refers to " in lower_sent or " defined as " in lower_sent):
                candidates.append(sent)

    if candidates:
        return candidates[0]
    return None

def ask_question(file, question, history):
    if not file:
        return "Please upload a file.", history

    text = extract_text(file)
    if not text:
        return "Could not extract text from the file.", history

    chunks = chunk_text(text)
    if not chunks:
        return "No meaningful text chunks could be created.", history

    # Normalize question for better matching
    normalized_question = question.lower().strip(" ?")
    answer = None  # ensure answer is defined even when no branch below sets it

    try:
        # First try to find direct definitions
        if "artificial system" in normalized_question:
            answer = extract_direct_definition(text, "artificial system")
        elif "natural system" in normalized_question:
            answer = extract_direct_definition(text, "natural system")
        elif "component" in normalized_question:
            answer = extract_direct_definition(text, "component")

        # If no direct definition found, use semantic search
        if not answer:
            emb_chunks = embedder.encode(chunks, convert_to_tensor=True)
            emb_question = embedder.encode(question, convert_to_tensor=True)
            scores = util.pytorch_cos_sim(emb_question, emb_chunks)[0]
            best_idx = scores.argmax().item()
            best_chunk = chunks[best_idx]

            # Combine top chunks if confidence is low
            if scores[best_idx] < 0.3:
                top_k = min(3, len(chunks))
                best_indices = scores.topk(top_k).indices.tolist()
                best_chunk = " ".join([chunks[i] for i in best_indices])

            # Get initial answer from QA model
            result = qa_pipeline(question=question, context=best_chunk)
            answer = result["answer"] if result["score"] > 0.1 else None

            # Refine answer with GPT if available
            if answer and len(answer.split()) > 2:
                answer = refine_answer_with_gpt(best_chunk, question, answer)

        # Final fallback
        if not answer:
            answer = "Sorry, I couldn't find a clear answer in the document."

    except Exception as e:
        answer = f"An error occurred: {str(e)}"

    history.append((question, answer))
    return "", history

with gr.Blocks() as demo:
    gr.Markdown("## Enhanced Document QA with GPT Integration")
    with gr.Row():
        file_input = gr.File(label="Upload PDF or Word", file_types=[".pdf", ".docx"])
    with gr.Row():
        chatbot = gr.Chatbot(height=400)
    with gr.Row():
        question = gr.Textbox(label="Ask your question", placeholder="Type your question here...")

    state = gr.State([])

    question.submit(
        ask_question,
        [file_input, question, state],
        [question, chatbot]
    )

demo.launch()
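For a quick sanity check of the new pipeline without the UI, one can drive ask_question directly. A minimal sketch, assuming a local sample.pdf (placeholder name) and that demo.launch() is guarded or commented out so the module's functions can be used on import:

# Smoke test for the QA path; "sample.pdf" and the question are placeholders.
if __name__ == "__main__":
    with open("sample.pdf", "rb") as f:  # open file objects expose .name, which extract_text checks
        _, history = ask_question(f, "What is a natural system?", [])
    print(history[-1][1])  # the answer appended for the question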