Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -1,128 +1,188 @@
|
|
1 |
import gradio as gr
|
2 |
-
import torch
|
3 |
from PyPDF2 import PdfReader
|
4 |
import docx
|
|
|
|
|
5 |
from sentence_transformers import SentenceTransformer, util
|
6 |
from transformers import pipeline
|
7 |
-
import
|
|
|
|
|
|
|
|
|
8 |
|
9 |
# Load models
|
10 |
embedder = SentenceTransformer("all-MiniLM-L6-v2")
|
11 |
-
qa_pipeline = pipeline(
|
12 |
-
|
13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
|
15 |
def extract_text(file):
|
16 |
if file.name.endswith(".pdf"):
|
17 |
-
text = "
|
|
|
|
|
18 |
elif file.name.endswith(".docx"):
|
19 |
text = "\n".join([p.text for p in docx.Document(file).paragraphs])
|
20 |
else:
|
21 |
return ""
|
22 |
-
|
23 |
-
text = re.sub(r'\s+', ' ', text) #
|
|
|
24 |
return text.strip()
|
25 |
|
26 |
-
def chunk_text(text, chunk_size=500, overlap=
|
27 |
-
sentences =
|
28 |
chunks = []
|
29 |
current_chunk = ""
|
30 |
|
31 |
-
for
|
32 |
-
if len(current_chunk) + len(
|
33 |
-
current_chunk +=
|
34 |
else:
|
35 |
chunks.append(current_chunk.strip())
|
36 |
-
|
37 |
-
current_chunk = current_chunk[-overlap:] + sent + " "
|
38 |
|
39 |
if current_chunk:
|
40 |
chunks.append(current_chunk.strip())
|
41 |
return chunks
|
42 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
43 |
def ask_question(file, question, history):
|
44 |
if not file:
|
45 |
return "Please upload a file.", history
|
46 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
47 |
text = extract_text(file)
|
48 |
if not text:
|
49 |
-
return "Could not extract text from the
|
50 |
|
51 |
-
|
|
|
52 |
if not chunks:
|
53 |
-
return "No
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
result = qa_pipeline(question=question, context=best_chunk)
|
73 |
-
|
74 |
-
# More sophisticated answer validation
|
75 |
-
answer = result["answer"]
|
76 |
-
if result["score"] < 0.1 or len(answer.split()) < 2: # Require longer answers
|
77 |
-
# Try alternative approach - look for direct matches
|
78 |
-
if "artificial system" in normalized_question:
|
79 |
-
answer = extract_direct_definition(text, "artificial system")
|
80 |
-
elif "natural system" in normalized_question:
|
81 |
-
answer = extract_direct_definition(text, "natural system")
|
82 |
-
elif "component" in normalized_question:
|
83 |
-
answer = extract_direct_definition(text, "component")
|
84 |
-
else:
|
85 |
-
answer = "Sorry, I couldn't find a clear answer in the document."
|
86 |
-
|
87 |
-
except Exception as e:
|
88 |
-
answer = f"An error occurred: {str(e)}"
|
89 |
|
90 |
history.append((question, answer))
|
91 |
return "", history
|
92 |
|
93 |
-
def extract_direct_definition(text, term):
|
94 |
-
"""Try to find a sentence that directly defines the term"""
|
95 |
-
sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
|
96 |
-
term = term.lower()
|
97 |
-
|
98 |
-
# Look for sentences that contain the term and seem like definitions
|
99 |
-
candidates = []
|
100 |
-
for sent in sentences:
|
101 |
-
lower_sent = sent.lower()
|
102 |
-
if term in lower_sent:
|
103 |
-
# Look for patterns like "X is Y" or "X refers to Y"
|
104 |
-
if (" is " in lower_sent or " are " in lower_sent or
|
105 |
-
" refers to " in lower_sent or " defined as " in lower_sent):
|
106 |
-
candidates.append(sent)
|
107 |
-
|
108 |
-
if candidates:
|
109 |
-
return candidates[0] # Return first definition found
|
110 |
-
return f"Information about {term} not found in the document."
|
111 |
-
|
112 |
with gr.Blocks() as demo:
|
113 |
-
gr.Markdown("## Enhanced Document QA
|
114 |
with gr.Row():
|
115 |
-
file_input = gr.File(label="Upload PDF or
|
116 |
with gr.Row():
|
117 |
chatbot = gr.Chatbot(height=400)
|
118 |
with gr.Row():
|
119 |
question = gr.Textbox(label="Ask your question", placeholder="Type your question here...")
|
120 |
-
|
|
|
121 |
|
122 |
-
|
123 |
-
|
124 |
-
[file_input, question, state],
|
125 |
-
[question, chatbot]
|
126 |
-
)
|
127 |
|
128 |
demo.launch()
|
|
|
1 |
import gradio as gr
|
|
|
2 |
from PyPDF2 import PdfReader
|
3 |
import docx
|
4 |
+
import re
|
5 |
+
import torch
|
6 |
from sentence_transformers import SentenceTransformer, util
|
7 |
from transformers import pipeline
|
8 |
+
from nltk.tokenize import sent_tokenize
|
9 |
+
import nltk
|
10 |
+
|
11 |
+
# Tokenizer data used by sent_tokenize. "punkt" is the resource this app has
# always fetched; "punkt_tab" is fetched as well because newer NLTK releases
# appear to load that resource instead (assumption - confirm against the NLTK
# version pinned for this Space). Downloads are skipped when already cached.
nltk.download('punkt')
nltk.download('punkt_tab')

# Load models once at import time so every request reuses them.
# embedder: sentence embeddings used to rank document chunks against a question.
embedder = SentenceTransformer("all-MiniLM-L6-v2")
# qa_pipeline: extractive question answering; runs on GPU 0 when CUDA is
# available, otherwise on CPU (device=-1).
qa_pipeline = pipeline(
    "question-answering",
    model="deepset/roberta-base-squad2",
    device=0 if torch.cuda.is_available() else -1,
)
|
21 |
+
|
22 |
+
# Canned replies keyed by a lowercase phrase. check_keywords() returns the
# first entry whose key occurs as a substring of the lowercased question, so
# insertion order decides which reply wins when several keys match.
KEYWORD_RESPONSES = {
    "what is a system": """
A system is a collection of interrelated components designed to perform specific functions. Key characteristics:
- Composed of multiple components that work together
- Has defined objectives/purpose
- Operates within an environment
- Components communicate with each other
Example from document: A car is made up of an engine, wheels, brakes and other related items that work together.
""",

    "types of a system": """
The document clearly states there are two main types:
1. Natural Systems - exist independently without human involvement (e.g., ecosystems)
2. Artificial Systems - created by humans (e.g., computer systems, transportation systems)
""",

    "what is an artificial system": """
Artificial Systems are human-created systems designed for specific purposes. Key points:
- Created by people to solve problems or perform tasks
- Three main categories mentioned in the document:
• Knowledge Systems (math, databases)
• Engineering Systems (civil, mechanical)
• Social Systems (governments, organizations)
Example: A computer system processes data to perform tasks.
""",

    "what is a natural system": """
Natural Systems exist independently without human involvement. The document specifies:
- Governed by natural laws and processes
- Four subtypes:
1. Physical (planets, atoms)
2. Chemical (chemical reactions)
3. Biological (living organisms)
4. Psychological (human mind/behavior)
Example: An ecosystem where species interact naturally.
""",

    "components of a system": """
The document describes system components as:
1. Fundamental building blocks that work together
2. Each component has a specific role
3. Must communicate effectively
4. Examples given:
- In computers: CPU, memory, I/O devices
- In cars: engine, wheels, brakes
The exact components vary by system type.
""",
}
|
71 |
|
72 |
def extract_text(file):
    """Return the cleaned plain text of an uploaded PDF or DOCX file.

    Args:
        file: The uploaded file. Either an object exposing a ``name``
            attribute (the file path) or a plain path string; it is passed
            unchanged to ``PdfReader`` / ``docx.Document``.

    Returns:
        The document text with whitespace runs collapsed to single spaces and
        ``[...]`` spans removed, or ``""`` when the extension is neither
        ``.pdf`` nor ``.docx``.
    """
    # Accept both upload wrappers (with .name) and bare path strings, and
    # compare the extension case-insensitively so "REPORT.PDF" is recognised.
    filename = str(getattr(file, "name", file)).lower()

    if filename.endswith(".pdf"):
        # extract_text() may yield nothing for a page; treat that as "".
        text = "".join(page.extract_text() or "" for page in PdfReader(file).pages)
    elif filename.endswith(".docx"):
        text = "\n".join(p.text for p in docx.Document(file).paragraphs)
    else:
        return ""

    text = re.sub(r'\s+', ' ', text)     # Normalize whitespace
    text = re.sub(r'\[.*?\]', '', text)  # Remove [comments]
    # Dropping a bracketed span can leave two adjacent spaces behind.
    text = re.sub(r' {2,}', ' ', text)
    return text.strip()
|
85 |
|
86 |
+
def chunk_text(text, chunk_size=500, overlap=50):
    """Split text into sentence-aligned chunks of roughly ``chunk_size`` characters.

    Sentences (from ``sent_tokenize``) are appended to the current chunk while
    the running length stays below ``chunk_size``. When a sentence does not
    fit, the current chunk is emitted and the next one starts with the last
    ``overlap`` characters of the previous chunk followed by that sentence.
    A single sentence longer than ``chunk_size`` is kept whole.

    Args:
        text: The text to split.
        chunk_size: Target upper bound on chunk length, in characters.
        overlap: Number of trailing characters carried into the next chunk;
            ``0`` disables the carry-over.

    Returns:
        A list of non-empty, stripped chunk strings (empty for empty text).
    """
    chunks = []
    current_chunk = ""

    for sentence in sent_tokenize(text):
        if len(current_chunk) + len(sentence) < chunk_size:
            current_chunk += sentence + " "
            continue

        # Emit only real content: when the very first sentence is already too
        # long, current_chunk is still empty and must not become a chunk.
        flushed = current_chunk.strip()
        if flushed:
            chunks.append(flushed)

        # current_chunk[-0:] would be the whole string, so handle overlap <= 0
        # explicitly instead of slicing.
        carry = current_chunk[-overlap:] if overlap > 0 else ""
        current_chunk = carry + sentence + " "

    remainder = current_chunk.strip()
    if remainder:
        chunks.append(remainder)
    return chunks
|
101 |
|
102 |
+
def get_relevant_chunks(text, question, embedder, top_k=3):
    """Return the chunks of ``text`` most similar to ``question``.

    The text is split with ``chunk_text``; chunks and question are embedded
    with ``embedder.encode`` and ranked by cosine similarity.

    Args:
        text: Full document text.
        question: The user's question.
        embedder: Object whose ``encode(..., convert_to_tensor=True)`` returns
            embeddings accepted by ``util.pytorch_cos_sim``.
        top_k: Maximum number of chunks to return.

    Returns:
        Up to ``top_k`` chunk strings, best match first; ``[]`` when the text
        yields no chunks or ``top_k`` is less than 1.
    """
    chunks = chunk_text(text)
    # topk() raises when asked for more entries than exist, so short documents
    # (fewer chunks than top_k) must have the request clamped.
    k = min(top_k, len(chunks))
    if k < 1:
        return []

    emb_chunks = embedder.encode(chunks, convert_to_tensor=True)
    emb_question = embedder.encode(question, convert_to_tensor=True)
    # Row 0 holds the similarity of the single question against every chunk.
    scores = util.pytorch_cos_sim(emb_question, emb_chunks)[0]
    top_indices = scores.topk(k).indices.tolist()
    return [chunks[i] for i in top_indices]
|
111 |
+
|
112 |
+
def extract_direct_definition(text, keyword):
    """Return the first sentence of ``text`` that looks like a definition of ``keyword``.

    A sentence qualifies when it contains ``keyword`` and one of the cue
    phrases " is ", " are " or " defined as ". Both checks are
    case-insensitive, so upper-case sentences such as "A SYSTEM IS ..." match.

    Args:
        text: Document text to search.
        keyword: Term to look for (substring match).

    Returns:
        The matching sentence as it appears in the text, or ``None`` when no
        sentence qualifies.
    """
    needle = keyword.lower()
    cues = (" is ", " are ", " defined as ")

    for sentence in sent_tokenize(text):
        lowered = sentence.lower()
        if needle in lowered and any(cue in lowered for cue in cues):
            return sentence
    return None
|
121 |
+
|
122 |
+
def clean_answer(answer):
    """Strip numeric citation markers and repeated sentences from ``answer``.

    Markers of the form ``[12]`` are deleted, the result is split into
    sentences, and each distinct sentence is kept once in first-seen order.

    Returns:
        The remaining sentences joined by single spaces, stripped.
    """
    without_citations = re.sub(r'\[\d+\]', '', answer)

    kept = []
    seen = set()
    for sentence in sent_tokenize(without_citations):
        if sentence in seen:
            continue
        seen.add(sentence)
        kept.append(sentence)

    return " ".join(kept).strip()
|
126 |
+
|
127 |
+
def check_keywords(question):
    """Look up a canned reply for ``question``.

    Returns the value of the first ``KEYWORD_RESPONSES`` entry whose key is a
    substring of the lowercased question, or ``None`` when no key matches.
    """
    lowered = question.lower()
    matches = (
        reply
        for phrase, reply in KEYWORD_RESPONSES.items()
        if phrase in lowered
    )
    return next(matches, None)
|
133 |
+
|
134 |
def ask_question(file, question, history):
    """Answer ``question`` about the uploaded ``file`` and record it in ``history``.

    Resolution order: a canned reply from ``check_keywords``; otherwise the QA
    pipeline run over the most relevant chunks; otherwise, when the pipeline
    score is 0.2 or lower, the first definition-like sentence for a known
    keyword found in the question.

    Args:
        file: Uploaded file (falsy when nothing was uploaded).
        question: The user's question text.
        history: List of ``(question, answer)`` pairs; ``None`` is treated as
            an empty list. Appended to in place.

    Returns:
        A ``(textbox_value, history)`` tuple. ``textbox_value`` is ``""`` after
        an answer was appended (or when the question is blank); otherwise it
        carries a status message and ``history`` is returned unchanged.
    """
    if history is None:
        history = []

    if not file:
        return "Please upload a file.", history

    # A blank question has nothing to answer; do not send it to the models.
    if not question or not question.strip():
        return "", history

    # Check for predefined answers first.
    # NOTE(review): these replies are returned before the file is read, so
    # their "the document states" wording is not verified against the upload.
    predefined_answer = check_keywords(question)
    if predefined_answer:
        history.append((question, predefined_answer))
        return "", history

    text = extract_text(file)
    if not text:
        return "Could not extract text from the document.", history

    # Get relevant context
    chunks = get_relevant_chunks(text, question, embedder)
    if not chunks:
        return "No relevant information found in document.", history

    context = " ".join(chunks)

    # Try QA pipeline
    result = qa_pipeline(question=question, context=context)

    answer = None
    if result["score"] > 0.2:  # Confidence threshold
        answer = clean_answer(result["answer"])
    else:
        # Fallback to keyword extraction: take the first keyword present in
        # the question that yields a definition-like sentence.
        lowered_question = question.lower()
        for keyword in ("system", "natural", "artificial", "component", "objective"):
            if keyword in lowered_question:
                answer = extract_direct_definition(text, keyword)
                if answer:
                    break

    # Covers both a failed fallback and an answer that cleaned down to "".
    if not answer:
        answer = "Sorry, I couldn't find a clear answer in the document."

    history.append((question, answer))
    return "", history
|
173 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
174 |
# Build the UI: file upload, chat transcript, question box and feedback buttons.
with gr.Blocks() as demo:
    gr.Markdown("## Enhanced Document QA System")
    with gr.Row():
        file_input = gr.File(label="Upload PDF or DOCX", file_types=[".pdf", ".docx"])
    with gr.Row():
        chatbot = gr.Chatbot(height=400)
    with gr.Row():
        question = gr.Textbox(label="Ask your question", placeholder="Type your question here...")
    with gr.Row():
        # NOTE(review): the feedback buttons have no click handlers attached.
        gr.Button("👍")
        gr.Button("👎")

    # The chatbot is both the history input and the history output, so each
    # submit extends the transcript currently shown. A separate gr.State that
    # is never written back would hand ask_question the same initial list
    # object every time instead of the displayed conversation.
    question.submit(ask_question, [file_input, question, chatbot], [question, chatbot])

demo.launch()
|