NaimaAqeel committed on
Commit
f06cc93
·
verified ·
1 Parent(s): 32d52a8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +141 -81
app.py CHANGED
@@ -1,128 +1,188 @@
1
  import gradio as gr
2
- import torch
3
  from PyPDF2 import PdfReader
4
  import docx
 
 
5
  from sentence_transformers import SentenceTransformer, util
6
  from transformers import pipeline
7
- import re
 
 
 
 
8
 
# Load both models once at import time so every request reuses them.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Device index handed to the pipeline: 0 when CUDA is available, else -1.
_qa_device = 0 if torch.cuda.is_available() else -1
qa_pipeline = pipeline(
    "question-answering",
    model="distilbert-base-cased-distilled-squad",
    device=_qa_device,
)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
def extract_text(file):
    """Return the text of an uploaded PDF or DOCX as one normalised string.

    Args:
        file: Upload object exposing a ``name`` attribute; the object itself
            is handed to ``PdfReader`` / ``docx.Document``.

    Returns:
        The extracted text with every whitespace run collapsed to a single
        space and the ends stripped, or ``""`` when the file name ends in
        neither ``.pdf`` nor ``.docx``.
    """
    # Compare case-insensitively so "REPORT.PDF" is accepted as well.
    name = file.name.lower()
    if name.endswith(".pdf"):
        # extract_text() may yield nothing for a page; treat that as "".
        parts = [page.extract_text() or "" for page in PdfReader(file).pages]
    elif name.endswith(".docx"):
        parts = [p.text for p in docx.Document(file).paragraphs]
    else:
        return ""

    # Replace multiple whitespace characters with a single space.
    return re.sub(r"\s+", " ", "\n".join(parts)).strip()
25
 
def chunk_text(text, chunk_size=500, overlap=100):
    """Split ``text`` into sentence-aligned chunks of roughly ``chunk_size`` chars.

    Sentences are accumulated until adding the next one would reach
    ``chunk_size``; the chunk is then emitted and the next chunk is seeded
    with the last ``overlap`` characters of the previous one for context.

    Args:
        text: Text to split.
        chunk_size: Soft upper bound on chunk length in characters. A single
            sentence longer than this still becomes its own chunk.
        overlap: Number of trailing characters carried into the next chunk.
            Values <= 0 disable the overlap.

    Returns:
        List of non-empty, stripped chunks; ``[]`` for empty/blank text.
    """
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
    chunks = []
    current_chunk = ""

    for sent in sentences:
        if not sent.strip():
            continue  # splitting "" or trailing whitespace yields blank pieces
        # Only flush a chunk that actually holds text; otherwise an oversized
        # first sentence would emit an empty chunk.
        if current_chunk and len(current_chunk) + len(sent) >= chunk_size:
            chunks.append(current_chunk.strip())
            # s[-0:] is the whole string, so overlap=0 needs an explicit branch.
            current_chunk = current_chunk[-overlap:] if overlap > 0 else ""
        current_chunk += sent + " "

    if current_chunk.strip():
        chunks.append(current_chunk.strip())
    return chunks
42
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
def ask_question(file, question, history):
    """Answer ``question`` from the uploaded document and log it in the chat.

    The chunk most similar to the question (cosine similarity of sentence
    embeddings) is passed to the extractive QA pipeline. When the best
    similarity is below 0.3 the top three chunks are combined instead. A
    low-scoring or one-word answer falls back to a definition lookup for a
    few known terms.

    Args:
        file: Uploaded file object, or a falsy value when nothing is uploaded.
        question: The user's question.
        history: List of ``(question, answer)`` tuples; appended to in place.
            ``None`` is treated as an empty history.

    Returns:
        ``("", history)``. The first element is the new textbox value, so
        status messages are appended to the chat history rather than being
        returned in that slot (where they would overwrite the user's input).
    """
    history = history if history is not None else []

    def reply(message):
        # Every outcome, including status messages, becomes a chat entry.
        history.append((question, message))
        return "", history

    if not (question or "").strip():
        return "", history  # nothing was asked; leave the chat untouched
    if not file:
        return reply("Please upload a file.")

    text = extract_text(file)
    if not text:
        return reply("Could not extract text from the file.")

    chunks = chunk_text(text)
    if not chunks:
        return reply("No meaningful text chunks could be created.")

    # Normalize question for better matching in the fallback below.
    normalized_question = question.lower().strip(" ?")

    try:
        emb_chunks = embedder.encode(chunks, convert_to_tensor=True)
        emb_question = embedder.encode(question, convert_to_tensor=True)
        scores = util.pytorch_cos_sim(emb_question, emb_chunks)[0]
        best_idx = scores.argmax().item()
        best_chunk = chunks[best_idx]

        # Weak best match: combine the top 3 chunks for more context.
        if scores[best_idx] < 0.3:
            top_k = min(3, len(chunks))
            best_indices = scores.topk(top_k).indices.tolist()
            best_chunk = " ".join([chunks[i] for i in best_indices])

        result = qa_pipeline(question=question, context=best_chunk)

        answer = result["answer"]
        # Reject low-confidence or single-word answers.
        if result["score"] < 0.1 or len(answer.split()) < 2:
            if "artificial system" in normalized_question:
                answer = extract_direct_definition(text, "artificial system")
            elif "natural system" in normalized_question:
                answer = extract_direct_definition(text, "natural system")
            elif "component" in normalized_question:
                answer = extract_direct_definition(text, "component")
            else:
                answer = "Sorry, I couldn't find a clear answer in the document."
    except Exception as e:
        # UI handler boundary: report the failure in the chat.
        answer = f"An error occurred: {str(e)}"

    return reply(answer)
92
 
def extract_direct_definition(text, term):
    """Return the first sentence of ``text`` that looks like a definition of ``term``.

    A sentence qualifies when it contains ``term`` (case-insensitively) and
    one of the cue phrases " is ", " are ", " refers to " or " defined as ".

    Args:
        text: Document text to search.
        term: Term to look for.

    Returns:
        The first matching sentence, or a "not found" message naming the
        lower-cased term when no sentence qualifies.
    """
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
    term = term.lower()
    cues = (" is ", " are ", " refers to ", " defined as ")

    for sent in sentences:
        lower_sent = sent.lower()
        # Only the first hit is used, so return as soon as one is found.
        if term in lower_sent and any(cue in lower_sent for cue in cues):
            return sent
    return f"Information about {term} not found in the document."
111
-
# Build the single-page UI: upload widget, chat log and question box.
with gr.Blocks() as demo:
    gr.Markdown("## Enhanced Document QA with Smart Retrieval")

    with gr.Row():
        file_input = gr.File(
            label="Upload PDF or Word",
            file_types=[".pdf", ".docx"],
        )

    with gr.Row():
        chatbot = gr.Chatbot(height=400)

    with gr.Row():
        question = gr.Textbox(
            label="Ask your question",
            placeholder="Type your question here...",
        )

    # Conversation history handed to ask_question on every submit.
    state = gr.State([])

    # Pressing Enter runs ask_question; its two return values populate the
    # textbox and the chat log respectively.
    question.submit(
        fn=ask_question,
        inputs=[file_input, question, state],
        outputs=[question, chatbot],
    )

demo.launch()
 
1
  import gradio as gr
 
2
  from PyPDF2 import PdfReader
3
  import docx
4
+ import re
5
+ import torch
6
  from sentence_transformers import SentenceTransformer, util
7
  from transformers import pipeline
8
+ from nltk.tokenize import sent_tokenize
9
+ import nltk
10
+
11
+ # Download NLTK data (run once)
12
+ nltk.download('punkt')
13
 
# Load both models once at import time so every request reuses them.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Device index handed to the pipeline: 0 when CUDA is available, else -1.
_qa_device = 0 if torch.cuda.is_available() else -1
qa_pipeline = pipeline(
    "question-answering",
    model="deepset/roberta-base-squad2",
    device=_qa_device,
)

# Predefined answers for key questions.
# Keys are lower-case phrases looked up as substrings of the question.
# NOTE(review): these answers are fixed text and are not derived from the
# uploaded file, even where they say "the document".
KEYWORD_RESPONSES = {
    "what is a system": """
A system is a collection of interrelated components designed to perform specific functions. Key characteristics:
- Composed of multiple components that work together
- Has defined objectives/purpose
- Operates within an environment
- Components communicate with each other
Example from document: A car is made up of an engine, wheels, brakes and other related items that work together.
""",

    "types of a system": """
The document clearly states there are two main types:
1. Natural Systems - exist independently without human involvement (e.g., ecosystems)
2. Artificial Systems - created by humans (e.g., computer systems, transportation systems)
""",

    "what is an artificial system": """
Artificial Systems are human-created systems designed for specific purposes. Key points:
- Created by people to solve problems or perform tasks
- Three main categories mentioned in the document:
• Knowledge Systems (math, databases)
• Engineering Systems (civil, mechanical)
• Social Systems (governments, organizations)
Example: A computer system processes data to perform tasks.
""",

    "what is a natural system": """
Natural Systems exist independently without human involvement. The document specifies:
- Governed by natural laws and processes
- Four subtypes:
1. Physical (planets, atoms)
2. Chemical (chemical reactions)
3. Biological (living organisms)
4. Psychological (human mind/behavior)
Example: An ecosystem where species interact naturally.
""",

    "components of a system": """
The document describes system components as:
1. Fundamental building blocks that work together
2. Each component has a specific role
3. Must communicate effectively
4. Examples given:
- In computers: CPU, memory, I/O devices
- In cars: engine, wheels, brakes
The exact components vary by system type.
""",
}
71
 
def extract_text(file):
    """Return the text of an uploaded PDF or DOCX as one cleaned-up string.

    Args:
        file: Upload object exposing a ``name`` attribute; the object itself
            is handed to ``PdfReader`` / ``docx.Document``.

    Returns:
        The extracted text with ``[...]`` spans removed, whitespace runs
        collapsed to single spaces and the ends stripped; ``""`` when the
        file name ends in neither ``.pdf`` nor ``.docx``.
    """
    # Compare case-insensitively so "REPORT.PDF" is accepted as well.
    name = file.name.lower()
    if name.endswith(".pdf"):
        # extract_text() may yield nothing for a page; treat that as "".
        parts = [page.extract_text() or "" for page in PdfReader(file).pages]
    elif name.endswith(".docx"):
        parts = [p.text for p in docx.Document(file).paragraphs]
    else:
        return ""

    # Join with a separator so the last word of one page/paragraph does not
    # fuse with the first word of the next.
    text = "\n".join(parts)
    text = re.sub(r'\s+', ' ', text)      # Normalize whitespace
    text = re.sub(r'\[.*?\]', '', text)   # Remove [comments]
    text = re.sub(r'\s+', ' ', text)      # Removal above can leave double spaces
    return text.strip()
85
 
def chunk_text(text, chunk_size=500, overlap=50):
    """Split ``text`` into sentence-aligned chunks of roughly ``chunk_size`` chars.

    Sentences are accumulated until adding the next one would reach
    ``chunk_size``; the chunk is then emitted and the next chunk is seeded
    with the last ``overlap`` characters of the previous one for context.

    Args:
        text: Text to split.
        chunk_size: Soft upper bound on chunk length in characters. A single
            sentence longer than this still becomes its own chunk.
        overlap: Number of trailing characters carried into the next chunk.
            Values <= 0 disable the overlap.

    Returns:
        List of non-empty, stripped chunks; ``[]`` for empty/blank text.
    """
    try:
        # Imported here so a missing NLTK install or missing punkt data
        # (LookupError) degrades to a simple punctuation split.
        from nltk.tokenize import sent_tokenize
        sentences = sent_tokenize(text)
    except (ImportError, LookupError):
        sentences = re.split(r'(?<=[.!?])\s+', text)

    chunks = []
    current_chunk = ""

    for sentence in sentences:
        if not sentence.strip():
            continue
        # Only flush a chunk that actually holds text; otherwise an oversized
        # first sentence would emit an empty chunk.
        if current_chunk and len(current_chunk) + len(sentence) >= chunk_size:
            chunks.append(current_chunk.strip())
            # s[-0:] is the whole string, so overlap=0 needs an explicit branch.
            current_chunk = current_chunk[-overlap:] if overlap > 0 else ""
        current_chunk += sentence + " "

    if current_chunk.strip():
        chunks.append(current_chunk.strip())
    return chunks
101
 
def get_relevant_chunks(text, question, embedder, top_k=3):
    """Return the chunks of ``text`` most similar to ``question``.

    Args:
        text: Document text; split with ``chunk_text``.
        question: Query string to embed and compare against each chunk.
        embedder: Object whose ``encode(..., convert_to_tensor=True)`` returns
            embeddings usable with ``util.pytorch_cos_sim``.
        top_k: Maximum number of chunks to return.

    Returns:
        Up to ``top_k`` chunks ordered from most to least similar; ``[]`` when
        the text yields no chunks or ``top_k`` is not positive.
    """
    chunks = chunk_text(text)
    # topk() raises when k exceeds the number of scores, which happens for
    # any document shorter than top_k chunks, so clamp k first.
    k = min(top_k, len(chunks))
    if k <= 0:
        return []

    emb_chunks = embedder.encode(chunks, convert_to_tensor=True)
    emb_question = embedder.encode(question, convert_to_tensor=True)
    scores = util.pytorch_cos_sim(emb_question, emb_chunks)[0]
    top_indices = scores.topk(k).indices.tolist()
    return [chunks[i] for i in top_indices]
111
+
def extract_direct_definition(text, keyword):
    """Return the first sentence of ``text`` that looks like a definition of ``keyword``.

    A sentence qualifies when it contains ``keyword`` and one of the cue
    phrases " is ", " are " or " defined as "; both checks ignore case.

    Args:
        text: Document text to search.
        keyword: Term to look for.

    Returns:
        The first matching sentence, or ``None`` when no sentence qualifies.
    """
    try:
        # Imported here so a missing NLTK install or missing punkt data
        # (LookupError) degrades to a simple punctuation split.
        from nltk.tokenize import sent_tokenize
        sentences = sent_tokenize(text)
    except (ImportError, LookupError):
        sentences = re.split(r'(?<=[.!?])\s+', text)

    keyword = keyword.lower()
    cues = (" is ", " are ", " defined as ")

    for sentence in sentences:
        # Match cues on the lower-cased sentence, like the keyword, so
        # "X IS ..." or "Defined As" are not missed.
        lowered = sentence.lower()
        if keyword in lowered and any(cue in lowered for cue in cues):
            return sentence
    return None
121
+
def clean_answer(answer):
    """Tidy a raw answer: drop numeric citations and repeated sentences.

    Args:
        answer: Answer text, possibly containing markers such as ``[3]``.

    Returns:
        The answer without ``[n]`` markers, with whitespace runs collapsed,
        with each distinct sentence kept once in first-seen order, stripped.
    """
    # Take the whitespace in front of a citation with it, so "works [1]."
    # becomes "works." rather than "works .".
    answer = re.sub(r'\s*\[\d+\]', '', answer)
    answer = re.sub(r'\s+', ' ', answer).strip()

    try:
        # Imported here so a missing NLTK install or missing punkt data
        # (LookupError) degrades to a simple punctuation split.
        from nltk.tokenize import sent_tokenize
        sentences = sent_tokenize(answer)
    except (ImportError, LookupError):
        sentences = re.split(r'(?<=[.!?])\s+', answer)

    # dict.fromkeys removes duplicates while preserving order.
    unique_sentences = list(dict.fromkeys(sentences))
    return " ".join(unique_sentences).strip()
126
+
def check_keywords(question):
    """Look up a predefined answer for ``question``.

    Args:
        question: The user's question; matched case-insensitively.

    Returns:
        The response of the first ``KEYWORD_RESPONSES`` entry (in dictionary
        order) whose key occurs as a substring of the lower-cased question,
        or ``None`` when no key matches.
    """
    lowered = question.lower()
    matches = (
        canned
        for phrase, canned in KEYWORD_RESPONSES.items()
        if phrase in lowered
    )
    return next(matches, None)
133
+
def ask_question(file, question, history):
    """Answer ``question`` about the uploaded document and log it in the chat.

    Order of attempts: a predefined answer from ``check_keywords``; then the
    extractive QA pipeline over the most relevant chunks; then, when the QA
    score is not above 0.2, a definition lookup for known keywords present
    in the question.

    Args:
        file: Uploaded file object, or a falsy value when nothing is uploaded.
        question: The user's question.
        history: List of ``(question, answer)`` tuples; appended to in place.
            ``None`` is treated as an empty history.

    Returns:
        ``("", history)``. The first element is the new textbox value, so
        status messages are appended to the chat history rather than being
        returned in that slot (where they would overwrite the user's input).
    """
    history = history if history is not None else []

    def reply(message):
        # Every outcome, including status messages, becomes a chat entry.
        history.append((question, message))
        return "", history

    if not (question or "").strip():
        return "", history  # nothing was asked; leave the chat untouched
    if not file:
        return reply("Please upload a file.")

    # Check for predefined answers first (returned without reading the file).
    predefined_answer = check_keywords(question)
    if predefined_answer:
        return reply(predefined_answer)

    try:
        text = extract_text(file)
        if not text:
            return reply("Could not extract text from the document.")

        # Get relevant context
        chunks = get_relevant_chunks(text, question, embedder)
        if not chunks:
            return reply("No relevant information found in document.")

        # Try QA pipeline
        result = qa_pipeline(question=question, context=" ".join(chunks))

        answer = None
        if result["score"] > 0.2:  # Confidence threshold
            answer = clean_answer(result["answer"])
        else:
            # Fallback to keyword extraction: first keyword that appears in
            # the question and yields a definition wins.
            question_lower = question.lower()
            for keyword in ("system", "natural", "artificial", "component", "objective"):
                if keyword in question_lower:
                    answer = extract_direct_definition(text, keyword)
                    if answer:
                        break

        # Covers "no keyword matched", "no definition found" and an empty
        # cleaned answer, so `answer` is never left unset or None.
        if not answer:
            answer = "Sorry, I couldn't find a clear answer in the document."
    except Exception as exc:
        # UI handler boundary: report the failure in the chat.
        answer = f"An error occurred: {exc}"

    return reply(answer)
173
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Build the single-page UI: upload widget, chat log, question box, feedback row.
with gr.Blocks() as demo:
    gr.Markdown("## Enhanced Document QA System")

    with gr.Row():
        file_input = gr.File(
            label="Upload PDF or DOCX",
            file_types=[".pdf", ".docx"],
        )

    with gr.Row():
        chatbot = gr.Chatbot(height=400)

    with gr.Row():
        question = gr.Textbox(
            label="Ask your question",
            placeholder="Type your question here...",
        )

    with gr.Row():
        # NOTE(review): no click handlers are attached to these buttons here.
        gr.Button("👍")
        gr.Button("👎")

    # Conversation history handed to ask_question on every submit.
    state = gr.State([])

    # Pressing Enter runs ask_question; its two return values populate the
    # textbox and the chat log respectively.
    question.submit(
        fn=ask_question,
        inputs=[file_input, question, state],
        outputs=[question, chatbot],
    )

demo.launch()