NaimaAqeel committed on
Commit
b1bb0b3
·
verified ·
1 Parent(s): 2ecc80b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +112 -134
app.py CHANGED
@@ -1,188 +1,166 @@
1
  import gradio as gr
2
  from PyPDF2 import PdfReader
3
  import docx
 
 
4
  import re
5
  import torch
6
- from sentence_transformers import SentenceTransformer, util
7
- from transformers import pipeline
8
- from nltk.tokenize import sent_tokenize
9
- import nltk
10
-
11
- # Download NLTK data (run once)
12
- nltk.download('punkt')
13
 
# Load models
# Sentence-embedding model; the retrieval code calls embedder.encode(...) on
# document chunks and on the question.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Device index handed to the pipeline: 0 when CUDA is available, otherwise -1.
_qa_device = 0 if torch.cuda.is_available() else -1

# Extractive question-answering pipeline used by ask_question.
qa_pipeline = pipeline(
    task="question-answering",
    model="deepset/roberta-base-squad2",
    device=_qa_device,
)
21
 
# Predefined answers for key questions
# Maps a lowercase trigger phrase to a canned answer. check_keywords() returns
# the first entry whose key occurs as a substring of the lowercased question,
# so these answers are returned without reading the uploaded document.
# NOTE(review): the indentation inside the answer strings could not be
# recovered from the diff view this was taken from - confirm against app.py.
KEYWORD_RESPONSES = {
    "what is a system": """
A system is a collection of interrelated components designed to perform specific functions. Key characteristics:
- Composed of multiple components that work together
- Has defined objectives/purpose
- Operates within an environment
- Components communicate with each other
Example from document: A car is made up of an engine, wheels, brakes and other related items that work together.
""",

    "types of a system": """
The document clearly states there are two main types:
1. Natural Systems - exist independently without human involvement (e.g., ecosystems)
2. Artificial Systems - created by humans (e.g., computer systems, transportation systems)
""",

    "what is an artificial system": """
Artificial Systems are human-created systems designed for specific purposes. Key points:
- Created by people to solve problems or perform tasks
- Three main categories mentioned in the document:
• Knowledge Systems (math, databases)
• Engineering Systems (civil, mechanical)
• Social Systems (governments, organizations)
Example: A computer system processes data to perform tasks.
""",

    "what is a natural system": """
Natural Systems exist independently without human involvement. The document specifies:
- Governed by natural laws and processes
- Four subtypes:
1. Physical (planets, atoms)
2. Chemical (chemical reactions)
3. Biological (living organisms)
4. Psychological (human mind/behavior)
Example: An ecosystem where species interact naturally.
""",

    "components of a system": """
The document describes system components as:
1. Fundamental building blocks that work together
2. Each component has a specific role
3. Must communicate effectively
4. Examples given:
- In computers: CPU, memory, I/O devices
- In cars: engine, wheels, brakes
The exact components vary by system type.
"""
}
71
 
def extract_text(file):
    """Extract cleaned plain text from an uploaded PDF or DOCX file.

    Args:
        file: Uploaded file object. Its ``name`` attribute selects the parser,
            and the object itself is passed to ``PdfReader`` or
            ``docx.Document``.

    Returns:
        The document text with whitespace runs collapsed to single spaces and
        ``[...]`` spans removed, or ``""`` for an unsupported extension.
    """
    # Compare extensions case-insensitively so "REPORT.PDF" is not rejected.
    filename = file.name.lower()

    if filename.endswith(".pdf"):
        # extract_text() may yield nothing for a page; treat that as empty.
        text = "".join(page.extract_text() or "" for page in PdfReader(file).pages)
    elif filename.endswith(".docx"):
        text = "\n".join(p.text for p in docx.Document(file).paragraphs)
    else:
        return ""

    text = re.sub(r'\s+', ' ', text)     # Normalize whitespace
    text = re.sub(r'\[.*?\]', '', text)  # Remove [comments]
    return text.strip()
85
 
def chunk_text(text, chunk_size=500, overlap=50):
    """Group the sentences of *text* into chunks of roughly *chunk_size* characters.

    Sentences come from ``sent_tokenize`` and are appended to the current
    chunk while it stays below *chunk_size*. When a sentence does not fit, the
    chunk is closed and the next one starts with the last *overlap* characters
    of the closed chunk followed by that sentence.

    Args:
        text: Text to split.
        chunk_size: Soft upper bound, in characters, for a chunk.
        overlap: Number of trailing characters carried into the next chunk.

    Returns:
        A list of non-empty, stripped chunk strings.
    """
    chunks = []
    current_chunk = ""

    for sentence in sent_tokenize(text):
        if len(current_chunk) + len(sentence) < chunk_size:
            current_chunk += sentence + " "
            continue

        # A first sentence longer than chunk_size arrives here with nothing
        # accumulated yet; do not emit an empty chunk for it.
        finished = current_chunk.strip()
        if finished:
            chunks.append(finished)

        # s[-0:] is the whole string, so overlap=0 must be handled explicitly.
        tail = current_chunk[-overlap:] if overlap > 0 else ""
        current_chunk = tail + sentence + " "

    finished = current_chunk.strip()
    if finished:
        chunks.append(finished)
    return chunks
101
 
def get_relevant_chunks(text, question, embedder, top_k=3):
    """Return the chunks of *text* most similar to *question*.

    Args:
        text: Full document text; it is split with ``chunk_text``.
        question: The user's question.
        embedder: Object whose ``encode(..., convert_to_tensor=True)`` yields
            embeddings for the chunks and for the question.
        top_k: Maximum number of chunks to return.

    Returns:
        Up to *top_k* chunk strings ordered from most to least similar, or an
        empty list when there is nothing to rank.
    """
    chunks = chunk_text(text)
    # topk() raises when asked for more entries than exist, so short
    # documents must clamp k to the number of chunks.
    k = min(top_k, len(chunks))
    if k <= 0:
        return []

    emb_chunks = embedder.encode(chunks, convert_to_tensor=True)
    emb_question = embedder.encode(question, convert_to_tensor=True)
    scores = util.pytorch_cos_sim(emb_question, emb_chunks)[0]
    top_indices = scores.topk(k).indices.tolist()
    return [chunks[i] for i in top_indices]
 
 
 
 
 
111
 
def extract_direct_definition(text, keyword):
    """Return the first sentence mentioning *keyword* that reads like a definition.

    A sentence qualifies when it contains the keyword (compared in lowercase)
    and one of the literal cues " is ", " are " or " defined as ". Returns
    None when no sentence qualifies.
    """
    needle = keyword.lower()
    cues = (" is ", " are ", " defined as ")

    for candidate in sent_tokenize(text):
        if needle not in candidate.lower():
            continue
        # Cues are matched against the sentence as written, not lowercased.
        if any(cue in candidate for cue in cues):
            return candidate
    return None
121
-
def clean_answer(answer):
    """Strip numeric citations such as "[3]" and drop repeated sentences.

    The first occurrence of each sentence is kept, in original order, and the
    survivors are joined with single spaces.
    """
    without_citations = re.sub(r'\[\d+\]', '', answer)

    seen = set()
    unique_sentences = []
    for sentence in sent_tokenize(without_citations):
        if sentence in seen:
            continue
        seen.add(sentence)
        unique_sentences.append(sentence)

    return " ".join(unique_sentences).strip()
126
 
def check_keywords(question):
    """Return the canned answer whose trigger phrase occurs in *question*.

    The lowercased question is checked against each key of KEYWORD_RESPONSES
    in the dict's order; the first hit wins. Returns None when nothing matches.
    """
    lowered = question.lower()
    matches = (
        reply
        for phrase, reply in KEYWORD_RESPONSES.items()
        if phrase in lowered
    )
    return next(matches, None)
133
 
def ask_question(file, question, history):
    """Answer *question* about the uploaded *file* and record it in *history*.

    Order of attempts: a canned answer from check_keywords, then the QA
    pipeline over the retrieved chunks, then a definition sentence found by
    keyword, and finally a fixed apology.

    Returns:
        A ``(textbox_value, history)`` pair. On success the textbox value is
        ``""`` and the (question, answer) pair has been appended to history;
        on early failure the textbox value is the message and history is
        returned unchanged.
    """
    if not file:
        return "Please upload a file.", history

    # Canned answers take priority over anything in the document.
    canned = check_keywords(question)
    if canned:
        history.append((question, canned))
        return "", history

    text = extract_text(file)
    if not text:
        return "Could not extract text from the document.", history

    chunks = get_relevant_chunks(text, question, embedder)
    if not chunks:
        return "No relevant information found in document.", history

    prediction = qa_pipeline(question=question, context=" ".join(chunks))

    if prediction["score"] > 0.2:  # Confidence threshold
        answer = clean_answer(prediction["answer"])
    else:
        # Low confidence: look for a sentence defining a keyword that the
        # question mentions, trying the keywords in this fixed order.
        answer = None
        for keyword in ("system", "natural", "artificial", "component", "objective"):
            if keyword not in question.lower():
                continue
            answer = extract_direct_definition(text, keyword)
            if answer:
                break
        if not answer:
            answer = "Sorry, I couldn't find a clear answer in the document."

    history.append((question, answer))
    return "", history
173
 
# Gradio UI: file upload, chat window, question box, and two feedback
# buttons that have no handlers attached.
with gr.Blocks() as demo:
    gr.Markdown("## Enhanced Document QA System")

    with gr.Row():
        file_input = gr.File(
            label="Upload PDF or DOCX",
            file_types=[".pdf", ".docx"],
        )

    with gr.Row():
        chatbot = gr.Chatbot(height=400)

    with gr.Row():
        question = gr.Textbox(
            label="Ask your question",
            placeholder="Type your question here...",
        )

    with gr.Row():
        gr.Button("👍")
        gr.Button("👎")

    # Conversation history passed to ask_question as its third argument.
    state = gr.State([])

    # ask_question returns (textbox value, history); they are routed to the
    # question box and the chat window respectively.
    question.submit(
        fn=ask_question,
        inputs=[file_input, question, state],
        outputs=[question, chatbot],
    )

demo.launch()
 
1
  import gradio as gr
2
  from PyPDF2 import PdfReader
3
  import docx
4
+ from sentence_transformers import SentenceTransformer, util
5
+ from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
6
  import re
7
  import torch
 
 
 
 
 
 
 
8
 
# Load models
# Sentence-embedding model; ask_question calls embedder.encode(...) on the
# document chunks and on the question.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Device index handed to the pipeline: 0 when CUDA is available, otherwise -1.
_qa_device = 0 if torch.cuda.is_available() else -1

# Extractive question-answering pipeline used by ask_question.
qa_pipeline = pipeline(
    task="question-answering",
    model="distilbert-base-cased-distilled-squad",
    device=_qa_device,
)

# Load GPT model (using GPT-2 as example - replace with GPT-3/4 if available)
_GPT_CHECKPOINT = "gpt2"
gpt_tokenizer = AutoTokenizer.from_pretrained(_GPT_CHECKPOINT)
gpt_model = AutoModelForCausalLM.from_pretrained(_GPT_CHECKPOINT)
gpt_model.eval()  # inference mode; the model is only used for generation here
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
def extract_text(file):
    """Extract whitespace-normalized plain text from an uploaded PDF or DOCX file.

    Args:
        file: Uploaded file object. Its ``name`` attribute selects the parser,
            and the object itself is passed to ``PdfReader`` or
            ``docx.Document``.

    Returns:
        The document text with every whitespace run collapsed to one space and
        the ends stripped, or ``""`` for an unsupported extension.
    """
    # Compare extensions case-insensitively so "REPORT.PDF" is not rejected.
    filename = file.name.lower()

    if filename.endswith(".pdf"):
        # extract_text() may yield nothing for a page; treat that as empty.
        text = "\n".join(page.extract_text() or "" for page in PdfReader(file).pages)
    elif filename.endswith(".docx"):
        text = "\n".join(p.text for p in docx.Document(file).paragraphs)
    else:
        return ""

    # Clean up text
    text = re.sub(r'\s+', ' ', text)  # Replace multiple whitespace with single space
    return text.strip()
30
 
def chunk_text(text, chunk_size=500, overlap=100):
    """Group the sentences of *text* into chunks of roughly *chunk_size* characters.

    Sentences are found by splitting on whitespace that follows "." or "?"
    (with lookbehinds that skip some abbreviation-like patterns). A sentence
    is appended to the current chunk while the chunk stays below *chunk_size*;
    otherwise the chunk is closed and the next one starts with the last
    *overlap* characters of the closed chunk followed by that sentence.

    Args:
        text: Text to split.
        chunk_size: Soft upper bound, in characters, for a chunk. A single
            sentence longer than this still becomes one oversized chunk.
        overlap: Number of trailing characters carried into the next chunk.

    Returns:
        A list of non-empty, stripped chunk strings; ``[]`` for blank text.
    """
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
    chunks = []
    current_chunk = ""

    for sent in sentences:
        if len(current_chunk) + len(sent) < chunk_size:
            current_chunk += sent + " "
            continue

        # A first sentence longer than chunk_size arrives here with nothing
        # accumulated yet; do not emit an empty chunk for it.
        finished = current_chunk.strip()
        if finished:
            chunks.append(finished)

        # Keep some overlap between chunks for context. s[-0:] is the whole
        # string, so overlap=0 must be handled explicitly.
        tail = current_chunk[-overlap:] if overlap > 0 else ""
        current_chunk = tail + sent + " "

    finished = current_chunk.strip()
    if finished:
        chunks.append(finished)
    return chunks
47
 
def generate_with_gpt(prompt, max_length=150):
    """Sample a continuation of *prompt* from the module-level GPT model.

    Args:
        prompt: Text to continue.
        max_length: Maximum number of NEW tokens to generate. It is passed as
            ``max_new_tokens`` so the budget does not shrink as the prompt
            grows; a total-length limit left no room to generate once the
            prompt alone reached it.

    Returns:
        The decoded sequence (prompt followed by the sampled continuation)
        with special tokens removed.
    """
    inputs = gpt_tokenizer(prompt, return_tensors="pt")

    # Keep prompt + continuation inside the model's context window. Drop the
    # oldest prompt tokens so the end of the prompt, which carries the
    # instruction to complete, survives.
    context_window = getattr(gpt_model.config, "max_position_embeddings", 1024)
    prompt_budget = max(1, context_window - max_length)
    input_ids = inputs.input_ids[:, -prompt_budget:]
    attention_mask = inputs.attention_mask[:, -prompt_budget:]

    with torch.no_grad():
        outputs = gpt_model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_new_tokens=max_length,
            num_return_sequences=1,
            no_repeat_ngram_size=2,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            temperature=0.7,
            # Pass an explicit pad id; the end-of-sequence token serves here.
            pad_token_id=gpt_tokenizer.eos_token_id,
        )
    return gpt_tokenizer.decode(outputs[0], skip_special_tokens=True)
62
 
def refine_answer_with_gpt(context, question, initial_answer):
    """Ask the GPT model to rewrite *initial_answer* using *context*.

    Args:
        context: Document text the answer was extracted from.
        question: The user's question.
        initial_answer: Answer to be refined.

    Returns:
        The text the model produced after the "Improved Answer:" marker, or
        *initial_answer* when the model produced nothing usable.
    """
    marker = "Improved Answer:"
    prompt = (
        "\n"
        "Based on the following context, refine the answer to make it more clear and complete:\n"
        "\n"
        f"Context: {context}\n"
        "\n"
        f"Question: {question}\n"
        "\n"
        f"Initial Answer: {initial_answer}\n"
        "\n"
        f"{marker}\n"
    )
    generated = generate_with_gpt(prompt)

    # The generator may echo the prompt ahead of its continuation; without
    # this cut the user would be shown the instructions, context and question
    # as the "answer". If the marker is absent, use the text as-is.
    _, found, continuation = generated.partition(marker)
    refined = continuation.strip() if found else generated.strip()
    return refined or initial_answer
 
76
 
def extract_direct_definition(text, term):
    """Return the first sentence of *text* that appears to define *term*.

    A sentence qualifies when, compared in lowercase, it contains the term
    together with one of the cues " is ", " are ", " refers to " or
    " defined as ". Returns None when no sentence qualifies.
    """
    needle = term.lower()
    cues = (" is ", " are ", " refers to ", " defined as ")

    for sentence in re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text):
        lowered = sentence.lower()
        if needle in lowered and any(cue in lowered for cue in cues):
            return sentence
    return None
93
 
def ask_question(file, question, history):
    """Answer *question* about the uploaded *file* and record it in *history*.

    Order of attempts: a definition sentence for one of a few known terms,
    then semantic search plus the QA pipeline (refined by the GPT model when
    the extracted answer is longer than two words), then a fixed apology.

    Args:
        file: Uploaded file object; falsy when nothing was uploaded.
        question: The user's question.
        history: List of (question, answer) pairs; appended to in place.

    Returns:
        A ``(textbox_value, history)`` pair. On completion the textbox value
        is ``""`` and the new pair has been appended to history; on early
        failure the textbox value is the message and history is unchanged.
    """
    if not file:
        return "Please upload a file.", history

    text = extract_text(file)
    if not text:
        return "Could not extract text from the file.", history

    chunks = chunk_text(text)
    if not chunks:
        return "No meaningful text chunks could be created.", history

    # Normalize question for better matching
    normalized_question = question.lower().strip(" ?")

    try:
        # Start from None: a question mentioning none of the known terms
        # otherwise leaves `answer` unbound, and every such question failed
        # with "An error occurred: ..." instead of reaching semantic search.
        answer = None

        # First try to find direct definitions; only the first matching term
        # is tried.
        for term in ("artificial system", "natural system", "component"):
            if term in normalized_question:
                answer = extract_direct_definition(text, term)
                break

        # If no direct definition found, use semantic search
        if not answer:
            emb_chunks = embedder.encode(chunks, convert_to_tensor=True)
            emb_question = embedder.encode(question, convert_to_tensor=True)
            scores = util.pytorch_cos_sim(emb_question, emb_chunks)[0]
            best_idx = scores.argmax().item()
            best_chunk = chunks[best_idx]

            # Combine top chunks if confidence is low
            if scores[best_idx].item() < 0.3:
                top_k = min(3, len(chunks))
                best_indices = scores.topk(top_k).indices.tolist()
                best_chunk = " ".join(chunks[i] for i in best_indices)

            # Get initial answer from QA model
            result = qa_pipeline(question=question, context=best_chunk)
            answer = result["answer"] if result["score"] > 0.1 else None

            # Refine with GPT here, where best_chunk is guaranteed to exist.
            if answer and len(answer.split()) > 2:
                answer = refine_answer_with_gpt(best_chunk, question, answer)

        # Final fallback
        if not answer:
            answer = "Sorry, I couldn't find a clear answer in the document."

    except Exception as e:
        # UI boundary: show the failure in the chat instead of crashing.
        answer = f"An error occurred: {str(e)}"

    history.append((question, answer))
    return "", history
149
 
# Gradio UI: file upload, chat window and question box.
with gr.Blocks() as demo:
    gr.Markdown("## Enhanced Document QA with GPT Integration")

    with gr.Row():
        file_input = gr.File(
            label="Upload PDF or Word",
            file_types=[".pdf", ".docx"],
        )

    with gr.Row():
        chatbot = gr.Chatbot(height=400)

    with gr.Row():
        question = gr.Textbox(
            label="Ask your question",
            placeholder="Type your question here...",
        )

    # Conversation history passed to ask_question as its third argument.
    state = gr.State([])

    # ask_question returns (textbox value, history); they are routed to the
    # question box and the chat window respectively.
    question.submit(
        fn=ask_question,
        inputs=[file_input, question, state],
        outputs=[question, chatbot],
    )

demo.launch()