pendar02 committed on
Commit
770037f
·
verified ·
1 Parent(s): b5db7e8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +56 -18
app.py CHANGED
@@ -121,23 +121,52 @@ def preprocess_text(text):
121
 
122
  return formatted_text
123
 
124
def generate_summary(text, model, tokenizer):
    """Generate summary for single abstract.

    Returns the original text for missing/short abstracts, otherwise a
    beam-search summary; falls back to the input when the model output
    is essentially a copy of it.
    """
    # Guard clauses: nothing usable to summarize.
    if not isinstance(text, str) or not text.strip():
        return "No abstract available to summarize."

    word_count = len(text.split())
    if word_count < 50:
        # Very short abstracts are returned verbatim.
        return text

    formatted_text = preprocess_text(text)

    # Generation bounds scale with the input size.
    dynamic_max = min(150, word_count + 50)
    dynamic_min = min(50, word_count)

    encoded = tokenizer(formatted_text, return_tensors="pt", max_length=1024, truncation=True)
    encoded = {name: tensor.to(model.device) for name, tensor in encoded.items()}

    summary_ids = model.generate(
        input_ids=encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        max_length=dynamic_max,
        min_length=dynamic_min,
        num_beams=4,
        length_penalty=2.0,
        early_stopping=True,
        no_repeat_ngram_size=3,
    )

    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    # Fall back to the original text when the "summary" is essentially a copy
    # (identical, or covering >90% of the input's word count).
    if summary.lower() == text.lower() or len(summary.split()) / word_count > 0.9:
        return text
    return summary
165
 
@@ -242,7 +277,10 @@ def main():
242
  progress_bar = st.progress(0)
243
 
244
  for idx, abstract in enumerate(df['Abstract']):
245
- summary = generate_summary(abstract, model, tokenizer)
 
 
 
246
  summaries.append(summary)
247
  progress_bar.progress((idx + 1) / len(df))
248
 
 
121
 
122
  return formatted_text
123
 
124
def post_process_summary(summary):
    """Clean up and improve summary coherence.

    Splits the model output into rough sentences (on '.'), removes known
    repetition artifacts, normalizes whitespace, re-capitalizes each
    sentence, and rejoins them with proper punctuation.

    NOTE(review): splitting on '.' also breaks on decimals and
    abbreviations (e.g. "3.5", "et al.") — acceptable for typical model
    output, but confirm against real summaries.

    Args:
        summary: Raw decoded model output (may be empty/None).

    Returns:
        The cleaned summary string; falsy inputs are returned unchanged.
    """
    if not summary:
        return summary

    # Split into rough sentences and drop empty fragments.
    sentences = [s.strip() for s in summary.split('.')]
    sentences = [s for s in sentences if s]

    processed_sentences = []
    for sentence in sentences:
        # Remove redundant words/phrases the model tends to emit.
        sentence = sentence.replace(" and and ", " and ")
        sentence = sentence.replace("appointment and appointment", "appointment")

        # Fix common grammatical issues seen in outputs.
        sentence = sentence.replace("Cancers distress", "Cancer distress")

        # Collapse any run of whitespace into single spaces.
        # (Bug fix: the previous replace(" ", " ") swapped a space for a
        # space — a no-op that never removed double spaces.)
        sentence = " ".join(sentence.split())

        # Capitalize only the first letter. str.capitalize() would also
        # lower-case the remainder and destroy acronyms such as "DNA".
        if sentence:
            sentence = sentence[0].upper() + sentence[1:]
            processed_sentences.append(sentence)

    # Join sentences with proper spacing and terminal punctuation.
    cleaned_summary = '. '.join(processed_sentences)
    if cleaned_summary and not cleaned_summary.endswith('.'):
        cleaned_summary += '.'

    return cleaned_summary
157
+
158
def improve_summary_generation(text, model, tokenizer):
    """Enhanced version of generate_summary with better parameters and post-processing.

    Args:
        text: Abstract text to summarize.
        model: Seq2seq summarization model (must provide .generate and .device).
        tokenizer: Tokenizer matching the model.

    Returns:
        The cleaned summary, or the original text when the abstract is very
        short or the model output is essentially a copy of the input.
    """
    if not isinstance(text, str) or not text.strip():
        return "No abstract available to summarize."

    word_count = len(text.split())
    if word_count < 50:
        # Very short abstracts are returned as-is; summarizing would add noise.
        return text

    formatted_text = preprocess_text(text)

    inputs = tokenizer(formatted_text, return_tensors="pt", max_length=1024, truncation=True)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    # Beam search with length bounds scaled to the input.
    # Bug fix: the previous call also passed "temperature" and "top_p",
    # which are ignored (transformers emits a warning) unless
    # do_sample=True is set; they are dropped here so generation stays
    # deterministic. repetition_penalty DOES apply under beam search and
    # is kept.
    summary_ids = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=min(200, word_count + 50),
        min_length=min(50, word_count),
        num_beams=5,
        length_penalty=1.5,
        early_stopping=True,
        no_repeat_ngram_size=3,
        repetition_penalty=1.2,
    )

    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    # Clean up repetition/capitalization artifacts in the raw output.
    summary = post_process_summary(summary)

    # Return the original if the summary is too similar to it.
    # (word_count >= 50 on this path, so the division is safe.)
    if summary.lower() == text.lower() or len(summary.split()) / word_count > 0.9:
        return text

    return summary
200
 
 
277
  progress_bar = st.progress(0)
278
 
279
  for idx, abstract in enumerate(df['Abstract']):
280
+ # Replace this line
281
+ # summary = generate_summary(abstract, model, tokenizer)
282
+ # With this line
283
+ summary = improve_summary_generation(abstract, model, tokenizer)
284
  summaries.append(summary)
285
  progress_bar.progress((idx + 1) / len(df))
286