Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -121,23 +121,52 @@ def preprocess_text(text):
|
|
121 |
|
122 |
return formatted_text
|
123 |
|
124 |
-
def
|
125 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
126 |
if not isinstance(text, str) or not text.strip():
|
127 |
return "No abstract available to summarize."
|
128 |
|
129 |
-
# Check if abstract is too short
|
130 |
word_count = len(text.split())
|
131 |
-
if word_count < 50:
|
132 |
-
return text
|
133 |
|
134 |
-
# Preprocess the text first
|
135 |
formatted_text = preprocess_text(text)
|
136 |
|
137 |
-
# Adjust generation parameters
|
138 |
-
max_length = min(150, word_count + 50) # Dynamic max length
|
139 |
-
min_length = min(50, word_count) # Dynamic min length
|
140 |
-
|
141 |
inputs = tokenizer(formatted_text, return_tensors="pt", max_length=1024, truncation=True)
|
142 |
inputs = {k: v.to(model.device) for k, v in inputs.items()}
|
143 |
|
@@ -146,20 +175,26 @@ def generate_summary(text, model, tokenizer):
|
|
146 |
**{
|
147 |
"input_ids": inputs["input_ids"],
|
148 |
"attention_mask": inputs["attention_mask"],
|
149 |
-
"max_length":
|
150 |
-
"min_length":
|
151 |
-
"num_beams": 4
|
152 |
-
"length_penalty": 2.0
|
153 |
"early_stopping": True,
|
154 |
-
"no_repeat_ngram_size": 3
|
|
|
|
|
|
|
155 |
}
|
156 |
)
|
157 |
|
158 |
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
|
159 |
|
160 |
-
#
|
|
|
|
|
|
|
161 |
if summary.lower() == text.lower() or len(summary.split()) / word_count > 0.9:
|
162 |
-
return text
|
163 |
|
164 |
return summary
|
165 |
|
@@ -242,7 +277,10 @@ def main():
|
|
242 |
progress_bar = st.progress(0)
|
243 |
|
244 |
for idx, abstract in enumerate(df['Abstract']):
|
245 |
-
|
|
|
|
|
|
|
246 |
summaries.append(summary)
|
247 |
progress_bar.progress((idx + 1) / len(df))
|
248 |
|
|
|
121 |
|
122 |
return formatted_text
|
123 |
|
124 |
+
def post_process_summary(summary):
    """Clean up and improve coherence of a model-generated summary.

    Splits the summary into sentence fragments on periods, removes known
    repetition/grammar artifacts of the summarization model, normalizes
    whitespace and capitalization, and rejoins the fragments with proper
    sentence punctuation.

    Args:
        summary: Raw decoded summary text (may be empty/None).

    Returns:
        The cleaned summary string; falsy input is returned unchanged.
    """
    if not summary:
        return summary

    # Split into sentence fragments on '.', dropping empty pieces.
    # NOTE(review): this also splits on abbreviations like "e.g." —
    # acceptable for abstracts, but worth confirming against real output.
    fragments = [frag.strip() for frag in summary.split('.')]
    fragments = [frag for frag in fragments if frag]

    processed = []
    for frag in fragments:
        # Remove redundant duplicated words/phrases seen in model output.
        frag = frag.replace(" and and ", " and ")
        frag = frag.replace("appointment and appointment", "appointment")

        # Fix a known grammatical slip in this domain.
        frag = frag.replace("Cancers distress", "Cancer distress")

        # Collapse any run of whitespace to a single space.
        # Bug fix: the previous code replaced a single space with a single
        # space — a no-op that left double spaces untouched.
        frag = " ".join(frag.split())

        if not frag:
            continue

        # Capitalize only the first character. str.capitalize() would
        # lowercase the rest of the sentence and mangle acronyms
        # (e.g. "NHS data" -> "Nhs data").
        processed.append(frag[0].upper() + frag[1:])

    # Join sentences and guarantee a trailing period.
    cleaned_summary = '. '.join(processed)
    if cleaned_summary and not cleaned_summary.endswith('.'):
        cleaned_summary += '.'

    return cleaned_summary
|
157 |
+
|
158 |
+
def improve_summary_generation(text, model, tokenizer):
    """Enhanced version of generate_summary with better parameters and post-processing.

    Args:
        text: Abstract text to summarize.
        model: Seq2seq summarization model (must expose ``.device`` and
            ``.generate()``).
        tokenizer: Matching tokenizer (callable, with ``.decode()``).

    Returns:
        A cleaned-up summary string; the original text when the abstract is
        short or the model essentially echoed its input; a fixed message
        when no usable text was supplied.
    """
    # Guard against missing/blank abstracts.
    if not isinstance(text, str) or not text.strip():
        return "No abstract available to summarize."

    # Very short abstracts are returned verbatim — summarizing them tends
    # to produce output as long as the input.
    word_count = len(text.split())
    if word_count < 50:
        return text

    formatted_text = preprocess_text(text)

    # Tokenize (truncated to a 1024-token context) and move all tensors
    # to the model's device.
    inputs = tokenizer(formatted_text, return_tensors="pt", max_length=1024, truncation=True)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    # NOTE(review): inference here is not wrapped in torch.no_grad();
    # consider adding it to avoid gradient tracking overhead — confirm
    # against the rest of the file.
    summary_ids = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=min(200, word_count + 50),  # dynamic upper bound
        min_length=min(50, word_count),        # dynamic lower bound
        num_beams=5,
        length_penalty=1.5,
        early_stopping=True,
        no_repeat_ngram_size=3,
        repetition_penalty=1.2,
        # Bug fix: temperature=0.7 and top_p=0.9 were passed here, but
        # they only take effect when do_sample=True; with pure beam
        # search they are ignored (newer transformers versions warn
        # about them). Removed as no-ops.
    )

    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    # Clean up repetition/spacing/capitalization artifacts.
    summary = post_process_summary(summary)

    # If the model essentially echoed the input (identical text, or the
    # "summary" is >90% of the original length), fall back to the
    # original abstract.
    if summary.lower() == text.lower() or len(summary.split()) / word_count > 0.9:
        return text

    return summary
|
200 |
|
|
|
277 |
progress_bar = st.progress(0)
|
278 |
|
279 |
for idx, abstract in enumerate(df['Abstract']):
|
280 |
+
# Replace this line
|
281 |
+
# summary = generate_summary(abstract, model, tokenizer)
|
282 |
+
# With this line
|
283 |
+
summary = improve_summary_generation(abstract, model, tokenizer)
|
284 |
summaries.append(summary)
|
285 |
progress_bar.progress((idx + 1) / len(df))
|
286 |
|