Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -181,7 +181,7 @@ def improve_summary_generation(text, model, tokenizer):
|
|
181 |
"length_penalty": 1.5,
|
182 |
"no_repeat_ngram_size": 3,
|
183 |
"temperature": 0.7,
|
184 |
-
"repetition_penalty": 1.5
|
185 |
}
|
186 |
)
|
187 |
|
@@ -214,12 +214,6 @@ def improve_summary_generation(text, model, tokenizer):
|
|
214 |
|
215 |
def validate_summary(summary, original_text):
|
216 |
"""Validate summary content against original text"""
|
217 |
-
import re
|
218 |
-
|
219 |
-
# Don't validate empty summaries
|
220 |
-
if not summary or not original_text:
|
221 |
-
return False
|
222 |
-
|
223 |
# Check for age inconsistencies
|
224 |
age_mentions = re.findall(r'(\d+\.?\d*)\s*years?', summary.lower())
|
225 |
if len(age_mentions) > 1: # Multiple age mentions
|
@@ -237,72 +231,8 @@ def validate_summary(summary, original_text):
|
|
237 |
if summary_words < 20 or summary_words > original_words * 0.8:
|
238 |
return False
|
239 |
|
240 |
-
# Check for common error patterns
|
241 |
-
error_patterns = [
|
242 |
-
r'mean.*mean',
|
243 |
-
r'median.*median',
|
244 |
-
r'results.*results',
|
245 |
-
r'conclusion.*conclusion',
|
246 |
-
r'significance.*significance'
|
247 |
-
]
|
248 |
-
|
249 |
-
for pattern in error_patterns:
|
250 |
-
if len(re.findall(pattern, summary.lower())) > 1:
|
251 |
-
return False
|
252 |
-
|
253 |
return True
|
254 |
|
255 |
-
def post_process_summary(summary):
    """Clean up a generated summary by removing common generation artifacts.

    Steps, in order:
      1. Split the text into sentences on sentence-ending periods.
      2. Keep only the first sentence that mentions age ("age", "ages",
         "aged"), dropping later ones that may contradict it.
      3. Drop sentences that are near-duplicates of an earlier sentence.
      4. Re-join the sentences, remove a few stock phrases, and collapse
         repeated whitespace.

    Args:
        summary: The generated summary text. Falsy values (``None``, ``""``)
            are returned unchanged.

    Returns:
        The cleaned summary string, terminated by a period when non-empty.
    """
    import re

    if not summary:
        return summary

    # Split only on periods that end a sentence (followed by whitespace or
    # end of text) so decimals such as "45.5" or "p = 0.05" stay intact.
    sentences = [part.strip() for part in re.split(r'\.(?=\s|$)', summary)]
    sentences = [sentence for sentence in sentences if sentence]

    # Match "age" as a whole word; a plain substring test would also hit
    # "average", "percentage", "dosage", ... and discard unrelated sentences.
    age_word = re.compile(r'\bage[sd]?\b')
    age_seen = False
    kept = []
    for sentence in sentences:
        if age_word.search(sentence.lower()):
            if age_seen:
                continue  # only the first age statement is kept
            age_seen = True
        kept.append(sentence)

    # Near-duplicate detection: compare order-insensitive word signatures and
    # treat a sentence as redundant when its signature contains, or is
    # contained in, the signature of an earlier sentence.
    seen_signatures = []
    unique_sentences = []
    for sentence in kept:
        signature = ' '.join(sorted(sentence.lower().split()))
        if any(signature in prior or prior in signature
               for prior in seen_signatures):
            continue
        seen_signatures.append(signature)
        unique_sentences.append(sentence)

    # Join sentences with proper spacing and punctuation.
    cleaned_summary = '. '.join(unique_sentences)
    if cleaned_summary and not cleaned_summary.endswith('.'):
        cleaned_summary += '.'

    # Remove stock phrases and stutters the model tends to emit.
    cleaned_summary = cleaned_summary.replace(" and and ", " and ")
    cleaned_summary = cleaned_summary.replace("results showed", "")
    cleaned_summary = cleaned_summary.replace("results indicated", "")

    # Phrase removal can leave runs of spaces behind; collapse them.
    cleaned_summary = re.sub(r'\s{2,}', ' ', cleaned_summary).strip()

    return cleaned_summary
|
305 |
-
|
306 |
def generate_focused_summary(question, abstracts, model, tokenizer):
|
307 |
"""Generate focused summary based on question"""
|
308 |
# Preprocess each abstract
|
@@ -327,22 +257,63 @@ def generate_focused_summary(question, abstracts, model, tokenizer):
|
|
327 |
|
328 |
return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
|
329 |
|
330 |
-
|
331 |
-
|
332 |
-
|
333 |
-
|
334 |
-
|
335 |
-
|
336 |
-
|
337 |
-
|
338 |
-
|
339 |
-
|
340 |
-
|
341 |
-
|
342 |
-
|
343 |
-
|
344 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
345 |
|
|
|
346 |
|
347 |
def main():
|
348 |
st.title("🔬 Biomedical Papers Analysis")
|
@@ -391,46 +362,59 @@ def main():
|
|
391 |
if st.session_state.summaries is None:
|
392 |
try:
|
393 |
with st.spinner("Generating individual paper summaries..."):
|
394 |
-
# Load summarization model
|
395 |
model, tokenizer = load_model("summarize")
|
396 |
-
|
397 |
-
# Generate summaries for each abstract
|
398 |
summaries = []
|
399 |
progress_bar = st.progress(0)
|
400 |
|
401 |
for idx, abstract in enumerate(df['Abstract']):
|
402 |
-
# Replace this line
|
403 |
-
# summary = generate_summary(abstract, model, tokenizer)
|
404 |
-
# With this line
|
405 |
summary = improve_summary_generation(abstract, model, tokenizer)
|
406 |
summaries.append(summary)
|
407 |
progress_bar.progress((idx + 1) / len(df))
|
408 |
|
409 |
-
# Store summaries in session state
|
410 |
st.session_state.summaries = summaries
|
411 |
-
|
412 |
-
# Cleanup
|
413 |
cleanup_model(model, tokenizer)
|
414 |
progress_bar.empty()
|
415 |
|
416 |
except Exception as e:
|
417 |
st.error(f"Error generating summaries: {str(e)}")
|
418 |
-
st.session_state.processing_started = False
|
419 |
|
420 |
-
# Display summaries with improved sorting
|
421 |
if st.session_state.summaries is not None:
|
422 |
col1, col2 = st.columns(2)
|
423 |
with col1:
|
424 |
sort_options = ['Article Title', 'Authors', 'Publication Year', 'Source Title']
|
425 |
-
sort_column = st.selectbox("Sort by:", sort_options)
|
426 |
with col2:
|
427 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
428 |
|
429 |
-
# Create display dataframe
|
430 |
display_df = df.copy()
|
431 |
display_df['Summary'] = st.session_state.summaries
|
432 |
display_df['Publication Year'] = display_df['Publication Year'].astype(int)
|
433 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
434 |
|
435 |
# Apply custom styling
|
436 |
st.markdown("""
|
@@ -463,7 +447,7 @@ def main():
|
|
463 |
</style>
|
464 |
""", unsafe_allow_html=True)
|
465 |
|
466 |
-
# Display papers
|
467 |
for _, row in sorted_df.iterrows():
|
468 |
paper_info_cols = st.columns([1, 1])
|
469 |
|
@@ -489,7 +473,7 @@ def main():
|
|
489 |
</div>
|
490 |
""", unsafe_allow_html=True)
|
491 |
|
492 |
-
# Add
|
493 |
st.markdown("<div style='margin-bottom: 20px;'></div>", unsafe_allow_html=True)
|
494 |
|
495 |
# Question-focused Summary Section (only if question provided)
|
|
|
181 |
"length_penalty": 1.5,
|
182 |
"no_repeat_ngram_size": 3,
|
183 |
"temperature": 0.7,
|
184 |
+
"repetition_penalty": 1.5
|
185 |
}
|
186 |
)
|
187 |
|
|
|
214 |
|
215 |
def validate_summary(summary, original_text):
|
216 |
"""Validate summary content against original text"""
|
|
|
|
|
|
|
|
|
|
|
|
|
217 |
# Check for age inconsistencies
|
218 |
age_mentions = re.findall(r'(\d+\.?\d*)\s*years?', summary.lower())
|
219 |
if len(age_mentions) > 1: # Multiple age mentions
|
|
|
231 |
if summary_words < 20 or summary_words > original_words * 0.8:
|
232 |
return False
|
233 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
234 |
return True
|
235 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
236 |
def generate_focused_summary(question, abstracts, model, tokenizer):
|
237 |
"""Generate focused summary based on question"""
|
238 |
# Preprocess each abstract
|
|
|
257 |
|
258 |
return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
|
259 |
|
260 |
+
def create_filter_controls(df, sort_column):
    """Render filter widgets for the selected column and apply them.

    The widgets shown depend on ``sort_column``:
      * ``'Publication Year'`` - two number inputs bounding a year range.
      * ``'Authors'`` - a multiselect of individual author names, taken
        from the ``';'``-separated ``Authors`` cells.
      * ``'Source Title'`` - a multiselect of source titles.
      * ``'Article Title'`` or any other value - no widgets, no filtering.

    Args:
        df: DataFrame containing the columns named above.
        sort_column: Name of the column chosen in the sort/filter selector.

    Returns:
        A new DataFrame with the rows that pass the active filter. The
        input frame is never modified.
    """
    filtered_df = df.copy()

    if sort_column == 'Publication Year':
        years = df['Publication Year'].dropna()
        if years.empty:
            # No usable years: min()/max() would be NaN and int() would raise.
            return filtered_df

        year_min = int(years.min())
        year_max = int(years.max())
        col1, col2 = st.columns(2)
        with col1:
            start_year = st.number_input('From Year',
                                         min_value=year_min,
                                         max_value=year_max,
                                         value=year_min)
        with col2:
            end_year = st.number_input('To Year',
                                       min_value=year_min,
                                       max_value=year_max,
                                       value=year_max)

        # Accept the bounds in either order rather than silently returning
        # nothing when "From" is later than "To".
        low, high = sorted((start_year, end_year))
        filtered_df = filtered_df[
            (filtered_df['Publication Year'] >= low) &
            (filtered_df['Publication Year'] <= high)
        ]

    elif sort_column == 'Authors':
        unique_authors = sorted({
            author
            for cell in df['Authors'].dropna()
            for author in _split_authors(cell)
        })
        selected_authors = st.multiselect(
            'Select Authors',
            unique_authors
        )
        if selected_authors:
            wanted = set(selected_authors)
            # Compare whole names: a substring test would make "Li, J"
            # also select papers by "Li, Jun".
            filtered_df = filtered_df[
                filtered_df['Authors'].apply(
                    lambda cell: not wanted.isdisjoint(_split_authors(cell))
                )
            ]

    elif sort_column == 'Source Title':
        # Drop missing titles first; NaN cannot be ordered against strings.
        unique_sources = sorted(df['Source Title'].dropna().unique())
        selected_sources = st.multiselect(
            'Select Sources',
            unique_sources
        )
        if selected_sources:
            filtered_df = filtered_df[filtered_df['Source Title'].isin(selected_sources)]

    # 'Article Title' (and any unrecognised column) is sort-only: no filter.
    return filtered_df


def _split_authors(cell):
    """Return the non-empty, stripped author names in a ';'-separated cell."""
    # NaN is the only value that is not equal to itself.
    if cell is None or cell != cell:
        return []
    return [name.strip() for name in str(cell).split(';') if name.strip()]
|
317 |
|
318 |
def main():
|
319 |
st.title("🔬 Biomedical Papers Analysis")
|
|
|
362 |
if st.session_state.summaries is None:
|
363 |
try:
|
364 |
with st.spinner("Generating individual paper summaries..."):
|
|
|
365 |
model, tokenizer = load_model("summarize")
|
|
|
|
|
366 |
summaries = []
|
367 |
progress_bar = st.progress(0)
|
368 |
|
369 |
for idx, abstract in enumerate(df['Abstract']):
|
|
|
|
|
|
|
370 |
summary = improve_summary_generation(abstract, model, tokenizer)
|
371 |
summaries.append(summary)
|
372 |
progress_bar.progress((idx + 1) / len(df))
|
373 |
|
|
|
374 |
st.session_state.summaries = summaries
|
|
|
|
|
375 |
cleanup_model(model, tokenizer)
|
376 |
progress_bar.empty()
|
377 |
|
378 |
except Exception as e:
|
379 |
st.error(f"Error generating summaries: {str(e)}")
|
380 |
+
st.session_state.processing_started = False
|
381 |
|
382 |
+
# Display summaries with improved sorting and filtering
|
383 |
if st.session_state.summaries is not None:
|
384 |
col1, col2 = st.columns(2)
|
385 |
with col1:
|
386 |
sort_options = ['Article Title', 'Authors', 'Publication Year', 'Source Title']
|
387 |
+
sort_column = st.selectbox("Sort/Filter by:", sort_options)
|
388 |
with col2:
|
389 |
+
# Only show A-Z/Z-A option for Article Title
|
390 |
+
if sort_column == 'Article Title':
|
391 |
+
ascending = st.radio(
|
392 |
+
"Sort order",
|
393 |
+
["A to Z", "Z to A"],
|
394 |
+
horizontal=True
|
395 |
+
) == "A to Z"
|
396 |
+
else:
|
397 |
+
ascending = True # Default for other columns
|
398 |
|
399 |
+
# Create display dataframe
|
400 |
display_df = df.copy()
|
401 |
display_df['Summary'] = st.session_state.summaries
|
402 |
display_df['Publication Year'] = display_df['Publication Year'].astype(int)
|
403 |
+
|
404 |
+
# Apply filters
|
405 |
+
filtered_df = create_filter_controls(display_df, sort_column)
|
406 |
+
|
407 |
+
if sort_column == 'Article Title':
|
408 |
+
# Sort alphabetically
|
409 |
+
sorted_df = filtered_df.sort_values(by=sort_column, ascending=ascending)
|
410 |
+
else:
|
411 |
+
# Keep original order for other columns after filtering
|
412 |
+
# Keep original order for other columns after filtering
|
413 |
+
sorted_df = filtered_df
|
414 |
+
|
415 |
+
# Show number of filtered results
|
416 |
+
if len(sorted_df) != len(display_df):
|
417 |
+
st.write(f"Showing {len(sorted_df)} of {len(display_df)} papers")
|
418 |
|
419 |
# Apply custom styling
|
420 |
st.markdown("""
|
|
|
447 |
</style>
|
448 |
""", unsafe_allow_html=True)
|
449 |
|
450 |
+
# Display papers using the filtered and sorted dataframe
|
451 |
for _, row in sorted_df.iterrows():
|
452 |
paper_info_cols = st.columns([1, 1])
|
453 |
|
|
|
473 |
</div>
|
474 |
""", unsafe_allow_html=True)
|
475 |
|
476 |
+
# Add spacing between papers
|
477 |
st.markdown("<div style='margin-bottom: 20px;'></div>", unsafe_allow_html=True)
|
478 |
|
479 |
# Question-focused Summary Section (only if question provided)
|