pendar02 committed on
Commit
4742b6b
·
verified ·
1 Parent(s): ba5200e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +87 -103
app.py CHANGED
@@ -181,7 +181,7 @@ def improve_summary_generation(text, model, tokenizer):
181
  "length_penalty": 1.5,
182
  "no_repeat_ngram_size": 3,
183
  "temperature": 0.7,
184
- "repetition_penalty": 1.5 # Increased to reduce repetition
185
  }
186
  )
187
 
@@ -214,12 +214,6 @@ def improve_summary_generation(text, model, tokenizer):
214
 
215
  def validate_summary(summary, original_text):
216
  """Validate summary content against original text"""
217
- import re
218
-
219
- # Don't validate empty summaries
220
- if not summary or not original_text:
221
- return False
222
-
223
  # Check for age inconsistencies
224
  age_mentions = re.findall(r'(\d+\.?\d*)\s*years?', summary.lower())
225
  if len(age_mentions) > 1: # Multiple age mentions
@@ -237,72 +231,8 @@ def validate_summary(summary, original_text):
237
  if summary_words < 20 or summary_words > original_words * 0.8:
238
  return False
239
 
240
- # Check for common error patterns
241
- error_patterns = [
242
- r'mean.*mean',
243
- r'median.*median',
244
- r'results.*results',
245
- r'conclusion.*conclusion',
246
- r'significance.*significance'
247
- ]
248
-
249
- for pattern in error_patterns:
250
- if len(re.findall(pattern, summary.lower())) > 1:
251
- return False
252
-
253
  return True
254
 
255
def post_process_summary(summary):
    """Enhanced post-processing to catch common errors.

    Steps, in order:
      1. Split the summary into sentences and keep only the first sentence
         that mentions age; later age sentences are dropped as potentially
         contradictory.
      2. Drop empty fragments and near-duplicate sentences.
      3. Re-join with ". ", guarantee a trailing period, and strip a few
         known filler phrases.

    Args:
        summary: Generated summary text. Falsy values (``None``, ``""``) are
            returned unchanged.

    Returns:
        The cleaned summary string.
    """
    import re  # function-scope import keeps this block self-contained

    if not summary:
        return summary

    # Split on sentence-ending periods only. A period with a digit on both
    # sides is a decimal point ("3.5 years") and must not break the sentence.
    sentences = re.split(r'(?<!\d)\.|\.(?!\d)', summary)

    # Match "age"/"ages"/"aged" as whole words. A plain substring test would
    # also hit "average", "percentage", "stage", ... and discard unrelated
    # sentences.
    age_word = re.compile(r'\bage[ds]?\b')

    # Remove contradictory age statements: only the first one survives.
    age_seen = False
    kept = []
    for sentence in sentences:
        if age_word.search(sentence.lower()):
            if age_seen:
                continue
            age_seen = True
        kept.append(sentence)

    # Remove redundant statements.
    seen_cores = set()
    unique_sentences = []
    for sentence in kept:
        if not sentence.strip():
            continue

        # Order-insensitive key: lower-cased words, sorted and re-joined.
        core = ' '.join(sorted(sentence.lower().split()))

        # Near-duplicate when one key is contained in another (string
        # containment on the sorted-word keys).
        if any(core in seen or seen in core for seen in seen_cores):
            continue

        seen_cores.add(core)
        unique_sentences.append(sentence)

    # Join sentences with proper spacing and punctuation.
    cleaned_summary = '. '.join(s.strip() for s in unique_sentences)
    if cleaned_summary and not cleaned_summary.endswith('.'):
        cleaned_summary += '.'

    # Additional cleaning.
    cleaned_summary = cleaned_summary.replace(" and and ", " and ")
    cleaned_summary = cleaned_summary.replace("results showed", "")
    cleaned_summary = cleaned_summary.replace("results indicated", "")
    # Removing the phrases above leaves runs of spaces behind; collapse them.
    cleaned_summary = re.sub(r' {2,}', ' ', cleaned_summary).strip()

    return cleaned_summary
305
-
306
  def generate_focused_summary(question, abstracts, model, tokenizer):
307
  """Generate focused summary based on question"""
308
  # Preprocess each abstract
@@ -327,22 +257,63 @@ def generate_focused_summary(question, abstracts, model, tokenizer):
327
 
328
  return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
329
 
330
-
331
def validate_summary(summary, original_text):
    """Validate summary content against original text.

    Args:
        summary: Generated summary text to check.
        original_text: Source text of the summary. Accepted for interface
            compatibility; the checks below do not inspect it.

    Returns:
        ``False`` when the summary contains more than one "<number> year(s)"
        mention, or when more than one sentence is a repeat of another;
        ``True`` otherwise.
    """
    import re  # function-scope import keeps this block self-contained

    # Check for age inconsistencies: more than one "<number> year(s)" mention.
    age_mentions = re.findall(r'(\d+\.?\d*)\s*years?', summary.lower())
    if len(age_mentions) > 1:
        return False

    # Check for repetitive sentences. Only non-empty fragments are counted:
    # the empty piece that split('.') yields after a trailing period is not a
    # sentence and must not inflate the duplicate count.
    sentences = [s.strip().lower() for s in summary.split('.') if s.strip()]
    duplicate_count = len(sentences) - len(set(sentences))
    if duplicate_count > 1:  # More than one duplicate
        return False

    return True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
345
 
 
346
 
347
  def main():
348
  st.title("🔬 Biomedical Papers Analysis")
@@ -391,46 +362,59 @@ def main():
391
  if st.session_state.summaries is None:
392
  try:
393
  with st.spinner("Generating individual paper summaries..."):
394
- # Load summarization model
395
  model, tokenizer = load_model("summarize")
396
-
397
- # Generate summaries for each abstract
398
  summaries = []
399
  progress_bar = st.progress(0)
400
 
401
  for idx, abstract in enumerate(df['Abstract']):
402
- # Replace this line
403
- # summary = generate_summary(abstract, model, tokenizer)
404
- # With this line
405
  summary = improve_summary_generation(abstract, model, tokenizer)
406
  summaries.append(summary)
407
  progress_bar.progress((idx + 1) / len(df))
408
 
409
- # Store summaries in session state
410
  st.session_state.summaries = summaries
411
-
412
- # Cleanup
413
  cleanup_model(model, tokenizer)
414
  progress_bar.empty()
415
 
416
  except Exception as e:
417
  st.error(f"Error generating summaries: {str(e)}")
418
- st.session_state.processing_started = False # Reset to allow retry
419
 
420
- # Display summaries with improved sorting
421
  if st.session_state.summaries is not None:
422
  col1, col2 = st.columns(2)
423
  with col1:
424
  sort_options = ['Article Title', 'Authors', 'Publication Year', 'Source Title']
425
- sort_column = st.selectbox("Sort by:", sort_options)
426
  with col2:
427
- ascending = st.checkbox("Ascending order", True)
 
 
 
 
 
 
 
 
428
 
429
- # Create display dataframe with formatted year
430
  display_df = df.copy()
431
  display_df['Summary'] = st.session_state.summaries
432
  display_df['Publication Year'] = display_df['Publication Year'].astype(int)
433
- sorted_df = display_df.sort_values(by=sort_column, ascending=ascending)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
434
 
435
  # Apply custom styling
436
  st.markdown("""
@@ -463,7 +447,7 @@ def main():
463
  </style>
464
  """, unsafe_allow_html=True)
465
 
466
- # Display papers in side-by-side layout
467
  for _, row in sorted_df.iterrows():
468
  paper_info_cols = st.columns([1, 1])
469
 
@@ -489,7 +473,7 @@ def main():
489
  </div>
490
  """, unsafe_allow_html=True)
491
 
492
- # Add some spacing between papers
493
  st.markdown("<div style='margin-bottom: 20px;'></div>", unsafe_allow_html=True)
494
 
495
  # Question-focused Summary Section (only if question provided)
 
181
  "length_penalty": 1.5,
182
  "no_repeat_ngram_size": 3,
183
  "temperature": 0.7,
184
+ "repetition_penalty": 1.5
185
  }
186
  )
187
 
 
214
 
215
  def validate_summary(summary, original_text):
216
  """Validate summary content against original text"""
 
 
 
 
 
 
217
  # Check for age inconsistencies
218
  age_mentions = re.findall(r'(\d+\.?\d*)\s*years?', summary.lower())
219
  if len(age_mentions) > 1: # Multiple age mentions
 
231
  if summary_words < 20 or summary_words > original_words * 0.8:
232
  return False
233
 
 
 
 
 
 
 
 
 
 
 
 
 
 
234
  return True
235
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
236
  def generate_focused_summary(question, abstracts, model, tokenizer):
237
  """Generate focused summary based on question"""
238
  # Preprocess each abstract
 
257
 
258
  return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
259
 
260
def _split_authors(cell):
    """Return the stripped, non-empty author names in one 'Authors' cell."""
    if not isinstance(cell, str):
        return []  # missing values (NaN/None) carry no author names
    return [name.strip() for name in cell.split(';') if name.strip()]


def create_filter_controls(df, sort_column):
    """Create appropriate filter controls based on the selected column.

    Renders widgets through the module-level ``st`` object (presumably
    Streamlit -- the import is not visible in this block) and applies the
    user's selection to a copy of ``df``.

    Args:
        df: DataFrame with 'Publication Year', 'Authors' and 'Source Title'
            columns. It is never modified.
        sort_column: Column chosen by the user. 'Publication Year' shows a
            from/to year range, 'Authors' and 'Source Title' show
            multi-selects; 'Article Title' and any other value apply no
            filter.

    Returns:
        A filtered copy of ``df``. An empty multi-select leaves all rows in.
    """
    filtered_df = df.copy()

    if sort_column == 'Publication Year':
        # Year range inputs bounded by the data's own min/max.
        year_min = int(df['Publication Year'].min())
        year_max = int(df['Publication Year'].max())
        col1, col2 = st.columns(2)
        with col1:
            start_year = st.number_input('From Year',
                                         min_value=year_min,
                                         max_value=year_max,
                                         value=year_min)
        with col2:
            end_year = st.number_input('To Year',
                                       min_value=year_min,
                                       max_value=year_max,
                                       value=year_max)
        filtered_df = filtered_df[
            (filtered_df['Publication Year'] >= start_year) &
            (filtered_df['Publication Year'] <= end_year)
        ]

    elif sort_column == 'Authors':
        # Multi-select over individual author names (cells are ';'-separated).
        unique_authors = sorted({
            name
            for cell in df['Authors']
            for name in _split_authors(cell)
        })
        selected_authors = st.multiselect(
            'Select Authors',
            unique_authors
        )
        if selected_authors:
            wanted = set(selected_authors)
            # Compare whole names, not substrings: selecting "Li, J" must not
            # also pull in rows authored by "Li, Jing".
            mask = filtered_df['Authors'].apply(
                lambda cell: not wanted.isdisjoint(_split_authors(cell))
            ).astype(bool)
            filtered_df = filtered_df[mask]

    elif sort_column == 'Source Title':
        # Multi-select for source titles. Missing titles are dropped first:
        # sorting a mix of str and NaN raises TypeError.
        unique_sources = sorted(df['Source Title'].dropna().unique())
        selected_sources = st.multiselect(
            'Select Sources',
            unique_sources
        )
        if selected_sources:
            filtered_df = filtered_df[filtered_df['Source Title'].isin(selected_sources)]

    # 'Article Title' (and any unrecognised column): sorting only, no filter.
    return filtered_df
317
 
318
  def main():
319
  st.title("🔬 Biomedical Papers Analysis")
 
362
  if st.session_state.summaries is None:
363
  try:
364
  with st.spinner("Generating individual paper summaries..."):
 
365
  model, tokenizer = load_model("summarize")
 
 
366
  summaries = []
367
  progress_bar = st.progress(0)
368
 
369
  for idx, abstract in enumerate(df['Abstract']):
 
 
 
370
  summary = improve_summary_generation(abstract, model, tokenizer)
371
  summaries.append(summary)
372
  progress_bar.progress((idx + 1) / len(df))
373
 
 
374
  st.session_state.summaries = summaries
 
 
375
  cleanup_model(model, tokenizer)
376
  progress_bar.empty()
377
 
378
  except Exception as e:
379
  st.error(f"Error generating summaries: {str(e)}")
380
+ st.session_state.processing_started = False
381
 
382
+ # Display summaries with improved sorting and filtering
383
  if st.session_state.summaries is not None:
384
  col1, col2 = st.columns(2)
385
  with col1:
386
  sort_options = ['Article Title', 'Authors', 'Publication Year', 'Source Title']
387
+ sort_column = st.selectbox("Sort/Filter by:", sort_options)
388
  with col2:
389
+ # Only show A-Z/Z-A option for Article Title
390
+ if sort_column == 'Article Title':
391
+ ascending = st.radio(
392
+ "Sort order",
393
+ ["A to Z", "Z to A"],
394
+ horizontal=True
395
+ ) == "A to Z"
396
+ else:
397
+ ascending = True # Default for other columns
398
 
399
+ # Create display dataframe
400
  display_df = df.copy()
401
  display_df['Summary'] = st.session_state.summaries
402
  display_df['Publication Year'] = display_df['Publication Year'].astype(int)
403
+
404
+ # Apply filters
405
+ filtered_df = create_filter_controls(display_df, sort_column)
406
+
407
+ if sort_column == 'Article Title':
408
+ # Sort alphabetically
409
+ sorted_df = filtered_df.sort_values(by=sort_column, ascending=ascending)
410
+ else:
411
+ # Keep original order for other columns after filtering
412
+ # Keep original order for other columns after filtering
413
+ sorted_df = filtered_df
414
+
415
+ # Show number of filtered results
416
+ if len(sorted_df) != len(display_df):
417
+ st.write(f"Showing {len(sorted_df)} of {len(display_df)} papers")
418
 
419
  # Apply custom styling
420
  st.markdown("""
 
447
  </style>
448
  """, unsafe_allow_html=True)
449
 
450
+ # Display papers using the filtered and sorted dataframe
451
  for _, row in sorted_df.iterrows():
452
  paper_info_cols = st.columns([1, 1])
453
 
 
473
  </div>
474
  """, unsafe_allow_html=True)
475
 
476
+ # Add spacing between papers
477
  st.markdown("<div style='margin-bottom: 20px;'></div>", unsafe_allow_html=True)
478
 
479
  # Question-focused Summary Section (only if question provided)