hiyata committed (verified)
Commit 88b80ae · Parent(s): e502db5

Update app.py

Files changed (1): app.py (+129 -106)
app.py CHANGED
@@ -319,6 +319,11 @@ def analyze_subregion(state, header, region_start, region_end):
 # 9. COMPARISON ANALYSIS FUNCTIONS
 ###############################################################################
 
+def get_zero_centered_cmap():
+    """Create a zero-centered blue-white-red colormap"""
+    colors = [(0.0, 'blue'), (0.5, 'white'), (1.0, 'red')]
+    return mcolors.LinearSegmentedColormap.from_list("blue_white_red", colors)
+
 def compute_shap_difference(shap1_norm, shap2_norm):
     """Compute the SHAP difference between normalized sequences"""
    return shap2_norm - shap1_norm
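Note on the new colormap: a LinearSegmentedColormap by itself places white at the midpoint of whatever data range gets plotted, so white only lines up with a SHAP difference of zero when that range happens to be symmetric. A minimal sketch of pinning white at zero with TwoSlopeNorm, assuming the heatmap is drawn with imshow (the plotting call sits outside this hunk, and the array here is made up):

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

cmap = mcolors.LinearSegmentedColormap.from_list(
    "blue_white_red", [(0.0, 'blue'), (0.5, 'white'), (1.0, 'red')])
shap_diff = np.array([[-0.2, 0.0, 0.6]])  # hypothetical, asymmetric range
# TwoSlopeNorm anchors white at 0.0 even though |min| != |max|
norm = mcolors.TwoSlopeNorm(vcenter=0.0, vmin=shap_diff.min(), vmax=shap_diff.max())
plt.imshow(shap_diff, cmap=cmap, norm=norm, aspect="auto")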
@@ -351,26 +356,39 @@ def plot_comparative_heatmap(shap_diff, title="SHAP Difference Heatmap"):
     plt.subplots_adjust(bottom=0.25, left=0.05, right=0.95)
 
     return fig
-
+
+def plot_shap_histogram(shap_array, title="SHAP Distribution", num_bins=30):
+    """
+    Plot histogram of SHAP values with configurable number of bins
+    """
+    fig, ax = plt.subplots(figsize=(6, 4))
+    ax.hist(shap_array, bins=num_bins, color='gray', edgecolor='black', alpha=0.7)
+    ax.axvline(0, color='red', linestyle='--', label='0.0')
+    ax.set_xlabel("SHAP Value")
+    ax.set_ylabel("Count")
+    ax.set_title(title)
+    ax.legend()
+    plt.tight_layout()
+    return fig
+
 def calculate_adaptive_parameters(len1, len2):
     """
     Calculate adaptive parameters based on sequence lengths and their difference.
-
-    Returns:
-        tuple: (num_points, smooth_window, resolution_factor)
+    Returns: (num_points, smooth_window, resolution_factor)
     """
     length_diff = abs(len1 - len2)
     max_length = max(len1, len2)
-    length_ratio = min(len1, len2) / max_length
+    min_length = min(len1, len2)
+    length_ratio = min_length / max_length
 
     # Base number of points scales with sequence length
     base_points = min(2000, max(500, max_length // 100))
 
-    # Adjust resolution based on length difference
+    # Adjust parameters based on sequence properties
     if length_diff < 500:
-        resolution_factor = 2.0  # Higher resolution for very similar sequences
+        resolution_factor = 2.0
         num_points = min(3000, base_points * 2)
-        smooth_window = max(10, length_diff // 50)  # Minimal smoothing
+        smooth_window = max(10, length_diff // 50)
     elif length_diff < 5000:
         resolution_factor = 1.5
        num_points = min(2000, base_points * 1.5)
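Worth flagging in the elif branch above: base_points * 1.5 produces a float, min() preserves it, and np.linspace (which later receives num_points in normalize_shap_lengths) requires an integer count on recent NumPy, so this branch can raise a TypeError downstream. A one-line sketch of the safer cast:

num_points = int(min(2000, base_points * 1.5))  # avoids passing a float count to np.linspace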
@@ -380,7 +398,6 @@ def calculate_adaptive_parameters(len1, len2):
         num_points = base_points
         smooth_window = max(50, length_diff // 200)
     else:
-        # For very large differences, reduce resolution but increase smoothing
         resolution_factor = 0.75
         num_points = max(500, base_points // 2)
        smooth_window = max(100, length_diff // 500)
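A worked example of the tiers, assuming the function returns (num_points, smooth_window, resolution_factor) as the docstring states (the lines setting smooth_window for the middle tiers fall between the hunks shown here):

# Hypothetical inputs: len1 = 30_000, len2 = 32_000
# length_diff = 2_000, so the "< 5000" tier applies
# base_points = min(2000, max(500, 32_000 // 100)) = 500
# num_points  = min(2000, 500 * 1.5) = 750.0  (a float; see the note above)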
@@ -392,20 +409,16 @@ def sliding_window_smooth(values, window_size=50):
 
 def sliding_window_smooth(values, window_size=50):
     """
-    Apply sliding window smoothing with edge handling.
-    Uses exponential decay at edges to reduce boundary effects.
+    Apply sliding window smoothing with edge handling
     """
     if window_size < 3:
         return values
-
-    window = np.ones(window_size)
 
-    # Create exponential decay at edges
+    # Create window with exponential decay at edges
+    window = np.ones(window_size)
     decay = np.exp(-np.linspace(0, 3, window_size // 2))
     window[:window_size // 2] = decay
     window[-(window_size // 2):] = decay[::-1]
-
-    # Normalize window
     window = window / window.sum()
 
    # Apply convolution
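A usage sketch for the smoother, assuming the convolution cut off below this hunk uses mode='valid' (which is what the pad_left/pad_right bookkeeping in the next hunk implies); the input signal here is made up:

import numpy as np

rng = np.random.default_rng(0)
noisy = np.sin(np.linspace(0, 6, 500)) + rng.normal(0, 0.3, 500)
smooth = sliding_window_smooth(noisy, window_size=50)
assert smooth.shape == noisy.shape  # edges keep the raw values, interior is smoothed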
@@ -416,17 +429,16 @@ def sliding_window_smooth(values, window_size=50):
     pad_left = pad_size // 2
     pad_right = pad_size - pad_left
 
-    # Use actual values at edges instead of padding
     result = np.zeros_like(values)
     result[pad_left:-pad_right] = smoothed
-    result[:pad_left] = values[:pad_left]  # Keep original values at start
-    result[-pad_right:] = values[-pad_right:]  # Keep original values at end
+    result[:pad_left] = values[:pad_left]
+    result[-pad_right:] = values[-pad_right:]
 
     return result
 
-def normalize_shap_lengths(shap1, shap2, num_points=1000, smooth_window=50):
+def normalize_shap_lengths(shap1, shap2):
     """
-    Normalize and smooth SHAP values with dynamic adaptation.
+    Normalize and smooth SHAP values with dynamic adaptation
     """
     # Calculate adaptive parameters
    num_points, smooth_window, _ = calculate_adaptive_parameters(len(shap1), len(shap2))
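Since the num_points=1000 and smooth_window=50 keyword parameters were dropped from the signature, a leftover call like the hypothetical one below would now raise a TypeError; both values are derived internally via calculate_adaptive_parameters instead:

normalize_shap_lengths(shap1, shap2, num_points=500)  # TypeError under the new signature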
@@ -435,12 +447,11 @@ def normalize_shap_lengths(shap1, shap2, num_points=1000, smooth_window=50):
     shap1_smooth = sliding_window_smooth(shap1, smooth_window)
     shap2_smooth = sliding_window_smooth(shap2, smooth_window)
 
-    # Create relative positions
+    # Create relative positions and interpolate
     x1 = np.linspace(0, 1, len(shap1_smooth))
     x2 = np.linspace(0, 1, len(shap2_smooth))
     x_norm = np.linspace(0, 1, num_points)
 
-    # Interpolate smoothed values
     shap1_interp = np.interp(x_norm, x1, shap1_smooth)
     shap2_interp = np.interp(x_norm, x2, shap2_smooth)
 
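The resampling idea in miniature, a sketch with made-up array sizes: two SHAP tracks of different lengths are mapped onto a shared relative [0, 1] axis so they can be subtracted point for point:

import numpy as np

a = np.random.rand(900)   # stand-in for sequence 1 SHAP means
b = np.random.rand(1400)  # stand-in for sequence 2 SHAP means
x_norm = np.linspace(0, 1, 1000)
a_i = np.interp(x_norm, np.linspace(0, 1, a.size), a)
b_i = np.interp(x_norm, np.linspace(0, 1, b.size), b)
diff = b_i - a_i          # same orientation as compute_shap_difference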
@@ -448,91 +459,103 @@
 
 def analyze_sequence_comparison(file1, file2, fasta1="", fasta2=""):
     """
-    Fully dynamic sequence comparison with adaptive parameters.
+    Compare two sequences with adaptive parameters and visualization
     """
-    # Analyze sequences
-    res1 = analyze_sequence(file1, top_kmers=10, fasta_text=fasta1, window_size=500)
-    if isinstance(res1[0], str) and "Error" in res1[0]:
-        return (f"Error in sequence 1: {res1[0]}", None, None)
-
-    res2 = analyze_sequence(file2, top_kmers=10, fasta_text=fasta2, window_size=500)
-    if isinstance(res2[0], str) and "Error" in res2[0]:
-        return (f"Error in sequence 2: {res2[0]}", None, None)
-
-    shap1 = res1[3]["shap_means"]
-    shap2 = res2[3]["shap_means"]
-
-    # Get sequence properties
-    len1, len2 = len(shap1), len(shap2)
-    length_diff = abs(len1 - len2)
-    length_ratio = min(len1, len2) / max(len1, len2)
-
-    # Get normalized values with adaptive parameters
-    shap1_norm, shap2_norm, smooth_window = normalize_shap_lengths(shap1, shap2)
-    shap_diff = shap2_norm - shap1_norm
-
-    # Calculate adaptive threshold
-    base_threshold = 0.05
-    adaptive_threshold = base_threshold * (1 + (1 - length_ratio))
-    if length_diff > 50000:
-        adaptive_threshold *= 1.5  # More forgiving for very large differences
-
-    # Calculate statistics
-    avg_diff = np.mean(shap_diff)
-    std_diff = np.std(shap_diff)
-    max_diff = np.max(shap_diff)
-    min_diff = np.min(shap_diff)
-    substantial_diffs = np.abs(shap_diff) > adaptive_threshold
-    frac_different = np.mean(substantial_diffs)
-
-    # Get the classification info without string splitting
     try:
-        classification1 = res1[0].split('Classification: ')[1].split('\n')[0].strip()
-        classification2 = res2[0].split('Classification: ')[1].split('\n')[0].strip()
-    except:
-        classification1 = "Unknown"
-        classification2 = "Unknown"
-
-    # Format detailed output with line breaks for readability
-    comparison_text = (
-        "Sequence Comparison Results:\n"
-        f"Sequence 1: {res1[4]}\n"
-        f"Length: {len1:,} bases\n"
-        f"Classification: {classification1}\n\n"
-        f"Sequence 2: {res2[4]}\n"
-        f"Length: {len2:,} bases\n"
-        f"Classification: {classification2}\n\n"
-        "Comparison Parameters:\n"
-        f"Length Difference: {length_diff:,} bases\n"
-        f"Length Ratio: {length_ratio:.3f}\n"
-        f"Smoothing Window: {smooth_window} points\n"
-        f"Adaptive Threshold: {adaptive_threshold:.3f}\n\n"
-        "Statistics:\n"
-        f"Average SHAP difference: {avg_diff:.4f}\n"
-        f"Standard deviation: {std_diff:.4f}\n"
-        f"Max difference: {max_diff:.4f} (Seq2 more human-like)\n"
-        f"Min difference: {min_diff:.4f} (Seq1 more human-like)\n"
-        f"Fraction with substantial differences: {frac_different:.2%}\n\n"
-        "Note: All parameters automatically adjusted based on sequence properties\n\n"
-        "Interpretation:\n"
-        "- Red regions: Sequence 2 more human-like\n"
-        "- Blue regions: Sequence 1 more human-like\n"
-        "- White regions: Similar between sequences"
-    )
-
-    # Generate visualizations
-    heatmap_fig = plot_comparative_heatmap(
-        shap_diff,
-        title=f"SHAP Difference Heatmap (window: {smooth_window})"
-    )
-    heatmap_img = fig_to_image(heatmap_fig)
-
-    # Adaptive number of bins based on data
-    num_bins = max(20, min(50, int(np.sqrt(len(shap_diff)))))
-    hist_fig = plot_shap_histogram(shap_diff, num_bins=num_bins)
-    hist_img = fig_to_image(hist_fig)
-
-    return comparison_text, heatmap_img, hist_img
+        # Analyze first sequence
+        res1 = analyze_sequence(file1, top_kmers=10, fasta_text=fasta1, window_size=500)
+        if isinstance(res1[0], str) and "Error" in res1[0]:
+            return (f"Error in sequence 1: {res1[0]}", None, None)
+
+        # Analyze second sequence
+        res2 = analyze_sequence(file2, top_kmers=10, fasta_text=fasta2, window_size=500)
+        if isinstance(res2[0], str) and "Error" in res2[0]:
+            return (f"Error in sequence 2: {res2[0]}", None, None)
+
+        # Extract SHAP values and sequence info
+        shap1 = res1[3]["shap_means"]
+        shap2 = res2[3]["shap_means"]
+
+        # Calculate sequence properties
+        len1, len2 = len(shap1), len(shap2)
+        length_diff = abs(len1 - len2)
+        length_ratio = min(len1, len2) / max(len1, len2)
+
+        # Normalize and compare sequences
+        shap1_norm, shap2_norm, smooth_window = normalize_shap_lengths(shap1, shap2)
+        shap_diff = compute_shap_difference(shap1_norm, shap2_norm)
+
+        # Calculate adaptive threshold and statistics
+        base_threshold = 0.05
+        adaptive_threshold = base_threshold * (1 + (1 - length_ratio))
+        if length_diff > 50000:
+            adaptive_threshold *= 1.5
+
+        # Calculate comparison statistics
+        avg_diff = np.mean(shap_diff)
+        std_diff = np.std(shap_diff)
+        max_diff = np.max(shap_diff)
+        min_diff = np.min(shap_diff)
+        substantial_diffs = np.abs(shap_diff) > adaptive_threshold
+        frac_different = np.mean(substantial_diffs)
+
+        # Extract classifications
+        try:
+            classification1 = res1[0].split('Classification: ')[1].split('\n')[0].strip()
+            classification2 = res2[0].split('Classification: ')[1].split('\n')[0].strip()
+        except:
+            classification1 = "Unknown"
+            classification2 = "Unknown"
+
+        # Format output text
+        comparison_text = (
+            "Sequence Comparison Results:\n"
+            f"Sequence 1: {res1[4]}\n"
+            f"Length: {len1:,} bases\n"
+            f"Classification: {classification1}\n\n"
+            f"Sequence 2: {res2[4]}\n"
+            f"Length: {len2:,} bases\n"
+            f"Classification: {classification2}\n\n"
+            "Comparison Parameters:\n"
+            f"Length Difference: {length_diff:,} bases\n"
+            f"Length Ratio: {length_ratio:.3f}\n"
+            f"Smoothing Window: {smooth_window} points\n"
+            f"Adaptive Threshold: {adaptive_threshold:.3f}\n\n"
+            "Statistics:\n"
+            f"Average SHAP difference: {avg_diff:.4f}\n"
+            f"Standard deviation: {std_diff:.4f}\n"
+            f"Max difference: {max_diff:.4f} (Seq2 more human-like)\n"
+            f"Min difference: {min_diff:.4f} (Seq1 more human-like)\n"
+            f"Fraction with substantial differences: {frac_different:.2%}\n\n"
+            "Note: All parameters automatically adjusted based on sequence properties\n\n"
+            "Interpretation:\n"
+            "- Red regions: Sequence 2 more human-like\n"
+            "- Blue regions: Sequence 1 more human-like\n"
+            "- White regions: Similar between sequences"
+        )
+
+        # Generate visualizations
+        heatmap_fig = plot_comparative_heatmap(
+            shap_diff,
+            title=f"SHAP Difference Heatmap (window: {smooth_window})"
+        )
+        heatmap_img = fig_to_image(heatmap_fig)
+
+        # Create histogram with adaptive bins
+        num_bins = max(20, min(50, int(np.sqrt(len(shap_diff)))))
+        hist_fig = plot_shap_histogram(
+            shap_diff,
+            title="Distribution of SHAP Differences",
+            num_bins=num_bins
+        )
+        hist_img = fig_to_image(hist_fig)
+
+        return comparison_text, heatmap_img, hist_img
+
+    except Exception as e:
+        error_msg = f"Error during sequence comparison: {str(e)}"
+        return error_msg, None, None
+
 ###############################################################################
 # 10. BUILD GRADIO INTERFACE
 ###############################################################################
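One note on the rewritten function: the inner bare except: also catches KeyboardInterrupt and SystemExit. A sketch of a narrower clause that keeps the same "Unknown" fallback, limited to the ways the string split can actually fail:

try:
    classification1 = res1[0].split('Classification: ')[1].split('\n')[0].strip()
except (IndexError, AttributeError):
    classification1 = "Unknown"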