hiyata committed (verified)
Commit 88b80ae · Parent(s): e502db5

Update app.py

Files changed (1): app.py (+129 -106)
app.py CHANGED
@@ -319,6 +319,11 @@ def analyze_subregion(state, header, region_start, region_end):
 # 9. COMPARISON ANALYSIS FUNCTIONS
 ###############################################################################
 
+def get_zero_centered_cmap():
+    """Create a zero-centered blue-white-red colormap"""
+    colors = [(0.0, 'blue'), (0.5, 'white'), (1.0, 'red')]
+    return mcolors.LinearSegmentedColormap.from_list("blue_white_red", colors)
+
 def compute_shap_difference(shap1_norm, shap2_norm):
     """Compute the SHAP difference between normalized sequences"""
    return shap2_norm - shap1_norm
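Note on the new colormap: a LinearSegmentedColormap by itself places white at the midpoint of whatever data range gets plotted, so white only lines up with a SHAP difference of zero when that range happens to be symmetric. A minimal sketch of pinning white at zero with TwoSlopeNorm, assuming the heatmap is drawn with imshow (the plotting call sits outside this hunk, and the array here is made up):

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

cmap = mcolors.LinearSegmentedColormap.from_list(
    "blue_white_red", [(0.0, 'blue'), (0.5, 'white'), (1.0, 'red')])
shap_diff = np.array([[-0.2, 0.0, 0.6]])  # hypothetical, asymmetric range
# TwoSlopeNorm anchors white at 0.0 even though |min| != |max|
norm = mcolors.TwoSlopeNorm(vcenter=0.0, vmin=shap_diff.min(), vmax=shap_diff.max())
plt.imshow(shap_diff, cmap=cmap, norm=norm, aspect="auto")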
@@ -351,26 +356,39 @@ def plot_comparative_heatmap(shap_diff, title="SHAP Difference Heatmap"):
     plt.subplots_adjust(bottom=0.25, left=0.05, right=0.95)
 
     return fig
-
+
+def plot_shap_histogram(shap_array, title="SHAP Distribution", num_bins=30):
+    """
+    Plot histogram of SHAP values with configurable number of bins
+    """
+    fig, ax = plt.subplots(figsize=(6, 4))
+    ax.hist(shap_array, bins=num_bins, color='gray', edgecolor='black', alpha=0.7)
+    ax.axvline(0, color='red', linestyle='--', label='0.0')
+    ax.set_xlabel("SHAP Value")
+    ax.set_ylabel("Count")
+    ax.set_title(title)
+    ax.legend()
+    plt.tight_layout()
+    return fig
+
 def calculate_adaptive_parameters(len1, len2):
     """
     Calculate adaptive parameters based on sequence lengths and their difference.
-
-    Returns:
-        tuple: (num_points, smooth_window, resolution_factor)
+    Returns: (num_points, smooth_window, resolution_factor)
     """
     length_diff = abs(len1 - len2)
     max_length = max(len1, len2)
-    length_ratio = min(len1, len2) / max_length
+    min_length = min(len1, len2)
+    length_ratio = min_length / max_length
 
     # Base number of points scales with sequence length
     base_points = min(2000, max(500, max_length // 100))
 
-    # Adjust resolution based on length difference
+    # Adjust parameters based on sequence properties
     if length_diff < 500:
-        resolution_factor = 2.0  # Higher resolution for very similar sequences
+        resolution_factor = 2.0
         num_points = min(3000, base_points * 2)
-        smooth_window = max(10, length_diff // 50)  # Minimal smoothing
+        smooth_window = max(10, length_diff // 50)
     elif length_diff < 5000:
         resolution_factor = 1.5
        num_points = min(2000, base_points * 1.5)
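Worth flagging in the elif branch above: base_points * 1.5 produces a float, min() preserves it, and np.linspace (which later receives num_points in normalize_shap_lengths) requires an integer count on recent NumPy, so this branch can raise a TypeError downstream. A one-line sketch of the safer cast:

num_points = int(min(2000, base_points * 1.5))  # avoids passing a float count to np.linspace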
@@ -380,7 +398,6 @@ def calculate_adaptive_parameters(len1, len2):
         num_points = base_points
         smooth_window = max(50, length_diff // 200)
     else:
-        # For very large differences, reduce resolution but increase smoothing
         resolution_factor = 0.75
         num_points = max(500, base_points // 2)
        smooth_window = max(100, length_diff // 500)
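A worked example of the tiers, assuming the function returns (num_points, smooth_window, resolution_factor) as the docstring states (the lines setting smooth_window for the middle tiers fall between the hunks shown here):

# Hypothetical inputs: len1 = 30_000, len2 = 32_000
# length_diff = 2_000, so the "< 5000" tier applies
# base_points = min(2000, max(500, 32_000 // 100)) = 500
# num_points  = min(2000, 500 * 1.5) = 750.0  (a float; see the note above)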
@@ -392,20 +409,16 @@ def sliding_window_smooth(values, window_size=50):
 
 def sliding_window_smooth(values, window_size=50):
     """
-    Apply sliding window smoothing with edge handling.
-    Uses exponential decay at edges to reduce boundary effects.
+    Apply sliding window smoothing with edge handling
     """
     if window_size < 3:
         return values
-
-    window = np.ones(window_size)
 
-    # Create exponential decay at edges
+    # Create window with exponential decay at edges
+    window = np.ones(window_size)
     decay = np.exp(-np.linspace(0, 3, window_size // 2))
     window[:window_size // 2] = decay
     window[-(window_size // 2):] = decay[::-1]
-
-    # Normalize window
     window = window / window.sum()
 
    # Apply convolution
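A usage sketch for the smoother, assuming the convolution cut off below this hunk uses mode='valid' (which is what the pad_left/pad_right bookkeeping in the next hunk implies); the input signal here is made up:

import numpy as np

rng = np.random.default_rng(0)
noisy = np.sin(np.linspace(0, 6, 500)) + rng.normal(0, 0.3, 500)
smooth = sliding_window_smooth(noisy, window_size=50)
assert smooth.shape == noisy.shape  # edges keep the raw values, interior is smoothed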
@@ -416,17 +429,16 @@ def sliding_window_smooth(values, window_size=50):
     pad_left = pad_size // 2
     pad_right = pad_size - pad_left
 
-    # Use actual values at edges instead of padding
     result = np.zeros_like(values)
     result[pad_left:-pad_right] = smoothed
-    result[:pad_left] = values[:pad_left]  # Keep original values at start
-    result[-pad_right:] = values[-pad_right:]  # Keep original values at end
+    result[:pad_left] = values[:pad_left]
+    result[-pad_right:] = values[-pad_right:]
 
     return result
 
-def normalize_shap_lengths(shap1, shap2, num_points=1000, smooth_window=50):
+def normalize_shap_lengths(shap1, shap2):
     """
-    Normalize and smooth SHAP values with dynamic adaptation.
+    Normalize and smooth SHAP values with dynamic adaptation
     """
     # Calculate adaptive parameters
    num_points, smooth_window, _ = calculate_adaptive_parameters(len(shap1), len(shap2))
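Since the num_points=1000 and smooth_window=50 keyword parameters were dropped from the signature, a leftover call like the hypothetical one below would now raise a TypeError; both values are derived internally via calculate_adaptive_parameters instead:

normalize_shap_lengths(shap1, shap2, num_points=500)  # TypeError under the new signature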
@@ -435,12 +447,11 @@ def normalize_shap_lengths(shap1, shap2, num_points=1000, smooth_window=50):
     shap1_smooth = sliding_window_smooth(shap1, smooth_window)
     shap2_smooth = sliding_window_smooth(shap2, smooth_window)
 
-    # Create relative positions
+    # Create relative positions and interpolate
     x1 = np.linspace(0, 1, len(shap1_smooth))
     x2 = np.linspace(0, 1, len(shap2_smooth))
     x_norm = np.linspace(0, 1, num_points)
 
-    # Interpolate smoothed values
     shap1_interp = np.interp(x_norm, x1, shap1_smooth)
     shap2_interp = np.interp(x_norm, x2, shap2_smooth)
 
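The resampling idea in miniature, a sketch with made-up array sizes: two SHAP tracks of different lengths are mapped onto a shared relative [0, 1] axis so they can be subtracted point for point:

import numpy as np

a = np.random.rand(900)   # stand-in for sequence 1 SHAP means
b = np.random.rand(1400)  # stand-in for sequence 2 SHAP means
x_norm = np.linspace(0, 1, 1000)
a_i = np.interp(x_norm, np.linspace(0, 1, a.size), a)
b_i = np.interp(x_norm, np.linspace(0, 1, b.size), b)
diff = b_i - a_i          # same orientation as compute_shap_difference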
@@ -448,91 +459,103 @@
 
 def analyze_sequence_comparison(file1, file2, fasta1="", fasta2=""):
     """
-    Fully dynamic sequence comparison with adaptive parameters.
+    Compare two sequences with adaptive parameters and visualization
     """
-    # Analyze sequences
-    res1 = analyze_sequence(file1, top_kmers=10, fasta_text=fasta1, window_size=500)
-    if isinstance(res1[0], str) and "Error" in res1[0]:
-        return (f"Error in sequence 1: {res1[0]}", None, None)
-
-    res2 = analyze_sequence(file2, top_kmers=10, fasta_text=fasta2, window_size=500)
-    if isinstance(res2[0], str) and "Error" in res2[0]:
-        return (f"Error in sequence 2: {res2[0]}", None, None)
-
-    shap1 = res1[3]["shap_means"]
-    shap2 = res2[3]["shap_means"]
-
-    # Get sequence properties
-    len1, len2 = len(shap1), len(shap2)
-    length_diff = abs(len1 - len2)
-    length_ratio = min(len1, len2) / max(len1, len2)
-
-    # Get normalized values with adaptive parameters
-    shap1_norm, shap2_norm, smooth_window = normalize_shap_lengths(shap1, shap2)
-    shap_diff = shap2_norm - shap1_norm
-
-    # Calculate adaptive threshold
-    base_threshold = 0.05
-    adaptive_threshold = base_threshold * (1 + (1 - length_ratio))
-    if length_diff > 50000:
-        adaptive_threshold *= 1.5  # More forgiving for very large differences
-
-    # Calculate statistics
-    avg_diff = np.mean(shap_diff)
-    std_diff = np.std(shap_diff)
-    max_diff = np.max(shap_diff)
-    min_diff = np.min(shap_diff)
-    substantial_diffs = np.abs(shap_diff) > adaptive_threshold
-    frac_different = np.mean(substantial_diffs)
-
-    # Get the classification info without string splitting
     try:
-        classification1 = res1[0].split('Classification: ')[1].split('\n')[0].strip()
-        classification2 = res2[0].split('Classification: ')[1].split('\n')[0].strip()
-    except:
-        classification1 = "Unknown"
-        classification2 = "Unknown"
-
-    # Format detailed output with line breaks for readability
-    comparison_text = (
-        "Sequence Comparison Results:\n"
-        f"Sequence 1: {res1[4]}\n"
-        f"Length: {len1:,} bases\n"
-        f"Classification: {classification1}\n\n"
-        f"Sequence 2: {res2[4]}\n"
-        f"Length: {len2:,} bases\n"
-        f"Classification: {classification2}\n\n"
-        "Comparison Parameters:\n"
-        f"Length Difference: {length_diff:,} bases\n"
-        f"Length Ratio: {length_ratio:.3f}\n"
-        f"Smoothing Window: {smooth_window} points\n"
-        f"Adaptive Threshold: {adaptive_threshold:.3f}\n\n"
-        "Statistics:\n"
-        f"Average SHAP difference: {avg_diff:.4f}\n"
-        f"Standard deviation: {std_diff:.4f}\n"
-        f"Max difference: {max_diff:.4f} (Seq2 more human-like)\n"
-        f"Min difference: {min_diff:.4f} (Seq1 more human-like)\n"
-        f"Fraction with substantial differences: {frac_different:.2%}\n\n"
-        "Note: All parameters automatically adjusted based on sequence properties\n\n"
-        "Interpretation:\n"
-        "- Red regions: Sequence 2 more human-like\n"
-        "- Blue regions: Sequence 1 more human-like\n"
-        "- White regions: Similar between sequences"
-    )
-
-    # Generate visualizations
-    heatmap_fig = plot_comparative_heatmap(
-        shap_diff,
-        title=f"SHAP Difference Heatmap (window: {smooth_window})"
-    )
-    heatmap_img = fig_to_image(heatmap_fig)
-
-    # Adaptive number of bins based on data
-    num_bins = max(20, min(50, int(np.sqrt(len(shap_diff)))))
-    hist_fig = plot_shap_histogram(shap_diff, num_bins=num_bins)
-    hist_img = fig_to_image(hist_fig)
-
-    return comparison_text, heatmap_img, hist_img
+        # Analyze first sequence
+        res1 = analyze_sequence(file1, top_kmers=10, fasta_text=fasta1, window_size=500)
+        if isinstance(res1[0], str) and "Error" in res1[0]:
+            return (f"Error in sequence 1: {res1[0]}", None, None)
+
+        # Analyze second sequence
+        res2 = analyze_sequence(file2, top_kmers=10, fasta_text=fasta2, window_size=500)
+        if isinstance(res2[0], str) and "Error" in res2[0]:
+            return (f"Error in sequence 2: {res2[0]}", None, None)
+
+        # Extract SHAP values and sequence info
+        shap1 = res1[3]["shap_means"]
+        shap2 = res2[3]["shap_means"]
+
+        # Calculate sequence properties
+        len1, len2 = len(shap1), len(shap2)
+        length_diff = abs(len1 - len2)
+        length_ratio = min(len1, len2) / max(len1, len2)
+
+        # Normalize and compare sequences
+        shap1_norm, shap2_norm, smooth_window = normalize_shap_lengths(shap1, shap2)
+        shap_diff = compute_shap_difference(shap1_norm, shap2_norm)
+
+        # Calculate adaptive threshold and statistics
+        base_threshold = 0.05
+        adaptive_threshold = base_threshold * (1 + (1 - length_ratio))
+        if length_diff > 50000:
+            adaptive_threshold *= 1.5
+
+        # Calculate comparison statistics
+        avg_diff = np.mean(shap_diff)
+        std_diff = np.std(shap_diff)
+        max_diff = np.max(shap_diff)
+        min_diff = np.min(shap_diff)
+        substantial_diffs = np.abs(shap_diff) > adaptive_threshold
+        frac_different = np.mean(substantial_diffs)
+
+        # Extract classifications
+        try:
+            classification1 = res1[0].split('Classification: ')[1].split('\n')[0].strip()
+            classification2 = res2[0].split('Classification: ')[1].split('\n')[0].strip()
+        except:
+            classification1 = "Unknown"
+            classification2 = "Unknown"
+
+        # Format output text
+        comparison_text = (
+            "Sequence Comparison Results:\n"
+            f"Sequence 1: {res1[4]}\n"
+            f"Length: {len1:,} bases\n"
+            f"Classification: {classification1}\n\n"
+            f"Sequence 2: {res2[4]}\n"
+            f"Length: {len2:,} bases\n"
+            f"Classification: {classification2}\n\n"
+            "Comparison Parameters:\n"
+            f"Length Difference: {length_diff:,} bases\n"
+            f"Length Ratio: {length_ratio:.3f}\n"
+            f"Smoothing Window: {smooth_window} points\n"
+            f"Adaptive Threshold: {adaptive_threshold:.3f}\n\n"
+            "Statistics:\n"
+            f"Average SHAP difference: {avg_diff:.4f}\n"
+            f"Standard deviation: {std_diff:.4f}\n"
+            f"Max difference: {max_diff:.4f} (Seq2 more human-like)\n"
+            f"Min difference: {min_diff:.4f} (Seq1 more human-like)\n"
+            f"Fraction with substantial differences: {frac_different:.2%}\n\n"
+            "Note: All parameters automatically adjusted based on sequence properties\n\n"
+            "Interpretation:\n"
+            "- Red regions: Sequence 2 more human-like\n"
+            "- Blue regions: Sequence 1 more human-like\n"
+            "- White regions: Similar between sequences"
+        )
+
+        # Generate visualizations
+        heatmap_fig = plot_comparative_heatmap(
+            shap_diff,
+            title=f"SHAP Difference Heatmap (window: {smooth_window})"
+        )
+        heatmap_img = fig_to_image(heatmap_fig)
+
+        # Create histogram with adaptive bins
+        num_bins = max(20, min(50, int(np.sqrt(len(shap_diff)))))
+        hist_fig = plot_shap_histogram(
+            shap_diff,
+            title="Distribution of SHAP Differences",
+            num_bins=num_bins
+        )
+        hist_img = fig_to_image(hist_fig)
+
+        return comparison_text, heatmap_img, hist_img
+
+    except Exception as e:
+        error_msg = f"Error during sequence comparison: {str(e)}"
+        return error_msg, None, None
+
 ###############################################################################
 # 10. BUILD GRADIO INTERFACE
 ###############################################################################
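One note on the rewritten function: the inner bare except: also catches KeyboardInterrupt and SystemExit. A sketch of a narrower clause that keeps the same "Unknown" fallback, limited to the ways the string split can actually fail:

try:
    classification1 = res1[0].split('Classification: ')[1].split('\n')[0].strip()
except (IndexError, AttributeError):
    classification1 = "Unknown"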