Spaces:

hiyata
/

HostClassifier

Running

App Files Files Community

hiyata committed on Jan 12

Commit

455bf4d

verified ·

1 Parent(s): 0d6258f

Update app.py

Browse files

Files changed (1) hide show

app.py +64 -201

app.py CHANGED Viewed

@@ -1,6 +1,5 @@
 import gradio as gr
 import torch
-import joblib
 import numpy as np
 from itertools import product
 import torch.nn as nn
@@ -72,7 +71,7 @@ def sequence_to_kmer_vector(sequence: str, k: int = 4) -> np.ndarray:
     total_kmers = len(sequence) - k + 1
     if total_kmers > 0:
-        vec = vec / total_kmers
     return vec
@@ -87,12 +86,10 @@ def calculate_shap_values(model, x_tensor):
     """
     model.eval()
     with torch.no_grad():
-        # Baseline
         baseline_output = model(x_tensor)
         baseline_probs = torch.softmax(baseline_output, dim=1)
         baseline_prob = baseline_probs[0, 1].item()  # Probability of 'human'
-        # Zeroing each feature to measure impact
         shap_values = []
         x_zeroed = x_tensor.clone()
         for i in range(x_tensor.shape[1]):
@@ -100,10 +97,10 @@ def calculate_shap_values(model, x_tensor):
             x_zeroed[0, i] = 0.0
             output = model(x_zeroed)
             probs = torch.softmax(output, dim=1)
-            prob = probs[0, 1].item()  # Probability of 'human'
             impact = baseline_prob - prob
             shap_values.append(impact)
-            x_zeroed[0, i] = original_val  # restore
     return np.array(shap_values), baseline_prob
 ###############################################################################
@@ -111,10 +108,6 @@ def calculate_shap_values(model, x_tensor):
 ###############################################################################
 def compute_positionwise_scores(sequence, shap_values, k=4):
-    """
-    Returns an array of per-base SHAP contributions by averaging
-    the k-mer SHAP values of all k-mers covering that base.
-    """
     kmers = [''.join(p) for p in product("ACGT", repeat=k)]
     kmer_dict = {km: i for i, km in enumerate(kmers)}
@@ -139,20 +132,13 @@ def compute_positionwise_scores(sequence, shap_values, k=4):
 ###############################################################################
 def find_extreme_subregion(shap_means, window_size=500, mode="max"):
-    """
-    Finds the subregion of length `window_size` that has the maximum
-    (mode="max") or minimum (mode="min") average SHAP.
-    Returns (best_start, best_end, best_avg).
-    """
     n = len(shap_means)
     if n == 0:
         return (0, 0, 0.0)
     if window_size >= n:
-        # entire sequence
         avg_val = float(np.mean(shap_means))
         return (0, n, avg_val)
-    # We'll build csum of length n+1
     csum = np.zeros(n + 1, dtype=np.float32)
     csum[1:] = np.cumsum(shap_means)
@@ -179,7 +165,6 @@ def find_extreme_subregion(shap_means, window_size=500, mode="max"):
 ###############################################################################
 def fig_to_image(fig):
-    """Convert a Matplotlib figure to a PIL Image for Gradio."""
     buf = io.BytesIO()
     fig.savefig(buf, format='png', bbox_inches='tight', dpi=150)
     buf.seek(0)
@@ -188,27 +173,14 @@ def fig_to_image(fig):
     return img
 def get_zero_centered_cmap():
-    """
-    Creates a custom diverging colormap that is:
-    - Blue for negative
-    - White for zero
-    - Red for positive
-    """
     colors = [
-        (0.0, 'blue'),   # negative
-        (0.5, 'white'),  # zero
-        (1.0, 'red')     # positive
     ]
-    cmap = mcolors.LinearSegmentedColormap.from_list("blue_white_red", colors)
-    return cmap
 def plot_linear_heatmap(shap_means, title="Per-base SHAP Heatmap", start=None, end=None):
-    """
-    Plots a 1D heatmap of per-base SHAP contributions with a custom colormap:
-    - Negative = blue
-    - 0 = white
-    - Positive = red
-    """
     if start is not None and end is not None:
         local_shap = shap_means[start:end]
         subtitle = f" (positions {start}-{end})"
@@ -219,73 +191,46 @@ def plot_linear_heatmap(shap_means, title="Per-base SHAP Heatmap", start=None, e
     if len(local_shap) == 0:
         local_shap = np.array([0.0])
-    # Build 2D array for imshow
     heatmap_data = local_shap.reshape(1, -1)
-    # Force symmetrical range
     min_val = np.min(local_shap)
     max_val = np.max(local_shap)
     extent = max(abs(min_val), abs(max_val))
-    # Create custom colormap
-    custom_cmap = get_zero_centered_cmap()
-    # Create figure with adjusted height ratio
-    fig, ax = plt.subplots(figsize=(12, 1.8))  # Reduced height
-    # Plot heatmap
     cax = ax.imshow(
         heatmap_data,
         aspect='auto',
-        cmap=custom_cmap,
         vmin=-extent,
-        vmax=+extent
     )
-    # Configure colorbar with more subtle positioning
     cbar = plt.colorbar(
         cax,
         orientation='horizontal',
-        pad=0.25,  # Reduced padding
-        aspect=40,  # Make colorbar thinner
-        shrink=0.8  # Make colorbar shorter than plot width
-    )
-    # Style the colorbar
-    cbar.ax.tick_params(labelsize=8)  # Smaller tick labels
-    cbar.set_label(
-        'SHAP Contribution',
-        fontsize=9,
-        labelpad=5
     )
-    # Configure main plot
     ax.set_yticks([])
     ax.set_xlabel('Position in Sequence', fontsize=10)
     ax.set_title(f"{title}{subtitle}", pad=10)
-    # Fine-tune layout
-    plt.subplots_adjust(
-        bottom=0.25,  # Reduced bottom margin
-        left=0.05,    # Tighter left margin
-        right=0.95    # Tighter right margin
-    )
     return fig
 def create_importance_bar_plot(shap_values, kmers, top_k=10):
-    """Create a bar plot of the most important k-mers."""
     plt.rcParams.update({'font.size': 10})
     fig = plt.figure(figsize=(10, 5))
-    # Sort by absolute importance
     indices = np.argsort(np.abs(shap_values))[-top_k:]
     values = shap_values[indices]
     features = [kmers[i] for i in indices]
-    # negative -> blue, positive -> red
     colors = ['#99ccff' if v < 0 else '#ff9999' for v in values]
     plt.barh(range(len(values)), values, color=colors)
     plt.yticks(range(len(values)), features)
     plt.xlabel('SHAP Value (impact on model output)')
@@ -295,9 +240,6 @@ def create_importance_bar_plot(shap_values, kmers, top_k=10):
     return fig
 def plot_shap_histogram(shap_array, title="SHAP Distribution in Region"):
-    """
-    Simple histogram of SHAP values in the subregion.
-    """
     fig, ax = plt.subplots(figsize=(6, 4))
     ax.hist(shap_array, bins=30, color='gray', edgecolor='black')
     ax.axvline(0, color='red', linestyle='--', label='0.0')
@@ -309,7 +251,6 @@ def plot_shap_histogram(shap_array, title="SHAP Distribution in Region"):
     return fig
 def compute_gc_content(sequence):
-    """Compute %GC in the sequence (A, C, G, T)."""
     if not sequence:
         return 0
     gc_count = sequence.count('G') + sequence.count('C')
@@ -319,78 +260,72 @@ def compute_gc_content(sequence):
 # 7. SEQUENCE ANALYSIS FUNCTIONS
 ###############################################################################
 def analyze_sequence(file_path, top_k=10, fasta_text="", window_size=500):
     """
     Analyze a virus sequence from a FASTA file or text input.
     Returns (results_text, kmer_plot, heatmap_plot, state_dict, header)
     """
     try:
-        # Load model and k-mer info
-        model = VirusClassifier(256)  # 4^4 = 256 k-mers for k=4
-        model.load_state_dict(torch.load("model.pt"))
-        model.eval()
-        kmers = [''.join(p) for p in product("ACGT", repeat=4)]
-        # Process input (file takes precedence over text)
         if file_path:
             with open(file_path, 'r') as f:
                 fasta_text = f.read()
         if not fasta_text.strip():
             return ("Error: No sequence provided", None, None, {}, "")
-        # Parse FASTA
         sequences = parse_fasta(fasta_text)
         if not sequences:
             return ("Error: No valid FASTA sequences found", None, None, {}, "")
-        header, sequence = sequences[0]  # Take first sequence
-        # Convert to k-mer frequencies
-        x = sequence_to_kmer_vector(sequence)
-        x_tensor = torch.tensor(x).float().unsqueeze(0)
-        # Get model prediction
         with torch.no_grad():
             output = model(x_tensor)
             probs = torch.softmax(output, dim=1)
-            # Using index 1 for probability of human
             pred_human = probs[0, 1].item()
-        # Calculate SHAP values
-        shap_values, prob = calculate_shap_values(model, x_tensor)
-        # Find most extreme regions
-        shap_means = compute_positionwise_scores(sequence, shap_values)
         start_max, end_max, avg_max = find_extreme_subregion(shap_means, window_size, mode="max")
         start_min, end_min, avg_min = find_extreme_subregion(shap_means, window_size, mode="min")
-        # Format results text
-        classification = "Human" if pred_human > 0.5 else "Non-human"
         results = (
             f"Classification: {classification} "
             f"(probability of human = {pred_human:.3f})\n\n"
             f"Sequence length: {len(sequence):,} bases\n"
             f"Overall GC content: {compute_gc_content(sequence):.1f}%\n\n"
-            f"Most human-like {window_size}bp region:\n"
             f"Position {start_max:,} to {end_max:,}\n"
             f"Average SHAP: {avg_max:.4f}\n"
             f"GC content: {compute_gc_content(sequence[start_max:end_max]):.1f}%\n\n"
-            f"Least human-like {window_size}bp region:\n"
             f"Position {start_min:,} to {end_min:,}\n"
             f"Average SHAP: {avg_min:.4f}\n"
             f"GC content: {compute_gc_content(sequence[start_min:end_min]):.1f}%"
         )
-        # Create k-mer importance plot
-        kmer_fig = create_importance_bar_plot(shap_values, kmers, top_k)
         kmer_img = fig_to_image(kmer_fig)
-        # Create genome-wide heatmap
         heatmap_fig = plot_linear_heatmap(shap_means)
         heatmap_img = fig_to_image(heatmap_fig)
-        # Store data for subregion analysis
         state = {
             "seq": sequence,
             "shap_means": shap_means
@@ -399,21 +334,19 @@ def analyze_sequence(file_path, top_k=10, fasta_text="", window_size=500):
         return results, kmer_img, heatmap_img, state, header
     except Exception as e:
-        error_msg = f"Error analyzing sequence: {str(e)}"
-        return (error_msg, None, None, {}, "")
 def analyze_subregion(state, header, region_start, region_end):
-    """
-    Takes stored data from step 1 and a user-chosen region.
-    Returns a subregion heatmap, histogram, and some stats (GC, average SHAP).
-    """
     if not state or "seq" not in state or "shap_means" not in state:
         return ("No sequence data found. Please run Step 1 first.", None, None)
     seq = state["seq"]
     shap_means = state["shap_means"]
-    # Validate bounds
     region_start = int(region_start)
     region_end = int(region_end)
@@ -422,19 +355,15 @@ def analyze_subregion(state, header, region_start, region_end):
     if region_end <= region_start:
         return ("Invalid region range. End must be > Start.", None, None)
-    # Subsequence
     region_seq = seq[region_start:region_end]
     region_shap = shap_means[region_start:region_end]
-    # Some stats
     gc_percent = compute_gc_content(region_seq)
     avg_shap = float(np.mean(region_shap))
-    # Fraction pushing toward human vs. non-human
     positive_fraction = np.mean(region_shap > 0)
     negative_fraction = np.mean(region_shap < 0)
-    # Simple logic-based interpretation
     if avg_shap > 0.05:
         region_classification = "Likely pushing toward human"
     elif avg_shap < -0.05:
@@ -452,7 +381,6 @@ def analyze_subregion(state, header, region_start, region_end):
         f"Subregion interpretation: {region_classification}\n"
     )
-    # Plot region as small heatmap
     heatmap_fig = plot_linear_heatmap(
         shap_means,
         title="Subregion SHAP",
@@ -461,72 +389,45 @@ def analyze_subregion(state, header, region_start, region_end):
     )
     heatmap_img = fig_to_image(heatmap_fig)
-    # Plot histogram of SHAP in region
     hist_fig = plot_shap_histogram(region_shap, title="SHAP Distribution in Subregion")
     hist_img = fig_to_image(hist_fig)
     return (region_info, heatmap_img, hist_img)
 ###############################################################################
-# 8. COMPARISON ANALYSIS FUNCTIONS
 ###############################################################################
 def normalize_shap_lengths(shap1, shap2, num_points=1000):
-    """
-    Normalize two SHAP arrays to the same length using interpolation.
-    Returns (normalized_shap1, normalized_shap2)
-    """
-    # Create x coordinates for both sequences
     x1 = np.linspace(0, 1, len(shap1))
     x2 = np.linspace(0, 1, len(shap2))
-    # Create interpolation functions
     f1 = interp1d(x1, shap1, kind='linear')
     f2 = interp1d(x2, shap2, kind='linear')
-    # Create new x coordinates for interpolation
     x_new = np.linspace(0, 1, num_points)
-    # Interpolate both sequences to new length
     shap1_norm = f1(x_new)
     shap2_norm = f2(x_new)
     return shap1_norm, shap2_norm
 def compute_shap_difference(shap1_norm, shap2_norm):
-    """
-    Compute the difference between two normalized SHAP arrays.
-    Positive values indicate seq2 is more "human-like" than seq1.
-    """
     return shap2_norm - shap1_norm
 def plot_comparative_heatmap(shap_diff, title="SHAP Difference Heatmap"):
-    """
-    Plot the difference between two sequences' SHAP values.
-    Red indicates seq2 is more human-like, blue indicates seq1 is more human-like.
-    """
-    # Build 2D array for imshow
     heatmap_data = shap_diff.reshape(1, -1)
-    # Force symmetrical range
     extent = max(abs(np.min(shap_diff)), abs(np.max(shap_diff)))
-    # Create figure with adjusted height ratio
     fig, ax = plt.subplots(figsize=(12, 1.8))
-    # Create custom colormap
-    custom_cmap = get_zero_centered_cmap()
-    # Plot heatmap
     cax = ax.imshow(
         heatmap_data,
         aspect='auto',
-        cmap=custom_cmap,
         vmin=-extent,
-        vmax=+extent
     )
-    # Configure colorbar
     cbar = plt.colorbar(
         cax,
         orientation='horizontal',
@@ -534,74 +435,47 @@ def plot_comparative_heatmap(shap_diff, title="SHAP Difference Heatmap"):
         aspect=40,
         shrink=0.8
     )
-    # Style the colorbar
     cbar.ax.tick_params(labelsize=8)
-    cbar.set_label(
-        'SHAP Difference (Seq2 - Seq1)',
-        fontsize=9,
-        labelpad=5
-    )
-    # Configure main plot
     ax.set_yticks([])
     ax.set_xlabel('Normalized Position (0-100%)', fontsize=10)
     ax.set_title(title, pad=10)
-    plt.subplots_adjust(
-        bottom=0.25,
-        left=0.05,
-        right=0.95
-    )
     return fig
 def analyze_sequence_comparison(file1, file2, fasta1="", fasta2=""):
-    """
-    Compare two sequences by analyzing their SHAP differences.
-    Returns comparison text and visualizations.
-    """
-    # Process first sequence
-    results1 = analyze_sequence(file1, fasta_text=fasta1)
     if isinstance(results1[0], str) and "Error" in results1[0]:
         return (f"Error in sequence 1: {results1[0]}", None, None)
-    # Process second sequence
-    results2 = analyze_sequence(file2, fasta_text=fasta2)
     if isinstance(results2[0], str) and "Error" in results2[0]:
         return (f"Error in sequence 2: {results2[0]}", None, None)
-    # Get SHAP means from state dictionaries
     shap1 = results1[3]["shap_means"]
     shap2 = results2[3]["shap_means"]
-    # Normalize lengths
     shap1_norm, shap2_norm = normalize_shap_lengths(shap1, shap2)
-    # Compute difference (positive = seq2 more human-like)
     shap_diff = compute_shap_difference(shap1_norm, shap2_norm)
-    # Calculate statistics
     avg_diff = np.mean(shap_diff)
     std_diff = np.std(shap_diff)
     max_diff = np.max(shap_diff)
     min_diff = np.min(shap_diff)
-    # Calculate what fraction of positions show substantial differences
-    threshold = 0.05  # Arbitrary threshold for "substantial" difference
     substantial_diffs = np.abs(shap_diff) > threshold
     frac_different = np.mean(substantial_diffs)
-    # Extract classifications safely
     classification1 = results1[0].split('Classification: ')[1].split('\n')[0].strip()
     classification2 = results2[0].split('Classification: ')[1].split('\n')[0].strip()
-    # Format numbers
     len1_formatted = "{:,}".format(len(shap1))
     len2_formatted = "{:,}".format(len(shap2))
     frac_formatted = "{:.2%}".format(frac_different)
-    # Build comparison text
     comparison_text = (
         "Sequence Comparison Results:\n"
         f"Sequence 1: {results1[4]}\n"
@@ -621,21 +495,16 @@ def analyze_sequence_comparison(file1, file2, fasta1="", fasta2=""):
         "Negative values (blue) indicate regions where Sequence 1 is more 'human-like'"
     )
-    # Create comparison heatmap
     heatmap_fig = plot_comparative_heatmap(shap_diff)
     heatmap_img = fig_to_image(heatmap_fig)
-    # Create histogram of differences
-    hist_fig = plot_shap_histogram(
-        shap_diff,
-        title="Distribution of SHAP Differences"
-    )
     hist_img = fig_to_image(hist_fig)
     return comparison_text, heatmap_img, hist_img
 ###############################################################################
-# 9. BUILD GRADIO INTERFACE
 ###############################################################################
 css = """
@@ -666,14 +535,14 @@ with gr.Blocks(css=css) as iface:
                     placeholder=">sequence_name\nACGTACGT...",
                     lines=5
                 )
-                top_k = gr.Slider(
                     minimum=5,
                     maximum=30,
                     value=10,
                     step=1,
                     label="Number of top k-mers to display"
                 )
-                win_size = gr.Slider(
                     minimum=100,
                     maximum=5000,
                     value=500,
@@ -694,7 +563,7 @@ with gr.Blocks(css=css) as iface:
         analyze_btn.click(
             analyze_sequence,
-            inputs=[file_input, top_k, text_input, win_size],
             outputs=[results_box, kmer_img, genome_img, seq_state, header_state]
         )
@@ -797,22 +666,16 @@ with gr.Blocks(css=css) as iface:
       - Statistical summary of differences
     """)
-###############################################################################
-# 10. MAIN EXECUTION
-###############################################################################
 if __name__ == "__main__":
-    # Set up any global configurations if needed
     plt.style.use('default')
     plt.rcParams['figure.figsize'] = [10, 6]
     plt.rcParams['figure.dpi'] = 100
     plt.rcParams['font.size'] = 10
-    # Launch the interface
     iface.launch(
-        share=False,              # Set to True to create a public link
-        server_name="0.0.0.0",   # Listen on all network interfaces
-        server_port=7860,        # Default Gradio port
-        show_api=False,          # Hide API docs
-        debug=False              # Set to True for debugging
     )

 import gradio as gr
 import torch
 import numpy as np
 from itertools import product
 import torch.nn as nn
     total_kmers = len(sequence) - k + 1
     if total_kmers > 0:
+        vec /= total_kmers
     return vec
     """
     model.eval()
     with torch.no_grad():
         baseline_output = model(x_tensor)
         baseline_probs = torch.softmax(baseline_output, dim=1)
         baseline_prob = baseline_probs[0, 1].item()  # Probability of 'human'
         shap_values = []
         x_zeroed = x_tensor.clone()
         for i in range(x_tensor.shape[1]):
             x_zeroed[0, i] = 0.0
             output = model(x_zeroed)
             probs = torch.softmax(output, dim=1)
+            prob = probs[0, 1].item()
             impact = baseline_prob - prob
             shap_values.append(impact)
+            x_zeroed[0, i] = original_val
     return np.array(shap_values), baseline_prob
 ###############################################################################
 ###############################################################################
 def compute_positionwise_scores(sequence, shap_values, k=4):
     kmers = [''.join(p) for p in product("ACGT", repeat=k)]
     kmer_dict = {km: i for i, km in enumerate(kmers)}
 ###############################################################################
 def find_extreme_subregion(shap_means, window_size=500, mode="max"):
     n = len(shap_means)
     if n == 0:
         return (0, 0, 0.0)
     if window_size >= n:
         avg_val = float(np.mean(shap_means))
         return (0, n, avg_val)
     csum = np.zeros(n + 1, dtype=np.float32)
     csum[1:] = np.cumsum(shap_means)
 ###############################################################################
 def fig_to_image(fig):
     buf = io.BytesIO()
     fig.savefig(buf, format='png', bbox_inches='tight', dpi=150)
     buf.seek(0)
     return img
 def get_zero_centered_cmap():
     colors = [
+        (0.0, 'blue'),
+        (0.5, 'white'),
+        (1.0, 'red')
     ]
+    return mcolors.LinearSegmentedColormap.from_list("blue_white_red", colors)
 def plot_linear_heatmap(shap_means, title="Per-base SHAP Heatmap", start=None, end=None):
     if start is not None and end is not None:
         local_shap = shap_means[start:end]
         subtitle = f" (positions {start}-{end})"
     if len(local_shap) == 0:
         local_shap = np.array([0.0])
     heatmap_data = local_shap.reshape(1, -1)
     min_val = np.min(local_shap)
     max_val = np.max(local_shap)
     extent = max(abs(min_val), abs(max_val))
+    cmap = get_zero_centered_cmap()
+    fig, ax = plt.subplots(figsize=(12, 1.8))
     cax = ax.imshow(
         heatmap_data,
         aspect='auto',
+        cmap=cmap,
         vmin=-extent,
+        vmax=extent
     )
     cbar = plt.colorbar(
         cax,
         orientation='horizontal',
+        pad=0.25,
+        aspect=40,
+        shrink=0.8
     )
+    cbar.ax.tick_params(labelsize=8)
+    cbar.set_label('SHAP Contribution', fontsize=9, labelpad=5)
     ax.set_yticks([])
     ax.set_xlabel('Position in Sequence', fontsize=10)
     ax.set_title(f"{title}{subtitle}", pad=10)
+    plt.subplots_adjust(bottom=0.25, left=0.05, right=0.95)
     return fig
 def create_importance_bar_plot(shap_values, kmers, top_k=10):
     plt.rcParams.update({'font.size': 10})
     fig = plt.figure(figsize=(10, 5))
     indices = np.argsort(np.abs(shap_values))[-top_k:]
     values = shap_values[indices]
     features = [kmers[i] for i in indices]
     colors = ['#99ccff' if v < 0 else '#ff9999' for v in values]
     plt.barh(range(len(values)), values, color=colors)
     plt.yticks(range(len(values)), features)
     plt.xlabel('SHAP Value (impact on model output)')
     return fig
 def plot_shap_histogram(shap_array, title="SHAP Distribution in Region"):
     fig, ax = plt.subplots(figsize=(6, 4))
     ax.hist(shap_array, bins=30, color='gray', edgecolor='black')
     ax.axvline(0, color='red', linestyle='--', label='0.0')
     return fig
 def compute_gc_content(sequence):
     if not sequence:
         return 0
     gc_count = sequence.count('G') + sequence.count('C')
 # 7. SEQUENCE ANALYSIS FUNCTIONS
 ###############################################################################
+# Set up device and load the model once globally
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model = VirusClassifier(256)
+model.load_state_dict(torch.load("model.pt", map_location=device))
+model.to(device)
+model.eval()
+KMERS_4 = [''.join(p) for p in product("ACGT", repeat=4)]
 def analyze_sequence(file_path, top_k=10, fasta_text="", window_size=500):
     """
     Analyze a virus sequence from a FASTA file or text input.
     Returns (results_text, kmer_plot, heatmap_plot, state_dict, header)
     """
     try:
         if file_path:
             with open(file_path, 'r') as f:
                 fasta_text = f.read()
         if not fasta_text.strip():
             return ("Error: No sequence provided", None, None, {}, "")
         sequences = parse_fasta(fasta_text)
         if not sequences:
             return ("Error: No valid FASTA sequences found", None, None, {}, "")
+        header, sequence = sequences[0]
+        x = sequence_to_kmer_vector(sequence, k=4)
+        x_tensor = torch.tensor(x).float().unsqueeze(0).to(device)
         with torch.no_grad():
             output = model(x_tensor)
             probs = torch.softmax(output, dim=1)
             pred_human = probs[0, 1].item()
+        classification = "Human" if pred_human > 0.5 else "Non-human"
+        shap_values, baseline_prob = calculate_shap_values(model, x_tensor)
+        shap_means = compute_positionwise_scores(sequence, shap_values, k=4)
         start_max, end_max, avg_max = find_extreme_subregion(shap_means, window_size, mode="max")
         start_min, end_min, avg_min = find_extreme_subregion(shap_means, window_size, mode="min")
         results = (
             f"Classification: {classification} "
             f"(probability of human = {pred_human:.3f})\n\n"
             f"Sequence length: {len(sequence):,} bases\n"
             f"Overall GC content: {compute_gc_content(sequence):.1f}%\n\n"
+            f"Most human-like {window_size} bp region:\n"
             f"Position {start_max:,} to {end_max:,}\n"
             f"Average SHAP: {avg_max:.4f}\n"
             f"GC content: {compute_gc_content(sequence[start_max:end_max]):.1f}%\n\n"
+            f"Least human-like {window_size} bp region:\n"
             f"Position {start_min:,} to {end_min:,}\n"
             f"Average SHAP: {avg_min:.4f}\n"
             f"GC content: {compute_gc_content(sequence[start_min:end_min]):.1f}%"
         )
+        kmer_fig = create_importance_bar_plot(shap_values, KMERS_4, top_k=top_k)
         kmer_img = fig_to_image(kmer_fig)
         heatmap_fig = plot_linear_heatmap(shap_means)
         heatmap_img = fig_to_image(heatmap_fig)
         state = {
             "seq": sequence,
             "shap_means": shap_means
         return results, kmer_img, heatmap_img, state, header
     except Exception as e:
+        return (f"Error analyzing sequence: {str(e)}", None, None, {}, "")
+###############################################################################
+# 8. SUBREGION ANALYSIS FUNCTION
+###############################################################################
 def analyze_subregion(state, header, region_start, region_end):
     if not state or "seq" not in state or "shap_means" not in state:
         return ("No sequence data found. Please run Step 1 first.", None, None)
     seq = state["seq"]
     shap_means = state["shap_means"]
     region_start = int(region_start)
     region_end = int(region_end)
     if region_end <= region_start:
         return ("Invalid region range. End must be > Start.", None, None)
     region_seq = seq[region_start:region_end]
     region_shap = shap_means[region_start:region_end]
     gc_percent = compute_gc_content(region_seq)
     avg_shap = float(np.mean(region_shap))
     positive_fraction = np.mean(region_shap > 0)
     negative_fraction = np.mean(region_shap < 0)
     if avg_shap > 0.05:
         region_classification = "Likely pushing toward human"
     elif avg_shap < -0.05:
         f"Subregion interpretation: {region_classification}\n"
     )
     heatmap_fig = plot_linear_heatmap(
         shap_means,
         title="Subregion SHAP",
     )
     heatmap_img = fig_to_image(heatmap_fig)
     hist_fig = plot_shap_histogram(region_shap, title="SHAP Distribution in Subregion")
     hist_img = fig_to_image(hist_fig)
     return (region_info, heatmap_img, hist_img)
 ###############################################################################
+# 9. COMPARISON ANALYSIS FUNCTIONS
 ###############################################################################
 def normalize_shap_lengths(shap1, shap2, num_points=1000):
     """
     Resample two SHAP arrays onto a shared grid so they can be compared.

     Each input is treated as samples spread evenly over the normalized
     position range [0, 1] and is linearly interpolated at ``num_points``
     evenly spaced positions in that same range.

     Args:
         shap1: 1-D sequence of SHAP values for the first sequence.
         shap2: 1-D sequence of SHAP values for the second sequence.
         num_points: Length of each returned array.

     Returns:
         Tuple ``(shap1_norm, shap2_norm)`` of float arrays, each of length
         ``num_points``.

     Raises:
         ValueError: If either input is empty.
     """
     def _resample(values, grid):
         # Linearly interpolate one array onto the shared grid.
         values = np.asarray(values, dtype=float)
         if values.size == 0:
             raise ValueError("Cannot normalize an empty SHAP array")
         if values.size == 1:
             # A single sample has no slope; treat it as constant everywhere
             # instead of failing on a degenerate interpolation range.
             return np.full(grid.shape, values[0])
         positions = np.linspace(0, 1, values.size)
         return np.interp(grid, positions, values)

     x_new = np.linspace(0, 1, num_points)
     return _resample(shap1, x_new), _resample(shap2, x_new)
 def compute_shap_difference(shap1_norm, shap2_norm):
     """
     Return the SHAP difference, second input minus first input.

     Positive entries mark positions where ``shap2_norm`` holds the larger
     value; negative entries mark positions where ``shap1_norm`` does.
     """
     difference = shap2_norm - shap1_norm
     return difference
 def plot_comparative_heatmap(shap_diff, title="SHAP Difference Heatmap"):
     heatmap_data = shap_diff.reshape(1, -1)
     extent = max(abs(np.min(shap_diff)), abs(np.max(shap_diff)))
+    cmap = get_zero_centered_cmap()
     fig, ax = plt.subplots(figsize=(12, 1.8))
     cax = ax.imshow(
         heatmap_data,
         aspect='auto',
+        cmap=cmap,
         vmin=-extent,
+        vmax=extent
     )
     cbar = plt.colorbar(
         cax,
         orientation='horizontal',
         aspect=40,
         shrink=0.8
     )
     cbar.ax.tick_params(labelsize=8)
+    cbar.set_label('SHAP Difference (Seq2 - Seq1)', fontsize=9, labelpad=5)
     ax.set_yticks([])
     ax.set_xlabel('Normalized Position (0-100%)', fontsize=10)
     ax.set_title(title, pad=10)
+    plt.subplots_adjust(bottom=0.25, left=0.05, right=0.95)
     return fig
 def analyze_sequence_comparison(file1, file2, fasta1="", fasta2=""):
+    results1 = analyze_sequence(file1, top_k=10, fasta_text=fasta1, window_size=500)
     if isinstance(results1[0], str) and "Error" in results1[0]:
         return (f"Error in sequence 1: {results1[0]}", None, None)
+    results2 = analyze_sequence(file2, top_k=10, fasta_text=fasta2, window_size=500)
     if isinstance(results2[0], str) and "Error" in results2[0]:
         return (f"Error in sequence 2: {results2[0]}", None, None)
     shap1 = results1[3]["shap_means"]
     shap2 = results2[3]["shap_means"]
     shap1_norm, shap2_norm = normalize_shap_lengths(shap1, shap2)
     shap_diff = compute_shap_difference(shap1_norm, shap2_norm)
     avg_diff = np.mean(shap_diff)
     std_diff = np.std(shap_diff)
     max_diff = np.max(shap_diff)
     min_diff = np.min(shap_diff)
+    threshold = 0.05
     substantial_diffs = np.abs(shap_diff) > threshold
     frac_different = np.mean(substantial_diffs)
     classification1 = results1[0].split('Classification: ')[1].split('\n')[0].strip()
     classification2 = results2[0].split('Classification: ')[1].split('\n')[0].strip()
     len1_formatted = "{:,}".format(len(shap1))
     len2_formatted = "{:,}".format(len(shap2))
     frac_formatted = "{:.2%}".format(frac_different)
     comparison_text = (
         "Sequence Comparison Results:\n"
         f"Sequence 1: {results1[4]}\n"
         "Negative values (blue) indicate regions where Sequence 1 is more 'human-like'"
     )
     heatmap_fig = plot_comparative_heatmap(shap_diff)
     heatmap_img = fig_to_image(heatmap_fig)
+    hist_fig = plot_shap_histogram(shap_diff, title="Distribution of SHAP Differences")
     hist_img = fig_to_image(hist_fig)
     return comparison_text, heatmap_img, hist_img
 ###############################################################################
+# 10. BUILD GRADIO INTERFACE
 ###############################################################################
 css = """
                     placeholder=">sequence_name\nACGTACGT...",
                     lines=5
                 )
+                top_k_slider = gr.Slider(
                     minimum=5,
                     maximum=30,
                     value=10,
                     step=1,
                     label="Number of top k-mers to display"
                 )
+                win_size_slider = gr.Slider(
                     minimum=100,
                     maximum=5000,
                     value=500,
         analyze_btn.click(
             analyze_sequence,
+            inputs=[file_input, top_k_slider, text_input, win_size_slider],
             outputs=[results_box, kmer_img, genome_img, seq_state, header_state]
         )
       - Statistical summary of differences
     """)
 if __name__ == "__main__":
     plt.style.use('default')
     plt.rcParams['figure.figsize'] = [10, 6]
     plt.rcParams['figure.dpi'] = 100
     plt.rcParams['font.size'] = 10
     iface.launch(
+        share=False,
+        server_name="0.0.0.0",
+        server_port=7860,
+        show_api=False,
+        debug=False
     )