Spaces:

hiyata
/

HostClassifier

Running

App Files Files Community

hiyata committed on Jan 12

Commit

de0719b

verified ·

1 Parent(s): 1d54b05

Update app.py

Browse files

Files changed (1) hide show

app.py +421 -468

app.py CHANGED Viewed

@@ -6,34 +6,13 @@ from itertools import product
 import torch.nn as nn
 import matplotlib.pyplot as plt
 import matplotlib.colors as mcolors
-import seaborn as sns
-from PIL import Image
 import io
-import pandas as pd
-from typing import Tuple, List, Dict, Any
-from dataclasses import dataclass
-import plotly.graph_objects as go
-import plotly.express as px
-from plotly.subplots import make_subplots
 ###############################################################################
-# 1. DATA STRUCTURES & MODEL
 ###############################################################################
-@dataclass
-class SequenceAnalysis:
-    """Container for sequence analysis results"""
-    header: str
-    sequence: str
-    length: int
-    gc_content: float
-    classification: str
-    human_prob: float
-    nonhuman_prob: float
-    shap_values: np.ndarray
-    shap_means: np.ndarray
-    extreme_regions: Dict[str, Dict[str, Any]]
 class VirusClassifier(nn.Module):
     def __init__(self, input_shape: int):
         super(VirusClassifier, self).__init__()
@@ -50,16 +29,16 @@ class VirusClassifier(nn.Module):
             nn.GELU(),
             nn.Linear(32, 2)
         )
     def forward(self, x):
         return self.network(x)
 ###############################################################################
-# 2. SEQUENCE PROCESSING
 ###############################################################################
-def parse_fasta(text: str) -> List[Tuple[str, str]]:
-    """Parse FASTA formatted text with improved robustness"""
     sequences = []
     current_header = None
     current_sequence = []
@@ -74,68 +53,67 @@ def parse_fasta(text: str) -> List[Tuple[str, str]]:
             current_header = line[1:]
             current_sequence = []
         else:
-            # Filter out non-ACGT characters and convert to uppercase
-            filtered_line = ''.join(c for c in line.upper() if c in 'ACGT')
-            current_sequence.append(filtered_line)
     if current_header:
         sequences.append((current_header, ''.join(current_sequence)))
     return sequences
 def sequence_to_kmer_vector(sequence: str, k: int = 4) -> np.ndarray:
-    """Convert sequence to k-mer frequency vector with optimizations"""
     kmers = [''.join(p) for p in product("ACGT", repeat=k)]
     kmer_dict = {km: i for i, km in enumerate(kmers)}
     vec = np.zeros(len(kmers), dtype=np.float32)
-    # Use sliding window for efficiency
     for i in range(len(sequence) - k + 1):
         kmer = sequence[i:i+k]
-        if kmer in kmer_dict:  # Handle non-ACGT kmers
             vec[kmer_dict[kmer]] += 1
-    # Normalize
     total_kmers = len(sequence) - k + 1
     if total_kmers > 0:
         vec = vec / total_kmers
-    return vec
-def compute_gc_content(sequence: str) -> float:
-    """Compute GC content percentage"""
-    if not sequence:
-        return 0.0
-    gc_count = sum(1 for base in sequence if base in 'GC')
-    return (gc_count / len(sequence)) * 100.0
 ###############################################################################
-# 3. SHAP & ANALYSIS
 ###############################################################################
-def calculate_shap_values(model: nn.Module, x_tensor: torch.Tensor) -> Tuple[np.ndarray, float]:
-    """Calculate SHAP values using ablation with improved efficiency"""
     model.eval()
     with torch.no_grad():
         baseline_output = model(x_tensor)
         baseline_probs = torch.softmax(baseline_output, dim=1)
-        baseline_prob = baseline_probs[0, 1].item()
         shap_values = []
         x_zeroed = x_tensor.clone()
-        # Vectorized computation where possible
         for i in range(x_tensor.shape[1]):
             x_zeroed[0, i] = 0.0
             output = model(x_zeroed)
             probs = torch.softmax(output, dim=1)
-            impact = baseline_prob - probs[0, 1].item()
             shap_values.append(impact)
-            x_zeroed[0, i] = x_tensor[0, i]
     return np.array(shap_values), baseline_prob
-def compute_positionwise_scores(sequence: str, shap_values: np.ndarray, k: int = 4) -> np.ndarray:
-    """Compute per-base SHAP scores with optimized memory usage"""
     kmers = [''.join(p) for p in product("ACGT", repeat=k)]
     kmer_dict = {km: i for i, km in enumerate(kmers)}
@@ -143,472 +121,447 @@ def compute_positionwise_scores(sequence: str, shap_values: np.ndarray, k: int =
     shap_sums = np.zeros(seq_len, dtype=np.float32)
     coverage = np.zeros(seq_len, dtype=np.float32)
-    # Vectorized operations where possible
     for i in range(seq_len - k + 1):
         kmer = sequence[i:i+k]
         if kmer in kmer_dict:
-            idx = kmer_dict[kmer]
-            shap_sums[i:i+k] += shap_values[idx]
-            coverage[i:i+k] += 1
     with np.errstate(divide='ignore', invalid='ignore'):
         shap_means = np.where(coverage > 0, shap_sums / coverage, 0.0)
     return shap_means
-def find_extreme_regions(shap_means: np.ndarray, window_size: int = 500) -> Dict[str, Dict[str, Any]]:
-    """Find regions with extreme SHAP values using efficient sliding window"""
-    if len(shap_means) < window_size:
-        window_size = len(shap_means)
-    # Compute cumulative sum for efficient sliding window
-    cumsum = np.cumsum(np.pad(shap_means, (0, 1)))
-    # Sliding window calculation
-    window_avgs = (cumsum[window_size:] - cumsum[:-window_size]) / window_size
-    max_idx = np.argmax(window_avgs)
-    min_idx = np.argmin(window_avgs)
-    return {
-        "human": {
-            "start": max_idx,
-            "end": max_idx + window_size,
-            "avg_shap": float(window_avgs[max_idx])
-        },
-        "nonhuman": {
-            "start": min_idx,
-            "end": min_idx + window_size,
-            "avg_shap": float(window_avgs[min_idx])
-        }
-    }
 ###############################################################################
-# 4. VISUALIZATION
 ###############################################################################
-def create_genome_overview_plot(analysis: SequenceAnalysis) -> go.Figure:
-    """Create an interactive genome overview using Plotly"""
-    fig = make_subplots(
-        rows=2, cols=1,
-        subplot_titles=("SHAP Values Along Genome", "GC Content"),
-        row_heights=[0.7, 0.3],
-        vertical_spacing=0.1
-    )
-    # SHAP trace
-    fig.add_trace(
-        go.Scatter(
-            x=list(range(len(analysis.shap_means))),
-            y=analysis.shap_means,
-            name="SHAP",
-            line=dict(color='rgba(31, 119, 180, 0.8)'),
-            hovertemplate="Position: %{x}<br>SHAP: %{y:.4f}<extra></extra>"
-        ),
-        row=1, col=1
-    )
-    # Highlight extreme regions
-    for region_type, region in analysis.extreme_regions.items():
-        color = 'rgba(255, 0, 0, 0.2)' if region_type == 'human' else 'rgba(0, 0, 255, 0.2)'
-        fig.add_vrect(
-            x0=region['start'],
-            x1=region['end'],
-            fillcolor=color,
-            opacity=0.5,
-            layer="below",
-            line_width=0,
-            row=1, col=1
-        )
-    # Calculate rolling GC content
-    window = 100
-    gc_content = np.array([
-        compute_gc_content(analysis.sequence[i:i+window])
-        for i in range(0, len(analysis.sequence) - window + 1, window)
-    ])
-    # GC content trace
-    fig.add_trace(
-        go.Scatter(
-            x=np.arange(len(gc_content)) * window,
-            y=gc_content,
-            name="GC%",
-            line=dict(color='rgba(44, 160, 44, 0.8)'),
-            hovertemplate="Position: %{x}<br>GC%: %{y:.1f}%<extra></extra>"
-        ),
-        row=2, col=1
     )
-    # Update layout
-    fig.update_layout(
-        height=800,
-        title=dict(
-            text=f"Genome Analysis Overview<br><sub>{analysis.header}</sub>",
-            x=0.5
-        ),
-        showlegend=False,
-        plot_bgcolor='white'
-    )
-    # Update axes
-    fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='lightgray')
-    fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='lightgray')
     return fig
-def create_kmer_importance_plot(analysis: SequenceAnalysis, top_k: int = 10) -> go.Figure:
-    """Create interactive k-mer importance plot using Plotly"""
-    # Get top k-mers by absolute SHAP value
-    kmers = [''.join(p) for p in product("ACGT", repeat=4)]
-    indices = np.argsort(np.abs(analysis.shap_values))[-top_k:]
-    # Create DataFrame for plotting
-    df = pd.DataFrame({
-        'k-mer': [kmers[i] for i in indices],
-        'SHAP': analysis.shap_values[indices]
-    })
-    # Create plot
-    fig = px.bar(
-        df,
-        x='SHAP',
-        y='k-mer',
-        orientation='h',
-        color='SHAP',
-        color_continuous_scale='RdBu',
-        title=f'Top {top_k} Most Influential k-mers'
-    )
-    # Update layout
-    fig.update_layout(
-        height=400,
-        plot_bgcolor='white',
-        yaxis_title='',
-        xaxis_title='SHAP Value',
-        coloraxis_showscale=False
-    )
     return fig
-def create_shap_distribution_plot(analysis: SequenceAnalysis) -> go.Figure:
-    """Create SHAP distribution plot using Plotly"""
-    fig = go.Figure()
-    # Add histogram
-    fig.add_trace(go.Histogram(
-        x=analysis.shap_means,
-        nbinsx=50,
-        name='SHAP Values',
-        marker_color='rgba(31, 119, 180, 0.6)'
-    ))
-    # Add vertical line at x=0
-    fig.add_vline(
-        x=0,
-        line_dash="dash",
-        line_color="red",
-        annotation_text="Neutral",
-        annotation_position="top"
-    )
-    # Update layout
-    fig.update_layout(
-        title='Distribution of SHAP Values',
-        xaxis_title='SHAP Value',
-        yaxis_title='Count',
-        plot_bgcolor='white',
-        height=400
-    )
     return fig
 ###############################################################################
-# 5. MAIN ANALYSIS
 ###############################################################################
-def analyze_sequence(
-    file_obj: str = None,
-    fasta_text: str = "",
-    window_size: int = 500,
-    model_path: str = 'model.pt',
-    scaler_path: str = 'scaler.pkl'
-) -> SequenceAnalysis:
-    """Main sequence analysis function"""
     # Handle input
     if fasta_text.strip():
         text = fasta_text.strip()
     elif file_obj is not None:
-        with open(file_obj, 'r') as f:
-            text = f.read()
     else:
-        raise ValueError("No input provided")
     # Parse FASTA
     sequences = parse_fasta(text)
     if not sequences:
-        raise ValueError("No valid FASTA sequences found")
     header, seq = sequences[0]
     # Load model and scaler
     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-    state_dict = torch.load(model_path, map_location=device)
-    model = VirusClassifier(256).to(device)
-    model.load_state_dict(state_dict)
-    scaler = joblib.load(scaler_path)
-    # Process sequence
     freq_vector = sequence_to_kmer_vector(seq)
     scaled_vector = scaler.transform(freq_vector.reshape(1, -1))
     x_tensor = torch.FloatTensor(scaled_vector).to(device)
-    # Get SHAP values and classification
     shap_values, prob_human = calculate_shap_values(model, x_tensor)
     prob_nonhuman = 1.0 - prob_human
-    # Get per-base SHAP scores
-    shap_means = compute_positionwise_scores(seq, shap_values)
-    # Find extreme regions
-    extreme_regions = find_extreme_regions(shap_means, window_size)
-    # Create analysis object
-    return SequenceAnalysis(
-        header=header,
-        sequence=seq,
-        length=len(seq),
-        gc_content=compute_gc_content(seq),
-        classification="Human" if prob_human > 0.5 else "Non-human",
-        human_prob=prob_human,
-        nonhuman_prob=prob_nonhuman,
-        shap_values=shap_values,
-        shap_means=shap_means,
-        extreme_regions=extreme_regions
     )
 ###############################################################################
-# 6. GRADIO INTERFACE
 ###############################################################################
-def create_interface():
-    """Create enhanced Gradio interface with improved layout and interactivity"""
-    def process_sequence(
-        file_obj: str,
-        fasta_text: str,
-        window_size: int,
-        top_kmers: int
-    ) -> Tuple[str, List[go.Figure]]:
-        """Process sequence and return formatted results and plots"""
-        try:
-            # Run analysis
-            analysis = analyze_sequence(
-                file_obj=file_obj,
-                fasta_text=fasta_text,
-                window_size=window_size
-            )
-            # Format results text
-            results = f"""
-            ### Sequence Analysis Results
-            **Basic Information**
-            - Sequence: {analysis.header}
-            - Length: {analysis.length:,} bases
-            - GC Content: {analysis.gc_content:.1f}%
-            **Classification**
-            - Prediction: {analysis.classification}
-            - Human Probability: {analysis.human_prob:.3f}
-            - Non-human Probability: {analysis.nonhuman_prob:.3f}
-            **Extreme Regions (window size: {window_size}bp)**
-            Most Human-like Region:
-            - Position: {analysis.extreme_regions['human']['start']:,} - {analysis.extreme_regions['human']['end']:,}
-            - Average SHAP: {analysis.extreme_regions['human']['avg_shap']:.4f}
-            Most Non-human-like Region:
-            - Position: {analysis.extreme_regions['nonhuman']['start']:,} - {analysis.extreme_regions['nonhuman']['end']:,}
-            - Average SHAP: {analysis.extreme_regions['nonhuman']['avg_shap']:.4f}
-            """
-            # Create plots
-            genome_plot = create_genome_overview_plot(analysis)
-            kmer_plot = create_kmer_importance_plot(analysis, top_kmers)
-            dist_plot = create_shap_distribution_plot(analysis)
-            return results, [genome_plot, kmer_plot, dist_plot], analysis
-        except Exception as e:
-            return f"Error: {str(e)}", [], None
-    # Create theme and styling
-    theme = gr.themes.Soft(
-        primary_hue="blue",
-        secondary_hue="gray",
-    ).set(
-        body_text_color="gray-dark",
-        background_fill_primary="*gray-50",
-        block_shadow="*shadow-sm",
-        block_background_fill="white",
     )
-    # Build interface
-    with gr.Blocks(theme=theme, css="""
-        .container { margin: 0 auto; max-width: 1200px; padding: 20px; }
-        .results { margin-top: 20px; }
-        .plot-container { margin-top: 10px; }
-    """) as interface:
         gr.Markdown("""
-        # 🧬 Enhanced Virus Host Classifier
-        This tool analyzes viral sequences to predict their host (human vs. non-human) and provides detailed visualizations
-        of the features influencing this classification. Upload or paste a FASTA sequence to begin.
-        *Using advanced SHAP analysis and interactive visualizations for interpretable results.*
         """)
-        # Input section
-        with gr.Tab("Sequence Analysis"):
-            with gr.Row():
-                with gr.Column(scale=1):
-                    file_input = gr.File(
-                        label="Upload FASTA File",
-                        file_types=[".fasta", ".fa", ".txt"],
-                        type="filepath"
-                    )
-                    text_input = gr.Textbox(
-                        label="Or Paste FASTA Sequence",
-                        placeholder=">sequence_name\nACGTACGT...",
-                        lines=5
-                    )
-                    with gr.Row():
-                        window_size = gr.Slider(
-                            minimum=100,
-                            maximum=5000,
-                            value=500,
-                            step=100,
-                            label="Window Size for Region Analysis"
-                        )
-                        top_kmers = gr.Slider(
-                            minimum=5,
-                            maximum=30,
-                            value=10,
-                            step=1,
-                            label="Number of Top k-mers to Display"
-                        )
-                    analyze_btn = gr.Button(
-                        "🔍 Analyze Sequence",
-                        variant="primary"
-                    )
-                # Results section
-                with gr.Column(scale=2):
-                    results_text = gr.Markdown(
-                        label="Analysis Results"
-                    )
-                    # Plots
-                    genome_plot = gr.Plot(
-                        label="Genome Overview"
-                    )
-                    with gr.Row():
-                        kmer_plot = gr.Plot(
-                            label="k-mer Importance"
-                        )
-                        dist_plot = gr.Plot(
-                            label="SHAP Distribution"
-                        )
-        # Help tab
-        with gr.Tab("Help & Information"):
-            gr.Markdown("""
-            ### 📖 How to Use This Tool
-            1. **Input Your Sequence**
-               - Upload a FASTA file or paste your sequence in FASTA format
-               - The sequence should contain only ACGT bases (non-standard bases will be filtered)
-            2. **Adjust Parameters**
-               - Window Size: Controls the length of regions analyzed for extreme patterns
-               - Top k-mers: Number of most influential sequence patterns to display
-            3. **Interpret Results**
-               - Classification: Predicted host (human vs. non-human)
-               - Genome Overview: Interactive plot showing SHAP values and GC content
-               - k-mer Importance: Most influential sequence patterns
-               - SHAP Distribution: Overall distribution of feature importance
-            ### 🎨 Visualization Guide
-            - **SHAP Values**:
-              - Positive (red) = pushing toward human classification
-              - Negative (blue) = pushing toward non-human classification
-              - Zero (white) = neutral impact
-            - **Extreme Regions**:
-              - Highlighted in the genome overview plot
-              - Red regions = most human-like
-              - Blue regions = most non-human-like
-            ### 🔬 Technical Details
-            - The classifier uses k-mer frequencies (k=4) as features
-            - SHAP values are calculated using an ablation-based approach
-            - GC content is calculated using a sliding window
-            """)
-                    # Connect components
-        sequence_state = gr.State()
-        def process_and_update(file_obj, fasta_text, window_size, top_kmers):
-            """Wrapper to handle plot outputs correctly"""
-            results, plots, analysis = process_sequence(file_obj, fasta_text, window_size, top_kmers)
-            if plots:
-                return [
-                    results,
-                    plots[0],  # genome plot
-                    plots[1],  # kmer plot
-                    plots[2],  # distribution plot
-                    analysis
-                ]
-            return [results, None, None, None, None]
-        analyze_btn.click(
-            process_and_update,
-            inputs=[
-                file_input,
-                text_input,
-                window_size,
-                top_kmers
-            ],
-            outputs=[
-                results_text,
-                genome_plot,
-                kmer_plot,
-                dist_plot,
-                sequence_state
-            ]
         )
-        return interface
-###############################################################################
-# 7. MAIN ENTRY POINT
-###############################################################################
 if __name__ == "__main__":
-    iface = create_interface()
-    iface.launch(
-        share=True,
-        server_name="0.0.0.0",
-        show_error=True
-    )
-    #

 import torch.nn as nn
 import matplotlib.pyplot as plt
 import matplotlib.colors as mcolors
 import io
+from PIL import Image
 ###############################################################################
+# 1. MODEL DEFINITION
 ###############################################################################
 class VirusClassifier(nn.Module):
     def __init__(self, input_shape: int):
         super(VirusClassifier, self).__init__()
             nn.GELU(),
             nn.Linear(32, 2)
         )
     def forward(self, x):
         return self.network(x)
 ###############################################################################
+# 2. FASTA PARSING & K-MER FEATURE ENGINEERING
 ###############################################################################
+def parse_fasta(text):
+    """Parse FASTA formatted text into a list of (header, sequence)."""
     sequences = []
     current_header = None
     current_sequence = []
             current_header = line[1:]
             current_sequence = []
         else:
+            current_sequence.append(line.upper())
     if current_header:
         sequences.append((current_header, ''.join(current_sequence)))
     return sequences
 def sequence_to_kmer_vector(sequence: str, k: int = 4) -> np.ndarray:
+    """Convert a sequence to a k-mer frequency vector for classification.
+
+    Every overlapping window of length ``k`` is looked up in the table of all
+    ``4**k`` k-mers over the alphabet ``ACGT`` (in ``itertools.product``
+    order) and counted. Windows that are not in the table (for example those
+    containing ``N`` or lowercase letters) are not counted, but they still
+    contribute to the normalising denominator.
+
+    Args:
+        sequence: Nucleotide string; the lookup is case-sensitive.
+        k: k-mer length.
+
+    Returns:
+        A ``float32`` vector of length ``4**k`` holding each k-mer's count
+        divided by the number of windows; all zeros when the sequence is
+        shorter than ``k``.
+    """
+    # Enumerate every possible k-mer and map it to its column index.
     kmers = [''.join(p) for p in product("ACGT", repeat=k)]
     kmer_dict = {km: i for i, km in enumerate(kmers)}
     vec = np.zeros(len(kmers), dtype=np.float32)
+    # Slide a window of width k across the sequence, one base at a time.
     for i in range(len(sequence) - k + 1):
         kmer = sequence[i:i+k]
+        if kmer in kmer_dict:
             vec[kmer_dict[kmer]] += 1
+    # Normalise by the total number of windows, including skipped ones.
     total_kmers = len(sequence) - k + 1
     if total_kmers > 0:
         vec = vec / total_kmers
+    return vec
 ###############################################################################
+# 3. SHAP-VALUE (ABLATION) CALCULATION
 ###############################################################################
+def calculate_shap_values(model, x_tensor):
+    """
+    Calculate SHAP values using a simple ablation approach.
+
+    Each feature of row 0 of `x_tensor` is set to 0.0 in turn (all other
+    features left untouched) and the model is re-evaluated; the reported
+    value for that feature is ``baseline_prob - ablated_prob`` for class
+    index 1. This is a one-at-a-time ablation score, not a Shapley value
+    computed over feature coalitions.
+
+    NOTE(review): only row 0 is read, so `x_tensor` is presumably of shape
+    (1, n_features) - confirm against callers.
+
+    Returns shap_values, prob_human
+    (a NumPy array with one entry per feature, and the un-ablated class-1
+    probability as a Python float).
+    """
+    # Put the model in evaluation mode and disable gradient tracking.
     model.eval()
     with torch.no_grad():
+        # Baseline: prediction on the unmodified input
         baseline_output = model(x_tensor)
         baseline_probs = torch.softmax(baseline_output, dim=1)
+        baseline_prob = baseline_probs[0, 1].item()  # Probability of 'human' class
+        # Zeroing each feature to measure impact
         shap_values = []
+        # Work on a copy so the caller's tensor is never modified.
         x_zeroed = x_tensor.clone()
         for i in range(x_tensor.shape[1]):
+            original_val = x_zeroed[0, i].item()
             x_zeroed[0, i] = 0.0
             output = model(x_zeroed)
             probs = torch.softmax(output, dim=1)
+            prob = probs[0, 1].item()
+            # Positive impact: removing the feature lowered the class-1 probability.
+            impact = baseline_prob - prob
             shap_values.append(impact)
+            x_zeroed[0, i] = original_val  # restore, so only one feature is ablated at a time
     return np.array(shap_values), baseline_prob
+###############################################################################
+# 4. PER-BASE SHAP AGGREGATION
+###############################################################################
+def compute_positionwise_scores(sequence, shap_values, k=4):
+    """
+    Returns an array of per-base SHAP contributions by averaging
+    the k-mer SHAP values of all k-mers covering that base.
+    """
     kmers = [''.join(p) for p in product("ACGT", repeat=k)]
     kmer_dict = {km: i for i, km in enumerate(kmers)}
     shap_sums = np.zeros(seq_len, dtype=np.float32)
     coverage = np.zeros(seq_len, dtype=np.float32)
     for i in range(seq_len - k + 1):
         kmer = sequence[i:i+k]
         if kmer in kmer_dict:
+            val = shap_values[kmer_dict[kmer]]
+            shap_sums[i : i + k] += val
+            coverage[i : i + k] += 1
     with np.errstate(divide='ignore', invalid='ignore'):
         shap_means = np.where(coverage > 0, shap_sums / coverage, 0.0)
     return shap_means
+###############################################################################
+# 5. FIND EXTREME SHAP REGIONS
+###############################################################################
+def find_extreme_subregion(shap_means, window_size=500, mode="max"):
+    """
+    Find the window of length `window_size` with the highest or lowest
+    average SHAP.
+
+    Args:
+        shap_means: 1D sequence of per-base SHAP values.
+        window_size: Window length in bases. Integral floats (e.g. values
+            coming from a UI slider) are accepted and truncated to int.
+        mode: "max" selects the highest average; any other value (by
+            convention "min") selects the lowest.
+
+    Returns:
+        (best_start, best_end, best_avg) with `best_end` exclusive. An empty
+        input yields (0, 0, 0.0); a window at least as long as the input
+        yields the whole range with its mean. Ties resolve to the leftmost
+        window.
+
+    Raises:
+        ValueError: if `window_size` is smaller than 1 for non-empty input.
+    """
+    n = len(shap_means)
+    if n == 0:
+        return (0, 0, 0.0)
+    window_size = int(window_size)
+    if window_size < 1:
+        # A zero-length window would divide 0 by 0 and silently yield NaN.
+        raise ValueError("window_size must be a positive integer")
+    # Accumulate in float64: a float32 running sum over a long genome loses
+    # enough precision to distort the window differences taken below.
+    values = np.asarray(shap_means, dtype=np.float64)
+    if window_size >= n:
+        # entire sequence
+        return (0, n, float(values.mean()))
+    # csum[i] is the sum of the first i values, so any window sum is a
+    # difference of two prefix sums.
+    csum = np.concatenate(([0.0], np.cumsum(values)))
+    window_avgs = (csum[window_size:] - csum[:-window_size]) / window_size
+    # argmax/argmin return the first occurrence, i.e. the leftmost window.
+    pick = np.argmax if mode == "max" else np.argmin
+    best_start = int(pick(window_avgs))
+    return (best_start, best_start + window_size, float(window_avgs[best_start]))
 ###############################################################################
+# 6. PLOTTING / UTILITIES
 ###############################################################################
+def fig_to_image(fig):
+    """Convert a Matplotlib figure to a PIL Image for Gradio.
+
+    The figure is rendered to an in-memory PNG (150 dpi, tight bounding box)
+    and is always closed afterwards, including when rendering fails, so a
+    failed request does not leave the figure registered with pyplot.
+
+    Args:
+        fig: The Matplotlib figure to render; it is closed by this call.
+
+    Returns:
+        A PIL Image backed by the in-memory PNG buffer.
+    """
+    buf = io.BytesIO()
+    try:
+        fig.savefig(buf, format='png', bbox_inches='tight', dpi=150)
+    finally:
+        # Release the figure even if savefig raised.
+        plt.close(fig)
+    buf.seek(0)
+    return Image.open(buf)
+def get_zero_centered_cmap():
+    """
+    Build the diverging colormap used for SHAP heatmaps:
+    blue at the low end (negative), white at the midpoint (zero),
+    red at the high end (positive).
+    """
+    anchor_positions = (0.0, 0.5, 1.0)
+    anchor_colors = ('blue', 'white', 'red')
+    return mcolors.LinearSegmentedColormap.from_list(
+        "blue_white_red",
+        list(zip(anchor_positions, anchor_colors)),
+    )
+def plot_linear_heatmap(shap_means, title="Per-base SHAP Heatmap", start=None, end=None):
+    """
+    Plot a 1D heatmap of per-base SHAP contributions with a diverging
+    colormap (negative = blue, 0 = white, positive = red).
+
+    The colour range is forced to be symmetric around 0 (vmin=-extent,
+    vmax=+extent) so that zero always sits at the white midpoint.
+
+    Args:
+        shap_means: 1D array of per-base SHAP values.
+        title: Plot title.
+        start, end: Optional slice bounds; the slice is applied (and shown
+            in the title) only when both are given.
+
+    Returns:
+        The Matplotlib figure.
+    """
+    if start is not None and end is not None:
+        local_shap = shap_means[start:end]
+        subtitle = f" (positions {start}-{end})"
+    else:
+        local_shap = shap_means
+        subtitle = ""
+    local_shap = np.asarray(local_shap)
+    if local_shap.size == 0:
+        # Edge case: no data to plot
+        local_shap = np.array([0.0])
+    # Build 2D array for imshow
+    heatmap_data = local_shap.reshape(1, -1)
+    # Force symmetrical range
+    extent = float(np.max(np.abs(local_shap)))
+    if not np.isfinite(extent) or extent == 0.0:
+        # With vmin == vmax the normalisation maps every value to the low end
+        # of the colormap, so an all-zero track would be drawn blue instead
+        # of white. Any positive range keeps zero at the midpoint.
+        extent = 1.0
+    fig, ax = plt.subplots(figsize=(12, 2))
+    cax = ax.imshow(
+        heatmap_data,
+        aspect='auto',
+        cmap=get_zero_centered_cmap(),
+        vmin=-extent,
+        vmax=+extent
     )
+    # Place colorbar below with plenty of margin
+    cbar = fig.colorbar(cax, ax=ax, orientation='horizontal', pad=0.35)
+    cbar.set_label('SHAP Contribution (negative=blue, zero=white, positive=red)')
+    ax.set_yticks([])
+    ax.set_xlabel('Position in Sequence')
+    ax.set_title(f"{title}{subtitle}")
+    # Extra bottom margin so colorbar won't overlap x-axis labels
+    fig.subplots_adjust(bottom=0.4)
     return fig
+def create_importance_bar_plot(shap_values, kmers, top_k=10):
+    """Create a horizontal bar plot of the most important k-mers.
+
+    Args:
+        shap_values: 1D array of per-k-mer SHAP values.
+        kmers: k-mer labels, indexed like `shap_values`.
+        top_k: Number of k-mers to show; clamped to [0, len(shap_values)].
+
+    Returns:
+        The Matplotlib figure, with the most influential k-mer (largest
+        absolute SHAP) on the top row.
+    """
+    # NOTE: this updates the process-wide Matplotlib rcParams.
+    plt.rcParams.update({'font.size': 10})
+    fig = plt.figure(figsize=(10, 5))
+    shap_values = np.asarray(shap_values)
+    top_k = max(0, min(int(top_k), len(shap_values)))
+    # Sort by absolute importance, most influential first. Slicing the
+    # descending order with [:top_k] (rather than [-top_k:]) keeps
+    # top_k == 0 from selecting every k-mer.
+    indices = np.argsort(np.abs(shap_values))[::-1][:top_k]
+    values = shap_values[indices]
+    features = [kmers[i] for i in indices]
+    # negative -> blue, positive -> red
+    colors = ['#99ccff' if v < 0 else '#ff9999' for v in values]
+    plt.barh(range(len(values)), values, color=colors)
+    plt.yticks(range(len(values)), features)
+    plt.xlabel('SHAP Value (impact on model output)')
+    plt.title(f'Top {top_k} Most Influential k-mers')
+    # Row 0 holds the most influential k-mer; inverting the axis puts it on top.
+    plt.gca().invert_yaxis()
+    plt.tight_layout()
     return fig
+def plot_shap_histogram(shap_array, title="SHAP Distribution in Region"):
+    """
+    Draw a 30-bin histogram of the given SHAP values, with a dashed red
+    reference line at zero, and return the figure.
+    """
+    fig, ax = plt.subplots(figsize=(6, 4))
+    ax.hist(shap_array, bins=30, color='gray', edgecolor='black')
+    ax.axvline(0, color='red', linestyle='--', label='0.0')
+    ax.set(xlabel="SHAP Value", ylabel="Count", title=title)
+    ax.legend()
+    fig.tight_layout()
     return fig
+def compute_gc_content(sequence):
+    """Compute %GC in the sequence.
+
+    Bases are matched case-insensitively. The denominator is the full
+    sequence length, so any non-ACGT characters lower the percentage.
+
+    Args:
+        sequence: Nucleotide string.
+
+    Returns:
+        GC percentage as a float in [0.0, 100.0]; 0.0 for an empty sequence.
+    """
+    if not sequence:
+        # Return a float here too, so the result type does not depend on input.
+        return 0.0
+    upper = sequence.upper()
+    gc_count = upper.count('G') + upper.count('C')
+    return (gc_count / len(sequence)) * 100.0
 ###############################################################################
+# 7. MAIN ANALYSIS STEP (Gradio Step 1)
 ###############################################################################
+def analyze_sequence(file_obj, top_kmers=10, fasta_text="", window_size=500):
+    """
+    Analyzes the entire genome, returning classification, full-genome heatmap,
+    top k-mer bar plot, and identifies subregions with strongest positive/negative push.
+
+    Args:
+        file_obj: Path of a FASTA file; used only when `fasta_text` is blank.
+        top_kmers: Number of k-mers shown in the importance bar plot.
+        fasta_text: FASTA content as text; takes precedence over `file_obj`.
+        window_size: Window length used when searching for extreme subregions.
+
+    Returns:
+        A 5-tuple (results_text, bar_img, heatmap_img, state, header), where
+        `state` is a dict holding "seq" and "shap_means". On input or loading
+        errors the tuple is (message, None, None, None, None).
+
+    Only the first record of a multi-record FASTA is analysed. The model and
+    scaler are loaded from 'model.pt' and 'scaler.pkl' on every call.
+    """
     # Handle input
     if fasta_text.strip():
         text = fasta_text.strip()
     elif file_obj is not None:
+        try:
+            with open(file_obj, 'r') as f:
+                text = f.read()
+        except Exception as e:
+            return (f"Error reading file: {str(e)}", None, None, None, None)
     else:
+        return ("Please provide a FASTA sequence.", None, None, None, None)
     # Parse FASTA
     sequences = parse_fasta(text)
     if not sequences:
+        return ("No valid FASTA sequences found.", None, None, None, None)
+    # Only the first record is analysed.
     header, seq = sequences[0]
     # Load model and scaler
     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+    try:
+        # Use weights_only=True for safer loading
+        state_dict = torch.load('model.pt', map_location=device, weights_only=True)
+        model = VirusClassifier(256).to(device)
+        model.load_state_dict(state_dict)
+        # NOTE(review): joblib.load unpickles the file; scaler.pkl must be trusted.
+        scaler = joblib.load('scaler.pkl')
+    except Exception as e:
+        return (f"Error loading model/scaler: {str(e)}", None, None, None, None)
+    # Vectorize + scale
     freq_vector = sequence_to_kmer_vector(seq)
     scaled_vector = scaler.transform(freq_vector.reshape(1, -1))
     x_tensor = torch.FloatTensor(scaled_vector).to(device)
+    # SHAP + classification
     shap_values, prob_human = calculate_shap_values(model, x_tensor)
     prob_nonhuman = 1.0 - prob_human
+    classification = "Human" if prob_human > 0.5 else "Non-human"
+    confidence = max(prob_human, prob_nonhuman)
+    # Per-base SHAP
+    shap_means = compute_positionwise_scores(seq, shap_values, k=4)
+    # Find the most "human-pushing" region
+    (max_start, max_end, max_avg) = find_extreme_subregion(shap_means, window_size, mode="max")
+    # Find the most "non-human–pushing" region
+    (min_start, min_end, min_avg) = find_extreme_subregion(shap_means, window_size, mode="min")
+    # Build results text
+    results_text = (
+        f"Sequence: {header}\n"
+        f"Length: {len(seq):,} bases\n"
+        f"Classification: {classification}\n"
+        f"Confidence: {confidence:.3f}\n"
+        f"(Human Probability: {prob_human:.3f}, Non-human Probability: {prob_nonhuman:.3f})\n\n"
+        f"---\n"
+        f"**Most Human-Pushing {window_size}-bp Subregion**:\n"
+        f"Start: {max_start}, End: {max_end}, Avg SHAP: {max_avg:.4f}\n\n"
+        f"**Most Non-Human–Pushing {window_size}-bp Subregion**:\n"
+        f"Start: {min_start}, End: {min_end}, Avg SHAP: {min_avg:.4f}"
     )
+    # K-mer importance plot
+    kmers = [''.join(p) for p in product("ACGT", repeat=4)]
+    bar_fig = create_importance_bar_plot(shap_values, kmers, top_kmers)
+    bar_img = fig_to_image(bar_fig)
+    # Full-genome SHAP heatmap
+    heatmap_fig = plot_linear_heatmap(shap_means, title="Genome-wide SHAP")
+    heatmap_img = fig_to_image(heatmap_fig)
+    # Store data for subregion analysis
+    state_dict_out = {
+        "seq": seq,
+        "shap_means": shap_means
+    }
+    return (results_text, bar_img, heatmap_img, state_dict_out, header)
 ###############################################################################
+# 8. SUBREGION ANALYSIS (Gradio Step 2)
 ###############################################################################
def analyze_subregion(state, header, region_start, region_end):
    """
    Summarize a user-chosen window of the sequence analyzed in step 1.

    Args:
        state: Dict stored by step 1. Must contain "seq" (the sequence) and
            "shap_means" (the per-position scores, sliced with the same
            indices as the sequence).
        header: Sequence name; only used in the report text.
        region_start: Start index of the region. Converted with int() and
            clamped to [0, len(seq)].
        region_end: End index of the region (used as a slice end, so it is
            exclusive). Converted with int() and clamped to [0, len(seq)].

    Returns:
        A 3-tuple (report_text, heatmap_image, histogram_image). When the
        stored state is missing, the bounds are not numeric, or the clamped
        range is empty, report_text is an error message and both images are
        None.
    """
    if not state or "seq" not in state or "shap_means" not in state:
        return ("No sequence data found. Please run Step 1 first.", None, None)
    seq = state["seq"]
    shap_means = state["shap_means"]

    # Validate bounds. The values come straight from the UI, so a missing
    # value (None), NaN/inf, or non-numeric text must produce a readable
    # message instead of an uncaught exception.
    try:
        region_start = int(region_start)
        region_end = int(region_end)
    except (TypeError, ValueError, OverflowError):
        return ("Invalid region bounds. Start and End must be numbers.", None, None)

    # Clamp both ends into the sequence so out-of-range requests are trimmed
    # rather than rejected.
    region_start = max(0, min(region_start, len(seq)))
    region_end = max(0, min(region_end, len(seq)))
    if region_end <= region_start:
        return ("Invalid region range. End must be > Start.", None, None)

    # Subsequence and its matching slice of scores
    region_seq = seq[region_start:region_end]
    region_shap = shap_means[region_start:region_end]

    # Some stats
    gc_percent = compute_gc_content(region_seq)
    avg_shap = float(np.mean(region_shap))

    # Fraction pushing toward human vs. non-human (positions that are exactly
    # zero count toward neither fraction)
    positive_fraction = np.mean(region_shap > 0)
    negative_fraction = np.mean(region_shap < 0)

    # Simple logic-based interpretation: +/-0.05 on the average is the cut-off
    # between a directional push and "near neutral"
    if avg_shap > 0.05:
        region_classification = "Likely pushing toward human"
    elif avg_shap < -0.05:
        region_classification = "Likely pushing toward non-human"
    else:
        region_classification = "Near neutral (no strong push)"

    region_info = (
        f"Analyzing subregion of {header} from {region_start} to {region_end}\n"
        f"Region length: {len(region_seq)} bases\n"
        f"GC content: {gc_percent:.2f}%\n"
        f"Average SHAP in region: {avg_shap:.4f}\n"
        f"Fraction with SHAP > 0 (toward human): {positive_fraction:.2f}\n"
        f"Fraction with SHAP < 0 (toward non-human): {negative_fraction:.2f}\n"
        f"Subregion interpretation: {region_classification}\n"
    )

    # Plot region as small heatmap; the full score array is passed and the
    # window is selected through start/end
    heatmap_fig = plot_linear_heatmap(
        shap_means,
        title="Subregion SHAP",
        start=region_start,
        end=region_end
    )
    heatmap_img = fig_to_image(heatmap_fig)

    # Plot histogram of SHAP in region
    hist_fig = plot_shap_histogram(region_shap, title="SHAP Distribution in Subregion")
    hist_img = fig_to_image(hist_fig)

    return (region_info, heatmap_img, hist_img)
+###############################################################################
+# 9. BUILD GRADIO INTERFACE
+###############################################################################
# Page-level stylesheet handed to gr.Blocks; it only sets the container font.
css = """
.gradio-container {
    font-family: 'IBM Plex Sans', sans-serif;
}
"""

with gr.Blocks(css=css) as iface:
    # Page header
    gr.Markdown("""
    # Virus Host Classifier with White-Centered Gradient
    **Step 1**: Predict overall viral sequence origin (human vs non-human) and identify extreme regions.
    **Step 2**: Explore subregions to see local SHAP signals, distribution, GC content, etc.
    **Color Scale**: Negative SHAP = Blue, Zero = White, Positive = Red.
    """)

    # ---- Tab 1: whole-sequence classification ------------------------------
    with gr.Tab("1) Full-Sequence Analysis"):
        with gr.Row():
            # Narrow column: sequence inputs and tuning controls
            with gr.Column(scale=1):
                file_input = gr.File(
                    type="filepath",
                    file_types=[".fasta", ".fa", ".txt"],
                    label="Upload FASTA file",
                )
                text_input = gr.Textbox(
                    lines=5,
                    placeholder=">sequence_name\nACGTACGT...",
                    label="Or paste FASTA sequence",
                )
                top_k = gr.Slider(
                    label="Number of top k-mers to display",
                    minimum=5, maximum=30, step=1, value=10,
                )
                win_size = gr.Slider(
                    label="Window size for 'most pushing' subregions",
                    minimum=100, maximum=5000, step=100, value=500,
                )
                analyze_btn = gr.Button("Analyze Sequence", variant="primary")
            # Wide column: text report and the two result images
            with gr.Column(scale=2):
                results_box = gr.Textbox(label="Classification Results", lines=12, interactive=False)
                kmer_img = gr.Image(label="Top k-mer SHAP")
                genome_img = gr.Image(label="Genome-wide SHAP Heatmap (Blue=neg, White=0, Red=pos)")

        # Hidden per-session values passed from tab 1 to tab 2
        seq_state = gr.State()
        header_state = gr.State()

        # analyze_sequence returns five values, mapped one-to-one onto the
        # outputs list: report text, k-mer image, heatmap image, the
        # {"seq", "shap_means"} dict, and the sequence header.
        # NOTE(review): the inputs order must match analyze_sequence's
        # parameter order, which is defined outside this section - confirm.
        analyze_btn.click(
            fn=analyze_sequence,
            inputs=[file_input, top_k, text_input, win_size],
            outputs=[results_box, kmer_img, genome_img, seq_state, header_state],
        )

    # ---- Tab 2: user-selected subregion ------------------------------------
    with gr.Tab("2) Subregion Exploration"):
        gr.Markdown("""
        **Subregion Analysis**
        Select start/end positions to view local SHAP signals, distribution, and GC content.
        The heatmap also uses the same Blue-White-Red scale.
        """)
        with gr.Row():
            region_start = gr.Number(value=0, label="Region Start")
            region_end = gr.Number(value=500, label="Region End")
            region_btn = gr.Button("Analyze Subregion")
        subregion_info = gr.Textbox(label="Subregion Analysis", lines=7, interactive=False)
        with gr.Row():
            subregion_img = gr.Image(label="Subregion SHAP Heatmap (B-W-R)")
            subregion_hist_img = gr.Image(label="SHAP Distribution (Histogram)")

        # Inputs follow analyze_subregion(state, header, region_start, region_end);
        # its three return values fill the text box and the two images.
        region_btn.click(
            fn=analyze_subregion,
            inputs=[seq_state, header_state, region_start, region_end],
            outputs=[subregion_info, subregion_img, subregion_hist_img],
        )

    # Footer: feature summary shown below both tabs
    gr.Markdown("""
    ### Interface Features
    - **Overall Classification** (human vs non-human) using k-mer frequencies.
    - **SHAP Analysis** to see which k-mers push classification toward or away from human.
    - **White-Centered SHAP Gradient**:
      - Negative (blue), 0 (white), Positive (red), with symmetrical color range around 0.
    - **Identify Subregions** with the strongest push for human or non-human.
    - **Subregion Exploration**:
      - Local SHAP heatmap & histogram
      - GC content
      - Fraction of positions pushing human vs. non-human
      - Simple logic-based classification
    """)

# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()