hiyata committed on
Commit
6be7ede
·
verified ·
1 Parent(s): 910c6c2

Update app.py

Files changed (1)
  1. app.py +453 -421
app.py CHANGED
@@ -6,13 +6,34 @@ from itertools import product
6
  import torch.nn as nn
7
  import matplotlib.pyplot as plt
8
  import matplotlib.colors as mcolors
9
- import io
10
  from PIL import Image
11
 
12
  ###############################################################################
13
- # 1. MODEL DEFINITION
14
  ###############################################################################
15
 
16
  class VirusClassifier(nn.Module):
17
  def __init__(self, input_shape: int):
18
  super(VirusClassifier, self).__init__()
@@ -29,16 +50,16 @@ class VirusClassifier(nn.Module):
29
  nn.GELU(),
30
  nn.Linear(32, 2)
31
  )
32
-
33
  def forward(self, x):
34
  return self.network(x)
35
 
36
  ###############################################################################
37
- # 2. FASTA PARSING & K-MER FEATURE ENGINEERING
38
  ###############################################################################
39
 
40
- def parse_fasta(text):
41
- """Parse FASTA formatted text into a list of (header, sequence)."""
42
  sequences = []
43
  current_header = None
44
  current_sequence = []
@@ -53,67 +74,68 @@ def parse_fasta(text):
53
  current_header = line[1:]
54
  current_sequence = []
55
  else:
56
- current_sequence.append(line.upper())
57
  if current_header:
58
  sequences.append((current_header, ''.join(current_sequence)))
59
  return sequences
60
 
61
  def sequence_to_kmer_vector(sequence: str, k: int = 4) -> np.ndarray:
62
- """Convert a sequence to a k-mer frequency vector for classification."""
63
  kmers = [''.join(p) for p in product("ACGT", repeat=k)]
64
  kmer_dict = {km: i for i, km in enumerate(kmers)}
65
  vec = np.zeros(len(kmers), dtype=np.float32)
66
 
 
67
  for i in range(len(sequence) - k + 1):
68
  kmer = sequence[i:i+k]
69
- if kmer in kmer_dict:
70
  vec[kmer_dict[kmer]] += 1
71
-
 
72
  total_kmers = len(sequence) - k + 1
73
  if total_kmers > 0:
74
  vec = vec / total_kmers
75
-
76
  return vec
77
 
78
  ###############################################################################
79
- # 3. SHAP-VALUE (ABLATION) CALCULATION
80
  ###############################################################################
81
 
82
- def calculate_shap_values(model, x_tensor):
83
- """
84
- Calculate SHAP values using a simple ablation approach.
85
- Returns shap_values, prob_human
86
- """
87
  model.eval()
88
  with torch.no_grad():
89
- # Baseline
90
  baseline_output = model(x_tensor)
91
  baseline_probs = torch.softmax(baseline_output, dim=1)
92
- baseline_prob = baseline_probs[0, 1].item() # Probability of 'human' class
93
 
94
- # Zeroing each feature to measure impact
95
  shap_values = []
96
  x_zeroed = x_tensor.clone()
 
 
97
  for i in range(x_tensor.shape[1]):
98
- original_val = x_zeroed[0, i].item()
99
  x_zeroed[0, i] = 0.0
100
  output = model(x_zeroed)
101
  probs = torch.softmax(output, dim=1)
102
- prob = probs[0, 1].item()
103
- impact = baseline_prob - prob
104
  shap_values.append(impact)
105
- x_zeroed[0, i] = original_val # restore
 
106
  return np.array(shap_values), baseline_prob
107
 
108
- ###############################################################################
109
- # 4. PER-BASE SHAP AGGREGATION
110
- ###############################################################################
111
-
112
- def compute_positionwise_scores(sequence, shap_values, k=4):
113
- """
114
- Returns an array of per-base SHAP contributions by averaging
115
- the k-mer SHAP values of all k-mers covering that base.
116
- """
117
  kmers = [''.join(p) for p in product("ACGT", repeat=k)]
118
  kmer_dict = {km: i for i, km in enumerate(kmers)}
119
 
@@ -121,447 +143,457 @@ def compute_positionwise_scores(sequence, shap_values, k=4):
121
  shap_sums = np.zeros(seq_len, dtype=np.float32)
122
  coverage = np.zeros(seq_len, dtype=np.float32)
123
 
 
124
  for i in range(seq_len - k + 1):
125
  kmer = sequence[i:i+k]
126
  if kmer in kmer_dict:
127
- val = shap_values[kmer_dict[kmer]]
128
- shap_sums[i : i + k] += val
129
- coverage[i : i + k] += 1
130
-
131
  with np.errstate(divide='ignore', invalid='ignore'):
132
  shap_means = np.where(coverage > 0, shap_sums / coverage, 0.0)
133
-
134
  return shap_means
135
 
136
- ###############################################################################
137
- # 5. FIND EXTREME SHAP REGIONS
138
- ###############################################################################
139
-
140
- def find_extreme_subregion(shap_means, window_size=500, mode="max"):
141
- """
142
- Finds the subregion of length `window_size` that has the maximum
143
- (mode="max") or minimum (mode="min") average SHAP.
144
- Returns (best_start, best_end, best_avg).
145
- """
146
- n = len(shap_means)
147
- if n == 0:
148
- return (0, 0, 0.0)
149
- if window_size >= n:
150
- # entire sequence
151
- avg_val = float(np.mean(shap_means))
152
- return (0, n, avg_val)
153
-
154
- # We'll build csum of length n+1
155
- csum = np.zeros(n + 1, dtype=np.float32)
156
- csum[1:] = np.cumsum(shap_means)
157
-
158
- best_start = 0
159
- best_sum = csum[window_size] - csum[0]
160
- best_avg = best_sum / window_size
161
-
162
- for start in range(1, n - window_size + 1):
163
- wsum = csum[start + window_size] - csum[start]
164
- wavg = wsum / window_size
165
- if mode == "max":
166
- if wavg > best_avg:
167
- best_avg = wavg
168
- best_start = start
169
- else: # mode == "min"
170
- if wavg < best_avg:
171
- best_avg = wavg
172
- best_start = start
173
-
174
- return (best_start, best_start + window_size, float(best_avg))
175
 
176
  ###############################################################################
177
- # 6. PLOTTING / UTILITIES
178
  ###############################################################################
179
 
180
- def fig_to_image(fig):
181
- """Convert a Matplotlib figure to a PIL Image for Gradio."""
182
- buf = io.BytesIO()
183
- fig.savefig(buf, format='png', bbox_inches='tight', dpi=150)
184
- buf.seek(0)
185
- img = Image.open(buf)
186
- plt.close(fig)
187
- return img
188
-
189
- def get_zero_centered_cmap():
190
- """
191
- Creates a custom diverging colormap that is:
192
- - Blue for negative
193
- - White for zero
194
- - Red for positive
195
- """
196
- colors = [
197
- (0.0, 'blue'), # negative
198
- (0.5, 'white'), # zero
199
- (1.0, 'red') # positive
200
- ]
201
- cmap = mcolors.LinearSegmentedColormap.from_list("blue_white_red", colors)
202
- return cmap
203
-
204
- def plot_linear_heatmap(shap_means, title="Per-base SHAP Heatmap", start=None, end=None):
205
- """
206
- Plots a 1D heatmap of per-base SHAP contributions with a custom colormap:
207
- - Negative = blue
208
- - 0 = white
209
- - Positive = red
210
- We'll force the range to be symmetrical around 0 by using:
211
- vmin=-extent, vmax=+extent
212
- so 0 is in the middle.
213
- """
214
- if start is not None and end is not None:
215
- local_shap = shap_means[start:end]
216
- subtitle = f" (positions {start}-{end})"
217
- else:
218
- local_shap = shap_means
219
- subtitle = ""
220
-
221
- if len(local_shap) == 0:
222
- # Edge case: no data to plot
223
- local_shap = np.array([0.0])
224
-
225
- # Build 2D array for imshow
226
- heatmap_data = local_shap.reshape(1, -1)
227
-
228
- # Force symmetrical range
229
- min_val = np.min(local_shap)
230
- max_val = np.max(local_shap)
231
- extent = max(abs(min_val), abs(max_val))
232
-
233
- # Create custom colormap
234
- custom_cmap = get_zero_centered_cmap()
235
-
236
- fig, ax = plt.subplots(figsize=(12, 2))
237
- cax = ax.imshow(
238
- heatmap_data,
239
- aspect='auto',
240
- cmap=custom_cmap,
241
- vmin=-extent,
242
- vmax=+extent
243
  )
244
 
245
- # Place colorbar below with plenty of margin
246
- cbar = plt.colorbar(cax, orientation='horizontal', pad=0.35)
247
- cbar.set_label('SHAP Contribution (negative=blue, zero=white, positive=red)')
248
-
249
- ax.set_yticks([])
250
- ax.set_xlabel('Position in Sequence')
251
- ax.set_title(f"{title}{subtitle}")
 
252
 
253
- # Extra bottom margin so colorbar won't overlap x-axis labels
254
- plt.subplots_adjust(bottom=0.4)
255
 
256
  return fig
257
 
258
- def create_importance_bar_plot(shap_values, kmers, top_k=10):
259
- """Create a bar plot of the most important k-mers."""
260
- plt.rcParams.update({'font.size': 10})
261
- fig = plt.figure(figsize=(10, 5))
262
-
263
- # Sort by absolute importance
264
- indices = np.argsort(np.abs(shap_values))[-top_k:]
265
- values = shap_values[indices]
266
- features = [kmers[i] for i in indices]
267
-
268
- # negative -> blue, positive -> red
269
- colors = ['#99ccff' if v < 0 else '#ff9999' for v in values]
270
-
271
- plt.barh(range(len(values)), values, color=colors)
272
- plt.yticks(range(len(values)), features)
273
- plt.xlabel('SHAP Value (impact on model output)')
274
- plt.title(f'Top {top_k} Most Influential k-mers')
275
- plt.gca().invert_yaxis()
276
- plt.tight_layout()
277
  return fig
278
 
279
- def plot_shap_histogram(shap_array, title="SHAP Distribution in Region"):
280
- """
281
- Simple histogram of SHAP values in the subregion.
282
- """
283
- fig, ax = plt.subplots(figsize=(6, 4))
284
- ax.hist(shap_array, bins=30, color='gray', edgecolor='black')
285
- ax.axvline(0, color='red', linestyle='--', label='0.0')
286
- ax.set_xlabel("SHAP Value")
287
- ax.set_ylabel("Count")
288
- ax.set_title(title)
289
- ax.legend()
290
- plt.tight_layout()
291
  return fig
292
 
293
- def compute_gc_content(sequence):
294
- """Compute %GC in the sequence (A, C, G, T)."""
295
- if not sequence:
296
- return 0
297
- gc_count = sequence.count('G') + sequence.count('C')
298
- return (gc_count / len(sequence)) * 100.0
299
-
300
  ###############################################################################
301
- # 7. MAIN ANALYSIS STEP (Gradio Step 1)
302
  ###############################################################################
303
 
304
- def analyze_sequence(file_obj, top_kmers=10, fasta_text="", window_size=500):
305
- """
306
- Analyzes the entire genome, returning classification, full-genome heatmap,
307
- top k-mer bar plot, and identifies subregions with strongest positive/negative push.
308
- """
309
  # Handle input
310
  if fasta_text.strip():
311
  text = fasta_text.strip()
312
  elif file_obj is not None:
313
- try:
314
- with open(file_obj, 'r') as f:
315
- text = f.read()
316
- except Exception as e:
317
- return (f"Error reading file: {str(e)}", None, None, None, None)
318
  else:
319
- return ("Please provide a FASTA sequence.", None, None, None, None)
320
-
321
  # Parse FASTA
322
  sequences = parse_fasta(text)
323
  if not sequences:
324
- return ("No valid FASTA sequences found.", None, None, None, None)
325
 
326
  header, seq = sequences[0]
327
-
328
  # Load model and scaler
329
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
330
- try:
331
- # Use weights_only=True for safer loading
332
- state_dict = torch.load('model.pt', map_location=device, weights_only=True)
333
- model = VirusClassifier(256).to(device)
334
- model.load_state_dict(state_dict)
335
-
336
- scaler = joblib.load('scaler.pkl')
337
- except Exception as e:
338
- return (f"Error loading model/scaler: {str(e)}", None, None, None, None)
339
-
340
- # Vectorize + scale
341
  freq_vector = sequence_to_kmer_vector(seq)
342
  scaled_vector = scaler.transform(freq_vector.reshape(1, -1))
343
  x_tensor = torch.FloatTensor(scaled_vector).to(device)
344
-
345
- # SHAP + classification
346
  shap_values, prob_human = calculate_shap_values(model, x_tensor)
347
  prob_nonhuman = 1.0 - prob_human
348
 
349
- classification = "Human" if prob_human > 0.5 else "Non-human"
350
- confidence = max(prob_human, prob_nonhuman)
351
-
352
- # Per-base SHAP
353
- shap_means = compute_positionwise_scores(seq, shap_values, k=4)
354
-
355
- # Find the most "human-pushing" region
356
- (max_start, max_end, max_avg) = find_extreme_subregion(shap_means, window_size, mode="max")
357
- # Find the most "non-human–pushing" region
358
- (min_start, min_end, min_avg) = find_extreme_subregion(shap_means, window_size, mode="min")
359
-
360
- # Build results text
361
- results_text = (
362
- f"Sequence: {header}\n"
363
- f"Length: {len(seq):,} bases\n"
364
- f"Classification: {classification}\n"
365
- f"Confidence: {confidence:.3f}\n"
366
- f"(Human Probability: {prob_human:.3f}, Non-human Probability: {prob_nonhuman:.3f})\n\n"
367
- f"---\n"
368
- f"**Most Human-Pushing {window_size}-bp Subregion**:\n"
369
- f"Start: {max_start}, End: {max_end}, Avg SHAP: {max_avg:.4f}\n\n"
370
- f"**Most Non-Human–Pushing {window_size}-bp Subregion**:\n"
371
- f"Start: {min_start}, End: {min_end}, Avg SHAP: {min_avg:.4f}"
372
  )
373
 
374
- # K-mer importance plot
375
- kmers = [''.join(p) for p in product("ACGT", repeat=4)]
376
- bar_fig = create_importance_bar_plot(shap_values, kmers, top_kmers)
377
- bar_img = fig_to_image(bar_fig)
378
-
379
- # Full-genome SHAP heatmap
380
- heatmap_fig = plot_linear_heatmap(shap_means, title="Genome-wide SHAP")
381
- heatmap_img = fig_to_image(heatmap_fig)
382
-
383
- # Store data for subregion analysis
384
- state_dict_out = {
385
- "seq": seq,
386
- "shap_means": shap_means
387
- }
388
-
389
- return (results_text, bar_img, heatmap_img, state_dict_out, header)
390
-
391
  ###############################################################################
392
- # 8. SUBREGION ANALYSIS (Gradio Step 2)
393
  ###############################################################################
394
 
395
- def analyze_subregion(state, header, region_start, region_end):
396
- """
397
- Takes stored data from step 1 and a user-chosen region.
398
- Returns a subregion heatmap, histogram, and some stats (GC, average SHAP).
399
- """
400
- if not state or "seq" not in state or "shap_means" not in state:
401
- return ("No sequence data found. Please run Step 1 first.", None, None)
402
-
403
- seq = state["seq"]
404
- shap_means = state["shap_means"]
405
-
406
- # Validate bounds
407
- region_start = int(region_start)
408
- region_end = int(region_end)
409
-
410
- region_start = max(0, min(region_start, len(seq)))
411
- region_end = max(0, min(region_end, len(seq)))
412
- if region_end <= region_start:
413
- return ("Invalid region range. End must be > Start.", None, None)
414
-
415
- # Subsequence
416
- region_seq = seq[region_start:region_end]
417
- region_shap = shap_means[region_start:region_end]
418
-
419
- # Some stats
420
- gc_percent = compute_gc_content(region_seq)
421
- avg_shap = float(np.mean(region_shap))
422
-
423
- # Fraction pushing toward human vs. non-human
424
- positive_fraction = np.mean(region_shap > 0)
425
- negative_fraction = np.mean(region_shap < 0)
426
-
427
- # Simple logic-based interpretation
428
- if avg_shap > 0.05:
429
- region_classification = "Likely pushing toward human"
430
- elif avg_shap < -0.05:
431
- region_classification = "Likely pushing toward non-human"
432
- else:
433
- region_classification = "Near neutral (no strong push)"
434
-
435
- region_info = (
436
- f"Analyzing subregion of {header} from {region_start} to {region_end}\n"
437
- f"Region length: {len(region_seq)} bases\n"
438
- f"GC content: {gc_percent:.2f}%\n"
439
- f"Average SHAP in region: {avg_shap:.4f}\n"
440
- f"Fraction with SHAP > 0 (toward human): {positive_fraction:.2f}\n"
441
- f"Fraction with SHAP < 0 (toward non-human): {negative_fraction:.2f}\n"
442
- f"Subregion interpretation: {region_classification}\n"
443
- )
444
-
445
- # Plot region as small heatmap
446
- heatmap_fig = plot_linear_heatmap(
447
- shap_means,
448
- title="Subregion SHAP",
449
- start=region_start,
450
- end=region_end
451
  )
452
- heatmap_img = fig_to_image(heatmap_fig)
453
-
454
- # Plot histogram of SHAP in region
455
- hist_fig = plot_shap_histogram(region_shap, title="SHAP Distribution in Subregion")
456
- hist_img = fig_to_image(hist_fig)
457
-
458
- return (region_info, heatmap_img, hist_img)
459
-
460
-
461
- ###############################################################################
462
- # 9. BUILD GRADIO INTERFACE
463
- ###############################################################################
464
-
465
- css = """
466
- .gradio-container {
467
- font-family: 'IBM Plex Sans', sans-serif;
468
- }
469
- """
470
-
471
- with gr.Blocks(css=css) as iface:
472
- gr.Markdown("""
473
- # Virus Host Classifier with White-Centered Gradient
474
- **Step 1**: Predict overall viral sequence origin (human vs non-human) and identify extreme regions.
475
- **Step 2**: Explore subregions to see local SHAP signals, distribution, GC content, etc.
476
-
477
- **Color Scale**: Negative SHAP = Blue, Zero = White, Positive = Red.
478
- """)
479
-
480
- with gr.Tab("1) Full-Sequence Analysis"):
481
- with gr.Row():
482
- with gr.Column(scale=1):
483
- file_input = gr.File(
484
- label="Upload FASTA file",
485
- file_types=[".fasta", ".fa", ".txt"],
486
- type="filepath"
487
- )
488
- text_input = gr.Textbox(
489
- label="Or paste FASTA sequence",
490
- placeholder=">sequence_name\nACGTACGT...",
491
- lines=5
492
- )
493
- top_k = gr.Slider(
494
- minimum=5,
495
- maximum=30,
496
- value=10,
497
- step=1,
498
- label="Number of top k-mers to display"
499
- )
500
- win_size = gr.Slider(
501
- minimum=100,
502
- maximum=5000,
503
- value=500,
504
- step=100,
505
- label="Window size for 'most pushing' subregions"
506
- )
507
- analyze_btn = gr.Button("Analyze Sequence", variant="primary")
508
-
509
- with gr.Column(scale=2):
510
- results_box = gr.Textbox(
511
- label="Classification Results", lines=12, interactive=False
512
- )
513
- kmer_img = gr.Image(label="Top k-mer SHAP")
514
- genome_img = gr.Image(label="Genome-wide SHAP Heatmap (Blue=neg, White=0, Red=pos)")
515
-
516
- seq_state = gr.State()
517
- header_state = gr.State()
518
-
519
- # analyze_sequence(...) returns 5 items
520
- analyze_btn.click(
521
- analyze_sequence,
522
- inputs=[file_input, top_k, text_input, win_size],
523
- outputs=[results_box, kmer_img, genome_img, seq_state, header_state]
524
- )
525
 
526
- with gr.Tab("2) Subregion Exploration"):
527
  gr.Markdown("""
528
- **Subregion Analysis**
529
- Select start/end positions to view local SHAP signals, distribution, and GC content.
530
- The heatmap also uses the same Blue-White-Red scale.
531
  """)
532
- with gr.Row():
533
- region_start = gr.Number(label="Region Start", value=0)
534
- region_end = gr.Number(label="Region End", value=500)
535
- region_btn = gr.Button("Analyze Subregion")
536
 
537
- subregion_info = gr.Textbox(
538
- label="Subregion Analysis",
539
- lines=7,
540
- interactive=False
541
- )
542
- with gr.Row():
543
- subregion_img = gr.Image(label="Subregion SHAP Heatmap (B-W-R)")
544
- subregion_hist_img = gr.Image(label="SHAP Distribution (Histogram)")
545
 
546
- region_btn.click(
547
- analyze_subregion,
548
- inputs=[seq_state, header_state, region_start, region_end],
549
- outputs=[subregion_info, subregion_img, subregion_hist_img]
550
  )
551
-
552
- gr.Markdown("""
553
- ### Interface Features
554
- - **Overall Classification** (human vs non-human) using k-mer frequencies.
555
- - **SHAP Analysis** to see which k-mers push classification toward or away from human.
556
- - **White-Centered SHAP Gradient**:
557
- - Negative (blue), 0 (white), Positive (red), with symmetrical color range around 0.
558
- - **Identify Subregions** with the strongest push for human or non-human.
559
- - **Subregion Exploration**:
560
- - Local SHAP heatmap & histogram
561
- - GC content
562
- - Fraction of positions pushing human vs. non-human
563
- - Simple logic-based classification
564
- """)
565
 
566
  if __name__ == "__main__":
567
- iface.launch()
 
6
  import torch.nn as nn
7
  import matplotlib.pyplot as plt
8
  import matplotlib.colors as mcolors
9
+ import seaborn as sns
10
  from PIL import Image
11
+ import io
12
+ import pandas as pd
13
+ from typing import Tuple, List, Dict, Any
14
+ from dataclasses import dataclass
15
+ import plotly.graph_objects as go
16
+ import plotly.express as px
17
+ from plotly.subplots import make_subplots
18
 
19
  ###############################################################################
20
+ # 1. DATA STRUCTURES & MODEL
21
  ###############################################################################
22
 
23
+ @dataclass
24
+ class SequenceAnalysis:
25
+ """Container for sequence analysis results"""
26
+ header: str
27
+ sequence: str
28
+ length: int
29
+ gc_content: float
30
+ classification: str
31
+ human_prob: float
32
+ nonhuman_prob: float
33
+ shap_values: np.ndarray
34
+ shap_means: np.ndarray
35
+ extreme_regions: Dict[str, Dict[str, Any]]
36
+
37
  class VirusClassifier(nn.Module):
38
  def __init__(self, input_shape: int):
39
  super(VirusClassifier, self).__init__()
 
50
  nn.GELU(),
51
  nn.Linear(32, 2)
52
  )
53
+
54
  def forward(self, x):
55
  return self.network(x)
56
 
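For orientation, a minimal usage sketch of the classifier (illustrative only, assuming the full class definition in app.py): downstream code instantiates it with input_shape=256, matching the 4-mer vocabulary (4**4), and treats index 1 of the output as the "human" class.

```python
import torch

model = VirusClassifier(input_shape=256)    # 256 = 4**4 possible 4-mers
model.eval()
with torch.no_grad():
    x = torch.randn(1, 256)                 # stands in for one scaled k-mer frequency vector
    probs = torch.softmax(model(x), dim=1)  # shape (1, 2); index 1 is the 'human' class
print(probs[0, 1].item())
```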
57
  ###############################################################################
58
+ # 2. SEQUENCE PROCESSING
59
  ###############################################################################
60
 
61
+ def parse_fasta(text: str) -> List[Tuple[str, str]]:
62
+ """Parse FASTA formatted text with improved robustness"""
63
  sequences = []
64
  current_header = None
65
  current_sequence = []
 
74
  current_header = line[1:]
75
  current_sequence = []
76
  else:
77
+ # Filter out non-ACGT characters and convert to uppercase
78
+ filtered_line = ''.join(c for c in line.upper() if c in 'ACGT')
79
+ current_sequence.append(filtered_line)
80
+
81
  if current_header:
82
  sequences.append((current_header, ''.join(current_sequence)))
83
  return sequences
84
 
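A small usage example for the parser above (the input string is illustrative); the new version uppercases lines and drops non-ACGT characters.

```python
records = parse_fasta(">seq1\nacgtACGTnn\n>seq2\nTTTT\n")
print(records)
# [('seq1', 'ACGTACGT'), ('seq2', 'TTTT')]  -- lowercase is normalized, non-ACGT bases are dropped
```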
85
  def sequence_to_kmer_vector(sequence: str, k: int = 4) -> np.ndarray:
86
+ """Convert sequence to k-mer frequency vector with optimizations"""
87
  kmers = [''.join(p) for p in product("ACGT", repeat=k)]
88
  kmer_dict = {km: i for i, km in enumerate(kmers)}
89
  vec = np.zeros(len(kmers), dtype=np.float32)
90
 
91
+ # Use sliding window for efficiency
92
  for i in range(len(sequence) - k + 1):
93
  kmer = sequence[i:i+k]
94
+ if kmer in kmer_dict: # Handle non-ACGT kmers
95
  vec[kmer_dict[kmer]] += 1
96
+
97
+ # Normalize
98
  total_kmers = len(sequence) - k + 1
99
  if total_kmers > 0:
100
  vec = vec / total_kmers
101
+
102
  return vec
103
 
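A quick sanity check of the vectorizer above, using a toy sequence:

```python
import numpy as np

vec = sequence_to_kmer_vector("ACGTACGT", k=4)
print(vec.shape)            # (256,) -- one slot per 4-mer
print(np.flatnonzero(vec))  # the 4 distinct 4-mers seen (ACGT occurs in two of the 5 windows)
print(float(vec.sum()))     # ~1.0  -- counts are normalized by the number of windows
```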
104
+ def compute_gc_content(sequence: str) -> float:
105
+ """Compute GC content percentage"""
106
+ if not sequence:
107
+ return 0.0
108
+ gc_count = sum(1 for base in sequence if base in 'GC')
109
+ return (gc_count / len(sequence)) * 100.0
110
+
111
  ###############################################################################
112
+ # 3. SHAP & ANALYSIS
113
  ###############################################################################
114
 
115
+ def calculate_shap_values(model: nn.Module, x_tensor: torch.Tensor) -> Tuple[np.ndarray, float]:
116
+ """Calculate SHAP values using ablation with improved efficiency"""
117
  model.eval()
118
  with torch.no_grad():
 
119
  baseline_output = model(x_tensor)
120
  baseline_probs = torch.softmax(baseline_output, dim=1)
121
+ baseline_prob = baseline_probs[0, 1].item()
122
 
 
123
  shap_values = []
124
  x_zeroed = x_tensor.clone()
125
+
126
+ # Vectorized computation where possible
127
  for i in range(x_tensor.shape[1]):
 
128
  x_zeroed[0, i] = 0.0
129
  output = model(x_zeroed)
130
  probs = torch.softmax(output, dim=1)
131
+ impact = baseline_prob - probs[0, 1].item()
 
132
  shap_values.append(impact)
133
+ x_zeroed[0, i] = x_tensor[0, i]
134
+
135
  return np.array(shap_values), baseline_prob
136
 
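A toy illustration of the ablation approach above, using a stand-in model and made-up feature values (not the real classifier or data):

```python
import torch
import torch.nn as nn

toy_model = nn.Linear(4, 2)                    # 4 features, 2 classes, random weights
x = torch.tensor([[0.2, -0.1, 0.4, 0.3]])
shap, p_human = calculate_shap_values(toy_model, x)
# shap[i] = P(human | all features) - P(human | feature i zeroed):
# positive entries push the prediction toward 'human', negative ones away from it.
print(p_human, shap)
```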
137
+ def compute_positionwise_scores(sequence: str, shap_values: np.ndarray, k: int = 4) -> np.ndarray:
138
+ """Compute per-base SHAP scores with optimized memory usage"""
139
  kmers = [''.join(p) for p in product("ACGT", repeat=k)]
140
  kmer_dict = {km: i for i, km in enumerate(kmers)}
141
 
 
143
  shap_sums = np.zeros(seq_len, dtype=np.float32)
144
  coverage = np.zeros(seq_len, dtype=np.float32)
145
 
146
+ # Vectorized operations where possible
147
  for i in range(seq_len - k + 1):
148
  kmer = sequence[i:i+k]
149
  if kmer in kmer_dict:
150
+ idx = kmer_dict[kmer]
151
+ shap_sums[i:i+k] += shap_values[idx]
152
+ coverage[i:i+k] += 1
153
+
154
  with np.errstate(divide='ignore', invalid='ignore'):
155
  shap_means = np.where(coverage > 0, shap_sums / coverage, 0.0)
156
+
157
  return shap_means
158
 
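An illustration of the per-base averaging above, with hypothetical SHAP values:

```python
import numpy as np

shap_vals = np.zeros(256, dtype=np.float32)  # one value per 4-mer
shap_vals[0] = 1.0                           # pretend only 'AAAA' carries signal
per_base = compute_positionwise_scores("AAAAAC", shap_vals, k=4)
print(per_base)
# Each base receives the mean SHAP of the 4-mers covering it, so the leading A's
# score high while the trailing C, covered only by zero-valued k-mers, stays at 0.
```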
159
+ def find_extreme_regions(shap_means: np.ndarray, window_size: int = 500) -> Dict[str, Dict[str, Any]]:
160
+ """Find regions with extreme SHAP values using efficient sliding window"""
161
+ if len(shap_means) < window_size:
162
+ window_size = len(shap_means)
163
+
164
+ # Compute cumulative sum for efficient sliding window
165
+ cumsum = np.concatenate(([0.0], np.cumsum(shap_means)))  # prefix sums with a leading zero
166
+
167
+ # Sliding window calculation
168
+ window_avgs = (cumsum[window_size:] - cumsum[:-window_size]) / window_size
169
+
170
+ max_idx = np.argmax(window_avgs)
171
+ min_idx = np.argmin(window_avgs)
172
+
173
+ return {
174
+ "human": {
175
+ "start": max_idx,
176
+ "end": max_idx + window_size,
177
+ "avg_shap": float(window_avgs[max_idx])
178
+ },
179
+ "nonhuman": {
180
+ "start": min_idx,
181
+ "end": min_idx + window_size,
182
+ "avg_shap": float(window_avgs[min_idx])
183
+ }
184
+ }
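The sliding window relies on a standard prefix-sum identity; a minimal check with toy numbers (not taken from the app):

```python
import numpy as np

scores = np.array([0.1, -0.2, 0.4, 0.3, -0.1], dtype=np.float32)
w = 3
csum = np.concatenate(([0.0], np.cumsum(scores)))  # prefix sums with a leading zero
window_avgs = (csum[w:] - csum[:-w]) / w           # average of the window starting at each position s
naive = [scores[s:s + w].mean() for s in range(len(scores) - w + 1)]
assert np.allclose(window_avgs, naive)
```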
185
 
186
  ###############################################################################
187
+ # 4. VISUALIZATION
188
  ###############################################################################
189
 
190
+ def create_genome_overview_plot(analysis: SequenceAnalysis) -> go.Figure:
191
+ """Create an interactive genome overview using Plotly"""
192
+ fig = make_subplots(
193
+ rows=2, cols=1,
194
+ subplot_titles=("SHAP Values Along Genome", "GC Content"),
195
+ row_heights=[0.7, 0.3],
196
+ vertical_spacing=0.1
197
  )
198
 
199
+ # SHAP trace
200
+ fig.add_trace(
201
+ go.Scatter(
202
+ x=list(range(len(analysis.shap_means))),
203
+ y=analysis.shap_means,
204
+ name="SHAP",
205
+ line=dict(color='rgba(31, 119, 180, 0.8)'),
206
+ hovertemplate="Position: %{x}<br>SHAP: %{y:.4f}<extra></extra>"
207
+ ),
208
+ row=1, col=1
209
+ )
210
+
211
+ # Highlight extreme regions
212
+ for region_type, region in analysis.extreme_regions.items():
213
+ color = 'rgba(255, 0, 0, 0.2)' if region_type == 'human' else 'rgba(0, 0, 255, 0.2)'
214
+ fig.add_vrect(
215
+ x0=region['start'],
216
+ x1=region['end'],
217
+ fillcolor=color,
218
+ opacity=0.5,
219
+ layer="below",
220
+ line_width=0,
221
+ row=1, col=1
222
+ )
223
 
224
+ # Calculate rolling GC content
225
+ window = 100
226
+ gc_content = np.array([
227
+ compute_gc_content(analysis.sequence[i:i+window])
228
+ for i in range(0, len(analysis.sequence) - window + 1, window)
229
+ ])
230
+
231
+ # GC content trace
232
+ fig.add_trace(
233
+ go.Scatter(
234
+ x=np.arange(len(gc_content)) * window,
235
+ y=gc_content,
236
+ name="GC%",
237
+ line=dict(color='rgba(44, 160, 44, 0.8)'),
238
+ hovertemplate="Position: %{x}<br>GC%: %{y:.1f}%<extra></extra>"
239
+ ),
240
+ row=2, col=1
241
+ )
242
+
243
+ # Update layout
244
+ fig.update_layout(
245
+ height=800,
246
+ title=dict(
247
+ text=f"Genome Analysis Overview<br><sub>{analysis.header}</sub>",
248
+ x=0.5
249
+ ),
250
+ showlegend=False,
251
+ plot_bgcolor='white'
252
+ )
253
+
254
+ # Update axes
255
+ fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='lightgray')
256
+ fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='lightgray')
257
 
258
  return fig
259
 
260
+ def create_kmer_importance_plot(analysis: SequenceAnalysis, top_k: int = 10) -> go.Figure:
261
+ """Create interactive k-mer importance plot using Plotly"""
262
+ # Get top k-mers by absolute SHAP value
263
+ kmers = [''.join(p) for p in product("ACGT", repeat=4)]
264
+ indices = np.argsort(np.abs(analysis.shap_values))[-top_k:]
265
+
266
+ # Create DataFrame for plotting
267
+ df = pd.DataFrame({
268
+ 'k-mer': [kmers[i] for i in indices],
269
+ 'SHAP': analysis.shap_values[indices]
270
+ })
271
+
272
+ # Create plot
273
+ fig = px.bar(
274
+ df,
275
+ x='SHAP',
276
+ y='k-mer',
277
+ orientation='h',
278
+ color='SHAP',
279
+ color_continuous_scale='RdBu',
280
+ title=f'Top {top_k} Most Influential k-mers'
281
+ )
282
+
283
+ # Update layout
284
+ fig.update_layout(
285
+ height=400,
286
+ plot_bgcolor='white',
287
+ yaxis_title='',
288
+ xaxis_title='SHAP Value',
289
+ coloraxis_showscale=False
290
+ )
291
+
292
  return fig
293
 
294
+ def create_shap_distribution_plot(analysis: SequenceAnalysis) -> go.Figure:
295
+ """Create SHAP distribution plot using Plotly"""
296
+ fig = go.Figure()
297
+
298
+ # Add histogram
299
+ fig.add_trace(go.Histogram(
300
+ x=analysis.shap_means,
301
+ nbinsx=50,
302
+ name='SHAP Values',
303
+ marker_color='rgba(31, 119, 180, 0.6)'
304
+ ))
305
+
306
+ # Add vertical line at x=0
307
+ fig.add_vline(
308
+ x=0,
309
+ line_dash="dash",
310
+ line_color="red",
311
+ annotation_text="Neutral",
312
+ annotation_position="top"
313
+ )
314
+
315
+ # Update layout
316
+ fig.update_layout(
317
+ title='Distribution of SHAP Values',
318
+ xaxis_title='SHAP Value',
319
+ yaxis_title='Count',
320
+ plot_bgcolor='white',
321
+ height=400
322
+ )
323
+
324
  return fig
325
 
326
  ###############################################################################
327
+ # 5. MAIN ANALYSIS
328
  ###############################################################################
329
 
330
+ def analyze_sequence(
331
+ file_obj: str = None,
332
+ fasta_text: str = "",
333
+ window_size: int = 500,
334
+ model_path: str = 'model.pt',
335
+ scaler_path: str = 'scaler.pkl'
336
+ ) -> SequenceAnalysis:
337
+ """Main sequence analysis function"""
338
  # Handle input
339
  if fasta_text.strip():
340
  text = fasta_text.strip()
341
  elif file_obj is not None:
342
+ with open(file_obj, 'r') as f:
343
+ text = f.read()
344
  else:
345
+ raise ValueError("No input provided")
346
+
347
  # Parse FASTA
348
  sequences = parse_fasta(text)
349
  if not sequences:
350
+ raise ValueError("No valid FASTA sequences found")
351
 
352
  header, seq = sequences[0]
353
+
354
  # Load model and scaler
355
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
356
+ state_dict = torch.load(model_path, map_location=device)
357
+ model = VirusClassifier(256).to(device)
358
+ model.load_state_dict(state_dict)
359
+
360
+ scaler = joblib.load(scaler_path)
361
+
362
+ # Process sequence
363
  freq_vector = sequence_to_kmer_vector(seq)
364
  scaled_vector = scaler.transform(freq_vector.reshape(1, -1))
365
  x_tensor = torch.FloatTensor(scaled_vector).to(device)
366
+
367
+ # Get SHAP values and classification
368
  shap_values, prob_human = calculate_shap_values(model, x_tensor)
369
  prob_nonhuman = 1.0 - prob_human
370
 
371
+ # Get per-base SHAP scores
372
+ shap_means = compute_positionwise_scores(seq, shap_values)
373
+
374
+ # Find extreme regions
375
+ extreme_regions = find_extreme_regions(shap_means, window_size)
376
+
377
+ # Create analysis object
378
+ return SequenceAnalysis(
379
+ header=header,
380
+ sequence=seq,
381
+ length=len(seq),
382
+ gc_content=compute_gc_content(seq),
383
+ classification="Human" if prob_human > 0.5 else "Non-human",
384
+ human_prob=prob_human,
385
+ nonhuman_prob=prob_nonhuman,
386
+ shap_values=shap_values,
387
+ shap_means=shap_means,
388
+ extreme_regions=extreme_regions
389
  )
390
 
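A hedged, script-style usage sketch of analyze_sequence ("example.fasta" is a placeholder; model.pt and scaler.pkl are assumed to sit next to app.py, as in the loading code above):

```python
analysis = analyze_sequence(file_obj="example.fasta", window_size=500)
print(analysis.classification, f"P(human)={analysis.human_prob:.3f}", f"GC={analysis.gc_content:.1f}%")
print("Most human-like window:", analysis.extreme_regions["human"])
print("Most non-human-like window:", analysis.extreme_regions["nonhuman"])
```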
391
  ###############################################################################
392
+ # 6. GRADIO INTERFACE
393
  ###############################################################################
394
 
395
+ def create_interface():
396
+ """Create enhanced Gradio interface with improved layout and interactivity"""
397
+
398
+ def process_sequence(
399
+ file_obj: str,
400
+ fasta_text: str,
401
+ window_size: int,
402
+ top_kmers: int
403
+ ) -> Tuple[str, go.Figure, go.Figure, go.Figure, SequenceAnalysis]:
404
+ """Process sequence and return formatted results and plots"""
405
+ try:
406
+ # Run analysis
407
+ analysis = analyze_sequence(
408
+ file_obj=file_obj,
409
+ fasta_text=fasta_text,
410
+ window_size=window_size
411
+ )
412
+
413
+ # Format results text
414
+ results = f"""
415
+ ### Sequence Analysis Results
416
+
417
+ **Basic Information**
418
+ - Sequence: {analysis.header}
419
+ - Length: {analysis.length:,} bases
420
+ - GC Content: {analysis.gc_content:.1f}%
421
+
422
+ **Classification**
423
+ - Prediction: {analysis.classification}
424
+ - Human Probability: {analysis.human_prob:.3f}
425
+ - Non-human Probability: {analysis.nonhuman_prob:.3f}
426
+
427
+ **Extreme Regions (window size: {window_size}bp)**
428
+ Most Human-like Region:
429
+ - Position: {analysis.extreme_regions['human']['start']:,} - {analysis.extreme_regions['human']['end']:,}
430
+ - Average SHAP: {analysis.extreme_regions['human']['avg_shap']:.4f}
431
+
432
+ Most Non-human-like Region:
433
+ - Position: {analysis.extreme_regions['nonhuman']['start']:,} - {analysis.extreme_regions['nonhuman']['end']:,}
434
+ - Average SHAP: {analysis.extreme_regions['nonhuman']['avg_shap']:.4f}
435
+ """
436
+
437
+ # Create plots
438
+ genome_plot = create_genome_overview_plot(analysis)
439
+ kmer_plot = create_kmer_importance_plot(analysis, top_kmers)
440
+ dist_plot = create_shap_distribution_plot(analysis)
441
+
442
+ return results, genome_plot, kmer_plot, dist_plot, analysis
443
+
444
+ except Exception as e:
445
+ return f"Error: {str(e)}", None, None, None, None
446
+
447
+ # Create theme and styling
448
+ theme = gr.themes.Soft(
449
+ primary_hue="blue",
450
+ secondary_hue="gray",
451
+ ).set(
452
+ body_text_color="gray-dark",
453
+ background_fill_primary="*gray-50",
454
+ block_shadow="*shadow-sm",
455
+ block_background_fill="white",
456
  )
457
 
458
+ # Build interface
459
+ with gr.Blocks(theme=theme, css="""
460
+ .container { margin: 0 auto; max-width: 1200px; padding: 20px; }
461
+ .results { margin-top: 20px; }
462
+ .plot-container { margin-top: 10px; }
463
+ """) as interface:
464
  gr.Markdown("""
465
+ # 🧬 Enhanced Virus Host Classifier
466
+
467
+ This tool analyzes viral sequences to predict their host (human vs. non-human) and provides detailed visualizations
468
+ of the features influencing this classification. Upload or paste a FASTA sequence to begin.
469
+
470
+ *Using advanced SHAP analysis and interactive visualizations for interpretable results.*
471
  """)
472
 
473
+ # Input section
474
+ with gr.Tab("Sequence Analysis"):
475
+ with gr.Row():
476
+ with gr.Column(scale=1):
477
+ file_input = gr.File(
478
+ label="Upload FASTA File",
479
+ file_types=[".fasta", ".fa", ".txt"],
480
+ type="filepath"
481
+ )
482
+
483
+ text_input = gr.Textbox(
484
+ label="Or Paste FASTA Sequence",
485
+ placeholder=">sequence_name\nACGTACGT...",
486
+ lines=5
487
+ )
488
+
489
+ with gr.Row():
490
+ window_size = gr.Slider(
491
+ minimum=100,
492
+ maximum=5000,
493
+ value=500,
494
+ step=100,
495
+ label="Window Size for Region Analysis"
496
+ )
497
+
498
+ top_kmers = gr.Slider(
499
+ minimum=5,
500
+ maximum=30,
501
+ value=10,
502
+ step=1,
503
+ label="Number of Top k-mers to Display"
504
+ )
505
+
506
+ analyze_btn = gr.Button(
507
+ "🔍 Analyze Sequence",
508
+ variant="primary"
509
+ )
510
+
511
+ # Results section
512
+ with gr.Column(scale=2):
513
+ results_text = gr.Markdown(
514
+ label="Analysis Results"
515
+ )
516
+
517
+ # Plots
518
+ genome_plot = gr.Plot(
519
+ label="Genome Overview"
520
+ )
521
+
522
+ with gr.Row():
523
+ kmer_plot = gr.Plot(
524
+ label="k-mer Importance"
525
+ )
526
+ dist_plot = gr.Plot(
527
+ label="SHAP Distribution"
528
+ )
529
+
530
+ # Help tab
531
+ with gr.Tab("Help & Information"):
532
+ gr.Markdown("""
533
+ ### 📖 How to Use This Tool
534
+
535
+ 1. **Input Your Sequence**
536
+ - Upload a FASTA file or paste your sequence in FASTA format
537
+ - The sequence should contain only ACGT bases (non-standard bases will be filtered)
538
+
539
+ 2. **Adjust Parameters**
540
+ - Window Size: Controls the length of regions analyzed for extreme patterns
541
+ - Top k-mers: Number of most influential sequence patterns to display
542
+
543
+ 3. **Interpret Results**
544
+ - Classification: Predicted host (human vs. non-human)
545
+ - Genome Overview: Interactive plot showing SHAP values and GC content
546
+ - k-mer Importance: Most influential sequence patterns
547
+ - SHAP Distribution: Overall distribution of feature importance
548
+
549
+ ### 🎨 Visualization Guide
550
+
551
+ - **SHAP Values**:
552
+ - Positive (red) = pushing toward human classification
553
+ - Negative (blue) = pushing toward non-human classification
554
+ - Zero (white) = neutral impact
555
+
556
+ - **Extreme Regions**:
557
+ - Highlighted in the genome overview plot
558
+ - Red regions = most human-like
559
+ - Blue regions = most non-human-like
560
+
561
+ ### 🔬 Technical Details
562
+
563
+ - The classifier uses k-mer frequencies (k=4) as features
564
+ - SHAP values are calculated using an ablation-based approach
565
+ - GC content is calculated using a sliding window
566
+ """)
567
+
568
+ # Connect components
569
+ sequence_state = gr.State()
570
 
571
+ analyze_btn.click(
572
+ process_sequence,
573
+ inputs=[
574
+ file_input,
575
+ text_input,
576
+ window_size,
577
+ top_kmers
578
+ ],
579
+ outputs=[
580
+ results_text,
581
+ [genome_plot, kmer_plot, dist_plot],
582
+ sequence_state
583
+ ]
584
  )
585
+
586
+ return interface
587
+
588
+ ###############################################################################
589
+ # 7. MAIN ENTRY POINT
590
+ ###############################################################################
591
 
592
  if __name__ == "__main__":
593
+ iface = create_interface()
594
+ iface.launch(
595
+ share=True,
596
+ server_name="0.0.0.0",
597
+ show_error=True
598
+ )
599
+ #