Update app.py
app.py
CHANGED
@@ -67,6 +67,9 @@ def parse_fasta(text):
     return sequences
 
 def sequence_to_kmer_vector(sequence: str, k: int = 4) -> np.ndarray:
+    """
+    Convert a sequence into a frequency vector of all possible 4-mer combinations.
+    """
     kmers = [''.join(p) for p in product("ACGT", repeat=k)]
     kmer_dict = {km: i for i, km in enumerate(kmers)}
     vec = np.zeros(len(kmers), dtype=np.float32)
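The body of `sequence_to_kmer_vector` is only partially visible in this hunk. As a reference point, a minimal counting implementation consistent with the added docstring could look like the sketch below; the counting loop and the normalization step are assumptions, not the app's confirmed code. Note that 4**4 = 256 k-mers, which matches the `VirusClassifier(256)` input size used later in app.py.

```python
import numpy as np
from itertools import product

def kmer_vector_sketch(sequence: str, k: int = 4) -> np.ndarray:
    kmers = [''.join(p) for p in product("ACGT", repeat=k)]
    index = {km: i for i, km in enumerate(kmers)}
    vec = np.zeros(len(kmers), dtype=np.float32)
    for i in range(len(sequence) - k + 1):
        kmer = sequence[i:i + k]
        if kmer in index:          # skips k-mers containing N or other symbols
            vec[index[kmer]] += 1.0
    if vec.sum() > 0:
        vec /= vec.sum()           # frequencies rather than raw counts (assumption)
    return vec

print(kmer_vector_sketch("ACGTACGTACGT").shape)   # (256,)
```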
@@ -84,11 +87,15 @@ def sequence_to_kmer_vector(sequence: str, k: int = 4) -> np.ndarray:
 ###############################################################################
 
 def calculate_shap_values(model, x_tensor):
+    """
+    A simple ablation-based SHAP approximation. Zero out each position
+    and measure the impact on the 'human' probability.
+    """
     model.eval()
     with torch.no_grad():
         baseline_output = model(x_tensor)
         baseline_probs = torch.softmax(baseline_output, dim=1)
-        baseline_prob = baseline_probs[0, 1].item()  #
+        baseline_prob = baseline_probs[0, 1].item()  # Probability for 'human'
     shap_values = []
     x_zeroed = x_tensor.clone()
     for i in range(x_tensor.shape[1]):
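The added docstring names an ablation-style approximation: zero out one feature at a time and record how the 'human' probability moves. A self-contained sketch of that idea (not necessarily the exact loop inside `calculate_shap_values`) is:

```python
import torch

def ablation_attributions(model, x_tensor, target_class=1):
    """Zero one input feature at a time; record the drop in P(target_class)."""
    model.eval()
    with torch.no_grad():
        baseline = torch.softmax(model(x_tensor), dim=1)[0, target_class].item()
        scores = []
        for i in range(x_tensor.shape[1]):
            x_ablated = x_tensor.clone()
            x_ablated[0, i] = 0.0
            prob = torch.softmax(model(x_ablated), dim=1)[0, target_class].item()
            scores.append(baseline - prob)   # positive = feature pushed toward target_class
    return scores
```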
@@ -106,6 +113,9 @@ def calculate_shap_values(model, x_tensor):
 ###############################################################################
 
 def compute_positionwise_scores(sequence, shap_values, k=4):
+    """
+    Distribute each k-mer's SHAP contribution across its k underlying positions.
+    """
     kmers = [''.join(p) for p in product("ACGT", repeat=k)]
     kmer_dict = {km: i for i, km in enumerate(kmers)}
     seq_len = len(sequence)
@@ -126,6 +136,9 @@ def compute_positionwise_scores(sequence, shap_values, k=4):
 ###############################################################################
 
 def find_extreme_subregion(shap_means, window_size=500, mode="max"):
+    """
+    Use a sliding window to find the subregion with the highest (or lowest) average SHAP.
+    """
     n = len(shap_means)
     if n == 0:
         return (0, 0, 0.0)
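For reference, the sliding-window search described in the new docstring can be done with a cumulative sum; this is a sketch of the technique, with no claim that `find_extreme_subregion` is implemented exactly this way:

```python
import numpy as np

def best_window(shap_means: np.ndarray, window_size: int = 500, mode: str = "max"):
    """Return (start, end, mean) of the window with the extreme average value."""
    n = len(shap_means)
    if n == 0:
        return (0, 0, 0.0)
    w = min(window_size, n)
    csum = np.concatenate(([0.0], np.cumsum(shap_means)))
    window_means = (csum[w:] - csum[:-w]) / w          # mean of every length-w window
    idx = int(np.argmax(window_means)) if mode == "max" else int(np.argmin(window_means))
    return (idx, idx + w, float(window_means[idx]))
```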
@@ -152,6 +165,9 @@ def find_extreme_subregion(shap_means, window_size=500, mode="max"):
 ###############################################################################
 
 def fig_to_image(fig):
+    """
+    Render a Matplotlib figure to a PIL Image.
+    """
     buf = io.BytesIO()
     fig.savefig(buf, format='png', bbox_inches='tight', dpi=150)
     buf.seek(0)
@@ -160,10 +176,16 @@ def fig_to_image(fig):
     return img
 
 def get_zero_centered_cmap():
+    """
+    Create a symmetrical (blue-white-red) colormap around zero.
+    """
     colors = [(0.0, 'blue'), (0.5, 'white'), (1.0, 'red')]
     return mcolors.LinearSegmentedColormap.from_list("blue_white_red", colors)
 
 def plot_linear_heatmap(shap_means, title="Per-base SHAP Heatmap", start=None, end=None):
+    """
+    Plot an inline heatmap for the chosen region (or entire genome if start/end not provided).
+    """
     if start is not None and end is not None:
         local_shap = shap_means[start:end]
         subtitle = f" (positions {start}-{end})"
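One common way to keep such a heatmap's color scale symmetric about zero is to pair the blue-white-red colormap with a `TwoSlopeNorm` centered at 0; `plot_linear_heatmap` may instead set `vmin`/`vmax` explicitly, so treat this as an illustrative sketch rather than the app's code:

```python
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

cmap = mcolors.LinearSegmentedColormap.from_list(
    "blue_white_red", [(0.0, 'blue'), (0.5, 'white'), (1.0, 'red')])

shap_means = np.random.randn(1000) * 0.05                 # placeholder data
extent = float(np.max(np.abs(shap_means))) or 1.0
norm = mcolors.TwoSlopeNorm(vmin=-extent, vcenter=0.0, vmax=extent)

fig, ax = plt.subplots(figsize=(8, 1.5))
ax.imshow(shap_means.reshape(1, -1), aspect='auto', cmap=cmap, norm=norm)
ax.set_yticks([])
ax.set_xlabel("Genome position")
```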
@@ -189,6 +211,9 @@ def plot_linear_heatmap(shap_means, title="Per-base SHAP Heatmap", start=None, end=None):
     return fig
 
 def create_importance_bar_plot(shap_values, kmers, top_k=10):
+    """
+    Show bar chart of top k-mers by absolute SHAP value.
+    """
     plt.rcParams.update({'font.size': 10})
     fig = plt.figure(figsize=(10, 5))
     indices = np.argsort(np.abs(shap_values))[-top_k:]
@@ -204,6 +229,9 @@ def create_importance_bar_plot(shap_values, kmers, top_k=10):
     return fig
 
 def plot_shap_histogram(shap_array, title="SHAP Distribution in Region", num_bins=30):
+    """
+    Plot a histogram of SHAP values in some region.
+    """
     fig, ax = plt.subplots(figsize=(6, 4))
     ax.hist(shap_array, bins=num_bins, color='gray', edgecolor='black')
     ax.axvline(0, color='red', linestyle='--', label='0.0')
@@ -215,8 +243,11 @@ def plot_shap_histogram(shap_array, title="SHAP Distribution in Region", num_bins=30):
     return fig
 
 def compute_gc_content(sequence):
+    """
+    Compute GC content (%) for a given sequence.
+    """
     if not sequence:
-        return 0
+        return 0.0
     gc_count = sequence.count('G') + sequence.count('C')
     return (gc_count / len(sequence)) * 100.0
 
@@ -225,6 +256,11 @@ def compute_gc_content(sequence):
 ###############################################################################
 
 def analyze_sequence(file_obj, top_kmers=10, fasta_text="", window_size=500):
+    """
+    Perform the main classification, SHAP analysis, and extreme subregion detection
+    for a single sequence.
+    """
+    # 1) Read input
     if fasta_text.strip():
         text = fasta_text.strip()
     elif file_obj is not None:
@@ -236,14 +272,15 @@ def analyze_sequence(file_obj, top_kmers=10, fasta_text="", window_size=500):
     else:
         return ("Please provide a FASTA sequence.", None, None, None, None, None)
 
+    # 2) Parse FASTA
     sequences = parse_fasta(text)
     if not sequences:
         return ("No valid FASTA sequences found.", None, None, None, None, None)
     header, seq = sequences[0]
 
+    # 3) Load model, scaler, and run inference
     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
     try:
-        # IMPORTANT: adjust how you load your model as needed
         state_dict = torch.load('model.pt', map_location=device)
         model = VirusClassifier(256).to(device)
         model.load_state_dict(state_dict)
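The hunks shown here do not include how the loaded model is actually fed. A hedged sketch of the inference step, assuming the `sequence_to_kmer_vector` helper defined earlier in app.py and deliberately omitting any feature scaling the real pipeline applies, would be:

```python
import torch

def classify_sequence(model, seq: str, device) -> tuple:
    """Sketch: vectorize a sequence, run the classifier, return (prob_human, prob_nonhuman).

    Assumes sequence_to_kmer_vector() from app.py; the scaler mentioned in the
    step-3 comment above is not shown in the diff and is left out here.
    """
    vec = sequence_to_kmer_vector(seq, k=4).reshape(1, -1)   # 256-dim frequency vector
    x_tensor = torch.tensor(vec, dtype=torch.float32).to(device)
    with torch.no_grad():
        probs = torch.softmax(model(x_tensor), dim=1)
    return probs[0, 1].item(), probs[0, 0].item()
```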
@@ -260,10 +297,12 @@ def analyze_sequence(file_obj, top_kmers=10, fasta_text="", window_size=500):
     classification = "Human" if prob_human > 0.5 else "Non-human"
     confidence = max(prob_human, prob_nonhuman)
 
+    # 4) Per-base SHAP & subregion detection
     shap_means = compute_positionwise_scores(seq, shap_values, k=4)
     max_start, max_end, max_avg = find_extreme_subregion(shap_means, window_size, mode="max")
     min_start, min_end, min_avg = find_extreme_subregion(shap_means, window_size, mode="min")
 
+    # 5) Prepare result text
     results_text = (
         f"Sequence: {header}\n"
         f"Length: {len(seq):,} bases\n"
@@ -277,6 +316,7 @@ def analyze_sequence(file_obj, top_kmers=10, fasta_text="", window_size=500):
         f"Start: {min_start}, End: {min_end}, Avg SHAP: {min_avg:.4f}"
     )
 
+    # 6) Create bar & heatmap figures
     kmers = [''.join(p) for p in product("ACGT", repeat=4)]
     bar_fig = create_importance_bar_plot(shap_values, kmers, top_kmers)
     bar_img = fig_to_image(bar_fig)
@@ -284,10 +324,10 @@ def analyze_sequence(file_obj, top_kmers=10, fasta_text="", window_size=500):
     heatmap_fig = plot_linear_heatmap(shap_means, title="Genome-wide SHAP")
     heatmap_img = fig_to_image(heatmap_fig)
 
-    #
-    # Here, we'll simply return None for the file download:
+    # 7) Build the "state" dictionary so we can do subregion analysis
     state_dict_out = {"seq": seq, "shap_means": shap_means}
 
+    # Return 6 items to match your Gradio output
     return (results_text, bar_img, heatmap_img, state_dict_out, header, None)
 
 ###############################################################################
@@ -295,6 +335,9 @@ def analyze_sequence(file_obj, top_kmers=10, fasta_text="", window_size=500):
 ###############################################################################
 
 def analyze_subregion(state, header, region_start, region_end):
+    """
+    Examine a subregion’s SHAP distribution, GC content, etc.
+    """
     if not state or "seq" not in state or "shap_means" not in state:
         return ("No sequence data found. Please run Step 1 first.", None, None, None)
     seq = state["seq"]
@@ -305,18 +348,22 @@ def analyze_subregion(state, header, region_start, region_end):
     region_end = max(0, min(region_end, len(seq)))
     if region_end <= region_start:
         return ("Invalid region range. End must be > Start.", None, None, None)
+
     region_seq = seq[region_start:region_end]
     region_shap = shap_means[region_start:region_end]
+
     gc_percent = compute_gc_content(region_seq)
     avg_shap = float(np.mean(region_shap))
     positive_fraction = np.mean(region_shap > 0)
     negative_fraction = np.mean(region_shap < 0)
+
     if avg_shap > 0.05:
         region_classification = "Likely pushing toward human"
     elif avg_shap < -0.05:
         region_classification = "Likely pushing toward non-human"
     else:
         region_classification = "Near neutral (no strong push)"
+
     region_info = (
         f"Analyzing subregion of {header} from {region_start} to {region_end}\n"
         f"Region length: {len(region_seq)} bases\n"
@@ -326,30 +373,29 @@ def analyze_subregion(state, header, region_start, region_end):
         f"Fraction with SHAP < 0 (toward non-human): {negative_fraction:.2f}\n"
         f"Subregion interpretation: {region_classification}\n"
     )
+
     heatmap_fig = plot_linear_heatmap(shap_means, title="Subregion SHAP", start=region_start, end=region_end)
     heatmap_img = fig_to_image(heatmap_fig)
+
     hist_fig = plot_shap_histogram(region_shap, title="SHAP Distribution in Subregion")
     hist_img = fig_to_image(hist_fig)
-
-    #
+
+    # Return 4 items to match your Gradio output
     return (region_info, heatmap_img, hist_img, None)
 
 ###############################################################################
-# 9. COMPARISON ANALYSIS FUNCTIONS
+# 9. COMPARISON ANALYSIS FUNCTIONS (Step 4)
 ###############################################################################
 
-def get_zero_centered_cmap():
-    """Create a zero-centered blue-white-red colormap"""
-    colors = [(0.0, 'blue'), (0.5, 'white'), (1.0, 'red')]
-    return mcolors.LinearSegmentedColormap.from_list("blue_white_red", colors)
-
 def compute_shap_difference(shap1_norm, shap2_norm):
-    """
+    """
+    Compute the SHAP difference (Seq2 - Seq1).
+    """
     return shap2_norm - shap1_norm
 
 def plot_comparative_heatmap(shap_diff, title="SHAP Difference Heatmap"):
     """
-    Plot heatmap using relative positions
+    Plot a 1D heatmap of differences using relative positions 0-100%.
     """
     heatmap_data = shap_diff.reshape(1, -1)
     extent = max(abs(np.min(shap_diff)), abs(np.max(shap_diff)))
@@ -378,7 +424,7 @@ def plot_comparative_heatmap(shap_diff, title="SHAP Difference Heatmap"):
 
 def plot_shap_histogram(shap_array, title="SHAP Distribution", num_bins=30):
     """
-    Plot histogram of SHAP values with
+    Plot a histogram of SHAP values with optional # of bins.
     """
     fig, ax = plt.subplots(figsize=(6, 4))
     ax.hist(shap_array, bins=num_bins, color='gray', edgecolor='black', alpha=0.7)
@@ -392,18 +438,16 @@ def plot_shap_histogram(shap_array, title="SHAP Distribution", num_bins=30):
 
 def calculate_adaptive_parameters(len1, len2):
     """
-
-    Returns: (num_points, smooth_window, resolution_factor)
+    Choose smoothing & interpolation parameters automatically based on length difference.
     """
     length_diff = abs(len1 - len2)
     max_length = max(len1, len2)
     min_length = min(len1, len2)
     length_ratio = min_length / max_length
 
     # Base number of points
     base_points = min(2000, max(500, max_length // 100))
 
-    # Adjust parameters based on sequence properties
     if length_diff < 500:
         resolution_factor = 2.0
         num_points = min(3000, base_points * 2)
@@ -421,29 +465,22 @@ def calculate_adaptive_parameters(len1, len2):
         num_points = max(500, base_points // 2)
         smooth_window = max(100, length_diff // 500)
 
-    # Adjust window size based on length ratio
     smooth_window = int(smooth_window * (1 + (1 - length_ratio)))
-
     return int(num_points), int(smooth_window), resolution_factor
 
 def sliding_window_smooth(values, window_size=50):
     """
-
+    A custom smoothing approach, including exponential decay at edges.
     """
     if window_size < 3:
         return values
-
-    # Create window with exponential decay at edges
     window = np.ones(window_size)
     decay = np.exp(-np.linspace(0, 3, window_size // 2))
     window[:window_size // 2] = decay
     window[-(window_size // 2):] = decay[::-1]
     window = window / window.sum()
 
-    # Apply convolution
     smoothed = np.convolve(values, window, mode='valid')
-
-    # Handle edges
     pad_size = len(values) - len(smoothed)
     pad_left = pad_size // 2
     pad_right = pad_size - pad_left
@@ -457,16 +494,13 @@ def sliding_window_smooth(values, window_size=50):
 
 def normalize_shap_lengths(shap1, shap2):
     """
-
+    Smooth, interpolate, and return arrays of the same length for direct comparison.
     """
-    # Calculate adaptive parameters
     num_points, smooth_window, _ = calculate_adaptive_parameters(len(shap1), len(shap2))
 
-    # Apply initial smoothing
     shap1_smooth = sliding_window_smooth(shap1, smooth_window)
     shap2_smooth = sliding_window_smooth(shap2, smooth_window)
 
-    # Create relative positions and interpolate
     x1 = np.linspace(0, 1, len(shap1_smooth))
     x2 = np.linspace(0, 1, len(shap2_smooth))
     x_norm = np.linspace(0, 1, num_points)
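The interpolation step that follows these relative-position axes is not shown in this hunk; resampling two unequal-length profiles onto a shared 0..1 axis is typically done with `np.interp`. A standalone sketch of that idea (app.py may use a different interpolator):

```python
import numpy as np

def interp_to_common_grid(shap1: np.ndarray, shap2: np.ndarray, num_points: int = 1000):
    """Resample two profiles onto the same relative 0..1 axis."""
    x1 = np.linspace(0, 1, len(shap1))
    x2 = np.linspace(0, 1, len(shap2))
    x_norm = np.linspace(0, 1, num_points)
    return np.interp(x_norm, x1, shap1), np.interp(x_norm, x2, shap2)
```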
@@ -478,7 +512,8 @@ def normalize_shap_lengths(shap1, shap2):
 
 def analyze_sequence_comparison(file1, file2, fasta1="", fasta2=""):
     """
-    Compare two sequences
+    Compare two sequences using the previously defined analysis pipeline
+    and produce difference visualizations & stats.
     """
     try:
         # Analyze first sequence
@@ -491,26 +526,23 @@ def analyze_sequence_comparison(file1, file2, fasta1="", fasta2=""):
         if isinstance(res2[0], str) and "Error" in res2[0]:
             return (f"Error in sequence 2: {res2[0]}", None, None, None)
 
-        # Extract SHAP values and sequence info
         shap1 = res1[3]["shap_means"]
         shap2 = res2[3]["shap_means"]
 
-        # Calculate sequence properties
         len1, len2 = len(shap1), len(shap2)
         length_diff = abs(len1 - len2)
         length_ratio = min(len1, len2) / max(len1, len2)
-
-        # Normalize
+
+        # Normalize both to the same length
         shap1_norm, shap2_norm, smooth_window = normalize_shap_lengths(shap1, shap2)
         shap_diff = compute_shap_difference(shap1_norm, shap2_norm)
 
-        #
+        # Compute stats
         base_threshold = 0.05
         adaptive_threshold = base_threshold * (1 + (1 - length_ratio))
         if length_diff > 50000:
             adaptive_threshold *= 1.5
 
-        # Calculate comparison statistics
         avg_diff = np.mean(shap_diff)
         std_diff = np.std(shap_diff)
         max_diff = np.max(shap_diff)
@@ -518,7 +550,7 @@ def analyze_sequence_comparison(file1, file2, fasta1="", fasta2=""):
         substantial_diffs = np.abs(shap_diff) > adaptive_threshold
         frac_different = np.mean(substantial_diffs)
 
-        # Extract
+        # Extract classification from text
         try:
             classification1 = res1[0].split('Classification: ')[1].split('\n')[0].strip()
             classification2 = res2[0].split('Classification: ')[1].split('\n')[0].strip()
@@ -526,7 +558,6 @@ def analyze_sequence_comparison(file1, file2, fasta1="", fasta2=""):
             classification1 = "Unknown"
             classification2 = "Unknown"
 
-        # Format output text
         comparison_text = (
             "Sequence Comparison Results:\n"
             f"Sequence 1: {res1[4]}\n"
@@ -553,14 +584,12 @@ def analyze_sequence_comparison(file1, file2, fasta1="", fasta2=""):
             "- White regions: Similar between sequences"
         )
 
-        # Generate visualizations
         heatmap_fig = plot_comparative_heatmap(
             shap_diff,
             title=f"SHAP Difference Heatmap (window: {smooth_window})"
         )
         heatmap_img = fig_to_image(heatmap_fig)
 
-        # Create histogram with adaptive bins
         num_bins = max(20, min(50, int(np.sqrt(len(shap_diff)))))
         hist_fig = plot_shap_histogram(
             shap_diff,
@@ -569,7 +598,6 @@ def analyze_sequence_comparison(file1, file2, fasta1="", fasta2=""):
         )
         hist_img = fig_to_image(hist_fig)
 
-        # Return 4 outputs (text, image, image, and a file or None for the last)
         return (comparison_text, heatmap_img, hist_img, None)
 
     except Exception as e:
@@ -577,23 +605,55 @@ def analyze_sequence_comparison(file1, file2, fasta1="", fasta2=""):
         return (error_msg, None, None, None)
 
 ###############################################################################
-#
+# 10. ADDITIONAL / ADVANCED VISUALIZATIONS & STATISTICS
 ###############################################################################
 
-
-
-
-
+def n50_length(sequence):
+    """
+    Calculate the N50 for a single continuous sequence (for demonstration).
+    For a single sequence, N50 is typically the length if it's just one piece,
+    but let's do a simplistic example.
+    """
+    # If you had contigs, you'd do a sorted list, cumulative sums, etc.
+    # We'll do a trivial approach here:
+    return len(sequence)  # Because we have only one contiguous region
+
+def sequence_complexity(sequence):
+    """
+    Compute a simple measure of 'sequence complexity'.
+    Here, we define complexity as the Shannon entropy over the nucleotides.
+    """
+    from math import log2
+    length = len(sequence)
+    if length == 0:
+        return 0.0
+    freq = {}
+    for base in sequence:
+        freq[base] = freq.get(base, 0) + 1
+    complexity = 0.0
+    for base, count in freq.items():
+        p = count / length
+        complexity -= p * log2(p)
+    return complexity
+
+def advanced_gene_statistics(gene_shap: np.ndarray, gene_seq: str) -> Dict[str, float]:
+    """
+    Additional stats: N50, complexity, etc.
+    """
+    stats = {}
+    stats['n50'] = len(gene_seq)  # trivial for a single gene region
+    stats['entropy'] = sequence_complexity(gene_seq)
+    stats['avg_shap'] = float(np.mean(gene_shap))
+    stats['max_shap'] = float(np.max(gene_shap)) if len(gene_shap) else 0.0
+    stats['min_shap'] = float(np.min(gene_shap)) if len(gene_shap) else 0.0
+    return stats
+
+###############################################################################
+# 11. GENE FEATURE ANALYSIS
+###############################################################################
 
 def parse_gene_features(text: str) -> List[Dict[str, Any]]:
-    """Parse gene features from text file in FASTA-like format"""
+    """Parse gene features from text file in a FASTA-like format."""
     genes = []
     current_header = None
     current_sequence = []
@@ -602,7 +662,6 @@ def parse_gene_features(text: str) -> List[Dict[str, Any]]:
         line = line.strip()
         if not line:
             continue
-
         if line.startswith('>'):
             if current_header:
                 genes.append({
@@ -614,36 +673,29 @@ def parse_gene_features(text: str) -> List[Dict[str, Any]]:
             current_sequence = []
         else:
             current_sequence.append(line.upper())
-
     if current_header:
         genes.append({
             'header': current_header,
             'sequence': ''.join(current_sequence),
             'metadata': parse_gene_metadata(current_header)
         })
-
     return genes
 
 def parse_gene_metadata(header: str) -> Dict[str, str]:
-    """Extract metadata from gene header"""
+    """Extract metadata from gene header line."""
     metadata = {}
     parts = header.split()
-
     for part in parts:
         if '[' in part and ']' in part:
             key_value = part[1:-1].split('=', 1)
             if len(key_value) == 2:
                 metadata[key_value[0]] = key_value[1]
-
     return metadata
 
 def parse_location(location_str: str) -> Tuple[Optional[int], Optional[int]]:
-    """Parse gene location string, handling
+    """Parse gene location string, handling forward and complement strands."""
     try:
-        # Remove 'complement(' and ')' if present
         clean_loc = location_str.replace('complement(', '').replace(')', '')
-
-        # Split on '..' and convert to integers
         if '..' in clean_loc:
             start, end = map(int, clean_loc.split('..'))
             return start, end
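A usage example for the parsers above, using the feature format documented in the app's Tab 3 help text; the gene name, locus tag, and coordinates are illustrative values, not data from this diff:

```python
example = (
    ">gene_1 [gene=ORF1ab] [locus_tag=GU280_gp01] [location=266..21555]\n"
    "ATGGAGAGCCTTGTCCCTGGTTTCAACGAGAAAACACACGTCCAACTCAGTTTGCCTGT\n"
)
genes = parse_gene_features(example)
meta = genes[0]['metadata']        # {'gene': 'ORF1ab', 'locus_tag': 'GU280_gp01', 'location': '266..21555'}
start, end = parse_location(meta['location'])   # (266, 21555)
```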
@@ -654,48 +706,41 @@ def parse_location(location_str: str) -> Tuple[Optional[int], Optional[int]]:
         return None, None
 
 def compute_gene_statistics(gene_shap: np.ndarray) -> Dict[str, float]:
-    """
+    """Basic statistical measures for gene SHAP values."""
     return {
-        'avg_shap': float(np.mean(gene_shap)),
-        'median_shap': float(np.median(gene_shap)),
-        'std_shap': float(np.std(gene_shap)),
-        'max_shap': float(np.max(gene_shap)),
-        'min_shap': float(np.min(gene_shap)),
-        'pos_fraction': float(np.mean(gene_shap > 0))
+        'avg_shap': float(np.mean(gene_shap)) if len(gene_shap) else 0.0,
+        'median_shap': float(np.median(gene_shap)) if len(gene_shap) else 0.0,
+        'std_shap': float(np.std(gene_shap)) if len(gene_shap) else 0.0,
+        'max_shap': float(np.max(gene_shap)) if len(gene_shap) else 0.0,
+        'min_shap': float(np.min(gene_shap)) if len(gene_shap) else 0.0,
+        'pos_fraction': float(np.mean(gene_shap > 0)) if len(gene_shap) else 0.0
     }
 
 def create_simple_genome_diagram(gene_results: List[Dict[str, Any]], genome_length: int) -> Image.Image:
     """
-
-
+    A quick PIL-based diagram to show genes along the genome.
+    Color intensity = magnitude of SHAP. Red/Blue = sign of SHAP.
     """
-    from PIL import Image, ImageDraw, ImageFont
-
-    # Validate inputs
     if not gene_results or genome_length <= 0:
         img = Image.new('RGB', (800, 100), color='white')
         draw = ImageDraw.Draw(img)
         draw.text((10, 40), "Error: Invalid input data", fill='black')
         return img
-
-    # Ensure all gene coordinates are valid integers
+
     for gene in gene_results:
         gene['start'] = max(0, int(gene['start']))
         gene['end'] = min(genome_length, int(gene['end']))
         if gene['start'] >= gene['end']:
-            print(f"Warning: Invalid coordinates for gene {gene.get('gene_name','?')}
+            print(f"Warning: Invalid coordinates for gene {gene.get('gene_name','?')}")
 
-    # Image dimensions
     width = 1500
     height = 600
     margin = 50
     track_height = 40
 
-    # Create image with white background
     img = Image.new('RGB', (width, height), 'white')
     draw = ImageDraw.Draw(img)
 
-    # Try to load font, fall back to default if unavailable
     try:
         font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 12)
         title_font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", 16)
@@ -703,24 +748,16 @@ def create_simple_genome_diagram(gene_results: List[Dict[str, Any]], genome_length: int) -> Image.Image:
         font = ImageFont.load_default()
         title_font = ImageFont.load_default()
 
-
-    draw.text((margin, margin // 2), "Genome SHAP Analysis", fill='black', font=title_font or font)
+    draw.text((margin, margin // 2), "Genome SHAP Analysis (Simple)", fill='black', font=title_font or font)
 
-    # Draw genome line
     line_y = height // 2
     draw.line([(int(margin), int(line_y)), (int(width - margin), int(line_y))], fill='black', width=2)
 
-    # Calculate scale factor
     scale = float(width - 2 * margin) / float(genome_length)
 
-    #
+    # Scale markers
     num_ticks = 10
-
-        step = 1
-    else:
-        step = genome_length // num_ticks
-
-    # Draw scale markers
+    step = max(1, genome_length // num_ticks)
     for i in range(0, genome_length + 1, step):
         x_coord = margin + i * scale
         draw.line([
@@ -729,50 +766,33 @@ def create_simple_genome_diagram(gene_results: List[Dict[str, Any]], genome_length: int) -> Image.Image:
         ], fill='black', width=1)
         draw.text((int(x_coord - 20), int(line_y + 10)), f"{i:,}", fill='black', font=font)
 
-    # Sort genes by absolute SHAP value for drawing
     sorted_genes = sorted(gene_results, key=lambda x: abs(x['avg_shap']))
-
-    # Draw genes
     for idx, gene in enumerate(sorted_genes):
-        # Calculate position and ensure integers
         start_x = margin + int(gene['start'] * scale)
         end_x = margin + int(gene['end'] * scale)
-
-        # Calculate color based on SHAP value
         avg_shap = gene['avg_shap']
-
-        # Convert shap -> color intensity (0 to 255)
-        # Then clamp to a minimum intensity so it never ends up plain white
         intensity = int(abs(avg_shap) * 500)
         intensity = max(50, min(255, intensity))
 
         if avg_shap > 0:
-            color = (255, 255 - intensity, 255 - intensity)
+            color = (255, 255 - intensity, 255 - intensity)  # Redish
         else:
-            color = (255 - intensity, 255 - intensity, 255)
+            color = (255 - intensity, 255 - intensity, 255)  # Blueish
 
-        # Draw gene rectangle
         draw.rectangle([
             (int(start_x), int(line_y - track_height // 2)),
             (int(end_x), int(line_y + track_height // 2))
         ], fill=color, outline='black')
 
-        # Prepare gene name label
         label = str(gene.get('gene_name','?'))
-
-        # Fallback for label size
         label_mask = font.getmask(label)
         label_width, label_height = label_mask.size
 
-        # Alternate label positions
         if idx % 2 == 0:
             text_y = line_y - track_height - 15
         else:
             text_y = line_y + track_height + 5
 
-        # Decide whether to rotate text based on space
         gene_width = end_x - start_x
         if gene_width > label_width:
             text_x = start_x + (gene_width - label_width) // 2
@@ -784,64 +804,113 @@ def create_simple_genome_diagram(gene_results: List[Dict[str, Any]], genome_length: int) -> Image.Image:
             rotated_img = txt_img.rotate(90, expand=True)
             img.paste(rotated_img, (int(start_x), int(text_y)), rotated_img)
 
-    (
-
+    return img
+
+def create_advanced_genome_diagram(gene_results: List[Dict[str, Any]],
+                                   genome_length: int,
+                                   shap_means: np.ndarray,
+                                   diagram_title: str = "Advanced Genome Diagram") -> Image.Image:
+    """
+    An advanced genome diagram using Biopython's GenomeDiagram.
+    We'll create tracks for genes and a 'SHAP line plot' track.
+    """
+    if not gene_results or genome_length <= 0 or len(shap_means) == 0:
+        # Fallback if data is invalid
+        img = Image.new('RGB', (800, 100), color='white')
+        d = ImageDraw.Draw(img)
+        d.text((10, 40), "Error: Not enough data for advanced diagram", fill='black')
+        return img
+
+    diagram = GenomeDiagram.Diagram(diagram_title)
+    gene_track = diagram.new_track(1, name="Genes", greytrack=False, height=0.5)
+    gene_set = gene_track.new_set()
+
+    # Add each gene as a feature
+    for gene in gene_results:
+        start = max(0, int(gene['start']))
+        end = min(genome_length, int(gene['end']))
+        avg_shap = gene['avg_shap']
+        # Color scale: negative = blue, positive = red
+        intensity = abs(avg_shap) * 500
+        intensity = max(50, min(255, intensity))
+        if avg_shap >= 0:
+            color_hex = colors.Color(1.0, 1.0 - intensity/255.0, 1.0 - intensity/255.0)
+        else:
+            color_hex = colors.Color(1.0 - intensity/255.0, 1.0 - intensity/255.0, 1.0)
+
+        feature = SeqFeature(FeatureLocation(start, end), strand=1)
+        gene_set.add_feature(
+            feature,
+            color=color_hex,
+            label=True,
+            name=str(gene.get('gene_name','?')),
+            label_size=8,
+            label_color=colors.black
+        )
+
+    # Add a track for the SHAP line
+    shap_track = diagram.new_track(2, name="SHAP Score", greytrack=False, height=0.3)
+    shap_set = shap_track.new_set("graph")
+    # We'll plot the entire shap_means array.
+    # X coords = [0..genome_length], Y coords = shap_means
+    # We'll keep negative values below baseline, positive above.
+
+    # Normalizing for visualization
+    max_abs = max(abs(shap_means.min()), abs(shap_means.max()))
+    if max_abs == 0:
+        scaled_shap = [0]*len(shap_means)
+    else:
+        scaled_shap = (shap_means / max_abs * 50).tolist()  # scale to +/- 50
 
+    shap_set.add_graph(
+        data=scaled_shap,
+        name="shap_line",
+        style="line",
+        color=colors.darkgreen,
+        altcolor=colors.red,
+        linewidth=1
+    )
+
+    # Draw to a temporary file
+    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmpf:
+        diagram.draw(format="linear", pagesize='A3', fragments=1, start=0, end=genome_length)
+        diagram.write(tmpf.name, "PDF")
+
+    # Convert PDF to a PIL image (requires poppler or similar).
+    # If you do not have poppler, you can skip PDF -> image or use Cairo.
+    try:
+        import pdf2image
+        pages = pdf2image.convert_from_path(tmpf.name, dpi=100)
+        img = pages[0] if pages else Image.new('RGB', (800, 100), color='white')
+    except ImportError:
+        img = Image.new('RGB', (800, 100), color='white')
+        d = ImageDraw.Draw(img)
+        d.text((10, 40), "pdf2image not installed, can't show advanced diagram as image.", fill='black')
+
+    # Cleanup
+    os.remove(tmpf.name)
     return img
 
 def analyze_gene_features(sequence_file: str,
                           features_file: str,
                           fasta_text: str = "",
-                          features_text: str = ""
-
-
+                          features_text: str = "",
+                          diagram_mode: str = "advanced"
+                          ) -> Tuple[str, Optional[str], Optional[Image.Image]]:
+    """
+    Analyze each gene in the features file, compute gene-level SHAP stats,
+    produce tabular output, and create an optional genome diagram.
+    """
+    # 1) Analyze the entire sequence with the top-level function
     sequence_results = analyze_sequence(sequence_file, top_kmers=10, fasta_text=fasta_text)
     if isinstance(sequence_results[0], str) and "Error" in sequence_results[0]:
         return f"Error in sequence analysis: {sequence_results[0]}", None, None
 
-
+    seq = sequence_results[3]["seq"]
     shap_means = sequence_results[3]["shap_means"]
-
-
+    genome_length = len(seq)
+
+    # 2) Read gene features
     try:
         if features_text.strip():
             genes = parse_gene_features(features_text)
@@ -850,98 +919,100 @@ def analyze_gene_features(sequence_file: str,
             genes = parse_gene_features(f.read())
     except Exception as e:
         return f"Error reading features file: {str(e)}", None, None
-
-    # Analyze each gene
+
     gene_results = []
     for gene in genes:
-
-
-            if not location:
-                continue
-
-            start, end = parse_location(location)
-            if start is None or end is None:
-                continue
-
-            # Get SHAP values for this region
-            gene_shap = shap_means[start:end]
-            stats = compute_gene_statistics(gene_shap)
-
-            gene_results.append({
-                'gene_name': gene['metadata'].get('gene', 'Unknown'),
-                'location': location,
-                'start': start,
-                'end': end,
-                'locus_tag': gene['metadata'].get('locus_tag', ''),
-                'avg_shap': stats['avg_shap'],
-                'median_shap': stats['median_shap'],
-                'std_shap': stats['std_shap'],
-                'max_shap': stats['max_shap'],
-                'min_shap': stats['min_shap'],
-                'pos_fraction': stats['pos_fraction'],
-                'classification': 'Human' if stats['avg_shap'] > 0 else 'Non-human',
-                'confidence': abs(stats['avg_shap'])
-            })
-
-        except Exception as e:
-            print(f"Error processing gene {gene['metadata'].get('gene', 'Unknown')}: {str(e)}")
+        location = gene['metadata'].get('location', '')
+        if not location:
             continue
-
+        start, end = parse_location(location)
+        if start is None or end is None or start >= end or end > genome_length:
+            continue
+        gene_shap = shap_means[start:end]
+        basic_stats = compute_gene_statistics(gene_shap)
     if not gene_results:
         return "No valid genes could be processed", None, None
-
-    #
     sorted_genes = sorted(gene_results, key=lambda x: abs(x['avg_shap']), reverse=True)
-
-    # Create results text
     results_text = "Gene Analysis Results:\n\n"
     results_text += f"Total genes analyzed: {len(gene_results)}\n"
-
-    results_text += f"
 
-    results_text += "Top 10 most distinctive genes:\n"
     for gene in sorted_genes[:10]:
         results_text += (
             f"Gene: {gene['gene_name']}\n"
            f"Location: {gene['location']}\n"
            f"Classification: {gene['classification']} "
            f"(confidence: {gene['confidence']:.4f})\n"
-            f"Average SHAP: {gene['avg_shap']:.4f}\n
        )
-
-    #
-    csv_content = "gene_name,location,avg_shap,median_shap,std_shap,
-    csv_content += "pos_fraction,classification,confidence
-
-    for gene in gene_results:
        csv_content += (
-            f"{
-            f"{
-            f"{
-            f"{
        )
-
-    # Save CSV to temp file
    try:
        temp_dir = tempfile.gettempdir()
        temp_path = os.path.join(temp_dir, f"gene_analysis_{os.urandom(4).hex()}.csv")
-
        with open(temp_path, 'w') as f:
            f.write(csv_content)
    except Exception as e:
        print(f"Error saving CSV: {str(e)}")
        temp_path = None
-
-    # Create
    try:
-
    except Exception as e:
        print(f"Error creating visualization: {str(e)}")
-        # Create error image
        diagram_img = Image.new('RGB', (800, 100), color='white')
        draw = ImageDraw.Draw(diagram_img)
        draw.text((10, 40), f"Error creating visualization: {str(e)}", fill='black')
-
    return results_text, temp_path, diagram_img
 
 ###############################################################################
@@ -949,13 +1020,14 @@ def analyze_gene_features(sequence_file: str,
 ###############################################################################
 
 def prepare_csv_download(data, filename="analysis_results.csv"):
-    """
     if isinstance(data, str):
         return data.encode(), filename
     elif isinstance(data, (list, dict)):
         import csv
         from io import StringIO
-
         output = StringIO()
         writer = csv.DictWriter(output, fieldnames=data[0].keys())
         writer.writeheader()
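For context, the `csv.DictWriter` pattern used in `prepare_csv_download` works like this standalone sketch (the row values are placeholders):

```python
import csv
from io import StringIO

rows = [{"gene_name": "gene_1", "avg_shap": 0.0123},
        {"gene_name": "gene_2", "avg_shap": -0.0045}]
output = StringIO()
writer = csv.DictWriter(output, fieldnames=rows[0].keys())
writer.writeheader()
writer.writerows(rows)
csv_text = output.getvalue()
```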
@@ -979,22 +1051,22 @@ css = """
 
 with gr.Blocks(css=css) as iface:
     gr.Markdown("""
-    # Virus Host Classifier
-    **Step 1**: Predict overall viral sequence origin (human vs non-human) and identify extreme
-    **Step 2**: Explore subregions
-    **Step 3**: Analyze gene features
-    **Step 4**: Compare sequences
-
-    **Color Scale**: Negative SHAP = Blue,
     """)
 
     with gr.Tab("1) Full-Sequence Analysis"):
         with gr.Row():
             with gr.Column(scale=1):
                 file_input = gr.File(label="Upload FASTA file", file_types=[".fasta", ".fa", ".txt"], type="filepath")
-                text_input = gr.Textbox(label="Or paste FASTA
                 top_k = gr.Slider(minimum=5, maximum=30, value=10, step=1, label="Number of top k-mers to display")
-                win_size = gr.Slider(minimum=100, maximum=5000, value=500, step=100, label="Window
                 analyze_btn = gr.Button("Analyze Sequence", variant="primary")
             with gr.Column(scale=2):
                 results_box = gr.Textbox(label="Classification Results", lines=12, interactive=False)
|
|
1013 |
with gr.Tab("2) Subregion Exploration"):
|
1014 |
gr.Markdown("""
|
1015 |
**Subregion Analysis**
|
1016 |
-
|
1017 |
-
The heatmap uses the same Blue-White-Red scale.
|
1018 |
""")
|
1019 |
with gr.Row():
|
1020 |
region_start = gr.Number(label="Region Start", value=0)
|
@@ -1024,7 +1095,7 @@ with gr.Blocks(css=css) as iface:
|
|
1024 |
with gr.Row():
|
1025 |
subregion_img = gr.Image(label="Subregion SHAP Heatmap (B-W-R)")
|
1026 |
subregion_hist_img = gr.Image(label="SHAP Distribution (Histogram)")
|
1027 |
-
download_subregion = gr.File(label="Download Subregion
|
1028 |
|
1029 |
region_btn.click(
|
1030 |
analyze_subregion,
|
@@ -1035,60 +1106,48 @@ with gr.Blocks(css=css) as iface:
|
|
1035 |
with gr.Tab("3) Gene Features Analysis"):
|
1036 |
gr.Markdown("""
|
1037 |
**Analyze Gene Features**
|
1038 |
-
Upload a FASTA file and
|
1039 |
-
|
1040 |
-
|
1041 |
-
>gene_name [gene=X] [locus_tag=Y] [location=start..end] or [location=complement(start..end)]
|
1042 |
-
SEQUENCE
|
1043 |
-
```
|
1044 |
-
The genome viewer will show genes color-coded by their contribution:
|
1045 |
-
- Red: Genes pushing toward human origin
|
1046 |
-
- Blue: Genes pushing toward non-human origin
|
1047 |
-
- Color intensity indicates strength of signal
|
1048 |
""")
|
1049 |
with gr.Row():
|
1050 |
with gr.Column(scale=1):
|
1051 |
-
gene_fasta_file = gr.File(label="
|
1052 |
-
gene_fasta_text = gr.Textbox(label="Or paste FASTA sequence",
|
1053 |
with gr.Column(scale=1):
|
1054 |
-
features_file = gr.File(label="
|
1055 |
-
features_text = gr.Textbox(label="Or paste gene features",
|
1056 |
-
|
1057 |
analyze_genes_btn = gr.Button("Analyze Gene Features", variant="primary")
|
1058 |
gene_results = gr.Textbox(label="Gene Analysis Results", lines=12, interactive=False)
|
1059 |
-
gene_diagram = gr.Image(label="Genome Diagram
|
1060 |
download_gene_results = gr.File(label="Download Gene Analysis (CSV)", visible=True)
|
1061 |
|
1062 |
analyze_genes_btn.click(
|
1063 |
analyze_gene_features,
|
1064 |
-
inputs=[gene_fasta_file, features_file, gene_fasta_text, features_text],
|
1065 |
outputs=[gene_results, download_gene_results, gene_diagram]
|
1066 |
)
|
1067 |
|
1068 |
with gr.Tab("4) Comparative Analysis"):
|
1069 |
gr.Markdown("""
|
1070 |
**Compare Two Sequences**
|
1071 |
-
Upload or paste two FASTA sequences
|
1072 |
-
|
1073 |
-
|
1074 |
-
**Color Scale**:
|
1075 |
-
- Red: Sequence 2 more human-like
|
1076 |
-
- Blue: Sequence 1 more human-like
|
1077 |
-
- White: No substantial difference
|
1078 |
""")
|
1079 |
with gr.Row():
|
1080 |
with gr.Column(scale=1):
|
1081 |
-
file_input1 = gr.File(label="
|
1082 |
-
text_input1 = gr.Textbox(label="Or paste
|
1083 |
with gr.Column(scale=1):
|
1084 |
-
file_input2 = gr.File(label="
|
1085 |
-
text_input2 = gr.Textbox(label="Or paste
|
1086 |
compare_btn = gr.Button("Compare Sequences", variant="primary")
|
1087 |
comparison_text = gr.Textbox(label="Comparison Results", lines=12, interactive=False)
|
1088 |
with gr.Row():
|
1089 |
diff_heatmap = gr.Image(label="SHAP Difference Heatmap")
|
1090 |
diff_hist = gr.Image(label="Distribution of SHAP Differences")
|
1091 |
-
download_comparison = gr.File(label="Download Comparison
|
1092 |
|
1093 |
compare_btn.click(
|
1094 |
analyze_sequence_comparison,
|
@@ -1097,25 +1156,12 @@ with gr.Blocks(css=css) as iface:
|
|
1097 |
)
|
1098 |
|
1099 |
gr.Markdown("""
|
1100 |
-
###
|
1101 |
-
- **
|
1102 |
-
- **
|
1103 |
-
- **
|
1104 |
-
|
1105 |
-
- Symmetrical color range around 0
|
1106 |
-
- **Identify Subregions** with strongest push for human or non-human
|
1107 |
-
- **Gene Feature Analysis**:
|
1108 |
-
- Analyze individual genes' contributions
|
1109 |
-
- Interactive genome viewer
|
1110 |
-
- Gene-level statistics and classification
|
1111 |
-
- **Sequence Comparison**:
|
1112 |
-
- Compare two sequences to identify regions of difference
|
1113 |
-
- Normalized comparison to handle different lengths
|
1114 |
-
- Statistical summary of differences
|
1115 |
-
- **Data Export**:
|
1116 |
-
- Download results as CSV files
|
1117 |
-
- Save analysis outputs for further processing
|
1118 |
""")
|
1119 |
-
|
1120 |
if __name__ == "__main__":
|
1121 |
iface.launch()
|
|
|
67 |
return sequences
|
68 |
|
69 |
def sequence_to_kmer_vector(sequence: str, k: int = 4) -> np.ndarray:
|
70 |
+
"""
|
71 |
+
Convert a sequence into a frequency vector of all possible 4-mer combinations.
|
72 |
+
"""
|
73 |
kmers = [''.join(p) for p in product("ACGT", repeat=k)]
|
74 |
kmer_dict = {km: i for i, km in enumerate(kmers)}
|
75 |
vec = np.zeros(len(kmers), dtype=np.float32)
|
|
|
87 |
###############################################################################
|
88 |
|
89 |
def calculate_shap_values(model, x_tensor):
|
90 |
+
"""
|
91 |
+
A simple ablation-based SHAP approximation. Zero out each position
|
92 |
+
and measure the impact on the 'human' probability.
|
93 |
+
"""
|
94 |
model.eval()
|
95 |
with torch.no_grad():
|
96 |
baseline_output = model(x_tensor)
|
97 |
baseline_probs = torch.softmax(baseline_output, dim=1)
|
98 |
+
baseline_prob = baseline_probs[0, 1].item() # Probability for 'human'
|
99 |
shap_values = []
|
100 |
x_zeroed = x_tensor.clone()
|
101 |
for i in range(x_tensor.shape[1]):
|
|
|
113 |
###############################################################################
|
114 |
|
115 |
def compute_positionwise_scores(sequence, shap_values, k=4):
|
116 |
+
"""
|
117 |
+
Distribute each k-mer's SHAP contribution across its k underlying positions.
|
118 |
+
"""
|
119 |
kmers = [''.join(p) for p in product("ACGT", repeat=k)]
|
120 |
kmer_dict = {km: i for i, km in enumerate(kmers)}
|
121 |
seq_len = len(sequence)
|
|
|
136 |
###############################################################################
|
137 |
|
138 |
def find_extreme_subregion(shap_means, window_size=500, mode="max"):
|
139 |
+
"""
|
140 |
+
Use a sliding window to find the subregion with the highest (or lowest) average SHAP.
|
141 |
+
"""
|
142 |
n = len(shap_means)
|
143 |
if n == 0:
|
144 |
return (0, 0, 0.0)
|
|
|
165 |
###############################################################################
|
166 |
|
167 |
def fig_to_image(fig):
|
168 |
+
"""
|
169 |
+
Render a Matplotlib figure to a PIL Image.
|
170 |
+
"""
|
171 |
buf = io.BytesIO()
|
172 |
fig.savefig(buf, format='png', bbox_inches='tight', dpi=150)
|
173 |
buf.seek(0)
|
|
|
176 |
return img
|
177 |
|
178 |
def get_zero_centered_cmap():
|
179 |
+
"""
|
180 |
+
Create a symmetrical (blue-white-red) colormap around zero.
|
181 |
+
"""
|
182 |
colors = [(0.0, 'blue'), (0.5, 'white'), (1.0, 'red')]
|
183 |
return mcolors.LinearSegmentedColormap.from_list("blue_white_red", colors)
|
184 |
|
185 |
def plot_linear_heatmap(shap_means, title="Per-base SHAP Heatmap", start=None, end=None):
|
186 |
+
"""
|
187 |
+
Plot an inline heatmap for the chosen region (or entire genome if start/end not provided).
|
188 |
+
"""
|
189 |
if start is not None and end is not None:
|
190 |
local_shap = shap_means[start:end]
|
191 |
subtitle = f" (positions {start}-{end})"
|
|
|
211 |
return fig
|
212 |
|
213 |
def create_importance_bar_plot(shap_values, kmers, top_k=10):
|
214 |
+
"""
|
215 |
+
Show bar chart of top k-mers by absolute SHAP value.
|
216 |
+
"""
|
217 |
plt.rcParams.update({'font.size': 10})
|
218 |
fig = plt.figure(figsize=(10, 5))
|
219 |
indices = np.argsort(np.abs(shap_values))[-top_k:]
|
|
|
229 |
return fig
|
230 |
|
231 |
def plot_shap_histogram(shap_array, title="SHAP Distribution in Region", num_bins=30):
|
232 |
+
"""
|
233 |
+
Plot a histogram of SHAP values in some region.
|
234 |
+
"""
|
235 |
fig, ax = plt.subplots(figsize=(6, 4))
|
236 |
ax.hist(shap_array, bins=num_bins, color='gray', edgecolor='black')
|
237 |
ax.axvline(0, color='red', linestyle='--', label='0.0')
|
|
|
243 |
return fig
|
244 |
|
245 |
def compute_gc_content(sequence):
|
246 |
+
"""
|
247 |
+
Compute GC content (%) for a given sequence.
|
248 |
+
"""
|
249 |
if not sequence:
|
250 |
+
return 0.0
|
251 |
gc_count = sequence.count('G') + sequence.count('C')
|
252 |
return (gc_count / len(sequence)) * 100.0
|
253 |
|
|
|
256 |
###############################################################################
|
257 |
|
258 |
def analyze_sequence(file_obj, top_kmers=10, fasta_text="", window_size=500):
|
259 |
+
"""
|
260 |
+
Perform the main classification, SHAP analysis, and extreme subregion detection
|
261 |
+
for a single sequence.
|
262 |
+
"""
|
263 |
+
# 1) Read input
|
264 |
if fasta_text.strip():
|
265 |
text = fasta_text.strip()
|
266 |
elif file_obj is not None:
|
|
|
272 |
else:
|
273 |
return ("Please provide a FASTA sequence.", None, None, None, None, None)
|
274 |
|
275 |
+
# 2) Parse FASTA
|
276 |
sequences = parse_fasta(text)
|
277 |
if not sequences:
|
278 |
return ("No valid FASTA sequences found.", None, None, None, None, None)
|
279 |
header, seq = sequences[0]
|
280 |
|
281 |
+
# 3) Load model, scaler, and run inference
|
282 |
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
283 |
try:
|
|
|
284 |
state_dict = torch.load('model.pt', map_location=device)
|
285 |
model = VirusClassifier(256).to(device)
|
286 |
model.load_state_dict(state_dict)
|
|
|
297 |
classification = "Human" if prob_human > 0.5 else "Non-human"
|
298 |
confidence = max(prob_human, prob_nonhuman)
|
299 |
|
300 |
+
# 4) Per-base SHAP & subregion detection
|
301 |
shap_means = compute_positionwise_scores(seq, shap_values, k=4)
|
302 |
max_start, max_end, max_avg = find_extreme_subregion(shap_means, window_size, mode="max")
|
303 |
min_start, min_end, min_avg = find_extreme_subregion(shap_means, window_size, mode="min")
|
304 |
|
305 |
+
# 5) Prepare result text
|
306 |
results_text = (
|
307 |
f"Sequence: {header}\n"
|
308 |
f"Length: {len(seq):,} bases\n"
|
|
|
316 |
f"Start: {min_start}, End: {min_end}, Avg SHAP: {min_avg:.4f}"
|
317 |
)
|
318 |
|
319 |
+
# 6) Create bar & heatmap figures
|
320 |
kmers = [''.join(p) for p in product("ACGT", repeat=4)]
|
321 |
bar_fig = create_importance_bar_plot(shap_values, kmers, top_kmers)
|
322 |
bar_img = fig_to_image(bar_fig)
|
|
|
324 |
heatmap_fig = plot_linear_heatmap(shap_means, title="Genome-wide SHAP")
|
325 |
heatmap_img = fig_to_image(heatmap_fig)
|
326 |
|
327 |
+
# 7) Build the "state" dictionary so we can do subregion analysis
|
|
|
328 |
state_dict_out = {"seq": seq, "shap_means": shap_means}
|
329 |
|
330 |
+
# Return 6 items to match your Gradio output
|
331 |
return (results_text, bar_img, heatmap_img, state_dict_out, header, None)
|
332 |
|
333 |
###############################################################################
|
|
|
335 |
###############################################################################
|
336 |
|
337 |
def analyze_subregion(state, header, region_start, region_end):
|
338 |
+
"""
|
339 |
+
Examine a subregion’s SHAP distribution, GC content, etc.
|
340 |
+
"""
|
341 |
if not state or "seq" not in state or "shap_means" not in state:
|
342 |
return ("No sequence data found. Please run Step 1 first.", None, None, None)
|
343 |
seq = state["seq"]
|
|
|
348 |
region_end = max(0, min(region_end, len(seq)))
|
349 |
if region_end <= region_start:
|
350 |
return ("Invalid region range. End must be > Start.", None, None, None)
|
351 |
+
|
352 |
region_seq = seq[region_start:region_end]
|
353 |
region_shap = shap_means[region_start:region_end]
|
354 |
+
|
355 |
gc_percent = compute_gc_content(region_seq)
|
356 |
avg_shap = float(np.mean(region_shap))
|
357 |
positive_fraction = np.mean(region_shap > 0)
|
358 |
negative_fraction = np.mean(region_shap < 0)
|
359 |
+
|
360 |
if avg_shap > 0.05:
|
361 |
region_classification = "Likely pushing toward human"
|
362 |
elif avg_shap < -0.05:
|
363 |
region_classification = "Likely pushing toward non-human"
|
364 |
else:
|
365 |
region_classification = "Near neutral (no strong push)"
|
366 |
+
|
367 |
region_info = (
|
368 |
f"Analyzing subregion of {header} from {region_start} to {region_end}\n"
|
369 |
f"Region length: {len(region_seq)} bases\n"
|
|
|
373 |
f"Fraction with SHAP < 0 (toward non-human): {negative_fraction:.2f}\n"
|
374 |
f"Subregion interpretation: {region_classification}\n"
|
375 |
)
|
376 |
+
|
377 |
heatmap_fig = plot_linear_heatmap(shap_means, title="Subregion SHAP", start=region_start, end=region_end)
|
378 |
heatmap_img = fig_to_image(heatmap_fig)
|
379 |
+
|
380 |
hist_fig = plot_shap_histogram(region_shap, title="SHAP Distribution in Subregion")
|
381 |
hist_img = fig_to_image(hist_fig)
|
382 |
+
|
383 |
+
# Return 4 items to match your Gradio output
|
384 |
return (region_info, heatmap_img, hist_img, None)
|
385 |
|
386 |
###############################################################################
|
387 |
+
# 9. COMPARISON ANALYSIS FUNCTIONS (Step 4)
|
388 |
###############################################################################
|
389 |
|
|
|
|
|
|
|
|
|
|
|
390 |
def compute_shap_difference(shap1_norm, shap2_norm):
|
391 |
+
"""
|
392 |
+
Compute the SHAP difference (Seq2 - Seq1).
|
393 |
+
"""
|
394 |
return shap2_norm - shap1_norm
|
395 |
|
396 |
def plot_comparative_heatmap(shap_diff, title="SHAP Difference Heatmap"):
|
397 |
"""
|
398 |
+
Plot a 1D heatmap of differences using relative positions 0-100%.
|
399 |
"""
|
400 |
heatmap_data = shap_diff.reshape(1, -1)
|
401 |
extent = max(abs(np.min(shap_diff)), abs(np.max(shap_diff)))
|
|
|
424 |
|
425 |
def plot_shap_histogram(shap_array, title="SHAP Distribution", num_bins=30):
|
426 |
"""
|
427 |
+
Plot a histogram of SHAP values with optional # of bins.
|
428 |
"""
|
429 |
fig, ax = plt.subplots(figsize=(6, 4))
|
430 |
ax.hist(shap_array, bins=num_bins, color='gray', edgecolor='black', alpha=0.7)
|
|
|
438 |
|
439 |
def calculate_adaptive_parameters(len1, len2):
|
440 |
"""
|
441 |
+
Choose smoothing & interpolation parameters automatically based on length difference.
|
|
|
442 |
"""
|
443 |
length_diff = abs(len1 - len2)
|
444 |
max_length = max(len1, len2)
|
445 |
min_length = min(len1, len2)
|
446 |
length_ratio = min_length / max_length
|
447 |
|
448 |
+
# Base number of points
|
449 |
base_points = min(2000, max(500, max_length // 100))
|
450 |
|
|
|
451 |
if length_diff < 500:
|
452 |
resolution_factor = 2.0
|
453 |
num_points = min(3000, base_points * 2)
|
|
|
465 |
num_points = max(500, base_points // 2)
|
466 |
smooth_window = max(100, length_diff // 500)
|
467 |
|
|
|
468 |
smooth_window = int(smooth_window * (1 + (1 - length_ratio)))
|
|
|
469 |
return int(num_points), int(smooth_window), resolution_factor
|
470 |
|
471 |
def sliding_window_smooth(values, window_size=50):
|
472 |
"""
|
473 |
+
A custom smoothing approach, including exponential decay at edges.
|
474 |
"""
|
475 |
if window_size < 3:
|
476 |
return values
|
|
|
|
|
477 |
window = np.ones(window_size)
|
478 |
decay = np.exp(-np.linspace(0, 3, window_size // 2))
|
479 |
window[:window_size // 2] = decay
|
480 |
window[-(window_size // 2):] = decay[::-1]
|
481 |
window = window / window.sum()
|
482 |
|
|
|
483 |
smoothed = np.convolve(values, window, mode='valid')
|
|
|
|
|
484 |
pad_size = len(values) - len(smoothed)
|
485 |
pad_left = pad_size // 2
|
486 |
pad_right = pad_size - pad_left
|
|
|
494 |
|
495 |
def normalize_shap_lengths(shap1, shap2):
|
496 |
"""
|
497 |
+
Smooth, interpolate, and return arrays of the same length for direct comparison.
|
498 |
"""
|
|
|
499 |
num_points, smooth_window, _ = calculate_adaptive_parameters(len(shap1), len(shap2))
|
500 |
|
|
|
501 |
shap1_smooth = sliding_window_smooth(shap1, smooth_window)
|
502 |
shap2_smooth = sliding_window_smooth(shap2, smooth_window)
|
503 |
|
|
|
504 |
x1 = np.linspace(0, 1, len(shap1_smooth))
|
505 |
x2 = np.linspace(0, 1, len(shap2_smooth))
|
506 |
x_norm = np.linspace(0, 1, num_points)
|
|
|
512 |
|
513 |
def analyze_sequence_comparison(file1, file2, fasta1="", fasta2=""):
|
514 |
"""
|
515 |
+
Compare two sequences using the previously defined analysis pipeline
|
516 |
+
and produce difference visualizations & stats.
|
517 |
"""
|
518 |
try:
|
519 |
# Analyze first sequence
|
|
|
526 |
if isinstance(res2[0], str) and "Error" in res2[0]:
|
527 |
return (f"Error in sequence 2: {res2[0]}", None, None, None)
|
528 |
|
|
|
529 |
shap1 = res1[3]["shap_means"]
|
530 |
shap2 = res2[3]["shap_means"]
|
531 |
|
|
|
532 |
len1, len2 = len(shap1), len(shap2)
|
533 |
length_diff = abs(len1 - len2)
|
534 |
length_ratio = min(len1, len2) / max(len1, len2)
|
535 |
+
|
536 |
+
# Normalize both to the same length
|
537 |
shap1_norm, shap2_norm, smooth_window = normalize_shap_lengths(shap1, shap2)
|
538 |
shap_diff = compute_shap_difference(shap1_norm, shap2_norm)
|
539 |
|
540 |
+
# Compute stats
|
541 |
base_threshold = 0.05
|
542 |
adaptive_threshold = base_threshold * (1 + (1 - length_ratio))
|
543 |
if length_diff > 50000:
|
544 |
adaptive_threshold *= 1.5
|
545 |
|
|
|
546 |
avg_diff = np.mean(shap_diff)
|
547 |
std_diff = np.std(shap_diff)
|
548 |
max_diff = np.max(shap_diff)
|
|
|
550 |
substantial_diffs = np.abs(shap_diff) > adaptive_threshold
|
551 |
frac_different = np.mean(substantial_diffs)
|
552 |
|
553 |
+
# Extract classification from text
|
554 |
try:
|
555 |
classification1 = res1[0].split('Classification: ')[1].split('\n')[0].strip()
|
556 |
classification2 = res2[0].split('Classification: ')[1].split('\n')[0].strip()
|
|
|
558 |
classification1 = "Unknown"
|
559 |
classification2 = "Unknown"
|
560 |
|
|
|
561 |
comparison_text = (
|
562 |
"Sequence Comparison Results:\n"
|
563 |
f"Sequence 1: {res1[4]}\n"
|
|
|
584 |
"- White regions: Similar between sequences"
|
585 |
)
|
586 |
|
|
|
587 |
heatmap_fig = plot_comparative_heatmap(
|
588 |
shap_diff,
|
589 |
title=f"SHAP Difference Heatmap (window: {smooth_window})"
|
590 |
)
|
591 |
heatmap_img = fig_to_image(heatmap_fig)
|
592 |
|
|
|
593 |
num_bins = max(20, min(50, int(np.sqrt(len(shap_diff)))))
|
594 |
hist_fig = plot_shap_histogram(
|
595 |
shap_diff,
|
|
|
598 |
)
|
599 |
hist_img = fig_to_image(hist_fig)
|
600 |
|
|
|
601 |
return (comparison_text, heatmap_img, hist_img, None)
|
602 |
|
603 |
except Exception as e:
|
|
|
605 |
return (error_msg, None, None, None)
|
606 |
|
607 |
###############################################################################
|
608 |
+
# 10. ADDITIONAL / ADVANCED VISUALIZATIONS & STATISTICS
|
609 |
###############################################################################
|
610 |
|
611 |
+
def n50_length(sequence):
|
612 |
+
"""
|
613 |
+
Calculate the N50 for a single continuous sequence (for demonstration).
|
614 |
+
For a single sequence, N50 is typically the length if it's just one piece,
|
615 |
+
but let's do a simplistic example.
|
616 |
+
"""
|
617 |
+
# If you had contigs, you'd do a sorted list, cumulative sums, etc.
|
618 |
+
# We'll do a trivial approach here:
|
619 |
+
return len(sequence) # Because we have only one contiguous region
|
620 |
+
|
621 |
+
def sequence_complexity(sequence):
|
622 |
+
"""
|
623 |
+
Compute a simple measure of 'sequence complexity'.
|
624 |
+
Here, we define complexity as the Shannon entropy over the nucleotides.
|
625 |
+
"""
|
626 |
+
from math import log2
|
627 |
+
length = len(sequence)
|
628 |
+
if length == 0:
|
629 |
+
return 0.0
|
630 |
+
freq = {}
|
631 |
+
for base in sequence:
|
632 |
+
freq[base] = freq.get(base, 0) + 1
|
633 |
+
complexity = 0.0
|
634 |
+
for base, count in freq.items():
|
635 |
+
p = count / length
|
636 |
+
complexity -= p * log2(p)
|
637 |
+
return complexity
|
638 |
+
|
639 |
+
def advanced_gene_statistics(gene_shap: np.ndarray, gene_seq: str) -> Dict[str, float]:
|
640 |
+
"""
|
641 |
+
Additional stats: N50, complexity, etc.
|
642 |
+
"""
|
643 |
+
stats = {}
|
644 |
+
stats['n50'] = len(gene_seq) # trivial for a single gene region
|
645 |
+
stats['entropy'] = sequence_complexity(gene_seq)
|
646 |
+
stats['avg_shap'] = float(np.mean(gene_shap))
|
647 |
+
stats['max_shap'] = float(np.max(gene_shap)) if len(gene_shap) else 0.0
|
648 |
+
stats['min_shap'] = float(np.min(gene_shap)) if len(gene_shap) else 0.0
|
649 |
+
return stats
|
650 |
+
|
651 |
+
###############################################################################
|
652 |
+
# 11. GENE FEATURE ANALYSIS
|
653 |
+
###############################################################################
|
654 |
|
655 |
def parse_gene_features(text: str) -> List[Dict[str, Any]]:
|
656 |
+
"""Parse gene features from text file in a FASTA-like format."""
|
657 |
genes = []
|
658 |
current_header = None
|
659 |
current_sequence = []
|
|
|
662 |
line = line.strip()
|
663 |
if not line:
|
664 |
continue
|
|
|
665 |
if line.startswith('>'):
|
666 |
if current_header:
|
667 |
genes.append({
|
|
|
673 |
current_sequence = []
|
674 |
else:
|
675 |
current_sequence.append(line.upper())
|
|
|
676 |
if current_header:
|
677 |
genes.append({
|
678 |
'header': current_header,
|
679 |
'sequence': ''.join(current_sequence),
|
680 |
'metadata': parse_gene_metadata(current_header)
|
681 |
})
|
|
|
682 |
return genes
|
683 |
|
684 |
def parse_gene_metadata(header: str) -> Dict[str, str]:
|
685 |
+
"""Extract metadata from gene header line."""
|
686 |
metadata = {}
|
687 |
parts = header.split()
|
|
|
688 |
for part in parts:
|
689 |
if '[' in part and ']' in part:
|
690 |
key_value = part[1:-1].split('=', 1)
|
691 |
if len(key_value) == 2:
|
692 |
metadata[key_value[0]] = key_value[1]
|
|
|
693 |
return metadata
|
694 |
|
695 |
def parse_location(location_str: str) -> Tuple[Optional[int], Optional[int]]:
|
696 |
+
"""Parse gene location string, handling forward and complement strands."""
|
697 |
try:
|
|
|
698 |
clean_loc = location_str.replace('complement(', '').replace(')', '')
|
|
|
|
|
699 |
if '..' in clean_loc:
|
700 |
start, end = map(int, clean_loc.split('..'))
|
701 |
return start, end
|
|
|
706 |
return None, None
|
707 |
|
708 |
def compute_gene_statistics(gene_shap: np.ndarray) -> Dict[str, float]:
|
709 |
+
"""Basic statistical measures for gene SHAP values."""
|
710 |
return {
|
711 |
+
'avg_shap': float(np.mean(gene_shap)) if len(gene_shap) else 0.0,
|
712 |
+
'median_shap': float(np.median(gene_shap)) if len(gene_shap) else 0.0,
|
713 |
+
'std_shap': float(np.std(gene_shap)) if len(gene_shap) else 0.0,
|
714 |
+
'max_shap': float(np.max(gene_shap)) if len(gene_shap) else 0.0,
|
715 |
+
'min_shap': float(np.min(gene_shap)) if len(gene_shap) else 0.0,
|
716 |
+
'pos_fraction': float(np.mean(gene_shap > 0)) if len(gene_shap) else 0.0
|
717 |
}
|
718 |
|
719 |
def create_simple_genome_diagram(gene_results: List[Dict[str, Any]], genome_length: int) -> Image.Image:
|
720 |
"""
|
721 |
+
A quick PIL-based diagram to show genes along the genome.
|
722 |
+
Color intensity = magnitude of SHAP. Red/Blue = sign of SHAP.
|
723 |
"""
|
|
|
|
|
|
|
724 |
if not gene_results or genome_length <= 0:
|
725 |
img = Image.new('RGB', (800, 100), color='white')
|
726 |
draw = ImageDraw.Draw(img)
|
727 |
draw.text((10, 40), "Error: Invalid input data", fill='black')
|
728 |
return img
|
729 |
+
|
|
|
730 |
for gene in gene_results:
|
731 |
gene['start'] = max(0, int(gene['start']))
|
732 |
gene['end'] = min(genome_length, int(gene['end']))
|
733 |
if gene['start'] >= gene['end']:
|
734 |
+
print(f"Warning: Invalid coordinates for gene {gene.get('gene_name','?')}")
|
735 |
|
|
|
736 |
width = 1500
|
737 |
height = 600
|
738 |
margin = 50
|
739 |
track_height = 40
|
740 |
|
|
|
741 |
img = Image.new('RGB', (width, height), 'white')
|
742 |
draw = ImageDraw.Draw(img)
|
743 |
|
|
|
744 |
try:
|
745 |
font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 12)
|
746 |
title_font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", 16)
|
|
|
748 |
font = ImageFont.load_default()
|
749 |
title_font = ImageFont.load_default()
|
750 |
|
751 |
+
draw.text((margin, margin // 2), "Genome SHAP Analysis (Simple)", fill='black', font=title_font or font)
|
|
|
752 |
|
|
|
753 |
line_y = height // 2
|
754 |
draw.line([(int(margin), int(line_y)), (int(width - margin), int(line_y))], fill='black', width=2)
|
755 |
|
|
|
756 |
scale = float(width - 2 * margin) / float(genome_length)
|
757 |
|
758 |
+
# Scale markers
|
759 |
num_ticks = 10
|
760 |
+
step = max(1, genome_length // num_ticks)
|
|
|
|
|
|
|
|
|
|
|
761 |
for i in range(0, genome_length + 1, step):
|
762 |
x_coord = margin + i * scale
|
763 |
draw.line([
|
|
|
766 |
], fill='black', width=1)
|
767 |
draw.text((int(x_coord - 20), int(line_y + 10)), f"{i:,}", fill='black', font=font)
|
768 |
|
|
|
769 |
sorted_genes = sorted(gene_results, key=lambda x: abs(x['avg_shap']))
|
|
|
|
|
770 |
for idx, gene in enumerate(sorted_genes):
|
|
|
771 |
start_x = margin + int(gene['start'] * scale)
|
772 |
end_x = margin + int(gene['end'] * scale)
|
|
|
|
|
773 |
avg_shap = gene['avg_shap']
|
|
|
|
|
|
|
774 |
intensity = int(abs(avg_shap) * 500)
|
775 |
+
intensity = max(50, min(255, intensity))
|
776 |
|
777 |
if avg_shap > 0:
|
778 |
+
color = (255, 255 - intensity, 255 - intensity) # Redish
|
|
|
779 |
else:
|
780 |
+
color = (255 - intensity, 255 - intensity, 255) # Blueish
|
|
|
781 |
|
|
|
782 |
draw.rectangle([
|
783 |
(int(start_x), int(line_y - track_height // 2)),
|
784 |
(int(end_x), int(line_y + track_height // 2))
|
785 |
], fill=color, outline='black')
|
786 |
|
|
|
787 |
label = str(gene.get('gene_name','?'))
|
|
|
|
|
788 |
label_mask = font.getmask(label)
|
789 |
label_width, label_height = label_mask.size
|
790 |
|
|
|
791 |
if idx % 2 == 0:
|
792 |
text_y = line_y - track_height - 15
|
793 |
else:
|
794 |
text_y = line_y + track_height + 5
|
795 |
|
|
|
796 |
gene_width = end_x - start_x
|
797 |
if gene_width > label_width:
|
798 |
text_x = start_x + (gene_width - label_width) // 2
|
|
|
804 |
rotated_img = txt_img.rotate(90, expand=True)
|
805 |
img.paste(rotated_img, (int(start_x), int(text_y)), rotated_img)
|
806 |
|
807 |
+
return img
|
808 |
+
|
809 |
+
def create_advanced_genome_diagram(gene_results: List[Dict[str, Any]],
                                   genome_length: int,
                                   shap_means: np.ndarray,
                                   diagram_title: str = "Advanced Genome Diagram") -> Image.Image:
    """
    An advanced genome diagram built with Biopython's GenomeDiagram:
    one track for the genes and one 'graph' track for the per-base SHAP line.
    """
    if not gene_results or genome_length <= 0 or len(shap_means) == 0:
        # Fallback if the input data is invalid
        img = Image.new('RGB', (800, 100), color='white')
        d = ImageDraw.Draw(img)
        d.text((10, 40), "Error: Not enough data for advanced diagram", fill='black')
        return img

    diagram = GenomeDiagram.Diagram(diagram_title)
    gene_track = diagram.new_track(1, name="Genes", greytrack=False, height=0.5)
    gene_set = gene_track.new_set()

    # Add each gene as a feature
    for gene in gene_results:
        start = max(0, int(gene['start']))
        end = min(genome_length, int(gene['end']))
        avg_shap = gene['avg_shap']

        # Color scale: negative = blue, positive = red
        intensity = abs(avg_shap) * 500
        intensity = max(50, min(255, intensity))
        if avg_shap >= 0:
            color_hex = colors.Color(1.0, 1.0 - intensity / 255.0, 1.0 - intensity / 255.0)
        else:
            color_hex = colors.Color(1.0 - intensity / 255.0, 1.0 - intensity / 255.0, 1.0)

        feature = SeqFeature(FeatureLocation(start, end), strand=1)
        gene_set.add_feature(
            feature,
            color=color_hex,
            label=True,
            name=str(gene.get('gene_name', '?')),
            label_size=8,
            label_color=colors.black
        )

    # Add a graph track for the SHAP line: negative values sit below the
    # baseline, positive values above.
    shap_track = diagram.new_track(2, name="SHAP Score", greytrack=False, height=0.3)
    shap_set = shap_track.new_set("graph")

    # Normalize for visualization (scale to roughly +/- 50)
    max_abs = max(abs(shap_means.min()), abs(shap_means.max()))
    if max_abs == 0:
        scaled_shap = [0.0] * len(shap_means)
    else:
        scaled_shap = (shap_means / max_abs * 50).tolist()

    # GraphSet.new_graph expects (position, value) pairs
    graph_data = list(enumerate(scaled_shap))
    shap_set.new_graph(
        graph_data,
        name="shap_line",
        style="line",
        color=colors.darkgreen,
        altcolor=colors.red,
        linewidth=1
    )

    # Render to a temporary PDF
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmpf:
        diagram.draw(format="linear", pagesize='A3', fragments=1, start=0, end=genome_length)
        diagram.write(tmpf.name, "PDF")

    # Convert the PDF to a PIL image (pdf2image needs poppler installed)
    try:
        import pdf2image
        pages = pdf2image.convert_from_path(tmpf.name, dpi=100)
        img = pages[0] if pages else Image.new('RGB', (800, 100), color='white')
    except Exception:
        # pdf2image missing, or poppler unavailable
        img = Image.new('RGB', (800, 100), color='white')
        d = ImageDraw.Draw(img)
        d.text((10, 40), "pdf2image/poppler not available; cannot render advanced diagram as an image.", fill='black')

    # Clean up the temporary PDF
    os.remove(tmpf.name)
    return img

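
# Illustrative usage sketch, not called anywhere in the app: the gene dicts only need
# the keys read above ('start', 'end', 'avg_shap', 'gene_name'); all values here are
# made up, and rendering still requires reportlab plus pdf2image/poppler.
def _demo_advanced_diagram() -> Image.Image:
    toy_genes = [
        {'gene_name': 'geneA', 'start': 0,    'end': 900,  'avg_shap':  0.12},
        {'gene_name': 'geneB', 'start': 1200, 'end': 2400, 'avg_shap': -0.08},
    ]
    toy_shap = np.random.default_rng(0).normal(0.0, 0.05, size=3000)
    return create_advanced_genome_diagram(toy_genes, genome_length=3000, shap_means=toy_shap)
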
def analyze_gene_features(sequence_file: str,
                          features_file: str,
                          fasta_text: str = "",
                          features_text: str = "",
                          diagram_mode: str = "advanced"
                          ) -> Tuple[str, Optional[str], Optional[Image.Image]]:
    """
    Analyze each gene in the features file, compute gene-level SHAP stats,
    produce tabular output, and create an optional genome diagram.
    """
    # 1) Analyze the entire sequence with the top-level function
    sequence_results = analyze_sequence(sequence_file, top_kmers=10, fasta_text=fasta_text)
    if isinstance(sequence_results[0], str) and "Error" in sequence_results[0]:
        return f"Error in sequence analysis: {sequence_results[0]}", None, None

    seq = sequence_results[3]["seq"]
    shap_means = sequence_results[3]["shap_means"]
    genome_length = len(seq)

    # 2) Read gene features (pasted text takes precedence over the uploaded file)
    try:
        if features_text.strip():
            genes = parse_gene_features(features_text)
        else:
            with open(features_file, 'r') as f:
                genes = parse_gene_features(f.read())
    except Exception as e:
        return f"Error reading features file: {str(e)}", None, None

    gene_results = []
    for gene in genes:
        location = gene['metadata'].get('location', '')
        if not location:
            continue
        start, end = parse_location(location)
        if start is None or end is None or start >= end or end > genome_length:
            continue

        gene_shap = shap_means[start:end]
        basic_stats = compute_gene_statistics(gene_shap)

        # Additional stats on the gene subsequence
        gene_seq = seq[start:end]
        adv_stats = advanced_gene_statistics(gene_shap, gene_seq)

        # Merge basic + advanced stats
        all_stats = {**basic_stats, **adv_stats}

        classification = 'Human' if basic_stats['avg_shap'] > 0 else 'Non-human'
        locus_tag = gene['metadata'].get('locus_tag', '')
        gene_name = gene['metadata'].get('gene', 'Unknown')

        gene_dict = {
            'gene_name': gene_name,
            'location': location,
            'start': start,
            'end': end,
            'locus_tag': locus_tag,
            'avg_shap': all_stats['avg_shap'],
            'median_shap': basic_stats['median_shap'],
            'std_shap': basic_stats['std_shap'],
            'max_shap': basic_stats['max_shap'],
            'min_shap': basic_stats['min_shap'],
            'pos_fraction': basic_stats['pos_fraction'],
            'n50': all_stats['n50'],
            'entropy': all_stats['entropy'],
            'classification': classification,
            'confidence': abs(all_stats['avg_shap'])
        }
        gene_results.append(gene_dict)

    if not gene_results:
        return "No valid genes could be processed", None, None

    # 3) Summaries
    sorted_genes = sorted(gene_results, key=lambda x: abs(x['avg_shap']), reverse=True)
    results_text = "Gene Analysis Results:\n\n"
    results_text += f"Total genes analyzed: {len(gene_results)}\n"
    num_human = sum(1 for g in gene_results if g['classification'] == 'Human')
    results_text += f"Human-like genes: {num_human}\n"
    results_text += f"Non-human-like genes: {len(gene_results) - num_human}\n\n"

    results_text += "Top 10 most distinctive genes (by avg SHAP magnitude):\n"
    for gene in sorted_genes[:10]:
        results_text += (
            f"Gene: {gene['gene_name']}\n"
            f"Location: {gene['location']}\n"
            f"Classification: {gene['classification']} "
            f"(confidence: {gene['confidence']:.4f})\n"
            f"Average SHAP: {gene['avg_shap']:.4f}\n"
            f"N50: {gene['n50']}, Entropy: {gene['entropy']:.3f}\n\n"
        )

    # 4) Make CSV
    csv_content = "gene_name,location,start,end,locus_tag,avg_shap,median_shap,std_shap,"
    csv_content += "max_shap,min_shap,pos_fraction,n50,entropy,classification,confidence\n"
    for g in gene_results:
        csv_content += (
            f"{g['gene_name']},{g['location']},{g['start']},{g['end']},{g['locus_tag']},"
            f"{g['avg_shap']:.4f},{g['median_shap']:.4f},{g['std_shap']:.4f},"
            f"{g['max_shap']:.4f},{g['min_shap']:.4f},{g['pos_fraction']:.4f},"
            f"{g['n50']},{g['entropy']:.4f},{g['classification']},{g['confidence']:.4f}\n"
        )

    try:
        temp_dir = tempfile.gettempdir()
        temp_path = os.path.join(temp_dir, f"gene_analysis_{os.urandom(4).hex()}.csv")
        with open(temp_path, 'w') as f:
            f.write(csv_content)
    except Exception as e:
        print(f"Error saving CSV: {str(e)}")
        temp_path = None

    # 5) Create diagram
    try:
        if diagram_mode == "advanced":
            diagram_img = create_advanced_genome_diagram(gene_results, genome_length, shap_means)
        else:
            diagram_img = create_simple_genome_diagram(gene_results, genome_length)
    except Exception as e:
        print(f"Error creating visualization: {str(e)}")
        diagram_img = Image.new('RGB', (800, 100), color='white')
        draw = ImageDraw.Draw(diagram_img)
        draw.text((10, 40), f"Error creating visualization: {str(e)}", fill='black')

    return results_text, temp_path, diagram_img

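
# The app's advanced_gene_statistics() is defined elsewhere in app.py and is not shown
# in this hunk. The sketch below is NOT that implementation — it only illustrates one
# plausible way to obtain the two keys used above, under assumed definitions: Shannon
# entropy over the gene's A/C/G/T composition, and "N50" over the lengths of
# contiguous positive-SHAP runs.
def _sketch_gene_stats(gene_shap: np.ndarray, gene_seq: str) -> dict:
    # Shannon entropy (bits) of the nucleotide composition
    counts = np.array([gene_seq.count(b) for b in "ACGT"], dtype=float)
    total = counts.sum()
    entropy = 0.0
    if total > 0:
        probs = counts / total
        entropy = float(-sum(p * np.log2(p) for p in probs if p > 0))

    # Lengths of contiguous runs where the per-base SHAP is positive
    runs, current = [], 0
    for v in gene_shap:
        if v > 0:
            current += 1
        elif current:
            runs.append(current)
            current = 0
    if current:
        runs.append(current)

    # N50 of those run lengths: shortest run in the set covering half the total run length
    n50 = 0
    if runs:
        runs.sort(reverse=True)
        half, acc = sum(runs) / 2.0, 0
        for r in runs:
            acc += r
            if acc >= half:
                n50 = r
                break

    return {'n50': n50, 'entropy': entropy}
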
###############################################################################
# ... (section header comment not shown in this diff hunk) ...
###############################################################################

def prepare_csv_download(data, filename="analysis_results.csv"):
    """
    Convert data to CSV for the Gradio download button.
    """
    if isinstance(data, str):
        return data.encode(), filename
    elif isinstance(data, (list, dict)):
        import csv
        from io import StringIO
        output = StringIO()
        writer = csv.DictWriter(output, fieldnames=data[0].keys())
        writer.writeheader()
        # ... (row writing, the return value, and the surrounding setup are not shown in this diff hunk) ...

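
# Illustrative only, not called by the app: the string branch of prepare_csv_download
# simply encodes the text and returns it together with the filename (the list/dict
# branch is truncated in this hunk).
def _demo_prepare_csv_download():
    payload, name = prepare_csv_download("gene_name,avg_shap\ngeneA,0.0123\n",
                                         filename="gene_analysis.csv")
    return payload, name  # (b"gene_name,avg_shap\ngeneA,0.0123\n", "gene_analysis.csv")
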

with gr.Blocks(css=css) as iface:
    gr.Markdown("""
    # Virus Host Classifier + Extended Genome Visualization
    **Step 1**: Predict overall viral sequence origin (human vs non-human) and identify extreme subregions.
    **Step 2**: Explore subregions (local SHAP, GC content, histogram).
    **Step 3**: Analyze gene features (per-gene SHAP, advanced stats, improved diagrams).
    **Step 4**: Compare sequences for SHAP differences.

    **Color Scale**: Negative SHAP = Blue, 0 = White, Positive = Red.
    """)

    with gr.Tab("1) Full-Sequence Analysis"):
        with gr.Row():
            with gr.Column(scale=1):
                file_input = gr.File(label="Upload FASTA file", file_types=[".fasta", ".fa", ".txt"], type="filepath")
                text_input = gr.Textbox(label="Or paste FASTA", placeholder=">name\nACGT...", lines=5)
                top_k = gr.Slider(minimum=5, maximum=30, value=10, step=1, label="Number of top k-mers to display")
                win_size = gr.Slider(minimum=100, maximum=5000, value=500, step=100, label="Subregion Window Size")
                analyze_btn = gr.Button("Analyze Sequence", variant="primary")
            with gr.Column(scale=2):
                results_box = gr.Textbox(label="Classification Results", lines=12, interactive=False)
                # ... (remaining output components and the analyze_btn.click wiring are not shown in this diff hunk) ...

    with gr.Tab("2) Subregion Exploration"):
        gr.Markdown("""
        **Subregion Analysis**
        View SHAP signals, GC content, etc. for a specific region.
        """)
        with gr.Row():
            region_start = gr.Number(label="Region Start", value=0)
            # ... (remaining region inputs are not shown in this diff hunk) ...
        with gr.Row():
            subregion_img = gr.Image(label="Subregion SHAP Heatmap (B-W-R)")
            subregion_hist_img = gr.Image(label="SHAP Distribution (Histogram)")
        download_subregion = gr.File(label="Download Subregion", visible=False, elem_classes="download-button")

        region_btn.click(
            analyze_subregion,
            # ... (inputs/outputs lists are not shown in this diff hunk) ...
        )

    with gr.Tab("3) Gene Features Analysis"):
        gr.Markdown("""
        **Analyze Gene Features**
        - Upload a FASTA file and a gene features file.
        - See per-gene SHAP, classification, N50, entropy, etc.
        - Choose a diagram mode (simple or advanced).
        """)
        with gr.Row():
            with gr.Column(scale=1):
                gene_fasta_file = gr.File(label="FASTA file", file_types=[".fasta", ".fa", ".txt"], type="filepath")
                gene_fasta_text = gr.Textbox(label="Or paste FASTA sequence", lines=5)
            with gr.Column(scale=1):
                features_file = gr.File(label="Gene features file", file_types=[".txt"], type="filepath")
                features_text = gr.Textbox(label="Or paste gene features", lines=5)
                diagram_mode = gr.Radio(choices=["simple", "advanced"], value="advanced", label="Diagram Mode")
        analyze_genes_btn = gr.Button("Analyze Gene Features", variant="primary")
        gene_results = gr.Textbox(label="Gene Analysis Results", lines=12, interactive=False)
        gene_diagram = gr.Image(label="Genome Diagram")
        download_gene_results = gr.File(label="Download Gene Analysis (CSV)", visible=True)

        analyze_genes_btn.click(
            analyze_gene_features,
            inputs=[gene_fasta_file, features_file, gene_fasta_text, features_text, diagram_mode],
            outputs=[gene_results, download_gene_results, gene_diagram]
        )
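
        # Note: the three outputs map onto analyze_gene_features' return tuple
        # (results_text, csv_path, diagram_img) -> (gene_results, download_gene_results, gene_diagram).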

    with gr.Tab("4) Comparative Analysis"):
        gr.Markdown("""
        **Compare Two Sequences**
        - Upload or paste two FASTA sequences.
        - We'll compare SHAP patterns (normalized for different lengths).
        """)
        with gr.Row():
            with gr.Column(scale=1):
                file_input1 = gr.File(label="1st FASTA file", file_types=[".fasta", ".fa", ".txt"], type="filepath")
                text_input1 = gr.Textbox(label="Or paste 1st FASTA", lines=5)
            with gr.Column(scale=1):
                file_input2 = gr.File(label="2nd FASTA file", file_types=[".fasta", ".fa", ".txt"], type="filepath")
                text_input2 = gr.Textbox(label="Or paste 2nd FASTA", lines=5)
        compare_btn = gr.Button("Compare Sequences", variant="primary")
        comparison_text = gr.Textbox(label="Comparison Results", lines=12, interactive=False)
        with gr.Row():
            diff_heatmap = gr.Image(label="SHAP Difference Heatmap")
            diff_hist = gr.Image(label="Distribution of SHAP Differences")
        download_comparison = gr.File(label="Download Comparison", visible=False, elem_classes="download-button")

        compare_btn.click(
            analyze_sequence_comparison,
            # ... (inputs/outputs lists are not shown in this diff hunk) ...
        )

    gr.Markdown("""
    ### Notes & Features
    - **Advanced Genome Diagram** uses Biopython’s `GenomeDiagram` (requires `pdf2image` if you want it as an image).
    - **Additional Stats**: N50, Shannon entropy, etc.
    - **Auto-scaling** for comparative analysis with adaptive smoothing.
    - **Data Export**: Download CSV of analysis results.
    """)

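
# Optional illustration, not used by the Gradio UI: analyze_gene_features can also be
# called directly from Python. The file paths below are hypothetical placeholders.
def _demo_gene_feature_analysis():
    results_text, csv_path, diagram = analyze_gene_features(
        sequence_file="example.fasta",          # hypothetical path
        features_file="example_features.txt",   # hypothetical path
        diagram_mode="simple",
    )
    print(results_text)
    if diagram is not None:
        diagram.save("gene_diagram.png")
    return csv_path
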

if __name__ == "__main__":
    iface.launch()