Spaces:

hiyata
/

HostClassifier

Running

App Files Community

hiyata committed on Mar 5

Commit

78f8b3b

verified ·

1 Parent(s): bc5e648

Update app.py

Browse files

Files changed (1) hide show

app.py +65 -66

app.py CHANGED Viewed

@@ -82,7 +82,7 @@ def sequence_to_kmer_vector(sequence: str, k: int = 4) -> np.ndarray:
     return vec
 ###############################################################################
-# 3. SHAP-VALUE (ABLATION) CALCULATION
 ###############################################################################
 def calculate_shap_values(model, x_tensor):
@@ -105,7 +105,7 @@ def calculate_shap_values(model, x_tensor):
 ###############################################################################
-# 4. PER-BASE SHAP AGGREGATION
 ###############################################################################
 def compute_positionwise_scores(sequence, shap_values, k=4):
@@ -125,7 +125,7 @@ def compute_positionwise_scores(sequence, shap_values, k=4):
     return shap_means
 ###############################################################################
-# 5. FIND EXTREME SHAP REGIONS
 ###############################################################################
 def find_extreme_subregion(shap_means, window_size=500, mode="max"):
@@ -166,7 +166,7 @@ def get_zero_centered_cmap():
     colors = [(0.0, 'blue'), (0.5, 'white'), (1.0, 'red')]
     return mcolors.LinearSegmentedColormap.from_list("blue_white_red", colors)
-def plot_linear_heatmap(shap_means, title="Per-base SHAP Heatmap", start=None, end=None):
     if start is not None and end is not None:
         local_shap = shap_means[start:end]
         subtitle = f" (positions {start}-{end})"
@@ -184,7 +184,7 @@ def plot_linear_heatmap(shap_means, title="Per-base SHAP Heatmap", start=None, e
     cax = ax.imshow(heatmap_data, aspect='auto', cmap=cmap, vmin=-extent, vmax=extent)
     cbar = plt.colorbar(cax, orientation='horizontal', pad=0.25, aspect=40, shrink=0.8)
     cbar.ax.tick_params(labelsize=8)
-    cbar.set_label('SHAP Contribution', fontsize=9, labelpad=5)
     ax.set_yticks([])
     ax.set_xlabel('Position in Sequence', fontsize=10)
     ax.set_title(f"{title}{subtitle}", pad=10)
@@ -200,17 +200,17 @@ def create_importance_bar_plot(shap_values, kmers, top_k=10):
     colors = ['#99ccff' if v < 0 else '#ff9999' for v in values]
     plt.barh(range(len(values)), values, color=colors)
     plt.yticks(range(len(values)), features)
-    plt.xlabel('SHAP Value (impact on model output)')
     plt.title(f'Top {top_k} Most Influential k-mers')
     plt.gca().invert_yaxis()
     plt.tight_layout()
     return fig
-def plot_shap_histogram(shap_array, title="SHAP Distribution in Region", num_bins=30):
     fig, ax = plt.subplots(figsize=(6, 4))
     ax.hist(shap_array, bins=num_bins, color='gray', edgecolor='black')
     ax.axvline(0, color='red', linestyle='--', label='0.0')
-    ax.set_xlabel("SHAP Value")
     ax.set_ylabel("Count")
     ax.set_title(title)
     ax.legend()
@@ -227,23 +227,23 @@ def compute_gc_content(sequence):
 # 7. MAIN ANALYSIS STEP (Gradio Step 1)
 ###############################################################################
 def create_kmer_shap_csv(kmers, shap_values):
-    """Create a CSV file with k-mer SHAP values and return the filepath"""
-    # Create DataFrame with k-mers and SHAP values
     kmer_df = pd.DataFrame({
         'kmer': kmers,
-        'shap_value': shap_values,
-        'abs_shap': np.abs(shap_values)
     })
-    # Sort by absolute SHAP value (most influential first)
-    kmer_df = kmer_df.sort_values('abs_shap', ascending=False)
-    # Drop the abs_shap column used for sorting
-    kmer_df = kmer_df[['kmer', 'shap_value']]
     # Save to temporary file
     temp_dir = tempfile.gettempdir()
-    temp_path = os.path.join(temp_dir, f"kmer_shap_values_{os.urandom(4).hex()}.csv")
     kmer_df.to_csv(temp_path, index=False)
     return temp_path  # Return only the file path, not a tuple
@@ -296,19 +296,19 @@ def analyze_sequence(file_obj, top_kmers=10, fasta_text="", window_size=500):
         f"(Human Probability: {prob_human:.3f}, Non-human Probability: {prob_nonhuman:.3f})\n\n"
         f"---\n"
         f"**Most Human-Pushing {window_size}-bp Subregion**:\n"
-        f"Start: {max_start}, End: {max_end}, Avg SHAP: {max_avg:.4f}\n\n"
         f"**Most Non-Human–Pushing {window_size}-bp Subregion**:\n"
-        f"Start: {min_start}, End: {min_end}, Avg SHAP: {min_avg:.4f}"
     )
     kmers = [''.join(p) for p in product("ACGT", repeat=4)]
     bar_fig = create_importance_bar_plot(shap_values, kmers, top_kmers)
     bar_img = fig_to_image(bar_fig)
-    heatmap_fig = plot_linear_heatmap(shap_means, title="Genome-wide SHAP")
     heatmap_img = fig_to_image(heatmap_fig)
-    # Create CSV with k-mer SHAP values and return the file path
     kmer_shap_csv = create_kmer_shap_csv(kmers, shap_values)
     # State dictionary for subregion analysis
@@ -347,14 +347,14 @@ def analyze_subregion(state, header, region_start, region_end):
         f"Analyzing subregion of {header} from {region_start} to {region_end}\n"
         f"Region length: {len(region_seq)} bases\n"
         f"GC content: {gc_percent:.2f}%\n"
-        f"Average SHAP in region: {avg_shap:.4f}\n"
-        f"Fraction with SHAP > 0 (toward human): {positive_fraction:.2f}\n"
-        f"Fraction with SHAP < 0 (toward non-human): {negative_fraction:.2f}\n"
         f"Subregion interpretation: {region_classification}\n"
     )
-    heatmap_fig = plot_linear_heatmap(shap_means, title="Subregion SHAP", start=region_start, end=region_end)
     heatmap_img = fig_to_image(heatmap_fig)
-    hist_fig = plot_shap_histogram(region_shap, title="SHAP Distribution in Subregion")
     hist_img = fig_to_image(hist_fig)
     # For demonstration, returning None for the file download as well
@@ -370,10 +370,10 @@ def get_zero_centered_cmap():
     return mcolors.LinearSegmentedColormap.from_list("blue_white_red", colors)
 def compute_shap_difference(shap1_norm, shap2_norm):
-    """Compute the SHAP difference between normalized sequences"""
     return shap2_norm - shap1_norm
-def plot_comparative_heatmap(shap_diff, title="SHAP Difference Heatmap"):
     """
     Plot heatmap using relative positions (0-100%)
     """
@@ -393,7 +393,7 @@ def plot_comparative_heatmap(shap_diff, title="SHAP Difference Heatmap"):
     cbar = plt.colorbar(cax, orientation='horizontal', pad=0.25, aspect=40, shrink=0.8)
     cbar.ax.tick_params(labelsize=8)
-    cbar.set_label('SHAP Difference (Seq2 - Seq1)', fontsize=9, labelpad=5)
     ax.set_yticks([])
     ax.set_xlabel('Relative Position in Sequence', fontsize=10)
@@ -402,14 +402,14 @@ def plot_comparative_heatmap(shap_diff, title="SHAP Difference Heatmap"):
     return fig
-def plot_shap_histogram(shap_array, title="SHAP Distribution", num_bins=30):
     """
-    Plot histogram of SHAP values with configurable number of bins
     """
     fig, ax = plt.subplots(figsize=(6, 4))
     ax.hist(shap_array, bins=num_bins, color='gray', edgecolor='black', alpha=0.7)
     ax.axvline(0, color='red', linestyle='--', label='0.0')
-    ax.set_xlabel("SHAP Value")
     ax.set_ylabel("Count")
     ax.set_title(title)
     ax.legend()
@@ -483,7 +483,7 @@ def sliding_window_smooth(values, window_size=50):
 def normalize_shap_lengths(shap1, shap2):
     """
-    Normalize and smooth SHAP values with dynamic adaptation
     """
     # Calculate adaptive parameters
     num_points, smooth_window, _ = calculate_adaptive_parameters(len(shap1), len(shap2))
@@ -517,7 +517,7 @@ def analyze_sequence_comparison(file1, file2, fasta1="", fasta2=""):
         if isinstance(res2[0], str) and "Error" in res2[0]:
             return (f"Error in sequence 2: {res2[0]}", None, None, None)
-        # Extract SHAP values and sequence info
         shap1 = res1[3]["shap_means"]
         shap2 = res2[3]["shap_means"]
@@ -567,7 +567,7 @@ def analyze_sequence_comparison(file1, file2, fasta1="", fasta2=""):
             f"Smoothing Window: {smooth_window} points\n"
             f"Adaptive Threshold: {adaptive_threshold:.3f}\n\n"
             "Statistics:\n"
-            f"Average SHAP difference: {avg_diff:.4f}\n"
             f"Standard deviation: {std_diff:.4f}\n"
             f"Max difference: {max_diff:.4f} (Seq2 more human-like)\n"
             f"Min difference: {min_diff:.4f} (Seq1 more human-like)\n"
@@ -582,7 +582,7 @@ def analyze_sequence_comparison(file1, file2, fasta1="", fasta2=""):
         # Generate visualizations
         heatmap_fig = plot_comparative_heatmap(
             shap_diff,
-            title=f"SHAP Difference Heatmap (window: {smooth_window})"
         )
         heatmap_img = fig_to_image(heatmap_fig)
@@ -590,7 +590,7 @@ def analyze_sequence_comparison(file1, file2, fasta1="", fasta2=""):
         num_bins = max(20, min(50, int(np.sqrt(len(shap_diff)))))
         hist_fig = plot_shap_histogram(
             shap_diff,
-            title="Distribution of SHAP Differences",
             num_bins=num_bins
         )
         hist_img = fig_to_image(hist_fig)
@@ -680,7 +680,7 @@ def parse_location(location_str: str) -> Tuple[Optional[int], Optional[int]]:
         return None, None
 def compute_gene_statistics(gene_shap: np.ndarray) -> Dict[str, float]:
-    """Compute statistical measures for gene SHAP values"""
     return {
         'avg_shap': float(np.mean(gene_shap)),
         'median_shap': float(np.median(gene_shap)),
@@ -693,7 +693,7 @@ def compute_gene_statistics(gene_shap: np.ndarray) -> Dict[str, float]:
 def create_simple_genome_diagram(gene_results: List[Dict[str, Any]], genome_length: int) -> Image.Image:
     """
     Create a simple genome diagram using PIL, forcing a minimum color intensity
-    so that small SHAP values don't appear white.
     """
     from PIL import Image, ImageDraw, ImageFont
@@ -730,7 +730,7 @@ def create_simple_genome_diagram(gene_results: List[Dict[str, Any]], genome_leng
         title_font = ImageFont.load_default()
     # Draw title
-    draw.text((margin, margin // 2), "Genome SHAP Analysis", fill='black', font=title_font or font)
     # Draw genome line
     line_y = height // 2
@@ -755,7 +755,7 @@ def create_simple_genome_diagram(gene_results: List[Dict[str, Any]], genome_leng
         ], fill='black', width=1)
         draw.text((int(x_coord - 20), int(line_y + 10)), f"{i:,}", fill='black', font=font)
-    # Sort genes by absolute SHAP value for drawing
     sorted_genes = sorted(gene_results, key=lambda x: abs(x['avg_shap']))
     # Draw genes
@@ -764,10 +764,10 @@ def create_simple_genome_diagram(gene_results: List[Dict[str, Any]], genome_leng
         start_x = margin + int(gene['start'] * scale)
         end_x   = margin + int(gene['end'] * scale)
-        # Calculate color based on SHAP value
         avg_shap = gene['avg_shap']
-        # Convert shap -> color intensity (0 to 255)
         # Then clamp to a minimum intensity so it never ends up plain white
         intensity = int(abs(avg_shap) * 500)
         intensity = max(50, min(255, intensity))  # clamp between 50 and 255
@@ -813,7 +813,7 @@ def create_simple_genome_diagram(gene_results: List[Dict[str, Any]], genome_leng
     # Draw legend
     legend_x = margin
     legend_y = height - margin
-    draw.text((int(legend_x), int(legend_y - 60)), "SHAP Values:", fill='black', font=font)
     # Draw legend boxes
     box_width = 20
@@ -858,13 +858,13 @@ def analyze_gene_features(sequence_file: str,
                           features_file: str,
                           fasta_text: str = "",
                           features_text: str = "") -> Tuple[str, Optional[str], Optional[Image.Image]]:
-    """Analyze SHAP values for each gene feature"""
     # First analyze whole sequence
     sequence_results = analyze_sequence(sequence_file, top_kmers=10, fasta_text=fasta_text)
     if isinstance(sequence_results[0], str) and "Error" in sequence_results[0]:
         return f"Error in sequence analysis: {sequence_results[0]}", None, None
-    # Get SHAP values
     shap_means = sequence_results[3]["shap_means"]
     # Parse gene features
@@ -889,7 +889,7 @@ def analyze_gene_features(sequence_file: str,
             if start is None or end is None:
                 continue
-            # Get SHAP values for this region
             gene_shap = shap_means[start:end]
             stats = compute_gene_statistics(gene_shap)
@@ -916,7 +916,7 @@ def analyze_gene_features(sequence_file: str,
     if not gene_results:
         return "No valid genes could be processed", None, None
-    # Sort genes by absolute SHAP value
     sorted_genes = sorted(gene_results, key=lambda x: abs(x['avg_shap']), reverse=True)
     # Create results text
@@ -932,11 +932,11 @@ def analyze_gene_features(sequence_file: str,
             f"Location: {gene['location']}\n"
             f"Classification: {gene['classification']} "
             f"(confidence: {gene['confidence']:.4f})\n"
-            f"Average SHAP: {gene['avg_shap']:.4f}\n\n"
         )
     # Create CSV content
-    csv_content = "gene_name,location,avg_shap,median_shap,std_shap,max_shap,min_shap,"
     csv_content += "pos_fraction,classification,confidence,locus_tag\n"
     for gene in gene_results:
@@ -1020,11 +1020,11 @@ with gr.Blocks(css=css) as iface:
     gr.Markdown("""
     # Virus Host Classifier
     **Step 1**: Predict overall viral sequence origin (human vs non-human) and identify extreme regions.
-    **Step 2**: Explore subregions to see local SHAP signals, distribution, GC content, etc.
     **Step 3**: Analyze gene features and their contributions.
     **Step 4**: Compare sequences and analyze differences.
-    **Color Scale**: Negative SHAP = Blue, Zero = White, Positive SHAP = Red.
     """)
     with gr.Tab("1) Full-Sequence Analysis"):
@@ -1043,11 +1043,11 @@ with gr.Blocks(css=css) as iface:
             with gr.Column(scale=2):
                 results_box = gr.Textbox(label="Classification Results", lines=12, interactive=False)
-                kmer_img = gr.Image(label="Top k-mer SHAP")
-                genome_img = gr.Image(label="Genome-wide SHAP Heatmap (Blue=neg, White=0, Red=pos)")
                 # File components with the correct type parameter
-                download_kmer_shap = gr.File(label="Download k-mer SHAP Values (CSV)", visible=True, type="filepath")
                 download_results = gr.File(label="Download Results", visible=True, elem_classes="download-button")
         seq_state = gr.State()
@@ -1071,7 +1071,7 @@ with gr.Blocks(css=css) as iface:
     with gr.Tab("2) Subregion Exploration"):
         gr.Markdown("""
         **Subregion Analysis**
-        Select start/end positions to view local SHAP signals, distribution, GC content, etc.
         The heatmap uses the same Blue-White-Red scale.
         """)
         with gr.Row():
@@ -1080,8 +1080,8 @@ with gr.Blocks(css=css) as iface:
             region_btn = gr.Button("Analyze Subregion")
         subregion_info = gr.Textbox(label="Subregion Analysis", lines=7, interactive=False)
         with gr.Row():
-            subregion_img = gr.Image(label="Subregion SHAP Heatmap (B-W-R)")
-            subregion_hist_img = gr.Image(label="SHAP Distribution (Histogram)")
         download_subregion = gr.File(label="Download Subregion Analysis", visible=False, elem_classes="download-button")
         region_btn.click(
@@ -1093,12 +1093,11 @@ with gr.Blocks(css=css) as iface:
     with gr.Tab("3) Gene Features Analysis"):
         gr.Markdown("""
         **Analyze Gene Features**
-        Upload a FASTA file and corresponding gene features file to analyze SHAP values per gene.
         Gene features should be in the format:
 >gene_name [gene=X] [locus_tag=Y] [location=start..end] or [location=complement(start..end)]
         SEQUENCE
         The genome viewer will show genes color-coded by their contribution:
         - Red: Genes pushing toward human origin
         - Blue: Genes pushing toward non-human origin
@@ -1126,7 +1125,7 @@ with gr.Blocks(css=css) as iface:
     with gr.Tab("4) Comparative Analysis"):
         gr.Markdown("""
         **Compare Two Sequences**
-        Upload or paste two FASTA sequences to compare their SHAP patterns.
         The sequences will be normalized to the same length for comparison.
         **Color Scale**:
@@ -1144,8 +1143,8 @@ with gr.Blocks(css=css) as iface:
         compare_btn = gr.Button("Compare Sequences", variant="primary")
         comparison_text = gr.Textbox(label="Comparison Results", lines=12, interactive=False)
         with gr.Row():
-            diff_heatmap = gr.Image(label="SHAP Difference Heatmap")
-            diff_hist = gr.Image(label="Distribution of SHAP Differences")
         download_comparison = gr.File(label="Download Comparison Results", visible=False, elem_classes="download-button")
         compare_btn.click(
@@ -1157,8 +1156,8 @@ with gr.Blocks(css=css) as iface:
     gr.Markdown("""
     ### Interface Features
     - **Overall Classification** (human vs non-human) using k-mer frequencies
-    - **SHAP Analysis** shows which k-mers push classification toward or away from human
-    - **White-Centered SHAP Gradient**:
       - Negative (blue), 0 (white), Positive (red)
       - Symmetrical color range around 0
     - **Identify Subregions** with strongest push for human or non-human
@@ -1172,7 +1171,7 @@ with gr.Blocks(css=css) as iface:
       - Statistical summary of differences
     - **Data Export**:
       - Download results as CSV files
-      - Download k-mer SHAP values
       - Save analysis outputs for further processing
     """)

     return vec
 ###############################################################################
+# 3. FEATURE IMPORTANCE (ABLATION) CALCULATION
 ###############################################################################
 def calculate_shap_values(model, x_tensor):
 ###############################################################################
+# 4. PER-BASE FEATURE IMPORTANCE AGGREGATION
 ###############################################################################
 def compute_positionwise_scores(sequence, shap_values, k=4):
     return shap_means
 ###############################################################################
+# 5. FIND EXTREME IMPORTANCE REGIONS
 ###############################################################################
 def find_extreme_subregion(shap_means, window_size=500, mode="max"):
     colors = [(0.0, 'blue'), (0.5, 'white'), (1.0, 'red')]
     return mcolors.LinearSegmentedColormap.from_list("blue_white_red", colors)
+def plot_linear_heatmap(shap_means, title="Per-base Feature Importance Heatmap", start=None, end=None):
     if start is not None and end is not None:
         local_shap = shap_means[start:end]
         subtitle = f" (positions {start}-{end})"
     cax = ax.imshow(heatmap_data, aspect='auto', cmap=cmap, vmin=-extent, vmax=extent)
     cbar = plt.colorbar(cax, orientation='horizontal', pad=0.25, aspect=40, shrink=0.8)
     cbar.ax.tick_params(labelsize=8)
+    cbar.set_label('Feature Importance', fontsize=9, labelpad=5)
     ax.set_yticks([])
     ax.set_xlabel('Position in Sequence', fontsize=10)
     ax.set_title(f"{title}{subtitle}", pad=10)
     colors = ['#99ccff' if v < 0 else '#ff9999' for v in values]
     plt.barh(range(len(values)), values, color=colors)
     plt.yticks(range(len(values)), features)
+    plt.xlabel('Feature Importance (impact on model output)')
     plt.title(f'Top {top_k} Most Influential k-mers')
     plt.gca().invert_yaxis()
     plt.tight_layout()
     return fig
+def plot_shap_histogram(shap_array, title="Feature Importance Distribution in Region", num_bins=30):
     fig, ax = plt.subplots(figsize=(6, 4))
     ax.hist(shap_array, bins=num_bins, color='gray', edgecolor='black')
     ax.axvline(0, color='red', linestyle='--', label='0.0')
+    ax.set_xlabel("Feature Importance Value")
     ax.set_ylabel("Count")
     ax.set_title(title)
     ax.legend()
 # 7. MAIN ANALYSIS STEP (Gradio Step 1)
 ###############################################################################
 def create_kmer_shap_csv(kmers, shap_values):
+    """Create a CSV file with k-mer importance values and return the filepath"""
+    # Create DataFrame with k-mers and importance values
     kmer_df = pd.DataFrame({
         'kmer': kmers,
+        'importance_value': shap_values,
+        'abs_importance': np.abs(shap_values)
     })
+    # Sort by absolute importance value (most influential first)
+    kmer_df = kmer_df.sort_values('abs_importance', ascending=False)
+    # Drop the abs_importance column used for sorting
+    kmer_df = kmer_df[['kmer', 'importance_value']]
     # Save to temporary file
     temp_dir = tempfile.gettempdir()
+    temp_path = os.path.join(temp_dir, f"kmer_importance_values_{os.urandom(4).hex()}.csv")
     kmer_df.to_csv(temp_path, index=False)
     return temp_path  # Return only the file path, not a tuple
         f"(Human Probability: {prob_human:.3f}, Non-human Probability: {prob_nonhuman:.3f})\n\n"
         f"---\n"
         f"**Most Human-Pushing {window_size}-bp Subregion**:\n"
+        f"Start: {max_start}, End: {max_end}, Avg Importance: {max_avg:.4f}\n\n"
         f"**Most Non-Human–Pushing {window_size}-bp Subregion**:\n"
+        f"Start: {min_start}, End: {min_end}, Avg Importance: {min_avg:.4f}"
     )
     kmers = [''.join(p) for p in product("ACGT", repeat=4)]
     bar_fig = create_importance_bar_plot(shap_values, kmers, top_kmers)
     bar_img = fig_to_image(bar_fig)
+    heatmap_fig = plot_linear_heatmap(shap_means, title="Genome-wide Feature Importance")
     heatmap_img = fig_to_image(heatmap_fig)
+    # Create CSV with k-mer importance values and return the file path
     kmer_shap_csv = create_kmer_shap_csv(kmers, shap_values)
     # State dictionary for subregion analysis
         f"Analyzing subregion of {header} from {region_start} to {region_end}\n"
         f"Region length: {len(region_seq)} bases\n"
         f"GC content: {gc_percent:.2f}%\n"
+        f"Average importance in region: {avg_shap:.4f}\n"
+        f"Fraction with importance > 0 (toward human): {positive_fraction:.2f}\n"
+        f"Fraction with importance < 0 (toward non-human): {negative_fraction:.2f}\n"
         f"Subregion interpretation: {region_classification}\n"
     )
+    heatmap_fig = plot_linear_heatmap(shap_means, title="Subregion Feature Importance", start=region_start, end=region_end)
     heatmap_img = fig_to_image(heatmap_fig)
+    hist_fig = plot_shap_histogram(region_shap, title="Feature Importance Distribution in Subregion")
     hist_img = fig_to_image(hist_fig)
     # For demonstration, returning None for the file download as well
     return mcolors.LinearSegmentedColormap.from_list("blue_white_red", colors)
 def compute_shap_difference(shap1_norm, shap2_norm):
+    """Compute the feature importance difference between normalized sequences"""
     return shap2_norm - shap1_norm
+def plot_comparative_heatmap(shap_diff, title="Feature Importance Difference Heatmap"):
     """
     Plot heatmap using relative positions (0-100%)
     """
     cbar = plt.colorbar(cax, orientation='horizontal', pad=0.25, aspect=40, shrink=0.8)
     cbar.ax.tick_params(labelsize=8)
+    cbar.set_label('Feature Importance Difference (Seq2 - Seq1)', fontsize=9, labelpad=5)
     ax.set_yticks([])
     ax.set_xlabel('Relative Position in Sequence', fontsize=10)
     return fig
+def plot_shap_histogram(shap_array, title="Feature Importance Distribution", num_bins=30):
     """
+    Plot histogram of feature importance values with configurable number of bins
     """
     fig, ax = plt.subplots(figsize=(6, 4))
     ax.hist(shap_array, bins=num_bins, color='gray', edgecolor='black', alpha=0.7)
     ax.axvline(0, color='red', linestyle='--', label='0.0')
+    ax.set_xlabel("Feature Importance Value")
     ax.set_ylabel("Count")
     ax.set_title(title)
     ax.legend()
 def normalize_shap_lengths(shap1, shap2):
     """
+    Normalize and smooth feature importance values with dynamic adaptation
     """
     # Calculate adaptive parameters
     num_points, smooth_window, _ = calculate_adaptive_parameters(len(shap1), len(shap2))
         if isinstance(res2[0], str) and "Error" in res2[0]:
             return (f"Error in sequence 2: {res2[0]}", None, None, None)
+        # Extract feature importance values and sequence info
         shap1 = res1[3]["shap_means"]
         shap2 = res2[3]["shap_means"]
             f"Smoothing Window: {smooth_window} points\n"
             f"Adaptive Threshold: {adaptive_threshold:.3f}\n\n"
             "Statistics:\n"
+            f"Average feature importance difference: {avg_diff:.4f}\n"
             f"Standard deviation: {std_diff:.4f}\n"
             f"Max difference: {max_diff:.4f} (Seq2 more human-like)\n"
             f"Min difference: {min_diff:.4f} (Seq1 more human-like)\n"
         # Generate visualizations
         heatmap_fig = plot_comparative_heatmap(
             shap_diff,
+            title=f"Feature Importance Difference Heatmap (window: {smooth_window})"
         )
         heatmap_img = fig_to_image(heatmap_fig)
         num_bins = max(20, min(50, int(np.sqrt(len(shap_diff)))))
         hist_fig = plot_shap_histogram(
             shap_diff,
+            title="Distribution of Feature Importance Differences",
             num_bins=num_bins
         )
         hist_img = fig_to_image(hist_fig)
         return None, None
 def compute_gene_statistics(gene_shap: np.ndarray) -> Dict[str, float]:
+    """Compute statistical measures for gene feature importance values"""
     return {
         'avg_shap': float(np.mean(gene_shap)),
         'median_shap': float(np.median(gene_shap)),
 def create_simple_genome_diagram(gene_results: List[Dict[str, Any]], genome_length: int) -> Image.Image:
     """
     Create a simple genome diagram using PIL, forcing a minimum color intensity
+    so that small feature importance values don't appear white.
     """
     from PIL import Image, ImageDraw, ImageFont
         title_font = ImageFont.load_default()
     # Draw title
+    draw.text((margin, margin // 2), "Genome Feature Importance Analysis", fill='black', font=title_font or font)
     # Draw genome line
     line_y = height // 2
         ], fill='black', width=1)
         draw.text((int(x_coord - 20), int(line_y + 10)), f"{i:,}", fill='black', font=font)
+    # Sort genes by absolute feature importance value for drawing
     sorted_genes = sorted(gene_results, key=lambda x: abs(x['avg_shap']))
     # Draw genes
         start_x = margin + int(gene['start'] * scale)
         end_x   = margin + int(gene['end'] * scale)
+        # Calculate color based on feature importance value
         avg_shap = gene['avg_shap']
+        # Convert importance -> color intensity (0 to 255)
         # Then clamp to a minimum intensity so it never ends up plain white
         intensity = int(abs(avg_shap) * 500)
         intensity = max(50, min(255, intensity))  # clamp between 50 and 255
     # Draw legend
     legend_x = margin
     legend_y = height - margin
+    draw.text((int(legend_x), int(legend_y - 60)), "Feature Importance Values:", fill='black', font=font)
     # Draw legend boxes
     box_width = 20
                           features_file: str,
                           fasta_text: str = "",
                           features_text: str = "") -> Tuple[str, Optional[str], Optional[Image.Image]]:
+    """Analyze feature importance values for each gene feature"""
     # First analyze whole sequence
     sequence_results = analyze_sequence(sequence_file, top_kmers=10, fasta_text=fasta_text)
     if isinstance(sequence_results[0], str) and "Error" in sequence_results[0]:
         return f"Error in sequence analysis: {sequence_results[0]}", None, None
+    # Get feature importance values
     shap_means = sequence_results[3]["shap_means"]
     # Parse gene features
             if start is None or end is None:
                 continue
+            # Get feature importance values for this region
             gene_shap = shap_means[start:end]
             stats = compute_gene_statistics(gene_shap)
     if not gene_results:
         return "No valid genes could be processed", None, None
+    # Sort genes by absolute feature importance value
     sorted_genes = sorted(gene_results, key=lambda x: abs(x['avg_shap']), reverse=True)
     # Create results text
             f"Location: {gene['location']}\n"
             f"Classification: {gene['classification']} "
             f"(confidence: {gene['confidence']:.4f})\n"
+            f"Average Feature Importance: {gene['avg_shap']:.4f}\n\n"
         )
     # Create CSV content
+    csv_content = "gene_name,location,avg_importance,median_importance,std_importance,max_importance,min_importance,"
     csv_content += "pos_fraction,classification,confidence,locus_tag\n"
     for gene in gene_results:
     gr.Markdown("""
     # Virus Host Classifier
     **Step 1**: Predict overall viral sequence origin (human vs non-human) and identify extreme regions.
+    **Step 2**: Explore subregions to see local feature influence, distribution, GC content, etc.
     **Step 3**: Analyze gene features and their contributions.
     **Step 4**: Compare sequences and analyze differences.
+    **Color Scale**: Negative values = Blue, Zero = White, Positive values = Red.
     """)
     with gr.Tab("1) Full-Sequence Analysis"):
             with gr.Column(scale=2):
                 results_box = gr.Textbox(label="Classification Results", lines=12, interactive=False)
+                kmer_img = gr.Image(label="Top k-mer Importance")
+                genome_img = gr.Image(label="Genome-wide Feature Importance Heatmap (Blue=neg, White=0, Red=pos)")
                 # File components with the correct type parameter
+                download_kmer_shap = gr.File(label="Download k-mer Importance Values (CSV)", visible=True, type="filepath")
                 download_results = gr.File(label="Download Results", visible=True, elem_classes="download-button")
         seq_state = gr.State()
     with gr.Tab("2) Subregion Exploration"):
         gr.Markdown("""
         **Subregion Analysis**
+        Select start/end positions to view local feature importance, distribution, GC content, etc.
         The heatmap uses the same Blue-White-Red scale.
         """)
         with gr.Row():
             region_btn = gr.Button("Analyze Subregion")
         subregion_info = gr.Textbox(label="Subregion Analysis", lines=7, interactive=False)
         with gr.Row():
+            subregion_img = gr.Image(label="Subregion Feature Importance Heatmap (B-W-R)")
+            subregion_hist_img = gr.Image(label="Feature Importance Distribution (Histogram)")
         download_subregion = gr.File(label="Download Subregion Analysis", visible=False, elem_classes="download-button")
         region_btn.click(
     with gr.Tab("3) Gene Features Analysis"):
         gr.Markdown("""
         **Analyze Gene Features**
+        Upload a FASTA file and corresponding gene features file to analyze feature importance values per gene.
         Gene features should be in the format:
 >gene_name [gene=X] [locus_tag=Y] [location=start..end] or [location=complement(start..end)]
         SEQUENCE
         The genome viewer will show genes color-coded by their contribution:
         - Red: Genes pushing toward human origin
         - Blue: Genes pushing toward non-human origin
     with gr.Tab("4) Comparative Analysis"):
         gr.Markdown("""
         **Compare Two Sequences**
+        Upload or paste two FASTA sequences to compare their feature importance patterns.
         The sequences will be normalized to the same length for comparison.
         **Color Scale**:
         compare_btn = gr.Button("Compare Sequences", variant="primary")
         comparison_text = gr.Textbox(label="Comparison Results", lines=12, interactive=False)
         with gr.Row():
+            diff_heatmap = gr.Image(label="Feature Importance Difference Heatmap")
+            diff_hist = gr.Image(label="Distribution of Feature Importance Differences")
         download_comparison = gr.File(label="Download Comparison Results", visible=False, elem_classes="download-button")
         compare_btn.click(
     gr.Markdown("""
     ### Interface Features
     - **Overall Classification** (human vs non-human) using k-mer frequencies
+    - **Feature Importance Analysis** shows which k-mers push classification toward or away from human
+    - **White-Centered Gradient**:
       - Negative (blue), 0 (white), Positive (red)
       - Symmetrical color range around 0
     - **Identify Subregions** with strongest push for human or non-human
       - Statistical summary of differences
     - **Data Export**:
       - Download results as CSV files
+      - Download k-mer importance values
       - Save analysis outputs for further processing
     """)