import gradio as gr
import torch
import joblib
import numpy as np
from itertools import product
import torch.nn as nn
import matplotlib.pyplot as plt
import io
from PIL import Image

###############################################################################
# 1. MODEL DEFINITION
###############################################################################

class VirusClassifier(nn.Module):
    """Small fully connected network that maps a feature vector to two logits.

    The layers live in a single ``nn.Sequential`` stored as ``self.network``:
    two ``Linear -> GELU -> BatchNorm1d -> Dropout(0.3)`` stages (widths 64
    and 32), then ``Linear(32, 32) -> GELU -> Linear(32, 2)``.
    """

    def __init__(self, input_shape: int):
        """Build the layer stack for inputs with ``input_shape`` features."""
        super().__init__()

        stack = []
        # Two regularised hidden stages; construction order is kept fixed so
        # the Sequential indices (and therefore state_dict keys) do not move.
        for fan_in, fan_out in ((input_shape, 64), (64, 32)):
            stack += [
                nn.Linear(fan_in, fan_out),
                nn.GELU(),
                nn.BatchNorm1d(fan_out),
                nn.Dropout(0.3),
            ]
        # Classification head ending in two raw logits.
        stack += [nn.Linear(32, 32), nn.GELU(), nn.Linear(32, 2)]

        self.network = nn.Sequential(*stack)

    def forward(self, x):
        """Return the raw (pre-softmax) logits for the batch ``x``."""
        logits = self.network(x)
        return logits

###############################################################################
# 2. FASTA PARSING & K-MER FEATURE ENGINEERING
###############################################################################

def parse_fasta(text):
    """Parse FASTA formatted text into a list of (header, sequence) tuples.

    Args:
        text: Raw FASTA content. Lines starting with ``>`` open a new record;
            every other non-blank line is appended to the current record's
            sequence after upper-casing.

    Returns:
        A list of ``(header, sequence)`` tuples in file order. The header is
        the text after ``>`` (possibly an empty string). Sequence lines that
        appear before the first header are ignored, so input without any
        ``>`` line yields an empty list.
    """
    records = []
    header = None  # None means "no record opened yet"
    chunks = []

    # splitlines() copes with \n, \r\n and bare \r line endings alike.
    for raw_line in text.splitlines():
        line = raw_line.strip()
        if not line:
            continue
        if line.startswith('>'):
            # Compare against None, not truthiness: a bare ">" line yields an
            # empty-string header, which is still a real record.
            if header is not None:
                records.append((header, ''.join(chunks)))
            header = line[1:]
            chunks = []
        elif header is not None:
            chunks.append(line.upper())

    if header is not None:
        records.append((header, ''.join(chunks)))
    return records

def sequence_to_kmer_vector(sequence: str, k: int = 4) -> np.ndarray:
    """Build the normalised k-mer frequency vector of ``sequence``.

    The vector has one float32 slot per k-mer over the alphabet ``ACGT``
    (``4**k`` slots, in ``itertools.product`` order). Windows containing any
    other character are not counted, but they still contribute to the
    denominator, which is the total number of windows in the sequence.
    Sequences shorter than ``k`` produce an all-zero vector.
    """
    slot_of = {
        ''.join(letters): slot
        for slot, letters in enumerate(product("ACGT", repeat=k))
    }
    counts = np.zeros(len(slot_of), dtype=np.float32)
    n_windows = len(sequence) - k + 1

    for left in range(n_windows):
        slot = slot_of.get(sequence[left:left + k])
        if slot is not None:
            counts[slot] += 1

    # Nothing to normalise when the sequence holds no complete window.
    if n_windows <= 0:
        return counts
    return counts / n_windows

###############################################################################
# 3. SHAP-VALUE (ABLATION) CALCULATION
###############################################################################

def calculate_shap_values(model, x_tensor):
    """
    Estimate per-feature importance by single-feature ablation.

    The model is switched to eval mode, the class-1 ('human') softmax
    probability is computed for the first row of ``x_tensor``, and then each
    feature of that row is set to 0.0 in turn. The importance of a feature is
    ``baseline_probability - probability_with_feature_zeroed``.

    Returns (importances as a 1-D numpy array, baseline class-1 probability).
    ``x_tensor`` itself is not modified; the ablation works on a clone.
    """
    def class_one_probability(inputs):
        # Probability of class index 1 for the first row of the batch.
        return torch.softmax(model(inputs), dim=1)[0, 1].item()

    model.eval()
    impacts = []
    with torch.no_grad():
        baseline_prob = class_one_probability(x_tensor)

        probe = x_tensor.clone()
        for feature_idx in range(x_tensor.shape[1]):
            saved = probe[0, feature_idx].item()
            probe[0, feature_idx] = 0.0
            impacts.append(baseline_prob - class_one_probability(probe))
            # Put the value back so only one feature is ablated at a time.
            probe[0, feature_idx] = saved

    return np.array(impacts), baseline_prob

###############################################################################
# 4. PER-BASE SHAP AGGREGATION
###############################################################################

def compute_positionwise_scores(sequence, shap_values, k=4):
    """
    Spread k-mer level scores onto individual bases.

    For every window of length ``k`` that is a valid ACGT k-mer, the k-mer's
    entry in ``shap_values`` is credited to each of the ``k`` bases it covers.
    The result (float32, one entry per base) is the mean of the credited
    values; bases covered by no valid k-mer get 0.0.
    """
    slot_of = {
        ''.join(letters): slot
        for slot, letters in enumerate(product("ACGT", repeat=k))
    }

    n_bases = len(sequence)
    totals = np.zeros(n_bases, dtype=np.float32)
    hits = np.zeros(n_bases, dtype=np.float32)

    for left in range(n_bases - k + 1):
        slot = slot_of.get(sequence[left:left + k])
        if slot is None:
            continue
        covered = slice(left, left + k)
        totals[covered] += shap_values[slot]
        hits[covered] += 1

    # Divide only where a base was covered; everything else stays at zero.
    per_base = np.zeros(n_bases, dtype=np.float32)
    np.divide(totals, hits, out=per_base, where=hits > 0)
    return per_base

###############################################################################
# 5. FIND EXTREME SHAP REGIONS
###############################################################################

def find_extreme_subregion(shap_means, window_size=500, mode="max"):
    """
    Find the window of ``window_size`` consecutive positions whose average
    value is largest (``mode="max"``) or smallest (any other ``mode``).

    Args:
        shap_means: 1-D sequence/array of per-position scores.
        window_size: Window length in positions; coerced to ``int``.
        mode: ``"max"`` for the highest-average window; anything else selects
            the lowest-average window.

    Returns:
        ``(best_start, best_end, avg)`` with ``best_end`` exclusive. If the
        window is at least as long as the input, the whole input is returned
        as ``(0, n, mean)`` (``mean`` is 0.0 for empty input). On ties the
        earliest window wins.

    Raises:
        ValueError: if ``window_size`` is not positive while the input is
            longer than the window.
    """
    values = np.asarray(shap_means, dtype=np.float64)
    window_size = int(window_size)
    n = len(values)

    if window_size >= n:
        # Window covers everything: report the whole sequence.
        whole_avg = float(np.mean(values)) if n > 0 else 0.0
        return (0, n, whole_avg)
    if window_size <= 0:
        raise ValueError("window_size must be a positive integer")

    # Exclusive prefix sums: prefix[i] == sum(values[:i]), so prefix has n + 1
    # entries and sum(values[s:s + w]) == prefix[s + w] - prefix[s].
    # (np.cumsum alone is inclusive, which shifts every window by one and
    # runs off the end of the array for the last window.)
    prefix = np.concatenate(([0.0], np.cumsum(values)))
    window_sums = prefix[window_size:] - prefix[:-window_size]

    # argmax/argmin return the first occurrence, i.e. the earliest window.
    if mode == "max":
        best_start = int(np.argmax(window_sums))
    else:
        best_start = int(np.argmin(window_sums))

    best_avg = float(window_sums[best_start] / window_size)
    return (best_start, best_start + window_size, best_avg)

###############################################################################
# 6. PLOTTING / UTILITIES
###############################################################################

def fig_to_image(fig):
    """Render a Matplotlib figure to PNG in memory and return it as a PIL Image.

    The figure is closed via ``plt.close`` once it has been saved. The
    in-memory buffer is deliberately not closed here because the returned
    image is opened on top of it.
    """
    png_buffer = io.BytesIO()
    fig.savefig(png_buffer, dpi=150, format='png', bbox_inches='tight')
    plt.close(fig)

    # Rewind so PIL starts reading at the beginning of the PNG data.
    png_buffer.seek(0)
    return Image.open(png_buffer)

def plot_linear_heatmap(shap_means, title="Per-base SHAP Heatmap", start=None, end=None):
    """
    Plot a 1-row heatmap of per-base SHAP contributions.

    Negative = push toward Non-Human, Positive = push toward Human. The colour
    scale is symmetric around zero so that the midpoint of the diverging
    colormap always corresponds to a contribution of 0.

    Args:
        shap_means: 1-D array of per-base values.
        title: Title shown above the heatmap.
        start, end: Optional slice bounds. When both are given only
            ``shap_means[start:end]`` is drawn, the title gets a
            " (positions start-end)" suffix, and the x-axis is labelled with
            the original positions rather than restarting at 0.

    Returns:
        The Matplotlib figure (not closed; the caller owns it).

    Layout: the colorbar is horizontal with ``pad=0.35`` and the figure gets
    ``bottom=0.4`` so the colorbar sits clear of the x-axis label.
    """
    shap_means = np.asarray(shap_means)

    offset = 0
    if start is not None and end is not None:
        # Resolve the slice first so negative/out-of-range bounds still give
        # the true position of the first plotted base.
        offset = slice(start, end).indices(len(shap_means))[0]
        shap_means = shap_means[start:end]
        subtitle = f" (positions {start}-{end})"
    else:
        subtitle = ""

    heatmap_data = shap_means.reshape(1, -1)  # shape (1, region_length)
    n_cols = heatmap_data.shape[1]

    # Symmetric colour limits; fall back to 1.0 for empty, all-zero or NaN
    # data so the normalisation stays well defined.
    limit = float(np.max(np.abs(heatmap_data))) if n_cols else 0.0
    if not limit > 0.0:
        limit = 1.0

    fig, ax = plt.subplots(figsize=(12, 2))
    cax = ax.imshow(
        heatmap_data,
        aspect='auto',
        cmap='RdBu_r',
        vmin=-limit,
        vmax=limit,
        # Pixel centres land on the original integer positions.
        extent=(offset - 0.5, offset + n_cols - 0.5, 0.0, 1.0),
    )

    # Place colorbar below and add extra margin
    cbar = fig.colorbar(cax, ax=ax, orientation='horizontal', pad=0.35)
    cbar.set_label('SHAP Contribution')

    ax.set_yticks([])
    ax.set_xlabel('Position in Sequence')
    ax.set_title(f"{title}{subtitle}")

    # Extra bottom margin so colorbar won't overlap x-axis labels
    fig.subplots_adjust(bottom=0.4)

    return fig

def create_importance_bar_plot(shap_values, kmers, top_k=10):
    """Draw a horizontal bar chart of the ``top_k`` k-mers with the largest
    absolute value in ``shap_values`` and return the figure.

    ``kmers[i]`` is the label for ``shap_values[i]``. Bars with a positive
    value are drawn in light red, all others in light blue. Note that this
    updates the global Matplotlib font size to 10.
    """
    plt.rcParams.update({'font.size': 10})
    fig = plt.figure(figsize=(10, 5))
    ax = fig.add_subplot(1, 1, 1)

    # Ascending sort by magnitude; the tail holds the most influential k-mers.
    by_magnitude = np.argsort(np.abs(shap_values))
    chosen = by_magnitude[-top_k:]
    bar_values = shap_values[chosen]
    bar_labels = [kmers[idx] for idx in chosen]

    bar_colors = []
    for value in bar_values:
        bar_colors.append('#ff9999' if value > 0 else '#99ccff')

    rows = range(len(bar_values))
    ax.barh(rows, bar_values, color=bar_colors)
    ax.set_yticks(rows)
    ax.set_yticklabels(bar_labels)
    ax.set_xlabel('SHAP Value (impact on model output)')
    ax.set_title(f'Top {top_k} Most Influential k-mers')
    ax.invert_yaxis()
    fig.tight_layout()
    return fig

def plot_shap_histogram(shap_array, title="SHAP Distribution in Region"):
    """
    Draw a 30-bin histogram of the values in ``shap_array`` and return the
    figure. A dashed red vertical line marks 0.0 so the split between
    positive and negative values is easy to see.
    """
    fig, axes = plt.subplots(figsize=(6, 4))

    axes.hist(shap_array, bins=30, color='gray', edgecolor='black')
    axes.axvline(0, color='red', linestyle='--', label='0.0')

    axes.set(xlabel="SHAP Value", ylabel="Count", title=title)
    axes.legend()

    fig.tight_layout()
    return fig

def compute_gc_content(sequence):
    """Return the percentage (0-100) of characters in ``sequence`` that are
    'G' or 'C'. Matching is case-sensitive; an empty sequence yields 0."""
    if sequence:
        gc_total = sum(sequence.count(base) for base in "GC")
        return (gc_total / len(sequence)) * 100.0
    return 0

###############################################################################
# 7. MAIN ANALYSIS STEP (Gradio Step 1)
###############################################################################

def analyze_sequence(file_obj, top_kmers=10, fasta_text="", window_size=500):
    """
    Classify the first sequence of a FASTA input and explain the prediction.

    Args:
        file_obj: Path to an uploaded FASTA file, or None. Only used when
            ``fasta_text`` is empty.
        top_kmers: Number of k-mers shown in the importance bar plot.
        fasta_text: Pasted FASTA text; takes precedence over ``file_obj``.
        window_size: Window length used to locate the most human-pushing and
            most non-human-pushing subregions.

    Returns:
        A 5-tuple ``(results_text, bar_img, heatmap_img, state_dict, header)``.
        ``state_dict`` holds ``"seq"`` and ``"shap_means"`` for the subregion
        step. On any input/loading problem the first element is an error
        message and the remaining four are None.
    """
    def failure(message):
        # Same arity as the success tuple so every output component is set.
        return (message, None, None, None, None)

    kmer_size = 4  # the classifier is built for 4**4 = 256 k-mer features

    # Handle input (pasted text wins over an uploaded file). fasta_text may
    # arrive as None, so normalise it before stripping.
    pasted = (fasta_text or "").strip()
    if pasted:
        text = pasted
    elif file_obj is not None:
        try:
            # Explicit encoding: the default is platform dependent, and stray
            # non-UTF-8 bytes should not abort the whole analysis.
            with open(file_obj, 'r', encoding='utf-8', errors='replace') as f:
                text = f.read()
        except Exception as e:
            return failure(f"Error reading file: {str(e)}")
    else:
        return failure("Please provide a FASTA sequence.")

    # UI sliders may deliver floats; both values are used as slice/index
    # bounds further down, which requires integers.
    top_kmers = int(top_kmers)
    window_size = int(window_size)

    # Parse FASTA; only the first record is analysed.
    sequences = parse_fasta(text)
    if not sequences:
        return failure("No valid FASTA sequences found.")

    header, seq = sequences[0]
    if len(seq) < kmer_size:
        return failure(
            f"Sequence is too short to analyze (need at least {kmer_size} bases)."
        )

    # Load model and scaler
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    try:
        model = VirusClassifier(4 ** kmer_size).to(device)
        # NOTE: torch.load and joblib.load unpickle their input; only point
        # them at trusted files.
        model.load_state_dict(torch.load('model.pt', map_location=device))
        scaler = joblib.load('scaler.pkl')
    except Exception as e:
        return failure(f"Error loading model: {str(e)}")

    # Vectorize + scale
    freq_vector = sequence_to_kmer_vector(seq, k=kmer_size)
    scaled_vector = scaler.transform(freq_vector.reshape(1, -1))
    x_tensor = torch.FloatTensor(scaled_vector).to(device)

    # SHAP + classification
    shap_values, prob_human = calculate_shap_values(model, x_tensor)
    prob_nonhuman = 1.0 - prob_human

    classification = "Human" if prob_human > 0.5 else "Non-human"
    confidence = max(prob_human, prob_nonhuman)

    # Per-base SHAP
    shap_means = compute_positionwise_scores(seq, shap_values, k=kmer_size)

    # Find the most "human-pushing" and most "non-human-pushing" regions
    (max_start, max_end, max_avg) = find_extreme_subregion(shap_means, window_size, mode="max")
    (min_start, min_end, min_avg) = find_extreme_subregion(shap_means, window_size, mode="min")

    # Build results text
    results_text = (
        f"Sequence: {header}\n"
        f"Length: {len(seq):,} bases\n"
        f"Classification: {classification}\n"
        f"Confidence: {confidence:.3f}\n"
        f"(Human Probability: {prob_human:.3f}, Non-human Probability: {prob_nonhuman:.3f})\n\n"
        f"---\n"
        f"**Most Human-Pushing {window_size}-bp Subregion**:\n"
        f"Start: {max_start}, End: {max_end}, Avg SHAP: {max_avg:.4f}\n\n"
        f"**Most Non-Human–Pushing {window_size}-bp Subregion**:\n"
        f"Start: {min_start}, End: {min_end}, Avg SHAP: {min_avg:.4f}"
    )

    # K-mer importance plot (labels in the same order as the feature vector)
    kmers = [''.join(p) for p in product("ACGT", repeat=kmer_size)]
    bar_fig = create_importance_bar_plot(shap_values, kmers, top_kmers)
    bar_img = fig_to_image(bar_fig)

    # Full-genome SHAP heatmap
    heatmap_fig = plot_linear_heatmap(shap_means, title="Genome-wide SHAP")
    heatmap_img = fig_to_image(heatmap_fig)

    # Store data for subregion analysis
    state_dict = {
        "seq": seq,
        "shap_means": shap_means
    }

    return (results_text, bar_img, heatmap_img, state_dict, header)

###############################################################################
# 8. SUBREGION ANALYSIS (Gradio Step 2)
###############################################################################

def analyze_subregion(state, header, region_start, region_end):
    """
    Summarise a user-chosen region using the data stored by step 1.

    Args:
        state: Dict with ``"seq"`` (the sequence string) and ``"shap_means"``
            (per-base values), as produced by ``analyze_sequence``.
        header: Sequence name, used only in the report text.
        region_start, region_end: Region bounds (end exclusive). They are
            converted to ``int`` and clamped to ``[0, len(seq)]``.

    Returns:
        ``(region_info, heatmap_img, hist_img)``. If the state is missing or
        the bounds are unusable, the first element is an explanatory message
        and both images are None.
    """
    if not state or "seq" not in state or "shap_means" not in state:
        return ("No sequence data found. Please run Step 1 first.", None, None)

    seq = state["seq"]
    shap_means = state["shap_means"]

    # Validate bounds. A cleared number field arrives as None, and NaN/inf
    # cannot be converted either; report that instead of raising.
    try:
        region_start = int(region_start)
        region_end = int(region_end)
    except (TypeError, ValueError, OverflowError):
        return ("Invalid region range. Start and End must be numbers.", None, None)

    region_start = max(0, min(region_start, len(seq)))
    region_end = max(0, min(region_end, len(seq)))
    if region_end <= region_start:
        return ("Invalid region range. End must be > Start.", None, None)

    # Subsequence
    region_seq = seq[region_start:region_end]
    region_shap = shap_means[region_start:region_end]

    # Some stats
    gc_percent = compute_gc_content(region_seq)
    avg_shap = float(np.mean(region_shap))

    # Fraction pushing toward human vs. non-human
    positive_fraction = float(np.mean(region_shap > 0))
    negative_fraction = float(np.mean(region_shap < 0))

    # Simple logic-based interpretation (fixed +/-0.05 thresholds on the mean)
    if avg_shap > 0.05:
        region_classification = "Likely pushing toward human"
    elif avg_shap < -0.05:
        region_classification = "Likely pushing toward non-human"
    else:
        region_classification = "Near neutral (no strong push)"

    region_info = (
        f"Analyzing subregion of {header} from {region_start} to {region_end}\n"
        f"Region length: {len(region_seq)} bases\n"
        f"GC content: {gc_percent:.2f}%\n"
        f"Average SHAP in region: {avg_shap:.4f}\n"
        f"Fraction with SHAP > 0 (toward human): {positive_fraction:.2f}\n"
        f"Fraction with SHAP < 0 (toward non-human): {negative_fraction:.2f}\n"
        f"Subregion interpretation: {region_classification}\n"
    )

    # Plot region as small heatmap
    heatmap_fig = plot_linear_heatmap(
        shap_means,
        title="Subregion SHAP",
        start=region_start,
        end=region_end
    )
    heatmap_img = fig_to_image(heatmap_fig)

    # Plot histogram of SHAP in region
    hist_fig = plot_shap_histogram(region_shap, title="SHAP Distribution in Subregion")
    hist_img = fig_to_image(hist_fig)

    return (region_info, heatmap_img, hist_img)


###############################################################################
# 9. BUILD GRADIO INTERFACE
###############################################################################

# Custom CSS passed to gr.Blocks below: sets the font family of the app container.
css = """
.gradio-container {
    font-family: 'IBM Plex Sans', sans-serif;
}
"""

# Two-tab UI: tab 1 runs analyze_sequence on the whole input, tab 2 runs
# analyze_subregion on data that tab 1 stored in gr.State components.
with gr.Blocks(css=css) as iface:
    gr.Markdown("""
    # Virus Host Classifier (with Interactive Region Viewer)
    **Step 1**: Predict overall viral sequence origin (human vs non-human) and identify extreme regions.  
    **Step 2**: Explore subregions to see local SHAP signals, distribution, GC content, etc.
    """)

    with gr.Tab("1) Full-Sequence Analysis"):
        with gr.Row():
            # Left column: inputs and the trigger button.
            with gr.Column(scale=1):
                file_input = gr.File(
                    label="Upload FASTA file",
                    file_types=[".fasta", ".fa", ".txt"],
                    type="filepath"
                )
                text_input = gr.Textbox(
                    label="Or paste FASTA sequence",
                    placeholder=">sequence_name\nACGTACGT...",
                    lines=5
                )
                top_k = gr.Slider(
                    minimum=5,
                    maximum=30,
                    value=10,
                    step=1,
                    label="Number of top k-mers to display"
                )
                win_size = gr.Slider(
                    minimum=100,
                    maximum=5000,
                    value=500,
                    step=100,
                    label="Window size for 'most pushing' subregions"
                )
                analyze_btn = gr.Button("Analyze Sequence", variant="primary")

            # Right column: text report and the two result images.
            with gr.Column(scale=2):
                results_box = gr.Textbox(
                    label="Classification Results", lines=12, interactive=False
                )
                kmer_img = gr.Image(label="Top k-mer SHAP")
                genome_img = gr.Image(label="Genome-wide SHAP Heatmap")

        # State for step 2: seq_state receives the dict with "seq" and
        # "shap_means", header_state receives the sequence header.
        seq_state = gr.State()
        header_state = gr.State()

        # The inputs list follows analyze_sequence's positional parameters
        # (file_obj, top_kmers, fasta_text, window_size). The outputs list
        # follows its 5-tuple return value:
        #   results_text, bar_img, heatmap_img, state_dict, header
        analyze_btn.click(
            analyze_sequence,
            inputs=[file_input, top_k, text_input, win_size],
            outputs=[results_box, kmer_img, genome_img, seq_state, header_state]
        )

    with gr.Tab("2) Subregion Exploration"):
        gr.Markdown("""
        **Subregion Analysis**  
        Select start/end positions to view local SHAP signals, distribution, and GC content.
        """)
        with gr.Row():
            region_start = gr.Number(label="Region Start", value=0)
            region_end = gr.Number(label="Region End", value=500)
            region_btn = gr.Button("Analyze Subregion")

        subregion_info = gr.Textbox(
            label="Subregion Analysis",
            lines=7, 
            interactive=False
        )
        with gr.Row():
            subregion_img = gr.Image(label="Subregion SHAP Heatmap")
            subregion_hist_img = gr.Image(label="SHAP Distribution (Histogram)")

        # Inputs follow analyze_subregion(state, header, region_start,
        # region_end); outputs follow its (info text, heatmap, histogram) return.
        region_btn.click(
            analyze_subregion,
            inputs=[seq_state, header_state, region_start, region_end],
            outputs=[subregion_info, subregion_img, subregion_hist_img]
        )

    # Static help text shown below both tabs.
    gr.Markdown("""
    ### What does this interface provide?
    1. **Overall Classification** (human vs non-human), using a learned model on k-mer frequencies.
    2. **SHAP Analysis** (ablation-based) to see which k-mer features push classification toward or away from "human".
    3. **Genome-Wide SHAP Heatmap**: Each base's average SHAP across overlapping k-mers.
    4. **Subregion Exploration**:
       - Local SHAP signals (heatmap & histogram)
       - GC content, fraction of bases pushing "human" vs "non-human"
       - Simple logic-based interpretation based on average SHAP
    5. **Identification of the most 'human-pushing' subregion** (max average SHAP) 
       and the most 'non-human–pushing' subregion (min average SHAP), 
       each of a chosen window size.
    """)

# Start the Gradio server only when this file is run as a script.
if __name__ == "__main__":
    iface.launch()