# Spaces: DNA-LLM / viral_complexity
# Status: Runtime error
# File size: 4,129 bytes

import math
import os
import re

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from datasets import load_dataset
from scipy.interpolate import interp1d
from shiny import render
from shiny.express import input, output, ui

# from utils import (
#     generate_2d_sequence,
#     plot_seq_full_label
# )
# Reset every matplotlib rcParam to the library's built-in defaults.
mpl.rcParams.update(mpl.rcParamsDefault)

# Load the 'Hack90/virus_tiny' dataset and copy its 'train' split into a
# pandas DataFrame used by the plots below.
# NOTE(review): plot_loss_rates reads a 'sequence' column from this frame —
# confirm the dataset schema provides it.
ds = load_dataset('Hack90/virus_tiny')
df_virus = pd.DataFrame(ds['train'])

def shannon_entropy(seq):
    """Score a nucleotide sequence by its A/T-versus-G/C composition.

    Only the uppercase characters ``A``, ``T``, ``C`` and ``G`` are counted;
    every other character (``N``, gaps, lowercase letters, whitespace) is
    ignored. Let ``p`` be the fraction of counted bases that are ``A`` or
    ``T`` and ``H = -(p ln p + (1 - p) ln(1 - p))`` the two-state entropy in
    nats.

    Args:
        seq: Nucleotide sequence as a string.

    Returns:
        A ``(c_h, shann)`` tuple of floats, where
        ``c_h = H * ln((1 - p) / p) / (8.69 - 8.31)`` and ``shann = H / 2``.
        Returns ``(0.0, 0.0)`` when the counted bases are all A/T or all G/C
        (the limit of both expressions as ``p`` approaches 0 or 1), and
        ``(nan, nan)`` when the sequence contains no countable base, since
        the composition is undefined in that case.
    """
    # Divisor applied to the complexity score. The literals are carried over
    # unchanged; their origin is not documented in this file.
    scale = 8.69 - 8.31

    # Counting the four bases directly is equivalent to stripping every other
    # character first and then merging A->T and G->C.
    weak = seq.count('A') + seq.count('T')
    strong = seq.count('C') + seq.count('G')
    total = weak + strong

    if total == 0:
        # Nothing to measure: avoid dividing by zero.
        return math.nan, math.nan
    if weak == 0 or strong == 0:
        # p is exactly 0 or 1, where log(p) / log(1 - p) is undefined; both
        # scores tend to 0 in that limit.
        return 0.0, 0.0

    p = weak / total
    q = 1 - p
    entropy = -(p * math.log(p) + q * math.log(q))

    c_h = entropy * math.log(q / p) / scale
    shann = entropy / 2
    return c_h, shann


# Mark the page as fillable.
ui.page_opts(fillable=True)

# Tabbed card container with id "tab". "Species View" is the only panel that
# is currently active; the other panels further down are commented out.
with ui.navset_card_tab(id="tab"):
    with ui.nav_panel("Species View"):
        ui.panel_title("What is the distribution of complexity across viral species?")
        with ui.card():
            # Slider with id "sample": minimum 0, maximum len(df_virus),
            # initial value 40. plot_context_size_scaling reads it through
            # input.sample() as the number of rows to score.
            ui.input_slider("sample", "samples", 0, len(df_virus), 40)
                
        def plot_loss_rates(df, samples):
            """Scatter-plot the complexity scores of the first ``samples`` rows.

            Each of the first ``samples`` entries of ``df['sequence']`` is
            passed to ``shannon_entropy``, which returns a pair of values.
            The point for a sequence is placed at ``x = 2 * second value``
            and ``y = first value``.

            Args:
                df: DataFrame with a 'sequence' column.
                samples: Number of leading rows to score. ``0`` yields an
                    empty plot instead of raising.

            Returns:
                The matplotlib Figure holding the scatter plot.
            """
            # Iterate the column slice directly instead of indexing row by row.
            scores = [
                shannon_entropy(sequence)
                for sequence in df['sequence'].iloc[:samples]
            ]

            # Plain lists keep the empty case valid: building a DataFrame from
            # an empty list produced no columns, so the column lookup failed.
            x_values = [2 * second for _, second in scores]
            y_values = [first for first, _ in scores]

            fig, ax = plt.subplots()
            scatter = ax.scatter(x_values, y_values, s=0.5)

            # NOTE(review): no `c=` values are passed to scatter, so this
            # colorbar does not appear to encode any data — confirm whether it
            # is still wanted.
            cbar = fig.colorbar(scatter, ax=ax)
            cbar.set_label('Label')

            return fig

        @render.plot()
        def plot_context_size_scaling():
            """Render the complexity scatter for the slider-selected row count."""
            sample_count = input.sample()
            figure = plot_loss_rates(df_virus, sample_count)
            # Hand back the figure only when one was produced; otherwise None.
            return figure if figure else None
    # with ui.nav_panel("Histone Modification"):
    #     ui.panel_title("Is there a pattern to histone modification?")
    #     with ui.layout_columns():
    #         with ui.card():
    #             ui.input_slider("sample_histone", "sample", 0, df_histone_len, 40)
        
        
    #     def plot_histone(df,sample):
    #         y_values = generate_2d_sequence(df['seq'].iloc[sample])[0]
    #         x_values = generate_2d_sequence(df['seq'].iloc[sample])[1]
            
    #         integers = str((np.argwhere(df['labels'][sample] == np.amax(df['labels'][sample]))).flatten().tolist())
    #         # Create a DataFrame with the x values, y values, and integers
    #         data = {'x': x_values, 'y': y_values, 'color': integers}

    #         fig, ax = plt.subplots()

    #         sns.scatterplot(x='x', y='y', hue='color', data=data, palette='viridis', ax=ax)
    #         ax.legend()
    #         # ax.set_title(f"Loss ra")
    #         # ax.set_xlabel("Training steps")
    #         # ax.set_ylabel("Loss rate")
    #         return fig      
    #     @render.plot()
    #     def plot_histones_two():
    #         fig = plot_histone(df_histone,input.sample_histone() )
    #         if fig:
    #             return fig
    # with ui.nav_panel("Enhancer Annontations"):
    #     ui.panel_title("Is there a pattern to enhancer annotations?")
    #     with ui.layout_columns():
    #         with ui.card():
    #             ui.input_slider("sample_enhancer", "sample", 0, df_enhancer_annotation_len, 40)
    #     @render.plot()
    #     def plot_enhancer():
    #         fig = plot_loss_rates(df_enhancer_annotation,input.sample_enhancer() , True)
    #         if fig:
    #             return fig