Spaces:

DNA-LLM
/

viral_complexity

Runtime error

App Files Files Community

Hack90 committed on Jun 19, 2024

Commit

42c997a

verified ·

1 Parent(s): 61488b4

Update app.py

Browse files

Files changed (1) hide show

app.py +72 -62

app.py CHANGED Viewed

@@ -1,46 +1,56 @@
 import pandas as pd
 import numpy as np
 import matplotlib.pyplot as plt
 from scipy.interpolate import interp1d
 from shiny import render
 from shiny.express import input, output, ui
-from utils import (
-    generate_2d_sequence,
-    plot_seq_full_label
-)
 import os
 import matplotlib as mpl
 import seaborn as sns
 mpl.rcParams.update(mpl.rcParamsDefault)
-df_gene_varient = pd.read_parquet("gene_varient.parquet")
-df_histone = pd.read_parquet("histone.parquet")
-df_gene_len = len(df_gene_varient)
-df_histone_len = len(df_histone)
-df_enhancer_annotation = pd.read_parquet('enhancer_annotation.parquet')
-df_enhancer_annotation_len = len(df_enhancer_annotation)
 ui.page_opts(fillable=True)
 with ui.navset_card_tab(id="tab"):
-    with ui.nav_panel("Gene Varient"):
-        ui.panel_title("Is there a pattern to gene varient location?")
-        with ui.layout_columns():
-            with ui.card():
-                ui.input_slider("sample", "sample", 0, df_gene_len, 40)
-        def plot_loss_rates(df, sample, enhancer=False):
-            y_values = generate_2d_sequence(df['seq'].iloc[sample])[0]
-            x_values = generate_2d_sequence(df['seq'].iloc[sample])[1]
-            integers = df['labels'].iloc[sample]
-            if enhancer:
-                K= 128
-                res = []
-                for i in integers:
-                    res.extend([i]*K)
-                integers = res
-            # Create a DataFrame with the x values, y values, and integers
-            data = {'x': x_values, 'y': y_values, 'color': integers}
             # fig, ax = plt.subplots()
@@ -49,7 +59,7 @@ with ui.navset_card_tab(id="tab"):
             fig, ax = plt.subplots()
             # Create the scatter plot
-            scatter = ax.scatter(data['x'], data['y'], c=data['color'], cmap='tab20', s=0.5)
             # Add a colorbar
             cbar = fig.colorbar(scatter, ax=ax)
@@ -65,46 +75,46 @@ with ui.navset_card_tab(id="tab"):
         @render.plot()
         def plot_context_size_scaling():
-            fig = plot_loss_rates(df_gene_varient,input.sample() )
             if fig:
                 return fig
-    with ui.nav_panel("Histone Modification"):
-        ui.panel_title("Is there a pattern to histone modification?")
-        with ui.layout_columns():
-            with ui.card():
-                ui.input_slider("sample_histone", "sample", 0, df_histone_len, 40)
-        def plot_histone(df,sample):
-            y_values = generate_2d_sequence(df['seq'].iloc[sample])[0]
-            x_values = generate_2d_sequence(df['seq'].iloc[sample])[1]
-            integers = str((np.argwhere(df['labels'][sample] == np.amax(df['labels'][sample]))).flatten().tolist())
-            # Create a DataFrame with the x values, y values, and integers
-            data = {'x': x_values, 'y': y_values, 'color': integers}
-            fig, ax = plt.subplots()
-            sns.scatterplot(x='x', y='y', hue='color', data=data, palette='viridis', ax=ax)
-            ax.legend()
-            # ax.set_title(f"Loss ra")
-            # ax.set_xlabel("Training steps")
-            # ax.set_ylabel("Loss rate")
-            return fig
-        @render.plot()
-        def plot_histones_two():
-            fig = plot_histone(df_histone,input.sample_histone() )
-            if fig:
-                return fig
-    with ui.nav_panel("Enhancer Annontations"):
-        ui.panel_title("Is there a pattern to enhancer annotations?")
-        with ui.layout_columns():
-            with ui.card():
-                ui.input_slider("sample_enhancer", "sample", 0, df_enhancer_annotation_len, 40)
-        @render.plot()
-        def plot_enhancer():
-            fig = plot_loss_rates(df_enhancer_annotation,input.sample_enhancer() , True)
-            if fig:
-                return fig

 import pandas as pd
 import numpy as np
+from datasets import load_dataset
 import matplotlib.pyplot as plt
 from scipy.interpolate import interp1d
 from shiny import render
 from shiny.express import input, output, ui
+# from utils import (
+#     generate_2d_sequence,
+#     plot_seq_full_label
+# )
 import os
 import matplotlib as mpl
 import seaborn as sns
 mpl.rcParams.update(mpl.rcParamsDefault)
+ds = load_dataset('Hack90/virus_tiny')
+df_virus = pd.DataFrame(ds['train'])
+def shannon_entropy(seq):
+    """Return a ``(c_h, shann)`` pair for a nucleotide sequence.
+
+    Characters other than uppercase A/T/C/G are dropped, then the sequence
+    is reduced to two classes: A is merged into T and G into C. With ``p``
+    the fraction of the merged T class:
+
+    * ``shann`` is the two-class Shannon entropy (natural log), halved.
+    * ``c_h`` is the un-halved entropy multiplied by ``log((1 - p) / p)``
+      and divided by the constant ``8.69 - 8.31``.
+
+    Args:
+        seq: Nucleotide sequence as a string.
+
+    Returns:
+        Tuple ``(c_h, shann)`` of floats. ``(nan, nan)`` when no A/T/C/G
+        characters remain; ``(0.0, 0.0)`` when only one merged class is
+        present, which is the limit of both expressions as ``p`` tends to
+        0 or 1.
+    """
+    # Imported locally because the visible import block has neither module.
+    import math
+    import re
+
+    seq = re.sub("[^ATCG]", "", seq)
+    if not seq:
+        # Nothing to measure: report NaN instead of dividing by zero.
+        return math.nan, math.nan
+
+    # Merging A into T makes the T-class fraction equal to (A + T) / length.
+    p = (seq.count("A") + seq.count("T")) / len(seq)
+    if p in (0.0, 1.0):
+        # math.log(0) raises ValueError; both quantities tend to 0 here.
+        return 0.0, 0.0
+
+    # NOTE(review): the origin of the 8.69 / 8.31 constants is not shown
+    # in this file - confirm against the intended complexity measure.
+    e = 8.69 - 8.31
+    entropy = -(p * math.log(p) + (1 - p) * math.log(1 - p))
+    c_h = entropy * math.log((1 - p) / p) / e
+    shann = entropy / 2
+    return c_h, shann
 ui.page_opts(fillable=True)
 with ui.navset_card_tab(id="tab"):
+    with ui.nav_panel("Species View"):
+        ui.panel_title("What is the distribution of complexity across viral species?")
+        with ui.card():
+            ui.input_slider("sample", "samples", 0, len(df_virus), 40)
+        def plot_loss_rates(df,samples enhancer=False):
+            for
+            complexity = []
+            for k in range(len(df.iloc[:df_virus])):
+              complexity.append(shannon_entropy(df['sequence'].iloc[k]))
+            df_nana = pd.DataFrame(complexity)
+            df_nana['x'] = df_nana[1] * 2
+            df_nana['y'] = df_nana[0]
             # fig, ax = plt.subplots()
             fig, ax = plt.subplots()
             # Create the scatter plot
+            scatter = ax.scatter(df_nana['x'], df_nana['y'], s=0.5)
             # Add a colorbar
             cbar = fig.colorbar(scatter, ax=ax)
         @render.plot()
         def plot_context_size_scaling():
+            fig = plot_loss_rates(df_virus,input.sample() )
             if fig:
                 return fig
+    # with ui.nav_panel("Histone Modification"):
+    #     ui.panel_title("Is there a pattern to histone modification?")
+    #     with ui.layout_columns():
+    #         with ui.card():
+    #             ui.input_slider("sample_histone", "sample", 0, df_histone_len, 40)
+    #     def plot_histone(df,sample):
+    #         y_values = generate_2d_sequence(df['seq'].iloc[sample])[0]
+    #         x_values = generate_2d_sequence(df['seq'].iloc[sample])[1]
+    #         integers = str((np.argwhere(df['labels'][sample] == np.amax(df['labels'][sample]))).flatten().tolist())
+    #         # Create a DataFrame with the x values, y values, and integers
+    #         data = {'x': x_values, 'y': y_values, 'color': integers}
+    #         fig, ax = plt.subplots()
+    #         sns.scatterplot(x='x', y='y', hue='color', data=data, palette='viridis', ax=ax)
+    #         ax.legend()
+    #         # ax.set_title(f"Loss ra")
+    #         # ax.set_xlabel("Training steps")
+    #         # ax.set_ylabel("Loss rate")
+    #         return fig
+    #     @render.plot()
+    #     def plot_histones_two():
+    #         fig = plot_histone(df_histone,input.sample_histone() )
+    #         if fig:
+    #             return fig
+    # with ui.nav_panel("Enhancer Annontations"):
+    #     ui.panel_title("Is there a pattern to enhancer annotations?")
+    #     with ui.layout_columns():
+    #         with ui.card():
+    #             ui.input_slider("sample_enhancer", "sample", 0, df_enhancer_annotation_len, 40)
+    #     @render.plot()
+    #     def plot_enhancer():
+    #         fig = plot_loss_rates(df_enhancer_annotation,input.sample_enhancer() , True)
+    #         if fig:
+    #             return fig