maom's picture
Update app.py
992d092 verified
raw
history blame
3.45 kB
import datasets
import streamlit as st
import numpy as np
import pandas as pd
import altair as alt
# Page-level configuration; issued before any other Streamlit element is drawn.
st.set_page_config(layout="wide")

# Introductory text rendered at the top of the page: what CryptoCEN is,
# how to cite it, where the code and data live, and how to use this app.
_INTRO_MARKDOWN = """
# CryptoCEN Expression Scatter
**CryptoCEN** is a co-expression network for *Cryptococcus neoformans* built on 1,524 RNA-seq runs across 34 studies.
A pair of genes are said to be co-expressed when their expression is correlated across different conditions and
is often a marker for genes to be involved in similar processes.
To Cite:
MJ O'Meara, JR Rapala, CB Nichols, C Alexandre, B Billmyre, JL Steenwyk, A Alspaugh,
TR O'Meara CryptoCEN: A Co-Expression Network for Cryptococcus neoformans reveals
novel proteins involved in DNA damage repair
* Code available at https://github.com/maomlab/CalCEN/tree/master/vignettes/CryptoCEN
* Full network and dataset: https://huggingface.co/datasets/maomlab/CryptoCEN
## Plot scatter plot expression for a pair of genes across studies.
Put in the ``CNAG_#####`` gene_id for two genes.
"""
st.markdown(_INTRO_MARKDOWN)
@st.cache_data
def _load_cryptocen_table(name, data_file):
    """Load one file of the maomlab/CryptoCEN dataset as a pandas DataFrame.

    Streamlit re-executes this script on every widget interaction, so the
    result is cached with ``st.cache_data`` to avoid reloading and
    re-converting the table on each rerun.

    Args:
        name: Split name under which the file is registered and retrieved.
        data_file: Path of the file inside the dataset repository.

    Returns:
        The table as a ``pandas.DataFrame``.
    """
    dataset = datasets.load_dataset(
        path="maomlab/CryptoCEN",
        data_files={name: data_file})
    return dataset[name].to_pandas()


# Gene annotations; the "gene_id" column is matched against the user's input.
h99_transcript_annotations = _load_cryptocen_table(
    "h99_transcript_annotations",
    "h99_transcript_annotations.tsv")

# Per-run metadata; joined to the plotted points on "run_accession".
estimated_expression_meta = _load_cryptocen_table(
    "estimated_expression_meta",
    "Data/estimated_expression_meta.tsv")

# Expression matrix; its columns are used as run accessions downstream.
estimated_expression = _load_cryptocen_table(
    "estimated_expression",
    "estimated_expression_matrix.parquet")

#DEBUG
print(f"estimated_expression shape: {estimated_expression.shape}")
# Three columns: one per gene ID input, plus a wide third column that is
# never written to in this section, so the inputs stay narrow.
col1, col2, padding = st.columns(spec=[0.2, 0.2, 0.6])

# Calling text_input on the column object places the widget in that column.
gene_id_1 = col1.text_input(
    label="Gene ID 1",
    value="CNAG_04365",
    max_chars=10,
    help="CNAG Gene ID e.g. CNAG_04365",
)
gene_id_2 = col2.text_input(
    label="Gene ID 2",
    value="CNAG_04222",
    max_chars=10,
    help="CNAG Gene ID e.g. CNAG_04222",
)
def _find_gene_rows(label, gene_id):
    """Return the annotation-row mask for ``gene_id``, or ``None`` if unknown.

    A boolean-mask lookup does not raise when nothing matches (it yields an
    empty selection), so the match is tested explicitly with ``.any()``.
    When no annotation row has this gene_id an error box is shown to the
    user and ``None`` is returned.

    Args:
        label: Human-readable name of the input field, used in the message.
        gene_id: Gene identifier entered by the user.

    Returns:
        Boolean Series over ``h99_transcript_annotations`` rows, or ``None``.
    """
    is_gene = h99_transcript_annotations["gene_id"] == gene_id
    if not is_gene.any():
        st.error(
            f"Unable to locate {label}: {gene_id}, "
            "it should be of the form 'CNAG_#####'")
        return None
    return is_gene


# Check both inputs before stopping so that every invalid ID is reported.
gene_rows_1 = _find_gene_rows("Gene ID 1", gene_id_1)
gene_rows_2 = _find_gene_rows("Gene ID 2", gene_id_2)
if gene_rows_1 is None or gene_rows_2 is None:
    # Halt this script run; otherwise the row lookup below would fail with
    # an IndexError on the empty selection.
    st.stop()

# NOTE(review): the annotation mask is applied directly to the expression
# matrix, which assumes both tables share the same row index/order -- confirm.
# Only the first matching row is used; values are plotted as log10(x + 1).
chart_data = pd.DataFrame({
    "expression_1": np.log10(estimated_expression.loc[gene_rows_1].to_numpy()[0] + 1),
    "expression_2": np.log10(estimated_expression.loc[gene_rows_2].to_numpy()[0] + 1),
    "run_accession": estimated_expression.columns})

# Attach the run metadata columns to each point by run accession.
chart_data = chart_data.merge(
    right=estimated_expression_meta,
    on="run_accession")
# Scatter plot with one circle per row of chart_data: gene 1 on x, gene 2 on
# y, colored by study, with run and study accessions in the hover tooltip.
scatter_marks = alt.Chart(chart_data, width=750, height=750).mark_circle()
chart = scatter_marks.encode(
    x="expression_1",
    y="expression_2",
    color="study_accession",
    tooltip=["run_accession", "study_accession"],
)

st.altair_chart(chart)