Spaces:

maomlab
/

CryptoCEN-ExpressionScatter

Running

File size: 2,632 Bytes

57a3fc8
 
 
005bc4b
57a3fc8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bd427aa
 
57a3fc8
 
02ada7f
 
 
 
 
57a3fc8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bd427aa
 
57a3fc8
 
bd427aa
57a3fc8
c135f96
57a3fc8

import datasets
import streamlit as st
import numpy as np
import pandas as pd
import altair as alt

# Use the full browser width so the scatter plot has room to render.
st.set_page_config(layout="wide")

# Introductory text shown at the top of the page: what CryptoCEN is, how to
# cite it, where the code and data live, and how to use the gene inputs below.
INTRO_MARKDOWN = """
# CryptoCEN Expression Scatter
**CryptoCEN** is a co-expression network for *Cryptococcus neoformans* built on 1,524 RNA-seq runs across 34 studies.
A pair of genes are said to be co-expressed when their expression is correlated across different conditions and
is often a marker for genes to be involved in similar processes. 
To Cite:
MJ O'Meara, JR Rapala, CB Nichols, C Alexandre, B Billmyre, JL Steenwyk, A Alspaugh,
TR O'Meara CryptoCEN: A Co-Expression Network for Cryptococcus neoformans reveals
novel proteins involved in DNA damage repair
* Code available at https://github.com/maomlab/CalCEN/tree/master/vignettes/CryptoCEN
* Full network and dataset: https://huggingface.co/datasets/maomlab/CryptoCEN

## Plot scatter plot expression for a pair of genes across studies.
Put in the ``CNAG_#####`` gene_id for two genes.
"""

st.markdown(INTRO_MARKDOWN)

# Load the two CryptoCEN tables from the Hugging Face dataset repository and
# convert each to a pandas DataFrame. Each `data_files` mapping names a single
# split after the table it holds, and that split is pulled out right away.

# Per-run metadata; the chart reads its `run_accession` and `study_accession`
# columns.
estimated_expression_meta = datasets.load_dataset(
    path="maomlab/CryptoCEN",
    data_files={"estimated_expression_meta": "Data/estimated_expression_meta.tsv"},
)["estimated_expression_meta"].to_pandas()

# Expression values; the chart selects rows by gene id and treats the columns
# as runs.
# NOTE(review): unlike the metadata file above, this path has no "Data/"
# prefix -- confirm the file really lives at the repository root.
estimated_expression = datasets.load_dataset(
    path="maomlab/CryptoCEN",
    data_files={"estimated_expression": "estimated_expression.tsv"},
)["estimated_expression"].to_pandas()

# Three columns with relative widths 0.3 / 0.2 / 0.5. The first two hold the
# gene id inputs; the third gets no widget here.
col1, col2, col3 = st.columns(spec=[0.3, 0.2, 0.5])

# Calling the element method on a column object places the widget in that
# column. Inputs are capped at 10 characters, the length of the default ids.
gene_id_1 = col1.text_input(
    label="Gene ID 1",
    value="CNAG_04365",
    max_chars=10,
    help="CNAG Gene ID e.g. CNAG_04365",
)

gene_id_2 = col2.text_input(
    label="Gene ID 2",
    value="CNAG_04222",
    max_chars=10,
    help="CNAG Gene ID e.g. CNAG_04222",
)

def _split_gene_ids(expression, id_column="gene_id"):
    """Separate the gene identifiers of an expression table from its run values.

    Args:
        expression: DataFrame with one row per gene and one column per run.
        id_column: Name of the column holding the gene ids when they are
            stored as a column rather than as the row index.
            NOTE(review): "gene_id" is an assumption about the TSV header --
            confirm against the dataset.

    Returns:
        A ``(gene_ids, run_values)`` tuple. ``run_values`` contains only the
        per-run columns; ``gene_ids`` is row-aligned with it.
    """
    if id_column in expression.columns:
        return expression[id_column], expression.drop(columns=id_column)
    # Fall back to the row index, which is what the lookup used originally.
    return expression.index, expression


def _log10_expression(gene_ids, run_values, gene_id):
    """Return log10(expression + 1) of one gene across all runs.

    Args:
        gene_ids: Gene identifiers, row-aligned with ``run_values``.
        run_values: DataFrame of expression values, one column per run.
        gene_id: Identifier of the gene to extract.

    Returns:
        A 1-D float array with one value per column of ``run_values``, or
        ``None`` when ``gene_id`` does not occur in ``gene_ids``.
    """
    # Rows must be selected with .loc and a boolean mask; `frame[mask,]` is
    # R syntax and pandas would treat the tuple as a column key.
    matches = run_values.loc[np.asarray(gene_ids == gene_id)]
    if matches.empty:
        return None
    # The selection is a (rows x runs) frame; flatten the first matching row
    # so it can serve as a column with one entry per run.
    return np.log10(matches.iloc[0].to_numpy(dtype=float) + 1)


gene_ids, run_values = _split_gene_ids(estimated_expression)
expression_by_gene = {
    gene_id: _log10_expression(gene_ids, run_values, gene_id)
    for gene_id in (gene_id_1, gene_id_2)
}

# Report unknown ids to the user instead of failing with a traceback.
unknown_gene_ids = [
    gene_id for gene_id, values in expression_by_gene.items() if values is None
]
if unknown_gene_ids:
    st.error(f"Gene ID not found in the expression data: {', '.join(unknown_gene_ids)}")
    st.stop()

# One row per run. The metadata columns are paired with the expression
# columns by position, so both tables are expected to list the runs in the
# same order; the check printed below reports how many positions agree.
chart_data = pd.DataFrame({
    "expression_1": expression_by_gene[gene_id_1],
    "expression_2": expression_by_gene[gene_id_2],
    "run_accession": run_values.columns,
    "run_accession_meta": estimated_expression_meta["run_accession"],
    "study_accession": estimated_expression_meta["study_accession"]})

print(f"run_ids are equal: {sum(chart_data['run_accession'] == chart_data['run_accession_meta'])}")

chart = (
    alt.Chart(chart_data)
    .mark_circle()
    .encode(
        x="expression_1",
        y="expression_2",
        # A constant size must be wrapped in alt.value(); a bare number is
        # not a valid field or channel definition.
        size=alt.value(5),
        color="study_accession",
        tooltip=["run_accession", "study_accession"],
    )
)

st.altair_chart(chart, use_container_width=True)