Spaces:
Sleeping
Sleeping
import datasets | |
import streamlit as st | |
import numpy as np | |
import pandas as pd | |
import altair as alt | |
# Page-wide layout setting; issued before any other Streamlit call in this script.
st.set_page_config(layout="wide")

# Introductory text shown at the top of the page: what CryptoCEN is, how to
# cite it, where the code and data live, and how to use the inputs below.
_INTRO_MARKDOWN = """
# CryptoCEN Expression Scatter
**CryptoCEN** is a co-expression network for *Cryptococcus neoformans* built on 1,524 RNA-seq runs across 34 studies.
A pair of genes are said to be co-expressed when their expression is correlated across different conditions and
is often a marker for genes to be involved in similar processes.
To Cite:
MJ O'Meara, JR Rapala, CB Nichols, C Alexandre, B Billmyre, JL Steenwyk, A Alspaugh,
TR O'Meara CryptoCEN: A Co-Expression Network for Cryptococcus neoformans reveals
novel proteins involved in DNA damage repair
* Code available at https://github.com/maomlab/CalCEN/tree/master/vignettes/CryptoCEN
* Full network and dataset: https://huggingface.co/datasets/maomlab/CryptoCEN
## Plot scatter plot expression for a pair of genes across studies.
Put in the ``CNAG_#####`` gene_id for two genes.
"""
st.markdown(_INTRO_MARKDOWN)
def _load_table(name, data_file):
    """Load one file of the ``maomlab/CryptoCEN`` dataset as a pandas DataFrame.

    ``name`` is the key the file is registered under in ``data_files`` (and the
    key used to pick it back out of the loaded dataset); ``data_file`` is the
    file path handed to ``datasets.load_dataset``.
    """
    loaded = datasets.load_dataset(
        path="maomlab/CryptoCEN",
        data_files={name: data_file},
    )
    return loaded[name].to_pandas()


# Per-transcript annotations for the H99 reference strain.
h99_transcript_annotations = _load_table(
    "h99_transcript_annotations",
    "h99_transcript_annotations.tsv",
)

# Per-run metadata (read later for run and study accessions).
# NOTE(review): this is the only file addressed under "Data/"; the other two
# are loaded from the repository root -- confirm the path is intentional.
estimated_expression_meta = _load_table(
    "estimated_expression_meta",
    "Data/estimated_expression_meta.tsv",
)

# Expression table; later code selects rows per gene and reads the column
# labels as run accessions.
estimated_expression = _load_table(
    "estimated_expression",
    "estimated_expression.tsv",
)

# Diagnostics printed to stdout: the table shape, and the number of positions
# where the annotation 'cnag_id' value equals the expression table's index.
print(f"estimated_expression shape: {estimated_expression.shape}")
print(f"transcript_annotations are equal: {sum(h99_transcript_annotations['cnag_id'] == estimated_expression.index)}")
# Three columns with relative widths 0.2 / 0.2 / 0.6. The first two hold the
# gene ID inputs; nothing is placed in the third within this section.
col1, col2, col3 = st.columns(spec=[0.2, 0.2, 0.6])

# Each text box is created directly on its column object and is pre-filled
# with an example CNAG identifier; input is capped at 10 characters.
gene_id_1 = col1.text_input(
    label="Gene ID 1",
    value="CNAG_04365",
    max_chars=10,
    help="CNAG Gene ID e.g. CNAG_04365",
)
gene_id_2 = col2.text_input(
    label="Gene ID 2",
    value="CNAG_04222",
    max_chars=10,
    help="CNAG Gene ID e.g. CNAG_04222",
)
def _log10_expression(gene_id):
    """Return ``log10(expression + 1)`` for ``gene_id`` across all runs.

    Rows of ``estimated_expression`` are selected with a boolean mask built
    from the ``gene_id`` column of ``h99_transcript_annotations``, and the
    first matching row is used.

    If no row matches, an error message is shown on the page and the script
    run is halted with ``st.stop()`` rather than failing with an
    ``IndexError`` on the empty selection.
    """
    # NOTE(review): elsewhere in this script the annotations are compared via
    # the 'cnag_id' column -- confirm 'gene_id' is the column that holds the
    # CNAG_##### identifiers typed by the user.
    matching_rows = estimated_expression.loc[
        h99_transcript_annotations["gene_id"] == gene_id
    ]
    if matching_rows.empty:
        st.error(f"Gene ID '{gene_id}' was not found in the H99 transcript annotations.")
        st.stop()
    # The +1 pseudo-count keeps zero expression finite under log10.
    return np.log10(matching_rows.to_numpy()[0] + 1)


# One row per run: the two genes' log-scaled expression plus run/study labels.
# 'run_accession' comes from the expression table's column labels and
# 'run_accession_meta' from the metadata table, so the two can be compared.
chart_data = pd.DataFrame({
    "expression_1": _log10_expression(gene_id_1),
    "expression_2": _log10_expression(gene_id_2),
    "run_accession": estimated_expression.columns,
    "run_accession_meta": estimated_expression_meta["run_accession"],
    "study_accession": estimated_expression_meta["study_accession"],
})

# Diagnostic printed to stdout: number of rows where the two run labels agree.
print(f"run_ids are equal: {sum(chart_data['run_accession'] == chart_data['run_accession_meta'])}")

chart = (
    alt.Chart(chart_data)
    .mark_circle()
    .encode(
        x="expression_1",
        y="expression_2",
        # A constant channel value must be wrapped in alt.value(); a bare
        # number is neither a field shorthand nor a channel definition.
        size=alt.value(5),
        color="study_accession",
        tooltip=["run_accession", "study_accession"],
    )
)
st.altair_chart(chart, use_container_width=True)