Spaces:
Sleeping
Sleeping
File size: 5,704 Bytes
57a3fc8 005bc4b 57a3fc8 833f2a3 6ff7501 5b5977a 833f2a3 5b5977a 833f2a3 57a3fc8 4131518 57a3fc8 bd427aa 57a3fc8 4131518 02ada7f 57a3fc8 bdfdb89 57a3fc8 f9b57d0 4131518 44662fb 57a3fc8 833f2a3 57a3fc8 833f2a3 57a3fc8 f9b57d0 833f2a3 31a39f0 f9b57d0 833f2a3 31a39f0 f9b57d0 57a3fc8 9752475 b8cafbc 9752475 3239333 ac9ba30 3239333 57a3fc8 44662fb f1725de 833f2a3 1b46b26 31a39f0 f1725de 833f2a3 1b46b26 31a39f0 57a3fc8 84fc864 992d092 57a3fc8 f9b57d0 9752475 bf6bf02 f9b57d0 84fc864 57a3fc8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 |
import datasets
import streamlit as st
import numpy as np
import pandas as pd
import altair as alt
# Configure the page to use the wide layout before any other element is drawn.
st.set_page_config(layout='wide')

# Parse the gene ids out of the URL query args so it's possible to link to
# this page for a specific gene pair. When an argument is absent, fall back
# to the built-in example pair.
query_params = st.query_params
gene_id_1 = query_params.get("gene_id_1", "CNAG_04365")
gene_id_2 = query_params.get("gene_id_2", "CNAG_04222")
# Page header rendered as markdown: title, a short description of CryptoCEN,
# the citation, links to the code and dataset, and a usage hint for the inputs.
INTRO_MARKDOWN = """
# CryptoCEN Expression Scatter
**CryptoCEN** is a co-expression network for *Cryptococcus neoformans* built on 1,524 RNA-seq runs across 34 studies.
A pair of genes are said to be co-expressed when their expression is correlated across different conditions and
is often a marker for genes to be involved in similar processes.
To Cite:
MJ O'Meara, JR Rapala, CB Nichols, C Alexandre, B Billmyre, JL Steenwyk, A Alspaugh,
TR O'Meara CryptoCEN: A Co-Expression Network for Cryptococcus neoformans reveals
novel proteins involved in DNA damage repair
* Code available at https://github.com/maomlab/CalCEN/tree/master/vignettes/CryptoCEN
* Full network and dataset: https://huggingface.co/datasets/maomlab/CryptoCEN
## Plot scatter plot expression for a pair of genes across studies.
Put in the ``CNAG_#####`` gene_id for two genes.
"""
st.markdown(INTRO_MARKDOWN)
@st.cache_data
def _load_cryptocen_table(split_name, data_file):
    """Load one file of the maomlab/CryptoCEN dataset as a pandas DataFrame.

    Streamlit re-executes this script on every widget interaction; caching
    the result keeps the tables from being reloaded and reconverted on each
    rerun.

    Args:
        split_name: Key under which ``data_file`` is registered in
            ``data_files`` and then read back from the loaded dataset.
        data_file: Path of the file inside the dataset repository.

    Returns:
        The contents of ``data_file`` converted with ``to_pandas()``.
    """
    dataset = datasets.load_dataset(
        path="maomlab/CryptoCEN",
        data_files={split_name: data_file})
    return dataset[split_name].to_pandas()


# Per-gene annotations; later code reads the gene_id, cnag_id, gene_symbol
# and description columns.
h99_transcript_annotations = _load_cryptocen_table(
    "h99_transcript_annotations",
    "h99_transcript_annotations.tsv")

# Per-run metadata; later merged onto the chart data by run_accession.
estimated_expression_meta = _load_cryptocen_table(
    "estimated_expression_meta",
    "Data/estimated_expression_meta.tsv")

# Expression matrix; its column labels are used as run accessions below.
estimated_expression = _load_cryptocen_table(
    "estimated_expression",
    "estimated_expression_matrix.parquet")

#DEBUG
print(f"estimated_expression shape: {estimated_expression.shape}")
# Lay out four columns with relative widths 0.2 / 0.2 / 0.2 / 0.4: one per
# gene input, one used further down for the download button, and a trailing
# column that this section leaves empty.
col1, col2, col3, padding = st.columns(spec=[0.2, 0.2, 0.2, 0.4])

# Each text box is pre-filled with the current gene id (URL argument or
# default) and its return value replaces that id with whatever the user typed.
gene_id_1 = col1.text_input(
    "Gene ID 1",
    value=f"{gene_id_1}",
    max_chars=10,
    help="CNAG Gene ID e.g. CNAG_04365")

gene_id_2 = col2.text_input(
    "Gene ID 2",
    value=f"{gene_id_2}",
    max_chars=10,
    help="CNAG Gene ID e.g. CNAG_04222")
def _find_gene_annotation(annotations, gene_id):
    """Look up the annotation fields for one gene.

    Args:
        annotations: DataFrame with ``gene_id``, ``cnag_id``, ``gene_symbol``
            and ``description`` columns.
        gene_id: Gene identifier to match against the ``gene_id`` column.

    Returns:
        ``(cnag_id, gene_symbol, description)`` taken from the first matching
        row, or ``None`` when no row matches.

    Raises:
        KeyError: If one of the expected columns is missing. This is a data
            problem rather than a user-input problem, so it is not reported
            as an unknown gene id.
    """
    matches = annotations.loc[annotations["gene_id"] == gene_id]
    if matches.empty:
        return None
    return tuple(
        matches[column].values[0]
        for column in ("cnag_id", "gene_symbol", "description"))


# check the user input
annotation_1 = _find_gene_annotation(h99_transcript_annotations, gene_id_1)
annotation_2 = _find_gene_annotation(h99_transcript_annotations, gene_id_2)

# Report every unknown gene id before halting, so the user can fix both
# inputs in a single pass.
if annotation_1 is None:
    st.error(f"Unable to locate cnag_id for Gene ID 1: {gene_id_1}, it should be of the form 'CNAG_#####'")
if annotation_2 is None:
    st.error(f"Unable to locate cnag_id for Gene ID 2: {gene_id_2}, it should be of the form 'CNAG_#####'")
if annotation_1 is None or annotation_2 is None:
    # Everything below needs both genes; stop this run here instead of
    # failing later on undefined names or an empty row selection.
    st.stop()

cnag_id_1, gene_symbol_1, description_1 = annotation_1
cnag_id_2, gene_symbol_2, description_2 = annotation_2
# Select each gene's row of the expression matrix using a boolean mask built
# from the annotation table.
# NOTE(review): this assumes estimated_expression has one row per row of
# h99_transcript_annotations, in the same order and with a matching index -
# confirm against the dataset.
is_gene_1 = h99_transcript_annotations["gene_id"] == gene_id_1
is_gene_2 = h99_transcript_annotations["gene_id"] == gene_id_2

# First matching row as a 1-D array, with one value per matrix column.
expression_1 = estimated_expression.loc[is_gene_1].to_numpy()[0]
expression_2 = estimated_expression.loc[is_gene_2].to_numpy()[0]

# One record per expression-matrix column: the two gene ids (repeated on
# every row), raw expression, log10(expression + 1), and the column label
# as the run accession.
chart_data = pd.DataFrame({
    "gene_id_1": gene_id_1,
    "gene_id_2": gene_id_2,
    "expression_1": expression_1,
    "expression_2": expression_2,
    "log_expression_1": np.log10(expression_1 + 1),
    "log_expression_2": np.log10(expression_2 + 1),
    "run_accession": estimated_expression.columns})

# Attach the per-run metadata; the plot reads study_accession from it.
chart_data = chart_data.merge(
    right=estimated_expression_meta,
    on="run_accession")
# Offer the merged per-run table as a tab-separated download in the third
# column.
with col3:
    st.text('')  # help alignment with input box
    st.download_button(
        label="Download data as TSV",
        data=chart_data.to_csv(sep='\t').encode('utf-8'),
        file_name=f"CryptoCEN_expression_{gene_id_1}_vs_{gene_id_2}.tsv",
        # The payload is tab-separated, so declare the TSV media type
        # rather than text/csv.
        mime="text/tab-separated-values")
def _gene_symbol_bullet(gene_symbol):
    """Return the markdown bullet for a gene symbol.

    Args:
        gene_symbol: The annotated symbol, or a missing value (``None``/NaN).

    Returns:
        ``'* *Gene Symbol*: <symbol>'``, or an empty string when the symbol
        is missing so that no bullet is rendered.
    """
    if pd.isna(gene_symbol):
        return ''
    return f'* *Gene Symbol*: {gene_symbol}'


# Summary of both genes: a FungiDB link, the optional symbol, the description
# and a link to the CryptoCEN-TopHits space for that gene.
st.markdown(f"""
#### Gene 1:
* *Gene ID*: [{gene_id_1}](https://fungidb.org/fungidb/app/record/gene/{gene_id_1})
{_gene_symbol_bullet(gene_symbol_1)}
* *Description*: {description_1}
* *Top [Co-Expressed Partners](https://huggingface.co/spaces/maomlab/CryptoCEN-TopHits?gene_id={gene_id_1})*
#### Gene 2:
* *Gene ID*: [{gene_id_2}](https://fungidb.org/fungidb/app/record/gene/{gene_id_2})
{_gene_symbol_bullet(gene_symbol_2)}
* *Description*: {description_2}
* *Top [Co-Expressed Partners](https://huggingface.co/spaces/maomlab/CryptoCEN-TopHits?gene_id={gene_id_2})*
""")
# Scatter plot of the two genes' log10(expression + 1) values, one circle per
# row of chart_data, coloured by study; the tooltip shows the run and study
# accessions.
x_axis = alt.X("log_expression_1", title=f"Log10[{gene_id_1}+1] Expression")
y_axis = alt.Y("log_expression_2", title=f"Log10[{gene_id_2}+1] Expression")
study_color = alt.Color("study_accession", title="Study Accession")

chart = alt.Chart(chart_data, width=750, height=750).mark_circle().encode(
    x=x_axis,
    y=y_axis,
    color=study_color,
    tooltip=["run_accession", "study_accession"])

st.altair_chart(chart)
|