File size: 5,704 Bytes
57a3fc8
 
 
005bc4b
57a3fc8
 
 
 
833f2a3
6ff7501
5b5977a
833f2a3
 
 
 
5b5977a
833f2a3
 
 
 
 
 
 
57a3fc8
 
 
 
 
4131518
57a3fc8
 
 
 
 
 
 
bd427aa
 
57a3fc8
 
4131518
 
 
 
 
02ada7f
 
 
 
 
57a3fc8
 
bdfdb89
 
57a3fc8
f9b57d0
4131518
 
44662fb
57a3fc8
 
 
833f2a3
57a3fc8
 
 
 
 
 
833f2a3
57a3fc8
 
 
f9b57d0
 
833f2a3
 
31a39f0
f9b57d0
 
 
 
833f2a3
 
31a39f0
f9b57d0
 
 
57a3fc8
9752475
 
 
b8cafbc
9752475
 
3239333
ac9ba30
3239333
 
57a3fc8
44662fb
 
 
 
 
 
 
 
 
f1725de
 
833f2a3
 
 
1b46b26
31a39f0
f1725de
833f2a3
 
 
1b46b26
31a39f0
 
57a3fc8
84fc864
 
992d092
 
57a3fc8
f9b57d0
9752475
 
bf6bf02
f9b57d0
 
 
84fc864
57a3fc8
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
import datasets
import streamlit as st
import numpy as np
import pandas as pd
import altair as alt

# Configure the page with the 'wide' layout.
st.set_page_config(layout='wide')

# Read the gene_ids from the URL query args so that it is possible to link
# directly to this page for a given pair of genes; when an argument is
# absent, fall back to a default gene.
query_params = st.query_params
gene_id_1 = (
    query_params["gene_id_1"] if "gene_id_1" in query_params else "CNAG_04365")
gene_id_2 = (
    query_params["gene_id_2"] if "gene_id_2" in query_params else "CNAG_04222")

    


# Page header rendered as markdown: title, a short description of CryptoCEN,
# the citation with code/dataset links, and instructions for the inputs below.
st.markdown("""
# CryptoCEN Expression Scatter
**CryptoCEN** is a co-expression network for *Cryptococcus neoformans* built on 1,524 RNA-seq runs across 34 studies.
A pair of genes are said to be co-expressed when their expression is correlated across different conditions and
is often a marker for genes to be involved in similar processes. 

To Cite:
MJ O'Meara, JR Rapala, CB Nichols, C Alexandre, B Billmyre, JL Steenwyk, A Alspaugh,
TR O'Meara CryptoCEN: A Co-Expression Network for Cryptococcus neoformans reveals
novel proteins involved in DNA damage repair
* Code available at https://github.com/maomlab/CalCEN/tree/master/vignettes/CryptoCEN
* Full network and dataset: https://huggingface.co/datasets/maomlab/CryptoCEN

## Plot scatter plot expression for a pair of genes across studies.
Put in the ``CNAG_#####`` gene_id for two genes.
""")

@st.cache_data
def _load_cryptocen_table(name, data_file):
    """Load one file of the ``maomlab/CryptoCEN`` dataset as a DataFrame.

    Streamlit re-executes this script from the top on every widget
    interaction, so the result is cached to avoid re-loading the dataset and
    re-converting it to pandas on each rerun.

    Args:
        name: Split name under which ``data_file`` is registered and then
            retrieved from the loaded dataset.
        data_file: Path of the file inside the ``maomlab/CryptoCEN`` dataset.

    Returns:
        The contents of ``data_file`` as a pandas DataFrame.
    """
    dataset = datasets.load_dataset(
        path = "maomlab/CryptoCEN",
        data_files = {name: data_file})
    return dataset[name].to_pandas()


# Per-gene annotations (gene_id, cnag_id, gene_symbol, description are read
# further down in this script).
h99_transcript_annotations = _load_cryptocen_table(
    name = "h99_transcript_annotations",
    data_file = "h99_transcript_annotations.tsv")

# Per-run metadata; merged with the chart data on "run_accession" below.
estimated_expression_meta = _load_cryptocen_table(
    name = "estimated_expression_meta",
    data_file = "Data/estimated_expression_meta.tsv")

# Expression matrix; its columns are used as run accessions below.
# NOTE(review): rows are assumed to be in the same order as
# h99_transcript_annotations -- confirm against the dataset.
estimated_expression = _load_cryptocen_table(
    name = "estimated_expression",
    data_file = "estimated_expression_matrix.parquet")

#DEBUG
print(f"estimated_expression shape: {estimated_expression.shape}")

# Three equally sized columns hold the page controls; the fourth column only
# pads out the remaining width.
col1, col2, col3, padding = st.columns(spec = [0.2, 0.2, 0.2, 0.4])

# Text boxes for the two gene ids, pre-filled with the current values and
# overwritten with whatever the user enters.
gene_id_1 = col1.text_input(
    label = "Gene ID 1",
    value = str(gene_id_1),
    max_chars = 10,
    help = "CNAG Gene ID e.g. CNAG_04365")

gene_id_2 = col2.text_input(
    label = "Gene ID 2",
    value = str(gene_id_2),
    max_chars = 10,
    help = "CNAG Gene ID e.g. CNAG_04222")

def _lookup_gene_annotation(gene_id, label):
    """Look up the annotation of ``gene_id`` in ``h99_transcript_annotations``.

    Args:
        gene_id: Gene id entered by the user, e.g. ``CNAG_04365``.
        label: Name of the input field, used in the error message.

    Returns:
        A ``(cnag_id, gene_symbol, description)`` tuple taken from the first
        matching annotation row, or ``None`` (after showing an error on the
        page) when no row has this ``gene_id``.
    """
    matches = h99_transcript_annotations.loc[
        h99_transcript_annotations["gene_id"] == gene_id]
    if matches.empty:
        st.error(f"Unable to locate cnag_id for {label}: {gene_id}, it should be of the form 'CNAG_#####'")
        return None
    return (
        matches["cnag_id"].values[0],
        matches["gene_symbol"].values[0],
        matches["description"].values[0])


# check the user input
annotation_1 = _lookup_gene_annotation(gene_id_1, "Gene ID 1")
annotation_2 = _lookup_gene_annotation(gene_id_2, "Gene ID 2")
if annotation_1 is None or annotation_2 is None:
    # Both lookups run first so every invalid id is reported; then halt this
    # script run, because everything below needs both annotations and would
    # otherwise fail with an unrelated NameError/IndexError.
    st.stop()

cnag_id_1, gene_symbol_1, description_1 = annotation_1
cnag_id_2, gene_symbol_2, description_2 = annotation_2

# Pull the expression row of each gene out of the matrix once. The boolean
# mask is built from the annotation table and applied to the expression
# matrix.
# NOTE(review): this assumes both tables share the same row index/order --
# confirm against the dataset.
expression_1 = estimated_expression.loc[
    h99_transcript_annotations["gene_id"] == gene_id_1].to_numpy()[0]
expression_2 = estimated_expression.loc[
    h99_transcript_annotations["gene_id"] == gene_id_2].to_numpy()[0]

# One row per column of the expression matrix (used as run accessions), with
# raw and log10(x + 1) expression for both genes.
expression_by_run = pd.DataFrame({
    "gene_id_1": gene_id_1,
    "gene_id_2": gene_id_2,
    "expression_1": expression_1,
    "expression_2": expression_2,
    "log_expression_1": np.log10(expression_1 + 1),
    "log_expression_2": np.log10(expression_2 + 1),
    "run_accession": estimated_expression.columns})

# Attach the per-run metadata columns.
chart_data = expression_by_run.merge(
    right = estimated_expression_meta,
    on = "run_accession")

with col3:
    st.text('') # help alignment with input box
    # Offer the chart data for download. The payload is tab-separated, so it
    # is served with the TSV media type rather than "text/csv".
    st.download_button(
        label = "Download data as TSV",
        data = chart_data.to_csv(sep ='\t').encode('utf-8'),
        file_name = f"CryptoCEN_expression_{gene_id_1}_vs_{gene_id_2}.tsv",
        mime = "text/tab-separated-values")


# The "Gene Symbol" bullet is optional: it is left empty when the gene has no
# symbol (None).
if gene_symbol_1 is None:
    symbol_line_1 = ''
else:
    symbol_line_1 = '* *Gene Symbol*:' + gene_symbol_1

if gene_symbol_2 is None:
    symbol_line_2 = ''
else:
    symbol_line_2 = '* *Gene Symbol*:' + gene_symbol_2

# Summary of both genes with links to FungiDB and to the CryptoCEN-TopHits
# page for each gene.
st.markdown(f"""
#### Gene 1:
* *Gene ID*: [{gene_id_1}](https://fungidb.org/fungidb/app/record/gene/{gene_id_1})
{symbol_line_1}
* *Description*: {description_1}
* *Top [Co-Expressed Partners](https://huggingface.co/spaces/maomlab/CryptoCEN-TopHits?gene_id={gene_id_1})*

#### Gene 2:
* *Gene ID*: [{gene_id_2}](https://fungidb.org/fungidb/app/record/gene/{gene_id_2})
{symbol_line_2}
* *Description*: {description_2}
* *Top [Co-Expressed Partners](https://huggingface.co/spaces/maomlab/CryptoCEN-TopHits?gene_id={gene_id_2})*
""")

# Scatter plot of the log10(x + 1) expression of gene 1 against gene 2: one
# circle per row of chart_data, colored by study accession.
x_encoding = alt.X("log_expression_1", title=f"Log10[{gene_id_1}+1] Expression")
y_encoding = alt.Y("log_expression_2", title=f"Log10[{gene_id_2}+1] Expression")
color_encoding = alt.Color("study_accession", title="Study Accession")

base_chart = alt.Chart(chart_data, width = 750, height = 750)
chart = base_chart.mark_circle().encode(
    x = x_encoding,
    y = y_encoding,
    color = color_encoding,
    tooltip = ["run_accession", "study_accession"])

st.altair_chart(chart)