File size: 2,632 Bytes
57a3fc8
 
 
005bc4b
57a3fc8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bd427aa
 
57a3fc8
 
02ada7f
 
 
 
 
57a3fc8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bd427aa
 
57a3fc8
 
bd427aa
57a3fc8
c135f96
57a3fc8
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import datasets
import streamlit as st
import numpy as np
import pandas as pd
import altair as alt

# Introductory text shown at the top of the page: what CryptoCEN is, how to
# cite it, where the code and data live, and how to use the scatter plot.
PAGE_INTRO_MARKDOWN = """
# CryptoCEN Expression Scatter
**CryptoCEN** is a co-expression network for *Cryptococcus neoformans* built on 1,524 RNA-seq runs across 34 studies.
A pair of genes are said to be co-expressed when their expression is correlated across different conditions and
is often a marker for genes to be involved in similar processes. 
To Cite:
MJ O'Meara, JR Rapala, CB Nichols, C Alexandre, B Billmyre, JL Steenwyk, A Alspaugh,
TR O'Meara CryptoCEN: A Co-Expression Network for Cryptococcus neoformans reveals
novel proteins involved in DNA damage repair
* Code available at https://github.com/maomlab/CalCEN/tree/master/vignettes/CryptoCEN
* Full network and dataset: https://huggingface.co/datasets/maomlab/CryptoCEN

## Plot scatter plot expression for a pair of genes across studies.
Put in the ``CNAG_#####`` gene_id for two genes.
"""

# Page configuration is issued before any other Streamlit call in this script;
# the "wide" layout lets the chart use the full browser width.
st.set_page_config(layout="wide")
st.markdown(PAGE_INTRO_MARKDOWN)

def _load_cryptocen_table(table_name, data_file):
    """Load one file of the maomlab/CryptoCEN dataset as a pandas DataFrame.

    Args:
        table_name: Key used both to register ``data_file`` with
            ``datasets.load_dataset`` and to look the loaded table up again.
        data_file: Path of the file inside the dataset repository.

    Returns:
        The table converted with ``Dataset.to_pandas()``.
    """
    loaded = datasets.load_dataset(
        path="maomlab/CryptoCEN",
        data_files={table_name: data_file})
    return loaded[table_name].to_pandas()


# NOTE(review): the metadata file is read from "Data/..." while the expression
# matrix is read from the repository root -- confirm both paths are intended.
estimated_expression_meta = _load_cryptocen_table(
    "estimated_expression_meta", "Data/estimated_expression_meta.tsv")
estimated_expression = _load_cryptocen_table(
    "estimated_expression", "estimated_expression.tsv")

def _gene_id_input(column, label, default_gene_id):
    """Render a gene-ID text box inside ``column`` and return its current value.

    Args:
        column: Streamlit column container that should hold the widget.
        label: Label displayed above the text box.
        default_gene_id: Initial value; also quoted as the example in the
            help tooltip.

    Returns:
        The string currently entered in the text box.
    """
    return column.text_input(
        label=label,
        value=default_gene_id,
        # Both default IDs ("CNAG_" plus five digits) are exactly 10 characters.
        max_chars=10,
        help=f"CNAG Gene ID e.g. {default_gene_id}")


# Three columns with relative widths 0.3 / 0.2 / 0.5; only the first two hold
# widgets, the third is left empty.
col1, col2, col3 = st.columns(spec=[0.3, 0.2, 0.5])
gene_id_1 = _gene_id_input(col1, "Gene ID 1", "CNAG_04365")
gene_id_2 = _gene_id_input(col2, "Gene ID 2", "CNAG_04222")

def _index_by_gene(expression):
    """Return ``expression`` with gene IDs as the row index and only run columns left.

    ``Dataset.to_pandas()`` yields a default integer index, so the gene
    identifier arrives as an ordinary column rather than as the index.
    NOTE(review): assumes that column is named "gene_id" -- confirm against the
    TSV header. If no such column exists the frame is returned unchanged and
    its existing index is used for the lookup.
    """
    if "gene_id" in expression.columns:
        return expression.set_index("gene_id")
    return expression


def _log10_expression(expression_by_gene, gene_id):
    """Return log10(expression + 1) for ``gene_id`` across all runs.

    Args:
        expression_by_gene: Expression table indexed by gene ID, one column
            per run.
        gene_id: Gene identifier to look up.

    Returns:
        A 1-D float array with one value per run column, or ``None`` when the
        gene is not present in the table.
    """
    # Row selection must go through .loc: ``frame[mask,]`` is R syntax and
    # pandas treats the resulting tuple as a (non-existent) column key.
    matches = expression_by_gene.loc[expression_by_gene.index == gene_id]
    if matches.empty:
        return None
    # Flatten the first matching row to a plain vector so it can serve as a
    # single chart_data column; the +1 keeps zero counts finite under log10.
    return np.log10(matches.iloc[0].to_numpy(dtype=float) + 1)


expression_by_gene = _index_by_gene(estimated_expression)

log_expression = {}
for column_name, gene_id in (("expression_1", gene_id_1), ("expression_2", gene_id_2)):
    values = _log10_expression(expression_by_gene, gene_id)
    if values is None:
        # Unknown gene ID: show a message and end this script run instead of
        # failing with a traceback further down.
        st.error(f"Gene ID {gene_id!r} was not found in the expression table.")
        st.stop()
    log_expression[column_name] = values

# One row per run. The metadata columns are combined positionally, which
# presumes the metadata rows are in the same order as the expression run
# columns; the console check below counts how many run IDs actually agree.
chart_data = pd.DataFrame({
    "expression_1": log_expression["expression_1"],
    "expression_2": log_expression["expression_2"],
    "run_accession": expression_by_gene.columns.to_numpy(),
    "run_accession_meta": estimated_expression_meta["run_accession"].to_numpy(),
    "study_accession": estimated_expression_meta["study_accession"].to_numpy()})

print(f"run_ids are equal: {sum(chart_data['run_accession'] == chart_data['run_accession_meta'])}")

# Scatter plot with one circle per run: x/y are the log-scaled expression of
# the two selected genes and colour identifies the study the run belongs to.
chart = (
    alt.Chart(chart_data)
    .mark_circle()
    .encode(
        x="expression_1",
        y="expression_2",
        # A constant channel value has to be wrapped in alt.value(); a bare
        # number is neither a field shorthand nor a channel object and is
        # rejected by encode().
        size=alt.value(5),
        color="study_accession",
        tooltip=["run_accession", "study_accession"]))

st.altair_chart(chart, use_container_width=True)