matq007 committed on
Commit
0e1164c
·
unverified ·
1 Parent(s): fc741fb

feat: move from streamlit to hugging spaces

Browse files
.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ .ruff_cache
2
+ .venv
3
+ __pycache__
README.md CHANGED
@@ -1,15 +1,16 @@
1
  ---
2
  title: Preimplantation portal
 
3
  sdk: streamlit
4
  sdk_version: 1.38.0
5
  app_file: Home.py
6
  ---
7
 
8
- # preimplantation portal
9
-
10
  ## Run locally
11
 
12
  ```bash
13
- source venv/bin/activate
 
 
14
  streamlit run Home.py
15
  ```
 
1
  ---
2
  title: Preimplantation portal
3
+ short_description: Preimplantation mouse and human development
4
  sdk: streamlit
5
  sdk_version: 1.38.0
6
  app_file: Home.py
7
  ---
8
 
 
 
9
  ## Run locally
10
 
11
  ```bash
12
+ source .venv/bin/activate
13
+ # install dependencies
14
+ uv pip install -r requirements.txt
15
  streamlit run Home.py
16
  ```
constants.py CHANGED
@@ -1,26 +1,6 @@
1
- VERSION = 1.5
 
 
2
 
3
  DEFAULT_DR = "X_draw_graph_fa"
4
  DEFAULT_META = "stage"
5
- ZENODO_URL = "https://zenodo.org/records/13749348/files"
6
-
7
- DATA = {
8
- "HUMAN": {
9
- "RAW_DATASET": f"{ZENODO_URL}/32_human_adata.h5ad",
10
- "DATASET": f"{ZENODO_URL}/portal_human_v{VERSION}.h5ad",
11
- "DEGS": {
12
- "CT": f"{ZENODO_URL}/human_degs_ct_v{VERSION}.feather",
13
- "STAGE": f"{ZENODO_URL}/human_degs_stage_v{VERSION}.feather"
14
- },
15
- "SHAP": f"{ZENODO_URL}/human_SHAP_v{VERSION}.feather",
16
- },
17
- "MOUSE": {
18
- "RAW_DATASET": f"{ZENODO_URL}/01_mouse_reprocessed.h5ad",
19
- "DATASET": f"{ZENODO_URL}/portal_mouse_v{VERSION}.h5ad",
20
- "DEGS": {
21
- "CT": f"{ZENODO_URL}/mouse_degs_ct_v{VERSION}.feather",
22
- "STAGE": f"{ZENODO_URL}/mouse_degs_stage_v{VERSION}.feather",
23
- },
24
- "SHAP": f"{ZENODO_URL}/mouse_SHAP_v{VERSION}.feather",
25
- },
26
- }
 
1
+ # Constants
2
+
3
+ MODELS = {"HUMAN": ["v1.0.1", "main"], "MOUSE": ["v1.0.1", "main"]}
4
 
5
  DEFAULT_DR = "X_draw_graph_fa"
6
  DEFAULT_META = "stage"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
pages/1_Gene_Expression.py CHANGED
@@ -1,9 +1,8 @@
1
  #!/usr/bin/env python
2
- import anndata
3
  import streamlit as st
4
 
5
- from constants import DATA, DEFAULT_DR, DEFAULT_META
6
- from utils import fetch_resource, plot_feature, plot_sc_embedding
7
 
8
  st.set_page_config(layout="wide")
9
  st.markdown("""
@@ -12,16 +11,10 @@ st.markdown("""
12
  Levels of gene activity along differentiation.
13
  """)
14
 
15
- ds = st.sidebar.selectbox(
16
- "**Load dataset**",
17
- DATA.keys(),
18
- index=None,
19
- placeholder="Select contact method...",
20
- )
21
 
22
- if ds is not None:
23
-
24
- adata = anndata.read_h5ad(fetch_resource(DATA[ds]['DATASET']))
25
 
26
  sl_dr = st.sidebar.selectbox(
27
  "**Dimension reduction**",
@@ -30,6 +23,10 @@ if ds is not None:
30
  placeholder="Select method ...",
31
  )
32
 
 
 
 
 
33
  sl_metadata = st.sidebar.selectbox(
34
  "**Metadata**",
35
  adata.obs.columns,
@@ -39,7 +36,7 @@ if ds is not None:
39
 
40
  sl_feature = st.sidebar.selectbox(
41
  "**Gene**",
42
- adata.raw.var_names,
43
  index=0,
44
  placeholder="Select gene ...",
45
  )
@@ -48,7 +45,7 @@ if ds is not None:
48
  sl_denoised = st.sidebar.checkbox(
49
  "Use denoised expression?",
50
  help="Denoised expression is sampled from the decoder.",
51
- disabled=(not is_imputed)
52
  )
53
 
54
  col1, col2 = st.columns(2)
 
1
  #!/usr/bin/env python
 
2
  import streamlit as st
3
 
4
+ from constants import DEFAULT_DR, DEFAULT_META
5
+ from utils import fetch_resource, plot_feature, plot_sc_embedding, ui_model_selection
6
 
7
  st.set_page_config(layout="wide")
8
  st.markdown("""
 
11
  Levels of gene activity along differentiation.
12
  """)
13
 
14
+ ui_model_selection()
 
 
 
 
 
15
 
16
+ if st.session_state["SPECIE"] and st.session_state["VERSION"]:
17
+ adata = fetch_resource(st.session_state["SPECIE"], st.session_state["VERSION"])
 
18
 
19
  sl_dr = st.sidebar.selectbox(
20
  "**Dimension reduction**",
 
23
  placeholder="Select method ...",
24
  )
25
 
26
+ st.sidebar.markdown(
27
+ f"Visualization done on `{adata.uns['neighbors']['params']['use_rep']}` space."
28
+ )
29
+
30
  sl_metadata = st.sidebar.selectbox(
31
  "**Metadata**",
32
  adata.obs.columns,
 
36
 
37
  sl_feature = st.sidebar.selectbox(
38
  "**Gene**",
39
+ sorted(adata.raw.var_names),
40
  index=0,
41
  placeholder="Select gene ...",
42
  )
 
45
  sl_denoised = st.sidebar.checkbox(
46
  "Use denoised expression?",
47
  help="Denoised expression is sampled from the decoder.",
48
+ disabled=(not is_imputed),
49
  )
50
 
51
  col1, col2 = st.columns(2)
pages/2_Differentially_Expressed_Genes.py CHANGED
@@ -1,9 +1,7 @@
1
  #!/usr/bin/env python
2
- import pandas as pd
3
  import streamlit as st
4
 
5
- from constants import DATA
6
- from utils import fetch_resource
7
 
8
  st.set_page_config(layout="wide")
9
 
@@ -12,61 +10,58 @@ st.markdown("""
12
 
13
  Genes below have been determined using `sc.tl.rank_genes_groups` where `t-test_overestim_var`
14
  is the default method.
15
- """
16
- )
 
17
 
18
  filter_flag = []
19
- ds = st.sidebar.selectbox(
20
- "**Select models**",
21
- DATA.keys(),
22
- index=None,
23
- placeholder="Select species",
24
- )
25
-
26
- if ds:
27
- filter_by = st.sidebar.selectbox(
28
  "**Select by**",
29
- DATA[ds]["DEGS"].keys(),
30
  index=None,
31
- placeholder="Select by",
32
  )
33
 
34
- if ds and filter_by:
35
- markers = pd.read_feather(fetch_resource(DATA[ds]['DEGS'][filter_by]))
36
 
37
- group = st.sidebar.multiselect(
38
- "**Cell type**", markers.group.unique(), placeholder="Select group ..."
39
- )
40
 
41
- genes = st.sidebar.multiselect(
42
- "**Gene**", markers.gene_symbol.unique(), placeholder="Select genes ..."
43
- )
44
 
45
- foldchange = st.sidebar.number_input(
46
- "**Log2 fold-change**",
47
- value=1,
48
- )
49
 
50
- pval = st.sidebar.number_input(
51
- "**Adjusted p-value**",
52
- value=0.05,
53
- )
54
 
55
- if group:
56
- filter_flag.append("group == @group")
57
 
58
- if genes:
59
- filter_flag.append("@genes in gene_symbol")
60
 
61
- if foldchange:
62
- filter_flag.append(
63
- "logfoldchanges > @foldchange"
64
- if foldchange > 0
65
- else "logfoldchanges < @foldchange"
66
- )
67
 
68
- if pval:
69
- filter_flag.append("pvals_adj < @pval")
70
 
71
- subset = markers.query(" & ".join(filter_flag)) if filter_flag else markers
72
- st.dataframe(subset, use_container_width=True, height=650)
 
1
  #!/usr/bin/env python
 
2
  import streamlit as st
3
 
4
+ from utils import fetch_resource, get_degs, ui_model_selection
 
5
 
6
  st.set_page_config(layout="wide")
7
 
 
10
 
11
  Genes below have been determined using `sc.tl.rank_genes_groups` where `t-test_overestim_var`
12
  is the default method.
13
+ """)
14
+
15
+ ui_model_selection()
16
 
17
  filter_flag = []
18
+
19
+ if st.session_state["SPECIE"] and st.session_state["VERSION"]:
20
+ adata = fetch_resource(st.session_state["SPECIE"], st.session_state["VERSION"])
21
+
22
+ degs_by = st.sidebar.selectbox(
 
 
 
 
23
  "**Select by**",
24
+ [x for x in adata.uns_keys() if "degs" in x],
25
  index=None,
26
+ placeholder="Differentially expressed genes by",
27
  )
28
 
29
+ if degs_by:
30
+ degs = get_degs(adata, degs_by)
31
 
32
+ group = st.sidebar.multiselect(
33
+ "**Cell type**", degs["group"].unique(), placeholder="Select group ..."
34
+ )
35
 
36
+ genes = st.sidebar.multiselect(
37
+ "**Gene**", sorted(degs["names"].unique()), placeholder="Select genes ..."
38
+ )
39
 
40
+ foldchange = st.sidebar.number_input(
41
+ "**Log2 fold-change**",
42
+ value=1,
43
+ )
44
 
45
+ pval_adj = st.sidebar.number_input(
46
+ "**Adjusted p-value**",
47
+ value=0.05,
48
+ )
49
 
50
+ if group:
51
+ filter_flag.append("group == @group")
52
 
53
+ if genes:
54
+ filter_flag.append("@genes in names")
55
 
56
+ if foldchange:
57
+ filter_flag.append(
58
+ "logfoldchanges > @foldchange"
59
+ if foldchange > 0
60
+ else "logfoldchanges < @foldchange"
61
+ )
62
 
63
+ if pval_adj:
64
+ filter_flag.append("pvals_adj < @pval_adj")
65
 
66
+ subset = degs.query(" & ".join(filter_flag)) if filter_flag else degs
67
+ st.dataframe(subset, use_container_width=True, height=650)
pages/3_SHAP_features.py CHANGED
@@ -1,9 +1,7 @@
1
  #!/usr/bin/env python
2
- import pandas as pd
3
  import streamlit as st
4
 
5
- from constants import DATA
6
- from utils import fetch_resource
7
 
8
  st.set_page_config(layout="wide")
9
 
@@ -22,37 +20,49 @@ st.markdown("""
22
  - logfoldchanges: Log2fold change from differentiation expression analysis
23
  - pvals_adj: Adjusted p-value from differentiation expression analysis
24
  - scores: Estimated score from differentiation expression analysis
25
- """
26
- )
27
-
28
- ds = st.sidebar.selectbox(
29
- "**Load dataset**",
30
- DATA.keys(),
31
- index=None,
32
- placeholder="Select dataset ...",
33
- )
34
-
35
- if ds:
36
- data = pd.read_feather(fetch_resource(DATA[ds]["SHAP"]))
37
-
38
- query = st.sidebar.selectbox(
39
- "**Subset**",
40
- data.ct.unique().tolist(),
41
  index=None,
42
- placeholder="Select cell type ...",
43
  )
44
 
45
- features = st.sidebar.multiselect(
46
- "**Genes**", data.feature.unique(), placeholder="Select genes ..."
47
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
 
49
- filter_condition = []
50
- if query:
51
- filter_condition.append("ct == @query")
52
- if features:
53
- filter_condition.append("feature in @features")
54
 
55
- if filter_condition:
56
- data = data.query(" & ".join(filter_condition))
57
 
58
- st.dataframe(data, use_container_width=True, height=650)
 
1
  #!/usr/bin/env python
 
2
  import streamlit as st
3
 
4
+ from utils import fetch_resource, ui_model_selection
 
5
 
6
  st.set_page_config(layout="wide")
7
 
 
20
  - logfoldchanges: Log2fold change from differentiation expression analysis
21
  - pvals_adj: Adjusted p-value from differentiation expression analysis
22
  - scores: Estimated score from differentiation expression analysis
23
+ """)
24
+
25
+ ui_model_selection()
26
+
27
+ filter_condition = []
28
+
29
+ if st.session_state["SPECIE"] and st.session_state["VERSION"]:
30
+ adata = fetch_resource(st.session_state["SPECIE"], st.session_state["VERSION"])
31
+
32
+ explainer = st.sidebar.selectbox(
33
+ "**Explainer**",
34
+ adata.uns["explainer"].keys(),
 
 
 
 
35
  index=None,
36
+ placeholder="Select explainer ...",
37
  )
38
 
39
+ if explainer:
40
+ shap_values = adata.uns["explainer"][explainer].pop("shap_values").reset_index()
41
+ params = [f"{k}:\t{v}" for k, v in adata.uns["explainer"][explainer].items()]
42
+
43
+ st.sidebar.markdown("**Parameters**")
44
+ for k, v in adata.uns["explainer"][explainer].items():
45
+ st.sidebar.markdown(f"{k}:\t{v}")
46
+
47
+ celltype = st.sidebar.selectbox(
48
+ "**Cell type**",
49
+ adata.obs.ct.cat.categories,
50
+ index=None,
51
+ placeholder="Select cell type ...",
52
+ )
53
+
54
+ features = st.sidebar.multiselect(
55
+ "**Genes**",
56
+ sorted(shap_values.feature.unique()),
57
+ placeholder="Select genes ...",
58
+ )
59
 
60
+ if celltype:
61
+ filter_condition.append("ct == @celltype")
62
+ if features:
63
+ filter_condition.append("feature in @features")
 
64
 
65
+ if filter_condition:
66
+ shap_values = shap_values.query(" & ".join(filter_condition))
67
 
68
+ st.dataframe(shap_values, use_container_width=True, height=650)
pages/4_Download.py CHANGED
@@ -1,40 +1,49 @@
1
  #!/usr/bin/env python
2
  import streamlit as st
3
 
4
- from constants import DATA
5
 
6
  st.set_page_config(layout="wide")
7
 
8
  st.markdown(
9
- f"""
10
- # Download
11
 
12
- ## 1. Pipelines
13
 
14
  - Downloading datasets: [nf-core/fetchngs (revision 1.10.0)](https://github.com/nf-core/fetchngs)
15
  - Aligning datasets: [brickmanlab/scrnaseq (revision: feature/smartseq)](https://github.com/brickmanlab/scrnaseq)
16
- - **Ensembl Genomes**
17
  - Mouse: GRCm38 v102
18
  - Human: GRCh38 v110
19
 
20
  ## 2. Codebase
21
 
22
  - Data analysis: [brickmanlab/proks-salehin-et-al](https://github.com/brickmanlab/proks-salehin-et-al)
23
- - Web portal: [brickmanlab/preimplantation-portal](https://github.com/brickmanlab/preimplantation-portal)
 
 
 
24
 
25
- ## 3. Raw data
26
 
27
- - [Mouse]({DATA['MOUSE']['RAW_DATASET']})
28
- - [Human]({DATA['HUMAN']['RAW_DATASET']})
 
29
 
30
- ## 4. AI models
31
 
32
  Trained models with parameters were uploaded to [Hugging Face](https://huggingface.co/brickmanlab/preimplantation-models).
33
-
34
- ### 4.1 Models
35
-
36
- - [scANVI mouse](https://huggingface.co/brickmanlab/mouse-scanvi)
37
- - [scANVI human](https://huggingface.co/brickmanlab/human-scanvi)
38
-
39
  """
40
  )
 
 
 
 
 
 
 
 
 
 
 
 
1
  #!/usr/bin/env python
2
  import streamlit as st
3
 
4
+ from constants import MODELS
5
 
6
  st.set_page_config(layout="wide")
7
 
8
  st.markdown(
9
+ """
10
+ # Download & Credits
11
 
12
+ ## 1. Preprocessing pipelines
13
 
14
  - Downloading datasets: [nf-core/fetchngs (revision 1.10.0)](https://github.com/nf-core/fetchngs)
15
  - Aligning datasets: [brickmanlab/scrnaseq (revision: feature/smartseq)](https://github.com/brickmanlab/scrnaseq)
16
+ - **Ensembl Genomes (models < v1.0.1)**
17
  - Mouse: GRCm38 v102
18
  - Human: GRCh38 v110
19
 
20
  ## 2. Codebase
21
 
22
  - Data analysis: [brickmanlab/proks-salehin-et-al](https://github.com/brickmanlab/proks-salehin-et-al)
23
+ - Web portal on HF: [brickmanlab/hf-preimplantation-portal](https://huggingface.co/spaces/brickmanlab/hf-preimplantation-portal/tree/main)
24
+ - Web portal (deprecated): [brickmanlab/preimplantation-portal](https://github.com/brickmanlab/preimplantation-portal)
25
+
26
+ ## 3. Raw and normalized counts
27
 
28
+ Raw counts are stored in `layers['counts']` and normalized counts are stored in `.X`.
29
 
30
+ - models < v1.0.1
31
+ - [mouse](https://zenodo.org/records/13749348/files/01_mouse_reprocessed.h5ad)
32
+ - [human](https://zenodo.org/records/13749348/files/32_human_adata.h5ad)
33
 
34
+ ## 4. scVI/scANVI models
35
 
36
  Trained models with parameters were uploaded to [Hugging Face](https://huggingface.co/brickmanlab/preimplantation-models).
 
 
 
 
 
 
37
  """
38
  )
39
+
40
# Render one markdown bullet per species; each published model version links
# to the matching branch/tag of that species' scANVI repository on the
# Hugging Face Hub.
bullets = []
for specie, versions in MODELS.items():
    # str.join puts ", " only *between* links, so no trailing separator has to
    # be sliced off afterwards (which would clip the label itself whenever a
    # species has no versions listed).
    links = ", ".join(
        f"[{version}](https://huggingface.co/brickmanlab/{specie.lower()}-scanvi/tree/{version})"
        for version in versions
    )
    bullets.append(f"- **{specie}**: {links}\n")

text = "".join(bullets)
st.markdown(text)
utils.py CHANGED
@@ -8,9 +8,41 @@ import pandas as pd
8
  import plotly.express as px
9
  import streamlit as st
10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
 
12
  @st.cache_data
13
- def fetch_resource(url: str) -> str:
14
  """Helper function for downloading datasets
15
 
16
  Parameters
@@ -24,14 +56,45 @@ def fetch_resource(url: str) -> str:
24
  Path where the file was downloaded to, default /tmp
25
  """
26
 
27
- filename = f"/tmp/{url.split('/')[-1]}"
28
- if not Path(filename).exists():
 
 
 
29
  try:
30
- urllib.request.urlretrieve(url, filename)
31
  except (socket.gaierror, urllib.error.URLError) as err:
32
  raise ConnectionError(f"could not download {url} due to {err}")
33
 
34
- return filename
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
 
36
 
37
  def get_embedding(adata: anndata.AnnData, key: str) -> pd.DataFrame:
@@ -162,3 +225,33 @@ def plot_feature(
162
 
163
  ax_ = ax if ax else st
164
  ax_.plotly_chart(g, use_container_width=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  import plotly.express as px
9
  import streamlit as st
10
 
11
+ from constants import MODELS
12
+
13
+
14
def ui_model_selection():
    """Render the shared species/version selectors in the sidebar.

    The selection is kept in ``st.session_state["SPECIE"]`` and
    ``st.session_state["VERSION"]`` so that it can be preselected again the
    next time this function runs (e.g. on another page). Both keys are
    created with ``None`` on first use and are only overwritten once a
    species *and* a version have been chosen.
    """

    # shared state variables between pages
    if "SPECIE" not in st.session_state:
        st.session_state["SPECIE"] = None
    if "VERSION" not in st.session_state:
        st.session_state["VERSION"] = None

    species = list(MODELS)
    saved_specie = st.session_state["SPECIE"]
    specie = st.sidebar.selectbox(
        "**Species**",
        species,
        # Preselect only a value that is still offered; anything else falls
        # back to the empty placeholder instead of raising in .index().
        index=species.index(saved_specie) if saved_specie in species else None,
        placeholder="Supported species",
    )

    # Defined up front so the check below never touches an unbound name when
    # no species has been picked yet.
    version = None
    if specie:
        versions = MODELS[specie]
        saved_version = st.session_state["VERSION"]
        version = st.sidebar.selectbox(
            "**Version**",
            versions,
            # The stored version may belong to a different species whose
            # version list differs; guard the lookup rather than crash.
            index=versions.index(saved_version) if saved_version in versions else None,
            placeholder="Version",
        )

    st.sidebar.divider()

    if specie and version:
        st.session_state["SPECIE"] = specie
        st.session_state["VERSION"] = version
42
+
43
 
44
  @st.cache_data
45
+ def _fetch_resource(url: str, filename: str) -> str:
46
  """Helper function for downloading datasets
47
 
48
  Parameters
 
56
  Path where the file was downloaded to, default /tmp
57
  """
58
 
59
+ destination = Path(f"/tmp/{filename}")
60
+ if not filename:
61
+ raise ValueError("Filename not specified!")
62
+
63
+ if not destination.exists():
64
  try:
65
+ urllib.request.urlretrieve(url, destination)
66
  except (socket.gaierror, urllib.error.URLError) as err:
67
  raise ConnectionError(f"could not download {url} due to {err}")
68
 
69
+ return destination.as_posix()
70
+
71
+
72
def fetch_resource(specie: str, version: str) -> anndata.AnnData:
    """Load H5AD dataset from Hugging Face (https://huggingface.co/brickmanlab)

    The file is downloaded through ``_fetch_resource`` and then read with
    ``anndata.read_h5ad``.

    Parameters
    ----------
    specie : str
        Species key, has to be one of the keys of ``MODELS``
    version : str
        Model version, has to be listed in ``MODELS[specie]``

    Returns
    -------
    anndata.AnnData
        Annotated dataset

    Raises
    ------
    ValueError
        If the species or the version is not listed in ``MODELS``
    """

    # `or`, not `and`: an unknown species OR an unknown version is invalid.
    # Short-circuiting also keeps MODELS[specie] from raising a KeyError when
    # the species itself is unknown.
    if specie not in MODELS or version not in MODELS[specie]:
        raise ValueError(f"Provided {specie} and {version} are not present on Hugging Face models!")

    url: str = f"https://huggingface.co/brickmanlab/{specie.lower()}-scanvi/resolve/{version}/adata.h5ad"
    return anndata.read_h5ad(_fetch_resource(url, filename=f"{specie.lower()}_v{version}.h5ad"))
98
 
99
 
100
  def get_embedding(adata: anndata.AnnData, key: str) -> pd.DataFrame:
 
225
 
226
  ax_ = ax if ax else st
227
  ax_.plotly_chart(g, use_container_width=True)
228
+
229
+
230
def get_degs(adata: anndata.AnnData, key: str) -> pd.DataFrame:
    """Format DEGs as a long dataframe.

    Code taken from https://github.com/scverse/scanpy/blob/1.10.4/src/scanpy/get/get.py#L27-L111

    Parameters
    ----------
    adata : anndata.AnnData
        Annotated dataset
    key : str
        Key in ``adata.uns`` under which the DEGs are stored

    Returns
    -------
    pd.DataFrame
        One row per (group, gene) with the columns ``group``, ``names``,
        ``scores``, ``logfoldchanges``, ``pvals`` and ``pvals_adj``
    """

    # Group labels are the field names of the structured "names" array.
    groups = list(adata.uns[key]["names"].dtype.names)
    fields = ["names", "scores", "logfoldchanges", "pvals", "pvals_adj"]

    # One wide table (rows = rank, columns = groups) per statistic.
    per_field = []
    for field in fields:
        per_field.append(pd.DataFrame(adata.uns[key][field])[groups])

    # Columns become a (statistic, group) MultiIndex; stacking the group level
    # turns the table into long format with one column per statistic.
    wide = pd.concat(per_field, axis=1, names=[None, "group"], keys=fields)
    long = wide.stack(level=1).reset_index()

    # Categorical keeps the groups in their stored order when sorting;
    # "level_0" is the original row position within each group.
    long["group"] = pd.Categorical(long["group"], categories=groups)
    ordered = long.sort_values(["group", "level_0"])

    return ordered.drop(columns="level_0")