Spaces:

brickmanlab
/

hf-preimplantation-portal

Sleeping

App Files Files Community

matq007 committed on Jan 17

Commit

0e1164c

unverified ·

1 Parent(s): fc741fb

feat: move from Streamlit to Hugging Face Spaces

Browse files

Files changed (8) hide show

.gitignore +3 -0
README.md +4 -3
constants.py +3 -23
pages/1_Gene_Expression.py +11 -14
pages/2_Differentially_Expressed_Genes.py +41 -46
pages/3_SHAP_features.py +41 -31
pages/4_Download.py +25 -16
utils.py +98 -5

.gitignore ADDED Viewed

	@@ -0,0 +1,3 @@

+.ruff_cache
+.venv
+__pycache__

README.md CHANGED Viewed

@@ -1,15 +1,16 @@
 ---
 title: Preimplantation portal
 sdk: streamlit
 sdk_version: 1.38.0
 app_file: Home.py
 ---
-# preimplantation portal
 ## Run locally
 ```bash
-source venv/bin/activate
 streamlit run Home.py
 ```

 ---
 title: Preimplantation portal
+short_description: Preimplantation mouse and human development
 sdk: streamlit
 sdk_version: 1.38.0
 app_file: Home.py
 ---
 ## Run locally
 ```bash
+source .venv/bin/activate
+# install dependencies
+uv pip install -r requirements.txt
 streamlit run Home.py
 ```

constants.py CHANGED Viewed

@@ -1,26 +1,6 @@
-VERSION = 1.5
 DEFAULT_DR = "X_draw_graph_fa"
 DEFAULT_META = "stage"
-ZENODO_URL = "https://zenodo.org/records/13749348/files"
-DATA = {
-    "HUMAN": {
-        "RAW_DATASET": f"{ZENODO_URL}/32_human_adata.h5ad",
-        "DATASET": f"{ZENODO_URL}/portal_human_v{VERSION}.h5ad",
-        "DEGS": {
-            "CT": f"{ZENODO_URL}/human_degs_ct_v{VERSION}.feather",
-            "STAGE": f"{ZENODO_URL}/human_degs_stage_v{VERSION}.feather"
-        },
-        "SHAP": f"{ZENODO_URL}/human_SHAP_v{VERSION}.feather",
-    },
-    "MOUSE": {
-        "RAW_DATASET": f"{ZENODO_URL}/01_mouse_reprocessed.h5ad",
-        "DATASET": f"{ZENODO_URL}/portal_mouse_v{VERSION}.h5ad",
-        "DEGS": {
-            "CT": f"{ZENODO_URL}/mouse_degs_ct_v{VERSION}.feather",
-            "STAGE": f"{ZENODO_URL}/mouse_degs_stage_v{VERSION}.feather",
-        },
-        "SHAP": f"{ZENODO_URL}/mouse_SHAP_v{VERSION}.feather",
-    },
-}

+# Constants
+MODELS = {"HUMAN": ["v1.0.1", "main"], "MOUSE": ["v1.0.1", "main"]}
 DEFAULT_DR = "X_draw_graph_fa"
 DEFAULT_META = "stage"

pages/1_Gene_Expression.py CHANGED Viewed

@@ -1,9 +1,8 @@
 #!/usr/bin/env python
-import anndata
 import streamlit as st
-from constants import DATA, DEFAULT_DR, DEFAULT_META
-from utils import fetch_resource, plot_feature, plot_sc_embedding
 st.set_page_config(layout="wide")
 st.markdown("""
@@ -12,16 +11,10 @@ st.markdown("""
     Levels of gene activity along differentiation.
 """)
-ds = st.sidebar.selectbox(
-    "**Load dataset**",
-    DATA.keys(),
-    index=None,
-    placeholder="Select contact method...",
-)
-if ds is not None:
-    adata = anndata.read_h5ad(fetch_resource(DATA[ds]['DATASET']))
     sl_dr = st.sidebar.selectbox(
         "**Dimension reduction**",
@@ -30,6 +23,10 @@ if ds is not None:
         placeholder="Select method ...",
     )
     sl_metadata = st.sidebar.selectbox(
         "**Metadata**",
         adata.obs.columns,
@@ -39,7 +36,7 @@ if ds is not None:
     sl_feature = st.sidebar.selectbox(
         "**Gene**",
-        adata.raw.var_names,
         index=0,
         placeholder="Select gene ...",
     )
@@ -48,7 +45,7 @@ if ds is not None:
     sl_denoised = st.sidebar.checkbox(
         "Use denoised expression?",
         help="Denoised expression is sampled from the decoder.",
-        disabled=(not is_imputed)
     )
     col1, col2 = st.columns(2)

 #!/usr/bin/env python
 import streamlit as st
+from constants import DEFAULT_DR, DEFAULT_META
+from utils import fetch_resource, plot_feature, plot_sc_embedding, ui_model_selection
 st.set_page_config(layout="wide")
 st.markdown("""
     Levels of gene activity along differentiation.
 """)
+ui_model_selection()
+if st.session_state["SPECIE"] and st.session_state["VERSION"]:
+    adata = fetch_resource(st.session_state["SPECIE"], st.session_state["VERSION"])
     sl_dr = st.sidebar.selectbox(
         "**Dimension reduction**",
         placeholder="Select method ...",
     )
+    st.sidebar.markdown(
+        f"Visualization done on `{adata.uns['neighbors']['params']['use_rep']}` space."
+    )
     sl_metadata = st.sidebar.selectbox(
         "**Metadata**",
         adata.obs.columns,
     sl_feature = st.sidebar.selectbox(
         "**Gene**",
+        sorted(adata.raw.var_names),
         index=0,
         placeholder="Select gene ...",
     )
     sl_denoised = st.sidebar.checkbox(
         "Use denoised expression?",
         help="Denoised expression is sampled from the decoder.",
+        disabled=(not is_imputed),
     )
     col1, col2 = st.columns(2)

pages/2_Differentially_Expressed_Genes.py CHANGED Viewed

@@ -1,9 +1,7 @@
 #!/usr/bin/env python
-import pandas as pd
 import streamlit as st
-from constants import DATA
-from utils import fetch_resource
 st.set_page_config(layout="wide")
@@ -12,61 +10,58 @@ st.markdown("""
     Genes below have been determined using `sc.tl.rank_genes_groups` where `t-test_overestim_var`
     is the default method.
-    """
-)
 filter_flag = []
-ds = st.sidebar.selectbox(
-    "**Select models**",
-    DATA.keys(),
-    index=None,
-    placeholder="Select species",
-)
-if ds:
-    filter_by = st.sidebar.selectbox(
         "**Select by**",
-        DATA[ds]["DEGS"].keys(),
         index=None,
-        placeholder="Select by",
     )
-if ds and filter_by:
-    markers = pd.read_feather(fetch_resource(DATA[ds]['DEGS'][filter_by]))
-    group = st.sidebar.multiselect(
-        "**Cell type**", markers.group.unique(), placeholder="Select group ..."
-    )
-    genes = st.sidebar.multiselect(
-        "**Gene**", markers.gene_symbol.unique(), placeholder="Select genes ..."
-    )
-    foldchange = st.sidebar.number_input(
-        "**Log2 fold-change**",
-        value=1,
-    )
-    pval = st.sidebar.number_input(
-        "**Adjusted p-value**",
-        value=0.05,
-    )
-    if group:
-        filter_flag.append("group == @group")
-    if genes:
-        filter_flag.append("@genes in gene_symbol")
-    if foldchange:
-        filter_flag.append(
-            "logfoldchanges > @foldchange"
-            if foldchange > 0
-            else "logfoldchanges < @foldchange"
-        )
-    if pval:
-        filter_flag.append("pvals_adj < @pval")
-    subset = markers.query(" & ".join(filter_flag)) if filter_flag else markers
-    st.dataframe(subset, use_container_width=True, height=650)

 #!/usr/bin/env python
 import streamlit as st
+from utils import fetch_resource, get_degs, ui_model_selection
 st.set_page_config(layout="wide")
     Genes below have been determined using `sc.tl.rank_genes_groups` where `t-test_overestim_var`
     is the default method.
+    """)
# Sidebar: shared species/version selectors (stored in st.session_state).
ui_model_selection()

# pandas query fragments, combined with "&" right before filtering.
filter_flag = []

if st.session_state["SPECIE"] and st.session_state["VERSION"]:
    adata = fetch_resource(st.session_state["SPECIE"], st.session_state["VERSION"])

    # Every `.uns` entry whose key contains "degs" is offered as a DEG table.
    degs_by = st.sidebar.selectbox(
        "**Select by**",
        [x for x in adata.uns_keys() if "degs" in x],
        index=None,
        placeholder="Differentially expressed genes by",
    )

    if degs_by:
        degs = get_degs(adata, degs_by)

        group = st.sidebar.multiselect(
            "**Cell type**", degs["group"].unique(), placeholder="Select group ..."
        )
        genes = st.sidebar.multiselect(
            "**Gene**", sorted(degs["names"].unique()), placeholder="Select genes ..."
        )
        # A float default keeps the widget in float mode; with an int default
        # the input only accepts whole numbers, so thresholds such as 0.5 or
        # 1.5 could not be entered.
        foldchange = st.sidebar.number_input(
            "**Log2 fold-change**",
            value=1.0,
            step=0.5,
        )
        pval_adj = st.sidebar.number_input(
            "**Adjusted p-value**",
            value=0.05,
        )

        # NOTE: the local names below are referenced from the query strings
        # via "@name", so they must not be renamed independently.
        if group:
            filter_flag.append("group == @group")
        if genes:
            filter_flag.append("@genes in names")
        if foldchange:
            # Positive threshold keeps up-regulated genes, negative threshold
            # keeps down-regulated ones; 0 disables the filter.
            filter_flag.append(
                "logfoldchanges > @foldchange"
                if foldchange > 0
                else "logfoldchanges < @foldchange"
            )
        if pval_adj:
            filter_flag.append("pvals_adj < @pval_adj")

        subset = degs.query(" & ".join(filter_flag)) if filter_flag else degs
        st.dataframe(subset, use_container_width=True, height=650)

pages/3_SHAP_features.py CHANGED Viewed

@@ -1,9 +1,7 @@
 #!/usr/bin/env python
-import pandas as pd
 import streamlit as st
-from constants import DATA
-from utils import fetch_resource
 st.set_page_config(layout="wide")
@@ -22,37 +20,49 @@ st.markdown("""
     - logfoldchanges: Log2fold change from differentiation expression analysis
     - pvals_adj: Adjusted p-value from differentiation expression analysis
     - scores: Estimated score from differentiation expression analysis
-    """
-)
-ds = st.sidebar.selectbox(
-    "**Load dataset**",
-    DATA.keys(),
-    index=None,
-    placeholder="Select dataset ...",
-)
-if ds:
-    data = pd.read_feather(fetch_resource(DATA[ds]["SHAP"]))
-    query = st.sidebar.selectbox(
-        "**Subset**",
-        data.ct.unique().tolist(),
         index=None,
-        placeholder="Select cell type ...",
     )
-    features = st.sidebar.multiselect(
-        "**Genes**", data.feature.unique(), placeholder="Select genes ..."
-    )
-    filter_condition = []
-    if query:
-        filter_condition.append("ct == @query")
-    if features:
-        filter_condition.append("feature in @features")
-    if filter_condition:
-        data = data.query(" & ".join(filter_condition))
-    st.dataframe(data, use_container_width=True, height=650)

 #!/usr/bin/env python
 import streamlit as st
+from utils import fetch_resource, ui_model_selection
 st.set_page_config(layout="wide")
     - logfoldchanges: Log2fold change from differentiation expression analysis
     - pvals_adj: Adjusted p-value from differentiation expression analysis
     - scores: Estimated score from differentiation expression analysis
+    """)
# Sidebar: shared species/version selectors (stored in st.session_state).
ui_model_selection()

# pandas query fragments, combined with "&" right before filtering.
filter_condition = []

if st.session_state["SPECIE"] and st.session_state["VERSION"]:
    adata = fetch_resource(st.session_state["SPECIE"], st.session_state["VERSION"])

    explainer = st.sidebar.selectbox(
        "**Explainer**",
        adata.uns["explainer"].keys(),
        index=None,
        placeholder="Select explainer ...",
    )

    if explainer:
        # `pop` removes the SHAP table from the explainer entry, so the loop
        # below only lists the remaining entries as parameters.
        shap_values = adata.uns["explainer"][explainer].pop("shap_values").reset_index()

        st.sidebar.markdown("**Parameters**")
        for k, v in adata.uns["explainer"][explainer].items():
            st.sidebar.markdown(f"{k}:\t{v}")

        celltype = st.sidebar.selectbox(
            "**Cell type**",
            adata.obs.ct.cat.categories,
            index=None,
            placeholder="Select cell type ...",
        )
        features = st.sidebar.multiselect(
            "**Genes**",
            sorted(shap_values.feature.unique()),
            placeholder="Select genes ...",
        )

        # NOTE: `celltype` and `features` are referenced from the query
        # strings via "@name", so they must not be renamed independently.
        if celltype:
            filter_condition.append("ct == @celltype")
        if features:
            filter_condition.append("feature in @features")
        if filter_condition:
            shap_values = shap_values.query(" & ".join(filter_condition))

        st.dataframe(shap_values, use_container_width=True, height=650)

pages/4_Download.py CHANGED Viewed

@@ -1,40 +1,49 @@
 #!/usr/bin/env python
 import streamlit as st
-from constants import DATA
 st.set_page_config(layout="wide")
 st.markdown(
-    f"""
-    # Download
-    ## 1. Pipelines
     - Downloading datasets: [nf-core/fetchngs (revision 1.10.0)](https://github.com/nf-core/fetchngs)
     - Aligning datasets: [brickmanlab/scrnaseq (revision: feature/smartseq)](https://github.com/brickmanlab/scrnaseq)
-    - **Ensembl Genomes**
         - Mouse: GRCm38 v102
         - Human: GRCh38 v110
     ## 2. Codebase
     - Data analysis: [brickmanlab/proks-salehin-et-al](https://github.com/brickmanlab/proks-salehin-et-al)
-    - Web portal: [brickmanlab/preimplantation-portal](https://github.com/brickmanlab/preimplantation-portal)
-    ## 3. Raw data
-    - [Mouse]({DATA['MOUSE']['RAW_DATASET']})
-    - [Human]({DATA['HUMAN']['RAW_DATASET']})
-    ## 4. AI models
     Trained models with parameters were uploaded to [Hugging Face](https://huggingface.co/brickmanlab/preimplantation-models).
-    ### 4.1 Models
-    - [scANVI mouse](https://huggingface.co/brickmanlab/mouse-scanvi)
-    - [scANVI human](https://huggingface.co/brickmanlab/human-scanvi)
     """
 )

 #!/usr/bin/env python
 import streamlit as st
+from constants import MODELS
 st.set_page_config(layout="wide")
 st.markdown(
+    """
+    # Download & Credits
+    ## 1. Preprocessing pipelines
     - Downloading datasets: [nf-core/fetchngs (revision 1.10.0)](https://github.com/nf-core/fetchngs)
     - Aligning datasets: [brickmanlab/scrnaseq (revision: feature/smartseq)](https://github.com/brickmanlab/scrnaseq)
+    - **Ensembl Genomes (models < v1.0.1)**
         - Mouse: GRCm38 v102
         - Human: GRCh38 v110
     ## 2. Codebase
     - Data analysis: [brickmanlab/proks-salehin-et-al](https://github.com/brickmanlab/proks-salehin-et-al)
+    - Web portal on HF: [brickmanlab/hf-preimplantation-portal](https://huggingface.co/spaces/brickmanlab/hf-preimplantation-portal/tree/main)
+    - Web portal (deprecated): [brickmanlab/preimplantation-portal](https://github.com/brickmanlab/preimplantation-portal)
+    ## 3. Raw and normalized counts
+    Raw counts are stored in `layers['counts']` and normalized counts are stored in `.X`.
+    - models < v1.0.1
+        - [mouse](https://zenodo.org/records/13749348/files/01_mouse_reprocessed.h5ad)
+        - [human](https://zenodo.org/records/13749348/files/32_human_adata.h5ad)
+    ## 4. scVI/scANVI models
     Trained models with parameters were uploaded to [Hugging Face](https://huggingface.co/brickmanlab/preimplantation-models).
     """
 )
# Render one markdown bullet per species, linking every published model
# revision on Hugging Face. Joining the links avoids trimming a trailing
# separator by hand, which would cut into the label for an empty version list.
bullets = []
for specie, versions in MODELS.items():
    links = ", ".join(
        f"[{version}](https://huggingface.co/brickmanlab/{specie.lower()}-scanvi/tree/{version})"
        for version in versions
    )
    bullets.append(f"- **{specie}**: {links}\n")
text = "".join(bullets)
st.markdown(text)

utils.py CHANGED Viewed

@@ -8,9 +8,41 @@ import pandas as pd
 import plotly.express as px
 import streamlit as st
 @st.cache_data
-def fetch_resource(url: str) -> str:
     """Helper function for downloading datasets
     Parameters
@@ -24,14 +56,45 @@ def fetch_resource(url: str) -> str:
         Path where the file was downloaded to, default /tmp
     """
-    filename = f"/tmp/{url.split('/')[-1]}"
-    if not Path(filename).exists():
         try:
-            urllib.request.urlretrieve(url, filename)
         except (socket.gaierror, urllib.error.URLError) as err:
             raise ConnectionError(f"could not download {url} due to {err}")
-    return filename
 def get_embedding(adata: anndata.AnnData, key: str) -> pd.DataFrame:
@@ -162,3 +225,33 @@ def plot_feature(
     ax_ = ax if ax else st
     ax_.plotly_chart(g, use_container_width=True)

 import plotly.express as px
 import streamlit as st
+from constants import MODELS
def ui_model_selection():
    """Render the shared species/version selectors in the sidebar.

    The selected pair is kept in ``st.session_state["SPECIE"]`` and
    ``st.session_state["VERSION"]`` so the selectors are pre-filled with the
    values chosen earlier in the session. The session state is only updated
    once both a species and a version are selected.
    """
    # shared state variables between pages
    if "SPECIE" not in st.session_state:
        st.session_state["SPECIE"] = None
    if "VERSION" not in st.session_state:
        st.session_state["VERSION"] = None

    species = list(MODELS)
    saved_specie = st.session_state["SPECIE"]
    specie = st.sidebar.selectbox(
        "**Species**",
        species,
        # Fall back to "nothing selected" when the remembered species is not
        # (or no longer) offered, instead of failing on `.index()`.
        index=species.index(saved_specie) if saved_specie in species else None,
        placeholder="Supported species",
    )

    # Stays None while no species is chosen, so the check below is always safe.
    version = None
    if specie:
        versions = MODELS[specie]
        saved_version = st.session_state["VERSION"]
        version = st.sidebar.selectbox(
            "**Version**",
            versions,
            # The remembered version may not exist for the species picked now.
            index=versions.index(saved_version) if saved_version in versions else None,
            placeholder="Version",
        )

    st.sidebar.divider()

    if specie and version:
        st.session_state["SPECIE"] = specie
        st.session_state["VERSION"] = version
 @st.cache_data
+def _fetch_resource(url: str, filename: str) -> str:
     """Helper function for downloading datasets
     Parameters
         Path where the file was downloaded to, default /tmp
     """
+    destination = Path(f"/tmp/{filename}")
+    if not filename:
+        raise ValueError("Filename not specified!")
+    if not destination.exists():
         try:
+            urllib.request.urlretrieve(url, destination)
         except (socket.gaierror, urllib.error.URLError) as err:
             raise ConnectionError(f"could not download {url} due to {err}")
+    return destination.as_posix()
def fetch_resource(specie: str, version: str) -> anndata.AnnData:
    """Load H5AD dataset from Hugging Face (https://huggingface.co/brickmanlab)

    Parameters
    ----------
    specie : str
        Specie, has to be a key of ``MODELS``
    version : str
        Model version, has to be listed in ``MODELS[specie]``

    Returns
    -------
    anndata.AnnData
        Annotated dataset

    Raises
    ------
    ValueError
        Specie and Version have to exist
    """
    # `or`, not `and`: either an unknown specie or an unknown version is an
    # error, and the short-circuit keeps `MODELS[specie]` from raising
    # KeyError for an unknown specie.
    if specie not in MODELS or version not in MODELS[specie]:
        raise ValueError(f"Provided {specie} and {version} are not present on Hugging Face models!")

    organism = specie.lower()
    url: str = f"https://huggingface.co/brickmanlab/{organism}-scanvi/resolve/{version}/adata.h5ad"

    return anndata.read_h5ad(_fetch_resource(url, filename=f"{organism}_v{version}.h5ad"))
 def get_embedding(adata: anndata.AnnData, key: str) -> pd.DataFrame:
     ax_ = ax if ax else st
     ax_.plotly_chart(g, use_container_width=True)
def get_degs(adata: anndata.AnnData, key: str) -> pd.DataFrame:
    """Format DEGs to dataframe.

    Code taken from https://github.com/scverse/scanpy/blob/1.10.4/src/scanpy/get/get.py#L27-L111

    Parameters
    ----------
    adata : anndata.AnnData
        Annotated dataframe
    key : str
        Key in ``adata.uns`` under which the degs are stored

    Returns
    -------
    pd.DataFrame
        Dataframe of differentially expressed genes, one row per gene and
        group, ordered by group
    """
    # Group labels are the field names of the structured "names" array.
    groups = list(adata.uns[key]["names"].dtype.names)
    fields = ["names", "scores", "logfoldchanges", "pvals", "pvals_adj"]

    # One wide table per statistic, restricted to (and ordered by) the groups.
    per_field = []
    for field in fields:
        per_field.append(pd.DataFrame(adata.uns[key][field])[groups])

    # Columns become a (statistic, group) MultiIndex ...
    wide = pd.concat(per_field, axis=1, names=[None, "group"], keys=fields)
    # ... and moving the group level into the rows yields the long format.
    long = wide.stack(level=1).reset_index()

    long["group"] = pd.Categorical(long["group"], categories=groups)
    # "level_0" holds the original row position; it is only needed for sorting.
    ordered = long.sort_values(["group", "level_0"])
    return ordered.drop(columns="level_0")