Martin Proks committed on
Commit
fc741fb
·
unverified ·
1 Parent(s): dddbe1c

chore: port streamlit-portal to huggingface spaces

Browse files
Home.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ import streamlit as st
3
+
4
+ st.set_page_config(layout="wide")
5
+
6
+ st.markdown(
7
+ """
8
+ # Deep Learning Based Models for Preimplantation Mouse and Human Embryos Based on Single Cell RNA Sequencing
9
+
10
+ _[Martin Proks](https://orcid.org/0000-0002-8178-3128)\*,
11
+ [Nazmus Salehin](https://orcid.org/0000-0002-8155-4296)\*,
12
+ [Joshua M. Brickman](https://orcid.org/0000-0003-1580-7491)**_
13
+
14
+ _\* There authors contributed equally to the work_
15
+
16
+ _** Corresponding author [[email protected]](mailto:[email protected])_
17
+
18
+ The rapid growth of single-cell transcriptomic technology has produced an increasing number of
19
+ datasets for both embryonic development and _in vitro_ pluripotent stem cell derived models.
20
+ This avalanche of data surrounding pluripotency and the process of lineage specification has
21
+ meant it has become increasingly difficult to define specific cell types or states and compare
22
+ these to _in vitro_ differentiation. Here we utilize a set of deep learning (DL) tools to
23
+ integrate and classify multiple datasets. This allows for the definition of both mouse and
24
+ human embryo cell types, lineages and states, thereby maximising the information one can garner
25
+ from these precious experimental resources. Our approaches are built on recent initiatives for
26
+ large scale human organ atlases, but here we focus on the difficult to obtain and process
27
+ material that spans early mouse and human development. We deploy similar approaches as the
28
+ initiatives building large reference organ atlases, however with a focus on early mammalian
29
+ development. Using publicly available data for these stages, we test different deep learning
30
+ approaches and develop a model to classify cell types in an unbiased fashion at the same time as
31
+ defining the set of genes used by the model to identify lineages, cell types and states. We have
32
+ used our models trained on _in vivo_ development to classify pluripotent stem cell models for
33
+ both mouse and human development, showcasing the importance of this resource as a dynamic
34
+ reference for early embryogenesis.
35
+ """
36
+ )
37
+
38
+ st.image(
39
+ "static/Fig-1.v4.3.png",
40
+ caption="""
41
+ Summary of datasets used to build reference models. a) Schematic overview of mouse
42
+ and human preimplantation development. b) Quantification of cells per publication which
43
+ were collected for building the mouse (grey) and human (black) reference. c) Computational
44
+ schematic of tools used to build and interpret the reference models. d) Gene expression
45
+ of canonical markers for each developmental stage in mouse (top) and human (bottom)
46
+ preimplantation. e) Reduced dimensional representation of preimplantation mouse (left)
47
+ and human (right) datasets. dpf: days post fertilization, E: embryonic day.
48
+ """,
49
+ )
README.md CHANGED
@@ -1,12 +1,15 @@
1
  ---
2
- title: Preimplantation
3
- emoji: 🏆
4
- colorFrom: green
5
- colorTo: purple
6
  sdk: streamlit
7
- sdk_version: 1.41.1
8
- app_file: app.py
9
- pinned: false
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
1
  ---
2
+ title: Preimplantation portal
 
 
 
3
  sdk: streamlit
4
+ sdk_version: 1.38.0
5
+ app_file: Home.py
 
6
  ---
7
 
8
+ # preimplantation portal
9
+
10
+ ## Run locally
11
+
12
+ ```bash
13
+ source venv/bin/activate
14
+ streamlit run Home.py
15
+ ```
__init__.py ADDED
File without changes
constants.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Artifact version suffix; interpolated into the Zenodo filenames below
# (e.g. portal_human_v1.5.h5ad).
VERSION = 1.5

# Default dimension-reduction key and metadata column pre-selected in the UI.
DEFAULT_DR = "X_draw_graph_fa"
DEFAULT_META = "stage"
# Base URL of the Zenodo record hosting every dataset used by the portal.
ZENODO_URL = "https://zenodo.org/records/13749348/files"

# Per-species resource registry resolved by the portal pages:
#   RAW_DATASET — unprocessed AnnData (.h5ad) for download
#   DATASET     — processed AnnData used for plotting
#   DEGS        — differentially-expressed-gene tables (feather), keyed by
#                 cell type (CT) or developmental STAGE
#   SHAP        — SHAP feature-importance table for the scANVI classifier
DATA = {
    "HUMAN": {
        "RAW_DATASET": f"{ZENODO_URL}/32_human_adata.h5ad",
        "DATASET": f"{ZENODO_URL}/portal_human_v{VERSION}.h5ad",
        "DEGS": {
            "CT": f"{ZENODO_URL}/human_degs_ct_v{VERSION}.feather",
            "STAGE": f"{ZENODO_URL}/human_degs_stage_v{VERSION}.feather"
        },
        "SHAP": f"{ZENODO_URL}/human_SHAP_v{VERSION}.feather",
    },
    "MOUSE": {
        "RAW_DATASET": f"{ZENODO_URL}/01_mouse_reprocessed.h5ad",
        "DATASET": f"{ZENODO_URL}/portal_mouse_v{VERSION}.h5ad",
        "DEGS": {
            "CT": f"{ZENODO_URL}/mouse_degs_ct_v{VERSION}.feather",
            "STAGE": f"{ZENODO_URL}/mouse_degs_stage_v{VERSION}.feather",
        },
        "SHAP": f"{ZENODO_URL}/mouse_SHAP_v{VERSION}.feather",
    },
}
pages/1_Gene_Expression.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python
"""Gene-expression page: embedding plot colored by metadata or by a gene."""
import anndata
import streamlit as st

from constants import DATA, DEFAULT_DR, DEFAULT_META
from utils import fetch_resource, plot_feature, plot_sc_embedding

st.set_page_config(layout="wide")
st.markdown("""
# Gene expression

Levels of gene activity along differentiation.
""")

ds = st.sidebar.selectbox(
    "**Load dataset**",
    DATA.keys(),
    index=None,
    # Fixed copy-pasted placeholder ("Select contact method...") from the
    # streamlit docs example.
    placeholder="Select dataset ...",
)

if ds is not None:

    # Download (cached) and load the processed AnnData for the chosen species.
    adata = anndata.read_h5ad(fetch_resource(DATA[ds]['DATASET']))

    sl_dr = st.sidebar.selectbox(
        "**Dimension reduction**",
        adata.obsm_keys(),
        index=adata.obsm_keys().index(DEFAULT_DR),
        placeholder="Select method ...",
    )

    sl_metadata = st.sidebar.selectbox(
        "**Metadata**",
        adata.obs.columns,
        index=adata.obs.columns.get_loc(DEFAULT_META),
        placeholder="Select column ...",
    )

    sl_feature = st.sidebar.selectbox(
        "**Gene**",
        adata.raw.var_names,
        index=0,
        placeholder="Select gene ...",
    )

    # Denoised (scVI) expression is only stored for genes kept in adata.var;
    # disable the checkbox for genes present only in adata.raw.
    is_imputed = sl_feature in adata.var_names
    sl_denoised = st.sidebar.checkbox(
        "Use denoised expression?",
        help="Denoised expression is sampled from the decoder.",
        disabled=(not is_imputed)
    )

    col1, col2 = st.columns(2)
    plot_sc_embedding(adata, group_by=sl_metadata, reduction_key=sl_dr, ax=col1)
    # `layer` is truthy when the user ticked the denoised checkbox.
    plot_sc_embedding(
        adata, feature=sl_feature, reduction_key=sl_dr, layer=sl_denoised, ax=col2
    )

    st.markdown("## Raw gene expression")
    plot_feature(adata, feature=sl_feature, group_by=sl_metadata, kind="box")
pages/2_Differentially_Expressed_Genes.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python
"""DEG page: filterable table of differentially expressed genes."""
import pandas as pd
import streamlit as st

from constants import DATA
from utils import fetch_resource

st.set_page_config(layout="wide")

st.markdown("""
# Differentially expressed genes

Genes below have been determined using `sc.tl.rank_genes_groups` where `t-test_overestim_var`
is the default method.
"""
)

# Accumulates pandas.query clauses; joined with " & " at the end.
filter_flag = []
ds = st.sidebar.selectbox(
    "**Select models**",
    DATA.keys(),
    index=None,
    placeholder="Select species",
)

if ds:
    filter_by = st.sidebar.selectbox(
        "**Select by**",
        DATA[ds]["DEGS"].keys(),
        index=None,
        placeholder="Select by",
    )

if ds and filter_by:
    markers = pd.read_feather(fetch_resource(DATA[ds]['DEGS'][filter_by]))

    group = st.sidebar.multiselect(
        "**Cell type**", markers.group.unique(), placeholder="Select group ..."
    )

    genes = st.sidebar.multiselect(
        "**Gene**", markers.gene_symbol.unique(), placeholder="Select genes ..."
    )

    foldchange = st.sidebar.number_input(
        "**Log2 fold-change**",
        value=1,
    )

    pval = st.sidebar.number_input(
        "**Adjusted p-value**",
        value=0.05,
    )

    if group:
        filter_flag.append("group == @group")

    if genes:
        # Documented pandas.query membership form is "<column> in @<list>";
        # the reversed order only works via an undocumented fallback.
        filter_flag.append("gene_symbol in @genes")

    # NOTE: a threshold of exactly 0 is falsy and disables the corresponding
    # filter — this mirrors the original behavior.
    if foldchange:
        filter_flag.append(
            "logfoldchanges > @foldchange"
            if foldchange > 0
            else "logfoldchanges < @foldchange"
        )

    if pval:
        filter_flag.append("pvals_adj < @pval")

    subset = markers.query(" & ".join(filter_flag)) if filter_flag else markers
    st.dataframe(subset, use_container_width=True, height=650)
pages/3_SHAP_features.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python
"""SHAP page: per-cell-type feature importances of the scANVI classifier."""
import pandas as pd
import streamlit as st

from constants import DATA
from utils import fetch_resource

st.set_page_config(layout="wide")

# Raw string: $\mu$ / $\sigma$ contain backslashes that are invalid escape
# sequences in a normal string literal (SyntaxWarning on Python 3.12+).
st.markdown(r"""
# SHAP features

Predicted features (genes) used by the scANVI classifier to determine a cell type. The features
have been determined using [SHAP](https://shap.readthedocs.io/en/latest/).

Each metric for a feature is determined from 10 random bootstraps with replacement.

- weight_mean: $\mu$ of SHAP value
- weight_std: $\sigma$ of SHAP value
- weight_ci_upper: $\mu$ + $\sigma$
- weight_ci_lower: $\mu$ - $\sigma$
- logfoldchanges: Log2fold change from differential expression analysis
- pvals_adj: Adjusted p-value from differential expression analysis
- scores: Estimated score from differential expression analysis
"""
)

ds = st.sidebar.selectbox(
    "**Load dataset**",
    DATA.keys(),
    index=None,
    placeholder="Select dataset ...",
)

if ds:
    data = pd.read_feather(fetch_resource(DATA[ds]["SHAP"]))

    query = st.sidebar.selectbox(
        "**Subset**",
        data.ct.unique().tolist(),
        index=None,
        placeholder="Select cell type ...",
    )

    features = st.sidebar.multiselect(
        "**Genes**", data.feature.unique(), placeholder="Select genes ..."
    )

    # Build pandas.query clauses only for the filters the user actually set.
    filter_condition = []
    if query:
        filter_condition.append("ct == @query")
    if features:
        filter_condition.append("feature in @features")

    if filter_condition:
        data = data.query(" & ".join(filter_condition))

    st.dataframe(data, use_container_width=True, height=650)
pages/4_Download.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python
"""Download page: static links to pipelines, code, raw data and trained models."""
import streamlit as st

from constants import DATA

st.set_page_config(layout="wide")

# f-string so the raw-dataset Zenodo URLs are resolved from the DATA registry.
st.markdown(
    f"""
    # Download

    ## 1. Pipelines

    - Downloading datasets: [nf-core/fetchngs (revision 1.10.0)](https://github.com/nf-core/fetchngs)
    - Aligning datasets: [brickmanlab/scrnaseq (revision: feature/smartseq)](https://github.com/brickmanlab/scrnaseq)
    - **Ensembl Genomes**
        - Mouse: GRCm38 v102
        - Human: GRCh38 v110

    ## 2. Codebase

    - Data analysis: [brickmanlab/proks-salehin-et-al](https://github.com/brickmanlab/proks-salehin-et-al)
    - Web portal: [brickmanlab/preimplantation-portal](https://github.com/brickmanlab/preimplantation-portal)

    ## 3. Raw data

    - [Mouse]({DATA['MOUSE']['RAW_DATASET']})
    - [Human]({DATA['HUMAN']['RAW_DATASET']})

    ## 4. AI models

    Trained models with parameters were uploaded to [Hugging Face](https://huggingface.co/brickmanlab/preimplantation-models).

    ### 4.1 Models

    - [scANVI mouse](https://huggingface.co/brickmanlab/mouse-scanvi)
    - [scANVI human](https://huggingface.co/brickmanlab/human-scanvi)

    """
)
pages/__init__.py ADDED
File without changes
prepare.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ anndata==0.10.9
2
+ plotly==5.24.0
3
+ pyarrow==17.0.0
static/Fig-1.v4.3.png ADDED
utils.py ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import socket
2
+ import urllib.request
3
+ from pathlib import Path
4
+ from typing import Literal
5
+
6
+ import anndata
7
+ import pandas as pd
8
+ import plotly.express as px
9
+ import streamlit as st
10
+
11
+
12
@st.cache_data
def fetch_resource(url: str) -> str:
    """Download a remote dataset once and return its local path.

    The result is additionally memoized by streamlit's ``st.cache_data``,
    and the file itself is cached on disk under ``/tmp``.

    Parameters
    ----------
    url : str
        Zenodo url link

    Returns
    -------
    str
        Path where the file was downloaded to, default /tmp

    Raises
    ------
    ConnectionError
        If the resource cannot be retrieved (DNS failure or URL error).
    """

    filename = f"/tmp/{url.split('/')[-1]}"
    if not Path(filename).exists():
        try:
            urllib.request.urlretrieve(url, filename)
        except (socket.gaierror, urllib.error.URLError) as err:
            # Chain the original exception so the root cause stays visible
            # in the traceback (`raise ... from err`).
            raise ConnectionError(f"could not download {url} due to {err}") from err

    return filename
35
+
36
+
37
def get_embedding(adata: anndata.AnnData, key: str) -> pd.DataFrame:
    """
    Return the first two coordinates of a stored embedding as a DataFrame.

    Parameters
    ----------
    adata : anndata.AnnData
        scrna-seq dataset
    key : str
        Dimension reduction key, usually starts with X_

    Returns
    -------
    pd.DataFrame
        Two-column frame named ``<BASIS>_1`` / ``<BASIS>_2``

    Raises
    ------
    ValueError
        Fail if reduction key doesn't exist
    """
    if key not in adata.obsm.keys():
        raise ValueError(f"Reduction key: {key} not available")

    # Strip the leading "X_" and upper-case the basis name for the columns.
    basis = key[2:].upper()
    coords = adata.obsm[key][:, :2]
    return pd.DataFrame(coords, columns=(f"{basis}_1", f"{basis}_2"))
63
+
64
+
65
def plot_sc_embedding(
    adata: anndata.AnnData,
    reduction_key: str,
    group_by: str = None,
    feature: str = None,
    layer: str = None,
    ax = None,
):
    """
    Scatter-plot a 2D embedding of the dataset, colored by metadata or a gene.

    Parameters
    ----------
    adata : anndata.AnnData
        scrna-seq dataset
    reduction_key : str
        Reduced space key (must exist in ``adata.obsm``)
    group_by : str
        Metadata column used to color cells
    feature : str
        Gene to color by; takes precedence over ``group_by`` coloring
    layer : str
        Truthy selects the "scVI_normalized" layer, otherwise raw counts
    ax : _type_
        Streamlit container to draw into; falls back to the page itself

    NOTE(review): at least one of ``group_by``/``feature`` must be given,
    otherwise ``kwargs`` is never defined and a NameError is raised — confirm
    callers always pass one.
    """
    embeddings = get_embedding(adata, reduction_key)

    if group_by:
        embeddings[group_by] = adata.obs[group_by].values
        # Sort so categorical legend entries appear in a stable order.
        embeddings = embeddings.sort_values(by=group_by)

        # color_uns_key = f"{group_by}_colors"

        kwargs = {"color": embeddings[group_by].values.tolist()}
        if adata.obs[group_by].dtype == "category":
            # Categorical coloring uses plotly's default discrete palette.
            ...
        else:
            kwargs["color_continuous_scale"] = px.colors.sequential.Viridis

    if feature:
        # Truthy `layer` selects denoised (scVI) expression; otherwise raw.
        X = (
            adata[:, feature].layers["scVI_normalized"].toarray()
            if layer
            else adata.raw[:, feature].X.toarray()
        )
        embeddings[feature] = X.ravel()
        # Intentionally replaces any group_by kwargs: feature coloring wins.
        kwargs = {
            "color": embeddings[feature].values.tolist(),
            # "title": feature,
            "color_continuous_scale": px.colors.sequential.Viridis,
        }

    # Draw into the provided streamlit container, or the page if none given.
    ax_ = ax if ax else st
    ax_.plotly_chart(
        px.scatter(
            data_frame=embeddings,
            x=embeddings.columns[0],
            y=embeddings.columns[1],
            **kwargs,
        ),
        use_container_width=True,
        # .update_xaxes(showgrid=False)
        # .update_yaxes(showgrid=False, zeroline=False)
    )
128
+
129
+
130
def plot_feature(
    adata: anndata.AnnData,
    feature: str,
    group_by: str,
    kind: Literal["box"] = "box",
    ax = None
):
    """Plot raw expression of one gene, grouped by a metadata column.

    Parameters
    ----------
    adata : anndata.AnnData
        Dataset
    feature : str
        Gene name
    group_by : str
        Metadata column
    kind : str
        Type of plot (only "box" is supported)
    ax : _type_, optional
        Streamlit container to draw into, by default None

    Raises
    ------
    ValueError
        If ``kind`` is not a supported plot type.
    """

    # Reject unsupported plot kinds up-front.
    if kind != "box":
        raise ValueError(f"Provided kind: {kind} not supported")

    # Raw expression of the gene, one row per cell, grouped by metadata.
    frame = pd.DataFrame(adata.raw[:, feature].X.toarray(), columns=[feature])
    frame[group_by] = adata.obs[group_by].values
    frame = frame.sort_values(by=group_by)

    fig = px.box(frame, x=group_by, y=feature, color=group_by)

    # Draw into the provided container, or the page itself if none given.
    target = ax if ax else st
    target.plotly_chart(fig, use_container_width=True)