Spaces:

maomlab
/

CryptoCEN-ExpressionScatter

Running

App Files Files Community

CryptoCEN-ExpressionScatter / app.py

maom

fix lookup

1109bbe verified over 1 year ago

raw

history blame

3.6 kB

	import datasets
	import streamlit as st
	import numpy as np
	import pandas as pd
	import altair as alt

	# Use the wide page layout.
	st.set_page_config(layout="wide")

	# Introductory Markdown rendered at the top of the page: what CryptoCEN is,
	# how to cite it, where the code and data live, and how to use this app.
	INTRO_MARKDOWN = """
	# CryptoCEN Expression Scatter
	CryptoCEN is a co-expression network for Cryptococcus neoformans built on 1,524 RNA-seq runs across 34 studies.
	A pair of genes are said to be co-expressed when their expression is correlated across different conditions and
	is often a marker for genes to be involved in similar processes.

	To Cite:
	MJ O'Meara, JR Rapala, CB Nichols, C Alexandre, B Billmyre, JL Steenwyk, A Alspaugh,
	TR O'Meara CryptoCEN: A Co-Expression Network for Cryptococcus neoformans reveals
	novel proteins involved in DNA damage repair
	* Code available at https://github.com/maomlab/CalCEN/tree/master/vignettes/CryptoCEN
	* Full network and dataset: https://huggingface.co/datasets/maomlab/CryptoCEN

	## Plot scatter plot expression for a pair of genes across studies.
	Put in the ``CNAG_#####`` gene_id for two genes.
	"""
	st.markdown(INTRO_MARKDOWN)

	@st.cache_data
	def _load_cryptocen_table(split_name, data_file):
	    """Load one file of the ``maomlab/CryptoCEN`` dataset as a DataFrame.

	    Streamlit re-executes this script from the top on every widget
	    interaction. The cache decorator lets each table be loaded once and
	    reused on later reruns instead of being re-read and re-converted.

	    Args:
	        split_name: Key the file is registered under in ``data_files`` and
	            then selected by from the returned dataset dictionary.
	        data_file: Path of the file inside the dataset repository.

	    Returns:
	        A pandas DataFrame with the contents of ``data_file``.
	    """
	    tables = datasets.load_dataset(
	        path="maomlab/CryptoCEN",
	        data_files={split_name: data_file})
	    return tables[split_name].to_pandas()


	# Annotation table; its "gene_id" column is matched against the user input.
	h99_transcript_annotations = _load_cryptocen_table(
	    "h99_transcript_annotations", "h99_transcript_annotations.tsv")

	# Run metadata; supplies "run_accession" and "study_accession" for the plot.
	# NOTE(review): this is the only file read from the "Data/" sub-directory --
	# confirm the path against the dataset repository layout.
	estimated_expression_meta = _load_cryptocen_table(
	    "estimated_expression_meta", "Data/estimated_expression_meta.tsv")

	# Expression matrix; its columns are used as run accessions further down.
	estimated_expression = _load_cryptocen_table(
	    "estimated_expression", "estimated_expression_matrix.parquet")

	# DEBUG: report the matrix dimensions on standard output.
	print(f"estimated_expression shape: {estimated_expression.shape}")

	# Three columns at 20% / 20% / 60% width; the two narrow ones hold the text
	# inputs and nothing is placed in the one named ``padding``.
	col1, col2, padding = st.columns(spec=[0.2, 0.2, 0.6])

	# Identifier of the first gene, plotted on the x axis.
	gene_id_1 = col1.text_input(
	    label="Gene ID 1",
	    value="CNAG_04365",
	    max_chars=10,
	    help="CNAG Gene ID e.g. CNAG_04365",
	)

	# Identifier of the second gene, plotted on the y axis.
	gene_id_2 = col2.text_input(
	    label="Gene ID 2",
	    value="CNAG_04222",
	    max_chars=10,
	    help="CNAG Gene ID e.g. CNAG_04222",
	)

	def _gene_row_mask(gene_id: str, label: str):
	    """Locate the annotation rows whose ``gene_id`` equals the user input.

	    Selecting with a boolean mask never raises for an unknown identifier; it
	    just selects nothing. The mask is therefore tested explicitly, so an
	    unknown gene produces a readable error instead of an ``IndexError``
	    when the first matching row is taken later.

	    Args:
	        gene_id: Text typed by the user; surrounding whitespace is ignored.
	        label: Name of the input widget, used in the error message.

	    Returns:
	        A boolean Series over ``h99_transcript_annotations`` rows, or
	        ``None`` (after showing an error on the page) when nothing matches.
	    """
	    matches = h99_transcript_annotations["gene_id"] == gene_id.strip()
	    if matches.any():
	        return matches
	    st.error(
	        f"Unable to locate cnag_id for {label}: {gene_id}, "
	        "it should be of the form 'CNAG_#####'")
	    return None


	# Check the user input. Both identifiers are validated before stopping so
	# that the user sees every problem at once.
	gene_rows_1 = _gene_row_mask(gene_id_1, "Gene ID 1")
	gene_rows_2 = _gene_row_mask(gene_id_2, "Gene ID 2")
	if gene_rows_1 is None or gene_rows_2 is None:
	    # Halt this script run; nothing below can be computed without both genes.
	    st.stop()

	# The annotation mask is applied directly to the expression matrix, i.e. row
	# i of the matrix is taken to describe annotation row i. Only the first
	# matching row is used. Values are plotted as log10(expression + 1).
	# NOTE(review): the metadata columns are attached as-is, which presumes the
	# metadata rows are in the same order as the matrix columns; the DEBUG
	# print below counts how many run accessions actually agree.
	chart_data = pd.DataFrame({
	    "expression_1": np.log10(estimated_expression.loc[gene_rows_1].to_numpy()[0] + 1),
	    "expression_2": np.log10(estimated_expression.loc[gene_rows_2].to_numpy()[0] + 1),
	    "run_accession": estimated_expression.columns,
	    "run_accession_meta": estimated_expression_meta["run_accession"],
	    "study_accession": estimated_expression_meta["study_accession"]})

	# DEBUG
	print(f"run_ids are equal: {sum(chart_data['run_accession'] == chart_data['run_accession_meta'])}")

	# Scatter plot with one circle per row of ``chart_data``: the first gene on
	# the x axis, the second on the y axis, coloured by study, with the run and
	# study accessions shown in the tooltip.
	chart = (
	    alt.Chart(chart_data)
	    .mark_circle()
	    .encode(
	        x="expression_1",
	        y="expression_2",
	        # Encoding channels take a field name or a channel object; a fixed
	        # mark size has to be wrapped in alt.value() rather than passed as
	        # a bare number.
	        size=alt.value(5),
	        color="study_accession",
	        tooltip=["run_accession", "study_accession"]))

	# Render the chart stretched to the width of its container.
	st.altair_chart(
	    chart,
	    use_container_width=True)