Spaces:

lynx-analytics
/

lynxkite

Running

App Files Files Community

lynxkite / examples /Cheminformatics /chembl_tools.py

abhik1368

New tools and filters for cheminfo

4cf35ad unverified about 2 months ago

raw

history blame contribute delete

6.71 kB

	from lynxkite.core.ops import op
	import pandas as pd
	from chembl_webresource_client.new_client import new_client
	from rdkit import Chem


	@op("LynxKite Graph Analytics", "chembl sim search")
	def similarity_to_dataframe(*, smiles: str, cutoff: int = 70) -> pd.DataFrame:
	    """
	    Run a ChEMBL similarity search and return the hits as a pandas DataFrame.

	    Bad input and API failures are not raised to the caller: a message is
	    printed and an empty DataFrame with the expected columns is returned.

	    Parameters
	    ----------
	    smiles : str
	        The SMILES string to search on.
	    cutoff : int
	        The minimum Tanimoto similarity, as a percentage on a 0–100 scale.
	        NOTE(review): the ChEMBL service may only accept a narrower range
	        (reportedly 40–100); a rejected value surfaces through the error
	        path below -- confirm against the service documentation.

	    Returns
	    -------
	    pd.DataFrame
	        Columns: 'molecule_chembl_id', 'similarity'
	    """
	    # Prepare empty frame to return on error
	    cols = ["molecule_chembl_id", "similarity"]
	    empty_df = pd.DataFrame(columns=cols)

	    # 1) Quick SMILES validation.
	    # RDKit turns an empty string into an empty (non-None) molecule and
	    # rejects non-string arguments with an exception, so both cases have
	    # to be caught before MolFromSmiles is consulted.
	    if not isinstance(smiles, str) or not smiles.strip():
	        print("Please input a correct SMILES string.")
	        return empty_df
	    if Chem.MolFromSmiles(smiles) is None:
	        print("Please input a correct SMILES string.")
	        return empty_df

	    try:
	        # 2) Do the ChEMBL API call
	        similarity = new_client.similarity
	        results = similarity.filter(smiles=smiles, similarity=cutoff).only(cols)

	        # 3) Build DataFrame (list() forces the lazy query to run here,
	        #    inside the try block)
	        data = list(results)
	        df = pd.DataFrame.from_records(data, columns=cols)

	        # 4) Inform if no hits
	        if df.empty:
	            print("No hits found for that SMILES at the given cutoff.")
	        return df

	    except Exception as e:
	        # Catch network errors, unexpected API replies, etc. This op is a
	        # best-effort boundary, so the failure is reported, not re-raised.
	        print("An error occurred during the similarity search.")
	        print(" Details:", str(e))
	        return empty_df


	@op("LynxKite Graph Analytics", "chembl structure")
	def _chembl_structures(
	    df: pd.DataFrame, *, id_col: str = "molecule_chembl_id", timeout: int = 5
	) -> pd.DataFrame:
	    """
	    Given a DataFrame with a column of ChEMBL molecule IDs, append
	    canonical SMILES, standard InChI, and standard InChIKey.

	    Each distinct, non-null ID is looked up once. IDs that are missing,
	    have no structure record, or whose lookup fails get None in the new
	    columns; a warning or error line is printed for the latter two cases.

	    Parameters
	    ----------
	    df : pd.DataFrame
	        Input DataFrame; must contain `id_col`. It is not modified.
	    id_col : str
	        Name of the column in `df` that holds ChEMBL IDs (e.g. 'CHEMBL1234').
	    timeout : int
	        Accepted for interface compatibility; not used by this function.

	    Returns
	    -------
	    pd.DataFrame
	        A copy of `df` with three additional columns:
	        - smiles
	        - standard_inchi
	        - standard_inchi_key

	    Raises
	    ------
	    KeyError
	        If `id_col` is not a column of `df`.
	    """
	    if id_col not in df.columns:
	        raise KeyError(f"Column {id_col!r} not found in the input DataFrame")

	    # make a copy so we don't modify in-place
	    out = df.copy()
	    mol_client = new_client.molecule

	    # Query each distinct ID only once; repeated IDs would otherwise cost
	    # one network round-trip per row.
	    structures = {}
	    for chembl_id in out[id_col].dropna().unique():
	        try:
	            res = mol_client.filter(chembl_id=chembl_id).only(
	                ["molecule_chembl_id", "molecule_structures"]
	            )
	            # filter() returns an iterable; grab first record if exists
	            rec = next(iter(res), None)
	            if rec and rec.get("molecule_structures"):
	                structures[chembl_id] = rec["molecule_structures"]
	            else:
	                print(f"[Warning] No structure found for {chembl_id}")
	        except Exception as e:
	            # Best effort: one failed lookup must not abort the others.
	            print(f"[Error] Lookup failed for {chembl_id}: {e!s}")

	    # Fill the new columns positionally (row order), so the result is
	    # correct even when the DataFrame index contains duplicate labels.
	    ids = out[id_col].tolist()
	    for column, key in (
	        ("smiles", "canonical_smiles"),
	        ("standard_inchi", "standard_inchi"),
	        ("standard_inchi_key", "standard_inchi_key"),
	    ):
	        out[column] = [structures.get(cid, {}).get(key) for cid in ids]

	    return out


	@op("LynxKite Graph Analytics", "get chembl drugs")
	def fetch_chembl_drugs(
	    *, first_approval: int = 2000, development_phase: int = None
	) -> pd.DataFrame:
	    """
	    Fetch drugs from ChEMBL matching the given approval year and
	    development phase, returning key fields as a DataFrame.

	    Invalid arguments and API failures are not raised to the caller: a
	    message is printed and an empty DataFrame is returned instead.

	    Parameters
	    ----------
	    first_approval : int, optional
	        Only include drugs first approved in or after this year
	        (default=2000). If None, do not filter by approval year.
	    development_phase : int, optional
	        Only include drugs in this development phase (e.g. 2, 3, 4).
	        If None, do not filter by phase.

	    Returns
	    -------
	    pd.DataFrame
	        Columns:
	        - development_phase
	        - first_approval
	        - molecule_chembl_id
	        - synonyms
	        - usan_stem
	        - usan_stem_definition
	        - usan_year

	        If no results (or on error), returns an empty DataFrame with these columns.
	    """
	    cols = [
	        "development_phase",
	        "first_approval",
	        "molecule_chembl_id",
	        "synonyms",
	        "usan_stem",
	        "usan_stem_definition",
	        "usan_year",
	    ]
	    empty_df = pd.DataFrame(columns=cols)

	    # Validate inputs. bool is a subclass of int, so it is rejected
	    # explicitly; otherwise True/False would be sent as year/phase 1/0.
	    if first_approval is not None and (
	        isinstance(first_approval, bool) or not isinstance(first_approval, int)
	    ):
	        print("Error: first_approval must be an integer year.")
	        return empty_df
	    if development_phase is not None and (
	        isinstance(development_phase, bool)
	        or not isinstance(development_phase, int)
	    ):
	        print("Error: development_phase must be an integer.")
	        return empty_df

	    try:
	        drug = new_client.drug

	        # apply approval-year filter
	        if first_approval is not None:
	            drug = drug.filter(first_approval__gte=first_approval)
	        # apply development-phase filter
	        if development_phase is not None:
	            drug = drug.filter(development_phase=development_phase)
	        # No USAN stem filter is applied; the usan_* fields are only
	        # returned as columns.

	        # list() forces the lazy query to run here, inside the try block.
	        records = list(drug.only(cols))
	        df = pd.DataFrame(records, columns=cols)

	        if df.empty:
	            print("No drugs found for those filters.")
	        return df

	    except Exception as e:
	        # Best-effort boundary: report the failure instead of re-raising.
	        print("An error occurred during the ChEMBL query:")
	        print(" ", str(e))
	        return empty_df


	@op("LynxKite Graph Analytics", "get bioactivity from uniprot")
	def fetch_chembl_bioactivity(*, uniprot_id: str = "Q9NZQ7") -> pd.DataFrame:
	    """
	    Fetch bioactivity data from ChEMBL for a given UniProt ID.

	    The UniProt accession is resolved to ChEMBL targets; only the first
	    matching target is used. Activities of standard type IC50, Ki or Kd
	    recorded against that target are returned.

	    Parameters
	    ----------
	    uniprot_id : str
	        UniProt accession to look up (default "Q9NZQ7").

	    Returns
	    -------
	    pd.DataFrame
	        One row per activity record as returned by the ChEMBL client.
	        An empty DataFrame is returned when no target matches the
	        accession, so the return type is the same on every path.

	    Notes
	    -----
	    Errors raised by the ChEMBL client (e.g. network failures) are not
	    caught here and propagate to the caller.
	    """
	    target = new_client.target.filter(target_components__accession=uniprot_id)
	    targets = list(target)
	    if not targets:
	        # Same type as the success path, so downstream table handling
	        # does not have to special-case a bare list.
	        return pd.DataFrame()

	    target_chembl_id = targets[0]["target_chembl_id"]
	    activities = new_client.activity.filter(
	        target_chembl_id=target_chembl_id, standard_type__in=["IC50", "Ki", "Kd"]
	    )
	    return pd.DataFrame(activities)