Spaces:
Running
Running
from lynxkite.core.ops import op | |
import pandas as pd | |
from chembl_webresource_client.new_client import new_client | |
from rdkit import Chem | |
def similarity_to_dataframe(*, smiles: str, cutoff: int = 70) -> pd.DataFrame:
    """
    Run a ChEMBL similarity search and return the hits as a pandas DataFrame.

    If the SMILES is missing, blank, not a string, or cannot be parsed, or if
    an error occurs during the query, prints a message and returns an empty
    DataFrame with the expected columns.

    Parameters
    ----------
    smiles : str
        The SMILES string to search on.
    cutoff : int
        The minimum Tanimoto similarity (0–100). Passed unchanged to the
        ChEMBL client as the ``similarity`` filter value.

    Returns
    -------
    pd.DataFrame
        Columns: 'molecule_chembl_id', 'similarity'
    """
    # Prepare empty frame to return on error
    cols = ["molecule_chembl_id", "similarity"]
    empty_df = pd.DataFrame(columns=cols)

    # 1) Quick SMILES validation
    # Reject non-strings and blank strings before handing the value to RDKit:
    # a non-string would raise outside the try block below, and a blank
    # string is not reliably reported as None by the parser.
    if not isinstance(smiles, str) or not smiles.strip():
        print("Please input a correct SMILES string.")
        return empty_df
    if Chem.MolFromSmiles(smiles) is None:
        print("Please input a correct SMILES string.")
        return empty_df

    try:
        # 2) Do the ChEMBL API call
        similarity = new_client.similarity
        results = similarity.filter(smiles=smiles, similarity=cutoff).only(cols)

        # 3) Build DataFrame (list() forces the query to be evaluated here,
        #    inside the try block, so API failures are caught below)
        data = list(results)
        df = pd.DataFrame.from_records(data, columns=cols)

        # 4) Inform if no hits
        if df.empty:
            print("No hits found for that SMILES at the given cutoff.")
        return df
    except Exception as e:
        # Catch network errors, unexpected API replies, etc.
        print("An error occurred during the similarity search.")
        print(" Details:", str(e))
        return empty_df
def _chembl_structures(
    df: pd.DataFrame, *, id_col: str = "molecule_chembl_id", timeout: int = 5
) -> pd.DataFrame:
    """
    Return a copy of `df` extended with structure fields looked up in ChEMBL.

    Each value in `df[id_col]` is queried individually through the ChEMBL
    molecule endpoint. Rows whose lookup fails or yields no structure keep
    None in the new columns, and a warning/error line is printed for them.

    Parameters
    ----------
    df : pd.DataFrame
        Input DataFrame; must contain `id_col`. It is not modified.
    id_col : str
        Name of the column in `df` that holds ChEMBL IDs (e.g. 'CHEMBL1234').
    timeout : int
        Accepted for interface compatibility; this function does not use it.

    Returns
    -------
    pd.DataFrame
        A new DataFrame with three additional columns:
        - smiles
        - standard_inchi
        - standard_inchi_key
    """
    # Output column -> key inside the record's "molecule_structures" dict.
    field_map = {
        "smiles": "canonical_smiles",
        "standard_inchi": "standard_inchi",
        "standard_inchi_key": "standard_inchi_key",
    }

    # Work on a copy so the caller's frame stays untouched.
    enriched = df.copy()
    for column in field_map:
        enriched[column] = None

    molecules = new_client.molecule
    for row_label, chembl_id in enriched[id_col].items():
        try:
            query = molecules.filter(chembl_id=chembl_id).only(
                ["molecule_chembl_id", "molecule_structures"]
            )
            # The query result is iterable; only its first record is used.
            record = next(iter(query), None)
            structures = record.get("molecule_structures") if record else None
            if not structures:
                print(f"[Warning] No structure found for {chembl_id}")
                continue
            for column, source_key in field_map.items():
                enriched.at[row_label, column] = structures.get(source_key)
        except Exception as e:
            # Best effort: report the failure and move on to the next ID.
            print(f"[Error] Lookup failed for {chembl_id}: {e!s}")
    return enriched
def fetch_chembl_drugs(
    *, first_approval: int = 2000, development_phase: int = None
) -> pd.DataFrame:
    """
    Fetch drugs from ChEMBL matching the given approval year and development
    phase, returning key fields as a DataFrame.

    Invalid arguments and query failures do not raise: a message is printed
    and an empty DataFrame with the expected columns is returned.

    Parameters
    ----------
    first_approval : int, optional
        Only include drugs first approved in or after this year (default=2000).
        If None, do not filter by approval year.
    development_phase : int, optional
        Only include drugs in this development phase (e.g. 2, 3, 4).
        If None, do not filter by phase.

    Returns
    -------
    pd.DataFrame
        Columns:
        - development_phase
        - first_approval
        - molecule_chembl_id
        - synonyms
        - usan_stem
        - usan_stem_definition
        - usan_year
        If no results (or on error), returns an empty DataFrame with these columns.
    """
    cols = [
        "development_phase",
        "first_approval",
        "molecule_chembl_id",
        "synonyms",
        "usan_stem",
        "usan_stem_definition",
        "usan_year",
    ]
    empty_df = pd.DataFrame(columns=cols)

    # Validate inputs. bool is a subclass of int, so True/False would pass a
    # plain isinstance(..., int) check and be sent to the API as a year or
    # phase; reject them explicitly.
    if first_approval is not None and (
        isinstance(first_approval, bool) or not isinstance(first_approval, int)
    ):
        print("Error: first_approval must be an integer year.")
        return empty_df
    if development_phase is not None and (
        isinstance(development_phase, bool)
        or not isinstance(development_phase, int)
    ):
        print("Error: development_phase must be an integer.")
        return empty_df

    try:
        drug = new_client.drug
        # apply approval-year filter
        if first_approval is not None:
            drug = drug.filter(first_approval__gte=first_approval)
        # apply development-phase filter
        if development_phase is not None:
            drug = drug.filter(development_phase=development_phase)
        # NOTE: no USAN stem filter is applied; the usan_* fields are only
        # returned as columns.
        res = drug.only(cols)
        df = pd.DataFrame(res, columns=cols)
        if df.empty:
            print("No drugs found for those filters.")
        return df
    except Exception as e:
        # Network errors, unexpected API replies, etc.
        print("An error occurred during the ChEMBL query:")
        print(" ", str(e))
        return empty_df
def fetch_chembl_bioactivity(*, uniprot_id: str = "Q9NZQ7") -> pd.DataFrame:
    """
    Fetch bioactivity data from ChEMBL for a given UniProt ID.

    The UniProt accession is resolved to ChEMBL targets; only the first
    matching target is used. Activities for that target are restricted to
    the standard types IC50, Ki and Kd.

    Parameters
    ----------
    uniprot_id : str
        UniProt accession to look up (default "Q9NZQ7").

    Returns
    -------
    pd.DataFrame
        The activity records returned by ChEMBL for the resolved target.
        An empty DataFrame (no columns) is returned when `uniprot_id` is
        blank or not a string, or when no target matches it, so the result
        is always a DataFrame.

    Notes
    -----
    Errors raised by the ChEMBL client are not caught here and propagate to
    the caller.
    """
    # A blank accession would leave the target query without a meaningful
    # filter value, and the first target returned would then be used.
    if not isinstance(uniprot_id, str) or not uniprot_id.strip():
        print("Please input a valid UniProt ID.")
        return pd.DataFrame()

    targets = list(
        new_client.target.filter(target_components__accession=uniprot_id)
    )
    if not targets:
        print(f"No ChEMBL target found for UniProt ID {uniprot_id}.")
        return pd.DataFrame()

    # Only the first matching target is queried for activities.
    target_chembl_id = targets[0]["target_chembl_id"]
    activities = new_client.activity.filter(
        target_chembl_id=target_chembl_id, standard_type__in=["IC50", "Ki", "Kd"]
    )
    return pd.DataFrame(activities)