Spaces:
Running
Running
File size: 6,711 Bytes
4cf35ad |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 |
from lynxkite.core.ops import op
import pandas as pd
from chembl_webresource_client.new_client import new_client
from rdkit import Chem
@op("LynxKite Graph Analytics", "chembl sim search")
def similarity_to_dataframe(*, smiles: str, cutoff: int = 70) -> pd.DataFrame:
    """
    Run a ChEMBL similarity search and return the hits as a pandas DataFrame.

    If the SMILES is missing, blank, or cannot be parsed, or if the query
    itself fails, a message is printed and an empty DataFrame with the
    expected columns is returned instead of raising.

    Parameters
    ----------
    smiles : str
        The SMILES string to search on.
    cutoff : int
        The minimum Tanimoto similarity (0–100).

    Returns
    -------
    pd.DataFrame
        Columns: 'molecule_chembl_id', 'similarity'
    """
    # Prepare empty frame to return on error
    cols = ["molecule_chembl_id", "similarity"]
    empty_df = pd.DataFrame(columns=cols)

    # 1) SMILES validation.
    # Reject non-strings and blank input up front: RDKit would raise on a
    # non-string argument, and it parses an empty string into a molecule with
    # zero atoms (not None), which would otherwise slip through to the API.
    if not isinstance(smiles, str) or not smiles.strip():
        print("Please input a correct SMILES string.")
        return empty_df

    mol = Chem.MolFromSmiles(smiles)
    if mol is None or mol.GetNumAtoms() == 0:
        print("Please input a correct SMILES string.")
        return empty_df

    try:
        # 2) Do the ChEMBL API call
        similarity = new_client.similarity
        results = similarity.filter(smiles=smiles, similarity=cutoff).only(cols)

        # 3) Build DataFrame; list() forces the query to be evaluated here,
        #    inside the try block, so failures are caught below.
        data = list(results)
        df = pd.DataFrame.from_records(data, columns=cols)

        # 4) Inform if no hits
        if df.empty:
            print("No hits found for that SMILES at the given cutoff.")
        return df
    except Exception as e:
        # Catch network errors, unexpected API replies, etc. This op is a
        # best-effort boundary: report the problem and hand back an empty frame.
        print("An error occurred during the similarity search.")
        print(" Details:", str(e))
        return empty_df
@op("LynxKite Graph Analytics", "chembl structure")
def _chembl_structures(
    df: pd.DataFrame, *, id_col: str = "molecule_chembl_id", timeout: int = 5
) -> pd.DataFrame:
    """
    Given a DataFrame with a column of ChEMBL molecule IDs, append
    canonical SMILES, standard InChI, and standard InChIKey.

    Each distinct ID is looked up once; rows whose ID is missing (None/NaN),
    has no structure record, or whose lookup fails get None in all three
    new columns.

    Parameters
    ----------
    df : pd.DataFrame
        Input DataFrame; must contain `id_col`.
    id_col : str
        Name of the column in `df` that holds ChEMBL IDs (e.g. 'CHEMBL1234').
    timeout : int
        How many seconds to wait for the API (not currently used by chembl client,
        but reserved for future enhancements or custom wrappers).

    Returns
    -------
    pd.DataFrame
        A new DataFrame with three additional columns:
        - smiles
        - standard_inchi
        - standard_inchi_key

    Raises
    ------
    KeyError
        If `id_col` is not a column of `df`.
    """
    # make a copy so we don't modify in-place
    out = df.copy()
    # Read the IDs before any column is (re)written below.
    ids = out[id_col].tolist()

    mol_client = new_client.molecule
    structure_keys = ("canonical_smiles", "standard_inchi", "standard_inchi_key")
    not_found = (None, None, None)

    # One API round-trip per distinct ID; repeated IDs reuse the cached result.
    cache = {}
    # Results are collected by row position rather than written through the
    # index label, so a non-unique index cannot make one row's result
    # overwrite another row's.
    rows = []
    for chembl_id in ids:
        # Nothing to look up for a missing ID.
        if pd.api.types.is_scalar(chembl_id) and pd.isna(chembl_id):
            rows.append(not_found)
            continue

        if chembl_id not in cache:
            found = not_found
            try:
                # query ChEMBL for this molecule
                res = mol_client.filter(chembl_id=chembl_id).only(
                    ["molecule_chembl_id", "molecule_structures"]
                )
                # filter() returns an iterable; grab first record if exists
                rec = next(iter(res), None)
                if rec and rec.get("molecule_structures"):
                    struct = rec["molecule_structures"]
                    found = tuple(struct.get(key) for key in structure_keys)
                else:
                    print(f"[Warning] No structure found for {chembl_id}")
            except Exception as e:
                # Best effort: a failed lookup leaves this ID's fields as None.
                print(f"[Error] Lookup failed for {chembl_id}: {e!s}")
            cache[chembl_id] = found

        rows.append(cache[chembl_id])

    # Attach the new columns positionally; object dtype keeps None as None.
    for position, column in enumerate(("smiles", "standard_inchi", "standard_inchi_key")):
        values = [row[position] for row in rows]
        out[column] = pd.Series(values, dtype=object).to_numpy()
    return out
@op("LynxKite Graph Analytics", "get chembl drugs")
def fetch_chembl_drugs(
    *, first_approval: int = 2000, development_phase: int = None
) -> pd.DataFrame:
    """
    Fetch drugs from ChEMBL filtered by first-approval year and, optionally,
    development phase, returning key fields as a DataFrame.

    Parameters
    ----------
    first_approval : int, optional
        Only include drugs first approved in or after this year (default=2000).
        If None, do not filter by approval year.
    development_phase : int, optional
        Only include drugs in this development phase (e.g. 2, 3, 4).
        If None, do not filter by phase.

    Returns
    -------
    pd.DataFrame
        Columns:
        - development_phase
        - first_approval
        - molecule_chembl_id
        - synonyms
        - usan_stem
        - usan_stem_definition
        - usan_year
        If no results (or on error), returns an empty DataFrame with these columns.
    """
    cols = [
        "development_phase",
        "first_approval",
        "molecule_chembl_id",
        "synonyms",
        "usan_stem",
        "usan_stem_definition",
        "usan_year",
    ]
    empty_df = pd.DataFrame(columns=cols)

    # Validate inputs
    if first_approval is not None and not isinstance(first_approval, int):
        print("Error: first_approval must be an integer year.")
        return empty_df
    if development_phase is not None and not isinstance(development_phase, int):
        print("Error: development_phase must be an integer.")
        return empty_df

    try:
        drug = new_client.drug
        # apply approval-year filter
        if first_approval is not None:
            drug = drug.filter(first_approval__gte=first_approval)
        # apply development-phase filter
        if development_phase is not None:
            drug = drug.filter(development_phase=development_phase)

        # Materialise the query here so that any failure while fetching the
        # records is raised inside this try block and handled below.
        records = list(drug.only(cols))
        df = pd.DataFrame(records, columns=cols)
        if df.empty:
            print("No drugs found for those filters.")
        return df
    except Exception as e:
        # Best-effort boundary: report the failure and return the empty frame.
        print("An error occurred during the ChEMBL query:")
        print(" ", str(e))
        return empty_df
@op("LynxKite Graph Analytics", "get bioactivity from uniprot")
def fetch_chembl_bioactivity(*, uniprot_id: str = "Q9NZQ7") -> pd.DataFrame:
    """
    Fetch bioactivity data from ChEMBL for a given UniProt ID.

    The UniProt accession is resolved to ChEMBL targets; the activities of
    the first matching target whose standard type is IC50, Ki or Kd are
    returned.

    Parameters
    ----------
    uniprot_id : str
        UniProt accession used to look up the ChEMBL target.

    Returns
    -------
    pd.DataFrame
        One row per activity record. An empty DataFrame is returned when no
        target matches the accession, so the result is always a DataFrame.
    """
    targets = list(
        new_client.target.filter(target_components__accession=uniprot_id)
    )
    if not targets:
        # Return an empty frame (not a list) so the return type is uniform.
        print(f"No ChEMBL target found for UniProt ID {uniprot_id}.")
        return pd.DataFrame()

    # Only the first matching target is used.
    # NOTE(review): an accession may map to several targets — confirm that
    # taking the first one is the intended choice.
    target_chembl_id = targets[0]["target_chembl_id"]
    activities = new_client.activity.filter(
        target_chembl_id=target_chembl_id, standard_type__in=["IC50", "Ki", "Kd"]
    )
    return pd.DataFrame(activities)
|