File size: 6,711 Bytes
4cf35ad
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
from lynxkite.core.ops import op
import pandas as pd
from chembl_webresource_client.new_client import new_client
from rdkit import Chem


@op("LynxKite Graph Analytics", "chembl sim search")
def similarity_to_dataframe(*, smiles: str, cutoff: int = 70) -> pd.DataFrame:
    """
    Run a ChEMBL similarity search and return the hits as a pandas DataFrame.

    The function never raises for bad input or a failed lookup: it prints a
    message and returns an empty DataFrame with the expected columns instead.

    Parameters
    ----------
    smiles : str
        The SMILES string to search on. Blank strings, non-string values and
        strings RDKit cannot parse are rejected before any API call is made.
    cutoff : int
        The minimum similarity, as a percentage; passed unchanged to the
        ChEMBL ``similarity`` filter.
        NOTE(review): ChEMBL appears to accept only 40-100 here -- confirm
        against the web-service documentation.

    Returns
    -------
    pd.DataFrame
        Columns: 'molecule_chembl_id', 'similarity'
    """
    # Prepare empty frame to return on error
    cols = ["molecule_chembl_id", "similarity"]
    empty_df = pd.DataFrame(columns=cols)

    # 1) Quick SMILES validation.
    # The blank check is needed because RDKit turns an empty string into an
    # empty molecule (not None), which would otherwise slip through; the type
    # check keeps a non-string from raising inside RDKit, outside the `try`.
    if (
        not isinstance(smiles, str)
        or not smiles.strip()
        or Chem.MolFromSmiles(smiles) is None
    ):
        print("Please input a correct SMILES string.")
        return empty_df

    try:
        # 2) Do the ChEMBL API call
        similarity = new_client.similarity
        results = similarity.filter(smiles=smiles, similarity=cutoff).only(cols)

        # 3) Build DataFrame (list() forces the lazy query to be fetched here,
        #    inside the try, so network failures are caught below)
        data = list(results)
        df = pd.DataFrame.from_records(data, columns=cols)

        # 4) Inform if no hits
        if df.empty:
            print("No hits found for that SMILES at the given cutoff.")
        return df

    except Exception as e:
        # Catch network errors, unexpected API replies, etc. This op is a
        # top-level boundary, so a broad catch with a printed report is
        # deliberate best-effort behaviour.
        print("An error occurred during the similarity search.")
        print("  Details:", str(e))
        return empty_df


@op("LynxKite Graph Analytics", "chembl structure")
def _chembl_structures(
    df: pd.DataFrame, *, id_col: str = "molecule_chembl_id", timeout: int = 5
) -> pd.DataFrame:
    """
    Append structure identifiers looked up from ChEMBL to a copy of `df`.

    Each value in `df[id_col]` is queried individually through the ChEMBL
    molecule client. Rows whose lookup fails or returns no structure keep
    `None` in the new columns; a warning or error line is printed for them.

    Parameters
    ----------
    df : pd.DataFrame
        Input DataFrame; must contain `id_col`. It is not modified.
    id_col : str
        Name of the column in `df` that holds ChEMBL IDs (e.g. 'CHEMBL1234').
    timeout : int
        Accepted but not used anywhere in this function; kept as a
        placeholder for a future API timeout.

    Returns
    -------
    pd.DataFrame
        A copy of `df` with three additional columns:
          - smiles
          - standard_inchi
          - standard_inchi_key
    """
    # (output column, key inside the record's "molecule_structures" mapping)
    field_map = (
        ("smiles", "canonical_smiles"),
        ("standard_inchi", "standard_inchi"),
        ("standard_inchi_key", "standard_inchi_key"),
    )

    # Work on a copy so the caller's frame is left untouched.
    enriched = df.copy()
    for column, _ in field_map:
        enriched[column] = None

    molecules = new_client.molecule

    for row_label, chembl_id in enriched[id_col].items():
        try:
            query = molecules.filter(chembl_id=chembl_id).only(
                ["molecule_chembl_id", "molecule_structures"]
            )
            # Only the first record of the result set is considered.
            first_hit = next(iter(query), None)
            if not first_hit or not first_hit.get("molecule_structures"):
                print(f"[Warning] No structure found for {chembl_id}")
                continue

            structures = first_hit["molecule_structures"]
            for column, source_key in field_map:
                enriched.at[row_label, column] = structures.get(source_key)
        except Exception as e:
            # Best effort: report the failure and move on to the next row.
            print(f"[Error] Lookup failed for {chembl_id}: {e!s}")

    return enriched


@op("LynxKite Graph Analytics", "get chembl drugs")
def fetch_chembl_drugs(
    *, first_approval: int = 2000, development_phase: int = None
) -> pd.DataFrame:
    """
    Fetch drugs from ChEMBL filtered by first-approval year and, optionally,
    development phase, returning key fields as a DataFrame.

    The function never raises for bad input or a failed query: it prints a
    message and returns an empty DataFrame with the expected columns instead.

    Parameters
    ----------
    first_approval : int, optional
        Only include drugs first approved in or after this year
        (default=2000). If None, do not filter by approval year.
    development_phase : int, optional
        Only include drugs in this development phase (e.g. 2, 3, 4).
        If None, do not filter by phase.

    Returns
    -------
    pd.DataFrame
        Columns:
          - development_phase
          - first_approval
          - molecule_chembl_id
          - synonyms
          - usan_stem
          - usan_stem_definition
          - usan_year

        If no results (or on error), returns an empty DataFrame with these columns.
    """
    cols = [
        "development_phase",
        "first_approval",
        "molecule_chembl_id",
        "synonyms",
        "usan_stem",
        "usan_stem_definition",
        "usan_year",
    ]
    empty_df = pd.DataFrame(columns=cols)

    # Validate inputs. bool is a subclass of int, so it is excluded explicitly;
    # otherwise True/False would be sent to the API as the year/phase 1/0.
    if first_approval is not None and (
        isinstance(first_approval, bool) or not isinstance(first_approval, int)
    ):
        print("Error: first_approval must be an integer year.")
        return empty_df
    if development_phase is not None and (
        isinstance(development_phase, bool) or not isinstance(development_phase, int)
    ):
        print("Error: development_phase must be an integer.")
        return empty_df

    try:
        drug = new_client.drug

        # apply approval-year filter
        if first_approval is not None:
            drug = drug.filter(first_approval__gte=first_approval)
        # apply development-phase filter
        if development_phase is not None:
            drug = drug.filter(development_phase=development_phase)

        # Materialise the lazy query here, inside the try, so that network
        # failures are caught below and pandas receives a plain list of records.
        records = list(drug.only(cols))
        df = pd.DataFrame(records, columns=cols)

        if df.empty:
            print("No drugs found for those filters.")
        return df

    except Exception as e:
        # Top-level boundary for the op: report and fall back to an empty frame.
        print("An error occurred during the ChEMBL query:")
        print(" ", str(e))
        return empty_df


@op("LynxKite Graph Analytics", "get bioactivity from uniprot")
def fetch_chembl_bioactivity(*, uniprot_id: str = "Q9NZQ7") -> pd.DataFrame:
    """
    Fetch bioactivity data from ChEMBL for a given UniProt ID.

    The UniProt accession is first resolved to ChEMBL targets; only the first
    matching target is used. Activities for that target are then restricted
    to the standard types IC50, Ki and Kd.

    Parameters
    ----------
    uniprot_id : str
        UniProt accession used to look up the ChEMBL target.

    Returns
    -------
    pd.DataFrame
        One row per activity record returned by ChEMBL. An empty DataFrame
        is returned when no target matches `uniprot_id`, so the return type
        is the same on every path.

    Notes
    -----
    Errors raised by the ChEMBL client (e.g. network failures) are not caught
    here and propagate to the caller.
    """
    targets = list(new_client.target.filter(target_components__accession=uniprot_id))
    if not targets:
        # Always hand back a DataFrame so downstream consumers see one type.
        return pd.DataFrame()

    target_chembl_id = targets[0]["target_chembl_id"]
    activities = new_client.activity.filter(
        target_chembl_id=target_chembl_id, standard_type__in=["IC50", "Ki", "Kd"]
    )
    # list() forces the lazy query to be fetched before pandas sees it.
    return pd.DataFrame(list(activities))