Spaces:

lynx-analytics
/

lynxkite

Running

App Files Files Community

abhik1368 commited on May 6

Commit

4cf35ad

unverified ·

1 Parent(s): f407b8d

New tools and filters for cheminfo

Browse files

* Update Cheminformatics use cases

Add _cheminfo_tools.py with a Lipinski filter, View Mol Image, and View mol filter with SMARTS and SMILES matching and highlighting.

* update new filters and chembl webapi

update new filters and chembl webapi

Veber, PAINS, Muegge, brenk_aggregator_filter, Egan and Ghose filters; new qsar2.py code with matplotlib plots.

* update tools

update on chembl uniprot based search

* update the code

Delete the old files and folder

Put in example \ Cheminformatics folders

Chembl web service client with example
Plots with plot qsar and plot qsar2 with confidence intervals

* Update new code with new workspace

Created a new workspace and deleted ex1 and ex2.
Deleted the ECFP and MACCS model .pkl files.

Files changed (9) hide show

examples/.crdt/Image table.lynxkite.json.crdt +0 -0
examples/.crdt/requirements.txt.crdt +0 -0
examples/Cheminformatics/chem_utils.py +263 -0
examples/Cheminformatics/chembl_api_uses.lynxkite.json +0 -0
examples/Cheminformatics/chembl_tools.py +206 -0
examples/Cheminformatics/cheminfo_tools.py +610 -0
examples/Cheminformatics/qsar_example.lynxkite.json +0 -0
examples/draw_molecules.py +0 -29
examples/requirements.txt +3 -0

examples/.crdt/Image table.lynxkite.json.crdt ADDED Viewed

Binary file (31.8 kB). View file

examples/.crdt/requirements.txt.crdt ADDED Viewed

Binary file (251 Bytes). View file

examples/Cheminformatics/chem_utils.py ADDED Viewed

	@@ -0,0 +1,263 @@

+import base64
+import io
+import sys
+from io import StringIO
+from operator import itemgetter
+from typing import List
+from typing import Tuple
+import itertools
+import matplotlib.pyplot as plt
+import numpy as np
+import seaborn as sns
+from rdkit import Chem, DataStructs, RDLogger
+from rdkit.Chem.Draw import rdMolDraw2D
+from rdkit.Chem.rdchem import Mol
+from rdkit.ML.Cluster import Butina
+from rdkit.rdBase import BlockLogs
+import pandas as pd
+from rdkit.Chem.rdMMPA import FragmentMol
+from rdkit.Chem.rdRGroupDecomposition import RGroupDecompose
def smi2mol_with_errors(smi: str) -> Tuple[Mol, str]:
    """Parse SMILES and return any text written to ``sys.stderr`` meanwhile.

    ``sys.stderr`` is swapped for an in-memory buffer while
    ``Chem.MolFromSmiles`` runs. The stream that was active before the call
    is always put back, even if parsing raises.

    :param smi: input SMILES
    :return: tuple of RDKit molecule, captured warning or error text
    """
    # NOTE(review): only output that goes through Python's sys.stderr is
    # captured; whether RDKit's own log messages are routed there depends on
    # how RDKit logging is configured - confirm for the target environment.
    captured = StringIO()
    previous_stderr = sys.stderr
    sys.stderr = captured
    try:
        mol = Chem.MolFromSmiles(smi)
    finally:
        # Restore whatever was installed before (not sys.__stderr__), so
        # hosts that replace sys.stderr themselves are left untouched.
        sys.stderr = previous_stderr
    return mol, captured.getvalue()
def count_fragments(mol: Mol) -> int:
    """Count the number of fragments in a molecule

    :param mol: RDKit molecule
    :return: number of fragments
    """
    # Only the count is needed, so ask for atom-index tuples instead of
    # building a new Mol object per fragment (asMols=True).
    return len(Chem.GetMolFrags(mol))
def get_largest_fragment(mol: Mol) -> Mol:
    """Return the fragment with the largest number of atoms

    When several fragments share the largest atom count, the first one in
    ``Chem.GetMolFrags`` order is returned. A molecule that yields no
    fragments is returned unchanged.

    :param mol: RDKit molecule
    :return: RDKit molecule with the largest number of atoms
    """
    fragments = Chem.GetMolFrags(mol, asMols=True)
    if not fragments:
        # Nothing to choose from; hand back the input rather than raising.
        return mol
    # max() keeps the first maximal element, matching a stable descending sort.
    return max(fragments, key=lambda frag: frag.GetNumAtoms())
+# ----------- Clustering
+# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GroupShuffleSplit.html
def taylor_butina_clustering(
    fp_list: List[DataStructs.ExplicitBitVect], cutoff: float = 0.65
) -> List[int]:
    """Assign each fingerprint to a cluster with RDKit's Taylor-Butina code

    :param fp_list: a list of fingerprints
    :param cutoff: distance cutoff (1 - Tanimoto similarity)
    :return: a list of cluster ids, one per input fingerprint
    """
    n_fps = len(fp_list)
    # Flattened lower triangle: row i holds the distances from fingerprint i
    # to fingerprints 0..i-1.
    distances = []
    for row in range(1, n_fps):
        similarities = DataStructs.BulkTanimotoSimilarity(fp_list[row], fp_list[:row])
        distances += [1 - sim for sim in similarities]
    clusters = Butina.ClusterData(distances, n_fps, cutoff, isDistData=True)
    assignments = np.zeros(n_fps, dtype=int)
    for cluster_id, members in enumerate(clusters):
        for member_idx in members:
            assignments[member_idx] = cluster_id
    return assignments.tolist()
+# ----------- Atom tagging
def label_atoms(mol: Mol, labels: List[str]) -> Mol:
    """Label atoms when depicting a molecule

    Sets the ``atomNote`` property of every atom to the label at the atom's
    index. The molecule is modified in place and also returned.

    :param mol: input molecule
    :param labels: labels, one for each atom (indexed by atom index)
    :return: molecule with labels
    :raises IndexError: if there are fewer labels than atoms
    """
    n_atoms = mol.GetNumAtoms()
    if len(labels) < n_atoms:
        # Fail before touching any atom so the molecule is never left half-labelled.
        raise IndexError(f"expected at least {n_atoms} labels, got {len(labels)}")
    for atm in mol.GetAtoms():
        atm.SetProp("atomNote", f"{labels[atm.GetIdx()]}")
    return mol
def tag_atoms(mol: Mol, atoms_to_tag: List[int], tag: str = "x") -> Mol:
    """Tag atoms with a specified string

    Clears the ``atomNote`` property on every atom, then writes ``tag`` to the
    atoms listed in ``atoms_to_tag``. The molecule is modified in place and
    also returned.

    :param mol: input molecule
    :param atoms_to_tag: indices of atoms to tag
    :param tag: string to use for the tags
    :return: molecule with atoms tagged
    """
    # Plain loops: these calls are made for their side effects only.
    for atm in mol.GetAtoms():
        atm.SetProp("atomNote", "")
    for idx in atoms_to_tag:
        mol.GetAtomWithIdx(idx).SetProp("atomNote", tag)
    return mol
+# ----------- Logging
def rd_shut_the_hell_up() -> None:
    """Raise the RDKit logger level to CRITICAL

    :return: None
    """
    RDLogger.logger().setLevel(RDLogger.CRITICAL)
def demo_block_logs() -> None:
    """Show the ``BlockLogs`` pattern as an alternative way to silence RDKit

    :return: None
    """
    log_blocker = BlockLogs()
    # code placed here runs while the BlockLogs object exists
    del log_blocker
+# ----------- Image generation
def boxplot_base64_image(dist: np.ndarray, x_lim: "list[int] | None" = None) -> str:
    """
    Plot a distribution as a seaborn boxplot and return it as an HTML ``<img>`` tag
    whose source is the base64-encoded PNG.
    Parameters:
    dist (np.ndarray): The distribution data to plot.
    x_lim (list[int] | None): The x-axis limits for the boxplot as [low, high].
        Defaults to [0, 10] when omitted.
    Returns:
    str: An ``<img>`` tag embedding the base64 encoded PNG.
    """
    # A None sentinel avoids sharing one mutable default list between calls.
    if x_lim is None:
        x_lim = [0, 10]
    # Note: these seaborn calls change global plotting settings.
    sns.set(rc={"figure.figsize": (3, 1)})
    sns.set_style("whitegrid")
    ax = sns.boxplot(x=dist)
    ax.set_xlim(x_lim[0], x_lim[1])
    png_buffer = io.BytesIO()
    plt.savefig(png_buffer, format="png", bbox_inches="tight")
    plt.close()
    # b64encode never emits newlines, so no post-processing is needed.
    encoded = base64.b64encode(png_buffer.getvalue()).decode("utf-8")
    return '<img align="left" src="data:image/png;base64,%s">' % encoded
def mol_to_base64_image(mol: Chem.Mol) -> str:
    """
    Draw an RDKit molecule on a 300x150 Cairo canvas and wrap the PNG in an
    HTML ``<img>`` tag with a base64 data URI.
    Parameters:
    mol (Chem.Mol): The RDKit molecule to draw.
    Returns:
    str: An ``<img>`` tag embedding the base64 encoded PNG.
    """
    canvas = rdMolDraw2D.MolDraw2DCairo(300, 150)
    canvas.DrawMolecule(mol)
    canvas.FinishDrawing()
    png_b64 = base64.b64encode(canvas.GetDrawingText()).decode("utf8")
    return f"<img src='data:image/png;base64, {png_b64}'/>"
def cleanup_fragment(mol: Mol) -> Tuple[Mol, int]:
    """
    Clear atom map numbers and turn attachment points into hydrogens

    Every atom has its atom map number reset to 0. Atoms with atomic number 0
    are counted as R-groups and converted to hydrogen, after which all
    hydrogens are removed. The atoms of ``mol`` are edited in place before the
    hydrogen-stripped molecule is produced.
    :param mol: input molecule
    :return: modified molecule, number of R-groups
    """
    n_rgroups = 0
    for atom in mol.GetAtoms():
        atom.SetAtomMapNum(0)
        if atom.GetAtomicNum() != 0:
            continue
        # Atomic number 0 marks an attachment point: count it, then make it H.
        n_rgroups += 1
        atom.SetAtomicNum(1)
    return Chem.RemoveAllHs(mol), n_rgroups
def generate_fragments(mol: Mol) -> pd.DataFrame:
    """
    Enumerate candidate scaffolds for one molecule with RDKit's FragmentMol
    :param mol: RDKit molecule
    :return: a Pandas dataframe with Scaffold SMILES, Number of Atoms, Number of R-Groups
    """
    n_atoms_input = mol.GetNumAtoms()
    rows = []
    # FragmentMol returns nested groups; walk them as one flat stream and skip empty entries
    for piece in itertools.chain(*FragmentMol(mol)):
        if not piece:
            continue
        # Each entry may hold several disconnected parts; keep only the largest
        largest = get_largest_fragment(piece)
        # Keep fragments that retain more than 67% of the input molecule's atoms
        if not largest.GetNumAtoms() / n_atoms_input > 0.67:
            continue
        # Strip atom map numbers and count the attachment points
        cleaned, n_rgroups = cleanup_fragment(largest)
        rows.append([Chem.MolToSmiles(cleaned), cleaned.GetNumAtoms(), n_rgroups])
    # The input molecule itself is always listed as a candidate
    rows.append([Chem.MolToSmiles(mol), mol.GetNumAtoms(), 1])
    table = pd.DataFrame(rows, columns=["Scaffold", "NumAtoms", "NumRgroupgs"])
    # Different fragmentations can give the same SMILES; keep the first of each
    return table.drop_duplicates("Scaffold")
def find_scaffolds(df_in: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Generate scaffolds for a set of molecules

    An empty ``df_in`` yields two empty dataframes with the usual columns.
    :param df_in: Pandas dataframe with [SMILES, Name, mol] columns, where mol is an RDKit molecule
    :return: dataframe with molecules and scaffolds, dataframe with unique scaffolds
    """
    # One fragment table per molecule, each tagged with the molecule's name and SMILES
    per_mol_frames = [
        generate_fragments(mol).assign(Name=name, SMILES=smiles)
        for smiles, name, mol in df_in[["SMILES", "Name", "mol"]].values
    ]
    if not per_mol_frames:
        # pd.concat refuses an empty list, so build the empty results directly
        empty_mol_df = pd.DataFrame(
            columns=["Scaffold", "NumAtoms", "NumRgroupgs", "Name", "SMILES"]
        )
        empty_scaffold_df = pd.DataFrame(columns=["Scaffold", "Count", "NumAtoms"])
        return empty_mol_df, empty_scaffold_df
    mol_df = pd.concat(per_mol_frames)
    # For every scaffold: how many distinct molecules contain it, and its size
    scaffold_rows = [
        [scaffold, len(group.Name.unique()), group.NumAtoms.values[0]]
        for scaffold, group in mol_df.groupby("Scaffold")
    ]
    scaffold_df = pd.DataFrame(scaffold_rows, columns=["Scaffold", "Count", "NumAtoms"])
    # A scaffold cannot be shared by more molecules than were supplied
    scaffold_df = scaffold_df[scaffold_df["Count"] <= len(df_in)]
    # Most frequent scaffolds first; ties broken by larger scaffold
    scaffold_df = scaffold_df.sort_values(["Count", "NumAtoms"], ascending=[False, False])
    return mol_df, scaffold_df
def get_molecules_with_scaffold(
    scaffold: str, mol_df: pd.DataFrame, activity_df: pd.DataFrame
) -> Tuple[List[str], pd.DataFrame]:
    """
    Associate molecules with scaffolds

    Molecules for the R-group decomposition are taken from a ``mol`` column
    when the merged table has one; otherwise they are parsed from the
    ``SMILES`` column.
    :param scaffold: scaffold SMILES
    :param mol_df: dataframe with molecules and scaffolds, returned by find_scaffolds()
    :param activity_df: dataframe with [SMILES, Name, pIC50] columns and optionally a mol column
    :return: list of core(s) with R-groups labeled, dataframe with [SMILES, Name, pIC50]
    """
    match_df = mol_df[mol_df["Scaffold"] == scaffold]
    merge_df = match_df.merge(activity_df, on=["SMILES", "Name"])
    scaffold_mol = Chem.MolFromSmiles(scaffold)
    if "mol" in merge_df.columns:
        mols = list(merge_df["mol"])
    else:
        # Neither find_scaffolds() output nor the documented activity table
        # carries molecules, so rebuild them from SMILES.
        mols = [Chem.MolFromSmiles(smi) for smi in merge_df["SMILES"]]
    rgroup_match, _unmatched = RGroupDecompose(scaffold_mol, mols, asSmiles=True)
    activity_view = merge_df[["SMILES", "Name", "pIC50"]]
    if not rgroup_match:
        return [], activity_view
    # Return a plain list in both branches, as the annotation promises.
    cores = pd.DataFrame(rgroup_match).Core.unique().tolist()
    return cores, activity_view

examples/Cheminformatics/chembl_api_uses.lynxkite.json ADDED Viewed

The diff for this file is too large to render. See raw diff

examples/Cheminformatics/chembl_tools.py ADDED Viewed

	@@ -0,0 +1,206 @@

+from lynxkite.core.ops import op
+import pandas as pd
+from chembl_webresource_client.new_client import new_client
+from rdkit import Chem
@op("LynxKite Graph Analytics", "chembl sim search")
def similarity_to_dataframe(*, smiles: str, cutoff: int = 70) -> pd.DataFrame:
    """
    Query the ChEMBL similarity endpoint for a SMILES and tabulate the hits.
    Problems are reported with ``print`` rather than raised: an unparsable
    SMILES or a failing request produces an empty DataFrame that still has
    the expected columns.
    Parameters
    ----------
    smiles : str
        Query structure as a SMILES string.
    cutoff : int
        Similarity threshold passed to the service as the ``similarity``
        filter (a percentage).
    Returns
    -------
    pd.DataFrame
        Columns: 'molecule_chembl_id', 'similarity'
    """
    columns = ["molecule_chembl_id", "similarity"]
    # Reject strings RDKit cannot parse before going to the network
    if Chem.MolFromSmiles(smiles) is None:
        print("Please input a correct SMILES string.")
        return pd.DataFrame(columns=columns)
    try:
        hits = new_client.similarity.filter(smiles=smiles, similarity=cutoff).only(columns)
        hits_df = pd.DataFrame.from_records(list(hits), columns=columns)
    except Exception as e:
        # Network failures, unexpected replies, etc. are reported, not raised
        print("An error occurred during the similarity search.")
        print("  Details:", str(e))
        return pd.DataFrame(columns=columns)
    if hits_df.empty:
        print("No hits found for that SMILES at the given cutoff.")
    return hits_df
@op("LynxKite Graph Analytics", "chembl structure")
def _chembl_structures(
    df: pd.DataFrame, *, id_col: str = "molecule_chembl_id", timeout: int = 5
) -> pd.DataFrame:
    """
    Look up each ChEMBL ID in ``df[id_col]`` and attach its structure strings.
    One request is made per row. Rows whose lookup fails or returns no
    structure keep ``None`` in the new columns, and a message is printed.
    Parameters
    ----------
    df : pd.DataFrame
        Input DataFrame; must contain `id_col`. It is not modified.
    id_col : str
        Name of the column in `df` that holds ChEMBL IDs (e.g. 'CHEMBL1234').
    timeout : int
        Accepted but not used by this function.
    Returns
    -------
    pd.DataFrame
        A copy of `df` with three additional columns:
          - smiles
          - standard_inchi
          - standard_inchi_key
    """
    # output column -> key inside the record's "molecule_structures" mapping
    field_map = {
        "smiles": "canonical_smiles",
        "standard_inchi": "standard_inchi",
        "standard_inchi_key": "standard_inchi_key",
    }
    result = df.copy()
    for column in field_map:
        result[column] = None
    molecules = new_client.molecule
    for row_label, chembl_id in result[id_col].items():
        try:
            query = molecules.filter(chembl_id=chembl_id).only(
                ["molecule_chembl_id", "molecule_structures"]
            )
            # Only the first record of the result is used
            record = next(iter(query), None)
            structures = record.get("molecule_structures") if record else None
            if not structures:
                print(f"[Warning] No structure found for {chembl_id}")
                continue
            for column, key in field_map.items():
                result.at[row_label, column] = structures.get(key)
        except Exception as e:
            print(f"[Error] Lookup failed for {chembl_id}: {e!s}")
    return result
@op("LynxKite Graph Analytics", "get chembl drugs")
def fetch_chembl_drugs(
    *, first_approval: int = 2000, development_phase: int = None
) -> pd.DataFrame:
    """
    Fetch drugs from ChEMBL filtered by first-approval year and, optionally,
    development phase, returning key fields as a DataFrame.
    Invalid arguments and request failures are reported with ``print`` and
    produce an empty DataFrame instead of an exception.
    Parameters
    ----------
    first_approval : int, optional
        Only include drugs first approved in or after this year (default=2000).
        If None, do not filter by approval year.
    development_phase : int, optional
        Only include drugs in this development phase (e.g. 2, 3, 4).
        If None, do not filter by phase.
    Returns
    -------
    pd.DataFrame
        Columns:
          - development_phase
          - first_approval
          - molecule_chembl_id
          - synonyms
          - usan_stem
          - usan_stem_definition
          - usan_year
        If no results (or on error), returns an empty DataFrame with these columns.
    """
    cols = [
        "development_phase",
        "first_approval",
        "molecule_chembl_id",
        "synonyms",
        "usan_stem",
        "usan_stem_definition",
        "usan_year",
    ]
    empty_df = pd.DataFrame(columns=cols)
    # Validate inputs
    if first_approval is not None and not isinstance(first_approval, int):
        print("Error: first_approval must be an integer year.")
        return empty_df
    if development_phase is not None and not isinstance(development_phase, int):
        print("Error: development_phase must be an integer.")
        return empty_df
    try:
        drug = new_client.drug
        if first_approval is not None:
            drug = drug.filter(first_approval__gte=first_approval)
        if development_phase is not None:
            drug = drug.filter(development_phase=development_phase)
        # Materialise the query result before building the frame, the same
        # way the similarity search op does.
        df = pd.DataFrame.from_records(list(drug.only(cols)), columns=cols)
    except Exception as e:
        print("An error occurred during the ChEMBL query:")
        print(" ", str(e))
        return empty_df
    if df.empty:
        print("No drugs found for those filters.")
    return df
@op("LynxKite Graph Analytics", "get bioactivity from uniprot")
def fetch_chembl_bioactivity(*, uniprot_id: str = "Q9NZQ7") -> pd.DataFrame:
    """
    Fetch bioactivity data from ChEMBL for a given UniProt ID.
    The UniProt accession is resolved to ChEMBL targets; only the first
    target returned is used. Activities of standard type IC50, Ki or Kd for
    that target are returned.
    Parameters
    ----------
    uniprot_id : str
        UniProt accession to look up.
    Returns
    -------
    pd.DataFrame
        One row per activity record; an empty DataFrame when no target
        matches the accession.
    """
    targets = list(new_client.target.filter(target_components__accession=uniprot_id))
    if not targets:
        # Always return a DataFrame so downstream boxes get one type.
        return pd.DataFrame()
    target_chembl_id = targets[0]["target_chembl_id"]
    activities = new_client.activity.filter(
        target_chembl_id=target_chembl_id, standard_type__in=["IC50", "Ki", "Kd"]
    )
    return pd.DataFrame(activities)

examples/Cheminformatics/cheminfo_tools.py CHANGED Viewed

@@ -16,6 +16,7 @@ from sklearn.ensemble import RandomForestRegressor
 from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
 from sklearn.model_selection import train_test_split
 import numpy as np
 @op("LynxKite Graph Analytics", "View mol filter", view="matplotlib", slow=True)
@@ -303,3 +304,612 @@ def build_qsar_model(
     print(f"Trained & saved QSAR model for '{fp_type}' → {model_file}")
     return metrics_df

 from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
 from sklearn.model_selection import train_test_split
 import numpy as np
+from rdkit.Chem import MACCSkeys
 @op("LynxKite Graph Analytics", "View mol filter", view="matplotlib", slow=True)
     print(f"Trained & saved QSAR model for '{fp_type}' → {model_file}")
     return metrics_df
def predict_with_ci(model, X, confidence=0.95):
    """
    Compute ensemble predictions and a percentile interval across estimators.

    Each estimator in ``model.estimators_`` predicts ``X``; the mean over
    estimators is the point prediction and the interval bounds are the
    ``(1 - confidence) / 2`` and ``1 - (1 - confidence) / 2`` percentiles of
    the per-estimator predictions.

    Parameters
    ----------
    model : object
        Fitted ensemble exposing ``estimators_``, an iterable of objects with
        a ``predict(X)`` method (e.g. a RandomForestRegressor).
    X : array-like
        Feature matrix passed unchanged to every estimator.
    confidence : float
        Central coverage of the interval, between 0 and 1 inclusive.

    Returns
    -------
    tuple of np.ndarray
        ``(mean, lower, upper)``, each with one value per row of ``X``.

    Raises
    ------
    ValueError
        If ``confidence`` is outside [0, 1].
    """
    if not 0.0 <= confidence <= 1.0:
        raise ValueError(f"confidence must be between 0 and 1, got {confidence!r}")
    # Shape (n_estimators, n_samples): one row of predictions per estimator
    tree_preds = np.array([tree.predict(X) for tree in model.estimators_])
    y_pred_mean = np.mean(tree_preds, axis=0)
    alpha = (1.0 - confidence) / 2.0
    # Both bounds in a single pass over the per-estimator predictions
    y_pred_lower, y_pred_upper = np.percentile(
        tree_preds, [alpha * 100, (1.0 - alpha) * 100], axis=0
    )
    return y_pred_mean, y_pred_lower, y_pred_upper
+# --- End of predict_with_ci definition ---
@op("LynxKite Graph Analytics", "Train QSAR2")
def build_qsar_model2(
    df: pd.DataFrame,
    *,
    smiles_col: str,
    target_col: str,
    fp_type: str,
    radius: int = 2,
    n_bits: int = 2048,
    test_size: float = 0.2,
    random_state: int = 42,
    out_dir: str = "Models",
    confidence: float = 0.95,
):
    """
    Train a RandomForest QSAR model, pickle it, and return per-point results.
    The fitted model is written to ``<out_dir>/qsar_model_<fp_type>.pkl``
    (a failure to write is printed, not raised). Only the results DataFrame
    is returned.
    Parameters
    ----------
    df : pandas.DataFrame
        Input table; it is copied, not modified.
    smiles_col : str
        Column holding SMILES strings.
    target_col : str
        Column holding the value to predict; coerced to numeric, and rows
        that fail coercion are dropped.
    fp_type : str
        One of 'ecfp', 'rdkit', 'torsion', 'atompair', 'maccs'.
    radius : int
        Morgan radius (used by 'ecfp' only).
    n_bits : int
        Fingerprint length (ignored by 'maccs', which is 167 bits).
    test_size : float
        Passed to ``train_test_split``.
    random_state : int
        Seed for both the split and the random forest.
    out_dir : str
        Directory for the pickled model; created if missing.
    confidence : float, optional
        Coverage of the interval computed by ``predict_with_ci``.
    Returns
    -------
    results_df : pandas.DataFrame
        DataFrame containing columns: 'actual', 'predicted', 'lower_ci',
        'upper_ci', 'split', 'split_R2', 'split_MAE', 'split_RMSE'.
        The metric columns repeat the overall metric for the corresponding split.
    Raises
    ------
    KeyError
        If ``df`` is None.
    ValueError
        If ``fp_type`` is unsupported, no usable rows remain, nothing could
        be featurized, or a split ends up empty.
    """
    # Check for a missing table before touching it (df.copy() would fail first).
    if df is None:
        raise KeyError("Table not found")
    fingerprinters = {
        "ecfp": lambda m: AllChem.GetMorganFingerprintAsBitVect(m, radius, nBits=n_bits),
        "rdkit": lambda m: Chem.RDKFingerprint(m, fpSize=n_bits),
        "torsion": lambda m: AllChem.GetHashedTopologicalTorsionFingerprintAsBitVect(
            m, nBits=n_bits
        ),
        "atompair": lambda m: AllChem.GetHashedAtomPairFingerprintAsBitVect(m, nBits=n_bits),
        "maccs": MACCSkeys.GenMACCSKeys,
    }
    # Reject an unknown fingerprint up front; inside the per-molecule loop the
    # error would be swallowed and resurface as "No molecules featurized."
    if fp_type not in fingerprinters:
        raise ValueError(f"Unsupported fp type: '{fp_type}'")
    make_fingerprint = fingerprinters[fp_type]
    fp_length = 167 if fp_type == "maccs" else n_bits

    # 1) Clean the table and parse molecules
    df = df.copy()
    df[target_col] = pd.to_numeric(df[target_col], errors="coerce")
    df.dropna(subset=[target_col, smiles_col], inplace=True)
    df["mol"] = df[smiles_col].apply(Chem.MolFromSmiles)
    df = df[df["mol"].notnull()].reset_index(drop=True)
    if df.empty:
        raise ValueError("No valid molecules or targets")

    # 2) Split row positions before featurization
    indices = np.arange(len(df))
    train_idx, test_idx = train_test_split(indices, test_size=test_size, random_state=random_state)

    # 3) Featurize; molecules that fail are reported and skipped
    print(f"Featurizing using {fp_type}...")
    fps = []
    valid_indices = []
    for i, mol in enumerate(df["mol"]):
        try:
            bv = make_fingerprint(mol)
            arr = np.zeros((fp_length,), dtype=np.int8)
            DataStructs.ConvertToNumpyArray(bv, arr)
        except Exception as e:
            print(f"Warning: Featurization failed index {i}. Skipping. Error: {e}")
            continue
        fps.append(arr)
        valid_indices.append(i)
    if not fps:
        raise ValueError("No molecules featurized.")
    X = np.vstack(fps)
    y = df.iloc[valid_indices].reset_index(drop=True)[target_col].values

    # 4) Map the original split onto the rows that survived featurization.
    # Sets give O(1) membership tests instead of scanning the index arrays.
    train_members = set(np.asarray(train_idx).tolist())
    test_members = set(np.asarray(test_idx).tolist())
    train_rows = [pos for pos, orig in enumerate(valid_indices) if orig in train_members]
    test_rows = [pos for pos, orig in enumerate(valid_indices) if orig in test_members]
    X_train, y_train = X[train_rows], y[train_rows]
    X_test, y_test = X[test_rows], y[test_rows]
    if X_train.shape[0] == 0 or X_test.shape[0] == 0:
        raise ValueError("Train or test split empty after filtering.")

    # 5) Train
    print("Training RandomForestRegressor...")
    model = RandomForestRegressor(random_state=random_state, n_jobs=-1)
    model.fit(X_train, y_train)

    # 6) Predictions, intervals and summary metrics per split
    print("Calculating predictions and metrics...")

    def _metrics(y_true, y_pred_mean):
        y_true = np.ravel(y_true)
        y_pred_mean = np.ravel(y_pred_mean)
        if len(y_true) == 0:
            return {"R2": np.nan, "MAE": np.nan, "RMSE": np.nan}
        mse = mean_squared_error(y_true, y_pred_mean)
        return {
            "R2": r2_score(y_true, y_pred_mean),
            "MAE": mean_absolute_error(y_true, y_pred_mean),
            "RMSE": np.sqrt(mse),
        }

    def _split_results(split_name, X_part, y_part):
        y_pred, lower_ci, upper_ci = predict_with_ci(model, X_part, confidence)
        frame = pd.DataFrame(
            {
                "actual": y_part,
                "predicted": y_pred,
                "lower_ci": lower_ci,
                "upper_ci": upper_ci,
                "split": split_name,
            }
        )
        # Repeat the split-level metrics on every row of that split
        for metric, value in _metrics(y_part, y_pred).items():
            frame[f"split_{metric}"] = value
        return frame

    results_df = pd.concat(
        [
            _split_results("train", X_train, y_train),
            _split_results("test", X_test, y_test),
        ],
        ignore_index=True,
    )

    # 7) Persist the model; a write failure is reported but does not discard the results
    os.makedirs(out_dir, exist_ok=True)
    model_file = os.path.join(out_dir, f"qsar_model_{fp_type}.pkl")
    try:
        with open(model_file, "wb") as fout:
            pickle.dump(model, fout)
        print(f"Trained & saved QSAR model for '{fp_type}' -> {model_file}")
    except Exception as e:
        print(f"Error saving model to {model_file}: {e}")
    return results_df
@op("LynxKite Graph Analytics", "plot qsar", view="matplotlib")
def plot_qsar(results_df: pd.DataFrame):
    """
    Plot actual vs. predicted values from a QSAR results DataFrame.
    Train and test rows are drawn as separate scatter series with vertical
    error bars spanning 'lower_ci'..'upper_ci', plus a dashed y=x line. When
    the 'split_R2', 'split_MAE' and 'split_RMSE' columns are present, the
    first row of each split supplies the metrics shown in a text box.
    The plot is drawn on a new matplotlib figure; nothing is returned. If
    both splits are empty a warning is printed and no figure is created.
    Parameters
    ----------
    results_df : pandas.DataFrame
        Must contain 'actual', 'predicted', 'lower_ci', 'upper_ci', 'split'.
    Raises
    ------
    TypeError
        If ``results_df`` is not a DataFrame.
    ValueError
        If a required column is missing.
    """
    if not isinstance(results_df, pd.DataFrame):
        raise TypeError(
            "plot_qsar() missing 1 required positional argument: 'results_df' or the provided argument is not a pandas DataFrame."
        )
    required_cols = ["actual", "predicted", "lower_ci", "upper_ci", "split"]
    if not all(col in results_df.columns for col in required_cols):
        raise ValueError(f"Invalid 'results_df'. Must contain columns: {required_cols}")
    metric_cols = ["split_R2", "split_MAE", "split_RMSE"]
    metrics_available = all(col in results_df.columns for col in metric_cols)
    if not metrics_available:
        print(
            f"Warning: Metrics display requested, but one or more metric columns ({metric_cols}) are missing in results_df."
        )

    # (split value, legend label, metrics prefix, alpha, marker size, error-bar colour)
    series = (
        ("train", "Train", "Train: ", 0.6, 30, "tab:blue"),
        ("test", "Test", "Test:  ", 0.8, 40, "tab:orange"),
    )
    subsets = {spec[0]: results_df[results_df["split"] == spec[0]] for spec in series}
    if all(subset.empty for subset in subsets.values()):
        print("Warning: Both training and test data subsets are empty. Cannot generate plot.")
        return

    _, ax = plt.subplots(figsize=(8, 8))
    for split, label, _prefix, alpha, size, err_color in series:
        data = subsets[split]
        if data.empty:
            continue
        # Asymmetric error bars: distance from the prediction down to the
        # lower bound and up to the upper bound.
        yerr = [
            data["predicted"] - data["lower_ci"],
            data["upper_ci"] - data["predicted"],
        ]
        ax.scatter(
            data["actual"],
            data["predicted"],
            label=label,
            alpha=alpha,
            s=size,
            edgecolors="w",
            linewidth=0.5,
        )
        ax.errorbar(
            data["actual"],
            data["predicted"],
            yerr=yerr,
            fmt="none",
            ecolor=err_color,
            label="_nolegend_",
            capsize=0,
            elinewidth=1,
        )

    # Shared square axis range covering actuals, predictions and interval bounds
    all_actual = results_df["actual"].dropna()
    all_pred_ci = pd.concat(
        [results_df["predicted"], results_df["lower_ci"], results_df["upper_ci"]]
    ).dropna()
    all_values = pd.concat([all_actual, all_pred_ci]).dropna()
    if all_values.empty:
        min_val, max_val = 0, 1
    else:
        min_val, max_val = all_values.min(), all_values.max()
        if min_val == max_val:
            # Degenerate range: widen so the axes are not zero-width
            min_val -= 0.5
            max_val += 0.5
        padding = (max_val - min_val) * 0.05
        min_val -= padding
        max_val += padding
    ax.plot([min_val, max_val], [min_val, max_val], "k--", alpha=0.7, lw=1, label="y=x")
    ax.set_xlim(min_val, max_val)
    ax.set_ylim(min_val, max_val)
    ax.set_aspect("equal", adjustable="box")
    ax.grid(True, linestyle=":", alpha=0.6)
    ax.set_xlabel("Actual Values")
    ax.set_ylabel("Predicted Values")
    ax.set_title("QSAR Model Performance: Actual vs. Predicted")
    ax.legend(loc="lower right")

    if not metrics_available:
        return

    def _fmt(value):
        return f"{value:.3f}" if pd.notna(value) else "N/A"

    try:
        lines = []
        for split, _label, prefix, _alpha, _size, _color in series:
            data = subsets[split]
            if data.empty:
                lines.append(f"{prefix}N/A (No Data)")
                continue
            # Metrics are repeated on every row of a split; read the first
            row = data[metric_cols].iloc[0]
            lines.append(
                f"{prefix}$R^2$={_fmt(row['split_R2'])}, "
                f"MAE={_fmt(row['split_MAE'])}, RMSE={_fmt(row['split_RMSE'])}"
            )
        ax.text(
            0.05,
            0.95,
            "\n".join(lines).strip(),
            transform=ax.transAxes,
            fontsize=9,
            verticalalignment="top",
            bbox=dict(boxstyle="round,pad=0.5", fc="white", alpha=0.8),
        )
    except Exception as e:
        # Keep the plot even if the metrics box cannot be rendered
        print(f"An error occurred during metrics display: {e}")
        ax.text(
            0.05,
            0.95,
            "Error displaying metrics",
            transform=ax.transAxes,
            fontsize=9,
            color="red",
            verticalalignment="top",
            bbox=dict(boxstyle="round,pad=0.5", fc="white", alpha=0.8),
        )
+@op("LynxKite Graph Analytics", "plot qsar2", view="matplotlib")
+def plot_qsar2(results_df: pd.DataFrame):
+    """
+    Plots actual vs. predicted values resembling the example image.
+    Includes separate markers for train/test, y=x line, and parallel dashed
+    error bands based on test set RMSE (optional). Does NOT use per-point CIs.
+    Handles displaying the plot via plt.show() or saving it to a file
+    based on the `save_path` parameter. THIS FUNCTION DOES NOT RETURN ANY VALUE.
+    Parameters
+    ----------
+    results_df : pd.DataFrame
+        Mandatory input DataFrame. Must contain: 'actual', 'predicted', 'split'.
+        Should also contain 'split_RMSE' column for error bands and metrics display.
+    title : str, optional
+    xlabel : str, optional
+    ylabel : str, optional
+    rmse_multiplier_for_bands : float or None, optional
+        Determines the width of the dashed error bands (multiplier * test_RMSE).
+        Set to None to disable bands. Default is 1.0.
+    show_metrics : bool, optional
+        Whether to display R2/MAE/RMSE text (requires metric columns). Default is True.
+    save_path : str, optional
+        If provided, saves plot to this path. If None (default), displays plot.
+    Raises
+    ------
+    ValueError / TypeError : For invalid inputs.
+    """
+    COLOR_TRAIN = "royalblue"
+    COLOR_TEST = "darkorange"  # Changed from red for potentially better contrast/appeal
+    COLOR_PERFECT = "black"
+    COLOR_BANDS = "dimgrey"  # Less prominent than the perfect line
+    COLOR_GRID = "lightgrey"
+    title = "QSAR Model Performance: Actual vs. Predicted"
+    xlabel = "Actual Values"
+    ylabel = "Predicted Values"
+    # ci_alpha = 0.2
+    show_metrics = True
+    rmse_multiplier_for_bands = 1.0
+    # --- Input Validation ---
+    if not isinstance(results_df, pd.DataFrame):
+        raise TypeError("Input must be a pandas DataFrame.")
+    required_cols = ["actual", "predicted", "split"]
+    if not all(col in results_df.columns for col in required_cols):
+        raise ValueError(f"DataFrame must contain columns: {required_cols}")
+    metric_cols = ["split_R2", "split_MAE", "split_RMSE"]
+    metrics_available = all(col in results_df.columns for col in metric_cols)
+    bands_possible = rmse_multiplier_for_bands is not None and "split_RMSE" in results_df.columns
+    if show_metrics and not metrics_available:
+        print(
+            f"Warning: Metrics display requested, but one or more metric columns ({metric_cols}) are missing."
+        )
+    if rmse_multiplier_for_bands is not None and "split_RMSE" not in results_df.columns:
+        print("Warning: Error bands requested, but 'split_RMSE' column is missing.")
+        bands_possible = False
+    # --- Prepare Data ---
+    train_data = results_df[results_df["split"] == "train"].copy()
+    test_data = results_df[results_df["split"] == "test"].copy()
+    can_plot_train = not train_data.empty
+    can_plot_test = not test_data.empty
+    if not can_plot_train and not can_plot_test:
+        print("Warning: Both training and test data subsets are empty. Cannot generate plot.")
+        return
+    # --- Create Plot with Style ---
+    plt.style.use("seaborn-v0_8-whitegrid")  # Use a cleaner base style
+    fig, ax = plt.subplots(figsize=(8, 8))  # Slightly larger figure
+    # --- Plotting Logic ---
+    # Scatter plots with enhanced style
+    common_scatter_kws = {"s": 45, "alpha": 0.75, "edgecolor": "black", "linewidth": 0.5}
+    if can_plot_train:
+        ax.scatter(
+            train_data["actual"],
+            train_data["predicted"],
+            label="Training set",
+            marker="o",
+            color=COLOR_TRAIN,
+            **common_scatter_kws,
+        )  # Blue circles
+    if can_plot_test:
+        ax.scatter(
+            test_data["actual"],
+            test_data["predicted"],
+            label="Test set",
+            marker="o",
+            color=COLOR_TEST,
+            **common_scatter_kws,
+        )  # Orange circles
+    # Determine plot limits
+    # (Using the same logic as before to calculate min_val, max_val)
+    all_actual = results_df["actual"].dropna()
+    all_pred = results_df["predicted"].dropna()
+    all_values = pd.concat([all_actual, all_pred]).dropna()
+    if all_values.empty:
+        min_val, max_val = 0, 1
+    else:
+        min_val, max_val = all_values.min(), all_values.max()
+        if min_val == max_val:
+            min_val -= 0.5
+            max_val += 0.5
+        data_range = max_val - min_val
+        if data_range == 0:
+            data_range = 1.0
+        padding = data_range * 0.10
+        min_val -= padding
+        max_val += padding
+    # Plot y=x line (Solid Black, slightly thicker)
+    ax.plot(
+        [min_val, max_val],
+        [min_val, max_val],
+        color=COLOR_PERFECT,
+        linestyle="-",
+        linewidth=1.5,
+        alpha=0.9,
+        label="_nolegend_",
+    )
+    # Plot Error Bands based on Test RMSE (subtler style)
+    rmse_test = np.nan
+    if bands_possible and can_plot_test:
+        try:
+            rmse_test = test_data["split_RMSE"].dropna().iloc[0]
+            if pd.notna(rmse_test) and rmse_test >= 0:
+                margin = rmse_multiplier_for_bands * rmse_test
+                band_label = (
+                    f"$\pm {rmse_multiplier_for_bands}\,$RMSE"
+                    if rmse_multiplier_for_bands == 1
+                    else f"$\pm {rmse_multiplier_for_bands}\,$RMSE"
+                )
+                ax.plot(
+                    [min_val, max_val],
+                    [min_val + margin, max_val + margin],
+                    color=COLOR_BANDS,
+                    linestyle="--",
+                    linewidth=1.0,
+                    alpha=0.7,
+                    label=band_label,
+                )  # Grey dashed
+                ax.plot(
+                    [min_val, max_val],
+                    [min_val - margin, max_val - margin],
+                    color=COLOR_BANDS,
+                    linestyle="--",
+                    linewidth=1.0,
+                    alpha=0.7,
+                    label="_nolegend_",
+                )  # Grey dashed
+            # else: print("Warning: Could not plot error bands (Invalid Test RMSE).") # Optionally silent
+        except Exception as e:
+            print(f"Warning: Could not plot error bands: {e}")
+    # Set limits and aspect ratio
+    ax.set_xlim(min_val, max_val)
+    ax.set_ylim(min_val, max_val)
+    ax.set_aspect("equal", adjustable="box")
+    # ADD BACK Grid (Subtle Style)
+    ax.grid(True, which="both", linestyle=":", linewidth=0.7, color=COLOR_GRID, alpha=0.7)
+    # Ensure grid is behind data points
+    ax.set_axisbelow(True)
+    # Set Labels and Title (using specified arguments)
+    ax.set_xlabel(xlabel, fontsize=12)
+    ax.set_ylabel(ylabel, fontsize=12)
+    ax.set_title(title, fontsize=15, pad=15, weight="semibold")  # Slightly larger title
+    # Enhance Legend
+    ax.legend(loc="best", frameon=True, framealpha=0.85, fontsize=10, shadow=False)
+    # --- Display Metrics Text (Optional) ---
+    if show_metrics and metrics_available:
+        # (Logic for extracting and formatting metrics text remains the same)
+        metrics_text = ""
+        try:
+            if can_plot_train:
+                train_metrics = train_data[metric_cols].dropna().iloc[0]  # Ensure using valid row
+                r2_tr = f"{train_metrics['split_R2']:.3f}"
+                mae_tr = f"{train_metrics['split_MAE']:.3f}"
+                rmse_tr = f"{train_metrics['split_RMSE']:.3f}"
+                metrics_text += f"Train: $R^2$={r2_tr}, MAE={mae_tr}, RMSE={rmse_tr}\n"
+            else:
+                metrics_text += "Train: N/A\n"
+            if can_plot_test:
+                test_metrics = test_data[metric_cols].dropna().iloc[0]  # Ensure using valid row
+                r2_te = f"{test_metrics['split_R2']:.3f}"
+                mae_te = f"{test_metrics['split_MAE']:.3f}"
+                rmse_te = f"{test_metrics['split_RMSE']:.3f}"
+                metrics_text += f"Test:  $R^2$={r2_te}, MAE={mae_te}, RMSE={rmse_te}"
+            else:
+                metrics_text += "Test:  N/A"
+            if metrics_text:
+                ax.text(
+                    0.05,
+                    0.95,
+                    metrics_text.strip(),
+                    transform=ax.transAxes,
+                    fontsize=9,
+                    verticalalignment="top",
+                    bbox=dict(boxstyle="round,pad=0.3", fc="white", alpha=0.7),
+                )  # Adjusted box slightly
+        except Exception as e:
+            print(f"An error occurred during metrics display: {e}")

examples/Cheminformatics/qsar_example.lynxkite.json ADDED Viewed

The diff for this file is too large to render. See raw diff

examples/draw_molecules.py DELETED Viewed

@@ -1,29 +0,0 @@
-from lynxkite.core.ops import op
-import pandas as pd
-import base64
-import io
-def pil_to_data(image):
-    buffer = io.BytesIO()
-    image.save(buffer, format="png")
-    b64 = base64.b64encode(buffer.getvalue()).decode("utf-8")
-    return "data:image/png;base64," + b64
-def smiles_to_data(smiles):
-    import rdkit
-    m = rdkit.Chem.MolFromSmiles(smiles)
-    if m is None:
-        return None
-    img = rdkit.Chem.Draw.MolToImage(m)
-    data = pil_to_data(img)
-    return data
-@op("LynxKite Graph Analytics", "Draw molecules")
-def draw_molecules(df: pd.DataFrame, *, smiles_column: str, image_column: str = "image"):
-    df = df.copy()
-    df[image_column] = df[smiles_column].apply(smiles_to_data)
-    return df

examples/requirements.txt CHANGED Viewed

@@ -1,3 +1,6 @@
 # Example of a requirements.txt file. LynxKite will automatically install anything you put here.
 faker
 matplotlib

 # Example of a requirements.txt file. LynxKite will automatically install anything you put here.
 faker
 matplotlib
+chembl_webresource_client
+rcsb-api
+# itertools is part of the Python standard library and must not be listed here (pip cannot install it).