ksg-dfci
/

MatchMiner-AI

Model card Files Files and versions Community

File size: 12,793 Bytes

b63318a

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ee78bb6d-4e3c-4751-b042-12c358d89cac",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import json\n",
    "from vllm import LLM, SamplingParams\n",
    "from transformers import AutoTokenizer\n",
    "import torch\n",
    "import os\n",
    "#os.environ['CUDA_VISIBLE_DEVICES'] = '2,3'\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e8eeb339-6aca-4d3f-96fb-24a1caf26b34",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7129a989-04e9-475d-9260-d1fdb1ab7faa",
   "metadata": {},
   "outputs": [],
   "source": [
    "llama = LLM(model='hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4', tensor_parallel_size = 2, \n",
    "            gpu_memory_utilization = 0.5,\n",
    "            download_dir = \"../../..\", max_model_len=6000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d9d7d1c4-50ed-4614-9855-8e6cc86bbb0e",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "73897bb9-0738-4446-b332-9b9bf46ad043",
   "metadata": {},
   "outputs": [],
   "source": [
    "def summarize_trials_multi_cohort(eligibility_texts, llama_model):\n",
    "\n",
    "    tokenizer = llama.get_tokenizer()\n",
    "    prompts = []\n",
    "    for trial in eligibility_texts:\n",
    "        messages = [\n",
    "            {'role':'system', 'content': \"\"\"You are an expert clinical oncologist with an encyclopedic knowledge of cancer and its treatments.\n",
    "        Your job is to review a clinical trial document and extract a list of structured clinical spaces that are eligible for that trial.\n",
    "        A clinical space is defined as a unique combination of cancer primary site, histology, which treatments a patient must have received, which treatments a patient must not have received, cancer burden (eg presence of metastatic disease), and tumor biomarkers (such as germline or somatic gene mutations or alterations, or protein expression on tumor) that a patient must have or must not have; that renders a patient eligible for the trial.\n",
    "        Trials often specify that a particular treatment is excluded only if it was given within a short period of time, for example 14 days, one month, etc , prior to trial start. Do not include this type of time-specific treatment eligibility criteria in your output at all.\n",
    "        Some trials have only one space, while others have several. Do not output a space that contains multiple cancer types and/or histologies. Instead, generate separate spaces for each cancer type/histology combination.\n",
    "        For biomarkers, if the trial specifies whether the biomarker will be assessed during screening, note that.\n",
    "        Spell out cancer types; do not abbreviate them. For example, write \"non-small cell lung cancer\" rather than \"NSCLC\".\n",
    "        Structure your output like this, as a list of spaces, with spaces separated by newlines, as below:\n",
    "        1. Cancer type allowed: <cancer_type_allowed>. Histology allowed: <histology_allowed>. Cancer burden allowed: <cancer_burden_allowed>. Prior treatment required: <prior_treatments_requred>. Prior treatment excluded: <prior_treatments_excluded>. Biomarkers required: <biomarkers_required>. Biomarkers excluded: <biomarkers_excluded>.\n",
    "        2. Cancer type allowed: <cancer_type_allowed>, etc.\n",
    "        If a particular concept is not mentioned in the trial text, do not include it in your definition of trial space(s).\n",
    "        \"\"\"},      \n",
    "              \n",
    "            {'role':'user', 'content': \"Here is a clinical trial document: \\n\" + trial + \"\\n\" + \"\"\"Now, generate your list of the trial space(s), formatted as above.\n",
    "            Do not provide any introductory, explanatory, concluding, or disclaimer text.\n",
    "            Reminder: Treatment history is an important component of trial space definitions, but treatment history requirements that are described as applying only in a given period of time prior to trial treatment MUST BE IGNORED.\"\"\"\n",
    "            }\n",
    "        ]\n",
    "    \n",
    "        prompts.append(tokenizer.apply_chat_template(conversation=messages, add_generation_prompt=True, tokenize=False))\n",
    "    \n",
    "\n",
    "    \n",
    "    responses = llama_model.generate(\n",
    "        prompts,   \n",
    "        SamplingParams(\n",
    "        temperature=0.0,\n",
    "        top_p=0.9,\n",
    "        max_tokens=3096,\n",
    "        stop_token_ids=[tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids(\"<|eot_id|>\")],  # KEYPOINT HERE\n",
    "    ))\n",
    "\n",
    "    response_texts = [x.outputs[0].text for x in responses]\n",
    "\n",
    "\n",
    "    return responses, response_texts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ca683840-842b-4346-8eef-b66bc52d26af",
   "metadata": {},
   "outputs": [],
   "source": [
    "trials = pd.read_csv('./ctgov_cancer_trials.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "aa51de7d-74e0-4822-b7e1-2c9a3bc31260",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4816dbf0-bd92-4742-912a-477e545e330b",
   "metadata": {},
   "outputs": [],
   "source": [
    "trial_cohorts = summarize_trials_multi_cohort(trials.trial_text.tolist(), llama)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8283c587-c909-4548-804d-4d88b4ed7255",
   "metadata": {},
   "outputs": [],
   "source": [
    "trials['spaces'] = trial_cohorts[1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2ca75bab-7273-4ab0-86cd-1e0373546fce",
   "metadata": {},
   "outputs": [],
   "source": [
    "trials.to_csv('ctgov_all_trials_unique_trial_spaces_10-31-24.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0291913f-f3b9-4b39-99ab-954cb7237255",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "16563812-6967-4788-a123-0af5fd701ede",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "95776dbe-1a25-44bd-90f8-5c1573b6e92a",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "output = pd.read_csv('ctgov_all_trials_unique_trial_spaces_10-31-24.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cf647a1f-5a8c-4958-9032-440806a306d5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# example of a trial and extracted spaces\n",
    "i = 1000\n",
    "output.trial_text.iloc[i], output.spaces.iloc[i]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9cc06840-5647-4524-a7bf-a1ad53a07b7c",
   "metadata": {},
   "outputs": [],
   "source": [
    "frames = []\n",
    "for i in range(trials.shape[0]):\n",
    "    cohorts = pd.Series(trials.iloc[i].spaces.split(\"\\n\"))\n",
    "    cohorts = cohorts[~((cohorts.isnull()) | (cohorts == \"\\n\") | (cohorts == ''))].reset_index(drop=True)\n",
    "    frame = pd.DataFrame(np.repeat(trials.iloc[[i]], len(cohorts), axis=0), columns=trials.columns)\n",
    "    frame['this_space'] = cohorts\n",
    "    frame['space_number'] = frame.index\n",
    "    frames.append(frame)\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "541669eb-f92e-49f3-9a36-b6625448c1a4",
   "metadata": {},
   "outputs": [],
   "source": [
    "cohort_level_trials = pd.concat(frames, axis=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "51a04e84-7483-4398-b4a0-d0cdab790609",
   "metadata": {},
   "outputs": [],
   "source": [
    "cohort_level_trials.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "648f0e1e-ef81-4983-8f03-1fbdb138f649",
   "metadata": {},
   "outputs": [],
   "source": [
    "cohort_level_trials.this_space.str[0].isin(['1','2','3','4','5','6','7','8','9']).value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9ea048c1-c4ef-4202-a9be-a4658c4f1058",
   "metadata": {},
   "outputs": [],
   "source": [
    "cohort_level_trials = cohort_level_trials[cohort_level_trials.this_space.str[0].isin(['1','2','3','4','5','6','7','8','9'])]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "852aee9d-ad97-4374-932f-6cae378dde2a",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "00d2220a-627a-4b67-be28-c42561c3c964",
   "metadata": {},
   "outputs": [],
   "source": [
    "cohort_level_trials.to_csv('ctgov_all_trials_trial_space_lineitems_10-31-24.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a130e909-6629-4408-b1ad-201b319d5e0f",
   "metadata": {},
   "outputs": [],
   "source": [
    "temp = pd.read_csv('ctgov_all_trials_trial_space_lineitems_10-31-24.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ad078444-33e1-4398-92b8-2e7f9f1a4031",
   "metadata": {},
   "outputs": [],
   "source": [
    "temp.this_space.nunique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "be264ecb-12e7-4fd4-a16b-5a4b2f44d2aa",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "out = pd.read_csv('ctgov_all_trials_trial_space_lineitems_10-31-24.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d38ca13b-f4c4-47f1-abd6-3289abbd5f64",
   "metadata": {},
   "outputs": [],
   "source": [
    "out.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6849b44d-df0d-464f-bbce-f8fc1f789d3a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# this component and following cells will not run without access to the DFCI private dataset\n",
    "\n",
    "import pandas as pd\n",
    "dfci_trials = pd.read_csv(\"../space_specific_eligibility_checks_11-6-24.csv\")\n",
    "dfci_trials.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "869690c3-2a80-4403-8933-f8f042c4ae35",
   "metadata": {},
   "outputs": [],
   "source": [
    "non_dfci_ctgov_trials = out[~out.nct_id.isin(dfci_trials.nct_id)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3d28ed5c-d152-40a0-ab14-4aa748f3f8ee",
   "metadata": {},
   "outputs": [],
   "source": [
    "non_dfci_ctgov_trials.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "efdaaf5b-edfb-4900-b85b-dde7eb1f92df",
   "metadata": {},
   "outputs": [],
   "source": [
    "unique_trials = non_dfci_ctgov_trials.groupby('nct_id').first().reset_index()[['nct_id']]\n",
    "unique_trials.shape[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "41a63a73-4822-4c1d-820d-389252c0c56f",
   "metadata": {},
   "outputs": [],
   "source": [
    "unique_trial_sample = unique_trials.nct_id.sample(n=500, random_state=42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4cbbffe7-72ca-45b4-a11f-bb2d278bcfb7",
   "metadata": {},
   "outputs": [],
   "source": [
    "sample_spaces = non_dfci_ctgov_trials[non_dfci_ctgov_trials.nct_id.isin(unique_trial_sample)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c2639cc4-3472-463c-8519-ce0a9a1d845c",
   "metadata": {},
   "outputs": [],
   "source": [
    "sample_spaces.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bc6def48-cacc-437b-ac19-2af9418821c2",
   "metadata": {},
   "outputs": [],
   "source": [
    "sample_spaces.to_csv('sample_spaces.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2b2370cf-e2ec-4e54-8dd0-6dde6d0fb041",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.18"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}