In [4]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
import re

In [2]:
# Target列を分割する関数
def split_target(target):
    # 指定された区切り文字で分割
    split_words = re.split(r'[,\n、・及びおよび又はまたは]+', target)
    # 空白文字を除外してリストとして返す
    return [word.strip() for word in split_words if word.strip()]

In [3]:
basedf = pd.read_csv('../ClinicalTrialCSV/JRCT20241202Cancer.csv', index_col=0)
basedf = basedf.dropna(subset=['試験等のフェーズ'])
# Target列を分割してTargetWord列を追加
basedf['TargetWord'] = basedf['Target'].apply(split_target)

In [5]:
# モデルとトークナイザーのロード
model_name = "intfloat/multilingual-e5-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/418 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/690 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

In [6]:
# テキストをベクトル化する関数
def encode_text(texts):
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1)  # 平均プーリングでベクトル化

In [66]:
# クエリ
query = "乳がん"
#query = "神経膠腫"
threshold = 0.91
#threshold = 0.875
# クエリをベクトル化
query_vec = encode_text([query])

In [67]:
query_vec.shape

torch.Size([1, 1024])

In [68]:
# ターゲットリスト全体を処理
matched_indices = []
target_vecs_list = []
cosine_scores_list = []
for idx, target_words in enumerate(basedf['TargetWord']):
    # ターゲット内の各単語をベクトル化
    target_vecs = encode_text(target_words)
    # コサイン類似度を計算
    cosine_scores = torch.nn.functional.cosine_similarity(query_vec, target_vecs)
    target_vecs_list.append(target_vecs)
    cosine_scores_list.append(cosine_scores)
    # 閾値を超えるか確認
    if (cosine_scores >= threshold).any():  # いずれかが閾値を超えている場合
        matched_indices.append(idx)

# 抽出
matched_df = basedf.iloc[matched_indices]
matched_df

Unnamed: 0,JRCT ID,NCT No,JapicCTI No,Title,Target,研究・治験の目的,試験等のフェーズ,試験の種類,無作為化,盲検化,...,purpose,Inclusion Criteria,Exclusion Criteria,Age Minimum,Age Maximum,Gender,Discontinuation Criteria,Keyword,Intervention(s),TargetWord
12,jRCT2041240130,,,中等度-高度PONV発症リスクを有する肺切除術予定患者の周術期管理におけるGranisetr...,肺がん,全身麻酔を要する肺切除術周術期における標準制吐療法であるグラニセトロン(Granisetro...,2,,無作為化比較,非盲検,...,treatment purpose,(1) Patients scheduled to undergo anatomical l...,(1) Patients with a history of allergy to the ...,18age old over,No limit,Both,,,Treatment group\r\nAdminister GRA 1mg intraven...,[肺がん]
59,jRCT2031240404,,,転移性HR+/HER2-乳がん及びその他の進行性固形がん患者を対象にCDK4阻害剤BGB-4...,・進行性固形腫瘍・進行性乳がん・転移性乳がん・ホルモン受容体陽性乳がん・ホルモン受容体陽性乳...,サイクリン依存性キナーゼ 4 (CDK4) 阻害剤である BGB-43395 が、ホルモン受...,1,,非無作為化比較,非盲検,...,treatment purpose,-Phase 1a (Dose Escalation) and 1b (Dose Expan...,-Prior therapy selectively targeting CDK4 (pri...,18age old over,No limit,Both,,,-Drug: BGB-43395\r\nPlanned doses administered...,"[進行性固形腫瘍, 進行性乳がん, 転移性乳がん, ホルモン受容体陽性乳がん, ホルモン受容..."
201,jRCT2052240059,,,遺伝子HSD17B4高メチル化を有するHER2陽性ER陰性乳癌における非手術療法の有用性を評...,乳がん,HSD17B4高メチル化（HSD17B4 hypermethylation：HH）を有するH...,2,,単一群,非盲検,...,diagnostic purpose,1. Histologically confirmed invasive breast ca...,1. History of other malignancy within the last...,20age old over,No limit,Female,,,Omitting breast surgery after preoperative che...,[乳がん]
234,jRCT2031240065,,,DNAミスマッチ修復機構欠損を有する進行性・転移性子宮体がん患者を対象に、一次治療として化学...,子宮体がん,dMMR/MSI-Hを有する再発又は進行子宮体癌患者を対象にdostarlimabの有効性及...,3,,無作為化比較,非盲検,...,treatment purpose,"1. Female patient is at least 18 years of age,...",1. Patient has received neoadjuvant/adjuvant s...,18age old over,No limit,Female,,,Patients will be randomized 1:1 to receive eit...,[子宮体がん]
257,jRCT2033240023,,,免疫チェックポイント阻害剤に対してacquired resistanceとなったがん患者を対...,肺がん、食道がん、胃癌等の固形がん,免疫チェックポイント阻害剤に対してacquired resistanceとなったがん患者（非...,1-2,,単一群,非盲検,...,treatment purpose,1. Japanese patients aged 20 years or older at...,1. Patients with a history of severe drug hype...,20age old over,No limit,Both,,,Intramuscular injection of the investigational...,"[肺がん, 食道がん, 胃癌等の固形がん]"
271,jRCT2031230750,NCT06188559,,治療歴のある HER2 陽性又は HER2 低発現の切除不能又は転移性乳癌の被験者を対象に，...,HER2 陽性又は HER2 低発現の切除不能又は転移性乳癌,治療,2,,無作為化比較,非盲検,...,treatment purpose,"(1) Male or female, aged >=18 years at the tim...","(1) Presence of brain or subdural metastases, ...",18age old over,No limit,Both,,,Generic Name:NA\r\nStudy Treatment in dose opt...,"[HER2 陽性, HER2 低発現の切除不能, 転移性乳癌]"
279,jRCT2031230723,NCT06112379,,未治療のトリプルネガティブ又はホルモン受容体低発現／HER2陰性乳癌の成人患者を対象として、...,乳癌,未治療のトリプルネガティブ又はホルモン受容体低発現／HER2陰性乳癌の成人患者を対象として、...,3,,無作為化比較,非盲検,...,treatment purpose,1. Participant must be >= 18 years at the time...,"1. As judged by the investigator, any evidence...",18age old over,No limit,Both,,,- Experimental arm: Dato-DXd plus durvalumab n...,[乳癌]
311,jRCT2061230102,NCT06103864,,Programmed death-ligand（PD-L1）陽性の局所再発手術不能又は転移性...,乳癌,PD-L1陽性の局所再発手術不能または転移性TNBC患者を対象に、デュルバルマブ併用または非...,3,,無作為化比較,非盲検,...,treatment purpose,Histologically or cytologically documented loc...,"As judged by investigator, severe or uncontrol...",18age old over,No limit,Both,,,Arm 1: Dato-DXd + durvalumab\r\nArm 2: Investi...,[乳癌]
396,jRCT2061230074,NCT05952557,,根治的局所治療（化学療法の併用または非併用）を受けて疾患の兆候のない、再発リスクが中間～高リ...,乳がん、早期乳がん,Treatment,3,,無作為化比較,非盲検,...,treatment purpose,- Women and Men; 18 years or more at the time ...,- Inoperable locally advanced or metastatic br...,18age old over,No limit,,,,arm A: continue with SoC ET as directed by inv...,"[乳がん, 早期乳がん]"
467,jRCT2031230249,NCT05753501,,[M23-647]B細胞性悪性腫瘍患者を対象としたBTK 分解誘導化合物ABBV-101 の...,血液がん,・ABBV-101 の安全性を明らかにし，忍容性を確認する\r\n・ABBV-101 の経口...,1,,単一群,非盲検,...,treatment purpose,-For Dose Escalation (Part 1) only: Participan...,-Previously treated with a Bruton's tyrosine k...,18age old over,No limit,Both,,,Drug: ABBV-101\r\nOral: Tablet,[血液がん]


In [69]:
cosine_scores_list

[tensor([0.8485]),
 tensor([0.8322, 0.8710]),
 tensor([0.8630]),
 tensor([0.8748, 0.8663]),
 tensor([0.9373]),
 tensor([0.8615, 0.8576]),
 tensor([0.8443]),
 tensor([0.8511]),
 tensor([0.8320, 0.8452]),
 tensor([0.8485]),
 tensor([0.8681]),
 tensor([0.8585]),
 tensor([0.8489]),
 tensor([0.8535]),
 tensor([0.9058]),
 tensor([0.8473, 0.9020]),
 tensor([0.8587]),
 tensor([0.8621]),
 tensor([0.8526]),
 tensor([0.8363]),
 tensor([0.8573]),
 tensor([0.7646, 0.7990]),
 tensor([0.9068, 0.8733, 0.8775]),
 tensor([0.8584]),
 tensor([0.8584]),
 tensor([0.8624]),
 tensor([0.8456]),
 tensor([0.8959]),
 tensor([0.8844]),
 tensor([0.8842, 0.9356, 0.9305, 0.9065, 0.8970]),
 tensor([0.8691, 0.8630, 0.8718]),
 tensor([0.8758]),
 tensor([0.8630]),
 tensor([0.8611]),
 tensor([0.8630]),
 tensor([0.8963]),
 tensor([0.8601]),
 tensor([0.8040, 0.8439]),
 tensor([0.8789, 0.8568]),
 tensor([0.8110, 0.8643, 0.8400, 0.9012]),
 tensor([0.8604]),
 tensor([0.8752, 0.8826, 0.8916, 0.8565]),
 tensor([0.9091, 0.8941, 0

In [70]:
# 全データのターゲット列をベクトル化
target_list = basedf['Target'].tolist()

target_vecs = encode_text(target_list)
    # コサイン類似度を計算
cosine_scores = torch.nn.functional.cosine_similarity(query_vec, target_vecs)

In [71]:
matched_indices_d = (cosine_scores >= threshold).nonzero().tolist()
# 入れ子リストをフラットなリストに変換
flat_indices_d = [idx[0] for idx in matched_indices_d]

# 抽出
matched_df_d = basedf.iloc[flat_indices_d]
matched_df_d


Unnamed: 0,JRCT ID,NCT No,JapicCTI No,Title,Target,研究・治験の目的,試験等のフェーズ,試験の種類,無作為化,盲検化,...,purpose,Inclusion Criteria,Exclusion Criteria,Age Minimum,Age Maximum,Gender,Discontinuation Criteria,Keyword,Intervention(s),TargetWord
12,jRCT2041240130,,,中等度-高度PONV発症リスクを有する肺切除術予定患者の周術期管理におけるGranisetr...,肺がん,全身麻酔を要する肺切除術周術期における標準制吐療法であるグラニセトロン(Granisetro...,2,,無作為化比較,非盲検,...,treatment purpose,(1) Patients scheduled to undergo anatomical l...,(1) Patients with a history of allergy to the ...,18age old over,No limit,Both,,,Treatment group\r\nAdminister GRA 1mg intraven...,[肺がん]
201,jRCT2052240059,,,遺伝子HSD17B4高メチル化を有するHER2陽性ER陰性乳癌における非手術療法の有用性を評...,乳がん,HSD17B4高メチル化（HSD17B4 hypermethylation：HH）を有するH...,2,,単一群,非盲検,...,diagnostic purpose,1. Histologically confirmed invasive breast ca...,1. History of other malignancy within the last...,20age old over,No limit,Female,,,Omitting breast surgery after preoperative che...,[乳がん]
279,jRCT2031230723,NCT06112379,,未治療のトリプルネガティブ又はホルモン受容体低発現／HER2陰性乳癌の成人患者を対象として、...,乳癌,未治療のトリプルネガティブ又はホルモン受容体低発現／HER2陰性乳癌の成人患者を対象として、...,3,,無作為化比較,非盲検,...,treatment purpose,1. Participant must be >= 18 years at the time...,"1. As judged by the investigator, any evidence...",18age old over,No limit,Both,,,- Experimental arm: Dato-DXd plus durvalumab n...,[乳癌]
311,jRCT2061230102,NCT06103864,,Programmed death-ligand（PD-L1）陽性の局所再発手術不能又は転移性...,乳癌,PD-L1陽性の局所再発手術不能または転移性TNBC患者を対象に、デュルバルマブ併用または非...,3,,無作為化比較,非盲検,...,treatment purpose,Histologically or cytologically documented loc...,"As judged by investigator, severe or uncontrol...",18age old over,No limit,Both,,,Arm 1: Dato-DXd + durvalumab\r\nArm 2: Investi...,[乳癌]
396,jRCT2061230074,NCT05952557,,根治的局所治療（化学療法の併用または非併用）を受けて疾患の兆候のない、再発リスクが中間～高リ...,乳がん、早期乳がん,Treatment,3,,無作為化比較,非盲検,...,treatment purpose,- Women and Men; 18 years or more at the time ...,- Inoperable locally advanced or metastatic br...,18age old over,No limit,,,,arm A: continue with SoC ET as directed by inv...,"[乳がん, 早期乳がん]"
509,jRCT2031230109,NCT05514054,,EMBER-4：2～5年間の術後内分泌療法による前治療歴を有する再発高リスクのER+、HER...,乳癌,早期乳癌患者を対象としたimlunestrantと標準的な内分泌療法の比較試験,3,,無作為化比較,非盲検,...,treatment purpose,"-Have a diagnosis of ER+, HER2- early-stage, r...",-Have any evidence of metastatic disease (incl...,18age old over,No limit,Both,,,-Drug: Imlunestrant\r\n Administered orally.\...,[乳癌]
515,jRCT2031230096,NCT05774951,,根治的局所領域療法（化学療法の併用または非併用）および標準補助内分泌療法（ET）を少なくとも...,乳がん、早期乳がん,Treatment,3,,無作為化比較,非盲検,...,treatment purpose,"- Women and Men, greater than or equal to 18 y...",- Inoperable locally advanced or metastatic br...,18age old over,No limit,Both,,,arm A: continue with SoC ET as directed by inv...,"[乳がん, 早期乳がん]"
610,jRCT2061220087,NCT05629585,,術前薬物療法後の外科的切除時に乳房及び／又は腋窩リンパ節に浸潤性残存病変を有するステージI～...,乳癌,術前薬物療法後の外科的切除時に乳房および／または腋窩リンパ節に浸潤性残存病変を有するI～II...,3,,無作為化比較,非盲検,...,treatment purpose,Participant must be >= 18 years at the time of...,Stage IV (metastatic) TNBC.\r\nHistory of prio...,18age old over,130age old under,Both,,,Arm 1: Dato-DXd 6 mg/kg IV Q3W x 8 cycles + Du...,[乳癌]
693,jRCT2031220276,NCT05307705,,PIK3CA H1047R変異を有する進行乳がん患者及びその他の固形がん患者を対象としたLO...,乳がん,LOXO-783 の単独投与及び他の抗がん剤との併用投与における 第 2 相試験の推奨用量、...,1,,単一群,非盲検,...,treatment purpose,-Have advanced breast cancer or another solid ...,-Medical Conditions\r\n -Colorectal cancer\r\n...,18age old over,No limit,Both,,,-Drug: LOXO-783\r\n Oral\r\n Other Name: LY384...,[乳がん]
865,jRCT2052210099,,,乳がんを有する成人女性及び健康成人女性を対象とした乳房用マイクロ波画像診断装置IGS-000...,乳がん,本治験の目的は，乳がん又はその疑いのある者，並びに乳がん又はその疑いがない者を対象に乳房画像...,2,,単一群,単盲検,...,screening,Breast cancer (including suspected) \r\n(1) On...,(1) There is trauma with bleeding on the breas...,20age old over,No limit,Female,,,The test using IGS-0001 will be performed twic...,[乳がん]
