In [1]:
import requests
from bs4 import BeautifulSoup
import unicodedata

def normalize_text(text):
    """Return ``text`` NFKC-normalized with whitespace collapsed.

    ``None`` is treated as an empty string. After NFKC normalization the
    string is split on whitespace and the pieces are re-joined with single
    spaces, which also strips leading and trailing whitespace.
    """
    folded = "" if text is None else unicodedata.normalize('NFKC', text)
    tokens = folded.split()
    return " ".join(tokens)

# Trial detail page to scrape, and a browser-like User-Agent header for the request.
url = "https://jrct.niph.go.jp/latest-detail/jRCT2051240150"
headers = {
    "User-Agent": "Mozilla/5.0"
}

# Download the page; raise_for_status() turns HTTP error codes into exceptions
# so they are handled by the same except clause as connection failures.
try:
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()
except requests.RequestException as e:
    print(f"URLリクエストに失敗しました: {url} - エラー: {e}")
    # Stop here on failure. SystemExit is raised explicitly instead of calling
    # the interactive `exit` helper (injected by `site`/IPython), which is not
    # a dependable way to halt a notebook cell; if it does not halt, the parse
    # step below would run with `response` undefined.
    raise SystemExit(1) from e

# Optionally dump response.text to a file to inspect the raw HTML:
# with open("debug_html.html", "w", encoding="utf-8") as f:
#     f.write(response.text)

# Parse the downloaded HTML; later cells query this `soup` object.
soup = BeautifulSoup(response.text, 'html.parser')


In [2]:
# Debug dump: show every <label> text both as-is and after normalize_text().
for label_tag in soup.find_all('label'):
    raw_label = label_tag.get_text()
    clean_label = normalize_text(raw_label)
    print(
        "-----",
        f"RAW: {raw_label!r}",
        f"NORMALIZED: {clean_label!r}",
        sep="\n",
    )


-----
RAW: '研究の種別'
NORMALIZED: '研究の種別'
-----
RAW: '治験の区分'
NORMALIZED: '治験の区分'
-----
RAW: '初回公表日'
NORMALIZED: '初回公表日'
-----
RAW: '最終公表日'
NORMALIZED: '最終公表日'
-----
RAW: '中止年月日'
NORMALIZED: '中止年月日'
-----
RAW: '観察期間終了日'
NORMALIZED: '観察期間終了日'
-----
RAW: '研究名称'
NORMALIZED: '研究名称'
-----
RAW: '平易な研究名称'
NORMALIZED: '平易な研究名称'
-----
RAW: '研究責任（代表）医師の氏名'
NORMALIZED: '研究責任(代表)医師の氏名'
-----
RAW: '研究責任（代表）医師の所属機関'
NORMALIZED: '研究責任(代表)医師の所属機関'
-----
RAW: '研究・治験の目的'
NORMALIZED: '研究・治験の目的'
-----
RAW: '試験のフェーズ'
NORMALIZED: '試験のフェーズ'
-----
RAW: '対象疾患名'
NORMALIZED: '対象疾患名'
-----
RAW: '進捗状況'
NORMALIZED: '進捗状況'
-----
RAW: '医薬品等の一般名称'
NORMALIZED: '医薬品等の一般名称'
-----
RAW: '販売名'
NORMALIZED: '販売名'
-----
RAW: '認定委員会の名称'
NORMALIZED: '認定委員会の名称'
-----
RAW: '認定番号'
NORMALIZED: '認定番号'
-----
RAW: '                        試験等の名称                   /                       Scientific Title（Acronym）                                  '
NORMALIZED: '試験等の名称 / Scientific Title(Acronym)'
-----
RAW: '                      平易な試験等の名称  

In [3]:
# Locate the first <label> whose normalized text contains the target string.
# `found_label` stays None when nothing matches; the next cell reads it.
target_jp = "対象疾患名"
found_label = None
for candidate in soup.find_all('label'):
    candidate_text = normalize_text(candidate.get_text())
    if target_jp not in candidate_text:
        continue
    found_label = candidate
    print("FOUND LABEL:")
    print("NORMALIZED:", candidate_text)
    break

if not found_label:
    print("対象疾患名 を含むラベルが見つかりませんでした。")


FOUND LABEL:
NORMALIZED: 対象疾患名


In [4]:
# Walk from the matched <label> up to its <th>, then to the enclosing <tr>,
# and print the normalized text of every <td> in that row.
if found_label:
    header_cell = found_label.find_parent('th')
    row = header_cell.find_parent('tr') if header_cell else None
    if not header_cell:
        print("Parent <th> not found.")
    elif not row:
        print("Parent <tr> not found.")
    else:
        cells = row.find_all('td')
        print("Number of TDs:", len(cells))
        for idx, cell in enumerate(cells):
            print(f"TD[{idx}]:", repr(normalize_text(cell.get_text())))


Number of TDs: 1
TD[0]: 'ラブドイド腫瘍'


In [5]:
def extract_label_data(label_text, label_en=None):
    """Return (jp_data, en_data) for the first <label> containing ``label_text``.

    Every <label> in the module-level ``soup`` is normalized with
    ``normalize_text`` and checked for ``label_text`` as a substring. Only the
    first match is examined: its <th> ancestor, that cell's <tr> ancestor and
    the row's <td> cells are looked up.

    Args:
        label_text: Japanese text to look for inside the label.
        label_en: Optional English label text. It is not used for matching;
            it only controls whether the second <td> is read.

    Returns:
        A ``(jp_data, en_data)`` tuple. ``jp_data`` is the normalized text of
        the first <td>. ``en_data`` is the normalized text of the second <td>
        when ``label_en`` is truthy and the row has more than one <td>,
        otherwise ``None``. ``(None, None)`` is returned when no label
        matches, or when the matched label has no <th>/<tr> ancestor or its
        row has no <td>.
    """
    not_found = (None, None)
    for label_tag in soup.find_all('label'):
        if label_text not in normalize_text(label_tag.get_text()):
            continue
        # First matching label decides the result; later labels are ignored.
        header_cell = label_tag.find_parent('th')
        row = header_cell.find_parent('tr') if header_cell else None
        if not row:
            return not_found
        cells = row.find_all('td')
        if not cells:
            return not_found
        jp_data = normalize_text(cells[0].get_text())
        en_data = None
        if label_en and len(cells) > 1:
            en_data = normalize_text(cells[1].get_text())
        return jp_data, en_data
    return not_found


In [6]:
# Try the lookup for 対象疾患名; this version of extract_label_data matches on the
# Japanese text only and takes the first label that contains it.
extract_label_data("対象疾患名", "Health Condition(s) or Problem(s) Studied")

('ラブドイド腫瘍', None)

In [7]:
# Same call as the previous cell, re-run with identical arguments.
extract_label_data("対象疾患名", "Health Condition(s) or Problem(s) Studied")

('ラブドイド腫瘍', None)

In [8]:
def extract_label_data(label_text, label_en=None):
    """Return (jp_data, en_data) for the first <label> matching the search key.

    When ``label_en`` is given, the search key is
    ``label_text + " / " + label_en``; when it is ``None`` the key is
    ``label_text`` alone. Every <label> in the module-level ``soup`` is
    normalized with ``normalize_text`` and checked for the key as a substring.
    Only the first match is examined: its <th> ancestor, that cell's <tr>
    ancestor and the row's <td> cells are looked up.

    Args:
        label_text: Japanese part of the label text.
        label_en: Optional English part of the label text.

    Returns:
        A ``(jp_data, en_data)`` tuple. ``jp_data`` is the normalized text of
        the first <td>. ``en_data`` is the normalized text of the second <td>
        when ``label_en`` is truthy and the row has more than one <td>,
        otherwise ``None``. ``(None, None)`` is returned when no label
        matches, or when the matched label has no <th>/<tr> ancestor or its
        row has no <td>.
    """
    # Build the key once, outside the loop. The None check matters because
    # concatenating the default label_en=None onto a string raises TypeError.
    if label_en is None:
        search_key = label_text
    else:
        search_key = label_text + " / " + label_en

    for label_tag in soup.find_all('label'):
        if search_key not in normalize_text(label_tag.get_text()):
            continue
        # Matching label found: walk <th> -> <tr> -> <td>.
        th = label_tag.find_parent('th')
        if not th:
            return None, None
        tr = th.find_parent('tr')
        if not tr:
            return None, None
        tds = tr.find_all('td')
        if len(tds) == 0:
            return None, None

        # tds is non-empty here, so the first cell can be read unconditionally.
        jp_data = normalize_text(tds[0].get_text())
        en_data = normalize_text(tds[1].get_text()) if label_en and len(tds) > 1 else None
        return jp_data, en_data
    return None, None


In [9]:
# Re-run the lookup with the redefined extract_label_data, which matches on the
# combined "<Japanese> / <English>" label text.
extract_label_data("対象疾患名", "Health Condition(s) or Problem(s) Studied")

('ラブドイド腫瘍', 'rhabdoid tumor')

In [10]:
def extract_label_data_all(label_text, label_en):
    """Collect (jp_data, en_data) pairs for every label matching the combined key.

    The key is ``label_text + " / " + label_en``. Every <label> in the
    module-level ``soup`` whose normalized text contains the key is followed
    up to its <th> and <tr> ancestors. Rows with at least two <td> cells
    contribute the normalized text of their first two cells; labels without
    those ancestors, or rows with fewer than two cells, are skipped.

    Args:
        label_text: Japanese part of the label text.
        label_en: English part of the label text.

    Returns:
        A list of ``(jp_data, en_data)`` tuples in document order; empty when
        nothing qualifies.
    """
    pairs = []
    search_key = label_text + " / " + label_en
    for label_tag in soup.find_all('label'):
        if search_key not in normalize_text(label_tag.get_text()):
            continue
        header_cell = label_tag.find_parent('th')
        row = header_cell.find_parent('tr') if header_cell else None
        cells = row.find_all('td') if row else []
        if len(cells) >= 2:
            jp_cell, en_cell = cells[0], cells[1]
            pairs.append(
                (normalize_text(jp_cell.get_text()), normalize_text(en_cell.get_text()))
            )
    return pairs

In [11]:
# Collect every (Japanese, English) pair whose label contains the combined text.
extract_label_data_all("対象疾患名", "Health Condition(s) or Problem(s) Studied") 

[('ラブドイド腫瘍', 'rhabdoid tumor')]

In [12]:
# Scan all extracted pairs and report those whose Japanese text mentions
# "H3 K27M" and whose English text mentions "Newly Diagnosed".
all_results = extract_label_data_all("対象疾患名", "Health Condition(s) or Problem(s) Studied")
for jp_value, en_value in all_results:
    is_wanted = "H3 K27M" in jp_value and "Newly Diagnosed" in en_value
    if is_wanted:
        print("Found desired pair:", jp_value, en_value)
        # A break (or a return, inside a function) could go here to stop at the first hit.
