Spaces:
Running
Running
高橋慧
commited on
Commit
·
c8ef2b0
1
Parent(s):
318221a
stage4
Browse files- OpenAITools/.ipynb_checkpoints/ECarteTools-checkpoint.py +0 -73
- OpenAITools/.ipynb_checkpoints/ExpertTools-checkpoint.py +0 -245
- OpenAITools/.ipynb_checkpoints/FetchTools-checkpoint.py +0 -158
- OpenAITools/.ipynb_checkpoints/scrapeThisData-checkpoint.py +0 -237
- OpenAITools/CrinicalTrialTools_old.py +0 -423
- OpenAITools/ECarteTools.py +0 -73
- OpenAITools/ExpertTools.py +0 -246
- OpenAITools/JRCTTools.py +0 -559
- OpenAITools/ReviewPaperTools.py +0 -42
- OpenAITools/scrapeThisData.py +0 -237
- README.md +155 -83
- app.py +163 -14
OpenAITools/.ipynb_checkpoints/ECarteTools-checkpoint.py
DELETED
@@ -1,73 +0,0 @@
|
|
1 |
-
import openai
|
2 |
-
import time
|
3 |
-
import wikipedia
|
4 |
-
import random
|
5 |
-
import re
|
6 |
-
import requests
|
7 |
-
from bs4 import BeautifulSoup
|
8 |
-
import os
|
9 |
-
import glob
|
10 |
-
from natsort import natsorted
|
11 |
-
import requests
|
12 |
-
from bs4 import BeautifulSoup
|
13 |
-
import xml.etree.ElementTree as ET
|
14 |
-
import pandas as pd
|
15 |
-
|
16 |
-
wikipedia.set_lang("ja")
|
17 |
-
# APIキーの設定
|
18 |
-
openai.api_key = os.environ['OPENAI_API_KEY']
|
19 |
-
engine="gpt-3.5-turbo"
|
20 |
-
|
21 |
-
|
22 |
-
def generate(system_template,prompt,engine="gpt-3.5-turbo"):
|
23 |
-
while True: #OpenAI APIが落ちてる時に無限リトライするので注意
|
24 |
-
try:
|
25 |
-
response = openai.ChatCompletion.create(
|
26 |
-
model=engine,
|
27 |
-
messages=[
|
28 |
-
{"role": "system", "content": system_template},
|
29 |
-
{"role": "user", "content":prompt},
|
30 |
-
]
|
31 |
-
)
|
32 |
-
result=response["choices"][0]["message"]["content"]
|
33 |
-
return result
|
34 |
-
except:
|
35 |
-
print("リトライ")
|
36 |
-
time.sleep(30)
|
37 |
-
pass
|
38 |
-
|
39 |
-
def generate_carte(prompt,engine="gpt-3.5-turbo"):
|
40 |
-
while True: #OpenAI APIが落ちてる時に無限リトライするので注意
|
41 |
-
try:
|
42 |
-
response = openai.ChatCompletion.create(
|
43 |
-
model=engine,
|
44 |
-
messages=[
|
45 |
-
{"role": "system", "content": "You are useful assistant"},
|
46 |
-
{"role": "user", "content":"%s\n・・・という患者と医師の会話をSOAP形式のカルテとして日本語で端的にまとめて下さい。各セクションはS),O), A),P)として下さい "%prompt},
|
47 |
-
]
|
48 |
-
)
|
49 |
-
result=response["choices"][0]["message"]["content"]
|
50 |
-
return result
|
51 |
-
except:
|
52 |
-
print("リトライ")
|
53 |
-
time.sleep(30)
|
54 |
-
pass
|
55 |
-
|
56 |
-
def get_selected_fileds(texts):
|
57 |
-
input_name = texts.replace(' ' , "+")
|
58 |
-
corona_fields = ct.get_study_fields(
|
59 |
-
search_expr="%s SEARCH[Location](AREA[LocationCountry]Japan AND AREA[LocationStatus]Recruiting)"%(input_name),
|
60 |
-
fields=["NCTId", "Condition", "BriefTitle",'BriefSummary','EligibilityCriteria'],
|
61 |
-
max_studies=500,
|
62 |
-
fmt="csv")
|
63 |
-
return corona_fields
|
64 |
-
|
65 |
-
def get_retriever_str(fields):
|
66 |
-
retriever_str=''
|
67 |
-
for i in range(1,len(fields)):
|
68 |
-
colnames = fields[0]
|
69 |
-
targetCol = fields[i]
|
70 |
-
for f in range(len(fields[0])):
|
71 |
-
retriever_str+=colnames[f] + ":" + targetCol[f] +"\n"
|
72 |
-
retriever_str+='\n'
|
73 |
-
return retriever_str
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
OpenAITools/.ipynb_checkpoints/ExpertTools-checkpoint.py
DELETED
@@ -1,245 +0,0 @@
|
|
1 |
-
import os
|
2 |
-
import openai
|
3 |
-
import time
|
4 |
-
import wikipedia
|
5 |
-
import random
|
6 |
-
import re
|
7 |
-
import requests
|
8 |
-
from bs4 import BeautifulSoup
|
9 |
-
import os
|
10 |
-
import glob
|
11 |
-
from natsort import natsorted
|
12 |
-
import requests
|
13 |
-
from bs4 import BeautifulSoup
|
14 |
-
import xml.etree.ElementTree as ET
|
15 |
-
from pytrials.client import ClinicalTrials
|
16 |
-
from Bio import Entrez
|
17 |
-
import pandas as pd
|
18 |
-
import numpy as np
|
19 |
-
import time
|
20 |
-
#from langchain.agents import create_pandas_dataframe_agent
|
21 |
-
from langchain_experimental.agents import create_pandas_dataframe_agent
|
22 |
-
from langchain.llms import OpenAI
|
23 |
-
|
24 |
-
# APIキーの設定
|
25 |
-
openai.api_key = os.environ['OPENAI_API_KEY']
|
26 |
-
gptengine="gpt-3.5-turbo"
|
27 |
-
|
28 |
-
|
29 |
-
"""def get_selected_fileds(texts):
|
30 |
-
ct = ClinicalTrials()
|
31 |
-
input_name = texts.replace(' ' , "+")
|
32 |
-
corona_fields = ct.get_study_fields(
|
33 |
-
search_expr="%s SEARCH[Location](AREA[LocationCountry]Japan AND AREA[LocationStatus]Recruiting)"%(input_name),
|
34 |
-
fields=["NCTId", "Condition", "BriefTitle",'BriefSummary','EligibilityCriteria'],
|
35 |
-
max_studies=500,
|
36 |
-
fmt="csv")
|
37 |
-
return corona_fields"""
|
38 |
-
|
39 |
-
def get_retriever_str(fields):
|
40 |
-
retriever_str=''
|
41 |
-
for i in range(1,len(fields)):
|
42 |
-
colnames = fields[0]
|
43 |
-
targetCol = fields[i]
|
44 |
-
for f in range(len(fields[0])):
|
45 |
-
retriever_str+=colnames[f] + ":" + targetCol[f] +"\n"
|
46 |
-
retriever_str+='\n'
|
47 |
-
return retriever_str
|
48 |
-
|
49 |
-
def get_chanked_retriever(fields):
|
50 |
-
retriever_list =[]
|
51 |
-
for i in range(1,len(fields)):
|
52 |
-
retriever_str=''
|
53 |
-
colnames = fields[0]
|
54 |
-
targetCol = fields[i]
|
55 |
-
for f in range(len(fields[0])):
|
56 |
-
retriever_str+=colnames[f] + ":" + targetCol[f] +"\n"
|
57 |
-
retriever_list.append(retriever_str)
|
58 |
-
return retriever_list
|
59 |
-
|
60 |
-
from pytrials.client import ClinicalTrials
|
61 |
-
def get_selected_fields(texts, split_criteria=False,
|
62 |
-
split_word_number = False, split_number=700):
|
63 |
-
ct = ClinicalTrials()
|
64 |
-
input_name = texts.replace(' ', "+")
|
65 |
-
corona_fields = ct.get_study_fields(
|
66 |
-
search_expr="%s SEARCH[Location](AREA[LocationCountry]Japan AND AREA[LocationStatus]Recruiting)" % (input_name),
|
67 |
-
fields=["NCTId", "Condition", "BriefTitle", 'BriefSummary', 'EligibilityCriteria'],
|
68 |
-
max_studies=500,
|
69 |
-
fmt="csv")
|
70 |
-
|
71 |
-
if split_criteria:
|
72 |
-
new_fields = []
|
73 |
-
|
74 |
-
# 検索対象の文字列
|
75 |
-
target_string1 = 'Exclusion Criteria'
|
76 |
-
target_string2 = 'Exclusion criteria'
|
77 |
-
|
78 |
-
# 各要素で検索対象の文字列を探し、直前で分割して新しいリストに格納
|
79 |
-
for corona_field in corona_fields:
|
80 |
-
new_list = []
|
81 |
-
for item in corona_field:
|
82 |
-
if target_string1 in item:
|
83 |
-
split_position = item.index(target_string1)
|
84 |
-
new_list.append(item[:split_position])
|
85 |
-
new_list.append(item[split_position:])
|
86 |
-
elif target_string2 in item:
|
87 |
-
split_position = item.index(target_string2)
|
88 |
-
new_list.append(item[:split_position])
|
89 |
-
new_list.append(item[split_position:])
|
90 |
-
else:
|
91 |
-
new_list.append(item)
|
92 |
-
new_fields.append(new_list)
|
93 |
-
else:
|
94 |
-
new_fields = corona_fields
|
95 |
-
|
96 |
-
if split_word_number:
|
97 |
-
split_fields = []
|
98 |
-
for new_field in new_fields:
|
99 |
-
new_list= []
|
100 |
-
|
101 |
-
# 各要素を調べて、700文字以上であれば分割し、新しいリストに格納
|
102 |
-
for item in new_field:
|
103 |
-
item_length = len(item)
|
104 |
-
if item_length > split_number:
|
105 |
-
num_parts = -(-item_length // split_number) # 向上の除算を用いて分割数を計算
|
106 |
-
for i in range(num_parts):
|
107 |
-
start_index = i * split_number
|
108 |
-
end_index = min((i + 1) * split_number, item_length) # 文字列の終わりを超えないように調整
|
109 |
-
new_list.append(item[start_index:end_index])
|
110 |
-
else:
|
111 |
-
new_list.append(item)
|
112 |
-
|
113 |
-
split_fields.append(new_list)
|
114 |
-
new_fields = split_fields
|
115 |
-
|
116 |
-
return new_fields
|
117 |
-
|
118 |
-
|
119 |
-
def print_agent_results(df, Ids,
|
120 |
-
interesteds = ['Condition', 'BriefTitle', 'BriefSummary', 'EligibilityCriteria'],
|
121 |
-
translater=None):
|
122 |
-
results = ""
|
123 |
-
for Id in Ids:
|
124 |
-
print("%s\n"%Id)
|
125 |
-
sdf = df[df['NCTId'] == Id]
|
126 |
-
for interested in interesteds:
|
127 |
-
# 最初の要素を取得
|
128 |
-
results += '%s: \n %s \n' % (interested, sdf[interested].iloc[0])
|
129 |
-
#print('%s: \n %s \n' % (interested, sdf[interested].iloc[0]))
|
130 |
-
if translater:
|
131 |
-
to_be_printed = translater.translate(results)
|
132 |
-
else:
|
133 |
-
to_be_printed =results
|
134 |
-
print(to_be_printed)
|
135 |
-
|
136 |
-
def search(query):
|
137 |
-
Entrez.email = os.getenv('MAIL_ADRESS')
|
138 |
-
#Entrez.email='[email protected]'
|
139 |
-
handle = Entrez.esearch(db='pubmed',
|
140 |
-
sort = 'relevance',
|
141 |
-
retmax = '20',
|
142 |
-
retmode = 'xml',
|
143 |
-
term = query)
|
144 |
-
results = Entrez.read(handle)
|
145 |
-
return results
|
146 |
-
|
147 |
-
def fetch_details(id_list):
|
148 |
-
ids = ','.join(id_list)
|
149 |
-
Entrez.email = os.getenv('MAIL_ADRESS')
|
150 |
-
#Entrez.email = '[email protected]'
|
151 |
-
handle = Entrez.efetch(db = 'pubmed',
|
152 |
-
retmode = 'xml',
|
153 |
-
id = ids)
|
154 |
-
results = Entrez.read(handle)
|
155 |
-
return results
|
156 |
-
'''def generate(prompt,engine=None):
|
157 |
-
if engine is None:
|
158 |
-
engine=gptengine
|
159 |
-
while True: #OpenAI APIが落ちてる時に無限リトライするので注意
|
160 |
-
try:
|
161 |
-
response = openai.ChatCompletion.create(
|
162 |
-
model=engine,
|
163 |
-
messages=[
|
164 |
-
{"role": "system", "content": "You are useful assistant"},
|
165 |
-
{"role": "user", "content":prompt},
|
166 |
-
]
|
167 |
-
)
|
168 |
-
result=response["choices"][0]["message"]["content"]
|
169 |
-
return result
|
170 |
-
except Exception as e:
|
171 |
-
print(e)
|
172 |
-
print("リトライ")
|
173 |
-
time.sleep(30)
|
174 |
-
pass
|
175 |
-
'''
|
176 |
-
|
177 |
-
def generate(prompt,engine=None):
|
178 |
-
if engine is None:
|
179 |
-
engine=gptengine
|
180 |
-
while True: #OpenAI APIが落ちてる時に無限リトライするので注意
|
181 |
-
try:
|
182 |
-
response = openai.chat.completions.create(
|
183 |
-
model=engine,
|
184 |
-
messages=[
|
185 |
-
{"role": "system", "content": "You are useful assistant"},
|
186 |
-
{"role": "user", "content":prompt},
|
187 |
-
]
|
188 |
-
)
|
189 |
-
#result=response["choices"][0]["message"]["content"]
|
190 |
-
result=response.choices[0].message.content
|
191 |
-
return result
|
192 |
-
except Exception as e:
|
193 |
-
print(e)
|
194 |
-
print("リトライ")
|
195 |
-
time.sleep(30)
|
196 |
-
pass
|
197 |
-
|
198 |
-
def GetPubmedSummaryDf(studies):
|
199 |
-
title_list= []
|
200 |
-
abstract_list=[]
|
201 |
-
journal_list = []
|
202 |
-
language_list =[]
|
203 |
-
pubdate_year_list = []
|
204 |
-
pubdate_month_list = []
|
205 |
-
studiesIdList = studies['IdList']
|
206 |
-
chunk_size = 10000
|
207 |
-
for chunk_i in range(0, len(studiesIdList), chunk_size):
|
208 |
-
chunk = studiesIdList[chunk_i:chunk_i + chunk_size]
|
209 |
-
|
210 |
-
try:
|
211 |
-
papers = fetch_details(chunk)
|
212 |
-
for i, paper in enumerate(papers['PubmedArticle']):
|
213 |
-
title_list.append(paper['MedlineCitation']['Article']['ArticleTitle'])
|
214 |
-
try:
|
215 |
-
abstract_list.append(paper['MedlineCitation']['Article']['Abstract']['AbstractText'][0])
|
216 |
-
except:
|
217 |
-
abstract_list.append('No Abstract')
|
218 |
-
journal_list.append(paper['MedlineCitation']['Article']['Journal']['Title'])
|
219 |
-
language_list.append(paper['MedlineCitation']['Article']['Language'][0])
|
220 |
-
try:
|
221 |
-
pubdate_year_list.append(paper['MedlineCitation']['Article']['Journal']['JournalIssue']['PubDate']['Year'])
|
222 |
-
except:
|
223 |
-
pubdate_year_list.append('No Data')
|
224 |
-
try:
|
225 |
-
pubdate_month_list.append(paper['MedlineCitation']['Article']['Journal']['JournalIssue']['PubDate']['Month'])
|
226 |
-
except:
|
227 |
-
pubdate_month_list.append('No Data')
|
228 |
-
except: # occasionally a chunk might annoy your parser
|
229 |
-
pass
|
230 |
-
df = pd.DataFrame(list(zip(
|
231 |
-
title_list, abstract_list, journal_list, language_list, pubdate_year_list,
|
232 |
-
pubdate_month_list)),
|
233 |
-
columns=['Title', 'Abstract', 'Journal', 'Language', 'Year','Month'])
|
234 |
-
return df, abstract_list
|
235 |
-
|
236 |
-
def ClinicalAgent(fileds, verbose=False):
|
237 |
-
df = pd.DataFrame.from_records(fileds[1:], columns=fileds[0])
|
238 |
-
return create_pandas_dataframe_agent(OpenAI(temperature=0, model='gpt-3.5-turbo-16k'), df, verbose=verbose)
|
239 |
-
|
240 |
-
def GetNCTID(results):
|
241 |
-
# NCTで始まる単語を検索する正規表現
|
242 |
-
pattern = r'\bNCT\d+\b'
|
243 |
-
# 正規表現を使って単語を抽出
|
244 |
-
nct_words = re.findall(pattern,results)
|
245 |
-
return nct_words
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
OpenAITools/.ipynb_checkpoints/FetchTools-checkpoint.py
DELETED
@@ -1,158 +0,0 @@
|
|
1 |
-
import os
|
2 |
-
import pandas as pd
|
3 |
-
#from llama_index.llms.replicate import Replicate
|
4 |
-
import requests
|
5 |
-
import re
|
6 |
-
|
7 |
-
|
8 |
-
def extract_japan_cities(text):
|
9 |
-
# 正規表現を使用して " - Japan" で終わる都市名を抽出
|
10 |
-
pattern = r'(\b\w+\s*\w*\b) - Japan'
|
11 |
-
cities = re.findall(pattern, text)
|
12 |
-
unique_cities = list(set(cities))
|
13 |
-
# ユニークな都市名をソートしてカンマで区切られた文字列に変換
|
14 |
-
unique_cities.sort()
|
15 |
-
return ', '.join(unique_cities)
|
16 |
-
|
17 |
-
def fetch_clinical_trials(cancer_name):
|
18 |
-
search_expr="%s SEARCH[Location](AREA[LocationCountry]Japan AND AREA[LocationStatus]Recruiting)" % (cancer_name)
|
19 |
-
# Initial URL for the first API call
|
20 |
-
base_url = "https://clinicaltrials.gov/api/v2/studies"
|
21 |
-
params = {
|
22 |
-
"query.titles": search_expr,
|
23 |
-
"pageSize": 100
|
24 |
-
}
|
25 |
-
|
26 |
-
# Initialize an empty list to store the data
|
27 |
-
data_list = []
|
28 |
-
# Loop until there is no nextPageToken
|
29 |
-
while True:
|
30 |
-
# Print the current URL (for debugging purposes)
|
31 |
-
print("Fetching data from:", base_url + '?' + '&'.join([f"{k}={v}" for k, v in params.items()]))
|
32 |
-
|
33 |
-
# Send a GET request to the API
|
34 |
-
response = requests.get(base_url, params=params)
|
35 |
-
|
36 |
-
# Check if the request was successful
|
37 |
-
if response.status_code == 200:
|
38 |
-
data = response.json() # Parse JSON response
|
39 |
-
studies = data.get('studies', []) # Extract the list of studies
|
40 |
-
|
41 |
-
# Loop through each study and extract specific information
|
42 |
-
for study in studies:
|
43 |
-
# Safely access nested keys
|
44 |
-
nctId = study['protocolSection']['identificationModule'].get('nctId', 'Unknown')
|
45 |
-
startDate = study['protocolSection']['statusModule'].get('startDateStruct', {}).get('date', 'Unknown Date')
|
46 |
-
conditions = ', '.join(study['protocolSection']['conditionsModule'].get('conditions', ['No conditions listed']))
|
47 |
-
title = study['protocolSection']['identificationModule'].get('briefTitle', 'no title')
|
48 |
-
summary = study['protocolSection']['descriptionModule'].get('briefSummary', 'no summary')
|
49 |
-
|
50 |
-
# Extract locations safely
|
51 |
-
locations_list = study['protocolSection'].get('contactsLocationsModule', {}).get('locations', [])
|
52 |
-
locations = ', '.join([f"{location.get('city', 'No City')} - {location.get('country', 'No Country')}" for location in locations_list]) if locations_list else "No locations listed"
|
53 |
-
|
54 |
-
JapanesLocations = extract_japan_cities(locations)
|
55 |
-
# Extract dates and phases
|
56 |
-
primaryCompletionDate = study['protocolSection']['statusModule'].get('primaryCompletionDateStruct', {}).get('date', 'Unknown Date')
|
57 |
-
|
58 |
-
phases = ', '.join(study['protocolSection']['designModule'].get('phases', ['Not Available']))
|
59 |
-
eligibilityCriteria = study['protocolSection']['eligibilityModule'].get('eligibilityCriteria', 'Unknown')
|
60 |
-
|
61 |
-
# Append the data to the list as a dictionary
|
62 |
-
data_list.append({
|
63 |
-
"NCTID": nctId,
|
64 |
-
"Title": title,
|
65 |
-
#"Start Date": startDate,
|
66 |
-
"Primary Completion Date": primaryCompletionDate,
|
67 |
-
#"Conditions": conditions,
|
68 |
-
"Cancer": conditions,
|
69 |
-
"Summary": summary,
|
70 |
-
"Japanes Locations": JapanesLocations,
|
71 |
-
#"Phases": phases,
|
72 |
-
"Eligibility Criteria": eligibilityCriteria
|
73 |
-
})
|
74 |
-
|
75 |
-
# Check for nextPageToken and update the params or break the loop
|
76 |
-
nextPageToken = data.get('nextPageToken')
|
77 |
-
if nextPageToken:
|
78 |
-
params['pageToken'] = nextPageToken # Set the pageToken for the next request
|
79 |
-
else:
|
80 |
-
break # Exit the loop if no nextPageToken is present
|
81 |
-
else:
|
82 |
-
print("Failed to fetch data. Status code:", response.status_code)
|
83 |
-
break
|
84 |
-
|
85 |
-
# Create a DataFrame from the list of dictionaries
|
86 |
-
df = pd.DataFrame(data_list)
|
87 |
-
return df
|
88 |
-
|
89 |
-
def fetch_clinical_trials_jp(cancer_name):
|
90 |
-
search_expr="%s SEARCH[Location](AREA[LocationCountry]Japan AND AREA[LocationStatus]Recruiting)" % (cancer_name)
|
91 |
-
# Initial URL for the first API call
|
92 |
-
base_url = "https://clinicaltrials.gov/api/v2/studies"
|
93 |
-
params = {
|
94 |
-
"query.titles": search_expr,
|
95 |
-
"pageSize": 100
|
96 |
-
}
|
97 |
-
|
98 |
-
# Initialize an empty list to store the data
|
99 |
-
data_list = []
|
100 |
-
# Loop until there is no nextPageToken
|
101 |
-
while True:
|
102 |
-
# Print the current URL (for debugging purposes)
|
103 |
-
print("Fetching data from:", base_url + '?' + '&'.join([f"{k}={v}" for k, v in params.items()]))
|
104 |
-
|
105 |
-
# Send a GET request to the API
|
106 |
-
response = requests.get(base_url, params=params)
|
107 |
-
|
108 |
-
# Check if the request was successful
|
109 |
-
if response.status_code == 200:
|
110 |
-
data = response.json() # Parse JSON response
|
111 |
-
studies = data.get('studies', []) # Extract the list of studies
|
112 |
-
|
113 |
-
# Loop through each study and extract specific information
|
114 |
-
for study in studies:
|
115 |
-
# Safely access nested keys
|
116 |
-
nctId = study['protocolSection']['identificationModule'].get('nctId', 'Unknown')
|
117 |
-
startDate = study['protocolSection']['statusModule'].get('startDateStruct', {}).get('date', 'Unknown Date')
|
118 |
-
conditions = ', '.join(study['protocolSection']['conditionsModule'].get('conditions', ['No conditions listed']))
|
119 |
-
title = study['protocolSection']['identificationModule'].get('briefTitle', 'no title')
|
120 |
-
summary = study['protocolSection']['descriptionModule'].get('briefSummary', 'no summary')
|
121 |
-
|
122 |
-
# Extract locations safely
|
123 |
-
locations_list = study['protocolSection'].get('contactsLocationsModule', {}).get('locations', [])
|
124 |
-
locations = ', '.join([f"{location.get('city', 'No City')} - {location.get('country', 'No Country')}" for location in locations_list]) if locations_list else "No locations listed"
|
125 |
-
|
126 |
-
JapanesLocations = extract_japan_cities(locations)
|
127 |
-
# Extract dates and phases
|
128 |
-
primaryCompletionDate = study['protocolSection']['statusModule'].get('primaryCompletionDateStruct', {}).get('date', 'Unknown Date')
|
129 |
-
|
130 |
-
phases = ', '.join(study['protocolSection']['designModule'].get('phases', ['Not Available']))
|
131 |
-
eligibilityCriteria = study['protocolSection']['eligibilityModule'].get('eligibilityCriteria', 'Unknown')
|
132 |
-
|
133 |
-
# Append the data to the list as a dictionary
|
134 |
-
data_list.append({
|
135 |
-
"NCTID": nctId,
|
136 |
-
"タイトル": title,
|
137 |
-
#"Start Date": startDate,
|
138 |
-
#"Primary Completion Date": primaryCompletionDate,
|
139 |
-
"対象となる癌": conditions,
|
140 |
-
"サマリー": summary,
|
141 |
-
"場所": JapanesLocations,
|
142 |
-
#"Phases": phases,
|
143 |
-
"クライテリア": eligibilityCriteria
|
144 |
-
})
|
145 |
-
|
146 |
-
# Check for nextPageToken and update the params or break the loop
|
147 |
-
nextPageToken = data.get('nextPageToken')
|
148 |
-
if nextPageToken:
|
149 |
-
params['pageToken'] = nextPageToken # Set the pageToken for the next request
|
150 |
-
else:
|
151 |
-
break # Exit the loop if no nextPageToken is present
|
152 |
-
else:
|
153 |
-
print("Failed to fetch data. Status code:", response.status_code)
|
154 |
-
break
|
155 |
-
|
156 |
-
# Create a DataFrame from the list of dictionaries
|
157 |
-
df = pd.DataFrame(data_list)
|
158 |
-
return df
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
OpenAITools/.ipynb_checkpoints/scrapeThisData-checkpoint.py
DELETED
@@ -1,237 +0,0 @@
|
|
1 |
-
from selenium import webdriver
|
2 |
-
from selenium.webdriver.support.ui import Select
|
3 |
-
from selenium.webdriver.common.by import By
|
4 |
-
|
5 |
-
import requests
|
6 |
-
from bs4 import BeautifulSoup
|
7 |
-
import re
|
8 |
-
|
9 |
-
import os
|
10 |
-
import time
|
11 |
-
|
12 |
-
from selenium.webdriver.support.ui import WebDriverWait
|
13 |
-
from selenium.webdriver.common.by import By
|
14 |
-
from selenium.webdriver.support import expected_conditions as EC
|
15 |
-
from selenium.webdriver.common.action_chains import ActionChains
|
16 |
-
import chromedriver_autoinstaller
|
17 |
-
|
18 |
-
class ScrapeThatData:
|
19 |
-
|
20 |
-
def __init__(self, time_threshold = 10):
|
21 |
-
|
22 |
-
try:
|
23 |
-
chrome_options = webdriver.ChromeOptions()
|
24 |
-
chrome_options.add_argument('--no-sandbox')
|
25 |
-
self.driver = webdriver.Chrome(options=chrome_options)
|
26 |
-
|
27 |
-
except:
|
28 |
-
chromedriver_autoinstaller.install()
|
29 |
-
chrome_options = webdriver.ChromeOptions()
|
30 |
-
chrome_options.add_argument('--no-sandbox')
|
31 |
-
self.driver = webdriver.Chrome(options=chrome_options)
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
self.wait = WebDriverWait(self.driver,time_threshold)
|
36 |
-
self.attribute_dict = {'status':1 ,'conditions':2, 'interventions': 3, 'study type':4,
|
37 |
-
'phase':5, 'sponsor':6, 'funder type':7 , 'study design': 8,
|
38 |
-
'outcome measures':9, 'number enrolled':10, 'sex':11, 'age':12,
|
39 |
-
'nct number': 13, 'other ids':14, 'title acronym': 15 , 'study start': 16,
|
40 |
-
'primary completion': 17, 'study completion': 18 , 'first posted': 19,
|
41 |
-
'last update posted': 20 , 'results first posted': 21 , 'locations':22, 'study documents': 23}
|
42 |
-
|
43 |
-
self.status_dict = {'not yet recruiting' : 'notYetRecrCB',
|
44 |
-
'recruiting' : 'recruitingCB',
|
45 |
-
'enrolling by invitation':'enrollingByInvCB',
|
46 |
-
'active, not recruiting': 'activeCB',
|
47 |
-
'suspended': 'suspendedCB',
|
48 |
-
'terminated':'terminatedCB',
|
49 |
-
'completed':'completedCB',
|
50 |
-
'withdrawn': 'withdrawnCB',
|
51 |
-
'unknown status': 'unknownCB'}
|
52 |
-
|
53 |
-
def clicking_show_hide_cols(self, driver):
|
54 |
-
columns = driver.find_element(By.XPATH,'//*[@id="theDataTable_wrapper"]/div[3]/button')
|
55 |
-
action_chain = ActionChains(driver)
|
56 |
-
action_chain.move_to_element(columns).click()
|
57 |
-
action_chain.perform()
|
58 |
-
|
59 |
-
def select_attributes_to_show(self, listed_attributes, attribute_dict):
|
60 |
-
ll = [value.lower() for value in listed_attributes if value.lower() in ['status', 'conditions', 'interventions', 'locations']]
|
61 |
-
if ll:
|
62 |
-
to_show = [value.lower() for value in listed_attributes if value.lower() not in ll]
|
63 |
-
to_hide = [value for value in ['status', 'conditions', 'interventions', 'locations'] if value not in ll]
|
64 |
-
to_click = to_hide + to_show
|
65 |
-
for att in to_click:
|
66 |
-
self.clicking_show_hide_cols(self.driver)
|
67 |
-
time.sleep(1)
|
68 |
-
self.wait.until(EC.presence_of_element_located((By.XPATH,'//*[@id="theDataTable_wrapper"]/div[3]/div[2]/button['+ str(attribute_dict[att]) + ']'))).click()
|
69 |
-
time.sleep(1)
|
70 |
-
else:
|
71 |
-
for att in listed_attributes:
|
72 |
-
self.clicking_show_hide_cols(self.driver)
|
73 |
-
time.sleep(1)
|
74 |
-
self.wait.until(EC.presence_of_element_located((By.XPATH,'//*[@id="theDataTable_wrapper"]/div[3]/div[2]/button['+ str(attribute_dict[att.lower()]) + ']'))).click()
|
75 |
-
time.sleep(1)
|
76 |
-
|
77 |
-
def select_by_status(self, listed_states, status_dict):
|
78 |
-
if listed_states:
|
79 |
-
for status in listed_states:
|
80 |
-
self.driver.find_element(By.ID,status_dict[status.lower()]).click()
|
81 |
-
|
82 |
-
self.driver.find_element(By.XPATH,'//*[@id="FiltersBody"]/div[1]/input[1]').click()
|
83 |
-
time.sleep(3)
|
84 |
-
|
85 |
-
|
86 |
-
select = Select(self.driver.find_element_by_name('theDataTable_length'))
|
87 |
-
select.select_by_value('100')
|
88 |
-
|
89 |
-
def collect_data_search_page(self,l_ordered, amount_of_data = None):
|
90 |
-
|
91 |
-
class_name = ''
|
92 |
-
page_index = 1
|
93 |
-
|
94 |
-
elements = [l_ordered]
|
95 |
-
|
96 |
-
while 'disabled' not in class_name :
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
time.sleep(10)
|
101 |
-
|
102 |
-
print('Getting data from page {}'.format(page_index))
|
103 |
-
|
104 |
-
#Counting how many rows of the table appear
|
105 |
-
table = self.driver.find_element(By.ID,'theDataTable')
|
106 |
-
row_count = len(table.find_elements(By.TAG_NAME,"tr"))
|
107 |
-
|
108 |
-
#Looping table page
|
109 |
-
for index in range(1, row_count):
|
110 |
-
row = []
|
111 |
-
if 'status' in l_ordered:
|
112 |
-
self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,'#theDataTable > tbody > tr:nth-child('+str(index)+') > td:nth-child(3)')))
|
113 |
-
status_element = self.driver.find_elements(By.CLASS_NAME,'#theDataTable > tbody > tr:nth-child('+str(index)+') > td:nth-child(3) > span')
|
114 |
-
row.append(status_element.text.strip())
|
115 |
-
for i, val in enumerate(l_ordered):
|
116 |
-
if val == 'status':
|
117 |
-
continue
|
118 |
-
|
119 |
-
self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,'#theDataTable > tbody > tr:nth-child('+str(index)+') > td:nth-child('+str(4+i)+')')))
|
120 |
-
element = self.driver.find_elements(By.CLASS_NAME,'#theDataTable > tbody > tr:nth-child('+str(index)+') > td:nth-child('+str(4+i)+')')
|
121 |
-
try:
|
122 |
-
row.append(element.text.strip())
|
123 |
-
except:
|
124 |
-
print(i, element)
|
125 |
-
else:
|
126 |
-
for i, val in enumerate(l_ordered):
|
127 |
-
self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,'#theDataTable > tbody > tr:nth-child('+str(index)+') > td:nth-child('+str(3+i)+')')))
|
128 |
-
element = self.driver.find_elements(By.CLASS_NAME,'#theDataTable > tbody > tr:nth-child('+str(index)+') > td:nth-child('+str(3+i)+')')
|
129 |
-
try:
|
130 |
-
row.append(element.text.strip())
|
131 |
-
except:
|
132 |
-
print(i, element)
|
133 |
-
elements.append(row)
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
#Getting next page button
|
139 |
-
next_page= self.driver.find_element(By.ID,"theDataTable_next")
|
140 |
-
|
141 |
-
#Getting the class attribute of the next page button
|
142 |
-
class_name = next_page.get_attribute('class')
|
143 |
-
|
144 |
-
#Going to the next page
|
145 |
-
next_page.click()
|
146 |
-
page_index += 1
|
147 |
-
|
148 |
-
if amount_of_data:
|
149 |
-
if len(elements) >= amount_of_data or row_count < amount_of_data :
|
150 |
-
break
|
151 |
-
else:
|
152 |
-
continue
|
153 |
-
|
154 |
-
return elements
|
155 |
-
|
156 |
-
def get_criteria(self, NCTnumber):
|
157 |
-
|
158 |
-
url = 'https://clinicaltrials.gov/ct2/show/' + NCTnumber
|
159 |
-
ClinicalTrialpage = requests.get(url)
|
160 |
-
soup = BeautifulSoup(ClinicalTrialpage.text, 'html.parser')
|
161 |
-
|
162 |
-
wrapping_crit_class = soup.find_all("div", {"class": "tr-indent2"})
|
163 |
-
list_elements = wrapping_crit_class[1].find_all(re.compile("(ul|ol)"))
|
164 |
-
inclusion, exclusion = ('','')
|
165 |
-
|
166 |
-
|
167 |
-
if not list_elements:
|
168 |
-
print ("WARNING: Study number " + NCTnumber + " doesn't have eligibility criteria or HTML tag format is not a list")
|
169 |
-
else:
|
170 |
-
|
171 |
-
if len(list_elements) == 1:
|
172 |
-
try:
|
173 |
-
if wrapping_crit_class[1].find(text = 'Inclusion Criteria:'):
|
174 |
-
inclusion = list_elements[0].find_all("li")
|
175 |
-
|
176 |
-
elif wrapping_crit_class[1].find(text = 'Exclusion Criteria:'):
|
177 |
-
exclusion = list_elements[0].find_all("li")
|
178 |
-
except:
|
179 |
-
print('criteria doesnt exist')
|
180 |
-
else:
|
181 |
-
inclusion = list_elements[0].find_all("li")
|
182 |
-
exclusion = list_elements[1].find_all("li")
|
183 |
-
|
184 |
-
|
185 |
-
inclusion = ' '.join([t.text.strip() for t in inclusion ])
|
186 |
-
exclusion = ' '.join([t.text.strip() for t in exclusion ])
|
187 |
-
|
188 |
-
return(inclusion, exclusion)
|
189 |
-
|
190 |
-
#function that gets number of patients enrolled in a study
|
191 |
-
def get_enrollment (self, NCTnumber):
|
192 |
-
url = 'https://clinicaltrials.gov/ct2/show/' + NCTnumber
|
193 |
-
ClinicalTrialpage = requests.get(url)
|
194 |
-
soup = BeautifulSoup(ClinicalTrialpage.text, 'html.parser')
|
195 |
-
enrollment = ''
|
196 |
-
wrapping_enrol_class = soup.find_all('td', {'headers':'studyInfoColData','style':"padding-left:1em"})
|
197 |
-
if not wrapping_enrol_class:
|
198 |
-
print('WARNING: Number of Participants in Study number '+ NCTnumber +' is unavailable')
|
199 |
-
else:
|
200 |
-
enrollment = wrapping_enrol_class[1]
|
201 |
-
enrollment = enrollment.text.split()[0]
|
202 |
-
if enrollment.isdigit() == False:
|
203 |
-
print ('WARNING: Number of Participants in Study number '+ NCTnumber +' is unavailable')
|
204 |
-
else:
|
205 |
-
return(enrollment)
|
206 |
-
|
207 |
-
|
208 |
-
|
209 |
-
def __call__(self, condition, listed_attributes, listed_states, amount_of_data):
|
210 |
-
|
211 |
-
self.driver.get('https://clinicaltrials.gov/ct2/results?cond=' + condition + '&rank=1&view=record#rowId0')
|
212 |
-
self.select_attributes_to_show(listed_attributes, self.attribute_dict)
|
213 |
-
|
214 |
-
try:
|
215 |
-
self.select_by_status(listed_states, self.status_dict)
|
216 |
-
except:
|
217 |
-
print('select by status is a problem')
|
218 |
-
n = []
|
219 |
-
for i in listed_attributes:
|
220 |
-
n.append(self.attribute_dict[i.lower()])
|
221 |
-
attribute_ordered = [list(self.attribute_dict.keys())[list(self.attribute_dict.values()).index(i)]for i in sorted(n)]
|
222 |
-
|
223 |
-
search_data = self.collect_data_search_page(attribute_ordered, amount_of_data=amount_of_data)
|
224 |
-
nct_numbers = [e[search_data[0].index('nct number')] for e in search_data[1:]]
|
225 |
-
search_data[0].extend(['inclusion', 'exclusion', 'enrollment'])
|
226 |
-
for index, nct in enumerate(nct_numbers):
|
227 |
-
if index % 100 == 0 and index!= 0:
|
228 |
-
print("Collected Data from {} Studies: ".format(index))
|
229 |
-
|
230 |
-
inc, exc = self.get_criteria(nct)
|
231 |
-
enrol = self.get_enrollment(nct)
|
232 |
-
search_data[index + 1].extend([inc, exc, enrol])
|
233 |
-
return search_data
|
234 |
-
# except:
|
235 |
-
# print('no data available with the specified status')
|
236 |
-
|
237 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
OpenAITools/CrinicalTrialTools_old.py
DELETED
@@ -1,423 +0,0 @@
|
|
1 |
-
from langchain_community.agent_toolkits import create_sql_agent
|
2 |
-
from langchain_openai import ChatOpenAI
|
3 |
-
from langchain_groq import ChatGroq
|
4 |
-
from langchain_core.prompts import ChatPromptTemplate
|
5 |
-
from langchain_core.pydantic_v1 import BaseModel, Field
|
6 |
-
import pandas as pd
|
7 |
-
from pydantic import BaseModel, Field
|
8 |
-
|
9 |
-
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
10 |
-
from langchain_community.vectorstores import Chroma
|
11 |
-
from langchain.embeddings import HuggingFaceEmbeddings
|
12 |
-
from langchain_core.runnables import RunnablePassthrough
|
13 |
-
from langchain_core.output_parsers import StrOutputParser
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
gpt = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
|
18 |
-
#agent_gpt_executor = create_sql_agent(gpt, db=db, agent_type="tool-calling", verbose=True)
|
19 |
-
|
20 |
-
## make database
|
21 |
-
from langchain_community.utilities import SQLDatabase
|
22 |
-
from sqlalchemy import create_engine
|
23 |
-
|
24 |
-
from langchain.prompts import ChatPromptTemplate
|
25 |
-
from langchain.schema import SystemMessage
|
26 |
-
from langchain_core.prompts import MessagesPlaceholder
|
27 |
-
#agent_groq_executor = create_sql_agent(llm, db=db, agent_type="tool-calling", verbose=True)
|
28 |
-
|
29 |
-
from OpenAITools.FetchTools import fetch_clinical_trials, fetch_clinical_trials_jp
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
## Cancer Name の抽出
|
34 |
-
class ExtractTumorName(BaseModel):
|
35 |
-
"""Extract tumor name from the user's question."""
|
36 |
-
tumor_name: str = Field(description="Extracted tumor name from the question, or 'None' if no tumor found")
|
37 |
-
|
38 |
-
class TumorNameExtractor:
|
39 |
-
def __init__(self, llm):
|
40 |
-
self.llm = llm
|
41 |
-
|
42 |
-
# LLMの出力を構造化するための設定
|
43 |
-
self.structured_llm_extractor = self.llm.with_structured_output(ExtractTumorName)
|
44 |
-
|
45 |
-
# システムプロンプトを設定
|
46 |
-
self.system_prompt = """あなたは、ユーザーの質問に基づいて腫瘍名を英語で抽出するシステムです。\n
|
47 |
-
質問文に腫瘍の種類や名前が含まれている場合、それを英語で返してください。\n
|
48 |
-
質問文に腫瘍名がない場合は 'None' と返答してください。"""
|
49 |
-
|
50 |
-
# プロンプトテンプレート
|
51 |
-
self.grade_prompt = ChatPromptTemplate.from_messages(
|
52 |
-
[
|
53 |
-
("system", self.system_prompt),
|
54 |
-
("human", "ユーザーの質問: {question}"),
|
55 |
-
]
|
56 |
-
)
|
57 |
-
|
58 |
-
def extract_tumor_name(self, question: str) -> str:
|
59 |
-
"""
|
60 |
-
腫瘍名を抽出するメソッド。
|
61 |
-
:param question: 質問文
|
62 |
-
:return: 抽出された腫瘍名
|
63 |
-
"""
|
64 |
-
# 質問から腫瘍名を抽出する処理
|
65 |
-
tumor_extractor = self.grade_prompt | self.structured_llm_extractor
|
66 |
-
result = tumor_extractor.invoke({"question": question})
|
67 |
-
return result.tumor_name
|
68 |
-
|
69 |
-
### 質問変更システム
|
70 |
-
|
71 |
-
# ModifyQuestionの出力形式を定義
|
72 |
-
class ModifyQuestion(BaseModel):
|
73 |
-
"""Class for modifying a question by inserting NCTID."""
|
74 |
-
modified_question: str = Field(description="The modified question with the inserted NCTID.")
|
75 |
-
|
76 |
-
class QuestionModifier:
|
77 |
-
def __init__(self, llm):
|
78 |
-
self.llm = llm
|
79 |
-
|
80 |
-
# LLMの出力を構造化するための設定
|
81 |
-
self.structured_llm_modifier = self.llm.with_structured_output(ModifyQuestion)
|
82 |
-
|
83 |
-
# システムプロンプトを設定
|
84 |
-
self.system_prompt = """あなたは、ユーザーの質問に対して適切なNCTIDを挿入して質問を変更するシステムです。\n
|
85 |
-
質問文にNCTIDを挿入し、形式に基づいて新しい質問を生成してください。\n
|
86 |
-
例えば16歳男性の神経膠腫の患者さんが参加できる臨床治験を教えて下さいという質問に対しては\n
|
87 |
-
16歳男性の神経膠腫の患者さんは{nct_id}に参加できますか?と変更して下さい\n
|
88 |
-
NCTIDは {nct_id} を使用してください。"""
|
89 |
-
|
90 |
-
# プロンプトテンプレート
|
91 |
-
self.modify_prompt = ChatPromptTemplate.from_messages(
|
92 |
-
[
|
93 |
-
("system", self.system_prompt),
|
94 |
-
("human", "ユーザーの質問: {question}"),
|
95 |
-
]
|
96 |
-
)
|
97 |
-
|
98 |
-
def modify_question(self, question: str, nct_id: str) -> str:
|
99 |
-
"""
|
100 |
-
質問を変更するメソッド。
|
101 |
-
:param question: 質問文
|
102 |
-
:param nct_id: NCTID
|
103 |
-
:return: NCTIDを挿入した新しい質問
|
104 |
-
"""
|
105 |
-
# 質問を変更するプロセス
|
106 |
-
question_modifier = self.modify_prompt | self.structured_llm_modifier
|
107 |
-
result = question_modifier.invoke({"question": question, "nct_id": nct_id})
|
108 |
-
modify_question = result.modified_question
|
109 |
-
return modify_question
|
110 |
-
|
111 |
-
class QuestionModifierSecond:
|
112 |
-
def __init__(self, llm):
|
113 |
-
self.llm = llm
|
114 |
-
|
115 |
-
# LLMの出力を構造化するための設定
|
116 |
-
self.structured_llm_modifier = self.llm.with_structured_output(ModifyQuestion)
|
117 |
-
|
118 |
-
# システムプロンプトを設定
|
119 |
-
self.system_prompt = """あなたは、ユーザーの質問を変更するシステムです。\n
|
120 |
-
形式に基づいて新しい質問を生成してください。\n
|
121 |
-
例えば16歳男性の神経膠腫の患者さんが参加できる臨床治験を教えて下さいという質問に対しては\n
|
122 |
-
16歳男性の神経膠腫の患者さんはlこの治験に参加できますか?と変更して下さい\n
|
123 |
-
"""
|
124 |
-
|
125 |
-
# プロンプトテンプレート
|
126 |
-
self.modify_prompt = ChatPromptTemplate.from_messages(
|
127 |
-
[
|
128 |
-
("system", self.system_prompt),
|
129 |
-
("human", "ユーザーの質問: {question}"),
|
130 |
-
]
|
131 |
-
)
|
132 |
-
|
133 |
-
def modify_question(self, question: str) -> str:
|
134 |
-
"""
|
135 |
-
質問を変更するメソッド。
|
136 |
-
:param question: 質問文
|
137 |
-
:param nct_id: NCTID
|
138 |
-
:return: NCTIDを挿入した新しい質問
|
139 |
-
"""
|
140 |
-
# 質問を変更するプロセス
|
141 |
-
question_modifier = self.modify_prompt | self.structured_llm_modifier
|
142 |
-
result = question_modifier.invoke({"question": question})
|
143 |
-
modify_question = result.modified_question
|
144 |
-
return modify_question
|
145 |
-
|
146 |
-
class QuestionModifierEnglish:
|
147 |
-
def __init__(self, llm):
|
148 |
-
self.llm = llm
|
149 |
-
|
150 |
-
# LLMの出力を構造化するための設定
|
151 |
-
self.structured_llm_modifier = self.llm.with_structured_output(ModifyQuestion)
|
152 |
-
|
153 |
-
# システムプロンプトを設定
|
154 |
-
self.system_prompt = """あなたは、ユーザーの質問を変更し英語に翻訳するシステムです。\n
|
155 |
-
形式に基づいて新しい質問を生成してください。\n
|
156 |
-
例えば16歳男性の神経膠腫の患者さんが参加できる臨床治験を教えて下さいという質問に対しては\n
|
157 |
-
Can a 16 year old male patient with glioma participate in this clinical trial?と変更して下さい\n
|
158 |
-
"""
|
159 |
-
|
160 |
-
# プロンプトテンプレート
|
161 |
-
self.modify_prompt = ChatPromptTemplate.from_messages(
|
162 |
-
[
|
163 |
-
("system", self.system_prompt),
|
164 |
-
("human", "ユーザーの質問: {question}"),
|
165 |
-
]
|
166 |
-
)
|
167 |
-
|
168 |
-
def modify_question(self, question: str) -> str:
|
169 |
-
"""
|
170 |
-
質問を変更するメソッド。
|
171 |
-
:param question: 質問文
|
172 |
-
:param nct_id: NCTID
|
173 |
-
:return: NCTIDを挿入した新しい質問
|
174 |
-
"""
|
175 |
-
# 質問を変更するプロセス
|
176 |
-
question_modifier = self.modify_prompt | self.structured_llm_modifier
|
177 |
-
result = question_modifier.invoke({"question": question})
|
178 |
-
modify_question = result.modified_question
|
179 |
-
return modify_question
|
180 |
-
|
181 |
-
|
182 |
-
### Make criteria check Agent
|
183 |
-
|
184 |
-
class ClinicalTrialAgent:
|
185 |
-
def __init__(self, llm, db):
|
186 |
-
self.llm = llm
|
187 |
-
self.db = db
|
188 |
-
|
189 |
-
# システムプロンプトの定義
|
190 |
-
self.system_prompt = """
|
191 |
-
あなたは患者さんに適した治験を探すエージェントです。
|
192 |
-
データベースのEligibility Criteriaをチェックして患者さんがその治験を受けることが可能かどうか答えて下さい
|
193 |
-
"""
|
194 |
-
|
195 |
-
# プロンプトテンプレートを作成
|
196 |
-
self.prompt = ChatPromptTemplate.from_messages(
|
197 |
-
[("system", self.system_prompt),
|
198 |
-
("human", "{input}"),
|
199 |
-
MessagesPlaceholder("agent_scratchpad")]
|
200 |
-
)
|
201 |
-
|
202 |
-
# SQL Agentの設定
|
203 |
-
self.agent_executor = self.create_sql_agent(self.llm, self.db, self.prompt)
|
204 |
-
|
205 |
-
def create_sql_agent(self, llm, db, prompt):
|
206 |
-
"""SQLエージェントを作成するメソッド"""
|
207 |
-
agent_executor = create_sql_agent(
|
208 |
-
llm,
|
209 |
-
db=db,
|
210 |
-
prompt=prompt,
|
211 |
-
agent_type="tool-calling",
|
212 |
-
verbose=True
|
213 |
-
)
|
214 |
-
return agent_executor
|
215 |
-
|
216 |
-
def get_agent_judgment(self, modify_question: str) -> str:
|
217 |
-
"""
|
218 |
-
Modifyされた質問を元に、患者さんが治験に参加可能かどうかのエージェント判断を取得。
|
219 |
-
:param modify_question: NCTIDが挿入された質問
|
220 |
-
:return: エージェントの判断 (AgentJudgment)
|
221 |
-
"""
|
222 |
-
# LLMに質問を投げて、判断を得る
|
223 |
-
result = self.agent_executor.invoke({"input": modify_question})
|
224 |
-
return result
|
225 |
-
|
226 |
-
|
227 |
-
class SimpleClinicalTrialAgent:
|
228 |
-
def __init__(self, llm):
|
229 |
-
self.llm = llm
|
230 |
-
|
231 |
-
def evaluate_eligibility(self, TargetCriteria: str, question: str) -> str:
|
232 |
-
"""
|
233 |
-
臨床試験の参加適格性を評価するメソッド。
|
234 |
-
:param TargetCriteria: 試験基準 (Inclusion/Exclusion criteria)
|
235 |
-
:param question: 患者の条件に関する質問
|
236 |
-
:return: 臨床試験に参加可能かどうかのLLMからの応答
|
237 |
-
"""
|
238 |
-
|
239 |
-
# プロンプトの定義
|
240 |
-
prompt_template = """
|
241 |
-
You are an agent looking for a suitable clinical trial for a patient.
|
242 |
-
Please answer whether the patient is eligible for this clinical trial based on the following criteria. If you do not know the answer, say you do not know. Your answer should be brief, no more than 3 sentences.
|
243 |
-
Question: {question}
|
244 |
-
Criteria:
|
245 |
-
""" + TargetCriteria
|
246 |
-
|
247 |
-
# プロンプトテンプレートの作成
|
248 |
-
criteria_prompt = ChatPromptTemplate.from_messages(
|
249 |
-
[
|
250 |
-
(
|
251 |
-
"human",
|
252 |
-
prompt_template
|
253 |
-
)
|
254 |
-
]
|
255 |
-
)
|
256 |
-
|
257 |
-
# RAGチェーンの作成
|
258 |
-
rag_chain = (
|
259 |
-
{"question": RunnablePassthrough()}
|
260 |
-
| criteria_prompt
|
261 |
-
| self.llm
|
262 |
-
| StrOutputParser()
|
263 |
-
)
|
264 |
-
|
265 |
-
# 質問をチェーンに渡して、応答を得る
|
266 |
-
response = rag_chain.invoke(question)
|
267 |
-
return response
|
268 |
-
|
269 |
-
|
270 |
-
### output 評価システム
|
271 |
-
class TrialEligibilityGrader(BaseModel):
|
272 |
-
"""3値評価: yes, no, unclear"""
|
273 |
-
score: str = Field(
|
274 |
-
description="The eligibility of the patient for the clinical trial based on the document. Options are: 'yes', 'no', or 'unclear'."
|
275 |
-
)
|
276 |
-
|
277 |
-
class GraderAgent:
|
278 |
-
def __init__(self, llm):
|
279 |
-
self.llm = llm
|
280 |
-
|
281 |
-
# LLMの出力を構造化するための設定
|
282 |
-
self.structured_llm_grader = self.llm.with_structured_output(TrialEligibilityGrader)
|
283 |
-
|
284 |
-
# Graderの入力プロンプト
|
285 |
-
self.system_prompt = """
|
286 |
-
あなたは治験に参加する患者の適合性を評価するGraderです。
|
287 |
-
以下のドキュメントを読み、患者が治験に参加可能かどうかを判断してください。
|
288 |
-
'yes'(参加可能)、'no'(参加不可能)、'unclear'(判断できない)の3値で答えてください。
|
289 |
-
"""
|
290 |
-
|
291 |
-
# 評価のためのプロンプトを作成
|
292 |
-
self.grade_prompt = ChatPromptTemplate.from_messages(
|
293 |
-
[
|
294 |
-
("system", self.system_prompt),
|
295 |
-
(
|
296 |
-
"human",
|
297 |
-
"取得したドキュメント: \n\n {document} ",
|
298 |
-
),
|
299 |
-
]
|
300 |
-
)
|
301 |
-
|
302 |
-
def evaluate_eligibility(self, AgentJudgment_output: str) -> str:
|
303 |
-
"""
|
304 |
-
AgentJudgment['output']を基に患者が治験に参加可能かどうかを評価し、スコア(AgentGrade)を返す。
|
305 |
-
:param AgentJudgment_output: エージェント判断の 'output' の値
|
306 |
-
:return: 評価されたスコア (AgentGrade)
|
307 |
-
"""
|
308 |
-
GraderAgent = self.grade_prompt | self.structured_llm_grader
|
309 |
-
result = GraderAgent.invoke({"document": AgentJudgment_output})
|
310 |
-
AgentGrade = result.score
|
311 |
-
return AgentGrade
|
312 |
-
|
313 |
-
import re
|
314 |
-
|
315 |
-
class LLMTranslator:
|
316 |
-
def __init__(self, llm):
|
317 |
-
self.llm = llm
|
318 |
-
self.structured_llm_modifier = self.llm.with_structured_output(ModifyQuestion)
|
319 |
-
|
320 |
-
self.system_prompt = """あなたは、優秀な翻訳者です。\n
|
321 |
-
日本語を英語に翻訳して下さい。\n
|
322 |
-
"""
|
323 |
-
self.system_prompt2 = """あなたは、優秀な翻訳者です。\n
|
324 |
-
日本語を英語に以下のフォーマットに従って翻訳して下さい。\n
|
325 |
-
MainQuestion:
|
326 |
-
Known gene mutation:
|
327 |
-
Measurable tumour:
|
328 |
-
Biopsiable tumour:
|
329 |
-
"""
|
330 |
-
|
331 |
-
self.modify_prompt = ChatPromptTemplate.from_messages(
|
332 |
-
[
|
333 |
-
("system", self.system_prompt),
|
334 |
-
("human", "ユーザーの質問: {question}"),
|
335 |
-
]
|
336 |
-
)
|
337 |
-
|
338 |
-
self.modify_prompt2 = ChatPromptTemplate.from_messages(
|
339 |
-
[
|
340 |
-
("system", self.system_prompt2),
|
341 |
-
("human", "ユーザーの質問: {question}"),
|
342 |
-
]
|
343 |
-
)
|
344 |
-
|
345 |
-
def is_english(self, text: str) -> bool:
|
346 |
-
"""
|
347 |
-
簡易的にテキストが英語かどうかを判定する関数
|
348 |
-
:param text: 判定するテキスト
|
349 |
-
:return: 英語の場合True、日本語の場合False
|
350 |
-
"""
|
351 |
-
# 英語のアルファベットが多く含まれているかを確認
|
352 |
-
return bool(re.match(r'^[A-Za-z0-9\s.,?!]+$', text))
|
353 |
-
|
354 |
-
def translate(self, question: str) -> str:
|
355 |
-
"""
|
356 |
-
質問を翻訳するメソッド。英語の質問はそのまま返す。
|
357 |
-
:param question: 質問文
|
358 |
-
:return: 翻訳済みの質問文、または元の質問文(英語の場合)
|
359 |
-
"""
|
360 |
-
# 質問が英語の場合、そのまま返す
|
361 |
-
if self.is_english(question):
|
362 |
-
return question
|
363 |
-
|
364 |
-
# 日本語の質問は翻訳プロセスにかける
|
365 |
-
question_modifier = self.modify_prompt | self.structured_llm_modifier
|
366 |
-
result = question_modifier.invoke({"question": question})
|
367 |
-
modify_question = result.modified_question
|
368 |
-
return modify_question
|
369 |
-
|
370 |
-
def translateQuestion(self, question: str) -> str:
|
371 |
-
"""
|
372 |
-
フォーマット付きで質問を翻訳するメソッド。
|
373 |
-
:param question: 質問文
|
374 |
-
:return: フォーマットに従った翻訳済みの質問
|
375 |
-
"""
|
376 |
-
question_modifier = self.modify_prompt2 | self.structured_llm_modifier
|
377 |
-
result = question_modifier.invoke({"question": question})
|
378 |
-
modify_question = result.modified_question
|
379 |
-
return modify_question
|
380 |
-
|
381 |
-
def generate_ex_question(age, sex, tumor_type, GeneMutation, Meseable, Biopsiable):
|
382 |
-
# GeneMutationが空の場合はUnknownに設定
|
383 |
-
gene_mutation_text = GeneMutation if GeneMutation else "Unknown"
|
384 |
-
|
385 |
-
# MeseableとBiopsiableの値をYes, No, Unknownに変換
|
386 |
-
meseable_text = (
|
387 |
-
"Yes" if Meseable == "有り" else "No" if Meseable == "無し" else "Unknown"
|
388 |
-
)
|
389 |
-
biopsiable_text = (
|
390 |
-
"Yes" if Biopsiable == "有り" else "No" if Biopsiable == "無し" else "Unknown"
|
391 |
-
)
|
392 |
-
|
393 |
-
# 質問文の生成
|
394 |
-
ex_question = f"""{age}歳{sex}の{tumor_type}患者さんはこの治験に参加することができますか?
|
395 |
-
判明している遺伝子変異: {gene_mutation_text}
|
396 |
-
Meseable tumor: {meseable_text}
|
397 |
-
Biopsiable tumor: {biopsiable_text}
|
398 |
-
です。
|
399 |
-
"""
|
400 |
-
return ex_question
|
401 |
-
|
402 |
-
def generate_ex_question_English(age, sex, tumor_type, GeneMutation, Meseable, Biopsiable):
|
403 |
-
# GeneMutationが空の場合は"Unknown"に設定
|
404 |
-
gene_mutation_text = GeneMutation if GeneMutation else "Unknown"
|
405 |
-
|
406 |
-
# sexの値を male または female に変換
|
407 |
-
sex_text = "male" if sex == "男性" else "female" if sex == "女性" else "Unknown"
|
408 |
-
|
409 |
-
# MeseableとBiopsiableの値を "Yes", "No", "Unknown" に変換
|
410 |
-
meseable_text = (
|
411 |
-
"Yes" if Meseable == "有り" else "No" if Meseable == "無し" else "Unknown"
|
412 |
-
)
|
413 |
-
biopsiable_text = (
|
414 |
-
"Yes" if Biopsiable == "有り" else "No" if Biopsiable == "無し" else "Unknown"
|
415 |
-
)
|
416 |
-
|
417 |
-
# 英語での質問文を生成
|
418 |
-
ex_question = f"""Can a {age}-year-old {sex_text} patient with {tumor_type} participate in this clinical trial?
|
419 |
-
Known gene mutation: {gene_mutation_text}
|
420 |
-
Measurable tumor: {meseable_text}
|
421 |
-
Biopsiable tumor: {biopsiable_text}
|
422 |
-
"""
|
423 |
-
return ex_question
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
OpenAITools/ECarteTools.py
DELETED
@@ -1,73 +0,0 @@
|
|
1 |
-
import openai
|
2 |
-
import time
|
3 |
-
import wikipedia
|
4 |
-
import random
|
5 |
-
import re
|
6 |
-
import requests
|
7 |
-
from bs4 import BeautifulSoup
|
8 |
-
import os
|
9 |
-
import glob
|
10 |
-
from natsort import natsorted
|
11 |
-
import requests
|
12 |
-
from bs4 import BeautifulSoup
|
13 |
-
import xml.etree.ElementTree as ET
|
14 |
-
import pandas as pd
|
15 |
-
|
16 |
-
wikipedia.set_lang("ja")
|
17 |
-
# APIキーの設定
|
18 |
-
openai.api_key = os.environ['OPENAI_API_KEY']
|
19 |
-
engine="gpt-3.5-turbo"
|
20 |
-
|
21 |
-
|
22 |
-
def generate(system_template,prompt,engine="gpt-3.5-turbo"):
|
23 |
-
while True: #OpenAI APIが落ちてる時に無限リトライするので注意
|
24 |
-
try:
|
25 |
-
response = openai.ChatCompletion.create(
|
26 |
-
model=engine,
|
27 |
-
messages=[
|
28 |
-
{"role": "system", "content": system_template},
|
29 |
-
{"role": "user", "content":prompt},
|
30 |
-
]
|
31 |
-
)
|
32 |
-
result=response["choices"][0]["message"]["content"]
|
33 |
-
return result
|
34 |
-
except:
|
35 |
-
print("リトライ")
|
36 |
-
time.sleep(30)
|
37 |
-
pass
|
38 |
-
|
39 |
-
def generate_carte(prompt,engine="gpt-3.5-turbo"):
|
40 |
-
while True: #OpenAI APIが落ちてる時に無限リトライするので注意
|
41 |
-
try:
|
42 |
-
response = openai.ChatCompletion.create(
|
43 |
-
model=engine,
|
44 |
-
messages=[
|
45 |
-
{"role": "system", "content": "You are useful assistant"},
|
46 |
-
{"role": "user", "content":"%s\n・・・という患者と医師の会話をSOAP形式のカルテとして日本語で端的にまとめて下さい。各セクションはS),O), A),P)として下さい "%prompt},
|
47 |
-
]
|
48 |
-
)
|
49 |
-
result=response["choices"][0]["message"]["content"]
|
50 |
-
return result
|
51 |
-
except:
|
52 |
-
print("リトライ")
|
53 |
-
time.sleep(30)
|
54 |
-
pass
|
55 |
-
|
56 |
-
def get_selected_fileds(texts):
|
57 |
-
input_name = texts.replace(' ' , "+")
|
58 |
-
corona_fields = ct.get_study_fields(
|
59 |
-
search_expr="%s SEARCH[Location](AREA[LocationCountry]Japan AND AREA[LocationStatus]Recruiting)"%(input_name),
|
60 |
-
fields=["NCTId", "Condition", "BriefTitle",'BriefSummary','EligibilityCriteria'],
|
61 |
-
max_studies=500,
|
62 |
-
fmt="csv")
|
63 |
-
return corona_fields
|
64 |
-
|
65 |
-
def get_retriever_str(fields):
|
66 |
-
retriever_str=''
|
67 |
-
for i in range(1,len(fields)):
|
68 |
-
colnames = fields[0]
|
69 |
-
targetCol = fields[i]
|
70 |
-
for f in range(len(fields[0])):
|
71 |
-
retriever_str+=colnames[f] + ":" + targetCol[f] +"\n"
|
72 |
-
retriever_str+='\n'
|
73 |
-
return retriever_str
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
OpenAITools/ExpertTools.py
DELETED
@@ -1,246 +0,0 @@
|
|
1 |
-
import os
|
2 |
-
import openai
|
3 |
-
import time
|
4 |
-
import wikipedia
|
5 |
-
import random
|
6 |
-
import re
|
7 |
-
import requests
|
8 |
-
from bs4 import BeautifulSoup
|
9 |
-
import os
|
10 |
-
import glob
|
11 |
-
from natsort import natsorted
|
12 |
-
import requests
|
13 |
-
from bs4 import BeautifulSoup
|
14 |
-
import xml.etree.ElementTree as ET
|
15 |
-
from pytrials.client import ClinicalTrials
|
16 |
-
from Bio import Entrez
|
17 |
-
import pandas as pd
|
18 |
-
import numpy as np
|
19 |
-
import time
|
20 |
-
#from langchain.agents import create_pandas_dataframe_agent
|
21 |
-
from langchain_experimental.agents import create_pandas_dataframe_agent
|
22 |
-
#from langchain.llms import OpenAI
|
23 |
-
from langchain_community.llms import OpenAI
|
24 |
-
|
25 |
-
# APIキーの設定
|
26 |
-
openai.api_key = os.environ['OPENAI_API_KEY']
|
27 |
-
gptengine="gpt-3.5-turbo"
|
28 |
-
|
29 |
-
|
30 |
-
"""def get_selected_fileds(texts):
|
31 |
-
ct = ClinicalTrials()
|
32 |
-
input_name = texts.replace(' ' , "+")
|
33 |
-
corona_fields = ct.get_study_fields(
|
34 |
-
search_expr="%s SEARCH[Location](AREA[LocationCountry]Japan AND AREA[LocationStatus]Recruiting)"%(input_name),
|
35 |
-
fields=["NCTId", "Condition", "BriefTitle",'BriefSummary','EligibilityCriteria'],
|
36 |
-
max_studies=500,
|
37 |
-
fmt="csv")
|
38 |
-
return corona_fields"""
|
39 |
-
|
40 |
-
def get_retriever_str(fields):
|
41 |
-
retriever_str=''
|
42 |
-
for i in range(1,len(fields)):
|
43 |
-
colnames = fields[0]
|
44 |
-
targetCol = fields[i]
|
45 |
-
for f in range(len(fields[0])):
|
46 |
-
retriever_str+=colnames[f] + ":" + targetCol[f] +"\n"
|
47 |
-
retriever_str+='\n'
|
48 |
-
return retriever_str
|
49 |
-
|
50 |
-
def get_chanked_retriever(fields):
|
51 |
-
retriever_list =[]
|
52 |
-
for i in range(1,len(fields)):
|
53 |
-
retriever_str=''
|
54 |
-
colnames = fields[0]
|
55 |
-
targetCol = fields[i]
|
56 |
-
for f in range(len(fields[0])):
|
57 |
-
retriever_str+=colnames[f] + ":" + targetCol[f] +"\n"
|
58 |
-
retriever_list.append(retriever_str)
|
59 |
-
return retriever_list
|
60 |
-
|
61 |
-
from pytrials.client import ClinicalTrials
|
62 |
-
def get_selected_fields(texts, split_criteria=False,
|
63 |
-
split_word_number = False, split_number=700):
|
64 |
-
ct = ClinicalTrials()
|
65 |
-
input_name = texts.replace(' ', "+")
|
66 |
-
corona_fields = ct.get_study_fields(
|
67 |
-
search_expr="%s SEARCH[Location](AREA[LocationCountry]Japan AND AREA[LocationStatus]Recruiting)" % (input_name),
|
68 |
-
fields=["NCTId", "Condition", "BriefTitle", 'BriefSummary', 'EligibilityCriteria'],
|
69 |
-
max_studies=500,
|
70 |
-
fmt="csv")
|
71 |
-
|
72 |
-
if split_criteria:
|
73 |
-
new_fields = []
|
74 |
-
|
75 |
-
# 検索対象の文字列
|
76 |
-
target_string1 = 'Exclusion Criteria'
|
77 |
-
target_string2 = 'Exclusion criteria'
|
78 |
-
|
79 |
-
# 各要素で検索対象の文字列を探し、直前で分割して新しいリストに格納
|
80 |
-
for corona_field in corona_fields:
|
81 |
-
new_list = []
|
82 |
-
for item in corona_field:
|
83 |
-
if target_string1 in item:
|
84 |
-
split_position = item.index(target_string1)
|
85 |
-
new_list.append(item[:split_position])
|
86 |
-
new_list.append(item[split_position:])
|
87 |
-
elif target_string2 in item:
|
88 |
-
split_position = item.index(target_string2)
|
89 |
-
new_list.append(item[:split_position])
|
90 |
-
new_list.append(item[split_position:])
|
91 |
-
else:
|
92 |
-
new_list.append(item)
|
93 |
-
new_fields.append(new_list)
|
94 |
-
else:
|
95 |
-
new_fields = corona_fields
|
96 |
-
|
97 |
-
if split_word_number:
|
98 |
-
split_fields = []
|
99 |
-
for new_field in new_fields:
|
100 |
-
new_list= []
|
101 |
-
|
102 |
-
# 各要素を調べて、700文字以上であれば分割し、新しいリストに格納
|
103 |
-
for item in new_field:
|
104 |
-
item_length = len(item)
|
105 |
-
if item_length > split_number:
|
106 |
-
num_parts = -(-item_length // split_number) # 向上の除算を用いて分割数を計算
|
107 |
-
for i in range(num_parts):
|
108 |
-
start_index = i * split_number
|
109 |
-
end_index = min((i + 1) * split_number, item_length) # 文字列の終わりを超えないように調整
|
110 |
-
new_list.append(item[start_index:end_index])
|
111 |
-
else:
|
112 |
-
new_list.append(item)
|
113 |
-
|
114 |
-
split_fields.append(new_list)
|
115 |
-
new_fields = split_fields
|
116 |
-
|
117 |
-
return new_fields
|
118 |
-
|
119 |
-
|
120 |
-
def print_agent_results(df, Ids,
|
121 |
-
interesteds = ['Condition', 'BriefTitle', 'BriefSummary', 'EligibilityCriteria'],
|
122 |
-
translater=None):
|
123 |
-
results = ""
|
124 |
-
for Id in Ids:
|
125 |
-
print("%s\n"%Id)
|
126 |
-
sdf = df[df['NCTId'] == Id]
|
127 |
-
for interested in interesteds:
|
128 |
-
# 最初の要素を取得
|
129 |
-
results += '%s: \n %s \n' % (interested, sdf[interested].iloc[0])
|
130 |
-
#print('%s: \n %s \n' % (interested, sdf[interested].iloc[0]))
|
131 |
-
if translater:
|
132 |
-
to_be_printed = translater.translate(results)
|
133 |
-
else:
|
134 |
-
to_be_printed =results
|
135 |
-
print(to_be_printed)
|
136 |
-
|
137 |
-
def search(query):
|
138 |
-
Entrez.email = os.getenv('MAIL_ADRESS')
|
139 |
-
#Entrez.email='[email protected]'
|
140 |
-
handle = Entrez.esearch(db='pubmed',
|
141 |
-
sort = 'relevance',
|
142 |
-
retmax = '20',
|
143 |
-
retmode = 'xml',
|
144 |
-
term = query)
|
145 |
-
results = Entrez.read(handle)
|
146 |
-
return results
|
147 |
-
|
148 |
-
def fetch_details(id_list):
|
149 |
-
ids = ','.join(id_list)
|
150 |
-
Entrez.email = os.getenv('MAIL_ADRESS')
|
151 |
-
#Entrez.email = '[email protected]'
|
152 |
-
handle = Entrez.efetch(db = 'pubmed',
|
153 |
-
retmode = 'xml',
|
154 |
-
id = ids)
|
155 |
-
results = Entrez.read(handle)
|
156 |
-
return results
|
157 |
-
'''def generate(prompt,engine=None):
|
158 |
-
if engine is None:
|
159 |
-
engine=gptengine
|
160 |
-
while True: #OpenAI APIが落ちてる時に無限リトライするので注意
|
161 |
-
try:
|
162 |
-
response = openai.ChatCompletion.create(
|
163 |
-
model=engine,
|
164 |
-
messages=[
|
165 |
-
{"role": "system", "content": "You are useful assistant"},
|
166 |
-
{"role": "user", "content":prompt},
|
167 |
-
]
|
168 |
-
)
|
169 |
-
result=response["choices"][0]["message"]["content"]
|
170 |
-
return result
|
171 |
-
except Exception as e:
|
172 |
-
print(e)
|
173 |
-
print("リトライ")
|
174 |
-
time.sleep(30)
|
175 |
-
pass
|
176 |
-
'''
|
177 |
-
|
178 |
-
def generate(prompt,engine=None):
|
179 |
-
if engine is None:
|
180 |
-
engine=gptengine
|
181 |
-
while True: #OpenAI APIが落ちてる時に無限リトライするので注意
|
182 |
-
try:
|
183 |
-
response = openai.chat.completions.create(
|
184 |
-
model=engine,
|
185 |
-
messages=[
|
186 |
-
{"role": "system", "content": "You are useful assistant"},
|
187 |
-
{"role": "user", "content":prompt},
|
188 |
-
]
|
189 |
-
)
|
190 |
-
#result=response["choices"][0]["message"]["content"]
|
191 |
-
result=response.choices[0].message.content
|
192 |
-
return result
|
193 |
-
except Exception as e:
|
194 |
-
print(e)
|
195 |
-
print("リトライ")
|
196 |
-
time.sleep(30)
|
197 |
-
pass
|
198 |
-
|
199 |
-
def GetPubmedSummaryDf(studies):
|
200 |
-
title_list= []
|
201 |
-
abstract_list=[]
|
202 |
-
journal_list = []
|
203 |
-
language_list =[]
|
204 |
-
pubdate_year_list = []
|
205 |
-
pubdate_month_list = []
|
206 |
-
studiesIdList = studies['IdList']
|
207 |
-
chunk_size = 10000
|
208 |
-
for chunk_i in range(0, len(studiesIdList), chunk_size):
|
209 |
-
chunk = studiesIdList[chunk_i:chunk_i + chunk_size]
|
210 |
-
|
211 |
-
try:
|
212 |
-
papers = fetch_details(chunk)
|
213 |
-
for i, paper in enumerate(papers['PubmedArticle']):
|
214 |
-
title_list.append(paper['MedlineCitation']['Article']['ArticleTitle'])
|
215 |
-
try:
|
216 |
-
abstract_list.append(paper['MedlineCitation']['Article']['Abstract']['AbstractText'][0])
|
217 |
-
except:
|
218 |
-
abstract_list.append('No Abstract')
|
219 |
-
journal_list.append(paper['MedlineCitation']['Article']['Journal']['Title'])
|
220 |
-
language_list.append(paper['MedlineCitation']['Article']['Language'][0])
|
221 |
-
try:
|
222 |
-
pubdate_year_list.append(paper['MedlineCitation']['Article']['Journal']['JournalIssue']['PubDate']['Year'])
|
223 |
-
except:
|
224 |
-
pubdate_year_list.append('No Data')
|
225 |
-
try:
|
226 |
-
pubdate_month_list.append(paper['MedlineCitation']['Article']['Journal']['JournalIssue']['PubDate']['Month'])
|
227 |
-
except:
|
228 |
-
pubdate_month_list.append('No Data')
|
229 |
-
except: # occasionally a chunk might annoy your parser
|
230 |
-
pass
|
231 |
-
df = pd.DataFrame(list(zip(
|
232 |
-
title_list, abstract_list, journal_list, language_list, pubdate_year_list,
|
233 |
-
pubdate_month_list)),
|
234 |
-
columns=['Title', 'Abstract', 'Journal', 'Language', 'Year','Month'])
|
235 |
-
return df, abstract_list
|
236 |
-
|
237 |
-
def ClinicalAgent(fileds, verbose=False):
|
238 |
-
df = pd.DataFrame.from_records(fileds[1:], columns=fileds[0])
|
239 |
-
return create_pandas_dataframe_agent(OpenAI(temperature=0, model='gpt-3.5-turbo-16k'), df, verbose=verbose)
|
240 |
-
|
241 |
-
def GetNCTID(results):
|
242 |
-
# NCTで始まる単語を検索する正規表現
|
243 |
-
pattern = r'\bNCT\d+\b'
|
244 |
-
# 正規表現を使って単語を抽出
|
245 |
-
nct_words = re.findall(pattern,results)
|
246 |
-
return nct_words
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
OpenAITools/JRCTTools.py
DELETED
@@ -1,559 +0,0 @@
|
|
1 |
-
from selenium import webdriver
|
2 |
-
from selenium.webdriver.common.by import By
|
3 |
-
from selenium.webdriver.support.ui import WebDriverWait
|
4 |
-
from selenium.webdriver.support import expected_conditions as EC
|
5 |
-
import csv
|
6 |
-
|
7 |
-
|
8 |
-
from selenium import webdriver
|
9 |
-
from selenium.webdriver.common.by import By
|
10 |
-
from selenium.webdriver.support.ui import WebDriverWait
|
11 |
-
from selenium.webdriver.support import expected_conditions as EC
|
12 |
-
import csv
|
13 |
-
|
14 |
-
from selenium.common.exceptions import ElementClickInterceptedException, TimeoutException
|
15 |
-
|
16 |
-
import pandas as pd
|
17 |
-
import requests
|
18 |
-
from bs4 import BeautifulSoup
|
19 |
-
import time
|
20 |
-
import unicodedata
|
21 |
-
import re
|
22 |
-
import ast
|
23 |
-
import torch
|
24 |
-
|
25 |
-
|
26 |
-
from selenium import webdriver
|
27 |
-
from selenium.webdriver.common.by import By
|
28 |
-
from selenium.webdriver.support.ui import WebDriverWait
|
29 |
-
from selenium.webdriver.support import expected_conditions as EC
|
30 |
-
from selenium.common.exceptions import ElementClickInterceptedException
|
31 |
-
|
32 |
-
|
33 |
-
def fetch_clinical_trials(
    disease_name="",
    freeword="",
    include_not_yet_recruiting=False,
    include_suspended=False,
    specific_clinical_research=True,
    corporate_clinical_trial=True,
    physician_initiated_clinical_trial=True,
):
    """Search jRCT (https://jrct.niph.go.jp) for clinical trials via Selenium.

    Args:
        disease_name: target disease names (space-separated, OR-combined).
        freeword: free-word search terms (OR-combined).
        include_not_yet_recruiting: also include trials not yet recruiting.
        include_suspended: also include suspended trials.
        specific_clinical_research: include "specific clinical research" studies.
        corporate_clinical_trial: include company-sponsored trials.
        physician_initiated_clinical_trial: include physician-initiated trials.

    Returns:
        list of [trial_id, title, condition, status, date, link] rows.
    """
    driver = webdriver.Chrome()  # assumes chromedriver is available on PATH
    results = []

    try:
        driver.get("https://jrct.niph.go.jp/search")

        # Fill the disease-name field and set its match mode to "or".
        if disease_name:
            field = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.ID, "reg-plobrem-1"))
            )
            field.send_keys(disease_name)
            mode = driver.find_element(By.ID, "reg-plobrem-type")
            mode.find_element(By.CSS_SELECTOR, "option[value='1']").click()

        # Fill the free-word field and set its match mode to "or".
        if freeword:
            field = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.ID, "demo-1"))
            )
            field.send_keys(freeword)
            mode = driver.find_element(By.ID, "others")
            mode.find_element(By.CSS_SELECTOR, "option[value='1']").click()

        # Recruitment-status / study-type checkboxes, in the same order the
        # original UI flow clicks them. "recruiting" is always selected.
        checkboxes = [
            ("reg-recruitment-2", True),
            ("reg-recruitment-1", include_not_yet_recruiting),
            ("reg-recruitment-3", include_suspended),
            ("is-specific1", specific_clinical_research),
            ("is-specific3", corporate_clinical_trial),
            ("is-specific7", physician_initiated_clinical_trial),
        ]
        for element_id, wanted in checkboxes:
            if wanted:
                driver.find_element(By.ID, element_id).click()

        # Submit the search form; fall back to a JS click when intercepted.
        try:
            search_button = driver.find_element(By.NAME, "button_type")
            driver.execute_script("arguments[0].scrollIntoView();", search_button)
            WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.NAME, "button_type"))
            ).click()
        except ElementClickInterceptedException:
            print("検索ボタンがクリックできないため、JavaScriptでクリックします。")
            driver.execute_script("arguments[0].click();", search_button)

        # Walk every result page.
        while True:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "table tbody tr"))
            )

            rows = driver.find_elements(By.CSS_SELECTOR, "table tbody tr")
            for row in rows:
                cells = row.find_elements(By.TAG_NAME, "td")
                if len(cells) > 4:
                    # Link lives on the title cell; tolerate missing anchors.
                    try:
                        link = cells[1].find_element(By.TAG_NAME, "a").get_attribute("href")
                    except Exception:
                        link = "リンク取得エラー"
                    results.append(
                        [cells[0].text, cells[1].text, cells[2].text,
                         cells[3].text, cells[4].text, link]
                    )

            # No pagination widget means a single page of results.
            try:
                current_page = driver.find_element(
                    By.CSS_SELECTOR, "ul.pagination li.active"
                ).text
                print(f"{current_page} ページ目を処理しました。")
            except Exception:
                print("ページネーションが存在しません。全ての結果を取得しました。")
                break

            # First page link numbered higher than the current page, if any.
            next_button = None
            for button in driver.find_elements(By.CSS_SELECTOR, "ul.pagination li a"):
                if button.text.isdigit() and int(button.text) > int(current_page):
                    next_button = button
                    break

            if next_button is None:
                print("次のページはありません。全ての結果を取得しました。")
                break

            try:
                driver.execute_script("arguments[0].scrollIntoView();", next_button)
                WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.LINK_TEXT, next_button.text))
                ).click()
            except ElementClickInterceptedException:
                print("次ページボタンがクリックできないため、JavaScriptでクリックします。")
                driver.execute_script("arguments[0].click();", next_button)
            # Wait until the old rows go stale, i.e. the next page has loaded.
            WebDriverWait(driver, 10).until(EC.staleness_of(rows[0]))
    finally:
        driver.quit()

    return results
|
186 |
-
|
187 |
-
|
188 |
-
|
189 |
-
def scrape_jrct_all_details(url):
    """Fetch one jRCT detail page and extract study-design and eligibility fields.

    Returns a dict keyed by the page's Japanese/English field labels; on a
    failed HTTP request returns {"URL": url, "エラー": "リクエスト失敗"}.
    """

    def _clean(text):
        # NFKC-normalise and collapse whitespace runs to single spaces.
        if not text:
            return ""
        return " ".join(unicodedata.normalize('NFKC', text).split())

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"URLリクエストに失敗しました: {url} - エラー: {e}")
        return {"URL": url, "エラー": "リクエスト失敗"}

    soup = BeautifulSoup(response.text, 'html.parser')
    data = {"URL": url}

    def _lookup(label_jp, label_en=None):
        """Return (jp_text, en_text) for the first table row whose <label> matches.

        With an English label we match the combined "JP / EN" label form;
        otherwise we substring-match on the Japanese label alone.
        """
        needle = f"{label_jp} / {label_en}" if label_en else label_jp
        candidates = []
        for lab in soup.find_all('label'):
            if needle not in _clean(lab.get_text()):
                continue
            th = lab.find_parent('th')
            tr = th.find_parent('tr') if th else None
            if tr is None:
                continue
            tds = tr.find_all('td')
            if tds:
                jp_text = _clean(tds[0].get_text())
                en_text = _clean(tds[1].get_text()) if label_en and len(tds) > 1 else None
                candidates.append((jp_text, en_text))
        if not candidates:
            return None, None
        # Several rows may match; keep the first occurrence on the page.
        return candidates[0]

    # Study objective (Japanese only).
    data["研究・治験の目的"], _ = _lookup("研究・治験の目的")

    # Trial-design rows, stored under both the JP and EN keys.
    design_labels = [
        ('試験等のフェーズ', 'Phase'),
        ('試験の種類', 'Study Type'),
        ('無作為化', 'allocation'),
        ('盲検化', 'masking'),
        ('対照', 'control'),
        ('割付け', 'assignment'),
        ('研究目的', 'purpose'),
    ]
    for label_jp, label_en in design_labels:
        data[label_jp], data[label_en] = _lookup(label_jp, label_en)

    # Eligibility / condition rows, same dual-key storage.
    details_labels = [
        ('主たる選択基準', 'Inclusion Criteria'),
        ('主たる除外基準', 'Exclusion Criteria'),
        ('年齢下限', 'Age Minimum'),
        ('年齢上限', 'Age Maximum'),
        ('性別', 'Gender'),
        ('中止基準', 'Discontinuation Criteria'),
        ('対象疾患名', 'Health Condition(s) or Problem(s) Studied'),
        ('対象疾患キーワード', 'Keyword'),
        ('介入の内容', 'Intervention(s)'),
    ]
    for label_jp, label_en in details_labels:
        data[label_jp], data[label_en] = _lookup(label_jp, label_en)

    # Cross-registry IDs (JapicCTI / NCT) live in a separate toggle section.
    registry = soup.find("div", id="area-toggle-07-02")
    japic_ids = []
    nct_ids = []
    if registry:
        for tr in registry.find_all("tr"):
            lab = tr.find("label")
            if lab and ("ID番号" in lab.text or "研究番号" in lab.text):
                td = tr.find("td")
                if td:
                    number = td.text.strip()
                    if number.startswith("JapicCTI"):
                        japic_ids.append(number)
                    elif number.startswith("NCT"):
                        nct_ids.append(number)

    # Comma-join multiples; None when absent.
    data["JapicCTI No"] = ", ".join(japic_ids) if japic_ids else None
    data["NCT No"] = ", ".join(nct_ids) if nct_ids else None

    # Be polite to the server.
    time.sleep(1)

    return data
|
334 |
-
|
335 |
-
|
336 |
-
|
337 |
-
|
338 |
-
|
339 |
-
|
340 |
-
|
341 |
-
|
342 |
-
|
343 |
-
|
344 |
-
def create_dataframe_from_urls(urls, delay=5):
    """Scrape every jRCT detail URL and collect the results into a DataFrame.

    A fixed pause of *delay* seconds between requests avoids being
    rate-limited (HTTP 403). A failing URL is recorded as a
    {"URL": ..., "Error": ...} row instead of aborting the run.

    Args:
        urls: list of jRCT detail-page URLs.
        delay: seconds to wait between requests (default 5).

    Returns:
        pd.DataFrame with one row per URL.
    """
    records = []
    for url in urls:
        print(f"Processing URL: {url}")
        try:
            records.append(scrape_jrct_all_details(url))
            print(f"Waiting for {delay} seconds before the next request...")
            time.sleep(delay)
        except Exception as e:
            print(f"Failed to process URL {url}: {e}")
            # Keep a record of the failure so the output row count matches.
            records.append({"URL": url, "Error": str(e)})
    return pd.DataFrame(records)
|
375 |
-
|
376 |
-
|
377 |
-
def extract_jrct_links(results):
    """Build a jRCT detail-page URL for every row returned by fetch_clinical_trials.

    Each row is expected to start with the jRCT number; empty rows are skipped.
    """
    base_url = "https://jrct.niph.go.jp/latest-detail/"
    return [base_url + row[0] for row in results if len(row) > 0]
|
394 |
-
|
395 |
-
def reorder_columns(df):
    """Return *df* with Japanese-named columns first, then the remaining columns.

    A column counts as Japanese when its name contains at least one non-ASCII
    character. Relative order within each group is preserved.
    (Idiom fix: the original tested `all(...) is False` instead of using `any`.)
    """
    jp_columns = [col for col in df.columns if any(ord(c) >= 128 for c in col)]
    en_columns = [col for col in df.columns if col not in jp_columns]
    return df[jp_columns + en_columns]
|
408 |
-
|
409 |
-
|
410 |
-
# Target列を分割する関数
|
411 |
-
def split_target(target):
    """Split a Japanese disease/target string into individual terms.

    Separators are the characters ,、・ and newlines plus the conjunction
    words 及び/および/又は/または. Bug fix: the original put the conjunction
    words inside a regex character class, so any word merely containing one
    of their characters (e.g. は) was split incorrectly.
    """
    parts = re.split(r'(?:[,\n、・]|および|及び|または|又は)+', target)
    # Drop empty fragments and surrounding whitespace.
    return [part.strip() for part in parts if part.strip()]
|
416 |
-
|
417 |
-
|
418 |
-
# Target列を分割する関数(改良後)
|
419 |
-
def split_target_English(target):
    """Split an English disease/target string into individual terms.

    Separators are the characters ,、・ and newlines plus the standalone
    words "or" and "and". Bug fix: the original pattern had no word
    boundaries, so words merely containing "or" (e.g. "tumor" -> "tum")
    were mangled; word-boundary anchors keep them intact.
    """
    parts = re.split(r'(?:[,\n、・]|\bor\b|\band\b)+', target)
    # Drop empty fragments and surrounding whitespace.
    return [part.strip() for part in parts if part.strip()]
|
429 |
-
|
430 |
-
# 処理プログラム
|
431 |
-
def split_triple_negative_words(target_words):
    """Title-case a list of terms, splitting out the "triple negative" marker.

    Any term containing "triple negative" (case-insensitive) is emitted as
    'Triple Negative' plus the remaining title-cased text; every other term
    is simply stripped and title-cased.
    """
    result = []
    for word in target_words:
        lowered = word.lower()
        if 'triple negative' in lowered:
            result.append('Triple Negative')
            leftover = lowered.replace('triple negative', '').strip()
            if leftover:
                result.append(leftover.title().strip())
        else:
            result.append(word.strip().title())
    return result
|
444 |
-
|
445 |
-
class WordProcessor:
    """Splits scraped disease terms around a configurable set of keywords.

    For each input term containing one of the configured (lowercase)
    keywords, the keyword and the leftover text are emitted as separate
    title-cased terms; all other terms are title-cased unchanged.
    """

    def __init__(self, target_words):
        # Keywords to split on; compared against lowercased input terms.
        self.target_words = target_words

    def process(self, target_words):
        """Return the title-cased, keyword-split version of *target_words*."""
        output = []
        for word in target_words:
            lowered = word.lower()
            # First configured keyword contained in this term, if any.
            matched = next((t for t in self.target_words if t in lowered), None)
            if matched is None:
                output.append(word.strip().title())
            else:
                output.append(matched.title())
                leftover = lowered.replace(matched, '').strip()
                if leftover:
                    output.append(leftover.title())
        return output

    def __call__(self, target_words):
        # Allow the instance to be used directly as a function.
        return self.process(target_words)
|
475 |
-
|
476 |
-
|
477 |
-
import pandas as pd
|
478 |
-
from sentence_transformers import util
|
479 |
-
import torch
|
480 |
-
|
481 |
-
def DfPostProcess(exclusive_words, model, csv_loc=None, dataframe=None):
    """Prepare a scraped jRCT table for semantic matching.

    Args:
        exclusive_words: keywords handed to WordProcessor for special splitting.
        model: sentence-embedding model exposing .encode(...) (e.g. SentenceTransformer).
        csv_loc: path of a CSV to load; used when *dataframe* is None.
        dataframe: an existing DataFrame; takes precedence over *csv_loc*.

    Returns:
        The DataFrame with added 'TargetWord' (list of terms) and 'TargetVec'
        (list-of-lists embeddings) columns; rows lacking '試験等のフェーズ'
        are dropped.

    Raises:
        ValueError: when neither *csv_loc* nor *dataframe* is given.
    """
    if csv_loc is None and dataframe is None:
        raise ValueError("Either csv_loc or dataframe must be provided.")

    basedf = dataframe.copy() if dataframe is not None else pd.read_csv(csv_loc, index_col=0)

    # Rows without a phase are unusable downstream.
    basedf = basedf.dropna(subset=['試験等のフェーズ'])

    processor = WordProcessor(exclusive_words)

    # Split the English target string into terms, then apply keyword splitting.
    basedf['TargetWord'] = basedf['TargetEnglish'].apply(split_target_English)
    basedf['TargetWord'] = basedf['TargetWord'].apply(
        lambda words: processor(words) if isinstance(words, list) else words
    )

    # Embed each row's term list; store plain Python lists so the column
    # round-trips through CSV.
    vectors = [
        model.encode(words, convert_to_tensor=True).cpu().tolist()
        for words in basedf['TargetWord']
    ]
    basedf['TargetVec'] = pd.Series(vectors, index=basedf.index, dtype=object)

    return basedf
|
521 |
-
|
522 |
-
|
523 |
-
|
524 |
-
def get_matched_df(basedf, query, model, threshold=0.5):
    """Return rows of *basedf* whose 'TargetVec' embeddings are similar to *query*.

    A row matches when the cosine similarity between the query embedding and
    ANY of the row's target-word embeddings reaches *threshold*.
    """
    query_vec = model.encode(query, convert_to_tensor=True).cpu()

    keep = []
    for i, stored in enumerate(basedf['TargetVec']):
        # After a CSV round-trip the vectors arrive as a stringified list of
        # lists; rebuild a tensor in that case, otherwise use them as-is.
        if isinstance(stored, str):
            target_vecs = torch.tensor(ast.literal_eval(stored))
        else:
            target_vecs = stored

        scores = util.cos_sim(query_vec, target_vecs).squeeze()
        if (scores >= threshold).any():
            keep.append(i)

    return basedf.iloc[keep]
|
554 |
-
|
555 |
-
|
556 |
-
def GetJRCTCriteria(dataframe, idx):
    """Format row *idx*'s inclusion/exclusion criteria as a two-line string."""
    row = dataframe.iloc[idx]
    return (
        "Inclusion Criteria :" + row['Inclusion Criteria']
        + "\n"
        + "Exclusion Criteria :" + row['Exclusion Criteria']
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
OpenAITools/ReviewPaperTools.py
DELETED
@@ -1,42 +0,0 @@
|
|
1 |
-
import re
|
2 |
-
import pandas as pd
|
3 |
-
|
4 |
-
def parse_text_file(text):
    """Split a PubMed text export into per-record chunks.

    Records are separated by three newlines followed by a record number
    (e.g. "\\n\\n\\n12."); the preamble before the first record is discarded
    and each chunk is stripped of surrounding whitespace.
    """
    sections = re.split(r'\n\n\n\d+\.', text)[1:]
    return [section.strip() for section in sections]
|
16 |
-
|
17 |
-
def split_sections(text):
    """Split one PubMed-style record into its labelled sections.

    The record is split on blank lines; the number of non-empty chunks
    selects which field layout applies (8, 7, 6 or 5 fields). Bug fix: for
    any other chunk count the original left `keys` unbound and raised
    NameError — we now fall back to a prefix of the richest layout.
    """
    contents = [chunk.strip() for chunk in text.split('\n\n') if chunk.strip()]

    layouts = {
        8: ['PublishInfo', 'Title', 'AuthorName', 'AuthorInfo', 'Abstract', 'Copyrights', 'DOI', 'COI'],
        7: ['PublishInfo', 'Title', 'AuthorName', 'AuthorInfo', 'Abstract', 'Copyrights', 'DOI'],
        6: ['PublishInfo', 'Title', 'AuthorName', 'AuthorInfo', 'Abstract', 'DOI'],
        5: ['PublishInfo', 'Title', 'AuthorName', 'Abstract', 'DOI'],
    }
    # Fallback: truncate the 8-field layout to the number of chunks present.
    keys = layouts.get(len(contents), layouts[8][:len(contents)])

    # Map each key to its chunk; missing chunks become empty strings.
    return {key: contents[i] if i < len(contents) else "" for i, key in enumerate(keys)}
|
32 |
-
|
33 |
-
|
34 |
-
def GetSummaryDf(textdir):
    """Read a PubMed-format text export and return one DataFrame row per record.

    textdir: path of the export file (UTF-8).
    """
    with open(textdir, 'r', encoding='utf-8') as fh:
        sections = parse_text_file(fh.read())
    records = [split_sections(section) for section in sections]
    return pd.DataFrame(records)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
OpenAITools/scrapeThisData.py
DELETED
@@ -1,237 +0,0 @@
|
|
1 |
-
from selenium import webdriver
|
2 |
-
from selenium.webdriver.support.ui import Select
|
3 |
-
from selenium.webdriver.common.by import By
|
4 |
-
|
5 |
-
import requests
|
6 |
-
from bs4 import BeautifulSoup
|
7 |
-
import re
|
8 |
-
|
9 |
-
import os
|
10 |
-
import time
|
11 |
-
|
12 |
-
from selenium.webdriver.support.ui import WebDriverWait
|
13 |
-
from selenium.webdriver.common.by import By
|
14 |
-
from selenium.webdriver.support import expected_conditions as EC
|
15 |
-
from selenium.webdriver.common.action_chains import ActionChains
|
16 |
-
import chromedriver_autoinstaller
|
17 |
-
|
18 |
-
class ScrapeThatData:
|
19 |
-
|
20 |
-
def __init__(self, time_threshold=10):
    """Start a Chrome session (auto-installing chromedriver on demand) and
    build the column/status lookup tables used by the scraper.

    time_threshold: explicit-wait timeout in seconds.
    """
    options = webdriver.ChromeOptions()
    options.add_argument('--no-sandbox')
    try:
        self.driver = webdriver.Chrome(options=options)
    except:
        # Driver binary missing or stale: install a matching one and retry.
        chromedriver_autoinstaller.install()
        options = webdriver.ChromeOptions()
        options.add_argument('--no-sandbox')
        self.driver = webdriver.Chrome(options=options)

    self.wait = WebDriverWait(self.driver, time_threshold)

    # Position of each togglable column in the "show/hide columns" menu.
    self.attribute_dict = {
        'status': 1, 'conditions': 2, 'interventions': 3, 'study type': 4,
        'phase': 5, 'sponsor': 6, 'funder type': 7, 'study design': 8,
        'outcome measures': 9, 'number enrolled': 10, 'sex': 11, 'age': 12,
        'nct number': 13, 'other ids': 14, 'title acronym': 15, 'study start': 16,
        'primary completion': 17, 'study completion': 18, 'first posted': 19,
        'last update posted': 20, 'results first posted': 21, 'locations': 22,
        'study documents': 23,
    }

    # Checkbox element ids for each recruitment-status filter.
    self.status_dict = {
        'not yet recruiting': 'notYetRecrCB',
        'recruiting': 'recruitingCB',
        'enrolling by invitation': 'enrollingByInvCB',
        'active, not recruiting': 'activeCB',
        'suspended': 'suspendedCB',
        'terminated': 'terminatedCB',
        'completed': 'completedCB',
        'withdrawn': 'withdrawnCB',
        'unknown status': 'unknownCB',
    }
|
52 |
-
|
53 |
-
def clicking_show_hide_cols(self, driver):
    """Open the results table's "show/hide columns" dropdown via an action chain."""
    toggle = driver.find_element(By.XPATH, '//*[@id="theDataTable_wrapper"]/div[3]/button')
    ActionChains(driver).move_to_element(toggle).click().perform()
|
58 |
-
|
59 |
-
def select_attributes_to_show(self, listed_attributes, attribute_dict):
    """Toggle the results-table columns so exactly *listed_attributes* are shown.

    'status', 'conditions', 'interventions' and 'locations' are visible by
    default: those are toggled OFF unless requested, and every other
    requested column is toggled ON. attribute_dict maps the lowercase
    column name to its button index in the show/hide menu.
    """
    defaults = ['status', 'conditions', 'interventions', 'locations']
    requested_defaults = [a.lower() for a in listed_attributes if a.lower() in defaults]

    if requested_defaults:
        to_show = [a.lower() for a in listed_attributes if a.lower() not in requested_defaults]
        to_hide = [d for d in defaults if d not in requested_defaults]
        toggles = to_hide + to_show
    else:
        toggles = [a.lower() for a in listed_attributes]

    for att in toggles:
        # The dropdown closes after every click, so reopen it each time.
        self.clicking_show_hide_cols(self.driver)
        time.sleep(1)
        xpath = ('//*[@id="theDataTable_wrapper"]/div[3]/div[2]/button['
                 + str(attribute_dict[att]) + ']')
        self.wait.until(EC.presence_of_element_located((By.XPATH, xpath))).click()
        time.sleep(1)
|
76 |
-
|
77 |
-
def select_by_status(self, listed_states, status_dict):
    """Apply recruitment-status filters and set the page size to 100 rows.

    listed_states: status names (case-insensitive keys of *status_dict*);
    an empty list leaves the status filter untouched.
    """
    if listed_states:
        for status in listed_states:
            self.driver.find_element(By.ID, status_dict[status.lower()]).click()

        # Apply the selected filters.
        self.driver.find_element(By.XPATH, '//*[@id="FiltersBody"]/div[1]/input[1]').click()
        time.sleep(3)

    # Show 100 rows per page. Fix: `find_element_by_name` was removed in
    # Selenium 4 — use the By-based locator like the rest of the class.
    select = Select(self.driver.find_element(By.NAME, 'theDataTable_length'))
    select.select_by_value('100')
|
88 |
-
|
89 |
-
def collect_data_search_page(self, l_ordered, amount_of_data=None):
    """Page through the results table, collecting one list of cell texts per study.

    l_ordered: attribute names in on-screen column order ('status', when
        present, occupies column 3 and the rest start at column 4).
    amount_of_data: stop early once at least this many rows (including the
        header) have been collected.

    Returns [l_ordered, row, row, ...].
    """
    class_name = ''
    page_index = 1
    elements = [l_ordered]

    while 'disabled' not in class_name:
        # Give the table time to render before reading it.
        time.sleep(10)
        print('Getting data from page {}'.format(page_index))

        table = self.driver.find_element(By.ID, 'theDataTable')
        row_count = len(table.find_elements(By.TAG_NAME, "tr"))

        for index in range(1, row_count):
            row = []
            if 'status' in l_ordered:
                selector = '#theDataTable > tbody > tr:nth-child(' + str(index) + ') > td:nth-child(3)'
                self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, selector)))
                # Fix: the original called find_elements(By.CLASS_NAME, <css>)
                # — wrong locator strategy AND a list, so `.text` raised
                # AttributeError. Use a single CSS_SELECTOR lookup instead.
                status_element = self.driver.find_element(By.CSS_SELECTOR, selector + ' > span')
                row.append(status_element.text.strip())
                for i, val in enumerate(l_ordered):
                    if val == 'status':
                        continue
                    selector = ('#theDataTable > tbody > tr:nth-child(' + str(index)
                                + ') > td:nth-child(' + str(4 + i) + ')')
                    self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, selector)))
                    element = self.driver.find_element(By.CSS_SELECTOR, selector)
                    try:
                        row.append(element.text.strip())
                    except Exception:
                        print(i, element)
            else:
                for i, val in enumerate(l_ordered):
                    selector = ('#theDataTable > tbody > tr:nth-child(' + str(index)
                                + ') > td:nth-child(' + str(3 + i) + ')')
                    self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, selector)))
                    element = self.driver.find_element(By.CSS_SELECTOR, selector)
                    try:
                        row.append(element.text.strip())
                    except Exception:
                        print(i, element)
            elements.append(row)

        # The "next" button carries class "disabled" on the last page,
        # which terminates the while-loop above.
        next_page = self.driver.find_element(By.ID, "theDataTable_next")
        class_name = next_page.get_attribute('class')
        next_page.click()
        page_index += 1

        if amount_of_data and (len(elements) >= amount_of_data or row_count < amount_of_data):
            break

    return elements
|
155 |
-
|
156 |
-
def get_criteria(self, NCTnumber):
    """Scrape the eligibility criteria of one study from clinicaltrials.gov.

    Returns (inclusion, exclusion) as space-joined strings; either may be
    empty when the page lacks the corresponding criteria list.
    """
    url = 'https://clinicaltrials.gov/ct2/show/' + NCTnumber
    page = requests.get(url)
    soup = BeautifulSoup(page.text, 'html.parser')

    criteria_divs = soup.find_all("div", {"class": "tr-indent2"})
    lists = criteria_divs[1].find_all(re.compile("(ul|ol)"))
    inclusion, exclusion = '', ''

    if not lists:
        print("WARNING: Study number " + NCTnumber + " doesn't have eligibility criteria or HTML tag format is not a list")
    elif len(lists) == 1:
        # A single list: decide which kind it is by the preceding heading.
        try:
            if criteria_divs[1].find(text='Inclusion Criteria:'):
                inclusion = lists[0].find_all("li")
            elif criteria_divs[1].find(text='Exclusion Criteria:'):
                exclusion = lists[0].find_all("li")
        except:
            print('criteria doesnt exist')
    else:
        inclusion = lists[0].find_all("li")
        exclusion = lists[1].find_all("li")

    # Joining over an empty string (unset case) yields '' unchanged.
    inclusion = ' '.join(t.text.strip() for t in inclusion)
    exclusion = ' '.join(t.text.strip() for t in exclusion)

    return (inclusion, exclusion)
|
189 |
-
|
190 |
-
#function that gets number of patients enrolled in a study
|
191 |
-
def get_enrollment (self, NCTnumber):
|
192 |
-
url = 'https://clinicaltrials.gov/ct2/show/' + NCTnumber
|
193 |
-
ClinicalTrialpage = requests.get(url)
|
194 |
-
soup = BeautifulSoup(ClinicalTrialpage.text, 'html.parser')
|
195 |
-
enrollment = ''
|
196 |
-
wrapping_enrol_class = soup.find_all('td', {'headers':'studyInfoColData','style':"padding-left:1em"})
|
197 |
-
if not wrapping_enrol_class:
|
198 |
-
print('WARNING: Number of Participants in Study number '+ NCTnumber +' is unavailable')
|
199 |
-
else:
|
200 |
-
enrollment = wrapping_enrol_class[1]
|
201 |
-
enrollment = enrollment.text.split()[0]
|
202 |
-
if enrollment.isdigit() == False:
|
203 |
-
print ('WARNING: Number of Participants in Study number '+ NCTnumber +' is unavailable')
|
204 |
-
else:
|
205 |
-
return(enrollment)
|
206 |
-
|
207 |
-
|
208 |
-
|
209 |
-
def __call__(self, condition, listed_attributes, listed_states, amount_of_data):
|
210 |
-
|
211 |
-
self.driver.get('https://clinicaltrials.gov/ct2/results?cond=' + condition + '&rank=1&view=record#rowId0')
|
212 |
-
self.select_attributes_to_show(listed_attributes, self.attribute_dict)
|
213 |
-
|
214 |
-
try:
|
215 |
-
self.select_by_status(listed_states, self.status_dict)
|
216 |
-
except:
|
217 |
-
print('select by status is a problem')
|
218 |
-
n = []
|
219 |
-
for i in listed_attributes:
|
220 |
-
n.append(self.attribute_dict[i.lower()])
|
221 |
-
attribute_ordered = [list(self.attribute_dict.keys())[list(self.attribute_dict.values()).index(i)]for i in sorted(n)]
|
222 |
-
|
223 |
-
search_data = self.collect_data_search_page(attribute_ordered, amount_of_data=amount_of_data)
|
224 |
-
nct_numbers = [e[search_data[0].index('nct number')] for e in search_data[1:]]
|
225 |
-
search_data[0].extend(['inclusion', 'exclusion', 'enrollment'])
|
226 |
-
for index, nct in enumerate(nct_numbers):
|
227 |
-
if index % 100 == 0 and index!= 0:
|
228 |
-
print("Collected Data from {} Studies: ".format(index))
|
229 |
-
|
230 |
-
inc, exc = self.get_criteria(nct)
|
231 |
-
enrol = self.get_enrollment(nct)
|
232 |
-
search_data[index + 1].extend([inc, exc, enrol])
|
233 |
-
return search_data
|
234 |
-
# except:
|
235 |
-
# print('no data available with the specified status')
|
236 |
-
|
237 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
README.md
CHANGED
@@ -10,127 +10,199 @@ pinned: false
|
|
10 |
license: mit
|
11 |
---
|
12 |
|
13 |
-
# 🏥
|
14 |
|
15 |
-
|
16 |
|
17 |
-
## ✨
|
18 |
|
19 |
-
###
|
20 |
-
|
|
|
|
|
|
|
21 |
|
22 |
-
|
23 |
-
-
|
24 |
-
-
|
|
|
|
|
25 |
|
26 |
-
|
27 |
-
-
|
28 |
-
-
|
|
|
|
|
29 |
|
30 |
-
|
31 |
-
-
|
32 |
-
-
|
|
|
33 |
|
34 |
## 🚀 使用方法
|
35 |
|
36 |
-
### 1.
|
37 |
-
|
38 |
-
2. **詳細情報**: 遺伝子変異、測定可能腫瘍の有無を選択
|
39 |
-
3. **検索実行**: 対応するボタンをクリック
|
40 |
-
4. **結果確認**: HTMLテーブルで結果を確認
|
41 |
-
5. **フィルタリング**: ✅適格 / ❌不適格 / ❓要検討 で絞り込み
|
42 |
-
6. **データ保存**: CSV形式でダウンロード
|
43 |
|
44 |
-
### 2. 環境設定(完全版利用時)
|
45 |
**Settings → Variables and secrets** で設定:
|
46 |
```
|
47 |
-
GROQ_API_KEY: あなたのGroq API
|
48 |
OPENAI_API_KEY: あなたのOpenAI APIキー(オプション)
|
49 |
```
|
50 |
|
51 |
-
|
|
|
|
|
52 |
|
53 |
-
###
|
54 |
-
- **Gradio 4.20.1**: 安定版LTSバージョン
|
55 |
-
- **HTMLテーブル**: カスタムCSS + レスポンシブデザイン
|
56 |
-
- **段階的フォールバック**: 依存関係エラー耐性
|
57 |
|
58 |
-
|
59 |
-
-
|
60 |
-
-
|
61 |
|
62 |
-
|
63 |
-
-
|
64 |
-
-
|
65 |
-
|
|
|
|
|
|
|
66 |
|
67 |
-
|
|
|
|
|
|
|
|
|
68 |
|
69 |
-
|
70 |
-
- **色分け表示**: 適格性レベルごとの背景色
|
71 |
-
- **リンク機能**: NCTIDクリックで公式ページへ
|
72 |
-
- **統計表示**: 適��/不適格/要検討の件数集計
|
73 |
-
- **レスポンシブ**: モバイル対応デザイン
|
74 |
|
75 |
-
###
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
80 |
|
81 |
-
###
|
82 |
-
|
83 |
-
|
84 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
85 |
|
86 |
## 🔐 プライバシー・セキュリティ
|
87 |
|
88 |
-
|
89 |
-
-
|
90 |
-
-
|
91 |
-
-
|
|
|
|
|
|
|
|
|
|
|
92 |
|
93 |
-
## 📝
|
94 |
|
95 |
-
###
|
96 |
```
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
|
|
101 |
```
|
102 |
|
103 |
-
###
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
104 |
```
|
105 |
-
|
106 |
-
|
107 |
-
+ SQLAlchemy
|
108 |
-
+ pydantic
|
109 |
-
+ tiktoken
|
110 |
-
+ tenacity
|
111 |
```
|
112 |
|
113 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
114 |
|
115 |
-
|
116 |
|
117 |
-
|
118 |
-
|
119 |
-
- システム状態を確認して必要な依存関係を追加
|
120 |
|
121 |
-
|
122 |
-
|
123 |
-
|
|
|
124 |
|
125 |
-
|
126 |
-
- 腫瘍タイプの英語表記を確認
|
127 |
-
- より一般的な用語で再検索
|
128 |
|
129 |
-
|
130 |
-
-
|
131 |
-
-
|
132 |
-
- **段階的診断**: システム状態表示を確認
|
133 |
|
134 |
---
|
135 |
|
136 |
-
|
|
|
10 |
license: mit
|
11 |
---
|
12 |
|
13 |
+
# 🏥 臨床試験適格性評価システム(完全版)
|
14 |
|
15 |
+
患者情報に基づいて適切な臨床試験を見つけ、AIエージェントが適格性を自動評価する統合システムです。
|
16 |
|
17 |
+
## ✨ 完全版の主要機能
|
18 |
|
19 |
+
### 🤖 AI適格性評価システム
|
20 |
+
- **Groq Llama3-70B**: 高速で正確な自然言語処理
|
21 |
+
- **3段階自動評価**: ✅適格 / ❌不適格 / ❓要検討
|
22 |
+
- **詳細判断理由**: 各試験への適格性を具体的に説明
|
23 |
+
- **エラー耐性**: API障害時の自動リトライ機能
|
24 |
|
25 |
+
### 🔍 リアルタイム臨床試験検索
|
26 |
+
- **ClinicalTrials.gov API**: 最新の日本国内実施中試験を検索
|
27 |
+
- **多言語対応**: 日本語入力 → 英語検索 → 日本語結果
|
28 |
+
- **条件絞り込み**: 腫瘍タイプ、募集状況、実施地域での自動フィルタリング
|
29 |
+
- **ページネーション**: 大量データの効率的な処理
|
30 |
|
31 |
+
### 📊 高度なデータ可視化
|
32 |
+
- **インタラクティブテーブル**: 色分け表示 + ホバー効果
|
33 |
+
- **統計ダッシュボード**: 適格性レベル別の集計表示
|
34 |
+
- **直接リンク**: NCTIDクリックで公式ページへ遷移
|
35 |
+
- **レスポンシブデザイン**: モバイル対応
|
36 |
|
37 |
+
### 💾 データ管理機能
|
38 |
+
- **リアルタイムフィルタリング**: 適格性レベル別表示
|
39 |
+
- **CSV エクスポート**: Excel対応(UTF-8 with BOM)
|
40 |
+
- **全データ保持**: フィルタ状態に関係なく全情報保存
|
41 |
|
42 |
## 🚀 使用方法
|
43 |
|
44 |
+
### 1. 環境設定(重要)
|
45 |
+
完全版の機能を利用するには、以下のAPIキーが必要です:
|
|
|
|
|
|
|
|
|
|
|
46 |
|
|
|
47 |
**Settings → Variables and secrets** で設定:
|
48 |
```
|
49 |
+
GROQ_API_KEY: あなたのGroq APIキー(必須)
|
50 |
OPENAI_API_KEY: あなたのOpenAI APIキー(オプション)
|
51 |
```
|
52 |
|
53 |
+
> 📝 **APIキー取得方法**:
|
54 |
+
> - **Groq**: [console.groq.com](https://console.groq.com) でアカウント作成 → API Keys
|
55 |
+
> - **OpenAI**: [platform.openai.com](https://platform.openai.com) でアカウント作成 → API Keys
|
56 |
|
57 |
+
### 2. 基本操作手順
|
|
|
|
|
|
|
58 |
|
59 |
+
#### Step 1: 患者情報入力
|
60 |
+
- **基本情報**: 年齢、性別、腫瘍タイプ
|
61 |
+
- **詳細情報**: 遺伝子変異、測定可能腫瘍、生検可能性
|
62 |
|
63 |
+
#### Step 2: AI検索実行
|
64 |
+
- 「🤖 AI適格性評価付き検索(完全版)」をクリック
|
65 |
+
- システムが自動的に:
|
66 |
+
1. 腫瘍タイプを英語に翻訳
|
67 |
+
2. ClinicalTrials.govから最新データを取得
|
68 |
+
3. AIが各試験の適格性を評価
|
69 |
+
4. 結果を3段階でグレード分け
|
70 |
|
71 |
+
#### Step 3: 結果の確認・活用
|
72 |
+
- **色分け表示**: 適格性レベルごとの視覚的識別
|
73 |
+
- **詳細確認**: AI判断理由の詳細閲覧
|
74 |
+
- **フィルタリング**: 特定のグレードのみ表示
|
75 |
+
- **データ保存**: CSV形式でのダウンロード
|
76 |
|
77 |
+
## 🛠️ システムアーキテクチャ
|
|
|
|
|
|
|
|
|
78 |
|
79 |
+
### AI/機械学習スタック
|
80 |
+
```
|
81 |
+
🤖 Groq Llama3-70B (メイン推論エンジン)
|
82 |
+
↓
|
83 |
+
📚 LangChain (エージェント管理)
|
84 |
+
↓
|
85 |
+
🔧 SimpleClinicalTrialAgent (適格性評価)
|
86 |
+
↓
|
87 |
+
📊 GraderAgent (3段階グレード判定)
|
88 |
+
```
|
89 |
+
|
90 |
+
### データフローアーキテクチャ
|
91 |
+
```
|
92 |
+
患者情報入力
|
93 |
+
↓
|
94 |
+
LLMTranslator (日→英翻訳)
|
95 |
+
↓
|
96 |
+
ClinicalTrials.gov API (リアルタイム検索)
|
97 |
+
↓
|
98 |
+
AI適格性評価エンジン (並列処理)
|
99 |
+
↓
|
100 |
+
HTMLテーブル生成 (結果表示)
|
101 |
+
```
|
102 |
+
|
103 |
+
### フォールバックシステム
|
104 |
+
完全版 → 基本版 → 軽量版の段階的フォールバック機能により、依存関係エラー時でも基本機能を提供。
|
105 |
+
|
106 |
+
## 📊 AI評価システムの詳細
|
107 |
|
108 |
+
### 評価プロセス
|
109 |
+
1. **質問文生成**: 患者情報から構造化質問を自動生成
|
110 |
+
2. **適格基準分析**: 各試験のEligibility Criteriaを詳細解析
|
111 |
+
3. **適合性判定**: AIが論理的根拠とともに判断
|
112 |
+
4. **グレード評価**: 判断結果を3段階で分類
|
113 |
+
|
114 |
+
### 評価基準
|
115 |
+
- **✅ Yes (適格)**: 明確に参加可能
|
116 |
+
- **❌ No (不適格)**: 明確に参加不可能
|
117 |
+
- **❓ Unclear (要検討)**: 追加情報や専門医判断が必要
|
118 |
+
|
119 |
+
### AI制限事項
|
120 |
+
- **評価件数**: パフォーマンス考慮で最大10件まで自動評価
|
121 |
+
- **処理時間**: 1件あたり5-10秒程度
|
122 |
+
- **精度**: 参考情報として活用(最終判断は専門医へ)
|
123 |
|
124 |
## 🔐 プライバシー・セキュリティ
|
125 |
|
126 |
+
### データ保護
|
127 |
+
- **ローカル処理**: 患者情報はサーバーに永続保存されません
|
128 |
+
- **セッション限定**: ブラウザ終了で全データ自動削除
|
129 |
+
- **匿名化**: 個人識別情報は外部送信されません
|
130 |
+
|
131 |
+
### API通信セキュリティ
|
132 |
+
- **HTTPS暗号化**: 全API通信が暗号化
|
133 |
+
- **キー管理**: 環境変数での安全な認証情報管理
|
134 |
+
- **エラーログ**: API応答に個人情報を含まない
|
135 |
|
136 |
+
## 📝 技術仕様
|
137 |
|
138 |
+
### システム要件
|
139 |
```
|
140 |
+
Python 3.10+
|
141 |
+
gradio==4.20.1 (LTS安定版)
|
142 |
+
langchain ecosystem (0.2.x)
|
143 |
+
pandas + numpy (データ処理)
|
144 |
+
requests (API通信)
|
145 |
```
|
146 |
|
147 |
+
### 外部API依存関係
|
148 |
+
- **ClinicalTrials.gov API v2** (公開API、制限なし)
|
149 |
+
- **Groq API** (推論API、要認証)
|
150 |
+
- **OpenAI API** (補完機能、オプション)
|
151 |
+
|
152 |
+
### パフォーマンス
|
153 |
+
- **検索速度**: 1-3秒(データ件数により変動)
|
154 |
+
- **AI評価**: 件あたり5-10秒(最大10件並列)
|
155 |
+
- **メモリ使用量**: 100-200MB程度
|
156 |
+
|
157 |
+
## 🙋♂️ トラブルシューティング
|
158 |
+
|
159 |
+
### よくある問題と解決法
|
160 |
+
|
161 |
+
#### 1. **起動時のエラー**
|
162 |
```
|
163 |
+
⚠️ 完全版モジュールのインポートに失敗
|
164 |
+
→ OpenAIToolsフォルダが正しくアップロードされているか確認
|
|
|
|
|
|
|
|
|
165 |
```
|
166 |
|
167 |
+
#### 2. **API接続エラー**
|
168 |
+
```
|
169 |
+
❌ エージェント初期化エラー: API key missing
|
170 |
+
→ Settings → Variables and secrets でGROQ_API_KEYを設定
|
171 |
+
```
|
172 |
+
|
173 |
+
#### 3. **検索結果なし**
|
174 |
+
```
|
175 |
+
⚠️ 該当する臨床試験が見つかりませんでした
|
176 |
+
→ より一般的な腫瘍タイプ(例: "cancer" "carcinoma")で再検索
|
177 |
+
```
|
178 |
+
|
179 |
+
#### 4. **AI評価が動作しない**
|
180 |
+
```
|
181 |
+
基本版:XXX患者への詳細評価にはAI機能が必要です
|
182 |
+
→ 環境変数GROQ_API_KEYが正しく設定されているか確認
|
183 |
+
```
|
184 |
+
|
185 |
+
### サポートリソース
|
186 |
+
- **ログ確認**: Spaces の Logs タブでエラー詳細を確認
|
187 |
+
- **システム状態**: アプリ上部のステータス表示を確認
|
188 |
+
- **段階的診断**: 軽量版 → 基本版 → 完全版の順で動作確認
|
189 |
|
190 |
+
## 📋 利用上の注意事項
|
191 |
|
192 |
+
### 医療免責事項
|
193 |
+
⚠️ **重要**: このシステムは研究・教育目��で開発されており、実際の臨床決定には使用しないでください。臨床試験への参加については、必ず主治医にご相談ください。
|
|
|
194 |
|
195 |
+
### データの精度について
|
196 |
+
- **AI評価**: 参考情報として活用し、最終判断は専門医へ
|
197 |
+
- **検索結果**: ClinicalTrials.govの最新データを反映
|
198 |
+
- **翻訳精度**: 医学用語の翻訳には限界があります
|
199 |
|
200 |
+
## 🔄 更新履歴
|
|
|
|
|
201 |
|
202 |
+
- **v3.0** (2025-06): 完全版リリース - AI適格性評価機能追加
|
203 |
+
- **v2.0** (2025-06): 基本版 - ClinicalTrials.gov API連携
|
204 |
+
- **v1.0** (2025-06): 軽量版 - HTMLテーブル表示機能
|
|
|
205 |
|
206 |
---
|
207 |
|
208 |
+
*Developed for research and educational purposes. Always consult with healthcare professionals for clinical decisions.*
|
app.py
CHANGED
@@ -67,6 +67,64 @@ def safe_init_agents():
|
|
67 |
# エージェント初期化
|
68 |
translator, CriteriaCheckAgent, grader_agent = safe_init_agents()
|
69 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
70 |
# 基本的なClinicalTrials.gov API呼び出し(軽量版)
|
71 |
def fetch_clinical_trials_basic(cancer_name):
|
72 |
"""基本的な臨床試験データ取得(requestsのみ使用)"""
|
@@ -198,6 +256,84 @@ def generate_basic_data(age, sex, tumor_type, GeneMutation, Meseable, Biopsiable
|
|
198 |
traceback.print_exc()
|
199 |
return []
|
200 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
201 |
# HTMLテーブル生成関数
|
202 |
def create_html_table(data, show_grade=True):
|
203 |
"""データをHTMLテーブルに変換"""
|
@@ -346,7 +482,9 @@ def filter_data(data, grade):
|
|
346 |
def get_system_status():
|
347 |
"""システムの現在の状態を確認"""
|
348 |
if FULL_VERSION and env_ok:
|
349 |
-
return "🟢 完全版", "
|
|
|
|
|
350 |
elif LANGCHAIN_AVAILABLE and env_ok:
|
351 |
return "🟡 基本版", "ClinicalTrials.gov API検索が可能です(AI評価機能は制限)"
|
352 |
elif LANGCHAIN_AVAILABLE:
|
@@ -375,15 +513,19 @@ def export_to_csv(data):
|
|
375 |
|
376 |
# Gradioインターフェースの作成
|
377 |
with gr.Blocks(title="臨床試験適格性評価", theme=gr.themes.Soft()) as demo:
|
378 |
-
gr.Markdown("## 🏥
|
379 |
|
380 |
# システム状態表示
|
381 |
status_level, status_message = get_system_status()
|
382 |
gr.Markdown(f"**システム状態**: {status_level} - {status_message}")
|
383 |
|
384 |
# 機能説明
|
385 |
-
if FULL_VERSION:
|
386 |
-
gr.Markdown("
|
|
|
|
|
|
|
|
|
387 |
elif LANGCHAIN_AVAILABLE:
|
388 |
gr.Markdown("🔧 **利用可能機能**: ClinicalTrials.gov検索 + 基本評価 + データエクスポート")
|
389 |
else:
|
@@ -411,12 +553,14 @@ with gr.Blocks(title="臨床試験適格性評価", theme=gr.themes.Soft()) as d
|
|
411 |
|
412 |
# ボタン類
|
413 |
with gr.Row():
|
414 |
-
if FULL_VERSION:
|
415 |
-
generate_button = gr.Button("
|
|
|
|
|
416 |
elif LANGCHAIN_AVAILABLE:
|
417 |
-
generate_button = gr.Button("📡
|
418 |
else:
|
419 |
-
generate_button = gr.Button("📋
|
420 |
|
421 |
with gr.Row():
|
422 |
yes_button = gr.Button("✅ Show Eligible Trials", variant="secondary")
|
@@ -437,10 +581,12 @@ with gr.Blocks(title="臨床試験適格性評価", theme=gr.themes.Soft()) as d
|
|
437 |
def update_data_and_display(age, sex, tumor_type, gene_mutation, measurable, biopsiable):
|
438 |
"""データ生成と表示更新"""
|
439 |
try:
|
440 |
-
if FULL_VERSION:
|
441 |
-
progress_msg = "
|
442 |
-
|
443 |
-
|
|
|
|
|
444 |
elif LANGCHAIN_AVAILABLE:
|
445 |
progress_msg = "📡 ClinicalTrials.govから基本データを検索中..."
|
446 |
data = generate_basic_data(age, sex, tumor_type, gene_mutation, measurable, biopsiable)
|
@@ -451,6 +597,9 @@ with gr.Blocks(title="臨床試験適格性評価", theme=gr.themes.Soft()) as d
|
|
451 |
if data:
|
452 |
html_table = create_html_table(data)
|
453 |
final_progress = f"✅ 完了: {len(data)} 件の臨床試験が見つかりました"
|
|
|
|
|
|
|
454 |
else:
|
455 |
html_table = "<div style='text-align: center; padding: 20px; color: #666;'>⚠️ 該当する臨床試験が見つかりませんでした</div>"
|
456 |
final_progress = "⚠️ 該当する臨床試験が見つかりませんでした"
|
@@ -522,8 +671,8 @@ with gr.Blocks(title="臨床試験適格性評価", theme=gr.themes.Soft()) as d
|
|
522 |
# フッター情報
|
523 |
gr.Markdown("---")
|
524 |
with gr.Row():
|
525 |
-
gr.Markdown("🔬 **技術情報**: ClinicalTrials.gov API
|
526 |
-
gr.Markdown("📝
|
527 |
|
528 |
if __name__ == "__main__":
|
529 |
demo.launch(
|
|
|
67 |
# エージェント初期化
|
68 |
translator, CriteriaCheckAgent, grader_agent = safe_init_agents()
|
69 |
|
70 |
+
# エラーハンドリング付きでエージェント評価を実行する関数
|
71 |
+
def evaluate_with_retry(agent, criteria, question, max_retries=3):
|
72 |
+
"""エラーハンドリング付きでエージェント評価を実行"""
|
73 |
+
if agent is None:
|
74 |
+
return "評価エラー: エージェントが初期化されていません。API keyを確認してください。"
|
75 |
+
|
76 |
+
for attempt in range(max_retries):
|
77 |
+
try:
|
78 |
+
return agent.evaluate_eligibility(criteria, question)
|
79 |
+
except Exception as e:
|
80 |
+
if "missing variables" in str(e):
|
81 |
+
print(f"プロンプトテンプレートエラー (試行 {attempt + 1}/{max_retries}): {e}")
|
82 |
+
return "評価エラー: プロンプトテンプレートの設定に問題があります"
|
83 |
+
elif "no healthy upstream" in str(e) or "InternalServerError" in str(e):
|
84 |
+
print(f"Groqサーバーエラー (試行 {attempt + 1}/{max_retries}): {e}")
|
85 |
+
if attempt < max_retries - 1:
|
86 |
+
time.sleep(2)
|
87 |
+
continue
|
88 |
+
else:
|
89 |
+
return "評価エラー: サーバーに接続できませんでした"
|
90 |
+
elif "API key" in str(e) or "authentication" in str(e).lower():
|
91 |
+
return "評価エラー: API keyが無効または設定されていません"
|
92 |
+
else:
|
93 |
+
print(f"予期しないエラー (試行 {attempt + 1}/{max_retries}): {e}")
|
94 |
+
if attempt < max_retries - 1:
|
95 |
+
time.sleep(1)
|
96 |
+
continue
|
97 |
+
else:
|
98 |
+
return f"評価エラー: {str(e)}"
|
99 |
+
return "評価エラー: 最大リトライ回数に達しました"
|
100 |
+
|
101 |
+
def evaluate_grade_with_retry(agent, judgment, max_retries=3):
|
102 |
+
"""エラーハンドリング付きでグレード評価を実行"""
|
103 |
+
if agent is None:
|
104 |
+
return "unclear"
|
105 |
+
|
106 |
+
for attempt in range(max_retries):
|
107 |
+
try:
|
108 |
+
return agent.evaluate_eligibility(judgment)
|
109 |
+
except Exception as e:
|
110 |
+
if "no healthy upstream" in str(e) or "InternalServerError" in str(e):
|
111 |
+
print(f"Groqサーバーエラー (グレード評価 - 試行 {attempt + 1}/{max_retries}): {e}")
|
112 |
+
if attempt < max_retries - 1:
|
113 |
+
time.sleep(2)
|
114 |
+
continue
|
115 |
+
else:
|
116 |
+
return "unclear"
|
117 |
+
elif "API key" in str(e) or "authentication" in str(e).lower():
|
118 |
+
return "unclear"
|
119 |
+
else:
|
120 |
+
print(f"予期しないエラー (グレード評価 - 試行 {attempt + 1}/{max_retries}): {e}")
|
121 |
+
if attempt < max_retries - 1:
|
122 |
+
time.sleep(1)
|
123 |
+
continue
|
124 |
+
else:
|
125 |
+
return "unclear"
|
126 |
+
return "unclear"
|
127 |
+
|
128 |
# 基本的なClinicalTrials.gov API呼び出し(軽量版)
|
129 |
def fetch_clinical_trials_basic(cancer_name):
|
130 |
"""基本的な臨床試験データ取得(requestsのみ使用)"""
|
|
|
256 |
traceback.print_exc()
|
257 |
return []
|
258 |
|
259 |
+
# 完全版データ生成関数(AI評価付き)
|
260 |
+
def generate_full_data(age, sex, tumor_type, GeneMutation, Meseable, Biopsiable):
|
261 |
+
"""完全版のデータ生成(実際のAPI使用 + AI評価)"""
|
262 |
+
try:
|
263 |
+
if not all([age, sex, tumor_type]):
|
264 |
+
return []
|
265 |
+
|
266 |
+
# 日本語の腫瘍タイプを英語に翻訳
|
267 |
+
try:
|
268 |
+
if translator is not None:
|
269 |
+
TumorName = translator.translate(tumor_type)
|
270 |
+
print(f"腫瘍タイプ翻訳: {tumor_type} → {TumorName}")
|
271 |
+
else:
|
272 |
+
print("翻訳エージェントが利用できません。元の値を使用します。")
|
273 |
+
TumorName = tumor_type
|
274 |
+
except Exception as e:
|
275 |
+
print(f"翻訳エラー: {e}")
|
276 |
+
TumorName = tumor_type
|
277 |
+
|
278 |
+
# 質問文を生成
|
279 |
+
try:
|
280 |
+
ex_question = generate_ex_question_English(age, sex, TumorName, GeneMutation, Meseable, Biopsiable)
|
281 |
+
print(f"生成された質問: {ex_question}")
|
282 |
+
except Exception as e:
|
283 |
+
print(f"質問生成エラー: {e}")
|
284 |
+
return []
|
285 |
+
|
286 |
+
# 臨床試験データの取得
|
287 |
+
try:
|
288 |
+
print(f"臨床試験データを検索中: {TumorName}")
|
289 |
+
df = fetch_clinical_trials(TumorName)
|
290 |
+
if df.empty:
|
291 |
+
print("臨床試験データが見つかりませんでした")
|
292 |
+
return []
|
293 |
+
print(f"取得した臨床試験数: {len(df)}")
|
294 |
+
|
295 |
+
# DataFrameを辞書のリストに変換
|
296 |
+
data_list = df.to_dict('records')
|
297 |
+
|
298 |
+
except Exception as e:
|
299 |
+
print(f"臨床試験データ取得エラー: {e}")
|
300 |
+
return []
|
301 |
+
|
302 |
+
# AI評価の実行(最大10件まで)
|
303 |
+
evaluation_limit = min(len(data_list), 10)
|
304 |
+
print(f"AI評価実行: {evaluation_limit} 件を処理します")
|
305 |
+
|
306 |
+
for i, item in enumerate(data_list[:evaluation_limit]):
|
307 |
+
try:
|
308 |
+
print(f"評価中 ({i+1}/{evaluation_limit}): {item['NCTID']}")
|
309 |
+
target_criteria = item['Eligibility Criteria']
|
310 |
+
|
311 |
+
# エラーハンドリング付きで評価実行
|
312 |
+
agent_judgment = evaluate_with_retry(CriteriaCheckAgent, target_criteria, ex_question)
|
313 |
+
agent_grade = evaluate_grade_with_retry(grader_agent, agent_judgment)
|
314 |
+
|
315 |
+
# データの更新
|
316 |
+
item['AgentJudgment'] = agent_judgment
|
317 |
+
item['AgentGrade'] = agent_grade
|
318 |
+
|
319 |
+
except Exception as e:
|
320 |
+
print(f"NCTID {item['NCTID']} の評価中にエラー: {e}")
|
321 |
+
item['AgentJudgment'] = f"エラー: {str(e)}"
|
322 |
+
item['AgentGrade'] = "unclear"
|
323 |
+
|
324 |
+
# 評価されなかった残りのアイテムにはプレースホルダーを設定
|
325 |
+
for item in data_list[evaluation_limit:]:
|
326 |
+
item['AgentJudgment'] = f"完全版:{age}歳{sex}の{tumor_type}患者(評価制限により未処理)"
|
327 |
+
item['AgentGrade'] = "unclear"
|
328 |
+
|
329 |
+
print(f"完全版評価完了。結果: {len(data_list)} 件(うち{evaluation_limit}件をAI評価)")
|
330 |
+
return data_list
|
331 |
+
|
332 |
+
except Exception as e:
|
333 |
+
print(f"完全版データ生成中に予期しないエラー: {e}")
|
334 |
+
traceback.print_exc()
|
335 |
+
return []
|
336 |
+
|
337 |
# HTMLテーブル生成関数
|
338 |
def create_html_table(data, show_grade=True):
|
339 |
"""データをHTMLテーブルに変換"""
|
|
|
482 |
def get_system_status():
|
483 |
"""システムの現在の状態を確認"""
|
484 |
if FULL_VERSION and env_ok:
|
485 |
+
return "🟢 完全版", "リアルタイム検索 + AI適格性評価が利用可能です"
|
486 |
+
elif FULL_VERSION and not env_ok:
|
487 |
+
return "🟡 完全版(制限)", "AI機能は利用可能ですが、API keyの設定をお願いします"
|
488 |
elif LANGCHAIN_AVAILABLE and env_ok:
|
489 |
return "🟡 基本版", "ClinicalTrials.gov API検索が可能です(AI評価機能は制限)"
|
490 |
elif LANGCHAIN_AVAILABLE:
|
|
|
513 |
|
514 |
# Gradioインターフェースの作成
|
515 |
with gr.Blocks(title="臨床試験適格性評価", theme=gr.themes.Soft()) as demo:
|
516 |
+
gr.Markdown("## 🏥 臨床試験適格性評価インターフェース(完全版)")
|
517 |
|
518 |
# システム状態表示
|
519 |
status_level, status_message = get_system_status()
|
520 |
gr.Markdown(f"**システム状態**: {status_level} - {status_message}")
|
521 |
|
522 |
# 機能説明
|
523 |
+
if FULL_VERSION and env_ok:
|
524 |
+
gr.Markdown("🚀 **利用可能機能**: ClinicalTrials.gov リアルタイム検索 + AI適格性評価 + データエクスポート")
|
525 |
+
gr.Markdown("🤖 **AI機能**: Groq Llama3-70B による自動適格性判断 + 3段階グレード評価")
|
526 |
+
elif FULL_VERSION:
|
527 |
+
gr.Markdown("🔧 **利用可能機能**: リアルタイム検索 + 基本評価(AI機能は環境変数設定後に有効化)")
|
528 |
+
gr.Markdown("⚠️ **API設定が必要**: Settings → Variables and secrets で GROQ_API_KEY を設定してください")
|
529 |
elif LANGCHAIN_AVAILABLE:
|
530 |
gr.Markdown("🔧 **利用可能機能**: ClinicalTrials.gov検索 + 基本評価 + データエクスポート")
|
531 |
else:
|
|
|
553 |
|
554 |
# ボタン類
|
555 |
with gr.Row():
|
556 |
+
if FULL_VERSION and env_ok:
|
557 |
+
generate_button = gr.Button("🤖 AI適格性評価付き検索(完全版)", variant="primary")
|
558 |
+
elif FULL_VERSION:
|
559 |
+
generate_button = gr.Button("🔍 リアルタイム検索(環境変数設定後にAI評価有効化)", variant="primary")
|
560 |
elif LANGCHAIN_AVAILABLE:
|
561 |
+
generate_button = gr.Button("📡 ClinicalTrials.gov検索(基本版)", variant="primary")
|
562 |
else:
|
563 |
+
generate_button = gr.Button("📋 サンプ���データ表示", variant="primary")
|
564 |
|
565 |
with gr.Row():
|
566 |
yes_button = gr.Button("✅ Show Eligible Trials", variant="secondary")
|
|
|
581 |
def update_data_and_display(age, sex, tumor_type, gene_mutation, measurable, biopsiable):
|
582 |
"""データ生成と表示更新"""
|
583 |
try:
|
584 |
+
if FULL_VERSION and env_ok:
|
585 |
+
progress_msg = "🤖 AI適格性評価付きで実際の臨床試験データを検索中..."
|
586 |
+
data = generate_full_data(age, sex, tumor_type, gene_mutation, measurable, biopsiable)
|
587 |
+
elif FULL_VERSION:
|
588 |
+
progress_msg = "🔍 実際の臨床試験データを検索中(AI評価は環境変数設定後に有効化)..."
|
589 |
+
data = generate_basic_data(age, sex, tumor_type, gene_mutation, measurable, biopsiable)
|
590 |
elif LANGCHAIN_AVAILABLE:
|
591 |
progress_msg = "📡 ClinicalTrials.govから基本データを検索中..."
|
592 |
data = generate_basic_data(age, sex, tumor_type, gene_mutation, measurable, biopsiable)
|
|
|
597 |
if data:
|
598 |
html_table = create_html_table(data)
|
599 |
final_progress = f"✅ 完了: {len(data)} 件の臨床試験が見つかりました"
|
600 |
+
if FULL_VERSION and env_ok:
|
601 |
+
ai_count = len([item for item in data if 'エラー' not in item.get('AgentJudgment', '')])
|
602 |
+
final_progress += f"(うち最大10件をAI評価済み)"
|
603 |
else:
|
604 |
html_table = "<div style='text-align: center; padding: 20px; color: #666;'>⚠️ 該当する臨床試験が見つかりませんでした</div>"
|
605 |
final_progress = "⚠️ 該当する臨床試験が見つかりませんでした"
|
|
|
671 |
# フッター情報
|
672 |
gr.Markdown("---")
|
673 |
with gr.Row():
|
674 |
+
gr.Markdown("🔬 **技術情報**: ClinicalTrials.gov API + LangChain + Groq Llama3-70B")
|
675 |
+
gr.Markdown("📝 **完全版状況**: " + ("AI評価機能有効" if (FULL_VERSION and env_ok) else "環境変数設定後にAI機能有効化"))
|
676 |
|
677 |
if __name__ == "__main__":
|
678 |
demo.launch(
|