Satoc committed
Commit 92df76e · 1 Parent(s): b296c12

Add application file

.gitignore ADDED
@@ -0,0 +1,138 @@
+ # Add any directories, files, or patterns you don't want to be tracked by version control
+
+
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ #*.py[cod]
+ #*$py.class
+ #*.txt
+ #*.tsv
+ *.csv
+ *.xlsx
+ *.pdf
+ *.nii
+ #*.nii.gz
+ *.DS_Store
+ #*.png
+ #*.pyn
+ *.jpg
+ *.nii.gz
+ *.pkl
+ *-checkpoint.ipynb
+ *.pkls
+ *.pth
+ *.yaml
+ *.ckpt
+ # C extensions
+ #*.so
+
+ # Distribution / packaging
+ #.Python
+ #build/
+ #develop-eggs/
+ #dist/
+ #downloads/
+ #eggs/
+ #.eggs/
+ #lib/
+ #lib64/
+ #parts/
+ #sdist/
+ #var/
+ #wheels/
+ #*.egg-info/
+ #.installed.cfg
+ #*.egg
+ #MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ #*.manifest
+ #*.spec
+
+ # Installer logs
+ #pip-log.txt
+ #pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ #htmlcov/
+ #.tox/
+ #.coverage
+ #.coverage.*
+ #.cache
+ #nosetests.xml
+ #coverage.xml
+ #*.cover
+ #.hypothesis/
+ #.pytest_cache/
+
+ # Translations
+ #*.mo
+ #*.pot
+
+ # Django stuff:
+ #*.log
+ #.static_storage/
+ #.media/
+ #local_settings.py
+
+ # Flask stuff:
+ #instance/
+ #.webassets-cache
+
+ # Scrapy stuff:
+ #.scrapy
+
+ # Sphinx documentation
+ #docs/_build/
+
+ # PyBuilder
+ #target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints/
+
+ #Folders
+ dev_JRCT_api/
+ dev/
+ # pyenv
+ #.python-version
+
+ # celery beat schedule file
+ #celerybeat-schedule
+
+ # SageMath parsed files
+ #*.sage.py
+ /
+ # Environments
+ #.env
+ #.venv
+ #env/
+ #venv/
+ #ENV/
+ #env.bak/
+ #venv.bak/
+
+ # Spyder project settings
+ #.spyderproject
+ #.spyproject
+
+ # Rope project settings
+ #.ropeproject
+
+ # mkdocs documentation
+ #/site
+ /models/
+ # mypy
+ #.mypy_cache/
+ #over 100MB
+
+ # Add any directories, files, or patterns you don't want to be tracked by version control
+
+
+ #deep settings
+ *.h5
+
+ .OpenAITools/chromedriver
+ /OpenAITools/chromedriver
ClinicalTrialCSV/dummy.txt ADDED
@@ -0,0 +1 @@
+ dummy dummy
OpenAITools/.ipynb_checkpoints/ECarteTools-checkpoint.py ADDED
@@ -0,0 +1,73 @@
+ import openai
+ import time
+ import wikipedia
+ import random
+ import re
+ import requests
+ from bs4 import BeautifulSoup
+ import os
+ import glob
+ from natsort import natsorted
+ import requests
+ from bs4 import BeautifulSoup
+ import xml.etree.ElementTree as ET
+ import pandas as pd
+ from pytrials.client import ClinicalTrials  # needed by get_selected_fileds below
+
+ wikipedia.set_lang("ja")
+ # Set the OpenAI API key
+ openai.api_key = os.environ['OPENAI_API_KEY']
+ engine = "gpt-3.5-turbo"
+
+
+ def generate(system_template, prompt, engine="gpt-3.5-turbo"):
+     while True:  # Caution: retries indefinitely while the OpenAI API is down
+         try:
+             response = openai.ChatCompletion.create(
+                 model=engine,
+                 messages=[
+                     {"role": "system", "content": system_template},
+                     {"role": "user", "content": prompt},
+                 ]
+             )
+             result = response["choices"][0]["message"]["content"]
+             return result
+         except:
+             print("リトライ")
+             time.sleep(30)
+
+
+ def generate_carte(prompt, engine="gpt-3.5-turbo"):
+     while True:  # Caution: retries indefinitely while the OpenAI API is down
+         try:
+             response = openai.ChatCompletion.create(
+                 model=engine,
+                 messages=[
+                     {"role": "system", "content": "You are useful assistant"},
+                     {"role": "user", "content": "%s\n・・・という患者と医師の会話をSOAP形式のカルテとして日本語で端的にまとめて下さい。各セクションはS),O), A),P)として下さい " % prompt},
+                 ]
+             )
+             result = response["choices"][0]["message"]["content"]
+             return result
+         except:
+             print("リトライ")
+             time.sleep(30)
+
+
+ def get_selected_fileds(texts):
+     ct = ClinicalTrials()  # pytrials client; `ct` was previously undefined here
+     input_name = texts.replace(' ', "+")
+     corona_fields = ct.get_study_fields(
+         search_expr="%s SEARCH[Location](AREA[LocationCountry]Japan AND AREA[LocationStatus]Recruiting)" % (input_name),
+         fields=["NCTId", "Condition", "BriefTitle", 'BriefSummary', 'EligibilityCriteria'],
+         max_studies=500,
+         fmt="csv")
+     return corona_fields
+
+
+ def get_retriever_str(fields):
+     retriever_str = ''
+     for i in range(1, len(fields)):
+         colnames = fields[0]
+         targetCol = fields[i]
+         for f in range(len(fields[0])):
+             retriever_str += colnames[f] + ":" + targetCol[f] + "\n"
+         retriever_str += '\n'
+     return retriever_str
OpenAITools/.ipynb_checkpoints/ExpertTools-checkpoint.py ADDED
@@ -0,0 +1,245 @@
1
+ import os
2
+ import openai
3
+ import time
4
+ import wikipedia
5
+ import random
6
+ import re
7
+ import requests
8
+ from bs4 import BeautifulSoup
9
+ import os
10
+ import glob
11
+ from natsort import natsorted
12
+ import requests
13
+ from bs4 import BeautifulSoup
14
+ import xml.etree.ElementTree as ET
15
+ from pytrials.client import ClinicalTrials
16
+ from Bio import Entrez
17
+ import pandas as pd
18
+ import numpy as np
19
+ import time
20
+ #from langchain.agents import create_pandas_dataframe_agent
21
+ from langchain_experimental.agents import create_pandas_dataframe_agent
22
+ from langchain.llms import OpenAI
23
+
24
+ # APIキーの設定
25
+ openai.api_key = os.environ['OPENAI_API_KEY']
26
+ gptengine="gpt-3.5-turbo"
27
+
28
+
29
+ """def get_selected_fileds(texts):
30
+ ct = ClinicalTrials()
31
+ input_name = texts.replace(' ' , "+")
32
+ corona_fields = ct.get_study_fields(
33
+ search_expr="%s SEARCH[Location](AREA[LocationCountry]Japan AND AREA[LocationStatus]Recruiting)"%(input_name),
34
+ fields=["NCTId", "Condition", "BriefTitle",'BriefSummary','EligibilityCriteria'],
35
+ max_studies=500,
36
+ fmt="csv")
37
+ return corona_fields"""
38
+
39
+ def get_retriever_str(fields):
40
+ retriever_str=''
41
+ for i in range(1,len(fields)):
42
+ colnames = fields[0]
43
+ targetCol = fields[i]
44
+ for f in range(len(fields[0])):
45
+ retriever_str+=colnames[f] + ":" + targetCol[f] +"\n"
46
+ retriever_str+='\n'
47
+ return retriever_str
48
+
49
+ def get_chanked_retriever(fields):
50
+ retriever_list =[]
51
+ for i in range(1,len(fields)):
52
+ retriever_str=''
53
+ colnames = fields[0]
54
+ targetCol = fields[i]
55
+ for f in range(len(fields[0])):
56
+ retriever_str+=colnames[f] + ":" + targetCol[f] +"\n"
57
+ retriever_list.append(retriever_str)
58
+ return retriever_list
59
+
60
+ from pytrials.client import ClinicalTrials
61
+ def get_selected_fields(texts, split_criteria=False,
62
+ split_word_number = False, split_number=700):
63
+ ct = ClinicalTrials()
64
+ input_name = texts.replace(' ', "+")
65
+ corona_fields = ct.get_study_fields(
66
+ search_expr="%s SEARCH[Location](AREA[LocationCountry]Japan AND AREA[LocationStatus]Recruiting)" % (input_name),
67
+ fields=["NCTId", "Condition", "BriefTitle", 'BriefSummary', 'EligibilityCriteria'],
68
+ max_studies=500,
69
+ fmt="csv")
70
+
71
+ if split_criteria:
72
+ new_fields = []
73
+
74
+ # 検索対象の文字列
75
+ target_string1 = 'Exclusion Criteria'
76
+ target_string2 = 'Exclusion criteria'
77
+
78
+ # 各要素で検索対象の文字列を探し、直前で分割して新しいリストに格納
79
+ for corona_field in corona_fields:
80
+ new_list = []
81
+ for item in corona_field:
82
+ if target_string1 in item:
83
+ split_position = item.index(target_string1)
84
+ new_list.append(item[:split_position])
85
+ new_list.append(item[split_position:])
86
+ elif target_string2 in item:
87
+ split_position = item.index(target_string2)
88
+ new_list.append(item[:split_position])
89
+ new_list.append(item[split_position:])
90
+ else:
91
+ new_list.append(item)
92
+ new_fields.append(new_list)
93
+ else:
94
+ new_fields = corona_fields
95
+
96
+ if split_word_number:
97
+ split_fields = []
98
+ for new_field in new_fields:
99
+ new_list= []
100
+
101
+ # 各要素を調べて、700文字以上であれば分割し、新しいリストに格納
102
+ for item in new_field:
103
+ item_length = len(item)
104
+ if item_length > split_number:
105
+ num_parts = -(-item_length // split_number) # 向上の除算を用いて分割数を計算
106
+ for i in range(num_parts):
107
+ start_index = i * split_number
108
+ end_index = min((i + 1) * split_number, item_length) # 文字列の終わりを超えないように調整
109
+ new_list.append(item[start_index:end_index])
110
+ else:
111
+ new_list.append(item)
112
+
113
+ split_fields.append(new_list)
114
+ new_fields = split_fields
115
+
116
+ return new_fields
117
+
118
+
119
+ def print_agent_results(df, Ids,
120
+ interesteds = ['Condition', 'BriefTitle', 'BriefSummary', 'EligibilityCriteria'],
121
+ translater=None):
122
+ results = ""
123
+ for Id in Ids:
124
+ print("%s\n"%Id)
125
+ sdf = df[df['NCTId'] == Id]
126
+ for interested in interesteds:
127
+ # 最初の要素を取得
128
+ results += '%s: \n %s \n' % (interested, sdf[interested].iloc[0])
129
+ #print('%s: \n %s \n' % (interested, sdf[interested].iloc[0]))
130
+ if translater:
131
+ to_be_printed = translater.translate(results)
132
+ else:
133
+ to_be_printed =results
134
+ print(to_be_printed)
135
+
136
+ def search(query):
137
+ Entrez.email = os.getenv('MAIL_ADRESS')
138
+ #Entrez.email='[email protected]'
139
+ handle = Entrez.esearch(db='pubmed',
140
+ sort = 'relevance',
141
+ retmax = '20',
142
+ retmode = 'xml',
143
+ term = query)
144
+ results = Entrez.read(handle)
145
+ return results
146
+
147
+ def fetch_details(id_list):
148
+ ids = ','.join(id_list)
149
+ Entrez.email = os.getenv('MAIL_ADRESS')
150
+ #Entrez.email = '[email protected]'
151
+ handle = Entrez.efetch(db = 'pubmed',
152
+ retmode = 'xml',
153
+ id = ids)
154
+ results = Entrez.read(handle)
155
+ return results
156
+ '''def generate(prompt,engine=None):
157
+ if engine is None:
158
+ engine=gptengine
159
+ while True: #OpenAI APIが落ちてる時に無限リトライするので注意
160
+ try:
161
+ response = openai.ChatCompletion.create(
162
+ model=engine,
163
+ messages=[
164
+ {"role": "system", "content": "You are useful assistant"},
165
+ {"role": "user", "content":prompt},
166
+ ]
167
+ )
168
+ result=response["choices"][0]["message"]["content"]
169
+ return result
170
+ except Exception as e:
171
+ print(e)
172
+ print("リトライ")
173
+ time.sleep(30)
174
+ pass
175
+ '''
176
+
177
+ def generate(prompt,engine=None):
178
+ if engine is None:
179
+ engine=gptengine
180
+ while True: #OpenAI APIが落ちてる時に無限リトライするので注意
181
+ try:
182
+ response = openai.chat.completions.create(
183
+ model=engine,
184
+ messages=[
185
+ {"role": "system", "content": "You are useful assistant"},
186
+ {"role": "user", "content":prompt},
187
+ ]
188
+ )
189
+ #result=response["choices"][0]["message"]["content"]
190
+ result=response.choices[0].message.content
191
+ return result
192
+ except Exception as e:
193
+ print(e)
194
+ print("リトライ")
195
+ time.sleep(30)
196
+ pass
197
+
198
+ def GetPubmedSummaryDf(studies):
199
+ title_list= []
200
+ abstract_list=[]
201
+ journal_list = []
202
+ language_list =[]
203
+ pubdate_year_list = []
204
+ pubdate_month_list = []
205
+ studiesIdList = studies['IdList']
206
+ chunk_size = 10000
207
+ for chunk_i in range(0, len(studiesIdList), chunk_size):
208
+ chunk = studiesIdList[chunk_i:chunk_i + chunk_size]
209
+
210
+ try:
211
+ papers = fetch_details(chunk)
212
+ for i, paper in enumerate(papers['PubmedArticle']):
213
+ title_list.append(paper['MedlineCitation']['Article']['ArticleTitle'])
214
+ try:
215
+ abstract_list.append(paper['MedlineCitation']['Article']['Abstract']['AbstractText'][0])
216
+ except:
217
+ abstract_list.append('No Abstract')
218
+ journal_list.append(paper['MedlineCitation']['Article']['Journal']['Title'])
219
+ language_list.append(paper['MedlineCitation']['Article']['Language'][0])
220
+ try:
221
+ pubdate_year_list.append(paper['MedlineCitation']['Article']['Journal']['JournalIssue']['PubDate']['Year'])
222
+ except:
223
+ pubdate_year_list.append('No Data')
224
+ try:
225
+ pubdate_month_list.append(paper['MedlineCitation']['Article']['Journal']['JournalIssue']['PubDate']['Month'])
226
+ except:
227
+ pubdate_month_list.append('No Data')
228
+ except: # occasionally a chunk might annoy your parser
229
+ pass
230
+ df = pd.DataFrame(list(zip(
231
+ title_list, abstract_list, journal_list, language_list, pubdate_year_list,
232
+ pubdate_month_list)),
233
+ columns=['Title', 'Abstract', 'Journal', 'Language', 'Year','Month'])
234
+ return df, abstract_list
235
+
236
+ def ClinicalAgent(fileds, verbose=False):
237
+ df = pd.DataFrame.from_records(fileds[1:], columns=fileds[0])
238
+ return create_pandas_dataframe_agent(OpenAI(temperature=0, model='gpt-3.5-turbo-16k'), df, verbose=verbose)
239
+
240
+ def GetNCTID(results):
241
+ # NCTで始まる単語を検索する正規表現
242
+ pattern = r'\bNCT\d+\b'
243
+ # 正規表現を使って単語を抽出
244
+ nct_words = re.findall(pattern,results)
245
+ return nct_words
OpenAITools/.ipynb_checkpoints/FetchTools-checkpoint.py ADDED
@@ -0,0 +1,158 @@
1
+ import os
2
+ import pandas as pd
3
+ #from llama_index.llms.replicate import Replicate
4
+ import requests
5
+ import re
6
+
7
+
8
+ def extract_japan_cities(text):
9
+ # 正規表現を使用して " - Japan" で終わる都市名を抽出
10
+ pattern = r'(\b\w+\s*\w*\b) - Japan'
11
+ cities = re.findall(pattern, text)
12
+ unique_cities = list(set(cities))
13
+ # ユニークな都市名をソートしてカンマで区切られた文字列に変換
14
+ unique_cities.sort()
15
+ return ', '.join(unique_cities)
16
+
17
+ def fetch_clinical_trials(cancer_name):
18
+ search_expr="%s SEARCH[Location](AREA[LocationCountry]Japan AND AREA[LocationStatus]Recruiting)" % (cancer_name)
19
+ # Initial URL for the first API call
20
+ base_url = "https://clinicaltrials.gov/api/v2/studies"
21
+ params = {
22
+ "query.titles": search_expr,
23
+ "pageSize": 100
24
+ }
25
+
26
+ # Initialize an empty list to store the data
27
+ data_list = []
28
+ # Loop until there is no nextPageToken
29
+ while True:
30
+ # Print the current URL (for debugging purposes)
31
+ print("Fetching data from:", base_url + '?' + '&'.join([f"{k}={v}" for k, v in params.items()]))
32
+
33
+ # Send a GET request to the API
34
+ response = requests.get(base_url, params=params)
35
+
36
+ # Check if the request was successful
37
+ if response.status_code == 200:
38
+ data = response.json() # Parse JSON response
39
+ studies = data.get('studies', []) # Extract the list of studies
40
+
41
+ # Loop through each study and extract specific information
42
+ for study in studies:
43
+ # Safely access nested keys
44
+ nctId = study['protocolSection']['identificationModule'].get('nctId', 'Unknown')
45
+ startDate = study['protocolSection']['statusModule'].get('startDateStruct', {}).get('date', 'Unknown Date')
46
+ conditions = ', '.join(study['protocolSection']['conditionsModule'].get('conditions', ['No conditions listed']))
47
+ title = study['protocolSection']['identificationModule'].get('briefTitle', 'no title')
48
+ summary = study['protocolSection']['descriptionModule'].get('briefSummary', 'no summary')
49
+
50
+ # Extract locations safely
51
+ locations_list = study['protocolSection'].get('contactsLocationsModule', {}).get('locations', [])
52
+ locations = ', '.join([f"{location.get('city', 'No City')} - {location.get('country', 'No Country')}" for location in locations_list]) if locations_list else "No locations listed"
53
+
54
+ JapanesLocations = extract_japan_cities(locations)
55
+ # Extract dates and phases
56
+ primaryCompletionDate = study['protocolSection']['statusModule'].get('primaryCompletionDateStruct', {}).get('date', 'Unknown Date')
57
+
58
+ phases = ', '.join(study['protocolSection']['designModule'].get('phases', ['Not Available']))
59
+ eligibilityCriteria = study['protocolSection']['eligibilityModule'].get('eligibilityCriteria', 'Unknown')
60
+
61
+ # Append the data to the list as a dictionary
62
+ data_list.append({
63
+ "NCTID": nctId,
64
+ "Title": title,
65
+ #"Start Date": startDate,
66
+ "Primary Completion Date": primaryCompletionDate,
67
+ #"Conditions": conditions,
68
+ "Cancer": conditions,
69
+ "Summary": summary,
70
+ "Japanes Locations": JapanesLocations,
71
+ #"Phases": phases,
72
+ "Eligibility Criteria": eligibilityCriteria
73
+ })
74
+
75
+ # Check for nextPageToken and update the params or break the loop
76
+ nextPageToken = data.get('nextPageToken')
77
+ if nextPageToken:
78
+ params['pageToken'] = nextPageToken # Set the pageToken for the next request
79
+ else:
80
+ break # Exit the loop if no nextPageToken is present
81
+ else:
82
+ print("Failed to fetch data. Status code:", response.status_code)
83
+ break
84
+
85
+ # Create a DataFrame from the list of dictionaries
86
+ df = pd.DataFrame(data_list)
87
+ return df
88
+
89
+ def fetch_clinical_trials_jp(cancer_name):
90
+ search_expr="%s SEARCH[Location](AREA[LocationCountry]Japan AND AREA[LocationStatus]Recruiting)" % (cancer_name)
91
+ # Initial URL for the first API call
92
+ base_url = "https://clinicaltrials.gov/api/v2/studies"
93
+ params = {
94
+ "query.titles": search_expr,
95
+ "pageSize": 100
96
+ }
97
+
98
+ # Initialize an empty list to store the data
99
+ data_list = []
100
+ # Loop until there is no nextPageToken
101
+ while True:
102
+ # Print the current URL (for debugging purposes)
103
+ print("Fetching data from:", base_url + '?' + '&'.join([f"{k}={v}" for k, v in params.items()]))
104
+
105
+ # Send a GET request to the API
106
+ response = requests.get(base_url, params=params)
107
+
108
+ # Check if the request was successful
109
+ if response.status_code == 200:
110
+ data = response.json() # Parse JSON response
111
+ studies = data.get('studies', []) # Extract the list of studies
112
+
113
+ # Loop through each study and extract specific information
114
+ for study in studies:
115
+ # Safely access nested keys
116
+ nctId = study['protocolSection']['identificationModule'].get('nctId', 'Unknown')
117
+ startDate = study['protocolSection']['statusModule'].get('startDateStruct', {}).get('date', 'Unknown Date')
118
+ conditions = ', '.join(study['protocolSection']['conditionsModule'].get('conditions', ['No conditions listed']))
119
+ title = study['protocolSection']['identificationModule'].get('briefTitle', 'no title')
120
+ summary = study['protocolSection']['descriptionModule'].get('briefSummary', 'no summary')
121
+
122
+ # Extract locations safely
123
+ locations_list = study['protocolSection'].get('contactsLocationsModule', {}).get('locations', [])
124
+ locations = ', '.join([f"{location.get('city', 'No City')} - {location.get('country', 'No Country')}" for location in locations_list]) if locations_list else "No locations listed"
125
+
126
+ JapanesLocations = extract_japan_cities(locations)
127
+ # Extract dates and phases
128
+ primaryCompletionDate = study['protocolSection']['statusModule'].get('primaryCompletionDateStruct', {}).get('date', 'Unknown Date')
129
+
130
+ phases = ', '.join(study['protocolSection']['designModule'].get('phases', ['Not Available']))
131
+ eligibilityCriteria = study['protocolSection']['eligibilityModule'].get('eligibilityCriteria', 'Unknown')
132
+
133
+ # Append the data to the list as a dictionary
134
+ data_list.append({
135
+ "NCTID": nctId,
136
+ "タイトル": title,
137
+ #"Start Date": startDate,
138
+ #"Primary Completion Date": primaryCompletionDate,
139
+ "対象となる癌": conditions,
140
+ "サマリー": summary,
141
+ "場所": JapanesLocations,
142
+ #"Phases": phases,
143
+ "クライテリア": eligibilityCriteria
144
+ })
145
+
146
+ # Check for nextPageToken and update the params or break the loop
147
+ nextPageToken = data.get('nextPageToken')
148
+ if nextPageToken:
149
+ params['pageToken'] = nextPageToken # Set the pageToken for the next request
150
+ else:
151
+ break # Exit the loop if no nextPageToken is present
152
+ else:
153
+ print("Failed to fetch data. Status code:", response.status_code)
154
+ break
155
+
156
+ # Create a DataFrame from the list of dictionaries
157
+ df = pd.DataFrame(data_list)
158
+ return df
OpenAITools/.ipynb_checkpoints/scrapeThisData-checkpoint.py ADDED
@@ -0,0 +1,237 @@
1
+ from selenium import webdriver
2
+ from selenium.webdriver.support.ui import Select
3
+ from selenium.webdriver.common.by import By
4
+
5
+ import requests
6
+ from bs4 import BeautifulSoup
7
+ import re
8
+
9
+ import os
10
+ import time
11
+
12
+ from selenium.webdriver.support.ui import WebDriverWait
13
+ from selenium.webdriver.common.by import By
14
+ from selenium.webdriver.support import expected_conditions as EC
15
+ from selenium.webdriver.common.action_chains import ActionChains
16
+ import chromedriver_autoinstaller
17
+
18
+ class ScrapeThatData:
19
+
20
+ def __init__(self, time_threshold = 10):
21
+
22
+ try:
23
+ chrome_options = webdriver.ChromeOptions()
24
+ chrome_options.add_argument('--no-sandbox')
25
+ self.driver = webdriver.Chrome(options=chrome_options)
26
+
27
+ except:
28
+ chromedriver_autoinstaller.install()
29
+ chrome_options = webdriver.ChromeOptions()
30
+ chrome_options.add_argument('--no-sandbox')
31
+ self.driver = webdriver.Chrome(options=chrome_options)
32
+
33
+
34
+
35
+ self.wait = WebDriverWait(self.driver,time_threshold)
36
+ self.attribute_dict = {'status':1 ,'conditions':2, 'interventions': 3, 'study type':4,
37
+ 'phase':5, 'sponsor':6, 'funder type':7 , 'study design': 8,
38
+ 'outcome measures':9, 'number enrolled':10, 'sex':11, 'age':12,
39
+ 'nct number': 13, 'other ids':14, 'title acronym': 15 , 'study start': 16,
40
+ 'primary completion': 17, 'study completion': 18 , 'first posted': 19,
41
+ 'last update posted': 20 , 'results first posted': 21 , 'locations':22, 'study documents': 23}
42
+
43
+ self.status_dict = {'not yet recruiting' : 'notYetRecrCB',
44
+ 'recruiting' : 'recruitingCB',
45
+ 'enrolling by invitation':'enrollingByInvCB',
46
+ 'active, not recruiting': 'activeCB',
47
+ 'suspended': 'suspendedCB',
48
+ 'terminated':'terminatedCB',
49
+ 'completed':'completedCB',
50
+ 'withdrawn': 'withdrawnCB',
51
+ 'unknown status': 'unknownCB'}
52
+
53
+ def clicking_show_hide_cols(self, driver):
54
+ columns = driver.find_element(By.XPATH,'//*[@id="theDataTable_wrapper"]/div[3]/button')
55
+ action_chain = ActionChains(driver)
56
+ action_chain.move_to_element(columns).click()
57
+ action_chain.perform()
58
+
59
+ def select_attributes_to_show(self, listed_attributes, attribute_dict):
60
+ ll = [value.lower() for value in listed_attributes if value.lower() in ['status', 'conditions', 'interventions', 'locations']]
61
+ if ll:
62
+ to_show = [value.lower() for value in listed_attributes if value.lower() not in ll]
63
+ to_hide = [value for value in ['status', 'conditions', 'interventions', 'locations'] if value not in ll]
64
+ to_click = to_hide + to_show
65
+ for att in to_click:
66
+ self.clicking_show_hide_cols(self.driver)
67
+ time.sleep(1)
68
+ self.wait.until(EC.presence_of_element_located((By.XPATH,'//*[@id="theDataTable_wrapper"]/div[3]/div[2]/button['+ str(attribute_dict[att]) + ']'))).click()
69
+ time.sleep(1)
70
+ else:
71
+ for att in listed_attributes:
72
+ self.clicking_show_hide_cols(self.driver)
73
+ time.sleep(1)
74
+ self.wait.until(EC.presence_of_element_located((By.XPATH,'//*[@id="theDataTable_wrapper"]/div[3]/div[2]/button['+ str(attribute_dict[att.lower()]) + ']'))).click()
75
+ time.sleep(1)
76
+
77
+ def select_by_status(self, listed_states, status_dict):
78
+ if listed_states:
79
+ for status in listed_states:
80
+ self.driver.find_element(By.ID,status_dict[status.lower()]).click()
81
+
82
+ self.driver.find_element(By.XPATH,'//*[@id="FiltersBody"]/div[1]/input[1]').click()
83
+ time.sleep(3)
84
+
85
+
86
+ select = Select(self.driver.find_element_by_name('theDataTable_length'))
87
+ select.select_by_value('100')
88
+
89
+ def collect_data_search_page(self,l_ordered, amount_of_data = None):
90
+
91
+ class_name = ''
92
+ page_index = 1
93
+
94
+ elements = [l_ordered]
95
+
96
+ while 'disabled' not in class_name :
97
+
98
+
99
+
100
+ time.sleep(10)
101
+
102
+ print('Getting data from page {}'.format(page_index))
103
+
104
+ #Counting how many rows of the table appear
105
+ table = self.driver.find_element(By.ID,'theDataTable')
106
+ row_count = len(table.find_elements(By.TAG_NAME,"tr"))
107
+
108
+ #Looping table page
109
+ for index in range(1, row_count):
110
+ row = []
111
+ if 'status' in l_ordered:
112
+ self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,'#theDataTable > tbody > tr:nth-child('+str(index)+') > td:nth-child(3)')))
113
+ status_element = self.driver.find_elements(By.CLASS_NAME,'#theDataTable > tbody > tr:nth-child('+str(index)+') > td:nth-child(3) > span')
114
+ row.append(status_element.text.strip())
115
+ for i, val in enumerate(l_ordered):
116
+ if val == 'status':
117
+ continue
118
+
119
+ self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,'#theDataTable > tbody > tr:nth-child('+str(index)+') > td:nth-child('+str(4+i)+')')))
120
+ element = self.driver.find_elements(By.CLASS_NAME,'#theDataTable > tbody > tr:nth-child('+str(index)+') > td:nth-child('+str(4+i)+')')
121
+ try:
122
+ row.append(element.text.strip())
123
+ except:
124
+ print(i, element)
125
+ else:
126
+ for i, val in enumerate(l_ordered):
127
+ self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,'#theDataTable > tbody > tr:nth-child('+str(index)+') > td:nth-child('+str(3+i)+')')))
128
+ element = self.driver.find_elements(By.CLASS_NAME,'#theDataTable > tbody > tr:nth-child('+str(index)+') > td:nth-child('+str(3+i)+')')
129
+ try:
130
+ row.append(element.text.strip())
131
+ except:
132
+ print(i, element)
133
+ elements.append(row)
134
+
135
+
136
+
137
+
138
+ #Getting next page button
139
+ next_page= self.driver.find_element(By.ID,"theDataTable_next")
140
+
141
+ #Getting the class attribute of the next page button
142
+ class_name = next_page.get_attribute('class')
143
+
144
+ #Going to the next page
145
+ next_page.click()
146
+ page_index += 1
147
+
148
+ if amount_of_data:
149
+ if len(elements) >= amount_of_data or row_count < amount_of_data :
150
+ break
151
+ else:
152
+ continue
153
+
154
+ return elements
155
+
156
+ def get_criteria(self, NCTnumber):
157
+
158
+ url = 'https://clinicaltrials.gov/ct2/show/' + NCTnumber
159
+ ClinicalTrialpage = requests.get(url)
160
+ soup = BeautifulSoup(ClinicalTrialpage.text, 'html.parser')
161
+
162
+ wrapping_crit_class = soup.find_all("div", {"class": "tr-indent2"})
163
+ list_elements = wrapping_crit_class[1].find_all(re.compile("(ul|ol)"))
164
+ inclusion, exclusion = ('','')
165
+
166
+
167
+ if not list_elements:
168
+ print ("WARNING: Study number " + NCTnumber + " doesn't have eligibility criteria or HTML tag format is not a list")
169
+ else:
170
+
171
+ if len(list_elements) == 1:
172
+ try:
173
+ if wrapping_crit_class[1].find(text = 'Inclusion Criteria:'):
174
+ inclusion = list_elements[0].find_all("li")
175
+
176
+ elif wrapping_crit_class[1].find(text = 'Exclusion Criteria:'):
177
+ exclusion = list_elements[0].find_all("li")
178
+ except:
179
+ print('criteria doesnt exist')
180
+ else:
181
+ inclusion = list_elements[0].find_all("li")
182
+ exclusion = list_elements[1].find_all("li")
183
+
184
+
185
+ inclusion = ' '.join([t.text.strip() for t in inclusion ])
186
+ exclusion = ' '.join([t.text.strip() for t in exclusion ])
187
+
188
+ return(inclusion, exclusion)
189
+
190
+ #function that gets number of patients enrolled in a study
191
+ def get_enrollment (self, NCTnumber):
192
+ url = 'https://clinicaltrials.gov/ct2/show/' + NCTnumber
193
+ ClinicalTrialpage = requests.get(url)
194
+ soup = BeautifulSoup(ClinicalTrialpage.text, 'html.parser')
195
+ enrollment = ''
196
+ wrapping_enrol_class = soup.find_all('td', {'headers':'studyInfoColData','style':"padding-left:1em"})
197
+ if not wrapping_enrol_class:
198
+ print('WARNING: Number of Participants in Study number '+ NCTnumber +' is unavailable')
199
+ else:
200
+ enrollment = wrapping_enrol_class[1]
201
+ enrollment = enrollment.text.split()[0]
202
+ if enrollment.isdigit() == False:
203
+ print ('WARNING: Number of Participants in Study number '+ NCTnumber +' is unavailable')
204
+ else:
205
+ return(enrollment)
206
+
207
+
208
+
209
+ def __call__(self, condition, listed_attributes, listed_states, amount_of_data):
210
+
211
+ self.driver.get('https://clinicaltrials.gov/ct2/results?cond=' + condition + '&rank=1&view=record#rowId0')
212
+ self.select_attributes_to_show(listed_attributes, self.attribute_dict)
213
+
214
+ try:
215
+ self.select_by_status(listed_states, self.status_dict)
216
+ except:
217
+ print('select by status is a problem')
218
+ n = []
219
+ for i in listed_attributes:
220
+ n.append(self.attribute_dict[i.lower()])
221
+ attribute_ordered = [list(self.attribute_dict.keys())[list(self.attribute_dict.values()).index(i)]for i in sorted(n)]
222
+
223
+ search_data = self.collect_data_search_page(attribute_ordered, amount_of_data=amount_of_data)
224
+ nct_numbers = [e[search_data[0].index('nct number')] for e in search_data[1:]]
225
+ search_data[0].extend(['inclusion', 'exclusion', 'enrollment'])
226
+ for index, nct in enumerate(nct_numbers):
227
+ if index % 100 == 0 and index!= 0:
228
+ print("Collected Data from {} Studies: ".format(index))
229
+
230
+ inc, exc = self.get_criteria(nct)
231
+ enrol = self.get_enrollment(nct)
232
+ search_data[index + 1].extend([inc, exc, enrol])
233
+ return search_data
234
+ # except:
235
+ # print('no data available with the specified status')
236
+
237
+
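A minimal usage sketch for the scraper above (hypothetical, not part of the commit; it assumes a local Chrome/chromedriver setup, that the module is also importable as OpenAITools/scrapeThisData.py, and the legacy "classic" ClinicalTrials.gov results UI that these XPaths target):

    from OpenAITools.scrapeThisData import ScrapeThatData  # assumed import path

    scraper = ScrapeThatData(time_threshold=10)
    # condition, columns to show, recruitment states, and a row cap, as defined by __call__ above
    table = scraper('glioma', ['nct number', 'status', 'conditions'], ['recruiting'], amount_of_data=200)
    header, rows = table[0], table[1:]  # first entry holds the ordered column names plus inclusion/exclusion/enrollment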
OpenAITools/CrinicalTrialTools.py ADDED
@@ -0,0 +1,423 @@
1
+ from langchain_community.agent_toolkits import create_sql_agent
2
+ from langchain_openai import ChatOpenAI
3
+ from langchain_groq import ChatGroq
4
+ from langchain_core.prompts import ChatPromptTemplate
5
+ from langchain_core.pydantic_v1 import BaseModel, Field
6
+ import pandas as pd
7
+ from pydantic import BaseModel, Field
8
+
9
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
10
+ from langchain_community.vectorstores import Chroma
11
+ from langchain.embeddings import HuggingFaceEmbeddings
12
+ from langchain_core.runnables import RunnablePassthrough
13
+ from langchain_core.output_parsers import StrOutputParser
14
+
15
+
16
+
17
+ gpt = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
18
+ #agent_gpt_executor = create_sql_agent(gpt, db=db, agent_type="tool-calling", verbose=True)
19
+
20
+ ## make database
21
+ from langchain_community.utilities import SQLDatabase
22
+ from sqlalchemy import create_engine
23
+
24
+ from langchain.prompts import ChatPromptTemplate
25
+ from langchain.schema import SystemMessage
26
+ from langchain_core.prompts import MessagesPlaceholder
27
+ #agent_groq_executor = create_sql_agent(llm, db=db, agent_type="tool-calling", verbose=True)
28
+
29
+ from OpenAITools.FetchTools import fetch_clinical_trials, fetch_clinical_trials_jp
30
+
31
+
32
+
33
+ ## Cancer Name の抽出
34
+ class ExtractTumorName(BaseModel):
35
+ """Extract tumor name from the user's question."""
36
+ tumor_name: str = Field(description="Extracted tumor name from the question, or 'None' if no tumor found")
37
+
38
+ class TumorNameExtractor:
39
+ def __init__(self, llm):
40
+ self.llm = llm
41
+
42
+ # LLMの出力を構造化するための設定
43
+ self.structured_llm_extractor = self.llm.with_structured_output(ExtractTumorName)
44
+
45
+ # システムプロンプトを設定
46
+ self.system_prompt = """あなたは、ユーザーの質問に基づいて腫瘍名を英語で抽出するシステムです。\n
47
+ 質問文に腫瘍の種類や名前が含まれている場合、それを英語で返してください。\n
48
+ 質問文に腫瘍名がない場合は 'None' と返答してください。"""
49
+
50
+ # プロンプトテンプレート
51
+ self.grade_prompt = ChatPromptTemplate.from_messages(
52
+ [
53
+ ("system", self.system_prompt),
54
+ ("human", "ユーザーの質問: {question}"),
55
+ ]
56
+ )
57
+
58
+ def extract_tumor_name(self, question: str) -> str:
59
+ """
60
+ 腫瘍名を抽出するメソッド。
61
+ :param question: 質問文
62
+ :return: 抽出された腫瘍名
63
+ """
64
+ # 質問から腫瘍名を抽出する処理
65
+ tumor_extractor = self.grade_prompt | self.structured_llm_extractor
66
+ result = tumor_extractor.invoke({"question": question})
67
+ return result.tumor_name
68
+
69
+ ### 質問変更システム
70
+
71
+ # ModifyQuestionの出力形式を定義
72
+ class ModifyQuestion(BaseModel):
73
+ """Class for modifying a question by inserting NCTID."""
74
+ modified_question: str = Field(description="The modified question with the inserted NCTID.")
75
+
76
+ class QuestionModifier:
77
+ def __init__(self, llm):
78
+ self.llm = llm
79
+
80
+ # LLMの出力を構造化するための設定
81
+ self.structured_llm_modifier = self.llm.with_structured_output(ModifyQuestion)
82
+
83
+ # システムプロンプトを設定
84
+ self.system_prompt = """あなたは、ユーザーの質問に対して適切なNCTIDを挿入して質問を変更するシステムです。\n
85
+ 質問文にNCTIDを挿入し、形式に基づいて新しい質問を生成してください。\n
86
+ 例えば16歳男性の神経膠腫の患者さんが参加できる臨床治験を教えて下さいという質問に対しては\n
87
+ 16歳男性の神経膠腫の患者さんは{nct_id}に参加できますか?と変更して下さい\n
88
+ NCTIDは {nct_id} を使用してください。"""
89
+
90
+ # プロンプトテンプレート
91
+ self.modify_prompt = ChatPromptTemplate.from_messages(
92
+ [
93
+ ("system", self.system_prompt),
94
+ ("human", "ユーザーの質問: {question}"),
95
+ ]
96
+ )
97
+
98
+ def modify_question(self, question: str, nct_id: str) -> str:
99
+ """
100
+ 質問を変更するメソッド。
101
+ :param question: 質問文
102
+ :param nct_id: NCTID
103
+ :return: NCTIDを挿入した新しい質問
104
+ """
105
+ # 質問を変更するプロセス
106
+ question_modifier = self.modify_prompt | self.structured_llm_modifier
107
+ result = question_modifier.invoke({"question": question, "nct_id": nct_id})
108
+ modify_question = result.modified_question
109
+ return modify_question
110
+
111
+ class QuestionModifierSecond:
112
+ def __init__(self, llm):
113
+ self.llm = llm
114
+
115
+ # LLMの出力を構造化するための設定
116
+ self.structured_llm_modifier = self.llm.with_structured_output(ModifyQuestion)
117
+
118
+ # システムプロンプトを設定
119
+ self.system_prompt = """あなたは、ユーザーの質問を変更するシステムです。\n
120
+ 形式に基づいて新しい質問を生成してください。\n
121
+ 例えば16歳男性の神経膠腫の患者さんが参加できる臨床治験を教えて下さいという質問に対しては\n
122
+ 16歳男性の神経膠腫の患者さんはlこの治験に参加できますか?と変更して下さい\n
123
+ """
124
+
125
+ # プロンプトテンプレート
126
+ self.modify_prompt = ChatPromptTemplate.from_messages(
127
+ [
128
+ ("system", self.system_prompt),
129
+ ("human", "ユーザーの質問: {question}"),
130
+ ]
131
+ )
132
+
133
+ def modify_question(self, question: str) -> str:
134
+ """
135
+ 質問を変更するメソッド。
136
+ :param question: 質問文
137
+ :param nct_id: NCTID
138
+ :return: NCTIDを挿入した新しい質問
139
+ """
140
+ # 質問を変更するプロセス
141
+ question_modifier = self.modify_prompt | self.structured_llm_modifier
142
+ result = question_modifier.invoke({"question": question})
143
+ modify_question = result.modified_question
144
+ return modify_question
145
+
146
+ class QuestionModifierEnglish:
147
+ def __init__(self, llm):
148
+ self.llm = llm
149
+
150
+ # LLMの出力を構造化するための設定
151
+ self.structured_llm_modifier = self.llm.with_structured_output(ModifyQuestion)
152
+
153
+ # システムプロンプトを設定
154
+ self.system_prompt = """あなたは、ユーザーの質問を変更し英語に翻訳するシステムです。\n
155
+ 形式に基づいて新しい質問を生成してください。\n
156
+ 例えば16歳男性の神経膠腫の患者さんが参加できる臨床治験を教えて下さいという質問に対しては\n
157
+ Can a 16 year old male patient with glioma participate in this clinical trial?と変更して下さい\n
158
+ """
159
+
160
+ # プロンプトテンプレート
161
+ self.modify_prompt = ChatPromptTemplate.from_messages(
162
+ [
163
+ ("system", self.system_prompt),
164
+ ("human", "ユーザーの質問: {question}"),
165
+ ]
166
+ )
167
+
168
+ def modify_question(self, question: str) -> str:
169
+ """
170
+ 質問を変更するメソッド。
171
+ :param question: 質問文
172
+ :param nct_id: NCTID
173
+ :return: NCTIDを挿入した新しい質問
174
+ """
175
+ # 質問を変更するプロセス
176
+ question_modifier = self.modify_prompt | self.structured_llm_modifier
177
+ result = question_modifier.invoke({"question": question})
178
+ modify_question = result.modified_question
179
+ return modify_question
180
+
181
+
182
+ ### Make criteria check Agent
183
+
184
+ class ClinicalTrialAgent:
185
+ def __init__(self, llm, db):
186
+ self.llm = llm
187
+ self.db = db
188
+
189
+ # システムプロンプトの定義
190
+ self.system_prompt = """
191
+ あなたは患者さんに適した治験を探すエージェントです。
192
+ データベースのEligibility Criteriaをチェックして患者さんがその治験を受けることが可能かどうか答えて下さい
193
+ """
194
+
195
+ # プロンプトテンプレートを作成
196
+ self.prompt = ChatPromptTemplate.from_messages(
197
+ [("system", self.system_prompt),
198
+ ("human", "{input}"),
199
+ MessagesPlaceholder("agent_scratchpad")]
200
+ )
201
+
202
+ # SQL Agentの設定
203
+ self.agent_executor = self.create_sql_agent(self.llm, self.db, self.prompt)
204
+
205
+ def create_sql_agent(self, llm, db, prompt):
206
+ """SQLエージェントを作成するメソッド"""
207
+ agent_executor = create_sql_agent(
208
+ llm,
209
+ db=db,
210
+ prompt=prompt,
211
+ agent_type="tool-calling",
212
+ verbose=True
213
+ )
214
+ return agent_executor
215
+
216
+ def get_agent_judgment(self, modify_question: str) -> str:
217
+ """
218
+ Modifyされた質問を元に、患者さんが治験に参加可能かどうかのエージェント判断を取得。
219
+ :param modify_question: NCTIDが挿入された質問
220
+ :return: エージェントの判断 (AgentJudgment)
221
+ """
222
+ # LLMに質問を投げて、判断を得る
223
+ result = self.agent_executor.invoke({"input": modify_question})
224
+ return result
225
+
226
+
227
+ class SimpleClinicalTrialAgent:
228
+ def __init__(self, llm):
229
+ self.llm = llm
230
+
231
+ def evaluate_eligibility(self, TargetCriteria: str, question: str) -> str:
232
+ """
233
+ 臨床試験の参加適格性を評価するメソッド。
234
+ :param TargetCriteria: 試験基準 (Inclusion/Exclusion criteria)
235
+ :param question: 患者の条件に関する質問
236
+ :return: 臨床試験に参加可能かどうかのLLMからの応答
237
+ """
238
+
239
+ # プロンプトの定義
240
+ prompt_template = """
241
+ You are an agent looking for a suitable clinical trial for a patient.
242
+ Please answer whether the patient is eligible for this clinical trial based on the following criteria. If you do not know the answer, say you do not know. Your answer should be brief, no more than 3 sentences.
243
+ Question: {question}
244
+ Criteria:
245
+ """ + TargetCriteria
246
+
247
+ # プロンプトテンプレートの作成
248
+ criteria_prompt = ChatPromptTemplate.from_messages(
249
+ [
250
+ (
251
+ "human",
252
+ prompt_template
253
+ )
254
+ ]
255
+ )
256
+
257
+ # RAGチェーンの作成
258
+ rag_chain = (
259
+ {"question": RunnablePassthrough()}
260
+ | criteria_prompt
261
+ | self.llm
262
+ | StrOutputParser()
263
+ )
264
+
265
+ # 質問をチェーンに渡して、応答を得る
266
+ response = rag_chain.invoke(question)
267
+ return response
268
+
269
+
270
+ ### output 評価システム
271
+ class TrialEligibilityGrader(BaseModel):
272
+ """3値評価: yes, no, unclear"""
273
+ score: str = Field(
274
+ description="The eligibility of the patient for the clinical trial based on the document. Options are: 'yes', 'no', or 'unclear'."
275
+ )
276
+
277
+ class GraderAgent:
278
+ def __init__(self, llm):
279
+ self.llm = llm
280
+
281
+ # LLMの出力を構造化するための設定
282
+ self.structured_llm_grader = self.llm.with_structured_output(TrialEligibilityGrader)
283
+
284
+ # Graderの入力プロンプト
285
+ self.system_prompt = """
286
+ あなたは治験に参加する患者の適合性を評価するGraderです。
287
+ 以下のドキュメントを読み、患者が治験に参加可能かどうかを判断してください。
288
+ 'yes'(参加可能)、'no'(参加不可能)、'unclear'(判断できない)の3値で答えてください。
289
+ """
290
+
291
+ # 評価のためのプロンプトを作成
292
+ self.grade_prompt = ChatPromptTemplate.from_messages(
293
+ [
294
+ ("system", self.system_prompt),
295
+ (
296
+ "human",
297
+ "取得したドキュメント: \n\n {document} ",
298
+ ),
299
+ ]
300
+ )
301
+
302
+ def evaluate_eligibility(self, AgentJudgment_output: str) -> str:
303
+ """
304
+ AgentJudgment['output']を基に患者が治験に参加可能かどうかを評価し、スコア(AgentGrade)を返す。
305
+ :param AgentJudgment_output: エージェント判断の 'output' の値
306
+ :return: 評価されたスコア (AgentGrade)
307
+ """
308
+ GraderAgent = self.grade_prompt | self.structured_llm_grader
309
+ result = GraderAgent.invoke({"document": AgentJudgment_output})
310
+ AgentGrade = result.score
311
+ return AgentGrade
312
+
313
+ import re
314
+
315
+ class LLMTranslator:
316
+ def __init__(self, llm):
317
+ self.llm = llm
318
+ self.structured_llm_modifier = self.llm.with_structured_output(ModifyQuestion)
319
+
320
+ self.system_prompt = """あなたは、優秀な翻訳者です。\n
321
+ 日本語を英語に翻訳して下さい。\n
322
+ """
323
+ self.system_prompt2 = """あなたは、優秀な翻訳者です。\n
324
+ 日本語を英語に以下のフォーマットに従って翻訳して下さい。\n
325
+ MainQuestion:
326
+ Known gene mutation:
327
+ Measurable tumour:
328
+ Biopsiable tumour:
329
+ """
330
+
331
+ self.modify_prompt = ChatPromptTemplate.from_messages(
332
+ [
333
+ ("system", self.system_prompt),
334
+ ("human", "ユーザーの質問: {question}"),
335
+ ]
336
+ )
337
+
338
+ self.modify_prompt2 = ChatPromptTemplate.from_messages(
339
+ [
340
+ ("system", self.system_prompt2),
341
+ ("human", "ユーザーの質問: {question}"),
342
+ ]
343
+ )
344
+
345
+ def is_english(self, text: str) -> bool:
346
+ """
347
+ 簡易的にテキストが英語かどうかを判定する関数
348
+ :param text: 判定するテキスト
349
+ :return: 英語の場合True、日本語の場合False
350
+ """
351
+ # 英語のアルファベットが多く含まれているかを確認
352
+ return bool(re.match(r'^[A-Za-z0-9\s.,?!]+$', text))
353
+
354
+ def translate(self, question: str) -> str:
355
+ """
356
+ 質問を翻訳するメソッド。英語の質問はそのまま返す。
357
+ :param question: 質問文
358
+ :return: 翻訳済みの質問文、または元の質問文(英語の場合)
359
+ """
360
+ # 質問が英語の場合、そのまま返す
361
+ if self.is_english(question):
362
+ return question
363
+
364
+ # 日本語の質問は翻訳プロセスにかける
365
+ question_modifier = self.modify_prompt | self.structured_llm_modifier
366
+ result = question_modifier.invoke({"question": question})
367
+ modify_question = result.modified_question
368
+ return modify_question
369
+
370
+ def translateQuestion(self, question: str) -> str:
371
+ """
372
+ フォーマット付きで質問を翻訳するメソッド。
373
+ :param question: 質問文
374
+ :return: フォーマットに従った翻訳済みの質問
375
+ """
376
+ question_modifier = self.modify_prompt2 | self.structured_llm_modifier
377
+ result = question_modifier.invoke({"question": question})
378
+ modify_question = result.modified_question
379
+ return modify_question
380
+
381
+ def generate_ex_question(age, sex, tumor_type, GeneMutation, Meseable, Biopsiable):
382
+ # GeneMutationが空の場合はUnknownに設定
383
+ gene_mutation_text = GeneMutation if GeneMutation else "Unknown"
384
+
385
+ # MeseableとBiopsiableの値をYes, No, Unknownに変換
386
+ meseable_text = (
387
+ "Yes" if Meseable == "有り" else "No" if Meseable == "無し" else "Unknown"
388
+ )
389
+ biopsiable_text = (
390
+ "Yes" if Biopsiable == "有り" else "No" if Biopsiable == "無し" else "Unknown"
391
+ )
392
+
393
+ # 質問文の生成
394
+ ex_question = f"""{age}歳{sex}の{tumor_type}患者さんはこの治験に参加することができますか?
395
+ 判明している遺伝子変異: {gene_mutation_text}
396
+ Meseable tumor: {meseable_text}
397
+ Biopsiable tumor: {biopsiable_text}
398
+ です。
399
+ """
400
+ return ex_question
401
+
402
+ def generate_ex_question_English(age, sex, tumor_type, GeneMutation, Meseable, Biopsiable):
403
+ # GeneMutationが空の場合は"Unknown"に設定
404
+ gene_mutation_text = GeneMutation if GeneMutation else "Unknown"
405
+
406
+ # sexの値を male または female に変換
407
+ sex_text = "male" if sex == "男性" else "female" if sex == "女性" else "Unknown"
408
+
409
+ # MeseableとBiopsiableの値を "Yes", "No", "Unknown" に変換
410
+ meseable_text = (
411
+ "Yes" if Meseable == "有り" else "No" if Meseable == "無し" else "Unknown"
412
+ )
413
+ biopsiable_text = (
414
+ "Yes" if Biopsiable == "有り" else "No" if Biopsiable == "無し" else "Unknown"
415
+ )
416
+
417
+ # 英語での質問文を生成
418
+ ex_question = f"""Can a {age}-year-old {sex_text} patient with {tumor_type} participate in this clinical trial?
419
+ Known gene mutation: {gene_mutation_text}
420
+ Measurable tumor: {meseable_text}
421
+ Biopsiable tumor: {biopsiable_text}
422
+ """
423
+ return ex_question
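A hedged end-to-end sketch of how the components above appear intended to fit together (not part of the commit; assumes OPENAI_API_KEY is set and reuses the same gpt-3.5-turbo model the module itself instantiates):

    from langchain_openai import ChatOpenAI
    from OpenAITools.CrinicalTrialTools import (TumorNameExtractor, QuestionModifierSecond,
                                                SimpleClinicalTrialAgent, GraderAgent)
    from OpenAITools.FetchTools import fetch_clinical_trials

    llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
    question = "16歳男性の神経膠腫の患者さんが参加できる臨床治験を教えて下さい"

    tumor = TumorNameExtractor(llm).extract_tumor_name(question)        # e.g. "glioma"
    trials = fetch_clinical_trials(tumor)                               # DataFrame of trials recruiting in Japan
    reworded = QuestionModifierSecond(llm).modify_question(question)    # "...はこの治験に参加できますか?"
    agent, grader = SimpleClinicalTrialAgent(llm), GraderAgent(llm)

    for _, row in trials.iterrows():
        judgment = agent.evaluate_eligibility(row["Eligibility Criteria"], reworded)
        print(row["NCTID"], grader.evaluate_eligibility(judgment))      # yes / no / unclear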
OpenAITools/ECarteTools.py ADDED
@@ -0,0 +1,73 @@
+ import openai
+ import time
+ import wikipedia
+ import random
+ import re
+ import requests
+ from bs4 import BeautifulSoup
+ import os
+ import glob
+ from natsort import natsorted
+ import requests
+ from bs4 import BeautifulSoup
+ import xml.etree.ElementTree as ET
+ import pandas as pd
+ from pytrials.client import ClinicalTrials  # needed by get_selected_fileds below
+
+ wikipedia.set_lang("ja")
+ # Set the OpenAI API key
+ openai.api_key = os.environ['OPENAI_API_KEY']
+ engine = "gpt-3.5-turbo"
+
+
+ def generate(system_template, prompt, engine="gpt-3.5-turbo"):
+     while True:  # Caution: retries indefinitely while the OpenAI API is down
+         try:
+             response = openai.ChatCompletion.create(
+                 model=engine,
+                 messages=[
+                     {"role": "system", "content": system_template},
+                     {"role": "user", "content": prompt},
+                 ]
+             )
+             result = response["choices"][0]["message"]["content"]
+             return result
+         except:
+             print("リトライ")
+             time.sleep(30)
+
+
+ def generate_carte(prompt, engine="gpt-3.5-turbo"):
+     while True:  # Caution: retries indefinitely while the OpenAI API is down
+         try:
+             response = openai.ChatCompletion.create(
+                 model=engine,
+                 messages=[
+                     {"role": "system", "content": "You are useful assistant"},
+                     {"role": "user", "content": "%s\n・・・という患者と医師の会話をSOAP形式のカルテとして日本語で端的にまとめて下さい。各セクションはS),O), A),P)として下さい " % prompt},
+                 ]
+             )
+             result = response["choices"][0]["message"]["content"]
+             return result
+         except:
+             print("リトライ")
+             time.sleep(30)
+
+
+ def get_selected_fileds(texts):
+     ct = ClinicalTrials()  # pytrials client; `ct` was previously undefined here
+     input_name = texts.replace(' ', "+")
+     corona_fields = ct.get_study_fields(
+         search_expr="%s SEARCH[Location](AREA[LocationCountry]Japan AND AREA[LocationStatus]Recruiting)" % (input_name),
+         fields=["NCTId", "Condition", "BriefTitle", 'BriefSummary', 'EligibilityCriteria'],
+         max_studies=500,
+         fmt="csv")
+     return corona_fields
+
+
+ def get_retriever_str(fields):
+     retriever_str = ''
+     for i in range(1, len(fields)):
+         colnames = fields[0]
+         targetCol = fields[i]
+         for f in range(len(fields[0])):
+             retriever_str += colnames[f] + ":" + targetCol[f] + "\n"
+         retriever_str += '\n'
+     return retriever_str
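A small usage sketch for the helpers above (hypothetical, not part of the commit; assumes OPENAI_API_KEY is set and the pre-1.0 openai SDK that these openai.ChatCompletion calls require):

    from OpenAITools import ECarteTools

    conversation = "医師: 今日はどうされましたか? 患者: 昨日から頭が痛いです。"
    print(ECarteTools.generate_carte(conversation))  # SOAP-style summary in Japanese
    print(ECarteTools.generate("You are a helpful assistant", "上記の内容を一文で要約してください"))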
OpenAITools/ExpertTools.py ADDED
@@ -0,0 +1,246 @@
+ import os
+ import openai
+ import time
+ import wikipedia
+ import random
+ import re
+ import requests
+ from bs4 import BeautifulSoup
+ import os
+ import glob
+ from natsort import natsorted
+ import requests
+ from bs4 import BeautifulSoup
+ import xml.etree.ElementTree as ET
+ from pytrials.client import ClinicalTrials
+ from Bio import Entrez
+ import pandas as pd
+ import numpy as np
+ import time
+ #from langchain.agents import create_pandas_dataframe_agent
+ from langchain_experimental.agents import create_pandas_dataframe_agent
+ #from langchain.llms import OpenAI
+ from langchain_community.llms import OpenAI
+
+ # Set the OpenAI API key
+ openai.api_key = os.environ['OPENAI_API_KEY']
+ gptengine = "gpt-3.5-turbo"
+
+
+ """def get_selected_fileds(texts):
+     ct = ClinicalTrials()
+     input_name = texts.replace(' ', "+")
+     corona_fields = ct.get_study_fields(
+         search_expr="%s SEARCH[Location](AREA[LocationCountry]Japan AND AREA[LocationStatus]Recruiting)" % (input_name),
+         fields=["NCTId", "Condition", "BriefTitle", 'BriefSummary', 'EligibilityCriteria'],
+         max_studies=500,
+         fmt="csv")
+     return corona_fields"""
+
+ def get_retriever_str(fields):
+     retriever_str = ''
+     for i in range(1, len(fields)):
+         colnames = fields[0]
+         targetCol = fields[i]
+         for f in range(len(fields[0])):
+             retriever_str += colnames[f] + ":" + targetCol[f] + "\n"
+         retriever_str += '\n'
+     return retriever_str
+
+ def get_chanked_retriever(fields):
+     retriever_list = []
+     for i in range(1, len(fields)):
+         retriever_str = ''
+         colnames = fields[0]
+         targetCol = fields[i]
+         for f in range(len(fields[0])):
+             retriever_str += colnames[f] + ":" + targetCol[f] + "\n"
+         retriever_list.append(retriever_str)
+     return retriever_list
+
+ from pytrials.client import ClinicalTrials
+ def get_selected_fields(texts, split_criteria=False,
+                         split_word_number=False, split_number=700):
+     ct = ClinicalTrials()
+     input_name = texts.replace(' ', "+")
+     corona_fields = ct.get_study_fields(
+         search_expr="%s SEARCH[Location](AREA[LocationCountry]Japan AND AREA[LocationStatus]Recruiting)" % (input_name),
+         fields=["NCTId", "Condition", "BriefTitle", 'BriefSummary', 'EligibilityCriteria'],
+         max_studies=500,
+         fmt="csv")
+
+     if split_criteria:
+         new_fields = []
+
+         # Strings to search for
+         target_string1 = 'Exclusion Criteria'
+         target_string2 = 'Exclusion criteria'
+
+         # Look for the target string in each element and split just before it
+         for corona_field in corona_fields:
+             new_list = []
+             for item in corona_field:
+                 if target_string1 in item:
+                     split_position = item.index(target_string1)
+                     new_list.append(item[:split_position])
+                     new_list.append(item[split_position:])
+                 elif target_string2 in item:
+                     split_position = item.index(target_string2)
+                     new_list.append(item[:split_position])
+                     new_list.append(item[split_position:])
+                 else:
+                     new_list.append(item)
+             new_fields.append(new_list)
+     else:
+         new_fields = corona_fields
+
+     if split_word_number:
+         split_fields = []
+         for new_field in new_fields:
+             new_list = []
+
+             # Split any element longer than split_number characters into chunks
+             for item in new_field:
+                 item_length = len(item)
+                 if item_length > split_number:
+                     num_parts = -(-item_length // split_number)  # ceiling division to get the number of chunks
+                     for i in range(num_parts):
+                         start_index = i * split_number
+                         end_index = min((i + 1) * split_number, item_length)  # do not run past the end of the string
+                         new_list.append(item[start_index:end_index])
+                 else:
+                     new_list.append(item)
+
+             split_fields.append(new_list)
+         new_fields = split_fields
+
+     return new_fields
+
+
+ def print_agent_results(df, Ids,
+                         interesteds=['Condition', 'BriefTitle', 'BriefSummary', 'EligibilityCriteria'],
+                         translater=None):
+     results = ""
+     for Id in Ids:
+         print("%s\n" % Id)
+         sdf = df[df['NCTId'] == Id]
+         for interested in interesteds:
+             # Take the first element
+             results += '%s: \n %s \n' % (interested, sdf[interested].iloc[0])
+             #print('%s: \n %s \n' % (interested, sdf[interested].iloc[0]))
+     if translater:
+         to_be_printed = translater.translate(results)
+     else:
+         to_be_printed = results
+     print(to_be_printed)
+
+ def search(query):
+     Entrez.email = os.getenv('MAIL_ADRESS')
+     #Entrez.email='[email protected]'
+     handle = Entrez.esearch(db='pubmed',
+                             sort='relevance',
+                             retmax='20',
+                             retmode='xml',
+                             term=query)
+     results = Entrez.read(handle)
+     return results
+
+ def fetch_details(id_list):
+     ids = ','.join(id_list)
+     Entrez.email = os.getenv('MAIL_ADRESS')
+     #Entrez.email = '[email protected]'
+     handle = Entrez.efetch(db='pubmed',
+                            retmode='xml',
+                            id=ids)
+     results = Entrez.read(handle)
+     return results
+ '''def generate(prompt, engine=None):
+     if engine is None:
+         engine = gptengine
+     while True:  # Caution: retries indefinitely while the OpenAI API is down
+         try:
+             response = openai.ChatCompletion.create(
+                 model=engine,
+                 messages=[
+                     {"role": "system", "content": "You are useful assistant"},
+                     {"role": "user", "content": prompt},
+                 ]
+             )
+             result = response["choices"][0]["message"]["content"]
+             return result
+         except Exception as e:
+             print(e)
+             print("リトライ")
+             time.sleep(30)
+             pass
+ '''
+
+ def generate(prompt, engine=None):
+     if engine is None:
+         engine = gptengine
+     while True:  # Caution: retries indefinitely while the OpenAI API is down
+         try:
+             response = openai.chat.completions.create(
+                 model=engine,
+                 messages=[
+                     {"role": "system", "content": "You are useful assistant"},
+                     {"role": "user", "content": prompt},
+                 ]
+             )
+             #result = response["choices"][0]["message"]["content"]
+             result = response.choices[0].message.content
+             return result
+         except Exception as e:
+             print(e)
+             print("リトライ")
+             time.sleep(30)
+
+ def GetPubmedSummaryDf(studies):
+     title_list = []
+     abstract_list = []
+     journal_list = []
+     language_list = []
+     pubdate_year_list = []
+     pubdate_month_list = []
+     studiesIdList = studies['IdList']
+     chunk_size = 10000
+     for chunk_i in range(0, len(studiesIdList), chunk_size):
+         chunk = studiesIdList[chunk_i:chunk_i + chunk_size]
+
+         try:
+             papers = fetch_details(chunk)
+             for i, paper in enumerate(papers['PubmedArticle']):
+                 title_list.append(paper['MedlineCitation']['Article']['ArticleTitle'])
+                 try:
+                     abstract_list.append(paper['MedlineCitation']['Article']['Abstract']['AbstractText'][0])
+                 except:
+                     abstract_list.append('No Abstract')
+                 journal_list.append(paper['MedlineCitation']['Article']['Journal']['Title'])
+                 language_list.append(paper['MedlineCitation']['Article']['Language'][0])
+                 try:
+                     pubdate_year_list.append(paper['MedlineCitation']['Article']['Journal']['JournalIssue']['PubDate']['Year'])
+                 except:
+                     pubdate_year_list.append('No Data')
+                 try:
+                     pubdate_month_list.append(paper['MedlineCitation']['Article']['Journal']['JournalIssue']['PubDate']['Month'])
+                 except:
+                     pubdate_month_list.append('No Data')
+         except:  # occasionally a chunk might annoy your parser
+             pass
+     df = pd.DataFrame(list(zip(
+         title_list, abstract_list, journal_list, language_list, pubdate_year_list,
+         pubdate_month_list)),
+         columns=['Title', 'Abstract', 'Journal', 'Language', 'Year', 'Month'])
+     return df, abstract_list
+
+ def ClinicalAgent(fileds, verbose=False):
+     df = pd.DataFrame.from_records(fileds[1:], columns=fileds[0])
+     return create_pandas_dataframe_agent(OpenAI(temperature=0, model='gpt-3.5-turbo-16k'), df, verbose=verbose)
+
+ def GetNCTID(results):
+     # Regex that matches tokens beginning with NCT
+     pattern = r'\bNCT\d+\b'
+     # Extract the matching tokens
+     nct_words = re.findall(pattern, results)
+     return nct_words
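A short usage sketch for the PubMed helpers above (hypothetical, not part of the commit; assumes OPENAI_API_KEY and the MAIL_ADRESS e-mail variable used for Entrez are both set in the environment):

    from OpenAITools import ExpertTools

    studies = ExpertTools.search("glioblastoma immunotherapy")  # Entrez.esearch, top 20 hits by relevance
    df, abstracts = ExpertTools.GetPubmedSummaryDf(studies)     # Title/Abstract/Journal/Language/Year/Month
    print(df[["Title", "Journal", "Year"]].head())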
OpenAITools/FetchTools.py ADDED
@@ -0,0 +1,158 @@
1
+ import os
2
+ import pandas as pd
3
+ #from llama_index.llms.replicate import Replicate
4
+ import requests
5
+ import re
6
+
7
+
8
+ def extract_japan_cities(text):
9
+ # 正規表現を使用して " - Japan" で終わる都市名を抽出
10
+ pattern = r'(\b\w+\s*\w*\b) - Japan'
11
+ cities = re.findall(pattern, text)
12
+ unique_cities = list(set(cities))
13
+ # ユニークな都市名をソートしてカンマで区切られた文字列に変換
14
+ unique_cities.sort()
15
+ return ', '.join(unique_cities)
16
+
17
+ def fetch_clinical_trials(cancer_name):
18
+ search_expr="%s SEARCH[Location](AREA[LocationCountry]Japan AND AREA[LocationStatus]Recruiting)" % (cancer_name)
19
+ # Initial URL for the first API call
20
+ base_url = "https://clinicaltrials.gov/api/v2/studies"
21
+ params = {
22
+ "query.titles": search_expr,
23
+ "pageSize": 100
24
+ }
25
+
26
+ # Initialize an empty list to store the data
27
+ data_list = []
28
+ # Loop until there is no nextPageToken
29
+ while True:
30
+ # Print the current URL (for debugging purposes)
31
+ print("Fetching data from:", base_url + '?' + '&'.join([f"{k}={v}" for k, v in params.items()]))
32
+
33
+ # Send a GET request to the API
34
+ response = requests.get(base_url, params=params)
35
+
36
+ # Check if the request was successful
37
+ if response.status_code == 200:
38
+ data = response.json() # Parse JSON response
39
+ studies = data.get('studies', []) # Extract the list of studies
40
+
41
+ # Loop through each study and extract specific information
42
+ for study in studies:
43
+ # Safely access nested keys
44
+ nctId = study['protocolSection']['identificationModule'].get('nctId', 'Unknown')
45
+ startDate = study['protocolSection']['statusModule'].get('startDateStruct', {}).get('date', 'Unknown Date')
46
+ conditions = ', '.join(study['protocolSection']['conditionsModule'].get('conditions', ['No conditions listed']))
47
+ title = study['protocolSection']['identificationModule'].get('briefTitle', 'no title')
48
+ summary = study['protocolSection']['descriptionModule'].get('briefSummary', 'no summary')
49
+
50
+ # Extract locations safely
51
+ locations_list = study['protocolSection'].get('contactsLocationsModule', {}).get('locations', [])
52
+ locations = ', '.join([f"{location.get('city', 'No City')} - {location.get('country', 'No Country')}" for location in locations_list]) if locations_list else "No locations listed"
53
+
54
+ JapanesLocations = extract_japan_cities(locations)
55
+ # Extract dates and phases
56
+ primaryCompletionDate = study['protocolSection']['statusModule'].get('primaryCompletionDateStruct', {}).get('date', 'Unknown Date')
57
+
58
+ phases = ', '.join(study['protocolSection']['designModule'].get('phases', ['Not Available']))
59
+ eligibilityCriteria = study['protocolSection']['eligibilityModule'].get('eligibilityCriteria', 'Unknown')
60
+
61
+ # Append the data to the list as a dictionary
62
+ data_list.append({
63
+ "NCTID": nctId,
64
+ "Title": title,
65
+ #"Start Date": startDate,
66
+ "Primary Completion Date": primaryCompletionDate,
67
+ #"Conditions": conditions,
68
+ "Cancer": conditions,
69
+ "Summary": summary,
70
+ "Japanese Locations": JapanesLocations,
71
+ #"Phases": phases,
72
+ "Eligibility Criteria": eligibilityCriteria
73
+ })
74
+
75
+ # Check for nextPageToken and update the params or break the loop
76
+ nextPageToken = data.get('nextPageToken')
77
+ if nextPageToken:
78
+ params['pageToken'] = nextPageToken # Set the pageToken for the next request
79
+ else:
80
+ break # Exit the loop if no nextPageToken is present
81
+ else:
82
+ print("Failed to fetch data. Status code:", response.status_code)
83
+ break
84
+
85
+ # Create a DataFrame from the list of dictionaries
86
+ df = pd.DataFrame(data_list)
87
+ return df
88
+
89
+ def fetch_clinical_trials_jp(cancer_name):
90
+ search_expr="%s SEARCH[Location](AREA[LocationCountry]Japan AND AREA[LocationStatus]Recruiting)" % (cancer_name)
91
+ # Initial URL for the first API call
92
+ base_url = "https://clinicaltrials.gov/api/v2/studies"
93
+ params = {
94
+ "query.titles": search_expr,
95
+ "pageSize": 100
96
+ }
97
+
98
+ # Initialize an empty list to store the data
99
+ data_list = []
100
+ # Loop until there is no nextPageToken
101
+ while True:
102
+ # Print the current URL (for debugging purposes)
103
+ print("Fetching data from:", base_url + '?' + '&'.join([f"{k}={v}" for k, v in params.items()]))
104
+
105
+ # Send a GET request to the API
106
+ response = requests.get(base_url, params=params)
107
+
108
+ # Check if the request was successful
109
+ if response.status_code == 200:
110
+ data = response.json() # Parse JSON response
111
+ studies = data.get('studies', []) # Extract the list of studies
112
+
113
+ # Loop through each study and extract specific information
114
+ for study in studies:
115
+ # Safely access nested keys
116
+ nctId = study['protocolSection']['identificationModule'].get('nctId', 'Unknown')
117
+ startDate = study['protocolSection']['statusModule'].get('startDateStruct', {}).get('date', 'Unknown Date')
118
+ conditions = ', '.join(study['protocolSection']['conditionsModule'].get('conditions', ['No conditions listed']))
119
+ title = study['protocolSection']['identificationModule'].get('briefTitle', 'no title')
120
+ summary = study['protocolSection']['descriptionModule'].get('briefSummary', 'no summary')
121
+
122
+ # Extract locations safely
123
+ locations_list = study['protocolSection'].get('contactsLocationsModule', {}).get('locations', [])
124
+ locations = ', '.join([f"{location.get('city', 'No City')} - {location.get('country', 'No Country')}" for location in locations_list]) if locations_list else "No locations listed"
125
+
126
+ JapanesLocations = extract_japan_cities(locations)
127
+ # Extract dates and phases
128
+ primaryCompletionDate = study['protocolSection']['statusModule'].get('primaryCompletionDateStruct', {}).get('date', 'Unknown Date')
129
+
130
+ phases = ', '.join(study['protocolSection']['designModule'].get('phases', ['Not Available']))
131
+ eligibilityCriteria = study['protocolSection']['eligibilityModule'].get('eligibilityCriteria', 'Unknown')
132
+
133
+ # Append the data to the list as a dictionary
134
+ data_list.append({
135
+ "NCTID": nctId,
136
+ "タイトル": title,
137
+ #"Start Date": startDate,
138
+ #"Primary Completion Date": primaryCompletionDate,
139
+ "対象となる癌": conditions,
140
+ "サマリー": summary,
141
+ "場所": JapanesLocations,
142
+ #"Phases": phases,
143
+ "クライテリア": eligibilityCriteria
144
+ })
145
+
146
+ # Check for nextPageToken and update the params or break the loop
147
+ nextPageToken = data.get('nextPageToken')
148
+ if nextPageToken:
149
+ params['pageToken'] = nextPageToken # Set the pageToken for the next request
150
+ else:
151
+ break # Exit the loop if no nextPageToken is present
152
+ else:
153
+ print("Failed to fetch data. Status code:", response.status_code)
154
+ break
155
+
156
+ # Create a DataFrame from the list of dictionaries
157
+ df = pd.DataFrame(data_list)
158
+ return df
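
A minimal usage sketch for the two fetchers above; the import path matches how app.py imports them later in this commit, the tumour name is illustrative, and network access to clinicaltrials.gov is assumed.

from OpenAITools.FetchTools import fetch_clinical_trials, fetch_clinical_trials_jp

df_en = fetch_clinical_trials("gastric cancer")     # English column labels
df_jp = fetch_clinical_trials_jp("gastric cancer")  # Japanese column labels
print(df_en[["NCTID", "Title"]].head())
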
OpenAITools/JRCTTools.py ADDED
@@ -0,0 +1,559 @@
1
+ from selenium import webdriver
2
+ from selenium.webdriver.common.by import By
3
+ from selenium.webdriver.support.ui import WebDriverWait
4
+ from selenium.webdriver.support import expected_conditions as EC
5
+ import csv
6
+
7
+
8
+ from selenium import webdriver
9
+ from selenium.webdriver.common.by import By
10
+ from selenium.webdriver.support.ui import WebDriverWait
11
+ from selenium.webdriver.support import expected_conditions as EC
12
+ import csv
13
+
14
+ from selenium.common.exceptions import ElementClickInterceptedException, TimeoutException
15
+
16
+ import pandas as pd
17
+ import requests
18
+ from bs4 import BeautifulSoup
19
+ import time
20
+ import unicodedata
21
+ import re
22
+ import ast
23
+ import torch
24
+
25
+
26
+ from selenium import webdriver
27
+ from selenium.webdriver.common.by import By
28
+ from selenium.webdriver.support.ui import WebDriverWait
29
+ from selenium.webdriver.support import expected_conditions as EC
30
+ from selenium.common.exceptions import ElementClickInterceptedException
31
+
32
+
33
+ def fetch_clinical_trials(
34
+ disease_name="",
35
+ freeword="",
36
+ include_not_yet_recruiting=False,
37
+ include_suspended=False,
38
+ specific_clinical_research=True,
39
+ corporate_clinical_trial=True,
40
+ physician_initiated_clinical_trial=True,
41
+ ):
42
+ """
43
+ 指定された条件に基づいてjRCTから臨床試験情報を取得します。
44
+
45
+ Args:
46
+ disease_name (str): 対象疾患名(例: "がん 神経膠腫 骨髄腫")
47
+ freeword (str): フリーワード検索(例: "免疫療法")
48
+ include_not_yet_recruiting (bool): 募集前の試験も含める場合はTrue。
49
+ include_suspended (bool): 募集中断を含める場合はTrue。
50
+ specific_clinical_research (bool): 特定臨床研究を含める場合はTrue。
51
+ corporate_clinical_trial (bool): 企業治験を含める場合はTrue。
52
+ physician_initiated_clinical_trial (bool): 医師主導治験を含める場合はTrue。
53
+
54
+ Returns:
55
+ list: 検索結果のリスト([試験ID, タイトル, 対象疾患, 進捗状況, 日付, リンク])
56
+ """
57
+ # WebDriverを初期化
58
+ driver = webdriver.Chrome() # 必要に応じてChromeDriverを設定
59
+
60
+ all_results = []
61
+
62
+ try:
63
+ # jRCTの検索ページにアクセス
64
+ driver.get("https://jrct.niph.go.jp/search")
65
+
66
+ # 対象疾患名を入力
67
+ if disease_name:
68
+ disease_field = WebDriverWait(driver, 10).until(
69
+ EC.presence_of_element_located((By.ID, "reg-plobrem-1"))
70
+ )
71
+ disease_field.send_keys(disease_name)
72
+
73
+ # 対象疾患名の条件を「or」に設定
74
+ condition_select = driver.find_element(By.ID, "reg-plobrem-type")
75
+ condition_select.find_element(By.CSS_SELECTOR, "option[value='1']").click()
76
+
77
+ # フリーワード検索を入力
78
+ if freeword:
79
+ freeword_field = WebDriverWait(driver, 10).until(
80
+ EC.presence_of_element_located((By.ID, "demo-1"))
81
+ )
82
+ freeword_field.send_keys(freeword)
83
+
84
+ # フリーワード検索の条件を「or」に設定
85
+ condition_select = driver.find_element(By.ID, "others")
86
+ condition_select.find_element(By.CSS_SELECTOR, "option[value='1']").click()
87
+
88
+ # 募集中を選択
89
+ recruitment_checkbox = driver.find_element(By.ID, "reg-recruitment-2")
90
+ recruitment_checkbox.click()
91
+
92
+ # 募集前も含める場合
93
+ if include_not_yet_recruiting:
94
+ not_yet_checkbox = driver.find_element(By.ID, "reg-recruitment-1")
95
+ not_yet_checkbox.click()
96
+
97
+ # 募集中断を選択
98
+ if include_suspended:
99
+ suspended_checkbox = driver.find_element(By.ID, "reg-recruitment-3")
100
+ suspended_checkbox.click()
101
+
102
+ # 特定臨床研究を選択
103
+ if specific_clinical_research:
104
+ specific_checkbox = driver.find_element(By.ID, "is-specific1")
105
+ specific_checkbox.click()
106
+
107
+ # 企業治験を選択
108
+ if corporate_clinical_trial:
109
+ corporate_checkbox = driver.find_element(By.ID, "is-specific3")
110
+ corporate_checkbox.click()
111
+
112
+ # 医師主導治験を選択
113
+ if physician_initiated_clinical_trial:
114
+ physician_checkbox = driver.find_element(By.ID, "is-specific7")
115
+ physician_checkbox.click()
116
+
117
+ # 検索ボタンをクリック
118
+ try:
119
+ search_button = driver.find_element(By.NAME, "button_type")
120
+ driver.execute_script("arguments[0].scrollIntoView();", search_button) # ボタンを画面内にスクロール
121
+ WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.NAME, "button_type"))).click()
122
+ except ElementClickInterceptedException:
123
+ print("検索ボタンがクリックできないため、JavaScriptでクリックします。")
124
+ driver.execute_script("arguments[0].click();", search_button)
125
+
126
+ # ページネーション対応ループ
127
+ while True:
128
+ # 現在のページの結果がロードされるのを待機
129
+ WebDriverWait(driver, 10).until(
130
+ EC.presence_of_element_located((By.CSS_SELECTOR, "table tbody tr"))
131
+ )
132
+
133
+ # 現在のページの結果を取得
134
+ rows = driver.find_elements(By.CSS_SELECTOR, "table tbody tr")
135
+ for row in rows:
136
+ columns = row.find_elements(By.TAG_NAME, "td")
137
+ if len(columns) > 4:
138
+ # 試験情報をリストに追加
139
+ trial_id = columns[0].text
140
+ title = columns[1].text
141
+ condition = columns[2].text
142
+ status = columns[3].text
143
+ date = columns[4].text
144
+
145
+ # リンクを取得(エラー処理を追加)
146
+ try:
147
+ link = columns[1].find_element(By.TAG_NAME, "a").get_attribute("href")
148
+ except Exception:
149
+ link = "リンク取得エラー"
150
+
151
+ all_results.append([trial_id, title, condition, status, date, link])
152
+
153
+ # ページネーションの確認
154
+ try:
155
+ current_page = driver.find_element(By.CSS_SELECTOR, "ul.pagination li.active").text
156
+ print(f"{current_page} ページ目を処理しました。")
157
+ except Exception:
158
+ print("ページネーションが存在しません。全ての結果を取得しました。")
159
+ break
160
+
161
+ # 次ページボタンのリストを取得
162
+ pagination_buttons = driver.find_elements(By.CSS_SELECTOR, "ul.pagination li a")
163
+ next_button = None
164
+ for button in pagination_buttons:
165
+ if button.text.isdigit() and int(button.text) > int(current_page):
166
+ next_button = button
167
+ break
168
+
169
+ if next_button:
170
+ try:
171
+ driver.execute_script("arguments[0].scrollIntoView();", next_button) # ボタンを画面内にスクロール
172
+ WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.LINK_TEXT, next_button.text))).click()
173
+ except ElementClickInterceptedException:
174
+ print("次ページボタンがクリックできないため、JavaScriptでクリックします。")
175
+ driver.execute_script("arguments[0].click();", next_button)
176
+ WebDriverWait(driver, 10).until(EC.staleness_of(rows[0])) # ページが変わるまで待機
177
+ else:
178
+ print("次のページはありません。全ての結果を取得しました。")
179
+ break
180
+
181
+ finally:
182
+ # ブラウザを閉じる
183
+ driver.quit()
184
+
185
+ return all_results
186
+
187
+
188
+
189
+ def scrape_jrct_all_details(url):
190
+ """
191
+ 指定されたjRCT URLから必要なすべての情報を抽出します。
192
+ """
193
+
194
+ def normalize_text(text):
195
+ if not text:
196
+ return ""
197
+ # Unicode正規化 + 余分な空白除去
198
+ text = unicodedata.normalize('NFKC', text)
199
+ return " ".join(text.split())
200
+
201
+ # リクエストを送信
202
+ headers = {
203
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
204
+ }
205
+ try:
206
+ response = requests.get(url, headers=headers, timeout=10)
207
+ response.raise_for_status()
208
+ except requests.RequestException as e:
209
+ print(f"URLリクエストに失敗しました: {url} - エラー: {e}")
210
+ return {"URL": url, "エラー": "リクエスト失敗"}
211
+
212
+ soup = BeautifulSoup(response.text, 'html.parser')
213
+
214
+ data = {"URL": url}
215
+
216
+ def extract_label_data(label_text, label_en=None):
217
+ """
218
+ 特定のラベルに対応するデータを抽出するヘルパー関数
219
+
220
+ 複数の候補があった場合は、すべて取得してからフィルタする方式をとる。
221
+ """
222
+ results = []
223
+ # 日本語ラベルと英語ラベルが両方指定されていれば、両方含む行を優先的に探す
224
+ combined_search = None
225
+ if label_en:
226
+ combined_search = f"{label_text} / {label_en}"
227
+
228
+ # ページ内のすべての<label>を探索
229
+ for l in soup.find_all('label'):
230
+ lt = normalize_text(l.get_text())
231
+ # combined_searchが利用可能ならまず完全な結合形でマッチを試みる
232
+ # なければ従来通りlabel_textをinでマッチ
233
+ if combined_search:
234
+ if combined_search in lt:
235
+ th = l.find_parent('th')
236
+ if not th:
237
+ continue
238
+ tr = th.find_parent('tr')
239
+ if not tr:
240
+ continue
241
+ tds = tr.find_all('td')
242
+ if len(tds) >= 1:
243
+ jp_data = normalize_text(tds[0].get_text()) if len(tds) > 0 else None
244
+ en_data = normalize_text(tds[1].get_text()) if label_en and len(tds) > 1 else None
245
+ results.append((jp_data, en_data))
246
+ else:
247
+ # label_enが無い場合は、label_textだけで検索
248
+ if label_text in lt:
249
+ th = l.find_parent('th')
250
+ if not th:
251
+ continue
252
+ tr = th.find_parent('tr')
253
+ if not tr:
254
+ continue
255
+ tds = tr.find_all('td')
256
+ if len(tds) >= 1:
257
+ jp_data = normalize_text(tds[0].get_text()) if len(tds) > 0 else None
258
+ en_data = normalize_text(tds[1].get_text()) if label_en and len(tds) > 1 else None
259
+ results.append((jp_data, en_data))
260
+
261
+ # resultsに候補が格納されている
262
+ if not results:
263
+ return None, None
264
+
265
+ # 複数候補がある場合、特定キーワードによるフィルタリングが可能
266
+ # ここでは特定キーワードがなければそのまま最初のを返す
267
+ # もし特定の疾患キーワードでフィルタリングしたい場合はここで処理を追加
268
+
269
+ # ひとまず最初の候補を返す
270
+ return results[0]
271
+
272
+ # "研究・治験の目的" を抽出
273
+ data["研究・治験の目的"], _ = extract_label_data("研究・治験の目的")
274
+
275
+ # 試験デザイン情報(日本語と英語)を抽出
276
+ design_labels = [
277
+ ('試験等のフェーズ', 'Phase'),
278
+ ('試験の種類', 'Study Type'),
279
+ ('無作為化', 'allocation'),
280
+ ('盲検化', 'masking'),
281
+ ('対照', 'control'),
282
+ ('割付け', 'assignment'),
283
+ ('研究目的', 'purpose')
284
+ ]
285
+ for label_jp, label_en in design_labels:
286
+ jp, en = extract_label_data(label_jp, label_en)
287
+ data[label_jp] = jp
288
+ data[label_en] = en
289
+
290
+ # その他の情報を抽出
291
+ # 対象疾患名 / Health Condition(s) or Problem(s) Studiedを追加
292
+ details_labels = [
293
+ ('主たる選択基準', 'Inclusion Criteria'),
294
+ ('主たる除外基準', 'Exclusion Criteria'),
295
+ ('年齢下限', 'Age Minimum'),
296
+ ('年齢上限', 'Age Maximum'),
297
+ ('性別', 'Gender'),
298
+ ('中止基準', 'Discontinuation Criteria'),
299
+ ('対象疾患名', 'Health Condition(s) or Problem(s) Studied'), # 追加
300
+ ('対象疾患キーワード', 'Keyword'),
301
+ ('介入の内容', 'Intervention(s)')
302
+ ]
303
+ for label_jp, label_en in details_labels:
304
+ jp, en = extract_label_data(label_jp, label_en)
305
+ data[label_jp] = jp
306
+ data[label_en] = en
307
+
308
+ # "他の臨床研究登録機関への登録" を探索
309
+ other_registries_section = soup.find("div", id="area-toggle-07-02")
310
+ japic_no_list = []
311
+ nct_no_list = []
312
+
313
+ if other_registries_section:
314
+ rows = other_registries_section.find_all("tr")
315
+ for row in rows:
316
+ label = row.find("label")
317
+ if label and ("ID番号" in label.text or "研究番号" in label.text):
318
+ value_td = row.find("td")
319
+ if value_td:
320
+ id_number = value_td.text.strip()
321
+ if id_number.startswith("JapicCTI"):
322
+ japic_no_list.append(id_number)
323
+ elif id_number.startswith("NCT"):
324
+ nct_no_list.append(id_number)
325
+
326
+ # JapicCTI No と NCT No を格納(複数あればカンマ区切り)
327
+ data["JapicCTI No"] = ", ".join(japic_no_list) if japic_no_list else None
328
+ data["NCT No"] = ", ".join(nct_no_list) if nct_no_list else None
329
+
330
+ # サーバーへの負荷を避けるためのスリープ
331
+ time.sleep(1) # 必要に応じて調整
332
+
333
+ return data
334
+
335
+
336
+
337
+
338
+
339
+
340
+
341
+
342
+
343
+
344
+ def create_dataframe_from_urls(urls, delay=5):
345
+ """
346
+ URLのリストを受け取り、pandas DataFrameを作成します。
347
+ リクエスト間に待機時間を設定して403エラーを防ぎます。
348
+
349
+ Args:
350
+ urls (list): jRCTの詳細ページURLリスト。
351
+ delay (int): 各リクエスト間の待機時間(秒単位、デフォルトは5秒)。
352
+
353
+ Returns:
354
+ pd.DataFrame: 取得したデータのDataFrame。
355
+ """
356
+ all_data = []
357
+
358
+ for url in urls:
359
+ print(f"Processing URL: {url}")
360
+ try:
361
+ # 各URLのデータを取得
362
+ data = scrape_jrct_all_details(url)
363
+ all_data.append(data)
364
+
365
+ # 次のリクエストまで待機
366
+ print(f"Waiting for {delay} seconds before the next request...")
367
+ time.sleep(delay)
368
+ except Exception as e:
369
+ print(f"Failed to process URL {url}: {e}")
370
+ # URLとエラー情報を記録しておく(必要ならログに保存など)
371
+ all_data.append({"URL": url, "Error": str(e)})
372
+
373
+ # pandas DataFrameに変換
374
+ return pd.DataFrame(all_data)
375
+
376
+
377
+ def extract_jrct_links(results):
378
+ """
379
+ fetch_clinical_trialsの結果からjRCT-Noを抽出し、詳細リンクを作成する。
380
+
381
+ Args:
382
+ results (list): fetch_clinical_trialsから得られる結果リスト
383
+
384
+ Returns:
385
+ list: jRCTの詳細ページリンクリスト
386
+ """
387
+ base_url = "https://jrct.niph.go.jp/latest-detail/"
388
+ links = []
389
+ for result in results:
390
+ if len(result) > 0:
391
+ jrct_no = result[0] # jRCT-Noは結果リストの最初の要素
392
+ links.append(base_url + jrct_no)
393
+ return links
394
+
395
+ def reorder_columns(df):
396
+ """
397
+ DataFrame の列を日本語の列を前半に、英語の列を後半に並び替える。
398
+ """
399
+ # 日本語と英語の列を分ける
400
+ jp_columns = [col for col in df.columns if not all(ord(c) < 128 for c in col)] # 非 ASCII(日本語)文字列を含む列
401
+ en_columns = [col for col in df.columns if col not in jp_columns] # 残りの列を英語と仮定
402
+
403
+ # 日本語列 + 英語列の順序で整列
404
+ ordered_columns = jp_columns + en_columns
405
+
406
+ # 列を並び替えた DataFrame を返す
407
+ return df[ordered_columns]
408
+
409
+
410
+ # Target列を分割する関数
411
+ def split_target(target):
412
+ # 指定された区切り文字で分割
413
+ split_words = re.split(r'[,\n、・及びおよび又はまたは]+', target)
414
+ # 空白文字を除外してリストとして返す
415
+ return [word.strip() for word in split_words if word.strip()]
416
+
417
+
418
+ # Target列を分割する関数(改良後)
419
+ def split_target_English(target):
420
+ # 区切り文字を (,) or (\n) or (、) or (・) または文字列"or" として扱う
421
+ # 正規表現では、パイプ(|)でor条件を定義し、"(?: ... )"はグルーピングのみ行う非捕捉グループ
422
+ # [,\n、・] はいずれかの1文字とマッチ
423
+ # or は文字列全体とマッチ
424
+ # 複数連続した区切り文字をまとめて1回の分割として扱うために+(1回以上)とする
425
+ split_words = re.split(r'(?:[,\n、・]|or| and)+', target)
426
+
427
+ # 空白文字を除外してリストとして返す
428
+ return [word.strip() for word in split_words if word.strip()]
429
+
430
+ # 処理プログラム
431
+ def split_triple_negative_words(target_words):
432
+ updated_words = []
433
+ for word in target_words:
434
+ if 'triple negative' in word.lower():
435
+ # 'triple negative' の部分を追加
436
+ updated_words.append('Triple Negative') # 大文字で統一して追加
437
+ # 'triple negative' を除いた残りの部分を追加
438
+ remaining = word.lower().replace('triple negative', '').strip()
439
+ if remaining: # 残りの単語が存在する場合のみ追加
440
+ updated_words.append(remaining.title().strip()) # 単語の先頭を大文字化
441
+ else:
442
+ updated_words.append(word.strip().title()) # 単語の先頭を大文字化
443
+ return updated_words
444
+
445
+ class WordProcessor:
446
+ def __init__(self, target_words):
447
+ self.target_words = target_words
448
+
449
+ def process(self, target_words):
450
+ """
451
+ 入力された単語のリストを処理して、ターゲット単語に基づき分割します。
452
+ """
453
+ updated_words = []
454
+ for word in target_words:
455
+ word_lower = word.lower()
456
+ for target in self.target_words:
457
+ if target in word_lower:
458
+ # ターゲット単語を追加
459
+ updated_words.append(target.title())
460
+ # ターゲット単語を除いた残りを追加
461
+ remaining = word_lower.replace(target, '').strip()
462
+ if remaining:
463
+ updated_words.append(remaining.title())
464
+ break
465
+ else:
466
+ # ターゲット単語に該当しない場合
467
+ updated_words.append(word.strip().title())
468
+ return updated_words
469
+
470
+ def __call__(self, target_words):
471
+ """
472
+ インスタンスを関数として呼び出すためのエントリポイント。
473
+ """
474
+ return self.process(target_words)
475
+
476
+
477
+ import pandas as pd
478
+ from sentence_transformers import util
479
+ import torch
480
+
481
+ def DfPostProcess(exclusive_words, model, csv_loc=None, dataframe=None):
482
+ """
483
+ exclusive_words: 除外ワードリスト
484
+ model: SentenceTransformerなどのモデル
485
+ csv_loc: CSVファイルのパス(文字列)。dataframeが与えられない場合に使用。
486
+ dataframe: 既存のpandas.DataFrame。csv_locが与えられない場合に使用。
487
+ """
488
+ # csv_locもdataframeも与えられなかった場合はエラー
489
+ if csv_loc is None and dataframe is None:
490
+ raise ValueError("Either csv_loc or dataframe must be provided.")
491
+
492
+ # 入力データフレームの決定
493
+ if dataframe is not None:
494
+ basedf = dataframe.copy()
495
+ else:
496
+ basedf = pd.read_csv(csv_loc, index_col=0)
497
+
498
+ # '試験等のフェーズ'がNaNの行を削除
499
+ basedf = basedf.dropna(subset=['試験等のフェーズ'])
500
+
501
+ # WordProcessorインスタンス作成
502
+ processor = WordProcessor(exclusive_words)
503
+
504
+ # TargetEnglish列をsplit_target_Englishで処理しTargetWord列作成
505
+ basedf['TargetWord'] = basedf['TargetEnglish'].apply(split_target_English)
506
+
507
+ # NaNやNoneではない場合にprocessor適用
508
+ basedf['TargetWord'] = basedf['TargetWord'].apply(lambda x: processor(x) if isinstance(x, list) else x)
509
+
510
+ # TargetWord列をベクトル化し、リスト化して格納
511
+ target_vecs_list = []
512
+ for idx, target_words in enumerate(basedf['TargetWord']):
513
+ target_vecs = model.encode(target_words, convert_to_tensor=True).cpu()
514
+ # テンソルをリストに変換
515
+ target_vecs_list.append(target_vecs.tolist())
516
+
517
+ # TargetVec列にリストを格納 (dtype=objectのままでOK)
518
+ basedf['TargetVec'] = pd.Series(target_vecs_list, index=basedf.index, dtype=object)
519
+
520
+ return basedf
521
+
522
+
523
+
524
+ def get_matched_df(basedf, query, model, threshold=0.5):
525
+ # queryをベクトル化(テンソル化)しCPUへ移動
526
+ query_vec = model.encode(query, convert_to_tensor=True).cpu()
527
+
528
+ matched_indices = []
529
+ for idx, target_vec_str in enumerate(basedf['TargetVec']):
530
+ # CSVから読み込んだ時点でTargetVecはPythonリストを文字列化したものになっているため、
531
+ # ここでliteral_evalでリストに戻します。
532
+ if isinstance(target_vec_str, str):
533
+ # target_vec_strは"[[...], [...]]"のようなリスト形式
534
+ target_list = ast.literal_eval(target_vec_str) # リストに変換
535
+ target_vecs = torch.tensor(target_list) # リストからTensorへ
536
+ else:
537
+ # 万が一既にTensorの場合はそのまま使用
538
+ target_vecs = target_vec_str
539
+
540
+ # 必要であればCPUへ移動(通常はすでにCPU上のはず)
541
+ """if target_vecs[0].is_cuda:
542
+ target_vecs = target_vecs.cpu()"""
543
+
544
+ # コサイン類似度を計算
545
+ cosine_scores = util.cos_sim(query_vec, target_vecs).squeeze()
546
+
547
+ # thresholdを超えるスコアが1つでもあればマッチと判断
548
+ if (cosine_scores >= threshold).any():
549
+ matched_indices.append(idx)
550
+
551
+ # 条件を満たした行を抽出
552
+ matched_df = basedf.iloc[matched_indices]
553
+ return matched_df
554
+
555
+
556
+ def GetJRCTCriteria(dataframe, idx):
557
+ InC = dataframe.iloc[idx,:]['Inclusion Criteria']
558
+ ExC = dataframe.iloc[idx,:]['Exclusion Criteria']
559
+ return "Inclusion Criteria :" + InC + "\n" + "Exclusion Criteria :" + ExC
OpenAITools/ReviewPaperTools.py ADDED
@@ -0,0 +1,42 @@
1
+ import re
2
+ import pandas as pd
3
+
4
+ def parse_text_file(text):
5
+ # セクションを分割するための正規表現パターンを定義
6
+ # \d+ は1つ以上の数字にマッチします
7
+ pattern = re.compile(r'\n\n\n\d+\.')
8
+
9
+ # テキストをセクションごとに分割
10
+ sections = pattern.split(text)[1:] # 最初の空のセクションを除外
11
+
12
+ # 各セクションの前後の空白を削除
13
+ sections = [section.strip() for section in sections]
14
+
15
+ return sections
16
+
17
+ def split_sections(text):
18
+ contents = text.split('\n\n')
19
+ contents = [section.strip() for section in contents if section.strip()]
20
+ if len(contents) == 8 :
21
+ keys = ['PublishInfo', 'Title', 'AuthorName', 'AuthorInfo', 'Abstract', 'Copyrights', 'DOI', 'COI']
22
+ elif len(contents) == 7 :
23
+ keys = ['PublishInfo', 'Title', 'AuthorName', 'AuthorInfo', 'Abstract', 'Copyrights', 'DOI']
24
+ elif len(contents) == 6:
25
+ keys = ['PublishInfo', 'Title', 'AuthorName', 'AuthorInfo', 'Abstract', 'DOI']
26
+ elif len(contents) == 5:
27
+ keys = ['PublishInfo', 'Title', 'AuthorName', 'Abstract', 'DOI']
28
+
29
+ # 辞書を作成し、キーが存在しない場合は空の文字列を設定
30
+ section_dict = {key: contents[i] if i < len(contents) else "" for i, key in enumerate(keys)}
31
+ return section_dict
32
+
33
+
34
+ def GetSummaryDf(textdir):
35
+ with open(textdir, 'r', encoding='utf-8') as f:
36
+ content = f.read()
37
+ sections = parse_text_file(content)
38
+ dicts = []
39
+ for section in sections:
40
+ splited_dic = split_sections(section)
41
+ dicts.append(splited_dic)
42
+ return pd.DataFrame(dicts)
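
A minimal usage sketch; the file name is a placeholder for a PubMed text export whose records are split into the 5 to 8 blank-line-delimited blocks that parse_text_file and split_sections expect.

from OpenAITools.ReviewPaperTools import GetSummaryDf

summary_df = GetSummaryDf('pubmed_abstracts.txt')  # hypothetical export file
print(summary_df[['Title', 'DOI']].head())
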
OpenAITools/scrapeThisData.py ADDED
@@ -0,0 +1,237 @@
1
+ from selenium import webdriver
2
+ from selenium.webdriver.support.ui import Select
3
+ from selenium.webdriver.common.by import By
4
+
5
+ import requests
6
+ from bs4 import BeautifulSoup
7
+ import re
8
+
9
+ import os
10
+ import time
11
+
12
+ from selenium.webdriver.support.ui import WebDriverWait
13
+ from selenium.webdriver.common.by import By
14
+ from selenium.webdriver.support import expected_conditions as EC
15
+ from selenium.webdriver.common.action_chains import ActionChains
16
+ import chromedriver_autoinstaller
17
+
18
+ class ScrapeThatData:
19
+
20
+ def __init__(self, time_threshold = 10):
21
+
22
+ try:
23
+ chrome_options = webdriver.ChromeOptions()
24
+ chrome_options.add_argument('--no-sandbox')
25
+ self.driver = webdriver.Chrome(options=chrome_options)
26
+
27
+ except:
28
+ chromedriver_autoinstaller.install()
29
+ chrome_options = webdriver.ChromeOptions()
30
+ chrome_options.add_argument('--no-sandbox')
31
+ self.driver = webdriver.Chrome(options=chrome_options)
32
+
33
+
34
+
35
+ self.wait = WebDriverWait(self.driver,time_threshold)
36
+ self.attribute_dict = {'status':1 ,'conditions':2, 'interventions': 3, 'study type':4,
37
+ 'phase':5, 'sponsor':6, 'funder type':7 , 'study design': 8,
38
+ 'outcome measures':9, 'number enrolled':10, 'sex':11, 'age':12,
39
+ 'nct number': 13, 'other ids':14, 'title acronym': 15 , 'study start': 16,
40
+ 'primary completion': 17, 'study completion': 18 , 'first posted': 19,
41
+ 'last update posted': 20 , 'results first posted': 21 , 'locations':22, 'study documents': 23}
42
+
43
+ self.status_dict = {'not yet recruiting' : 'notYetRecrCB',
44
+ 'recruiting' : 'recruitingCB',
45
+ 'enrolling by invitation':'enrollingByInvCB',
46
+ 'active, not recruiting': 'activeCB',
47
+ 'suspended': 'suspendedCB',
48
+ 'terminated':'terminatedCB',
49
+ 'completed':'completedCB',
50
+ 'withdrawn': 'withdrawnCB',
51
+ 'unknown status': 'unknownCB'}
52
+
53
+ def clicking_show_hide_cols(self, driver):
54
+ columns = driver.find_element(By.XPATH,'//*[@id="theDataTable_wrapper"]/div[3]/button')
55
+ action_chain = ActionChains(driver)
56
+ action_chain.move_to_element(columns).click()
57
+ action_chain.perform()
58
+
59
+ def select_attributes_to_show(self, listed_attributes, attribute_dict):
60
+ ll = [value.lower() for value in listed_attributes if value.lower() in ['status', 'conditions', 'interventions', 'locations']]
61
+ if ll:
62
+ to_show = [value.lower() for value in listed_attributes if value.lower() not in ll]
63
+ to_hide = [value for value in ['status', 'conditions', 'interventions', 'locations'] if value not in ll]
64
+ to_click = to_hide + to_show
65
+ for att in to_click:
66
+ self.clicking_show_hide_cols(self.driver)
67
+ time.sleep(1)
68
+ self.wait.until(EC.presence_of_element_located((By.XPATH,'//*[@id="theDataTable_wrapper"]/div[3]/div[2]/button['+ str(attribute_dict[att]) + ']'))).click()
69
+ time.sleep(1)
70
+ else:
71
+ for att in listed_attributes:
72
+ self.clicking_show_hide_cols(self.driver)
73
+ time.sleep(1)
74
+ self.wait.until(EC.presence_of_element_located((By.XPATH,'//*[@id="theDataTable_wrapper"]/div[3]/div[2]/button['+ str(attribute_dict[att.lower()]) + ']'))).click()
75
+ time.sleep(1)
76
+
77
+ def select_by_status(self, listed_states, status_dict):
78
+ if listed_states:
79
+ for status in listed_states:
80
+ self.driver.find_element(By.ID,status_dict[status.lower()]).click()
81
+
82
+ self.driver.find_element(By.XPATH,'//*[@id="FiltersBody"]/div[1]/input[1]').click()
83
+ time.sleep(3)
84
+
85
+
86
+ select = Select(self.driver.find_element(By.NAME, 'theDataTable_length'))
87
+ select.select_by_value('100')
88
+
89
+ def collect_data_search_page(self,l_ordered, amount_of_data = None):
90
+
91
+ class_name = ''
92
+ page_index = 1
93
+
94
+ elements = [l_ordered]
95
+
96
+ while 'disabled' not in class_name :
97
+
98
+
99
+
100
+ time.sleep(10)
101
+
102
+ print('Getting data from page {}'.format(page_index))
103
+
104
+ #Counting how many rows of the table appear
105
+ table = self.driver.find_element(By.ID,'theDataTable')
106
+ row_count = len(table.find_elements(By.TAG_NAME,"tr"))
107
+
108
+ #Looping table page
109
+ for index in range(1, row_count):
110
+ row = []
111
+ if 'status' in l_ordered:
112
+ self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,'#theDataTable > tbody > tr:nth-child('+str(index)+') > td:nth-child(3)')))
113
+ status_element = self.driver.find_element(By.CSS_SELECTOR,'#theDataTable > tbody > tr:nth-child('+str(index)+') > td:nth-child(3) > span')
114
+ row.append(status_element.text.strip())
115
+ for i, val in enumerate(l_ordered):
116
+ if val == 'status':
117
+ continue
118
+
119
+ self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,'#theDataTable > tbody > tr:nth-child('+str(index)+') > td:nth-child('+str(4+i)+')')))
120
+ element = self.driver.find_element(By.CSS_SELECTOR,'#theDataTable > tbody > tr:nth-child('+str(index)+') > td:nth-child('+str(4+i)+')')
121
+ try:
122
+ row.append(element.text.strip())
123
+ except:
124
+ print(i, element)
125
+ else:
126
+ for i, val in enumerate(l_ordered):
127
+ self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,'#theDataTable > tbody > tr:nth-child('+str(index)+') > td:nth-child('+str(3+i)+')')))
128
+ element = self.driver.find_element(By.CSS_SELECTOR,'#theDataTable > tbody > tr:nth-child('+str(index)+') > td:nth-child('+str(3+i)+')')
129
+ try:
130
+ row.append(element.text.strip())
131
+ except:
132
+ print(i, element)
133
+ elements.append(row)
134
+
135
+
136
+
137
+
138
+ #Getting next page button
139
+ next_page= self.driver.find_element(By.ID,"theDataTable_next")
140
+
141
+ #Getting the class attribute of the next page button
142
+ class_name = next_page.get_attribute('class')
143
+
144
+ #Going to the next page
145
+ next_page.click()
146
+ page_index += 1
147
+
148
+ if amount_of_data:
149
+ if len(elements) >= amount_of_data or row_count < amount_of_data :
150
+ break
151
+ else:
152
+ continue
153
+
154
+ return elements
155
+
156
+ def get_criteria(self, NCTnumber):
157
+
158
+ url = 'https://clinicaltrials.gov/ct2/show/' + NCTnumber
159
+ ClinicalTrialpage = requests.get(url)
160
+ soup = BeautifulSoup(ClinicalTrialpage.text, 'html.parser')
161
+
162
+ wrapping_crit_class = soup.find_all("div", {"class": "tr-indent2"})
163
+ list_elements = wrapping_crit_class[1].find_all(re.compile("(ul|ol)"))
164
+ inclusion, exclusion = ('','')
165
+
166
+
167
+ if not list_elements:
168
+ print ("WARNING: Study number " + NCTnumber + " doesn't have eligibility criteria or HTML tag format is not a list")
169
+ else:
170
+
171
+ if len(list_elements) == 1:
172
+ try:
173
+ if wrapping_crit_class[1].find(text = 'Inclusion Criteria:'):
174
+ inclusion = list_elements[0].find_all("li")
175
+
176
+ elif wrapping_crit_class[1].find(text = 'Exclusion Criteria:'):
177
+ exclusion = list_elements[0].find_all("li")
178
+ except:
179
+ print('criteria does not exist')
180
+ else:
181
+ inclusion = list_elements[0].find_all("li")
182
+ exclusion = list_elements[1].find_all("li")
183
+
184
+
185
+ inclusion = ' '.join([t.text.strip() for t in inclusion ])
186
+ exclusion = ' '.join([t.text.strip() for t in exclusion ])
187
+
188
+ return(inclusion, exclusion)
189
+
190
+ #function that gets number of patients enrolled in a study
191
+ def get_enrollment (self, NCTnumber):
192
+ url = 'https://clinicaltrials.gov/ct2/show/' + NCTnumber
193
+ ClinicalTrialpage = requests.get(url)
194
+ soup = BeautifulSoup(ClinicalTrialpage.text, 'html.parser')
195
+ enrollment = ''
196
+ wrapping_enrol_class = soup.find_all('td', {'headers':'studyInfoColData','style':"padding-left:1em"})
197
+ if not wrapping_enrol_class:
198
+ print('WARNING: Number of Participants in Study number '+ NCTnumber +' is unavailable')
199
+ else:
200
+ enrollment = wrapping_enrol_class[1]
201
+ enrollment = enrollment.text.split()[0]
202
+ if not enrollment.isdigit():
203
+ print ('WARNING: Number of Participants in Study number '+ NCTnumber +' is unavailable')
204
+ else:
205
+ return(enrollment)
206
+
207
+
208
+
209
+ def __call__(self, condition, listed_attributes, listed_states, amount_of_data):
210
+
211
+ self.driver.get('https://clinicaltrials.gov/ct2/results?cond=' + condition + '&rank=1&view=record#rowId0')
212
+ self.select_attributes_to_show(listed_attributes, self.attribute_dict)
213
+
214
+ try:
215
+ self.select_by_status(listed_states, self.status_dict)
216
+ except:
217
+ print('select by status is a problem')
218
+ n = []
219
+ for i in listed_attributes:
220
+ n.append(self.attribute_dict[i.lower()])
221
+ attribute_ordered = [list(self.attribute_dict.keys())[list(self.attribute_dict.values()).index(i)]for i in sorted(n)]
222
+
223
+ search_data = self.collect_data_search_page(attribute_ordered, amount_of_data=amount_of_data)
224
+ nct_numbers = [e[search_data[0].index('nct number')] for e in search_data[1:]]
225
+ search_data[0].extend(['inclusion', 'exclusion', 'enrollment'])
226
+ for index, nct in enumerate(nct_numbers):
227
+ if index % 100 == 0 and index!= 0:
228
+ print("Collected Data from {} Studies: ".format(index))
229
+
230
+ inc, exc = self.get_criteria(nct)
231
+ enrol = self.get_enrollment(nct)
232
+ search_data[index + 1].extend([inc, exc, enrol])
233
+ return search_data
234
+ # except:
235
+ # print('no data available with the specified status')
236
+
237
+
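
A usage sketch for the scraper class above, assuming the legacy ct2 result pages it targets are still served. Attribute and status names must match the keys of attribute_dict and status_dict (matching is case-insensitive); the condition and row limit are illustrative.

from OpenAITools.scrapeThisData import ScrapeThatData

scraper = ScrapeThatData(time_threshold=10)
rows = scraper('glioma',
               listed_attributes=['NCT Number', 'Conditions', 'Status'],
               listed_states=['Recruiting'],
               amount_of_data=50)
# rows[0] is the header row; each later entry also carries inclusion, exclusion and enrollment
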
PATHtoOriginaltool.ipynb ADDED
@@ -0,0 +1,262 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "data": {
10
+ "text/plain": [
11
+ "['/Users/satoc/miniforge3/envs/gradio/lib/python312.zip',\n",
12
+ " '/Users/satoc/miniforge3/envs/gradio/lib/python3.12',\n",
13
+ " '/Users/satoc/miniforge3/envs/gradio/lib/python3.12/lib-dynload',\n",
14
+ " '',\n",
15
+ " '/Users/satoc/.local/lib/python3.12/site-packages',\n",
16
+ " '/Users/satoc/miniforge3/envs/gradio/lib/python3.12/site-packages',\n",
17
+ " '/Users/satoc/Dropbox/programing/python/ClinicalTrialV3']"
18
+ ]
19
+ },
20
+ "execution_count": 1,
21
+ "metadata": {},
22
+ "output_type": "execute_result"
23
+ }
24
+ ],
25
+ "source": [
26
+ "import sys\n",
27
+ "import os \n",
28
+ "paths =sys.path\n",
29
+ "paths"
30
+ ]
31
+ },
32
+ {
33
+ "cell_type": "code",
34
+ "execution_count": 2,
35
+ "metadata": {},
36
+ "outputs": [
37
+ {
38
+ "data": {
39
+ "text/plain": [
40
+ "'/Users/satoc/Dropbox/programing/python/ClinicalTrialV2'"
41
+ ]
42
+ },
43
+ "execution_count": 2,
44
+ "metadata": {},
45
+ "output_type": "execute_result"
46
+ }
47
+ ],
48
+ "source": [
49
+ "paths[-1]"
50
+ ]
51
+ },
52
+ {
53
+ "cell_type": "code",
54
+ "execution_count": 3,
55
+ "metadata": {},
56
+ "outputs": [
57
+ {
58
+ "data": {
59
+ "text/plain": [
60
+ "'/Users/satoc/Dropbox/programing/python/ClinicalTrialV2/original_tools.pth'"
61
+ ]
62
+ },
63
+ "execution_count": 3,
64
+ "metadata": {},
65
+ "output_type": "execute_result"
66
+ }
67
+ ],
68
+ "source": [
69
+ "fileName = paths[-1] + '/' +'original_tools.pth'\n",
70
+ "#fileName = \"/Users/satoc/Dropbox/programing/python/ClinicalTrialV2\" + '/' +'original_tools.pth'\n",
71
+ "fileName"
72
+ ]
73
+ },
74
+ {
75
+ "cell_type": "markdown",
76
+ "metadata": {},
77
+ "source": [
78
+ "name = ! pwd"
79
+ ]
80
+ },
81
+ {
82
+ "cell_type": "code",
83
+ "execution_count": 4,
84
+ "metadata": {},
85
+ "outputs": [],
86
+ "source": [
87
+ "name = os.getcwd()"
88
+ ]
89
+ },
90
+ {
91
+ "cell_type": "code",
92
+ "execution_count": 5,
93
+ "metadata": {},
94
+ "outputs": [
95
+ {
96
+ "data": {
97
+ "text/plain": [
98
+ "'/Users/satoc/Dropbox/programing/python/ClinicalTrialV3'"
99
+ ]
100
+ },
101
+ "execution_count": 5,
102
+ "metadata": {},
103
+ "output_type": "execute_result"
104
+ }
105
+ ],
106
+ "source": [
107
+ "name"
108
+ ]
109
+ },
110
+ {
111
+ "cell_type": "code",
112
+ "execution_count": 6,
113
+ "metadata": {},
114
+ "outputs": [],
115
+ "source": [
116
+ "with open (fileName, mode='w') as f:\n",
117
+ " f.write(name)"
118
+ ]
119
+ },
120
+ {
121
+ "cell_type": "code",
122
+ "execution_count": null,
123
+ "metadata": {},
124
+ "outputs": [],
125
+ "source": []
126
+ },
127
+ {
128
+ "cell_type": "code",
129
+ "execution_count": 7,
130
+ "metadata": {},
131
+ "outputs": [
132
+ {
133
+ "name": "stdout",
134
+ "output_type": "stream",
135
+ "text": [
136
+ "['/Users/satoc/miniforge3/envs/gradio/lib/python312.zip', '/Users/satoc/miniforge3/envs/gradio/lib/python3.12', '/Users/satoc/miniforge3/envs/gradio/lib/python3.12/lib-dynload', '', '/Users/satoc/.local/lib/python3.12/site-packages', '/Users/satoc/miniforge3/envs/gradio/lib/python3.12/site-packages', '/Users/satoc/Dropbox/programing/python/ClinicalTrialV2']\n",
137
+ "/Users/satoc/Dropbox/programing/python/ClinicalTrialV2/original_tools.pth\n",
138
+ "/Users/satoc/Dropbox/programing/python/ClinicalTrialV3\n"
139
+ ]
140
+ }
141
+ ],
142
+ "source": [
143
+ "import sys\n",
144
+ "import os \n",
145
+ "paths =sys.path\n",
146
+ "print(paths)\n",
147
+ "fileName = paths[-1] + '/' +'original_tools.pth'\n",
148
+ "print(fileName)\n",
149
+ "cwd = os.getcwd()\n",
150
+ "print(cwd)\n",
151
+ "with open (fileName, mode='w') as f:\n",
152
+ " f.write(cwd)"
153
+ ]
154
+ },
155
+ {
156
+ "cell_type": "code",
157
+ "execution_count": 8,
158
+ "metadata": {},
159
+ "outputs": [],
160
+ "source": [
161
+ "cwd = os.getcwd()"
162
+ ]
163
+ },
164
+ {
165
+ "cell_type": "code",
166
+ "execution_count": 9,
167
+ "metadata": {},
168
+ "outputs": [
169
+ {
170
+ "ename": "ModuleNotFoundError",
171
+ "evalue": "No module named 'pytrials'",
172
+ "output_type": "error",
173
+ "traceback": [
174
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
175
+ "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
176
+ "Cell \u001b[0;32mIn[9], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mpytrials\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mclient\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m ClinicalTrials \n",
177
+ "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'pytrials'"
178
+ ]
179
+ }
180
+ ],
181
+ "source": [
182
+ "from pytrials.client import ClinicalTrials "
183
+ ]
184
+ },
185
+ {
186
+ "cell_type": "code",
187
+ "execution_count": null,
188
+ "metadata": {},
189
+ "outputs": [],
190
+ "source": []
191
+ },
192
+ {
193
+ "cell_type": "code",
194
+ "execution_count": 10,
195
+ "metadata": {},
196
+ "outputs": [
197
+ {
198
+ "name": "stdout",
199
+ "output_type": "stream",
200
+ "text": [
201
+ ".pthファイルを作成するパス: /Users/satoc/miniforge3/envs/gradio/lib/python3.12/site-packages/original_tools.pth\n",
202
+ "現在の作業ディレクトリ: /Users/satoc/Dropbox/programing/python/ClinicalTrialV3\n",
203
+ "モジュール検索パス:\n",
204
+ "['/Users/satoc/miniforge3/envs/gradio/lib/python312.zip', '/Users/satoc/miniforge3/envs/gradio/lib/python3.12', '/Users/satoc/miniforge3/envs/gradio/lib/python3.12/lib-dynload', '', '/Users/satoc/.local/lib/python3.12/site-packages', '/Users/satoc/miniforge3/envs/gradio/lib/python3.12/site-packages', '/Users/satoc/Dropbox/programing/python/ClinicalTrialV2']\n"
205
+ ]
206
+ }
207
+ ],
208
+ "source": [
209
+ "import sys\n",
210
+ "import os\n",
211
+ "import site\n",
212
+ "\n",
213
+ "# site-packagesのディレクトリを取得\n",
214
+ "site_packages_path = site.getsitepackages()[0] # 複数ある場合、通常は最初のものを使用\n",
215
+ "\n",
216
+ "# .pthファイルのパスをsite-packagesに設定\n",
217
+ "fileName = os.path.join(site_packages_path, 'original_tools.pth')\n",
218
+ "print(f\".pthファイルを作成するパス: {fileName}\")\n",
219
+ "\n",
220
+ "# 現在の作業ディレクトリを取得\n",
221
+ "cwd = os.getcwd()\n",
222
+ "print(f\"現在の作業ディレクトリ: {cwd}\")\n",
223
+ "\n",
224
+ "# .pthファイルを作成し、現在の作業ディレクトリのパスを書き込む\n",
225
+ "with open(fileName, mode='w') as f:\n",
226
+ " f.write(cwd)\n",
227
+ "\n",
228
+ "# 確認のためにsys.pathを再表示\n",
229
+ "print(\"モジュール検索パス:\")\n",
230
+ "print(sys.path)\n"
231
+ ]
232
+ },
233
+ {
234
+ "cell_type": "code",
235
+ "execution_count": null,
236
+ "metadata": {},
237
+ "outputs": [],
238
+ "source": []
239
+ }
240
+ ],
241
+ "metadata": {
242
+ "kernelspec": {
243
+ "display_name": "Python 3 (ipykernel)",
244
+ "language": "python",
245
+ "name": "python3"
246
+ },
247
+ "language_info": {
248
+ "codemirror_mode": {
249
+ "name": "ipython",
250
+ "version": 3
251
+ },
252
+ "file_extension": ".py",
253
+ "mimetype": "text/x-python",
254
+ "name": "python",
255
+ "nbconvert_exporter": "python",
256
+ "pygments_lexer": "ipython3",
257
+ "version": "3.12.3"
258
+ }
259
+ },
260
+ "nbformat": 4,
261
+ "nbformat_minor": 4
262
+ }
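
Condensed sketch of what the notebook above automates: writing a .pth file into site-packages so that the OpenAITools package in this repository is importable from any working directory. The file name original_tools.pth follows the notebook; run this from the repository root.

import os
import site

pth_path = os.path.join(site.getsitepackages()[0], 'original_tools.pth')
with open(pth_path, 'w') as f:
    f.write(os.getcwd())  # the directory written here is appended to sys.path at startup
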
README.md CHANGED
@@ -1,10 +1,10 @@
1
  ---
2
- title: ClinicalTrialV3
3
- emoji: 👁
4
- colorFrom: indigo
5
- colorTo: red
6
  sdk: gradio
7
- sdk_version: 5.9.1
8
  app_file: app.py
9
  pinned: false
10
  ---
 
1
  ---
2
+ title: ClinicalTrialV2
3
+ emoji: 💻
4
+ colorFrom: yellow
5
+ colorTo: yellow
6
  sdk: gradio
7
+ sdk_version: 5.4.0
8
  app_file: app.py
9
  pinned: false
10
  ---
app.py ADDED
@@ -0,0 +1,109 @@
1
+ import gradio as gr
2
+ import pandas as pd
3
+ from OpenAITools.FetchTools import fetch_clinical_trials
4
+ from langchain_openai import ChatOpenAI
5
+ from langchain_groq import ChatGroq
6
+ from OpenAITools.CrinicalTrialTools import SimpleClinicalTrialAgent, GraderAgent, LLMTranslator, generate_ex_question_English
7
+ from OpenAITools.JRCTTools import get_matched_df,GetJRCTCriteria
8
+ from sentence_transformers import SentenceTransformer
9
+ from sentence_transformers import util
10
+
11
+ # モデルとエージェントの初期化
12
+ groq = ChatGroq(model_name="llama3-70b-8192", temperature=0)
13
+ translator = LLMTranslator(groq)
14
+ CriteriaCheckAgent = SimpleClinicalTrialAgent(groq)
15
+ grader_agent = GraderAgent(groq)
16
+ selectionModel = SentenceTransformer('pritamdeka/S-PubMedBert-MS-MARCO')
17
+
18
+ # データフレームを生成する関数
19
+ def generate_dataframe(age, sex, tumor_type, GeneMutation, Meseable, Biopsiable):
20
+ # 日本語の腫瘍タイプを英語に翻訳
21
+ TumorName = translator.translate(tumor_type)
22
+
23
+ # 質問文を生成
24
+ ex_question = generate_ex_question_English(age, sex, TumorName, GeneMutation, Meseable, Biopsiable)
25
+
26
+ # 臨床試験データの取得
27
+ basedf = pd.read_csv("../ClinicalTrialCSV/JRCT20241215CancerPost.csv", index_col=0)
28
+ df = get_matched_df(basedf=basedf, query=TumorName, model=selectionModel, threshold=0.925)
29
+ df['AgentJudgment'] = None
30
+ df['AgentGrade'] = None
31
+
32
+ # 臨床試験の適格性の評価
33
+ progress = gr.Progress(track_tqdm=True)
34
+ for i in range(len(df)):
35
+ TargetCriteria = GetJRCTCriteria(df, i)
36
+ AgentJudgment = CriteriaCheckAgent.evaluate_eligibility(TargetCriteria, ex_question)
37
+ AgentGrade = grader_agent.evaluate_eligibility(AgentJudgment)
38
+ # df.locを使って値を代入(行・列名で指定)
39
+ df.loc[df.index[i], 'AgentJudgment'] = AgentJudgment
40
+ df.loc[df.index[i], 'AgentGrade'] = AgentGrade
41
+ progress((i + 1) / len(df))
42
+
43
+ # 列を指定した順に並び替え
44
+ columns_order = ['JRCT ID', 'Title', '研究・治験の目的','AgentJudgment', 'AgentGrade','主たる選択基準', '主たる除外基準','Inclusion Criteria','Exclusion Criteria','NCT No', 'JapicCTI No']
45
+ df = df[columns_order]
46
+
47
+ return df, df # フィルタ用と表示用にデータフレームを返す
48
+
49
+ # 特定のAgentGrade(yes, no, unclear)に基づいて行をフィルタリングする関数
50
+ def filter_rows_by_grade(original_df, grade):
51
+ df_filtered = original_df[original_df['AgentGrade'] == grade]
52
+ return df_filtered, df_filtered
53
+
54
+ # CSVとして保存しダウンロードする関数
55
+ def download_filtered_csv(df):
56
+ file_path = "filtered_data.csv"
57
+ df.to_csv(file_path, index=False)
58
+ return file_path
59
+
60
+ # 全体結果をCSVとして保存しダウンロードする関数
61
+ def download_full_csv(df):
62
+ file_path = "full_data.csv"
63
+ df.to_csv(file_path, index=False)
64
+ return file_path
65
+
66
+ # Gradioインターフェースの作成
67
+ with gr.Blocks() as demo:
68
+ gr.Markdown("## 臨床試験適格性評価インターフェース")
69
+
70
+ # 各種入力フィールド
71
+ age_input = gr.Textbox(label="Age", placeholder="例: 65")
72
+ sex_input = gr.Dropdown(choices=["男性", "女性"], label="Sex")
73
+ tumor_type_input = gr.Textbox(label="Tumor Type", placeholder="例: gastric cancer, 日本でも良いですが英語の方が精度が高いです。")
74
+ gene_mutation_input = gr.Textbox(label="Gene Mutation", placeholder="例: HER2")
75
+ measurable_input = gr.Dropdown(choices=["有り", "無し", "不明"], label="Measurable Tumor")
76
+ biopsiable_input = gr.Dropdown(choices=["有り", "無し", "不明"], label="Biopsiable Tumor")
77
+
78
+ # データフレーム表示エリア
79
+ dataframe_output = gr.DataFrame()
80
+ original_df = gr.State()
81
+ filtered_df = gr.State()
82
+
83
+ # データフレーム生成ボタン
84
+ generate_button = gr.Button("Generate Clinical Trials Data")
85
+
86
+ # フィルタリングボタン
87
+ yes_button = gr.Button("Show Eligible Trials")
88
+ no_button = gr.Button("Show Ineligible Trials")
89
+ unclear_button = gr.Button("Show Unclear Trials")
90
+
91
+ # ダウンロードボタン
92
+ download_filtered_button = gr.Button("Download Filtered Data")
93
+ download_filtered_output = gr.File(label="Download Filtered Data")
94
+
95
+ download_full_button = gr.Button("Download Full Data")
96
+ download_full_output = gr.File(label="Download Full Data")
97
+
98
+
99
+ # ボタン動作の設定
100
+ generate_button.click(fn=generate_dataframe, inputs=[age_input, sex_input, tumor_type_input, gene_mutation_input, measurable_input, biopsiable_input], outputs=[dataframe_output, original_df])
101
+ yes_button.click(fn=filter_rows_by_grade, inputs=[original_df, gr.State("yes")], outputs=[dataframe_output, filtered_df])
102
+ no_button.click(fn=filter_rows_by_grade, inputs=[original_df, gr.State("no")], outputs=[dataframe_output, filtered_df])
103
+ unclear_button.click(fn=filter_rows_by_grade, inputs=[original_df, gr.State("unclear")], outputs=[dataframe_output, filtered_df])
104
+ download_filtered_button.click(fn=download_filtered_csv, inputs=filtered_df, outputs=download_filtered_output)
105
+ download_full_button.click(fn=download_full_csv, inputs=original_df, outputs=download_full_output)
106
+
107
+
108
+ if __name__ == "__main__":
109
+ demo.launch()
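
To run the interface above locally, the Groq-backed agents need an API key in the environment (ChatGroq reads GROQ_API_KEY) and the jRCT CSV must exist at the path read in generate_dataframe. The key value below is a placeholder, not part of this commit.

import os
os.environ.setdefault("GROQ_API_KEY", "<your-groq-api-key>")  # placeholder value
# then launch from the repository root:
#   python app.py
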
environment.yml ADDED
@@ -0,0 +1,487 @@
1
+ name: gradio
2
+ channels:
3
+ - conda-forge
4
+ dependencies:
5
+ - aiofiles=23.2.1
6
+ - altair=5.3.0
7
+ - annotated-types=0.7.0
8
+ - anyio=4.3.0
9
+ - aom=3.9.0
10
+ - appnope=0.1.4
11
+ - argon2-cffi=23.1.0
12
+ - argon2-cffi-bindings=21.2.0
13
+ - arrow=1.3.0
14
+ - asttokens=2.4.1
15
+ - async-lru=2.0.4
16
+ - attrs=23.2.0
17
+ - babel=2.14.0
18
+ - beautifulsoup4=4.12.3
19
+ - bleach=6.1.0
20
+ - blinker=1.8.2
21
+ - brotli=1.1.0
22
+ - brotli-bin=1.1.0
23
+ - brotli-python=1.1.0
24
+ - bzip2=1.0.8
25
+ - ca-certificates=2024.8.30
26
+ - cached-property=1.5.2
27
+ - cached_property=1.5.2
28
+ - cairo=1.18.0
29
+ - certifi=2024.8.30
30
+ - cffi=1.16.0
31
+ - charset-normalizer=3.3.2
32
+ - click=8.1.7
33
+ - colorama=0.4.6
34
+ - comm=0.2.2
35
+ - contourpy=1.2.1
36
+ - cycler=0.12.1
37
+ - dav1d=1.2.1
38
+ - debugpy=1.8.1
39
+ - decorator=5.1.1
40
+ - defusedxml=0.7.1
41
+ - dnspython=2.6.1
42
+ - email-validator=2.1.1
43
+ - email_validator=2.1.1
44
+ - entrypoints=0.4
45
+ - exceptiongroup=1.2.0
46
+ - executing=2.0.1
47
+ - expat=2.6.2
48
+ - fastapi=0.111.0
49
+ - fastapi-cli=0.0.4
50
+ - ffmpeg=7.0.1
51
+ - ffmpy=0.3.0
52
+ - filelock=3.14.0
53
+ - flask=3.0.3
54
+ - font-ttf-dejavu-sans-mono=2.37
55
+ - font-ttf-inconsolata=3.000
56
+ - font-ttf-source-code-pro=2.038
57
+ - font-ttf-ubuntu=0.83
58
+ - fontconfig=2.14.2
59
+ - fonts-conda-ecosystem=1
60
+ - fonts-conda-forge=1
61
+ - fonttools=4.53.0
62
+ - fqdn=1.5.1
63
+ - freetype=2.12.1
64
+ - fribidi=1.0.10
65
+ - fsspec=2024.6.0
66
+ - gettext=0.22.5
67
+ - gettext-tools=0.22.5
68
+ - gmp=6.3.0
69
+ - gnutls=3.7.9
70
+ - gradio=4.33.0
71
+ - gradio-client=0.17.0
72
+ - graphite2=1.3.13
73
+ - h11=0.14.0
74
+ - h2=4.1.0
75
+ - harfbuzz=8.5.0
76
+ - hpack=4.0.0
77
+ - httpcore=1.0.5
78
+ - httpx=0.27.0
79
+ - huggingface_hub=0.23.2
80
+ - hyperframe=6.0.1
81
+ - icu=73.2
82
+ - idna=3.7
83
+ - importlib-metadata=7.1.0
84
+ - importlib-resources=6.4.0
85
+ - importlib_metadata=7.1.0
86
+ - importlib_resources=6.4.0
87
+ - ipykernel=6.29.3
88
+ - ipython=8.25.0
89
+ - ipywidgets=8.1.3
90
+ - isoduration=20.11.0
91
+ - itsdangerous=2.2.0
92
+ - jedi=0.19.1
93
+ - jinja2=3.1.4
94
+ - joblib=1.4.2
95
+ - json5=0.9.25
96
+ - jsonpointer=2.4
97
+ - jsonschema=4.22.0
98
+ - jsonschema-specifications=2023.12.1
99
+ - jsonschema-with-format-nongpl=4.22.0
100
+ - jupyter=1.1.1
101
+ - jupyter-lsp=2.2.5
102
+ - jupyter_client=8.6.2
103
+ - jupyter_console=6.6.3
104
+ - jupyter_core=5.7.2
105
+ - jupyter_events=0.10.0
106
+ - jupyter_server=2.14.1
107
+ - jupyter_server_terminals=0.5.3
108
+ - jupyterlab=4.2.5
109
+ - jupyterlab_pygments=0.3.0
110
+ - jupyterlab_server=2.27.2
111
+ - jupyterlab_widgets=3.0.11
112
+ - kiwisolver=1.4.5
113
+ - krb5=1.21.2
114
+ - lame=3.100
115
+ - lcms2=2.16
116
+ - lerc=4.0.0
117
+ - libabseil=20240116.2
118
+ - libasprintf=0.22.5
119
+ - libasprintf-devel=0.22.5
120
+ - libass=0.17.1
121
+ - libblas=3.9.0
122
+ - libbrotlicommon=1.1.0
123
+ - libbrotlidec=1.1.0
124
+ - libbrotlienc=1.1.0
125
+ - libcblas=3.9.0
126
+ - libcxx=17.0.6
127
+ - libdeflate=1.20
128
+ - libedit=3.1.20191231
129
+ - libexpat=2.6.2
130
+ - libffi=3.4.2
131
+ - libgettextpo=0.22.5
132
+ - libgettextpo-devel=0.22.5
133
+ - libgfortran=5.0.0
134
+ - libgfortran5=13.2.0
135
+ - libglib=2.80.2
136
+ - libhwloc=2.10.0
137
+ - libiconv=1.17
138
+ - libidn2=2.3.7
139
+ - libintl=0.22.5
140
+ - libintl-devel=0.22.5
141
+ - libjpeg-turbo=3.0.0
142
+ - liblapack=3.9.0
143
+ - libopenblas=0.3.27
144
+ - libopenvino=2024.1.0
145
+ - libopenvino-arm-cpu-plugin=2024.1.0
146
+ - libopenvino-auto-batch-plugin=2024.1.0
147
+ - libopenvino-auto-plugin=2024.1.0
148
+ - libopenvino-hetero-plugin=2024.1.0
149
+ - libopenvino-ir-frontend=2024.1.0
150
+ - libopenvino-onnx-frontend=2024.1.0
151
+ - libopenvino-paddle-frontend=2024.1.0
152
+ - libopenvino-pytorch-frontend=2024.1.0
153
+ - libopenvino-tensorflow-frontend=2024.1.0
154
+ - libopenvino-tensorflow-lite-frontend=2024.1.0
155
+ - libopus=1.3.1
156
+ - libpng=1.6.43
157
+ - libprotobuf=4.25.3
158
+ - libsodium=1.0.18
159
+ - libsqlite=3.45.3
160
+ - libtasn1=4.19.0
161
+ - libtiff=4.6.0
162
+ - libunistring=0.9.10
163
+ - libvpx=1.14.0
164
+ - libwebp-base=1.4.0
165
+ - libxcb=1.15
166
+ - libxml2=2.12.7
167
+ - libzlib=1.3.1
168
+ - llvm-openmp=18.1.6
169
+ - markdown-it-py=3.0.0
170
+ - markupsafe=2.1.5
171
+ - matplotlib=3.8.4
172
+ - matplotlib-base=3.8.4
173
+ - matplotlib-inline=0.1.7
174
+ - mdurl=0.1.2
175
+ - mistune=3.0.2
176
+ - munkres=1.1.4
177
+ - natsort=8.4.0
178
+ - nbclient=0.10.0
179
+ - nbconvert=7.16.4
180
+ - nbconvert-core=7.16.4
181
+ - nbconvert-pandoc=7.16.4
182
+ - nbformat=5.10.4
183
+ - ncurses=6.5
184
+ - nest-asyncio=1.6.0
185
+ - nettle=3.9.1
186
+ - notebook=7.2.0
187
+ - notebook-shim=0.2.4
188
+ - numpy=1.26.4
189
+ - openh264=2.4.1
190
+ - openjpeg=2.5.2
191
+ - openssl=3.3.2
192
+ - orjson=3.10.3
193
+ - overrides=7.7.0
194
+ - p11-kit=0.24.1
195
+ - pandoc=3.2
196
+ - pandocfilters=1.5.0
197
+ - parso=0.8.4
198
+ - patsy=0.5.6
199
+ - pcre2=10.43
200
+ - pexpect=4.9.0
201
+ - pickleshare=0.7.5
202
+ - pillow=10.3.0
203
+ - pip=24.0
204
+ - pixman=0.43.4
205
+ - pkgutil-resolve-name=1.3.10
206
+ - platformdirs=4.2.2
207
+ - prometheus_client=0.20.0
208
+ - prompt-toolkit=3.0.42
209
+ - prompt_toolkit=3.0.42
210
+ - psutil=5.9.8
211
+ - pthread-stubs=0.4
212
+ - ptyprocess=0.7.0
213
+ - pugixml=1.14
214
+ - pure_eval=0.2.2
215
+ - pycparser=2.22
216
+ - pydub=0.25.1
217
+ - pygments=2.18.0
218
+ - pyobjc-core=10.2
219
+ - pyobjc-framework-cocoa=10.2
220
+ - pyparsing=3.1.2
221
+ - pysocks=1.7.1
222
+ - python=3.12.3
223
+ - python-fastjsonschema=2.19.1
224
+ - python-json-logger=2.0.7
225
+ - python-multipart=0.0.9
226
+ - python-tzdata=2024.1
227
+ - python_abi=3.12
228
+ - pytz=2024.1
229
+ - pyyaml=6.0.1
230
+ - pyzmq=26.0.3
231
+ - qtconsole-base=5.5.2
232
+ - qtpy=2.4.1
233
+ - readline=8.2
234
+ - referencing=0.35.1
235
+ - requests=2.32.3
236
+ - rfc3339-validator=0.1.4
237
+ - rfc3986-validator=0.1.1
238
+ - rich=13.7.1
239
+ - rpds-py=0.18.1
240
+ - ruff=0.4.7
241
+ - scikit-learn=1.5.2
242
+ - scipy=1.13.1
243
+ - seaborn=0.13.2
244
+ - seaborn-base=0.13.2
245
+ - semantic_version=2.10.0
246
+ - send2trash=1.8.3
247
+ - setuptools=70.0.0
248
+ - shellingham=1.5.4
249
+ - six=1.16.0
250
+ - snappy=1.2.0
251
+ - sniffio=1.3.1
252
+ - soupsieve=2.5
253
+ - stack_data=0.6.2
254
+ - starlette=0.37.2
255
+ - statsmodels=0.14.2
256
+ - svt-av1=2.1.0
257
+ - tbb=2021.12.0
258
+ - terminado=0.18.1
259
+ - threadpoolctl=3.5.0
260
+ - tinycss2=1.3.0
261
+ - tk=8.6.13
262
+ - tomli=2.0.1
263
+ - tomlkit=0.12.0
264
+ - toolz=0.12.1
265
+ - tornado=6.4
266
+ - tqdm=4.66.4
267
+ - traitlets=5.14.3
268
+ - typer=0.12.3
269
+ - typer-slim=0.12.3
270
+ - typer-slim-standard=0.12.3
271
+ - types-python-dateutil=2.9.0.20240316
272
+ - typing-extensions=4.12.1
273
+ - typing_extensions=4.12.1
274
+ - typing_utils=0.1.0
275
+ - tzdata=2024a
276
+ - ujson=5.10.0
277
+ - uri-template=1.3.0
278
+ - urllib3=2.2.1
279
+ - uvicorn=0.30.1
280
+ - wcwidth=0.2.13
281
+ - webcolors=1.13
282
+ - webencodings=0.5.1
283
+ - websocket-client=1.8.0
284
+ - websockets=11.0.3
285
+ - werkzeug=3.0.3
286
+ - wheel=0.43.0
287
+ - widgetsnbextension=4.0.11
288
+ - wikipedia=1.4.0
289
+ - wtforms=3.1.2
290
+ - x264=1!164.3095
291
+ - x265=3.5
292
+ - xorg-libxau=1.0.11
293
+ - xorg-libxdmcp=1.1.3
294
+ - xz=5.2.6
295
+ - yaml=0.2.5
296
+ - zeromq=4.3.5
297
+ - zipp=3.17.0
298
+ - zlib=1.3.1
299
+ - zstd=1.5.6
300
+ - pip:
301
+ - aiohttp==3.9.5
302
+ - aioitertools==0.12.0
303
+ - aiosignal==1.3.1
304
+ - aiosmtplib==2.0.2
305
+ - aiosqlite==0.20.0
306
+ - alembic==1.13.3
307
+ - anthropic==0.34.2
308
+ - arize-phoenix==5.1.2
309
+ - arize-phoenix-evals==0.16.1
310
+ - arize-phoenix-otel==0.5.1
311
+ - asgiref==3.8.1
312
+ - astor==0.8.1
313
+ - authlib==1.3.2
314
+ - azure-core==1.31.0
315
+ - azure-identity==1.17.1
316
+ - backoff==2.2.1
317
+ - bcrypt==4.2.0
318
+ - bio==1.7.1
319
+ - biopython==1.83
320
+ - biothings-client==0.3.1
321
+ - boto3==1.35.18
322
+ - botocore==1.35.18
323
+ - build==1.2.2
324
+ - cachetools==5.5.0
325
+ - chroma-hnswlib==0.7.6
326
+ - chromadb==0.5.11
327
+ - cohere==5.9.1
328
+ - coloredlogs==15.0.1
329
+ - cryptography==43.0.1
330
+ - dataclasses-json==0.6.6
331
+ - deprecated==1.2.14
332
+ - dirtyjson==1.0.8
333
+ - diskcache==5.6.3
334
+ - distro==1.9.0
335
+ - duckdb==1.1.1
336
+ - duckduckgo-search==6.2.13
337
+ - durationpy==0.9
338
+ - faker==19.13.0
339
+ - fastapi-mail==1.4.1
340
+ - fastavro==1.9.7
341
+ - flatbuffers==24.3.25
342
+ - frozenlist==1.4.1
343
+ - google-auth==2.35.0
344
+ - google-search-results==2.4.2
345
+ - googleapis-common-protos==1.65.0
346
+ - gprofiler-official==1.0.0
347
+ - graphql-core==3.2.4
348
+ - greenlet==3.0.3
349
+ - groq==0.11.0
350
+ - grpc-interceptor==0.15.4
351
+ - grpcio==1.66.2
352
+ - grpcio-tools==1.66.2
353
+ - hdbscan==0.8.38.post1
354
+ - httptools==0.6.1
355
+ - httpx-sse==0.4.0
356
+ - humanfriendly==10.0
357
+ - jiter==0.5.0
358
+ - jmespath==1.0.1
359
+ - jsonpatch==1.33
360
+ - jsonpath-python==1.0.6
361
+ - kubernetes==31.0.0
362
+ - langchain==0.3.1
363
+ - langchain-anthropic==0.2.1
364
+ - langchain-community==0.3.1
365
+ - langchain-core==0.3.9
366
+ - langchain-experimental==0.0.60
367
+ - langchain-groq==0.2.0
368
+ - langchain-openai==0.2.2
369
+ - langchain-text-splitters==0.3.0
370
+ - langgraph==0.2.34
371
+ - langgraph-checkpoint==2.0.0
372
+ - langsmith==0.1.130
373
+ - llama-cloud==0.1.0
374
+ - llama-cpp-python==0.2.77
375
+ - llama-index==0.11.14
376
+ - llama-index-agent-openai==0.3.4
377
+ - llama-index-callbacks-arize-phoenix==0.2.1
378
+ - llama-index-cli==0.3.1
379
+ - llama-index-core==0.11.14
380
+ - llama-index-embeddings-adapter==0.2.1
381
+ - llama-index-embeddings-openai==0.2.5
382
+ - llama-index-experimental==0.3.1
383
+ - llama-index-finetuning==0.2.0
384
+ - llama-index-indices-managed-llama-cloud==0.4.0
385
+ - llama-index-legacy==0.9.48
386
+ - llama-index-llms-azure-openai==0.2.1
387
+ - llama-index-llms-groq==0.2.0
388
+ - llama-index-llms-llama-cpp==0.1.3
389
+ - llama-index-llms-mistralai==0.2.3
390
+ - llama-index-llms-openai==0.2.9
391
+ - llama-index-llms-openai-like==0.2.0
392
+ - llama-index-llms-replicate==0.1.3
393
+ - llama-index-multi-modal-llms-openai==0.2.1
394
+ - llama-index-postprocessor-cohere-rerank==0.2.0
395
+ - llama-index-program-openai==0.2.0
396
+ - llama-index-question-gen-openai==0.2.0
397
+ - llama-index-readers-file==0.2.2
398
+ - llama-index-readers-llama-parse==0.3.0
399
+ - llama-parse==0.5.6
400
+ - llamaindex-py-client==0.1.19
401
+ - llvmlite==0.43.0
402
+ - mako==1.3.5
403
+ - marshmallow==3.21.2
404
+ - mistralai==1.0.3
405
+ - mmh3==5.0.1
406
+ - monotonic==1.6
407
+ - mpmath==1.3.0
408
+ - msal==1.31.0
409
+ - msal-extensions==1.2.0
410
+ - msgpack==1.1.0
411
+ - multidict==6.0.5
412
+ - mygene==3.2.2
413
+ - mypy-extensions==1.0.0
414
+ - networkx==3.3
415
+ - nltk==3.9.1
416
+ - numba==0.60.0
417
+ - numexpr==2.10.1
418
+ - oauthlib==3.2.2
419
+ - onnxruntime==1.19.2
420
+ - openai==1.51.0
421
+ - openinference-instrumentation==0.1.18
422
+ - openinference-instrumentation-llama-index==3.0.2
423
+ - openinference-semantic-conventions==0.1.10
424
+ - opentelemetry-api==1.27.0
425
+ - opentelemetry-exporter-otlp==1.27.0
426
+ - opentelemetry-exporter-otlp-proto-common==1.27.0
427
+ - opentelemetry-exporter-otlp-proto-grpc==1.27.0
428
+ - opentelemetry-exporter-otlp-proto-http==1.27.0
429
+ - opentelemetry-instrumentation==0.48b0
430
+ - opentelemetry-instrumentation-asgi==0.48b0
431
+ - opentelemetry-instrumentation-fastapi==0.48b0
432
+ - opentelemetry-proto==1.27.0
433
+ - opentelemetry-sdk==1.27.0
434
+ - opentelemetry-semantic-conventions==0.48b0
435
+ - opentelemetry-util-http==0.48b0
436
+ - packaging==23.2
437
+ - pandas==1.5.3
438
+ - pandasai==2.2.15
439
+ - parameterized==0.9.0
440
+ - pooch==1.8.1
441
+ - portalocker==2.10.1
442
+ - posthog==3.7.0
443
+ - primp==0.6.3
444
+ - protobuf==4.25.5
445
+ - pyarrow==17.0.0
446
+ - pyasn1==0.6.1
447
+ - pyasn1-modules==0.4.1
448
+ - pydantic==2.9.2
449
+ - pydantic-core==2.23.4
450
+ - pydantic-settings==2.5.2
451
+ - pyjwt==2.9.0
452
+ - pynndescent==0.5.13
453
+ - pypdf==4.2.0
454
+ - pypika==0.48.9
455
+ - pyproject-hooks==1.2.0
456
+ - python-dateutil==2.9.0.post0
457
+ - python-dotenv==1.0.1
458
+ - pytrials==1.0.0
459
+ - qdrant-client==1.11.3
460
+ - regex==2024.5.15
461
+ - replicate==0.26.0
462
+ - requests-oauthlib==2.0.0
463
+ - requests-toolbelt==1.0.0
464
+ - rsa==4.9
465
+ - s3transfer==0.10.2
466
+ - safetensors==0.4.3
467
+ - sentence-transformers==2.7.0
468
+ - sqlalchemy==2.0.30
469
+ - sqlean-py==3.45.1
470
+ - sqlglot==25.24.3
471
+ - sqlglotrs==0.2.12
472
+ - strawberry-graphql==0.236.0
473
+ - striprtf==0.0.26
474
+ - sympy==1.13.2
475
+ - tenacity==8.3.0
476
+ - tiktoken==0.7.0
477
+ - tokenizers==0.19.1
478
+ - torch==2.4.1
479
+ - transformers==4.41.2
480
+ - types-requests==2.32.0.20240907
481
+ - typing-inspect==0.9.0
482
+ - umap-learn==0.5.6
483
+ - uvloop==0.20.0
484
+ - watchfiles==0.24.0
485
+ - wrapt==1.16.0
486
+ - yarl==1.9.4
487
+ prefix: /Users/satoc/miniforge3/envs/gradio
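For reference, a hedged sketch (not part of the commit) of how the spec above can be sanity-checked; it assumes the block is saved locally as environment.yml (the filename is an assumption) and uses PyYAML, which is itself pinned above (pyyaml=6.0.1):

# Hedged sketch: list the conda and pip pins from the environment file above.
# Assumes the spec is saved as "environment.yml" (filename is an assumption).
import yaml

with open("environment.yml") as f:
    env = yaml.safe_load(f)

conda_pins = [d for d in env["dependencies"] if isinstance(d, str)]
pip_pins = next((d["pip"] for d in env["dependencies"] if isinstance(d, dict) and "pip" in d), [])
print(f"{len(conda_pins)} conda pins, {len(pip_pins)} pip pins")
print("gradio pin:", [d for d in conda_pins if d.startswith("gradio=")])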
oldapp.py ADDED
@@ -0,0 +1,100 @@
1
+ import gradio as gr
2
+ import pandas as pd
3
+ from OpenAITools.FetchTools import fetch_clinical_trials, fetch_clinical_trials_jp
4
+ from langchain_openai import ChatOpenAI
5
+ from langchain_groq import ChatGroq
6
+ from OpenAITools.CrinicalTrialTools import QuestionModifierEnglish, TumorNameExtractor, SimpleClinicalTrialAgent, GraderAgent
7
+
8
+ # Initialize the model and the agents
9
+ groq = ChatGroq(model_name="llama3-70b-8192", temperature=0)
10
+ modifier = QuestionModifierEnglish(groq)
11
+ extractor = TumorNameExtractor(groq)
12
+ CriteriaCheckAgent = SimpleClinicalTrialAgent(groq)
13
+ grader_agent = GraderAgent(groq)
14
+
15
+ # Function that builds the results DataFrame from a question
16
+ def generate_dataframe_from_question(ex_question):
17
+ # Modify and extract tumor name
18
+ modified_question = modifier.modify_question(ex_question)
19
+ tumor_name = extractor.extract_tumor_name(ex_question)
20
+
21
+ # Get clinical trials data based on tumor name
22
+ df = fetch_clinical_trials(tumor_name)
23
+ df['AgentJudgment'] = None
24
+ df['AgentGrade'] = None
25
+
26
+ # Build the list of NCT IDs and show a progress bar
27
+ NCTIDs = list(df['NCTID'])
28
+ progress = gr.Progress(track_tqdm=True)
29
+
30
+ for i, nct_id in enumerate(NCTIDs):
31
+ target_criteria = df.loc[df['NCTID'] == nct_id, 'Eligibility Criteria'].values[0]
32
+ agent_judgment = CriteriaCheckAgent.evaluate_eligibility(target_criteria, modified_question)
33
+ agent_grade = grader_agent.evaluate_eligibility(agent_judgment)
34
+
35
+ # Update DataFrame
36
+ df.loc[df['NCTID'] == nct_id, 'AgentJudgment'] = agent_judgment
37
+ df.loc[df['NCTID'] == nct_id, 'AgentGrade'] = agent_grade
38
+
39
+ # Update the progress bar (pass the progress as a float)
40
+ progress((i + 1) / len(NCTIDs))
41
+
42
+ # Reorder the columns into the specified order
43
+ columns_order = ['NCTID', 'AgentGrade', 'Title', 'AgentJudgment', 'Japanes Locations',
44
+ 'Primary Completion Date', 'Cancer', 'Summary', 'Eligibility Criteria']
45
+ df = df[columns_order]
46
+
47
+ return df, df  # return the DataFrame twice: one copy for filtering, one for display
48
+
49
+ # Function that keeps only the rows whose AgentGrade matches a given value (yes, no, unclear)
50
+ def filter_rows_by_grade(original_df, grade):
51
+ df_filtered = original_df[original_df['AgentGrade'] == grade]
52
+ return df_filtered, df_filtered  # return the filtered result twice (display and state)
53
+
54
+ # Function that saves the filtered DataFrame as a CSV file for download
55
+ def download_filtered_csv(df):
56
+ file_path = "filtered_data.csv"  # saved in the current working directory
57
+ df.to_csv(file_path, index=False)  # write out as a CSV file
58
+ return file_path
59
+
60
+ # Build the Gradio interface
61
+ with gr.Blocks() as demo:
62
+ # Description
63
+ gr.Markdown("## 質問を入力して、患者さんが参加可能な臨床治験の情報を収集。参加可能か否かを判断根拠も含めて提示します。結果はcsvとしてダウンロード可能です")
64
+
65
+ # Question input box
66
+ question_input = gr.Textbox(label="質問を入力してください", placeholder="例: 65歳男性でBRCA遺伝子の変異がある前立腺癌患者さんが参加できる臨床治験を教えて下さい。")
67
+
68
+ # DataFrame display area
69
+ dataframe_output = gr.DataFrame()
70
+
71
+ # State objects that hold the original and filtered DataFrames
72
+ original_df = gr.State()
73
+ filtered_df = gr.State()
74
+
75
+ # Button that generates the DataFrame
76
+ generate_button = gr.Button("日本で行われている患者さんの癌腫の臨床治験を全て取得する")
77
+
78
+ # Buttons that show only the rows whose AgentGrade is yes, no, or unclear
79
+ yes_button = gr.Button("AI Agentが患者さんが参加可能であると判断した臨床治験のみを表示")
80
+ no_button = gr.Button("AI Agentが患者さんが参加不可であると判断した臨床治験のみを表示")
81
+ unclear_button = gr.Button("AI Agentが与えられた情報だけでは判断不可能とした臨床治験のみを表示")
82
+
83
+ # Button that downloads the filtered results
84
+ download_button = gr.Button("フィルタ結果をCSVとしてダウンロード")
85
+ download_output = gr.File()  # output area for the downloaded file
86
+
87
+ # Generate the DataFrame and store it in state
88
+ generate_button.click(fn=generate_dataframe_from_question, inputs=question_input, outputs=[dataframe_output, original_df])
89
+
90
+ # Show the filtered data when the yes / no / unclear button is pressed
91
+ yes_button.click(fn=filter_rows_by_grade, inputs=[original_df, gr.State("yes")], outputs=[dataframe_output, filtered_df])
92
+ no_button.click(fn=filter_rows_by_grade, inputs=[original_df, gr.State("no")], outputs=[dataframe_output, filtered_df])
93
+ unclear_button.click(fn=filter_rows_by_grade, inputs=[original_df, gr.State("unclear")], outputs=[dataframe_output, filtered_df])
94
+
95
+ # Download the filtered results as a CSV file when the download button is pressed
96
+ download_button.click(fn=download_filtered_csv, inputs=filtered_df, outputs=download_output)
97
+
98
+
99
+ if __name__ == "__main__":
100
+ demo.launch()
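A minimal, self-contained sketch (not part of the committed file) of how the grade filter above behaves on a toy DataFrame; the function body mirrors filter_rows_by_grade from oldapp.py and the column names are the ones the app uses:

# Minimal sketch: how filter_rows_by_grade behaves on a toy DataFrame.
import pandas as pd

toy = pd.DataFrame({
    "NCTID": ["NCT001", "NCT002", "NCT003"],
    "AgentGrade": ["yes", "no", "unclear"],
})

def filter_rows_by_grade(original_df, grade):
    # same logic as the app: keep rows whose AgentGrade equals the requested grade
    df_filtered = original_df[original_df["AgentGrade"] == grade]
    return df_filtered, df_filtered

display_df, state_df = filter_rows_by_grade(toy, "yes")
print(display_df)  # only the NCT001 row remains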
requirements.txt ADDED
@@ -0,0 +1,76 @@
1
+ aiohttp==3.9.5
2
+ aiosignal==1.3.1
3
+ bio==1.7.1
4
+ biopython==1.83
5
+ biothings-client==0.3.1
6
+ dataclasses-json==0.6.6
7
+ Deprecated==1.2.14
8
+ dirtyjson==1.0.8
9
+ diskcache==5.6.3
10
+ distro==1.9.0
11
+ frozenlist==1.4.1
12
+ gprofiler-official==1.0.0
13
+ greenlet==3.0.3
14
+ hpack==4.0.0
15
+ jsonpatch==1.33
16
+ langchain==0.3.2
17
+ langchain-community==0.3.1
18
+ langchain-core==0.3.9
19
+ langchain-experimental==0.3.2
20
+ langchain-groq==0.2.0
21
+ langchain-openai==0.2.2
22
+ langchain-text-splitters==0.3.0
23
+ langchainhub==0.1.21
24
+ langgraph==0.2.34
25
+ langgraph-checkpoint==2.0.1
26
+ langsmith==0.1.131
27
+ llama-index==0.10.43
28
+ llama-index-agent-openai==0.2.7
29
+ llama-index-cli==0.1.12
30
+ llama-index-core==0.10.43
31
+ llama-index-embeddings-openai==0.1.10
32
+ llama-index-indices-managed-llama-cloud==0.1.6
33
+ llama-index-legacy==0.9.48
34
+ llama-index-llms-groq==0.1.4
35
+ llama-index-llms-llama-cpp==0.1.3
36
+ llama-index-llms-openai==0.1.22
37
+ llama-index-llms-openai-like==0.1.3
38
+ llama-index-llms-replicate==0.1.3
39
+ llama-index-multi-modal-llms-openai==0.1.6
40
+ llama-index-program-openai==0.1.6
41
+ llama-index-question-gen-openai==0.1.3
42
+ llama-index-readers-file==0.1.23
43
+ llama-index-readers-llama-parse==0.1.4
44
+ llama-parse==0.4.4
45
+ llama_cpp_python==0.2.77
46
+ llamaindex-py-client==0.1.19
47
+ marshmallow==3.21.2
48
+ multidict==6.0.5
49
+ munkres==1.1.4
50
+ mygene==3.2.2
51
+ mypy-extensions==1.0.0
52
+ natsort==8.4.0
53
+ networkx==3.3
54
+ nltk
55
+ openai
56
+ packaging==23.2
57
+ pooch==1.8.1
58
+ pypdf==4.2.0
59
+ pytrials==1.0.0
60
+ regex==2024.5.15
61
+ replicate==0.26.0
62
+ safetensors
63
+ setuptools==70.0.0
64
+ SQLAlchemy==2.0.30
65
+ striprtf==0.0.26
66
+ tenacity==8.3.0
67
+ tiktoken==0.7.0
68
+ tokenizers==0.19.1
69
+ transformers==4.41.2
70
+ typer==0.12.3
71
+ typer-slim==0.12.3
72
+ typing-inspect==0.9.0
73
+ wheel==0.43.0
74
+ wikipedia==1.4.0
75
+ wrapt==1.16.0
76
+ yarl==1.9.4
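As a quick, hedged sanity check (not part of the commit), the sketch below compares a few of the pins above against the versions actually installed; the package names and expected versions are taken directly from this requirements.txt:

# Hedged sketch: verify a few pins from requirements.txt against installed versions.
from importlib.metadata import PackageNotFoundError, version

expected = {"langchain": "0.3.2", "langgraph": "0.2.34", "tiktoken": "0.7.0"}
for name, want in expected.items():
    try:
        got = version(name)
    except PackageNotFoundError:
        got = "not installed"
    status = "OK" if got == want else "MISMATCH"
    print(f"{name}: installed={got}, pinned={want} -> {status}")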