高橋慧 committed
Commit 5703564 · 1 Parent(s): cebc20f

Move chromedriver to Git LFS

.gitignore ADDED
@@ -0,0 +1,135 @@
# Add any directories, files, or patterns you don't want to be tracked by version control


# Byte-compiled / optimized / DLL files
__pycache__/
#*.py[cod]
#*$py.class
#*.txt
#*.tsv
#*.csv
*.xlsx
*.pdf
*.nii
#*.nii.gz
*.DS_Store
#*.png
#*.pyn
*.jpg
*.nii.gz
*.pkl
*-checkpoint.ipynb
*.pkls
*.pth
*.yaml
*.ckpt
# C extensions
#*.so

# Distribution / packaging
#.Python
#build/
#develop-eggs/
#dist/
#downloads/
#eggs/
#.eggs/
#lib/
#lib64/
#parts/
#sdist/
#var/
#wheels/
#*.egg-info/
#.installed.cfg
#*.egg
#MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
#*.manifest
#*.spec

# Installer logs
#pip-log.txt
#pip-delete-this-directory.txt

# Unit test / coverage reports
#htmlcov/
#.tox/
#.coverage
#.coverage.*
#.cache
#nosetests.xml
#coverage.xml
#*.cover
#.hypothesis/
#.pytest_cache/

# Translations
#*.mo
#*.pot

# Django stuff:
#*.log
#.static_storage/
#.media/
#local_settings.py

# Flask stuff:
#instance/
#.webassets-cache

# Scrapy stuff:
#.scrapy

# Sphinx documentation
#docs/_build/

# PyBuilder
#target/

# Jupyter Notebook
.ipynb_checkpoint/*

# pyenv
#.python-version

# celery beat schedule file
#celerybeat-schedule

# SageMath parsed files
#*.sage.py

# Environments
#.env
#.venv
#env/
#venv/
#ENV/
#env.bak/
#venv.bak/

# Spyder project settings
#.spyderproject
#.spyproject

# Rope project settings
#.ropeproject

# mkdocs documentation
#/site
/models/
# mypy
#.mypy_cache/
#over 100MB

# Add any directories, files, or patterns you don't want to be tracked by version control


#deep settings
*.h5

.OpenAITools/chromedriver
/OpenAITools/chromedriver
OpenAITools/ECarteTools.py ADDED
@@ -0,0 +1,73 @@
import openai
import time
import wikipedia
import random
import re
import requests
from bs4 import BeautifulSoup
import os
import glob
from natsort import natsorted
import xml.etree.ElementTree as ET
import pandas as pd
from pytrials.client import ClinicalTrials  # used by get_selected_fileds below

wikipedia.set_lang("ja")
# Set the API key
openai.api_key = os.environ['OPENAI_API_KEY']
engine="gpt-3.5-turbo"


def generate(system_template,prompt,engine="gpt-3.5-turbo"):
    while True: # Caution: retries forever while the OpenAI API is down
        try:
            response = openai.ChatCompletion.create(
                model=engine,
                messages=[
                    {"role": "system", "content": system_template},
                    {"role": "user", "content":prompt},
                ]
            )
            result=response["choices"][0]["message"]["content"]
            return result
        except Exception:
            print("リトライ")  # retry
            time.sleep(30)

def generate_carte(prompt,engine="gpt-3.5-turbo"):
    while True: # Caution: retries forever while the OpenAI API is down
        try:
            response = openai.ChatCompletion.create(
                model=engine,
                messages=[
                    {"role": "system", "content": "You are useful assistant"},
                    {"role": "user", "content":"%s\n・・・という患者と医師の会話をSOAP形式のカルテとして日本語で端的にまとめて下さい。各セクションはS),O), A),P)として下さい "%prompt},
                ]
            )
            result=response["choices"][0]["message"]["content"]
            return result
        except Exception:
            print("リトライ")  # retry
            time.sleep(30)

def get_selected_fileds(texts):
    ct = ClinicalTrials()  # the client object was referenced but never created in the original
    input_name = texts.replace(' ' , "+")
    corona_fields = ct.get_study_fields(
        search_expr="%s SEARCH[Location](AREA[LocationCountry]Japan AND AREA[LocationStatus]Recruiting)"%(input_name),
        fields=["NCTId", "Condition", "BriefTitle",'BriefSummary','EligibilityCriteria'],
        max_studies=500,
        fmt="csv")
    return corona_fields

def get_retriever_str(fields):
    retriever_str=''
    for i in range(1,len(fields)):
        colnames = fields[0]
        targetCol = fields[i]
        for f in range(len(fields[0])):
            retriever_str+=colnames[f] + ":" + targetCol[f] +"\n"
        retriever_str+='\n'
    return retriever_str
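A minimal usage sketch for ECarteTools, assuming OPENAI_API_KEY is set and an openai release that still exposes the legacy openai.ChatCompletion interface used above; the sample dialogue is invented for illustration.

# Hypothetical example: turn an invented doctor-patient exchange into a SOAP-style note.
from OpenAITools.ECarteTools import generate, generate_carte

dialogue = "医師: 今日はどうされましたか。 患者: 3日前から咳と微熱が続いています。"
print(generate_carte(dialogue))  # Japanese SOAP-format summary
print(generate("You are a helpful assistant.", "Summarize this conversation in one English sentence: " + dialogue))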
OpenAITools/ExpertTools.py ADDED
@@ -0,0 +1,243 @@
import os
import openai
import time
import wikipedia
import random
import re
import requests
from bs4 import BeautifulSoup
import glob
from natsort import natsorted
import xml.etree.ElementTree as ET
from pytrials.client import ClinicalTrials
from Bio import Entrez
import pandas as pd
import numpy as np
#from langchain.agents import create_pandas_dataframe_agent
from langchain_experimental.agents import create_pandas_dataframe_agent
from langchain.llms import OpenAI

# Set the API key
openai.api_key = os.environ['OPENAI_API_KEY']
gptengine="gpt-3.5-turbo"


"""def get_selected_fileds(texts):
    ct = ClinicalTrials()
    input_name = texts.replace(' ' , "+")
    corona_fields = ct.get_study_fields(
        search_expr="%s SEARCH[Location](AREA[LocationCountry]Japan AND AREA[LocationStatus]Recruiting)"%(input_name),
        fields=["NCTId", "Condition", "BriefTitle",'BriefSummary','EligibilityCriteria'],
        max_studies=500,
        fmt="csv")
    return corona_fields"""

def get_retriever_str(fields):
    retriever_str=''
    for i in range(1,len(fields)):
        colnames = fields[0]
        targetCol = fields[i]
        for f in range(len(fields[0])):
            retriever_str+=colnames[f] + ":" + targetCol[f] +"\n"
        retriever_str+='\n'
    return retriever_str

def get_chanked_retriever(fields):
    retriever_list =[]
    for i in range(1,len(fields)):
        retriever_str=''
        colnames = fields[0]
        targetCol = fields[i]
        for f in range(len(fields[0])):
            retriever_str+=colnames[f] + ":" + targetCol[f] +"\n"
        retriever_list.append(retriever_str)
    return retriever_list

def get_selected_fields(texts, split_criteria=False,
                        split_word_number = False, split_number=700):
    ct = ClinicalTrials()
    input_name = texts.replace(' ', "+")
    corona_fields = ct.get_study_fields(
        search_expr="%s SEARCH[Location](AREA[LocationCountry]Japan AND AREA[LocationStatus]Recruiting)" % (input_name),
        fields=["NCTId", "Condition", "BriefTitle", 'BriefSummary', 'EligibilityCriteria'],
        max_studies=500,
        fmt="csv")

    if split_criteria:
        new_fields = []

        # Strings to search for
        target_string1 = 'Exclusion Criteria'
        target_string2 = 'Exclusion criteria'

        # Look for the target string in each element and split just before it
        for corona_field in corona_fields:
            new_list = []
            for item in corona_field:
                if target_string1 in item:
                    split_position = item.index(target_string1)
                    new_list.append(item[:split_position])
                    new_list.append(item[split_position:])
                elif target_string2 in item:
                    split_position = item.index(target_string2)
                    new_list.append(item[:split_position])
                    new_list.append(item[split_position:])
                else:
                    new_list.append(item)
            new_fields.append(new_list)
    else:
        new_fields = corona_fields

    if split_word_number:
        split_fields = []
        for new_field in new_fields:
            new_list= []

            # Split any element longer than split_number characters
            for item in new_field:
                item_length = len(item)
                if item_length > split_number:
                    num_parts = -(-item_length // split_number)  # ceiling division to get the number of chunks
                    for i in range(num_parts):
                        start_index = i * split_number
                        end_index = min((i + 1) * split_number, item_length)  # do not run past the end of the string
                        new_list.append(item[start_index:end_index])
                else:
                    new_list.append(item)

            split_fields.append(new_list)
        new_fields = split_fields

    return new_fields


def print_agent_results(df, Ids,
                        interesteds = ['Condition', 'BriefTitle', 'BriefSummary', 'EligibilityCriteria'],
                        translater=None):
    results = ""
    for Id in Ids:
        print("%s\n"%Id)
        sdf = df[df['NCTId'] == Id]
        for interested in interesteds:
            # Take the first element
            results += '%s: \n %s \n' % (interested, sdf[interested].iloc[0])
            #print('%s: \n %s \n' % (interested, sdf[interested].iloc[0]))
    if translater:
        to_be_printed = translater.translate(results)
    else:
        to_be_printed =results
    print(to_be_printed)

def search(query):
    Entrez.email = os.getenv('MAIL_ADRESS')
    handle = Entrez.esearch(db='pubmed',
                            sort = 'relevance',
                            retmax = '20',
                            retmode = 'xml',
                            term = query)
    results = Entrez.read(handle)
    return results

def fetch_details(id_list):
    ids = ','.join(id_list)
    Entrez.email = os.getenv('MAIL_ADRESS')
    handle = Entrez.efetch(db = 'pubmed',
                           retmode = 'xml',
                           id = ids)
    results = Entrez.read(handle)
    return results

'''def generate(prompt,engine=None):
    if engine is None:
        engine=gptengine
    while True: # Caution: retries forever while the OpenAI API is down
        try:
            response = openai.ChatCompletion.create(
                model=engine,
                messages=[
                    {"role": "system", "content": "You are useful assistant"},
                    {"role": "user", "content":prompt},
                ]
            )
            result=response["choices"][0]["message"]["content"]
            return result
        except Exception as e:
            print(e)
            print("リトライ")
            time.sleep(30)
'''

def generate(prompt,engine=None):
    if engine is None:
        engine=gptengine
    while True: # Caution: retries forever while the OpenAI API is down
        try:
            response = openai.chat.completions.create(
                model=engine,
                messages=[
                    {"role": "system", "content": "You are useful assistant"},
                    {"role": "user", "content":prompt},
                ]
            )
            #result=response["choices"][0]["message"]["content"]
            result=response.choices[0].message.content
            return result
        except Exception as e:
            print(e)
            print("リトライ")  # retry
            time.sleep(30)

def GetPubmedSummaryDf(studies):
    title_list= []
    abstract_list=[]
    journal_list = []
    language_list =[]
    pubdate_year_list = []
    pubdate_month_list = []
    studiesIdList = studies['IdList']
    chunk_size = 10000
    for chunk_i in range(0, len(studiesIdList), chunk_size):
        chunk = studiesIdList[chunk_i:chunk_i + chunk_size]

        try:
            papers = fetch_details(chunk)
            for i, paper in enumerate(papers['PubmedArticle']):
                title_list.append(paper['MedlineCitation']['Article']['ArticleTitle'])
                try:
                    abstract_list.append(paper['MedlineCitation']['Article']['Abstract']['AbstractText'][0])
                except Exception:
                    abstract_list.append('No Abstract')
                journal_list.append(paper['MedlineCitation']['Article']['Journal']['Title'])
                language_list.append(paper['MedlineCitation']['Article']['Language'][0])
                try:
                    pubdate_year_list.append(paper['MedlineCitation']['Article']['Journal']['JournalIssue']['PubDate']['Year'])
                except Exception:
                    pubdate_year_list.append('No Data')
                try:
                    pubdate_month_list.append(paper['MedlineCitation']['Article']['Journal']['JournalIssue']['PubDate']['Month'])
                except Exception:
                    pubdate_month_list.append('No Data')
        except Exception:  # occasionally a chunk might annoy your parser
            pass
    df = pd.DataFrame(list(zip(
        title_list, abstract_list, journal_list, language_list, pubdate_year_list,
        pubdate_month_list)),
        columns=['Title', 'Abstract', 'Journal', 'Language', 'Year','Month'])
    return df, abstract_list

def ClinicalAgent(fileds, verbose=False):
    df = pd.DataFrame.from_records(fileds[1:], columns=fileds[0])
    return create_pandas_dataframe_agent(OpenAI(temperature=0, model='gpt-3.5-turbo-16k'), df, verbose=verbose)

def GetNCTID(results):
    # Regular expression matching words that start with NCT
    pattern = r'\bNCT\d+\b'
    # Extract the matching words
    nct_words = re.findall(pattern,results)
    return nct_words
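A sketch of how the ExpertTools helpers might be chained, assuming the OPENAI_API_KEY and MAIL_ADRESS environment variables are set; the condition and gene below are illustrative only.

from OpenAITools.ExpertTools import (GetNCTID, GetPubmedSummaryDf,
                                     get_retriever_str, get_selected_fields, search)

# Recruiting trials in Japan for an example condition, split at the exclusion criteria.
fields = get_selected_fields("lung cancer", split_criteria=True)
print(get_retriever_str(fields)[:500])

# PubMed abstracts on the same topic, as a DataFrame plus the raw abstract list.
studies = search("(lung cancer) AND (EGFR)")
df, abstracts = GetPubmedSummaryDf(studies)
print(df.head())

# Pull NCT identifiers out of free text returned by an agent.
print(GetNCTID("The most relevant study is NCT01234567."))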
OpenAITools/ReviewPaperTools.py ADDED
@@ -0,0 +1,42 @@
import re
import pandas as pd

def parse_text_file(text):
    # Regular expression pattern used to split the text into sections
    # \d+ matches one or more digits
    pattern = re.compile(r'\n\n\n\d+\.')

    # Split the text into sections
    sections = pattern.split(text)[1:]  # drop the leading empty section

    # Strip whitespace around each section
    sections = [section.strip() for section in sections]

    return sections

def split_sections(text):
    contents = text.split('\n\n')
    contents = [section.strip() for section in contents if section.strip()]
    keys = []  # fallback so an unexpected section count no longer raises a NameError
    if len(contents) == 8:
        keys = ['PublishInfo', 'Title', 'AuthorName', 'AuthorInfo', 'Abstract', 'Copyrights', 'DOI', 'COI']
    elif len(contents) == 7:
        keys = ['PublishInfo', 'Title', 'AuthorName', 'AuthorInfo', 'Abstract', 'Copyrights', 'DOI']
    elif len(contents) == 6:
        keys = ['PublishInfo', 'Title', 'AuthorName', 'AuthorInfo', 'Abstract', 'DOI']
    elif len(contents) == 5:
        keys = ['PublishInfo', 'Title', 'AuthorName', 'Abstract', 'DOI']

    # Build the dictionary; keys without a matching element get an empty string
    section_dict = {key: contents[i] if i < len(contents) else "" for i, key in enumerate(keys)}
    return section_dict


def GetSummaryDf(textdir):
    with open(textdir, 'r', encoding='utf-8') as f:
        content = f.read()
    sections = parse_text_file(content)
    dicts = []
    for section in sections:
        splited_dic = split_sections(section)
        dicts.append(splited_dic)
    return pd.DataFrame(dicts)
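A short usage sketch for ReviewPaperTools, assuming a hypothetical PubMed-style text export reviews.txt whose records are separated by numbered headings and blank lines, as parse_text_file expects.

from OpenAITools.ReviewPaperTools import GetSummaryDf

summary_df = GetSummaryDf("reviews.txt")  # hypothetical export file
print(summary_df[["Title", "DOI"]].head())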
OpenAITools/scrapeThisData.py ADDED
@@ -0,0 +1,237 @@
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By

import requests
from bs4 import BeautifulSoup
import re

import os
import time

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
import chromedriver_autoinstaller

class ScrapeThatData:

    def __init__(self, time_threshold = 10):

        try:
            chrome_options = webdriver.ChromeOptions()
            chrome_options.add_argument('--no-sandbox')
            self.driver = webdriver.Chrome(options=chrome_options)

        except Exception:
            chromedriver_autoinstaller.install()
            chrome_options = webdriver.ChromeOptions()
            chrome_options.add_argument('--no-sandbox')
            self.driver = webdriver.Chrome(options=chrome_options)

        self.wait = WebDriverWait(self.driver, time_threshold)
        self.attribute_dict = {'status':1 ,'conditions':2, 'interventions': 3, 'study type':4,
                               'phase':5, 'sponsor':6, 'funder type':7 , 'study design': 8,
                               'outcome measures':9, 'number enrolled':10, 'sex':11, 'age':12,
                               'nct number': 13, 'other ids':14, 'title acronym': 15 , 'study start': 16,
                               'primary completion': 17, 'study completion': 18 , 'first posted': 19,
                               'last update posted': 20 , 'results first posted': 21 , 'locations':22, 'study documents': 23}

        self.status_dict = {'not yet recruiting' : 'notYetRecrCB',
                            'recruiting' : 'recruitingCB',
                            'enrolling by invitation':'enrollingByInvCB',
                            'active, not recruiting': 'activeCB',
                            'suspended': 'suspendedCB',
                            'terminated':'terminatedCB',
                            'completed':'completedCB',
                            'withdrawn': 'withdrawnCB',
                            'unknown status': 'unknownCB'}

    def clicking_show_hide_cols(self, driver):
        columns = driver.find_element(By.XPATH,'//*[@id="theDataTable_wrapper"]/div[3]/button')
        action_chain = ActionChains(driver)
        action_chain.move_to_element(columns).click()
        action_chain.perform()

    def select_attributes_to_show(self, listed_attributes, attribute_dict):
        ll = [value.lower() for value in listed_attributes if value.lower() in ['status', 'conditions', 'interventions', 'locations']]
        if ll:
            to_show = [value.lower() for value in listed_attributes if value.lower() not in ll]
            to_hide = [value for value in ['status', 'conditions', 'interventions', 'locations'] if value not in ll]
            to_click = to_hide + to_show
            for att in to_click:
                self.clicking_show_hide_cols(self.driver)
                time.sleep(1)
                self.wait.until(EC.presence_of_element_located((By.XPATH,'//*[@id="theDataTable_wrapper"]/div[3]/div[2]/button['+ str(attribute_dict[att]) + ']'))).click()
                time.sleep(1)
        else:
            for att in listed_attributes:
                self.clicking_show_hide_cols(self.driver)
                time.sleep(1)
                self.wait.until(EC.presence_of_element_located((By.XPATH,'//*[@id="theDataTable_wrapper"]/div[3]/div[2]/button['+ str(attribute_dict[att.lower()]) + ']'))).click()
                time.sleep(1)

    def select_by_status(self, listed_states, status_dict):
        if listed_states:
            for status in listed_states:
                self.driver.find_element(By.ID, status_dict[status.lower()]).click()

            self.driver.find_element(By.XPATH,'//*[@id="FiltersBody"]/div[1]/input[1]').click()
            time.sleep(3)

        # find_element_by_name was removed in Selenium 4; use the By locator instead
        select = Select(self.driver.find_element(By.NAME, 'theDataTable_length'))
        select.select_by_value('100')

    def collect_data_search_page(self, l_ordered, amount_of_data = None):

        class_name = ''
        page_index = 1

        elements = [l_ordered]

        while 'disabled' not in class_name:

            time.sleep(10)

            print('Getting data from page {}'.format(page_index))

            # Count how many rows the table currently shows
            table = self.driver.find_element(By.ID, 'theDataTable')
            row_count = len(table.find_elements(By.TAG_NAME, "tr"))

            # Loop over the rows of the current table page
            for index in range(1, row_count):
                row = []
                if 'status' in l_ordered:
                    self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,'#theDataTable > tbody > tr:nth-child('+str(index)+') > td:nth-child(3)')))
                    # CSS selectors need By.CSS_SELECTOR and a single element, not find_elements(By.CLASS_NAME, ...)
                    status_element = self.driver.find_element(By.CSS_SELECTOR,'#theDataTable > tbody > tr:nth-child('+str(index)+') > td:nth-child(3) > span')
                    row.append(status_element.text.strip())
                    for i, val in enumerate(l_ordered):
                        if val == 'status':
                            continue

                        self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,'#theDataTable > tbody > tr:nth-child('+str(index)+') > td:nth-child('+str(4+i)+')')))
                        element = self.driver.find_element(By.CSS_SELECTOR,'#theDataTable > tbody > tr:nth-child('+str(index)+') > td:nth-child('+str(4+i)+')')
                        try:
                            row.append(element.text.strip())
                        except Exception:
                            print(i, element)
                else:
                    for i, val in enumerate(l_ordered):
                        self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,'#theDataTable > tbody > tr:nth-child('+str(index)+') > td:nth-child('+str(3+i)+')')))
                        element = self.driver.find_element(By.CSS_SELECTOR,'#theDataTable > tbody > tr:nth-child('+str(index)+') > td:nth-child('+str(3+i)+')')
                        try:
                            row.append(element.text.strip())
                        except Exception:
                            print(i, element)
                elements.append(row)

            # Get the next-page button
            next_page = self.driver.find_element(By.ID, "theDataTable_next")

            # Read its class attribute ('disabled' means this was the last page)
            class_name = next_page.get_attribute('class')

            # Go to the next page
            next_page.click()
            page_index += 1

            if amount_of_data:
                if len(elements) >= amount_of_data or row_count < amount_of_data:
                    break
                else:
                    continue

        return elements

    def get_criteria(self, NCTnumber):

        url = 'https://clinicaltrials.gov/ct2/show/' + NCTnumber
        ClinicalTrialpage = requests.get(url)
        soup = BeautifulSoup(ClinicalTrialpage.text, 'html.parser')

        wrapping_crit_class = soup.find_all("div", {"class": "tr-indent2"})
        list_elements = wrapping_crit_class[1].find_all(re.compile("(ul|ol)"))
        inclusion, exclusion = ('', '')

        if not list_elements:
            print("WARNING: Study number " + NCTnumber + " doesn't have eligibility criteria or HTML tag format is not a list")
        else:
            if len(list_elements) == 1:
                try:
                    if wrapping_crit_class[1].find(text = 'Inclusion Criteria:'):
                        inclusion = list_elements[0].find_all("li")

                    elif wrapping_crit_class[1].find(text = 'Exclusion Criteria:'):
                        exclusion = list_elements[0].find_all("li")
                except Exception:
                    print('criteria doesnt exist')
            else:
                inclusion = list_elements[0].find_all("li")
                exclusion = list_elements[1].find_all("li")

        inclusion = ' '.join([t.text.strip() for t in inclusion])
        exclusion = ' '.join([t.text.strip() for t in exclusion])

        return (inclusion, exclusion)

    # Get the number of patients enrolled in a study
    def get_enrollment(self, NCTnumber):
        url = 'https://clinicaltrials.gov/ct2/show/' + NCTnumber
        ClinicalTrialpage = requests.get(url)
        soup = BeautifulSoup(ClinicalTrialpage.text, 'html.parser')
        enrollment = ''
        wrapping_enrol_class = soup.find_all('td', {'headers':'studyInfoColData','style':"padding-left:1em"})
        if not wrapping_enrol_class:
            print('WARNING: Number of Participants in Study number ' + NCTnumber + ' is unavailable')
        else:
            enrollment = wrapping_enrol_class[1]
            enrollment = enrollment.text.split()[0]
            if enrollment.isdigit() == False:
                print('WARNING: Number of Participants in Study number ' + NCTnumber + ' is unavailable')
            else:
                return (enrollment)

    def __call__(self, condition, listed_attributes, listed_states, amount_of_data):

        self.driver.get('https://clinicaltrials.gov/ct2/results?cond=' + condition + '&rank=1&view=record#rowId0')
        self.select_attributes_to_show(listed_attributes, self.attribute_dict)

        try:
            self.select_by_status(listed_states, self.status_dict)
        except Exception:
            print('select by status is a problem')
        n = []
        for i in listed_attributes:
            n.append(self.attribute_dict[i.lower()])
        attribute_ordered = [list(self.attribute_dict.keys())[list(self.attribute_dict.values()).index(i)] for i in sorted(n)]

        search_data = self.collect_data_search_page(attribute_ordered, amount_of_data=amount_of_data)
        nct_numbers = [e[search_data[0].index('nct number')] for e in search_data[1:]]
        search_data[0].extend(['inclusion', 'exclusion', 'enrollment'])
        for index, nct in enumerate(nct_numbers):
            if index % 100 == 0 and index != 0:
                print("Collected Data from {} Studies: ".format(index))

            inc, exc = self.get_criteria(nct)
            enrol = self.get_enrollment(nct)
            search_data[index + 1].extend([inc, exc, enrol])
        return search_data
        # except:
        #     print('no data available with the specified status')
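An illustrative (untested) sketch of the scraper above; it assumes a working local Chrome/chromedriver setup and the legacy clinicaltrials.gov ct2 pages that these selectors were written against, so treat it as a sketch only.

from OpenAITools.scrapeThisData import ScrapeThatData

scraper = ScrapeThatData(time_threshold=15)
rows = scraper(condition="lung+cancer",
               listed_attributes=["NCT Number", "Conditions", "Status"],
               listed_states=["Recruiting"],
               amount_of_data=50)
print(rows[0])  # header row, extended with inclusion / exclusion / enrollment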
README.md CHANGED
@@ -1,10 +1,10 @@
 ---
 title: GetMutationInfo
-emoji: 💻
-colorFrom: pink
-colorTo: red
+emoji: 🔥
+colorFrom: green
+colorTo: green
 sdk: gradio
-sdk_version: 4.36.0
+sdk_version: 4.33.0
 app_file: app.py
 pinned: false
 ---
app.py ADDED
@@ -0,0 +1,51 @@
from OpenAITools.ExpertTools import GetPubmedSummaryDf, generate, search
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, SummaryIndex
from llama_index.core import Document
from llama_index.llms.groq import Groq
from llama_index.core import ServiceContext, set_global_service_context
from llama_index.llms.llama_cpp.llama_utils import messages_to_prompt, completion_to_prompt
import gradio as gr

# Models
LLAMA3_8B = "Llama3-8b-8192"
LLAMA3_70B = "Llama3-70b-8192"
Mixtral = "mixtral-8x7b-32768"

def custom_completion_to_prompt(completion: str) -> str:
    return completion_to_prompt(
        completion, system_prompt=(
            "You are a Q&A assistant. Your goal is to answer questions as "
            "accurately as possible based on the instructions and context provided."
        ),
    )

def getMutationEffect(cancer_name, gene_name):
    searchWords = "(" + str(cancer_name) + ") AND " + "(" + str(gene_name) + ") AND (treatment)"
    studies = search(searchWords)
    df, abstracts = GetPubmedSummaryDf(studies)
    # Define the LLM
    llm = Groq(
        model=LLAMA3_8B,
        temperature=0.01,
        context_window=4096,
        completion_to_prompt=custom_completion_to_prompt,
        messages_to_prompt=messages_to_prompt,)
    # Set the global service context
    ctx = ServiceContext.from_defaults(llm=llm)
    set_global_service_context(ctx)
    documents = [Document(text=t) for t in abstracts[:10]]
    index = SummaryIndex.from_documents(documents)
    query_engine = index.as_query_engine(response_mode="tree_summarize")
    prompt = "Please prepare a single summary of the abstracts of the following papers. Pay particular attention to the {} gene".format(gene_name)
    response = query_engine.query(prompt)
    return response

demo = gr.Interface(fn=getMutationEffect,
                    inputs=[gr.Textbox(label="CancerName"),
                            gr.Textbox(label="GeneName"),
                            ],
                    outputs="text")


if __name__ == "__main__":
    demo.launch()
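For a quick check without the Gradio UI, the underlying function can be called directly; this assumes GROQ_API_KEY, OPENAI_API_KEY, and MAIL_ADRESS are set and the pinned llama-index release that still provides ServiceContext. The inputs are illustrative.

from app import getMutationEffect

summary = getMutationEffect("non-small cell lung cancer", "EGFR")
print(summary)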
environment.yml ADDED
@@ -0,0 +1,369 @@
name: gradio
channels:
  - conda-forge
dependencies:
  - aiofiles=23.2.1
  - altair=5.3.0
  - annotated-types=0.7.0
  - anyio=4.3.0
  - aom=3.9.0
  - appnope=0.1.4
  - argon2-cffi=23.1.0
  - argon2-cffi-bindings=21.2.0
  - arrow=1.3.0
  - asttokens=2.4.1
  - async-lru=2.0.4
  - attrs=23.2.0
  - babel=2.14.0
  - beautifulsoup4=4.12.3
  - bleach=6.1.0
  - blinker=1.8.2
  - brotli=1.1.0
  - brotli-bin=1.1.0
  - brotli-python=1.1.0
  - bzip2=1.0.8
  - ca-certificates=2024.6.2
  - cached-property=1.5.2
  - cached_property=1.5.2
  - cairo=1.18.0
  - certifi=2024.2.2
  - cffi=1.16.0
  - charset-normalizer=3.3.2
  - click=8.1.7
  - colorama=0.4.6
  - comm=0.2.2
  - contourpy=1.2.1
  - cycler=0.12.1
  - dav1d=1.2.1
  - debugpy=1.8.1
  - decorator=5.1.1
  - defusedxml=0.7.1
  - dnspython=2.6.1
  - email-validator=2.1.1
  - email_validator=2.1.1
  - entrypoints=0.4
  - exceptiongroup=1.2.0
  - executing=2.0.1
  - expat=2.6.2
  - fastapi=0.111.0
  - fastapi-cli=0.0.4
  - ffmpeg=7.0.1
  - ffmpy=0.3.0
  - filelock=3.14.0
  - flask=3.0.3
  - font-ttf-dejavu-sans-mono=2.37
  - font-ttf-inconsolata=3.000
  - font-ttf-source-code-pro=2.038
  - font-ttf-ubuntu=0.83
  - fontconfig=2.14.2
  - fonts-conda-ecosystem=1
  - fonts-conda-forge=1
  - fonttools=4.53.0
  - fqdn=1.5.1
  - freetype=2.12.1
  - fribidi=1.0.10
  - fsspec=2024.6.0
  - gettext=0.22.5
  - gettext-tools=0.22.5
  - gmp=6.3.0
  - gnutls=3.7.9
  - gradio=4.33.0
  - gradio-client=0.17.0
  - graphite2=1.3.13
  - h11=0.14.0
  - h2=4.1.0
  - harfbuzz=8.5.0
  - hpack=4.0.0
  - httpcore=1.0.5
  - httpx=0.27.0
  - huggingface_hub=0.23.2
  - hyperframe=6.0.1
  - icu=73.2
  - idna=3.7
  - importlib-metadata=7.1.0
  - importlib-resources=6.4.0
  - importlib_metadata=7.1.0
  - importlib_resources=6.4.0
  - ipykernel=6.29.3
  - ipython=8.25.0
  - ipywidgets=8.1.3
  - isoduration=20.11.0
  - itsdangerous=2.2.0
  - jedi=0.19.1
  - jinja2=3.1.4
  - joblib=1.4.2
  - json5=0.9.25
  - jsonpointer=2.4
  - jsonschema=4.22.0
  - jsonschema-specifications=2023.12.1
  - jsonschema-with-format-nongpl=4.22.0
  - jupyter=1.0.0
  - jupyter-lsp=2.2.5
  - jupyter_client=8.6.2
  - jupyter_console=6.6.3
  - jupyter_core=5.7.2
  - jupyter_events=0.10.0
  - jupyter_server=2.14.1
  - jupyter_server_terminals=0.5.3
  - jupyterlab=4.2.1
  - jupyterlab_pygments=0.3.0
  - jupyterlab_server=2.27.2
  - jupyterlab_widgets=3.0.11
  - kiwisolver=1.4.5
  - krb5=1.21.2
  - lame=3.100
  - lcms2=2.16
  - lerc=4.0.0
  - libabseil=20240116.2
  - libasprintf=0.22.5
  - libasprintf-devel=0.22.5
  - libass=0.17.1
  - libblas=3.9.0
  - libbrotlicommon=1.1.0
  - libbrotlidec=1.1.0
  - libbrotlienc=1.1.0
  - libcblas=3.9.0
  - libcxx=17.0.6
  - libdeflate=1.20
  - libedit=3.1.20191231
  - libexpat=2.6.2
  - libffi=3.4.2
  - libgettextpo=0.22.5
  - libgettextpo-devel=0.22.5
  - libgfortran=5.0.0
  - libgfortran5=13.2.0
  - libglib=2.80.2
  - libhwloc=2.10.0
  - libiconv=1.17
  - libidn2=2.3.7
  - libintl=0.22.5
  - libintl-devel=0.22.5
  - libjpeg-turbo=3.0.0
  - liblapack=3.9.0
  - libopenblas=0.3.27
  - libopenvino=2024.1.0
  - libopenvino-arm-cpu-plugin=2024.1.0
  - libopenvino-auto-batch-plugin=2024.1.0
  - libopenvino-auto-plugin=2024.1.0
  - libopenvino-hetero-plugin=2024.1.0
  - libopenvino-ir-frontend=2024.1.0
  - libopenvino-onnx-frontend=2024.1.0
  - libopenvino-paddle-frontend=2024.1.0
  - libopenvino-pytorch-frontend=2024.1.0
  - libopenvino-tensorflow-frontend=2024.1.0
  - libopenvino-tensorflow-lite-frontend=2024.1.0
  - libopus=1.3.1
  - libpng=1.6.43
  - libprotobuf=4.25.3
  - libsodium=1.0.18
  - libsqlite=3.45.3
  - libtasn1=4.19.0
  - libtiff=4.6.0
  - libunistring=0.9.10
  - libvpx=1.14.0
  - libwebp-base=1.4.0
  - libxcb=1.15
  - libxml2=2.12.7
  - libzlib=1.3.1
  - llvm-openmp=18.1.6
  - markdown-it-py=3.0.0
  - markupsafe=2.1.5
  - matplotlib=3.8.4
  - matplotlib-base=3.8.4
  - matplotlib-inline=0.1.7
  - mdurl=0.1.2
  - mistune=3.0.2
  - munkres=1.1.4
  - natsort=8.4.0
  - nbclient=0.10.0
  - nbconvert=7.16.4
  - nbconvert-core=7.16.4
  - nbconvert-pandoc=7.16.4
  - nbformat=5.10.4
  - ncurses=6.5
  - nest-asyncio=1.6.0
  - nettle=3.9.1
  - notebook=7.2.0
  - notebook-shim=0.2.4
  - numpy=1.26.4
  - openh264=2.4.1
  - openjpeg=2.5.2
  - openssl=3.3.1
  - orjson=3.10.3
  - overrides=7.7.0
  - p11-kit=0.24.1
  - pandas=2.2.2
  - pandoc=3.2
  - pandocfilters=1.5.0
  - parso=0.8.4
  - patsy=0.5.6
  - pcre2=10.43
  - pexpect=4.9.0
  - pickleshare=0.7.5
  - pillow=10.3.0
  - pip=24.0
  - pixman=0.43.4
  - pkgutil-resolve-name=1.3.10
  - platformdirs=4.2.2
  - prometheus_client=0.20.0
  - prompt-toolkit=3.0.42
  - prompt_toolkit=3.0.42
  - psutil=5.9.8
  - pthread-stubs=0.4
  - ptyprocess=0.7.0
  - pugixml=1.14
  - pure_eval=0.2.2
  - pycparser=2.22
  - pydantic=2.7.3
  - pydantic-core=2.18.4
  - pydub=0.25.1
  - pygments=2.18.0
  - pyobjc-core=10.2
  - pyobjc-framework-cocoa=10.2
  - pyparsing=3.1.2
  - pysocks=1.7.1
  - python=3.12.3
  - python-dateutil=2.9.0
  - python-fastjsonschema=2.19.1
  - python-json-logger=2.0.7
  - python-multipart=0.0.9
  - python-tzdata=2024.1
  - python_abi=3.12
  - pytz=2024.1
  - pyyaml=6.0.1
  - pyzmq=26.0.3
  - qtconsole-base=5.5.2
  - qtpy=2.4.1
  - readline=8.2
  - referencing=0.35.1
  - requests=2.32.3
  - rfc3339-validator=0.1.4
  - rfc3986-validator=0.1.1
  - rich=13.7.1
  - rpds-py=0.18.1
  - ruff=0.4.7
  - scikit-learn=1.5.0
  - scipy=1.13.1
  - seaborn=0.13.2
  - seaborn-base=0.13.2
  - semantic_version=2.10.0
  - send2trash=1.8.3
  - setuptools=70.0.0
  - shellingham=1.5.4
  - six=1.16.0
  - snappy=1.2.0
  - sniffio=1.3.1
  - soupsieve=2.5
  - stack_data=0.6.2
  - starlette=0.37.2
  - statsmodels=0.14.2
  - svt-av1=2.1.0
  - tbb=2021.12.0
  - terminado=0.18.1
  - threadpoolctl=3.5.0
  - tinycss2=1.3.0
  - tk=8.6.13
  - tomli=2.0.1
  - tomlkit=0.12.0
  - toolz=0.12.1
  - tornado=6.4
  - tqdm=4.66.4
  - traitlets=5.14.3
  - typer=0.12.3
  - typer-slim=0.12.3
  - typer-slim-standard=0.12.3
  - types-python-dateutil=2.9.0.20240316
  - typing-extensions=4.12.1
  - typing_extensions=4.12.1
  - typing_utils=0.1.0
  - tzdata=2024a
  - ujson=5.10.0
  - uri-template=1.3.0
  - urllib3=2.2.1
  - uvicorn=0.30.1
  - wcwidth=0.2.13
  - webcolors=1.13
  - webencodings=0.5.1
  - websocket-client=1.8.0
  - websockets=11.0.3
  - werkzeug=3.0.3
  - wheel=0.43.0
  - widgetsnbextension=4.0.11
  - wikipedia=1.4.0
  - wtforms=3.1.2
  - x264=1!164.3095
  - x265=3.5
  - xorg-libxau=1.0.11
  - xorg-libxdmcp=1.1.3
  - xz=5.2.6
  - yaml=0.2.5
  - zeromq=4.3.5
  - zipp=3.17.0
  - zlib=1.3.1
  - zstd=1.5.6
  - pip:
      - aiohttp==3.9.5
      - aiosignal==1.3.1
      - bio==1.7.1
      - biopython==1.83
      - biothings-client==0.3.1
      - dataclasses-json==0.6.6
      - deprecated==1.2.14
      - dirtyjson==1.0.8
      - diskcache==5.6.3
      - distro==1.9.0
      - frozenlist==1.4.1
      - gprofiler-official==1.0.0
      - greenlet==3.0.3
      - jsonpatch==1.33
      - langchain==0.2.2
      - langchain-community==0.2.2
      - langchain-core==0.2.4
      - langchain-experimental==0.0.60
      - langchain-openai==0.1.8
      - langchain-text-splitters==0.2.1
      - langsmith==0.1.71
      - llama-cpp-python==0.2.77
      - llama-index==0.10.43
      - llama-index-agent-openai==0.2.7
      - llama-index-cli==0.1.12
      - llama-index-core==0.10.43
      - llama-index-embeddings-openai==0.1.10
      - llama-index-indices-managed-llama-cloud==0.1.6
      - llama-index-legacy==0.9.48
      - llama-index-llms-groq==0.1.4
      - llama-index-llms-llama-cpp==0.1.3
      - llama-index-llms-openai==0.1.22
      - llama-index-llms-openai-like==0.1.3
      - llama-index-llms-replicate==0.1.3
      - llama-index-multi-modal-llms-openai==0.1.6
      - llama-index-program-openai==0.1.6
      - llama-index-question-gen-openai==0.1.3
      - llama-index-readers-file==0.1.23
      - llama-index-readers-llama-parse==0.1.4
      - llama-parse==0.4.4
      - llamaindex-py-client==0.1.19
      - marshmallow==3.21.2
      - multidict==6.0.5
      - mygene==3.2.2
      - mypy-extensions==1.0.0
      - networkx==3.3
      - nltk==3.8.1
      - openai==1.31.0
      - packaging==23.2
      - pooch==1.8.1
      - pypdf==4.2.0
      - pytrials==1.0.0
      - regex==2024.5.15
      - replicate==0.26.0
      - safetensors==0.4.3
      - sqlalchemy==2.0.30
      - striprtf==0.0.26
      - tenacity==8.3.0
      - tiktoken==0.7.0
      - tokenizers==0.19.1
      - transformers==4.41.2
      - typing-inspect==0.9.0
      - wrapt==1.16.0
      - yarl==1.9.4
prefix: /Users/satoc/miniforge3/envs/gradio
requirements.txt ADDED
@@ -0,0 +1,71 @@
aiohttp==3.9.5
aiosignal==1.3.1
bio==1.7.1
biopython==1.83
biothings-client==0.3.1
dataclasses-json==0.6.6
Deprecated==1.2.14
dirtyjson==1.0.8
diskcache==5.6.3
distro==1.9.0
frozenlist==1.4.1
gprofiler-official==1.0.0
greenlet==3.0.3
hpack==4.0.0
jsonpatch==1.33
langchain==0.2.2
langchain-community==0.2.2
langchain-core==0.2.4
langchain-experimental==0.0.60
langchain-openai==0.1.8
langchain-text-splitters==0.2.1
langsmith==0.1.71
llama-index==0.10.43
llama-index-agent-openai==0.2.7
llama-index-cli==0.1.12
llama-index-core==0.10.43
llama-index-embeddings-openai==0.1.10
llama-index-indices-managed-llama-cloud==0.1.6
llama-index-legacy==0.9.48
llama-index-llms-groq==0.1.4
llama-index-llms-llama-cpp==0.1.3
llama-index-llms-openai==0.1.22
llama-index-llms-openai-like==0.1.3
llama-index-llms-replicate==0.1.3
llama-index-multi-modal-llms-openai==0.1.6
llama-index-program-openai==0.1.6
llama-index-question-gen-openai==0.1.3
llama-index-readers-file==0.1.23
llama-index-readers-llama-parse==0.1.4
llama-parse==0.4.4
llama_cpp_python==0.2.77
llamaindex-py-client==0.1.19
marshmallow==3.21.2
multidict==6.0.5
munkres==1.1.4
mygene==3.2.2
mypy-extensions==1.0.0
networkx==3.3
nltk
openai
packaging==23.2
pooch==1.8.1
pypdf==4.2.0
pytrials==1.0.0
regex==2024.5.15
replicate==0.26.0
safetensors
setuptools==70.0.0
SQLAlchemy==2.0.30
striprtf==0.0.26
tenacity==8.3.0
tiktoken==0.7.0
tokenizers==0.19.1
transformers==4.41.2
typer==0.12.3
typer-slim==0.12.3
typing-inspect==0.9.0
wheel==0.43.0
wikipedia==1.4.0
wrapt==1.16.0
yarl==1.9.4