高橋慧 committed
Commit 5703564 · 1 Parent(s): cebc20f

Move chromedriver to Git LFS

Files changed:
- .gitignore +135 -0
- OpenAITools/.ipynb_checkpoints/ECarteTools-checkpoint.py +73 -0
- OpenAITools/.ipynb_checkpoints/ExpertTools-checkpoint.py +245 -0
- OpenAITools/ECarteTools.py +73 -0
- OpenAITools/ExpertTools.py +243 -0
- OpenAITools/ReviewPaperTools.py +42 -0
- OpenAITools/scrapeThisData.py +237 -0
- README.md +4 -4
- app.py +51 -0
- environment.yml +369 -0
- requirements.txt +71 -0
.gitignore
ADDED
@@ -0,0 +1,135 @@
# Add any directories, files, or patterns you don't want to be tracked by version control


# Byte-compiled / optimized / DLL files
__pycache__/
#*.py[cod]
#*$py.class
#*.txt
#*.tsv
#*.csv
*.xlsx
*.pdf
*.nii
#*.nii.gz
*.DS_Store
#*.png
#*.pyn
*.jpg
*.nii.gz
*.pkl
*-checkpoint.ipynb
*.pkls
*.pth
*.yaml
*.ckpt
# C extensions
#*.so

# Distribution / packaging
#.Python
#build/
#develop-eggs/
#dist/
#downloads/
#eggs/
#.eggs/
#lib/
#lib64/
#parts/
#sdist/
#var/
#wheels/
#*.egg-info/
#.installed.cfg
#*.egg
#MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
#*.manifest
#*.spec

# Installer logs
#pip-log.txt
#pip-delete-this-directory.txt

# Unit test / coverage reports
#htmlcov/
#.tox/
#.coverage
#.coverage.*
#.cache
#nosetests.xml
#coverage.xml
#*.cover
#.hypothesis/
#.pytest_cache/

# Translations
#*.mo
#*.pot

# Django stuff:
#*.log
#.static_storage/
#.media/
#local_settings.py

# Flask stuff:
#instance/
#.webassets-cache

# Scrapy stuff:
#.scrapy

# Sphinx documentation
#docs/_build/

# PyBuilder
#target/

# Jupyter Notebook
.ipynb_checkpoint/*

# pyenv
#.python-version

# celery beat schedule file
#celerybeat-schedule

# SageMath parsed files
#*.sage.py

# Environments
#.env
#.venv
#env/
#venv/
#ENV/
#env.bak/
#venv.bak/

# Spyder project settings
#.spyderproject
#.spyproject

# Rope project settings
#.ropeproject

# mkdocs documentation
#/site
/models/
# mypy
#.mypy_cache/
#over 100MB

# Add any directories, files, or patterns you don't want to be tracked by version control


#deep settings
*.h5

.OpenAITools/chromedriver
/OpenAITools/chromedriver
OpenAITools/.ipynb_checkpoints/ECarteTools-checkpoint.py
ADDED
@@ -0,0 +1,73 @@
import openai
import time
import wikipedia
import random
import re
import requests
from bs4 import BeautifulSoup
import os
import glob
from natsort import natsorted
import requests
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET
import pandas as pd

wikipedia.set_lang("ja")
# APIキーの設定
openai.api_key = os.environ['OPENAI_API_KEY']
engine="gpt-3.5-turbo"


def generate(system_template,prompt,engine="gpt-3.5-turbo"):
    while True: #OpenAI APIが落ちてる時に無限リトライするので注意
        try:
            response = openai.ChatCompletion.create(
                model=engine,
                messages=[
                    {"role": "system", "content": system_template},
                    {"role": "user", "content":prompt},
                ]
            )
            result=response["choices"][0]["message"]["content"]
            return result
        except:
            print("リトライ")
            time.sleep(30)
            pass

def generate_carte(prompt,engine="gpt-3.5-turbo"):
    while True: #OpenAI APIが落ちてる時に無限リトライするので注意
        try:
            response = openai.ChatCompletion.create(
                model=engine,
                messages=[
                    {"role": "system", "content": "You are useful assistant"},
                    {"role": "user", "content":"%s\n・・・という患者と医師の会話をSOAP形式のカルテとして日本語で端的にまとめて下さい。各セクションはS),O), A),P)として下さい "%prompt},
                ]
            )
            result=response["choices"][0]["message"]["content"]
            return result
        except:
            print("リトライ")
            time.sleep(30)
            pass

def get_selected_fileds(texts):
    input_name = texts.replace(' ' , "+")
    corona_fields = ct.get_study_fields(
        search_expr="%s SEARCH[Location](AREA[LocationCountry]Japan AND AREA[LocationStatus]Recruiting)"%(input_name),
        fields=["NCTId", "Condition", "BriefTitle",'BriefSummary','EligibilityCriteria'],
        max_studies=500,
        fmt="csv")
    return corona_fields

def get_retriever_str(fields):
    retriever_str=''
    for i in range(1,len(fields)):
        colnames = fields[0]
        targetCol = fields[i]
        for f in range(len(fields[0])):
            retriever_str+=colnames[f] + ":" + targetCol[f] +"\n"
        retriever_str+='\n'
    return retriever_str
OpenAITools/.ipynb_checkpoints/ExpertTools-checkpoint.py
ADDED
@@ -0,0 +1,245 @@
import os
import openai
import time
import wikipedia
import random
import re
import requests
from bs4 import BeautifulSoup
import os
import glob
from natsort import natsorted
import requests
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET
from pytrials.client import ClinicalTrials
from Bio import Entrez
import pandas as pd
import numpy as np
import time
#from langchain.agents import create_pandas_dataframe_agent
from langchain_experimental.agents import create_pandas_dataframe_agent
from langchain.llms import OpenAI

# APIキーの設定
openai.api_key = os.environ['OPENAI_API_KEY']
gptengine="gpt-3.5-turbo"


"""def get_selected_fileds(texts):
    ct = ClinicalTrials()
    input_name = texts.replace(' ' , "+")
    corona_fields = ct.get_study_fields(
        search_expr="%s SEARCH[Location](AREA[LocationCountry]Japan AND AREA[LocationStatus]Recruiting)"%(input_name),
        fields=["NCTId", "Condition", "BriefTitle",'BriefSummary','EligibilityCriteria'],
        max_studies=500,
        fmt="csv")
    return corona_fields"""

def get_retriever_str(fields):
    retriever_str=''
    for i in range(1,len(fields)):
        colnames = fields[0]
        targetCol = fields[i]
        for f in range(len(fields[0])):
            retriever_str+=colnames[f] + ":" + targetCol[f] +"\n"
        retriever_str+='\n'
    return retriever_str

def get_chanked_retriever(fields):
    retriever_list =[]
    for i in range(1,len(fields)):
        retriever_str=''
        colnames = fields[0]
        targetCol = fields[i]
        for f in range(len(fields[0])):
            retriever_str+=colnames[f] + ":" + targetCol[f] +"\n"
        retriever_list.append(retriever_str)
    return retriever_list

from pytrials.client import ClinicalTrials
def get_selected_fields(texts, split_criteria=False,
                        split_word_number = False, split_number=700):
    ct = ClinicalTrials()
    input_name = texts.replace(' ', "+")
    corona_fields = ct.get_study_fields(
        search_expr="%s SEARCH[Location](AREA[LocationCountry]Japan AND AREA[LocationStatus]Recruiting)" % (input_name),
        fields=["NCTId", "Condition", "BriefTitle", 'BriefSummary', 'EligibilityCriteria'],
        max_studies=500,
        fmt="csv")

    if split_criteria:
        new_fields = []

        # 検索対象の文字列
        target_string1 = 'Exclusion Criteria'
        target_string2 = 'Exclusion criteria'

        # 各要素で検索対象の文字列を探し、直前で分割して新しいリストに格納
        for corona_field in corona_fields:
            new_list = []
            for item in corona_field:
                if target_string1 in item:
                    split_position = item.index(target_string1)
                    new_list.append(item[:split_position])
                    new_list.append(item[split_position:])
                elif target_string2 in item:
                    split_position = item.index(target_string2)
                    new_list.append(item[:split_position])
                    new_list.append(item[split_position:])
                else:
                    new_list.append(item)
            new_fields.append(new_list)
    else:
        new_fields = corona_fields

    if split_word_number:
        split_fields = []
        for new_field in new_fields:
            new_list= []

            # 各要素を調べて、700文字以上であれば分割し、新しいリストに格納
            for item in new_field:
                item_length = len(item)
                if item_length > split_number:
                    num_parts = -(-item_length // split_number)  # 向上の除算を用いて分割数を計算
                    for i in range(num_parts):
                        start_index = i * split_number
                        end_index = min((i + 1) * split_number, item_length)  # 文字列の終わりを超えないように調整
                        new_list.append(item[start_index:end_index])
                else:
                    new_list.append(item)

            split_fields.append(new_list)
        new_fields = split_fields

    return new_fields


def print_agent_results(df, Ids,
                        interesteds = ['Condition', 'BriefTitle', 'BriefSummary', 'EligibilityCriteria'],
                        translater=None):
    results = ""
    for Id in Ids:
        print("%s\n"%Id)
        sdf = df[df['NCTId'] == Id]
        for interested in interesteds:
            # 最初の要素を取得
            results += '%s: \n %s \n' % (interested, sdf[interested].iloc[0])
            #print('%s: \n %s \n' % (interested, sdf[interested].iloc[0]))
        if translater:
            to_be_printed = translater.translate(results)
        else:
            to_be_printed =results
        print(to_be_printed)

def search(query):
    Entrez.email = os.getenv('MAIL_ADRESS')
    #Entrez.email='[email protected]'
    handle = Entrez.esearch(db='pubmed',
                            sort = 'relevance',
                            retmax = '20',
                            retmode = 'xml',
                            term = query)
    results = Entrez.read(handle)
    return results

def fetch_details(id_list):
    ids = ','.join(id_list)
    Entrez.email = os.getenv('MAIL_ADRESS')
    #Entrez.email = '[email protected]'
    handle = Entrez.efetch(db = 'pubmed',
                           retmode = 'xml',
                           id = ids)
    results = Entrez.read(handle)
    return results
'''def generate(prompt,engine=None):
    if engine is None:
        engine=gptengine
    while True: #OpenAI APIが落ちてる時に無限リトライするので注意
        try:
            response = openai.ChatCompletion.create(
                model=engine,
                messages=[
                    {"role": "system", "content": "You are useful assistant"},
                    {"role": "user", "content":prompt},
                ]
            )
            result=response["choices"][0]["message"]["content"]
            return result
        except Exception as e:
            print(e)
            print("リトライ")
            time.sleep(30)
            pass
'''

def generate(prompt,engine=None):
    if engine is None:
        engine=gptengine
    while True: #OpenAI APIが落ちてる時に無限リトライするので注意
        try:
            response = openai.chat.completions.create(
                model=engine,
                messages=[
                    {"role": "system", "content": "You are useful assistant"},
                    {"role": "user", "content":prompt},
                ]
            )
            #result=response["choices"][0]["message"]["content"]
            result=response.choices[0].message.content
            return result
        except Exception as e:
            print(e)
            print("リトライ")
            time.sleep(30)
            pass

def GetPubmedSummaryDf(studies):
    title_list= []
    abstract_list=[]
    journal_list = []
    language_list =[]
    pubdate_year_list = []
    pubdate_month_list = []
    studiesIdList = studies['IdList']
    chunk_size = 10000
    for chunk_i in range(0, len(studiesIdList), chunk_size):
        chunk = studiesIdList[chunk_i:chunk_i + chunk_size]

        try:
            papers = fetch_details(chunk)
            for i, paper in enumerate(papers['PubmedArticle']):
                title_list.append(paper['MedlineCitation']['Article']['ArticleTitle'])
                try:
                    abstract_list.append(paper['MedlineCitation']['Article']['Abstract']['AbstractText'][0])
                except:
                    abstract_list.append('No Abstract')
                journal_list.append(paper['MedlineCitation']['Article']['Journal']['Title'])
                language_list.append(paper['MedlineCitation']['Article']['Language'][0])
                try:
                    pubdate_year_list.append(paper['MedlineCitation']['Article']['Journal']['JournalIssue']['PubDate']['Year'])
                except:
                    pubdate_year_list.append('No Data')
                try:
                    pubdate_month_list.append(paper['MedlineCitation']['Article']['Journal']['JournalIssue']['PubDate']['Month'])
                except:
                    pubdate_month_list.append('No Data')
        except: # occasionally a chunk might annoy your parser
            pass
    df = pd.DataFrame(list(zip(
        title_list, abstract_list, journal_list, language_list, pubdate_year_list,
        pubdate_month_list)),
        columns=['Title', 'Abstract', 'Journal', 'Language', 'Year','Month'])
    return df, abstract_list

def ClinicalAgent(fileds, verbose=False):
    df = pd.DataFrame.from_records(fileds[1:], columns=fileds[0])
    return create_pandas_dataframe_agent(OpenAI(temperature=0, model='gpt-3.5-turbo-16k'), df, verbose=verbose)

def GetNCTID(results):
    # NCTで始まる単語を検索する正規表現
    pattern = r'\bNCT\d+\b'
    # 正規表現を使って単語を抽出
    nct_words = re.findall(pattern,results)
    return nct_words
OpenAITools/ECarteTools.py
ADDED
@@ -0,0 +1,73 @@
import openai
import time
import wikipedia
import random
import re
import requests
from bs4 import BeautifulSoup
import os
import glob
from natsort import natsorted
import requests
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET
import pandas as pd

wikipedia.set_lang("ja")
# APIキーの設定
openai.api_key = os.environ['OPENAI_API_KEY']
engine="gpt-3.5-turbo"


def generate(system_template,prompt,engine="gpt-3.5-turbo"):
    while True: #OpenAI APIが落ちてる時に無限リトライするので注意
        try:
            response = openai.ChatCompletion.create(
                model=engine,
                messages=[
                    {"role": "system", "content": system_template},
                    {"role": "user", "content":prompt},
                ]
            )
            result=response["choices"][0]["message"]["content"]
            return result
        except:
            print("リトライ")
            time.sleep(30)
            pass

def generate_carte(prompt,engine="gpt-3.5-turbo"):
    while True: #OpenAI APIが落ちてる時に無限リトライするので注意
        try:
            response = openai.ChatCompletion.create(
                model=engine,
                messages=[
                    {"role": "system", "content": "You are useful assistant"},
                    {"role": "user", "content":"%s\n・・・という患者と医師の会話をSOAP形式のカルテとして日本語で端的にまとめて下さい。各セクションはS),O), A),P)として下さい "%prompt},
                ]
            )
            result=response["choices"][0]["message"]["content"]
            return result
        except:
            print("リトライ")
            time.sleep(30)
            pass

def get_selected_fileds(texts):
    input_name = texts.replace(' ' , "+")
    corona_fields = ct.get_study_fields(
        search_expr="%s SEARCH[Location](AREA[LocationCountry]Japan AND AREA[LocationStatus]Recruiting)"%(input_name),
        fields=["NCTId", "Condition", "BriefTitle",'BriefSummary','EligibilityCriteria'],
        max_studies=500,
        fmt="csv")
    return corona_fields

def get_retriever_str(fields):
    retriever_str=''
    for i in range(1,len(fields)):
        colnames = fields[0]
        targetCol = fields[i]
        for f in range(len(fields[0])):
            retriever_str+=colnames[f] + ":" + targetCol[f] +"\n"
        retriever_str+='\n'
    return retriever_str
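A minimal usage sketch of the carte helper above (not part of the commit; the conversation text is illustrative, and the legacy openai<1.0 ChatCompletion interface plus an OPENAI_API_KEY environment variable are assumed):

# Hypothetical example: summarize a doctor-patient conversation into a SOAP-format
# note with ECarteTools.generate_carte (conversation text is made up).
from OpenAITools.ECarteTools import generate_carte

conversation = "医師: 今日はどうされましたか。 患者: 昨日から頭痛が続いています。"
print(generate_carte(conversation))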
OpenAITools/ExpertTools.py
ADDED
@@ -0,0 +1,243 @@
import os
import openai
import time
import wikipedia
import random
import re
import requests
from bs4 import BeautifulSoup
import os
import glob
from natsort import natsorted
import requests
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET
from pytrials.client import ClinicalTrials
from Bio import Entrez
import pandas as pd
import numpy as np
import time
#from langchain.agents import create_pandas_dataframe_agent
from langchain_experimental.agents import create_pandas_dataframe_agent
from langchain.llms import OpenAI

# APIキーの設定
openai.api_key = os.environ['OPENAI_API_KEY']
gptengine="gpt-3.5-turbo"


"""def get_selected_fileds(texts):
    ct = ClinicalTrials()
    input_name = texts.replace(' ' , "+")
    corona_fields = ct.get_study_fields(
        search_expr="%s SEARCH[Location](AREA[LocationCountry]Japan AND AREA[LocationStatus]Recruiting)"%(input_name),
        fields=["NCTId", "Condition", "BriefTitle",'BriefSummary','EligibilityCriteria'],
        max_studies=500,
        fmt="csv")
    return corona_fields"""

def get_retriever_str(fields):
    retriever_str=''
    for i in range(1,len(fields)):
        colnames = fields[0]
        targetCol = fields[i]
        for f in range(len(fields[0])):
            retriever_str+=colnames[f] + ":" + targetCol[f] +"\n"
        retriever_str+='\n'
    return retriever_str

def get_chanked_retriever(fields):
    retriever_list =[]
    for i in range(1,len(fields)):
        retriever_str=''
        colnames = fields[0]
        targetCol = fields[i]
        for f in range(len(fields[0])):
            retriever_str+=colnames[f] + ":" + targetCol[f] +"\n"
        retriever_list.append(retriever_str)
    return retriever_list

from pytrials.client import ClinicalTrials
def get_selected_fields(texts, split_criteria=False,
                        split_word_number = False, split_number=700):
    ct = ClinicalTrials()
    input_name = texts.replace(' ', "+")
    corona_fields = ct.get_study_fields(
        search_expr="%s SEARCH[Location](AREA[LocationCountry]Japan AND AREA[LocationStatus]Recruiting)" % (input_name),
        fields=["NCTId", "Condition", "BriefTitle", 'BriefSummary', 'EligibilityCriteria'],
        max_studies=500,
        fmt="csv")

    if split_criteria:
        new_fields = []

        # 検索対象の文字列
        target_string1 = 'Exclusion Criteria'
        target_string2 = 'Exclusion criteria'

        # 各要素で検索対象の文字列を探し、直前で分割して新しいリストに格納
        for corona_field in corona_fields:
            new_list = []
            for item in corona_field:
                if target_string1 in item:
                    split_position = item.index(target_string1)
                    new_list.append(item[:split_position])
                    new_list.append(item[split_position:])
                elif target_string2 in item:
                    split_position = item.index(target_string2)
                    new_list.append(item[:split_position])
                    new_list.append(item[split_position:])
                else:
                    new_list.append(item)
            new_fields.append(new_list)
    else:
        new_fields = corona_fields

    if split_word_number:
        split_fields = []
        for new_field in new_fields:
            new_list= []

            # 各要素を調べて、700文字以上であれば分割し、新しいリストに格納
            for item in new_field:
                item_length = len(item)
                if item_length > split_number:
                    num_parts = -(-item_length // split_number)  # 向上の除算を用いて分割数を計算
                    for i in range(num_parts):
                        start_index = i * split_number
                        end_index = min((i + 1) * split_number, item_length)  # 文字列の終わりを超えないように調整
                        new_list.append(item[start_index:end_index])
                else:
                    new_list.append(item)

            split_fields.append(new_list)
        new_fields = split_fields

    return new_fields


def print_agent_results(df, Ids,
                        interesteds = ['Condition', 'BriefTitle', 'BriefSummary', 'EligibilityCriteria'],
                        translater=None):
    results = ""
    for Id in Ids:
        print("%s\n"%Id)
        sdf = df[df['NCTId'] == Id]
        for interested in interesteds:
            # 最初の要素を取得
            results += '%s: \n %s \n' % (interested, sdf[interested].iloc[0])
            #print('%s: \n %s \n' % (interested, sdf[interested].iloc[0]))
        if translater:
            to_be_printed = translater.translate(results)
        else:
            to_be_printed =results
        print(to_be_printed)

def search(query):
    Entrez.email = os.getenv('MAIL_ADRESS')
    handle = Entrez.esearch(db='pubmed',
                            sort = 'relevance',
                            retmax = '20',
                            retmode = 'xml',
                            term = query)
    results = Entrez.read(handle)
    return results

def fetch_details(id_list):
    ids = ','.join(id_list)
    Entrez.email = os.getenv('MAIL_ADRESS')
    handle = Entrez.efetch(db = 'pubmed',
                           retmode = 'xml',
                           id = ids)
    results = Entrez.read(handle)
    return results
'''def generate(prompt,engine=None):
    if engine is None:
        engine=gptengine
    while True: #OpenAI APIが落ちてる時に無限リトライするので注意
        try:
            response = openai.ChatCompletion.create(
                model=engine,
                messages=[
                    {"role": "system", "content": "You are useful assistant"},
                    {"role": "user", "content":prompt},
                ]
            )
            result=response["choices"][0]["message"]["content"]
            return result
        except Exception as e:
            print(e)
            print("リトライ")
            time.sleep(30)
            pass
'''

def generate(prompt,engine=None):
    if engine is None:
        engine=gptengine
    while True: #OpenAI APIが落ちてる時に無限リトライするので注意
        try:
            response = openai.chat.completions.create(
                model=engine,
                messages=[
                    {"role": "system", "content": "You are useful assistant"},
                    {"role": "user", "content":prompt},
                ]
            )
            #result=response["choices"][0]["message"]["content"]
            result=response.choices[0].message.content
            return result
        except Exception as e:
            print(e)
            print("リトライ")
            time.sleep(30)
            pass

def GetPubmedSummaryDf(studies):
    title_list= []
    abstract_list=[]
    journal_list = []
    language_list =[]
    pubdate_year_list = []
    pubdate_month_list = []
    studiesIdList = studies['IdList']
    chunk_size = 10000
    for chunk_i in range(0, len(studiesIdList), chunk_size):
        chunk = studiesIdList[chunk_i:chunk_i + chunk_size]

        try:
            papers = fetch_details(chunk)
            for i, paper in enumerate(papers['PubmedArticle']):
                title_list.append(paper['MedlineCitation']['Article']['ArticleTitle'])
                try:
                    abstract_list.append(paper['MedlineCitation']['Article']['Abstract']['AbstractText'][0])
                except:
                    abstract_list.append('No Abstract')
                journal_list.append(paper['MedlineCitation']['Article']['Journal']['Title'])
                language_list.append(paper['MedlineCitation']['Article']['Language'][0])
                try:
                    pubdate_year_list.append(paper['MedlineCitation']['Article']['Journal']['JournalIssue']['PubDate']['Year'])
                except:
                    pubdate_year_list.append('No Data')
                try:
                    pubdate_month_list.append(paper['MedlineCitation']['Article']['Journal']['JournalIssue']['PubDate']['Month'])
                except:
                    pubdate_month_list.append('No Data')
        except: # occasionally a chunk might annoy your parser
            pass
    df = pd.DataFrame(list(zip(
        title_list, abstract_list, journal_list, language_list, pubdate_year_list,
        pubdate_month_list)),
        columns=['Title', 'Abstract', 'Journal', 'Language', 'Year','Month'])
    return df, abstract_list

def ClinicalAgent(fileds, verbose=False):
    df = pd.DataFrame.from_records(fileds[1:], columns=fileds[0])
    return create_pandas_dataframe_agent(OpenAI(temperature=0, model='gpt-3.5-turbo-16k'), df, verbose=verbose)

def GetNCTID(results):
    # NCTで始まる単語を検索する正規表現
    pattern = r'\bNCT\d+\b'
    # 正規表現を使って単語を抽出
    nct_words = re.findall(pattern,results)
    return nct_words
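A sketch of how the ClinicalTrials helpers above fit together (not part of the commit; the search term and question are illustrative, and the agent.run call assumes the LangChain agent interface returned by create_pandas_dataframe_agent):

# Hypothetical example: fetch recruiting studies in Japan, wrap them in a
# pandas-dataframe agent, and pull NCT IDs out of the agent's answer.
from OpenAITools.ExpertTools import get_selected_fields, ClinicalAgent, GetNCTID

fields = get_selected_fields("breast cancer")          # header row + one row per study
agent = ClinicalAgent(fields, verbose=True)
answer = agent.run("Which studies mention HER2 in their eligibility criteria?")
print(GetNCTID(answer))                                # e.g. ['NCT01234567', ...]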
OpenAITools/ReviewPaperTools.py
ADDED
@@ -0,0 +1,42 @@
import re
import pandas as pd

def parse_text_file(text):
    # セクションを分割するための正規表現パターンを定義
    # \d+ は1つ以上の数字にマッチします
    pattern = re.compile(r'\n\n\n\d+\.')

    # テキストをセクションごとに分割
    sections = pattern.split(text)[1:]  # 最初の空のセクションを除外

    # 各セクションの前後の空白を削除
    sections = [section.strip() for section in sections]

    return sections

def split_sections(text):
    contents = text.split('\n\n')
    contents = [section.strip() for section in contents if section.strip()]
    if len(contents) == 8 :
        keys = ['PublishInfo', 'Title', 'AuthorName', 'AuthorInfo', 'Abstract', 'Copyrights', 'DOI', 'COI']
    elif len(contents) == 7 :
        keys = ['PublishInfo', 'Title', 'AuthorName', 'AuthorInfo', 'Abstract', 'Copyrights', 'DOI']
    elif len(contents) == 6:
        keys = ['PublishInfo', 'Title', 'AuthorName', 'AuthorInfo', 'Abstract', 'DOI']
    elif len(contents) == 5:
        keys = ['PublishInfo', 'Title', 'AuthorName', 'Abstract', 'DOI']

    # 辞書を作成し、キーが存在しない場合は空の文字列を設定
    section_dict = {key: contents[i] if i < len(contents) else "" for i, key in enumerate(keys)}
    return section_dict


def GetSummaryDf(textdir):
    with open(textdir, 'r', encoding='utf-8') as f:
        content = f.read()
    sections = parse_text_file(content)
    dicts = []
    for section in sections:
        splited_dic = split_sections(section)
        dicts.append(splited_dic)
    return pd.DataFrame(dicts)
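For context, a small sketch of how GetSummaryDf above is meant to be called (not part of the commit; the file name is hypothetical, and it assumes a PubMed-style text export whose records follow the blank-line/numbered pattern handled by parse_text_file):

# Hypothetical example: turn an exported abstract file into a DataFrame,
# one row per paper with the keys assigned in split_sections.
from OpenAITools.ReviewPaperTools import GetSummaryDf

summary_df = GetSummaryDf("pubmed_abstracts.txt")
print(summary_df[["Title", "DOI"]].head())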
OpenAITools/scrapeThisData.py
ADDED
@@ -0,0 +1,237 @@
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By

import requests
from bs4 import BeautifulSoup
import re

import os
import time

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
import chromedriver_autoinstaller

class ScrapeThatData:

    def __init__(self, time_threshold = 10):

        try:
            chrome_options = webdriver.ChromeOptions()
            chrome_options.add_argument('--no-sandbox')
            self.driver = webdriver.Chrome(options=chrome_options)

        except:
            chromedriver_autoinstaller.install()
            chrome_options = webdriver.ChromeOptions()
            chrome_options.add_argument('--no-sandbox')
            self.driver = webdriver.Chrome(options=chrome_options)


        self.wait = WebDriverWait(self.driver,time_threshold)
        self.attribute_dict = {'status':1 ,'conditions':2, 'interventions': 3, 'study type':4,
                               'phase':5, 'sponsor':6, 'funder type':7 , 'study design': 8,
                               'outcome measures':9, 'number enrolled':10, 'sex':11, 'age':12,
                               'nct number': 13, 'other ids':14, 'title acronym': 15 , 'study start': 16,
                               'primary completion': 17, 'study completion': 18 , 'first posted': 19,
                               'last update posted': 20 , 'results first posted': 21 , 'locations':22, 'study documents': 23}

        self.status_dict = {'not yet recruiting' : 'notYetRecrCB',
                            'recruiting' : 'recruitingCB',
                            'enrolling by invitation':'enrollingByInvCB',
                            'active, not recruiting': 'activeCB',
                            'suspended': 'suspendedCB',
                            'terminated':'terminatedCB',
                            'completed':'completedCB',
                            'withdrawn': 'withdrawnCB',
                            'unknown status': 'unknownCB'}

    def clicking_show_hide_cols(self, driver):
        columns = driver.find_element(By.XPATH,'//*[@id="theDataTable_wrapper"]/div[3]/button')
        action_chain = ActionChains(driver)
        action_chain.move_to_element(columns).click()
        action_chain.perform()

    def select_attributes_to_show(self, listed_attributes, attribute_dict):
        ll = [value.lower() for value in listed_attributes if value.lower() in ['status', 'conditions', 'interventions', 'locations']]
        if ll:
            to_show = [value.lower() for value in listed_attributes if value.lower() not in ll]
            to_hide = [value for value in ['status', 'conditions', 'interventions', 'locations'] if value not in ll]
            to_click = to_hide + to_show
            for att in to_click:
                self.clicking_show_hide_cols(self.driver)
                time.sleep(1)
                self.wait.until(EC.presence_of_element_located((By.XPATH,'//*[@id="theDataTable_wrapper"]/div[3]/div[2]/button['+ str(attribute_dict[att]) + ']'))).click()
                time.sleep(1)
        else:
            for att in listed_attributes:
                self.clicking_show_hide_cols(self.driver)
                time.sleep(1)
                self.wait.until(EC.presence_of_element_located((By.XPATH,'//*[@id="theDataTable_wrapper"]/div[3]/div[2]/button['+ str(attribute_dict[att.lower()]) + ']'))).click()
                time.sleep(1)

    def select_by_status(self, listed_states, status_dict):
        if listed_states:
            for status in listed_states:
                self.driver.find_element(By.ID,status_dict[status.lower()]).click()

            self.driver.find_element(By.XPATH,'//*[@id="FiltersBody"]/div[1]/input[1]').click()
            time.sleep(3)


        select = Select(self.driver.find_element_by_name('theDataTable_length'))
        select.select_by_value('100')

    def collect_data_search_page(self,l_ordered, amount_of_data = None):

        class_name = ''
        page_index = 1

        elements = [l_ordered]

        while 'disabled' not in class_name :

            time.sleep(10)

            print('Getting data from page {}'.format(page_index))

            #Counting how many rows of the table appear
            table = self.driver.find_element(By.ID,'theDataTable')
            row_count = len(table.find_elements(By.TAG_NAME,"tr"))

            #Looping table page
            for index in range(1, row_count):
                row = []
                if 'status' in l_ordered:
                    self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,'#theDataTable > tbody > tr:nth-child('+str(index)+') > td:nth-child(3)')))
                    status_element = self.driver.find_elements(By.CLASS_NAME,'#theDataTable > tbody > tr:nth-child('+str(index)+') > td:nth-child(3) > span')
                    row.append(status_element.text.strip())
                    for i, val in enumerate(l_ordered):
                        if val == 'status':
                            continue

                        self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,'#theDataTable > tbody > tr:nth-child('+str(index)+') > td:nth-child('+str(4+i)+')')))
                        element = self.driver.find_elements(By.CLASS_NAME,'#theDataTable > tbody > tr:nth-child('+str(index)+') > td:nth-child('+str(4+i)+')')
                        try:
                            row.append(element.text.strip())
                        except:
                            print(i, element)
                else:
                    for i, val in enumerate(l_ordered):
                        self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,'#theDataTable > tbody > tr:nth-child('+str(index)+') > td:nth-child('+str(3+i)+')')))
                        element = self.driver.find_elements(By.CLASS_NAME,'#theDataTable > tbody > tr:nth-child('+str(index)+') > td:nth-child('+str(3+i)+')')
                        try:
                            row.append(element.text.strip())
                        except:
                            print(i, element)
                elements.append(row)


            #Getting next page button
            next_page= self.driver.find_element(By.ID,"theDataTable_next")

            #Getting the class attribute of the next page button
            class_name = next_page.get_attribute('class')

            #Going to the next page
            next_page.click()
            page_index += 1

            if amount_of_data:
                if len(elements) >= amount_of_data or row_count < amount_of_data :
                    break
            else:
                continue

        return elements

    def get_criteria(self, NCTnumber):

        url = 'https://clinicaltrials.gov/ct2/show/' + NCTnumber
        ClinicalTrialpage = requests.get(url)
        soup = BeautifulSoup(ClinicalTrialpage.text, 'html.parser')

        wrapping_crit_class = soup.find_all("div", {"class": "tr-indent2"})
        list_elements = wrapping_crit_class[1].find_all(re.compile("(ul|ol)"))
        inclusion, exclusion = ('','')


        if not list_elements:
            print ("WARNING: Study number " + NCTnumber + " doesn't have eligibility criteria or HTML tag format is not a list")
        else:

            if len(list_elements) == 1:
                try:
                    if wrapping_crit_class[1].find(text = 'Inclusion Criteria:'):
                        inclusion = list_elements[0].find_all("li")

                    elif wrapping_crit_class[1].find(text = 'Exclusion Criteria:'):
                        exclusion = list_elements[0].find_all("li")
                except:
                    print('criteria doesnt exist')
            else:
                inclusion = list_elements[0].find_all("li")
                exclusion = list_elements[1].find_all("li")


        inclusion = ' '.join([t.text.strip() for t in inclusion ])
        exclusion = ' '.join([t.text.strip() for t in exclusion ])

        return(inclusion, exclusion)

    #function that gets number of patients enrolled in a study
    def get_enrollment (self, NCTnumber):
        url = 'https://clinicaltrials.gov/ct2/show/' + NCTnumber
        ClinicalTrialpage = requests.get(url)
        soup = BeautifulSoup(ClinicalTrialpage.text, 'html.parser')
        enrollment = ''
        wrapping_enrol_class = soup.find_all('td', {'headers':'studyInfoColData','style':"padding-left:1em"})
        if not wrapping_enrol_class:
            print('WARNING: Number of Participants in Study number '+ NCTnumber +' is unavailable')
        else:
            enrollment = wrapping_enrol_class[1]
            enrollment = enrollment.text.split()[0]
            if enrollment.isdigit() == False:
                print ('WARNING: Number of Participants in Study number '+ NCTnumber +' is unavailable')
            else:
                return(enrollment)


    def __call__(self, condition, listed_attributes, listed_states, amount_of_data):

        self.driver.get('https://clinicaltrials.gov/ct2/results?cond=' + condition + '&rank=1&view=record#rowId0')
        self.select_attributes_to_show(listed_attributes, self.attribute_dict)

        try:
            self.select_by_status(listed_states, self.status_dict)
        except:
            print('select by status is a problem')
        n = []
        for i in listed_attributes:
            n.append(self.attribute_dict[i.lower()])
        attribute_ordered = [list(self.attribute_dict.keys())[list(self.attribute_dict.values()).index(i)]for i in sorted(n)]

        search_data = self.collect_data_search_page(attribute_ordered, amount_of_data=amount_of_data)
        nct_numbers = [e[search_data[0].index('nct number')] for e in search_data[1:]]
        search_data[0].extend(['inclusion', 'exclusion', 'enrollment'])
        for index, nct in enumerate(nct_numbers):
            if index % 100 == 0 and index!= 0:
                print("Collected Data from {} Studies: ".format(index))

            inc, exc = self.get_criteria(nct)
            enrol = self.get_enrollment(nct)
            search_data[index + 1].extend([inc, exc, enrol])
        return search_data
        # except:
        #     print('no data available with the specified status')
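A usage sketch for the scraper class above (not part of the commit; the condition, attribute names, and status filter are illustrative, and a local Chrome/chromedriver setup is assumed):

# Hypothetical example: scrape recruiting studies for one condition from the
# classic clinicaltrials.gov results table; attribute/status names must be keys
# of attribute_dict and status_dict, and 'nct number' must be requested.
from OpenAITools.scrapeThisData import ScrapeThatData

scraper = ScrapeThatData(time_threshold=10)
rows = scraper("glioblastoma", ["nct number", "conditions", "title acronym"], ["recruiting"], 100)
# rows[0] is the header row; each study row is extended with inclusion, exclusion, enrollment.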
README.md
CHANGED
@@ -1,10 +1,10 @@
 ---
 title: GetMutationInfo
-emoji:
-colorFrom:
-colorTo:
+emoji: 🔥
+colorFrom: green
+colorTo: green
 sdk: gradio
-sdk_version: 4.
+sdk_version: 4.33.0
 app_file: app.py
 pinned: false
 ---
app.py
ADDED
@@ -0,0 +1,51 @@
from OpenAITools.ExpertTools import GetPubmedSummaryDf, generate, search
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, SummaryIndex
from llama_index.core import Document
from llama_index.llms.groq import Groq
from llama_index.core import ServiceContext, set_global_service_context
from llama_index.llms.llama_cpp.llama_utils import messages_to_prompt, completion_to_prompt
import gradio as gr

#models
LLAMA3_8B = "Llama3-8b-8192"
LLAMA3_70B = "Llama3-70b-8192"
Mixtral = "mixtral-8x7b-32768"

def custom_completion_to_prompt(completion:str) ->str:
    return completion_to_prompt(
        completion, system_prompt=(
            "You are a Q&A assistant. Your goal is to answer questions as "
            "accurately as possible is the instructions and context provided."
        ),
    )

def getMutationEffect(cancer_name, gene_name):
    searchWords= "(" +str(cancer_name)+ ") AND " + "(" + str(gene_name) + ") AND(treatment)"
    studies = search(searchWords)
    df, abstracts= GetPubmedSummaryDf(studies)
    #Define LLM
    llm = Groq(
        model=LLAMA3_8B,
        temperature=0.01,
        context_window=4096,
        completion_to_prompt=custom_completion_to_prompt,
        messages_to_prompt=messages_to_prompt,)
    #set global service context
    ctx = ServiceContext.from_defaults(llm=llm)
    set_global_service_context(ctx)
    documents = [Document(text=t) for t in abstracts[:10]]
    index = SummaryIndex.from_documents(documents)
    query_engine = index.as_query_engine(response_mode="tree_summarize")
    prompt = "Please prepare a single summary of the abstracts of the following papers. Pay particular attention to the {} gene".format(gene_name)
    response = query_engine.query(prompt)
    return response

demo = gr.Interface(fn=getMutationEffect,
                    inputs=[gr.Textbox(label="CancerName"),
                            gr.Textbox(label="GeneName"),
                            ],
                    outputs="text")


if __name__ == "__main__":
    demo.launch()
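The Space entry point above wires getMutationEffect into a Gradio interface; a direct call would look like this (hypothetical example, not part of the commit: the cancer/gene names are illustrative, OPENAI_API_KEY is required by the PubMed helpers, and a Groq API key is presumably needed for the Groq LLM):

# Hypothetical example: run the handler once without launching the UI.
from app import getMutationEffect

print(getMutationEffect("non-small cell lung cancer", "EGFR"))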
environment.yml
ADDED
@@ -0,0 +1,369 @@
name: gradio
channels:
- conda-forge
dependencies:
- aiofiles=23.2.1
- altair=5.3.0
- annotated-types=0.7.0
- anyio=4.3.0
- aom=3.9.0
- appnope=0.1.4
- argon2-cffi=23.1.0
- argon2-cffi-bindings=21.2.0
- arrow=1.3.0
- asttokens=2.4.1
- async-lru=2.0.4
- attrs=23.2.0
- babel=2.14.0
- beautifulsoup4=4.12.3
- bleach=6.1.0
- blinker=1.8.2
- brotli=1.1.0
- brotli-bin=1.1.0
- brotli-python=1.1.0
- bzip2=1.0.8
- ca-certificates=2024.6.2
- cached-property=1.5.2
- cached_property=1.5.2
- cairo=1.18.0
- certifi=2024.2.2
- cffi=1.16.0
- charset-normalizer=3.3.2
- click=8.1.7
- colorama=0.4.6
- comm=0.2.2
- contourpy=1.2.1
- cycler=0.12.1
- dav1d=1.2.1
- debugpy=1.8.1
- decorator=5.1.1
- defusedxml=0.7.1
- dnspython=2.6.1
- email-validator=2.1.1
- email_validator=2.1.1
- entrypoints=0.4
- exceptiongroup=1.2.0
- executing=2.0.1
- expat=2.6.2
- fastapi=0.111.0
- fastapi-cli=0.0.4
- ffmpeg=7.0.1
- ffmpy=0.3.0
- filelock=3.14.0
- flask=3.0.3
- font-ttf-dejavu-sans-mono=2.37
- font-ttf-inconsolata=3.000
- font-ttf-source-code-pro=2.038
- font-ttf-ubuntu=0.83
- fontconfig=2.14.2
- fonts-conda-ecosystem=1
- fonts-conda-forge=1
- fonttools=4.53.0
- fqdn=1.5.1
- freetype=2.12.1
- fribidi=1.0.10
- fsspec=2024.6.0
- gettext=0.22.5
- gettext-tools=0.22.5
- gmp=6.3.0
- gnutls=3.7.9
- gradio=4.33.0
- gradio-client=0.17.0
- graphite2=1.3.13
- h11=0.14.0
- h2=4.1.0
- harfbuzz=8.5.0
- hpack=4.0.0
- httpcore=1.0.5
- httpx=0.27.0
- huggingface_hub=0.23.2
- hyperframe=6.0.1
- icu=73.2
- idna=3.7
- importlib-metadata=7.1.0
- importlib-resources=6.4.0
- importlib_metadata=7.1.0
- importlib_resources=6.4.0
- ipykernel=6.29.3
- ipython=8.25.0
- ipywidgets=8.1.3
- isoduration=20.11.0
- itsdangerous=2.2.0
- jedi=0.19.1
- jinja2=3.1.4
- joblib=1.4.2
- json5=0.9.25
- jsonpointer=2.4
- jsonschema=4.22.0
- jsonschema-specifications=2023.12.1
- jsonschema-with-format-nongpl=4.22.0
- jupyter=1.0.0
- jupyter-lsp=2.2.5
- jupyter_client=8.6.2
- jupyter_console=6.6.3
- jupyter_core=5.7.2
- jupyter_events=0.10.0
- jupyter_server=2.14.1
- jupyter_server_terminals=0.5.3
- jupyterlab=4.2.1
- jupyterlab_pygments=0.3.0
- jupyterlab_server=2.27.2
- jupyterlab_widgets=3.0.11
- kiwisolver=1.4.5
- krb5=1.21.2
- lame=3.100
- lcms2=2.16
- lerc=4.0.0
- libabseil=20240116.2
- libasprintf=0.22.5
- libasprintf-devel=0.22.5
- libass=0.17.1
- libblas=3.9.0
- libbrotlicommon=1.1.0
- libbrotlidec=1.1.0
- libbrotlienc=1.1.0
- libcblas=3.9.0
- libcxx=17.0.6
- libdeflate=1.20
- libedit=3.1.20191231
- libexpat=2.6.2
- libffi=3.4.2
- libgettextpo=0.22.5
- libgettextpo-devel=0.22.5
- libgfortran=5.0.0
- libgfortran5=13.2.0
- libglib=2.80.2
- libhwloc=2.10.0
- libiconv=1.17
- libidn2=2.3.7
- libintl=0.22.5
- libintl-devel=0.22.5
- libjpeg-turbo=3.0.0
- liblapack=3.9.0
- libopenblas=0.3.27
- libopenvino=2024.1.0
- libopenvino-arm-cpu-plugin=2024.1.0
- libopenvino-auto-batch-plugin=2024.1.0
- libopenvino-auto-plugin=2024.1.0
- libopenvino-hetero-plugin=2024.1.0
- libopenvino-ir-frontend=2024.1.0
- libopenvino-onnx-frontend=2024.1.0
- libopenvino-paddle-frontend=2024.1.0
- libopenvino-pytorch-frontend=2024.1.0
- libopenvino-tensorflow-frontend=2024.1.0
- libopenvino-tensorflow-lite-frontend=2024.1.0
- libopus=1.3.1
- libpng=1.6.43
- libprotobuf=4.25.3
- libsodium=1.0.18
- libsqlite=3.45.3
- libtasn1=4.19.0
- libtiff=4.6.0
- libunistring=0.9.10
- libvpx=1.14.0
- libwebp-base=1.4.0
- libxcb=1.15
- libxml2=2.12.7
- libzlib=1.3.1
- llvm-openmp=18.1.6
- markdown-it-py=3.0.0
- markupsafe=2.1.5
- matplotlib=3.8.4
- matplotlib-base=3.8.4
- matplotlib-inline=0.1.7
- mdurl=0.1.2
- mistune=3.0.2
- munkres=1.1.4
- natsort=8.4.0
- nbclient=0.10.0
- nbconvert=7.16.4
- nbconvert-core=7.16.4
- nbconvert-pandoc=7.16.4
- nbformat=5.10.4
- ncurses=6.5
- nest-asyncio=1.6.0
- nettle=3.9.1
- notebook=7.2.0
- notebook-shim=0.2.4
- numpy=1.26.4
- openh264=2.4.1
- openjpeg=2.5.2
- openssl=3.3.1
- orjson=3.10.3
- overrides=7.7.0
- p11-kit=0.24.1
- pandas=2.2.2
- pandoc=3.2
- pandocfilters=1.5.0
- parso=0.8.4
- patsy=0.5.6
- pcre2=10.43
- pexpect=4.9.0
- pickleshare=0.7.5
- pillow=10.3.0
- pip=24.0
- pixman=0.43.4
- pkgutil-resolve-name=1.3.10
- platformdirs=4.2.2
- prometheus_client=0.20.0
- prompt-toolkit=3.0.42
- prompt_toolkit=3.0.42
- psutil=5.9.8
- pthread-stubs=0.4
- ptyprocess=0.7.0
- pugixml=1.14
- pure_eval=0.2.2
- pycparser=2.22
- pydantic=2.7.3
- pydantic-core=2.18.4
- pydub=0.25.1
- pygments=2.18.0
- pyobjc-core=10.2
- pyobjc-framework-cocoa=10.2
- pyparsing=3.1.2
- pysocks=1.7.1
- python=3.12.3
- python-dateutil=2.9.0
- python-fastjsonschema=2.19.1
- python-json-logger=2.0.7
- python-multipart=0.0.9
- python-tzdata=2024.1
- python_abi=3.12
- pytz=2024.1
- pyyaml=6.0.1
- pyzmq=26.0.3
- qtconsole-base=5.5.2
- qtpy=2.4.1
- readline=8.2
- referencing=0.35.1
- requests=2.32.3
- rfc3339-validator=0.1.4
- rfc3986-validator=0.1.1
- rich=13.7.1
- rpds-py=0.18.1
- ruff=0.4.7
- scikit-learn=1.5.0
- scipy=1.13.1
- seaborn=0.13.2
- seaborn-base=0.13.2
- semantic_version=2.10.0
- send2trash=1.8.3
- setuptools=70.0.0
- shellingham=1.5.4
- six=1.16.0
- snappy=1.2.0
- sniffio=1.3.1
- soupsieve=2.5
- stack_data=0.6.2
- starlette=0.37.2
- statsmodels=0.14.2
- svt-av1=2.1.0
- tbb=2021.12.0
- terminado=0.18.1
- threadpoolctl=3.5.0
- tinycss2=1.3.0
- tk=8.6.13
- tomli=2.0.1
- tomlkit=0.12.0
- toolz=0.12.1
- tornado=6.4
- tqdm=4.66.4
- traitlets=5.14.3
- typer=0.12.3
- typer-slim=0.12.3
- typer-slim-standard=0.12.3
- types-python-dateutil=2.9.0.20240316
- typing-extensions=4.12.1
- typing_extensions=4.12.1
- typing_utils=0.1.0
- tzdata=2024a
- ujson=5.10.0
- uri-template=1.3.0
- urllib3=2.2.1
- uvicorn=0.30.1
- wcwidth=0.2.13
- webcolors=1.13
- webencodings=0.5.1
- websocket-client=1.8.0
- websockets=11.0.3
- werkzeug=3.0.3
- wheel=0.43.0
- widgetsnbextension=4.0.11
- wikipedia=1.4.0
- wtforms=3.1.2
- x264=1!164.3095
- x265=3.5
- xorg-libxau=1.0.11
- xorg-libxdmcp=1.1.3
- xz=5.2.6
- yaml=0.2.5
- zeromq=4.3.5
- zipp=3.17.0
- zlib=1.3.1
- zstd=1.5.6
- pip:
  - aiohttp==3.9.5
  - aiosignal==1.3.1
  - bio==1.7.1
  - biopython==1.83
  - biothings-client==0.3.1
  - dataclasses-json==0.6.6
  - deprecated==1.2.14
  - dirtyjson==1.0.8
  - diskcache==5.6.3
  - distro==1.9.0
  - frozenlist==1.4.1
  - gprofiler-official==1.0.0
  - greenlet==3.0.3
  - jsonpatch==1.33
  - langchain==0.2.2
  - langchain-community==0.2.2
  - langchain-core==0.2.4
  - langchain-experimental==0.0.60
  - langchain-openai==0.1.8
  - langchain-text-splitters==0.2.1
  - langsmith==0.1.71
  - llama-cpp-python==0.2.77
  - llama-index==0.10.43
  - llama-index-agent-openai==0.2.7
  - llama-index-cli==0.1.12
  - llama-index-core==0.10.43
  - llama-index-embeddings-openai==0.1.10
  - llama-index-indices-managed-llama-cloud==0.1.6
  - llama-index-legacy==0.9.48
  - llama-index-llms-groq==0.1.4
  - llama-index-llms-llama-cpp==0.1.3
  - llama-index-llms-openai==0.1.22
  - llama-index-llms-openai-like==0.1.3
  - llama-index-llms-replicate==0.1.3
  - llama-index-multi-modal-llms-openai==0.1.6
  - llama-index-program-openai==0.1.6
  - llama-index-question-gen-openai==0.1.3
  - llama-index-readers-file==0.1.23
  - llama-index-readers-llama-parse==0.1.4
  - llama-parse==0.4.4
  - llamaindex-py-client==0.1.19
  - marshmallow==3.21.2
  - multidict==6.0.5
  - mygene==3.2.2
  - mypy-extensions==1.0.0
  - networkx==3.3
  - nltk==3.8.1
  - openai==1.31.0
  - packaging==23.2
  - pooch==1.8.1
  - pypdf==4.2.0
  - pytrials==1.0.0
  - regex==2024.5.15
  - replicate==0.26.0
  - safetensors==0.4.3
  - sqlalchemy==2.0.30
  - striprtf==0.0.26
  - tenacity==8.3.0
  - tiktoken==0.7.0
  - tokenizers==0.19.1
  - transformers==4.41.2
  - typing-inspect==0.9.0
  - wrapt==1.16.0
  - yarl==1.9.4
prefix: /Users/satoc/miniforge3/envs/gradio
requirements.txt
ADDED
@@ -0,0 +1,71 @@
aiohttp==3.9.5
aiosignal==1.3.1
bio==1.7.1
biopython==1.83
biothings-client==0.3.1
dataclasses-json==0.6.6
Deprecated==1.2.14
dirtyjson==1.0.8
diskcache==5.6.3
distro==1.9.0
frozenlist==1.4.1
gprofiler-official==1.0.0
greenlet==3.0.3
hpack==4.0.0
jsonpatch==1.33
langchain==0.2.2
langchain-community==0.2.2
langchain-core==0.2.4
langchain-experimental==0.0.60
langchain-openai==0.1.8
langchain-text-splitters==0.2.1
langsmith==0.1.71
llama-index==0.10.43
llama-index-agent-openai==0.2.7
llama-index-cli==0.1.12
llama-index-core==0.10.43
llama-index-embeddings-openai==0.1.10
llama-index-indices-managed-llama-cloud==0.1.6
llama-index-legacy==0.9.48
llama-index-llms-groq==0.1.4
llama-index-llms-llama-cpp==0.1.3
llama-index-llms-openai==0.1.22
llama-index-llms-openai-like==0.1.3
llama-index-llms-replicate==0.1.3
llama-index-multi-modal-llms-openai==0.1.6
llama-index-program-openai==0.1.6
llama-index-question-gen-openai==0.1.3
llama-index-readers-file==0.1.23
llama-index-readers-llama-parse==0.1.4
llama-parse==0.4.4
llama_cpp_python==0.2.77
llamaindex-py-client==0.1.19
marshmallow==3.21.2
multidict==6.0.5
munkres==1.1.4
mygene==3.2.2
mypy-extensions==1.0.0
networkx==3.3
nltk
openai
packaging==23.2
pooch==1.8.1
pypdf==4.2.0
pytrials==1.0.0
regex==2024.5.15
replicate==0.26.0
safetensors
setuptools==70.0.0
SQLAlchemy==2.0.30
striprtf==0.0.26
tenacity==8.3.0
tiktoken==0.7.0
tokenizers==0.19.1
transformers==4.41.2
typer==0.12.3
typer-slim==0.12.3
typing-inspect==0.9.0
wheel==0.43.0
wikipedia==1.4.0
wrapt==1.16.0
yarl==1.9.4