firqaaa committed · e04865c
Parent(s): dea570f

add app.py
app.py
ADDED
@@ -0,0 +1,239 @@
import os
import time
import json
import glob
import shutil
import textwrap
import pdf2image
import pytesseract

import nltk
import openai
import pandas as pd
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.chains import create_extraction_chain

import re
from Bio import Entrez
from tqdm.auto import tqdm

import streamlit as st
from ast import literal_eval

nltk.download('punkt')

# Credentials are read from the environment; never hardcode live API keys in app code.
os.environ['OPENAI_API_KEY'] = os.environ.get('OPENAI_API_KEY', '')
Entrez.email = "[email protected]"
Entrez.api_key = os.environ.get('ENTREZ_API_KEY', '')
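# NCBI E-utilities allows roughly 10 requests/second with an API key
# (3/second without), which matters once get_geneName() below is called
# once per SNP found in a paper.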

fold = -1
# chunk_size = 8000

@st.cache_data
def convert_df(df):
    # Encode the dataframe as CSV bytes for st.download_button; cached so
    # Streamlit reruns don't redo the conversion.
    return df.to_csv().encode('utf-8')


def replace_quotes(text):
    # Normalise quote characters appearing between double quotes so the JSON
    # string-repair step in the extraction loop below is less likely to break.
    pattern = r'(?<=")[^"]*(?=")'
    return re.sub(pattern, lambda match: match.group(0).replace('"', "'"), text)


def clean_text(text):
    """Remove section titles and figure descriptions from text"""
    pattern = r'[^\w\s]'
    # Keep only rows with more than three words that are not figure captions
    # or panel labels, then strip the remaining punctuation.
    clean = "\n".join([row for row in text.split("\n") if (len(row.split(" "))) > 3 and not (row.startswith("(a)")) and not row.startswith("Figure")])
    return re.sub(pattern, '', clean)


def truncate_text(text, max_tokens):
    # NB: TextWrapper counts characters, not tokens; this returns only the
    # first max_tokens-character wrapped segment.
    wrapper = textwrap.TextWrapper(width=max_tokens)
    truncated_text = wrapper.wrap(text)
    if len(truncated_text) > 0:
        return truncated_text[0]
    else:
        return ""


def split_text(text, chunk_size):
    # Split text into consecutive chunk_size-character chunks.
    chunks = []
    start = 0
    end = chunk_size
    while start < len(text):
        chunks.append(text[start:end])
        start = end
        end += chunk_size
    return chunks

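# A quick sanity check of the chunker (hypothetical, not part of the app):
#   split_text("abcdefgh", 3)  ->  ["abc", "def", "gh"]
# Chunks are counted in characters, not model tokens, so the token counts
# offered in the UI below are only a rough proxy for context-window size.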

def extract_gene_name(text):
    # Pull the first <NAME>...</NAME> element out of a raw dbSNP XML payload.
    text_str = text.decode("utf-8")
    text_str = text_str.replace("\\n", "").replace("\\t", "").replace("\\'", "'")
    pattern = r"<NAME>(.*?)</NAME>"
    match = re.search(pattern, text_str)
    if match:
        gene_name = match.group(1)
        return gene_name
    else:
        return None


def get_geneName(rsid):
    # Resolve an rsID to a gene symbol via NCBI dbSNP; returns None when the
    # record carries no gene annotation.
    text = Entrez.efetch(db="snp", id=rsid, retmode='xml').read()
    text = extract_gene_name(text)
    return text

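# Hypothetical usage: get_geneName("rs334") fetches the dbSNP record for the
# sickle-cell variant and would return the annotated gene symbol ("HBB");
# callers below wrap it in try/except because it can also return None.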

def split_text_into_sentences(text, num_sentences):
    sentences = nltk.sent_tokenize(text)
    grouped_sentences = [sentences[i:i+num_sentences] for i in range(0, len(sentences), num_sentences)]
    return grouped_sentences


def flatten_list(nested_list):
    flattened_list = []
    for item in nested_list:
        if isinstance(item, list):
            flattened_list.extend(flatten_list(item))
        else:
            flattened_list.append(item)
    return flattened_list


def move_file(source_path, destination_path):
    # Make sure the destination folder exists before moving the file
    if not os.path.exists(destination_path):
        os.makedirs(destination_path)

    try:
        shutil.move(source_path, destination_path)
        print(f"File moved successfully from '{source_path}' to '{destination_path}'.")
    except Exception as e:
        print(f"Error: {e}")


llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-16k-0613")

schema = {
    "properties" : {
        "title" : {"type" : "string"},
        "author" : {"type" : "string"},
        "publisher" : {"type" : "string"},
        "publication_year" : {"type" : "string"},
        "gene_codes" : {"type" : "string"},
        "population_race" : {"type" : "string"},
        "phenotypes_or_diseases" : {"type" : "string"},
        "sample_size" : {"type" : "string"},
        "SNPs" : {"type" : "string"},
        "Study_Methodology" : {"type" : "string"},
        "Study_Level" : {"type" : "string"},
        "Outcome/Recommendation/Conclusion" : {"type" : "string"}
    },
    "required" : ["title"]
}

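# For reference, chain.run(chunk) with this schema returns a list of dicts,
# one per entity the model extracted, e.g. (a hypothetical result):
#   [{"title": "...", "gene_codes": "APOE,TCF7L2", "SNPs": "rs7412", ...}]
# The extraction loop below keeps only the first dict of each chunk.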
chain = create_extraction_chain(schema, llm)
err_path = []

# Page title
st.set_page_config(page_title="PubMed Paper Extraction")
st.title("PubMed Paper Extraction")

uploaded_file = st.file_uploader('Upload Paper Here : ', type="pdf")
if uploaded_file:
    st.write(f"{uploaded_file.name} successfully uploaded")

chunk_size = st.selectbox(
    'Tokens per process :',
    (16000, 12000, 10000, 8000, 5000)
)

parseButton = st.button("Extract Text")

if parseButton:
    with st.spinner(text='Extraction in progress ...'):
        try:
            images = pdf2image.convert_from_bytes(uploaded_file.getvalue())
            extracted_text = ""
            # OCR every page except the last (typically references).
            for image in images[:-1]:
                text = pytesseract.image_to_string(image)
                text = clean_text(text)
                extracted_text += text + " "

            text = replace_quotes(extracted_text)
            # fold == -1 drops the final (usually partial) chunk.
            text_chunk = split_text(text, chunk_size)[:fold]

            chunkdf = []

            for i, chunk in enumerate(text_chunk):
                inp = chunk
                # Round-trip through json.dumps/literal_eval to coerce the
                # model output into a clean dict; crude, but it repairs stray
                # quote characters in most cases.
                df = pd.DataFrame(literal_eval(str(json.dumps(chain.run(inp)[0])).replace("\'", "\"")), index=[0]).fillna('')
                chunkdf.append(df)

            concat = pd.concat(chunkdf, axis=0).reset_index().drop('index', axis=1).fillna('')
            # Bibliographic fields should be constant across chunks, so take
            # the first chunk's values.
            concat['title'] = concat['title'][0]
            concat['author'] = concat['author'][0]
            concat['publisher'] = concat['publisher'][0]
            concat['publication_year'] = concat['publication_year'][0]
            # concat = concat.min().to_frame().T
            # Keep only well-formed rsIDs and blank out the usual "unknown" fillers.
            concat['SNPs'] = concat['SNPs'].apply(lambda x: x if x.startswith('rs') else '')
            for col in list(concat.columns):
                concat[col] = concat[col].apply(lambda x: x if x not in ['N/A', 'not mentioned', 'Not mentioned', 'Unknown'] else '')

            L = []
            for i in range(len(concat)):
                if (len(concat['gene_codes'][i].split(',')) >= 1) and concat['SNPs'][i] == '':
                    for g in concat['gene_codes'][i].split(','):
                        L.append({
                            'Title' : concat['title'][0],
                            'Author' : concat['author'][0],
                            'Publisher' : concat['publisher'][0],
                            'Publication Year' : concat['publication_year'][0],
                            'Genes' : g.upper(),
                            'Population' : concat['population_race'][i],
                            'Phenotype' : concat['phenotypes_or_diseases'][i].title(),
                            'Sample Size' : concat['sample_size'][i],
                            'SNPs' : concat['SNPs'][i],
                            'Study Methodology' : concat['Study_Methodology'][i].title(),
                            'Study Level' : concat['Study_Level'][i].title(),
                            'Outcomes' : concat['Outcome/Recommendation/Conclusion'][i].capitalize()
                        })
                elif (len(concat['SNPs'][i].split(',')) >= 1):
                    for s in concat['SNPs'][i].split(','):
                        try:
                            L.append({
                                'Title' : concat['title'][0],
                                'Author' : concat['author'][0],
                                'Publisher' : concat['publisher'][0],
                                'Publication Year' : concat['publication_year'][0],
                                'Genes' : get_geneName(s.strip()).upper(),
                                'Population' : concat['population_race'][0],
                                'Phenotype' : concat['phenotypes_or_diseases'][i].title(),
                                'Sample Size' : concat['sample_size'][i],
                                'SNPs' : s,
                                'Study Methodology' : concat['Study_Methodology'][i],
                                'Study Level' : concat['Study_Level'][i].title(),
                                'Outcomes' : concat['Outcome/Recommendation/Conclusion'][i].capitalize()
                            })
                        except Exception as e:
                            print(e)
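            # Net effect: one output row per gene when no rsIDs were extracted,
            # otherwise one row per rsID with the gene symbol resolved through
            # dbSNP (e.g. hypothetical SNPs "rs7412, rs7903146" -> two rows).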
            # result = pd.DataFrame(L)
            st.dataframe(pd.DataFrame(L))
            csv = convert_df(pd.DataFrame(L))

            st.download_button(
                label="Save Result",
                data=csv,
                file_name=str(uploaded_file.name).replace('.pdf', '.csv'),
                mime='text/csv'
            )

        except Exception as e:
            if isinstance(e, json.JSONDecodeError):
                st.write("Sorry, we are experiencing difficulties in extracting the information. Please try again with a different context length.")
                # st.write(e)
            else:
                st.write("Sorry, we are experiencing difficulties in extracting the information. Please ensure that you input an uncorrupted file.")
                # move_file(pdf, "./unprocessed")
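# Runtime note: pdf2image needs the poppler utilities and pytesseract the
# tesseract-ocr binary installed on the host (on a Hugging Face Space, via
# packages.txt); the pip packages alone are not sufficient.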