firqaaa committed
Commit e04865c · 1 Parent(s): dea570f

add app.py

Files changed (1)
  1. app.py +239 -0
app.py ADDED
@@ -0,0 +1,239 @@
import os
import time
import json
import glob
import shutil
import textwrap
import pdf2image
import pytesseract

import nltk
import openai
import pandas as pd
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.chains import create_extraction_chain

import re
from Bio import Entrez
from tqdm.auto import tqdm

import streamlit as st
from ast import literal_eval

nltk.download('punkt')

# Read credentials from the environment; never hardcode live API keys in source.
os.environ.setdefault('OPENAI_API_KEY', '')
Entrez.email = os.environ.get('ENTREZ_EMAIL', '')
Entrez.api_key = os.environ.get('ENTREZ_API_KEY', '')
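
# Slice index used to drop the last text chunk before extraction (see split_text below).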
fold = -1
# chunk_size = 8000
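
# Cache the CSV encoding of the results table so the download button does not recompute it.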
@st.cache_data
def convert_df(df):
    return df.to_csv().encode('utf-8')

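
# Demote double quotes nested between other quotes to single quotes so the
# extracted text stays friendly to downstream JSON/dict parsing.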
def replace_quotes(text):
    pattern = r'(?<=")[^"]*(?=")'
    return re.sub(pattern, lambda match: match.group(0).replace('"', "'"), text)


def clean_text(text):
    """Remove section titles, panel labels, and figure captions, then strip punctuation."""
    pattern = r'[^\w\s]'
    clean = "\n".join([row for row in text.split("\n") if len(row.split(" ")) > 3 and not row.startswith("(a)") and not row.startswith("Figure")])
    return re.sub(pattern, '', clean)

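
# Despite the name, max_tokens is a character width: textwrap wraps by
# characters, and the first wrapped segment approximates a truncation.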
def truncate_text(text, max_tokens):
    wrapper = textwrap.TextWrapper(width=max_tokens)
    truncated_text = wrapper.wrap(text)
    if len(truncated_text) > 0:
        return truncated_text[0]
    else:
        return ""

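
# Split text into consecutive fixed-size character chunks.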
def split_text(text, chunk_size):
    chunks = []
    start = 0
    end = chunk_size
    while start < len(text):
        chunks.append(text[start:end])
        start = end
        end += chunk_size
    return chunks

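
# Pull the gene name out of the <NAME>...</NAME> tag of a dbSNP XML record.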
def extract_gene_name(text):
    text_str = text.decode("utf-8")
    text_str = text_str.replace("\\n", "").replace("\\t", "").replace("\\'", "'")
    pattern = r"<NAME>(.*?)</NAME>"
    match = re.search(pattern, text_str)
    if match:
        gene_name = match.group(1)
        return gene_name
    else:
        return None

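
# Resolve an rsID to its gene name via NCBI Entrez (db="snp").
# Returns None when the record has no <NAME> tag.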
def get_geneName(rsid):
    text = Entrez.efetch(db="snp", id=rsid, retmode='xml').read()
    text = extract_gene_name(text)
    return text

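
# Group the text into batches of num_sentences sentences using NLTK's tokenizer.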
def split_text_into_sentences(text, num_sentences):
    sentences = nltk.sent_tokenize(text)
    grouped_sentences = [sentences[i:i+num_sentences] for i in range(0, len(sentences), num_sentences)]
    return grouped_sentences

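
# Recursively flatten an arbitrarily nested list.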
def flatten_list(nested_list):
    flattened_list = []
    for item in nested_list:
        if isinstance(item, list):
            flattened_list.extend(flatten_list(item))
        else:
            flattened_list.append(item)
    return flattened_list


def move_file(source_path, destination_path):
    # Make sure the destination folder exists before moving the file
    if not os.path.exists(destination_path):
        os.makedirs(destination_path)

    try:
        shutil.move(source_path, destination_path)
        print(f"File moved successfully from '{source_path}' to '{destination_path}'.")
    except Exception as e:
        print(f"Error: {e}")

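
# LangChain extraction setup: a 16k-context chat model plus the field schema
# the extraction chain should fill in; only "title" is mandatory.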
llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-16k-0613")

schema = {
    "properties" : {
        "title" : {"type" : "string"},
        "author" : {"type" : "string"},
        "publisher" : {"type" : "string"},
        "publication_year" : {"type" : "string"},
        "gene_codes" : {"type" : "string"},
        "population_race" : {"type" : "string"},
        "phenotypes_or_diseases" : {"type" : "string"},
        "sample_size" : {"type" : "string"},
        "SNPs" : {"type" : "string"},
        "Study_Methodology" : {"type" : "string"},
        "Study_Level" : {"type" : "string"},
        "Outcome/Recommendation/Conclusion" : {"type" : "string"}
    },
    "required" : ["title"]
}

chain = create_extraction_chain(schema, llm)
err_path = []

# Page title
st.set_page_config(page_title="PubMed Paper Extraction")
st.title("PubMed Paper Extraction")

uploaded_file = st.file_uploader('Upload Paper Here: ', type="pdf")
if uploaded_file:
    st.write(f"{uploaded_file.name} successfully uploaded")

    chunk_size = st.selectbox(
        'Token amount per process:',
        (16000, 12000, 10000, 8000, 5000)
    )

    parseButton = st.button("Extract Text")

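    # Pipeline: OCR each page, clean and chunk the text, run the extraction
    # chain on every chunk, then expand the merged table into one row per
    # gene or SNP for display and CSV download.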
    if parseButton:
        with st.spinner(text='Extraction in progress ...'):
            try:
                images = pdf2image.convert_from_bytes(uploaded_file.getvalue())
                extracted_text = ""
                # OCR every page except the last.
                for image in images[:-1]:
                    text = pytesseract.image_to_string(image)
                    text = clean_text(text)
                    extracted_text += text + " "

                text = replace_quotes(extracted_text)
                # fold == -1 drops the last (often partial) chunk.
                text_chunk = split_text(text, chunk_size)[:fold]

                chunkdf = []

                for i, chunk in enumerate(text_chunk):
                    inp = chunk
                    # chain.run returns a list of extracted dicts; the first one
                    # maps directly onto a one-row DataFrame.
                    df = pd.DataFrame(chain.run(inp)[0], index=[0]).fillna('')
                    chunkdf.append(df)

                concat = pd.concat(chunkdf, axis=0).reset_index(drop=True).fillna('')
                # Bibliographic fields should agree across chunks; keep the first value.
                concat['title'] = concat['title'][0]
                concat['author'] = concat['author'][0]
                concat['publisher'] = concat['publisher'][0]
                concat['publication_year'] = concat['publication_year'][0]
                # concat = concat.min().to_frame().T
                # Keep only values that look like rsIDs.
                concat['SNPs'] = concat['SNPs'].apply(lambda x: x if x.startswith('rs') else '')
                # Normalize the model's various "unknown" spellings to empty strings.
                for col in list(concat.columns):
                    concat[col] = concat[col].apply(lambda x: x if x not in ['N/A', 'not mentioned', 'Not mentioned', 'Unknown'] else '')

                L = []
                for i in range(len(concat)):
                    # No usable SNP for this chunk: emit one row per reported gene code.
                    if (len(concat['gene_codes'][i].split(',')) >= 1) and concat['SNPs'][i] == '':
                        for g in concat['gene_codes'][i].split(','):
                            L.append({
                                'Title' : concat['title'][0],
                                'Author' : concat['author'][0],
                                'Publisher' : concat['publisher'][0],
                                'Publication Year' : concat['publication_year'][0],
                                'Genes' : g.upper(),
                                'Population' : concat['population_race'][i],
                                'Phenotype' : concat['phenotypes_or_diseases'][i].title(),
                                'Sample Size' : concat['sample_size'][i],
                                'SNPs' : concat['SNPs'][i],
                                'Study Methodology' : concat['Study_Methodology'][i].title(),
                                'Study Level' : concat['Study_Level'][i].title(),
                                'Outcomes' : concat['Outcome/Recommendation/Conclusion'][i].capitalize()
                            })
                    # SNPs present: resolve each rsID to its gene name via Entrez.
                    elif (len(concat['SNPs'][i].split(',')) >= 1):
                        for s in concat['SNPs'][i].split(','):
                            try:
                                L.append({
                                    'Title' : concat['title'][0],
                                    'Author' : concat['author'][0],
                                    'Publisher' : concat['publisher'][0],
                                    'Publication Year' : concat['publication_year'][0],
                                    'Genes' : get_geneName(s.strip()).upper(),
                                    'Population' : concat['population_race'][i],
                                    'Phenotype' : concat['phenotypes_or_diseases'][i].title(),
                                    'Sample Size' : concat['sample_size'][i],
                                    'SNPs' : s,
                                    'Study Methodology' : concat['Study_Methodology'][i].title(),
                                    'Study Level' : concat['Study_Level'][i].title(),
                                    'Outcomes' : concat['Outcome/Recommendation/Conclusion'][i].capitalize()
                                })
                            except Exception as e:
                                # get_geneName returns None for unresolvable rsIDs; skip those rows.
                                print(e)
                # result = pd.DataFrame(L)
                st.dataframe(pd.DataFrame(L))
                csv = convert_df(pd.DataFrame(L))

                st.download_button(
                    label="Save Result",
                    data=csv,
                    file_name=str(uploaded_file.name).replace('.pdf', '.csv'),
                    mime='text/csv'
                )

            except Exception as e:
                if isinstance(e, json.JSONDecodeError):
                    st.write("Sorry, we are experiencing difficulties extracting the information. Please try again with a different context length.")
                    # st.write(e)
                else:
                    st.write("Sorry, we are experiencing difficulties extracting the information. Please ensure the uploaded file is not corrupted.")
                    # move_file(pdf, "./unprocessed")