import os import time import json import glob import shutil import textwrap import pdf2image import pytesseract import nltk import openai import pandas as pd from langchain.chat_models import ChatOpenAI from langchain.prompts import ChatPromptTemplate from langchain.chains import create_extraction_chain import re from Bio import Entrez from tqdm.auto import tqdm import streamlit as st from ast import literal_eval nltk.download('punkt') os.environ['OPENAI_API_KEY'] = "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" Entrez.email = "firqaaa@gmail.com" Entrez.api_key = "3d7a71231fe7f2d2bd7599e022535199a908" fold = -1 # chunk_size = 8000 st.cache_data() def convert_df(df): return df.to_csv().encode('utf-8') def replace_quotes(text): pattern = r'(?<=")[^"]*(?=")' return re.sub(pattern, lambda match: match.group(0).replace('"', "'"), text) def clean_text(text): """Remove section titles and figure descriptions from text""" pattern = r'[^\w\s]' clean = "\n".join([row for row in text.split("\n") if (len(row.split(" "))) > 3 and not (row.startswith("(a)")) and not row.startswith("Figure")]) return re.sub(pattern, '', clean) def truncate_text(text, max_tokens): wrapper = textwrap.TextWrapper(width=max_tokens) truncated_text = wrapper.wrap(text) if len(truncated_text) > 0: return truncated_text[0] else: return "" def split_text(text, chunk_size): chunks = [] start = 0 end = chunk_size while start < len(text): chunks.append(text[start:end]) start = end end += chunk_size return chunks def extract_gene_name(text): text_str = text.decode("utf-8") text_str = text_str.replace("\\n", "").replace("\\t", "").replace("\\'", "'") pattern = r"(.*?)" match = re.search(pattern, text_str) if match: gene_name = match.group(1) return gene_name else: return None def get_geneName(rsid): text = Entrez.efetch(db="snp", id=rsid, retmode='xml').read() text = extract_gene_name(text) return text def split_text_into_sentences(text, num_sentences): sentences = nltk.sent_tokenize(text) grouped_sentences = [sentences[i:i+num_sentences] for i in range(0, len(sentences), num_sentences)] return grouped_sentences def flatten_list(nested_list): flattened_list = [] for item in nested_list: if isinstance(item, list): flattened_list.extend(flatten_list(item)) else: flattened_list.append(item) return flattened_list def move_file(source_path, destination_path): # Make sure the destination folder exists before moving the file if not os.path.exists(destination_path): os.makedirs(destination_path) try: shutil.move(source_path, destination_path) print(f"File moved successfully from '{source_path}' to '{destination_path}'.") except Exception as e: print(f"Error: {e}") llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-16k-0613") schema = { "properties" : { "title" : {"type" : "string"}, "author" : {"type" : "string"}, "publisher" : {"type" : "string"}, "publication_year" : {"type" : "string"}, "gene_codes" : {"type" : "string"}, "population_race" : {"type" : "string"}, "phenotypes_or_diseases" : {"type" : "string"}, "sample_size" : {"type" : "string"}, "SNPs" : {"type" : "string"}, "Study_Methodology" : {"type" : "string"}, "Study_Level" : {"type" : "string"}, "Outcome/Recommendation/Conclusion" : {"type" : "string"} }, "required" : ["title"] } chain = create_extraction_chain(schema, llm) err_path = [] # Page title st.set_page_config(page_title="PubMed Paper Extraction") st.title("PubMed Paper Extraction") uploaded_file = st.file_uploader('Upload Paper Here : ', type="pdf") if uploaded_file: st.write(f"{uploaded_file.name} successfully uploaded") chunk_size = st.selectbox( 'Tokens amounts per process :', (16000, 12000, 10000, 8000, 5000) ) parseButton = st.button("Extract Text") if parseButton: with st.spinner(text='Extraction in progress ...'): try: images = pdf2image.convert_from_bytes(uploaded_file.getvalue()) extracted_text = "" for image in images[:-1]: text = pytesseract.image_to_string(image) text = clean_text(text) extracted_text += text + " " text = replace_quotes(extracted_text) text_chunk = split_text(text, chunk_size)[:fold] chunkdf = [] for i, chunk in enumerate(text_chunk): inp = chunk df = pd.DataFrame(literal_eval(str(json.dumps(chain.run(inp)[0])).replace("\'", "\"")), index=[0]).fillna('') chunkdf.append(df) concat = pd.concat(chunkdf, axis=0).reset_index().drop('index', axis=1).fillna('') concat['title'] = concat['title'][0] concat['author'] = concat['author'][0] concat['publisher'] = concat['publisher'][0] concat['publication_year'] = concat['publication_year'][0] # concat = concat.min().to_frame().T concat['SNPs'] = concat['SNPs'].apply(lambda x: x if x.startswith('rs') else '') for col in list(concat.columns): concat[col] = concat[col].apply(lambda x: x if x not in ['N/A', 'not mentioned', 'Not mentioned', 'Unknown'] else '') L = [] for i in range(len(concat)): if (len(concat['gene_codes'][i].split(',')) >= 1) and concat['SNPs'][i] == '': for g in concat['gene_codes'][i].split(','): L.append({ 'Title' : concat['title'][0], 'Author' : concat['author'][0], 'Publisher' : concat['publisher'][0], 'Publication Year' : concat['publication_year'][0], 'Genes' : g.upper(), 'Population' : concat['population_race'][i], 'Phenotype' : concat['phenotypes_or_diseases'][i].title(), 'Sample Size' : concat['sample_size'][i], 'SNPs' : concat['SNPs'][i], 'Study Methodology' : concat['Study_Methodology'][i].title(), 'Study Level' : concat['Study_Level'][i].title(), 'Outcomes' : concat['Outcome/Recommendation/Conclusion'][i].capitalize() }) elif (len(concat['SNPs'][i].split(',')) >= 1): for s in concat['SNPs'][i].split(','): try: L.append({ 'Title' : concat['title'][0], 'Author' : concat['author'][0], 'Publisher' : concat['publisher'][0], 'Publication Year' : concat['publication_year'][0], 'Genes' : get_geneName(s.strip()).upper(), 'Population' : concat['population_race'][0], 'Phenotype' : concat['phenotypes_or_diseases'][i].title(), 'Sample Size' : concat['sample_size'][i], 'SNPs' : s, 'Study Methodology' : concat['Study_Methodology'][i], 'Study Level' : concat['Study_Level'][i].title(), 'Outcomes' : concat['Outcome/Recommendation/Conclusion'][i].capitalize() }) except Exception as e: print(e) # result = pd.DataFrame(L) st.dataframe(pd.DataFrame(L)) csv = convert_df(pd.DataFrame(L)) st.download_button( label="Save Result", data=csv, file_name=str(uploaded_file.name).replace('.pdf', ''), mime='text/csv' ) except Exception as e: if e == json.JSONDecodeError: st.write("Sorry, we are experiencing difficulties in extracting the information. Please try again with different context length.") # st.write(e) else: st.write("Sorry, we are experiencing difficulties in extracting the information. Please ensure that you input an uncorrupted file.") # move_file(pdf, "./unprocessed")