firqaaa committed · e04865c
Parent(s): dea570f

add app.py
app.py
ADDED
@@ -0,0 +1,239 @@
import os
import time
import json
import glob
import shutil
import textwrap
import pdf2image
import pytesseract

import nltk
import openai
import pandas as pd
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.chains import create_extraction_chain

import re
from Bio import Entrez
from tqdm.auto import tqdm

import streamlit as st
from ast import literal_eval

nltk.download('punkt')

# Credentials are read from the environment; never hardcode live API keys in app code.
os.environ['OPENAI_API_KEY'] = os.environ.get('OPENAI_API_KEY', '')
Entrez.email = "[email protected]"
Entrez.api_key = os.environ.get('ENTREZ_API_KEY', '')
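# NCBI E-utilities allows roughly 10 requests/second with an API key
# (3/second without), which matters once get_geneName() below is called
# once per SNP found in a paper.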

fold = -1
# chunk_size = 8000

@st.cache_data
def convert_df(df):
    # Encode the dataframe as CSV bytes for st.download_button; cached so
    # Streamlit reruns don't redo the conversion.
    return df.to_csv().encode('utf-8')


def replace_quotes(text):
    # Normalise quote characters appearing between double quotes so the JSON
    # string-repair step in the extraction loop below is less likely to break.
    pattern = r'(?<=")[^"]*(?=")'
    return re.sub(pattern, lambda match: match.group(0).replace('"', "'"), text)


def clean_text(text):
    """Remove section titles and figure descriptions from text"""
    pattern = r'[^\w\s]'
    # Keep only rows with more than three words that are not figure captions
    # or panel labels, then strip the remaining punctuation.
    clean = "\n".join([row for row in text.split("\n") if (len(row.split(" "))) > 3 and not (row.startswith("(a)")) and not row.startswith("Figure")])
    return re.sub(pattern, '', clean)


def truncate_text(text, max_tokens):
    # NB: TextWrapper counts characters, not tokens; this returns only the
    # first max_tokens-character wrapped segment.
    wrapper = textwrap.TextWrapper(width=max_tokens)
    truncated_text = wrapper.wrap(text)
    if len(truncated_text) > 0:
        return truncated_text[0]
    else:
        return ""


def split_text(text, chunk_size):
    # Split text into consecutive chunk_size-character chunks.
    chunks = []
    start = 0
    end = chunk_size
    while start < len(text):
        chunks.append(text[start:end])
        start = end
        end += chunk_size
    return chunks

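# A quick sanity check of the chunker (hypothetical, not part of the app):
#   split_text("abcdefgh", 3)  ->  ["abc", "def", "gh"]
# Chunks are counted in characters, not model tokens, so the token counts
# offered in the UI below are only a rough proxy for context-window size.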

def extract_gene_name(text):
    # Pull the first <NAME>...</NAME> element out of a raw dbSNP XML payload.
    text_str = text.decode("utf-8")
    text_str = text_str.replace("\\n", "").replace("\\t", "").replace("\\'", "'")
    pattern = r"<NAME>(.*?)</NAME>"
    match = re.search(pattern, text_str)
    if match:
        gene_name = match.group(1)
        return gene_name
    else:
        return None


def get_geneName(rsid):
    # Resolve an rsID to a gene symbol via NCBI dbSNP; returns None when the
    # record carries no gene annotation.
    text = Entrez.efetch(db="snp", id=rsid, retmode='xml').read()
    text = extract_gene_name(text)
    return text

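# Hypothetical usage: get_geneName("rs334") fetches the dbSNP record for the
# sickle-cell variant and would return the annotated gene symbol ("HBB");
# callers below wrap it in try/except because it can also return None.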

def split_text_into_sentences(text, num_sentences):
    sentences = nltk.sent_tokenize(text)
    grouped_sentences = [sentences[i:i+num_sentences] for i in range(0, len(sentences), num_sentences)]
    return grouped_sentences


def flatten_list(nested_list):
    flattened_list = []
    for item in nested_list:
        if isinstance(item, list):
            flattened_list.extend(flatten_list(item))
        else:
            flattened_list.append(item)
    return flattened_list


def move_file(source_path, destination_path):
    # Make sure the destination folder exists before moving the file
    if not os.path.exists(destination_path):
        os.makedirs(destination_path)

    try:
        shutil.move(source_path, destination_path)
        print(f"File moved successfully from '{source_path}' to '{destination_path}'.")
    except Exception as e:
        print(f"Error: {e}")


llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-16k-0613")

schema = {
    "properties" : {
        "title" : {"type" : "string"},
        "author" : {"type" : "string"},
        "publisher" : {"type" : "string"},
        "publication_year" : {"type" : "string"},
        "gene_codes" : {"type" : "string"},
        "population_race" : {"type" : "string"},
        "phenotypes_or_diseases" : {"type" : "string"},
        "sample_size" : {"type" : "string"},
        "SNPs" : {"type" : "string"},
        "Study_Methodology" : {"type" : "string"},
        "Study_Level" : {"type" : "string"},
        "Outcome/Recommendation/Conclusion" : {"type" : "string"}
    },
    "required" : ["title"]
}

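# For reference, chain.run(chunk) with this schema returns a list of dicts,
# one per entity the model extracted, e.g. (a hypothetical result):
#   [{"title": "...", "gene_codes": "APOE,TCF7L2", "SNPs": "rs7412", ...}]
# The extraction loop below keeps only the first dict of each chunk.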
chain = create_extraction_chain(schema, llm)
err_path = []

# Page title
st.set_page_config(page_title="PubMed Paper Extraction")
st.title("PubMed Paper Extraction")

uploaded_file = st.file_uploader('Upload Paper Here : ', type="pdf")
if uploaded_file:
    st.write(f"{uploaded_file.name} successfully uploaded")

chunk_size = st.selectbox(
    'Tokens per process :',
    (16000, 12000, 10000, 8000, 5000)
)

parseButton = st.button("Extract Text")

if parseButton:
    with st.spinner(text='Extraction in progress ...'):
        try:
            images = pdf2image.convert_from_bytes(uploaded_file.getvalue())
            extracted_text = ""
            # OCR every page except the last (typically references).
            for image in images[:-1]:
                text = pytesseract.image_to_string(image)
                text = clean_text(text)
                extracted_text += text + " "

            text = replace_quotes(extracted_text)
            # fold == -1 drops the final (usually partial) chunk.
            text_chunk = split_text(text, chunk_size)[:fold]

            chunkdf = []

            for i, chunk in enumerate(text_chunk):
                inp = chunk
                # Round-trip through json.dumps/literal_eval to coerce the
                # model output into a clean dict; crude, but it repairs stray
                # quote characters in most cases.
                df = pd.DataFrame(literal_eval(str(json.dumps(chain.run(inp)[0])).replace("\'", "\"")), index=[0]).fillna('')
                chunkdf.append(df)

            concat = pd.concat(chunkdf, axis=0).reset_index().drop('index', axis=1).fillna('')
            # Bibliographic fields should be constant across chunks, so take
            # the first chunk's values.
            concat['title'] = concat['title'][0]
            concat['author'] = concat['author'][0]
            concat['publisher'] = concat['publisher'][0]
            concat['publication_year'] = concat['publication_year'][0]
            # concat = concat.min().to_frame().T
            # Keep only well-formed rsIDs and blank out the usual "unknown" fillers.
            concat['SNPs'] = concat['SNPs'].apply(lambda x: x if x.startswith('rs') else '')
            for col in list(concat.columns):
                concat[col] = concat[col].apply(lambda x: x if x not in ['N/A', 'not mentioned', 'Not mentioned', 'Unknown'] else '')

            L = []
            for i in range(len(concat)):
                if (len(concat['gene_codes'][i].split(',')) >= 1) and concat['SNPs'][i] == '':
                    for g in concat['gene_codes'][i].split(','):
                        L.append({
                            'Title' : concat['title'][0],
                            'Author' : concat['author'][0],
                            'Publisher' : concat['publisher'][0],
                            'Publication Year' : concat['publication_year'][0],
                            'Genes' : g.upper(),
                            'Population' : concat['population_race'][i],
                            'Phenotype' : concat['phenotypes_or_diseases'][i].title(),
                            'Sample Size' : concat['sample_size'][i],
                            'SNPs' : concat['SNPs'][i],
                            'Study Methodology' : concat['Study_Methodology'][i].title(),
                            'Study Level' : concat['Study_Level'][i].title(),
                            'Outcomes' : concat['Outcome/Recommendation/Conclusion'][i].capitalize()
                        })
                elif (len(concat['SNPs'][i].split(',')) >= 1):
                    for s in concat['SNPs'][i].split(','):
                        try:
                            L.append({
                                'Title' : concat['title'][0],
                                'Author' : concat['author'][0],
                                'Publisher' : concat['publisher'][0],
                                'Publication Year' : concat['publication_year'][0],
                                'Genes' : get_geneName(s.strip()).upper(),
                                'Population' : concat['population_race'][0],
                                'Phenotype' : concat['phenotypes_or_diseases'][i].title(),
                                'Sample Size' : concat['sample_size'][i],
                                'SNPs' : s,
                                'Study Methodology' : concat['Study_Methodology'][i],
                                'Study Level' : concat['Study_Level'][i].title(),
                                'Outcomes' : concat['Outcome/Recommendation/Conclusion'][i].capitalize()
                            })
                        except Exception as e:
                            print(e)
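            # Net effect: one output row per gene when no rsIDs were extracted,
            # otherwise one row per rsID with the gene symbol resolved through
            # dbSNP (e.g. hypothetical SNPs "rs7412, rs7903146" -> two rows).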
            # result = pd.DataFrame(L)
            st.dataframe(pd.DataFrame(L))
            csv = convert_df(pd.DataFrame(L))

            st.download_button(
                label="Save Result",
                data=csv,
                file_name=str(uploaded_file.name).replace('.pdf', '.csv'),
                mime='text/csv'
            )

        except Exception as e:
            if isinstance(e, json.JSONDecodeError):
                st.write("Sorry, we are experiencing difficulties in extracting the information. Please try again with a different context length.")
                # st.write(e)
            else:
                st.write("Sorry, we are experiencing difficulties in extracting the information. Please ensure that you input an uncorrupted file.")
                # move_file(pdf, "./unprocessed")
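# Runtime note: pdf2image needs the poppler utilities and pytesseract the
# tesseract-ocr binary installed on the host (on a Hugging Face Space, via
# packages.txt); the pip packages alone are not sufficient.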