#!/usr/bin/env python
# coding: utf-8
"""Gradio demo that highlights drug, disease and gene mentions in free text.

A token-classification checkpoint is loaded from ``model_checkpoint``, wrapped
in a Hugging Face ``ner`` pipeline, and its predictions are rendered as
displaCy entity markup inside a Gradio Blocks page.
"""

import re

from datasets import Dataset, ClassLabel, Sequence, load_dataset, load_metric
import numpy as np
import pandas as pd
import bioc
from spacy import displacy
import transformers
#import evaluate
from transformers import (AutoModelForTokenClassification,
                          AutoTokenizer,
                          DataCollatorForTokenClassification,
                          pipeline,
                          TrainingArguments,
                          Trainer)
import gradio as gr


# --- Model setup -----------------------------------------------------------

# Tag vocabulary; position in this list is the class id used by the model.
label_list = ['O', 'B-DRUG', 'I-DRUG', 'B-DISEASE', 'I-DISEASE', 'B-GENE', 'I-GENE']

model_checkpoint = './trainedSB2'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))
effect_ner_model = pipeline(task="ner", model=model, tokenizer=tokenizer)

# Highlight colour per tag; B- and I- tags of one entity type share a colour.
ENTITY_COLORS = {
    "B-DRUG": "#f08080",
    "I-DRUG": "#f08080",
    "B-DISEASE": "#9bddff",
    "I-DISEASE": "#9bddff",
    "B-GENE": "#008080",
    "I-GENE": "#008080",
}

# Matches the full integer suffix of a tag such as "LABEL_12".
_TRAILING_INDEX = re.compile(r"(\d+)$")


# --- Inference and rendering -----------------------------------------------

def _entity_to_label(entity):
    """Translate a pipeline ``entity`` tag into an entry of ``label_list``.

    Tags that already are entries of ``label_list`` are returned unchanged.
    Otherwise the tag must end in an integer (e.g. ``LABEL_3``), which is used
    as an index into ``label_list``. The whole trailing integer is parsed, so
    indices of ten or more are handled.

    Raises:
        ValueError: if the tag is neither a known label nor ends in digits.
        IndexError: if the trailing integer is outside ``label_list``.
    """
    if entity in label_list:
        return entity
    match = _TRAILING_INDEX.search(entity)
    if match is None:
        raise ValueError(f"Unrecognised entity tag from NER pipeline: {entity!r}")
    return label_list[int(match.group(1))]


def visualize_entities(sentence):
    """Run NER over ``sentence`` and return displaCy entity markup as HTML.

    Args:
        sentence: Text to tag.

    Returns:
        The HTML string produced by ``displacy.render`` in manual ``ent`` mode.
    """
    outside_label = label_list[0]
    entities = []
    for prediction in effect_ner_model(sentence):
        label = _entity_to_label(prediction["entity"])
        if label == outside_label:
            continue
        # Copy rather than mutate the pipeline's own dict. The remaining keys
        # are passed through unchanged; displaCy's manual mode is assumed to
        # find the character offsets it needs ("start"/"end") among them.
        entities.append({**prediction, "label": label})

    params = [{"text": sentence, "ents": entities, "title": None}]

    # jupyter=False: always return the markup. With auto-detection, running
    # inside a notebook would display the result and return None instead.
    html = displacy.render(
        params,
        style="ent",
        manual=True,
        jupyter=False,
        options={"colors": ENTITY_COLORS},
    )
    return html


# --- User interface --------------------------------------------------------

exampleList = [
    'Famotidine is a histamine H2-receptor antagonist used in inpatient settings for prevention of stress ulcers and is showing increasing popularity because of its low cost.',
    'A randomized Phase III trial demonstrated noninferiority of APF530 500 mg SC ( granisetron 10 mg ) to intravenous palonosetron 0.25 mg in preventing CINV in patients receiving MEC or HEC in acute ( 0 - 24 hours ) and delayed ( 24 - 120 hours ) settings , with activity over 120 hours .',
    'What are the known interactions between Aspirin and the COX-1 enzyme?',
    'Can you explain the mechanism of action of Metformin and its effect on the AMPK pathway?',
    'Are there any genetic variations in the CYP2C9 gene that may influence the response to Warfarin therapy?',
    'I am curious about the role of Herceptin in targeting the HER2/neu protein in breast cancer treatment. How does it work?',
    'What are the common side effects associated with Lisinopril, an angiotensin-converting enzyme (ACE) inhibitor?',
    'Can you explain the significance of the BCR-ABL fusion protein in the context of Imatinib therapy for chronic myeloid leukemia (CML)?',
    'How does Ibuprofen affect the COX-2 enzyme compared to COX-1?',
    'Are there any recent studies exploring the use of Pembrolizumab as an immune checkpoint inhibitor targeting PD-1?',
    'I have heard about the SLC6A4 gene and its association with serotonin reuptake inhibitors (SSRIs) like Fluoxetine.',
    'Could you provide insights into the BRAF mutation and its relevance in response to Vemurafenib treatment in melanoma patients?',
]

# Markdown shown under the page title. Built from adjacent literals only to
# keep source lines short; the resulting text is unchanged.
footer = (
    " LLMGeneLinker uses a domain-specific transformer like SciBERT finetuned"
    " on AllenAI drug dataset, BC5CDR disease, NCBI disease, DrugProt and"
    " GeneTAG datasets. The resulting SciBERT model performs Named Entity"
    " Recognition to tag drug, protein, gene, diseases in input text."
    " Sentence embedding of SciBERT is then fed into BERT This was made"
    " during the LLMs for Bio Hackathon organised by 4Catalyzer and"
    " SGInnovate.\n"
    "Made by Team GeneLink (Nicholas, Yew Chong, Ting Wei, Brendan\n"
    "Note: Performance is noted to be poorer on genes, acronyms, and"
    " receptors (named entities that may be targets for drugs or genes)"
    " Original notebook adapted from"
    " jsylee/scibert_scivocab_uncased-finetuned-ner "
)

with gr.Blocks() as demo:
    gr.Markdown("## LLMGeneLinker (LGL)")
    gr.Markdown(footer)

    txt = gr.Textbox(label="Input", lines=2)
    txt_3 = gr.HTML(label="Output")
    btn = gr.Button(value="Submit")
    btn.click(visualize_entities, inputs=txt, outputs=txt_3)

    gr.Markdown("## Text Examples")
    gr.Examples(
        examples=[[example] for example in exampleList],
        inputs=txt,
        outputs=txt_3,
        fn=visualize_entities,
        cache_examples=False,
        run_on_click=True,
    )


if __name__ == "__main__":
    demo.launch()