|
import pandas_profiling as pp |
|
import pandas as pd |
|
import tensorflow as tf |
|
|
|
from datasets import load_dataset |
|
from tensorflow.python.framework import tensor_shape |
|
|
|
|
|
# Clinical terminology code sets, pulled from the Hugging Face Hub at import
# time. Only the "train" split of each CSV-backed dataset is loaded.
datasetLOINC = load_dataset(
    "awacke1/LOINC-CodeSet-Value-Description.csv",
    split="train",
)
datasetSNOMED = load_dataset(
    "awacke1/SNOMED-CT-Code-Value-Semantic-Set.csv",
    split="train",
)
dataseteCQM = load_dataset(
    "awacke1/eCQM-Code-Value-Semantic-Set.csv",
    split="train",
)

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


def _tokenize_description_batch(examples):
    """Tokenize the ``Description`` column of one batch with the module tokenizer."""
    return tokenizer(examples["Description"])


# Tokenize every LOINC description and show the first tokenized record.
dataset = datasetLOINC.map(_tokenize_description_batch, batched=True)
JSONOBJ2 = dataset[0]
print(JSONOBJ2)

# Start-up diagnostics: the LOINC rows whose description starts with
# "Allergy", followed by a summary of each loaded code set.
sw = datasetLOINC.filter(
    lambda example: example["Description"].startswith("Allergy")
)
print(sw, datasetLOINC, datasetSNOMED, dataseteCQM, sep="\n")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os |
|
import json |
|
import numpy as np |
|
import gradio as gr |
|
|
|
# Hugging Face access token, read from the environment (None when unset).
HF_TOKEN = os.getenv("HF_TOKEN")

# Terminology names offered by the checkbox, radio and dropdown widgets.
CHOICES = ["SNOMED", "LOINC", "CQM"]

# Sample nested JSON document, kept as its raw string form.
JSONOBJ = """{"items":{"item":[{"id": "0001","type": null,"is_good": false,"ppu": 0.55,"batters":{"batter":[{ "id": "1001", "type": "Regular" },{ "id": "1002", "type": "Chocolate" },{ "id": "1003", "type": "Blueberry" },{ "id": "1004", "type": "Devil's Food" }]},"topping":[{ "id": "5001", "type": "None" },{ "id": "5002", "type": "Glazed" },{ "id": "5005", "type": "Sugar" },{ "id": "5007", "type": "Powdered Sugar" },{ "id": "5006", "type": "Chocolate with Sprinkles" },{ "id": "5003", "type": "Chocolate" },{ "id": "5004", "type": "Maple" }]}]}}"""
|
|
|
|
|
def profile_dataset(dataset=datasetSNOMED, username="awacke1", token=HF_TOKEN, dataset_name="awacke1/SNOMED-CT-Code-Value-Semantic-Set.csv"):
    """Profile *dataset* and publish the HTML report as a static Hugging Face Space.

    Args:
        dataset: The data to profile. Either a pandas ``DataFrame`` or an
            object exposing ``to_pandas()`` (such as a ``datasets.Dataset``).
        username: Hub account that will own the Space.
        token: Hub access token used to create the Space and upload files.
        dataset_name: Name used for the report title and the Space. Any
            ``owner/`` prefix is dropped when the Space repo id is built.

    Returns:
        A message containing the URL returned by ``create_repo``.
    """
    # Function-scope imports: the module never imports these names itself.
    import tempfile

    from huggingface_hub import create_repo, upload_file

    df = dataset if isinstance(dataset, pd.DataFrame) else dataset.to_pandas()

    # Wide tables get the cheaper "minimal" report.
    profile = pp.ProfileReport(
        df,
        title=f"{dataset_name} Report",
        minimal=len(df.columns) > 15,
    )

    # A repo id has exactly one "/" (owner/name); dataset_name may already
    # carry its own owner prefix, so keep only the final path segment.
    space_name = dataset_name.rsplit("/", 1)[-1]
    repo_id = f"{username}/{space_name}"

    repo_url = create_repo(
        repo_id,
        repo_type="space",
        token=token,
        space_sdk="static",
        private=False,
        exist_ok=True,  # allow regenerating the report for an existing Space
    )

    readme = f"---\ntitle: {dataset_name}\nemoji: ✨\ncolorFrom: green\ncolorTo: red\nsdk: static\npinned: false\ntags:\n- dataset-report\n---"

    # Stage the files in a scratch directory so the application's own
    # README.md / index.html in the working directory are never overwritten.
    with tempfile.TemporaryDirectory() as workdir:
        report_path = os.path.join(workdir, "index.html")
        readme_path = os.path.join(workdir, "README.md")

        profile.to_file(report_path)
        with open(readme_path, "w", encoding="utf-8") as f:
            f.write(readme)

        for local_path, path_in_repo in (
            (report_path, "index.html"),
            (readme_path, "README.md"),
        ):
            upload_file(
                path_or_fileobj=local_path,
                path_in_repo=path_in_repo,
                repo_id=repo_id,
                repo_type="space",
                token=token,
            )

    return f"Your dataset report will be ready at {repo_url}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def concatenate_text(examples):
    """Build the ``text`` field from the code, description and clinical focus.

    The three columns ``Code``, ``Description`` and
    ``Purpose: Clinical Focus`` are joined with ``" \\n "``. Missing values
    (``None``) contribute an empty string and non-string values are converted
    with ``str`` so that empty cells or numeric codes do not raise.

    Args:
        examples: Mapping holding the three columns for a single row.

    Returns:
        ``{"text": <joined string>}``.

    Raises:
        KeyError: If one of the three columns is absent.
    """
    columns = ("Code", "Description", "Purpose: Clinical Focus")
    parts = []
    for column in columns:
        value = examples[column]
        parts.append("" if value is None else str(value))
    return {"text": " \n ".join(parts)}
|
|
|
def cls_pooling(model_output):
    """Return position 0 along the second axis of ``last_hidden_state``.

    Presumably the [CLS] token vector of each sequence in the batch; the
    layout is assumed to be (batch, tokens, hidden) - confirm against the model.
    """
    hidden_states = model_output.last_hidden_state
    first_position = hidden_states[:, 0]
    return first_position
|
|
|
# Checkpoint loaded on demand when the module provides no ``model`` global.
_DEFAULT_EMBEDDING_CKPT = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
_default_embedding_backend = {}


def _load_default_embedding_backend():
    """Return a cached, matching ``(tokenizer, model)`` pair for the default checkpoint."""
    if not _default_embedding_backend:
        from transformers import AutoTokenizer, TFAutoModel

        _default_embedding_backend["tokenizer"] = AutoTokenizer.from_pretrained(
            _DEFAULT_EMBEDDING_CKPT
        )
        _default_embedding_backend["model"] = TFAutoModel.from_pretrained(
            _DEFAULT_EMBEDDING_CKPT, from_pt=True
        )
    return (
        _default_embedding_backend["tokenizer"],
        _default_embedding_backend["model"],
    )


def get_embeddings(text_list):
    """Embed *text_list* and return the pooled model output.

    When the module defines both ``tokenizer`` and ``model`` globals they are
    used as-is. Otherwise (the module never assigns ``model`` at top level) a
    matching tokenizer/model pair is loaded once and reused, instead of
    failing with ``NameError``.

    Args:
        text_list: A string or list of strings accepted by the tokenizer.

    Returns:
        The result of ``cls_pooling`` applied to the model output
        (a TensorFlow tensor, since ``return_tensors="tf"`` is requested).
    """
    module_scope = globals()
    if "model" in module_scope and "tokenizer" in module_scope:
        active_tokenizer = module_scope["tokenizer"]
        active_model = module_scope["model"]
    else:
        active_tokenizer, active_model = _load_default_embedding_backend()

    encoded_input = active_tokenizer(
        text_list, padding=True, truncation=True, return_tensors="tf"
    )
    model_output = active_model(**dict(encoded_input))
    return cls_pooling(model_output)
|
|
|
|
|
# Prefix used when the search box is left empty (also the widget default).
_DEFAULT_LOINC_PREFIX = "Allergy"

# Columns of the LOINC code set that are kept for display and embedding.
_LOINC_COLUMNS_TO_KEEP = (
    "Value Set Name",
    "Code",
    "Description",
    "Purpose: Clinical Focus",
    "Code System OID",
)

_FN_EMBEDDING_CKPT = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
_fn_embedding_backends = {}


def _description_starts_with(dataset, prefix):
    """Return the rows of *dataset* whose ``Description`` starts with *prefix*."""
    return dataset.filter(
        # ``or ""`` keeps rows with a missing description from raising.
        lambda example: (example["Description"] or "").startswith(prefix)
    )


def _load_fn_embedding_backend(model_ckpt=_FN_EMBEDDING_CKPT):
    """Return a cached ``(tokenizer, model)`` pair for *model_ckpt*."""
    if model_ckpt not in _fn_embedding_backends:
        from transformers import AutoTokenizer, TFAutoModel

        _fn_embedding_backends[model_ckpt] = (
            AutoTokenizer.from_pretrained(model_ckpt),
            TFAutoModel.from_pretrained(model_ckpt, from_pt=True),
        )
    return _fn_embedding_backends[model_ckpt]


def _embed_clinical_terms(term_frame):
    """Build the embeddings dataset for the long descriptions in *term_frame*.

    Rows are exploded on ``Purpose: Clinical Focus``, restricted to
    descriptions longer than 15 whitespace-separated words, given a combined
    ``text`` column and finally an ``embeddings`` column.
    """
    from datasets import Dataset

    exploded = term_frame.explode("Purpose: Clinical Focus", ignore_index=True)
    clinical_dataset = Dataset.from_pandas(exploded)
    clinical_dataset = clinical_dataset.map(
        lambda row: {"c_length": len((row["Description"] or "").split())}
    )
    clinical_dataset = clinical_dataset.filter(lambda row: row["c_length"] > 15)
    if len(clinical_dataset) == 0:
        # Nothing to embed: skip loading the model altogether.
        return clinical_dataset

    clinical_dataset = clinical_dataset.map(concatenate_text)
    tokenizer, model = _load_fn_embedding_backend()

    def embed(row):
        encoded = tokenizer(
            row["text"], padding=True, truncation=True, return_tensors="tf"
        )
        return {"embeddings": cls_pooling(model(**dict(encoded))).numpy()[0]}

    return clinical_dataset.map(embed)


def fn(
    text1, text2, num, slider1, slider2, single_checkbox, checkboxes, radio,
    dropdown, im1, im2, im3, im4, video, audio1, audio2, file, df1, df2,
):
    """Gradio callback: search the LOINC code set and fill every demo output.

    ``text1`` is the prefix searched for in the LOINC ``Description`` column
    (blank input falls back to "Allergy"). The matching rows, reduced to a
    fixed set of columns, are returned as the JSON output. The remaining
    arguments feed the showcase outputs; ``dropdown``, ``im2``-``im4``,
    ``audio2`` and ``file`` are accepted but not used.

    Returns:
        A 13-tuple in the order of the interface outputs: text, label
        confidences, audio, image, video, two highlighted-text lists, JSON,
        HTML, file path, ``df1``, a random 4x4 integer array and ``df2``.
    """
    import html

    search_term = (text1 or "").strip() or _DEFAULT_LOINC_PREFIX

    loinc_matches = _description_starts_with(datasetLOINC, search_term)
    # Drop only columns that exist and are not wanted; asking
    # remove_columns for an absent column raises.
    unwanted = [
        name for name in loinc_matches.column_names
        if name not in _LOINC_COLUMNS_TO_KEEP
    ]
    loinc_matches = loinc_matches.remove_columns(unwanted)
    loinc_frame = loinc_matches.to_pandas()

    # NOTE(review): the embeddings are computed but not yet surfaced in any
    # output; kept so the pipeline stays exercised.
    embeddings_dataset = _embed_clinical_terms(loinc_frame)

    snomed_matches = _description_starts_with(datasetSNOMED, "Hospital")
    cqm_matches = _description_starts_with(dataseteCQM, "Telephone")

    print(loinc_matches)
    print(snomed_matches)
    print(cqm_matches)

    files_dir = os.path.dirname(__file__)

    chosen_text = text1 if single_checkbox else text2
    text_output = f"{chosen_text}, selected:" + ", ".join(checkboxes or [])

    # Guard against a zero denominator (e.g. num == -(slider1 + slider2)).
    total = num + slider1 + slider2
    confidences = {
        "positive": num / total if total else 0.0,
        "negative": slider1 / total if total else 0.0,
        "neutral": slider2 / total if total else 0.0,
    }

    if audio1 is not None:
        audio_output = (audio1[0], np.flipud(audio1[1]))
    else:
        audio_output = os.path.join(files_dir, "files/cantina.wav")

    if im1 is not None:
        image_output = np.flipud(im1)
    else:
        image_output = os.path.join(files_dir, "files/cheetah1.jpg")

    if video is not None:
        video_output = video
    else:
        video_output = os.path.join(files_dir, "files/world.mp4")

    tagged_tokens = [
        ("The", "art"),
        ("quick brown", "adj"),
        ("fox", "nn"),
        ("jumped", "vrb"),
        ("testing testing testing", None),
        ("over", "prp"),
        ("the", "art"),
        ("testing", None),
        ("lazy", "adj"),
        ("dogs", "nn"),
        (".", "punc"),
    ] + [(f"test {x}", f"test {x}") for x in range(10)]

    scored_tokens = [
        ("The testing testing testing", None),
        ("over", 0.6),
        ("the", 0.2),
        ("testing", None),
        ("lazy", -0.1),
        ("dogs", 0.4),
        (".", 0),
    ] + [("test", x / 10) for x in range(-10, 10)]

    # Serialise the matches in memory; the JSON component parses the string.
    json_output = loinc_frame.to_json(orient="records")

    # The radio value ends up inside markup, so escape it.
    html_output = (
        "<button style='background-color: red'>Click Me: "
        + html.escape(str(radio))
        + "</button>"
    )

    return (
        text_output,
        confidences,
        audio_output,
        image_output,
        video_output,
        tagged_tokens,
        scored_tokens,
        json_output,
        html_output,
        os.path.join(files_dir, "files/titanic.csv"),
        df1,
        np.random.randint(0, 10, (4, 4)),
        df2,
    )
|
|
|
|
|
|
|
def _example_asset(filename):
    """Return the path of a bundled example file under ``files/`` next to this script."""
    return os.path.join(os.path.dirname(__file__), "files/" + filename)


# One example row; the values are listed in the same order as ``inputs``.
_EXAMPLE_ROW = [
    "Allergy",
    "Admission",
    10,
    12,
    4,
    True,
    ["SNOMED", "LOINC", "CQM"],
    "SNOMED",
    CHOICES[1],  # was "bar", which is not one of the dropdown choices
    _example_asset("cheetah1.jpg"),
    _example_asset("cheetah1.jpg"),
    _example_asset("cheetah1.jpg"),
    _example_asset("cheetah1.jpg"),
    _example_asset("world.mp4"),
    _example_asset("cantina.wav"),
    _example_asset("cantina.wav"),
    _example_asset("titanic.csv"),
    [[1, 2, 3], [3, 4, 5]],
    _example_asset("time.csv"),
]

# Interface wiring: ``fn`` receives the inputs positionally and its returned
# tuple is mapped onto the outputs in order.
demo = gr.Interface(
    fn,
    inputs=[
        gr.Textbox(value="Allergy", label="Textbox"),
        gr.Textbox(lines=3, value="Bathing", placeholder="Type here..", label="Textbox 2"),
        gr.Number(label="Number", value=42),
        gr.Slider(10, 20, value=15, label="Slider: 10 - 20"),
        gr.Slider(maximum=20, step=0.04, label="Slider: step @ 0.04"),
        gr.Checkbox(label="Check for NER Match on Submit"),
        gr.CheckboxGroup(label="Clinical Terminology to Check", choices=CHOICES, value=CHOICES[0:2]),
        gr.Radio(label="Preferred Terminology Output", choices=CHOICES, value=CHOICES[2]),
        gr.Dropdown(label="Dropdown", choices=CHOICES),
        gr.Image(label="Image"),
        gr.Image(label="Image w/ Cropper", tool="select"),
        gr.Image(label="Sketchpad", source="canvas"),
        gr.Image(label="Webcam", source="webcam"),
        gr.Video(label="Video"),
        gr.Audio(label="Audio"),
        gr.Audio(label="Microphone", source="microphone"),
        gr.File(label="File"),
        gr.Dataframe(label="Filters", headers=["Name", "Age", "Gender"]),
        gr.Timeseries(x="time", y=["price", "value"], colors=["pink", "purple"]),
    ],
    outputs=[
        gr.Textbox(label="Textbox"),
        gr.Label(label="Label"),
        gr.Audio(label="Audio"),
        gr.Image(label="Image"),
        gr.Video(label="Video"),
        gr.HighlightedText(label="HighlightedText", color_map={"punc": "pink", "test 0": "blue"}),
        gr.HighlightedText(label="HighlightedText", show_legend=True),
        gr.JSON(label="JSON"),
        gr.HTML(label="HTML"),
        gr.File(label="File"),
        gr.Dataframe(label="Dataframe"),
        gr.Dataframe(label="Numpy"),
        gr.Timeseries(x="time", y=["price", "value"], label="Timeseries"),
    ],
    # The same example row is offered three times.
    examples=[_EXAMPLE_ROW] * 3,
    theme="default",
    # NOTE(review): the emoji in this title look mis-encoded; restore them
    # from the original source if it is available.
    title="βοΈπ§ π¬π§¬ Clinical Terminology Auto Mapper AI π©ββοΈπ©ΊβοΈπ",
    cache_examples=False,
    description="Clinical Terminology Auto Mapper AI",
    article="Learn more at [Yggdrasil](https://github.com/AaronCWacker/Yggdrasil)",
)


if __name__ == "__main__":
    demo.launch(debug=True)