hschlotter committed on
Commit f4d29e1 · 1 Parent(s): df7e582

Upload 4 files

Files changed (4)
  1. README.txt +4 -4
  2. app.py +72 -159
  3. examples.json +18 -0
  4. requirements.txt +71 -7
README.txt CHANGED
@@ -1,10 +1,10 @@
 ---
-title: 🗣️SpeakNowASR🧠Memory💾Gradio
-emoji: 🗣️🧠💾
-colorFrom: yellow
+title: ⚕️MedNER - Biomed Entity Recognizer
+emoji: 👩‍⚕️🩺⚕️🙋
+colorFrom: purple
 colorTo: red
 sdk: gradio
-sdk_version: 3.5
+sdk_version: 3.8
 app_file: app.py
 pinned: false
 license: apache-2.0
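For reference, the ten lines covered by this hunk read as follows after the change (reconstructed from the diff above; colorTo, sdk, app_file, pinned, and license are unchanged):

---
title: ⚕️MedNER - Biomed Entity Recognizer
emoji: 👩‍⚕️🩺⚕️🙋
colorFrom: purple
colorTo: red
sdk: gradio
sdk_version: 3.8
app_file: app.py
pinned: false
license: apache-2.0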
app.py CHANGED
@@ -1,168 +1,81 @@
 import gradio as gr
-import torch
-import time
-import librosa
-import soundfile
-import nemo.collections.asr as nemo_asr
-import tempfile
-import os
-import uuid
-
-from transformers import BlenderbotTokenizer, BlenderbotForConditionalGeneration
-import torch
-
-# PersistDataset -----
-import os
-import csv
-import gradio as gr
-from gradio import inputs, outputs
-import huggingface_hub
-from huggingface_hub import Repository, hf_hub_download, upload_file
-from datetime import datetime
-
-# ---------------------------------------------
-# Dataset and Token links - change awacke1 to your own HF id, and add a HF_TOKEN copy to your repo for write permissions
-# This should allow you to save your results to your own Dataset hosted on HF. ---
-#DATASET_REPO_URL = "https://huggingface.co/datasets/awacke1/Carddata.csv"
-#DATASET_REPO_ID = "awacke1/Carddata.csv"
-#DATA_FILENAME = "Carddata.csv"
-#DATA_FILE = os.path.join("data", DATA_FILENAME)
-#HF_TOKEN = os.environ.get("HF_TOKEN")
-#SCRIPT = """
-
-#<script>
-#if (!window.hasBeenRun) {
-#  window.hasBeenRun = true;
-#  console.log("should only happen once");
-#  document.querySelector("button.submit").click();
-#}
-#</script>
-#"""
-
-#try:
-#    hf_hub_download(
-#        repo_id=DATASET_REPO_ID,
-#        filename=DATA_FILENAME,
-#        cache_dir=DATA_DIRNAME,
-#        force_filename=DATA_FILENAME
-#    )
-#except:
-#    print("file not found")
-#repo = Repository(
-#    local_dir="data", clone_from=DATASET_REPO_URL, use_auth_token=HF_TOKEN
-#)
-
-#def store_message(name: str, message: str):
-#    if name and message:
-#        with open(DATA_FILE, "a") as csvfile:
-#            writer = csv.DictWriter(csvfile, fieldnames=["name", "message", "time"])
-#            writer.writerow(
-#                {"name": name.strip(), "message": message.strip(), "time": str(datetime.now())}
-#            )
-#        # uncomment line below to begin saving -
-#        commit_url = repo.push_to_hub()
-#    return ""
-
-#iface = gr.Interface(
-#    store_message,
-#    [
-#        inputs.Textbox(placeholder="Your name"),
-#        inputs.Textbox(placeholder="Your message", lines=2),
-#    ],
-#    "html",
-#    css="""
-#    .message {background-color:cornflowerblue;color:white; padding:4px;margin:4px;border-radius:4px; }
-#    """,
-#    title="Reading/writing to a HuggingFace dataset repo from Spaces",
-#    description=f"This is a demo of how to do simple *shared data persistence* in a Gradio Space, backed by a dataset repo.",
-#    article=f"The dataset repo is [{DATASET_REPO_URL}]({DATASET_REPO_URL})",
-#)
-
-
-# main -------------------------
-mname = "facebook/blenderbot-400M-distill"
-model = BlenderbotForConditionalGeneration.from_pretrained(mname)
-tokenizer = BlenderbotTokenizer.from_pretrained(mname)
-
-def take_last_tokens(inputs, note_history, history):
-    """Filter the last 128 tokens"""
-    if inputs['input_ids'].shape[1] > 128:
-        inputs['input_ids'] = torch.tensor([inputs['input_ids'][0][-128:].tolist()])
-        inputs['attention_mask'] = torch.tensor([inputs['attention_mask'][0][-128:].tolist()])
-        note_history = ['</s> <s>'.join(note_history[0].split('</s> <s>')[2:])]
-        history = history[1:]
-    return inputs, note_history, history
-
-def add_note_to_history(note, note_history):
-    """Add a note to the historical information"""
-    note_history.append(note)
-    note_history = '</s> <s>'.join(note_history)
-    return [note_history]
-
-
-def chat(message, history):
-    history = history or []
-    if history:
-        history_useful = ['</s> <s>'.join([str(a[0])+'</s> <s>'+str(a[1]) for a in history])]
-    else:
-        history_useful = []
-    history_useful = add_note_to_history(message, history_useful)
-    inputs = tokenizer(history_useful, return_tensors="pt")
-    inputs, history_useful, history = take_last_tokens(inputs, history_useful, history)
-    reply_ids = model.generate(**inputs)
-    response = tokenizer.batch_decode(reply_ids, skip_special_tokens=True)[0]
-    history_useful = add_note_to_history(response, history_useful)
-    list_history = history_useful[0].split('</s> <s>')
-    history.append((list_history[-2], list_history[-1]))
-    # store_message(message, response) # Save to dataset - uncomment if you uncomment above to save inputs and outputs to your dataset
-    return history, history
-
-
-SAMPLE_RATE = 16000
-model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained("nvidia/stt_en_conformer_transducer_xlarge")
-model.change_decoding_strategy(None)
-model.eval()
-
-def process_audio_file(file):
-    data, sr = librosa.load(file)
-    if sr != SAMPLE_RATE:
-        data = librosa.resample(data, orig_sr=sr, target_sr=SAMPLE_RATE)
-    # monochannel
-    data = librosa.to_mono(data)
-    return data
-
-
-def transcribe(audio, state = ""):
-    if state is None:
-        state = ""
-    audio_data = process_audio_file(audio)
-    with tempfile.TemporaryDirectory() as tmpdir:
-        audio_path = os.path.join(tmpdir, f'audio_{uuid.uuid4()}.wav')
-        soundfile.write(audio_path, audio_data, SAMPLE_RATE)
-        transcriptions = model.transcribe([audio_path])
-    if type(transcriptions) == tuple and len(transcriptions) == 2:
-        transcriptions = transcriptions[0]
-    transcriptions = transcriptions[0]
-    # store_message(transcriptions, state) # Save to dataset - uncomment to store into a dataset - hint you will need your HF_TOKEN
-    state = state + transcriptions + " "
-    return state, state
-
-iface = gr.Interface(
-    fn=transcribe,
-    inputs=[
-        gr.Audio(source="microphone", type='filepath', streaming=True),
-        "state",
-    ],
+import pandas as pd
+import json
+from collections import defaultdict
+
+# Create tokenizer for biomed model
+from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
+tokenizer = AutoTokenizer.from_pretrained("d4data/biomedical-ner-all")
+model = AutoModelForTokenClassification.from_pretrained("d4data/biomedical-ner-all")
+pipe = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
+
+# Matplotlib for entity graph
+import matplotlib.pyplot as plt
+plt.switch_backend("Agg")
+
+# Load examples from JSON
+EXAMPLES = {}
+with open("examples.json", "r") as f:
+    example_json = json.load(f)
+EXAMPLES = {x["text"]: x["label"] for x in example_json}
+
+def group_by_entity(raw):
+    out = defaultdict(int)
+    for ent in raw:
+        out[ent["entity_group"]] += 1
+    # out["total"] = sum(out.values())
+    return out
+
+
+def plot_to_figure(grouped):
+    fig = plt.figure()
+    plt.bar(x=list(grouped.keys()), height=list(grouped.values()))
+    plt.margins(0.2)
+    plt.subplots_adjust(bottom=0.4)
+    plt.xticks(rotation=90)
+    return fig
+
+
+def ner(text):
+    raw = pipe(text)
+    ner_content = {
+        "text": text,
+        "entities": [
+            {
+                "entity": x["entity_group"],
+                "word": x["word"],
+                "score": x["score"],
+                "start": x["start"],
+                "end": x["end"],
+            }
+            for x in raw
+        ],
+    }
+
+    grouped = group_by_entity(raw)
+    figure = plot_to_figure(grouped)
+    label = EXAMPLES.get(text, "Unknown")
+
+    meta = {
+        "entity_counts": grouped,
+        "entities": len(set(grouped.keys())),
+        "counts": sum(grouped.values()),
+    }
+
+    return (ner_content, meta, label, figure)
+
+
+interface = gr.Interface(
+    ner,
+    inputs=gr.Textbox(label="Note text", value=""),
     outputs=[
-        "textbox",
-        "state",
+        gr.HighlightedText(label="NER", combine_adjacent=True),
+        gr.JSON(label="Entity Counts"),
+        gr.Label(label="Rating"),
+        gr.Plot(label="Bar"),
     ],
-    layout="horizontal",
-    theme="huggingface",
-    title="🗣️LiveSpeechRecognition🧠Memory💾",
-    description=f"Live Automatic Speech Recognition (ASR) with Memory💾 Dataset.",
-    allow_flagging='never',
-    live=True,
-    # article=f"Result Output Saved to Memory💾 Dataset: [{DATASET_REPO_URL}]({DATASET_REPO_URL})"
 )
-iface.launch()
+
+interface.launch()
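Not part of the commit: a minimal local smoke test of the aggregated pipeline output that ner() repackages for gr.HighlightedText. This is a sketch that assumes transformers is installed and the d4data/biomedical-ner-all weights can be downloaded; the sample sentence is made up.

from transformers import pipeline

# Same model and aggregation strategy as app.py, loaded by name for brevity.
pipe = pipeline("ner", model="d4data/biomedical-ner-all", aggregation_strategy="simple")

raw = pipe("Patient reports shortness of breath and heart palpitations after exercise.")
for ent in raw:
    # Each aggregated entity carries entity_group, word, score, and start/end offsets.
    print(ent["entity_group"], repr(ent["word"]), round(float(ent["score"]), 3))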
examples.json ADDED
@@ -0,0 +1,18 @@
+[
+    {
+        "text": "In March and April the patient had two falls. One was related to asthma, heart palpitations. The second was due to syncope and post covid vaccination dizziness during exercise. The patient is now getting an EKG. Former EKG had shown that there was a bundle branch block. Patient had some uncontrolled immune system reactions like anaphylaxis and shortness of breath.",
+        "label": 3
+    },
+    {
+        "text": "During the fertility engagement, there was alot of anxiety and we just felt depressed. It was difficult but finally after nutrition advice and changing habits such as cutting out smoking and improving daily activities we felt less agitated. Then our baby came. When he was born in December he had some minor jaundice but nothing serious, yet we had problems sleeping. Both of us felt so relieved to have had the help, and now that the baby doesn't cry all night, we feel less fatigue and can sleep through night.",
+        "label": 2
+    },
+    {
+        "text": "51 y/o male with history of skin cancer, asthma, allergic to NSAIDS, dogs, cats, mold. Daily long distance swimming helped breathing problems. COVID reactions were severe yet recovery was fastest when maintaining a healthy diet, low fat and chloresterol, as well as limiting sugar intake.",
+        "label": 0
+    },
+    {
+        "text": "SNOMEDCT 183452005 is an emergency hospital encounter. CPT 99378 is an intervention with supervision of a hospice patient. HCPCS T2046 Hospice long term care, room and board only; per diem. LOINC for Care Plan Document Type is code 80748-7 - Clinical genetics Plan of care note. LOINC Care Plan Document Type 80751-1 is a Dermatology Plan of care note. ICD10CM M84.371A is a Stress fracture, right ankle, initial encounter for fracture which is a Condition/Diagnosis/Problem Lower Body Fracture. The purpose of this value set is to represent concepts for a diagnosis of a fracture of a bone in the lower body.",
+        "label": 1
+    }
+]
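How app.py consumes this file, as a minimal sketch (assuming examples.json sits next to app.py, as in this commit): each "text" becomes a Gradio example, and its "label" is returned by EXAMPLES.get(text, "Unknown") when that exact text is submitted.

import json

with open("examples.json") as f:
    example_json = json.load(f)

# Mirror app.py's EXAMPLES mapping: full example text -> numeric label.
EXAMPLES = {x["text"]: x["label"] for x in example_json}

print(len(EXAMPLES), "examples")    # 4 entries in this commit
print(sorted(EXAMPLES.values()))    # labels 0, 1, 2, 3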
requirements.txt CHANGED
@@ -1,7 +1,71 @@
-nemo_toolkit[asr]
-transformers
-torch
-gradio
-Werkzeug
-huggingface_hub
-Pillow
+aiohttp==3.8.3
+aiosignal==1.2.0
+anyio==3.6.1
+async-timeout==4.0.2
+attrs==22.1.0
+bcrypt==4.0.1
+black==22.10.0
+certifi==2022.9.24
+cffi==1.15.1
+charset-normalizer==2.1.1
+click==8.1.3
+contourpy==1.0.5
+cryptography==38.0.1
+cycler==0.11.0
+fastapi==0.85.0
+ffmpy==0.3.0
+filelock==3.8.0
+fonttools==4.37.4
+frozenlist==1.3.1
+fsspec==2022.8.2
+gradio==3.4.1
+h11==0.12.0
+httpcore==0.15.0
+httpx==0.23.0
+huggingface-hub==0.10.0
+idna==3.4
+Jinja2==3.1.2
+kiwisolver==1.4.4
+linkify-it-py==1.0.3
+markdown-it-py==2.1.0
+MarkupSafe==2.1.1
+matplotlib==3.6.1
+mdit-py-plugins==0.3.1
+mdurl==0.1.2
+multidict==6.0.2
+mypy-extensions==0.4.3
+numpy==1.23.3
+orjson==3.8.0
+packaging==21.3
+pandas==1.5.0
+paramiko==2.11.0
+pathspec==0.10.1
+Pillow==9.2.0
+platformdirs==2.5.2
+pycparser==2.21
+pycryptodome==3.15.0
+pydantic==1.10.2
+pydub==0.25.1
+PyNaCl==1.5.0
+pyparsing==3.0.9
+python-dateutil==2.8.2
+python-multipart==0.0.5
+pytz==2022.4
+PyYAML==6.0
+regex==2022.9.13
+requests==2.28.1
+rfc3986==1.5.0
+six==1.16.0
+sniffio==1.3.0
+starlette==0.20.4
+tokenizers==0.12.1
+tomli==2.0.1
+torch==1.12.1
+tqdm==4.64.1
+transformers==4.22.2
+typing_extensions==4.4.0
+uc-micro-py==1.0.1
+urllib3==1.26.12
+uvicorn==0.18.3
+websockets==10.3
+yarl==1.8.1