Spaces:
Running
on
Zero
Running
on
Zero
File size: 5,479 Bytes
5d52c32 6c226f9 8e787d3 550ced0 d790c0b 88183ad 6c226f9 2362603 9d6fa91 66efbc3 d790c0b 6c226f9 550ced0 6c226f9 550ced0 a200645 550ced0 5d52c32 3c0cd8e 550ced0 6c226f9 550ced0 d790c0b 550ced0 d790c0b 550ced0 d790c0b 550ced0 d790c0b 550ced0 5d52c32 550ced0 d790c0b 550ced0 b97a3c2 3c0cd8e 550ced0 6c226f9 550ced0 6c226f9 550ced0 47407ef 6c226f9 550ced0 3c0cd8e 17f14b2 550ced0 3c0cd8e 550ced0 6c226f9 550ced0 6c226f9 550ced0 6c226f9 550ced0 6c226f9 550ced0 6c226f9 47407ef |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 |
import spaces
import torch
import gradio as gr
import yt_dlp as youtube_dl
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
from threading import Thread
import tempfile
import os
MODEL_NAME = "openai/whisper-large-v3-turbo"
BATCH_SIZE = 8
FILE_LIMIT_MB = 1000
YT_LENGTH_LIMIT_S = 3600 # limit to 1 hour YouTube files
device = 0 if torch.cuda.is_available() else "cpu"
# Initialize the transcription pipeline
pipe = pipeline(
task="automatic-speech-recognition",
model=MODEL_NAME,
chunk_length_s=30,
device=device,
)
# Hugging Face Token for the LLM model
HF_TOKEN = os.getenv("HF_TOKEN") # Make sure to set this in the environment variables
# Load tokenizer and model for SOAP note generation
tokenizer = AutoTokenizer.from_pretrained("NousResearch/Hermes-3-Llama-3.1-8B")
model = AutoModelForCausalLM.from_pretrained("NousResearch/Hermes-3-Llama-3.1-8B", device_map="auto")
# Prompt for SOAP note generation
sys_prompt = "You are a world class clinical assistant."
task_prompt = """
Convert the following transcribed conversation into a clinical SOAP note.
The text includes dialogue between a physician and a patient. Please clearly distinguish between the physician's and the patient's statements.
Extract and organize the information into the relevant sections of a SOAP note:
- Subjective (symptoms and patient statements),
- Objective (clinical findings and observations, these might be missing if the physician has not conducted a physical exam or has not verbally stated findings),
- Assessment (diagnosis or potential diagnoses, objectively provide a top 5 most likely diagnosis based on just the subjective findings, and use the objective findings if available),
- Plan (treatment and follow-up).
Ensure the note is concise, clear, and accurately reflects the conversation.
"""
# Function to transcribe audio inputs
@spaces.GPU
def transcribe(inputs, task):
if inputs is None:
raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
return text
# Function to download audio from YouTube
def download_yt_audio(yt_url, filename):
info_loader = youtube_dl.YoutubeDL()
try:
info = info_loader.extract_info(yt_url, download=False)
except youtube_dl.utils.DownloadError as err:
raise gr.Error(str(err))
file_length_s = sum(x * int(t) for x, t in zip([3600, 60, 1], info["duration_string"].split(":")) if t.isdigit())
if file_length_s > YT_LENGTH_LIMIT_S:
raise gr.Error(f"Video too long. Maximum allowed duration is {YT_LENGTH_LIMIT_S / 60} minutes.")
ydl_opts = {"outtmpl": filename, "format": "worstvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best"}
with youtube_dl.YoutubeDL(ydl_opts) as ydl:
ydl.download([yt_url])
# Function to transcribe YouTube audio
@spaces.GPU
def yt_transcribe(yt_url, task):
with tempfile.TemporaryDirectory() as tmpdirname:
filepath = os.path.join(tmpdirname, "video.mp4")
download_yt_audio(yt_url, filepath)
with open(filepath, "rb") as f:
inputs = f.read()
inputs = pipe.feature_extractor.ffmpeg_read(inputs, pipe.feature_extractor.sampling_rate)
inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}
text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
return f'<iframe width="500" height="320" src="https://www.youtube.com/embed/{yt_url.split("?v=")[-1]}"> </iframe>', text
# Function to generate SOAP notes using LLM
def generate_soap(transcribed_text):
prompt = f"{sys_prompt}\n\n{task_prompt}\n{transcribed_text}"
inputs = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
outputs = model.generate(inputs, max_new_tokens=512)
return tokenizer.decode(outputs[0], skip_special_tokens=True)
# Gradio Interfaces for different inputs
demo = gr.Blocks(theme=gr.themes.Ocean())
mf_transcribe = gr.Interface(
fn=transcribe,
inputs=[gr.Audio(sources="microphone", type="filepath"), gr.Radio(["transcribe", "translate"], label="Task", value="transcribe")],
outputs="text",
title="Whisper Large V3 Turbo: Transcribe Audio",
description="Transcribe long-form microphone or audio inputs."
)
file_transcribe = gr.Interface(
fn=transcribe,
inputs=[gr.Audio(sources="upload", type="filepath", label="Audio file"), gr.Radio(["transcribe", "translate"], label="Task", value="transcribe")],
outputs="text",
title="Whisper Large V3: Transcribe Audio"
)
yt_transcribe = gr.Interface(
fn=yt_transcribe,
inputs=[gr.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"), gr.Radio(["transcribe", "translate"], label="Task", value="transcribe")],
outputs=["html", "text"],
title="Whisper Large V3: Transcribe YouTube"
)
soap_note = gr.Interface(
fn=generate_soap,
inputs="text",
outputs="text",
title="Generate Clinical SOAP Note",
description="Convert transcribed conversation to a clinical SOAP note with structured sections (Subjective, Objective, Assessment, Plan)."
)
with demo:
gr.TabbedInterface([mf_transcribe, file_transcribe, yt_transcribe, soap_note], ["Microphone", "Audio file", "YouTube", "SOAP Note"])
demo.queue().launch(ssr_mode=False)
|