LocalScribe1

Running on Zero

File size: 5,479 Bytes

5d52c32
6c226f9
 
8e787d3
550ced0
 
d790c0b
88183ad
6c226f9
2362603
9d6fa91
66efbc3
d790c0b
6c226f9
 
 
550ced0
6c226f9
 
 
 
 
 
 
550ced0
 
 
 
a200645
 
550ced0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5d52c32
3c0cd8e
 
 
 
550ced0
6c226f9
550ced0
d790c0b
 
 
 
 
 
550ced0
 
d790c0b
550ced0
 
d790c0b
 
550ced0
d790c0b
550ced0
5d52c32
550ced0
d790c0b
 
 
 
 
550ced0
b97a3c2
3c0cd8e
550ced0
6c226f9
550ced0
 
 
 
 
 
6c226f9
550ced0
47407ef
6c226f9
 
 
550ced0
3c0cd8e
17f14b2
550ced0
3c0cd8e
 
 
 
550ced0
6c226f9
550ced0
6c226f9
 
 
 
550ced0
6c226f9
550ced0
 
 
 
 
 
 
 
 
6c226f9
 
 
550ced0
6c226f9
47407ef

import spaces
import torch
import gradio as gr
import yt_dlp as youtube_dl
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
from threading import Thread
import tempfile
import os

# Whisper checkpoint loaded by the speech-recognition pipeline.
MODEL_NAME = "openai/whisper-large-v3-turbo"
# Batch size handed to the pipeline on every transcription call.
BATCH_SIZE = 8
# Not referenced anywhere in the visible part of this file.
FILE_LIMIT_MB = 1000
# Longest YouTube video accepted, in seconds (one hour).
YT_LENGTH_LIMIT_S = 60 * 60

# Use CUDA device 0 when a GPU is visible, otherwise run on the CPU.
if torch.cuda.is_available():
    device = 0
else:
    device = "cpu"

# Build the speech-recognition pipeline once at import time. Audio is handled
# in 30-second chunks on the device selected above.
pipe = pipeline(
    "automatic-speech-recognition",
    model=MODEL_NAME,
    device=device,
    chunk_length_s=30,
)

# Optional Hugging Face access token, read from the environment. It is handed
# explicitly to both loaders below; when the variable is unset this is None.
HF_TOKEN = os.getenv("HF_TOKEN")  # Make sure to set this in the environment variables

# Checkpoint used for SOAP note generation. Named once so the tokenizer and the
# model are always loaded from the same repository.
LLM_MODEL_NAME = "NousResearch/Hermes-3-Llama-3.1-8B"

# Load tokenizer and model for SOAP note generation
tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_NAME, token=HF_TOKEN)
model = AutoModelForCausalLM.from_pretrained(LLM_MODEL_NAME, device_map="auto", token=HF_TOKEN)

# Prompt pieces for SOAP note generation: a one-line system prompt and the task
# instructions that are placed in front of the transcript.
sys_prompt = "You are a world class clinical assistant."

# The instructions are assembled line by line; the empty first and last entries
# give the text its leading and trailing newline.
task_prompt = "\n".join(
    (
        "",
        "Convert the following transcribed conversation into a clinical SOAP note.",
        "The text includes dialogue between a physician and a patient. Please clearly distinguish between the physician's and the patient's statements.",
        "Extract and organize the information into the relevant sections of a SOAP note:",
        "- Subjective (symptoms and patient statements),",
        "- Objective (clinical findings and observations, these might be missing if the physician has not conducted a physical exam or has not verbally stated findings),",
        "- Assessment (diagnosis or potential diagnoses, objectively provide a top 5 most likely diagnosis based on just the subjective findings, and use the objective findings if available),",
        "- Plan (treatment and follow-up).",
        "Ensure the note is concise, clear, and accurately reflects the conversation.",
        "",
    )
)

# Transcribe (or translate) a recorded or uploaded audio input.
@spaces.GPU
def transcribe(inputs, task):
    """Run the speech-recognition pipeline on ``inputs`` and return its text.

    Args:
        inputs: Audio handed over by the Gradio component, or None when the
            user submitted nothing.
        task: Value forwarded to the pipeline as ``generate_kwargs["task"]``.

    Returns:
        The ``"text"`` entry of the pipeline result.

    Raises:
        gr.Error: If ``inputs`` is None.
    """
    if inputs is None:
        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")

    result = pipe(
        inputs,
        batch_size=BATCH_SIZE,
        generate_kwargs={"task": task},
        return_timestamps=True,
    )
    return result["text"]

# Helper: length of a video in seconds, taken from the extracted metadata.
def _yt_duration_seconds(info):
    """Return the video duration in seconds, or None when it cannot be determined.

    The numeric ``duration`` entry is used when present. Otherwise the
    ``duration_string`` entry ("SS", "MM:SS" or "HH:MM:SS") is parsed.
    """
    duration = info.get("duration")
    if isinstance(duration, (int, float)):
        return duration

    duration_string = info.get("duration_string")
    if not duration_string:
        return None

    # Pair the units with the fields from the right: the last field is always
    # seconds, so "5:30" is 5 min 30 s and not 5 h 30 min.
    fields = reversed(duration_string.split(":"))
    return sum(unit * int(field) for unit, field in zip((1, 60, 3600), fields) if field.isdigit())


# Function to download audio from YouTube
def download_yt_audio(yt_url, filename):
    """Download the media behind ``yt_url`` to ``filename`` after a length check.

    Args:
        yt_url: URL of the video to fetch.
        filename: Output path, used as the downloader's ``outtmpl``.

    Raises:
        gr.Error: If the metadata lookup fails, the duration cannot be
            determined, or the video is longer than ``YT_LENGTH_LIMIT_S``.
    """
    try:
        # Metadata-only pass; the context manager releases the downloader.
        with youtube_dl.YoutubeDL() as info_loader:
            info = info_loader.extract_info(yt_url, download=False)
    except youtube_dl.utils.DownloadError as err:
        raise gr.Error(str(err)) from err

    file_length_s = _yt_duration_seconds(info)
    if file_length_s is None:
        raise gr.Error("Could not determine the duration of this video.")
    if file_length_s > YT_LENGTH_LIMIT_S:
        raise gr.Error(f"Video too long. Maximum allowed duration is {YT_LENGTH_LIMIT_S / 60} minutes.")

    ydl_opts = {"outtmpl": filename, "format": "worstvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best"}
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        ydl.download([yt_url])

# Helper: pull the bare video id out of a YouTube URL.
def _youtube_video_id(yt_url):
    """Return the video id for ``watch?v=<id>`` and path-style (``youtu.be/<id>``) URLs.

    The ``v`` query parameter wins when present, so extra parameters such as
    ``&t=10s`` are ignored; otherwise the last path segment is used.
    """
    from urllib.parse import parse_qs, urlparse

    parsed = urlparse(yt_url.strip())
    ids = parse_qs(parsed.query).get("v")
    if ids:
        return ids[0]
    return parsed.path.rstrip("/").rsplit("/", 1)[-1]


# Function to transcribe YouTube audio
@spaces.GPU
def yt_transcribe(yt_url, task):
    """Download a YouTube video, transcribe its audio and return embed HTML plus text.

    Args:
        yt_url: URL of the video to transcribe.
        task: Value forwarded to the pipeline as ``generate_kwargs["task"]``.

    Returns:
        A tuple ``(html, text)``: an ``<iframe>`` snippet embedding the video
        and the ``"text"`` entry of the pipeline result.

    Raises:
        gr.Error: If no URL was submitted, or if the download step rejects
            the video.
    """
    from urllib.parse import quote

    # ffmpeg_read is a module-level helper of the pipelines package, not a
    # method of the feature extractor.
    from transformers.pipelines.audio_utils import ffmpeg_read

    if not yt_url or not yt_url.strip():
        raise gr.Error("No YouTube URL submitted! Please paste a video URL before submitting your request.")

    with tempfile.TemporaryDirectory() as tmpdirname:
        filepath = os.path.join(tmpdirname, "video.mp4")
        download_yt_audio(yt_url, filepath)
        # Read the bytes while the temporary directory still exists.
        with open(filepath, "rb") as f:
            inputs = f.read()

    sampling_rate = pipe.feature_extractor.sampling_rate
    inputs = ffmpeg_read(inputs, sampling_rate)
    inputs = {"array": inputs, "sampling_rate": sampling_rate}
    text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]

    # Percent-encode the id so user-supplied text cannot break out of the
    # src attribute of the generated markup.
    embed_id = quote(_youtube_video_id(yt_url), safe="")
    return f'<iframe width="500" height="320" src="https://www.youtube.com/embed/{embed_id}"> </iframe>', text

# Function to generate SOAP notes using LLM
@spaces.GPU
def generate_soap(transcribed_text):
    """Turn a transcribed conversation into a SOAP note with the language model.

    The prompt is the system prompt, the task instructions and the transcript.
    Only the tokens produced after the prompt are decoded, so the returned
    note does not repeat the instructions or the transcript.

    Args:
        transcribed_text: Transcript of the physician/patient conversation.

    Returns:
        The generated note as a string (at most 512 new tokens).

    Raises:
        gr.Error: If ``transcribed_text`` is empty or only whitespace.
    """
    if not transcribed_text or not transcribed_text.strip():
        raise gr.Error("No transcript submitted! Please provide the transcribed conversation before submitting your request.")

    prompt = f"{sys_prompt}\n\n{task_prompt}\n{transcribed_text}"
    # Keep the whole encoding so generate() receives the attention mask
    # along with the input ids.
    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = encoded["input_ids"].shape[-1]
    outputs = model.generate(**encoded, max_new_tokens=512)
    # generate() returns prompt + continuation; drop the prompt part.
    return tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()

# Gradio Interfaces for different inputs
demo = gr.Blocks(theme=gr.themes.Ocean())


def _task_radio():
    """Return a new "Task" selector; each interface gets its own instance."""
    return gr.Radio(["transcribe", "translate"], label="Task", value="transcribe")


# Microphone recording -> transcript.
mf_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[gr.Audio(sources="microphone", type="filepath"), _task_radio()],
    outputs="text",
    title="Whisper Large V3 Turbo: Transcribe Audio",
    description="Transcribe long-form microphone or audio inputs."
)

# Uploaded audio file -> transcript.
file_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[gr.Audio(sources="upload", type="filepath", label="Audio file"), _task_radio()],
    outputs="text",
    title="Whisper Large V3 Turbo: Transcribe Audio"
)

# YouTube URL -> embedded player + transcript. Bound to its own name so the
# yt_transcribe function is not overwritten by the Interface wrapping it.
yt_interface = gr.Interface(
    fn=yt_transcribe,
    inputs=[gr.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"), _task_radio()],
    outputs=["html", "text"],
    title="Whisper Large V3 Turbo: Transcribe YouTube"
)

# Transcript text -> SOAP note.
soap_note = gr.Interface(
    fn=generate_soap,
    inputs="text",
    outputs="text",
    title="Generate Clinical SOAP Note",
    description="Convert transcribed conversation to a clinical SOAP note with structured sections (Subjective, Objective, Assessment, Plan)."
)

# One tab per interface, in the same order as the labels.
with demo:
    gr.TabbedInterface([mf_transcribe, file_transcribe, yt_interface, soap_note], ["Microphone", "Audio file", "YouTube", "SOAP Note"])

demo.queue().launch(ssr_mode=False)