Spaces:
Sleeping
Sleeping
File size: 2,902 Bytes
130fe19 dd4c06b 52cfee9 8607936 6e40332 dd4c06b 130fe19 dc1c6ab a4e4751 d1c3a70 a4e4751 8607936 52cfee9 8607936 d1c3a70 e648c2d d1c3a70 e648c2d d1c3a70 e648c2d d1c3a70 e648c2d d1c3a70 e648c2d 8607936 6e40332 f6b2f01 d1c3a70 e828a9f 6e40332 8607936 6e40332 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 |
import spaces
import gradio as gr
# Use a pipeline as a high-level helper
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline, AutoModelForCausalLM
@spaces.GPU(duration=120)
def transcribe_audio(audio, model_id):
if audio is None:
return "Please upload an audio file."
if model_id is None:
return "Please select a model."
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
model = AutoModelForSpeechSeq2Seq.from_pretrained(
model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(device)
processor = AutoProcessor.from_pretrained(model_id)
pipe = pipeline(
"automatic-speech-recognition",
model=model,
tokenizer=processor.tokenizer,
feature_extractor=processor.feature_extractor,
max_new_tokens=128,
chunk_length_s=25,
batch_size=16,
torch_dtype=torch_dtype,
device=device,
)
result = pipe(audio)
return result["text"]
@spaces.GPU(duration=180)
def proofread(prompt, text):
if text is None:
return "Please provide the transcribed text for proofreading."
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
prompt = "用繁體中文整理這段文字,在最後加上整段文字的重點。"
model = AutoModelForCausalLM.from_pretrained("hfl/llama-3-chinese-8b-instruct-v3")
tokenizer = AutoTokenizer.from_pretrained("hfl/llama-3-chinese-8b-instruct-v3")
model.to(device)
# Perform proofreading using the model
input_ids = tokenizer.encode(text, return_tensors="pt").to(device)
output = model.generate(input_ids, max_length=len(input_ids[0])+50, num_return_sequences=1, temperature=0.7)
proofread_text = tokenizer.decode(output[0], skip_special_tokens=True)
return proofread_text
with gr.Blocks() as demo:
gr.Markdown("# Audio Transcription and Proofreading")
gr.Markdown("Upload an audio file, select a model for transcription, and then proofread the transcribed text.")
with gr.Row():
audio = gr.Audio(sources="upload", type="filepath")
model_dropdown = gr.Dropdown(choices=["openai/whisper-large-v3", "alvanlii/whisper-small-cantonese"], value="openai/whisper-large-v3")
transcribe_button = gr.Button("Transcribe")
transcribed_text = gr.Textbox(label="Transcribed Text")
proofread_button = gr.Button("Proofread")
proofread_output = gr.Textbox(label="Proofread Text")
transcribe_button.click(transcribe_audio, inputs=[audio, model_dropdown], outputs=transcribed_text)
proofread_button.click(proofread, inputs=transcribed_text, outputs=proofread_output)
demo.launch()
|