Spaces:
Sleeping
Sleeping
File size: 4,076 Bytes
130fe19 dd4c06b 4b18df1 8607936 0bbcfe0 dd4c06b 4b18df1 344a72e dc1c6ab a4e4751 d1c3a70 a4e4751 8607936 52cfee9 8607936 5ad2f2b e648c2d b99929d d1c3a70 e648c2d d1c3a70 e648c2d 0bbcfe0 4b18df1 d1c3a70 e648c2d 6e40332 0bbcfe0 6e40332 f8b77d4 4b18df1 d1c3a70 e828a9f 6e40332 8607936 6e40332 0bbcfe0 5ad2f2b 6e40332 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 |
import spaces
import gradio as gr
import os
import logging
from pytube import YouTube
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline, AutoModelForCausalLM, AutoTokenizer
def get_text(url, max_bytes=30_000_000):
    """Download the audio of a YouTube video and return its transcript.

    Args:
        url: Address of the YouTube video. ``None``, an empty string or a
            whitespace-only string is treated as "nothing to do".
        max_bytes: Largest audio download, in bytes, that will be
            transcribed. Bigger downloads are rejected with a logged error.

    Returns:
        The transcript text with surrounding whitespace removed, or ``None``
        when no URL was supplied, the video has no audio-only stream, or the
        downloaded file is larger than ``max_bytes``.
    """
    # Guard clause: also covers None, which the old `url != ''` test let
    # through to the downloader.
    if not url or not url.strip():
        return None

    yt = YouTube(url)
    video = yt.streams.filter(only_audio=True).first()
    if video is None:
        # first() gives None when the filter matched no stream.
        logging.error('No audio-only stream available for %s', url)
        return None

    out_file = video.download(output_path=".")
    file_stats = os.stat(out_file)
    logging.info('Size of audio file in Bytes: %s', file_stats.st_size)

    if file_stats.st_size > max_bytes:
        logging.error('Videos for transcription on this space are limited to about 1.5 hours. Sorry about this limit but some joker thought they could stop this tool from working by transcribing many extremely long videos. Please visit https://steve.digital to contact me about this space.')
        return None

    # Only the file name changes here; the audio data is not re-encoded.
    base, _ = os.path.splitext(out_file)
    new_file = base + '.mp3'
    os.rename(out_file, new_file)

    # NOTE(review): `model` is not defined in the visible part of this file;
    # this call expects a module-level object exposing `.transcribe(path)`
    # that returns a mapping with a 'text' entry -- confirm where it is set.
    result = model.transcribe(new_file)
    return result['text'].strip()
@spaces.GPU(duration=60)
def transcribe_audio(audio, model_id):
    """Transcribe an audio file with the selected speech-to-text checkpoint.

    Args:
        audio: Audio input handed to the recognition pipeline; ``None`` when
            nothing was uploaded.
        model_id: Identifier of the checkpoint to load; ``None`` when no
            model was chosen.

    Returns:
        The recognised text, or a short instruction string when either
        argument is missing.
    """
    # Validate inputs in the same order as before: audio first, then model.
    required_inputs = (
        (audio, "Please upload an audio file."),
        (model_id, "Please select a model."),
    )
    for supplied, reminder in required_inputs:
        if supplied is None:
            return reminder

    # Half precision on the GPU, full precision on the CPU.
    if torch.cuda.is_available():
        target_device, dtype = "cuda:0", torch.float16
    else:
        target_device, dtype = "cpu", torch.float32

    speech_model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_id,
        torch_dtype=dtype,
        low_cpu_mem_usage=True,
        use_safetensors=True,
    )
    speech_model.to(target_device)
    speech_processor = AutoProcessor.from_pretrained(model_id)

    pipeline_options = {
        "model": speech_model,
        "tokenizer": speech_processor.tokenizer,
        "feature_extractor": speech_processor.feature_extractor,
        "max_new_tokens": 128,
        "chunk_length_s": 25,
        "batch_size": 16,
        "torch_dtype": dtype,
        "device": target_device,
    }
    recognizer = pipeline("automatic-speech-recognition", **pipeline_options)

    return recognizer(audio)["text"]
def proofread(text, max_new_tokens=50):
    """Have a Chinese LLaMA-3 instruct model tidy up transcribed text.

    The instruction sent to the model (written in Traditional Chinese) asks
    it to organise the text into paragraphs, correct wrong characters and
    finish with the key points of the whole passage.

    Args:
        text: Transcribed text to clean up.
        max_new_tokens: Upper bound on the number of tokens the model may
            generate after the prompt.

    Returns:
        The text generated by the model (without the echoed prompt), or a
        short instruction string when ``text`` is ``None``, empty or only
        whitespace.
    """
    # Bail out before loading an 8B model when there is nothing to process.
    if text is None or not text.strip():
        return "Please provide the transcribed text for proofreading."

    use_cuda = torch.cuda.is_available()
    device = "cuda:0" if use_cuda else "cpu"
    torch_dtype = torch.float16 if use_cuda else torch.float32

    prompt = "用繁體中文整理這段文字,分段及改正錯別字,最後加上整段文字的重點。"
    model_name = "hfl/llama-3-chinese-8b-instruct-v3"

    # Pass the dtype chosen above; it used to be computed and then ignored,
    # so the weights were always loaded in the default precision.
    model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch_dtype)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model.to(device)

    input_ids = tokenizer.encode(prompt + text, return_tensors="pt").to(device)
    prompt_length = input_ids.shape[-1]

    output = model.generate(
        input_ids,
        max_new_tokens=max_new_tokens,
        num_return_sequences=1,
        # temperature only influences generation when sampling is enabled.
        do_sample=True,
        temperature=0.7,
    )

    # generate() returns the prompt tokens followed by the continuation;
    # decode only the continuation so the instruction is not shown back.
    generated_ids = output[0][prompt_length:]
    return tokenizer.decode(generated_ids, skip_special_tokens=True)
# Gradio front end: input widgets, two action buttons and their result boxes.
# NOTE(review): the layout nesting below (which widgets sit inside the
# Row/Column) is reconstructed from statement order -- confirm against the
# intended layout.
with gr.Blocks() as demo:
    # Usage instructions rendered at the top of the page.
    gr.Markdown("""
# Audio Transcription and Proofreading
1. Upload an audio file (Wait for the file to be fully loaded first)
2. Select a model for transcription
3. Proofread the transcribed text
""")
    with gr.Row():
        with gr.Column():
            # Upload-only audio input; handlers receive it as a file path.
            audio = gr.Audio(sources="upload", type="filepath")
            # NOTE(review): this textbox is not passed to any event handler
            # in this section, so a URL typed here is currently unused.
            input_text_url = gr.Textbox(label="Video URL")
            # Checkpoint selector; its value is passed to transcribe_audio as model_id.
            model_dropdown = gr.Dropdown(choices=["openai/whisper-large-v3", "alvanlii/whisper-small-cantonese"], value="openai/whisper-large-v3")
    transcribe_button = gr.Button("Transcribe")
    transcribed_text = gr.Textbox(label="Transcribed Text")
    proofread_button = gr.Button("Proofread")
    proofread_output = gr.Textbox(label="Proofread Text")
    # Transcribe: (audio, selected model) -> transcript textbox.
    transcribe_button.click(transcribe_audio, inputs=[audio, model_dropdown], outputs=transcribed_text)
    # Proofread on demand from the current transcript ...
    proofread_button.click(proofread, inputs=[transcribed_text], outputs=proofread_output)
    # ... and also whenever the transcript textbox value changes.
    transcribed_text.change(proofread, inputs=[transcribed_text], outputs=proofread_output)
# Start the Gradio app.
demo.launch()
|