Spaces:
Sleeping
Sleeping
import gradio as gr | |
import torch | |
from transformers import BitsAndBytesConfig, HqqConfig | |
from whisperplus import ( | |
SpeechToTextPipeline, | |
download_youtube_to_mp3, | |
download_youtube_to_mp4, | |
format_speech_to_dialogue, | |
) | |
from whisperplus.pipelines.long_text_summarization import LongTextSummarizationPipeline | |
from whisperplus.pipelines.summarization import TextSummarizationPipeline | |
from whisperplus.pipelines.text2speech import TextToSpeechPipeline | |
from whisperplus.pipelines.whisper_autocaption import WhisperAutoCaptionPipeline | |
from whisperplus.pipelines.whisper_diarize import ASRDiarizationPipeline | |
import subprocess | |
import os

# Install flash-attn at startup with FLASH_ATTENTION_SKIP_CUDA_BUILD set for
# the pip process. subprocess's `env` argument *replaces* the child's
# environment instead of extending it, so the current environment is merged in
# explicitly; otherwise pip would run without PATH, HOME, or any virtualenv
# variables of the running interpreter.
subprocess.run(
    "pip install flash-attn --no-build-isolation",
    env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
    shell=True,
)
def youtube_url_to_text(url, model_id, language_choice):
    """
    Download the audio of a video as MP3 and transcribe it.

    The audio is saved under the ``downloads`` directory with the file name
    ``test``. The speech-to-text pipeline is created with a 4-bit HQQ
    quantization config and with ``flash_attention_2`` enabled.

    Args:
        url (str): The URL of the video to download and convert.
        model_id (str): The ID of the speech-to-text model to use.
        language_choice (str): The language passed to the speech-to-text pipeline.

    Returns:
        The transcript returned by the speech-to-text pipeline.
    """
    mp3_path = download_youtube_to_mp3(url, output_dir="downloads", filename="test")

    # axis=0 is used by default
    quantization = HqqConfig(
        nbits=4,
        group_size=64,
        quant_zero=False,
        quant_scale=False,
        axis=0,
        offload_meta=False,
    )
    speech_to_text = SpeechToTextPipeline(
        model_id=model_id,
        quant_config=quantization,
        flash_attention_2=True,
    )

    # Decoding options, kept together so they can be read at a glance.
    decode_options = {
        "chunk_length_s": 30,
        "stride_length_s": 5,
        "max_new_tokens": 128,
        "batch_size": 100,
        "language": language_choice,
        "return_timestamps": False,
    }
    return speech_to_text(audio_path=mp3_path, **decode_options)
def summarization(text, model_id="facebook/bart-large-cnn"):
    """
    Summarize ``text`` with a ``TextSummarizationPipeline``.

    Args:
        text (str): The text to summarize.
        model_id (str): The ID of the summarization model to use.

    Returns:
        The ``"summary_text"`` entry of the first result returned by the
        pipeline's ``summarize`` call.
    """
    results = TextSummarizationPipeline(model_id=model_id).summarize(text)
    first_result = results[0]
    return first_result["summary_text"]
def long_text_summarization(text, model_id="facebook/bart-large-cnn"):
    """
    Summarize ``text`` with a ``LongTextSummarizationPipeline``.

    Args:
        text (str): The text to summarize.
        model_id (str): The ID of the summarization model to use.

    Returns:
        Whatever the pipeline's ``summarize`` call returns, passed through
        unchanged.
    """
    long_summarizer = LongTextSummarizationPipeline(model_id=model_id)
    return long_summarizer.summarize(text)
def _speaker_count(value):
    """Return ``value`` as an ``int``, or ``None`` when no value was given."""
    return None if value is None else int(value)


def speaker_diarization(url, model_id, device, num_speakers, min_speaker, max_speaker):
    """
    Download the audio of a video, run ASR with speaker diarization on it, and
    format the result as a dialogue.

    Args:
        url (str): The URL of the video to download.
        model_id (str): The ID of the ASR model to use.
        device (str): Device passed to the diarization pipeline (the UI
            defaults this to ``"cpu"``).
        num_speakers: Number of speakers passed to the pipeline, or ``None``.
        min_speaker: Minimum number of speakers passed to the pipeline, or ``None``.
        max_speaker: Maximum number of speakers passed to the pipeline, or ``None``.

    Returns:
        tuple: ``(dialogue, audio_path)`` where ``dialogue`` is the output of
        ``format_speech_to_dialogue`` and ``audio_path`` is the value returned
        by ``download_youtube_to_mp3``.
    """
    pipeline = ASRDiarizationPipeline.from_pretrained(
        asr_model=model_id,
        diarizer_model="pyannote/speaker-diarization",
        use_auth_token=False,
        chunk_length_s=30,
        device=device,
    )
    audio_path = download_youtube_to_mp3(url)

    # The speaker counts come from gr.Number widgets, which hand over floats
    # unless configured otherwise; they are counts, so pass them on as ints.
    output_text = pipeline(
        audio_path,
        num_speakers=_speaker_count(num_speakers),
        min_speaker=_speaker_count(min_speaker),
        max_speaker=_speaker_count(max_speaker),
    )
    dialogue = format_speech_to_dialogue(output_text)
    return dialogue, audio_path
def text2spech_bark(text, model_id="suno/bark", voice_preset="v2/en_speaker_6"):
    """
    Generate audio for ``text`` with a ``TextToSpeechPipeline``.

    Args:
        text (str): The text to convert to speech.
        model_id (str): The ID of the text-to-speech model to use.
        voice_preset (str): The voice preset passed to the pipeline.

    Returns:
        The value returned by calling the text-to-speech pipeline.
    """
    synthesizer = TextToSpeechPipeline(model_id=model_id)
    return synthesizer(text=text, voice_preset=voice_preset)
def whisper_autocaption(url, language, model_id="openai/whisper-large-v3"):
    """
    Download a video as MP4 and run a ``WhisperAutoCaptionPipeline`` on it.

    The pipeline is asked to write its result to ``output.mp4``.

    Args:
        url (str): The URL of the video to download.
        language (str): The language passed to the captioning pipeline.
        model_id (str): The ID of the Whisper model to use.

    Returns:
        The value returned by calling the captioning pipeline.
    """
    source_video = download_youtube_to_mp4(url)
    captioner = WhisperAutoCaptionPipeline(model_id=model_id)
    return captioner(video_path=source_video, output_path="output.mp4", language=language)
# Gradio UI: one tab per feature. Each tab collects its inputs in the left
# column, shows its result in the right column, and wires the tab's button to
# the matching handler function defined above.
with gr.Blocks() as demo:
    with gr.Tab("YouTube URL to Text"):
        with gr.Row():
            with gr.Column():
                url_input = gr.Textbox(label="Enter YouTube URL")
                model_id_input = gr.Textbox(label="Enter Model ID", value="openai/whisper-medium")
                language_input = gr.Textbox(label="Enter Language", value="en")
                submit_btn1 = gr.Button("Submit")
            with gr.Column():
                output1 = gr.Textbox(label="Transcript")
        submit_btn1.click(
            youtube_url_to_text,
            inputs=[url_input, model_id_input, language_input],
            outputs=output1,
        )

    with gr.Tab("Text Summarization"):
        with gr.Row():
            with gr.Column():
                text_input = gr.Textbox(label="Enter Text", lines=5)
                model_id_input2 = gr.Textbox(label="Enter Model ID", value="facebook/bart-large-cnn")
                submit_btn2 = gr.Button("Summarize")
            with gr.Column():
                output2 = gr.Textbox(label="Summary")
        submit_btn2.click(summarization, inputs=[text_input, model_id_input2], outputs=output2)

    with gr.Tab("Long Text Summarization"):
        with gr.Row():
            with gr.Column():
                long_text_input = gr.Textbox(label="Enter Long Text", lines=10)
                model_id_input3 = gr.Textbox(label="Enter Model ID", value="facebook/bart-large-cnn")
                submit_btn3 = gr.Button("Summarize Long Text")
            with gr.Column():
                output3 = gr.Textbox(label="Long Text Summary")
        submit_btn3.click(
            long_text_summarization,
            inputs=[long_text_input, model_id_input3],
            outputs=output3,
        )

    with gr.Tab("Speaker Diarization"):
        with gr.Row():
            with gr.Column():
                url_input2 = gr.Textbox(label="Enter YouTube URL")
                model_id_input4 = gr.Textbox(label="Enter Model ID")
                # precision=0 makes these inputs deliver ints instead of floats.
                num_speakers = gr.Number(label="Number of Speakers", value=2, precision=0)
                min_speakers = gr.Number(label="Min Speakers", value=1, precision=0)
                max_speakers = gr.Number(label="Max Speakers", value=4, precision=0)
                device = gr.Textbox(label="Device", value="cpu")
                submit_btn4 = gr.Button("Diarize")
            with gr.Column():
                output4 = gr.DataFrame(headers=["Speaker", "Text"], datatype=["str", "str"])
                # speaker_diarization returns (dialogue, audio_path); this
                # component receives the second value.
                diarization_audio = gr.Audio(label="Downloaded Audio")
        submit_btn4.click(
            speaker_diarization,
            inputs=[url_input2, model_id_input4, device, num_speakers, min_speakers, max_speakers],
            # One output component per returned value.
            outputs=[output4, diarization_audio],
        )

    with gr.Tab("Text to Speech"):
        with gr.Row():
            with gr.Column():
                text_input2 = gr.Textbox(label="Enter Text", lines=3)
                model_id_input5 = gr.Textbox(label="Enter Model ID", value="suno/bark")
                voice_preset = gr.Textbox(label="Voice Preset", value="v2/en_speaker_6")
                submit_btn5 = gr.Button("Generate Audio")
            with gr.Column():
                output5 = gr.Audio(label="Generated Audio")
        submit_btn5.click(
            text2spech_bark,
            inputs=[text_input2, model_id_input5, voice_preset],
            outputs=output5,
        )

    with gr.Tab("Whisper Autocaption"):
        with gr.Row():
            with gr.Column():
                url_input3 = gr.Textbox(label="Enter YouTube URL")
                language = gr.Textbox(label="Language", value="en")
                model_id_input6 = gr.Textbox(label="Enter Model ID", value="openai/whisper-large-v2")
                submit_btn6 = gr.Button("Generate Captions")
            with gr.Column():
                output6 = gr.Video(label="Captioned Video")
        submit_btn6.click(
            whisper_autocaption,
            inputs=[url_input3, language, model_id_input6],
            outputs=output6,
        )

demo.launch()