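"""Gradio Space: speech-to-speech demo.

Pipeline: Whisper (ASR) -> Llama-3-8B (text generation) -> OpenVoice (base-speaker
TTS plus tone-color conversion toward a reference voice). Expects the OpenVoice
checkpoints under checkpoints/ and a reference clip at resources/example_reference.mp3.
"""
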
import os
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from transformers import AutoTokenizer, LlamaForCausalLM
from datasets import load_dataset
from openvoice import se_extractor
from openvoice.api import BaseSpeakerTTS, ToneColorConverter
import gradio as gr
import spaces

# Device setup (fall back to CPU, and to float32, when CUDA is unavailable)
device = "cuda" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if device == "cuda" else torch.float32

# Whisper setup
whisper_model_id = "openai/whisper-large-v3"
whisper_model = AutoModelForSpeechSeq2Seq.from_pretrained(
    whisper_model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
whisper_processor = AutoProcessor.from_pretrained(whisper_model_id)
whisper_pipe = pipeline(
    "automatic-speech-recognition",
    model=whisper_model,
    tokenizer=whisper_processor.tokenizer,
    feature_extractor=whisper_processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=30,
    batch_size=16,
    return_timestamps=True,
    torch_dtype=torch_dtype,
    device=device,
)

# LLaMa3-8B setup (Llama 3 ships a fast tokenizer, so load it via AutoTokenizer)
llama_model_id = "meta-llama/Meta-Llama-3-8B"
llama_tokenizer = AutoTokenizer.from_pretrained(llama_model_id)
llama_model = LlamaForCausalLM.from_pretrained(llama_model_id, torch_dtype=torch_dtype).to(device)

# OpenVoiceV2 setup
ckpt_base = 'checkpoints/base_speakers/EN'
ckpt_converter = 'checkpoints/converter'
output_dir = 'outputs'

base_speaker_tts = BaseSpeakerTTS(f'{ckpt_base}/config.json', device=device)
base_speaker_tts.load_ckpt(f'{ckpt_base}/checkpoint.pth')

tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)
tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')

os.makedirs(output_dir, exist_ok=True)
source_se = torch.load(f'{ckpt_base}/en_default_se.pth', map_location=device)

def process_audio(input_audio):
    # ASR with Whisper
    whisper_result = whisper_pipe(input_audio)["text"]
    
    # Text generation with LLaMa
    inputs = llama_tokenizer(whisper_result, return_tensors="pt").to(device)
    outputs = llama_model.generate(**inputs, max_new_tokens=50)
    generated_text = llama_tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    # TTS with OpenVoiceV2
    reference_speaker = 'resources/example_reference.mp3'
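    # Extract the target speaker's tone-color embedding from the reference clip (VAD trims non-speech)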
    target_se, _ = se_extractor.get_se(reference_speaker, tone_color_converter, target_dir='processed', vad=True)
    save_path = f'{output_dir}/output_en_default.wav'
    src_path = f'{output_dir}/tmp.wav'
    
    base_speaker_tts.tts(generated_text, src_path, speaker='default', language='English', speed=1.0)
    tone_color_converter.convert(
        audio_src_path=src_path, 
        src_se=source_se, 
        tgt_se=target_se, 
        output_path=save_path,
        message="@MyShell"
    )
    
    return save_path
    
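# ZeroGPU: the decorator requests a GPU only for the duration of each call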
@spaces.GPU()
def real_time_processing(input_audio):
    return process_audio(input_audio)

# Gradio interface
iface = gr.Interface(
    fn=real_time_processing,
    inputs=gr.Audio(sources=["microphone"], type="filepath"),
    outputs=gr.Audio(type="filepath"),
    live=True,
    title="ASR + Text-to-Text + TTS with Whisper, LLaMa3-8B, and OpenVoiceV2",
    description="Real-time processing using Whisper for ASR, LLaMa3-8B for text generation, and OpenVoiceV2 for TTS."
)

if __name__ == "__main__":
    iface.launch()