# Speech-to-speech chat app: Whisper ASR -> ChatGPT -> YourTTS voice cloning,
# with noisereduce + SpeechBrain MetricGAN+ cleanup, served through Gradio.
import torch
import torchaudio
import whisper
import openai
import gradio as gr
import noisereduce as nr
from scipy.io import wavfile
from speechbrain.pretrained import SpectralMaskEnhancement
from TTS.api import TTS

# Load the multilingual YourTTS voice-cloning model and the Whisper "small" ASR model once at startup.
tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False, gpu=True)
model = whisper.load_model("small")
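
# Note (assumption, not stated in the source): this script targets the pre-1.0
# openai SDK (openai.ChatCompletion), Gradio 3.x components (gr.Audio with
# source=...), and the speechbrain.pretrained import path; newer releases of
# these libraries rename or remove the APIs used here.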

# System prompts for the three selectable personas.
mes1 = [
    {"role": "system", "content": "You are a TOEFL examiner. Help me improve my oral English and give me feedback."}
]

mes2 = [
    {"role": "system", "content": "You are a mental health therapist. Your name is Tina."}
]

mes3 = [
    {"role": "system", "content": "You are my personal assistant. Your name is Alice."}
]

res = []  # running list of all transcriptions made so far (shared across calls)

def transcribe(apikey, upload, audio, choice1):

    openai.api_key = apikey
    
    # load audio and pad/trim it to fit 30 seconds
    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)

    # make log-Mel spectrogram and move to the same device as the model
    mel = whisper.log_mel_spectrogram(audio).to(model.device)

    # detect the spoken language
    _, probs = model.detect_language(mel)
    print(f"Detected language: {max(probs, key=probs.get)}")

    # decode the audio
    options = whisper.DecodingOptions()
    result = whisper.decode(model, mel, options)
    res.append(result.text)

    # route to the persona chosen in the UI; default to Alice if nothing matches
    if choice1 == "TOEFL":
        messages = mes1
    elif choice1 == "Therapist":
        messages = mes2
    else:
        messages = mes3

    # send the newest transcription to ChatGPT
    content = res[-1]
    messages.append({"role": "user", "content": content})

    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=messages,
    )

    chat_response = completion.choices[0].message.content

    messages.append({"role": "assistant", "content": chat_response})   
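    # Note: mes1/mes2/mes3 are module-level lists, so each persona accumulates its
    # chat history across calls (and across users if the app is shared).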

    # speak the reply with YourTTS, cloning the timbre of the uploaded reference clip
    tts.tts_to_file(chat_response, speaker_wav=upload, language="en", file_path="output.wav")
    
    # first cleanup pass: non-stationary spectral noise reduction on the raw TTS output
    rate, data = wavfile.read("output.wav")
    reduced_noise = nr.reduce_noise(y=data, sr=rate, prop_decrease=0.8, thresh_n_mult_nonstationary=2, stationary=False)

    wavfile.write("audio1.wav", rate, reduced_noise)

    # second cleanup pass: MetricGAN+ speech enhancement (a 16 kHz model) from
    # SpeechBrain; loading it here means it is re-initialized on every call.
    enhance_model = SpectralMaskEnhancement.from_hparams(
        source="speechbrain/metricgan-plus-voicebank",
        savedir="pretrained_models/metricgan-plus-voicebank",
        run_opts={"device": "cuda"},
    )

    noisy = enhance_model.load_audio("audio1.wav").unsqueeze(0)
    enhanced = enhance_model.enhance_batch(noisy, lengths=torch.tensor([1.]))
    torchaudio.save("enhanced.wav", enhanced.cpu(), 16000)

    return [result.text, chat_response, "enhanced.wav"]

# Output widgets; the order matches the list returned by transcribe().
output_1 = gr.Textbox(label="Speech to Text")
output_2 = gr.Textbox(label="ChatGPT Output")
output_3 = gr.Audio(label="Audio with Custom Voice")

gr.Interface(
    title="🥳💬💕 - TalktoAI: chat anytime, anywhere, about anything!",
    theme="huggingface",
    description="🤖 - Let AI with humanistic care benefit everyone! AI for good, for a brighter civilization! TalktoAI - Enable the future!",
    fn=transcribe,
    inputs=[
        gr.Textbox(lines=1, label="Please enter your OpenAI API key"),
        gr.Audio(source="upload", label="Please upload a voice you like (a .wav file)", type="filepath"),
        gr.Audio(source="microphone", type="filepath"),
        gr.Radio(["TOEFL", "Therapist", "Alice"], label="TOEFL Examiner, Therapist Tina, or Assistant Alice?"),
    ],
    outputs=[output_1, output_2, output_3],
).launch()
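
# A rough sketch of the environment this app expects (the version pins are
# assumptions; nothing is pinned by the source):
#   pip install "openai<1" "gradio<4" openai-whisper TTS speechbrain noisereduce scipy torch torchaudio
# Run with `python app.py` and open the printed local URL. As written, a CUDA GPU
# is required (gpu=True for TTS and run_opts={"device": "cuda"} for SpeechBrain).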