Spaces:
Running
Running
File size: 4,751 Bytes
0cea3a7 bd5c6bb 8a61c8d 0eeec49 9fed2c7 2ae69cb 431e989 8a04c2b b1508bf 0cea3a7 431e989 0cea3a7 9cff099 0cea3a7 cac0a2c 0cea3a7 040ebdb 0cea3a7 431e989 0cea3a7 040ebdb 0cea3a7 040ebdb 0cea3a7 9cff099 cac0a2c 9cff099 431e989 32a72a1 0769ed0 431e989 32a72a1 431e989 9cff099 9c0b499 9cff099 32a72a1 67d3502 9cff099 67d3502 9cff099 67d3502 9cff099 67d3502 431e989 9cff099 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 |
import os
#os.system("curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y") #Installing Rust manually
#os.system("exec bash")
#os.system("pip install --upgrade pip")
os.system("pip install transformers==4.33")
os.system("pip install tokenizers fairseq")
# librosa 0.9 still uses np.complex, which NumPy 1.24 removed, so pin 1.23
# (the last compatible release). The original pinned 1.24, contradicting the
# requirement stated in its own comment.
os.system("pip install numpy==1.23")
#os.system("pip install git+https://github.com/huggingface/transformers datasets[torch]")
os.system("pip install torch accelerate torchaudio datasets")
os.system("pip install librosa==0.9.0")

import gradio as gr
from transformers import pipeline, Wav2Vec2ForCTC, AutoProcessor, VitsModel, AutoTokenizer
from datasets import load_dataset, Audio, Dataset
import torch
import librosa #For converting audio sample rate to 16k

LANG = "dtp" #Change to tih for Timugon Murut or iba for Iban
model_id = "facebook/mms-1b-all"

# ASR: MMS 1B multilingual CTC model, run on CPU with a per-language adapter.
processor = AutoProcessor.from_pretrained(model_id)
model = Wav2Vec2ForCTC.from_pretrained(model_id).to("cpu")
processor.tokenizer.set_target_lang(LANG)
model.load_adapter(LANG)

# Build the pipeline from the model/processor already in memory instead of
# passing model_id, which made pipeline() download a second copy of the
# 1B-parameter checkpoint from the Hub.
asr_pipeline = pipeline(
    task = "automatic-speech-recognition",
    model = model,
    tokenizer = processor.tokenizer,
    feature_extractor = processor.feature_extractor,
) #Returns a dict; the transcription is stored under key "text"

# TTS: MMS VITS model for Kadazandusun (dtp).
model_tts = VitsModel.from_pretrained("facebook/mms-tts-dtp")
tokenizer_tts = AutoTokenizer.from_pretrained("facebook/mms-tts-dtp")
def preprocess(input): #Sets recording sampling rate to 16k and returns numpy ndarray from audio
    """Decode an audio file at 16 kHz and return its samples.

    Args:
        input: path to the recorded audio file (Gradio ``type="filepath"``).

    Returns:
        1-D numpy array of float samples resampled to 16 kHz, as produced by
        the ``datasets`` ``Audio`` feature.
    """
    # The datasets Audio feature decodes and resamples in one step. The
    # original also loaded and resampled the file with librosa first, then
    # discarded that result — pure dead work, removed here.
    loaded_audio = Dataset.from_dict({"audio": [input]}).cast_column("audio", Audio(sampling_rate=16000))
    return loaded_audio[0]["audio"]["array"]
def run(input):
    """Transcribe a 16 kHz waveform array with the MMS CTC model.

    Args:
        input: numpy array of audio samples at 16 kHz.

    Returns:
        The decoded transcription string.
    """
    # Feature-extract, forward pass without gradient tracking, then greedy
    # CTC decode (argmax over the vocabulary at each frame).
    features = processor(input, sampling_rate=16_000, return_tensors="pt")
    with torch.no_grad():
        logits = model(**features).logits
    predicted_ids = torch.argmax(logits, dim=-1)[0]
    return processor.decode(predicted_ids)
def transcribe(input): #Gradio UI wrapper function
    """Gradio callback: audio file path in, transcription string out."""
    # Resample/decode to a 16 kHz array, then run the ASR model on it.
    return run(preprocess(input))
# Gradio UI: header, an illustration column, and two tabs (speech-to-text and
# text-to-speech). Indentation restored — the pasted source had lost it.
with gr.Blocks(theme = gr.themes.Soft()) as demo:
    # Bilingual (Kadazandusun / English) page header with repo/Space badges.
    gr.HTML(
    """
    <h1 align="center">Ponutun Tuturan om Pomorolou Sinuat Boros Dusun</h1>
    <h5 align="center"> Poomitanan kopogunaan do somit tutun tuturan om pomorolou sinuat (speech recognition and text-to-speech models)
    pinoluda' di Woyotanud Tuturan Gumukabang Tagayo di Meta (Meta Massive Multilingual Speech Project)</h5>
    <h6 align = "center">Guguno (app) diti winonsoi di Ander © 2023 id Universiti Teknologi PETRONAS</h6>
    <div style='display:flex; gap: 0.25rem; '>
    <div class = "image"> <a href='https://github.com/andergisomon/dtp-nlp-demo'><img src='https://img.shields.io/badge/Github-Code-success'></a> </div>
    <div class = "image"> <a href='https://huggingface.co/spaces/anderbogia/dtp-asr-demo-v2/'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue'></a> </div>
    </div>
    """)

    def tts_run(input):
        """Gradio callback: text in, (sample_rate, waveform) audio out."""
        tokenized_input = tokenizer_tts(input, return_tensors="pt")
        with torch.no_grad():
            output = model_tts(**tokenized_input).waveform
        # BUG FIX: gr.Audio expects a (int sample_rate, numpy array) tuple.
        # The original returned [16000, <torch tensor>] with a leading batch
        # dim, which Gradio cannot serialize as audio. Strip the batch dim
        # and convert to numpy before handing it back.
        return (16000, output.squeeze().cpu().numpy())

    with gr.Row():
        with gr.Column(scale = 1):
            gr.HTML("""<h1 align="center"><img src="https://user-images.githubusercontent.com/120112847/249789954-8dbadc59-4f39-48fa-a97c-a70998f2c551.png", alt="" border="0" style="margin: 0 auto; height: 200px;" /></a></h1>""")
            gr.Markdown("""
            **Huminodun, nulai di somit pongulai kikito DALL-E**

            *Huminodun, generated by the image generation model DALL-E*
            """)
        with gr.Column(scale = 4):
            # Tab 1: speech -> text (record from microphone, transcribe).
            with gr.Tab("Rolou kumaa ginarit"):
                input_audio = gr.Audio(sources = ["microphone"], type = "filepath", label = "Gakamai rolou nu")
                output_text = gr.components.Textbox(label = "Dalinsuat")
                button1 = gr.Button("Dalinsuato' | Transcribe")
                button1.click(transcribe, inputs = input_audio, outputs = output_text)
            # Tab 2: text -> speech (type text, synthesize audio).
            with gr.Tab("Ginarit kumaa rolou"):
                input_text = gr.components.Textbox(label = "Ginarit", placeholder = "Potutakai suat nu hiti")
                button2 = gr.Button("Poulayo'")
                output_audio = gr.components.Audio(label = "Rolou pinoulai")
                button2.click(tts_run, inputs = input_text, outputs = output_audio)

# Trailing " |" scrape junk removed from the original launch line — it would
# have been a syntax error.
demo.launch(debug = True)