RasmusToivanen
committed on
Commit
•
da95d3d
1
Parent(s):
11243ea
add files
Browse files
- app.py +55 -0
- model_t5/config.json +30 -0
- model_t5/pytorch_model.bin +3 -0
- requirements.txt +6 -0
app.py
ADDED
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import librosa
|
3 |
+
import soundfile as sf
|
4 |
+
import torch
|
5 |
+
import warnings
|
6 |
+
|
7 |
+
from transformers import Wav2Vec2ProcessorWithLM, Wav2Vec2CTCTokenizer
|
8 |
+
|
9 |
+
warnings.filterwarnings("ignore")
|
10 |
+
|
11 |
+
#load wav2vec2 tokenizer and model
|
12 |
+
|
13 |
+
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
|
14 |
+
import time
|
15 |
+
from fastapi import FastAPI, HTTPException, File
|
16 |
+
# NOTE: `model_name` is not read anywhere in the visible code; the ASR
# pipeline below is created from a different checkpoint name.
model_name = "Finnish-NLP/wav2vec2-xlsr-300m-finnish-lm"
from transformers import pipeline

# Speech-recognition pipeline. Chunking/stride settings are passed so that
# long recordings are processed in overlapping windows.
pipe = pipeline(
    model="Finnish-NLP/wav2vec2-xlsr-1b-finnish-lm-v2",
    chunk_length_s=20,
    stride_length_s=(4, 2),
)

# Run the seq2seq model on the GPU when one is available, otherwise on CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The tokenizer comes from the hub checkpoint, while the weights are read
# from the local `model_t5` directory.
model_checkpoint = 'Finnish-NLP/t5x-small-nl24-finnish'
tokenizer = AutoTokenizer.from_pretrained(
    model_checkpoint,
    use_auth_token=True,
)
model = AutoModelForSeq2SeqLM.from_pretrained(
    'model_t5',
    from_flax=False,
    torch_dtype=torch.float32,
).to(device)
|
30 |
+
|
31 |
+
|
32 |
+
# define speech-to-text function
|
33 |
+
def asr_transcript(audio):
    """Transcribe an uploaded audio clip and post-process the transcript.

    The clip is run through the module-level speech-recognition ``pipe``;
    the resulting text is then fed to the module-level seq2seq ``model``
    (via ``tokenizer``) and the generated output is decoded.

    Args:
        audio: Either a file-like object exposing a ``name`` attribute
            (what the Gradio ``type="file"`` audio input supplies) or a
            plain path string. Falsy values (``None``, ``""``) are treated
            as "no file".

    Returns:
        A dict with keys ``"text_asr"`` (raw pipeline transcript) and
        ``"text_case_corrected"`` (decoded seq2seq output), or the string
        ``"File not valid"`` when no audio was supplied.
    """
    # Guard clause: nothing uploaded, so skip all model work.
    if not audio:
        return "File not valid"

    # Accept both an uploaded-file object (use its path) and a bare path.
    audio_path = getattr(audio, "name", audio)
    transcript = pipe(audio_path)['text']

    # Generation is capped at 128 tokens, so very long transcripts may come
    # back shortened in the corrected text.
    input_ids = tokenizer(transcript, return_tensors="pt").input_ids.to(device)
    outputs = model.generate(input_ids, max_length=128)
    case_corrected_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return {"text_asr": transcript, "text_case_corrected": case_corrected_text}
|
46 |
+
|
47 |
+
# Web front end: a single audio-file upload wired to `asr_transcript`, with
# the function's return value shown in one text box.
gradio_ui = gr.Interface(
    fn=asr_transcript,
    inputs=gr.inputs.Audio(label="Upload Audio File", type="file"),
    outputs=gr.outputs.Textbox(label="Auto-Transcript"),
    title="Speech-to-Text with HuggingFace+Wav2Vec2",
    description="Upload an audio clip, and let AI do the hard work of transcribing",
)

# Start serving the interface.
gradio_ui.launch()
|
model_t5/config.json
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "Finnish-NLP/t5x-small-nl24-finnish",
|
3 |
+
"architectures": [
|
4 |
+
"T5ForConditionalGeneration"
|
5 |
+
],
|
6 |
+
"d_ff": 2048,
|
7 |
+
"d_kv": 64,
|
8 |
+
"d_model": 512,
|
9 |
+
"decoder_start_token_id": 0,
|
10 |
+
"dropout_rate": 0.1,
|
11 |
+
"eos_token_id": 1,
|
12 |
+
"feed_forward_proj": "gated-gelu",
|
13 |
+
"initializer_factor": 1.0,
|
14 |
+
"is_encoder_decoder": true,
|
15 |
+
"layer_norm_epsilon": 1e-06,
|
16 |
+
"model_type": "t5",
|
17 |
+
"n_positions": 512,
|
18 |
+
"num_decoder_layers": 24,
|
19 |
+
"num_heads": 8,
|
20 |
+
"num_layers": 24,
|
21 |
+
"output_past": true,
|
22 |
+
"pad_token_id": 0,
|
23 |
+
"relative_attention_max_distance": 128,
|
24 |
+
"relative_attention_num_buckets": 32,
|
25 |
+
"tie_word_embeddings": false,
|
26 |
+
"torch_dtype": "float32",
|
27 |
+
"transformers_version": "4.18.0",
|
28 |
+
"use_cache": true,
|
29 |
+
"vocab_size": 32128
|
30 |
+
}
|
model_t5/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5a02eed59634d268ae6f6f080deb53ca08ba17e6936ec08e4e41148e4a2757d2
|
3 |
+
size 1038007269
|
requirements.txt
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
librosa==0.9.1
|
2 |
+
soundfile==0.10.3.post1
|
3 |
+
torch==1.11.0
|
4 |
+
transformers==4.18.0
|
5 |
+
sentencepiece
|
6 |
+
protobuf
|