RasmusToivanen committed on
Commit
da95d3d
1 Parent(s): 11243ea
Files changed (4) hide show
  1. app.py +55 -0
  2. model_t5/config.json +30 -0
  3. model_t5/pytorch_model.bin +3 -0
  4. requirements.txt +6 -0
app.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import librosa
3
+ import soundfile as sf
4
+ import torch
5
+ import warnings
6
+
7
+ from transformers import Wav2Vec2ProcessorWithLM, Wav2Vec2CTCTokenizer
8
+
9
# Suppress all warnings for the remainder of the process.
warnings.filterwarnings("ignore")

# Remaining imports needed by the model set-up below.
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import time
from fastapi import FastAPI, HTTPException, File
from transformers import pipeline

# NOTE(review): model_name is not referenced anywhere else in this file and
# names a different checkpoint than the one given to the pipeline below --
# confirm whether it is still needed.
model_name = "Finnish-NLP/wav2vec2-xlsr-300m-finnish-lm"

# Speech-to-text pipeline used by the transcription function; the chunk and
# stride lengths (in seconds) are forwarded to the pipeline as given.
pipe = pipeline(
    model="Finnish-NLP/wav2vec2-xlsr-1b-finnish-lm-v2",
    chunk_length_s=20,
    stride_length_s=(4, 2),
)

# Seq2seq model applied to the transcript afterwards. The tokenizer comes from
# the hub checkpoint, while the weights are read from the local 'model_t5'
# directory.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_checkpoint = 'Finnish-NLP/t5x-small-nl24-finnish'
# NOTE(review): use_auth_token=True presumably requires a stored Hugging Face
# token at start-up -- confirm the checkpoint really needs authentication.
tokenizer = AutoTokenizer.from_pretrained(
    model_checkpoint,
    use_auth_token=True,
)
model = AutoModelForSeq2SeqLM.from_pretrained(
    'model_t5',
    from_flax=False,
    torch_dtype=torch.float32,
).to(device)
30
+
31
+
32
+ # define speech-to-text function
33
def asr_transcript(audio):
    """Transcribe an uploaded audio file and produce a case-corrected version.

    Args:
        audio: Uploaded file object; its ``name`` attribute is handed to the
            speech-to-text pipeline. A falsy value is treated as "no usable
            file".

    Returns:
        A dict holding the raw transcript under ``"text_asr"`` and the
        seq2seq model's decoded output under ``"text_case_corrected"``, or
        the string ``"File not valid"`` when ``audio`` is falsy.
    """
    # Guard clause: nothing to transcribe.
    if not audio:
        return "File not valid"

    # The pipeline result is indexed by 'text' to get the transcript.
    transcript = pipe(audio.name)['text']

    # Run the transcript through the seq2seq model; generation is capped at
    # max_length=128.
    encoded = tokenizer(transcript, return_tensors="pt")
    generated = model.generate(encoded.input_ids.to(device), max_length=128)
    corrected = tokenizer.decode(generated[0], skip_special_tokens=True)

    return {"text_asr": transcript, "text_case_corrected": corrected}
46
+
47
# Build the input/output components first, then wire them to the
# transcription function.
audio_input = gr.inputs.Audio(label="Upload Audio File", type="file")
transcript_output = gr.outputs.Textbox(label="Auto-Transcript")

gradio_ui = gr.Interface(
    fn=asr_transcript,
    inputs=audio_input,
    outputs=transcript_output,
    title="Speech-to-Text with HuggingFace+Wav2Vec2",
    description="Upload an audio clip, and let AI do the hard work of transcribing",
)

# Start serving the interface.
gradio_ui.launch()
model_t5/config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "Finnish-NLP/t5x-small-nl24-finnish",
3
+ "architectures": [
4
+ "T5ForConditionalGeneration"
5
+ ],
6
+ "d_ff": 2048,
7
+ "d_kv": 64,
8
+ "d_model": 512,
9
+ "decoder_start_token_id": 0,
10
+ "dropout_rate": 0.1,
11
+ "eos_token_id": 1,
12
+ "feed_forward_proj": "gated-gelu",
13
+ "initializer_factor": 1.0,
14
+ "is_encoder_decoder": true,
15
+ "layer_norm_epsilon": 1e-06,
16
+ "model_type": "t5",
17
+ "n_positions": 512,
18
+ "num_decoder_layers": 24,
19
+ "num_heads": 8,
20
+ "num_layers": 24,
21
+ "output_past": true,
22
+ "pad_token_id": 0,
23
+ "relative_attention_max_distance": 128,
24
+ "relative_attention_num_buckets": 32,
25
+ "tie_word_embeddings": false,
26
+ "torch_dtype": "float32",
27
+ "transformers_version": "4.18.0",
28
+ "use_cache": true,
29
+ "vocab_size": 32128
30
+ }
model_t5/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5a02eed59634d268ae6f6f080deb53ca08ba17e6936ec08e4e41148e4a2757d2
3
+ size 1038007269
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ librosa==0.9.1
2
+ soundfile==0.10.3.post1
3
+ torch==1.11.0
4
+ transformers==4.18.0
5
+ sentencepiece
6
+ protobuf