import gradio as gr
import numpy as np
import torch
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer, pipeline
# set up transcription pipeline
device = "cuda:0" if torch.cuda.is_available() else "cpu"
transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-base.en", device=device)
# set up translation pipeline
translation_model_path = "mutisya/m2m100_418M-en-kik-v24.03.2"
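
# The fine-tuned checkpoint introduces language codes (here "kik") that the
# stock M2M100 tokenizer does not ship with, so its language-code lookup
# tables are rebuilt below from the tokenizer's additional special tokens.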
def update_tokenizer_settings(tokenizer):
    new_lang_tokens = {k: tokenizer.convert_tokens_to_ids(k) for k in tokenizer.additional_special_tokens}
    all_lang_tokens = dict(list(tokenizer.lang_token_to_id.items()) + list(new_lang_tokens.items()))
    tokenizer.id_to_lang_token = {v: k for k, v in all_lang_tokens.items()}
    tokenizer.lang_token_to_id = {k: v for k, v in all_lang_tokens.items()}
    tokenizer.lang_code_to_token = {k.replace("_", ""): k for k in all_lang_tokens.keys()}
    tokenizer.lang_code_to_id = {k.replace("_", ""): v for k, v in all_lang_tokens.items()}
translation_model = M2M100ForConditionalGeneration.from_pretrained(translation_model_path)
translation_tokenizer = M2M100Tokenizer.from_pretrained(translation_model_path)
update_tokenizer_settings(translation_tokenizer)
# set translation direction
src_lang = "en"
tgt_lang = "kik"
translation_tokenizer.src_lang = src_lang
translation_tokenizer.tgt_lang = tgt_lang
translation_device = 0 if torch.cuda.is_available() else -1
translator = pipeline("translation", model=translation_model, tokenizer=translation_tokenizer, device=translation_device)
# transcribe sections while keeping state
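# chunk_tracker remembers every audio chunk together with the transcript
# before and after it was added; ready_to_translate collects finished
# sentence blocks; chunk_index counts chunks so translation is only
# attempted every few chunks.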
chunk_tracker = []
ready_to_translate = []
text_at_chunk_end = ""
chunk_index = 0
translated_text = ""
transcribed_text = ""
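
# A block of sentences is considered stable once the transcript up to its
# last full stop matches a chunk boundary at which the transcript did not
# change while that chunk was added. Only such blocks are translated, so
# the translator never sees text that Whisper may still revise.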
def get_next_translation_block():
    global text_at_chunk_end, chunk_tracker, ready_to_translate, translated_text, transcribed_text

    # take everything up to the last completed sentence in the running transcript
    last_stop = text_at_chunk_end[:-1].rfind('.')
    ready_sentences = text_at_chunk_end[:last_stop + 1]
    chunks_to_remove = []

    if len(ready_sentences) > 0:
        print("Trying to match: " + ready_sentences)
        found_match = False
        for curr_chunk in chunk_tracker:
            chunks_to_remove.append(curr_chunk)
            if curr_chunk["text_at_beginning"] == curr_chunk["text_at_end"] and curr_chunk["text_at_beginning"] == ready_sentences:
                found_match = True
                break
        if not found_match:
            print("ERROR: no match found for " + ready_sentences)
            chunks_to_remove = []
        else:
            transcribed_text += ready_sentences
            translated_text += translator(ready_sentences, src_lang=src_lang, tgt_lang=tgt_lang)[0]["translation_text"]
            print("TRANSLATED: " + translated_text)

    return ready_sentences, chunks_to_remove
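
# Gradio streaming callback: receives the accumulated audio state and the
# newest microphone chunk as a (sample_rate, ndarray) tuple, re-transcribes
# the whole buffered stream, and periodically flushes finished sentences to
# the translator so the buffer does not grow without bound.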
def transcribe(stream, new_chunk):
    global text_at_chunk_end, chunk_tracker, ready_to_translate, chunk_index, translated_text, transcribed_text

    chunk_index += 1
    sr, y = new_chunk
    y = y.astype(np.float32)
    peak = np.max(np.abs(y))
    if peak > 0:  # guard against division by zero on silent chunks
        y /= peak

    chunk_value = y
    chunk_length = len(y)

    if stream is not None:
        stream = np.concatenate([stream, y])
    else:
        stream = y

    text_at_chunk_beginning = text_at_chunk_end
    text_at_chunk_end = transcriber({"sampling_rate": sr, "raw": stream})["text"]

    curr_chunk = {
        "value": chunk_value,
        "length": chunk_length,
        "text_at_beginning": text_at_chunk_beginning,
        "text_at_end": text_at_chunk_end
    }
    chunk_tracker.append(curr_chunk)

    # every 5th chunk, try to flush a finished block of sentences to the translator
    if chunk_index % 5 == 0:
        ready_sentences, chunks_to_remove = get_next_translation_block()
        if len(chunks_to_remove) > 0:
            ready_to_translate.append(ready_sentences)
            for _ in range(len(chunks_to_remove)):
                chunk_tracker.pop(0)
            # rebuild the audio stream from the chunks that are still pending
            if len(chunk_tracker) > 0:
                new_stream = chunk_tracker[0]["value"]
                for i in range(1, len(chunk_tracker)):
                    new_stream = np.concatenate([new_stream, chunk_tracker[i]["value"]])
                stream = new_stream
            else:
                stream = None  # everything was flushed; start a fresh buffer

    return stream, text_at_chunk_end, transcribed_text, translated_text
# set up UI
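# "state" carries the audio buffer between callback invocations; the three
# textboxes show the in-progress transcript, the committed transcript, and
# its translation. live=True makes Gradio call transcribe on every chunk.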
demo = gr.Interface(
    transcribe,
    ["state", gr.Audio(sources=["microphone"], streaming=True)],
    ["state", gr.Textbox(label="In progress"), gr.Textbox(label="Transcribed text"), gr.Textbox(label="Translated text")],
    live=True,
    allow_flagging="never"
)
demo.dependencies[0]["show_progress"] = False  # suppress the progress overlay on each streaming update (relies on Gradio internals)
if __name__ == "__main__":
    demo.launch(debug=True)