Zeimoto committed on
Commit
e25d134
·
1 Parent(s): 2d84f70

add translation

Browse files
Files changed (3) hide show
  1. app.py +22 -15
  2. speech2text.py +3 -3
  3. translation.py +20 -0
app.py CHANGED
@@ -3,6 +3,7 @@ from st_audiorec import st_audiorec
3
 
4
  from nameder import init_model_ner, get_entity_labels
5
  from speech2text import init_model_trans, transcribe
 
6
  from resources import audit_elapsedtime, set_start
7
  import subprocess
8
 
@@ -10,24 +11,30 @@ def main ():
10
  print("------------------------------")
11
  print(f"Running main")
12
 
13
- print(subprocess.Popen('pip freeze > requirements_hug.txt', shell=True))
14
- s2t = init_model_trans()
15
- ner = init_model_ner() #async
 
 
 
 
16
 
17
- print("Rendering UI...")
18
- start_render = set_start()
19
- wav_audio_data = st_audiorec()
20
- audit_elapsedtime(function="Rendering UI", start=start_render)
21
 
22
- if wav_audio_data is not None and s2t is not None:
23
- print("Loading data...")
24
- start_loading = set_start()
25
- st.audio(wav_audio_data, format='audio/wav')
26
- text = transcribe(wav_audio_data, s2t)
 
 
27
 
28
- if text is not None and ner is not None:
29
- st.write('Entities: ', get_entity_labels(model=ner, text=text))
30
- audit_elapsedtime(function="Loading data", start=start_loading)
31
 
32
  if __name__ == "__main__":
33
  print("IN __name__")
 
3
 
4
  from nameder import init_model_ner, get_entity_labels
5
  from speech2text import init_model_trans, transcribe
6
+ from translation import get_translation
7
  from resources import audit_elapsedtime, set_start
8
  import subprocess
9
 
 
11
  print("------------------------------")
12
  print(f"Running main")
13
 
14
+ #print(subprocess.Popen('pip freeze > requirements_hug.txt', shell=True))
15
+ text = "Tenho uma proposta para a Caixa Geral de Depositos, para 3 consultores outsystems, 300 euros por dia e um periodo de seis meses."
16
+ st.write(text)
17
+ traducao = get_translation(text_to_translate=text, languageCode="pt")
18
+ st.write(traducao)
19
+ # s2t = init_model_trans()
20
+ # ner = init_model_ner() #async
21
 
22
+ # print("Rendering UI...")
23
+ # start_render = set_start()
24
+ # wav_audio_data = st_audiorec()
25
+ # audit_elapsedtime(function="Rendering UI", start=start_render)
26
 
27
+ # if wav_audio_data is not None and s2t is not None:
28
+ # print("Loading data...")
29
+ # start_loading = set_start()
30
+ # st.audio(wav_audio_data, format='audio/wav')
31
+ # text = transcribe(wav_audio_data, s2t)
32
+ # print("translating audio...")
33
+ # translation = get_translation(text_to_translate=text, languageCode="pt")
34
 
35
+ # if text is not None and ner is not None:
36
+ # st.write('Entities: ', get_entity_labels(model=ner, text=text))
37
+ # audit_elapsedtime(function="Loading data", start=start_loading)
38
 
39
  if __name__ == "__main__":
40
  print("IN __name__")
speech2text.py CHANGED
@@ -43,7 +43,7 @@ def transcribe (audio_sample: bytes, pipe) -> str:
43
  # sample = dataset[0]["audio"]
44
  result = pipe(audio_sample)
45
  audit_elapsedtime(function="Transcription", start=start)
46
- print(result)
47
-
48
- st.write('trancription: ', result["text"])
49
  return result["text"]
 
43
  # sample = dataset[0]["audio"]
44
  result = pipe(audio_sample)
45
  audit_elapsedtime(function="Transcription", start=start)
46
+ print("transcription result",result)
47
+
48
+ #st.write('trancription: ', result["text"])
49
  return result["text"]
translation.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import MarianMTModel, MarianTokenizer
2
+
3
def get_model_name(languageCode: str) -> str:
    """Return the Hugging Face model id used for translation.

    Args:
        languageCode: Accepted for interface purposes only. It is not
            consulted; every value yields the same model id.

    Returns:
        The fixed id ``"Helsinki-NLP/opus-mt-pt-en"`` (a pt -> en OPUS-MT
        checkpoint, going by its name).
    """
    return "Helsinki-NLP/opus-mt-pt-en"
6
+
7
def init_translation_model():
    """Return the Marian model and tokenizer, loading them at most once.

    The first call resolves the model id through ``get_model_name("pt")`` and
    loads both objects with ``from_pretrained``. The resulting pair is stored
    on the function object so later calls reuse it instead of loading the
    checkpoint again on every translation request.

    Returns:
        A ``(model, tokenizer)`` tuple. The same objects are returned on
        every call within the process.
    """
    cached_pair = getattr(init_translation_model, "_cached_pair", None)
    if cached_pair is None:
        model_name = get_model_name("pt")
        model = MarianMTModel.from_pretrained(model_name)
        tokenizer = MarianTokenizer.from_pretrained(model_name)
        cached_pair = (model, tokenizer)
        # Memoize on the function itself so no module-level state or extra
        # import is needed.
        init_translation_model._cached_pair = cached_pair
    return cached_pair
12
+
13
def get_translation(text_to_translate: str, languageCode: str) -> str:
    """Translate ``text_to_translate`` with the Marian translation model.

    Args:
        text_to_translate: Source text. Empty or whitespace-only input (or
            ``None``) is treated as "nothing to translate".
        languageCode: Source language code. It is currently not used here;
            the model is whatever ``init_translation_model()`` provides.

    Returns:
        The decoded translation of the first generated sequence, or ``""``
        when there is nothing to translate.
    """
    # Skip model loading and generation for blank input: there is nothing
    # meaningful to translate, and a seq2seq model may emit arbitrary text.
    if not text_to_translate or not text_to_translate.strip():
        return ""

    model, tokenizer = init_translation_model()
    # NOTE: return_tensors="pt" selects PyTorch tensors; it is unrelated to
    # the Portuguese language code.
    inputs = tokenizer(
        text_to_translate,
        return_tensors="pt",
        truncation=True,
        padding=True,
    )
    translated_ids = model.generate(**inputs)
    translated_text = tokenizer.decode(translated_ids[0], skip_special_tokens=True)

    print("Translated text:", translated_text)
    return translated_text