Pranjal12345 committed on
Commit
33a55a6
1 Parent(s): 7ce1960

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +9 -7
app.py CHANGED
@@ -2,12 +2,12 @@ import gradio as gr
2
  from transformers import pipeline
3
  from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
4
  from utils import lang_ids
5
- import nltk
6
- nltk.download('punkt')
7
 
8
  MODEL_NAME = "Pranjal12345/pranjal_whisper_medium"
9
  BATCH_SIZE = 8
10
- FILE_LIMIT_MB = 1000
 
 
11
 
12
  pipe = pipeline(
13
  task="automatic-speech-recognition",
@@ -18,6 +18,11 @@ pipe = pipeline(
18
 
19
  lang_list = list(lang_ids.keys())
20
 
 
 
 
 
 
21
  def translate_audio(inputs,target_language):
22
  if inputs is None:
23
  raise gr.Error("No audio file submitted! Please upload an audio file before submitting your request.")
@@ -30,11 +35,8 @@ def translate_audio(inputs,target_language):
30
  return text
31
 
32
  else:
33
- model = MBartForConditionalGeneration.from_pretrained("sanjitaa/mbart-many-to-many")
34
- tokenizer = MBart50TokenizerFast.from_pretrained("sanjitaa/mbart-many-to-many")
35
-
36
  tokenizer.src_lang = "en_XX"
37
- chunks = nltk.tokenize.sent_tokenize(text)
38
  translated_text = ''
39
 
40
  for segment in chunks:
 
2
  from transformers import pipeline
3
  from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
4
  from utils import lang_ids
 
 
5
 
6
  MODEL_NAME = "Pranjal12345/pranjal_whisper_medium"
7
  BATCH_SIZE = 8
8
+
9
+ model = MBartForConditionalGeneration.from_pretrained("sanjitaa/mbart-many-to-many")
10
+ tokenizer = MBart50TokenizerFast.from_pretrained("sanjitaa/mbart-many-to-many")
11
 
12
  pipe = pipeline(
13
  task="automatic-speech-recognition",
 
18
 
19
  lang_list = list(lang_ids.keys())
20
 
21
def split_into_sentences(text):
    """Split *text* into a list of sentences.

    '?' and '!' are treated the same as '.', and the text is cut at every
    such terminator. The terminators themselves are not kept in the output.

    Args:
        text: The string to split.

    Returns:
        A list of non-empty sentence strings with surrounding whitespace
        removed. Empty and whitespace-only fragments (for example the
        remainder after a trailing ". ") are dropped.
    """
    # Normalise every supported terminator to '.' so a single split suffices.
    normalized = text.replace('?', '.').replace('!', '.')
    # Strip BEFORE filtering: a fragment such as " " is truthy, but becomes
    # an empty string once stripped and must not reach the caller.
    stripped = (piece.strip() for piece in normalized.split('.'))
    return [sentence for sentence in stripped if sentence]
24
+
25
+
26
  def translate_audio(inputs,target_language):
27
  if inputs is None:
28
  raise gr.Error("No audio file submitted! Please upload an audio file before submitting your request.")
 
35
  return text
36
 
37
  else:
 
 
 
38
  tokenizer.src_lang = "en_XX"
39
+ chunks = split_into_sentences(text)
40
  translated_text = ''
41
 
42
  for segment in chunks: