saidivyesh committed on
Commit
5a0870e
·
verified ·
1 Parent(s): 0117888

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +63 -32
app.py CHANGED
@@ -25,16 +25,13 @@ def load_models_and_data():
25
  )
26
 
27
  # Load a sample from a dataset for default embedding
28
- dataset = load_dataset("ylacombe/english_dialects ", split="train")
29
  example = dataset[304]
30
 
31
  return model, processor, vocoder, speaker_model, example
32
 
33
  model, processor, vocoder, speaker_model, default_example = load_models_and_data()
34
 
35
- # Choose the language dynamically (English or Regional Language)
36
- model, processor, vocoder, speaker_model, default_example = load_models_and_data(language="en")
37
-
38
  def create_speaker_embedding(waveform):
39
  with torch.no_grad():
40
  speaker_embeddings = speaker_model.encode_batch(torch.tensor(waveform).unsqueeze(0).to(device))
@@ -48,41 +45,76 @@ def prepare_default_embedding(example):
48
 
49
  default_embedding = prepare_default_embedding(default_example)
50
 
51
- # Text normalization updates for English technical speech
52
- technical_replacements = [
53
- # Common technical replacements (examples)
54
- ("HTTP", "H T T P"),
55
- ("AI", "A I"),
56
- # Add more technical abbreviations as needed
 
 
 
 
57
  ]
58
 
59
- def normalize_text(text, language="en"):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  text = text.lower()
61
 
62
- # Handle language-specific normalization
63
- if language == "en":
64
- # Replace technical terms or symbols
65
- for old, new in technical_replacements:
66
- text = text.replace(old, new)
67
 
68
- # For regional language, include character replacements like the Turkish example
69
- if language != "en":
70
- replacements = [
71
- # Character mappings for regional languages (like the Turkish example)
72
- # Add region/language-specific character normalization here
73
- ]
74
- for old, new in replacements:
75
- text = text.replace(old, new)
76
 
77
- # Remove punctuation or handle them contextually for technical speech
78
  text = re.sub(r'[^\w\s]', '', text)
79
 
80
  return text
81
 
82
  @spaces.GPU(duration=60)
83
- def text_to_speech(text, audio_file=None, language="en"):
84
  # Normalize the input text
85
- normalized_text = normalize_text(text, language=language)
86
 
87
  # Prepare the input for the model
88
  inputs = processor(text=normalized_text, return_tensors="pt").to(device)
@@ -101,14 +133,13 @@ def text_to_speech(text, audio_file=None, language="en"):
101
  iface = gr.Interface(
102
  fn=text_to_speech,
103
  inputs=[
104
- gr.Textbox(label="Enter text to convert to speech"),
105
- gr.Dropdown(label="Language", choices=["English Technical", "Regional"], value="English Technical")
106
  ],
107
  outputs=[
108
  gr.Audio(label="Generated Speech", type="numpy")
109
  ],
110
- title="Fine-Tuned TTS for Technical English and Regional Languages",
111
- description="Enter text, choose the language, and listen to the generated speech."
112
  )
113
 
114
- iface.launch(share=True)
 
25
  )
26
 
27
  # Load a sample from a dataset for default embedding
28
+ dataset = load_dataset("ylacombe/english_dialects","southern_male", split="train")
29
  example = dataset[304]
30
 
31
  return model, processor, vocoder, speaker_model, example
32
 
33
  model, processor, vocoder, speaker_model, default_example = load_models_and_data()
34
 
 
 
 
35
  def create_speaker_embedding(waveform):
36
  with torch.no_grad():
37
  speaker_embeddings = speaker_model.encode_batch(torch.tensor(waveform).unsqueeze(0).to(device))
 
45
 
46
  default_embedding = prepare_default_embedding(default_example)
47
 
48
# Ordered (character, ASCII spelling) pairs applied one after another by
# normalize_text. Built from a mapping so each source character visibly
# appears once; dict insertion order keeps the original sequence.
replacements = list({
    "â": "a",   # long a
    "ç": "ch",  # ch as in "chair"
    "ğ": "gh",  # silent g / slight lengthening of the preceding vowel
    "ı": "i",   # dotless i
    "î": "i",   # long i
    "ö": "oe",  # similar to German ö
    "ş": "sh",  # sh as in "shoe"
    "ü": "ue",  # similar to German ü
    "û": "u",   # long u
}.items())
59
 
60
# Turkish spellings of the numbers that are looked up directly rather than
# composed: 0-19, the tens, and the bare words for one hundred / one thousand.
number_words = {
    0: "sıfır", 1: "bir", 2: "iki", 3: "üç", 4: "dört",
    5: "beş", 6: "altı", 7: "yedi", 8: "sekiz", 9: "dokuz",
    10: "on", 11: "on bir", 12: "on iki", 13: "on üç", 14: "on dört",
    15: "on beş", 16: "on altı", 17: "on yedi", 18: "on sekiz", 19: "on dokuz",
    20: "yirmi", 30: "otuz", 40: "kırk", 50: "elli",
    60: "altmış", 70: "yetmiş", 80: "seksen", 90: "doksan",
    100: "yüz", 1000: "bin",
}


def _join_with_remainder(head, remainder):
    """Append the spelled-out remainder to *head*, or return *head* if it is zero."""
    if not remainder:
        return head
    return head + " " + number_to_words(remainder)


def number_to_words(number):
    """Spell an integer using Turkish number words.

    Args:
        number: The integer to spell out.

    Returns:
        The words for ``number``. Negative values are spelled as "eksi"
        followed by the words for the absolute value. Values whose magnitude
        is 10**12 or more are returned as their decimal digits.
    """
    # Previously a negative value reached the `number_words[number]` lookup
    # below and raised KeyError; spell the sign and recurse on the magnitude.
    if number < 0:
        return "eksi " + number_to_words(-number)

    if number < 20:
        return number_words[number]

    if number < 100:
        tens, unit = divmod(number, 10)
        return _join_with_remainder(number_words[tens * 10], unit)

    if number < 1000:
        hundreds, remainder = divmod(number, 100)
        # One hundred is the bare "yüz", with no leading "bir".
        head = number_words[hundreds] + " yüz" if hundreds > 1 else "yüz"
        return _join_with_remainder(head, remainder)

    if number < 10 ** 6:
        thousands, remainder = divmod(number, 1000)
        # One thousand is the bare "bin", with no leading "bir".
        head = number_to_words(thousands) + " bin" if thousands > 1 else "bin"
        return _join_with_remainder(head, remainder)

    if number < 10 ** 9:
        millions, remainder = divmod(number, 10 ** 6)
        # Unlike "yüz"/"bin", the count is always spoken: "bir milyon".
        return _join_with_remainder(number_to_words(millions) + " milyon", remainder)

    if number < 10 ** 12:
        billions, remainder = divmod(number, 10 ** 9)
        return _join_with_remainder(number_to_words(billions) + " milyar", remainder)

    # No word table beyond "milyar": fall back to the digits.
    return str(number)
87
+
88
def replace_numbers_with_words(text):
    """Return *text* with every standalone run of digits spelled out.

    A run is matched only when it has word boundaries on both sides, so
    digits glued to letters are left untouched. Each match is converted to
    an int and passed to ``number_to_words``.
    """
    return re.sub(
        r'\b\d+\b',
        lambda found: number_to_words(int(found.group())),
        text,
    )
97
+
98
def normalize_text(text):
    """Prepare raw input text for the TTS processor.

    Steps, in order: lowercase the text, spell out digit runs via
    ``replace_numbers_with_words``, apply each pair in ``replacements``,
    then delete every character that is neither a word character nor
    whitespace.
    """
    # NOTE(review): the UI labels say English, but the number words and the
    # character map are Turkish -- confirm this is intended.
    cleaned = replace_numbers_with_words(text.lower())

    # Pairs are applied sequentially, in list order.
    for source_char, spelling in replacements:
        cleaned = cleaned.replace(source_char, spelling)

    # Punctuation is removed outright (not replaced by a space).
    return re.sub(r'[^\w\s]', '', cleaned)
113
 
114
  @spaces.GPU(duration=60)
115
+ def text_to_speech(text, audio_file=None):
116
  # Normalize the input text
117
+ normalized_text = normalize_text(text)
118
 
119
  # Prepare the input for the model
120
  inputs = processor(text=normalized_text, return_tensors="pt").to(device)
 
133
  iface = gr.Interface(
134
  fn=text_to_speech,
135
  inputs=[
136
+ gr.Textbox(label="Enter English text to convert to speech")
 
137
  ],
138
  outputs=[
139
  gr.Audio(label="Generated Speech", type="numpy")
140
  ],
141
+ title="English SpeechT5 Text-to-Speech Demo",
142
+ description="Enter English text, and listen to the generated speech."
143
  )
144
 
145
+ iface.launch(share=True)