imorcillo committed on
Commit
eaf5bbe
·
verified ·
1 Parent(s): 7541c50

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +20 -15
app.py CHANGED
@@ -36,25 +36,30 @@ def transcribe_base(audio, language):
36
  speaker_class_string = f'Speaker found in database, ID {speaker}'
37
  return transcription#, speaker_class_string
38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  def transcribe_mic(audio_microphone, language):
40
  print("Transcription microphone")
41
  transcription = transcribe_base(audio_microphone, language)
 
42
  if language=="it":
43
- no_elision_cases = {
44
- "un autore", "un artista", "un uomo", "un amico", "un imperatore",
45
- "uno studente", "uno psicologo", "uno zio",
46
- "di autore", "a uomo", "su imperatore", "con amico", "per artista"
47
- }
48
- transcription = re.sub(r"\b(un|l|d|s|t|m|c|n|quest|all|dall|nell|sull|coll|pell)\s+(?=[aeiouhàèìòùáéíóú])", r"\1'", transcription)
49
- transcription = re.sub(r"\b(s|t|m|c|n)\s+(?=è\b|ha\b|hanno\b)", r"\1'", transcription)
50
- transcription = re.sub(r"\bpo\b", "po'", transcription)
51
- transcription = re.sub(r"\b(senz) ([aeiou])", r"\1'\2", transcription)
52
- pattern_numbers = r"\b(trenta|quaranta|cinquanta|sessanta|settanta|ottanta|novanta)\s+(?=anni|ore)\b"
53
- replacement_numbers = lambda m: m.group(1)[:-1] + "’" + m.group(0).split()[1]
54
- transcription = re.sub(pattern_numbers, replacement_numbers, transcription)
55
- for phrase in no_elision_cases:
56
- fixed = phrase.replace(" ", "'")
57
- transcription = transcription.replace(fixed, phrase)
58
  return transcription
59
  #return transcribe_base(audio_microphone, language)
60
 
 
36
  speaker_class_string = f'Speaker found in database, ID {speaker}'
37
  return transcription#, speaker_class_string
38
 
39
def fix_italian_transcription(transcription):
    """Re-insert Italian elision apostrophes that appear as spaces in *transcription*.

    The rules are applied in order:

    1. An eliding particle (``l``, ``d``, ``un``, ``quest``, ``all``, ...)
       followed by whitespace and a vowel or ``h`` is joined to the next word
       with an apostrophe (``"l amico"`` -> ``"l'amico"``).
    2. Clitics before ``è`` / ``ha`` / ``hanno`` are joined (``"c è"`` -> ``"c'è"``).
    3. The truncated form ``po`` becomes ``po'``.
    4. ``senz`` followed by a vowel is joined (``"senz altro"`` -> ``"senz'altro"``).
    5. Tens numerals before ``anni`` / ``ore`` are elided
       (``"trenta anni"`` -> ``"trent’anni"``).
    6. Phrases listed in ``no_elision_cases`` that were wrongly elided by the
       rules above are restored to their spaced form
       (``"un'amico"`` -> ``"un amico"``).

    Args:
        transcription: Text to correct.

    Returns:
        The corrected text.
    """
    # Phrases that must keep the space; rule 1 would otherwise elide them.
    no_elision_cases = (
        "un autore", "un artista", "un uomo", "un amico", "un imperatore",
        "uno studente", "uno psicologo", "uno zio",
        "di autore", "a uomo", "su imperatore", "con amico", "per artista",
    )

    transcription = re.sub(
        r"\b(un|l|d|s|t|m|c|n|quest|all|dall|nell|sull|coll|pell)\s+(?=[aeiouhàèìòùáéíóú])",
        r"\1'",
        transcription,
    )
    transcription = re.sub(r"\b(s|t|m|c|n)\s+(?=è\b|ha\b|hanno\b)", r"\1'", transcription)
    # The negative lookahead keeps an already-correct "po'" from becoming "po''".
    transcription = re.sub(r"\bpo\b(?!['’])", "po'", transcription)
    transcription = re.sub(r"\b(senz) ([aeiou])", r"\1'\2", transcription)

    # The following word is only looked ahead at, never consumed, so the
    # replacement just drops the numeral's final vowel and appends the
    # apostrophe; "anni"/"ore" stays in place right after it. The word
    # boundary lives inside the lookahead so that only the whole words
    # "anni" and "ore" qualify.
    transcription = re.sub(
        r"\b(trenta|quaranta|cinquanta|sessanta|settanta|ottanta|novanta)\s+(?=(?:anni|ore)\b)",
        lambda m: m.group(1)[:-1] + "’",
        transcription,
    )

    # Undo elisions for the phrases that must stay spaced.
    for phrase in no_elision_cases:
        transcription = transcription.replace(phrase.replace(" ", "'"), phrase)

    return transcription
55
+
56
def transcribe_mic(audio_microphone, language):
    """Transcribe microphone audio, post-processing the text when it is Italian.

    Prints a trace message, passes both arguments to ``transcribe_base``, and
    for ``language == "it"`` passes the result through
    ``fix_italian_transcription`` before returning it.
    """
    print("Transcription microphone")
    text = transcribe_base(audio_microphone, language)
    return fix_italian_transcription(text) if language == "it" else text
64
  #return transcribe_base(audio_microphone, language)
65