Spaces:
Runtime error
Runtime error
File size: 4,148 Bytes
4f3f83c 425758f 4f3f83c 425758f 4f3f83c 425758f 4f3f83c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 |
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers.pipelines.audio_utils import ffmpeg_read
import gradio as gr
# Maps the classifier's predicted class index to the language name shown to
# the user (looked up in detect_language via the argmax index).
# The constant keeps its historical spelling because other code references it.
LANGUANGE_MAP = {
    0: 'Arabic',
    1: 'Basque',
    2: 'Breton',
    3: 'Catalan',
    4: 'Chinese_China',
    5: 'Chinese_Hongkong',
    6: 'Chinese_Taiwan',
    7: 'Chuvash',
    8: 'Czech',
    9: 'Dhivehi',
    10: 'Dutch',
    11: 'English',
    12: 'Esperanto',
    13: 'Estonian',
    14: 'French',
    15: 'Frisian',
    16: 'Georgian',
    17: 'German',
    18: 'Greek',
    19: 'Hakha_Chin',
    20: 'Indonesian',
    21: 'Interlingua',
    22: 'Italian',
    23: 'Japanese',
    24: 'Kabyle',
    25: 'Kinyarwanda',
    26: 'Kyrgyz',
    27: 'Latvian',
    28: 'Maltese',
    29: 'Mongolian',
    30: 'Persian',
    31: 'Polish',
    32: 'Portuguese',
    33: 'Romanian',
    34: 'Romansh_Sursilvan',
    35: 'Russian',
    36: 'Sakha',
    37: 'Slovenian',
    38: 'Spanish',
    39: 'Swedish',
    40: 'Tamil',
    41: 'Tatar',
    42: 'Turkish',
    43: 'Ukrainian',  # was misspelled 'Ukranian'
    44: 'Welsh',
}
# Torch device strings are lowercase; "CPU" is rejected by torch.device / Tensor.to,
# so the CPU fallback must be spelled "cpu".
device = "cuda" if torch.cuda.is_available() else "cpu"
# Hub identifier of the checkpoint used for text language classification.
model_ckpt = "barto17/language-detection-fine-tuned-on-xlm-roberta-base"

# Tokenizer and classifier are loaded from the same checkpoint so that the
# vocabulary matches the model weights.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt)
def detect_language(sentence):
    """Classify the language of a piece of text with the module-level model.

    Args:
        sentence: Text to classify.

    Returns:
        A ``(language, probability)`` tuple: the ``LANGUANGE_MAP`` name of the
        highest-scoring class and its softmax probability as a Python float.
    """
    # truncation=True caps the input at the tokenizer's configured maximum
    # length, so very long text is cut instead of overflowing the model.
    tokenized_sentence = tokenizer(sentence, return_tensors='pt', truncation=True)
    # Keep the inputs on whatever device the model currently lives on.
    tokenized_sentence = tokenized_sentence.to(model.device)

    # Pure inference: no autograd graph is needed.
    with torch.no_grad():
        output = model(**tokenized_sentence)

    predictions = torch.nn.functional.softmax(output.logits, dim=-1)
    probability, pred_idx = torch.max(predictions, dim=-1)
    language = LANGUANGE_MAP[pred_idx.item()]
    return language, probability.item()
def process_audio_file(file, sampling_rate=16000):
    """Read an audio file from disk and decode it with ``ffmpeg_read``.

    Args:
        file: Path of the audio file to decode.
        sampling_rate: Target sampling rate passed to ``ffmpeg_read``.
            The original code referenced an undefined ``sampling_rate`` name;
            it is now a parameter. NOTE(review): the 16 kHz default assumes a
            Whisper-style feature extractor downstream - confirm against the
            processor actually used.

    Returns:
        The decoded audio as returned by ``ffmpeg_read``.
    """
    with open(file, "rb") as f:
        inputs = f.read()
    return ffmpeg_read(inputs, sampling_rate)
def transcribe(Microphone, File_Upload):
    """Detect the language spoken in a recorded or uploaded audio file.

    The microphone recording takes precedence when both inputs are supplied;
    in that case a warning is prepended to the returned language string.

    Args:
        Microphone: File path of the microphone recording, or ``None``.
        File_Upload: File path of the uploaded audio, or ``None``.

    Returns:
        A ``(language, probability)`` tuple, matching the two output
        components of the interface. When neither input is given, the first
        element is an error message and the probability is ``0.0``.
    """
    if Microphone is None and File_Upload is None:
        # Return two values so the shape matches the success path.
        return "ERROR: You have to either use the microphone or upload an audio file", 0.0

    warn_output = ""
    if Microphone is not None and File_Upload is not None:
        warn_output = (
            "WARNING: You've uploaded an audio file and used the microphone. "
            "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
        )
    file = Microphone if Microphone is not None else File_Upload

    audio_data = process_audio_file(file)

    # NOTE(review): `processor` and `decoder_input_ids` are not defined in the
    # visible part of this file, and the module-level `model` is loaded as a
    # sequence-classification checkpoint while this call passes speech-style
    # arguments (input_features / decoder_input_ids). A speech model and its
    # processor presumably need to be loaded for this step - confirm.
    input_features = processor(audio_data, return_tensors="pt").input_features
    with torch.no_grad():
        logits = model.forward(input_features.to(device), decoder_input_ids=decoder_input_ids).logits
    pred_ids = torch.argmax(logits, dim=-1)
    transcription = processor.decode(pred_ids[0])

    # Previously the classification result was computed and then dropped, so
    # the function returned None; return it and surface any warning.
    language, probability = detect_language(transcription.capitalize())
    return warn_output + language, probability
# Audio clips offered as ready-made examples in the UI.
examples = [
    "sample1.mp3",
    "sample2.mp3",
    "sample3.mp3",
]

# Label widget for the detected language.
# NOTE(review): the gr.Interface call further down lists its own output
# components - check whether this widget is still used.
outputs = gr.outputs.Label(label="Language detected:")
# Footer text shown under the demo; the language names are spelled the same
# way as the labels the classifier emits ('Mongolian', not 'Mangolian').
article = """
Fine-tuned on xlm-roberta-base model.\n
Supported languages:\n
'Arabic', 'Basque', 'Breton', 'Catalan', 'Chinese_China', 'Chinese_Hongkong', 'Chinese_Taiwan', 'Chuvash', 'Czech',
'Dhivehi', 'Dutch', 'English', 'Esperanto', 'Estonian', 'French', 'Frisian', 'Georgian', 'German', 'Greek', 'Hakha_Chin',
'Indonesian', 'Interlingua', 'Italian', 'Japanese', 'Kabyle', 'Kinyarwanda', 'Kyrgyz', 'Latvian', 'Maltese',
'Mongolian', 'Persian', 'Polish', 'Portuguese', 'Romanian', 'Romansh_Sursilvan', 'Russian', 'Sakha', 'Slovenian',
'Spanish', 'Swedish', 'Tamil', 'Tatar', 'Turkish', 'Ukrainian', 'Welsh'
"""
# Build and launch the demo UI.
# The original call repeated the `fn` keyword and contained `outputs=outputs=[`,
# both syntax errors. `transcribe` is kept as the handler because the inputs
# are audio components, and the two outputs match a (language, probability) pair.
# NOTE(review): `examples` holds bare file paths while the interface declares
# two inputs - confirm gradio maps them to the inputs as intended.
gr.Interface(
    fn=transcribe,
    inputs=[
        gr.inputs.Audio(source="microphone", type='filepath', optional=True),
        gr.inputs.Audio(source="upload", type='filepath', optional=True),
    ],
    outputs=[
        gr.outputs.Textbox(label="Language"),
        gr.Number(label="Probability"),
    ],
    verbose=True,
    examples=examples,
    title="Language Identification from Audio",
    description="Detect the Language from Audio.",
    article=article,
    theme="huggingface",
).launch()
|