File size: 3,072 Bytes
ab2b897
49c643d
bffc737
 
 
 
49c643d
 
fd41627
db11291
125f0d6
 
 
 
2ce74f8
125f0d6
d100b4b
125f0d6
 
2ce74f8
f091ddf
bffc737
ab2b897
f091ddf
 
 
 
bffc737
 
 
 
ab2b897
f091ddf
 
 
 
 
 
 
 
 
 
 
 
 
bffc737
f091ddf
 
 
bffc737
ab2b897
659ddf4
 
f091ddf
 
 
 
 
bffc737
ab2b897
bffc737
f091ddf
659ddf4
f091ddf
2ce74f8
 
bffc737
 
ab2b897
e81b0e9
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
#%%
from huggingface_hub import login
from transformers import pipeline
import gradio as gr
import os

# Authenticate against the Hugging Face Hub. The access token must be present
# in the 'hf_token' environment variable (raises KeyError if it is missing).
login(token=os.environ['hf_token'])

# print(os.environ['hf_token'])
## Try to load a local model if available
# try:
#     whisper = pipeline(model='/mnt/projects/whisper/WhisperANSP/Models/whisper-large-v2-atco2-asr-atcosim-ANSP-3h1m', task='automatic-speech-recognition')
#     ttl = 'Whisper Large v2 - ATCO2-ATCOSIM-ANSP'
#     dis = 'This demo will transcribe ATC audio files by using the Whisper Large v2 model fine-tuned on the ATCO2, ATCOSIM and ANSP datasets. \n \n Further it uses a Named Entity Recognition model to extract callsigns, commands and values from the transcription. \n This model is based on Google\'s BERT model and fine-tuned on the ATCO2 dataset.'

# except:
# ASR model pulled from the Hub: Whisper Large v2 fine-tuned on ATCO2 + ATCOSIM.
# ttl / dis are the UI title and description consumed by gr.Interface below.
whisper = pipeline(model='jlvdoorn/whisper-large-v2-atco2-asr-atcosim')
ttl = 'Whisper Large v2 - ATCO2-ATCOSIM'
dis = 'This demo will transcribe ATC audio files by using the Whisper Large v2 model fine-tuned on the ATCO2 and ATCOSIM datasets. \n \n Further it uses a Named Entity Recognition model to extract callsigns, commands and values from the transcription. \n This model is based on Google\'s BERT model and fine-tuned on the ATCO2 dataset.'

# Token-classification (NER) model used to tag callsigns, commands and values
# in the transcription text.
bert_atco_ner = pipeline(model='Jzuluaga/bert-base-ner-atc-en-atco2-1h')

#%%
def transcribe(audio_file, audio_mic):
    """Transcribe ATC audio with the Whisper pipeline.

    The microphone recording takes priority over the uploaded file; if
    neither is provided, a human-readable fallback message is returned.
    """
    source = audio_mic if audio_mic is not None else audio_file
    if source is None:
        return 'There was no audio to transcribe...'
    return whisper(source)['text']

#%%
def extractCallSignCommand(transcription):
    """Extract callsigns, commands and values from a transcription via NER.

    Runs the BERT ATC NER pipeline over *transcription* and groups the
    recognized words by whether their entity tag contains 'callsign',
    'command' or 'value' (a word may match more than one group).

    Returns a three-line summary string, or a fallback message when
    *transcription* is not a string (e.g. when transcription was skipped).
    """
    # isinstance is the idiomatic type check (was: type(...) is str).
    if not isinstance(transcription, str):
        return 'There was no transcription to extract a callsign or command from...'

    # One bucket per entity category; insertion order fixes the output order.
    buckets = {'callsign': [], 'command': [], 'value': []}
    for item in bert_atco_ner(transcription):
        for label, words in buckets.items():
            # Substring match: entity tags look like 'B-callsign' / 'I-command'.
            if label in item['entity']:
                words.append(item['word'])

    return ('Callsigns: ' + ', '.join(buckets['callsign'])
            + '\nCommands: ' + ', '.join(buckets['command'])
            + '\nValues: ' + ', '.join(buckets['value']))

#%%
def transcribeAndExtract(audio_file, audio_mic, transcribe_only):
    transcription = transcribe(audio_file, audio_mic)
    if not transcribe_only:
        callSignCommandValues = extractCallSignCommand(transcription)
    else:
        callSignCommandValues = ''
    return transcription, callSignCommandValues

#%%
# Assemble the Gradio UI: two audio inputs (file upload and microphone), a
# "transcribe only" toggle, and two text outputs (transcription + entities).
iface = gr.Interface(
        fn=transcribeAndExtract,
        inputs=[
            gr.Audio(source='upload', type='filepath', interactive=True),
            gr.Audio(source='microphone', type='filepath'),
            gr.Checkbox(label='Transcribe only', default=False),
        ],
        outputs=[
            gr.Text(label='Transcription'),
            gr.Text(label='Callsigns, commands and values'),
        ],
        title=ttl,
        description=dis,
)

#%%
# Alternative: bind to all interfaces on a fixed port for remote access.
#iface.launch(server_name='0.0.0.0', server_port=9000)
# Launch the Gradio app with default settings (local URL, auto-chosen port).
iface.launch()