aiola committed on
Commit
19912cb
·
verified ·
1 Parent(s): 654d40b

create app.py

Browse files
Files changed (1) hide show
  1. app.py +70 -0
app.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from transformers import WhisperProcessor, WhisperForConditionalGeneration
3
+ import torch
4
+ import torchaudio
5
+ import re # Import regex library
6
+
7
# Load the processor and the fine-tuned Whisper-NER model once at import time.
# The model is placed on the GPU when CUDA is available and on the CPU
# otherwise, so that loading this module does not fail on GPU-less hosts.
processor = WhisperProcessor.from_pretrained("openai/whisper-large-v2")
model = WhisperForConditionalGeneration.from_pretrained("aiola/whisper-ner-v1").to(
    "cuda" if torch.cuda.is_available() else "cpu"
)
10
+
11
def unify_ner_text(text, symbols_to_replace=("/", " ", ":", "_")):
    """Return a canonical, lower-case, hyphen-joined form of an entity label.

    Whitespace runs are collapsed to single spaces (leading and trailing
    whitespace is dropped), then every occurrence of each entry in
    ``symbols_to_replace`` is substituted with ``"-"`` in the order given,
    and finally the result is lower-cased.
    """
    words = text.split()
    unified = " ".join(words)
    for separator in symbols_to_replace:
        unified = unified.replace(separator, "-")
    unified = unified.lower()
    return unified
17
+
18
+
19
def transcribe_and_recognize_entities(audio_file, prompt):
    """Transcribe an audio file and tag the requested entity types.

    Args:
        audio_file: Path of the audio file to transcribe. ``None`` or an
            empty value (nothing uploaded in the UI) is rejected.
        prompt: Comma-separated entity types, e.g. ``"person, location"``.
            Each entry is normalized with ``unify_ner_text``; empty entries
            are ignored.

    Returns:
        The decoded model output, without the prompt that was fed to the
        decoder.

    Raises:
        gr.Error: If no audio file was provided.
    """
    if not audio_file:
        # Fail with a readable message instead of an obscure loader error.
        raise gr.Error("Please upload an audio file.")

    target_sample_rate = 16000
    signal, sampling_rate = torchaudio.load(audio_file)
    resampler = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=target_sample_rate)
    signal = resampler(signal)
    if signal.ndim == 2:
        # Down-mix multi-channel audio to mono by averaging the channels.
        signal = torch.mean(signal, dim=0)

    signal = signal.cpu()  # The feature extractor works on CPU data.
    input_features = processor(signal, sampling_rate=target_sample_rate, return_tensors="pt").input_features

    # Normalize every requested entity type; skip blank entries such as those
    # produced by a trailing or doubled comma.
    processed_ner_types = [
        unify_ner_text(ner_type)
        for ner_type in (prompt or "").split(",")
        if ner_type.strip()
    ]
    prompt = ", ".join(processed_ner_types)

    print(f"Prompt after unify_ner_text: {prompt}")

    # Follow the model's own placement rather than assuming a CUDA device.
    device = model.device
    prompt_ids = processor.get_prompt_ids(prompt, return_tensors="pt").to(device)

    predicted_ids = model.generate(
        input_features.to(device),
        max_new_tokens=256,
        prompt_ids=prompt_ids,
        # Selects the English language token for decoding. This does not by
        # itself request translation (that is controlled by `task`).
        language="en",
        generation_config=model.generation_config,
    )

    # Depending on the transformers version, the generated sequence may or may
    # not start with the prompt tokens. Drop them only when they are really
    # there, at token level, so the transcript itself is never truncated.
    prompt_length = prompt_ids.shape[-1]
    if predicted_ids.shape[-1] >= prompt_length and torch.equal(
        predicted_ids[0, :prompt_length], prompt_ids
    ):
        predicted_ids = predicted_ids[:, prompt_length:]

    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    return transcription
56
+
57
# Build the Gradio UI: an audio upload plus a free-text list of entity types
# go in, the tagged transcription comes out.
iface = gr.Interface(
    title="Whisper-NER Demo",
    description="Upload an audio file and enter entities to identify. The model will transcribe the audio and recognize entities.",
    fn=transcribe_and_recognize_entities,
    inputs=[
        # type="filepath" hands the callback a path on disk.
        gr.Audio(label="Upload Audio", type="filepath"),
        gr.Textbox(label="Entity Recognition Prompt"),
    ],
    outputs=gr.Textbox(label="Transcription and Entities"),
)

# Start the app; share=True additionally requests a public share link.
iface.launch(share=True)