# ctp-audio-image / app.py
import gradio as gr
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, AutoModelForQuestionAnswering, AutoTokenizer, pipeline
from PIL import Image
import matplotlib.pyplot as plt
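# This Gradio app takes an uploaded audio file, transcribes/translates it to
# English with Whisper, and then runs one of four follow-up steps on the text:
# plain translation, summarization, emotion classification (rendered as a bar
# chart), or extractive question answering.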
def process_inputs(audio, option):
    # Run speech recognition first, then dispatch to the selected task.
    # Each branch returns a (text, image_path) pair matching the two outputs
    # of the Gradio interface below.
    if option == "Translate":
        return generate_text_from_audio(audio), None
    elif option == "Summarize":
        generated_text = generate_text_from_audio(audio)
        return generate_summary_from_text(generated_text, minLength=50, maxLength=150), None
    elif option == "text-classification":
        generated_text = generate_text_from_audio(audio)
        return "", text_classification(generated_text)
    elif option == "Ask a Question":
        generated_text = generate_text_from_audio(audio)
        return ask_ques_from_text(generated_text), None
    # No option selected: return empty outputs instead of None
    return "", None
def generate_text_from_audio(audio):
    # NOTE: the Whisper model is loaded on every call; caching it at module
    # level would avoid reloading the weights for each request.
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
    model_id = "openai/whisper-small"
    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
    )
    model.to(device)
    processor = AutoProcessor.from_pretrained(model_id)
    # gr.Audio(type="filepath") passes the path of the uploaded file; the ASR
    # pipeline accepts a file path directly, so no manual loading is needed.
    audio_data = audio
    pipe = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        torch_dtype=torch_dtype,
        chunk_length_s=30,  # chunk long audio into 30 s windows
        batch_size=16,  # batch size for inference - set based on your device
        device=device,
    )
    # Ask Whisper to translate (rather than transcribe) so non-English speech
    # comes out as English text.
    audio_text_result = pipe(audio_data, generate_kwargs={"task": "translate"})
    return audio_text_result["text"]
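# Optional (not wired into the app above): load the ASR pipeline once at module
# level instead of on every request. A minimal sketch; the name
# `get_asr_pipeline` is not part of the original app.
_asr_pipe = None

def get_asr_pipeline():
    global _asr_pipe
    if _asr_pipe is None:
        _asr_pipe = pipeline(
            "automatic-speech-recognition",
            model="openai/whisper-small",
            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
            chunk_length_s=30,
            device="cuda:0" if torch.cuda.is_available() else "cpu",
        )
    return _asr_pipe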
def generate_summary_from_text(text, minLength, maxLength):
    summarizer = pipeline("summarization", model="Falconsai/text_summarization")
    # The pipeline returns a list of dicts; return just the summary string.
    return summarizer(text, max_length=maxLength, min_length=minLength, do_sample=False)[0]["summary_text"]
def text_classification(text):
    classifier = pipeline(task="text-classification", model="SamLowe/roberta-base-go_emotions", top_k=None)
    model_outputs = classifier([text])
    # Extract the labels and scores from the model's output
    labels = [output['label'] for output in model_outputs[0]]
    scores = [output['score'] for output in model_outputs[0]]
    # Sort by score (descending) and keep the top 5 emotions
    sorted_data = sorted(zip(scores, labels), reverse=True)
    top_5_scores, top_5_labels = zip(*sorted_data[:5])
    # Plot the top 5 emotions as a horizontal bar chart
    plt.figure(figsize=(12, 8))
    plt.barh(top_5_labels, top_5_scores, color='skyblue')
    plt.title('Top 5 Sentiment Scores for Emotions')
    plt.xlabel('Score')
    plt.ylabel('Emotion')
    # Save the plot to a file so Gradio can show it in the Image output
    plt.savefig("classification_plot.png")
    plt.close()
    return "classification_plot.png"
def ask_ques_from_text(text):
    model_name = "deepset/roberta-base-squad2"
    # Build the QA pipeline; fall back to CPU (-1) when no GPU is available
    nlp = pipeline('question-answering', model=model_name, tokenizer=model_name,
                   device=0 if torch.cuda.is_available() else -1)
    # The question is currently hard-coded; the context is the text produced
    # from the audio.
    QA_input = {
        'question': 'who did not recognize?',
        'context': text
    }
    res = nlp(QA_input)
    print("Answer from pipeline:", res['answer'])
    return res['answer']
demo = gr.Interface(
    fn=process_inputs,
    inputs=[
        gr.Audio(label="Upload audio", type="filepath"),  # audio input, passed as a file path
        gr.Dropdown(choices=["Translate", "Summarize", "text-classification", "Ask a Question"], label="Choose an Option")
    ],
    outputs=[gr.Textbox(label="Result"), gr.Image(label="Classification Plot")],
)
demo.launch()
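# Running this script starts a local Gradio server (http://127.0.0.1:7860 by default).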