# ctp-audio-image / app.py
import gradio as gr
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from PIL import Image
import matplotlib.pyplot as plt
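
# Gradio app: upload an audio clip, transcribe/translate it to English with Whisper,
# then optionally summarize it, classify its emotions, or answer a question about it.
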
def process_inputs(audio, option, question=None):
    # Transcribe/translate the uploaded audio, then route the text to the chosen task.
    if option == "Translate":
        return generate_text_from_audio(audio), None
    elif option == "Summarize":
        generated_text = generate_text_from_audio(audio)
        return generate_summary_from_text(generated_text, min_length=20, max_length=150), None
    elif option == "text-classification":
        generated_text = generate_text_from_audio(audio)
        return "", text_classification(generated_text)
    elif option == "Ask a Question":
        if not question:
            return "Please enter a question in the textbox.", None
        generated_text = generate_text_from_audio(audio)
        return ask_ques_from_text(generated_text, question), None
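
# Speech-to-text: run openai/whisper-small through the ASR pipeline, translating the
# audio into English and chunking long recordings so they fit in memory.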
def generate_text_from_audio(audio):
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
    model_id = "openai/whisper-small"
    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
    )
    model.to(device)
    processor = AutoProcessor.from_pretrained(model_id)
    pipe = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        torch_dtype=torch_dtype,
        chunk_length_s=30,  # split long recordings into 30 s chunks
        batch_size=16,      # batch size for inference - set based on your device
        device=device,
    )
    # `audio` is a file path (gr.Audio with type="filepath"); the pipeline loads and resamples it.
    audio_text_result = pipe(audio, generate_kwargs={"task": "translate"})
    return audio_text_result["text"]
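
# Summarization: condense the transcript with the Falconsai/text_summarization model.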
def generate_summary_from_text(text, min_length, max_length):
    summarizer = pipeline("summarization", model="Falconsai/text_summarization")
    return summarizer(text, max_length=max_length, min_length=min_length, do_sample=False)[0]["summary_text"]
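
# Emotion classification: score the transcript against the GoEmotions label set and
# plot the top five emotions as a bar chart image.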
def text_classification(text):
    classifier = pipeline(task="text-classification", model="SamLowe/roberta-base-go_emotions", top_k=None)
    model_outputs = classifier([text])
    # Extract the labels and scores from the model's output
    labels = [output["label"] for output in model_outputs[0]]
    scores = [output["score"] for output in model_outputs[0]]
    sorted_data = sorted(zip(scores, labels), reverse=True)
    # Keep the five highest-scoring emotions
    top_5_scores, top_5_labels = zip(*sorted_data[:5])
    # Plot the scores as a horizontal bar chart
    plt.figure(figsize=(12, 8))
    plt.barh(top_5_labels, top_5_scores, color="skyblue")
    plt.title("Top 5 Sentiment Scores for Emotions")
    plt.xlabel("Score")
    plt.ylabel("Emotion")
    # Save the figure to disk so Gradio can render it in the Image output
    plt.savefig("classification_plot.png")
    plt.close()
    return "classification_plot.png"
def ask_ques_from_text(text, ques):
    model_name = "deepset/roberta-base-squad2"
    # Run the question-answering pipeline on GPU when available, otherwise fall back to CPU
    nlp = pipeline(
        "question-answering",
        model=model_name,
        tokenizer=model_name,
        device=0 if torch.cuda.is_available() else -1,
    )
    QA_input = {
        "question": ques,
        "context": text,  # the transcript produced from the audio
    }
    res = nlp(QA_input)
    print("Answer from pipeline:", res["answer"])
    return res["answer"]
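
# Wire everything into a single Gradio interface: three inputs (audio file, task
# dropdown, optional question) and two outputs (text result, classification plot).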
demo = gr.Interface(
    fn=process_inputs,
    inputs=[
        gr.Audio(label="Upload audio in .mp3 format", type="filepath"),  # audio input (file path)
        gr.Dropdown(choices=["Translate", "Summarize", "text-classification", "Ask a Question"], label="Choose an Option"),
        gr.Textbox(label="Question (used only with 'Ask a Question')", placeholder="Enter your question here", visible=True),
    ],
    outputs=[gr.Textbox(label="Result"), gr.Image(label="Classification Plot")],
)
demo.launch()
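# Note: demo.launch(share=True) would also expose a temporary public link when running locally.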