import gradio as gr
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, AutoModelForQuestionAnswering, AutoTokenizer, pipeline
from PIL import Image
import matplotlib.pyplot as plt
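
# This Space turns an uploaded audio clip into English text with Whisper and then,
# depending on the selected option, returns the translation, a summary, an emotion
# classification plot, or an answer to a user-supplied question.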
def process_inputs(audio, option, question=None):
    # Run the chosen task on the audio and return (text_result, image_result)
    if option == "Translate":
        return generate_text_from_audio(audio), None
    elif option == "Summarize":
        generated_text = generate_text_from_audio(audio)
        return generate_summary_from_text(generated_text, minLength=20, maxLength=150), None
    elif option == "text-classification":
        generated_text = generate_text_from_audio(audio)
        return "", text_classification(generated_text)
    elif option == "Ask a Question":
        generated_text = generate_text_from_audio(audio)
        return ask_ques_from_text(generated_text, question), None
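
# Speech-to-text: run the Whisper pipeline on the uploaded audio file and return English text.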
def generate_text_from_audio(audio):
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
    model_id = "openai/whisper-small"
    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
    )
    model.to(device)
    processor = AutoProcessor.from_pretrained(model_id)
    # `audio` is a file path (the Gradio Audio component uses type="filepath");
    # the ASR pipeline loads and resamples the file itself.
    audio_data = audio
    pipe = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        torch_dtype=torch_dtype,
        chunk_length_s=30,
        batch_size=16,  # batch size for inference - set based on your device
        device=device,
    )
    # task="translate" asks Whisper to translate the speech into English
    audio_text_result = pipe(audio_data, generate_kwargs={"task": "translate"})
    return audio_text_result["text"]
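
# Summarization: condense the transcript with the Falconsai/text_summarization model.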
def generate_summary_from_text(text, minLength, maxLength):
    summarizer = pipeline("summarization", model="Falconsai/text_summarization")
    return summarizer(text, max_length=maxLength, min_length=minLength, do_sample=False)[0]['summary_text']
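
# Emotion classification: score the transcript with the go_emotions model and plot the top 5 emotions.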
def text_classification(text):
    classifier = pipeline(task="text-classification", model="SamLowe/roberta-base-go_emotions", top_k=None)
    model_outputs = classifier([text])
    # Extract the labels and scores from the model's output
    labels = [output['label'] for output in model_outputs[0]]
    scores = [output['score'] for output in model_outputs[0]]
    sorted_data = sorted(zip(scores, labels), reverse=True)
    # Extract the top 5 emotions
    top_5_scores, top_5_labels = zip(*sorted_data[:5])
    # Plot the scores as a horizontal bar chart
    plt.figure(figsize=(12, 8))
    plt.barh(top_5_labels, top_5_scores, color='skyblue')
    plt.title('Top 5 Sentiment Scores for Emotions')
    plt.xlabel('Score')
    plt.ylabel('Emotion')
    # Save the plot so Gradio can display it as an image
    plt.savefig("classification_plot.png")
    plt.close()
    return "classification_plot.png"
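
# Extractive question answering: answer the user's question using the transcript as context.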
def ask_ques_from_text(text, ques):
    model_name = "deepset/roberta-base-squad2"
    # Get predictions (use the GPU if available, otherwise fall back to CPU)
    nlp = pipeline('question-answering', model=model_name, tokenizer=model_name,
                   device=0 if torch.cuda.is_available() else -1)
    QA_input = {
        'question': ques,
        'context': text  # the transcript produced from the audio
    }
    res = nlp(QA_input)
    print("Answer from pipeline:", res['answer'])
    return res['answer']
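
# Gradio UI: an audio upload, a task dropdown, and an optional question box;
# outputs are a text result and an optional classification plot.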
demo = gr.Interface(
    fn=process_inputs,
    inputs=[
        gr.Audio(label="Upload audio in .mp3 format", type="filepath"),  # audio input passed as a file path
        gr.Dropdown(choices=["Translate", "Summarize", "text-classification", "Ask a Question"], label="Choose an Option"),
        gr.Textbox(label="Enter your question (used only with the 'Ask a Question' option)", placeholder="Enter your question here", visible=True)
    ],
    outputs=[gr.Textbox(label="Result"), gr.Image(label="Classification Plot")],
)

demo.launch()