import gradio as gr
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import matplotlib.pyplot as plt


def process_inputs(audio, option, question=None):
    # Route the uploaded audio to the pipeline selected in the dropdown.
    # Each branch returns a (text, image) pair matching the two Gradio outputs.
    if option == "Translate":
        return generate_text_from_audio(audio), None
    elif option == "Summarize":
        generated_text = generate_text_from_audio(audio)
        return generate_summary_from_text(generated_text, min_length=20, max_length=150), None
    elif option == "text-classification":
        generated_text = generate_text_from_audio(audio)
        return "", text_classification(generated_text)
    elif option == "Ask a Question":
        generated_text = generate_text_from_audio(audio)
        return ask_ques_from_text(generated_text, question), None


def generate_text_from_audio(audio):
    # Use the GPU with half precision when available, otherwise fall back to CPU.
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

    model_id = "openai/whisper-small"
    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
    )
    model.to(device)
    processor = AutoProcessor.from_pretrained(model_id)

    pipe = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        torch_dtype=torch_dtype,
        chunk_length_s=30,
        batch_size=16,  # batch size for inference - set based on your device
        device=device,
    )

    # gr.Audio(type="filepath") passes the path of the uploaded file; the ASR
    # pipeline loads and resamples it itself, so no manual decoding is needed.
    # "task": "translate" asks Whisper to translate the speech into English.
    audio_text_result = pipe(audio, generate_kwargs={"task": "translate"})
    return audio_text_result["text"]


def generate_summary_from_text(text, min_length, max_length):
    summarizer = pipeline("summarization", model="Falconsai/text_summarization")
    return summarizer(text, max_length=max_length, min_length=min_length, do_sample=False)[0]["summary_text"]


def text_classification(text):
    classifier = pipeline(task="text-classification", model="SamLowe/roberta-base-go_emotions", top_k=None)
    model_outputs = classifier([text])

    # Extract the labels and scores from the model's output
    labels = [output["label"] for output in model_outputs[0]]
    scores = [output["score"] for output in model_outputs[0]]

    # Keep only the five highest-scoring emotions
    sorted_data = sorted(zip(scores, labels), reverse=True)
    top_5_scores, top_5_labels = zip(*sorted_data[:5])

    # Plot the scores as a horizontal bar chart and save it for the gr.Image output
    plt.figure(figsize=(12, 8))
    plt.barh(top_5_labels, top_5_scores, color="skyblue")
    plt.title("Top 5 Sentiment Scores for Emotions")
    plt.xlabel("Score")
    plt.ylabel("Emotion")
    plt.savefig("classification_plot.png")
    plt.close()

    return "classification_plot.png"


def ask_ques_from_text(text, ques):
    model_name = "deepset/roberta-base-squad2"

    # Use the GPU only when one is available; device=-1 keeps the pipeline on CPU.
    device = 0 if torch.cuda.is_available() else -1
    nlp = pipeline("question-answering", model=model_name, tokenizer=model_name, device=device)

    QA_input = {
        "question": ques,
        "context": text,  # the translated transcript produced from the audio
    }
    res = nlp(QA_input)
    print("Answer from pipeline:", res["answer"])
    return res["answer"]


demo = gr.Interface(
    fn=process_inputs,
    inputs=[
        gr.Audio(label="Upload audio in .mp3 format", type="filepath"),  # audio file input
        gr.Dropdown(
            choices=["Translate", "Summarize", "text-classification", "Ask a Question"],
            label="Choose an Option",
        ),
        gr.Textbox(
            label="Enter your question if you chose 'Ask a Question' in the dropdown",
            placeholder="Enter your question here",
            visible=True,
        ),
    ],
    outputs=[gr.Textbox(label="Result"), gr.Image(label="Classification Plot")],
)

demo.launch()
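# Dependency note (a minimal setup sketch, assuming a fresh Python environment;
# package versions are not pinned and may need adjusting):
#   pip install gradio torch transformers accelerate matplotlib
# `low_cpu_mem_usage=True` relies on the accelerate package, and reading the
# uploaded file from its path relies on ffmpeg being installed on the system.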