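"""Gradio demo: convert uploaded audio to English text with Whisper, then
optionally summarize it, classify its emotions, or answer a question about it."""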
import gradio as gr
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import matplotlib.pyplot as plt


def process_inputs(audio, option, question=None):
   """Dispatch the uploaded audio to the task selected in the dropdown."""
   if option == "Translate":
      return generate_text_from_audio(audio), None
   elif option == "Summarize":
      generated_text = generate_text_from_audio(audio)
      return generate_summary_from_text(generated_text, min_length=20, max_length=150), None
   elif option == "text-classification":
      generated_text = generate_text_from_audio(audio)
      return "", text_classification(generated_text)
   elif option == "Ask a Question":
      if not question:
         return "Please enter a question to use the 'Ask a Question' option.", None
      generated_text = generate_text_from_audio(audio)
      return ask_ques_from_text(generated_text, question), None
   return "Please choose an option from the dropdown.", None

def generate_text_from_audio(audio):
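   """Run Whisper speech recognition on the uploaded audio and return English text."""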
   device = "cuda:0" if torch.cuda.is_available() else "cpu"
   torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
   model_id = "openai/whisper-small"
   
   model = AutoModelForSpeechSeq2Seq.from_pretrained(
       model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
   )
   model.to(device)

   processor = AutoProcessor.from_pretrained(model_id)

   # gr.Audio(type="filepath") hands us a path to the uploaded file; the ASR
   # pipeline decodes and resamples it internally, so no manual loading is needed
   audio_data = audio

   pipe = pipeline(
      "automatic-speech-recognition",
      model=model,
      tokenizer=processor.tokenizer,
      feature_extractor=processor.feature_extractor,
      torch_dtype=torch_dtype,
      chunk_length_s=30,
      batch_size=16,  # batch size for inference - set based on your device
      device=device,
   )

   # task="translate" makes Whisper output English regardless of the spoken language
   audio_text_result = pipe(audio_data, generate_kwargs={"task": "translate"})
   return audio_text_result["text"]

def generate_summary_from_text(text, min_length, max_length):
   """Summarize the transcript with a pretrained summarization pipeline."""
   summarizer = pipeline("summarization", model="Falconsai/text_summarization")
   return summarizer(text, max_length=max_length, min_length=min_length, do_sample=False)[0]['summary_text']

def text_classification(text):
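   """Score the transcript's emotions and save a bar chart of the top five."""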
   classifier = pipeline(task="text-classification", model="SamLowe/roberta-base-go_emotions", top_k=None)
   model_outputs = classifier([text])

   # Extract the labels and scores from the model's output
   labels = [output['label'] for output in model_outputs[0]]
   scores = [output['score'] for output in model_outputs[0]]
   sorted_data = sorted(zip(scores, labels), reverse=True)

   # Extract top 5 emotions
   top_5_scores, top_5_labels = zip(*sorted_data[:5])

   # Plotting the Bar Chart
   plt.figure(figsize=(12, 8))
   plt.barh(top_5_labels, top_5_scores, color='skyblue')
   plt.title('Top 5 Sentiment Scores for Emotions')
   plt.xlabel('Score')
   plt.ylabel('Emotion')

   # Display the plot
   plt.savefig("classification_plot.png")
   plt.close()
   return "classification_plot.png"


def ask_ques_from_text(text, ques):
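   """Answer the question using the transcript as context (extractive QA)."""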
   model_name = "deepset/roberta-base-squad2"

   # Run extractive QA; fall back to CPU (device=-1) when no GPU is available
   nlp = pipeline('question-answering', model=model_name, tokenizer=model_name,
                  device=0 if torch.cuda.is_available() else -1)

   QA_input = {
      'question': ques,
      'context': text  # transcript generated from the uploaded audio
   }

   res = nlp(QA_input)
   print("Answer from pipeline:", res['answer'])
   
   return res['answer']

demo = gr.Interface(
   fn=process_inputs,
   inputs=[
       gr.Audio(label="Upload audio in .mp3 format", type="filepath"),  # Audio input
       gr.Dropdown(choices=["Translate", "Summarize", "text-classification", "Ask a Question"], label="Choose an Option"),
       gr.Textbox(label="Question (used only with the 'Ask a Question' option)", placeholder="Enter your question here", visible=True)
   ],
   outputs=[gr.Textbox(label="Result"), gr.Image(label="Classification Plot")],
)


demo.launch()