Spaces:

ff98
/

ctp-audio-image

Sleeping

App Files Files Community

ff98 committed on Nov 7, 2024

Commit

714ab7f

1 Parent(s): afd9054

Features added

Browse files

Files changed (4) hide show

.gitignore +3 -0
app.py +106 -0
classification_plot.png +0 -0
requirements.txt +97 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,3 @@

+# .gitignore
+venv/
+.venv/

app.py ADDED Viewed

	@@ -0,0 +1,106 @@

+import gradio as gr
+import torch
+from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, AutoModelForQuestionAnswering, AutoTokenizer, pipeline
+from PIL import Image
+import matplotlib.pyplot as plt
+def process_inputs(audio, option):
+   # Process inputs and return results
+   if option == "Translate":
+      generated_text = generate_text_from_audio(audio), None
+      return generated_text
+   elif option == "Summarize":
+      generated_text = generate_text_from_audio(audio)
+      return generate_summary_from_text(generated_text, minLength=50, maxLength=150), None
+   elif option == "text-classification":
+      generated_text = generate_text_from_audio(audio)
+      return "", text_classification(generated_text)
+   elif option == "Ask a Question":
+      generated_text = generate_text_from_audio(audio)
+      return ask_ques_from_text(generated_text), None
+def generate_text_from_audio(audio):
+   device = "cuda:0" if torch.cuda.is_available() else "cpu"
+   torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+   model_id = "openai/whisper-small"
+   model = AutoModelForSpeechSeq2Seq.from_pretrained(
+       model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
+   )
+   model.to(device)
+   processor = AutoProcessor.from_pretrained(model_id)
+   # Load the audio using librosa and extract the audio data (not the sample rate)
+   audio_data = audio  # audio_data is the NumPy array we need
+   pipe = pipeline(
+      "automatic-speech-recognition",
+      model=model,
+      tokenizer=processor.tokenizer,
+      feature_extractor=processor.feature_extractor,
+      torch_dtype=torch_dtype,
+      chunk_length_s=30,
+      batch_size=16,  # batch size for inference - set based on your device
+      device=device,
+   )
+   audio_text_result = pipe(audio_data, generate_kwargs={"task": "translate", "forced_decoder_ids": [[1, None], [2, 50359]]})
+   return audio_text_result["text"]
+def generate_summary_from_text(text, minLength, maxLength):
+   summarizer = pipeline("summarization", model="Falconsai/text_summarization")
+   return summarizer(text, max_length=maxLength, min_length=minLength, do_sample=False)
+def text_classification(text):
+   classifier = pipeline(task="text-classification", model="SamLowe/roberta-base-go_emotions", top_k=None)
+   model_outputs = classifier([text])
+   # Extract the labels and scores from the model's output
+   labels = [output['label'] for output in model_outputs[0]]
+   scores = [output['score'] for output in model_outputs[0]]
+   sorted_data = sorted(zip(scores, labels), reverse=True)
+   # Extract top 5 emotions
+   top_5_scores, top_5_labels = zip(*sorted_data[:5])
+   # Plotting the Bar Chart
+   plt.figure(figsize=(12, 8))
+   plt.barh(top_5_labels, top_5_scores, color='skyblue')
+   plt.title('Top 5 Sentiment Scores for Emotions')
+   plt.xlabel('Score')
+   plt.ylabel('Emotion')
+   # Display the plot
+   plt.savefig("classification_plot.png")
+   plt.close()
+   return "classification_plot.png"
+def ask_ques_from_text(text):
+   model_name = "deepset/roberta-base-squad2"
+   # Get predictions
+   nlp = pipeline('question-answering', model=model_name, tokenizer=model_name, device=0)
+   QA_input = {
+      'question': 'who did not recognize?',
+      'context': text  # Your context text from audio_text_result
+   }
+   res = nlp(QA_input)
+   print("Answer from pipeline:", res['answer'])
+   return res['answer']
+demo = gr.Interface(
+   fn=process_inputs,
+   inputs=[
+       gr.Audio(label="Upload audio", type="filepath"),  # Audio input
+       gr.Dropdown(choices=["Translate", "Summarize", "text-classification", "Ask a Question"], label="Choose an Option")
+   ],
+   outputs=[gr.Textbox(label="Result"), gr.Image(label="Classification Plot")],
+)
+demo.launch()

classification_plot.png ADDED Viewed

requirements.txt ADDED Viewed

	@@ -0,0 +1,97 @@

+accelerate==1.1.1
+aiofiles==23.2.1
+aiohappyeyeballs==2.4.3
+aiohttp==3.10.10
+aiosignal==1.3.1
+annotated-types==0.7.0
+anyio==4.6.2.post1
+attrs==24.2.0
+audioread==3.0.1
+certifi==2024.8.30
+cffi==1.17.1
+charset-normalizer==3.4.0
+click==8.1.7
+contourpy==1.3.0
+cycler==0.12.1
+datasets==3.1.0
+decorator==5.1.1
+dill==0.3.8
+fastapi==0.115.4
+ffmpy==0.4.0
+filelock==3.16.1
+fonttools==4.54.1
+frozenlist==1.5.0
+fsspec==2024.9.0
+gradio==5.5.0
+gradio_client==1.4.2
+h11==0.14.0
+httpcore==1.0.6
+httpx==0.27.2
+huggingface-hub==0.26.2
+idna==3.10
+Jinja2==3.1.4
+joblib==1.4.2
+kiwisolver==1.4.7
+lazy_loader==0.4
+librosa==0.10.2.post1
+llvmlite==0.43.0
+markdown-it-py==3.0.0
+MarkupSafe==2.1.5
+matplotlib==3.9.2
+mdurl==0.1.2
+mpmath==1.3.0
+msgpack==1.1.0
+multidict==6.1.0
+multiprocess==0.70.16
+networkx==3.4.2
+numba==0.60.0
+numpy==2.0.2
+orjson==3.10.11
+packaging==24.1
+pandas==2.2.3
+pillow==11.0.0
+platformdirs==4.3.6
+pooch==1.8.2
+propcache==0.2.0
+psutil==6.1.0
+pyarrow==18.0.0
+pycparser==2.22
+pydantic==2.9.2
+pydantic_core==2.23.4
+pydub==0.25.1
+Pygments==2.18.0
+pyparsing==3.2.0
+python-dateutil==2.9.0.post0
+python-multipart==0.0.12
+pytz==2024.2
+PyYAML==6.0.2
+regex==2024.11.6
+requests==2.32.3
+rich==13.9.4
+ruff==0.7.2
+safehttpx==0.1.1
+safetensors==0.4.5
+scikit-learn==1.5.2
+scipy==1.14.1
+semantic-version==2.10.0
+shellingham==1.5.4
+six==1.16.0
+sniffio==1.3.1
+soundfile==0.12.1
+soxr==0.5.0.post1
+starlette==0.41.2
+sympy==1.13.1
+threadpoolctl==3.5.0
+tokenizers==0.20.3
+tomlkit==0.12.0
+torch==2.5.1
+tqdm==4.67.0
+transformers==4.46.2
+typer==0.12.5
+typing_extensions==4.12.2
+tzdata==2024.2
+urllib3==2.2.3
+uvicorn==0.32.0
+websockets==12.0
+xxhash==3.5.0
+yarl==1.17.1