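"""Gradio demo for NexaAIDev/OmniAudio-2.6B.

Uploads or records audio, sends it to the hosted OmniAudio API, and streams
the generated text response back into the UI token by token.
"""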
import gradio as gr
import requests
import json
import os

API_KEY = os.getenv("API_KEY")
if not API_KEY:
    raise ValueError("API_KEY environment variable must be set")

def process_audio_stream(audio_path, max_tokens):
    """
    Send the uploaded or recorded audio to the OmniAudio API and stream
    the generated text back to the UI token by token.
    """
    if not audio_path:
        yield "Please upload or record an audio file first."
        return
    
    try:
        # Read and prepare audio file
        with open(audio_path, 'rb') as audio_file:
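            # The upload is always labelled 'audio.wav' (audio/wav MIME type),
            # regardless of the actual format of the selected file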
            files = {
                'audio_file': ('audio.wav', audio_file, 'audio/wav')
            }
            data = {
                'prompt': "",
                'max_tokens': max_tokens
            }
            headers = {
                'X-API-Key': API_KEY
            }
            
            # Make streaming request
            response = requests.post(
                'https://nexa-omni.nexa4ai.com/process-audio/',
                files=files,
                data=data,
                headers=headers,
                stream=True
            )
            
            if response.status_code != 200:
                yield f"Error: Server returned status code {response.status_code}"
                return

            # Accumulated response text; token_count counts the leading
            # template tokens that get skipped below
            response_text = ""
            token_count = 0
            
            # Process the streaming response
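            # Each event arrives as a Server-Sent Events style line: "data: {json}"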
            for line in response.iter_lines():
                if line:
                    line = line.decode('utf-8')
                    if line.startswith('data: '):
                        try:
                            event = json.loads(line[6:])  # Skip 'data: ' prefix
                            if event["status"] == "generating":
                                # Skip leading chat-template tokens
                                # (whitespace, '<|im_start|>', 'assistant')
                                if token_count < 3 and event["token"] in [" ", " \n", "\n", "<|im_start|>", "assistant"]:
                                    token_count += 1
                                    continue
                                response_text += event["token"]
                                yield response_text
                            elif event["status"] == "complete":
                                break
                            elif event["status"] == "error":
                                yield f"Error: {event['error']}"
                                break
                        except json.JSONDecodeError:
                            continue
                
    except Exception as e:
        yield f"Error processing request: {str(e)}"

# Create Gradio interface with specific queue configurations
demo = gr.Interface(
    fn=process_audio_stream,
    inputs=[
        gr.Audio(
            type="filepath",
            label="Upload or Record Audio",
            sources=["upload", "microphone"]
        ),
        gr.Slider(
            minimum=50,
            maximum=200,
            value=50,
            step=1,
            label="Max Tokens"
        )
    ],
    outputs=gr.Textbox(label="Response", interactive=False),
    title="NEXA OmniAudio-2.6B",
    description="""
    OmniAudio-2.6B is a compact audio-language model optimized for edge deployment.
        
    Model Repo: <a href="https://huggingface.co/NexaAIDev/OmniAudio-2.6B">NexaAIDev/OmniAudio-2.6B</a>
    
    Blog: <a href="https://nexa.ai/blogs/omniaudio-2.6b">OmniAudio-2.6B Blog</a>
    
    Upload or record an audio file to have the model analyze its content.""",
    examples=[
        ["example_audios/voice_qa.mp3", 200],
        ["example_audios/voice_in_conversation.mp3", 200],
        ["example_audios/creative_content_generation.mp3", 200],
        ["example_audios/record_summary.mp3", 200],
        ["example_audios/change_tone.mp3", 200],
    ]
)

if __name__ == "__main__":
    # Configure the queue for better streaming performance
    demo.queue(
        max_size=20,
    ).launch(
        server_name="0.0.0.0",
        server_port=7860,
    )