Spaces:

Hammad712
/

recitation-compare

Sleeping

File size: 2,354 Bytes

521243d
8bb5ed1
 
 
 
d7fd2ab
8bb5ed1
1dbeaf5
8bb5ed1
 
 
 
1d61cef
8bb5ed1
 
1d61cef
8bb5ed1
1d61cef
8bb5ed1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
521243d
8bb5ed1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
521243d
8bb5ed1
 
1d61cef
8bb5ed1

import os

import uvicorn
from fastapi import FastAPI, File, HTTPException, UploadFile
from google import genai
from google.genai import types

# ASGI application object that the route decorators in this module register on.
app = FastAPI()

# The GenAI credential is looked up once, when the module is imported.
# A missing or empty value aborts startup instead of failing on the first request.
api_key = os.environ.get("GENAI_API_KEY")
if api_key is None or api_key == "":
    raise EnvironmentError("GENAI_API_KEY environment variable not set")

# Single GenAI client shared by every request handler.
client = genai.Client(api_key=api_key)

@app.get("/")
async def root():
    """Return a welcome message plus a short description of the comparison endpoint."""
    # Usage details are built separately so the returned payload stays easy to scan.
    usage_info = {
        "endpoint": "/compare-audio",
        "description": "POST two audio files (user recitation and professional qarri) for similarity analysis.",
        "instructions": "Send audio files as form-data with keys 'audio1' and 'audio2'.",
    }
    return {"message": "Welcome to the Audio Similarity API!", "usage": usage_info}

async def _read_audio_part(upload: UploadFile, field_name: str) -> types.Part:
    """Read one uploaded file and wrap it as an inline content part for the model.

    Args:
        upload: The uploaded file received from the multipart form.
        field_name: Form key of the upload, used only in error messages.

    Returns:
        A ``types.Part`` carrying the file's bytes and its declared MIME type.

    Raises:
        HTTPException: With status 400 when the upload declares no content
            type or contains no data.
    """
    # UploadFile.content_type is optional; without it the model cannot be told
    # how to decode the bytes, so reject the request up front.
    mime_type = upload.content_type
    if not mime_type:
        raise HTTPException(
            status_code=400,
            detail=f"Upload '{field_name}' is missing a content type.",
        )

    payload = await upload.read()
    if not payload:
        raise HTTPException(
            status_code=400,
            detail=f"Upload '{field_name}' is empty.",
        )

    return types.Part.from_bytes(data=payload, mime_type=mime_type)


@app.post("/compare-audio")
async def compare_audio(
    audio1: UploadFile = File(...),
    audio2: UploadFile = File(...)
):
    """Compare a user's recitation against a professional one using Gemini.

    Args:
        audio1: The user's recitation, sent as form-data under ``audio1``.
        audio2: The professional qarri recitation, sent under ``audio2``.

    Returns:
        A dict with a single ``"result"`` key holding the model's text reply.

    Raises:
        HTTPException: With status 400 when either upload is empty or has no
            content type.
    """
    # Validate and wrap both uploads before spending a model call on them.
    user_part = await _read_audio_part(audio1, "audio1")
    reference_part = await _read_audio_part(audio2, "audio2")

    # Create a refined prompt that clearly identifies the audio sources.
    prompt = (
        """Please analyze and compare the two provided audio clips.
The first audio is the user's recitation, and the second audio is the professional qarri recitation.
Evaluate their similarity on a scale from 0 to 1, where:
  - 1 indicates the user's recitation contains no mistakes compared to the professional version,
  - 0 indicates there are significant mistakes.
Provide your response with:
  1. A numerical similarity score on the first line.
  2. A single sentence that indicates whether the user's recitation is similar, moderately similar, or dissimilar to the professional qarri."""
    )

    # Use the client's async interface: the synchronous generate_content call
    # would block the event loop (and every other request) until the model replies.
    response = await client.aio.models.generate_content(
        model='gemini-2.0-flash',
        contents=[prompt, user_part, reference_part],
    )

    # Return the model's response.
    return {"result": response.text}

if __name__ == "__main__":
    # Executed directly: serve the app with uvicorn on all IPv4 interfaces, port 8000.
    server_options = {"host": "0.0.0.0", "port": 8000}
    uvicorn.run(app, **server_options)