File size: 2,354 Bytes
521243d
8bb5ed1
 
 
 
d7fd2ab
8bb5ed1
1dbeaf5
8bb5ed1
 
 
 
1d61cef
8bb5ed1
 
1d61cef
8bb5ed1
1d61cef
8bb5ed1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
521243d
8bb5ed1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
521243d
8bb5ed1
 
1d61cef
8bb5ed1
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import os

import uvicorn
from fastapi import FastAPI, File, HTTPException, UploadFile
from google import genai
from google.genai import types

# FastAPI application instance; the route decorators below register on it.
app = FastAPI()

# The GenAI API key is taken from the environment. A missing or empty value
# aborts start-up right away.
api_key = os.environ.get("GENAI_API_KEY")
if api_key is None or api_key == "":
    raise EnvironmentError("GENAI_API_KEY environment variable not set")

# GenAI client built from that key.
client = genai.Client(api_key=api_key)

@app.get("/")
async def root():
    """Return a welcome message together with brief usage notes for the API."""
    usage = {
        "endpoint": "/compare-audio",
        "description": "POST two audio files (user recitation and professional qarri) for similarity analysis.",
        "instructions": "Send audio files as form-data with keys 'audio1' and 'audio2'.",
    }
    return {"message": "Welcome to the Audio Similarity API!", "usage": usage}

# Instruction text sent to the model ahead of the two audio parts.
_COMPARISON_PROMPT = """Please analyze and compare the two provided audio clips.
The first audio is the user's recitation, and the second audio is the professional qarri recitation.
Evaluate their similarity on a scale from 0 to 1, where:
  - 1 indicates the user's recitation contains no mistakes compared to the professional version,
  - 0 indicates there are significant mistakes.
Provide your response with:
  1. A numerical similarity score on the first line.
  2. A single sentence that indicates whether the user's recitation is similar, moderately similar, or dissimilar to the professional qarri."""


async def _read_audio_part(upload: UploadFile, field_name: str):
    """Read one uploaded file and wrap it as a GenAI content part.

    Args:
        upload: The uploaded file received from the multipart form.
        field_name: Name of the form field, used only in error messages.

    Returns:
        A ``types.Part`` built from the upload's bytes and content type.

    Raises:
        HTTPException: With status 400 when the upload is empty or carries
            no content type.
    """
    payload = await upload.read()
    if not payload:
        raise HTTPException(
            status_code=400,
            detail=f"Uploaded file '{field_name}' is empty.",
        )
    if not upload.content_type:
        raise HTTPException(
            status_code=400,
            detail=f"Uploaded file '{field_name}' has no content type.",
        )
    return types.Part.from_bytes(data=payload, mime_type=upload.content_type)


@app.post("/compare-audio")
async def compare_audio(
    audio1: UploadFile = File(...),
    audio2: UploadFile = File(...)
):
    """Ask the Gemini model to compare two uploaded audio clips.

    Args:
        audio1: The user's recitation, sent as form-data key ``audio1``.
        audio2: The professional qarri recitation, sent as form-data key
            ``audio2``.

    Returns:
        A dict ``{"result": <model response text>}``.

    Raises:
        HTTPException: With status 400 when either upload is empty or has
            no content type.
    """
    # Validate and wrap both uploads before spending a model call on them.
    user_part = await _read_audio_part(audio1, "audio1")
    reference_part = await _read_audio_part(audio2, "audio2")

    # Await the SDK's async client: the synchronous call would block the
    # event loop for the whole model round-trip and stall other requests.
    response = await client.aio.models.generate_content(
        model='gemini-2.0-flash',
        contents=[_COMPARISON_PROMPT, user_part, reference_part],
    )

    # Return the model's response.
    return {"result": response.text}

if __name__ == "__main__":
    # Direct execution: serve the app with uvicorn on all interfaces, port 8000.
    uvicorn.run(
        app,
        host="0.0.0.0",
        port=8000,
    )