Spaces:
Running
Running
File size: 2,354 Bytes
521243d 8bb5ed1 d7fd2ab 8bb5ed1 1dbeaf5 8bb5ed1 1d61cef 8bb5ed1 1d61cef 8bb5ed1 1d61cef 8bb5ed1 521243d 8bb5ed1 521243d 8bb5ed1 1d61cef 8bb5ed1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 |
import os

import uvicorn
from fastapi import FastAPI, File, HTTPException, UploadFile
from google import genai
from google.genai import types
# Application object that the route decorators below attach to.
app = FastAPI()

# The GenAI client needs an API key, so refuse to start when the
# GENAI_API_KEY environment variable is missing or empty.
api_key = os.environ.get("GENAI_API_KEY")
if not api_key:
    raise EnvironmentError("GENAI_API_KEY environment variable not set")

# Shared GenAI client used by the request handlers.
client = genai.Client(api_key=api_key)
@app.get("/")
async def root():
    """Return a welcome payload describing how to call the comparison endpoint."""
    usage = {
        "endpoint": "/compare-audio",
        "description": "POST two audio files (user recitation and professional qarri) for similarity analysis.",
        "instructions": "Send audio files as form-data with keys 'audio1' and 'audio2'.",
    }
    return {"message": "Welcome to the Audio Similarity API!", "usage": usage}
async def _read_audio_part(upload: UploadFile, field_name: str) -> types.Part:
    """Read one uploaded file and wrap it as a GenAI content part.

    Args:
        upload: The uploaded file received from the multipart form.
        field_name: Form key of the upload, used only in error messages.

    Returns:
        A ``types.Part`` built from the file's bytes and its content type.

    Raises:
        HTTPException: 400 when the upload is empty or has no content type.
    """
    data = await upload.read()
    if not data:
        raise HTTPException(status_code=400, detail=f"'{field_name}' is empty.")
    # content_type may be absent; reject it here with a clear client error
    # instead of passing None on as the part's mime_type.
    if not upload.content_type:
        raise HTTPException(
            status_code=400,
            detail=f"'{field_name}' has no content type; send it with an audio MIME type.",
        )
    return types.Part.from_bytes(data=data, mime_type=upload.content_type)


@app.post("/compare-audio")
async def compare_audio(
    audio1: UploadFile = File(...),
    audio2: UploadFile = File(...),
):
    """Compare a user's recitation against a professional one using Gemini.

    Args:
        audio1: The user's recitation (form-data key ``audio1``).
        audio2: The professional qarri recitation (form-data key ``audio2``).

    Returns:
        A dict with a single ``"result"`` key holding the model's text reply.

    Raises:
        HTTPException: 400 when either upload is empty or lacks a content type.
    """
    # Read and validate the uploads in the same order as the prompt refers to
    # them: user first, professional second.
    user_part = await _read_audio_part(audio1, "audio1")
    reference_part = await _read_audio_part(audio2, "audio2")

    # Prompt that tells the model which clip is which and the reply format.
    prompt = """Please analyze and compare the two provided audio clips.
The first audio is the user's recitation, and the second audio is the professional qarri recitation.
Evaluate their similarity on a scale from 0 to 1, where:
- 1 indicates the user's recitation contains no mistakes compared to the professional version,
- 0 indicates there are significant mistakes.
Provide your response with:
1. A numerical similarity score on the first line.
2. A single sentence that indicates whether the user's recitation is similar, moderately similar, or dissimilar to the professional qarri."""

    # Use the SDK's async client so the event loop stays free to serve other
    # requests while waiting on the model; the synchronous call would block it.
    response = await client.aio.models.generate_content(
        model="gemini-2.0-flash",
        contents=[prompt, user_part, reference_part],
    )

    return {"result": response.text}
if __name__ == "__main__":
    # Run directly as a script: serve the app with uvicorn on all interfaces.
    server_options = {"host": "0.0.0.0", "port": 8000}
    uvicorn.run(app, **server_options)
|