Spaces:

Hammad712
/

recitation-compare

Sleeping

App Files Files Community

Hammad712 committed on Mar 23

Commit

60a573f

verified ·

1 Parent(s): 44a3c0a

Update main.py

Browse files

Files changed (1) hide show

main.py +164 -7

main.py CHANGED Viewed

@@ -1,27 +1,150 @@
 import os
 from fastapi import FastAPI, UploadFile, File
 from google import genai
 from google.genai import types
-import uvicorn
 app = FastAPI()
 # Retrieve the GenAI API key from the environment variable.
-api_key = os.getenv("GENAI_API_KEY")
-if not api_key:
     raise EnvironmentError("GENAI_API_KEY environment variable not set")
 # Initialize the GenAI client.
-client = genai.Client(api_key=api_key)
 @app.get("/")
 async def root():
     return {
         "message": "Welcome to the Audio Similarity API!",
         "usage": {
-            "endpoint": "/compare-audio",
-            "description": "POST two audio files (user recitation and professional qarri) for similarity analysis.",
-            "instructions": "Send audio files as form-data with keys 'audio1' and 'audio2'."
         }
     }
@@ -30,6 +153,10 @@ async def compare_audio(
     audio1: UploadFile = File(...),
     audio2: UploadFile = File(...)
 ):
     # Read the uploaded audio files.
     audio1_bytes = await audio1.read()
     audio2_bytes = await audio2.read()
@@ -65,5 +192,35 @@ Provide your response with:
     # Return the model's response.
     return {"result": response.text}
 if __name__ == "__main__":
     uvicorn.run(app, host="0.0.0.0", port=8000)

 import os
+import tempfile
 from fastapi import FastAPI, UploadFile, File
+import uvicorn
+import torch
+import librosa
+from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
+from librosa.sequence import dtw
 from google import genai
 from google.genai import types
 app = FastAPI()
+# ---------------------------
+# Gemini-based Comparison API
+# ---------------------------
 # The Gemini API key is read from the environment; startup aborts if it is missing or empty.
 genai_api_key = os.environ.get("GENAI_API_KEY")
 if not genai_api_key:
     raise EnvironmentError("GENAI_API_KEY environment variable not set")
 # Build the module-level GenAI client from that key.
 client = genai.Client(api_key=genai_api_key)
+# ---------------------------
+# DTW-based Comparison Class
+# ---------------------------
class QuranRecitationComparer:
    """Compare two recitation recordings using Wav2Vec2 embeddings and DTW.

    The pretrained processor and model are loaded once in ``__init__``.
    Each audio file is turned into a sequence of frame-wise embeddings, the
    two sequences are aligned with dynamic time warping, and the
    path-normalized alignment cost is mapped to a coarse similarity score.
    """

    # (exclusive upper bound on the normalized distance, message, score)
    _SIMILARITY_BANDS = (
        (1, "The recitations are extremely similar.", 95),
        (5, "The recitations are very similar with minor differences.", 80),
        (10, "The recitations show moderate similarity.", 60),
        (20, "The recitations show some noticeable differences.", 40),
    )

    def __init__(
        self,
        model_name="jonatasgrosman/wav2vec2-large-xlsr-53-arabic",
        auth_token=None,
        max_cache_size=32,
    ):
        """Load the Wav2Vec2 processor and model and prepare the cache.

        Args:
            model_name: Identifier passed to ``from_pretrained``.
            auth_token: Optional token; forwarded as ``token=`` only when
                it is truthy.
            max_cache_size: Maximum number of embeddings kept in the cache.
                ``None`` means unbounded; ``0`` or less disables caching.
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # Only pass `token` when one was supplied, so the call made without a
        # token is exactly the plain `from_pretrained(model_name)`.
        hub_kwargs = {"token": auth_token} if auth_token else {}
        self.processor = Wav2Vec2Processor.from_pretrained(model_name, **hub_kwargs)
        self.model = Wav2Vec2ForCTC.from_pretrained(model_name, **hub_kwargs)
        self.model = self.model.to(self.device)
        self.model.eval()
        # Bounded cache of embeddings, keyed by (absolute path, mtime, size)
        # so a rewritten file is never served a stale embedding.
        self.max_cache_size = max_cache_size
        self.embedding_cache = {}

    def load_audio(self, file_path, target_sr=16000, trim_silence=True, normalize=True):
        """Load an audio file and optionally normalize and trim it.

        Args:
            file_path: Path of the audio file to load.
            target_sr: Sample rate requested from ``librosa.load``.
            trim_silence: When true, apply ``librosa.effects.trim`` with
                ``top_db=30``.
            normalize: When true, apply ``librosa.util.normalize`` (done
                before trimming).

        Returns:
            The processed waveform array.

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
            ValueError: If no samples remain after preprocessing.
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"Audio file not found: {file_path}")
        y, _ = librosa.load(file_path, sr=target_sr)
        if normalize:
            y = librosa.util.normalize(y)
        if trim_silence:
            y, _ = librosa.effects.trim(y, top_db=30)
        if y.size == 0:
            # Fail early with a clear message instead of an opaque error
            # from the model when it is fed an empty waveform.
            raise ValueError(f"Audio file contains no usable samples: {file_path}")
        return y

    def get_deep_embedding(self, audio, sr=16000):
        """Return the model's last hidden state for ``audio`` as a NumPy array.

        The leading dimension of the hidden state is squeezed out before the
        conversion (presumably leaving ``(frames, hidden_dim)`` -- confirm
        against the model's output shape).
        """
        input_values = self.processor(
            audio,
            sampling_rate=sr,
            return_tensors="pt"
        ).input_values.to(self.device)
        with torch.no_grad():
            outputs = self.model(input_values, output_hidden_states=True)
        hidden_states = outputs.hidden_states[-1]
        return hidden_states.squeeze(0).cpu().numpy()

    def compute_dtw_distance(self, features1, features2):
        """Return the DTW cost between two feature sequences, per path step.

        The features are handed to ``librosa.sequence.dtw`` as ``X`` and
        ``Y`` with a Euclidean metric. The final accumulated cost is divided
        by the warping-path length and returned as a plain ``float``.
        """
        D, wp = dtw(X=features1, Y=features2, metric='euclidean')
        return float(D[-1, -1]) / len(wp)

    def interpret_similarity(self, norm_distance):
        """Map a normalized DTW distance to ``(message, score)``.

        Scores never increase as the distance grows: the open-ended
        "quite different" band is capped at the score of the band below it
        (40) and falls linearly to 0.
        """
        distance = float(norm_distance)
        if distance == 0:
            return "The recitations are identical based on the deep embeddings.", 100
        for upper_bound, message, score in self._SIMILARITY_BANDS:
            if distance < upper_bound:
                return message, score
        # Previously this was `max(0, 100 - distance)`, which gave e.g. 80 for
        # a distance of 20 -- higher than the 40 awarded to closer matches.
        score = min(40, max(0, 100 - distance))
        return "The recitations are quite different.", score

    def get_embedding_for_file(self, file_path):
        """Return the embedding for ``file_path``, reusing the cache if valid.

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
        """
        try:
            stat = os.stat(file_path)
        except FileNotFoundError:
            raise FileNotFoundError(f"Audio file not found: {file_path}") from None
        abs_path = os.path.abspath(file_path)
        key = (abs_path, stat.st_mtime_ns, stat.st_size)

        if key in self.embedding_cache:
            # Re-insert so the entry counts as most recently used.
            embedding = self.embedding_cache.pop(key)
            self.embedding_cache[key] = embedding
            return embedding

        # Drop entries for an older version of the same file.
        stale_keys = [
            k for k in self.embedding_cache
            if isinstance(k, tuple) and k and k[0] == abs_path
        ]
        for stale_key in stale_keys:
            del self.embedding_cache[stale_key]

        audio = self.load_audio(file_path)
        embedding = self.get_deep_embedding(audio)
        self._store_embedding(key, embedding)
        return embedding

    def _store_embedding(self, key, embedding):
        """Insert into the cache, evicting least-recently-used entries."""
        limit = self.max_cache_size
        if limit is not None and limit <= 0:
            return
        self.embedding_cache[key] = embedding
        if limit is not None:
            # Dicts preserve insertion order, so the first key is the oldest.
            while len(self.embedding_cache) > limit:
                self.embedding_cache.pop(next(iter(self.embedding_cache)))

    def predict(self, file_path1, file_path2):
        """Predict the similarity between two audio files.

        Returns:
            A ``(similarity_score, interpretation)`` tuple.
        """
        embedding1 = self.get_embedding_for_file(file_path1)
        embedding2 = self.get_embedding_for_file(file_path2)
        # Embeddings are transposed before alignment; librosa's DTW treats
        # columns as the time axis (see the librosa.sequence.dtw docs).
        norm_distance = self.compute_dtw_distance(embedding1.T, embedding2.T)
        interpretation, similarity_score = self.interpret_similarity(norm_distance)
        return similarity_score, interpretation

    def clear_cache(self):
        """Clear the embedding cache to free memory."""
        self.embedding_cache = {}
# Optional Hugging Face token, taken from the environment (None when unset).
hf_auth_token = os.environ.get("HF_AUTH_TOKEN")
# Single module-level comparer, constructed once when the module is imported.
comparer = QuranRecitationComparer(auth_token=hf_auth_token)
+# ---------------------------
+# API Endpoints
+# ---------------------------
 @app.get("/")
 async def root():
     """Return a welcome message and the list of comparison endpoints."""
     gemini_endpoint = {
         "path": "/compare-audio",
         "description": "POST two audio files (user recitation and professional qarri) for similarity analysis using Gemini.",
     }
     dtw_endpoint = {
         "path": "/compare-dtw",
         "description": "POST two audio files (user recitation and professional qarri) for similarity analysis using deep embeddings and DTW.",
     }
     usage = {"endpoints": {"gemini": gemini_endpoint, "dtw": dtw_endpoint}}
     return {"message": "Welcome to the Audio Similarity API!", "usage": usage}
     audio1: UploadFile = File(...),
     audio2: UploadFile = File(...)
 ):
+    """
+    Compare two audio files using the Gemini approach.
+    The first audio is the user's recitation and the second is the professional qarri recitation.
+    """
     # Read the uploaded audio files.
     audio1_bytes = await audio1.read()
     audio2_bytes = await audio2.read()
     # Return the model's response.
     return {"result": response.text}
@app.post("/compare-dtw")
async def compare_dtw(
    audio1: UploadFile = File(...),
    audio2: UploadFile = File(...)
):
    """
    Compare two audio files using deep embeddings and DTW.
    The first audio is the user's recitation and the second is the professional qarri recitation.

    Both uploads are written to temporary ``.wav`` files so the comparer can
    read them by path. The files are always removed afterwards, even when
    saving or comparing fails.

    Returns:
        A dict with ``similarity_score`` and ``interpretation``.
    """
    temp_paths = []
    try:
        for upload in (audio1, audio2):
            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
                # Record the path before writing so a failed read/write of
                # either upload still gets its file cleaned up below.
                temp_paths.append(tmp.name)
                tmp.write(await upload.read())
        # Get similarity score and interpretation using DTW-based approach.
        similarity_score, interpretation = comparer.predict(*temp_paths)
    finally:
        # Each request uses fresh temp paths, so cached embeddings can never
        # be reused; drop them instead of letting the cache grow per request.
        comparer.clear_cache()
        for path in temp_paths:
            try:
                os.remove(path)
            except OSError:
                # Best-effort cleanup: one failed removal must not prevent
                # removing the other file or mask the original error.
                pass
    return {
        "similarity_score": similarity_score,
        "interpretation": interpretation
    }
 # When this file is executed as a script, serve `app` with uvicorn on all
 # interfaces (0.0.0.0), port 8000.
 if __name__ == "__main__":
     uvicorn.run(app, host="0.0.0.0", port=8000)