Spaces:

Hammad712
/

recitation-compare

Sleeping

App Files Community

Hammad712 committed on Mar 17

Commit

521243d

verified ·

1 Parent(s): 3dceebc

Update main.py

Browse files

Files changed (1) hide show

main.py +109 -266

main.py CHANGED Viewed

@@ -1,187 +1,68 @@
-from fastapi import FastAPI, HTTPException, UploadFile, File
-from pydantic import BaseModel
 import torch
 import librosa
 import numpy as np
-import os
-from transformers import AutoProcessor, AutoModelForCTC
 import tempfile
-import shutil
-import uvicorn
-from fastapi.middleware.cors import CORSMiddleware
-import warnings
-# Ignore deprecation warnings
-warnings.filterwarnings("ignore")
-# Load environment variables
-HF_TOKEN = os.getenv("HF_TOKEN")
-app = FastAPI(title="Quran Recitation Comparer API")
-# Add CORS middleware
-app.add_middleware(
-    CORSMiddleware,
-    allow_origins=["*"],
-    allow_credentials=True,
-    allow_methods=["*"],
-    allow_headers=["*"],
-)
-class ComparisonResult(BaseModel):
-    similarity_score: float
-    interpretation: str
-# Custom implementation of DTW
-def custom_dtw(X, Y, metric='euclidean'):
-    """
-    Custom Dynamic Time Warping implementation.
-    Args:
-        X: First sequence
-        Y: Second sequence
-        metric: Distance metric ('euclidean' or 'cosine')
-    Returns:
-        D: Cost matrix
-        wp: Warping path
-    """
-    n, m = len(X), len(Y)
-    D = np.zeros((n + 1, m + 1))
-    D[0, 1:] = np.inf
-    D[1:, 0] = np.inf
-    D[0, 0] = 0
-    for i in range(1, n + 1):
-        for j in range(1, m + 1):
-            if metric == 'euclidean':
-                cost = np.sum((X[i-1] - Y[j-1])**2)
-            elif metric == 'cosine':
-                cost = 1 - np.dot(X[i-1], Y[j-1]) / (np.linalg.norm(X[i-1]) * np.linalg.norm(Y[j-1]))
-            D[i, j] = cost + min(D[i-1, j], D[i, j-1], D[i-1, j-1])
-    wp = [(n, m)]
-    i, j = n, m
-    while i > 0 or j > 0:
-        if i == 0:
-            j -= 1
-        elif j == 0:
-            i -= 1
-        else:
-            min_idx = np.argmin([D[i-1, j-1], D[i-1, j], D[i, j-1]])
-            if min_idx == 0:
-                i -= 1
-                j -= 1
-            elif min_idx == 1:
-                i -= 1
-            else:
-                j -= 1
-        wp.append((i, j))
-    wp.reverse()
-    return D, wp
 class QuranRecitationComparer:
-    def __init__(self, model_name="jonatasgrosman/wav2vec2-large-xlsr-53-arabic", token=None):
-        """Initialize the Quran recitation comparer with a specific Wav2Vec2 model."""
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        print(f"Using device: {self.device}")
-        try:
-            if token:
-                print(f"Loading model {model_name} with token...")
-                # Use 'use_auth_token' instead of the deprecated 'token' parameter
-                self.processor = AutoProcessor.from_pretrained(model_name, use_auth_token=token)
-                self.model = AutoModelForCTC.from_pretrained(model_name, use_auth_token=token)
-            else:
-                print(f"Loading model {model_name} without token...")
-                self.processor = AutoProcessor.from_pretrained(model_name)
-                self.model = AutoModelForCTC.from_pretrained(model_name)
-            self.model = self.model.to(self.device)
-            self.model.eval()
-            # Ensure that hidden states are returned by default
-            self.model.config.output_hidden_states = True
-            print("Model loaded successfully!")
-        except Exception as e:
-            print(f"Error loading model: {str(e)}")
-            raise
         # Cache for embeddings to avoid recomputation
         self.embedding_cache = {}
-    def load_audio(self, file_path, target_sr=16000, normalize=True):
         """Load and preprocess an audio file."""
         if not os.path.exists(file_path):
             raise FileNotFoundError(f"Audio file not found: {file_path}")
-        print(f"Loading audio: {file_path}")
         y, sr = librosa.load(file_path, sr=target_sr)
         if normalize:
             y = librosa.util.normalize(y)
-        # Trim silence using a simplified approach
-        trim_y = []
-        threshold = 0.02  # Threshold for silence detection
-        for i in range(len(y)):
-            if abs(y[i]) > threshold:
-                trim_y.append(y[i])
-        if len(trim_y) > 0:
-            y = np.array(trim_y)
         return y
     def get_deep_embedding(self, audio, sr=16000):
         """Extract frame-wise deep embeddings using the pretrained model."""
-        try:
-            inputs = self.processor(
-                audio,
-                sampling_rate=sr,
-                return_tensors="pt"
-            ).input_values.to(self.device)
-            with torch.no_grad():
-                # Call the model without explicitly passing output_hidden_states
-                outputs = self.model(inputs)
-            hidden_states = outputs.hidden_states[-1]
-            embedding_seq = hidden_states.squeeze(0).cpu().numpy()
-            return embedding_seq
-        except Exception as e:
-            print(f"Error in get_deep_embedding: {str(e)}")
-            raise
     def compute_dtw_distance(self, features1, features2):
         """Compute the DTW distance between two sequences of features."""
-        if features1.ndim == 1:
-            features1 = features1.reshape(-1, 1)
-        if features2.ndim == 1:
-            features2 = features2.reshape(-1, 1)
-        print(f"Feature shapes: {features1.shape}, {features2.shape}")
-        max_length = 300
-        if features1.shape[0] > max_length or features2.shape[0] > max_length:
-            step1 = max(1, features1.shape[0] // max_length)
-            step2 = max(1, features2.shape[0] // max_length)
-            features1 = features1[::step1]
-            features2 = features2[::step2]
-            print(f"Subsampled feature shapes: {features1.shape}, {features2.shape}")
-        try:
-            D, wp = custom_dtw(X=features1, Y=features2, metric='euclidean')
-            distance = D[-1, -1]
-            normalized_distance = distance / len(wp)
-            return normalized_distance
-        except Exception as e:
-            print(f"Error in compute_dtw_distance: {str(e)}")
-            mean_1 = np.mean(features1, axis=0)
-            mean_2 = np.mean(features2, axis=0)
-            euclidean_distance = np.sqrt(np.sum((mean_1 - mean_2) ** 2))
-            return euclidean_distance
     def interpret_similarity(self, norm_distance):
         """Interpret the normalized distance value."""
@@ -203,142 +84,104 @@ class QuranRecitationComparer:
         else:
             result = "The recitations are quite different."
             score = max(0, 100 - norm_distance)
         return result, score
     def get_embedding_for_file(self, file_path):
         """Get embedding for a file, using cache if available."""
         if file_path in self.embedding_cache:
-            print(f"Using cached embedding for {file_path}")
             return self.embedding_cache[file_path]
-        print(f"Computing new embedding for {file_path}")
-        try:
-            audio = self.load_audio(file_path)
-            embedding = self.get_deep_embedding(audio)
-            self.embedding_cache[file_path] = embedding
-            print(f"Embedding shape: {embedding.shape}")
-            return embedding
-        except Exception as e:
-            print(f"Error getting embedding: {str(e)}")
-            raise
     def predict(self, file_path1, file_path2):
         """
         Predict the similarity between two audio files.
         Args:
-            file_path1 (str): Path to first audio file
-            file_path2 (str): Path to second audio file
         Returns:
-            float: Similarity score
-            str: Interpretation of similarity
         """
-        print(f"Comparing {file_path1} and {file_path2}")
-        try:
-            embedding1 = self.get_embedding_for_file(file_path1)
-            embedding2 = self.get_embedding_for_file(file_path2)
-            print("Computing DTW distance...")
-            norm_distance = self.compute_dtw_distance(embedding1.T, embedding2.T)
-            print(f"Normalized distance: {norm_distance}")
-            interpretation, similarity_score = self.interpret_similarity(norm_distance)
-            print(f"Similarity score: {similarity_score}, Interpretation: {interpretation}")
-            return similarity_score, interpretation
-        except Exception as e:
-            print(f"Error in predict: {str(e)}")
-            return 0, f"Error comparing files: {str(e)}"
     def clear_cache(self):
         """Clear the embedding cache to free memory."""
         self.embedding_cache = {}
-        print("Embedding cache cleared")
-# Global variable for the comparer instance
-comparer = None
 @app.on_event("startup")
-async def startup_event():
-    """Initialize the model when the application starts."""
     global comparer
-    print("Initializing model... This may take a moment.")
-    try:
-        comparer = QuranRecitationComparer(
-            model_name="jonatasgrosman/wav2vec2-large-xlsr-53-arabic",
-            token=HF_TOKEN
-        )
-        print("Model initialized and ready for predictions!")
-    except Exception as e:
-        print(f"Error initializing model: {str(e)}")
-@app.get("/")
 async def root():
-    """Root endpoint to check if the API is running."""
-    status = "active" if comparer else "model not loaded"
-    return {"message": "Quran Recitation Comparer API is running", "status": status}
-@app.post("/compare", response_model=ComparisonResult)
-async def compare_files(
-    file1: UploadFile = File(...),
-    file2: UploadFile = File(...)
-):
     """
-    Compare two audio files and return similarity metrics.
-    - **file1**: First audio file (MP3, WAV, etc.)
-    - **file2**: Second audio file (MP3, WAV, etc.)
-    Returns similarity score and interpretation.
     """
-    if not comparer:
-        raise HTTPException(status_code=500, detail="Model not initialized. Please try again later.")
-    print(f"Received files: {file1.filename} and {file2.filename}")
-    temp_dir = tempfile.mkdtemp()
-    print(f"Created temporary directory: {temp_dir}")
     try:
-        temp_file1 = os.path.join(temp_dir, file1.filename)
-        temp_file2 = os.path.join(temp_dir, file2.filename)
-        with open(temp_file1, "wb") as f:
-            content = await file1.read()
-            f.write(content)
-        with open(temp_file2, "wb") as f:
-            content = await file2.read()
-            f.write(content)
-        print(f"Files saved to: {temp_file1} and {temp_file2}")
-        similarity_score, interpretation = comparer.predict(temp_file1, temp_file2)
-        return ComparisonResult(
-            similarity_score=similarity_score,
-            interpretation=interpretation
-        )
     except Exception as e:
-        print(f"Error processing files: {str(e)}")
-        raise HTTPException(status_code=500, detail=f"Error processing files: {str(e)}")
     finally:
-        print(f"Cleaning up temporary directory: {temp_dir}")
-        shutil.rmtree(temp_dir, ignore_errors=True)
-@app.post("/clear-cache")
 async def clear_cache():
-    """Clear the embedding cache to free memory."""
-    if not comparer:
-        raise HTTPException(status_code=500, detail="Model not initialized.")
     comparer.clear_cache()
-    return {"message": "Embedding cache cleared successfully"}
-if __name__ == "__main__":
-    uvicorn.run("main:app", host="0.0.0.0", port=7860, log_level="info")

+import os
 import torch
 import librosa
 import numpy as np
 import tempfile
+from fastapi import FastAPI, UploadFile, File, HTTPException
+from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
+from librosa.sequence import dtw
+app = FastAPI(title="Quran Recitation Comparer API", description="Compares two Quran recitations using a deep wav2vec2 model.", version="1.0")
+# --- Core Class Definition ---
 class QuranRecitationComparer:
+    def __init__(self, model_name="jonatasgrosman/wav2vec2-large-xlsr-53-arabic", auth_token=None):
+        """
+        Initialize the Quran recitation comparer with a specific Wav2Vec2 model.
+        """
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        # Load model and processor once during initialization
+        if auth_token:
+            self.processor = Wav2Vec2Processor.from_pretrained(model_name, token=auth_token)
+            self.model = Wav2Vec2ForCTC.from_pretrained(model_name, token=auth_token)
+        else:
+            self.processor = Wav2Vec2Processor.from_pretrained(model_name)
+            self.model = Wav2Vec2ForCTC.from_pretrained(model_name)
+        self.model = self.model.to(self.device)
+        self.model.eval()
         # Cache for embeddings to avoid recomputation
         self.embedding_cache = {}
+    def load_audio(self, file_path, target_sr=16000, trim_silence=True, normalize=True):
         """Load and preprocess an audio file."""
         if not os.path.exists(file_path):
             raise FileNotFoundError(f"Audio file not found: {file_path}")
         y, sr = librosa.load(file_path, sr=target_sr)
         if normalize:
             y = librosa.util.normalize(y)
+        if trim_silence:
+            y, _ = librosa.effects.trim(y, top_db=30)
         return y
     def get_deep_embedding(self, audio, sr=16000):
         """Extract frame-wise deep embeddings using the pretrained model."""
+        input_values = self.processor(
+            audio,
+            sampling_rate=sr,
+            return_tensors="pt"
+        ).input_values.to(self.device)
+        with torch.no_grad():
+            outputs = self.model(input_values, output_hidden_states=True)
+        hidden_states = outputs.hidden_states[-1]
+        embedding_seq = hidden_states.squeeze(0).cpu().numpy()
+        return embedding_seq
     def compute_dtw_distance(self, features1, features2):
         """Compute the DTW distance between two sequences of features."""
+        D, wp = dtw(X=features1, Y=features2, metric='euclidean')
+        distance = D[-1, -1]
+        normalized_distance = distance / len(wp)
+        return normalized_distance
     def interpret_similarity(self, norm_distance):
         """Interpret the normalized distance value."""
         else:
             result = "The recitations are quite different."
             score = max(0, 100 - norm_distance)
         return result, score
     def get_embedding_for_file(self, file_path):
         """Get embedding for a file, using cache if available."""
         if file_path in self.embedding_cache:
             return self.embedding_cache[file_path]
+        audio = self.load_audio(file_path)
+        embedding = self.get_deep_embedding(audio)
+        # Store in cache for future use
+        self.embedding_cache[file_path] = embedding
+        return embedding
     def predict(self, file_path1, file_path2):
         """
         Predict the similarity between two audio files.
         Args:
+            file_path1 (str): Path to first audio file.
+            file_path2 (str): Path to second audio file.
         Returns:
+            (float, str): Similarity score and interpretation.
         """
+        embedding1 = self.get_embedding_for_file(file_path1)
+        embedding2 = self.get_embedding_for_file(file_path2)
+        norm_distance = self.compute_dtw_distance(embedding1.T, embedding2.T)
+        interpretation, similarity_score = self.interpret_similarity(norm_distance)
+        # Optionally log the results instead of printing in production
+        print(f"Similarity Score: {similarity_score:.1f}/100")
+        print(f"Interpretation: {interpretation}")
+        return similarity_score, interpretation
     def clear_cache(self):
         """Clear the embedding cache to free memory."""
         self.embedding_cache = {}
+# --- FastAPI Startup Event ---
+# In production, consider loading sensitive tokens from environment variables or configuration files.
 @app.on_event("startup")
+def startup_event():
     global comparer
+    # For production, do not hardcode tokens; use os.environ.get(...) or a configuration system.
+    auth_token = os.environ.get("HF_TOKEN")
+    comparer = QuranRecitationComparer(
+        model_name="jonatasgrosman/wav2vec2-large-xlsr-53-arabic",
+        auth_token=auth_token
+    )
+    print("Model initialized and ready for predictions!")
+# --- API Endpoints ---
+@app.get("/", summary="Health Check")
 async def root():
+    return {"message": "Quran Recitation Comparer API is up and running."}
+@app.post("/predict", summary="Compare Two Audio Files", response_model=dict)
+async def predict(file1: UploadFile = File(...), file2: UploadFile = File(...)):
     """
+    Compare two uploaded audio files and return a similarity score along with an interpretation.
+    - **file1**: The first audio file.
+    - **file2**: The second audio file.
     """
+    tmp1_path = None
+    tmp2_path = None
     try:
+        # Save first file to a temporary location
+        suffix1 = os.path.splitext(file1.filename)[1] or ".wav"
+        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix1) as tmp1:
+            content1 = await file1.read()
+            tmp1.write(content1)
+            tmp1_path = tmp1.name
+        # Save second file to a temporary location
+        suffix2 = os.path.splitext(file2.filename)[1] or ".wav"
+        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix2) as tmp2:
+            content2 = await file2.read()
+            tmp2.write(content2)
+            tmp2_path = tmp2.name
+        similarity_score, interpretation = comparer.predict(tmp1_path, tmp2_path)
+        return {"similarity_score": similarity_score, "interpretation": interpretation}
     except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
     finally:
+        # Clean up temporary files
+        if tmp1_path and os.path.exists(tmp1_path):
+            os.remove(tmp1_path)
+        if tmp2_path and os.path.exists(tmp2_path):
+            os.remove(tmp2_path)
+@app.post("/clear_cache", summary="Clear Embedding Cache", response_model=dict)
 async def clear_cache():
+    """
+    Clear the embedding cache. This can help free memory if many comparisons have been made.
+    """
     comparer.clear_cache()
+    return {"message": "Cache cleared."}