Upload 8 files
- .gitignore +64 -0
- app.py +88 -15
- requirements-simple.txt +7 -0
- requirements.txt +2 -1
- test_fix.py +24 -0
.gitignore
ADDED
@@ -0,0 +1,64 @@
+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+*.manifest
+*.spec
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# IDE
+.vscode/
+.idea/
+*.swp
+*.swo
+
+# OS
+.DS_Store
+Thumbs.db
+
+# Model cache
+transformers_cache/
+huggingface_hub/
app.py
CHANGED
@@ -3,13 +3,15 @@ import tempfile
 from pathlib import Path
 from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel
-from transformers import CLIPProcessor, CLIPModel
+from transformers import CLIPProcessor, CLIPModel, ClapModel, ClapProcessor
 import torch
 from PIL import Image
 import requests
 import numpy as np
 import io
 import logging
+import librosa
+import soundfile as sf
 
 # Set up cache directories
 cache_dir = os.environ.get('TRANSFORMERS_CACHE', '/code/cache')
@@ -26,29 +28,42 @@ app = FastAPI(title="CLIP Service", version="1.0.0")
 
 class CLIPService:
     def __init__(self):
-        logger.info("Loading CLIP model...")
+        logger.info("Loading CLIP and CLAP models...")
         try:
             # Use CPU for Hugging Face free tier
             self.device = "cuda" if torch.cuda.is_available() else "cpu"
             logger.info(f"Using device: {self.device}")
 
-            # Load model with explicit cache directory
-            self.model = CLIPModel.from_pretrained(
+            # Load CLIP model with explicit cache directory
+            self.clip_model = CLIPModel.from_pretrained(
                 "openai/clip-vit-large-patch14",
                 cache_dir=cache_dir,
                 local_files_only=False
             ).to(self.device)
 
-            self.processor = CLIPProcessor.from_pretrained(
+            self.clip_processor = CLIPProcessor.from_pretrained(
                 "openai/clip-vit-large-patch14",
                 cache_dir=cache_dir,
                 local_files_only=False
             )
 
-            logger.info(f"CLIP model loaded successfully on {self.device}")
+            # Load CLAP model for audio processing
+            self.clap_model = ClapModel.from_pretrained(
+                "laion/clap-htsat-unfused",
+                cache_dir=cache_dir,
+                local_files_only=False
+            ).to(self.device)
+
+            self.clap_processor = ClapProcessor.from_pretrained(
+                "laion/clap-htsat-unfused",
+                cache_dir=cache_dir,
+                local_files_only=False
+            )
+
+            logger.info(f"CLIP and CLAP models loaded successfully on {self.device}")
 
         except Exception as e:
-            logger.error(f"Failed to load model: {str(e)}")
+            logger.error(f"Failed to load models: {str(e)}")
             raise RuntimeError(f"Model loading failed: {str(e)}")
 
     def is_supported_format(self, image_url: str) -> bool:
@@ -123,7 +138,7 @@ class CLIPService:
             # Try multiple processor configurations
             try:
                 # Method 1: Standard CLIP processing
-                inputs = self.processor(
+                inputs = self.clip_processor(
                     images=image,
                     return_tensors="pt",
                     do_rescale=True,
@@ -133,7 +148,7 @@ class CLIPService:
                 logger.warning(f"Method 1 failed: {e1}, trying method 2...")
                 try:
                     # Method 2: With padding
-                    inputs = self.processor(
+                    inputs = self.clip_processor(
                         images=image,
                         return_tensors="pt",
                         padding=True,
@@ -143,7 +158,7 @@ class CLIPService:
                 except Exception as e2:
                     logger.warning(f"Method 2 failed: {e2}, trying method 3...")
                     # Method 3: Manual preprocessing
-                    inputs = self.processor(
+                    inputs = self.clip_processor(
                         images=[image],
                         return_tensors="pt"
                     )
@@ -151,7 +166,7 @@ class CLIPService:
             inputs = {k: v.to(self.device) for k, v in inputs.items()}
 
             with torch.no_grad():
-                image_features = self.model.get_image_features(**inputs)
+                image_features = self.clip_model.get_image_features(**inputs)
                 image_features = image_features / image_features.norm(dim=-1, keepdim=True)
 
             return image_features.cpu().numpy().flatten().tolist()
@@ -163,16 +178,63 @@ class CLIPService:
     def encode_text(self, text: str) -> list:
         try:
             logger.info(f"Processing text: {text[:50]}...")
-            inputs = self.processor(text=[text], return_tensors="pt", padding=True).to(self.device)
+            inputs = self.clip_processor(text=[text], return_tensors="pt", padding=True).to(self.device)
 
             with torch.no_grad():
-                text_features = self.model.get_text_features(**inputs)
+                text_features = self.clip_model.get_text_features(**inputs)
                 text_features = text_features / text_features.norm(dim=-1, keepdim=True)
 
             return text_features.cpu().numpy().flatten().tolist()
         except Exception as e:
             logger.error(f"Error encoding text '{text[:50]}...': {str(e)}")
             raise HTTPException(status_code=500, detail=f"Failed to encode text: {str(e)}")
+
+    def encode_audio(self, audio_url: str) -> list:
+        try:
+            logger.info(f"Processing audio: {audio_url}")
+
+            # Download audio file
+            response = requests.get(audio_url, timeout=60, headers={'User-Agent': 'CLAP-Service/1.0'})
+            response.raise_for_status()
+
+            # Save to temporary file
+            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp_file:
+                tmp_file.write(response.content)
+                tmp_path = tmp_file.name
+
+            try:
+                # Load audio with librosa
+                # CLAP expects 48kHz sampling rate
+                audio_array, sample_rate = librosa.load(tmp_path, sr=48000, mono=True)
+
+                # Ensure audio is not too long (max 30 seconds for CLAP)
+                max_length = 30 * 48000  # 30 seconds at 48kHz
+                if len(audio_array) > max_length:
+                    audio_array = audio_array[:max_length]
+
+                # Process with CLAP
+                inputs = self.clap_processor(
+                    audios=audio_array,
+                    sampling_rate=48000,
+                    return_tensors="pt"
+                )
+
+                inputs = {k: v.to(self.device) for k, v in inputs.items()}
+
+                with torch.no_grad():
+                    audio_features = self.clap_model.get_audio_features(**inputs)
+                    audio_features = audio_features / audio_features.norm(dim=-1, keepdim=True)
+
+                return audio_features.cpu().numpy().flatten().tolist()
+
+            finally:
+                # Clean up temp file
+                if os.path.exists(tmp_path):
+                    os.unlink(tmp_path)
+
+        except Exception as e:
+            logger.error(f"Error encoding audio {audio_url}: {str(e)}")
+            raise HTTPException(status_code=500, detail=f"Failed to encode audio: {str(e)}")
 
 # Initialize service with error handling
 logger.info("Initializing CLIP service...")
@@ -190,13 +252,16 @@ class ImageRequest(BaseModel):
 class TextRequest(BaseModel):
     text: str
 
+class AudioRequest(BaseModel):
+    audio_url: str
+
 @app.get("/")
 async def root():
     return {
         "message": "CLIP Service API",
         "version": "1.0.0",
         "model": "clip-vit-large-patch14",
-        "endpoints": ["/encode/image", "/encode/text", "/health"],
+        "endpoints": ["/encode/image", "/encode/text", "/encode/audio", "/health"],
         "status": "ready" if clip_service else "error"
     }
 
@@ -216,6 +281,14 @@ async def encode_text(request: TextRequest):
     embedding = clip_service.encode_text(request.text)
     return {"embedding": embedding, "dimensions": len(embedding)}
 
+@app.post("/encode/audio")
+async def encode_audio(request: AudioRequest):
+    if not clip_service:
+        raise HTTPException(status_code=503, detail="CLAP service not available")
+
+    embedding = clip_service.encode_audio(request.audio_url)
+    return {"embedding": embedding, "dimensions": len(embedding)}
+
 @app.get("/health")
 async def health_check():
     if not clip_service:
@@ -227,7 +300,7 @@ async def health_check():
 
     return {
         "status": "healthy",
-        "model": "clip-vit-large-patch14",
+        "models": ["clip-vit-large-patch14", "clap-htsat-unfused"],
        "device": clip_service.device,
        "service": "ready",
        "cache_dir": cache_dir
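
For context, a minimal client sketch against the endpoints defined above (a hedged example, not part of the commit): the Space base URL and the audio file URL below are placeholders, while the request bodies and the embedding/dimensions response shape come from the handlers in the diff.

import requests

BASE_URL = "https://your-space.hf.space"  # placeholder; substitute the actual deployment URL

# Text embedding via the existing /encode/text endpoint
text_resp = requests.post(f"{BASE_URL}/encode/text",
                          json={"text": "a dog playing in the park"}, timeout=60)
text_resp.raise_for_status()
print(text_resp.json()["dimensions"])  # 768 for CLIP ViT-L/14 text features

# Audio embedding via the new /encode/audio endpoint (CLAP)
audio_resp = requests.post(f"{BASE_URL}/encode/audio",
                           json={"audio_url": "https://example.com/sample.wav"},  # placeholder audio URL
                           timeout=120)
audio_resp.raise_for_status()
audio_embedding = audio_resp.json()["embedding"]  # dimensionality is reported in the response
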
requirements-simple.txt
ADDED
@@ -0,0 +1,7 @@
+torch>=2.0.0
+transformers>=4.30.0
+Pillow>=9.0.0
+requests>=2.28.0
+fastapi>=0.104.0
+uvicorn[standard]>=0.22.0
+python-multipart>=0.0.6
requirements.txt
CHANGED
@@ -8,4 +8,5 @@ python-multipart==0.0.6
 pydantic==2.5.0
 numpy<2.0.0
 librosa>=0.10.0
-soundfile>=0.12.1
+soundfile>=0.12.1
+datasets>=2.14.0
test_fix.py
ADDED
@@ -0,0 +1,24 @@
+from transformers import CLIPProcessor, CLIPModel
+from PIL import Image
+import requests
+import io
+
+# Test the fix
+model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
+processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
+
+# Download test image
+url = "https://xymtmeogzckraglhiuwt.supabase.co/storage/v1/object/public/pins/c1cfd4c9-77a3-4365-b38f-dda173e2a0c5/1750055972401.JPG"
+response = requests.get(url)
+image = Image.open(io.BytesIO(response.content))
+
+if image.mode != 'RGB':
+    image = image.convert('RGB')
+
+# Test the fix: images=[image] instead of images=image
+try:
+    inputs = processor(images=[image], return_tensors="pt")
+    print("✅ SUCCESS: Fix works!")
+    print(f"Input shape: {inputs['pixel_values'].shape}")
+except Exception as e:
+    print(f"❌ FAILED: {e}")