Upload 8 files
- .gitignore +64 -0
- app.py +88 -15
- requirements-simple.txt +7 -0
- requirements.txt +2 -1
- test_fix.py +24 -0
.gitignore
ADDED
@@ -0,0 +1,64 @@
+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+*.manifest
+*.spec
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# IDE
+.vscode/
+.idea/
+*.swp
+*.swo
+
+# OS
+.DS_Store
+Thumbs.db
+
+# Model cache
+transformers_cache/
+huggingface_hub/
app.py
CHANGED
@@ -3,13 +3,15 @@ import tempfile
 from pathlib import Path
 from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel
-from transformers import CLIPProcessor, CLIPModel
+from transformers import CLIPProcessor, CLIPModel, ClapModel, ClapProcessor
 import torch
 from PIL import Image
 import requests
 import numpy as np
 import io
 import logging
+import librosa
+import soundfile as sf
 
 # Set up cache directories
 cache_dir = os.environ.get('TRANSFORMERS_CACHE', '/code/cache')
@@ -26,29 +28,42 @@ app = FastAPI(title="CLIP Service", version="1.0.0")
 
 class CLIPService:
     def __init__(self):
-        logger.info("Loading CLIP model...")
+        logger.info("Loading CLIP and CLAP models...")
         try:
             # Use CPU for Hugging Face free tier
             self.device = "cuda" if torch.cuda.is_available() else "cpu"
             logger.info(f"Using device: {self.device}")
 
-            # Load model with explicit cache directory
-            self.model = CLIPModel.from_pretrained(
+            # Load CLIP model with explicit cache directory
+            self.clip_model = CLIPModel.from_pretrained(
                 "openai/clip-vit-large-patch14",
                 cache_dir=cache_dir,
                 local_files_only=False
             ).to(self.device)
 
-            self.processor = CLIPProcessor.from_pretrained(
+            self.clip_processor = CLIPProcessor.from_pretrained(
                 "openai/clip-vit-large-patch14",
                 cache_dir=cache_dir,
                 local_files_only=False
             )
 
-            logger.info(f"CLIP model loaded successfully on {self.device}")
+            # Load CLAP model for audio processing
+            self.clap_model = ClapModel.from_pretrained(
+                "laion/clap-htsat-unfused",
+                cache_dir=cache_dir,
+                local_files_only=False
+            ).to(self.device)
+
+            self.clap_processor = ClapProcessor.from_pretrained(
+                "laion/clap-htsat-unfused",
+                cache_dir=cache_dir,
+                local_files_only=False
+            )
+
+            logger.info(f"CLIP and CLAP models loaded successfully on {self.device}")
 
         except Exception as e:
-            logger.error(f"Failed to load model: {str(e)}")
+            logger.error(f"Failed to load models: {str(e)}")
             raise RuntimeError(f"Model loading failed: {str(e)}")
 
     def is_supported_format(self, image_url: str) -> bool:
@@ -123,7 +138,7 @@ class CLIPService:
             # Try multiple processor configurations
             try:
                 # Method 1: Standard CLIP processing
-                inputs = self.processor(
+                inputs = self.clip_processor(
                     images=image,
                     return_tensors="pt",
                     do_rescale=True,
@@ -133,7 +148,7 @@ class CLIPService:
                 logger.warning(f"Method 1 failed: {e1}, trying method 2...")
                 try:
                     # Method 2: With padding
-                    inputs = self.processor(
+                    inputs = self.clip_processor(
                         images=image,
                         return_tensors="pt",
                         padding=True,
@@ -143,7 +158,7 @@ class CLIPService:
                 except Exception as e2:
                     logger.warning(f"Method 2 failed: {e2}, trying method 3...")
                     # Method 3: Manual preprocessing
-                    inputs = self.processor(
+                    inputs = self.clip_processor(
                         images=[image],
                         return_tensors="pt"
                     )
@@ -151,7 +166,7 @@ class CLIPService:
             inputs = {k: v.to(self.device) for k, v in inputs.items()}
 
             with torch.no_grad():
-                image_features = self.model.get_image_features(**inputs)
+                image_features = self.clip_model.get_image_features(**inputs)
                 image_features = image_features / image_features.norm(dim=-1, keepdim=True)
 
             return image_features.cpu().numpy().flatten().tolist()
@@ -163,16 +178,63 @@ class CLIPService:
     def encode_text(self, text: str) -> list:
         try:
             logger.info(f"Processing text: {text[:50]}...")
-            inputs = self.processor(text=[text], return_tensors="pt", padding=True).to(self.device)
+            inputs = self.clip_processor(text=[text], return_tensors="pt", padding=True).to(self.device)
 
             with torch.no_grad():
-                text_features = self.model.get_text_features(**inputs)
+                text_features = self.clip_model.get_text_features(**inputs)
                 text_features = text_features / text_features.norm(dim=-1, keepdim=True)
 
             return text_features.cpu().numpy().flatten().tolist()
         except Exception as e:
             logger.error(f"Error encoding text '{text[:50]}...': {str(e)}")
             raise HTTPException(status_code=500, detail=f"Failed to encode text: {str(e)}")
+
+    def encode_audio(self, audio_url: str) -> list:
+        try:
+            logger.info(f"Processing audio: {audio_url}")
+
+            # Download audio file
+            response = requests.get(audio_url, timeout=60, headers={'User-Agent': 'CLAP-Service/1.0'})
+            response.raise_for_status()
+
+            # Save to temporary file
+            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp_file:
+                tmp_file.write(response.content)
+                tmp_path = tmp_file.name
+
+            try:
+                # Load audio with librosa
+                # CLAP expects 48kHz sampling rate
+                audio_array, sample_rate = librosa.load(tmp_path, sr=48000, mono=True)
+
+                # Ensure audio is not too long (max 30 seconds for CLAP)
+                max_length = 30 * 48000  # 30 seconds at 48kHz
+                if len(audio_array) > max_length:
+                    audio_array = audio_array[:max_length]
+
+                # Process with CLAP
+                inputs = self.clap_processor(
+                    audios=audio_array,
+                    sampling_rate=48000,
+                    return_tensors="pt"
+                )
+
+                inputs = {k: v.to(self.device) for k, v in inputs.items()}
+
+                with torch.no_grad():
+                    audio_features = self.clap_model.get_audio_features(**inputs)
+                    audio_features = audio_features / audio_features.norm(dim=-1, keepdim=True)
+
+                return audio_features.cpu().numpy().flatten().tolist()
+
+            finally:
+                # Clean up temp file
+                if os.path.exists(tmp_path):
+                    os.unlink(tmp_path)
+
+        except Exception as e:
+            logger.error(f"Error encoding audio {audio_url}: {str(e)}")
+            raise HTTPException(status_code=500, detail=f"Failed to encode audio: {str(e)}")
 
 # Initialize service with error handling
 logger.info("Initializing CLIP service...")
@@ -190,13 +252,16 @@ class ImageRequest(BaseModel):
 class TextRequest(BaseModel):
     text: str
 
+class AudioRequest(BaseModel):
+    audio_url: str
+
 @app.get("/")
 async def root():
     return {
         "message": "CLIP Service API",
         "version": "1.0.0",
         "model": "clip-vit-large-patch14",
-        "endpoints": ["/encode/image", "/encode/text", "/health"],
+        "endpoints": ["/encode/image", "/encode/text", "/encode/audio", "/health"],
         "status": "ready" if clip_service else "error"
     }
 
@@ -216,6 +281,14 @@ async def encode_text(request: TextRequest):
     embedding = clip_service.encode_text(request.text)
     return {"embedding": embedding, "dimensions": len(embedding)}
 
+@app.post("/encode/audio")
+async def encode_audio(request: AudioRequest):
+    if not clip_service:
+        raise HTTPException(status_code=503, detail="CLAP service not available")
+
+    embedding = clip_service.encode_audio(request.audio_url)
+    return {"embedding": embedding, "dimensions": len(embedding)}
+
 @app.get("/health")
 async def health_check():
     if not clip_service:
@@ -227,7 +300,7 @@ async def health_check():
 
     return {
         "status": "healthy",
-        "model": "clip-vit-large-patch14",
+        "models": ["clip-vit-large-patch14", "clap-htsat-unfused"],
        "device": clip_service.device,
        "service": "ready",
        "cache_dir": cache_dir
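
For context, a minimal client sketch against the endpoints defined above (a hedged example, not part of the commit): the Space base URL and the audio file URL below are placeholders, while the request bodies and the embedding/dimensions response shape come from the handlers in the diff.

import requests

BASE_URL = "https://your-space.hf.space"  # placeholder; substitute the actual deployment URL

# Text embedding via the existing /encode/text endpoint
text_resp = requests.post(f"{BASE_URL}/encode/text",
                          json={"text": "a dog playing in the park"}, timeout=60)
text_resp.raise_for_status()
print(text_resp.json()["dimensions"])  # 768 for CLIP ViT-L/14 text features

# Audio embedding via the new /encode/audio endpoint (CLAP)
audio_resp = requests.post(f"{BASE_URL}/encode/audio",
                           json={"audio_url": "https://example.com/sample.wav"},  # placeholder audio URL
                           timeout=120)
audio_resp.raise_for_status()
audio_embedding = audio_resp.json()["embedding"]  # dimensionality is reported in the response
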
requirements-simple.txt
ADDED
@@ -0,0 +1,7 @@
+torch>=2.0.0
+transformers>=4.30.0
+Pillow>=9.0.0
+requests>=2.28.0
+fastapi>=0.104.0
+uvicorn[standard]>=0.22.0
+python-multipart>=0.0.6
requirements.txt
CHANGED
@@ -8,4 +8,5 @@ python-multipart==0.0.6
 pydantic==2.5.0
 numpy<2.0.0
 librosa>=0.10.0
-soundfile>=0.12.1
+soundfile>=0.12.1
+datasets>=2.14.0
test_fix.py
ADDED
@@ -0,0 +1,24 @@
+from transformers import CLIPProcessor, CLIPModel
+from PIL import Image
+import requests
+import io
+
+# Test the fix
+model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
+processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
+
+# Download test image
+url = "https://xymtmeogzckraglhiuwt.supabase.co/storage/v1/object/public/pins/c1cfd4c9-77a3-4365-b38f-dda173e2a0c5/1750055972401.JPG"
+response = requests.get(url)
+image = Image.open(io.BytesIO(response.content))
+
+if image.mode != 'RGB':
+    image = image.convert('RGB')
+
+# Test the fix: images=[image] instead of images=image
+try:
+    inputs = processor(images=[image], return_tensors="pt")
+    print("✅ SUCCESS: Fix works!")
+    print(f"Input shape: {inputs['pixel_values'].shape}")
+except Exception as e:
+    print(f"❌ FAILED: {e}")