rmoxon committed
Commit 9ee1beb · verified · 1 Parent(s): 87fa678

Upload 8 files

Files changed (5):
  1. .gitignore +64 -0
  2. app.py +88 -15
  3. requirements-simple.txt +7 -0
  4. requirements.txt +2 -1
  5. test_fix.py +24 -0
.gitignore ADDED
@@ -0,0 +1,64 @@
+ # Python
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ *.so
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ pip-wheel-metadata/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ *.manifest
+ *.spec
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # IDE
+ .vscode/
+ .idea/
+ *.swp
+ *.swo
+
+ # OS
+ .DS_Store
+ Thumbs.db
+
+ # Model cache
+ transformers_cache/
+ huggingface_hub/
app.py CHANGED
@@ -3,13 +3,15 @@ import tempfile
  from pathlib import Path
  from fastapi import FastAPI, HTTPException
  from pydantic import BaseModel
- from transformers import CLIPProcessor, CLIPModel
+ from transformers import CLIPProcessor, CLIPModel, ClapModel, ClapProcessor
  import torch
  from PIL import Image
  import requests
  import numpy as np
  import io
  import logging
+ import librosa
+ import soundfile as sf

  # Set up cache directories
  cache_dir = os.environ.get('TRANSFORMERS_CACHE', '/code/cache')
@@ -26,29 +28,42 @@ app = FastAPI(title="CLIP Service", version="1.0.0")

  class CLIPService:
      def __init__(self):
-         logger.info("Loading CLIP model...")
+         logger.info("Loading CLIP and CLAP models...")
          try:
              # Use CPU for Hugging Face free tier
              self.device = "cuda" if torch.cuda.is_available() else "cpu"
              logger.info(f"Using device: {self.device}")

-             # Load model with explicit cache directory
-             self.model = CLIPModel.from_pretrained(
+             # Load CLIP model with explicit cache directory
+             self.clip_model = CLIPModel.from_pretrained(
                  "openai/clip-vit-large-patch14",
                  cache_dir=cache_dir,
                  local_files_only=False
              ).to(self.device)

-             self.processor = CLIPProcessor.from_pretrained(
+             self.clip_processor = CLIPProcessor.from_pretrained(
                  "openai/clip-vit-large-patch14",
                  cache_dir=cache_dir,
                  local_files_only=False
              )

-             logger.info(f"CLIP model loaded successfully on {self.device}")
+             # Load CLAP model for audio processing
+             self.clap_model = ClapModel.from_pretrained(
+                 "laion/clap-htsat-unfused",
+                 cache_dir=cache_dir,
+                 local_files_only=False
+             ).to(self.device)
+
+             self.clap_processor = ClapProcessor.from_pretrained(
+                 "laion/clap-htsat-unfused",
+                 cache_dir=cache_dir,
+                 local_files_only=False
+             )
+
+             logger.info(f"CLIP and CLAP models loaded successfully on {self.device}")

          except Exception as e:
-             logger.error(f"Failed to load CLIP model: {str(e)}")
+             logger.error(f"Failed to load models: {str(e)}")
              raise RuntimeError(f"Model loading failed: {str(e)}")

      def is_supported_format(self, image_url: str) -> bool:
@@ -123,7 +138,7 @@ class CLIPService:
          # Try multiple processor configurations
          try:
              # Method 1: Standard CLIP processing
-             inputs = self.processor(
+             inputs = self.clip_processor(
                  images=image,
                  return_tensors="pt",
                  do_rescale=True,
@@ -133,7 +148,7 @@ class CLIPService:
              logger.warning(f"Method 1 failed: {e1}, trying method 2...")
              try:
                  # Method 2: With padding
-                 inputs = self.processor(
+                 inputs = self.clip_processor(
                      images=image,
                      return_tensors="pt",
                      padding=True,
@@ -143,7 +158,7 @@ class CLIPService:
              except Exception as e2:
                  logger.warning(f"Method 2 failed: {e2}, trying method 3...")
                  # Method 3: Manual preprocessing
-                 inputs = self.processor(
+                 inputs = self.clip_processor(
                      images=[image],
                      return_tensors="pt"
                  )
@@ -151,7 +166,7 @@ class CLIPService:
          inputs = {k: v.to(self.device) for k, v in inputs.items()}

          with torch.no_grad():
-             image_features = self.model.get_image_features(**inputs)
+             image_features = self.clip_model.get_image_features(**inputs)
              image_features = image_features / image_features.norm(dim=-1, keepdim=True)

          return image_features.cpu().numpy().flatten().tolist()
@@ -163,16 +178,63 @@ class CLIPService:
      def encode_text(self, text: str) -> list:
          try:
              logger.info(f"Processing text: {text[:50]}...")
-             inputs = self.processor(text=[text], return_tensors="pt", padding=True).to(self.device)
+             inputs = self.clip_processor(text=[text], return_tensors="pt", padding=True).to(self.device)

              with torch.no_grad():
-                 text_features = self.model.get_text_features(**inputs)
+                 text_features = self.clip_model.get_text_features(**inputs)
                  text_features = text_features / text_features.norm(dim=-1, keepdim=True)

              return text_features.cpu().numpy().flatten().tolist()
          except Exception as e:
              logger.error(f"Error encoding text '{text[:50]}...': {str(e)}")
              raise HTTPException(status_code=500, detail=f"Failed to encode text: {str(e)}")
+
+     def encode_audio(self, audio_url: str) -> list:
+         try:
+             logger.info(f"Processing audio: {audio_url}")
+
+             # Download audio file
+             response = requests.get(audio_url, timeout=60, headers={'User-Agent': 'CLAP-Service/1.0'})
+             response.raise_for_status()
+
+             # Save to temporary file
+             with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp_file:
+                 tmp_file.write(response.content)
+                 tmp_path = tmp_file.name
+
+             try:
+                 # Load audio with librosa
+                 # CLAP expects 48kHz sampling rate
+                 audio_array, sample_rate = librosa.load(tmp_path, sr=48000, mono=True)
+
+                 # Ensure audio is not too long (max 30 seconds for CLAP)
+                 max_length = 30 * 48000  # 30 seconds at 48kHz
+                 if len(audio_array) > max_length:
+                     audio_array = audio_array[:max_length]
+
+                 # Process with CLAP
+                 inputs = self.clap_processor(
+                     audios=audio_array,
+                     sampling_rate=48000,
+                     return_tensors="pt"
+                 )
+
+                 inputs = {k: v.to(self.device) for k, v in inputs.items()}
+
+                 with torch.no_grad():
+                     audio_features = self.clap_model.get_audio_features(**inputs)
+                     audio_features = audio_features / audio_features.norm(dim=-1, keepdim=True)
+
+                 return audio_features.cpu().numpy().flatten().tolist()
+
+             finally:
+                 # Clean up temp file
+                 if os.path.exists(tmp_path):
+                     os.unlink(tmp_path)
+
+         except Exception as e:
+             logger.error(f"Error encoding audio {audio_url}: {str(e)}")
+             raise HTTPException(status_code=500, detail=f"Failed to encode audio: {str(e)}")

  # Initialize service with error handling
  logger.info("Initializing CLIP service...")
@@ -190,13 +252,16 @@ class ImageRequest(BaseModel):
  class TextRequest(BaseModel):
      text: str

+ class AudioRequest(BaseModel):
+     audio_url: str
+
  @app.get("/")
  async def root():
      return {
          "message": "CLIP Service API",
          "version": "1.0.0",
          "model": "clip-vit-large-patch14",
-         "endpoints": ["/encode/image", "/encode/text", "/health"],
+         "endpoints": ["/encode/image", "/encode/text", "/encode/audio", "/health"],
          "status": "ready" if clip_service else "error"
      }

@@ -216,6 +281,14 @@ async def encode_text(request: TextRequest):
      embedding = clip_service.encode_text(request.text)
      return {"embedding": embedding, "dimensions": len(embedding)}

+ @app.post("/encode/audio")
+ async def encode_audio(request: AudioRequest):
+     if not clip_service:
+         raise HTTPException(status_code=503, detail="CLAP service not available")
+
+     embedding = clip_service.encode_audio(request.audio_url)
+     return {"embedding": embedding, "dimensions": len(embedding)}
+
  @app.get("/health")
  async def health_check():
      if not clip_service:
@@ -227,7 +300,7 @@ async def health_check():

      return {
          "status": "healthy",
-         "model": "clip-vit-large-patch14",
+         "models": ["clip-vit-large-patch14", "clap-htsat-unfused"],
          "device": clip_service.device,
          "service": "ready",
          "cache_dir": cache_dir
requirements-simple.txt ADDED
@@ -0,0 +1,7 @@
+ torch>=2.0.0
+ transformers>=4.30.0
+ Pillow>=9.0.0
+ requests>=2.28.0
+ fastapi>=0.104.0
+ uvicorn[standard]>=0.22.0
+ python-multipart>=0.0.6
requirements.txt CHANGED
@@ -8,4 +8,5 @@ python-multipart==0.0.6
  pydantic==2.5.0
  numpy<2.0.0
  librosa>=0.10.0
- soundfile>=0.12.1
+ soundfile>=0.12.1
+ datasets>=2.14.0
test_fix.py ADDED
@@ -0,0 +1,24 @@
+ from transformers import CLIPProcessor, CLIPModel
+ from PIL import Image
+ import requests
+ import io
+
+ # Test the fix
+ model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
+ processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
+
+ # Download test image
+ url = "https://xymtmeogzckraglhiuwt.supabase.co/storage/v1/object/public/pins/c1cfd4c9-77a3-4365-b38f-dda173e2a0c5/1750055972401.JPG"
+ response = requests.get(url)
+ image = Image.open(io.BytesIO(response.content))
+
+ if image.mode != 'RGB':
+     image = image.convert('RGB')
+
+ # Test the fix: images=[image] instead of images=image
+ try:
+     inputs = processor(images=[image], return_tensors="pt")
+     print("✅ SUCCESS: Fix works!")
+     print(f"Input shape: {inputs['pixel_values'].shape}")
+ except Exception as e:
+     print(f"❌ FAILED: {e}")