Malaji71 committed
Commit 84abbe3 · verified · 1 Parent(s): 659c8b0

Update optimizer.py

Files changed (1):
  1. optimizer.py +119 -178
optimizer.py CHANGED
@@ -1,6 +1,6 @@
 """
 Ultra Supreme Optimizer - Main optimization engine for image analysis
-IMPROVED VERSION - Uses the full CLIP Interrogator prompt
+FLORENCE-2 VERSION - Uses Florence-2 instead of CLIP Interrogator
 """
 
 # IMPORTANT: spaces must be imported BEFORE torch or any CUDA-using library
@@ -14,7 +14,7 @@ from typing import Tuple, Dict, Any, Optional
 import torch
 import numpy as np
 from PIL import Image
-from clip_interrogator import Config, Interrogator
+from transformers import AutoProcessor, AutoModelForCausalLM
 
 from analyzer import UltraSupremeAnalyzer
 
@@ -25,12 +25,12 @@ class UltraSupremeOptimizer:
     """Main optimizer class for ultra supreme image analysis"""
 
     def __init__(self):
-        self.interrogator: Optional[Interrogator] = None
+        self.processor = None
+        self.model = None
         self.analyzer = UltraSupremeAnalyzer()
        self.usage_count = 0
        self.device = self._get_device()
        self.is_initialized = False
-        # Do NOT initialize the model here - load it lazily
 
     @staticmethod
     def _get_device() -> str:
@@ -43,31 +43,37 @@ class UltraSupremeOptimizer:
         return "cpu"
 
     def initialize_model(self) -> bool:
-        """Initialize the CLIP Interrogator model"""
+        """Initialize the Florence-2 model"""
         if self.is_initialized:
             return True
 
         try:
-            # Configure for CPU initially
-            config = Config(
-                clip_model_name="ViT-L-14/openai",
-                download_cache=True,
-                chunk_size=2048,
-                quiet=True,
-                device="cpu"  # Always initialize on CPU
+            logger.info("Loading Florence-2 model...")
+
+            # Load the Florence-2 base model ('microsoft/Florence-2-large' is a higher-quality option)
+            model_id = "microsoft/Florence-2-base"
+
+            self.processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
+            self.model = AutoModelForCausalLM.from_pretrained(
+                model_id,
+                trust_remote_code=True,
+                torch_dtype=torch.float16 if self.device == "cuda" else torch.float32
             )
 
-            self.interrogator = Interrogator(config)
+            # Keep the model on CPU initially
+            self.model = self.model.to("cpu")
+            self.model.eval()
+
             self.is_initialized = True
 
             # Clean up memory after initialization
             gc.collect()
 
-            logger.info("Model initialized successfully on CPU")
+            logger.info("Florence-2 model initialized successfully")
             return True
 
         except Exception as e:
-            logger.error(f"Initialization error: {e}")
+            logger.error(f"Model initialization error: {e}")
             return False
 
     def optimize_image(self, image: Any) -> Optional[Image.Image]:
@@ -86,8 +92,8 @@
         if image.mode != 'RGB':
             image = image.convert('RGB')
 
-        # Resize if too large
-        max_size = 768  # Reduce the size to avoid memory problems
+        # Florence-2 handles various sizes well, but let's be reasonable
+        max_size = 1024
         if image.size[0] > max_size or image.size[1] > max_size:
             image.thumbnail((max_size, max_size), Image.Resampling.LANCZOS)
 
@@ -98,7 +104,7 @@
         return None
 
     def apply_flux_rules(self, base_prompt: str) -> str:
-        """Apply the Flux rules to a base prompt from CLIP Interrogator"""
+        """Apply the Flux rules to a base prompt"""
 
         # Clean unwanted elements out of the prompt
         cleanup_patterns = [
@@ -148,110 +154,88 @@
 
         return final_prompt
 
-    def _prepare_models_for_gpu(self):
-        """Prepare the models for GPU with the correct precision"""
-        try:
-            if hasattr(self.interrogator, 'caption_model'):
-                self.interrogator.caption_model = self.interrogator.caption_model.half().to("cuda")
-
-            if hasattr(self.interrogator, 'clip_model'):
-                self.interrogator.clip_model = self.interrogator.clip_model.half().to("cuda")
-
-            if hasattr(self.interrogator, 'blip_model'):
-                self.interrogator.blip_model = self.interrogator.blip_model.half().to("cuda")
-
-            self.interrogator.config.device = "cuda"
-            logger.info("Models prepared for GPU with FP16")
-
-        except Exception as e:
-            logger.error(f"Error preparing models for GPU: {e}")
-            raise
-
-    def _prepare_models_for_cpu(self):
-        """Prepare the models for CPU with float32"""
-        try:
-            if hasattr(self.interrogator, 'caption_model'):
-                self.interrogator.caption_model = self.interrogator.caption_model.float().to("cpu")
-
-            if hasattr(self.interrogator, 'clip_model'):
-                self.interrogator.clip_model = self.interrogator.clip_model.float().to("cpu")
-
-            if hasattr(self.interrogator, 'blip_model'):
-                self.interrogator.blip_model = self.interrogator.blip_model.float().to("cpu")
-
-            self.interrogator.config.device = "cpu"
-            logger.info("Models prepared for CPU with FP32")
-
-        except Exception as e:
-            logger.error(f"Error preparing models for CPU: {e}")
-            raise
-
     @spaces.GPU(duration=60)
-    def run_clip_inference(self, image: Image.Image) -> Tuple[str, str, str]:
-        """Only the CLIP inference uses the GPU"""
+    def run_florence_inference(self, image: Image.Image) -> Tuple[str, str, str]:
+        """Run Florence-2 inference on GPU"""
         try:
-            # Do NOT use half precision - keep float32 for compatibility
-            if hasattr(self.interrogator, 'caption_model'):
-                self.interrogator.caption_model = self.interrogator.caption_model.to("cuda")
-
-            if hasattr(self.interrogator, 'clip_model'):
-                self.interrogator.clip_model = self.interrogator.clip_model.to("cuda")
-
-            if hasattr(self.interrogator, 'blip_model'):
-                self.interrogator.blip_model = self.interrogator.blip_model.to("cuda")
-
-            self.interrogator.config.device = "cuda"
-            logger.info("Models moved to GPU with float32 (full precision)")
-
-            # Run the interrogations without autocast to avoid half-precision problems
-            full_prompt = self.interrogator.interrogate(image)
-            clip_fast = self.interrogator.interrogate_fast(image)
-            clip_classic = self.interrogator.interrogate_classic(image)
-
-            return full_prompt, clip_fast, clip_classic
-
-        except Exception as e:
-            logger.error(f"GPU inference error: {e}")
-            # Try CPU as a fallback
-            return self._run_cpu_inference(image)
-
-    def _safe_interrogate(self, image: Image.Image, method: str) -> str:
-        """Run interrogate safely, handling precision"""
-        try:
-            # Temporarily patch the image-processing method
-            original_method = getattr(self.interrogator, method)
-
-            # Run the method
-            result = original_method(image)
-
-            return result
-
-        except Exception as e:
-            logger.error(f"Error in {method}: {e}")
-            return f"Error processing with {method}"
-
-    def _run_cpu_inference(self, image: Image.Image) -> Tuple[str, str, str]:
-        """Run inference on CPU as a fallback"""
-        try:
-            logger.info("Running CPU inference as fallback")
+            # Move the model to GPU
+            self.model = self.model.to("cuda")
+            logger.info("Florence-2 model moved to GPU")
+
+            # Task prompts for different types of analysis
+            tasks = {
+                "detailed_caption": "<DETAILED_CAPTION>",
+                "more_detailed_caption": "<MORE_DETAILED_CAPTION>",
+                "caption": "<CAPTION>",
+                "dense_region_caption": "<DENSE_REGION_CAPTION>"
+            }
+
+            results = {}
+
+            # Run the different captioning tasks
+            for task_name, task_prompt in tasks.items():
+                try:
+                    inputs = self.processor(text=task_prompt, images=image, return_tensors="pt")
+                    inputs = {k: v.to("cuda") for k, v in inputs.items()}
+
+                    with torch.cuda.amp.autocast(dtype=torch.float16):
+                        generated_ids = self.model.generate(
+                            input_ids=inputs["input_ids"],
+                            pixel_values=inputs["pixel_values"],
+                            max_new_tokens=1024,
+                            num_beams=3,
+                            do_sample=False
+                        )
+
+                    generated_text = self.processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
+                    parsed = self.processor.post_process_generation(generated_text, task=task_prompt, image_size=(image.width, image.height))
+
+                    # Extract the caption from the parsed result
+                    if task_prompt in parsed:
+                        results[task_name] = parsed[task_prompt]
+                    else:
+                        # Sometimes the result is directly in the parsed output
+                        results[task_name] = str(parsed) if parsed else ""
+
+                except Exception as e:
+                    logger.warning(f"Error in {task_name}: {e}")
+                    results[task_name] = ""
 
-            # Prepare the models for CPU
-            self._prepare_models_for_cpu()
+            # Extract results
+            detailed_caption = results.get("detailed_caption", "")
+            more_detailed = results.get("more_detailed_caption", "")
+            caption = results.get("caption", "")
 
-            # Run on CPU without autocast
-            full_prompt = self.interrogator.interrogate(image)
-            clip_fast = self.interrogator.interrogate_fast(image)
-            clip_classic = self.interrogator.interrogate_classic(image)
+            # Combine into a comprehensive description
+            if more_detailed:
+                full_prompt = more_detailed
+            elif detailed_caption:
+                full_prompt = detailed_caption
+            else:
+                full_prompt = caption
+
+            # Use the different levels as our three outputs
+            clip_fast = caption if caption else "A photograph"
+            clip_classic = detailed_caption if detailed_caption else full_prompt
+            clip_best = more_detailed if more_detailed else full_prompt
+
+            logger.info("Florence-2 captions generated successfully")
 
             return full_prompt, clip_fast, clip_classic
 
         except Exception as e:
-            logger.error(f"CPU inference also failed: {e}")
-            return "Error: Failed to process image", "Error", "Error"
+            logger.error(f"Florence-2 inference error: {e}")
+            # Move the model back to CPU to free GPU memory
+            self.model = self.model.to("cpu")
+            raise e
+        finally:
+            # Always move the model back to CPU after inference
+            self.model = self.model.to("cpu")
+            torch.cuda.empty_cache()
 
     def generate_ultra_supreme_prompt(self, image: Any) -> Tuple[str, str, int, Dict[str, int]]:
         """
-        Generate ultra supreme prompt from image using the complete pipeline
+        Generate ultra supreme prompt from image using Florence-2
 
         Returns:
             Tuple of (prompt, analysis_info, score, breakdown)
@@ -275,30 +259,30 @@
 
         start_time = datetime.now()
 
-        logger.info("ULTRA SUPREME ANALYSIS - Starting complete pipeline with multi-model analysis")
+        logger.info("ULTRA SUPREME ANALYSIS - Starting with Florence-2")
 
-        # Run CLIP inference
-        full_prompt, clip_fast, clip_classic = self.run_clip_inference(image)
-
-        # Check whether errors occurred
-        if "Error" in full_prompt:
-            logger.warning("Using fallback prompt due to inference error")
+        # Run Florence-2 inference
+        try:
+            full_prompt, caption_fast, caption_detailed = self.run_florence_inference(image)
+        except Exception as e:
+            logger.error(f"Florence-2 failed: {e}")
+            # Basic fallback
             full_prompt = "A photograph"
-            clip_fast = "image"
-            clip_classic = "picture"
+            caption_fast = "image"
+            caption_detailed = "detailed image"
 
-        logger.info(f"CLIP complete prompt: {full_prompt[:100]}...")
+        logger.info(f"Florence-2 caption: {full_prompt[:100]}...")
 
-        # NEW: run the ultra supreme analysis with multiple models
+        # Run the ultra supreme analysis with multiple models
         logger.info("Running multi-model ultra supreme analysis...")
         ultra_analysis = self.analyzer.ultra_supreme_analysis(
-            image, clip_fast, clip_classic, full_prompt
+            image, caption_fast, caption_detailed, full_prompt
         )
 
         # Build the enhanced prompt from the full analysis
         enhanced_prompt_parts = []
 
-        # Base prompt from CLIP
+        # Base prompt from Florence
        enhanced_prompt_parts.append(full_prompt)
 
         # Add demographic information if available
@@ -339,7 +323,7 @@
 
         # Generate the enhanced analysis report with multi-model data
         analysis_info = self._generate_ultra_analysis_report(
-            ultra_analysis, score, breakdown, duration
+            ultra_analysis, score, breakdown, duration, "Florence-2"
         )
 
         return optimized_prompt, analysis_info, score, breakdown
@@ -348,58 +332,9 @@
             logger.error(f"Ultra supreme generation error: {e}", exc_info=True)
             return f"❌ Error: {str(e)}", "Please try with a different image.", 0, {}
 
-    def _detect_style(self, prompt: str) -> str:
-        """Detect the main style of the prompt"""
-        styles = {
-            "portrait": ["portrait", "person", "face", "headshot"],
-            "landscape": ["landscape", "mountain", "nature", "scenery"],
-            "street": ["street", "urban", "city"],
-            "artistic": ["artistic", "abstract", "conceptual"],
-            "dramatic": ["dramatic", "cinematic", "moody"]
-        }
-
-        prompt_lower = prompt.lower()
-        for style_name, keywords in styles.items():
-            if any(keyword in prompt_lower for keyword in keywords):
-                return style_name
-
-        return "general"
-
-    def _detect_subject(self, prompt: str) -> str:
-        """Detect the main subject of the prompt"""
-        if not prompt:
-            return "Unknown"
-
-        # Take the first meaningful words
-        words = prompt.split(',')[0].split()
-        if len(words) > 3:
-            return ' '.join(words[:4])
-        return prompt.split(',')[0] if prompt else "Unknown"
-
-    def _calculate_score(self, optimized_prompt: str, base_prompt: str) -> int:
-        """Calculate the score based on prompt quality"""
-        score = 0
-
-        # Base score for length and richness
-        score += min(len(base_prompt) // 10, 25)
-
-        # Technical enhancement
-        if "Shot on" in optimized_prompt:
-            score += 25
-
-        # Lighting quality
-        if "lighting" in optimized_prompt.lower():
-            score += 25
-
-        # Professional quality
-        if any(word in optimized_prompt.lower() for word in ["professional", "masterful", "epic", "cinematic"]):
-            score += 25
-
-        return min(score, 100)
-
     def _generate_ultra_analysis_report(self, analysis: Dict[str, Any],
                                         score: int, breakdown: Dict[str, int],
-                                        duration: float) -> str:
+                                        duration: float, caption_model: str = "Florence-2") -> str:
         """Generate ultra detailed analysis report with multi-model results"""
 
         device_used = "cuda" if torch.cuda.is_available() else "cpu"
@@ -457,9 +392,12 @@
         # Intelligence metrics
         metrics = analysis["intelligence_metrics"]
 
+        # Caption info
+        caption_info = analysis.get("clip_best", "")[:150] + "..." if len(analysis.get("clip_best", "")) > 150 else analysis.get("clip_best", "")
+
         analysis_info = f"""**🚀 ULTRA SUPREME MULTI-MODEL ANALYSIS COMPLETE**
-**Processing:** {gpu_status} • {duration:.1f}s • Multi-Model Pipeline
-**Ultra Score:** {score}/100 • Models: CLIP + DeepFace + MediaPipe + Transformers
+**Processing:** {gpu_status} • {duration:.1f}s • {caption_model} + Multi-Model Pipeline
+**Ultra Score:** {score}/100 • Models: {caption_model} + DeepFace + MediaPipe + Transformers
 
 **📊 BREAKDOWN:**
 • Prompt Quality: {breakdown.get('prompt_quality', 0)}/25
@@ -467,6 +405,9 @@
 • Model Confidence: {breakdown.get('model_confidence', 0)}/25
 • Feature Richness: {breakdown.get('feature_richness', 0)}/25
 
+**📝 VISION-LANGUAGE ANALYSIS:**
+**{caption_model} Caption:** {caption_info}
+
 **🧠 DEEP ANALYSIS RESULTS:**
 
 **👤 DEMOGRAPHICS & IDENTITY:**
@@ -491,9 +432,9 @@
 • **Technical Optimization:** {metrics['technical_optimization_score']}/100
 
 **✨ MULTI-MODEL ADVANTAGES:**
+✅ {caption_model}: State-of-the-art vision-language understanding
 ✅ DeepFace: Accurate age, gender, emotion detection
 ✅ MediaPipe: Body pose and gesture analysis
-✅ CLIP: Semantic understanding and context
 ✅ Transformers: Advanced emotion classification
 ✅ OpenCV: Robust face detection