Tonic committed
Commit 579637f · 1 Parent(s): efd66db

attempt to solve gpu error

Files changed (1)
  1. app.py +138 -87
app.py CHANGED
@@ -335,55 +335,100 @@ def make_prediction(symbol: str, timeframe: str = "1d", prediction_days: int = 5
 
         actual_prediction_length = max(1, actual_prediction_length)
 
-        with torch.inference_mode():
-            try:
-                print(f"Attempting prediction with context shape: {context.shape}")
-                print(f"Prediction length: {actual_prediction_length}")
-
-                # Ensure context is properly formatted for Chronos
-                if len(context.shape) == 1:
-                    context = context.unsqueeze(0)
-
-                # Verify device and dtype
-                print(f"Context device: {context.device}")
-                print(f"Context dtype: {context.dtype}")
-                print(f"Model device: {next(pipe.model.parameters()).device}")
-                print(f"Model dtype: {next(pipe.model.parameters()).dtype}")
-
-                # Move model to evaluation mode
-                pipe.model.eval()
+        with torch.amp.autocast('cuda'):
+            # Ensure all inputs are on GPU
+            context = context.to(device)
+
+            # Move quantile levels to GPU
+            quantile_levels = torch.tensor([0.1, 0.5, 0.9], device=device, dtype=dtype)
+
+            # Ensure prediction length is on GPU
+            prediction_length = torch.tensor(actual_prediction_length, device=device, dtype=torch.long)
+
+            # Force all model components to GPU
+            pipe.model = pipe.model.to(device)
+
+            # Move model to evaluation mode
+            pipe.model.eval()
+
+            # Ensure context is properly shaped and on GPU
+            if len(context.shape) == 1:
+                context = context.unsqueeze(0)
+            context = context.to(device)
+
+            # Move all model parameters and buffers to GPU
+            for param in pipe.model.parameters():
+                param.data = param.data.to(device)
+            for buffer in pipe.model.buffers():
+                buffer.data = buffer.data.to(device)
+
+            # Move all model submodules to GPU
+            for module in pipe.model.modules():
+                if hasattr(module, 'to'):
+                    module.to(device)
+
+            # Move all model attributes to GPU
+            for name, value in pipe.model.__dict__.items():
+                if isinstance(value, torch.Tensor):
+                    pipe.model.__dict__[name] = value.to(device)
+
+            # Move all model config tensors to GPU
+            if hasattr(pipe.model, 'config'):
+                for key, value in pipe.model.config.__dict__.items():
+                    if isinstance(value, torch.Tensor):
+                        setattr(pipe.model.config, key, value.to(device))
+
+            # Move all pipeline tensors to GPU
+            for name, value in pipe.__dict__.items():
+                if isinstance(value, torch.Tensor):
+                    setattr(pipe, name, value.to(device))
+
+            # Ensure all model states are on GPU
+            if hasattr(pipe.model, 'state_dict'):
+                state_dict = pipe.model.state_dict()
+                for key in state_dict:
+                    if isinstance(state_dict[key], torch.Tensor):
+                        state_dict[key] = state_dict[key].to(device)
+                pipe.model.load_state_dict(state_dict)
+
+            # Move any additional components to GPU
+            if hasattr(pipe, 'tokenizer'):
+                for name, value in pipe.tokenizer.__dict__.items():
+                    if isinstance(value, torch.Tensor):
+                        setattr(pipe.tokenizer, name, value.to(device))
+
+            # Ensure all inputs are on the same device
+            with torch.cuda.device(device):
+                # Force synchronization to ensure all tensors are on GPU
+                torch.cuda.synchronize()
 
-                # Move the entire model and all its components to GPU
+                # Ensure all model components are on GPU
                 pipe.model = pipe.model.to(device)
 
-                # Move all model parameters and buffers to GPU
-                for param in pipe.model.parameters():
-                    param.data = param.data.to(device)
-                for buffer in pipe.model.buffers():
-                    buffer.data = buffer.data.to(device)
-
-                # Move all model submodules to GPU
-                for module in pipe.model.modules():
+                # Move any additional tensors in the model to GPU
+                for name, module in pipe.model.named_modules():
                     if hasattr(module, 'to'):
                         module.to(device)
+                        # Move any tensors in the module's __dict__
+                        for key, value in module.__dict__.items():
+                            if isinstance(value, torch.Tensor):
+                                setattr(module, key, value.to(device))
 
-                # Move all model attributes to GPU
-                for name, value in pipe.model.__dict__.items():
+                # Move any additional tensors in the pipeline to GPU
+                for name, value in pipe.__dict__.items():
                     if isinstance(value, torch.Tensor):
-                        pipe.model.__dict__[name] = value.to(device)
+                        setattr(pipe, name, value.to(device))
 
-                # Move all model config tensors to GPU
+                # Ensure all model components are in eval mode
+                pipe.model.eval()
+
+                # Move any additional tensors in the model's config to GPU
                 if hasattr(pipe.model, 'config'):
                     for key, value in pipe.model.config.__dict__.items():
                         if isinstance(value, torch.Tensor):
                             setattr(pipe.model.config, key, value.to(device))
 
-                # Move all pipeline tensors to GPU
-                for name, value in pipe.__dict__.items():
-                    if isinstance(value, torch.Tensor):
-                        setattr(pipe, name, value.to(device))
-
-                # Ensure all model states are on GPU
+                # Move any additional tensors in the model's state dict to GPU
                 if hasattr(pipe.model, 'state_dict'):
                     state_dict = pipe.model.state_dict()
                     for key in state_dict:
@@ -391,64 +436,70 @@ def make_prediction(symbol: str, timeframe: str = "1d", prediction_days: int = 5
                             state_dict[key] = state_dict[key].to(device)
                     pipe.model.load_state_dict(state_dict)
 
-                # Move any additional components to GPU
-                if hasattr(pipe, 'tokenizer'):
-                    for name, value in pipe.tokenizer.__dict__.items():
-                        if isinstance(value, torch.Tensor):
-                            setattr(pipe.tokenizer, name, value.to(device))
-
-                # Ensure context is properly shaped and on GPU
-                if len(context.shape) == 1:
-                    context = context.unsqueeze(0)
-                context = context.to(device)
-
-                # Ensure all inputs are on the same device
-                with torch.cuda.device(device):
-                    # Force synchronization to ensure all tensors are on GPU
-                    torch.cuda.synchronize()
-
-                # Make prediction
-                quantiles, mean = pipe.predict_quantiles(
-                    context=context,
-                    prediction_length=actual_prediction_length,
-                    quantile_levels=[0.1, 0.5, 0.9]
-                )
+                # Move any additional tensors in the model's buffers to GPU
+                for name, buffer in pipe.model.named_buffers():
+                    if buffer is not None:
+                        pipe.model.register_buffer(name, buffer.to(device))
 
-                if quantiles is None or mean is None:
-                    raise ValueError("Chronos returned empty prediction")
+                # Move any additional tensors in the model's parameters to GPU
+                for name, param in pipe.model.named_parameters():
+                    if param is not None:
+                        param.data = param.data.to(device)
 
-                print(f"Quantiles shape: {quantiles.shape}, Mean shape: {mean.shape}")
-
-                # Convert to numpy arrays
-                quantiles = quantiles.detach().cpu().numpy()
-                mean = mean.detach().cpu().numpy()
-
-                # Denormalize predictions
-                mean_pred = scaler.inverse_transform(mean.reshape(-1, 1)).flatten()
-                lower_bound = scaler.inverse_transform(quantiles[0, :, 0].reshape(-1, 1)).flatten()
-                upper_bound = scaler.inverse_transform(quantiles[0, :, 2].reshape(-1, 1)).flatten()
+                # Move any additional tensors in the model's attributes to GPU
+                for name, value in pipe.model.__dict__.items():
+                    if isinstance(value, torch.Tensor):
+                        pipe.model.__dict__[name] = value.to(device)
 
-                # Calculate standard deviation from quantiles
-                std_pred = (upper_bound - lower_bound) / (2 * 1.645)
+                # Move any additional tensors in the model's modules to GPU
+                for name, module in pipe.model.named_modules():
+                    if hasattr(module, 'to'):
+                        module.to(device)
+                        # Move any tensors in the module's __dict__
+                        for key, value in module.__dict__.items():
+                            if isinstance(value, torch.Tensor):
+                                setattr(module, key, value.to(device))
 
-                # If we had to limit the prediction length, extend the prediction
-                if actual_prediction_length < prediction_days:
-                    last_pred = mean_pred[-1]
-                    last_std = std_pred[-1]
-                    extension = np.array([last_pred * (1 + np.random.normal(0, last_std, prediction_days - actual_prediction_length))])
-                    mean_pred = np.concatenate([mean_pred, extension])
-                    std_pred = np.concatenate([std_pred, np.full(prediction_days - actual_prediction_length, last_std)])
+                # Force synchronization again to ensure all tensors are on GPU
+                torch.cuda.synchronize()
 
-            except Exception as e:
-                print(f"Chronos prediction error: {str(e)}")
-                print(f"Error type: {type(e)}")
-                print(f"Error details: {str(e)}")
-                raise
+                # Make prediction
+                quantiles, mean = pipe.predict_quantiles(
+                    context=context,
+                    prediction_length=actual_prediction_length,
+                    quantile_levels=[0.1, 0.5, 0.9]
+                )
+
+                if quantiles is None or mean is None:
+                    raise ValueError("Chronos returned empty prediction")
+
+                print(f"Quantiles shape: {quantiles.shape}, Mean shape: {mean.shape}")
+
+                # Convert to numpy arrays
+                quantiles = quantiles.detach().cpu().numpy()
+                mean = mean.detach().cpu().numpy()
+
+                # Denormalize predictions
+                mean_pred = scaler.inverse_transform(mean.reshape(-1, 1)).flatten()
+                lower_bound = scaler.inverse_transform(quantiles[0, :, 0].reshape(-1, 1)).flatten()
+                upper_bound = scaler.inverse_transform(quantiles[0, :, 2].reshape(-1, 1)).flatten()
+
+                # Calculate standard deviation from quantiles
+                std_pred = (upper_bound - lower_bound) / (2 * 1.645)
+
+                # If we had to limit the prediction length, extend the prediction
+                if actual_prediction_length < prediction_days:
+                    last_pred = mean_pred[-1]
+                    last_std = std_pred[-1]
+                    extension = np.array([last_pred * (1 + np.random.normal(0, last_std, prediction_days - actual_prediction_length))])
+                    mean_pred = np.concatenate([mean_pred, extension])
+                    std_pred = np.concatenate([std_pred, np.full(prediction_days - actual_prediction_length, last_std)])
 
     except Exception as e:
-        print(f"Chronos prediction failed: {str(e)}")
-        print("Falling back to technical analysis")
-        strategy = "technical"
+        print(f"Chronos prediction error: {str(e)}")
+        print(f"Error type: {type(e)}")
+        print(f"Error details: {str(e)}")
+        raise
 
     if strategy == "technical":
         # Technical analysis based prediction
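
A note on the device-placement strategy above: in PyTorch, a single Module.to(device) call already moves every registered parameter, buffer, and submodule recursively, so the explicit loops over parameters(), buffers(), named_modules(), and the state_dict round-trip are redundant with the initial pipe.model = pipe.model.to(device). Similarly, torch.amp.autocast('cuda') only selects mixed-precision kernels inside its scope; it does not move tensors between devices, so it cannot by itself resolve a device-mismatch error. Below is a minimal sketch (not the committed code) of the placement logic this change performs by hand; the checkpoint name and the random context are stand-ins for whatever app.py actually loads:

import torch
from chronos import ChronosPipeline  # dependency assumed from app.py

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hypothetical pipeline and context; app.py builds these elsewhere.
pipe = ChronosPipeline.from_pretrained('amazon/chronos-t5-small')
context = torch.randn(64)  # stand-in for the scaled price series

# One recursive call moves all parameters, buffers, and submodules.
pipe.model = pipe.model.to(device)
pipe.model.eval()

# Only the input tensor needs a separate move and a batch dimension.
if len(context.shape) == 1:
    context = context.unsqueeze(0)
context = context.to(device)

with torch.inference_mode():
    quantiles, mean = pipe.predict_quantiles(
        context=context,
        prediction_length=5,
        quantile_levels=[0.1, 0.5, 0.9],
    )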
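
One detail of the committed math worth flagging: std_pred = (upper_bound - lower_bound) / (2 * 1.645) treats the 0.1 and 0.9 quantiles as a ±1.645σ interval, but under a normal distribution the 10%/90% quantiles sit at ±1.2816σ (1.645 is the z-value for the 5%/95% pair). The committed divisor therefore underestimates the standard deviation by roughly a factor of 0.78. A quick check, assuming scipy is available:

from scipy.stats import norm

# z-values for symmetric quantile pairs of a standard normal
z_90 = norm.ppf(0.90)  # ~1.2816, matches the 0.1/0.9 pair used above
z_95 = norm.ppf(0.95)  # ~1.6449, matches a 0.05/0.95 pair

lower, upper = norm.ppf([0.10, 0.90])  # quantiles for sigma = 1
print((upper - lower) / (2 * z_90))    # 1.0: correct divisor recovers sigma
print((upper - lower) / (2 * 1.645))   # ~0.78: the committed divisor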