meg-huggingface
committed on
Commit
·
e79b5e9
1
Parent(s):
58956f6
Trying to handle endpoint errors
Browse files
src/backend/inference_endpoint.py
CHANGED
@@ -20,32 +20,38 @@ def create_endpoint(endpoint_name, repository, framework="pytorch", task="text-g
|
|
20 |
logger.debug("Hit the following exception:")
|
21 |
logger.debug(e)
|
22 |
logger.debug("Attempting to continue.")
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
49 |
|
50 |
endpoint.fetch()
|
51 |
logger.info("Endpoint status: %s." % (endpoint.status))
|
|
|
20 |
logger.debug("Hit the following exception:")
|
21 |
logger.debug(e)
|
22 |
logger.debug("Attempting to continue.")
|
23 |
+
endpoint = get_inference_endpoint(endpoint_name)
|
24 |
+
endpoint.update(repository=repository, framework=framework, task=task, accelerator=accelerator, instance_size=instance_size, instance_type=instance_type)
|
25 |
+
except huggingface_hub.utils._errors.BadRequestError as e:
|
26 |
+
logger.debug("Hit the following exception:")
|
27 |
+
logger.debug(e)
|
28 |
+
logger.debug("Attempting a new instance type.")
|
29 |
+
if instance_type == "nvidia-l4":
|
30 |
+
# Try a larger, different, more expensive GPU.
|
31 |
+
endpoint = create_inference_endpoint(endpoint_name,
|
32 |
+
repository=repository,
|
33 |
+
framework=framework, task=task,
|
34 |
+
accelerator=accelerator,
|
35 |
+
vendor=vendor, region=region,
|
36 |
+
type=type,
|
37 |
+
instance_size="x1",
|
38 |
+
instance_type="nvidia-a100")
|
39 |
+
elif instance_type == "a100" and instance_size == "x1":
|
40 |
+
endpoint = create_inference_endpoint(endpoint_name,
|
41 |
+
repository=repository,
|
42 |
+
framework=framework, task=task,
|
43 |
+
accelerator=accelerator,
|
44 |
+
vendor=vendor, region=region,
|
45 |
+
type=type,
|
46 |
+
instance_size="x4",
|
47 |
+
instance_type="nvidia-a10g")
|
48 |
+
else:
|
49 |
+
logger.info("Getting expensive to try to run this model without human oversight. Exiting.")
|
50 |
+
sys.exit()
|
51 |
+
except Exception as e:
|
52 |
+
logger.debug("Hit error")
|
53 |
+
logger.debug(e)
|
54 |
+
sys.exit()
|
55 |
|
56 |
endpoint.fetch()
|
57 |
logger.info("Endpoint status: %s." % (endpoint.status))
|