import json
import logging
import os
from typing import Any, Callable
from urllib.parse import urlparse

import httpx
import tenacity
from tenacity import RetryCallState

from openhands.core.config import OpenHandsConfig
from openhands.core.exceptions import (
    AgentRuntimeDisconnectedError,
    AgentRuntimeError,
    AgentRuntimeNotFoundError,
    AgentRuntimeNotReadyError,
    AgentRuntimeUnavailableError,
)
from openhands.core.logger import openhands_logger as logger
from openhands.events import EventStream
from openhands.integrations.provider import PROVIDER_TOKEN_TYPE
from openhands.runtime.builder.remote import RemoteRuntimeBuilder
from openhands.runtime.impl.action_execution.action_execution_client import (
    ActionExecutionClient,
)
from openhands.runtime.plugins import PluginRequirement
from openhands.runtime.utils.command import (
    DEFAULT_MAIN_MODULE,
    get_action_execution_server_startup_command,
)
from openhands.runtime.utils.request import send_request
from openhands.runtime.utils.runtime_build import build_runtime_image
from openhands.utils.async_utils import call_sync_from_async
from openhands.utils.tenacity_stop import stop_if_should_exit


class RemoteRuntime(ActionExecutionClient):
    """This runtime will connect to a remote oh-runtime-client.

    The runtime is created, resumed, paused and stopped through the remote
    runtime API at ``config.sandbox.remote_runtime_api_url``; actions are then
    sent to the URL that API returns for the runtime.
    """

    port: int = 60000  # default port for the remote runtime client
    runtime_id: str | None = None
    runtime_url: str | None = None
    _runtime_initialized: bool = False
    runtime_builder: RemoteRuntimeBuilder
    container_image: str
    available_hosts: dict[str, int]
    main_module: str

    def __init__(
        self,
        config: OpenHandsConfig,
        event_stream: EventStream,
        sid: str = 'default',
        plugins: list[PluginRequirement] | None = None,
        env_vars: dict[str, str] | None = None,
        status_callback: Callable[..., None] | None = None,
        attach_to_existing: bool = False,
        headless_mode: bool = True,
        user_id: str | None = None,
        git_provider_tokens: PROVIDER_TOKEN_TYPE | None = None,
        main_module: str = DEFAULT_MAIN_MODULE,
    ) -> None:
        """Validate the remote-runtime configuration and set up the API session.

        Raises:
            ValueError: If ``config.sandbox.api_key`` or
                ``config.sandbox.remote_runtime_api_url`` is not set.
        """
        super().__init__(
            config,
            event_stream,
            sid,
            plugins,
            env_vars,
            status_callback,
            attach_to_existing,
            headless_mode,
            user_id,
            git_provider_tokens,
        )
        if self.config.sandbox.api_key is None:
            raise ValueError(
                'API key is required to use the remote runtime. '
                'Please set the API key in the config (config.toml) or as an environment variable (SANDBOX_API_KEY).'
            )
        self.session.headers.update({'X-API-Key': self.config.sandbox.api_key})

        if self.config.workspace_base is not None:
            self.log(
                'debug',
                'Setting workspace_base is not supported in the remote runtime.',
            )
        if self.config.sandbox.remote_runtime_api_url is None:
            raise ValueError(
                'remote_runtime_api_url is required in the remote runtime.'
            )

        assert self.config.sandbox.remote_runtime_class in (None, 'sysbox', 'gvisor')

        self.main_module = main_module
        self.runtime_builder = RemoteRuntimeBuilder(
            self.config.sandbox.remote_runtime_api_url,
            self.config.sandbox.api_key,
            self.session,
        )
        self.available_hosts: dict[str, int] = {}
        self._session_api_key: str | None = None

    def log(self, level: str, message: str, exc_info: bool | None = None) -> None:
        """Log ``message`` at ``level``, tagged with the session and runtime IDs.

        Args:
            level: Name of the logger method to call (e.g. ``'info'``).
            message: Text to log.
            exc_info: Forwarded to the logger call unchanged.
        """
        getattr(logger, level)(
            message,
            stacklevel=2,
            exc_info=exc_info,
            extra={
                'session_id': self.sid,
                'runtime_id': self.runtime_id,
            },
        )

    @property
    def action_execution_server_url(self) -> str:
        """URL of the runtime, as returned by the remote runtime API.

        Raises:
            NotImplementedError: If the runtime URL has not been set yet.
        """
        if self.runtime_url is None:
            raise NotImplementedError('Runtime URL is not initialized')
        return self.runtime_url

    async def connect(self) -> None:
        """Start or attach to the runtime, then set up its initial environment.

        If startup fails the runtime is closed and the startup error is
        re-raised.
        """
        try:
            await call_sync_from_async(self._start_or_attach_to_runtime)
        except Exception:
            # Log before cleaning up so the startup failure is always recorded.
            self.log('error', 'Runtime failed to start', exc_info=True)
            try:
                self.close()
            except Exception as close_error:
                # Cleanup is best-effort here: a failure in close() must not
                # replace the startup error the caller needs to see.
                self.log(
                    'error',
                    f'Failed to clean up runtime after startup failure: {close_error}',
                    exc_info=True,
                )
            raise
        await call_sync_from_async(self.setup_initial_env)
        self._runtime_initialized = True

    def _start_or_attach_to_runtime(self) -> None:
        """Reuse an existing runtime for this session or build/start a new one.

        Raises:
            AgentRuntimeNotFoundError: If ``attach_to_existing`` is set and no
                usable runtime exists for this session.
        """
        self.log('info', 'Starting or attaching to runtime')
        existing_runtime = self._check_existing_runtime()
        if existing_runtime:
            self.log('info', f'Using existing runtime with ID: {self.runtime_id}')
        elif self.attach_to_existing:
            self.log('info', f'Failed to find existing runtime for SID: {self.sid}')
            raise AgentRuntimeNotFoundError(
                f'Could not find existing runtime for SID: {self.sid}'
            )
        else:
            self.log('info', 'No existing runtime found, starting a new one')
            self.send_status_message('STATUS$STARTING_CONTAINER')
            if self.config.sandbox.runtime_container_image is None:
                self.log(
                    'info',
                    f'Building remote runtime with base image: {self.config.sandbox.base_container_image}',
                )
                self._build_runtime()
            else:
                self.log(
                    'info',
                    f'Starting remote runtime with image: {self.config.sandbox.runtime_container_image}',
                )
                self.container_image = self.config.sandbox.runtime_container_image
            self._start_runtime()
        assert self.runtime_id is not None, (
            'Runtime ID is not set. This should never happen.'
        )
        assert self.runtime_url is not None, (
            'Runtime URL is not set. This should never happen.'
        )
        self.send_status_message('STATUS$WAITING_FOR_CLIENT')
        if not self.attach_to_existing:
            self.log('info', 'Waiting for runtime to be alive...')
        self._wait_until_alive()
        if not self.attach_to_existing:
            self.log('info', 'Runtime is ready.')
        self.send_status_message(' ')

    def _check_existing_runtime(self) -> bool:
        """Look up the runtime registered for this session ID.

        Returns:
            True if a runtime is running, or was paused and has been resumed.
            False if none exists (404), it is stopped, resuming failed, or the
            reported status is not recognised.

        Raises:
            httpx.HTTPError: For any request failure other than a 404 response.
            json.decoder.JSONDecodeError: If the API response is not valid JSON.
        """
        self.log('info', f'Checking for existing runtime with session ID: {self.sid}')
        try:
            response = self._send_runtime_api_request(
                'GET',
                f'{self.config.sandbox.remote_runtime_api_url}/sessions/{self.sid}',
            )
            data = response.json()
            status = data.get('status')
            self.log('info', f'Found runtime with status: {status}')
            if status == 'running' or status == 'paused':
                self._parse_runtime_response(response)
        except httpx.HTTPError as e:
            # Only status errors carry a response; transport errors (connection
            # failures, timeouts) do not, so read it defensively instead of
            # turning them into an AttributeError.
            error_response = getattr(e, 'response', None)
            if error_response is not None and error_response.status_code == 404:
                self.log(
                    'info', f'No existing runtime found for session ID: {self.sid}'
                )
                return False
            self.log('error', f'Error while looking for remote runtime: {e}')
            raise
        except json.decoder.JSONDecodeError as e:
            self.log(
                'error',
                f'Invalid JSON response from runtime API: {e}. URL: {self.config.sandbox.remote_runtime_api_url}/sessions/{self.sid}. Response: {response}',
            )
            raise

        if status == 'running':
            self.log('info', 'Found existing runtime in running state')
            return True
        elif status == 'stopped':
            self.log('info', 'Found existing runtime, but it is stopped')
            return False
        elif status == 'paused':
            self.log(
                'info', 'Found existing runtime in paused state, attempting to resume'
            )
            try:
                self._resume_runtime()
                self.log('info', 'Successfully resumed paused runtime')
                return True
            except Exception as e:
                self.log(
                    'error', f'Failed to resume paused runtime: {e}', exc_info=True
                )
                # Return false to indicate we couldn't use the existing runtime
                return False
        else:
            self.log('error', f'Invalid response from runtime API: {data}')
            return False

    def _build_runtime(self) -> None:
        """Build the runtime image and confirm the API can see it.

        Sets ``OH_RUNTIME_RUNTIME_IMAGE_REPO`` in the process environment from
        the registry prefix reported by the API, and stores the built image
        name in ``self.container_image``.

        Raises:
            ValueError: If ``config.sandbox.base_container_image`` is not set.
            AgentRuntimeError: If the API reports that the built image does not
                exist.
        """
        self.log('debug', f'Building RemoteRuntime config:\n{self.config}')
        response = self._send_runtime_api_request(
            'GET',
            f'{self.config.sandbox.remote_runtime_api_url}/registry_prefix',
        )
        response_json = response.json()
        registry_prefix = response_json['registry_prefix']
        os.environ['OH_RUNTIME_RUNTIME_IMAGE_REPO'] = (
            registry_prefix.rstrip('/') + '/runtime'
        )
        self.log(
            'debug',
            f'Runtime image repo: {os.environ["OH_RUNTIME_RUNTIME_IMAGE_REPO"]}',
        )

        if self.config.sandbox.base_container_image is None:
            raise ValueError(
                'base_container_image is required to build the runtime image. '
            )
        if self.config.sandbox.runtime_extra_deps:
            self.log(
                'debug',
                f'Installing extra user-provided dependencies in the runtime image: {self.config.sandbox.runtime_extra_deps}',
            )

        # Build the container image
        self.container_image = build_runtime_image(
            self.config.sandbox.base_container_image,
            self.runtime_builder,
            platform=self.config.sandbox.platform,
            extra_deps=self.config.sandbox.runtime_extra_deps,
            force_rebuild=self.config.sandbox.force_rebuild_runtime,
        )

        response = self._send_runtime_api_request(
            'GET',
            f'{self.config.sandbox.remote_runtime_api_url}/image_exists',
            params={'image': self.container_image},
        )
        if not response.json()['exists']:
            raise AgentRuntimeError(
                f'Container image {self.container_image} does not exist'
            )

    def _start_runtime(self) -> None:
        """Ask the API's /start endpoint to launch a runtime for this session.

        Raises:
            AgentRuntimeUnavailableError: If the /start request fails with an
                ``httpx.HTTPError``.
        """
        # Prepare the request body for the /start endpoint
        command = self.get_action_execution_server_startup_command()
        environment: dict[str, str] = {}
        if self.config.debug or os.environ.get('DEBUG', 'false').lower() == 'true':
            environment['DEBUG'] = 'true'
        environment.update(self.config.sandbox.runtime_startup_env_vars)
        start_request: dict[str, Any] = {
            'image': self.container_image,
            'command': command,
            'working_dir': '/openhands/code/',
            'environment': environment,
            'session_id': self.sid,
            'resource_factor': self.config.sandbox.remote_runtime_resource_factor,
        }
        if self.config.sandbox.remote_runtime_class == 'sysbox':
            start_request['runtime_class'] = 'sysbox-runc'
        # We ignore other runtime classes for now, because both None and 'gvisor' map to 'gvisor'

        # Start the sandbox using the /start endpoint
        try:
            response = self._send_runtime_api_request(
                'POST',
                f'{self.config.sandbox.remote_runtime_api_url}/start',
                json=start_request,
            )
            self._parse_runtime_response(response)
            self.log(
                'debug',
                f'Runtime started. URL: {self.runtime_url}',
            )
        except httpx.HTTPError as e:
            self.log('error', f'Unable to start runtime: {str(e)}')
            raise AgentRuntimeUnavailableError() from e

    def _resume_runtime(self) -> None:
        """Resume a paused runtime.

        Steps:
        1. Show status update that runtime is being started.
        2. Send the runtime API a /resume request
        3. Poll for the runtime to be ready
        4. Set up the initial environment again

        Any failure is logged and re-raised unchanged.
        """
        self.log('info', f'Attempting to resume runtime with ID: {self.runtime_id}')
        self.send_status_message('STATUS$STARTING_RUNTIME')
        try:
            response = self._send_runtime_api_request(
                'POST',
                f'{self.config.sandbox.remote_runtime_api_url}/resume',
                json={'runtime_id': self.runtime_id},
            )
            self.log(
                'info',
                f'Resume API call successful with status code: {response.status_code}',
            )
        except Exception as e:
            self.log('error', f'Failed to call /resume API: {e}', exc_info=True)
            raise

        self.log(
            'info', 'Runtime resume API call completed, waiting for it to be alive...'
        )
        try:
            self._wait_until_alive()
            self.log('info', 'Runtime is now alive after resume')
        except Exception as e:
            self.log(
                'error',
                f'Runtime failed to become alive after resume: {e}',
                exc_info=True,
            )
            raise

        try:
            self.setup_initial_env()
            self.log('info', 'Successfully set up initial environment after resume')
        except Exception as e:
            self.log(
                'error',
                f'Failed to set up initial environment after resume: {e}',
                exc_info=True,
            )
            raise

        self.log('info', 'Runtime successfully resumed and alive.')

    def _parse_runtime_response(self, response: httpx.Response) -> None:
        """Store the runtime ID, URL, hosts and session key from an API response.

        ``runtime_id`` and ``url`` are required keys; ``work_hosts`` and
        ``session_api_key`` are optional. When a session API key is present it
        is also added to the session headers as ``X-Session-API-Key``.
        """
        start_response = response.json()
        self.runtime_id = start_response['runtime_id']
        self.runtime_url = start_response['url']
        self.available_hosts = start_response.get('work_hosts', {})

        if 'session_api_key' in start_response:
            self.session.headers.update(
                {'X-Session-API-Key': start_response['session_api_key']}
            )
            self._session_api_key = start_response['session_api_key']
            self.log(
                'debug',
                'Session API key setted',
            )

    @property
    def session_api_key(self) -> str | None:
        """Session API key from the last parsed runtime response, if any."""
        return self._session_api_key

    @property
    def vscode_url(self) -> str | None:
        """VSCode URL derived from the runtime URL, or None without a token.

        The host is the runtime URL's host prefixed with ``vscode-``; the token
        and the sandbox workspace path are passed as query parameters.
        """
        token = super().get_vscode_token()
        if not token:
            return None
        _parsed_url = urlparse(self.runtime_url)
        assert isinstance(_parsed_url.scheme, str) and isinstance(
            _parsed_url.netloc, str
        )
        vscode_url = f'{_parsed_url.scheme}://vscode-{_parsed_url.netloc}/?tkn={token}&folder={self.config.workspace_mount_path_in_sandbox}'
        self.log(
            'debug',
            f'VSCode URL: {vscode_url}',
        )
        return vscode_url

    @property
    def web_hosts(self) -> dict[str, int]:
        """Hosts reported by the runtime API under ``work_hosts``."""
        return self.available_hosts

    def _wait_until_alive(self) -> None:
        """Poll ``_wait_until_alive_impl`` every 2 seconds until it succeeds.

        Retries only on ``AgentRuntimeNotReadyError`` and stops after
        ``config.sandbox.remote_runtime_init_timeout``, when the process should
        exit, or when the runtime is closed. The last error is re-raised.
        """
        retry_decorator = tenacity.retry(
            stop=tenacity.stop_after_delay(
                self.config.sandbox.remote_runtime_init_timeout
            )
            | stop_if_should_exit()
            | self._stop_if_closed,
            reraise=True,
            retry=tenacity.retry_if_exception_type(AgentRuntimeNotReadyError),
            wait=tenacity.wait_fixed(2),
        )
        retry_decorator(self._wait_until_alive_impl)()

    def _wait_until_alive_impl(self) -> None:
        """Check the runtime's pod status once.

        Returns normally only when the pod is ``ready`` and the liveness check
        succeeds.

        Raises:
            AgentRuntimeNotReadyError: If the pod is not ready yet, the liveness
                check fails, or the status is not recognised (retryable).
            AgentRuntimeUnavailableError: If the pod status is ``failed``,
                ``unknown`` or ``crashloopbackoff`` (not retried).
        """
        self.log('debug', f'Waiting for runtime to be alive at url: {self.runtime_url}')
        runtime_info_response = self._send_runtime_api_request(
            'GET',
            f'{self.config.sandbox.remote_runtime_api_url}/runtime/{self.runtime_id}',
        )
        runtime_data = runtime_info_response.json()
        assert 'runtime_id' in runtime_data
        assert runtime_data['runtime_id'] == self.runtime_id
        assert 'pod_status' in runtime_data
        pod_status = runtime_data['pod_status'].lower()
        self.log('debug', f'Pod status: {pod_status}')
        restart_count = runtime_data.get('restart_count', 0)
        if restart_count != 0:
            restart_reasons = runtime_data.get('restart_reasons')
            self.log(
                'debug', f'Pod restarts: {restart_count}, reasons: {restart_reasons}'
            )

        # FIXME: We should fix it at the backend of /start endpoint, make sure
        # the pod is created before returning the response.
        # Retry a period of time to give the cluster time to start the pod
        if pod_status == 'ready':
            try:
                self.check_if_alive()
            except httpx.HTTPError as e:
                self.log(
                    'warning',
                    f"Runtime /alive failed, but pod says it's ready: {str(e)}",
                )
                # Chain the cause so the underlying HTTP failure stays visible.
                raise AgentRuntimeNotReadyError(
                    f'Runtime /alive failed to respond with 200: {str(e)}'
                ) from e
            return
        elif (
            pod_status == 'not found'
            or pod_status == 'pending'
            or pod_status == 'running'
        ):  # nb: Running is not yet Ready
            raise AgentRuntimeNotReadyError(
                f'Runtime (ID={self.runtime_id}) is not yet ready. Status: {pod_status}'
            )
        elif pod_status in ('failed', 'unknown', 'crashloopbackoff'):
            if pod_status == 'crashloopbackoff':
                raise AgentRuntimeUnavailableError(
                    'Runtime crashed and is being restarted, potentially due to memory usage. Please try again.'
                )
            else:
                raise AgentRuntimeUnavailableError(
                    f'Runtime is unavailable (status: {pod_status}). Please try again.'
                )
        else:
            # Maybe this should be a hard failure, but passing through in case the API changes
            self.log('warning', f'Unknown pod status: {pod_status}')

        self.log(
            'debug',
            f'Waiting for runtime pod to be active. Current status: {pod_status}',
        )
        raise AgentRuntimeNotReadyError()

    def close(self) -> None:
        """Pause or stop the remote runtime and release local resources.

        - When attached to an existing runtime, only the base-class cleanup
          runs.
        - With ``keep_runtime_alive``, the runtime is paused if
          ``pause_closed_runtimes`` is set, otherwise left as it is.
        - Otherwise the runtime is stopped.

        The base-class ``close()`` always runs, even if the pause/stop request
        fails; that failure is logged and re-raised.
        """
        if self.attach_to_existing:
            super().close()
            return

        if self.config.sandbox.keep_runtime_alive:
            try:
                if (
                    self.config.sandbox.pause_closed_runtimes
                    and not self._runtime_closed
                ):
                    self._send_runtime_api_request(
                        'POST',
                        f'{self.config.sandbox.remote_runtime_api_url}/pause',
                        json={'runtime_id': self.runtime_id},
                    )
                    self.log('info', 'Runtime paused.')
            except Exception as e:
                self.log('error', f'Unable to pause runtime: {str(e)}')
                raise
            finally:
                # Mirror the /stop path: local cleanup must not be skipped
                # because the pause request failed.
                super().close()
            return

        try:
            if not self._runtime_closed:
                self._send_runtime_api_request(
                    'POST',
                    f'{self.config.sandbox.remote_runtime_api_url}/stop',
                    json={'runtime_id': self.runtime_id},
                )
                self.log('info', 'Runtime stopped.')
        except Exception as e:
            self.log('error', f'Unable to stop runtime: {str(e)}')
            raise
        finally:
            super().close()

    def _send_runtime_api_request(
        self, method: str, url: str, **kwargs: Any
    ) -> httpx.Response:
        """Send a request to the runtime API using the configured timeout.

        Any ``timeout`` passed by the caller is replaced with
        ``config.sandbox.remote_runtime_api_timeout``. Timeouts are logged and
        re-raised.
        """
        try:
            kwargs['timeout'] = self.config.sandbox.remote_runtime_api_timeout
            return send_request(self.session, method, url, **kwargs)
        except httpx.TimeoutException:
            self.log(
                'error',
                f'No response received within the timeout period for url: {url}',
            )
            raise

    def _send_action_server_request(
        self, method: str, url: str, **kwargs: Any
    ) -> httpx.Response:
        """Send a request to the action server, optionally with retries.

        When ``config.sandbox.remote_runtime_enable_retries`` is set,
        ``httpx.NetworkError`` is retried up to 3 attempts with exponential
        backoff, stopping early if the process should exit or the runtime is
        closed.
        """
        if not self.config.sandbox.remote_runtime_enable_retries:
            return self._send_action_server_request_impl(method, url, **kwargs)

        retry_decorator = tenacity.retry(
            retry=tenacity.retry_if_exception_type(httpx.NetworkError),
            stop=tenacity.stop_after_attempt(3)
            | stop_if_should_exit()
            | self._stop_if_closed,
            before_sleep=tenacity.before_sleep_log(logger, logging.WARNING),
            wait=tenacity.wait_exponential(multiplier=1, min=4, max=60),
        )
        return retry_decorator(self._send_action_server_request_impl)(
            method, url, **kwargs
        )

    def _send_action_server_request_impl(
        self, method: str, url: str, **kwargs: Any
    ) -> httpx.Response:
        """Send one action-server request and translate known HTTP failures.

        A 503 response is treated as a paused runtime: with
        ``keep_runtime_alive`` the runtime is resumed and the request is sent
        once more.

        Raises:
            AgentRuntimeDisconnectedError: On 404, 502 or 504 responses, and on
                503 when the runtime cannot be (or is not configured to be)
                resumed.
            httpx.HTTPError: For every other HTTP failure, including timeouts.
        """
        try:
            return super()._send_action_server_request(method, url, **kwargs)
        except httpx.TimeoutException:
            self.log(
                'error',
                f'No response received within the timeout period for url: {url}',
            )
            raise
        except httpx.HTTPError as e:
            if hasattr(e, 'response') and e.response.status_code in (404, 502, 504):
                if e.response.status_code == 404:
                    raise AgentRuntimeDisconnectedError(
                        f'Runtime is not responding. This may be temporary, please try again. Original error: {e}'
                    ) from e
                else:  # 502, 504
                    raise AgentRuntimeDisconnectedError(
                        f'Runtime is temporarily unavailable. This may be due to a restart or network issue, please try again. Original error: {e}'
                    ) from e
            elif hasattr(e, 'response') and e.response.status_code == 503:
                if self.config.sandbox.keep_runtime_alive:
                    self.log(
                        'info',
                        f'Runtime appears to be paused (503 response). Runtime ID: {self.runtime_id}, URL: {url}',
                    )
                    try:
                        self._resume_runtime()
                        self.log(
                            'info', 'Successfully resumed runtime after 503 response'
                        )
                        return super()._send_action_server_request(
                            method, url, **kwargs
                        )
                    except Exception as resume_error:
                        self.log(
                            'error',
                            f'Failed to resume runtime after 503 response: {resume_error}',
                            exc_info=True,
                        )
                        raise AgentRuntimeDisconnectedError(
                            f'Runtime is paused and could not be resumed. Original error: {e}, Resume error: {resume_error}'
                        ) from resume_error
                else:
                    self.log(
                        'info',
                        'Runtime appears to be paused (503 response) but keep_runtime_alive is False',
                    )
                    raise AgentRuntimeDisconnectedError(
                        f'Runtime is temporarily unavailable. This may be due to a restart or network issue, please try again. Original error: {e}'
                    ) from e
            else:
                raise

    def _stop_if_closed(self, retry_state: RetryCallState) -> bool:
        """Tenacity stop condition: stop retrying once the runtime is closed."""
        return self._runtime_closed

    def get_action_execution_server_startup_command(self):
        """Build the startup command for the action execution server.

        Uses this runtime's port, plugins, config and main module.
        """
        return get_action_execution_server_startup_command(
            server_port=self.port,
            plugins=self.plugins,
            app_config=self.config,
            main_module=self.main_module,
        )