Spaces:

guydav
/

restrictedpython_code_eval

Sleeping

App Files Files Community

guydav committed on Aug 30, 2023

Commit

6b34a48

1 Parent(s): 3baab76

Allowing capturing and returning output

Browse files

Files changed (2) hide show

README.md +4 -0
restrictedpython_code_eval.py +17 -8

README.md CHANGED Viewed

@@ -64,6 +64,10 @@ In addition, this metric supports three additional arguments, specifying which i
 **`allow_underscore_variable_names`**: (`bool`): Whether or not to allow the use of variable names starting with an underscore. Defaults to False, as it's considered [harmful](https://stackoverflow.com/questions/1301346/what-is-the-meaning-of-a-single-and-a-double-underscore-before-an-object-name).
 As the new arguments are optional, this could be used as a drop-in replacement for `code_eval`.
 Additionally, this metric sets several different `globals` if they are not provided as additional globals. The full list of globals set is: `__metaclass__, __name__, _getiter_, _iter_unpack_sequence_, _getitem_, getattr, _write_, _inplacevar_, _print_`. See the code for additional details.

 **`allow_underscore_variable_names`**: (`bool`): Whether or not to allow the use of variable names starting with an underscore. Defaults to False, as it's considered [harmful](https://stackoverflow.com/questions/1301346/what-is-the-meaning-of-a-single-and-a-double-underscore-before-an-object-name).
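+**`return_output`**: (`bool`): Whether or not to return the value of the tested code's output variable (named by `output_variable`) as the result, in place of the `passed` result string, when the code runs successfully. Defaults to False.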
+**`output_variable`**: (`str`): The name of the variable whose value is returned when `return_output` is True. Defaults to `'output'`.
 As the new arguments are optional, this could be used as a drop-in replacement for `code_eval`.
 Additionally, this metric sets several different `globals` if they are not provided as additional globals. The full list of globals set is: `__metaclass__, __name__, _getiter_, _iter_unpack_sequence_, _getitem_, getattr, _write_, _inplacevar_, _print_`. See the code for additional details.

restrictedpython_code_eval.py CHANGED Viewed

@@ -263,6 +263,8 @@ Args:
     allowed_imports: an optional list of string, modules the tested code is allowed to import
     allow_str_format: a bool indicating whether to allow the use of str.format() in the tested code
     allow_underscore_variable_names: a bool indicating whether to allow the use of underscore variable names in the tested code
 Returns:
     pass_at_k: dict with pass rates for each k
@@ -350,7 +352,7 @@ class RestrictedPythonCodeEval(evaluate.Metric):
                  use_safe_builtins: bool = True, use_limited_builtins: bool = True, use_utility_builtins: bool = True,
                  additional_globals: Optional[Dict[str, Any]] = None, additional_locals: Optional[Dict[str, Any]] = None,
                  allowed_imports: Optional[List[str]] = None, allow_str_format: bool = False,
-                 allow_underscore_variable_names: bool = False):
         """Returns the scores"""
         if os.getenv("HF_ALLOW_CODE_EVAL", 0) != "1":
@@ -372,7 +374,8 @@ class RestrictedPythonCodeEval(evaluate.Metric):
                         test_program, timeout, task_id, completion_id[task_id],
                         use_safe_builtins, use_limited_builtins, use_utility_builtins,
                         additional_globals, additional_locals,
-                        allowed_imports, allow_str_format, allow_underscore_variable_names
                     )
                     future = executor.submit(_check_correctness, *args)
                     futures.append(future)
@@ -421,7 +424,7 @@ def _check_correctness(check_program, timeout, task_id, completion_id,
                        use_safe_builtins: bool = True, use_limited_builtins: bool = True, use_utility_builtins: bool = True,
                        additional_globals: Optional[Dict[str, Any]] = None, additional_locals: Optional[Dict[str, Any]] = None,
                        allowed_imports: Optional[List[str]] = None, allow_str_format: bool = False,
-                       allow_underscore_variable_names: bool = False):
     """
     Evaluates the functional correctness of a completion by running the test
     suite provided in the problem.
@@ -437,6 +440,7 @@ def _check_correctness(check_program, timeout, task_id, completion_id,
         use_safe_builtins, use_limited_builtins, use_utility_builtins,
         additional_globals, additional_locals,
         allowed_imports, allow_str_format, allow_underscore_variable_names,
     )
     p = multiprocessing.Process(target=_unsafe_execute, args=args)
     p.start()
@@ -515,12 +519,11 @@ class DefaultPrinter:
         print(*objects, **kwargs)
 def _unsafe_execute(check_program, result, timeout,
                     use_safe_builtins: bool = True, use_limited_builtins: bool = True, use_utility_builtins: bool = True,
                     additional_globals: Optional[Dict[str, Any]] = None, additional_locals: Optional[Dict[str, Any]] = None,
                     allowed_imports: Optional[List[str]] = None, allow_str_format: bool = False,
-                    allow_underscore_variable_names: bool = False):
     with create_tempdir():
@@ -535,6 +538,9 @@ def _unsafe_execute(check_program, result, timeout,
         # Disable functionalities that can make destructive changes to the test.
         reliability_guard()
         # Run program.
         try:
             builtins = {}
@@ -604,12 +610,15 @@ def _unsafe_execute(check_program, result, timeout,
             with swallow_io():
                 policy_class = AllowAugmentedAssignAndUnderscoreVariableNamesRestrictingTransformer if allow_underscore_variable_names else AllowAugmentedAssignRestrictingTransformer
                 with time_limit(timeout):
                     byte_code = compile_restricted(check_program, filename="<model output>", mode="exec", policy=policy_class)
                     exec(byte_code, exec_globals, additional_locals)
-            result.append("passed")
         except EOFError:
             result.append("EOF error")
         except TimeoutException:

     allowed_imports: an optional list of string, modules the tested code is allowed to import
     allow_str_format: a bool indicating whether to allow the use of str.format() in the tested code
     allow_underscore_variable_names: a bool indicating whether to allow the use of underscore variable names in the tested code
+    return_output: a bool indicating whether to return the output of the tested code
+    output_variable: a string indicating the name of the variable to return if return_output is True
 Returns:
     pass_at_k: dict with pass rates for each k
                  use_safe_builtins: bool = True, use_limited_builtins: bool = True, use_utility_builtins: bool = True,
                  additional_globals: Optional[Dict[str, Any]] = None, additional_locals: Optional[Dict[str, Any]] = None,
                  allowed_imports: Optional[List[str]] = None, allow_str_format: bool = False,
+                 allow_underscore_variable_names: bool = False, return_output: bool = False, output_variable: str = "output"):
         """Returns the scores"""
         if os.getenv("HF_ALLOW_CODE_EVAL", 0) != "1":
                         test_program, timeout, task_id, completion_id[task_id],
                         use_safe_builtins, use_limited_builtins, use_utility_builtins,
                         additional_globals, additional_locals,
+                        allowed_imports, allow_str_format, allow_underscore_variable_names,
+                        return_output, output_variable,
                     )
                     future = executor.submit(_check_correctness, *args)
                     futures.append(future)
                        use_safe_builtins: bool = True, use_limited_builtins: bool = True, use_utility_builtins: bool = True,
                        additional_globals: Optional[Dict[str, Any]] = None, additional_locals: Optional[Dict[str, Any]] = None,
                        allowed_imports: Optional[List[str]] = None, allow_str_format: bool = False,
+                       allow_underscore_variable_names: bool = False, return_output: bool = False, output_variable: str = "output"):
     """
     Evaluates the functional correctness of a completion by running the test
     suite provided in the problem.
         use_safe_builtins, use_limited_builtins, use_utility_builtins,
         additional_globals, additional_locals,
         allowed_imports, allow_str_format, allow_underscore_variable_names,
+        return_output, output_variable
     )
     p = multiprocessing.Process(target=_unsafe_execute, args=args)
     p.start()
         print(*objects, **kwargs)
 def _unsafe_execute(check_program, result, timeout,
                     use_safe_builtins: bool = True, use_limited_builtins: bool = True, use_utility_builtins: bool = True,
                     additional_globals: Optional[Dict[str, Any]] = None, additional_locals: Optional[Dict[str, Any]] = None,
                     allowed_imports: Optional[List[str]] = None, allow_str_format: bool = False,
+                    allow_underscore_variable_names: bool = False, return_output: bool = False, output_variable: str = "output"):
     with create_tempdir():
         # Disable functionalities that can make destructive changes to the test.
         reliability_guard()
+        if return_output and additional_locals is None:
+            additional_locals = {}
         # Run program.
         try:
             builtins = {}
             with swallow_io():
                 policy_class = AllowAugmentedAssignAndUnderscoreVariableNamesRestrictingTransformer if allow_underscore_variable_names else AllowAugmentedAssignRestrictingTransformer
                 with time_limit(timeout):
                     byte_code = compile_restricted(check_program, filename="<model output>", mode="exec", policy=policy_class)
                     exec(byte_code, exec_globals, additional_locals)
+            if return_output:
+                result.append(additional_locals[output_variable])
+            else:
+                result.append("passed")
         except EOFError:
             result.append("EOF error")
         except TimeoutException: