increase timeout for parallel
Browse files
VerifiableRewardsForScalableLogicalReasoning.py
CHANGED
@@ -135,7 +135,7 @@ def _evaluate_with_prolog(prediction, validation_program, eval_config, timeout=5
|
|
135 |
# extract predicate from rule_to_evaluate
|
136 |
rule_to_evaluate = extract_ilp_from_text_v2(prediction, positive_pred, allow_multiple_rules)
|
137 |
if positive_pred not in rule_to_evaluate:
|
138 |
-
logger.warning(f"Rule
|
139 |
return {
|
140 |
"is_correct": False,
|
141 |
"partial_score": 0.0,
|
@@ -200,7 +200,7 @@ check_all :- forall((pos({vars});neg({vars})), check({vars})).
|
|
200 |
|
201 |
is_correct = True if partial_score == 1.0 else False
|
202 |
|
203 |
-
error =
|
204 |
t1 = time.time()
|
205 |
|
206 |
return {
|
@@ -212,7 +212,7 @@ check_all :- forall((pos({vars});neg({vars})), check({vars})).
|
|
212 |
}
|
213 |
|
214 |
except subprocess.TimeoutExpired:
|
215 |
-
logger.warning(f"Evaluation timed out after {timeout} seconds for rule: {rule_to_evaluate}
|
216 |
return {"is_correct": False, "partial_score": 0.0, "syntax_valid": False,
|
217 |
"error": f"Evaluation timed out after {timeout} seconds"}
|
218 |
except Exception as e:
|
@@ -323,6 +323,7 @@ class VerifiableRewardsForScalableLogicalReasoning(evaluate.Metric):
|
|
323 |
raise ValueError(
|
324 |
f"Number of predictions ({len(predictions)}) and references ({len(references)}) don't match")
|
325 |
|
|
|
326 |
# Prepare evaluation inputs
|
327 |
eval_inputs = []
|
328 |
for i, (prediction, reference) in enumerate(zip(predictions, references)):
|
@@ -338,7 +339,7 @@ class VerifiableRewardsForScalableLogicalReasoning(evaluate.Metric):
|
|
338 |
if not validation_program:
|
339 |
raise ValueError(f"Example {i} does not contain validation program field")
|
340 |
|
341 |
-
eval_inputs.append((prediction, validation_program,
|
342 |
|
343 |
# if more than 1k predictions, we use multiprocessing to speed up the evaluation
|
344 |
if len(eval_inputs) > 500:
|
|
|
135 |
# extract predicate from rule_to_evaluate
|
136 |
rule_to_evaluate = extract_ilp_from_text_v2(prediction, positive_pred, allow_multiple_rules)
|
137 |
if positive_pred not in rule_to_evaluate:
|
138 |
+
logger.warning(f"Rule does not contain positive predicate '{positive_pred}'")
|
139 |
return {
|
140 |
"is_correct": False,
|
141 |
"partial_score": 0.0,
|
|
|
200 |
|
201 |
is_correct = True if partial_score == 1.0 else False
|
202 |
|
203 |
+
error = f'{result.stderr} -> Eval Rule "{rule_to_evaluate}"' if result.stderr else None
|
204 |
t1 = time.time()
|
205 |
|
206 |
return {
|
|
|
212 |
}
|
213 |
|
214 |
except subprocess.TimeoutExpired:
|
215 |
+
logger.warning("Evaluation timed out after %s seconds for rule: '%s'", timeout, rule_to_evaluate.replace('\n', ' '))
|
216 |
return {"is_correct": False, "partial_score": 0.0, "syntax_valid": False,
|
217 |
"error": f"Evaluation timed out after {timeout} seconds"}
|
218 |
except Exception as e:
|
|
|
323 |
raise ValueError(
|
324 |
f"Number of predictions ({len(predictions)}) and references ({len(references)}) don't match")
|
325 |
|
326 |
+
TIMEOUT = 15 if len(predictions) > 500 else 5
|
327 |
# Prepare evaluation inputs
|
328 |
eval_inputs = []
|
329 |
for i, (prediction, reference) in enumerate(zip(predictions, references)):
|
|
|
339 |
if not validation_program:
|
340 |
raise ValueError(f"Example {i} does not contain validation program field")
|
341 |
|
342 |
+
eval_inputs.append((prediction, validation_program, eval_config, TIMEOUT))
|
343 |
|
344 |
# if more than 1k predictions, we use multiprocessing to speed up the evaluation
|
345 |
if len(eval_inputs) > 500:
|