# sandbox / judging.py
# justinxzhao's picture
# Some refactoring, judging responses for direct assessment.
# 577870e
# raw
# history blame
# 660 Bytes
from pydantic import BaseModel, Field, conint
from typing import List, Optional, Literal, Union
class Criteria(BaseModel):
    """A single named scoring criterion with integer score bounds.

    NOTE(review): nothing in this model checks that ``min_score`` is less
    than or equal to ``max_score``; confirm whether callers rely on that
    ordering before depending on it.
    """

    # Short identifier for the criterion.
    name: str

    # Free-text explanation of the criterion.
    description: str

    # Lower score bound; constrained to an integer >= 0.
    min_score: conint(ge=0)

    # Upper score bound; constrained to an integer >= 0.
    max_score: conint(ge=0)
class DirectAssessment(BaseModel):
    """Configuration for the ``"direct_assessment"`` judging mode."""

    # Fixed tag; only the literal string "direct_assessment" validates.
    type: Literal["direct_assessment"]

    # The criteria attached to this assessment, in the order given.
    criteria: List[Criteria]

    # Prompt text for this assessment.
    # NOTE(review): presumably the judge prompt/template -- confirm with
    # the code that consumes it.
    prompt: str
class PairwiseComparison(BaseModel):
    """Configuration for the ``"pairwise_comparison"`` judging mode."""

    # Fixed tag; only the literal string "pairwise_comparison" validates.
    type: Literal["pairwise_comparison"]

    # One of exactly three allowed labels: "coarse", "fine", "super fine".
    granularity: Literal["coarse", "fine", "super fine"]

    # Boolean switch; by its name, whether a tie verdict is permitted.
    ties_allowed: bool

    # Boolean switch.
    # NOTE(review): presumably re-judges each pair with the response order
    # swapped -- confirm against the judging code.
    position_swapping: bool

    # NOTE(review): presumably the identifier of the baseline model that
    # other responses are compared against -- confirm against callers.
    reference_model: str

    # Prompt text for this comparison.
    # NOTE(review): presumably the judge prompt/template -- confirm with
    # the code that consumes it.
    prompt: str
class JudgingConfig(BaseModel):
    """Top-level judging configuration holding exactly one assessment.

    ``assessment`` is either a ``DirectAssessment`` or a
    ``PairwiseComparison``. Both models declare a ``Literal`` ``type``
    field, which is used here as the union discriminator.
    """

    # Declaring the discriminator makes pydantic select the member model
    # from the ``type`` tag instead of trying each union member in turn, so
    # validation errors refer only to the model the input actually named.
    # The field stays required (``...``), as before.
    assessment: Union[DirectAssessment, PairwiseComparison] = Field(
        ..., discriminator="type"
    )