# -*- coding: utf-8 -*-
# This script evaluates the performance of the Pigeon AI Video Translation system using a Large Language Model.
# Written by Jiaen LIU, 2023/09/18
# Import the necessary packages
import re
from langchain.evaluation import load_evaluator, EvaluatorType
from langchain.prompts import PromptTemplate
from langchain.chat_models import ChatOpenAI
# from src.srt_util.srt import SrtScript
# Load the evaluator
def init_evaluator(source_lang="en", target_lang="zh", domain="starcraft2", model="gpt-4-0613"):
    llm = ChatOpenAI(temperature=0, model=model)
    lang_str = f"You are an expert {source_lang} to {target_lang} translator specialized in {domain}."
    fstring = """
You are grading the following question:
{input}
Here is the real answer:
{reference}
You are grading the following predicted answer:
{output}
based on the following criteria:
{criteria}
Give one grade, accuracy, and rate it on a scale of 0 to 100, where 0 is the lowest (very low accuracy) and 100 is the highest (very high accuracy).
Give an explanation for it; an answer that is partially correct is acceptable. However, penalize the score for answers that are
numerically incorrect; this also includes values that have a $ in front.
Please give the accuracy score first, followed by the explanation.
For example: Accuracy: 40. Explanation here
Do not deviate from this format.
"""
    prompt = PromptTemplate.from_template(lang_str + fstring, template_format="f-string")
# Give two grades, one for completness and another for accuracy and rate them from a scale of 0 to 100, where 0 is the lowest (very low completeness/accuracy) and 100 is the highest (very high completness/accuracy)?
# Do not base the two scores off each other give them the scores independently. Give explanations for every single one and if the answer if partially correct that is acceptable. However punish the scores for answers that are
# numerically incorrect this also includes values that have the $ in front
# Please give the completeness score first followed by the accuracy score.
# For example: Completeness: 70. Accuracy: 40. Explanation here
# Do not differ from the format ever
    return load_evaluator(EvaluatorType.LABELED_CRITERIA, llm=llm, prompt=prompt, criteria="correctness")
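# A minimal usage sketch, assuming the defaults above; the English -> French
# settings below are purely illustrative and not part of the original pipeline:
# zh_evaluator = init_evaluator()  # en -> zh, StarCraft II domain
# fr_evaluator = init_evaluator(source_lang="en", target_lang="fr", domain="general subtitles")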
# Parse the output of the evaluation
# example :
# 'value': 'Accuracy: 80. The predicted answer is partially correct. The sentence "这是一个测试句子" translates to "This is a test sentence" in English. However, the original sentence is "This is an test sentences" which is grammatically incorrect in English. The correct translation should be "这是一个测试句子" if we correct the English sentence to "This is a test sentence". Therefore, the predicted answer is not entirely wrong, but it does not match the original sentence exactly due to the grammatical error in the original sentence.'
# def parse_eval_result(eval_result):
# # score = eval_result.score
# value = eval_result["value"]
# value = value.split("Accuracy: ")[1].split(".")
# # combine the rest of the string into the whole explanation
# explanation = ".".join(value[1:])
# return int(value[0]), explanation
def parse_eval_result(eval_result):
    # Extract the 'Accuracy' score using a regular expression from the 'reasoning' key
    accuracy_match = re.search(r'Accuracy: (\d+)', eval_result['reasoning'])
    if accuracy_match:
        accuracy = int(accuracy_match.group(1))
    else:
        accuracy = 0
    # Directly get the explanation from the 'value' key
    explanation = eval_result['value']
    return accuracy, explanation
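# A minimal sketch of what parse_eval_result expects, assuming the evaluator
# returns a dict with 'reasoning' and 'value' keys (as in the example above);
# the sample values are illustrative only:
# _sample = {
#     "reasoning": "Accuracy: 80. The predicted answer is partially correct.",
#     "value": "The predicted answer is partially correct.",
# }
# parse_eval_result(_sample)  # -> (80, "The predicted answer is partially correct.")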
def evaluate_prediction(input, reference, prediction, evaluator):
    eval_result = evaluator.evaluate_strings(
        prediction=prediction,
        input=input,
        reference=reference,
    )
    return parse_eval_result(eval_result)
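# Hedged sketch, not part of the original pipeline: a small helper for scoring a
# batch of subtitle lines. The (input, reference, prediction) triples are assumed
# to come from the SRT utilities (see the commented-out SrtScript import above);
# the helper name is hypothetical.
def evaluate_batch(triples, evaluator):
    """Evaluate a list of (input, reference, prediction) triples and return
    a list of (accuracy, explanation) tuples."""
    return [
        evaluate_prediction(inp, ref, pred, evaluator)
        for inp, ref, pred in triples
    ]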
if __name__ == "__main__":
    evaluator = init_evaluator()
    # If there is no input English sentence, just pass "" as the input.
    accuracy, explanation = evaluate_prediction("this is an test sentences", "这不是一个测试语句。", "这是一个测试句子。", evaluator)
    print("Accuracy:", accuracy)
    print("Explanation:", explanation)