import json
import sys
import asyncio
import logging

import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from openai import AsyncOpenAI
from sentence_transformers import SentenceTransformer, util

# Configure logging
logging.basicConfig(level=logging.INFO)

# Download the tokenizer data needed by nltk.word_tokenize
nltk.download('punkt')
nltk.download('punkt_tab')  # required by newer NLTK releases
def load_input_data():
    """Load input data from the first command line argument (a JSON string)."""
    try:
        return json.loads(sys.argv[1])
    except (IndexError, json.JSONDecodeError) as e:
        logging.error(f"Failed to read JSON input from the command line: {e}")
        sys.exit(1)
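
# Expected invocation (the script name is illustrative):
#   python evaluate_model.py '{"model_name": "<finetuned-model-name>"}'
# The JSON argument must contain at least "model_name"; it is combined with the
# "PharynxAI/" prefix when the chat completion request is built below.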

def initialize_openai_client(api_key, base_url):
    """Initialize the asynchronous OpenAI client used for chat completions."""
    return AsyncOpenAI(api_key=api_key, base_url=base_url)

def load_model():
    """Load the pre-trained models for evaluation."""
    semantic_model = SentenceTransformer('all-MiniLM-L6-v2')
    return semantic_model

def evaluate_semantic_similarity(expected_response, model_response, semantic_model):
    """Evaluate semantic similarity using Sentence-BERT."""
    expected_embedding = semantic_model.encode(expected_response, convert_to_tensor=True)
    model_embedding = semantic_model.encode(model_response, convert_to_tensor=True)
    similarity_score = util.pytorch_cos_sim(expected_embedding, model_embedding)
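    # pytorch_cos_sim returns a 1x1 tensor of cosine similarity in [-1, 1];
    # values close to 1 mean the two responses are semantically equivalent.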
    return similarity_score.item()

def evaluate_bleu(expected_response, model_response):
    """Evaluate BLEU score using NLTK's sentence_bleu."""
    expected_tokens = nltk.word_tokenize(expected_response.lower())
    model_tokens = nltk.word_tokenize(model_response.lower())
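    # method1 smoothing keeps short responses from scoring exactly 0 when some
    # higher-order n-grams have no overlap with the reference.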
    smoothing_function = SmoothingFunction().method1
    bleu_score = sentence_bleu([expected_tokens], model_tokens, smoothing_function=smoothing_function)
    return bleu_score

async def create_with_retries(client, **kwargs):
    """Retry mechanism for handling transient server errors asynchronously."""
    max_retries = 3  # Retry up to 3 times
    retry_delay = 5  # Delay between retries, in seconds
    # Apply a default timeout only if the caller did not pass one; supplying it
    # both ways would raise "got multiple values for keyword argument 'timeout'".
    kwargs.setdefault("timeout", 60)

    for attempt in range(max_retries):
        try:
            # Attempt the API request; client is an AsyncOpenAI instance, so the call is awaitable
            return await client.chat.completions.create(**kwargs)
        except Exception as e:  # Catch all exceptions so transient server errors are retried
            if attempt < max_retries - 1:  # Only retry before the final attempt
                logging.error(f"Attempt {attempt + 1}/{max_retries} failed: {e}. Retrying...")
                await asyncio.sleep(retry_delay)  # Wait before retrying
            else:
                logging.error(f"API request failed after {max_retries} attempts: {e}")
                # Capture additional debugging information here
                logging.debug(f"Request data: {kwargs}")
                raise Exception("API request failed after retries") from e


async def evaluate_model(data, model_name, client, semantic_model):
    """Evaluate the model using the provided data."""
    semantic_scores = []
    bleu_scores = []

    for entry in data:
        prompt = entry['prompt']
        expected_response = entry['response']

        # Create a chat completion using OpenAI API
        response = await create_with_retries(
            client,
            model=f"PharynxAI/{model_name}",
            messages=[
                {"role": "system", "content": " "},
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.7,
            max_tokens=200,
            timeout=400
        )

        # Ensure the response contains choices
        if not response.choices:
            logging.error(f"No choices returned for prompt: {prompt}. Skipping this entry.")
            continue

        model_response = response.choices[0].message.content  # Extract model's response

        # Evaluate scores
        semantic_score = evaluate_semantic_similarity(expected_response, model_response, semantic_model)
        semantic_scores.append(semantic_score)

        bleu_score = evaluate_bleu(expected_response, model_response)
        bleu_scores.append(bleu_score)

    # Calculate average scores
    avg_semantic_score = sum(semantic_scores) / len(semantic_scores) if semantic_scores else 0
    avg_bleu_score = sum(bleu_scores) / len(bleu_scores) if bleu_scores else 0

    print(f"Average Semantic Similarity: {avg_semantic_score:.4f}")
    print(f"Average BLEU Score: {avg_bleu_score:.4f}")

    # Create comprehensive results dictionary
    evaluation_results = {
        'average_semantic_score': avg_semantic_score,
        'average_bleu_score': avg_bleu_score
    }

    # Print results to stdout for capturing in handler
    print(json.dumps(evaluation_results))
    
    logging.info("\nOverall Average Scores:")
    logging.info(f"Average Semantic Similarity: {avg_semantic_score:.4f}")
    logging.info(f"Average BLEU Score: {avg_bleu_score:.4f}")

    return evaluation_results


async def main():
    # Load input data
    input_data = load_input_data()
    model_name = input_data["model_name"]
    # Initialize the asynchronous OpenAI client with your RunPod API key and endpoint URL
    client = initialize_openai_client(
        api_key="MIGZGJKYD6PU8KTHTBQ8FMEMGP2RAW5DVXABFVFD",
        base_url="https://api.runpod.ai/v2/6vg8gj8ia9vd1w/openai/v1",
    )
    # Load pre-trained models
    semantic_model = load_model()
    # Load your dataset (replace with your actual JSON file)
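    # The file is expected to contain a list of {"prompt": ..., "response": ...}
    # objects; "response" is the reference answer the model output is scored against.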
    with open('output_json.json', 'r') as f:
        data = json.load(f)

    # Run the evaluation asynchronously
    await evaluate_model(data, model_name, client, semantic_model)

# Start the event loop only when the script is executed directly
if __name__ == "__main__":
    asyncio.run(main())