import asyncio
import json
import logging
import sys

import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from openai import AsyncOpenAI
from sentence_transformers import SentenceTransformer, util

# Configure logging
logging.basicConfig(level=logging.INFO)

# Download the tokenizer data needed by nltk.word_tokenize
nltk.download('punkt')

def load_input_data():
    """Load input data from command line arguments."""
    try:
        input_data = json.loads(sys.argv[1])
        return input_data
    except json.JSONDecodeError as e:
        logging.error(f"Failed to decode JSON input: {e}")
        sys.exit(1)
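
# Illustrative invocation (the script name and model name are placeholders, not taken from this repo):
#   python evaluate.py '{"model_name": "my-finetuned-model"}'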

def initialize_openai_client(api_key, base_url):
    """Initialize the asynchronous OpenAI-compatible client."""
    return AsyncOpenAI(api_key=api_key, base_url=base_url)

def load_model():
    """Load the pre-trained models for evaluation."""
    semantic_model = SentenceTransformer('all-MiniLM-L6-v2')
    return semantic_model

def evaluate_semantic_similarity(expected_response, model_response, semantic_model):
    """Evaluate semantic similarity using Sentence-BERT."""
    expected_embedding = semantic_model.encode(expected_response, convert_to_tensor=True)
    model_embedding = semantic_model.encode(model_response, convert_to_tensor=True)
    similarity_score = util.pytorch_cos_sim(expected_embedding, model_embedding)
    return similarity_score.item()

def evaluate_bleu(expected_response, model_response):
    """Evaluate BLEU score using NLTK's sentence_bleu."""
    expected_tokens = nltk.word_tokenize(expected_response.lower())
    model_tokens = nltk.word_tokenize(model_response.lower())
    smoothing_function = SmoothingFunction().method1
    bleu_score = sentence_bleu([expected_tokens], model_tokens, smoothing_function=smoothing_function)
    return bleu_score
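
# Rough sanity check of the two metrics (illustrative values, not measured output):
#   model = load_model()
#   evaluate_semantic_similarity("The cat sat.", "A cat was sitting.", model)  # high, roughly 0.7-0.9
#   evaluate_bleu("The cat sat.", "The cat sat.")                              # 1.0 for an exact match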

async def create_with_retries(client, **kwargs):
    """Retry mechanism for handling transient server errors asynchronously."""
    max_retries = 3   # Retry up to 3 times
    retry_delay = 5   # Retry delay in seconds
    timeout = 60      # Default timeout in seconds, used only if the caller does not supply one
    # Respect a caller-supplied timeout; otherwise fall back to the default above
    kwargs.setdefault("timeout", timeout)
    for attempt in range(max_retries):
        try:
            # Attempt to make the API request
            response = await client.chat.completions.create(**kwargs)
            return response  # Return the response if successful
        except Exception as e:  # Catch all exceptions
            if attempt < max_retries - 1:  # Only retry for the first two attempts
                logging.error(f"Attempt {attempt + 1}/{max_retries} failed: {e}. Retrying...")
                await asyncio.sleep(retry_delay)  # Wait before retrying
            else:
                logging.error(f"API request failed after {max_retries} attempts: {e}")
                # Capture additional debugging information here
                logging.debug(f"Request data: {kwargs}")
                raise Exception("API request failed after retries") from e
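
# Illustrative call, mirroring the pattern used in evaluate_model below
# ("PharynxAI/<model_name>" stands in for a concrete model id):
#   response = await create_with_retries(
#       client,
#       model="PharynxAI/<model_name>",
#       messages=[{"role": "user", "content": "Hello"}],
#   )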

async def evaluate_model(data, model_name, client, semantic_model):
    """Evaluate the model using the provided data."""
    semantic_scores = []
    bleu_scores = []

    for entry in data:
        prompt = entry['prompt']
        expected_response = entry['response']

        # Create a chat completion using OpenAI API
        response = await create_with_retries(
            client,
            model=f"PharynxAI/{model_name}",
            messages=[
                {"role": "system", "content": " "},
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.7,
            max_tokens=200,
            timeout=400
        )

        # Ensure the response contains choices
        if not response.choices:
            logging.error(f"No choices returned for prompt: {prompt}. Skipping this entry.")
            continue

        model_response = response.choices[0].message.content  # Extract model's response

        # Evaluate scores
        semantic_score = evaluate_semantic_similarity(expected_response, model_response, semantic_model)
        semantic_scores.append(semantic_score)
        bleu_score = evaluate_bleu(expected_response, model_response)
        bleu_scores.append(bleu_score)

    # Calculate average scores
    avg_semantic_score = sum(semantic_scores) / len(semantic_scores) if semantic_scores else 0
    avg_bleu_score = sum(bleu_scores) / len(bleu_scores) if bleu_scores else 0

    print(f"Average Semantic Similarity: {avg_semantic_score:.4f}")
    print(f"Average BLEU Score: {avg_bleu_score:.4f}")

    # Create comprehensive results dictionary
    evaluation_results = {
        'average_semantic_score': avg_semantic_score,
        'average_bleu_score': avg_bleu_score
    }

    # Print results to stdout for capturing in handler
    print(json.dumps(evaluation_results))

    logging.info("\nOverall Average Scores:")
    logging.info(f"Average Semantic Similarity: {avg_semantic_score:.4f}")
    logging.info(f"Average BLEU Score: {avg_bleu_score:.4f}")

    return evaluation_results
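
# Example of the JSON line emitted on stdout (illustrative numbers):
#   {"average_semantic_score": 0.8123, "average_bleu_score": 0.2456}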

async def main():
    # Load input data
    input_data = load_input_data()
    model_name = input_data["model_name"]

    # Initialize the OpenAI-compatible client with your RunPod API key and endpoint URL
    client = initialize_openai_client(
        api_key="MIGZGJKYD6PU8KTHTBQ8FMEMGP2RAW5DVXABFVFD",
        base_url="https://api.runpod.ai/v2/6vg8gj8ia9vd1w/openai/v1",
    )

    # Load pre-trained models
    semantic_model = load_model()

    # Load your dataset (replace with your actual JSON file);
    # each entry is expected to look like {"prompt": "...", "response": "..."}
    with open('output_json.json', 'r') as f:
        data = json.load(f)

    # Run the evaluation asynchronously
    await evaluate_model(data, model_name, client, semantic_model)


if __name__ == "__main__":
    # Start the event loop
    asyncio.run(main())