import json
import sys
import asyncio
import logging

import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from sentence_transformers import SentenceTransformer, util
from openai import AsyncOpenAI

# Configure logging
logging.basicConfig(level=logging.INFO)

# Download the tokenizer data required by nltk.word_tokenize
nltk.download('punkt')
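# Note: newer NLTK releases may also look for the 'punkt_tab' resource; if
# word_tokenize raises a LookupError, download it as well:
# nltk.download('punkt_tab')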
def load_input_data():
    """Load input data from the first command-line argument (a JSON string)."""
    try:
        return json.loads(sys.argv[1])
    except IndexError:
        logging.error("No input JSON was provided on the command line.")
        sys.exit(1)
    except json.JSONDecodeError as e:
        logging.error(f"Failed to decode JSON input: {e}")
        sys.exit(1)
def initialize_openai_client(api_key, base_url):
    """Initialize the asynchronous OpenAI client (the completion call below is awaited)."""
    return AsyncOpenAI(api_key=api_key, base_url=base_url)
def load_model():
    """Load the pre-trained Sentence-BERT model used for semantic scoring."""
    semantic_model = SentenceTransformer('all-MiniLM-L6-v2')
    return semantic_model
def evaluate_semantic_similarity(expected_response, model_response, semantic_model):
    """Evaluate semantic similarity using Sentence-BERT embeddings and cosine similarity."""
    expected_embedding = semantic_model.encode(expected_response, convert_to_tensor=True)
    model_embedding = semantic_model.encode(model_response, convert_to_tensor=True)
    similarity_score = util.pytorch_cos_sim(expected_embedding, model_embedding)
    return similarity_score.item()
def evaluate_bleu(expected_response, model_response):
    """Evaluate the BLEU score using NLTK's sentence_bleu with smoothing."""
    expected_tokens = nltk.word_tokenize(expected_response.lower())
    model_tokens = nltk.word_tokenize(model_response.lower())
    smoothing_function = SmoothingFunction().method1
    return sentence_bleu([expected_tokens], model_tokens, smoothing_function=smoothing_function)
async def create_with_retries(client, **kwargs):
    """Retry wrapper for handling transient server errors asynchronously."""
    max_retries = 3   # Retry up to 3 times
    retry_delay = 5   # Delay between retries, in seconds
    kwargs.setdefault("timeout", 60)  # Default per-request timeout unless the caller supplies one

    for attempt in range(max_retries):
        try:
            # Attempt to make the API request
            response = await client.chat.completions.create(**kwargs)
            return response  # Return the response if successful
        except Exception as e:  # Catch all exceptions
            if attempt < max_retries - 1:  # Retry every attempt except the last
                logging.error(f"Attempt {attempt + 1}/{max_retries} failed: {e}. Retrying...")
                await asyncio.sleep(retry_delay)  # Wait before retrying
            else:
                logging.error(f"API request failed after {max_retries} attempts: {e}")
                logging.debug(f"Request data: {kwargs}")  # Capture additional debugging information
                raise Exception("API request failed after retries") from e
async def evaluate_model(data, model_name, client, semantic_model):
    """Evaluate the model on the dataset and report average similarity and BLEU scores."""
    semantic_scores = []
    bleu_scores = []

    for entry in data:
        prompt = entry['prompt']
        expected_response = entry['response']

        # Create a chat completion using the OpenAI-compatible API
        response = await create_with_retries(
            client,
            model=f"PharynxAI/{model_name}",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.7,
            max_tokens=200,
            timeout=400
        )

        # Ensure the response contains choices
        if not response.choices:
            logging.error(f"No choices returned for prompt: {prompt}. Skipping this entry.")
            continue

        model_response = response.choices[0].message.content  # Extract the model's response

        # Evaluate scores
        semantic_score = evaluate_semantic_similarity(expected_response, model_response, semantic_model)
        semantic_scores.append(semantic_score)

        bleu_score = evaluate_bleu(expected_response, model_response)
        bleu_scores.append(bleu_score)

    # Calculate average scores
    avg_semantic_score = sum(semantic_scores) / len(semantic_scores) if semantic_scores else 0
    avg_bleu_score = sum(bleu_scores) / len(bleu_scores) if bleu_scores else 0

    evaluation_results = {
        'average_semantic_score': avg_semantic_score,
        'average_bleu_score': avg_bleu_score
    }

    # Print the results as JSON to stdout so the calling handler can capture them
    print(json.dumps(evaluation_results))

    logging.info("\nOverall Average Scores:")
    logging.info(f"Average Semantic Similarity: {avg_semantic_score:.4f}")
    logging.info(f"Average BLEU Score: {avg_bleu_score:.4f}")

    return evaluation_results
async def main():
    # Load input data
    input_data = load_input_data()
    model_name = input_data["model_name"]

    # Initialize the async OpenAI client with your RunPod API key and endpoint URL
    client = initialize_openai_client(
        api_key="MIGZGJKYD6PU8KTHTBQ8FMEMGP2RAW5DVXABFVFD",
        base_url="https://api.runpod.ai/v2/6vg8gj8ia9vd1w/openai/v1",
    )

    # Load the pre-trained evaluation model
    semantic_model = load_model()

    # Load your dataset (replace with your actual JSON file)
    with open('output_json.json', 'r') as f:
        data = json.load(f)

    # Run the evaluation asynchronously
    await evaluate_model(data, model_name, client, semantic_model)


# Start the event loop
if __name__ == "__main__":
    asyncio.run(main())
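
# Example invocation (hypothetical script name, for illustration only):
#   python evaluate_model.py '{"model_name": "my-finetuned-model"}'
#
# Expected structure of output_json.json, inferred from the fields read above:
#   [
#     {"prompt": "What is the capital of France?", "response": "Paris is the capital of France."},
#     ...
#   ]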