import asyncio
import json
import logging
import sys

import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from openai import AsyncOpenAI
from sentence_transformers import SentenceTransformer, util

# Configure logging
logging.basicConfig(level=logging.INFO)

# Download the tokenizer data needed by nltk.word_tokenize
nltk.download('punkt')

def load_input_data():
    """Load input data from command line arguments."""
    try:
        input_data = json.loads(sys.argv[1])
        return input_data
    except json.JSONDecodeError as e:
        logging.error(f"Failed to decode JSON input: {e}")
        sys.exit(1)
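
# Illustrative invocation (the script name and model name are placeholders, not taken from this repo):
#   python evaluate.py '{"model_name": "my-finetuned-model"}'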

def initialize_openai_client(api_key, base_url):
    """Initialize the asynchronous OpenAI-compatible client."""
    return AsyncOpenAI(api_key=api_key, base_url=base_url)

def load_model():
    """Load the pre-trained models for evaluation."""
    semantic_model = SentenceTransformer('all-MiniLM-L6-v2')
    return semantic_model

def evaluate_semantic_similarity(expected_response, model_response, semantic_model):
    """Evaluate semantic similarity using Sentence-BERT."""
    expected_embedding = semantic_model.encode(expected_response, convert_to_tensor=True)
    model_embedding = semantic_model.encode(model_response, convert_to_tensor=True)
    similarity_score = util.pytorch_cos_sim(expected_embedding, model_embedding)
    return similarity_score.item()

def evaluate_bleu(expected_response, model_response):
    """Evaluate BLEU score using NLTK's sentence_bleu."""
    expected_tokens = nltk.word_tokenize(expected_response.lower())
    model_tokens = nltk.word_tokenize(model_response.lower())
    smoothing_function = SmoothingFunction().method1
    bleu_score = sentence_bleu([expected_tokens], model_tokens, smoothing_function=smoothing_function)
    return bleu_score
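
# Rough sanity check of the two metrics (illustrative values, not measured output):
#   model = load_model()
#   evaluate_semantic_similarity("The cat sat.", "A cat was sitting.", model)  # high, roughly 0.7-0.9
#   evaluate_bleu("The cat sat.", "The cat sat.")                              # 1.0 for an exact match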

async def create_with_retries(client, **kwargs):
    """Retry mechanism for handling transient server errors asynchronously."""
    max_retries = 3   # Retry up to 3 times
    retry_delay = 5   # Retry delay in seconds
    timeout = 60      # Default timeout in seconds, used only if the caller does not supply one
    # Respect a caller-supplied timeout; otherwise fall back to the default above
    kwargs.setdefault("timeout", timeout)
    for attempt in range(max_retries):
        try:
            # Attempt to make the API request
            response = await client.chat.completions.create(**kwargs)
            return response  # Return the response if successful
        except Exception as e:  # Catch all exceptions
            if attempt < max_retries - 1:  # Only retry for the first two attempts
                logging.error(f"Attempt {attempt + 1}/{max_retries} failed: {e}. Retrying...")
                await asyncio.sleep(retry_delay)  # Wait before retrying
            else:
                logging.error(f"API request failed after {max_retries} attempts: {e}")
                # Capture additional debugging information here
                logging.debug(f"Request data: {kwargs}")
                raise Exception("API request failed after retries") from e
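
# Illustrative call, mirroring the pattern used in evaluate_model below
# ("PharynxAI/<model_name>" stands in for a concrete model id):
#   response = await create_with_retries(
#       client,
#       model="PharynxAI/<model_name>",
#       messages=[{"role": "user", "content": "Hello"}],
#   )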

async def evaluate_model(data, model_name, client, semantic_model):
    """Evaluate the model using the provided data."""
    semantic_scores = []
    bleu_scores = []

    for entry in data:
        prompt = entry['prompt']
        expected_response = entry['response']

        # Create a chat completion using OpenAI API
        response = await create_with_retries(
            client,
            model=f"PharynxAI/{model_name}",
            messages=[
                {"role": "system", "content": " "},
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.7,
            max_tokens=200,
            timeout=400
        )

        # Ensure the response contains choices
        if not response.choices:
            logging.error(f"No choices returned for prompt: {prompt}. Skipping this entry.")
            continue

        model_response = response.choices[0].message.content  # Extract model's response

        # Evaluate scores
        semantic_score = evaluate_semantic_similarity(expected_response, model_response, semantic_model)
        semantic_scores.append(semantic_score)
        bleu_score = evaluate_bleu(expected_response, model_response)
        bleu_scores.append(bleu_score)

    # Calculate average scores
    avg_semantic_score = sum(semantic_scores) / len(semantic_scores) if semantic_scores else 0
    avg_bleu_score = sum(bleu_scores) / len(bleu_scores) if bleu_scores else 0

    print(f"Average Semantic Similarity: {avg_semantic_score:.4f}")
    print(f"Average BLEU Score: {avg_bleu_score:.4f}")

    # Create comprehensive results dictionary
    evaluation_results = {
        'average_semantic_score': avg_semantic_score,
        'average_bleu_score': avg_bleu_score
    }

    # Print results to stdout for capturing in handler
    print(json.dumps(evaluation_results))

    logging.info("\nOverall Average Scores:")
    logging.info(f"Average Semantic Similarity: {avg_semantic_score:.4f}")
    logging.info(f"Average BLEU Score: {avg_bleu_score:.4f}")

    return evaluation_results
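
# Example of the JSON line emitted on stdout (illustrative numbers):
#   {"average_semantic_score": 0.8123, "average_bleu_score": 0.2456}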

async def main():
    # Load input data
    input_data = load_input_data()
    model_name = input_data["model_name"]

    # Initialize the OpenAI-compatible client with your RunPod API key and endpoint URL
    client = initialize_openai_client(
        api_key="MIGZGJKYD6PU8KTHTBQ8FMEMGP2RAW5DVXABFVFD",
        base_url="https://api.runpod.ai/v2/6vg8gj8ia9vd1w/openai/v1",
    )

    # Load pre-trained models
    semantic_model = load_model()

    # Load your dataset (replace with your actual JSON file);
    # each entry is expected to look like {"prompt": "...", "response": "..."}
    with open('output_json.json', 'r') as f:
        data = json.load(f)

    # Run the evaluation asynchronously
    await evaluate_model(data, model_name, client, semantic_model)


if __name__ == "__main__":
    # Start the event loop
    asyncio.run(main())