# Source: Hugging Face Space pmkhanh7890/news_verification
# (Space status at capture time: Sleeping; file size: 4,548 bytes)

import functools
import os
from typing import Optional

import torch
from dotenv import load_dotenv
from openai import (
    AzureOpenAI,
    OpenAIError,
)
from sentence_transformers import (
    SentenceTransformer,
    util,
)
from transformers import pipeline

# Pull Azure OpenAI settings from the environment (and from a local .env
# file, if one exists).
load_dotenv()
AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION")

# Shared client used for all paraphrasing requests.
# The endpoint and API version configured in the environment take
# precedence; the literals below are the values this module previously
# hard-coded and are kept as fallbacks so existing deployments keep working.
azure_client = AzureOpenAI(
    azure_endpoint=(
        AZURE_OPENAI_ENDPOINT or "https://quoc-nguyen.openai.azure.com/"
    ),
    api_key=AZURE_OPENAI_API_KEY,
    api_version=AZURE_OPENAI_API_VERSION or "2024-05-01-preview",
)

# TODO: move to a config file
# Detector checkpoint used by default.
# Alternative checkpoint: "Hello-SimpleAI/chatgpt-detector-roberta"
AI_TEXT_DECTECTION_MODEL = "TrustSafeAI/RADAR-Vicuna-7B"

# Per-detector label string that is treated as "written by a human".
# NOTE(review): confirm "Human" matches the label strings this checkpoint
# actually emits.
MODEL_HUMAN_LABEL = {
    AI_TEXT_DECTECTION_MODEL: "Human",
}

# Result labels for the human/machine detection step.
HUMAN, MACHINE, UNKNOWN = "HUMAN", "MACHINE", "UNKNOWN"

# Result labels for paraphrase classification.
PARAPHRASE, NON_PARAPHRASE = "PARAPHRASE", "NON_PARAPHRASE"

# Sentence-embedding model used for text similarity, moved onto the GPU
# when CUDA is available and left on the CPU otherwise.
DEVICE = (
    torch.device("cuda")
    if torch.cuda.is_available()
    else torch.device("cpu")
)
PARAPHASE_MODEL = SentenceTransformer("paraphrase-MiniLM-L6-v2")
PARAPHASE_MODEL.to(DEVICE)


@functools.lru_cache(maxsize=1)
def _load_detection_pipeline(model: str, max_length: int):
    """
    Build the text-classification pipeline for a detector checkpoint.

    The result is cached so repeated detections reuse the loaded model
    instead of rebuilding the pipeline on every call. Only the most recent
    (model, max_length) combination is kept.
    """
    return pipeline(
        "text-classification",
        model=model,
        tokenizer=model,
        max_length=max_length,
        truncation=True,
        device_map="auto",  # good for GPU usage
    )


def detect_text_by_ai_model(
    input_text: str,
    model: str = AI_TEXT_DECTECTION_MODEL,
    max_length: int = 512,
) -> tuple[str, float]:
    """
    Model: RADAR-Vicuna-7B
    Ref: https://huggingface.co/TrustSafeAI/RADAR-Vicuna-7B

    Detects if text is human or machine generated.

    Args:
        input_text: Text to classify; "<br>" markers are replaced by spaces
            before classification.
        model: Detector checkpoint name; must be a key of MODEL_HUMAN_LABEL.
        max_length: Maximum length passed to the pipeline (truncation is
            enabled).

    Returns:
        tuple: (label, confidence_score)
            - (HUMAN, score) when the detector's label equals the
              human label registered for `model`;
            - (MACHINE + "<br>(<generator>)", score) otherwise, where
              <generator> is the model name returned by
              predict_generation_model;
            - (UNKNOWN, 0.5) when `model` is not registered or any step
              raises.
    """
    human_label = MODEL_HUMAN_LABEL.get(model)
    if human_label is None:
        # Reject unregistered detectors before loading any weights.
        print(
            "Error in AI text detection model inference: "
            f"no human label registered for model {model!r}",
        )
        return UNKNOWN, 0.5

    try:
        pipe = _load_detection_pipeline(model, max_length)
        input_text = input_text.replace("<br>", " ")
        result = pipe(input_text)[0]
        confidence_score = result["score"]
        if result["label"] == human_label:
            label = HUMAN
        else:
            label = MACHINE
            generated_model, _ = predict_generation_model(input_text)
            label += f"<br>({generated_model})"
        return label, confidence_score
    except Exception as e:
        # Deliberate best-effort boundary: a detection failure is reported
        # and mapped to a neutral result instead of propagating.
        print(f"Error in AI text detection model inference: {e}")
        return UNKNOWN, 0.5  # UNKNOWN with a neutral 0.5 confidence


def predict_generation_model(text: str) -> tuple[str, float]:
    """
    Guess whether gpt-4o or gpt-4o-mini generated the text.

    Each candidate model paraphrases the input, and the paraphrase is
    scored for similarity against the original text. The candidate with
    the strictly highest score wins; on a tie the earlier candidate is
    kept. Candidates whose paraphrase is None are skipped.

    Returns:
        tuple: (label, confidence_score)
            where label is gpt-4o or gpt-4o-mini. If no candidate scores
            above 0, the result is ("gpt-4o", 0).
    """
    candidates = ("gpt-4o", "gpt-4o-mini")
    top_model, top_score = candidates[0], 0

    for candidate in candidates:
        rewritten = paraphrase_by_AI(text, candidate)
        if rewritten is not None:
            score = measure_text_similarity(text, rewritten)
            if score > top_score:
                top_model, top_score = candidate, score

    return top_model, top_score


def paraphrase_by_AI(
    input_text: str,
    model: str = "gpt-4o-mini",
) -> Optional[str]:
    """
    Paraphrase text using a given model.

    Args:
        input_text: Text to paraphrase; it is embedded verbatim in the
            prompt.
        model: Value passed as `model` to the chat completions call.

    Returns:
        The content of the first completion choice, or None when the
        request raises an OpenAIError or the response has no choices.
        The content itself may also be None if the service returns none.
    """

    prompt = f"""
Paraphrase the following news, only output the paraphrased text:
{input_text}
"""
    try:
        response = azure_client.chat.completions.create(
            model=model,
            messages=[
                {"role": "user", "content": prompt},
            ],
        )
    except OpenAIError as e:
        print(f"Error in AI model inference: {e}")
        return None

    # Guard against an empty choices list, which would otherwise raise an
    # IndexError that the OpenAIError handler above does not catch.
    if not response.choices:
        print("Error in AI model inference: response contains no choices")
        return None

    return response.choices[0].message.content


def measure_text_similarity(text1: str, text2: str) -> float:
    """
    Measure the similarity between two texts.

    Both texts are embedded with PARAPHASE_MODEL on DEVICE and compared
    with cosine similarity.

    Returns:
        float: Cosine similarity of the two embeddings, as a plain Python
            float.
    """
    embeddings1 = PARAPHASE_MODEL.encode(
        text1,
        convert_to_tensor=True,
        device=DEVICE,
        show_progress_bar=False,
    )
    embeddings2 = PARAPHASE_MODEL.encode(
        text2,
        convert_to_tensor=True,
        device=DEVICE,
        show_progress_bar=False,
    )

    # cos_sim yields a similarity matrix; for one text on each side the
    # score is entry [0][0]. `.item()` converts it to a built-in float so
    # the return value matches the annotation.
    return util.cos_sim(embeddings1, embeddings2)[0][0].item()