import os

import torch
from dotenv import load_dotenv
from openai import (
    AzureOpenAI,
    OpenAIError,
)
from sentence_transformers import (
    SentenceTransformer,
    util,
)
from transformers import pipeline

load_dotenv()
AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION")

# Use the values loaded from the environment rather than hardcoding the
# endpoint and API version, so the client stays configurable per deployment.
azure_client = AzureOpenAI(
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    api_key=AZURE_OPENAI_API_KEY,
    api_version=AZURE_OPENAI_API_VERSION,
)
# TODO: move to a config file
# AI_TEXT_DETECTION_MODEL = "Hello-SimpleAI/chatgpt-detector-roberta"
AI_TEXT_DETECTION_MODEL = "TrustSafeAI/RADAR-Vicuna-7B"
MODEL_HUMAN_LABEL = {AI_TEXT_DETECTION_MODEL: "Human"}

HUMAN = "HUMAN"
MACHINE = "MACHINE"
UNKNOWN = "UNKNOWN"
PARAPHRASE = "PARAPHRASE"
NON_PARAPHRASE = "NON_PARAPHRASE"
# Load the embedding model used for text-similarity scoring.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
PARAPHRASE_MODEL = SentenceTransformer("paraphrase-MiniLM-L6-v2")
PARAPHRASE_MODEL.to(DEVICE)

def detect_text_by_ai_model(
    input_text: str,
    model: str = AI_TEXT_DETECTION_MODEL,
    max_length: int = 512,
) -> tuple[str, float]:
    """
    Detects whether text is human- or machine-generated.

    Model: RADAR-Vicuna-7B
    Ref: https://huggingface.co/TrustSafeAI/RADAR-Vicuna-7B

    Returns:
        tuple: (label, confidence_score),
            where label is HUMAN or MACHINE.
    """
    try:
        pipe = pipeline(
            "text-classification",
            model=model,
            tokenizer=model,
            max_length=max_length,
            truncation=True,
            device_map="auto",  # let transformers place the model on available devices
        )
        # Strip HTML line breaks before classifying.
        input_text = input_text.replace("<br>", " ")
        result = pipe(input_text)[0]
        confidence_score = result["score"]
        if result["label"] == MODEL_HUMAN_LABEL[model]:
            label = HUMAN
        else:
            label = MACHINE
            # Guess which model generated the text and append it to the label.
            generated_model, _ = predict_generation_model(input_text)
            label += f"<br>({generated_model})"
        return label, confidence_score
    except Exception as e:
        print(f"Error in AI detection model inference: {e}")
        return UNKNOWN, 0.5  # Fall back to UNKNOWN with a neutral 0.5 confidence
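
# Example usage (an illustrative sketch, not executed at import time; assumes
# the RADAR checkpoint can be downloaded and enough memory is available):
#
#   label, score = detect_text_by_ai_model("The quick brown fox jumps over the lazy dog.")
#   print(label, score)  # e.g. "HUMAN", 0.93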

def predict_generation_model(text: str) -> tuple[str, float]:
    """
    Predicts whether text was generated by gpt-4o or gpt-4o-mini.

    Each candidate model paraphrases the input; the model whose paraphrase
    stays closest to the input is taken as the likeliest generator.

    Returns:
        tuple: (label, confidence_score),
            where label is gpt-4o or gpt-4o-mini.
    """
    best_similarity = 0
    best_model = "gpt-4o"
    models = ["gpt-4o", "gpt-4o-mini"]
    for model in models:
        paraphrased_text = paraphrase_by_AI(text, model)
        if paraphrased_text is None:
            continue  # Skip models whose paraphrase request failed
        similarity = measure_text_similarity(text, paraphrased_text)
        if similarity > best_similarity:
            best_similarity = similarity
            best_model = model
    return best_model, best_similarity
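
# Example (hypothetical values; actual scores depend on the Azure deployments):
#
#   model_name, similarity = predict_generation_model("Some news paragraph...")
#   print(model_name, similarity)  # e.g. "gpt-4o-mini", 0.84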

def paraphrase_by_AI(input_text: str, model: str = "gpt-4o-mini") -> str | None:
    """
    Paraphrases text using the given model.

    Returns:
        str | None: Paraphrased text, or None if the request fails.
    """
    prompt = f"""
        Paraphrase the following news, only output the paraphrased text:
        {input_text}
        """
    try:
        response = azure_client.chat.completions.create(
            model=model,
            messages=[
                {"role": "user", "content": prompt},
            ],
            # max_tokens=100,
            # temperature=0.7,
            # top_p=0.9,
            # n=1,
        )
        paraphrased_text = response.choices[0].message.content
        return paraphrased_text
    except OpenAIError as e:
        print(f"Error in AI model inference: {e}")
        return None
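
# Example (a sketch; requires valid Azure OpenAI credentials and a deployed model):
#
#   rewritten = paraphrase_by_AI("The cat sat on the mat.")
#   if rewritten is not None:
#       print(rewritten)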

def measure_text_similarity(text1: str, text2: str) -> float:
    """
    Measures the cosine similarity between two texts.

    Returns:
        float: Similarity score (higher means more similar).
    """
    embeddings1 = PARAPHRASE_MODEL.encode(
        text1,
        convert_to_tensor=True,
        device=DEVICE,
        show_progress_bar=False,
    )
    embeddings2 = PARAPHRASE_MODEL.encode(
        text2,
        convert_to_tensor=True,
        device=DEVICE,
        show_progress_bar=False,
    )
    # Compute the cosine similarity between the two embeddings.
    similarity = util.cos_sim(embeddings1, embeddings2).cpu().numpy()
    return float(similarity[0][0])
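
# Minimal smoke test for the module (a hedged sketch: the sample text is
# hypothetical, and running it needs valid AZURE_OPENAI_* variables plus
# network access to download both models).
if __name__ == "__main__":
    sample = "Scientists announced a new method for detecting AI-generated text."
    # A sentence compared with itself should score close to 1.0.
    print(measure_text_similarity(sample, sample))
    # End-to-end detection: prints a label and a confidence score.
    print(detect_text_by_ai_model(sample))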