import os

import torch
from dotenv import load_dotenv
from openai import (
    AzureOpenAI,
    OpenAIError,
)
from sentence_transformers import (
    SentenceTransformer,
    util,
)
from transformers import pipeline

load_dotenv()
AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION")

azure_client = AzureOpenAI(
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    api_key=AZURE_OPENAI_API_KEY,
    api_version=AZURE_OPENAI_API_VERSION,
)
# TODO: move to a config file
# AI_TEXT_DETECTION_MODEL = "Hello-SimpleAI/chatgpt-detector-roberta"
AI_TEXT_DETECTION_MODEL = "TrustSafeAI/RADAR-Vicuna-7B"

# Label that the detector model assigns to human-written text.
MODEL_HUMAN_LABEL = {AI_TEXT_DETECTION_MODEL: "Human"}

HUMAN = "HUMAN"
MACHINE = "MACHINE"
UNKNOWN = "UNKNOWN"
PARAPHRASE = "PARAPHRASE"
NON_PARAPHRASE = "NON_PARAPHRASE"
# Load the embedding model used for text-similarity scoring.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
PARAPHRASE_MODEL = SentenceTransformer("paraphrase-MiniLM-L6-v2")
PARAPHRASE_MODEL.to(DEVICE)
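# Note: "paraphrase-MiniLM-L6-v2" produces 384-dimensional sentence embeddings;
# measure_text_similarity() below compares texts via cosine similarity of these
# embeddings.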


def detect_text_by_ai_model(
    input_text: str,
    model: str = AI_TEXT_DETECTION_MODEL,
    max_length: int = 512,
) -> tuple[str, float]:
    """
    Detect whether text is human- or machine-generated.

    Model: RADAR-Vicuna-7B
    Ref: https://huggingface.co/TrustSafeAI/RADAR-Vicuna-7B

    Returns:
        tuple: (label, confidence_score), where label is HUMAN or MACHINE
        (for MACHINE, the predicted generating model is appended).
    """
    try:
        pipe = pipeline(
            "text-classification",
            model=model,
            tokenizer=model,
            max_length=max_length,
            truncation=True,
            device_map="auto",  # place the model on GPU automatically when available
        )
        input_text = input_text.replace("<br>", " ")
        result = pipe(input_text)[0]
        confidence_score = result["score"]
        if result["label"] == MODEL_HUMAN_LABEL[model]:
            label = HUMAN
        else:
            label = MACHINE
            # Guess which model generated the text and append it to the label.
            generated_model, _ = predict_generation_model(input_text)
            label += f"<br>({generated_model})"
        return label, confidence_score
    except Exception as e:
        print(f"Error in detection model inference: {e}")
        return UNKNOWN, 0.5  # neutral confidence when inference fails
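

# Optional sketch, not wired into detect_text_by_ai_model(): the pipeline above is
# rebuilt (and the model re-loaded) on every call. For repeated calls, a memoized
# pipeline avoids that cost. `_PIPELINE_CACHE` and `get_detection_pipeline` are
# illustrative names, not part of the original code.
_PIPELINE_CACHE: dict = {}


def get_detection_pipeline(model: str = AI_TEXT_DETECTION_MODEL, max_length: int = 512):
    """Return a cached text-classification pipeline for (model, max_length)."""
    key = (model, max_length)
    if key not in _PIPELINE_CACHE:
        _PIPELINE_CACHE[key] = pipeline(
            "text-classification",
            model=model,
            tokenizer=model,
            max_length=max_length,
            truncation=True,
            device_map="auto",
        )
    return _PIPELINE_CACHE[key]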


def predict_generation_model(text: str) -> tuple[str, float]:
    """
    Predict whether the text was generated by gpt-4o or gpt-4o-mini.

    Each candidate model paraphrases the input text; the input is then compared
    against each paraphrase, and the closest match wins.

    Returns:
        tuple: (label, confidence_score), where label is "gpt-4o" or "gpt-4o-mini".
    """
    best_similarity = 0.0
    best_model = "gpt-4o"
    models = ["gpt-4o", "gpt-4o-mini"]
    for model in models:
        paraphrased_text = paraphrase_by_AI(text, model)
        if paraphrased_text is None:
            # Skip this candidate if paraphrasing failed.
            continue
        similarity = measure_text_similarity(text, paraphrased_text)
        if similarity > best_similarity:
            best_similarity = similarity
            best_model = model
    return best_model, best_similarity
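

# Note on the heuristic above: it assumes a model paraphrases text "in its own
# voice", so the candidate whose paraphrase stays closest to the input is the
# most likely generator. With only two candidates this is a best-effort guess
# rather than a calibrated attribution.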


def paraphrase_by_AI(input_text: str, model: str = "gpt-4o-mini") -> str | None:
    """
    Paraphrase text using the given Azure OpenAI model.

    Returns:
        str | None: Paraphrased text, or None if the request fails.
    """
    prompt = f"""
Paraphrase the following news, only output the paraphrased text:
{input_text}
"""
    try:
        response = azure_client.chat.completions.create(
            model=model,
            messages=[
                {"role": "user", "content": prompt},
            ],
            # max_tokens=100,
            # temperature=0.7,
            # top_p=0.9,
            # n=1,
        )
        paraphrased_text = response.choices[0].message.content
        return paraphrased_text
    except OpenAIError as e:
        print(f"Error in AI model inference: {e}")
        return None


def measure_text_similarity(text1: str, text2: str) -> float:
    """
    Measure the cosine similarity between two texts.

    Returns:
        float: Similarity score (1.0 means identical embeddings).
    """
    embeddings1 = PARAPHRASE_MODEL.encode(
        text1,
        convert_to_tensor=True,
        device=DEVICE,
        show_progress_bar=False,
    )
    embeddings2 = PARAPHRASE_MODEL.encode(
        text2,
        convert_to_tensor=True,
        device=DEVICE,
        show_progress_bar=False,
    )
    # Compute cosine similarity between the two sentence embeddings.
    similarity = util.cos_sim(embeddings1, embeddings2).cpu().numpy()
    return float(similarity[0][0])
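

# Example usage, a minimal sketch: assumes the AZURE_OPENAI_* variables are set in
# the environment (or a .env file) and that the detector model can be downloaded
# from the Hugging Face Hub. The sample strings are illustrative only.
if __name__ == "__main__":
    sample = "The quick brown fox jumps over the lazy dog."
    label, score = detect_text_by_ai_model(sample)
    print(f"label={label}, confidence={score:.3f}")

    sim = measure_text_similarity(
        "A cat sat on the mat.",
        "A cat was sitting on the mat.",
    )
    print(f"similarity={sim:.3f}")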