"""Detect whether text is AI-generated with RADAR, and attribute machine text
to a likely source model by paraphrase similarity."""

import os

import torch
from dotenv import load_dotenv
from openai import AzureOpenAI, OpenAIError
from sentence_transformers import SentenceTransformer, util
from transformers import pipeline

load_dotenv()
AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION")

azure_client = AzureOpenAI(
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    api_key=AZURE_OPENAI_API_KEY,
    api_version=AZURE_OPENAI_API_VERSION,
)

# TODO: move to a config file
# AI_TEXT_DETECTION_MODEL = "Hello-SimpleAI/chatgpt-detector-roberta"
AI_TEXT_DETECTION_MODEL = "TrustSafeAI/RADAR-Vicuna-7B"
MODEL_HUMAN_LABEL = {AI_TEXT_DETECTION_MODEL: "Human"}

HUMAN = "HUMAN"
MACHINE = "MACHINE"
UNKNOWN = "UNKNOWN"
PARAPHRASE = "PARAPHRASE"
NON_PARAPHRASE = "NON_PARAPHRASE"

# Load the embedding model used for paraphrase similarity
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
PARAPHRASE_MODEL = SentenceTransformer("paraphrase-MiniLM-L6-v2")
PARAPHRASE_MODEL.to(DEVICE)


def detect_text_by_ai_model(
    input_text: str,
    model: str = AI_TEXT_DETECTION_MODEL,
    max_length: int = 512,
) -> tuple[str, float]:
    """
    Model: RADAR-Vicuna-7B
    Ref: https://huggingface.co/TrustSafeAI/RADAR-Vicuna-7B

    Detects whether text is human- or machine-generated.

    Returns:
        tuple: (label, confidence_score) where label is HUMAN or MACHINE
            (suffixed with the predicted source model for MACHINE), or
            UNKNOWN with a neutral 0.5 confidence if inference fails.
    """
    try:
        pipe = pipeline(
            "text-classification",
            model=model,
            tokenizer=model,
            max_length=max_length,
            truncation=True,
            device_map="auto",  # let transformers place the model on available devices
        )
        input_text = input_text.replace("\n", " ")
        result = pipe(input_text)[0]
        confidence_score = result["score"]
        if result["label"] == MODEL_HUMAN_LABEL[model]:
            label = HUMAN
        else:
            label = MACHINE
            # For machine-generated text, guess which model produced it
            generated_model, _ = predict_generation_model(input_text)
            label += f"\n({generated_model})"
        return label, confidence_score
    except Exception as e:
        print(f"Error in detection model inference: {e}")
        return UNKNOWN, 0.5  # fall back to UNKNOWN with neutral confidence


def predict_generation_model(text: str) -> tuple[str, float]:
    """
    Predicts whether the text was generated by gpt-4o or gpt-4o-mini by
    comparing the input text against each model's paraphrase of it; the
    model whose paraphrase is most similar to the input wins.

    Returns:
        tuple: (model_name, confidence_score) where model_name is
            "gpt-4o" or "gpt-4o-mini".
    """
    best_similarity = 0.0
    best_model = "gpt-4o"
    models = ["gpt-4o", "gpt-4o-mini"]
    for model in models:
        paraphrased_text = paraphrase_by_AI(text, model)
        if paraphrased_text is None:
            continue  # skip models whose paraphrase request failed
        similarity = measure_text_similarity(text, paraphrased_text)
        if similarity > best_similarity:
            best_similarity = similarity
            best_model = model
    return best_model, best_similarity


def paraphrase_by_AI(input_text: str, model: str = "gpt-4o-mini") -> str | None:
    """
    Paraphrase text using the given Azure OpenAI deployment.

    Returns:
        str | None: Paraphrased text, or None if the request fails.
    """
    prompt = f"""
        Paraphrase the following news, only output the paraphrased text:
        {input_text}
    """
    try:
        response = azure_client.chat.completions.create(
            model=model,
            messages=[
                {"role": "user", "content": prompt},
            ],
            # max_tokens=100,
            # temperature=0.7,
            # top_p=0.9,
            # n=1,
        )
        return response.choices[0].message.content
    except OpenAIError as e:
        print(f"Error in AI model inference: {e}")
        return None


def measure_text_similarity(text1: str, text2: str) -> float:
    """
    Measure the cosine similarity between the embeddings of two texts.

    Returns:
        float: Similarity score in [-1, 1].
    """
    embeddings1 = PARAPHRASE_MODEL.encode(
        text1,
        convert_to_tensor=True,
        device=DEVICE,
        show_progress_bar=False,
    )
    embeddings2 = PARAPHRASE_MODEL.encode(
        text2,
        convert_to_tensor=True,
        device=DEVICE,
        show_progress_bar=False,
    )
    # Cosine similarity between the two sentence embeddings
    similarity = util.cos_sim(embeddings1, embeddings2).cpu().numpy()
    return float(similarity[0][0])