import functools
import math

from _google_search_engine_testing_share import find_by_relative_search
from transformers import pipeline

# TODO: move to a config file
# Constants are UPPER_SNAKE_CASE per PEP 8.
PROOFREAD_FILE = "data/1_proofread/xsum/gpt-4o-mini_with_best_similarity.csv"
WORD_FREQUENCY = None
DEFAULT_MODEL = "Hello-SimpleAI/chatgpt-detector-roberta"
# Maps a model id to the label string that model emits for human-written text.
MODEL_HUMAN_LABEL = {DEFAULT_MODEL: "Human"}
HUMAN = "HUMAN"
MACHINE = "MACHINE"
UNKNOWN = "UNKNOWN"
PARAPHRASE = "PARAPHRASE"
NON_PARAPHRASE = "NON_PARAPHRASE"


@functools.lru_cache(maxsize=4)
def _get_pipeline(model: str, max_length: int):
    """Build (once) and cache a text-classification pipeline for *model*.

    Loading a transformers model is expensive; the original code rebuilt the
    pipeline on every detect_ai_content call. Caching is output-identical.
    """
    return pipeline(
        "text-classification",
        model=model,
        tokenizer=model,
        max_length=max_length,
        truncation=True,
        device_map="auto",  # let HF place the model on GPU when available
    )


def detect_ai_content(
    input_text: str,
    model: str = DEFAULT_MODEL,
    max_length: int = 512,
) -> tuple:
    """Classify *input_text* as human- or machine-generated.

    Args:
        input_text: Text to classify.
        model: HF model id, used for both the model and its tokenizer.
        max_length: Token truncation limit for the pipeline.

    Returns:
        tuple: (label, confidence_score) where label is HUMAN, MACHINE,
        or UNKNOWN with confidence 0.0 when inference fails.
    """
    try:
        pipe = _get_pipeline(model, max_length)
        result = pipe(input_text)[0]
        confidence_score = result["score"]
        # An unknown model id raises KeyError here, which the boundary
        # handler below converts to (UNKNOWN, 0.0).
        if result["label"] == MODEL_HUMAN_LABEL[model]:
            label = HUMAN
        else:
            label = MACHINE
        return label, confidence_score
    except Exception as e:  # inference boundary: never crash the caller
        print(f"Error in Roberta model inference: {e}")
        return UNKNOWN, 0.0


def check_human(data, min_ratio=0.7):
    """Return True when enough input sentences match their source sentences.

    Each row of *data* is a (input_sentence, source_sentence, similarity,
    is_paraphrase) tuple — the same shape unpacked by
    abstract_detect_generated_text below.

    Args:
        data: Sequence of 4-tuples as described above.
        min_ratio: Minimum fraction of rows that must match
            (similarity >= 0.99) for the text to count as human.

    Returns:
        bool: True if at least ceil(len(data) * min_ratio) rows match.
    """
    if not data:  # no evidence at all -> cannot call it human
        return False
    min_matching = math.ceil(len(data) * min_ratio)
    count = 0
    # BUGFIX: rows are tuples (see the unpacking loop in
    # abstract_detect_generated_text and the original commented-out loop),
    # so read similarity positionally; sentence["similarity"] raised
    # TypeError on tuple rows.
    for _input_sentence, _source_sentence, similarity, _is_paraphrase in data:
        if similarity >= 0.99:
            count += 1
        print(f"\tmatching_sentence_count : {count}, min_matching: {min_matching}")
        if count >= min_matching:  # early exit once the threshold is reached
            return True
    return False


def abstract_detect_generated_text(input_text):
    """Run search-based and model-based generated-text detection.

    Args:
        input_text: Text to analyze.

    Returns:
        tuple: (
            search_engine_prediction,  # HUMAN / MACHINE / UNKNOWN
            SOTA_prediction,           # HUMAN / MACHINE / UNKNOWN
            SOTA_confidence,           # float confidence from the model
            found_url,                 # URL located by the search step, if any
            sentence_pairs,            # [input, source, PARAPHRASE/NON_PARAPHRASE]
        )
    """
    is_paraphrase, found_url, data = find_by_relative_search(
        input_text,
        is_support_opposite=False,  # explicit keyword for readability
    )
    SOTA_prediction, SOTA_confidence = detect_ai_content(input_text)

    if not is_paraphrase:
        search_engine_prediction = UNKNOWN
    else:
        search_engine_prediction = HUMAN if check_human(data) else MACHINE

    sentence_pairs = []
    if data:  # guard: search step may return an empty/None data set
        # Loop variable renamed from is_paraphrase to avoid shadowing the
        # outer flag returned by find_by_relative_search.
        for input_sentence, source_sentence, _, pair_is_paraphrase in data:
            check_paraphrase = PARAPHRASE if pair_is_paraphrase else NON_PARAPHRASE
            sentence_pairs.append(
                [input_sentence, source_sentence, check_paraphrase],
            )

    return (
        search_engine_prediction,
        SOTA_prediction,
        SOTA_confidence,
        found_url,
        sentence_pairs,
    )