import functools
import math

from transformers import pipeline

from _google_search_engine_testing_share import find_by_relative_search

PROOFREAD_FILE = "data/1_proofread/xsum/gpt-4o-mini_with_best_similarity.csv"
WORD_FREQUENCY = None
DEFAULT_MODEL = "Hello-SimpleAI/chatgpt-detector-roberta"

# Benchmark notes from earlier experiments (kept verbatim for reference):
"""
data/MAGE/xsum_human.csv = {'HUMAN': 64, 'MACHINE': 36} correction = 20 => 84%
data/MAGE/xsum_machine_topical_gpt-3.5-trubo.csv = {'HUMAN': 3, 'MACHINE': 97} => correction = 3 => 94%
original acc = (64+97)/ 200 = 80.5%
improve = (84 + 94) / 200 = 89%
different = 8.5%
https://huggingface.co/datasets/RealTimeData/bbc_news_alltime = {'HUMAN': 82, 'MACHINE': 18} => corrected 16 => 98%
"""

# Maps each detector model id to the label string that model emits for
# human-written text.
MODEL_HUMAN_MATCHING = dict()
MODEL_HUMAN_MATCHING[DEFAULT_MODEL] = "Human"

HUMAN = "HUMAN"
MACHINE = "MACHINE"
UNKNOWN = "UNKNOWN"
PARAPHASE = "PARAPHASE"
NON_PARAPHASE = "NON_PARAPHASE"


@functools.lru_cache(maxsize=None)
def _get_pipeline(model, max_length):
    """Build (and cache) a text-classification pipeline for *model*.

    Caching avoids reloading model weights on every detection call; the
    original code constructed a fresh pipeline per call.
    """
    return pipeline(
        "text-classification",
        model=model,
        tokenizer=model,
        max_length=max_length,
        truncation=True,
        device_map="auto",
    )


def detect_by_huggingface_model(input_text, model=DEFAULT_MODEL, max_length=512):
    """Classify *input_text* as human- or machine-written.

    Args:
        input_text: the text to classify.
        model: Hugging Face model id; must have an entry in
            MODEL_HUMAN_MATCHING mapping it to its "human" label.
        max_length: tokenizer truncation length.

    Returns:
        tuple: (HUMAN or MACHINE, confidence score as a float).
    """
    # Bug fix: the original ignored the max_length argument (it hard-coded
    # 512 in the pipeline call) and rebuilt the pipeline on every call.
    pipe = _get_pipeline(model, max_length)
    result = pipe(input_text)[0]
    confidence_score = result['score']
    if result['label'] == MODEL_HUMAN_MATCHING[model]:
        return HUMAN, confidence_score
    return MACHINE, confidence_score


def check_human(data, min_ratio=0.7):
    """Decide whether matched sentence data looks human-written.

    Args:
        data: iterable of (input_sentence, source_sentence, similarity,
            is_paraphrase) tuples.
        min_ratio: minimum fraction of input sentences that must appear
            verbatim inside their matched source sentence.

    Returns:
        bool: True when at least ceil(len(data) * min_ratio) input sentences
        are substrings of their source sentence. NOTE: empty *data* yields
        True (0 >= 0), preserving the original behavior.
    """
    min_matching = math.ceil(len(data) * min_ratio)
    matched = sum(
        1
        for input_sentence, source_sentence, _similarity, _is_paraphrase in data
        if input_sentence in source_sentence
    )
    return matched >= min_matching


def abstract_detect_generated_text(input_text):
    """Detect the source of *input_text* using a search engine plus a SOTA model.

    Returns:
        tuple:
            - search-engine prediction (HUMAN / MACHINE / UNKNOWN)
            - SOTA model prediction (HUMAN / MACHINE)
            - SOTA confidence (float)
            - url of the best-matching page (None if UNKNOWN)
            - sentence pairs: list of
              [input_sentence, source_sentence, PARAPHASE/NON_PARAPHASE]
    """
    is_support_opposite = False
    is_paraphrase, found_url, data = find_by_relative_search(
        input_text, is_support_opposite
    )
    SOTA_prediction, SOTA_confidence = detect_by_huggingface_model(input_text)

    if not is_paraphrase:
        search_engine_prediction = UNKNOWN
    elif check_human(data):
        search_engine_prediction = HUMAN
    else:
        search_engine_prediction = MACHINE

    sentence_pairs = []
    # Loop variable renamed from the original's `is_paraphrase`, which
    # shadowed the search-level flag above; the post-loop value was never
    # used, so behavior is unchanged.
    for input_sentence, source_sentence, _similarity, pair_is_paraphrase in data:
        label = PARAPHASE if pair_is_paraphrase else NON_PARAPHASE
        sentence_pairs.append([input_sentence, source_sentence, label])

    return (
        search_engine_prediction,
        SOTA_prediction,
        SOTA_confidence,
        found_url,
        sentence_pairs,
    )


if __name__ == "__main__":
    pass