# NOTE(review): removed non-code scrape artifact ("Spaces: / Running / Running",
# HuggingFace Spaces page chrome) that was not valid Python.
from transformers import pipeline | |
from _google_search_engine_testing_share import find_by_relative_search | |
import math | |
# Input CSV produced by the upstream proofreading stage (GPT-4o-mini output
# joined with best-similarity search results).
PROOFREAD_FILE = "data/1_proofread/xsum/gpt-4o-mini_with_best_similarity.csv"
# Placeholder for a word-frequency table; never populated in this module.
WORD_FREQUENCY = None
# Default SOTA detector checkpoint on the Hugging Face Hub.
DEFAULT_MODEL = "Hello-SimpleAI/chatgpt-detector-roberta"
"""
data/MAGE/xsum_human.csv = {'HUMAN': 64, 'MACHINE': 36} correction = 20 => 84%
data/MAGE/xsum_machine_topical_gpt-3.5-trubo.csv = {'HUMAN': 3, 'MACHINE': 97} => correction = 3 => 94%
original acc = (64+97)/ 200 = 80.5%
improve = (84 + 94) / 200 = 89%
different = 8.5%
https://huggingface.co/datasets/RealTimeData/bbc_news_alltime = {'HUMAN': 82, 'MACHINE': 18} => corrected 16 => 98%
"""
# Maps each detector model id to the label string that model emits for
# human-written text (used to normalize pipeline output to HUMAN/MACHINE).
MODEL_HUMAN_MATCHING = dict()
MODEL_HUMAN_MATCHING[DEFAULT_MODEL] = "Human"
# Canonical prediction labels used throughout this module.
HUMAN = "HUMAN"
MACHINE = "MACHINE"
UNKNOWN = "UNKNOWN"
# NOTE(review): "PARAPHASE" is a misspelling of "PARAPHRASE"; the values are
# kept verbatim because downstream consumers presumably match on these exact
# strings — confirm before renaming.
PARAPHASE = "PARAPHASE"
NON_PARAPHASE = "NON_PARAPHASE"
def detect_by_huggingface_model(input_text, model=DEFAULT_MODEL, max_length=512):
    """Classify *input_text* as human- or machine-written with a HF model.

    Args:
        input_text: Text to classify.
        model: Hugging Face model id used for both model and tokenizer.
        max_length: Token truncation length forwarded to the pipeline.
            (Bug fix: this parameter was previously accepted but ignored —
            the pipeline hard-coded 512.)

    Returns:
        Tuple ``(label, confidence)`` where ``label`` is ``HUMAN`` or
        ``MACHINE`` and ``confidence`` is the classifier score (float).
    """
    pipe = pipeline(
        "text-classification",
        model=model,
        tokenizer=model,
        max_length=max_length,  # was hard-coded to 512, ignoring the argument
        truncation=True,
        device_map="auto",
    )
    result = pipe(input_text)[0]
    confidence_score = result["score"]
    # Normalize the model-specific "human" label to the module-wide constant.
    if result["label"] == MODEL_HUMAN_MATCHING[model]:
        return HUMAN, confidence_score
    return MACHINE, confidence_score
def check_human(data, min_ratio=0.7):
    """Decide whether the text is human-written from sentence matches.

    Args:
        data: Iterable of items ``(input_sentence, source_sentence,
            similarity, is_paraphrase)``; only the first two fields are used.
        min_ratio: Minimum fraction of input sentences that must appear
            verbatim inside their source sentence.

    Returns:
        bool: True when at least ``ceil(len(data) * min_ratio)`` input
        sentences are substrings of their source sentence.

    Note:
        Empty ``data`` now returns False (bug fix: previously the threshold
        degenerated to ``0 >= 0`` and claimed "human" with zero evidence).
    """
    if not data:
        return False
    min_matching = math.ceil(len(data) * min_ratio)
    matched = sum(
        1
        for input_sentence, source_sentence, _similarity, _is_paraphrase in data
        if input_sentence in source_sentence
    )
    return matched >= min_matching
def abstract_detect_generated_text(input_text):
    """Detect the likely source of *input_text* via search engine + SOTA model.

    Returns a 5-tuple:
        - search-engine prediction (HUMAN / MACHINE / UNKNOWN)
        - SOTA model prediction (HUMAN / MACHINE)
        - SOTA confidence (float)
        - url of the best-matching website (None-like when UNKNOWN)
        - list of ``[input_sentence, source_sentence, PARAPHASE|NON_PARAPHASE]``
          triples ([] when no sentence data was found)
    """
    supports_opposite = False
    found_paraphrase, found_url, data = find_by_relative_search(
        input_text, supports_opposite
    )
    SOTA_prediction, SOTA_confidence = detect_by_huggingface_model(input_text)

    # No paraphrase evidence at all -> the search engine cannot decide.
    if not found_paraphrase:
        search_engine_prediction = UNKNOWN
    elif check_human(data):
        search_engine_prediction = HUMAN
    else:
        search_engine_prediction = MACHINE

    # Label each sentence pair; the per-pair flag is distinct from the
    # overall `found_paraphrase` result above.
    sentence_pairs = [
        [inp, src, PARAPHASE if pair_is_paraphrase else NON_PARAPHASE]
        for inp, src, _similarity, pair_is_paraphrase in data
    ]

    return (
        search_engine_prediction,
        SOTA_prediction,
        SOTA_confidence,
        found_url,
        sentence_pairs,
    )
if __name__ == "__main__":
    # Entry-point guard: no CLI behavior implemented in this module yet.
    pass