# news_verification/src/texts/Search_Text/_text_detection_share.py
import math

from transformers import pipeline

from _google_search_engine_testing_share import find_by_relative_search

PROOFREAD_FILE = "data/1_proofread/xsum/gpt-4o-mini_with_best_similarity.csv"
WORD_FREQUENCY = None
DEFAULT_MODEL = "Hello-SimpleAI/chatgpt-detector-roberta"
"""
data/MAGE/xsum_human.csv = {'HUMAN': 64, 'MACHINE': 36} correction = 20 => 84%
data/MAGE/xsum_machine_topical_gpt-3.5-trubo.csv = {'HUMAN': 3, 'MACHINE': 97} => correction = 3 => 94%
original acc = (64+97)/ 200 = 80.5%
improve = (84 + 94) / 200 = 89%
different = 8.5%
https://huggingface.co/datasets/RealTimeData/bbc_news_alltime = {'HUMAN': 82, 'MACHINE': 18} => corrected 16 => 98%
"""
# Label that each model uses to mark human-written text.
MODEL_HUMAN_MATCHING = {DEFAULT_MODEL: "Human"}

HUMAN = "HUMAN"
MACHINE = "MACHINE"
UNKNOWN = "UNKNOWN"
PARAPHASE = "PARAPHASE"
NON_PARAPHASE = "NON_PARAPHASE"


def detect_by_huggingface_model(input_text, model=DEFAULT_MODEL, max_length=512):
    """
    Returns "HUMAN" or "MACHINE" together with the confidence score (float).
    """
    pipe = pipeline(
        "text-classification",
        model=model,
        tokenizer=model,
        max_length=max_length,
        truncation=True,
        device_map="auto",
    )
    result = pipe(input_text)[0]
    confidence_score = result["score"]
    if result["label"] == MODEL_HUMAN_MATCHING[model]:
        return HUMAN, confidence_score
    return MACHINE, confidence_score
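# Example usage (a minimal sketch; the model is downloaded from the Hugging Face
# Hub on first use, so network access is required, and the input text is illustrative):
#   label, score = detect_by_huggingface_model("The cabinet met on Tuesday to discuss the budget.")
#   print(label, round(score, 3))  # prints HUMAN or MACHINE plus the confidence score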


def check_human(data, min_ratio=0.7):
    """
    Decide whether the text is human-written.

    Input:
        data: a list of items, each containing
            - input sentence
            - source sentence
            - similarity
            - True/False: paraphrased or not
    Output:
        True if at least min_ratio of the input sentences appear verbatim in
        their matched source sentences, False otherwise.
    """
    total_sentence = len(data)
    min_matching = int(math.ceil(total_sentence * min_ratio))
    count = 0
    for input_sentence, source_sentence, similarity, is_paraphrase in data:
        if input_sentence in source_sentence:
            count += 1
    return count >= min_matching
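# Example (illustrative data only): both input sentences appear verbatim in their
# matched source sentences, so 2/2 >= 0.7 and the text is treated as human-written.
#   data = [
#       ("A storm hit the coast", "A storm hit the coast early on Monday.", 0.93, True),
#       ("Dozens of homes were flooded", "Dozens of homes were flooded overnight.", 0.91, True),
#   ]
#   check_human(data)  # True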


def abstract_detect_generated_text(input_text):
    """
    Detect the source of the text with the help of a search engine.

    Output:
        - prediction by the search engine (HUMAN/MACHINE/UNKNOWN)
        - prediction by the SOTA model (HUMAN/MACHINE)
        - SOTA confidence (float)
        - URL of the matching website (None if UNKNOWN)
        - sentence pairs ([] if empty); each item contains
            - input sentence
            - best-matching source sentence from the URL
            - matching result between input/source sentence (PARAPHASE/NON_PARAPHASE)
    """
    is_support_opposite = False
    is_paraphrase, found_url, data = find_by_relative_search(
        input_text,
        is_support_opposite,
    )
    sentence_pairs = []
    SOTA_prediction, SOTA_confidence = detect_by_huggingface_model(input_text)

    if not is_paraphrase:
        search_engine_prediction = UNKNOWN
    elif check_human(data):
        search_engine_prediction = HUMAN
    else:
        search_engine_prediction = MACHINE

    for input_sentence, source_sentence, similarity, is_paraphrase in data:
        if is_paraphrase:
            check_paraphrase = PARAPHASE
        else:
            check_paraphrase = NON_PARAPHASE
        sentence_pairs.append([input_sentence, source_sentence, check_paraphrase])

    return search_engine_prediction, SOTA_prediction, SOTA_confidence, found_url, sentence_pairs


if __name__ == "__main__":
    pass
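    # Example (a minimal sketch; both the search-engine helper and the Hugging Face
    # model require network access, and the input text below is purely illustrative):
    # engine_pred, sota_pred, sota_conf, url, pairs = abstract_detect_generated_text(
    #     "The prime minister announced a new climate policy on Tuesday."
    # )
    # print(engine_pred, sota_pred, round(sota_conf, 3), url)
    # for input_sentence, source_sentence, label in pairs:
    #     print(label, "|", input_sentence, "->", source_sentence)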