news_verification / src /texts /Search_Text /chatgpt_detector_roberta.py
pmkhanh7890's picture
1st
22e1b62
raw
history blame
3.41 kB
import math
from _google_search_engine_testing_share import find_by_relative_search
from transformers import pipeline
# TODO: move to a config file
# Constants should be UPPER_SNAKE_CASE
PROOFREAD_FILE = "data/1_proofread/xsum/gpt-4o-mini_with_best_similarity.csv"
WORD_FREQUENCY = None
DEFAULT_MODEL = "Hello-SimpleAI/chatgpt-detector-roberta"
MODEL_HUMAN_LABEL = {DEFAULT_MODEL: "Human"}
HUMAN = "HUMAN"
MACHINE = "MACHINE"
UNKNOWN = "UNKNOWN"
PARAPHRASE = "PARAPHRASE"
NON_PARAPHRASE = "NON_PARAPHRASE"
def detect_ai_content(
    input_text: str,
    model: str = DEFAULT_MODEL,
    max_length: int = 512,
) -> tuple:
    """
    Detects if text is human or machine generated.

    Args:
        input_text: Text to classify.
        model: HuggingFace model id used for both model and tokenizer.
        max_length: Maximum token length; longer inputs are truncated.

    Returns:
        tuple: (label, confidence_score)
            where label is HUMAN, MACHINE, or UNKNOWN (on error),
            and confidence_score is the classifier score (0.0 on error).
    """
    try:
        # Cache pipelines on the function object so the (expensive) model
        # load happens once per (model, max_length), not on every call.
        cache = detect_ai_content.__dict__.setdefault("_pipelines", {})
        key = (model, max_length)
        if key not in cache:
            cache[key] = pipeline(
                "text-classification",
                model=model,
                tokenizer=model,
                max_length=max_length,
                truncation=True,
                device_map="auto",  # good for GPU usage
            )
        pipe = cache[key]
        result = pipe(input_text)[0]
        confidence_score = result["score"]
        # Map the model-specific "human" label onto our canonical labels.
        if result["label"] == MODEL_HUMAN_LABEL[model]:
            label = HUMAN
        else:
            label = MACHINE
        return label, confidence_score
    except Exception as e:
        # Deliberate best-effort fallback: inference failure must not crash
        # the caller, so report UNKNOWN with zero confidence.
        print(f"Error in Roberta model inference: {e}")
        return UNKNOWN, 0.0
def check_human(data, min_ratio=0.7):
    """
    Checks if a sufficient number of input sentences are found within
    source sentences.

    Args:
        data: Iterable of dicts, each with a float "similarity" key.
            NOTE(review): the caller in this file unpacks `data` items as
            4-tuples, while this function indexes them as dicts — one of
            the two is stale; confirm against `find_by_relative_search`.
        min_ratio: Minimum fraction of items that must be near-exact
            matches (similarity >= 0.99) for a True result.

    Returns:
        bool: True if the condition is met, False otherwise.
    """
    if not data:  # Handle empty data case
        return False
    # ceil so that a fractional threshold still requires a full sentence.
    min_matching = math.ceil(len(data) * min_ratio)
    count = sum(1 for sentence in data if sentence["similarity"] >= 0.99)
    print(f"\tmatching_sentence_count : {count}, min_matching: {min_matching}")
    return count >= min_matching
def abstract_detect_generated_text(input_text):
    """
    Abstracts the process of detecting generated text using search
    and a classification model.

    Args:
        input_text (str): The text to analyze.

    Returns:
        tuple: (
            search_engine_prediction,
            SOTA_prediction,
            SOTA_confidence,
            found_url,
            sentence_pairs,
        )
        where search_engine_prediction is HUMAN/MACHINE/UNKNOWN,
        SOTA_prediction/SOTA_confidence come from detect_ai_content,
        and sentence_pairs is a list of
        [input_sentence, source_sentence, PARAPHRASE-or-NON_PARAPHRASE].
    """
    is_paraphrase, found_url, data = find_by_relative_search(
        input_text,
        is_support_opposite=False,
    )  # Explicitly set the keyword argument
    SOTA_prediction, SOTA_confidence = detect_ai_content(input_text)
    if not is_paraphrase:
        # No match found by the search engine: cannot judge, so UNKNOWN.
        search_engine_prediction = UNKNOWN
    else:
        # NOTE(review): check_human indexes data items as dicts
        # (sentence["similarity"]), but the loop below unpacks them as
        # 4-tuples — these two views of `data` are inconsistent; confirm
        # the actual return shape of find_by_relative_search.
        search_engine_prediction = HUMAN if check_human(data) else MACHINE
    sentence_pairs = []
    if data:  # Check if data is not empty to avoid error when iterating
        for input_sentence, source_sentence, _, is_paraphrase in data:
            check_paraphrase = PARAPHRASE if is_paraphrase else NON_PARAPHRASE
            sentence_pairs.append(
                [input_sentence, source_sentence, check_paraphrase],
            )
    return (
        search_engine_prediction,
        SOTA_prediction,
        SOTA_confidence,
        found_url,
        sentence_pairs,
    )