# NOTE(review): the three lines below were extraction residue from a Hugging
# Face Spaces status banner ("Spaces: Running") — not code; kept as a comment.
import math
from functools import lru_cache

from transformers import pipeline

from _google_search_engine_testing_share import find_by_relative_search
# TODO: move to a config file
# Constants are UPPER_SNAKE_CASE per PEP 8.

# CSV produced by the proofreading step (GPT-4o-mini output with best similarity).
PROOFREAD_FILE = "data/1_proofread/xsum/gpt-4o-mini_with_best_similarity.csv"
# Word-frequency table; None here — presumably populated elsewhere (TODO confirm).
WORD_FREQUENCY = None

# Default HF model for AI-text detection, and the label each model emits
# for human-written text.
DEFAULT_MODEL = "Hello-SimpleAI/chatgpt-detector-roberta"
MODEL_HUMAN_LABEL = {DEFAULT_MODEL: "Human"}

# Classification outcome labels used throughout this module.
HUMAN = "HUMAN"
MACHINE = "MACHINE"
UNKNOWN = "UNKNOWN"
PARAPHRASE = "PARAPHRASE"
NON_PARAPHRASE = "NON_PARAPHRASE"
@lru_cache(maxsize=4)
def _load_detector(model: str, max_length: int):
    """Build and cache the HF text-classification pipeline for *model*.

    Loading a transformers pipeline is expensive (weights load each time);
    the original code rebuilt it on every detect_ai_content call, so the
    pipeline is now cached per (model, max_length).
    """
    return pipeline(
        "text-classification",
        model=model,
        tokenizer=model,
        max_length=max_length,
        truncation=True,
        device_map="auto",  # lets accelerate place the model on GPU when available
    )


def detect_ai_content(
    input_text: str,
    model: str = DEFAULT_MODEL,
    max_length: int = 512,
) -> tuple[str, float]:
    """
    Detects if text is human or machine generated.

    Args:
        input_text: Text to classify.
        model: HF model id; must have an entry in MODEL_HUMAN_LABEL.
        max_length: Tokenizer truncation length.

    Returns:
        tuple: (label, confidence_score)
        where label is HUMAN or MACHINE, or (UNKNOWN, 0.0) on failure.
    """
    try:
        pipe = _load_detector(model, max_length)
        result = pipe(input_text)[0]
        confidence_score = result["score"]
        if result["label"] == MODEL_HUMAN_LABEL[model]:
            label = HUMAN
        else:
            label = MACHINE
        return label, confidence_score
    except Exception as e:  # best-effort: inference failure must not crash callers
        print(f"Error in Roberta model inference: {e}")
        return UNKNOWN, 0.0  # Return UNKNOWN and 0.0 confidence if error
def check_human(data, min_ratio: float = 0.7) -> bool:
    """
    Checks if a sufficient number of input sentences are found within
    source sentences.

    Args:
        data: Sequence of per-sentence records. Each record is either a
            4-tuple ``(input_sentence, source_sentence, similarity,
            is_paraphrase)`` — the shape produced by find_by_relative_search
            and unpacked in abstract_detect_generated_text — or a mapping
            with a ``"similarity"`` key.
        min_ratio: Minimum fraction of sentences that must match
            (similarity >= 0.99).

    Returns:
        bool: True if the condition is met, False otherwise.
    """
    if not data:  # Handle empty data case
        return False
    min_matching = math.ceil(len(data) * min_ratio)
    count = 0
    for record in data:
        # BUG FIX: records are 4-tuples here (see caller), not dicts; the old
        # record["similarity"] raised TypeError. Accept both shapes.
        similarity = record["similarity"] if isinstance(record, dict) else record[2]
        if similarity >= 0.99:
            count += 1
        print(f"\tmatching_sentence_count : {count}, min_matching: {min_matching}")
        if count >= min_matching:
            return True
    return False
def abstract_detect_generated_text(input_text):
    """
    Abstracts the process of detecting generated text using search
    and a classification model.

    Returns:
        tuple: (
            search_engine_prediction,
            SOTA_prediction,
            SOTA_confidence,
            found_url,
            sentence_pairs,
        )
    """
    is_paraphrase, found_url, data = find_by_relative_search(
        input_text,
        is_support_opposite=False,  # keyword spelled out for readability
    )
    SOTA_prediction, SOTA_confidence = detect_ai_content(input_text)

    if is_paraphrase:
        search_engine_prediction = HUMAN if check_human(data) else MACHINE
    else:
        search_engine_prediction = UNKNOWN

    # data may be empty/None when the search found nothing to pair up.
    sentence_pairs = []
    if data:
        sentence_pairs = [
            [
                input_sentence,
                source_sentence,
                PARAPHRASE if pair_is_paraphrase else NON_PARAPHRASE,
            ]
            for input_sentence, source_sentence, _, pair_is_paraphrase in data
        ]

    return (
        search_engine_prediction,
        SOTA_prediction,
        SOTA_confidence,
        found_url,
        sentence_pairs,
    )