news_verification / src /texts /Search_Text /chatgpt_detector_roberta.py
pmkhanh7890's picture
1st
22e1b62
raw
history blame
3.41 kB
import math
from _google_search_engine_testing_share import find_by_relative_search
from transformers import pipeline
# TODO: move to a config file
# Constants should be UPPER_SNAKE_CASE
PROOFREAD_FILE = "data/1_proofread/xsum/gpt-4o-mini_with_best_similarity.csv"
WORD_FREQUENCY = None
DEFAULT_MODEL = "Hello-SimpleAI/chatgpt-detector-roberta"
MODEL_HUMAN_LABEL = {DEFAULT_MODEL: "Human"}
HUMAN = "HUMAN"
MACHINE = "MACHINE"
UNKNOWN = "UNKNOWN"
PARAPHRASE = "PARAPHRASE"
NON_PARAPHRASE = "NON_PARAPHRASE"
def detect_ai_content(
    input_text: str,
    model: str = DEFAULT_MODEL,
    max_length: int = 512,
) -> tuple:
    """
    Detects if text is human or machine generated.

    Args:
        input_text: Text to classify.
        model: HuggingFace model id used for both model and tokenizer.
        max_length: Maximum token length; longer inputs are truncated.

    Returns:
        tuple: (label, confidence_score)
            where label is HUMAN, MACHINE, or UNKNOWN (on error),
            and confidence_score is the classifier score (0.0 on error).
    """
    try:
        # Cache pipelines on the function object so the (expensive) model
        # load happens once per (model, max_length), not on every call.
        cache = detect_ai_content.__dict__.setdefault("_pipelines", {})
        key = (model, max_length)
        if key not in cache:
            cache[key] = pipeline(
                "text-classification",
                model=model,
                tokenizer=model,
                max_length=max_length,
                truncation=True,
                device_map="auto",  # good for GPU usage
            )
        pipe = cache[key]
        result = pipe(input_text)[0]
        confidence_score = result["score"]
        # Map the model-specific "human" label onto our canonical labels.
        if result["label"] == MODEL_HUMAN_LABEL[model]:
            label = HUMAN
        else:
            label = MACHINE
        return label, confidence_score
    except Exception as e:
        # Deliberate best-effort fallback: inference failure must not crash
        # the caller, so report UNKNOWN with zero confidence.
        print(f"Error in Roberta model inference: {e}")
        return UNKNOWN, 0.0
def check_human(data, min_ratio=0.7):
    """
    Checks if a sufficient number of input sentences are found within
    source sentences.

    Args:
        data: Iterable of dicts, each with a float "similarity" key.
            NOTE(review): the caller in this file unpacks `data` items as
            4-tuples, while this function indexes them as dicts — one of
            the two is stale; confirm against `find_by_relative_search`.
        min_ratio: Minimum fraction of items that must be near-exact
            matches (similarity >= 0.99) for a True result.

    Returns:
        bool: True if the condition is met, False otherwise.
    """
    if not data:  # Handle empty data case
        return False
    # ceil so that a fractional threshold still requires a full sentence.
    min_matching = math.ceil(len(data) * min_ratio)
    count = sum(1 for sentence in data if sentence["similarity"] >= 0.99)
    print(f"\tmatching_sentence_count : {count}, min_matching: {min_matching}")
    return count >= min_matching
def abstract_detect_generated_text(input_text):
    """
    Abstracts the process of detecting generated text using search
    and a classification model.

    Args:
        input_text (str): The text to analyze.

    Returns:
        tuple: (
            search_engine_prediction,
            SOTA_prediction,
            SOTA_confidence,
            found_url,
            sentence_pairs,
        )
        where search_engine_prediction is HUMAN/MACHINE/UNKNOWN,
        SOTA_prediction/SOTA_confidence come from detect_ai_content,
        and sentence_pairs is a list of
        [input_sentence, source_sentence, PARAPHRASE-or-NON_PARAPHRASE].
    """
    is_paraphrase, found_url, data = find_by_relative_search(
        input_text,
        is_support_opposite=False,
    )  # Explicitly set the keyword argument
    SOTA_prediction, SOTA_confidence = detect_ai_content(input_text)
    if not is_paraphrase:
        # No match found by the search engine: cannot judge, so UNKNOWN.
        search_engine_prediction = UNKNOWN
    else:
        # NOTE(review): check_human indexes data items as dicts
        # (sentence["similarity"]), but the loop below unpacks them as
        # 4-tuples — these two views of `data` are inconsistent; confirm
        # the actual return shape of find_by_relative_search.
        search_engine_prediction = HUMAN if check_human(data) else MACHINE
    sentence_pairs = []
    if data:  # Check if data is not empty to avoid error when iterating
        for input_sentence, source_sentence, _, is_paraphrase in data:
            check_paraphrase = PARAPHRASE if is_paraphrase else NON_PARAPHRASE
            sentence_pairs.append(
                [input_sentence, source_sentence, check_paraphrase],
            )
    return (
        search_engine_prediction,
        SOTA_prediction,
        SOTA_confidence,
        found_url,
        sentence_pairs,
    )