import os

import torch
from dotenv import load_dotenv
from openai import (
    AzureOpenAI,
    OpenAIError,
)
from sentence_transformers import (
    SentenceTransformer,
    util,
)
from transformers import pipeline

load_dotenv()
AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION")

azure_client = AzureOpenAI(
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    api_key=AZURE_OPENAI_API_KEY,
    api_version=AZURE_OPENAI_API_VERSION,
)
# TODO: move to a config file
# AI_TEXT_DETECTION_MODEL = "Hello-SimpleAI/chatgpt-detector-roberta"
AI_TEXT_DETECTION_MODEL = "TrustSafeAI/RADAR-Vicuna-7B"

# Label that the detector model assigns to human-written text.
MODEL_HUMAN_LABEL = {AI_TEXT_DETECTION_MODEL: "Human"}

HUMAN = "HUMAN"
MACHINE = "MACHINE"
UNKNOWN = "UNKNOWN"
PARAPHRASE = "PARAPHRASE"
NON_PARAPHRASE = "NON_PARAPHRASE"
# Load the embedding model used for text-similarity scoring.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
PARAPHRASE_MODEL = SentenceTransformer("paraphrase-MiniLM-L6-v2")
PARAPHRASE_MODEL.to(DEVICE)
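# Note: "paraphrase-MiniLM-L6-v2" produces 384-dimensional sentence embeddings;
# measure_text_similarity() below compares texts via cosine similarity of these
# embeddings.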


def detect_text_by_ai_model(
    input_text: str,
    model: str = AI_TEXT_DETECTION_MODEL,
    max_length: int = 512,
) -> tuple[str, float]:
    """
    Detect whether text is human- or machine-generated.

    Model: RADAR-Vicuna-7B
    Ref: https://huggingface.co/TrustSafeAI/RADAR-Vicuna-7B

    Returns:
        tuple: (label, confidence_score), where label is HUMAN or MACHINE
        (for MACHINE, the predicted generating model is appended).
    """
    try:
        pipe = pipeline(
            "text-classification",
            model=model,
            tokenizer=model,
            max_length=max_length,
            truncation=True,
            device_map="auto",  # place the model on GPU automatically when available
        )
        input_text = input_text.replace("<br>", " ")
        result = pipe(input_text)[0]
        confidence_score = result["score"]
        if result["label"] == MODEL_HUMAN_LABEL[model]:
            label = HUMAN
        else:
            label = MACHINE
            # Guess which model generated the text and append it to the label.
            generated_model, _ = predict_generation_model(input_text)
            label += f"<br>({generated_model})"
        return label, confidence_score
    except Exception as e:
        print(f"Error in detection model inference: {e}")
        return UNKNOWN, 0.5  # neutral confidence when inference fails
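

# Optional sketch, not wired into detect_text_by_ai_model(): the pipeline above is
# rebuilt (and the model re-loaded) on every call. For repeated calls, a memoized
# pipeline avoids that cost. `_PIPELINE_CACHE` and `get_detection_pipeline` are
# illustrative names, not part of the original code.
_PIPELINE_CACHE: dict = {}


def get_detection_pipeline(model: str = AI_TEXT_DETECTION_MODEL, max_length: int = 512):
    """Return a cached text-classification pipeline for (model, max_length)."""
    key = (model, max_length)
    if key not in _PIPELINE_CACHE:
        _PIPELINE_CACHE[key] = pipeline(
            "text-classification",
            model=model,
            tokenizer=model,
            max_length=max_length,
            truncation=True,
            device_map="auto",
        )
    return _PIPELINE_CACHE[key]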


def predict_generation_model(text: str) -> tuple[str, float]:
    """
    Predict whether the text was generated by gpt-4o or gpt-4o-mini.

    Each candidate model paraphrases the input text; the input is then compared
    against each paraphrase, and the closest match wins.

    Returns:
        tuple: (label, confidence_score), where label is "gpt-4o" or "gpt-4o-mini".
    """
    best_similarity = 0.0
    best_model = "gpt-4o"
    models = ["gpt-4o", "gpt-4o-mini"]
    for model in models:
        paraphrased_text = paraphrase_by_AI(text, model)
        if paraphrased_text is None:
            # Skip this candidate if paraphrasing failed.
            continue
        similarity = measure_text_similarity(text, paraphrased_text)
        if similarity > best_similarity:
            best_similarity = similarity
            best_model = model
    return best_model, best_similarity
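

# Note on the heuristic above: it assumes a model paraphrases text "in its own
# voice", so the candidate whose paraphrase stays closest to the input is the
# most likely generator. With only two candidates this is a best-effort guess
# rather than a calibrated attribution.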


def paraphrase_by_AI(input_text: str, model: str = "gpt-4o-mini") -> str | None:
    """
    Paraphrase text using the given Azure OpenAI model.

    Returns:
        str | None: Paraphrased text, or None if the request fails.
    """
    prompt = f"""
Paraphrase the following news, only output the paraphrased text:
{input_text}
"""
    try:
        response = azure_client.chat.completions.create(
            model=model,
            messages=[
                {"role": "user", "content": prompt},
            ],
            # max_tokens=100,
            # temperature=0.7,
            # top_p=0.9,
            # n=1,
        )
        paraphrased_text = response.choices[0].message.content
        return paraphrased_text
    except OpenAIError as e:
        print(f"Error in AI model inference: {e}")
        return None


def measure_text_similarity(text1: str, text2: str) -> float:
    """
    Measure the cosine similarity between two texts.

    Returns:
        float: Similarity score (1.0 means identical embeddings).
    """
    embeddings1 = PARAPHRASE_MODEL.encode(
        text1,
        convert_to_tensor=True,
        device=DEVICE,
        show_progress_bar=False,
    )
    embeddings2 = PARAPHRASE_MODEL.encode(
        text2,
        convert_to_tensor=True,
        device=DEVICE,
        show_progress_bar=False,
    )
    # Compute cosine similarity between the two sentence embeddings.
    similarity = util.cos_sim(embeddings1, embeddings2).cpu().numpy()
    return float(similarity[0][0])
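

# Example usage, a minimal sketch: assumes the AZURE_OPENAI_* variables are set in
# the environment (or a .env file) and that the detector model can be downloaded
# from the Hugging Face Hub. The sample strings are illustrative only.
if __name__ == "__main__":
    sample = "The quick brown fox jumps over the lazy dog."
    label, score = detect_text_by_ai_model(sample)
    print(f"label={label}, confidence={score:.3f}")

    sim = measure_text_similarity(
        "A cat sat on the mat.",
        "A cat was sitting on the mat.",
    )
    print(f"similarity={sim:.3f}")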