# Evaluation script: runs the SOTA RoBERTa detector and the search-engine
# pipeline over a test CSV and logs per-sample predictions.
import csv | |
import time | |
import pandas as pd | |
from chatgpt_detector_roberta import ( | |
check_human, | |
detect_ai_content, | |
) | |
from search_text import detect_by_relative_search | |
# Label constants shared by both classification pipelines.
HUMAN = "HUMAN"
MACHINE = "MACHINE"
def read_csv_column(file_path, column_name, data_size=100):
    """
    Read a CSV file and return up to ``data_size`` values from one column.

    Args:
        file_path: Path to the CSV file.
        column_name: Name of the column to extract data from.
        data_size: Maximum number of rows to return (default 100).

    Returns:
        A list with at most ``data_size`` entries from the column, or an
        empty list if the file or the column is missing.
    """
    try:
        df = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.")
        return []
    try:
        # Slice after tolist() so the return type is always a plain list.
        return df[column_name].tolist()[:data_size]
    except KeyError:
        print(f"Error: Column '{column_name}' not found in the CSV file.")
        return []
def evaluation(texts, start_index=83, output_file="eva_bbc_test.csv"):
    """
    Classify each text with the SOTA model and the search-engine pipeline,
    appending one result row per processed sample to ``output_file``.

    Args:
        texts: Iterable of text samples to classify.
        start_index: Index of the first sample to process; earlier samples
            are skipped. Default 83 preserves the original hard-coded
            resume point (``index <= 82`` was skipped).
        output_file: CSV file that result rows are appended to.

    Returns:
        List of ``(index, SOTA_prediction, SOTA_confidence,
        search_engine_prediction)`` tuples for the processed samples.
    """
    results = []
    for index, text in enumerate(texts):
        # Resume support: skip samples already processed in a previous run.
        if index < start_index:
            print(f"index = {index}")
            continue

        # Classify with the SOTA RoBERTa-based detector.
        SOTA_prediction, SOTA_confidence = detect_ai_content(text)

        # Classify via search engine: look for a near-duplicate source and,
        # if one is found, decide whether that source reads as human.
        is_paraphrased, _, data = detect_by_relative_search(text)
        if not is_paraphrased:
            search_engine_prediction = "UNKNOWN"
        elif check_human(data):
            search_engine_prediction = HUMAN
        else:
            search_engine_prediction = MACHINE

        print(
            f"RESULTS:\t{SOTA_prediction}\t{search_engine_prediction}"
        )
        results.append(
            (index, SOTA_prediction, SOTA_confidence, search_engine_prediction)
        )
        # Append immediately so progress survives crashes/interrupts.
        with open(output_file, "a", newline="") as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(
                [index, SOTA_prediction, SOTA_confidence, search_engine_prediction]
            )
        time.sleep(1)  # avoid the search API's queries-per-minute limit
    return results
def extract_machine_data(file_path, output_path="machine_data.csv"):
    """
    Extract the machine-generated rows from a MAGE-style CSV.

    Args:
        file_path: Input CSV; must contain a ``src`` column.
        output_path: Destination CSV for the filtered rows
            (default ``machine_data.csv``).
    """
    df = pd.read_csv(file_path)
    # NOTE: "trubo" reproduces a typo present in the dataset's src labels;
    # do not "fix" the spelling or the filter matches nothing.
    machine_data = df[df["src"] == "xsum_machine_topical_gpt-3.5-trubo"]
    machine_data.to_csv(output_path, index=False)
def extract_human_data(file_path, output_path="human_data.csv"):
    """
    Extract the human-written rows from a MAGE-style CSV.

    Fixes a copy-paste bug: the original wrote the human rows to
    ``machine_data.csv``, silently clobbering extract_machine_data's output.

    Args:
        file_path: Input CSV; must contain a ``src`` column.
        output_path: Destination CSV for the filtered rows
            (default ``human_data.csv``).
    """
    df = pd.read_csv(file_path)
    human_data = df[df["src"] == "xsum_human"]
    human_data.to_csv(output_path, index=False)
if __name__ == "__main__":
    # extract_machine_data('data/test_data/test.csv')
    # Dataset selection: BBC is active; uncomment the MAGE pair below to
    # evaluate on the MAGE test split instead.
    file_path = "data/test_data/test_100_bbc.csv"
    column_name = "content"
    # file_path = "data/test_data/test_100_MAGE.csv"
    # column_name = "text"

    contents = read_csv_column(
        file_path=file_path,
        column_name=column_name,
        data_size=100,
    )
    evaluation(contents)