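"""Parity tests verifying that `evaluate.evaluator` reproduces the metrics produced by the
corresponding `transformers` example scripts (or a bare `Trainer`) on the same evaluation slice."""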
import json
import os
import shutil
import subprocess
import tempfile
import unittest
import numpy as np
import torch
import transformers
from datasets import load_dataset
from transformers import AutoFeatureExtractor, AutoModelForImageClassification, Trainer, TrainingArguments, pipeline
from evaluate import evaluator, load
from .utils import slow
class TestEvaluatorTrainerParity(unittest.TestCase):
def setUp(self):
self.dir_path = tempfile.mkdtemp("evaluator_trainer_parity_test")
transformers_version = transformers.__version__
branch = ""
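# Pin the cloned examples to the installed transformers release; a `.dev0`
# (development) install falls back to the repository's default branch.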
if not transformers_version.endswith(".dev0"):
branch = f"--branch v{transformers_version}"
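# Shallow, blobless, sparse clone: only the example directories selected later
# via `git sparse-checkout set` in each test are actually materialized.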
subprocess.run(
f"git clone --depth 3 --filter=blob:none --sparse {branch} https://github.com/huggingface/transformers",
shell=True,
cwd=self.dir_path,
)
def tearDown(self):
shutil.rmtree(self.dir_path, ignore_errors=True)
def test_text_classification_parity(self):
model_name = "philschmid/tiny-bert-sst2-distilled"
subprocess.run(
"git sparse-checkout set examples/pytorch/text-classification",
shell=True,
cwd=os.path.join(self.dir_path, "transformers"),
)
subprocess.run(
f"python examples/pytorch/text-classification/run_glue.py"
f" --model_name_or_path {model_name}"
f" --task_name sst2"
f" --do_eval"
f" --max_seq_length 9999999999" # rely on tokenizer.model_max_length for max_length
f" --output_dir {os.path.join(self.dir_path, 'textclassification_sst2_transformers')}"
f" --max_eval_samples 80",
shell=True,
cwd=os.path.join(self.dir_path, "transformers"),
)
with open(
os.path.join(self.dir_path, "textclassification_sst2_transformers", "eval_results.json"), "r"
) as f:
transformers_results = json.load(f)
eval_dataset = load_dataset("glue", "sst2", split="validation[:80]")
pipe = pipeline(task="text-classification", model=model_name, tokenizer=model_name)
task_evaluator = evaluator(task="text-classification")
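# The pipeline emits string labels ("negative"/"positive"); label_mapping maps them
# back to SST-2's integer labels so accuracy is comparable to run_glue.py's eval_accuracy.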
evaluator_results = task_evaluator.compute(
model_or_pipeline=pipe,
data=eval_dataset,
metric="accuracy",
input_column="sentence",
label_column="label",
label_mapping={"negative": 0, "positive": 1},
strategy="simple",
)
self.assertEqual(transformers_results["eval_accuracy"], evaluator_results["accuracy"])
@slow
def test_text_classification_parity_two_columns(self):
model_name = "prajjwal1/bert-tiny-mnli"
max_eval_samples = 150
subprocess.run(
"git sparse-checkout set examples/pytorch/text-classification",
shell=True,
cwd=os.path.join(self.dir_path, "transformers"),
)
subprocess.run(
f"python examples/pytorch/text-classification/run_glue.py"
f" --model_name_or_path {model_name}"
f" --task_name mnli"
f" --do_eval"
f" --max_seq_length 256"
f" --output_dir {os.path.join(self.dir_path, 'textclassification_mnli_transformers')}"
f" --max_eval_samples {max_eval_samples}",
shell=True,
cwd=os.path.join(self.dir_path, "transformers"),
)
with open(
os.path.join(self.dir_path, "textclassification_mnli_transformers", "eval_results.json"), "r"
) as f:
transformers_results = json.load(f)
eval_dataset = load_dataset("glue", "mnli", split=f"validation_matched[:{max_eval_samples}]")
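# max_length=256 mirrors the --max_seq_length passed to run_glue.py above so both
# paths truncate identically.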
pipe = pipeline(task="text-classification", model=model_name, tokenizer=model_name, max_length=256)
task_evaluator = evaluator(task="text-classification")
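# second_input_column feeds (premise, hypothesis) sentence pairs to the pipeline,
# matching run_glue.py's pair tokenization for MNLI.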
evaluator_results = task_evaluator.compute(
model_or_pipeline=pipe,
data=eval_dataset,
metric="accuracy",
input_column="premise",
second_input_column="hypothesis",
label_column="label",
label_mapping={"LABEL_0": 0, "LABEL_1": 1, "LABEL_2": 2},
)
self.assertEqual(transformers_results["eval_accuracy"], evaluator_results["accuracy"])
def test_image_classification_parity(self):
# we cannot compare to the PyTorch transformers example, which uses custom preprocessing on the images
model_name = "douwekiela/resnet-18-finetuned-dogfood"
dataset_name = "beans"
max_eval_samples = 120
raw_dataset = load_dataset(dataset_name, split="validation")
eval_dataset = raw_dataset.select(range(max_eval_samples))
feature_extractor = AutoFeatureExtractor.from_pretrained(model_name)
model = AutoModelForImageClassification.from_pretrained(model_name)
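# Trainer expects tensors: run the feature extractor on each raw image and stack
# the results into a single batch.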
def collate_fn(examples):
pixel_values = torch.stack(
[torch.tensor(feature_extractor(example["image"])["pixel_values"][0]) for example in examples]
)
labels = torch.tensor([example["labels"] for example in examples])
return {"pixel_values": pixel_values, "labels": labels}
metric = load("accuracy")
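# Evaluation-only Trainer (no train_dataset); compute_metrics takes the argmax over
# the logits and scores accuracy, producing the eval_results.json compared below.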
trainer = Trainer(
model=model,
args=TrainingArguments(
output_dir=os.path.join(self.dir_path, "imageclassification_beans_transformers"),
remove_unused_columns=False,
),
train_dataset=None,
eval_dataset=eval_dataset,
compute_metrics=lambda p: metric.compute(
predictions=np.argmax(p.predictions, axis=1), references=p.label_ids
),
tokenizer=None,
data_collator=collate_fn,
)
metrics = trainer.evaluate()
trainer.save_metrics("eval", metrics)
with open(
os.path.join(self.dir_path, "imageclassification_beans_transformers", "eval_results.json"), "r"
) as f:
transformers_results = json.load(f)
pipe = pipeline(task="image-classification", model=model_name, feature_extractor=model_name)
task_evaluator = evaluator(task="image-classification")
evaluator_results = task_evaluator.compute(
model_or_pipeline=pipe,
data=eval_dataset,
metric="accuracy",
input_column="image",
label_column="labels",
label_mapping=model.config.label2id,
strategy="simple",
)
self.assertEqual(transformers_results["eval_accuracy"], evaluator_results["accuracy"])
def test_question_answering_parity(self):
model_name_v1 = "anas-awadalla/bert-tiny-finetuned-squad"
model_name_v2 = "mrm8488/bert-tiny-finetuned-squadv2"
subprocess.run(
"git sparse-checkout set examples/pytorch/question-answering",
shell=True,
cwd=os.path.join(self.dir_path, "transformers"),
)
# test squad_v1-like dataset
subprocess.run(
f"python examples/pytorch/question-answering/run_qa.py"
f" --model_name_or_path {model_name_v1}"
f" --dataset_name squad"
f" --do_eval"
f" --output_dir {os.path.join(self.dir_path, 'questionanswering_squad_transformers')}"
f" --max_eval_samples 100"
f" --max_seq_length 384",
shell=True,
cwd=os.path.join(self.dir_path, "transformers"),
)
with open(
os.path.join(self.dir_path, "questionanswering_squad_transformers", "eval_results.json"), "r"
) as f:
transformers_results = json.load(f)
eval_dataset = load_dataset("squad", split="validation[:100]")
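# padding="max_length" and max_answer_len=30 are set to line up with run_qa.py's
# preprocessing (the script's max_answer_length defaults to 30).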
pipe = pipeline(
task="question-answering",
model=model_name_v1,
tokenizer=model_name_v1,
max_answer_len=30,
padding="max_length",
)
task_evaluator = evaluator(task="question-answering")
evaluator_results = task_evaluator.compute(
model_or_pipeline=pipe,
data=eval_dataset,
metric="squad",
strategy="simple",
)
self.assertEqual(transformers_results["eval_f1"], evaluator_results["f1"])
self.assertEqual(transformers_results["eval_exact_match"], evaluator_results["exact_match"])
# test squad_v2-like dataset
subprocess.run(
f"python examples/pytorch/question-answering/run_qa.py"
f" --model_name_or_path {model_name_v2}"
f" --dataset_name squad_v2"
f" --version_2_with_negative"
f" --do_eval"
f" --output_dir {os.path.join(self.dir_path, 'questionanswering_squadv2_transformers')}"
f" --max_eval_samples 100"
f" --max_seq_length 384",
shell=True,
cwd=os.path.join(self.dir_path, "transformers"),
)
with open(
os.path.join(self.dir_path, "questionanswering_squadv2_transformers", "eval_results.json"), "r"
) as f:
transformers_results = json.load(f)
eval_dataset = load_dataset("squad_v2", split="validation[:100]")
pipe = pipeline(
task="question-answering",
model=model_name_v2,
tokenizer=model_name_v2,
max_answer_len=30,
)
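# squad_v2_format=True lets the evaluator produce no-answer predictions, matching
# the --version_2_with_negative flag passed to run_qa.py above.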
task_evaluator = evaluator(task="question-answering")
evaluator_results = task_evaluator.compute(
model_or_pipeline=pipe,
data=eval_dataset,
metric="squad_v2",
strategy="simple",
squad_v2_format=True,
)
self.assertEqual(transformers_results["eval_f1"], evaluator_results["f1"])
self.assertEqual(transformers_results["eval_HasAns_f1"], evaluator_results["HasAns_f1"])
self.assertEqual(transformers_results["eval_NoAns_f1"], evaluator_results["NoAns_f1"])
def test_token_classification_parity(self):
model_name = "hf-internal-testing/tiny-bert-for-token-classification"
n_samples = 500
subprocess.run(
"git sparse-checkout set examples/pytorch/token-classification",
shell=True,
cwd=os.path.join(self.dir_path, "transformers"),
)
subprocess.run(
f"python examples/pytorch/token-classification/run_ner.py"
f" --model_name_or_path {model_name}"
f" --dataset_name conll2003"
f" --do_eval"
f" --output_dir {os.path.join(self.dir_path, 'tokenclassification_conll2003_transformers')}"
f" --max_eval_samples {n_samples}",
shell=True,
cwd=os.path.join(self.dir_path, "transformers"),
)
with open(
os.path.join(self.dir_path, "tokenclassification_conll2003_transformers", "eval_results.json"), "r"
) as f:
transformers_results = json.load(f)
eval_dataset = load_dataset("conll2003", split=f"validation[:{n_samples}]")
pipe = pipeline(task="token-classification", model=model_name)
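# seqeval reports overall_accuracy / overall_f1, which correspond to run_ner.py's
# eval_accuracy and eval_f1 asserted below.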
task_evaluator = evaluator(task="token-classification")
evaluator_results = task_evaluator.compute(
model_or_pipeline=pipe,
data=eval_dataset,
metric="seqeval",
input_column="tokens",
label_column="ner_tags",
strategy="simple",
)
self.assertEqual(transformers_results["eval_accuracy"], evaluator_results["overall_accuracy"])
self.assertEqual(transformers_results["eval_f1"], evaluator_results["overall_f1"])