from unittest import TestCase

from tokenizers.pre_tokenizers import Whitespace

from recognizers.utils import DifferenceSample


class DifferenceSampleTestCase(TestCase):

    def setUp(self):
        self.text_a = "Chinese shares close higher Friday."
        self.text_b = "Les actions chinoises clĂ´turent en baisse mercredi."
        self.tokenizer = Whitespace()
        self.encoding_a = self.tokenizer.pre_tokenize_str(self.text_a)
        self.encoding_b = self.tokenizer.pre_tokenize_str(self.text_b)
        self.result = DifferenceSample(
            tokens_a=tuple([token[0] for token in self.encoding_a]),
            tokens_b=tuple([token[0] for token in self.encoding_b]),
            labels_a=tuple([0.1 for _ in range(len(self.encoding_a))]),
            labels_b=tuple([0.1 for _ in range(len(self.encoding_b))]),
        )

    def test_add_whitespace(self):
        self.result.add_whitespace(self.encoding_a, self.encoding_b)
        self.assertEqual("".join(self.result.tokens_a), self.text_a)
        self.assertEqual("".join(self.result.tokens_b), self.text_b)