Spaces:
Runtime error
Runtime error
File size: 2,399 Bytes
74fc30d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 |
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import unittest
import codecs
import os,sys,inspect
# Put the directory above this file at the front of sys.path so the
# learn_bpe / apply_bpe imports that follow resolve against it.
_this_file = inspect.getfile(inspect.currentframe())
currentdir = os.path.dirname(os.path.abspath(_this_file))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0, parentdir)
from learn_bpe import learn_bpe
from apply_bpe import BPE
class TestBPELearnMethod(unittest.TestCase):
    """Regression test comparing learn_bpe output with a stored reference."""

    def test_learn_bpe(self):
        """learn_bpe() on data/corpus.en reproduces data/bpe.ref line for line"""
        datadir = os.path.join(currentdir, 'data')
        outpath = os.path.join(datadir, 'bpe.out')

        # Context managers close both handles even when learn_bpe raises.
        with codecs.open(os.path.join(datadir, 'corpus.en'), encoding='utf-8') as infile:
            with codecs.open(outpath, 'w', encoding='utf-8') as outfile:
                # The third argument is passed through unchanged; presumably it
                # is the number of merge operations -- confirm in learn_bpe.
                learn_bpe(infile, outfile, 1000)

        # Read both files back explicitly as UTF-8 (the encoding bpe.out was
        # just written with) instead of the platform default.
        with codecs.open(outpath, encoding='utf-8') as outlines:
            produced = outlines.readlines()
        with codecs.open(os.path.join(datadir, 'bpe.ref'), encoding='utf-8') as reflines:
            expected = reflines.readlines()

        # Compare line by line first so the first differing line is reported.
        for line, line2 in zip(produced, expected):
            self.assertEqual(line, line2)

        # zip() stops at the shorter sequence, so without this check an empty
        # or truncated bpe.out would pass the loop above.
        self.assertEqual(
            len(produced), len(expected),
            'bpe.out has %d lines, bpe.ref has %d' % (len(produced), len(expected)))
class TestBPESegmentMethod(unittest.TestCase):
    """Tests for BPE.process_line using the codes stored in data/bpe.ref."""

    def setUp(self):
        """Build a BPE instance from data/bpe.ref and open the corpus files."""
        datadir = os.path.join(currentdir, 'data')

        with codecs.open(os.path.join(datadir, 'bpe.ref'), encoding='utf-8') as bpefile:
            self.bpe = BPE(bpefile)

        self.infile = codecs.open(os.path.join(datadir, 'corpus.en'), encoding='utf-8')
        try:
            self.reffile = codecs.open(
                os.path.join(datadir, 'corpus.bpe.ref.en'), encoding='utf-8')
        except Exception:
            # unittest does not call tearDown when setUp raises, so the file
            # that was already opened has to be closed here.
            self.infile.close()
            raise

    def tearDown(self):
        """Close the corpus and reference files opened in setUp."""
        self.infile.close()
        self.reffile.close()

    def test_apply_bpe(self):
        """process_line() maps every line of corpus.en to its reference line"""
        lines = self.infile.readlines()
        refs = self.reffile.readlines()

        for line, ref in zip(lines, refs):
            out = self.bpe.process_line(line)
            self.assertEqual(out, ref)

        # zip() stops at the shorter file; make a missing or extra reference
        # line fail instead of being silently ignored.
        self.assertEqual(
            len(lines), len(refs),
            'corpus.en has %d lines, corpus.bpe.ref.en has %d' % (len(lines), len(refs)))

    def test_trailing_whitespace(self):
        """BPE.process_line() preserves leading and trailing whitespace"""
        orig = ' iron cement \n'
        exp = ' ir@@ on c@@ ement \n'

        out = self.bpe.process_line(orig)
        self.assertEqual(out, exp)

    def test_utf8_whitespace(self):
        """UTF-8 whitespace is treated as normal character, not word boundary"""
        orig = 'iron\xa0cement\n'
        exp = 'ir@@ on@@ \xa0@@ c@@ ement\n'

        out = self.bpe.process_line(orig)
        self.assertEqual(out, exp)

    def test_empty_line(self):
        """A line consisting only of a newline is returned unchanged"""
        orig = '\n'
        exp = '\n'

        out = self.bpe.process_line(orig)
        self.assertEqual(out, exp)
# Run the tests in this module when the file is executed as a script.
if __name__ == '__main__':
    unittest.main()
|