File size: 4,067 Bytes
2cddd11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import sys, os
# from normalizers.english import EnglishTextNormalizer

# Module-level state shared by the rest of this script.
# Upper-cased reference word -> number of occurrences counted as errors.
error_words_freqs = {}
# Path of the alignment file to analyse (first command-line argument).
infile = sys.argv[1]
# setname = sys.argv[2]
insert_error = 0  # number of REF-line tokens containing '*'
insert_rare = 0  # subset of those whose aligned next-line token is a rare word
freqlist_test = {}  # upper-cased reference word -> occurrences in infile
# eng_norm = EnglishTextNormalizer()

# Frequency list: each non-blank line holds exactly "<word> <integer count>".
# Keys are upper-cased; a later duplicate overwrites an earlier one.
freqlist = {}
with open("./blist/word_freq.txt") as fin:
    for line in fin:
        fields = line.split()
        if not fields:
            # A blank line (e.g. trailing newline at end of file) used to
            # raise ValueError on unpacking; skip it instead.
            continue
        word, freq = fields
        freqlist[word.upper()] = int(freq)

# Rare-word list: one entry per line, stored upper-cased. Blank lines are
# ignored so the empty string never becomes a member of the set.
with open("./blist/all_rare_words.txt") as fin:
    rareset = set()
    for line in fin:
        entry = line.strip().upper()
        if entry:
            rareset.add(entry)

# Scan the alignment file and collect, per upper-cased reference word, how
# often it occurs (freqlist_test) and how often it is in error
# (error_words_freqs), plus the insertion counters.
# NOTE(review): the input looks like sclite-style alignment output ("id:" /
# "REF:" lines, '*' padding, upper-case marking of errors) -- confirm.
project_set = set()  # never populated in this file; kept so the name exists
with open(infile) as fin:
    lines = fin.readlines()
for i, line in enumerate(lines):
    if line.startswith('id:'):
        # Last three '-'-separated fields of the id line. The value is not
        # used anywhere else in this file.
        project = '-'.join(line.strip(')\n').split('-')[-3:])
    if "REF:" in line:
        # Tokens of the line following the REF line, compared position by
        # position with the REF tokens. A REF line at the very end of the
        # file has no such line; treat it as empty instead of raising.
        nextline = lines[i + 1].split() if i + 1 < len(lines) else []
        for j, word in enumerate(line.split()):
            if '*' in word:
                insert_error += 1
                # Guard the index: the next line may have fewer tokens.
                if j < len(nextline) and nextline[j].upper() in rareset:
                    insert_rare += 1
        # str.replace returns a new string; the original code discarded the
        # result of the '%BCACK' replacement, so that token was never removed.
        line = line.replace('*', '').replace('%BCACK', '')
        # Skip the first token (presumably the "REF:" label -- confirm).
        for word in line.split()[1:]:
            if word.startswith('('):
                continue
            key = word.upper()
            freqlist_test[key] = freqlist_test.get(key, 0) + 1
            # A token that is not entirely lower-case is counted as an error;
            # a correct token still registers the word with a count of 0.
            is_error = word != word.lower()
            error_words_freqs[key] = error_words_freqs.get(key, 0) + int(is_error)
print(len(error_words_freqs))
print(insert_rare)

# Split every reference word into one of three categories and accumulate,
# per category, the word list, the occurrence total and the error total:
#   rare   -- word is in rareset
#   oov    -- word is not in rareset and not in freqlist
#   common -- everything else
commonwords, rarewords, oovwords = [], [], []
common_freq = rare_freq = oov_freq = 0
common_error = rare_error = oov_error = 0
partial_error = partial_freq = 0  # not updated anywhere in this file
very_common_error = very_common_words = 0
# Both dicts are keyed by the word's freqlist value (only values 3..10).
words_error_freq = {}
words_total_freq = {}
for token, n_errors in error_words_freqs.items():
    if token in rareset:
        rarewords.append(token)
        rare_freq += freqlist_test[token]
        rare_error += n_errors
        continue
    if token not in freqlist:
        oovwords.append(token)
        # Falls back to 1 if the word has no occurrence count recorded.
        oov_freq += freqlist_test.get(token, 1)
        oov_error += n_errors
        continue

    listed_freq = freqlist[token]
    n_occurrences = freqlist_test[token]
    if 3 <= listed_freq <= 10:
        # Per-frequency breakdown and the "very_common_*" totals share the
        # same 3..10 window on the freqlist value.
        words_error_freq[listed_freq] = words_error_freq.get(listed_freq, 0) + n_errors
        words_total_freq[listed_freq] = words_total_freq.get(listed_freq, 0) + n_occurrences
        very_common_error += n_errors
        very_common_words += n_occurrences
    commonwords.append(token)
    common_freq += n_occurrences
    common_error += n_errors

# Print the summary. Every ratio divides by max(denominator, 1) -- the guard
# the OOV line already used -- so an empty category (or an input with no
# reference words at all) reports 0.0 instead of raising ZeroDivisionError.
# The printed numerators and denominators are the raw, unguarded counts.
total_words = common_freq + rare_freq + oov_freq
total_errors = common_error + rare_error + oov_error + insert_error
WER = total_errors / max(total_words, 1)
# Insertions whose aligned token is a rare word are reported with the rare
# words, so they are subtracted from the plain insertion count below.
rare_total_error = rare_error + insert_rare
other_insert_error = insert_error - insert_rare
print('='*89)
print('Common words error freq: {} / {} = {}'.format(common_error, common_freq, common_error/max(common_freq, 1)))
print('Rare words error freq: {} / {} = {}'.format(rare_total_error, rare_freq, rare_total_error/max(rare_freq, 1)))
print('OOV words error freq: {} / {} = {}'.format(oov_error, oov_freq, oov_error/max(oov_freq, 1)))
print('WER estimate: {} / {} = {}'.format(total_errors, total_words, WER))
# print('Partial word count: {} / {}'.format(partial_error, partial_freq))
print('Insert error: {} / {} = {}'.format(other_insert_error, total_words, other_insert_error/max(total_words, 1)))
print('Insertion + OOV error {}'.format((insert_error + oov_error - insert_rare) / max(total_words, 1)))
# print('Very common words error freq: {} / {} = {}'.format(very_common_error, very_common_words, very_common_error/very_common_words))
print('='*89)