File size: 721 Bytes
d08dd00
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31

import sys
import unicodedata as ud
import collections


def get_lang(w):
    """Return the first word of the Unicode name of a token's leading character.

    If the token starts with '▁', that marker is skipped and the second
    character is inspected instead. The Unicode character name (e.g.
    ``'LATIN SMALL LETTER A'``) is split on whitespace and its first word
    (``'LATIN'``) is returned.

    Args:
        w: The token to classify, expected to be a ``str``.

    Returns:
        The first word of the character's Unicode name, or ``'unk'`` when
        there is no character to inspect (empty token, or a lone '▁'),
        when the character has no Unicode name (e.g. control characters),
        or when ``w`` is not a string.
    """
    try:
        # Skip the leading '▁' marker, if any, and look at what follows it.
        first_char = w[1] if w[0] == '▁' else w[0]
        return ud.name(first_char).split()[0]
    except (LookupError, TypeError, ValueError):
        # LookupError: token too short to index.
        # ValueError: unicodedata has no name for this character.
        # TypeError: `w` (or its first element) is not a one-character str.
        # A bare `except:` would also swallow KeyboardInterrupt/SystemExit.
        return 'unk'


# Input file path comes from the first command-line argument. Only the first
# whitespace-separated field of each line is used as the token.
fname = sys.argv[1]

# Read as UTF-8 explicitly: the script compares against the non-ASCII '▁'
# marker, so decoding must not depend on the platform's locale encoding.
# NOTE(review): assumes the input file is UTF-8 encoded -- confirm.
# The `with` block guarantees the file handle is closed.
with open(fname, encoding='utf-8') as infile:
    lines = infile.read().split('\n')

words = []
for line in lines:
    fields = line.split()
    # Blank and whitespace-only lines both yield an empty token (which
    # get_lang reports as 'unk'); indexing fields[0] unconditionally would
    # raise IndexError on a whitespace-only line.
    token = fields[0] if fields else ''
    # Drop tokens containing '[' (presumably bracketed special tokens such
    # as "[CLS]" -- verify against the input format).
    if '[' in token:
        continue
    # Remove every '#' (presumably "##" sub-word continuation markers --
    # verify against the input format).
    words.append(token.replace('#', ''))

langs = (get_lang(w) for w in words)
counter = collections.Counter(langs)
# most_common() sorts by descending count and keeps first-seen order for
# ties, matching a stable sort on the negated count. Only labels seen more
# than 10 times are kept.
counter = [(lang, count) for lang, count in counter.most_common() if count > 10]

for k, v in counter:
    print(k, ": ", v)