File size: 4,413 Bytes
5596d03
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108

# class UDlexPT - the PortiLexicon-UD it reads dic files from the current directory
#               - it should contain WORDmaster.txt plus the 12 tags .tsv files
#
# member functions:
#    UDlexPT                    - the constructor
#    sget(self, word):          # get the entries for a word - returns a list with 3-tuples (empty if absent)
#    exists(self, word):        # returns True if the word exists
#    pget(self, word, tag):     # get the entries of a word for a specific tag - return similar to sget
#    pexists(self, word, tag):  # returns True if this word has at least one entry for tag
#    theTags(self, word):       # returns an array of all tags of a word - empty if absent of the lexicon

from os import path

class UDlexPT:
    def __init__(self):  # creates the lexicon
        self.tags = ["ADJ", "ADP", "ADV", "AUX", "CCONJ", "DET", "INTJ", \
            "NOUN", "NUM", "PRON", "SCONJ", "VERB"]
        self.master = {}
        self.words = 0
        self.entries = 0
        nEnt = [0]*len(self.tags)
        nNAE = [0]*len(self.tags)
        nEnD = [0]*len(self.tags)
        infile = open(path.dirname(__file__)+"/WORDmaster.txt")
        for line in infile:
            buf = line[:-1].split(",")
            tg = buf[1].split(" ")
            self.master.update({buf[0]:tg})
            self.words += 1
            ### compute totals
            if (len(tg) == 1):
                nNAE[self.tags.index(tg[0])] += 1
            for t in tg:
                nEnt[self.tags.index(t)] += 1
        infile.close()
        self.t = []
        i = 0
        for t in self.tags:
            self.t.append({})
            infile = open(path.dirname(__file__)+"/"+t+".tsv")
            for line in infile:
                buf = line[:-1].split("\t")
                entry = self.t[i].get(buf[0],"none")
                if (entry == "none"):
                    self.t[i].update({buf[0]:[[buf[1],buf[2]]]})
                else:
                    entry.append([buf[1],buf[2]])
                    self.t[i].update({buf[0]:entry})
                self.entries += 1
                nEnD[self.tags.index(t)] += 1
            infile.close()
            i += 1
        print("UDlexPT read with", self.words, "distinct words and", self.entries, "entries")
        print("{:5} & {:6} & {:6} & {:6} \\\\ \\hline".format("tag","total","amb","non-amb"))
        accW, accN, accE = 0, 0, 0
        for t in self.tags:
            print("{:5} & {:6} & {:6} & {:6} & {:6} \\\\ \\hline".format(t, \
                nEnt[self.tags.index(t)], \
                nEnt[self.tags.index(t)]-nNAE[self.tags.index(t)], \
                nNAE[self.tags.index(t)], \
                nEnD[self.tags.index(t)]))
            accW += nEnt[self.tags.index(t)]
            accN += nNAE[self.tags.index(t)]
            accE += nEnD[self.tags.index(t)]
        print("{:5} & {:6} & {:6} & {:6} & {:6} \\\\ \\hline".format("total", self.words, self.words-accN, accN, accE))
    def sget(self, word):   # get the entries for a word
        tags = self.master.get(word,"none")
        if (tags == "none"):
            return []
        else:
            ans = []
            for t in tags:
                a = self.t[self.tags.index(t)].get(word)
                #if (a == None):
                #    input("fix WORDmaster for: "+word)
                for n in a:
                    ans.append([n[0],t,n[1]])
            return ans
    def exists(self, word):   # returns True if the word exists
        tags = self.master.get(word,"none")
        if (tags == "none"):
            return False
        else:
            return True
    def pget(self, word, tag):   # get the entries of a word for a specific tag
        a = self.t[self.tags.index(tag)].get(word,"none")
        if (a == "none"):
            return []
        else:
            ans = []
            for n in a:
                ans.append([n[0],tag,n[1]])
            return ans
    def pexists(self, word, tag):    # returns True if this word has at least one entry for tag
        a = self.t[self.tags.index(tag)].get(word,"none")
        if (a == "none"):
            return False
        else:
            return True
    def theTags(self, word):   # returns an array of all tags of a word - empty if absent of the dictionary
        ts = self.master.get(word,"none")
        if (ts == "none"):
            return []
        else:
            return ts