# Natural Language Toolkit: Language ID module using TextCat algorithm
#
# Copyright (C) 2001-2023 NLTK Project
# Author: Avital Pekker <[email protected]>
#
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

"""

A module for language identification using the TextCat algorithm.

An implementation of the text categorization algorithm

presented in Cavnar, W. B. and J. M. Trenkle,

"N-Gram-Based Text Categorization".



The algorithm takes advantage of Zipf's law and uses

n-gram frequencies to profile languages and text-yet to

be identified-then compares using a distance measure.



Language n-grams are provided by the "An Crubadan"

project. A corpus reader was created separately to read

those files.



For details regarding the algorithm, see:

https://www.let.rug.nl/~vannoord/TextCat/textcat.pdf



For details about An Crubadan, see:

https://borel.slu.edu/crubadan/index.html

"""

from sys import maxsize

from nltk.util import trigrams

# Note: this is NOT the "re" module you are likely used to. The
# third-party regex module is an alternative to the standard re
# module that supports Unicode codepoint properties via the \p{}
# syntax. You may have to "pip install regex".
try:
    import regex as re
except ImportError:
    re = None
######################################################################
##  Language identification using TextCat
######################################################################


class TextCat:

    _START_CHAR = "<"
    _END_CHAR = ">"

    def __init__(self):
        if not re:
            raise OSError(
                "classify.textcat requires the regex module that "
                "supports unicode. Try '$ pip install regex' and "
                "see https://pypi.python.org/pypi/regex for "
                "further details."
            )

        from nltk.corpus import crubadan

        self._corpus = crubadan

        # Keep mutable state per instance rather than on the class,
        # so separate TextCat objects never share caches.
        self.fingerprints = {}
        self.last_distances = {}

        # Load all language ngrams into cache
        for lang in self._corpus.langs():
            self._corpus.lang_freq(lang)

    def remove_punctuation(self, text):
        """Get rid of punctuation except apostrophes"""
        return re.sub(r"[^\P{P}\']+", "", text)

    def profile(self, text):
        """Create FreqDist of trigrams within text"""
        from nltk import FreqDist, word_tokenize

        clean_text = self.remove_punctuation(text)
        tokens = word_tokenize(clean_text)

        fingerprint = FreqDist()
        for t in tokens:
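            # Pad each token with word-boundary markers before taking
            # trigrams; e.g. "cat" becomes "<cat>", which yields the
            # trigrams "<ca", "cat", "at>".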
            token_trigram_tuples = trigrams(self._START_CHAR + t + self._END_CHAR)
            token_trigrams = ["".join(tri) for tri in token_trigram_tuples]

            for cur_trigram in token_trigrams:
                # FreqDist is a Counter subclass: missing keys default
                # to 0, so no membership check is needed.
                fingerprint[cur_trigram] += 1

        return fingerprint

    def calc_dist(self, lang, trigram, text_profile):
        """Calculate the "out-of-place" measure between the

        text and language profile for a single trigram"""
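        # The out-of-place measure from Cavnar & Trenkle: the distance
        # contributed by one trigram is the gap between its positions
        # in the language profile and in the text profile, e.g.
        # positions 2 and 5 contribute abs(2 - 5) = 3.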

        lang_fd = self._corpus.lang_freq(lang)
        dist = 0

        if trigram in lang_fd:
            idx_lang_profile = list(lang_fd.keys()).index(trigram)
            idx_text = list(text_profile.keys()).index(trigram)

            dist = abs(idx_lang_profile - idx_text)
        else:
            # Arbitrary penalty, but it must be larger than the
            # number of entries in any trigram frequency file
            dist = maxsize

        return dist

    def lang_dists(self, text):
        """Calculate the "out-of-place" measure between

        the text and all languages"""

        distances = {}
        profile = self.profile(text)
        # For all the languages
        for lang in self._corpus._all_lang_freq.keys():
            # Calculate distance metric for every trigram in
            # input text to be identified
            lang_dist = 0
            for trigram in profile:
                lang_dist += self.calc_dist(lang, trigram, profile)

            distances[lang] = lang_dist

        return distances
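
    # lang_dists returns a dict mapping each ISO 639-3 code to its
    # summed out-of-place score; a smaller score means a closer match.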

    def guess_language(self, text):
        """Find the language with the min distance

        to the text and return its ISO 639-3 code"""
        self.last_distances = self.lang_dists(text)

        return min(self.last_distances, key=self.last_distances.get)
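
    # After a call to guess_language, the full distance table is kept
    # on the instance; e.g. the three closest languages can be listed
    # with a (hypothetical) snippet like:
    #
    #     >>> tc = TextCat()                    # doctest: +SKIP
    #     >>> tc.guess_language("buenos dias")  # doctest: +SKIP
    #     >>> sorted(tc.last_distances.items(), key=lambda kv: kv[1])[:3]  # doctest: +SKIP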


def demo():
    from nltk.corpus import udhr

    langs = [
        "Kurdish-UTF8",
        "Abkhaz-UTF8",
        "Farsi_Persian-UTF8",
        "Hindi-UTF8",
        "Hawaiian-UTF8",
        "Russian-UTF8",
        "Vietnamese-UTF8",
        "Serbian_Srpski-UTF8",
        "Esperanto-UTF8",
    ]

    friendly = {
        "kmr": "Northern Kurdish",
        "abk": "Abkhazian",
        "pes": "Iranian Persian",
        "hin": "Hindi",
        "haw": "Hawaiian",
        "rus": "Russian",
        "vie": "Vietnamese",
        "srp": "Serbian",
        "epo": "Esperanto",
    }

    tc = TextCat()

    for cur_lang in langs:
        # Get the raw tokenized sentences from the UDHR corpus and
        # join them back into a single sample string of the language
        raw_sentences = udhr.sents(cur_lang)
        sample = " ".join(" ".join(sent) for sent in raw_sentences)

        # Try to detect what it is
        print("Language snippet: " + sample[0:140] + "...")
        guess = tc.guess_language(sample)
        print(f"Language detection: {guess} ({friendly[guess]})")
        print("#" * 140)


if __name__ == "__main__":
    demo()