"""Text feature extraction utilities: lexical diversity, readability, entity and
syntax statistics, and language-model perplexity."""

import string
from collections import Counter
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.probability import FreqDist
import torch


def preprocess_text(text, remove_stopwords=True, use_lemmatization=True):
    tokens = word_tokenize(text.lower())
    tokens = [token for token in tokens if token.isalpha()]
    if remove_stopwords:
        stop_words = set(stopwords.words("english"))
        tokens = [token for token in tokens if token not in stop_words]
    if use_lemmatization:
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return tokens


def get_special_chars():
    import emoji  # Use version emoji==1.6.1, otherwise it won't have UNICODE_EMOJI

    main_special_characters = string.punctuation + string.digits + string.whitespace
    other_special_characters = (
        "’ “— ™ – •‘œ    ˜ ‚ƒ„’“”–ー一▬…✦�­£​•€«»°·═"
        "×士^˘⇓↓↑←→()§″′´¿−±∈¢ø‚„½¼¾¹²³―⁃,ˌ¸‹›ʺˈʻ¦‐⠀‰……‑≤≥‖"
        "◆●■►▼▲▴∆▻¡★☆✱ːº。¯˜¥ɪ≈†上ン:∼⁄・♡✓⊕․.⋅÷1‟;،、¨ाাी्े◦˚"
        "゜ʼ≖ʼ¤ッツシ℃√!【】‿∞➤~πه۩☛₨➩☻๑٪♥ıॽ《‘©﴿٬?▷Г♫∟™ª₪®「—❖"
        "」﴾》"
    )
    emoji_chars = list(emoji.UNICODE_EMOJI["en"].keys())  # avoid shadowing the module name
    special_characters_default = set(main_special_characters + other_special_characters)
    special_characters_default.update(emoji_chars)
    return special_characters_default

special_characters_default = get_special_chars()


# -------------------- Features --------------------
def syllable_count(word, d):
    # `d` maps a word to its list of pronunciations (phoneme sequences); vowel
    # phonemes end in a stress digit, so counting digits counts syllables.
    # Returns one syllable count per pronunciation, or [] if the word is unknown.
    return [len(list(y for y in x if y[-1].isdigit())) for x in d.get(word, [])]


def estimated_slightly_difficult_words_ratio(text, d):
    words = word_tokenize(text.lower())
    total_words = len(words)
    # A word counts as (slightly) difficult if any of its dictionary
    # pronunciations has two or more syllables; words missing from `d`
    # contribute no syllables and are never counted.
    difficult_count = sum(
        1 for word in words if any(count >= 2 for count in syllable_count(word, d))
    )
    return difficult_count / total_words if total_words > 0 else 0
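

# NOTE: the `d` argument of the two functions above is assumed to be a CMU-style
# pronouncing dictionary mapping a word to a list of phoneme sequences, e.g.
# built with nltk.corpus.cmudict.dict(). This is an assumption based on how
# `d.get(word, [])` is used; the dictionary is not constructed in this file.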


# -------------------- Features --------------------
def entity_density(text, nlp):
    doc = nlp(text)
    # Named entities per token; guard against an empty document.
    return len(doc.ents) / len(doc) if len(doc) else 0
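

# NOTE: `nlp` here and in the spaCy-based features below is assumed to be a
# loaded spaCy pipeline (e.g. spacy.load("en_core_web_sm")), since the
# attributes used (.ents, .pos_, .sents, token.ancestors) are spaCy Doc/Token
# APIs. The specific model name is an assumption for illustration.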


# -------------------- Features --------------------
def determiners_frequency(text, nlp):
    doc = nlp(text)
    determiners = sum(1 for token in doc if token.pos_ == "DET")
    total_words = len(doc)
    return determiners / total_words if total_words else 0


# -------------------- Features --------------------
def punctuation_diversity(text):
    punctuation_counts = Counter(
        char for char in text if char in special_characters_default
    )
    diversity_score = (
        len(punctuation_counts) / len(special_characters_default)
        if special_characters_default
        else 0
    )
    return diversity_score


# -------------------- Features --------------------
def type_token_ratio(text, remove_stopwords=True, use_lemmatization=True):
    tokens = preprocess_text(text, remove_stopwords, use_lemmatization)
    unique_words = set(tokens)
    return len(unique_words) / len(tokens) if tokens else 0


# -------------------- Features --------------------
def hapax_legomena_ratio(text, remove_stopwords=True, use_lemmatization=True):
    # Same preprocessing as type_token_ratio; hapaxes are tokens occurring exactly once.
    tokens = preprocess_text(text, remove_stopwords, use_lemmatization)
    freq_dist = FreqDist(tokens)
    hapaxes = freq_dist.hapaxes()
    return len(hapaxes) / len(tokens) if tokens else 0


# -------------------- Features --------------------
def mtld(text, threshold=0.72, remove_stopwords=True, use_lemmatization=True):
    # MTLD (Measure of Textual Lexical Diversity): mean length of sequential
    # token runs that keep the running type-token ratio above `threshold`,
    # averaged over a forward and a backward pass.
    tokens = preprocess_text(text, remove_stopwords, use_lemmatization)

    def mtld_calc(direction):
        token_length, factor_count = 0, 0
        types = set()
        for token in tokens if direction == "forward" else reversed(tokens):
            types.add(token)
            token_length += 1
            if len(types) / token_length < threshold:
                factor_count += 1
                types = set()
                token_length = 0
        factor_count += 1  # For the last segment, even if it didn't reach the threshold
        return len(tokens) / factor_count if factor_count != 0 else 0

    return (mtld_calc("forward") + mtld_calc("backward")) / 2


# -------------------- Features --------------------
def calculate_max_depth(sent):
    return max(len(list(token.ancestors)) for token in sent)


def calculate_syntactic_tree_depth(text, nlp):
    doc = nlp(text)
    sentence_depths = [calculate_max_depth(sent) for sent in doc.sents]
    average_depth = (
        sum(sentence_depths) / len(sentence_depths) if sentence_depths else 0
    )
    return average_depth


# -------------------- Features --------------------
def calculate_perplexity(text, model, tokenizer, device, stride=512):
    # Sliding-window perplexity of a causal LM: score overlapping windows up to
    # the model's context length and mask the overlap with label -100 so each
    # token is only counted once.
    encodings = tokenizer(text, return_tensors="pt")
    max_length = model.config.n_positions
    seq_len = encodings.input_ids.size(1)

    nlls = []
    prev_end_loc = 0
    for begin_loc in range(0, seq_len, stride):
        end_loc = min(begin_loc + max_length, seq_len)
        trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
        input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
        target_ids = input_ids.clone()
        target_ids[:, :-trg_len] = -100

        with torch.no_grad():
            outputs = model(input_ids, labels=target_ids)

            # loss is calculated using CrossEntropyLoss which averages over valid labels
            # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels
            # to the left by 1.
            neg_log_likelihood = outputs.loss

        nlls.append(neg_log_likelihood)

        prev_end_loc = end_loc
        if end_loc == seq_len:
            break

    ppl = torch.exp(torch.stack(nlls).mean())
    return ppl.item()
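

# -------------------- Usage sketch --------------------
# A minimal, hedged example of wiring the features together. The concrete
# resources used here (GPT-2 as the perplexity model, en_core_web_sm as the
# spaCy pipeline, cmudict as the pronouncing dictionary) are assumptions for
# illustration only; the surrounding pipeline may use different models.
# Requires the NLTK corpora punkt, stopwords, wordnet and cmudict, plus the
# spaCy model, to be downloaded beforehand.
if __name__ == "__main__":
    import spacy
    from nltk.corpus import cmudict
    from transformers import GPT2LMHeadModel, GPT2TokenizerFast

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)  # assumed model
    model.eval()
    tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
    nlp = spacy.load("en_core_web_sm")  # assumed spaCy pipeline
    d = cmudict.dict()

    sample = "The quick brown fox jumps over the lazy dog."
    print("difficult word ratio:", estimated_slightly_difficult_words_ratio(sample, d))
    print("entity density:", entity_density(sample, nlp))
    print("determiner frequency:", determiners_frequency(sample, nlp))
    print("punctuation diversity:", punctuation_diversity(sample))
    print("type-token ratio:", type_token_ratio(sample))
    print("hapax legomena ratio:", hapax_legomena_ratio(sample))
    print("MTLD:", mtld(sample))
    print("syntactic tree depth:", calculate_syntactic_tree_depth(sample, nlp))
    print("perplexity:", calculate_perplexity(sample, model, tokenizer, device))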