File size: 6,427 Bytes
8125190 f75d1f0 8125190 f75d1f0 8125190 f75d1f0 8125190 f75d1f0 8125190 f75d1f0 8125190 d176253 f75d1f0 8125190 f75d1f0 8125190 f75d1f0 8125190 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 |
import string
from collections import Counter
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.probability import FreqDist
import torch
def preprocess_text(text, remove_stopwords=True, use_lemmatization=True):
    """Tokenize *text* into lower-case alphabetic tokens.

    Args:
        text: Raw input string.
        remove_stopwords: When true, drop tokens found in NLTK's English
            stop-word list.
        use_lemmatization: When true, replace each token with its WordNet
            lemma.

    Returns:
        A list of tokens after the requested filtering steps.
    """
    # Keep purely alphabetic tokens only (drops punctuation and numbers).
    words = [w for w in word_tokenize(text.lower()) if w.isalpha()]

    if remove_stopwords:
        english_stopwords = set(stopwords.words("english"))
        words = [w for w in words if w not in english_stopwords]

    if use_lemmatization:
        lemmatize = WordNetLemmatizer().lemmatize
        words = [lemmatize(w) for w in words]

    return words
def get_special_chars():
    """Build the set of characters treated as "special" by this module.

    The set is the union of ASCII punctuation, digits, whitespace, a
    hand-picked string of extra symbols, and the keys of the ``emoji``
    package's English emoji table.

    Returns:
        A ``set`` of strings. Emoji keys are added as whole strings, so some
        members may be longer than one character.
    """
    import emoji  # Use version emoji==1.6.1, otherwise it won't have UNICODE_EMOJI

    main_special_characters = string.punctuation + string.digits + string.whitespace
    other_special_characters = " ’“”–ー一▬…✦�£•€«»°·═"
    # Bound to a new name so the imported ``emoji`` module is not shadowed.
    emoji_chars = emoji.UNICODE_EMOJI["en"].keys()

    special_chars = set(main_special_characters + other_special_characters)
    # NOTE(review): the emoji list used to be computed and then discarded;
    # adding it to the set is the only plausible purpose of the lookup.
    special_chars.update(emoji_chars)
    return special_chars
# Module-level set of "special" characters, built once at import time;
# punctuation_diversity() measures text against it.
special_characters_default = get_special_chars()
# -------------------- Features --------------------
def syllable_count(word, d):
    """Count digit-terminated symbols in each transcription of *word*.

    Args:
        word: Key to look up in *d*.
        d: Mapping from a word to a list of transcriptions, each a sequence
            of strings (presumably CMUdict-style phonemes, where a trailing
            digit marks a stressed vowel -- confirm against the caller).

    Returns:
        One count per transcription of *word*; an empty list when *word* is
        not in *d*.
    """
    counts = []
    for pronunciation in d.get(word, []):
        marked = [phoneme for phoneme in pronunciation if phoneme[-1].isdigit()]
        counts.append(len(marked))
    return counts
def estimated_slightly_difficult_words_ratio(text, d):
    """Return the share of tokens in *text* with two or more entries in *d*.

    Args:
        text: Raw input string; it is lower-cased and split with
            ``word_tokenize``.
        d: Mapping from a word to a list of transcriptions, as consumed by
            ``syllable_count``.

    Returns:
        ``difficult_count / total_tokens``, or ``0`` when *text* yields no
        tokens.
    """
    words = word_tokenize(text.lower())
    total_words = len(words)
    # NOTE(review): this used to be commented as "words with 3 or more
    # syllables", but ``syllable_count`` returns one entry per transcription,
    # so the test below really selects words that have at least two
    # transcriptions in ``d``. Behaviour is kept as-is; confirm the intent
    # before changing the threshold or the quantity being compared.
    difficult_count = sum(1 for word in words if len(syllable_count(word, d)) >= 2)
    return difficult_count / total_words if total_words > 0 else 0
# -------------------- Features --------------------
def entity_density(text, nlp):
    """Return the number of named entities per token in *text*.

    Args:
        text: Raw input string.
        nlp: Callable that turns *text* into a document object exposing
            ``ents`` and ``len()`` (presumably a spaCy pipeline -- confirm
            against the caller).

    Returns:
        ``len(doc.ents) / len(doc)``, or ``0`` for a document with no tokens.
    """
    doc = nlp(text)
    total_tokens = len(doc)
    # Guard the empty document, as determiners_frequency() does, instead of
    # raising ZeroDivisionError.
    return len(doc.ents) / total_tokens if total_tokens else 0
# -------------------- Features --------------------
def determiners_frequency(text, nlp):
    """Return the fraction of tokens in *text* tagged as determiners.

    Args:
        text: Raw input string.
        nlp: Callable that turns *text* into an iterable, sized document
            whose tokens expose ``pos_``.

    Returns:
        The share of tokens with ``pos_ == "DET"``, or ``0`` for a document
        with no tokens.
    """
    doc = nlp(text)
    det_tokens = [tok for tok in doc if tok.pos_ == "DET"]
    token_total = len(doc)
    if not token_total:
        return 0
    return len(det_tokens) / token_total
# -------------------- Features --------------------
def punctuation_diversity(text, special_characters=None):
    """Return the fraction of the special-character set that occurs in *text*.

    Args:
        text: Raw input string, scanned character by character.
        special_characters: Optional iterable of characters to measure
            against. Defaults to the module-level
            ``special_characters_default``.

    Returns:
        ``distinct special characters found / size of the set``, or ``0``
        when the set is empty.
    """
    if special_characters is None:
        charset = special_characters_default
    else:
        # Deduplicate so the denominator counts distinct characters.
        charset = frozenset(special_characters)
    if not charset:
        return 0
    # Only the number of distinct matches matters, so a set is enough.
    found = {char for char in text if char in charset}
    return len(found) / len(charset)
# -------------------- Features --------------------
def type_token_ratio(text, remove_stopwords=True, use_lemmatization=True):
    """Return distinct tokens divided by total tokens for *text*.

    Args:
        text: Raw input string.
        remove_stopwords: Forwarded to ``preprocess_text``.
        use_lemmatization: Forwarded to ``preprocess_text``.

    Returns:
        The type/token ratio, or ``0`` when preprocessing leaves no tokens.
    """
    tokens = preprocess_text(text, remove_stopwords, use_lemmatization)
    if not tokens:
        return 0
    return len(set(tokens)) / len(tokens)
# -------------------- Features --------------------
def hapax_legomena_ratio(text, remove_stopwords=True, use_lemmatization=True):
    """Return the share of tokens in *text* that occur exactly once.

    Args:
        text: Raw input string.
        remove_stopwords: Forwarded to ``preprocess_text``.
        use_lemmatization: Forwarded to ``preprocess_text``.

    Returns:
        ``number of hapaxes / number of tokens``, or ``0`` when preprocessing
        leaves no tokens.
    """
    # Reuse the shared pipeline (as type_token_ratio and mtld do) instead of
    # repeating the tokenize / filter / lemmatize steps here.
    tokens = preprocess_text(text, remove_stopwords, use_lemmatization)
    if not tokens:
        return 0
    hapaxes = FreqDist(tokens).hapaxes()
    return len(hapaxes) / len(tokens)
# -------------------- Features --------------------
def mtld(text, threshold=0.72, remove_stopwords=True, use_lemmatization=True):
    """Return a bidirectional MTLD-style lexical diversity score for *text*.

    The token list is scanned once forward and once backward. Within each
    scan a running type/token ratio is kept; whenever it falls below
    *threshold* a factor is counted and the running segment is reset. The
    score for a scan is ``len(tokens) / factor_count``, and the result is the
    mean of the two scans.

    Args:
        text: Raw input string.
        threshold: Type/token ratio below which a segment is closed.
        remove_stopwords: Forwarded to ``preprocess_text``.
        use_lemmatization: Forwarded to ``preprocess_text``.

    Returns:
        The averaged score; ``0.0`` when preprocessing leaves no tokens.
    """
    tokens = preprocess_text(text, remove_stopwords, use_lemmatization)

    def mtld_calc(direction):
        token_length, factor_count = 0, 0
        types = set()
        for token in tokens if direction == "forward" else reversed(tokens):
            # Record the token as seen. Without this the ratio below is
            # always 0 and every single token closes a factor.
            types.add(token)
            token_length += 1
            if len(types) / token_length < threshold:
                factor_count += 1
                types = set()
                token_length = 0
        factor_count += 1  # For the last segment, even if it didn't reach the threshold
        # factor_count is at least 1 here, so the division is always safe.
        return len(tokens) / factor_count

    return (mtld_calc("forward") + mtld_calc("backward")) / 2
# -------------------- Features --------------------
def calculate_max_depth(sent):
    """Return the largest ancestor count among the tokens of *sent*.

    Args:
        sent: Iterable of tokens, each exposing an iterable ``ancestors``.

    Returns:
        The maximum number of ancestors of any token, or ``0`` when *sent*
        has no tokens.
    """
    # Count ancestors lazily; ``default`` keeps an empty sentence from
    # raising ValueError.
    return max((sum(1 for _ in token.ancestors) for token in sent), default=0)
def calculate_syntactic_tree_depth(text, nlp):
    """Return the mean per-sentence maximum tree depth of *text*.

    Args:
        text: Raw input string.
        nlp: Callable that turns *text* into a document exposing ``sents``,
            an iterable of sentences accepted by ``calculate_max_depth``.

    Returns:
        The average of ``calculate_max_depth`` over all sentences, or ``0``
        when the document has no sentences.
    """
    doc = nlp(text)
    sentence_depths = [calculate_max_depth(sent) for sent in doc.sents]
    if not sentence_depths:
        return 0
    return sum(sentence_depths) / len(sentence_depths)
# -------------------- Features --------------------
def calculate_perplexity(text, model, tokenizer, device, stride=512):
    """Return the sliding-window perplexity of *text* under *model*.

    The token sequence is scored in windows of at most
    ``model.config.n_positions`` tokens that advance by *stride*. In each
    window only the tokens not already scored by an earlier window
    contribute to the loss; the rest are masked with ``-100``.

    Args:
        text: Raw input string.
        model: Callable ``model(input_ids, labels=...)`` returning an object
            with a scalar ``loss`` tensor, and exposing
            ``config.n_positions``.
        tokenizer: Callable ``tokenizer(text, return_tensors="pt")``
            returning an object with an ``input_ids`` tensor indexed as
            ``[:, begin:end]``.
        device: Device the input ids are moved to before the forward pass.
        stride: Number of tokens the window advances per step.

    Returns:
        ``exp(mean of the per-window losses)`` as a Python float.

    Raises:
        RuntimeError: Presumably from ``torch.stack`` when *text* tokenizes
            to zero tokens, because no window is scored -- confirm.
    """
    encodings = tokenizer(text, return_tensors="pt")
    max_length = model.config.n_positions
    seq_len = encodings.input_ids.size(1)

    nlls = []
    prev_end_loc = 0
    for begin_loc in range(0, seq_len, stride):
        end_loc = min(begin_loc + max_length, seq_len)
        trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
        input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
        target_ids = input_ids.clone()
        # Mask the context tokens that an earlier window already scored.
        target_ids[:, :-trg_len] = -100

        with torch.no_grad():
            outputs = model(input_ids, labels=target_ids)

            # loss is calculated using CrossEntropyLoss which averages over valid labels
            # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels
            # to the left by 1.
            neg_log_likelihood = outputs.loss

        # Collect the window loss; without this ``nlls`` stays empty and the
        # final ``torch.stack`` fails.
        nlls.append(neg_log_likelihood)

        prev_end_loc = end_loc
        if end_loc == seq_len:
            # The window reached the end of the sequence. Stop here so later
            # strides do not re-score the tail with nothing left to target.
            break

    ppl = torch.exp(torch.stack(nlls).mean())
    return ppl.item()