import string
from collections import Counter

import torch
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.stem import WordNetLemmatizer


def preprocess_text(text, remove_stopwords=True, use_lemmatization=True):
    # Lowercase, keep alphabetic tokens, optionally drop stopwords and lemmatize.
    tokens = word_tokenize(text.lower())
    tokens = [token for token in tokens if token.isalpha()]
    if remove_stopwords:
        stop_words = set(stopwords.words("english"))
        tokens = [token for token in tokens if token not in stop_words]
    if use_lemmatization:
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return tokens


def get_special_chars():
    import emoji  # Use version emoji==1.6.1, otherwise it won't have UNICODE_EMOJI

    main_special_characters = string.punctuation + string.digits + string.whitespace
    other_special_characters = (
        "’ “— ™ – •‘œ    ˜ ‚ƒ„’“”–ー一▬…✦�­£​•€«»°·═"
        "×士^˘⇓↓↑←→()§″′´¿−±∈¢ø‚„½¼¾¹²³―⁃,ˌ¸‹›ʺˈʻ¦‐⠀‰……‑≤≥‖"
        "◆●■►▼▲▴∆▻¡★☆✱ːº。¯˜¥ɪ≈†上ン:∼⁄・♡✓⊕․.⋅÷1‟;،、¨ाাी्े◦˚"
        "゜ʼ≖ʼ¤ッツシ℃√!【】‿∞➤~πه۩☛₨➩☻๑٪♥ıॽ《‘©﴿٬?▷Г♫∟™ª₪®「—❖"
        "」﴾》"
    )
    emoji_chars = list(emoji.UNICODE_EMOJI["en"].keys())
    special_characters_default = set(main_special_characters + other_special_characters)
    special_characters_default.update(emoji_chars)
    return special_characters_default


special_characters_default = get_special_chars()


# -------------------- Features --------------------
def syllable_count(word, d):
    # One syllable count per pronunciation of `word` in a CMU-style pronouncing
    # dictionary `d` (phonemes ending in a digit are stressed vowels, i.e. syllable nuclei).
    return [len(list(y for y in x if y[-1].isdigit())) for x in d.get(word, [])]


def estimated_slightly_difficult_words_ratio(text, d):
    words = word_tokenize(text.lower())
    total_words = len(words)
    # Considering words with 3 or more syllables as difficult
    difficult_count = sum(
        1 for word in words if any(syllables >= 3 for syllables in syllable_count(word, d))
    )
    return difficult_count / total_words if total_words > 0 else 0


# -------------------- Features --------------------
def entity_density(text, nlp):
    # Named entities per token.
    doc = nlp(text)
    return len(doc.ents) / len(doc) if len(doc) else 0


# -------------------- Features --------------------
def determiners_frequency(text, nlp):
    doc = nlp(text)
    determiners = sum(1 for token in doc if token.pos_ == "DET")
    total_words = len(doc)
    return determiners / total_words if total_words else 0


# -------------------- Features --------------------
def punctuation_diversity(text):
    # Fraction of the special-character inventory that occurs in the text.
    punctuation_counts = Counter(
        char for char in text if char in special_characters_default
    )
    diversity_score = (
        len(punctuation_counts) / len(special_characters_default)
        if special_characters_default
        else 0
    )
    return diversity_score


# -------------------- Features --------------------
def type_token_ratio(text, remove_stopwords=True, use_lemmatization=True):
    tokens = preprocess_text(text, remove_stopwords, use_lemmatization)
    unique_words = set(tokens)
    return len(unique_words) / len(tokens) if tokens else 0


# -------------------- Features --------------------
def hapax_legomena_ratio(text, remove_stopwords=True, use_lemmatization=True):
    tokens = preprocess_text(text, remove_stopwords, use_lemmatization)
    freq_dist = FreqDist(tokens)
    hapaxes = freq_dist.hapaxes()  # tokens that occur exactly once
    return len(hapaxes) / len(tokens) if tokens else 0


# -------------------- Features --------------------
def mtld(text, threshold=0.72, remove_stopwords=True, use_lemmatization=True):
    # MTLD (Measure of Textual Lexical Diversity): mean factor length over a
    # forward and a backward pass through the tokens.
    tokens = preprocess_text(text, remove_stopwords, use_lemmatization)

    def mtld_calc(direction):
        token_length, factor_count = 0, 0
        types = set()
        for token in tokens if direction == "forward" else reversed(tokens):
            types.add(token)
            token_length += 1
            if len(types) / token_length < threshold:
                # Type-token ratio of the current segment fell below the threshold:
                # close this factor and start a new one.
                factor_count += 1
                types = set()
                token_length = 0
        factor_count += 1  # For the last segment, even if it didn't reach the threshold
        return len(tokens) / factor_count if factor_count != 0 else 0

    return (mtld_calc("forward") + mtld_calc("backward")) / 2


# -------------------- Features --------------------
def calculate_max_depth(sent):
    # Depth of a sentence = maximum number of ancestors of any token in its dependency tree.
    return max(len(list(token.ancestors)) for token in sent)


def calculate_syntactic_tree_depth(text, nlp):
    doc = nlp(text)
    sentence_depths = [calculate_max_depth(sent) for sent in doc.sents]
    average_depth = (
        sum(sentence_depths) / len(sentence_depths) if sentence_depths else 0
    )
    return average_depth


# -------------------- Features --------------------
def calculate_perplexity(text, model, tokenizer, device, stride=512):
    # Sliding-window perplexity of `text` under a causal language model.
    encodings = tokenizer(text, return_tensors="pt")
    max_length = model.config.n_positions
    seq_len = encodings.input_ids.size(1)

    nlls = []
    prev_end_loc = 0
    for begin_loc in range(0, seq_len, stride):
        end_loc = min(begin_loc + max_length, seq_len)
        trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
        input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
        target_ids = input_ids.clone()
        target_ids[:, :-trg_len] = -100

        with torch.no_grad():
            outputs = model(input_ids, labels=target_ids)
            # loss is calculated using CrossEntropyLoss which averages over valid labels
            # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels
            # to the left by 1.
            neg_log_likelihood = outputs.loss

        nlls.append(neg_log_likelihood)
        prev_end_loc = end_loc
        if end_loc == seq_len:
            break

    ppl = torch.exp(torch.stack(nlls).mean())
    return ppl.item()
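

# -------------------- Usage sketch --------------------
# A minimal sketch of how the features might be wired together. It assumes NLTK's
# CMU pronouncing dictionary for `d`, spaCy's `en_core_web_sm` pipeline for `nlp`,
# and GPT-2 from Hugging Face Transformers for the perplexity model; substitute
# whatever resources your project actually uses. The NLTK data packages
# (punkt, stopwords, wordnet, cmudict) must be downloaded beforehand, e.g. via nltk.download().
if __name__ == "__main__":
    import spacy
    from nltk.corpus import cmudict
    from transformers import GPT2LMHeadModel, GPT2TokenizerFast

    sample = (
        "The quick brown fox jumps over the lazy dog. "
        "Researchers in Paris measured the extraordinary complexity of this sentence."
    )

    d = cmudict.dict()                  # pronunciation dictionary for syllable counts
    nlp = spacy.load("en_core_web_sm")  # NER, POS tags, and dependency parses

    device = "cuda" if torch.cuda.is_available() else "cpu"
    tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
    model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)
    model.eval()

    features = {
        "difficult_words_ratio": estimated_slightly_difficult_words_ratio(sample, d),
        "entity_density": entity_density(sample, nlp),
        "determiners_frequency": determiners_frequency(sample, nlp),
        "punctuation_diversity": punctuation_diversity(sample),
        "type_token_ratio": type_token_ratio(sample),
        "hapax_legomena_ratio": hapax_legomena_ratio(sample),
        "mtld": mtld(sample),
        "syntactic_tree_depth": calculate_syntactic_tree_depth(sample, nlp),
        "perplexity": calculate_perplexity(sample, model, tokenizer, device),
    }
    for name, value in features.items():
        print(f"{name}: {value:.4f}")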