import string
from collections import Counter
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.probability import FreqDist
import torch
def preprocess_text(text, remove_stopwords=True, use_lemmatization=True):
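    """Lowercase and tokenize `text`, keep alphabetic tokens, and optionally
    remove English stopwords and lemmatize."""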
    tokens = word_tokenize(text.lower())
    tokens = [token for token in tokens if token.isalpha()]
    if remove_stopwords:
        stop_words = set(stopwords.words("english"))
        tokens = [token for token in tokens if token not in stop_words]
    if use_lemmatization:
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return tokens
def get_special_chars():
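    """Collect punctuation, digits, whitespace, assorted symbols, and emoji into
    the character set used by punctuation_diversity()."""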
    import emoji  # Use version emoji==1.6.1, otherwise it won't have UNICODE_EMOJI

    main_special_characters = string.punctuation + string.digits + string.whitespace
    other_special_characters = (
"’ “— ™ – •‘œ    ˜ ‚ƒ„’“”–ー一▬…✦�­£​•€«»°·═"
"×士^˘⇓↓↑←→()§″′´¿−±∈¢ø‚„½¼¾¹²³―⁃,ˌ¸‹›ʺˈʻ¦‐⠀‰……‑≤≥‖"
"◆●■►▼▲▴∆▻¡★☆✱ːº。¯˜¥ɪ≈†上ン:∼⁄・♡✓⊕․.⋅÷1‟;،、¨ाাी्े◦˚"
"゜ʼ≖ʼ¤ッツシ℃√!【】‿∞➤~πه۩☛₨➩☻๑٪♥ıॽ《‘©﴿٬?▷Г♫∟™ª₪®「—❖"
"」﴾》"
    )
    emoji_chars = list(emoji.UNICODE_EMOJI["en"].keys())
    special_characters_default = set(main_special_characters + other_special_characters)
    special_characters_default.update(emoji_chars)
    return special_characters_default
special_characters_default = get_special_chars()
# -------------------- Features --------------------
def syllable_count(word, d):
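    """Return the number of syllables in each CMU Pronouncing Dictionary entry
    for `word` (empty list if the word is not in `d`)."""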
    return [len(list(y for y in x if y[-1].isdigit())) for x in d.get(word, [])]
def estimated_slightly_difficult_words_ratio(text, d):
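    """Fraction of tokens that look difficult, i.e. have three or more syllables
    according to the CMU Pronouncing Dictionary `d`."""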
    words = word_tokenize(text.lower())
    total_words = len(words)
    # Considering words with 3 or more syllables (in any dictionary pronunciation) as difficult
    difficult_count = sum(
        1 for word in words if any(count >= 3 for count in syllable_count(word, d))
    )
    return difficult_count / total_words if total_words > 0 else 0
# -------------------- Features --------------------
def entity_density(text, nlp):
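    """Number of named entities per token in the spaCy parse."""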
    doc = nlp(text)
    return len(doc.ents) / len(doc) if len(doc) else 0
# -------------------- Features --------------------
def determiners_frequency(text, nlp):
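    """Fraction of tokens that spaCy tags as determiners (pos_ == "DET")."""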
    doc = nlp(text)
    determiners = sum(1 for token in doc if token.pos_ == "DET")
    total_words = len(doc)
    return determiners / total_words if total_words else 0
# -------------------- Features --------------------
def punctuation_diversity(text):
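    """Share of the special-character set (see get_special_chars) that appears
    at least once in the text."""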
    punctuation_counts = Counter(
        char for char in text if char in special_characters_default
    )
    diversity_score = (
        len(punctuation_counts) / len(special_characters_default)
        if special_characters_default
        else 0
    )
    return diversity_score
# -------------------- Features --------------------
def type_token_ratio(text, remove_stopwords=True, use_lemmatization=True):
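    """Ratio of distinct tokens to total tokens after preprocessing."""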
    tokens = preprocess_text(text, remove_stopwords, use_lemmatization)
    unique_words = set(tokens)
    return len(unique_words) / len(tokens) if tokens else 0
# -------------------- Features --------------------
def hapax_legomena_ratio(text, remove_stopwords=True, use_lemmatization=True):
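    """Fraction of tokens that occur exactly once in the text (hapax legomena)."""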
    tokens = preprocess_text(text, remove_stopwords, use_lemmatization)
    freq_dist = FreqDist(tokens)
    hapaxes = freq_dist.hapaxes()
    return len(hapaxes) / len(tokens) if tokens else 0
# -------------------- Features --------------------
def mtld(text, threshold=0.72, remove_stopwords=True, use_lemmatization=True):
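    """Measure of Textual Lexical Diversity: mean length of the token runs whose
    running type-token ratio stays above `threshold`, averaged over a forward and
    a backward pass."""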
    tokens = preprocess_text(text, remove_stopwords, use_lemmatization)

    def mtld_calc(direction):
        token_length, factor_count = 0, 0
        types = set()
        for token in (tokens if direction == "forward" else reversed(tokens)):
            types.add(token)
            token_length += 1
            if len(types) / token_length < threshold:
                factor_count += 1
                types = set()
                token_length = 0
        factor_count += 1  # For the last segment, even if it didn't reach the threshold
        return len(tokens) / factor_count if factor_count != 0 else 0

    return (mtld_calc("forward") + mtld_calc("backward")) / 2
# -------------------- Features --------------------
def calculate_max_depth(sent):
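    """Depth of a sentence's dependency tree, i.e. the largest ancestor count over its tokens."""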
    return max(len(list(token.ancestors)) for token in sent)
def calculate_syntactic_tree_depth(text, nlp):
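    """Average dependency-tree depth over the sentences in the text."""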
    doc = nlp(text)
    sentence_depths = [calculate_max_depth(sent) for sent in doc.sents]
    average_depth = (
        sum(sentence_depths) / len(sentence_depths) if sentence_depths else 0
    )
    return average_depth
# -------------------- Features --------------------
def calculate_perplexity(text, model, tokenizer, device, stride=512):
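    """Perplexity of `text` under a causal language model (e.g. GPT-2), computed
    with a sliding window of at most `model.config.n_positions` tokens and step `stride`."""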
    encodings = tokenizer(text, return_tensors="pt")
    max_length = model.config.n_positions
    seq_len = encodings.input_ids.size(1)

    nlls = []
    prev_end_loc = 0
    for begin_loc in range(0, seq_len, stride):
        end_loc = min(begin_loc + max_length, seq_len)
        trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
        input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
        target_ids = input_ids.clone()
        target_ids[:, :-trg_len] = -100

        with torch.no_grad():
            outputs = model(input_ids, labels=target_ids)
            # loss is calculated using CrossEntropyLoss which averages over valid labels
            # N.B. the model only calculates loss over trg_len - 1 labels, because it
            # internally shifts the labels to the left by 1.
            neg_log_likelihood = outputs.loss

        nlls.append(neg_log_likelihood)
        prev_end_loc = end_loc
        if end_loc == seq_len:
            break

    ppl = torch.exp(torch.stack(nlls).mean())
    return ppl.item()
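

if __name__ == "__main__":
    # Minimal usage sketch, not part of the original module: it assumes the NLTK
    # resources (punkt, stopwords, wordnet, cmudict), the spaCy model
    # "en_core_web_sm", and the Hugging Face "gpt2" checkpoint are available;
    # swap in whatever resources the surrounding application actually loads.
    from nltk.corpus import cmudict
    import spacy
    from transformers import GPT2LMHeadModel, GPT2TokenizerFast

    sample = "The quick brown fox jumps over the lazy dog. It was barely noticed."
    d = cmudict.dict()
    nlp = spacy.load("en_core_web_sm")

    print("difficult word ratio:", estimated_slightly_difficult_words_ratio(sample, d))
    print("entity density:", entity_density(sample, nlp))
    print("determiner frequency:", determiners_frequency(sample, nlp))
    print("punctuation diversity:", punctuation_diversity(sample))
    print("type-token ratio:", type_token_ratio(sample))
    print("hapax legomena ratio:", hapax_legomena_ratio(sample))
    print("MTLD:", mtld(sample))
    print("avg. syntactic tree depth:", calculate_syntactic_tree_depth(sample, nlp))

    device = "cuda" if torch.cuda.is_available() else "cpu"
    tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
    model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)
    print("perplexity:", calculate_perplexity(sample, model, tokenizer, device))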