import string
from collections import Counter
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.probability import FreqDist
import torch
def preprocess_text(text, remove_stopwords=True, use_lemmatization=True):
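    """Lowercase and tokenize `text`, keep alphabetic tokens, and optionally
    remove English stopwords and lemmatize."""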
    tokens = word_tokenize(text.lower())
    tokens = [token for token in tokens if token.isalpha()]
    if remove_stopwords:
        stop_words = set(stopwords.words("english"))
        tokens = [token for token in tokens if token not in stop_words]
    if use_lemmatization:
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return tokens
def get_special_chars():
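    """Collect punctuation, digits, whitespace, assorted symbols, and emoji into
    the character set used by punctuation_diversity()."""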
    import emoji  # Use version emoji==1.6.1, otherwise it won't have UNICODE_EMOJI

    main_special_characters = string.punctuation + string.digits + string.whitespace
    other_special_characters = (
"’ “— ™ – •‘œ    ˜ ‚ƒ„’“”–ー一▬…✦�­£​•€«»°·═"
"×士^˘⇓↓↑←→()§″′´¿−±∈¢ø‚„½¼¾¹²³―⁃,ˌ¸‹›ʺˈʻ¦‐⠀‰……‑≤≥‖"
"◆●■►▼▲▴∆▻¡★☆✱ːº。¯˜¥ɪ≈†上ン:∼⁄・♡✓⊕․.⋅÷1‟;،、¨ाাी्े◦˚"
"゜ʼ≖ʼ¤ッツシ℃√!【】‿∞➤~πه۩☛₨➩☻๑٪♥ıॽ《‘©﴿٬?▷Г♫∟™ª₪®「—❖"
"」﴾》"
    )
    emoji_chars = list(emoji.UNICODE_EMOJI["en"].keys())
    special_characters_default = set(main_special_characters + other_special_characters)
    special_characters_default.update(emoji_chars)
    return special_characters_default
special_characters_default = get_special_chars()
# -------------------- Features --------------------
def syllable_count(word, d):
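    """Return the number of syllables in each CMU Pronouncing Dictionary entry
    for `word` (empty list if the word is not in `d`)."""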
    return [len(list(y for y in x if y[-1].isdigit())) for x in d.get(word, [])]
def estimated_slightly_difficult_words_ratio(text, d):
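    """Fraction of tokens that look difficult, i.e. have three or more syllables
    according to the CMU Pronouncing Dictionary `d`."""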
    words = word_tokenize(text.lower())
    total_words = len(words)
    # Considering words with 3 or more syllables (in any dictionary pronunciation) as difficult
    difficult_count = sum(
        1 for word in words if any(count >= 3 for count in syllable_count(word, d))
    )
    return difficult_count / total_words if total_words > 0 else 0
# -------------------- Features --------------------
def entity_density(text, nlp):
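    """Number of named entities per token in the spaCy parse."""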
    doc = nlp(text)
    return len(doc.ents) / len(doc) if len(doc) else 0
# -------------------- Features --------------------
def determiners_frequency(text, nlp):
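    """Fraction of tokens that spaCy tags as determiners (pos_ == "DET")."""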
    doc = nlp(text)
    determiners = sum(1 for token in doc if token.pos_ == "DET")
    total_words = len(doc)
    return determiners / total_words if total_words else 0
# -------------------- Features --------------------
def punctuation_diversity(text):
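    """Share of the special-character set (see get_special_chars) that appears
    at least once in the text."""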
    punctuation_counts = Counter(
        char for char in text if char in special_characters_default
    )
    diversity_score = (
        len(punctuation_counts) / len(special_characters_default)
        if special_characters_default
        else 0
    )
    return diversity_score
# -------------------- Features --------------------
def type_token_ratio(text, remove_stopwords=True, use_lemmatization=True):
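    """Ratio of distinct tokens to total tokens after preprocessing."""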
    tokens = preprocess_text(text, remove_stopwords, use_lemmatization)
    unique_words = set(tokens)
    return len(unique_words) / len(tokens) if tokens else 0
# -------------------- Features --------------------
def hapax_legomena_ratio(text, remove_stopwords=True, use_lemmatization=True):
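    """Fraction of tokens that occur exactly once in the text (hapax legomena)."""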
    tokens = preprocess_text(text, remove_stopwords, use_lemmatization)
    freq_dist = FreqDist(tokens)
    hapaxes = freq_dist.hapaxes()
    return len(hapaxes) / len(tokens) if tokens else 0
# -------------------- Features --------------------
def mtld(text, threshold=0.72, remove_stopwords=True, use_lemmatization=True):
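    """Measure of Textual Lexical Diversity: mean length of the token runs whose
    running type-token ratio stays above `threshold`, averaged over a forward and
    a backward pass."""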
    tokens = preprocess_text(text, remove_stopwords, use_lemmatization)

    def mtld_calc(direction):
        token_length, factor_count = 0, 0
        types = set()
        for token in (tokens if direction == "forward" else reversed(tokens)):
            types.add(token)
            token_length += 1
            if len(types) / token_length < threshold:
                factor_count += 1
                types = set()
                token_length = 0
        factor_count += 1  # For the last segment, even if it didn't reach the threshold
        return len(tokens) / factor_count if factor_count != 0 else 0

    return (mtld_calc("forward") + mtld_calc("backward")) / 2
# -------------------- Features --------------------
def calculate_max_depth(sent):
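    """Depth of a sentence's dependency tree, i.e. the largest ancestor count over its tokens."""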
    return max(len(list(token.ancestors)) for token in sent)
def calculate_syntactic_tree_depth(text, nlp):
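    """Average dependency-tree depth over the sentences in the text."""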
    doc = nlp(text)
    sentence_depths = [calculate_max_depth(sent) for sent in doc.sents]
    average_depth = (
        sum(sentence_depths) / len(sentence_depths) if sentence_depths else 0
    )
    return average_depth
# -------------------- Features --------------------
def calculate_perplexity(text, model, tokenizer, device, stride=512):
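    """Perplexity of `text` under a causal language model (e.g. GPT-2), computed
    with a sliding window of at most `model.config.n_positions` tokens and step `stride`."""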
    encodings = tokenizer(text, return_tensors="pt")
    max_length = model.config.n_positions
    seq_len = encodings.input_ids.size(1)

    nlls = []
    prev_end_loc = 0
    for begin_loc in range(0, seq_len, stride):
        end_loc = min(begin_loc + max_length, seq_len)
        trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
        input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
        target_ids = input_ids.clone()
        target_ids[:, :-trg_len] = -100

        with torch.no_grad():
            outputs = model(input_ids, labels=target_ids)
            # loss is calculated using CrossEntropyLoss which averages over valid labels
            # N.B. the model only calculates loss over trg_len - 1 labels, because it
            # internally shifts the labels to the left by 1.
            neg_log_likelihood = outputs.loss

        nlls.append(neg_log_likelihood)
        prev_end_loc = end_loc
        if end_loc == seq_len:
            break

    ppl = torch.exp(torch.stack(nlls).mean())
    return ppl.item()
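

if __name__ == "__main__":
    # Minimal usage sketch, not part of the original module: it assumes the NLTK
    # resources (punkt, stopwords, wordnet, cmudict), the spaCy model
    # "en_core_web_sm", and the Hugging Face "gpt2" checkpoint are available;
    # swap in whatever resources the surrounding application actually loads.
    from nltk.corpus import cmudict
    import spacy
    from transformers import GPT2LMHeadModel, GPT2TokenizerFast

    sample = "The quick brown fox jumps over the lazy dog. It was barely noticed."
    d = cmudict.dict()
    nlp = spacy.load("en_core_web_sm")

    print("difficult word ratio:", estimated_slightly_difficult_words_ratio(sample, d))
    print("entity density:", entity_density(sample, nlp))
    print("determiner frequency:", determiners_frequency(sample, nlp))
    print("punctuation diversity:", punctuation_diversity(sample))
    print("type-token ratio:", type_token_ratio(sample))
    print("hapax legomena ratio:", hapax_legomena_ratio(sample))
    print("MTLD:", mtld(sample))
    print("avg. syntactic tree depth:", calculate_syntactic_tree_depth(sample, nlp))

    device = "cuda" if torch.cuda.is_available() else "cpu"
    tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
    model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)
    print("perplexity:", calculate_perplexity(sample, model, tokenizer, device))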