import yaml
import subprocess
import nltk
from nltk import word_tokenize
from nltk.corpus import cmudict, stopwords
import spacy
import torch
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
import matplotlib.pyplot as plt
import numpy as np
from predictors import update, update_main, correct_text, split_text
from matplotlib.patches import Circle, RegularPolygon
from matplotlib.path import Path
from matplotlib.projections import register_projection
from matplotlib.projections.polar import PolarAxes
from matplotlib.spines import Spine
from matplotlib.transforms import Affine2D
from writing_analysis import (
    estimated_slightly_difficult_words_ratio,
    entity_density,
    determiners_frequency,
    punctuation_diversity,
    type_token_ratio,
    calculate_perplexity,
    calculate_syntactic_tree_depth,
    hapax_legomena_ratio,
    mtld,
)
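
# Fetch the NLTK resources used by the analysis helpers; nltk.download is
# a no-op when the resource is already installed.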
nltk.download("cmudict") | |
nltk.download("punkt") | |
nltk.download("stopwords") | |
nltk.download("wordnet") | |
d = cmudict.dict() | |
command = ["python3", "-m", "spacy", "download", "en_core_web_sm"] | |
subprocess.run(command) | |
nlp = spacy.load("en_core_web_sm") | |
with open("config.yaml", "r") as file: | |
params = yaml.safe_load(file) | |
device = "cuda" if torch.cuda.is_available() else "cpu" | |
readability_model_id = params["READABILITY_MODEL_ID"] | |
gpt2_model = GPT2LMHeadModel.from_pretrained(readability_model_id).to(device) | |
gpt2_tokenizer = GPT2TokenizerFast.from_pretrained(readability_model_id) | |

def normalize(value, min_value, max_value):
    """Scale value to [0, 100] relative to an expected range, clipping
    outliers to the boundaries."""
    normalized_value = ((value - min_value) * 100) / (max_value - min_value)
    return max(0, min(100, normalized_value))

def depth_analysis(input_text, bias_buster_selected):
    """Profile input_text on nine stylometric metrics and return a radar
    chart (matplotlib Figure) of the normalized scores."""
    if bias_buster_selected:
        input_text = update(input_text)
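
    # Empirically observed (min, max) ranges for each metric; normalize()
    # maps a raw value into 0-100 against its range and clips outliers.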
    usual_ranges = {
        "estimated_slightly_difficult_words_ratio": (
            0.2273693623058005,
            0.557383692351033,
        ),
        "entity_density": (-0.07940776754145815, 0.23491038179986615),
        "determiners_frequency": (0.012461059190031154, 0.15700934579439252),
        "punctuation_diversity": (-0.21875, 0.53125),
        "type_token_ratio": (0.33002482852189063, 1.0894414982357028),
        "calculate_perplexity": (-25.110544681549072, 82.4620680809021),
        "calculate_syntactic_tree_depth": (
            1.8380681818181812,
            10.997159090909092,
        ),
        "hapax_legomena_ratio": (0.0830971690138207, 1.0302715687215778),
        "mtld": (-84.03125000000001, 248.81875000000002),
    }
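
    # Raw stylometric measurements (helpers imported from writing_analysis).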
    vocabulary_level = estimated_slightly_difficult_words_ratio(input_text, d)
    entity_ratio = entity_density(input_text, nlp)
    determiner_use = determiners_frequency(input_text, nlp)
    punctuation_variety = punctuation_diversity(input_text)
    sentence_depth = calculate_syntactic_tree_depth(input_text, nlp)
    perplexity = calculate_perplexity(
        input_text, gpt2_model, gpt2_tokenizer, device
    )
    lexical_diversity = type_token_ratio(input_text)
    unique_words = hapax_legomena_ratio(input_text)
    vocabulary_stability = mtld(input_text)

    # Normalize each metric to the 0-100 scale using the ranges above.
    vocabulary_level_norm = normalize(
        vocabulary_level,
        *usual_ranges["estimated_slightly_difficult_words_ratio"],
    )
    entity_ratio_norm = normalize(entity_ratio, *usual_ranges["entity_density"])
    determiner_use_norm = normalize(
        determiner_use, *usual_ranges["determiners_frequency"]
    )
    punctuation_variety_norm = normalize(
        punctuation_variety, *usual_ranges["punctuation_diversity"]
    )
    lexical_diversity_norm = normalize(
        lexical_diversity, *usual_ranges["type_token_ratio"]
    )
    unique_words_norm = normalize(
        unique_words, *usual_ranges["hapax_legomena_ratio"]
    )
    vocabulary_stability_norm = normalize(
        vocabulary_stability, *usual_ranges["mtld"]
    )
    sentence_depth_norm = normalize(
        sentence_depth, *usual_ranges["calculate_syntactic_tree_depth"]
    )
    perplexity_norm = normalize(
        perplexity, *usual_ranges["calculate_perplexity"]
    )

    # Normalized scores keyed by their display labels on the chart.
    features = {
        "Lexical Diversity": lexical_diversity_norm,
        "Vocabulary Level": vocabulary_level_norm,
        "Unique Words": unique_words_norm,
        "Determiner Use": determiner_use_norm,
        "Punctuation Variety": punctuation_variety_norm,
        "Sentence Depth": sentence_depth_norm,
        "Vocabulary Stability": vocabulary_stability_norm,
        "Entity Ratio": entity_ratio_norm,
        "Perplexity": perplexity_norm,
    }
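
    # Radar-chart scaffolding in the style of the matplotlib "radar chart"
    # gallery example: registers a custom polar projection whose gridlines
    # and frame can be drawn as a regular polygon.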
def radar_factory(num_vars, frame="circle"): | |
theta = np.linspace(0, 2 * np.pi, num_vars, endpoint=False) | |
class RadarTransform(PolarAxes.PolarTransform): | |
def transform_path_non_affine(self, path): | |
if path._interpolation_steps > 1: | |
path = path.interpolated(num_vars) | |
return Path(self.transform(path.vertices), path.codes) | |

        class RadarAxes(PolarAxes):
            name = "radar"
            PolarTransform = RadarTransform

            def __init__(self, *args, **kwargs):
                super().__init__(*args, **kwargs)
                # Rotate the plot so the first axis sits at the top.
                self.set_theta_zero_location("N")

            def fill(self, *args, closed=True, **kwargs):
                # Close the filled polygon by default.
                return super().fill(*args, closed=closed, **kwargs)

            def plot(self, *args, **kwargs):
                # Close each plotted line so the outline wraps around.
                lines = super().plot(*args, **kwargs)
                for line in lines:
                    self._close_line(line)
                return lines

            def _close_line(self, line):
                x, y = line.get_data()
                if x[0] != x[-1]:
                    x = np.append(x, x[0])
                    y = np.append(y, y[0])
                    line.set_data(x, y)

            def set_varlabels(self, labels):
                self.set_thetagrids(np.degrees(theta), labels)

            def _gen_axes_patch(self):
                if frame == "circle":
                    return Circle((0.5, 0.5), 0.5)
                elif frame == "polygon":
                    return RegularPolygon(
                        (0.5, 0.5), num_vars, radius=0.5, edgecolor="k"
                    )

            def _gen_axes_spines(self):
                if frame == "circle":
                    return super()._gen_axes_spines()
                # Draw the frame as a regular-polygon spine.
                spine = Spine(
                    axes=self,
                    spine_type="circle",
                    path=Path.unit_regular_polygon(num_vars),
                )
                spine.set_transform(
                    Affine2D().scale(0.5).translate(0.5, 0.5)
                    + self.transAxes
                )
                return {"polar": spine}

        register_projection(RadarAxes)
        return theta

    N = 9
    theta = radar_factory(N, frame="polygon")
    data = list(features.values())
    labels = list(features.keys())

    fig, ax = plt.subplots(
        subplot_kw=dict(projection="radar"), figsize=(7.5, 5)
    )
    ax.plot(theta, data)
    ax.fill(theta, data, alpha=0.4)
    ax.set_varlabels(labels)

    # Radial gridlines labeled as percentages from 0% to 100%.
    rgrids = np.linspace(0, 100, num=6)
    ax.set_rgrids(
        rgrids,
        labels=[f"{round(r)}%" for r in rgrids],
        fontsize=8,
        color="black",
    )
    ax.grid(True, color="black", linestyle="-", linewidth=0.5, alpha=0.5)

    # Annotate each spoke with its normalized score.
    for dd, value in enumerate(data):
        ax.text(
            theta[dd] + 0.1,
            value + 5,
            f"{value:.0f}",
            horizontalalignment="left",
            verticalalignment="bottom",
            fontsize=8,
        )
    return fig
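
if __name__ == "__main__":
    # Minimal usage sketch, assuming config.yaml, the predictors module, and
    # the writing_analysis helpers are available on the path. sample_text is
    # a placeholder, not data from the project.
    sample_text = (
        "Writing style can be profiled along several axes, from vocabulary "
        "richness to syntactic depth. This snippet exercises all of them."
    )
    fig = depth_analysis(sample_text, bias_buster_selected=False)
    fig.savefig("depth_analysis_radar.png", bbox_inches="tight")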