import yaml
import subprocess
import nltk
from nltk import word_tokenize
from nltk.corpus import cmudict, stopwords
import spacy
import torch
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
import matplotlib.pyplot as plt
import numpy as np
from predictors import update, update_main, correct_text, split_text
from matplotlib.patches import Circle, RegularPolygon
from matplotlib.path import Path
from matplotlib.projections import register_projection
from matplotlib.projections.polar import PolarAxes
from matplotlib.spines import Spine
from matplotlib.transforms import Affine2D
from writing_analysis import (
    estimated_slightly_difficult_words_ratio,
    entity_density,
    determiners_frequency,
    punctuation_diversity,
    type_token_ratio,
    calculate_perplexity,
    calculate_syntactic_tree_depth,
    hapax_legomena_ratio,
    mtld,
)

# Download the NLTK resources used by the analysis helpers.
nltk.download("cmudict")
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")
d = cmudict.dict()

# Ensure the small English spaCy pipeline is available, then load it.
command = ["python3", "-m", "spacy", "download", "en_core_web_sm"]
subprocess.run(command)
nlp = spacy.load("en_core_web_sm")

with open("config.yaml", "r") as file:
    params = yaml.safe_load(file)

device = "cuda" if torch.cuda.is_available() else "cpu"
readability_model_id = params["READABILITY_MODEL_ID"]
gpt2_model = GPT2LMHeadModel.from_pretrained(readability_model_id).to(device)
gpt2_tokenizer = GPT2TokenizerFast.from_pretrained(readability_model_id)


def normalize(value, min_value, max_value):
    # Linearly rescale `value` into [0, 100], clamping anything that falls
    # outside the observed (min_value, max_value) range.
    normalized_value = ((value - min_value) * 100) / (max_value - min_value)
    return max(0, min(100, normalized_value))


def depth_analysis(input_text, bias_buster_selected):
    if bias_buster_selected:
        input_text = update(input_text)

    # Empirically observed ranges for each metric, used for normalization.
    usual_ranges = {
        "estimated_slightly_difficult_words_ratio": (
            0.2273693623058005,
            0.557383692351033,
        ),
        "entity_density": (-0.07940776754145815, 0.23491038179986615),
        "determiners_frequency": (0.012461059190031154, 0.15700934579439252),
        "punctuation_diversity": (-0.21875, 0.53125),
        "type_token_ratio": (0.33002482852189063, 1.0894414982357028),
        "calculate_perplexity": (-25.110544681549072, 82.4620680809021),
        "calculate_syntactic_tree_depth": (
            1.8380681818181812,
            10.997159090909092,
        ),
        "hapax_legomena_ratio": (0.0830971690138207, 1.0302715687215778),
        "mtld": (-84.03125000000001, 248.81875000000002),
    }

    # Compute the raw stylometric metrics.
    vocabulary_level = estimated_slightly_difficult_words_ratio(input_text, d)
    entity_ratio = entity_density(input_text, nlp)
    determiner_use = determiners_frequency(input_text, nlp)
    punctuation_variety = punctuation_diversity(input_text)
    sentence_depth = calculate_syntactic_tree_depth(input_text, nlp)
    perplexity = calculate_perplexity(
        input_text, gpt2_model, gpt2_tokenizer, device
    )
    lexical_diversity = type_token_ratio(input_text)
    unique_words = hapax_legomena_ratio(input_text)
    vocabulary_stability = mtld(input_text)

    # Normalize each metric to a 0-100 scale.
    vocabulary_level_norm = normalize(
        vocabulary_level,
        *usual_ranges["estimated_slightly_difficult_words_ratio"],
    )
    entity_ratio_norm = normalize(entity_ratio, *usual_ranges["entity_density"])
    determiner_use_norm = normalize(
        determiner_use, *usual_ranges["determiners_frequency"]
    )
    punctuation_variety_norm = normalize(
        punctuation_variety, *usual_ranges["punctuation_diversity"]
    )
    lexical_diversity_norm = normalize(
        lexical_diversity, *usual_ranges["type_token_ratio"]
    )
    unique_words_norm = normalize(
        unique_words, *usual_ranges["hapax_legomena_ratio"]
    )
    vocabulary_stability_norm = normalize(
        vocabulary_stability, *usual_ranges["mtld"]
    )
*usual_ranges["calculate_syntactic_tree_depth"] ) perplexity_norm = normalize( perplexity, *usual_ranges["calculate_perplexity"] ) features = { "Lexical Diversity": lexical_diversity_norm, "Vocabulary Level": vocabulary_level_norm, "Unique Words": unique_words_norm, "Determiner Use": determiner_use_norm, "Punctuation Variety": punctuation_variety_norm, "Sentence Depth": sentence_depth_norm, "Vocabulary Stability": vocabulary_stability_norm, "Entity Ratio": entity_ratio_norm, "Perplexity": perplexity_norm, } def radar_factory(num_vars, frame="circle"): theta = np.linspace(0, 2 * np.pi, num_vars, endpoint=False) class RadarTransform(PolarAxes.PolarTransform): def transform_path_non_affine(self, path): if path._interpolation_steps > 1: path = path.interpolated(num_vars) return Path(self.transform(path.vertices), path.codes) class RadarAxes(PolarAxes): name = "radar" PolarTransform = RadarTransform def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.set_theta_zero_location("N") def fill(self, *args, closed=True, **kwargs): return super().fill(closed=closed, *args, **kwargs) def plot(self, *args, **kwargs): lines = super().plot(*args, **kwargs) for line in lines: self._close_line(line) def _close_line(self, line): x, y = line.get_data() if x[0] != x[-1]: x = np.append(x, x[0]) y = np.append(y, y[0]) line.set_data(x, y) def set_varlabels(self, labels): self.set_thetagrids(np.degrees(theta), labels) def _gen_axes_patch(self): if frame == "circle": return Circle((0.5, 0.5), 0.5) elif frame == "polygon": return RegularPolygon( (0.5, 0.5), num_vars, radius=0.5, edgecolor="k" ) def _gen_axes_spines(self): if frame == "polygon": spine = Spine( axes=self, spine_type="circle", path=Path.unit_regular_polygon(num_vars), ) spine.set_transform( Affine2D().scale(0.5).translate(0.5, 0.5) + self.transAxes ) return {"polar": spine} register_projection(RadarAxes) return theta N = 9 theta = radar_factory(N, frame="polygon") data = features.values() labels = features.keys() fig, ax = plt.subplots( subplot_kw=dict(projection="radar"), figsize=(7.5, 5) ) ax.plot(theta, data) ax.fill(theta, data, alpha=0.4) ax.set_varlabels(labels) rgrids = np.linspace(0, 100, num=6) ax.set_rgrids( rgrids, labels=[f"{round(r)}%" for r in rgrids], fontsize=8, color="black", ) ax.grid(True, color="black", linestyle="-", linewidth=0.5, alpha=0.5) for dd, (label, value) in enumerate(zip(labels, data)): ax.text( theta[dd] + 0.1, value + 5, f"{value:.0f}", horizontalalignment="left", verticalalignment="bottom", fontsize=8, ) return fig