Spaces:

polygraf-ai
/

copyright_checker

Running

File size: 7,555 Bytes

import importlib
import subprocess
import sys

import matplotlib.pyplot as plt
import nltk
import numpy as np
import spacy
import torch
import yaml
from matplotlib.patches import Circle, RegularPolygon
from matplotlib.path import Path
from matplotlib.projections import register_projection
from matplotlib.projections.polar import PolarAxes
from matplotlib.spines import Spine
from matplotlib.transforms import Affine2D
from nltk import word_tokenize
from nltk.corpus import cmudict, stopwords
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

from predictors import update, update_main, correct_text, split_text
from writing_analysis import (
    estimated_slightly_difficult_words_ratio,
    entity_density,
    determiners_frequency,
    punctuation_diversity,
    type_token_ratio,
    calculate_perplexity,
    calculate_syntactic_tree_depth,
    hapax_legomena_ratio,
    mtld,
)

# Fetch the NLTK resources at import time. "cmudict" backs the pronunciation
# dictionary `d` below; the others are presumably needed by the imported
# helper modules -- TODO confirm.
for resource in ("cmudict", "punkt", "stopwords", "wordnet"):
    nltk.download(resource)
d = cmudict.dict()

# Install the spaCy model with the interpreter that is running this module so
# it lands in the same environment; fall back to "python3" if the interpreter
# path is unavailable.
command = [sys.executable or "python3", "-m", "spacy", "download", "en_core_web_sm"]
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # spaCy raises OSError when the model package cannot be found. Download it
    # once, fail loudly if the download itself fails, and make sure the freshly
    # installed package is visible to the import system before retrying.
    subprocess.run(command, check=True)
    importlib.invalidate_caches()
    nlp = spacy.load("en_core_web_sm")


# Read the runtime settings from config.yaml (resolved against the current
# working directory).
with open("config.yaml") as file:
    params = yaml.safe_load(file)

# Prefer the GPU when torch reports one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# One checkpoint id from the config supplies both the language model and its
# tokenizer; the model is moved onto the selected device.
readability_model_id = params["READABILITY_MODEL_ID"]
gpt2_model = GPT2LMHeadModel.from_pretrained(readability_model_id)
gpt2_model = gpt2_model.to(device)
gpt2_tokenizer = GPT2TokenizerFast.from_pretrained(readability_model_id)


def normalize(value, min_value, max_value):
    """Rescale *value* linearly from ``[min_value, max_value]`` to ``[0, 100]``.

    Results outside the target interval are clamped to 0 or 100.

    Args:
        value: The raw number to rescale.
        min_value: The input that maps to 0.
        max_value: The input that maps to 100.

    Returns:
        A number between 0 and 100 inclusive.
    """
    span = max_value - min_value
    if span == 0:
        # Degenerate range: there is no slope to interpolate along, so clamp
        # directly instead of dividing by zero.
        return 0 if value <= min_value else 100
    normalized_value = ((value - min_value) * 100) / span
    return max(0, min(100, normalized_value))


def depth_analysis(input_text, bias_buster_selected):
    """Score *input_text* on nine writing metrics and plot them as a radar chart.

    Every metric is computed by the matching ``writing_analysis`` helper,
    rescaled to 0-100 with ``normalize`` against a fixed per-metric range and
    drawn on its own spoke of a polygon-framed radar chart.

    Args:
        input_text: The text to analyse.
        bias_buster_selected: When truthy, the text is first passed through
            ``update`` and the returned text is analysed instead.

    Returns:
        The matplotlib figure that holds the radar chart.
    """
    if bias_buster_selected:
        input_text = update(input_text)

    # (lower, upper) bounds handed to normalize() for each metric, keyed by
    # the name of the helper that produces the raw score.
    usual_ranges = {
        "estimated_slightly_difficult_words_ratio": (
            0.2273693623058005,
            0.557383692351033,
        ),
        "entity_density": (-0.07940776754145815, 0.23491038179986615),
        "determiners_frequency": (0.012461059190031154, 0.15700934579439252),
        "punctuation_diversity": (-0.21875, 0.53125),
        "type_token_ratio": (0.33002482852189063, 1.0894414982357028),
        "calculate_perplexity": (-25.110544681549072, 82.4620680809021),
        "calculate_syntactic_tree_depth": (
            1.8380681818181812,
            10.997159090909092,
        ),
        "hapax_legomena_ratio": (0.0830971690138207, 1.0302715687215778),
        "mtld": (-84.03125000000001, 248.81875000000002),
    }

    # Raw metric scores.
    vocabulary_level = estimated_slightly_difficult_words_ratio(input_text, d)
    entity_ratio = entity_density(input_text, nlp)
    determiner_use = determiners_frequency(input_text, nlp)
    punctuation_variety = punctuation_diversity(input_text)
    sentence_depth = calculate_syntactic_tree_depth(input_text, nlp)
    perplexity = calculate_perplexity(
        input_text, gpt2_model, gpt2_tokenizer, device
    )
    lexical_diversity = type_token_ratio(input_text)
    unique_words = hapax_legomena_ratio(input_text)
    vocabulary_stability = mtld(input_text)

    def scaled(score, metric):
        """Rescale a raw score to 0-100 using that metric's bounds."""
        return normalize(score, *usual_ranges[metric])

    # Chart label -> normalized score; insertion order is the spoke order.
    features = {
        "Lexical Diversity": scaled(lexical_diversity, "type_token_ratio"),
        "Vocabulary Level": scaled(
            vocabulary_level, "estimated_slightly_difficult_words_ratio"
        ),
        "Unique Words": scaled(unique_words, "hapax_legomena_ratio"),
        "Determiner Use": scaled(determiner_use, "determiners_frequency"),
        "Punctuation Variety": scaled(
            punctuation_variety, "punctuation_diversity"
        ),
        "Sentence Depth": scaled(
            sentence_depth, "calculate_syntactic_tree_depth"
        ),
        "Vocabulary Stability": scaled(vocabulary_stability, "mtld"),
        "Entity Ratio": scaled(entity_ratio, "entity_density"),
        "Perplexity": scaled(perplexity, "calculate_perplexity"),
    }

    def radar_factory(num_vars, frame="circle"):
        """Register the "radar" projection and return its spoke angles.

        Args:
            num_vars: Number of spokes on the chart.
            frame: Shape of the surrounding frame, "circle" or "polygon".

        Returns:
            Array of ``num_vars`` evenly spaced angles in radians.
        """
        theta = np.linspace(0, 2 * np.pi, num_vars, endpoint=False)

        class RadarTransform(PolarAxes.PolarTransform):
            def transform_path_non_affine(self, path):
                # Paths with more than one interpolation step are resampled
                # to num_vars steps so they are drawn as straight polygon
                # edges rather than circular arcs.
                if path._interpolation_steps > 1:
                    path = path.interpolated(num_vars)
                return Path(self.transform(path.vertices), path.codes)

        class RadarAxes(PolarAxes):
            name = "radar"
            PolarTransform = RadarTransform

            def __init__(self, *args, **kwargs):
                super().__init__(*args, **kwargs)
                # Put the first spoke at the top of the chart.
                self.set_theta_zero_location("N")

            def fill(self, *args, closed=True, **kwargs):
                """Fill the polygon, closing it by default."""
                return super().fill(*args, closed=closed, **kwargs)

            def plot(self, *args, **kwargs):
                """Plot lines and close each one back onto its first point."""
                lines = super().plot(*args, **kwargs)
                for line in lines:
                    self._close_line(line)
                # Hand the artists back, as Axes.plot does.
                return lines

            def _close_line(self, line):
                x, y = line.get_data()
                if x[0] != x[-1]:
                    x = np.append(x, x[0])
                    y = np.append(y, y[0])
                    line.set_data(x, y)

            def set_varlabels(self, labels):
                self.set_thetagrids(np.degrees(theta), labels)

            def _gen_axes_patch(self):
                # The patch is centred at (0.5, 0.5) with radius 0.5 in axes
                # coordinates.
                if frame == "circle":
                    return Circle((0.5, 0.5), 0.5)
                elif frame == "polygon":
                    return RegularPolygon(
                        (0.5, 0.5), num_vars, radius=0.5, edgecolor="k"
                    )
                raise ValueError(f"Unknown value for 'frame': {frame}")

            def _gen_axes_spines(self):
                if frame == "circle":
                    # The stock polar spines already form a circle.
                    return super()._gen_axes_spines()
                elif frame == "polygon":
                    spine = Spine(
                        axes=self,
                        spine_type="circle",
                        path=Path.unit_regular_polygon(num_vars),
                    )
                    # unit_regular_polygon has radius 1 centred at the origin;
                    # scale and shift it to radius 0.5 centred at (0.5, 0.5)
                    # in axes coordinates.
                    spine.set_transform(
                        Affine2D().scale(0.5).translate(0.5, 0.5)
                        + self.transAxes
                    )
                    return {"polar": spine}
                raise ValueError(f"Unknown value for 'frame': {frame}")

        register_projection(RadarAxes)
        return theta

    # Materialise the dict views: the plotting calls expect real sequences.
    labels = list(features.keys())
    data = list(features.values())
    theta = radar_factory(len(features), frame="polygon")

    # NOTE(review): pyplot keeps every figure created through plt.subplots
    # alive until it is closed; the caller presumably needs to close the
    # returned figure once it has been rendered -- confirm.
    fig, ax = plt.subplots(
        subplot_kw=dict(projection="radar"), figsize=(7.5, 5)
    )
    ax.plot(theta, data)
    ax.fill(theta, data, alpha=0.4)
    ax.set_varlabels(labels)

    # Radial grid every 20 points, labelled as percentages.
    rgrids = np.linspace(0, 100, num=6)
    ax.set_rgrids(
        rgrids,
        labels=[f"{round(r)}%" for r in rgrids],
        fontsize=8,
        color="black",
    )
    ax.grid(True, color="black", linestyle="-", linewidth=0.5, alpha=0.5)

    # Print each score next to its vertex, nudged slightly off the point.
    for angle, value in zip(theta, data):
        ax.text(
            angle + 0.1,
            value + 5,
            f"{value:.0f}",
            horizontalalignment="left",
            verticalalignment="bottom",
            fontsize=8,
        )

    return fig