Spaces:

ak5005
/

derrobot

Sleeping

File size: 6,496 Bytes

b837a10

import language_tool_python
from transformers import AutoTokenizer, AutoModelForMaskedLM
import torch
import numpy as np
import spacy

# Shared resources, created once when the module is imported.

# LanguageTool checker configured with the 'en-US' language code.
tool = language_tool_python.LanguageTool('en-US')
# Checkpoint name used to load both the tokenizer and the masked language model.
model_name="distilbert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForMaskedLM.from_pretrained(model_name)
# Put the model in evaluation mode; this file only runs it under torch.no_grad().
model.eval()

# spaCy pipeline used for part-of-speech tags and dependency labels.
nlp = spacy.load("en_core_web_sm")

def pseudo_perplexity(text, max_len=128, threshold=12):
    """
    Score the fluency of ``text`` with a masked-LM pseudo-perplexity.

    Each token between [CLS] and [SEP] is masked in turn and the negative
    log-probability the model assigns to the original token is recorded as
    that token's loss. The exponential of the mean loss is mapped to a
    0-100 score, and the longest run of consecutive tokens whose loss is
    above ``threshold`` is reported as an error span.

    Args:
        text: Text to score.
        max_len: Maximum number of token ids (special tokens included)
            that is accepted.
        threshold: Per-token loss above which a token is considered
            suspicious.

    Returns:
        {
            "score": value from 0 to 100 (0.0 when there is no token to score),
            "errors": [
                {
                    "start": index of the first flagged token,
                    "end": index one past the last flagged token,
                    "message": "error message"
                }
            ]
        }
        Indices are tokenizer-token positions counted after [CLS]; they are
        not whitespace-separated word positions. "errors" is empty when no
        token's loss exceeds ``threshold``.

    Raises:
        ValueError: If the tokenized input has more than ``max_len`` ids.
    """
    input_ids = tokenizer.encode(text, return_tensors="pt")[0]

    if len(input_ids) > max_len:
        raise ValueError(f"Input too long for model (>{max_len} tokens).")

    loss_values = []

    # Inference only: none of the forward passes needs gradients.
    with torch.no_grad():
        for i in range(1, len(input_ids) - 1):  # skip [CLS] and [SEP]
            masked_input = input_ids.clone()
            masked_input[i] = tokenizer.mask_token_id

            logits = model(masked_input.unsqueeze(0)).logits[0, i]
            probs = torch.softmax(logits, dim=-1)

            prob_true_token = probs[input_ids[i].item()].item()
            # The epsilon keeps log() finite if the probability underflows to 0.
            loss_values.append(-np.log(prob_true_token + 1e-12))

    # Nothing between the special tokens: the mean loss would be NaN.
    if not loss_values:
        return {"score": 0.0, "errors": []}

    errors = []
    flagged = _longest_run_above(loss_values, threshold)
    if flagged is not None:
        start, end = flagged
        # Report the worst loss inside the span that is actually flagged.
        peak_loss = max(loss_values[start:end])
        errors.append({
            "start": start,
            "end": end,
            "message": f"Perplexity above threshold: {peak_loss}",
        })

    ppl = np.exp(np.mean(loss_values))

    return {
        "score": __fluency_score_from_ppl(ppl),
        "errors": errors,
    }


def _longest_run_above(values, threshold):
    """Return the half-open (start, end) of the longest run of values > threshold, or None."""
    best = None
    run_start = None
    for idx, value in enumerate(values):
        if value > threshold:
            if run_start is None:
                run_start = idx
            # Strict comparison keeps the earliest run when lengths tie.
            if best is None or idx + 1 - run_start > best[1] - best[0]:
                best = (run_start, idx + 1)
        else:
            run_start = None
    return best

def __fluency_score_from_ppl(ppl, midpoint=20, steepness=0.3):
    """
    Map a perplexity onto a 0-100 fluency score with a logistic curve.

    ``midpoint`` is the perplexity that yields a score of 50 and
    ``steepness`` sets how sharply the score falls around that point.
    The result is rounded to two decimals.
    """
    exponent = steepness * (ppl - midpoint)
    denominator = 1 + np.exp(exponent)
    return round(100 / denominator, 2)

def grammar_errors(text: str) -> dict:
    """
    Check ``text`` with LanguageTool and the structural grammar checks.

    Args:
        text: Text to check.

    Returns:
        {
            "score": value computed by __grammar_score_from_prob from the
                     number of LanguageTool matches per whitespace-separated
                     word (0.0 for empty or whitespace-only text),
            "errors": [
                {
                    "start": index of the first affected word,
                    "end": index one past the last affected word,
                    "message": "error message"
                }
            ]
        }
        For LanguageTool matches the indices refer to the words of
        ``text.split()``. Issues from __check_structural_grammar are
        appended exactly as that function returns them.
    """
    words = text.split()
    # Blank text has no words: avoid dividing by zero and skip the checkers.
    if not words:
        return {"score": 0.0, "errors": []}

    matches = tool.check(text)
    error_ratio = len(matches) / len(words)

    # Built once for all matches; it depends only on the text.
    char_to_word = _char_to_word_indices(text)
    last_char = len(char_to_word) - 1

    errors = []
    for match in matches:
        # Clamp so an offset at/after the end of the text or a zero-length
        # match can neither index out of range nor give end < start.
        first = min(max(match.offset, 0), last_char)
        last = min(max(match.offset + match.errorLength - 1, first), last_char)
        errors.append({
            "start": char_to_word[first],
            "end": char_to_word[last] + 1,
            "message": match.message,
        })

    errors.extend(__check_structural_grammar(text))

    return {
        "score": __grammar_score_from_prob(error_ratio),
        "errors": errors,
    }


def _char_to_word_indices(text):
    """Map every character offset of ``text`` to the index of its word in ``text.split()``."""
    indices = []
    word_index = -1
    in_word = False
    for ch in text:
        if ch.isspace():
            in_word = False
        elif not in_word:
            in_word = True
            word_index += 1
        # Whitespace is attributed to the preceding word; leading
        # whitespace (before any word) is attributed to word 0.
        indices.append(max(word_index, 0))
    return indices

def __grammar_score_from_prob(error_ratio, steepness=10):
    """
    Transform the number of errors divided by words into a score from 0 to 100.

    A ratio of 0 (no errors) scores 100 and the score decays towards 0 as
    the ratio grows. Steepness controls how quickly the score drops as
    errors increase. The result is rounded to two decimals.
    """
    # Negative ratios are clamped so the score can never exceed 100.
    ratio = max(error_ratio, 0)
    # A plain logistic 100 / (1 + e^(k*r)) is only 50 at r = 0; doubling the
    # numerator makes an error-free text reach the full 100.
    score = 200 / (1 + np.exp(steepness * ratio))
    return round(score, 2)


def __check_structural_grammar(text):
    """
    Run heuristic structural checks on the spaCy parse of ``text``.

    Checks, in order: no verbal ROOT, verbs without any subject, prepositions
    without children, purely nominal text without verbs, and more than one
    ROOT token.

    Args:
        text: Text to parse with the module-level spaCy pipeline.

    Returns:
        A list of {"start", "end", "message"} dicts, one per issue, where
        "start" is the spaCy token index of the flagged token and "end" is
        "start" + 1. The list is empty when the parse contains no tokens.
    """
    doc = nlp(text)
    issues = []

    # An empty Doc has no token to anchor an issue to (doc[0] would raise).
    if len(doc) == 0:
        return issues

    verbal_pos = {"VERB", "AUX"}
    nominal_pos = {"NOUN", "PROPN", "ADJ", "DET", "NUM"}
    roots = [tok for tok in doc if tok.dep_ == "ROOT"]
    verbs = [tok for tok in doc if tok.pos_ in verbal_pos]

    # 1. Missing main verb (ROOT)
    if not any(tok.pos_ in verbal_pos for tok in roots):
        anchor = roots[0] if roots else doc[0]
        issues.append(_token_issue(
            anchor, "Sentence is missing a main verb (no ROOT verb)."))

    # 2. Verb(s) present but no subject
    has_subject = any(tok.dep_ in {"nsubj", "nsubjpass"} for tok in doc)
    if verbs and not has_subject:
        for verb in verbs:
            issues.append(_token_issue(
                verb, "Sentence has verb(s) but no subject (possible fragment)."))

    # 3. Dangling prepositions
    for tok in doc:
        if tok.pos_ == "ADP" and not list(tok.children):
            issues.append(_token_issue(
                tok, f"Dangling preposition '{tok.text}' (no object or complement)."))

    # 4. Noun pile-up (no verbs, all alphabetic tokens are nominal)
    if not verbs and all(tok.pos_ in nominal_pos for tok in doc if tok.is_alpha):
        issues.append(_token_issue(
            doc[0], "Sentence lacks a verb or any verbal structure (nominal phrase pile-up)."))

    # 5. Multiple ROOTs (possible run-on)
    # NOTE(review): spaCy's parser presumably yields one ROOT per detected
    # sentence, so multi-sentence input would trigger this - confirm intended.
    if len(roots) > 1:
        for tok in roots:
            issues.append(_token_issue(
                tok, "Sentence has multiple ROOTs — possible run-on sentence."))

    return issues


def _token_issue(token, message):
    """Build the issue dict covering the single spaCy token ``token``."""
    return {"start": token.i, "end": token.i + 1, "message": message}