Spaces:

ak5005
/

derrobot

Sleeping

File size: 6,918 Bytes

import language_tool_python
from transformers import AutoTokenizer, AutoModelForMaskedLM
import torch
import numpy as np
import spacy
import wordfreq

tool = language_tool_python.LanguageTool('en-US')
model_name="distilbert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForMaskedLM.from_pretrained(model_name)
model.eval()

nlp = spacy.load("en_core_web_sm")

def __get_word_pr_score(word, lang="en") -> list[float]:
    return -np.log(wordfreq.word_frequency(word, lang) + 1e-12)

def pseudo_perplexity(text, threshold=20, max_len=128):
    """
    We want to return
    {
        "score": normalized value from 0 to 100,
        "errors": [
            {
                "start": word index,
                "end": word index,
                "message": "error message"
            }
        ]
    }
    """
    encoding = tokenizer(text, return_tensors="pt", return_offsets_mapping=True)
    input_ids = encoding["input_ids"][0]
    print(input_ids)
    offset_mapping = encoding["offset_mapping"][0]
    print(offset_mapping)
    tokens = tokenizer.convert_ids_to_tokens(input_ids)

    # Group token indices by word based on offset mapping
    word_groups = []
    current_group = []

    prev_end = None

    for i, (start, end) in enumerate(offset_mapping):
        if input_ids[i] in tokenizer.all_special_ids:
            continue  # skip special tokens like [CLS] and [SEP]

        if prev_end is not None and start > prev_end:
            # Word boundary detected → start new group
            word_groups.append(current_group)
            current_group = [i]
        else:
            current_group.append(i)

        prev_end = end

    # Append final group
    if current_group:
        word_groups.append(current_group)

    loss_values = []
    tok_loss = []
    for group in word_groups:
        if group[0] == 0 or group[-1] == len(input_ids) - 1:
            continue  # skip [CLS] and [SEP]

        masked = input_ids.clone()
        for i in group:
            masked[i] = tokenizer.mask_token_id

        with torch.no_grad():
            outputs = model(masked.unsqueeze(0))
            logits = outputs.logits[0]

        log_probs = []
        for i in group:
            probs = torch.softmax(logits[i], dim=-1)
            true_token_id = input_ids[i].item()
            prob = probs[true_token_id].item()
            log_probs.append(np.log(prob + 1e-12))
            tok_loss.append(-np.log(prob + 1e-12))

        word_loss = -np.sum(log_probs) / len(log_probs)
        word = tokenizer.decode(input_ids[group[0]])
        word_loss -= 0.6 * __get_word_pr_score(word)
        loss_values.append(word_loss)
    
    print(loss_values)

    errors = []
    for i, l in enumerate(loss_values):
        if l < threshold:
            continue
        errors.append({
            "start": i,
            "end": i,
            "message": f"Perplexity {l} over threshold {threshold}"
        })

    print(tok_loss)
    s_ppl = np.mean(tok_loss)
    print(s_ppl)

    res = {
        "score": __fluency_score_from_ppl(s_ppl),
        "errors": errors
    }

    return res

def __fluency_score_from_ppl(ppl, midpoint=8, steepness=0.3):
    """
    Use a logistic function to map perplexity to 0–100.
    Midpoint is the PPL where score is 50.
    Steepness controls curve sharpness.
    """
    score = 100 / (1 + np.exp(steepness * (ppl - midpoint)))
    return round(score, 2)

def grammar_errors(text) -> tuple[int, list[str]]:
    """

    Returns
      int: number of grammar errors
      list: grammar errors
        tuple: (start, end, error message)
    """

    matches = tool.check(text)
    grammar_score = len(matches)/len(text.split())

    r = []
    for match in matches:
        words = text.split()
        char_to_word = []
        current_char = 0

        for i, word in enumerate(words):
            for _ in range(len(word)):
                char_to_word.append(i)
            current_char += len(word)
            if current_char < len(text):  # Account for spaces between words
                char_to_word.append(i)
                current_char += 1

        start = char_to_word[match.offset]
        end = char_to_word[match.offset + match.errorLength - 1] + 1
        r.append({"start": start, "end": end, "message": match.message})

    struct_err = __check_structural_grammar(text)
    r.extend(struct_err)

    res = {
        "score": __grammar_score_from_prob(grammar_score),
        "errors": r
    }

    return res

def __grammar_score_from_prob(error_ratio):
    """
    Transform the number of errors divided by words into a score from 0 to 100.
    Steepness controls how quickly the score drops as errors increase.
    """
    score = 100*(1-error_ratio)
    return round(score, 2)


def __check_structural_grammar(text):
    doc = nlp(text)
    issues = []

    # 1. Missing main verb (ROOT)
    root_verbs = [tok for tok in doc if tok.dep_ == "ROOT" and tok.pos_ in {"VERB", "AUX"}]
    if not root_verbs:
        root_root = [tok for tok in doc if tok.dep_ == "ROOT"]
        token = root_root[0] if root_root else doc[0]
        issues.append({
            "start": token.i,
            "end": token.i + 1,
            "message": "Sentence is missing a main verb (no ROOT verb)."
        })

    # 2. Verb(s) present but no subject
    verbs = [tok for tok in doc if tok.pos_ in {"VERB", "AUX"}]
    subjects = [tok for tok in doc if tok.dep_ in {"nsubj", "nsubjpass"}]
    if verbs and not subjects:
        for verb in verbs:
            issues.append({
                "start": verb.i,
                "end": verb.i + 1,
                "message": "Sentence has verb(s) but no subject (possible fragment)."
            })

    # 3. Dangling prepositions
    for tok in doc:
        if tok.pos_ == "ADP" and len(list(tok.children)) == 0:
            issues.append({
                "start": tok.i,
                "end": tok.i + 1,
                "message": f"Dangling preposition '{tok.text}' (no object or complement)."
            })

    # 4. Noun pile-up (no verbs, all tokens are nominal)
    if not any(tok.pos_ in {"VERB", "AUX"} for tok in doc) and \
       all(tok.pos_ in {"NOUN", "PROPN", "ADJ", "DET", "NUM"} for tok in doc if tok.is_alpha):
        token = doc[0]
        issues.append({
            "start": token.i,
            "end": token.i + 1,
            "message": "Sentence lacks a verb or any verbal structure (nominal phrase pile-up)."
        })

    # 5. Multiple ROOTs (possible run-on)
    root_count = sum(1 for tok in doc if tok.dep_ == "ROOT")
    if root_count > 1:
        for tok in doc:
            if tok.dep_ == "ROOT":
                issues.append({
                    "start": tok.i,
                    "end": tok.i + 1,
                    "message": "Sentence has multiple ROOTs — possible run-on sentence."
                })

    return issues