|
import language_tool_python
import numpy as np
import spacy
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM

# Rule-based grammar checker (English).
tool = language_tool_python.LanguageTool('en-US')

# Masked language model used for pseudo-perplexity scoring.
model_name = "distilbert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForMaskedLM.from_pretrained(model_name)
model.eval()

# spaCy pipeline for dependency-based structural checks.
nlp = spacy.load("en_core_web_sm")
|
|
def pseudo_perplexity(text, max_len=128):
    """
    Score the fluency of `text` with a masked-LM pseudo-perplexity.

    Returns:
        {
            "score": float from 0 to 100 (higher = more fluent),
            "errors": [
                {
                    "start": token index into the tokenized input,
                    "end": token index (exclusive),
                    "message": "error message"
                }
            ]
        }
    """
    input_ids = tokenizer.encode(text, return_tensors="pt")[0]

    if len(input_ids) > max_len:
        raise ValueError(f"Input too long for model (>{max_len} tokens).")

    loss_values = []

    # Mask each position in turn (skipping [CLS] and [SEP]) and record how
    # surprised the model is by the true token: -log p(token | context).
    for i in range(1, len(input_ids) - 1):
        masked_input = input_ids.clone()
        masked_input[i] = tokenizer.mask_token_id

        with torch.no_grad():
            outputs = model(masked_input.unsqueeze(0))
        logits = outputs.logits[0, i]
        probs = torch.softmax(logits, dim=-1)

        true_token_id = input_ids[i].item()
        prob_true_token = probs[true_token_id].item()
        log_prob = np.log(prob_true_token + 1e-12)
        loss_values.append(-log_prob)
|
    # Find the longest contiguous run of tokens whose masking loss exceeds
    # the threshold; that span is reported as the most disfluent region.
    threshold = 12
    longest_start, longest_end = 0, 0
    current_start, current_end = 0, 0
    max_length = 0
    current_peak = 0.0  # highest loss within the current run
    longest_peak = 0.0  # highest loss within the longest run found so far

    for i, loss in enumerate(loss_values):
        if loss > threshold:
            if current_start == current_end:  # opening a new run
                current_start = i
                current_peak = 0.0
            current_end = i + 1
            current_peak = max(current_peak, loss)
        else:
            if current_end - current_start > max_length:
                longest_start, longest_end = current_start, current_end
                max_length = current_end - current_start
                longest_peak = current_peak
            current_start, current_end = 0, 0

    # Close out a run that extends to the final token.
    if current_end - current_start > max_length:
        longest_start, longest_end = current_start, current_end
        max_length = current_end - current_start
        longest_peak = current_peak

    longest_sequence = (longest_start, longest_end)

    # Guard against empty loss lists (inputs of two tokens or fewer).
    ppl = np.exp(np.mean(loss_values)) if loss_values else 1.0
|
    errors = []
    if max_length > 0:
        errors.append({
            "start": longest_sequence[0],
            "end": longest_sequence[1],
            "message": f"Perplexity above threshold: {longest_peak:.2f}"
        })

    return {
        "score": __fluency_score_from_ppl(ppl),
        "errors": errors
    }
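
# Illustrative use of pseudo_perplexity (a sketch; exact scores depend on
# the downloaded model weights, so none are asserted here):
#
#   result = pseudo_perplexity("She walk to the store yesterday.")
#   result["score"]   # 0-100 fluency estimate
#   result["errors"]  # highest-loss token span, if any loss exceeded the threshold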
|
|
|
def __fluency_score_from_ppl(ppl, midpoint=20, steepness=0.3):
    """
    Map perplexity to a 0-100 score with a logistic function.
    `midpoint` is the PPL where the score is 50; `steepness` controls
    how sharply the curve falls.
    """
    score = 100 / (1 + np.exp(steepness * (ppl - midpoint)))
    return round(score, 2)
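
# Sanity check of the mapping (pure arithmetic, independent of any model):
# at ppl == midpoint the score is exactly 50; at ppl = 1 it is
# 100 / (1 + exp(0.3 * (1 - 20))) ≈ 99.7; large ppl values approach 0.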
|
|
|
def grammar_errors(text) -> dict:
    """
    Check `text` with LanguageTool and map each match to word indices.

    Returns:
        {
            "score": float from 0 to 100,
            "errors": [
                {
                    "start": word index,
                    "end": word index (exclusive),
                    "message": "error message"
                }
            ]
        }
    """
    matches = tool.check(text)
    words = text.split()
    error_ratio = len(matches) / max(len(words), 1)
|
    # Build the character-to-word-index map once; the original rebuilt it
    # for every match. The mapping assumes single-space separators, which
    # is what LanguageTool's character offsets are aligned against here.
    char_to_word = []
    current_char = 0
    for i, word in enumerate(words):
        for _ in range(len(word)):
            char_to_word.append(i)
        current_char += len(word)
        if current_char < len(text):
            char_to_word.append(i)  # separator character after this word
            current_char += 1

    r = []
    for match in matches:
        start = char_to_word[match.offset]
        end = char_to_word[match.offset + match.errorLength - 1] + 1
        r.append({"start": start, "end": end, "message": match.message})

    # Structural checks report spaCy token indices, which can differ from
    # the whitespace word indices used above.
    struct_err = __check_structural_grammar(text)
    r.extend(struct_err)
|
    return {
        "score": __grammar_score_from_prob(error_ratio),
        "errors": r
    }
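
# Illustrative use of grammar_errors (a sketch; matches depend on the
# installed LanguageTool rules):
#
#   out = grammar_errors("He go to school every days.")
#   out["score"]   # falls as the errors-per-word ratio rises
#   out["errors"]  # word spans from LanguageTool plus spaCy structural checks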
|
|
|
def __grammar_score_from_prob(error_ratio, steepness=10):
    """
    Map the errors-per-word ratio to a score from 0 to 100.
    `steepness` controls how quickly the score drops as errors increase.
    """
    # The factor of 200 makes a ratio of 0 map to a score of 100; the
    # previous 100 / (1 + exp(...)) form capped a perfect text at 50.
    score = 200 / (1 + np.exp(steepness * error_ratio))
    return round(score, 2)
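
# Worked example of the mapping (pure arithmetic): a clean text (ratio 0)
# scores 200 / (1 + exp(0)) = 100; one error per ten words (ratio 0.1)
# gives 200 / (1 + exp(1)) ≈ 53.8; one error per word drives it near 0.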
|
|
|
|
|
def __check_structural_grammar(text):
    doc = nlp(text)
    issues = []

    # 1. Missing main verb: no ROOT token that is a verb or auxiliary.
    root_verbs = [tok for tok in doc if tok.dep_ == "ROOT" and tok.pos_ in {"VERB", "AUX"}]
    if not root_verbs:
        root_root = [tok for tok in doc if tok.dep_ == "ROOT"]
        token = root_root[0] if root_root else doc[0]
        issues.append({
            "start": token.i,
            "end": token.i + 1,
            "message": "Sentence is missing a main verb (no ROOT verb)."
        })
|
    # 2. Verb(s) present but no subject: likely a fragment.
    verbs = [tok for tok in doc if tok.pos_ in {"VERB", "AUX"}]
    subjects = [tok for tok in doc if tok.dep_ in {"nsubj", "nsubjpass"}]
    if verbs and not subjects:
        for verb in verbs:
            issues.append({
                "start": verb.i,
                "end": verb.i + 1,
                "message": "Sentence has verb(s) but no subject (possible fragment)."
            })
|
    # 3. Dangling preposition: an adposition with no children in the
    # dependency tree, i.e. no object or complement attached.
    for tok in doc:
        if tok.pos_ == "ADP" and len(list(tok.children)) == 0:
            issues.append({
                "start": tok.i,
                "end": tok.i + 1,
                "message": f"Dangling preposition '{tok.text}' (no object or complement)."
            })
|
    # 4. Nominal pile-up: no verbal token anywhere, only noun-phrase material.
    if not any(tok.pos_ in {"VERB", "AUX"} for tok in doc) and \
       all(tok.pos_ in {"NOUN", "PROPN", "ADJ", "DET", "NUM"} for tok in doc if tok.is_alpha):
        token = doc[0]
        issues.append({
            "start": token.i,
            "end": token.i + 1,
            "message": "Sentence lacks a verb or any verbal structure (nominal phrase pile-up)."
        })
|
    # 5. Multiple ROOTs. spaCy assigns one ROOT per parsed sentence, so this
    # also fires on well-formed multi-sentence input; it is only meaningful
    # when the caller passes a single sentence.
    root_count = sum(1 for tok in doc if tok.dep_ == "ROOT")
    if root_count > 1:
        for tok in doc:
            if tok.dep_ == "ROOT":
                issues.append({
                    "start": tok.i,
                    "end": tok.i + 1,
                    "message": "Sentence has multiple ROOTs (possible run-on sentence)."
                })

    return issues
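

if __name__ == "__main__":
    # Minimal smoke test; a sketch assuming the transformers weights, the
    # spaCy model, and a local LanguageTool instance are all available.
    # Outputs are model- and rule-dependent, so nothing is asserted.
    sample = "He go to the store yesterday and buy three apple."
    print("fluency:", pseudo_perplexity(sample))
    print("grammar:", grammar_errors(sample))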
|
|