Aidan Phillips committed
Commit 0885169 · Parent(s): f5893dd

accuracy scoring pretty good

Files changed:
- categories/accuracy.py +146 -0
- categories/fluency.py +103 -66
- scorer.ipynb +44 -22
categories/accuracy.py
CHANGED
@@ -0,0 +1,146 @@
+import string
+
+import torch
+from scipy.spatial.distance import cosine
+from simalign import SentenceAligner
+from transformers import AutoModel, AutoTokenizer
+
+# setup global variables on import (bad practice, but whatever)
+# --------------------------------------------------------------
+
+aligner = SentenceAligner(model="distilbert-base-multilingual-cased", layer=6)
+tokenizer = AutoTokenizer.from_pretrained("distilbert-base-multilingual-cased")
+model = AutoModel.from_pretrained("distilbert-base-multilingual-cased")
+
+
+def accuracy(src_sentence: str, trg_sentence: str) -> dict:
+    """
+    Calculate the accuracy of a translation by comparing the source and target
+    sentences.
+
+    Parameters:
+        src_sentence (str): The source sentence.
+        trg_sentence (str): The target sentence.
+
+    Returns:
+        dict: A dictionary containing the accuracy score and errors.
+    """
+    # Preprocess both sentences
+    src_sentence = __preprocess_text(src_sentence)
+    trg_sentence = __preprocess_text(trg_sentence)
+
+    r = __get_alignment_score(src_sentence, trg_sentence)
+    score = __get_bertscore(src_sentence, trg_sentence)
+
+    res = {"score": __bertscore_to_percentage(score), "errors": r}
+    return res
+
+
+def __preprocess_text(text: str) -> str:
+    """
+    Remove punctuation and convert text to lowercase.
+
+    Parameters:
+        text (str): The text to preprocess.
+
+    Returns:
+        str: The preprocessed text.
+    """
+    # Remove punctuation
+    text = text.translate(str.maketrans("", "", string.punctuation))
+    # Convert to lowercase
+    text = text.lower()
+    return text
+
+
+def __get_bertscore(src_sentence: str, trg_sentence: str) -> float:
+    """
+    Get the BERTScore between two sentences.
+
+    Parameters:
+        src_sentence (str): The source sentence.
+        trg_sentence (str): The target sentence.
+
+    Returns:
+        float: The BERTScore.
+    """
+    # Tokenize and generate embeddings
+    inputs_src = tokenizer(
+        src_sentence, return_tensors="pt", padding=True, truncation=True
+    )
+    inputs_trg = tokenizer(
+        trg_sentence, return_tensors="pt", padding=True, truncation=True
+    )
+
+    with torch.no_grad():
+        outputs_src = model(**inputs_src)
+        outputs_trg = model(**inputs_trg)
+
+    # Get sentence embeddings by averaging token embeddings (from last hidden state)
+    src_embedding = torch.mean(outputs_src.last_hidden_state, dim=1).squeeze().numpy()
+    trg_embedding = torch.mean(outputs_trg.last_hidden_state, dim=1).squeeze().numpy()
+
+    # Calculate cosine similarity (1 - cosine distance)
+    similarity = 1 - cosine(src_embedding, trg_embedding)
+
+    return similarity
+
+
+def __bertscore_to_percentage(similarity: float) -> float:
+    """
+    Convert the BERTScore cosine similarity to a percentage score (0-100).
+
+    Parameters:
+        similarity (float): The cosine similarity from BERTScore.
+
+    Returns:
+        float: A score from 0 to 100.
+    """
+    # Scale the similarity score from [-1, 1] range to [0, 100] (rarely negative)
+    scaled_score = max(((similarity) / 2) * 100, 0)
+    return round(scaled_score, 2)
+
+
+def __get_alignment_score(src_sentence: str, trg_sentence: str) -> list:
+    """
+    Get the alignment score between two sentences.
+
+    Parameters:
+        src_sentence (str): The source sentence.
+        trg_sentence (str): The target sentence.
+
+    Returns:
+        list: Mistranslations
+    """
+    src_list = src_sentence.split()
+    trg_list = trg_sentence.split()
+
+    # The output is a dictionary with different matching methods.
+    # Each method has a list of pairs indicating the indexes of aligned words
+    # (the alignments are zero-indexed).
+    alignments = aligner.get_word_aligns(src_list, trg_list)
+
+    src_aligns = {x[0] for x in alignments["inter"]}
+    trg_aligns = {x[1] for x in alignments["inter"]}
+
+    mistranslations = []
+    for i in range(len(src_list)):
+        if i not in src_aligns:
+            mistranslations.append(
+                {
+                    "start": i,
+                    "end": i,
+                    "message": f"Word {src_list[i]} possibly mistranslated or omitted",
+                }
+            )
+
+    for i in range(len(trg_list)):
+        if i not in trg_aligns:
+            mistranslations.append(
+                {
+                    "start": i,
+                    "end": i,
+                    "message": f"Word {trg_list[i]} possibly mistranslated or added erroneously",
+                }
+            )
+
+    return mistranslations
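
Taken together, accuracy() is the file's public entry point: it strips punctuation and lowercases both sentences, flags words that SimAlign's "inter" alignment leaves unmatched, and rescales an embedding cosine similarity into a 0-100 score. A minimal usage sketch (the sentence pair here is invented for illustration):

    from categories.accuracy import accuracy

    result = accuracy("Das ist ein Test.", "This is a test.")
    print(result["score"])        # cosine similarity rescaled to 0-100
    for err in result["errors"]:  # words left unaligned by SimAlign
        print(err["message"])
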
categories/fluency.py
CHANGED
@@ -1,28 +1,29 @@
 import language_tool_python
-from transformers import AutoTokenizer, AutoModelForMaskedLM
-import torch
 import numpy as np
 import spacy
+import torch
 import wordfreq
+from transformers import AutoModelForMaskedLM, AutoTokenizer
 
 # setup global variables on import (bad practice, but whatever)
+# --------------------------------------------------------------
 
 # grammar checker
+tool = language_tool_python.LanguageTool("en-US")
 
 # masked language model and tokenizer from huggingface
-model_name="distilbert-base-multilingual-cased"
+model_name = "distilbert-base-multilingual-cased"
 model = AutoModelForMaskedLM.from_pretrained(model_name)
 model.eval()
-tokenizer = AutoTokenizer.from_pretrained(model_name)
+tokenizer = AutoTokenizer.from_pretrained(model_name) # tokenizer
 
 # spacy model for parsing
 nlp = spacy.load("en_core_web_sm")
 
-def __get_rarity(word, lang="en") -> float:
+
+def __get_rarity(word: str, lang: str = "en") -> float:
     """
+    Returns the rarity of a word in the given language. wordfreq returns a value
     between 0 and 1, where 1 is the most common word. Therefore, taking the log results
     in a value between 0 (log 1 = 0) and -27.63 (log 1e-12). We then negate it so super
     rare words have a high score and common words have a low score.
@@ -30,20 +31,21 @@ def __get_rarity(word, lang="en") -> float:
     Parameters:
         word (str): The word to check.
         lang (str): The language to check. Default is "en".
+
     Returns:
         float: The rarity of the word.
     """
     return -np.log(wordfreq.word_frequency(word, lang) + 1e-12)
 
-def __produce_groupings(offset_mapping, input_ids):
+
+def __produce_groupings(offset_mapping: list, input_ids: list) -> list:
     """
     Produce groupings of tokens that are part of the same word.
 
     Parameters:
         offset_mapping (list): The offset mapping of the tokens.
         input_ids (list): The input ids of the tokens.
+
     Returns:
         list: A list of groupings of tokens.
     """
@@ -64,10 +66,11 @@ def __produce_groupings(offset_mapping, input_ids):
     # Append final group
     if current_group:
         res.append(current_group)
+
     return res
 
-def pseudo_perplexity(text, threshold=4, max_len=128):
+
+def pseudo_perplexity(text: str, threshold: int = 4, max_len: int = 128) -> dict:
     """
     Calculate the pseudo-perplexity of a text using a masked language model. Return all
     words that exceed a threshold of "adjusted awkwardness". The threshold is a measure
@@ -77,7 +80,7 @@ def pseudo_perplexity(text, threshold=4, max_len=128):
         text (str): The text to check.
         threshold (float): The threshold for awkwardness. Default is 4.
         max_len (int): The maximum length of the text. Default is 128.
+
     Returns:
         dict: A dictionary containing the score and errors.
     """
@@ -94,7 +97,7 @@ def pseudo_perplexity(text, threshold=4, max_len=128):
     for group in word_groups:
         # Skip special tokens (CLS and SEP)
         if group[0] == 0 or group[-1] == len(input_ids) - 1:
+            continue
 
         # Mask the word group
         masked = input_ids.clone()
@@ -119,7 +122,9 @@ def pseudo_perplexity(text, threshold=4, max_len=128):
         word_loss = -np.sum(log_probs) / len(log_probs)
         # Adjust the loss based on the rarity of the word
         word = tokenizer.decode(input_ids[group[0]])
+        word_loss -= 0.6 * __get_rarity(
+            word
+        ) # subtract rarity (rare words reduce loss)
         loss_values.append(word_loss)
 
     # Structure the results for output
@@ -129,22 +134,24 @@ def pseudo_perplexity(text, threshold=4, max_len=128):
     for i, l in enumerate(loss_values):
         if l < threshold:
             continue
+        errors.append(
+            {
+                "start": i,
+                "end": i,
+                "message": f"Adjusted likelihood {l} over threshold {threshold}",
+            }
+        )
 
-    res = {
-        "score": __fluency_score(average_loss),
-        "errors": errors
-    }
+    res = {"score": __fluency_score(average_loss), "errors": errors}
 
     return res
 
-def __fluency_score(loss, midpoint=5, steepness=0.3):
+
+def __fluency_score(
+    loss: float, midpoint: float = 5.0, steepness: float = 0.3
+) -> float:
     """
+    Transform the loss into a score from 0 to 100. Steepness controls how quickly the
     score drops as loss increases and midpoint controls the loss at which the score is
     50.
@@ -152,20 +159,21 @@ def __fluency_score(loss, midpoint=5, steepness=0.3):
         loss (float): The loss to transform.
         midpoint (float): The loss at which the score is 50. Default is 5.
         steepness (float): The steepness of the curve. Default is 0.3.
+
     Returns:
         float: The score from 0 to 100.
     """
     score = 100 / (1 + np.exp(steepness * (loss - midpoint)))
     return round(score, 2)
 
-def grammar_errors(text) -> tuple[int, list[str]]:
+
+def grammar_errors(text: str) -> dict:
     """
     Check the grammar of a text using a grammar checker and a structural grammar check.
 
     Parameters:
         text (str): The text to check.
+
     Returns:
         dict: A dictionary containing the score and errors.
     """
@@ -195,83 +203,112 @@ def grammar_errors(text) -> tuple[int, list[str]]:
 
     grammar_score = len(r) / len(text.split())
 
-    res = {
-        "score": __grammar_score_from_prob(grammar_score),
-        "errors": r
-    }
+    res = {"score": __grammar_score_from_prob(grammar_score), "errors": r}
 
     return res
 
+
+def __grammar_score_from_prob(error_ratio: float) -> float:
     """
     Transform the number of errors divided by words into a score from 0 to 100.
+
+    Parameters:
+        error_ratio (float): The ratio of errors to words.
+
+    Returns:
+        float: The score from 0 to 100.
     """
-    score = 100*(1-error_ratio)
+    score = 100 * (1 - error_ratio)
     return round(score, 2)
 
 
-def __check_structural_grammar(text):
+def __check_structural_grammar(text: str) -> list:
+    """
+    Check the structural grammar of a text using spaCy.
+
+    Parameters:
+        text (str): The text to check.
+
+    Returns:
+        list: A list of structural grammar errors.
+    """
     doc = nlp(text)
     issues = []
 
     # 1. Missing main verb (ROOT)
+    root_verbs = [
+        tok for tok in doc if tok.dep_ == "ROOT" and tok.pos_ in {"VERB", "AUX"}
+    ]
     if not root_verbs:
         root_root = [tok for tok in doc if tok.dep_ == "ROOT"]
         token = root_root[0] if root_root else doc[0]
+        issues.append(
+            {
+                "start": token.i,
+                "end": token.i + 1,
+                "message": "Sentence is missing a main verb (no ROOT verb).",
+            }
+        )
 
     # 2. Verb(s) present but no subject
     verbs = [tok for tok in doc if tok.pos_ in {"VERB", "AUX"}]
     subjects = [tok for tok in doc if tok.dep_ in {"nsubj", "nsubjpass"}]
     if verbs and not subjects:
         for verb in verbs:
+            issues.append(
+                {
+                    "start": verb.i,
+                    "end": verb.i + 1,
+                    "message": "Sentence has verb(s) but no subject (possible fragment).",
+                }
+            )
 
     # 3. Dangling prepositions
     for tok in doc:
         if tok.pos_ == "ADP" and len(list(tok.children)) == 0:
+            issues.append(
+                {
+                    "start": tok.i,
+                    "end": tok.i + 1,
+                    "message": f"Dangling preposition '{tok.text}' (no object or complement).",
+                }
+            )
 
     # 4. Noun pile-up (no verbs, all tokens are nominal)
-    if not any(tok.pos_ in {"VERB", "AUX"} for tok in doc) and
+    if not any(tok.pos_ in {"VERB", "AUX"} for tok in doc) and all(
+        tok.pos_ in {"NOUN", "PROPN", "ADJ", "DET", "NUM"}
+        for tok in doc
+        if tok.is_alpha
+    ):
         token = doc[0]
+        issues.append(
+            {
+                "start": token.i,
+                "end": token.i + 1,
+                "message": "Sentence lacks a verb or any verbal structure (nominal phrase pile-up).",
+            }
+        )
 
     # 5. Multiple ROOTs (possible run-on)
     root_count = sum(1 for tok in doc if tok.dep_ == "ROOT")
     if root_count > 1:
         for tok in doc:
             if tok.dep_ == "ROOT":
+                issues.append(
+                    {
+                        "start": tok.i,
+                        "end": tok.i + 1,
+                        "message": "Sentence has multiple ROOTs — possible run-on sentence.",
+                    }
+                )
 
     return issues
 
 
+# Unit tests can go here eventually
 def main():
     pass
 
+
 if __name__ == "__main__":
     main()
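
Two scoring curves are worth calling out. __fluency_score is a reversed logistic: it returns exactly 50 when loss equals midpoint, and steepness sets how fast the score falls on either side. A self-contained sketch of the same formula (re-derived here, not imported from the module):

    import numpy as np

    def fluency_score(loss, midpoint=5.0, steepness=0.3):
        # Reversed logistic curve: higher loss -> lower score; score(midpoint) == 50.
        return round(100 / (1 + np.exp(steepness * (loss - midpoint))), 2)

    print(fluency_score(5.0))   # 50.0 exactly at the midpoint
    print(fluency_score(10.0))  # ~18.24, well below the midpoint

__grammar_score_from_prob, by contrast, is linear in the error ratio (100 * (1 - errors/words)), so a text with one flagged error per ten words scores 90.
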
scorer.ipynb
CHANGED
@@ -4,78 +4,100 @@
    "cell_type": "code",
    "execution_count": 1,
    "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2025-04-08 22:18:10,848 - simalign.simalign - INFO - Initialized the EmbeddingLoader with model: distilbert-base-multilingual-cased\n",
+      "Initialized the EmbeddingLoader with model: distilbert-base-multilingual-cased\n"
+     ]
+    }
+   ],
    "source": [
+    "from categories.fluency import *\n",
+    "from categories.accuracy import *"
    ]
   },
   {
    "cell_type": "code",
+   "execution_count": 2,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
+      "Sentence: The cat sat the quickly up apples banana.\n"
      ]
     }
    ],
    "source": [
+    "src_sent = \"Das ist ein Test.\" # Example source sentence\n",
+    "trg_sent = input(f\"{src_sent}: \") # Prompt the user to enter a sentence\n",
     "\n",
+    "if trg_sent == \"\":\n",
+    "    trg_sent = \"The cat sat the quickly up apples banana.\"\n",
     "\n",
+    "print(\"Sentence:\", trg_sent) # Print the input sentence\n",
     "\n",
+    "err = grammar_errors(trg_sent) # Call the function to execute the grammar error checking\n",
+    "flu = pseudo_perplexity(trg_sent, threshold=3.1) # Call the function to execute the fluency checking\n",
+    "acc = accuracy(src_sent, trg_sent) # Call the function to execute the accuracy checking"
    ]
   },
   {
    "cell_type": "code",
+   "execution_count": 3,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
+      "An apostrophe may be missing.: apples banana.\n",
+      "Adjusted likelihood 4.8056646935577145 over threshold 3.1: sat\n",
+      "Adjusted likelihood 4.473408069089179 over threshold 3.1: the\n",
+      "Adjusted likelihood 4.732453441503642 over threshold 3.1: quickly\n",
+      "Adjusted likelihood 5.1115574262487735 over threshold 3.1: apples\n",
+      "Word ist possibly mistranslated or omitted: cat\n",
+      "Word ein possibly mistranslated or omitted: sat\n",
+      "Word sat possibly mistranslated or added erroneously: sat\n",
+      "Word the possibly mistranslated or added erroneously: the\n",
+      "Word quickly possibly mistranslated or added erroneously: quickly\n",
+      "Word up possibly mistranslated or added erroneously: up\n",
+      "Word apples possibly mistranslated or added erroneously: apples\n",
+      "Word banana possibly mistranslated or added erroneously: banana.\n"
      ]
     }
    ],
    "source": [
-    "combined_err = err[\"errors\"] + flu[\"errors\"] # Combine the error counts from both functions\n",
+    "combined_err = err[\"errors\"] + flu[\"errors\"] + acc[\"errors\"] # Combine the errors from all three checks\n",
     "\n",
     "for e in combined_err:\n",
+    "    substr = \" \".join(trg_sent.split(\" \")[e[\"start\"]:e[\"end\"]+1])\n",
     "    print(f\"{e['message']}: {substr}\") # Print the error messages\n"
    ]
   },
   {
    "cell_type": "code",
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
+      "Fluency Score: 76.61500000000001\n",
+      "Accuracy Score: 24.45\n"
      ]
     }
    ],
    "source": [
     "fluency_score = 0.5 * err[\"score\"] + 0.5 * flu[\"score\"] # Calculate the fluency score\n",
+    "print(\"Fluency Score:\", round(fluency_score, 2)) # Print the fluency score\n",
+    "\n",
+    "print(\"Accuracy Score:\", acc[\"score\"]) # Print the accuracy score"
    ]
   }
  ],
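
One detail that makes the notebook's aggregation work: grammar_errors, pseudo_perplexity, and accuracy all return the same {"score": float, "errors": [{"start", "end", "message"}]} shape, so their error lists concatenate directly. A hypothetical helper illustrating that shared contract (combine_results is my own name, not part of the repo):

    def combine_results(*results, weights=None):
        # Each result is {"score": float, "errors": list}; weight scores equally by default.
        weights = weights or [1 / len(results)] * len(results)
        score = sum(w * r["score"] for w, r in zip(weights, results))
        errors = [e for r in results for e in r["errors"]]
        return {"score": round(score, 2), "errors": errors}

    # e.g. combine_results(err, flu) reproduces the notebook's 0.5/0.5 fluency score.
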