Aidan Phillips committed
Commit 7159c31 · Parent(s): f97ee3d
bug fix, threshold tweak
Browse files:
- categories/fluency.py +6 -10
- scorer.ipynb +12 -31
categories/fluency.py CHANGED

@@ -61,7 +61,6 @@ def pseudo_perplexity(text, threshold=20, max_len=128):
         word_groups.append(current_group)
 
     loss_values = []
-    tok_loss = []
     for group in word_groups:
         if group[0] == 0 or group[-1] == len(input_ids) - 1:
             continue  # skip [CLS] and [SEP]
@@ -80,14 +79,11 @@ def pseudo_perplexity(text, threshold=20, max_len=128):
             true_token_id = input_ids[i].item()
             prob = probs[true_token_id].item()
             log_probs.append(np.log(prob + 1e-12))
-            tok_loss.append(-np.log(prob + 1e-12))
 
         word_loss = -np.sum(log_probs) / len(log_probs)
         word = tokenizer.decode(input_ids[group[0]])
         word_loss -= 0.6 * __get_word_pr_score(word)
         loss_values.append(word_loss)
-
-    # print(loss_values)
 
     errors = []
     for i, l in enumerate(loss_values):
@@ -99,12 +95,10 @@ def pseudo_perplexity(text, threshold=20, max_len=128):
                 "message": f"Perplexity {l} over threshold {threshold}"
             })
 
-
-    s_ppl = np.mean(tok_loss)
-    # print(s_ppl)
+    error_rate = len(errors) / len(loss_values)
 
     res = {
-        "score":
+        "score": __grammar_score_from_prob(error_rate),
         "errors": errors
     }
 
@@ -129,7 +123,6 @@ def grammar_errors(text) -> tuple[int, list[str]]:
     """
 
    matches = tool.check(text)
-    grammar_score = len(matches)/len(text.split())
 
    r = []
    for match in matches:
@@ -150,7 +143,10 @@ def grammar_errors(text) -> tuple[int, list[str]]:
        r.append({"start": start, "end": end, "message": match.message})
 
    struct_err = __check_structural_grammar(text)
-
+    for e in struct_err:
+        r.append(e)
+
+    grammar_score = len(r) / len(text.split())
 
    res = {
        "score": __grammar_score_from_prob(grammar_score),
scorer.ipynb CHANGED

@@ -11,32 +11,14 @@
    },
    {
     "cell_type": "code",
-    "execution_count":
+    "execution_count": 20,
     "metadata": {},
     "outputs": [
      {
       "name": "stdout",
       "output_type": "stream",
       "text": [
-        "Sentence: The cat sat the quickly up apples banana.\n",
-        "tensor([  101, 10117, 41163, 20694, 10105, 23590, 10741, 72894, 11268, 99304,\n",
-        "        10219,   119,   102])\n",
-        "tensor([[ 0,  0],\n",
-        "        [ 0,  3],\n",
-        "        [ 4,  7],\n",
-        "        [ 8, 11],\n",
-        "        [12, 15],\n",
-        "        [16, 23],\n",
-        "        [24, 26],\n",
-        "        [27, 30],\n",
-        "        [30, 33],\n",
-        "        [34, 38],\n",
-        "        [38, 40],\n",
-        "        [40, 41],\n",
-        "        [ 0,  0]])\n",
-        "[np.float64(0.00905743383887514), np.float64(1.1257066968185931), np.float64(4.8056646935577145), np.float64(4.473408069089179), np.float64(4.732453441503642), np.float64(3.028744414819041), np.float64(5.1115574262487735), np.float64(-0.6523823890571343)]\n",
-        "[np.float64(1.7636628003080927), np.float64(6.955413759407024), np.float64(10.828562153345375), np.float64(6.228013435558396), np.float64(10.258657658689351), np.float64(6.635744767229443), np.float64(11.163667119285972), np.float64(10.499412826924114), np.float64(11.96113847381264), np.float64(10.010973250156082), np.float64(2.470404176100153)]\n",
-        "0.5208035409471965\n"
+        "Sentence: The cat sat the quickly up apples banana.\n"
        ]
       }
      ],
@@ -49,12 +31,12 @@
      "print(\"Sentence:\", s) # Print the input sentence\n",
      "\n",
      "err = grammar_errors(s) # Call the function to execute the grammar error checking\n",
-     "flu = pseudo_perplexity(s, threshold=
+     "flu = pseudo_perplexity(s, threshold=3.5) # Call the function to execute the fluency checking"
     ]
    },
    {
     "cell_type": "code",
-    "execution_count":
+    "execution_count": 21,
     "metadata": {},
     "outputs": [
      {
@@ -62,11 +44,10 @@
       "output_type": "stream",
       "text": [
        "An apostrophe may be missing.: apples banana.\n",
-       "Perplexity 4.8056646935577145 over threshold 2.5: sat\n",
-       "Perplexity 4.473408069089179 over threshold 2.5: the\n",
-       "Perplexity 4.732453441503642 over threshold 2.5: quickly\n",
-       "Perplexity
-       "Perplexity 5.1115574262487735 over threshold 2.5: apples\n"
+       "Perplexity 4.8056646935577145 over threshold 3.5: sat\n",
+       "Perplexity 4.473408069089179 over threshold 3.5: the\n",
+       "Perplexity 4.732453441503642 over threshold 3.5: quickly\n",
+       "Perplexity 5.1115574262487735 over threshold 3.5: apples\n"
       ]
      }
     ],
@@ -80,20 +61,20 @@
    },
    {
     "cell_type": "code",
-    "execution_count":
+    "execution_count": 22,
     "metadata": {},
     "outputs": [
      {
       "name": "stdout",
       "output_type": "stream",
       "text": [
-       "87.5
-       "Fluency Score:
+       "87.5 50.0\n",
+       "Fluency Score: 68.75\n"
       ]
      }
     ],
     "source": [
-     "fluency_score = 0.
+     "fluency_score = 0.5 * err[\"score\"] + 0.5 * flu[\"score\"] # Calculate the fluency score\n",
      "print(err[\"score\"], flu[\"score\"]) # Print the individual scores\n",
      "print(\"Fluency Score:\", fluency_score) # Print the fluency score"
     ]