Update app.py
Browse files
app.py
CHANGED
@@ -198,6 +198,32 @@ def enhance_text(kenlm_model, text):
|
|
198 |
return fixed_label
|
199 |
|
200 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
201 |
def text_to_kenlm(
|
202 |
_text_file,
|
203 |
_order,
|
@@ -218,7 +244,7 @@ def text_to_kenlm(
|
|
218 |
raise gr.Error("Please add an order.")
|
219 |
|
220 |
gr.Info("Started to make the model, wait...")
|
221 |
-
|
222 |
results = []
|
223 |
|
224 |
# Read the file
|
@@ -229,14 +255,16 @@ def text_to_kenlm(
|
|
229 |
line = line.lower()
|
230 |
results.append(line)
|
231 |
|
|
|
|
|
232 |
# Write to intermediate file
|
233 |
-
intermediate_file =
|
234 |
with open(intermediate_file, "w") as f:
|
235 |
f.write(" ".join(results))
|
236 |
|
237 |
# Commands to run in the container
|
238 |
cmd = (
|
239 |
-
f"{kenlm_bin}/lmplz --temp_prefix
|
240 |
)
|
241 |
print(subprocess.run(cmd, shell=True))
|
242 |
|
@@ -281,6 +309,8 @@ def text_to_kenlm(
|
|
281 |
)
|
282 |
)
|
283 |
|
|
|
|
|
284 |
if _do_quantize:
|
285 |
file_name_quantized = (
|
286 |
f"/tmp/my_model-{_binary_type}-{_topk_words}-words.bin"
|
@@ -298,8 +328,27 @@ def text_to_kenlm(
|
|
298 |
print(subprocess.run(cmd, shell=True))
|
299 |
|
300 |
gr.Success("Model created.")
|
301 |
-
|
302 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
303 |
|
304 |
|
305 |
with gr.Blocks(
|
@@ -401,9 +450,22 @@ with gr.Blocks(
|
|
401 |
value=False,
|
402 |
)
|
403 |
|
404 |
-
|
405 |
-
|
406 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
407 |
|
408 |
gr.Button("Create").click(
|
409 |
text_to_kenlm,
|
@@ -420,7 +482,7 @@ with gr.Blocks(
|
|
420 |
topk_words,
|
421 |
do_limit_topk,
|
422 |
],
|
423 |
-
outputs=kenlm_model,
|
424 |
)
|
425 |
|
426 |
with gr.Row():
|
|
|
198 |
return fixed_label
|
199 |
|
200 |
|
def generate_files(results):
    """Derive vocabulary, token, and lexicon files (under /tmp) from corpus lines.

    Args:
        results: iterable of text lines, each a whitespace-separated sentence.

    Side effects — writes the three companion files a CTC/KenLM decoder expects:
        /tmp/model_vocab.txt   one unique word per line
        /tmp/model_tokens.txt  sorted unique characters, plus the "|" separator
        /tmp/model_lexicon.txt "word<TAB>space-separated spelling ending in |"
    """
    # Unique words, sorted so the output files are reproducible across runs
    # (a bare set()'s iteration order varies with hash randomization).
    words = sorted({word for line in results for word in line.split()})

    with open("/tmp/model_vocab.txt", "w") as f:
        f.write("\n".join(words))

    # Token inventory: every character seen in any word, plus the "|"
    # word-boundary token required by lexicon-based CTC decoders.
    tokens = set("|")
    for word in words:
        tokens.update(word)

    with open("/tmp/model_tokens.txt", "w") as f:
        f.write("\n".join(sorted(tokens)))

    # Lexicon: map each word to its space-separated spelling ending in "|".
    with open("/tmp/model_lexicon.txt", "w") as f:
        for word in words:
            spelled = " ".join(word + "|")
            f.write(f"{word}\t{spelled}\n")
227 |
def text_to_kenlm(
|
228 |
_text_file,
|
229 |
_order,
|
|
|
244 |
raise gr.Error("Please add an order.")
|
245 |
|
246 |
gr.Info("Started to make the model, wait...")
|
247 |
+
|
248 |
results = []
|
249 |
|
250 |
# Read the file
|
|
|
255 |
line = line.lower()
|
256 |
results.append(line)
|
257 |
|
258 |
+
generate_files(results)
|
259 |
+
|
260 |
# Write to intermediate file
|
261 |
+
intermediate_file = "/tmp/intermediate.txt"
|
262 |
with open(intermediate_file, "w") as f:
|
263 |
f.write(" ".join(results))
|
264 |
|
265 |
# Commands to run in the container
|
266 |
cmd = (
|
267 |
+
f"{kenlm_bin}/lmplz --temp_prefix {app_dir} --memory 90% --text {intermediate_file} --arpa /tmp/my_model.arpa -o {_order} --prune {_arpa_prune} --discount_fallback",
|
268 |
)
|
269 |
print(subprocess.run(cmd, shell=True))
|
270 |
|
|
|
309 |
)
|
310 |
)
|
311 |
|
312 |
+
generate_files(vocab_str.split("\n"))
|
313 |
+
|
314 |
if _do_quantize:
|
315 |
file_name_quantized = (
|
316 |
f"/tmp/my_model-{_binary_type}-{_topk_words}-words.bin"
|
|
|
328 |
print(subprocess.run(cmd, shell=True))
|
329 |
|
330 |
gr.Success("Model created.")
|
331 |
+
|
332 |
+
model_file = gr.DownloadButton(
|
333 |
+
value=Path(file_name), label=f"Download: {file_name}"
|
334 |
+
)
|
335 |
+
|
336 |
+
vocab_file = gr.DownloadButton(
|
337 |
+
value=Path("/tmp/model_vocab.txt"),
|
338 |
+
label="Created model_vocab.txt",
|
339 |
+
)
|
340 |
+
|
341 |
+
lexicon_file = gr.DownloadButton(
|
342 |
+
value=Path("/tmp/model_lexicon.txt"),
|
343 |
+
label="Created model_lexicon.txt",
|
344 |
+
)
|
345 |
+
|
346 |
+
tokens_file = gr.DownloadButton(
|
347 |
+
value=Path("/tmp/model_tokens.txt"),
|
348 |
+
label="Created model_tokens.txt",
|
349 |
+
)
|
350 |
+
|
351 |
+
return [model_file, vocab_file, lexicon_file, tokens_file]
|
352 |
|
353 |
|
354 |
with gr.Blocks(
|
|
|
450 |
value=False,
|
451 |
)
|
452 |
|
453 |
+
with gr.Column():
|
454 |
+
kenlm_model = gr.DownloadButton(
|
455 |
+
label="Created KenLM model",
|
456 |
+
)
|
457 |
+
|
458 |
+
vocab_file = gr.DownloadButton(
|
459 |
+
label="Created model_vocab.txt",
|
460 |
+
)
|
461 |
+
|
462 |
+
lexicon_file = gr.DownloadButton(
|
463 |
+
label="Created model_lexicon.txt",
|
464 |
+
)
|
465 |
+
|
466 |
+
tokens_file = gr.DownloadButton(
|
467 |
+
label="Created model_tokens.txt",
|
468 |
+
)
|
469 |
|
470 |
gr.Button("Create").click(
|
471 |
text_to_kenlm,
|
|
|
482 |
topk_words,
|
483 |
do_limit_topk,
|
484 |
],
|
485 |
+
outputs=[kenlm_model, vocab_file, lexicon_file, tokens_file],
|
486 |
)
|
487 |
|
488 |
with gr.Row():
|