Update app.py
Browse files
app.py
CHANGED
@@ -53,11 +53,11 @@ tech_libraries = f"""
|
|
53 |
""".strip()
|
54 |
|
55 |
|
56 |
-
def convert_and_filter_topk(output_dir, input_txt, top_k):
|
57 |
"""Convert to lowercase, count word occurrences and save top-k words to a file"""
|
58 |
|
59 |
counter = Counter()
|
60 |
-
data_lower =
|
61 |
|
62 |
print("\nConverting to lowercase and counting word occurrences ...")
|
63 |
with io.TextIOWrapper(
|
@@ -83,8 +83,7 @@ def convert_and_filter_topk(output_dir, input_txt, top_k):
|
|
83 |
print("\nSaving top {} words ...".format(top_k))
|
84 |
top_counter = counter.most_common(top_k)
|
85 |
vocab_str = "\n".join(word for word, count in top_counter)
|
86 |
-
vocab_path = "vocab-{}.txt".format(top_k)
|
87 |
-
vocab_path = os.path.join(output_dir, vocab_path)
|
88 |
with open(vocab_path, "w+") as file:
|
89 |
file.write(vocab_str)
|
90 |
|
@@ -294,7 +293,7 @@ def text_to_kenlm(
|
|
294 |
if _do_limit_topk:
|
295 |
file_name = f"/tmp/my_model-{_topk_words}-words.arpa"
|
296 |
|
297 |
-
_, vocab_str = convert_and_filter_topk(
|
298 |
|
299 |
print(
|
300 |
subprocess.run(
|
|
|
53 |
""".strip()
|
54 |
|
55 |
|
56 |
+
def convert_and_filter_topk(input_txt, top_k):
|
57 |
"""Convert to lowercase, count word occurrences and save top-k words to a file"""
|
58 |
|
59 |
counter = Counter()
|
60 |
+
data_lower = "/tmp/lower.txt.gz"
|
61 |
|
62 |
print("\nConverting to lowercase and counting word occurrences ...")
|
63 |
with io.TextIOWrapper(
|
|
|
83 |
print("\nSaving top {} words ...".format(top_k))
|
84 |
top_counter = counter.most_common(top_k)
|
85 |
vocab_str = "\n".join(word for word, count in top_counter)
|
86 |
+
vocab_path = "/tmp/vocab-{}.txt".format(top_k)
|
|
|
87 |
with open(vocab_path, "w+") as file:
|
88 |
file.write(vocab_str)
|
89 |
|
|
|
293 |
if _do_limit_topk:
|
294 |
file_name = f"/tmp/my_model-{_topk_words}-words.arpa"
|
295 |
|
296 |
+
_, vocab_str = convert_and_filter_topk(intermediate_file, _topk_words)
|
297 |
|
298 |
print(
|
299 |
subprocess.run(
|