Init
Browse files- .dockerignore +3 -0
- .gitignore +7 -0
- Dockerfile +65 -0
- README.md +43 -6
- app.py +433 -0
- demo.txt +24 -0
- requirements.txt +1 -0
.dockerignore
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
.ruff_cache/
|
2 |
+
.venv/
|
3 |
+
kenlm/
|
.gitignore
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
.idea/
|
2 |
+
.venv/
|
3 |
+
.ruff_cache/
|
4 |
+
|
5 |
+
flagged/
|
6 |
+
|
7 |
+
intermediate.txt
|
Dockerfile
ADDED
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
FROM python:3.12.9-bookworm
|
2 |
+
|
3 |
+
ENV DEBIAN_FRONTEND=noninteractive
|
4 |
+
|
5 |
+
RUN apt-get update && \
|
6 |
+
apt-get upgrade -y && \
|
7 |
+
apt-get install -y --no-install-recommends \
|
8 |
+
git \
|
9 |
+
git-lfs \
|
10 |
+
wget \
|
11 |
+
curl \
|
12 |
+
ca-certificates \
|
13 |
+
# python build dependencies \
|
14 |
+
build-essential cmake libicu-dev libboost-system-dev libboost-thread-dev libboost-program-options-dev libboost-test-dev libeigen3-dev zlib1g-dev libbz2-dev liblzma-dev \
|
15 |
+
libssl-dev \
|
16 |
+
zlib1g-dev \
|
17 |
+
libreadline-dev \
|
18 |
+
libsqlite3-dev \
|
19 |
+
libncursesw5-dev \
|
20 |
+
xz-utils \
|
21 |
+
tk-dev \
|
22 |
+
libxml2-dev \
|
23 |
+
libxmlsec1-dev \
|
24 |
+
libffi-dev \
|
25 |
+
liblzma-dev \
|
26 |
+
# gradio dependencies \
|
27 |
+
ffmpeg \
|
28 |
+
&& apt-get clean \
|
29 |
+
&& rm -rf /var/lib/apt/lists/*
|
30 |
+
|
31 |
+
RUN python -m ensurepip --upgrade && python -m pip install --upgrade pip
|
32 |
+
|
33 |
+
RUN useradd -m -u 1001 hf-space
|
34 |
+
USER hf-space
|
35 |
+
|
36 |
+
ENV HOME=/home/hf-space \
|
37 |
+
PATH=/home/hf-space/.local/bin:${PATH} \
|
38 |
+
PYTHONPATH=/home/hf-space/app \
|
39 |
+
PYTHONUNBUFFERED=1 \
|
40 |
+
GRADIO_ALLOW_FLAGGING=never \
|
41 |
+
GRADIO_NUM_PORTS=1 \
|
42 |
+
GRADIO_SERVER_NAME=0.0.0.0 \
|
43 |
+
GRADIO_THEME=huggingface \
|
44 |
+
SYSTEM=spaces \
|
45 |
+
HF_HOME=/home/hf-space/app/hf-home
|
46 |
+
|
47 |
+
COPY --chown=hf-space:hf-space . ${HOME}/app
|
48 |
+
|
49 |
+
WORKDIR ${HOME}/app
|
50 |
+
|
51 |
+
RUN wget -O - https://kheafield.com/code/kenlm.tar.gz | tar xz && \
|
52 |
+
mkdir kenlm/build && \
|
53 |
+
cd kenlm/build && \
|
54 |
+
cmake .. && \
|
55 |
+
make -j2 && \
|
56 |
+
cd ../..
|
57 |
+
|
58 |
+
RUN mkdir ${HF_HOME} && chmod a+rwx ${HF_HOME}
|
59 |
+
|
60 |
+
RUN pip install --no-cache-dir -r /home/hf-space/app/requirements.txt
|
61 |
+
|
62 |
+
# Install KenLM module
|
63 |
+
RUN pip install https://github.com/kpu/kenlm/archive/master.zip
|
64 |
+
|
65 |
+
CMD ["python", "app.py"]
|
README.md
CHANGED
@@ -1,10 +1,47 @@
|
|
1 |
---
|
2 |
-
|
3 |
-
|
4 |
-
colorFrom: gray
|
5 |
-
colorTo: green
|
6 |
sdk: docker
|
7 |
-
|
|
|
|
|
|
|
8 |
---
|
9 |
|
10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
---
|
2 |
+
license: apache-2.0
|
3 |
+
title: KenLM UI
|
|
|
|
|
4 |
sdk: docker
|
5 |
+
emoji: 📖
|
6 |
+
colorFrom: green
|
7 |
+
colorTo: gray
|
8 |
+
short_description: 'Score texts and build KenLMs'
|
9 |
---
|
10 |
|
11 |
+
# KenLM UI
|
12 |
+
|
13 |
+
## Install
|
14 |
+
|
15 |
+
```shell
|
16 |
+
uv venv --python 3.12.9
|
17 |
+
|
18 |
+
source .venv/bin/activate
|
19 |
+
|
20 |
+
uv pip install -r requirements.txt
|
21 |
+
|
22 |
+
uv pip install https://github.com/kpu/kenlm/archive/master.zip
|
23 |
+
```
|
24 |
+
|
25 |
+
## Build KenLM in a container
|
26 |
+
|
27 |
+
```
|
28 |
+
git clone https://github.com/kpu/kenlm/
|
29 |
+
|
30 |
+
mkdir kenlm/build
|
31 |
+
cd kenlm/build
|
32 |
+
|
33 |
+
cmake ..
|
34 |
+
make -j2
|
35 |
+
```
|
36 |
+
|
37 |
+
## Build image
|
38 |
+
|
39 |
+
```shell
|
40 |
+
docker build -t kenlm-trainer-gradio .
|
41 |
+
```
|
42 |
+
|
43 |
+
## Run
|
44 |
+
|
45 |
+
```shell
|
46 |
+
docker run -it --rm -p 8888:7860 --name kenlm-trainer kenlm-trainer-gradio
|
47 |
+
```
|
app.py
ADDED
@@ -0,0 +1,433 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
https://github.com/mozilla/DeepSpeech/blob/master/data/lm/generate_lm.py
|
3 |
+
"""
|
4 |
+
|
5 |
+
import os
|
6 |
+
import gzip
|
7 |
+
import io
|
8 |
+
import sys
|
9 |
+
import subprocess
|
10 |
+
import functools
|
11 |
+
|
12 |
+
from importlib.metadata import version
|
13 |
+
from collections import Counter
|
14 |
+
from pathlib import Path
|
15 |
+
|
16 |
+
import gradio as gr
|
17 |
+
|
18 |
+
try:
|
19 |
+
import kenlm
|
20 |
+
except ImportError:
|
21 |
+
print("Please install `kenlm` library.")
|
22 |
+
|
23 |
+
# Config
title = "KenLM UI"

# Paths baked into the Docker image (see Dockerfile): the app is copied to
# /home/hf-space/app and the kenlm CLI tools are compiled into kenlm/build/bin.
app_dir = "/home/hf-space/app"
kenlm_bin = f"{app_dir}/kenlm/build/bin"

# Example rows for the "Create KenLM model" tab: [text corpus, order, lowercase].
examples = [
    ["demo.txt", 3, True],
]

# Markdown rendered at the top of the page.
description_head = f"""
# {title}

## Overview

This app gives you ability to debug KenLM models, enhance text using a trained model, and create a new KenLM model (Kneser-Ney) from a text corpus.
""".strip()


# Runtime information rendered at the bottom of the page.
tech_env = f"""
#### Environment

- Python: {sys.version}
""".strip()

# Installed library versions, resolved at import time via importlib.metadata.
tech_libraries = f"""
#### Libraries

- kenlm: {version("kenlm")}
- gradio: {version("gradio")}
""".strip()
|
54 |
+
|
55 |
+
|
56 |
+
def convert_and_filter_topk(output_dir, input_txt, top_k):
    """Convert a corpus to lowercase, count word occurrences and save the
    top-k words to a vocabulary file.

    Args:
        output_dir: directory where ``lower.txt.gz`` and ``vocab-<k>.txt``
            are written.
        input_txt: path to the input corpus, plain text or gzip-compressed
            (detected by the ``.gz`` extension).
        top_k: number of most frequent words to keep in the vocabulary.

    Returns:
        Tuple ``(data_lower, vocab_str)``: the path of the lowercased gzip
        corpus and the newline-joined top-k vocabulary string.
    """
    counter = Counter()
    data_lower = os.path.join(output_dir, "lower.txt.gz")

    print("\nConverting to lowercase and counting word occurrences ...")
    with io.TextIOWrapper(
        io.BufferedWriter(gzip.open(data_lower, "w+")), encoding="utf-8"
    ) as file_out:
        # Open the input file either from input.txt or input.txt.gz
        _, file_extension = os.path.splitext(input_txt)
        if file_extension == ".gz":
            file_in = io.TextIOWrapper(
                io.BufferedReader(gzip.open(input_txt)), encoding="utf-8"
            )
        else:
            file_in = open(input_txt, encoding="utf-8")

        # Close the input file even if counting fails part-way through
        # (the original leaked the handle on error).
        with file_in:
            for line in file_in:
                line_lower = line.lower()
                counter.update(line_lower.split())
                file_out.write(line_lower)

    # Save top-k words
    print("\nSaving top {} words ...".format(top_k))
    top_counter = counter.most_common(top_k)
    vocab_str = "\n".join(word for word, count in top_counter)
    vocab_path = os.path.join(output_dir, "vocab-{}.txt".format(top_k))
    # Write explicitly as UTF-8 so the vocabulary does not depend on the
    # platform's default locale encoding (the original omitted the encoding).
    with open(vocab_path, "w+", encoding="utf-8") as file:
        file.write(vocab_str)

    print("\nCalculating word statistics ...")
    total_words = sum(counter.values())
    # Guard the empty-corpus case: the original divided by zero and indexed
    # into an empty top_counter below.
    if total_words == 0:
        print(" Your text file is empty; wrote an empty vocabulary")
        return data_lower, vocab_str
    print(" Your text file has {} words in total".format(total_words))
    print(" It has {} unique words".format(len(counter)))
    top_words_sum = sum(count for word, count in top_counter)
    word_fraction = (top_words_sum / total_words) * 100
    print(
        " Your top-{} words are {:.4f} percent of all words".format(
            top_k, word_fraction
        )
    )
    print(' Your most common word "{}" occurred {} times'.format(*top_counter[0]))
    last_word, last_count = top_counter[-1]
    print(
        ' The least common word in your top-k is "{}" with {} times'.format(
            last_word, last_count
        )
    )
    # Walk upward from the tail to report where the tail-count plateau ends.
    for i, (w, c) in enumerate(reversed(top_counter)):
        if c > last_count:
            print(
                ' The first word with {} occurrences is "{}" at place {}'.format(
                    c, w, len(top_counter) - 1 - i
                )
            )
            break

    return data_lower, vocab_str
|
119 |
+
|
120 |
+
|
121 |
+
def inference_model(kenlm_model, text):
    """Score `text` with an uploaded KenLM model.

    Produces the full-sentence log10 score, then a per-n-gram breakdown
    (probability, matched n-gram length, matched words), and finally the
    list of out-of-vocabulary tokens.

    Args:
        kenlm_model: filesystem path of the uploaded model file.
        text: the sentence to score.

    Returns:
        A newline-joined report string.

    Raises:
        gr.Error: when the model or the text is missing.
    """
    if not kenlm_model:
        raise gr.Error("Please upload your KenLM model.")

    if not text:
        raise gr.Error("Please paste the text to score.")

    model = kenlm.Model(kenlm_model)

    # Whole-sentence score first, then a separator.
    lines = [f"Score: {model.score(text, bos=True, eos=True)}", "---"]

    # Per-position scores with the n-gram each probability was matched on.
    tokens = ["<s>"] + text.split() + ["</s>"]
    for idx, (logprob, ngram_len, is_oov) in enumerate(model.full_scores(text)):
        matched = " ".join(tokens[idx + 2 - ngram_len : idx + 2])
        lines.append(f"{logprob} {ngram_len}: {matched}")
        if is_oov:
            lines.append(f'\t"{tokens[idx + 1]}" is an OOV')

    lines.append("---")

    # List every token absent from the model vocabulary.
    lines.extend(f'"{token}" is an OOV' for token in tokens if token not in model)

    return "\n".join(lines)
|
153 |
+
|
154 |
+
|
155 |
+
def score(lm, word, context):
    """Score a single word given a KenLM context state.

    Args:
        lm: a ``kenlm`` language model instance.
        word: the token to score.
        context: ``kenlm.State`` holding the preceding context.

    Returns:
        Tuple ``(log_prob, new_context)`` where ``new_context`` is the state
        after consuming ``word``. Out-of-vocabulary words get a fixed -42
        penalty instead of the model's own OOV score.
    """
    new_context = kenlm.State()
    # BaseFullScore fills `new_context` in place and reports OOV status.
    full_score = lm.BaseFullScore(context, word, new_context)
    if full_score.oov:
        return -42, new_context  # default OOV score looks too high
    return full_score.log_prob, new_context
|
161 |
+
|
162 |
+
|
163 |
+
# Cache sub-segmentations across the recursion tree. NOTE(review): the cache
# key includes `lm` and `context`, which are kenlm extension objects —
# presumably hashed by identity, so only states created within the same
# recursion reuse entries; confirm kenlm.State is hashable.
@functools.lru_cache(maxsize=2**10)
def segment(lm, text, context=None, maxlen=20):
    """Split space-less `text` into the most probable word sequence.

    Tries every prefix of up to `maxlen` characters as the next "word",
    scores it with `score`, recurses on the remainder, and keeps the split
    with the highest total log-probability.

    Args:
        lm: a ``kenlm`` language model instance.
        text: the remaining unsegmented characters.
        context: ``kenlm.State`` after the words chosen so far; ``None``
            means start from the model's null context.
        maxlen: maximum candidate word length in characters.

    Returns:
        Tuple ``(log_prob, words)`` for the best segmentation; ``(0.0, [])``
        when `text` is empty.
    """
    if context is None:
        context = kenlm.State()
        # Initialize a context that does not assume a sentence start.
        lm.NullContextWrite(context)

    # Base case: nothing left to segment.
    if not text:
        return 0.0, []

    textlen = min(len(text), maxlen)
    # Every (prefix, rest) split of up to the first `maxlen` characters.
    splits = [(text[: i + 1], text[i + 1 :]) for i in range(textlen)]

    candidates = []
    for word, remain_word in splits:
        first_prob, new_context = score(lm, word, context)
        remain_prob, remain_word = segment(lm, remain_word, new_context)

        candidates.append((first_prob + remain_prob, [word] + remain_word))

    # Tuples compare element-wise, so max() picks the highest log-probability.
    return max(candidates)
|
183 |
+
|
184 |
+
|
185 |
+
def enhance_text(kenlm_model, text):
    """Re-insert word boundaries into `text` using the language model.

    Strips all spaces from the input, then asks `segment` for the most
    probable word split under the uploaded model.

    Args:
        kenlm_model: filesystem path of the uploaded model file.
        text: the text whose spacing should be reconstructed.

    Returns:
        The re-segmented text, words joined by single spaces.

    Raises:
        gr.Error: when the model or the text is missing.
    """
    if not kenlm_model:
        raise gr.Error("Please upload your KenLM model.")

    if not text:
        raise gr.Error("Please paste the text to score.")

    lm = kenlm.LanguageModel(kenlm_model)

    # Remove existing spacing so the segmenter starts from scratch.
    squashed = text.replace(" ", "")
    _, chunks = segment(lm, squashed)
    return " ".join(chunks)
|
199 |
+
|
200 |
+
|
201 |
+
def text_to_kenlm(
    _text_file,
    _order,
    _do_lowercase,
    _binary_a_bits,
    _binary_b_bits,
    _binary_q_bits,
    _binary_type,
    _arpa_prune,
    _do_quantize,
    _topk_words,
    _do_limit_topk,
):
    """Train a KenLM model from an uploaded text corpus.

    Runs the kenlm command-line tools (``lmplz``, ``filter``,
    ``build_binary``) and returns a download button pointing at the
    produced model (ARPA or binary).

    Args:
        _text_file: path of the uploaded text corpus.
        _order: n-gram order forwarded to ``lmplz -o``.
        _do_lowercase: lowercase each line before training.
        _binary_a_bits: ``build_binary -a`` (pointer compression) setting.
        _binary_b_bits: ``build_binary -b`` (log backoff quantization) setting.
        _binary_q_bits: ``build_binary -q`` (log probability quantization) setting.
        _binary_type: ``build_binary`` data structure (e.g. "trie").
        _arpa_prune: pruning thresholds forwarded to ``lmplz --prune``.
        _do_quantize: also produce a quantized binary model.
        _topk_words: vocabulary size when limiting to top-k words.
        _do_limit_topk: filter the model vocabulary to the top-k words.

    Returns:
        ``gr.DownloadButton`` whose value is the produced model file.

    Raises:
        gr.Error: when no file or no order was provided.
    """
    if not _text_file:
        raise gr.Error("Please add a file.")

    if not _order:
        raise gr.Error("Please add an order.")

    # Normalize the corpus (optional lowercasing), keeping line structure.
    results = []
    with open(_text_file, "r") as f:
        for line in f.read().split("\n"):
            if _do_lowercase:
                line = line.lower()
            results.append(line)

    # Write to intermediate file. BUG FIX: join with newlines — lmplz treats
    # each line as one sentence; the original joined with spaces, collapsing
    # the whole corpus into a single sentence.
    intermediate_file = f"{app_dir}/intermediate.txt"
    with open(intermediate_file, "w") as f:
        f.write("\n".join(results))

    # Train the ARPA model. BUG FIX: `cmd` must be a plain string for
    # shell=True — the original's trailing comma made it a one-element
    # tuple, which only worked by accident on POSIX.
    cmd = (
        f"{kenlm_bin}/lmplz --temp_prefix {app_dir} --memory 90% "
        f"--text {intermediate_file} --arpa {app_dir}/my_model.arpa "
        f"-o {_order} --prune {_arpa_prune} --discount_fallback"
    )
    print(subprocess.run(cmd, shell=True))

    file_name = f"{app_dir}/my_model.arpa"
    file_name_fixed = f"{app_dir}/my_model_correct.arpa"

    # Fix the ARPA file: duplicate the <s> unigram line as </s> so the model
    # has an explicit end-of-sentence entry, and bump the "ngram 1=" count.
    with (
        open(file_name, "r") as read_file,
        open(file_name_fixed, "w") as write_file,
    ):
        has_added_eos = False
        for line in read_file:
            if not has_added_eos and "ngram 1=" in line:
                count = line.strip().split("=")[-1]
                write_file.write(line.replace(f"{count}", f"{int(count) + 1}"))
            elif not has_added_eos and "<s>" in line:
                write_file.write(line)
                write_file.write(line.replace("<s>", "</s>"))
                has_added_eos = True
            else:
                write_file.write(line)

    # Replace the file name
    file_name = file_name_fixed

    if _do_limit_topk:
        file_name = f"{app_dir}/my_model-{_topk_words}-words.arpa"

        _, vocab_str = convert_and_filter_topk(app_dir, intermediate_file, _topk_words)

        # Keep only n-grams over the top-k vocabulary (fed via stdin).
        print(
            subprocess.run(
                [
                    os.path.join(kenlm_bin, "filter"),
                    "single",
                    "model:{}".format(file_name_fixed),
                    file_name,
                ],
                input=vocab_str.encode("utf-8"),
                check=True,
            )
        )

        if _do_quantize:
            file_name_quantized = (
                f"{app_dir}/my_model-{_binary_type}-{_topk_words}-words.bin"
            )

            cmd = f"{kenlm_bin}/build_binary -a {_binary_a_bits} -b {_binary_b_bits} -q {_binary_q_bits} -v {_binary_type} {file_name} {file_name_quantized}"
            print(subprocess.run(cmd, shell=True))

            # BUG FIX: serve the quantized binary — the original left
            # `file_name` pointing at the filtered ARPA, so the download
            # button returned the wrong file.
            file_name = file_name_quantized
    elif _do_quantize:
        # No vocabulary limiting: quantize the fixed ARPA directly.
        file_name = f"{app_dir}/my_model-{_binary_type}.bin"

        cmd = f"{kenlm_bin}/build_binary -a {_binary_a_bits} -b {_binary_b_bits} -q {_binary_q_bits} -v {_binary_type} {file_name_fixed} {file_name}"
        print(subprocess.run(cmd, shell=True))

    return gr.DownloadButton(value=Path(file_name), label=f"Download: {file_name}")
|
297 |
+
|
298 |
+
|
299 |
+
# Assemble the Gradio UI: three tabs (score, enhance, train), an examples
# row, and environment info at the bottom.
with gr.Blocks(
    title=title,
    analytics_enabled=False,
    theme=gr.themes.Base(),
) as demo:
    gr.Markdown(description_head)
    gr.Markdown("## Usage")

    # Tab 1: score a pasted text with an uploaded model.
    with gr.Tab("Evaluate"):
        with gr.Row():
            with gr.Column():
                kenlm_model = gr.File(label="KenLM model")

                text = gr.Text(label="Paste text")

                results = gr.Textbox(
                    label="Scores",
                    placeholder="Scores will be here.",
                    show_copy_button=True,
                    lines=10,
                )

                gr.Button("Run").click(
                    inference_model,
                    inputs=[kenlm_model, text],
                    outputs=results,
                )

    # Tab 2: re-insert spaces into text using an uploaded model.
    with gr.Tab("Enhance"):
        with gr.Row():
            with gr.Column():
                kenlm_model = gr.File(label="Your KenLM model")

                text = gr.Text(label="Paste text to enhance")

                results = gr.Textbox(
                    label="Results",
                    placeholder="Results will be here.",
                    show_copy_button=True,
                    lines=10,
                )

                gr.Button("Run").click(
                    enhance_text,
                    inputs=[kenlm_model, text],
                    outputs=results,
                )

    # Tab 3: train a new KenLM model from an uploaded corpus.
    with gr.Tab("Create KenLM model"):
        with gr.Row():
            with gr.Column():
                text_file = gr.File(label="Text corpus")

                # lmplz n-gram order.
                order = gr.Number(label="Order", value=3, minimum=1, maximum=5)

                do_lowercase = gr.Checkbox(
                    label="Lowercase text",
                )

                # Thresholds forwarded to `lmplz --prune`.
                arpa_prune = gr.Text(
                    label="Prune",
                    value="0 1 1",
                )

                # build_binary -a / -b / -q settings.
                binary_a_bits = gr.Number(
                    label="Binary A bits",
                    value=256,
                )

                binary_b_bits = gr.Number(
                    label="Binary B bits",
                    value=7,
                )

                binary_q_bits = gr.Number(
                    label="Binary Q bits",
                    value=8,
                )

                binary_type = gr.Text(
                    label="Build binary data structure type",
                    value="trie",
                )

                do_quantize = gr.Checkbox(
                    label="Quantize the model",
                    value=False,
                )

                topk_words = gr.Number(
                    label="Top-K words",
                    value=10000,
                )

                do_limit_topk = gr.Checkbox(
                    label="Limit vocabulary by Top-K words",
                    value=False,
                )

                # Doubles as the output slot for the produced model file.
                kenlm_model = gr.DownloadButton(
                    label="Created KenLM model",
                )

                # Input order must match text_to_kenlm's parameter order.
                gr.Button("Create").click(
                    text_to_kenlm,
                    inputs=[
                        text_file,
                        order,
                        do_lowercase,
                        binary_a_bits,
                        binary_b_bits,
                        binary_q_bits,
                        binary_type,
                        arpa_prune,
                        do_quantize,
                        topk_words,
                        do_limit_topk,
                    ],
                    outputs=kenlm_model,
                )

    with gr.Row():
        # NOTE(review): each row in `examples` has 3 values but `inputs`
        # lists 4 components — confirm Gradio tolerates the shorter rows.
        gr.Examples(
            label="Choose an example",
            inputs=[text_file, order, do_lowercase, do_quantize],
            examples=examples,
        )

    gr.Markdown("### Gradio app uses:")
    gr.Markdown(tech_env)
    gr.Markdown(tech_libraries)

# Launched by the Docker CMD; queueing enables concurrent requests.
if __name__ == "__main__":
    demo.queue()
    demo.launch()
|
demo.txt
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Деталі: Льотчик, чия особистість наразі не розкривається, розповідає, що майже щодня пілоти винищувачів F-16 виконують не один і не два польоти на ураження противника за лінією бойового зіткнення, на території Російської Федерації, на тимчасово окупованій росіянами території. Також льотчики "Фальконів" виконують польоти на прикриття інших бойових побратимів (Міг-29, Су-27, Су-24, Су-25).
|
2 |
+
|
3 |
+
Пілот зазначає, що ефективність використання західної техніки українськими воїнами як наземної, так і повітряної дуже висока.
|
4 |
+
|
5 |
+
Пряма мова: "Майже кожна ракета приходить в ціль… Більш ніж 80% випущених нами ракет приходять у ціль. Знищують як "Шахеди", так і крилаті ракети морського, повітряного і наземного базування.
|
6 |
+
|
7 |
+
Цілі нам завчасно відомі, розвідка наша працює досить непогано, навіть попри те, що в медіа розганяли, що ми не отримуємо дані розвідки… Можна сказати, що наші сили розвідки дуже швидко адаптуються, ми отримуємо актуальні дані…
|
8 |
+
|
9 |
+
Наразі ми можемо завдавати удари тільки в тактичну глибину, проте ефективність цих ударів дуже висока: якщо ми захочемо, то наша бомбочка залетить комусь у віконце".
|
10 |
+
|
11 |
+
Президент Володимир Зеленський удень 26 березня прибув з анонсованим візитом до Франції.
|
12 |
+
|
13 |
+
Джерело: "Європейська правда"
|
14 |
+
|
15 |
+
Деталі: Інформацію підтвердив журналістам прес-секретар президента Сергій Нікіфоров.
|
16 |
+
|
17 |
+
Єлисейський палац напередодні анонсував, що ввечері середи Зеленського прийме президент Франції Емманюель Макрон для підготовки до безпекового саміту, який відбудеться наступного дня.
|
18 |
+
|
19 |
+
Ідеться про саміт "коаліції рішучих", запланований на 27 березня у Парижі. Вперше коаліція, яка готує підґрунтя для гарантування майбутнього повоєнного врегулювання, зустрілася на початку березня в Лондоні.
|
20 |
+
|
21 |
+
З того часу відбулося кілька зустрічей коаліції у різних форматах. Зокрема, 15 березня на околицях Лондона зустрілись військові керівники, щоб обговорити плани щодо введення в Україну міжнародних сил.
|
22 |
+
|
23 |
+
За даними агентства Bloomberg, Франція і Британія ведуть переговори між 37 країнами щодо формування "коаліції рішучих" для України в разі досягнення мирного врегулювання.
|
24 |
+
|
requirements.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
gradio==5.23.0
|