Spaces:
Build error
freemt
committed on
Commit 3d38118
1 Parent(s): dc641b6
Update df_aligned file_csv file_xlsx
Browse files
- radiobee/__main__.py +141 -172
- radiobee/align_texts.py +57 -0
- radiobee/cmat2tset.py +2 -2
- radiobee/gen_aset.py +59 -0
- radiobee/gen_eps_minsamples.py +13 -0
- radiobee/gen_pset.py +140 -0
- radiobee/gen_row_alignment.py +151 -0
- radiobee/interpolate_pset.py +41 -0
- requirements.txt +2 -1
radiobee/__main__.py
CHANGED
@@ -1,6 +1,8 @@
"""Run interactively."""
from typing import Tuple  # , Optional

+
+from pathlib import Path
import joblib
from random import randint
from textwrap import dedent
@@ -25,6 +27,9 @@ from radiobee.process_upload import process_upload
from radiobee.files2df import files2df
from radiobee.file2text import file2text
from radiobee.lists2cmat import lists2cmat
+from radiobee.gen_pset import gen_pset
+from radiobee.gen_aset import gen_aset
+from radiobee.align_texts import align_texts

# from radiobee.plot_df import plot_df
from radiobee.cmat2tset import cmat2tset
@@ -124,32 +129,30 @@ if __name__ == "__main__":
        gr.inputs.Slider(
            minimum=1,
            maximum=20,
-            step=1,
-            default=
-            # label="suggested min_samples value: 4-8",
+            step=0.1,
+            default=10,
        ),
        gr.inputs.Slider(
            minimum=1,
            maximum=20,
-            step=
-            default=
-            # label="suggested esp value: 1.7-3",
+            step=1,
+            default=6,
        ),
    ]

    # modi
    examples = [
-        ["data/test_zh.txt", "data/test_en.txt", "linear", "None", "None", "None",
-        ["data/test_en.txt", "data/test_zh.txt", "linear", "None", "None", "None",
-        ["data/shakespeare_zh500.txt", "data/shakespeare_en500.txt", "linear", "None", "None", "None",
-        ["data/shakespeare_en500.txt", "data/shakespeare_zh500.txt", "linear", "None", "None", "None",
-        ["data/hlm-ch1-zh.txt", "data/hlm-ch1-en.txt", "linear", "None", "None", "None",
-        ["data/hlm-ch1-en.txt", "data/hlm-ch1-zh.txt", "linear", "None", "None", "None",
+        ["data/test_zh.txt", "data/test_en.txt", "linear", "None", "None", "None", 10, 6, ],
+        ["data/test_en.txt", "data/test_zh.txt", "linear", "None", "None", "None", 10, 6, ],
+        ["data/shakespeare_zh500.txt", "data/shakespeare_en500.txt", "linear", "None", "None", "None", 10, 6, ],
+        ["data/shakespeare_en500.txt", "data/shakespeare_zh500.txt", "linear", "None", "None", "None", 10, 6, ],
+        ["data/hlm-ch1-zh.txt", "data/hlm-ch1-en.txt", "linear", "None", "None", "None", 10, 6, ],
+        ["data/hlm-ch1-en.txt", "data/hlm-ch1-zh.txt", "linear", "None", "None", "None", 10, 6, ],
    ]
    outputs = ["dataframe", "plot"]
    outputs = ["plot"]
    outputs = ["dataframe", "plot"]
-
+    out_df = gr.outputs.Dataframe(
        headers=None,
        max_rows=12,  # 20
        max_cols=None,
@@ -157,9 +160,28 @@ if __name__ == "__main__":
        type="auto",
        label="To be aligned",
    )
+    out_df_aligned = gr.outputs.Dataframe(
+        headers=None,
+        # max_rows=12,  # 20
+        max_cols=3,
+        overflow_row_behaviour="paginate",
+        type="auto",
+        label="aligned pairs",
+    )
+    out_file_dl = gr.outputs.File(
+        label="Click to download csv",
+    )
+    out_file_dl_excel = gr.outputs.File(
+        label="Click to download xlsx",
+    )
+
+    # modi outputs
    outputs = [
-
+        out_df,
        "plot",
+        out_file_dl,
+        out_file_dl_excel,
+        out_df_aligned,
    ]
    # outputs = ["dataframe", "plot", "plot"]  # won't work
    # outputs = ["dataframe"]
@@ -174,14 +196,14 @@ if __name__ == "__main__":
        idf_type,
        dl_type,
        norm,
+        eps,
        min_samples,
-        eps
    ):
        # modi fn
-        """Process inputs."""
+        """Process inputs and return outputs."""
        logger.debug(" *debug* ")

-        #
+        # convert "None" to None for those Radio types
        for _ in [idf_type, dl_type, norm]:
            if _ in "None":
                _ = None
@@ -190,159 +212,86 @@ if __name__ == "__main__":
        logger.info("file1.name: *%s*, file2.name: *%s*", file1.name, file2.name)

        # bypass if file1 or file2 is str input
-        if not (isinstance(file1, str) or isinstance(file2, str)):
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            # joblib.dump(lst1, f"data/{nameof(lst1)}.lzma")
-            # joblib.dump(lst2, f"data/{nameof(lst2)}.lzma")
-
-            # modi typing https://textacy.readthedocs.io/en/stable/api_reference/representations.html
-            # tf_type: Literal[linear, sqrt, log, binary] = 'linear'
-            # idf_tyep: Optional[Literal[standard, smooth, bm25]] = None
-            # dl_type: Optional[Literal[linear, sqrt, log]] = None
-            # norm: norm: Optional[Literal[l1, l2]] = None
-            # min_df: int | float = 1
-            # max_df: int | float = 1.0
-
-            # cmat = lists2cmat(lst1, lst2)
-            cmat = lists2cmat(
-                lst1,
-                lst2,
-                tf_type=tf_type,
-                idf_type=idf_type,
-                dl_type=dl_type,
-                norm=norm,
-            )
-
-            tset = pd.DataFrame(cmat2tset(cmat))
-            tset.columns = ["x", "y", "cos"]
-
-            # for debugging, logger.debug logger.info dont show up
-            # print("lst1: %s" % lst1)
-            # print("lst2: %s" % lst2)
-            # print("cmat: %s" % cmat)
-            # print("tset: %s" % tset)
-
-            logger.debug("lst1: %s", lst1)
-            logger.debug("lst2: %s", lst2)
-            logger.debug("cmat: %s", cmat)
-            logger.debug("tset: %s", tset)
-
-            # plt0 = plot_df(pd.DataFrame(cmat))
-            df_ = tset
-
-            # moved to inputs
-            # min_samples: int = 6
-            # eps: float = 10
-
-            # ylim: Optional[int] = None
-            xlabel: str = lang1
-            ylabel: str = lang2
-
-            sns.set()
-            sns.set_style("darkgrid")
-
-            # fig, (ax0, ax1) = plt.subplots(1, 2, figsize=(11.69, 8.27))
-            # fig, ([ax2, ax0], [ax1, ax3]) = plt.subplots(2, 2, figsize=(11.69, 8.27))
-            # fig, (ax2, ax0, ax1) = plt.subplots(3)
-            # fig, (ax2, ax0, ax1) = plt.subplots(3, figsize=(11.69, 8.27))
-            # fig, (ax2, ax0, ax1) = plt.subplots(1, 3, figsize=(36.69, 8.27))
-            # fig, (ax2, ax0, ax1) = plt.subplots(1, 3, figsize=(66.69, 22.27))
-            # fig, (ax2, ax0, ax1) = plt.subplots(1, 3)
-            # fig.subplots_adjust(hspace=.4)
-
-            fig = plt.figure()
-            gs = fig.add_gridspec(2, 2, wspace=0.4, hspace=0.58)
-            ax2 = fig.add_subplot(gs[0, 0])
-            ax0 = fig.add_subplot(gs[0, 1])
-            ax1 = fig.add_subplot(gs[1, 0])
-
-            cmap = "viridis_r"
-            sns.heatmap(cmat, cmap=cmap, ax=ax2).invert_yaxis()
-            ax2.set_xlabel(xlabel)
-            ax2.set_ylabel(ylabel)
-            ax2.set_title("cos similarity heatmap")
-
-            fig.suptitle("alignment projection")
-
-            _ = DBSCAN(min_samples=min_samples, eps=eps).fit(df_).labels_ > -1
-            _x = DBSCAN(min_samples=min_samples, eps=eps).fit(df_).labels_ < 0
-
-            df_.plot.scatter("x", "y", c="cos", cmap=cmap, ax=ax0)
-
-            # clustered
-            df_[_].plot.scatter("x", "y", c="cos", cmap=cmap, ax=ax1)
-
-            # outliers
-            df_[_x].plot.scatter("x", "y", c="r", marker="x", alpha=0.6, ax=ax0)
-
-            # ax0.set_xlabel("")
-            # ax0.set_ylabel("zh")
-            ax0.set_xlabel(xlabel)
-            ax0.set_ylabel(ylabel)
-
-            ax0.set_xlim(0, len1)
-            ax0.set_ylim(0, len2)
-            ax0.set_title("max along columns ('x': outliers)")
-
-            # ax1.set_xlabel("en")
-            # ax1.set_ylabel("zh")
-            ax1.set_xlabel(xlabel)
-            ax1.set_ylabel(ylabel)
-
-            ax1.set_xlim(0, len1)
-            ax1.set_ylim(0, len2)
-            ax1.set_title(f"potential aligned pairs ({round(sum(_) / len1, 2):.0%})")
-
-            # return df, plot_df(pd.DataFrame(cmat))
-            # tset.plot.scatter("x", "y", c="cos", cmap="viridis_r")
-        else:
-            fig, ax1 = plt.subplots()
-            df1 = pd.DataFrame(
-                [
-                    [5.1, 3.5, 0],
-                    [4.9, 3.0, 0],
-                    [7.0, 3.2, 1],
-                    [6.4, 3.2, 1],
-                    [5.9, 3.0, 2],
-                ],
-                columns=["length", "width", "species"],
-            )
-            df1.plot.scatter(x="length", y="width", c="DarkBlue", ax=ax1)
-            # plt_heatmap = plt
-
-            # plt.scatter(df.length, df.width)  # gradio return plt.gcf() or plt
+        # if not (isinstance(file1, str) or isinstance(file2, str)):
+        text1 = file2text(file1)
+        text2 = file2text(file2)
+        lang1, _ = fastlid(text1)
+        lang2, _ = fastlid(text2)
+
+        df1 = files2df(file1, file2)
+
+        lst1 = [elm for elm in df1.text1 if elm]
+        lst2 = [elm for elm in df1.text2 if elm]
+        len1 = len(lst1)
+        len2 = len(lst2)
+
+        cmat = lists2cmat(
+            lst1,
+            lst2,
+            tf_type=tf_type,
+            idf_type=idf_type,
+            dl_type=dl_type,
+            norm=norm,
+        )

-
-
-        # return df, df
-        # return df1.iloc[:10, :], plt
+        tset = pd.DataFrame(cmat2tset(cmat))
+        tset.columns = ["x", "y", "cos"]

-
-        # pd.concat([df0.iloc[:2, :], pd.DataFrame([[".", ".", "..."]], columns=df0.columns), df0.iloc[-1:, :]], ignore_index=1)
+        df_ = tset

-
-
+        xlabel: str = lang1
+        ylabel: str = lang2

-
+        sns.set()
+        sns.set_style("darkgrid")
+
+        fig = plt.figure()
+        gs = fig.add_gridspec(2, 2, wspace=0.4, hspace=0.58)
+        ax2 = fig.add_subplot(gs[0, 0])
+        ax0 = fig.add_subplot(gs[0, 1])
+        ax1 = fig.add_subplot(gs[1, 0])
+
+        cmap = "viridis_r"
+        sns.heatmap(cmat, cmap=cmap, ax=ax2).invert_yaxis()
+        ax2.set_xlabel(xlabel)
+        ax2.set_ylabel(ylabel)
+        ax2.set_title("cos similarity heatmap")
+
+        fig.suptitle("alignment projection")
+
+        _ = DBSCAN(min_samples=min_samples, eps=eps).fit(df_).labels_ > -1
+        _x = DBSCAN(min_samples=min_samples, eps=eps).fit(df_).labels_ < 0
+
+        df_.plot.scatter("x", "y", c="cos", cmap=cmap, ax=ax0)
+
+        # clustered
+        df_[_].plot.scatter("x", "y", c="cos", cmap=cmap, ax=ax1)
+
+        # outliers
+        df_[_x].plot.scatter("x", "y", c="r", marker="x", alpha=0.6, ax=ax0)
+
+        # ax0.set_xlabel("")
+        # ax0.set_ylabel("zh")
+        ax0.set_xlabel(xlabel)
+        ax0.set_ylabel(ylabel)
+
+        ax0.set_xlim(0, len1)
+        ax0.set_ylim(0, len2)
+        ax0.set_title("max along columns ('x': outliers)")
+
+        # ax1.set_xlabel("en")
+        # ax1.set_ylabel("zh")
+        ax1.set_xlabel(xlabel)
+        ax1.set_ylabel(ylabel)
+
+        ax1.set_xlim(0, len1)
+        ax1.set_ylim(0, len2)
+        ax1.set_title(f"potential aligned pairs ({round(sum(_) / len1, 2):.0%})")
+
+        # return df, plot_df(pd.DataFrame(cmat))
+        # tset.plot.scatter("x", "y", c="cos", cmap="viridis_r")
+
+        df_trimmed = pd.concat(
            [
                df1.iloc[:4, :],
                pd.DataFrame(
@@ -359,10 +308,30 @@ if __name__ == "__main__":
            ignore_index=1,
        )

-
-
+        # process lst1, lst2 to obtain df_aligned
+        pset = gen_pset(
+            cmat,
+            eps=eps,
+            min_samples=min_samples,
+            delta=7,
+        )
+        src_len, tgt_len = cmat.shape
+        aset = gen_aset(pset, src_len, tgt_len)
+        final_list = align_texts(aset, lst2, lst1)  # note the order

-
+        # df_aligned = df_trimmed
+        df_aligned = pd.DataFrame(final_list, columns=["text1", "text2", "likelihood"])
+
+        _ = df_aligned.to_csv(index=False)
+        file_dl = Path(f"{Path(file1.name).stem[:-8]}-{Path(file2.name).stem[:-8]}.csv")
+        file_dl.write_text(_, encoding="utf8")
+
+        file_dl_xlsx = Path(f"{Path(file1.name).stem[:-8]}-{Path(file2.name).stem[:-8]}.xlsx")
+        df_aligned.to_excel(file_dl_xlsx)
+
+        # return df_trimmed, plt
+        return df_trimmed, plt, file_dl, file_dl_xlsx, df_aligned
+        # modi outputs

    server_port = 7860
    with socket(AF_INET, SOCK_STREAM) as sock:
@@ -382,8 +351,8 @@ if __name__ == "__main__":
    ## NB
    * Click "Clear" first for subsequent submits when uploading files.
    * `tf_type` `idf_type` `dl_type` `norm`: Normally there is no need to touch these unless you know what you are doing.
-    * Suggested `
-    - Smaller `
+    * Suggested `esp` and `min_samples` values -- `esp` (minimum epsilon): 8-12, `min_samples`: 4-8.
+    - A larger `esp` or a smaller `min_samples` will result in more aligned pairs but also more **false positives** (pairs falsely identified as candidates). On the other hand, a smaller `esp` or a larger `min_samples` tends to miss 'good' pairs.
    * If you need to have a better look at the image, you can right-click on the image and select copy-image-address and open a new tab in the browser with the copied image address.
    * `Flag`: Should something go wrong, you can click Flag to save the output and inform the developer.
    """
@@ -408,7 +377,7 @@ if __name__ == "__main__":
        inputs=inputs,
        outputs=outputs,
        title="radiobee-aligner🔠",
-        description="showcasing a blazing fast dualtext aligner, currently supported language pairs: en-zh/zh-en",
+        description="WIP showcasing a blazing fast dualtext aligner, currently supported language pairs: en-zh/zh-en",
        article=article,
        examples=examples,
        # theme="darkgrass",
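Note: taken together, the new process() path is cmat -> gen_pset -> gen_aset -> align_texts, with the result written out as CSV and XLSX. A minimal standalone sketch of that flow, assuming the helpers behave as in the hunks above; the input paths come from the repo's examples, while the output file names here are picked by hand rather than derived from gradio temp-file names as in process():

from pathlib import Path

import pandas as pd

from radiobee.lists2cmat import lists2cmat
from radiobee.gen_pset import gen_pset
from radiobee.gen_aset import gen_aset
from radiobee.align_texts import align_texts

# non-empty lines of the two texts to align
lst1 = [elm for elm in Path("data/test_en.txt").read_text("utf8").splitlines() if elm.strip()]
lst2 = [elm for elm in Path("data/test_zh.txt").read_text("utf8").splitlines() if elm.strip()]

cmat = lists2cmat(lst1, lst2, tf_type="linear")        # similarity matrix
pset = gen_pset(cmat, eps=10, min_samples=6, delta=7)  # anchor pairs
src_len, tgt_len = cmat.shape
aset = gen_aset(pset, src_len, tgt_len)                # full align set with "" gaps
final_list = align_texts(aset, lst2, lst1)             # note the order

df_aligned = pd.DataFrame(final_list, columns=["text1", "text2", "likelihood"])
df_aligned.to_csv("test_en-test_zh.csv", index=False)
df_aligned.to_excel("test_en-test_zh.xlsx")            # needs openpyxl (added to requirements.txt)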
radiobee/align_texts.py
ADDED
@@ -0,0 +1,57 @@
+"""Align texts based on aset, src_text, tgt_text."""
+from typing import List, Tuple, Union
+from logzero import logger
+
+
+# fmt: off
+def align_texts(
+    aset: List[Tuple[Union[str, float], Union[str, float], Union[str, float]]],
+    src_text: List[str],
+    tgt_text: List[str],
+) -> List[Tuple[Union[str], Union[str], Union[str, float]]]:
+    # fmt: on
+    """Align texts (paras/sents) based on aset, src_text, tgt_text.
+
+    Args:
+        aset: align set
+        src_text: source text
+        tgt_text: target text
+
+    Returns:
+        aligned texts with possible metrics
+    """
+    xset, yset, metrics = zip(*aset)  # unzip aset
+    xset = [elm for elm in xset if elm != ""]
+    yset = [elm for elm in yset if elm != ""]
+
+    if (len(xset), len(yset)) != (len(tgt_text), len(src_text)):
+        logger.warning(
+            " (%s, %s) != (%s, %s) ", len(xset), len(yset), len(tgt_text), len(src_text)
+        )
+        # raise Exception(" See previous message")
+
+    texts = []
+    for elm in aset:
+        elm0, elm1, elm2 = elm
+        _ = []
+
+        # src_text first
+        if isinstance(elm1, str):
+            _.append("")
+        else:
+            _.append(src_text[int(elm1)])
+
+        if isinstance(elm0, str):
+            _.append("")
+        else:
+            _.append(tgt_text[int(elm0)])
+
+        if isinstance(elm2, str):
+            _.append("")
+        else:
+            _.append(round(elm2, 2))
+
+        texts.append(tuple(_))
+
+    # return [("", "", 0.)]
+    return texts
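A tiny hypothetical example of the helper above (texts and indices invented; the expected output is simply a hand trace of the loop): tgt_text has one extra paragraph with no counterpart, so it comes out paired with an empty string.

from radiobee.align_texts import align_texts

src_text = ["src para 0", "src para 1"]
tgt_text = ["tgt para 0", "tgt para 1", "tgt para 2"]
aset = [(0, 0, 0.67), (1, "", ""), (2, 1, 0.59)]  # (tgt index, src index, metric or "")

print(align_texts(aset, src_text, tgt_text))
# [('src para 0', 'tgt para 0', 0.67), ('', 'tgt para 1', ''), ('src para 1', 'tgt para 2', 0.59)]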
radiobee/cmat2tset.py
CHANGED
@@ -46,11 +46,11 @@ def cmat2tset(
    """
    low_ = cmat.min() - 1
    argmax_max = []
-    src_len, tgt_len = cmat.shape
+    src_len, tgt_len = cmat.shape  # ylim, xlim
    for _ in range(min(src_len, tgt_len)):
        argmax = int(cmat.argmax())
        row, col = divmod(argmax, tgt_len)
-        argmax_max.append([col, row, cmat.max()])
+        argmax_max.append([col, row, cmat.max()])  # x-axis, y-axis

        # erase row-th row and col-th col of cmat
        cmat[row, :] = low_
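The change here is comment-only, but it pins down the convention: each triple is (x = column index, y = row index, score). A toy re-run of the greedy loop on a made-up 3x3 matrix, assuming the column is blanked right after the row (that line sits just outside the hunk):

import numpy as np

cmat = np.array(
    [
        [0.9, 0.1, 0.0],
        [0.2, 0.8, 0.1],
        [0.0, 0.2, 0.7],
    ]
)
low_ = cmat.min() - 1
src_len, tgt_len = cmat.shape  # ylim, xlim
triples = []
for _ in range(min(src_len, tgt_len)):
    argmax = int(cmat.argmax())
    row, col = divmod(argmax, tgt_len)
    triples.append([col, row, cmat.max()])  # x-axis, y-axis, score
    cmat[row, :] = low_  # erase the row of the picked cell ...
    cmat[:, col] = low_  # ... and its column

print(triples)  # [[0, 0, 0.9], [1, 1, 0.8], [2, 2, 0.7]]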
radiobee/gen_aset.py
ADDED
@@ -0,0 +1,59 @@
+"""Generate align set (aset) based on pset (pair set), src_len and tgt_len."""
+from typing import List, Tuple, Union
+from itertools import zip_longest
+
+# from logzero import logger
+
+
+# fmt: off
+def gen_aset(
+    pset: List[Tuple[int, int, float]],
+    src_len: int,  # n_rows
+    tgt_len: int,  # n_cols
+) -> List[Tuple[Union[str, float], Union[str, float], Union[str, float]]]:
+    # fmt: on
+    """Generate align set (aset) based on pset, src_len and tgt_len.
+
+    src_len, tgt_len = cmat.shape
+    zip_longest(..., fillvalue="")
+
+    Args:
+        pset: [x(lang2 zh), y(lang1 en), cos]
+        src_len: lang1 (en)
+        tgt_len: lang2 (zh)
+
+    Returns:
+        aset:
+            [0...tgt_len, 0...src_len]
+            [0, 0, .]
+            ...
+            [tgt_len-1, src_len-1, .]
+    """
+    # empty pset []
+    if not pset:
+        return [*zip_longest(range(tgt_len), range(src_len), fillvalue="")]
+    # empty [[]]
+    if len(pset) == 1:
+        if not pset[0]:
+            return [*zip_longest(range(tgt_len), range(src_len), fillvalue="")]
+
+    buff = []
+    pos0, pos1 = -1, -1
+    for elm in pset:
+        # elm0, elm1, elm2 = elm
+        elm0, elm1, *elm2 = elm
+        elm0 = int(elm0)
+        elm1 = int(elm1)
+        interval = max(elm0 - pos0 - 1, elm1 - pos1 - 1)
+        _ = zip_longest(range(pos0 + 1, elm0), range(pos1 + 1, elm1), [""] * interval, fillvalue="")
+        buff.extend(_)
+        buff.append(elm)
+        pos0, pos1 = elm0, elm1
+
+    # last batch if any
+    elm0, elm1 = tgt_len, src_len
+    interval = max(elm0 - pos0 - 1, elm1 - pos1 - 1)
+    _ = zip_longest(range(pos0 + 1, elm0), range(pos1 + 1, elm1), [""] * interval, fillvalue="")
+    buff.extend(_)
+
+    return buff
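To make the zip_longest padding concrete, a small hand-run trace with invented numbers (two anchor pairs from a hypothetical 5x5 cmat; unmatched positions on either axis are padded with ""):

from radiobee.gen_aset import gen_aset

pset = [(1, 1, 0.9), (3, 4, 0.8)]  # (x ~ tgt axis, y ~ src axis, cos)
src_len, tgt_len = 5, 5

print(gen_aset(pset, src_len, tgt_len))
# [(0, 0, ''), (1, 1, 0.9), (2, 2, ''), ('', 3, ''), (3, 4, 0.8), (4, '', '')]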
radiobee/gen_eps_minsamples.py
ADDED
@@ -0,0 +1,13 @@
+"""Gen suggested eps min_samples."""
+
+
+def gen_eps_minsamples(src_len: int, tgt_len: int) -> dict:
+    """Gen suggested eps min_samples."""
+    eps = src_len * 0.01
+    if eps < 3:
+        eps = 3
+
+    min_samples = tgt_len / 100 * 0.5
+    if min_samples < 3:
+        min_samples = 3
+    return {"eps": eps, "min_samples": min_samples}
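A quick sanity check with made-up lengths; note that gen_pset above still takes eps and min_samples directly (defaults 10 and 6), so this helper only suggests values:

from radiobee.gen_eps_minsamples import gen_eps_minsamples

print(gen_eps_minsamples(500, 600))    # {'eps': 5.0, 'min_samples': 3.0}
print(gen_eps_minsamples(2000, 1800))  # {'eps': 20.0, 'min_samples': 9.0}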
radiobee/gen_pset.py
ADDED
@@ -0,0 +1,140 @@
+"""Gen pset from cmat. Find pairs for a given cmat.
+
+tinybee.find_pairs.py with fixed estimator='dbscan' eps=eps, min_samples=min_samples
+"""
+from typing import List, Tuple, Union
+
+import numpy as np
+import pandas as pd
+from sklearn.cluster import DBSCAN
+import logzero
+from logzero import logger
+from radiobee.cmat2tset import cmat2tset
+from radiobee.interpolate_pset import interpolate_pset
+
+
+def gen_pset(
+    cmat1: Union[List[List[float]], np.ndarray, pd.DataFrame],
+    eps: float = 10,
+    min_samples: int = 6,
+    delta: float = 7,
+    verbose: Union[bool, int] = False,
+) -> List[Tuple[int, int, Union[float, str]]]:
+    """Gen pset from cmat.
+    Find pairs for a given cmat.
+
+    Args:
+        cmat: correlation/similarity matrix
+        eps: min epsilon for DBSCAN (10)
+        min_samples: minimum # of samples for DBSCAN (6)
+        delta: tolerance (7)
+
+    Returns:
+        pairs + "" or metric (float)
+
+    dbscan_pairs' setup
+        if eps is None:
+            eps = src_len * .01
+            if eps < 3:
+                eps = 3
+        if min_samples is None:
+            min_samples = tgt_len / 100 * 0.5
+            if min_samples < 3:
+                min_samples = 3
+
+    def gen_eps_minsamples(src_len, tgt_len):
+        eps = src_len * .01
+        if eps < 3:
+            eps = 3
+
+        min_samples = tgt_len / 100 * 0.5
+        if min_samples < 3:
+            min_samples = 3
+        return {"eps": eps, "min_samples": min_samples}
+
+    """
+    if isinstance(verbose, bool):
+        if verbose:
+            verbose = 10
+        else:
+            verbose = 20
+    logzero.loglevel(verbose)
+
+    # if isinstance(cmat, list):
+    cmat = np.array(cmat1)
+
+    src_len, tgt_len = cmat.shape
+
+    # tset = cmat2tset(cmat)
+    tset = cmat2tset(cmat).tolist()
+
+    logger.debug("tset: %s", tset)
+
+    # iset = gen_iset(cmat, verbose=verbose, estimator=estimator)
+    labels = DBSCAN(eps=eps, min_samples=min_samples).fit(tset).labels_
+
+    df_tset = pd.DataFrame(tset, columns=["x", "y", "cos"])
+    cset = df_tset[labels > -1].to_numpy()
+
+    # sort cset
+    _ = sorted(cset.tolist(), key=lambda x: x[0])
+    iset = interpolate_pset(_, tgt_len)
+
+    # *_, ymax = zip(*tset)
+    # ymax = list(ymax)
+    # low_ = np.min(ymax) - 1  # reset to minimum_value - 1
+
+    buff = [(-1, -1, ""), (tgt_len, src_len, "")]
+    # for _ in range(tgt_len):
+    for idx, tset_elm in enumerate(tset):
+        logger.debug("buff: %s", buff)
+        # position max in ymax and insert in buff
+        # if with range given by iset+-delta and
+        # it's valid (do not exceed constraint
+        # by neighboring points
+
+        # argmax = int(np.argmax(ymax))
+
+        # logger.debug("=== %s,%s === %s", _, argmax, tset[_])
+        logger.debug("=== %s === %s", _, tset_elm)
+
+        # ymax[_] = low_
+        # elm = tset[argmax]
+        # elm0, *_ = elm
+
+        elm0, *_ = tset_elm
+
+        # position elm in buff
+        idx = -1  # for making pyright happy
+        for idx, loc in enumerate(buff):
+            if loc[0] > elm0:
+                break
+        else:
+            idx += 1  # last
+
+        # insert elm in for valid elm
+        # (within range inside two neighboring points)
+
+        # pos = int(tset[argmax][0])
+        pos = int(tset_elm[0])
+        logger.debug(" %s <=> %s ", tset_elm, iset[pos])
+
+        # if abs(tset[argmax][1] - iset[pos][1]) <= delta:
+        if abs(tset_elm[1] - iset[pos][1]) <= delta:
+            if tset_elm[1] > buff[idx - 1][1] and tset_elm[1] < buff[idx][1]:
+                buff.insert(idx, tset_elm)
+                logger.debug("idx: %s, tset_elm: %s", idx, tset_elm)
+            else:
+                logger.debug("\t***\t idx: %s, tset_elm: %s", idx, tset_elm)
+        _ = """
+        if abs(tset[loc][1] - iset[loc][1]) <= delta:
+            if tset[loc][1] > buff[idx][1] and tset[loc][1] < buff[idx + 1][1]:
+                buff.insert(idx + 1, tset[loc])
+        # """
+
+    # remove first and last entry in buff
+    buff.pop(0)
+    buff.pop()
+
+    # return [(1, 1, "")]
+    return [(int(elm0), int(elm1), elm2) for elm0, elm1, elm2 in buff]
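The gist of gen_pset: take the greedy (x, y, cos) candidates from cmat2tset, keep the DBSCAN inliers as anchors, interpolate a guide line over the x axis, and only accept candidates within +/- delta of that guide. A rough self-contained sketch of the anchor/guide part on a synthetic matrix with a planted diagonal (an illustration of the idea, not the function itself):

import numpy as np
from sklearn.cluster import DBSCAN

from radiobee.cmat2tset import cmat2tset
from radiobee.interpolate_pset import interpolate_pset

rng = np.random.default_rng(0)
cmat = rng.random((120, 100)) * 0.1  # low background noise
for x in range(100):                 # plant a clear diagonal to recover
    cmat[int(x * 1.2), x] = 0.9

tset = cmat2tset(cmat.copy()).tolist()  # greedy (x, y, cos) candidates
labels = DBSCAN(eps=10, min_samples=6).fit(tset).labels_

anchors = sorted((t for t, lab in zip(tset, labels) if lab > -1), key=lambda t: t[0])
guide = interpolate_pset(anchors, cmat.shape[1])  # one (x, y) per column

print(len(anchors), guide[:5])  # e.g. 100 [(0, 0), (1, 1), (2, 2), (3, 3), (4, 4)]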
radiobee/gen_row_alignment.py
ADDED
@@ -0,0 +1,151 @@
+"""Gen proper alignment for a given triple_set.
+
+cmat = fetch_sent_corr(src, tgt)
+src_len, tgt_len = np.array(cmat).shape
+r_ali = gen_row_alignment(cmat, tgt_len, src_len)  # note the order
+src[r_ali[1]], tgt[r_ali[0]], r_ali[2]
+
+or !!! (target, source)
+cmat = fetch_sent_corr(tgt, src)  # note the order
+src_len, tgt_len = np.array(cmat).shape
+r_ali = gen_row_alignment(cmat, src_len, tgt_len)
+src[r_ali[0]], tgt[r_ali[1]], r_ali[2]
+
+---
+src_txt = 'data/wu_ch2_en.txt'
+tgt_txt = 'data/wu_ch2_zh.txt'
+
+assert Path(src_txt).exists()
+assert Path(tgt_txt).exists()
+
+src_text, _ = load_paras(src_txt)
+tgt_text, _ = load_paras(tgt_txt)
+
+cos_matrix = gen_cos_matrix(src_text, tgt_text)
+t_set, m_matrix = find_aligned_pairs(cos_matrix0, thr=0.4, matrix=True)
+
+resu = gen_row_alignment(t_set, src_len, tgt_len)
+resu = np.array(resu)
+
+idx = -1
+idx += 1; (resu[idx], src_text[int(resu[idx, 0])],
+tgt_text[int(resu[idx, 1])]) if all(resu[idx]) else resu[idx]
+
+idx += 1; i0, i1, i2 = resu[idx]; '***' if i0 == ''
+else src_text[int(i0)], '***' if i1 == '' else tgt_text[int(i1)], ''
+if i2 == '' else i2
+"""
+# pylint: disable=line-too-long
+from typing import List, Union
+
+# natural extrapolation with slope equal to 1
+from itertools import zip_longest as zip_longest_middle
+
+import numpy as np
+
+from logzero import logger
+
+# from tinybee.zip_longest_middle import zip_longest_middle
+
+# from tinybee.zip_longest_middle import zip_longest_middle
+# from tinybee.find_pairs import find_pairs
+
+# logger = logging.getLogger(__name__)
+# logger.addHandler(logging.NullHandler())
+
+
+def gen_row_alignment(  # pylint: disable=too-many-locals
+    t_set,
+    src_len,
+    tgt_len,
+    # ) -> List[Tuple[Union[str, int], Union[str, int], Union[str, float]]]:
+) -> List[List[Union[str, float]]]:
+    """Gen proper rows for given triple_set.
+
+    Arguments:
+        [t_set {np.array or list}] -- [nll matrix]
+        [src_len {int}] -- number of source texts (paras/sents)
+        [tgt_len {int}] -- number of target texts (paras/sents)
+
+    Returns:
+        [np.array] -- [proper rows]
+    """
+    t_set = np.array(t_set, dtype="object")
+
+    # len0 = src_len
+
+    # len1 tgt text length, must be provided
+    len1 = tgt_len
+
+    # rearrange t_set as buff in increasing order
+    buff = [[-1, -1, ""]]  #
+    idx_t = 0
+    # for elm in t_set:
+    # start with bigger value from the 3rd col
+
+    y00, yargmax, ymax = zip(*t_set)
+    ymax_ = np.array(ymax).copy()
+    reset_v = np.min(ymax_) - 1
+    for count in range(tgt_len):
+        argmax = np.argmax(ymax_)
+        # reset
+        ymax_[argmax] = reset_v
+        idx_t = argmax
+        elm = t_set[idx_t]
+        logger.debug("%s: %s, %s", count, idx_t, elm)
+
+        # find loc to insert
+        elm0, elm1, elm2 = elm
+        idx = -1
+        for idx, loc in enumerate(buff):
+            if loc[0] > elm0:
+                break
+        else:
+            idx += 1  # last
+
+        # make sure elm1 is within the range
+        # prev elm1 < elm1 < next elm1
+        if elm1 > buff[idx - 1][1]:
+            try:  # overflow possible (idx + 1 in # last)
+                next_elm = buff[idx][1]
+            except IndexError:
+                next_elm = len1
+            if elm1 < next_elm:
+                # insert '' if necessary
+                # using zip_longest_middle
+                buff.insert(
+                    idx, [elm0, elm1, elm2],
+                )
+        # logger.debug('---')
+
+        idx_t += 1
+        # if idx_t == 24:  # 20:
+        #     break
+
+    # remove [-1, -1]
+    # buff.pop(0)
+    # buff = np.array(buff, dtype='object')
+
+    # take care of the tail
+    buff += [[src_len, tgt_len, ""]]
+
+    resu = []
+    # merit = []
+
+    for idx, elm in enumerate(buff[1:]):
+        idx1 = idx + 1
+        elm0_, elm1_, elm2_ = buff[idx1 - 1]  # idx starts from 0
+        elm0, elm1, elm2 = elm
+        del elm2_, elm2
+
+        tmp0 = zip_longest_middle(
+            list(range(elm0_ + 1, elm0)), list(range(elm1_ + 1, elm1)), fillvalue="",
+        )
+        # convert to list entries & attach merit
+        tmp = [list(t_elm) + [""] for t_elm in tmp0]
+
+        # update resu
+        resu += tmp + [buff[idx1]]
+
+    # remove the last entry
+    return resu[:-1]
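The module docstring above already shows the intended call pattern; for completeness, a hypothetical run with three hand-made triples (the expected rows follow from the two loops above, with "" padding between anchors):

from radiobee.gen_row_alignment import gen_row_alignment

t_set = [[0, 0, 0.9], [2, 3, 0.8], [4, 5, 0.7]]  # (index, index, score) triples, invented
resu = gen_row_alignment(t_set, src_len=5, tgt_len=6)
for row in resu:
    print(row)
# [0, 0, 0.9]
# [1, 1, '']
# ['', 2, '']
# [2, 3, 0.8]
# [3, 4, '']
# [4, 5, 0.7]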
radiobee/interpolate_pset.py
ADDED
@@ -0,0 +1,41 @@
+"""Interpolate np.nan."""
+from typing import List, Tuple
+import numpy as np
+import pandas as pd
+
+
+# fmt: off
+def interpolate_pset(
+    pairs: List[Tuple[int, int, float]],
+    tgt_len: int,
+    method: str = 'linear',
+    limit_direction: str = 'both',
+) -> List[Tuple[int, int]]:
+    # fmt: on
+    """Interpolate.
+
+    Args:
+        pairs: integer pairs, some np.nan
+        tgt_len: over 0...tgt_len-1 (x-axis, cmat.shape[1])
+        method: for use in pd.DataFrame.interpolate
+        limit_direction: for use in pd.DataFrame.interpolate
+    Returns:
+        np.nan converted
+    """
+    y00, *_ = zip(*pairs)
+
+    res = []
+    for idx in range(tgt_len):
+        if idx in y00:
+            loc = y00.index(idx)
+            res.append(tuple(pairs[loc][:2]))
+        else:
+            res.append((idx, np.nan))
+
+    df = pd.DataFrame(res, columns=["y00", "yargmax"])
+    _ = df.interpolate(method=method, limit_direction=limit_direction, axis=0)
+
+    _ = _.to_numpy(dtype=int)
+    _ = [(int(elm0), int(elm1)) for elm0, elm1 in _]
+
+    return _
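A small hedged example (anchor values invented): the x positions between the two known anchors are filled in linearly, then cast to int.

from radiobee.interpolate_pset import interpolate_pset

pairs = [(0, 0, 0.9), (5, 10, 0.8)]  # (x, y, cos) anchors
print(interpolate_pset(pairs, tgt_len=6))
# [(0, 0), (1, 2), (2, 4), (3, 6), (4, 8), (5, 10)]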
requirements.txt
CHANGED
@@ -12,5 +12,6 @@ seaborn
cchardet
tabulate
git+https://github.com/ffreemt/fast-langid
-# dotenv
+# python-dotenv
varname
+openpyxl