Spaces:
Build error
freemt
committed on
Commit 3d38118
1 Parent(s): dc641b6
Update df_aligned file_csv file_xlsx
Browse files
- radiobee/__main__.py +141 -172
- radiobee/align_texts.py +57 -0
- radiobee/cmat2tset.py +2 -2
- radiobee/gen_aset.py +59 -0
- radiobee/gen_eps_minsamples.py +13 -0
- radiobee/gen_pset.py +140 -0
- radiobee/gen_row_alignment.py +151 -0
- radiobee/interpolate_pset.py +41 -0
- requirements.txt +2 -1
radiobee/__main__.py
CHANGED
@@ -1,6 +1,8 @@
"""Run interactively."""
from typing import Tuple  # , Optional

+
+from pathlib import Path
import joblib
from random import randint
from textwrap import dedent
@@ -25,6 +27,9 @@ from radiobee.process_upload import process_upload
from radiobee.files2df import files2df
from radiobee.file2text import file2text
from radiobee.lists2cmat import lists2cmat
+from radiobee.gen_pset import gen_pset
+from radiobee.gen_aset import gen_aset
+from radiobee.align_texts import align_texts

# from radiobee.plot_df import plot_df
from radiobee.cmat2tset import cmat2tset
@@ -124,32 +129,30 @@ if __name__ == "__main__":
        gr.inputs.Slider(
            minimum=1,
            maximum=20,
-            step=1,
-            default=
-            # label="suggested min_samples value: 4-8",
+            step=0.1,
+            default=10,
        ),
        gr.inputs.Slider(
            minimum=1,
            maximum=20,
-            step=
-            default=
-            # label="suggested esp value: 1.7-3",
+            step=1,
+            default=6,
        ),
    ]

    # modi
    examples = [
-        ["data/test_zh.txt", "data/test_en.txt", "linear", "None", "None", "None",
-        ["data/test_en.txt", "data/test_zh.txt", "linear", "None", "None", "None",
-        ["data/shakespeare_zh500.txt", "data/shakespeare_en500.txt", "linear", "None", "None", "None",
-        ["data/shakespeare_en500.txt", "data/shakespeare_zh500.txt", "linear", "None", "None", "None",
-        ["data/hlm-ch1-zh.txt", "data/hlm-ch1-en.txt", "linear", "None", "None", "None",
-        ["data/hlm-ch1-en.txt", "data/hlm-ch1-zh.txt", "linear", "None", "None", "None",
+        ["data/test_zh.txt", "data/test_en.txt", "linear", "None", "None", "None", 10, 6, ],
+        ["data/test_en.txt", "data/test_zh.txt", "linear", "None", "None", "None", 10, 6, ],
+        ["data/shakespeare_zh500.txt", "data/shakespeare_en500.txt", "linear", "None", "None", "None", 10, 6, ],
+        ["data/shakespeare_en500.txt", "data/shakespeare_zh500.txt", "linear", "None", "None", "None", 10, 6, ],
+        ["data/hlm-ch1-zh.txt", "data/hlm-ch1-en.txt", "linear", "None", "None", "None", 10, 6, ],
+        ["data/hlm-ch1-en.txt", "data/hlm-ch1-zh.txt", "linear", "None", "None", "None", 10, 6, ],
    ]
    outputs = ["dataframe", "plot"]
    outputs = ["plot"]
    outputs = ["dataframe", "plot"]
-
+    out_df = gr.outputs.Dataframe(
        headers=None,
        max_rows=12,  # 20
        max_cols=None,
@@ -157,9 +160,28 @@ if __name__ == "__main__":
        type="auto",
        label="To be aligned",
    )
+    out_df_aligned = gr.outputs.Dataframe(
+        headers=None,
+        # max_rows=12,  # 20
+        max_cols=3,
+        overflow_row_behaviour="paginate",
+        type="auto",
+        label="aligned pairs",
+    )
+    out_file_dl = gr.outputs.File(
+        label="Click to download csv",
+    )
+    out_file_dl_excel = gr.outputs.File(
+        label="Click to download xlsx",
+    )
+
+    # modi outputs
    outputs = [
-
+        out_df,
        "plot",
+        out_file_dl,
+        out_file_dl_excel,
+        out_df_aligned,
    ]
    # outputs = ["dataframe", "plot", "plot"]  # won't work
    # outputs = ["dataframe"]
@@ -174,14 +196,14 @@ if __name__ == "__main__":
        idf_type,
        dl_type,
        norm,
+        eps,
        min_samples,
-        eps
    ):
        # modi fn
-        """Process inputs."""
+        """Process inputs and return outputs."""
        logger.debug(" *debug* ")

-        #
+        # convert "None" to None for those Radio types
        for _ in [idf_type, dl_type, norm]:
            if _ in "None":
                _ = None
@@ -190,159 +212,86 @@ if __name__ == "__main__":
        logger.info("file1.name: *%s*, file2.name: *%s*", file1.name, file2.name)

        # bypass if file1 or file2 is str input
-        if not (isinstance(file1, str) or isinstance(file2, str)):
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            # joblib.dump(lst1, f"data/{nameof(lst1)}.lzma")
-            # joblib.dump(lst2, f"data/{nameof(lst2)}.lzma")
-
-            # modi typing https://textacy.readthedocs.io/en/stable/api_reference/representations.html
-            # tf_type: Literal[linear, sqrt, log, binary] = 'linear'
-            # idf_tyep: Optional[Literal[standard, smooth, bm25]] = None
-            # dl_type: Optional[Literal[linear, sqrt, log]] = None
-            # norm: norm: Optional[Literal[l1, l2]] = None
-            # min_df: int | float = 1
-            # max_df: int | float = 1.0
-
-            # cmat = lists2cmat(lst1, lst2)
-            cmat = lists2cmat(
-                lst1,
-                lst2,
-                tf_type=tf_type,
-                idf_type=idf_type,
-                dl_type=dl_type,
-                norm=norm,
-            )
-
-            tset = pd.DataFrame(cmat2tset(cmat))
-            tset.columns = ["x", "y", "cos"]
-
-            # for debugging, logger.debug logger.info dont show up
-            # print("lst1: %s" % lst1)
-            # print("lst2: %s" % lst2)
-            # print("cmat: %s" % cmat)
-            # print("tset: %s" % tset)
-
-            logger.debug("lst1: %s", lst1)
-            logger.debug("lst2: %s", lst2)
-            logger.debug("cmat: %s", cmat)
-            logger.debug("tset: %s", tset)
-
-            # plt0 = plot_df(pd.DataFrame(cmat))
-            df_ = tset
-
-            # moved to inputs
-            # min_samples: int = 6
-            # eps: float = 10
-
-            # ylim: Optional[int] = None
-            xlabel: str = lang1
-            ylabel: str = lang2
-
-            sns.set()
-            sns.set_style("darkgrid")
-
-            # fig, (ax0, ax1) = plt.subplots(1, 2, figsize=(11.69, 8.27))
-            # fig, ([ax2, ax0], [ax1, ax3]) = plt.subplots(2, 2, figsize=(11.69, 8.27))
-            # fig, (ax2, ax0, ax1) = plt.subplots(3)
-            # fig, (ax2, ax0, ax1) = plt.subplots(3, figsize=(11.69, 8.27))
-            # fig, (ax2, ax0, ax1) = plt.subplots(1, 3, figsize=(36.69, 8.27))
-            # fig, (ax2, ax0, ax1) = plt.subplots(1, 3, figsize=(66.69, 22.27))
-            # fig, (ax2, ax0, ax1) = plt.subplots(1, 3)
-            # fig.subplots_adjust(hspace=.4)
-
-            fig = plt.figure()
-            gs = fig.add_gridspec(2, 2, wspace=0.4, hspace=0.58)
-            ax2 = fig.add_subplot(gs[0, 0])
-            ax0 = fig.add_subplot(gs[0, 1])
-            ax1 = fig.add_subplot(gs[1, 0])
-
-            cmap = "viridis_r"
-            sns.heatmap(cmat, cmap=cmap, ax=ax2).invert_yaxis()
-            ax2.set_xlabel(xlabel)
-            ax2.set_ylabel(ylabel)
-            ax2.set_title("cos similarity heatmap")
-
-            fig.suptitle("alignment projection")
-
-            _ = DBSCAN(min_samples=min_samples, eps=eps).fit(df_).labels_ > -1
-            _x = DBSCAN(min_samples=min_samples, eps=eps).fit(df_).labels_ < 0
-
-            df_.plot.scatter("x", "y", c="cos", cmap=cmap, ax=ax0)
-
-            # clustered
-            df_[_].plot.scatter("x", "y", c="cos", cmap=cmap, ax=ax1)
-
-            # outliers
-            df_[_x].plot.scatter("x", "y", c="r", marker="x", alpha=0.6, ax=ax0)
-
-            # ax0.set_xlabel("")
-            # ax0.set_ylabel("zh")
-            ax0.set_xlabel(xlabel)
-            ax0.set_ylabel(ylabel)
-
-            ax0.set_xlim(0, len1)
-            ax0.set_ylim(0, len2)
-            ax0.set_title("max along columns ('x': outliers)")
-
-            # ax1.set_xlabel("en")
-            # ax1.set_ylabel("zh")
-            ax1.set_xlabel(xlabel)
-            ax1.set_ylabel(ylabel)
-
-            ax1.set_xlim(0, len1)
-            ax1.set_ylim(0, len2)
-            ax1.set_title(f"potential aligned pairs ({round(sum(_) / len1, 2):.0%})")
-
-            # return df, plot_df(pd.DataFrame(cmat))
-            # tset.plot.scatter("x", "y", c="cos", cmap="viridis_r")
-        else:
-            fig, ax1 = plt.subplots()
-            df1 = pd.DataFrame(
-                [
-                    [5.1, 3.5, 0],
-                    [4.9, 3.0, 0],
-                    [7.0, 3.2, 1],
-                    [6.4, 3.2, 1],
-                    [5.9, 3.0, 2],
-                ],
-                columns=["length", "width", "species"],
-            )
-            df1.plot.scatter(x="length", y="width", c="DarkBlue", ax=ax1)
-            # plt_heatmap = plt
-
-            # plt.scatter(df.length, df.width)  # gradio return plt.gcf() or plt
+        # if not (isinstance(file1, str) or isinstance(file2, str)):
+        text1 = file2text(file1)
+        text2 = file2text(file2)
+        lang1, _ = fastlid(text1)
+        lang2, _ = fastlid(text2)
+
+        df1 = files2df(file1, file2)
+
+        lst1 = [elm for elm in df1.text1 if elm]
+        lst2 = [elm for elm in df1.text2 if elm]
+        len1 = len(lst1)
+        len2 = len(lst2)
+
+        cmat = lists2cmat(
+            lst1,
+            lst2,
+            tf_type=tf_type,
+            idf_type=idf_type,
+            dl_type=dl_type,
+            norm=norm,
+        )

-
-
-        # return df, df
-        # return df1.iloc[:10, :], plt
+        tset = pd.DataFrame(cmat2tset(cmat))
+        tset.columns = ["x", "y", "cos"]

-
-        # pd.concat([df0.iloc[:2, :], pd.DataFrame([[".", ".", "..."]], columns=df0.columns), df0.iloc[-1:, :]], ignore_index=1)
+        df_ = tset

-
-
+        xlabel: str = lang1
+        ylabel: str = lang2

-
+        sns.set()
+        sns.set_style("darkgrid")
+
+        fig = plt.figure()
+        gs = fig.add_gridspec(2, 2, wspace=0.4, hspace=0.58)
+        ax2 = fig.add_subplot(gs[0, 0])
+        ax0 = fig.add_subplot(gs[0, 1])
+        ax1 = fig.add_subplot(gs[1, 0])
+
+        cmap = "viridis_r"
+        sns.heatmap(cmat, cmap=cmap, ax=ax2).invert_yaxis()
+        ax2.set_xlabel(xlabel)
+        ax2.set_ylabel(ylabel)
+        ax2.set_title("cos similarity heatmap")
+
+        fig.suptitle("alignment projection")
+
+        _ = DBSCAN(min_samples=min_samples, eps=eps).fit(df_).labels_ > -1
+        _x = DBSCAN(min_samples=min_samples, eps=eps).fit(df_).labels_ < 0
+
+        df_.plot.scatter("x", "y", c="cos", cmap=cmap, ax=ax0)
+
+        # clustered
+        df_[_].plot.scatter("x", "y", c="cos", cmap=cmap, ax=ax1)
+
+        # outliers
+        df_[_x].plot.scatter("x", "y", c="r", marker="x", alpha=0.6, ax=ax0)
+
+        # ax0.set_xlabel("")
+        # ax0.set_ylabel("zh")
+        ax0.set_xlabel(xlabel)
+        ax0.set_ylabel(ylabel)
+
+        ax0.set_xlim(0, len1)
+        ax0.set_ylim(0, len2)
+        ax0.set_title("max along columns ('x': outliers)")
+
+        # ax1.set_xlabel("en")
+        # ax1.set_ylabel("zh")
+        ax1.set_xlabel(xlabel)
+        ax1.set_ylabel(ylabel)
+
+        ax1.set_xlim(0, len1)
+        ax1.set_ylim(0, len2)
+        ax1.set_title(f"potential aligned pairs ({round(sum(_) / len1, 2):.0%})")
+
+        # return df, plot_df(pd.DataFrame(cmat))
+        # tset.plot.scatter("x", "y", c="cos", cmap="viridis_r")
+
+        df_trimmed = pd.concat(
            [
                df1.iloc[:4, :],
                pd.DataFrame(
@@ -359,10 +308,30 @@ if __name__ == "__main__":
            ignore_index=1,
        )

-
-
+        # process lst1, lst2 to obtain df_aligned
+        pset = gen_pset(
+            cmat,
+            eps=eps,
+            min_samples=min_samples,
+            delta=7,
+        )
+        src_len, tgt_len = cmat.shape
+        aset = gen_aset(pset, src_len, tgt_len)
+        final_list = align_texts(aset, lst2, lst1)  # note the order

-
+        # df_aligned = df_trimmed
+        df_aligned = pd.DataFrame(final_list, columns=["text1", "text2", "likelihood"])
+
+        _ = df_aligned.to_csv(index=False)
+        file_dl = Path(f"{Path(file1.name).stem[:-8]}-{Path(file2.name).stem[:-8]}.csv")
+        file_dl.write_text(_, encoding="utf8")
+
+        file_dl_xlsx = Path(f"{Path(file1.name).stem[:-8]}-{Path(file2.name).stem[:-8]}.xlsx")
+        df_aligned.to_excel(file_dl_xlsx)
+
+        # return df_trimmed, plt
+        return df_trimmed, plt, file_dl, file_dl_xlsx, df_aligned
+        # modi outputs

    server_port = 7860
    with socket(AF_INET, SOCK_STREAM) as sock:
@@ -382,8 +351,8 @@ if __name__ == "__main__":
    ## NB
    * Click "Clear" first for subsequent submits when uploading files.
    * `tf_type` `idf_type` `dl_type` `norm`: Normally there is no need to touch these unless you know what you are doing.
-    * Suggested `
-    - Smaller `
+    * Suggested `esp` and `min_samples` values -- `esp` (minimum epsilon): 8-12, `min_samples`: 4-8.
+    - A larger `esp` or a smaller `min_samples` will result in more aligned pairs but also more **false positives** (pairs falsely identified as candidates). On the other hand, a smaller `esp` or a larger `min_samples` tends to miss 'good' pairs.
    * If you need to have a better look at the image, you can right-click on the image and select copy-image-address and open a new tab in the browser with the copied image address.
    * `Flag`: Should something go wrong, you can click Flag to save the output and inform the developer.
    """
@@ -408,7 +377,7 @@ if __name__ == "__main__":
        inputs=inputs,
        outputs=outputs,
        title="radiobee-aligner🔠",
-        description="showcasing a blazing fast dualtext aligner, currently supported language pairs: en-zh/zh-en",
+        description="WIP showcasing a blazing fast dualtext aligner, currently supported language pairs: en-zh/zh-en",
        article=article,
        examples=examples,
        # theme="darkgrass",
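Note: taken together, the new process() path is cmat -> gen_pset -> gen_aset -> align_texts, with the result written out as CSV and XLSX. A minimal standalone sketch of that flow, assuming the helpers behave as in the hunks above; the input paths come from the repo's examples, while the output file names here are picked by hand rather than derived from gradio temp-file names as in process():

from pathlib import Path

import pandas as pd

from radiobee.lists2cmat import lists2cmat
from radiobee.gen_pset import gen_pset
from radiobee.gen_aset import gen_aset
from radiobee.align_texts import align_texts

# non-empty lines of the two texts to align
lst1 = [elm for elm in Path("data/test_en.txt").read_text("utf8").splitlines() if elm.strip()]
lst2 = [elm for elm in Path("data/test_zh.txt").read_text("utf8").splitlines() if elm.strip()]

cmat = lists2cmat(lst1, lst2, tf_type="linear")        # similarity matrix
pset = gen_pset(cmat, eps=10, min_samples=6, delta=7)  # anchor pairs
src_len, tgt_len = cmat.shape
aset = gen_aset(pset, src_len, tgt_len)                # full align set with "" gaps
final_list = align_texts(aset, lst2, lst1)             # note the order

df_aligned = pd.DataFrame(final_list, columns=["text1", "text2", "likelihood"])
df_aligned.to_csv("test_en-test_zh.csv", index=False)
df_aligned.to_excel("test_en-test_zh.xlsx")            # needs openpyxl (added to requirements.txt)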
radiobee/align_texts.py
ADDED
@@ -0,0 +1,57 @@
+"""Align texts based on aset, src_text, tgt_text."""
+from typing import List, Tuple, Union
+from logzero import logger
+
+
+# fmt: off
+def align_texts(
+    aset: List[Tuple[Union[str, float], Union[str, float], Union[str, float]]],
+    src_text: List[str],
+    tgt_text: List[str],
+) -> List[Tuple[Union[str], Union[str], Union[str, float]]]:
+    # fmt: on
+    """Align texts (paras/sents) based on aset, src_text, tgt_text.
+
+    Args:
+        aset: align set
+        src_text: source text
+        tgt_text: target text
+
+    Returns:
+        aligned texts with possible metrics
+    """
+    xset, yset, metrics = zip(*aset)  # unzip aset
+    xset = [elm for elm in xset if elm != ""]
+    yset = [elm for elm in yset if elm != ""]
+
+    if (len(xset), len(yset)) != (len(tgt_text), len(src_text)):
+        logger.warning(
+            " (%s, %s) != (%s, %s) ", len(xset), len(yset), len(tgt_text), len(src_text)
+        )
+        # raise Exception(" See previous message")
+
+    texts = []
+    for elm in aset:
+        elm0, elm1, elm2 = elm
+        _ = []
+
+        # src_text first
+        if isinstance(elm1, str):
+            _.append("")
+        else:
+            _.append(src_text[int(elm1)])
+
+        if isinstance(elm0, str):
+            _.append("")
+        else:
+            _.append(tgt_text[int(elm0)])
+
+        if isinstance(elm2, str):
+            _.append("")
+        else:
+            _.append(round(elm2, 2))
+
+        texts.append(tuple(_))
+
+    # return [("", "", 0.)]
+    return texts
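A tiny hypothetical example of the helper above (texts and indices invented; the expected output is simply a hand trace of the loop): tgt_text has one extra paragraph with no counterpart, so it comes out paired with an empty string.

from radiobee.align_texts import align_texts

src_text = ["src para 0", "src para 1"]
tgt_text = ["tgt para 0", "tgt para 1", "tgt para 2"]
aset = [(0, 0, 0.67), (1, "", ""), (2, 1, 0.59)]  # (tgt index, src index, metric or "")

print(align_texts(aset, src_text, tgt_text))
# [('src para 0', 'tgt para 0', 0.67), ('', 'tgt para 1', ''), ('src para 1', 'tgt para 2', 0.59)]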
radiobee/cmat2tset.py
CHANGED
@@ -46,11 +46,11 @@ def cmat2tset(
    """
    low_ = cmat.min() - 1
    argmax_max = []
-    src_len, tgt_len = cmat.shape
+    src_len, tgt_len = cmat.shape  # ylim, xlim
    for _ in range(min(src_len, tgt_len)):
        argmax = int(cmat.argmax())
        row, col = divmod(argmax, tgt_len)
-        argmax_max.append([col, row, cmat.max()])
+        argmax_max.append([col, row, cmat.max()])  # x-axis, y-axis

        # erase row-th row and col-th col of cmat
        cmat[row, :] = low_
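The change here is comment-only, but it pins down the convention: each triple is (x = column index, y = row index, score). A toy re-run of the greedy loop on a made-up 3x3 matrix, assuming the column is blanked right after the row (that line sits just outside the hunk):

import numpy as np

cmat = np.array(
    [
        [0.9, 0.1, 0.0],
        [0.2, 0.8, 0.1],
        [0.0, 0.2, 0.7],
    ]
)
low_ = cmat.min() - 1
src_len, tgt_len = cmat.shape  # ylim, xlim
triples = []
for _ in range(min(src_len, tgt_len)):
    argmax = int(cmat.argmax())
    row, col = divmod(argmax, tgt_len)
    triples.append([col, row, cmat.max()])  # x-axis, y-axis, score
    cmat[row, :] = low_  # erase the row of the picked cell ...
    cmat[:, col] = low_  # ... and its column

print(triples)  # [[0, 0, 0.9], [1, 1, 0.8], [2, 2, 0.7]]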
radiobee/gen_aset.py
ADDED
@@ -0,0 +1,59 @@
+"""Generate align set (aset) based on pset (pair set), src_len and tgt_len."""
+from typing import List, Tuple, Union
+from itertools import zip_longest
+
+# from logzero import logger
+
+
+# fmt: off
+def gen_aset(
+    pset: List[Tuple[int, int, float]],
+    src_len: int,  # n_rows
+    tgt_len: int,  # n_cols
+) -> List[Tuple[Union[str, float], Union[str, float], Union[str, float]]]:
+    # fmt: on
+    """Generate align set (aset) based on pset, src_len and tgt_len.
+
+    src_len, tgt_len = cmat.shape
+    zip_longest(..., fillvalue="")
+
+    Args:
+        pset: [x(lang2 zh), y(lang1 en), cos]
+        src_len: lang1 (en)
+        tgt_len: lang2 (zh)
+
+    Returns:
+        aset:
+            [0...tgt_len, 0...src_len]
+            [0, 0, .]
+            ...
+            [tgt_len-1, src_len-1, .]
+    """
+    # empty pset []
+    if not pset:
+        return [*zip_longest(range(tgt_len), range(src_len), fillvalue="")]
+    # empty [[]]
+    if len(pset) == 1:
+        if not pset[0]:
+            return [*zip_longest(range(tgt_len), range(src_len), fillvalue="")]
+
+    buff = []
+    pos0, pos1 = -1, -1
+    for elm in pset:
+        # elm0, elm1, elm2 = elm
+        elm0, elm1, *elm2 = elm
+        elm0 = int(elm0)
+        elm1 = int(elm1)
+        interval = max(elm0 - pos0 - 1, elm1 - pos1 - 1)
+        _ = zip_longest(range(pos0 + 1, elm0), range(pos1 + 1, elm1), [""] * interval, fillvalue="")
+        buff.extend(_)
+        buff.append(elm)
+        pos0, pos1 = elm0, elm1
+
+    # last batch if any
+    elm0, elm1 = tgt_len, src_len
+    interval = max(elm0 - pos0 - 1, elm1 - pos1 - 1)
+    _ = zip_longest(range(pos0 + 1, elm0), range(pos1 + 1, elm1), [""] * interval, fillvalue="")
+    buff.extend(_)
+
+    return buff
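To make the zip_longest padding concrete, a small hand-run trace with invented numbers (two anchor pairs from a hypothetical 5x5 cmat; unmatched positions on either axis are padded with ""):

from radiobee.gen_aset import gen_aset

pset = [(1, 1, 0.9), (3, 4, 0.8)]  # (x ~ tgt axis, y ~ src axis, cos)
src_len, tgt_len = 5, 5

print(gen_aset(pset, src_len, tgt_len))
# [(0, 0, ''), (1, 1, 0.9), (2, 2, ''), ('', 3, ''), (3, 4, 0.8), (4, '', '')]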
radiobee/gen_eps_minsamples.py
ADDED
@@ -0,0 +1,13 @@
+"""Gen suggested eps min_samples."""
+
+
+def gen_eps_minsamples(src_len: int, tgt_len: int) -> dict:
+    """Gen suggested eps min_samples."""
+    eps = src_len * 0.01
+    if eps < 3:
+        eps = 3
+
+    min_samples = tgt_len / 100 * 0.5
+    if min_samples < 3:
+        min_samples = 3
+    return {"eps": eps, "min_samples": min_samples}
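A quick sanity check with made-up lengths; note that gen_pset above still takes eps and min_samples directly (defaults 10 and 6), so this helper only suggests values:

from radiobee.gen_eps_minsamples import gen_eps_minsamples

print(gen_eps_minsamples(500, 600))    # {'eps': 5.0, 'min_samples': 3.0}
print(gen_eps_minsamples(2000, 1800))  # {'eps': 20.0, 'min_samples': 9.0}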
radiobee/gen_pset.py
ADDED
@@ -0,0 +1,140 @@
+"""Gen pset from cmat. Find pairs for a given cmat.
+
+tinybee.find_pairs.py with fixed estimator='dbscan' eps=eps, min_samples=min_samples
+"""
+from typing import List, Tuple, Union
+
+import numpy as np
+import pandas as pd
+from sklearn.cluster import DBSCAN
+import logzero
+from logzero import logger
+from radiobee.cmat2tset import cmat2tset
+from radiobee.interpolate_pset import interpolate_pset
+
+
+def gen_pset(
+    cmat1: Union[List[List[float]], np.ndarray, pd.DataFrame],
+    eps: float = 10,
+    min_samples: int = 6,
+    delta: float = 7,
+    verbose: Union[bool, int] = False,
+) -> List[Tuple[int, int, Union[float, str]]]:
+    """Gen pset from cmat.
+    Find pairs for a given cmat.
+
+    Args:
+        cmat: correlation/similarity matrix
+        eps: min epsilon for DBSCAN (10)
+        min_samples: minimum # of samples for DBSCAN (6)
+        delta: tolerance (7)
+
+    Returns:
+        pairs + "" or metric (float)
+
+    dbscan_pairs' setup
+        if eps is None:
+            eps = src_len * .01
+            if eps < 3:
+                eps = 3
+        if min_samples is None:
+            min_samples = tgt_len / 100 * 0.5
+            if min_samples < 3:
+                min_samples = 3
+
+    def gen_eps_minsamples(src_len, tgt_len):
+        eps = src_len * .01
+        if eps < 3:
+            eps = 3
+
+        min_samples = tgt_len / 100 * 0.5
+        if min_samples < 3:
+            min_samples = 3
+        return {"eps": eps, "min_samples": min_samples}
+
+    """
+    if isinstance(verbose, bool):
+        if verbose:
+            verbose = 10
+        else:
+            verbose = 20
+    logzero.loglevel(verbose)
+
+    # if isinstance(cmat, list):
+    cmat = np.array(cmat1)
+
+    src_len, tgt_len = cmat.shape
+
+    # tset = cmat2tset(cmat)
+    tset = cmat2tset(cmat).tolist()
+
+    logger.debug("tset: %s", tset)
+
+    # iset = gen_iset(cmat, verbose=verbose, estimator=estimator)
+    labels = DBSCAN(eps=eps, min_samples=min_samples).fit(tset).labels_
+
+    df_tset = pd.DataFrame(tset, columns=["x", "y", "cos"])
+    cset = df_tset[labels > -1].to_numpy()
+
+    # sort cset
+    _ = sorted(cset.tolist(), key=lambda x: x[0])
+    iset = interpolate_pset(_, tgt_len)
+
+    # *_, ymax = zip(*tset)
+    # ymax = list(ymax)
+    # low_ = np.min(ymax) - 1  # reset to minimum_value - 1
+
+    buff = [(-1, -1, ""), (tgt_len, src_len, "")]
+    # for _ in range(tgt_len):
+    for idx, tset_elm in enumerate(tset):
+        logger.debug("buff: %s", buff)
+        # position max in ymax and insert in buff
+        # if with range given by iset+-delta and
+        # it's valid (do not exceed constraint
+        # by neighboring points
+
+        # argmax = int(np.argmax(ymax))
+
+        # logger.debug("=== %s,%s === %s", _, argmax, tset[_])
+        logger.debug("=== %s === %s", _, tset_elm)
+
+        # ymax[_] = low_
+        # elm = tset[argmax]
+        # elm0, *_ = elm
+
+        elm0, *_ = tset_elm
+
+        # position elm in buff
+        idx = -1  # for making pyright happy
+        for idx, loc in enumerate(buff):
+            if loc[0] > elm0:
+                break
+        else:
+            idx += 1  # last
+
+        # insert elm in for valid elm
+        # (within range inside two neighboring points)
+
+        # pos = int(tset[argmax][0])
+        pos = int(tset_elm[0])
+        logger.debug(" %s <=> %s ", tset_elm, iset[pos])
+
+        # if abs(tset[argmax][1] - iset[pos][1]) <= delta:
+        if abs(tset_elm[1] - iset[pos][1]) <= delta:
+            if tset_elm[1] > buff[idx - 1][1] and tset_elm[1] < buff[idx][1]:
+                buff.insert(idx, tset_elm)
+                logger.debug("idx: %s, tset_elm: %s", idx, tset_elm)
+            else:
+                logger.debug("\t***\t idx: %s, tset_elm: %s", idx, tset_elm)
+        _ = """
+        if abs(tset[loc][1] - iset[loc][1]) <= delta:
+            if tset[loc][1] > buff[idx][1] and tset[loc][1] < buff[idx + 1][1]:
+                buff.insert(idx + 1, tset[loc])
+        # """
+
+    # remove first and last entry in buff
+    buff.pop(0)
+    buff.pop()
+
+    # return [(1, 1, "")]
+    return [(int(elm0), int(elm1), elm2) for elm0, elm1, elm2 in buff]
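The gist of gen_pset: take the greedy (x, y, cos) candidates from cmat2tset, keep the DBSCAN inliers as anchors, interpolate a guide line over the x axis, and only accept candidates within +/- delta of that guide. A rough self-contained sketch of the anchor/guide part on a synthetic matrix with a planted diagonal (an illustration of the idea, not the function itself):

import numpy as np
from sklearn.cluster import DBSCAN

from radiobee.cmat2tset import cmat2tset
from radiobee.interpolate_pset import interpolate_pset

rng = np.random.default_rng(0)
cmat = rng.random((120, 100)) * 0.1  # low background noise
for x in range(100):                 # plant a clear diagonal to recover
    cmat[int(x * 1.2), x] = 0.9

tset = cmat2tset(cmat.copy()).tolist()  # greedy (x, y, cos) candidates
labels = DBSCAN(eps=10, min_samples=6).fit(tset).labels_

anchors = sorted((t for t, lab in zip(tset, labels) if lab > -1), key=lambda t: t[0])
guide = interpolate_pset(anchors, cmat.shape[1])  # one (x, y) per column

print(len(anchors), guide[:5])  # e.g. 100 [(0, 0), (1, 1), (2, 2), (3, 3), (4, 4)]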
radiobee/gen_row_alignment.py
ADDED
@@ -0,0 +1,151 @@
+"""Gen proper alignment for a given triple_set.
+
+cmat = fetch_sent_corr(src, tgt)
+src_len, tgt_len = np.array(cmat).shape
+r_ali = gen_row_alignment(cmat, tgt_len, src_len)  # note the order
+src[r_ali[1]], tgt[r_ali[0]], r_ali[2]
+
+or !!! (target, source)
+cmat = fetch_sent_corr(tgt, src)  # note the order
+src_len, tgt_len = np.array(cmat).shape
+r_ali = gen_row_alignment(cmat, src_len, tgt_len)
+src[r_ali[0]], tgt[r_ali[1]], r_ali[2]
+
+---
+src_txt = 'data/wu_ch2_en.txt'
+tgt_txt = 'data/wu_ch2_zh.txt'
+
+assert Path(src_txt).exists()
+assert Path(tgt_txt).exists()
+
+src_text, _ = load_paras(src_txt)
+tgt_text, _ = load_paras(tgt_txt)
+
+cos_matrix = gen_cos_matrix(src_text, tgt_text)
+t_set, m_matrix = find_aligned_pairs(cos_matrix0, thr=0.4, matrix=True)
+
+resu = gen_row_alignment(t_set, src_len, tgt_len)
+resu = np.array(resu)
+
+idx = -1
+idx += 1; (resu[idx], src_text[int(resu[idx, 0])],
+tgt_text[int(resu[idx, 1])]) if all(resu[idx]) else resu[idx]
+
+idx += 1; i0, i1, i2 = resu[idx]; '***' if i0 == ''
+else src_text[int(i0)], '***' if i1 == '' else tgt_text[int(i1)], ''
+if i2 == '' else i2
+"""
+# pylint: disable=line-too-long
+from typing import List, Union
+
+# natural extrapolation with slope equal to 1
+from itertools import zip_longest as zip_longest_middle
+
+import numpy as np
+
+from logzero import logger
+
+# from tinybee.zip_longest_middle import zip_longest_middle
+
+# from tinybee.zip_longest_middle import zip_longest_middle
+# from tinybee.find_pairs import find_pairs
+
+# logger = logging.getLogger(__name__)
+# logger.addHandler(logging.NullHandler())
+
+
+def gen_row_alignment(  # pylint: disable=too-many-locals
+    t_set,
+    src_len,
+    tgt_len,
+    # ) -> List[Tuple[Union[str, int], Union[str, int], Union[str, float]]]:
+) -> List[List[Union[str, float]]]:
+    """Gen proper rows for given triple_set.
+
+    Arguments:
+        [t_set {np.array or list}] -- [nll matrix]
+        [src_len {int}] -- number of source texts (paras/sents)
+        [tgt_len {int}] -- number of target texts (paras/sents)
+
+    Returns:
+        [np.array] -- [proper rows]
+    """
+    t_set = np.array(t_set, dtype="object")
+
+    # len0 = src_len
+
+    # len1 tgt text length, must be provided
+    len1 = tgt_len
+
+    # rearrange t_set as buff in increasing order
+    buff = [[-1, -1, ""]]  #
+    idx_t = 0
+    # for elm in t_set:
+    # start with bigger value from the 3rd col
+
+    y00, yargmax, ymax = zip(*t_set)
+    ymax_ = np.array(ymax).copy()
+    reset_v = np.min(ymax_) - 1
+    for count in range(tgt_len):
+        argmax = np.argmax(ymax_)
+        # reset
+        ymax_[argmax] = reset_v
+        idx_t = argmax
+        elm = t_set[idx_t]
+        logger.debug("%s: %s, %s", count, idx_t, elm)
+
+        # find loc to insert
+        elm0, elm1, elm2 = elm
+        idx = -1
+        for idx, loc in enumerate(buff):
+            if loc[0] > elm0:
+                break
+        else:
+            idx += 1  # last
+
+        # make sure elm1 is within the range
+        # prev elm1 < elm1 < next elm1
+        if elm1 > buff[idx - 1][1]:
+            try:  # overflow possible (idx + 1 in # last)
+                next_elm = buff[idx][1]
+            except IndexError:
+                next_elm = len1
+            if elm1 < next_elm:
+                # insert '' if necessary
+                # using zip_longest_middle
+                buff.insert(
+                    idx, [elm0, elm1, elm2],
+                )
+        # logger.debug('---')
+
+        idx_t += 1
+        # if idx_t == 24:  # 20:
+        #     break
+
+    # remove [-1, -1]
+    # buff.pop(0)
+    # buff = np.array(buff, dtype='object')
+
+    # take care of the tail
+    buff += [[src_len, tgt_len, ""]]
+
+    resu = []
+    # merit = []
+
+    for idx, elm in enumerate(buff[1:]):
+        idx1 = idx + 1
+        elm0_, elm1_, elm2_ = buff[idx1 - 1]  # idx starts from 0
+        elm0, elm1, elm2 = elm
+        del elm2_, elm2
+
+        tmp0 = zip_longest_middle(
+            list(range(elm0_ + 1, elm0)), list(range(elm1_ + 1, elm1)), fillvalue="",
+        )
+        # convert to list entries & attach merit
+        tmp = [list(t_elm) + [""] for t_elm in tmp0]
+
+        # update resu
+        resu += tmp + [buff[idx1]]
+
+    # remove the last entry
+    return resu[:-1]
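The module docstring above already shows the intended call pattern; for completeness, a hypothetical run with three hand-made triples (the expected rows follow from the two loops above, with "" padding between anchors):

from radiobee.gen_row_alignment import gen_row_alignment

t_set = [[0, 0, 0.9], [2, 3, 0.8], [4, 5, 0.7]]  # (index, index, score) triples, invented
resu = gen_row_alignment(t_set, src_len=5, tgt_len=6)
for row in resu:
    print(row)
# [0, 0, 0.9]
# [1, 1, '']
# ['', 2, '']
# [2, 3, 0.8]
# [3, 4, '']
# [4, 5, 0.7]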
radiobee/interpolate_pset.py
ADDED
@@ -0,0 +1,41 @@
+"""Interpolate np.nan."""
+from typing import List, Tuple
+import numpy as np
+import pandas as pd
+
+
+# fmt: off
+def interpolate_pset(
+    pairs: List[Tuple[int, int, float]],
+    tgt_len: int,
+    method: str = 'linear',
+    limit_direction: str = 'both',
+) -> List[Tuple[int, int]]:
+    # fmt: on
+    """Interpolate.
+
+    Args:
+        pairs: integer pairs, some np.nan
+        tgt_len: over 0...tgt_len-1 (x-axis, cmat.shape[1])
+        method: for use in pd.DataFrame.interpolate
+        limit_direction: for use in pd.DataFrame.interpolate
+    Returns:
+        np.nan converted
+    """
+    y00, *_ = zip(*pairs)
+
+    res = []
+    for idx in range(tgt_len):
+        if idx in y00:
+            loc = y00.index(idx)
+            res.append(tuple(pairs[loc][:2]))
+        else:
+            res.append((idx, np.nan))
+
+    df = pd.DataFrame(res, columns=["y00", "yargmax"])
+    _ = df.interpolate(method=method, limit_direction=limit_direction, axis=0)
+
+    _ = _.to_numpy(dtype=int)
+    _ = [(int(elm0), int(elm1)) for elm0, elm1 in _]
+
+    return _
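A small hedged example (anchor values invented): the x positions between the two known anchors are filled in linearly, then cast to int.

from radiobee.interpolate_pset import interpolate_pset

pairs = [(0, 0, 0.9), (5, 10, 0.8)]  # (x, y, cos) anchors
print(interpolate_pset(pairs, tgt_len=6))
# [(0, 0), (1, 2), (2, 4), (3, 6), (4, 8), (5, 10)]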
requirements.txt
CHANGED
@@ -12,5 +12,6 @@ seaborn
cchardet
tabulate
git+https://github.com/ffreemt/fast-langid
-# dotenv
+# python-dotenv
varname
+openpyxl