Spaces:

mikeee
/

radiobee-aligner

Build error

App Files Files Community

freemt committed on Jan 20, 2022

Commit

52a9494

1 Parent(s): 4c04f50

Update pyc

Browse files

Files changed (10) hide show

img/plt.png +0 -0
radiobee/__main__.py +65 -7
radiobee/align_sents.py +7 -2
radiobee/align_sents.pyc +0 -0
radiobee/error_msg.py +2 -2
radiobee/gradiobee.py +68 -5
radiobee/paras2sents.pyc +0 -0
radiobee/shuffle_sents.pyc +0 -0
run-pydocstle.bat → run-pydocstyle.bat +0 -0
tests/test_paras2sents.py +8 -2

img/plt.png CHANGED Viewed

radiobee/__main__.py CHANGED Viewed

@@ -4,6 +4,8 @@ from typing import Any, Tuple, Optional, Union  # noqa
 import sys
 from pathlib import Path  # noqa
 import platform
 import signal
 from random import randint
@@ -41,7 +43,10 @@ from radiobee.process_upload import process_upload
 from radiobee.gradiobee import gradiobee
 ic_install()
-ic.configureOutput(includeContext=True)
 ic.enable()
 # ic.disable()  # to turn off
@@ -105,6 +110,12 @@ if __name__ == "__main__":
         debug = False
         debug = True
         share = True
     else:
         server_name = "127.0.0.1"
         share = False
@@ -128,7 +139,6 @@ if __name__ == "__main__":
         gr.inputs.File(label="file 2", optional=True),
     ]
-    # modi 1
     _ = """
         tf_type: Literal[linear, sqrt, log, binary] = 'linear'
         idf_type: Optional[Literal[standard, smooth, bm25]] = None
@@ -148,10 +158,13 @@ if __name__ == "__main__":
     )  # ditto
     input_norm_type = gr.inputs.Radio(["None", "l1", "l2"], default="None")  # ditto
-    inputs = [
         gr.inputs.File(label="file 1"),
         gr.inputs.File(label="file 2", optional=True),
-        input_tf_type,  # modi inputs
         input_idf_type,
         input_dl_type,
         input_norm_type,
@@ -167,6 +180,7 @@ if __name__ == "__main__":
             step=1,
             default=6,
         ),
     ]
     examples = [
@@ -179,16 +193,40 @@ if __name__ == "__main__":
             "None",
             10,
             6,
         ],
         [
             "data/test_en.txt",
             "data/test_zh.txt",
             "linear",
             "None",
             "None",
             "None",
             10,
             6,
         ],
         [
             "data/shakespeare_zh500.txt",
@@ -199,6 +237,7 @@ if __name__ == "__main__":
             "None",
             10,
             6,
         ],
         [
             "data/shakespeare_en500.txt",
@@ -209,6 +248,7 @@ if __name__ == "__main__":
             "None",
             10,
             6,
         ],
         [
             "data/hlm-ch1-zh.txt",
@@ -219,6 +259,7 @@ if __name__ == "__main__":
             "None",
             10,
             6,
         ],
         [
             "data/hlm-ch1-en.txt",
@@ -229,6 +270,7 @@ if __name__ == "__main__":
             "None",
             10,
             6,
         ],
         [
             "data/ps-cn.txt",
@@ -239,6 +281,7 @@ if __name__ == "__main__":
             "None",
             10,
             4,
         ],
         [
             "data/test-dual.txt",
@@ -249,6 +292,7 @@ if __name__ == "__main__":
             "None",
             10,
             6,
         ],
         [
             "data/英译中国现代散文选1(汉外对照丛书).txt",
@@ -259,6 +303,7 @@ if __name__ == "__main__":
             "None",
             10,
             6,
         ],
         [
             "data/test-zh-ja.txt",
@@ -269,6 +314,7 @@ if __name__ == "__main__":
             "None",
             10,
             6,
         ],
         [
             "data/xiyouji-ch1-zh.txt",
@@ -279,6 +325,7 @@ if __name__ == "__main__":
             "None",
             10,
             6,
         ],
         [
             "data/demian-hesse-de.txt",
@@ -289,6 +336,7 @@ if __name__ == "__main__":
             "None",
             10,
             6,
         ],
         [
             "data/catcher-in-the-rye-shixianrong-zh.txt",
@@ -299,6 +347,7 @@ if __name__ == "__main__":
             "None",
             10,
             6,
         ],
     ]
@@ -329,14 +378,23 @@ if __name__ == "__main__":
     out_file_dl_excel = gr.outputs.File(
         label="Click to download xlsx",
     )
-    # modi outputs
-    outputs = [
         out_df,
-        # "plot",
         gr.outputs.Image(label="plot"),
         out_file_dl,
         out_file_dl_excel,
         out_df_aligned,
         gr.outputs.HTML(),
     ]

 import sys
 from pathlib import Path  # noqa
+import subprocess as sp
+import shlex
 import platform
 import signal
 from random import randint
 from radiobee.gradiobee import gradiobee
 ic_install()
+ic.configureOutput(
+    includeContext=True,
+    outputFunction=logger.info,
+)
 ic.enable()
 # ic.disable()  # to turn off
         debug = False
         debug = True
         share = True
+        # set UTC+8, probably won't work in hf spaces, no permission
+        try:
+            sp.check_output(shlex.split("ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime"))
+        except Exception as exc:
+            logger.error(" set timezonef failed: %s", exc)
     else:
         server_name = "127.0.0.1"
         share = False
         gr.inputs.File(label="file 2", optional=True),
     ]
     _ = """
         tf_type: Literal[linear, sqrt, log, binary] = 'linear'
         idf_type: Optional[Literal[standard, smooth, bm25]] = None
     )  # ditto
     input_norm_type = gr.inputs.Radio(["None", "l1", "l2"], default="None")  # ditto
+    # modi inputs 1, definitions
+    sent_ali_algo = gr.inputs.Radio(["None", "fast", "slow"], default="None")
+    inputs = [  # tot. 9, need to modify input of gradio & examples
         gr.inputs.File(label="file 1"),
         gr.inputs.File(label="file 2", optional=True),
+        input_tf_type,  # modi inputs 2
         input_idf_type,
         input_dl_type,
         input_norm_type,
             step=1,
             default=6,
         ),
+        sent_ali_algo,
     ]
     examples = [
             "None",
             10,
             6,
+            "None",
         ],
         [
+            "data/test_zh.txt",
             "data/test_en.txt",
+            "linear",
+            "None",
+            "None",
+            "None",
+            10,
+            6,
+            "fast",
+        ],
+        [
             "data/test_zh.txt",
+            "data/test_en.txt",
             "linear",
             "None",
             "None",
             "None",
             10,
             6,
+            "slow",
+        ],
+        [
+            "data/test_en.txt",
+            "data/test_zh.txt",
+            "linear",
+            "None",
+            "None",
+            "None",
+            10,
+            6,
+            "None",
         ],
         [
             "data/shakespeare_zh500.txt",
             "None",
             10,
             6,
+            "None",
         ],
         [
             "data/shakespeare_en500.txt",
             "None",
             10,
             6,
+            "None",
         ],
         [
             "data/hlm-ch1-zh.txt",
             "None",
             10,
             6,
+            "None",
         ],
         [
             "data/hlm-ch1-en.txt",
             "None",
             10,
             6,
+            "None",
         ],
         [
             "data/ps-cn.txt",
             "None",
             10,
             4,
+            "None",
         ],
         [
             "data/test-dual.txt",
             "None",
             10,
             6,
+            "None",
         ],
         [
             "data/英译中国现代散文选1(汉外对照丛书).txt",
             "None",
             10,
             6,
+            "None",
         ],
         [
             "data/test-zh-ja.txt",
             "None",
             10,
             6,
+            "None",
         ],
         [
             "data/xiyouji-ch1-zh.txt",
             "None",
             10,
             6,
+            "None",
         ],
         [
             "data/demian-hesse-de.txt",
             "None",
             10,
             6,
+            "None",
         ],
         [
             "data/catcher-in-the-rye-shixianrong-zh.txt",
             "None",
             10,
             6,
+            "None",
         ],
     ]
     out_file_dl_excel = gr.outputs.File(
         label="Click to download xlsx",
     )
+    out_sents_dl = gr.outputs.File(
+        label="Click to download sents csv",
+    )
+    out_sents_dl_excel = gr.outputs.File(
+        label="Click to download sents xlsx",
+    )
+    # modi outputs 1, definitions
+    # modi outputs 2, need to modify gradio error_msg
+    outputs = [  # tot. 8
         out_df,
         gr.outputs.Image(label="plot"),
         out_file_dl,
         out_file_dl_excel,
+        out_sents_dl,
+        out_sents_dl_excel,
         out_df_aligned,
         gr.outputs.HTML(),
     ]

radiobee/align_sents.py CHANGED Viewed

@@ -67,6 +67,11 @@ def align_sents(lst1: List[str], lst2: List[str]) -> List[Tuple[str, str]]:
         texts.append(tuple(_))
-    return texts
-    # return ["", ""]

         texts.append(tuple(_))
+    _ = """
+    _ = []
+    for elm in texts:
+        _.extend(elm)
+    return _
+    """
+    return texts

radiobee/align_sents.pyc ADDED Viewed

Binary file (1.42 kB). View file

radiobee/error_msg.py CHANGED Viewed

@@ -8,7 +8,7 @@ import pandas as pd
 def error_msg(
     msg: Optional[Union[str, Exception]],
     title: str = "error message",
-) -> Tuple[Union[pd.DataFrame, None], None, None, None, None, None]:
     """Prepare an error message for gradiobee outputs."""
     if msg is None:
         msg = "none..."
@@ -21,4 +21,4 @@ def error_msg(
     df = pd.DataFrame([msg], columns=[title])
     # return df, *((None,) * 4)  # pyright complains
-    return df, None, None, None, None, None

 def error_msg(
     msg: Optional[Union[str, Exception]],
     title: str = "error message",
+) -> Tuple[Union[pd.DataFrame, None], None, None, None, None, None, None, None]:
     """Prepare an error message for gradiobee outputs."""
     if msg is None:
         msg = "none..."
     df = pd.DataFrame([msg], columns=[title])
     # return df, *((None,) * 4)  # pyright complains
+    return df, None, None, None, None, None, None, None

radiobee/gradiobee.py CHANGED Viewed

@@ -30,6 +30,10 @@ from radiobee.trim_df import trim_df
 from radiobee.error_msg import error_msg
 from radiobee.text2lists import text2lists
 uname = platform.uname()
 HFSPACES = False
 if "amzn2" in uname.release:  # on hf spaces
@@ -43,7 +47,7 @@ debug = False
 debug = True
-def gradiobee(
     file1,
     file2,
     tf_type,
@@ -53,6 +57,7 @@ def gradiobee(
     eps,
     min_samples,
     # debug=False,
 ):
     """Process inputs and return outputs."""
     logger.debug(" *debug* ")
@@ -382,7 +387,7 @@ def gradiobee(
     df_aligned = df_aligned[["text2", "text1", "likelihood"]]
     df_aligned.columns = ["text1", "text2", "likelihood"]
-    ic(df_aligned.head())
     # round the last column to 2
     # df_aligned.likelihood = df_aligned.likelihood.round(2)
@@ -434,8 +439,66 @@ def gradiobee(
     # return df_trimmed, output_plot, file_dl, file_dl_xlsx, df_aligned
     # return df_trimmed, output_plot, file_dl, file_dl_xlsx, styled, df_html  # gradio cant handle style
-    ic("returning outputs")
-    return df_trimmed, output_plot, file_dl, file_dl_xlsx, df_aligned, df_html
-    # modi outputs

 from radiobee.error_msg import error_msg
 from radiobee.text2lists import text2lists
+from radiobee.align_sents import align_sents
+from radiobee.shuffle_sents import shuffle_sents  # type: ignore
+from radiobee.paras2sents import paras2sents  # type: ignore
 uname = platform.uname()
 HFSPACES = False
 if "amzn2" in uname.release:  # on hf spaces
 debug = True
+def gradiobee(  # noqa
     file1,
     file2,
     tf_type,
     eps,
     min_samples,
     # debug=False,
+    sent_ali_algo,
 ):
     """Process inputs and return outputs."""
     logger.debug(" *debug* ")
     df_aligned = df_aligned[["text2", "text1", "likelihood"]]
     df_aligned.columns = ["text1", "text2", "likelihood"]
+    ic("paras aligned: ", df_aligned.head(10))
     # round the last column to 2
     # df_aligned.likelihood = df_aligned.likelihood.round(2)
     # return df_trimmed, output_plot, file_dl, file_dl_xlsx, df_aligned
     # return df_trimmed, output_plot, file_dl, file_dl_xlsx, styled, df_html  # gradio cant handle style
+    ic("sent-ali-algo: ", sent_ali_algo)
+    # ### sent-ali-algo is None: para align
+    if sent_ali_algo in ["None"]:
+        ic("returning para-ali outputs")
+        return df_trimmed, output_plot, file_dl, file_dl_xlsx, None, None, df_aligned, df_html
+    # ### proceed with sent align
+    if sent_ali_algo in ["fast"]:
+        ic(sent_ali_algo)
+        align_func = align_sents
+        ic(df_aligned.shape, df_aligned.columns)
+        aligned_sents = paras2sents(df_aligned, align_func)
+        # ic(pd.DataFrame(aligned_sents).shape, aligned_sents)
+        ic(pd.DataFrame(aligned_sents).shape)
+        df_aligned_sents = pd.DataFrame(aligned_sents, columns=["text1", "text2"])
+    else:  # ["slow"]
+        ic(sent_ali_algo)
+        align_func = shuffle_sents
+        aligned_sents = paras2sents(df_aligned, align_func, lang1, lang2)
+        # add extra entry if necessary
+        aligned_sents = [list(sent) + [""] if len(sent) == 2 else list(sent) for sent in aligned_sents]
+        df_aligned_sents = pd.DataFrame(aligned_sents, columns=["text1", "text2", "likelihood"])
+    # prepare sents downloads
+    file_dl_sents = Path(f"{file_dl.stem}-sents{file_dl.suffix}")
+    file_dl_xlsx_sents = Path(f"{file_dl_xlsx.stem}-sents{file_dl_xlsx.suffix}")
+    _ = df_aligned_sents.to_csv(index=False)
+    file_dl_sents.write_text(_, encoding="utf8")
+    df_aligned_sents.to_excel(file_dl_xlsx_sents)
+    # prepare html output
+    if len(df_aligned_sents) > 200:
+        df_html = None
+    else:  # show a one-batch table in html
+        # style
+        styled = df_aligned_sents.style.set_properties(
+            **{
+                "font-size": "10pt",
+                "border-color": "black",
+                "border": "1px black solid !important"
+            }
+            # border-color="black",
+        ).set_table_styles([{
+            "selector": "",  # noqs
+            "props": [("border", "2px black solid !important")]}]  # noqs
+        ).format(
+            precision=2
+        )
+        df_html = styled.to_html()
+    # aligned sents outputs
+    ic("aligned sents outputs")
+    # return df_trimmed, output_plot, file_dl, file_dl_xlsx, None, None, df_aligned, df_html
+    return df_trimmed, output_plot, file_dl, file_dl_xlsx, file_dl_sents, file_dl_xlsx_sents, df_aligned_sents, df_html

radiobee/paras2sents.pyc ADDED Viewed

Binary file (2.57 kB). View file

radiobee/shuffle_sents.pyc ADDED Viewed

Binary file (2.02 kB). View file

run-pydocstle.bat → run-pydocstyle.bat RENAMED Viewed

File without changes

tests/test_paras2sents.py CHANGED Viewed

@@ -1,6 +1,7 @@
 """Test paras2sents."""
 # pylint: disable=invalid-name
 import pandas as pd
 from radiobee.paras2sents import paras2sents
 from radiobee.shuffle_sents import shuffle_sents
@@ -14,15 +15,20 @@ def test_paras2sents_dual():
     """Test paras2sents_dual."""
     sents = paras2sents(paras)
     assert len(sents) > 202  # 208
     # assert not sents
 def test_paras2sents_dual_model_s():
     """Test paras2sents_dual_model_s."""
-    sents = paras2sents(paras, shuffle_sents)
-    assert len(sents) > 201  # 207
     # assert not sents

 """Test paras2sents."""
 # pylint: disable=invalid-name
+import numpy as np
 import pandas as pd
 from radiobee.paras2sents import paras2sents
 from radiobee.shuffle_sents import shuffle_sents
     """Test paras2sents_dual."""
     sents = paras2sents(paras)
+    assert np.array(sents).shape.__len__() > 1
     assert len(sents) > 202  # 208
     # assert not sents
 def test_paras2sents_dual_model_s():
     """Test paras2sents_dual_model_s."""
+    sents1 = paras2sents(paras, shuffle_sents)
+    # assert np.array(sents1).shape.__len__() > 1
+    assert pd.DataFrame(sents1).shape.__len__() > 1
+    assert len(sents1) > 201  # 207
     # assert not sents