freemt
committed on
Commit
·
a4a35d8
1
Parent(s):
25a8a17
Update altair
Browse files- .gitignore +4 -0
- app.cmd +6 -0
- app.exe +0 -0
- app.py +59 -5
- app.spec +65 -0
- chart_df.html +35 -0
- chart_df.png +0 -0
- flagged/Output 2/0.png +0 -0
- flagged/log.csv +3 -0
- gradiobee/seg_text.py +55 -9
- requirements.in +2 -0
- requirements.txt +25 -4
- run-nuitka.bat +2 -0
- run-pyinstaller-spec.bat +1 -0
- run-python-app_py.bat +1 -1
- tests/__init__.py +0 -0
- tests/test_seg_text.py +47 -0
.gitignore
CHANGED
@@ -1,2 +1,6 @@
|
|
1 |
.venv
|
2 |
**/__pycache__
|
|
|
|
|
|
|
|
|
|
1 |
.venv
|
2 |
**/__pycache__
|
3 |
+
app.build
|
4 |
+
app.dist
|
5 |
+
build
|
6 |
+
app.exe
|
app.cmd
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
@echo off
|
3 |
+
rem This script was created by Nuitka to execute 'app.exe' with Python DLL being found.
|
4 |
+
set PATH=c:\python\python37;%PATH%
|
5 |
+
set PYTHONHOME=c:\python\python37
|
6 |
+
"%~dp0.\app.exe"
|
app.exe
ADDED
Binary file (702 kB). View file
|
|
app.py
CHANGED
@@ -1,15 +1,25 @@
|
|
1 |
"""Talk to spaces VM via subprocess.check_output."""
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
# import httpx
|
3 |
import subprocess as sp
|
4 |
from shlex import split
|
|
|
5 |
|
6 |
# from textwrap import dedent
|
7 |
from inspect import cleandoc
|
8 |
import gradio as gr
|
|
|
9 |
from logzero import logger
|
10 |
|
11 |
from gradiobee.seg_text import seg_text
|
12 |
|
|
|
|
|
13 |
|
14 |
# def greet(command):
|
15 |
def process(command):
|
@@ -42,17 +52,54 @@ def process(command):
|
|
42 |
).strip()
|
43 |
if not out:
|
44 |
out = "No output, that's all we know."
|
45 |
-
return out
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
46 |
|
47 |
# not is_command or not flag: text, do seg_text
|
48 |
_ = "\n\n".join(seg_text(command.strip()))
|
|
|
|
|
|
|
|
|
|
|
|
|
49 |
# _ = seg_text(command.strip())
|
50 |
-
|
51 |
f"""seg_text output (segmented sents):
|
52 |
{_}
|
53 |
"""
|
54 |
).strip()
|
55 |
|
|
|
|
|
|
|
|
|
56 |
|
57 |
iface = gr.Interface(
|
58 |
# fn=greet,
|
@@ -65,7 +112,13 @@ iface = gr.Interface(
|
|
65 |
default="python -m site",
|
66 |
label="command or multiline text",
|
67 |
),
|
68 |
-
outputs="text",
|
|
|
|
|
|
|
|
|
|
|
|
|
69 |
examples=[
|
70 |
"cat /proc/version",
|
71 |
"free # show free memory",
|
@@ -76,7 +129,8 @@ iface = gr.Interface(
|
|
76 |
],
|
77 |
title="probe the system",
|
78 |
description="Talk to the system via subprocess.check_output ",
|
|
|
79 |
)
|
80 |
|
81 |
-
|
82 |
-
iface.launch(
|
|
|
1 |
"""Talk to spaces VM via subprocess.check_output."""
|
2 |
+
# pylint: disable=wrong-import-position
|
3 |
+
import sys
|
4 |
+
from pathlib import Path
|
5 |
+
if "." not in sys.path:
|
6 |
+
sys.path.insert(0, ".")
|
7 |
+
|
8 |
# import httpx
|
9 |
import subprocess as sp
|
10 |
from shlex import split
|
11 |
+
import pandas as pd
|
12 |
|
13 |
# from textwrap import dedent
|
14 |
from inspect import cleandoc
|
15 |
import gradio as gr
|
16 |
+
import logzero
|
17 |
from logzero import logger
|
18 |
|
19 |
from gradiobee.seg_text import seg_text
|
20 |
|
21 |
+
logzero.loglevel() # default to 10
|
22 |
+
|
23 |
|
24 |
# def greet(command):
|
25 |
def process(command):
|
|
|
52 |
).strip()
|
53 |
if not out:
|
54 |
out = "No output, that's all we know."
|
55 |
+
return out, None
|
56 |
+
|
57 |
+
# quick test altair altair-save tooltip
|
58 |
+
# from PIL import Image
|
59 |
+
import altair as alt
|
60 |
+
from altair_saver import save
|
61 |
+
df_ = pd.DataFrame(data={'x': [1, 2], 'y': [3, 4], "cos": [0.1, 0.5]})
|
62 |
+
chart_df = alt.Chart(df_).mark_circle(size=60).encode(
|
63 |
+
x='x',
|
64 |
+
y='y',
|
65 |
+
color='cos',
|
66 |
+
# tooltip=['x', 'y', 'cos', ]
|
67 |
+
)
|
68 |
+
# .interactive()
|
69 |
+
|
70 |
+
# save(chart_df, "chart_df.html")
|
71 |
+
# chart_df_html = Path("chart_df.html").read_text("utf")
|
72 |
+
# save(chart_df, "chart_df.png")
|
73 |
+
# chart_df_png = Path("chart_df.png").read_bytes()
|
74 |
+
|
75 |
+
# chart_df_png = Image.open("chart_df.png")
|
76 |
+
# chart_df_png = "chart_df.png"
|
77 |
+
|
78 |
+
# scatter_plot.save('simple_scatter_plot_with_altairchart.html')
|
79 |
+
# chart_df.save("chart_df.html") # does not work, contains js
|
80 |
+
# chart_df_html = Path("chart_df.html").read_text("utf")
|
81 |
+
chart_df.save("chart_df.png") #
|
82 |
+
chart_df_png = "chart_df.png"
|
83 |
|
84 |
# not is_command or not flag: text, do seg_text
|
85 |
_ = "\n\n".join(seg_text(command.strip()))
|
86 |
+
|
87 |
+
logger.debug(_)
|
88 |
+
# logger.debug(chart_df_html)
|
89 |
+
# print(_)
|
90 |
+
# print(chart_df_html)
|
91 |
+
|
92 |
# _ = seg_text(command.strip())
|
93 |
+
_ = cleandoc(
|
94 |
f"""seg_text output (segmented sents):
|
95 |
{_}
|
96 |
"""
|
97 |
).strip()
|
98 |
|
99 |
+
# return _, chart_df_html
|
100 |
+
|
101 |
+
return _, chart_df_png
|
102 |
+
|
103 |
|
104 |
iface = gr.Interface(
|
105 |
# fn=greet,
|
|
|
112 |
default="python -m site",
|
113 |
label="command or multiline text",
|
114 |
),
|
115 |
+
# outputs="text",
|
116 |
+
# outputs=["text",],
|
117 |
+
# outputs=["text", "html"],
|
118 |
+
outputs=[
|
119 |
+
"text",
|
120 |
+
gr.outputs.Image("auto"),
|
121 |
+
],
|
122 |
examples=[
|
123 |
"cat /proc/version",
|
124 |
"free # show free memory",
|
|
|
129 |
],
|
130 |
title="probe the system",
|
131 |
description="Talk to the system via subprocess.check_output ",
|
132 |
+
layout="vertical",
|
133 |
)
|
134 |
|
135 |
+
iface.launch(share=True, debug=True)
|
136 |
+
# iface.launch()
|
app.spec
ADDED
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- mode: python ; coding: utf-8 -*-
|
2 |
+
|
3 |
+
|
4 |
+
block_cipher = None
|
5 |
+
|
6 |
+
|
7 |
+
a = Analysis(['app.py'],
|
8 |
+
pathex=[],
|
9 |
+
binaries=[],
|
10 |
+
datas=[],
|
11 |
+
hiddenimports=[
|
12 |
+
"gradio",
|
13 |
+
"requests",
|
14 |
+
"Flask-Login",
|
15 |
+
"markdown2",
|
16 |
+
"analytics-python",
|
17 |
+
"Flask",
|
18 |
+
"Flask-Cors",
|
19 |
+
"flask-cachebuster",
|
20 |
+
"paramiko",
|
21 |
+
"tornado",
|
22 |
+
"matplotlib",
|
23 |
+
"pycryptodome",
|
24 |
+
"pandas",
|
25 |
+
"pillow",
|
26 |
+
"pydub",
|
27 |
+
"ffmpy",
|
28 |
+
],
|
29 |
+
hookspath=[],
|
30 |
+
hooksconfig={},
|
31 |
+
runtime_hooks=[],
|
32 |
+
excludes=[
|
33 |
+
"Ipython",
|
34 |
+
"wx",
|
35 |
+
"wx",
|
36 |
+
],
|
37 |
+
win_no_prefer_redirects=False,
|
38 |
+
win_private_assemblies=False,
|
39 |
+
cipher=block_cipher,
|
40 |
+
noarchive=False)
|
41 |
+
pyz = PYZ(a.pure, a.zipped_data,
|
42 |
+
cipher=block_cipher)
|
43 |
+
|
44 |
+
exe = EXE(pyz,
|
45 |
+
a.scripts,
|
46 |
+
[],
|
47 |
+
exclude_binaries=True,
|
48 |
+
name='app',
|
49 |
+
debug=False,
|
50 |
+
bootloader_ignore_signals=False,
|
51 |
+
strip=False,
|
52 |
+
upx=True,
|
53 |
+
console=True,
|
54 |
+
disable_windowed_traceback=False,
|
55 |
+
target_arch=None,
|
56 |
+
codesign_identity=None,
|
57 |
+
entitlements_file=None )
|
58 |
+
coll = COLLECT(exe,
|
59 |
+
a.binaries,
|
60 |
+
a.zipfiles,
|
61 |
+
a.datas,
|
62 |
+
strip=False,
|
63 |
+
upx=True,
|
64 |
+
upx_exclude=[],
|
65 |
+
name='app')
|
chart_df.html
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<!DOCTYPE html>
|
2 |
+
<html>
|
3 |
+
<head>
|
4 |
+
<style>
|
5 |
+
.error {
|
6 |
+
color: red;
|
7 |
+
}
|
8 |
+
</style>
|
9 |
+
<script type="text/javascript" src="https://cdn.jsdelivr.net/npm//vega@5"></script>
|
10 |
+
<script type="text/javascript" src="https://cdn.jsdelivr.net/npm//vega-lite@4.17.0"></script>
|
11 |
+
<script type="text/javascript" src="https://cdn.jsdelivr.net/npm//vega-embed@6"></script>
|
12 |
+
</head>
|
13 |
+
<body>
|
14 |
+
<div id="vis"></div>
|
15 |
+
<script>
|
16 |
+
(function(vegaEmbed) {
|
17 |
+
var spec = {"config": {"view": {"continuousWidth": 400, "continuousHeight": 300}}, "data": {"name": "data-e58adc7548bdded6fd09c49a28ff71ba"}, "mark": {"type": "circle", "size": 60}, "encoding": {"color": {"field": "cos", "type": "quantitative"}, "x": {"field": "x", "type": "quantitative"}, "y": {"field": "y", "type": "quantitative"}}, "$schema": "https://vega.github.io/schema/vega-lite/v4.17.0.json", "datasets": {"data-e58adc7548bdded6fd09c49a28ff71ba": [{"x": 1, "y": 3, "cos": 0.1}, {"x": 2, "y": 4, "cos": 0.5}]}};
|
18 |
+
var embedOpt = {"mode": "vega-lite"};
|
19 |
+
|
20 |
+
function showError(el, error){
|
21 |
+
el.innerHTML = ('<div class="error" style="color:red;">'
|
22 |
+
+ '<p>JavaScript Error: ' + error.message + '</p>'
|
23 |
+
+ "<p>This usually means there's a typo in your chart specification. "
|
24 |
+
+ "See the javascript console for the full traceback.</p>"
|
25 |
+
+ '</div>');
|
26 |
+
throw error;
|
27 |
+
}
|
28 |
+
const el = document.getElementById('vis');
|
29 |
+
vegaEmbed("#vis", spec, embedOpt)
|
30 |
+
.catch(error => showError(el, error));
|
31 |
+
})(vegaEmbed);
|
32 |
+
|
33 |
+
</script>
|
34 |
+
</body>
|
35 |
+
</html>
|
chart_df.png
ADDED
![]() |
flagged/Output 2/0.png
ADDED
![]() |
flagged/log.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
command or multiline text,Output 1,Output 2,timestamp
|
2 |
+
pyth1on -m site,"seg_text output (segmented sents):
|
3 |
+
pyth1on -m site",Output 2/0.png,2022-01-08 14:04:38.035704
|
gradiobee/seg_text.py
CHANGED
@@ -7,8 +7,11 @@ else use polyglot.text.Text
|
|
7 |
!install pyicu pycld2 Morfessor
|
8 |
!pip install polyglot sentence_splitter
|
9 |
"""
|
10 |
-
|
11 |
|
|
|
|
|
|
|
12 |
from tqdm.auto import tqdm
|
13 |
from polyglot.detect.base import logger as polyglot_logger
|
14 |
from polyglot.text import Detector, Text
|
@@ -27,34 +30,39 @@ LANG_S = ["ca", "cs", "da", "nl", "en", "fi", "fr", "de",
|
|
27 |
"pt", "ro", "ru", "sk", "sl", "es", "sv", "tr"]
|
28 |
|
29 |
|
30 |
-
def
|
31 |
text: str,
|
32 |
lang: Optional[str] = None,
|
33 |
-
qmode: bool = False,
|
34 |
maxlines: int = 1000
|
35 |
) -> List[str]:
|
36 |
# fmt: on
|
37 |
-
"""
|
38 |
-
Split text to sentences.
|
39 |
|
40 |
Use sentence_splitter if supported,
|
41 |
else use polyglot.text.Text.sentences
|
|
|
42 |
|
43 |
-
qmode: skip split_text_into_sentences if True, default False
|
44 |
vectors for all books are based on qmode=False.
|
45 |
qmode=True is for quick test purpose only
|
46 |
|
47 |
-
maxlines (default 1000),
|
48 |
set to <1 or a large number to turn it off
|
49 |
"""
|
50 |
if lang is None:
|
51 |
try:
|
52 |
lang = Detector(text).language.code
|
53 |
except Exception as exc:
|
54 |
-
logger.
|
|
|
|
|
|
|
|
|
55 |
lang = "en"
|
56 |
|
57 |
-
if not qmode and lang in LANG_S:
|
|
|
58 |
_ = []
|
59 |
lines = text.splitlines()
|
60 |
# if maxlines > 1 and len(lines) > maxlines:
|
@@ -70,4 +78,42 @@ def seg_text(
|
|
70 |
|
71 |
# return split_text_into_sentences(text, lang)
|
72 |
|
|
|
|
|
|
|
|
|
73 |
return [elm.string for elm in Text(text, lang).sentences]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
!install pyicu pycld2 Morfessor
|
8 |
!pip install polyglot sentence_splitter
|
9 |
"""
|
10 |
+
# pylint: disable=
|
11 |
|
12 |
+
from typing import List, Optional, Union
|
13 |
+
|
14 |
+
import re
|
15 |
from tqdm.auto import tqdm
|
16 |
from polyglot.detect.base import logger as polyglot_logger
|
17 |
from polyglot.text import Detector, Text
|
|
|
30 |
"pt", "ro", "ru", "sk", "sl", "es", "sv", "tr"]
|
31 |
|
32 |
|
33 |
+
def _seg_text(
|
34 |
text: str,
|
35 |
lang: Optional[str] = None,
|
36 |
+
# qmode: bool = False,
|
37 |
maxlines: int = 1000
|
38 |
) -> List[str]:
|
39 |
# fmt: on
|
40 |
+
"""Split text to sentences.
|
|
|
41 |
|
42 |
Use sentence_splitter if supported,
|
43 |
else use polyglot.text.Text.sentences
|
44 |
+
Blank lines will be removed.
|
45 |
|
46 |
+
qmode: quick mode, skip split_text_into_sentences if True, default False
|
47 |
vectors for all books are based on qmode=False.
|
48 |
qmode=True is for quick test purpose only
|
49 |
|
50 |
+
maxlines (default 1000), threshold for turning on the tqdm progress bar
|
51 |
set to <1 or a large number to turn it off
|
52 |
"""
|
53 |
if lang is None:
|
54 |
try:
|
55 |
lang = Detector(text).language.code
|
56 |
except Exception as exc:
|
57 |
+
logger.info("text[:30]: %s", text[:30])
|
58 |
+
logger.warning(
|
59 |
+
"polyglot.text.Detector exc: %s, setting to 'en'",
|
60 |
+
exc
|
61 |
+
)
|
62 |
lang = "en"
|
63 |
|
64 |
+
# if not qmode and lang in LANG_S:
|
65 |
+
if lang in LANG_S:
|
66 |
_ = []
|
67 |
lines = text.splitlines()
|
68 |
# if maxlines > 1 and len(lines) > maxlines:
|
|
|
78 |
|
79 |
# return split_text_into_sentences(text, lang)
|
80 |
|
81 |
+
# return early for empty "" or blank text to avoid an Exception
|
82 |
+
if not text.strip():
|
83 |
+
return []
|
84 |
+
|
85 |
return [elm.string for elm in Text(text, lang).sentences]
|
86 |
+
|
87 |
+
|
88 |
+
# fmt: off
|
89 |
+
def seg_text(
|
90 |
+
lst: Union[str, List[str]],
|
91 |
+
lang: Optional[str] = None,
|
92 |
+
maxlines: int = 1000,
|
93 |
+
extra: Optional[str] = None,
|
94 |
+
) -> List[str]:
|
95 |
+
# fmt:on
|
96 |
+
"""Split a list of text.
|
97 |
+
|
98 |
+
Arguments:
|
99 |
+
lst: text or text list
|
100 |
+
extra: regex pattern; a newline is inserted after each match (via re.sub) first
|
101 |
+
Returns:
|
102 |
+
list of split text.
|
103 |
+
"""
|
104 |
+
if isinstance(lst, str):
|
105 |
+
lst = [lst]
|
106 |
+
|
107 |
+
if extra:
|
108 |
+
# insert \n
|
109 |
+
lst = [re.sub(rf"({extra})", r"\1\n", elm) for elm in lst]
|
110 |
+
|
111 |
+
res = []
|
112 |
+
for elm in lst:
|
113 |
+
res.extend(_seg_text(
|
114 |
+
elm,
|
115 |
+
lang=lang,
|
116 |
+
maxlines=maxlines,
|
117 |
+
))
|
118 |
+
|
119 |
+
return res
|
requirements.in
CHANGED
@@ -10,3 +10,5 @@ pycld2
|
|
10 |
tqdm
|
11 |
polyglot
|
12 |
sentence_splitter
|
|
|
|
|
|
10 |
tqdm
|
11 |
polyglot
|
12 |
sentence_splitter
|
13 |
+
altair
|
14 |
+
altair_saver
|
requirements.txt
CHANGED
@@ -4,6 +4,10 @@
|
|
4 |
#
|
5 |
# pip-compile requirements.in
|
6 |
#
|
|
|
|
|
|
|
|
|
7 |
blis==0.7.5
|
8 |
# via
|
9 |
# spacy
|
@@ -34,18 +38,26 @@ cymem==2.0.6
|
|
34 |
# thinc
|
35 |
cytoolz==0.11.2
|
36 |
# via textacy
|
|
|
|
|
37 |
fonttools==4.28.5
|
38 |
# via matplotlib
|
39 |
idna==3.3
|
40 |
# via requests
|
|
|
|
|
41 |
jellyfish==0.8.9
|
42 |
# via textacy
|
43 |
jinja2==3.0.3
|
44 |
-
# via
|
|
|
|
|
45 |
joblib==1.1.0
|
46 |
# via
|
47 |
# scikit-learn
|
48 |
# textacy
|
|
|
|
|
49 |
kiwisolver==1.3.2
|
50 |
# via matplotlib
|
51 |
langcodes==3.3.0
|
@@ -69,6 +81,7 @@ networkx==2.6.3
|
|
69 |
# via textacy
|
70 |
numpy==1.21.5
|
71 |
# via
|
|
|
72 |
# blis
|
73 |
# matplotlib
|
74 |
# pandas
|
@@ -83,7 +96,9 @@ packaging==21.3
|
|
83 |
# matplotlib
|
84 |
# spacy
|
85 |
pandas==1.3.5
|
86 |
-
# via
|
|
|
|
|
87 |
pathy==0.6.1
|
88 |
# via spacy
|
89 |
pillow==8.4.0
|
@@ -108,6 +123,8 @@ pyparsing==3.0.6
|
|
108 |
# packaging
|
109 |
pyphen==0.12.0
|
110 |
# via textacy
|
|
|
|
|
111 |
python-dateutil==2.8.2
|
112 |
# via
|
113 |
# matplotlib
|
@@ -156,7 +173,9 @@ thinc==8.0.13
|
|
156 |
threadpoolctl==3.0.0
|
157 |
# via scikit-learn
|
158 |
toolz==0.11.2
|
159 |
-
# via
|
|
|
|
|
160 |
tqdm==4.62.3
|
161 |
# via
|
162 |
# -r requirements.in
|
@@ -175,7 +194,9 @@ wasabi==0.9.0
|
|
175 |
# spacy
|
176 |
# spacy-loggers
|
177 |
# thinc
|
|
|
|
|
178 |
|
179 |
# The following packages are considered to be unsafe in a requirements file:
|
180 |
# setuptools
|
181 |
-
|
|
|
4 |
#
|
5 |
# pip-compile requirements.in
|
6 |
#
|
7 |
+
altair==4.2.0
|
8 |
+
# via -r requirements.in
|
9 |
+
attrs==21.4.0
|
10 |
+
# via jsonschema
|
11 |
blis==0.7.5
|
12 |
# via
|
13 |
# spacy
|
|
|
38 |
# thinc
|
39 |
cytoolz==0.11.2
|
40 |
# via textacy
|
41 |
+
entrypoints==0.3
|
42 |
+
# via altair
|
43 |
fonttools==4.28.5
|
44 |
# via matplotlib
|
45 |
idna==3.3
|
46 |
# via requests
|
47 |
+
importlib-resources==5.4.0
|
48 |
+
# via jsonschema
|
49 |
jellyfish==0.8.9
|
50 |
# via textacy
|
51 |
jinja2==3.0.3
|
52 |
+
# via
|
53 |
+
# altair
|
54 |
+
# spacy
|
55 |
joblib==1.1.0
|
56 |
# via
|
57 |
# scikit-learn
|
58 |
# textacy
|
59 |
+
jsonschema==4.3.3
|
60 |
+
# via altair
|
61 |
kiwisolver==1.3.2
|
62 |
# via matplotlib
|
63 |
langcodes==3.3.0
|
|
|
81 |
# via textacy
|
82 |
numpy==1.21.5
|
83 |
# via
|
84 |
+
# altair
|
85 |
# blis
|
86 |
# matplotlib
|
87 |
# pandas
|
|
|
96 |
# matplotlib
|
97 |
# spacy
|
98 |
pandas==1.3.5
|
99 |
+
# via
|
100 |
+
# altair
|
101 |
+
# seaborn
|
102 |
pathy==0.6.1
|
103 |
# via spacy
|
104 |
pillow==8.4.0
|
|
|
123 |
# packaging
|
124 |
pyphen==0.12.0
|
125 |
# via textacy
|
126 |
+
pyrsistent==0.18.0
|
127 |
+
# via jsonschema
|
128 |
python-dateutil==2.8.2
|
129 |
# via
|
130 |
# matplotlib
|
|
|
173 |
threadpoolctl==3.0.0
|
174 |
# via scikit-learn
|
175 |
toolz==0.11.2
|
176 |
+
# via
|
177 |
+
# altair
|
178 |
+
# cytoolz
|
179 |
tqdm==4.62.3
|
180 |
# via
|
181 |
# -r requirements.in
|
|
|
194 |
# spacy
|
195 |
# spacy-loggers
|
196 |
# thinc
|
197 |
+
zipp==3.7.0
|
198 |
+
# via importlib-resources
|
199 |
|
200 |
# The following packages are considered to be unsafe in a requirements file:
|
201 |
# setuptools
|
202 |
+
altair_saver
|
run-nuitka.bat
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
REM python -m nuitka app.py
|
2 |
+
python -m nuitka --nofollow-imports app.py
|
run-pyinstaller-spec.bat
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
pyinstaller -y app.spec
|
run-python-app_py.bat
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
REM nodemon -w app.py -x .venv\Scripts\python app.py
|
2 |
REM nodemon -w app.py -x py -3.7 app.py
|
3 |
-
nodemon -w app.py -x py -3.8 app.py
|
|
|
1 |
REM nodemon -w app.py -x .venv\Scripts\python app.py
|
2 |
REM nodemon -w app.py -x py -3.7 app.py
|
3 |
+
nodemon -w app.py -x "pyright app.py && py -3.8 app.py"
|
tests/__init__.py
ADDED
File without changes
|
tests/test_seg_text.py
ADDED
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Test seg_text."""
|
2 |
+
import pytest
|
3 |
+
from gradiobee.seg_text import seg_text
|
4 |
+
|
5 |
+
|
6 |
+
def test_seg_text1():
|
7 |
+
"""Test seg_text 1."""
|
8 |
+
text = " text 1\n\n test 2. test 3"
|
9 |
+
_ = seg_text(text)
|
10 |
+
assert len(_) == 2
|
11 |
+
|
12 |
+
text = " text 1\n\n test 2. Test 3"
|
13 |
+
_ = seg_text(text)
|
14 |
+
assert len(_) == 3
|
15 |
+
|
16 |
+
|
17 |
+
@pytest.mark.parametrize(
|
18 |
+
"test_input,expected", [
|
19 |
+
("", []),
|
20 |
+
(" ", []),
|
21 |
+
(" \n ", []),
|
22 |
+
]
|
23 |
+
)
|
24 |
+
def test_seg_text_blanks(test_input, expected):
|
25 |
+
"""Test blanks."""
|
26 |
+
assert seg_text(test_input) == expected
|
27 |
+
|
28 |
+
|
29 |
+
def test_seg_text_semicolon ():
|
30 |
+
"""Test semicolon."""
|
31 |
+
text = """ “元宇宙”,英文為“Metaverse”。該詞出自1992年;的科幻小說《雪崩》。 """
|
32 |
+
assert len(seg_text(text)) == 2
|
33 |
+
assert len(seg_text(text, 'zh')) == 2
|
34 |
+
assert len(seg_text(text, 'ja')) == 2
|
35 |
+
assert len(seg_text(text, 'ko')) == 2
|
36 |
+
assert len(seg_text(text, 'en')) == 1
|
37 |
+
|
38 |
+
|
39 |
+
def test_seg_text_semicolon_extra ():
|
40 |
+
"""Test semicolon with the extra split pattern."""
|
41 |
+
extra = "[;;]"
|
42 |
+
text = """ “元宇宙”,英文為“Metaverse”。該詞出自1992年;的科幻小說《雪崩》。 """
|
43 |
+
assert len(seg_text(text, extra=extra)) == 2 + 1
|
44 |
+
assert len(seg_text(text, 'zh', extra=extra)) == 2 + 1
|
45 |
+
assert len(seg_text(text, 'ja', extra=extra)) == 2 + 1
|
46 |
+
assert len(seg_text(text, 'ko', extra=extra)) == 2 + 1
|
47 |
+
assert len(seg_text(text, 'en', extra=extra)) == 1 + 1
|