saeedabc committed on
Commit
dd4b76a
·
1 Parent(s): bb9eec7

Initial commit

Browse files
Files changed (6) hide show
  1. .gitignore +164 -0
  2. LICENSE +9 -0
  3. README.md +40 -1
  4. app.py +236 -0
  5. requirements.txt +86 -0
  6. util.py +59 -0
.gitignore ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # pdm
105
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
+ #pdm.lock
107
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
+ # in version control.
109
+ # https://pdm.fming.dev/#use-with-ide
110
+ .pdm.toml
111
+
112
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113
+ __pypackages__/
114
+
115
+ # Celery stuff
116
+ celerybeat-schedule
117
+ celerybeat.pid
118
+
119
+ # SageMath parsed files
120
+ *.sage.py
121
+
122
+ # Environments
123
+ .env
124
+ .venv
125
+ env/
126
+ venv/
127
+ ENV/
128
+ env.bak/
129
+ venv.bak/
130
+
131
+ # Spyder project settings
132
+ .spyderproject
133
+ .spyproject
134
+
135
+ # Rope project settings
136
+ .ropeproject
137
+
138
+ # mkdocs documentation
139
+ /site
140
+
141
+ # mypy
142
+ .mypy_cache/
143
+ .dmypy.json
144
+ dmypy.json
145
+
146
+ # Pyre type checker
147
+ .pyre/
148
+
149
+ # pytype static type analyzer
150
+ .pytype/
151
+
152
+ # Cython debug symbols
153
+ cython_debug/
154
+
155
+ # PyCharm
156
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
159
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160
+ .idea/
161
+
162
+ .cache/
163
+
164
+ .gradio/
LICENSE ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Saeed Abbasi
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
6
+
7
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
8
+
9
+ THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
README.md CHANGED
@@ -11,4 +11,43 @@ license: apache-2.0
11
  short_description: TextTiling using LLM Embeddings for Text Segmentation
12
  ---
13
 
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  short_description: TextTiling using LLM Embeddings for Text Segmentation
12
  ---
13
 
14
+ # LLM TextTiling Demo
15
+
16
+ This directory contains the **demo code** for an Extended TextTiling application, which segments text into coherent chunks by leveraging **LLM embeddings** (via Sentence Transformers) and a semantic shift probability threshold.
17
+
18
+ ## Live Demo on Hugging Face Spaces
19
+
20
+ You can try out the [**LLM TextTiling Demo**](https://huggingface.co/spaces/saeedabc/llm-text-tiling-demo) in your browser—no setup required.
21
+
22
+ ## Overview
23
+
24
+ - **Input Text**: Paste or type in any text you wish to segment.
25
+ - **Embedding Model**: Choose from supported Sentence Transformers models (e.g., `all-mpnet-base-v2`).
26
+ - **Window Size (`k`)**: Controls how many sentences on the left and right are compared for similarity.
27
+ - **Pooling Strategy** (`max`, `mean`, `min`): Determines how to combine similarity scores for sentences in the window.
28
+ - **Threshold**: The semantic shift probability at or above which a sentence boundary is declared a **segment boundary**.
29
+
30
+ ## Functionality
31
+
32
+ 1. **Tokenization**: Splits the text into sentences.
33
+ 2. **Embedding**: Each sentence is converted into a vector representation using the chosen Sentence Transformers model.
34
+ 3. **Cosine Similarity**: Sliding window comparisons of sentence vectors to detect shifts in topic or meaning.
35
+ 4. **Segmentation**: If the **semantic shift probability**, defined as `1 - similarity`, meets or exceeds the threshold, a new segment is started.
36
+ 5. **Output**:
37
+ - **Segmented text** with sentences grouped by segment
38
+ - **JSON data** containing segmentation metadata and chunk details
39
+ - **Plot** depicting the segmentation boundaries over a probability curve
40
+
41
+ ## Demo Code Files
42
+
43
+ - **`app.py`**: The Gradio app script that runs the demo.
44
+ - **`util.py`**: Utility functions for sentence tokenization and related operations.
45
+ - **`requirements.txt`**: Lists the dependencies used for this demo.
46
+
47
+ ## Contributing
48
+
49
+ Feel free to open issues or PRs in the main repository if you have feedback or suggestions for improving this demo.
50
+
51
+ ## License
52
+
53
+ The code in this demo is provided under the [MIT License](LICENSE).
app.py ADDED
@@ -0,0 +1,236 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import hashlib
2
+ import pickle
3
+ from pathlib import Path
4
+ from itertools import zip_longest
5
+
6
+ import gradio as gr
7
+ import torch
8
+ from sentence_transformers import SentenceTransformer, util
9
+ import numpy as np
10
+ import matplotlib.pyplot as plt
11
+ import ruptures as rpt
12
+
13
+ from util import sent_tokenize
14
+
15
+
16
# _OPENAI_MODELS = ['text-embedding-ada-002', 'text-embedding-3-small', 'text-embedding-3-large']
# Sentence Transformers model names offered in the UI dropdown; text_segmentation()
# rejects any model name that is not in this list.
_ST_MODELS = ['all-mpnet-base-v2', 'multi-qa-mpnet-base-dot-v1', 'all-MiniLM-L12-v2']

# Directory that holds the pickled sentence embeddings.
CACHE_DIR = '.cache'
# Run the embedding model on the GPU when torch reports one, otherwise on the CPU.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# Process-wide matplotlib defaults used by the segmentation plot.
plt.rcParams.update({
    'font.family': 'Times New Roman',  #'Arial', # or 'Helvetica', 'Times New Roman'
    'font.size': 12,  # General font size
    'axes.titlesize': 13,  # Font size for titles
    'axes.labelsize': 12,  # Font size for axis labels
    'xtick.labelsize': 11,  # Font size for x-tick labels
    'ytick.labelsize': 11,  # Font size for y-tick labels
    'legend.fontsize': 11,  # Font size for legend
    'legend.title_fontsize': 11  # Font size for legend title
})
32
+
33
+
34
+ def embed_sentences(sentences, embedder_fn, cache_path):
35
+ if Path(cache_path).exists():
36
+ print(f'Loading embeddings from cache: {cache_path}')
37
+ with open(cache_path, 'rb') as file:
38
+ embedded_sents = pickle.load(file)
39
+ else:
40
+ print(f'Embedding sentences and saving to cache: {cache_path}')
41
+ embedded_sents = embedder_fn(sentences)
42
+ assert len(embedded_sents) == len(sentences)
43
+
44
+ with open(cache_path, 'wb') as file:
45
+ pickle.dump(embedded_sents, file)
46
+
47
+ return embedded_sents
48
+
49
+
50
+ def calculate_cosine_similarities(embedded_sents, k=1, pool='mean'):
51
+ def cosine_similarity(a, b):
52
+ sim = util.cos_sim(a, b)
53
+ if pool == 'mean':
54
+ return sim.mean().item()
55
+ elif pool == 'max':
56
+ return sim.max().item()
57
+ elif pool == 'min':
58
+ return sim.min().item()
59
+ else:
60
+ raise ValueError(f'Invalid pooling method: {pool}')
61
+
62
+ cosine_sims = []
63
+ for i in range(len(embedded_sents) - 1):
64
+ lctx = embedded_sents[max(0, i-k+1) : i+1]
65
+ rctx = embedded_sents[i+1 : i+k+1]
66
+ sim = cosine_similarity(lctx, rctx)
67
+ cosine_sims.append(sim)
68
+ # cosine_sims.append(0.0)
69
+
70
+ assert len(cosine_sims) == len(embedded_sents) - 1, f'{len(cosine_sims)} != {len(embedded_sents)}'
71
+ return cosine_sims
72
+
73
+
74
+ def predict_boundaries(cosine_sims, threshold):
75
+ probs = [1.0 - sim for sim in cosine_sims]
76
+ preds = [1 if prob >= threshold else 0 for prob in probs]
77
+ return preds, probs
78
+
79
+
80
+ def output_segments(sents, preds, probs):
81
+ assert len(sents) - 1 == len(preds) == len(probs), f'{len(sents)} - 1 != {len(preds)} != {len(probs)}'
82
+
83
+ def iter_segments(sents, preds, probs):
84
+ segment = []
85
+ for i, (sent, pred, prob) in enumerate(zip_longest(sents, preds, probs)):
86
+ segment.append({
87
+ # 'id': i + 1,
88
+ 'text': sent,
89
+ 'prob': round(prob, 4) if prob is not None else None,
90
+ })
91
+ if pred == 1:
92
+ yield segment
93
+ segment = []
94
+ if len(segment) > 0:
95
+ yield segment
96
+ segment = []
97
+
98
+ out = {
99
+ 'metadata': {},
100
+ 'chunks': [],
101
+ }
102
+ n_segs = 0
103
+ n_sents = 0
104
+ for _, segment in enumerate(iter_segments(sents, preds, probs)):
105
+ # out['chunks'].append({
106
+ # 'id': n_segs + 1,
107
+ # 'chunk': segment,
108
+ # })
109
+ out['chunks'].append(segment)
110
+ n_segs += 1
111
+ n_sents += len(segment)
112
+
113
+ out['metadata'] = {
114
+ 'n_chunks': n_segs,
115
+ 'n_sents': n_sents,
116
+ 'prob_mean': round(np.mean(probs), 4),
117
+ 'prob_std': round(np.std(probs), 4),
118
+ 'prob_min': round(min(probs), 4),
119
+ 'prob_max': round(max(probs), 4),
120
+ }
121
+
122
+ out_text = "\n-------------------------\n".join(["\n".join([sent['text'] for sent in segment]) for segment in out['chunks']])
123
+
124
+ def plot_regimes(signal, preds):
125
+ def get_bkps_from_labels(labels):
126
+ return [i+1 for i, l in enumerate(labels) if l == 1]
127
+
128
+ # signal = signal[:-1]
129
+ preds = preds + [1]
130
+ bkps = get_bkps_from_labels(preds)
131
+
132
+ # print(f'signal(#{len(signal)}): {signal}')
133
+ # print(f'bkps(#{len(bkps)}): {bkps}')
134
+ # if not bkps or bkps[-1] != len(signal):
135
+ # print('Note: last segment is incomplete!')
136
+
137
+ fig, [ax] = rpt.display(np.array(signal), bkps, figsize=(10, 5), dpi=250)
138
+ y_min = max(0.0, min(signal) - 0.1)
139
+ y_max = min(1.0, max(signal) + 0.1)
140
+ ax.set_ylim(y_min, y_max)
141
+ ax.set_title("Segment Regimes")
142
+ ax.set_xlabel("Sentence Index")
143
+ ax.set_ylabel("Semantic Shift Probability")
144
+ fig.tight_layout()
145
+
146
+ return fig
147
+
148
+ fig = plot_regimes(probs, preds)
149
+
150
+ return out_text, out, fig
151
+
152
+
153
# Loaded SentenceTransformer instances keyed by model name, so a model is
# constructed once per process instead of on every button click.
_MODEL_CACHE = {}


def text_segmentation(input_text, model_name, k, pool, threshold):
    """Segment ``input_text`` into chunks; this is the Gradio click handler.

    Pipeline: sentence tokenization -> sentence embeddings (cached on disk)
    -> windowed cosine similarities -> thresholded boundaries -> outputs.

    Args:
        input_text: Raw text to segment.
        model_name: Name of the embedding model; must be in ``_ST_MODELS``.
        k: Window size (number of sentences on each side of a gap).
        pool: Pooling strategy passed to ``calculate_cosine_similarities``.
        threshold: Shift probability at or above which a gap is a boundary.

    Returns:
        The ``(out_text, out, fig)`` tuple produced by ``output_segments``.

    Raises:
        ValueError: If ``model_name`` is not a supported model.
        gr.Error: If the text contains fewer than two sentences, so there is
            no gap to score.
    """
    if model_name not in _ST_MODELS:
        raise ValueError(f'Invalid model name: {model_name}')

    sents = sent_tokenize(input_text, method='nltk', initial_split_sep='\n')
    if len(sents) < 2:
        # Surface a readable message in the UI instead of crashing downstream.
        raise gr.Error('Please enter at least two sentences to segment.')

    model = _MODEL_CACHE.get(model_name)
    if model is None:
        model = _MODEL_CACHE[model_name] = SentenceTransformer(model_name, device=DEVICE)
    embedder_fn = model.encode

    # The key must include the model name: embeddings of the same text differ
    # between models, so a text-only key would serve stale vectors after the
    # user switches models.
    cache_key = f'{model_name}\n{input_text}'.encode()
    cache_id = hashlib.md5(cache_key).hexdigest()
    cache_path = Path(CACHE_DIR) / f'{cache_id}.pkl'
    embedded_sents = embed_sentences(sents, embedder_fn, cache_path=cache_path)

    # The slider value is used in slice arithmetic, which requires an int.
    cosine_sims = calculate_cosine_similarities(embedded_sents, k=int(k), pool=pool)

    preds, probs = predict_boundaries(cosine_sims, threshold=threshold)

    return output_segments(sents, preds, probs)
171
+
172
+
173
+ # with gr.Blocks(css=".custom-tab { padding: 20px; margin: 20px; }") as app:
174
# Build the Gradio UI: inputs and controls, a three-tab output area, and the
# click handler wiring.
# NOTE(review): the nesting of the Row/Column containers below was reconstructed
# from a whitespace-stripped copy of this file — confirm against the original layout.
with gr.Blocks() as app:
    gr.Markdown("""
    # LLM TextTiling Demo

    An **extended** approach to text segmentation that combines **TextTiling** with **LLM embeddings**.
    Simply provide your text, choose an embedding model, and adjust segmentation parameters (window size, threshold, pooling).
    The demo will split your text into coherent segments based on **semantic shifts**.

    [**View the code on GitHub**](https://github.com/saeedabc/llm-text-tiling/demo)
    """)

    with gr.Row():
        # Input text, segmentation parameters and the submit button.
        with gr.Column():
            input_text = gr.Textbox(label="Input Text", placeholder="Enter your text here...", lines=15)

            with gr.Row():
                with gr.Column():
                    # model_name = gr.Radio(choices=_ST_MODELS, label="Embedding Model", value=_ST_MODELS[0])
                    model_name = gr.Dropdown(choices=_ST_MODELS, label="Embedding Model", value=_ST_MODELS[0])

                with gr.Column():
                    pool = gr.Dropdown(choices=['max', 'mean', 'min'], label="Pooling Strategy", value='max')

            with gr.Row():
                with gr.Column():
                    threshold = gr.Slider(minimum=0, maximum=1, step=0.01, label="Threshold", value=0.5)

                with gr.Column():
                    k = gr.Slider(minimum=1, maximum=10, step=1, label="Window Size", value=3)

            submit_button = gr.Button("Chunk Text")

        # One tab per value returned by text_segmentation().
        with gr.Column():
            with gr.Tabs():
                with gr.Tab("Output Text"):
                    output_text = gr.Textbox(label="Output Text", placeholder="Chunks will appear here...", lines=22)
                with gr.Tab("Output Json"):
                    output_json = gr.Json(label="Output Json", open=False, max_height=500)
                with gr.Tab("Output Visualization"):  #, elem_classes="custom-tab"):
                    output_fig = gr.Plot(label="Output Visualization")

    # The input order must match text_segmentation's parameter order, and the
    # output order must match the (out_text, out, fig) tuple it returns.
    submit_button.click(text_segmentation, inputs=[input_text, model_name, k, pool, threshold], outputs=[output_text, output_json, output_fig])

    # A single example row: text, model name, window size, pooling, threshold.
    examples = gr.Examples(
        examples=[
            ["Rib Mountain is a census-designated place (CDP) in the town of Rib Mountain in Marathon County, Wisconsin, United States. "
             "The population was 5,651 at the 2010 census. "
             "The community is named for Rib Mountain. "
             "According to the United States Census Bureau, the CDP has a total area of 33.8 km² (13.0 mi²). "
             "31.4 km² (12.1 mi²) of it is land and 2.4 km² (0.9 mi²) of it (6.98%) is water. "
             "As of the census of 2000, there were 6,059 people, 2,211 households, and 1,782 families residing in the CDP. "
             "The population density was 193.0/km² (499.8/mi²). "
             "There were 2,278 housing units at an average density of 72.6/km² (187.9/mi²).", "all-mpnet-base-v2", 3, 'max', 0.52],
        ],
        inputs=[input_text, model_name, k, pool, threshold],
    )

if __name__ == '__main__':
    # The embedding cache directory is created here, before the app starts.
    Path(CACHE_DIR).mkdir(exist_ok=True)

    # Launch the app
    app.launch()  # share=True)
requirements.txt ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiofiles==23.2.1
2
+ annotated-types==0.7.0
3
+ anyio==4.8.0
4
+ certifi==2024.12.14
5
+ charset-normalizer==3.4.1
6
+ click==8.1.8
7
+ contourpy==1.3.1
8
+ cycler==0.12.1
9
+ fastapi==0.115.6
10
+ ffmpy==0.5.0
11
+ filelock==3.16.1
12
+ fonttools==4.55.3
13
+ fsspec==2024.12.0
14
+ gradio==5.12.0
15
+ gradio_client==1.5.4
16
+ h11==0.14.0
17
+ httpcore==1.0.7
18
+ httpx==0.28.1
19
+ huggingface-hub==0.27.1
20
+ idna==3.10
21
+ Jinja2==3.1.5
22
+ joblib==1.4.2
23
+ kiwisolver==1.4.8
24
+ markdown-it-py==3.0.0
25
+ MarkupSafe==2.1.5
26
+ matplotlib==3.10.0
27
+ mdurl==0.1.2
28
+ mpmath==1.3.0
29
+ networkx==3.4.2
30
+ nltk==3.9.1
31
+ numpy==2.2.1
32
+ nvidia-cublas-cu12==12.4.5.8
33
+ nvidia-cuda-cupti-cu12==12.4.127
34
+ nvidia-cuda-nvrtc-cu12==12.4.127
35
+ nvidia-cuda-runtime-cu12==12.4.127
36
+ nvidia-cudnn-cu12==9.1.0.70
37
+ nvidia-cufft-cu12==11.2.1.3
38
+ nvidia-curand-cu12==10.3.5.147
39
+ nvidia-cusolver-cu12==11.6.1.9
40
+ nvidia-cusparse-cu12==12.3.1.170
41
+ nvidia-nccl-cu12==2.21.5
42
+ nvidia-nvjitlink-cu12==12.4.127
43
+ nvidia-nvtx-cu12==12.4.127
44
+ orjson==3.10.14
45
+ packaging==24.2
46
+ pandas==2.2.3
47
+ pillow==11.1.0
48
+ pydantic==2.10.5
49
+ pydantic_core==2.27.2
50
+ pydub==0.25.1
51
+ Pygments==2.19.1
52
+ pyparsing==3.2.1
53
+ python-dateutil==2.9.0.post0
54
+ python-multipart==0.0.20
55
+ pytz==2024.2
56
+ PyYAML==6.0.2
57
+ regex==2024.11.6
58
+ requests==2.32.3
59
+ rich==13.9.4
60
+ ruff==0.9.1
61
+ ruptures==1.1.9
62
+ safehttpx==0.1.6
63
+ safetensors==0.5.2
64
+ scikit-learn==1.6.1
65
+ scipy==1.15.1
66
+ semantic-version==2.10.0
67
+ sentence-transformers==3.3.1
68
+ setuptools==75.8.0
69
+ shellingham==1.5.4
70
+ six==1.17.0
71
+ sniffio==1.3.1
72
+ starlette==0.41.3
73
+ sympy==1.13.1
74
+ threadpoolctl==3.5.0
75
+ tokenizers==0.21.0
76
+ tomlkit==0.13.2
77
+ torch==2.5.1
78
+ tqdm==4.67.1
79
+ transformers==4.48.0
80
+ triton==3.1.0
81
+ typer==0.15.1
82
+ typing_extensions==4.12.2
83
+ tzdata==2024.2
84
+ urllib3==2.3.0
85
+ uvicorn==0.34.0
86
+ websockets==14.1
util.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+
4
+ ### NLTK ###
5
+ import nltk
6
# Make sure the Punkt sentence-tokenizer data is installed before first use.
# nltk.data.find() raises LookupError for a missing resource, so each resource
# is probed individually and downloaded on a miss; probing the bare
# 'tokenizers' directory would itself raise on a fresh environment.
# 'punkt_tab' is the data format loaded by NLTK >= 3.9 (requirements.txt pins
# 3.9.1); 'punkt' is kept for older releases.
for _resource in ('punkt', 'punkt_tab'):
    try:
        nltk.data.find(f'tokenizers/{_resource}')
    except LookupError:
        nltk.download(_resource)
8
+
9
def nltk_sent_tokenize(texts: list[str]):
    """Lazily yield the sentences of every text, in order.

    Each text is split with ``nltk.sent_tokenize``; the result is a generator,
    so no tokenization happens until it is consumed.
    """
    def _sentences(documents):
        for document in documents:
            yield from nltk.sent_tokenize(document)

    # iter() is taken eagerly so a non-iterable argument fails at call time.
    return _sentences(iter(texts))
11
+
12
+
13
+ # ### Spacy ###
14
+ # import spacy
15
+ # try:
16
+ # spacy_nlp = spacy.load('en_core_web_sm')
17
+ # except OSError:
18
+ # spacy.cli.download("en_core_web_sm")
19
+ # spacy_nlp = spacy.load('en_core_web_sm')
20
+
21
+ # def spacy_sent_tokenize(texts: list[str]):
22
+ # # nlp = spacy.load('en_core_web_sm')
23
+ # return (sent.text for text in texts for sent in spacy_nlp(text).sents)
24
+
25
+
26
+ # ### Segtok ###
27
+ # from segtok.segmenter import split_single, split_multi
28
+
29
+ # def segtok_sent_tokenize(texts: list[str]):
30
+ # return (sent for text in texts for sent in split_single(text))
31
+
32
+
33
+ ### Sentence Tokenization ###
34
+
35
def sent_tokenize(text, method: str = 'nltk', initial_split_sep: str = None) -> list[str]:
    """Split text into cleaned, non-trivial sentences.

    Args:
        text: A single string or a list of strings.
        method: ``'nltk'`` to split with ``nltk_sent_tokenize``; ``'none'`` to
            treat every (pre-split) piece as one sentence.
        initial_split_sep: When truthy, each text is first split on this
            separator and empty pieces are dropped before tokenization.

    Returns:
        The stripped sentences that are non-empty and contain at least one
        alphanumeric character.

    Raises:
        AssertionError: If ``text`` is neither a string nor a list.
        ValueError: If ``method`` is not supported.
    """
    documents = text if not isinstance(text, str) else [text]
    assert isinstance(documents, list)

    if initial_split_sep:
        pieces = []
        for document in documents:
            for raw_piece in document.split(initial_split_sep):
                piece = raw_piece.strip()
                if piece:
                    pieces.append(piece)
        documents = pieces

    if method == 'nltk':
        candidates = nltk_sent_tokenize(documents)
    # elif method == 'spacy':
    #     candidates = spacy_sent_tokenize(documents)
    # elif method == 'segtok':
    #     candidates = segtok_sent_tokenize(documents)
    elif method == 'none':
        candidates = documents
    else:
        raise ValueError(f"Invalid method: {method}")

    sentences = []
    for candidate in candidates:
        cleaned = candidate.strip()
        # Drop pieces made only of punctuation/whitespace (e.g. "---").
        if cleaned and any(char.isalnum() for char in cleaned):
            sentences.append(cleaned)
    return sentences