Spaces:

saeedabc
/

llm-text-tiling-demo

Running

App Files Files Community

saeedabc committed on Jan 16

Commit

13ecc63

1 Parent(s): dd4b76a

Fixed downloading nltk.punkt

Browse files

Files changed (2) hide show

app.py +11 -22
util.py +2 -5

app.py CHANGED Viewed

@@ -13,21 +13,20 @@ import ruptures as rpt
 from util import sent_tokenize
-# _OPENAI_MODELS = ['text-embedding-ada-002', 'text-embedding-3-small', 'text-embedding-3-large']
 _ST_MODELS = ['all-mpnet-base-v2', 'multi-qa-mpnet-base-dot-v1', 'all-MiniLM-L12-v2']
 CACHE_DIR = '.cache'
 DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
 plt.rcParams.update({
-    'font.family': 'Times New Roman',  #'Arial',  # or 'Helvetica', 'Times New Roman'
-    'font.size': 12,  # General font size
-    'axes.titlesize': 13,  # Font size for titles
-    'axes.labelsize': 12,  # Font size for axis labels
-    'xtick.labelsize': 11,  # Font size for x-tick labels
-    'ytick.labelsize': 11,   # Font size for y-tick labels
-    'legend.fontsize': 11,   # Font size for legend
-    'legend.title_fontsize': 11   # Font size for legend title
 })
@@ -129,11 +128,6 @@ def output_segments(sents, preds, probs):
         preds = preds + [1]
         bkps = get_bkps_from_labels(preds)
-        # print(f'signal(#{len(signal)}): {signal}')
-        # print(f'bkps(#{len(bkps)}): {bkps}')
-        # if not bkps or bkps[-1] != len(signal):
-        #     print('Note: last segment is incomplete!')
         fig, [ax] = rpt.display(np.array(signal), bkps, figsize=(10, 5), dpi=250)
         y_min = max(0.0, min(signal) - 0.1)
         y_max = min(1.0, max(signal) + 0.1)
@@ -170,16 +164,11 @@ def text_segmentation(input_text, model_name, k, pool, threshold):
     return output_segments(sents, preds, probs)
-# with gr.Blocks(css=".custom-tab { padding: 20px; margin: 20px; }") as app:
 with gr.Blocks() as app:
     gr.Markdown("""
 # LLM TextTiling Demo
-An **extended** approach to text segmentation that combines **TextTiling** with **LLM embeddings**.
-Simply provide your text, choose an embedding model, and adjust segmentation parameters (window size, threshold, pooling).
-The demo will split your text into coherent segments based on **semantic shifts**.
-[**View the code on GitHub**](https://github.com/saeedabc/llm-text-tiling/demo)
 """)
     with gr.Row():
@@ -210,7 +199,7 @@ The demo will split your text into coherent segments based on **semantic shifts*
                     output_text = gr.Textbox(label="Output Text", placeholder="Chunks will appear here...", lines=22)
                 with gr.Tab("Output Json"):
                     output_json = gr.Json(label="Output Json", open=False, max_height=500)
-                with gr.Tab("Output Visualization"):  #, elem_classes="custom-tab"):
                     output_fig = gr.Plot(label="Output Visualization")
     submit_button.click(text_segmentation, inputs=[input_text, model_name, k, pool, threshold], outputs=[output_text, output_json, output_fig])
@@ -233,4 +222,4 @@ if __name__ == '__main__':
     Path(CACHE_DIR).mkdir(exist_ok=True)
     # Launch the app
-    app.launch()  # share=True)

 from util import sent_tokenize
 _ST_MODELS = ['all-mpnet-base-v2', 'multi-qa-mpnet-base-dot-v1', 'all-MiniLM-L12-v2']
 CACHE_DIR = '.cache'
 DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
 plt.rcParams.update({
+    'font.family': 'Times New Roman',  #'Arial', 'Helvetica'
+    'font.size': 12,
+    'axes.titlesize': 13,
+    'axes.labelsize': 12,
+    'xtick.labelsize': 11,
+    'ytick.labelsize': 11,
+    'legend.fontsize': 11,
+    'legend.title_fontsize': 11
 })
         preds = preds + [1]
         bkps = get_bkps_from_labels(preds)
         fig, [ax] = rpt.display(np.array(signal), bkps, figsize=(10, 5), dpi=250)
         y_min = max(0.0, min(signal) - 0.1)
         y_max = min(1.0, max(signal) + 0.1)
     return output_segments(sents, preds, probs)
 with gr.Blocks() as app:
     gr.Markdown("""
 # LLM TextTiling Demo
+An **extended** approach to text segmentation that combines **TextTiling** with **LLM embeddings**. Simply provide your text, choose an embedding model, and adjust segmentation parameters (window size, pooling, threshold). The demo will split your text into coherent segments based on **semantic shifts**.
 """)
     with gr.Row():
                     output_text = gr.Textbox(label="Output Text", placeholder="Chunks will appear here...", lines=22)
                 with gr.Tab("Output Json"):
                     output_json = gr.Json(label="Output Json", open=False, max_height=500)
+                with gr.Tab("Output Visualization"):
                     output_fig = gr.Plot(label="Output Visualization")
     submit_button.click(text_segmentation, inputs=[input_text, model_name, k, pool, threshold], outputs=[output_text, output_json, output_fig])
     Path(CACHE_DIR).mkdir(exist_ok=True)
     # Launch the app
+    app.launch()

util.py CHANGED Viewed

@@ -1,10 +1,7 @@
-import os
 ### NLTK ###
 import nltk
-if not os.path.exists(os.path.join(nltk.data.find('tokenizers'), 'punkt')):
-    nltk.download('punkt')
 def nltk_sent_tokenize(texts: list[str]):
     """Return a lazy stream of the sentences contained in ``texts``.

     Every string in ``texts`` is handed to ``nltk.sent_tokenize`` and the
     per-string results are flattened into one generator, in input order.
     """
     # Kept as a generator expression: nothing is tokenized until the caller
     # starts consuming the returned generator.
     sentences = (
         sentence
         for document in texts
         for sentence in nltk.sent_tokenize(document)
     )
     return sentences

 ### NLTK ###
 import nltk
+nltk.download('punkt')
 def nltk_sent_tokenize(texts: list[str]):
     """Return a lazy stream of the sentences contained in ``texts``.

     Every string in ``texts`` is handed to ``nltk.sent_tokenize`` and the
     per-string results are flattened into one generator, in input order.
     """
     # Kept as a generator expression: nothing is tokenized until the caller
     # starts consuming the returned generator.
     sentences = (
         sentence
         for document in texts
         for sentence in nltk.sent_tokenize(document)
     )
     return sentences