Spaces:

Tonic
/

segment-text

Sleeping

App Files Files Community

Tonic committed on Sep 17

Commit

c1a6fc6

•

1 Parent(s): 8bffd4f

add description

Browse files

Files changed (1) hide show

app.py +9 -20

app.py CHANGED Viewed

@@ -8,16 +8,6 @@ from globe import title, description, joinus, model_name, placeholder, modelinfo
 tokenizer = DebertaV2Tokenizer.from_pretrained(model_name)
 model = DebertaV2ForTokenClassification.from_pretrained(model_name)
-# # Define id2label based on config.json
-#
-# id2label = {
-#     0: "author", 1: "bibliography", 2: "caption", 3: "contact",
-#     4: "date", 5: "dialog", 6: "footnote", 7: "keywords",
-#     8: "math", 9: "paratext", 10: "separator", 11: "table",
-#     12: "text", 13: "title"
-# }
 color_map = {
     "author": "blue", "bibliography": "purple", "caption": "orange",
     "contact": "cyan", "date": "green", "dialog": "yellow",
@@ -42,15 +32,15 @@ def segment_text(input_text):
     segments = []
     current_word = ""
     for token, label_id in zip(tokens_decoded, predictions):
-        if token.startswith("▁"):  # handling wordpieces, specific to some tokenizers
             if current_word:
-                segments.append((current_word, id2label[label_id]))
-            current_word = token.replace("▁", "")  # new word
         else:
             current_word += token  # append subword part to current word
     if current_word:
-        segments.append((current_word, id2label[label_id]))
     return segments
@@ -58,18 +48,17 @@ with gr.Blocks() as demo:
     with gr.Row():
         gr.Markdown(title)
     with gr.Row():
-        with gr.Column(scale=1):
-            with gr.Group():
-                gr.Markdown(description)
-            with gr.Accordion(label="Join Us", open=False):
-                gr.Markdown(joinus)
         with gr.Column(scale=1):
             with gr.Row():
                 with gr.Group():
                     gr.Markdown(modelinfor1)
                 with gr.Group():
                     gr.Markdown(modelinfor2)
     with gr.Row():
         input_text = gr.Textbox(label="Enter your text here👇🏻", lines=5, placeholder=placeholder)
         output_text = gr.HighlightedText(label=" PLeIAs/✂️📜 Segment Text", color_map=color_map, combine_adjacent=True, show_inline_category=True, show_legend=True)

 tokenizer = DebertaV2Tokenizer.from_pretrained(model_name)
 model = DebertaV2ForTokenClassification.from_pretrained(model_name)
 color_map = {
     "author": "blue", "bibliography": "purple", "caption": "orange",
     "contact": "cyan", "date": "green", "dialog": "yellow",
     segments = []
     current_word = ""
     for token, label_id in zip(tokens_decoded, predictions):
+        if token.startswith("▁"):  # handle wordpieces
             if current_word:
+                segments.append((current_word, id2label[str(label_id)]))
+            current_word = token.replace("▁", "")  # start a new word
         else:
             current_word += token  # append subword part to current word
     if current_word:
+        segments.append((current_word, id2label[str(label_id)]))
     return segments
     with gr.Row():
         gr.Markdown(title)
     with gr.Row():
+        with gr.Group():
+            gr.Markdown(description)
+    with gr.Row():
         with gr.Column(scale=1):
             with gr.Row():
                 with gr.Group():
                     gr.Markdown(modelinfor1)
                 with gr.Group():
                     gr.Markdown(modelinfor2)
+    with gr.Accordion(label="Join Us", open=False):
+        gr.Markdown(joinus)
     with gr.Row():
         input_text = gr.Textbox(label="Enter your text here👇🏻", lines=5, placeholder=placeholder)
         output_text = gr.HighlightedText(label=" PLeIAs/✂️📜 Segment Text", color_map=color_map, combine_adjacent=True, show_inline_category=True, show_legend=True)