Tonic committed
Commit b9152b3
1 Parent(s): c1a6fc6

add description

Files changed (2)
  1. app.py +7 -8
  2. globe.py +9 -8
app.py CHANGED
@@ -2,8 +2,7 @@ import gradio as gr
 from transformers import DebertaV2Tokenizer, DebertaV2ForTokenClassification
 import torch
 from huggingface_hub import hf_hub_download
-import json
-from globe import title, description, joinus, model_name, placeholder, modelinfor1, modelinfor2, id2label
+from globe import title, description, joinus, model_name, placeholder, modelinfor1, modelinfor2, modelinfor3, id2label
 
 tokenizer = DebertaV2Tokenizer.from_pretrained(model_name)
 model = DebertaV2ForTokenClassification.from_pretrained(model_name)
@@ -51,12 +50,12 @@ with gr.Blocks() as demo:
     with gr.Group():
         gr.Markdown(description)
     with gr.Row():
-        with gr.Column(scale=1):
-            with gr.Row():
-                with gr.Group():
-                    gr.Markdown(modelinfor1)
-                with gr.Group():
-                    gr.Markdown(modelinfor2)
+        with gr.Group():
+            gr.Markdown(modelinfor1)
+        with gr.Group():
+            gr.Markdown(modelinfor2)
+        with gr.Group():
+            gr.Markdown(modelinfor3)
     with gr.Accordion(label="Join Us", open=False):
         gr.Markdown(joinus)
     with gr.Row():
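For reference, here is a minimal sketch of the token-classification inference that app.py builds its UI around; the repo id and sample text are illustrative placeholders, not values taken from the Space:

```python
# Hedged sketch: standard DebertaV2 token-classification inference.
# "PleIAs/Segmentext" is a stand-in for whatever model_name globe.py exports.
import torch
from transformers import DebertaV2Tokenizer, DebertaV2ForTokenClassification

model_name = "PleIAs/Segmentext"  # assumption, not taken from the diff
tokenizer = DebertaV2Tokenizer.from_pretrained(model_name)
model = DebertaV2ForTokenClassification.from_pretrained(model_name)

text = "Some broken, unstructured text to segment."
inputs = tokenizer(text, return_tensors="pt", truncation=True)

with torch.no_grad():
    logits = model(**inputs).logits  # shape: (1, seq_len, num_labels)

# One predicted label id per token, mapped to the editorial categories
# that globe.py lists via id2label.
predicted_ids = logits.argmax(dim=-1)[0].tolist()
tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
for token, label_id in zip(tokens, predicted_ids):
    print(token, model.config.id2label[label_id])
```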
globe.py CHANGED
@@ -7,15 +7,15 @@ title = """# 🙋🏻‍♂️Welcome to Tonic's PLeIAs/✂️📜Segment-Text
 
 
 description = """
-Segmentext is a specialized language model for text-segmentation. Segmentext has been trained to be resilient to broken and unstructured texts including digitzation artifacts and ill-recognized layout formats.
+✂️📜Segment-Text is a specialized language model for text segmentation. ✂️📜Segment-Text has been trained to be resilient to broken and unstructured texts, including digitization artifacts and poorly recognized layout formats.
 
-In contrast with most text-segmentation approach, Segmentext is based on token classification. Editorial structure are reconstructed by the raw text without any reference to the original layout.
+In contrast with most text-segmentation approaches, ✂️📜Segment-Text is based on token classification. Editorial structures are reconstructed from the raw text without any reference to the original layout.
 
-Segmentext was trained using HPC resources from GENCI–IDRIS on Ad Astra with 3,500 example of manually annotated texts, mostly coming from three large scale dataset collected by PleIAs, Finance Commons (financial documents in open data), Common Corpus (cultural heritage texts) and the Science Pile (scientific publication in open licenses - to be released).
+✂️📜Segment-Text was trained using HPC resources from GENCI–IDRIS on Ad Astra with 3,500 examples of manually annotated texts, mostly coming from three large-scale datasets collected by PleIAs: Finance Commons (financial documents in open data), Common Corpus (cultural heritage texts), and the Science Pile (scientific publications under open licenses - to be released).
 
-Given the diversity of the training data, Segmentext should work correctly on diverse document formats in the main European languages.
+Given the diversity of the training data, ✂️📜Segment-Text should work correctly on diverse document formats in the main European languages.
 
-Segmentext can be tested on PleIAs-Bad-Data-Editor, a free demo along with OCRonos, another model trained by PleIAs for the correction of OCR errors and other digitization artifact.
+✂️📜Segment-Text can be tested on PleIAs-Bad-Data-Editor, a free demo, along with OCRonos, another model trained by PleIAs for the correction of OCR errors and other digitization artifacts.
 """
 
 joinus = """
@@ -46,7 +46,7 @@ modelinfor1 = f"""
 **Model Architecture**: `{config['architectures'][0]}`
 **Model Type**: `{config['model_type']}`
 
-Segmentext is a token classification model trained for segmenting unstructured or noisy text into meaningful categories. The model uses `{config['model_type']}` as its underlying architecture, optimized for token classification tasks with the following characteristics:
+✂️📜Segment-Text is a token classification model trained for segmenting unstructured or noisy text into meaningful categories. The model uses `{config['model_type']}` as its underlying architecture, optimized for token classification tasks with the following characteristics:
 
 - **Hidden Size**: `{config['hidden_size']}`
 - **Intermediate Size**: `{config['intermediate_size']}`
@@ -82,7 +82,8 @@ The model is capable of classifying tokens into 14 distinct categories :
 12. **{id2label['11']}** (11)
 13. **{id2label['12']}** (12)
 14. **{id2label['13']}** (13)
-
+"""
+modelinfor3 = f"""
 ### Tokenizer:
 
 - **Tokenizer Class**: `{tokenizer_config['tokenizer_class']}`
@@ -95,7 +96,7 @@ The model is capable of classifying tokens into 14 distinct categories :
 
 This model was trained using diverse datasets, including cultural heritage texts and modern digitized documents. It excels in identifying editorial structures in noisy or unstructured text, making it robust for tasks involving broken text or OCR artifacts.
 
-Segmentext is ideal for text segmentation across a variety of document types, including financial reports, scientific papers, and historical manuscripts.
+✂️📜Segment-Text is ideal for text segmentation across a variety of document types, including financial reports, scientific papers, and historical manuscripts.
 """
 
 placeholder = """PLEIAS :
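The modelinfor f-strings above reference `config` and `tokenizer_config` dictionaries that this diff does not show being built. A minimal sketch of how globe.py could populate them from the Hub, assuming the standard config.json and tokenizer_config.json filenames and a placeholder repo id:

```python
# Hedged sketch: load the model's config files so the modelinfor1/2/3
# f-strings can interpolate architecture and tokenizer details.
import json
from huggingface_hub import hf_hub_download

model_name = "PleIAs/Segmentext"  # assumption, not taken from the diff

with open(hf_hub_download(model_name, "config.json"), encoding="utf-8") as f:
    config = json.load(f)  # architectures, model_type, hidden_size, id2label, ...
with open(hf_hub_download(model_name, "tokenizer_config.json"), encoding="utf-8") as f:
    tokenizer_config = json.load(f)  # tokenizer_class, ...

id2label = config["id2label"]  # string-keyed ids ('0'..'13') for the 14 categories
```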