Spaces:
Sleeping
Sleeping
add description
Browse files
app.py
CHANGED
@@ -2,8 +2,7 @@ import gradio as gr
|
|
2 |
from transformers import DebertaV2Tokenizer, DebertaV2ForTokenClassification
|
3 |
import torch
|
4 |
from huggingface_hub import hf_hub_download
|
5 |
-
import
|
6 |
-
from globe import title, description, joinus, model_name, placeholder, modelinfor1, modelinfor2, id2label
|
7 |
|
8 |
tokenizer = DebertaV2Tokenizer.from_pretrained(model_name)
|
9 |
model = DebertaV2ForTokenClassification.from_pretrained(model_name)
|
@@ -51,12 +50,12 @@ with gr.Blocks() as demo:
|
|
51 |
with gr.Group():
|
52 |
gr.Markdown(description)
|
53 |
with gr.Row():
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
with gr.Accordion(label="Join Us", open=False):
|
61 |
gr.Markdown(joinus)
|
62 |
with gr.Row():
|
|
|
2 |
from transformers import DebertaV2Tokenizer, DebertaV2ForTokenClassification
|
3 |
import torch
|
4 |
from huggingface_hub import hf_hub_download
|
5 |
+
from globe import title, description, joinus, model_name, placeholder, modelinfor1, modelinfor2, modelinfor3, id2label
|
|
|
6 |
|
7 |
tokenizer = DebertaV2Tokenizer.from_pretrained(model_name)
|
8 |
model = DebertaV2ForTokenClassification.from_pretrained(model_name)
|
|
|
50 |
with gr.Group():
|
51 |
gr.Markdown(description)
|
52 |
with gr.Row():
|
53 |
+
with gr.Group():
|
54 |
+
gr.Markdown(modelinfor1)
|
55 |
+
with gr.Group():
|
56 |
+
gr.Markdown(modelinfor2)
|
57 |
+
with gr.Group():
|
58 |
+
gr.Markdown(modelinfor3)
|
59 |
with gr.Accordion(label="Join Us", open=False):
|
60 |
gr.Markdown(joinus)
|
61 |
with gr.Row():
|
globe.py
CHANGED
@@ -7,15 +7,15 @@ title = """# 🙋🏻♂️Welcome to Tonic's PLeIAs/✂️📜Segment-Text
|
|
7 |
|
8 |
|
9 |
description = """
|
10 |
-
|
11 |
|
12 |
-
In contrast with most text-segmentation approaches,
|
13 |
|
14 |
-
|
15 |
|
16 |
-
Given the diversity of the training data,
|
17 |
|
18 |
-
|
19 |
"""
|
20 |
|
21 |
joinus = """
|
@@ -46,7 +46,7 @@ modelinfor1 = f"""
|
|
46 |
**Model Architecture**: `{config['architectures'][0]}`
|
47 |
**Model Type**: `{config['model_type']}`
|
48 |
|
49 |
-
|
50 |
|
51 |
- **Hidden Size**: `{config['hidden_size']}`
|
52 |
- **Intermediate Size**: `{config['intermediate_size']}`
|
@@ -82,7 +82,8 @@ The model is capable of classifying tokens into 14 distinct categories :
|
|
82 |
12. **{id2label['11']}** (11)
|
83 |
13. **{id2label['12']}** (12)
|
84 |
14. **{id2label['13']}** (13)
|
85 |
-
|
|
|
86 |
### Tokenizer:
|
87 |
|
88 |
- **Tokenizer Class**: `{tokenizer_config['tokenizer_class']}`
|
@@ -95,7 +96,7 @@ The model is capable of classifying tokens into 14 distinct categories :
|
|
95 |
|
96 |
This model was trained using diverse datasets, including cultural heritage texts and modern digitized documents. It excels in identifying editorial structures in noisy or unstructured text, making it robust for tasks involving broken text or OCR artifacts.
|
97 |
|
98 |
-
|
99 |
"""
|
100 |
|
101 |
placeholder = """PLEIAS :
|
|
|
7 |
|
8 |
|
9 |
description = """
|
10 |
+
✂️📜Segment-Text is a specialized language model for text-segmentation. ✂️📜Segment-Text has been trained to be resilient to broken and unstructured texts including digitization artifacts and ill-recognized layout formats.
|
11 |
|
12 |
+
In contrast with most text-segmentation approaches, ✂️📜Segment-Text is based on token classification. Editorial structures are reconstructed from the raw text without any reference to the original layout.
|
13 |
|
14 |
+
✂️📜Segment-Text was trained using HPC resources from GENCI–IDRIS on Ad Astra with 3,500 examples of manually annotated texts, mostly coming from three large-scale datasets collected by PleIAs: Finance Commons (financial documents in open data), Common Corpus (cultural heritage texts) and the Science Pile (scientific publications under open licenses - to be released).
|
15 |
|
16 |
+
Given the diversity of the training data, ✂️📜Segment-Text should work correctly on diverse document formats in the main European languages.
|
17 |
|
18 |
+
✂️📜Segment-Text can be tested on PleIAs-Bad-Data-Editor, a free demo, along with OCRonos, another model trained by PleIAs for the correction of OCR errors and other digitization artifacts.
|
19 |
"""
|
20 |
|
21 |
joinus = """
|
|
|
46 |
**Model Architecture**: `{config['architectures'][0]}`
|
47 |
**Model Type**: `{config['model_type']}`
|
48 |
|
49 |
+
✂️📜Segment-Text is a token classification model trained for segmenting unstructured or noisy text into meaningful categories. The model uses `{config['model_type']}` as its underlying architecture, optimized for token classification tasks with the following characteristics:
|
50 |
|
51 |
- **Hidden Size**: `{config['hidden_size']}`
|
52 |
- **Intermediate Size**: `{config['intermediate_size']}`
|
|
|
82 |
12. **{id2label['11']}** (11)
|
83 |
13. **{id2label['12']}** (12)
|
84 |
14. **{id2label['13']}** (13)
|
85 |
+
"""
|
86 |
+
modelinfor3 = f"""
|
87 |
### Tokenizer:
|
88 |
|
89 |
- **Tokenizer Class**: `{tokenizer_config['tokenizer_class']}`
|
|
|
96 |
|
97 |
This model was trained using diverse datasets, including cultural heritage texts and modern digitized documents. It excels in identifying editorial structures in noisy or unstructured text, making it robust for tasks involving broken text or OCR artifacts.
|
98 |
|
99 |
+
✂️📜Segment-Text is ideal for text segmentation across a variety of document types, including financial reports, scientific papers, and historical manuscripts.
|
100 |
"""
|
101 |
|
102 |
placeholder = """PLEIAS :
|