Tonic committed
Commit b9152b3
1 Parent(s): c1a6fc6

add description

Files changed (2)
  1. app.py +7 -8
  2. globe.py +9 -8
app.py CHANGED
@@ -2,8 +2,7 @@ import gradio as gr
 from transformers import DebertaV2Tokenizer, DebertaV2ForTokenClassification
 import torch
 from huggingface_hub import hf_hub_download
-import json
-from globe import title, description, joinus, model_name, placeholder, modelinfor1, modelinfor2, id2label
+from globe import title, description, joinus, model_name, placeholder, modelinfor1, modelinfor2, modelinfor3, id2label
 
 tokenizer = DebertaV2Tokenizer.from_pretrained(model_name)
 model = DebertaV2ForTokenClassification.from_pretrained(model_name)
@@ -51,12 +50,12 @@ with gr.Blocks() as demo:
     with gr.Group():
         gr.Markdown(description)
     with gr.Row():
-        with gr.Column(scale=1):
-            with gr.Row():
-                with gr.Group():
-                    gr.Markdown(modelinfor1)
-                with gr.Group():
-                    gr.Markdown(modelinfor2)
+        with gr.Group():
+            gr.Markdown(modelinfor1)
+        with gr.Group():
+            gr.Markdown(modelinfor2)
+        with gr.Group():
+            gr.Markdown(modelinfor3)
     with gr.Accordion(label="Join Us", open=False):
         gr.Markdown(joinus)
     with gr.Row():
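For reference, here is a minimal sketch of the token-classification inference that app.py builds its UI around; the repo id and sample text are illustrative placeholders, not values taken from the Space:

```python
# Hedged sketch: standard DebertaV2 token-classification inference.
# "PleIAs/Segmentext" is a stand-in for whatever model_name globe.py exports.
import torch
from transformers import DebertaV2Tokenizer, DebertaV2ForTokenClassification

model_name = "PleIAs/Segmentext"  # assumption, not taken from the diff
tokenizer = DebertaV2Tokenizer.from_pretrained(model_name)
model = DebertaV2ForTokenClassification.from_pretrained(model_name)

text = "Some broken, unstructured text to segment."
inputs = tokenizer(text, return_tensors="pt", truncation=True)

with torch.no_grad():
    logits = model(**inputs).logits  # shape: (1, seq_len, num_labels)

# One predicted label id per token, mapped to the editorial categories
# that globe.py lists via id2label.
predicted_ids = logits.argmax(dim=-1)[0].tolist()
tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
for token, label_id in zip(tokens, predicted_ids):
    print(token, model.config.id2label[label_id])
```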
globe.py CHANGED
@@ -7,15 +7,15 @@ title = """# 🙋🏻‍♂️Welcome to Tonic's PLeIAs/✂️📜Segment-Text
 
 
 description = """
-Segmentext is a specialized language model for text-segmentation. Segmentext has been trained to be resilient to broken and unstructured texts including digitzation artifacts and ill-recognized layout formats.
+✂️📜Segment-Text is a specialized language model for text segmentation. ✂️📜Segment-Text has been trained to be resilient to broken and unstructured texts, including digitization artifacts and poorly recognized layout formats.
 
-In contrast with most text-segmentation approach, Segmentext is based on token classification. Editorial structure are reconstructed by the raw text without any reference to the original layout.
+In contrast with most text-segmentation approaches, ✂️📜Segment-Text is based on token classification. Editorial structures are reconstructed from the raw text without any reference to the original layout.
 
-Segmentext was trained using HPC resources from GENCI–IDRIS on Ad Astra with 3,500 example of manually annotated texts, mostly coming from three large scale dataset collected by PleIAs, Finance Commons (financial documents in open data), Common Corpus (cultural heritage texts) and the Science Pile (scientific publication in open licenses - to be released).
+✂️📜Segment-Text was trained using HPC resources from GENCI–IDRIS on Ad Astra with 3,500 examples of manually annotated texts, mostly coming from three large-scale datasets collected by PleIAs: Finance Commons (financial documents in open data), Common Corpus (cultural heritage texts), and the Science Pile (scientific publications under open licenses - to be released).
 
-Given the diversity of the training data, Segmentext should work correctly on diverse document formats in the main European languages.
+Given the diversity of the training data, ✂️📜Segment-Text should work correctly on diverse document formats in the main European languages.
 
-Segmentext can be tested on PleIAs-Bad-Data-Editor, a free demo along with OCRonos, another model trained by PleIAs for the correction of OCR errors and other digitization artifact.
+✂️📜Segment-Text can be tested on PleIAs-Bad-Data-Editor, a free demo, along with OCRonos, another model trained by PleIAs for the correction of OCR errors and other digitization artifacts.
 """
 
 joinus = """
@@ -46,7 +46,7 @@ modelinfor1 = f"""
 **Model Architecture**: `{config['architectures'][0]}`
 **Model Type**: `{config['model_type']}`
 
-Segmentext is a token classification model trained for segmenting unstructured or noisy text into meaningful categories. The model uses `{config['model_type']}` as its underlying architecture, optimized for token classification tasks with the following characteristics:
+✂️📜Segment-Text is a token classification model trained for segmenting unstructured or noisy text into meaningful categories. The model uses `{config['model_type']}` as its underlying architecture, optimized for token classification tasks with the following characteristics:
 
 - **Hidden Size**: `{config['hidden_size']}`
 - **Intermediate Size**: `{config['intermediate_size']}`
@@ -82,7 +82,8 @@ The model is capable of classifying tokens into 14 distinct categories :
 12. **{id2label['11']}** (11)
 13. **{id2label['12']}** (12)
 14. **{id2label['13']}** (13)
-
+"""
+modelinfor3 = f"""
 ### Tokenizer:
 
 - **Tokenizer Class**: `{tokenizer_config['tokenizer_class']}`
@@ -95,7 +96,7 @@ The model is capable of classifying tokens into 14 distinct categories :
 
 This model was trained using diverse datasets, including cultural heritage texts and modern digitized documents. It excels in identifying editorial structures in noisy or unstructured text, making it robust for tasks involving broken text or OCR artifacts.
 
-Segmentext is ideal for text segmentation across a variety of document types, including financial reports, scientific papers, and historical manuscripts.
+✂️📜Segment-Text is ideal for text segmentation across a variety of document types, including financial reports, scientific papers, and historical manuscripts.
 """
 
 placeholder = """PLEIAS :
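The modelinfor f-strings above reference `config` and `tokenizer_config` dictionaries that this diff does not show being built. A minimal sketch of how globe.py could populate them from the Hub, assuming the standard config.json and tokenizer_config.json filenames and a placeholder repo id:

```python
# Hedged sketch: load the model's config files so the modelinfor1/2/3
# f-strings can interpolate architecture and tokenizer details.
import json
from huggingface_hub import hf_hub_download

model_name = "PleIAs/Segmentext"  # assumption, not taken from the diff

with open(hf_hub_download(model_name, "config.json"), encoding="utf-8") as f:
    config = json.load(f)  # architectures, model_type, hidden_size, id2label, ...
with open(hf_hub_download(model_name, "tokenizer_config.json"), encoding="utf-8") as f:
    tokenizer_config = json.load(f)  # tokenizer_class, ...

id2label = config["id2label"]  # string-keyed ids ('0'..'13') for the 14 categories
```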