Tonic committed on
Commit
9d032cb
•
1 Parent(s): 69444a0

add segment text

Browse files
Files changed (3) hide show
  1. README.md +2 -2
  2. app.py +68 -0
  3. requirements.txt +3 -0
README.md CHANGED
@@ -1,12 +1,12 @@
1
  ---
2
- title: Segment Text
3
  emoji: 🔥
4
  colorFrom: yellow
5
  colorTo: blue
6
  sdk: gradio
7
  sdk_version: 4.44.0
8
  app_file: app.py
9
- pinned: false
10
  license: mit
11
  ---
12
 
 
1
  ---
2
+ title: Tonic's Segment Text
3
  emoji: 🔥
4
  colorFrom: yellow
5
  colorTo: blue
6
  sdk: gradio
7
  sdk_version: 4.44.0
8
  app_file: app.py
9
+ pinned: true
10
  license: mit
11
  ---
12
 
app.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from transformers import DebertaV2Tokenizer, DebertaV2ForTokenClassification
3
+ import torch
4
+
5
+
6
# Checkpoint identifier handed to both `from_pretrained` calls below.
model_name = "PleIAs/Segmentext"
tokenizer = DebertaV2Tokenizer.from_pretrained(model_name)
model = DebertaV2ForTokenClassification.from_pretrained(model_name)

# Class index (position in the tuple) -> segment label name.
id2label = dict(
    enumerate(
        (
            "author",
            "bibliography",
            "caption",
            "contact",
            "date",
            "dialog",
            "footnote",
            "keywords",
            "math",
            "paratext",
            "separator",
            "table",
            "text",
            "title",
        )
    )
)

# Segment label name -> highlight colour used by the UI component.
color_map = {
    "author": "blue",
    "bibliography": "purple",
    "caption": "orange",
    "contact": "cyan",
    "date": "green",
    "dialog": "yellow",
    "footnote": "pink",
    "keywords": "lightblue",
    "math": "red",
    "paratext": "lightgreen",
    "separator": "gray",
    "table": "brown",
    "text": "lightgray",
    "title": "gold",
}
25
+
26
+
27
def segment_text(input_text):
    """Split *input_text* into words and tag each word with a segment label.

    The text is tokenized with the module-level ``tokenizer``, classified
    token-by-token with the module-level ``model``, and the sub-word tokens
    are merged back into words. A token starting with "▁" opens a new word;
    any other token is appended to the word currently being built.

    Each word is labelled with the prediction made for its *first* token.
    Special tokens reported by ``tokenizer.all_special_tokens`` are skipped,
    so they neither appear in the output nor contribute a label.

    Args:
        input_text: Raw text to segment. ``truncation=True`` is passed to the
            tokenizer, so text beyond the tokenizer's length limit is not
            classified.

    Returns:
        A list of ``(word, label)`` tuples in reading order, where ``label``
        is a value of ``id2label``. The "▁" marker is stripped from the words,
        so they carry no whitespace. Empty or whitespace-only input yields an
        empty list.
    """
    # Nothing to classify; skip the model call entirely.
    if not input_text or not input_text.strip():
        return []

    tokens = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True)

    with torch.no_grad():
        outputs = model(**tokens)

    # Index the single batch element explicitly rather than relying on
    # squeeze(), which removes every size-1 dimension.
    input_ids = tokens["input_ids"][0].tolist()
    predictions = torch.argmax(outputs.logits, dim=-1)[0].tolist()
    tokens_decoded = tokenizer.convert_ids_to_tokens(input_ids)
    special_tokens = set(tokenizer.all_special_tokens)

    segments = []
    current_word = ""
    current_label = None
    for token, label_id in zip(tokens_decoded, predictions):
        # Special tokens are not part of the user's text.
        if token in special_tokens:
            continue

        if token.startswith("▁"):
            # Flush the finished word with the label captured when it
            # started, not the label of the token that begins the next word.
            if current_word:
                segments.append((current_word, current_label))
            current_word = token.replace("▁", "")
            current_label = id2label[label_id]
        else:
            # A continuation piece with no open word starts one itself.
            if current_label is None:
                current_label = id2label[label_id]
            current_word += token

    if current_word:
        segments.append((current_word, current_label))

    return segments
53
+
54
with gr.Blocks() as demo:
    # Page heading.
    gr.Markdown("# PleIAs/Segmentext Text Segmentation Demo")

    # Free-text input next to the colour-highlighted result.
    with gr.Row():
        input_text = gr.Textbox(
            label="Input Text",
            lines=5,
            placeholder="Enter text for segmentation",
        )
        output_text = gr.HighlightedText(
            label="Segmented Text",
            color_map=color_map,
            combine_adjacent=True,
        )

    def process(input_text):
        """Return the labelled segments produced by ``segment_text``."""
        labelled_segments = segment_text(input_text)
        return labelled_segments

    # Clicking the button sends the textbox value through `process` and
    # shows the result in the highlighted-text component.
    submit_button = gr.Button("Segment Text")
    submit_button.click(fn=process, inputs=input_text, outputs=output_text)

# Start the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ torch
2
+ transformers
3
+ accelerate