Emanuel Huber committed on
Commit
b149299
·
1 Parent(s): debadfd

Added multi-sentence support

Browse files
Files changed (4) hide show
  1. .gitignore +3 -1
  2. app.py +77 -17
  3. preprocessing.py +109 -0
  4. style.css +1 -1
.gitignore CHANGED
@@ -157,4 +157,6 @@ cython_debug/
157
  # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158
  # and can be added to the global gitignore or merged into this file. For a more nuclear
159
  # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160
- #.idea/
 
 
 
157
  # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158
  # and can be added to the global gitignore or merged into this file. For a more nuclear
159
  # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160
+ #.idea/
161
+
162
+ *.conllu
app.py CHANGED
@@ -1,5 +1,7 @@
1
  import logging
2
  import os
 
 
3
  from typing import List, Tuple
4
 
5
  import gradio as gr
@@ -8,6 +10,8 @@ import spacy
8
  import torch
9
  from transformers import AutoModelForTokenClassification, AutoTokenizer
10
 
 
 
11
  try:
12
  nlp = spacy.load("pt_core_news_sm")
13
  except Exception:
@@ -58,6 +62,7 @@ def predict(text, nlp, logger=None) -> Tuple[List[str], List[str]]:
58
 
59
 
60
  def text_analysis(text):
 
61
  tokens, labels, scores = predict(text, nlp, logger)
62
  pos_count = pd.DataFrame(
63
  {
@@ -79,30 +84,85 @@ def text_analysis(text):
79
  }
80
 
81
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
  css = open("style.css").read()
83
  top_html = open("top.html").read()
84
  bottom_html = open("bottom.html").read()
85
 
86
  with gr.Blocks(css=css) as demo:
87
  gr.HTML(top_html)
88
- text = gr.Textbox(placeholder="Enter your text here...", label="Input")
89
- examples = gr.Examples(
90
- examples=[
91
- [
92
- "A população não poderia ter acesso a relatórios que explicassem, por exemplo, os motivos exatos de atrasos em obras de linhas e estações."
 
 
 
 
 
93
  ],
94
- ["Filme 'Star Wars : Os Últimos Jedi' ganha trailer definitivo; assista."],
95
- ],
96
- inputs=[text],
97
- label="Select an example",
98
- )
99
- output_highlighted = gr.HighlightedText(label="Colorful output", visible=False)
100
- output_df = gr.Dataframe(label="Tabular output", visible=False)
101
- submit_btn = gr.Button("Send")
102
- submit_btn.click(
103
- fn=text_analysis, inputs=text, outputs=[output_highlighted, output_df]
104
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
  gr.HTML(bottom_html)
106
 
107
 
108
- demo.launch(debug=True)
 
1
  import logging
2
  import os
3
+ import tempfile
4
+ from pathlib import Path
5
  from typing import List, Tuple
6
 
7
  import gradio as gr
 
10
  import torch
11
  from transformers import AutoModelForTokenClassification, AutoTokenizer
12
 
13
+ from preprocessing import expand_contractions
14
+
15
  try:
16
  nlp = spacy.load("pt_core_news_sm")
17
  except Exception:
 
62
 
63
 
64
  def text_analysis(text):
65
+ text = expand_contractions(text)
66
  tokens, labels, scores = predict(text, nlp, logger)
67
  pos_count = pd.DataFrame(
68
  {
 
84
  }
85
 
86
 
87
def batch_analysis(input_file):
    """
    Tag every sentence of an uploaded text file and write the result as CoNLL-U.
    Parameters
    ----------
    input_file:
        Uploaded file object; only its ``name`` attribute (a path on disk)
        is read.
    Returns
    -------
    dict:
        Update for the ``output_file`` component that makes it visible and
        points it at the generated ``output.conllu`` file.
    """
    source_path = Path(input_file.name)
    # Use a context manager so the upload's handle is closed deterministically.
    with open(source_path, encoding="utf-8") as in_f:
        lines = in_f.read().split("\n")
    name = source_path.stem

    # Every input line is split further by spaCy; a line break therefore
    # always forces a sentence boundary.
    sents = []
    for line in lines:
        for span in nlp(line).sents:
            sentence = str(span).strip()
            # Whitespace-only lines can yield empty spans; skip them so the
            # tagger never receives an empty sentence.
            if sentence:
                sents.append(sentence)

    conllu_output = []
    for i, sent in enumerate(sents, start=1):
        conllu_output.append(f"# sent_id = {name}-{i}\n")
        conllu_output.append(f"# text = {sent}\n")
        tokens, labels, _scores = predict(sent, nlp, logger)
        for j, (token, label) in enumerate(zip(tokens, labels), start=1):
            # CoNLL-U rows have 10 fields: ID, FORM, LEMMA, UPOS, then XPOS,
            # FEATS, HEAD, DEPREL, DEPS and MISC, which are left unspecified.
            conllu_output.append(f"{j}\t{token}\t_\t{label}" + "\t_" * 6 + "\n")
        # A blank line terminates each sentence block.
        conllu_output.append("\n")

    # NOTE(review): the fixed file name is shared by every request, so two
    # simultaneous uploads would overwrite each other's result.
    output_filename = "output.conllu"
    # The sentences contain accented characters; do not rely on the
    # platform's default encoding when writing them back.
    with open(output_filename, "w", encoding="utf-8") as out_f:
        out_f.writelines(conllu_output)

    return {output_file: output_file.update(visible=True, value=output_filename)}
113
+
114
+
115
  # Read the stylesheet and the header/footer HTML fragments used to build the page.
  # NOTE(review): these handles are never closed and no encoding is given;
  # `with open(..., encoding="utf-8")` would be safer.
  css = open("style.css").read()
116
  top_html = open("top.html").read()
117
  bottom_html = open("bottom.html").read()
118
 
119
  # Assemble the page: header HTML, two tabs, footer HTML.
  with gr.Blocks(css=css) as demo:
120
  gr.HTML(top_html)
121
  # Tab 1: a textbox with two example sentences; its "Send" button calls
  # text_analysis, and both output components start hidden.
+ with gr.Tab("Single sentence"):
122
+ text = gr.Textbox(placeholder="Enter your text here...", label="Input")
123
+ examples = gr.Examples(
124
+ examples=[
125
+ [
126
+ "A população não poderia ter acesso a relatórios que explicassem, por exemplo, os motivos exatos de atrasos em obras de linhas e estações."
127
+ ],
128
+ [
129
+ "Filme 'Star Wars : Os Últimos Jedi' ganha trailer definitivo; assista."
130
+ ],
131
  ],
132
+ inputs=[text],
133
+ label="Select an example",
134
+ )
135
+ output_highlighted = gr.HighlightedText(label="Colorful output", visible=False)
136
+ output_df = gr.Dataframe(label="Tabular output", visible=False)
137
+ submit_btn = gr.Button("Send")
138
+ submit_btn.click(
139
+ fn=text_analysis, inputs=text, outputs=[output_highlighted, output_df]
140
+ )
141
  # Tab 2: a file upload whose "Send" button calls batch_analysis; the
  # output file component starts hidden.
  # NOTE(review): the help text below reads "splitted"; the correct form is "split".
+ with gr.Tab("Multiple sentences"):
142
+ gr.HTML(
143
+ """
144
+ <p>Upload file with raw sentences in it. Below is an example of what we expect the contents of the file to look like.
145
+ Sentences are automatically splitted by Spacy's sentencizer.
146
+ To force an explicit division, manually separate the sentences on different lines.</p>
147
+ """
148
+ )
149
+ gr.Markdown(
150
+ """
151
+ ```
152
+ Então ele hesitou, quase como se estivesse surpreso com as próprias palavras, e recitou:
153
+ – Vá e não tornes a pecar!
154
+ Baley, sorrindo de repente, pegou no cotovelo de R. Daneel e eles saíram juntos pela porta.
155
+ ```
156
+ """
157
+ )
158
+ input_file = gr.File(label="Upload your input file here...")
159
+ output_file = gr.File(visible=False)
160
+ submit_btn_batch = gr.Button("Send")
161
+ submit_btn_batch.click(
162
+ fn=batch_analysis, inputs=input_file, outputs=output_file
163
+ )
164
+
165
  gr.HTML(bottom_html)
166
 
167
 
168
  # Launch the app with debug enabled on port 15000.
+ demo.launch(debug=True, server_port=15000)
preprocessing.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+
3
# Maps a regex matching a Portuguese contraction to its expanded form.
# Each pattern is guarded by a lookbehind (no word character or "." before)
# and a lookahead (no word character or literal "$" after) so only whole
# words are rewritten. "(s)?" captures an optional plural "s" that is
# re-attached to the expansion through "\g<1>".
# Insertion order is the order in which the substitutions are applied.
contractions = {
    r"(?<![\w.])no(s)?(?![$\w])": r"em o\g<1>",
    r"(?<![\w.])na(s)?(?![$\w])": r"em a\g<1>",
    r"(?<![\w.])da(s)?(?![$\w])": r"de a\g<1>",
    r"(?<![\w.])do(s)?(?![$\w])": r"de o\g<1>",
    r"(?<![\w.])ao(s)?(?![$\w])": r"a o\g<1>",
    r"(?<![\w.])à(s)?(?![$\w])": r"a a\g<1>",
    r"(?<![\w.])pela(s)?(?![$\w])": r"por a\g<1>",
    r"(?<![\w.])pelo(s)?(?![$\w])": r"por o\g<1>",
    r"(?<![\w.])nesta(s)?(?![$\w])": r"em esta\g<1>",
    r"(?<![\w.])neste(s)?(?![$\w])": r"em este\g<1>",
    r"(?<![\w.])nessa(s)?(?![$\w])": r"em essa\g<1>",
    r"(?<![\w.])nesse(s)?(?![$\w])": r"em esse\g<1>",
    r"(?<![\w.])num(?![$\w])": r"em um",
    r"(?<![\w.])nuns(?![$\w])": r"em uns",
    r"(?<![\w.])numa(s)?(?![$\w])": r"em uma\g<1>",
    r"(?<![\w.])nisso(?![$\w])": r"em isso",
    r"(?<![\w.])naquele(s)?(?![$\w])": r"em aquele\g<1>",
    r"(?<![\w.])naquela(s)?(?![$\w])": r"em aquela\g<1>",
    # "naquilo" is "em" + "aquilo" (the previous expansion "aquelo" is not a word).
    r"(?<![\w.])naquilo(?![$\w])": r"em aquilo",
    r"(?<![\w.])duma(s)?(?![$\w])": r"de uma\g<1>",
    r"(?<![\w.])daqui(?![$\w])": r"de aqui",
    r"(?<![\w.])dali(?![$\w])": r"de ali",
    r"(?<![\w.])daquele(s)?(?![$\w])": r"de aquele\g<1>",
    r"(?<![\w.])daquela(s)?(?![$\w])": r"de aquela\g<1>",
    r"(?<![\w.])deste(s)?(?![$\w])": r"de este\g<1>",
    r"(?<![\w.])desta(s)?(?![$\w])": r"de esta\g<1>",
    r"(?<![\w.])desse(s)?(?![$\w])": r"de esse\g<1>",
    r"(?<![\w.])dessa(s)?(?![$\w])": r"de essa\g<1>",
    r"(?<![\w.])daí(?![$\w])": r"de aí",
    r"(?<![\w.])dum(?![$\w])": r"de um",
    r"(?<![\w.])donde(?![$\w])": r"de onde",
    r"(?<![\w.])disto(?![$\w])": r"de isto",
    r"(?<![\w.])disso(?![$\w])": r"de isso",
    r"(?<![\w.])daquilo(?![$\w])": r"de aquilo",
    r"(?<![\w.])dela(s)?(?![$\w])": r"de ela\g<1>",
    r"(?<![\w.])dele(s)?(?![$\w])": r"de ele\g<1>",
    r"(?<![\w.])nisto(?![$\w])": r"em isto",
    r"(?<![\w.])nele(s)?(?![$\w])": r"em ele\g<1>",
    r"(?<![\w.])nela(s)?(?![$\w])": r"em ela\g<1>",
    # The plain "dele"/"dela" forms are already handled above; these two
    # entries additionally cover the apostrophe spellings "d'ele"/"d'ela".
    r"(?<![\w.])d'?ele(s)?(?![$\w])": r"de ele\g<1>",
    r"(?<![\w.])d'?ela(s)?(?![$\w])": r"de ela\g<1>",
    r"(?<![\w.])noutro(s)?(?![$\w])": r"em outro\g<1>",
    r"(?<![\w.])aonde(?![$\w])": r"a onde",
    r"(?<![\w.])àquela(s)?(?![$\w])": r"a aquela\g<1>",
    r"(?<![\w.])àquele(s)?(?![$\w])": r"a aquele\g<1>",
    # "àquilo" is "a" + "aquilo" (the previous expansion "aquelo" is not a word).
    r"(?<![\w.])àquilo(?![$\w])": r"a aquilo",
    r"(?<![\w.])contigo(?![$\w])": r"com ti",
    r"(?<![\w.])né(?![$\w])": r"não é",
    r"(?<![\w.])comigo(?![$\w])": r"com mim",
    r"(?<![\w.])conosco(?![$\w])": r"com nós",
    r"(?<![\w.])consigo(?![$\w])": r"com si",
    r"(?<![\w.])pra(?![$\w])": r"para a",
    r"(?<![\w.])pro(?![$\w])": r"para o",
}
59
+
60
+
61
def replace_keep_case(word: str, replacement: str, text: str) -> str:
    """
    Replace every case-insensitive match of a pattern, keeping the casing
    style (lower, Capitalised or UPPER) of the text that was matched.
    Parameters
    ----------
    word: str
        Regular expression to search for.
    replacement: str
        Replacement template; may reference groups of ``word``
        (e.g. ``\\g<1>``).
    text: str
        Text to be processed.
    Returns
    -------
    str:
        Processed string
    """

    def _recase(match):
        original = match.group()
        expanded = match.expand(replacement)
        if original.islower():
            return expanded.lower()
        # str.istitle() alone rejects capitalised matches that contain an
        # apostrophe ("D'ele"), because it demands an uppercase letter after
        # every non-letter. Also accept "first letter upper, rest lower" so
        # such matches keep their leading capital.
        if original.istitle() or (
            original[:1].isupper() and original[1:].islower()
        ):
            return expanded.capitalize()
        if original.isupper():
            return expanded.upper()
        # Mixed casing: there is no style to mirror, keep the template as is.
        return expanded

    return re.sub(word, _recase, text, flags=re.I)
90
+
91
+
92
def expand_contractions(text: str) -> str:
    """
    Replace contractions with their expanded base form.
    Parameters
    ----------
    text: str
        Text that may contain contractions.
    Returns
    -------
    str:
        Text with expanded contractions.
    """

    # Apply the substitutions one after another, in the dictionary's
    # insertion order; each one sees the output of the previous one.
    for pattern, expansion in contractions.items():
        text = replace_keep_case(pattern, expansion, text)

    return text
style.css CHANGED
@@ -14,7 +14,7 @@ a {
14
  }
15
 
16
  .container {
17
- max-width: 730px;
18
  margin: auto;
19
  padding-top: 1.5rem;
20
  }
 
14
  }
15
 
16
  .container {
17
+ max-width: 900px;
18
  margin: auto;
19
  padding-top: 1.5rem;
20
  }