Fixed contraction format and added Jupyter tagger version

Files changed:
- .dockerignore +24 -0
- .env.example +7 -0
- .gitattributes +1 -1
- .gitignore +5 -1
- Dockerfile +21 -0
- README.md +2 -2
- app.py +36 -71
- main.ipynb +0 -0
- main.py +132 -0
- preprocessing.py +57 -1
- requirements.txt +2 -0
- top.html +1 -1
.dockerignore
ADDED
@@ -0,0 +1,24 @@
+**/.classpath
+**/.dockerignore
+**/.git
+**/.gitignore
+**/.project
+**/.settings
+**/.toolstarget
+**/.vs
+**/.vscode
+**/*.*proj.user
+**/*.dbmdl
+**/*.jfm
+**/bin
+**/charts
+**/docker-compose*
+**/compose*
+**/Dockerfile*
+**/node_modules
+**/npm-debug.log
+**/obj
+**/secrets.dev.yaml
+**/values.dev.yaml
+LICENSE
+README.md
.env.example
ADDED
@@ -0,0 +1,7 @@
+DEFAULT_MODEL=Model name here (News / Tweets (stock market) / Oil and Gas (academic texts) / Multigenre)
+ID_COLUMN=name of the column holding the tweet ids
+CONTENT_COLUMN=name of the column holding the tweet content
+PREFIX=prefix to prepend to the tweet ids
+DATA_PATH=path to the tweets .csv file
+OUTPUT_PATH=path to the .conllu output file
+KEEP_REPLACE_CONTRACTION=whether the original form of contractions should be kept (True/False)
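
For illustration, a filled-in .env might look like this (the column names and prefix mirror the placeholders used in app.py; the paths are hypothetical but consistent with the /data and /output entries added to .gitignore below):

DEFAULT_MODEL=Tweets (stock market)
ID_COLUMN=tweet_id
CONTENT_COLUMN=content
PREFIX=dante_02
DATA_PATH=data/tweets.csv
OUTPUT_PATH=output/tweets.conllu
KEEP_REPLACE_CONTRACTION=True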
.gitattributes
CHANGED
@@ -31,4 +31,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.xz filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore
CHANGED
@@ -159,4 +159,8 @@ cython_debug/
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
 
-*.conllu
+*.conllu
+
+/data
+/output
+.env
Dockerfile
ADDED
@@ -0,0 +1,21 @@
+# Use the official Python 3.10 image as the base
+FROM python:3.10-slim
+
+# Set the working directory to /app
+WORKDIR /app
+
+# Copy the requirements file into the working directory
+COPY requirements.txt .
+
+# Install the Python dependencies
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy the project files into the working directory
+COPY . .
+
+# Default command to run when the container starts
+CMD ["python", "main.py"]
+
+# Run the following commands in order:
+# docker build -t porttaggerdante .
+# docker run -v "path to the output directory:/app/output" porttaggerdante
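
Concretely, following the comments above, a build-and-run might look like this (the host-side output path is hypothetical):

docker build -t porttaggerdante .
docker run -v "$(pwd)/output:/app/output" porttaggerdante

For the tagged file to land in the mounted directory, OUTPUT_PATH in .env has to point inside /app/output (e.g. output/tweets.conllu, since the working directory is /app).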
README.md
CHANGED
@@ -1,7 +1,7 @@
 ---
-title: Porttagger
+title: Porttagger-DANTE
 emoji: ✍️
-colorFrom:
+colorFrom: orange
 colorTo: purple
 sdk: gradio
 sdk_version: 3.9.1
app.py
CHANGED
@@ -7,10 +7,12 @@ import gradio as gr
 import pandas as pd
 import spacy
 import torch
+
 from dante_tokenizer import DanteTokenizer
+
 from transformers import AutoModelForTokenClassification, AutoTokenizer
 
-from preprocessing import
+from preprocessing import *
 
 try:
     nlp = spacy.load("pt_core_news_sm")
@@ -19,7 +21,7 @@ except Exception:
     nlp = spacy.load("pt_core_news_sm")
 dt_tokenizer = DanteTokenizer()
 
-default_model = "
+default_model = "Tweets (stock market)"
 model_choices = {
     "News": "Emanuel/porttagger-news-base",
     "Tweets (stock market)": "Emanuel/porttagger-tweets-base",
@@ -90,59 +92,37 @@ def predict(text, logger=None) -> Tuple[List[str], List[str]]:
     return tokens, labels, scores
 
 
-def
-    scores += [0] * m
-    pos_count = pd.DataFrame(
-        {
-            "token": tokens,
-            "tag": labels,
-            "confidence": scores,
-        }
-    )
-    pos_tokens = []
-    for token, label in zip(tokens, labels):
-        pos_tokens.extend([(token, label), (" ", None)])
-
-    output_highlighted.update(visible=True)
-    output_df.update(visible=True)
-
-    return {
-        output_highlighted: output_highlighted.update(visible=True, value=(pos_tokens)),
-        output_df: output_df.update(visible=True, value=pos_count),
-    }
-
-
-def batch_analysis(input_file):
-    text = open(input_file.name, encoding="utf-8").read()
-    text = text.split("\n")
-    name = Path(input_file.name).stem
-    sents = []
-    for sent in text:
-        sub_sents = nlp(sent).sents
-        sub_sents = [str(_sent).strip() for _sent in sub_sents]
-        sents += sub_sents
+def batch_analysis_csv(input_file, id_column: str='tweet_id', content_column: str='content', prefix: str='dante_02', keep_replace_contraction=True):
+    df = pd.read_csv(input_file.name, encoding='utf-8')
+    ids = df[id_column]
+    texts = df[content_column]
+    texts = texts.replace(r'\\n', ' ', regex=True)
+    texts = texts.apply(lambda x : x.strip())
     conllu_output = []
 
-    for
-        conllu_output.append("# sent_id = {}-{}\n".format(name, i + 1))
+    for id, sent in zip(ids, texts):
+        conllu_output.append("# sent_id = {}_{}\n".format(prefix, id))
         conllu_output.append("# text = {}\n".format(sent))
-        tokens, labels,
+        tokens, labels, _ = predict(sent, logger)
+        tokens_labels = list(zip(tokens, labels))
+
+        for j, (token, label) in enumerate(tokens_labels):
+            try:
+                contr = tokens_labels[j][0] + ' ' + tokens_labels[j+1][0]
+                for expansion in expansions.keys():
+                    replace_str = expansions[expansion]
+                    match = re.match(expansion, contr, re.I)
+                    expansion = replace_keep_case(expansion, replace_str, contr)
+                    if match is not None:
+                        conllu_output.append("{}\t{}".format(str(j+1)+'-'+str(j+2), expansion) + "\t_" * 8 + "\n")
+                        break
+                conllu_output.append("{}\t{}\t_\t{}".format(j + 1, token, label) + "\t_" * 6 + "\n")
+            except IndexError:
+                conllu_output.append("{}\t{}\t_\t{}".format(j + 1, token, label) + "\t_" * 6 + "\n")
         conllu_output.append("\n")
-
     output_filename = "output.conllu"
-    with open(output_filename, "w") as out_f:
+    with open(output_filename, "w", encoding='utf-8') as out_f:
         out_f.writelines(conllu_output)
-
     return {output_file: output_file.update(visible=True, value=output_filename)}
 
 
@@ -154,26 +134,11 @@ with gr.Blocks(css=css) as demo:
     gr.HTML(top_html)
     select_model = gr.Dropdown(choices=list(model_choices.keys()), label="Tagger model", value=default_model)
     select_model.change(myapp.load_model, inputs=[select_model])
-
-                "A população não poderia ter acesso a relatórios que explicassem, por exemplo, os motivos exatos de atrasos em obras de linhas e estações."
-            ],
-            [
-                "Filme 'Star Wars : Os Últimos Jedi' ganha trailer definitivo; assista."
-            ],
-        ],
-        inputs=[text],
-        label="Select an example",
-    )
-    output_highlighted = gr.HighlightedText(label="Colorful output", visible=False)
-    output_df = gr.Dataframe(label="Tabular output", visible=False)
-    submit_btn = gr.Button("Tag it")
-    submit_btn.click(
-        fn=text_analysis, inputs=text, outputs=[output_highlighted, output_df]
-    )
+
+    id_column = gr.Textbox(placeholder='tweet_id', label='Id column')
+    content_column = gr.Textbox(placeholder='content', label='Content column')
+    label_prefix = gr.Textbox(placeholder='dante_02', label='Label prefix')
+
     with gr.Tab("Multiple sentences"):
         gr.HTML(
             """
@@ -197,10 +162,10 @@ with gr.Blocks(css=css) as demo:
     output_file = gr.File(label="Tagged file", visible=False)
     submit_btn_batch = gr.Button("Tag it")
     submit_btn_batch.click(
-        fn=
+        fn=batch_analysis_csv, inputs=[input_file, id_column, content_column, label_prefix], outputs=output_file
     )
 
     gr.HTML(bottom_html)
 
 
-demo.launch(debug=True)
+demo.launch(debug=True)
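
The contraction loop above writes CoNLL-U multiword-token lines: when two consecutive predicted tokens such as "de o" match one of the expansion patterns, a range line carrying the contracted surface form ("do") is emitted before the two individual token lines. For a hypothetical tweet "Previsão do mercado" with id 123456, the output would look roughly like this (fields are tab-separated; the POS tags are illustrative, not actual model output):

# sent_id = dante_02_123456
# text = Previsão do mercado
1	Previsão	_	NOUN	_	_	_	_	_	_
2-3	do	_	_	_	_	_	_	_	_
2	de	_	ADP	_	_	_	_	_	_
3	o	_	DET	_	_	_	_	_	_
4	mercado	_	NOUN	_	_	_	_	_	_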
main.ipynb
ADDED
The diff for this file is too large to render; see the raw diff.
main.py
ADDED
@@ -0,0 +1,132 @@
+import logging
+import os
+from typing import List, Tuple
+import pandas as pd
+import spacy
+import torch
+from dante_tokenizer import DanteTokenizer
+from transformers import AutoModelForTokenClassification, AutoTokenizer
+from dotenv import dotenv_values
+
+from dante_tokenizer.data.preprocessing import split_monetary_tokens, normalize_text, split_enclisis
+from preprocessing import *
+
+try:
+    nlp = spacy.load("pt_core_news_sm")
+except Exception:
+    os.system("python -m spacy download pt_core_news_sm")
+    nlp = spacy.load("pt_core_news_sm")
+dt_tokenizer = DanteTokenizer()
+
+model_choices = {
+    "News": "Emanuel/porttagger-news-base",
+    "Tweets (stock market)": "Emanuel/porttagger-tweets-base",
+    "Oil and Gas (academic texts)": "Emanuel/porttagger-oilgas-base",
+    "Multigenre": "Emanuel/porttagger-base",
+}
+pre_tokenizers = {
+    "News": nlp,
+    "Tweets (stock market)": dt_tokenizer.tokenize,
+    "Oil and Gas (academic texts)": nlp,
+    "Multigenre": nlp,
+}
+
+env_vars = dotenv_values('.env')
+
+for key, value in env_vars.items():
+    globals()[key] = value
+
+logger = logging.getLogger()
+logger.setLevel(logging.DEBUG)
+
+class MyApp:
+    def __init__(self) -> None:
+        self.model = None
+        self.tokenizer = None
+        self.pre_tokenizer = None
+        self.load_model()
+
+    def load_model(self, model_name: str = DEFAULT_MODEL):
+        if model_name not in model_choices.keys():
+            logger.error("Selected model is not supported, resetting to the default model.")
+            model_name = DEFAULT_MODEL
+        self.model = AutoModelForTokenClassification.from_pretrained(model_choices[model_name])
+        self.tokenizer = AutoTokenizer.from_pretrained(model_choices[model_name])
+        self.pre_tokenizer = pre_tokenizers[model_name]
+
+myapp = MyApp()
+
+def predict(text, logger=None) -> Tuple[List[str], List[str]]:
+    doc = myapp.pre_tokenizer(text)
+    tokens = [token.text if not isinstance(token, str) else token for token in doc]
+
+    logger.info("Starting predictions for sentence: {}".format(text))
+    print("Using model {}".format(myapp.model.config.__dict__["_name_or_path"]))
+
+    input_tokens = myapp.tokenizer(
+        tokens,
+        return_tensors="pt",
+        is_split_into_words=True,
+        return_offsets_mapping=True,
+        return_special_tokens_mask=True,
+    )
+    output = myapp.model(input_tokens["input_ids"])
+
+    i_token = 0
+    labels = []
+    scores = []
+    for off, is_special_token, pred in zip(
+        input_tokens["offset_mapping"][0],
+        input_tokens["special_tokens_mask"][0],
+        output.logits[0],
+    ):
+        if is_special_token or off[0] > 0:
+            continue
+        label = myapp.model.config.__dict__["id2label"][int(pred.argmax(axis=-1))]
+        if logger is not None:
+            logger.info("{}, {}, {}".format(off, tokens[i_token], label))
+        labels.append(label)
+        scores.append(
+            "{:.2f}".format(100 * float(torch.softmax(pred, dim=-1).detach().max()))
+        )
+        i_token += 1
+
+    return tokens, labels, scores
+
+def batch_analysis_csv(ID_COLUMN: str, CONTENT_COLUMN: str, DATA_PATH: str, PREFIX: str, OUTPUT_PATH: str, KEEP_REPLACE_CONTRACTION: bool):
+    df = pd.read_csv(DATA_PATH)
+    ids = df[ID_COLUMN]
+    texts = df[CONTENT_COLUMN]
+    texts = texts.replace(r'\\n', ' ', regex=True)  # replace literal '\n' sequences with a space
+    texts = texts.apply(lambda x : x.strip())  # trim surrounding whitespace
+    conllu_output = []
+
+    for id, sent in zip(ids, texts):
+        conllu_output.append("# sent_id = {}_{}\n".format(PREFIX, id))
+        conllu_output.append("# text = {}\n".format(sent))
+        tokens, labels, _ = predict(sent, logger)
+        tokens_labels = list(zip(tokens, labels))
+
+        for j, (token, label) in enumerate(tokens_labels):
+            try:
+                contr = tokens_labels[j][0] + ' ' + tokens_labels[j+1][0]
+                for expansion in expansions.keys():
+                    replace_str = expansions[expansion]
+                    match = re.match(expansion, contr, re.IGNORECASE)
+                    expansion = replace_keep_case(expansion, replace_str, contr)
+                    if match is not None:
+                        conllu_output.append("{}\t{}".format(str(j+1)+'-'+str(j+2), expansion) + "\t_" * 8 + "\n")
+                        break
+                conllu_output.append("{}\t{}\t_\t{}".format(j + 1, token, label) + "\t_" * 6 + "\n")
+            except IndexError:
+                conllu_output.append("{}\t{}\t_\t{}".format(j + 1, token, label) + "\t_" * 6 + "\n")
+        conllu_output.append("\n")
+    with open(OUTPUT_PATH, 'w', encoding='utf-8') as out_f:
+        out_f.writelines(conllu_output)
+
+def main():
+    batch_analysis_csv(ID_COLUMN, CONTENT_COLUMN, DATA_PATH, PREFIX, OUTPUT_PATH, KEEP_REPLACE_CONTRACTION)
+
+if __name__ == '__main__':
+    main()
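
As a usage sketch, the same batch pipeline can also be driven from Python directly, assuming a .env file like the example above exists so that importing main succeeds (the column names and paths here are hypothetical):

# Sketch: run the batch tagger without Docker.
from main import batch_analysis_csv

batch_analysis_csv(
    ID_COLUMN="tweet_id",
    CONTENT_COLUMN="content",
    DATA_PATH="data/tweets.csv",
    PREFIX="dante_02",
    OUTPUT_PATH="output/tweets.conllu",
    KEEP_REPLACE_CONTRACTION=True,
)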
preprocessing.py
CHANGED
@@ -46,7 +46,7 @@ contractions = {
     r"(?<![\w.])aonde(?![$\w])": r"a onde",
     r"(?<![\w.])àquela(s)?(?![$\w])": r"a aquela\g<1>",
     r"(?<![\w.])àquele(s)?(?![$\w])": r"a aquele\g<1>",
-    r"(?<![\w.])àquilo(?![$\w])": r"a
+    r"(?<![\w.])àquilo(?![$\w])": r"a aquilo",
     r"(?<![\w.])contigo(?![$\w])": r"com ti",
     r"(?<![\w.])né(?![$\w])": r"não é",
     r"(?<![\w.])comigo(?![$\w])": r"com mim",
@@ -58,6 +58,60 @@ contractions = {
 }
 
 
+expansions = {
+    r'^em o(s)?$': r'no\g<1>',
+    r'^em a(s)?$': r'na\g<1>',
+    r'^de a(s)?$': r'da\g<1>',
+    r'^de o(s)?$': r'do\g<1>',
+    r'^a o(s)?$': r'ao\g<1>',
+    r'^a a(s)?$': r'à\g<1>',
+    r'^por a(s)?$': r'pela\g<1>',
+    r'^por o(s)?$': r'pelo\g<1>',
+    r'^em esta(s)?$': r'nesta\g<1>',
+    r'^em este(s)?$': r'neste\g<1>',
+    r'^em essa(s)?$': r'nessa\g<1>',
+    r'^em esse(s)?$': r'nesse\g<1>',
+    r'^em um$': r'num',
+    r'^em uns$': r'nuns',
+    r'^em uma(s)?$': r'numa\g<1>',
+    r'^em isso$': r'nisso',
+    r'^em aquele(s)?$': r'naquele\g<1>',
+    r'^em aquela(s)?$': r'naquela\g<1>',
+    r'^em aquilo$': r'naquilo',
+    r'^de uma(s)?$': r'duma\g<1>',
+    r'^de aqui$': r'daqui',
+    r'^de ali$': r'dali',
+    r'^de aquele(s)?$': r'daquele\g<1>',
+    r'^de aquela(s)?$': r'daquela\g<1>',
+    r'^de este(s)?$': r'deste\g<1>',
+    r'^de esta(s)?$': r'desta\g<1>',
+    r'^de esse(s)?$': r'desse\g<1>',
+    r'^de essa(s)?$': r'dessa\g<1>',
+    r'^de aí$': r'daí',
+    r'^de um$': r'dum',
+    r'^de onde$': r'donde',
+    r'^de isto$': r'disto',
+    r'^de isso$': r'disso',
+    r'^de aquilo$': r'daquilo',
+    r'^de ela(s)?$': r"dela\g<1>",
+    r'^de ele(s)?$': r"dele\g<1>",
+    r'^em isto$': r'nisto',
+    r'^em ele(s)?$': r'nele\g<1>',
+    r'^em ela(s)?$': r'nela\g<1>',
+    r'^em outro(s)?$': r'noutro\g<1>',
+    r'^a onde$': r'aonde',
+    r'^a aquela(s)?$': r'àquela\g<1>',
+    r'^a aquele(s)?$': r'àquele\g<1>',
+    r'^a aquilo$': r'àquilo',
+    r'^com ti$': r'contigo',
+    r'^não é$': r'né',
+    r'^com mim$': r'comigo',
+    r'^com nós$': r'conosco',
+    r'^com si$': r'consigo',
+    r'^para a$': r'pra',
+    r'^para o$': r'pro'
+}
+
 def replace_keep_case(word, replacement, text):
     """
     Custom function for replace keeping the original case.
@@ -84,6 +138,8 @@ def replace_keep_case(word, replacement, text):
             return repl.capitalize()
         if g.isupper():
             return repl.upper()
+        if g[0].isupper():
+            return repl[0].upper() + repl[1:]
         return repl
 
     return re.sub(word, func, text, flags=re.I)
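
To make the commit's case fix concrete, here is a minimal sketch of how the expansions table and replace_keep_case interact on a token pair, as in batch_analysis_csv (the printed result assumes the new g[0].isupper() branch above and group-reference expansion inside replace_keep_case):

import re
from preprocessing import expansions, replace_keep_case

pair = "Em os"  # two consecutive tokens joined with a space
for pattern, contracted in expansions.items():
    if re.match(pattern, pair, re.IGNORECASE):
        # Rewrites the pair to its contracted surface form while keeping
        # the original capitalization, e.g. "Em os" -> "Nos".
        print(replace_keep_case(pattern, contracted, pair))
        break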
requirements.txt
CHANGED
@@ -57,6 +57,7 @@ pydub==0.25.1
 PyNaCl==1.5.0
 pyparsing==3.0.9
 python-dateutil==2.8.2
+python-dotenv==1.0.1
 python-multipart==0.0.5
 pytz==2022.6
 PyYAML==6.0
@@ -84,3 +85,4 @@ uvicorn==0.19.0
 wasabi==0.10.1
 websockets==10.4
 yarl==1.8.1
+option==2.1.0
top.html
CHANGED
@@ -1,7 +1,7 @@
 <div style="text-align: center; max-width: 650px; margin: 0 auto;">
   <div>
     <h1 style="font-weight: 900; font-size: 3rem; margin: 20px;">
-
+      PorttaggerDANTE
     </h1>
     <p class="slogan">A Brazilian Portuguese part of speech tagger according to the <a
       href="https://universaldependencies.org/">Universal Dependencies</a> model