Update spaCy pipeline
- .gitattributes +4 -0
- README.md +14 -16
- attribute_ruler/patterns +0 -0
- config.cfg +13 -33
- custom_textcat.py +142 -0
- en_tako_query_filter-any-py3-none-any.whl +2 -2
- meta.json +130 -35
- ner/model +0 -0
- ner/moves +1 -1
- parser/cfg +13 -0
- parser/model +3 -0
- parser/moves +1 -0
- tagger/cfg +57 -0
- tagger/model +0 -0
- textcat_classify/model +2 -2
- tok2vec/model +1 -1
- tok2vec_small/cfg +3 -0
- tok2vec_small/model +3 -0
- vocab/strings.json +0 -0
.gitattributes
CHANGED
@@ -39,3 +39,7 @@ textcat_classify/model filter=lfs diff=lfs merge=lfs -text
 tok2vec/model filter=lfs diff=lfs merge=lfs -text
 vocab/key2row filter=lfs diff=lfs merge=lfs -text
 vocab/vectors filter=lfs diff=lfs merge=lfs -text
+ner/model filter=lfs diff=lfs merge=lfs -text
+parser/model filter=lfs diff=lfs merge=lfs -text
+tok2vec_small/model filter=lfs diff=lfs merge=lfs -text
+vocab/strings.json filter=lfs diff=lfs merge=lfs -text
README.md
CHANGED
@@ -2,7 +2,6 @@
 tags:
 - spacy
 - token-classification
-- text-classification
 language:
 - en
 model-index:
@@ -25,10 +24,10 @@ model-index:
 | Feature | Description |
 | --- | --- |
 | **Name** | `en_tako_query_filter` |
-| **Version** | `0.0.
+| **Version** | `0.0.2` |
 | **spaCy** | `>=3.7.5,<3.8.0` |
-| **Default Pipeline** | `tok2vec`, `ner`, `
-| **Components** | `tok2vec`, `ner`, `
+| **Default Pipeline** | `tok2vec`, `ner`, `textcat_classify` |
+| **Components** | `tok2vec`, `ner`, `textcat_classify` |
 | **Vectors** | 514157 keys, 514157 unique vectors (300 dimensions) |
 | **Sources** | n/a |
 | **License** | n/a |
@@ -38,12 +37,11 @@ model-index:

 <details>

-<summary>View label scheme (
+<summary>View label scheme (21 labels for 2 components)</summary>

 | Component | Labels |
 | --- | --- |
-| **`ner`** | `CARDINAL`, `
-| **`textcat`** | `Business and Finance`, `Arts, Culture, and Entertainment`, `Crime`, `Sports`, `Politics`, `Science and Technology`, `Health and Wellness`, `Lifestyle and Fashion` |
+| **`ner`** | `CARDINAL`, `DATE`, `EVENT`, `FAC`, `GPE`, `LANGUAGE`, `LAW`, `LOC`, `MONEY`, `NORP`, `ORDINAL`, `ORG`, `PERCENT`, `PERSON`, `PRODUCT`, `QUANTITY`, `STOCK_TICKER`, `TIME`, `WORK_OF_ART` |
 | **`textcat_classify`** | `ACCEPT`, `REJECT` |

 </details>
@@ -56,12 +54,12 @@ model-index:
 | `ENTS_P` | 0.00 |
 | `ENTS_R` | 0.00 |
 | `ENTS_PER_TYPE` | 0.00 |
-| `CATS_SCORE` |
-| `CATS_MICRO_P` |
-| `CATS_MICRO_R` |
-| `CATS_MICRO_F` |
-| `CATS_MACRO_P` |
-| `CATS_MACRO_R` |
-| `CATS_MACRO_F` |
-| `CATS_MACRO_AUC` |
-| `TEXTCAT_CLASSIFY_LOSS` |
+| `CATS_SCORE` | 85.07 |
+| `CATS_MICRO_P` | 85.31 |
+| `CATS_MICRO_R` | 85.31 |
+| `CATS_MICRO_F` | 85.31 |
+| `CATS_MACRO_P` | 85.35 |
+| `CATS_MACRO_R` | 85.31 |
+| `CATS_MACRO_F` | 85.31 |
+| `CATS_MACRO_AUC` | 91.67 |
+| `TEXTCAT_CLASSIFY_LOSS` | 94.04 |
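For reference, a minimal usage sketch of the updated pipeline. It assumes the wheel shipped in this repo is installed and that the custom `weighted_textcat` factory from `custom_textcat.py` is importable (packaged models usually bundle their custom code; if you load from a raw model directory instead, `import custom_textcat` first so the factory is registered). The example query and the printed values are illustrative only.

```python
import spacy

# Assumes: pip install en_tako_query_filter-any-py3-none-any.whl
# (and that the package registers the "weighted_textcat" factory;
# otherwise `import custom_textcat` before loading).
nlp = spacy.load("en_tako_query_filter")

doc = nlp("Why did Nvidia stock jump this week?")  # made-up example query
print(doc.ents)                         # entities from `ner` (ORG, STOCK_TICKER, DATE, ...)
print(doc.cats)                         # e.g. {"ACCEPT": 0.93, "REJECT": 0.07}
print(max(doc.cats, key=doc.cats.get))  # final accept/reject decision
```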
attribute_ruler/patterns
ADDED
Binary file (14.7 kB)
config.cfg
CHANGED
@@ -1,12 +1,13 @@
 [paths]
-train = "corpus/
-dev = "corpus/
+train = "corpus/filter-train.spacy"
+dev = "corpus/filter-test.spacy"
 vectors = "en_core_web_lg"
 init_tok2vec = null

 [variables]
 wandb_project_name = "tako-query-filter"
 wandb_team_name = "tako-team"
+base_model = "ner/dashing-wind"

 [system]
 gpu_allocator = "pytorch"
@@ -14,7 +15,7 @@ seed = 0

 [nlp]
 lang = "en"
-pipeline = ["tok2vec","ner","
+pipeline = ["tok2vec","ner","textcat_classify"]
 batch_size = 1000
 disabled = []
 before_creation = null
@@ -46,30 +47,9 @@ nO = null
 width = 256
 upstream = "*"

-[components.textcat]
-factory = "textcat"
-scorer = {"@scorers":"spacy.textcat_scorer.v2"}
-threshold = 0.0
-
-[components.textcat.model]
-@architectures = "spacy.TextCatEnsemble.v2"
-nO = null
-
-[components.textcat.model.linear_model]
-@architectures = "spacy.TextCatBOW.v3"
-exclusive_classes = false
-length = 262144
-ngram_size = 1
-no_output_layer = false
-nO = null
-
-[components.textcat.model.tok2vec]
-@architectures = "spacy.Tok2VecListener.v1"
-width = 256
-upstream = "*"
-
 [components.textcat_classify]
-factory = "
+factory = "weighted_textcat"
+class_weights = [0.67,0.33]
 scorer = {"@scorers":"spacy.textcat_scorer.v2"}
 threshold = 0.0

@@ -90,17 +70,17 @@ nO = null

 [components.textcat_classify.model.tok2vec.embed]
 @architectures = "spacy.MultiHashEmbed.v2"
-width =
+width = 128
 attrs = ["NORM","PREFIX","SUFFIX","SHAPE","ENT_TYPE"]
-rows = [
+rows = [2000,500,1000,500,500]
 include_static_vectors = true

 [components.textcat_classify.model.tok2vec.encode]
 @architectures = "spacy.MaxoutWindowEncoder.v2"
-width =
+width = 128
 window_size = 1
 maxout_pieces = 3
-depth =
+depth = 4

 [components.tok2vec]
 factory = "tok2vec"
@@ -150,12 +130,12 @@ seed = ${system.seed}
 gpu_allocator = ${system.gpu_allocator}
 dropout = 0.1
 accumulate_gradient = 1
-patience =
+patience = 1000
 max_epochs = 0
 max_steps = 20000
 eval_frequency = 100
-frozen_components = ["
-annotating_components = ["ner"
+frozen_components = ["tagger","attribute_ruler","parser","tok2vec","ner"]
+annotating_components = ["ner"]
 before_to_disk = null
 before_update = null
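The `textcat_classify` component now uses the custom `weighted_textcat` factory with `class_weights = [0.67,0.33]`, while the sourced `tagger`, `attribute_ruler`, `parser`, `tok2vec` and `ner` components stay frozen and `ner` annotates during training. A small sketch, assuming `config.cfg` and `custom_textcat.py` sit in the working directory, of reading those settings back; the `--code` invocation in the comment is the usual way to expose a custom factory to the spaCy CLI and is an assumption here, not something recorded in this commit.

```python
from spacy.util import load_config

# Typical training invocation for a config with a custom factory (assumed):
#   python -m spacy train config.cfg --code custom_textcat.py
cfg = load_config("config.cfg")
print(cfg["nlp"]["pipeline"])                                  # ['tok2vec', 'ner', 'textcat_classify']
print(cfg["components"]["textcat_classify"]["class_weights"])  # [0.67, 0.33]
print(cfg["training"]["frozen_components"])                    # frozen, sourced components
```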
custom_textcat.py
ADDED
@@ -0,0 +1,142 @@
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple
from spacy.util import registry
from thinc.types import Floats2d
from spacy.tokens import Doc
from spacy.pipeline import TextCategorizer
from spacy.training import Example, validate_examples
from spacy.pipeline.textcat import textcat_score
from spacy.vocab import Vocab
from spacy.scorer import Scorer
from spacy.language import Language
from thinc.api import Model
import numpy


@Language.factory(
    "weighted_textcat",
    assigns=["doc.cats"],
    default_config={
        "threshold": 0.0,
        "scorer": {"@scorers": "spacy.textcat_scorer.v2"},
    },
    default_score_weights={
        "cats_score": 1.0,
        "cats_score_desc": None,
        "cats_micro_p": None,
        "cats_micro_r": None,
        "cats_micro_f": None,
        "cats_macro_p": None,
        "cats_macro_r": None,
        "cats_macro_f": None,
        "cats_macro_auc": None,
        "cats_f_per_type": None,
    },
)
def make_textcat(
    nlp: Language,
    name: str,
    model: Model[List[Doc], List[Floats2d]],
    threshold: float,
    scorer: Optional[Callable],
    class_weights: Optional[List],
) -> "TextCategorizer":
    """Create a TextCategorizer component. The text categorizer predicts categories
    over a whole document. It can learn one or more labels, and the labels are considered
    to be mutually exclusive (i.e. one true label per doc).

    model (Model[List[Doc], List[Floats2d]]): A model instance that predicts
    scores for each category.
    threshold (float): Cutoff to consider a prediction "positive".
    scorer (Optional[Callable]): The scoring method.
    """
    if class_weights == "null":
        class_weights = None
    return CustomTextcat(
        nlp.vocab,
        model,
        name,
        threshold=threshold,
        scorer=scorer,
        weights=class_weights,
    )


def textcat_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
    return Scorer.score_cats(
        examples,
        "cats",
        multi_label=False,
        **kwargs,
    )


@registry.scorers("spacy.textcat_scorer.v2")
def make_textcat_scorer():
    return textcat_score


class CustomTextcat(TextCategorizer):
    def __init__(
        self,
        vocab: Vocab,
        model: Model,
        name: str = "textcat",
        *,
        threshold: float,
        scorer: Optional[Callable] = textcat_score,
        weights: Optional[List[float]] = None,
    ) -> None:
        """Initialize a text categorizer for single-label classification.

        vocab (Vocab): The shared vocabulary.
        model (thinc.api.Model): The Thinc Model powering the pipeline component.
        name (str): The component instance name, used to add entries to the
            losses during training.
        threshold (float): Unused, not needed for single-label (exclusive
            classes) classification.
        scorer (Optional[Callable]): The scoring method. Defaults to
            Scorer.score_cats for the attribute "cats".

        DOCS: https://spacy.io/api/textcategorizer#init
        """
        self.vocab = vocab
        self.model = model
        self.name = name
        self._rehearsal_model = None
        cfg: Dict[str, Any] = {
            "labels": [],
            "threshold": threshold,
            "positive_label": None,
        }
        self.cfg = dict(cfg)
        self.scorer = scorer
        if weights is not None:
            print(f"Using weights: {weights}")
            self.weights = numpy.array(weights)

    def get_loss(self, examples: Iterable[Example], scores) -> Tuple[float, float]:
        """Find the loss and gradient of loss for the batch of documents and
        their predicted scores.

        examples (Iterable[Examples]): The batch of examples.
        scores: Scores representing the model's predictions.
        RETURNS (Tuple[float, float]): The loss and the gradient.

        DOCS: https://spacy.io/api/textcategorizer#get_loss
        """
        validate_examples(examples, "TextCategorizer.get_loss")
        self._validate_categories(examples)
        truths, not_missing = self._examples_to_truth(examples)
        not_missing = self.model.ops.asarray(not_missing)  # type: ignore
        d_scores = scores - truths
        d_scores *= not_missing
        weights = self.model.ops.asarray(self.weights)  # type: ignore
        if weights is not None:
            squared = d_scores**2
            mean_square_error = numpy.sum(squared * weights) / (
                numpy.sum(weights) * len(squared)
            )
            d_scores *= weights
        else:
            mean_square_error = (d_scores**2).mean()
        return float(mean_square_error), d_scores
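In `get_loss`, the class weights scale both the reported mean-squared error and the gradient column-wise, so with `class_weights = [0.67,0.33]` an error on the first class counts roughly twice as much as an error on the second (assuming the weights line up with the `[ACCEPT, REJECT]` label order). A standalone sketch of that arithmetic with made-up scores and gold labels:

```python
import numpy

weights = numpy.array([0.67, 0.33])   # class_weights from config.cfg
scores = numpy.array([[0.9, 0.1],     # doc 1: confidently ACCEPT
                      [0.4, 0.6]])    # doc 2: leaning REJECT
truths = numpy.array([[1.0, 0.0],     # both docs are gold ACCEPT
                      [1.0, 0.0]])

d_scores = scores - truths            # raw gradient
squared = d_scores ** 2
mse = numpy.sum(squared * weights) / (numpy.sum(weights) * len(squared))
d_scores *= weights                   # ACCEPT-column errors weighted ~2x REJECT's
print(round(float(mse), 4))           # weighted loss for this toy batch
print(d_scores)
```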
en_tako_query_filter-any-py3-none-any.whl
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:be4b077cc99a883fdd1970bcf64c5c31e1ed0de729d07dc2bde0ee67d5faaabe
+size 619534225
meta.json
CHANGED
@@ -1,7 +1,7 @@
 {
 "lang":"en",
 "name":"tako_query_filter",
-"version":"0.0.
+"version":"0.0.2",
 "description":"",
 "author":"",
 "email":"",
@@ -16,18 +16,116 @@
 "name":"en_vectors"
 },
 "labels":{
+"tok2vec_small":[
+
+],
+"tagger":[
+"$",
+"''",
+",",
+"-LRB-",
+"-RRB-",
+".",
+":",
+"ADD",
+"AFX",
+"CC",
+"CD",
+"DT",
+"EX",
+"FW",
+"HYPH",
+"IN",
+"JJ",
+"JJR",
+"JJS",
+"LS",
+"MD",
+"NFP",
+"NN",
+"NNP",
+"NNPS",
+"NNS",
+"PDT",
+"POS",
+"PRP",
+"PRP$",
+"RB",
+"RBR",
+"RBS",
+"RP",
+"SYM",
+"TO",
+"UH",
+"VB",
+"VBD",
+"VBG",
+"VBN",
+"VBP",
+"VBZ",
+"WDT",
+"WP",
+"WP$",
+"WRB",
+"XX",
+"_SP",
+"``"
+],
+"parser":[
+"ROOT",
+"acl",
+"acomp",
+"advcl",
+"advmod",
+"agent",
+"amod",
+"appos",
+"attr",
+"aux",
+"auxpass",
+"case",
+"cc",
+"ccomp",
+"compound",
+"conj",
+"csubj",
+"csubjpass",
+"dative",
+"dep",
+"det",
+"dobj",
+"expl",
+"intj",
+"mark",
+"meta",
+"neg",
+"nmod",
+"npadvmod",
+"nsubj",
+"nsubjpass",
+"nummod",
+"oprd",
+"parataxis",
+"pcomp",
+"pobj",
+"poss",
+"preconj",
+"predet",
+"prep",
+"prt",
+"punct",
+"quantmod",
+"relcl",
+"xcomp"
+],
+"attribute_ruler":[
+
+],
 "tok2vec":[

 ],
 "ner":[
 "CARDINAL",
-"CUSTOM_ATTRIBUTE",
-"CUSTOM_SEMANTIC_FUNCTION",
-"CUSTOM_SPORTS_CONFERENCE",
-"CUSTOM_SPORTS_LEAGUE",
-"CUSTOM_SPORTS_ROLE",
-"CUSTOM_STOCK_TICKER",
-"CUSTOM_TEAM",
 "DATE",
 "EVENT",
 "FAC",
@@ -43,34 +141,31 @@
 "PERSON",
 "PRODUCT",
 "QUANTITY",
+"STOCK_TICKER",
 "TIME",
 "WORK_OF_ART"
 ],
-"textcat":[
-"Business and Finance",
-"Arts, Culture, and Entertainment",
-"Crime",
-"Sports",
-"Politics",
-"Science and Technology",
-"Health and Wellness",
-"Lifestyle and Fashion"
-],
 "textcat_classify":[
 "ACCEPT",
 "REJECT"
 ]
 },
 "pipeline":[
+"tok2vec_small",
+"tagger",
+"parser",
+"attribute_ruler",
 "tok2vec",
 "ner",
-"textcat",
 "textcat_classify"
 ],
 "components":[
+"tok2vec_small",
+"tagger",
+"parser",
+"attribute_ruler",
 "tok2vec",
 "ner",
-"textcat",
 "textcat_classify"
 ],
 "disabled":[
@@ -81,28 +176,28 @@
 "ents_p":0.0,
 "ents_r":0.0,
 "ents_per_type":0.0,
-"cats_score":0.
+"cats_score":0.8507157464,
 "cats_score_desc":"F (ACCEPT)",
-"cats_micro_p":0.
-"cats_micro_r":0.
-"cats_micro_f":0.
-"cats_macro_p":0.
-"cats_macro_r":0.
-"cats_macro_f":0.
-"cats_macro_auc":0.
+"cats_micro_p":0.8531187123,
+"cats_micro_r":0.8531187123,
+"cats_micro_f":0.8531187123,
+"cats_macro_p":0.853485064,
+"cats_macro_r":0.8531187123,
+"cats_macro_f":0.8530806455,
+"cats_macro_auc":0.9167497439,
 "cats_f_per_type":{
 "ACCEPT":{
-"p":0.
-"r":0.
-"f":0.
+"p":0.8648648649,
+"r":0.8370221328,
+"f":0.8507157464
 },
 "REJECT":{
-"p":0.
-"r":0.
-"f":0.
+"p":0.8421052632,
+"r":0.8692152918,
+"f":0.8554455446
 }
 },
-"textcat_classify_loss":0.
+"textcat_classify_loss":0.9403656576
 },
 "requirements":[
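The new accuracy block is internally consistent: `cats_score` is F(ACCEPT) (see `cats_score_desc`), and `cats_macro_f` is the unweighted mean of the two per-class F-scores. A quick check:

```python
# Recompute the F-scores from the per-class precision/recall in meta.json.
def f1(p, r):
    return 2 * p * r / (p + r)

f_accept = f1(0.8648648649, 0.8370221328)  # ~0.8507157464 == cats_score
f_reject = f1(0.8421052632, 0.8692152918)  # ~0.8554455446
print(f_accept, f_reject)
print((f_accept + f_reject) / 2)           # ~0.8530806455 == cats_macro_f
```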
ner/model
CHANGED
Binary files a/ner/model and b/ner/model differ
ner/moves
CHANGED
@@ -1 +1 @@
-��moves
+��moves��{"0":{},"1":{"ORG":32008,"GPE":3728,"PERSON":1105,"DATE":850,"WORK_OF_ART":686,"PRODUCT":585,"EVENT":283,"MONEY":214,"NORP":179,"STOCK_TICKER":156,"LAW":129,"LOC":111,"PERCENT":88,"FAC":75,"QUANTITY":60,"CARDINAL":57,"ORDINAL":42,"TIME":27,"LANGUAGE":25},"2":{"ORG":32008,"GPE":3728,"PERSON":1105,"DATE":850,"WORK_OF_ART":686,"PRODUCT":585,"EVENT":283,"MONEY":214,"NORP":179,"STOCK_TICKER":156,"LAW":129,"LOC":111,"PERCENT":88,"FAC":75,"QUANTITY":60,"CARDINAL":57,"ORDINAL":42,"TIME":27,"LANGUAGE":25},"3":{"ORG":32008,"GPE":3728,"PERSON":1105,"DATE":850,"WORK_OF_ART":686,"PRODUCT":585,"EVENT":283,"MONEY":214,"NORP":179,"STOCK_TICKER":156,"LAW":129,"LOC":111,"PERCENT":88,"FAC":75,"QUANTITY":60,"CARDINAL":57,"ORDINAL":42,"TIME":27,"LANGUAGE":25},"4":{"ORG":32008,"GPE":3728,"PERSON":1105,"DATE":850,"WORK_OF_ART":686,"PRODUCT":585,"EVENT":283,"MONEY":214,"NORP":179,"STOCK_TICKER":156,"LAW":129,"LOC":111,"PERCENT":88,"FAC":75,"QUANTITY":60,"CARDINAL":57,"ORDINAL":42,"TIME":27,"LANGUAGE":25,"":1},"5":{"":1}}�cfg��neg_key�
parser/cfg
ADDED
@@ -0,0 +1,13 @@
{
"moves":null,
"update_with_oracle_cut_size":100,
"multitasks":[

],
"min_action_freq":30,
"learn_tokens":false,
"beam_width":1,
"beam_density":0.0,
"beam_update_prob":0.0,
"incorrect_spans_key":null
}
parser/model
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a1836fbc02b3924b2fd5f65325c58ae852ff112db1090ca724e5a801e68b85fd
size 319909
parser/moves
ADDED
@@ -0,0 +1 @@
��moves�{"0":{"":994332},"1":{"":999432},"2":{"det":172595,"nsubj":165748,"compound":116623,"amod":105184,"aux":86667,"punct":65478,"advmod":62763,"poss":36443,"mark":27941,"nummod":22598,"auxpass":15594,"prep":14001,"nsubjpass":13856,"neg":12357,"cc":10739,"nmod":9562,"advcl":9062,"npadvmod":8168,"quantmod":7101,"intj":6464,"ccomp":5896,"dobj":3427,"expl":3360,"dep":2871,"predet":1944,"parataxis":1837,"csubj":1428,"preconj":621,"pobj||prep":616,"attr":578,"meta":376,"advmod||conj":368,"dobj||xcomp":352,"acomp":284,"nsubj||ccomp":224,"dative":206,"advmod||xcomp":149,"dobj||ccomp":70,"csubjpass":64,"dobj||conj":62,"prep||conj":51,"acl":48,"prep||nsubj":41,"prep||dobj":36,"xcomp":34,"advmod||ccomp":32,"oprd":31},"3":{"punct":183790,"pobj":182191,"prep":174008,"dobj":89615,"conj":59687,"cc":51930,"ccomp":30385,"advmod":22861,"xcomp":21021,"relcl":20969,"advcl":19828,"attr":17741,"acomp":16922,"appos":15265,"case":13388,"acl":12085,"pcomp":10324,"dep":10116,"npadvmod":9796,"prt":8179,"agent":3903,"dative":3866,"nsubj":3470,"neg":2906,"amod":2839,"intj":2819,"nummod":2732,"oprd":2301,"parataxis":1261,"quantmod":319,"nmod":294,"acl||dobj":200,"prep||dobj":190,"prep||nsubj":162,"acl||nsubj":159,"appos||nsubj":145,"relcl||dobj":134,"relcl||nsubj":111,"aux":103,"expl":96,"meta":92,"appos||dobj":86,"preconj":71,"csubj":65,"prep||nsubjpass":55,"prep||advmod":54,"prep||acomp":53,"det":51,"nsubjpass":45,"relcl||pobj":42,"acl||nsubjpass":42,"mark":40,"auxpass":39,"prep||pobj":36,"relcl||nsubjpass":32,"appos||nsubjpass":31},"4":{"ROOT":111664}}�cfg��neg_key�
tagger/cfg
ADDED
@@ -0,0 +1,57 @@
{
"label_smoothing":0.0,
"labels":[
"$",
"''",
",",
"-LRB-",
"-RRB-",
".",
":",
"ADD",
"AFX",
"CC",
"CD",
"DT",
"EX",
"FW",
"HYPH",
"IN",
"JJ",
"JJR",
"JJS",
"LS",
"MD",
"NFP",
"NN",
"NNP",
"NNPS",
"NNS",
"PDT",
"POS",
"PRP",
"PRP$",
"RB",
"RBR",
"RBS",
"RP",
"SYM",
"TO",
"UH",
"VB",
"VBD",
"VBG",
"VBN",
"VBP",
"VBZ",
"WDT",
"WP",
"WP$",
"WRB",
"XX",
"_SP",
"``"
],
"neg_prefix":"!",
"overwrite":false
}
tagger/model
ADDED
Binary file (19.8 kB)
textcat_classify/model
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:c65c611aa01b463b7f99116d0b1a53cd75effb9d0bac5febef70bf3b85f0b075
+size 8319359
tok2vec/model
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:c8db1e5a93c4f955f990b7f6005b11c65ac6b9efa20f2c02291ac2013d06a203
 size 34434008
tok2vec_small/cfg
ADDED
@@ -0,0 +1,3 @@
{

}
tok2vec_small/model
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:42d8414521eaf75f817bd1b351b26039a22a912bb2617f95ead305420f2ebffd
size 6269370
vocab/strings.json
CHANGED
The diff for this file is too large to render.