lfoppiano committed
Commit ad4e920 · verified · 1 parent: bf58cda

Delete grobid.yaml

Files changed (1)
  1. grobid.yaml +0 -331
grobid.yaml DELETED
@@ -1,331 +0,0 @@
-# this is the configuration file for the GROBID instance
-
-grobid:
-  # where all the Grobid resources are stored (models, lexicon, native libraries, etc.), normally no need to change
-  grobidHome: "grobid-home"
-
-  # path relative to the grobid-home path (e.g. tmp for grobid-home/tmp) or absolute path (/tmp)
-  temp: "tmp"
-
-  # normally nothing to change here, path relative to the grobid-home path (e.g. grobid-home/lib)
-  nativelibrary: "lib"
-
-  pdf:
-    pdfalto:
-      # path relative to the grobid-home path (e.g. grobid-home/pdfalto), you don't want to change this normally
-      path: "pdfalto"
-      # security for PDF parsing
-      memoryLimitMb: 6096
-      timeoutSec: 120
-
-    # security relative to the PDF parsing result
-    blocksMax: 200000
-    tokensMax: 1000000
-
-  consolidation:
-    # define the bibliographical data consolidation service to be used, either "crossref" for the CrossRef REST API or
-    # "glutton" for https://github.com/kermitt2/biblio-glutton
-    service: "crossref"
-    #service: "glutton"
-    glutton:
-      url: "https://cloud.science-miner.com/glutton"
-      #url: "http://localhost:8080"
-    crossref:
-      mailto:
-      # to use the CrossRef web API, you normally need to use it politely and indicate an email address here, e.g.
-      #mailto: "[email protected]"
-      token:
-      # to use the Crossref Metadata Plus service (available by subscription)
-      #token: "yourmysteriouscrossrefmetadataplusauthorizationtokentobeputhere"
-
-  proxy:
-    # proxy to be used for external calls to the consolidation service
-    host:
-    port:
-
-  # CORS configuration for the GROBID web API service
-  corsAllowedOrigins: "*"
-  corsAllowedMethods: "OPTIONS,GET,PUT,POST,DELETE,HEAD"
-  corsAllowedHeaders: "X-Requested-With,Content-Type,Accept,Origin"
-
-  # the actual implementation for language recognition to be used
-  languageDetectorFactory: "org.grobid.core.lang.impl.CybozuLanguageDetectorFactory"
-
-  # the actual implementation for optional sentence segmentation to be used (PragmaticSegmenter or OpenNLP)
-  sentenceDetectorFactory: "org.grobid.core.lang.impl.PragmaticSentenceDetectorFactory"
-  # sentenceDetectorFactory: "org.grobid.core.lang.impl.OpenNLPSentenceDetectorFactory"
-
-  # maximum concurrency allowed to the GROBID server for processing parallel requests - change it according to your CPU/GPU capacities
-  # for a production server running only GROBID, set the value slightly above the available number of threads of the server
-  # to get the best performance and security
-  concurrency: 10
-  # when the pool is full, this is the maximum time (in seconds) a query waiting for the availability of a Grobid engine
-  # will try to get one - normally never change it
-  poolMaxWait: 1
-
-  delft:
-    # DeLFT global parameters
-    # delft installation path if Deep Learning architectures are used to implement one of the sequence labeling models,
-    # embeddings are usually compiled as lmdb under delft/data (this parameter is ignored if only feature-engineered CRFs are used)
-    install: "../delft"
-    pythonVirtualEnv:
-
-  wapiti:
-    # Wapiti global parameters
-    # number of threads for training the wapiti models (0 to use all available processors)
-    nbThreads: 0
-
-  models:
-    # we configure here how each sequence labeling model should be implemented
-    # for feature-engineered CRF, use "wapiti"; the possible training parameters are window, epsilon and nbMaxIterations
-    # for Deep Learning, use "delft" and select the target DL architecture (see the DeLFT library); the training
-    # parameters then depend on the selected DL architecture
-
-    - name: "segmentation"
-      # at this time, must always be CRF wapiti, the input sequence size is too large for a Deep Learning implementation
-      engine: "wapiti"
-      #engine: "delft"
-      wapiti:
-        # wapiti training parameters, used at training time only
-        epsilon: 0.0000001
-        window: 50
-        nbMaxIterations: 2000
-      delft:
-        # deep learning parameters
-        architecture: "BidLSTM_CRF_FEATURES"
-        useELMo: false
-        runtime:
-          # parameters used at runtime/prediction
-          max_sequence_length: 3000
-          batch_size: 1
-        training:
-          # parameters used for training
-          max_sequence_length: 3000
-          batch_size: 10
-
-    - name: "fulltext"
-      # at this time, must always be CRF wapiti, the input sequence size is too large for a Deep Learning implementation
-      engine: "wapiti"
-      wapiti:
-        # wapiti training parameters, used at training time only
-        epsilon: 0.0001
-        window: 20
-        nbMaxIterations: 1500
-
-    - name: "header"
-      engine: "wapiti"
-      #engine: "delft"
-      wapiti:
-        # wapiti training parameters, used at training time only
-        epsilon: 0.000001
-        window: 30
-        nbMaxIterations: 1500
-      delft:
-        # deep learning parameters
-        architecture: "BidLSTM_ChainCRF_FEATURES"
-        #transformer: "allenai/scibert_scivocab_cased"
-        useELMo: false
-        runtime:
-          # parameters used at runtime/prediction
-          #max_sequence_length: 510
-          max_sequence_length: 3000
-          batch_size: 1
-        training:
-          # parameters used for training
-          #max_sequence_length: 510
-          #batch_size: 6
-          max_sequence_length: 3000
-          batch_size: 9
-
-    - name: "reference-segmenter"
-      engine: "wapiti"
-      #engine: "delft"
-      wapiti:
-        # wapiti training parameters, used at training time only
-        epsilon: 0.00001
-        window: 20
-      delft:
-        # deep learning parameters
-        architecture: "BidLSTM_ChainCRF_FEATURES"
-        useELMo: false
-        runtime:
-          # parameters used at runtime/prediction (for this model, use the same max_sequence_length as for training)
-          max_sequence_length: 3000
-          batch_size: 2
-        training:
-          # parameters used for training
-          max_sequence_length: 3000
-          batch_size: 10
-
-    - name: "name-header"
-      engine: "wapiti"
-      #engine: "delft"
-      delft:
-        # deep learning parameters
-        architecture: "BidLSTM_CRF_FEATURES"
-
-    - name: "name-citation"
-      engine: "wapiti"
-      #engine: "delft"
-      delft:
-        # deep learning parameters
-        architecture: "BidLSTM_CRF_FEATURES"
-
-    - name: "date"
-      engine: "wapiti"
-      #engine: "delft"
-      delft:
-        # deep learning parameters
-        architecture: "BidLSTM_CRF_FEATURES"
-
-    - name: "figure"
-      engine: "wapiti"
-      #engine: "delft"
-      wapiti:
-        # wapiti training parameters, used at training time only
-        epsilon: 0.00001
-        window: 20
-      delft:
-        # deep learning parameters
-        architecture: "BidLSTM_CRF"
-
-    - name: "table"
-      engine: "wapiti"
-      #engine: "delft"
-      wapiti:
-        # wapiti training parameters, used at training time only
-        epsilon: 0.00001
-        window: 20
-      delft:
-        # deep learning parameters
-        architecture: "BidLSTM_CRF"
-
-    - name: "affiliation-address"
-      engine: "wapiti"
-      #engine: "delft"
-      delft:
-        # deep learning parameters
-        architecture: "BidLSTM_CRF_FEATURES"
-
-    - name: "citation"
-      engine: "wapiti"
-      #engine: "delft"
-      wapiti:
-        # wapiti training parameters, used at training time only
-        epsilon: 0.00001
-        window: 50
-        nbMaxIterations: 3000
-      delft:
-        # deep learning parameters
-        architecture: "BidLSTM_CRF_FEATURES"
-        #architecture: "BERT_CRF"
-        #transformer: "michiyasunaga/LinkBERT-base"
-        useELMo: false
-        runtime:
-          # parameters used at runtime/prediction
-          max_sequence_length: 500
-          batch_size: 30
-        training:
-          # parameters used for training
-          max_sequence_length: 500
-          batch_size: 50
-
-    - name: "patent-citation"
-      engine: "wapiti"
-      #engine: "delft"
-      wapiti:
-        # wapiti training parameters, used at training time only
-        epsilon: 0.0001
-        window: 20
-      delft:
-        # deep learning parameters
-        architecture: "BidLSTM_CRF_FEATURES"
-        #architecture: "BERT_CRF"
-        runtime:
-          # parameters used at runtime/prediction
-          max_sequence_length: 800
-          batch_size: 20
-        training:
-          # parameters used for training
-          max_sequence_length: 1000
-          batch_size: 40
-
-    - name: "funding-acknowledgement"
-      engine: "wapiti"
-      #engine: "delft"
-      wapiti:
-        # wapiti training parameters, used at training time only
-        epsilon: 0.00001
-        window: 50
-        nbMaxIterations: 2000
-      delft:
-        # deep learning parameters
-        architecture: "BidLSTM_CRF_FEATURES"
-        #architecture: "BERT_CRF"
-        #transformer: "michiyasunaga/LinkBERT-base"
-        useELMo: false
-        runtime:
-          # parameters used at runtime/prediction
-          max_sequence_length: 800
-          batch_size: 20
-        training:
-          # parameters used for training
-          max_sequence_length: 500
-          batch_size: 40
-
-    - name: "copyright"
-      # at this time, we only have a DeLFT implementation;
-      # use "wapiti" if the deep learning library JNI is not available, and the model will then be ignored
-      #engine: "delft"
-      engine: "wapiti"
-      delft:
-        # deep learning parameters
-        architecture: "gru"
-        #architecture: "bert"
-        #transformer: "allenai/scibert_scivocab_cased"
-
-    - name: "license"
-      # at this time, to be active it must be DeLFT, no other implementation is available;
-      # use "wapiti" if the deep learning library JNI is not available, and the model will then be ignored
-      #engine: "delft"
-      engine: "wapiti"
-      delft:
-        # deep learning parameters
-        architecture: "gru"
-        #architecture: "bert"
-        #transformer: "allenai/scibert_scivocab_cased"
-
-  # for **service only**: how to load the models
-  # false -> models are loaded when needed, avoiding keeping unused models in memory (only in the case of CRF), but the
-  # first call to the service is significantly slower
-  # true -> all the models are loaded into memory at server startup (default); this slows the start of the service,
-  # and unused models will take some more memory (only in the case of CRF), but the server is immediately warm and ready
-  modelPreload: true
-
-server:
-  type: custom
-  applicationConnectors:
-    - type: http
-      port: 8070
-  adminConnectors:
-    - type: http
-      port: 8071
-  registerDefaultExceptionMappers: false
-  # change the following to have all HTTP requests logged
-  requestLog:
-    appenders: []
-
-# these logging settings apply to the Grobid service usage mode
-logging:
-  level: INFO
-  loggers:
-    org.apache.pdfbox.pdmodel.font.PDSimpleFont: "OFF"
-    org.glassfish.jersey.internal: "OFF"
-    com.squarespace.jersey2.guice.JerseyGuiceUtils: "OFF"
-  appenders:
-    - type: console
-      threshold: INFO
-      timeZone: UTC
-      # uncomment to have the logs in JSON format
-      # layout:
-      #   type: json
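
Two remarks on the deleted configuration, for anyone restoring or adapting it. First, switching a model from CRF to its deep-learning implementation only means flipping the commented alternatives already present in the file. A minimal sketch for the "header" model, reusing only values the file itself carries (the SciBERT transformer line stays commented, as an illustration rather than a recommendation):

    - name: "header"
      #engine: "wapiti"
      engine: "delft"
      delft:
        architecture: "BidLSTM_ChainCRF_FEATURES"
        #transformer: "allenai/scibert_scivocab_cased"
        useELMo: false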
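Second, the consolidation block likewise carries its own commented alternative for a local biblio-glutton instance. A minimal sketch of that variant, assuming a glutton service is actually running at the localhost URL commented in the file:

  consolidation:
    service: "glutton"
    glutton:
      url: "http://localhost:8080"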