---
version: "0.8.0"
corpusPath: "./resources/dataset/dataseer/corpus"
templatePath: "./resources/dataset/dataseer/crfpp-templates/dataseer.template"
grobidHome: "/opt/grobid/grobid-home"
tmpPath: "/opt/grobid/grobid-home/tmp/"
# path to Pub2TEI repository as available at https://github.com/kermitt2/Pub2TEI
pub2teiPath: "/opt/Pub2TEI/"
gluttonHost:
gluttonPort:
# entity-fishing server information for performing entity disambiguation
# for https, indicate 443 as port
entityFishingHost: cloud.science-miner.com/nerd
entityFishingPort: 443
#entityFishingHost: localhost
#entityFishingPort: 8090
# if true we use binary classifiers for the contexts, otherwise use a single multi-label classifier
# binary classifiers perform better, but heavier to use
useBinaryContextClassifiers: false
# sequence labeling model (identify data-related sections)
models:
# model for zones
- name: "dataseer"
engine: "wapiti"
#engine: "delft"
wapiti:
# wapiti training parameters, they will be used at training time only
epsilon: 0.00001
window: 20
nbMaxIterations: 2000
  # classifier model, dataset binary (dataset or not dataset in the current sentence)
- name: "dataseer-binary"
engine: "delft"
delft:
# deep learning parameters
#architecture: "gru"
architecture: "bert"
#embeddings_name: "word2vec"
transformer: "allenai/scibert_scivocab_cased"
# identification of the data type (first level hierarchy)
- name: "dataseer-first"
engine: "delft"
delft:
# deep learning parameters
#architecture: "gru"
architecture: "bert"
#embeddings_name: "word2vec"
transformer: "allenai/scibert_scivocab_cased"
# mention context classification (reuse binary for the moment)
- name: "dataseer-reuse"
engine: "delft"
delft:
# deep learning parameters
#architecture: "gru"
architecture: "bert"
#embeddings_name: "word2vec"
transformer: "allenai/scibert_scivocab_cased"
# model for dataset mention recognition
- name: "datasets"
#engine: "wapiti"
engine: "delft"
wapiti:
# wapiti training parameters, they will be used at training time only
epsilon: 0.00001
window: 20
nbMaxIterations: 2000
delft:
# deep learning parameters
#architecture: "BidLSTM_CRF"
architecture: "BERT_CRF"
#transformer: "allenai/scibert_scivocab_cased"
transformer: "michiyasunaga/LinkBERT-basecased"
#useELMo: true
#embeddings_name: "glove-840B"
runtime:
# parameters used at runtime/prediction
max_sequence_length: 200
#max_sequence_length: 300
batch_size: 20
- name: "context"
engine: "delft"
delft:
#architecture: "gru"
#embeddings_name: "glove-840B"
architecture: "bert"
transformer: "michiyasunaga/LinkBERT-basecased"
- name: "context_used"
engine: "delft"
delft:
#architecture: "gru"
#embeddings_name: "glove-840B"
architecture: "bert"
transformer: "michiyasunaga/LinkBERT-basecased"
- name: "context_creation"
engine: "delft"
delft:
#architecture: "gru"
#embeddings_name: "glove-840B"
architecture: "bert"
transformer: "michiyasunaga/LinkBERT-basecased"
- name: "context_shared"
engine: "delft"
delft:
#architecture: "gru"
#embeddings_name: "glove-840B"
architecture: "bert"
transformer: "michiyasunaga/LinkBERT-basecased"
# Limit the maximum number of requests (0, no limit)
maxParallelRequests: 0
# CORS configuration for the web API service
corsAllowedOrigins: "*"
corsAllowedMethods: "OPTIONS,GET,PUT,POST,DELETE,HEAD"
corsAllowedHeaders: "X-Requested-With,Content-Type,Accept,Origin"
server:
type: custom
idleTimeout: 120 seconds
applicationConnectors:
- type: http
port: 8060
adminConnectors:
- type: http
port: 8061
registerDefaultExceptionMappers: false
maxThreads: 2048
maxQueuedRequests: 2048
acceptQueueSize: 2048
requestLog:
appenders: []
# these logging settings apply to the service usage mode
logging:
level: INFO
loggers:
org.apache.pdfbox.pdmodel.font.PDSimpleFont: "OFF"
org.glassfish.jersey.internal: "OFF"
com.squarespace.jersey2.guice.JerseyGuiceUtils: "OFF"
appenders:
- type: console
threshold: INFO
timeZone: UTC
# uncomment to have the logs in json format
#layout:
      #  type: json