import transformers
import gradio as gr
import tensorflow as tf

MODEL_DIRECTORY = './result/model'
PRETRAINED_MODEL_NAME = 'dbmdz/bert-base-german-cased'
TOKENIZER = transformers.BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)
MAX_SEQUENCE_LENGTH = 300


def encode(sentences, tokenizer, sequence_length):
    return tokenizer.batch_encode_plus(
        sentences,
        max_length=sequence_length,  # set the length of the sequences
        add_special_tokens=True,  # add [CLS] and [SEP] tokens
        return_attention_mask=True,
        return_token_type_ids=False,  # not needed for this type of ML task
        padding='max_length',  # add 0 pad tokens to sequences shorter than max_length
        truncation=True,  # cut off sequences longer than max_length
        return_tensors='tf'
    )


hs_detection_model = tf.keras.models.load_model(MODEL_DIRECTORY, compile=True)


def inference(sentence):
    encoded_sentence = encode([sentence], TOKENIZER, MAX_SEQUENCE_LENGTH)
    # pass the encoded inputs (input_ids, attention_mask) as a list of tensors
    return hs_detection_model.predict(list(encoded_sentence.values()))


title = "HS-Detector Demonstrator"
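# Quick smoke test for the pipeline above -- a sketch, assuming the saved
# model ends in a single sigmoid unit so that predict() returns a (1, 1)
# array with the hate-speech probability; the example sentence is made up
# for illustration. The block only runs when this file is executed directly.
if __name__ == '__main__':
    example_sentence = 'Das ist ein harmloser Beispielsatz.'
    prediction = inference(example_sentence)
    print(f'HS probability: {prediction[0][0]:.3f}')

description = """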
Dataset: germeval18_hasoc19_rp21_combi_dataset (17.7% HS)
The best model so far is based on BERT, fine-tuned for 2 epochs with at most 300 tokens per entry, and reaches the following evaluation results:
Accuracy: 0.8794712286158631