import transformers
import gradio as gr
import tensorflow as tf

MODEL_DIRECTORY = './result/model'
PRETRAINED_MODEL_NAME = 'dbmdz/bert-base-german-cased'
TOKENIZER = transformers.BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)
MAX_SEQUENCE_LENGTH = 300


def encode(sentences, tokenizer, sequence_length):
    """Tokenize a batch of sentences into fixed-length TensorFlow tensors."""
    return tokenizer.batch_encode_plus(
        sentences,
        max_length=sequence_length,   # fixed length of the sequences
        add_special_tokens=True,      # add [CLS] and [SEP] tokens
        return_attention_mask=True,
        return_token_type_ids=False,  # not needed for this type of ML task
        padding='max_length',         # add 0 pad tokens to sequences shorter than max_length
        truncation=True,              # cut sequences longer than max_length
        return_tensors='tf'
    )


hs_detection_model = tf.keras.models.load_model(MODEL_DIRECTORY, compile=True)


def inference(sentence):
    encoded_sentence = encode([sentence], TOKENIZER, MAX_SEQUENCE_LENGTH)
    return hs_detection_model.predict(encoded_sentence.values())


title = "HS-Detector Demonstrator"
description = """

Dataset: germeval18_hasoc19_rp21_combi_dataset (17.7% hate speech)

The best model so far is based on BERT, fine-tuned for 2 epochs with a maximum of 300 tokens per entry, with the following evaluation results:

Accuracy: 0.8795
Balanced Accuracy: 0.7562
Binary F1-Score: 0.6250
Binary Precision: 0.6995
Binary Recall: 0.5649
Weighted F1-Score: 0.8743
Weighted Precision: 0.8723
Weighted Recall: 0.8795
Macro F1-Score: 0.7766
Macro Precision: 0.8046
Macro Recall: 0.7562
MCC score: 0.5587
AUROC score: 0.7562
""" article = "Die Eingaben werden nicht geloggt. Klassifikator einfach ausprobieren." input_sentence_text = gr.inputs.Textbox(placeholder="Hier den Satz eingeben, der Hassrede enthalten kann.") ui = gr.Interface(fn=inference, inputs=input_sentence_text, outputs="text", title = title, description = description, article = article) ui.launch()