import argparse import json as js import os import re from pathlib import Path from typing import List, Tuple import fasttext import gradio as gr import joblib import omikuji from huggingface_hub import snapshot_download from parltopic.utils.helper import get_main_config from prepare_everything import download_model config = get_main_config() download_model( "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin", Path(config["paths"]["resources"]) / "lid.176.bin", ) # Download the model files from Hugging Face model_names = [ "omikuji-bonsai-parliament-spacy-de-all_topics-input_long", "omikuji-bonsai-parliament-spacy-fr-all_topics-input_long", "omikuji-bonsai-parliament-spacy-it-all_topics-input_long", ] for repo_id in model_names: if not os.path.exists(repo_id): os.makedirs(repo_id) model_dir = snapshot_download(repo_id=f"kapllan/{repo_id}", local_dir=f"kapllan/{repo_id}") lang_model = fasttext.load_model("lid.176.bin") with open(Path(config["paths"]["datasets"]) / "label2id.json", "r") as f: label2id = js.load(f) id2label = {} for key, value in label2id.items(): id2label[str(value)] = key with open(Path(config["paths"]["resources"]) / "topics_hierarchy.json", "r") as f: topics_hierarchy = js.load(f) def map_language(language: str) -> str: language_mapping = {"de": "German", "it": "Italian", "fr": "French"} if language in language_mapping.keys(): return language_mapping[language] else: return language def find_model(language: str): vectorizer, model = None, None if language in ["de", "fr", "it"]: path_to_vectorizer = ( f"./kapllan/omikuji-bonsai-parliament-spacy-{language}-all_topics-input_long/vectorizer" ) path_to_model = ( f"./kapllan/omikuji-bonsai-parliament-spacy-{language}-all_topics-input_long/omikuji-model" ) vectorizer = joblib.load(path_to_vectorizer) model = omikuji.Model.load(path_to_model) return vectorizer, model def predict_lang(text: str) -> str: text = re.sub( r"\n", "", text ) # Remove linebreaks because fasttext cannot process that otherwise predictions = lang_model.predict(text, k=1) # returns top 2 matching languages language = predictions[0][0] # returns top 2 matching languages language = re.sub(r"__label__", "", language) # returns top 2 matching languages return language def predict_topic(text: str) -> [List[str], str]: results = [] language = predict_lang(text) vectorizer, model = find_model(language) language = map_language(language) if vectorizer is not None: texts = [text] vector = vectorizer.transform(texts) for row in vector: if row.nnz == 0: # All zero vector, empty result continue feature_values = [(col, row[0, col]) for col in row.nonzero()[1]] for subj_id, score in model.predict(feature_values, top_k=1000): score = round(score*100, 2) results.append((id2label[str(subj_id)], score)) return results, language def get_row_color(type: str): if "main" in type.lower(): return "background-color: darkgrey;" if "sub" in type.lower(): return "background-color: lightgrey;" def generate_html_table(topics: List[Tuple[str, str, float]]): html = '
Type | Topic | Score |
---|---|---|
{type} | {topic} | {score} |