# app.py (2.34 kB) -- last commit fe1893c by versae: "Update app.py"
from typing import List, Optional, Set, Tuple, Union

import fasttext
import gradio as gr
from huggingface_hub import hf_hub_download
# Resolve "model.bin" from the "NbAiLab/nb-nordic-lid" Hub repository and load
# it with fastText. This runs once, when the module is imported.
_model_path = hf_hub_download("NbAiLab/nb-nordic-lid", "model.bin")
model = fasttext.load_model(_model_path)

# Language codes the model can emit: the last three characters of every label.
model_labels = {label[-3:] for label in model.get_labels()}
def detect_lang(
    text: str,
    langs: Optional[Union[List, Set]] = None,
    threshold: float = -1.0,
    return_proba: bool = False,
) -> Union[str, List[Tuple[str, float]]]:
    """
    Detect the language of ``text`` with the module-level fastText model.

    The model is asked for every label (``k=-1``) whose probability is at
    least ``threshold``. Each label is reduced to its last three characters
    (documented for this model as an ISO-639-3 code), and only codes present
    in ``langs`` are kept. If nothing survives the filtering, the code
    ``"und"`` (undetermined) is reported with probability ``1.0``.

    Args:
    - text (str): The text to detect the language of. Newlines are replaced
      by spaces before prediction, so multi-line input is accepted.
    - langs (List or Set, optional): Language codes to restrict detection to.
      ``None`` or an empty collection means every language in the model's
      labels.
    - threshold (float, optional): The minimum probability for a language to
      be considered detected. Defaults to `-1.0`.
    - return_proba (bool, optional): Whether to return probabilities along
      with the language codes. Defaults to `False`.

    Returns:
    str or List[Tuple[str, float]]: With ``return_proba=False``, the code of
    the first surviving prediction (fastText is expected to return
    predictions ordered by decreasing probability -- TODO confirm), or
    ``"und"``. With ``return_proba=True``, a list of
    ``(language_code, probability)`` pairs for every surviving prediction,
    or ``[("und", 1.0)]`` when there is none.
    """
    allowed = set(langs) if langs else model_labels

    # fastText's ``predict`` handles a single line at a time and raises
    # ValueError when the input contains "\n", so flatten the text first.
    single_line = text.replace("\n", " ")

    labels, probabilities = model.predict(single_line, threshold=threshold, k=-1)
    predictions = [
        # Clamp at 1.0; presumably the model can report values marginally
        # above 1.0 because of floating-point rounding.
        (label[-3:], min(probability, 1.0))
        for label, probability in zip(labels, probabilities)
        if label[-3:] in allowed
    ]

    if not predictions:
        # Nothing matched the requested languages / threshold.
        predictions = [("und", 1.0)]
    return predictions if return_proba else predictions[0][0]
def identify(text):
    """
    Map each language code detected in ``text`` to its probability.

    Calls ``detect_lang`` with ``return_proba=True`` and turns the resulting
    ``(language_code, probability)`` pairs into a dictionary, which is the
    value handed to the Gradio "label" output.
    """
    scored = detect_lang(text, return_proba=True)
    return {code: probability for code, probability in scored}
# Wire ``identify`` into a Gradio interface (text input, label output) and
# start serving it.
iface = gr.Interface(
    fn=identify,
    inputs="text",
    outputs="label",
)
iface.launch()