Spaces:

NbAiLab
/

language-identification

Sleeping

App Files Files Community

versae committed on Jun 21, 2023

Commit

fe1893c

1 Parent(s): 4106df7

Update app.py

Browse files

Files changed (1) hide show

app.py +51 -3

app.py CHANGED Viewed

@@ -1,12 +1,60 @@
-import gradio as gr
 from huggingface_hub import hf_hub_download
 import fasttext
 model = fasttext.load_model(hf_hub_download("NbAiLab/nb-nordic-lid", "model.bin"))
 def identify(text):
-    return model.predict(text)
-iface = gr.Interface(fn=identify, inputs="text", outputs="text")
 iface.launch()

+from typing import Optional, List, Set, Union
 from huggingface_hub import hf_hub_download
+import gradio as gr
 import fasttext
 model = fasttext.load_model(hf_hub_download("NbAiLab/nb-nordic-lid", "model.bin"))
+model_labels = set(label[-3:] for label in model.get_labels())
+def detect_lang(
+    text: str,
+    langs: Optional[Union[List, Set]]=None,
+    threshold: float=-1.0,
+    return_proba: bool=False
+) -> Union[str, Tuple[str, float]]:
+    """
+    This function takes in a text string and optional arguments for a list or
+    set of languages to detect, a threshold for minimum probability of language
+    detection, and a boolean for returning the probability of detected language.
+    It uses a pre-defined model to predict the language of the text and returns
+    the detected ISO-639-3 language code as a string. If the return_proba
+    argument is set to True, it will also return a tuple with the language code
+    and the probability of detection. If no language is detected, it will
+    return "und" as the language code.
+    Args:
+    - text (str): The text to detect the language of.
+    - langs (List or Set, optional): The list or set of languages to detect in
+        the text. Defaults to all languages in the model's labels.
+    - threshold (float, optional): The minimum probability for a language to be
+        considered detected. Defaults to `-1.0`.
+    - return_proba (bool, optional): Whether to return the language code and
+        probability of detection as a tuple. Defaults to `False`.
+    Returns:
+    str or Tuple[str, float]: The detected language code as a string, or a
+        tuple with the language code and probability of detection if
+        return_proba is set to True.
+    """
+    if langs:
+        langs = set(langs)
+    else:
+        langs = model_labels
+    raw_prediction = model.predict(text, threshold=threshold, k=-1)
+    predictions = [
+        (label[-3:], min(probability, 1.0))
+        for label, probability in zip(*raw_prediction)
+        if label[-3:] in langs
+    ]
+    if not predictions:
+        return [("und", 1.0)] if return_proba else "und"
+    else:
+        return predictions if return_proba else predictions[0][0]
 def identify(text):
+    return dict(detect_lang(text, return_proba=True))
+iface = gr.Interface(fn=identify, inputs="text", outputs="label")
 iface.launch()