dinalzein committed on
Commit
409b791
·
1 Parent(s): a00b2b5

add app file

Browse files
Files changed (1) hide show
  1. app.py +84 -0
app.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer
2
+ import gradio as gr
3
+ import torch
4
+ import numpy as np
5
+ from mapping_labels import languages_map, id2label
6
+
7
# Identifier of the fine-tuned checkpoint that both the tokenizer and the
# classification model are loaded from.
model_checkpoint = "dinalzein/xlm-roberta-base-finetuned-language-identification"

tokenizer = AutoTokenizer.from_pretrained(
    model_checkpoint,
    use_fast=True,
)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)

# A Trainer with default arguments; this file only calls its predict() method.
trainer = Trainer(model=model)
11
+
12
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset over tokenizer encodings with optional labels.

    Args:
        encodings: Mapping from feature name to a per-example sequence of
            values. It must contain ``"input_ids"``, which defines the
            dataset length.
        labels: Optional per-example labels. Leave as ``None`` (or pass an
            empty sequence) when no labels are available.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return the features of example ``idx`` as a dict of tensors.

        A ``"labels"`` entry is added only when labels were supplied.
        """
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Test for None and emptiness explicitly: truth-testing a NumPy array
        # or tensor with more than one element raises instead of answering.
        labels = self.labels
        if labels is not None and len(labels) > 0:
            item["labels"] = torch.tensor(labels[idx])
        return item

    def __len__(self):
        """Return the number of examples, taken from ``"input_ids"``."""
        return len(self.encodings["input_ids"])
25
+
26
def identify_language(txt):
    """Predict the language of ``txt`` and return its ``languages_map`` entry.

    The text is tokenized as a one-element batch (truncated to 20 tokens),
    run through ``trainer.predict``, and the highest-scoring class index is
    translated via ``id2label`` and then ``languages_map``.
    """
    encoded = tokenizer([txt], truncation=True, max_length=20)
    prediction = trainer.predict(Dataset(encoded))
    # The first element of the prediction output holds the raw class scores.
    scores = prediction[0]
    best_class = np.argmax(scores, axis=1)[0]
    # id2label is keyed by the class index rendered as a string.
    label = id2label[str(best_class)]
    return languages_map[label]
34
+
35
+
36
+
37
+ #with gr.Row():
38
# Sample inputs offered in the UI: French, German, English and Arabic.
examples = [
    "C'est La Vie",
    "So ist das Leben",
    "That is life",
    # Restored from mis-decoded bytes so the example is real Arabic text.
    "هذه هي الحياة",
]
44
+
45
+
46
# Multi-line text box the user types (or pastes) the text into.
inputs = gr.inputs.Textbox(
    lines=5,
    label="Text content",
    placeholder="Enter your text here",
)

# Label component that displays the identified language.
outputs = gr.outputs.Label(label="Language Identified:")
48
+
49
+
50
# Markdown passed to the interface as its ``article``: the list of the
# 20 languages the description advertises.
article = ('''## Supported Languages \n
* Arabic (ar)
* Bulgarian (bg)
* German (de)
* Modern greek (el)
* English (en)
* Spanish (es)
* French (fr)
* Hindi (hi)
* Italian (it)
* Japanese (ja)
* Dutch (nl)
* Polish (pl)
* Portuguese (pt)
* Russian (ru)
* Swahili (sw)
* Thai (th)
* Turkish (tr)
* Urdu (ur)
* Vietnamese (vi)
* Chinese (zh)
''')
72
+
73
+
74
# Build the Gradio interface around identify_language and start serving it.
gr.Interface(
    fn=identify_language,
    inputs=inputs,
    outputs=outputs,
    verbose=True,
    examples=examples,
    title="Language Identifier",
    description="It aims at identifying the language a document is written in. It supports 20 different languages.",
    article=article,
    theme="huggingface",
).launch()