Slavko Novak committed on
Commit
20d49b2
·
1 Parent(s): 9827f59

fasttext-numpy2

Browse files
Files changed (2) hide show
  1. eSeNTranslate.py +28 -0
  2. requirements.txt +1 -0
eSeNTranslate.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Imports
2
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
3
+ import fasttext
4
+ from huggingface_hub import hf_hub_download
5
+
6
+ # Model (Pipeline) class
7
class TranslateFromAny2XModel:
    """Translate text from an auto-detected source language to a target language.

    The source language is identified with a fastText language-identification
    (LID) model, and the translation itself is produced by an NLLB
    sequence-to-sequence model loaded through ``transformers``.
    """

    def __init__(self, nllb_model_path: str, fasttext_model_path: str, target_language: str = "eng_Latn"):
        """Initialize the model with paths for NLLB and FastText NLLB LID models.

        Args:
            nllb_model_path: Identifier or path handed to ``from_pretrained``
                for both the seq2seq model and its tokenizer.
            fasttext_model_path: Path handed to ``fasttext.load_model``.
            target_language: Language token whose id is forced as the first
                generated token (defaults to ``"eng_Latn"``).
        """
        self.nllb_model_path = nllb_model_path
        self.fasttext_model_path = fasttext_model_path
        self.target_language = target_language

        # Load NLLB model and tokenizer
        self.model = AutoModelForSeq2SeqLM.from_pretrained(nllb_model_path)
        self.tokenizer = AutoTokenizer.from_pretrained(nllb_model_path)

        # Load FastText language identification model
        self.fasttext_model = fasttext.load_model(fasttext_model_path)

    def generate(self, prompt: str) -> str:
        """Translate ``prompt`` into ``self.target_language``.

        The source language is detected with the fastText LID model and set
        as the tokenizer's ``src_lang`` before the prompt is tokenized.

        Args:
            prompt: Text to translate. It may span several lines.

        Returns:
            The decoded translation with special tokens stripped.
        """
        # fastText's predict() handles one line at a time and raises
        # ValueError when the text contains a newline, so language detection
        # runs on a single-line copy. The untouched prompt is still what gets
        # translated.
        lid_text = prompt.replace("\n", " ")
        top_label = self.fasttext_model.predict(lid_text)[0][0]
        self.tokenizer.src_lang = top_label.replace("__label__", "")

        inputs = self.tokenizer(prompt, return_tensors="pt")
        # Forcing the target-language token as the first generated token is
        # how the target language is selected.
        target_token_id = self.tokenizer.convert_tokens_to_ids(self.target_language)
        output_tokens = self.model.generate(**inputs, forced_bos_token_id=target_token_id)[0]
        return self.tokenizer.decode(output_tokens, skip_special_tokens=True)
28
+
requirements.txt CHANGED
@@ -3,3 +3,4 @@ numpy
3
  diffusers
4
  torch
5
  transformers
 
 
3
  diffusers
4
  torch
5
  transformers
6
+ fasttext-numpy2