"""Bilingual (Malay/English) extractive question answering over local history
documents. A Haystack BM25 + FARMReader pipeline answers in Malay; Helsinki-NLP
OPUS-MT models translate questions and answers when the user writes English."""

import os

import langid
from haystack import Pipeline
from haystack.nodes import TextConverter, PreProcessor, BM25Retriever, FARMReader
from haystack.document_stores import InMemoryDocumentStore
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

class Sejarah:
    """Sejarah (Malay for "history"): extractive QA over Malay documents that
    answers English questions in English via machine translation."""

    def __init__(self):
        # In-memory document store with BM25 sparse retrieval enabled
        document_store = InMemoryDocumentStore(use_bm25=True)

        # Indexing pipeline: plain-text files -> cleaned, overlapping
        # 200-word passages -> document store
        indexing_pipeline = Pipeline()
        text_converter = TextConverter()
        preprocessor = PreProcessor(
            clean_whitespace=True,
            clean_header_footer=True,
            clean_empty_lines=True,
            split_by="word",
            split_length=200,
            split_overlap=20,
            split_respect_sentence_boundary=True,
        )

        indexing_pipeline.add_node(component=text_converter, name="TextConverter", inputs=["File"])
        indexing_pipeline.add_node(component=preprocessor, name="PreProcessor", inputs=["TextConverter"])
        indexing_pipeline.add_node(component=document_store, name="DocumentStore", inputs=["PreProcessor"])

        dir = "documents"

        files_to_index = [dir+"/" + f for f in os.listdir(dir)]
        indexing_pipeline.run_batch(file_paths=files_to_index)

        # Querying components: BM25 retriever plus an extractive Malay BERT
        # reader (set use_gpu=False on CPU-only machines)
        retriever = BM25Retriever(document_store=document_store)
        reader = FARMReader(model_name_or_path="primasr/malaybert-for-eqa-finetuned", use_gpu=True)

        self.querying_pipeline = Pipeline()
        self.querying_pipeline.add_node(component=retriever, name="Retriever", inputs=["Query"])
        self.querying_pipeline.add_node(component=reader, name="Reader", inputs=["Retriever"])

        # Indonesian -> English model, standing in for Malay -> English
        # (the two languages are closely related)
        self.id_en_tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-id-en")
        self.id_en_model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-id-en")

        # English -> Indonesian model, standing in for English -> Malay
        self.en_id_tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-id")
        self.en_id_model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-id")

    
    def language_converter(self, content, lang, method):
        """Translate `content` when the user's language is English: questions
        go English -> Malay for the reader, answers go Malay -> English for
        the user. Malay input passes through unchanged."""
        if lang == "en":
            # prepare_seq2seq_batch is deprecated; call the tokenizer directly
            if method == "question":
                tokenized_text = self.en_id_tokenizer([content], return_tensors="pt", padding=True)
                translation = self.en_id_model.generate(**tokenized_text)
                content = self.en_id_tokenizer.batch_decode(translation, skip_special_tokens=True)[0]
            else:
                tokenized_text = self.id_en_tokenizer([content], return_tensors="pt", padding=True)
                translation = self.id_en_model.generate(**tokenized_text)
                content = self.id_en_tokenizer.batch_decode(translation, skip_special_tokens=True)[0]

        return content


    def detect_language(self, content):
        """Return the ISO 639-1 code of `content`, e.g. "en" or "ms"."""
        lang, _confidence = langid.classify(content)
        return lang

        
    def interface(self, question):
        """Answer `question` in the language it was asked in, returning the
        best answer span and its surrounding context."""
        language = self.detect_language(question)

        # Ensure the reader sees a Malay question
        converted_question = self.language_converter(question, language, "question")

        result = self.querying_pipeline.run(
            query=converted_question,
            params={
                "Retriever": {"top_k": 10},
                "Reader": {"top_k": 5},
            },
        )

        # Guard against an empty result before indexing into it
        if not result["answers"]:
            return None, None

        # Translate the top answer and its context back if needed
        best = result["answers"][0]
        answer = self.language_converter(best.answer, language, "answer")
        context = self.language_converter(best.context, language, "answer")

        return answer, context
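

# Minimal usage sketch, not part of the original file: it assumes a
# documents/ folder of plain-text files exists next to this script, and the
# sample question is illustrative only.
if __name__ == "__main__":
    qa = Sejarah()
    answer, context = qa.interface("When did Malaysia gain independence?")
    print("Answer:", answer)
    print("Context:", context)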