# NOTE(extraction): the original capture included Hugging Face Spaces page
# chrome here — a "Runtime error" status banner, commit hashes, and the
# editor's line-number gutter (file size: 3,594 bytes). That residue was not
# part of the source file and has been reduced to this comment so the module
# parses.
import langid
import os
from haystack import Pipeline
from haystack.nodes import TextConverter, PreProcessor, BM25Retriever, FARMReader
from haystack.document_stores import InMemoryDocumentStore
from haystack.utils import print_answers
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
class Sejarah:
    """Bilingual (Malay/English) extractive QA over local history documents.

    On construction, indexes every file under ``documents/`` into an
    in-memory BM25 store and wires a BM25Retriever + FARMReader querying
    pipeline (Malay QA model). Helsinki-NLP MarianMT models translate
    English questions into Malay before querying, and Malay answers back
    into English afterwards, so callers can ask in either language.
    """

    def __init__(self):
        document_store = InMemoryDocumentStore(use_bm25=True)

        # Indexing pipeline: raw text file -> cleaned, word-split passages.
        indexing_pipeline = Pipeline()
        text_converter = TextConverter()
        preprocessor = PreProcessor(
            clean_whitespace=True,
            clean_header_footer=True,
            clean_empty_lines=True,
            split_by="word",
            split_length=200,
            split_overlap=20,
            split_respect_sentence_boundary=True,
        )
        indexing_pipeline.add_node(component=text_converter, name="TextConverter", inputs=["File"])
        indexing_pipeline.add_node(component=preprocessor, name="PreProcessor", inputs=["TextConverter"])
        indexing_pipeline.add_node(component=document_store, name="DocumentStore", inputs=["PreProcessor"])

        # `doc_dir` (was `dir`): avoid shadowing the builtin.
        doc_dir = "documents"
        files_to_index = [os.path.join(doc_dir, f) for f in os.listdir(doc_dir)]
        indexing_pipeline.run_batch(file_paths=files_to_index)

        # Querying pipeline: BM25 retrieval followed by an extractive
        # Malay-language reader model.
        retriever = BM25Retriever(document_store=document_store)
        reader = FARMReader(model_name_or_path="primasr/malaybert-for-eqa-finetuned", use_gpu=True)
        self.querying_pipeline = Pipeline()
        self.querying_pipeline.add_node(component=retriever, name="Retriever", inputs=["Query"])
        self.querying_pipeline.add_node(component=reader, name="Reader", inputs=["Retriever"])

        # Malay -> English translation model
        self.id_en_tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-id-en")
        self.id_en_model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-id-en")
        # English -> Malay translation model
        self.en_id_tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-id")
        self.en_id_model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-id")

    def language_converter(self, content, lang, method):
        """Translate ``content`` between English and Malay when needed.

        Args:
            content: Text to (possibly) translate.
            lang: Detected language of the original *question* ("en", "ms", ...).
            method: "question" for user input going into the Malay pipeline,
                anything else for pipeline output going back to the user.

        Returns:
            The translated text, or ``content`` unchanged when the question
            was not in English (the pipeline already speaks its language).
        """
        if lang == "en":
            if method == "question":
                # English question -> Malay, for the Malay QA pipeline.
                # Direct tokenizer call replaces the deprecated
                # prepare_seq2seq_batch API; output is equivalent.
                batch = self.en_id_tokenizer([content], return_tensors="pt")
                generated = self.en_id_model.generate(**batch)
                content = self.en_id_tokenizer.batch_decode(generated, skip_special_tokens=True)[0]
            else:
                # Malay answer/context -> English, for the English-speaking user.
                # NOTE(review): the source's indentation was lost in extraction;
                # this `else` is bound to the inner `if` — the only reading under
                # which non-English input passes through untranslated.
                batch = self.id_en_tokenizer([content], return_tensors="pt")
                generated = self.id_en_model.generate(**batch)
                content = self.id_en_tokenizer.batch_decode(generated, skip_special_tokens=True)[0]
        return content

    def detect_language(self, content):
        """Return the ISO-639 code langid assigns to ``content`` (e.g. "en")."""
        return langid.classify(content)[0]

    def interface(self, question):
        """Answer ``question`` in its own language.

        Detects the question's language, translates English questions to
        Malay, runs the QA pipeline, and translates the best answer and its
        context back if needed.

        Returns:
            Tuple ``(answer, context)``.

        Raises:
            IndexError: if the pipeline returns no answers.
        """
        language = self.detect_language(question)
        converted_question = self.language_converter(question, language, "question")
        result = self.querying_pipeline.run(
            query=converted_question,
            params={
                "Retriever": {"top_k": 10},
                "Reader": {"top_k": 5},
            },
        )
        best = result['answers'][0]
        answer = self.language_converter(best.answer, language, "answer")
        context = self.language_converter(best.context, language, "answer")
        return answer, context