|
|
|
from haystack.document_stores import FAISSDocumentStore |
|
|
|
|
|
# Document store whose vector index is built by FAISS; the "Flat" factory
# string asks FAISS for its exhaustive (non-approximate) search index.
# NOTE(review): with default arguments Haystack presumably also keeps document
# text/metadata in a local SQLite file, so re-running this script against an
# existing file may fail or duplicate documents — confirm before re-running.
document_store = FAISSDocumentStore(
    faiss_index_factory_str="Flat",
)
|
|
|
import pandas as pd |
|
|
|
def _row_to_document(row):
    """Convert one row of the articles CSV into a Haystack document dict.

    Args:
        row: A pandas Series for one CSV row. It must provide the columns
            ``article``, ``chapter_name``, ``article_page``,
            ``article_number`` and ``article_name``.

    Returns:
        A dict with the article text under ``"content"`` and the remaining
        columns under ``"meta"``.
    """
    meta = {
        "chapter_name": row["chapter_name"],
        "article_page": row["article_page"],
        "article_number": row["article_number"],
        "article_name": row["article_name"],
    }
    # The retriever in this script is created with embed_title=True, and
    # Haystack's DPR retriever reads the title from meta["name"]. Without this
    # key every passage would be embedded with an empty title. Missing titles
    # (NaN) are skipped so the tokenizer never receives a float.
    title = row["article_name"]
    if pd.notna(title):
        meta["name"] = str(title)
    return {"content": row["article"], "meta": meta}


df_document = pd.read_csv("data/articles.csv")

# One document dict per CSV row.
articles = [_row_to_document(row) for _, row in df_document.iterrows()]

document_store.write_documents(articles, index="document")
print(f"Loaded {document_store.get_document_count()} documents")
|
|
|
from haystack.nodes import DensePassageRetriever |
|
|
|
# The same checkpoint is loaded for both the query and the passage encoder.
# NOTE(review): the checkpoint name says "passage_encoder", yet it is also
# used to embed queries — confirm this is intentional and that no matching
# question-encoder checkpoint should be used instead.
dpr_checkpoint = "sadakmed/dpr-passage_encoder-spanish"

retriever = DensePassageRetriever(
    document_store=document_store,
    # Encoders
    query_embedding_model=dpr_checkpoint,
    passage_embedding_model=dpr_checkpoint,
    use_fast_tokenizers=True,
    # Sequence-length limits for queries and passages
    max_seq_len_query=64,
    max_seq_len_passage=384,
    embed_title=True,
    # Runtime settings: CPU only
    batch_size=16,
    use_gpu=False,
)

# Have the store compute embeddings for its documents with this retriever.
document_store.update_embeddings(retriever)
|
|
|
from haystack.nodes import FARMReader |
|
|
|
# Hugging Face checkpoint loaded by the reader.
model_ckpt = "mrm8488/distill-bert-base-spanish-wwm-cased-finetuned-spa-squad2-es"

# Reader settings, kept separate from the checkpoint so they are easy to tweak.
reader_options = {
    "max_seq_len": 384,
    "doc_stride": 128,
    "return_no_answer": True,
    "use_gpu": False,
    "progress_bar": False,
}

reader = FARMReader(model_name_or_path=model_ckpt, **reader_options)
|
|
|
from haystack.pipelines import ExtractiveQAPipeline |
|
|
|
# Chain the retriever and the reader into a single question-answering pipeline.
pipe = ExtractiveQAPipeline(reader=reader, retriever=retriever)

question = "pueblos originarios justicia"

# Per-node settings, keyed by node name: top_k of 10 for the retriever
# and top_k of 5 for the reader.
run_params = {
    "Retriever": {"top_k": 10},
    "Reader": {"top_k": 5},
}

prediction = pipe.run(query=question, params=run_params)
|
|
|
from pprint import pprint |
|
|
|
# Dump the raw pipeline output for inspection.
pprint(prediction)
|
|
|
|
|
from haystack.utils import print_answers |
|
|
|
|
|
# Print the answers using Haystack's helper at its "minimum" detail level.
print_answers(
    prediction,
    details="minimum",
)
|
|
|
|