File size: 1,020 Bytes
f493920
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import textract
from datasets import Dataset as hfd
from sentence_transformers import SentenceTransformer

from config import FEATURE_EXTRACTOR_CHECKPOINT

# Sentence-embedding model used by encode_sentence. It is instantiated once,
# when this module is imported, from the checkpoint named in config.
FEATURE_EXTRACTOR = SentenceTransformer(FEATURE_EXTRACTOR_CHECKPOINT)


def encode_sentence(instance: hfd, text_col: str):
    """Embed the text stored under ``text_col`` of ``instance``.

    The value ``instance[text_col]`` is passed to ``FEATURE_EXTRACTOR.encode``
    with ``normalize_embeddings=True``.

    Returns:
        A one-key dict, ``{"embedding": <encoder output>}``.
    """
    text = instance[text_col]
    vector = FEATURE_EXTRACTOR.encode(text, normalize_embeddings=True)
    return {"embedding": vector}


def parse_pdf(pdf_path: str) -> str:
    """Extract the text of a PDF file using textract's pdfminer method.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The extracted text as a ``str``.
    """
    requested_encoding = "latin-1"
    raw = textract.process(
        pdf_path, method="pdfminer", encoding=requested_encoding
    )
    try:
        # Previous behaviour: the returned bytes were always decoded as UTF-8.
        # Keep that as the first attempt so existing results do not change.
        return raw.decode()
    except UnicodeDecodeError:
        # The bytes were requested as latin-1; when that request is honoured,
        # accented characters are not valid UTF-8. Decode with the codec that
        # was asked for instead of failing.
        return raw.decode(requested_encoding)


def chunk_text(text: str, split_sentence="ARTÍCULO"):
    """Split ``text`` into chunks, one per occurrence of ``split_sentence``.

    Newlines inside a chunk are replaced by spaces and surrounding whitespace
    is stripped. Each piece that followed a separator is re-prefixed with
    ``split_sentence`` and a space. Text found before the first separator is
    kept as its own chunk *without* the prefix, and pieces that are empty
    after stripping are dropped, so no ``"ARTÍCULO "``-only chunk is emitted.

    Args:
        text: The full text to split.
        split_sentence: Marker that starts every chunk.

    Returns:
        A list of ``{"chunk": <str>}`` dicts in document order.

    Raises:
        ValueError: If ``split_sentence`` is empty.
    """
    if not split_sentence:
        raise ValueError("split_sentence must be a non-empty string")

    # The first piece is whatever precedes the first separator (possibly ""),
    # so it must not be labelled with the separator.
    preamble, *sections = text.split(split_sentence)

    chunks = []
    preamble = preamble.replace("\n", " ").strip()
    if preamble:
        chunks.append({"chunk": preamble})

    for section in sections:
        body = section.replace("\n", " ").strip()
        if body:
            chunks.append({"chunk": split_sentence + " " + body})
    return chunks


def create_df(text_chunks: list[dict[str, str]]):
    """Create a HuggingFace dataset from a list of ``{str: str}`` dicts.

    Args:
        text_chunks: One dict per row, e.g. the ``{"chunk": ...}`` dicts
            built by ``chunk_text``.

    Returns:
        The result of ``Dataset.from_list(text_chunks)``.
    """
    return hfd.from_list(text_chunks)