ReformaPensional-LLama3-RAG / preprocessing.py
jhonparra18's picture
updated file structure
f493920
raw
history blame
1.02 kB
import textract
from datasets import Dataset as hfd
from sentence_transformers import SentenceTransformer
from config import FEATURE_EXTRACTOR_CHECKPOINT
# Sentence-embedding model, instantiated once at import time from the
# checkpoint named by FEATURE_EXTRACTOR_CHECKPOINT in config; used by
# encode_sentence below.
FEATURE_EXTRACTOR = SentenceTransformer(FEATURE_EXTRACTOR_CHECKPOINT)
def encode_sentence(instance: hfd, text_col: str):
    """Embed the text stored under ``text_col`` of ``instance``.

    The value at ``instance[text_col]`` is handed to the module-level
    ``FEATURE_EXTRACTOR`` with ``normalize_embeddings=True``.

    Args:
        instance: Object indexable by ``text_col`` (annotated as a
            HuggingFace dataset).
        text_col: Key of the field holding the text to embed.

    Returns:
        A dict with the single key ``"embedding"`` holding the encoder output.
    """
    text = instance[text_col]
    vector = FEATURE_EXTRACTOR.encode(text, normalize_embeddings=True)
    return {"embedding": vector}
def parse_pdf(pdf_path: str) -> str:
    """Extract the text of a PDF file using textract's pdfminer method.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The extracted text as a ``str``.
    """
    raw = textract.process(pdf_path, method="pdfminer", encoding="latin-1")
    try:
        # Default (UTF-8) decoding stays the first attempt so that any input
        # which already decoded cleanly yields exactly the same text as before.
        return raw.decode()
    except UnicodeDecodeError:
        # The bytes were requested as latin-1 above; accented characters in
        # that codec are not valid UTF-8, so decode with the codec that was
        # actually requested instead of failing. latin-1 maps every byte, so
        # this branch cannot raise.
        return raw.decode("latin-1")
def chunk_text(text: str, split_sentence: str = "ARTÍCULO") -> list[dict[str, str]]:
    """Split ``text`` into chunks at every occurrence of ``split_sentence``.

    Within each piece, newlines are replaced by spaces and surrounding
    whitespace is stripped. Each piece that followed an occurrence of
    ``split_sentence`` gets the marker re-attached as
    ``split_sentence + " " + piece``. The text before the first occurrence
    never followed a marker, so it is emitted without the prefix, and it is
    omitted entirely when blank (e.g. when ``text`` starts with the marker
    or is empty).

    Args:
        text: Full text to split.
        split_sentence: Marker that starts a new chunk.

    Returns:
        A list of ``{"chunk": <str>}`` dicts in document order.

    Raises:
        ValueError: If ``split_sentence`` is empty (``str.split`` rejects an
            empty separator).
    """
    leading, *marked_pieces = text.split(split_sentence)

    chunks = []
    # Text ahead of the first marker is not preceded by the marker, so it
    # must not be labelled with it; skip it when there is nothing there.
    preamble = leading.replace("\n", " ").strip()
    if preamble:
        chunks.append({"chunk": preamble})

    chunks.extend(
        {"chunk": split_sentence + " " + piece.replace("\n", " ").strip()}
        for piece in marked_pieces
    )
    return chunks
def create_df(text_chunks: list[dict[str, str]]):
    """Create a HuggingFace dataset from a list of ``str`` -> ``str`` dicts.

    Args:
        text_chunks: List of dicts, one per row, mapping column name to text.

    Returns:
        The dataset built by ``Dataset.from_list(text_chunks)``.
    """
    return hfd.from_list(text_chunks)