# Databricks notebook source
from typing import List, Optional

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.docstore.document import Document as LangchainDocument
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline
#from langchain import HuggingFacePipeline
#from langchain.chains import RetrievalQA

# Embedding model; its tokenizer also defines the chunk-size measurement.
EMBEDDING_MODEL_NAME = "OrdalieTech/Solon-embeddings-large-0.1"

def split_documents(
    chunk_size: int,
    knowledge_base: List[LangchainDocument],
    tokenizer_name: Optional[str] = EMBEDDING_MODEL_NAME,
    separator: Optional[List[str]] = None,
) -> List[LangchainDocument]:
    """
    Split documents into chunks of at most `chunk_size` tokens, drop exact
    duplicates, and return the resulting list of documents.
    """
    # Token-aware splitter: sizes are measured with the embedding model's
    # tokenizer so chunks fit its context window; consecutive chunks overlap
    # by 10% of the chunk size to avoid cutting ideas at a boundary.
    text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        AutoTokenizer.from_pretrained(tokenizer_name),
        chunk_size=chunk_size,
        chunk_overlap=int(chunk_size / 10),
        add_start_index=True,
        strip_whitespace=True,
        separators=separator,
    )
    docs_processed = []
    for doc in knowledge_base:
        docs_processed += text_splitter.split_documents([doc])

    # Keep only the first occurrence of each chunk text, preserving order.
    unique_texts = {}
    docs_processed_unique = []
    for doc in docs_processed:
        if doc.page_content not in unique_texts:
            unique_texts[doc.page_content] = True
            docs_processed_unique.append(doc)
    return docs_processed_unique
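

# COMMAND ----------

# Minimal usage sketch, not part of the original pipeline: load a PDF,
# split it into token-sized chunks with `split_documents`, and index the
# chunks in a FAISS store using the Solon embeddings. The PDF path and
# chunk_size=512 are hypothetical; adapt them to your corpus.
if __name__ == "__main__":
    knowledge_base = PyPDFLoader("data/example.pdf").load()  # hypothetical path
    chunks = split_documents(chunk_size=512, knowledge_base=knowledge_base)
    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
    vector_store = FAISS.from_documents(chunks, embeddings)
    print(f"Indexed {len(chunks)} unique chunks.")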