# Databricks notebook source
from typing import List, Optional

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.docstore.document import Document as LangchainDocument
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline
from sentence_transformers import SentenceTransformer
# from langchain import HuggingFacePipeline
# from langchain.chains import RetrievalQA

# EMBEDDING_MODEL_NAME = "OrdalieTech/Solon-embeddings-large-0.1"


def split_documents(
    chunk_size: int,
    knowledge_base: List[LangchainDocument],
    # tokenizer_name: Optional[str] = EMBEDDING_MODEL_NAME,
    separator: Optional[List[str]] = None,
) -> List[LangchainDocument]:
    """
    Split documents into chunks of at most `chunk_size` characters
    and return the resulting list of documents with duplicates removed.
    """
    # Token-based splitting (tied to EMBEDDING_MODEL_NAME), kept for reference:
    # text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
    #     AutoTokenizer.from_pretrained(tokenizer_name),
    #     chunk_size=chunk_size,
    #     chunk_overlap=int(chunk_size / 10),
    #     add_start_index=True,
    #     strip_whitespace=True,
    #     separators=separator,
    # )

    # Character-based splitting with a 10% overlap between consecutive chunks.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=int(chunk_size / 10),
        strip_whitespace=True,
        separators=separator,
    )

    docs_processed = text_splitter.split_documents(knowledge_base)

    # Remove chunks with duplicate content while preserving order.
    unique_texts = {}
    docs_processed_unique = []
    for doc in docs_processed:
        if doc.page_content not in unique_texts:
            unique_texts[doc.page_content] = True
            docs_processed_unique.append(doc)

    return docs_processed_unique
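
# COMMAND ----------

# Example usage (a minimal sketch): load a PDF with PyPDFLoader and split it into
# ~1000-character chunks with split_documents. The DBFS path and the separator list
# below are illustrative assumptions, not values taken from this notebook.
loader = PyPDFLoader("/dbfs/tmp/example.pdf")  # hypothetical path
pages = loader.load()
chunks = split_documents(
    chunk_size=1000,
    knowledge_base=pages,
    separator=["\n\n", "\n", ".", " ", ""],  # illustrative separator hierarchy
)
print(f"{len(pages)} pages -> {len(chunks)} unique chunks")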