# Databricks notebook source
from typing import List, Optional

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document as LangchainDocument
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline
#from langchain import HuggingFacePipeline
#from langchain.chains import RetrievalQA

EMBEDDING_MODEL_NAME = "OrdalieTech/Solon-embeddings-large-0.1"
def split_documents(
    chunk_size: int,
    knowledge_base: List[LangchainDocument],
    tokenizer_name: Optional[str] = EMBEDDING_MODEL_NAME,
    separator: Optional[List[str]] = None,
) -> List[LangchainDocument]:
    """
    Split documents into chunks of at most `chunk_size` tokens, as counted by
    the embedding model's tokenizer, and return the list of unique chunks.
    """
    text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        AutoTokenizer.from_pretrained(tokenizer_name),
        chunk_size=chunk_size,
        chunk_overlap=int(chunk_size / 10),  # 10% overlap between consecutive chunks
        add_start_index=True,
        strip_whitespace=True,
        separators=separator,
    )

    docs_processed = []
    for doc in knowledge_base:
        docs_processed += text_splitter.split_documents([doc])

    # Remove duplicate chunks (identical page_content), keeping the first occurrence
    unique_texts = {}
    docs_processed_unique = []
    for doc in docs_processed:
        if doc.page_content not in unique_texts:
            unique_texts[doc.page_content] = True
            docs_processed_unique.append(doc)

    return docs_processed_unique
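
# COMMAND ----------

# Minimal usage sketch: load a PDF with PyPDFLoader, split it with
# split_documents, and index the chunks in FAISS using the Solon embedding
# model. The PDF path and chunk size below are illustrative assumptions,
# not values taken from this notebook.
loader = PyPDFLoader("/dbfs/tmp/example.pdf")  # hypothetical path
pages = loader.load()

chunks = split_documents(
    chunk_size=512,  # assumed value; tune to the embedding model's context size
    knowledge_base=pages,
)

embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
vector_store = FAISS.from_documents(chunks, embeddings)

# Retrieve the chunks most similar to a query (in French, matching the
# French-language Solon embedding model).
results = vector_store.similarity_search("Quelle est la procédure ?", k=4)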