Spaces:
Runtime error
Runtime error
File size: 1,841 Bytes
44f0ae2 1de5e6e 44f0ae2 330533c 748259b 44f0ae2 3469cf9 44f0ae2 8ee40d6 44f0ae2 330533c f954296 330533c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 |
import glob
import os
from langchain_text_splitters import MarkdownHeaderTextSplitter
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter
from transformers import AutoTokenizer
# Directory scanned for input documents. The trailing slash is required:
# the glob patterns below are built by plain string concatenation
# (path_to_data + "*.md" / "*.pdf"), not with os.path.join.
path_to_data = "./data/"
def process_markdown():
    """Split every Markdown file found in ``path_to_data`` on its headers.

    Each ``*.md`` file directly inside ``path_to_data`` is read as UTF-8 and
    passed to ``MarkdownHeaderTextSplitter.split_text`` configured for header
    levels ``#`` through ``#####``. Progress is reported with ``print``: the
    list of matched files, the number of files that were split and, when
    there is at least one, the splitter output for the first file.

    Files that cannot be read or decoded are reported and skipped.
    Nothing is returned.
    """
    headers_to_split_on = [
        ("#", "Header 1"),
        ("##", "Header 2"),
        ("###", "Header 3"),
        ("####", "Header 4"),
        ("#####", "Header 5"),
    ]
    markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

    files = glob.glob(path_to_data + "*.md")
    print(files)

    docs = []
    for file in files:
        try:
            # Explicit encoding so the result does not depend on the
            # platform's default locale.
            with open(file, encoding="utf-8") as f:
                docs.append(f.read())
        except (OSError, UnicodeError) as e:
            # Best effort: report the unreadable file and keep going.
            print("Exception: ", e)

    docs_processed = [markdown_splitter.split_text(doc) for doc in docs]
    print(len(docs_processed))
    # Guard the preview: with no readable Markdown files the list is empty
    # and indexing it would raise IndexError.
    if docs_processed:
        print(docs_processed[0])
def process_pdf():
    """Load every PDF found in ``path_to_data`` and split it into chunks.

    Each ``*.pdf`` file directly inside ``path_to_data`` is loaded with
    ``PyMuPDFLoader``; the resulting documents are split with a
    ``RecursiveCharacterTextSplitter`` built from the
    ``BAAI/bge-small-en-v1.5`` Hugging Face tokenizer (``chunk_size`` 256,
    overlap of one tenth of that). The chunks of all files are flattened into
    a single list. The number of chunks and, when there is at least one, the
    first chunk are printed.

    Files that fail to load are reported and skipped. Nothing is returned.
    """
    # The module-level imports do not bring PyMuPDFLoader into scope. Without
    # this import every load attempt raised NameError, which the broad
    # ``except`` below swallowed, leaving ``docs`` empty.
    from langchain_community.document_loaders import PyMuPDFLoader

    files = glob.glob(path_to_data + "*.pdf")

    docs = []
    for file in files:
        try:
            docs.append(PyMuPDFLoader(file).load())
        except Exception as e:
            # Best effort: a PDF that cannot be parsed must not abort the
            # whole run; report it and continue with the next file.
            print("Exception: ", e)

    chunk_size = 256
    text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        AutoTokenizer.from_pretrained("BAAI/bge-small-en-v1.5"),
        chunk_size=chunk_size,
        chunk_overlap=int(chunk_size / 10),
        add_start_index=True,
        strip_whitespace=True,
        separators=["\n\n", "\n", ".", " ", ""],
    )

    # Split each file's documents and flatten the per-file lists into one.
    docs_processed = [
        chunk
        for doc in docs
        for chunk in text_splitter.split_documents(doc)
    ]
    print(len(docs_processed))
    # Guard the preview: with no loadable PDFs the list is empty and
    # indexing it would raise IndexError.
    if docs_processed:
        print(docs_processed[0])
|