File size: 1,841 Bytes
44f0ae2
1de5e6e
44f0ae2
 
330533c
 
748259b
 
44f0ae2
 
 
 
 
 
 
 
 
 
 
3469cf9
44f0ae2
 
 
8ee40d6
 
44f0ae2
 
 
 
330533c
 
 
f954296
330533c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
import glob
import os

from langchain_text_splitters import MarkdownHeaderTextSplitter
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter
from transformers import AutoTokenizer

# Directory scanned for input documents. The trailing slash is required:
# glob patterns are built from this value by plain string concatenation.
path_to_data = "./data/"

def process_markdown():
    """Split every Markdown file under ``path_to_data`` on its headers.

    Each ``*.md`` file directly inside ``path_to_data`` is read as UTF-8 and
    handed to a ``MarkdownHeaderTextSplitter`` configured for header levels
    1 through 5. Files that cannot be opened or decoded are reported on
    stdout and skipped, so one bad file does not abort the run.

    The matched paths, the number of files that were split and the splits
    of the first file (when there is one) are printed as a sanity check.

    Returns:
        A list with one entry per successfully read file; each entry is
        whatever ``split_text`` returned for that file's contents. The list
        is empty when no file was found or readable.
    """
    headers_to_split_on = [
        ("#", "Header 1"),
        ("##", "Header 2"),
        ("###", "Header 3"),
        ("####", "Header 4"),
        ("#####", "Header 5"),
    ]
    markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

    files = glob.glob(path_to_data + "*.md")
    print(files)

    docs = []
    for file in files:
        try:
            # Explicit encoding: the platform default is not UTF-8 everywhere.
            with open(file, encoding="utf-8") as f:
                docs.append(f.read())
        except (OSError, UnicodeDecodeError) as e:
            # Best effort: report the unreadable file and keep going.
            print("Exception: ", e)

    docs_processed = [markdown_splitter.split_text(doc) for doc in docs]
    print(len(docs_processed))
    # Guard the preview: indexing an empty list would raise IndexError when
    # the data directory holds no readable Markdown file.
    if docs_processed:
        print(docs_processed[0])
    return docs_processed

def process_pdf():
    """Load every PDF under ``path_to_data`` and split it into token-sized chunks.

    Each ``*.pdf`` file directly inside ``path_to_data`` is loaded with
    ``PyMuPDFLoader``. The loaded documents are then split with a
    ``RecursiveCharacterTextSplitter`` built from the
    ``BAAI/bge-small-en-v1.5`` tokenizer, using a chunk size of 256 and an
    overlap of one tenth of that. Files that fail to load are reported on
    stdout and skipped.

    The total number of chunks and the first chunk (when there is one) are
    printed as a sanity check.

    Returns:
        A flat list of the chunks produced by ``split_documents`` across all
        successfully loaded files. The list is empty when nothing could be
        loaded.
    """
    files = glob.glob(path_to_data + "*.pdf")

    docs = []
    for file in files:
        try:
            docs.append(PyMuPDFLoader(file).load())
        except Exception as e:
            # Deliberately broad: loader failures vary (missing backend,
            # corrupt file, ...) and one bad PDF should not abort the run.
            print("Exception: ", e)

    docs_processed = []
    # Only fetch the tokenizer when there is something to split.
    if docs:
        chunk_size = 256
        text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
            AutoTokenizer.from_pretrained("BAAI/bge-small-en-v1.5"),
            chunk_size=chunk_size,
            chunk_overlap=int(chunk_size / 10),
            add_start_index=True,
            strip_whitespace=True,
            separators=["\n\n", "\n", ".", " ", ""],
        )
        # Split file by file and flatten into a single list of chunks.
        docs_processed = [
            chunk
            for doc in docs
            for chunk in text_splitter.split_documents(doc)
        ]

    print(len(docs_processed))
    # Guard the preview: indexing an empty list would raise IndexError when
    # no PDF was found or loadable.
    if docs_processed:
        print(docs_processed[0])
    return docs_processed