File size: 4,353 Bytes
31cbd5c
70c2a60
 
 
 
 
31cbd5c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70c2a60
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31cbd5c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import os
from typing import List, Union
from pdfminer.high_level import extract_text
import io
from chainlit.types import AskFileResponse
import re


class TextFileLoader:
    """Gather the contents of ``.txt`` files from a single file or a directory tree.

    Every text that is read is appended to ``self.documents``; nothing ever
    clears that list, so loading twice stores each document twice.
    """

    def __init__(self, path: str, encoding: str = "utf-8"):
        """Remember where to read from and which text encoding to use.

        Args:
            path: A ``.txt`` file or a directory that is searched recursively.
            encoding: Encoding handed to ``open`` for every file that is read.
        """
        self.path = path
        self.encoding = encoding
        self.documents = []

    def load(self):
        """Read ``self.path`` into ``self.documents``.

        A directory is walked recursively; otherwise the path must be an
        existing file whose name ends in ``.txt``.

        Raises:
            ValueError: If the path is neither a directory nor a ``.txt`` file.
        """
        if os.path.isdir(self.path):
            self.load_directory()
            return

        if os.path.isfile(self.path) and self.path.endswith(".txt"):
            self.load_file()
            return

        raise ValueError(
            "Provided path is neither a valid directory nor a .txt file."
        )

    def load_file(self):
        """Append the full contents of the file at ``self.path``."""
        with open(self.path, "r", encoding=self.encoding) as handle:
            contents = handle.read()
        self.documents.append(contents)

    def load_directory(self):
        """Append the contents of every ``.txt`` file found below ``self.path``."""
        for folder, _, names in os.walk(self.path):
            for name in names:
                if not name.endswith(".txt"):
                    continue
                target = os.path.join(folder, name)
                with open(target, "r", encoding=self.encoding) as handle:
                    self.documents.append(handle.read())

    def load_documents(self):
        """Run ``load`` and return the accumulated ``self.documents`` list."""
        self.load()
        return self.documents
    
class PDFFileLoader(TextFileLoader):
    """Extract text from PDF sources and collect it in ``self.documents``.

    Three kinds of source are supported, checked in this order by ``load``:
    a list of uploaded file objects (``files``), a path on disk (directory or
    single ``.pdf`` file), or raw PDF bytes already held in memory
    (``content``).

    NOTE: only text extracted from in-memory content is passed through
    ``clean_text``; text read from a path is stored exactly as pdfminer
    returns it.
    """

    def __init__(self, path: str, encoding: str = "utf-8", content=None, files: list[AskFileResponse] = None):
        """Store the source description; nothing is read until ``load``.

        Args:
            path: Directory or ``.pdf`` file path. When ``content`` is used,
                the path is only checked for a ``.pdf`` suffix.
            encoding: Passed to pdfminer as its ``codec`` argument.
            content: In-memory PDF data; it is wrapped in ``io.BytesIO``, so
                it must be bytes-like.
            files: Uploaded file objects; each one is read through its
                ``content`` and ``path`` attributes.
        """
        super().__init__(path, encoding)
        self.content = content
        self.files = files

    def load(self):
        """Extract text from the configured source into ``self.documents``.

        Raises:
            ValueError: If ``files`` is not a list and the path/content
                combination matches none of the supported sources.
        """
        if isinstance(self.files, list):
            for file in self.files:
                # Entries without content or without a .pdf name are skipped
                # silently; a list in which nothing matches loads nothing.
                if file.content and file.path.endswith(".pdf"):
                    # load_content reads from self.content, so each upload is
                    # staged there (the attribute keeps the last one loaded).
                    self.content = file.content
                    self.load_content()
        elif os.path.isdir(self.path):
            self.load_directory()
        elif os.path.isfile(self.path) and self.path.endswith(".pdf"):
            print("loading file ...")
            self.load_file()
        elif self.content and self.path.endswith(".pdf"):
            print("loading content ...")
            self.load_content()
        else:
            raise ValueError(
                "Provided path is neither a valid directory nor a .pdf file."
            )

    def load_content(self):
        """Extract, clean and store the text of the PDF held in ``self.content``."""
        text = extract_text(io.BytesIO(self.content))
        text = self.clean_text(text)
        self.documents.append(text)

    def clean_text(self, text):
        """Flatten extracted PDF text onto one line and drop page numbers.

        Args:
            text: Text as returned by pdfminer, with form feeds (``\\x0c``)
                between pages.

        Returns:
            The text with newlines and form feeds replaced by spaces, runs of
            spaces collapsed, and any digit run directly preceding a form
            feed removed.
        """
        # Newlines go first so that a page number followed by line breaks and
        # a form feed is normalised to "<digits> \x0c", which is the shape
        # the page-number pattern below looks for.
        text = text.replace('\n', ' ')
        text = re.sub(' +', ' ', text)
        # A digit run right before a form feed is treated as a page number.
        # NOTE(review): this also strips genuine trailing numbers at the end
        # of a page.
        text = re.sub(r'\d+ \x0c', '\x0c', text)
        # Finally remove the page separators themselves.
        text = text.replace('\x0c', ' ')
        return text

    def load_file(self):
        """Extract and store the text of the single PDF at ``self.path``."""
        text = extract_text(pdf_file=self.path, codec=self.encoding)
        self.documents.append(text)

    def load_directory(self):
        """Extract and store the text of every ``.pdf`` file below ``self.path``."""
        for root, _, files in os.walk(self.path):
            for file in files:
                if file.endswith(".pdf"):
                    # pdfminer's extract_text takes ``codec``; the previous
                    # ``encoding=`` keyword raised TypeError on the first PDF.
                    self.documents.append(
                        extract_text(os.path.join(root, file), codec=self.encoding)
                    )



class CharacterTextSplitter:
    """Cut text into fixed-width character windows that overlap their neighbours."""

    def __init__(
        self,
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
    ):
        """Configure the window width and the overlap between windows.

        Args:
            chunk_size: Maximum number of characters per chunk.
            chunk_overlap: Number of characters shared by consecutive chunks;
                must be smaller than ``chunk_size``.
        """
        assert chunk_size > chunk_overlap, "Chunk size must be greater than chunk overlap"

        self.chunk_overlap = chunk_overlap
        self.chunk_size = chunk_size

    def split(self, text: str) -> List[str]:
        """Return the overlapping chunks of ``text``.

        Windows start every ``chunk_size - chunk_overlap`` characters, so the
        trailing chunks may be shorter than ``chunk_size``. An empty string
        yields an empty list.
        """
        stride = self.chunk_size - self.chunk_overlap
        width = self.chunk_size
        return [text[start : start + width] for start in range(0, len(text), stride)]

    def split_texts(self, texts: List[str]) -> List[str]:
        """Split every text in ``texts`` and return all chunks as one flat list."""
        return [piece for text in texts for piece in self.split(text)]


if __name__ == "__main__":
    # Manual smoke test: load the sample play, chunk it with the default
    # splitter settings, then print the chunk count followed by the first
    # two and last two chunks separated by dashed lines.
    loader = TextFileLoader("data/KingLear.txt")
    loader.load()
    splitter = CharacterTextSplitter()
    chunks = splitter.split_texts(loader.documents)
    print(len(chunks))
    for position, index in enumerate((0, 1, -2, -1)):
        if position:
            print("--------")
        print(chunks[index])