Spaces:
Sleeping
Sleeping
File size: 4,353 Bytes
31cbd5c 70c2a60 31cbd5c 70c2a60 31cbd5c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 |
import os
from typing import List, Union
from pdfminer.high_level import extract_text
import io
from chainlit.types import AskFileResponse
import re
class TextFileLoader:
    """Load plain-text documents from a single ``.txt`` file or a directory tree.

    The contents of every file read are accumulated in ``self.documents``
    (one string per file). Each call to :meth:`load` appends to that list,
    so calling it again on the same instance adds the same files again.
    """

    def __init__(self, path: str, encoding: str = "utf-8"):
        """Store the source path and the encoding used to read files.

        Args:
            path: A ``.txt`` file, or a directory that is searched recursively.
            encoding: Encoding passed to ``open`` for every file that is read.
        """
        self.documents = []
        self.path = path
        self.encoding = encoding

    @staticmethod
    def _is_text_file(name: str) -> bool:
        """Return True when *name* ends in ``.txt``, ignoring letter case."""
        return name.lower().endswith(".txt")

    def load(self):
        """Read ``self.path`` into ``self.documents``.

        Raises:
            ValueError: If the path is neither a directory nor an existing
                ``.txt`` file.
        """
        if os.path.isdir(self.path):
            self.load_directory()
        elif os.path.isfile(self.path) and self._is_text_file(self.path):
            self.load_file()
        else:
            raise ValueError(
                "Provided path is neither a valid directory nor a .txt file."
            )

    def load_file(self):
        """Append the full contents of the file at ``self.path``."""
        with open(self.path, "r", encoding=self.encoding) as f:
            self.documents.append(f.read())

    def load_directory(self):
        """Append the contents of every ``.txt`` file found under ``self.path``.

        Directories and file names are visited in sorted order so the
        resulting document order does not depend on the order in which the
        filesystem happens to list entries.
        """
        for root, dirs, files in os.walk(self.path):
            # Sorting in place steers the order in which os.walk descends.
            dirs.sort()
            for name in sorted(files):
                if not self._is_text_file(name):
                    continue
                with open(
                    os.path.join(root, name), "r", encoding=self.encoding
                ) as f:
                    self.documents.append(f.read())

    def load_documents(self):
        """Run :meth:`load` and return the accumulated list of documents."""
        self.load()
        return self.documents
class PDFFileLoader(TextFileLoader):
    """Extract text from PDFs given as uploads, raw bytes, a file, or a directory.

    Extracted text is appended to ``self.documents`` (one string per PDF).
    Only in-memory content goes through :meth:`clean_text`; text read from a
    file path or a directory is stored exactly as the extractor returns it.
    """

    def __init__(self, path: str, encoding: str = "utf-8", content=None, files: list[AskFileResponse] = None):
        """Store the possible PDF sources.

        Args:
            path: Path to a ``.pdf`` file or to a directory; also used for the
                ``.pdf`` suffix check when ``content`` is supplied.
            encoding: Passed to the extractor as its ``codec`` argument when
                reading from disk.
            content: Raw bytes of a PDF that is already in memory.
            files: Uploaded files; each is expected to expose ``content`` and
                ``path`` attributes (those are the only ones read here).
        """
        super().__init__(path, encoding)
        self.content = content
        self.files = files

    def load(self):
        """Load from the first applicable source.

        Sources are tried in this order: the ``files`` list, a directory at
        ``self.path``, a ``.pdf`` file at ``self.path``, then ``self.content``.
        When ``files`` is a list, no other source is consulted, and entries
        without content or without a ``.pdf`` path are skipped silently.

        Raises:
            ValueError: If none of the sources applies.
        """
        if isinstance(self.files, list):
            for file in self.files:
                if file.content and file.path.endswith(".pdf"):
                    # load_content reads self.content, so stage each upload there.
                    self.content = file.content
                    self.load_content()
        elif os.path.isdir(self.path):
            self.load_directory()
        elif os.path.isfile(self.path) and self.path.endswith(".pdf"):
            print("loading file ...")
            self.load_file()
        elif self.content and self.path.endswith(".pdf"):
            print("loading content ...")
            self.load_content()
        else:
            raise ValueError(
                "Provided path is neither a valid directory nor a .pdf file."
            )

    def load_content(self):
        """Extract, clean, and store the text of the PDF held in ``self.content``."""
        text = extract_text(io.BytesIO(self.content))
        text = self.clean_text(text)
        self.documents.append(text)

    def clean_text(self, text):
        """Flatten extracted text onto one line and drop page-break artifacts.

        Args:
            text: Text as returned by the extractor.

        Returns:
            The text with newlines turned into spaces, runs of spaces
            collapsed, digits that directly precede a form feed removed, and
            every form feed replaced by a space.
        """
        # Newlines become spaces, then repeated spaces collapse to one.
        text = text.replace('\n', ' ')
        text = re.sub(' +', ' ', text)
        # A number followed by a space and a form feed is treated as a page
        # number and dropped; the form feed itself is handled on the next line.
        text = re.sub(r'\d+ \x0c', '\x0c', text)
        text = text.replace('\x0c', ' ')
        return text

    def load_file(self):
        """Extract and store the text of the PDF at ``self.path`` (not cleaned)."""
        text = extract_text(pdf_file=self.path, codec=self.encoding)
        self.documents.append(text)

    def load_directory(self):
        """Extract and store the text of every ``.pdf`` under ``self.path`` (not cleaned)."""
        for root, _, files in os.walk(self.path):
            for file in files:
                if file.endswith(".pdf"):
                    # The extractor's keyword is ``codec``; passing
                    # ``encoding=`` raised TypeError on the first PDF found.
                    self.documents.append(
                        extract_text(os.path.join(root, file), codec=self.encoding)
                    )
class CharacterTextSplitter:
    """Cut text into fixed-width character windows that overlap their neighbours."""

    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
        """Configure the window width and the overlap between windows.

        Args:
            chunk_size: Maximum number of characters per chunk.
            chunk_overlap: Number of characters each chunk shares with the
                next one; must be smaller than ``chunk_size``.
        """
        assert (
            chunk_size > chunk_overlap
        ), "Chunk size must be greater than chunk overlap"
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def split(self, text: str) -> List[str]:
        """Return the overlapping chunks of *text*; the last one may be shorter."""
        # Consecutive windows start this many characters apart.
        stride = self.chunk_size - self.chunk_overlap
        window = self.chunk_size
        return [
            text[start : start + window]
            for start in range(0, len(text), stride)
        ]

    def split_texts(self, texts: List[str]) -> List[str]:
        """Split every text in *texts* and return all chunks as one flat list."""
        return [piece for text in texts for piece in self.split(text)]
if __name__ == "__main__":
    # Demo: load one text file, chunk it with the default splitter settings,
    # then show the chunk count and the first two and last two chunks.
    loader = TextFileLoader("data/KingLear.txt")
    loader.load()
    chunks = CharacterTextSplitter().split_texts(loader.documents)
    print(len(chunks))
    print(chunks[0])
    for position in (1, -2, -1):
        print("--------")
        print(chunks[position])
|