Spaces:
Build error
Build error
File size: 1,796 Bytes
42fa84c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 |
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
import re
def clean_up_text(content: str) -> str:
    """
    Strip extraction noise from a text string and normalize its spacing.

    The substitutions run in a fixed order:

    1. words split as ``word-<newline>word`` are rejoined without the hyphen;
    2. each noise pattern is deleted, one pattern after another;
    3. whitespace around a hyphen between two word characters is dropped;
    4. every run of whitespace collapses to a single space.

    NOTE: the first noise pattern is the regex ``\\n``, which matches a real
    newline character, so remaining line breaks are deleted rather than
    replaced by a space (``"a\\nb"`` becomes ``"ab"``).

    :param content: Text input.
    :return: Cleaned version of original text input.
    """
    # Rejoin words that were hyphenated across a line break.
    text = re.compile(r"(\w+)-\n(\w+)").sub(r"\1\2", content)

    # Regex patterns to delete. Order matters: each substitution runs on the
    # output of the previous one.
    noise_patterns = (
        "\\n",
        " β",
        "ββββββββββ",
        "βββββββββ",
        "βββββ",
        r"\\u[\dA-Fa-f]{4}",
        r"\uf075",
        r"\uf0b7",
    )
    for noise in noise_patterns:
        text = re.sub(noise, "", text)

    # Tighten "word - word" into "word-word".
    text = re.sub(r"(\w)\s*-\s*(\w)", r"\1-\2", text)

    # Collapse any whitespace run (spaces, tabs, ...) into one space.
    return re.sub(r"\s+", " ", text)
def get_cleaned_dir_docs(pdf_file_dir):
    """
    Load documents from a directory and clean the text of each one.

    The directory path is echoed to stdout, the documents are loaded with
    ``SimpleDirectoryReader(pdf_file_dir).load_data()``, and each document's
    ``text`` attribute is overwritten in place with the output of
    ``clean_up_text``.

    :param pdf_file_dir: Directory path handed to ``SimpleDirectoryReader``.
    :return: List of the loaded documents, in load order, with cleaned text.
    """
    print(pdf_file_dir)
    reader = SimpleDirectoryReader(pdf_file_dir)

    result = []
    for doc in reader.load_data():
        # Documents are mutated in place; the same objects are returned.
        doc.text = clean_up_text(doc.text)
        result.append(doc)
    return result
def get_cleaned_input_docs(pdf_file):
    """
    Load the documents of a single input file and clean their text.

    The file is loaded with
    ``SimpleDirectoryReader(input_files=[pdf_file]).load_data()``, and each
    resulting document's ``text`` attribute is overwritten in place with the
    output of ``clean_up_text``.

    :param pdf_file: Path of the file handed to ``SimpleDirectoryReader``.
    :return: List of the loaded documents, in load order, with cleaned text.
    """
    reader = SimpleDirectoryReader(input_files=[pdf_file])

    result = []
    for doc in reader.load_data():
        # Documents are mutated in place; the same objects are returned.
        doc.text = clean_up_text(doc.text)
        result.append(doc)
    return result
if __name__ == "__main__":
    # Manual smoke run: clean every document in a local data directory and
    # dump the result. To clean a single file instead, call
    # get_cleaned_input_docs with the path of that file.
    #
    # The Windows path is a raw string on purpose: in a normal literal the
    # sequences \p, \A and \D are invalid escapes (SyntaxWarning today, an
    # error in a future Python), and a backslash followed by digits would be
    # read as an octal escape. The raw string has the same runtime value.
    docs = get_cleaned_dir_docs(r"E:\projects\AI research assistant\Data")
    print(docs)
|