Spaces:
Sleeping
Sleeping
""" | |
This script cleans the OCR files, so that we have uniform documents with the same pre-processing applied to each of | |
them. For every book, a new document is created so that the original file is always available for cross-checking etc. | |
Code adapted from Travelogues project, by Jan Rörden. Source: https://github.com/travelogues/scripts/blob/master/groundtruth/ | |
""" | |
import os | |
import re | |
import string | |
import unicodedata | |
from tqdm import tqdm | |
# directories | |
books_original_dir = 'source/path/' | |
output_dir = 'output/path/' | |
# Ensure the cleaned directory exists | |
os.makedirs(output_dir, exist_ok=True) | |
# Function to remove accents and umlauts | |
def remove_accents(input_str): | |
# Normalize to decompose accents | |
nfkd_form = unicodedata.normalize('NFKD', input_str) | |
# Filter out diacritical marks | |
return ''.join([c for c in nfkd_form if not unicodedata.combining(c)]) | |
for fname in tqdm(sorted(os.listdir(books_original_dir))): | |
# Save the current id for file naming later | |
current_book_id = fname[:-4] | |
# Process only .txt files | |
if fname.endswith('.txt'): | |
with open(os.path.join(books_original_dir, fname), 'r', encoding='utf-8') as f: | |
cleaned_lines = [] | |
page_lines = [] | |
for line in f: | |
# Replace long s and ß with normal s | |
clean_line = re.sub(r'[ſß]', 's', line) | |
# Remove accents and umlauts | |
clean_line = remove_accents(clean_line) | |
# Remove all non-word characters except whitespace and punctuation | |
clean_line = re.sub(r'[^a-zA-Z0-9\s' + re.escape(string.punctuation) + ']', '', clean_line) | |
# Convert to lowercase | |
#clean_line = clean_line.lower() | |
# Strip trailing spaces but keep line breaks | |
clean_line = clean_line.rstrip() | |
# Exclude lines based on criteria | |
if len(clean_line) < 3 or clean_line.isdigit() or not re.search(r'[a-zA-Z]', clean_line): | |
continue # Skip the line | |
# Check for a new page indicated by a blank line | |
if clean_line == "": | |
# Handle empty pages | |
if not page_lines or page_lines[0].startswith('statuscode') or page_lines[0].startswith('<html>'): | |
cleaned_lines.append("<empty page>") | |
else: | |
cleaned_lines.extend(page_lines) | |
page_lines = [] | |
else: | |
page_lines.append(clean_line) | |
# Handle the last page if the file ends without a blank line | |
if not page_lines or page_lines[0].startswith('statuscode') or page_lines[0].startswith('<html>'): | |
cleaned_lines.append("<empty page>") | |
else: | |
cleaned_lines.extend(page_lines) | |
# Save the cleaned text to a new file, retaining line breaks | |
cleaned_file_path = os.path.join(output_dir, f"{current_book_id}_cleaned.txt") | |
with open(cleaned_file_path, 'w', encoding='utf-8') as cleaned_file: | |
cleaned_file.write('\n'.join(cleaned_lines)) # Write lines with original line breaks |