# freud_rag / unpack_german_freud.py
# ruggsea's picture
# initial
# a37b18d
import os
import re
import shutil
import tempfile
import zipfile
def extract_epubs_from_zip(zip_path, extract_to):
    """Copy every ``.epub`` member of a ZIP archive flat into a directory.

    Each matching member is written to ``extract_to`` under its base name
    only, so any folder structure inside the archive is discarded. Members
    are streamed straight to their final location instead of being
    extracted under their archive path and moved afterwards; that avoids
    leaving empty intermediate folders in ``extract_to`` and avoids moving
    a top-level member onto itself.

    Args:
        zip_path: Path of the ZIP archive to read.
        extract_to: Directory that receives the EPUB files; created if it
            does not exist yet.

    Raises:
        zipfile.BadZipFile: If ``zip_path`` is not a valid ZIP archive.

    Note:
        Members that share a base name overwrite one another; the one
        appearing last in the archive wins.
    """
    os.makedirs(extract_to, exist_ok=True)
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        for file_info in zip_ref.infolist():
            if not file_info.filename.endswith('.epub'):
                continue
            # Flatten: keep only the file name, drop the in-archive folders.
            target_path = os.path.join(
                extract_to, os.path.basename(file_info.filename)
            )
            with zip_ref.open(file_info) as source, \
                    open(target_path, 'wb') as target:
                shutil.copyfileobj(source, target)
def unpack_zips_and_extract_epubs(root_dir, epubs_dir):
    """Collect the EPUB files packed inside every ZIP archive under a tree.

    Walks ``root_dir`` recursively. Each ``.zip`` file found is unpacked
    into a uniquely named scratch directory created next to the archive;
    every ``.epub`` in the unpacked tree is moved flat into ``epubs_dir``
    and every nested ``.zip`` is passed to ``extract_epubs_from_zip`` so
    that its EPUBs end up in ``epubs_dir`` too. Once all archives are
    processed, empty sub-directories of ``epubs_dir`` are removed.

    Args:
        root_dir: Directory tree to search for ``.zip`` files.
        epubs_dir: Directory that receives the EPUB files; created if it
            does not exist yet.

    Note:
        Files named ``*.zip`` that ``zipfile.is_zipfile`` rejects are
        skipped, the same way nested archives are treated. EPUBs with the
        same file name are moved onto the same destination path.
    """
    os.makedirs(epubs_dir, exist_ok=True)

    for root, _dirs, files in os.walk(root_dir):
        for file in files:
            if not file.endswith('.zip'):
                continue
            zip_path = os.path.join(root, file)
            if not zipfile.is_zipfile(zip_path):
                continue
            # A unique scratch directory (instead of a fixed "temp_extract")
            # cannot clash with an existing folder of the user's, and the
            # context manager removes it even when extraction fails.
            with tempfile.TemporaryDirectory(
                prefix='temp_extract_', dir=root
            ) as temp_extract_dir:
                with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                    zip_ref.extractall(temp_extract_dir)
                _collect_epubs_from_tree(temp_extract_dir, epubs_dir)

    # Remove any empty directories in the epubs directory. Bottom-up order
    # lets a parent be removed once its children are gone.
    for current, dir_names, _files in os.walk(epubs_dir, topdown=False):
        for dir_name in dir_names:
            dir_path = os.path.join(current, dir_name)
            if not os.listdir(dir_path):
                os.rmdir(dir_path)


def _collect_epubs_from_tree(tree_dir, epubs_dir):
    """Move EPUBs found under ``tree_dir`` (and in its ZIPs) to ``epubs_dir``."""
    for temp_root, _temp_dirs, temp_files in os.walk(tree_dir):
        for temp_file in temp_files:
            temp_path = os.path.join(temp_root, temp_file)
            if temp_file.endswith('.epub'):
                shutil.move(temp_path, os.path.join(epubs_dir, temp_file))
            elif temp_file.endswith('.zip') and zipfile.is_zipfile(temp_path):
                extract_epubs_from_zip(temp_path, epubs_dir)
if __name__ == "__main__":
    # Script entry point: search the current working directory for ZIP
    # files and gather their EPUBs in ./epubs. Edit root_directory to scan
    # a different folder.
    root_directory = '.'
    epubs_directory = './epubs'
    unpack_zips_and_extract_epubs(
        root_dir=root_directory,
        epubs_dir=epubs_directory,
    )