# genai-reward / data_prep.py
# TheFrenchDemos's picture
# Initial commit
# 9705a2a
import os
import tarfile
import shutil
# Define the base directory (update this to your folder location)
base_dir = "..."
# Define the output directory
output_dir = os.path.join(base_dir, "all_extracted")


def _extract_archive(tar_path, dest_dir):
    """Extract every member of the tar archive at tar_path into dest_dir."""
    with tarfile.open(tar_path) as tar:
        # Extraction filters only exist on Python versions that expose
        # tarfile.data_filter; the "data" filter refuses members that would
        # land outside dest_dir (absolute paths, "..", escaping links).
        if hasattr(tarfile, "data_filter"):
            tar.extractall(dest_dir, filter="data")
        else:
            tar.extractall(dest_dir)


def extract_all_tars(base_dir, output_dir):
    """Extract every ``.tar`` file found under base_dir into output_dir.

    Each archive is unpacked into a scratch directory
    (``<base_dir>/temp_extract``), then each of its top-level entries is
    moved to ``<output_dir>/<archive name without .tar>_<entry name>``.

    Args:
        base_dir: Directory tree that is searched recursively for ``.tar``
            files.
        output_dir: Directory receiving the renamed entries; created if it
            does not exist.

    Returns:
        The number of archives that were extracted.
    """
    # Create the output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    temp_dir = os.path.join(base_dir, "temp_extract")
    # The output and scratch directories normally live inside base_dir; they
    # must not be walked, otherwise archives that were already extracted (or
    # left behind by an interrupted run) would be processed again.
    skipped = {os.path.abspath(output_dir), os.path.abspath(temp_dir)}

    extracted_count = 0
    # Iterate over the folders and files in the base directory
    for root, dirs, files in os.walk(base_dir):
        # In-place assignment is required so that os.walk honours the pruning.
        dirs[:] = [
            name
            for name in dirs
            if os.path.abspath(os.path.join(root, name)) not in skipped
        ]
        for file in files:
            if not file.endswith(".tar"):
                continue
            tar_path = os.path.join(root, file)
            tar_name = os.path.splitext(file)[0]
            print(f"Extracting: {tar_path}")

            # Start from an empty scratch directory so leftovers from an
            # earlier failure are never attributed to this archive.
            if os.path.isdir(temp_dir):
                shutil.rmtree(temp_dir)
            os.makedirs(temp_dir)
            try:
                _extract_archive(tar_path, temp_dir)
                # Move the extracted files to the output directory.
                # NOTE: an already existing target is not checked here;
                # shutil.move's own destination rules apply.
                for extracted_file in os.listdir(temp_dir):
                    source_path = os.path.join(temp_dir, extracted_file)
                    target_path = os.path.join(
                        output_dir, f"{tar_name}_{extracted_file}"
                    )
                    shutil.move(source_path, target_path)
            finally:
                # Clean up the temporary directory even when extraction or a
                # move failed, without masking the original exception.
                shutil.rmtree(temp_dir, ignore_errors=True)
            extracted_count += 1
    return extracted_count


extract_all_tars(base_dir, output_dir)
print(f"All files extracted to: {output_dir}")