Spaces:
Sleeping
Sleeping
import os | |
import sys | |
import pandas as pd | |
def display_and_store_directory_content(base_path): | |
""" | |
Display all paths with directories and files along with their content, | |
and store the information in a Pandas DataFrame. | |
Args: | |
base_path (str): The root directory path to scan. | |
Returns: | |
None: Prints paths and content, and saves the DataFrame as a pickle file. | |
""" | |
data = [] # To store path and content as rows for the DataFrame | |
for root, dirs, files in os.walk(base_path): | |
# Store directories (no content) | |
for d in dirs: | |
dir_path = os.path.join(root, d) | |
data.append({"path": dir_path, "content": ""}) | |
print(f"Directory: {dir_path}") | |
# Store files and their content | |
for f in files: | |
file_path = os.path.join(root, f) | |
try: | |
with open(file_path, 'r', encoding='utf-8') as file: | |
content = file.read() | |
except Exception as e: | |
content = f"Error reading file: {e}" | |
data.append({"path": file_path, "content": content}) | |
print(f"\nFile: {file_path}") | |
print("-" * 40) | |
print(content) | |
print("-" * 40) | |
# Create a DataFrame | |
df = pd.DataFrame(data) | |
# Create the 'extraction' directory if it doesn't exist | |
extraction_dir = "extraction" | |
if not os.path.exists(extraction_dir): | |
os.makedirs(extraction_dir) | |
# Use the last component of the base path as the file name | |
base_name = os.path.basename(os.path.normpath(base_path)) | |
output_file = os.path.join(extraction_dir, f"{base_name}.pkl") | |
# Save the DataFrame to a pickle file | |
df.to_pickle(output_file) | |
print(f"\nDataFrame saved to {output_file}") | |
if __name__ == "__main__": | |
# Ensure a directory path is provided as an argument | |
if len(sys.argv) < 2: | |
print("Usage: python utils\\extract_all_content.py <directory>") | |
sys.exit(1) | |
# Get the directory path from the command-line arguments | |
directory_path = sys.argv[1] | |
# Execute the function | |
if os.path.exists(directory_path): | |
display_and_store_directory_content(directory_path) | |
else: | |
print(f"Error: The path '{directory_path}' does not exist.") | |