|
import os |
|
import tempfile |
|
import zipfile |
|
|
|
def extract_repo_files(directory, folder_paths, file_paths):
    """Read selected ``.rst`` / ``.py`` files out of a zipped repository.

    The first ``*.zip`` entry returned by ``os.listdir(directory)`` is
    unpacked into a temporary directory. The first entry of the unpacked
    tree is used as the repository root, and every path in ``folder_paths``
    and ``file_paths`` is resolved relative to that root.

    Args:
        directory: Directory that contains the ``.zip`` archive.
        folder_paths: Folder paths relative to the repository root; every
            file found below them is considered. May be empty or ``None``.
        file_paths: Individual file paths relative to the repository root.
            May be empty or ``None``.

    Returns:
        A tuple ``(all_texts, file_references)`` of two parallel lists: the
        UTF-8 decoded contents of each collected file, and that file's path
        relative to the repository root. Empty files and files whose
        extension is not ``.rst`` or ``.py`` are left out of both lists.

    Raises:
        FileNotFoundError: If ``directory`` contains no ``.zip`` file, or
            (from ``os.path.getsize``) if a requested file does not exist.
        ValueError: If the archive unpacks to nothing.
        UnicodeDecodeError: If a collected file is not valid UTF-8.
    """
    supported_extensions = ('.rst', '.py')
    all_texts = []
    file_references = []

    zip_filename = next(
        (file for file in os.listdir(directory) if file.endswith('.zip')),
        None,
    )
    if zip_filename is None:
        # Without this guard os.path.join(directory, None) fails with an
        # unrelated TypeError.
        raise FileNotFoundError(f"No .zip archive found in {directory!r}")

    zip_file_path = os.path.join(directory, zip_filename)
    print(zip_file_path)

    # Everything that touches the unpacked files has to happen inside this
    # block: the temporary directory is deleted as soon as it exits.
    with tempfile.TemporaryDirectory() as tmpdirname:
        with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
            zip_ref.extractall(tmpdirname)

        files = []
        print("tmpdirname: " , tmpdirname)
        unzipped_root = os.listdir(tmpdirname)
        print("unzipped_root ", unzipped_root)
        if not unzipped_root:
            raise ValueError(f"Archive {zip_file_path!r} contains no entries")

        # NOTE(review): presumably the archive has exactly one top-level
        # directory; with several entries the one picked here depends on
        # os.listdir ordering -- confirm against the archives in use.
        tmpsubdirpath = os.path.join(tmpdirname, unzipped_root[0])
        print("tempsubdirpath: ", tmpsubdirpath)

        if folder_paths:
            for folder_path in folder_paths:
                files += _get_all_files_in_folder(tmpsubdirpath, folder_path)
        if file_paths:
            files += [_get_file(tmpsubdirpath, file_path) for file_path in file_paths]

        print(f"Total number of files: {len(files)}")

        for file_path in files:
            if os.path.getsize(file_path) == 0:
                print(f"Skipping an empty file: {file_path}")
                continue

            # Only supported text files are read. Skipping the rest here
            # keeps a previous file's text from being recorded under an
            # unsupported file's path.
            if os.path.splitext(file_path)[1] not in supported_extensions:
                continue

            with open(file_path, 'rb') as f:
                text = f.read().decode('utf-8')

            all_texts.append(text)
            print("Filepaths brother:", file_path)
            relative_path = os.path.relpath(file_path, tmpsubdirpath)
            print("Relative Filepaths brother:", relative_path)
            file_references.append(relative_path)

    return all_texts, file_references
|
|
|
|
|
|
|
def _get_all_files_in_folder(temp_dir, folder_path): |
|
|
|
all_files = [] |
|
target_dir = os.path.join(temp_dir, folder_path) |
|
|
|
for root, dirs, files in os.walk(target_dir): |
|
print(f"Files in current directory ({root}): {files}") |
|
for file in files: |
|
print(f"Processing file: {file}") |
|
all_files.append(os.path.join(root, file)) |
|
|
|
return all_files |
|
|
|
def _get_file(temp_dir, file_path): |
|
full_path = os.path.join(temp_dir, file_path) |
|
return full_path |
|
|