"""API custom file upload call wrappers.""" import hashlib import os import shutil import tempfile from copy import copy from datetime import datetime from pathlib import Path from zipfile import ZIP_DEFLATED, ZipFile import requests import typer from folding_studio_data_models import CustomFileType from rich import ( print, # pylint:disable=redefined-builtin ) from tqdm import tqdm from tqdm.utils import CallbackIOWrapper from folding_studio.config import API_URL, REQUEST_TIMEOUT def _upload_file_to_signed_url( signed_url: str, src: str, headers: dict[str, str], ) -> requests.Response: """Upload a local file to a GCS bucket using a signed URL. Use a PUT request. Args: signed_url (str): the signed URL corresponding to the GCS path. src (src | Path): the local file path. headers (dict[str, str]): HTTP request headers. Raises: requests.exceptions.HTTPError: if something went wrong during the uploading. Returns: A response to the PUT request. """ put_headers = copy(headers) put_headers["Content-type"] = "application/octet-stream" file_size = os.path.getsize(src) with open(src, "rb") as fd: with tqdm( desc=f"Uploading {src}", total=file_size, unit="B", unit_scale=True, unit_divisor=1024, ) as t: reader_wrapper = CallbackIOWrapper(t.update, fd, "read") response = requests.put( url=signed_url, data=reader_wrapper, headers=put_headers, ) response.raise_for_status() return response def _get_blob_name_from_file_content(src: str | Path) -> str: """Get a unique file name based on its content. This file name is used as blob name when uploading the file to a bucket. Args: src (str | Path): Path to local file. Returns: The unique blob name. """ src = Path(src) file_hash = hashlib.md5() with src.open("rb") as fd: fd.seek(0) while chunk := fd.read(8192): file_hash.update(chunk) hexcode = file_hash.hexdigest()[:8] # Take file name from src first as maybe a path with directories # Then only extract the stem. There maybe more than 1 extension # example: data/templates/custom_msa.a3m.pqt file_stem = src.name.split(".")[0] suffix = "".join(src.suffixes) return f"{file_stem}_{hexcode}{suffix}" def _copy_and_zip_files( file_list: list[Path], temp_dir: tempfile.TemporaryDirectory, zip_name: str = "files.zip", ): """ Copies a list of files to a temporary directory and zips them into one archive with the highest compression level. Args: file_list (list): List of file paths to be copied and zipped. temp_dir: (TemporaryDirectory): Path to the temporary directory. zip_name (str): Name of the resulting zip file. Returns: str: Path to the created zip file. """ to_zip = [] for file_path in file_list: if file_path.is_file(): blob_name = _get_blob_name_from_file_content(src=file_path) dest_file = os.path.join(temp_dir, blob_name) shutil.copy(file_path, dest_file) to_zip.append(dest_file) else: print(f"Warning: {file_path} does not exist or is not a file.") zip_path = os.path.join(temp_dir, zip_name) with ZipFile(zip_path, "w", compression=ZIP_DEFLATED, compresslevel=7) as zipf: for file_name in to_zip: zipf.write(file_name, arcname=Path(file_name).name) return zip_path def _get_blob_zip_name(file_type: str): timestamp = datetime.now().strftime("%Y%m%d%H%M%S%f") return f"{file_type}_files_{timestamp}.zip" def upload_custom_files( headers: dict[str, str], paths: list[Path], file_type: CustomFileType, ) -> dict[str, str]: """Upload custom files to the GCS bucket. Args: headers (dict[str, str]): HTTP request headers. paths (list[Path]): List of custom template files path. file_type (CustomFileType): Type of file to upload. 
def upload_custom_files(
    headers: dict[str, str],
    paths: list[Path],
    file_type: CustomFileType,
) -> dict[str, str]:
    """Upload custom files to the GCS bucket.

    Args:
        headers (dict[str, str]): HTTP request headers.
        paths (list[Path]): List of custom template file paths.
        file_type (CustomFileType): Type of file to upload.

    Raises:
        typer.Exit: If an error occurs during the API call.

    Returns:
        dict[str, str]: Mapping of local filenames to GCS paths.
    """
    url = API_URL + "getUploadSignedURL"
    paths = list(set(paths))  # Deduplicate input paths.
    print(f"Uploading {len(paths)} file(s): {tuple(str(p) for p in paths)}.")
    blobs = [_get_blob_name_from_file_content(src=file) for file in paths]

    # Zip the files and upload the archive.
    blob_zip = _get_blob_zip_name(file_type.value)
    with tempfile.TemporaryDirectory() as temp_dir:
        zip_path = _copy_and_zip_files(
            file_list=paths,
            temp_dir=temp_dir,
            zip_name=blob_zip,
        )
        url_response = requests.get(
            url,
            params={
                "blob_name": blob_zip,
                "file_type": file_type.value,
            },
            headers=headers,
            timeout=REQUEST_TIMEOUT,
        )
        if not url_response.ok:
            print(f"Error while generating signed URL: {url_response.content.decode()}")
            raise typer.Exit(code=1)

        json_response = url_response.json()
        signed_url = json_response["signed_url"]

        upload_response = _upload_file_to_signed_url(
            signed_url=signed_url, src=zip_path, headers=headers
        )
        if not upload_response.ok:
            print(f"Error while uploading {zip_path}.")
            raise typer.Exit(code=1)

    # Unzip the archive in the destination bucket.
    unzip_response = requests.post(
        API_URL + "unzipFileInBucket",
        params={
            "zip_file_path": json_response["destination_file"],
        },
        headers=headers,
        timeout=REQUEST_TIMEOUT,
    )
    if not unzip_response.ok:
        print(f"Error while unzipping custom files: {unzip_response.content.decode()}")
        raise typer.Exit(code=1)

    local_to_gcs = {
        str(file): f"{json_response['destination_bucket']}/{blob_name}"
        for file, blob_name in zip(paths, blobs)
    }
    print("Custom files successfully uploaded.")
    return local_to_gcs
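# Usage sketch (illustrative only): how a CLI command might drive this module.
# The Authorization header and the `CustomFileType.TEMPLATE` member are
# assumptions about the surrounding project, not confirmed API:
#
#     from pathlib import Path
#
#     from folding_studio_data_models import CustomFileType
#
#     headers = {"Authorization": "Bearer <token>"}  # hypothetical auth header
#     local_to_gcs = upload_custom_files(
#         headers=headers,
#         paths=[Path("data/templates/custom_msa.a3m.pqt")],
#         file_type=CustomFileType.TEMPLATE,  # hypothetical enum member
#     )
#     # local_to_gcs maps each local path to "destination_bucket/blob_name".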