import sys
import os

# Add the project root directory to the Python path so `config` resolves
# when this module is run directly.
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

import boto3
from botocore.exceptions import NoCredentialsError, ClientError

from config import get_settings

settings = get_settings()


class S3Handler:
    def __init__(self):
        self.s3_client = boto3.client(
            's3',
            aws_access_key_id=settings.AWS_ACCESS_KEY,
            aws_secret_access_key=settings.AWS_SECRET_KEY,
            region_name=settings.AWS_REGION
        )

    def upload_directory(self, local_directory, bucket_name, s3_prefix=""):
        """Upload every file under local_directory to S3, preserving structure.

        Returns:
            tuple: (list of uploaded S3 keys, list of local paths that failed).
        """
        uploaded_files = []
        errors = []

        for root, _, files in os.walk(local_directory):
            for filename in files:
                # Get the full local path
                local_path = os.path.join(root, filename)
                # Get relative path by removing the local_directory prefix
                relative_path = os.path.relpath(local_path, local_directory)
                # Create S3 key (preserve directory structure; force forward
                # slashes so Windows paths produce valid keys)
                s3_key = os.path.join(s3_prefix, relative_path).replace("\\", "/")

                try:
                    print(f"Uploading {local_path} to {bucket_name}/{s3_key}")
                    self.s3_client.upload_file(local_path, bucket_name, s3_key)
                    uploaded_files.append(s3_key)
                except ClientError as e:
                    print(f"Error uploading {local_path}: {str(e)}")
                    errors.append(local_path)

        return uploaded_files, errors

    def upload_file_to_s3(self, file_path, bucket_name, s3_key):
        """
        Upload a single file to S3.

        Args:
            file_path (str): Local path to the file to upload.
            bucket_name (str): Name of the S3 bucket.
            s3_key (str): Key (path) to save the file in S3.

        Returns:
            str: The URL of the uploaded file.
        """
        try:
            self.s3_client.upload_file(file_path, bucket_name, s3_key)
            print(f"Uploaded {file_path} to s3://{bucket_name}/{s3_key}")
            return f"s3://{bucket_name}/{s3_key}"
        except FileNotFoundError:
            print(f"File not found: {file_path}")
            raise
        except NoCredentialsError:
            print("AWS credentials not found.")
            raise
        except ClientError as e:
            print(f"Error uploading file: {e}")
            raise

    def list_files(self, bucket_name, prefix=""):
        """List files in the bucket with the given prefix.

        Note: a single list_objects_v2 call returns at most 1,000 keys;
        larger buckets would need pagination.
        """
        try:
            response = self.s3_client.list_objects_v2(
                Bucket=bucket_name,
                Prefix=prefix
            )

            print(f"\nFiles in bucket '{bucket_name}' with prefix '{prefix}':")
            if 'Contents' in response:
                for obj in response['Contents']:
                    print(f"- {obj['Key']} ({obj['Size']} bytes)")
                return [obj['Key'] for obj in response['Contents']]
            else:
                print("No files found")
                return []
        except ClientError as e:
            print(f"Error listing files: {str(e)}")
            return []

    def delete_all_files(self, bucket_name, prefix=""):
        """
        Delete all files in the bucket with the given prefix.

        Args:
            bucket_name: Name of the S3 bucket.
            prefix: Optional prefix to delete only files under this path.

        Returns:
            tuple: (number of deleted files, list of any files that failed to delete).
        """
        deleted_count = 0
        failed_deletes = []

        try:
            # List and delete in batches until nothing is left under the prefix
            while True:
                # Get a batch of objects (up to 1,000 per call)
                response = self.s3_client.list_objects_v2(
                    Bucket=bucket_name,
                    Prefix=prefix
                )

                # Nothing (left) to delete
                if 'Contents' not in response:
                    print(f"No files found in bucket '{bucket_name}' with prefix '{prefix}'")
                    break

                # Prepare this batch of objects for deletion
                objects_to_delete = [{'Key': obj['Key']} for obj in response['Contents']]

                # Delete the batch of objects
                delete_response = self.s3_client.delete_objects(
                    Bucket=bucket_name,
                    Delete={
                        'Objects': objects_to_delete,
                        'Quiet': False
                    }
                )

                # Count successful deletes
                if 'Deleted' in delete_response:
                    deleted_count += len(delete_response['Deleted'])
                    for obj in delete_response['Deleted']:
                        print(f"Deleted: {obj['Key']}")

                # Track failed deletes
                if 'Errors' in delete_response:
                    for error in delete_response['Errors']:
                        failed_deletes.append(error['Key'])
                        print(f"Failed to delete {error['Key']}: {error['Message']}")
                    # Stop if this pass made no progress, otherwise the same
                    # failing objects would be re-listed and retried forever
                    if 'Deleted' not in delete_response:
                        break

                # Check if there are more objects to delete
                if not response.get('IsTruncated'):
                    # No more files
                    break

            print(f"\nSuccessfully deleted {deleted_count} files")
            if failed_deletes:
                print(f"Failed to delete {len(failed_deletes)} files")

            return deleted_count, failed_deletes

        except ClientError as e:
            print(f"Error deleting files: {str(e)}")
            # Report whatever progress was made before the failure
            return deleted_count, failed_deletes

    def upload_string_to_s3(self, string_data, bucket_name, s3_key):
        """
        Upload a string as an object to S3.

        Args:
            string_data (str): The string content to upload.
            bucket_name (str): The S3 bucket name.
            s3_key (str): The S3 key (path) to save the file.
        """
        try:
            # put_object accepts str or bytes directly as the Body
            self.s3_client.put_object(Body=string_data, Bucket=bucket_name, Key=s3_key)
            print(f"Uploaded string to s3://{bucket_name}/{s3_key}")
        except (NoCredentialsError, ClientError) as e:
            print(f"Failed to upload string data: {e}")
            raise

    def download_string_from_s3(self, bucket_name, s3_key):
        """
        Download a string object from S3 and return it.

        Args:
            bucket_name (str): The S3 bucket name.
            s3_key (str): The S3 key (path) to the object.

        Returns:
            str: The content of the object as a string.
        """
        try:
            response = self.s3_client.get_object(Bucket=bucket_name, Key=s3_key)
            content = response['Body'].read().decode('utf-8')
            print(f"Downloaded content from s3://{bucket_name}/{s3_key}")
            return content
        except (NoCredentialsError, ClientError) as e:
            print(f"Failed to download string data: {e}")
            raise

    def download_pdf_by_article_id(self, article_id, metadata, bucket_name, local_download_dir):
        """
        Download a specific PDF from S3 by article ID.

        Args:
            article_id (str): The PMC article ID to download (e.g., "PMC1464409").
            metadata (list): List of metadata records.
            bucket_name (str): Name of the S3 bucket containing the files.
            local_download_dir (str): Local directory to save the PDF into.
        """
        # Search for the article in the metadata
        record = next((item for item in metadata if item["pmc_id"] == article_id), None)
        if not record:
            print(f"Article ID {article_id} not found in metadata.")
            return

        pdf_s3_path = record.get("pdf_s3_path")
        if not pdf_s3_path:
            print(f"No pdf_s3_path recorded for {article_id}.")
            return

        # Extract the S3 key from the S3 path
        s3_key = pdf_s3_path.replace(f"s3://{bucket_name}/", "")

        # Define the local file path, creating the directory if needed
        os.makedirs(local_download_dir, exist_ok=True)
        local_pdf_path = os.path.join(local_download_dir, f"{article_id}.pdf")
        print(f"Downloading {article_id} from S3: {pdf_s3_path} to {local_pdf_path}")

        # Download the file
        try:
            self.s3_client.download_file(bucket_name, s3_key, local_pdf_path)
            print(f"Downloaded {article_id} to {local_pdf_path}")
        except Exception as e:
            print(f"Failed to download {article_id}: {e}")


if __name__ == "__main__":
    s3 = S3Handler()
    s3.list_files(bucket_name=settings.AWS_BUCKET_NAME)
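
    # A minimal round-trip sketch of the string helpers (assumptions: the
    # bucket named by settings.AWS_BUCKET_NAME exists, the configured
    # credentials can write to it, and the "demo/" key prefix is an
    # illustrative placeholder, not part of the original module).
    demo_key = "demo/s3_handler_smoke_test.txt"
    s3.upload_string_to_s3("hello from S3Handler", settings.AWS_BUCKET_NAME, demo_key)
    round_trip = s3.download_string_from_s3(settings.AWS_BUCKET_NAME, demo_key)
    assert round_trip == "hello from S3Handler"
    # Clean up only the objects under the demo prefix
    s3.delete_all_files(settings.AWS_BUCKET_NAME, prefix="demo/")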