# sehatech-demo/core/s3_utils.py
import sys
import os
# Add the project root directory to Python path
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import boto3
from botocore.exceptions import NoCredentialsError, ClientError
from config import get_settings
settings = get_settings()
class S3Handler:
def __init__(self):
self.s3_client = boto3.client(
's3',
aws_access_key_id=settings.AWS_ACCESS_KEY,
aws_secret_access_key=settings.AWS_SECRET_KEY,
region_name=settings.AWS_REGION
)
    def upload_directory(self, local_directory, bucket_name, s3_prefix=""):
        """
        Recursively upload every file under local_directory to S3,
        preserving the directory structure beneath s3_prefix.

        Returns:
            tuple: (list of uploaded S3 keys, list of local paths that failed to upload)
        """
        uploaded_files = []
        errors = []
for root, _, files in os.walk(local_directory):
for filename in files:
# Get the full local path
local_path = os.path.join(root, filename)
# Get relative path by removing the local_directory prefix
relative_path = os.path.relpath(local_path, local_directory)
# Create S3 key (preserve directory structure)
s3_key = os.path.join(s3_prefix, relative_path).replace("\\", "/")
try:
print(f"Uploading {local_path} to {bucket_name}/{s3_key}")
self.s3_client.upload_file(local_path, bucket_name, s3_key)
uploaded_files.append(s3_key)
except ClientError as e:
print(f"Error uploading {local_path}: {str(e)}")
errors.append(local_path)
return uploaded_files, errors
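
    # Usage sketch (hypothetical local path; bucket name comes from settings):
    #   handler = S3Handler()
    #   keys, errors = handler.upload_directory("data/pdfs", settings.AWS_BUCKET_NAME, "articles")
    #   # e.g. data/pdfs/PMC1/main.pdf -> s3://<bucket>/articles/PMC1/main.pdf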
def upload_file_to_s3(self, file_path, bucket_name, s3_key):
"""
Upload a single file to S3.
Args:
file_path (str): Local path to the file to upload.
bucket_name (str): Name of the S3 bucket.
s3_key (str): Key (path) to save the file in S3.
Returns:
            str: The s3:// URI of the uploaded file.
"""
try:
self.s3_client.upload_file(file_path, bucket_name, s3_key)
print(f"Uploaded {file_path} to s3://{bucket_name}/{s3_key}")
return f"s3://{bucket_name}/{s3_key}"
except FileNotFoundError:
print(f"File not found: {file_path}")
raise
except NoCredentialsError:
print("AWS credentials not found.")
raise
except ClientError as e:
print(f"Error uploading file: {e}")
raise
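
    # Usage sketch (hypothetical key; returns the s3:// URI on success):
    #   uri = handler.upload_file_to_s3("reports/summary.pdf", settings.AWS_BUCKET_NAME, "reports/summary.pdf")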
    def list_files(self, bucket_name, prefix=""):
        """List all files in the bucket with the given prefix."""
        try:
            # Paginate so buckets with more than 1000 objects are fully listed
            paginator = self.s3_client.get_paginator('list_objects_v2')
            keys = []
            print(f"\nFiles in bucket '{bucket_name}' with prefix '{prefix}':")
            for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
                for obj in page.get('Contents', []):
                    print(f"- {obj['Key']} ({obj['Size']} bytes)")
                    keys.append(obj['Key'])
            if not keys:
                print("No files found")
            return keys
        except ClientError as e:
            print(f"Error listing files: {str(e)}")
            return []
def delete_all_files(self, bucket_name, prefix=""):
"""
Delete all files in the bucket with given prefix
Args:
bucket_name: Name of the S3 bucket
prefix: Optional prefix to delete only files under this path
Returns:
tuple: (number of deleted files, list of any files that failed to delete)
"""
deleted_count = 0
failed_deletes = []
try:
# List all objects in the bucket
while True:
# Get batch of objects
response = self.s3_client.list_objects_v2(
Bucket=bucket_name,
Prefix=prefix
)
# If bucket is empty
if 'Contents' not in response:
print(f"No files found in bucket '{bucket_name}' with prefix '{prefix}'")
break
# Prepare objects for deletion
objects_to_delete = [{'Key': obj['Key']} for obj in response['Contents']]
# Delete the batch of objects
delete_response = self.s3_client.delete_objects(
Bucket=bucket_name,
Delete={
'Objects': objects_to_delete,
'Quiet': False
}
)
# Count successful deletes
if 'Deleted' in delete_response:
deleted_count += len(delete_response['Deleted'])
for obj in delete_response['Deleted']:
print(f"Deleted: {obj['Key']}")
                # Track failed deletes
                if 'Errors' in delete_response:
                    for error in delete_response['Errors']:
                        failed_deletes.append(error['Key'])
                        print(f"Failed to delete {error['Key']}: {error['Message']}")
                    # Stop here rather than re-listing (and retrying) the same keys forever
                    break
                # Check if there are more objects to delete
                if not response.get('IsTruncated'):  # No more files
                    break
print(f"\nSuccessfully deleted {deleted_count} files")
if failed_deletes:
print(f"Failed to delete {len(failed_deletes)} files")
return deleted_count, failed_deletes
        except ClientError as e:
            print(f"Error deleting files: {str(e)}")
            # Return the progress made before the error instead of reporting zero deletes
            return deleted_count, failed_deletes
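
    # Usage sketch (destructive; double-check the prefix before running):
    #   deleted, failed = handler.delete_all_files(settings.AWS_BUCKET_NAME, prefix="tmp/")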
def upload_string_to_s3(self, string_data, bucket_name, s3_key):
"""
Upload a string as an object to S3.
Args:
string_data (str): The string content to upload.
bucket_name (str): The S3 bucket name.
s3_key (str): The S3 key (path) to save the file.
"""
try:
            # Encode the string so the request body is sent as bytes
            self.s3_client.put_object(Body=string_data.encode('utf-8'), Bucket=bucket_name, Key=s3_key)
print(f"Uploaded string to s3://{bucket_name}/{s3_key}")
except (NoCredentialsError, ClientError) as e:
print(f"Failed to upload string data: {e}")
raise
def download_string_from_s3(self, bucket_name, s3_key):
"""
Download a string object from S3 and return it.
Args:
bucket_name (str): The S3 bucket name.
s3_key (str): The S3 key (path) to the object.
Returns:
str: The content of the object as a string.
"""
try:
response = self.s3_client.get_object(Bucket=bucket_name, Key=s3_key)
content = response['Body'].read().decode('utf-8')
print(f"Downloaded content from s3://{bucket_name}/{s3_key}")
return content
except (NoCredentialsError, ClientError) as e:
print(f"Failed to download string data: {e}")
raise
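
    # Round-trip sketch with the two string helpers above (hypothetical key):
    #   handler.upload_string_to_s3("hello", settings.AWS_BUCKET_NAME, "debug/hello.txt")
    #   assert handler.download_string_from_s3(settings.AWS_BUCKET_NAME, "debug/hello.txt") == "hello"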
def download_pdf_by_article_id(self, article_id, metadata, bucket_name, local_download_dir):
"""
Download a specific PDF from S3 by article ID.
        Args:
            article_id (str): The PMC article ID to download (e.g., "PMC1464409").
            metadata (list): List of metadata records.
            bucket_name (str): Name of the S3 bucket containing the files.
            local_download_dir (str): Local directory to save the downloaded PDF into.
        """
# Search for the article in the metadata
record = next((item for item in metadata if item["pmc_id"] == article_id), None)
if not record:
print(f"Article ID {article_id} not found in metadata.")
return
        pdf_s3_path = record.get("pdf_s3_path")
        if not pdf_s3_path:
            print(f"No pdf_s3_path recorded for article {article_id}.")
            return
        # Extract the S3 key from the S3 path
        s3_key = pdf_s3_path.replace(f"s3://{bucket_name}/", "")
        # Define the local file path, creating the download directory if needed
        os.makedirs(local_download_dir, exist_ok=True)
        local_pdf_path = os.path.join(local_download_dir, f"{article_id}.pdf")
print(f"Downloading {article_id} from S3: {pdf_s3_path} to {local_pdf_path}")
# Download the file
try:
self.s3_client.download_file(bucket_name, s3_key, local_pdf_path)
print(f"Downloaded {article_id} to {local_pdf_path}")
except Exception as e:
print(f"Failed to download {article_id}: {e}")
if __name__ == "__main__":
s3 = S3Handler()
s3.list_files(bucket_name=settings.AWS_BUCKET_NAME)
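
    # Further usage sketches (commented out; keys and paths are illustrative):
    # uri = s3.upload_file_to_s3("reports/summary.pdf", settings.AWS_BUCKET_NAME, "reports/summary.pdf")
    # s3.upload_string_to_s3("hello", settings.AWS_BUCKET_NAME, "debug/hello.txt")
    # print(s3.download_string_from_s3(settings.AWS_BUCKET_NAME, "debug/hello.txt"))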