import os
import sys

# Add the project root directory to the Python path so `config` can be imported
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

import boto3
from botocore.exceptions import NoCredentialsError, ClientError

from config import get_settings

settings = get_settings()


class S3Handler:
    def __init__(self):
        self.s3_client = boto3.client(
            's3',
            aws_access_key_id=settings.AWS_ACCESS_KEY,
            aws_secret_access_key=settings.AWS_SECRET_KEY,
            region_name=settings.AWS_REGION
        )

    def upload_directory(self, local_directory, bucket_name, s3_prefix=""):
        """Upload every file under local_directory to S3, preserving the directory structure."""
        uploaded_files = []
        errors = []
        for root, _, files in os.walk(local_directory):
            for filename in files:
                # Full local path of the file
                local_path = os.path.join(root, filename)
                # Path relative to local_directory
                relative_path = os.path.relpath(local_path, local_directory)
                # Build the S3 key (preserve directory structure; use "/" separators)
                s3_key = os.path.join(s3_prefix, relative_path).replace("\\", "/")
                try:
                    print(f"Uploading {local_path} to {bucket_name}/{s3_key}")
                    self.s3_client.upload_file(local_path, bucket_name, s3_key)
                    uploaded_files.append(s3_key)
                except ClientError as e:
                    print(f"Error uploading {local_path}: {e}")
                    errors.append(local_path)
        return uploaded_files, errors
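
    # Usage sketch, assuming a bucket that already exists; "my-bucket" and
    # "./articles" are hypothetical names used only for illustration:
    #     handler = S3Handler()
    #     uploaded, failed = handler.upload_directory("./articles", "my-bucket", s3_prefix="pdfs")
    #     print(f"{len(uploaded)} uploaded, {len(failed)} failed")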

    def upload_file_to_s3(self, file_path, bucket_name, s3_key):
        """
        Upload a single file to S3.

        Args:
            file_path (str): Local path to the file to upload.
            bucket_name (str): Name of the S3 bucket.
            s3_key (str): Key (path) to save the file in S3.

        Returns:
            str: The URL of the uploaded file.
        """
        try:
            self.s3_client.upload_file(file_path, bucket_name, s3_key)
            print(f"Uploaded {file_path} to s3://{bucket_name}/{s3_key}")
            return f"s3://{bucket_name}/{s3_key}"
        except FileNotFoundError:
            print(f"File not found: {file_path}")
            raise
        except NoCredentialsError:
            print("AWS credentials not found.")
            raise
        except ClientError as e:
            print(f"Error uploading file: {e}")
            raise
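
    # Usage sketch (hypothetical file and key names); the return value is the
    # s3:// URI of the new object:
    #     url = handler.upload_file_to_s3("report.pdf", "my-bucket", "reports/report.pdf")
    #     # url == "s3://my-bucket/reports/report.pdf"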

    def list_files(self, bucket_name, prefix=""):
        """List all files in the bucket with the given prefix."""
        try:
            print(f"\nFiles in bucket '{bucket_name}' with prefix '{prefix}':")
            # Paginate so buckets holding more than 1000 matching keys are fully
            # listed (a single list_objects_v2 call returns at most 1000)
            paginator = self.s3_client.get_paginator('list_objects_v2')
            keys = []
            for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
                for obj in page.get('Contents', []):
                    print(f"- {obj['Key']} ({obj['Size']} bytes)")
                    keys.append(obj['Key'])
            if not keys:
                print("No files found")
            return keys
        except ClientError as e:
            print(f"Error listing files: {e}")
            return []
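
    # Usage sketch (hypothetical bucket and prefix); returns the matching keys
    # and prints each one with its size:
    #     keys = handler.list_files("my-bucket", prefix="reports/")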

    def delete_all_files(self, bucket_name, prefix=""):
        """
        Delete all files in the bucket with the given prefix.

        Args:
            bucket_name: Name of the S3 bucket.
            prefix: Optional prefix to delete only files under this path.

        Returns:
            tuple: (number of deleted files, list of keys that failed to delete)
        """
        deleted_count = 0
        failed_deletes = []
        try:
            # Delete in batches: each pass lists whatever remains, so the loop
            # ends once the listing comes back empty
            while True:
                response = self.s3_client.list_objects_v2(
                    Bucket=bucket_name,
                    Prefix=prefix
                )
                if 'Contents' not in response:
                    if deleted_count == 0:
                        print(f"No files found in bucket '{bucket_name}' with prefix '{prefix}'")
                    break
                # Prepare the batch of keys for delete_objects
                objects_to_delete = [{'Key': obj['Key']} for obj in response['Contents']]
                delete_response = self.s3_client.delete_objects(
                    Bucket=bucket_name,
                    Delete={
                        'Objects': objects_to_delete,
                        'Quiet': False
                    }
                )
                # Count successful deletes
                if 'Deleted' in delete_response:
                    deleted_count += len(delete_response['Deleted'])
                    for obj in delete_response['Deleted']:
                        print(f"Deleted: {obj['Key']}")
                # Track failed deletes
                if 'Errors' in delete_response:
                    for error in delete_response['Errors']:
                        failed_deletes.append(error['Key'])
                        print(f"Failed to delete {error['Key']}: {error['Message']}")
                    # Stop if nothing in this batch could be deleted; otherwise
                    # the same keys would be listed and retried forever
                    if len(delete_response['Errors']) == len(objects_to_delete):
                        break
                if not response.get('IsTruncated'):  # No more files
                    break
            print(f"\nSuccessfully deleted {deleted_count} files")
            if failed_deletes:
                print(f"Failed to delete {len(failed_deletes)} files")
            return deleted_count, failed_deletes
        except ClientError as e:
            print(f"Error deleting files: {e}")
            # Report whatever progress was made before the error
            return deleted_count, failed_deletes
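
    # Usage sketch (hypothetical names). This permanently deletes objects, so
    # double-check the prefix before running it:
    #     deleted, failed = handler.delete_all_files("my-bucket", prefix="tmp/")
    #     print(f"Deleted {deleted}, failed {len(failed)}")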

    def upload_string_to_s3(self, string_data, bucket_name, s3_key):
        """
        Upload a string as an object to S3.

        Args:
            string_data (str): The string content to upload.
            bucket_name (str): The S3 bucket name.
            s3_key (str): The S3 key (path) to save the file.
        """
        try:
            # Encode the string as UTF-8 bytes for the object body
            self.s3_client.put_object(Body=string_data.encode('utf-8'), Bucket=bucket_name, Key=s3_key)
            print(f"Uploaded string to s3://{bucket_name}/{s3_key}")
        except (NoCredentialsError, ClientError) as e:
            print(f"Failed to upload string data: {e}")
            raise
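
    # Usage sketch (hypothetical key); convenient for writing small JSON or CSV
    # payloads without touching the local filesystem:
    #     handler.upload_string_to_s3('{"status": "ok"}', "my-bucket", "status/latest.json")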

    def download_string_from_s3(self, bucket_name, s3_key):
        """
        Download a string object from S3 and return it.

        Args:
            bucket_name (str): The S3 bucket name.
            s3_key (str): The S3 key (path) to the object.

        Returns:
            str: The content of the object as a string.
        """
        try:
            response = self.s3_client.get_object(Bucket=bucket_name, Key=s3_key)
            content = response['Body'].read().decode('utf-8')
            print(f"Downloaded content from s3://{bucket_name}/{s3_key}")
            return content
        except (NoCredentialsError, ClientError) as e:
            print(f"Failed to download string data: {e}")
            raise
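
    # Usage sketch (hypothetical key); the round-trip counterpart of
    # upload_string_to_s3 above:
    #     text = handler.download_string_from_s3("my-bucket", "status/latest.json")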

    def download_pdf_by_article_id(self, article_id, metadata, bucket_name, local_download_dir):
        """
        Download a specific PDF from S3 by article ID.

        Args:
            article_id (str): The PMC article ID to download (e.g., "PMC1464409").
            metadata (list): List of metadata records.
            bucket_name (str): Name of the S3 bucket containing the files.
            local_download_dir (str): Local directory to save the PDF into.
        """
        # Search for the article in the metadata
        record = next((item for item in metadata if item["pmc_id"] == article_id), None)
        if not record:
            print(f"Article ID {article_id} not found in metadata.")
            return
        pdf_s3_path = record.get("pdf_s3_path")
        if not pdf_s3_path:
            print(f"No pdf_s3_path recorded for {article_id}.")
            return
        # Extract the S3 key from the s3:// path
        s3_key = pdf_s3_path.replace(f"s3://{bucket_name}/", "")
        # Define the local file path, creating the directory if needed
        os.makedirs(local_download_dir, exist_ok=True)
        local_pdf_path = os.path.join(local_download_dir, f"{article_id}.pdf")
        print(f"Downloading {article_id} from S3: {pdf_s3_path} to {local_pdf_path}")
        # Download the file
        try:
            self.s3_client.download_file(bucket_name, s3_key, local_pdf_path)
            print(f"Downloaded {article_id} to {local_pdf_path}")
        except Exception as e:
            print(f"Failed to download {article_id}: {e}")


if __name__ == "__main__":
    s3 = S3Handler()
    s3.list_files(bucket_name=settings.AWS_BUCKET_NAME)
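
    # Optional round-trip smoke test, left commented out so running the module
    # has no side effects; "demo/hello.txt" is a hypothetical key used only
    # for illustration:
    # s3.upload_string_to_s3("hello from S3Handler", settings.AWS_BUCKET_NAME, "demo/hello.txt")
    # print(s3.download_string_from_s3(settings.AWS_BUCKET_NAME, "demo/hello.txt"))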

# Scratch connection test (disabled). Credentials are shown as placeholders;
# real keys belong in config or the environment, never in source code.
# from botocore.config import Config
#
# # Custom configuration pinning the client to a specific region
# config = Config(region_name='me-south-1')
#
# s3_client = boto3.client(
#     's3',
#     aws_access_key_id='<AWS_ACCESS_KEY_ID>',
#     aws_secret_access_key='<AWS_SECRET_ACCESS_KEY>',
#     config=config,
# )
#
# def test_connection():
#     try:
#         # List objects with the low-level client. Note that
#         # Bucket(...).objects.all() is part of boto3.resource('s3');
#         # the client uses list_objects_v2 instead.
#         response = s3_client.list_objects_v2(Bucket='sehas3.bucket1')
#         for obj in response.get('Contents', []):
#             print(obj['Key'])
#         print("Success!")
#     except ClientError as e:
#         print(f"Error: {e}")
#
# test_connection()