# File size: 5,509 Bytes

# 863fd3a

# import os
# import pandas as pd
# from PIL import Image, ImageOps
# import numpy as np
# from tqdm import tqdm
# from multiprocessing import Pool, cpu_count

# # 读取CSV文件
# csv_path = '/data/cjm/FungiCLEF2024/Dataset/06_new_train_valmetadata.csv'
# data = pd.read_csv(csv_path)

# # 设置根目录
# root_dir = '/data/cjm/FungiCLEF2024/Dataset/DF20_21_300'

# # 过滤poisonous为1的数据
# poisonous_data = data[data['poisonous'] == 1]

# # 创建保存增强数据的DataFrame，并包含原始数据
# new_data = data.copy()

# # 定义数据增强函数
# def augment_image(args):
#     row, root_dir = args
#     image_path = row['image_path']
#     full_path = os.path.join(root_dir, image_path)
#     augmented_rows = []

#     if os.path.exists(full_path):
#         image = Image.open(full_path)
#         w, h = image.size

#         # 定义旋转和翻转操作
#         transformations = {
#             'r90': image.rotate(90, expand=True),
#             'r180': image.rotate(180, expand=True),
#             'r270': image.rotate(270, expand=True),
#             'fh': ImageOps.mirror(image),
#             'fv': ImageOps.flip(image),
#         }

#         for suffix, img in transformations.items():
#             # 裁剪图片以去除旋转后的黑边
#             if suffix in ['r90', 'r270']:
#                 img = img.crop((0, 0, h, w))

#             new_image_path = os.path.splitext(image_path)[0] + f'_{suffix}.JPG'
#             new_full_path = os.path.join(root_dir, new_image_path)
#             img.save(new_full_path)

#             new_row = row.copy()
#             new_row['image_path'] = new_image_path
#             augmented_rows.append(new_row)

#     return augmented_rows

# # 准备多进程处理
# num_processes = cpu_count()
# pool = Pool(processes=num_processes)

# # 使用tqdm显示进度
# augmented_data = []
# for augmented_rows in tqdm(pool.imap_unordered(augment_image, [(row, root_dir) for _, row in poisonous_data.iterrows()]), total=len(poisonous_data)):
#     augmented_data.extend(augmented_rows)

# # 关闭进程池
# pool.close()
# pool.join()

# # 将增强后的数据添加到new_data中
# new_data = new_data.append(augmented_data, ignore_index=True)

# # 将数据保存到新的CSV文件中
# new_csv_path = '/data/cjm/FungiCLEF2024/Dataset/07_new_train_valmetadata.csv'
# new_data.to_csv(new_csv_path, index=False)


import os
import pandas as pd
from PIL import Image, ImageOps
import numpy as np
from tqdm import tqdm
from multiprocessing import Pool, cpu_count
import random

# Root directory that the CSV's relative image paths are resolved against.
root_dir = '/data/cjm/FungiCLEF2024/Dataset/DF20_21_300'

# Load the train/val metadata table.
csv_path = '/data/cjm/FungiCLEF2024/Dataset/06_new_train_valmetadata.csv'
data = pd.read_csv(csv_path)

# Only rows flagged poisonous (poisonous == 1) are augmented.
is_poisonous = data['poisonous'] == 1
poisonous_data = data[is_poisonous]

# Output table: starts as a copy of the original rows; augmented rows are
# added to it later.
new_data = data.copy()

# Data augmentation worker (one call per metadata row).
def augment_image(args):
    """Write augmented copies of one image and return their metadata rows.

    For the image referenced by ``row['image_path']`` (relative to
    ``root_dir``) this saves, next to the original file:

    * 4 random crops, each side scaled by a factor drawn uniformly from
      [0.7, 0.8] and placed at a random offset (``_crop<factor>.JPG``);
    * rotations by 90/180/270 degrees (``_r90``, ``_r180``, ``_r270``);
    * a horizontal mirror (``_fh``) and a vertical flip (``_fv``).

    Args:
        args: A ``(row, root_dir)`` tuple, packed into one argument so the
            function can be used with ``Pool.imap_unordered``. ``row`` must
            support ``row['image_path']`` and ``row.copy()``; ``root_dir``
            is the directory the image path is relative to.

    Returns:
        A list of copies of ``row`` whose ``'image_path'`` points at each
        newly written file (crops first, then rotations and flips). The
        list is empty when the source image does not exist.
    """
    row, root_dir = args
    image_path = row['image_path']
    full_path = os.path.join(root_dir, image_path)
    augmented_rows = []

    # Missing source images are skipped silently (best effort).
    if not os.path.exists(full_path):
        return augmented_rows

    stem = os.path.splitext(image_path)[0]

    def save_variant(img, suffix):
        # Save one augmented image and record a metadata row pointing at it.
        new_image_path = f'{stem}_{suffix}.JPG'
        img.save(os.path.join(root_dir, new_image_path))
        new_row = row.copy()
        new_row['image_path'] = new_image_path
        augmented_rows.append(new_row)

    # The context manager closes the underlying file handle; without it each
    # worker process leaks one open file per processed image.
    with Image.open(full_path) as image:
        w, h = image.size

        # Random crops (kept at their cropped size, not resized back).
        for _ in range(4):
            rand = random.uniform(0.7, 0.8)
            new_w = int(w * rand)
            new_h = int(h * rand)
            left = random.randint(0, w - new_w)
            top = random.randint(0, h - new_h)
            cropped_image = image.crop((left, top, left + new_w, top + new_h))
            # NOTE(review): the random factor is part of the file name, so
            # re-running the script writes new crop files each time.
            save_variant(cropped_image, f'crop{rand}')

        # Rotations and flips, built one at a time so that only a single
        # transformed copy is held in memory at once.
        transformations = (
            ('r90', lambda: image.rotate(90, expand=True)),
            ('r180', lambda: image.rotate(180, expand=True)),
            ('r270', lambda: image.rotate(270, expand=True)),
            ('fh', lambda: ImageOps.mirror(image)),
            ('fv', lambda: ImageOps.flip(image)),
        )
        for suffix, build in transformations:
            img = build()
            if suffix in ('r90', 'r270'):
                # Clamp to the swapped (h, w) extent to drop any border left
                # by the rotation; with expand=True this is presumably
                # already the image size, making the crop a safeguard.
                img = img.crop((0, 0, h, w))
            save_variant(img, suffix)

    return augmented_rows

# Script entry point. The __main__ guard is required by multiprocessing when
# workers are started with the "spawn" or "forkserver" methods: they re-import
# this module and must not create a pool of their own.
if __name__ == '__main__':
    # One worker process per CPU core.
    num_processes = cpu_count()

    # Each task carries its row plus the root directory, because the worker
    # takes a single packed argument.
    tasks = [(row, root_dir) for _, row in poisonous_data.iterrows()]

    # Collect the augmented rows as workers finish (order is not preserved),
    # with a tqdm progress bar over the poisonous rows.
    augmented_data = []
    with Pool(processes=num_processes) as pool:
        results = pool.imap_unordered(augment_image, tasks)
        for augmented_rows in tqdm(results, total=len(poisonous_data)):
            augmented_data.extend(augmented_rows)
        # Let the workers exit cleanly before the pool is torn down.
        pool.close()
        pool.join()

    # Add the augmented rows to the output table. DataFrame.append was
    # removed in pandas 2.0, so pd.concat is used instead.
    if augmented_data:
        new_data = pd.concat(
            [new_data, pd.DataFrame(augmented_data)],
            ignore_index=True,
        )

    # Save the combined table to a new CSV file.
    new_csv_path = '/data/cjm/FungiCLEF2024/Dataset/07_new_train_valmetadata.csv'
    new_data.to_csv(new_csv_path, index=False)