# FungiCLEF2024 / 07_data_augmentation.py
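"""Offline data augmentation for the poisonous class.

For every image with poisonous == 1, this script writes nine augmented copies
next to the original (four random 70-80% crops, three right-angle rotations,
and horizontal/vertical flips), appends a metadata row for each copy, and
saves the extended metadata CSV for the next pipeline step.
"""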
import os
import random
from multiprocessing import Pool, cpu_count

import pandas as pd
from PIL import Image, ImageOps
from tqdm import tqdm

# Read the metadata CSV produced by the previous pipeline step.
csv_path = '/data/cjm/FungiCLEF2024/Dataset/06_new_train_valmetadata.csv'
data = pd.read_csv(csv_path)

# Root directory holding the images.
root_dir = '/data/cjm/FungiCLEF2024/Dataset/DF20_21_300'

# Augment only the rows flagged as poisonous.
poisonous_data = data[data['poisonous'] == 1]

# Start from a copy of the original metadata; augmented rows are appended below.
new_data = data.copy()
# Worker: generate augmented copies of one image and return a metadata row
# for each copy.
def augment_image(args):
    row, root_dir = args
    image_path = row['image_path']
    full_path = os.path.join(root_dir, image_path)
    augmented_rows = []
    if os.path.exists(full_path):
        # Convert to RGB so every variant can be saved as JPEG.
        image = Image.open(full_path).convert('RGB')
        w, h = image.size

        # Right-angle rotations and horizontal/vertical flips.
        transformations = {
            'r90': image.rotate(90, expand=True),
            'r180': image.rotate(180, expand=True),
            'r270': image.rotate(270, expand=True),
            'fh': ImageOps.mirror(image),
            'fv': ImageOps.flip(image),
        }

        # Four random crops at 70-80% of the original size.
        for i in range(4):
            scale = random.uniform(0.7, 0.8)
            new_w = int(w * scale)
            new_h = int(h * scale)
            left = random.randint(0, w - new_w)
            top = random.randint(0, h - new_h)
            cropped_image = image.crop((left, top, left + new_w, top + new_h))
            # cropped_image = cropped_image.resize((w, h))  # optionally resize back to the original size
            # Name crops by loop index so the suffix is short and unique
            # (the original embedded the raw random float in the filename).
            new_image_path = os.path.splitext(image_path)[0] + f'_crop{i}.JPG'
            cropped_image.save(os.path.join(root_dir, new_image_path))
            new_row = row.copy()
            new_row['image_path'] = new_image_path
            augmented_rows.append(new_row)

        for suffix, img in transformations.items():
            # With expand=True, a 90/270-degree rotation has no black borders;
            # its canvas is simply (h, w), so no extra crop is needed.
            new_image_path = os.path.splitext(image_path)[0] + f'_{suffix}.JPG'
            img.save(os.path.join(root_dir, new_image_path))
            new_row = row.copy()
            new_row['image_path'] = new_image_path
            augmented_rows.append(new_row)
    return augmented_rows
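
# A hypothetical single-image smoke test (the row below is a placeholder, not
# a real dataset entry); when the file exists, 9 rows come back
# (4 random crops + 5 rotations/flips):
# test_row = pd.Series({'image_path': 'example.JPG', 'poisonous': 1})
# print(len(augment_image((test_row, root_dir))))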
if __name__ == '__main__':
    # Fan the per-image work out across all CPU cores. The __main__ guard is
    # required so worker processes do not re-run this driver code.
    num_processes = cpu_count()
    tasks = [(row, root_dir) for _, row in poisonous_data.iterrows()]
    augmented_data = []
    with Pool(processes=num_processes) as pool:
        # tqdm shows progress as results come back in arbitrary order.
        for augmented_rows in tqdm(pool.imap_unordered(augment_image, tasks),
                                   total=len(tasks)):
            augmented_data.extend(augmented_rows)

    # DataFrame.append was removed in pandas 2.0; use pd.concat instead.
    new_data = pd.concat([new_data, pd.DataFrame(augmented_data)],
                         ignore_index=True)

    # Write the extended metadata for the next pipeline step.
    new_csv_path = '/data/cjm/FungiCLEF2024/Dataset/07_new_train_valmetadata.csv'
    new_data.to_csv(new_csv_path, index=False)
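
    # A minimal sanity check (a sketch, assuming every poisonous image existed
    # on disk): each one should contribute 9 new rows (4 crops + 5 rot/flips).
    expected = len(data) + 9 * len(poisonous_data)
    print(f'rows written: {len(new_data)} (expected at most {expected})')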