{ "cells": [ { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "import csv\n", "import jaconv\n", "import re\n", "\n", "def preprocess(csv_path: str, preprocessed_csv_path: str):\n", " \"\"\"\n", " 与えられたCSVファイルを読み込んだのち、以下の処理をしてから、{CSVファイル名}_preprocessed.csvとして保存する\n", " CSVファイルのフォーマットは\"TEXT,LABEL\"の2列である。\n", "\n", " TEXTの変換ルールは次の通り。\n", " 1. 文字列が半角・全角スペース・改行を含む場合、その文字列を複数の文字列に分割する\n", " 2. 記号(!,?,!,?,・,.,…,',\",♪,♫)と全ての絵文字を削除する\n", " 3. ()または()で囲まれた文字列を削除する\n", " 4. 半角カタカナを全角カタカナに、~を~に、-をーに変換する\n", " 5. 2つ以上連続する~~を~に、ーーをーに変換する\n", " 6. 空文字列を削除する\n", "\n", " 保存する前にフィルタリングを行う。\n", " 1. TEXTが空文字列の行を削除する\n", " 2. TEXTとLABELの組み合わせが重複している行を削除する\n", " \"\"\"\n", " # Read the CSV file\n", " with open(csv_path, 'r', encoding='utf-8') as file:\n", " reader = csv.reader(file)\n", " data = list(reader)\n", " \n", " preprocessed_data = []\n", "\n", " # Preprocess the TEXT column\n", " for i in range(len(data)):\n", " text, label = data[i]\n", " # Split the text into multiple strings if it contains spaces or newlines\n", " text = re.split(r'\\s+', text)\n", " # Remove symbols\n", " text = [re.sub(r'[!?!?・.…\\'\"’”\\♪♫]', '', word) for word in text]\n", " # Remove strings enclosed in parentheses\n", " text = [re.sub(r'\\(.*?\\)|(.*?)', '', word) for word in text]\n", " # Convert half-width katakana to full-width katakana\n", " text = [jaconv.h2z(word) for word in text]\n", " # Convert ~ to ~ and - to ー\n", " # Note: 〜(U+301C) is a different character from ~(U+FF5E\n", " text = [re.sub(r'[~〜]', '~', word) for word in text]\n", " text = [re.sub(r'-', 'ー', word) for word in text]\n", " # Convert multiple consecutive ~ to ~ and ーー to ー\n", " text = [re.sub(r'~+', '~', word) for word in text]\n", " text = [re.sub(r'ー+', 'ー', word) for word in text]\n", " \n", " [preprocessed_data.append([word, label]) for word in text if word != '' ]\n", "\n", " # Remove duplicate rows based on TEXT and LABEL combination\n", " preprocessed_data = [list(x) for x in set(tuple(x) for x in preprocessed_data)]\n", "\n", " # Sort the data by LABEL, TEXT\n", " preprocessed_data.sort(key=lambda x: (x[1], x[0]))\n", "\n", " # Save the preprocessed data to a new CSV file\n", " with open(preprocessed_csv_path, 'w', encoding='utf-8', newline='') as file:\n", " writer = csv.writer(file)\n", " writer.writerows(preprocessed_data)\n" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "from sklearn.model_selection import train_test_split\n", "\n", "\n", "def split(csv_path: str):\n", " # 元のCSVファイルを読み込む\n", " df = pd.read_csv(csv_path, encoding='utf-8')\n", "\n", " # 訓練用データセットとテスト用データセットに分割\n", " train_df, test_df = train_test_split(df, test_size=0.05) # 高速化のため検証データの数を減らす\n", "\n", " # 新しいCSVファイルとして保存\n", " train_df.to_csv(csv_path.replace('.csv', '_train.csv'), index=False)\n", " test_df.to_csv(csv_path.replace('.csv', '_test.csv'), index=False)\n" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "csv_path = '../data/datawithrakko.csv'\n", "preprocessed_csv_path = csv_path.replace('.csv', '_preprocessed.csv')\n", "preprocess(csv_path, preprocessed_csv_path)\n", "split(preprocessed_csv_path)" ] } ], "metadata": { "kernelspec": { "display_name": "chiikawa-yonezu", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", 
"pygments_lexer": "ipython3", "version": "3.11.7" } }, "nbformat": 4, "nbformat_minor": 2 }