Spaces:
Runtime error
Runtime error
File size: 5,007 Bytes
06a4fa8 5e0e168 06a4fa8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 |
{
"cells": [
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"import csv\n",
"import jaconv\n",
"import re\n",
"\n",
"# Patterns are compiled once at cell level rather than on every loop iteration.\n",
"# Full-width characters are written as \\uXXXX escapes so they cannot be silently\n",
"# folded into their ASCII look-alikes by copy/paste or Unicode normalization.\n",
"_WHITESPACE_RE = re.compile(r'\\s+')\n",
"_SYMBOLS_RE = re.compile(r'[!?\\uFF01\\uFF1F・.…\\'\"’”♪♫]')\n",
"_PARENTHESIZED_RE = re.compile(r'\\(.*?\\)|\\uFF08.*?\\uFF09')\n",
"_TILDE_RUN_RE = re.compile(r'[~\\u301C\\uFF5E]+')\n",
"_LONG_VOWEL_RUN_RE = re.compile(r'[-ー]+')\n",
"_FULLWIDTH_TILDE = '\\uFF5E'\n",
"\n",
"\n",
"def _normalize_word(word: str) -> str:\n",
"    \"\"\"Apply the per-fragment cleanup rules to one whitespace-free fragment.\"\"\"\n",
"    word = _SYMBOLS_RE.sub('', word)\n",
"    word = _PARENTHESIZED_RE.sub('', word)\n",
"    # Convert half-width katakana to full-width katakana.\n",
"    word = jaconv.h2z(word)\n",
"    # ASCII tilde and wave dash (U+301C) become the full-width tilde (U+FF5E);\n",
"    # matching a whole run also collapses repeats to a single character.\n",
"    word = _TILDE_RUN_RE.sub(_FULLWIDTH_TILDE, word)\n",
"    # Hyphens become the long-vowel mark; runs collapse the same way.\n",
"    return _LONG_VOWEL_RUN_RE.sub('ー', word)\n",
"\n",
"\n",
"def preprocess(csv_path: str, preprocessed_csv_path: str):\n",
"    \"\"\"Clean a two-column (TEXT, LABEL) CSV and write the result to a new CSV.\n",
"\n",
"    Each TEXT value is processed as follows:\n",
"      1. Split on whitespace (half-width/full-width spaces, newlines); every\n",
"         fragment becomes its own output row carrying the original LABEL.\n",
"      2. Remove ! and ? (ASCII and full-width), the middle dot, period,\n",
"         ellipsis, straight and right-hand quote marks, and music notes.\n",
"      3. Remove text enclosed in ASCII or full-width parentheses. Splitting\n",
"         happens first, so a parenthesized span containing whitespace is\n",
"         not removed.\n",
"      4. Convert half-width katakana to full-width, ASCII tilde and wave dash\n",
"         (U+301C) to full-width tilde (U+FF5E), and hyphen to the long-vowel mark.\n",
"      5. Collapse runs of full-width tildes or long-vowel marks to one character.\n",
"      6. Drop fragments that end up empty.\n",
"\n",
"    Duplicate (TEXT, LABEL) pairs are written once, and rows are sorted by\n",
"    LABEL, then TEXT.\n",
"\n",
"    NOTE(review): the original specification also lists removal of all emoji,\n",
"    but no step here implements it.\n",
"\n",
"    Args:\n",
"        csv_path: Path of the UTF-8 input CSV.\n",
"        preprocessed_csv_path: Path the cleaned CSV is written to.\n",
"\n",
"    Raises:\n",
"        ValueError: If a non-blank row does not have exactly two columns.\n",
"    \"\"\"\n",
"    # A set removes duplicate (TEXT, LABEL) pairs as they are produced.\n",
"    unique_rows = set()\n",
"\n",
"    # newline='' lets the csv module handle line endings inside quoted fields.\n",
"    with open(csv_path, 'r', encoding='utf-8', newline='') as file:\n",
"        for row_number, row in enumerate(csv.reader(file), start=1):\n",
"            if not row:\n",
"                # csv.reader yields an empty list for a blank line.\n",
"                continue\n",
"            if len(row) != 2:\n",
"                raise ValueError(\n",
"                    f'{csv_path}: row {row_number} has {len(row)} columns, expected 2 (TEXT, LABEL)'\n",
"                )\n",
"            text, label = row\n",
"            for fragment in _WHITESPACE_RE.split(text):\n",
"                word = _normalize_word(fragment)\n",
"                if word:\n",
"                    unique_rows.add((word, label))\n",
"\n",
"    # Sort by LABEL, then TEXT so the output order is deterministic.\n",
"    preprocessed_data = sorted(unique_rows, key=lambda pair: (pair[1], pair[0]))\n",
"\n",
"    with open(preprocessed_csv_path, 'w', encoding='utf-8', newline='') as file:\n",
"        csv.writer(file).writerows(preprocessed_data)\n"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"\n",
"def split(csv_path: str, test_size: float = 0.05, random_state: int | None = None):\n",
"    \"\"\"Split a CSV into train and test CSVs written beside the input file.\n",
"\n",
"    The outputs are named by inserting _train / _test before the extension of\n",
"    csv_path (data.csv -> data_train.csv and data_test.csv).\n",
"\n",
"    Args:\n",
"        csv_path: Path of the UTF-8 CSV to split.\n",
"        test_size: Passed to train_test_split. The default keeps the test set\n",
"            small (5%) to speed up evaluation.\n",
"        random_state: Passed to train_test_split. Set it to make the split\n",
"            reproducible; the default (None) leaves the split unseeded.\n",
"    \"\"\"\n",
"    # Function-scope import: this cell's import block does not provide os.\n",
"    import os\n",
"\n",
"    # NOTE(review): read_csv's default consumes the first row as column names --\n",
"    # confirm that csv_path really starts with a header row.\n",
"    df = pd.read_csv(csv_path, encoding='utf-8')\n",
"\n",
"    train_df, test_df = train_test_split(df, test_size=test_size, random_state=random_state)\n",
"\n",
"    # Insert the suffix before the extension. str.replace('.csv', ...) rewrote every\n",
"    # '.csv' occurrence in the path and, for a path without '.csv', returned the path\n",
"    # unchanged, so both outputs overwrote the input file.\n",
"    root, ext = os.path.splitext(csv_path)\n",
"    train_df.to_csv(f'{root}_train{ext}', index=False)\n",
"    test_df.to_csv(f'{root}_test{ext}', index=False)\n"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"# Pipeline entry point: clean the raw CSV, then split the cleaned file into train/test CSVs.\n",
"csv_path = '../data/datawithrakko.csv'\n",
"# The cleaned file is written beside the input with a _preprocessed suffix.\n",
"preprocessed_csv_path = csv_path.replace('.csv', '_preprocessed.csv')\n",
"preprocess(csv_path=csv_path, preprocessed_csv_path=preprocessed_csv_path)\n",
"split(csv_path=preprocessed_csv_path)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "chiikawa-yonezu",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
|