File size: 5,007 Bytes
06a4fa8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5e0e168
06a4fa8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "import csv\n",
    "import jaconv\n",
    "import re\n",
    "\n",
    "def preprocess(csv_path: str, preprocessed_csv_path: str):\n",
    "    \"\"\"\n",
    "    与えられたCSVファイル(csv_path)を読み込んだのち、以下の処理をしてから、preprocessed_csv_pathで指定されたパスに保存する。\n",
    "    CSVファイルのフォーマットは\"TEXT,LABEL\"の2列である。\n",
    "\n",
    "    TEXTの変換ルールは次の通り。\n",
    "    1. 文字列が半角・全角スペース・改行を含む場合、その文字列を複数の文字列に分割する\n",
    "    2. 記号(!,?,!,?,・,.,…,',\",♪,♫)と全ての絵文字を削除する\n",
    "    3. 半角の()または全角の()で囲まれた文字列を削除する\n",
    "    4. 半角カタカナを全角カタカナに、~を~に、-をーに変換する\n",
    "    5. 2つ以上連続する~~を~に、ーーをーに変換する\n",
    "    6. 空文字列を削除する\n",
    "\n",
    "    保存する前にフィルタリングを行う。\n",
    "    1. TEXTが空文字列の行を削除する\n",
    "    2. TEXTとLABELの組み合わせが重複している行を削除する\n",
    "    \"\"\"\n",
    "    # Read the CSV file\n",
    "    with open(csv_path, 'r', encoding='utf-8') as file:\n",
    "        reader = csv.reader(file)\n",
    "        data = list(reader)\n",
    "    \n",
    "    preprocessed_data = []\n",
    "\n",
    "    # Preprocess the TEXT column\n",
    "    for i in range(len(data)):\n",
    "        text, label = data[i]\n",
    "        # Split the text into multiple strings if it contains spaces or newlines\n",
    "        text = re.split(r'\\s+', text)\n",
    "        # Remove symbols\n",
    "        text = [re.sub(r'[!?!?・.…\\'\"’”\\♪♫]', '', word) for word in text]\n",
    "        # Remove strings enclosed in parentheses\n",
    "        text = [re.sub(r'\\(.*?\\)|(.*?)', '', word) for word in text]\n",
    "        # Convert half-width katakana to full-width katakana\n",
    "        text = [jaconv.h2z(word) for word in text]\n",
    "        # Map both ~ and 〜 to ~, and convert - to ー\n",
    "        # Note: 〜 (U+301C, wave dash) is a different character from ~ (U+FF5E, fullwidth tilde)\n",
    "        text = [re.sub(r'[~〜]', '~', word) for word in text]\n",
    "        text = [re.sub(r'-', 'ー', word) for word in text]\n",
    "        # Convert multiple consecutive ~ to ~ and ーー to ー\n",
    "        text = [re.sub(r'~+', '~', word) for word in text]\n",
    "        text = [re.sub(r'ー+', 'ー', word) for word in text]\n",
    "        \n",
    "        [preprocessed_data.append([word, label]) for word in text if word != '' ]\n",
    "\n",
    "    # Remove duplicate rows based on TEXT and LABEL combination\n",
    "    preprocessed_data = [list(x) for x in set(tuple(x) for x in preprocessed_data)]\n",
    "\n",
    "    # Sort the data by LABEL, TEXT\n",
    "    preprocessed_data.sort(key=lambda x: (x[1], x[0]))\n",
    "\n",
    "    # Save the preprocessed data to a new CSV file\n",
    "    with open(preprocessed_csv_path, 'w', encoding='utf-8', newline='') as file:\n",
    "        writer = csv.writer(file)\n",
    "        writer.writerows(preprocessed_data)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "\n",
    "def split(csv_path: str):\n",
    "    # 元のCSVファイルを読み込む\n",
    "    df = pd.read_csv(csv_path, encoding='utf-8')\n",
    "\n",
    "    # 訓練用データセットとテスト用データセットに分割\n",
    "    train_df, test_df = train_test_split(df, test_size=0.05) # 高速化のため検証データの数を減らす\n",
    "\n",
    "    # 新しいCSVファイルとして保存\n",
    "    train_df.to_csv(csv_path.replace('.csv', '_train.csv'), index=False)\n",
    "    test_df.to_csv(csv_path.replace('.csv', '_test.csv'), index=False)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "csv_path = '../data/datawithrakko.csv'\n",
    "preprocessed_csv_path = csv_path.replace('.csv', '_preprocessed.csv')\n",
    "preprocess(csv_path, preprocessed_csv_path)\n",
    "split(preprocessed_csv_path)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "chiikawa-yonezu",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}