Spaces:

xhiroga
/

chiikawa-yonezu

Runtime error

File size: 5,007 Bytes

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "import csv\n",
    "import jaconv\n",
    "import re\n",
    "\n",
    "def preprocess(csv_path: str, preprocessed_csv_path: str):\n",
    "    \"\"\"\n",
    "    与えられたCSVファイルを読み込んだのち、以下の処理をしてから、{CSVファイル名}_preprocessed.csvとして保存する\n",
    "    CSVファイルのフォーマットは\"TEXT,LABEL\"の2列である。\n",
    "\n",
    "    TEXTの変換ルールは次の通り。\n",
    "    1. 文字列が半角・全角スペース・改行を含む場合、その文字列を複数の文字列に分割する\n",
    "    2. 記号（！,？,!,?,・,.,…,',\",♪,♫）と全ての絵文字を削除する\n",
    "    3. ()または（）で囲まれた文字列を削除する\n",
    "    4. 半角カタカナを全角カタカナに、~を～に、-をーに変換する\n",
    "    5. 2つ以上連続する～～を～に、ーーをーに変換する\n",
    "    6. 空文字列を削除する\n",
    "\n",
    "    保存する前にフィルタリングを行う。\n",
    "    1. TEXTが空文字列の行を削除する\n",
    "    2. TEXTとLABELの組み合わせが重複している行を削除する\n",
    "    \"\"\"\n",
    "    # Read the CSV file\n",
    "    with open(csv_path, 'r', encoding='utf-8') as file:\n",
    "        reader = csv.reader(file)\n",
    "        data = list(reader)\n",
    "    \n",
    "    preprocessed_data = []\n",
    "\n",
    "    # Preprocess the TEXT column\n",
    "    for i in range(len(data)):\n",
    "        text, label = data[i]\n",
    "        # Split the text into multiple strings if it contains spaces or newlines\n",
    "        text = re.split(r'\\s+', text)\n",
    "        # Remove symbols\n",
    "        text = [re.sub(r'[！？!?・.…\\'\"’”\\♪♫]', '', word) for word in text]\n",
    "        # Remove strings enclosed in parentheses\n",
    "        text = [re.sub(r'\\(.*?\\)|（.*?）', '', word) for word in text]\n",
    "        # Convert half-width katakana to full-width katakana\n",
    "        text = [jaconv.h2z(word) for word in text]\n",
    "        # Convert ~ to ～ and - to ー\n",
    "        # Note: 〜(U+301C) is a different character from ～(U+FF5E\n",
    "        text = [re.sub(r'[~〜]', '～', word) for word in text]\n",
    "        text = [re.sub(r'-', 'ー', word) for word in text]\n",
    "        # Convert multiple consecutive ～ to ～ and ーー to ー\n",
    "        text = [re.sub(r'～+', '～', word) for word in text]\n",
    "        text = [re.sub(r'ー+', 'ー', word) for word in text]\n",
    "        \n",
    "        [preprocessed_data.append([word, label]) for word in text if word != '' ]\n",
    "\n",
    "    # Remove duplicate rows based on TEXT and LABEL combination\n",
    "    preprocessed_data = [list(x) for x in set(tuple(x) for x in preprocessed_data)]\n",
    "\n",
    "    # Sort the data by LABEL, TEXT\n",
    "    preprocessed_data.sort(key=lambda x: (x[1], x[0]))\n",
    "\n",
    "    # Save the preprocessed data to a new CSV file\n",
    "    with open(preprocessed_csv_path, 'w', encoding='utf-8', newline='') as file:\n",
    "        writer = csv.writer(file)\n",
    "        writer.writerows(preprocessed_data)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "\n",
    "def split(csv_path: str):\n",
    "    # 元のCSVファイルを読み込む\n",
    "    df = pd.read_csv(csv_path, encoding='utf-8')\n",
    "\n",
    "    # 訓練用データセットとテスト用データセットに分割\n",
    "    train_df, test_df = train_test_split(df, test_size=0.05) # 高速化のため検証データの数を減らす\n",
    "\n",
    "    # 新しいCSVファイルとして保存\n",
    "    train_df.to_csv(csv_path.replace('.csv', '_train.csv'), index=False)\n",
    "    test_df.to_csv(csv_path.replace('.csv', '_test.csv'), index=False)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "csv_path = '../data/datawithrakko.csv'\n",
    "preprocessed_csv_path = csv_path.replace('.csv', '_preprocessed.csv')\n",
    "preprocess(csv_path, preprocessed_csv_path)\n",
    "split(preprocessed_csv_path)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "chiikawa-yonezu",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}