Hiroaki OGASAWARA committed
Commit 06a4fa8 · 1 Parent(s): 0aa8103

Upload folder using huggingface_hub

.gitignore ADDED
@@ -0,0 +1,5 @@
+ data/
+ figures/
+ models/*
+
+ __pycache__
README.md CHANGED
@@ -1,12 +1,29 @@
  ---
- title: Chiikawa Yonezu
- emoji: 🏢
- colorFrom: indigo
- colorTo: yellow
  sdk: gradio
  sdk_version: 4.13.0
- app_file: app.py
- pinned: false
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

  ---
+ title: chiikawa-yonezu
+ app_file: app.py
  sdk: gradio
  sdk_version: 4.13.0
  ---
+ # Chiikawa or Kenshi Yonezu classification task
+
+ ```powershell
+ conda env create -f environment.yml
+ conda activate chiikawa-yonezu
+ pip install fugashi ipadic
+ ```
+
+ ## Run gradio
+
+ ```powershell
+ conda activate chiikawa-yonezu
+ python app.py
+ # or
+ conda run -n chiikawa-yonezu python app.py # not recommended because standard output is not displayed
+ ```
+
+ ## Deploy to gradio
+
+ ```powershell
+ conda activate chiikawa-yonezu
+ gradio deploy
+ ```
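Once `python app.py` is running (or the Space is deployed), the Interface can also be called programmatically. A minimal sketch using `gradio_client` (pinned to 0.8.0 in `environment.yml`); the local URL is Gradio's default and is an assumption, as is the example text:

```python
# Query the running app; swap the URL for the Space URL after `gradio deploy`.
from gradio_client import Client

client = Client("http://127.0.0.1:7860/")  # assumed default local address
result = client.predict(
    "炊き立て・・・・ってコト!?",  # the single textbox input
    api_name="/predict",          # default endpoint name for a gr.Interface
)
print(result)  # label output: predicted class with per-class confidences
```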
app.py ADDED
@@ -0,0 +1,78 @@
+ from pprint import pprint
+
+ import gradio as gr
+ import torch
+
+ from safetensors import safe_open
+ from transformers import BertTokenizer
+
+ from utils.ClassifierModel import ClassifierModel
+
+
+ def _classify_text(text, model, device, tokenizer, max_length=20):
+     """
+     Output the probabilities that the text is 'ちいかわ' (Chiikawa) or '米津玄師' (Kenshi Yonezu).
+     """
+
+     # Tokenize the text and convert it to PyTorch tensors
+     inputs = tokenizer.encode_plus(
+         text,
+         add_special_tokens=True,
+         max_length=max_length,
+         padding="max_length",
+         truncation=True,
+         return_attention_mask=True,
+         return_tensors="pt",
+     )
+     pprint(f"inputs: {inputs}")
+
+     # Run model inference
+     model.eval()
+     with torch.no_grad():
+         outputs = model(
+             inputs["input_ids"].to(device), inputs["attention_mask"].to(device)
+         )
+     pprint(f"outputs: {outputs}")
+     probabilities = torch.nn.functional.softmax(outputs, dim=1)
+
+     # Extract the per-class probabilities
+     chiikawa_prob = probabilities[0][0].item()
+     yonezu_prob = probabilities[0][1].item()
+
+     return chiikawa_prob, yonezu_prob
+
+
+ is_cuda = torch.cuda.is_available()
+ device = torch.device("cuda" if is_cuda else "cpu")
+ pprint(f"device: {device}")
+
+ model_save_path = "models/model.safetensors"
+ tensors = {}
+ with safe_open(model_save_path, framework="pt", device="cpu") as f:
+     for key in f.keys():
+         tensors[key] = f.get_tensor(key)
+
+ inference_model: torch.nn.Module = ClassifierModel().to(device)
+ inference_model.load_state_dict(tensors)
+
+ tokenizer = BertTokenizer.from_pretrained(
+     "cl-tohoku/bert-base-japanese-whole-word-masking"
+ )
+
+
+ def classify_text(text):
+     chii_prob, yone_prob = _classify_text(text, inference_model, device, tokenizer)
+     return {"ちいかわ": chii_prob, "米津玄師": yone_prob}
+
+
+ demo = gr.Interface(
+     fn=classify_text,
+     inputs="textbox",
+     outputs="label",
+     examples=[
+         "炊き立て・・・・ってコト!?",
+         "晴れた空に種を蒔こう",
+     ],
+ )
+
+ demo.launch(share=True)  # Share your demo with just 1 extra parameter 🚀
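The `safe_open` loop in `app.py` builds a plain dict of tensors key by key; for reference, `safetensors.torch.load_file` collapses the loading step into one call. A sketch of the equivalent:

```python
# Equivalent to the safe_open loop above: load all tensors into a dict at once.
from safetensors.torch import load_file

tensors = load_file("models/model.safetensors", device="cpu")
inference_model.load_state_dict(tensors)
```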
environment.yml ADDED
@@ -0,0 +1,285 @@
+ name: chiikawa-yonezu
+ channels:
+   - pytorch
+   - nvidia
+   - conda-forge
+   - defaults
+ dependencies:
+   - aiohttp=3.9.1=py311ha68e1ae_0
+   - aiosignal=1.3.1=pyhd8ed1ab_0
+   - asttokens=2.4.1=pyhd8ed1ab_0
+   - attrs=23.2.0=pyh71513ae_0
+   - aws-c-auth=0.7.11=h9ca94be_0
+   - aws-c-cal=0.6.9=h7f0e5be_2
+   - aws-c-common=0.9.10=hcfcfb64_0
+   - aws-c-compression=0.2.17=h7f0e5be_7
+   - aws-c-event-stream=0.4.0=h51e6447_0
+   - aws-c-http=0.8.0=h80119a0_0
+   - aws-c-io=0.13.36=ha737126_3
+   - aws-c-mqtt=0.10.0=h2889a98_2
+   - aws-c-s3=0.4.7=h876bada_3
+   - aws-c-sdkutils=0.1.13=h7f0e5be_0
+   - aws-checksums=0.1.17=h7f0e5be_6
+   - aws-crt-cpp=0.26.0=h3a8c176_4
+   - aws-sdk-cpp=1.11.210=h79fa1a6_8
+   - blas=2.120=mkl
+   - blas-devel=3.9.0=20_win64_mkl
+   - brotli=1.1.0=hcfcfb64_1
+   - brotli-bin=1.1.0=hcfcfb64_1
+   - brotli-python=1.1.0=py311h12c1d0e_1
+   - bzip2=1.0.8=hcfcfb64_5
+   - c-ares=1.24.0=hcfcfb64_0
+   - ca-certificates=2023.11.17=h56e8100_0
+   - certifi=2023.11.17=pyhd8ed1ab_0
+   - charset-normalizer=3.3.2=pyhd8ed1ab_0
+   - colorama=0.4.6=pyhd8ed1ab_0
+   - comm=0.2.1=pyhd8ed1ab_0
+   - contourpy=1.2.0=py311h005e61a_0
+   - cuda-cccl=12.3.101=0
+   - cuda-cudart=12.1.105=0
+   - cuda-cudart-dev=12.1.105=0
+   - cuda-cupti=12.1.105=0
+   - cuda-libraries=12.1.0=0
+   - cuda-libraries-dev=12.1.0=0
+   - cuda-nvrtc=12.1.105=0
+   - cuda-nvrtc-dev=12.1.105=0
+   - cuda-nvtx=12.1.105=0
+   - cuda-opencl=12.3.101=0
+   - cuda-opencl-dev=12.3.101=0
+   - cuda-profiler-api=12.3.101=0
+   - cuda-runtime=12.1.0=0
+   - cycler=0.12.1=pyhd8ed1ab_0
+   - datasets=2.16.1=pyhd8ed1ab_0
+   - debugpy=1.8.0=py311h12c1d0e_1
+   - decorator=5.1.1=pyhd8ed1ab_0
+   - dill=0.3.7=pyhd8ed1ab_0
+   - exceptiongroup=1.2.0=pyhd8ed1ab_0
+   - executing=2.0.1=pyhd8ed1ab_0
+   - filelock=3.13.1=pyhd8ed1ab_0
+   - fonttools=4.47.0=py311ha68e1ae_0
+   - freetype=2.12.1=hdaf720e_2
+   - frozenlist=1.4.1=py311ha68e1ae_0
+   - fsspec=2023.10.0=pyhca7485f_0
+   - gettext=0.21.1=h5728263_0
+   - glib=2.78.3=h12be248_0
+   - glib-tools=2.78.3=h12be248_0
+   - gst-plugins-base=1.22.8=h001b923_0
+   - gstreamer=1.22.8=hb4038d2_0
+   - huggingface_hub=0.20.2=pyhd8ed1ab_0
+   - icu=73.2=h63175ca_0
+   - idna=3.6=pyhd8ed1ab_0
+   - importlib-metadata=7.0.1=pyha770c72_0
+   - importlib_metadata=7.0.1=hd8ed1ab_0
+   - intel-openmp=2023.2.0=h57928b3_50497
+   - ipykernel=6.28.0=pyha63f2e9_0
+   - ipython=8.19.0=pyh7428d3b_0
+   - jaconv=0.3.4=pyhd8ed1ab_0
+   - jedi=0.19.1=pyhd8ed1ab_0
+   - jinja2=3.1.2=pyhd8ed1ab_1
+   - joblib=1.3.2=pyhd8ed1ab_0
+   - jupyter_client=8.6.0=pyhd8ed1ab_0
+   - jupyter_core=5.7.0=py311h1ea47a8_0
+   - kiwisolver=1.4.5=py311h005e61a_1
+   - krb5=1.21.2=heb0366b_0
+   - lcms2=2.16=h67d730c_0
+   - lerc=4.0.0=h63175ca_0
+   - libabseil=20230802.1=cxx17_h63175ca_0
+   - libarrow=14.0.2=he5f67d5_2_cpu
+   - libarrow-acero=14.0.2=h63175ca_2_cpu
+   - libarrow-dataset=14.0.2=h63175ca_2_cpu
+   - libarrow-flight=14.0.2=h53b1db0_2_cpu
+   - libarrow-flight-sql=14.0.2=h78eab7c_2_cpu
+   - libarrow-gandiva=14.0.2=hb2eaab1_2_cpu
+   - libarrow-substrait=14.0.2=hd4c9904_2_cpu
+   - libblas=3.9.0=20_win64_mkl
+   - libbrotlicommon=1.1.0=hcfcfb64_1
+   - libbrotlidec=1.1.0=hcfcfb64_1
+   - libbrotlienc=1.1.0=hcfcfb64_1
+   - libcblas=3.9.0=20_win64_mkl
+   - libclang=15.0.7=default_h77d9078_3
+   - libclang13=15.0.7=default_h77d9078_3
+   - libcrc32c=1.1.2=h0e60522_0
+   - libcublas=12.1.0.26=0
+   - libcublas-dev=12.1.0.26=0
+   - libcufft=11.0.2.4=0
+   - libcufft-dev=11.0.2.4=0
+   - libcurand=10.3.4.107=0
+   - libcurand-dev=10.3.4.107=0
+   - libcurl=8.5.0=hd5e4a3a_0
+   - libcusolver=11.4.4.55=0
+   - libcusolver-dev=11.4.4.55=0
+   - libcusparse=12.0.2.55=0
+   - libcusparse-dev=12.0.2.55=0
+   - libdeflate=1.19=hcfcfb64_0
+   - libevent=2.1.12=h3671451_1
+   - libexpat=2.5.0=h63175ca_1
+   - libffi=3.4.2=h8ffe710_5
+   - libglib=2.78.3=h16e383f_0
+   - libgoogle-cloud=2.12.0=h39f2fc6_4
+   - libgrpc=1.59.3=h5bbd4a7_0
+   - libhwloc=2.9.3=default_haede6df_1009
+   - libiconv=1.17=hcfcfb64_2
+   - libjpeg-turbo=3.0.0=hcfcfb64_1
+   - liblapack=3.9.0=20_win64_mkl
+   - liblapacke=3.9.0=20_win64_mkl
+   - libnpp=12.0.2.50=0
+   - libnpp-dev=12.0.2.50=0
+   - libnvjitlink=12.1.105=0
+   - libnvjitlink-dev=12.1.105=0
+   - libnvjpeg=12.1.1.14=0
+   - libnvjpeg-dev=12.1.1.14=0
+   - libogg=1.3.4=h8ffe710_1
+   - libparquet=14.0.2=h7ec3a38_2_cpu
+   - libpng=1.6.39=h19919ed_0
+   - libprotobuf=4.24.4=hb8276f3_0
+   - libre2-11=2023.06.02=h8c5ae5e_0
+   - libsodium=1.0.18=h8d14728_1
+   - libsqlite=3.44.2=hcfcfb64_0
+   - libssh2=1.11.0=h7dfc565_0
+   - libthrift=0.19.0=ha2b3283_1
+   - libtiff=4.6.0=h6e2ebb7_2
+   - libutf8proc=2.8.0=h82a8f57_0
+   - libuv=1.44.2=hcfcfb64_1
+   - libvorbis=1.3.7=h0e60522_0
+   - libwebp-base=1.3.2=hcfcfb64_0
+   - libxcb=1.15=hcd874cb_0
+   - libxml2=2.11.6=hc3477c8_0
+   - libzlib=1.2.13=hcfcfb64_5
+   - lz4-c=1.9.4=hcfcfb64_0
+   - m2w64-gcc-libgfortran=5.3.0=6
+   - m2w64-gcc-libs=5.3.0=7
+   - m2w64-gcc-libs-core=5.3.0=7
+   - m2w64-gmp=6.1.0=2
+   - m2w64-libwinpthread-git=5.0.0.4634.697f757=2
+   - markupsafe=2.1.3=py311ha68e1ae_1
+   - matplotlib=3.8.2=py311h1ea47a8_0
+   - matplotlib-base=3.8.2=py311h6e989c2_0
+   - matplotlib-inline=0.1.6=pyhd8ed1ab_0
+   - mkl=2023.2.0=h6a75c08_50497
+   - mkl-devel=2023.2.0=h57928b3_50497
+   - mkl-include=2023.2.0=h6a75c08_50497
+   - mpmath=1.3.0=pyhd8ed1ab_0
+   - msys2-conda-epoch=20160418=1
+   - multidict=6.0.4=py311ha68e1ae_1
+   - multiprocess=0.70.15=py311ha68e1ae_1
+   - munkres=1.1.4=pyh9f0ad1d_0
+   - nest-asyncio=1.5.8=pyhd8ed1ab_0
+   - networkx=3.2.1=pyhd8ed1ab_0
+   - numpy=1.26.3=py311h0b4df5a_0
+   - openjpeg=2.5.0=h3d672ee_3
+   - openssl=3.2.0=hcfcfb64_1
+   - orc=1.9.2=hf0b6bd4_0
+   - packaging=23.2=pyhd8ed1ab_0
+   - pandas=2.1.4=py311hf63dbb6_0
+   - parso=0.8.3=pyhd8ed1ab_0
+   - pcre2=10.42=h17e33f8_0
+   - pickleshare=0.7.5=py_1003
+   - pillow=10.2.0=py311h4dd8a23_0
+   - pip=23.3.2=pyhd8ed1ab_0
+   - platformdirs=4.1.0=pyhd8ed1ab_0
+   - ply=3.11=py_1
+   - prompt-toolkit=3.0.42=pyha770c72_0
+   - psutil=5.9.7=py311ha68e1ae_0
+   - pthread-stubs=0.4=hcd874cb_1001
+   - pthreads-win32=2.9.1=hfa6e2cd_3
+   - pure_eval=0.2.2=pyhd8ed1ab_0
+   - pyarrow=14.0.2=py311h6a6099b_2_cpu
+   - pyarrow-hotfix=0.6=pyhd8ed1ab_0
+   - pygments=2.17.2=pyhd8ed1ab_0
+   - pyparsing=3.1.1=pyhd8ed1ab_0
+   - pyqt=5.15.9=py311h125bc19_5
+   - pyqt5-sip=12.12.2=py311h12c1d0e_5
+   - pysocks=1.7.1=pyh0701188_6
+   - python=3.11.7=h2628c8c_1_cpython
+   - python-dateutil=2.8.2=pyhd8ed1ab_0
+   - python-tzdata=2023.4=pyhd8ed1ab_0
+   - python-xxhash=3.4.1=py311ha68e1ae_0
+   - python_abi=3.11=4_cp311
+   - pytorch=2.1.2=py3.11_cuda12.1_cudnn8_0
+   - pytorch-cuda=12.1=hde6ce7c_5
+   - pytorch-mutex=1.0=cuda
+   - pytz=2023.3.post1=pyhd8ed1ab_0
+   - pywin32=306=py311h12c1d0e_2
+   - pyyaml=6.0.1=py311ha68e1ae_1
+   - pyzmq=25.1.2=py311h9250fbb_0
+   - qt-main=5.15.8=h9e85ed6_18
+   - re2=2023.06.02=hcbb65ff_0
+   - regex=2023.12.25=py311ha68e1ae_0
+   - requests=2.31.0=pyhd8ed1ab_0
+   - safetensors=0.3.3=py311hc37eb10_1
+   - scikit-learn=1.3.2=py311h142b183_2
+   - scipy=1.11.4=py311h0b4df5a_0
+   - setuptools=69.0.3=pyhd8ed1ab_0
+   - sip=6.7.12=py311h12c1d0e_0
+   - six=1.16.0=pyh6c4a22f_0
+   - snappy=1.1.10=hfb803bf_0
+   - stack_data=0.6.2=pyhd8ed1ab_0
+   - sympy=1.12=pyh04b8f61_3
+   - tbb=2021.11.0=h91493d7_0
+   - threadpoolctl=3.2.0=pyha21a80b_0
+   - tk=8.6.13=h5226925_1
+   - tokenizers=0.15.0=py311h91c4a10_1
+   - toml=0.10.2=pyhd8ed1ab_0
+   - tomli=2.0.1=pyhd8ed1ab_0
+   - tornado=6.3.3=py311ha68e1ae_1
+   - tqdm=4.66.1=pyhd8ed1ab_0
+   - traitlets=5.14.1=pyhd8ed1ab_0
+   - transformers=4.36.2=pyhd8ed1ab_0
+   - typing-extensions=4.9.0=hd8ed1ab_0
+   - typing_extensions=4.9.0=pyha770c72_0
+   - tzdata=2023d=h0c530f3_0
+   - ucrt=10.0.22621.0=h57928b3_0
+   - urllib3=2.1.0=pyhd8ed1ab_0
+   - vc=14.3=hcf57466_18
+   - vc14_runtime=14.38.33130=h82b7239_18
+   - vs2015_runtime=14.38.33130=hcb4865c_18
+   - wcwidth=0.2.12=pyhd8ed1ab_0
+   - wheel=0.42.0=pyhd8ed1ab_0
+   - win_inet_pton=1.1.0=pyhd8ed1ab_6
+   - xorg-libxau=1.0.11=hcd874cb_0
+   - xorg-libxdmcp=1.1.3=hcd874cb_0
+   - xxhash=0.8.2=hcfcfb64_0
+   - xz=5.2.6=h8d14728_0
+   - yaml=0.2.5=h8ffe710_2
+   - yarl=1.9.3=py311ha68e1ae_0
+   - zeromq=4.3.5=h63175ca_0
+   - zipp=3.17.0=pyhd8ed1ab_0
+   - zstd=1.5.5=h12be248_0
+   - pip:
+     - aiofiles==23.2.1
+     - altair==5.2.0
+     - annotated-types==0.6.0
+     - anyio==4.2.0
+     - click==8.1.7
+     - fastapi==0.108.0
+     - ffmpy==0.3.1
+     - gradio==4.13.0
+     - gradio-client==0.8.0
+     - h11==0.14.0
+     - httpcore==1.0.2
+     - httpx==0.26.0
+     - importlib-resources==6.1.1
+     - jsonschema==4.20.0
+     - jsonschema-specifications==2023.12.1
+     - markdown-it-py==3.0.0
+     - mdurl==0.1.2
+     - orjson==3.9.10
+     - pydantic==2.5.3
+     - pydantic-core==2.14.6
+     - pydub==0.25.1
+     - python-multipart==0.0.6
+     - referencing==0.32.1
+     - rich==13.7.0
+     - rpds-py==0.16.2
+     - semantic-version==2.10.0
+     - shellingham==1.5.4
+     - sniffio==1.3.0
+     - starlette==0.32.0.post1
+     - tomlkit==0.12.0
+     - toolz==0.12.0
+     - torchaudio==2.1.2
+     - torchvision==0.16.2
+     - typer==0.9.0
+     - uvicorn==0.25.0
+     - websockets==11.0.3
+
notebooks/embeddings.ipynb ADDED
@@ -0,0 +1,85 @@
+ {
+  "cells": [
+   {
+    "cell_type": "code",
+    "execution_count": 4,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "from transformers import BertTokenizer, BertModel\n",
+     "import torch\n",
+     "from pprint import pprint"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 5,
+    "metadata": {},
+    "outputs": [
+     {
+      "name": "stderr",
+      "output_type": "stream",
+      "text": [
+       "The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. \n",
+       "The tokenizer class you load from this checkpoint is 'BertJapaneseTokenizer'. \n",
+       "The class this function is called from is 'BertTokenizer'.\n"
+      ]
+     },
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "{'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]]),\n",
+       " 'input_ids': tensor([[ 2, 73, 371, 37, 1541, 546, 3]]),\n",
+       " 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0]])}\n",
+       "torch.Size([1, 7, 768])\n"
+      ]
+     }
+    ],
+    "source": [
+     "# Load the pretrained Japanese model and tokenizer\n",
+     "tokenizer = BertTokenizer.from_pretrained('cl-tohoku/bert-base-japanese-whole-word-masking')\n",
+     "model = BertModel.from_pretrained('cl-tohoku/bert-base-japanese-whole-word-masking')\n",
+     "\n",
+     "# Tokenize the text and convert it to PyTorch tensors\n",
+     "text = \"お正月休み\"\n",
+     "encoded_input = tokenizer(text, return_tensors='pt')\n",
+     "pprint(encoded_input)\n",
+     "\n",
+     "# Get the word embeddings\n",
+     "with torch.no_grad():\n",
+     "    output = model(**encoded_input)\n",
+     "    embeddings = output.last_hidden_state\n",
+     "    pprint(embeddings.shape)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": []
+   }
+  ],
+  "metadata": {
+   "kernelspec": {
+    "display_name": "chiikawa-yonezu",
+    "language": "python",
+    "name": "python3"
+   },
+   "language_info": {
+    "codemirror_mode": {
+     "name": "ipython",
+     "version": 3
+    },
+    "file_extension": ".py",
+    "mimetype": "text/x-python",
+    "name": "python",
+    "nbconvert_exporter": "python",
+    "pygments_lexer": "ipython3",
+    "version": "3.11.7"
+   }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 2
+ }
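The printed `torch.Size([1, 7, 768])` is (batch, tokens, hidden size). The `[CLS]` vector that `ClassifierModel` later pools for classification is simply the first of those 7 token embeddings, e.g.:

```python
# Slice out the [CLS] embedding -- the same pooling ClassifierModel.forward uses.
cls_embedding = embeddings[:, 0]
print(cls_embedding.shape)  # torch.Size([1, 768])
```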
notebooks/preprocessing.ipynb ADDED
@@ -0,0 +1,126 @@
+ {
+  "cells": [
+   {
+    "cell_type": "code",
+    "execution_count": 15,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "import csv\n",
+     "import jaconv\n",
+     "import re\n",
+     "\n",
+     "def preprocess(csv_path: str, preprocessed_csv_path: str):\n",
+     "    \"\"\"\n",
+     "    Read the given CSV file, apply the processing below, and save the result as {CSV filename}_preprocessed.csv.\n",
+     "    The CSV format is two columns: \"TEXT,LABEL\".\n",
+     "\n",
+     "    TEXT transformation rules:\n",
+     "    1. If a string contains half-width or full-width spaces or newlines, split it into multiple strings\n",
+     "    2. Remove the symbols (!,?,!,?,・,.,…,',\",♪,♫) and all emoji\n",
+     "    3. Remove substrings enclosed in () or ()\n",
+     "    4. Convert half-width katakana to full-width katakana, normalize wave dashes to ~, and - to ー\n",
+     "    5. Collapse two or more consecutive ~~ into ~ and ーー into ー\n",
+     "    6. Remove empty strings\n",
+     "\n",
+     "    Filtering is applied before saving:\n",
+     "    1. Remove rows whose TEXT is an empty string\n",
+     "    2. Remove rows whose TEXT and LABEL combination is duplicated\n",
+     "    \"\"\"\n",
+     "    # Read the CSV file\n",
+     "    with open(csv_path, 'r', encoding='utf-8') as file:\n",
+     "        reader = csv.reader(file)\n",
+     "        data = list(reader)\n",
+     "\n",
+     "    preprocessed_data = []\n",
+     "\n",
+     "    # Preprocess the TEXT column\n",
+     "    for i in range(len(data)):\n",
+     "        text, label = data[i]\n",
+     "        # Split the text into multiple strings if it contains spaces or newlines\n",
+     "        text = re.split(r'\\s+', text)\n",
+     "        # Remove symbols\n",
+     "        text = [re.sub(r'[!?!?・.…\\'\"’”\\♪♫]', '', word) for word in text]\n",
+     "        # Remove strings enclosed in parentheses\n",
+     "        text = [re.sub(r'\\(.*?\\)|(.*?)', '', word) for word in text]\n",
+     "        # Convert half-width katakana to full-width katakana\n",
+     "        text = [jaconv.h2z(word) for word in text]\n",
+     "        # Normalize wave dashes to ~ and - to ー\n",
+     "        # Note: 〜 (U+301C) is a different character from ~ (U+FF5E)\n",
+     "        text = [re.sub(r'[~〜]', '~', word) for word in text]\n",
+     "        text = [re.sub(r'-', 'ー', word) for word in text]\n",
+     "        # Collapse runs of ~ into a single ~ and runs of ー into a single ー\n",
+     "        text = [re.sub(r'~+', '~', word) for word in text]\n",
+     "        text = [re.sub(r'ー+', 'ー', word) for word in text]\n",
+     "\n",
+     "        # Keep only non-empty strings\n",
+     "        for word in text:\n",
+     "            if word != '':\n",
+     "                preprocessed_data.append([word, label])\n",
+     "\n",
+     "    # Remove duplicate rows based on TEXT and LABEL combination\n",
+     "    preprocessed_data = [list(x) for x in set(tuple(x) for x in preprocessed_data)]\n",
+     "\n",
+     "    # Sort the data by LABEL, TEXT\n",
+     "    preprocessed_data.sort(key=lambda x: (x[1], x[0]))\n",
+     "\n",
+     "    # Save the preprocessed data to a new CSV file\n",
+     "    with open(preprocessed_csv_path, 'w', encoding='utf-8', newline='') as file:\n",
+     "        writer = csv.writer(file)\n",
+     "        writer.writerows(preprocessed_data)\n"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 16,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "import pandas as pd\n",
+     "from sklearn.model_selection import train_test_split\n",
+     "\n",
+     "\n",
+     "def split(csv_path: str):\n",
+     "    # Read the original CSV file\n",
+     "    df = pd.read_csv(csv_path, encoding='utf-8')\n",
+     "\n",
+     "    # Split into training and test datasets\n",
+     "    train_df, test_df = train_test_split(df, test_size=0.05)  # keep the held-out set small for speed\n",
+     "\n",
+     "    # Save as new CSV files\n",
+     "    train_df.to_csv(csv_path.replace('.csv', '_train.csv'), index=False)\n",
+     "    test_df.to_csv(csv_path.replace('.csv', '_test.csv'), index=False)\n"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 17,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "csv_path = '../data/data.csv'\n",
+     "preprocessed_csv_path = csv_path.replace('.csv', '_preprocessed.csv')\n",
+     "preprocess(csv_path, preprocessed_csv_path)\n",
+     "split(preprocessed_csv_path)"
+    ]
+   }
+  ],
+  "metadata": {
+   "kernelspec": {
+    "display_name": "chiikawa-yonezu",
+    "language": "python",
+    "name": "python3"
+   },
+   "language_info": {
+    "codemirror_mode": {
+     "name": "ipython",
+     "version": 3
+    },
+    "file_extension": ".py",
+    "mimetype": "text/x-python",
+    "name": "python",
+    "nbconvert_exporter": "python",
+    "pygments_lexer": "ipython3",
+    "version": "3.11.7"
+   }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 2
+ }
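To make the docstring's TEXT rules concrete, here is a small standalone sketch of rules 1-3 and 6 on a made-up input (the sample string and the simplified symbol class are illustrative, not from the dataset):

```python
import re

text = "炊き立て・・・・ってコト!? (ナレーション)"            # hypothetical raw TEXT
words = re.split(r'\s+', text)                            # rule 1: split on whitespace
words = [re.sub(r'[!?!?・.…♪♫]', '', w) for w in words]  # rule 2: strip symbols
words = [re.sub(r'\(.*?\)|(.*?)', '', w) for w in words]  # rule 3: drop (...) / (...) spans
words = [w for w in words if w != '']                     # rule 6: drop empty strings
print(words)  # ['炊き立てってコト']
```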
notebooks/train.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ gradio
+ safetensors
+ torch
+ transformers
utils/ClassifierModel.py ADDED
@@ -0,0 +1,16 @@
+ import torch
+ from transformers import BertModel
+
+
+ class ClassifierModel(torch.nn.Module):
+     def __init__(self):
+         super(ClassifierModel, self).__init__()
+         self.bert = BertModel.from_pretrained('cl-tohoku/bert-base-japanese-whole-word-masking')
+         self.linear = torch.nn.Linear(768, 2)  # BERT hidden size and number of output classes
+
+     def forward(self, input_ids, attention_mask):
+         with torch.no_grad():
+             outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
+         last_hidden_state = outputs[0]
+         pooled_output = last_hidden_state[:, 0]
+         return self.linear(pooled_output)
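For reference, a minimal smoke-test sketch of the model's contract; with fresh (untrained) weights the logits are meaningless until `models/model.safetensors` is loaded, so only the shapes matter here:

```python
# One sentence in, a (1, 2) logit tensor out.
import torch
from transformers import BertTokenizer
from utils.ClassifierModel import ClassifierModel

tokenizer = BertTokenizer.from_pretrained("cl-tohoku/bert-base-japanese-whole-word-masking")
model = ClassifierModel()

inputs = tokenizer("晴れた空に種を蒔こう", return_tensors="pt")
logits = model(inputs["input_ids"], inputs["attention_mask"])
print(logits.shape)  # torch.Size([1, 2]) -- one logit each for Chiikawa and Kenshi Yonezu
```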