Valeriy Sinyukov committed on
Commit
43a63e6
·
1 Parent(s): 35e0bf8

Ipynb preparing Russian dataset

Browse files
category_classification/datasets/prepare_ru_dataset.ipynb ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "from pathlib import Path\n",
10
+ "\n",
11
+ "import pandas as pd\n",
12
+ "\n",
13
+ "from datasets_common import write_dataset, train_test_split"
14
+ ]
15
+ },
16
+ {
17
+ "cell_type": "code",
18
+ "execution_count": 2,
19
+ "metadata": {},
20
+ "outputs": [],
21
+ "source": [
22
+ "dataset_dir = Path('ru')\n",
23
+ "parts_dir = dataset_dir / 'dataset_parts' "
24
+ ]
25
+ },
26
+ {
27
+ "cell_type": "code",
28
+ "execution_count": 3,
29
+ "metadata": {},
30
+ "outputs": [],
31
+ "source": [
32
+ "dfs = []\n",
33
+ "for dataset_path in parts_dir.glob(\"*.csv\"):\n",
34
+ " dfs.append(pd.read_csv(dataset_path))\n",
35
+ "df = pd.concat(dfs)"
36
+ ]
37
+ },
38
+ {
39
+ "cell_type": "code",
40
+ "execution_count": 4,
41
+ "metadata": {},
42
+ "outputs": [],
43
+ "source": [
44
+ "label_name = 'category'\n",
45
+ "df = df.rename(columns={'categories': label_name})\n",
46
+ "for column in df.columns:\n",
47
+ " def transform_cell(value):\n",
48
+ " prefixes = [\"{'translation_text': '\", \"{\\'translation_text\\': \\'\", \"\\'translation_text\\':\", \"{'translation_text': \\\"\"]\n",
49
+ " suffix = \"\\'}\"\n",
50
+ " for prefix in prefixes:\n",
51
+ " if value.startswith(prefix):\n",
52
+ " value = value[len(prefix):]\n",
53
+ " if value.endswith(suffix):\n",
54
+ " value = value[:-len(suffix)]\n",
55
+ " return value\n",
56
+ " df[column] = df[column].apply(transform_cell)"
57
+ ]
58
+ },
59
+ {
60
+ "cell_type": "code",
61
+ "execution_count": 5,
62
+ "metadata": {},
63
+ "outputs": [],
64
+ "source": [
65
+ "y = df[label_name]\n",
66
+ "X = df.drop(columns=label_name)"
67
+ ]
68
+ },
69
+ {
70
+ "cell_type": "code",
71
+ "execution_count": 6,
72
+ "metadata": {},
73
+ "outputs": [],
74
+ "source": [
75
+ "X_train, X_test, y_train, y_test = train_test_split(X, y)"
76
+ ]
77
+ },
78
+ {
79
+ "cell_type": "code",
80
+ "execution_count": 7,
81
+ "metadata": {},
82
+ "outputs": [],
83
+ "source": [
84
+ "train_filename = \"arxiv_train.csv\"\n",
85
+ "test_filename = \"arxiv_test.csv\"\n",
86
+ "write_dataset(dest_dir=dataset_dir, X=X_train, y=y_train, filename=train_filename, to_json=False)\n",
87
+ "write_dataset(dest_dir=dataset_dir, X=X_test, y=y_test, filename=test_filename, to_json=False)"
88
+ ]
89
+ }
90
+ ],
91
+ "metadata": {
92
+ "kernelspec": {
93
+ "display_name": ".venv",
94
+ "language": "python",
95
+ "name": "python3"
96
+ },
97
+ "language_info": {
98
+ "codemirror_mode": {
99
+ "name": "ipython",
100
+ "version": 3
101
+ },
102
+ "file_extension": ".py",
103
+ "mimetype": "text/x-python",
104
+ "name": "python",
105
+ "nbconvert_exporter": "python",
106
+ "pygments_lexer": "ipython3",
107
+ "version": "3.10.12"
108
+ }
109
+ },
110
+ "nbformat": 4,
111
+ "nbformat_minor": 2
112
+ }