licesma commited on
Commit
4e76f69
·
1 Parent(s): bffa994

train data eda notebook

Browse files
deepseek-coder-1.3b-instruct/train_data_eda.ipynb CHANGED
@@ -2,12 +2,12 @@
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
- "execution_count": 6,
6
  "metadata": {},
7
  "outputs": [],
8
  "source": [
9
  "import pandas as pd\n",
10
- "\n",
11
  "df = pd.read_csv(\"../train-data/sql_train.tsv\", sep=\"\\t\")"
12
  ]
13
  },
@@ -31,36 +31,157 @@
31
  "df.columns"
32
  ]
33
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  {
35
  "cell_type": "code",
36
- "execution_count": null,
37
  "metadata": {},
38
  "outputs": [
39
  {
40
  "data": {
41
  "text/plain": [
42
- "count 1044\n",
43
- "unique 1043\n",
44
- "top SELECT ROUND(AVG(pts_home),2) AS avg_home_poin...\n",
45
- "freq 2\n",
46
- "Name: sql_query, dtype: object"
47
  ]
48
  },
49
- "execution_count": 8,
50
  "metadata": {},
51
  "output_type": "execute_result"
52
  }
53
  ],
54
  "source": [
55
- "df['sql_query'].str.len().describe()"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
  ]
57
  },
58
  {
59
  "cell_type": "code",
60
- "execution_count": null,
61
  "metadata": {},
62
  "outputs": [],
63
- "source": []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
  }
65
  ],
66
  "metadata": {
 
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
+ "execution_count": 18,
6
  "metadata": {},
7
  "outputs": [],
8
  "source": [
9
  "import pandas as pd\n",
10
+ "import re\n",
11
  "df = pd.read_csv(\"../train-data/sql_train.tsv\", sep=\"\\t\")"
12
  ]
13
  },
 
31
  "df.columns"
32
  ]
33
  },
34
+ {
35
+ "cell_type": "markdown",
36
+ "metadata": {},
37
+ "source": [
38
+ "## By character count"
39
+ ]
40
+ },
41
+ {
42
+ "cell_type": "code",
43
+ "execution_count": 14,
44
+ "metadata": {},
45
+ "outputs": [],
46
+ "source": [
47
+ "# Subset of rows whose sql_query text is shorter than 90 characters.\n",
+ "short_queries = df[df['sql_query'].str.len() < 90]"
48
+ ]
49
+ },
50
+ {
51
+ "cell_type": "code",
52
+ "execution_count": 17,
53
+ "metadata": {},
54
+ "outputs": [],
55
+ "source": [
56
+ "# Write the short-query subset as a tab-separated file, omitting the index column.\n",
+ "short_queries.to_csv(\n",
+ "    \"../train-data/less_than_90.tsv\",\n",
+ "    sep=\"\\t\",\n",
+ "    index=False,\n",
+ ")"
57
+ ]
58
+ },
59
+ {
60
+ "cell_type": "markdown",
61
+ "metadata": {},
62
+ "source": [
63
+ "## By table after FROM"
64
+ ]
65
+ },
66
+ {
67
+ "cell_type": "code",
68
+ "execution_count": 25,
69
+ "metadata": {},
70
+ "outputs": [],
71
+ "source": [
72
+ "# First identifier following the first FROM keyword in each query (NaN when nothing matches).\n",
+ "# The \\b anchor stops identifiers that merely end in 'from' from being treated as the keyword.\n",
+ "# NOTE(review): a FROM inside an expression such as EXTRACT(... FROM ...) would still match first - confirm the data has none.\n",
+ "df['after_from'] = df['sql_query'].str.extract(r'\\bFROM\\s+(\\w+)', flags=re.IGNORECASE, expand=False)"
73
+ ]
74
+ },
75
+ {
76
+ "cell_type": "code",
77
+ "execution_count": 26,
78
+ "metadata": {},
79
+ "outputs": [
80
+ {
81
+ "data": {
82
+ "text/plain": [
83
+ "0 team\n",
84
+ "1 game\n",
85
+ "2 game\n",
86
+ "3 game\n",
87
+ "4 game\n",
88
+ " ... \n",
89
+ "1039 game\n",
90
+ "1040 game\n",
91
+ "1041 other_stats\n",
92
+ "1042 other_stats\n",
93
+ "1043 game\n",
94
+ "Name: after_from, Length: 1044, dtype: object"
95
+ ]
96
+ },
97
+ "execution_count": 26,
98
+ "metadata": {},
99
+ "output_type": "execute_result"
100
+ }
101
+ ],
102
+ "source": [
103
+ "df['after_from']"
104
+ ]
105
+ },
106
  {
107
  "cell_type": "code",
108
+ "execution_count": 27,
109
  "metadata": {},
110
  "outputs": [
111
  {
112
  "data": {
113
  "text/plain": [
114
+ "array(['team', 'game', 'other_stats'], dtype=object)"
 
 
 
 
115
  ]
116
  },
117
+ "execution_count": 27,
118
  "metadata": {},
119
  "output_type": "execute_result"
120
  }
121
  ],
122
  "source": [
123
+ "df['after_from'].dropna().unique()\n"
124
+ ]
125
+ },
126
+ {
127
+ "cell_type": "code",
128
+ "execution_count": 28,
129
+ "metadata": {},
130
+ "outputs": [],
131
+ "source": [
132
+ "# Rows whose after_from value is 'game', saved as a tab-separated file without the index.\n",
+ "df_game = df.loc[df['after_from'].eq('game')]\n",
133
+ "df_game.to_csv(\"../train-data/queries_from_game.tsv\", index=False, sep=\"\\t\")"
134
+ ]
135
+ },
136
+ {
137
+ "cell_type": "code",
138
+ "execution_count": 29,
139
+ "metadata": {},
140
+ "outputs": [],
141
+ "source": [
142
+ "# Rows whose after_from value is 'team'; named df_team so the frame is not mislabeled as game data.\n",
+ "df_team = df[df['after_from'] == 'team']\n",
143
+ "df_team.to_csv(\"../train-data/queries_from_team.tsv\", sep=\"\\t\", index=False)"
144
+ ]
145
+ },
146
+ {
147
+ "cell_type": "code",
148
+ "execution_count": 30,
149
+ "metadata": {},
150
+ "outputs": [],
151
+ "source": [
152
+ "# Rows whose after_from value is 'other_stats'; named df_other_stats so the frame is not mislabeled as game data.\n",
+ "df_other_stats = df[df['after_from'] == 'other_stats']\n",
153
+ "df_other_stats.to_csv(\"../train-data/queries_from_other_stats.tsv\", sep=\"\\t\", index=False)"
154
+ ]
155
+ },
156
+ {
157
+ "cell_type": "markdown",
158
+ "metadata": {},
159
+ "source": [
160
+ "## Contains JOIN"
161
  ]
162
  },
163
  {
164
  "cell_type": "code",
165
+ "execution_count": 31,
166
  "metadata": {},
167
  "outputs": [],
168
+ "source": [
169
+ "# Rows whose query has JOIN as a standalone word, in any letter case; missing queries are excluded.\n",
170
+ "df_with_join = df.loc[df['sql_query'].str.contains(r'\\bJOIN\\b', case=False, na=False)]\n",
171
+ "df_with_join.to_csv(\"../train-data/with_join.tsv\", index=False, sep=\"\\t\")"
172
+ ]
173
+ },
174
+ {
175
+ "cell_type": "code",
176
+ "execution_count": 32,
177
+ "metadata": {},
178
+ "outputs": [],
179
+ "source": [
180
+ "# Rows whose query has no standalone JOIN keyword, in any letter case.\n",
181
+ "# Rows with a missing sql_query also land here, because the na=False result is negated.\n",
182
+ "df_without_join = df.loc[~df['sql_query'].str.contains(r'\\bJOIN\\b', case=False, na=False)]\n",
183
+ "df_without_join.to_csv(\"../train-data/without_join.tsv\", index=False, sep=\"\\t\")"
184
+ ]
185
  }
186
  ],
187
  "metadata": {