Spaces:
Running
Running
Deepak Sahu
commited on
Commit
·
01e2b4e
1
Parent(s):
dc7bbeb
section update
Browse files- .resources/clean_3.png +3 -0
- .resources/fine-tune.png +3 -0
- .resources/generate_emb.png +3 -0
- .resources/generate_emb2.png +3 -0
- README.md +12 -5
- z_clean_data.ipynb +286 -0
- z_finetune_gpt.py +1 -1
.resources/clean_3.png
ADDED
![]() |
Git LFS Details
|
.resources/fine-tune.png
ADDED
![]() |
Git LFS Details
|
.resources/generate_emb.png
ADDED
![]() |
Git LFS Details
|
.resources/generate_emb2.png
ADDED
![]() |
Git LFS Details
|
README.md
CHANGED
@@ -115,7 +115,7 @@ What is not taken care
|
|
115 |
python z_clean_data.py
|
116 |
```
|
117 |
|
118 |
-

|
119 |
|
120 |
|
121 |
Output: `clean_books_summary.csv`, `unique_titles_books_summary.csv`
|
|
|
123 |
|
124 |
### Step 2: Generate vectors of the book summaries.
|
125 |
|
126 |
+
**WHAT & WHY**
|
127 |
|
|
|
128 |
|
129 |
+
Here, I use a pretrained sentence encoder to capture the meaning of each summary. It is run over the `unique_titles_books_summary.csv` dataset.
|
130 |
+
|
131 |
+
The vectors are cached because the semantic meaning of the summaries (for the books to output) does not change during the entire runtime.
|
132 |
+
|
133 |
+

|
134 |
+
|
135 |
+
|
136 |
+
**RUN**:
|
137 |
|
138 |
Use command
|
139 |
```SH
|
|
|
142 |
|
143 |
Using only the CPU, this should take <1 min.
|
144 |
|
145 |
+

|
146 |
+
|
147 |
|
148 |
Output: `app_cache/summary_vectors.npy`
|
149 |
|
z_clean_data.ipynb
CHANGED
@@ -0,0 +1,286 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "markdown",
|
5 |
+
"metadata": {},
|
6 |
+
"source": [
|
7 |
+
"# Just Inspection Notebook\n",
|
8 |
+
"\n",
|
9 |
+
"Different from `z_clean_data.py`"
|
10 |
+
]
|
11 |
+
},
|
12 |
+
{
|
13 |
+
"cell_type": "code",
|
14 |
+
"execution_count": 1,
|
15 |
+
"metadata": {},
|
16 |
+
"outputs": [
|
17 |
+
{
|
18 |
+
"data": {
|
19 |
+
"text/html": [
|
20 |
+
"<div>\n",
|
21 |
+
"<style scoped>\n",
|
22 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
23 |
+
" vertical-align: middle;\n",
|
24 |
+
" }\n",
|
25 |
+
"\n",
|
26 |
+
" .dataframe tbody tr th {\n",
|
27 |
+
" vertical-align: top;\n",
|
28 |
+
" }\n",
|
29 |
+
"\n",
|
30 |
+
" .dataframe thead th {\n",
|
31 |
+
" text-align: right;\n",
|
32 |
+
" }\n",
|
33 |
+
"</style>\n",
|
34 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
35 |
+
" <thead>\n",
|
36 |
+
" <tr style=\"text-align: right;\">\n",
|
37 |
+
" <th></th>\n",
|
38 |
+
" <th>book_name</th>\n",
|
39 |
+
" <th>summaries</th>\n",
|
40 |
+
" <th>categories</th>\n",
|
41 |
+
" </tr>\n",
|
42 |
+
" </thead>\n",
|
43 |
+
" <tbody>\n",
|
44 |
+
" <tr>\n",
|
45 |
+
" <th>0</th>\n",
|
46 |
+
" <td>The Highly Sensitive Person</td>\n",
|
47 |
+
" <td>is a self-assessment guide and how-to-live tem...</td>\n",
|
48 |
+
" <td>science</td>\n",
|
49 |
+
" </tr>\n",
|
50 |
+
" <tr>\n",
|
51 |
+
" <th>1</th>\n",
|
52 |
+
" <td>Why Has Nobody Told Me This Before?</td>\n",
|
53 |
+
" <td>is a collection of a clinical psychologist’s ...</td>\n",
|
54 |
+
" <td>science</td>\n",
|
55 |
+
" </tr>\n",
|
56 |
+
" <tr>\n",
|
57 |
+
" <th>2</th>\n",
|
58 |
+
" <td>The Midnight Library</td>\n",
|
59 |
+
" <td>tells the story of Nora, a depressed woman in...</td>\n",
|
60 |
+
" <td>science</td>\n",
|
61 |
+
" </tr>\n",
|
62 |
+
" <tr>\n",
|
63 |
+
" <th>3</th>\n",
|
64 |
+
" <td>Brave New World</td>\n",
|
65 |
+
" <td>presents a futuristic society engineered perf...</td>\n",
|
66 |
+
" <td>science</td>\n",
|
67 |
+
" </tr>\n",
|
68 |
+
" <tr>\n",
|
69 |
+
" <th>4</th>\n",
|
70 |
+
" <td>1984</td>\n",
|
71 |
+
" <td>is the story of a man questioning the system ...</td>\n",
|
72 |
+
" <td>science</td>\n",
|
73 |
+
" </tr>\n",
|
74 |
+
" </tbody>\n",
|
75 |
+
"</table>\n",
|
76 |
+
"</div>"
|
77 |
+
],
|
78 |
+
"text/plain": [
|
79 |
+
" book_name \\\n",
|
80 |
+
"0 The Highly Sensitive Person \n",
|
81 |
+
"1 Why Has Nobody Told Me This Before? \n",
|
82 |
+
"2 The Midnight Library \n",
|
83 |
+
"3 Brave New World \n",
|
84 |
+
"4 1984 \n",
|
85 |
+
"\n",
|
86 |
+
" summaries categories \n",
|
87 |
+
"0 is a self-assessment guide and how-to-live tem... science \n",
|
88 |
+
"1 is a collection of a clinical psychologist’s ... science \n",
|
89 |
+
"2 tells the story of Nora, a depressed woman in... science \n",
|
90 |
+
"3 presents a futuristic society engineered perf... science \n",
|
91 |
+
"4 is the story of a man questioning the system ... science "
|
92 |
+
]
|
93 |
+
},
|
94 |
+
"execution_count": 1,
|
95 |
+
"metadata": {},
|
96 |
+
"output_type": "execute_result"
|
97 |
+
}
|
98 |
+
],
|
99 |
+
"source": [
|
100 |
+
"from z_utils import get_dataframe \n",
|
101 |
+
"\n",
|
102 |
+
"books_df = get_dataframe(\"books_summary.csv\")\n",
|
103 |
+
"books_df.head()"
|
104 |
+
]
|
105 |
+
},
|
106 |
+
{
|
107 |
+
"cell_type": "code",
|
108 |
+
"execution_count": 3,
|
109 |
+
"metadata": {},
|
110 |
+
"outputs": [
|
111 |
+
{
|
112 |
+
"data": {
|
113 |
+
"text/html": [
|
114 |
+
"<div>\n",
|
115 |
+
"<style scoped>\n",
|
116 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
117 |
+
" vertical-align: middle;\n",
|
118 |
+
" }\n",
|
119 |
+
"\n",
|
120 |
+
" .dataframe tbody tr th {\n",
|
121 |
+
" vertical-align: top;\n",
|
122 |
+
" }\n",
|
123 |
+
"\n",
|
124 |
+
" .dataframe thead th {\n",
|
125 |
+
" text-align: right;\n",
|
126 |
+
" }\n",
|
127 |
+
"</style>\n",
|
128 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
129 |
+
" <thead>\n",
|
130 |
+
" <tr style=\"text-align: right;\">\n",
|
131 |
+
" <th></th>\n",
|
132 |
+
" <th>book_name</th>\n",
|
133 |
+
" <th>summaries</th>\n",
|
134 |
+
" <th>categories</th>\n",
|
135 |
+
" </tr>\n",
|
136 |
+
" </thead>\n",
|
137 |
+
" <tbody>\n",
|
138 |
+
" <tr>\n",
|
139 |
+
" <th>2</th>\n",
|
140 |
+
" <td>The Midnight Library</td>\n",
|
141 |
+
" <td>tells the story of Nora, a depressed woman in...</td>\n",
|
142 |
+
" <td>science</td>\n",
|
143 |
+
" </tr>\n",
|
144 |
+
" <tr>\n",
|
145 |
+
" <th>522</th>\n",
|
146 |
+
" <td>The Midnight Library</td>\n",
|
147 |
+
" <td>tells the story of Nora, a depressed woman in...</td>\n",
|
148 |
+
" <td>relationships</td>\n",
|
149 |
+
" </tr>\n",
|
150 |
+
" <tr>\n",
|
151 |
+
" <th>788</th>\n",
|
152 |
+
" <td>The Midnight Library</td>\n",
|
153 |
+
" <td>tells the story of Nora, a depressed woman in...</td>\n",
|
154 |
+
" <td>happiness</td>\n",
|
155 |
+
" </tr>\n",
|
156 |
+
" <tr>\n",
|
157 |
+
" <th>1821</th>\n",
|
158 |
+
" <td>The Midnight Library</td>\n",
|
159 |
+
" <td>tells the story of Nora, a depressed woman in...</td>\n",
|
160 |
+
" <td>psychology</td>\n",
|
161 |
+
" </tr>\n",
|
162 |
+
" <tr>\n",
|
163 |
+
" <th>2402</th>\n",
|
164 |
+
" <td>The Midnight Library</td>\n",
|
165 |
+
" <td>tells the story of Nora, a depressed woman in...</td>\n",
|
166 |
+
" <td>motivation</td>\n",
|
167 |
+
" </tr>\n",
|
168 |
+
" <tr>\n",
|
169 |
+
" <th>3645</th>\n",
|
170 |
+
" <td>The Midnight Library</td>\n",
|
171 |
+
" <td>tells the story of Nora, a depressed woman in...</td>\n",
|
172 |
+
" <td>creativity</td>\n",
|
173 |
+
" </tr>\n",
|
174 |
+
" <tr>\n",
|
175 |
+
" <th>3941</th>\n",
|
176 |
+
" <td>The Midnight Library</td>\n",
|
177 |
+
" <td>tells the story of Nora, a depressed woman in...</td>\n",
|
178 |
+
" <td>fiction</td>\n",
|
179 |
+
" </tr>\n",
|
180 |
+
" <tr>\n",
|
181 |
+
" <th>4305</th>\n",
|
182 |
+
" <td>The Midnight Library</td>\n",
|
183 |
+
" <td>tells the story of Nora, a depressed woman in...</td>\n",
|
184 |
+
" <td>work</td>\n",
|
185 |
+
" </tr>\n",
|
186 |
+
" <tr>\n",
|
187 |
+
" <th>4665</th>\n",
|
188 |
+
" <td>The Midnight Library</td>\n",
|
189 |
+
" <td>tells the story of Nora, a depressed woman in...</td>\n",
|
190 |
+
" <td>mindfulness</td>\n",
|
191 |
+
" </tr>\n",
|
192 |
+
" </tbody>\n",
|
193 |
+
"</table>\n",
|
194 |
+
"</div>"
|
195 |
+
],
|
196 |
+
"text/plain": [
|
197 |
+
" book_name summaries \\\n",
|
198 |
+
"2 The Midnight Library tells the story of Nora, a depressed woman in... \n",
|
199 |
+
"522 The Midnight Library tells the story of Nora, a depressed woman in... \n",
|
200 |
+
"788 The Midnight Library tells the story of Nora, a depressed woman in... \n",
|
201 |
+
"1821 The Midnight Library tells the story of Nora, a depressed woman in... \n",
|
202 |
+
"2402 The Midnight Library tells the story of Nora, a depressed woman in... \n",
|
203 |
+
"3645 The Midnight Library tells the story of Nora, a depressed woman in... \n",
|
204 |
+
"3941 The Midnight Library tells the story of Nora, a depressed woman in... \n",
|
205 |
+
"4305 The Midnight Library tells the story of Nora, a depressed woman in... \n",
|
206 |
+
"4665 The Midnight Library tells the story of Nora, a depressed woman in... \n",
|
207 |
+
"\n",
|
208 |
+
" categories \n",
|
209 |
+
"2 science \n",
|
210 |
+
"522 relationships \n",
|
211 |
+
"788 happiness \n",
|
212 |
+
"1821 psychology \n",
|
213 |
+
"2402 motivation \n",
|
214 |
+
"3645 creativity \n",
|
215 |
+
"3941 fiction \n",
|
216 |
+
"4305 work \n",
|
217 |
+
"4665 mindfulness "
|
218 |
+
]
|
219 |
+
},
|
220 |
+
"execution_count": 3,
|
221 |
+
"metadata": {},
|
222 |
+
"output_type": "execute_result"
|
223 |
+
}
|
224 |
+
],
|
225 |
+
"source": [
|
226 |
+
"books_df[books_df[\"book_name\"] == \"The Midnight Library\"]"
|
227 |
+
]
|
228 |
+
},
|
229 |
+
{
|
230 |
+
"cell_type": "code",
|
231 |
+
"execution_count": 4,
|
232 |
+
"metadata": {},
|
233 |
+
"outputs": [
|
234 |
+
{
|
235 |
+
"data": {
|
236 |
+
"text/plain": [
|
237 |
+
"count 1230.000000\n",
|
238 |
+
"mean 4.042276\n",
|
239 |
+
"std 1.985669\n",
|
240 |
+
"min 1.000000\n",
|
241 |
+
"25% 3.000000\n",
|
242 |
+
"50% 4.000000\n",
|
243 |
+
"75% 5.000000\n",
|
244 |
+
"max 12.000000\n",
|
245 |
+
"Name: book_name, dtype: float64"
|
246 |
+
]
|
247 |
+
},
|
248 |
+
"execution_count": 4,
|
249 |
+
"metadata": {},
|
250 |
+
"output_type": "execute_result"
|
251 |
+
}
|
252 |
+
],
|
253 |
+
"source": [
|
254 |
+
"books_df[\"book_name\"].value_counts().describe()"
|
255 |
+
]
|
256 |
+
},
|
257 |
+
{
|
258 |
+
"cell_type": "code",
|
259 |
+
"execution_count": null,
|
260 |
+
"metadata": {},
|
261 |
+
"outputs": [],
|
262 |
+
"source": []
|
263 |
+
}
|
264 |
+
],
|
265 |
+
"metadata": {
|
266 |
+
"kernelspec": {
|
267 |
+
"display_name": "Python 3",
|
268 |
+
"language": "python",
|
269 |
+
"name": "python3"
|
270 |
+
},
|
271 |
+
"language_info": {
|
272 |
+
"codemirror_mode": {
|
273 |
+
"name": "ipython",
|
274 |
+
"version": 3
|
275 |
+
},
|
276 |
+
"file_extension": ".py",
|
277 |
+
"mimetype": "text/x-python",
|
278 |
+
"name": "python",
|
279 |
+
"nbconvert_exporter": "python",
|
280 |
+
"pygments_lexer": "ipython3",
|
281 |
+
"version": "3.10.4"
|
282 |
+
}
|
283 |
+
},
|
284 |
+
"nbformat": 4,
|
285 |
+
"nbformat_minor": 2
|
286 |
+
}
|
z_finetune_gpt.py
CHANGED
@@ -13,7 +13,7 @@ BASE_CASUAL_MODEL = "openai-community/gpt2"
|
|
13 |
TRAINED_MODEL_OUTPUT_DIR = "content" # same name for HF Hub
|
14 |
|
15 |
set_seed(42)
|
16 |
-
EPOCHS =
|
17 |
LR = 2e-5
|
18 |
|
19 |
# Load dataset
|
|
|
13 |
TRAINED_MODEL_OUTPUT_DIR = "content" # same name for HF Hub
|
14 |
|
15 |
set_seed(42)
|
16 |
+
EPOCHS = 2
|
17 |
LR = 2e-5
|
18 |
|
19 |
# Load dataset
|