Upload RAG_Chatbot

#1
by tricaominh - opened
Files changed (1) hide show
  1. RAG_ChatBot (1).ipynb +1350 -0
RAG_ChatBot (1).ipynb ADDED
@@ -0,0 +1,1350 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "nbformat": 4,
3
+ "nbformat_minor": 0,
4
+ "metadata": {
5
+ "colab": {
6
+ "provenance": [],
7
+ "gpuType": "T4"
8
+ },
9
+ "kernelspec": {
10
+ "name": "python3",
11
+ "display_name": "Python 3"
12
+ },
13
+ "language_info": {
14
+ "name": "python"
15
+ },
16
+ "accelerator": "GPU",
17
+ "widgets": {
18
+ "application/vnd.jupyter.widget-state+json": {
19
+ "95135ba6ca104151abab245f938b46a1": {
20
+ "model_module": "@jupyter-widgets/controls",
21
+ "model_name": "HBoxModel",
22
+ "model_module_version": "1.5.0",
23
+ "state": {
24
+ "_dom_classes": [],
25
+ "_model_module": "@jupyter-widgets/controls",
26
+ "_model_module_version": "1.5.0",
27
+ "_model_name": "HBoxModel",
28
+ "_view_count": null,
29
+ "_view_module": "@jupyter-widgets/controls",
30
+ "_view_module_version": "1.5.0",
31
+ "_view_name": "HBoxView",
32
+ "box_style": "",
33
+ "children": [
34
+ "IPY_MODEL_7c0719ab78e3479393e2e160f7bd7a4c",
35
+ "IPY_MODEL_1395931983524aa58dfb7a603c952748",
36
+ "IPY_MODEL_e05f535e386e482da0450c2ca0594c42"
37
+ ],
38
+ "layout": "IPY_MODEL_b5318f98281a43d388ff7868a2e69cbd"
39
+ }
40
+ },
41
+ "7c0719ab78e3479393e2e160f7bd7a4c": {
42
+ "model_module": "@jupyter-widgets/controls",
43
+ "model_name": "HTMLModel",
44
+ "model_module_version": "1.5.0",
45
+ "state": {
46
+ "_dom_classes": [],
47
+ "_model_module": "@jupyter-widgets/controls",
48
+ "_model_module_version": "1.5.0",
49
+ "_model_name": "HTMLModel",
50
+ "_view_count": null,
51
+ "_view_module": "@jupyter-widgets/controls",
52
+ "_view_module_version": "1.5.0",
53
+ "_view_name": "HTMLView",
54
+ "description": "",
55
+ "description_tooltip": null,
56
+ "layout": "IPY_MODEL_00d64df484074630a2be9bedbcaec9ab",
57
+ "placeholder": "​",
58
+ "style": "IPY_MODEL_8be96f5efde442bbb1e44e8658d2fa6f",
59
+ "value": "Loading checkpoint shards: 100%"
60
+ }
61
+ },
62
+ "1395931983524aa58dfb7a603c952748": {
63
+ "model_module": "@jupyter-widgets/controls",
64
+ "model_name": "FloatProgressModel",
65
+ "model_module_version": "1.5.0",
66
+ "state": {
67
+ "_dom_classes": [],
68
+ "_model_module": "@jupyter-widgets/controls",
69
+ "_model_module_version": "1.5.0",
70
+ "_model_name": "FloatProgressModel",
71
+ "_view_count": null,
72
+ "_view_module": "@jupyter-widgets/controls",
73
+ "_view_module_version": "1.5.0",
74
+ "_view_name": "ProgressView",
75
+ "bar_style": "success",
76
+ "description": "",
77
+ "description_tooltip": null,
78
+ "layout": "IPY_MODEL_68c6d56e98734bc7a49859ec57f462ad",
79
+ "max": 2,
80
+ "min": 0,
81
+ "orientation": "horizontal",
82
+ "style": "IPY_MODEL_e5b15807c72d4e69be4051c90d9649d4",
83
+ "value": 2
84
+ }
85
+ },
86
+ "e05f535e386e482da0450c2ca0594c42": {
87
+ "model_module": "@jupyter-widgets/controls",
88
+ "model_name": "HTMLModel",
89
+ "model_module_version": "1.5.0",
90
+ "state": {
91
+ "_dom_classes": [],
92
+ "_model_module": "@jupyter-widgets/controls",
93
+ "_model_module_version": "1.5.0",
94
+ "_model_name": "HTMLModel",
95
+ "_view_count": null,
96
+ "_view_module": "@jupyter-widgets/controls",
97
+ "_view_module_version": "1.5.0",
98
+ "_view_name": "HTMLView",
99
+ "description": "",
100
+ "description_tooltip": null,
101
+ "layout": "IPY_MODEL_f04d5e153ee94795bd6a7848be30789a",
102
+ "placeholder": "​",
103
+ "style": "IPY_MODEL_f4d5244ed5634e99b4e1719d77bd1676",
104
+ "value": " 2/2 [00:27<00:00, 11.71s/it]"
105
+ }
106
+ },
107
+ "b5318f98281a43d388ff7868a2e69cbd": {
108
+ "model_module": "@jupyter-widgets/base",
109
+ "model_name": "LayoutModel",
110
+ "model_module_version": "1.2.0",
111
+ "state": {
112
+ "_model_module": "@jupyter-widgets/base",
113
+ "_model_module_version": "1.2.0",
114
+ "_model_name": "LayoutModel",
115
+ "_view_count": null,
116
+ "_view_module": "@jupyter-widgets/base",
117
+ "_view_module_version": "1.2.0",
118
+ "_view_name": "LayoutView",
119
+ "align_content": null,
120
+ "align_items": null,
121
+ "align_self": null,
122
+ "border": null,
123
+ "bottom": null,
124
+ "display": null,
125
+ "flex": null,
126
+ "flex_flow": null,
127
+ "grid_area": null,
128
+ "grid_auto_columns": null,
129
+ "grid_auto_flow": null,
130
+ "grid_auto_rows": null,
131
+ "grid_column": null,
132
+ "grid_gap": null,
133
+ "grid_row": null,
134
+ "grid_template_areas": null,
135
+ "grid_template_columns": null,
136
+ "grid_template_rows": null,
137
+ "height": null,
138
+ "justify_content": null,
139
+ "justify_items": null,
140
+ "left": null,
141
+ "margin": null,
142
+ "max_height": null,
143
+ "max_width": null,
144
+ "min_height": null,
145
+ "min_width": null,
146
+ "object_fit": null,
147
+ "object_position": null,
148
+ "order": null,
149
+ "overflow": null,
150
+ "overflow_x": null,
151
+ "overflow_y": null,
152
+ "padding": null,
153
+ "right": null,
154
+ "top": null,
155
+ "visibility": null,
156
+ "width": null
157
+ }
158
+ },
159
+ "00d64df484074630a2be9bedbcaec9ab": {
160
+ "model_module": "@jupyter-widgets/base",
161
+ "model_name": "LayoutModel",
162
+ "model_module_version": "1.2.0",
163
+ "state": {
164
+ "_model_module": "@jupyter-widgets/base",
165
+ "_model_module_version": "1.2.0",
166
+ "_model_name": "LayoutModel",
167
+ "_view_count": null,
168
+ "_view_module": "@jupyter-widgets/base",
169
+ "_view_module_version": "1.2.0",
170
+ "_view_name": "LayoutView",
171
+ "align_content": null,
172
+ "align_items": null,
173
+ "align_self": null,
174
+ "border": null,
175
+ "bottom": null,
176
+ "display": null,
177
+ "flex": null,
178
+ "flex_flow": null,
179
+ "grid_area": null,
180
+ "grid_auto_columns": null,
181
+ "grid_auto_flow": null,
182
+ "grid_auto_rows": null,
183
+ "grid_column": null,
184
+ "grid_gap": null,
185
+ "grid_row": null,
186
+ "grid_template_areas": null,
187
+ "grid_template_columns": null,
188
+ "grid_template_rows": null,
189
+ "height": null,
190
+ "justify_content": null,
191
+ "justify_items": null,
192
+ "left": null,
193
+ "margin": null,
194
+ "max_height": null,
195
+ "max_width": null,
196
+ "min_height": null,
197
+ "min_width": null,
198
+ "object_fit": null,
199
+ "object_position": null,
200
+ "order": null,
201
+ "overflow": null,
202
+ "overflow_x": null,
203
+ "overflow_y": null,
204
+ "padding": null,
205
+ "right": null,
206
+ "top": null,
207
+ "visibility": null,
208
+ "width": null
209
+ }
210
+ },
211
+ "8be96f5efde442bbb1e44e8658d2fa6f": {
212
+ "model_module": "@jupyter-widgets/controls",
213
+ "model_name": "DescriptionStyleModel",
214
+ "model_module_version": "1.5.0",
215
+ "state": {
216
+ "_model_module": "@jupyter-widgets/controls",
217
+ "_model_module_version": "1.5.0",
218
+ "_model_name": "DescriptionStyleModel",
219
+ "_view_count": null,
220
+ "_view_module": "@jupyter-widgets/base",
221
+ "_view_module_version": "1.2.0",
222
+ "_view_name": "StyleView",
223
+ "description_width": ""
224
+ }
225
+ },
226
+ "68c6d56e98734bc7a49859ec57f462ad": {
227
+ "model_module": "@jupyter-widgets/base",
228
+ "model_name": "LayoutModel",
229
+ "model_module_version": "1.2.0",
230
+ "state": {
231
+ "_model_module": "@jupyter-widgets/base",
232
+ "_model_module_version": "1.2.0",
233
+ "_model_name": "LayoutModel",
234
+ "_view_count": null,
235
+ "_view_module": "@jupyter-widgets/base",
236
+ "_view_module_version": "1.2.0",
237
+ "_view_name": "LayoutView",
238
+ "align_content": null,
239
+ "align_items": null,
240
+ "align_self": null,
241
+ "border": null,
242
+ "bottom": null,
243
+ "display": null,
244
+ "flex": null,
245
+ "flex_flow": null,
246
+ "grid_area": null,
247
+ "grid_auto_columns": null,
248
+ "grid_auto_flow": null,
249
+ "grid_auto_rows": null,
250
+ "grid_column": null,
251
+ "grid_gap": null,
252
+ "grid_row": null,
253
+ "grid_template_areas": null,
254
+ "grid_template_columns": null,
255
+ "grid_template_rows": null,
256
+ "height": null,
257
+ "justify_content": null,
258
+ "justify_items": null,
259
+ "left": null,
260
+ "margin": null,
261
+ "max_height": null,
262
+ "max_width": null,
263
+ "min_height": null,
264
+ "min_width": null,
265
+ "object_fit": null,
266
+ "object_position": null,
267
+ "order": null,
268
+ "overflow": null,
269
+ "overflow_x": null,
270
+ "overflow_y": null,
271
+ "padding": null,
272
+ "right": null,
273
+ "top": null,
274
+ "visibility": null,
275
+ "width": null
276
+ }
277
+ },
278
+ "e5b15807c72d4e69be4051c90d9649d4": {
279
+ "model_module": "@jupyter-widgets/controls",
280
+ "model_name": "ProgressStyleModel",
281
+ "model_module_version": "1.5.0",
282
+ "state": {
283
+ "_model_module": "@jupyter-widgets/controls",
284
+ "_model_module_version": "1.5.0",
285
+ "_model_name": "ProgressStyleModel",
286
+ "_view_count": null,
287
+ "_view_module": "@jupyter-widgets/base",
288
+ "_view_module_version": "1.2.0",
289
+ "_view_name": "StyleView",
290
+ "bar_color": null,
291
+ "description_width": ""
292
+ }
293
+ },
294
+ "f04d5e153ee94795bd6a7848be30789a": {
295
+ "model_module": "@jupyter-widgets/base",
296
+ "model_name": "LayoutModel",
297
+ "model_module_version": "1.2.0",
298
+ "state": {
299
+ "_model_module": "@jupyter-widgets/base",
300
+ "_model_module_version": "1.2.0",
301
+ "_model_name": "LayoutModel",
302
+ "_view_count": null,
303
+ "_view_module": "@jupyter-widgets/base",
304
+ "_view_module_version": "1.2.0",
305
+ "_view_name": "LayoutView",
306
+ "align_content": null,
307
+ "align_items": null,
308
+ "align_self": null,
309
+ "border": null,
310
+ "bottom": null,
311
+ "display": null,
312
+ "flex": null,
313
+ "flex_flow": null,
314
+ "grid_area": null,
315
+ "grid_auto_columns": null,
316
+ "grid_auto_flow": null,
317
+ "grid_auto_rows": null,
318
+ "grid_column": null,
319
+ "grid_gap": null,
320
+ "grid_row": null,
321
+ "grid_template_areas": null,
322
+ "grid_template_columns": null,
323
+ "grid_template_rows": null,
324
+ "height": null,
325
+ "justify_content": null,
326
+ "justify_items": null,
327
+ "left": null,
328
+ "margin": null,
329
+ "max_height": null,
330
+ "max_width": null,
331
+ "min_height": null,
332
+ "min_width": null,
333
+ "object_fit": null,
334
+ "object_position": null,
335
+ "order": null,
336
+ "overflow": null,
337
+ "overflow_x": null,
338
+ "overflow_y": null,
339
+ "padding": null,
340
+ "right": null,
341
+ "top": null,
342
+ "visibility": null,
343
+ "width": null
344
+ }
345
+ },
346
+ "f4d5244ed5634e99b4e1719d77bd1676": {
347
+ "model_module": "@jupyter-widgets/controls",
348
+ "model_name": "DescriptionStyleModel",
349
+ "model_module_version": "1.5.0",
350
+ "state": {
351
+ "_model_module": "@jupyter-widgets/controls",
352
+ "_model_module_version": "1.5.0",
353
+ "_model_name": "DescriptionStyleModel",
354
+ "_view_count": null,
355
+ "_view_module": "@jupyter-widgets/base",
356
+ "_view_module_version": "1.2.0",
357
+ "_view_name": "StyleView",
358
+ "description_width": ""
359
+ }
360
+ }
361
+ }
362
+ }
363
+ },
364
+ "cells": [
365
+ {
366
+ "cell_type": "code",
367
+ "execution_count": 2,
368
+ "metadata": {
369
+ "id": "ZZzhbHyTbFed"
370
+ },
371
+ "outputs": [],
372
+ "source": [
373
+ "!pip install -q transformers sentence_transformers faiss-cpu torch PyPDF2 nltk"
374
+ ]
375
+ },
376
+ {
377
+ "cell_type": "code",
378
+ "source": [
379
+ "!pip install -U langchain-community\n",
380
+ "from langchain.vectorstores import Qdrant\n",
381
+ "from langchain.embeddings import HuggingFaceBgeEmbeddings\n",
382
+ "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
383
+ "from langchain.document_loaders import PyPDFDirectoryLoader,TextLoader"
384
+ ],
385
+ "metadata": {
386
+ "id": "jlkvzNn_bzX1",
387
+ "colab": {
388
+ "base_uri": "https://localhost:8080/"
389
+ },
390
+ "outputId": "3b899e09-e0af-4d7c-f122-9edda8c44b52"
391
+ },
392
+ "execution_count": 3,
393
+ "outputs": [
394
+ {
395
+ "output_type": "stream",
396
+ "name": "stdout",
397
+ "text": [
398
+ "Requirement already satisfied: langchain-community in /usr/local/lib/python3.10/dist-packages (0.3.11)\n",
399
+ "Requirement already satisfied: PyYAML>=5.3 in /usr/local/lib/python3.10/dist-packages (from langchain-community) (6.0.2)\n",
400
+ "Requirement already satisfied: SQLAlchemy<3,>=1.4 in /usr/local/lib/python3.10/dist-packages (from langchain-community) (2.0.36)\n",
401
+ "Requirement already satisfied: aiohttp<4.0.0,>=3.8.3 in /usr/local/lib/python3.10/dist-packages (from langchain-community) (3.11.9)\n",
402
+ "Requirement already satisfied: dataclasses-json<0.7,>=0.5.7 in /usr/local/lib/python3.10/dist-packages (from langchain-community) (0.6.7)\n",
403
+ "Requirement already satisfied: httpx-sse<0.5.0,>=0.4.0 in /usr/local/lib/python3.10/dist-packages (from langchain-community) (0.4.0)\n",
404
+ "Requirement already satisfied: langchain<0.4.0,>=0.3.11 in /usr/local/lib/python3.10/dist-packages (from langchain-community) (0.3.11)\n",
405
+ "Requirement already satisfied: langchain-core<0.4.0,>=0.3.24 in /usr/local/lib/python3.10/dist-packages (from langchain-community) (0.3.24)\n",
406
+ "Requirement already satisfied: langsmith<0.3,>=0.1.125 in /usr/local/lib/python3.10/dist-packages (from langchain-community) (0.1.147)\n",
407
+ "Requirement already satisfied: numpy<2,>=1.22.4 in /usr/local/lib/python3.10/dist-packages (from langchain-community) (1.26.4)\n",
408
+ "Requirement already satisfied: pydantic-settings<3.0.0,>=2.4.0 in /usr/local/lib/python3.10/dist-packages (from langchain-community) (2.6.1)\n",
409
+ "Requirement already satisfied: requests<3,>=2 in /usr/local/lib/python3.10/dist-packages (from langchain-community) (2.32.3)\n",
410
+ "Requirement already satisfied: tenacity!=8.4.0,<10,>=8.1.0 in /usr/local/lib/python3.10/dist-packages (from langchain-community) (9.0.0)\n",
411
+ "Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain-community) (2.4.4)\n",
412
+ "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain-community) (1.3.1)\n",
413
+ "Requirement already satisfied: async-timeout<6.0,>=4.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain-community) (4.0.3)\n",
414
+ "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain-community) (24.2.0)\n",
415
+ "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain-community) (1.5.0)\n",
416
+ "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain-community) (6.1.0)\n",
417
+ "Requirement already satisfied: propcache>=0.2.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain-community) (0.2.1)\n",
418
+ "Requirement already satisfied: yarl<2.0,>=1.17.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain-community) (1.18.3)\n",
419
+ "Requirement already satisfied: marshmallow<4.0.0,>=3.18.0 in /usr/local/lib/python3.10/dist-packages (from dataclasses-json<0.7,>=0.5.7->langchain-community) (3.23.1)\n",
420
+ "Requirement already satisfied: typing-inspect<1,>=0.4.0 in /usr/local/lib/python3.10/dist-packages (from dataclasses-json<0.7,>=0.5.7->langchain-community) (0.9.0)\n",
421
+ "Requirement already satisfied: langchain-text-splitters<0.4.0,>=0.3.0 in /usr/local/lib/python3.10/dist-packages (from langchain<0.4.0,>=0.3.11->langchain-community) (0.3.2)\n",
422
+ "Requirement already satisfied: pydantic<3.0.0,>=2.7.4 in /usr/local/lib/python3.10/dist-packages (from langchain<0.4.0,>=0.3.11->langchain-community) (2.10.3)\n",
423
+ "Requirement already satisfied: jsonpatch<2.0,>=1.33 in /usr/local/lib/python3.10/dist-packages (from langchain-core<0.4.0,>=0.3.24->langchain-community) (1.33)\n",
424
+ "Requirement already satisfied: packaging<25,>=23.2 in /usr/local/lib/python3.10/dist-packages (from langchain-core<0.4.0,>=0.3.24->langchain-community) (24.2)\n",
425
+ "Requirement already satisfied: typing-extensions>=4.7 in /usr/local/lib/python3.10/dist-packages (from langchain-core<0.4.0,>=0.3.24->langchain-community) (4.12.2)\n",
426
+ "Requirement already satisfied: httpx<1,>=0.23.0 in /usr/local/lib/python3.10/dist-packages (from langsmith<0.3,>=0.1.125->langchain-community) (0.28.0)\n",
427
+ "Requirement already satisfied: orjson<4.0.0,>=3.9.14 in /usr/local/lib/python3.10/dist-packages (from langsmith<0.3,>=0.1.125->langchain-community) (3.10.12)\n",
428
+ "Requirement already satisfied: requests-toolbelt<2.0.0,>=1.0.0 in /usr/local/lib/python3.10/dist-packages (from langsmith<0.3,>=0.1.125->langchain-community) (1.0.0)\n",
429
+ "Requirement already satisfied: python-dotenv>=0.21.0 in /usr/local/lib/python3.10/dist-packages (from pydantic-settings<3.0.0,>=2.4.0->langchain-community) (1.0.1)\n",
430
+ "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain-community) (3.4.0)\n",
431
+ "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain-community) (3.10)\n",
432
+ "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain-community) (2.2.3)\n",
433
+ "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain-community) (2024.8.30)\n",
434
+ "Requirement already satisfied: greenlet!=0.4.17 in /usr/local/lib/python3.10/dist-packages (from SQLAlchemy<3,>=1.4->langchain-community) (3.1.1)\n",
435
+ "Requirement already satisfied: anyio in /usr/local/lib/python3.10/dist-packages (from httpx<1,>=0.23.0->langsmith<0.3,>=0.1.125->langchain-community) (3.7.1)\n",
436
+ "Requirement already satisfied: httpcore==1.* in /usr/local/lib/python3.10/dist-packages (from httpx<1,>=0.23.0->langsmith<0.3,>=0.1.125->langchain-community) (1.0.7)\n",
437
+ "Requirement already satisfied: h11<0.15,>=0.13 in /usr/local/lib/python3.10/dist-packages (from httpcore==1.*->httpx<1,>=0.23.0->langsmith<0.3,>=0.1.125->langchain-community) (0.14.0)\n",
438
+ "Requirement already satisfied: jsonpointer>=1.9 in /usr/local/lib/python3.10/dist-packages (from jsonpatch<2.0,>=1.33->langchain-core<0.4.0,>=0.3.24->langchain-community) (3.0.0)\n",
439
+ "Requirement already satisfied: annotated-types>=0.6.0 in /usr/local/lib/python3.10/dist-packages (from pydantic<3.0.0,>=2.7.4->langchain<0.4.0,>=0.3.11->langchain-community) (0.7.0)\n",
440
+ "Requirement already satisfied: pydantic-core==2.27.1 in /usr/local/lib/python3.10/dist-packages (from pydantic<3.0.0,>=2.7.4->langchain<0.4.0,>=0.3.11->langchain-community) (2.27.1)\n",
441
+ "Requirement already satisfied: mypy-extensions>=0.3.0 in /usr/local/lib/python3.10/dist-packages (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.5.7->langchain-community) (1.0.0)\n",
442
+ "Requirement already satisfied: sniffio>=1.1 in /usr/local/lib/python3.10/dist-packages (from anyio->httpx<1,>=0.23.0->langsmith<0.3,>=0.1.125->langchain-community) (1.3.1)\n",
443
+ "Requirement already satisfied: exceptiongroup in /usr/local/lib/python3.10/dist-packages (from anyio->httpx<1,>=0.23.0->langsmith<0.3,>=0.1.125->langchain-community) (1.2.2)\n"
444
+ ]
445
+ }
446
+ ]
447
+ },
448
+ {
449
+ "cell_type": "code",
450
+ "source": [
451
+ "import torch\n",
452
+ "from transformers import AutoTokenizer, AutoModelForCausalLM\n",
453
+ "from sentence_transformers import SentenceTransformer\n",
454
+ "!pip install faiss-cpu\n",
455
+ "!pip install sentence-transformers\n",
456
+ "import faiss\n",
457
+ "import numpy as np\n",
458
+ "import pandas as pd\n",
459
+ "!\n",
460
+ "import PyPDF2\n",
461
+ "import os\n",
462
+ "import nltk\n",
463
+ "# nltk.download('punkt')\n",
464
+ "nltk.download('punkt_tab')\n",
465
+ "from nltk.tokenize import sent_tokenize\n",
466
+ "from google.colab import userdata"
467
+ ],
468
+ "metadata": {
469
+ "id": "PPBaElOGb0um",
470
+ "colab": {
471
+ "base_uri": "https://localhost:8080/"
472
+ },
473
+ "outputId": "712aa030-a0eb-450f-fb31-dd7e0151b297"
474
+ },
475
+ "execution_count": 4,
476
+ "outputs": [
477
+ {
478
+ "output_type": "stream",
479
+ "name": "stdout",
480
+ "text": [
481
+ "Requirement already satisfied: faiss-cpu in /usr/local/lib/python3.10/dist-packages (1.9.0.post1)\n",
482
+ "Requirement already satisfied: numpy<3.0,>=1.25.0 in /usr/local/lib/python3.10/dist-packages (from faiss-cpu) (1.26.4)\n",
483
+ "Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from faiss-cpu) (24.2)\n",
484
+ "Requirement already satisfied: sentence-transformers in /usr/local/lib/python3.10/dist-packages (3.2.1)\n",
485
+ "Requirement already satisfied: transformers<5.0.0,>=4.41.0 in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (4.46.3)\n",
486
+ "Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (4.66.6)\n",
487
+ "Requirement already satisfied: torch>=1.11.0 in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (2.5.1+cu121)\n",
488
+ "Requirement already satisfied: scikit-learn in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (1.5.2)\n",
489
+ "Requirement already satisfied: scipy in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (1.13.1)\n",
490
+ "Requirement already satisfied: huggingface-hub>=0.20.0 in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (0.26.3)\n",
491
+ "Requirement already satisfied: Pillow in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (11.0.0)\n",
492
+ "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.20.0->sentence-transformers) (3.16.1)\n",
493
+ "Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.20.0->sentence-transformers) (2024.10.0)\n",
494
+ "Requirement already satisfied: packaging>=20.9 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.20.0->sentence-transformers) (24.2)\n",
495
+ "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.20.0->sentence-transformers) (6.0.2)\n",
496
+ "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.20.0->sentence-transformers) (2.32.3)\n",
497
+ "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.20.0->sentence-transformers) (4.12.2)\n",
498
+ "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch>=1.11.0->sentence-transformers) (3.4.2)\n",
499
+ "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch>=1.11.0->sentence-transformers) (3.1.4)\n",
500
+ "Requirement already satisfied: sympy==1.13.1 in /usr/local/lib/python3.10/dist-packages (from torch>=1.11.0->sentence-transformers) (1.13.1)\n",
501
+ "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.10/dist-packages (from sympy==1.13.1->torch>=1.11.0->sentence-transformers) (1.3.0)\n",
502
+ "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers<5.0.0,>=4.41.0->sentence-transformers) (1.26.4)\n",
503
+ "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers<5.0.0,>=4.41.0->sentence-transformers) (2024.9.11)\n",
504
+ "Requirement already satisfied: tokenizers<0.21,>=0.20 in /usr/local/lib/python3.10/dist-packages (from transformers<5.0.0,>=4.41.0->sentence-transformers) (0.20.3)\n",
505
+ "Requirement already satisfied: safetensors>=0.4.1 in /usr/local/lib/python3.10/dist-packages (from transformers<5.0.0,>=4.41.0->sentence-transformers) (0.4.5)\n",
506
+ "Requirement already satisfied: joblib>=1.2.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn->sentence-transformers) (1.4.2)\n",
507
+ "Requirement already satisfied: threadpoolctl>=3.1.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn->sentence-transformers) (3.5.0)\n",
508
+ "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch>=1.11.0->sentence-transformers) (3.0.2)\n",
509
+ "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface-hub>=0.20.0->sentence-transformers) (3.4.0)\n",
510
+ "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface-hub>=0.20.0->sentence-transformers) (3.10)\n",
511
+ "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface-hub>=0.20.0->sentence-transformers) (2.2.3)\n",
512
+ "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface-hub>=0.20.0->sentence-transformers) (2024.8.30)\n"
513
+ ]
514
+ },
515
+ {
516
+ "output_type": "stream",
517
+ "name": "stderr",
518
+ "text": [
519
+ "[nltk_data] Downloading package punkt_tab to /root/nltk_data...\n",
520
+ "[nltk_data] Package punkt_tab is already up-to-date!\n"
521
+ ]
522
+ }
523
+ ]
524
+ },
525
+ {
526
+ "cell_type": "code",
527
+ "source": [
528
+ "HUGGING_FACE_ACCESS_TOKEN = userdata.get('HF_TOKEN_Z')\n",
529
+ "\n",
530
+ "model_name = 'google/gemma-2-2b-it'\n",
531
+ "\n",
532
+ "model = AutoModelForCausalLM.from_pretrained(\n",
533
+ " model_name,\n",
534
+ " torch_dtype=torch.float16,\n",
535
+ " token=HUGGING_FACE_ACCESS_TOKEN\n",
536
+ " ).to('cuda')\n",
537
+ "\n",
538
+ "tokenizer = AutoTokenizer.from_pretrained(model_name, token=HUGGING_FACE_ACCESS_TOKEN)"
539
+ ],
540
+ "metadata": {
541
+ "id": "j_41WiGgb37x",
542
+ "colab": {
543
+ "base_uri": "https://localhost:8080/",
544
+ "height": 49,
545
+ "referenced_widgets": [
546
+ "95135ba6ca104151abab245f938b46a1",
547
+ "7c0719ab78e3479393e2e160f7bd7a4c",
548
+ "1395931983524aa58dfb7a603c952748",
549
+ "e05f535e386e482da0450c2ca0594c42",
550
+ "b5318f98281a43d388ff7868a2e69cbd",
551
+ "00d64df484074630a2be9bedbcaec9ab",
552
+ "8be96f5efde442bbb1e44e8658d2fa6f",
553
+ "68c6d56e98734bc7a49859ec57f462ad",
554
+ "e5b15807c72d4e69be4051c90d9649d4",
555
+ "f04d5e153ee94795bd6a7848be30789a",
556
+ "f4d5244ed5634e99b4e1719d77bd1676"
557
+ ]
558
+ },
559
+ "outputId": "ab4bff38-76d8-4f60-bb91-36e628fec941"
560
+ },
561
+ "execution_count": 5,
562
+ "outputs": [
563
+ {
564
+ "output_type": "display_data",
565
+ "data": {
566
+ "text/plain": [
567
+ "Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s]"
568
+ ],
569
+ "application/vnd.jupyter.widget-view+json": {
570
+ "version_major": 2,
571
+ "version_minor": 0,
572
+ "model_id": "95135ba6ca104151abab245f938b46a1"
573
+ }
574
+ },
575
+ "metadata": {}
576
+ }
577
+ ]
578
+ },
579
+ {
580
+ "cell_type": "code",
581
+ "source": [
582
+ "def extract_text_from_pdf(pdf_path):\n",
583
+ " try:\n",
584
+ " with open(pdf_path, 'rb') as file:\n",
585
+ " reader = PyPDF2.PdfReader(file)\n",
586
+ " text = \"\".join([page.extract_text() for page in reader.pages])\n",
587
+ " return text\n",
588
+ " except Exception as e:\n",
589
+ " print(f\"Error reading {pdf_path}: {e}\")\n",
590
+ " return \"\"\n",
591
+ "\n",
592
+ "def split_text_into_chunks(text, max_chunk_size=1000):\n",
593
+ " sentences = sent_tokenize(text)\n",
594
+ " chunks = []\n",
595
+ " current_chunk = \"\"\n",
596
+ "\n",
597
+ " for sentence in sentences:\n",
598
+ " if len(current_chunk) + len(sentence) <= max_chunk_size:\n",
599
+ " current_chunk += sentence + \" \"\n",
600
+ " else:\n",
601
+ " chunks.append(current_chunk.strip())\n",
602
+ " current_chunk = sentence + \" \"\n",
603
+ "\n",
604
+ " if current_chunk:\n",
605
+ " chunks.append(current_chunk.strip())\n",
606
+ "\n",
607
+ " return chunks"
608
+ ],
609
+ "metadata": {
610
+ "id": "Hg_hYwQ6b5xU"
611
+ },
612
+ "execution_count": 6,
613
+ "outputs": []
614
+ },
615
+ {
616
+ "cell_type": "code",
617
+ "source": [
618
+ "from google.colab import drive\n",
619
+ "drive.mount('/content/drive')"
620
+ ],
621
+ "metadata": {
622
+ "id": "f8iaap2ib7Vl",
623
+ "colab": {
624
+ "base_uri": "https://localhost:8080/"
625
+ },
626
+ "outputId": "58a4a282-6796-4097-dee9-d0f3b74f3394"
627
+ },
628
+ "execution_count": 7,
629
+ "outputs": [
630
+ {
631
+ "output_type": "stream",
632
+ "name": "stdout",
633
+ "text": [
634
+ "Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n"
635
+ ]
636
+ }
637
+ ]
638
+ },
639
+ {
640
+ "cell_type": "code",
641
+ "source": [
642
+ "# List the PDF files in the data directory; replace the path below with your own.\n",
643
+ "\n",
644
+ "os.chdir('/content/drive/MyDrive/Data')\n",
645
+ "!ls"
646
+ ],
647
+ "metadata": {
648
+ "id": "Ry1jQWXCb82A",
649
+ "colab": {
650
+ "base_uri": "https://localhost:8080/"
651
+ },
652
+ "outputId": "44771696-6c6d-42d1-b8e9-b6dd0f3a877a"
653
+ },
654
+ "execution_count": 8,
655
+ "outputs": [
656
+ {
657
+ "output_type": "stream",
658
+ "name": "stdout",
659
+ "text": [
660
+ "data_cleaned_aisc.pdf\n"
661
+ ]
662
+ }
663
+ ]
664
+ },
665
+ {
666
+ "cell_type": "code",
667
+ "source": [
# Sentence-embedding model used to embed the document chunks.
# NOTE(review): 'all-MiniLM-L6-v2' is presumably English-centric while the
# corpus looks Vietnamese -- consider a multilingual model; confirm before changing.
encoder = SentenceTransformer('all-MiniLM-L6-v2')

# Process PDF files
pdf_directory = "/content/drive/MyDrive/Data"

# Collect one record per PDF and build the DataFrame once at the end instead
# of concatenating onto an (initially empty) frame on every iteration.
document_rows = []

# sorted() makes the row order (and therefore the index layout) reproducible;
# os.listdir() returns entries in arbitrary order.
for filename in sorted(os.listdir(pdf_directory)):
    if not filename.endswith(".pdf"):
        continue
    print(filename)
    pdf_path = os.path.join(pdf_directory, filename)
    text = extract_text_from_pdf(pdf_path)
    chunks = split_text_into_chunks(text)
    if not chunks:
        # extract_text_from_pdf returns "" on failure; an empty chunk list
        # would produce an empty embedding array that cannot be stacked later.
        print(f"Skipping {pdf_path}: no text could be extracted")
        continue
    document_embeddings = encoder.encode(chunks)
    document_rows.append({
        'path': pdf_path,
        'text_chunks': chunks,
        'embeddings': document_embeddings,
    })

# One row per PDF: its path, its list of chunks, and the matching embeddings.
df_documents = pd.DataFrame(document_rows, columns=['path', 'text_chunks', 'embeddings'])

df_documents
685
+ ],
686
+ "metadata": {
687
+ "id": "JUqcsy6sb-Y0",
688
+ "colab": {
689
+ "base_uri": "https://localhost:8080/",
690
+ "height": 106
691
+ },
692
+ "outputId": "9c235f52-88a1-4ad8-c07e-94227745cad3"
693
+ },
694
+ "execution_count": 9,
695
+ "outputs": [
696
+ {
697
+ "output_type": "stream",
698
+ "name": "stdout",
699
+ "text": [
700
+ "data_cleaned_aisc.pdf\n"
701
+ ]
702
+ },
703
+ {
704
+ "output_type": "execute_result",
705
+ "data": {
706
+ "text/plain": [
707
+ " path \\\n",
708
+ "0 /content/drive/MyDrive/Data/data_cleaned_aisc.pdf \n",
709
+ "\n",
710
+ " text_chunks \\\n",
711
+ "0 [Keo - Pad tαΊ£n nhiệt lΓ  gΓ¬? Keo - Pad tαΊ£n nhiệ... \n",
712
+ "\n",
713
+ " embeddings \n",
714
+ "0 [[0.0036073995, -0.059478104, 0.06371901, -0.0... "
715
+ ],
716
+ "text/html": [
717
+ "\n",
718
+ " <div id=\"df-9cc63421-bb32-46ec-a442-27cbbf2da9f9\" class=\"colab-df-container\">\n",
719
+ " <div>\n",
720
+ "<style scoped>\n",
721
+ " .dataframe tbody tr th:only-of-type {\n",
722
+ " vertical-align: middle;\n",
723
+ " }\n",
724
+ "\n",
725
+ " .dataframe tbody tr th {\n",
726
+ " vertical-align: top;\n",
727
+ " }\n",
728
+ "\n",
729
+ " .dataframe thead th {\n",
730
+ " text-align: right;\n",
731
+ " }\n",
732
+ "</style>\n",
733
+ "<table border=\"1\" class=\"dataframe\">\n",
734
+ " <thead>\n",
735
+ " <tr style=\"text-align: right;\">\n",
736
+ " <th></th>\n",
737
+ " <th>path</th>\n",
738
+ " <th>text_chunks</th>\n",
739
+ " <th>embeddings</th>\n",
740
+ " </tr>\n",
741
+ " </thead>\n",
742
+ " <tbody>\n",
743
+ " <tr>\n",
744
+ " <th>0</th>\n",
745
+ " <td>/content/drive/MyDrive/Data/data_cleaned_aisc.pdf</td>\n",
746
+ " <td>[Keo - Pad tαΊ£n nhiệt lΓ  gΓ¬? Keo - Pad tαΊ£n nhiệ...</td>\n",
747
+ " <td>[[0.0036073995, -0.059478104, 0.06371901, -0.0...</td>\n",
748
+ " </tr>\n",
749
+ " </tbody>\n",
750
+ "</table>\n",
751
+ "</div>\n",
752
+ " <div class=\"colab-df-buttons\">\n",
753
+ "\n",
754
+ " <div class=\"colab-df-container\">\n",
755
+ " <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-9cc63421-bb32-46ec-a442-27cbbf2da9f9')\"\n",
756
+ " title=\"Convert this dataframe to an interactive table.\"\n",
757
+ " style=\"display:none;\">\n",
758
+ "\n",
759
+ " <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
760
+ " <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
761
+ " </svg>\n",
762
+ " </button>\n",
763
+ "\n",
764
+ " <style>\n",
765
+ " .colab-df-container {\n",
766
+ " display:flex;\n",
767
+ " gap: 12px;\n",
768
+ " }\n",
769
+ "\n",
770
+ " .colab-df-convert {\n",
771
+ " background-color: #E8F0FE;\n",
772
+ " border: none;\n",
773
+ " border-radius: 50%;\n",
774
+ " cursor: pointer;\n",
775
+ " display: none;\n",
776
+ " fill: #1967D2;\n",
777
+ " height: 32px;\n",
778
+ " padding: 0 0 0 0;\n",
779
+ " width: 32px;\n",
780
+ " }\n",
781
+ "\n",
782
+ " .colab-df-convert:hover {\n",
783
+ " background-color: #E2EBFA;\n",
784
+ " box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
785
+ " fill: #174EA6;\n",
786
+ " }\n",
787
+ "\n",
788
+ " .colab-df-buttons div {\n",
789
+ " margin-bottom: 4px;\n",
790
+ " }\n",
791
+ "\n",
792
+ " [theme=dark] .colab-df-convert {\n",
793
+ " background-color: #3B4455;\n",
794
+ " fill: #D2E3FC;\n",
795
+ " }\n",
796
+ "\n",
797
+ " [theme=dark] .colab-df-convert:hover {\n",
798
+ " background-color: #434B5C;\n",
799
+ " box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
800
+ " filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
801
+ " fill: #FFFFFF;\n",
802
+ " }\n",
803
+ " </style>\n",
804
+ "\n",
805
+ " <script>\n",
806
+ " const buttonEl =\n",
807
+ " document.querySelector('#df-9cc63421-bb32-46ec-a442-27cbbf2da9f9 button.colab-df-convert');\n",
808
+ " buttonEl.style.display =\n",
809
+ " google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
810
+ "\n",
811
+ " async function convertToInteractive(key) {\n",
812
+ " const element = document.querySelector('#df-9cc63421-bb32-46ec-a442-27cbbf2da9f9');\n",
813
+ " const dataTable =\n",
814
+ " await google.colab.kernel.invokeFunction('convertToInteractive',\n",
815
+ " [key], {});\n",
816
+ " if (!dataTable) return;\n",
817
+ "\n",
818
+ " const docLinkHtml = 'Like what you see? Visit the ' +\n",
819
+ " '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
820
+ " + ' to learn more about interactive tables.';\n",
821
+ " element.innerHTML = '';\n",
822
+ " dataTable['output_type'] = 'display_data';\n",
823
+ " await google.colab.output.renderOutput(dataTable, element);\n",
824
+ " const docLink = document.createElement('div');\n",
825
+ " docLink.innerHTML = docLinkHtml;\n",
826
+ " element.appendChild(docLink);\n",
827
+ " }\n",
828
+ " </script>\n",
829
+ " </div>\n",
830
+ "\n",
831
+ "\n",
832
+ " <div id=\"id_c5591e82-595e-4add-9b94-d608b8a3b092\">\n",
833
+ " <style>\n",
834
+ " .colab-df-generate {\n",
835
+ " background-color: #E8F0FE;\n",
836
+ " border: none;\n",
837
+ " border-radius: 50%;\n",
838
+ " cursor: pointer;\n",
839
+ " display: none;\n",
840
+ " fill: #1967D2;\n",
841
+ " height: 32px;\n",
842
+ " padding: 0 0 0 0;\n",
843
+ " width: 32px;\n",
844
+ " }\n",
845
+ "\n",
846
+ " .colab-df-generate:hover {\n",
847
+ " background-color: #E2EBFA;\n",
848
+ " box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
849
+ " fill: #174EA6;\n",
850
+ " }\n",
851
+ "\n",
852
+ " [theme=dark] .colab-df-generate {\n",
853
+ " background-color: #3B4455;\n",
854
+ " fill: #D2E3FC;\n",
855
+ " }\n",
856
+ "\n",
857
+ " [theme=dark] .colab-df-generate:hover {\n",
858
+ " background-color: #434B5C;\n",
859
+ " box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
860
+ " filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
861
+ " fill: #FFFFFF;\n",
862
+ " }\n",
863
+ " </style>\n",
864
+ " <button class=\"colab-df-generate\" onclick=\"generateWithVariable('df_documents')\"\n",
865
+ " title=\"Generate code using this dataframe.\"\n",
866
+ " style=\"display:none;\">\n",
867
+ "\n",
868
+ " <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
869
+ " width=\"24px\">\n",
870
+ " <path d=\"M7,19H8.4L18.45,9,17,7.55,7,17.6ZM5,21V16.75L18.45,3.32a2,2,0,0,1,2.83,0l1.4,1.43a1.91,1.91,0,0,1,.58,1.4,1.91,1.91,0,0,1-.58,1.4L9.25,21ZM18.45,9,17,7.55Zm-12,3A5.31,5.31,0,0,0,4.9,8.1,5.31,5.31,0,0,0,1,6.5,5.31,5.31,0,0,0,4.9,4.9,5.31,5.31,0,0,0,6.5,1,5.31,5.31,0,0,0,8.1,4.9,5.31,5.31,0,0,0,12,6.5,5.46,5.46,0,0,0,6.5,12Z\"/>\n",
871
+ " </svg>\n",
872
+ " </button>\n",
873
+ " <script>\n",
874
+ " (() => {\n",
875
+ " const buttonEl =\n",
876
+ " document.querySelector('#id_c5591e82-595e-4add-9b94-d608b8a3b092 button.colab-df-generate');\n",
877
+ " buttonEl.style.display =\n",
878
+ " google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
879
+ "\n",
880
+ " buttonEl.onclick = () => {\n",
881
+ " google.colab.notebook.generateWithVariable('df_documents');\n",
882
+ " }\n",
883
+ " })();\n",
884
+ " </script>\n",
885
+ " </div>\n",
886
+ "\n",
887
+ " </div>\n",
888
+ " </div>\n"
889
+ ],
890
+ "application/vnd.google.colaboratory.intrinsic+json": {
891
+ "type": "dataframe",
892
+ "variable_name": "df_documents",
893
+ "summary": "{\n \"name\": \"df_documents\",\n \"rows\": 1,\n \"fields\": [\n {\n \"column\": \"path\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"/content/drive/MyDrive/Data/data_cleaned_aisc.pdf\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"text_chunks\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"embeddings\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
894
+ }
895
+ },
896
+ "metadata": {},
897
+ "execution_count": 9
898
+ }
899
+ ]
900
+ },
901
+ {
902
+ "cell_type": "code",
903
+ "source": [
# Stack the per-document embedding matrices into one matrix. Rows follow the
# order of df_documents rows and, within a row, the order of its chunks.
# Documents without any embeddings are skipped: they contribute no chunks, so
# the flat row numbering still lines up with the 'text_chunks' column.
embedding_blocks = [emb for emb in df_documents['embeddings'].tolist() if len(emb) > 0]
if not embedding_blocks:
    raise ValueError(
        "No chunk embeddings available: check that the PDF directory "
        "contains readable PDFs before building the index."
    )

# FAISS expects a C-contiguous float32 matrix.
all_embeddings = np.ascontiguousarray(np.vstack(embedding_blocks), dtype=np.float32)
dimension = all_embeddings.shape[1]

# Exact (brute-force) L2 index over all chunk embeddings.
index = faiss.IndexFlatL2(dimension)
index.add(all_embeddings)
908
+ ],
909
+ "metadata": {
910
+ "id": "eQljqN_ScAuQ"
911
+ },
912
+ "execution_count": 10,
913
+ "outputs": []
914
+ },
915
+ {
916
+ "cell_type": "code",
917
+ "source": [
def find_most_similar_chunks(query, top_k=3):
    """Return the chunks whose embeddings are closest to ``query``.

    The query is embedded with the module-level ``encoder`` and searched in the
    module-level FAISS ``index``. Each flat index row is mapped back to a
    (document, chunk) pair by walking the per-document chunk lists of
    ``df_documents`` in row order.

    Args:
        query: The question text.
        top_k: Maximum number of neighbours to request from the index.

    Returns:
        A list of dicts with keys ``'document'`` (PDF path), ``'chunk'``
        (chunk text) and ``'distance'`` (distance reported by the index),
        in the order returned by the index. It may hold fewer than ``top_k``
        items when the index contains fewer vectors.
    """
    query_embedding = encoder.encode([query])
    distances, indices = index.search(query_embedding, top_k)

    # Pull the columns out once instead of re-indexing the DataFrame per hit.
    paths = df_documents['path'].tolist()
    chunk_lists = df_documents['text_chunks'].tolist()
    total_chunks = sum(len(chunks) for chunks in chunk_lists)

    results = []
    for rank, flat_idx in enumerate(indices[0]):
        # FAISS fills missing neighbours with label -1. Without the lower
        # bound, -1 would pass the range check and [-1] would silently return
        # the last chunk of the first document.
        if not 0 <= flat_idx < total_chunks:
            continue

        # Translate the flat row number into (document row, chunk offset).
        doc_idx = 0
        chunk_idx = int(flat_idx)
        while chunk_idx >= len(chunk_lists[doc_idx]):
            chunk_idx -= len(chunk_lists[doc_idx])
            doc_idx += 1

        results.append({
            'document': paths[doc_idx],
            'chunk': chunk_lists[doc_idx][chunk_idx],
            'distance': distances[0][rank],
        })
    return results
936
+ "\n",
def generate_response(query, context, max_length=1000):
    """Generate an answer to ``query`` grounded in ``context``.

    Builds a "Context / Question / Answer:" prompt, runs the module-level
    ``model`` on it and returns only the newly generated text.

    Args:
        query: The user question.
        context: Retrieved text placed in front of the question.
        max_length: Maximum number of new tokens to generate.

    Returns:
        The generated answer with surrounding whitespace stripped.
    """
    # Earlier experiments used Vietnamese prompt templates (a customer-support
    # persona that must answer "I don't know" when the context lacks the
    # answer); the English template below is the one in use.
    prompt = f"Context: {context}\n\nQuestion: {query}\n\nAnswer:"

    # Send the inputs to wherever the model lives instead of assuming 'cuda',
    # and keep the attention mask so generate() does not have to guess it.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        output = model.generate(**inputs, max_new_tokens=max_length, num_return_sequences=1)

    # Decoder-only models echo the prompt at the start of the output, so skip
    # exactly that many tokens. Searching the decoded text for "Answer:" is
    # fragile: the retrieved context may itself contain that marker.
    # Encoder-decoder models do not echo the prompt, so nothing is skipped.
    if getattr(model.config, "is_encoder_decoder", False):
        prompt_length = 0
    else:
        prompt_length = inputs["input_ids"].shape[1]

    answer = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True)
    return answer.strip()
953
+ "\n",
def query_documents(query):
    """Answer ``query`` from the indexed documents.

    Retrieves the most similar chunks, strips newline characters from each
    chunk, joins them with single spaces into one context string and asks
    ``generate_response`` for an answer based on it.

    Returns:
        A ``(response, similar_chunks)`` tuple: the generated answer and the
        retrieval results it was based on.
    """
    retrieved = find_most_similar_chunks(query)

    # Newlines are removed (not replaced), exactly one space separates chunks.
    flattened = []
    for hit in retrieved:
        flattened.append(hit['chunk'].replace("\n", ""))
    context = " ".join(flattened)

    return generate_response(query, context), retrieved
959
+ ],
960
+ "metadata": {
961
+ "id": "J0IAR3JxcC_m"
962
+ },
963
+ "execution_count": 11,
964
+ "outputs": []
965
+ },
966
+ {
967
+ "cell_type": "code",
968
+ "source": [
# Demo query about thermal paste/pads ("What is a thermal paste/pad?").
# The literal was mis-encoded (UTF-8 bytes rendered in a Greek code page);
# restored to the intended Vietnamese text so the embedding matches the corpus.
query = "Keo-Pad tản nhiệt là gì?"
answer, relevant_chunks = query_documents(query)

print(f"Query: {query}\n\n-----\n")
print(f"Generated answer: {answer}\n\n-----\n")
print("Relevant chunks:")
for chunk in relevant_chunks:
    print(f"Document: {chunk['document']}")
    # Newlines are stripped so each chunk prints on a single line.
    print(f"Chunk: {chunk['chunk']}".replace("\n", ""))
    print(f"Distance: {chunk['distance']}")
    print()
980
+ ],
981
+ "metadata": {
982
+ "id": "eIDf8cKtcFZZ",
983
+ "colab": {
984
+ "base_uri": "https://localhost:8080/"
985
+ },
986
+ "outputId": "e37229c5-531f-4687-d178-ee5daa94af49"
987
+ },
988
+ "execution_count": 12,
989
+ "outputs": [
990
+ {
991
+ "output_type": "stream",
992
+ "name": "stdout",
993
+ "text": [
994
+ "Query: Keo-Pad tản nhiệt là gì?\n",
995
+ "\n",
996
+ "-----\n",
997
+ "\n",
998
+ "Generated answer: Keo-Pad tản nhiệt là một loại vật liệu được sử dụng để lấp đầy khoảng hở giữa bộ xử lý và bộ tản nhiệt, giúp cải thiện khả năng truy ền nhiệt từ bộ xử lý đến bộ tản nhiệt, từ đó giúp giảm nhiệt độ của bộ xử lý.\n",
999
+ "\n",
1000
+ "-----\n",
1001
+ "\n",
1002
+ "Relevant chunks:\n",
1003
+ "Document: /content/drive/MyDrive/Data/data_cleaned_aisc.pdf\n",
1004
+ "Chunk: CΓ³ nhiều loαΊ‘i keo - pad tαΊ£n nhiệt khΓ‘c nhau trΓͺn th α»‹ trường, bao g α»“m keo - pad tαΊ£n nhiệt silicon, keo - pad tαΊ£n nhiệt carbon, keo - pad tαΊ£n nhiệt kim loαΊ‘i lỏng vΓ  keo - pad tαΊ£n nhiệt silicon ceramic. Keo - pad tαΊ£n nhiệt silicon lΓ  gΓ¬? Keo - Pad tαΊ£n nhiệt silicon lΓ  m α»™t loαΊ‘i keo - pad tαΊ£n nhiệt được lΓ m tα»« silicon, cΓ³ Δ‘ α»™ bền cao, khαΊ£ nΔƒng dαΊ«n nhiệt tα»‘t vΓ  giΓ‘ thΓ nh h ợp lΓ½. Keo - pad tαΊ£n nhiệt carbon lΓ  gΓ¬? Keo - Pad tαΊ£n nhiệt carbon lΓ  m α»™t loαΊ‘i keo - pad tαΊ£n nhiệt được lΓ m tα»« carbon, cΓ³ kh αΊ£ nΔƒng dαΊ«n nhiệt tα»‘t vΓ  Δ‘α»™ bền cao, nhΖ°ng giΓ‘ thΓ nh tΖ°Ζ‘ng Δ‘ α»‘i cao. Keo - pad tαΊ£n nhiệt kim loαΊ‘i lỏng lΓ  gΓ¬? Keo - Pad tαΊ£n nhiệt kim loαΊ‘i lỏng lΓ  mα»™t loαΊ‘i keo - pad tαΊ£n nhiệt được lΓ m tα»« kim loαΊ‘i lỏng, cΓ³ khαΊ£ nΔƒng dαΊ«n nhiệt tα»‘t nhαΊ₯t trong cΓ‘c lo αΊ‘i keo - pad tαΊ£n nhiệt, nhΖ°ng giΓ‘ thΓ nh cao vΓ  cΓ³ thể gΓ’y ra nguy cΖ‘ rΓ² r ỉ nαΊΏu khΓ΄ng s α»­ dα»₯ng Δ‘ΓΊng cΓ‘ch. Keo - pad tαΊ£n nhiệt silicon ceramic lΓ  gΓ¬?\n",
1005
+ "Distance: 0.6636487245559692\n",
1006
+ "\n",
1007
+ "Document: /content/drive/MyDrive/Data/data_cleaned_aisc.pdf\n",
1008
+ "Chunk: Keo - Pad tαΊ£n nhiệt lΓ  gΓ¬? Keo - Pad tαΊ£n nhiệt lΓ  mα»™t loαΊ‘i vαΊ­t liệu được sα»­ dα»₯ng để lαΊ₯p Δ‘αΊ§y khoαΊ£ng hở giα»―a bα»™ xα»­ lΓ½ vΓ  bα»™ tαΊ£n nhiệt, giΓΊp cαΊ£i thiện khαΊ£ nΔƒng truy ền nhiệt tα»« bα»™ xα»­ lΓ½ Δ‘αΊΏn bα»™ tαΊ£n nhiệt, tα»« Δ‘Γ³ giΓΊp giαΊ£m nhiệt Δ‘α»™ của bα»™ xα»­ lΓ½ ThΓ nh phαΊ§n của keo - pad tαΊ£n nhiệt lΓ  gΓ¬? ', Keo - Pad tαΊ£n nhiệt được lΓ m tα»« nhiều loαΊ‘i vαΊ­t liệu khΓ‘c nhau, bao g α»“m chαΊ₯t lΓ m αΊ©m, chαΊ₯t kαΊΏt dΓ­nh, ch αΊ₯t Δ‘α»™n vΓ  chαΊ₯t lΓ m tΔƒng Δ‘ α»™ cα»©ng. ThΓ nh ph αΊ§n cα»₯ thể của keo - pad tαΊ£n nhiệt cΓ³ thể thay Δ‘α»•i tΓΉy thuα»™c vΓ o mα»₯c Δ‘Γ­ch sα»­ dα»₯ng. Keo - pad tαΊ£n nhiệt được sα»­ dα»₯ng nhΖ° th αΊΏ nΓ o? Keo - Pad tαΊ£n nhiệt được sα»­ dα»₯ng bαΊ±ng cΓ‘ch thoa m α»™t lα»›p mỏng lΓͺn bề mαΊ·t của bα»™ xα»­ lΓ½, sau Δ‘Γ³ dΓ‘n b α»™ tαΊ£n nhiệt lΓͺn trΓͺn. L α»›p keo - pad tαΊ£n nhiệt sαΊ½ lαΊ₯p Δ‘αΊ§y khoαΊ£ng hở giα»―a bα»™ xα»­ lΓ½ vΓ  bα»™ tαΊ£n nhiệt, giΓΊp cαΊ£i thiện khαΊ£ nΔƒng truy ền nhiệt tα»« bα»™ xα»­ lΓ½ Δ‘αΊΏn bα»™ tαΊ£n nhiệt. Nhα»―ng loαΊ‘i keo - pad tαΊ£n nhiệt phα»• biαΊΏn lΓ  gΓ¬?\n",
1009
+ "Distance: 0.6936659812927246\n",
1010
+ "\n",
1011
+ "Document: /content/drive/MyDrive/Data/data_cleaned_aisc.pdf\n",
1012
+ "Chunk: Keo - Pad tαΊ£n nhiệt silicon ceramic lΓ  m α»™t loαΊ‘i keo - pad tαΊ£n nhiệt được lΓ m tα»« silicon vΓ  ceramic, cΓ³ kh αΊ£ nΔƒng dαΊ«n nhiệt tα»‘t, Δ‘α»™ bền cao vΓ  giΓ‘ thΓ nh h ợp lΓ½. LoαΊ‘i keo - pad tαΊ£n nhiệt nΓ o phΓΉ h ợp vα»›i tΓ΄i? Lα»±a chọn loαΊ‘i keo - pad tαΊ£n nhiệt phΓΉ hợp phα»₯ thuα»™c vΓ o nhi ều yαΊΏu tα»‘, bao gα»“m loαΊ‘i bα»™ xα»­ lΓ½, loαΊ‘i bα»™ tαΊ£n nhiệt, mα»©c nhiệt Δ‘α»™ hoαΊ‘t Δ‘α»™ng mong mu α»‘n vΓ  ngΓ’n sΓ‘ch c ủa bαΊ‘n. BαΊ‘n nΓͺn tham khαΊ£o Γ½ kiαΊΏn của chuyΓͺn gia ho αΊ·c đọc cΓ‘c bΓ i Δ‘Γ‘nh giΓ‘ Δ‘ ể lα»±a chọn loαΊ‘i keo - pad tαΊ£n nhiệt phΓΉ hợp nhαΊ₯t. TΓ΄i nΓͺn mua keo - pad tαΊ£n nhiệt ở Δ‘Γ’u? BαΊ‘n cΓ³ thể mua keo - pad tαΊ£n nhiệt tαΊ‘i cΓ‘c cα»­a hΓ ng bΓ‘n linh ki ện mΓ‘y tΓ­nh ho αΊ·c cΓ‘c trang thΖ°Ζ‘ng m αΊ‘i Δ‘iện tα»­. Tuy nhiΓͺn, b αΊ‘n nΓͺn chọn mua sαΊ£n phαΊ©m tα»« nhα»―ng nhΓ  cung c αΊ₯p uy tΓ­n để Δ‘αΊ£m bαΊ£o chαΊ₯t lượng vΓ  trΓ‘nh mua ph αΊ£i hΓ ng giαΊ£, hΓ ng kΓ©m ch αΊ₯t lượng.' QuαΊ§n jeans nam cΓ³ nh α»―ng loαΊ‘i vαΊ£i nΓ o? QuαΊ§n jeans nam cΓ³ nhi ều loαΊ‘i vαΊ£i khΓ‘c nhau, ph α»• biαΊΏn nhαΊ₯t lΓ  vαΊ£i denim, v αΊ£i kaki, vαΊ£i bα»‘ vΓ  vαΊ£i nhung. Đặc Δ‘iểm của tα»«ng loαΊ‘i vαΊ£i lΓ  gΓ¬?\n",
1013
+ "Distance: 0.7591948509216309\n",
1014
+ "\n"
1015
+ ]
1016
+ }
1017
+ ]
1018
+ },
1019
+ {
1020
+ "cell_type": "code",
1021
+ "source": [
# Demo query: "I want winter clothes for children".
# The literal was mis-encoded (UTF-8 bytes rendered in a Greek code page);
# restored to the intended Vietnamese text so the embedding matches the corpus.
query = "Tôi muốn quần áo mặc cho mùa đông cho trẻ em"
answer, relevant_chunks = query_documents(query)

print(f"Query: {query}\n\n-----\n")
print(f"Generated answer: {answer}\n\n-----\n")
print("Relevant chunks:")
for chunk in relevant_chunks:
    print(f"Document: {chunk['document']}")
    # Newlines are stripped so each chunk prints on a single line.
    print(f"Chunk: {chunk['chunk']}".replace("\n", ""))
    print(f"Distance: {chunk['distance']}")
    print()
1033
+ ],
1034
+ "metadata": {
1035
+ "id": "u0T-08hneR77",
1036
+ "colab": {
1037
+ "base_uri": "https://localhost:8080/"
1038
+ },
1039
+ "outputId": "8987764c-21af-42df-df87-4477ee275314"
1040
+ },
1041
+ "execution_count": 13,
1042
+ "outputs": [
1043
+ {
1044
+ "output_type": "stream",
1045
+ "name": "stdout",
1046
+ "text": [
1047
+ "Query: Tôi muốn quần áo mặc cho mùa đông cho trẻ em\n",
1048
+ "\n",
1049
+ "-----\n",
1050
+ "\n",
1051
+ "Generated answer: BαΊ‘n muα»‘n tΓ¬m quαΊ§n Γ‘o mΓΉa Δ‘Γ΄ng cho trαΊ» em, vαΊ­y nΓͺn cαΊ§n lΖ°u Γ½ nhα»―ng Δ‘iều sau:\n",
1052
+ "\n",
1053
+ "**1. ChαΊ₯t liệu:** \n",
1054
+ " - Chọn quαΊ§n Γ‘o lΓ m tα»« chαΊ₯t liệu αΊ₯m Γ‘p, giα»― nhiệt tα»‘t nhΖ°: Fleece, Thicken Wool, Cotton, Flannel.\n",
1055
+ " - Kiểm tra xem chαΊ₯t liệu cΓ³ mềm mαΊ‘i, dα»… chα»‹u cho trαΊ» khΓ΄ng.\n",
1056
+ "\n",
1057
+ "**2. ThiαΊΏt kαΊΏ:** \n",
1058
+ " - TΓΉy theo Δ‘α»™ tuα»•i vΓ  sở thΓ­ch của trαΊ», lα»±a chọn quαΊ§n Γ‘o cΓ³ thiαΊΏt kαΊΏ phΓΉ hợp. \n",
1059
+ " - Kiểm tra xem quαΊ§n Γ‘o cΓ³ đủ cΓ‘c lα»›p để giα»― αΊ₯m, trΓ‘nh bα»‹ lαΊ‘nh.\n",
1060
+ "\n",
1061
+ "**3. Độ bền:** \n",
1062
+ " - Chọn quαΊ§n Γ‘o cΓ³ Δ‘α»™ bền cao, dα»… dΓ ng giαΊ·t sαΊ‘ch. \n",
1063
+ " - Kiểm tra xem quαΊ§n Γ‘o cΓ³ đường may chαΊ―c chαΊ―n, khΓ³a kΓ©o vΓ  phα»₯ kiện tα»‘t.\n",
1064
+ "\n",
1065
+ "**4. MΓ u sαΊ―c:** \n",
1066
+ " - Lα»±a chọn mΓ u sαΊ―c phΓΉ hợp vα»›i sở thΓ­ch của trαΊ». \n",
1067
+ " - MΓ u sαΊ―c tΖ°Ζ‘i sΓ‘ng, dα»… nhΓ¬n sαΊ½ giΓΊp trαΊ» cαΊ£m thαΊ₯y vui vαΊ».\n",
1068
+ "\n",
1069
+ "**5. GiΓ‘ cαΊ£:** \n",
1070
+ " - Lα»±a chọn quαΊ§n Γ‘o phΓΉ hợp vα»›i ngΓ’n sΓ‘ch của gia Δ‘Γ¬nh. \n",
1071
+ " - LΖ°u Γ½ giΓ‘ cαΊ£ cΓ³ thể thay Δ‘α»•i tΓΉy theo thΖ°Ζ‘ng hiệu vΓ  chαΊ₯t liệu.\n",
1072
+ "\n",
1073
+ "**6. ThΖ°Ζ‘ng hiệu:** \n",
1074
+ " - Lα»±a chọn thΖ°Ζ‘ng hiệu uy tΓ­n, cΓ³ chαΊ₯t lượng tα»‘t. \n",
1075
+ " - Tham khαΊ£o Γ½ kiαΊΏn tα»« người thΓ’n, bαΊ‘n bΓ¨ để chọn được thΖ°Ζ‘ng hiệu phΓΉ hợp.\n",
1076
+ "\n",
1077
+ "**7. LΖ°u Γ½:** \n",
1078
+ " - LΖ°u Γ½ Δ‘αΊΏn kΓ­ch thΖ°α»›c quαΊ§n Γ‘o phΓΉ hợp vα»›i chiều cao vΓ  cΓ’n nαΊ·ng của trαΊ». \n",
1079
+ " - Kiểm tra xem quαΊ§n Γ‘o cΓ³ đủ cΓ‘c lα»›p để giα»― αΊ₯m, trΓ‘nh bα»‹ lαΊ‘nh. \n",
1080
+ " - LΖ°u Γ½ Δ‘αΊΏn cΓ‘c thΓ΄ng tin về bαΊ£o hΓ nh, chαΊΏ Δ‘α»™ Δ‘α»•i trαΊ£ của cα»­a hΓ ng.\n",
1081
+ "\n",
1082
+ "-----\n",
1083
+ "\n",
1084
+ "Relevant chunks:\n",
1085
+ "Document: /content/drive/MyDrive/Data/data_cleaned_aisc.pdf\n",
1086
+ "Chunk: ', '' '', '' 'NαΊΏu tΓ΄i muα»‘n tΓ¬m phα»₯ kiện cΖ°α»›i được lΓ m tα»« chαΊ₯t liệu cao cαΊ₯p nhΖ°ng v αΊ«n nαΊ±m trong t αΊ§m giΓ‘ của mΓ¬nh thΓ¬ cΓ³ nh α»―ng lα»±a chọn nΓ o?\n",
1087
+ "Distance: 0.6322872042655945\n",
1088
+ "\n",
1089
+ "Document: /content/drive/MyDrive/Data/data_cleaned_aisc.pdf\n",
1090
+ "Chunk: NαΊΏu bαΊ‘n muα»‘n mua mα»™t chiαΊΏc Γ‘o khoΓ‘c giΓ³ v α»«a tΓΊi tiền hΖ‘n, bαΊ‘n cΓ³ thể tΓ¬m cΓ‘c sαΊ£n phαΊ©m của cΓ‘c thΖ°Ζ‘ng hi ệu Việt Nam nhΖ° Weill, Mucino, Canifa, An PhΖ° α»›c, ...Vα»›i mα»©c giΓ‘ tα»« 200.000 Δ‘ α»“ng Δ‘αΊΏn 500.000 Δ‘ α»“ng, bαΊ‘n vαΊ«n cΓ³ thể sở hα»―u mα»™t chiαΊΏc Γ‘o khoΓ‘c giΓ³ chαΊ₯t lượng tα»‘t. ', '' '', '' 'C αΊ£m Ζ‘n chuyΓͺn gia, tΓ΄i Δ‘Γ£ hi ểu hΖ‘n về cΓ‘ch chọn Γ‘o khoΓ‘c giΓ³ ch αΊ₯t lượng cao. TΓ΄i s αΊ½ tham khαΊ£o nhα»―ng thΓ΄ng tin nΓ y Δ‘ ể mua được chiαΊΏc Γ‘o khoΓ‘c giΓ³ Ζ°ng Γ½. ', '' '', '' 'RαΊ₯t vui vΓ¬ tΓ΄i cΓ³ th ể giΓΊp bαΊ‘n chọn được chiαΊΏc Γ‘o khoΓ‘c giΓ³ ch αΊ₯t lượng cao phΓΉ h ợp vα»›i nhu cαΊ§u của mΓ¬nh. ChΓΊc b αΊ‘n mua sαΊ―m vui vαΊ»!' '' '', '' 'TΓ΄i mu α»‘n biαΊΏt cΓ‘ch Δ‘Γ‘nh giΓ‘ ch αΊ₯t lượng viΓͺn nΓ©n cΓ  phΓͺ. B αΊ‘n cΓ³ thể giΓΊp tΓ΄i khΓ΄ng? ', '' '', '' 'ChαΊ―c chαΊ―n rα»“i. CΓ³ mα»™t sα»‘ cΓ‘ch để Δ‘Γ‘nh giΓ‘ ch αΊ₯t lượng viΓͺn nΓ©n cΓ  phΓͺ. B αΊ‘n cΓ³ thể kiểm tra bao bΓ¬, thΓ nh ph αΊ§n, hΖ°Ζ‘ng v α»‹, Δ‘α»™ tΖ°Ζ‘i vΓ  tΓ­nh nh αΊ₯t quΓ‘n của viΓͺn nΓ©n. ', '' '', '' 'TΓ΄i nΓͺn kiểm tra nhα»―ng gΓ¬ trΓͺn bao bΓ¬ viΓͺn nΓ©n cΓ  phΓͺ?\n",
1091
+ "Distance: 0.6948944330215454\n",
1092
+ "\n",
1093
+ "Document: /content/drive/MyDrive/Data/data_cleaned_aisc.pdf\n",
1094
+ "Chunk: Chọn chαΊ₯t liệu phΓΉ hợp BαΊ‘n nΓͺn chọn Balo được lΓ m tα»« chαΊ₯t liệu cao cαΊ₯p, cΓ³ khαΊ£ nΔƒng chα»‘ng thαΊ₯m nΖ°α»›c vΓ  Δ‘α»™ bền cao.\\n\\n3. Kiểm tra chαΊ₯t lượng BαΊ‘n nΓͺn kiểm tra kα»Ή chαΊ₯t lượng của Balo trΖ° α»›c khi mua, bao g α»“m đường may, khΓ³a kΓ©o vΓ  ph α»₯ kiện.'\n",
1095
+ "Distance: 0.6998488903045654\n",
1096
+ "\n"
1097
+ ]
1098
+ }
1099
+ ]
1100
+ },
1101
+ {
1102
+ "cell_type": "code",
1103
+ "source": [
# Demo query outside the corpus: "Who is President Ho Chi Minh?".
# The literal was mis-encoded (UTF-8 bytes rendered in a Greek code page);
# restored to the intended Vietnamese text so the embedding matches the corpus.
query = "Chủ tịch Hồ Chí Minh là ai?"
answer, relevant_chunks = query_documents(query)

print(f"Query: {query}\n\n-----\n")
print(f"Generated answer: {answer}\n\n-----\n")
print("Relevant chunks:")
for chunk in relevant_chunks:
    print(f"Document: {chunk['document']}")
    # Newlines are stripped so each chunk prints on a single line.
    print(f"Chunk: {chunk['chunk']}".replace("\n", ""))
    print(f"Distance: {chunk['distance']}")
    print()
1115
+ ],
1116
+ "metadata": {
1117
+ "id": "7Tw9KouChHAS",
1118
+ "colab": {
1119
+ "base_uri": "https://localhost:8080/"
1120
+ },
1121
+ "outputId": "ca577e46-5042-412a-c8ae-fd175f14c699"
1122
+ },
1123
+ "execution_count": 14,
1124
+ "outputs": [
1125
+ {
1126
+ "output_type": "stream",
1127
+ "name": "stdout",
1128
+ "text": [
1129
+ "Query: Chủ tịch Hồ Chí Minh là ai?\n",
1130
+ "\n",
1131
+ "-----\n",
1132
+ "\n",
1133
+ "Generated answer: Chủ tịch Hồ Chí Minh là một nhà cách mạng, chính trị gia, và nhà văn Việt Nam.\n",
1134
+ "\n",
1135
+ "-----\n",
1136
+ "\n",
1137
+ "Relevant chunks:\n",
1138
+ "Document: /content/drive/MyDrive/Data/data_cleaned_aisc.pdf\n",
1139
+ "Chunk: ', '' '', '' 'M α»™t sα»‘ thΖ°Ζ‘ng hi ệu Δ‘α»“ng hα»“ trαΊ» em được Δ‘Γ‘nh giΓ‘ cao bao g α»“m Casio, Citizen, Seiko, Timex, Daniel Wellington, Skmei, APELA, Olympia Star,...', '' '', '' 'CΓ³ Δ‘ α»“ng hα»“ trαΊ» em nΓ o cΓ³ th ể sα»­ dα»₯ng cho trαΊ» nhỏ tα»« 2-3 tuα»•i khΓ΄ng? ', '' '', '' 'CΓ³, m α»™t sα»‘ thΖ°Ζ‘ng hi ệu Δ‘α»“ng hα»“ trαΊ» em cΓ³ sαΊ£n xuαΊ₯t Δ‘α»“ng hα»“ dΓ nh riΓͺng cho trαΊ» nhỏ tα»« 2-3 tuα»•i vα»›i thiαΊΏt kαΊΏ Δ‘Ζ‘n giαΊ£n, dΓ’y Δ‘eo m ềm mαΊ‘i. ', '' '', '' 'M α»™t chiαΊΏc Δ‘α»“ng hα»“ thΓ΄ng minh dΓ nh cho tr αΊ» em cΓ³ nh α»―ng tΓ­nh nΔƒng h α»―u Γ­ch nΓ o? ', '' '', '' 'Đ α»“ng hα»“ thΓ΄ng minh dΓ nh cho tr αΊ» em thường cΓ³ cΓ‘c tΓ­nh nΔƒng nhΖ° g ọi Δ‘iện, nhαΊ―n tin, Δ‘α»‹nh vα»‹ GPS, theo dΓ΅i hoαΊ‘t Δ‘α»™ng, chΖ‘i trΓ² chΖ‘i, k αΊΏt nα»‘i vα»›i thiαΊΏt bα»‹ di Δ‘α»™ng,... giΓΊp ph α»₯ huynh cΓ³ th ể quαΊ£n lΓ½ vΓ  giΓ‘m sΓ‘t tr αΊ» dα»… dΓ ng hΖ‘n. ', '' '', '' 'Đ α»“ng hα»“ trαΊ» em nΓͺn cΓ³ m α»©c chα»‘ng nΖ°α»›c nhΖ° thαΊΏ nΓ o? ', '' '', '' 'TΓΉy thu α»™c vΓ o nhu c αΊ§u sα»­ dα»₯ng, nhΖ°ng b αΊ‘n nΓͺn chọn Δ‘α»“ng hα»“ trαΊ» em cΓ³ kh αΊ£ nΔƒng chα»‘ng nΖ°α»›c Γ­t nhαΊ₯t lΓ  3 ATM (30 mΓ©t) Δ‘ ể cΓ³ thể chα»‹u được nΖ°α»›c bαΊ―n vΓ o hoαΊ·c rα»­a tay.'\n",
1140
+ "Distance: 0.7290650606155396\n",
1141
+ "\n",
1142
+ "Document: /content/drive/MyDrive/Data/data_cleaned_aisc.pdf\n",
1143
+ "Chunk: Nα»™i dung sΓ‘ch ph αΊ£i được trΓ¬nh bΓ y khoa h ọc, logic.\\n\\n* HΓ¬nh αΊ£nh minh h ọa HΓ¬nh αΊ£nh minh họa trong sΓ‘ch phαΊ£i được in sαΊ―c nΓ©t, rΓ΅ rΓ ng. HΓ¬nh αΊ£nh phαΊ£i phΓΉ hợp vα»›i nα»™i dung sΓ‘ch vΓ  giΓΊp ngΖ° ời đọc dα»… hiểu hΖ‘n. ', '' '', '' 'Đ αΊ·c Δ‘iểm nΓ o thể hiện sαΊ£n phαΊ©m nΓ y chΓΊ tr ọng Δ‘αΊΏn tΓ­nh xΓ‘c th α»±c của thΓ΄ng tin? ', '' '', '' 'SΓ‘ch BΓ  m αΊΉ - Em bΓ© chΓΊ tr ọng Δ‘αΊΏn tΓ­nh xΓ‘c th α»±c của thΓ΄ng tin thΓ΄ng qua cΓ‘c Δ‘αΊ·c Δ‘iểm sau\\n\\n* TΓ‘c giαΊ£ SΓ‘ch được viαΊΏt bởi cΓ‘c chuyΓͺn gia cΓ³ uy tΓ­n trong lΔ©nh v α»±c sα»©c khỏe bΓ  mαΊΉ vΓ  trαΊ» em. CΓ‘c chuyΓͺn gia nΓ y Δ‘Γ£ cΓ³ nhi ều nΔƒm kinh nghi ệm vΓ  kiαΊΏn thα»©c chuyΓͺn mΓ΄n v α»―ng chαΊ―c.\\n\\n* Dα»― liệu SΓ‘ch sα»­ dα»₯ng cΓ‘c dα»― liệu khoa học để hα»— trợ cho cΓ‘c thΓ΄ng tin Δ‘Ζ° ợc trΓ¬nh bΓ y. CΓ‘c d α»― liệu nΓ y được thu thαΊ­p tα»« cΓ‘c nghiΓͺn c α»©u Δ‘Γ‘ng tin cαΊ­y.\\n\\n* TΓ i liệu tham kh αΊ£o SΓ‘ch cung c αΊ₯p danh sΓ‘ch cΓ‘c tΓ i li ệu tham kh αΊ£o để người đọc cΓ³ thể tΓ¬m hiểu thΓͺm thΓ΄ng tin v ề cΓ‘c chủ đề được đề cαΊ­p trong sΓ‘ch. ', '' '', '' 'L ợi Γ­ch của việc sα»­ dα»₯ng sΓ‘ch BΓ  m αΊΉ - Em bΓ© lΓ  gΓ¬?\n",
1144
+ "Distance: 0.7367197275161743\n",
1145
+ "\n",
1146
+ "Document: /content/drive/MyDrive/Data/data_cleaned_aisc.pdf\n",
1147
+ "Chunk: ', '' '', '' 'Đ ể Δ‘Γ‘nh giΓ‘ ch αΊ₯t lượng Bia Nα»™i Địa, bαΊ‘n cΓ³ thể dα»±a trΓͺn cΓ‘c tiΓͺu chΓ­ sau \\n\\n* **MΓΉi hΖ°Ζ‘ng** Bia cΓ³ mΓΉi thΖ‘m Δ‘ αΊ·c trΖ°ng, khΓ΄ng cΓ³ mΓΉi chua hay hΓ΄i. \\n* **Vα»‹** Bia cΓ³ v α»‹ Δ‘αΊ―ng nhαΊΉ, hΖ‘i ngọt vΓ  cΓ³ hαΊ­u vα»‹ dα»… chα»‹u.\\n* **MΓ u s αΊ―c** Bia cΓ³ mΓ u vΓ ng Γ³ng, trong su α»‘t vΓ  khΓ΄ng cΓ³ c αΊ·n.\\n* **Bọt** Bia cΓ³ lα»›p bọt dΓ y, mα»‹n vΓ  tan d αΊ§n sau mα»™t thời gian.\\n* **Độ cα»“n** Bia cΓ³ Δ‘ α»™ cα»“n tα»« 4% Δ‘αΊΏn 6%. ', '' '', '' 'Nh α»―ng Δ‘αΊ·c Δ‘iểm nΓ o của Bia Nα»™i Địa thể hiện Δ‘Γ’y lΓ  sαΊ£n phαΊ©m chαΊ₯t lượng cao?\n",
1148
+ "Distance: 0.7405332326889038\n",
1149
+ "\n"
1150
+ ]
1151
+ }
1152
+ ]
1153
+ },
1154
+ {
1155
+ "cell_type": "code",
1156
+ "source": [
1157
+ "!pip install flask flask-ngrok"
1158
+ ],
1159
+ "metadata": {
1160
+ "id": "ghDyh70Eyntt",
1161
+ "outputId": "0f358e02-de83-4dea-dd66-b699f1a97af8",
1162
+ "colab": {
1163
+ "base_uri": "https://localhost:8080/"
1164
+ }
1165
+ },
1166
+ "execution_count": 15,
1167
+ "outputs": [
1168
+ {
1169
+ "output_type": "stream",
1170
+ "name": "stdout",
1171
+ "text": [
1172
+ "Requirement already satisfied: flask in /usr/local/lib/python3.10/dist-packages (3.0.3)\n",
1173
+ "Requirement already satisfied: flask-ngrok in /usr/local/lib/python3.10/dist-packages (0.0.25)\n",
1174
+ "Requirement already satisfied: Werkzeug>=3.0.0 in /usr/local/lib/python3.10/dist-packages (from flask) (3.1.3)\n",
1175
+ "Requirement already satisfied: Jinja2>=3.1.2 in /usr/local/lib/python3.10/dist-packages (from flask) (3.1.4)\n",
1176
+ "Requirement already satisfied: itsdangerous>=2.1.2 in /usr/local/lib/python3.10/dist-packages (from flask) (2.2.0)\n",
1177
+ "Requirement already satisfied: click>=8.1.3 in /usr/local/lib/python3.10/dist-packages (from flask) (8.1.7)\n",
1178
+ "Requirement already satisfied: blinker>=1.6.2 in /usr/local/lib/python3.10/dist-packages (from flask) (1.9.0)\n",
1179
+ "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from flask-ngrok) (2.32.3)\n",
1180
+ "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from Jinja2>=3.1.2->flask) (3.0.2)\n",
1181
+ "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->flask-ngrok) (3.4.0)\n",
1182
+ "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->flask-ngrok) (3.10)\n",
1183
+ "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->flask-ngrok) (2.2.3)\n",
1184
+ "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->flask-ngrok) (2024.8.30)\n"
1185
+ ]
1186
+ }
1187
+ ]
1188
+ },
1189
+ {
1190
+ "cell_type": "code",
1191
+ "source": [
1192
+ "!pip install pyngrok"
1193
+ ],
1194
+ "metadata": {
1195
+ "id": "humLGSVY0Nn2",
1196
+ "outputId": "206f0451-e512-4783-e4c4-25befa6a5b92",
1197
+ "colab": {
1198
+ "base_uri": "https://localhost:8080/"
1199
+ }
1200
+ },
1201
+ "execution_count": 18,
1202
+ "outputs": [
1203
+ {
1204
+ "output_type": "stream",
1205
+ "name": "stdout",
1206
+ "text": [
1207
+ "Collecting pyngrok\n",
1208
+ " Downloading pyngrok-7.2.1-py3-none-any.whl.metadata (8.3 kB)\n",
1209
+ "Requirement already satisfied: PyYAML>=5.1 in /usr/local/lib/python3.10/dist-packages (from pyngrok) (6.0.2)\n",
1210
+ "Downloading pyngrok-7.2.1-py3-none-any.whl (22 kB)\n",
1211
+ "Installing collected packages: pyngrok\n",
1212
+ "Successfully installed pyngrok-7.2.1\n"
1213
+ ]
1214
+ }
1215
+ ]
1216
+ },
1217
+ {
1218
+ "cell_type": "code",
1219
+ "source": [
1220
+ "!pip install gradio"
1221
+ ],
1222
+ "metadata": {
1223
+ "id": "G-vwpY1B1Sc4",
1224
+ "outputId": "95db1828-d5b0-4d5d-9a1e-ff78bf2adf3e",
1225
+ "colab": {
1226
+ "base_uri": "https://localhost:8080/",
1227
+ "height": 1000
1228
+ }
1229
+ },
1230
+ "execution_count": 21,
1231
+ "outputs": [
1232
+ {
1233
+ "output_type": "stream",
1234
+ "name": "stdout",
1235
+ "text": [
1236
+ "Collecting gradio\n",
1237
+ " Downloading gradio-5.8.0-py3-none-any.whl.metadata (16 kB)\n",
1238
+ "Collecting aiofiles<24.0,>=22.0 (from gradio)\n",
1239
+ " Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)\n",
1240
+ "Requirement already satisfied: anyio<5.0,>=3.0 in /usr/local/lib/python3.10/dist-packages (from gradio) (3.7.1)\n",
1241
+ "Collecting fastapi<1.0,>=0.115.2 (from gradio)\n",
1242
+ " Downloading fastapi-0.115.6-py3-none-any.whl.metadata (27 kB)\n",
1243
+ "Collecting ffmpy (from gradio)\n",
1244
+ " Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)\n",
1245
+ "Collecting gradio-client==1.5.1 (from gradio)\n",
1246
+ " Downloading gradio_client-1.5.1-py3-none-any.whl.metadata (7.1 kB)\n",
1247
+ "Requirement already satisfied: httpx>=0.24.1 in /usr/local/lib/python3.10/dist-packages (from gradio) (0.28.0)\n",
1248
+ "Requirement already satisfied: huggingface-hub>=0.25.1 in /usr/local/lib/python3.10/dist-packages (from gradio) (0.26.3)\n",
1249
+ "Requirement already satisfied: jinja2<4.0 in /usr/local/lib/python3.10/dist-packages (from gradio) (3.1.4)\n",
1250
+ "Collecting markupsafe~=2.0 (from gradio)\n",
1251
+ " Downloading MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)\n",
1252
+ "Requirement already satisfied: numpy<3.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from gradio) (1.26.4)\n",
1253
+ "Requirement already satisfied: orjson~=3.0 in /usr/local/lib/python3.10/dist-packages (from gradio) (3.10.12)\n",
1254
+ "Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from gradio) (24.2)\n",
1255
+ "Requirement already satisfied: pandas<3.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from gradio) (2.2.2)\n",
1256
+ "Requirement already satisfied: pillow<12.0,>=8.0 in /usr/local/lib/python3.10/dist-packages (from gradio) (11.0.0)\n",
1257
+ "Requirement already satisfied: pydantic>=2.0 in /usr/local/lib/python3.10/dist-packages (from gradio) (2.10.3)\n",
1258
+ "Collecting pydub (from gradio)\n",
1259
+ " Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)\n",
1260
+ "Collecting python-multipart>=0.0.18 (from gradio)\n",
1261
+ " Downloading python_multipart-0.0.19-py3-none-any.whl.metadata (1.8 kB)\n",
1262
+ "Requirement already satisfied: pyyaml<7.0,>=5.0 in /usr/local/lib/python3.10/dist-packages (from gradio) (6.0.2)\n",
1263
+ "Collecting ruff>=0.2.2 (from gradio)\n",
1264
+ " Downloading ruff-0.8.2-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)\n",
1265
+ "Collecting safehttpx<0.2.0,>=0.1.6 (from gradio)\n",
1266
+ " Downloading safehttpx-0.1.6-py3-none-any.whl.metadata (4.2 kB)\n",
1267
+ "Collecting semantic-version~=2.0 (from gradio)\n",
1268
+ " Downloading semantic_version-2.10.0-py2.py3-none-any.whl.metadata (9.7 kB)\n",
1269
+ "Collecting starlette<1.0,>=0.40.0 (from gradio)\n",
1270
+ " Downloading starlette-0.41.3-py3-none-any.whl.metadata (6.0 kB)\n",
1271
+ "Collecting tomlkit<0.14.0,>=0.12.0 (from gradio)\n",
1272
+ " Downloading tomlkit-0.13.2-py3-none-any.whl.metadata (2.7 kB)\n",
1273
+ "Requirement already satisfied: typer<1.0,>=0.12 in /usr/local/lib/python3.10/dist-packages (from gradio) (0.15.0)\n",
1274
+ "Requirement already satisfied: typing-extensions~=4.0 in /usr/local/lib/python3.10/dist-packages (from gradio) (4.12.2)\n",
1275
+ "Collecting uvicorn>=0.14.0 (from gradio)\n",
1276
+ " Downloading uvicorn-0.32.1-py3-none-any.whl.metadata (6.6 kB)\n",
1277
+ "Requirement already satisfied: fsspec in /usr/local/lib/python3.10/dist-packages (from gradio-client==1.5.1->gradio) (2024.10.0)\n",
1278
+ "Collecting websockets<15.0,>=10.0 (from gradio-client==1.5.1->gradio)\n",
1279
+ " Downloading websockets-14.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)\n",
1280
+ "Requirement already satisfied: idna>=2.8 in /usr/local/lib/python3.10/dist-packages (from anyio<5.0,>=3.0->gradio) (3.10)\n",
1281
+ "Requirement already satisfied: sniffio>=1.1 in /usr/local/lib/python3.10/dist-packages (from anyio<5.0,>=3.0->gradio) (1.3.1)\n",
1282
+ "Requirement already satisfied: exceptiongroup in /usr/local/lib/python3.10/dist-packages (from anyio<5.0,>=3.0->gradio) (1.2.2)\n",
1283
+ "Requirement already satisfied: certifi in /usr/local/lib/python3.10/dist-packages (from httpx>=0.24.1->gradio) (2024.8.30)\n",
1284
+ "Requirement already satisfied: httpcore==1.* in /usr/local/lib/python3.10/dist-packages (from httpx>=0.24.1->gradio) (1.0.7)\n",
1285
+ "Requirement already satisfied: h11<0.15,>=0.13 in /usr/local/lib/python3.10/dist-packages (from httpcore==1.*->httpx>=0.24.1->gradio) (0.14.0)\n",
1286
+ "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.25.1->gradio) (3.16.1)\n",
1287
+ "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.25.1->gradio) (2.32.3)\n",
1288
+ "Requirement already satisfied: tqdm>=4.42.1 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.25.1->gradio) (4.66.6)\n",
1289
+ "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas<3.0,>=1.0->gradio) (2.8.2)\n",
1290
+ "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas<3.0,>=1.0->gradio) (2024.2)\n",
1291
+ "Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.10/dist-packages (from pandas<3.0,>=1.0->gradio) (2024.2)\n",
1292
+ "Requirement already satisfied: annotated-types>=0.6.0 in /usr/local/lib/python3.10/dist-packages (from pydantic>=2.0->gradio) (0.7.0)\n",
1293
+ "Requirement already satisfied: pydantic-core==2.27.1 in /usr/local/lib/python3.10/dist-packages (from pydantic>=2.0->gradio) (2.27.1)\n",
1294
+ "Requirement already satisfied: click>=8.0.0 in /usr/local/lib/python3.10/dist-packages (from typer<1.0,>=0.12->gradio) (8.1.7)\n",
1295
+ "Requirement already satisfied: shellingham>=1.3.0 in /usr/local/lib/python3.10/dist-packages (from typer<1.0,>=0.12->gradio) (1.5.4)\n",
1296
+ "Requirement already satisfied: rich>=10.11.0 in /usr/local/lib/python3.10/dist-packages (from typer<1.0,>=0.12->gradio) (13.9.4)\n",
1297
+ "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.2->pandas<3.0,>=1.0->gradio) (1.16.0)\n",
1298
+ "Requirement already satisfied: markdown-it-py>=2.2.0 in /usr/local/lib/python3.10/dist-packages (from rich>=10.11.0->typer<1.0,>=0.12->gradio) (3.0.0)\n",
1299
+ "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.10/dist-packages (from rich>=10.11.0->typer<1.0,>=0.12->gradio) (2.18.0)\n",
1300
+ "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface-hub>=0.25.1->gradio) (3.4.0)\n",
1301
+ "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface-hub>=0.25.1->gradio) (2.2.3)\n",
1302
+ "Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.10/dist-packages (from markdown-it-py>=2.2.0->rich>=10.11.0->typer<1.0,>=0.12->gradio) (0.1.2)\n",
1303
+ "Downloading gradio-5.8.0-py3-none-any.whl (57.2 MB)\n",
1304
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m57.2/57.2 MB\u001b[0m \u001b[31m12.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
1305
+ "\u001b[?25hDownloading gradio_client-1.5.1-py3-none-any.whl (320 kB)\n",
1306
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━��━━━━━━━━━━━━━━\u001b[0m \u001b[32m320.2/320.2 kB\u001b[0m \u001b[31m25.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
1307
+ "\u001b[?25hDownloading aiofiles-23.2.1-py3-none-any.whl (15 kB)\n",
1308
+ "Downloading fastapi-0.115.6-py3-none-any.whl (94 kB)\n",
1309
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m94.8/94.8 kB\u001b[0m \u001b[31m9.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
1310
+ "\u001b[?25hDownloading MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (25 kB)\n",
1311
+ "Downloading python_multipart-0.0.19-py3-none-any.whl (24 kB)\n",
1312
+ "Downloading ruff-0.8.2-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.2 MB)\n",
1313
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m11.2/11.2 MB\u001b[0m \u001b[31m68.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
1314
+ "\u001b[?25hDownloading safehttpx-0.1.6-py3-none-any.whl (8.7 kB)\n",
1315
+ "Downloading semantic_version-2.10.0-py2.py3-none-any.whl (15 kB)\n",
1316
+ "Downloading starlette-0.41.3-py3-none-any.whl (73 kB)\n",
1317
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m73.2/73.2 kB\u001b[0m \u001b[31m8.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
1318
+ "\u001b[?25hDownloading tomlkit-0.13.2-py3-none-any.whl (37 kB)\n",
1319
+ "Downloading uvicorn-0.32.1-py3-none-any.whl (63 kB)\n",
1320
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m63.8/63.8 kB\u001b[0m \u001b[31m7.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
1321
+ "\u001b[?25hDownloading ffmpy-0.4.0-py3-none-any.whl (5.8 kB)\n",
1322
+ "Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)\n",
1323
+ "Downloading websockets-14.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (168 kB)\n",
1324
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m168.2/168.2 kB\u001b[0m \u001b[31m17.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
1325
+ "\u001b[?25hInstalling collected packages: pydub, websockets, uvicorn, tomlkit, semantic-version, ruff, python-multipart, markupsafe, ffmpy, aiofiles, starlette, safehttpx, gradio-client, fastapi, gradio\n",
1326
+ " Attempting uninstall: markupsafe\n",
1327
+ " Found existing installation: MarkupSafe 3.0.2\n",
1328
+ " Uninstalling MarkupSafe-3.0.2:\n",
1329
+ " Successfully uninstalled MarkupSafe-3.0.2\n",
1330
+ "Successfully installed aiofiles-23.2.1 fastapi-0.115.6 ffmpy-0.4.0 gradio-5.8.0 gradio-client-1.5.1 markupsafe-2.1.5 pydub-0.25.1 python-multipart-0.0.19 ruff-0.8.2 safehttpx-0.1.6 semantic-version-2.10.0 starlette-0.41.3 tomlkit-0.13.2 uvicorn-0.32.1 websockets-14.1\n"
1331
+ ]
1332
+ },
1333
+ {
1334
+ "output_type": "display_data",
1335
+ "data": {
1336
+ "application/vnd.colab-display-data+json": {
1337
+ "pip_warning": {
1338
+ "packages": [
1339
+ "markupsafe"
1340
+ ]
1341
+ },
1342
+ "id": "c5460d831b954259abaf1f971d89285d"
1343
+ }
1344
+ },
1345
+ "metadata": {}
1346
+ }
1347
+ ]
1348
+ }
1349
+ ]
1350
+ }