MounikaAithagoni commited on
Commit
87dbeaf
·
verified ·
1 Parent(s): a45d761

Upload 2 files

Browse files
Transformer- Part-1_ English-to-Arabic translation.docx ADDED
Binary file (149 kB). View file
 
Transformer_Part_2__English_to_Arabic_translation.ipynb ADDED
@@ -0,0 +1,1468 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "nbformat": 4,
3
+ "nbformat_minor": 0,
4
+ "metadata": {
5
+ "colab": {
6
+ "provenance": [],
7
+ "gpuType": "A100"
8
+ },
9
+ "kernelspec": {
10
+ "name": "python3",
11
+ "display_name": "Python 3"
12
+ },
13
+ "language_info": {
14
+ "name": "python"
15
+ },
16
+ "accelerator": "GPU",
17
+ "widgets": {
18
+ "application/vnd.jupyter.widget-state+json": {
19
+ "2bea128ff6a84dea98153aa2c398c845": {
20
+ "model_module": "@jupyter-widgets/controls",
21
+ "model_name": "HBoxModel",
22
+ "model_module_version": "1.5.0",
23
+ "state": {
24
+ "_dom_classes": [],
25
+ "_model_module": "@jupyter-widgets/controls",
26
+ "_model_module_version": "1.5.0",
27
+ "_model_name": "HBoxModel",
28
+ "_view_count": null,
29
+ "_view_module": "@jupyter-widgets/controls",
30
+ "_view_module_version": "1.5.0",
31
+ "_view_name": "HBoxView",
32
+ "box_style": "",
33
+ "children": [
34
+ "IPY_MODEL_9860c006ecf841e8ac018927761fadda",
35
+ "IPY_MODEL_06f1133db9334aa79e927e51220a1561",
36
+ "IPY_MODEL_f19b60de091948998a0d037896442ac5"
37
+ ],
38
+ "layout": "IPY_MODEL_e47071d647b04f4683527fc4c5acb592"
39
+ }
40
+ },
41
+ "9860c006ecf841e8ac018927761fadda": {
42
+ "model_module": "@jupyter-widgets/controls",
43
+ "model_name": "HTMLModel",
44
+ "model_module_version": "1.5.0",
45
+ "state": {
46
+ "_dom_classes": [],
47
+ "_model_module": "@jupyter-widgets/controls",
48
+ "_model_module_version": "1.5.0",
49
+ "_model_name": "HTMLModel",
50
+ "_view_count": null,
51
+ "_view_module": "@jupyter-widgets/controls",
52
+ "_view_module_version": "1.5.0",
53
+ "_view_name": "HTMLView",
54
+ "description": "",
55
+ "description_tooltip": null,
56
+ "layout": "IPY_MODEL_78630ac454e54e91b3b46184da36f29b",
57
+ "placeholder": "​",
58
+ "style": "IPY_MODEL_23c4bde6b4cd4b918d8ddc0a504d263d",
59
+ "value": "model.safetensors: 100%"
60
+ }
61
+ },
62
+ "06f1133db9334aa79e927e51220a1561": {
63
+ "model_module": "@jupyter-widgets/controls",
64
+ "model_name": "FloatProgressModel",
65
+ "model_module_version": "1.5.0",
66
+ "state": {
67
+ "_dom_classes": [],
68
+ "_model_module": "@jupyter-widgets/controls",
69
+ "_model_module_version": "1.5.0",
70
+ "_model_name": "FloatProgressModel",
71
+ "_view_count": null,
72
+ "_view_module": "@jupyter-widgets/controls",
73
+ "_view_module_version": "1.5.0",
74
+ "_view_name": "ProgressView",
75
+ "bar_style": "success",
76
+ "description": "",
77
+ "description_tooltip": null,
78
+ "layout": "IPY_MODEL_821c81b459e04c92a3609ae2c017e1a5",
79
+ "max": 548105171,
80
+ "min": 0,
81
+ "orientation": "horizontal",
82
+ "style": "IPY_MODEL_63aed25f754742948b88a967a1b413d7",
83
+ "value": 548105171
84
+ }
85
+ },
86
+ "f19b60de091948998a0d037896442ac5": {
87
+ "model_module": "@jupyter-widgets/controls",
88
+ "model_name": "HTMLModel",
89
+ "model_module_version": "1.5.0",
90
+ "state": {
91
+ "_dom_classes": [],
92
+ "_model_module": "@jupyter-widgets/controls",
93
+ "_model_module_version": "1.5.0",
94
+ "_model_name": "HTMLModel",
95
+ "_view_count": null,
96
+ "_view_module": "@jupyter-widgets/controls",
97
+ "_view_module_version": "1.5.0",
98
+ "_view_name": "HTMLView",
99
+ "description": "",
100
+ "description_tooltip": null,
101
+ "layout": "IPY_MODEL_d44d9eaebd4449c39d0a85f8886e3110",
102
+ "placeholder": "​",
103
+ "style": "IPY_MODEL_7e4fb57794f9468a9de7a1733a21c0b9",
104
+ "value": " 548M/548M [00:03<00:00, 205MB/s]"
105
+ }
106
+ },
107
+ "e47071d647b04f4683527fc4c5acb592": {
108
+ "model_module": "@jupyter-widgets/base",
109
+ "model_name": "LayoutModel",
110
+ "model_module_version": "1.2.0",
111
+ "state": {
112
+ "_model_module": "@jupyter-widgets/base",
113
+ "_model_module_version": "1.2.0",
114
+ "_model_name": "LayoutModel",
115
+ "_view_count": null,
116
+ "_view_module": "@jupyter-widgets/base",
117
+ "_view_module_version": "1.2.0",
118
+ "_view_name": "LayoutView",
119
+ "align_content": null,
120
+ "align_items": null,
121
+ "align_self": null,
122
+ "border": null,
123
+ "bottom": null,
124
+ "display": null,
125
+ "flex": null,
126
+ "flex_flow": null,
127
+ "grid_area": null,
128
+ "grid_auto_columns": null,
129
+ "grid_auto_flow": null,
130
+ "grid_auto_rows": null,
131
+ "grid_column": null,
132
+ "grid_gap": null,
133
+ "grid_row": null,
134
+ "grid_template_areas": null,
135
+ "grid_template_columns": null,
136
+ "grid_template_rows": null,
137
+ "height": null,
138
+ "justify_content": null,
139
+ "justify_items": null,
140
+ "left": null,
141
+ "margin": null,
142
+ "max_height": null,
143
+ "max_width": null,
144
+ "min_height": null,
145
+ "min_width": null,
146
+ "object_fit": null,
147
+ "object_position": null,
148
+ "order": null,
149
+ "overflow": null,
150
+ "overflow_x": null,
151
+ "overflow_y": null,
152
+ "padding": null,
153
+ "right": null,
154
+ "top": null,
155
+ "visibility": null,
156
+ "width": null
157
+ }
158
+ },
159
+ "78630ac454e54e91b3b46184da36f29b": {
160
+ "model_module": "@jupyter-widgets/base",
161
+ "model_name": "LayoutModel",
162
+ "model_module_version": "1.2.0",
163
+ "state": {
164
+ "_model_module": "@jupyter-widgets/base",
165
+ "_model_module_version": "1.2.0",
166
+ "_model_name": "LayoutModel",
167
+ "_view_count": null,
168
+ "_view_module": "@jupyter-widgets/base",
169
+ "_view_module_version": "1.2.0",
170
+ "_view_name": "LayoutView",
171
+ "align_content": null,
172
+ "align_items": null,
173
+ "align_self": null,
174
+ "border": null,
175
+ "bottom": null,
176
+ "display": null,
177
+ "flex": null,
178
+ "flex_flow": null,
179
+ "grid_area": null,
180
+ "grid_auto_columns": null,
181
+ "grid_auto_flow": null,
182
+ "grid_auto_rows": null,
183
+ "grid_column": null,
184
+ "grid_gap": null,
185
+ "grid_row": null,
186
+ "grid_template_areas": null,
187
+ "grid_template_columns": null,
188
+ "grid_template_rows": null,
189
+ "height": null,
190
+ "justify_content": null,
191
+ "justify_items": null,
192
+ "left": null,
193
+ "margin": null,
194
+ "max_height": null,
195
+ "max_width": null,
196
+ "min_height": null,
197
+ "min_width": null,
198
+ "object_fit": null,
199
+ "object_position": null,
200
+ "order": null,
201
+ "overflow": null,
202
+ "overflow_x": null,
203
+ "overflow_y": null,
204
+ "padding": null,
205
+ "right": null,
206
+ "top": null,
207
+ "visibility": null,
208
+ "width": null
209
+ }
210
+ },
211
+ "23c4bde6b4cd4b918d8ddc0a504d263d": {
212
+ "model_module": "@jupyter-widgets/controls",
213
+ "model_name": "DescriptionStyleModel",
214
+ "model_module_version": "1.5.0",
215
+ "state": {
216
+ "_model_module": "@jupyter-widgets/controls",
217
+ "_model_module_version": "1.5.0",
218
+ "_model_name": "DescriptionStyleModel",
219
+ "_view_count": null,
220
+ "_view_module": "@jupyter-widgets/base",
221
+ "_view_module_version": "1.2.0",
222
+ "_view_name": "StyleView",
223
+ "description_width": ""
224
+ }
225
+ },
226
+ "821c81b459e04c92a3609ae2c017e1a5": {
227
+ "model_module": "@jupyter-widgets/base",
228
+ "model_name": "LayoutModel",
229
+ "model_module_version": "1.2.0",
230
+ "state": {
231
+ "_model_module": "@jupyter-widgets/base",
232
+ "_model_module_version": "1.2.0",
233
+ "_model_name": "LayoutModel",
234
+ "_view_count": null,
235
+ "_view_module": "@jupyter-widgets/base",
236
+ "_view_module_version": "1.2.0",
237
+ "_view_name": "LayoutView",
238
+ "align_content": null,
239
+ "align_items": null,
240
+ "align_self": null,
241
+ "border": null,
242
+ "bottom": null,
243
+ "display": null,
244
+ "flex": null,
245
+ "flex_flow": null,
246
+ "grid_area": null,
247
+ "grid_auto_columns": null,
248
+ "grid_auto_flow": null,
249
+ "grid_auto_rows": null,
250
+ "grid_column": null,
251
+ "grid_gap": null,
252
+ "grid_row": null,
253
+ "grid_template_areas": null,
254
+ "grid_template_columns": null,
255
+ "grid_template_rows": null,
256
+ "height": null,
257
+ "justify_content": null,
258
+ "justify_items": null,
259
+ "left": null,
260
+ "margin": null,
261
+ "max_height": null,
262
+ "max_width": null,
263
+ "min_height": null,
264
+ "min_width": null,
265
+ "object_fit": null,
266
+ "object_position": null,
267
+ "order": null,
268
+ "overflow": null,
269
+ "overflow_x": null,
270
+ "overflow_y": null,
271
+ "padding": null,
272
+ "right": null,
273
+ "top": null,
274
+ "visibility": null,
275
+ "width": null
276
+ }
277
+ },
278
+ "63aed25f754742948b88a967a1b413d7": {
279
+ "model_module": "@jupyter-widgets/controls",
280
+ "model_name": "ProgressStyleModel",
281
+ "model_module_version": "1.5.0",
282
+ "state": {
283
+ "_model_module": "@jupyter-widgets/controls",
284
+ "_model_module_version": "1.5.0",
285
+ "_model_name": "ProgressStyleModel",
286
+ "_view_count": null,
287
+ "_view_module": "@jupyter-widgets/base",
288
+ "_view_module_version": "1.2.0",
289
+ "_view_name": "StyleView",
290
+ "bar_color": null,
291
+ "description_width": ""
292
+ }
293
+ },
294
+ "d44d9eaebd4449c39d0a85f8886e3110": {
295
+ "model_module": "@jupyter-widgets/base",
296
+ "model_name": "LayoutModel",
297
+ "model_module_version": "1.2.0",
298
+ "state": {
299
+ "_model_module": "@jupyter-widgets/base",
300
+ "_model_module_version": "1.2.0",
301
+ "_model_name": "LayoutModel",
302
+ "_view_count": null,
303
+ "_view_module": "@jupyter-widgets/base",
304
+ "_view_module_version": "1.2.0",
305
+ "_view_name": "LayoutView",
306
+ "align_content": null,
307
+ "align_items": null,
308
+ "align_self": null,
309
+ "border": null,
310
+ "bottom": null,
311
+ "display": null,
312
+ "flex": null,
313
+ "flex_flow": null,
314
+ "grid_area": null,
315
+ "grid_auto_columns": null,
316
+ "grid_auto_flow": null,
317
+ "grid_auto_rows": null,
318
+ "grid_column": null,
319
+ "grid_gap": null,
320
+ "grid_row": null,
321
+ "grid_template_areas": null,
322
+ "grid_template_columns": null,
323
+ "grid_template_rows": null,
324
+ "height": null,
325
+ "justify_content": null,
326
+ "justify_items": null,
327
+ "left": null,
328
+ "margin": null,
329
+ "max_height": null,
330
+ "max_width": null,
331
+ "min_height": null,
332
+ "min_width": null,
333
+ "object_fit": null,
334
+ "object_position": null,
335
+ "order": null,
336
+ "overflow": null,
337
+ "overflow_x": null,
338
+ "overflow_y": null,
339
+ "padding": null,
340
+ "right": null,
341
+ "top": null,
342
+ "visibility": null,
343
+ "width": null
344
+ }
345
+ },
346
+ "7e4fb57794f9468a9de7a1733a21c0b9": {
347
+ "model_module": "@jupyter-widgets/controls",
348
+ "model_name": "DescriptionStyleModel",
349
+ "model_module_version": "1.5.0",
350
+ "state": {
351
+ "_model_module": "@jupyter-widgets/controls",
352
+ "_model_module_version": "1.5.0",
353
+ "_model_name": "DescriptionStyleModel",
354
+ "_view_count": null,
355
+ "_view_module": "@jupyter-widgets/base",
356
+ "_view_module_version": "1.2.0",
357
+ "_view_name": "StyleView",
358
+ "description_width": ""
359
+ }
360
+ },
361
+ "5a3cde9d5eda48d1bacedb3da7a8c36e": {
362
+ "model_module": "@jupyter-widgets/controls",
363
+ "model_name": "HBoxModel",
364
+ "model_module_version": "1.5.0",
365
+ "state": {
366
+ "_dom_classes": [],
367
+ "_model_module": "@jupyter-widgets/controls",
368
+ "_model_module_version": "1.5.0",
369
+ "_model_name": "HBoxModel",
370
+ "_view_count": null,
371
+ "_view_module": "@jupyter-widgets/controls",
372
+ "_view_module_version": "1.5.0",
373
+ "_view_name": "HBoxView",
374
+ "box_style": "",
375
+ "children": [
376
+ "IPY_MODEL_59dfbf89a4a44856853bbdd20e87c1b6",
377
+ "IPY_MODEL_35a848f438c44b3aa35bc3ec3720fb6d",
378
+ "IPY_MODEL_50e96a2a7dac40719b2bfac5aebcd802"
379
+ ],
380
+ "layout": "IPY_MODEL_c23fb59092fd4b019a41e6ba82827880"
381
+ }
382
+ },
383
+ "59dfbf89a4a44856853bbdd20e87c1b6": {
384
+ "model_module": "@jupyter-widgets/controls",
385
+ "model_name": "HTMLModel",
386
+ "model_module_version": "1.5.0",
387
+ "state": {
388
+ "_dom_classes": [],
389
+ "_model_module": "@jupyter-widgets/controls",
390
+ "_model_module_version": "1.5.0",
391
+ "_model_name": "HTMLModel",
392
+ "_view_count": null,
393
+ "_view_module": "@jupyter-widgets/controls",
394
+ "_view_module_version": "1.5.0",
395
+ "_view_name": "HTMLView",
396
+ "description": "",
397
+ "description_tooltip": null,
398
+ "layout": "IPY_MODEL_afbba89a0bfc4a1994e5f5b009f1ff9c",
399
+ "placeholder": "​",
400
+ "style": "IPY_MODEL_d1562f9faedd457c9b12c8fe88215a0b",
401
+ "value": "generation_config.json: 100%"
402
+ }
403
+ },
404
+ "35a848f438c44b3aa35bc3ec3720fb6d": {
405
+ "model_module": "@jupyter-widgets/controls",
406
+ "model_name": "FloatProgressModel",
407
+ "model_module_version": "1.5.0",
408
+ "state": {
409
+ "_dom_classes": [],
410
+ "_model_module": "@jupyter-widgets/controls",
411
+ "_model_module_version": "1.5.0",
412
+ "_model_name": "FloatProgressModel",
413
+ "_view_count": null,
414
+ "_view_module": "@jupyter-widgets/controls",
415
+ "_view_module_version": "1.5.0",
416
+ "_view_name": "ProgressView",
417
+ "bar_style": "success",
418
+ "description": "",
419
+ "description_tooltip": null,
420
+ "layout": "IPY_MODEL_b92e1b10eba743dea4c248022193de45",
421
+ "max": 124,
422
+ "min": 0,
423
+ "orientation": "horizontal",
424
+ "style": "IPY_MODEL_e5703982607e4ccaa875bf168d29b567",
425
+ "value": 124
426
+ }
427
+ },
428
+ "50e96a2a7dac40719b2bfac5aebcd802": {
429
+ "model_module": "@jupyter-widgets/controls",
430
+ "model_name": "HTMLModel",
431
+ "model_module_version": "1.5.0",
432
+ "state": {
433
+ "_dom_classes": [],
434
+ "_model_module": "@jupyter-widgets/controls",
435
+ "_model_module_version": "1.5.0",
436
+ "_model_name": "HTMLModel",
437
+ "_view_count": null,
438
+ "_view_module": "@jupyter-widgets/controls",
439
+ "_view_module_version": "1.5.0",
440
+ "_view_name": "HTMLView",
441
+ "description": "",
442
+ "description_tooltip": null,
443
+ "layout": "IPY_MODEL_c42451c719a24442aa0eea5d985f5f21",
444
+ "placeholder": "​",
445
+ "style": "IPY_MODEL_e01cdc7235bf42ef84a900b1c5bf206e",
446
+ "value": " 124/124 [00:00<00:00, 8.93kB/s]"
447
+ }
448
+ },
449
+ "c23fb59092fd4b019a41e6ba82827880": {
450
+ "model_module": "@jupyter-widgets/base",
451
+ "model_name": "LayoutModel",
452
+ "model_module_version": "1.2.0",
453
+ "state": {
454
+ "_model_module": "@jupyter-widgets/base",
455
+ "_model_module_version": "1.2.0",
456
+ "_model_name": "LayoutModel",
457
+ "_view_count": null,
458
+ "_view_module": "@jupyter-widgets/base",
459
+ "_view_module_version": "1.2.0",
460
+ "_view_name": "LayoutView",
461
+ "align_content": null,
462
+ "align_items": null,
463
+ "align_self": null,
464
+ "border": null,
465
+ "bottom": null,
466
+ "display": null,
467
+ "flex": null,
468
+ "flex_flow": null,
469
+ "grid_area": null,
470
+ "grid_auto_columns": null,
471
+ "grid_auto_flow": null,
472
+ "grid_auto_rows": null,
473
+ "grid_column": null,
474
+ "grid_gap": null,
475
+ "grid_row": null,
476
+ "grid_template_areas": null,
477
+ "grid_template_columns": null,
478
+ "grid_template_rows": null,
479
+ "height": null,
480
+ "justify_content": null,
481
+ "justify_items": null,
482
+ "left": null,
483
+ "margin": null,
484
+ "max_height": null,
485
+ "max_width": null,
486
+ "min_height": null,
487
+ "min_width": null,
488
+ "object_fit": null,
489
+ "object_position": null,
490
+ "order": null,
491
+ "overflow": null,
492
+ "overflow_x": null,
493
+ "overflow_y": null,
494
+ "padding": null,
495
+ "right": null,
496
+ "top": null,
497
+ "visibility": null,
498
+ "width": null
499
+ }
500
+ },
501
+ "afbba89a0bfc4a1994e5f5b009f1ff9c": {
502
+ "model_module": "@jupyter-widgets/base",
503
+ "model_name": "LayoutModel",
504
+ "model_module_version": "1.2.0",
505
+ "state": {
506
+ "_model_module": "@jupyter-widgets/base",
507
+ "_model_module_version": "1.2.0",
508
+ "_model_name": "LayoutModel",
509
+ "_view_count": null,
510
+ "_view_module": "@jupyter-widgets/base",
511
+ "_view_module_version": "1.2.0",
512
+ "_view_name": "LayoutView",
513
+ "align_content": null,
514
+ "align_items": null,
515
+ "align_self": null,
516
+ "border": null,
517
+ "bottom": null,
518
+ "display": null,
519
+ "flex": null,
520
+ "flex_flow": null,
521
+ "grid_area": null,
522
+ "grid_auto_columns": null,
523
+ "grid_auto_flow": null,
524
+ "grid_auto_rows": null,
525
+ "grid_column": null,
526
+ "grid_gap": null,
527
+ "grid_row": null,
528
+ "grid_template_areas": null,
529
+ "grid_template_columns": null,
530
+ "grid_template_rows": null,
531
+ "height": null,
532
+ "justify_content": null,
533
+ "justify_items": null,
534
+ "left": null,
535
+ "margin": null,
536
+ "max_height": null,
537
+ "max_width": null,
538
+ "min_height": null,
539
+ "min_width": null,
540
+ "object_fit": null,
541
+ "object_position": null,
542
+ "order": null,
543
+ "overflow": null,
544
+ "overflow_x": null,
545
+ "overflow_y": null,
546
+ "padding": null,
547
+ "right": null,
548
+ "top": null,
549
+ "visibility": null,
550
+ "width": null
551
+ }
552
+ },
553
+ "d1562f9faedd457c9b12c8fe88215a0b": {
554
+ "model_module": "@jupyter-widgets/controls",
555
+ "model_name": "DescriptionStyleModel",
556
+ "model_module_version": "1.5.0",
557
+ "state": {
558
+ "_model_module": "@jupyter-widgets/controls",
559
+ "_model_module_version": "1.5.0",
560
+ "_model_name": "DescriptionStyleModel",
561
+ "_view_count": null,
562
+ "_view_module": "@jupyter-widgets/base",
563
+ "_view_module_version": "1.2.0",
564
+ "_view_name": "StyleView",
565
+ "description_width": ""
566
+ }
567
+ },
568
+ "b92e1b10eba743dea4c248022193de45": {
569
+ "model_module": "@jupyter-widgets/base",
570
+ "model_name": "LayoutModel",
571
+ "model_module_version": "1.2.0",
572
+ "state": {
573
+ "_model_module": "@jupyter-widgets/base",
574
+ "_model_module_version": "1.2.0",
575
+ "_model_name": "LayoutModel",
576
+ "_view_count": null,
577
+ "_view_module": "@jupyter-widgets/base",
578
+ "_view_module_version": "1.2.0",
579
+ "_view_name": "LayoutView",
580
+ "align_content": null,
581
+ "align_items": null,
582
+ "align_self": null,
583
+ "border": null,
584
+ "bottom": null,
585
+ "display": null,
586
+ "flex": null,
587
+ "flex_flow": null,
588
+ "grid_area": null,
589
+ "grid_auto_columns": null,
590
+ "grid_auto_flow": null,
591
+ "grid_auto_rows": null,
592
+ "grid_column": null,
593
+ "grid_gap": null,
594
+ "grid_row": null,
595
+ "grid_template_areas": null,
596
+ "grid_template_columns": null,
597
+ "grid_template_rows": null,
598
+ "height": null,
599
+ "justify_content": null,
600
+ "justify_items": null,
601
+ "left": null,
602
+ "margin": null,
603
+ "max_height": null,
604
+ "max_width": null,
605
+ "min_height": null,
606
+ "min_width": null,
607
+ "object_fit": null,
608
+ "object_position": null,
609
+ "order": null,
610
+ "overflow": null,
611
+ "overflow_x": null,
612
+ "overflow_y": null,
613
+ "padding": null,
614
+ "right": null,
615
+ "top": null,
616
+ "visibility": null,
617
+ "width": null
618
+ }
619
+ },
620
+ "e5703982607e4ccaa875bf168d29b567": {
621
+ "model_module": "@jupyter-widgets/controls",
622
+ "model_name": "ProgressStyleModel",
623
+ "model_module_version": "1.5.0",
624
+ "state": {
625
+ "_model_module": "@jupyter-widgets/controls",
626
+ "_model_module_version": "1.5.0",
627
+ "_model_name": "ProgressStyleModel",
628
+ "_view_count": null,
629
+ "_view_module": "@jupyter-widgets/base",
630
+ "_view_module_version": "1.2.0",
631
+ "_view_name": "StyleView",
632
+ "bar_color": null,
633
+ "description_width": ""
634
+ }
635
+ },
636
+ "c42451c719a24442aa0eea5d985f5f21": {
637
+ "model_module": "@jupyter-widgets/base",
638
+ "model_name": "LayoutModel",
639
+ "model_module_version": "1.2.0",
640
+ "state": {
641
+ "_model_module": "@jupyter-widgets/base",
642
+ "_model_module_version": "1.2.0",
643
+ "_model_name": "LayoutModel",
644
+ "_view_count": null,
645
+ "_view_module": "@jupyter-widgets/base",
646
+ "_view_module_version": "1.2.0",
647
+ "_view_name": "LayoutView",
648
+ "align_content": null,
649
+ "align_items": null,
650
+ "align_self": null,
651
+ "border": null,
652
+ "bottom": null,
653
+ "display": null,
654
+ "flex": null,
655
+ "flex_flow": null,
656
+ "grid_area": null,
657
+ "grid_auto_columns": null,
658
+ "grid_auto_flow": null,
659
+ "grid_auto_rows": null,
660
+ "grid_column": null,
661
+ "grid_gap": null,
662
+ "grid_row": null,
663
+ "grid_template_areas": null,
664
+ "grid_template_columns": null,
665
+ "grid_template_rows": null,
666
+ "height": null,
667
+ "justify_content": null,
668
+ "justify_items": null,
669
+ "left": null,
670
+ "margin": null,
671
+ "max_height": null,
672
+ "max_width": null,
673
+ "min_height": null,
674
+ "min_width": null,
675
+ "object_fit": null,
676
+ "object_position": null,
677
+ "order": null,
678
+ "overflow": null,
679
+ "overflow_x": null,
680
+ "overflow_y": null,
681
+ "padding": null,
682
+ "right": null,
683
+ "top": null,
684
+ "visibility": null,
685
+ "width": null
686
+ }
687
+ },
688
+ "e01cdc7235bf42ef84a900b1c5bf206e": {
689
+ "model_module": "@jupyter-widgets/controls",
690
+ "model_name": "DescriptionStyleModel",
691
+ "model_module_version": "1.5.0",
692
+ "state": {
693
+ "_model_module": "@jupyter-widgets/controls",
694
+ "_model_module_version": "1.5.0",
695
+ "_model_name": "DescriptionStyleModel",
696
+ "_view_count": null,
697
+ "_view_module": "@jupyter-widgets/base",
698
+ "_view_module_version": "1.2.0",
699
+ "_view_name": "StyleView",
700
+ "description_width": ""
701
+ }
702
+ }
703
+ }
704
+ }
705
+ },
706
+ "cells": [
707
+ {
708
+ "cell_type": "markdown",
709
+ "source": [
710
+ "**Arabic text to English GPT2 Translator**"
711
+ ],
712
+ "metadata": {
713
+ "id": "OJm9SnOwmMuN"
714
+ }
715
+ },
716
+ {
717
+ "cell_type": "code",
718
+ "source": [
719
+ "!pip install datasets transformers sacrebleu\n",
720
+ "!pip install sentencepiece\n",
721
+ "!pip install sacrebleu"
722
+ ],
723
+ "metadata": {
724
+ "colab": {
725
+ "base_uri": "https://localhost:8080/"
726
+ },
727
+ "id": "GsW0HEokd85l",
728
+ "outputId": "38867062-0272-4a26-bff4-fff72abc74ef"
729
+ },
730
+ "execution_count": 1,
731
+ "outputs": [
732
+ {
733
+ "output_type": "stream",
734
+ "name": "stdout",
735
+ "text": [
736
+ "Collecting datasets\n",
737
+ " Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)\n",
738
+ "Requirement already satisfied: transformers in /usr/local/lib/python3.10/dist-packages (4.46.3)\n",
739
+ "Collecting sacrebleu\n",
740
+ " Downloading sacrebleu-2.4.3-py3-none-any.whl.metadata (51 kB)\n",
741
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m51.8/51.8 kB\u001b[0m \u001b[31m2.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
742
+ "\u001b[?25hRequirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from datasets) (3.16.1)\n",
743
+ "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from datasets) (1.26.4)\n",
744
+ "Requirement already satisfied: pyarrow>=15.0.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (17.0.0)\n",
745
+ "Collecting dill<0.3.9,>=0.3.0 (from datasets)\n",
746
+ " Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)\n",
747
+ "Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from datasets) (2.2.2)\n",
748
+ "Requirement already satisfied: requests>=2.32.2 in /usr/local/lib/python3.10/dist-packages (from datasets) (2.32.3)\n",
749
+ "Requirement already satisfied: tqdm>=4.66.3 in /usr/local/lib/python3.10/dist-packages (from datasets) (4.66.6)\n",
750
+ "Collecting xxhash (from datasets)\n",
751
+ " Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)\n",
752
+ "Collecting multiprocess<0.70.17 (from datasets)\n",
753
+ " Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)\n",
754
+ "Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)\n",
755
+ " Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)\n",
756
+ "Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets) (3.11.9)\n",
757
+ "Requirement already satisfied: huggingface-hub>=0.23.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (0.26.3)\n",
758
+ "Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from datasets) (24.2)\n",
759
+ "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from datasets) (6.0.2)\n",
760
+ "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (2024.9.11)\n",
761
+ "Requirement already satisfied: tokenizers<0.21,>=0.20 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.20.3)\n",
762
+ "Requirement already satisfied: safetensors>=0.4.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.4.5)\n",
763
+ "Collecting portalocker (from sacrebleu)\n",
764
+ " Downloading portalocker-3.0.0-py3-none-any.whl.metadata (8.5 kB)\n",
765
+ "Requirement already satisfied: tabulate>=0.8.9 in /usr/local/lib/python3.10/dist-packages (from sacrebleu) (0.9.0)\n",
766
+ "Collecting colorama (from sacrebleu)\n",
767
+ " Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)\n",
768
+ "Requirement already satisfied: lxml in /usr/local/lib/python3.10/dist-packages (from sacrebleu) (5.3.0)\n",
769
+ "Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (2.4.4)\n",
770
+ "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.3.1)\n",
771
+ "Requirement already satisfied: async-timeout<6.0,>=4.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (4.0.3)\n",
772
+ "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (24.2.0)\n",
773
+ "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.5.0)\n",
774
+ "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (6.1.0)\n",
775
+ "Requirement already satisfied: propcache>=0.2.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (0.2.1)\n",
776
+ "Requirement already satisfied: yarl<2.0,>=1.17.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.18.3)\n",
777
+ "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.23.0->datasets) (4.12.2)\n",
778
+ "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests>=2.32.2->datasets) (3.4.0)\n",
779
+ "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests>=2.32.2->datasets) (3.10)\n",
780
+ "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests>=2.32.2->datasets) (2.2.3)\n",
781
+ "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests>=2.32.2->datasets) (2024.8.30)\n",
782
+ "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2.8.2)\n",
783
+ "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2024.2)\n",
784
+ "Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2024.2)\n",
785
+ "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.2->pandas->datasets) (1.16.0)\n",
786
+ "Downloading datasets-3.2.0-py3-none-any.whl (480 kB)\n",
787
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m480.6/480.6 kB\u001b[0m \u001b[31m13.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
788
+ "\u001b[?25hDownloading sacrebleu-2.4.3-py3-none-any.whl (103 kB)\n",
789
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m104.0/104.0 kB\u001b[0m \u001b[31m12.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
790
+ "\u001b[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)\n",
791
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m116.3/116.3 kB\u001b[0m \u001b[31m12.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
792
+ "\u001b[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (179 kB)\n",
793
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m179.3/179.3 kB\u001b[0m \u001b[31m19.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
794
+ "\u001b[?25hDownloading multiprocess-0.70.16-py310-none-any.whl (134 kB)\n",
795
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m14.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
796
+ "\u001b[?25hDownloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)\n",
797
+ "Downloading portalocker-3.0.0-py3-none-any.whl (19 kB)\n",
798
+ "Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)\n",
799
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m194.1/194.1 kB\u001b[0m \u001b[31m19.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
800
+ "\u001b[?25hInstalling collected packages: xxhash, portalocker, fsspec, dill, colorama, sacrebleu, multiprocess, datasets\n",
801
+ " Attempting uninstall: fsspec\n",
802
+ " Found existing installation: fsspec 2024.10.0\n",
803
+ " Uninstalling fsspec-2024.10.0:\n",
804
+ " Successfully uninstalled fsspec-2024.10.0\n",
805
+ "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
806
+ "gcsfs 2024.10.0 requires fsspec==2024.10.0, but you have fsspec 2024.9.0 which is incompatible.\u001b[0m\u001b[31m\n",
807
+ "\u001b[0mSuccessfully installed colorama-0.4.6 datasets-3.2.0 dill-0.3.8 fsspec-2024.9.0 multiprocess-0.70.16 portalocker-3.0.0 sacrebleu-2.4.3 xxhash-3.5.0\n",
808
+ "Requirement already satisfied: sentencepiece in /usr/local/lib/python3.10/dist-packages (0.2.0)\n",
809
+ "Requirement already satisfied: sacrebleu in /usr/local/lib/python3.10/dist-packages (2.4.3)\n",
810
+ "Requirement already satisfied: portalocker in /usr/local/lib/python3.10/dist-packages (from sacrebleu) (3.0.0)\n",
811
+ "Requirement already satisfied: regex in /usr/local/lib/python3.10/dist-packages (from sacrebleu) (2024.9.11)\n",
812
+ "Requirement already satisfied: tabulate>=0.8.9 in /usr/local/lib/python3.10/dist-packages (from sacrebleu) (0.9.0)\n",
813
+ "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from sacrebleu) (1.26.4)\n",
814
+ "Requirement already satisfied: colorama in /usr/local/lib/python3.10/dist-packages (from sacrebleu) (0.4.6)\n",
815
+ "Requirement already satisfied: lxml in /usr/local/lib/python3.10/dist-packages (from sacrebleu) (5.3.0)\n"
816
+ ]
817
+ }
818
+ ]
819
+ },
820
+ {
821
+ "cell_type": "code",
822
+ "source": [
823
+ "import re\n",
824
+ "import json\n",
825
+ "import torch\n",
826
+ "import matplotlib.pyplot as plt\n",
827
+ "from datasets import Dataset\n",
828
+ "from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling\n",
829
+ "from nltk.translate.bleu_score import corpus_bleu\n",
830
+ "from collections import Counter\n",
831
+ "import sacrebleu"
832
+ ],
833
+ "metadata": {
834
+ "id": "UhaWaa9Jf1LD"
835
+ },
836
+ "execution_count": 2,
837
+ "outputs": []
838
+ },
839
+ {
840
+ "cell_type": "code",
841
+ "source": [
842
+ "# Load and preprocess dataset\n",
843
+ "with open('/content/Arabic.json', 'r', encoding='utf-8') as f:\n",
844
+ " data = json.load(f)\n",
845
+ "\n",
846
+ "# Take the English ('input') and Arabic ('output') text of the first 1000 entries\n",
847
+ "en_sentences = [entry['input'] for entry in data[:1000]]\n",
848
+ "ar_sentences = [entry['output'] for entry in data[:1000]]\n",
849
+ "\n",
850
+ "# Print the first 5 English and Arabic sentences\n",
851
+ "print(\"First 5 English sentences:\")\n",
852
+ "for sentence in en_sentences[:5]:\n",
853
+ " print(sentence)\n",
854
+ "\n",
855
+ "print(\"\\nFirst 5 Arabic sentences:\")\n",
856
+ "for sentence in ar_sentences[:5]:\n",
857
+ " print(sentence)\n",
858
+ "\n",
859
+ "# Initialize tokenizer with custom separator\n",
860
+ "tokenizer = GPT2Tokenizer.from_pretrained(\"gpt2\")\n",
861
+ "tokenizer.add_special_tokens({'additional_special_tokens': [\"<|sep|>\"]})\n",
862
+ "tokenizer.pad_token = tokenizer.eos_token\n",
863
+ "\n",
864
+ "# Tokenizer helper for vocabulary creation\n",
865
+ "tokenizer_func = lambda x: x.split()\n",
866
+ "\n",
867
+ "def build_vocab(sentences):\n",
868
+ " counter = Counter()\n",
869
+ " for sentence in sentences:\n",
870
+ " counter.update(tokenizer_func(sentence))\n",
871
+ " return counter\n",
872
+ "\n",
873
+ "en_vocab = build_vocab(en_sentences)\n",
874
+ "ar_vocab = build_vocab(ar_sentences)\n",
875
+ "\n",
876
+ "# Add special tokens to the vocabulary. NOTE(review): the merged Counter entries map words to counts, not indices.\n",
877
+ "en_vocab = {'<s>': 1, '</s>': 2, '<pad>': 0, '<unk>': 3, **en_vocab}\n",
878
+ "ar_vocab = {'<s>': 1, '</s>': 2, '<pad>': 0, '<unk>': 3, **ar_vocab}\n",
879
+ "\n",
880
+ "def sentence_to_tensor(sentence, vocab, max_len=128):\n",
881
+ " tokens = tokenizer_func(sentence)\n",
882
+ " indices = [vocab.get(token, vocab['<unk>']) for token in tokens]\n",
883
+ " indices = [vocab['<s>']] + indices + [vocab['</s>']]\n",
884
+ " if len(indices) < max_len:\n",
885
+ " indices += [vocab['<pad>']] * (max_len - len(indices))\n",
886
+ " else:\n",
887
+ " indices = indices[:max_len]\n",
888
+ " return torch.tensor(indices)"
889
+ ],
890
+ "metadata": {
891
+ "colab": {
892
+ "base_uri": "https://localhost:8080/"
893
+ },
894
+ "id": "Pba9TQp5f8VY",
895
+ "outputId": "ab643aa5-e690-4f1f-da86-8a5b233af53e"
896
+ },
897
+ "execution_count": 18,
898
+ "outputs": [
899
+ {
900
+ "output_type": "stream",
901
+ "name": "stdout",
902
+ "text": [
903
+ "First 5 English sentences:\n",
904
+ "Give three tips for staying healthy.\n",
905
+ "1. Eat a balanced and nutritious diet: Make sure your meals are inclusive of a variety of fruits and vegetables, lean protein, whole grains, and healthy fats. This helps to provide your body with the essential nutrients to function at its best and can help prevent chronic diseases.\n",
906
+ "\n",
907
+ "2. Engage in regular physical activity: Exercise is crucial for maintaining strong bones, muscles, and cardiovascular health. Aim for at least 150 minutes of moderate aerobic exercise or 75 minutes of vigorous exercise each week.\n",
908
+ "\n",
909
+ "3. Get enough sleep: Getting enough quality sleep is crucial for physical and mental well-being. It helps to regulate mood, improve cognitive function, and supports healthy growth and immune function. Aim for 7-9 hours of sleep each night.\n",
910
+ "What are the three primary colors?\n",
911
+ "The three primary colors are red, blue, and yellow. These colors are called primary because they cannot be created by mixing other colors and all other colors can be made by combining them in various proportions. In the additive color system, used for light, the primary colors are red, green, and blue (RGB).\n",
912
+ "Describe the structure of an atom.\n",
913
+ "\n",
914
+ "First 5 Arabic sentences:\n",
915
+ " أعط ثلاث نصائح للبقاء بصحة جيدة.\n",
916
+ " 1. تناول نظامًا غذائيًا متوازنًا ومغذيًا: تأكد من أن وجباتك تشمل مجموعة متنوعة من الفواكه والخضروات ، والبروتينات الخالية من الدهون ، والحبوب الكاملة ، والدهون الصحية. يساعد ذلك على تزويد جسمك بالعناصر الغذائية الأساسية ليعمل في أفضل حالاته ويمكن أن يساعد في الوقاية من الأمراض المزمنة. 2. انخرط في نشاط بدني منتظم: التمرين ضروري للحفاظ على قوة العظام والعضلات وصحة القلب والأوعية الدموية. استهدف ما لا يقل عن 150 دقيقة من التمارين الهوائية المعتدلة أو 75 دقيقة من التمارين القوية كل أسبوع. 3. الحصول على قسط كافٍ من النوم: الحصول على قسط كافٍ من النوم الجيد أمر بالغ الأهمية للصحة الجسدية والعقلية. يساعد على تنظيم الحالة المزاجية ، وتحسين الوظيفة الإدراكية ، ويدعم النمو الصحي ووظيفة المناعة. اهدف إلى النوم لمدة 7-9 ساعات كل ليلة.\n",
917
+ " ما هي الألوان الثلاثة الأساسية؟\n",
918
+ "الألوان الثلاثة الأساسية هي الأحمر والأزرق والأصفر. تسمى هذه الألوان الأساسية لأنه لا يمكن إنشاؤها عن طريق مزج الألوان الأخرى ويمكن صنع كل الألوان الأخرى بدمجها بنسب مختلفة. في نظام الألوان المضافة ، المستخدم للضوء ، تكون الألوان الأساسية هي الأحمر والأخضر والأزرق (RGB).\n",
919
+ " صف بنية الذرة.\n"
920
+ ]
921
+ }
922
+ ]
923
+ },
924
+ {
925
+ "cell_type": "code",
926
+ "source": [
927
+ "# Data preparation\n",
928
+ "def encode_translation_pairs(inputs, outputs, tokenizer, max_len=128):\n",
929
+ " translations = [\n",
930
+ " f\"{inputs[i]} <|sep|> {outputs[i]}\"\n",
931
+ " for i in range(len(inputs))\n",
932
+ " ]\n",
933
+ " return tokenizer(\n",
934
+ " translations,\n",
935
+ " max_length=max_len,\n",
936
+ " truncation=True,\n",
937
+ " padding=\"max_length\",\n",
938
+ " return_tensors=\"pt\"\n",
939
+ " )[\"input_ids\"]\n",
940
+ "\n",
941
+ "encoded_data = encode_translation_pairs(en_sentences, ar_sentences, tokenizer)\n",
942
+ "\n",
943
+ "# Create a Dataset object\n",
944
+ "dataset = Dataset.from_dict({\"input_ids\": encoded_data, \"labels\": encoded_data})\n",
945
+ "\n",
946
+ "# Split datasets\n",
947
+ "train_size = int(0.8 * len(dataset))\n",
948
+ "val_size = (len(dataset) - train_size) // 2\n",
949
+ "train_dataset, temp_dataset = dataset.train_test_split(train_size=train_size).values()\n",
950
+ "val_dataset, test_dataset = temp_dataset.train_test_split(train_size=val_size).values()"
951
+ ],
952
+ "metadata": {
953
+ "id": "KEz2EDWhgF2n"
954
+ },
955
+ "execution_count": 4,
956
+ "outputs": []
957
+ },
958
+ {
959
+ "cell_type": "code",
960
+ "source": [
961
+ "# Load and configure the model\n",
962
+ "model = GPT2LMHeadModel.from_pretrained(\"gpt2\")\n",
963
+ "model.resize_token_embeddings(len(tokenizer))\n",
964
+ "\n",
965
+ "# Print the model configuration\n",
966
+ "print(model.config)\n"
967
+ ],
968
+ "metadata": {
969
+ "colab": {
970
+ "base_uri": "https://localhost:8080/",
971
+ "height": 813,
972
+ "referenced_widgets": [
973
+ "2bea128ff6a84dea98153aa2c398c845",
974
+ "9860c006ecf841e8ac018927761fadda",
975
+ "06f1133db9334aa79e927e51220a1561",
976
+ "f19b60de091948998a0d037896442ac5",
977
+ "e47071d647b04f4683527fc4c5acb592",
978
+ "78630ac454e54e91b3b46184da36f29b",
979
+ "23c4bde6b4cd4b918d8ddc0a504d263d",
980
+ "821c81b459e04c92a3609ae2c017e1a5",
981
+ "63aed25f754742948b88a967a1b413d7",
982
+ "d44d9eaebd4449c39d0a85f8886e3110",
983
+ "7e4fb57794f9468a9de7a1733a21c0b9",
984
+ "5a3cde9d5eda48d1bacedb3da7a8c36e",
985
+ "59dfbf89a4a44856853bbdd20e87c1b6",
986
+ "35a848f438c44b3aa35bc3ec3720fb6d",
987
+ "50e96a2a7dac40719b2bfac5aebcd802",
988
+ "c23fb59092fd4b019a41e6ba82827880",
989
+ "afbba89a0bfc4a1994e5f5b009f1ff9c",
990
+ "d1562f9faedd457c9b12c8fe88215a0b",
991
+ "b92e1b10eba743dea4c248022193de45",
992
+ "e5703982607e4ccaa875bf168d29b567",
993
+ "c42451c719a24442aa0eea5d985f5f21",
994
+ "e01cdc7235bf42ef84a900b1c5bf206e"
995
+ ]
996
+ },
997
+ "id": "YPkbhpWKhLas",
998
+ "outputId": "75e529e8-3b80-4336-a367-b314c90f97de"
999
+ },
1000
+ "execution_count": 5,
1001
+ "outputs": [
1002
+ {
1003
+ "output_type": "display_data",
1004
+ "data": {
1005
+ "text/plain": [
1006
+ "model.safetensors: 0%| | 0.00/548M [00:00<?, ?B/s]"
1007
+ ],
1008
+ "application/vnd.jupyter.widget-view+json": {
1009
+ "version_major": 2,
1010
+ "version_minor": 0,
1011
+ "model_id": "2bea128ff6a84dea98153aa2c398c845"
1012
+ }
1013
+ },
1014
+ "metadata": {}
1015
+ },
1016
+ {
1017
+ "output_type": "display_data",
1018
+ "data": {
1019
+ "text/plain": [
1020
+ "generation_config.json: 0%| | 0.00/124 [00:00<?, ?B/s]"
1021
+ ],
1022
+ "application/vnd.jupyter.widget-view+json": {
1023
+ "version_major": 2,
1024
+ "version_minor": 0,
1025
+ "model_id": "5a3cde9d5eda48d1bacedb3da7a8c36e"
1026
+ }
1027
+ },
1028
+ "metadata": {}
1029
+ },
1030
+ {
1031
+ "output_type": "stream",
1032
+ "name": "stderr",
1033
+ "text": [
1034
+ "The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`\n"
1035
+ ]
1036
+ },
1037
+ {
1038
+ "output_type": "stream",
1039
+ "name": "stdout",
1040
+ "text": [
1041
+ "GPT2Config {\n",
1042
+ " \"_attn_implementation_autoset\": true,\n",
1043
+ " \"_name_or_path\": \"gpt2\",\n",
1044
+ " \"activation_function\": \"gelu_new\",\n",
1045
+ " \"architectures\": [\n",
1046
+ " \"GPT2LMHeadModel\"\n",
1047
+ " ],\n",
1048
+ " \"attn_pdrop\": 0.1,\n",
1049
+ " \"bos_token_id\": 50256,\n",
1050
+ " \"embd_pdrop\": 0.1,\n",
1051
+ " \"eos_token_id\": 50256,\n",
1052
+ " \"initializer_range\": 0.02,\n",
1053
+ " \"layer_norm_epsilon\": 1e-05,\n",
1054
+ " \"model_type\": \"gpt2\",\n",
1055
+ " \"n_ctx\": 1024,\n",
1056
+ " \"n_embd\": 768,\n",
1057
+ " \"n_head\": 12,\n",
1058
+ " \"n_inner\": null,\n",
1059
+ " \"n_layer\": 12,\n",
1060
+ " \"n_positions\": 1024,\n",
1061
+ " \"reorder_and_upcast_attn\": false,\n",
1062
+ " \"resid_pdrop\": 0.1,\n",
1063
+ " \"scale_attn_by_inverse_layer_idx\": false,\n",
1064
+ " \"scale_attn_weights\": true,\n",
1065
+ " \"summary_activation\": null,\n",
1066
+ " \"summary_first_dropout\": 0.1,\n",
1067
+ " \"summary_proj_to_labels\": true,\n",
1068
+ " \"summary_type\": \"cls_index\",\n",
1069
+ " \"summary_use_proj\": true,\n",
1070
+ " \"task_specific_params\": {\n",
1071
+ " \"text-generation\": {\n",
1072
+ " \"do_sample\": true,\n",
1073
+ " \"max_length\": 50\n",
1074
+ " }\n",
1075
+ " },\n",
1076
+ " \"transformers_version\": \"4.46.3\",\n",
1077
+ " \"use_cache\": true,\n",
1078
+ " \"vocab_size\": 50258\n",
1079
+ "}\n",
1080
+ "\n"
1081
+ ]
1082
+ }
1083
+ ]
1084
+ },
1085
+ {
1086
+ "cell_type": "code",
1087
+ "source": [
1088
+ "# Data Collator\n",
1089
+ "data_collator = DataCollatorForLanguageModeling(\n",
1090
+ " tokenizer=tokenizer,\n",
1091
+ " mlm=False # No masking\n",
1092
+ ")\n",
1093
+ "\n",
1094
+ "# Training Arguments\n",
1095
+ "training_args = TrainingArguments(\n",
1096
+ " output_dir=\"./results\",\n",
1097
+ " evaluation_strategy=\"epoch\",\n",
1098
+ " save_strategy=\"epoch\",\n",
1099
+ " logging_strategy=\"steps\",\n",
1100
+ " logging_steps=10,\n",
1101
+ " learning_rate=5e-5,\n",
1102
+ " per_device_train_batch_size=4,\n",
1103
+ " per_device_eval_batch_size=4,\n",
1104
+ " num_train_epochs=10,\n",
1105
+ " weight_decay=0.01,\n",
1106
+ " save_total_limit=2,\n",
1107
+ " load_best_model_at_end=True,\n",
1108
+ " fp16=torch.cuda.is_available(),\n",
1109
+ " report_to=[]\n",
1110
+ ")\n",
1111
+ "\n",
1112
+ "# Trainer\n",
1113
+ "trainer = Trainer(\n",
1114
+ " model=model,\n",
1115
+ " args=training_args,\n",
1116
+ " train_dataset=train_dataset,\n",
1117
+ " eval_dataset=val_dataset,\n",
1118
+ " tokenizer=tokenizer,\n",
1119
+ " data_collator=data_collator,\n",
1120
+ ")\n",
1121
+ "\n",
1122
+ "# Train the model\n",
1123
+ "print(\"Training started!\")\n",
1124
+ "trainer.train()\n",
1125
+ "print(\"Training complete!\")\n"
1126
+ ],
1127
+ "metadata": {
1128
+ "colab": {
1129
+ "base_uri": "https://localhost:8080/",
1130
+ "height": 531
1131
+ },
1132
+ "id": "Zjb1mbHJgKrJ",
1133
+ "outputId": "cc9d2f83-c9c9-46ad-ddfb-0ebec390acce"
1134
+ },
1135
+ "execution_count": 6,
1136
+ "outputs": [
1137
+ {
1138
+ "output_type": "stream",
1139
+ "name": "stderr",
1140
+ "text": [
1141
+ "/usr/local/lib/python3.10/dist-packages/transformers/training_args.py:1568: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead\n",
1142
+ " warnings.warn(\n",
1143
+ "<ipython-input-6-a52d4ab58833>:26: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Trainer.__init__`. Use `processing_class` instead.\n",
1144
+ " trainer = Trainer(\n"
1145
+ ]
1146
+ },
1147
+ {
1148
+ "output_type": "stream",
1149
+ "name": "stdout",
1150
+ "text": [
1151
+ "Training started!\n"
1152
+ ]
1153
+ },
1154
+ {
1155
+ "output_type": "display_data",
1156
+ "data": {
1157
+ "text/plain": [
1158
+ "<IPython.core.display.HTML object>"
1159
+ ],
1160
+ "text/html": [
1161
+ "\n",
1162
+ " <div>\n",
1163
+ " \n",
1164
+ " <progress value='2000' max='2000' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
1165
+ " [2000/2000 02:07, Epoch 10/10]\n",
1166
+ " </div>\n",
1167
+ " <table border=\"1\" class=\"dataframe\">\n",
1168
+ " <thead>\n",
1169
+ " <tr style=\"text-align: left;\">\n",
1170
+ " <th>Epoch</th>\n",
1171
+ " <th>Training Loss</th>\n",
1172
+ " <th>Validation Loss</th>\n",
1173
+ " </tr>\n",
1174
+ " </thead>\n",
1175
+ " <tbody>\n",
1176
+ " <tr>\n",
1177
+ " <td>1</td>\n",
1178
+ " <td>2.445600</td>\n",
1179
+ " <td>2.390493</td>\n",
1180
+ " </tr>\n",
1181
+ " <tr>\n",
1182
+ " <td>2</td>\n",
1183
+ " <td>2.091400</td>\n",
1184
+ " <td>2.346333</td>\n",
1185
+ " </tr>\n",
1186
+ " <tr>\n",
1187
+ " <td>3</td>\n",
1188
+ " <td>1.971000</td>\n",
1189
+ " <td>2.335375</td>\n",
1190
+ " </tr>\n",
1191
+ " <tr>\n",
1192
+ " <td>4</td>\n",
1193
+ " <td>1.770700</td>\n",
1194
+ " <td>2.391809</td>\n",
1195
+ " </tr>\n",
1196
+ " <tr>\n",
1197
+ " <td>5</td>\n",
1198
+ " <td>1.638600</td>\n",
1199
+ " <td>2.447893</td>\n",
1200
+ " </tr>\n",
1201
+ " <tr>\n",
1202
+ " <td>6</td>\n",
1203
+ " <td>1.352900</td>\n",
1204
+ " <td>2.500594</td>\n",
1205
+ " </tr>\n",
1206
+ " <tr>\n",
1207
+ " <td>7</td>\n",
1208
+ " <td>1.298600</td>\n",
1209
+ " <td>2.558636</td>\n",
1210
+ " </tr>\n",
1211
+ " <tr>\n",
1212
+ " <td>8</td>\n",
1213
+ " <td>1.350600</td>\n",
1214
+ " <td>2.575015</td>\n",
1215
+ " </tr>\n",
1216
+ " <tr>\n",
1217
+ " <td>9</td>\n",
1218
+ " <td>1.249300</td>\n",
1219
+ " <td>2.616520</td>\n",
1220
+ " </tr>\n",
1221
+ " <tr>\n",
1222
+ " <td>10</td>\n",
1223
+ " <td>1.253400</td>\n",
1224
+ " <td>2.634244</td>\n",
1225
+ " </tr>\n",
1226
+ " </tbody>\n",
1227
+ "</table><p>"
1228
+ ]
1229
+ },
1230
+ "metadata": {}
1231
+ },
1232
+ {
1233
+ "output_type": "stream",
1234
+ "name": "stderr",
1235
+ "text": [
1236
+ "There were missing keys in the checkpoint model loaded: ['lm_head.weight'].\n"
1237
+ ]
1238
+ },
1239
+ {
1240
+ "output_type": "stream",
1241
+ "name": "stdout",
1242
+ "text": [
1243
+ "Training complete!\n"
1244
+ ]
1245
+ }
1246
+ ]
1247
+ },
1248
+ {
1249
+ "cell_type": "code",
1250
+ "execution_count": 14,
1251
+ "metadata": {
1252
+ "colab": {
1253
+ "base_uri": "https://localhost:8080/"
1254
+ },
1255
+ "id": "35r0qHABcmIK",
1256
+ "outputId": "ce27e146-2ef1-41fb-bb65-78aaf761b2ea"
1257
+ },
1258
+ "outputs": [
1259
+ {
1260
+ "output_type": "stream",
1261
+ "name": "stdout",
1262
+ "text": [
1263
+ "\n",
1264
+ "Test Sentence Translations:\n",
1265
+ "English: The cat is sitting on the window.\n",
1266
+ "Arabic: أجد في المقالة وتواسلها من: \"��بع\" = 10/4ر\n",
1267
+ "1) 9-5 cm x 6cm (2). 2)(\n",
1268
+ "\n",
1269
+ "English: I need to buy a new phone.\n",
1270
+ "Arabic: أعد كتابة مثليق المسبوان والره�, \"Budu\". #102625 https://tribalnews3rv4p\n",
1271
+ "\n",
1272
+ "English: She loves reading books during her free time.\n",
1273
+ "Arabic: هذه البانتي إلى مساعة ومعدور (ال1) https://www2h4xl8y7n5/diyar-al\n",
1274
+ "\n",
1275
+ "English: The train arrives at 7 PM.\n",
1276
+ "Arabic: بإنتاسيار موعلة 5 فهمد 3 البالً 4 وسَ� 6 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24\n",
1277
+ "\n",
1278
+ "English: I enjoy listening to music while working.\n",
1279
+ "Arabic: أحمد في النترقائة والعباول مسه, #al-Baqara https://tribulationworld.wordpress/\n",
1280
+ "‹�Euph\n",
1281
+ "\n",
1282
+ "English: The meeting has been scheduled for tomorrow.\n",
1283
+ "Arabic: أعد كتابة الجملين والورباهً مسی \"Jihad\". #jihadi\n",
1284
+ "’- The following is a list of the items that\n",
1285
+ "\n",
1286
+ "English: Can you recommend a good restaurant nearby?\n",
1287
+ "Arabic: أعد قائمة الصنيف والسبار. مهوتل: \"A great meal at the best price\". #jihadpics\n",
1288
+ "‌🊥\n",
1289
+ "\n",
1290
+ "English: I have finished my homework for the day.\n",
1291
+ "Arabic: كتبين بالمشاء العلاقة وهو مسرد \"Amar\" (5) 1/2\". #93529\n",
1292
+ "The following morning, I woke\n",
1293
+ "\n",
1294
+ "English: The city is known for its beautiful parks.\n",
1295
+ "Arabic: يمشكون إليها مراسعة البدال وستیک \"Husain\" = the mountain of mountains, which stands at top and meets with\n",
1296
+ "\n",
1297
+ "English: Do you prefer tea or coffee?\n",
1298
+ "Arabic: أعد قصير من الكلمة والتواعبا.\n",
1299
+ "سهِ: \"I like to go on walks and try new things.\" #tea pic 1 https\n",
1300
+ "\n"
1301
+ ]
1302
+ }
1303
+ ],
1304
+ "source": [
1305
+ "\n",
1306
+ "import re\n",
1307
+ "import torch\n",
1308
+ "from transformers import GPT2Tokenizer, GPT2LMHeadModel\n",
1309
+ "\n",
1310
+ "# Test sentences for translation (these are just examples)\n",
1311
+ "test_samples = [\n",
1312
+ " \"The cat is sitting on the window.\",\n",
1313
+ " \"I need to buy a new phone.\",\n",
1314
+ " \"She loves reading books during her free time.\",\n",
1315
+ " \"The train arrives at 7 PM.\",\n",
1316
+ " \"I enjoy listening to music while working.\",\n",
1317
+ " \"The meeting has been scheduled for tomorrow.\",\n",
1318
+ " \"Can you recommend a good restaurant nearby?\",\n",
1319
+ " \"I have finished my homework for the day.\",\n",
1320
+ " \"The city is known for its beautiful parks.\",\n",
1321
+ " \"Do you prefer tea or coffee?\"\n",
1322
+ "]\n",
1323
+ "\n",
1324
+ "\n",
1325
+ "# Add a special token for separator\n",
1326
+ "tokenizer.add_special_tokens({'additional_special_tokens': [\"<|sep|>\"]})\n",
1327
+ "model.resize_token_embeddings(len(tokenizer))\n",
1328
+ "\n",
1329
+ "# Function to translate sentences\n",
1330
+ "def translate_sentence(model, tokenizer, sentence):\n",
1331
+ " input_text = f\"{sentence} <|sep|>\"\n",
1332
+ " input_ids = tokenizer(input_text, return_tensors=\"pt\").input_ids.to(model.device)\n",
1333
+ " attention_mask = tokenizer(input_text, return_tensors=\"pt\").attention_mask.to(model.device)\n",
1334
+ "\n",
1335
+ " # Generate translation\n",
1336
+ " outputs = model.generate(\n",
1337
+ " input_ids,\n",
1338
+ " attention_mask=attention_mask,\n",
1339
+ " max_new_tokens=50,\n",
1340
+ " do_sample=True,\n",
1341
+ " temperature=0.7,\n",
1342
+ " top_p=0.9,\n",
1343
+ " top_k=50,\n",
1344
+ " repetition_penalty=3.0,\n",
1345
+ " early_stopping=True,\n",
1346
+ " pad_token_id=tokenizer.pad_token_id\n",
1347
+ " )\n",
1348
+ "\n",
1349
+ " translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)\n",
1350
+ "\n",
1351
+ " # Clean up translated text: drop everything up to and including the first '.', '?' or '!'\n",
1352
+ " match = re.search(r'[.?!]', translated_text)\n",
1353
+ " if match:\n",
1354
+ " translated_text = translated_text[match.end():].strip()\n",
1355
+ "\n",
1356
+ " return translated_text\n",
1357
+ "\n",
1358
+ "# Translate test sentences\n",
1359
+ "print(\"\\nTest Sentence Translations:\")\n",
1360
+ "for sentence in test_samples:\n",
1361
+ " translation = translate_sentence(model, tokenizer, sentence)\n",
1362
+ " print(f\"English: {sentence}\")\n",
1363
+ " print(f\"Arabic: {translation}\\n\")\n"
1364
+ ]
1365
+ },
1366
+ {
1367
+ "cell_type": "code",
1368
+ "source": [
1369
+ "# Perplexity Calculation\n",
1370
+ "# Perplexity = exp(language-modeling loss) of the model on the given sentences\n",
1371
+ "def calculate_perplexity(model, tokenizer, sentences):\n",
1372
+ " inputs = tokenizer(sentences, return_tensors=\"pt\", padding=True, truncation=True, max_length=128)\n",
1373
+ " input_ids = inputs['input_ids'].to(model.device)\n",
1374
+ " attention_mask = inputs['attention_mask'].to(model.device)\n",
1375
+ "\n",
1376
+ " with torch.no_grad():\n",
1377
+ " outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)\n",
1378
+ " loss = outputs.loss\n",
1379
+ " return torch.exp(loss).item()\n",
1380
+ "\n",
1381
+ "# CHRF Score Calculation\n",
1382
+ "def calculate_chrf_score(references, translations):\n",
1383
+ " # NOTE(review): sacrebleu.corpus_chrf takes hypotheses first, then references as a list of reference lists; the call below passes them in the opposite order - verify.\n",
1384
+ " chrf = sacrebleu.corpus_chrf(references, translations)\n",
1385
+ " return chrf.score\n",
1386
+ "\n",
1387
+ "# BLEU Score Calculation\n",
1388
+ "def calculate_bleu_score(references, translations):\n",
1389
+ " references = [[ref.split()] for ref in references]\n",
1390
+ " translations = [trans.split() for trans in translations]\n",
1391
+ " return corpus_bleu(references, translations)\n",
1392
+ "\n",
1393
+ "# Evaluate translations\n",
1394
+ "translated_sentences = [translate_sentence(model, tokenizer, s) for s in en_sentences[:5]]\n",
1395
+ "perplexity = calculate_perplexity(model, tokenizer, en_sentences[:5])\n",
1396
+ "bleu_score = calculate_bleu_score(ar_sentences[:5], translated_sentences)\n",
1397
+ "chrf = calculate_chrf_score(ar_sentences[:5], translated_sentences)\n",
1398
+ "\n",
1399
+ "print(f\"\\nPerplexity: {perplexity}\")\n",
1400
+ "print(f\"BLEU Score: {bleu_score}\")\n",
1401
+ "print(f\"CHRF Score: {chrf}\")\n",
1402
+ "\n",
1403
+ "\n"
1404
+ ],
1405
+ "metadata": {
1406
+ "colab": {
1407
+ "base_uri": "https://localhost:8080/"
1408
+ },
1409
+ "id": "lDmYGA_PgSdW",
1410
+ "outputId": "1098d48f-db88-4756-81d3-902fdcf84edc"
1411
+ },
1412
+ "execution_count": 15,
1413
+ "outputs": [
1414
+ {
1415
+ "output_type": "stream",
1416
+ "name": "stdout",
1417
+ "text": [
1418
+ "\n",
1419
+ "Perplexity: 2849.4482421875\n",
1420
+ "BLEU Score: 0\n",
1421
+ "CHRF Score: 16.0\n"
1422
+ ]
1423
+ }
1424
+ ]
1425
+ },
1426
+ {
1427
+ "cell_type": "code",
1428
+ "source": [
1429
+ "# Plotting Results\n",
1430
+ "def plot_results(perplexity, bleu_score, chrf):\n",
1431
+ " # Metrics for plotting\n",
1432
+ " metrics = ['Perplexity', 'BLEU Score', 'CHRF Score']\n",
1433
+ " values = [perplexity, bleu_score, chrf]\n",
1434
+ "\n",
1435
+ " plt.figure(figsize=(8, 6))\n",
1436
+ " plt.bar(metrics, values, color=['blue', 'green', 'orange'])\n",
1437
+ " plt.xlabel('Metrics')\n",
1438
+ " plt.ylabel('Scores')\n",
1439
+ " plt.title('Model Evaluation Metrics')\n",
1440
+ " plt.show()\n",
1441
+ "\n",
1442
+ "# Plot the results\n",
1443
+ "plot_results(perplexity, bleu_score, chrf)"
1444
+ ],
1445
+ "metadata": {
1446
+ "colab": {
1447
+ "base_uri": "https://localhost:8080/",
1448
+ "height": 564
1449
+ },
1450
+ "id": "WxIoO2bbmh3g",
1451
+ "outputId": "99e44007-a372-45e1-b672-7344afecc9ef"
1452
+ },
1453
+ "execution_count": 19,
1454
+ "outputs": [
1455
+ {
1456
+ "output_type": "display_data",
1457
+ "data": {
1458
+ "text/plain": [
1459
+ "<Figure size 800x600 with 1 Axes>"
1460
+ ],
1461
+ "image/png": "\n"
1462
+ },
1463
+ "metadata": {}
1464
+ }
1465
+ ]
1466
+ }
1467
+ ]
1468
+ }