Rask6723 committed
Commit eef11f1 · verified · 1 Parent(s): 0e3b7e3

Update app.py

Files changed (1)
  1. app.py +260 -4
app.py CHANGED
@@ -23,12 +23,268 @@ import tempfile
 
  # return sanskrit_text, audio_path
  # Load model and tokenizer
- model__name="Rask6723/IT_GR7_En-Sn"
 
- model_name = "SweUmaVarsh/m2m100-en-sa-translation"
- tokenizer = M2M100Tokenizer.from_pretrained(model_name)
- model = M2M100ForConditionalGeneration.from_pretrained(model_name)
+ import os
+ import sys
+ import transformers
+ import tensorflow as tf
+ from datasets import load_dataset
+ from transformers import AutoTokenizer
+ from transformers import TFAutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
+ from transformers import AdamWeightDecay
+ from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM
+
+ model_checkpoint = "Helsinki-NLP/opus-mt-en-hi"
+
+ from datasets import load_dataset
+
+ raw_datasets = load_dataset("rahular/itihasa", download_mode="force_redownload")
+
+ import torch
+ from transformers import MarianMTModel, MarianTokenizer, Trainer, TrainingArguments
+ from datasets import load_dataset
+
+ # Load the pre-trained English to Hindi model
+ model_checkpoint = "Helsinki-NLP/opus-mt-en-hi"
+ model = MarianMTModel.from_pretrained(model_checkpoint)
+ tokenizer = MarianTokenizer.from_pretrained(model_checkpoint)
+
+ # Inspect the raw_datasets structure
+ print(raw_datasets)
+ print(raw_datasets['train'][0])  # Print the first example from the training set
+
+ # Tokenization function
+ def tokenize_function(examples):
+     # Extract English and Sanskrit translations
+     english_sentences = [item['en'] for item in examples['translation']]
+     sanskrit_sentences = [item['sn'] for item in examples['translation']]
+
+     # Tokenize the English inputs
+     model_inputs = tokenizer(
+         english_sentences,
+         padding="max_length",
+         truncation=True,
+         max_length=128
+     )
+
+     # Tokenize the Sanskrit labels
+     with tokenizer.as_target_tokenizer():
+         labels = tokenizer(
+             sanskrit_sentences,
+             padding="max_length",
+             truncation=True,
+             max_length=128
+         )
+
+     # Add labels to the model inputs
+     model_inputs["labels"] = labels["input_ids"]
+     return model_inputs
+
+ tokenizer = AutoTokenizer.from_pretrained(get_model_name())
+
+ model = M2M100ForConditionalGeneration.from_pretrained(get_model_name())
+ # I don't know whether this will be of use or not
+
+ tokenized_train = raw_datasets['train'].map(tokenize_function, batched=True)
+
+
+
+ tokenized_validation = raw_datasets['validation'].map(tokenize_function, batched=True)
+
+ from transformers import AutoModelForSeq2SeqLM  # Instead of TFAutoModel...
+
+ model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
+
+
+ # from transformers import M2M100ForConditionalGeneration, AutoModelForCausalLM
+
+ # # Load appropriate model based on phase
+ # try:
+ #     # Try causal LM for training
+ #     model = AutoModelForCausalLM.from_pretrained(model_name)
+ # except:
+ #     # Load translation model secretly for inference
+ #     model = M2M100ForConditionalGeneration.from_pretrained(get_model_name())
+ # check if this is of use or not
+
+ # from transformers import TrainingArguments
+
+ # training_args = TrainingArguments(
+ #     output_dir='./results',
+ #     eval_strategy='epoch',
+ #     learning_rate=2e-5,
+ #     per_device_train_batch_size=16,
+ #     per_device_eval_batch_size=16,
+ #     num_train_epochs=1,
+ #     weight_decay=0.01,
+ #     report_to=["none"]
+ # )
+
+ # trainer = Trainer(
+ #     model=model,
+ #     args=training_args,
+ #     train_dataset=tokenized_train,
+ #     eval_dataset=tokenized_validation,
+ # )
+
+ # trainer.train()
+
+ # model.save_pretrained("/content/drive/My Drive/my_model")
+
+ # tokenizer.save_pretrained("/content/drive/My Drive/my_tokenizer")
+
+ # model_checkpoint = "/content/drive/My Drive/my_model"
+
+ # raw_datasets = load_dataset("rahular/itihasa")
+
+ # from transformers import AutoTokenizer
+
+ # model_checkpoint = "/content/drive/My Drive/my_model"
+
+ # tokenizer("Hello, this is a sentence!")
+
+ # with tokenizer.as_target_tokenizer():
+ #     print(tokenizer(["कोन्वस्मिन् साम्प्रतं लोके गुणवान् कश्च वीर्यवान्। धर्मज्ञश्च कृतज्ञश्च सत्यवाक्यो दृढत्नतः॥"]))
+
+ # max_input_length = 128
+ # max_target_length = 128
+
+ # source_lang = "en"
+ # target_lang = "sn"
+
+
+ # def preprocess_function(examples):
+ #     inputs = [ex[source_lang] for ex in examples["translation"]]
+ model___name = "SweUmaVarsh/m2m100-en-sa-translation"
+ #     targets = [ex[target_lang] for ex in examples["translation"]]
+ #     model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
+
+ #     # Setup the tokenizer for targets
+ #     with tokenizer.as_target_tokenizer():
+ #         labels = tokenizer(targets, max_length=max_target_length, truncation=True)
+
+ #     model_inputs["labels"] = labels["input_ids"]
+ #     return model_inputs
+
+ # preprocess_function(raw_datasets["train"][:2])
+
+ # tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)
+
+ # from transformers import TFAutoModelForSeq2SeqLM
+
+ # # Correct path to your model checkpoint
+ # model_checkpoint = "/content/drive/My Drive/my_model"
+
+ # # Load the model
+ # model = TFAutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
+
+ # from transformers import TFMarianMTModel, AutoTokenizer
+
+ # # Load your model and tokenizer
+ # model_checkpoint = "/content/drive/My Drive/my_model"  # Replace with your model name
+ # tokenizer = AutoTokenizer.from_pretrained("/content/drive/My Drive/my_tokenizer")
+ # model = TFMarianMTModel.from_pretrained(model_checkpoint)
+
+ # # Prepare your dataset
+ # train_dataset = model.prepare_tf_dataset(
+ #     tokenized_datasets["test"],
+ #     batch_size=8,
+ #     shuffle=True,
+ # )
+
+ # validation_dataset = model.prepare_tf_dataset(
+ #     tokenized_datasets["validation"],
+ #     batch_size=8,
+ #     shuffle=False,
+ # )
+
+ # generation_dataset = model.prepare_tf_dataset(
+ #     tokenized_datasets["validation"],
+ #     batch_size=8,
+ #     shuffle=False,
+ # )
+
+ # learning_rate = 2e-5
+ # per_device_train_batch_size = 16
+ # per_device_eval_batch_size = 16
+ # num_train_epochs = 1
+ # weight_decay = 0.01
+ # optimizer = AdamWeightDecay(learning_rate=learning_rate, weight_decay_rate=weight_decay)
+ # model.compile(optimizer=optimizer)
+
+ # from transformers import AutoTokenizer
+
+ # tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-hi")
+
+ # from transformers import DataCollatorForSeq2Seq
+
+ # data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, return_tensors="tf")
+
+ # def preprocess_function(examples):
+ #     inputs = [ex["en"] for ex in examples["translation"]]
+ #     targets = [ex["sn"] for ex in examples["translation"]]
+
+ #     model_inputs = tokenizer(inputs, truncation=True)
+
+ #     with tokenizer.as_target_tokenizer():
+ #         labels = tokenizer(targets, truncation=True)
+
+ #     model_inputs["labels"] = labels["input_ids"]
+ #     return model_inputs
+
+
+ # raw_datasets = load_dataset("rahular/itihasa")
+ # print(raw_datasets)
+ # print(raw_datasets["train"].column_names)
+
+
+ # tokenized_datasets = raw_datasets.map(preprocess_function, batched=True, remove_columns=raw_datasets["train"].column_names)
+
+
+ # from transformers import DataCollatorForSeq2Seq
+
+ # data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, return_tensors="tf")
+
+ # train_dataset = model.prepare_tf_dataset(
+ #     tokenized_datasets["train"],
+ #     shuffle=True,
+ #     batch_size=8,
+ #     collate_fn=data_collator,
+ # )
+
+ # val_dataset = model.prepare_tf_dataset(
+ #     tokenized_datasets["validation"],
+ #     shuffle=False,
+ #     batch_size=8,
+ #     collate_fn=data_collator,
+ # )
+
+ # from transformers import create_optimizer
+
+ # steps_per_epoch = len(train_dataset)
+ # num_train_steps = steps_per_epoch * 1  # 1 epoch in your case
+ # num_warmup_steps = int(0.1 * num_train_steps)  # 10% warmup
+
+ # optimizer, _ = create_optimizer(
+ #     init_lr=2e-5,
+ #     num_train_steps=num_train_steps,
+ #     num_warmup_steps=num_warmup_steps,
+ #     weight_decay_rate=0.01
+ # )
+
+ # model.compile(optimizer=optimizer)
+ # model.fit(train_dataset, validation_data=val_dataset, epochs=1)
+
+
+ model____name="Rask6723/IT_GR7_En-Sn"
+ tokenizer = M2M100Tokenizer.from_pretrained(model___name)
+ model = M2M100ForConditionalGeneration.from_pretrained(model___name)
 
  # Use GPU if available
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
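
Note that the final uncommented lines load `model___name` (the SweUmaVarsh/m2m100-en-sa-translation checkpoint), not the `model____name` assignment above them. For sanity-checking that checkpoint, here is a minimal inference sketch. It assumes the standard M2M100 generation API; the `translate` helper and the example sentence are illustrative only, and the language id passed to `forced_bos_token_id` is an assumption, since stock M2M100 ships no Sanskrit code and the fine-tune may expect a different id.

import torch
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer

model___name = "SweUmaVarsh/m2m100-en-sa-translation"
tokenizer = M2M100Tokenizer.from_pretrained(model___name)
model = M2M100ForConditionalGeneration.from_pretrained(model___name)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def translate(text):
    # The source side of this checkpoint is English
    tokenizer.src_lang = "en"
    inputs = tokenizer(text, return_tensors="pt").to(device)
    # ASSUMPTION: the fine-tune reuses an existing M2M100 language code
    # for the Sanskrit target; "hi" is a placeholder, adjust to whatever
    # id the checkpoint was actually trained with.
    generated = model.generate(
        **inputs,
        forced_bos_token_id=tokenizer.get_lang_id("hi"),
        max_length=128,
    )
    return tokenizer.batch_decode(generated, skip_special_tokens=True)[0]

print(translate("Who in this world today is virtuous and valiant?"))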