Rask6723 committed on
Commit
8131767
·
verified ·
1 Parent(s): eef11f1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +15 -88
app.py CHANGED
@@ -23,78 +23,27 @@ import tempfile
23
 
24
  # return sanskrit_text, audio_path
25
  # Load model and tokenizer
26
- import os
27
- import sys
28
- import transformers
29
- import tensorflow as tf
30
- from datasets import load_dataset
31
- from transformers import AutoTokenizer
32
- from transformers import TFAutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
33
- from transformers import AdamWeightDecay
34
- from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM
35
 
36
- model_checkpoint = "Helsinki-NLP/opus-mt-en-hi"
37
 
38
- from datasets import load_dataset
39
 
40
- raw_datasets = load_dataset("rahular/itihasa", download_mode="force_redownload")
41
 
42
- import torch
43
- from transformers import MarianMTModel, MarianTokenizer, Trainer, TrainingArguments
44
- from datasets import load_dataset
45
 
46
  # Load the pre-trained English to Hindi model
47
- model_checkpoint = "Helsinki-NLP/opus-mt-en-hi"
48
- model = MarianMTModel.from_pretrained(model_checkpoint)
49
- tokenizer = MarianTokenizer.from_pretrained(model_checkpoint)
50
-
51
- # Inspect the raw_datasets structure
52
- print(raw_datasets)
53
- print(raw_datasets['train'][0]) # Print the first example from the training set
54
-
55
- # Tokenization function
56
- def tokenize_function(examples):
57
- # Extract English and Sanskrit translations
58
- english_sentences = [item['en'] for item in examples['translation']]
59
- sanskrit_sentences = [item['sn'] for item in examples['translation']]
60
-
61
- # Tokenize the English inputs
62
- model_inputs = tokenizer(
63
- english_sentences,
64
- padding="max_length",
65
- truncation=True,
66
- max_length=128
67
- )
68
-
69
- # Tokenize the Sanskrit labels
70
- with tokenizer.as_target_tokenizer():
71
- labels = tokenizer(
72
- sanskrit_sentences,
73
- padding="max_length",
74
- truncation=True,
75
- max_length=128
76
- )
77
-
78
- # Add labels to the model inputs
79
- model_inputs["labels"] = labels["input_ids"]
80
- return model_inputs
81
-
82
- tokenizer = AutoTokenizer.from_pretrained(get_model_name())
83
-
84
- model = M2M100ForConditionalGeneration.from_pretrained(get_model_name())
85
- # I dont know wheter this will be of use or not
86
-
87
- tokenized_train = raw_datasets['train'].map(tokenize_function, batched=True)
88
-
89
-
90
-
91
- tokenized_validation = raw_datasets['validation'].map(tokenize_function, batched=True)
92
-
93
- from transformers import AutoModelForSeq2SeqLM # Instead of TFAutoModel...
94
-
95
- model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
96
-
97
-
98
  # from transformers import M2M100ForConditionalGeneration, AutoModelForCausalLM
99
 
100
  # # Load appropriate model based on phase
@@ -259,28 +208,6 @@ model___name = "SweUmaVarsh/m2m100-en-sa-translation"
259
  # shuffle=False,
260
  # batch_size=8,
261
  # collate_fn=data_collator,
262
- # )
263
-
264
- # from transformers import create_optimizer
265
-
266
- # steps_per_epoch = len(train_dataset)
267
- # num_train_steps = steps_per_epoch * 1 # 1 epoch in your case
268
- # num_warmup_steps = int(0.1 * num_train_steps) # 10% warmup
269
-
270
- # optimizer, _ = create_optimizer(
271
- # init_lr=2e-5,
272
- # num_train_steps=num_train_steps,
273
- # num_warmup_steps=num_warmup_steps,
274
- # weight_decay_rate=0.01
275
- # )
276
-
277
- # model.compile(optimizer=optimizer)
278
- # model.fit(train_dataset, validation_data=val_dataset, epochs=1)
279
-
280
-
281
-
282
-
283
-
284
 
285
  model____name="Rask6723/IT_GR7_En-Sn"
286
  tokenizer = M2M100Tokenizer.from_pretrained(model___name)
 
23
 
24
  # return sanskrit_text, audio_path
25
  # Load model and tokenizer
26
+ # import os
27
+ # import sys
28
+ # import transformers
29
+ # import tensorflow as tf
30
+ # from datasets import load_dataset
31
+ # from transformers import AutoTokenizer
32
+ # from transformers import TFAutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
33
+ # from transformers import AdamWeightDecay
34
+ # from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM
35
 
36
+ # model_checkpoint = "Helsinki-NLP/opus-mt-en-hi"
37
 
38
+ # from datasets import load_dataset
39
 
40
+ # raw_datasets = load_dataset("rahular/itihasa", download_mode="force_redownload")
41
 
42
+ # import torch
43
+ # from transformers import MarianMTModel, MarianTokenizer, Trainer, TrainingArguments
44
+ # from datasets import load_dataset
45
 
46
  # Load the pre-trained English to Hindi model
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
  # from transformers import M2M100ForConditionalGeneration, AutoModelForCausalLM
48
 
49
  # # Load appropriate model based on phase
 
208
  # shuffle=False,
209
  # batch_size=8,
210
  # collate_fn=data_collator,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
211
 
212
  model____name="Rask6723/IT_GR7_En-Sn"
213
  tokenizer = M2M100Tokenizer.from_pretrained(model___name)