ketanchaudhary88 committed on
Commit
5b21651
·
verified ·
1 Parent(s): e7ea3dd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +83 -17
app.py CHANGED
@@ -1,22 +1,88 @@
1
  import nlpaug.augmenter.word as naw
2
- import nlpaug.augmenter.translator as nat
 
 
3
 
4
- # Initialize augmenters
5
- synonym_aug = naw.SynonymAug(aug_src='en', lang='eng')
6
- back_translate_en_to_hi = nat.BackTranslationAug(from_model_name='Helsinki-NLP/opus-mt-en-hi', to_model_name='Helsinki-NLP/opus-mt-hi-en')
7
 
8
- def augment_text(text, augmentation_type='synonym'):
 
9
  """
10
- Augment text based on the specified type.
11
- augmentation_type: 'synonym' for synonym replacement, 'back_translation' for back translation
 
12
  """
13
- if augmentation_type == 'synonym':
14
- return synonym_aug.augment(text)
15
- elif augmentation_type == 'back_translation':
16
- return back_translate_en_to_hi.augment(text)
17
- else:
18
- return text
19
-
20
- # Test augmentation functions
21
- print(augment_text("What is your address?", augmentation_type='synonym')) # Synonym
22
- print(augment_text("What is your address?", augmentation_type='back_translation')) # Back translation
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import nlpaug.augmenter.word as naw
2
+ import nlpaug.augmenter.sentence as nas
3
+ from transformers import MarianMTModel, MarianTokenizer
4
+ import random
5
 
 
 
 
6
 
7
# 1. Back Translation using Hugging Face MarianMT models (English -> Hindi -> English)
def back_translate(text, from_model, to_model, from_tokenizer, to_tokenizer):
    """Perform back translation to paraphrase `text`.

    1. Translate from English to Hindi with `from_model`.
    2. Translate back from Hindi to English with `to_model`.

    Args:
        text: English input sentence.
        from_model / from_tokenizer: MarianMT model + tokenizer for en->hi.
        to_model / to_tokenizer: MarianMT model + tokenizer for hi->en.

    Returns:
        The back-translated English string.
    """
    # Step 1: Translate from English to Hindi.
    encoded = from_tokenizer.encode(text, return_tensors="pt", truncation=True, padding=True)
    translated = from_model.generate(encoded, num_beams=4, max_length=50, early_stopping=True)
    # BUG FIX: the en->hi model's output must be decoded with the en->hi
    # tokenizer — each Marian model has its own vocabulary. The original code
    # decoded with the hi->en tokenizer, mapping token ids through the wrong
    # vocab and producing garbage.
    hindi_text = from_tokenizer.decode(translated[0], skip_special_tokens=True)

    # Step 2: Translate back from Hindi to English.
    encoded_back = to_tokenizer.encode(hindi_text, return_tensors="pt", truncation=True, padding=True)
    back_translated = to_model.generate(encoded_back, num_beams=4, max_length=50, early_stopping=True)
    # Likewise, decode the hi->en model's output with its own tokenizer.
    back_translated_text = to_tokenizer.decode(back_translated[0], skip_special_tokens=True)

    return back_translated_text
25
+
26
+
27
# Load pre-trained MarianMT checkpoints used by back_translate().
# NOTE(review): these four from_pretrained() calls run at import time and may
# download weights — confirm eager loading is intended for this app.
en_to_hi_model_name = 'Helsinki-NLP/opus-mt-en-hi'  # English -> Hindi
hi_to_en_model_name = 'Helsinki-NLP/opus-mt-hi-en'  # Hindi -> English

# English -> Hindi direction.
en_to_hi_model = MarianMTModel.from_pretrained(en_to_hi_model_name)
en_to_hi_tokenizer = MarianTokenizer.from_pretrained(en_to_hi_model_name)

# Hindi -> English direction.
hi_to_en_model = MarianMTModel.from_pretrained(hi_to_en_model_name)
hi_to_en_tokenizer = MarianTokenizer.from_pretrained(hi_to_en_model_name)
37
+
38
# 2. Synonym Augmentation using nlpaug
def synonym_augmentation(text):
    """Replace random words in `text` with WordNet synonyms.

    Returns the augmented sentence(s) as produced by nlpaug's `augment()`.
    """
    # BUG FIX: SynonymAug's aug_src must be 'wordnet' or 'ppdb'; the original
    # aug_src='en' is rejected by nlpaug at construction time.
    augmenter = naw.SynonymAug(aug_src='wordnet', lang='eng')
    return augmenter.augment(text)
42
+
43
# 3. Random Deletion using nlpaug
def random_deletion(text):
    """Randomly drop roughly 30% of the words in `text`."""
    word_dropper = naw.RandomWordAug(action="delete", aug_p=0.3)
    return word_dropper.augment(text)
47
+
48
# 4. Random Insertion (pure Python)
def random_insertion(text, aug_p=0.3):
    """Randomly re-insert ~aug_p of the sentence's own words at random positions.

    BUG FIX: nlpaug's RandomWordAug supports only the 'substitute', 'swap',
    'delete' and 'crop' actions, so the original action="insert" raised an
    error on every call. This re-implementation keeps the function's intent
    (random word insertion) using the stdlib `random` module, which the file
    already imports.

    Args:
        text: input sentence.
        aug_p: fraction of the word count to insert (at least one word).

    Returns:
        A one-element list, matching the return shape of the nlpaug augmenters.
    """
    words = text.split()
    if not words:
        # Nothing to insert from; return the input unchanged.
        return [text]
    n_insert = max(1, int(len(words) * aug_p))
    for _ in range(n_insert):
        # Duplicate a random existing word at a random position.
        words.insert(random.randint(0, len(words)), random.choice(words))
    return [" ".join(words)]
52
+
53
# 5. Random Swap using nlpaug (optional)
def random_swap(text):
    """Swap the positions of roughly 30% of the words in `text`."""
    swapper = naw.RandomWordAug(action="swap", aug_p=0.3)
    return swapper.augment(text)
57
+
58
# 6. Combining all augmentations
def augment_text(text):
    """Apply every augmentation technique to `text` and collect the results.

    Order of results: back-translation, synonym replacement, random deletion,
    random insertion, random swap.
    """
    transforms = [
        # 1. Back-translation augmentation (uses the module-level MarianMT models).
        lambda t: back_translate(t, en_to_hi_model, hi_to_en_model,
                                 en_to_hi_tokenizer, hi_to_en_tokenizer),
        # 2. Synonym replacement.
        synonym_augmentation,
        # 3. Random deletion.
        random_deletion,
        # 4. Random insertion.
        random_insertion,
        # 5. Random swap (optional).
        random_swap,
    ]
    return [transform(text) for transform in transforms]
78
+
79
# Example usage
original_text = "What is your address?"

# Generate one variant per augmentation technique.
augmented_texts = augment_text(original_text)

# Show the original next to each augmented variant.
print("Original Text:", original_text)
for i, augmented in enumerate(augmented_texts, 1):
    print(f"Augmented {i}: {augmented}")