ketanchaudhary88 committed on
Commit
5b21651
·
verified ·
1 Parent(s): e7ea3dd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +83 -17
app.py CHANGED
@@ -1,22 +1,88 @@
1
  import nlpaug.augmenter.word as naw
2
- import nlpaug.augmenter.translator as nat
 
 
3
 
4
- # Initialize augmenters
5
- synonym_aug = naw.SynonymAug(aug_src='en', lang='eng')
6
- back_translate_en_to_hi = nat.BackTranslationAug(from_model_name='Helsinki-NLP/opus-mt-en-hi', to_model_name='Helsinki-NLP/opus-mt-hi-en')
7
 
8
- def augment_text(text, augmentation_type='synonym'):
 
9
  """
10
- Augment text based on the specified type.
11
- augmentation_type: 'synonym' for synonym replacement, 'back_translation' for back translation
 
12
  """
13
- if augmentation_type == 'synonym':
14
- return synonym_aug.augment(text)
15
- elif augmentation_type == 'back_translation':
16
- return back_translate_en_to_hi.augment(text)
17
- else:
18
- return text
19
-
20
- # Test augmentation functions
21
- print(augment_text("What is your address?", augmentation_type='synonym')) # Synonym
22
- print(augment_text("What is your address?", augmentation_type='back_translation')) # Back translation
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import nlpaug.augmenter.word as naw
2
+ import nlpaug.augmenter.sentence as nas
3
+ from transformers import MarianMTModel, MarianTokenizer
4
+ import random
5
 
 
 
 
6
 
7
# 1. Back Translation using Hugging Face MarianMT models (English -> Hindi -> English)
def back_translate(text, from_model, to_model, from_tokenizer, to_tokenizer):
    """Perform back translation to paraphrase `text`.

    1. Translate from English to Hindi with `from_model`.
    2. Translate back from Hindi to English with `to_model`.

    Args:
        text: English input sentence.
        from_model / from_tokenizer: MarianMT model + tokenizer for en->hi.
        to_model / to_tokenizer: MarianMT model + tokenizer for hi->en.

    Returns:
        The back-translated English string.
    """
    # Step 1: Translate from English to Hindi.
    encoded = from_tokenizer.encode(text, return_tensors="pt", truncation=True, padding=True)
    translated = from_model.generate(encoded, num_beams=4, max_length=50, early_stopping=True)
    # BUG FIX: the en->hi model's output must be decoded with the en->hi
    # tokenizer — each Marian model has its own vocabulary. The original code
    # decoded with the hi->en tokenizer, mapping token ids through the wrong
    # vocab and producing garbage.
    hindi_text = from_tokenizer.decode(translated[0], skip_special_tokens=True)

    # Step 2: Translate back from Hindi to English.
    encoded_back = to_tokenizer.encode(hindi_text, return_tensors="pt", truncation=True, padding=True)
    back_translated = to_model.generate(encoded_back, num_beams=4, max_length=50, early_stopping=True)
    # Likewise, decode the hi->en model's output with its own tokenizer.
    back_translated_text = to_tokenizer.decode(back_translated[0], skip_special_tokens=True)

    return back_translated_text
25
+
26
+
27
# Load pre-trained MarianMT checkpoints used by back_translate().
# NOTE(review): these four from_pretrained() calls run at import time and may
# download weights — confirm eager loading is intended for this app.
en_to_hi_model_name = 'Helsinki-NLP/opus-mt-en-hi'  # English -> Hindi
hi_to_en_model_name = 'Helsinki-NLP/opus-mt-hi-en'  # Hindi -> English

# English -> Hindi direction.
en_to_hi_model = MarianMTModel.from_pretrained(en_to_hi_model_name)
en_to_hi_tokenizer = MarianTokenizer.from_pretrained(en_to_hi_model_name)

# Hindi -> English direction.
hi_to_en_model = MarianMTModel.from_pretrained(hi_to_en_model_name)
hi_to_en_tokenizer = MarianTokenizer.from_pretrained(hi_to_en_model_name)
37
+
38
# 2. Synonym Augmentation using nlpaug
def synonym_augmentation(text):
    """Replace random words in `text` with WordNet synonyms.

    Returns the augmented sentence(s) as produced by nlpaug's `augment()`.
    """
    # BUG FIX: SynonymAug's aug_src must be 'wordnet' or 'ppdb'; the original
    # aug_src='en' is rejected by nlpaug at construction time.
    augmenter = naw.SynonymAug(aug_src='wordnet', lang='eng')
    return augmenter.augment(text)
42
+
43
# 3. Random Deletion using nlpaug
def random_deletion(text):
    """Randomly drop roughly 30% of the words in `text`."""
    word_dropper = naw.RandomWordAug(action="delete", aug_p=0.3)
    return word_dropper.augment(text)
47
+
48
# 4. Random Insertion (pure Python)
def random_insertion(text, aug_p=0.3):
    """Randomly re-insert ~aug_p of the sentence's own words at random positions.

    BUG FIX: nlpaug's RandomWordAug supports only the 'substitute', 'swap',
    'delete' and 'crop' actions, so the original action="insert" raised an
    error on every call. This re-implementation keeps the function's intent
    (random word insertion) using the stdlib `random` module, which the file
    already imports.

    Args:
        text: input sentence.
        aug_p: fraction of the word count to insert (at least one word).

    Returns:
        A one-element list, matching the return shape of the nlpaug augmenters.
    """
    words = text.split()
    if not words:
        # Nothing to insert from; return the input unchanged.
        return [text]
    n_insert = max(1, int(len(words) * aug_p))
    for _ in range(n_insert):
        # Duplicate a random existing word at a random position.
        words.insert(random.randint(0, len(words)), random.choice(words))
    return [" ".join(words)]
52
+
53
# 5. Random Swap using nlpaug (optional)
def random_swap(text):
    """Swap the positions of roughly 30% of the words in `text`."""
    swapper = naw.RandomWordAug(action="swap", aug_p=0.3)
    return swapper.augment(text)
57
+
58
# 6. Combining all augmentations
def augment_text(text):
    """Apply every augmentation technique to `text` and collect the results.

    Order of results: back-translation, synonym replacement, random deletion,
    random insertion, random swap.
    """
    transforms = [
        # 1. Back-translation augmentation (uses the module-level MarianMT models).
        lambda t: back_translate(t, en_to_hi_model, hi_to_en_model,
                                 en_to_hi_tokenizer, hi_to_en_tokenizer),
        # 2. Synonym replacement.
        synonym_augmentation,
        # 3. Random deletion.
        random_deletion,
        # 4. Random insertion.
        random_insertion,
        # 5. Random swap (optional).
        random_swap,
    ]
    return [transform(text) for transform in transforms]
78
+
79
# Example usage
original_text = "What is your address?"

# Generate one variant per augmentation technique.
augmented_texts = augment_text(original_text)

# Show the original next to each augmented variant.
print("Original Text:", original_text)
for i, augmented in enumerate(augmented_texts, 1):
    print(f"Augmented {i}: {augmented}")