import nlpaug.augmenter.word as naw
from transformers import MarianMTModel, MarianTokenizer


# 1. Back-translation using Hugging Face MarianMT models
#    (English -> Hindi -> English)
def back_translate(text, from_model, to_model, from_tokenizer, to_tokenizer):
    """
    Perform back-translation:
    1. Translate from English to Hindi
    2. Translate back from Hindi to English
    """
    # Step 1: English -> Hindi. Decode with the en->hi tokenizer,
    # since it owns the vocabulary of the en->hi model's output.
    encoded = from_tokenizer(text, return_tensors="pt", truncation=True)
    translated = from_model.generate(**encoded, num_beams=4, max_length=50, early_stopping=True)
    hindi_text = from_tokenizer.decode(translated[0], skip_special_tokens=True)

    # Step 2: Hindi -> English. Encode and decode with the hi->en tokenizer.
    encoded_back = to_tokenizer(hindi_text, return_tensors="pt", truncation=True)
    back_translated = to_model.generate(**encoded_back, num_beams=4, max_length=50, early_stopping=True)
    back_translated_text = to_tokenizer.decode(back_translated[0], skip_special_tokens=True)

    return back_translated_text


# Pre-trained MarianMT checkpoints for back-translation
en_to_hi_model_name = 'Helsinki-NLP/opus-mt-en-hi'  # English to Hindi
hi_to_en_model_name = 'Helsinki-NLP/opus-mt-hi-en'  # Hindi to English

# Load the models and tokenizers
en_to_hi_model = MarianMTModel.from_pretrained(en_to_hi_model_name)
en_to_hi_tokenizer = MarianTokenizer.from_pretrained(en_to_hi_model_name)
hi_to_en_model = MarianMTModel.from_pretrained(hi_to_en_model_name)
hi_to_en_tokenizer = MarianTokenizer.from_pretrained(hi_to_en_model_name)


# 2. Synonym augmentation using nlpaug (WordNet-based; 'en' is not a
#    valid aug_src, the supported sources are 'wordnet' and 'ppdb')
def synonym_augmentation(text):
    augmenter = naw.SynonymAug(aug_src='wordnet', lang='eng')
    return augmenter.augment(text)[0]  # augment() returns a list in recent nlpaug versions


# 3. Random deletion using nlpaug
def random_deletion(text):
    augmenter = naw.RandomWordAug(action="delete", aug_p=0.3)  # delete ~30% of words
    return augmenter.augment(text)[0]


# 4. Random insertion using nlpaug. RandomWordAug does not support the
#    "insert" action; ContextualWordEmbsAug does, drawing the inserted
#    words from a masked language model such as BERT.
def random_insertion(text):
    augmenter = naw.ContextualWordEmbsAug(model_path='bert-base-uncased', action="insert", aug_p=0.3)
    return augmenter.augment(text)[0]


# 5. Random swap using nlpaug (optional)
def random_swap(text):
    augmenter = naw.RandomWordAug(action="swap", aug_p=0.3)  # swap ~30% of adjacent words
    return augmenter.augment(text)[0]


# 6. Combining all augmentations
def augment_text(text):
    augmented_data = []

    # 1. Back-translation augmentation
    augmented_data.append(back_translate(text, en_to_hi_model, hi_to_en_model,
                                         en_to_hi_tokenizer, hi_to_en_tokenizer))
    # 2. Synonym replacement
    augmented_data.append(synonym_augmentation(text))
    # 3. Random deletion
    augmented_data.append(random_deletion(text))
    # 4. Random insertion
    augmented_data.append(random_insertion(text))
    # 5. Random swap (optional)
    augmented_data.append(random_swap(text))

    return augmented_data


# Example usage
original_text = "What is your address?"

# Get augmented text using different techniques
augmented_texts = augment_text(original_text)

# Print original and augmented texts
print("Original Text:", original_text)
for i, augmented in enumerate(augmented_texts, 1):
    print(f"Augmented {i}: {augmented}")
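
# A minimal sketch of applying augment_text() over a small corpus to build an
# augmented dataset. The sample sentences, the deduplication step, and the
# output filename "augmented.txt" are illustrative assumptions, not part of
# the pipeline above.
corpus = [
    "What is your address?",
    "How do I reset my password?",
]

augmented_dataset = []
for sentence in corpus:
    augmented_dataset.append(sentence)  # keep the original sample
    for variant in augment_text(sentence):
        if variant and variant != sentence:  # drop empty or identical variants
            augmented_dataset.append(variant)

# Deduplicate while preserving order, then write one example per line.
augmented_dataset = list(dict.fromkeys(augmented_dataset))
with open("augmented.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(augmented_dataset))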