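"""Text augmentation examples for NLP data.

Combines MarianMT back-translation (English -> Hindi -> English) with
nlpaug word-level augmenters: synonym replacement, random deletion,
random insertion, and random swap, then demonstrates them on a sample
sentence.
"""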
import nlpaug.augmenter.word as naw
from transformers import MarianMTModel, MarianTokenizer


# 1. Back Translation using Hugging Face Models (English to Hindi and back to English)
def back_translate(text, from_model, to_model, from_tokenizer, to_tokenizer):
    """
    Perform back translation:
    1. Translate from English to Hindi
    2. Translate back from Hindi to English
    """
    # Step 1: Translate from English to Hindi. Decode with the en->hi
    # tokenizer, which owns the vocabulary the generated tokens come from.
    encoded = from_tokenizer.encode(text, return_tensors="pt", truncation=True)
    translated = from_model.generate(encoded, num_beams=4, max_length=50, early_stopping=True)
    hindi_text = from_tokenizer.decode(translated[0], skip_special_tokens=True)

    # Step 2: Translate back from Hindi to English; likewise decode with
    # the hi->en tokenizer.
    encoded_back = to_tokenizer.encode(hindi_text, return_tensors="pt", truncation=True)
    back_translated = to_model.generate(encoded_back, num_beams=4, max_length=50, early_stopping=True)
    back_translated_text = to_tokenizer.decode(back_translated[0], skip_special_tokens=True)

    return back_translated_text


# Load pre-trained MarianMT models for back-translation
en_to_hi_model_name = 'Helsinki-NLP/opus-mt-en-hi'  # English to Hindi
hi_to_en_model_name = 'Helsinki-NLP/opus-mt-hi-en'  # Hindi to English

# Load the models and tokenizers
en_to_hi_model = MarianMTModel.from_pretrained(en_to_hi_model_name)
en_to_hi_tokenizer = MarianTokenizer.from_pretrained(en_to_hi_model_name)

hi_to_en_model = MarianMTModel.from_pretrained(hi_to_en_model_name)
hi_to_en_tokenizer = MarianTokenizer.from_pretrained(hi_to_en_model_name)
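
# Recent nlpaug releases return a list of strings from augment()
# (older ones returned a plain string); this helper normalizes either
# form to a single string for the augmenters below.
def _as_text(result):
    return result[0] if isinstance(result, list) else result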

# 2. Synonym Augmentation using nlpaug
def synonym_augmentation(text):
    # aug_src must be 'wordnet' or 'ppdb' ('en' is not a valid source);
    # WordNet requires NLTK's wordnet corpus (nltk.download('wordnet'))
    augmenter = naw.SynonymAug(aug_src='wordnet', lang='eng')
    return _as_text(augmenter.augment(text))

# 3. Random Deletion using nlpaug
def random_deletion(text):
    augmenter = naw.RandomWordAug(action="delete", aug_p=0.3)  # delete ~30% of words
    return _as_text(augmenter.augment(text))

# 4. Random Insertion using nlpaug
def random_insertion(text):
    # RandomWordAug has no "insert" action (it supports substitute, swap,
    # delete, crop), so use the contextual word-embedding augmenter to
    # insert words; this downloads 'bert-base-uncased' on first use
    augmenter = naw.ContextualWordEmbsAug(model_path='bert-base-uncased', action='insert', aug_p=0.3)
    return _as_text(augmenter.augment(text))

# 5. Random Swap using nlpaug (optional)
def random_swap(text):
    augmenter = naw.RandomWordAug(action="swap", aug_p=0.3)  # swap adjacent words at ~30% of positions
    return _as_text(augmenter.augment(text))
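
# Note: nlpaug's augmenters sample randomly; if repeatable augmentations
# are needed, seeding Python's and NumPy's global RNGs up front may help
# (an assumption about nlpaug's internals, not a documented guarantee):
#   import random, numpy as np
#   random.seed(42); np.random.seed(42)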

# 6. Combining all augmentations
def augment_text(text):
    augmented_data = []

    # 1. Back-translation augmentation
    augmented_data.append(back_translate(text, en_to_hi_model, hi_to_en_model, en_to_hi_tokenizer, hi_to_en_tokenizer))
    
    # 2. Synonym replacement
    augmented_data.append(synonym_augmentation(text))
    
    # 3. Random deletion
    augmented_data.append(random_deletion(text))
    
    # 4. Random insertion
    augmented_data.append(random_insertion(text))
    
    # 5. Random swap (optional)
    augmented_data.append(random_swap(text))
    
    return augmented_data

# Example usage
original_text = "What is your address?"

# Get augmented text using different techniques
augmented_texts = augment_text(original_text)

# Print original and augmented texts
print("Original Text:", original_text)
for i, augmented in enumerate(augmented_texts, 1):
    print(f"Augmented {i}: {augmented}")