Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -1,22 +1,88 @@
|
|
1 |
import nlpaug.augmenter.word as naw
|
2 |
-
import nlpaug.augmenter.
|
|
|
|
|
3 |
|
4 |
-
# Initialize augmenters
|
5 |
-
synonym_aug = naw.SynonymAug(aug_src='en', lang='eng')
|
6 |
-
back_translate_en_to_hi = nat.BackTranslationAug(from_model_name='Helsinki-NLP/opus-mt-en-hi', to_model_name='Helsinki-NLP/opus-mt-hi-en')
|
7 |
|
8 |
-
|
|
|
9 |
"""
|
10 |
-
|
11 |
-
|
|
|
12 |
"""
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import nlpaug.augmenter.word as naw
|
2 |
+
import nlpaug.augmenter.sentence as nas
|
3 |
+
from transformers import MarianMTModel, MarianTokenizer
|
4 |
+
import random
|
5 |
|
|
|
|
|
|
|
6 |
|
7 |
+
# 1. Back Translation using Hugging Face Models (English to Hindi and back to English)
def back_translate(text, from_model, to_model, from_tokenizer, to_tokenizer):
    """
    Perform back translation:
    1. Translate from English to Hindi
    2. Translate back from Hindi to English

    :param text: English source sentence.
    :param from_model: en->hi MarianMT model.
    :param to_model: hi->en MarianMT model.
    :param from_tokenizer: tokenizer belonging to *from_model*.
    :param to_tokenizer: tokenizer belonging to *to_model*.
    :return: the round-tripped English text.

    Fix: each model's OWN tokenizer must both encode its input and decode
    its output. The previous version decoded the en->hi output with the
    hi->en tokenizer (and vice versa); the two checkpoints have different
    vocabularies, so cross-decoding produces garbage.
    """
    # Step 1: Translate from English to Hindi (encode AND decode with the en->hi tokenizer)
    encoded = from_tokenizer.encode(text, return_tensors="pt", truncation=True, padding=True)
    translated = from_model.generate(encoded, num_beams=4, max_length=50, early_stopping=True)
    hindi_text = from_tokenizer.decode(translated[0], skip_special_tokens=True)

    # Step 2: Translate back from Hindi to English (encode AND decode with the hi->en tokenizer)
    encoded_back = to_tokenizer.encode(hindi_text, return_tensors="pt", truncation=True, padding=True)
    back_translated = to_model.generate(encoded_back, num_beams=4, max_length=50, early_stopping=True)
    back_translated_text = to_tokenizer.decode(back_translated[0], skip_special_tokens=True)

    return back_translated_text
25 |
+
|
26 |
+
|
27 |
+
# Load pre-trained MarianMT models for back-translation
en_to_hi_model_name = 'Helsinki-NLP/opus-mt-en-hi'  # English to Hindi
hi_to_en_model_name = 'Helsinki-NLP/opus-mt-hi-en'  # Hindi to English


def _load_marian(checkpoint):
    """Return the (model, tokenizer) pair for one MarianMT checkpoint."""
    return (MarianMTModel.from_pretrained(checkpoint),
            MarianTokenizer.from_pretrained(checkpoint))


# Load the models and tokenizers for both translation directions.
en_to_hi_model, en_to_hi_tokenizer = _load_marian(en_to_hi_model_name)
hi_to_en_model, hi_to_en_tokenizer = _load_marian(hi_to_en_model_name)
37 |
+
|
38 |
+
# 2. Synonym Augmentation using nlpaug
def synonym_augmentation(text):
    """Replace random words in *text* with synonyms via nlpaug.

    Fix: ``SynonymAug``'s ``aug_src`` names the synonym source and must be
    ``'wordnet'`` or ``'ppdb'``; the previous value ``'en'`` is not a valid
    source and makes nlpaug raise ValueError at construction time.
    """
    augmenter = naw.SynonymAug(aug_src='wordnet', lang='eng')
    return augmenter.augment(text)
42 |
+
|
43 |
+
# 3. Random Deletion using nlpaug
def random_deletion(text):
    """Drop roughly 30% of the words in *text* at random."""
    deleter = naw.RandomWordAug(action="delete", aug_p=0.3)  # Deleting 30% of words
    return deleter.augment(text)
47 |
+
|
48 |
+
# 4. Random Insertion using nlpaug
def random_insertion(text):
    """Insert random words into *text* (~30% augmentation rate).

    NOTE(review): confirm that the installed nlpaug release's
    ``RandomWordAug`` supports ``action="insert"``; some versions only
    accept substitute/swap/delete/crop.
    """
    inserter = naw.RandomWordAug(action="insert", aug_p=0.3)  # Inserting 30% of random words
    return inserter.augment(text)
52 |
+
|
53 |
+
# 5. Random Swap using nlpaug (optional)
def random_swap(text):
    """Swap the positions of roughly 30% of the words in *text*."""
    swapper = naw.RandomWordAug(action="swap", aug_p=0.3)  # Swapping 30% of words
    return swapper.augment(text)
57 |
+
|
58 |
+
# 6. Combining all augmentations
def augment_text(text):
    """Return one augmented variant of *text* per technique, in order:
    back-translation, synonym replacement, random deletion,
    random insertion, random swap (optional).
    """
    return [
        # 1. Back-translation augmentation
        back_translate(text, en_to_hi_model, hi_to_en_model,
                       en_to_hi_tokenizer, hi_to_en_tokenizer),
        # 2. Synonym replacement
        synonym_augmentation(text),
        # 3. Random deletion
        random_deletion(text),
        # 4. Random insertion
        random_insertion(text),
        # 5. Random swap (optional)
        random_swap(text),
    ]
78 |
+
|
79 |
+
# Example usage
original_text = "What is your address?"

# Get augmented text using different techniques
augmented_texts = augment_text(original_text)

# Print original and augmented texts
print("Original Text:", original_text)
for idx, variant in enumerate(augmented_texts, start=1):
    print(f"Augmented {idx}: {variant}")
|