suayptalha committed
Commit 9a23d03 · verified · 1 Parent(s): 34768ec

Update README.md

Files changed (1)
  1. README.md +177 -1
README.md CHANGED
@@ -32,11 +32,187 @@ Alphabet (output):
 
  Here 'r' is number 1 in the alphabet and that is why we use 'a' instead of 'r' in encoding.
 
- Suggested Usage:
+ Single Model Usage:
  ```py
  #Load the model and tokenizer
  cipher_text = "" #Encoded text here!
  inputs = tokenizer(cipher_text, return_tensors="pt", padding=True, truncation=True, max_length=256).to(device)
  outputs = model.generate(inputs["input_ids"], max_length=256)
  decoded_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
+ ```
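The snippet above assumes that `model`, `tokenizer`, and `device` have already been created. A minimal setup, mirroring the loading code in the full pipeline below (same checkpoint names), might look like this:

```py
# Assumed setup for the snippet above; the checkpoint name is the one used in the full pipeline below.
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

tokenizer = AutoTokenizer.from_pretrained("Cipher-AI/Substitution-Cipher-Alphabet-Eng")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForSeq2SeqLM.from_pretrained("Cipher-AI/Substitution-Cipher-Alphabet-Eng").to(device)
```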
+
+ Full Pipeline Usage:
+ ```py
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+ import torch
+ from string import ascii_lowercase
+ import Levenshtein
+ import random
+
+ tokenizer = AutoTokenizer.from_pretrained("Cipher-AI/Substitution-Cipher-Alphabet-Eng")
+ model = AutoModelForSeq2SeqLM.from_pretrained("Cipher-AI/Substitution-Cipher-Alphabet-Eng")
+
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+ alphabet_model = model.to(device)
+ correction_model = AutoModelForSeq2SeqLM.from_pretrained("Cipher-AI/AutoCorrect-EN-v2").to(device)
+
+ def similarity_percentage(s1, s2):
+     distance = Levenshtein.distance(s1, s2)
+
+     max_len = max(len(s1), len(s2))
+
+     similarity = (1 - distance / max_len) * 100
+
+     return similarity
+
+ def decode(cipher_text, key):
+     decipher_map = {ascii_lowercase[i]: j for i, j in enumerate(key[:26])}
+     decipher_map.update({ascii_lowercase[i].upper(): j.upper() for i, j in enumerate(key[:26])})
+     ans = ''.join(map(lambda x: decipher_map[x] if x in decipher_map else x, cipher_text))
+     return ans
+
+ def model_pass(model, input, max_length=256):
+     inputs = tokenizer(input, return_tensors="pt", padding=True, truncation=True, max_length=256).to(device)
+     outputs = model.generate(inputs["input_ids"], max_length=max_length)
+     result = tokenizer.decode(outputs[0], skip_special_tokens=True)
+     return result
+
+ def decipher(cipher_text, key) -> str:
+     decipher_map = {ascii_lowercase[i]: j for i, j in enumerate(key[0])}
+     decipher_map.update({ascii_lowercase[i].upper(): j.upper() for i, j in enumerate(key[0])})
+
+     result = ''.join(map(lambda x: decipher_map[x] if x in decipher_map else x, cipher_text[0]))
+
+     return result
+
+ def cipher(plain_text) -> tuple[str, list]:
+     alphabet_map = list(ascii_lowercase)
+     random.shuffle(alphabet_map)
+     alphabet_map = {i : j for i, j in zip(ascii_lowercase, alphabet_map)}
+
+     alphabet_map.update({i.upper() : j.upper() for i, j in alphabet_map.items()})
+
+     cipher_text = ''.join(map(lambda x: alphabet_map[x] if x in alphabet_map else x, plain_text))
+     return cipher_text, alphabet_map
+
+ def correct_text(cipher_text, model_output):
+     cipher_text = cipher_text.split(' ')
+     model_output = model_output.split(' ')
+
+     letter_map = {i: {j: 0 for j in ascii_lowercase} for i in ascii_lowercase}
+
+     # Levenshtein distance over the lengths of words
+     n = len(cipher_text)
+     m = len(model_output)
+
+     i = 0
+     j = 0
+     dp = [[0 for _ in range(m + 1)] for _ in range(n + 1)]
+
+     for i in range(n + 1):
+         dp[i][0] = i
+
+     for j in range(m + 1):
+         dp[0][j] = j
+
+     for i in range(1, n + 1):
+         for j in range(1, m + 1):
+             if len(cipher_text[i - 1]) == len(model_output[j - 1]):
+                 dp[i][j] = dp[i - 1][j - 1]
+
+             else:
+                 dp[i][j] = min(dp[i - 1][j], dp[i][j - 1], dp[i - 1][j - 1]) + 1
+
+     i = n
+     j = m
+     while i > 0 and j > 0:
+
+         before = min([(0, dp[i - 1][j - 1]), (1, dp[i - 1][j]), (2, dp[i][j - 1])], key=lambda x: x[1])
+         match before[0]:
+             case 0:
+                 if dp[i - 1][j - 1] == dp[i][j]:
+                     # If the same, we add them to the letter map
+                     cipher = cipher_text[i-1]
+                     model_o = model_output[j-1]
+
+                     for c_letter, m_letter in zip(cipher.lower(), model_o.lower()):
+                         if c_letter in letter_map and m_letter in letter_map[c_letter]:
+                             letter_map[c_letter][m_letter] += 1
+
+                 i = i - 1
+                 j = j - 1
+             case 1:
+                 i = i - 1
+             case 2:
+                 j = j - 1
+
+     for letter in ascii_lowercase:
+         letter_sum = sum(letter_map[letter].values())
+         if letter_sum == 0:
+             # That letter wasn't in the text
+             letter_map[letter] = None
+             continue
+
+         # Sorted from most occurring to least
+         letter_map[letter] = [(k, v / letter_sum) for k, v in sorted(letter_map[letter].items(), key=lambda item: item[1], reverse=True)]
+
+     change_map = {
+         i : None for i in ascii_lowercase
+     }
+
+     for i in range(len(ascii_lowercase)):
+         for letter in ascii_lowercase:
+             if letter_map[letter] is None:
+                 continue # That letter wasn't in the text
+
+             # If None then it didn't get substituted earlier
+             map_letter = letter_map[letter][i][0]
+             if (letter_map[letter][i][1] > 0 and (change_map[map_letter] is None
+                 or (change_map[map_letter][2] < letter_map[letter][i][1] and change_map[map_letter][1] >= i))):
+                 change_map[map_letter] = (letter, i, letter_map[letter][i][1])
+                 # Letter, iteration, percentage
+
+     change_map = {i[1][0]: i[0] for i in change_map.items() if i[1] is not None}
+
+     for letter in ascii_lowercase:
+         if letter not in change_map:
+             change_map[letter] = '.'
+
+     # Add uppercases
+     change_map.update(
+         {
+             i[0].upper() : i[1].upper() for i in change_map.items()
+         }
+     )
+
+     new_text = []
+     for cipher in cipher_text:
+         new_word = ""
+         for c_letter in cipher:
+             if c_letter in change_map:
+                 new_word += change_map[c_letter]
+
+             else:
+                 new_word += c_letter
+
+         new_text.append(new_word)
+
+     return ' '.join(new_text)
+
+ def crack_sub(cipher_text):
+     output = model_pass(alphabet_model, cipher_text, 26)
+     decoded = decode(cipher_text, output)
+     second_pass = model_pass(correction_model, decoded, len(decoded))
+     second_text = correct_text(cipher_text, second_pass)
+
+     return second_text
+
+ """
+ Use the crack_sub() function to solve monoalphabetic substitution ciphers!
+ """
  ```
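One way to try the pipeline end to end is to encipher a known sentence with the `cipher()` helper above, crack it with `crack_sub()`, and score the result with `similarity_percentage()`. A rough sketch (the sample sentence is arbitrary, and the recovered text will vary with the random key and the model outputs):

```py
# Assumes the functions and models from the "Full Pipeline Usage" block are already defined.
plain_text = "the quick brown fox jumps over the lazy dog"  # arbitrary sample sentence
cipher_text, key_map = cipher(plain_text)   # random monoalphabetic substitution
recovered = crack_sub(cipher_text)          # model-based crack

print("cipher text:", cipher_text)
print("recovered  :", recovered)
print("similarity : %.1f%%" % similarity_percentage(recovered, plain_text))
```

Because the key is drawn at random on every call to `cipher()`, repeated runs produce different cipher texts and typically somewhat different similarity scores.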