# File size: 4,150 Bytes | 5eea398
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
class SpaceGen_preprocessing:
    '''
    Helpers for turning text into per-character space-decision data.

    Text is handled as lists of UTF-8 byte values. A "decision vector" holds
    one label per element of the wrong text:
        'K' - keep the byte as it is,
        'D' - drop the byte (a surplus space),
        'I' - insert a space (byte 32) in front of the byte.
    '''

    def __init__(self, content="helloworld", size=10, past_capacity=5, future_capacity=5):
        '''
        Store the configuration.

        content is truncated to its first `size` characters. num_features is
        the past window plus the future window plus one for the letter itself.
        '''
        self.size = size
        self.content = content[:self.size]
        self.past_capacity = past_capacity
        self.future_capacity = future_capacity
        self.num_features = self.past_capacity + self.future_capacity + 1  # 1 for letter
        self.vocabulary = []

    def create_vocabulary(self, correct_txt):
        '''
        Store in self.vocabulary the sorted unique UTF-8 byte values of the
        given text plus -1 (the padding value used by the sliding windows).

        Returns None; the result is only available through self.vocabulary.
        '''
        unique_bytes = set(bytes(correct_txt, 'utf-8'))
        unique_bytes.add(-1)
        self.vocabulary = sorted(unique_bytes)
        return None

    @staticmethod
    def create_decision_vector(W: list, C: list):
        '''
        Returns the Decision Vector(D),
        given Wrong Vector(W) and Correct Vector(C)

        W and C are lists of byte values. D has exactly one label per element
        of W ('K', 'D' or 'I', see the class docstring).

        Raises IndexError when C is exhausted while W still holds a non-space
        byte, i.e. the two texts differ in more than their spaces.
        '''
        space = 32
        D = []
        w_i = 0
        c_i = 0
        while w_i < len(W):
            if c_i >= len(C):
                # C is used up: a leftover space in W is surplus, anything
                # else means the texts do not describe the same letters.
                if W[w_i] == space:
                    D.append('D')
                    w_i += 1
                    continue
                raise IndexError(
                    'correct vector exhausted before wrong vector at index %d' % w_i
                )
            if W[w_i] == C[c_i]:
                D.append('K')
                w_i += 1
                c_i += 1
            elif W[w_i] == space:
                # Space present in W but not at this point of C.
                D.append('D')
                w_i += 1
            elif C[c_i] == space:
                # Space missing in W in front of the current byte.
                D.append('I')
                c_i += 1
                # Also consume the byte of C that the current byte of W maps
                # to; otherwise the C cursor lags by one and a following,
                # legitimate space in W gets labelled 'D'.
                if c_i < len(C) and C[c_i] == W[w_i]:
                    c_i += 1
                w_i += 1
            else:
                # Bytes differ and neither is a space: skip ahead in C.
                c_i += 1
        return D

    @staticmethod
    def to_correct(W, D):
        '''
        Returns the correct text,
        given Wrong Vector(W) and Decision Vector(D)

        Walks D and reads W at the same index: 'K' copies the byte, 'I' emits
        a space followed by the byte, 'D' emits nothing. Labels other than
        these three are ignored. The collected bytes are decoded to str.
        '''
        output_vec = []
        for i, decision in enumerate(D):
            if decision == 'K':
                output_vec.append(W[i])
            elif decision == 'I':
                output_vec.append(32)
                output_vec.append(W[i])
        return bytes(output_vec).decode()

    @staticmethod
    def to_bytes_list(text: str, encoding='UTF-8'):
        '''
        Returns the bytes list of a given text
        '''
        return list(bytes(text, encoding))

    @staticmethod
    def to_one_hot_df(wrong_txt, D):
        '''
        Returns the one hot encoded dataframe,
        given the wrong text (or vector) and Decision Vector(D)

        The frame has a 'letter' column with the elements of wrong_txt and
        one integer 0/1 column per distinct label that occurs in D, named by
        the encoder from the 'decision' feature. Labels absent from D get no
        column.
        '''
        df = pd.DataFrame({'letter': list(wrong_txt), 'decision': D})
        encoder = OneHotEncoder()
        y_matrix = encoder.fit_transform(df[['decision']])
        onehot_df = pd.DataFrame(
            y_matrix.toarray(),
            columns=encoder.get_feature_names_out(['decision']),
        ).astype('int')
        example_df = pd.concat([df, onehot_df], axis=1)
        return example_df.drop(['decision'], axis=1)

    @staticmethod
    def decode_vec(arr):
        '''
        Returns the decoded text,
        given the bytes list
        '''
        return bytes(arr).decode()

    @staticmethod
    def sliding_window_past(arr, window_size=5):
        '''
        Returns the past sliding window of the given array and window size

        For every position i the window holds the up to window_size elements
        directly before i, left-padded with -1 to the full window size.
        '''
        arr = list(arr)
        windows = []
        for i in range(len(arr)):
            window = arr[max(0, i - window_size):i]
            # A non-positive pad count yields an empty pad list.
            windows.append([-1] * (window_size - len(window)) + window)
        return windows

    @staticmethod
    def sliding_window_future(arr, window_size=5):
        '''
        Returns the future sliding window of the given array and window size

        For every position i the window holds the up to window_size elements
        directly after i, right-padded with -1 to the full window size.
        '''
        arr = list(arr)
        windows = []
        for i in range(len(arr)):
            window = arr[i + 1:i + window_size + 1]
            windows.append(window + [-1] * (window_size - len(window)))
        return windows

    @staticmethod
    def insert_random_spaces(text, percent=.25):
        '''
        Returns the text with random spaces inserted

        round(len(text) * percent) indices are drawn from numpy's global
        random state (duplicates collapse), and one space is placed after the
        character at each drawn index. The result is stripped, so leading and
        trailing whitespace never survives.
        '''
        chars = list(text)
        n_draws = int(np.round(len(chars) * percent))
        # A set gives O(1) membership tests; the upper bound len(chars) is
        # kept so the same random numbers are consumed as before, although a
        # space placed there would be stripped anyway.
        chosen = set(np.random.randint(0, len(chars) + 1, n_draws).tolist())
        pieces = []
        for i, ch in enumerate(chars):
            pieces.append(ch)
            if i in chosen:
                pieces.append(' ')
        return ''.join(pieces).strip()

    @staticmethod
    def prob_to_decision(a):
        '''
        Return I or K given probability vector

        'I' when a[0] is strictly greater than a[1], otherwise 'K'.
        '''
        return 'I' if a[0] > a[1] else 'K'