Spaces:

asafd60
/

SpaceGen

Sleeping

File size: 4,150 Bytes

5eea398

import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder

class SpaceGen_preprocessing:
  '''
  Helpers for preparing space-correction data.

  Texts are handled as lists of UTF-8 byte values (a space is byte 32).
  A "wrong" text W (spaces missing or misplaced) is aligned against the
  "correct" text C to produce one decision per byte of W:
    'K' - keep the byte,
    'D' - delete the byte (a surplus space),
    'I' - insert a space in front of the byte.
  '''

  def __init__(self, content="helloworld", size=10, past_capacity=5, future_capacity=5):
    '''
    Stores the configuration.

    content:         text to work on; truncated to its first `size` characters
    size:            maximum number of characters kept from `content`
    past_capacity:   length of the past (left) context window
    future_capacity: length of the future (right) context window
    '''
    self.size = size
    self.content = content[:self.size]
    self.past_capacity = past_capacity
    self.future_capacity = future_capacity
    self.num_features = self.past_capacity + self.future_capacity + 1 # 1 for letter
    self.vocabulary = []

  def create_vocabulary(self, correct_txt):
    '''
    Stores in self.vocabulary the sorted unique UTF-8 byte values of the
    given text plus -1 (the value the sliding windows use as padding).
    Returns None.
    '''
    self.vocabulary = sorted(set(bytes(correct_txt, 'utf-8')) | {-1})
    return None

  @staticmethod
  def create_decision_vector(W: list, C: list):
    '''
    Returns the Decision Vector(D),
    given Wrong Vector(W) and Correct Vector(C).

    D holds exactly one of 'K' / 'D' / 'I' per element of W, so
    len(D) == len(W). Once C is exhausted, remaining spaces in W are
    marked 'D' and every other remaining byte is marked 'K'.
    '''
    space = 32
    D = []
    w_i = 0
    c_i = 0
    w_len = len(W)
    c_len = len(C)
    while w_i < w_len:
      w = W[w_i]
      if c_i >= c_len:
        # Nothing left in C to align with: drop surplus spaces, keep the rest.
        D.append('D' if w == space else 'K')
        w_i += 1
      elif w == C[c_i]:
        D.append('K')
        w_i += 1
        c_i += 1
      elif w == space:
        # W has a space that C does not have at this position.
        D.append('D')
        w_i += 1
      elif C[c_i] == space:
        # C has a space (or a run of spaces) that W lacks before this byte.
        while c_i < c_len and C[c_i] == space:
          c_i += 1
        D.append('I')
        w_i += 1
        # Also consume the byte of C that pairs with w. Without this, C
        # lags one position behind W and a correct space right after the
        # insertion would wrongly be marked 'D'.
        if c_i < c_len and C[c_i] == w:
          c_i += 1
      else:
        # Two different non-space bytes: move forward in C to resynchronise.
        c_i += 1
    return D

  @staticmethod
  def to_correct(W, D):
    '''
    Returns the correct text,
    given Wrong Vector(W) and Decision Vector(D).

    W and D are read pairwise, up to the shorter of the two. The resulting
    bytes are decoded with the default codec of bytes.decode() (UTF-8).
    '''
    output_vec = []
    for byte, decision in zip(W, D):
      if decision == 'K':
        output_vec.append(byte)
      elif decision == 'I':
        output_vec.append(32)
        output_vec.append(byte)
      # 'D' (and any other label) contributes nothing to the output.
    return bytes(output_vec).decode()

  @staticmethod
  def to_bytes_list(text: str, encoding = 'UTF-8'):
    '''
    Returns the list of byte values (ints) of the given text
    in the given encoding.
    '''
    return list(bytes(text, encoding))

  @staticmethod
  def to_one_hot_df(wrong_txt, D):
    '''
    Returns a dataframe with a 'letter' column (the elements of wrong_txt)
    followed by the one hot encoded columns of the Decision Vector(D).

    The one hot columns are named by the encoder from the prefix 'decision',
    and only decisions that actually occur in D get a column.
    wrong_txt and D must have the same length.
    '''
    df = pd.DataFrame({'letter': list(wrong_txt), 'decision': D})
    encoder = OneHotEncoder()
    y_matrix = encoder.fit_transform(df[['decision']])
    onehot_df = pd.DataFrame(
      y_matrix.toarray(),
      columns=encoder.get_feature_names_out(['decision']),
    ).astype('int')
    example_df = pd.concat([df, onehot_df], axis=1)
    return example_df.drop(['decision'], axis=1)

  @staticmethod
  def decode_vec(arr):
    '''
    Returns the decoded text,
    given the bytes list
    '''
    return bytes(arr).decode()

  @staticmethod
  def sliding_window_past(arr, window_size = 5):
    '''
    Returns, for every position i of the given array, the window_size
    elements that precede position i (position i itself excluded).
    Windows that would start before the array are left-padded with -1.
    '''
    arr = list(arr)
    windows = []
    for i in range(len(arr)):
      seen = arr[max(0, i - window_size):i]
      # [-1] * 0 is an empty list, so full windows need no special case.
      windows.append([-1] * (window_size - len(seen)) + seen)
    return windows

  @staticmethod
  def sliding_window_future(arr, window_size = 5):
    '''
    Returns, for every position i of the given array, the window_size
    elements that follow position i (position i itself excluded).
    Windows that would run past the array are right-padded with -1.
    '''
    arr = list(arr)
    windows = []
    for i in range(len(arr)):
      ahead = arr[i + 1:i + 1 + window_size]
      windows.append(ahead + [-1] * (window_size - len(ahead)))
    return windows

  @staticmethod
  def insert_random_spaces(text, percent = .25):
    '''
    Returns the text with random spaces inserted.

    round(len(text) * percent) indices are drawn with np.random.randint
    from [0, len(text)]; a space is placed after the character at each
    drawn index. Duplicate draws collapse into a single space, and the
    result is stripped, so a space placed after the last character (and
    any leading/trailing whitespace already in text) is removed.
    '''
    chars = list(text)
    n_draws = int(np.round(len(chars) * percent))
    # A set gives O(1) membership tests and merges duplicate draws.
    positions = set(np.random.randint(0, len(chars) + 1, n_draws).tolist())
    pieces = []
    for i, ch in enumerate(chars):
      pieces.append(ch)
      if i in positions:
        pieces.append(' ')
    return ''.join(pieces).strip()

  @staticmethod
  def prob_to_decision(a):
    '''
    Returns 'I' when a[0] > a[1], otherwise 'K'.

    NOTE(review): presumably a[0] is the probability of 'I' and a[1] the
    probability of 'K' - confirm against the order of the model outputs.
    '''
    return 'I' if a[0] > a[1] else 'K'