import bitarray
import math

from meteor import encode_meteor, decode_meteor
from utils import get_model, encode_context
from arithmetic import encode_arithmetic, decode_arithmetic
from block_baseline import get_bins, encode_block, decode_block
from huffman_baseline import encode_huffman, decode_huffman
from sample import sample

def encode_message(mode, message_str, context):
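    """Embed message_str into GPT-2-generated cover text.

    mode selects the encoder ('meteor', 'arithmetic', 'huffman', 'bins', or
    'sample'; 'sample' only generates text and embeds nothing). context is the
    prompt the cover text is conditioned on and must be reused verbatim when
    decoding. Returns (cover_text, perplexity, KL divergence, words per bit).
    """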
    enc, model = get_model(model_name='gpt2')
    ## PARAMETERS
    unicode_enc = False  # if True, embed the raw UTF-8 bits of the message;
                         # otherwise arithmetic-code the message first (more compact)
    block_size = 3       # for huffman and bins
    temp = 0.9           # sampling temperature, for arithmetic and meteor
    precision = 26       # coder precision in bits, for arithmetic and meteor
    sample_tokens = 100  # for sample
    topk = 300
    finish_sent = True   # force the cover text to finish its final sentence;
                         # if so, the printed stats refer to the unfinished text
    meteor_sort = False
    meteor_random = False
    
    # Shared secrets for meteor. Note b'0x01' is the four-character literal "0x01",
    # so key is 256 bytes; encoder and decoder must use identical key and nonce.
    key = b'0x01' * 64
    nonce = b'\x00' * 16

    ## VALIDATE PARAMETERS
    if mode not in ['meteor', 'arithmetic', 'huffman', 'bins', 'sample']:
        raise NotImplementedError

    if mode == 'bins':
        bin2words, words2bin = get_bins(len(enc.encoder), block_size)
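        # bin2words maps each of the 2**block_size bins to the vocabulary words
        # assigned to it; words2bin is the inverse lookup used when decoding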

    context_tokens = encode_context(context, enc)
    # ------------------------------------------------------------------------------------
    # ------------------------------------------------------------------------------------
    # First map the message to a (near-)uniform bit string, without any context.
    # (Using arithmetic coding rather than raw ASCII bits is not essential, but
    # it is more efficient when the message is natural language.)
    if unicode_enc:
        ba = bitarray.bitarray()
        ba.frombytes(message_str.encode('utf-8'))
        message = ba.tolist()
    else:
        message_ctx = [enc.encoder['<|endoftext|>']]
        message_str += '<eos>'
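        # Running the arithmetic *decoder* on the plaintext compresses it under
        # the language model, producing the near-uniform bit string to embed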
        message = decode_arithmetic(model, enc, message_str, message_ctx, precision=40, topk=60000)
    # Next encode bits into cover text, using arbitrary context
    Hq = 0
    if mode == 'arithmetic':
        out, nll, kl, words_per_bit = encode_arithmetic(model, enc, message, context_tokens, temp=temp, finish_sent=finish_sent, precision=precision, topk=topk)
    elif mode == 'huffman':
        out, nll, kl, words_per_bit = encode_huffman(model, enc, message, context_tokens, block_size, finish_sent=finish_sent)
    elif mode == 'bins':
        out, nll, kl, words_per_bit = encode_block(model, enc, message, context_tokens, block_size, bin2words, words2bin, finish_sent=finish_sent)
    elif mode == 'meteor':
        out, nll, kl, words_per_bit = encode_meteor(model, enc, message, context_tokens, temp=temp, finish_sent=finish_sent,
                                                    precision=precision, topk=topk, is_sort=meteor_sort, randomize_key=meteor_random, input_key=key, input_nonce=nonce)
    elif mode == 'sample':
        out, nll, kl, Hq = sample(model, enc, sample_tokens, context_tokens, temperature=temp, topk=topk)
        words_per_bit = 1  # sample embeds no message; placeholder so the stats print
    text = enc.decode(out)
    
    print("=" * 40 + " Encoding " + "=" * 40)
    print(text)
    # Hq is in nats; dividing by ln 2 ~= 0.69315 converts entropy to bits
    print('ppl: %0.2f, kl: %0.3f, words/bit: %0.2f, bits/word: %0.2f, entropy: %.2f' %
          (math.exp(nll), kl, words_per_bit, 1 / words_per_bit, Hq / 0.69315))
    
    stats = {
        "ppl": math.exp(nll),
        "kl": kl,
        "wordsbit": words_per_bit,
        "entropy": Hq / 0.69315,  # nats -> bits
    }
    return text, stats["ppl"], stats["kl"], stats["wordsbit"]

def decode_message(mode, text, context):
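    """Recover the hidden message from cover text produced by encode_message.

    mode, context, and the parameters below must match the values used at
    encoding time (including key and nonce for meteor). Returns the recovered
    message string, or None for 'sample' mode, which embeds nothing.
    """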
    enc, model = get_model(model_name='gpt2')
    ## PARAMETERS (must match the values used in encode_message)
    unicode_enc = False
    block_size = 3   # for huffman and bins
    temp = 0.9       # sampling temperature, for arithmetic and meteor
    precision = 26   # coder precision in bits, for arithmetic and meteor
    topk = 300
    meteor_sort = False

    # Shared secrets for meteor; must be identical to those used when encoding
    key = b'0x01' * 64
    nonce = b'\x00' * 16

    ## VALIDATE PARAMETERS
    if mode not in ['meteor', 'arithmetic', 'huffman', 'bins', 'sample']:
        raise NotImplementedError
    if mode == 'bins':
        bin2words, words2bin = get_bins(len(enc.encoder), block_size)

    context_tokens = encode_context(context, enc)

    if mode != 'sample':
        if mode == 'arithmetic':
            message_rec = decode_arithmetic(model, enc, text, context_tokens, temp=temp, precision=precision, topk=topk)
        elif mode == 'huffman':
            message_rec = decode_huffman(model, enc, text, context_tokens, block_size)
        elif mode == 'bins':
            message_rec = decode_block(model, enc, text, context_tokens, block_size, bin2words, words2bin)
        elif mode == 'meteor':
            message_rec = decode_meteor(model, enc, text, context_tokens, temp=temp,
                                precision=precision, topk=topk, is_sort=meteor_sort, input_key=key, input_nonce=nonce)

        print("="*35 + " Recovered Message " + "="*35)
        # print(message_rec)
        # print("=" * 80)
        # Finally map message bits back to original text
        if unicode_enc:
            message_rec = [bool(item) for item in message_rec]
            ba = bitarray.bitarray(message_rec)
            reconst = ba.tobytes().decode('utf-8', 'ignore')
        else:
            message_ctx = [enc.encoder['<|endoftext|>']]
            reconst = encode_arithmetic(model, enc, message_rec, message_ctx, precision=40, topk=60000)
            reconst = enc.decode(reconst[0])
        print(reconst[:-5])
        print("=" * 80)
    return reconst[:-5]

def main():
    chosen_context = ("Despite a long history of research and wide-spread applications to "
                      "censorship resistant systems, practical steganographic systems capable "
                      "of embedding messages into realistic communication distributions, like "
                      "text, do not exist.")
    message_text = "generate text!"
    mode = input("Please enter mode (meteor, arithmetic, huffman, or bins): ")
    cover_text, ppl, kl, words_per_bit = encode_message(mode, message_text, chosen_context)
    decode_message(mode, cover_text, chosen_context)

if __name__ == '__main__':
    main()
