File size: 4,119 Bytes
01c3073
bbc1fe3
 
01c3073
 
d3eb07d
 
b1dd808
 
3f47af7
 
edab27e
 
 
 
 
 
 
 
 
 
 
fb5ce4e
 
 
 
 
 
edab27e
 
8ec4446
edab27e
 
8ea9eda
b1dd808
 
d3eb07d
1311a82
09910fb
b1dd808
d3eb07d
073c7e0
057ba08
073c7e0
d3eb07d
e2f9f6a
 
 
 
 
 
 
 
 
 
374968b
b1dd808
0d90edb
 
 
 
01c3073
 
6ab20b3
3f47af7
8ec4446
 
1311a82
 
 
 
 
 
fb5ce4e
1311a82
 
 
6ab20b3
 
 
 
3f47af7
0adad70
f183304
0adad70
f183304
bbc1fe3
0d90edb
 
 
0b155f0
4954b56
 
 
 
 
 
 
 
bbc1fe3
0b155f0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4954b56
 
 
bbc1fe3
0b155f0
 
f183304
bbc1fe3
4954b56
 
 
 
1311a82
 
 
 
0d90edb
1311a82
4954b56
1311a82
0d90edb
4954b56
01c3073
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import gradio as gr
import json
import os
import spaces
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

import huggingface_hub

import prep_decompiled

description = """# ReSym Test Space

This is a test space of the models from the [ReSym
artifacts](https://github.com/lt-asset/resym).  Sadly, at the time I am writing
this, not all of ReSym is publicly available; specifically, the Prolog component
is [not available](https://github.com/lt-asset/resym/issues/2).

This space simply performs inference on the two pretrained models available as
part of the ReSym artifacts. It takes a variable name and some decompiled code
as input, and outputs the variable type and other information.

## Disclaimer

I'm not a ReSym developer and I may have messed something up.  In particular,
you must prompt the variable names in the decompiled code as part of the prompt,
and I reused some of their own code to do this.

## Todo

* Add field decoding (probably needs Docker)

"""

hf_key = os.environ["HF_TOKEN"]
huggingface_hub.login(token=hf_key)

tokenizer = AutoTokenizer.from_pretrained("bigcode/starcoderbase-3b")
vardecoder_model = AutoModelForCausalLM.from_pretrained(
    "ejschwartz/resym-vardecoder", torch_dtype=torch.bfloat16, device_map="auto"
)
fielddecoder_model = AutoModelForCausalLM.from_pretrained(
    "ejschwartz/resym-fielddecoder", torch_dtype=torch.bfloat16, device_map="auto"
)

example = r"""__int64 __fastcall sub_410D81(__int64 a1, __int64 a2, __int64 a3)
{
int v4; // [rsp+20h] [rbp-20h] BYREF
__int64 v5; // [rsp+28h] [rbp-18h]

if ( !a1 || !a2 || !a3 )
return 0LL;
v4 = 5;
v5 = a3;
return sub_411142(a1, a2, &v4);
}"""

examples = [
    ex.encode().decode("unicode_escape") for ex in open("examples.txt", "r").readlines()
]


@spaces.GPU
def infer(code):

    splitcode = [s.strip() for s in code.splitlines()]
    code = "\n".join(splitcode)
    bodyvars = [
        v["name"] for v in prep_decompiled.extract_comments(splitcode) if "name" in v
    ]
    argvars = [
        v["name"] for v in prep_decompiled.parse_signature(splitcode) if "name" in v
    ]
    vars = argvars + bodyvars
    # comments = prep_decompiled.extract_comments(splitcode)
    # sig = prep_decompiled.parse_signature(splitcode)
    # print(f"vars {vars}")

    varstring = ", ".join([f"`{v}`" for v in vars])

    var_name = vars[0]

    # ejs: Yeah, this var_name thing is really bizarre. But look at https://github.com/lt-asset/resym/blob/main/training_src/fielddecoder_inf.py
    var_prompt = f"What are the original name and data types of variables {varstring}?\n```\n{code}\n```{var_name}"

    print(f"Prompt:\n{var_prompt}")

    input_ids = tokenizer.encode(var_prompt, return_tensors="pt").cuda()[
        :, : 8192 - 1024
    ]
    var_output = vardecoder_model.generate(
        input_ids=input_ids,
        max_new_tokens=1024,
        num_beams=4,
        num_return_sequences=1,
        do_sample=False,
        early_stopping=False,
        pad_token_id=0,
        eos_token_id=0,
    )[0]
    var_output = tokenizer.decode(
        var_output[input_ids.size(1) :],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )
    field_output = fielddecoder_model.generate(
        input_ids=input_ids,
        max_new_tokens=1024,
        num_beams=4,
        num_return_sequences=1,
        do_sample=False,
        early_stopping=False,
        pad_token_id=0,
        eos_token_id=0,
    )[0]
    field_output = tokenizer.decode(
        field_output[input_ids.size(1) :],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )

    var_output = var_name + ":" + var_output
    field_output = var_name + ":" + field_output
    return var_output, varstring


demo = gr.Interface(
    fn=infer,
    inputs=[
        gr.Textbox(lines=10, value=example, label="Hex-Rays Decompilation"),
    ],
    outputs=[
        gr.Text(label="Var Decoder Output"),
        # gr.Text(label="Field Decoder Output"),
        gr.Text(label="Generated Variable List"),
    ],
    description=description,
    examples=examples,
)
demo.launch()