File size: 5,202 Bytes
01c3073
9377434
28b893e
7e17e4e
bbc1fe3
01c3073
 
d3eb07d
 
b1dd808
 
3f47af7
 
b1dd808
 
d3eb07d
1311a82
09910fb
2ab342b
762a224
4ee21c2
2ab342b
762a224
d3eb07d
9377434
b1dd808
0d90edb
 
 
 
01c3073
2d64e29
 
 
 
28b893e
 
 
 
 
 
 
 
 
2d64e29
 
03ad973
2d64e29
 
 
347742a
 
2d64e29
2456748
11f2ade
2ab342b
2d64e29
01c3073
6ab20b3
3f47af7
5f12c87
 
 
 
1311a82
 
 
 
 
 
fb5ce4e
1311a82
 
 
6ab20b3
 
 
2ab342b
3f47af7
0adad70
7a59fcd
0adad70
2456748
bbc1fe3
2ab342b
0d90edb
 
0b155f0
2ab342b
4954b56
 
 
 
 
 
 
bbc1fe3
0b155f0
2ab342b
0b155f0
 
 
9377434
2ab342b
2b63412
0a4c2da
2b63412
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9a4b368
2b63412
 
 
2ab342b
347742a
2b63412
91e7dfd
 
bbc1fe3
4954b56
 
 
 
d868a6b
1311a82
 
 
2ab342b
1311a82
91e7dfd
4954b56
9bd433d
 
 
555a8e5
0d90edb
4954b56
01c3073
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
import gradio as gr
from gradio_client import Client
from gradio_client.exceptions import AppError
import frontmatter
import os
import spaces
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

import huggingface_hub

import prep_decompiled

# Hugging Face access token. Indexing os.environ raises KeyError at import
# time when HF_TOKEN is not set, so a missing token fails fast.
hf_key = os.environ["HF_TOKEN"]
# Log in to the Hugging Face Hub with that token before any download below.
huggingface_hub.login(token=hf_key)

# A single tokenizer is used to encode the prompts and decode the outputs of
# both decoder models.
tokenizer = AutoTokenizer.from_pretrained("bigcode/starcoderbase-3b")
# Model used by infer() for the variable name/type prompt; loaded in bfloat16
# and moved to the CUDA device.
vardecoder_model = AutoModelForCausalLM.from_pretrained(
    "ejschwartz/resym-vardecoder", torch_dtype=torch.bfloat16
).to("cuda")
# Model used by infer() for the field (memory access) prompt; loaded the same
# way as the variable decoder.
fielddecoder_model = AutoModelForCausalLM.from_pretrained(
    "ejschwartz/resym-fielddecoder", torch_dtype=torch.bfloat16
).to("cuda")

# Client for the remote Space that field_prompt() calls through its
# "/predict" endpoint.
gradio_client = Client("https://ejschwartz-resym-field-helper.hf.space/")

# Each line of examples.txt is one example input. Escape sequences written
# literally in the file (such as a backslash followed by "n") are expanded
# into the characters they denote by the "unicode_escape" round trip; the
# line's own trailing newline is kept, exactly as readlines() would keep it.
# The context manager closes the file as soon as the list is built instead of
# leaving the handle open for the lifetime of the process.
with open("examples.txt", "r") as examples_file:
    examples = [
        line.encode().decode("unicode_escape") for line in examples_file
    ]


# Example prompt
#   "input": "```\n_BOOL8 __fastcall sub_409B9A(_QWORD *a1, _QWORD *a2)\n{\nreturn *a1 < *a2 || *a1 == *a2 && a1[1] < a2[1];\n}\n```\nWhat are the variable name and type for the following memory accesses:a1, a1[1], a2, a2[1]?\n",
#  "output": "a1: a, os_reltime* -> sec, os_time_t\na1[1]: a, os_reltime* -> usec, os_time_t\na2: b, os_reltime* -> sec, os_time_t\na2[1]: b, os_reltime* -> usec, os_time_t",
def field_prompt(code):
    """Build the field-decoder prompt for a piece of decompiled code.

    The remote helper behind ``gradio_client`` is called on ``code``; the
    ``'expr'`` values of the entries in the first element of its result are
    de-duplicated, sorted, and listed in the question appended to the code.
    When at least one expression exists, the prompt ends with the first
    expression followed by a colon.

    Returns:
        A ``(prompt, fields, field_helper_result)`` tuple. If the helper call
        raises ``AppError`` the error is printed and ``(None, [], None)`` is
        returned instead.
    """
    try:
        helper_reply = gradio_client.predict(
            decompiled_code=code,
            api_name="/predict",
        )
    except AppError as err:
        print(f"AppError: {err}")
        return None, [], None

    print(f"field helper result: {helper_reply}")

    # Gather the distinct, non-empty access expressions.
    unique_exprs = set()
    for entry in helper_reply[0]:
        if entry['expr'] != '':
            unique_exprs.add(entry['expr'])
    fields = sorted(unique_exprs)
    print(f"fields: {fields}")

    listed = ', '.join(fields)
    prompt = (
        f"```\n{code}\n```\n"
        f"What are the variable name and type for the following memory accesses:{listed}?\n"
    )
    if fields:
        # Finish the prompt with the first expression and a colon.
        prompt = prompt + f"{fields[0]}:"

    print(f"field prompt: {repr(prompt)}")

    return prompt, fields, helper_reply

# Token accounting shared by both decoders: the prompt is clipped to
# _TOTAL_TOKEN_LIMIT - _MAX_NEW_TOKENS tokens, so prompt plus generated tokens
# never exceed _TOTAL_TOKEN_LIMIT.
_TOTAL_TOKEN_LIMIT = 8192
_MAX_NEW_TOKENS = 1024


def _complete_prompt(model, prompt):
    """Generate a continuation of ``prompt`` with ``model`` and return its text.

    The prompt is tokenized with the module-level ``tokenizer``, moved to the
    GPU, clipped to the prompt token budget, and completed with 4-beam,
    non-sampling generation. Only the newly generated tokens (everything
    after the prompt) are decoded and returned.
    """
    input_ids = tokenizer.encode(prompt, return_tensors="pt").cuda()[
        :, : _TOTAL_TOKEN_LIMIT - _MAX_NEW_TOKENS
    ]
    generated = model.generate(
        input_ids=input_ids,
        max_new_tokens=_MAX_NEW_TOKENS,
        num_beams=4,
        num_return_sequences=1,
        do_sample=False,
        early_stopping=False,
        pad_token_id=0,
        eos_token_id=0,
    )[0]
    # Drop the echoed prompt tokens so only the continuation is decoded.
    return tokenizer.decode(
        generated[input_ids.size(1):],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )


@spaces.GPU
def infer(code):
    """Run the variable decoder and the field decoder on decompiled code.

    Args:
        code: Decompiled source text, as entered in the input textbox.

    Returns:
        A 4-tuple of strings, in the order of the interface's output boxes:
        the variable decoder output, the field decoder output, the
        backtick-quoted comma-separated variable list, and the
        comma-separated field access list. When no variables are found the
        first element is ``"No variables"``; when no fields are found the
        second is ``"No fields"``, or ``"Failed to parse fields"`` if the
        field helper call failed.
    """
    splitcode = code.splitlines()

    # Argument names come from the signature, local names from the
    # declaration comments; entries without a "name" key are skipped.
    bodyvars = [
        v["name"] for v in prep_decompiled.extract_comments(splitcode) if "name" in v
    ]
    argvars = [
        v["name"] for v in prep_decompiled.parse_signature(splitcode) if "name" in v
    ]
    var_names = argvars + bodyvars

    varstring = ", ".join([f"`{v}`" for v in var_names])

    if var_names:
        first_var = var_names[0]

        # The prompt deliberately ends with "<first variable>:" and the same
        # prefix is glued back onto the model output below. Odd as it looks,
        # this mirrors the upstream inference script; see
        # https://github.com/lt-asset/resym/blob/main/training_src/fielddecoder_inf.py
        var_prompt = f"What are the original name and data types of variables {varstring}?\n```\n{code}\n```{first_var}:"

        print(f"Prompt:\n{repr(var_prompt)}")

        var_output = first_var + ":" + _complete_prompt(vardecoder_model, var_prompt)
    else:
        # Nothing to ask the variable decoder about (e.g. empty input);
        # report it instead of failing on the first-variable lookup.
        var_output = "No variables"

    field_prompt_result, fields, field_helper_result = field_prompt(code)
    if not fields:
        # A None prompt means the helper call itself failed, as opposed to
        # succeeding and finding no memory accesses.
        field_output = "Failed to parse fields" if field_prompt_result is None else "No fields"
    else:
        # field_prompt() ended the prompt with "<first field>:", so restore
        # that prefix in front of the generated continuation.
        field_output = fields[0] + ":" + _complete_prompt(
            fielddecoder_model, field_prompt_result
        )

    fieldstring = ", ".join(fields)
    return var_output, field_output, varstring, fieldstring


# Web front end: a single code textbox feeds infer(), whose four returned
# strings fill the four text outputs in the order listed here.
_decompilation_box = gr.Textbox(
    label="Hex-Rays Decompilation",
    value=examples[0],
    lines=10,
)
_output_labels = (
    "Var Decoder Output",
    "Field Decoder Output",
    "Generated Variable List",
    "Generated Field Access List",
)

demo = gr.Interface(
    fn=infer,
    inputs=[_decompilation_box],
    outputs=[gr.Text(label=caption) for caption in _output_labels],
    # description=frontmatter.load("README.md").content,
    description=(
        "This is a test space of the models from the [ReSym\n"
        "artifacts](https://github.com/lt-asset/resym).  For more information, please see\n"
        "[the README](https://huggingface.co/spaces/ejschwartz/resym/blob/main/README.md)."
    ),
    examples=examples,
)
# Start serving the interface.
demo.launch()