import gradio as gr
import os
import spaces
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import huggingface_hub
import prep_decompiled
description = """# ReSym Test Space
This is a test space for the models from the [ReSym
artifacts](https://github.com/lt-asset/resym). Sadly, at the time of writing,
not all of ReSym is publicly available; specifically, the Prolog component
is [not available](https://github.com/lt-asset/resym/issues/2).
This space simply performs inference on the two pretrained models released as
part of the ReSym artifacts. It takes Hex-Rays decompiled code as input,
extracts the variable names from it, and outputs each variable's predicted
original name, type, and other information.
## Disclaimer
I'm not a ReSym developer, so I may have messed something up. In particular,
the variable names in the decompiled code must be included as part of the
prompt, and I reused some of ReSym's own code to extract them.
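The prompt format, taken from ReSym's inference scripts, looks like the
following for the example function below (the trailing variable name primes
the model to begin its prediction with that variable):

````
What are the original name and data types of variables `a1`, `a2`, `a3`, `v4`, `v5`?
```
<decompiled code>
```a1
````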
## Todo
* Add field decoding (probably needs Docker)
"""
hf_key = os.environ["HF_TOKEN"]
huggingface_hub.login(token=hf_key)
tokenizer = AutoTokenizer.from_pretrained("bigcode/starcoderbase-3b")
vardecoder_model = AutoModelForCausalLM.from_pretrained(
    "ejschwartz/resym-vardecoder", torch_dtype=torch.bfloat16, device_map="auto"
)
fielddecoder_model = AutoModelForCausalLM.from_pretrained(
    "ejschwartz/resym-fielddecoder", torch_dtype=torch.bfloat16, device_map="auto"
)

example = r"""__int64 __fastcall sub_410D81(__int64 a1, __int64 a2, __int64 a3)
{
  int v4; // [rsp+20h] [rbp-20h] BYREF
  __int64 v5; // [rsp+28h] [rbp-18h]

  if ( !a1 || !a2 || !a3 )
    return 0LL;
  v4 = 5;
  v5 = a3;
  return sub_411142(a1, a2, &v4);
}"""

# Each line of examples.txt holds one escaped example; unescape it back into
# multi-line decompiled code.
with open("examples.txt", "r") as f:
    examples = [ex.encode().decode("unicode_escape") for ex in f.readlines()]
@spaces.GPU
def infer(code):
    splitcode = [s.strip() for s in code.splitlines()]
    code = "\n".join(splitcode)

    # Extract variable names from the decompilation: local variables come from
    # the Hex-Rays stack-offset comments, arguments from the function signature.
    bodyvars = [
        v["name"] for v in prep_decompiled.extract_comments(splitcode) if "name" in v
    ]
    argvars = [
        v["name"] for v in prep_decompiled.parse_signature(splitcode) if "name" in v
    ]
    vars = argvars + bodyvars
    varstring = ", ".join([f"`{v}`" for v in vars])

    # The first variable's name is appended to the prompt to prime generation.
    # ejs: Yeah, this var_name thing is really bizarre. But look at
    # https://github.com/lt-asset/resym/blob/main/training_src/fielddecoder_inf.py
    var_name = vars[0]

    var_prompt = f"What are the original name and data types of variables {varstring}?\n```\n{code}\n```{var_name}"
    print(f"Prompt:\n{var_prompt}")

    # Truncate the prompt so that it plus up to 1024 generated tokens fits in
    # the model's 8192-token context window.
    input_ids = tokenizer.encode(var_prompt, return_tensors="pt").cuda()[
        :, : 8192 - 1024
    ]

    var_output = vardecoder_model.generate(
        input_ids=input_ids,
        max_new_tokens=1024,
        num_beams=4,
        num_return_sequences=1,
        do_sample=False,
        early_stopping=False,
        pad_token_id=0,
        eos_token_id=0,
    )[0]
    var_output = tokenizer.decode(
        var_output[input_ids.size(1) :],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )

    # The field decoder output is computed but not yet shown in the interface;
    # see the Todo in the description.
    field_output = fielddecoder_model.generate(
        input_ids=input_ids,
        max_new_tokens=1024,
        num_beams=4,
        num_return_sequences=1,
        do_sample=False,
        early_stopping=False,
        pad_token_id=0,
        eos_token_id=0,
    )[0]
    field_output = tokenizer.decode(
        field_output[input_ids.size(1) :],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )

    var_output = var_name + ":" + var_output
    field_output = var_name + ":" + field_output
    return var_output, varstring
demo = gr.Interface(
    fn=infer,
    inputs=[
        gr.Textbox(lines=10, value=example, label="Hex-Rays Decompilation"),
    ],
    outputs=[
        gr.Text(label="Var Decoder Output"),
        # gr.Text(label="Field Decoder Output"),  # Todo: field decoding
        gr.Text(label="Generated Variable List"),
    ],
    description=description,
    examples=examples,
)
demo.launch()
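
# A minimal sketch (untested; the Space id is hypothetical) of querying this
# demo programmatically with gradio_client. Left commented out so that it does
# not run on Space startup:
#
#     from gradio_client import Client
#     client = Client("ejschwartz/resym-test")  # hypothetical Space id
#     var_output, varstring = client.predict(decompiled_code, api_name="/predict")
#
# where decompiled_code is a string of Hex-Rays decompilation like `example`
# above.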