import os
import re
import shlex
import subprocess
import sys
import tempfile

import gradio as gr
import shap
import transformers
model = gr.load("ejschwartz/oo-method-test-model-bylibrary", src="models")
model_interp = transformers.pipeline("text-classification", "ejschwartz/oo-method-test-model-bylibrary")
def get_all_dis(bname, addrs=None):
anafile = tempfile.NamedTemporaryFile(prefix=os.path.basename(bname) + "_", suffix=".bat_ana")
ananame = anafile.name
addrstr = ""
if addrs is not None:
addrstr = " ".join([f"--function-at {x}" for x in addrs])
subprocess.check_output(f"bat-ana {addrstr} --no-post-analysis -o {ananame} {bname} 2>/dev/null", shell=True)
output = subprocess.check_output(f"bat-dis --no-insn-address --no-bb-cfg-arrows --color=off {ananame} 2>/dev/null", shell=True)
output = re.sub(b' +', b' ', output)
func_dis = {}
last_func = None
current_output = []
for l in output.splitlines():
if l.startswith(b";;; function 0x"):
if last_func is not None:
func_dis[last_func] = b"\n".join(current_output)
last_func = int(l.split()[2], 16)
current_output.clear()
if not b";;" in l:
current_output.append(l)
if last_func is not None:
if last_func in func_dis:
print("Warning: Ignoring multiple functions at the same address")
else:
func_dis[last_func] = b"\n".join(current_output)
return func_dis
def get_funs(f):
funs = get_all_dis(f.name)
return "\n".join(("%#x" % addr) for addr in funs.keys())
with gr.Blocks() as demo:
all_dis_state = gr.State()
gr.Markdown(
"""
# Function/Method Detector
First, upload a binary.
This model was only trained on 32-bit MSVC++ binaries. You can provide
other types of binaries, but the result will probably be gibberish.
"""
)
file_widget = gr.File(label="Binary file")
with gr.Column(visible=False) as col:
#output = gr.Textbox("Output")
gr.Markdown("""
Great, you selected an executable! Now pick the function you would like to analyze.
""")
fun_dropdown = gr.Dropdown(label="Select a function", choices=["Woohoo!"], interactive=True)
gr.Markdown("""
Below you can find the selected function's disassembly, and the model's
prediction of whether the function is an object-oriented method or a
regular function.
""")
with gr.Row(visible=True) as result:
disassembly = gr.Textbox(label="Disassembly", lines=20)
with gr.Column():
clazz = gr.Label()
interpret_button = gr.Button("Interpret (very slow)")
interpretation = gr.components.Interpretation(disassembly)
example_widget = gr.Examples(
examples=[f.path for f in os.scandir(os.path.join(os.path.dirname(__file__), "examples"))],
inputs=file_widget,
outputs=[all_dis_state, disassembly, clazz]
)
def file_change_fn(file, progress=gr.Progress()):
if file is None:
return {col: gr.update(visible=False),
all_dis_state: None}
else:
#fun_data = {42: 2, 43: 3}
progress(0, desc="Disassembling executable")
fun_data = get_all_dis(file.name)
addrs = ["%#x" % addr for addr in fun_data.keys()]
return {col: gr.update(visible=True),
fun_dropdown: gr.Dropdown.update(choices=addrs, value=addrs[0]),
all_dis_state: fun_data
}
def function_change_fn(selected_fun, fun_data):
disassembly_str = fun_data[int(selected_fun, 16)].decode("utf-8")
load_results = model.fn(disassembly_str)
top_k = {e['label']: e['confidence'] for e in load_results['confidences']}
return {disassembly: gr.Textbox.update(value=disassembly_str),
clazz: gr.Label.update(top_k),
# I can't figure out how to hide this
#interpretation: {}
}
# XXX: Ideally we'd use the gr.load model, which uses the huggingface
# inference API. But shap library appears to use information in the
# transformers pipeline, and I don't feel like figuring out how to
# reimplement that, so we'll just use a regular transformers pipeline here
# for interpretation.
def interpretation_function(text, progress=gr.Progress(track_tqdm=True)):
progress(0, desc="Interpreting function")
explainer = shap.Explainer(model_interp)
shap_values = explainer([text])
# Dimensions are (batch size, text size, number of classes)
# Since we care about positive sentiment, use index 1
scores = list(zip(shap_values.data[0], shap_values.values[0, :, 1]))
# Scores contains (word, score) pairs
# Format expected by gr.components.Interpretation
return {"original": text, "interpretation": scores}
file_widget.change(file_change_fn, file_widget, [col, fun_dropdown, all_dis_state])
fun_dropdown.change(function_change_fn, [fun_dropdown, all_dis_state], [disassembly, clazz, interpretation])
interpret_button.click(interpretation_function, disassembly, interpretation)
demo.queue()
demo.launch(server_name="0.0.0.0", server_port=7860, share=True)