arise-sustech
committed on
Commit
•
1ef1c3c
1
Parent(s):
383a088
Update README.md
Browse files
README.md
CHANGED
@@ -12,6 +12,7 @@ LLM4Decompile aims to decompile x86 assembly instructions into C. It is finetune
|
|
12 |
- **GitHub Repository:** [LLM4Decompile](https://github.com/albertan017/LLM4Decompile)
|
13 |
- **Paper link:** For more details check out the [paper](https://arxiv.org/abs/2403.05286).
|
14 |
|
|
|
15 |
|
16 |
### 2. Evaluation Results
|
17 |
| Model | Re-compilability | | | | | Re-executability | | | | |
|
@@ -26,6 +27,9 @@ LLM4Decompile aims to decompile x86 assembly instructions into C. It is finetune
|
|
26 |
|
27 |
|
28 |
### 3. How to Use
|
|
|
|
|
|
|
29 |
Here is an example of how to use our model.
|
30 |
First, compile the C code into a binary, then disassemble the binary into assembly instructions:
|
31 |
```python
|
@@ -33,11 +37,9 @@ import subprocess
|
|
33 |
import os
|
34 |
import re
|
35 |
|
36 |
-
digit_pattern = r'\b0x[a-fA-F0-9]+\b'#
|
37 |
zeros_pattern = r'^0+\s'#0s
|
38 |
OPT = ["O0", "O1", "O2", "O3"]
|
39 |
-
before = f"# This is the assembly code with {opt_state} optimization:\n"
|
40 |
-
after = "\n# What is the source code?\n"
|
41 |
fileName = 'path/to/file'
|
42 |
with open(fileName+'.c','r') as f:#original file
|
43 |
c_func = f.read()
|
@@ -57,7 +59,8 @@ for opt_state in OPT:
|
|
57 |
tmp_asm = tmp_asm.split('#')[0].strip()#remove the comments
|
58 |
input_asm+=tmp_asm+'\n'
|
59 |
input_asm = re.sub(zeros_pattern, '', input_asm)
|
60 |
-
|
|
|
61 |
input_asm_prompt = before+input_asm.strip()+after
|
62 |
with open(fileName +'_' + opt_state +'.asm','w',encoding='utf-8') as f:
|
63 |
f.write(input_asm_prompt)
|
@@ -76,7 +79,7 @@ with open(fileName +'_' + opt_state +'.asm','r') as f:#original file
|
|
76 |
asm_func = f.read()
|
77 |
inputs = tokenizer(asm_func, return_tensors="pt").to(model.device)
|
78 |
with torch.no_grad():
|
79 |
-
outputs = model.generate(**inputs, max_new_tokens=
|
80 |
c_func_decompile = tokenizer.decode(outputs[0][len(inputs[0]):-1])
|
81 |
```
|
82 |
|
|
|
12 |
- **GitHub Repository:** [LLM4Decompile](https://github.com/albertan017/LLM4Decompile)
|
13 |
- **Paper link:** For more details check out the [paper](https://arxiv.org/abs/2403.05286).
|
14 |
|
15 |
+
Note: The unified optimization (UO) model is trained without prior knowledge of the optimization levels (O0~O3); its average re-executability is around 0.21.
|
16 |
|
17 |
### 2. Evaluation Results
|
18 |
| Model | Re-compilability | | | | | Re-executability | | | | |
|
|
|
27 |
|
28 |
|
29 |
### 3. How to Use
|
30 |
+
|
31 |
+
Note: The UO model is trained without prior knowledge of the optimization levels (O0~O3); therefore, its prompt is slightly different.
|
32 |
+
|
33 |
Here is an example of how to use our model.
|
34 |
First, compile the C code into a binary, then disassemble the binary into assembly instructions:
|
35 |
```python
|
|
|
37 |
import os
|
38 |
import re
|
39 |
|
40 |
+
digit_pattern = r'\b0x[a-fA-F0-9]+\b'# binary codes in Hexadecimal
|
41 |
zeros_pattern = r'^0+\s'#0s
|
42 |
OPT = ["O0", "O1", "O2", "O3"]
|
|
|
|
|
43 |
fileName = 'path/to/file'
|
44 |
with open(fileName+'.c','r') as f:#original file
|
45 |
c_func = f.read()
|
|
|
59 |
tmp_asm = tmp_asm.split('#')[0].strip()#remove the comments
|
60 |
input_asm+=tmp_asm+'\n'
|
61 |
input_asm = re.sub(zeros_pattern, '', input_asm)
|
62 |
+
before = f"# This is the assembly code:\n"#prompt different for the UO model
|
63 |
+
after = "\n# What is the source code?\n"#prompt
|
64 |
input_asm_prompt = before+input_asm.strip()+after
|
65 |
with open(fileName +'_' + opt_state +'.asm','w',encoding='utf-8') as f:
|
66 |
f.write(input_asm_prompt)
|
|
|
79 |
asm_func = f.read()
|
80 |
inputs = tokenizer(asm_func, return_tensors="pt").to(model.device)
|
81 |
with torch.no_grad():
|
82 |
+
outputs = model.generate(**inputs, max_new_tokens=200)
|
83 |
c_func_decompile = tokenizer.decode(outputs[0][len(inputs[0]):-1])
|
84 |
```
|
85 |
|