LLM4Binary committed on
Commit
9aaf80a
·
verified ·
1 Parent(s): c53a3c7

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +127 -49
README.md CHANGED
@@ -7,6 +7,7 @@ widget:
7
  - text: "# This is the assembly code:\n<func0>:\nendbr64\nlea (%rdi,%rsi,1),%eax\nretq\n# What is the source code?\n"
8
  ---
9
 
 
10
  ### 1. Introduction of LLM4Decompile
11
 
12
  LLM4Decompile aims to decompile x86 assembly instructions into C. The newly released V1.5 series are trained with a larger dataset (15B tokens) and a maximum token length of 4,096, with remarkable performance (up to 100% improvement) compared to the previous model.
@@ -15,81 +16,158 @@ LLM4Decompile aims to decompile x86 assembly instructions into C. The newly rele
15
 
16
 
17
  ### 2. Evaluation Results
18
- | Model | HumanEval-Decompile | | | | | ExeBench | | | | |
19
- |:-----------------------:|:-------------------:|:------:|:------:|:------:|:------:|:--------:|:------:|:------:|:------:|:------:|
20
- | opt-level | O0 | O1 | O2 | O3 | Avg. | O0 | O1 | O2 | O3 | Avg. |
21
- | GPT4 | 0.1341 | 0.1890 | 0.1524 | 0.0854 | 0.1402 | TBD | TBD | TBD | TBD | TBD |
22
- | Deepseek-Coder-33B | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
23
- | LLM4Decompile-6.7B-UO | 0.3720 | 0.1585 | 0.2134 | 0.2134 | 0.2393 | 0.0904 | 0.0988 | 0.0988 | 0.0950 | 0.0957 |
24
- | LLM4Decompile-1.3B-V1.5 | 0.4817 | 0.2463 | 0.2329 | 0.2280 | 0.2972 | 0.2076 | 0.1774 | 0.1721 | 0.1728 | 0.1824 |
25
- | LLM4Decompile-6.7B-V1.5 | 0.6927 | 0.4280 | 0.4134 | 0.3732 | 0.4768 | 0.2453 | 0.1999 | 0.1927 | 0.1938 | 0.2079 |
26
 
 
 
 
 
 
 
 
 
 
27
 
28
  ### 3. How to Use
29
- Here is an example of how to use our model (Revised for V1.5).
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  Note: **Replace** func0 with the function name you want to decompile.
31
 
32
  **Preprocessing:** Compile the C code into binary, and disassemble the binary into assembly instructions.
33
  ```python
34
- import subprocess
35
  import os
 
 
36
 
37
  OPT = ["O0", "O1", "O2", "O3"]
38
- fileName = 'samples/sample' #'path/to/file'
39
- for opt_state in OPT:
40
- output_file = fileName +'_' + opt_state
41
- input_file = fileName+'.c'
42
- compile_command = f'gcc -o {output_file}.o {input_file} -{opt_state} -lm'#compile the code with GCC on Linux
43
- subprocess.run(compile_command, shell=True, check=True)
44
- compile_command = f'objdump -d {output_file}.o > {output_file}.s'#disassemble the binary file into assembly instructions
45
- subprocess.run(compile_command, shell=True, check=True)
46
-
47
- input_asm = ''
48
- with open(output_file+'.s') as f:#asm file
49
- asm= f.read()
50
- if '<'+'func0'+'>:' not in asm: #IMPORTANT replace func0 with the function name
51
- raise ValueError("compile fails")
52
- asm = '<'+'func0'+'>:' + asm.split('<'+'func0'+'>:')[-1].split('\n\n')[0] #IMPORTANT replace func0 with the function name
53
- asm_clean = ""
54
- asm_sp = asm.split("\n")
55
- for tmp in asm_sp:
56
- if len(tmp.split("\t"))<3 and '00' in tmp:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  continue
58
- idx = min(
59
- len(tmp.split("\t")) - 1, 2
60
- )
61
- tmp_asm = "\t".join(tmp.split("\t")[idx:]) # remove the binary code
62
- tmp_asm = tmp_asm.split("#")[0].strip() # remove the comments
63
- asm_clean += tmp_asm + "\n"
64
- input_asm = asm_clean.strip()
65
- before = f"# This is the assembly code:\n"#prompt
66
- after = "\n# What is the source code?\n"#prompt
67
- input_asm_prompt = before+input_asm.strip()+after
68
- with open(fileName +'_' + opt_state +'.asm','w',encoding='utf-8') as f:
69
- f.write(input_asm_prompt)
 
 
 
 
 
 
70
  ```
71
 
72
- **Decompilation:** Use LLM4Decompile to translate the assembly instructions into C:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
  ```python
74
  from transformers import AutoTokenizer, AutoModelForCausalLM
75
  import torch
76
 
77
- model_path = 'LLM4Binary/llm4decompile-1.3b-v1.5' # V1.5 Model
78
  tokenizer = AutoTokenizer.from_pretrained(model_path)
79
- model = AutoModelForCausalLM.from_pretrained(model_path,torch_dtype=torch.bfloat16).cuda()
80
 
81
- with open(fileName +'_' + OPT[0] +'.asm','r') as f:#optimization level O0
82
  asm_func = f.read()
83
  inputs = tokenizer(asm_func, return_tensors="pt").to(model.device)
84
  with torch.no_grad():
85
- outputs = model.generate(**inputs, max_new_tokens=4000)
86
  c_func_decompile = tokenizer.decode(outputs[0][len(inputs[0]):-1])
87
 
88
- with open(fileName +'.c','r') as f:#original file
89
  func = f.read()
90
 
91
- print(f'original function:\n{func}')# Note we only decompile one function, where the original file may contain multiple functions
92
- print(f'decompiled function:\n{c_func_decompile}')
 
93
  ```
94
 
95
  ### 4. License
 
7
  - text: "# This is the assembly code:\n<func0>:\nendbr64\nlea (%rdi,%rsi,1),%eax\nretq\n# What is the source code?\n"
8
  ---
9
 
10
+
11
  ### 1. Introduction of LLM4Decompile
12
 
13
  LLM4Decompile aims to decompile x86 assembly instructions into C. The newly released V1.5 series is trained on a larger dataset (15B tokens) with a maximum token length of 4,096, and achieves remarkable performance (up to 100% improvement) compared to the previous model.
 
16
 
17
 
18
  ### 2. Evaluation Results
 
 
 
 
 
 
 
 
19
 
20
+ | Metrics | Re-executability Rate | | | | | Edit Similarity | | | | |
21
+ |:-----------------------:|:---------------------:|:-------:|:-------:|:-------:|:-------:|:---------------:|:-------:|:-------:|:-------:|:-------:|
22
+ | Optimization Level | O0 | O1 | O2 | O3 | AVG | O0 | O1 | O2 | O3 | AVG |
23
+ | LLM4Decompile-End-6.7B | 0.6805 | 0.3951 | 0.3671 | 0.3720 | 0.4537 | 0.1557 | 0.1292 | 0.1293 | 0.1269 | 0.1353 |
24
+ | Ghidra | 0.3476 | 0.1646 | 0.1524 | 0.1402 | 0.2012 | 0.0699 | 0.0613 | 0.0619 | 0.0547 | 0.0620 |
25
+ | +GPT-4o | 0.4695 | 0.3415 | 0.2866 | 0.3110 | 0.3522 | 0.0660 | 0.0563 | 0.0567 | 0.0499 | 0.0572 |
26
+ | +LLM4Decompile-Ref-1.3B | 0.6890 | 0.3720 | 0.4085 | 0.3720 | 0.4604 | 0.1517 | 0.1325 | 0.1292 | 0.1267 | 0.1350 |
27
+ | +LLM4Decompile-Ref-6.7B | 0.7439 | 0.4695 | 0.4756 | 0.4207 | 0.5274 | 0.1559 | 0.1353 | 0.1342 | 0.1273 | 0.1382 |
28
+ | +LLM4Decompile-Ref-33B | 0.7073 | 0.4756 | 0.4390 | 0.4146 | 0.5091 | 0.1540 | 0.1379 | 0.1363 | 0.1307 | 0.1397 |
29
 
30
  ### 3. How to Use
31
+ Here is an example of how to use our model (V2 only; for previous models, please check the corresponding model page on Hugging Face).
32
+
33
+ 1. Install Ghidra
34
+ Download [Ghidra](https://github.com/NationalSecurityAgency/ghidra/releases/download/Ghidra_11.0.3_build/ghidra_11.0.3_PUBLIC_20240410.zip) to the current folder. You can also check the [page](https://github.com/NationalSecurityAgency/ghidra/releases) for other versions. Unzip the package to the current folder.
35
+ In bash, you can use the following:
36
+ ```bash
37
+ cd LLM4Decompile/ghidra
38
+ wget https://github.com/NationalSecurityAgency/ghidra/releases/download/Ghidra_11.0.3_build/ghidra_11.0.3_PUBLIC_20240410.zip
39
+ unzip ghidra_11.0.3_PUBLIC_20240410.zip
40
+ ```
41
+ 2. Install Java-SDK-17
42
+ Ghidra 11 depends on Java-SDK-17. A simple way to install the SDK on Ubuntu is:
43
+ ```bash
44
+ apt-get update
45
+ apt-get upgrade
46
+ apt install openjdk-17-jdk openjdk-17-jre
47
+ ```
48
+ Please check the [Ghidra installation guide](https://htmlpreview.github.io/?https://github.com/NationalSecurityAgency/ghidra/blob/Ghidra_11.1.1_build/GhidraDocs/InstallationGuide.html) for other platforms.
49
+
50
+ 3. Use Ghidra Headless to decompile binary (demo.py)
51
+
52
  Note: **Replace** func0 with the function name you want to decompile.
53
 
54
  **Preprocessing:** Compile the C code into a binary, then decompile the binary into pseudo-code with Ghidra Headless.
55
  ```python
 
56
  import os
57
+ import subprocess
58
+ from tqdm import tqdm,trange
59
 
60
  OPT = ["O0", "O1", "O2", "O3"]
61
+ timeout_duration = 10
62
+
63
+ ghidra_path = "./ghidra_11.0.3_PUBLIC/support/analyzeHeadless"#path to the headless analyzer, change the path accordingly
64
+ postscript = "./decompile.py"#path to the decompiler helper function, change the path accordingly
65
+ project_path = "."#path to temp folder for analysis, change the path accordingly
66
+ project_name = "tmp_ghidra_proj"
67
+ func_path = "../samples/sample.c"#path to c code for compiling and decompiling, change the path accordingly
68
+ fileName = "sample"
69
+
70
+ with tempfile.TemporaryDirectory() as temp_dir:
71
+ pid = os.getpid()
72
+ asm_all = {}
73
+ for opt in [OPT[0]]:
74
+ executable_path = os.path.join(temp_dir, f"{pid}_{opt}.o")
75
+ cmd = f'gcc -{opt} -o {executable_path} {func_path} -lm'
76
+ subprocess.run(
77
+ cmd.split(' '),
78
+ check=True,
79
+ stdout=subprocess.DEVNULL, # Suppress stdout
80
+ stderr=subprocess.DEVNULL, # Suppress stderr
81
+ timeout=timeout_duration,
82
+ )
83
+
84
+ output_path = os.path.join(temp_dir, f"{pid}_{opt}.c")
85
+ command = [
86
+ ghidra_path,
87
+ temp_dir,
88
+ project_name,
89
+ "-import", executable_path,
90
+ "-postScript", postscript, output_path,
91
+ "-deleteProject", # WARNING: This will delete the project after analysis
92
+ ]
93
+ result = subprocess.run(command, text=True, capture_output=True, check=True)
94
+ with open(output_path,'r') as f:
95
+ c_decompile = f.read()
96
+ c_func = []
97
+ flag = 0
98
+ for line in c_decompile.split('\n'):
99
+ if "Function: func0" in line:#**Replace** func0 with the function name you want to decompile.
100
+ flag = 1
101
+ c_func.append(line)
102
  continue
103
+ if flag:
104
+ if '// Function:' in line:
105
+ if len(c_func) > 1:
106
+ break
107
+ c_func.append(line)
108
+ if flag == 0:
109
+ raise ValueError('bad case no function found')
110
+ for idx_tmp in range(1,len(c_func)):##########remove the comments
111
+ if 'func0' in c_func[idx_tmp]:
112
+ break
113
+ c_func = c_func[idx_tmp:]
114
+ input_asm = '\n'.join(c_func).strip()
115
+
116
+ before = f"# This is the assembly code:\n"#prompt
117
+ after = "\n# What is the source code?\n"#prompt
118
+ input_asm_prompt = before+input_asm.strip()+after
119
+ with open(fileName +'_' + opt +'.pseudo','w',encoding='utf-8') as f:
120
+ f.write(input_asm_prompt)
121
  ```
122
 
123
+ Ghidra pseudo-code may look like this:
124
+ ```c
125
+ undefined4 func0(float param_1,long param_2,int param_3)
126
+ {
127
+ int local_28;
128
+ int local_24;
129
+
130
+ local_24 = 0;
131
+ do {
132
+ local_28 = local_24;
133
+ if (param_3 <= local_24) {
134
+ return 0;
135
+ }
136
+ while (local_28 = local_28 + 1, local_28 < param_3) {
137
+ if ((double)((ulong)(double)(*(float *)(param_2 + (long)local_24 * 4) -
138
+ *(float *)(param_2 + (long)local_28 * 4)) &
139
+ SUB168(_DAT_00402010,0)) < (double)param_1) {
140
+ return 1;
141
+ }
142
+ }
143
+ local_24 = local_24 + 1;
144
+ } while( true );
145
+ }
146
+ ```
147
+ 4. Refine pseudo-code using LLM4Decompile (demo.py)
148
+
149
+ **Decompilation:** Use LLM4Decompile-Ref to refine the Ghidra pseudo-code into C:
150
  ```python
151
  from transformers import AutoTokenizer, AutoModelForCausalLM
152
  import torch
153
 
154
+ model_path = 'LLM4Binary/llm4decompile-6.7b-v2' # V2 Model
155
  tokenizer = AutoTokenizer.from_pretrained(model_path)
156
+ model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.bfloat16).cuda()
157
 
158
+ with open(fileName +'_' + OPT[0] +'.pseudo','r') as f:#optimization level O0
159
  asm_func = f.read()
160
  inputs = tokenizer(asm_func, return_tensors="pt").to(model.device)
161
  with torch.no_grad():
162
+ outputs = model.generate(**inputs, max_new_tokens=2048)### max length to 4096, max new tokens should be below the range
163
  c_func_decompile = tokenizer.decode(outputs[0][len(inputs[0]):-1])
164
 
165
+ with open(fileName +'_' + OPT[0] +'.pseudo','r') as f:#original file
166
  func = f.read()
167
 
168
+ print(f'pseudo function:\n{func}')# Note we only decompile one function, where the original file may contain multiple functions
169
+ print(f'refined function:\n{c_func_decompile}')
170
+
171
  ```
172
 
173
  ### 4. License