Improve model card: Add metadata, update introduction, and add citation
#1
by
nielsr
HF Staff
- opened
README.md
CHANGED
|
@@ -3,25 +3,30 @@ license: mit
|
|
| 3 |
tags:
|
| 4 |
- decompile
|
| 5 |
- binary
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
---
|
| 7 |
|
| 8 |
-
|
| 9 |
|
| 10 |
-
|
| 11 |
|
| 12 |
-
- **
|
| 13 |
|
|
|
|
| 14 |
|
| 15 |
### 2. Evaluation Results
|
| 16 |
|
| 17 |
| Metrics | Re-executability Rate | | | | | Edit Similarity | | | | |
|
| 18 |
-
|
| 19 |
-
| Optimization Level | O0 | O1 | O2 | O3 | AVG | O0 | O1 | O2 | O3 | AVG
|
| 20 |
-
| LLM4Decompile-End-6.7B | 0.6805 | 0.3951 | 0.3671 | 0.3720 | 0.4537 | 0.1557 | 0.1292 | 0.1293 | 0.1269 | 0.1353
|
| 21 |
-
| Ghidra | 0.3476 | 0.1646 | 0.1524 | 0.1402 | 0.2012 | 0.0699 | 0.0613 | 0.0619 | 0.0547 | 0.0620
|
| 22 |
-
| +GPT-4o | 0.4695 | 0.3415 | 0.2866 | 0.3110 | 0.3522 | 0.0660 | 0.0563 | 0.0567 | 0.0499 | 0.0572
|
| 23 |
-
| +LLM4Decompile-Ref-1.3B | 0.6890 | 0.3720 | 0.4085 | 0.3720 | 0.4604 | 0.1517 | 0.1325 | 0.1292 | 0.1267 | 0.1350
|
| 24 |
-
| +LLM4Decompile-Ref-6.7B | 0.7439 | 0.4695 | 0.4756 | 0.4207 | 0.5274 | 0.1559 | 0.1353 | 0.1342 | 0.1273 | 0.1382
|
| 25 |
| +LLM4Decompile-Ref-33B | 0.7073 | 0.4756 | 0.4390 | 0.4146 | 0.5091 | 0.1540 | 0.1379 | 0.1363 | 0.1307 | 0.1397 |
|
| 26 |
|
| 27 |
### 3. How to Use
|
|
@@ -71,28 +76,29 @@ with tempfile.TemporaryDirectory() as temp_dir:
|
|
| 71 |
executable_path = os.path.join(temp_dir, f"{pid}_{opt}.o")
|
| 72 |
cmd = f'gcc -{opt} -o {executable_path} {func_path} -lm'
|
| 73 |
subprocess.run(
|
| 74 |
-
cmd.split(' ')
|
| 75 |
-
check=True
|
| 76 |
-
stdout=subprocess.DEVNULL, # Suppress stdout
|
| 77 |
-
stderr=subprocess.DEVNULL, # Suppress stderr
|
| 78 |
-
timeout=timeout_duration
|
| 79 |
)
|
| 80 |
|
| 81 |
output_path = os.path.join(temp_dir, f"{pid}_{opt}.c")
|
| 82 |
-
command = [
|
| 83 |
-
ghidra_path
|
| 84 |
-
temp_dir
|
| 85 |
-
project_name
|
| 86 |
-
"-import", executable_path
|
| 87 |
-
"-postScript", postscript, output_path
|
| 88 |
-
"-deleteProject", # WARNING: This will delete the project after analysis
|
| 89 |
-
]
|
| 90 |
result = subprocess.run(command, text=True, capture_output=True, check=True)
|
| 91 |
-
with open(output_path,'r') as f
|
| 92 |
c_decompile = f.read()
|
| 93 |
c_func = []
|
| 94 |
flag = 0
|
| 95 |
-
for line in c_decompile.split('
|
|
|
|
| 96 |
if "Function: func0" in line:#**Replace** func0 with the function name you want to decompile.
|
| 97 |
flag = 1
|
| 98 |
c_func.append(line)
|
|
@@ -108,10 +114,14 @@ with tempfile.TemporaryDirectory() as temp_dir:
|
|
| 108 |
if 'func0' in c_func[idx_tmp]:
|
| 109 |
break
|
| 110 |
c_func = c_func[idx_tmp:]
|
| 111 |
-
input_asm = '
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 115 |
input_asm_prompt = before+input_asm.strip()+after
|
| 116 |
with open(fileName +'_' + opt +'.pseudo','w',encoding='utf-8') as f:
|
| 117 |
f.write(input_asm_prompt)
|
|
@@ -131,8 +141,8 @@ undefined4 func0(float param_1,long param_2,int param_3)
|
|
| 131 |
return 0;
|
| 132 |
}
|
| 133 |
while (local_28 = local_28 + 1, local_28 < param_3) {
|
| 134 |
-
if ((double)((ulong)(double)(*(float *)(param_2 + (long)local_24 * 4)
|
| 135 |
-
*(float *)(param_2 + (long)local_28 * 4))
|
| 136 |
SUB168(_DAT_00402010,0)) < (double)param_1) {
|
| 137 |
return 1;
|
| 138 |
}
|
|
@@ -162,8 +172,10 @@ c_func_decompile = tokenizer.decode(outputs[0][len(inputs[0]):-1])
|
|
| 162 |
with open(fileName +'_' + OPT[0] +'.pseudo','r') as f:#original file
|
| 163 |
func = f.read()
|
| 164 |
|
| 165 |
-
print(f'pseudo function:\
|
| 166 |
-
|
|
|
|
|
|
|
| 167 |
|
| 168 |
```
|
| 169 |
|
|
@@ -173,3 +185,15 @@ This code repository is licensed under the MIT License.
|
|
| 173 |
### 5. Contact
|
| 174 |
|
| 175 |
If you have any questions, please raise an issue.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
tags:
|
| 4 |
- decompile
|
| 5 |
- binary
|
| 6 |
+
datasets:
|
| 7 |
+
- LLM4Binary/decompile-bench
|
| 8 |
+
pipeline_tag: text-generation
|
| 9 |
+
library_name: transformers
|
| 10 |
---
|
| 11 |
|
| 12 |
+
# LLM4Decompile-6.7B-v2 Model Card
|
| 13 |
|
| 14 |
+
This repository contains **LLM4Decompile-6.7B-v2**, an open-source large language model dedicated to decompilation, presented in the paper [Decompile-Bench: Million-Scale Binary-Source Function Pairs for Real-World Binary Decompilation](https://huggingface.co/papers/2505.12668).
|
| 15 |
|
| 16 |
+
The LLM4Decompile project aims to decompile x86 assembly instructions into human-readable C source code. This `v2` model belongs to the **LLM4Decompile-Ref** series: it builds on **Ghidra**, taking the pseudo-code Ghidra decompiles and **refining** it. It was trained on 2 billion tokens with a maximum sequence length of 4,096 tokens and achieves up to a 100% improvement over previous models.
|
| 17 |
|
| 18 |
+
- **GitHub Repository:** [LLM4Decompile](https://github.com/albertan017/LLM4Decompile)
|
| 19 |
|
| 20 |
### 2. Evaluation Results
|
| 21 |
|
| 22 |
| Metrics                 | Re-executability Rate |         |         |         |         | Edit Similarity |         |         |         |         |
|:-----------------------:|:---------------------:|:-------:|:-------:|:-------:|:-------:|:---------------:|:-------:|:-------:|:-------:|:-------:|
| Optimization Level      | O0                    | O1      | O2      | O3      | AVG     | O0              | O1      | O2      | O3      | AVG     |
| LLM4Decompile-End-6.7B  | 0.6805                | 0.3951  | 0.3671  | 0.3720  | 0.4537  | 0.1557          | 0.1292  | 0.1293  | 0.1269  | 0.1353  |
| Ghidra                  | 0.3476                | 0.1646  | 0.1524  | 0.1402  | 0.2012  | 0.0699          | 0.0613  | 0.0619  | 0.0547  | 0.0620  |
| +GPT-4o                 | 0.4695                | 0.3415  | 0.2866  | 0.3110  | 0.3522  | 0.0660          | 0.0563  | 0.0567  | 0.0499  | 0.0572  |
| +LLM4Decompile-Ref-1.3B | 0.6890                | 0.3720  | 0.4085  | 0.3720  | 0.4604  | 0.1517          | 0.1325  | 0.1292  | 0.1267  | 0.1350  |
| +LLM4Decompile-Ref-6.7B | 0.7439                | 0.4695  | 0.4756  | 0.4207  | 0.5274  | 0.1559          | 0.1353  | 0.1342  | 0.1273  | 0.1382  |
| +LLM4Decompile-Ref-33B  | 0.7073                | 0.4756  | 0.4390  | 0.4146  | 0.5091  | 0.1540          | 0.1379  | 0.1363  | 0.1307  | 0.1397  |
|
| 31 |
|
| 32 |
### 3. How to Use
|
|
|
|
| 76 |
executable_path = os.path.join(temp_dir, f"{pid}_{opt}.o")
|
| 77 |
cmd = f'gcc -{opt} -o {executable_path} {func_path} -lm'
|
| 78 |
subprocess.run(
|
| 79 |
+
cmd.split(' '),
check=True,
stdout=subprocess.DEVNULL,  # Suppress stdout
stderr=subprocess.DEVNULL,  # Suppress stderr
timeout=timeout_duration,
|
| 84 |
)
|
| 85 |
|
| 86 |
output_path = os.path.join(temp_dir, f"{pid}_{opt}.c")
|
| 87 |
+
command = [
    ghidra_path,
    temp_dir,
    project_name,
    "-import", executable_path,
    "-postScript", postscript, output_path,
    "-deleteProject",  # WARNING: This will delete the project after analysis
]
|
| 95 |
result = subprocess.run(command, text=True, capture_output=True, check=True)
|
| 96 |
+
with open(output_path,'r') as f:
|
| 97 |
c_decompile = f.read()
|
| 98 |
c_func = []
|
| 99 |
flag = 0
|
| 100 |
+
for line in c_decompile.split('\n'):
|
| 102 |
if "Function: func0" in line:#**Replace** func0 with the function name you want to decompile.
|
| 103 |
flag = 1
|
| 104 |
c_func.append(line)
|
|
|
|
| 114 |
if 'func0' in c_func[idx_tmp]:
|
| 115 |
break
|
| 116 |
c_func = c_func[idx_tmp:]
|
| 117 |
+
input_asm = '\n'.join(c_func).strip()
|
| 119 |
+
|
| 120 |
+
before = f"# This is the assembly code:\n"#prompt
|
| 122 |
+
after = "\n# What is the source code?\n"#prompt
|
| 125 |
input_asm_prompt = before+input_asm.strip()+after
|
| 126 |
with open(fileName +'_' + opt +'.pseudo','w',encoding='utf-8') as f:
|
| 127 |
f.write(input_asm_prompt)
|
|
|
|
| 141 |
return 0;
|
| 142 |
}
|
| 143 |
while (local_28 = local_28 + 1, local_28 < param_3) {
|
| 144 |
+
if ((double)((ulong)(double)(*(float *)(param_2 + (long)local_24 * 4) -
*(float *)(param_2 + (long)local_28 * 4)) &
|
| 146 |
SUB168(_DAT_00402010,0)) < (double)param_1) {
|
| 147 |
return 1;
|
| 148 |
}
|
|
|
|
| 172 |
with open(fileName +'_' + OPT[0] +'.pseudo','r') as f:#original file
|
| 173 |
func = f.read()
|
| 174 |
|
| 175 |
+
print(f'pseudo function:\n{func}')# Note we only decompile one function, where the original file may contain multiple functions
|
| 177 |
+
print(f'refined function:\n{c_func_decompile}')
|
| 179 |
|
| 180 |
```
|
| 181 |
|
|
|
|
| 185 |
### 5. Contact
|
| 186 |
|
| 187 |
If you have any questions, please raise an issue.
|
| 188 |
+
|
| 189 |
+
## Citation
|
| 190 |
+
```bibtex
|
| 191 |
+
@misc{tan2024llm4decompile,
|
| 192 |
+
title={LLM4Decompile: Decompiling Binary Code with Large Language Models},
|
| 193 |
+
author={Hanzhuo Tan and Qi Luo and Jing Li and Yuqun Zhang},
|
| 194 |
+
year={2024},
|
| 195 |
+
eprint={2403.05286},
|
| 196 |
+
archivePrefix={arXiv},
|
| 197 |
+
primaryClass={cs.PL}
|
| 198 |
+
}
|
| 199 |
+
```
|