Spaces:
Sleeping
Sleeping
import streamlit as st | |
import os | |
import zipfile | |
import tempfile | |
import pefile | |
import shutil | |
import subprocess | |
import re | |
import struct | |
from pathlib import Path | |
from capstone import Cs, CS_ARCH_X86, CS_MODE_32, CS_MODE_64 | |
# Page chrome: configure the browser tab first, then render the heading and
# the feature summary shown above the uploader.
# NOTE(review): the page icon may be a mis-encoded emoji -- confirm against
# the deployed app before changing it.
_INTRO_MARKDOWN = """
This tool allows you to:
- Extract and view contents of .zip files
- Decompile .exe files to Python-like code
- View decompiled code from .dll files
- Automatically analyze nested executables
"""

st.set_page_config(
    page_title="Advanced File Analyzer",
    page_icon="π",
    layout="wide",
)
st.title("Advanced File Analysis Tool")
st.markdown(_INTRO_MARKDOWN)
def try_pyinstaller_extraction(file_path, output_dir):
    """Attempt to extract and decompile Python code from a PyInstaller executable.

    Runs the ``pyinstxtractor`` module on ``file_path`` with ``output_dir`` as
    the working directory, then runs ``uncompyle6`` on every ``.pyc``/``.pyo``
    file found under ``<output_dir>/<basename of file_path>_extracted``.

    Args:
        file_path: Path to the executable to inspect.
        output_dir: Existing directory in which the extractor is run.

    Returns:
        A dict with ``"success"`` (bool) and ``"message"`` (str). On success it
        also contains ``"files"``: a mapping from each bytecode file's path
        (relative to the extraction directory) to its decompiled source text.
        Exceptions are reported through the returned dict, never raised.
    """
    # Function-scope imports: neither name is in the module's import block.
    import importlib.util
    import sys

    not_extracted = {
        "success": False,
        "message": "Not a PyInstaller executable or extraction failed"
    }

    # Fail fast instead of spawning pip and the extractor for unusable input.
    if not os.path.isfile(file_path) or not os.path.isdir(output_dir):
        return not_extracted

    try:
        # The extractor runs with cwd=output_dir, so a relative path would be
        # resolved against the wrong directory unless made absolute first.
        target = os.path.abspath(file_path)
        # Use the running interpreter so packages land in (and are run from)
        # the same environment as this app, whatever "pip"/"python" are on PATH.
        pip_install = [sys.executable, "-m", "pip", "install"]

        # Install pyinstxtractor only if it is not already importable.
        if importlib.util.find_spec("pyinstxtractor") is None:
            subprocess.run(pip_install + ["pyinstxtractor"], capture_output=True)
        subprocess.run([sys.executable, "-m", "pyinstxtractor", target],
                       cwd=output_dir, capture_output=True, text=True)

        extracted_dir = os.path.join(output_dir, os.path.basename(target) + "_extracted")
        if not os.path.isdir(extracted_dir):
            return not_extracted

        # Install the decompiler only if its command is not already available.
        if shutil.which("uncompyle6") is None:
            subprocess.run(pip_install + ["uncompyle6"], capture_output=True)

        python_files = {}
        for root, _, files in os.walk(extracted_dir):
            for name in files:
                if not name.endswith(('.pyc', '.pyo')):
                    continue
                pyc_path = os.path.join(root, name)
                py_path = pyc_path + ".py"
                try:
                    subprocess.run(["uncompyle6", pyc_path, "-o", py_path], capture_output=True)
                    if os.path.exists(py_path):
                        with open(py_path, 'r', encoding='utf-8', errors='ignore') as f:
                            rel_path = os.path.relpath(pyc_path, extracted_dir)
                            python_files[rel_path] = f.read()
                except (OSError, subprocess.SubprocessError):
                    # Best effort: one file that cannot be decompiled or read
                    # must not abort the rest of the extraction.
                    continue

        return {
            "success": True,
            "message": "Successfully extracted Python code",
            "files": python_files
        }
    except Exception as e:
        return {
            "success": False,
            "message": f"PyInstaller extraction error: {str(e)}"
        }
def disassemble_binary(file_path, is_dll=False):
    """Disassemble the executable sections of a PE file into a text preview.

    For each section flagged executable, up to 4096 bytes are disassembled.
    For non-DLL input, the section containing the entry point is disassembled
    starting at the entry point and its first instruction is prefixed with
    ``"ENTRY POINT -> "``. Addresses in the listing are RVAs (the image base
    is not added).

    Args:
        file_path: Path to the PE file.
        is_dll: When true, skip entry-point handling and collect the names of
            exported symbols.

    Returns:
        On success, a dict with ``"success": True``, ``"code_sections"`` (a
        list of ``{"name", "disassembly"}`` dicts), ``"exports"`` (list of
        export names, empty unless ``is_dll``) and ``"pseudo_python"`` (text
        from ``generate_pseudo_python``). On any exception, a dict with
        ``"success": False`` and a ``"message"`` describing the error.
    """
    pe = None
    try:
        pe = pefile.PE(file_path)

        # 0x20b is the optional-header magic of PE32+ (64-bit) images.
        is_64bit = pe.OPTIONAL_HEADER.Magic == 0x20b
        md = Cs(CS_ARCH_X86, CS_MODE_64 if is_64bit else CS_MODE_32)

        # Only executables get an entry-point marker.
        entry_rva = None if is_dll else pe.OPTIONAL_HEADER.AddressOfEntryPoint

        code_sections = []
        for section in pe.sections:
            # 0x20000000 is IMAGE_SCN_MEM_EXECUTE: the section is executable.
            if not section.Characteristics & 0x20000000:
                continue

            section_name = section.Name.decode('utf-8', errors='ignore').strip('\x00')
            section_data = pe.get_data(section.VirtualAddress, section.SizeOfRawData)

            # Start at the entry point when it lies in this section.
            is_entry = entry_rva is not None and section.contains_rva(entry_rva)
            start_offset = entry_rva - section.VirtualAddress if is_entry else 0

            # Limit the preview; clamp at zero in case the entry point lies
            # beyond the bytes actually available for this section.
            preview_size = max(0, min(len(section_data) - start_offset, 4096))
            preview = section_data[start_offset:start_offset + preview_size]

            disassembly = []
            instructions = md.disasm_lite(preview, section.VirtualAddress + start_offset)
            for i, (address, size, mnemonic, op_str) in enumerate(instructions):
                entry_marker = "ENTRY POINT -> " if is_entry and i == 0 else ""
                disassembly.append(f"{entry_marker}0x{address:08x}: {mnemonic} {op_str}")

            code_sections.append({
                "name": section_name,
                "disassembly": disassembly
            })

        # Get exports for DLLs
        exports = []
        if is_dll and hasattr(pe, 'DIRECTORY_ENTRY_EXPORT'):
            exports = [
                exp.name.decode('utf-8', errors='ignore')
                for exp in pe.DIRECTORY_ENTRY_EXPORT.symbols
                if exp.name
            ]

        # Convert to pseudo-Python
        pseudo_python = generate_pseudo_python(code_sections, exports, is_dll)
        return {
            "success": True,
            "code_sections": code_sections,
            "exports": exports,
            "pseudo_python": pseudo_python
        }
    except Exception as e:
        return {
            "success": False,
            "message": f"Disassembly error: {str(e)}"
        }
    finally:
        # Release the parsed file so the caller can delete or re-open it.
        if pe is not None:
            pe.close()
def generate_pseudo_python(code_sections, exports, is_dll):
    """Generate illustrative pseudo-Python text from a disassembly listing.

    Each listing line is expected to look like
    ``"[ENTRY POINT -> ]0x<address>: <mnemonic> <operands>"``. Only the first
    100 lines of each section are translated:

    * ``call`` with a hexadecimal operand becomes ``call_function_<target>()``
    * ``mov <reg>, <src>`` becomes ``<reg> = <src>``
    * ``cmp <a>, <b>`` becomes ``if <a> == <b>: pass``
    * everything else is kept as a comment carrying the original line

    Args:
        code_sections: Iterable of dicts with ``"name"`` and ``"disassembly"``
            (list of listing lines).
        exports: Export names; emitted as stub functions when ``is_dll``.
        is_dll: Whether to emit the export stubs.

    Returns:
        The generated text as a single newline-joined string. It is an
        approximation for display, not recovered source code.
    """
    hex_literal = re.compile(r'0x[0-9a-f]+')
    operand_pair = re.compile(r'(\w+),\s+(.+)')

    pseudo_code = [
        "# This is an approximation generated from binary code",
        "# It is NOT the original source code",
        "",
    ]

    if is_dll:
        pseudo_code.append("# DLL Export Functions")
        for export in exports:
            pseudo_code.append(f"def {export}():")
            pseudo_code.append("    # Implementation not recoverable from binary")
            pseudo_code.append("    pass")
            pseudo_code.append("")

    for section in code_sections:
        pseudo_code.append(f"# Code Section: {section['name']}")
        pseudo_code.append("def main():")

        has_statement = False
        for line in section['disassembly'][:100]:  # Limit to first 100 instructions for preview
            # Separate the "0x<address>: " prefix from the instruction. Every
            # line contains a hex address, so patterns must be matched against
            # the operands only, never against the whole line.
            _, sep, instruction = line.partition(": ")
            if not sep:
                instruction = line
            mnemonic, _, operands = instruction.strip().partition(" ")
            operands = operands.strip()

            translated = []
            if mnemonic == "call":
                target = hex_literal.search(operands)
                if target:
                    translated.append(f"    call_function_{target.group(0)}()")
            elif mnemonic == "mov":
                assignment = operand_pair.match(operands)
                if assignment:
                    dest, source = assignment.groups()
                    translated.append(f"    {dest} = {source} # {line}")
            elif mnemonic == "cmp":
                comparison = operand_pair.match(operands)
                if comparison:
                    a, b = comparison.groups()
                    translated.append(f"    if {a} == {b}: # {line}")
                    translated.append("        pass")

            if translated:
                has_statement = True
                pseudo_code.extend(translated)
            else:
                # Keep anything unrecognised as a comment rather than dropping it.
                pseudo_code.append(f"    # {line}")

        if not has_statement:
            # A body made only of comments (or nothing) is not a valid function.
            pseudo_code.append("    pass")
        pseudo_code.append("")

    pseudo_code.append("if __name__ == '__main__':")
    pseudo_code.append("    main()")
    pseudo_code.append("")
    return "\n".join(pseudo_code)
def unpack_exe(file_path, output_dir):
    """Collect PE metadata from an EXE, dump its resources and analyze its code.

    Args:
        file_path: Path to the executable.
        output_dir: Directory that receives a ``resources`` sub-folder (when
            the file has a resource directory) and the PyInstaller extraction.

    Returns:
        On success, a dict with ``"basic_info"`` (header fields, sections and,
        when present, ``"Imports"`` and ``"ExtractedResources"``),
        ``"disassembly"`` (result of ``disassemble_binary``) and
        ``"pyinstaller"`` (result of ``try_pyinstaller_extraction``).
        On any exception, ``{"Error": <message>}``.
    """
    try:
        pe = pefile.PE(file_path)
        try:
            # Basic PE information
            info = {
                "Machine": hex(pe.FILE_HEADER.Machine),
                "TimeDateStamp": pe.FILE_HEADER.TimeDateStamp,
                "NumberOfSections": pe.FILE_HEADER.NumberOfSections,
                "Sections": [
                    {
                        "Name": section.Name.decode('utf-8', errors='ignore').strip('\x00'),
                        "VirtualAddress": hex(section.VirtualAddress),
                        "SizeOfRawData": section.SizeOfRawData,
                        "Entropy": section.get_entropy()
                    }
                    for section in pe.sections
                ]
            }

            # Get imports
            if hasattr(pe, 'DIRECTORY_ENTRY_IMPORT'):
                info["Imports"] = [
                    {
                        "DLL": entry.dll.decode('utf-8', errors='ignore'),
                        "Functions": [
                            imp.name.decode('utf-8', errors='ignore')
                            for imp in entry.imports
                            if imp.name
                        ]
                    }
                    for entry in pe.DIRECTORY_ENTRY_IMPORT
                ]

            # Extract resources if present
            if hasattr(pe, 'DIRECTORY_ENTRY_RESOURCE'):
                resource_dir = os.path.join(output_dir, "resources")
                os.makedirs(resource_dir, exist_ok=True)

                def label(entry):
                    # Named entries have id None; use the name so that
                    # different resources do not collide on "None". The result
                    # is sanitised because it becomes part of a file name.
                    key = entry.id if entry.id is not None else entry.name
                    return re.sub(r'[^\w.-]', '_', str(key))

                extracted_resources = []
                for resource_type in pe.DIRECTORY_ENTRY_RESOURCE.entries:
                    if not hasattr(resource_type, 'directory'):
                        continue
                    for resource_id in resource_type.directory.entries:
                        if not hasattr(resource_id, 'directory'):
                            continue
                        for resource_lang in resource_id.directory.entries:
                            if not hasattr(resource_lang, 'data'):
                                # Not a data leaf; nothing to dump.
                                continue
                            data = pe.get_data(resource_lang.data.struct.OffsetToData,
                                               resource_lang.data.struct.Size)
                            resource_filename = (
                                f"resource_{label(resource_type)}"
                                f"_{label(resource_id)}_{label(resource_lang)}"
                            )
                            resource_path = os.path.join(resource_dir, resource_filename)
                            with open(resource_path, 'wb') as f:
                                f.write(data)
                            extracted_resources.append(resource_filename)
                info["ExtractedResources"] = extracted_resources
        finally:
            # Release the file before other tools open it and before the
            # caller cleans up its temporary directory.
            pe.close()

        # Try to disassemble and convert to pseudo-Python
        disassembly_result = disassemble_binary(file_path)
        # Try PyInstaller extraction for Python executables
        pyinstaller_result = try_pyinstaller_extraction(file_path, output_dir)
        return {
            "basic_info": info,
            "disassembly": disassembly_result,
            "pyinstaller": pyinstaller_result
        }
    except Exception as e:
        return {"Error": str(e)}
def analyze_dll(file_path):
    """Collect PE metadata from a DLL and disassemble its code.

    Args:
        file_path: Path to the DLL.

    Returns:
        On success, a dict with ``"basic_info"`` (header fields, sections and,
        when present, ``"Exports"`` and ``"Imports"``) and ``"disassembly"``
        (result of ``disassemble_binary(file_path, is_dll=True)``).
        On any exception, ``{"Error": <message>}``.
    """
    try:
        pe = pefile.PE(file_path)
        try:
            # Basic information
            info = {
                "Machine": hex(pe.FILE_HEADER.Machine),
                "TimeDateStamp": pe.FILE_HEADER.TimeDateStamp,
                "NumberOfSections": pe.FILE_HEADER.NumberOfSections,
                "Characteristics": hex(pe.FILE_HEADER.Characteristics),
                "DllCharacteristics": hex(pe.OPTIONAL_HEADER.DllCharacteristics),
                "Sections": [
                    {
                        "Name": section.Name.decode('utf-8', errors='ignore').strip('\x00'),
                        "VirtualAddress": hex(section.VirtualAddress),
                        "SizeOfRawData": section.SizeOfRawData,
                        "Entropy": section.get_entropy()
                    }
                    for section in pe.sections
                ]
            }

            # Get exports (symbols exported by ordinal only have no name)
            if hasattr(pe, 'DIRECTORY_ENTRY_EXPORT'):
                info["Exports"] = [
                    exp.name.decode('utf-8', errors='ignore')
                    for exp in pe.DIRECTORY_ENTRY_EXPORT.symbols
                    if exp.name
                ]

            # Get imports
            if hasattr(pe, 'DIRECTORY_ENTRY_IMPORT'):
                info["Imports"] = [
                    {
                        "DLL": entry.dll.decode('utf-8', errors='ignore'),
                        "Functions": [
                            imp.name.decode('utf-8', errors='ignore')
                            for imp in entry.imports
                            if imp.name
                        ]
                    }
                    for entry in pe.DIRECTORY_ENTRY_IMPORT
                ]
        finally:
            # Release the file before it is parsed again for disassembly and
            # before the caller cleans up its temporary directory.
            pe.close()

        # Try to disassemble and convert to pseudo-Python
        disassembly_result = disassemble_binary(file_path, is_dll=True)
        return {
            "basic_info": info,
            "disassembly": disassembly_result
        }
    except Exception as e:
        return {"Error": str(e)}
def process_zip_file(file_path, temp_dir):
    """Extract a ZIP archive and analyze any executables found inside it.

    Args:
        file_path: Path to the ZIP archive.
        temp_dir: Directory that receives the extracted files and one
            ``*_unpacked`` output folder per contained ``.exe``.

    Returns:
        On success, a dict with ``"file_list"`` (archive member names) and
        ``"nested_files"``: a mapping from each ``.exe``/``.dll`` path
        (relative to ``temp_dir``) to ``{"type": "exe"|"dll", "info": ...}``
        where ``info`` is the result of ``unpack_exe`` or ``analyze_dll``.
        On any exception, ``{"error": <message>}``.
    """
    try:
        with zipfile.ZipFile(file_path, 'r') as zip_ref:
            # Get file list before extraction
            file_list = zip_ref.namelist()
            # Extract to temp directory
            zip_ref.extractall(temp_dir)

        # Snapshot the extracted tree before analysing anything: the analysis
        # creates output folders under temp_dir, and those must not be walked.
        extracted_paths = [
            os.path.join(root, name)
            for root, _, files in os.walk(temp_dir)
            for name in files
        ]

        # Check for nested executables
        nested_files = {}
        for full_path in extracted_paths:
            rel_path = os.path.relpath(full_path, temp_dir)
            # Case-insensitive, matching how uploaded file names are checked.
            lowered = full_path.lower()
            if lowered.endswith('.exe'):
                # Derive the folder from the relative path so that same-named
                # executables in different sub-folders do not share output.
                unpack_name = rel_path.replace(os.sep, "_") + "_unpacked"
                exe_output_dir = os.path.join(temp_dir, unpack_name)
                os.makedirs(exe_output_dir, exist_ok=True)
                nested_files[rel_path] = {
                    'type': 'exe',
                    'info': unpack_exe(full_path, exe_output_dir)
                }
            elif lowered.endswith('.dll'):
                nested_files[rel_path] = {
                    'type': 'dll',
                    'info': analyze_dll(full_path)
                }

        return {
            'file_list': file_list,
            'nested_files': nested_files
        }
    except Exception as e:
        return {'error': str(e)}
# Main app logic
#
# The upload is written to a temporary directory, analyzed according to its
# extension, and the results are rendered with the helpers below. The same
# helpers serve both a directly uploaded EXE/DLL and files found inside a ZIP.


def _render_titled_code(title, code, language, nested):
    """Show a code listing under a title.

    A collapsible expander is used unless ``nested`` is true. Nested views are
    already inside an expander, and Streamlit has historically rejected an
    expander placed inside another expander, so a plain title is used there.
    """
    if nested:
        st.text(title)
        st.code(code, language=language)
    else:
        with st.expander(title):
            st.code(code, language=language)


def _render_code_sections(disassembly, nested):
    """Show the assembly listing of every disassembled section."""
    for section in disassembly.get('code_sections', []):
        _render_titled_code(f"Section: {section['name']}",
                            "\n".join(section['disassembly']), "asm", nested)


def _render_exports_imports(basic_info):
    """Show the export and import tables that are present in ``basic_info``."""
    if 'Exports' in basic_info:
        st.subheader("Exported Functions")
        st.json(basic_info['Exports'])
    if 'Imports' in basic_info:
        st.subheader("Imported DLLs and Functions")
        st.json(basic_info['Imports'])


def _render_exe_analysis(exe_info, nested=False):
    """Render the result of ``unpack_exe`` as a set of tabs."""
    if 'Error' in exe_info:
        # unpack_exe reports failures as {"Error": ...}; show them instead of
        # rendering empty tabs.
        st.error(f"Error analyzing EXE file: {exe_info['Error']}")
        return

    basic_info = exe_info.get('basic_info', {})
    disassembly = exe_info.get('disassembly', {})
    pyinstaller_result = exe_info.get('pyinstaller', {})

    tabs = st.tabs(["Basic Info", "Python Code", "Disassembly", "Resources"])
    with tabs[0]:
        st.json(basic_info)
    with tabs[1]:
        if pyinstaller_result.get('success', False):
            st.success("Python code extracted successfully!")
            for filename, content in pyinstaller_result.get('files', {}).items():
                _render_titled_code(f"Python File: {filename}", content, "python", nested)
        else:
            st.warning(pyinstaller_result.get('message', "Not a Python executable or extraction failed."))
            st.subheader("Generated Python-like Code")
            st.code(disassembly.get('pseudo_python', "# No code could be generated"), language="python")
    with tabs[2]:
        if disassembly.get('success', False):
            _render_code_sections(disassembly, nested)
        else:
            st.error(disassembly.get('message', "Disassembly failed"))
    with tabs[3]:
        if 'ExtractedResources' in basic_info:
            st.write(f"Found {len(basic_info['ExtractedResources'])} resources")
            for resource in basic_info['ExtractedResources']:
                st.text(f"Resource: {resource}")


def _render_dll_analysis(dll_info, nested=False):
    """Render the result of ``analyze_dll`` as a set of tabs."""
    if 'Error' in dll_info:
        # analyze_dll reports failures as {"Error": ...}.
        st.error(f"Error analyzing DLL file: {dll_info['Error']}")
        return

    basic_info = dll_info.get('basic_info', {})
    disassembly = dll_info.get('disassembly', {})

    tabs = st.tabs(["Basic Info", "DLL Code", "Exports/Imports"])
    with tabs[0]:
        st.json(basic_info)
    with tabs[1]:
        st.subheader("Generated Python-like Code")
        st.code(disassembly.get('pseudo_python', "# No code could be generated"), language="python")
        if disassembly.get('success', False):
            st.subheader("Assembly Code")
            _render_code_sections(disassembly, nested)
    with tabs[2]:
        _render_exports_imports(basic_info)


uploaded_file = st.file_uploader("Upload a file (.zip, .exe, or .dll)", type=["zip", "exe", "dll"])
if uploaded_file is not None:
    with tempfile.TemporaryDirectory() as temp_dir:
        # Save the uploaded file to the temporary directory. Only the final
        # path component of the client-supplied name is used.
        upload_name = os.path.basename(uploaded_file.name)
        file_path = os.path.join(temp_dir, upload_name)
        with open(file_path, "wb") as f:
            f.write(uploaded_file.getbuffer())
        st.success(f"File uploaded: {uploaded_file.name}")

        # Process based on file type
        lowered_name = upload_name.lower()
        if lowered_name.endswith('.zip'):
            st.subheader("ZIP File Contents")
            output_dir = os.path.join(temp_dir, "extracted")
            os.makedirs(output_dir, exist_ok=True)
            result = process_zip_file(file_path, output_dir)
            if 'error' in result:
                st.error(f"Error processing ZIP file: {result['error']}")
            else:
                with st.expander("ZIP Contents", expanded=True):
                    st.write(f"Total files: {len(result['file_list'])}")
                    st.json(result['file_list'])
                if result['nested_files']:
                    st.subheader("Detected Executable Files")
                    # A distinct loop name keeps the uploaded file's path intact.
                    for nested_path, file_info in result['nested_files'].items():
                        with st.expander(f"{nested_path} ({file_info['type'].upper()})"):
                            if file_info['type'] == 'exe':
                                _render_exe_analysis(file_info['info'], nested=True)
                            else:  # DLL
                                _render_dll_analysis(file_info['info'], nested=True)
        elif lowered_name.endswith('.exe'):
            st.subheader("EXE File Analysis and Decompilation")
            output_dir = os.path.join(temp_dir, "exe_unpacked")
            os.makedirs(output_dir, exist_ok=True)
            try:
                _render_exe_analysis(unpack_exe(file_path, output_dir))
            except Exception as e:
                st.error(f"Error analyzing EXE file: {str(e)}")
        elif lowered_name.endswith('.dll'):
            st.subheader("DLL File Analysis and Decompilation")
            try:
                _render_dll_analysis(analyze_dll(file_path))
            except Exception as e:
                st.error(f"Error analyzing DLL file: {str(e)}")