Files
8086-rs/src/disasm.rs
Marco Thomas b5c178ea61 chore: remove Immediate from Immediate::{Word, Byte, Operand}
It's already clear that it's an Immediate value
without the prefix.
2025-05-14 10:40:52 +09:00

463 lines
17 KiB
Rust

use core::fmt;
use std::{fs::File, io::Read, process::exit};
use crate::aout::Aout;
use crate::instructions::{MemoryIndex, ModRmTarget, Operand, Pointer};
use crate::register::{Register, RegisterId, SegmentRegister};
use crate::{
Args,
instructions::{Instruction, Mnemonic},
};
use crate::{modrmb, modrmgprb, modrmgprv, modrms, modrmv};
/// Errors that can be reported while loading and disassembling a binary.
#[derive(Debug)]
pub enum DisasmError {
    /// No input file could be determined; carries the path option that was supplied.
    NoFile(Option<String>),
    /// An underlying I/O operation failed.
    IoError(std::io::Error),
}
impl From<std::io::Error> for DisasmError {
fn from(error: std::io::Error) -> Self {
DisasmError::IoError(error)
}
}
impl fmt::Display for DisasmError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
DisasmError::NoFile(msg) => write!(f, "No file error: {:?}", msg),
DisasmError::IoError(msg) => write!(f, "{}", msg),
}
}
}
/// Main entry point to the disassembly.
///
/// Reads the file named by `args.path`, wraps its bytes in an [`Aout`],
/// and decodes the resulting text section into a vector of instructions.
pub fn disasm(args: &Args) -> Result<Vec<Instruction>, DisasmError> {
    let aout = Aout::new(path_to_buf(args)?);
    log::debug!("{:?}", aout);
    Disassembler::new(aout).decode_instructions()
}
/// Read the file named by `args.path` into a byte buffer.
///
/// # Errors
///
/// Returns [`DisasmError::NoFile`] when `args.path` is `None`, and
/// [`DisasmError::IoError`] when the file cannot be opened or read.
fn path_to_buf(args: &Args) -> Result<Vec<u8>, DisasmError> {
    // Borrow the path instead of cloning it, and only build the error when
    // the path is actually missing (in which case it is always `None`).
    let path = args
        .path
        .as_ref()
        .ok_or_else(|| DisasmError::NoFile(None))?;
    let mut file = File::open(path)?;
    let mut buf = Vec::new();
    file.read_to_end(&mut buf)?;
    Ok(buf)
}
/// Decoder state used while walking a text section byte by byte and
/// assembling one `Instruction` at a time.
#[derive(Debug)]
struct Disassembler {
pub offset: usize, // index into `text` of the byte that was consumed last / is being decoded
pub text: Vec<u8>, // the text section taken from the aout binary (`aout.text`)
pub instruction: Instruction, // the instruction which is currently being parsed
}
impl Disassembler {
pub fn new(aout: Aout) -> Self {
Disassembler {
offset: 0,
text: aout.text,
instruction: Instruction::new(),
}
}
/// Consume the byte that follows the current offset.
///
/// The offset is advanced by one, the byte found there is appended to the
/// raw bytes of the instruction being built, and the byte is returned.
///
/// # Panics
///
/// Panics if the advanced offset lies outside of `text`.
pub fn parse_byte(&mut self) -> u8 {
    // step off the previously consumed byte onto the operand
    self.offset += 1;
    let value = self.text[self.offset];
    self.instruction.raw.push(value);
    value
}
/// Consume the two bytes that follow the current offset as one
/// little-endian word.
///
/// Both bytes are appended to the raw bytes of the instruction being built
/// and the offset ends up on the second (high) byte.
///
/// # Panics
///
/// Panics if either byte lies outside of `text`.
pub fn parse_word(&mut self) -> u16 {
    // step off the previously consumed byte onto the low byte
    self.offset += 1;
    let low = self.text[self.offset];
    let high = self.text[self.offset + 1];
    // leave the offset on the last byte that was consumed
    self.offset += 1;
    self.instruction.raw.extend([low, high]);
    u16::from_le_bytes([low, high])
}
/// Split a ModR/M byte into its three bit fields.
///
/// Returns `(mod, reg, rm)`: bits 7-6, bits 5-3 and bits 2-0 respectively.
fn deconstruct_modrm_byte(modrm: u8) -> (u8, u8, u8) {
    (
        (modrm & 0b1100_0000) >> 6,
        (modrm & 0b0011_1000) >> 3,
        modrm & 0b0000_0111,
    )
}
/// Parse the ModR/M byte that follows the current offset, together with any
/// displacement bytes it calls for, advancing the offset past all of them.
///
/// `width` selects whether a register operand (mod = 0b11) is looked up as a
/// byte or a word register; only the variant is inspected, its payload is
/// ignored.
///
/// Returns the decoded r/m target and the `reg` field of the byte.
///
/// Handling per `mod` value:
/// - `0b00`: no displacement, except for r/m = `0b110`, which is the
///   direct-address form: a 16-bit address follows and NO base register
///   takes part in the address.
/// - `0b01`: a displacement byte follows.
/// - `0b10`: a displacement word follows.
/// - `0b11`: r/m names a register instead of a memory operand.
///
/// # Panics
///
/// Panics if `text` ends before the ModR/M byte or its displacement.
pub fn parse_modrm_byte(&mut self, width: Operand) -> (ModRmTarget, RegisterId) {
    // advance to operand
    self.offset += 1;
    let modrm = self.text[self.offset];
    self.instruction.raw.push(modrm);

    let (mode, reg, rm) = Self::deconstruct_modrm_byte(modrm);
    log::debug!(
        "0x{:04x} deconstructed into: 0b{:b}, 0b{:b}, 0b{:b}",
        modrm,
        mode,
        reg,
        rm
    );

    // mod = 00 with r/m = 110 does not mean [BP]; it is a bare 16-bit address.
    let direct_address = mode == 0b00 && rm == 0b110;

    let displacement = match mode {
        0b00 => {
            if direct_address {
                log::debug!("Additional word during ModRM parsing was read with mod 0.");
                Some(Operand::Word(self.parse_word()))
            } else {
                None
            }
        }
        0b01 => {
            log::debug!("Additional byte during ModRM parsing was read.");
            Some(Operand::Byte(self.parse_byte()))
        }
        0b10 => {
            log::debug!("Additional word during ModRM parsing was read.");
            Some(Operand::Word(self.parse_word()))
        }
        0b11 => {
            log::debug!("ModRM (0b{:b}) to/from Register (0b{:b})", rm, reg);
            // XXX: find a nicer way instead of using Byte(0) and Word(0)
            let target = match width {
                Operand::Byte(_) => ModRmTarget::Register(Register::by_id(Operand::Byte(rm))),
                Operand::Word(_) => {
                    ModRmTarget::Register(Register::by_id(Operand::Word(rm.into())))
                }
            };
            return (target, reg);
        }
        _ => panic!("Invalid ModRM byte encountered"),
    };

    // Base/index register pair selected by the r/m field.
    let (base, index) = match rm {
        0b000 => (Some(Register::BX), Some(Register::SI)),
        0b001 => (Some(Register::BX), Some(Register::DI)),
        0b010 => (Some(Register::BP), Some(Register::SI)),
        0b011 => (Some(Register::BP), Some(Register::DI)),
        0b100 => (None, Some(Register::SI)),
        0b101 => (None, Some(Register::DI)),
        // direct address: the displacement word alone is the address
        0b110 if direct_address => (None, None),
        0b110 => (Some(Register::BP), None),
        0b111 => (Some(Register::BX), None),
        _ => panic!("Invalid ModRM byte encountered"),
    };

    (
        ModRmTarget::Memory(MemoryIndex {
            base,
            index,
            displacement,
        }),
        reg,
    )
}
/// Match the modrm reg bits to the GPR1 mnemonics.
/// GPR always has an imm value as second operand, but is available in both
/// Byte and Word length.
pub fn modrm_reg_to_mnemonic(reg: u8, target: ModRmTarget, imm: Operand) -> Mnemonic {
match imm {
Operand::Byte(b) => match reg {
0b000 => Mnemonic::ADD_Ib(target, b),
0b001 => Mnemonic::OR_Ib(target, b),
0b010 => Mnemonic::ADC_Ib(target, b),
0b011 => Mnemonic::SBB_Ib(target, b),
0b100 => Mnemonic::AND_Ib(target, b),
0b101 => Mnemonic::SUB_Ib(target, b),
0b110 => Mnemonic::XOR_Ib(target, b),
0b111 => Mnemonic::CMP_Ib(target, b),
_ => panic!("Illegal GPR1 mnemonic"),
},
Operand::Word(w) => match reg {
0b000 => Mnemonic::ADD_Iv(target, w),
0b001 => Mnemonic::OR_Iv(target, w),
0b010 => Mnemonic::ADC_Iv(target, w),
0b011 => Mnemonic::SBB_Iv(target, w),
0b100 => Mnemonic::AND_Iv(target, w),
0b101 => Mnemonic::SUB_Iv(target, w),
0b110 => Mnemonic::XOR_Iv(target, w),
0b111 => Mnemonic::CMP_Iv(target, w),
_ => panic!("Illegal GPR1 mnemonic"),
},
}
}
/// Decode instructions from the text section of the provided binary.
///
/// Starting at the current offset, one opcode byte is read per iteration
/// and matched against the opcode table; the parse helpers then consume
/// however many operand bytes that opcode needs. Every decoded instruction
/// is printed to stdout and collected into the returned vector.
///
/// # Panics
///
/// Panics on opcodes that are treated as undefined (`0x0F`,
/// `0x60..=0x6F`), on the not yet implemented `0x83`, and when the text
/// section ends in the middle of an instruction's operands.
///
/// An opcode without a table entry does not panic: a diagnostic is written
/// to stderr and the whole process exits with status 1.
pub fn decode_instructions(&mut self) -> Result<Vec<Instruction>, DisasmError> {
    // naive approach:
    // 1. read byte
    // 2. pattern match to see which instruction it is
    // 3. read as many bytes as this instruction needs (registers, immediates, ...)
    // repeat until no bytes left
    let mut instructions = Vec::new();
    while self.offset < self.text.len() {
        self.instruction.start = self.offset;
        let opcode = self.text[self.offset];
        // additional raw bytes will be pushed by parse functions
        self.instruction.raw.push(opcode);
        // XXX: convert this copy and paste horror into a proc macro
        self.instruction.opcode = match opcode {
            0x00 => modrmb!(self, ADD_FromReg),
            0x01 => modrmv!(self, ADD_FromReg),
            0x02 => modrmb!(self, ADD_ToReg),
            0x03 => modrmv!(self, ADD_ToReg),
            0x04 => Mnemonic::ADD_ALIb(self.parse_byte()),
            0x05 => Mnemonic::ADD_AXIv(self.parse_word()),
            0x06 => Mnemonic::PUSH_S(SegmentRegister::ES),
            0x07 => Mnemonic::POP_S(SegmentRegister::ES),
            0x08 => modrmb!(self, OR_FromReg),
            0x09 => modrmv!(self, OR_FromReg),
            0x0A => modrmb!(self, OR_ToReg),
            0x0B => modrmv!(self, OR_ToReg),
            0x0C => Mnemonic::OR_ALIb(self.parse_byte()),
            0x0D => Mnemonic::OR_AXIv(self.parse_word()),
            0x0E => Mnemonic::PUSH_S(SegmentRegister::CS),
            0x0F => panic!("Opcode 0x0F (POP CS) is considered undefined"),
            0x10 => modrmb!(self, ADC_FromReg),
            0x11 => modrmv!(self, ADC_FromReg),
            0x12 => modrmb!(self, ADC_ToReg),
            0x13 => modrmv!(self, ADC_ToReg),
            0x14 => Mnemonic::ADC_ALIb(self.parse_byte()),
            0x15 => Mnemonic::ADC_AXIv(self.parse_word()),
            0x16 => Mnemonic::PUSH_S(SegmentRegister::SS),
            0x17 => Mnemonic::POP_S(SegmentRegister::SS),
            0x18 => modrmb!(self, SBB_FromReg),
            0x19 => modrmv!(self, SBB_FromReg),
            0x1A => modrmb!(self, SBB_ToReg),
            0x1B => modrmv!(self, SBB_ToReg),
            0x1C => Mnemonic::SBB_ALIb(self.parse_byte()),
            0x1D => Mnemonic::SBB_AXIv(self.parse_word()),
            0x1E => Mnemonic::PUSH_S(SegmentRegister::DS),
            0x1F => Mnemonic::POP_S(SegmentRegister::DS),
            0x20 => modrmb!(self, AND_FromReg),
            0x21 => modrmv!(self, AND_FromReg),
            0x22 => modrmb!(self, AND_ToReg),
            0x23 => modrmv!(self, AND_ToReg),
            0x24 => Mnemonic::AND_ALIb(self.parse_byte()),
            0x25 => Mnemonic::AND_AXIv(self.parse_word()),
            0x26 => Mnemonic::OVERRIDE(SegmentRegister::ES),
            0x27 => Mnemonic::DAA,
            0x28 => modrmb!(self, SUB_FromReg),
            0x29 => modrmv!(self, SUB_FromReg),
            0x2A => modrmb!(self, SUB_ToReg),
            0x2B => modrmv!(self, SUB_ToReg),
            0x2C => Mnemonic::SUB_ALIb(self.parse_byte()),
            0x2D => Mnemonic::SUB_AXIv(self.parse_word()),
            0x2E => Mnemonic::OVERRIDE(SegmentRegister::CS),
            0x2F => Mnemonic::DAS,
            0x30 => modrmb!(self, XOR_FromReg),
            0x31 => modrmv!(self, XOR_FromReg),
            0x32 => modrmb!(self, XOR_ToReg),
            0x33 => modrmv!(self, XOR_ToReg),
            0x34 => Mnemonic::XOR_ALIb(self.parse_byte()),
            0x35 => Mnemonic::XOR_AXIv(self.parse_word()),
            0x36 => Mnemonic::OVERRIDE(SegmentRegister::SS),
            0x37 => Mnemonic::AAA,
            0x38 => modrmb!(self, CMP_FromReg),
            0x39 => modrmv!(self, CMP_FromReg),
            0x3A => modrmb!(self, CMP_ToReg),
            0x3B => modrmv!(self, CMP_ToReg),
            0x3C => Mnemonic::CMP_ALIb(self.parse_byte()),
            0x3D => Mnemonic::CMP_AXIv(self.parse_word()),
            0x3E => Mnemonic::OVERRIDE(SegmentRegister::DS),
            0x3F => Mnemonic::AAS,
            0x40 => Mnemonic::INC(Register::AX),
            0x41 => Mnemonic::INC(Register::CX),
            0x42 => Mnemonic::INC(Register::DX),
            0x43 => Mnemonic::INC(Register::BX),
            0x44 => Mnemonic::INC(Register::SP),
            0x45 => Mnemonic::INC(Register::BP),
            0x46 => Mnemonic::INC(Register::SI),
            0x47 => Mnemonic::INC(Register::DI),
            0x48 => Mnemonic::DEC(Register::AX),
            0x49 => Mnemonic::DEC(Register::CX),
            0x4A => Mnemonic::DEC(Register::DX),
            0x4B => Mnemonic::DEC(Register::BX),
            0x4C => Mnemonic::DEC(Register::SP),
            0x4D => Mnemonic::DEC(Register::BP),
            0x4E => Mnemonic::DEC(Register::SI),
            0x4F => Mnemonic::DEC(Register::DI),
            0x50 => Mnemonic::PUSH_R(Register::AX),
            0x51 => Mnemonic::PUSH_R(Register::CX),
            0x52 => Mnemonic::PUSH_R(Register::DX),
            0x53 => Mnemonic::PUSH_R(Register::BX),
            0x54 => Mnemonic::PUSH_R(Register::SP),
            0x55 => Mnemonic::PUSH_R(Register::BP),
            0x56 => Mnemonic::PUSH_R(Register::SI),
            0x57 => Mnemonic::PUSH_R(Register::DI),
            0x58 => Mnemonic::POP_R(Register::AX),
            0x59 => Mnemonic::POP_R(Register::CX),
            0x5A => Mnemonic::POP_R(Register::DX),
            0x5B => Mnemonic::POP_R(Register::BX),
            0x5C => Mnemonic::POP_R(Register::SP),
            0x5D => Mnemonic::POP_R(Register::BP),
            0x5E => Mnemonic::POP_R(Register::SI),
            0x5F => Mnemonic::POP_R(Register::DI),
            0x60..=0x6F => panic!("0x60 to 0x6F is considered undefined."),
            0x70 => Mnemonic::JO(self.parse_byte()),
            0x71 => Mnemonic::JNO(self.parse_byte()),
            0x72 => Mnemonic::JB(self.parse_byte()),
            0x73 => Mnemonic::JNB(self.parse_byte()),
            0x74 => Mnemonic::JZ(self.parse_byte()),
            0x75 => Mnemonic::JNZ(self.parse_byte()),
            0x76 => Mnemonic::JBE(self.parse_byte()),
            0x77 => Mnemonic::JA(self.parse_byte()),
            0x78 => Mnemonic::JS(self.parse_byte()),
            0x79 => Mnemonic::JNS(self.parse_byte()),
            0x7A => Mnemonic::JPE(self.parse_byte()),
            0x7B => Mnemonic::JPO(self.parse_byte()),
            0x7C => Mnemonic::JL(self.parse_byte()),
            0x7D => Mnemonic::JGE(self.parse_byte()),
            0x7E => Mnemonic::JLE(self.parse_byte()),
            0x7F => Mnemonic::JG(self.parse_byte()),
            0x80 => modrmgprb!(self),
            0x81 => modrmgprv!(self),
            0x82 => modrmgprb!(self), // same as 0x80
            0x83 => panic!("Sign extented GPR1 not yet implemented"),
            0x84 => modrmb!(self, TEST),
            0x85 => modrmv!(self, TEST),
            0x86 => modrmb!(self, XHCG),
            0x87 => modrmv!(self, XHCG),
            0x88 => modrmb!(self, MOV_FromReg),
            0x89 => modrmv!(self, MOV_FromReg),
            0x8A => modrmb!(self, MOV_ToReg),
            0x8B => modrmv!(self, MOV_ToReg),
            0x8C => modrms!(self, MOV_FromSReg),
            0x8D => modrmv!(self, LEA),
            0x8E => modrms!(self, MOV_ToSReg),
            0x8F => {
                let target = self.parse_modrm_byte(Operand::Word(0)).0;
                let mem = match target {
                    ModRmTarget::Memory(idx) => idx,
                    _ => panic!("POP_M instruction given a register to pop into"),
                };
                Mnemonic::POP_M(mem)
            }
            0x90 => Mnemonic::NOP(),
            0x91 => Mnemonic::XCHG_AX(Register::CX),
            0x92 => Mnemonic::XCHG_AX(Register::DX),
            0x93 => Mnemonic::XCHG_AX(Register::BX),
            0x94 => Mnemonic::XCHG_AX(Register::SP),
            0x95 => Mnemonic::XCHG_AX(Register::BP),
            0x96 => Mnemonic::XCHG_AX(Register::SI),
            0x97 => Mnemonic::XCHG_AX(Register::DI),
            0x98 => Mnemonic::CBW,
            0x99 => Mnemonic::CWD,
            0x9A => {
                // A far pointer is encoded offset word first, segment word
                // second, so the words must be read in that order.
                let offset = self.parse_word();
                let segment = self.parse_word();
                Mnemonic::CALL(Pointer { segment, offset })
            }
            0x9B => Mnemonic::WAIT,
            0x9C => Mnemonic::PUSHF,
            0x9D => Mnemonic::POPF,
            0x9E => Mnemonic::SAHF,
            0x9F => Mnemonic::LAHF,
            0xCD => Mnemonic::INT(self.parse_byte()),
            0xBB => Mnemonic::MOV_BXIv(self.parse_word()),
            _ => {
                eprintln!("Encountered unknown instruction '0x{:x}'", opcode);
                eprintln!("Offset might be misaligned and data is being interpreted.");
                eprintln!("Existing to avoid further misinterpretation...");
                exit(1);
            }
        };
        println!("{}", self.instruction);
        instructions.push(self.instruction.clone());
        self.instruction = Instruction::new();
        // step from the last consumed byte onto the next opcode
        self.offset += 1;
    }
    Ok(instructions)
}
}