chore: whole swoop of enhanced documentation

This commit is contained in:
2025-05-28 09:41:40 +09:00
parent 322a276617
commit 0893969f4e
7 changed files with 210 additions and 173 deletions

View File

@@ -9,7 +9,7 @@ use crate::{
Args,
instructions::{Instruction, Mnemonic},
};
use crate::{modrm_instruction_sregister, modrm_instruction_wordwidth, modrm_target_bytewidth};
use crate::{modrm_8b_register, modrm_16b_register, modrm_sregister};
use core::fmt;
use std::{fs::File, io::Read, process::exit};
@@ -103,8 +103,37 @@ impl Disassembler {
}
}
/// Parse a single byte of binary, return it and advance the offset.
/// Returns the read byte.
/// Run the disassembly and wrap some error handling around the actual
/// decoding function.
/// Returns the decoded instructions. A [`DisasmError::EndOfTextSection`] is
/// treated as a warning, every other error is printed and handed back.
pub fn disassemble(&mut self) -> Result<Vec<Instruction>, DisasmError> {
    let decode_result = self.decode_instructions();

    // a.out pads the text section for alignment, so the falsely interpreted
    // trailing instructions have to be removed.
    self.remove_trailing_padding();

    // Take the instructions from the disassembler object rather than from
    // the decode function, so warning-type errors can still yield them.
    let decoded = self.instructions.clone();

    match decode_result {
        Ok(_) => Ok(decoded),
        // not fatal, so it is passed through as a warning
        Err(DisasmError::EndOfTextSection) => {
            log::debug!("Solo padded 0-byte at end of file was found. Ignoring.");
            Ok(decoded)
        }
        Err(e) => {
            println!("Encountered error during disassembly: {e}");
            Err(e)
        }
    }
}
/// Parse a single byte of the binary and advance the offset.
/// Returns the read byte (Intel b operand).
fn parse_byte(&mut self) -> Result<Byte, DisasmError> {
log::debug!("Attempting to parse byte at {:#04x} ...", self.offset);
// check if the byte would be out of bounds
@@ -130,9 +159,9 @@ impl Disassembler {
Ok(*byte)
}
/// Parse a single word of binary.
/// Parse a single word of the binary and advance the offset.
/// Just a wrapper for parsing a byte twice.
/// Returns the read word.
/// Returns the read word (Intel w/v operand).
fn parse_word(&mut self) -> Result<Word, DisasmError> {
log::debug!("Attempting to parse word at {:#04x} ...", self.offset);
let byte1 = self.parse_byte()?;
@@ -140,9 +169,10 @@ impl Disassembler {
Ok(u16::from_le_bytes([byte1, byte2]))
}
/// Parse a single byte of binary and interpret as as signed.
/// The isize contains a relative offset to be added to the address
/// of the subsequent instruction.
/// Parse a single byte of the binary, interpret it as signed and advance
/// the offset.
/// Returns the read byte added to the address of the subsequent instruction
/// to act as a relative offset (Intel Jb operand).
fn parse_j_byte(&mut self) -> Result<isize, DisasmError> {
log::debug!("Attempting to parse Jb at {:#04x} ...", self.offset);
// first interpret as 2-complement, then cast for addition
@@ -155,9 +185,10 @@ impl Disassembler {
Ok(byte + next_addr)
}
/// Parse a single byte of binary and interpret as signed.
/// The isize contains a relative offset to be added to the address
/// of the subsequent instruction.
/// Parse a word of the binary, interpret it as signed and advance the
/// offset.
/// Returns the read word added to the address of the subsequent instruction
/// to act as a relative offset (Intel Jw/Jv operand).
pub fn parse_j_word(&mut self) -> Result<isize, DisasmError> {
log::debug!("Attempting to parse Jv at {:#04x} ...", self.offset);
// first interpret as 2-complement, then cast for addition
@@ -170,7 +201,10 @@ impl Disassembler {
Ok(word + next_addr)
}
/// Parse a pointer type.
/// Parse a single pointer of the binary and advance the offset.
/// Just a wrapper for parsing a byte 4 times and constructing a pointer
/// type.
/// Returns the read pointer (Intel p operand).
fn parse_ptr(&mut self) -> Result<Pointer, DisasmError> {
log::debug!("Attempting to parse pointer at {:#04x} ...", self.offset);
let byte0 = self.parse_byte()?;
@@ -185,24 +219,30 @@ impl Disassembler {
})
}
/// Takes in a modrm byte and returns mod, reg and r/m.
fn deconstruct_modrm_byte(modrm: u8) -> (u8, u8, u8) {
let mode = (modrm >> 6) & 0b11;
let reg = (modrm >> 3) & 0b111;
let rm = modrm & 0b111;
(mode, reg, rm)
/// Parse an Mp operand (memory pointer).
/// An Mp consists of a ModRM byte whose `reg` bits are ignored, followed by
/// 2 [`Word`]s which get parsed into a [`Pointer`].
/// Returns the [`ModRmTarget`] together with the parsed [`Pointer`].
fn parse_mp(&mut self) -> Result<(ModRmTarget, Pointer), DisasmError> {
    // only the target is needed, the `reg` bitfield is discarded
    let target = self.parse_modrm_byte(Operand::Byte(0))?.0;
    let pointer = self.parse_ptr()?;
    Ok((target, pointer))
}
/// Parse a single modrm byte, return the resulting MemoryIndex and advance the offset.
/// Returns the parsed modrm target and the source register
/// Parse a single ModRM byte, calculate the [`ModRmTarget`] (Memory or
/// Register) from that byte and advance the offset.
/// It is always just a single byte, even for word-width instructions.
/// Returns the [`ModRmTarget`] (either memory or a register) as well as the
/// `reg` bitfield, which will later be used to determine another register
/// or even mnemonic in the group-type instructions.
fn parse_modrm_byte(
&mut self,
register_width: Operand,
) -> Result<(ModRmTarget, RegisterId), DisasmError> {
let modrm = self.parse_byte()?;
let (mode, reg, rm) = Self::deconstruct_modrm_byte(modrm);
let mode = (modrm >> 6) & 0b11;
let reg = (modrm >> 3) & 0b111;
let rm = modrm & 0b111;
log::debug!(
"{:#04x} deconstructed into: {:#b}, {:#b}, {:#b}",
@@ -304,17 +344,17 @@ impl Disassembler {
Ok((ModRmTarget::Memory(index), reg))
}
/// Match the modrm reg bits to the GPR1 mnemonics.
/// Group 1 always have an ModRM target (all modrm bits, without reg) as
/// first and an imm value as second operand (which has to be parsed before
/// call to this function), but is available in both Byte and Word length.
/// Match the ModRM `reg` bitfield to Intel Group 1-type instructions. Group
/// 1 always has a [`ModRmTarget`] as first and an immediate value as second
/// operand. The immediate is carried inside `instruction_width`, which also
/// selects the byte- or word-width variant of the mnemonic.
fn modrm_reg_to_grp1(
modrm_reg_byte: u8,
reg: u8,
target: ModRmTarget,
register_id: Operand,
instruction_width: Operand,
) -> Result<Mnemonic, DisasmError> {
match register_id {
Operand::Byte(b) => match modrm_reg_byte {
match instruction_width {
Operand::Byte(b) => match reg {
0b000 => Ok(Mnemonic::ADD_Ib(target, b)),
0b001 => Ok(Mnemonic::OR_Ib(target, b)),
0b010 => Ok(Mnemonic::ADC_Ib(target, b)),
@@ -323,9 +363,9 @@ impl Disassembler {
0b101 => Ok(Mnemonic::SUB_Ib(target, b)),
0b110 => Ok(Mnemonic::XOR_Ib(target, b)),
0b111 => Ok(Mnemonic::CMP_Ib(target, b)),
_ => return Err(DisasmError::IllegalGroupMnemonic(1, modrm_reg_byte)),
_ => return Err(DisasmError::IllegalGroupMnemonic(1, reg)),
},
Operand::Word(w) => match modrm_reg_byte {
Operand::Word(w) => match reg {
0b000 => Ok(Mnemonic::ADD_Iv(target, w)),
0b001 => Ok(Mnemonic::OR_Iv(target, w)),
0b010 => Ok(Mnemonic::ADC_Iv(target, w)),
@@ -334,15 +374,16 @@ impl Disassembler {
0b101 => Ok(Mnemonic::SUB_Iv(target, w)),
0b110 => Ok(Mnemonic::XOR_Iv(target, w)),
0b111 => Ok(Mnemonic::CMP_Iv(target, w)),
_ => return Err(DisasmError::IllegalGroupMnemonic(1, modrm_reg_byte)),
_ => return Err(DisasmError::IllegalGroupMnemonic(1, reg)),
},
}
}
/// Match the modrm reg bits to the GPR2 mnemonics.
/// Group 2 only has a single operand, the other one is either a constant
/// 1 (not present in the binary) or the CL register.
/// This function assumes the operand to be 1
/// Match the ModRM `reg` bits to Intel Group 2-type instructions. Group 2
/// always only has a single operand, the other is either `1` or the `CL`
/// register.
/// This function assumes the operand to be `1`.
/// See [`Self::modrm_reg_to_grp2_cl`] for the counterpart.
fn modrm_reg_to_grp2_1(reg: u8, target: ModRmTarget) -> Result<Mnemonic, DisasmError> {
match reg {
0b000 => Ok(Mnemonic::ROL_b(target, 1)),
@@ -357,10 +398,11 @@ impl Disassembler {
}
}
/// Match the modrm reg bits to the GPR2 mnemonics.
/// Group 2 only has a single operand, the other one is either a constant
/// 1 (not present in the binary) or the CL register.
/// This function assumes the operand to be CL register.
/// Match the ModRM `reg` bits to Intel Group 2-type instructions. Group 2
/// always only has a single operand, the other is either `1` or the `CL`
/// register.
/// This function assumes the operand to be [`Register::CL`].
/// See [`Self::modrm_reg_to_grp2_1`] for the counterpart.
fn modrm_reg_to_grp2_cl(reg: u8, target: ModRmTarget) -> Result<Mnemonic, DisasmError> {
match reg {
0b000 => Ok(Mnemonic::ROL_fromReg(target, Register::CL)),
@@ -375,9 +417,9 @@ impl Disassembler {
}
}
/// Match the modrm reg bits to the GPR3a/b mnemonics.
/// Group 3 only has a single operand, which is the ModRmTarget selected
/// by modrm bits.
/// Match the ModRM `reg` bits to Intel Group 3a/b-type instructions.
/// Group 3 selects a unary mnemonic with the `reg` bitfield. The operand
/// is the [`ModRmTarget`].
fn modrm_reg_to_grp3(
&mut self,
reg: u8,
@@ -400,21 +442,12 @@ impl Disassembler {
}
}
/// Parse an Mp operand (memory pointer).
/// The ModRM byte is read first and its `reg` bits are dropped, afterwards
/// 2 words are read and turned into a `Pointer`.
/// Returns the ModRM target paired with that `Pointer`.
fn modrm_mp(&mut self) -> Result<(ModRmTarget, Pointer), DisasmError> {
    let (modrm_target, _) = self.parse_modrm_byte(Operand::Byte(0))?;
    self.parse_ptr().map(|pointer| (modrm_target, pointer))
}
/// a.out pads the text section with 0x00 bytes. During parsing, these get
/// interpreted as `0x00 0x00`, which have to get removed for an authentic
/// disassembly.
/// This is done in favor of removing all 0x00 bytes in the beginning,
/// as this could remove an actual 0x00 byte as operand of the final
/// instruction. Of course, this could remove an actual `0x00 0x00`
/// as this could remove an actual `0x00` byte as operand of the final
/// real instruction. Of course, this could remove an actual `0x00 0x00`
/// instruction from the end, but they would not have any effect on
/// execution anyway.
fn remove_trailing_padding(&mut self) {
@@ -441,33 +474,10 @@ impl Disassembler {
self.instructions.truncate(until);
}
/// Start the disassembly and wrap some error handling around the actual
/// decoding function.
/// Warning-type errors still return the instructions decoded so far, all
/// other errors are printed and returned.
pub fn disassemble(&mut self) -> Result<Vec<Instruction>, DisasmError> {
    let outcome = self.decode_instructions();

    // a.out pads the text section for alignment, so the falsely interpreted
    // instructions have to be removed.
    self.remove_trailing_padding();
    let instructions = self.instructions.clone();

    let e = match outcome {
        Ok(_) => return Ok(instructions),
        Err(e) => e,
    };

    // warning-type errors are allowed to pass through, as they are not fatal
    if let DisasmError::EndOfTextSection = e {
        log::debug!("Solo padded 0-byte at end of file was found. Ignoring.");
        return Ok(instructions);
    }

    println!("Encountered error during disassembly: {e}");
    Err(e)
}
/// Decode instructions by matching their byte signature to their mnemonics.
/// Decode instructions by matching byte signature to their mnemonics and
/// depending on the instruction, parsing some operands afterwards.
/// All parsing is done in encapsulated functions, here everything just
/// gets consolidated.
fn decode_instructions(&mut self) -> Result<(), DisasmError> {
log::debug!("Starting to decode text of length {}", self.text.len());
while self.offset < self.text.len() {
@@ -482,20 +492,20 @@ impl Disassembler {
self.instruction.raw.push(opcode);
self.instruction.opcode = match opcode {
0x00 => modrm_target_bytewidth!(self, ADD_FromReg),
0x01 => modrm_instruction_wordwidth!(self, ADD_FromReg),
0x02 => modrm_target_bytewidth!(self, ADD_ToReg),
0x03 => modrm_instruction_wordwidth!(self, ADD_ToReg),
0x00 => modrm_8b_register!(self, ADD_FromReg),
0x01 => modrm_16b_register!(self, ADD_FromReg),
0x02 => modrm_8b_register!(self, ADD_ToReg),
0x03 => modrm_16b_register!(self, ADD_ToReg),
0x04 => Mnemonic::ADD_ALIb(self.parse_byte()?),
0x05 => Mnemonic::ADD_AXIv(self.parse_word()?),
0x06 => Mnemonic::PUSH_S(SegmentRegister::ES),
0x07 => Mnemonic::POP_S(SegmentRegister::ES),
0x08 => modrm_target_bytewidth!(self, OR_FromReg),
0x09 => modrm_instruction_wordwidth!(self, OR_FromReg),
0x0A => modrm_target_bytewidth!(self, OR_ToReg),
0x0B => modrm_instruction_wordwidth!(self, OR_ToReg),
0x08 => modrm_8b_register!(self, OR_FromReg),
0x09 => modrm_16b_register!(self, OR_FromReg),
0x0A => modrm_8b_register!(self, OR_ToReg),
0x0B => modrm_16b_register!(self, OR_ToReg),
0x0C => Mnemonic::OR_ALIb(self.parse_byte()?),
0x0D => Mnemonic::OR_AXIv(self.parse_word()?),
@@ -503,60 +513,60 @@ impl Disassembler {
0x0F => return Err(DisasmError::OpcodeUndefined(opcode)),
0x10 => modrm_target_bytewidth!(self, ADC_FromReg),
0x11 => modrm_instruction_wordwidth!(self, ADC_FromReg),
0x12 => modrm_target_bytewidth!(self, ADC_ToReg),
0x13 => modrm_instruction_wordwidth!(self, ADC_ToReg),
0x10 => modrm_8b_register!(self, ADC_FromReg),
0x11 => modrm_16b_register!(self, ADC_FromReg),
0x12 => modrm_8b_register!(self, ADC_ToReg),
0x13 => modrm_16b_register!(self, ADC_ToReg),
0x14 => Mnemonic::ADC_ALIb(self.parse_byte()?),
0x15 => Mnemonic::ADC_AXIv(self.parse_word()?),
0x16 => Mnemonic::PUSH_S(SegmentRegister::SS),
0x17 => Mnemonic::POP_S(SegmentRegister::SS),
0x18 => modrm_target_bytewidth!(self, SBB_FromReg),
0x19 => modrm_instruction_wordwidth!(self, SBB_FromReg),
0x1A => modrm_target_bytewidth!(self, SBB_ToReg),
0x1B => modrm_instruction_wordwidth!(self, SBB_ToReg),
0x18 => modrm_8b_register!(self, SBB_FromReg),
0x19 => modrm_16b_register!(self, SBB_FromReg),
0x1A => modrm_8b_register!(self, SBB_ToReg),
0x1B => modrm_16b_register!(self, SBB_ToReg),
0x1C => Mnemonic::SBB_ALIb(self.parse_byte()?),
0x1D => Mnemonic::SBB_AXIv(self.parse_word()?),
0x1E => Mnemonic::PUSH_S(SegmentRegister::DS),
0x1F => Mnemonic::POP_S(SegmentRegister::DS),
0x20 => modrm_target_bytewidth!(self, AND_FromReg),
0x21 => modrm_instruction_wordwidth!(self, AND_FromReg),
0x22 => modrm_target_bytewidth!(self, AND_ToReg),
0x23 => modrm_instruction_wordwidth!(self, AND_ToReg),
0x20 => modrm_8b_register!(self, AND_FromReg),
0x21 => modrm_16b_register!(self, AND_FromReg),
0x22 => modrm_8b_register!(self, AND_ToReg),
0x23 => modrm_16b_register!(self, AND_ToReg),
0x24 => Mnemonic::AND_ALIb(self.parse_byte()?),
0x25 => Mnemonic::AND_AXIv(self.parse_word()?),
0x26 => Mnemonic::OVERRIDE(SegmentRegister::ES),
0x27 => Mnemonic::DAA,
0x28 => modrm_target_bytewidth!(self, SUB_FromReg),
0x29 => modrm_instruction_wordwidth!(self, SUB_FromReg),
0x2A => modrm_target_bytewidth!(self, SUB_ToReg),
0x2B => modrm_instruction_wordwidth!(self, SUB_ToReg),
0x28 => modrm_8b_register!(self, SUB_FromReg),
0x29 => modrm_16b_register!(self, SUB_FromReg),
0x2A => modrm_8b_register!(self, SUB_ToReg),
0x2B => modrm_16b_register!(self, SUB_ToReg),
0x2C => Mnemonic::SUB_ALIb(self.parse_byte()?),
0x2D => Mnemonic::SUB_AXIv(self.parse_word()?),
0x2E => Mnemonic::OVERRIDE(SegmentRegister::CS),
0x2F => Mnemonic::DAS,
0x30 => modrm_target_bytewidth!(self, XOR_FromReg),
0x31 => modrm_instruction_wordwidth!(self, XOR_FromReg),
0x32 => modrm_target_bytewidth!(self, XOR_ToReg),
0x33 => modrm_instruction_wordwidth!(self, XOR_ToReg),
0x30 => modrm_8b_register!(self, XOR_FromReg),
0x31 => modrm_16b_register!(self, XOR_FromReg),
0x32 => modrm_8b_register!(self, XOR_ToReg),
0x33 => modrm_16b_register!(self, XOR_ToReg),
0x34 => Mnemonic::XOR_ALIb(self.parse_byte()?),
0x35 => Mnemonic::XOR_AXIv(self.parse_word()?),
0x36 => Mnemonic::OVERRIDE(SegmentRegister::SS),
0x37 => Mnemonic::AAA,
0x38 => modrm_target_bytewidth!(self, CMP_FromReg),
0x39 => modrm_instruction_wordwidth!(self, CMP_FromReg),
0x3A => modrm_target_bytewidth!(self, CMP_ToReg),
0x3B => modrm_instruction_wordwidth!(self, CMP_ToReg),
0x38 => modrm_8b_register!(self, CMP_FromReg),
0x39 => modrm_16b_register!(self, CMP_FromReg),
0x3A => modrm_8b_register!(self, CMP_ToReg),
0x3B => modrm_16b_register!(self, CMP_ToReg),
0x3C => Mnemonic::CMP_ALIb(self.parse_byte()?),
0x3D => Mnemonic::CMP_AXIv(self.parse_word()?),
@@ -642,20 +652,20 @@ impl Disassembler {
Self::modrm_reg_to_grp1(reg, target, Operand::Byte(imm))?
}
0x84 => modrm_target_bytewidth!(self, TEST),
0x85 => modrm_instruction_wordwidth!(self, TEST),
0x84 => modrm_8b_register!(self, TEST),
0x85 => modrm_16b_register!(self, TEST),
0x86 => modrm_target_bytewidth!(self, XCHG),
0x87 => modrm_instruction_wordwidth!(self, XCHG),
0x86 => modrm_8b_register!(self, XCHG),
0x87 => modrm_16b_register!(self, XCHG),
0x88 => modrm_target_bytewidth!(self, MOV_FromReg),
0x89 => modrm_instruction_wordwidth!(self, MOV_FromReg),
0x8A => modrm_target_bytewidth!(self, MOV_ToReg),
0x8B => modrm_instruction_wordwidth!(self, MOV_ToReg),
0x8C => modrm_instruction_sregister!(self, MOV_FromSReg),
0x8E => modrm_instruction_sregister!(self, MOV_ToSReg),
0x88 => modrm_8b_register!(self, MOV_FromReg),
0x89 => modrm_16b_register!(self, MOV_FromReg),
0x8A => modrm_8b_register!(self, MOV_ToReg),
0x8B => modrm_16b_register!(self, MOV_ToReg),
0x8C => modrm_sregister!(self, MOV_FromSReg),
0x8E => modrm_sregister!(self, MOV_ToSReg),
0x8D => modrm_instruction_wordwidth!(self, LEA),
0x8D => modrm_16b_register!(self, LEA),
0x8F => {
let (target, _) = self.parse_modrm_byte(Operand::Word(0))?;
@@ -734,11 +744,11 @@ impl Disassembler {
0xC3 => Mnemonic::RET,
0xC4 => {
let (target, ptr) = self.modrm_mp()?;
let (target, ptr) = self.parse_mp()?;
Mnemonic::LES(target, ptr)
}
0xC5 => {
let (target, ptr) = self.modrm_mp()?;
let (target, ptr) = self.parse_mp()?;
Mnemonic::LDS(target, ptr)
}