From df00f59b5a7b8f01ae4ab7f4cff92a55c30b39a7 Mon Sep 17 00:00:00 2001 From: Marco Thomas Date: Thu, 8 May 2025 20:18:02 +0900 Subject: [PATCH] ft: implement disasm in own struct This makes it easier to implement each opcode, as the offset calculation and recovery of raw read bytes are internalized. --- src/disasm.rs | 141 ++++++++++++++++++++++---------------------- src/instructions.rs | 14 ++--- 2 files changed, 76 insertions(+), 79 deletions(-) diff --git a/src/disasm.rs b/src/disasm.rs index 7f8b823..d573aeb 100644 --- a/src/disasm.rs +++ b/src/disasm.rs @@ -38,9 +38,8 @@ pub fn disasm(args: &Args) -> Result<Vec<Instruction>, DisasmError> { log::debug!("{:?}", aout); - let instructions = decode_instructions(&aout)?; - - Ok(instructions) + let mut disasm = Disassembler::new(aout); + disasm.decode_instructions() } /// Read a filepath into a u8 buffer. @@ -56,97 +55,53 @@ fn path_to_buf(args: &Args) -> Result<Vec<u8>, DisasmError> { Ok(buf) } -/// Decode instructions from the text section of the provided binary -fn decode_instructions(aout: &Aout) -> Result<Vec<Instruction>, DisasmError> { - // naive approach: - // 1. read byte - // 2. pattern match to see which instruction it is - // 3. read as many bytes as this instruction needs (registers, immidiates, ...) 
- // repeat until no bytes left - - let mut instructions = Vec::new(); - let mut disassembler = Disassembler { - offset: 0, - text: aout.text.clone(), - }; - - while disassembler.offset < disassembler.text.len() { - let mut instr = Instruction::new(); - instr.start = disassembler.offset; - - let opcode = disassembler.text[disassembler.offset]; - instr.raw.push(opcode); - match opcode { - // ADD - 0x00 => { - let (mem_index, mut raw) = disassembler.parse_modrm_byte(); - let reg = disassembler.parse_byte(); - instr.raw.append(&mut raw); - instr.raw.push(reg); - instr.opcode = Opcode::ADD_EbGb(mem_index, Register::by_id(reg)); - } - // INT - 0xCD => { - let byte = disassembler.parse_byte(); - instr.raw.push(byte); - instr.opcode = Opcode::INT(ImmediateByte(byte)); - } - // MOV - 0xBB => { - let (word, raw) = disassembler.parse_word(); - instr.raw.push(raw.0); - instr.raw.push(raw.1); - instr.opcode = Opcode::MOV_BXIv(Register::BX, ImmediateWord(word)); - } - _ => { - eprintln!("Encountered unknown instruction '0x{:x}'", opcode); - eprintln!("Offset might be misaligned and data is being interpreted."); - eprintln!("Existing to avoid further misinterpretation..."); - exit(1); - } - }; - - println!("{}", instr); - instructions.push(instr); - } - - Ok(instructions) -} - #[derive(Debug)] struct Disassembler { - pub offset: usize, - pub text: Vec<u8>, + pub offset: usize, // the current offset in the disasm process + pub text: Vec<u8>, // the aout binary + pub instruction: Instruction, // the instruction, which is currently being parsed } impl Disassembler { + pub fn new(aout: Aout) -> Self { + Disassembler { + offset: 0, + text: aout.text, + instruction: Instruction::new(), + } + } + /// Parse a single byte of binary, return it and advance the offset. /// Returns the read byte. 
pub fn parse_byte(&mut self) -> u8 { self.offset += 1; let byte = self.text[self.offset]; self.offset += 1; + self.instruction.raw.push(byte); byte } + /// Parse a single word of binary, return it and advance the offset. - /// Returns the read word and a tuple of the read raw bytes - pub fn parse_word(&mut self) -> (u16, (u8, u8)) { + /// Returns the read word. + pub fn parse_word(&mut self) -> u16 { self.offset += 1; let byte1 = self.text[self.offset]; let byte2 = self.text[self.offset + 1]; self.offset += 2; - (u16::from_le_bytes([byte1, byte2]), (byte1, byte2)) + self.instruction.raw.push(byte1); + self.instruction.raw.push(byte2); + u16::from_le_bytes([byte1, byte2]) } + /// Parse a single modrm byte, return the resulting MemoryIndex and advance the offset. /// Returns the parsed modrm memory access, as well as all read raw bytes - pub fn parse_modrm_byte(&mut self) -> (MemoryIndex, Vec<u8>) { + pub fn parse_modrm_byte(&mut self) -> MemoryIndex { // Calculate ModRM byte with bitmask let opcode = self.text[self.offset]; let modulo = opcode >> 6; let reg = (opcode >> 3) & 7; let rm = opcode & 7; - let mut displacement_raw = Vec::new(); let displacement = match modulo { 0 => { if rm == 6 { @@ -158,15 +113,12 @@ impl Disassembler { 1 => { self.offset += 2; // one additional byte was read let byte = self.parse_byte(); - displacement_raw.push(byte); log::debug!("Additional byte during ModRM parsing was read."); Some(Displacement::Byte(byte)) } 2 => { self.offset += 3; // two additional bytes (word) was read - let (word, raw) = self.parse_word(); - displacement_raw.push(raw.0); - displacement_raw.push(raw.1); + let word = self.parse_word(); log::debug!("Additional two bytes during ModRM parsing was read."); Some(Displacement::Word(word)) } @@ -218,6 +170,51 @@ impl Disassembler { _ => panic!("Invalid ModRM byte encountered"), }; - return (index, displacement_raw); + index + } + + /// Decode instructions from the text section of the provided binary + pub fn 
decode_instructions(&mut self) -> Result<Vec<Instruction>, DisasmError> { + // naive approach: + // 1. read byte + // 2. pattern match to see which instruction it is + // 3. read as many bytes as this instruction needs (registers, immediates, ...) + // repeat until no bytes left + + let mut instructions = Vec::new(); + + while self.offset < self.text.len() { + self.instruction.start = self.offset; + + let opcode = self.text[self.offset]; + + // additional raw bytes will be pushed by parse functions + self.instruction.raw.push(opcode); + self.instruction.opcode = match opcode { + // ADD + 0x00 => { + Opcode::ADD_EbGb(self.parse_modrm_byte(), Register::by_id(self.parse_byte())) + } + // INT + 0xCD => Opcode::INT(ImmediateByte(self.parse_byte())), + // MOV + 0xBB => Opcode::MOV_BXIv(Register::BX, ImmediateWord(self.parse_word())), + _ => { + eprintln!( + "Encountered unknown instruction '0x{:x}'", + opcode + ); + eprintln!("Offset might be misaligned and data is being interpreted."); + eprintln!("Existing to avoid further misinterpretation..."); + exit(1); + } + }; + + println!("{}", self.instruction); + instructions.push(self.instruction.clone()); + self.instruction = Instruction::new(); + } + + Ok(instructions) } } diff --git a/src/instructions.rs b/src/instructions.rs index 25efa93..57ec2fc 100644 --- a/src/instructions.rs +++ b/src/instructions.rs @@ -6,7 +6,7 @@ pub type b = u8; #[allow(non_camel_case_types)] pub type w = u16; -#[derive(Debug)] +#[derive(Debug, Clone)] #[allow(dead_code)] /// A single 'line' of executable ASM is called an Instruction, which /// contains the `Opcode` that will be executed, alongside its starting offset @@ -37,7 +37,7 @@ impl fmt::Display for Instruction { } } -#[derive(Debug)] +#[derive(Debug, Clone)] #[allow(dead_code, non_camel_case_types)] pub enum Opcode { NOP(), @@ -61,7 +61,7 @@ impl fmt::Display for Opcode { } /// Registers of a 8086 processor -#[derive(Debug)] +#[derive(Debug, Clone)] #[allow(dead_code)] pub enum Register { AX, 
@@ -167,11 +167,11 @@ impl fmt::Display for SegmentRegister { } /// An immediate byte value for an instruction. -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct ImmediateByte(pub b); /// An immediate word value for an instruction -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct ImmediateWord(pub w); macro_rules! impl_display_and_lowerhex { @@ -195,7 +195,7 @@ impl_display_and_lowerhex!(ImmediateWord); /// A memory index operand is usually created by ModRM bytes or words. /// e.g. [bx+si] -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct MemoryIndex { pub base: Option, pub index: Option, @@ -226,7 +226,7 @@ impl fmt::Display for MemoryIndex { } } -#[derive(Debug)] +#[derive(Debug, Clone)] #[allow(dead_code)] /// Displacement for ModRM pub enum Displacement {