diff --git a/src/aout.rs b/src/aout.rs index 44ce948..80417ed 100644 --- a/src/aout.rs +++ b/src/aout.rs @@ -33,14 +33,11 @@ impl Aout { let text_start = hdr.hdrlen as usize; let text_end = text_start + hdr.text as usize; - let data_start = text_end + 1; + let data_start = text_end; let data_end = data_start + hdr.data as usize; - dbg!(&hdr); - let text_section = &buf[text_start..text_end]; - // let data_section = &buf[data_start..data_end]; - let data_section = []; + let data_section = &buf[data_start..data_end]; Aout { header: hdr, diff --git a/src/disasm.rs b/src/disasm.rs index 841526e..3a5f676 100644 --- a/src/disasm.rs +++ b/src/disasm.rs @@ -1,7 +1,5 @@ //! The main dissembling logic. -use env_logger::Target; - use crate::aout::Aout; use crate::operands::{ Byte, DWord, Displacement, IByte, IWord, MemoryIndex, ModRmTarget, Operand, Pointer, Word, @@ -11,7 +9,7 @@ use crate::{ Args, instructions::{Instruction, Mnemonic}, }; -use crate::{modrmb, modrms, modrmv}; +use crate::{modrm_instruction_sregister, modrm_instruction_wordwidth, modrm_target_bytewidth}; use core::fmt; use std::{fs::File, io::Read, process::exit}; @@ -25,7 +23,9 @@ pub enum DisasmError { IllegalModRMByteMode(u8), IllegalModRMByteIndex(u8), IllegalOperand(String), - ReadBeyondTextSection(), + ReadBeyondTextSection, + // not an error per se, it indicates a single 0x00 byte padding + EndOfTextSection, UnknownRegister(usize), } @@ -61,7 +61,7 @@ impl fmt::Display for DisasmError { modrm ), DisasmError::IllegalOperand(msg) => write!(f, "Error (Illegal operand). {}", msg), - DisasmError::ReadBeyondTextSection() => write!( + DisasmError::ReadBeyondTextSection => write!( f, "Error (Out of bounds access). Wanted to paese an additional byte, but there is no more text section.", ), @@ -69,61 +69,62 @@ impl fmt::Display for DisasmError { f, "Error (Unknown register). The register with ID {id} is unknown", ), + DisasmError::EndOfTextSection => write!(f, "Warning. End of text section reached."), } } } -/// Disassemble the binary in `path` into a vector of instructions. -/// Main entry point to the disassembly. -pub fn disasm(args: &Args) -> Result, DisasmError> { - let contents = path_to_buf(args)?; - let aout = Aout::new(contents); - - log::debug!("{:?}", aout); - - let mut disasm = Disassembler::new(aout); - disasm.decode_instructions() -} - -/// Read a filepath into a u8 buffer. -fn path_to_buf(args: &Args) -> Result, DisasmError> { - let path = args - .path - .clone() - .ok_or(DisasmError::NoFile(args.path.clone()))?; - let mut file = File::open(path)?; - let mut buf = Vec::new(); - file.read_to_end(&mut buf)?; - - Ok(buf) -} - #[derive(Debug, Clone)] pub struct Disassembler { - pub offset: usize, // the current offset in the disasm process - pub text: Vec, // the aout binary - pub instruction: Instruction, // the instruction, which is currently being parsed + pub offset: usize, // the current offset in the disasm process + pub text: Vec, // the aout binary + pub instruction: Instruction, // the instruction, which is currently being parsed + pub instructions: Vec, // all parsed instructions } impl Disassembler { - pub fn new(aout: Aout) -> Self { + pub fn new(args: &Args) -> Self { + let path = args + .path + .clone() + .ok_or(DisasmError::NoFile(args.path.clone())) + .unwrap(); + let mut file = File::open(path).unwrap(); + let mut buf = Vec::new(); + file.read_to_end(&mut buf).unwrap(); + let aout = Aout::new(buf); + log::debug!("{:?}", aout); + Disassembler { offset: 0, text: aout.text, instruction: Instruction::new(), + instructions: Vec::new(), } } /// Parse a single byte of binary, return it and advance the offset. /// Returns the read byte. - pub fn parse_byte(&mut self) -> Result { + fn parse_byte(&mut self) -> Result { log::debug!("Attempting to parse byte at {:#04x} ...", self.offset); - // advance to operand - self.offset += 1; + // check if the byte would be out of bounds + if self.offset + 1 == self.text.len() { + // check if text section ends with single 0x00 padding byte + if self.text[self.offset] == 0 { + return Err(DisasmError::EndOfTextSection); + // else its just an out of bounds read + } else { + return Err(DisasmError::ReadBeyondTextSection); + } + // if not, advance offset to next byte + } else { + self.offset += 1; + } + let byte = self .text .get(self.offset) - .ok_or(DisasmError::ReadBeyondTextSection())?; + .ok_or(DisasmError::ReadBeyondTextSection)?; log::debug!("Parsed byte {byte:#04x}"); self.instruction.raw.push(*byte); Ok(*byte) @@ -132,7 +133,7 @@ impl Disassembler { /// Parse a single word of binary. /// Just a wrapper for parsing a byte twice. /// Returns the read word. - pub fn parse_word(&mut self) -> Result { + fn parse_word(&mut self) -> Result { log::debug!("Attempting to parse word at {:#04x} ...", self.offset); let byte1 = self.parse_byte()?; let byte2 = self.parse_byte()?; @@ -142,7 +143,7 @@ impl Disassembler { /// Parse a single byte of binary and interpret as as signed. /// The isize contains a relative offset to be added to the address /// of the subsequent instruction. - pub fn parse_j_byte(&mut self) -> Result { + fn parse_j_byte(&mut self) -> Result { log::debug!("Attempting to parse Jb at {:#04x} ...", self.offset); // first interpret as 2-complement, then cast for addition let byte = self.parse_byte()? as IByte as isize; @@ -170,7 +171,7 @@ impl Disassembler { } /// Parse a pointer type. - pub fn parse_ptr(&mut self) -> Result { + fn parse_ptr(&mut self) -> Result { log::debug!("Attempting to parse pointer at {:#04x} ...", self.offset); let byte0 = self.parse_byte()?; let byte1 = self.parse_byte()?; @@ -195,7 +196,7 @@ impl Disassembler { /// Parse a single modrm byte, return the resulting MemoryIndex and advance the offset. /// Returns the parsed modrm target and the source register - pub fn parse_modrm_byte( + fn parse_modrm_byte( &mut self, register_width: Operand, ) -> Result<(ModRmTarget, RegisterId), DisasmError> { @@ -307,7 +308,7 @@ impl Disassembler { /// Group 1 always have an ModRM target (all modrm bits, without reg) as /// first and an imm value as second operand (which has to be parsed before /// call to this function), but is available in both Byte and Word length. - pub fn modrm_reg_to_grp1( + fn modrm_reg_to_grp1( modrm_reg_byte: u8, target: ModRmTarget, register_id: Operand, @@ -342,7 +343,7 @@ impl Disassembler { /// Group 2 only has a single operand, the other one is either a constant /// 1 (not present in the binary) or the CL register. /// This function assumes the operand to be 1 - pub fn modrm_reg_to_grp2_1(reg: u8, target: ModRmTarget) -> Result { + fn modrm_reg_to_grp2_1(reg: u8, target: ModRmTarget) -> Result { match reg { 0b000 => Ok(Mnemonic::ROL_b(target, 1)), 0b001 => Ok(Mnemonic::ROR_b(target, 1)), @@ -360,7 +361,7 @@ impl Disassembler { /// Group 2 only has a single operand, the other one is either a constant /// 1 (not present in the binary) or the CL register. /// This function assumes the operand to be CL register. - pub fn modrm_reg_to_grp2_cl(reg: u8, target: ModRmTarget) -> Result { + fn modrm_reg_to_grp2_cl(reg: u8, target: ModRmTarget) -> Result { match reg { 0b000 => Ok(Mnemonic::ROL_fromReg(target, Register::CL)), 0b001 => Ok(Mnemonic::ROR_fromReg(target, Register::CL)), @@ -377,7 +378,7 @@ impl Disassembler { /// Match the modrm reg bits to the GPR3a/b mnemonics. /// Group 3 only has a single operand, which is the ModRmTarget selected /// by modrm bits. - pub fn modrm_reg_to_grp3( + fn modrm_reg_to_grp3( &mut self, reg: u8, target: ModRmTarget, @@ -402,46 +403,99 @@ impl Disassembler { /// Parse an Mp Operand (Memory Pointer). /// An Mp is a ModRM byte with the `reg` bits ignored and an additional /// 2 words parsed for a `Pointer` type. - pub fn modrm_mp(&mut self) -> Result<(ModRmTarget, Pointer), DisasmError> { + fn modrm_mp(&mut self) -> Result<(ModRmTarget, Pointer), DisasmError> { let (target, _) = self.parse_modrm_byte(Operand::Byte(0))?; let ptr = self.parse_ptr()?; Ok((target, ptr)) } - /// Decode instructions from the text section of the provided binary - pub fn decode_instructions(&mut self) -> Result, DisasmError> { - // naive approach: - // 1. read byte - // 2. pattern match to see which instruction it is - // 3. read as many bytes as this instruction needs (registers, immidiates, ...) - // repeat until no bytes left + /// a.out pads the text section with 0x00 bytes. During parsing, these get + /// interpreted as `0x00 0x00`, which have to get removed for an authentic + /// disassembly. + /// This is done in favor of removing all 0x00 bytes in the beginning, + /// as this could remove an actual 0x00 byte as operand of the final + /// instruction. Of course, this could remove an actual `0x00 0x00` + /// instruction from the end, but they would not have any effect on + /// execution anyway. + fn remove_trailing_padding(&mut self) { + let mut until = self.instructions.len(); + for i in self.instructions.iter().rev() { + match i.opcode { + // 0x00 0x00 in binary + Mnemonic::ADD_FromReg( + ModRmTarget::Memory(MemoryIndex { + base: Some(Register::BX), + index: Some(Register::SI), + displacement: None, + }), + Register::AL, + ) => until -= 1, + // stop when another instruction is hit + _ => break, + } + } + log::debug!( + "Truncated file by {} bytes by removing trailing padding bytes.", + self.text.len() - until + ); + self.instructions.truncate(until); + } - let mut instructions = Vec::new(); + /// Start the disassmble and allow for some error handling wrapped around + /// the actual decoding function. + pub fn disassemble(&mut self) -> Result, DisasmError> { + let parsing = self.decode_instructions(); + // a.out pads the text section to byte align, so the fasely interpreted + // instructions have to be removed. + self.remove_trailing_padding(); + let instructions = self.instructions.clone(); + + // allow for warning-type errors to pass through, as they are not fatal + match parsing { + Ok(_) => Ok(instructions), + Err(e) => match e { + DisasmError::EndOfTextSection => { + log::debug!("Solo padded 0-byte at end of file was found. Ignoring."); + Ok(instructions) + } + _ => { + println!("Encountered error during disassembly: {e}"); + Err(e) + } + }, + } + } + + /// Decode instructions by matching their byte signature to their mnemonics. + fn decode_instructions(&mut self) -> Result<(), DisasmError> { log::debug!("Starting to decode text of length {}", self.text.len()); while self.offset < self.text.len() { + // reset mutable current instruction + self.instruction = Instruction::new(); self.instruction.start = self.offset; + // fetch next opcode let opcode = self.text[self.offset]; // additional raw bytes will be pushed by parse functions self.instruction.raw.push(opcode); - // XXX: convert this copy and paste horror into a proc macro + self.instruction.opcode = match opcode { - 0x00 => modrmb!(self, ADD_FromReg), - 0x01 => modrmv!(self, ADD_FromReg), - 0x02 => modrmb!(self, ADD_ToReg), - 0x03 => modrmv!(self, ADD_ToReg), + 0x00 => modrm_target_bytewidth!(self, ADD_FromReg), + 0x01 => modrm_instruction_wordwidth!(self, ADD_FromReg), + 0x02 => modrm_target_bytewidth!(self, ADD_ToReg), + 0x03 => modrm_instruction_wordwidth!(self, ADD_ToReg), 0x04 => Mnemonic::ADD_ALIb(self.parse_byte()?), 0x05 => Mnemonic::ADD_AXIv(self.parse_word()?), 0x06 => Mnemonic::PUSH_S(SegmentRegister::ES), 0x07 => Mnemonic::POP_S(SegmentRegister::ES), - 0x08 => modrmb!(self, OR_FromReg), - 0x09 => modrmv!(self, OR_FromReg), - 0x0A => modrmb!(self, OR_ToReg), - 0x0B => modrmv!(self, OR_ToReg), + 0x08 => modrm_target_bytewidth!(self, OR_FromReg), + 0x09 => modrm_instruction_wordwidth!(self, OR_FromReg), + 0x0A => modrm_target_bytewidth!(self, OR_ToReg), + 0x0B => modrm_instruction_wordwidth!(self, OR_ToReg), 0x0C => Mnemonic::OR_ALIb(self.parse_byte()?), 0x0D => Mnemonic::OR_AXIv(self.parse_word()?), @@ -449,60 +503,60 @@ impl Disassembler { 0x0F => return Err(DisasmError::OpcodeUndefined(opcode)), - 0x10 => modrmb!(self, ADC_FromReg), - 0x11 => modrmv!(self, ADC_FromReg), - 0x12 => modrmb!(self, ADC_ToReg), - 0x13 => modrmv!(self, ADC_ToReg), + 0x10 => modrm_target_bytewidth!(self, ADC_FromReg), + 0x11 => modrm_instruction_wordwidth!(self, ADC_FromReg), + 0x12 => modrm_target_bytewidth!(self, ADC_ToReg), + 0x13 => modrm_instruction_wordwidth!(self, ADC_ToReg), 0x14 => Mnemonic::ADC_ALIb(self.parse_byte()?), 0x15 => Mnemonic::ADC_AXIv(self.parse_word()?), 0x16 => Mnemonic::PUSH_S(SegmentRegister::SS), 0x17 => Mnemonic::POP_S(SegmentRegister::SS), - 0x18 => modrmb!(self, SBB_FromReg), - 0x19 => modrmv!(self, SBB_FromReg), - 0x1A => modrmb!(self, SBB_ToReg), - 0x1B => modrmv!(self, SBB_ToReg), + 0x18 => modrm_target_bytewidth!(self, SBB_FromReg), + 0x19 => modrm_instruction_wordwidth!(self, SBB_FromReg), + 0x1A => modrm_target_bytewidth!(self, SBB_ToReg), + 0x1B => modrm_instruction_wordwidth!(self, SBB_ToReg), 0x1C => Mnemonic::SBB_ALIb(self.parse_byte()?), 0x1D => Mnemonic::SBB_AXIv(self.parse_word()?), 0x1E => Mnemonic::PUSH_S(SegmentRegister::DS), 0x1F => Mnemonic::POP_S(SegmentRegister::DS), - 0x20 => modrmb!(self, AND_FromReg), - 0x21 => modrmv!(self, AND_FromReg), - 0x22 => modrmb!(self, AND_ToReg), - 0x23 => modrmv!(self, AND_ToReg), + 0x20 => modrm_target_bytewidth!(self, AND_FromReg), + 0x21 => modrm_instruction_wordwidth!(self, AND_FromReg), + 0x22 => modrm_target_bytewidth!(self, AND_ToReg), + 0x23 => modrm_instruction_wordwidth!(self, AND_ToReg), 0x24 => Mnemonic::AND_ALIb(self.parse_byte()?), 0x25 => Mnemonic::AND_AXIv(self.parse_word()?), 0x26 => Mnemonic::OVERRIDE(SegmentRegister::ES), 0x27 => Mnemonic::DAA, - 0x28 => modrmb!(self, SUB_FromReg), - 0x29 => modrmv!(self, SUB_FromReg), - 0x2A => modrmb!(self, SUB_ToReg), - 0x2B => modrmv!(self, SUB_ToReg), + 0x28 => modrm_target_bytewidth!(self, SUB_FromReg), + 0x29 => modrm_instruction_wordwidth!(self, SUB_FromReg), + 0x2A => modrm_target_bytewidth!(self, SUB_ToReg), + 0x2B => modrm_instruction_wordwidth!(self, SUB_ToReg), 0x2C => Mnemonic::SUB_ALIb(self.parse_byte()?), 0x2D => Mnemonic::SUB_AXIv(self.parse_word()?), 0x2E => Mnemonic::OVERRIDE(SegmentRegister::CS), 0x2F => Mnemonic::DAS, - 0x30 => modrmb!(self, XOR_FromReg), - 0x31 => modrmv!(self, XOR_FromReg), - 0x32 => modrmb!(self, XOR_ToReg), - 0x33 => modrmv!(self, XOR_ToReg), + 0x30 => modrm_target_bytewidth!(self, XOR_FromReg), + 0x31 => modrm_instruction_wordwidth!(self, XOR_FromReg), + 0x32 => modrm_target_bytewidth!(self, XOR_ToReg), + 0x33 => modrm_instruction_wordwidth!(self, XOR_ToReg), 0x34 => Mnemonic::XOR_ALIb(self.parse_byte()?), 0x35 => Mnemonic::XOR_AXIv(self.parse_word()?), 0x36 => Mnemonic::OVERRIDE(SegmentRegister::SS), 0x37 => Mnemonic::AAA, - 0x38 => modrmb!(self, CMP_FromReg), - 0x39 => modrmv!(self, CMP_FromReg), - 0x3A => modrmb!(self, CMP_ToReg), - 0x3B => modrmv!(self, CMP_ToReg), + 0x38 => modrm_target_bytewidth!(self, CMP_FromReg), + 0x39 => modrm_instruction_wordwidth!(self, CMP_FromReg), + 0x3A => modrm_target_bytewidth!(self, CMP_ToReg), + 0x3B => modrm_instruction_wordwidth!(self, CMP_ToReg), 0x3C => Mnemonic::CMP_ALIb(self.parse_byte()?), 0x3D => Mnemonic::CMP_AXIv(self.parse_word()?), @@ -588,20 +642,20 @@ impl Disassembler { Self::modrm_reg_to_grp1(reg, target, Operand::Byte(imm))? } - 0x84 => modrmb!(self, TEST), - 0x85 => modrmv!(self, TEST), + 0x84 => modrm_target_bytewidth!(self, TEST), + 0x85 => modrm_instruction_wordwidth!(self, TEST), - 0x86 => modrmb!(self, XCHG), - 0x87 => modrmv!(self, XCHG), + 0x86 => modrm_target_bytewidth!(self, XCHG), + 0x87 => modrm_instruction_wordwidth!(self, XCHG), - 0x88 => modrmb!(self, MOV_FromReg), - 0x89 => modrmv!(self, MOV_FromReg), - 0x8A => modrmb!(self, MOV_ToReg), - 0x8B => modrmv!(self, MOV_ToReg), - 0x8C => modrms!(self, MOV_FromSReg), - 0x8E => modrms!(self, MOV_ToSReg), + 0x88 => modrm_target_bytewidth!(self, MOV_FromReg), + 0x89 => modrm_instruction_wordwidth!(self, MOV_FromReg), + 0x8A => modrm_target_bytewidth!(self, MOV_ToReg), + 0x8B => modrm_instruction_wordwidth!(self, MOV_ToReg), + 0x8C => modrm_instruction_sregister!(self, MOV_FromSReg), + 0x8E => modrm_instruction_sregister!(self, MOV_ToSReg), - 0x8D => modrmv!(self, LEA), + 0x8D => modrm_instruction_wordwidth!(self, LEA), 0x8F => { let (target, _) = self.parse_modrm_byte(Operand::Word(0))?; @@ -818,13 +872,15 @@ impl Disassembler { } }; - println!("{}", self.instruction); - instructions.push(self.instruction.clone()); - self.instruction = Instruction::new(); + // Save parsed instruction + log::debug!("{}", self.instruction); + self.instructions.push(self.instruction.clone()); + + // Advance offset to hover the next potential opcode self.offset += 1; } - Ok(instructions) + Ok(()) } } @@ -839,24 +895,24 @@ mod tests { offset: 0, text, instruction: Instruction::new(), + instructions: Vec::new(), }; - let instructions = disassembler.decode_instructions().ok(); - if let Some(instrs) = instructions { - assert_eq!( - instrs[0], - Instruction { - start: 0, - raw: Vec::from([0, 0]), - opcode: Mnemonic::ADD_FromReg( - ModRmTarget::Memory(MemoryIndex { - base: Some(Register::BX), - index: Some(Register::SI), - displacement: None - }), - Register::AL - ) - } - ) - } + disassembler.decode_instructions().unwrap(); + let instructions = disassembler.instructions; + assert_eq!( + instructions[0], + Instruction { + start: 0, + raw: Vec::from([0, 0]), + opcode: Mnemonic::ADD_FromReg( + ModRmTarget::Memory(MemoryIndex { + base: Some(Register::BX), + index: Some(Register::SI), + displacement: None + }), + Register::AL + ) + } + ) } } diff --git a/src/disasm_macros.rs b/src/disasm_macros.rs index 872ac00..c126806 100644 --- a/src/disasm_macros.rs +++ b/src/disasm_macros.rs @@ -2,7 +2,7 @@ #[macro_export] /// Generate a Mnemonic for an 8-bit Register from a ModRM byte. -macro_rules! modrmb { +macro_rules! modrm_target_bytewidth { ($self:ident, $variant:ident) => {{ let (target, reg) = $self.parse_modrm_byte(Operand::Byte(0))?; Mnemonic::$variant(target, Register::by_id(Operand::Byte(reg))?) @@ -11,7 +11,7 @@ macro_rules! modrmb { #[macro_export] /// Generate a Mnemonic for a 16-bit Register from a ModRM byte. -macro_rules! modrmv { +macro_rules! modrm_instruction_wordwidth { ($self:ident, $variant:ident) => {{ let (target, reg) = $self.parse_modrm_byte(Operand::Word(0))?; Mnemonic::$variant(target, Register::by_id(Operand::Word(reg.into()))?) @@ -20,7 +20,7 @@ macro_rules! modrmv { #[macro_export] /// Generate a Mnemonic for a 16-bit Segment Register from a ModRM byte. -macro_rules! modrms { +macro_rules! modrm_instruction_sregister { ($self:ident, $variant:ident) => {{ let (target, reg) = $self.parse_modrm_byte(Operand::Word(0))?; Mnemonic::$variant(target, SegmentRegister::by_id(reg)?) diff --git a/src/instructions.rs b/src/instructions.rs index e622d40..fb858d5 100644 --- a/src/instructions.rs +++ b/src/instructions.rs @@ -296,6 +296,8 @@ pub enum Mnemonic { AAD(Byte), // MISC XLAT, + // Not part of 8086: + EOT, // End of Text Section } impl fmt::Display for Mnemonic { diff --git a/src/main.rs b/src/main.rs index 95005e8..02d0933 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,4 +1,5 @@ use clap::{Parser, Subcommand}; +use disasm::Disassembler; mod aout; mod disasm; @@ -37,14 +38,12 @@ fn main() { match args.command { Command::Disasm => { - let instructions = disasm::disasm(&args); + let mut disasm = Disassembler::new(&args); + let instructions = disasm.disassemble(); match instructions { - Err(e) => { - println!("(undefined)"); - println!("Encountered error during parsing: {e}") - } + Ok(instrs) => instrs.iter().for_each(|i| println!("{i}")), _ => {} - }; + } } _ => panic!("Command not yet implemented"), }