diff --git a/src/aout.rs b/src/aout.rs index 80a6979..4c8451e 100644 --- a/src/aout.rs +++ b/src/aout.rs @@ -5,6 +5,7 @@ pub type c_long = i32; // we use a a.out with 32 byte #[derive(Debug)] #[allow(dead_code)] +/// Internal representation of the a.out binary format. pub struct Aout { pub header: Header, pub text: Vec, diff --git a/src/decode.rs b/src/decode.rs deleted file mode 100644 index 8b13789..0000000 --- a/src/decode.rs +++ /dev/null @@ -1 +0,0 @@ - diff --git a/src/disasm.rs b/src/disasm.rs index f3fbd3f..7f8b823 100644 --- a/src/disasm.rs +++ b/src/disasm.rs @@ -2,13 +2,14 @@ use core::fmt; use std::{fs::File, io::Read, process::exit}; use crate::aout::Aout; -use crate::instructions::MemoryIndex; +use crate::instructions::{Displacement, MemoryIndex}; use crate::{ Args, - instructions::{ImmediateByte, ImmediateWord, Instruction, MetaInstruction, Register}, + instructions::{ImmediateByte, ImmediateWord, Instruction, Opcode, Register}, }; #[derive(Debug)] +/// Generic errors, which are encountered during parsing. pub enum DisasmError { NoFile(Option), IoError(std::io::Error), @@ -29,13 +30,12 @@ impl fmt::Display for DisasmError { } } -/// Disassemble the binary in `path` into a vector of instructions -/// This function just calls all other -pub fn disasm(args: &Args) -> Result, DisasmError> { +/// Disassemble the binary in `path` into a vector of instructions. +/// Main entry point to the disassembly. +pub fn disasm(args: &Args) -> Result, DisasmError> { let contents = path_to_buf(args)?; let aout = Aout::new(contents); - // XXX: 00 is just 0, maybe this could be a problem? log::debug!("{:?}", aout); let instructions = decode_instructions(&aout)?; @@ -43,7 +43,7 @@ pub fn disasm(args: &Args) -> Result, DisasmError> { Ok(instructions) } -/// Read a filepath into a buffer +/// Read a filepath into a u8 buffer. fn path_to_buf(args: &Args) -> Result, DisasmError> { let path = args .path @@ -57,7 +57,7 @@ fn path_to_buf(args: &Args) -> Result, DisasmError> { } /// Decode instructions from the text section of the provided binary -fn decode_instructions(aout: &Aout) -> Result, DisasmError> { +fn decode_instructions(aout: &Aout) -> Result, DisasmError> { // naive approach: // 1. read byte // 2. pattern match to see which instruction it is @@ -65,38 +65,38 @@ fn decode_instructions(aout: &Aout) -> Result, DisasmError> // repeat until no bytes left let mut instructions = Vec::new(); - let mut offset = 0; + let mut disassembler = Disassembler { + offset: 0, + text: aout.text.clone(), + }; - let text = &aout.text; - while offset < aout.text.len() { - let mut instr = MetaInstruction::new(); - instr.start = offset; + while disassembler.offset < disassembler.text.len() { + let mut instr = Instruction::new(); + instr.start = disassembler.offset; - let opcode = text[offset]; + let opcode = disassembler.text[disassembler.offset]; + instr.raw.push(opcode); match opcode { // ADD 0x00 => { - let (mem_index, mut raw) = parse_modrm_byte(&mut offset, text); - let reg = parse_byte(&mut offset, text); - instr.size = 2 + raw.len(); - instr.raw = Vec::from([opcode]); + let (mem_index, mut raw) = disassembler.parse_modrm_byte(); + let reg = disassembler.parse_byte(); instr.raw.append(&mut raw); instr.raw.push(reg); - instr.instruction = Instruction::ADD_EbGb(mem_index, Register::by_id(reg)); + instr.opcode = Opcode::ADD_EbGb(mem_index, Register::by_id(reg)); } // INT 0xCD => { - let byte = parse_byte(&mut offset, text); - instr.size = 2; - instr.raw = Vec::from([opcode, byte]); - instr.instruction = Instruction::INT(ImmediateByte(byte)); + let byte = disassembler.parse_byte(); + instr.raw.push(byte); + instr.opcode = Opcode::INT(ImmediateByte(byte)); } // MOV 0xBB => { - let (word, raw) = parse_word(&mut offset, text); - instr.size = 3; - instr.raw = Vec::from([opcode, raw.0, raw.1]); - instr.instruction = Instruction::MOV_BXIv(Register::BX, ImmediateWord(word)); + let (word, raw) = disassembler.parse_word(); + instr.raw.push(raw.0); + instr.raw.push(raw.1); + instr.opcode = Opcode::MOV_BXIv(Register::BX, ImmediateWord(word)); } _ => { eprintln!("Encountered unknown instruction '0x{:x}'", opcode); @@ -113,117 +113,111 @@ fn decode_instructions(aout: &Aout) -> Result, DisasmError> Ok(instructions) } -/// Parse a single byte of binary, return it and advance the offset. -pub fn parse_byte(offset: &mut usize, text: &Vec) -> u8 { - *offset += 1; - let byte = text[*offset]; - *offset += 1; - byte -} -/// Parse a single word of binary, return it and advance the offset. -pub fn parse_word(offset: &mut usize, text: &Vec) -> (u16, (u8, u8)) { - *offset += 1; - let byte1 = text[*offset]; - let byte2 = text[*offset + 1]; - *offset += 2; - (u16::from_le_bytes([byte1, byte2]), (byte1, byte2)) -} -/// Parse a single modrm byte, return the resulting MemoryIndex and advance the offset. -pub fn parse_modrm_byte(offset: &mut usize, text: &Vec) -> (MemoryIndex, Vec) { - // Calculate ModRM byte with bitmask - let opcode = text[*offset]; - let modulo = opcode >> 6; - let reg = (opcode >> 3) & 7; - let rm = opcode & 7; - - let mut displacement_raw = Vec::new(); - let displacement = match modulo { - 0 => { - if rm == 6 { - // XXX: handle special case - panic!("Handle modulo == 0, rm == 6"); - } - None - } - 1 => { - *offset += 2; // one additional byte was read - let byte = parse_byte(offset, text); - displacement_raw.push(byte); - log::debug!("Additional byte during ModRM parsing was read."); - Some(Displacement::Byte(byte)) - } - 2 => { - *offset += 3; // two additional bytes (word) was read - let (word, raw) = parse_word(offset, text); - displacement_raw.push(raw.0); - displacement_raw.push(raw.1); - log::debug!("Additional two bytes during ModRM parsing was read."); - Some(Displacement::Word(word)) - } - 3 => panic!("TODO: handle modulo == 3"), - _ => panic!("Invalid ModRM byte encountered"), - }; - - let index = match rm { - 0 => MemoryIndex { - base: Some(Register::BX), - index: Some(Register::SI), - displacement, - }, - 1 => MemoryIndex { - base: Some(Register::BX), - index: Some(Register::DI), - displacement, - }, - 2 => MemoryIndex { - base: Some(Register::BP), - index: Some(Register::SI), - displacement, - }, - 3 => MemoryIndex { - base: Some(Register::BP), - index: Some(Register::DI), - displacement, - }, - 4 => MemoryIndex { - base: None, - index: Some(Register::SI), - displacement, - }, - 5 => MemoryIndex { - base: None, - index: Some(Register::DI), - displacement, - }, - 6 => MemoryIndex { - base: Some(Register::BP), - index: None, - displacement, - }, - 7 => MemoryIndex { - base: Some(Register::BX), - index: None, - displacement, - }, - _ => panic!("Invalid ModRM byte encountered"), - }; - - return (index, displacement_raw); -} - #[derive(Debug)] -#[allow(dead_code)] -/// Displacement for ModRM -pub enum Displacement { - Byte(u8), - Word(u16), +struct Disassembler { + pub offset: usize, + pub text: Vec, } -impl fmt::Display for Displacement { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - match self { - Self::Byte(byte) => write!(f, "{}", byte), - Self::Word(word) => write!(f, "{}", word), - } +impl Disassembler { + /// Parse a single byte of binary, return it and advance the offset. + /// Returns the read byte. + pub fn parse_byte(&mut self) -> u8 { + self.offset += 1; + let byte = self.text[self.offset]; + self.offset += 1; + byte + } + /// Parse a single word of binary, return it and advance the offset. + /// Returns the read word and a tuple of the read raw bytes + pub fn parse_word(&mut self) -> (u16, (u8, u8)) { + self.offset += 1; + let byte1 = self.text[self.offset]; + let byte2 = self.text[self.offset + 1]; + self.offset += 2; + (u16::from_le_bytes([byte1, byte2]), (byte1, byte2)) + } + /// Parse a single modrm byte, return the resulting MemoryIndex and advance the offset. + /// Returns the parsed modrm memory access, as well as all read raw bytes + pub fn parse_modrm_byte(&mut self) -> (MemoryIndex, Vec) { + // Calculate ModRM byte with bitmask + let opcode = self.text[self.offset]; + let modulo = opcode >> 6; + let reg = (opcode >> 3) & 7; + let rm = opcode & 7; + + let mut displacement_raw = Vec::new(); + let displacement = match modulo { + 0 => { + if rm == 6 { + // XXX: handle special case + panic!("Handle modulo == 0, rm == 6"); + } + None + } + 1 => { + self.offset += 2; // one additional byte was read + let byte = self.parse_byte(); + displacement_raw.push(byte); + log::debug!("Additional byte during ModRM parsing was read."); + Some(Displacement::Byte(byte)) + } + 2 => { + self.offset += 3; // two additional bytes (word) was read + let (word, raw) = self.parse_word(); + displacement_raw.push(raw.0); + displacement_raw.push(raw.1); + log::debug!("Additional two bytes during ModRM parsing was read."); + Some(Displacement::Word(word)) + } + 3 => panic!("TODO: handle modulo == 3"), + _ => panic!("Invalid ModRM byte encountered"), + }; + + let index = match rm { + 0 => MemoryIndex { + base: Some(Register::BX), + index: Some(Register::SI), + displacement, + }, + 1 => MemoryIndex { + base: Some(Register::BX), + index: Some(Register::DI), + displacement, + }, + 2 => MemoryIndex { + base: Some(Register::BP), + index: Some(Register::SI), + displacement, + }, + 3 => MemoryIndex { + base: Some(Register::BP), + index: Some(Register::DI), + displacement, + }, + 4 => MemoryIndex { + base: None, + index: Some(Register::SI), + displacement, + }, + 5 => MemoryIndex { + base: None, + index: Some(Register::DI), + displacement, + }, + 6 => MemoryIndex { + base: Some(Register::BP), + index: None, + displacement, + }, + 7 => MemoryIndex { + base: Some(Register::BX), + index: None, + displacement, + }, + _ => panic!("Invalid ModRM byte encountered"), + }; + + return (index, displacement_raw); } } diff --git a/src/instructions.rs b/src/instructions.rs index 44cd185..25efa93 100644 --- a/src/instructions.rs +++ b/src/instructions.rs @@ -1,76 +1,45 @@ use core::fmt; -use crate::disasm::Displacement; - -pub type MemAddress = u8; +// b: 8, w: 16, v: 16 -> i just treat v and w the same, if nothing blows up +#[allow(non_camel_case_types)] +pub type b = u8; +#[allow(non_camel_case_types)] +pub type w = u16; #[derive(Debug)] #[allow(dead_code)] -/// A single 'line' of executable ASM is called a MetaInstruction, which -/// contains the `Instruction`, which will be executed, alongside some Meta -/// Informations. -pub struct MetaInstruction { - pub start: usize, // location of the instruction start - pub size: usize, // size of the instruction in bytes - pub raw: Vec, // raw value of instruction - pub instruction: Instruction, // actual instruction +/// A single 'line' of executable ASM is called an Instruction, which +/// contains the `Opcode` that will be executed, alongside its starting offset +/// and the raw parsed bytes +pub struct Instruction { + pub start: usize, // location of the instruction start + pub raw: Vec, // raw value of instruction + pub opcode: Opcode, // actual instruction } -impl MetaInstruction { +impl Instruction { pub fn new() -> Self { - MetaInstruction { + Instruction { start: 0, - size: 0, raw: Vec::new(), - instruction: Instruction::NOP(), + opcode: Opcode::NOP(), } } } -impl fmt::Display for MetaInstruction { +impl fmt::Display for Instruction { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!(f, "{:04x}: ", self.start).unwrap(); for b in self.raw.iter() { write!(f, "{:02x}", b).unwrap(); } - write!(f, "\t{}", self.instruction) - } -} - -#[derive(Debug)] -pub struct MemoryIndex { - pub base: Option, - pub index: Option, - pub displacement: Option, -} - -impl fmt::Display for MemoryIndex { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - match &self.base { - Some(base) => match &self.index { - Some(index) => match &self.displacement { - Some(displacement) => write!(f, "[{}+{}+{}]", base, index, displacement), - None => write!(f, "[{}+{}]", base, index), - }, - None => match &self.displacement { - Some(displacement) => write!(f, "[{}+{}]", base, displacement), - None => write!(f, "[{}]", base), - }, - }, - None => match &self.index { - Some(index) => match &self.displacement { - Some(displacement) => write!(f, "{}+{}", index, displacement), - None => write!(f, "[{}]", index), - }, - None => panic!("Invalid MemoryIndex encountered"), - }, - } + write!(f, "\t{}", self.opcode) } } #[derive(Debug)] #[allow(dead_code, non_camel_case_types)] -pub enum Instruction { +pub enum Opcode { NOP(), // ADD ADD_EbGb(MemoryIndex, Register), @@ -80,7 +49,7 @@ pub enum Instruction { INT(ImmediateByte), } -impl fmt::Display for Instruction { +impl fmt::Display for Opcode { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { Self::INT(byte) => write!(f, "INT, {:x}", byte), @@ -91,46 +60,6 @@ impl fmt::Display for Instruction { } } -// Types for operand encoding -#[derive(Debug)] -pub struct Memory(pub MemAddress); -// b: 8, w: 16, v: 16 -> i just treat v and w the same, if nothing blows up -#[derive(Debug)] -pub struct ImmediateByte(pub u8); -#[derive(Debug)] -pub struct ImmediateWord(pub u16); - -// ... and the displays for all of them -macro_rules! impl_display { - ($name:ident) => { - impl std::fmt::Display for $name { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - write!(f, "{}", self.0) - } - } - }; -} - -macro_rules! impl_display_and_lowerhex { - ($name:ident) => { - impl std::fmt::Display for $name { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - write!(f, "{}", self.0) - } - } - - impl std::fmt::LowerHex for $name { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - std::fmt::LowerHex::fmt(&self.0, f) - } - } - }; -} - -impl_display!(Memory); -impl_display_and_lowerhex!(ImmediateByte); -impl_display_and_lowerhex!(ImmediateWord); - /// Registers of a 8086 processor #[derive(Debug)] #[allow(dead_code)] @@ -236,3 +165,80 @@ impl fmt::Display for SegmentRegister { } } } + +/// An immediate byte value for an instruction. +#[derive(Debug)] +pub struct ImmediateByte(pub b); + +/// An immediate word value for an instruction +#[derive(Debug)] +pub struct ImmediateWord(pub w); + +macro_rules! impl_display_and_lowerhex { + ($name:ident) => { + impl std::fmt::Display for $name { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(f, "{}", self.0) + } + } + + impl std::fmt::LowerHex for $name { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + std::fmt::LowerHex::fmt(&self.0, f) + } + } + }; +} + +impl_display_and_lowerhex!(ImmediateByte); +impl_display_and_lowerhex!(ImmediateWord); + +/// A memory index operand is usually created by ModRM bytes or words. +/// e.g. [bx+si] +#[derive(Debug)] +pub struct MemoryIndex { + pub base: Option, + pub index: Option, + pub displacement: Option, +} + +impl fmt::Display for MemoryIndex { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match &self.base { + Some(base) => match &self.index { + Some(index) => match &self.displacement { + Some(displacement) => write!(f, "[{}+{}+{}]", base, index, displacement), + None => write!(f, "[{}+{}]", base, index), + }, + None => match &self.displacement { + Some(displacement) => write!(f, "[{}+{}]", base, displacement), + None => write!(f, "[{}]", base), + }, + }, + None => match &self.index { + Some(index) => match &self.displacement { + Some(displacement) => write!(f, "{}+{}", index, displacement), + None => write!(f, "[{}]", index), + }, + None => panic!("Invalid MemoryIndex encountered"), + }, + } + } +} + +#[derive(Debug)] +#[allow(dead_code)] +/// Displacement for ModRM +pub enum Displacement { + Byte(u8), + Word(u16), +} + +impl fmt::Display for Displacement { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + Self::Byte(byte) => write!(f, "{}", byte), + Self::Word(word) => write!(f, "{}", word), + } + } +} diff --git a/src/main.rs b/src/main.rs index 0df0873..4170fb3 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,7 +1,6 @@ use clap::{Parser, Subcommand}; mod aout; -mod decode; mod disasm; mod instructions; @@ -34,7 +33,8 @@ fn main() { match args.command { Command::Disasm => { - let _instructions = disasm::disasm(&args).unwrap(); + let instructions = disasm::disasm(&args).unwrap(); + log::debug!("{:?}", &instructions); } _ => panic!("Command not yet implemented"), }