From 849895a4378becd47e7cba3b1fd915dae0b9dfb8 Mon Sep 17 00:00:00 2001 From: Marco Thomas Date: Wed, 7 May 2025 22:46:58 +0900 Subject: [PATCH] ft: add modrm parsing --- 8086_table.txt | 306 ++++++++++++++++++++++++++++++++++++++++++++ src/disasm.rs | 146 +++++++++++++++++++-- src/instructions.rs | 161 +++++++++++++++-------- 3 files changed, 550 insertions(+), 63 deletions(-) create mode 100644 8086_table.txt diff --git a/8086_table.txt b/8086_table.txt new file mode 100644 index 0000000..30cb978 --- /dev/null +++ b/8086_table.txt @@ -0,0 +1,306 @@ +00 ADD Eb Gb +01 ADD Ev Gv +02 ADD Gb Eb +03 ADD Gv Ev +04 ADD AL Ib +05 ADD eAX Iv +06 PUSH ES +07 POP ES +08 OR Eb Gb +09 OR Ev Gv +0A OR Gb Eb +0B OR Gv Ev +0C OR AL Ib +0D OR eAX Iv +0E PUSH CS +0F -- +10 ADC Eb Gb +11 ADC Ev Gv +12 ADC Gb Eb +13 ADC Gv Ev +14 ADC AL Ib +15 ADC eAX Iv +16 PUSH SS +17 POP SS +18 SBB Eb Gb +19 SBB Ev Gv +1A SBB Gb Eb +1B SBB Gv Ev +1C SBB AL Ib +1D SBB eAX Iv +1E PUSH DS +1F POP DS +20 AND Eb Gb +21 AND Ev Gv +22 AND Gb Eb +23 AND Gv Ev +24 AND AL Ib +25 AND eAX Iv +26 ES: +27 DAA +28 SUB Eb Gb +29 SUB Ev Gv +2A SUB Gb Eb +2B SUB Gv Ev +2C SUB AL Ib +2D SUB eAX Iv +2E CS: +2F DAS +30 XOR Eb Gb +31 XOR Ev Gv +32 XOR Gb Eb +33 XOR Gv Ev +34 XOR AL Ib +35 XOR eAX Iv +36 SS: +37 AAA +38 CMP Eb Gb +39 CMP Ev Gv +3A CMP Gb Eb +3B CMP Gv Ev +3C CMP AL Ib +3D CMP eAX Iv +3E DS: +3F AAS +40 INC eAX +41 INC eCX +42 INC eDX +43 INC eBX +44 INC eSP +45 INC eBP +46 INC eSI +47 INC eDI +48 DEC eAX +49 DEC eCX +4A DEC eDX +4B DEC eBX +4C DEC eSP +4D DEC eBP +4E DEC eSI +4F DEC eDI +50 PUSH eAX +51 PUSH eCX +52 PUSH eDX +53 PUSH eBX +54 PUSH eSP +55 PUSH eBP +56 PUSH eSI +57 PUSH eDI +58 POP eAX +59 POP eCX +5A POP eDX +5B POP eBX +5C POP eSP +5D POP eBP +5E POP eSI +5F POP eDI +60 -- +61 -- +62 -- +63 -- +64 -- +65 -- +66 -- +67 -- +68 -- +69 -- +6A -- +6B -- +6C -- +6D -- +6E -- +6F -- +70 JO Jb +71 JNO Jb +72 JB Jb +73 JNB Jb +74 JZ Jb +75 JNZ Jb +76 JBE Jb +77 JA Jb +78 JS Jb +79 JNS Jb +7A JPE Jb +7B JPO Jb +7C JL Jb +7D JGE Jb +7E JLE Jb +7F JG Jb +80 GRP1 Eb Ib +81 GRP1 Ev Iv +82 GRP1 Eb Ib +83 GRP1 Ev Ib +84 TEST Gb Eb +85 TEST Gv Ev +86 XCHG Gb Eb +87 XCHG Gv Ev +88 MOV Eb Gb +89 MOV Ev Gv +8A MOV Gb Eb +8B MOV Gv Ev +8C MOV Ew Sw +8D LEA Gv M +8E MOV Sw Ew +8F POP Ev +90 NOP +91 XCHG eCX eAX +92 XCHG eDX eAX +93 XCHG eBX eAX +94 XCHG eSP eAX +95 XCHG eBP eAX +96 XCHG eSI eAX +97 XCHG eDI eAX +98 CBW +99 CWD +9A CALL Ap +9B WAIT +9C PUSHF +9D POPF +9E SAHF +9F LAHF +A0 MOV AL Ob +A1 MOV eAX Ov +A2 MOV Ob AL +A3 MOV Ov eAX +A4 MOVSB +A5 MOVSW +A6 CMPSB +A7 CMPSW +A8 TEST AL Ib +A9 TEST eAX Iv +AA STOSB +AB STOSW +AC LODSB +AD LODSW +AE SCASB +AF SCASW +B0 MOV AL Ib +B1 MOV CL Ib +B2 MOV DL Ib +B3 MOV BL Ib +B4 MOV AH Ib +B5 MOV CH Ib +B6 MOV DH Ib +B7 MOV BH Ib +B8 MOV eAX Iv +B9 MOV eCX Iv +BA MOV eDX Iv +BB MOV eBX Iv +BC MOV eSP Iv +BD MOV eBP Iv +BE MOV eSI Iv +BF MOV eDI Iv +C0 -- +C1 -- +C2 RET Iw +C3 RET +C4 LES Gv Mp +C5 LDS Gv Mp +C6 MOV Eb Ib +C7 MOV Ev Iv +C8 -- +C9 -- +CA RETF Iw +CB RETF +CC INT 3 +CD INT Ib +CE INTO +CF IRET +D0 GRP2 Eb 1 +D1 GRP2 Ev 1 +D2 GRP2 Eb CL +D3 GRP2 Ev CL +D4 AAM I0 +D5 AAD I0 +D6 -- +D7 XLAT +D8 -- +D9 -- +DA -- +DB -- +DC -- +DD -- +DE -- +DF -- +E0 LOOPNZ Jb +E1 LOOPZ Jb +E2 LOOP Jb +E3 JCXZ Jb +E4 IN AL Ib +E5 IN eAX Ib +E6 OUT Ib AL +E7 OUT Ib eAX +E8 CALL Jv +E9 JMP Jv +EA JMP Ap +EB JMP Jb +EC IN AL DX +ED IN eAX DX +EE OUT DX AL +EF OUT DX eAX +F0 LOCK +F1 -- +F2 REPNZ +F3 REPZ +F4 HLT +F5 CMC +F6 GRP3a Eb +F7 GRP3b Ev +F8 CLC +F9 STC +FA CLI +FB STI +FC CLD +FD STD +FE GRP4 Eb +FF GRP5 Ev + + +GRP1/0 ADD +GRP1/1 OR +GRP1/2 ADC +GRP1/3 SBB +GRP1/4 AND +GRP1/5 SUB +GRP1/6 XOR +GRP1/7 CMP +GRP2/0 ROL +GRP2/1 ROR +GRP2/2 RCL +GRP2/3 RCR +GRP2/4 SHL +GRP2/5 SHR +GRP2/6 -- +GRP2/7 SAR +GRP3a/0 TEST Eb Ib +GRP3a/1 -- +GRP3a/2 NOT +GRP3a/3 NEG +GRP3a/4 MUL +GRP3a/5 IMUL +GRP3a/6 DIV +GRP3a/7 IDIV +GRP3b/0 TEST Ev Iv +GRP3b/1 -- +GRP3b/2 NOT +GRP3b/3 NEG +GRP3b/4 MUL +GRP3b/5 IMUL +GRP3b/6 DIV +GRP3b/7 IDIV +GRP4/0 INC +GRP4/1 DEC +GRP4/2 -- +GRP4/3 -- +GRP4/4 -- +GRP4/5 -- +GRP4/6 -- +GRP4/7 -- +GRP5/0 INC +GRP5/1 DEC +GRP5/2 CALL +GRP5/3 CALL Mp +GRP5/4 JMP +GRP5/5 JMP Mp +GRP5/6 PUSH +GRP5/7 -- diff --git a/src/disasm.rs b/src/disasm.rs index 26e5182..f3fbd3f 100644 --- a/src/disasm.rs +++ b/src/disasm.rs @@ -2,6 +2,7 @@ use core::fmt; use std::{fs::File, io::Read, process::exit}; use crate::aout::Aout; +use crate::instructions::MemoryIndex; use crate::{ Args, instructions::{ImmediateByte, ImmediateWord, Instruction, MetaInstruction, Register}, @@ -63,7 +64,7 @@ fn decode_instructions(aout: &Aout) -> Result, DisasmError> // 3. read as many bytes as this instruction needs (registers, immidiates, ...) // repeat until no bytes left - let instructions = Vec::new(); + let mut instructions = Vec::new(); let mut offset = 0; let text = &aout.text; @@ -73,19 +74,29 @@ fn decode_instructions(aout: &Aout) -> Result, DisasmError> let opcode = text[offset]; match opcode { - // 0x00 => {} // ADD + // ADD + 0x00 => { + let (mem_index, mut raw) = parse_modrm_byte(&mut offset, text); + let reg = parse_byte(&mut offset, text); + instr.size = 2 + raw.len(); + instr.raw = Vec::from([opcode]); + instr.raw.append(&mut raw); + instr.raw.push(reg); + instr.instruction = Instruction::ADD_EbGb(mem_index, Register::by_id(reg)); + } // INT 0xCD => { - instr.take_n_bytes(2, &mut offset, text); - instr.instruction = Instruction::INT(ImmediateByte(instr.raw[1])); + let byte = parse_byte(&mut offset, text); + instr.size = 2; + instr.raw = Vec::from([opcode, byte]); + instr.instruction = Instruction::INT(ImmediateByte(byte)); } // MOV 0xBB => { - instr.take_n_bytes(3, &mut offset, text); - instr.instruction = Instruction::MOV_RI( - Register::BX, - ImmediateWord(u16::from_le_bytes([instr.raw[1], instr.raw[2]])), - ); + let (word, raw) = parse_word(&mut offset, text); + instr.size = 3; + instr.raw = Vec::from([opcode, raw.0, raw.1]); + instr.instruction = Instruction::MOV_BXIv(Register::BX, ImmediateWord(word)); } _ => { eprintln!("Encountered unknown instruction '0x{:x}'", opcode); @@ -96,8 +107,123 @@ fn decode_instructions(aout: &Aout) -> Result, DisasmError> }; println!("{}", instr); - // dbg!(&instr); + instructions.push(instr); } Ok(instructions) } + +/// Parse a single byte of binary, return it and advance the offset. +pub fn parse_byte(offset: &mut usize, text: &Vec) -> u8 { + *offset += 1; + let byte = text[*offset]; + *offset += 1; + byte +} +/// Parse a single word of binary, return it and advance the offset. +pub fn parse_word(offset: &mut usize, text: &Vec) -> (u16, (u8, u8)) { + *offset += 1; + let byte1 = text[*offset]; + let byte2 = text[*offset + 1]; + *offset += 2; + (u16::from_le_bytes([byte1, byte2]), (byte1, byte2)) +} +/// Parse a single modrm byte, return the resulting MemoryIndex and advance the offset. +pub fn parse_modrm_byte(offset: &mut usize, text: &Vec) -> (MemoryIndex, Vec) { + // Calculate ModRM byte with bitmask + let opcode = text[*offset]; + let modulo = opcode >> 6; + let reg = (opcode >> 3) & 7; + let rm = opcode & 7; + + let mut displacement_raw = Vec::new(); + let displacement = match modulo { + 0 => { + if rm == 6 { + // XXX: handle special case + panic!("Handle modulo == 0, rm == 6"); + } + None + } + 1 => { + *offset += 2; // one additional byte was read + let byte = parse_byte(offset, text); + displacement_raw.push(byte); + log::debug!("Additional byte during ModRM parsing was read."); + Some(Displacement::Byte(byte)) + } + 2 => { + *offset += 3; // two additional bytes (word) was read + let (word, raw) = parse_word(offset, text); + displacement_raw.push(raw.0); + displacement_raw.push(raw.1); + log::debug!("Additional two bytes during ModRM parsing was read."); + Some(Displacement::Word(word)) + } + 3 => panic!("TODO: handle modulo == 3"), + _ => panic!("Invalid ModRM byte encountered"), + }; + + let index = match rm { + 0 => MemoryIndex { + base: Some(Register::BX), + index: Some(Register::SI), + displacement, + }, + 1 => MemoryIndex { + base: Some(Register::BX), + index: Some(Register::DI), + displacement, + }, + 2 => MemoryIndex { + base: Some(Register::BP), + index: Some(Register::SI), + displacement, + }, + 3 => MemoryIndex { + base: Some(Register::BP), + index: Some(Register::DI), + displacement, + }, + 4 => MemoryIndex { + base: None, + index: Some(Register::SI), + displacement, + }, + 5 => MemoryIndex { + base: None, + index: Some(Register::DI), + displacement, + }, + 6 => MemoryIndex { + base: Some(Register::BP), + index: None, + displacement, + }, + 7 => MemoryIndex { + base: Some(Register::BX), + index: None, + displacement, + }, + _ => panic!("Invalid ModRM byte encountered"), + }; + + return (index, displacement_raw); +} + +#[derive(Debug)] +#[allow(dead_code)] +/// Displacement for ModRM +pub enum Displacement { + Byte(u8), + Word(u16), +} + +impl fmt::Display for Displacement { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + Self::Byte(byte) => write!(f, "{}", byte), + Self::Word(word) => write!(f, "{}", word), + } + } +} diff --git a/src/instructions.rs b/src/instructions.rs index 103746c..44cd185 100644 --- a/src/instructions.rs +++ b/src/instructions.rs @@ -1,5 +1,7 @@ use core::fmt; +use crate::disasm::Displacement; + pub type MemAddress = u8; #[derive(Debug)] @@ -23,18 +25,6 @@ impl MetaInstruction { instruction: Instruction::NOP(), } } - - /// Parse n bytes from text section and advance offet. - /// Used to get the operands. - pub fn take_n_bytes(&mut self, size: usize, offset: &mut usize, text: &Vec) { - self.size = size; - self.raw = text[*offset as usize..] - .iter() - .take(size) - .cloned() - .collect(); - *offset += size; - } } impl fmt::Display for MetaInstruction { @@ -47,26 +37,45 @@ impl fmt::Display for MetaInstruction { } } +#[derive(Debug)] +pub struct MemoryIndex { + pub base: Option, + pub index: Option, + pub displacement: Option, +} + +impl fmt::Display for MemoryIndex { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match &self.base { + Some(base) => match &self.index { + Some(index) => match &self.displacement { + Some(displacement) => write!(f, "[{}+{}+{}]", base, index, displacement), + None => write!(f, "[{}+{}]", base, index), + }, + None => match &self.displacement { + Some(displacement) => write!(f, "[{}+{}]", base, displacement), + None => write!(f, "[{}]", base), + }, + }, + None => match &self.index { + Some(index) => match &self.displacement { + Some(displacement) => write!(f, "{}+{}", index, displacement), + None => write!(f, "[{}]", index), + }, + None => panic!("Invalid MemoryIndex encountered"), + }, + } + } +} + #[derive(Debug)] #[allow(dead_code, non_camel_case_types)] pub enum Instruction { NOP(), // ADD - ADD_RM(Register, Memory), - ADD_MR(Memory, Register), - ADD_RR(Register, Register), - ADD_MI(Memory, ImmediateByte), - ADD_RI(Register, ImmediateByte), + ADD_EbGb(MemoryIndex, Register), // MOV - MOV_RM(Register, Memory), - MOV_MR(Memory, Register), - MOV_RR(Register, Register), - MOV_MI(Memory, ImmediateByte), - MOV_RI(Register, ImmediateWord), - MOV_SM(SRegister, Memory), - MOV_MS(Memory, SRegister), - MOV_RS(Register, SRegister), - MOV_SR(SRegister, Register), + MOV_BXIv(Register, ImmediateWord), // INT INT(ImmediateByte), } @@ -75,7 +84,8 @@ impl fmt::Display for Instruction { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { Self::INT(byte) => write!(f, "INT, {:x}", byte), - Self::MOV_RI(reg, word) => write!(f, "MOV {}, {:04x}", reg, word), + Self::ADD_EbGb(mem, reg) => write!(f, "ADD {}, {}", mem, reg), + Self::MOV_BXIv(reg, word) => write!(f, "MOV {}, {:04x}", reg, word), _ => write!(f, "display not yet implemented"), } } @@ -84,38 +94,43 @@ impl fmt::Display for Instruction { // Types for operand encoding #[derive(Debug)] pub struct Memory(pub MemAddress); +// b: 8, w: 16, v: 16 -> i just treat v and w the same, if nothing blows up #[derive(Debug)] pub struct ImmediateByte(pub u8); #[derive(Debug)] pub struct ImmediateWord(pub u16); // ... and the displays for all of them -impl fmt::Display for Memory { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "{}", self.0) - } +macro_rules! impl_display { + ($name:ident) => { + impl std::fmt::Display for $name { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(f, "{}", self.0) + } + } + }; } -impl fmt::Display for ImmediateByte { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "{}", self.0) - } -} -impl fmt::LowerHex for ImmediateByte { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - fmt::LowerHex::fmt(&self.0, f) - } -} -impl fmt::Display for ImmediateWord { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "{}", self.0) - } -} -impl fmt::LowerHex for ImmediateWord { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - fmt::LowerHex::fmt(&self.0, f) - } + +macro_rules! impl_display_and_lowerhex { + ($name:ident) => { + impl std::fmt::Display for $name { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(f, "{}", self.0) + } + } + + impl std::fmt::LowerHex for $name { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + std::fmt::LowerHex::fmt(&self.0, f) + } + } + }; } +impl_display!(Memory); +impl_display_and_lowerhex!(ImmediateByte); +impl_display_and_lowerhex!(ImmediateWord); + /// Registers of a 8086 processor #[derive(Debug)] #[allow(dead_code)] @@ -138,6 +153,32 @@ pub enum Register { SP, } +#[allow(dead_code)] +impl Register { + /// Find the register corresponding to the 8086 bytecode ID + pub fn by_id(id: u8) -> Self { + match id { + 0x00 => Self::AL, + 0x01 => Self::CL, + 0x02 => Self::DL, + 0x03 => Self::BL, + 0x04 => Self::AH, + 0x05 => Self::CH, + 0x06 => Self::DH, + 0x07 => Self::BH, + 0x10 => Self::AX, + 0x11 => Self::CX, + 0x12 => Self::DX, + 0x13 => Self::BX, + 0x14 => Self::SP, + 0x15 => Self::BP, + 0x16 => Self::SI, + 0x17 => Self::DI, + _ => panic!("Invalid register ID encountered"), + } + } +} + impl fmt::Display for Register { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { @@ -161,17 +202,31 @@ impl fmt::Display for Register { } } -/// SRegisters of a 8086 processor +/// Segment Registers of a 8086 processor #[derive(Debug)] #[allow(dead_code)] -pub enum SRegister { +pub enum SegmentRegister { DS, ES, SS, CS, } -impl fmt::Display for SRegister { +#[allow(dead_code)] +impl SegmentRegister { + /// Find the SRegister corresponding to the 8086 bytecode ID + pub fn by_id(id: u8) -> Self { + match id { + 0x30 => Self::ES, + 0x31 => Self::CS, + 0x32 => Self::SS, + 0x33 => Self::DS, + _ => panic!("Invalid segment register ID encountered"), + } + } +} + +impl fmt::Display for SegmentRegister { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { Self::DS => write!(f, "DS"),