ft: implement disasm in own struct

This makes it easier to implement each opcode,
as the offset calculation and the recovery of raw
read bytes are now internalized.
This commit is contained in:
2025-05-08 20:18:02 +09:00
parent 1c7d3f3adc
commit df00f59b5a
2 changed files with 76 additions and 79 deletions

View File

@@ -38,9 +38,8 @@ pub fn disasm(args: &Args) -> Result<Vec<Instruction>, DisasmError> {
log::debug!("{:?}", aout); log::debug!("{:?}", aout);
let instructions = decode_instructions(&aout)?; let mut disasm = Disassembler::new(aout);
disasm.decode_instructions()
Ok(instructions)
} }
/// Read a filepath into a u8 buffer. /// Read a filepath into a u8 buffer.
@@ -56,97 +55,53 @@ fn path_to_buf(args: &Args) -> Result<Vec<u8>, DisasmError> {
Ok(buf) Ok(buf)
} }
/// Decode instructions from the text section of the provided binary.
///
/// A fresh `Disassembler` is created over a copy of `aout.text`, and the
/// text is walked one opcode at a time until the offset reaches its end.
/// Every decoded instruction is printed to stdout and collected.
///
/// # Returns
/// `Ok` with the decoded instructions in the order they were encountered.
/// No `Err` is produced by the code in this function.
///
/// # Aborts
/// On an opcode that is not handled below, a diagnostic is written to
/// stderr and `exit(1)` is called (presumably `std::process::exit`; the
/// import is outside this view).
fn decode_instructions(aout: &Aout) -> Result<Vec<Instruction>, DisasmError> {
    // naive approach:
    // 1. read byte
    // 2. pattern match to see which instruction it is
    // 3. read as many bytes as this instruction needs (registers, immediates, ...)
    // repeat until no bytes left
    let mut instructions = Vec::new();
    let mut disassembler = Disassembler {
        offset: 0,
        text: aout.text.clone(),
    };

    while disassembler.offset < disassembler.text.len() {
        let mut instr = Instruction::new();
        instr.start = disassembler.offset;
        let opcode = disassembler.text[disassembler.offset];
        // The opcode byte itself is always the first raw byte of the instruction.
        instr.raw.push(opcode);

        match opcode {
            // ADD
            0x00 => {
                let (mem_index, mut raw) = disassembler.parse_modrm_byte();
                let reg = disassembler.parse_byte();
                instr.raw.append(&mut raw);
                instr.raw.push(reg);
                instr.opcode = Opcode::ADD_EbGb(mem_index, Register::by_id(reg));
            }
            // INT
            0xCD => {
                let byte = disassembler.parse_byte();
                instr.raw.push(byte);
                instr.opcode = Opcode::INT(ImmediateByte(byte));
            }
            // MOV
            0xBB => {
                let (word, raw) = disassembler.parse_word();
                instr.raw.push(raw.0);
                instr.raw.push(raw.1);
                instr.opcode = Opcode::MOV_BXIv(Register::BX, ImmediateWord(word));
            }
            _ => {
                eprintln!("Encountered unknown instruction '0x{:x}'", opcode);
                eprintln!("Offset might be misaligned and data is being interpreted.");
                eprintln!("Exiting to avoid further misinterpretation...");
                exit(1);
            }
        };

        println!("{}", instr);
        instructions.push(instr);
    }

    Ok(instructions)
}
#[derive(Debug)] #[derive(Debug)]
struct Disassembler { struct Disassembler {
pub offset: usize, pub offset: usize, // the current offset in the disasm process
pub text: Vec<u8>, pub text: Vec<u8>, // the aout binary
pub instruction: Instruction, // the instruction, which is currently being parsed
} }
impl Disassembler { impl Disassembler {
pub fn new(aout: Aout) -> Self {
Disassembler {
offset: 0,
text: aout.text,
instruction: Instruction::new(),
}
}
/// Parse a single byte of binary, return it and advance the offset. /// Parse a single byte of binary, return it and advance the offset.
/// Returns the read byte. /// Returns the read byte.
pub fn parse_byte(&mut self) -> u8 { pub fn parse_byte(&mut self) -> u8 {
self.offset += 1; self.offset += 1;
let byte = self.text[self.offset]; let byte = self.text[self.offset];
self.offset += 1; self.offset += 1;
self.instruction.raw.push(byte);
byte byte
} }
/// Parse a single word of binary, return it and advance the offset. /// Parse a single word of binary, return it and advance the offset.
/// Returns the read word and a tuple of the read raw bytes /// Returns the read word.
pub fn parse_word(&mut self) -> (u16, (u8, u8)) { pub fn parse_word(&mut self) -> u16 {
self.offset += 1; self.offset += 1;
let byte1 = self.text[self.offset]; let byte1 = self.text[self.offset];
let byte2 = self.text[self.offset + 1]; let byte2 = self.text[self.offset + 1];
self.offset += 2; self.offset += 2;
(u16::from_le_bytes([byte1, byte2]), (byte1, byte2)) self.instruction.raw.push(byte1);
self.instruction.raw.push(byte2);
u16::from_le_bytes([byte1, byte2])
} }
/// Parse a single modrm byte, return the resulting MemoryIndex and advance the offset. /// Parse a single modrm byte, return the resulting MemoryIndex and advance the offset.
/// Returns the parsed modrm memory access, as well as all read raw bytes /// Returns the parsed modrm memory access, as well as all read raw bytes
pub fn parse_modrm_byte(&mut self) -> (MemoryIndex, Vec<u8>) { pub fn parse_modrm_byte(&mut self) -> MemoryIndex {
// Calculate ModRM byte with bitmask // Calculate ModRM byte with bitmask
let opcode = self.text[self.offset]; let opcode = self.text[self.offset];
let modulo = opcode >> 6; let modulo = opcode >> 6;
let reg = (opcode >> 3) & 7; let reg = (opcode >> 3) & 7;
let rm = opcode & 7; let rm = opcode & 7;
let mut displacement_raw = Vec::new();
let displacement = match modulo { let displacement = match modulo {
0 => { 0 => {
if rm == 6 { if rm == 6 {
@@ -158,15 +113,12 @@ impl Disassembler {
1 => { 1 => {
self.offset += 2; // one additional byte was read self.offset += 2; // one additional byte was read
let byte = self.parse_byte(); let byte = self.parse_byte();
displacement_raw.push(byte);
log::debug!("Additional byte during ModRM parsing was read."); log::debug!("Additional byte during ModRM parsing was read.");
Some(Displacement::Byte(byte)) Some(Displacement::Byte(byte))
} }
2 => { 2 => {
self.offset += 3; // two additional bytes (word) was read self.offset += 3; // two additional bytes (word) was read
let (word, raw) = self.parse_word(); let word = self.parse_word();
displacement_raw.push(raw.0);
displacement_raw.push(raw.1);
log::debug!("Additional two bytes during ModRM parsing was read."); log::debug!("Additional two bytes during ModRM parsing was read.");
Some(Displacement::Word(word)) Some(Displacement::Word(word))
} }
@@ -218,6 +170,51 @@ impl Disassembler {
_ => panic!("Invalid ModRM byte encountered"), _ => panic!("Invalid ModRM byte encountered"),
}; };
return (index, displacement_raw); index
}
/// Decode instructions from the text section of the provided binary
pub fn decode_instructions(&mut self) -> Result<Vec<Instruction>, DisasmError> {
// naive approach:
// 1. read byte
// 2. pattern match to see which instruction it is
// 3. read as many bytes as this instruction needs (registers, immidiates, ...)
// repeat until no bytes left
let mut instructions = Vec::new();
while self.offset < self.text.len() {
self.instruction.start = self.offset;
let opcode = self.text[self.offset];
// additional raw bytes will be pushed by parse functions
self.instruction.raw.push(opcode);
self.instruction.opcode = match opcode {
// ADD
0x00 => {
Opcode::ADD_EbGb(self.parse_modrm_byte(), Register::by_id(self.parse_byte()))
}
// INT
0xCD => Opcode::INT(ImmediateByte(self.parse_byte())),
// MOV
0xBB => Opcode::MOV_BXIv(Register::BX, ImmediateWord(self.parse_word())),
_ => {
eprintln!(
"Encountered unknown self.instructionuction '0x{:x}'",
opcode
);
eprintln!("Offset might be misaligned and data is being interpreted.");
eprintln!("Existing to avoid further misinterpretation...");
exit(1);
}
};
println!("{}", self.instruction);
instructions.push(self.instruction.clone());
self.instruction = Instruction::new();
}
Ok(instructions)
} }
} }

View File

@@ -6,7 +6,7 @@ pub type b = u8;
#[allow(non_camel_case_types)] #[allow(non_camel_case_types)]
pub type w = u16; pub type w = u16;
#[derive(Debug)] #[derive(Debug, Clone)]
#[allow(dead_code)] #[allow(dead_code)]
/// A single 'line' of executable ASM is called an Instruction, which /// A single 'line' of executable ASM is called an Instruction, which
/// contains the `Opcode` that will be executed, alongside its starting offset /// contains the `Opcode` that will be executed, alongside its starting offset
@@ -37,7 +37,7 @@ impl fmt::Display for Instruction {
} }
} }
#[derive(Debug)] #[derive(Debug, Clone)]
#[allow(dead_code, non_camel_case_types)] #[allow(dead_code, non_camel_case_types)]
pub enum Opcode { pub enum Opcode {
NOP(), NOP(),
@@ -61,7 +61,7 @@ impl fmt::Display for Opcode {
} }
/// Registers of a 8086 processor /// Registers of a 8086 processor
#[derive(Debug)] #[derive(Debug, Clone)]
#[allow(dead_code)] #[allow(dead_code)]
pub enum Register { pub enum Register {
AX, AX,
@@ -167,11 +167,11 @@ impl fmt::Display for SegmentRegister {
} }
/// An immediate byte value for an instruction. /// An immediate byte value for an instruction.
#[derive(Debug)] #[derive(Debug, Clone)]
pub struct ImmediateByte(pub b); pub struct ImmediateByte(pub b);
/// An immediate word value for an instruction /// An immediate word value for an instruction
#[derive(Debug)] #[derive(Debug, Clone)]
pub struct ImmediateWord(pub w); pub struct ImmediateWord(pub w);
macro_rules! impl_display_and_lowerhex { macro_rules! impl_display_and_lowerhex {
@@ -195,7 +195,7 @@ impl_display_and_lowerhex!(ImmediateWord);
/// A memory index operand is usually created by ModRM bytes or words. /// A memory index operand is usually created by ModRM bytes or words.
/// e.g. [bx+si] /// e.g. [bx+si]
#[derive(Debug)] #[derive(Debug, Clone)]
pub struct MemoryIndex { pub struct MemoryIndex {
pub base: Option<Register>, pub base: Option<Register>,
pub index: Option<Register>, pub index: Option<Register>,
@@ -226,7 +226,7 @@ impl fmt::Display for MemoryIndex {
} }
} }
#[derive(Debug)] #[derive(Debug, Clone)]
#[allow(dead_code)] #[allow(dead_code)]
/// Displacement for ModRM /// Displacement for ModRM
pub enum Displacement { pub enum Displacement {