ft: implement disasm in own struct
This makes it easier to implement each opcode, as the offset calculation and the recovery of raw read bytes are internalized.
This commit is contained in:
141
src/disasm.rs
141
src/disasm.rs
@@ -38,9 +38,8 @@ pub fn disasm(args: &Args) -> Result<Vec<Instruction>, DisasmError> {
|
|||||||
|
|
||||||
log::debug!("{:?}", aout);
|
log::debug!("{:?}", aout);
|
||||||
|
|
||||||
let instructions = decode_instructions(&aout)?;
|
let mut disasm = Disassembler::new(aout);
|
||||||
|
disasm.decode_instructions()
|
||||||
Ok(instructions)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Read a filepath into a u8 buffer.
|
/// Read a filepath into a u8 buffer.
|
||||||
@@ -56,97 +55,53 @@ fn path_to_buf(args: &Args) -> Result<Vec<u8>, DisasmError> {
|
|||||||
Ok(buf)
|
Ok(buf)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Decode instructions from the text section of the provided binary
|
|
||||||
fn decode_instructions(aout: &Aout) -> Result<Vec<Instruction>, DisasmError> {
|
|
||||||
// naive approach:
|
|
||||||
// 1. read byte
|
|
||||||
// 2. pattern match to see which instruction it is
|
|
||||||
// 3. read as many bytes as this instruction needs (registers, immediates, ...)
|
|
||||||
// repeat until no bytes left
|
|
||||||
|
|
||||||
let mut instructions = Vec::new();
|
|
||||||
let mut disassembler = Disassembler {
|
|
||||||
offset: 0,
|
|
||||||
text: aout.text.clone(),
|
|
||||||
};
|
|
||||||
|
|
||||||
while disassembler.offset < disassembler.text.len() {
|
|
||||||
let mut instr = Instruction::new();
|
|
||||||
instr.start = disassembler.offset;
|
|
||||||
|
|
||||||
let opcode = disassembler.text[disassembler.offset];
|
|
||||||
instr.raw.push(opcode);
|
|
||||||
match opcode {
|
|
||||||
// ADD
|
|
||||||
0x00 => {
|
|
||||||
let (mem_index, mut raw) = disassembler.parse_modrm_byte();
|
|
||||||
let reg = disassembler.parse_byte();
|
|
||||||
instr.raw.append(&mut raw);
|
|
||||||
instr.raw.push(reg);
|
|
||||||
instr.opcode = Opcode::ADD_EbGb(mem_index, Register::by_id(reg));
|
|
||||||
}
|
|
||||||
// INT
|
|
||||||
0xCD => {
|
|
||||||
let byte = disassembler.parse_byte();
|
|
||||||
instr.raw.push(byte);
|
|
||||||
instr.opcode = Opcode::INT(ImmediateByte(byte));
|
|
||||||
}
|
|
||||||
// MOV
|
|
||||||
0xBB => {
|
|
||||||
let (word, raw) = disassembler.parse_word();
|
|
||||||
instr.raw.push(raw.0);
|
|
||||||
instr.raw.push(raw.1);
|
|
||||||
instr.opcode = Opcode::MOV_BXIv(Register::BX, ImmediateWord(word));
|
|
||||||
}
|
|
||||||
_ => {
|
|
||||||
eprintln!("Encountered unknown instruction '0x{:x}'", opcode);
|
|
||||||
eprintln!("Offset might be misaligned and data is being interpreted.");
|
|
||||||
eprintln!("Existing to avoid further misinterpretation...");
|
|
||||||
exit(1);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
println!("{}", instr);
|
|
||||||
instructions.push(instr);
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(instructions)
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
struct Disassembler {
|
struct Disassembler {
|
||||||
pub offset: usize,
|
pub offset: usize, // the current offset in the disasm process
|
||||||
pub text: Vec<u8>,
|
pub text: Vec<u8>, // the aout binary
|
||||||
|
pub instruction: Instruction, // the instruction, which is currently being parsed
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Disassembler {
|
impl Disassembler {
|
||||||
|
pub fn new(aout: Aout) -> Self {
|
||||||
|
Disassembler {
|
||||||
|
offset: 0,
|
||||||
|
text: aout.text,
|
||||||
|
instruction: Instruction::new(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// Parse a single byte of binary, return it and advance the offset.
|
/// Parse a single byte of binary, return it and advance the offset.
|
||||||
/// Returns the read byte.
|
/// Returns the read byte.
|
||||||
pub fn parse_byte(&mut self) -> u8 {
|
pub fn parse_byte(&mut self) -> u8 {
|
||||||
self.offset += 1;
|
self.offset += 1;
|
||||||
let byte = self.text[self.offset];
|
let byte = self.text[self.offset];
|
||||||
self.offset += 1;
|
self.offset += 1;
|
||||||
|
self.instruction.raw.push(byte);
|
||||||
byte
|
byte
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Parse a single word of binary, return it and advance the offset.
|
/// Parse a single word of binary, return it and advance the offset.
|
||||||
/// Returns the read word and a tuple of the read raw bytes
|
/// Returns the read word.
|
||||||
pub fn parse_word(&mut self) -> (u16, (u8, u8)) {
|
pub fn parse_word(&mut self) -> u16 {
|
||||||
self.offset += 1;
|
self.offset += 1;
|
||||||
let byte1 = self.text[self.offset];
|
let byte1 = self.text[self.offset];
|
||||||
let byte2 = self.text[self.offset + 1];
|
let byte2 = self.text[self.offset + 1];
|
||||||
self.offset += 2;
|
self.offset += 2;
|
||||||
(u16::from_le_bytes([byte1, byte2]), (byte1, byte2))
|
self.instruction.raw.push(byte1);
|
||||||
|
self.instruction.raw.push(byte2);
|
||||||
|
u16::from_le_bytes([byte1, byte2])
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Parse a single modrm byte, return the resulting MemoryIndex and advance the offset.
|
/// Parse a single modrm byte, return the resulting MemoryIndex and advance the offset.
|
||||||
/// Returns the parsed modrm memory access, as well as all read raw bytes
|
/// Returns the parsed modrm memory access; displacement bytes read via `parse_byte`/`parse_word` are recorded in `self.instruction.raw`.
|
||||||
pub fn parse_modrm_byte(&mut self) -> (MemoryIndex, Vec<u8>) {
|
pub fn parse_modrm_byte(&mut self) -> MemoryIndex {
|
||||||
// Calculate ModRM byte with bitmask
|
// Calculate ModRM byte with bitmask
|
||||||
let opcode = self.text[self.offset];
|
let opcode = self.text[self.offset];
|
||||||
let modulo = opcode >> 6;
|
let modulo = opcode >> 6;
|
||||||
let reg = (opcode >> 3) & 7;
|
let reg = (opcode >> 3) & 7;
|
||||||
let rm = opcode & 7;
|
let rm = opcode & 7;
|
||||||
|
|
||||||
let mut displacement_raw = Vec::new();
|
|
||||||
let displacement = match modulo {
|
let displacement = match modulo {
|
||||||
0 => {
|
0 => {
|
||||||
if rm == 6 {
|
if rm == 6 {
|
||||||
@@ -158,15 +113,12 @@ impl Disassembler {
|
|||||||
1 => {
|
1 => {
|
||||||
self.offset += 2; // one additional byte was read
|
self.offset += 2; // one additional byte was read
|
||||||
let byte = self.parse_byte();
|
let byte = self.parse_byte();
|
||||||
displacement_raw.push(byte);
|
|
||||||
log::debug!("Additional byte during ModRM parsing was read.");
|
log::debug!("Additional byte during ModRM parsing was read.");
|
||||||
Some(Displacement::Byte(byte))
|
Some(Displacement::Byte(byte))
|
||||||
}
|
}
|
||||||
2 => {
|
2 => {
|
||||||
self.offset += 3; // two additional bytes (word) was read
|
self.offset += 3; // two additional bytes (word) was read
|
||||||
let (word, raw) = self.parse_word();
|
let word = self.parse_word();
|
||||||
displacement_raw.push(raw.0);
|
|
||||||
displacement_raw.push(raw.1);
|
|
||||||
log::debug!("Additional two bytes during ModRM parsing was read.");
|
log::debug!("Additional two bytes during ModRM parsing was read.");
|
||||||
Some(Displacement::Word(word))
|
Some(Displacement::Word(word))
|
||||||
}
|
}
|
||||||
@@ -218,6 +170,51 @@ impl Disassembler {
|
|||||||
_ => panic!("Invalid ModRM byte encountered"),
|
_ => panic!("Invalid ModRM byte encountered"),
|
||||||
};
|
};
|
||||||
|
|
||||||
return (index, displacement_raw);
|
index
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Decode instructions from the text section of the provided binary
|
||||||
|
pub fn decode_instructions(&mut self) -> Result<Vec<Instruction>, DisasmError> {
|
||||||
|
// naive approach:
|
||||||
|
// 1. read byte
|
||||||
|
// 2. pattern match to see which instruction it is
|
||||||
|
// 3. read as many bytes as this instruction needs (registers, immediates, ...)
|
||||||
|
// repeat until no bytes left
|
||||||
|
|
||||||
|
let mut instructions = Vec::new();
|
||||||
|
|
||||||
|
while self.offset < self.text.len() {
|
||||||
|
self.instruction.start = self.offset;
|
||||||
|
|
||||||
|
let opcode = self.text[self.offset];
|
||||||
|
|
||||||
|
// additional raw bytes will be pushed by parse functions
|
||||||
|
self.instruction.raw.push(opcode);
|
||||||
|
self.instruction.opcode = match opcode {
|
||||||
|
// ADD
|
||||||
|
0x00 => {
|
||||||
|
Opcode::ADD_EbGb(self.parse_modrm_byte(), Register::by_id(self.parse_byte()))
|
||||||
|
}
|
||||||
|
// INT
|
||||||
|
0xCD => Opcode::INT(ImmediateByte(self.parse_byte())),
|
||||||
|
// MOV
|
||||||
|
0xBB => Opcode::MOV_BXIv(Register::BX, ImmediateWord(self.parse_word())),
|
||||||
|
_ => {
|
||||||
|
eprintln!(
|
||||||
|
"Encountered unknown self.instructionuction '0x{:x}'",
|
||||||
|
opcode
|
||||||
|
);
|
||||||
|
eprintln!("Offset might be misaligned and data is being interpreted.");
|
||||||
|
eprintln!("Existing to avoid further misinterpretation...");
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
println!("{}", self.instruction);
|
||||||
|
instructions.push(self.instruction.clone());
|
||||||
|
self.instruction = Instruction::new();
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(instructions)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -6,7 +6,7 @@ pub type b = u8;
|
|||||||
#[allow(non_camel_case_types)]
|
#[allow(non_camel_case_types)]
|
||||||
pub type w = u16;
|
pub type w = u16;
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug, Clone)]
|
||||||
#[allow(dead_code)]
|
#[allow(dead_code)]
|
||||||
/// A single 'line' of executable ASM is called an Instruction, which
|
/// A single 'line' of executable ASM is called an Instruction, which
|
||||||
/// contains the `Opcode` that will be executed, alongside its starting offset
|
/// contains the `Opcode` that will be executed, alongside its starting offset
|
||||||
@@ -37,7 +37,7 @@ impl fmt::Display for Instruction {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug, Clone)]
|
||||||
#[allow(dead_code, non_camel_case_types)]
|
#[allow(dead_code, non_camel_case_types)]
|
||||||
pub enum Opcode {
|
pub enum Opcode {
|
||||||
NOP(),
|
NOP(),
|
||||||
@@ -61,7 +61,7 @@ impl fmt::Display for Opcode {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Registers of a 8086 processor
|
/// Registers of a 8086 processor
|
||||||
#[derive(Debug)]
|
#[derive(Debug, Clone)]
|
||||||
#[allow(dead_code)]
|
#[allow(dead_code)]
|
||||||
pub enum Register {
|
pub enum Register {
|
||||||
AX,
|
AX,
|
||||||
@@ -167,11 +167,11 @@ impl fmt::Display for SegmentRegister {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// An immediate byte value for an instruction.
|
/// An immediate byte value for an instruction.
|
||||||
#[derive(Debug)]
|
#[derive(Debug, Clone)]
|
||||||
pub struct ImmediateByte(pub b);
|
pub struct ImmediateByte(pub b);
|
||||||
|
|
||||||
/// An immediate word value for an instruction
|
/// An immediate word value for an instruction
|
||||||
#[derive(Debug)]
|
#[derive(Debug, Clone)]
|
||||||
pub struct ImmediateWord(pub w);
|
pub struct ImmediateWord(pub w);
|
||||||
|
|
||||||
macro_rules! impl_display_and_lowerhex {
|
macro_rules! impl_display_and_lowerhex {
|
||||||
@@ -195,7 +195,7 @@ impl_display_and_lowerhex!(ImmediateWord);
|
|||||||
|
|
||||||
/// A memory index operand is usually created by ModRM bytes or words.
|
/// A memory index operand is usually created by ModRM bytes or words.
|
||||||
/// e.g. [bx+si]
|
/// e.g. [bx+si]
|
||||||
#[derive(Debug)]
|
#[derive(Debug, Clone)]
|
||||||
pub struct MemoryIndex {
|
pub struct MemoryIndex {
|
||||||
pub base: Option<Register>,
|
pub base: Option<Register>,
|
||||||
pub index: Option<Register>,
|
pub index: Option<Register>,
|
||||||
@@ -226,7 +226,7 @@ impl fmt::Display for MemoryIndex {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug, Clone)]
|
||||||
#[allow(dead_code)]
|
#[allow(dead_code)]
|
||||||
/// Displacement for ModRM
|
/// Displacement for ModRM
|
||||||
pub enum Displacement {
|
pub enum Displacement {
|
||||||
|
|||||||
Reference in New Issue
Block a user