From df00f59b5a7b8f01ae4ab7f4cff92a55c30b39a7 Mon Sep 17 00:00:00 2001
From: Marco Thomas <github@marcothms.de>
Date: Thu, 8 May 2025 20:18:02 +0900
Subject: [PATCH] ft: implement disasm in own struct

This makes it easier to implement each opcode,
as the offset calculation and recovery of raw
read bytes are internalized.
---
 src/disasm.rs       | 141 ++++++++++++++++++++++----------------------
 src/instructions.rs |  14 ++---
 2 files changed, 76 insertions(+), 79 deletions(-)
diff --git a/src/disasm.rs b/src/disasm.rs
index 7f8b823..d573aeb 100644
--- a/src/disasm.rs
+++ b/src/disasm.rs
@@ -38,9 +38,8 @@ pub fn disasm(args: &Args) -> Result<Vec<Instruction>, DisasmError> {
 
     log::debug!("{:?}", aout);
 
-    let instructions = decode_instructions(&aout)?;
-
-    Ok(instructions)
+    let mut disasm = Disassembler::new(aout);
+    disasm.decode_instructions()
 }
 
 /// Read a filepath into a u8 buffer.
@@ -56,97 +55,53 @@ fn path_to_buf(args: &Args) -> Result<Vec<u8>, DisasmError> {
     Ok(buf)
 }
 
-/// Decode instructions from the text section of the provided binary
-fn decode_instructions(aout: &Aout) -> Result<Vec<Instruction>, DisasmError> {
-    // naive approach:
-    // 1. read byte
-    // 2. pattern match to see which instruction it is
-    // 3. read as many bytes as this instruction needs (registers, immidiates, ...)
-    // repeat until no bytes left
-
-    let mut instructions = Vec::new();
-    let mut disassembler = Disassembler {
-        offset: 0,
-        text: aout.text.clone(),
-    };
-
-    while disassembler.offset < disassembler.text.len() {
-        let mut instr = Instruction::new();
-        instr.start = disassembler.offset;
-
-        let opcode = disassembler.text[disassembler.offset];
-        instr.raw.push(opcode);
-        match opcode {
-            // ADD
-            0x00 => {
-                let (mem_index, mut raw) = disassembler.parse_modrm_byte();
-                let reg = disassembler.parse_byte();
-                instr.raw.append(&mut raw);
-                instr.raw.push(reg);
-                instr.opcode = Opcode::ADD_EbGb(mem_index, Register::by_id(reg));
-            }
-            // INT
-            0xCD => {
-                let byte = disassembler.parse_byte();
-                instr.raw.push(byte);
-                instr.opcode = Opcode::INT(ImmediateByte(byte));
-            }
-            // MOV
-            0xBB => {
-                let (word, raw) = disassembler.parse_word();
-                instr.raw.push(raw.0);
-                instr.raw.push(raw.1);
-                instr.opcode = Opcode::MOV_BXIv(Register::BX, ImmediateWord(word));
-            }
-            _ => {
-                eprintln!("Encountered unknown instruction '0x{:x}'", opcode);
-                eprintln!("Offset might be misaligned and data is being interpreted.");
-                eprintln!("Existing to avoid further misinterpretation...");
-                exit(1);
-            }
-        };
-
-        println!("{}", instr);
-        instructions.push(instr);
-    }
-
-    Ok(instructions)
-}
-
 #[derive(Debug)]
 struct Disassembler {
-    pub offset: usize,
-    pub text: Vec<u8>,
+    pub offset: usize,            // current read position (index into `text`)
+    pub text: Vec<u8>,            // bytes of the aout text section
+    pub instruction: Instruction, // the instruction, which is currently being parsed
 }
 
 impl Disassembler {
+    pub fn new(aout: Aout) -> Self {
+        Disassembler {
+            offset: 0,
+            text: aout.text,
+            instruction: Instruction::new(),
+        }
+    }
+
     /// Parse a single byte of binary, return it and advance the offset.
     /// Returns the read byte.
     pub fn parse_byte(&mut self) -> u8 {
         self.offset += 1;
         let byte = self.text[self.offset];
         self.offset += 1;
+        self.instruction.raw.push(byte);
         byte
     }
+
     /// Parse a single word of binary, return it and advance the offset.
-    /// Returns the read word and a tuple of the read raw bytes
-    pub fn parse_word(&mut self) -> (u16, (u8, u8)) {
+    /// Returns the read word.
+    pub fn parse_word(&mut self) -> u16 {
         self.offset += 1;
         let byte1 = self.text[self.offset];
         let byte2 = self.text[self.offset + 1];
         self.offset += 2;
-        (u16::from_le_bytes([byte1, byte2]), (byte1, byte2))
+        self.instruction.raw.push(byte1);
+        self.instruction.raw.push(byte2);
+        u16::from_le_bytes([byte1, byte2])
     }
+
     /// Parse a single modrm byte, return the resulting MemoryIndex and advance the offset.
     /// Returns the parsed modrm memory access, as well as all read raw bytes
-    pub fn parse_modrm_byte(&mut self) -> (MemoryIndex, Vec<u8>) {
+    pub fn parse_modrm_byte(&mut self) -> MemoryIndex {
         // Calculate ModRM byte with bitmask
         let opcode = self.text[self.offset];
         let modulo = opcode >> 6;
         let reg = (opcode >> 3) & 7;
         let rm = opcode & 7;
 
-        let mut displacement_raw = Vec::new();
         let displacement = match modulo {
             0 => {
                 if rm == 6 {
@@ -158,15 +113,12 @@ impl Disassembler {
             1 => {
                 self.offset += 2; // one additional byte was read
                 let byte = self.parse_byte();
-                displacement_raw.push(byte);
                 log::debug!("Additional byte during ModRM parsing was read.");
                 Some(Displacement::Byte(byte))
             }
             2 => {
                 self.offset += 3; // two additional bytes (word) was read
-                let (word, raw) = self.parse_word();
-                displacement_raw.push(raw.0);
-                displacement_raw.push(raw.1);
+                let word = self.parse_word();
                 log::debug!("Additional two bytes during ModRM parsing was read.");
                 Some(Displacement::Word(word))
             }
@@ -218,6 +170,51 @@ impl Disassembler {
             _ => panic!("Invalid ModRM byte encountered"),
         };
 
-        return (index, displacement_raw);
+        index
+    }
+
+    /// Decode instructions from the text section of the provided binary.
+    ///
+    /// Runs from `self.offset` to the end of `self.text`; an unknown opcode calls `exit(1)`.
+    pub fn decode_instructions(&mut self) -> Result<Vec<Instruction>, DisasmError> {
+        // naive approach:
+        // 1. read byte
+        // 2. pattern match to see which instruction it is
+        // 3. read as many bytes as this instruction needs (registers, immediates, ...)
+        // repeat until no bytes left
+
+        let mut instructions = Vec::new();
+
+        while self.offset < self.text.len() {
+            self.instruction.start = self.offset;
+
+            let opcode = self.text[self.offset];
+
+            // additional raw bytes will be pushed by parse functions
+            self.instruction.raw.push(opcode);
+            self.instruction.opcode = match opcode {
+                // ADD
+                0x00 => {
+                    Opcode::ADD_EbGb(self.parse_modrm_byte(), Register::by_id(self.parse_byte()))
+                }
+                // INT
+                0xCD => Opcode::INT(ImmediateByte(self.parse_byte())),
+                // MOV
+                0xBB => Opcode::MOV_BXIv(Register::BX, ImmediateWord(self.parse_word())),
+                _ => {
+                    eprintln!("Encountered unknown instruction '0x{:x}'", opcode);
+                    eprintln!("Offset might be misaligned and data is being interpreted.");
+                    eprintln!("Exiting to avoid further misinterpretation...");
+                    exit(1);
+                }
+            };
+
+            println!("{}", self.instruction);
+            // move the finished instruction out and leave a fresh one behind (no clone)
+            let finished = std::mem::replace(&mut self.instruction, Instruction::new());
+            instructions.push(finished);
+        }
+
+        Ok(instructions)
     }
 }
diff --git a/src/instructions.rs b/src/instructions.rs
index 25efa93..57ec2fc 100644
--- a/src/instructions.rs
+++ b/src/instructions.rs
@@ -6,7 +6,7 @@ pub type b = u8;
 #[allow(non_camel_case_types)]
 pub type w = u16;
 
-#[derive(Debug)]
+#[derive(Debug, Clone)]
 #[allow(dead_code)]
 /// A single 'line' of executable ASM is called an Instruction, which
 /// contains the `Opcode` that will be executed, alongside its starting offset
@@ -37,7 +37,7 @@ impl fmt::Display for Instruction {
     }
 }
 
-#[derive(Debug)]
+#[derive(Debug, Clone)]
 #[allow(dead_code, non_camel_case_types)]
 pub enum Opcode {
     NOP(),
@@ -61,7 +61,7 @@ impl fmt::Display for Opcode {
 }
 
 /// Registers of a 8086 processor
-#[derive(Debug)]
+#[derive(Debug, Clone)]
 #[allow(dead_code)]
 pub enum Register {
     AX,
@@ -167,11 +167,11 @@ impl fmt::Display for SegmentRegister {
 }
 
 /// An immediate byte value for an instruction.
-#[derive(Debug)]
+#[derive(Debug, Clone)]
 pub struct ImmediateByte(pub b);
 
 /// An immediate word value for an instruction
-#[derive(Debug)]
+#[derive(Debug, Clone)]
 pub struct ImmediateWord(pub w);
 
 macro_rules! impl_display_and_lowerhex {
@@ -195,7 +195,7 @@ impl_display_and_lowerhex!(ImmediateWord);
 
 /// A memory index operand is usually created by ModRM bytes or words.
 /// e.g. [bx+si]
-#[derive(Debug)]
+#[derive(Debug, Clone)]
 pub struct MemoryIndex {
     pub base: Option<Register>,
     pub index: Option<Register>,
@@ -226,7 +226,7 @@ impl fmt::Display for MemoryIndex {
     }
 }
 
-#[derive(Debug)]
+#[derive(Debug, Clone)]
 #[allow(dead_code)]
 /// Displacement for ModRM
 pub enum Displacement {