fix: cleanup structs into correct files

This commit is contained in:
2025-05-08 10:05:09 +09:00
parent 849895a437
commit 1c7d3f3adc
5 changed files with 229 additions and 229 deletions

View File

@@ -5,6 +5,7 @@ pub type c_long = i32; // we use a a.out with 32 byte
#[derive(Debug)] #[derive(Debug)]
#[allow(dead_code)] #[allow(dead_code)]
/// Internal representation of the a.out binary format.
pub struct Aout { pub struct Aout {
pub header: Header, pub header: Header,
pub text: Vec<u8>, pub text: Vec<u8>,

View File

@@ -1 +0,0 @@

View File

@@ -2,13 +2,14 @@ use core::fmt;
use std::{fs::File, io::Read, process::exit}; use std::{fs::File, io::Read, process::exit};
use crate::aout::Aout; use crate::aout::Aout;
use crate::instructions::MemoryIndex; use crate::instructions::{Displacement, MemoryIndex};
use crate::{ use crate::{
Args, Args,
instructions::{ImmediateByte, ImmediateWord, Instruction, MetaInstruction, Register}, instructions::{ImmediateByte, ImmediateWord, Instruction, Opcode, Register},
}; };
#[derive(Debug)] #[derive(Debug)]
/// Generic errors, which are encountered during parsing.
pub enum DisasmError { pub enum DisasmError {
NoFile(Option<String>), NoFile(Option<String>),
IoError(std::io::Error), IoError(std::io::Error),
@@ -29,13 +30,12 @@ impl fmt::Display for DisasmError {
} }
} }
/// Disassemble the binary in `path` into a vector of instructions /// Disassemble the binary in `path` into a vector of instructions.
/// This function just calls all other /// Main entry point to the disassembly.
pub fn disasm(args: &Args) -> Result<Vec<MetaInstruction>, DisasmError> { pub fn disasm(args: &Args) -> Result<Vec<Instruction>, DisasmError> {
let contents = path_to_buf(args)?; let contents = path_to_buf(args)?;
let aout = Aout::new(contents); let aout = Aout::new(contents);
// XXX: 00 is just 0, maybe this could be a problem?
log::debug!("{:?}", aout); log::debug!("{:?}", aout);
let instructions = decode_instructions(&aout)?; let instructions = decode_instructions(&aout)?;
@@ -43,7 +43,7 @@ pub fn disasm(args: &Args) -> Result<Vec<MetaInstruction>, DisasmError> {
Ok(instructions) Ok(instructions)
} }
/// Read a filepath into a buffer /// Read a filepath into a u8 buffer.
fn path_to_buf(args: &Args) -> Result<Vec<u8>, DisasmError> { fn path_to_buf(args: &Args) -> Result<Vec<u8>, DisasmError> {
let path = args let path = args
.path .path
@@ -57,7 +57,7 @@ fn path_to_buf(args: &Args) -> Result<Vec<u8>, DisasmError> {
} }
/// Decode instructions from the text section of the provided binary /// Decode instructions from the text section of the provided binary
fn decode_instructions(aout: &Aout) -> Result<Vec<MetaInstruction>, DisasmError> { fn decode_instructions(aout: &Aout) -> Result<Vec<Instruction>, DisasmError> {
// naive approach: // naive approach:
// 1. read byte // 1. read byte
// 2. pattern match to see which instruction it is // 2. pattern match to see which instruction it is
@@ -65,38 +65,38 @@ fn decode_instructions(aout: &Aout) -> Result<Vec<MetaInstruction>, DisasmError>
// repeat until no bytes left // repeat until no bytes left
let mut instructions = Vec::new(); let mut instructions = Vec::new();
let mut offset = 0; let mut disassembler = Disassembler {
offset: 0,
text: aout.text.clone(),
};
let text = &aout.text; while disassembler.offset < disassembler.text.len() {
while offset < aout.text.len() { let mut instr = Instruction::new();
let mut instr = MetaInstruction::new(); instr.start = disassembler.offset;
instr.start = offset;
let opcode = text[offset]; let opcode = disassembler.text[disassembler.offset];
instr.raw.push(opcode);
match opcode { match opcode {
// ADD // ADD
0x00 => { 0x00 => {
let (mem_index, mut raw) = parse_modrm_byte(&mut offset, text); let (mem_index, mut raw) = disassembler.parse_modrm_byte();
let reg = parse_byte(&mut offset, text); let reg = disassembler.parse_byte();
instr.size = 2 + raw.len();
instr.raw = Vec::from([opcode]);
instr.raw.append(&mut raw); instr.raw.append(&mut raw);
instr.raw.push(reg); instr.raw.push(reg);
instr.instruction = Instruction::ADD_EbGb(mem_index, Register::by_id(reg)); instr.opcode = Opcode::ADD_EbGb(mem_index, Register::by_id(reg));
} }
// INT // INT
0xCD => { 0xCD => {
let byte = parse_byte(&mut offset, text); let byte = disassembler.parse_byte();
instr.size = 2; instr.raw.push(byte);
instr.raw = Vec::from([opcode, byte]); instr.opcode = Opcode::INT(ImmediateByte(byte));
instr.instruction = Instruction::INT(ImmediateByte(byte));
} }
// MOV // MOV
0xBB => { 0xBB => {
let (word, raw) = parse_word(&mut offset, text); let (word, raw) = disassembler.parse_word();
instr.size = 3; instr.raw.push(raw.0);
instr.raw = Vec::from([opcode, raw.0, raw.1]); instr.raw.push(raw.1);
instr.instruction = Instruction::MOV_BXIv(Register::BX, ImmediateWord(word)); instr.opcode = Opcode::MOV_BXIv(Register::BX, ImmediateWord(word));
} }
_ => { _ => {
eprintln!("Encountered unknown instruction '0x{:x}'", opcode); eprintln!("Encountered unknown instruction '0x{:x}'", opcode);
@@ -113,117 +113,111 @@ fn decode_instructions(aout: &Aout) -> Result<Vec<MetaInstruction>, DisasmError>
Ok(instructions) Ok(instructions)
} }
/// Read the byte that follows the one at `*offset` and advance past it.
///
/// The byte at `*offset` itself is skipped, not returned: the callers in
/// `decode_instructions` invoke this while `*offset` still points at the
/// opcode byte they have already inspected. On return `*offset` has grown by
/// two (one skipped byte plus the byte that was read).
///
/// `text` is taken as a slice; callers holding a `&Vec<u8>` keep working
/// through deref coercion.
///
/// # Panics
///
/// Panics if `*offset + 1` is not a valid index into `text`.
pub fn parse_byte(offset: &mut usize, text: &[u8]) -> u8 {
    // Step over the byte the caller already consumed.
    *offset += 1;
    let byte = text[*offset];
    // Step over the byte we just read.
    *offset += 1;
    byte
}
/// Read the little-endian 16-bit word that follows the byte at `*offset` and
/// advance past it.
///
/// The byte at `*offset` itself is skipped, not returned. On return
/// `*offset` has grown by three (one skipped byte plus the two bytes read).
///
/// Returns `(word, (low_byte, high_byte))`: the decoded value and the two raw
/// bytes in the order they appear in `text`.
///
/// `text` is taken as a slice; callers holding a `&Vec<u8>` keep working
/// through deref coercion.
///
/// # Panics
///
/// Panics if `*offset + 2` is not a valid index into `text`.
pub fn parse_word(offset: &mut usize, text: &[u8]) -> (u16, (u8, u8)) {
    // Step over the byte the caller already consumed.
    *offset += 1;
    let byte1 = text[*offset];
    let byte2 = text[*offset + 1];
    *offset += 2;
    (u16::from_le_bytes([byte1, byte2]), (byte1, byte2))
}
/// Decode a ModRM byte into a [`MemoryIndex`] and return it together with the
/// raw displacement bytes that were read.
///
/// The byte at `text[*offset]` is split into `mod` (bits 7-6), `reg`
/// (bits 5-3) and `rm` (bits 2-0). `mod` selects the displacement (none, one
/// byte, or one little-endian word) and `rm` selects the base/index register
/// pair.
///
/// NOTE(review): the byte is read at `*offset` without advancing first, while
/// the callers still have `*offset` on the opcode byte; presumably the ModRM
/// byte lives at `*offset + 1`. Confirm.
/// NOTE(review): for `mod` 1 and 2, `*offset` is bumped by 2 / 3 *before*
/// `parse_byte` / `parse_word` run, and those helpers advance it again.
/// Verify the resulting position.
///
/// # Panics
///
/// Panics for `mod == 0 && rm == 6` and for `mod == 3` (both not handled
/// yet), and whenever `parse_byte` / `parse_word` index past the end of
/// `text`.
pub fn parse_modrm_byte(offset: &mut usize, text: &Vec<u8>) -> (MemoryIndex, Vec<u8>) {
    // Split the byte into its three bit fields.
    let modrm = text[*offset];
    let modulo = modrm >> 6;
    // Extracted for completeness; nothing below reads it yet.
    let reg = (modrm >> 3) & 7;
    let rm = modrm & 7;

    // Displacement value plus the raw bytes it was built from.
    let (displacement, displacement_raw) = match modulo {
        // XXX: handle special case
        0 if rm == 6 => panic!("Handle modulo == 0, rm == 6"),
        0 => (None, Vec::new()),
        1 => {
            *offset += 2; // one additional byte was read
            let disp8 = parse_byte(offset, text);
            log::debug!("Additional byte during ModRM parsing was read.");
            (Some(Displacement::Byte(disp8)), vec![disp8])
        }
        2 => {
            *offset += 3; // two additional bytes (word) was read
            let (disp16, (low, high)) = parse_word(offset, text);
            log::debug!("Additional two bytes during ModRM parsing was read.");
            (Some(Displacement::Word(disp16)), vec![low, high])
        }
        3 => panic!("TODO: handle modulo == 3"),
        _ => panic!("Invalid ModRM byte encountered"),
    };

    // Register pair selected by the `rm` field.
    let (base, index) = match rm {
        0 => (Some(Register::BX), Some(Register::SI)),
        1 => (Some(Register::BX), Some(Register::DI)),
        2 => (Some(Register::BP), Some(Register::SI)),
        3 => (Some(Register::BP), Some(Register::DI)),
        4 => (None, Some(Register::SI)),
        5 => (None, Some(Register::DI)),
        6 => (Some(Register::BP), None),
        7 => (Some(Register::BX), None),
        _ => panic!("Invalid ModRM byte encountered"),
    };

    let memory_index = MemoryIndex {
        base,
        index,
        displacement,
    };
    (memory_index, displacement_raw)
}
#[derive(Debug)] #[derive(Debug)]
#[allow(dead_code)] struct Disassembler {
/// Displacement for ModRM pub offset: usize,
pub enum Displacement { pub text: Vec<u8>,
Byte(u8),
Word(u16),
} }
impl fmt::Display for Displacement { impl Disassembler {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { /// Parse a single byte of binary, return it and advance the offset.
match self { /// Returns the read byte.
Self::Byte(byte) => write!(f, "{}", byte), pub fn parse_byte(&mut self) -> u8 {
Self::Word(word) => write!(f, "{}", word), self.offset += 1;
} let byte = self.text[self.offset];
self.offset += 1;
byte
}
/// Parse a single word of binary, return it and advance the offset.
/// Returns the read word and a tuple of the read raw bytes
pub fn parse_word(&mut self) -> (u16, (u8, u8)) {
self.offset += 1;
let byte1 = self.text[self.offset];
let byte2 = self.text[self.offset + 1];
self.offset += 2;
(u16::from_le_bytes([byte1, byte2]), (byte1, byte2))
}
/// Parse a single modrm byte, return the resulting MemoryIndex and advance the offset.
/// Returns the parsed modrm memory access, as well as all read raw bytes
pub fn parse_modrm_byte(&mut self) -> (MemoryIndex, Vec<u8>) {
// Calculate ModRM byte with bitmask
let opcode = self.text[self.offset];
let modulo = opcode >> 6;
let reg = (opcode >> 3) & 7;
let rm = opcode & 7;
let mut displacement_raw = Vec::new();
let displacement = match modulo {
0 => {
if rm == 6 {
// XXX: handle special case
panic!("Handle modulo == 0, rm == 6");
}
None
}
1 => {
self.offset += 2; // one additional byte was read
let byte = self.parse_byte();
displacement_raw.push(byte);
log::debug!("Additional byte during ModRM parsing was read.");
Some(Displacement::Byte(byte))
}
2 => {
self.offset += 3; // two additional bytes (word) was read
let (word, raw) = self.parse_word();
displacement_raw.push(raw.0);
displacement_raw.push(raw.1);
log::debug!("Additional two bytes during ModRM parsing was read.");
Some(Displacement::Word(word))
}
3 => panic!("TODO: handle modulo == 3"),
_ => panic!("Invalid ModRM byte encountered"),
};
let index = match rm {
0 => MemoryIndex {
base: Some(Register::BX),
index: Some(Register::SI),
displacement,
},
1 => MemoryIndex {
base: Some(Register::BX),
index: Some(Register::DI),
displacement,
},
2 => MemoryIndex {
base: Some(Register::BP),
index: Some(Register::SI),
displacement,
},
3 => MemoryIndex {
base: Some(Register::BP),
index: Some(Register::DI),
displacement,
},
4 => MemoryIndex {
base: None,
index: Some(Register::SI),
displacement,
},
5 => MemoryIndex {
base: None,
index: Some(Register::DI),
displacement,
},
6 => MemoryIndex {
base: Some(Register::BP),
index: None,
displacement,
},
7 => MemoryIndex {
base: Some(Register::BX),
index: None,
displacement,
},
_ => panic!("Invalid ModRM byte encountered"),
};
return (index, displacement_raw);
} }
} }

View File

@@ -1,76 +1,45 @@
use core::fmt; use core::fmt;
use crate::disasm::Displacement; // b: 8, w: 16, v: 16 -> i just treat v and w the same, if nothing blows up
#[allow(non_camel_case_types)]
pub type MemAddress = u8; pub type b = u8;
#[allow(non_camel_case_types)]
pub type w = u16;
#[derive(Debug)] #[derive(Debug)]
#[allow(dead_code)] #[allow(dead_code)]
/// A single 'line' of executable ASM is called a MetaInstruction, which /// A single 'line' of executable ASM is called an Instruction, which
/// contains the `Instruction`, which will be executed, alongside some Meta /// contains the `Opcode` that will be executed, alongside its starting offset
/// Informations. /// and the raw parsed bytes
pub struct MetaInstruction { pub struct Instruction {
pub start: usize, // location of the instruction start pub start: usize, // location of the instruction start
pub size: usize, // size of the instruction in bytes pub raw: Vec<u8>, // raw value of instruction
pub raw: Vec<u8>, // raw value of instruction pub opcode: Opcode, // actual instruction
pub instruction: Instruction, // actual instruction
} }
impl MetaInstruction { impl Instruction {
pub fn new() -> Self { pub fn new() -> Self {
MetaInstruction { Instruction {
start: 0, start: 0,
size: 0,
raw: Vec::new(), raw: Vec::new(),
instruction: Instruction::NOP(), opcode: Opcode::NOP(),
} }
} }
} }
impl fmt::Display for MetaInstruction { impl fmt::Display for Instruction {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "{:04x}: ", self.start).unwrap(); write!(f, "{:04x}: ", self.start).unwrap();
for b in self.raw.iter() { for b in self.raw.iter() {
write!(f, "{:02x}", b).unwrap(); write!(f, "{:02x}", b).unwrap();
} }
write!(f, "\t{}", self.instruction) write!(f, "\t{}", self.opcode)
}
}
/// Memory operand assembled from an optional base register, an optional
/// index register and an optional displacement.
#[derive(Debug)]
pub struct MemoryIndex {
/// Base register of the address, if any.
pub base: Option<Register>,
/// Index register of the address, if any.
pub index: Option<Register>,
/// Displacement added to the address, if any.
pub displacement: Option<Displacement>,
}
impl fmt::Display for MemoryIndex {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match &self.base {
Some(base) => match &self.index {
Some(index) => match &self.displacement {
Some(displacement) => write!(f, "[{}+{}+{}]", base, index, displacement),
None => write!(f, "[{}+{}]", base, index),
},
None => match &self.displacement {
Some(displacement) => write!(f, "[{}+{}]", base, displacement),
None => write!(f, "[{}]", base),
},
},
None => match &self.index {
Some(index) => match &self.displacement {
Some(displacement) => write!(f, "{}+{}", index, displacement),
None => write!(f, "[{}]", index),
},
None => panic!("Invalid MemoryIndex encountered"),
},
}
} }
} }
#[derive(Debug)] #[derive(Debug)]
#[allow(dead_code, non_camel_case_types)] #[allow(dead_code, non_camel_case_types)]
pub enum Instruction { pub enum Opcode {
NOP(), NOP(),
// ADD // ADD
ADD_EbGb(MemoryIndex, Register), ADD_EbGb(MemoryIndex, Register),
@@ -80,7 +49,7 @@ pub enum Instruction {
INT(ImmediateByte), INT(ImmediateByte),
} }
impl fmt::Display for Instruction { impl fmt::Display for Opcode {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self { match self {
Self::INT(byte) => write!(f, "INT, {:x}", byte), Self::INT(byte) => write!(f, "INT, {:x}", byte),
@@ -91,46 +60,6 @@ impl fmt::Display for Instruction {
} }
} }
// Types for operand encoding
/// Newtype around a `MemAddress`; the wrapped value is public as field `.0`.
#[derive(Debug)]
pub struct Memory(pub MemAddress);
// b: 8, w: 16, v: 16 -> i just treat v and w the same, if nothing blows up
/// Immediate byte operand; wraps the raw `u8` value as field `.0`.
#[derive(Debug)]
pub struct ImmediateByte(pub u8);
/// Immediate word operand; wraps the raw `u16` value as field `.0`.
#[derive(Debug)]
pub struct ImmediateWord(pub u16);
// ... and the displays for all of them
/// Implements `std::fmt::Display` for a single-field tuple struct by
/// delegating to the `Display` impl of field `.0`.
///
/// Delegating (rather than `write!(f, "{}", self.0)`) hands the caller's
/// formatter to the inner value, so width, fill and alignment flags such as
/// `{:>4}` are honoured instead of being silently dropped.
macro_rules! impl_display {
    ($name:ident) => {
        impl std::fmt::Display for $name {
            fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
                std::fmt::Display::fmt(&self.0, f)
            }
        }
    };
}
/// Implements `std::fmt::Display` and `std::fmt::LowerHex` for a single-field
/// tuple struct by delegating to the matching impl of field `.0`.
///
/// Both impls hand the caller's formatter to the inner value, so width, fill
/// and alignment flags (`{:>4}`, `{:02x}`, ...) are honoured for decimal and
/// hexadecimal output alike.
macro_rules! impl_display_and_lowerhex {
    ($name:ident) => {
        impl std::fmt::Display for $name {
            fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
                std::fmt::Display::fmt(&self.0, f)
            }
        }
        impl std::fmt::LowerHex for $name {
            fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
                std::fmt::LowerHex::fmt(&self.0, f)
            }
        }
    };
}
// Generate the formatting impls for the operand newtypes: `Memory` gets
// `Display` only, the two immediates get `Display` and `LowerHex`.
impl_display!(Memory);
impl_display_and_lowerhex!(ImmediateByte);
impl_display_and_lowerhex!(ImmediateWord);
/// Registers of a 8086 processor /// Registers of a 8086 processor
#[derive(Debug)] #[derive(Debug)]
#[allow(dead_code)] #[allow(dead_code)]
@@ -236,3 +165,80 @@ impl fmt::Display for SegmentRegister {
} }
} }
} }
/// An immediate byte value for an instruction.
///
/// Wraps the raw value as public field `.0`, typed `b` (this file's alias for
/// `u8`).
#[derive(Debug)]
pub struct ImmediateByte(pub b);
/// An immediate word value for an instruction.
///
/// Wraps the raw value as public field `.0`, typed `w` (this file's alias for
/// `u16`).
#[derive(Debug)]
pub struct ImmediateWord(pub w);
/// Implements `std::fmt::Display` and `std::fmt::LowerHex` for a single-field
/// tuple struct by delegating to the matching impl of field `.0`.
///
/// Both impls hand the caller's formatter to the inner value, so width, fill
/// and alignment flags (`{:>4}`, `{:02x}`, ...) are honoured for decimal and
/// hexadecimal output alike. The previous `write!(f, "{}", self.0)` form
/// dropped those flags for `Display` only.
macro_rules! impl_display_and_lowerhex {
    ($name:ident) => {
        impl std::fmt::Display for $name {
            fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
                std::fmt::Display::fmt(&self.0, f)
            }
        }
        impl std::fmt::LowerHex for $name {
            fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
                std::fmt::LowerHex::fmt(&self.0, f)
            }
        }
    };
}
// Both immediates can be printed in decimal (`{}`) and lower-case hex (`{:x}`).
impl_display_and_lowerhex!(ImmediateByte);
impl_display_and_lowerhex!(ImmediateWord);
/// A memory index operand is usually created by ModRM bytes or words.
/// e.g. [bx+si]
///
/// At least one of `base` and `index` is expected to be set: the `Display`
/// impl panics when both are `None`.
#[derive(Debug)]
pub struct MemoryIndex {
/// Base register of the address, if any.
pub base: Option<Register>,
/// Index register of the address, if any.
pub index: Option<Register>,
/// Displacement added to the address, if any.
pub displacement: Option<Displacement>,
}
impl fmt::Display for MemoryIndex {
    /// Writes the operand as a bracketed sum of the parts that are present,
    /// in the order base, index, displacement, e.g. `[BX+SI+4]` or `[DI]`
    /// (the exact register spelling comes from `Register`'s `Display` impl).
    ///
    /// # Panics
    ///
    /// Panics if neither `base` nor `index` is set.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match (&self.base, &self.index, &self.displacement) {
            (Some(base), Some(index), Some(displacement)) => {
                write!(f, "[{}+{}+{}]", base, index, displacement)
            }
            (Some(base), Some(index), None) => write!(f, "[{}+{}]", base, index),
            (Some(base), None, Some(displacement)) => write!(f, "[{}+{}]", base, displacement),
            (Some(base), None, None) => write!(f, "[{}]", base),
            // This arm used to print `index+displacement` without the
            // surrounding brackets, unlike every other arm.
            (None, Some(index), Some(displacement)) => write!(f, "[{}+{}]", index, displacement),
            (None, Some(index), None) => write!(f, "[{}]", index),
            (None, None, _) => panic!("Invalid MemoryIndex encountered"),
        }
    }
}
#[derive(Debug)]
#[allow(dead_code)]
/// Displacement for ModRM
///
/// Holds the raw value exactly as it was read from the instruction stream.
pub enum Displacement {
/// One-byte displacement (built by the ModRM parser for `mod == 1`).
Byte(u8),
/// Two-byte displacement (built by the ModRM parser for `mod == 2`).
Word(u16),
}
impl fmt::Display for Displacement {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
Self::Byte(byte) => write!(f, "{}", byte),
Self::Word(word) => write!(f, "{}", word),
}
}
}

View File

@@ -1,7 +1,6 @@
use clap::{Parser, Subcommand}; use clap::{Parser, Subcommand};
mod aout; mod aout;
mod decode;
mod disasm; mod disasm;
mod instructions; mod instructions;
@@ -34,7 +33,8 @@ fn main() {
match args.command { match args.command {
Command::Disasm => { Command::Disasm => {
let _instructions = disasm::disasm(&args).unwrap(); let instructions = disasm::disasm(&args).unwrap();
log::debug!("{:?}", &instructions);
} }
_ => panic!("Command not yet implemented"), _ => panic!("Command not yet implemented"),
} }