ft: initial disasm of example data

This only decodes two instructions, the ones for which I know
the correct output.
This commit is contained in:
2025-05-07 15:48:44 +09:00
commit 2af4578c8b
8 changed files with 804 additions and 0 deletions

62
src/aout.rs Normal file
View File

@@ -0,0 +1,62 @@
use std::ffi::{c_uchar, c_ushort};
/// `long` as it appears in the a.out header parsed by this module.
/// It is fixed to 32 bits (`i32`): `Aout::new` decodes every field of this
/// type from exactly four header bytes.
#[allow(non_camel_case_types)]
pub type c_long = i32; // the a.out files we read use a 32-bit `long`
/// An a.out binary split into its decoded header and the raw bytes of its
/// text and data segments.
#[derive(Debug)]
#[allow(dead_code)]
pub struct Aout {
/// Header fields decoded from the first bytes of the file.
pub header: Header,
/// Raw bytes of the text segment (`header.text` bytes long).
pub text: Vec<u8>,
/// Raw bytes of the data segment (`header.data` bytes long).
pub data: Vec<u8>,
}
impl Aout {
    /// Parse a complete a.out image into its header, text and data segments.
    ///
    /// `buf` is the whole file. The first 32 bytes are decoded into the
    /// [`Header`]. The text segment starts `hdrlen` bytes into the file and is
    /// `text` bytes long; the data segment follows it immediately and is
    /// `data` bytes long.
    ///
    /// # Panics
    ///
    /// Panics if `buf` is shorter than the 32-byte header, or shorter than
    /// the segment sizes recorded in the header require.
    pub fn new(buf: Vec<u8>) -> Self {
        // Number of bytes read below to fill in every header field.
        const HEADER_BYTES: usize = 32;
        assert!(
            buf.len() >= HEADER_BYTES,
            "a.out image is only {} bytes long, expected at least a {}-byte header",
            buf.len(),
            HEADER_BYTES
        );

        // Every `c_long` header field is four little-endian bytes.
        let long_at =
            |at: usize| c_long::from_le_bytes([buf[at], buf[at + 1], buf[at + 2], buf[at + 3]]);

        let hdr = Header {
            magic: [buf[0], buf[1]],
            flags: buf[2],
            cpu: buf[3],
            hdrlen: buf[4],
            unused: buf[5],
            // NOTE(review): this is the only field decoded as big-endian;
            // confirm that is intended and not a typo for `from_le_bytes`.
            version: c_ushort::from_be_bytes([buf[6], buf[7]]),
            text: long_at(8),
            data: long_at(12),
            bss: long_at(16),
            entry: long_at(20),
            total: long_at(24),
            syms: long_at(28),
        };

        let text_start = hdr.hdrlen as usize;
        let text_end = text_start + hdr.text as usize;
        // The range `text_start..text_end` is half-open, so `text_end` is
        // already the first byte after the text segment. Adding one here
        // would drop the first data byte and read one byte too far.
        let data_start = text_end;
        let data_end = data_start + hdr.data as usize;

        let text = buf[text_start..text_end].to_vec();
        let data = buf[data_start..data_end].to_vec();

        Aout {
            header: hdr,
            text,
            data,
        }
    }
}
/// Header found at the start of an a.out file, as decoded by `Aout::new`.
/// The byte positions in the field comments are the file offsets that
/// `Aout::new` reads each field from.
#[derive(Debug)]
#[allow(dead_code)]
pub struct Header {
pub magic: [c_uchar; 2], // magic number (file bytes 0-1)
pub flags: c_uchar, // flags byte, stored raw and not interpreted here (byte 2)
pub cpu: c_uchar, // cpu id (byte 3)
pub hdrlen: c_uchar, // length of header; the text segment starts at this offset (byte 4)
pub unused: c_uchar, // reserved for future use (byte 5)
pub version: c_ushort, // version stamp (bytes 6-7)
pub text: c_long, // size of text segment in bytes (bytes 8-11)
pub data: c_long, // size of data segment in bytes (bytes 12-15)
pub bss: c_long, // size of bss segment in bytes (bytes 16-19)
pub entry: c_long, // entry point (bytes 20-23)
pub total: c_long, // total memory allocated (bytes 24-27)
pub syms: c_long, // size of symbol table (bytes 28-31)
}

1
src/decode.rs Normal file
View File

@@ -0,0 +1 @@

103
src/disasm.rs Normal file
View File

@@ -0,0 +1,103 @@
use core::fmt;
use std::{fs::File, io::Read, process::exit};
use crate::aout::Aout;
use crate::{
Args,
instructions::{ImmediateByte, ImmediateWord, Instruction, MetaInstruction, Register},
};
/// Errors that can be returned while disassembling a binary.
#[derive(Debug)]
pub enum DisasmError {
/// No input path was available; carries the path option that was inspected.
NoFile(Option<String>),
/// An I/O operation (opening or reading the input file) failed.
IoError(std::io::Error),
}
impl From<std::io::Error> for DisasmError {
fn from(error: std::io::Error) -> Self {
DisasmError::IoError(error)
}
}
impl fmt::Display for DisasmError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
DisasmError::NoFile(msg) => write!(f, "No file error: {:?}", msg),
DisasmError::IoError(msg) => write!(f, "{}", msg),
}
}
}
/// Disassemble the binary named by `args.path` into a vector of instructions.
///
/// Thin driver around the other steps: read the file into memory, parse it
/// as an a.out image, log the parsed image at debug level, then decode its
/// text section.
pub fn disasm(args: &Args) -> Result<Vec<MetaInstruction>, DisasmError> {
    let aout = Aout::new(path_to_buf(args)?);
    // XXX: 00 is just 0, maybe this could be a problem?
    log::debug!("{:?}", aout);
    decode_instructions(&aout)
}
/// Read the file named by `args.path` fully into memory.
///
/// Returns `DisasmError::NoFile` when no path was given and
/// `DisasmError::IoError` when the file cannot be opened or read.
fn path_to_buf(args: &Args) -> Result<Vec<u8>, DisasmError> {
    let Some(path) = args.path.as_deref() else {
        // Only reached when `args.path` is `None`, so that is what the error carries.
        return Err(DisasmError::NoFile(None));
    };
    let mut bytes = Vec::new();
    File::open(path)?.read_to_end(&mut bytes)?;
    Ok(bytes)
}
/// Decode instructions from the text section of the provided binary.
///
/// Naive approach, repeated until no bytes are left:
/// 1. read the opcode byte at the current offset
/// 2. pattern match to see which instruction it is
/// 3. consume as many bytes as this instruction needs (registers,
///    immediates, ...)
///
/// Each decoded instruction is printed to stdout and collected; the returned
/// vector holds them in decode order.
///
/// Only `0xCD` (INT with a byte immediate) and `0xBB` (MOV into BX with a
/// word immediate) are recognised so far. Any other opcode prints a
/// diagnostic to stderr and terminates the process with exit status 1.
///
/// # Panics
///
/// Panics if the text section ends in the middle of an instruction, because
/// the operand bytes are then missing from `raw`.
fn decode_instructions(aout: &Aout) -> Result<Vec<MetaInstruction>, DisasmError> {
    let text = &aout.text;
    let mut instructions = Vec::new();
    let mut offset = 0;
    while offset < text.len() {
        let mut instr = MetaInstruction::new();
        instr.start = offset;
        let opcode = text[offset];
        match opcode {
            // INT
            0xCD => {
                instr.take_n_bytes(2, &mut offset, text);
                instr.instruction = Instruction::INT(ImmediateByte(instr.raw[1]));
            }
            // MOV
            0xBB => {
                instr.take_n_bytes(3, &mut offset, text);
                instr.instruction = Instruction::MOV_RI(
                    Register::BX,
                    ImmediateWord(u16::from_le_bytes([instr.raw[1], instr.raw[2]])),
                );
            }
            _ => {
                eprintln!("Encountered unknown instruction '0x{:x}'", opcode);
                eprintln!("Offset might be misaligned and data is being interpreted.");
                eprintln!("Existing to avoid further misinterpretation...");
                exit(1);
            }
        };
        println!("{}", instr);
        // Keep the instruction so the caller actually receives what was decoded.
        instructions.push(instr);
    }
    Ok(instructions)
}

183
src/instructions.rs Normal file
View File

@@ -0,0 +1,183 @@
use core::fmt;
/// Address type wrapped by the `Memory` operand.
// NOTE(review): `u8` looks narrow for an 8086 memory address — confirm this width is intended.
pub type MemAddress = u8;
#[derive(Debug)]
#[allow(dead_code)]
/// A single 'line' of executable ASM is called a MetaInstruction. It holds
/// the decoded `Instruction` alongside metadata describing where that
/// instruction came from and which bytes it was decoded from.
pub struct MetaInstruction {
pub start: usize, // offset at which the instruction starts
pub size: usize, // size of the instruction in bytes, as requested from `take_n_bytes`
pub raw: Vec<u8>, // raw bytes of the instruction, opcode byte first
pub instruction: Instruction, // the decoded instruction
}
impl MetaInstruction {
    /// Create an empty placeholder: start 0, size 0, no raw bytes and a
    /// `NOP` instruction. The decoder fills the fields in afterwards.
    pub fn new() -> Self {
        MetaInstruction {
            start: 0,
            size: 0,
            raw: Vec::new(),
            instruction: Instruction::NOP(),
        }
    }

    /// Take `size` bytes from the text section and advance `offset`.
    /// Used to fetch an instruction's opcode and operand bytes.
    ///
    /// Copies up to `size` bytes of `text`, starting at `*offset`, into
    /// `self.raw`, records `size` in `self.size` and adds `size` to
    /// `*offset`. When fewer than `size` bytes remain, `raw` holds only the
    /// bytes that exist while `size` and `offset` still use the requested
    /// count.
    ///
    /// `text` is a slice, so callers may pass either `&[u8]` or `&Vec<u8>`.
    ///
    /// # Panics
    ///
    /// Panics if `*offset` is greater than `text.len()`.
    pub fn take_n_bytes(&mut self, size: usize, offset: &mut usize, text: &[u8]) {
        self.size = size;
        self.raw = text[*offset..].iter().take(size).copied().collect();
        *offset += size;
    }
}
/// Renders one disassembly line: the start offset as four hex digits, a
/// colon and space, the raw bytes as two hex digits each, a tab, then the
/// instruction itself.
impl fmt::Display for MetaInstruction {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        // Propagate writer errors with `?`; unwrapping inside `fmt` would
        // turn a failing output sink into a panic.
        write!(f, "{:04x}: ", self.start)?;
        for byte in &self.raw {
            write!(f, "{:02x}", byte)?;
        }
        write!(f, "\t{}", self.instruction)
    }
}
/// A decoded instruction together with its operands.
///
/// The suffix of each variant names the operand kinds in order:
/// `R` = `Register`, `M` = `Memory`, `S` = `SRegister`,
/// `I` = immediate (`ImmediateByte` or `ImmediateWord`, see the variant).
#[derive(Debug)]
#[allow(dead_code, non_camel_case_types)]
pub enum Instruction {
// Placeholder used by `MetaInstruction::new` before decoding fills it in.
NOP(),
// ADD
ADD_RM(Register, Memory),
ADD_MR(Memory, Register),
ADD_RR(Register, Register),
ADD_MI(Memory, ImmediateByte),
ADD_RI(Register, ImmediateByte),
// MOV
MOV_RM(Register, Memory),
MOV_MR(Memory, Register),
MOV_RR(Register, Register),
MOV_MI(Memory, ImmediateByte),
MOV_RI(Register, ImmediateWord),
MOV_SM(SRegister, Memory),
MOV_MS(Memory, SRegister),
MOV_RS(Register, SRegister),
MOV_SR(SRegister, Register),
// INT
INT(ImmediateByte),
}
/// Renders an instruction as assembly text: the mnemonic, a space, then the
/// operands separated by `", "`. Immediates are printed in lowercase hex.
///
/// Only `INT` and `MOV_RI` are implemented so far; every other variant
/// prints a placeholder message.
impl fmt::Display for Instruction {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            // No comma between mnemonic and operand, matching the MOV form below.
            Self::INT(byte) => write!(f, "INT {:x}", byte),
            Self::MOV_RI(reg, word) => write!(f, "MOV {}, {:04x}", reg, word),
            _ => write!(f, "display not yet implemented"),
        }
    }
}
// Types for operand encoding
/// Memory operand: wraps a `MemAddress`.
#[derive(Debug)]
pub struct Memory(pub MemAddress);
/// Immediate operand holding a single byte.
#[derive(Debug)]
pub struct ImmediateByte(pub u8);
/// Immediate operand holding a 16-bit word.
#[derive(Debug)]
pub struct ImmediateWord(pub u16);
// ... and the `Display` / `LowerHex` implementations for all of them
impl fmt::Display for Memory {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "{}", self.0)
}
}
impl fmt::Display for ImmediateByte {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "{}", self.0)
}
}
impl fmt::LowerHex for ImmediateByte {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
fmt::LowerHex::fmt(&self.0, f)
}
}
impl fmt::Display for ImmediateWord {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "{}", self.0)
}
}
impl fmt::LowerHex for ImmediateWord {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
fmt::LowerHex::fmt(&self.0, f)
}
}
/// Registers of an 8086 processor that can appear as instruction operands.
/// Segment registers are kept separately in `SRegister`.
#[derive(Debug)]
#[allow(dead_code)]
pub enum Register {
AX,
BX,
CX,
DX,
AH,
AL,
BL,
BH,
CH,
CL,
DH,
DL,
DI,
SI,
BP,
SP,
}
/// Prints the register's upper-case name, identical to the variant name.
impl fmt::Display for Register {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let mnemonic = match self {
            Self::AX => "AX",
            Self::BX => "BX",
            Self::CX => "CX",
            Self::DX => "DX",
            Self::AH => "AH",
            Self::AL => "AL",
            Self::BL => "BL",
            Self::BH => "BH",
            Self::CH => "CH",
            Self::CL => "CL",
            Self::DH => "DH",
            Self::DL => "DL",
            Self::DI => "DI",
            Self::SI => "SI",
            Self::BP => "BP",
            Self::SP => "SP",
        };
        f.write_str(mnemonic)
    }
}
/// Segment registers ("SRegisters") of an 8086 processor.
#[derive(Debug)]
#[allow(dead_code)]
pub enum SRegister {
DS,
ES,
SS,
CS,
}
/// Prints the segment register's upper-case name, identical to the variant name.
impl fmt::Display for SRegister {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let mnemonic = match self {
            Self::DS => "DS",
            Self::ES => "ES",
            Self::SS => "SS",
            Self::CS => "CS",
        };
        f.write_str(mnemonic)
    }
}

41
src/main.rs Normal file
View File

@@ -0,0 +1,41 @@
use clap::{Parser, Subcommand};
mod aout;
mod decode;
mod disasm;
mod instructions;
// Subcommands selectable on the command line.
// NOTE(review): with clap's derive the `///` comments on the variants are
// presumably rendered as their help text — treat them as user-facing strings.
#[derive(Subcommand, Debug)]
enum Command {
/// Disassemble the binary into 8086 instructions
Disasm,
/// Interpret the binary as 8086 Minix
Interpret,
}
// Parsed command-line arguments: the chosen subcommand plus an optional
// path to the binary, given as `-p` / `--path`.
// NOTE(review): the `///` comments below may be shown by clap as help text,
// so they are left verbatim; "prgram" and "diasm" look like typos for
// "program" and "disasm" — confirm before changing user-facing output.
/// Simple prgram to diasm and interpret Minix binaries
#[derive(Parser, Debug)]
#[command(version, about, long_about = None)]
struct Args {
#[command(subcommand)]
command: Command,
/// Path of the binary
#[arg(short, long, global = true)]
path: Option<String>,
}
/// Program entry point: initialise logging, parse the command line and run
/// the selected subcommand.
///
/// A failed disassembly is reported on stderr through the error's `Display`
/// output and ends the process with exit status 1.
fn main() {
    env_logger::init();
    let args = Args::parse();
    log::debug!("{:?}", args);
    match args.command {
        Command::Disasm => {
            // The disassembler prints each instruction while decoding, so the
            // returned list is not needed here; only failures are handled.
            if let Err(err) = disasm::disasm(&args) {
                eprintln!("{}", err);
                std::process::exit(1);
            }
        }
        // Named explicitly so that adding a new subcommand is a compile
        // error here instead of silently hitting a catch-all.
        Command::Interpret => panic!("Command not yet implemented"),
    }
}