diff --git a/src/input.zig b/src/input.zig
index a2935de..2bcf036 100644
--- a/src/input.zig
+++ b/src/input.zig
@@ -4,6 +4,8 @@
 const util = @import("util.zig");
 const threads = @import("threads.zig");
 const output = @import("output.zig");
+// helper func: computes the appropriate luminance coords, laid out as
+// [macro_block][macro_block][intra_macro_block]
 inline fn lum_idxs(i: usize, j: usize) struct { usize, usize, usize } {
     return .{
         i / 2,
@@ -12,6 +14,9 @@ inline fn lum_idxs(i: usize, j: usize) struct { usize, usize, usize } {
     };
 }
 
+// either 2 separate funcs (see read_chrom) or one func full of if statements; both options are bad anyway.
+// incrementally copies 8-byte sequences from the input buffer to the appropriate coords in the Y buffer,
+// sending any completed blocks off to be processed for quantization as they fill
 fn read_lum(f: std.fs.File, source_buff: [][][4]util.Block, target_buff: [][][4]util.BlockQuantized, io_buff: []u8, queue: *util.JobQueue) !void {
     const block_h = source_buff.len;
     const block_w = source_buff[0].len;
@@ -35,7 +40,9 @@ fn read_lum(f: std.fs.File, source_buff: [][][4]util.Block, target_buff: [][][4]util.BlockQuantized, io_buff: []u8, queue: *util.JobQueue) !void {
         }
     }
 }
-
+// either 2 separate funcs (see read_lum) or one func full of if statements; both options are bad anyway.
+// incrementally copies 8-byte sequences from the input buffer to the appropriate coords in the U or V buffer,
+// sending any completed blocks off to be processed for quantization as they fill
 fn read_chrom(f: std.fs.File, source_buff: [][]util.Block, target_buff: [][]util.BlockQuantized, io_buff: []u8, queue: *util.JobQueue) !void {
     const block_h = source_buff.len;
     const block_w = source_buff[0].len;
@@ -62,11 +69,15 @@ fn read_chrom(f: std.fs.File, source_buff: [][]util.Block, target_buff: [][]util.BlockQuantized, io_buff: []u8, queue: *util.JobQueue) !void {
 pub fn main_loop(f: std.fs.File, buffs: util.Buffers, thread_mgr: *threads.ThreadManager, alloc: std.mem.Allocator) !void {
     defer thread_mgr.quit();
     while (true) {
+        // resets the control atomics and resumes the quantizer threads
         thread_mgr.unblock();
         try read_lum(f, buffs.Y, buffs.Y_quant, buffs.input_buff, thread_mgr.queue_wrp.queue);
+        // U and V are downsampled, so they only need half-length input buffers
         try read_chrom(f, buffs.U, buffs.U_quant, buffs.input_buff[0 .. buffs.input_buff.len / 2], thread_mgr.queue_wrp.queue);
         try read_chrom(f, buffs.V, buffs.V_quant, buffs.input_buff[0 .. buffs.input_buff.len / 2], thread_mgr.queue_wrp.queue);
+        // wait until all blocks have been processed
         while (thread_mgr.signals.processed.load(.Acquire) != buffs.num_blocks) : (std.time.sleep(1)) {}
+        // sets the eof signal atomic, preventing quantizers from polling for jobs when none can be generated
         thread_mgr.eof();
         try output.generate_jpg(buffs, alloc);
     }
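The diff only shows the first coordinate of lum_idxs (i / 2), so the full mapping is cut off. Below is a minimal sketch of the kind of mapping the [macro_block][macro_block][intra_macro_block] comment describes, assuming the common row-major 2x2 layout; lum_idxs_sketch and its intra-block formula are illustrative guesses, not the project's code.

```zig
const std = @import("std");

// hypothetical version of the lum_idxs mapping: 8x8 block coords (i, j) map to
// (macroblock row, macroblock col, slot of the block inside its 2x2 macroblock)
fn lum_idxs_sketch(i: usize, j: usize) struct { usize, usize, usize } {
    return .{ i / 2, j / 2, (i % 2) * 2 + (j % 2) };
}

test "blocks land in 2x2 macroblocks" {
    // block (3, 2) belongs to macroblock (1, 1), bottom-left slot
    const idxs = lum_idxs_sketch(3, 2);
    try std.testing.expectEqual(@as(usize, 1), idxs.@"0");
    try std.testing.expectEqual(@as(usize, 1), idxs.@"1");
    try std.testing.expectEqual(@as(usize, 2), idxs.@"2");
}
```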
diff --git a/src/main.zig b/src/main.zig
index 1a47ebe..e28c58b 100644
--- a/src/main.zig
+++ b/src/main.zig
@@ -17,6 +17,7 @@ inline fn next_arg(args: *std.process.ArgIterator) ![:0]const u8 {
 inline fn get_dim(arg: [:0]const u8) !usize {
     const d = try std.fmt.parseInt(usize, arg, 10);
+    // forces image dimensions to be multiples of the 16x16 MCU size
     if (d % 16 != 0 or d == 0) {
         return InitError.InvalidDimension;
     }
@@ -25,6 +26,7 @@
 inline fn get_qual(arg: [:0]const u8) !f16 {
     const q = try std.fmt.parseFloat(f16, arg);
+    // quality factor for quant table generation, on a 0-1 scale
     if (q < 0 or q > 1) {
         return InitError.InvalidQuality;
     }
@@ -32,6 +34,7 @@
 }
 inline fn get_n_jobs(arg: [:0]const u8) !usize {
+    // number of threads for the multithreaded per-block dct computation
     const n = try std.fmt.parseInt(usize, arg, 10);
     if (n == 0) {
         return InitError.InvalidNumJobs;
@@ -40,6 +43,7 @@
 }
 fn get_opts() !util.Options {
+    // args can be discarded after parsing, so a mini fixed buffer is enough
     var buff: [100]u8 = undefined;
     var fba = std.heap.FixedBufferAllocator.init(&buff);
     var alloc = fba.allocator();
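The get_opts comment leans on the fact that argument strings are only needed until they are parsed into plain integers and floats. A standalone sketch of that pattern, assuming nothing about the real util.Options; the buffer size and the duplicated string are made up for illustration.

```zig
const std = @import("std");

pub fn main() !void {
    // back the allocator with a small stack buffer; everything allocated from
    // it dies with the frame, so no free bookkeeping is needed
    var buff: [100]u8 = undefined;
    var fba = std.heap.FixedBufferAllocator.init(&buff);
    const alloc = fba.allocator();

    // short-lived copy of an arg string, parsed and then forgotten
    const arg = try alloc.dupe(u8, "1920");
    const dim = try std.fmt.parseInt(usize, arg, 10);
    std.debug.assert(dim % 16 == 0); // same multiple-of-16 MCU check as get_dim
}
```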
diff --git a/src/output.zig b/src/output.zig
index 35cb73a..a8d08f6 100644
--- a/src/output.zig
+++ b/src/output.zig
@@ -45,11 +45,14 @@ const RLEWriter = struct {
         }
     }
 
+    // writes out the huffcode, plus the extra bits if size >= 1
     fn write_value(self: *Self, huffcode: HuffCode, unit: RLE_Unit) !void {
         try self.bw.write_bits(huffcode.value, huffcode.n_bits);
+        // negative values must be written as val - 1, with the same number of bits as the original
        const unit_val: u16 = if (unit.value >= 0) @bitCast(unit.value) else @bitCast(unit.value - 1);
         const value_size = unit.symbol & 0x0f;
         if (value_size != 0) {
+            // size-0 values only need the huffcode, no extra bits
             try self.bw.write_bits(unit_val & (try std.math.powi(u16, 2, value_size) - 1), @truncate(value_size));
         }
     }
@@ -91,10 +94,11 @@ const Scan = struct {
         }
     }
 
+    // operates on the full image, generating an RLE sequence and symbol frequencies,
+    // then gens the corresponding hufftables
     fn do_rle_freq_pass(self: *Self, buff: *const util.Buffers, f: std.fs.File) !void {
         const h = buff.Y_quant.len;
         const w = buff.Y_quant[0].len;
-
         for (0..4) |i| {
             @memset(self.freqs[i], 0);
         }
@@ -118,7 +122,11 @@
         }
     }
 
+    // dumps the scan in the order: Y macroblock (4 blocks), U block, V block.
+    // per-block tracking is needed since the first value must be interpreted as dc;
+    // a 0x00 dc symbol does not signal the next block, a 0x00 ac symbol does
    fn dump_scan(self: *Self, f: std.fs.File) !void {
+        // mostly hardcoded values
         _ = try f.write(&[_]u8{ 0xff, 0xda, 0x00, 0x0c, 0x03, 0x01, 0x00, 0x02, 0x11, 0x03, 0x11, 0x00, 0x3f, 0x00 });
         var bw = RLEWriter.init(f);
         var idxs = [3]usize{ 0, 0, 0 };
@@ -134,6 +142,8 @@
     }
 };
 
+// helper func needed for generating the huffman code:
+// gets the idxs of the 2 smallest non-zero values in the slice
 inline fn get_idx_min2(freqs: []u32) ?struct { usize, usize } {
     var min1: u32 = undefined;
     var min1_idx: usize = undefined;
@@ -172,6 +182,8 @@
     } else null;
 }
 
+// generates the code mapping from BITS and HUFFVAL,
+// see https://www.w3.org/Graphics/JPEG/itu-t81.pdf Annex C
 inline fn gen_codes(huff: *Huffman, huffman_meta: HuffmanMeta) !void {
     var huffsize = [_]u5{0} ** 256;
     var huffcode = [_]u16{0} ** 256;
@@ -217,10 +229,12 @@
     }
 }
 
+// generates BITS and HUFFVAL,
+// see https://www.w3.org/Graphics/JPEG/itu-t81.pdf Annex K
 inline fn gen_huffman(freqs: []u32) HuffmanMeta {
     freqs[freqs.len - 1] = 1;
     var codesizes = [_]u8{0} ** 257;
-    var others = [_]u9{0x1ff} ** 257;
+    var others = [_]u9{0x1ff} ** 257; // 0x1ff sentinel since -1 is not available and i8 wouldn't fit the range anyway
     while (get_idx_min2(freqs)) |tmp| {
         var v1 = tmp.@"0";
         var v2 = tmp.@"1";
@@ -276,6 +290,8 @@
     };
 }
 
+// returns the size field for a given value. tested both a switch and bit shifts;
+// the switch was somehow the fastest, slightly better than bit shifting
 inline fn get_size(n: i16) u8 {
     if (n == 0) {
         return 0;
@@ -306,6 +322,9 @@
     }
 }
 
+// for each block, processes the dc by differencing it, then performs RLE.
+// dc symbols are just the size; ac symbols carry the run length in the top 4 bits and the size in the bottom 4.
+// appends 0x00 at the end of each block.
 fn parse_block(block: *util.BlockQuantized, dc_diff: *i16, rle: *RLE_Seq, dc_freqs: []u32, ac_freqs: []u32) !void {
     const diff = block[0] - dc_diff.*;
     var symbol = get_size(diff);
@@ -347,12 +366,14 @@ fn write_headers(f: std.fs.File, buff: *const util.Buffers) !void {
     const w = buff.Y.len * 16;
     const h = buff.Y[0].len * 16;
 
+    // write out the SOI marker 0xff 0xd8 and both quant tables
     var out_buff = [6]u8{ 0xff, 0xd8, 0xff, 0xdb, 0x00, 0x84 } ++ [1]u8{0x00} ++ [_]u8{0x00} ** 64 ++ [1]u8{0x01} ++ [_]u8{0x00} ** 64;
     for (0..64) |i| {
         out_buff[7 + i] = @intFromFloat(buff.Q_Lum[i]);
         out_buff[7 + i + 65] = @intFromFloat(buff.Q_Chrom[i]);
     }
     _ = try f.write(&out_buff);
+    // write out the SOF block; everything except w and h is effectively fixed
     var sof_buff = [_]u8{ 0xff, 0xc0, 0x00, 0x11, 0x08, @truncate(w >> 8), @truncate(w & 0x00ff), @truncate(h >> 8), @truncate(h & 0x00ff), 0x03, 0x01, 0x22, 0x00, 0x02, 0x11, 0x01, 0x03, 0x11, 0x01,
@@ -361,6 +382,7 @@
 }
 
 fn dump_huffman(f: std.fs.File, huff_meta: HuffmanMeta, tree_i: usize) !void {
+    // write out the huffman table in its BITS and HUFFVAL representation
     const table_len = 3 + 16 + huff_meta.total_n;
     var out_buff = [5]u8{ 0xff, 0xc4, @truncate(table_len >> 8), @truncate(table_len & 0x00ff), @truncate(((tree_i % 2) << 4) | (tree_i / 2)) };
     _ = try f.write(&out_buff);
@@ -376,11 +398,8 @@ pub fn generate_jpg(buff: util.Buffers, alloc: std.mem.Allocator) !void {
     defer f.close();
     try write_headers(f, &buff);
 
+    // requires 2 passes: one for RLE and huffcode generation,
+    // a second to actually write out the data
     try scan_data.do_rle_freq_pass(&buff, f);
     try scan_data.dump_scan(f);
-
-    // rle, huffman pass
-    // file headers
-    // quant + huffman write
-    // write scan
 }
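The get_size and write_value comments together describe the standard JPEG coefficient coding rule: a value is sent as a size field plus `size` raw bits, and negative values are sent as val - 1 so they can be distinguished by their leading bit. A self-contained sketch of that rule; size_of uses a plain loop rather than the project's switch, purely for illustration.

```zig
const std = @import("std");

// size field = bit length of |n|; 0 has size 0 and carries no extra bits
fn size_of(n: i16) u8 {
    var mag: i32 = n;
    if (mag < 0) mag = -mag;
    var size: u8 = 0;
    while (mag != 0) : (mag >>= 1) size += 1;
    return size;
}

test "encoding of -3" {
    // |-3| needs 2 bits, and -3 is transmitted as the low 2 bits of (-3 - 1)
    try std.testing.expectEqual(@as(u8, 2), size_of(-3));
    const raw: u16 = @bitCast(@as(i16, -3 - 1));
    try std.testing.expectEqual(@as(u16, 0b00), raw & 0b11);
}
```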
diff --git a/src/threads.zig b/src/threads.zig
index ab7bff7..a96a84b 100644
--- a/src/threads.zig
+++ b/src/threads.zig
@@ -6,10 +6,11 @@ const transform = @import("transform.zig");
 const AtomicBool = std.atomic.Atomic(bool);
 const AtomicU32 = std.atomic.Atomic(u32);
 
+// atomic variables for cross-thread control
 const Signals = struct {
     quit: AtomicBool,
     processed: AtomicU32,
-    eof_block: AtomicU32,
+    eof_block: AtomicU32, // u32 rather than bool so it works with Futex wait
 
     const Self = @This();
@@ -22,6 +23,7 @@ const Signals = struct {
     }
 };
 
+// stores the jobqueue plus the pool backing its nodes
 const QueueWrap = struct {
     queue: *util.JobQueue,
     job_pool: util.JobPool,
@@ -73,6 +75,7 @@ pub const ThreadManager = struct {
     }
 
     pub fn quit(self: *Self) void {
+        // signal a quit and wait for all threads to exit
         self.signals.quit.store(true, .Release);
         self.unblock();
         for (self.threads.items) |thread| {
@@ -90,12 +93,15 @@ pub const ThreadManager = struct {
 };
 
 fn quantize_loop(queue: *util.JobQueue, signals: *Signals, Q_Lum: *util.QTable, Q_Chrom: *util.QTable) void {
+    // loop while jobs are available or quit has not been signalled
     while (queue.HasJobs() or !signals.quit.load(std.builtin.AtomicOrder.Acquire)) : (std.time.sleep(1)) {
-        const job = queue.pop() orelse continue;
-        transform.quantize(job.source, job.target, if (job.is_lum) Q_Lum else Q_Chrom);
-        _ = @atomicRmw(u32, &signals.processed.value, .Add, 1, .SeqCst);
+        // if eof has been signalled, wait until it no longer is
         if (signals.eof_block.load(.Acquire) == 1) {
             std.Thread.Futex.wait(&signals.eof_block, 1);
         }
+        const job = queue.pop() orelse continue; // the job may have been stolen by another thread since the check
+        transform.quantize(job.source, job.target, if (job.is_lum) Q_Lum else Q_Chrom);
+        // bump the processed counter now that the block is done
+        _ = @atomicRmw(u32, &signals.processed.value, .Add, 1, .SeqCst);
     }
 }
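The eof_block comments describe a Futex-based gate: close it while no jobs can be generated, reopen it and wake the sleepers when input resumes. A minimal sketch of that protocol using the same std.atomic.Atomic and std.Thread.Futex APIs the diff targets; gate and worker are illustrative stand-ins, not the project's code.

```zig
const std = @import("std");

var gate = std.atomic.Atomic(u32).init(0);

fn worker() void {
    // block while the gate is closed; recheck after every wake since
    // Futex.wait can return spuriously
    while (gate.load(.Acquire) == 1) {
        std.Thread.Futex.wait(&gate, 1);
    }
    // ... pop a job and quantize it here ...
}

pub fn main() !void {
    gate.store(1, .Release); // close the gate (eof)
    const t = try std.Thread.spawn(.{}, worker, .{});
    std.time.sleep(10 * std.time.ns_per_ms);
    gate.store(0, .Release); // reopen it
    std.Thread.Futex.wake(&gate, std.math.maxInt(u32));
    t.join();
}
```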
diff --git a/src/transform.zig b/src/transform.zig
index 6d539c2..c5b0517 100644
--- a/src/transform.zig
+++ b/src/transform.zig
@@ -2,6 +2,7 @@ const std = @import("std");
 const util = @import("util.zig");
 
+// pre-computes the dct coefficients at comptime
 pub const dct_coeffs = gen_coeffs();
 
 inline fn dct_cos(x: usize, f: usize) f16 {
@@ -14,6 +15,7 @@ inline fn dct_coeff(u: usize, v: usize) f16 {
     return 0.25 * (if (u == 0) 1.0 / @sqrt(2.0) else 1.0) * (if (v == 0) 1.0 / @sqrt(2.0) else 1.0);
 }
 
+// helper function to convert the given u and v to their zigzag equivalents
 inline fn zz_conv(u: usize, v: usize) struct { u: usize, v: usize } {
     var band_i = u + v;
     const band_max = @min(7, band_i);
@@ -36,6 +38,10 @@ inline fn zz_band_len(band_i: usize) usize {
     return if (band_i < 8) band_i + 1 else 15 - band_i;
 }
 
+// generates an [8][8][8 * 8] set of dct coeff values that
+// can be directly multiplied and summed to get a dct value.
+// values are stored in zig-zagged order, so there is no need to rearrange
+// at runtime
 fn gen_coeffs() [8][8]@Vector(64, f16) {
     @setEvalBranchQuota(100000);
     var ret: [8][8]@Vector(64, f16) = undefined;
@@ -53,6 +59,13 @@ fn gen_coeffs() [8][8]@Vector(64, f16) {
     return ret;
 }
 
+// performs the JPEG Type II DCT,
+// with SIMD vector optimizations applied if the target supports them:
+// load the source block bytes as floats,
+// shift them down by 128,
+// for each target coord in the dct, multiply the shifted values with the corresponding coeff vector, then sum and store,
+// divide the dct by the quant table values,
+// store the result as i16 in target
 pub fn quantize(source: *util.Block, target: *util.BlockQuantized, qtable: *util.QTable) void {
     var source_holder: @Vector(64, f16) = undefined;
     var dct_holder: @Vector(64, f16) = undefined;
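The quantize comment lists the per-coefficient steps. Below is a sketch of the multiply-sum-divide core for a single coefficient, under the assumption that each precomputed vector in dct_coeffs is one (zig-zagged) DCT basis; the names here are stand-ins for the real buffers, not the project's code.

```zig
const std = @import("std");

// one output coefficient: elementwise product with a basis vector, reduced to
// a single dct value, then divided by the matching qtable entry
fn dct_one(shifted: @Vector(64, f16), basis: @Vector(64, f16), q: f16) i16 {
    const dct_val = @reduce(.Add, shifted * basis);
    return @intFromFloat(dct_val / q);
}

test "dc of a flat block" {
    // every entry of the u = v = 0 basis is 0.25 * (1/sqrt(2))^2 = 0.125,
    // so a flat block of 4s gives 4 * 0.125 * 64 = 32, quantized by 16 to 2
    const flat: @Vector(64, f16) = @splat(4.0);
    const dc_basis: @Vector(64, f16) = @splat(0.125);
    try std.testing.expectEqual(@as(i16, 2), dct_one(flat, dc_basis, 16.0));
}
```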
diff --git a/src/util.zig b/src/util.zig
index ddb5b2e..5de3159 100644
--- a/src/util.zig
+++ b/src/util.zig
@@ -21,6 +21,10 @@ pub const Job = struct {
     target: *BlockQuantized,
     is_lum: bool,
 };
+
+// rip std.atomic.Queue
+// simple Mutex wrapper around a TailQueue,
+// also handles node storage in a MemoryPool
 pub const JobQueue = struct {
     const List = std.TailQueue(Job);
@@ -64,6 +68,8 @@ pub const JobQueue = struct {
 };
 pub const JobPool = std.heap.MemoryPool(JobQueue.List.Node);
 
+// main input buffers plus the qtables.
+// Y is stored with an extra 2x2 block grouping to make looping significantly less complicated
 pub const Buffers = struct {
     arena: std.heap.ArenaAllocator,
@@ -103,8 +109,10 @@ pub const Buffers = struct {
         .U_quant = try alloc.alloc([]BlockQuantized, block_h),
         .V_quant = try alloc.alloc([]BlockQuantized, block_h),
 
+        // h*w Y pixels, (h/2)*(w/2) U and V pixels, 64 pixels per block
         .num_blocks = @truncate(w * h * 3 / 2 / 64),
+        // read a full block row at a time
         .input_buff = try alloc.alloc(u8, w * 8),
     };
     for (0..block_h) |i| {
@@ -126,6 +134,10 @@ pub const Buffers = struct {
     }
 };
 
+// simplistic qtable generation:
+// assuming each higher frequency band gets less and less important,
+// simply factor that out by increasing the q-value per band.
+// makes the generation -> zig-zag step simpler since it can all be done in one pass
 pub fn gen_qtable(q: f16, step_start_band: usize, step_stop_band: usize) @Vector(64, f16) {
     var ret: @Vector(64, f16) = [_]f32{0.0} ** 64;
     const band_range = step_stop_band - step_start_band;
@@ -149,6 +161,8 @@ pub fn gen_qtable(q: f16, step_start_band: usize, step_stop_band: usize) @Vector(64, f16) {
     return ret;
 }
 
+// bit writer for the scan data; adds buffering with byte stuffing,
+// since std's BitWriter and BufferedWriter could be combined but cannot stuff bytes
 pub const BufferedBitWriter = struct {
     byte_buff: u8,
     bits_used: u4,
@@ -168,6 +182,9 @@ pub const BufferedBitWriter = struct {
     }
 
+    // check if the space available is >= the space needed:
+    // if yes, just shove in the bits,
+    // else shove in what fits and recurse with the remainder
     pub fn write_bits(self: *Self, val: u16, n_bits: u5) !void {
         const curr_byte_space = 8 - self.bits_used;
         if (n_bits <= curr_byte_space) {
@@ -179,17 +196,25 @@ pub const BufferedBitWriter = struct {
     }
 
+    // since write_bits handles alignment and any excess bits, adding here can only
+    // ever result in a full byte or a partial one, never an overflow into the next
     inline fn add_bits(self: *Self, val: u16, n_bits: u4) !void {
         self.byte_buff |= @truncate(val << (8 - self.bits_used - n_bits));
         self.bits_used += n_bits;
         if (self.bits_used == 8) {
+            // emit the value
             self.buffer[self.buffer_idx] = self.byte_buff;
             if (self.byte_buff == 0xff) {
+                // for byte stuffing, simply skip the next idx to get 0xff 0x00
                 self.buffer_idx += 1;
             }
             self.buffer_idx += 1;
             self.bits_used = 0;
             self.byte_buff = 0x00;
+
+            // if the end of the buffer was reached, simply flush and wrap around;
+            // the extra stuffing increment is preserved even if 0xff was
+            // the last value
            if (self.buffer_idx >= self.buffer.len) {
                 try self.flush();
                 self.buffer_idx %= self.buffer.len;
@@ -202,6 +227,8 @@ pub const BufferedBitWriter = struct {
         @memset(&self.buffer, 0);
     }
 
+    // special flush case for the end of the stream: requires a partial flush and
+    // padding the last byte out with 1s
     pub fn flush_end(self: *Self) !void {
         if (self.bits_used != 0) {
             const stuffing = try std.math.powi(u8, 2, 8 - self.bits_used) - 1;
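The BufferedBitWriter comments hinge on JPEG byte stuffing: inside entropy-coded data, every emitted 0xff must be followed by a 0x00 so decoders do not mistake it for a marker. A standalone sketch of just that rule, decoupled from the bit packing; stuff_bytes is illustrative, not the project's API.

```zig
const std = @import("std");

// copies raw into out, inserting a 0x00 after every 0xff;
// returns the number of bytes written. out must be large enough
fn stuff_bytes(raw: []const u8, out: []u8) usize {
    var n: usize = 0;
    for (raw) |byte| {
        out[n] = byte;
        n += 1;
        if (byte == 0xff) {
            out[n] = 0x00; // stuffed zero
            n += 1;
        }
    }
    return n;
}

test "0xff gets a stuffed 0x00" {
    var out: [8]u8 = undefined;
    const n = stuff_bytes(&[_]u8{ 0x12, 0xff, 0x34 }, &out);
    try std.testing.expectEqualSlices(u8, &[_]u8{ 0x12, 0xff, 0x00, 0x34 }, out[0..n]);
}
```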