diff --git a/src/input.zig b/src/input.zig
index a2935de..2bcf036 100644
--- a/src/input.zig
+++ b/src/input.zig
@@ -4,6 +4,8 @@
 const util = @import("util.zig");
 const threads = @import("threads.zig");
 const output = @import("output.zig");
+// helper func: computes the appropriate luminance coords, laid out as
+// [macro_block][macro_block][intra_macro_block]
 inline fn lum_idxs(i: usize, j: usize) struct { usize, usize, usize } {
     return .{
         i / 2,
@@ -12,6 +14,9 @@ inline fn lum_idxs(i: usize, j: usize) struct { usize, usize, usize } {
     };
 }
 
+// either 2 separate funcs (see read_chrom) or one func full of if statements; both options are bad anyway.
+// incrementally copies 8-byte sequences from the input buffer to the appropriate coords in the Y buffer,
+// sending any completed blocks off to be processed for quantization as they fill
 fn read_lum(f: std.fs.File, source_buff: [][][4]util.Block, target_buff: [][][4]util.BlockQuantized, io_buff: []u8, queue: *util.JobQueue) !void {
     const block_h = source_buff.len;
     const block_w = source_buff[0].len;
@@ -35,7 +40,9 @@ fn read_lum(f: std.fs.File, source_buff: [][][4]util.Block, target_buff: [][][4]util.BlockQuantized, io_buff: []u8, queue: *util.JobQueue) !void {
         }
     }
 }
-
+// either 2 separate funcs (see read_lum) or one func full of if statements; both options are bad anyway.
+// incrementally copies 8-byte sequences from the input buffer to the appropriate coords in the U or V buffer,
+// sending any completed blocks off to be processed for quantization as they fill
 fn read_chrom(f: std.fs.File, source_buff: [][]util.Block, target_buff: [][]util.BlockQuantized, io_buff: []u8, queue: *util.JobQueue) !void {
     const block_h = source_buff.len;
     const block_w = source_buff[0].len;
@@ -62,11 +69,15 @@ fn read_chrom(f: std.fs.File, source_buff: [][]util.Block, target_buff: [][]util.BlockQuantized, io_buff: []u8, queue: *util.JobQueue) !void {
 pub fn main_loop(f: std.fs.File, buffs: util.Buffers, thread_mgr: *threads.ThreadManager, alloc: std.mem.Allocator) !void {
     defer thread_mgr.quit();
     while (true) {
+        // resets the control atomics and resumes the quantizer threads
         thread_mgr.unblock();
         try read_lum(f, buffs.Y, buffs.Y_quant, buffs.input_buff, thread_mgr.queue_wrp.queue);
+        // U and V are downsampled, so they only need half-length input buffers
         try read_chrom(f, buffs.U, buffs.U_quant, buffs.input_buff[0 .. buffs.input_buff.len / 2], thread_mgr.queue_wrp.queue);
         try read_chrom(f, buffs.V, buffs.V_quant, buffs.input_buff[0 .. buffs.input_buff.len / 2], thread_mgr.queue_wrp.queue);
+        // wait until all blocks have been processed
         while (thread_mgr.signals.processed.load(.Acquire) != buffs.num_blocks) : (std.time.sleep(1)) {}
+        // sets the eof signal atomic, preventing quantizers from polling for jobs when none can be generated
         thread_mgr.eof();
         try output.generate_jpg(buffs, alloc);
     }
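The diff only shows the first coordinate of lum_idxs (i / 2), so the full mapping is cut off. Below is a minimal sketch of the kind of mapping the [macro_block][macro_block][intra_macro_block] comment describes, assuming the common row-major 2x2 layout; lum_idxs_sketch and its intra-block formula are illustrative guesses, not the project's code.

```zig
const std = @import("std");

// hypothetical version of the lum_idxs mapping: 8x8 block coords (i, j) map to
// (macroblock row, macroblock col, slot of the block inside its 2x2 macroblock)
fn lum_idxs_sketch(i: usize, j: usize) struct { usize, usize, usize } {
    return .{ i / 2, j / 2, (i % 2) * 2 + (j % 2) };
}

test "blocks land in 2x2 macroblocks" {
    // block (3, 2) belongs to macroblock (1, 1), bottom-left slot
    const idxs = lum_idxs_sketch(3, 2);
    try std.testing.expectEqual(@as(usize, 1), idxs.@"0");
    try std.testing.expectEqual(@as(usize, 1), idxs.@"1");
    try std.testing.expectEqual(@as(usize, 2), idxs.@"2");
}
```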
diff --git a/src/main.zig b/src/main.zig
index 1a47ebe..e28c58b 100644
--- a/src/main.zig
+++ b/src/main.zig
@@ -17,6 +17,7 @@ inline fn next_arg(args: *std.process.ArgIterator) ![:0]const u8 {
 inline fn get_dim(arg: [:0]const u8) !usize {
     const d = try std.fmt.parseInt(usize, arg, 10);
+    // forces image dimensions to be multiples of the 16x16 MCU size
     if (d % 16 != 0 or d == 0) {
         return InitError.InvalidDimension;
     }
@@ -25,6 +26,7 @@
 inline fn get_qual(arg: [:0]const u8) !f16 {
     const q = try std.fmt.parseFloat(f16, arg);
+    // quality factor for quant table generation, on a 0-1 scale
     if (q < 0 or q > 1) {
         return InitError.InvalidQuality;
     }
@@ -32,6 +34,7 @@
 }
 inline fn get_n_jobs(arg: [:0]const u8) !usize {
+    // number of threads for the multithreaded per-block dct computation
     const n = try std.fmt.parseInt(usize, arg, 10);
     if (n == 0) {
         return InitError.InvalidNumJobs;
@@ -40,6 +43,7 @@
 }
 fn get_opts() !util.Options {
+    // args can be discarded after parsing, so a mini fixed buffer is enough
     var buff: [100]u8 = undefined;
     var fba = std.heap.FixedBufferAllocator.init(&buff);
     var alloc = fba.allocator();
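The get_opts comment leans on the fact that argument strings are only needed until they are parsed into plain integers and floats. A standalone sketch of that pattern, assuming nothing about the real util.Options; the buffer size and the duplicated string are made up for illustration.

```zig
const std = @import("std");

pub fn main() !void {
    // back the allocator with a small stack buffer; everything allocated from
    // it dies with the frame, so no free bookkeeping is needed
    var buff: [100]u8 = undefined;
    var fba = std.heap.FixedBufferAllocator.init(&buff);
    const alloc = fba.allocator();

    // short-lived copy of an arg string, parsed and then forgotten
    const arg = try alloc.dupe(u8, "1920");
    const dim = try std.fmt.parseInt(usize, arg, 10);
    std.debug.assert(dim % 16 == 0); // same multiple-of-16 MCU check as get_dim
}
```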
diff --git a/src/output.zig b/src/output.zig
index 35cb73a..a8d08f6 100644
--- a/src/output.zig
+++ b/src/output.zig
@@ -45,11 +45,14 @@ const RLEWriter = struct {
         }
     }
 
+    // writes out the huffcode, plus the extra bits if size >= 1
     fn write_value(self: *Self, huffcode: HuffCode, unit: RLE_Unit) !void {
         try self.bw.write_bits(huffcode.value, huffcode.n_bits);
+        // negative values must be written as val - 1, with the same number of bits as the original
        const unit_val: u16 = if (unit.value >= 0) @bitCast(unit.value) else @bitCast(unit.value - 1);
         const value_size = unit.symbol & 0x0f;
         if (value_size != 0) {
+            // size-0 values only need the huffcode, no extra bits
             try self.bw.write_bits(unit_val & (try std.math.powi(u16, 2, value_size) - 1), @truncate(value_size));
         }
     }
@@ -91,10 +94,11 @@ const Scan = struct {
         }
     }
 
+    // operates on the full image, generating an RLE sequence and symbol frequencies,
+    // then gens the corresponding hufftables
     fn do_rle_freq_pass(self: *Self, buff: *const util.Buffers, f: std.fs.File) !void {
         const h = buff.Y_quant.len;
         const w = buff.Y_quant[0].len;
-
         for (0..4) |i| {
             @memset(self.freqs[i], 0);
         }
@@ -118,7 +122,11 @@
         }
     }
 
+    // dumps the scan in the order: Y macroblock (4 blocks), U block, V block.
+    // per-block tracking is needed since the first value must be interpreted as dc;
+    // a 0x00 dc symbol does not signal the next block, a 0x00 ac symbol does
    fn dump_scan(self: *Self, f: std.fs.File) !void {
+        // mostly hardcoded values
         _ = try f.write(&[_]u8{ 0xff, 0xda, 0x00, 0x0c, 0x03, 0x01, 0x00, 0x02, 0x11, 0x03, 0x11, 0x00, 0x3f, 0x00 });
         var bw = RLEWriter.init(f);
         var idxs = [3]usize{ 0, 0, 0 };
@@ -134,6 +142,8 @@
     }
 };
 
+// helper func needed for generating the huffman code:
+// gets the idxs of the 2 smallest non-zero values in the slice
 inline fn get_idx_min2(freqs: []u32) ?struct { usize, usize } {
     var min1: u32 = undefined;
     var min1_idx: usize = undefined;
@@ -172,6 +182,8 @@
     } else null;
 }
 
+// generates the code mapping from BITS and HUFFVAL,
+// see https://www.w3.org/Graphics/JPEG/itu-t81.pdf Annex C
 inline fn gen_codes(huff: *Huffman, huffman_meta: HuffmanMeta) !void {
     var huffsize = [_]u5{0} ** 256;
     var huffcode = [_]u16{0} ** 256;
@@ -217,10 +229,12 @@
     }
 }
 
+// generates BITS and HUFFVAL,
+// see https://www.w3.org/Graphics/JPEG/itu-t81.pdf Annex K
 inline fn gen_huffman(freqs: []u32) HuffmanMeta {
     freqs[freqs.len - 1] = 1;
     var codesizes = [_]u8{0} ** 257;
-    var others = [_]u9{0x1ff} ** 257;
+    var others = [_]u9{0x1ff} ** 257; // 0x1ff sentinel since -1 is not available and i8 wouldn't fit the range anyway
     while (get_idx_min2(freqs)) |tmp| {
         var v1 = tmp.@"0";
         var v2 = tmp.@"1";
@@ -276,6 +290,8 @@
     };
 }
 
+// returns the size field for a given value. tested both a switch and bit shifts;
+// the switch was somehow the fastest, slightly better than bit shifting
 inline fn get_size(n: i16) u8 {
     if (n == 0) {
         return 0;
@@ -306,6 +322,9 @@
     }
 }
 
+// for each block, processes the dc by differencing it, then performs RLE.
+// dc symbols are just the size; ac symbols carry the run length in the top 4 bits and the size in the bottom 4.
+// appends 0x00 at the end of each block.
 fn parse_block(block: *util.BlockQuantized, dc_diff: *i16, rle: *RLE_Seq, dc_freqs: []u32, ac_freqs: []u32) !void {
     const diff = block[0] - dc_diff.*;
     var symbol = get_size(diff);
@@ -347,12 +366,14 @@ fn write_headers(f: std.fs.File, buff: *const util.Buffers) !void {
     const w = buff.Y.len * 16;
     const h = buff.Y[0].len * 16;
 
+    // write out the SOI marker 0xff 0xd8 and both quant tables
     var out_buff = [6]u8{ 0xff, 0xd8, 0xff, 0xdb, 0x00, 0x84 } ++ [1]u8{0x00} ++ [_]u8{0x00} ** 64 ++ [1]u8{0x01} ++ [_]u8{0x00} ** 64;
     for (0..64) |i| {
         out_buff[7 + i] = @intFromFloat(buff.Q_Lum[i]);
         out_buff[7 + i + 65] = @intFromFloat(buff.Q_Chrom[i]);
     }
     _ = try f.write(&out_buff);
+    // write out the SOF block; everything except w and h is effectively fixed
     var sof_buff = [_]u8{ 0xff, 0xc0, 0x00, 0x11, 0x08, @truncate(w >> 8), @truncate(w & 0x00ff), @truncate(h >> 8), @truncate(h & 0x00ff), 0x03, 0x01, 0x22, 0x00, 0x02, 0x11, 0x01, 0x03, 0x11, 0x01,
@@ -361,6 +382,7 @@
 }
 
 fn dump_huffman(f: std.fs.File, huff_meta: HuffmanMeta, tree_i: usize) !void {
+    // write out the huffman table in its BITS and HUFFVAL representation
     const table_len = 3 + 16 + huff_meta.total_n;
     var out_buff = [5]u8{ 0xff, 0xc4, @truncate(table_len >> 8), @truncate(table_len & 0x00ff), @truncate(((tree_i % 2) << 4) | (tree_i / 2)) };
     _ = try f.write(&out_buff);
@@ -376,11 +398,8 @@ pub fn generate_jpg(buff: util.Buffers, alloc: std.mem.Allocator) !void {
     defer f.close();
     try write_headers(f, &buff);
 
+    // requires 2 passes: one for RLE and huffcode generation,
+    // a second to actually write out the data
     try scan_data.do_rle_freq_pass(&buff, f);
     try scan_data.dump_scan(f);
-
-    // rle, huffman pass
-    // file headers
-    // quant + huffman write
-    // write scan
 }
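The get_size and write_value comments together describe the standard JPEG coefficient coding rule: a value is sent as a size field plus `size` raw bits, and negative values are sent as val - 1 so they can be distinguished by their leading bit. A self-contained sketch of that rule; size_of uses a plain loop rather than the project's switch, purely for illustration.

```zig
const std = @import("std");

// size field = bit length of |n|; 0 has size 0 and carries no extra bits
fn size_of(n: i16) u8 {
    var mag: i32 = n;
    if (mag < 0) mag = -mag;
    var size: u8 = 0;
    while (mag != 0) : (mag >>= 1) size += 1;
    return size;
}

test "encoding of -3" {
    // |-3| needs 2 bits, and -3 is transmitted as the low 2 bits of (-3 - 1)
    try std.testing.expectEqual(@as(u8, 2), size_of(-3));
    const raw: u16 = @bitCast(@as(i16, -3 - 1));
    try std.testing.expectEqual(@as(u16, 0b00), raw & 0b11);
}
```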
diff --git a/src/threads.zig b/src/threads.zig
index ab7bff7..a96a84b 100644
--- a/src/threads.zig
+++ b/src/threads.zig
@@ -6,10 +6,11 @@ const transform = @import("transform.zig");
 const AtomicBool = std.atomic.Atomic(bool);
 const AtomicU32 = std.atomic.Atomic(u32);
 
+// atomic variables for cross-thread control
 const Signals = struct {
     quit: AtomicBool,
     processed: AtomicU32,
-    eof_block: AtomicU32,
+    eof_block: AtomicU32, // u32 rather than bool so it works with Futex wait
 
     const Self = @This();
@@ -22,6 +23,7 @@ const Signals = struct {
     }
 };
 
+// stores the jobqueue plus the pool backing its nodes
 const QueueWrap = struct {
     queue: *util.JobQueue,
     job_pool: util.JobPool,
@@ -73,6 +75,7 @@ pub const ThreadManager = struct {
     }
 
     pub fn quit(self: *Self) void {
+        // signal a quit and wait for all threads to exit
         self.signals.quit.store(true, .Release);
         self.unblock();
         for (self.threads.items) |thread| {
@@ -90,12 +93,15 @@ pub const ThreadManager = struct {
 };
 
 fn quantize_loop(queue: *util.JobQueue, signals: *Signals, Q_Lum: *util.QTable, Q_Chrom: *util.QTable) void {
+    // loop while jobs are available or quit has not been signalled
     while (queue.HasJobs() or !signals.quit.load(std.builtin.AtomicOrder.Acquire)) : (std.time.sleep(1)) {
-        const job = queue.pop() orelse continue;
-        transform.quantize(job.source, job.target, if (job.is_lum) Q_Lum else Q_Chrom);
-        _ = @atomicRmw(u32, &signals.processed.value, .Add, 1, .SeqCst);
+        // if eof has been signalled, wait until it no longer is
         if (signals.eof_block.load(.Acquire) == 1) {
             std.Thread.Futex.wait(&signals.eof_block, 1);
         }
+        const job = queue.pop() orelse continue; // the job may have been stolen by another thread since the check
+        transform.quantize(job.source, job.target, if (job.is_lum) Q_Lum else Q_Chrom);
+        // bump the processed counter now that the block is done
+        _ = @atomicRmw(u32, &signals.processed.value, .Add, 1, .SeqCst);
     }
 }
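The eof_block comments describe a Futex-based gate: close it while no jobs can be generated, reopen it and wake the sleepers when input resumes. A minimal sketch of that protocol using the same std.atomic.Atomic and std.Thread.Futex APIs the diff targets; gate and worker are illustrative stand-ins, not the project's code.

```zig
const std = @import("std");

var gate = std.atomic.Atomic(u32).init(0);

fn worker() void {
    // block while the gate is closed; recheck after every wake since
    // Futex.wait can return spuriously
    while (gate.load(.Acquire) == 1) {
        std.Thread.Futex.wait(&gate, 1);
    }
    // ... pop a job and quantize it here ...
}

pub fn main() !void {
    gate.store(1, .Release); // close the gate (eof)
    const t = try std.Thread.spawn(.{}, worker, .{});
    std.time.sleep(10 * std.time.ns_per_ms);
    gate.store(0, .Release); // reopen it
    std.Thread.Futex.wake(&gate, std.math.maxInt(u32));
    t.join();
}
```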
diff --git a/src/transform.zig b/src/transform.zig
index 6d539c2..c5b0517 100644
--- a/src/transform.zig
+++ b/src/transform.zig
@@ -2,6 +2,7 @@ const std = @import("std");
 const util = @import("util.zig");
 
+// pre-computes the dct coefficients at comptime
 pub const dct_coeffs = gen_coeffs();
 
 inline fn dct_cos(x: usize, f: usize) f16 {
@@ -14,6 +15,7 @@ inline fn dct_coeff(u: usize, v: usize) f16 {
     return 0.25 * (if (u == 0) 1.0 / @sqrt(2.0) else 1.0) * (if (v == 0) 1.0 / @sqrt(2.0) else 1.0);
 }
 
+// helper function to convert the given u and v to their zigzag equivalents
 inline fn zz_conv(u: usize, v: usize) struct { u: usize, v: usize } {
     var band_i = u + v;
     const band_max = @min(7, band_i);
@@ -36,6 +38,10 @@ inline fn zz_band_len(band_i: usize) usize {
     return if (band_i < 8) band_i + 1 else 15 - band_i;
 }
 
+// generates an [8][8][8 * 8] set of dct coeff values that
+// can be directly multiplied and summed to get a dct value.
+// values are stored in zig-zagged order, so there is no need to rearrange
+// at runtime
 fn gen_coeffs() [8][8]@Vector(64, f16) {
     @setEvalBranchQuota(100000);
     var ret: [8][8]@Vector(64, f16) = undefined;
@@ -53,6 +59,13 @@ fn gen_coeffs() [8][8]@Vector(64, f16) {
     return ret;
 }
 
+// performs the JPEG Type II DCT,
+// with SIMD vector optimizations applied if the target supports them:
+// load the source block bytes as floats,
+// shift them down by 128,
+// for each target coord in the dct, multiply the shifted values with the corresponding coeff vector, then sum and store,
+// divide the dct by the quant table values,
+// store the result as i16 in target
 pub fn quantize(source: *util.Block, target: *util.BlockQuantized, qtable: *util.QTable) void {
     var source_holder: @Vector(64, f16) = undefined;
     var dct_holder: @Vector(64, f16) = undefined;
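The quantize comment lists the per-coefficient steps. Below is a sketch of the multiply-sum-divide core for a single coefficient, under the assumption that each precomputed vector in dct_coeffs is one (zig-zagged) DCT basis; the names here are stand-ins for the real buffers, not the project's code.

```zig
const std = @import("std");

// one output coefficient: elementwise product with a basis vector, reduced to
// a single dct value, then divided by the matching qtable entry
fn dct_one(shifted: @Vector(64, f16), basis: @Vector(64, f16), q: f16) i16 {
    const dct_val = @reduce(.Add, shifted * basis);
    return @intFromFloat(dct_val / q);
}

test "dc of a flat block" {
    // every entry of the u = v = 0 basis is 0.25 * (1/sqrt(2))^2 = 0.125,
    // so a flat block of 4s gives 4 * 0.125 * 64 = 32, quantized by 16 to 2
    const flat: @Vector(64, f16) = @splat(4.0);
    const dc_basis: @Vector(64, f16) = @splat(0.125);
    try std.testing.expectEqual(@as(i16, 2), dct_one(flat, dc_basis, 16.0));
}
```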
diff --git a/src/util.zig b/src/util.zig
index ddb5b2e..5de3159 100644
--- a/src/util.zig
+++ b/src/util.zig
@@ -21,6 +21,10 @@ pub const Job = struct {
     target: *BlockQuantized,
     is_lum: bool,
 };
+
+// rip std.atomic.Queue
+// simple Mutex wrapper around a TailQueue,
+// also handles node storage in a MemoryPool
 pub const JobQueue = struct {
     const List = std.TailQueue(Job);
@@ -64,6 +68,8 @@ pub const JobQueue = struct {
 };
 pub const JobPool = std.heap.MemoryPool(JobQueue.List.Node);
 
+// main input buffers plus the qtables.
+// Y is stored with an extra 2x2 block grouping to make looping significantly less complicated
 pub const Buffers = struct {
     arena: std.heap.ArenaAllocator,
@@ -103,8 +109,10 @@ pub const Buffers = struct {
         .U_quant = try alloc.alloc([]BlockQuantized, block_h),
         .V_quant = try alloc.alloc([]BlockQuantized, block_h),
 
+        // h*w Y pixels, (h/2)*(w/2) U and V pixels, 64 pixels per block
         .num_blocks = @truncate(w * h * 3 / 2 / 64),
+        // read a full block row at a time
         .input_buff = try alloc.alloc(u8, w * 8),
     };
     for (0..block_h) |i| {
@@ -126,6 +134,10 @@ pub const Buffers = struct {
     }
 };
 
+// simplistic qtable generation:
+// assuming each higher frequency band gets less and less important,
+// simply factor that out by increasing the q-value per band.
+// makes the generation -> zig-zag step simpler since it can all be done in one pass
 pub fn gen_qtable(q: f16, step_start_band: usize, step_stop_band: usize) @Vector(64, f16) {
     var ret: @Vector(64, f16) = [_]f32{0.0} ** 64;
     const band_range = step_stop_band - step_start_band;
@@ -149,6 +161,8 @@ pub fn gen_qtable(q: f16, step_start_band: usize, step_stop_band: usize) @Vector(64, f16) {
     return ret;
 }
 
+// bit writer for the scan data; adds buffering with byte stuffing,
+// since std's BitWriter and BufferedWriter could be combined but cannot stuff bytes
 pub const BufferedBitWriter = struct {
     byte_buff: u8,
     bits_used: u4,
@@ -168,6 +182,9 @@ pub const BufferedBitWriter = struct {
     }
 
+    // check if the space available is >= the space needed:
+    // if yes, just shove in the bits,
+    // else shove in what fits and recurse with the remainder
     pub fn write_bits(self: *Self, val: u16, n_bits: u5) !void {
         const curr_byte_space = 8 - self.bits_used;
         if (n_bits <= curr_byte_space) {
@@ -179,17 +196,25 @@ pub const BufferedBitWriter = struct {
     }
 
+    // since write_bits handles alignment and any excess bits, adding here can only
+    // ever result in a full byte or a partial one, never an overflow into the next
     inline fn add_bits(self: *Self, val: u16, n_bits: u4) !void {
         self.byte_buff |= @truncate(val << (8 - self.bits_used - n_bits));
         self.bits_used += n_bits;
         if (self.bits_used == 8) {
+            // emit the value
             self.buffer[self.buffer_idx] = self.byte_buff;
             if (self.byte_buff == 0xff) {
+                // for byte stuffing, simply skip the next idx to get 0xff 0x00
                 self.buffer_idx += 1;
             }
             self.buffer_idx += 1;
             self.bits_used = 0;
             self.byte_buff = 0x00;
+
+            // if the end of the buffer was reached, simply flush and wrap around;
+            // the extra stuffing increment is preserved even if 0xff was
+            // the last value
            if (self.buffer_idx >= self.buffer.len) {
                 try self.flush();
                 self.buffer_idx %= self.buffer.len;
@@ -202,6 +227,8 @@ pub const BufferedBitWriter = struct {
         @memset(&self.buffer, 0);
     }
 
+    // special flush case for the end of the stream: requires a partial flush and
+    // padding the last byte out with 1s
     pub fn flush_end(self: *Self) !void {
         if (self.bits_used != 0) {
             const stuffing = try std.math.powi(u8, 2, 8 - self.bits_used) - 1;
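The BufferedBitWriter comments hinge on JPEG byte stuffing: inside entropy-coded data, every emitted 0xff must be followed by a 0x00 so decoders do not mistake it for a marker. A standalone sketch of just that rule, decoupled from the bit packing; stuff_bytes is illustrative, not the project's API.

```zig
const std = @import("std");

// copies raw into out, inserting a 0x00 after every 0xff;
// returns the number of bytes written. out must be large enough
fn stuff_bytes(raw: []const u8, out: []u8) usize {
    var n: usize = 0;
    for (raw) |byte| {
        out[n] = byte;
        n += 1;
        if (byte == 0xff) {
            out[n] = 0x00; // stuffed zero
            n += 1;
        }
    }
    return n;
}

test "0xff gets a stuffed 0x00" {
    var out: [8]u8 = undefined;
    const n = stuff_bytes(&[_]u8{ 0x12, 0xff, 0x34 }, &out);
    try std.testing.expectEqualSlices(u8, &[_]u8{ 0x12, 0xff, 0x00, 0x34 }, out[0..n]);
}
```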