Muaz Ahmad 2023-12-12 16:02:12 +05:00
parent 14a95f0c18
commit 288a2bcc2d
6 changed files with 92 additions and 12 deletions

View file

@ -4,6 +4,8 @@ const util = @import("util.zig");
const threads = @import("threads.zig");
const output = @import("output.zig");
// helper func: computes the appropriate coords for a luminance block
// [macro_block_row][macro_block_col][intra_macro_block]
inline fn lum_idxs(i: usize, j: usize) struct { usize, usize, usize } {
return .{
i / 2,
@ -12,6 +14,9 @@ inline fn lum_idxs(i: usize, j: usize) struct { usize, usize, usize } {
};
}
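The hunk elides the last two returned coordinates; a minimal sketch of the full mapping, assuming row-major 2x2 ordering inside each macroblock (the j / 2 and intra-index lines are assumptions, not shown in this diff):

inline fn lum_idxs_sketch(i: usize, j: usize) struct { usize, usize, usize } {
    return .{
        i / 2, // macro_block row
        j / 2, // macro_block column (assumed)
        (i % 2) * 2 + (j % 2), // intra_macro_block index, 0..3 (assumed)
    };
}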
// either 2 separate funcs (see read_chrom) or multiple if statements; both options are bad anyway
// incrementally copies 8-byte sequences from the input buffer to the appropriate coords in the Y buffer,
// sending any completed blocks off to be quantized as they fill
fn read_lum(f: std.fs.File, source_buff: [][][4]util.Block, target_buff: [][][4]util.BlockQuantized, io_buff: []u8, queue: *util.JobQueue) !void {
const block_h = source_buff.len;
const block_w = source_buff[0].len;
@ -35,7 +40,9 @@ fn read_lum(f: std.fs.File, source_buff: [][][4]util.Block, target_buff: [][][4]
}
}
}
// either 2 separate funcs (see read_lum) or multiple if statements; both options are bad anyway
// incrementally copies 8-byte sequences from the input buffer to the appropriate coords in the U or V buffer,
// sending any completed blocks off to be quantized as they fill
fn read_chrom(f: std.fs.File, source_buff: [][]util.Block, target_buff: [][]util.BlockQuantized, io_buff: []u8, queue: *util.JobQueue) !void {
const block_h = source_buff.len;
const block_w = source_buff[0].len;
@ -62,11 +69,15 @@ fn read_chrom(f: std.fs.File, source_buff: [][]util.Block, target_buff: [][]util
pub fn main_loop(f: std.fs.File, buffs: util.Buffers, thread_mgr: *threads.ThreadManager, alloc: std.mem.Allocator) !void {
defer thread_mgr.quit();
while (true) {
// resets control atomic variables, resumes quantizers
thread_mgr.unblock();
try read_lum(f, buffs.Y, buffs.Y_quant, buffs.input_buff, thread_mgr.queue_wrp.queue);
// U and V are downsampled, so they use half-length buffers
try read_chrom(f, buffs.U, buffs.U_quant, buffs.input_buff[0 .. buffs.input_buff.len / 2], thread_mgr.queue_wrp.queue);
try read_chrom(f, buffs.V, buffs.V_quant, buffs.input_buff[0 .. buffs.input_buff.len / 2], thread_mgr.queue_wrp.queue);
// wait until all blocks have been processed
while (thread_mgr.signals.processed.load(.Acquire) != buffs.num_blocks) : (std.time.sleep(1)) {}
// sets the eof signal atomic variable so quantizers stop polling for jobs when none can be generated
thread_mgr.eof();
try output.generate_jpg(buffs, alloc);
}
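main_loop drives the quantizers through unblock() and eof(); their bodies fall outside this diff, so the following is only a sketch of the assumed Futex pairing (field names taken from the Signals struct in threads.zig):

// sketch: reset control atomics and wake any quantizers parked in Futex.wait
pub fn unblock(self: *Self) void {
    self.signals.processed.store(0, .Release);
    self.signals.eof_block.store(0, .Release);
    std.Thread.Futex.wake(&self.signals.eof_block, std.math.maxInt(u32));
}
// sketch: signal eof so quantizers block instead of polling an empty queue
pub fn eof(self: *Self) void {
    self.signals.eof_block.store(1, .Release);
}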

View file

@ -17,6 +17,7 @@ inline fn next_arg(args: *std.process.ArgIterator) ![:0]const u8 {
inline fn get_dim(arg: [:0]const u8) !usize {
const d = try std.fmt.parseInt(usize, arg, 10);
// forces image dimensions to be multiples of the 16x16 MCU size
if (d % 16 != 0 or d == 0) {
return InitError.InvalidDimension;
}
@ -25,6 +26,7 @@ inline fn get_dim(arg: [:0]const u8) !usize {
inline fn get_qual(arg: [:0]const u8) !f16 {
const q = try std.fmt.parseFloat(f16, arg);
// quality factor for quant table generation, on a 0-1 scale
if (q < 0 or q > 1) {
return InitError.InvalidQuality;
}
@ -32,6 +34,7 @@ inline fn get_qual(arg: [:0]const u8) !f16 {
}
inline fn get_n_jobs(arg: [:0]const u8) !usize {
// for multithreaded dct computation per block
const n = try std.fmt.parseInt(usize, arg, 10);
if (n == 0) {
return InitError.InvalidNumJobs;
@ -40,6 +43,7 @@ inline fn get_n_jobs(arg: [:0]const u8) !usize {
}
fn get_opts() !util.Options {
// args are only needed during parsing, so a small fixed buffer suffices
var buff: [100]u8 = undefined;
var fba = std.heap.FixedBufferAllocator.init(&buff);
var alloc = fba.allocator();

View file

@ -45,11 +45,14 @@ const RLEWriter = struct {
}
}
// must write out the huffcode and extra bits if size >= 1
fn write_value(self: *Self, huffcode: HuffCode, unit: RLE_Unit) !void {
try self.bw.write_bits(huffcode.value, huffcode.n_bits);
// negative values must be written as val - 1, using the same number of bits as the original
const unit_val: u16 = if (unit.value >= 0) @bitCast(unit.value) else @bitCast(unit.value - 1);
const value_size = unit.symbol & 0x0f;
if (value_size != 0) {
// 0 values only need the huffcode, no extra bits
try self.bw.write_bits(unit_val & (try std.math.powi(u16, 2, value_size) - 1), @truncate(value_size));
}
}
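A worked example of the magnitude coding above: -3 has size 2 and is written as the low 2 bits of (-3 - 1) = -4, i.e. 0b00, while +3 writes 0b11; a decoder seeing a leading 0 bit knows the value is negative. A minimal check:

test "negative values encode as val - 1" {
    const v: i16 = -3;
    const coded: u16 = @bitCast(v - 1);
    try std.testing.expectEqual(@as(u16, 0b00), coded & 0b11);
}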
@ -91,10 +94,11 @@ const Scan = struct {
}
}
// operates on the full image, generating an RLE sequence and symbol frequencies,
// then generates the corresponding hufftables
fn do_rle_freq_pass(self: *Self, buff: *const util.Buffers, f: std.fs.File) !void {
const h = buff.Y_quant.len;
const w = buff.Y_quant[0].len;
for (0..4) |i| {
@memset(self.freqs[i], 0);
}
@ -118,7 +122,11 @@ const Scan = struct {
}
}
// dumps the scan in the order: Y macroblock (4 blocks), U block, V block
// position must be tracked per block since the first value must be interpreted as dc;
// a 0x00 dc symbol does not signal the next block, but a 0x00 ac symbol does
fn dump_scan(self: *Self, f: std.fs.File) !void {
// mostly hardcoded values
_ = try f.write(&[_]u8{ 0xff, 0xda, 0x00, 0x0c, 0x03, 0x01, 0x00, 0x02, 0x11, 0x03, 0x11, 0x00, 0x3f, 0x00 });
var bw = RLEWriter.init(f);
var idxs = [3]usize{ 0, 0, 0 };
@ -134,6 +142,8 @@ const Scan = struct {
}
};
// helper func needed for generating the huffman code
// gets the indices of the 2 smallest non-zero values in the slice.
inline fn get_idx_min2(freqs: []u32) ?struct { usize, usize } {
var min1: u32 = undefined;
var min1_idx: usize = undefined;
@ -172,6 +182,8 @@ inline fn get_idx_min2(freqs: []u32) ?struct { usize, usize } {
} else null;
}
// generates the code mapping from BITS and HUFFVAL
// see https://www.w3.org/Graphics/JPEG/itu-t81.pdf, Annex C
inline fn gen_codes(huff: *Huffman, huffman_meta: HuffmanMeta) !void {
var huffsize = [_]u5{0} ** 256;
var huffcode = [_]u16{0} ** 256;
@ -217,10 +229,12 @@ inline fn gen_codes(huff: *Huffman, huffman_meta: HuffmanMeta) !void {
}
}
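A toy worked example of the Annex C procedure above, assuming BITS counts of {0, 2, 3} (no codes of length 1, two of length 2, three of length 3):

// HUFFSIZE = 2, 2, 3, 3, 3
// HUFFCODE = 00, 01, 100, 101, 110
// codes of equal length are consecutive integers; moving to the next
// length left-shifts the counter, keeping the code set prefix-free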
// generates BITS and HUFFVAL
// see https://www.w3.org/Graphics/JPEG/itu-t81.pdf, Annex K
inline fn gen_huffman(freqs: []u32) HuffmanMeta {
freqs[freqs.len - 1] = 1;
var codesizes = [_]u8{0} ** 257;
var others = [_]u9{0x1ff} ** 257; // 0x1ff sentinel since -1 is not available and i8 wouldn't fit anyway
while (get_idx_min2(freqs)) |tmp| {
var v1 = tmp.@"0";
var v2 = tmp.@"1";
@ -276,6 +290,8 @@ inline fn gen_huffman(freqs: []u32) HuffmanMeta {
};
}
// returns the size field for a given value. tested switch and bitshift versions;
// this was somehow the fastest, slightly better than bit shifting (a bitshift variant is sketched below)
inline fn get_size(n: i16) u8 {
if (n == 0) {
return 0;
@ -306,6 +322,9 @@ inline fn get_size(n: i16) u8 {
}
}
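The bitshift variant the comment above mentions might look like the following; a sketch only, assuming the 0.11-era std.math.absCast, not the code path the encoder actually uses:

inline fn get_size_clz(n: i16) u8 {
    // size = bit width of |n|; @clz counts leading zeros of the u16 magnitude
    if (n == 0) return 0;
    return 16 - @as(u8, @clz(std.math.absCast(n)));
}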
// for each block, process the dc by differencing it against the previous dc, then perform RLE.
// dc symbols are just the size; ac symbols pack the run length in the top 4 bits and the size in the bottom 4.
// a 0x00 symbol is appended at the end of each block.
fn parse_block(block: *util.BlockQuantized, dc_diff: *i16, rle: *RLE_Seq, dc_freqs: []u32, ac_freqs: []u32) !void {
const diff = block[0] - dc_diff.*;
var symbol = get_size(diff);
@ -347,12 +366,14 @@ fn write_headers(f: std.fs.File, buff: *const util.Buffers) !void {
const w = buff.Y.len * 16;
const h = buff.Y[0].len * 16;
// write out the magic marker 0xff 0xd8 and both quant tables
var out_buff = [6]u8{ 0xff, 0xd8, 0xff, 0xdb, 0x00, 0x84 } ++ [1]u8{0x00} ++ [_]u8{0x00} ** 64 ++ [1]u8{0x01} ++ [_]u8{0x00} ** 64;
for (0..64) |i| {
out_buff[7 + i] = @intFromFloat(buff.Q_Lum[i]);
out_buff[7 + i + 65] = @intFromFloat(buff.Q_Chrom[i]);
}
_ = try f.write(&out_buff);
// write out the SOF block; everything except w and h is largely irrelevant
var sof_buff = [_]u8{
0xff, 0xc0, 0x00, 0x11, 0x08, @truncate(w >> 8), @truncate(w & 0x00ff), @truncate(h >> 8), @truncate(h & 0x00ff), 0x03,
0x01, 0x22, 0x00, 0x02, 0x11, 0x01, 0x03, 0x11, 0x01,
@ -361,6 +382,7 @@ fn write_headers(f: std.fs.File, buff: *const util.Buffers) !void {
}
fn dump_huffman(f: std.fs.File, huff_meta: HuffmanMeta, tree_i: usize) !void {
// write out the huffman table's BITS and HUFFVAL representation
const table_len = 3 + 16 + huff_meta.total_n;
var out_buff = [5]u8{ 0xff, 0xc4, @truncate(table_len >> 8), @truncate(table_len & 0x00ff), @truncate(((tree_i % 2) << 4) | (tree_i / 2)) };
_ = try f.write(&out_buff);
@ -376,11 +398,8 @@ pub fn generate_jpg(buff: util.Buffers, alloc: std.mem.Allocator) !void {
defer f.close();
try write_headers(f, &buff);
// requires 2 passes: one for RLE and huffcode generation,
// a second to actually write out the data
try scan_data.do_rle_freq_pass(&buff, f);
try scan_data.dump_scan(f);
}

View file

@ -6,10 +6,11 @@ const transform = @import("transform.zig");
const AtomicBool = std.atomic.Atomic(bool);
const AtomicU32 = std.atomic.Atomic(u32);
// atomic variables for cross-thread control
const Signals = struct {
quit: AtomicBool,
processed: AtomicU32,
eof_block: AtomicU32, // u32 rather than bool so it can be used with Futex wait
const Self = @This();
@ -22,6 +23,7 @@ const Signals = struct {
}
};
// wraps the job queue and the memory pool backing its nodes
const QueueWrap = struct {
queue: *util.JobQueue,
job_pool: util.JobPool,
@ -73,6 +75,7 @@ pub const ThreadManager = struct {
}
pub fn quit(self: *Self) void {
// signal a quit and wait for threads to exit
self.signals.quit.store(true, .Release);
self.unblock();
for (self.threads.items) |thread| {
@ -90,12 +93,15 @@ pub const ThreadManager = struct {
};
fn quantize_loop(queue: *util.JobQueue, signals: *Signals, Q_Lum: *util.QTable, Q_Chrom: *util.QTable) void {
// loop while jobs are available or quit has not been signalled
while (queue.HasJobs() or !signals.quit.load(std.builtin.AtomicOrder.Acquire)) : (std.time.sleep(1)) {
// if eof has been signalled, wait until it isn't
if (signals.eof_block.load(.Acquire) == 1) {
std.Thread.Futex.wait(&signals.eof_block, 1);
}
const job = queue.pop() orelse continue; // null if the job was stolen by another thread between the check and the pop
transform.quantize(job.source, job.target, if (job.is_lum) Q_Lum else Q_Chrom);
// increment processed var since block now processed
_ = @atomicRmw(u32, &signals.processed.value, .Add, 1, .SeqCst);
}
}
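The raw @atomicRmw above should be equivalent to going through the Atomic wrapper's own method; the same increment expressed via the std.atomic API:

_ = signals.processed.fetchAdd(1, .SeqCst);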

View file

@ -2,6 +2,7 @@ const std = @import("std");
const util = @import("util.zig");
// pre-computes coefficients at comptime
pub const dct_coeffs = gen_coeffs();
inline fn dct_cos(x: usize, f: usize) f16 {
@ -14,6 +15,7 @@ inline fn dct_coeff(u: usize, v: usize) f16 {
return 0.25 * (if (u == 0) 1.0 / @sqrt(2.0) else 1.0) * (if (v == 0) 1.0 / @sqrt(2.0) else 1.0);
}
// helper function to convert given u and v to the zigzag equivalents
inline fn zz_conv(u: usize, v: usize) struct { u: usize, v: usize } {
var band_i = u + v;
const band_max = @min(7, band_i);
@ -36,6 +38,10 @@ inline fn zz_band_len(band_i: usize) usize {
return if (band_i < 8) band_i + 1 else 15 - band_i;
}
// generates an [8][8][8 * 8] set of dct coeff values that
// can be directly multiplied and summed to get a dct value.
// values are stored in zig-zagged order, so there is no need to rearrange
// at runtime
fn gen_coeffs() [8][8]@Vector(64, f16) {
@setEvalBranchQuota(100000);
var ret: [8][8]@Vector(64, f16) = undefined;
@ -53,6 +59,13 @@ fn gen_coeffs() [8][8]@Vector(64, f16) {
return ret;
}
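For reference, each precomputed vector above packs the constants of the 2D Type-II DCT so that quantize can reduce each (u, v) term to a single dot product over the 64 samples:

S(u,v) = \frac{1}{4}\,\alpha(u)\,\alpha(v)\sum_{x=0}^{7}\sum_{y=0}^{7} s(x,y)\cos\frac{(2x+1)u\pi}{16}\cos\frac{(2y+1)v\pi}{16},
\qquad \alpha(k) = \begin{cases}1/\sqrt{2} & k = 0 \\ 1 & \text{otherwise}\end{cases}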
// performs the JPEG Type II DCT.
// SIMD Vector optimizations are applied if the target supports them:
// load the source block bytes as floats
// shift down by 128
// for each target coord in the dct, multiply the shifted values with the corresponding coeff vector, then add and store
// divide the dct by the quant table values
// store the result as i16 in the target
pub fn quantize(source: *util.Block, target: *util.BlockQuantized, qtable: *util.QTable) void {
var source_holder: @Vector(64, f16) = undefined;
var dct_holder: @Vector(64, f16) = undefined;

View file

@ -21,6 +21,10 @@ pub const Job = struct {
target: *BlockQuantized,
is_lum: bool,
};
// rip std.atomic.Queue
// simple Mutex wrapper around TailQueue,
// also handles node storage in a MemoryPool
pub const JobQueue = struct {
const List = std.TailQueue(Job);
@ -64,6 +68,8 @@ pub const JobQueue = struct {
};
pub const JobPool = std.heap.MemoryPool(JobQueue.List.Node);
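The JobQueue internals are elided from this hunk; a minimal sketch of the Mutex-wrapped TailQueue pop described above, with assumed field names (mutex, list, pool):

fn pop_sketch(self: *JobQueue) ?Job {
    self.mutex.lock();
    defer self.mutex.unlock();
    const node = self.list.popFirst() orelse return null;
    defer self.pool.destroy(node); // node memory returns to the pool
    return node.data;
}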
// main input buffers, plus the Q tables.
// Y is stored with an extra 2x2-block dimension to make looping significantly less complicated
pub const Buffers = struct {
arena: std.heap.ArenaAllocator,
@ -103,8 +109,10 @@ pub const Buffers = struct {
.U_quant = try alloc.alloc([]BlockQuantized, block_h),
.V_quant = try alloc.alloc([]BlockQuantized, block_h),
// h*w Y pixels, (h/2)*(w/2) U, V pixels, 64 pixels per block
.num_blocks = @truncate(w * h * 3 / 2 / 64),
// read full block row at a time
.input_buff = try alloc.alloc(u8, w * 8),
};
for (0..block_h) |i| {
@ -126,6 +134,10 @@ pub const Buffers = struct {
}
};
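A worked example of the num_blocks arithmetic above, for a hypothetical 1920x1088 input: Y contributes 1920*1088 pixels, U and V contribute 960*544 each, so 1920*1088*3/2 = 3133440 pixels total, or 48960 8x8 blocks:

test "num_blocks arithmetic" {
    const w: u32 = 1920;
    const h: u32 = 1088;
    try std.testing.expectEqual(@as(u32, 48960), w * h * 3 / 2 / 64);
}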
// simplistic qtable generation:
// assuming each higher frequency band matters less and less,
// simply factor that out by increasing the q-value per band.
// this also makes generation -> zig-zag simpler, since it can be done in one step
pub fn gen_qtable(q: f16, step_start_band: usize, step_stop_band: usize) @Vector(64, f16) {
var ret: @Vector(64, f16) = [_]f32{0.0} ** 64;
const band_range = step_stop_band - step_start_band;
@ -149,6 +161,8 @@ pub fn gen_qtable(q: f16, step_start_band: usize, step_stop_band: usize) @Vector
return ret;
}
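A hypothetical usage sketch; the real call site and band arguments are not part of this diff:

// illustrative values only: ramp the q-value across zig-zag bands 1 through 10
const q_lum: @Vector(64, f16) = gen_qtable(0.5, 1, 10);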
// bit writer for scan data; adds buffering with byte stuffing,
// since a BitWriter and a BufferedWriter can be combined but bytes cannot be stuffed
pub const BufferedBitWriter = struct {
byte_buff: u8,
bits_used: u4,
@ -168,6 +182,9 @@ pub const BufferedBitWriter = struct {
};
}
// check if the space available is >= the space needed:
// if yes, just shove in the bits,
// else shove in the bits that fit and recurse with the remainder
pub fn write_bits(self: *Self, val: u16, n_bits: u5) !void {
const curr_byte_space = 8 - self.bits_used;
if (n_bits <= curr_byte_space) {
@ -179,17 +196,25 @@ pub const BufferedBitWriter = struct {
}
}
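A worked example of the split above: with 3 bits already used there are 5 bits of space, so writing an 11-bit value first adds its top 5 bits, then recurses with the remaining 6:

// try bw.write_bits(0b10110_011101, 11);
// add_bits takes 0b10110 (fills the current byte), recursion writes 0b011101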
// since write_bits handles alignment and extra values, adding here can only
// ever result in a full byte or a partial one, never overflow into the next
inline fn add_bits(self: *Self, val: u16, n_bits: u4) !void {
self.byte_buff |= @truncate(val << (8 - self.bits_used - n_bits));
self.bits_used += n_bits;
if (self.bits_used == 8) {
// emit value
self.buffer[self.buffer_idx] = self.byte_buff;
if (self.byte_buff == 0xff) {
// for byte stuffing, simply skip the next idx to get 0xff 0x00
self.buffer_idx += 1;
}
self.buffer_idx += 1;
self.bits_used = 0;
self.byte_buff = 0x00;
// if the end of the buffer was reached, simply flush and wrap around;
// the extra stuffing increment must be preserved even if 0xff was
// the last value
if (self.buffer_idx >= self.buffer.len) {
try self.flush();
self.buffer_idx %= self.buffer.len;
@ -202,6 +227,8 @@ pub const BufferedBitWriter = struct {
@memset(&self.buffer, 0);
}
// special flush case when ending: requires a partial flush and padding the
// last byte with 1s
pub fn flush_end(self: *Self) !void {
if (self.bits_used != 0) {
const stuffing = try std.math.powi(u8, 2, 8 - self.bits_used) - 1;