Muaz Ahmad 2023-12-12 16:02:12 +05:00
parent 14a95f0c18
commit 288a2bcc2d
6 changed files with 92 additions and 12 deletions

View file

@ -4,6 +4,8 @@ const util = @import("util.zig");
const threads = @import("threads.zig");
const output = @import("output.zig");
// helper func: computes the appropriate coords for a luminance block
// [macro_block_row][macro_block_col][intra_macro_block]
inline fn lum_idxs(i: usize, j: usize) struct { usize, usize, usize } {
return .{
i / 2,
@ -12,6 +14,9 @@ inline fn lum_idxs(i: usize, j: usize) struct { usize, usize, usize } {
};
}
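The hunk elides the last two returned coordinates; a minimal sketch of the full mapping, assuming row-major 2x2 ordering inside each macroblock (the j / 2 and intra-index lines are assumptions, not shown in this diff):

inline fn lum_idxs_sketch(i: usize, j: usize) struct { usize, usize, usize } {
    return .{
        i / 2, // macro_block row
        j / 2, // macro_block column (assumed)
        (i % 2) * 2 + (j % 2), // intra_macro_block index, 0..3 (assumed)
    };
}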
// either 2 separate funcs (see read_chrom) or multiple if statements; both options are bad anyway
// incrementally copies 8-byte sequences from the input buffer to the appropriate coords in the Y buffer,
// sending any completed blocks off to be quantized as they fill
fn read_lum(f: std.fs.File, source_buff: [][][4]util.Block, target_buff: [][][4]util.BlockQuantized, io_buff: []u8, queue: *util.JobQueue) !void {
const block_h = source_buff.len;
const block_w = source_buff[0].len;
@ -35,7 +40,9 @@ fn read_lum(f: std.fs.File, source_buff: [][][4]util.Block, target_buff: [][][4]
}
}
}
// either 2 separate funcs (see read_lum) or multiple if statements; both options are bad anyway
// incrementally copies 8-byte sequences from the input buffer to the appropriate coords in the U or V buffer,
// sending any completed blocks off to be quantized as they fill
fn read_chrom(f: std.fs.File, source_buff: [][]util.Block, target_buff: [][]util.BlockQuantized, io_buff: []u8, queue: *util.JobQueue) !void {
const block_h = source_buff.len;
const block_w = source_buff[0].len;
@ -62,11 +69,15 @@ fn read_chrom(f: std.fs.File, source_buff: [][]util.Block, target_buff: [][]util
pub fn main_loop(f: std.fs.File, buffs: util.Buffers, thread_mgr: *threads.ThreadManager, alloc: std.mem.Allocator) !void {
defer thread_mgr.quit();
while (true) {
// resets control atomic variables, resumes quantizers
thread_mgr.unblock();
try read_lum(f, buffs.Y, buffs.Y_quant, buffs.input_buff, thread_mgr.queue_wrp.queue);
// U and V are downsampled, so they use half-length buffers
try read_chrom(f, buffs.U, buffs.U_quant, buffs.input_buff[0 .. buffs.input_buff.len / 2], thread_mgr.queue_wrp.queue);
try read_chrom(f, buffs.V, buffs.V_quant, buffs.input_buff[0 .. buffs.input_buff.len / 2], thread_mgr.queue_wrp.queue);
// wait until all blocks have been processed
while (thread_mgr.signals.processed.load(.Acquire) != buffs.num_blocks) : (std.time.sleep(1)) {}
// sets the eof signal atomic variable so quantizers stop polling for jobs when none can be generated
thread_mgr.eof();
try output.generate_jpg(buffs, alloc);
}
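main_loop drives the quantizers through unblock() and eof(); their bodies fall outside this diff, so the following is only a sketch of the assumed Futex pairing (field names taken from the Signals struct in threads.zig):

// sketch: reset control atomics and wake any quantizers parked in Futex.wait
pub fn unblock(self: *Self) void {
    self.signals.processed.store(0, .Release);
    self.signals.eof_block.store(0, .Release);
    std.Thread.Futex.wake(&self.signals.eof_block, std.math.maxInt(u32));
}
// sketch: signal eof so quantizers block instead of polling an empty queue
pub fn eof(self: *Self) void {
    self.signals.eof_block.store(1, .Release);
}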

View file

@ -17,6 +17,7 @@ inline fn next_arg(args: *std.process.ArgIterator) ![:0]const u8 {
inline fn get_dim(arg: [:0]const u8) !usize {
const d = try std.fmt.parseInt(usize, arg, 10);
// forces image dimensions to be multiples of the 16x16 MCU size
if (d % 16 != 0 or d == 0) {
return InitError.InvalidDimension;
}
@ -25,6 +26,7 @@ inline fn get_dim(arg: [:0]const u8) !usize {
inline fn get_qual(arg: [:0]const u8) !f16 {
const q = try std.fmt.parseFloat(f16, arg);
// quality factor for quant table generation, on a 0-1 scale
if (q < 0 or q > 1) {
return InitError.InvalidQuality;
}
@ -32,6 +34,7 @@ inline fn get_qual(arg: [:0]const u8) !f16 {
}
inline fn get_n_jobs(arg: [:0]const u8) !usize {
// for multithreaded dct computation per block
const n = try std.fmt.parseInt(usize, arg, 10);
if (n == 0) {
return InitError.InvalidNumJobs;
@ -40,6 +43,7 @@ inline fn get_n_jobs(arg: [:0]const u8) !usize {
}
fn get_opts() !util.Options {
// args are only needed during parsing, so a small fixed buffer suffices
var buff: [100]u8 = undefined;
var fba = std.heap.FixedBufferAllocator.init(&buff);
var alloc = fba.allocator();

View file

@ -45,11 +45,14 @@ const RLEWriter = struct {
}
}
// must write out the huffcode and extra bits if size >= 1
fn write_value(self: *Self, huffcode: HuffCode, unit: RLE_Unit) !void {
try self.bw.write_bits(huffcode.value, huffcode.n_bits);
// negative values must be written as val - 1, using the same number of bits as the original
const unit_val: u16 = if (unit.value >= 0) @bitCast(unit.value) else @bitCast(unit.value - 1);
const value_size = unit.symbol & 0x0f;
if (value_size != 0) {
// 0 values only need the huffcode, no extra bits
try self.bw.write_bits(unit_val & (try std.math.powi(u16, 2, value_size) - 1), @truncate(value_size));
}
}
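A worked example of the magnitude coding above: -3 has size 2 and is written as the low 2 bits of (-3 - 1) = -4, i.e. 0b00, while +3 writes 0b11; a decoder seeing a leading 0 bit knows the value is negative. A minimal check:

test "negative values encode as val - 1" {
    const v: i16 = -3;
    const coded: u16 = @bitCast(v - 1);
    try std.testing.expectEqual(@as(u16, 0b00), coded & 0b11);
}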
@ -91,10 +94,11 @@ const Scan = struct {
}
}
// operates on the full image, generating an RLE sequence and symbol frequencies,
// then generates the corresponding hufftables
fn do_rle_freq_pass(self: *Self, buff: *const util.Buffers, f: std.fs.File) !void {
const h = buff.Y_quant.len;
const w = buff.Y_quant[0].len;
for (0..4) |i| {
@memset(self.freqs[i], 0);
}
@ -118,7 +122,11 @@ const Scan = struct {
}
}
// dumps the scan in the order: Y macroblock (4 blocks), U block, V block
// position must be tracked per block since the first value must be interpreted as dc;
// a 0x00 dc symbol does not signal the next block, but a 0x00 ac symbol does
fn dump_scan(self: *Self, f: std.fs.File) !void {
// mostly hardcoded values
_ = try f.write(&[_]u8{ 0xff, 0xda, 0x00, 0x0c, 0x03, 0x01, 0x00, 0x02, 0x11, 0x03, 0x11, 0x00, 0x3f, 0x00 });
var bw = RLEWriter.init(f);
var idxs = [3]usize{ 0, 0, 0 };
@ -134,6 +142,8 @@ const Scan = struct {
}
};
// helper func needed for generating the huffman code
// gets the indices of the 2 smallest non-zero values in the slice.
inline fn get_idx_min2(freqs: []u32) ?struct { usize, usize } {
var min1: u32 = undefined;
var min1_idx: usize = undefined;
@ -172,6 +182,8 @@ inline fn get_idx_min2(freqs: []u32) ?struct { usize, usize } {
} else null;
}
// generates the code mapping from BITS and HUFFVAL
// see https://www.w3.org/Graphics/JPEG/itu-t81.pdf, Annex C
inline fn gen_codes(huff: *Huffman, huffman_meta: HuffmanMeta) !void {
var huffsize = [_]u5{0} ** 256;
var huffcode = [_]u16{0} ** 256;
@ -217,10 +229,12 @@ inline fn gen_codes(huff: *Huffman, huffman_meta: HuffmanMeta) !void {
}
}
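A toy worked example of the Annex C procedure above, assuming BITS counts of {0, 2, 3} (no codes of length 1, two of length 2, three of length 3):

// HUFFSIZE = 2, 2, 3, 3, 3
// HUFFCODE = 00, 01, 100, 101, 110
// codes of equal length are consecutive integers; moving to the next
// length left-shifts the counter, keeping the code set prefix-free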
// generates BITS and HUFFVAL
// see https://www.w3.org/Graphics/JPEG/itu-t81.pdf, Annex K
inline fn gen_huffman(freqs: []u32) HuffmanMeta {
freqs[freqs.len - 1] = 1;
var codesizes = [_]u8{0} ** 257;
var others = [_]u9{0x1ff} ** 257; // 0x1ff sentinel since -1 is not available and i8 wouldn't fit anyway
while (get_idx_min2(freqs)) |tmp| {
var v1 = tmp.@"0";
var v2 = tmp.@"1";
@ -276,6 +290,8 @@ inline fn gen_huffman(freqs: []u32) HuffmanMeta {
};
}
// returns the size field for a given value. tested switch and bitshift versions;
// this was somehow the fastest, slightly better than bit shifting (a bitshift variant is sketched below)
inline fn get_size(n: i16) u8 {
if (n == 0) {
return 0;
@ -306,6 +322,9 @@ inline fn get_size(n: i16) u8 {
}
}
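The bitshift variant the comment above mentions might look like the following; a sketch only, assuming the 0.11-era std.math.absCast, not the code path the encoder actually uses:

inline fn get_size_clz(n: i16) u8 {
    // size = bit width of |n|; @clz counts leading zeros of the u16 magnitude
    if (n == 0) return 0;
    return 16 - @as(u8, @clz(std.math.absCast(n)));
}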
// for each block, process the dc by differencing it against the previous dc, then perform RLE.
// dc symbols are just the size; ac symbols pack the run length in the top 4 bits and the size in the bottom 4.
// a 0x00 symbol is appended at the end of each block.
fn parse_block(block: *util.BlockQuantized, dc_diff: *i16, rle: *RLE_Seq, dc_freqs: []u32, ac_freqs: []u32) !void {
const diff = block[0] - dc_diff.*;
var symbol = get_size(diff);
@ -347,12 +366,14 @@ fn write_headers(f: std.fs.File, buff: *const util.Buffers) !void {
const w = buff.Y.len * 16;
const h = buff.Y[0].len * 16;
// write out the magic marker 0xff 0xd8 and both quant tables
var out_buff = [6]u8{ 0xff, 0xd8, 0xff, 0xdb, 0x00, 0x84 } ++ [1]u8{0x00} ++ [_]u8{0x00} ** 64 ++ [1]u8{0x01} ++ [_]u8{0x00} ** 64;
for (0..64) |i| {
out_buff[7 + i] = @intFromFloat(buff.Q_Lum[i]);
out_buff[7 + i + 65] = @intFromFloat(buff.Q_Chrom[i]);
}
_ = try f.write(&out_buff);
// write out the SOF block; everything except w and h is largely irrelevant
var sof_buff = [_]u8{
0xff, 0xc0, 0x00, 0x11, 0x08, @truncate(w >> 8), @truncate(w & 0x00ff), @truncate(h >> 8), @truncate(h & 0x00ff), 0x03,
0x01, 0x22, 0x00, 0x02, 0x11, 0x01, 0x03, 0x11, 0x01,
@ -361,6 +382,7 @@ fn write_headers(f: std.fs.File, buff: *const util.Buffers) !void {
}
fn dump_huffman(f: std.fs.File, huff_meta: HuffmanMeta, tree_i: usize) !void {
// write out the huffman table's BITS and HUFFVAL representation
const table_len = 3 + 16 + huff_meta.total_n;
var out_buff = [5]u8{ 0xff, 0xc4, @truncate(table_len >> 8), @truncate(table_len & 0x00ff), @truncate(((tree_i % 2) << 4) | (tree_i / 2)) };
_ = try f.write(&out_buff);
@ -376,11 +398,8 @@ pub fn generate_jpg(buff: util.Buffers, alloc: std.mem.Allocator) !void {
defer f.close();
try write_headers(f, &buff);
// requires 2 passes: one for RLE and huffcode generation,
// a second to actually write out the data
try scan_data.do_rle_freq_pass(&buff, f);
try scan_data.dump_scan(f);
}

View file

@ -6,10 +6,11 @@ const transform = @import("transform.zig");
const AtomicBool = std.atomic.Atomic(bool);
const AtomicU32 = std.atomic.Atomic(u32);
// atomic variables for cross-thread control
const Signals = struct {
quit: AtomicBool,
processed: AtomicU32,
eof_block: AtomicU32, // u32 rather than bool so it can be used with Futex wait
const Self = @This();
@ -22,6 +23,7 @@ const Signals = struct {
}
};
// wraps the job queue and the memory pool backing its nodes
const QueueWrap = struct {
queue: *util.JobQueue,
job_pool: util.JobPool,
@ -73,6 +75,7 @@ pub const ThreadManager = struct {
}
pub fn quit(self: *Self) void {
// signal a quit and wait for threads to exit
self.signals.quit.store(true, .Release);
self.unblock();
for (self.threads.items) |thread| {
@ -90,12 +93,15 @@ pub const ThreadManager = struct {
};
fn quantize_loop(queue: *util.JobQueue, signals: *Signals, Q_Lum: *util.QTable, Q_Chrom: *util.QTable) void {
// loop while jobs are available or quit has not been signalled
while (queue.HasJobs() or !signals.quit.load(std.builtin.AtomicOrder.Acquire)) : (std.time.sleep(1)) {
// if eof has been signalled, wait until it isn't
if (signals.eof_block.load(.Acquire) == 1) {
std.Thread.Futex.wait(&signals.eof_block, 1);
}
const job = queue.pop() orelse continue; // null if the job was stolen by another thread between the check and the pop
transform.quantize(job.source, job.target, if (job.is_lum) Q_Lum else Q_Chrom);
// increment processed var since block now processed
_ = @atomicRmw(u32, &signals.processed.value, .Add, 1, .SeqCst);
}
}
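The raw @atomicRmw above should be equivalent to going through the Atomic wrapper's own method; the same increment expressed via the std.atomic API:

_ = signals.processed.fetchAdd(1, .SeqCst);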

View file

@ -2,6 +2,7 @@ const std = @import("std");
const util = @import("util.zig");
// pre-computes coefficients at comptime
pub const dct_coeffs = gen_coeffs();
inline fn dct_cos(x: usize, f: usize) f16 {
@ -14,6 +15,7 @@ inline fn dct_coeff(u: usize, v: usize) f16 {
return 0.25 * (if (u == 0) 1.0 / @sqrt(2.0) else 1.0) * (if (v == 0) 1.0 / @sqrt(2.0) else 1.0);
}
// helper function to convert given u and v to the zigzag equivalents
inline fn zz_conv(u: usize, v: usize) struct { u: usize, v: usize } {
var band_i = u + v;
const band_max = @min(7, band_i);
@ -36,6 +38,10 @@ inline fn zz_band_len(band_i: usize) usize {
return if (band_i < 8) band_i + 1 else 15 - band_i;
}
// generates an [8][8][8 * 8] set of dct coeff values that
// can be directly multiplied and summed to get a dct value.
// values are stored in zig-zagged order, so there is no need to rearrange
// at runtime
fn gen_coeffs() [8][8]@Vector(64, f16) {
@setEvalBranchQuota(100000);
var ret: [8][8]@Vector(64, f16) = undefined;
@ -53,6 +59,13 @@ fn gen_coeffs() [8][8]@Vector(64, f16) {
return ret;
}
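For reference, each precomputed vector above packs the constants of the 2D Type-II DCT so that quantize can reduce each (u, v) term to a single dot product over the 64 samples:

S(u,v) = \frac{1}{4}\,\alpha(u)\,\alpha(v)\sum_{x=0}^{7}\sum_{y=0}^{7} s(x,y)\cos\frac{(2x+1)u\pi}{16}\cos\frac{(2y+1)v\pi}{16},
\qquad \alpha(k) = \begin{cases}1/\sqrt{2} & k = 0 \\ 1 & \text{otherwise}\end{cases}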
// performs the JPEG Type II DCT.
// SIMD Vector optimizations are applied if the target supports them:
// load the source block bytes as floats
// shift down by 128
// for each target coord in the dct, multiply the shifted values with the corresponding coeff vector, then add and store
// divide the dct by the quant table values
// store the result as i16 in the target
pub fn quantize(source: *util.Block, target: *util.BlockQuantized, qtable: *util.QTable) void {
var source_holder: @Vector(64, f16) = undefined;
var dct_holder: @Vector(64, f16) = undefined;

View file

@ -21,6 +21,10 @@ pub const Job = struct {
target: *BlockQuantized,
is_lum: bool,
};
// rip std.atomic.Queue
// simple Mutex wrapper around TailQueue,
// also handles node storage in a MemoryPool
pub const JobQueue = struct {
const List = std.TailQueue(Job);
@ -64,6 +68,8 @@ pub const JobQueue = struct {
};
pub const JobPool = std.heap.MemoryPool(JobQueue.List.Node);
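The JobQueue internals are elided from this hunk; a minimal sketch of the Mutex-wrapped TailQueue pop described above, with assumed field names (mutex, list, pool):

fn pop_sketch(self: *JobQueue) ?Job {
    self.mutex.lock();
    defer self.mutex.unlock();
    const node = self.list.popFirst() orelse return null;
    defer self.pool.destroy(node); // node memory returns to the pool
    return node.data;
}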
// main input buffers, plus the Q tables.
// Y is stored with an extra 2x2-block dimension to make looping significantly less complicated
pub const Buffers = struct {
arena: std.heap.ArenaAllocator,
@ -103,8 +109,10 @@ pub const Buffers = struct {
.U_quant = try alloc.alloc([]BlockQuantized, block_h),
.V_quant = try alloc.alloc([]BlockQuantized, block_h),
// h*w Y pixels, (h/2)*(w/2) U, V pixels, 64 pixels per block
.num_blocks = @truncate(w * h * 3 / 2 / 64),
// read full block row at a time
.input_buff = try alloc.alloc(u8, w * 8),
};
for (0..block_h) |i| {
@ -126,6 +134,10 @@ pub const Buffers = struct {
}
};
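A worked example of the num_blocks arithmetic above, for a hypothetical 1920x1088 input: Y contributes 1920*1088 pixels, U and V contribute 960*544 each, so 1920*1088*3/2 = 3133440 pixels total, or 48960 8x8 blocks:

test "num_blocks arithmetic" {
    const w: u32 = 1920;
    const h: u32 = 1088;
    try std.testing.expectEqual(@as(u32, 48960), w * h * 3 / 2 / 64);
}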
// simplistic qtable generation:
// assuming each higher frequency band matters less and less,
// simply factor that out by increasing the q-value per band.
// this also makes generation -> zig-zag simpler, since it can be done in one step
pub fn gen_qtable(q: f16, step_start_band: usize, step_stop_band: usize) @Vector(64, f16) {
var ret: @Vector(64, f16) = [_]f32{0.0} ** 64;
const band_range = step_stop_band - step_start_band;
@ -149,6 +161,8 @@ pub fn gen_qtable(q: f16, step_start_band: usize, step_stop_band: usize) @Vector
return ret;
}
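A hypothetical usage sketch; the real call site and band arguments are not part of this diff:

// illustrative values only: ramp the q-value across zig-zag bands 1 through 10
const q_lum: @Vector(64, f16) = gen_qtable(0.5, 1, 10);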
// bit writer for scan data; adds buffering with byte stuffing,
// since a BitWriter and a BufferedWriter can be combined but bytes cannot be stuffed
pub const BufferedBitWriter = struct {
byte_buff: u8,
bits_used: u4,
@ -168,6 +182,9 @@ pub const BufferedBitWriter = struct {
};
}
// check if the space available is >= the space needed:
// if yes, just shove in the bits,
// else shove in the bits that fit and recurse with the remainder
pub fn write_bits(self: *Self, val: u16, n_bits: u5) !void {
const curr_byte_space = 8 - self.bits_used;
if (n_bits <= curr_byte_space) {
@ -179,17 +196,25 @@ pub const BufferedBitWriter = struct {
}
}
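A worked example of the split above: with 3 bits already used there are 5 bits of space, so writing an 11-bit value first adds its top 5 bits, then recurses with the remaining 6:

// try bw.write_bits(0b10110_011101, 11);
// add_bits takes 0b10110 (fills the current byte), recursion writes 0b011101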
// since write_bits handles alignment and extra values, adding here can only
// ever result in a full byte or a partial one, never overflow into the next
inline fn add_bits(self: *Self, val: u16, n_bits: u4) !void {
self.byte_buff |= @truncate(val << (8 - self.bits_used - n_bits));
self.bits_used += n_bits;
if (self.bits_used == 8) {
// emit value
self.buffer[self.buffer_idx] = self.byte_buff;
if (self.byte_buff == 0xff) {
// for byte stuffing, simply skip the next idx to get 0xff 0x00
self.buffer_idx += 1;
}
self.buffer_idx += 1;
self.bits_used = 0;
self.byte_buff = 0x00;
// if the end of the buffer was reached, simply flush and wrap around;
// the extra stuffing increment must be preserved even if 0xff was
// the last value
if (self.buffer_idx >= self.buffer.len) {
try self.flush();
self.buffer_idx %= self.buffer.len;
@ -202,6 +227,8 @@ pub const BufferedBitWriter = struct {
@memset(&self.buffer, 0);
}
// special flush case when ending: requires a partial flush and padding the
// last byte with 1s
pub fn flush_end(self: *Self) !void {
if (self.bits_used != 0) {
const stuffing = try std.math.powi(u8, 2, 8 - self.bits_used) - 1;