comments

parent 14a95f0c18
commit 288a2bcc2d
6 changed files with 92 additions and 12 deletions

@@ -4,6 +4,8 @@ const util = @import("util.zig");
 const threads = @import("threads.zig");
 const output = @import("output.zig");

+// helper func, computes the appropriate coords for luminance
+// [macro_block][macro_block][intra_macro_block]
 inline fn lum_idxs(i: usize, j: usize) struct { usize, usize, usize } {
     return .{
         i / 2,
@@ -12,6 +14,9 @@ inline fn lum_idxs(i: usize, j: usize) struct { usize, usize, usize } {
     };
 }

+// either 2 separate funcs (see read_chrom) or multiple if statements, will be bad anyway
+// incrementally copies 8 byte sequences from the input buffer to the appropriate coords in the Y buffer
+// sends any completed blocks off to be processed for quantization as they fill
 fn read_lum(f: std.fs.File, source_buff: [][][4]util.Block, target_buff: [][][4]util.BlockQuantized, io_buff: []u8, queue: *util.JobQueue) !void {
     const block_h = source_buff.len;
     const block_w = source_buff[0].len;
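
The hunk only shows the first index of lum_idxs, so as a quick sanity check of the 2x2 macroblock layout the new comment describes, here is a standalone sketch; the j and intra expressions are assumptions, since the full body is cut off by the hunk:

    const std = @import("std");

    // hypothetical mirror of lum_idxs: 8x8-block coords -> (mb_row, mb_col, intra idx)
    inline fn lum_idxs_sketch(i: usize, j: usize) struct { usize, usize, usize } {
        return .{ i / 2, j / 2, (i % 2) * 2 + (j % 2) };
    }

    test "2x2 macroblock layout" {
        // block (3, 2) falls in macroblock (1, 1), bottom-left quadrant (intra 2)
        const idxs = lum_idxs_sketch(3, 2);
        try std.testing.expectEqual(@as(usize, 1), idxs.@"0");
        try std.testing.expectEqual(@as(usize, 1), idxs.@"1");
        try std.testing.expectEqual(@as(usize, 2), idxs.@"2");
    }
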
@@ -35,7 +40,9 @@ fn read_lum(f: std.fs.File, source_buff: [][][4]util.Block, target_buff: [][][4]
         }
     }
 }

+// either 2 separate funcs (see read_lum) or multiple if statements, will be bad anyway
+// incrementally copies 8 byte sequences from the input buffer to the appropriate coords in the U or V buffer
+// sends any completed blocks off to be processed for quantization as they fill
 fn read_chrom(f: std.fs.File, source_buff: [][]util.Block, target_buff: [][]util.BlockQuantized, io_buff: []u8, queue: *util.JobQueue) !void {
     const block_h = source_buff.len;
     const block_w = source_buff[0].len;
@@ -62,11 +69,15 @@ fn read_chrom(f: std.fs.File, source_buff: [][]util.Block, target_buff: [][]util
 pub fn main_loop(f: std.fs.File, buffs: util.Buffers, thread_mgr: *threads.ThreadManager, alloc: std.mem.Allocator) !void {
     defer thread_mgr.quit();
     while (true) {
+        // resets control atomic variables, resumes quantizers
         thread_mgr.unblock();
         try read_lum(f, buffs.Y, buffs.Y_quant, buffs.input_buff, thread_mgr.queue_wrp.queue);
+        // U and V are downsampled, half len buffers
         try read_chrom(f, buffs.U, buffs.U_quant, buffs.input_buff[0 .. buffs.input_buff.len / 2], thread_mgr.queue_wrp.queue);
         try read_chrom(f, buffs.V, buffs.V_quant, buffs.input_buff[0 .. buffs.input_buff.len / 2], thread_mgr.queue_wrp.queue);
+        // wait until all blocks have been processed
         while (thread_mgr.signals.processed.load(.Acquire) != buffs.num_blocks) : (std.time.sleep(1)) {}
+        // sets the eof signal atomic variable, prevents quantizers from checking jobs when none can generate
         thread_mgr.eof();
         try output.generate_jpg(buffs, alloc);
     }

@@ -17,6 +17,7 @@ inline fn next_arg(args: *std.process.ArgIterator) ![:0]const u8 {

 inline fn get_dim(arg: [:0]const u8) !usize {
     const d = try std.fmt.parseInt(usize, arg, 10);
+    // forces image to be multiples of 16x16 mcu size
     if (d % 16 != 0 or d == 0) {
         return InitError.InvalidDimension;
     }
@@ -25,6 +26,7 @@ inline fn get_dim(arg: [:0]const u8) !usize {

 inline fn get_qual(arg: [:0]const u8) !f16 {
     const q = try std.fmt.parseFloat(f16, arg);
+    // quality generation for quant tables, 0-1 scale
     if (q < 0 or q > 1) {
         return InitError.InvalidQuality;
     }
@@ -32,6 +34,7 @@ inline fn get_qual(arg: [:0]const u8) !f16 {
 }

 inline fn get_n_jobs(arg: [:0]const u8) !usize {
+    // for multithreaded dct computation per block
     const n = try std.fmt.parseInt(usize, arg, 10);
     if (n == 0) {
         return InitError.InvalidNumJobs;
@@ -40,6 +43,7 @@ inline fn get_n_jobs(arg: [:0]const u8) !usize {
 }

 fn get_opts() !util.Options {
+    // args can be ignored after, just use a mini buffer instead
     var buff: [100]u8 = undefined;
     var fba = std.heap.FixedBufferAllocator.init(&buff);
     var alloc = fba.allocator();

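
The three validators above share the same parse-then-range-check shape; a minimal runnable sketch of the dimension rule, with InitError as a stand-in for the file's actual error set:

    const std = @import("std");

    const InitError = error{InvalidDimension};

    fn get_dim_sketch(arg: []const u8) !usize {
        const d = try std.fmt.parseInt(usize, arg, 10);
        // a JPEG MCU here is 16x16 pixels, so both dims must divide evenly
        if (d % 16 != 0 or d == 0) return InitError.InvalidDimension;
        return d;
    }

    test "dimension must be a nonzero multiple of 16" {
        try std.testing.expectEqual(@as(usize, 1920), try get_dim_sketch("1920"));
        try std.testing.expectError(InitError.InvalidDimension, get_dim_sketch("100"));
        try std.testing.expectError(InitError.InvalidDimension, get_dim_sketch("0"));
    }
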
@@ -45,11 +45,14 @@ const RLEWriter = struct {
         }
     }

+    // must write out the huffcode and extra bits if size >= 1
    fn write_value(self: *Self, huffcode: HuffCode, unit: RLE_Unit) !void {
        try self.bw.write_bits(huffcode.value, huffcode.n_bits);
+        // negative values must be written as val - 1 with the same number of bits as orig
        const unit_val: u16 = if (unit.value >= 0) @bitCast(unit.value) else @bitCast(unit.value - 1);
        const value_size = unit.symbol & 0x0f;
        if (value_size != 0) {
+            // 0 values only need the huffcode, no extra bits
            try self.bw.write_bits(unit_val & (try std.math.powi(u16, 2, value_size) - 1), @truncate(value_size));
        }
    }
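
The `val - 1` trick matches the spec's (Annex F) coding of negative coefficients: the low `size` bits of value-1 in two's complement equal the one's-complement pattern JPEG expects. A self-contained check of the masking, with hypothetical names:

    const std = @import("std");

    // hypothetical mirror of the masking in write_value
    fn low_bits_sketch(value: i16, size: u4) u16 {
        const v: u16 = if (value >= 0) @bitCast(value) else @bitCast(value - 1);
        return v & ((@as(u16, 1) << size) - 1);
    }

    test "JPEG extra bits for +/-3 at size 2" {
        try std.testing.expectEqual(@as(u16, 0b11), low_bits_sketch(3, 2));
        try std.testing.expectEqual(@as(u16, 0b00), low_bits_sketch(-3, 2));
    }
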
@@ -91,10 +94,11 @@ const Scan = struct {
         }
     }

+    // operates on full image, generating an RLE sequence and frequency values,
+    // then gens corresponding hufftable
     fn do_rle_freq_pass(self: *Self, buff: *const util.Buffers, f: std.fs.File) !void {
         const h = buff.Y_quant.len;
         const w = buff.Y_quant[0].len;

         for (0..4) |i| {
             @memset(self.freqs[i], 0);
         }
@@ -118,7 +122,11 @@ const Scan = struct {
         }
     }

+    // dumps scan, order of Y macroblock (blockx4), U block, V block,
+    // need per block since first must be interpreted as dc
+    // 0x00 dc does not signal next block 0x00 ac does
     fn dump_scan(self: *Self, f: std.fs.File) !void {
+        // mostly hardcoded values
         _ = try f.write(&[_]u8{ 0xff, 0xda, 0x00, 0x0c, 0x03, 0x01, 0x00, 0x02, 0x11, 0x03, 0x11, 0x00, 0x3f, 0x00 });
         var bw = RLEWriter.init(f);
         var idxs = [3]usize{ 0, 0, 0 };
@@ -134,6 +142,8 @@ const Scan = struct {
     }
 };

+// helper func needed for generating huffman code
+// gets idxs of least 2 non-zero values in slice.
 inline fn get_idx_min2(freqs: []u32) ?struct { usize, usize } {
     var min1: u32 = undefined;
     var min1_idx: usize = undefined;
@@ -172,6 +182,8 @@ inline fn get_idx_min2(freqs: []u32) ?struct { usize, usize } {
     } else null;
 }

+// generates code mapping from BITS and HUFFVAL
+// see https://www.w3.org/Graphics/JPEG/itu-t81.pdf annex c
 inline fn gen_codes(huff: *Huffman, huffman_meta: HuffmanMeta) !void {
     var huffsize = [_]u5{0} ** 256;
     var huffcode = [_]u16{0} ** 256;
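
Annex C boils down to: assign codes of each bit-length consecutively, then left-shift the counter when moving to the next length, which keeps the codes prefix-free. A tiny check with a made-up BITS array, not the file's actual tables:

    const std = @import("std");

    test "canonical codes from BITS (Annex C)" {
        // hypothetical BITS: zero 1-bit codes, two 2-bit codes, one 3-bit code
        const bits = [_]u8{ 0, 2, 1 };
        var huffcode = [_]u16{0} ** 3;
        var code: u16 = 0;
        var k: usize = 0;
        for (bits) |n| {
            for (0..n) |_| {
                huffcode[k] = code;
                code += 1;
                k += 1;
            }
            // moving to the next code length doubles the counter's weight
            code <<= 1;
        }
        try std.testing.expectEqualSlices(u16, &[_]u16{ 0b00, 0b01, 0b100 }, &huffcode);
    }
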
@@ -217,10 +229,12 @@ inline fn gen_codes(huff: *Huffman, huffman_meta: HuffmanMeta) !void {
     }
 }

+// generate BITS and HUFFVAL
+// see https://www.w3.org/Graphics/JPEG/itu-t81.pdf annex K
 inline fn gen_huffman(freqs: []u32) HuffmanMeta {
     freqs[freqs.len - 1] = 1;
     var codesizes = [_]u8{0} ** 257;
-    var others = [_]u9{0x1ff} ** 257;
+    var others = [_]u9{0x1ff} ** 257; // 0x1ff since -1 not available and i8 wouldn't fit anyway
     while (get_idx_min2(freqs)) |tmp| {
         var v1 = tmp.@"0";
         var v2 = tmp.@"1";
@@ -276,6 +290,8 @@ inline fn gen_huffman(freqs: []u32) HuffmanMeta {
     };
 }

+// returns size field for given value. tested switch and bitshifts, this was
+// somehow the fastest, slightly better than bit shifting
 inline fn get_size(n: i16) u8 {
     if (n == 0) {
         return 0;
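
Whatever the lookup strategy, the contract is just the bit-length of |n|: size 0 holds only 0, and size s covers magnitudes 2^(s-1) through 2^s - 1. A reference sketch using a plain shift loop, not the file's switch:

    const std = @import("std");

    fn get_size_sketch(n: i16) u8 {
        // widen before negating so -32768 can't overflow
        const mag: u32 = if (n < 0) @intCast(-@as(i32, n)) else @intCast(n);
        var size: u8 = 0;
        var m = mag;
        while (m != 0) : (m >>= 1) size += 1;
        return size;
    }

    test "JPEG size categories" {
        try std.testing.expectEqual(@as(u8, 0), get_size_sketch(0));
        try std.testing.expectEqual(@as(u8, 1), get_size_sketch(-1));
        try std.testing.expectEqual(@as(u8, 2), get_size_sketch(3));
        try std.testing.expectEqual(@as(u8, 11), get_size_sketch(-1024));
    }
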
@@ -306,6 +322,9 @@ inline fn get_size(n: i16) u8 {
     }
 }

+// for each block, process the dc by differencing it and perform RLE.
+// dc symbols are just the size, ac symbols are the rle in top 4 bits, size in next
+// append 0x00 at end of each block.
 fn parse_block(block: *util.BlockQuantized, dc_diff: *i16, rle: *RLE_Seq, dc_freqs: []u32, ac_freqs: []u32) !void {
     const diff = block[0] - dc_diff.*;
     var symbol = get_size(diff);
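
So an AC coefficient of -3 preceded by a run of five zeros becomes symbol 0x52 (run in the high nibble, size in the low) followed by two extra bits; a sketch of the packing, with a made-up helper name:

    const std = @import("std");

    // hypothetical packing helper mirroring parse_block's symbol layout
    fn ac_symbol_sketch(run: u8, size: u8) u8 {
        return (run << 4) | (size & 0x0f);
    }

    test "run/size nibble packing" {
        try std.testing.expectEqual(@as(u8, 0x52), ac_symbol_sketch(5, 2));
        try std.testing.expectEqual(@as(u8, 0x00), ac_symbol_sketch(0, 0)); // end-of-block
    }
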
@@ -347,12 +366,14 @@ fn write_headers(f: std.fs.File, buff: *const util.Buffers) !void {
     const w = buff.Y.len * 16;
     const h = buff.Y[0].len * 16;

+    // write out magic marker 0xff 0xd8 and both quant tables,
     var out_buff = [6]u8{ 0xff, 0xd8, 0xff, 0xdb, 0x00, 0x84 } ++ [1]u8{0x00} ++ [_]u8{0x00} ** 64 ++ [1]u8{0x01} ++ [_]u8{0x00} ** 64;
     for (0..64) |i| {
         out_buff[7 + i] = @intFromFloat(buff.Q_Lum[i]);
         out_buff[7 + i + 65] = @intFromFloat(buff.Q_Chrom[i]);
     }
     _ = try f.write(&out_buff);
+    // write out SOF block, everything except w and h are largely irrelevant
     var sof_buff = [_]u8{
         0xff, 0xc0, 0x00, 0x11, 0x08, @truncate(w >> 8), @truncate(w & 0x00ff), @truncate(h >> 8), @truncate(h & 0x00ff), 0x03,
         0x01, 0x22, 0x00, 0x02, 0x11, 0x01, 0x03, 0x11, 0x01,
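
The hardcoded 0x00 0x84 length checks out: 2 bytes for the length field itself plus two 65-byte table payloads (one precision/id byte plus 64 entries each):

    const std = @import("std");

    test "DQT segment length is 0x0084" {
        const dqt_len: u16 = 2 + 2 * (1 + 64);
        try std.testing.expectEqual(@as(u16, 0x0084), dqt_len);
    }
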
@@ -361,6 +382,7 @@ fn write_headers(f: std.fs.File, buff: *const util.Buffers) !void {
 }

 fn dump_huffman(f: std.fs.File, huff_meta: HuffmanMeta, tree_i: usize) !void {
+    // write out huffman table bits and huffval representation
     const table_len = 3 + 16 + huff_meta.total_n;
     var out_buff = [5]u8{ 0xff, 0xc4, @truncate(table_len >> 8), @truncate(table_len & 0x00ff), @truncate(((tree_i % 2) << 4) | (tree_i / 2)) };
     _ = try f.write(&out_buff);
@@ -376,11 +398,8 @@ pub fn generate_jpg(buff: util.Buffers, alloc: std.mem.Allocator) !void {
     defer f.close();
     try write_headers(f, &buff);

+    // requires 2 passes, one for RLE and huffcode generation
+    // second to actually write out data
     try scan_data.do_rle_freq_pass(&buff, f);
     try scan_data.dump_scan(f);
-
-    // rle, huffman pass
-    // file headers
-    // quant + huffman write
-    // write scan
 }

@@ -6,10 +6,11 @@ const transform = @import("transform.zig");
 const AtomicBool = std.atomic.Atomic(bool);
 const AtomicU32 = std.atomic.Atomic(u32);

+// atomic variables for cross-thread control
 const Signals = struct {
     quit: AtomicBool,
     processed: AtomicU32,
-    eof_block: AtomicU32,
+    eof_block: AtomicU32, // not bool to use with Futex wait

     const Self = @This();

@@ -22,6 +23,7 @@ const Signals = struct {
     }
 };

+// stores the jobqueue
 const QueueWrap = struct {
     queue: *util.JobQueue,
     job_pool: util.JobPool,
@@ -73,6 +75,7 @@ pub const ThreadManager = struct {
     }

     pub fn quit(self: *Self) void {
+        // signal a quit and wait for threads to exit
         self.signals.quit.store(true, .Release);
         self.unblock();
         for (self.threads.items) |thread| {
@@ -90,12 +93,15 @@ pub const ThreadManager = struct {
 };

 fn quantize_loop(queue: *util.JobQueue, signals: *Signals, Q_Lum: *util.QTable, Q_Chrom: *util.QTable) void {
+    // loop while check jobs avail or quit is not signalled
     while (queue.HasJobs() or !signals.quit.load(std.builtin.AtomicOrder.Acquire)) : (std.time.sleep(1)) {
-        const job = queue.pop() orelse continue;
-        transform.quantize(job.source, job.target, if (job.is_lum) Q_Lum else Q_Chrom);
-        _ = @atomicRmw(u32, &signals.processed.value, .Add, 1, .SeqCst);
+        // if eof has been signalled, wait until it isn't
         if (signals.eof_block.load(.Acquire) == 1) {
             std.Thread.Futex.wait(&signals.eof_block, 1);
         }
+        const job = queue.pop() orelse continue; // if check was stolen by other thread
+        transform.quantize(job.source, job.target, if (job.is_lum) Q_Lum else Q_Chrom);
+        // increment processed var since block now processed
+        _ = @atomicRmw(u32, &signals.processed.value, .Add, 1, .SeqCst);
     }
 }
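
eof_block is an Atomic(u32) rather than a bool precisely so Futex.wait can park on it. The wake side isn't in this diff; under the assumption that unblock stores 0 and wakes all waiters, it would look roughly like this (Zig 0.11-era std):

    const std = @import("std");
    const AtomicU32 = std.atomic.Atomic(u32);

    // hypothetical wake side pairing with the Futex.wait in quantize_loop
    fn unblock_sketch(eof_block: *AtomicU32) void {
        eof_block.store(0, .Release);
        // wake every quantizer parked on this address
        std.Thread.Futex.wake(eof_block, std.math.maxInt(u32));
    }
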
@@ -2,6 +2,7 @@ const std = @import("std");

 const util = @import("util.zig");

+// pre-computes coefficients at comptime
 pub const dct_coeffs = gen_coeffs();

 inline fn dct_cos(x: usize, f: usize) f16 {
@@ -14,6 +15,7 @@ inline fn dct_coeff(u: usize, v: usize) f16 {
     return 0.25 * (if (u == 0) 1.0 / @sqrt(2.0) else 1.0) * (if (v == 0) 1.0 / @sqrt(2.0) else 1.0);
 }

+// helper function to convert given u and v to the zigzag equivalents
 inline fn zz_conv(u: usize, v: usize) struct { u: usize, v: usize } {
     var band_i = u + v;
     const band_max = @min(7, band_i);
@@ -36,6 +38,10 @@ inline fn zz_band_len(band_i: usize) usize {
     return if (band_i < 8) band_i + 1 else 15 - band_i;
 }

+// generates an [8][8][8 * 8] set of values of dct coeffs
+// can be directly multiplied and summed to get dct value.
+// values are stored in zig-zagged order, so no need to rearrange
+// at runtime
 fn gen_coeffs() [8][8]@Vector(64, f16) {
     @setEvalBranchQuota(100000);
     var ret: [8][8]@Vector(64, f16) = undefined;
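
A quick consistency check on the band arithmetic in zz_band_len: the 15 anti-diagonals of an 8x8 block have lengths 1..8 and back down to 1, covering all 64 coefficients:

    const std = @import("std");

    test "zig-zag bands cover all 64 coefficients" {
        var total: usize = 0;
        for (0..15) |band_i| {
            total += if (band_i < 8) band_i + 1 else 15 - band_i;
        }
        try std.testing.expectEqual(@as(usize, 64), total);
    }
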
@@ -53,6 +59,13 @@ fn gen_coeffs() [8][8]@Vector(64, f16) {
     return ret;
 }

+// performs JPEG Type II DCT.
+// SIMD Vector optimizations applied if target supported
+// load source block bytes as floats
+// shift down 128
+// for each target coord in dct, mult shifted with corresponding coeff vector, add and store
+// divide dct by quant table values
+// store divved as i16 in target
 pub fn quantize(source: *util.Block, target: *util.BlockQuantized, qtable: *util.QTable) void {
     var source_holder: @Vector(64, f16) = undefined;
     var dct_holder: @Vector(64, f16) = undefined;
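
With the coefficients pre-zig-zagged at comptime, each of the 64 output coefficients reduces to one vector multiply and a horizontal sum; the inner step is essentially the following (a sketch, assuming the shifted source block is already an f16 vector):

    // hypothetical inner step of quantize: one zig-zagged DCT coefficient
    inline fn dct_at(shifted: @Vector(64, f16), coeff: @Vector(64, f16)) f16 {
        return @reduce(.Add, shifted * coeff);
    }
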
src/util.zig: 27 changes

@@ -21,6 +21,10 @@ pub const Job = struct {
     target: *BlockQuantized,
     is_lum: bool,
 };

+// rip std.atomic.Queue
+// simple Mutex wrapper around tailqueue,
+// also handles node storage in a MemoryPool
 pub const JobQueue = struct {
     const List = std.TailQueue(Job);

@@ -64,6 +68,8 @@ pub const JobQueue = struct {
 };
 pub const JobPool = std.heap.MemoryPool(JobQueue.List.Node);

+// main input buffers, + Qtables.
+// Y is stored with extra 2x2 block to make looping significantly less complicated
 pub const Buffers = struct {
     arena: std.heap.ArenaAllocator,

|
@ -103,8 +109,10 @@ pub const Buffers = struct {
|
|||
.U_quant = try alloc.alloc([]BlockQuantized, block_h),
|
||||
.V_quant = try alloc.alloc([]BlockQuantized, block_h),
|
||||
|
||||
// h*w Y pixels, (h/2)*(w/2) U, V pixels, 64 pixels per block
|
||||
.num_blocks = @truncate(w * h * 3 / 2 / 64),
|
||||
|
||||
// read full block row at a time
|
||||
.input_buff = try alloc.alloc(u8, w * 8),
|
||||
};
|
||||
for (0..block_h) |i| {
|
||||
|
@@ -126,6 +134,10 @@ pub const Buffers = struct {
     }
 };

+// simplistic qtable generation
+// assuming each high frequency band gets less and less important,
+// simply factor it all out by increasing q-value by band.
+// makes generation -> zig-zag simpler since can just do it in one step
 pub fn gen_qtable(q: f16, step_start_band: usize, step_stop_band: usize) @Vector(64, f16) {
     var ret: @Vector(64, f16) = [_]f32{0.0} ** 64;
     const band_range = step_stop_band - step_start_band;
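
Since values are generated band by band, the table comes out already in zig-zag order. A simplified standalone sketch of that shape; the real step math is not fully visible in this hunk, so the linear ramp and parameters here are assumptions:

    // hypothetical band-stepped table: quant value grows linearly between two bands
    fn gen_qtable_sketch(base: f32, step: f32, start: usize, stop: usize) [64]f32 {
        var ret = [_]f32{0.0} ** 64;
        var k: usize = 0;
        for (0..15) |band_i| {
            // anti-diagonal band lengths: 1..8..1
            const len: usize = if (band_i < 8) band_i + 1 else 15 - band_i;
            const clamped = @min(@max(band_i, start), stop);
            const q = base + step * @as(f32, @floatFromInt(clamped - start));
            for (0..len) |_| {
                ret[k] = q;
                k += 1;
            }
        }
        return ret;
    }
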
@@ -149,6 +161,8 @@ pub fn gen_qtable(q: f16, step_start_band: usize, step_stop_band: usize) @Vector
     return ret;
 }

+// bit writer for scan data, adds buffering with byte stuffing
+// since BitWriter and BufferedWriter can be combined but bytes cannot be stuffed
 pub const BufferedBitWriter = struct {
     byte_buff: u8,
     bits_used: u4,
@@ -168,6 +182,9 @@ pub const BufferedBitWriter = struct {
         };
     }

+    // check if space avail is >= space needed
+    // if yes, just shove in bits,
+    // else shove in bits available, recurse with remaining
     pub fn write_bits(self: *Self, val: u16, n_bits: u5) !void {
         const curr_byte_space = 8 - self.bits_used;
         if (n_bits <= curr_byte_space) {
@@ -179,17 +196,25 @@ pub const BufferedBitWriter = struct {
         }
     }

+    // since write_bits handles alignment and extra values, adding here can only
+    // ever result in either a full byte or partial, no overflow into the next
     inline fn add_bits(self: *Self, val: u16, n_bits: u4) !void {
         self.byte_buff |= @truncate(val << (8 - self.bits_used - n_bits));
         self.bits_used += n_bits;
         if (self.bits_used == 8) {
+            // emit value
             self.buffer[self.buffer_idx] = self.byte_buff;
             if (self.byte_buff == 0xff) {
+                // if byte stuff, simply skip the next idx to get 0xff 0x00
                 self.buffer_idx += 1;
             }
             self.buffer_idx += 1;
             self.bits_used = 0;
             self.byte_buff = 0x00;

+            // if end of buffer was reached simply flush and wrap around
+            // the extra stuff increment should be preserved even if 0xff was
+            // the last value
             if (self.buffer_idx >= self.buffer.len) {
                 try self.flush();
                 self.buffer_idx %= self.buffer.len;
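
Byte stuffing is the whole reason std's BitWriter can't be reused here: any 0xff byte that lands in the entropy-coded data must be followed by 0x00 so a decoder won't read it as a marker. The invariant in isolation (a stand-alone helper, not the struct's actual API):

    const std = @import("std");

    // hypothetical stand-alone stuffer: appends one finished byte, stuffing 0x00 after 0xff
    fn emit_byte_sketch(out: *std.ArrayList(u8), byte: u8) !void {
        try out.append(byte);
        if (byte == 0xff) try out.append(0x00);
    }

    test "0xff is always followed by a stuffed 0x00" {
        var out = std.ArrayList(u8).init(std.testing.allocator);
        defer out.deinit();
        try emit_byte_sketch(&out, 0xff);
        try emit_byte_sketch(&out, 0xd5);
        try std.testing.expectEqualSlices(u8, &[_]u8{ 0xff, 0x00, 0xd5 }, out.items);
    }
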
@@ -202,6 +227,8 @@ pub const BufferedBitWriter = struct {
         @memset(&self.buffer, 0);
     }

+    // special flush case when ending, requires partial flush and byte stuffing the
+    // last byte with 1s
    pub fn flush_end(self: *Self) !void {
        if (self.bits_used != 0) {
            const stuffing = try std.math.powi(u8, 2, 8 - self.bits_used) - 1;
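
The powi expression is just a mask of 1s over the unused low bits; with 3 bits used, the final byte gets padded with 0b0001_1111:

    const std = @import("std");

    test "final partial byte is padded with 1s" {
        const bits_used: u8 = 3;
        const stuffing = try std.math.powi(u8, 2, 8 - bits_used) - 1;
        try std.testing.expectEqual(@as(u8, 0b0001_1111), stuffing);
    }
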