Compare commits


No commits in common. "c91a2a4a0b08a87a3205961b268dd1f69f96448b" and "22a63d68bf794dfa133f3e8058069e316307cd8a" have entirely different histories.

7 changed files with 45 additions and 184 deletions

README.md (deleted)

@@ -1,14 +0,0 @@
-# img-stream-enc
-Continuous JPEG encoder for piping raw YUV420P images into. Works well enough at quality 0.5; for some reason anything higher tends to corrupt the output file.
-Zig std lib only. Uses SIMD vector optimization plus pre-computed quantization coefficients. The bottleneck is RLE, Huffman coding, and output.
-**Not intended for actual use.** Again, higher quality tends to corrupt files. Corruption is progressive: pixels start getting wonky well before decoders complain, though the image is still usable.
-`zig build` to get a bin. Usage is `img-stream-enc <width> <height> <quality> <n_jobs>`
-* `width`: pixel width of the input image (must be a multiple of 16)
-* `height`: pixel height of the input (a multiple of 16)
-* `quality`: quality factor, a float between 0 and 1
-* `n_jobs`: number of threads to spawn for quantization
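For context, a typical invocation pipes raw frames in from a decoder. As a hypothetical example (the input file name and the 1280x720 size are assumptions, not from the README; any multiple-of-16 dimensions work): `ffmpeg -i input.mp4 -f rawvideo -pix_fmt yuv420p - | ./img-stream-enc 1280 720 0.5 4`.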

input.zig

@@ -4,8 +4,6 @@ const util = @import("util.zig");
 const threads = @import("threads.zig");
 const output = @import("output.zig");
-// helper func, computes the appropriate coords for luminance
-// [macro_block][macro_block][intra_macro_block]
 inline fn lum_idxs(i: usize, j: usize) struct { usize, usize, usize } {
     return .{
         i / 2,
@@ -14,15 +12,12 @@ inline fn lum_idxs(i: usize, j: usize) struct { usize, usize, usize } {
     };
 }
-// either 2 separate funcs (see read_chrom) or multiple if statements, will be bad anyway
-// incrementally copies 8 byte sequences from the input buffer to the appropriate coords in the Y buffer
-// sends any completed blocks off to be processed for quantization as they fill
 fn read_lum(f: std.fs.File, source_buff: [][][4]util.Block, target_buff: [][][4]util.BlockQuantized, io_buff: []u8, queue: *util.JobQueue) !void {
     const block_h = source_buff.len;
     const block_w = source_buff[0].len;
     for (0..block_h * 2) |i| {
-        if (io_buff.len != try f.readAll(io_buff)) return util.Errors.EOFError;
+        _ = try f.read(io_buff);
         var io_idx: usize = 0;
         for (0..8) |I| {
             for (0..block_w * 2) |j| {
@@ -40,15 +35,13 @@ fn read_lum(f: std.fs.File, source_buff: [][][4]util.Block, target_buff: [][][4]
             }
         }
     }
 }
-// either 2 separate funcs (see read_lum) or multiple if statements, will be bad anyway
-// incrementally copies 8 byte sequences from the input buffer to the appropriate coords in the U or V buffer
-// sends any completed blocks off to be processed for quantization as they fill
 fn read_chrom(f: std.fs.File, source_buff: [][]util.Block, target_buff: [][]util.BlockQuantized, io_buff: []u8, queue: *util.JobQueue) !void {
     const block_h = source_buff.len;
     const block_w = source_buff[0].len;
     for (0..block_h) |i| {
-        if (io_buff.len != try f.readAll(io_buff)) return util.Errors.EOFError;
+        _ = try f.read(io_buff);
         var io_idx: usize = 0;
         for (0..8) |I| {
             for (0..block_w) |j| {
@@ -68,17 +61,11 @@ fn read_chrom(f: std.fs.File, source_buff: [][]util.Block, target_buff: [][]util
 pub fn main_loop(f: std.fs.File, buffs: util.Buffers, thread_mgr: *threads.ThreadManager, alloc: std.mem.Allocator) !void {
     defer thread_mgr.quit();
-    while (true) {
-        // resets control atomic variables, resumes quantizers
     thread_mgr.unblock();
     try read_lum(f, buffs.Y, buffs.Y_quant, buffs.input_buff, thread_mgr.queue_wrp.queue);
-        // U and V are downsampled, half len buffers
     try read_chrom(f, buffs.U, buffs.U_quant, buffs.input_buff[0 .. buffs.input_buff.len / 2], thread_mgr.queue_wrp.queue);
     try read_chrom(f, buffs.V, buffs.V_quant, buffs.input_buff[0 .. buffs.input_buff.len / 2], thread_mgr.queue_wrp.queue);
-        // wait until all blocks have been processed
     while (thread_mgr.signals.processed.load(.Acquire) != buffs.num_blocks) : (std.time.sleep(1)) {}
-        // sets the eof signal atomic variable, prevents quantizers from checking jobs when none can generate
     thread_mgr.eof();
     try output.generate_jpg(buffs, alloc);
-    }
 }

main.zig

@@ -17,7 +17,6 @@ inline fn next_arg(args: *std.process.ArgIterator) ![:0]const u8 {
 inline fn get_dim(arg: [:0]const u8) !usize {
     const d = try std.fmt.parseInt(usize, arg, 10);
-    // forces image to be multiples of 16x16 mcu size
     if (d % 16 != 0 or d == 0) {
         return InitError.InvalidDimension;
     }
@@ -26,7 +25,6 @@ inline fn get_dim(arg: [:0]const u8) !usize {
 inline fn get_qual(arg: [:0]const u8) !f16 {
     const q = try std.fmt.parseFloat(f16, arg);
-    // quality generation for quant tables, 0-1 scale
     if (q < 0 or q > 1) {
         return InitError.InvalidQuality;
     }
@@ -34,7 +32,6 @@ inline fn get_qual(arg: [:0]const u8) !f16 {
 }
 inline fn get_n_jobs(arg: [:0]const u8) !usize {
-    // for multithreaded dct computation per block
     const n = try std.fmt.parseInt(usize, arg, 10);
     if (n == 0) {
         return InitError.InvalidNumJobs;
@@ -43,7 +40,6 @@ inline fn get_n_jobs(arg: [:0]const u8) !usize {
 }
 fn get_opts() !util.Options {
-    // args can be ignored after, just use a mini buffer instead
     var buff: [100]u8 = undefined;
     var fba = std.heap.FixedBufferAllocator.init(&buff);
     var alloc = fba.allocator();
@@ -69,12 +65,5 @@ pub fn main() !void {
     defer thread_manager.deinit();
     var f = std.io.getStdIn();
-    input.main_loop(f, buffs, &thread_manager, std.heap.page_allocator) catch |err| {
-        switch (err) {
-            util.Errors.EOFError => {
-                return;
-            },
-            else => return err,
-        }
-    };
+    try input.main_loop(f, buffs, &thread_manager, std.heap.page_allocator);
 }

output.zig

@@ -45,20 +45,17 @@ const RLEWriter = struct {
         }
     }
-    // must write out the huffcode and extra bits if size >= 1
     fn write_value(self: *Self, huffcode: HuffCode, unit: RLE_Unit) !void {
         try self.bw.write_bits(huffcode.value, huffcode.n_bits);
-        // negative values must be written as val - 1 with the same number of bits as orig
         const unit_val: u16 = if (unit.value >= 0) @bitCast(unit.value) else @bitCast(unit.value - 1);
-        const value_size = unit.symbol & 0x0f;
-        if (value_size != 0) {
-            // 0 values only need the huffcode, no extra bits
-            try self.bw.write_bits(unit_val & (try std.math.powi(u16, 2, value_size) - 1), @truncate(value_size));
-        }
+        try self.bw.write_bits(unit_val, @truncate(unit.symbol & 0x0f));
     }
     inline fn flush(self: *Self) !void {
-        try self.bw.flush_end();
+        if (self.bw.bits_used != 0) {
+            const bits_left = 8 - self.bw.bits_used;
+            try self.bw.write_bits(0xf, bits_left);
+        }
     }
 };
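The deleted write_value comment is terse, so a worked example may help: each coefficient is written as a Huffman-coded size category followed by that many raw bits, and a negative value is encoded as `value - 1` truncated to the same bit count. A self-contained sketch of the convention (hypothetical helper, not code from the repo):

```zig
const std = @import("std");

// Size category and extra bits for a JPEG coefficient. Positive values are
// written as-is in `size` bits; negative values as (value - 1) kept to the
// same bit count, e.g. 3 -> size 2, bits 0b11, while -3 -> size 2, bits 0b00.
fn magnitude_code(value: i16) struct { size: u4, bits: u16 } {
    var m: i16 = if (value < 0) -value else value; // fine for JPEG's coefficient range
    var size: u4 = 0;
    while (m != 0) : (m >>= 1) size += 1;
    if (size == 0) return .{ .size = 0, .bits = 0 };
    const raw: u16 = if (value >= 0) @bitCast(value) else @bitCast(value - 1);
    const mask: u16 = (@as(u16, 1) << size) - 1;
    return .{ .size = size, .bits = raw & mask };
}

test "negative coefficients use the ones-complement style encoding" {
    try std.testing.expectEqual(@as(u4, 2), magnitude_code(-3).size);
    try std.testing.expectEqual(@as(u16, 0b00), magnitude_code(-3).bits);
    try std.testing.expectEqual(@as(u16, 0b11), magnitude_code(3).bits);
}
```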
@@ -94,11 +91,10 @@ const Scan = struct {
         }
     }
-    // operates on full image, generating an RLE sequence and frequency values,
-    // then gens corresponding hufftable
    fn do_rle_freq_pass(self: *Self, buff: *const util.Buffers, f: std.fs.File) !void {
        const h = buff.Y_quant.len;
        const w = buff.Y_quant[0].len;
        for (0..4) |i| {
            @memset(self.freqs[i], 0);
        }
@@ -122,11 +118,7 @@
         }
     }
-    // dumps scan, order of Y macroblock (blockx4), U block, V block,
-    // need per block since first must be interpreted as dc
-    // 0x00 dc does not signal next block 0x00 ac does
     fn dump_scan(self: *Self, f: std.fs.File) !void {
-        // mostly hardcoded values
         _ = try f.write(&[_]u8{ 0xff, 0xda, 0x00, 0x0c, 0x03, 0x01, 0x00, 0x02, 0x11, 0x03, 0x11, 0x00, 0x3f, 0x00 });
         var bw = RLEWriter.init(f);
         var idxs = [3]usize{ 0, 0, 0 };
@@ -142,8 +134,6 @@
     }
 };
-// helper func needed for generating huffman code
-// gets idxs of least 2 non-zero values in slice.
 inline fn get_idx_min2(freqs: []u32) ?struct { usize, usize } {
     var min1: u32 = undefined;
     var min1_idx: usize = undefined;
@@ -182,8 +172,6 @@ inline fn get_idx_min2(freqs: []u32) ?struct { usize, usize } {
     } else null;
 }
-// generates code mapping from BITS and HUFFVAL
-// see https://www.w3.org/Graphics/JPEG/itu-t81.pdf annex c
 inline fn gen_codes(huff: *Huffman, huffman_meta: HuffmanMeta) !void {
     var huffsize = [_]u5{0} ** 256;
     var huffcode = [_]u16{0} ** 256;
@@ -221,6 +209,7 @@ inline fn gen_codes(huff: *Huffman, huffman_meta: HuffmanMeta) !void {
             si += 1;
         }
     }
+
     for (0..total_k) |x| {
         try huff.put(huffman_meta.huffval[x], HuffCode{
             .n_bits = huffsize[x],
@@ -229,12 +218,10 @@ inline fn gen_codes(huff: *Huffman, huffman_meta: HuffmanMeta) !void {
     }
 }
-// generate BITS and HUFFVAL
-// see https://www.w3.org/Graphics/JPEG/itu-t81.pdf annex K
 inline fn gen_huffman(freqs: []u32) HuffmanMeta {
     freqs[freqs.len - 1] = 1;
     var codesizes = [_]u8{0} ** 257;
-    var others = [_]u9{0x1ff} ** 257; // 0x1ff since -1 not available and i8 wouldn't fit anyway
+    var others = [_]u9{0x1ff} ** 257;
     while (get_idx_min2(freqs)) |tmp| {
         var v1 = tmp.@"0";
         var v2 = tmp.@"1";
@@ -257,15 +244,17 @@ inline fn gen_huffman(freqs: []u32) HuffmanMeta {
             bits[codesizes[i]] += 1;
         }
     }
     var i: usize = 32;
     while (i > 16) {
         if (bits[i] > 0) {
             var j = i - 2;
-            while (bits[j] <= 0) : (j -= 1) {}
+            while (bits[j] <= 0) : (j -= 1) {
             bits[i] -= 2;
             bits[i - 1] += 1;
             bits[j + 1] += 2;
             bits[j] -= 1;
+            }
         } else {
             i -= 1;
         }
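For reference, the left-hand side of this hunk follows the code-length-limiting procedure (Adjust_BITS) of ITU-T T.81 Annex K, which the deleted comments cite: symbols are moved in pairs out of code lengths above 16 bits. A standalone sketch of that procedure (hypothetical helper; `bits` is indexed by code length as in the spec):

```zig
// Caps Huffman code lengths at 16 bits: repeatedly take a pair of codes at an
// overlong length i, re-express one of them one level shorter, and pay for it
// by splitting a code at the nearest shorter non-empty length j. Assumes such
// a j exists, which reserving a symbol (freqs[freqs.len - 1] = 1) guarantees.
fn adjust_bits(bits: *[33]u32) void {
    var i: usize = 32;
    while (i > 16) {
        if (bits[i] > 0) {
            // find the next-shorter non-empty code length
            var j = i - 2;
            while (bits[j] == 0) : (j -= 1) {}
            bits[i] -= 2;
            bits[i - 1] += 1;
            bits[j + 1] += 2;
            bits[j] -= 1;
        } else {
            i -= 1;
        }
    }
}
```

The right-hand side of the hunk moves the four redistribution steps inside the inner search loop, which no longer matches the Annex K procedure.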
@@ -290,8 +279,6 @@ inline fn gen_huffman(freqs: []u32) HuffmanMeta {
     };
 }
-// returns size field for given value tested switch and bitshifts, this was
-// somehow the fastest, slightly better than bit shifting
 inline fn get_size(n: i16) u8 {
     if (n == 0) {
         return 0;
@@ -322,9 +309,6 @@ inline fn get_size(n: i16) u8 {
     }
 }
-// for each block, process the dc by differencing it and perform RLE.
-// dc symbols are just the size, ac symbols are the rle in top 4 bits, size in next
-// append 0x00 at end of each block.
 fn parse_block(block: *util.BlockQuantized, dc_diff: *i16, rle: *RLE_Seq, dc_freqs: []u32, ac_freqs: []u32) !void {
     const diff = block[0] - dc_diff.*;
     var symbol = get_size(diff);
@@ -366,14 +350,12 @@ fn write_headers(f: std.fs.File, buff: *const util.Buffers) !void {
     const w = buff.Y.len * 16;
     const h = buff.Y[0].len * 16;
-    // write out magic marker 0xff 0xd8 and both quant tables,
     var out_buff = [6]u8{ 0xff, 0xd8, 0xff, 0xdb, 0x00, 0x84 } ++ [1]u8{0x00} ++ [_]u8{0x00} ** 64 ++ [1]u8{0x01} ++ [_]u8{0x00} ** 64;
     for (0..64) |i| {
         out_buff[7 + i] = @intFromFloat(buff.Q_Lum[i]);
         out_buff[7 + i + 65] = @intFromFloat(buff.Q_Chrom[i]);
     }
     _ = try f.write(&out_buff);
-    // write out SOF block, everything except w and h are largely irrelevant
     var sof_buff = [_]u8{
         0xff, 0xc0, 0x00, 0x11, 0x08, @truncate(w >> 8), @truncate(w & 0x00ff), @truncate(h >> 8), @truncate(h & 0x00ff), 0x03,
         0x01, 0x22, 0x00, 0x02, 0x11, 0x01, 0x03, 0x11, 0x01,
@@ -382,7 +364,6 @@ fn write_headers(f: std.fs.File, buff: *const util.Buffers) !void {
 }
 fn dump_huffman(f: std.fs.File, huff_meta: HuffmanMeta, tree_i: usize) !void {
-    // write out huffman table bits and huffval representation
     const table_len = 3 + 16 + huff_meta.total_n;
     var out_buff = [5]u8{ 0xff, 0xc4, @truncate(table_len >> 8), @truncate(table_len & 0x00ff), @truncate(((tree_i % 2) << 4) | (tree_i / 2)) };
     _ = try f.write(&out_buff);
@@ -398,8 +379,11 @@ pub fn generate_jpg(buff: util.Buffers, alloc: std.mem.Allocator) !void {
     defer f.close();
     try write_headers(f, &buff);
-    // requires 2 passes, one for RLE and huffcode generation
-    // second to actually write out data
     try scan_data.do_rle_freq_pass(&buff, f);
     try scan_data.dump_scan(f);
+    // rle, huffman pass
+    // file headers
+    // quant + huffman write
+    // write scan
 }

threads.zig

@@ -6,11 +6,10 @@ const transform = @import("transform.zig");
 const AtomicBool = std.atomic.Atomic(bool);
 const AtomicU32 = std.atomic.Atomic(u32);
-// atomic variables for cross-thread control
 const Signals = struct {
     quit: AtomicBool,
     processed: AtomicU32,
-    eof_block: AtomicU32, // not bool to use with Futex wait
+    eof_block: AtomicU32,
     const Self = @This();
@@ -23,7 +22,6 @@ const Signals = struct {
     }
 };
-// stores the jobqueue
 const QueueWrap = struct {
     queue: *util.JobQueue,
     job_pool: util.JobPool,
@@ -75,7 +73,6 @@ pub const ThreadManager = struct {
     }
     pub fn quit(self: *Self) void {
-        // signal a quit and wait for threads to exit
         self.signals.quit.store(true, .Release);
         self.unblock();
         for (self.threads.items) |thread| {
@@ -88,21 +85,16 @@ pub const ThreadManager = struct {
     }
     pub fn unblock(self: *Self) void {
         self.signals.eof_block.store(0, .Release);
-        self.signals.processed.store(0, .Release);
-        std.Thread.Futex.wake(&self.signals.eof_block, @truncate(self.threads.items.len));
     }
 };
 fn quantize_loop(queue: *util.JobQueue, signals: *Signals, Q_Lum: *util.QTable, Q_Chrom: *util.QTable) void {
-    // loop while check jobs avail or quit is not signalled
     while (queue.HasJobs() or !signals.quit.load(std.builtin.AtomicOrder.Acquire)) : (std.time.sleep(1)) {
-        // if eof has been signalled, wait until it isn't
+        const job = queue.pop() orelse continue;
+        transform.quantize(job.source, job.target, if (job.is_lum) Q_Lum else Q_Chrom);
+        _ = @atomicRmw(u32, &signals.processed.value, .Add, 1, .SeqCst);
         if (signals.eof_block.load(.Acquire) == 1) {
             std.Thread.Futex.wait(&signals.eof_block, 1);
         }
-        const job = queue.pop() orelse continue; // if check was stolen by other thread
-        transform.quantize(job.source, job.target, if (job.is_lum) Q_Lum else Q_Chrom);
-        // increment processed var since block now processed
-        _ = @atomicRmw(u32, &signals.processed.value, .Add, 1, .SeqCst);
     }
 }

transform.zig

@@ -2,7 +2,6 @@ const std = @import("std");
 const util = @import("util.zig");
-// pre-computes coefficients at comptime
 pub const dct_coeffs = gen_coeffs();
 inline fn dct_cos(x: usize, f: usize) f16 {
@@ -15,7 +14,6 @@ inline fn dct_coeff(u: usize, v: usize) f16 {
     return 0.25 * (if (u == 0) 1.0 / @sqrt(2.0) else 1.0) * (if (v == 0) 1.0 / @sqrt(2.0) else 1.0);
 }
-// helper function to convert given u and v to the zigzag equivalents
 inline fn zz_conv(u: usize, v: usize) struct { u: usize, v: usize } {
     var band_i = u + v;
     const band_max = @min(7, band_i);
@@ -38,10 +36,6 @@ inline fn zz_band_len(band_i: usize) usize {
     return if (band_i < 8) band_i + 1 else 15 - band_i;
 }
-// generates an [8][8][8 * 8] set of values of dct coeffs
-// can be directly multiplied and summed to get dct value.
-// values are stored in zig-zagged order, so no need to rearrange
-// at runtime
 fn gen_coeffs() [8][8]@Vector(64, f16) {
     @setEvalBranchQuota(100000);
     var ret: [8][8]@Vector(64, f16) = undefined;
@@ -59,13 +53,6 @@ fn gen_coeffs() [8][8]@Vector(64, f16) {
     return ret;
 }
-// performs JPEG Type II DCT.
-// SIMD Vector optimzations applied if target supported
-// load source block bytes as floats
-// shift down 128
-// for each target coord in dct, mult shifted with corresponding coeff vector, add and store
-// divide dct by quant table values
-// store divved as i16 in target
 pub fn quantize(source: *util.Block, target: *util.BlockQuantized, qtable: *util.QTable) void {
     var source_holder: @Vector(64, f16) = undefined;
     var dct_holder: @Vector(64, f16) = undefined;
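The seven deleted comment lines were the only description of the quantize kernel, so a scalar sketch of the same steps may help. This is a minimal reference version under the usual DCT-II definitions; the repo's implementation instead uses @Vector(64, f16) with comptime-precomputed, zig-zag-ordered coefficients, and all names below are hypothetical:

```zig
const std = @import("std");

// Scaling factor from the DCT-II definition: 1/sqrt(2) for the DC row/column.
fn dct_scale(u: usize, v: usize) f32 {
    return 0.25 * (if (u == 0) 1.0 / @sqrt(2.0) else 1.0) * (if (v == 0) 1.0 / @sqrt(2.0) else 1.0);
}

// The same steps the deleted comments list: load bytes as floats, level-shift
// down by 128, forward DCT-II, divide by the quant table, store as i16.
// (Real encoders round to nearest; @intFromFloat truncates, kept simple here.)
fn quantize_sketch(source: *const [64]u8, target: *[64]i16, qtable: *const [64]f32) void {
    var shifted: [64]f32 = undefined;
    for (source, &shifted) |px, *s| s.* = @as(f32, @floatFromInt(px)) - 128.0;
    for (0..8) |v| {
        for (0..8) |u| {
            var acc: f32 = 0.0;
            for (0..8) |y| {
                for (0..8) |x| {
                    acc += shifted[y * 8 + x] *
                        @cos((2.0 * @as(f32, @floatFromInt(x)) + 1.0) * @as(f32, @floatFromInt(u)) * std.math.pi / 16.0) *
                        @cos((2.0 * @as(f32, @floatFromInt(y)) + 1.0) * @as(f32, @floatFromInt(v)) * std.math.pi / 16.0);
                }
            }
            const idx = v * 8 + u;
            target[idx] = @intFromFloat(dct_scale(u, v) * acc / qtable[idx]);
        }
    }
}
```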

util.zig

@@ -1,9 +1,5 @@
 const std = @import("std");
-pub const Errors = error{
-    EOFError,
-};
 pub const Options = struct {
     width: usize,
     height: usize,
@@ -21,10 +17,6 @@ pub const Job = struct {
     target: *BlockQuantized,
     is_lum: bool,
 };
-// rip std.atomic.Queue
-// simple Mutex wrapper around tailqueue,
-// also handles node storage in a MemoryPool
 pub const JobQueue = struct {
     const List = std.TailQueue(Job);
@@ -68,8 +60,6 @@ pub const JobQueue = struct {
 };
 pub const JobPool = std.heap.MemoryPool(JobQueue.List.Node);
-// main input buffers, + Qtables.
-// Y is stored with extra 2x2 block to make looping signifanctly less complicated
 pub const Buffers = struct {
     arena: std.heap.ArenaAllocator,
@@ -109,10 +99,8 @@ pub const Buffers = struct {
             .U_quant = try alloc.alloc([]BlockQuantized, block_h),
             .V_quant = try alloc.alloc([]BlockQuantized, block_h),
-            // h*w Y pixels, (h/2)*(w/2) U, V pixels, 64 pixels per block
             .num_blocks = @truncate(w * h * 3 / 2 / 64),
-            // read full block row at a time
             .input_buff = try alloc.alloc(u8, w * 8),
         };
         for (0..block_h) |i| {
@@ -134,15 +122,11 @@ pub const Buffers = struct {
     }
 };
-// simplistic qtable generation
-// assuming each high frequency band gets less and less important,
-// simply factor it all out by increasing q-value by band.
-// makes generation -> zig-zag simpler since can just do it in one step
 pub fn gen_qtable(q: f16, step_start_band: usize, step_stop_band: usize) @Vector(64, f16) {
     var ret: @Vector(64, f16) = [_]f32{0.0} ** 64;
     const band_range = step_stop_band - step_start_band;
     const q_max: usize = @intFromFloat(255 - 235 * q);
-    const q_min: usize = @intFromFloat(58 - 50 * q);
+    const q_min = 8;
     var step = (q_max - q_min) / band_range;
     var idx: usize = 0;
     for (0..16) |band_i| {
@@ -150,8 +134,9 @@ pub fn gen_qtable(q: f16, step_start_band: usize, step_stop_band: usize) @Vector
         for (0..band_len) |j| {
             if (band_i < step_start_band) {
                 ret[idx + j] = @floatFromInt(q_min);
-            } else if (band_i < band_range + step_start_band) {
-                ret[idx + j] = @floatFromInt(q_min + step * (band_i - step_start_band));
+            }
+            if (band_i < band_range + step_start_band) {
+                ret[idx + j] = @floatFromInt(q_min + step * band_i);
             } else {
                 ret[idx + j] = @floatFromInt(q_max);
             }
@@ -161,12 +146,10 @@ pub fn gen_qtable(q: f16, step_start_band: usize, step_stop_band: usize) @Vector
     return ret;
 }
-// bit writer for scan data, adds buffering with byte stuffing
-// since BitWriter and BufferedWriter can be combined but bytes cannot be stuffed
 pub const BufferedBitWriter = struct {
     byte_buff: u8,
     bits_used: u4,
-    buffer: [8]u8,
+    buffer: [32]u8,
     buffer_idx: usize,
     f: std.fs.File,
@@ -177,65 +160,18 @@ pub const BufferedBitWriter = struct {
             .f = f,
             .byte_buff = 0,
             .bits_used = 0,
-            .buffer = [_]u8{0x00} ** 8,
+            .buffer = [_]u8{0x00} ** 32,
             .buffer_idx = 0,
         };
     }
-    // check if space avail is >= space needed
-    // if yes, just shove in bits,
-    // else shove in bits available, recurse with remaining
     pub fn write_bits(self: *Self, val: u16, n_bits: u5) !void {
-        const curr_byte_space = 8 - self.bits_used;
-        if (n_bits <= curr_byte_space) {
-            try self.add_bits(val, @truncate(n_bits));
-        } else {
-            try self.add_bits(val >> @truncate(n_bits - curr_byte_space), @truncate(curr_byte_space));
-            const val_remaining_mask = try std.math.powi(u16, 2, n_bits - curr_byte_space) - 1;
-            try self.write_bits(val & val_remaining_mask, n_bits - curr_byte_space);
-        }
-    }
-    // since write_bits handles alignment and extra values, adding here can only
-    // ever result in either a full byte or partial, no overflow into the next
-    inline fn add_bits(self: *Self, val: u16, n_bits: u4) !void {
-        self.byte_buff |= @truncate(val << (8 - self.bits_used - n_bits));
-        self.bits_used += n_bits;
-        if (self.bits_used == 8) {
-            // emit value
-            self.buffer[self.buffer_idx] = self.byte_buff;
-            if (self.byte_buff == 0xff) {
-                // if byte stuff, simply skip the next idx to get 0xff 0x00
-                self.buffer_idx += 1;
-            }
-            self.buffer_idx += 1;
-            self.bits_used = 0;
-            self.byte_buff = 0x00;
-            // if end of buffer was reached simply flush and wrap around
-            // the extra stuff increment should be preserved even if 0xff was
-            // the last value
-            if (self.buffer_idx >= self.buffer.len) {
-                try self.flush();
-                self.buffer_idx %= self.buffer.len;
-            }
-        }
+        _ = n_bits;
+        _ = val;
+        _ = self;
     }
     pub fn flush(self: *Self) !void {
-        _ = try self.f.write(&self.buffer);
-        @memset(&self.buffer, 0);
-    }
-    // special flush case when ending, requires partial flush and byte stuffing the
-    // last byte with 1s
-    pub fn flush_end(self: *Self) !void {
-        if (self.bits_used != 0) {
-            const stuffing = try std.math.powi(u8, 2, 8 - self.bits_used) - 1;
-            try self.add_bits(stuffing, 8 - self.bits_used);
-        }
-        if (self.buffer_idx != 0) {
-            _ = try self.f.write(self.buffer[0..self.buffer_idx]);
-        }
+        _ = self;
     }
 };
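One detail of the deleted add_bits worth spelling out: JPEG entropy-coded data must follow every 0xff byte with a stuffed 0x00, and the writer gets this by pre-zeroing its buffer and simply skipping an index after emitting 0xff. A self-contained illustration of that trick (hypothetical test, not code from the repo):

```zig
const std = @import("std");

test "skip-an-index byte stuffing" {
    // Buffer starts zeroed, as in BufferedBitWriter.init; skipping one slot
    // after a 0xff therefore leaves the mandatory 0x00 in place.
    var buffer = [_]u8{0x00} ** 8;
    var idx: usize = 0;
    for ([_]u8{ 0x12, 0xff, 0x34 }) |byte| {
        buffer[idx] = byte;
        if (byte == 0xff) idx += 1; // skipped slot already holds 0x00
        idx += 1;
    }
    try std.testing.expectEqualSlices(u8, &[_]u8{ 0x12, 0xff, 0x00, 0x34 }, buffer[0..4]);
}
```

Per the deleted wrap-around comment, the extra increment is preserved across a flush, so a 0xff landing at the very end of the buffer still gets its 0x00 at the start of the next (freshly zeroed) one.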