Compare commits

22a63d68bf ... c91a2a4a0b (10 commits):

- c91a2a4a0b
- c53efc58a3
- 288a2bcc2d
- 14a95f0c18
- a6d204dc42
- 58e79cd95e
- 5ef9f52d95
- f97456aab8
- 9648f06186
- 6c91c2c228

7 changed files with 184 additions and 45 deletions
README.md | 14 (new file)

@@ -0,0 +1,14 @@
+# img-stream-enc
+
+A continuous JPEG encoder to pipe raw YUV420P images into. Works well enough at quality 0.5; for some reason anything higher tends to corrupt the output file.
+
+Zig std lib only. Uses SIMD vector optimization plus pre-computed coefficients for quantization. The bottleneck is RLE, Huffman coding, and output.
+
+**Not intended for actual use.** Again, higher quality settings tend to corrupt files. Corruption is progressive: pixels start getting wonky well before decoders complain, though the image stays usable.
+
+`zig build` produces the binary. Usage: `img-stream-enc <width> <height> <quality> <n_jobs>`
+
+* `width`: pixel width of the input image (must be a multiple of 16)
+* `height`: pixel height of the input (multiple of 16)
+* `quality`: quality factor, a float between 0 and 1
+* `n_jobs`: number of threads to spawn for quantization
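A hypothetical invocation for testing, assuming ffmpeg as the YUV420P source (file name and dimensions are examples only; the encoder reads frames from stdin):

```sh
# decode any video to raw YUV420P frames and stream them into the encoder
ffmpeg -i input.mp4 -f rawvideo -pix_fmt yuv420p - | ./img-stream-enc 1280 720 0.5 4
```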
src/input.zig

@@ -4,6 +4,8 @@ const util = @import("util.zig");
 const threads = @import("threads.zig");
 const output = @import("output.zig");

+// helper func, computes the appropriate coords for luminance
+// [macro_block][macro_block][intra_macro_block]
 inline fn lum_idxs(i: usize, j: usize) struct { usize, usize, usize } {
     return .{
         i / 2,
@@ -12,12 +14,15 @@ inline fn lum_idxs(i: usize, j: usize) struct { usize, usize, usize } {
     };
 }

+// either 2 separate funcs (see read_chrom) or multiple if statements, will be bad anyway
+// incrementally copies 8 byte sequences from the input buffer to the appropriate coords in the Y buffer
+// sends any completed blocks off to be processed for quantization as they fill
 fn read_lum(f: std.fs.File, source_buff: [][][4]util.Block, target_buff: [][][4]util.BlockQuantized, io_buff: []u8, queue: *util.JobQueue) !void {
     const block_h = source_buff.len;
     const block_w = source_buff[0].len;

     for (0..block_h * 2) |i| {
-        _ = try f.read(io_buff);
+        if (io_buff.len != try f.readAll(io_buff)) return util.Errors.EOFError;
         var io_idx: usize = 0;
         for (0..8) |I| {
             for (0..block_w * 2) |j| {
@@ -35,13 +40,15 @@ fn read_lum(f: std.fs.File, source_buff: [][][4]util.Block, target_buff: [][][4]
             }
         }
     }

+// either 2 separate funcs (see read_lum) or multiple if statements, will be bad anyway
+// incrementally copies 8 byte sequences from the input buffer to the appropriate coords in the U or V buffer
+// sends any completed blocks off to be processed for quantization as they fill
 fn read_chrom(f: std.fs.File, source_buff: [][]util.Block, target_buff: [][]util.BlockQuantized, io_buff: []u8, queue: *util.JobQueue) !void {
     const block_h = source_buff.len;
     const block_w = source_buff[0].len;

     for (0..block_h) |i| {
-        _ = try f.read(io_buff);
+        if (io_buff.len != try f.readAll(io_buff)) return util.Errors.EOFError;
         var io_idx: usize = 0;
         for (0..8) |I| {
             for (0..block_w) |j| {
@@ -61,11 +68,17 @@ fn read_chrom(f: std.fs.File, source_buff: [][]util.Block, target_buff: [][]util

 pub fn main_loop(f: std.fs.File, buffs: util.Buffers, thread_mgr: *threads.ThreadManager, alloc: std.mem.Allocator) !void {
     defer thread_mgr.quit();
+    while (true) {
+        // resets control atomic variables, resumes quantizers
         thread_mgr.unblock();
         try read_lum(f, buffs.Y, buffs.Y_quant, buffs.input_buff, thread_mgr.queue_wrp.queue);
+        // U and V are downsampled, half len buffers
         try read_chrom(f, buffs.U, buffs.U_quant, buffs.input_buff[0 .. buffs.input_buff.len / 2], thread_mgr.queue_wrp.queue);
         try read_chrom(f, buffs.V, buffs.V_quant, buffs.input_buff[0 .. buffs.input_buff.len / 2], thread_mgr.queue_wrp.queue);
+        // wait until all blocks have been processed
         while (thread_mgr.signals.processed.load(.Acquire) != buffs.num_blocks) : (std.time.sleep(1)) {}
+        // sets the eof signal atomic variable, prevents quantizers from checking jobs when none can generate
        thread_mgr.eof();
        try output.generate_jpg(buffs, alloc);
    }
+}
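The switch from `f.read` to `f.readAll` fixes short reads: on a pipe, `read` may legally return fewer bytes than requested even though the stream is not done, so treating one short `read` as a full frame corrupts the block layout. A minimal sketch of the loop `readAll` performs internally (illustrative only, not the std implementation):

```zig
const std = @import("std");

// Fill `buf` completely or report how much actually arrived.
// A short read from a pipe is not EOF; only read() == 0 is.
fn readFull(f: std.fs.File, buf: []u8) !usize {
    var got: usize = 0;
    while (got < buf.len) {
        const n = try f.read(buf[got..]);
        if (n == 0) break; // true end of stream
        got += n;
    }
    return got; // caller treats got != buf.len as EOF mid-frame
}
```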
src/main.zig | 13

@@ -17,6 +17,7 @@ inline fn next_arg(args: *std.process.ArgIterator) ![:0]const u8 {

 inline fn get_dim(arg: [:0]const u8) !usize {
     const d = try std.fmt.parseInt(usize, arg, 10);
+    // forces image to be multiples of 16x16 mcu size
     if (d % 16 != 0 or d == 0) {
         return InitError.InvalidDimension;
     }
@@ -25,6 +26,7 @@ inline fn get_dim(arg: [:0]const u8) !usize {

 inline fn get_qual(arg: [:0]const u8) !f16 {
     const q = try std.fmt.parseFloat(f16, arg);
+    // quality factor for quant table generation, 0-1 scale
     if (q < 0 or q > 1) {
         return InitError.InvalidQuality;
     }
@@ -32,6 +34,7 @@ inline fn get_qual(arg: [:0]const u8) !f16 {
 }

 inline fn get_n_jobs(arg: [:0]const u8) !usize {
+    // for multithreaded dct computation per block
     const n = try std.fmt.parseInt(usize, arg, 10);
     if (n == 0) {
         return InitError.InvalidNumJobs;
@@ -40,6 +43,7 @@ inline fn get_n_jobs(arg: [:0]const u8) !usize {
 }

 fn get_opts() !util.Options {
+    // args can be ignored after, just use a mini buffer instead
     var buff: [100]u8 = undefined;
     var fba = std.heap.FixedBufferAllocator.init(&buff);
     var alloc = fba.allocator();
@@ -65,5 +69,12 @@ pub fn main() !void {
     defer thread_manager.deinit();

     var f = std.io.getStdIn();
-    try input.main_loop(f, buffs, &thread_manager, std.heap.page_allocator);
+    input.main_loop(f, buffs, &thread_manager, std.heap.page_allocator) catch |err| {
+        switch (err) {
+            util.Errors.EOFError => {
+                return;
+            },
+            else => return err,
+        }
+    };
 }
src/output.zig

@@ -45,17 +45,20 @@ const RLEWriter = struct {
         }
     }

+    // must write out the huffcode and extra bits if size >= 1
     fn write_value(self: *Self, huffcode: HuffCode, unit: RLE_Unit) !void {
         try self.bw.write_bits(huffcode.value, huffcode.n_bits);
+        // negative values must be written as val - 1 with the same number of bits as orig
         const unit_val: u16 = if (unit.value >= 0) @bitCast(unit.value) else @bitCast(unit.value - 1);
-        try self.bw.write_bits(unit_val, @truncate(unit.symbol & 0x0f));
+        const value_size = unit.symbol & 0x0f;
+        if (value_size != 0) {
+            // 0 values only need the huffcode, no extra bits
+            try self.bw.write_bits(unit_val & (try std.math.powi(u16, 2, value_size) - 1), @truncate(value_size));
+        }
     }

     inline fn flush(self: *Self) !void {
-        if (self.bw.bits_used != 0) {
-            const bits_left = 8 - self.bw.bits_used;
-            try self.bw.write_bits(0xf, bits_left);
-        }
+        try self.bw.flush_end();
     }
 };
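The `unit.value - 1` line implements JPEG's magnitude coding (ITU-T T.81, F.1.2.1): a value of size `s` is transmitted as its low `s` bits, and negative values are first decremented so the top transmitted bit doubles as a sign flag. A standalone sketch of both directions, with hypothetical helper names, not the repo's API:

```zig
// JPEG magnitude coding sketch: encode(-3, 2) == 0b00, encode(3, 2) == 0b11.
fn encodeMagnitude(v: i16, size: u4) u16 {
    const raw: u16 = @bitCast(if (v >= 0) v else v - 1); // negatives: one's-complement style
    return raw & ((@as(u16, 1) << size) - 1); // keep only the low `size` bits
}

fn decodeMagnitude(bits: u16, size: u4) i16 {
    if (size == 0) return 0;
    const top = @as(u16, 1) << (size - 1);
    if (bits & top != 0) return @intCast(bits); // top bit set: positive as-is
    // top bit clear: negative, undo the encoder's -1 offset
    return @as(i16, @intCast(bits)) - @as(i16, @intCast((@as(u16, 1) << size) - 1));
}
```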
@@ -91,10 +94,11 @@ const Scan = struct {
         }
     }

+    // operates on full image, generating an RLE sequence and frequency values,
+    // then gens corresponding hufftable
     fn do_rle_freq_pass(self: *Self, buff: *const util.Buffers, f: std.fs.File) !void {
         const h = buff.Y_quant.len;
         const w = buff.Y_quant[0].len;

         for (0..4) |i| {
             @memset(self.freqs[i], 0);
         }
@@ -118,7 +122,11 @@ const Scan = struct {
         }
     }

+    // dumps scan, order of Y macroblock (blockx4), U block, V block,
+    // need per block since first must be interpreted as dc
+    // 0x00 dc does not signal next block, 0x00 ac does
     fn dump_scan(self: *Self, f: std.fs.File) !void {
+        // mostly hardcoded values
         _ = try f.write(&[_]u8{ 0xff, 0xda, 0x00, 0x0c, 0x03, 0x01, 0x00, 0x02, 0x11, 0x03, 0x11, 0x00, 0x3f, 0x00 });
         var bw = RLEWriter.init(f);
         var idxs = [3]usize{ 0, 0, 0 };
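For reference, the hardcoded SOS header at the top of `dump_scan` decodes as follows (standard JPEG marker layout):

```zig
// The Start-of-Scan bytes from dump_scan, annotated:
const sos = [_]u8{
    0xff, 0xda, // SOS marker
    0x00, 0x0c, // segment length: 12 bytes
    0x03, // 3 components in the scan (Y, U, V)
    0x01, 0x00, // component 1 (Y): DC table 0, AC table 0
    0x02, 0x11, // component 2 (U): DC table 1, AC table 1
    0x03, 0x11, // component 3 (V): DC table 1, AC table 1
    0x00, 0x3f, 0x00, // spectral selection 0..63, successive approximation 0
};
```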
@@ -134,6 +142,8 @@ const Scan = struct {
     }
 };

+// helper func needed for generating huffman code
+// gets idxs of the 2 smallest non-zero values in the slice.
 inline fn get_idx_min2(freqs: []u32) ?struct { usize, usize } {
     var min1: u32 = undefined;
     var min1_idx: usize = undefined;
@@ -172,6 +182,8 @@ inline fn get_idx_min2(freqs: []u32) ?struct { usize, usize } {
     } else null;
 }

+// generates code mapping from BITS and HUFFVAL
+// see https://www.w3.org/Graphics/JPEG/itu-t81.pdf annex C
 inline fn gen_codes(huff: *Huffman, huffman_meta: HuffmanMeta) !void {
     var huffsize = [_]u5{0} ** 256;
     var huffcode = [_]u16{0} ** 256;
@@ -209,7 +221,6 @@ inline fn gen_codes(huff: *Huffman, huffman_meta: HuffmanMeta) !void {
             si += 1;
         }
     }
-
     for (0..total_k) |x| {
         try huff.put(huffman_meta.huffval[x], HuffCode{
             .n_bits = huffsize[x],
@@ -218,10 +229,12 @@ inline fn gen_codes(huff: *Huffman, huffman_meta: HuffmanMeta) !void {
         }
     }

+// generate BITS and HUFFVAL
+// see https://www.w3.org/Graphics/JPEG/itu-t81.pdf annex K
 inline fn gen_huffman(freqs: []u32) HuffmanMeta {
     freqs[freqs.len - 1] = 1;
     var codesizes = [_]u8{0} ** 257;
-    var others = [_]u9{0x1ff} ** 257;
+    var others = [_]u9{0x1ff} ** 257; // 0x1ff since -1 not available and i8 wouldn't fit anyway
     while (get_idx_min2(freqs)) |tmp| {
         var v1 = tmp.@"0";
         var v2 = tmp.@"1";
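Annex C's code assignment is simple once isolated: walk the code lengths in order, hand out consecutive integers within a length, and double the counter when the length grows. A standalone sketch under assumed types, not the repo's `gen_codes`:

```zig
// Canonical Huffman assignment per T.81 Annex C.
// bits[l] = number of codes of length l (1..16); codes are assigned in
// HUFFVAL order. E.g. two length-2 codes and one length-3 code yield
// 00, 01, 100.
fn assignCodes(bits: [17]u8, codes: []u16) void {
    var code: u16 = 0;
    var k: usize = 0;
    var len: usize = 1;
    while (len <= 16) : (len += 1) {
        var i: u8 = 0;
        while (i < bits[len]) : (i += 1) {
            codes[k] = code; // k-th HUFFVAL gets `code` at bit length `len`
            code += 1;
            k += 1;
        }
        code <<= 1; // stepping to the next length doubles the counter
    }
}
```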
@@ -244,17 +257,15 @@ inline fn gen_huffman(freqs: []u32) HuffmanMeta {
             bits[codesizes[i]] += 1;
         }
     }

     var i: usize = 32;
     while (i > 16) {
         if (bits[i] > 0) {
             var j = i - 2;
-            while (bits[j] <= 0) : (j -= 1) {
+            while (bits[j] <= 0) : (j -= 1) {}
             bits[i] -= 2;
             bits[i - 1] += 1;
             bits[j + 1] += 2;
             bits[j] -= 1;
-            }
         } else {
             i -= 1;
         }
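The `{}` change above is the Annex K `Adjust_BITS` fix: the inner loop must only search for the nearest shorter non-empty code length `j`, and the redistribution then happens once per over-long pair, not once per probe as the old nesting did. The corrected procedure, restated on its own (same logic as the fixed code):

```zig
// T.81 Annex K Adjust_BITS: fold code lengths > 16 back into range by
// moving a pair of over-long leaves up and splitting one shorter code.
fn adjustBits(bits: []u8) void {
    var i: usize = 32;
    while (i > 16) {
        if (bits[i] > 0) {
            var j = i - 2;
            while (bits[j] <= 0) : (j -= 1) {} // only searches; body is empty
            bits[i] -= 2; // remove two codes of length i
            bits[i - 1] += 1; // one becomes length i-1
            bits[j + 1] += 2; // two new codes of length j+1
            bits[j] -= 1; // carved out of one code of length j
        } else {
            i -= 1;
        }
    }
}
```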
@@ -279,6 +290,8 @@ inline fn gen_huffman(freqs: []u32) HuffmanMeta {
     };
 }

+// returns size field for given value. tested switch and bitshifts, the switch was
+// somehow the fastest, slightly better than bit shifting
 inline fn get_size(n: i16) u8 {
     if (n == 0) {
         return 0;
@@ -309,6 +322,9 @@ inline fn get_size(n: i16) u8 {
     }
 }

+// for each block, process the dc by differencing it and perform RLE.
+// dc symbols are just the size, ac symbols are the rle in the top 4 bits, size in the next
+// append 0x00 at the end of each block.
 fn parse_block(block: *util.BlockQuantized, dc_diff: *i16, rle: *RLE_Seq, dc_freqs: []u32, ac_freqs: []u32) !void {
     const diff = block[0] - dc_diff.*;
     var symbol = get_size(diff);
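Two reference notes on the encoding those comments describe, written as hypothetical helpers (the repo uses a switch for `get_size` and inlines the symbol packing): the size category is just the bit length of |n|, and an AC symbol packs (run, size) into one byte.

```zig
// Size category = bit length of |n|; a @clz alternative to the switch.
inline fn getSizeClz(n: i16) u8 {
    const wide: i32 = n; // widen so |minInt(i16)| is representable
    const mag: u32 = @intCast(if (wide < 0) -wide else wide);
    return @intCast(32 - @clz(mag)); // @clz(0) == 32, so 0 maps to size 0
}

// AC symbol layout: zero-run length (0..15) in the high nibble, size low.
inline fn acSymbol(run: u4, size: u4) u8 {
    return (@as(u8, run) << 4) | size;
}
```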
@@ -350,12 +366,14 @@ fn write_headers(f: std.fs.File, buff: *const util.Buffers) !void {
     const w = buff.Y.len * 16;
     const h = buff.Y[0].len * 16;

+    // write out magic marker 0xff 0xd8 and both quant tables
     var out_buff = [6]u8{ 0xff, 0xd8, 0xff, 0xdb, 0x00, 0x84 } ++ [1]u8{0x00} ++ [_]u8{0x00} ** 64 ++ [1]u8{0x01} ++ [_]u8{0x00} ** 64;
     for (0..64) |i| {
         out_buff[7 + i] = @intFromFloat(buff.Q_Lum[i]);
         out_buff[7 + i + 65] = @intFromFloat(buff.Q_Chrom[i]);
     }
     _ = try f.write(&out_buff);
+    // write out SOF block, everything except w and h is largely irrelevant
     var sof_buff = [_]u8{
         0xff, 0xc0, 0x00, 0x11, 0x08, @truncate(w >> 8), @truncate(w & 0x00ff), @truncate(h >> 8), @truncate(h & 0x00ff), 0x03,
         0x01, 0x22, 0x00, 0x02, 0x11, 0x01, 0x03, 0x11, 0x01,
@@ -364,6 +382,7 @@ fn write_headers(f: std.fs.File, buff: *const util.Buffers) !void {
 }

 fn dump_huffman(f: std.fs.File, huff_meta: HuffmanMeta, tree_i: usize) !void {
+    // write out huffman table bits and huffval representation
     const table_len = 3 + 16 + huff_meta.total_n;
     var out_buff = [5]u8{ 0xff, 0xc4, @truncate(table_len >> 8), @truncate(table_len & 0x00ff), @truncate(((tree_i % 2) << 4) | (tree_i / 2)) };
     _ = try f.write(&out_buff);
@@ -379,11 +398,8 @@ pub fn generate_jpg(buff: util.Buffers, alloc: std.mem.Allocator) !void {
     defer f.close();
     try write_headers(f, &buff);

+    // requires 2 passes: one for RLE and huffcode generation,
+    // a second to actually write out the data
     try scan_data.do_rle_freq_pass(&buff, f);
     try scan_data.dump_scan(f);
-
-    // rle, huffman pass
-    // file headers
-    // quant + huffman write
-    // write scan
 }
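The header bytes `write_headers` emits are standard JPEG markers; a breakdown for reference:

```zig
// SOI + DQT prefix: 0xffd8 (start of image), 0xffdb (quant tables),
// length 0x0084 = 132 = 2 + 2 * (1 id byte + 64 table bytes).
const dqt_prefix = [_]u8{ 0xff, 0xd8, 0xff, 0xdb, 0x00, 0x84 };

// SOF0 (baseline DCT) component specs: id, sampling byte, quant table id.
// Y uses sampling 0x22 (2x2, i.e. 4:2:0 relative to U and V at 0x11).
const sof0_components = [_]u8{ 0x01, 0x22, 0x00, 0x02, 0x11, 0x01, 0x03, 0x11, 0x01 };
```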
src/threads.zig

@@ -6,10 +6,11 @@ const transform = @import("transform.zig");
 const AtomicBool = std.atomic.Atomic(bool);
 const AtomicU32 = std.atomic.Atomic(u32);

+// atomic variables for cross-thread control
 const Signals = struct {
     quit: AtomicBool,
     processed: AtomicU32,
-    eof_block: AtomicU32,
+    eof_block: AtomicU32, // not bool so it can be used with Futex wait

     const Self = @This();

@@ -22,6 +23,7 @@ const Signals = struct {
     }
 };

+// stores the jobqueue
 const QueueWrap = struct {
     queue: *util.JobQueue,
     job_pool: util.JobPool,
@@ -73,6 +75,7 @@ pub const ThreadManager = struct {
     }

     pub fn quit(self: *Self) void {
+        // signal a quit and wait for threads to exit
         self.signals.quit.store(true, .Release);
         self.unblock();
         for (self.threads.items) |thread| {
@@ -85,16 +88,21 @@ pub const ThreadManager = struct {
     }
     pub fn unblock(self: *Self) void {
         self.signals.eof_block.store(0, .Release);
+        self.signals.processed.store(0, .Release);
+        std.Thread.Futex.wake(&self.signals.eof_block, @truncate(self.threads.items.len));
     }
 };

 fn quantize_loop(queue: *util.JobQueue, signals: *Signals, Q_Lum: *util.QTable, Q_Chrom: *util.QTable) void {
+    // loop while jobs are available or quit has not been signalled
     while (queue.HasJobs() or !signals.quit.load(std.builtin.AtomicOrder.Acquire)) : (std.time.sleep(1)) {
-        const job = queue.pop() orelse continue;
-        transform.quantize(job.source, job.target, if (job.is_lum) Q_Lum else Q_Chrom);
-        _ = @atomicRmw(u32, &signals.processed.value, .Add, 1, .SeqCst);
+        // if eof has been signalled, wait until it isn't
         if (signals.eof_block.load(.Acquire) == 1) {
             std.Thread.Futex.wait(&signals.eof_block, 1);
         }
+        const job = queue.pop() orelse continue; // if check was stolen by other thread
+        transform.quantize(job.source, job.target, if (job.is_lum) Q_Lum else Q_Chrom);
+        // increment processed var since block now processed
+        _ = @atomicRmw(u32, &signals.processed.value, .Add, 1, .SeqCst);
     }
 }
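The "not bool" comment is about the `std.Thread.Futex` API (Zig 0.11-era): `wait` takes a pointer to an atomic `u32` plus the value it expects to still be there, and returns once a `wake` lands or the value has changed, so a `bool` flag cannot be used directly. A minimal sketch of the same gate shape, assuming that std version:

```zig
const std = @import("std");
const Atomic = std.atomic.Atomic; // Zig 0.11-era name

var gate = Atomic(u32).init(0);

// Parks the caller only while gate == 1; spurious wakeups are handled
// by re-checking the value in a loop, exactly as quantize_loop does.
fn park() void {
    while (gate.load(.Acquire) == 1) {
        std.Thread.Futex.wait(&gate, 1);
    }
}

fn release(n_waiters: u32) void {
    gate.store(0, .Release);
    std.Thread.Futex.wake(&gate, n_waiters);
}
```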
src/transform.zig

@@ -2,6 +2,7 @@ const std = @import("std");

 const util = @import("util.zig");

+// pre-computes coefficients at comptime
 pub const dct_coeffs = gen_coeffs();

 inline fn dct_cos(x: usize, f: usize) f16 {
@@ -14,6 +15,7 @@ inline fn dct_coeff(u: usize, v: usize) f16 {
     return 0.25 * (if (u == 0) 1.0 / @sqrt(2.0) else 1.0) * (if (v == 0) 1.0 / @sqrt(2.0) else 1.0);
 }

+// helper function to convert given u and v to the zigzag equivalents
 inline fn zz_conv(u: usize, v: usize) struct { u: usize, v: usize } {
     var band_i = u + v;
     const band_max = @min(7, band_i);
@@ -36,6 +38,10 @@ inline fn zz_band_len(band_i: usize) usize {
     return if (band_i < 8) band_i + 1 else 15 - band_i;
 }

+// generates an [8][8][8 * 8] set of dct coeff values that
+// can be directly multiplied and summed to get a dct value.
+// values are stored in zig-zagged order, so no need to rearrange
+// at runtime
 fn gen_coeffs() [8][8]@Vector(64, f16) {
     @setEvalBranchQuota(100000);
     var ret: [8][8]@Vector(64, f16) = undefined;
@@ -53,6 +59,13 @@ fn gen_coeffs() [8][8]@Vector(64, f16) {
     return ret;
 }

+// performs JPEG Type II DCT.
+// SIMD vector optimizations applied if the target supports them:
+// load source block bytes as floats,
+// shift down by 128,
+// for each target coord in the dct, multiply shifted with the corresponding coeff vector, add and store,
+// divide the dct by the quant table values,
+// store the result as i16 in target
 pub fn quantize(source: *util.Block, target: *util.BlockQuantized, qtable: *util.QTable) void {
     var source_holder: @Vector(64, f16) = undefined;
     var dct_holder: @Vector(64, f16) = undefined;
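The precomputed table turns the 2-D DCT into 64 dot products: entry (u, v) of `dct_coeffs` holds, for all 64 pixel positions, the product of the two basis cosines and the normalization, so `quantize` only multiplies, sums, and divides. The formula being tabulated is the standard Type II DCT, matching the `0.25` and `1/sqrt(2)` factors visible in `dct_coeff`:

```latex
G(u,v) = \frac{1}{4}\,\alpha(u)\,\alpha(v)
  \sum_{x=0}^{7}\sum_{y=0}^{7} \bigl(s(x,y) - 128\bigr)
  \cos\frac{(2x+1)u\pi}{16}\,\cos\frac{(2y+1)v\pi}{16},
\qquad \alpha(0) = \tfrac{1}{\sqrt{2}},\; \alpha(u>0) = 1
```

The stored coefficient is then G(u,v) divided by the quantization table entry, truncated to i16.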
src/util.zig | 84

@@ -1,5 +1,9 @@
 const std = @import("std");

+pub const Errors = error{
+    EOFError,
+};
+
 pub const Options = struct {
     width: usize,
     height: usize,
@@ -17,6 +21,10 @@ pub const Job = struct {
     target: *BlockQuantized,
     is_lum: bool,
 };

+// rip std.atomic.Queue
+// simple Mutex wrapper around a TailQueue,
+// also handles node storage in a MemoryPool
 pub const JobQueue = struct {
     const List = std.TailQueue(Job);

@@ -60,6 +68,8 @@ pub const JobQueue = struct {
 };
 pub const JobPool = std.heap.MemoryPool(JobQueue.List.Node);

+// main input buffers, + Qtables.
+// Y is stored with an extra 2x2 block to make looping significantly less complicated
 pub const Buffers = struct {
     arena: std.heap.ArenaAllocator,

@@ -99,8 +109,10 @@ pub const Buffers = struct {
             .U_quant = try alloc.alloc([]BlockQuantized, block_h),
             .V_quant = try alloc.alloc([]BlockQuantized, block_h),

+            // h*w Y pixels, (h/2)*(w/2) U, V pixels, 64 pixels per block
             .num_blocks = @truncate(w * h * 3 / 2 / 64),

+            // read a full block row at a time
             .input_buff = try alloc.alloc(u8, w * 8),
         };
         for (0..block_h) |i| {
@@ -122,11 +134,15 @@ pub const Buffers = struct {
     }
 };

+// simplistic qtable generation:
+// assuming each high frequency band gets less and less important,
+// simply factor it all out by increasing the q-value by band.
+// makes generation -> zig-zag simpler since it can be done in one step
 pub fn gen_qtable(q: f16, step_start_band: usize, step_stop_band: usize) @Vector(64, f16) {
     var ret: @Vector(64, f16) = [_]f32{0.0} ** 64;
     const band_range = step_stop_band - step_start_band;
     const q_max: usize = @intFromFloat(255 - 235 * q);
-    const q_min = 8;
+    const q_min: usize = @intFromFloat(58 - 50 * q);
     var step = (q_max - q_min) / band_range;
     var idx: usize = 0;
     for (0..16) |band_i| {
@@ -134,9 +150,8 @@ pub fn gen_qtable(q: f16, step_start_band: usize, step_stop_band: usize) @Vector
         for (0..band_len) |j| {
             if (band_i < step_start_band) {
                 ret[idx + j] = @floatFromInt(q_min);
-            }
-            if (band_i < band_range + step_start_band) {
-                ret[idx + j] = @floatFromInt(q_min + step * band_i);
+            } else if (band_i < band_range + step_start_band) {
+                ret[idx + j] = @floatFromInt(q_min + step * (band_i - step_start_band));
             } else {
                 ret[idx + j] = @floatFromInt(q_max);
             }
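Worked through, the `q_min` change scales the table's floor with quality instead of pinning it at a constant (`@intFromFloat` truncates the fractional part):

```latex
q_{\max} = \lfloor 255 - 235\,q \rfloor, \qquad q_{\min} = \lfloor 58 - 50\,q \rfloor
```

So q = 1 gives (20, 8), q = 0.5 gives (137, 33), and q = 0 gives (255, 58); note the new floor still lands on the old constant 8 at q = 1, and `step` interpolates between the two across the zig-zag bands.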
@@ -146,10 +161,12 @@ pub fn gen_qtable(q: f16, step_start_band: usize, step_stop_band: usize) @Vector
     return ret;
 }

+// bit writer for scan data, adds buffering with byte stuffing,
+// since std's BitWriter and BufferedWriter can be combined but bytes cannot be stuffed
 pub const BufferedBitWriter = struct {
     byte_buff: u8,
     bits_used: u4,
-    buffer: [32]u8,
+    buffer: [8]u8,
     buffer_idx: usize,
     f: std.fs.File,

@@ -160,18 +177,65 @@ pub const BufferedBitWriter = struct {
             .f = f,
             .byte_buff = 0,
             .bits_used = 0,
-            .buffer = [_]u8{0x00} ** 32,
+            .buffer = [_]u8{0x00} ** 8,
             .buffer_idx = 0,
         };
     }

+    // check if the space available is >= the space needed:
+    // if yes, just shove in the bits,
+    // else shove in the bits that fit, recurse with the remainder
     pub fn write_bits(self: *Self, val: u16, n_bits: u5) !void {
-        _ = n_bits;
-        _ = val;
-        _ = self;
+        const curr_byte_space = 8 - self.bits_used;
+        if (n_bits <= curr_byte_space) {
+            try self.add_bits(val, @truncate(n_bits));
+        } else {
+            try self.add_bits(val >> @truncate(n_bits - curr_byte_space), @truncate(curr_byte_space));
+            const val_remaining_mask = try std.math.powi(u16, 2, n_bits - curr_byte_space) - 1;
+            try self.write_bits(val & val_remaining_mask, n_bits - curr_byte_space);
+        }
     }

+    // since write_bits handles alignment and extra values, adding here can only
+    // ever result in either a full byte or a partial one, no overflow into the next
+    inline fn add_bits(self: *Self, val: u16, n_bits: u4) !void {
+        self.byte_buff |= @truncate(val << (8 - self.bits_used - n_bits));
+        self.bits_used += n_bits;
+        if (self.bits_used == 8) {
+            // emit value
+            self.buffer[self.buffer_idx] = self.byte_buff;
+            if (self.byte_buff == 0xff) {
+                // byte stuffing: simply skip the next idx to get 0xff 0x00
+                self.buffer_idx += 1;
+            }
+            self.buffer_idx += 1;
+            self.bits_used = 0;
+            self.byte_buff = 0x00;
+
+            // if the end of the buffer was reached, simply flush and wrap around;
+            // the extra stuffing increment is preserved even if 0xff was
+            // the last value
+            if (self.buffer_idx >= self.buffer.len) {
+                try self.flush();
+                self.buffer_idx %= self.buffer.len;
+            }
+        }
+    }

     pub fn flush(self: *Self) !void {
-        _ = self;
+        _ = try self.f.write(&self.buffer);
+        @memset(&self.buffer, 0);
     }
+
+    // special flush case when ending: requires a partial flush and padding the
+    // last byte with 1s
+    pub fn flush_end(self: *Self) !void {
+        if (self.bits_used != 0) {
+            const stuffing = try std.math.powi(u8, 2, 8 - self.bits_used) - 1;
+            try self.add_bits(stuffing, 8 - self.bits_used);
+        }
+        if (self.buffer_idx != 0) {
+            _ = try self.f.write(self.buffer[0..self.buffer_idx]);
+        }
+    }
 };
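The skipped-index trick in `add_bits` works because the buffer is zeroed on init and re-zeroed on every `flush`, so the slot after an `0xff` already holds the stuffed `0x00` that JPEG requires (entropy-coded data must escape `0xff` so it cannot be mistaken for a marker). A standalone illustration of that invariant:

```zig
const std = @import("std");

test "skipping an index after 0xff leaves a stuffed 0x00" {
    var buffer = [_]u8{0} ** 8; // pre-zeroed, as in BufferedBitWriter.init
    var idx: usize = 0;
    for ([_]u8{ 0x12, 0xff, 0x34 }) |b| {
        buffer[idx] = b;
        if (b == 0xff) idx += 1; // skip one slot: it stays 0x00
        idx += 1;
    }
    try std.testing.expectEqualSlices(u8, &[_]u8{ 0x12, 0xff, 0x00, 0x34 }, buffer[0..4]);
}
```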