// Compiler Explorer
//
// Source code

const std = @import("std");

/// Computes, for every byte of `chunk`, its column: the number of bytes since the lane
/// that follows the most recent newline (or since lane 0 when no newline precedes it).
/// This implementation uses a 64-bit integer as a vector of 16 u4's.
/// It utilizes PEXT to extract nibbles corresponding to newlines from `ascending_indices`,
/// subtracts each of them from its neighbor, such that a prefix-sum would undo this work.
/// Then we use PDEP to put these prefix-sum-inverted values back into place, and calculate the
/// prefix-sum in each nibble simultaneously via a multiplication by 0x1111111111111111.
/// In the end, we will have smeared the reset-positions up to the next reset-position,
/// such that subtracting from `ascending_indices` yields the correct answer.
fn columnCountsSWAR(chunk: @Vector(16, u8)) @Vector(16, u8) {
    // Bit k is set when lane k of `chunk` is a newline.
    const newlines: u16 = @bitCast(@as(@Vector(16, u8), chunk) == @as(@Vector(16, u8), @splat('\n')));
    // The lowest bit of each of the 16 nibbles.
    const ones = 0x1111111111111111;
    // Nibble k holds the value k.
    const ascending_indices = 0xFEDCBA9876543210;
    // `ones ^ 1` omits nibble 0, so newline bit k is deposited into nibble k + 1, the lane
    // where counting restarts (bit 15 has no nibble left to land in). Multiplying by 0xF
    // widens each deposited bit into a full 0xF nibble.
    const restart_nibbles_mask = pdep(newlines, ones ^ 1) *% 0xF;
    // The lane indices of the restart positions, packed contiguously into the low nibbles.
    const restart_nibbles_indices = pext(ascending_indices, restart_nibbles_mask);
    // Each packed index minus the packed index below it. Nibbles above the last packed
    // index hold borrow residue, but the PDEP below only consumes as many low nibbles as
    // there are restart positions.
    const prefix_diff = restart_nibbles_indices -% (restart_nibbles_indices << 4);
    // Scatter the differences back to their restart lanes; multiplying by `ones` turns them
    // into a running sum, so every nibble holds the index of the latest restart at or below
    // it. Subtracting that from the lane index leaves the column in each nibble.
    const vec: @Vector(8, u8) = @bitCast(ascending_indices -% pdep(prefix_diff, restart_nibbles_mask) *% ones);
    // Each byte of `vec` carries two nibbles: interleave the bytes with their high halves
    // and mask to 4 bits to widen the 16 nibbles into 16 u8 lanes.
    // NOTE(review): the lane order relies on the little-endian byte order of the x86 targets
    // that provide PDEP/PEXT.
    return std.simd.interlace(.{ vec, vec >> @splat(4) }) & @as(@Vector(16, u8), @splat(0xF));
}

/// Computes the column of every byte in `chunk`: lane k receives the number of bytes since
/// the lane following the most recent newline before k (or since lane 0 if there is none).
/// A newline's own lane continues the count of the line it terminates.
///
/// Implemented as a predicated prefix-sum built from log2(16) shift-and-add rounds.
fn columnCountsVec1(chunk: @Vector(16, u8)) @TypeOf(chunk) {
    const Chunk = @TypeOf(chunk);
    const newline: Chunk = @splat('\n');
    const one: Chunk = @splat(1);
    const zero: Chunk = @splat(0);

    // `active[k]` stays true while lane k may still absorb counts from lower lanes.
    var active = chunk != newline;
    const all_false: @TypeOf(active) = @splat(false);
    // Every non-newline byte contributes one column; newlines contribute none.
    var counts = @select(u8, active, one, zero);

    inline for (0..std.math.log2(@sizeOf(Chunk))) |round| {
        const distance = 1 << round;
        const incoming_counts = std.simd.shiftElementsRight(counts, distance, 0);
        const incoming_active = std.simd.shiftElementsRight(active, distance, false);
        // Only lanes that are still active add the count from `distance` lanes below.
        counts = @select(u8, active, counts +% incoming_counts, counts);
        // A lane remains active only if the lane it just read from was active as well.
        active = @select(bool, active, incoming_active, all_false);
    }

    // Move each running total up one lane so that a lane reports the bytes before it.
    return std.simd.shiftElementsRight(counts, 1, 0);
}

/// Computes the column of every byte in `chunk`, exactly like `columnCountsVec1`, but tracks
/// the lanes that have *stopped* accumulating instead of the lanes that are still active.
/// The original author reports the same instruction count as the non-inverted variant, with
/// slightly worse measured performance on their machine.
fn columnCountsVec2(chunk: @Vector(16, u8)) @TypeOf(chunk) {
    const Chunk = @TypeOf(chunk);
    const newline: Chunk = @splat('\n');
    const one: Chunk = @splat(1);
    const zero: Chunk = @splat(0);

    // `stopped[k]` becomes true once lane k must not absorb any more counts from below.
    var stopped = chunk == newline;
    const all_true: @TypeOf(stopped) = @splat(true);
    // Newlines contribute no column; every other byte contributes one.
    var counts = @select(u8, stopped, zero, one);

    inline for (0..std.math.log2(@sizeOf(Chunk))) |round| {
        const distance = 1 << round;
        const incoming_counts = std.simd.shiftElementsRight(counts, distance, 0);
        const incoming_stopped = std.simd.shiftElementsRight(stopped, distance, true);
        // Stopped lanes keep their total; the rest add the count from `distance` lanes below.
        counts = @select(u8, stopped, counts, counts +% incoming_counts);
        // A lane stops as soon as the lane it just read from had stopped (or was off the edge).
        stopped = @select(bool, stopped, all_true, incoming_stopped);
    }

    // Move each running total up one lane so that a lane reports the bytes before it.
    return std.simd.shiftElementsRight(counts, 1, 0);
}

/// Computes the column of every byte in `chunk` by first finding, for each lane, the index
/// of the lane where its line starts, and then subtracting that from the lane's own index.
/// The start index is seeded in the lane right after each newline and spread upward with
/// log2(16) rounds of predicated shift-and-OR.
fn columnCountsVec3(chunk: @Vector(16, u8)) @TypeOf(chunk) {
    const Chunk = @TypeOf(chunk);
    const newline: Chunk = @splat('\n');
    const zero: Chunk = @splat(0);
    const lane_index = std.simd.iota(u8, @sizeOf(Chunk));

    // `settled[k]` is true once lane k holds its final line-start index; initially that is
    // every lane directly following a newline.
    const is_newline = chunk == newline;
    var settled = std.simd.shiftElementsRight(is_newline, 1, false);
    const all_true: @TypeOf(settled) = @splat(true);
    // Seed each line-start lane with its own index; every other lane starts at zero.
    var line_start = @select(u8, settled, lane_index, zero);

    inline for (0..std.math.log2(@sizeOf(Chunk))) |round| {
        const distance = 1 << round;
        const incoming_start = std.simd.shiftElementsRight(line_start, distance, 0);
        const incoming_settled = std.simd.shiftElementsRight(settled, distance, true);
        // Unsettled lanes pick up the start index held `distance` lanes below.
        line_start = @select(u8, settled, line_start, line_start | incoming_start);
        // A lane settles once the lane it just read from was settled (or was off the edge).
        settled = @select(bool, settled, all_true, incoming_settled);
    }

    return lane_index - line_start;
}

/// Parallel bit deposit via the x86 BMI2 `pdep` instruction: the low-order bits of `src`
/// are scattered, in order, to the positions of the set bits of `mask`; every other bit of
/// the result is zero.
/// The assembly template lists its operands in AT&T order (mask, source, destination).
/// NOTE(review): nothing here checks that the target CPU provides BMI2 — confirm the build
/// only targets such CPUs.
inline fn pdep(src: u64, mask: u64) u64 {
    return asm ("pdep %[mask], %[src], %[ret]"
        : [ret] "=r" (-> u64),
        : [src] "r" (src),
          [mask] "r" (mask),
    );
}

/// Parallel bit extract via the x86 BMI2 `pext` instruction: the bits of `src` at the
/// positions of the set bits of `mask` are gathered, in order, into the low-order bits of
/// the result; the remaining high bits are zero.
/// The assembly template lists its operands in AT&T order (mask, source, destination).
/// NOTE(review): nothing here checks that the target CPU provides BMI2 — confirm the build
/// only targets such CPUs.
inline fn pext(src: u64, mask: u64) u64 {
    return asm ("pext %[mask], %[src], %[ret]"
        : [ret] "=r" (-> u64),
        : [src] "r" (src),
          [mask] "r" (mask),
    );
}

/// Debug helper: reinterprets `v` as sixteen u4 lanes and prints them in hexadecimal on a
/// single tab-indented line via `std.debug.print`.
fn printVec(v: u64) void {
    const nibbles: @Vector(16, u4) = @bitCast(v);
    std.debug.print("\t{x}\n", .{nibbles});
}

/// Debug helper: prints the integer `v` in binary, most-significant nibble first.
/// Nibbles are written as four zero-padded binary digits, followed by '_' inside a byte
/// and by ' ' after the low nibble of each byte; the line ends with a newline.
/// The loop steps by 4 bits and only stops at exactly 0, so the bit width of `v` has to be
/// a positive multiple of 4.
fn printu(v: anytype) void {
    comptime var remaining_bits = @bitSizeOf(@TypeOf(v));
    inline while (true) {
        remaining_bits -= 4;
        const nibble: u4 = @truncate(v >> remaining_bits);
        const separator = if (remaining_bits % 8 == 0) ' ' else '_';
        std.debug.print("{b:0>4}{c}", .{ nibble, separator });
        if (remaining_bits == 0) break;
    }
    std.debug.print("\n", .{});
}

test "columnCounts" {
    // Emit a newline so that any later output starts on its own line.
    std.debug.print("\n", .{});

    // Exhaustively check every 16-bit newline mask. The sweep starts at 0 so that the
    // "no newline at all" and "newline only in lane 0" chunks are verified too.
    for (0..(1 << 16)) |raw_mask| {
        const newline_bits: u16 = @intCast(raw_mask);

        // Build the input chunk and the scalar reference answer side by side: each lane
        // records the current column, then the column advances, restarting at 0 on the
        // lane after a newline.
        var chunk: [16]u8 = undefined;
        var expected: [16]u8 = undefined;
        var column: u8 = 0;
        for (&chunk, &expected, 0..) |*byte, *want, lane| {
            const is_newline = ((newline_bits >> @intCast(lane)) & 1) == 1;
            byte.* = if (is_newline) '\n' else '\x00';
            want.* = column;
            column = if (is_newline) 0 else column + 1;
        }
        const reference_ans: @Vector(16, u8) = expected;

        const optimized_ans0 = columnCountsSWAR(chunk);
        const optimized_ans1 = columnCountsVec1(chunk);
        const optimized_ans2 = columnCountsVec2(chunk);
        const optimized_ans3 = columnCountsVec3(chunk);

        try std.testing.expectEqual(reference_ans, optimized_ans0);
        try std.testing.expectEqual(reference_ans, optimized_ans1);
        try std.testing.expectEqual(reference_ans, optimized_ans2);
        try std.testing.expectEqual(reference_ans, optimized_ans3);
    }
}

/// Benchmark driver: builds one 16-byte chunk for every possible 16-bit newline mask, then
/// times 10000 passes of each column-count implementation over all chunks and prints the
/// elapsed time of each implementation on its own line.
pub fn main() void {
    // Chunk number `pattern` has a newline in lane k exactly when bit k of `pattern` is set.
    var buffer: [1 << 16][16]u8 = undefined;
    for (&buffer, 0..) |*chunk, pattern| {
        for (chunk, 0..) |*byte, lane| {
            const bit_is_set = ((pattern >> @intCast(lane)) & 1) == 1;
            byte.* = if (bit_is_set) '\n' else '\x00';
        }
    }

    inline for ([_]@TypeOf(columnCountsSWAR){ columnCountsSWAR, columnCountsVec1, columnCountsVec2, columnCountsVec3 }) |func| {
        const started = std.time.nanoTimestamp();
        // Fold every result into a running sum so the calls cannot be optimized away.
        var checksum: @Vector(16, u8) = @splat(0);
        var pass: usize = 0;
        while (pass < 10000) : (pass += 1) {
            for (buffer) |chunk| {
                checksum +%= func(chunk);
                std.mem.doNotOptimizeAway(checksum);
            }
        }
        const finished = std.time.nanoTimestamp();
        std.mem.doNotOptimizeAway(checksum);

        const elapsed_ns: u64 = @intCast(finished - started);
        std.debug.print("{}\n", .{std.fmt.fmtDuration(elapsed_ns)});
    }
}