// Compiler Explorer
//
// Source code

const std = @import("std");
const builtin = @import("builtin");

/// Number of consecutive elements that `compress` processes as one group:
/// its shift/accumulate loops run `log2(WIDTH)` steps and it issues
/// `64 / WIDTH` stores. The doc comment on `compress` lists 8, 16, 32 and 64
/// as the values it accounts for.
const WIDTH = 16; // How many elements to operate on at once

/// Compresses the elements in `data` corresponding to the `condition` vector:
/// elements whose `condition` byte is nonzero are packed towards the front of
/// their WIDTH-element group, and the groups are then stored back-to-back at `dest`.
///
/// `data` and `condition` are handled as four 16-byte vectors in the layout
/// produced by `std.simd.deinterlace(4, ...)` (that is what the caller in this
/// file passes); the stores re-interleave them.
///
/// Writes to `dest`, including a number of undefined bytes after the packed
/// elements. The following expression gives the upper bound on how many bytes,
/// counted from `dest`, may be written, so `dest` must have at least that much room:
/// switch (WIDTH) {
///    8, 16, 32 => (64 - WIDTH) + 32,
///    64 => 64,
/// }
///
/// Returns byte 7 of the accumulated per-group counts, which is the total
/// number of elements whose `condition` byte is nonzero.
///
/// NOTE(review): the `@bitCast`s between byte vectors and wider integers look
/// like they assume a little-endian target -- confirm before porting.
export fn compress(data: @Vector(64, u8), condition: @Vector(64, u8), dest: [*]u8) u8 {
    // One deinterleaved vector holds WIDTH / 4 bytes of every group, i.e. WIDTH * 2 bits,
    // so shifts bounded by `U` never cross a group boundary.
    const U = std.meta.Int(.unsigned, WIDTH*2);
    // Position of every element inside its own group (0 .. WIDTH - 1), deinterleaved 4 ways.
    const indices = comptime @as(@Vector(64, u8), @bitCast(std.simd.deinterlace(4, std.simd.iota(u8, 64) & @as(@Vector(64, u8), @splat(WIDTH - 1)))));

// Start from 255 (-1 in wrapping u8 arithmetic) for every selected element and 0 otherwise.
var prefix_sums = @select(u8, condition != @as(@Vector(64, u8), @splat(0)),
        @as(@Vector(64, u8), @splat(255)),
        @as(@Vector(64, u8), @splat(0)),
    );

// Next, shift elements right by 1, 2, 4, ... (log2(WIDTH) steps in total) and accumulate at
// each step. Afterwards every element holds the negated count of selected elements at or
// before it within its group.
    inline for (0..std.math.log2(WIDTH)) |i| {
        prefix_sums +%= shiftInterleavedElementsRight(prefix_sums, 1 << i, U);
    }

// Constant with a 1 bit at every WIDTH-bit stride of a u64.
comptime var prefix_sum_multiplier = 0;
    inline for (0..64 / WIDTH) |i| prefix_sum_multiplier |= 1 << i*WIDTH;
    // Take the last deinterleaved vector (elements 3, 7, ..., 63), negate it to obtain positive
    // counts, keep its odd bytes (elements 7, 15, ..., 63) and multiply the low 8 bytes by the
    // constant above so that the per-group counts are summed across groups. The bytes read
    // below are the running output offsets; byte 7 is the grand total.
    const prefix_sum_of_offsets: [8]u8 = @bitCast(
    @as([2]u64, @bitCast(
        uzp2(
            neg(
                @as([4]@Vector(16, u8), @bitCast(prefix_sums))[3]
            )
        )
    ))[0] *% prefix_sum_multiplier);

// Now take the identity indices and add them to the prefix sums shifted right by one.
    // Because the sums are negated counts, the result is each element's index minus the number
    // of selected elements before it, i.e. how far the element has to travel to the left.
    var travel_distances = indices +% shiftInterleavedElementsRight(prefix_sums, 1, U);
    var compressed_data = data;

// Move elements left in power-of-two steps: wherever the distance arriving from `i` slots to
// the right has bit `i` set, take that element (and its distance); otherwise keep the current one.
inline for (0..std.math.log2(WIDTH)) |x| {
        const i = 1 << x;
        const shifted_left = shiftInterleavedElementsLeft(travel_distances, i, U);
        const shifted_compressed_data = shiftInterleavedElementsLeft(compressed_data, i, U);
        const selector = cmtst(shifted_left, @splat(i));
        travel_distances = bsl(selector, shifted_left, travel_distances);
        compressed_data = bsl(selector, shifted_compressed_data, compressed_data);
    }

// Store every group at the running offset of the groups before it. Group `i` is first moved
// to the front of the vector; each store may overwrite the undefined tail of the previous one.
inline for (0..64 / WIDTH) |i| {
        (if (WIDTH == 64) st4 else st4_first_32)(
            dest[if (i == 0) 0 else prefix_sum_of_offsets[i*(WIDTH / 8) - 1]..],
            shiftInterleavedElementsLeft(compressed_data, WIDTH*i, u128)
        );
    }

return prefix_sum_of_offsets[7];
}

/// Bitwise select: every result bit comes from `a` where the matching bit of
/// `selector` is 1 and from `b` where it is 0.
fn bsl(selector: anytype, a: @TypeOf(selector), b: @TypeOf(selector)) @TypeOf(selector) {
    const taken_from_a = selector & a;
    const taken_from_b = ~selector & b;
    return taken_from_a | taken_from_b;
}

/// Per-element bit test: yields 0xff in every lane where `a` and `b` share at
/// least one set bit, and 0 in every other lane. The `@select` is typed for
/// `u8` elements, so `a` has to be a vector of `u8`.
fn cmtst(a: anytype, comptime b: @TypeOf(a)) @TypeOf(a) {
    const V = @TypeOf(a);
    const all_clear: V = @splat(0);
    const all_set: V = @splat(0xff);
    const shares_a_bit = (a & b) != all_clear;
    return @select(u8, shares_a_bit, all_set, all_clear);
}

/// Gathers the odd-indexed lanes of `x` (1, 3, ..., 15) into lanes 0..7 and
/// repeats the same eight lanes in lanes 8..15.
fn uzp2(x: @Vector(16, u8)) @Vector(16, u8) {
    const odd_lanes = [8]i32{ 1, 3, 5, 7, 9, 11, 13, 15 };
    const mask: @Vector(16, i32) = odd_lanes ++ odd_lanes;
    return @shuffle(u8, x, undefined, mask);
}

/// Two's-complement negation of every lane with wrapping arithmetic
/// (`~x + 1`). Because of the `@splat`, `x` has to be a vector.
fn neg(x: anytype) @TypeOf(x) {
    const V = @TypeOf(x);
    const one: V = @splat(1);
    const inverted = ~x;
    return one +% inverted;
}

/// True when the compilation target is a 32-bit or 64-bit ARM CPU whose
/// feature set includes NEON; false on every other architecture.
const HAS_ARM_NEON = has_neon: {
    const features = builtin.cpu.features;
    break :has_neon switch (builtin.cpu.arch) {
        .arm, .armeb => std.Target.arm.featureSetHas(features, .neon),
        .aarch64, .aarch64_be => std.Target.aarch64.featureSetHas(features, .neon),
        else => false,
    };
};

/// Writes all 64 bytes of `vec` to `ptr[0..64]`, interleaving its four 16-byte
/// quarters (quarter 0 supplies bytes 0, 4, 8, ..., quarter 1 bytes 1, 5, 9, ...).
///
/// At runtime on a NEON target this calls the LLVM `st4` intrinsic directly;
/// at comptime, or without NEON, it falls back to `std.simd.interlace`.
///
/// NOTE(review): `HAS_ARM_NEON` is also true for 32-bit ARM with NEON, but the
/// intrinsic named here is the AArch64 one -- confirm 32-bit ARM is not a target.
fn st4(ptr: [*]u8, vec: @Vector(64, u8)) void {
    const quarters: [4]@Vector(16, u8) = @bitCast(vec);

    if (HAS_ARM_NEON and !@inComptime()) {
        const neon = struct {
            extern fn @"llvm.aarch64.neon.st4.v16i8.p0"(@Vector(16, u8), @Vector(16, u8), @Vector(16, u8), @Vector(16, u8), [*]u8) void;
        };
        neon.@"llvm.aarch64.neon.st4.v16i8.p0"(quarters[0], quarters[1], quarters[2], quarters[3], ptr);
    } else {
        ptr[0..64].* = std.simd.interlace(quarters);
    }
}

/// Writes 32 bytes to `ptr[0..32]`: the low 8 bytes of each of the four 16-byte
/// quarters of `vec`, interleaved (quarter 0 supplies bytes 0, 4, 8, ...,
/// quarter 1 bytes 1, 5, 9, ...). The high half of every quarter is ignored.
///
/// At runtime on a NEON target this calls the LLVM `st4` intrinsic; at
/// comptime, or without NEON, it falls back to `std.simd.interlace`.
///
/// NOTE(review): `HAS_ARM_NEON` is also true for 32-bit ARM with NEON, but the
/// intrinsic named here is the AArch64 one -- confirm 32-bit ARM is not a target.
fn st4_first_32(ptr: [*]u8, vec: @Vector(64, u8)) void {
    const chunks: [4]@Vector(16, u8) = @bitCast(vec);
    // Extract the low halves once; both code paths below store exactly these.
    const low_halves = [4]@Vector(8, u8){
        @as([2]@Vector(8, u8), @bitCast(chunks[0]))[0],
        @as([2]@Vector(8, u8), @bitCast(chunks[1]))[0],
        @as([2]@Vector(8, u8), @bitCast(chunks[2]))[0],
        @as([2]@Vector(8, u8), @bitCast(chunks[3]))[0],
    };

    if (!HAS_ARM_NEON or @inComptime()) {
        ptr[0..32].* = std.simd.interlace(low_halves);
    } else struct {
        // The operands are 8 x i8 vectors, so the overload suffix must be `v8i8`.
        // Declaring them under the `v16i8` name mismatches the intrinsic's
        // signature and conflicts with the declaration used by `st4`.
        extern fn @"llvm.aarch64.neon.st4.v8i8.p0"(@Vector(8, u8), @Vector(8, u8), @Vector(8, u8), @Vector(8, u8), [*]u8) void;
    }.@"llvm.aarch64.neon.st4.v8i8.p0"(
        low_halves[0],
        low_halves[1],
        low_halves[2],
        low_halves[3],
        ptr,
    );
}

/// Moves the bytes of `vec` towards lower indices by `amount` positions,
/// filling the vacated positions with zero.
///
/// With `boundary == u128` the whole vector is shifted as one unit through
/// `std.simd.shiftElementsLeft`. For any other `boundary` the vector is
/// reinterpreted as lanes of that integer type and every lane is shifted right
/// by `8 * amount` bits, so bytes never cross a lane boundary.
/// NOTE(review): the integer-shift path presumably assumes little-endian byte order.
fn shiftElementsLeft(vec: @Vector(16, u8), comptime amount: std.simd.VectorCount(@Vector(64, u8)), comptime boundary: type) @Vector(16, u8) {
    if (boundary == u128) {
        return std.simd.shiftElementsLeft(vec, amount, 0);
    } else {
        const Lanes = @Vector(16 / @sizeOf(boundary), boundary);
        const lanes: Lanes = @bitCast(vec);
        return @bitCast(lanes >> @splat(8 * amount));
    }
}

/// Shifts the elements of a 4-way deinterleaved 64-byte vector towards lower
/// logical positions by `amount`.
///
/// `vecs` is viewed as four 16-byte vectors. The low two bits of `amount` are
/// handled by rotating those four vectors (a vector that wraps around to the
/// back is additionally shifted by one byte); the remaining `amount >> 2` is
/// applied as a byte shift to every vector. All byte shifts go through
/// `shiftElementsLeft` and therefore respect `boundary`.
fn shiftInterleavedElementsLeft(vecs: @Vector(64, u8), comptime amount: std.simd.VectorCount(@Vector(64, u8)), boundary: type) @Vector(64, u8) {
    const Quarter = @Vector(16, u8);
    var quarters: [4]Quarter = @bitCast(vecs);

    // Odd amount: rotate by one vector. Built in a separate constant so that
    // no element is read after it has been overwritten.
    if ((amount & 1) != 0) {
        const rotated = [4]Quarter{
            quarters[1],
            quarters[2],
            quarters[3],
            shiftElementsLeft(quarters[0], 1, boundary),
        };
        quarters = rotated;
    }

    // Bit 1 set: rotate by two vectors.
    if ((amount & 2) != 0) {
        const rotated = [4]Quarter{
            quarters[2],
            quarters[3],
            shiftElementsLeft(quarters[0], 1, boundary),
            shiftElementsLeft(quarters[1], 1, boundary),
        };
        quarters = rotated;
    }

    // Whatever is left is a whole number of bytes in every vector.
    const whole_bytes = amount >> 2;
    if (whole_bytes > 0) {
        for (&quarters) |*quarter| {
            quarter.* = shiftElementsLeft(quarter.*, whole_bytes, boundary);
        }
    }

    return @bitCast(quarters);
}

/// Moves the bytes of `vec` towards higher indices by `amount` positions,
/// filling the vacated positions with zero.
///
/// With `boundary == u128` the whole vector is shifted as one unit through
/// `std.simd.shiftElementsRight`. For any other `boundary` the vector is
/// reinterpreted as lanes of that integer type and every lane is shifted left
/// by `8 * amount` bits, so bytes never cross a lane boundary.
/// NOTE(review): the integer-shift path presumably assumes little-endian byte order.
fn shiftElementsRight(vec: @Vector(16, u8), comptime amount: std.simd.VectorCount(@Vector(64, u8)), comptime boundary: type) @Vector(16, u8) {
    if (boundary == u128) {
        return std.simd.shiftElementsRight(vec, amount, 0);
    } else {
        const Lanes = @Vector(16 / @sizeOf(boundary), boundary);
        const lanes: Lanes = @bitCast(vec);
        return @bitCast(lanes << @splat(8 * amount));
    }
}

/// Shifts the elements of a 4-way deinterleaved 64-byte vector towards higher
/// logical positions by `amount`.
///
/// `vecs` is viewed as four 16-byte vectors. The low two bits of `amount` are
/// handled by rotating those four vectors (a vector that wraps around to the
/// front is additionally shifted by one byte); the remaining `amount >> 2` is
/// applied as a byte shift to every vector. All byte shifts go through
/// `shiftElementsRight` and therefore respect `boundary`.
fn shiftInterleavedElementsRight(vecs: @Vector(64, u8), comptime amount: std.simd.VectorCount(@Vector(64, u8)), boundary: type) @Vector(64, u8) {
    const Quarter = @Vector(16, u8);
    var quarters: [4]Quarter = @bitCast(vecs);

    // Odd amount: rotate by one vector. Built in a separate constant so that
    // no element is read after it has been overwritten.
    if ((amount & 1) != 0) {
        const rotated = [4]Quarter{
            shiftElementsRight(quarters[3], 1, boundary),
            quarters[0],
            quarters[1],
            quarters[2],
        };
        quarters = rotated;
    }

    // Bit 1 set: rotate by two vectors.
    if ((amount & 2) != 0) {
        const rotated = [4]Quarter{
            shiftElementsRight(quarters[2], 1, boundary),
            shiftElementsRight(quarters[3], 1, boundary),
            quarters[0],
            quarters[1],
        };
        quarters = rotated;
    }

    // Whatever is left is a whole number of bytes in every vector.
    const whole_bytes = amount >> 2;
    if (whole_bytes > 0) {
        for (&quarters) |*quarter| {
            quarter.* = shiftElementsRight(quarter.*, whole_bytes, boundary);
        }
    }

    return @bitCast(quarters);
}

/// Compile-time smoke test for `compress`: selects a fixed set of characters
/// out of a 64-character alphabet, compresses them into a zeroed 64-byte
/// buffer and reports the buffer through `@compileLog`.
fn main() void {
    comptime {
        @setEvalBranchQuota(100000);
        const alphabet = "0123456789abcdefghijklmnopqrstuvwxyz%$ABCDEFGHIJKLMNOPQRSTUVWXYZ";

        // Report the position of any space character found in the alphabet.
        for (alphabet, 0..) |byte, position| {
            if (byte == ' ') @compileLog(position);
        }

        const V = @Vector(64, u8);
        const vec: V = @bitCast(std.simd.deinterlace(4, alphabet[0..64].*));

        // Build a 0xFF / 0x00 mask with 0xFF wherever `vec` holds a wanted character.
        var selected: V = @splat(0);
        for ("312adrD$%ITCBAQZVU") |wanted| {
            const hits = vec == @as(V, @splat(wanted));
            selected |= @select(u8, hits, @as(V, @splat(0xFF)), @as(V, @splat(0)));
        }

        var dest = std.mem.zeroes([64]u8);
        _ = compress(vec, selected, &dest);
        @compileLog(std.fmt.comptimePrint("{s}", .{dest}));
    }
}