Thanks for using Compiler Explorer

zig source #1 (compiler: zig trunk)
// This is an implementation of a Dynamic Score-Decomposed Trie.
// demo: https://validark.github.io/DynSDT/demo/
// paper: https://validark.github.io/DynSDT/
// This implementation does not support empty string queries, because they are probably not useful in practice.
// This particular implementation uses the LCRS-style of representing nodes.
// I.e., each node has a `next` and a `down` pointer to another node, instead of each node holding an array.
// The advantage to this implementation is that updating a particular term's score in the data structure will
// not perform any allocations. The only allocations that may occur are when adding genuinely new data.
// The disadvantage is that this structure would probably be less than ideal if it were to be shared in a
// multi-threaded setting. The best option in that setting might be to use multiple structures, or to
// consider using a lock-free implementation that relies on arrays of nodes, such that the bottommost-
// changed array and its parents up to the root can be copied and then the root pointer can be updated
// atomically to the new copy, or the work can be repeated if another thread updated the structure first.
// I suppose one *could* dream up a similar scheme with the LCRS linked lists, but that would be pretty crazy.

const std = @import("std");
const builtin = @import("builtin");
const testing = std.testing;
const Allocator = std.mem.Allocator;

const string = []const u8;
const score_int = u32;
const scores_list_type = std.ArrayListUnmanaged(score_int);
const str_buf_int = u32;
const term_len_int = u8;
const node_index_int = u24;
const depq_int = u32;

/// This switch makes us maintain our node list in sorted order by score in memory (sorted by first character).
/// This means that topK completion queries do not need to look at scores at all (improving query times).
/// There is always the issue of reducing cache misses, which is somewhat difficult to plan for when each query
/// needs a potentially completely different set of answers. At the moment, the most optimal queries are
/// single-character queries, as their answers are laid out directly next to each other in memory. However,
/// if k is predetermined I think it would be best to make 2-character queries the baseline, and precompute
/// 1-character queries and just store the answer in a lookup table. This would require relatively little
/// precomputation and may give a small boost to longer queries.
/// However, this also means that dynamically changing scores in our tree can potentially
/// cause many nodes to shift in memory (that start with the same character). Worst case,
/// shifting can be an O(n) operation where n is the number of nodes. However, I suspect in
/// most use cases there is no need to move a node from the bottom to the top or vice versa.
/// In real-world cases, scores typically follow a skewed power law distribution.
/// (See Hsu & Ottaviano, 2013: https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/TopKCompletion.pdf)
/// This means that if the scores just represented how many queries each term got, we would expect that
/// nodes would not move around that much relative to their neighbors. The only thing that might be troublesome
/// is that the vast majority of queries will have very low scores. This means a DDoS attack would be viable
/// if someone repeatedly queried for the extremely rare queries and they had to do a linear scan out of the
/// giant pool of terms at the bottom.
/// To combat this, we could allocate spaces specifically for this purpose, e.g. when a query's score goes from
/// 1 to 2, and there could be 100000 terms with a score of 1.
/// Another optimization this could enable is that scores could be stored more efficiently, like in
/// https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/TopKCompletion.pdf
/// Perhaps Elias-Fano will reduce the space used by scores.
const MAINTAIN_SORTED_ORDER_IN_MEMORY = true;

// These switches allow us to easily change strategies 😊

/// True increases code size, but may be faster if querying repeatedly
const MOVE_FIRST_ITERATION_OF_TOPK_QUERY_OUT_OF_LOOP = false;

/// Increases the node size, but may improve cache complexity when finding the locus node for a given string.
/// May worsen cache complexity for the topK enumeration algorithm, but if your cache is already a lost cause it can't hurt.
const STORE_4_TERM_BYTES_IN_NODE = false;

const USE_SIMD_FOR_PARSING_FILES = true;
const VEC_SIZE_FOR_PARSING_FILES = 32;
const FILE_BUFFER_SIZE: usize = std.mem.page_size * 8;
const USE_SIMD_TO_FIND_LCP = true;
const VEC_SIZE_TO_FIND_LCP = 16;

fn printCommifiedNumber(num: u32) void {
    var degree: usize = 1;
    while (num >= degree * 1000) degree *= 1000;
    var x = num / degree;
    var num_term_pairs_print: usize = x * degree;
    std.debug.print("{}", .{x});
    while (degree != 1) {
        degree /= 1000;
        x = (num - num_term_pairs_print) / degree;
        num_term_pairs_print += x * degree;
        std.debug.print(",{:0>3}", .{x});
    }
}

const BufferRequirements = struct {
    num_term_pairs: u32,
    string_buffer_size: usize,
};

fn precomputeFileBufferRequirements(file: std.fs.File, file_buf: []u8) !BufferRequirements {
    var num_term_pairs: u32 = 2; // account for sentinels
    var string_buffer_size: usize = 0;
    var delimiter: u8 = '\t';

    OUTER_LOOP: while (true) {
        var buf: []u8 = file_buf[0..FILE_BUFFER_SIZE];
        var bytes_to_consume = try file.read(buf);
        buf.len += 2;
        const is_final_buffer = bytes_to_consume != FILE_BUFFER_SIZE; // If we don't fill up buffer, break later

        while (bytes_to_consume != 0) {
            var buf_start = @ptrToInt(buf.ptr);
            while (USE_SIMD_FOR_PARSING_FILES and buf.len >= VEC_SIZE_FOR_PARSING_FILES) {
                const vec: @Vector(32, u8) = buf[0..VEC_SIZE_FOR_PARSING_FILES].*;
                var cmp2 = vec == @splat(VEC_SIZE_FOR_PARSING_FILES, delimiter);
                const bitmask = @bitCast(@Type(.{ .Int = .{ .signedness = .unsigned, .bits = VEC_SIZE_FOR_PARSING_FILES } }), cmp2);
                const first = @ctz(bitmask);
                buf = buf[first..];
                if (first != VEC_SIZE_FOR_PARSING_FILES) break;
            } else while (buf[0] != delimiter) buf = buf[1..];

            const diff = @ptrToInt(buf.ptr) - buf_start;
            string_buffer_size += if (delimiter == '\n') 0 else diff;
            const is_sentinel = @ptrToInt(&file_buf[FILE_BUFFER_SIZE]) == @ptrToInt(buf.ptr) - @boolToInt(delimiter == '\n');
            if (is_sentinel) continue :OUTER_LOOP;
            buf = buf[1..];
            bytes_to_consume -= diff + 1;
            num_term_pairs += @boolToInt(delimiter == '\n');
            delimiter ^= 3; // flip between '\n' <-> '\t'
        }
        if (is_final_buffer) break;
    }
    try file.seekTo(0);
    return BufferRequirements{
        .num_term_pairs = num_term_pairs,
        .string_buffer_size = string_buffer_size,
    };
}

const DynSDT = struct {
    allocator: Allocator,
    nodes: std.ArrayListUnmanaged(Node),
    scores: scores_list_type,
    roots: [*]node_index_int,
    string_buffer: std.ArrayListUnmanaged(u8),

    const Self = @This();
    const NULL: node_index_int = 0;

    const @"down/term_len" = packed struct(u32) {
        down: node_index_int = NULL,
        term_len: term_len_int,
    };

    const @"next/LCP" = packed struct(u32) {
        next: node_index_int = NULL,
        LCP: term_len_int = 0,
    };

    const Node = struct {
        term_start: str_buf_int,
        @"down/term_len": @"down/term_len",
        @"next/LCP": @"next/LCP",
        next4chars: if (STORE_4_TERM_BYTES_IN_NODE) [4]u8 else void,

        pub fn term(self: *const Node, buffer: string) string {
            return buffer[self.term_start .. self.term_start + self.getTermLen()];
        }

        pub fn init(term_start: str_buf_int, term_len: term_len_int) Node {
            return Node{
                .term_start = term_start,
                .@"down/term_len" = @"down/term_len"{ .term_len = term_len },
                .@"next/LCP" = .{},
                .next4chars = if (STORE_4_TERM_BYTES_IN_NODE) [4]u8{ 0, 0, 0, 0 },
            };
        }

        pub fn getDown(self: *const Node) node_index_int {
            return self.@"down/term_len".down;
        }

        pub fn getTermLen(self: *const Node) u8 {
            return self.@"down/term_len".term_len;
        }

        pub fn getNext(self: *const Node) node_index_int {
            return self.@"next/LCP".next;
        }

        pub fn getLCP(self: *const Node) u8 {
            return self.@"next/LCP".LCP;
        }

        pub fn update_next4chars(self: *Node, string_buffer: string) void {
            const LCP = self.getLCP();
            const term_start = self.term_start + LCP;
            // std.debug.print("{s} {} {}\n", .{ self.term(string_buffer), self.getTermLen(), LCP });
            const term_len = self.getTermLen() - LCP;
            self.next4chars = [4]u8{
                if (term_len != 0) string_buffer[term_start + 0] else 0,
                if (term_len != 1) string_buffer[term_start + 1] else 0,
                if (term_len != 2) string_buffer[term_start + 2] else 0,
                if (term_len != 3) string_buffer[term_start + 3] else 0,
            };
        }
    };

    /// Calculates the longest common prefix between `str1` and `str2`, which must not alias.
    fn longestCommonPrefix(LCP: term_len_int, noalias str1: string, noalias str2: string) term_len_int {
        @setCold(STORE_4_TERM_BYTES_IN_NODE);
        const len = @intCast(term_len_int, @min(str1.len, str2.len));
        var lcp = LCP;
        var str1_: string = str1;
        var str2_: string = str2;
        var first = lcp;
        while (USE_SIMD_TO_FIND_LCP and len - lcp >= VEC_SIZE_TO_FIND_LCP) {
            str1_ = str1_[first..];
            str2_ = str2_[first..];
            const vec1: @Vector(VEC_SIZE_TO_FIND_LCP, u8) = str1_[0..VEC_SIZE_TO_FIND_LCP].*;
            const vec2: @Vector(VEC_SIZE_TO_FIND_LCP, u8) = str2_[0..VEC_SIZE_TO_FIND_LCP].*;
            const answers = vec1 != vec2;
            const bitmask = @bitCast(@Type(.{ .Int = .{ .signedness = .unsigned, .bits = VEC_SIZE_TO_FIND_LCP } }), answers);
            first = @ctz(bitmask);
            lcp += first;
            if (first != VEC_SIZE_TO_FIND_LCP) break;
        } else while (lcp < len and str1[lcp] == str2[lcp]) lcp += 1;
        std.debug.assert(!(lcp < len and str1[lcp] == str2[lcp]));
        return lcp;
    }

    fn initWithNodes(allocator: Allocator, node_list: std.ArrayListUnmanaged(Node), string_buffer: std.ArrayListUnmanaged(u8), scores: scores_list_type, num_term_pairs: node_index_int) !Self {
        // @compileLog(@sizeOf(Node));
        @setRuntimeSafety(false);
        const roots = try allocator.alloc(node_index_int, 256);
        {
            comptime var j = 0;
            inline while (j < 256) : (j += 1) {
                roots[j] = NULL;
            }
        }
        var i = num_term_pairs;
        const nodes: []Node = node_list.items;
        const string_buffer_items = string_buffer.items;

        NEXT_NODE: while (true) {
            i -= 1;
            if (i == 1) break;
            const term = nodes[i].term(string_buffer_items);
            const term_len = nodes[i].getTermLen();
            std.debug.assert(term.len > 0);
            var LCP: term_len_int = 1;
            var cur_i: node_index_int = roots[term[0]];
            if (cur_i == NULL) {
                nodes[i].@"next/LCP".LCP = 1;
                if (STORE_4_TERM_BYTES_IN_NODE) nodes[i].update_next4chars(string_buffer_items);
                roots[term[0]] = i;
                continue;
            }
            while (true) {
                const term_chars_left = if (STORE_4_TERM_BYTES_IN_NODE) term_len - LCP;
                const first = if (STORE_4_TERM_BYTES_IN_NODE) @ctz(@bitCast(u4, @as(@Vector(4, u8), [4]u8{
                    if (term_chars_left != 0) term[LCP + 0] else 0,
                    if (term_chars_left != 1) term[LCP + 1] else 0,
                    if (term_chars_left != 2) term[LCP + 2] else 0,
                    if (term_chars_left != 3) term[LCP + 3] else 0,
                }) != nodes[cur_i].next4chars));
                if (STORE_4_TERM_BYTES_IN_NODE) LCP += first;
                if (!STORE_4_TERM_BYTES_IN_NODE or first == 4)
                    LCP = longestCommonPrefix(LCP, term, nodes[cur_i].term(string_buffer_items));
                if (nodes[cur_i].getNext() == NULL) {
                    nodes[i].@"next/LCP".LCP = LCP;
                    if (STORE_4_TERM_BYTES_IN_NODE) nodes[i].update_next4chars(string_buffer_items);
                    nodes[cur_i].@"next/LCP".next = i;
                    continue :NEXT_NODE;
                }
                cur_i = nodes[cur_i].getNext();
                while (nodes[cur_i].getLCP() != LCP) : (cur_i = nodes[cur_i].getDown()) {
                    if (nodes[cur_i].getDown() == NULL) {
                        nodes[i].@"next/LCP".LCP = LCP;
                        if (STORE_4_TERM_BYTES_IN_NODE) nodes[i].update_next4chars(string_buffer_items);
                        nodes[cur_i].@"down/term_len".down = i;
                        continue :NEXT_NODE;
                    }
                }
            }
        }
        return Self{
            .nodes = node_list,
            .allocator = allocator,
            .roots = roots.ptr,
            .string_buffer = string_buffer,
            .scores = scores,
        };
    }

    pub fn deinit(self: *Self) void {
        self.nodes.deinit(self.allocator);
        self.scores.deinit(self.allocator);
        self.string_buffer.deinit(self.allocator);
        self.allocator.free(self.roots[0..256]);
        self.* = undefined;
    }

    inline fn getLocusIndexForPrefix(self: *const Self, prefix: string) u32 {
        const nodes: []Node = self.nodes.items;
        const str_buffer = self.string_buffer.items;
        std.debug.assert(prefix.len > 0);
        var cur_i: u32 = self.roots[prefix[0]];
        if (cur_i == NULL) return NULL;
        var LCP: term_len_int = 1;
        while (true) {
            const chars_left = if (STORE_4_TERM_BYTES_IN_NODE) prefix.len - LCP;
            const first = if (STORE_4_TERM_BYTES_IN_NODE) @ctz(@bitCast(u4, @as(@Vector(4, u8), [4]u8{
                if (chars_left != 0) prefix[LCP + 0] else 0,
                if (chars_left != 1) prefix[LCP + 1] else 0,
                if (chars_left != 2) prefix[LCP + 2] else 0,
                if (chars_left != 3) prefix[LCP + 3] else 0,
            }) != nodes[cur_i].next4chars));
            if (STORE_4_TERM_BYTES_IN_NODE) LCP += first;
            if (!STORE_4_TERM_BYTES_IN_NODE or first == 4)
                LCP = longestCommonPrefix(LCP, prefix, nodes[cur_i].term(str_buffer));
            if (LCP == prefix.len) return cur_i;
            cur_i = nodes[cur_i].getNext();
            while (true) : (cur_i = nodes[cur_i].getDown()) {
                if (cur_i == NULL) return NULL;
                if (nodes[cur_i].getLCP() == LCP) break;
            }
        }
    }

    inline fn firstVerticalSuccessor(self: *const Self, prefix_len: usize, _cur_i: u32) u32 {
        const nodes: []Node = self.nodes.items;
        var cur_i = nodes[_cur_i].getNext();
        while (true) : (cur_i = nodes[cur_i].getDown()) { // finds vertical successor
            // ideally, this should jump to the continuation expression of the while loop below
            if (cur_i == NULL) break;
            if (nodes[cur_i].getLCP() >= prefix_len) break; // might skip nodes when directly under the locus
        }
        return cur_i;
    }

    fn verticalSuccessor(self: *const Self, prefix_len: usize, _cur_i: u32) u32 {
        const nodes: []Node = self.nodes.items;
        var cur_i = _cur_i;
        while (true) { // finds vertical successor
            cur_i = nodes[cur_i].getDown();
            // ideally, this should jump to the continuation expression of the while loop below
            if (cur_i == NULL) break;
            if (nodes[cur_i].getLCP() >= prefix_len) break; // might skip nodes when directly under the locus
        }
        return cur_i;
    }

    inline fn horizontalSuccessor(self: *const Self, cur_i: u32) u32 {
        return self.nodes.items[cur_i].getNext();
    }

    const topK_int = if (builtin.mode == .Debug) u32 else u6;

    pub fn topKCompletionsToPrefixImpl(self: *const Self, prefix: string, results: anytype, comptime impl: u3) topK_int {
        return switch (impl) {
            0 => self.topKCompletionsToPrefix(prefix, results),
            1 => self.topKCompletionsToPrefix2(prefix, results),
            2 => self.topKCompletionsToPrefix6(prefix, results),
            3 => self.topKCompletionsToPrefixSIMD(prefix, results),
            else => unreachable,
        };
    }

    /// Takes in a string `prefix` and a pointer to an array `results`.
    /// Fills the `results` array as much as possible and returns the number of results found.
    /// The capacity of `results` has to fit in the return type of this function.
    pub fn topKCompletionsToPrefix5(self: *const Self, prefix: string, results: anytype) topK_int {
        @setRuntimeSafety(false);
        // infer topK from results, which should be a pointer to a fixed-length array.
        comptime var topK: topK_int = switch (@typeInfo(@TypeOf(results))) {
            .Pointer => |ptrType| switch (@typeInfo(ptrType.child)) {
                .Array => |arrType| arrType.len,
                else => @compileError("topKCompletionsToPrefix expects results to be a *[N]string."),
            },
            else => @compileError("topKCompletionsToPrefix expects results to be a *[N]string."),
        };
        const nodes: []Node = self.nodes.items;
        const scores = self.scores.items;
        const str_buffer = self.string_buffer.items;
        var cur_i: u32 = self.getLocusIndexForPrefix(prefix);
        if (cur_i == NULL) return 0;
        // nodes[cur_i] is now the locus node, so push it to the results!
        results[0] = nodes[cur_i].term(str_buffer);
        if (topK == 1) return 1;
        // Find vertical successor, skipping over nodes as necessary with insufficient LCPs
        cur_i = self.firstVerticalSuccessor(prefix.len, cur_i);
        if (cur_i == NULL) return 1;
        // nodes[cur_i] is now the second completion
        results[1] = nodes[cur_i].term(str_buffer);
        var k: topK_int = topK - 2;
        if (k == 0) return topK;
        var depq: [topK / 2 + 2]depq_int = undefined; // double-ended priority queue
        // these are sentinels, to avoid bounds checks, so depq is otherwise 1-indexed
        depq[0] = if (MAINTAIN_SORTED_ORDER_IN_MEMORY) std.math.minInt(depq_int) else NULL;
        {
            comptime var p = 1;
            inline while (p < depq.len) : (p += 1) {
                depq[p] = if (MAINTAIN_SORTED_ORDER_IN_MEMORY) std.math.maxInt(depq_int) else 1;
            }
        }
        var depq_len: u32 = 0;
        var next_i: u32 = self.horizontalSuccessor(cur_i);
        cur_i = self.verticalSuccessor(prefix.len, cur_i);
        if (MOVE_FIRST_ITERATION_OF_TOPK_QUERY_OUT_OF_LOOP) {
            const winner_i = if (MAINTAIN_SORTED_ORDER_IN_MEMORY)
                @max(cur_i, next_i)
            else if (scores[cur_i] > scores[next_i]) cur_i else next_i;
            if (winner_i == NULL) return 2;
            results[2] = nodes[winner_i].term(str_buffer);
            k -= 1;
            if (k == 0) return 3;
            const loser_i = if (MAINTAIN_SORTED_ORDER_IN_MEMORY) @min(cur_i, next_i) else next_i ^ cur_i ^ winner_i; // grab the loser
            if (loser_i != NULL) {
                depq[1] = loser_i;
                depq_len += 1;
            }
            next_i = self.horizontalSuccessor(winner_i);
            cur_i = self.verticalSuccessor(prefix.len, winner_i);
        }

        comptime var isFull = false;
        inline while (true) {
            NON_FULL_LOOP: while (true) {
                // insert cur_i and next_i into depq
                MY_LOOP: {
                    comptime var a: u2 = 2;
                    inline while (true) {
                        // return to this point, but with isFull comptime-known to be true
                        if (!isFull and depq_len == k) {
                            if (a == 1) next_i = NULL;
                            break :NON_FULL_LOOP;
                        }
                        if (cur_i != NULL) {
                            const score = if (MAINTAIN_SORTED_ORDER_IN_MEMORY) cur_i else scores[cur_i];
                            // if depq is full and node.score is lower than its minimum, don't insert
                            if (!(isFull and if (MAINTAIN_SORTED_ORDER_IN_MEMORY)
                                score < depq[1] // score == depq[1] is impossible because indexes are unique
                            else
                                score <= scores[depq[1]]))
                            {
                                var i: i32 = if (isFull) 2 else @intCast(i32, depq_len);
                                var iter: i32 = if (isFull) -1 else 1;
                                // [ ::, 12, 8, 6, 4, 2 ::]
                                while (if (MAINTAIN_SORTED_ORDER_IN_MEMORY)
                                    (if (isFull) score > depq[@intCast(usize, i)] else score < depq[@intCast(usize, i)])
                                else
                                    (if (isFull) score > scores[depq[@intCast(usize, i)]] else score < scores[depq[@intCast(usize, i)]])) : (i -= iter)
                                {
                                    // we have a sentinel at depq[0]
                                    depq[@intCast(usize, i + iter)] = depq[@intCast(usize, i)];
                                    // shift elements as needed to maintain ascending order (by score)
                                }
                                depq[@intCast(usize, i + iter)] = cur_i;
                                depq_len += 1 - @boolToInt(isFull);
                            }
                        }
                        a -= 1;
                        if (a == 0) break;
                        if (next_i == NULL) break :MY_LOOP;
                        cur_i = next_i;
                    }
                }
                // pop from the DEPQ if possible
                if (depq_len == 0) return topK - k;
                // Pop the top element off the DEPQ (1-indexed)
                cur_i = depq[depq_len];
                if (builtin.mode == .Debug)
                    // We don't NEED to do this, because depq[depq_len] is higher ordered than any nodes
                    // which may follow. It does make it easier to see what is going on in Debug mode though.
                    depq[depq_len] = if (MAINTAIN_SORTED_ORDER_IN_MEMORY) std.math.maxInt(depq_int) else 1;
                depq_len -= 1;
                results[topK - k] = nodes[cur_i].term(str_buffer);
                k -= 1;
                if (k == 0) return topK;
                next_i = self.horizontalSuccessor(cur_i);
                cur_i = self.verticalSuccessor(prefix.len, cur_i);
            }
            if (isFull) break;
            isFull = true;
        }
        return topK - k;
    }

    // SIMD version
    /// Takes in a string `prefix` and a pointer to an array `results`.
    /// Fills the `results` array as much as possible and returns the number of results found.
    /// The capacity of `results` has to fit in the return type of this function.
    pub fn topKCompletionsToPrefixSIMD(self: *const Self, prefix: string, results: anytype) topK_int {
        @setRuntimeSafety(false);
        // infer topK from results, which should be a pointer to a fixed-length array.
        comptime var topK: topK_int = switch (@typeInfo(@TypeOf(results))) {
            .Pointer => |ptrType| switch (@typeInfo(ptrType.child)) {
                .Array => |arrType| arrType.len,
                else => @compileError("topKCompletionsToPrefix expects results to be a *[N]string."),
            },
            else => @compileError("topKCompletionsToPrefix expects results to be a *[N]string."),
        };
        const nodes: []Node = self.nodes.items;
        const str_buffer = self.string_buffer.items;
        var cur_i: u32 = self.getLocusIndexForPrefix(prefix);
        if (cur_i == NULL) return 0;
        // nodes[cur_i] is now the locus node, so push it to the results!
        results[0] = nodes[cur_i].term(str_buffer);
        if (topK == 1) return 1;
        var k: topK_int = topK - 1;
        // Find vertical successor, skipping over nodes as necessary with insufficient LCPs
        cur_i = nodes[cur_i].getNext();
        const DEPQ_SIZE = topK / 2;
        var depq: @Vector(DEPQ_SIZE, depq_int) = @splat(DEPQ_SIZE, @as(depq_int, 0)); // double-ended priority queue
        const bitmap_int = @Type(.{ .Int = .{ .signedness = .unsigned, .bits = DEPQ_SIZE } });
        var next_i: u32 = NULL;
        // The index in depq which can safely be overwritten, because there is garbage there.
        var garbage_slot: u32 = 0;
        // @compileLog(bitmap_int);
        while (true) {
            blk2: {
                blk: {
                    while (true) : (cur_i = nodes[cur_i].getDown()) { // finds vertical successor
                        // ideally, this should jump to the continuation expression of the while loop below
                        if (cur_i == NULL) break :blk;
                        if (nodes[cur_i].getLCP() >= prefix.len) break; // might skip nodes when directly under the locus
                    }
                    // insert cur_i and next_i into depq
                    depq[garbage_slot] = cur_i;
                    if (next_i == NULL) break :blk2;
                    garbage_slot = @ctz(@bitCast(bitmap_int, depq == @splat(DEPQ_SIZE, @reduce(.Min, depq))));
                }
                depq[garbage_slot] = next_i;
            }
            // Pop the top element off the DEPQ
            cur_i = @reduce(.Max, depq);
            if (cur_i == NULL) return topK - k;
            results[topK - k] = nodes[cur_i].term(str_buffer);
            k -= 1;
            if (k == 0) return topK;
            garbage_slot = @ctz(@bitCast(bitmap_int, depq == @splat(DEPQ_SIZE, cur_i)));
            next_i = self.horizontalSuccessor(cur_i);
            cur_i = nodes[cur_i].getDown();
        }
    }

    inline fn get_winner_i(a: u32, b: u32, scores: if (MAINTAIN_SORTED_ORDER_IN_MEMORY) []score_int) u32 {
        return if (MAINTAIN_SORTED_ORDER_IN_MEMORY) // grab the winner
            @max(a, b)
        else if (scores[a] > scores[b]) a else b;
    }

    inline fn get_loser_i(a: u32, b: u32, winner_i: u32) u32 {
        return if (MAINTAIN_SORTED_ORDER_IN_MEMORY) @min(a, b) else a ^ b ^ winner_i; // grab the loser
    }

    fn h(b: bool) string {
        return if (b) "⌄⌄⌄⌄⌄⌄⌄" else "⌃⌃⌃⌃⌃⌃⌃";
    }

    fn printState(depq_len: u32, depq: [4]u32, pred: [4]bool, shifted_depq: [4]u32, cur_i: u32) void {
        if (1 != 1) {
            std.debug.print("T depq: [{}]string{{ {:0>7}, {:0>7}, {:0>7}, {:0>7} }}, inserted: {}\n", .{ depq_len, if (depq_len > 0) depq[0] else 0, if (depq_len > 1) depq[1] else 0, if (depq_len > 2) depq[2] else 0, if (depq_len > 3) depq[3] else 0, cur_i });
            std.debug.print("+ depq: [{}]string{{ {s: >0}, {s: >0}, {s: >0}, {s: >0} }}\n", .{ depq_len, h(pred[0]), h(pred[1]), h(pred[2]), h(pred[3]) });
            std.debug.print("F depq: [{}]string{{ {:0>7}, {:0>7}, {:0>7}, {:0>7} }}, inserted: {}\n", .{ depq_len, if (depq_len >= 0) shifted_depq[0] else 0, if (depq_len >= 1) shifted_depq[1] else 0, if (depq_len >= 2) shifted_depq[2] else 0, if (depq_len >= 3) shifted_depq[3] else 0, cur_i });
        }
    }

    fn printFinalState(depq_len: u32, depq: [4]u32, cur_i: u32, k: u4) void {
        std.debug.print("_ depq: [{}]string{{ {:0>7}, {:0>7}, {:0>7}, {:0>7} }}, inserted: {}, k: {}\n\n", .{ depq_len, if (depq_len > 0) depq[0] else 0, if (depq_len > 1) depq[1] else 0, if (depq_len > 2) depq[2] else 0, if (depq_len > 3) depq[3] else 0, cur_i, k });
    }

    pub fn topKCompletionsToPrefix6(self: *const Self, prefix: string, results: *[10]string) u4 {
        @setRuntimeSafety(false);
        const SHOULD_PRINT = false;
        const nodes: []Node = self.nodes.items;
        const scores = if (MAINTAIN_SORTED_ORDER_IN_MEMORY) self.scores.items;
        const str_buffer = self.string_buffer.items;
        var cur_i: depq_int = self.getLocusIndexForPrefix(prefix);
        if (cur_i == NULL) return 0;
        // nodes[cur_i] is now the locus node, so push it to the results!
        results[0] = nodes[cur_i].term(str_buffer);
        // Find vertical successor, skipping over nodes as necessary with insufficient LCPs
        cur_i = self.firstVerticalSuccessor(prefix.len, cur_i);
        if (cur_i == NULL) return 1;
        // nodes[cur_i] is now the second completion
        results[1] = nodes[cur_i].term(str_buffer);
        var next_i: depq_int = self.horizontalSuccessor(cur_i);
        cur_i = self.verticalSuccessor(prefix.len, cur_i);
        const winner_i = get_winner_i(cur_i, next_i, scores);
        if (winner_i == NULL) return 2;
        results[2] = nodes[winner_i].term(str_buffer);
        var k: u4 = 3;
        const loser_i = get_loser_i(next_i, cur_i, winner_i);
        const half_topK = 5;
        const half_topK_bitmap_int = @Type(.{ .Int = .{ .signedness = .unsigned, .bits = half_topK } });
        var depq: [half_topK]depq_int = undefined;
        comptime for (depq) |*p| {
            p.* = std.math.maxInt(depq_int);
        };
        depq[0] = NULL;
        if (loser_i != NULL) depq[1] = loser_i;
        var depq_len: u16 = @boolToInt(loser_i != NULL); // TODO: try u8, u16, u32
        cur_i = winner_i;
        // if (SHOULD_PRINT) std.debug.print("prefix: \"{s}\"\n_ depq: [{}]string{{ {:0>7}, {:0>7}, {:0>7}, {:0>7}, {:0>7} }}, inserted: {}\n", .{ prefix, depq_len, depq[0], 0, 0, 0, 0, loser_i });
        var do_cur_insertion = false;
        while (true) {
            comptime var shifted: [half_topK]i32 = undefined;
            comptime var isFull = false;
            comptime for (shifted) |*p, j| {
                p.* = switch (@intCast(i32, j) - if (isFull) -1 else 1) {
                    half_topK => -1,
                    else => |e| e,
                };
            };
            next_i = self.horizontalSuccessor(cur_i);
            cur_i = self.verticalSuccessor(prefix.len, cur_i);
            if (cur_i != NULL) {
                // { 0, 3, 5, 7, _}
                // { 0, 3, 5, 6, 7}
                var i: u16 = depq_len;
                while (true) {
                    if (cur_i > depq[i - 1]) break;
                    depq[i] = depq[i - 1];
                    i -= 1;
                }
                depq[i] = cur_i;
                // var depq_vec: @Vector(half_topK, depq_int) = depq;
                // const shifted_depq = @shuffle(depq_int, depq_vec, @splat(1, @as(u32, 0)), shifted);
                // const pred = @splat(half_topK, cur_i) < depq_vec;
                // if (SHOULD_PRINT) printState(depq_len, depq, pred, shifted_depq, cur_i);
                // depq = @select(depq_int, pred, shifted_depq, depq_vec);
                // depq[@ctz(@bitCast(half_topK_bitmap_int, pred))] = cur_i;
                // if (SHOULD_PRINT) printFinalState(depq_len + 1, depq, cur_i, k);
                depq_len += 1;
                if (depq_len >= 10 - k) {
                    // if (SHOULD_PRINT) std.debug.print("depq_len: {}, k: {}\n", .{ depq_len, k });
                    do_cur_insertion = true;
                    break;
                }
            }
            if (next_i != NULL) {
                var depq_vec: @Vector(half_topK, depq_int) = depq;
                const shifted_depq = @shuffle(depq_int, depq_vec, @splat(1, @as(u32, 0)), shifted);
                const pred = @splat(half_topK, next_i) < depq_vec;
                if (SHOULD_PRINT) printState(depq_len, depq, pred, shifted_depq, next_i);
                var tmp = depq[3];
                depq = @select(depq_int, pred, shifted_depq, depq_vec);
                const x = @ctz(@bitCast(half_topK_bitmap_int, pred));
                if (SHOULD_PRINT) std.debug.print("Saving previous maximum: {}\n", .{next_i});
                if (x == 4) break;
                depq[x] = next_i;
                next_i = tmp;
                if (SHOULD_PRINT) std.debug.print("Saving previous maximum: {}\n", .{next_i});
                if (depq_len == 4) break;
                depq_len += 1;
                if (depq_len >= 11 - k) break;
                if (SHOULD_PRINT) printFinalState(depq_len, depq, depq[x], k);
            }
            // for some reason, LLVM does not do this optimization automatically when I check here:
            // if (depq_len == 0) break;
            // So instead, we decrement first and then check for wrap-around
            depq_len -%= 1;
            if (depq_len == std.math.maxInt(@TypeOf(depq_len))) return k;
            cur_i = depq[depq_len];
            if (SHOULD_PRINT) std.debug.print("pushing {} to results[{}]\n", .{ cur_i, k });
            results[k] = nodes[cur_i].term(str_buffer);
            k += 1;
        }
        if (SHOULD_PRINT) {
            std.debug.print("___\n", .{});
            printFinalState(depq_len, depq, next_i, k);
            std.debug.print("___\n", .{});
        }
        // we saved the old maximum in cur_i, then we inserted another.
        // given e.g. { 1, 3, 5, 7 }
        // if we inserted 4 -> { 1, 3, 4, 5 } cur_i = 7
        // if we inserted 9 -> { 1, 3, 5, 9 } cur_i = 7
        // that means we may need to switch these
        while (true) {
            comptime var shifted: [half_topK]i32 = undefined;
            comptime var isFull = true;
            comptime for (shifted) |*p, j| {
                p.* = switch (@intCast(i32, j) - if (isFull) -1 else 1) {
                    half_topK => -1,
                    else => |e| e,
                };
            };
            if (do_cur_insertion) {
                if (SHOULD_PRINT) std.debug.print("pushing {} to results[{}]\n", .{ next_i, k });
                results[k] = nodes[next_i].term(str_buffer);
                if (k == 9) return 10;
                k += 1;
                cur_i = self.verticalSuccessor(prefix.len, next_i);
                next_i = self.horizontalSuccessor(next_i);
                // { 3, 5, 7, 9 } 10
                if (cur_i != NULL) {
                    var depq_vec: @Vector(half_topK, depq_int) = depq;
                    const shifted_depq = @shuffle(depq_int, depq_vec, @splat(1, @as(u32, 0)), shifted);
                    const pred = @splat(half_topK, cur_i) < depq_vec;
                    if (SHOULD_PRINT) printState(depq_len, depq, pred, shifted_depq, cur_i);
                    depq = @select(depq_int, pred, depq_vec, shifted_depq);
                    const lk = @ctz(@bitCast(half_topK_bitmap_int, pred));
                    if (lk != 0) depq[lk - 1] = cur_i;
                    if (SHOULD_PRINT) printFinalState(depq_len, depq, cur_i, k);
                }
            }
            do_cur_insertion = false;
            if (next_i != NULL) {
                var depq_vec: @Vector(half_topK, depq_int) = depq;
                const shifted_depq = @shuffle(depq_int, depq_vec, @splat(1, @as(u32, 0)), shifted);
                const pred = @splat(half_topK, next_i) < depq_vec;
                if (SHOULD_PRINT) printState(depq_len, depq, pred, shifted_depq, next_i);
                depq = @select(depq_int, pred, depq_vec, shifted_depq);
                const lk = @ctz(@bitCast(half_topK_bitmap_int, pred));
                if (lk != 0) depq[lk - 1] = next_i;
                if (SHOULD_PRINT) printFinalState(depq_len, depq, next_i, k);
            }
            // for some reason, LLVM does not do this optimization automatically when I check here:
            // if (depq_len == 0) break;
            // So instead, we decrement first and then check for wrap-around
            depq_len -%= 1;
            if (depq_len == std.math.maxInt(@TypeOf(depq_len))) return k;
            next_i = depq[depq_len];
        }
    }

    fn insert_cur_i(self: *const Self, depq_len: u8, cur_i: u32, str_buffer: string, results: *[10]string, prefix_len: u64, depq: *[4]depq_int, depq_1_indexed: *[5]depq_int, k: u8, comptime half_topK: comptime_int) u8 {
        comptime var shifted: [half_topK]i32 = undefined;
        comptime var isFull = true;
        comptime for (shifted) |*p, j| {
            p.* = switch (@intCast(i32, j) - if (isFull) -1 else 1) {
                half_topK => -1,
                else => |e| e,
            };
        };
        const nodes: []Node = self.nodes.items;
        // if (SHOULD_PRINT) std.debug.print("pushing {} to results[{}]\n", .{ cur_i, k });
        results[k] = nodes[cur_i].term(str_buffer);
        // Explain to me how it makes sense that this results in this assembly...
        // inc al
        // cmp al, 10
        // je .LBB16_45
        // ...
        // ...
        // .LBB16_45:
        // mov al, 10
        // jmp .LBB16_2
        // You don't need a formal theorem prover to figure out that `al` is already 10...
var j = k + 1; if (j == 10) return 10; var next_i = self.horizontalSuccessor(cur_i); var cur_i2 = self.verticalSuccessor(prefix_len, cur_i); // { 3, 5, 7, 9 } 10 if (cur_i2 != NULL) { // we could check if cur_i2 > depq[0] var depq_vec: @Vector(half_topK, depq_int) = depq.*; const shifted_depq = @shuffle(depq_int, depq_vec, @splat(1, @as(depq_int, std.math.maxInt(depq_int))), shifted); const pred = depq_vec < @splat(half_topK, cur_i2); // if (SHOULD_PRINT) printState(depq_len, depq, pred, shifted_depq, cur_i2); depq.* = @select(depq_int, pred, shifted_depq, depq_vec); depq_1_indexed[@popCount(@bitCast(@Type(.{ .Int = .{ .signedness = .unsigned, .bits = half_topK } }), pred))] = cur_i2; // if (SHOULD_PRINT) printFinalState(depq_len, depq, cur_i2, k); } return self.insert_next_i(depq_len, next_i, str_buffer, results, prefix_len, depq, depq_1_indexed, j, half_topK); } fn insert_next_i(self: *const Self, depq_len_: u8, next_i: u32, str_buffer: string, results: *[10]string, prefix_len: u64, depq: *[4]depq_int, depq_1_indexed: *[5]depq_int, k: u8, comptime half_topK: comptime_int) u8 { comptime var shifted: [half_topK]i32 = undefined; comptime var isFull = true; comptime for (shifted) |*p, j| { p.* = switch (@intCast(i32, j) - if (isFull) -1 else 1) { half_topK => -1, else => |e| e, }; }; if (next_i != NULL) { // we could check if (next_i > depq[0] var depq_vec: @Vector(half_topK, depq_int) = depq.*; const shifted_depq = @shuffle(depq_int, depq_vec, @splat(1, @as(depq_int, std.math.maxInt(depq_int))), shifted); const pred = depq_vec < @splat(half_topK, next_i); // if (SHOULD_PRINT) printState(depq_len, depq, pred, shifted_depq, next_i); depq.* = @select(depq_int, pred, shifted_depq, depq_vec); depq_1_indexed[@popCount(@bitCast(@Type(.{ .Int = .{ .signedness = .unsigned, .bits = half_topK } }), pred))] = next_i; // if (SHOULD_PRINT) printFinalState(depq_len, depq, next_i, k); } // for some reason, LLVM does not do this optimization automatically when I check here: // if 
(depq_len == 0) break; // So instead, we decrement first and then check for wrap-around // CORRECTION: It does the optimization for this loop, just not the previous one. var depq_len = depq_len_ -% 1; if (depq_len == std.math.maxInt(@TypeOf(depq_len))) return k; // LLVM moves `do_cur_insertion = false;` to the end of this block, yet fails to realize that // `do_cur_insertion` is `true`. Here is the ASM I get: // mov bpl, 1 // test bpl, 1 // jne .LBB16_44 // jmp .LBB16_50 return self.insert_cur_i(depq_len, depq[depq_len], str_buffer, results, prefix_len, depq, depq_1_indexed, k, half_topK); } /// Returns up to a given number of completions of a given prefix string, if present in the structure. /// The completions will be in descending sorted order by score, so the "most relevant" completions will /// come first. The algorithm is explained here: https://validark.github.io/DynSDT/ pub fn topKCompletionsToPrefix(self: *const Self, prefix: string, results: *[10]string) u8 { // We use the term "successor" to just mean "the next one". It is not akin to how the term is used for // vEB tree or x-fast/y-fast/z-fast trees. @setRuntimeSafety(false); const SHOULD_PRINT = false; const nodes: []Node = self.nodes.items; const scores = if (MAINTAIN_SORTED_ORDER_IN_MEMORY) self.scores.items; const str_buffer = self.string_buffer.items; var cur_i: depq_int = self.getLocusIndexForPrefix(prefix); if (cur_i == NULL) return 0; // nodes[cur_i] is now the locus node, so push it to the results! 
        results[0] = nodes[cur_i].term(str_buffer);
        // Find the vertical successor, skipping over nodes with insufficient LCPs as necessary.
        cur_i = self.firstVerticalSuccessor(prefix.len, cur_i);
        if (cur_i == NULL) return 1;
        // nodes[cur_i] is now the second completion
        results[1] = nodes[cur_i].term(str_buffer);
        var next_i: depq_int = self.horizontalSuccessor(cur_i);
        cur_i = self.verticalSuccessor(prefix.len, cur_i);
        const winner_i = get_winner_i(cur_i, next_i, scores);
        if (winner_i == NULL) return 2;
        results[2] = nodes[winner_i].term(str_buffer);
        var k: u8 = 3;
        const loser_i = get_loser_i(next_i, cur_i, winner_i);
        const half_topK = 4;
        const half_topK_bitmap_int = @Type(.{ .Int = .{ .signedness = .unsigned, .bits = half_topK } });
        var depq_1_indexed: [half_topK + 1]depq_int = undefined;
        var depq: *[half_topK]depq_int = depq_1_indexed[1..];
        comptime for (depq) |*p| {
            p.* = std.math.maxInt(depq_int);
        };
        if (loser_i != NULL) depq[0] = loser_i;
        var depq_len: u8 = @boolToInt(loser_i != NULL); // TODO: try u8, u16, u32
        cur_i = winner_i;
        // if (SHOULD_PRINT) std.debug.print("prefix: \"{s}\"\n_ depq: [{}]string{{ {:0>7}, {:0>7}, {:0>7}, {:0>7}, {:0>7} }}, inserted: {}\n", .{ prefix, depq_len, depq[0], 0, 0, 0, 0, loser_i });
        comptime var shifted: [half_topK]i32 = undefined;
        comptime var isFull = false;
        comptime for (shifted) |*p, j| {
            p.* = switch (@intCast(i32, j) - if (isFull) -1 else 1) {
                half_topK => -1,
                else => |e| e,
            };
        };
        while (true) {
            next_i = self.horizontalSuccessor(cur_i);
            cur_i = self.verticalSuccessor(prefix.len, cur_i);
            if (cur_i != NULL) {
                var depq_vec: @Vector(half_topK, depq_int) = depq.*;
                const shifted_depq = @shuffle(depq_int, depq_vec, @splat(1, @as(u32, 0)), shifted);
                const pred = depq_vec < @splat(half_topK, cur_i);
                if (SHOULD_PRINT) printState(depq_len, depq, pred, shifted_depq, cur_i);
                depq.* = @select(depq_int, pred, depq_vec, shifted_depq);
                depq[@popCount(@bitCast(half_topK_bitmap_int, pred))] = cur_i;
                depq_len += 1;
                if (SHOULD_PRINT) printFinalState(depq_len, depq, cur_i, k);
                // When the DEPQ is full and we need to insert next_i, jump to the second while loop.
                std.debug.assert(depq_len <= 10 - k);
                if (depq_len == 10 - k and next_i != NULL) {
                    if (SHOULD_PRINT) std.debug.print("depq_len: {}, k: {}\n", .{ depq_len, k });
                    return self.insert_next_i(depq_len, next_i, str_buffer, results, prefix.len, depq, &depq_1_indexed, k, half_topK);
                }
            }
            if (next_i != NULL) {
                var depq_vec: @Vector(half_topK, depq_int) = depq.*;
                const shifted_depq = @shuffle(depq_int, depq_vec, @splat(1, @as(u32, 0)), shifted);
                const pred = depq_vec < @splat(half_topK, next_i);
                if (SHOULD_PRINT) printState(depq_len, depq, pred, shifted_depq, next_i);
                var tmp = depq[3];
                depq.* = @select(depq_int, pred, depq_vec, shifted_depq);
                const x = @popCount(@bitCast(half_topK_bitmap_int, pred));
                if (SHOULD_PRINT) std.debug.print("Saving previous maximum: {}\n", .{next_i});
                // Because we can guarantee that the topK/2 slot is only needed a single time, during the
                // transition to the second phase of the algorithm that handles a full DEPQ, we omit it from
                // the DEPQ. E.g. for topK = 10, we don't need the 5th slot, i.e. index 4.
                // Instead, we simulate it in our cur_i variable by making it the maximum element that is
                // implicitly popped from the DEPQ.
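                // To illustrate the branchless insert above with made-up values (not part of any real
                // query): suppose depq = { 2, 5, 8, 9 } in ascending order and we insert next_i = 6.
                //   pred = depq_vec < splat(6)            -> { true, true, false, false }
                //   shifted_depq (shift up, fill front)   -> { _, 2, 5, 8 }
                //   @select(pred, depq_vec, shifted_depq) -> { 2, 5, 5, 8 }
                //   x = @popCount(pred) = 2, depq[x] = 6  -> { 2, 5, 6, 8 }
                // The old maximum, 9, falls off the top; it was saved in `tmp` and becomes cur_i.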
                cur_i = next_i;
                // When x is 4, we know cur_i/next_i is the maximum, so we can jump to the second phase
                // and push cur_i/next_i to results and find their successors.
                if (x == 4) break;
                // When x is not 4, we know the maximum element was in depq[3], stored in tmp above, and
                // by this point it has been moved out of the depq.
                cur_i = tmp;
                depq[x] = next_i;
                if (SHOULD_PRINT) std.debug.print("Saving previous maximum: {}\n", .{next_i});
                if (depq_len == 4) break; // Simulate popping cur_i, so skip incrementing depq_len
                depq_len += 1;
                if (SHOULD_PRINT) printFinalState(depq_len, depq, depq[x], k);
            }
            // For some reason, LLVM does not do this optimization automatically when I check here:
            // if (depq_len == 0) return k;
            // So instead, we decrement first and then check for wrap-around.
            depq_len -%= 1;
            if (depq_len == std.math.maxInt(@TypeOf(depq_len))) return @intCast(u4, k);
            cur_i = depq[depq_len];
            // The DEPQ can also count as full simply because we already have so many completions. E.g. if
            // we found 8 completions already, then we only need 2 more if topK=10. Because k has not been
            // incremented yet, when k=7, we would consider depq_len=2 to be full.
            // e.g. { 45, _ } -> 48, 49
            std.debug.assert(depq_len <= 9 - k);
            if (depq_len == 9 - k) break;
            if (SHOULD_PRINT) std.debug.print("pushing {} to results[{}]\n", .{ cur_i, k });
            results[k] = nodes[cur_i].term(str_buffer);
            k += 1;
        }
        return self.insert_cur_i(depq_len, cur_i, str_buffer, results, prefix.len, depq, &depq_1_indexed, k, half_topK);
        // if (SHOULD_PRINT) {
        //     std.debug.print("___\n", .{});
        //     printFinalState(depq_len, depq, cur_i, k);
        //     std.debug.print("___\n", .{});
        // }
        // We saved the old maximum in cur_i, then we inserted another.
        // Given e.g. { 1, 3, 5, 7 }:
        //   if we inserted 4 -> { 1, 3, 4, 5 }, cur_i = 7
        //   if we inserted 9 -> { 1, 3, 5, 9 }, cur_i = 7
        // That means we may need to switch these.
        // {
        //     while (true) {
        //         switch (next_state) {
        //             .insert_cur_i => {
        //                 if (SHOULD_PRINT) std.debug.print("pushing {} to results[{}]\n", .{ cur_i, k });
        //                 results[k] = nodes[cur_i].term(str_buffer);
        //                 // Explain to me how it makes sense that this results in this assembly...
        //                 //   inc al
        //                 //   cmp al, 10
        //                 //   je .LBB16_45
        //                 //   ...
        //                 //   ...
        //                 // .LBB16_45:
        //                 //   mov al, 10
        //                 //   jmp .LBB16_2
        //                 // You don't need a formal theorem prover to figure out that `al` is already 10...
        //                 k += 1;
        //                 if (k == 10) return 10;
        //                 next_i = self.horizontalSuccessor(cur_i);
        //                 cur_i = self.verticalSuccessor(prefix.len, cur_i);
        //                 // { 3, 5, 7, 9 } 10
        //                 if (cur_i != NULL) { // we could check if cur_i > depq[0]
        //                     var depq_vec: @Vector(half_topK, depq_int) = depq.*;
        //                     const shifted_depq = @shuffle(depq_int, depq_vec, @splat(1, @as(depq_int, std.math.maxInt(depq_int))), shifted);
        //                     const pred = depq_vec < @splat(half_topK, cur_i);
        //                     if (SHOULD_PRINT) printState(depq_len, depq, pred, shifted_depq, cur_i);
        //                     depq.* = @select(depq_int, pred, shifted_depq, depq_vec);
        //                     depq_1_indexed[@popCount(@bitCast(half_topK_bitmap_int, pred))] = cur_i;
        //                     if (SHOULD_PRINT) printFinalState(depq_len, depq, cur_i, k);
        //                 }
        //                 next_state = .insert_next_i;
        //             },
        //             .insert_next_i => {
        //                 if (next_i != NULL) { // we could check if (next_i > depq[0])
        //                     var depq_vec: @Vector(half_topK, depq_int) = depq.*;
        //                     const shifted_depq = @shuffle(depq_int, depq_vec, @splat(1, @as(depq_int, std.math.maxInt(depq_int))), shifted);
        //                     const pred = depq_vec < @splat(half_topK, next_i);
        //                     if (SHOULD_PRINT) printState(depq_len, depq, pred, shifted_depq, next_i);
        //                     depq.* = @select(depq_int, pred, shifted_depq, depq_vec);
        //                     depq_1_indexed[@popCount(@bitCast(half_topK_bitmap_int, pred))] = next_i;
        //                     if (SHOULD_PRINT) printFinalState(depq_len, depq, next_i, k);
        //                 }
        //                 // For some reason, LLVM does not do this optimization automatically when I check here:
        //                 // if (depq_len == 0) break;
        //                 // So instead, we decrement first and then check for wrap-around.
        //                 // CORRECTION: It does the optimization for this loop, just not the previous one.
        //                 depq_len -%= 1;
        //                 if (depq_len == std.math.maxInt(@TypeOf(depq_len))) return k;
        //                 cur_i = depq[depq_len];
        //                 // LLVM moves `do_cur_insertion = false;` to the end of this block, yet fails to realize that
        //                 // `do_cur_insertion` is `true`. Here is the ASM I get:
        //                 //   mov bpl, 1
        //                 //   test bpl, 1
        //                 //   jne .LBB16_44
        //                 //   jmp .LBB16_50
        //                 next_state = .insert_cur_i;
        //             },
        //         }
        //     }
        // }
    } // top10CompletionsToPrefix

    pub fn topKCompletionsToPrefix2(self: *const Self, prefix: string, results: *[10]string) u4 {
        @setRuntimeSafety(false);
        const SHOULD_PRINT = false;
        const nodes: []Node = self.nodes.items;
        const scores = if (MAINTAIN_SORTED_ORDER_IN_MEMORY) self.scores.items;
        const str_buffer = self.string_buffer.items;
        var cur_i: u32 = self.getLocusIndexForPrefix(prefix);
        if (cur_i == NULL) return 0;
        // nodes[cur_i] is now the locus node, so push it to the results!
        results[0] = nodes[cur_i].term(str_buffer);
        // Find the vertical successor, skipping over nodes with insufficient LCPs as necessary.
        cur_i = self.firstVerticalSuccessor(prefix.len, cur_i);
        if (cur_i == NULL) return 1;
        // nodes[cur_i] is now the second completion
        results[1] = nodes[cur_i].term(str_buffer);
        var next_i = self.horizontalSuccessor(cur_i);
        cur_i = self.verticalSuccessor(prefix.len, cur_i);
        const winner_i = get_winner_i(cur_i, next_i, scores);
        if (winner_i == NULL) return 2;
        results[2] = nodes[winner_i].term(str_buffer);
        var k: u4 = 7;
        const loser_i = get_loser_i(next_i, cur_i, winner_i);
        var depq: [5]depq_int = [5]depq_int{ std.math.maxInt(depq_int), std.math.maxInt(depq_int), std.math.maxInt(depq_int), std.math.maxInt(depq_int), std.math.maxInt(depq_int) };
        if (loser_i != NULL) depq[0] = loser_i;
        var depq_len: u3 = @boolToInt(loser_i != NULL);
        cur_i = winner_i;
        next_i = self.horizontalSuccessor(cur_i);
        cur_i = self.verticalSuccessor(prefix.len, cur_i);
        if (SHOULD_PRINT) std.debug.print("prefix: \"{s}\"\n_ depq: [{}]string{{ {:0>7}, {:0>7}, {:0>7}, {:0>7}, {:0>7} }}, inserted: {}\n", .{ prefix, depq_len, depq[0], 0, 0, 0, 0, loser_i });
        OUTER: while (true) {
            INNER: while (true) : ({
                cur_i = next_i;
                next_i = NULL;
                if (depq_len == k) break :OUTER;
                if (cur_i == NULL) break :INNER;
            }) {
                if (cur_i == NULL) continue;
                // insert cur_i into depq registers
                const x = @popCount(@bitCast(u4, @splat(4, cur_i) < depq[0..4].*));
                depq[4] = cur_i;
                if (x > 0) {
                    depq[4] = depq[3];
                    depq[3] = cur_i;
                }
                if (x > 1) {
                    depq[3] = depq[2];
                    depq[2] = cur_i;
                }
                if (x > 2) {
                    depq[2] = depq[1];
                    depq[1] = cur_i;
                }
                if (x > 3) {
                    depq[1] = depq[0];
                    depq[0] = cur_i;
                }
                // 4 -> bigger than all of them
                // 3 -> bigger than all but 1
                depq_len += 1;
                if (SHOULD_PRINT) std.debug.print("{} depq: [{}]string{{ {:0>7}, {:0>7}, {:0>7}, {:0>7}, {:0>7} }}, inserted: {}, k: {}\n", .{ x, depq_len, if (depq_len > 0) depq[0] else 0, if (depq_len > 1) depq[1] else 0, if (depq_len > 2) depq[2] else 0, if (depq_len > 3) depq[3] else 0, if (depq_len > 4) depq[4] else 0, cur_i, k });
            }
            if (depq_len == 0) return 10 - k;
            k -= 1;
            depq_len -= 1;
            cur_i = depq[depq_len];
            if (SHOULD_PRINT) std.debug.print("_ depq: [{}]string{{ {:0>7}, {:0>7}, {:0>7}, {:0>7}, {:0>7} }}, removed: {}, k: {}, str: {s}\n\n", .{ depq_len, if (depq_len > 0) depq[0] else 0, if (depq_len > 1) depq[1] else 0, if (depq_len > 2) depq[2] else 0, if (depq_len > 3) depq[3] else 0, if (depq_len > 4) depq[4] else 0, cur_i, k, nodes[cur_i].term(str_buffer) });
            results[9 - k] = nodes[cur_i].term(str_buffer);
            next_i = self.horizontalSuccessor(cur_i);
            cur_i = self.verticalSuccessor(prefix.len, cur_i);
        }
        if (SHOULD_PRINT) std.debug.print("\nFULL:\n", .{});
        if (1 == 1) return 10;
        // When full:
        while (true) {
            while (true) {
                // insert cur_i into depq registers
                if (cur_i != NULL) {
                    var cur_i_vec = @splat(5, cur_i);
                    const x = @popCount(@bitCast(u5, cur_i_vec > depq));
                    if (x > 0) {
                        depq[0] = cur_i;
                    }
                    if (x > 1) {
                        depq[0] = depq[1];
                        depq[1] = cur_i;
                    }
                    if (x > 2) {
                        depq[1] = depq[2];
                        depq[2] = cur_i;
                    }
                    if (x > 3) {
                        depq[2] = depq[3];
                        depq[3] = cur_i;
                    }
                    if (x > 4) {
                        depq[3] = depq[4];
                        depq[4] = cur_i;
                    }
                    if (SHOULD_PRINT) std.debug.print("{} depq: [{}]string{{ {:0>7}, {:0>7}, {:0>7}, {:0>7}, {:0>7} }}, inserted: {}, k: {}\n", .{ x, depq_len, if (depq_len > 0) depq[0] else 0, if (depq_len > 1) depq[1] else 0, if (depq_len > 2) depq[2] else 0, if (depq_len > 3) depq[3] else 0, if (depq_len > 4) depq[4] else 0, cur_i, k });
                }
                if (next_i == NULL) break;
                cur_i = next_i;
                next_i = NULL;
            }
            if (depq_len == 0) return 10 - k;
            k -= 1;
            depq_len -= 1;
            cur_i = depq[depq_len];
            depq[depq_len] = std.math.maxInt(depq_int);
            results[9 - k] = nodes[cur_i].term(str_buffer);
            if (SHOULD_PRINT) std.debug.print("_ depq: [{}]string{{ {:0>7}, {:0>7}, {:0>7}, {:0>7}, {:0>7} }}, removed: {}, k: {}\n\n", .{ depq_len, if (depq_len > 0) depq[0] else 0, if (depq_len > 1) depq[1] else 0, if (depq_len > 2) depq[2] else 0, if (depq_len > 3) depq[3] else 0, if (depq_len > 4) depq[4] else 0, cur_i, k });
            if (k == 0) break;
            next_i = self.horizontalSuccessor(cur_i);
            cur_i = self.verticalSuccessor(prefix.len, cur_i);
        }
        return 10;
    }

    pub fn initFromFile(gpa: Allocator, file_path: string) !DynSDT {
        std.debug.print("Instantiating Trie ...\n", .{});
        var file = try std.fs.cwd().openFile(file_path, .{});
        defer file.close();
        var t1 = std.time.milliTimestamp();
        // make sure this ends in \t\n
        var file_buf: [FILE_BUFFER_SIZE + 2]u8 = undefined;
        file_buf[FILE_BUFFER_SIZE] = '\t';
        file_buf[FILE_BUFFER_SIZE + 1] = '\n';
        const buffer_requirements: BufferRequirements = try precomputeFileBufferRequirements(file, &file_buf);
        var num_term_pairs = @intCast(node_index_int, buffer_requirements.num_term_pairs);
        var string_buffer_size = buffer_requirements.string_buffer_size;
        var node_list = try std.ArrayListUnmanaged(Node).initCapacity(gpa, num_term_pairs);
        var scores = try std.ArrayListUnmanaged(score_int).initCapacity(gpa, num_term_pairs);
        var backing_string_buffer = try std.ArrayListUnmanaged(u8).initCapacity(gpa, 8 * (node_list.capacity - num_term_pairs) + string_buffer_size); // allocate more space for how many nodes we can fit without reallocation
        // This is our NULL node, used as a sentinel sometimes to avoid bounds checks
        const sentinels = [_]Node{
            Node.init(std.math.maxInt(str_buf_int), std.math.maxInt(term_len_int)),
            Node.init(std.math.maxInt(str_buf_int), std.math.maxInt(term_len_int)),
        };
        const sentinel_scores = [_]score_int{ std.math.minInt(score_int), std.math.maxInt(score_int) };
        node_list.appendSliceAssumeCapacity(&sentinels);
        scores.appendSliceAssumeCapacity(&sentinel_scores);
        node_list.items.len = num_term_pairs;
        scores.items.len = num_term_pairs;
        var node_list_i = num_term_pairs;
        backing_string_buffer.expandToCapacity();
        var string_buffer = backing_string_buffer.items[0..string_buffer_size];
        var current_score: score_int = 0;
        var current_term_start: str_buf_int = 0;
        var current_term_len: term_len_int = 0;
        var delimiter: u8 = '\t';
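        // A sketch of the SIMD delimiter scan used below, with made-up bytes (not from any real file):
        // we load 32 bytes at a time, compare every lane against the delimiter, bitcast the boolean
        // vector to a 32-bit mask, and @ctz finds the first match. E.g. scanning "new york\t1234..."
        // for '\t' sets bit 8 of the mask, so @ctz returns 8 and we skip straight to the tab. If no
        // lane matches, @ctz of the all-zero mask returns 32 (== VEC_SIZE_FOR_PARSING_FILES), so the
        // loop advances a full chunk and loads the next 32 bytes.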
        OUTER_LOOP: while (true) {
            var buf: []u8 = file_buf[0..FILE_BUFFER_SIZE];
            var bytes_to_consume = try file.read(buf);
            buf.len += 2; // this makes our buffer include our two sentinels at the end!
            const is_final_buffer = bytes_to_consume != FILE_BUFFER_SIZE; // If we don't fill up buffer, break later
            while (bytes_to_consume != 0) {
                var buf_start = buf;
                var diff: term_len_int = undefined;
                switch (delimiter) {
                    '\t' => {
                        while (USE_SIMD_FOR_PARSING_FILES and buf.len >= VEC_SIZE_FOR_PARSING_FILES) {
                            const vec: @Vector(32, u8) = buf[0..VEC_SIZE_FOR_PARSING_FILES].*;
                            var cmp2 = vec == @splat(VEC_SIZE_FOR_PARSING_FILES, @as(u8, '\t'));
                            const bitmask = @bitCast(@Type(.{ .Int = .{ .signedness = .unsigned, .bits = VEC_SIZE_FOR_PARSING_FILES } }), cmp2);
                            const first = @ctz(bitmask);
                            buf = buf[first..];
                            if (first != VEC_SIZE_FOR_PARSING_FILES) break;
                        } else while (buf[0] != delimiter) buf = buf[1..];
                        diff = @intCast(term_len_int, @ptrToInt(buf.ptr) - @ptrToInt(buf_start.ptr));
                        current_term_len += diff;
                        if (current_term_start == 0) current_term_start = @intCast(str_buf_int, @ptrToInt(string_buffer.ptr) - @ptrToInt(backing_string_buffer.items.ptr));
                        std.mem.copy(u8, string_buffer, buf_start[0..diff]);
                        string_buffer = string_buffer[diff..];
                        if (@ptrToInt(&file_buf[FILE_BUFFER_SIZE]) == @ptrToInt(buf.ptr)) {
                            std.debug.assert(is_final_buffer == false);
                            continue :OUTER_LOOP;
                        }
                    },
                    '\n' => {
                        while (true) {
                            switch (buf[0]) {
                                '0'...'9' => |c| {
                                    current_score = current_score * 10 + (c - '0');
                                    buf = buf[1..];
                                },
                                else => break,
                            }
                        }
                        diff = @intCast(term_len_int, @ptrToInt(buf.ptr) - @ptrToInt(buf_start.ptr));
                        if (@ptrToInt(&file_buf[FILE_BUFFER_SIZE]) == @ptrToInt(buf.ptr)) {
                            std.debug.assert(is_final_buffer == false);
                            continue :OUTER_LOOP;
                        }
                        node_list_i -= 1;
                        node_list.items[node_list_i] = Node.init(current_term_start, current_term_len);
                        scores.items[node_list_i] = current_score;
                        current_score = 0;
                        current_term_start = 0;
                        current_term_len = 0;
                    },
                    else => unreachable,
                }
                bytes_to_consume -= diff + 1; // the +1 is for the \t or \n
                delimiter ^= 3; // flip between '\n' <-> '\t'
                buf = buf[1..]; // skip the '\t' or '\n'
            }
            if (is_final_buffer) break;
        }
        std.debug.assert(node_list_i == 2);
        std.debug.print(" Read term list in {} ms. (", .{std.time.milliTimestamp() - t1});
        printCommifiedNumber(num_term_pairs - 2); // subtract 2 because we don't count sentinels
        std.debug.print(" terms)\n", .{});
        t1 = std.time.milliTimestamp();
        var dynSDT = try DynSDT.initWithNodes(gpa, node_list, backing_string_buffer, scores, num_term_pairs);
        const elapsed = std.time.milliTimestamp() - t1;
        const bytes_used: usize = string_buffer_size + @intCast(usize, num_term_pairs) * @sizeOf(Node) + @sizeOf(node_index_int) * 256 + @sizeOf(DynSDT) + @intCast(usize, num_term_pairs) * @sizeOf(score_int);
        const MB_used = bytes_used / 1000000;
        std.debug.print(" Made structure in {} ms. ({}.{:0>2} MB)\n", .{ elapsed, MB_used, bytes_used / 100000 - MB_used * 10 + @boolToInt(bytes_used % 100000 >= 50000) });
        return dynSDT;
    }

    fn getNodeIndexForTerm(self: *const Self, term: string) node_index_int {
        const nodes: []Node = self.nodes.items;
        const str_buffer = self.string_buffer.items;
        std.debug.assert(term.len > 0);
        var cur_i: node_index_int = self.roots[term[0]];
        if (cur_i == NULL) return NULL;
        var LCP: term_len_int = 1;
        while (true) {
            LCP = longestCommonPrefix(LCP, term, nodes[cur_i].term(str_buffer));
            if (LCP == term.len and LCP == nodes[cur_i].getTermLen()) return cur_i;
            cur_i = nodes[cur_i].getNext();
            while (true) : (cur_i = nodes[cur_i].getDown()) {
                if (cur_i == NULL) return NULL;
                if (nodes[cur_i].getLCP() == LCP) break;
            }
        }
    }

    pub fn getScore(self: *const Self, term: string) !score_int {
        return switch (self.getNodeIndexForTerm(term)) {
            NULL => error.NodeNotFound,
            else => |i| self.scores.items[i],
        };
    }

    // pub fn increaseScore(term: string, n: score_int) void {
    //     const nodes: []Node = self.nodes.items;
    //     const scores: []Node = self.scores.items;
    //     const str_buffer = self.string_buffer.items;
    //     std.debug.assert(term.len > 0);
    //     var cur_i: node_index_int = self.roots[term[0]];
    //     if (cur_i == NULL) {
    //         // just insert into the list
    //         // self.roots[term[0]] =
    //         // find place to allocate it
    //     }
    //     var LCP: term_len_int = 1;
    //     while (true) {
    //         LCP = longestCommonPrefix(LCP, term, nodes[cur_i].term(str_buffer));
    //         if (LCP == term.len and LCP == nodes[cur_i].getTermLen()) {
    //             // SET-ExactMatchFound
    //         }
    //         if (score > scores[cur_i]) {
    //             // Set-ScoreLocationFound
    //         }
    //         cur_i = nodes[cur_i].getNext();
    //         while (true) : (cur_i = nodes[cur_i].getDown()) {
    //             if (cur_i == NULL) return NULL; // just insert into the list
    //             if (nodes[cur_i].getLCP() == LCP) break;
    //         }
    //     }
    // }
};

pub fn main() !void {
    // test "query speed" {
    // if (1 == 1) return error.SkipZigTest;
    var general_purpose_allocator = std.heap.GeneralPurposeAllocator(.{
        .safety = true,
        .never_unmap = true,
        .retain_metadata = true,
    }){};
    const gpa = general_purpose_allocator.allocator();
    var trie = try DynSDT.initFromFile(gpa, "./terms_sorted2.txt"); // 95.6 MB in 224ms, that's ~427MB/s
    defer trie.deinit();
    var file = try std.fs.cwd().openFile("../c#/PruningRadixTrie.Benchmark/all_queries_with_num_completions/queries.txt", .{});
    defer file.close();
    // make sure this ends in \t\n
    var file_buf: [FILE_BUFFER_SIZE + 2]u8 = undefined;
    file_buf[FILE_BUFFER_SIZE] = '\t';
    file_buf[FILE_BUFFER_SIZE + 1] = '\n';
    const buffer_requirements: BufferRequirements = try precomputeFileBufferRequirements(file, &file_buf);
    var num_term_pairs = buffer_requirements.num_term_pairs;
    var string_buffer_size = buffer_requirements.string_buffer_size;
    var pairs = try std.ArrayListUnmanaged(string).initCapacity(gpa, num_term_pairs);
    defer pairs.deinit(gpa);
    var backing_string_buffer = try std.ArrayListUnmanaged(u8).initCapacity(gpa, string_buffer_size);
    defer backing_string_buffer.deinit(gpa);
    backing_string_buffer.expandToCapacity();
    var string_buffer = backing_string_buffer.items[0..string_buffer_size];
    var
current_term_start: str_buf_int = 0;
    var current_term_len: term_len_int = 0;
    var delimiter: u8 = '\t';
    OUTER_LOOP: while (true) {
        var buf: []u8 = file_buf[0..FILE_BUFFER_SIZE];
        var bytes_to_consume = try file.read(buf);
        buf.len += 2; // this makes our buffer include our two sentinels at the end!
        const is_final_buffer = bytes_to_consume != FILE_BUFFER_SIZE; // If we don't fill up buffer, break later
        while (bytes_to_consume != 0) {
            var buf_start = buf;
            while (USE_SIMD_FOR_PARSING_FILES and buf.len >= VEC_SIZE_FOR_PARSING_FILES) {
                const vec: @Vector(32, u8) = buf[0..VEC_SIZE_FOR_PARSING_FILES].*;
                var cmp2 = vec == @splat(VEC_SIZE_FOR_PARSING_FILES, @as(u8, delimiter));
                const bitmask = @bitCast(@Type(.{ .Int = .{ .signedness = .unsigned, .bits = VEC_SIZE_FOR_PARSING_FILES } }), cmp2);
                const first = @ctz(bitmask);
                buf = buf[first..];
                if (first != VEC_SIZE_FOR_PARSING_FILES) break;
            } else while (buf[0] != delimiter) buf = buf[1..];
            const diff = @ptrToInt(buf.ptr) - @ptrToInt(buf_start.ptr) - @boolToInt(@ptrCast(*u8, buf.ptr) == &file_buf[FILE_BUFFER_SIZE + 1]);
            switch (delimiter) {
                '\t' => {
                    current_term_len += @intCast(u8, diff);
                    if (current_term_start == 0) current_term_start = @intCast(str_buf_int, @ptrToInt(string_buffer.ptr) - @ptrToInt(backing_string_buffer.items.ptr));
                    std.mem.copy(u8, string_buffer, buf_start[0..diff]);
                    string_buffer = string_buffer[diff..];
                    if (@ptrToInt(&file_buf[FILE_BUFFER_SIZE]) == @ptrToInt(buf.ptr)) {
                        std.debug.assert(is_final_buffer == false);
                        continue :OUTER_LOOP;
                    }
                },
                '\n' => {
                    if (@ptrToInt(&file_buf[FILE_BUFFER_SIZE + 1]) == @ptrToInt(buf.ptr)) {
                        std.debug.assert(is_final_buffer == false);
                        continue :OUTER_LOOP;
                    }
                    pairs.appendAssumeCapacity(backing_string_buffer.items[current_term_start .. current_term_start + @intCast(u32, current_term_len)]);
                    current_term_start = 0;
                    current_term_len = 0;
                },
                else => unreachable,
            }
            bytes_to_consume -= diff + 1; // the +1 is for the \t or \n
            delimiter ^= 3; // flip between '\n' <-> '\t'
            buf = buf[1..]; // skip the '\t' or '\n'
        }
        if (is_final_buffer) break;
    }
    {
        var results: [10]string = undefined;
        const num_results = trie.topKCompletionsToPrefix("i", &results);
        std.debug.print("\"{s}\"", .{results[0]});
        for (results[1..10]) |result| {
            std.debug.print(", \"{s}\"", .{result});
        }
        std.debug.print("\n", .{});
        try std.testing.expectEqual(@as(u8, 10), num_results);
        for ([10]string{ "in", "in the", "i", "international", "island", "ii", "institute", "ice", "illinois", "indiana" }) |str, i| try std.testing.expectEqualStrings(str, results[i]);
        // for ([10]string{ "station", "school", "season", "song", "state", "south", "st", "series", "states", "summer" }) |str, i| try std.testing.expectEqualStrings(results[i], str);
    }
    // if (1 == 1) return;
    {
        var results: [10]string = undefined;
        try std.testing.expect(10 == trie.topKCompletionsToPrefix("s", &results));
        for ([10]string{ "station", "school", "season", "song", "state", "south", "st", "series", "states", "summer" }) |str, i| try std.testing.expectEqualStrings(str, results[i]);
        try std.testing.expect(10 == trie.topKCompletionsToPrefix("a", &results));
        for ([10]string{ "and", "album", "at", "at the", "american", "a", "al", "airport", "athletics", "association" }) |str, i| try std.testing.expectEqualStrings(str, results[i]);
        try std.testing.expect(10 == trie.topKCompletionsToPrefix("m", &results));
        for ([10]string{ "men", "michael", "m", "museum", "music", "martin", "mary", "my", "mount", "mark" }) |str, i| try std.testing.expectEqualStrings(str, results[i]);
        try std.testing.expect(10 == trie.topKCompletionsToPrefix("c", &results));
        for ([10]string{ "county", "c", "championships", "cup", "church", "championship", "college", "charles", "city", "council" }) |str, i| try std.testing.expectEqualStrings(str, results[i]);
        try std.testing.expect(10 == trie.topKCompletionsToPrefix("b", &results));
        for ([10]string{ "basketball", "band", "born", "by", "battle", "b", "battle of", "british", "bridge", "baseball" }) |str, i| try std.testing.expectEqualStrings(str, results[i]);
        try std.testing.expect(10 == trie.topKCompletionsToPrefix("p", &results));
        for ([10]string{ "park", "politician", "paul", "peter", "party", "pennsylvania", "people", "p", "public", "power" }) |str, i| try std.testing.expectEqualStrings(str, results[i]);
        try std.testing.expect(10 == trie.topKCompletionsToPrefix("t", &results));
        for ([10]string{ "the", "team", "thomas", "township", "tv", "to", "tv series", "texas", "tour", "the united" }) |str, i| try std.testing.expectEqualStrings(str, results[i]);
        try std.testing.expect(10 == trie.topKCompletionsToPrefix("d", &results));
        for ([10]string{ "disambiguation", "de", "district", "david", "division", "d", "doubles", "daniel", "day", "del" }) |str, i| try std.testing.expectEqualStrings(str, results[i]);
        try std.testing.expect(10 == trie.topKCompletionsToPrefix("r", &results));
        for ([10]string{ "river", "railway", "railway station", "robert", "richard", "road", "route", "rugby", "r", "red" }) |str, i| try std.testing.expectEqualStrings(str, results[i]);
        try std.testing.expect(10 == trie.topKCompletionsToPrefix("h", &results));
        for ([10]string{ "house", "high", "high school", "henry", "historic", "hill", "hall", "hockey", "h", "history" }) |str, i| try std.testing.expectEqualStrings(str, results[i]);
        try std.testing.expect(10 == trie.topKCompletionsToPrefix("l", &results));
        for ([10]string{ "list", "list of", "league", "la", "lake", "love", "language", "l", "lee", "louis" }) |str, i| try std.testing.expectEqualStrings(str, results[i]);
        try std.testing.expect(10 == trie.topKCompletionsToPrefix("g", &results));
        for ([10]string{ "george", "games", "group", "game", "general", "grand", "green", "g", "great", "georgia" }) |str, i| try std.testing.expectEqualStrings(str, results[i]);
        try std.testing.expect(10 == trie.topKCompletionsToPrefix("k", &results));
        for ([10]string{ "king", "k", "kentucky", "kingdom", "kansas", "khan", "kevin", "kim", "karl", "kong" }) |str, i| try std.testing.expectEqualStrings(str, results[i]);
        try std.testing.expect(10 == trie.topKCompletionsToPrefix("e", &results));
        for ([10]string{ "election", "e", "european", "edward", "east", "el", "electoral", "elections", "earl", "episodes" }) |str, i| try std.testing.expectEqualStrings(str, results[i]);
        try std.testing.expect(10 == trie.topKCompletionsToPrefix("f", &results));
        for ([10]string{ "film", "football", "footballer", "for", "f", "football team", "f c", "footballer born", "frank", "fc" }) |str, i| try std.testing.expectEqualStrings(str, results[i]);
        try std.testing.expect(10 == trie.topKCompletionsToPrefix("j", &results));
        for ([10]string{ "john", "james", "j", "joseph", "jean", "jose", "jack", "jr", "jones", "joe" }) |str, i| try std.testing.expectEqualStrings(str, results[i]);
        try std.testing.expect(10 == trie.topKCompletionsToPrefix("n", &results));
        for ([10]string{ "national", "new", "north", "new york", "no", "novel", "new zealand", "n", "number", "northern" }) |str, i| try std.testing.expectEqualStrings(str, results[i]);
        try std.testing.expect(10 == trie.topKCompletionsToPrefix("ma", &results));
        for ([10]string{ "martin", "mary", "mark", "maria", "man", "magazine", "marie", "maryland", "management", "massachusetts" }) |str, i| try std.testing.expectEqualStrings(str, results[i]);
        try std.testing.expect(10 == trie.topKCompletionsToPrefix("o", &results));
        for ([10]string{ "of", "of the", "open", "olympics", "on", "ohio", "one", "old", "o", "on the" }) |str, i| try std.testing.expectEqualStrings(str, results[i]);
        try std.testing.expect(10 == trie.topKCompletionsToPrefix("w", &results));
        for ([10]string{ "wikipedia", "world", "women", "william", "west", "w", "white", "washington", "war", "wisconsin" }) |str, i| try std.testing.expectEqualStrings(str, results[i]);
        try std.testing.expect(10 == trie.topKCompletionsToPrefix("i", &results));
        for ([10]string{ "in", "in the", "i", "international", "island", "ii", "institute", "ice", "illinois", "indiana" }) |str, i| try std.testing.expectEqualStrings(str, results[i]);
        try std.testing.expect(10 == trie.topKCompletionsToPrefix("v", &results));
        for ([10]string{ "virginia", "voivodeship", "v", "van", "valley", "video", "von", "video game", "victoria", "village" }) |str, i| try std.testing.expectEqualStrings(str, results[i]);
    }
    var queries = pairs.items;
    {
        comptime var i = 0;
        inline while (i < 1) : (i += 1) {
            const end_time = std.time.nanoTimestamp() + 1000000000;
            var iterations: u32 = 0;
            // while (true) {
            while (true) {
                // const t1 = std.time.nanoTimestamp();
                var results: [10]string = undefined;
                _ = trie.topKCompletionsToPrefix(queries[iterations], &results);
                // const t2 = std.time.nanoTimestamp();
                // std.debug.print("Got answer in {}ns: ", .{t2 - t1});
                // std.debug.print("try std.testing.expect({} == trie.topKCompletionsToPrefix(\"{s}\", &results));\nfor ([{}]string{{ ", .{ num_results, queries[iterations], num_results });
                iterations += 1;
                // if (num_results > 0) {
                //     std.debug.print("\"{s}\"", .{results[0]});
                //     for (results[1..num_results]) |result| {
                //         std.debug.print(", \"{s}\"", .{result});
                //     }
                //     std.debug.print(" }}) |str, i| try std.testing.expectEqualStrings(str, results[i]);\n", .{});
                // }
                // if (iterations == 200) break;
                if (std.time.nanoTimestamp() > end_time) break;
                // if (i == 1000) i = 0;
            }
            printCommifiedNumber(iterations);
            std.debug.print(" iterations in one second!\n", .{});
        }
    }
    {
        const start_time = std.time.milliTimestamp();
        var i: u32 = 0;
        while (i < 500000) : (i += 1) {
            var results: [10]string = undefined;
            _ = trie.topKCompletionsToPrefix(queries[i], &results);
        }
        const total_time = std.time.milliTimestamp() - start_time;
        std.debug.print("Performed {} queries in {}ms\n", .{ 500000, total_time });
    }
    // std.debug.print("{}\n", .{try trie.getScore("wikipedia")});
}
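// Note on the input format consumed by initFromFile: the file is a sequence of "term\tscore\n"
// records; the parser alternates between scanning up to '\t' to capture the term and accumulating
// ASCII digits up to '\n' to build the score. For example (the scores below are made up):
//   in<TAB>4184803<NEWLINE>
//   in the<TAB>1932437<NEWLINE>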