Compiler Explorer

Source code

//! The ASCII character encoding standard.
//!
//! See also: https://en.wikipedia.org/wiki/ASCII#Character_set

// I could have taken only a `u7` to make the ASCII-only contract clear, but it would be slower.
// It is my opinion that encodings other than UTF-8 should not be supported.
//
// (and 128 bytes is not much to pay).
// Also, this does not handle Unicode character classes.

const std = @import("std");
const mem = std.mem;
const testing = std.testing;

/// Namespace holding the ASCII C0 control codes (0x00 through 0x1F) plus `DEL`.
///
/// See also: https://en.wikipedia.org/wiki/C0_and_C1_control_codes and `isControl`
pub const control = struct {
    /// Null.
    pub const NUL = 0x00;
    /// Start of Heading.
    pub const SOH = 0x01;
    /// Start of Text.
    pub const STX = 0x02;
    /// End of Text.
    pub const ETX = 0x03;
    /// End of Transmission.
    pub const EOT = 0x04;
    /// Enquiry.
    pub const ENQ = 0x05;
    /// Acknowledge.
    pub const ACK = 0x06;
    /// Bell.
    pub const BEL = 0x07;
    /// Backspace.
    pub const BS = 0x08;
    /// Horizontal Tab.
    pub const TAB = 0x09;
    /// Line Feed.
    pub const LF = 0x0A;
    /// Vertical Tab.
    pub const VT = 0x0B;
    /// Form Feed.
    pub const FF = 0x0C;
    /// Carriage Return.
    pub const CR = 0x0D;
    /// Shift Out.
    pub const SO = 0x0E;
    /// Shift In.
    pub const SI = 0x0F;
    /// Data Link Escape.
    pub const DLE = 0x10;
    /// Device Control 1.
    pub const DC1 = 0x11;
    /// Device Control 2.
    pub const DC2 = 0x12;
    /// Device Control 3.
    pub const DC3 = 0x13;
    /// Device Control 4.
    pub const DC4 = 0x14;
    /// Negative Acknowledge.
    pub const NAK = 0x15;
    /// Synchronous Idle.
    pub const SYN = 0x16;
    /// End of Transmission Block.
    pub const ETB = 0x17;
    /// Cancel.
    pub const CAN = 0x18;
    /// End of Medium.
    pub const EM = 0x19;
    /// Substitute.
    pub const SUB = 0x1A;
    /// Escape.
    pub const ESC = 0x1B;
    /// File Separator.
    pub const FS = 0x1C;
    /// Group Separator.
    pub const GS = 0x1D;
    /// Record Separator.
    pub const RS = 0x1E;
    /// Unit Separator.
    pub const US = 0x1F;

    /// Delete. Not part of the contiguous 0x00-0x1F range.
    pub const DEL = 0x7F;

    /// An alias to `DC1`.
    pub const XON = DC1;
    /// An alias to `DC3`.
    pub const XOFF = DC3;
};

// These naive functions are used to generate the lookup table
// and serve as fallbacks when the lookup table isn't available.
//
// Note that some functions, such as `isDigit`, don't use a table because it would be slower.
// A table is generally only useful when the `true` values are not all contiguous in the table.

/// Table-free control-character check: `true` for every value from
/// `control.NUL` through `control.US`, and for `control.DEL`.
fn isControlNaive(char: u8) bool {
    return switch (char) {
        control.NUL...control.US, control.DEL => true,
        else => false,
    };
}
/// Table-free alphabetic check: `true` when `char` is an ASCII letter,
/// i.e. within `'a'` to `'z'` or `'A'` to `'Z'` (both inclusive).
fn isAlphabeticNaive(char: u8) bool {
    // The upper-case range must start at 'A'; starting it at 'Z' would accept
    // only the single letter 'Z' and reject every other capital letter.
    return (char >= 'a' and char <= 'z') or (char >= 'A' and char <= 'Z');
}
/// Table-free hexadecimal-digit check: `true` for `'0'`-`'9'`, `'a'`-`'f'`
/// and `'A'`-`'F'`.
fn isHexadecimalNaive(char: u8) bool {
    return switch (char) {
        '0'...'9', 'a'...'f', 'A'...'F' => true,
        else => false,
    };
}
/// Table-free alphanumeric check: `true` for ASCII digits and for ASCII
/// letters of either case.
fn isAlphanumericNaive(char: u8) bool {
    return switch (char) {
        '0'...'9', 'a'...'z', 'A'...'Z' => true,
        else => false,
    };
}
 fn isDigitNaive(char: u8) bool {
    // Table-free decimal-digit check. Subtracting '0' with wrap-around maps
    // '0'..'9' onto 0..9 and every other byte onto a value of 10 or more.
    return (char -% '0') < 10;
}
/// Table-free whitespace check: `true` when `char` is one of the bytes listed
/// in `whitespace`.
fn isWhitespaceNaive(char: u8) bool {
    @setEvalBranchQuota(4000);
    for (whitespace) |candidate| {
        if (candidate == char) return true;
    }
    return false;
}

/// A lookup table that packs several character-class predicates into one byte
/// per character.
///
/// Each entry of `table` is a bit set: bit `@enumToInt(Index.x)` is set when the
/// character at that position belongs to class `x`.
const CombinedTable = struct {
    /// One bit set per byte value. `init` only fills entries 0 through 127; the
    /// remaining entries stay 0, so `contains` reports `false` for bytes >= 128.
    table: [256]u8,

    /// The character classes stored in the table. Each tag's integer value is
    /// used as the bit position for that class.
const Index = enum {
        control,
        alphabetic,
        hexadecimal,
        alphanumeric,
        digit,
        whitespace,
    };

    /// Generates a table which is filled with the results of the given function
    /// for the character values 0 through 127.
    fn getBoolTable(comptime condition: fn (u8) bool) [128]bool {
        // Set the comptime branch quota to 2000 for the loop below.
        @setEvalBranchQuota(2000);
        comptime var table: [128]bool = undefined;
        comptime var index = 0;
        while (index < 128) : (index += 1) {
            table[index] = condition(index);
        }
        return table;
    }

    /// Builds the combined table from the naive predicate functions.
fn init() CombinedTable {
        // Start from all zeroes; only the first 128 entries are written below.
        comptime var table = [_]u8{0} ** 256;

        // One boolean table per class, each evaluated at comptime.
const control_table = comptime getBoolTable(isControlNaive);
        const alpha_table = comptime getBoolTable(isAlphabeticNaive);
        const hex_table = comptime getBoolTable(isHexadecimalNaive);
        const alphanumeric_table = comptime getBoolTable(isAlphanumericNaive);
        const digit_table = comptime getBoolTable(isDigitNaive);
        const whitespace_table = comptime getBoolTable(isWhitespaceNaive);

        // Merge the per-class booleans into one byte per character, shifting
        // each boolean to the bit position given by its `Index` tag.
comptime var i = 0;
        inline while (i < 128) : (i += 1) {
            table[i] =
                @boolToInt(control_table[i]) << @enumToInt(Index.control) |
                @boolToInt(alpha_table[i]) << @enumToInt(Index.alphabetic) |
                @boolToInt(hex_table[i]) << @enumToInt(Index.hexadecimal) |
                @boolToInt(alphanumeric_table[i]) << @enumToInt(Index.alphanumeric) |
                @boolToInt(digit_table[i]) << @enumToInt(Index.digit) |
                @boolToInt(whitespace_table[i]) << @enumToInt(Index.whitespace);
        }

return .{ .table = table };
    }

    /// Returns `true` if the bit for class `index` is set in the entry for `char`.
fn contains(self: CombinedTable, char: u8, index: Index) bool {
        return (self.table[char] & (@as(u8, 1) << @enumToInt(index))) != 0;
    }
};

/// Bit-packed classification table consulted by the table-backed checks.
///
/// `ReleaseSmall` builds leave this `null`, which saves the table's 256 bytes
/// at the cost of a small decrease in performance.
const combined_table: ?CombinedTable = switch (@import("builtin").mode) {
    .ReleaseSmall => null,
    else => CombinedTable.init(),
};

/// Reports whether `char` is a control character, using the lookup table when
/// it exists and the naive check otherwise.
///
/// See also: `control`
pub fn isControl(char: u8) bool {
    return if (combined_table) |table|
        table.contains(char, .control)
    else
        isControlNaive(char);
}

/// Reports whether `char` is an ASCII letter (either case) or digit, using the
/// lookup table when it exists and the naive check otherwise.
pub fn isAlphanumeric(char: u8) bool {
    return if (combined_table) |table|
        table.contains(char, .alphanumeric)
    else
        isAlphanumericNaive(char);
}

/// Reports whether `char` is alphabetic, using the lookup table when it exists
/// and the naive check otherwise.
pub fn isAlphabetic(char: u8) bool {
    return if (combined_table) |table|
        table.contains(char, .alphabetic)
    else
        isAlphabeticNaive(char);
}

/// Exported decimal-digit check that never touches the lookup table:
/// `true` exactly for `'0'` through `'9'`.
export fn isDigitNaiveWithoutTable(char: u8) bool {
    return switch (char) {
        '0'...'9' => true,
        else => false,
    };
}

/// Exported decimal-digit check backed by the lookup table.
///
/// Returns `true` if `char` is one of `'0'` through `'9'`.
export fn isDigitWithTable(char: u8) bool {
    // `combined_table` is `null` in `ReleaseSmall`, so it must not be unwrapped
    // unconditionally with `.?`. Fall back to the naive check in that case,
    // exactly like the other table-backed predicates in this file do.
    return if (combined_table) |table|
        table.contains(char, .digit)
    else
        isDigitNaive(char);
}

/// Reports whether `char` lies in the printable range, from `' '` (space)
/// through `'~'` inclusive.
pub fn isPrintable(char: u8) bool {
    return switch (char) {
        ' '...'~' => true,
        else => false,
    };
}

/// Reports whether `char` is a lower-case ASCII letter (`'a'` through `'z'`).
pub fn isLower(char: u8) bool {
    return switch (char) {
        'a'...'z' => true,
        else => false,
    };
}

/// Reports whether `char` is an upper-case ASCII letter (`'A'` through `'Z'`).
pub fn isUpper(char: u8) bool {
    return switch (char) {
        'A'...'Z' => true,
        else => false,
    };
}

/// Reports whether `char` is whitespace, using the lookup table when it exists
/// and the naive check otherwise.
///
/// See also: `whitespace`
pub fn isWhitespace(char: u8) bool {
    return if (combined_table) |table|
        table.contains(char, .whitespace)
    else
        isWhitespaceNaive(char);
}

/// Every byte for which `isWhitespace()` returns `true`: space, tab, line feed,
/// carriage return, vertical tab and form feed, in that order.
/// Handy as the value set for e.g. `std.mem.trim()` when trimming whitespace.
pub const whitespace = [_]u8{
    ' ',
    control.TAB,
    control.LF,
    control.CR,
    control.VT,
    control.FF,
};

test "whitespace" {
    // Every listed byte must be classified as whitespace.
    for (whitespace) |listed| {
        try testing.expect(isWhitespace(listed));
    }

    // Conversely, every ASCII byte classified as whitespace must be listed.
    var byte: u8 = 0;
    while (isASCII(byte)) : (byte += 1) {
        if (!isWhitespace(byte)) continue;
        try testing.expect(std.mem.indexOfScalar(u8, &whitespace, byte) != null);
    }
}

/// Reports whether `char` is a hexadecimal digit of either case, using the
/// lookup table when it exists and the naive check otherwise.
pub fn isHexadecimal(char: u8) bool {
    return if (combined_table) |table|
        table.contains(char, .hexadecimal)
    else
        isHexadecimalNaive(char);
}

/// Reports whether `c` is within the 7-bit ASCII range (0 through 127).
pub fn isASCII(c: u8) bool {
    // A byte is below 128 exactly when its top bit is clear.
    return (c & 0x80) == 0;
}

/// Converts a lower-case ASCII letter to its upper-case counterpart.
/// Every other byte is returned unchanged.
pub fn toUpper(c: u8) u8 {
    return switch (c) {
        // Lower- and upper-case letters are a fixed distance apart.
        'a'...'z' => c - ('a' - 'A'),
        else => c,
    };
}

/// Converts an upper-case ASCII letter to its lower-case counterpart.
/// Every other byte is returned unchanged.
pub fn toLower(c: u8) u8 {
    return switch (c) {
        // Lower- and upper-case letters are a fixed distance apart.
        'A'...'Z' => c + ('a' - 'A'),
        else => c,
    };
}

test "ascii character classes" {
    // Control characters.
    try testing.expect(!isControl('a'));
    try testing.expect(!isControl('z'));
    try testing.expect(isControl(control.NUL));
    try testing.expect(isControl(control.FF));
    try testing.expect(isControl(control.US));

    // Upper-casing leaves non-letters and non-ASCII bytes alone.
    try testing.expect('C' == toUpper('c'));
    try testing.expect(':' == toUpper(':'));
    try testing.expect('\xab' == toUpper('\xab'));
    try testing.expect(!isUpper('z'));

    // Lower-casing must be checked with an upper-case input; passing 'c'
    // would succeed even if `toLower` did nothing at all.
    try testing.expect('c' == toLower('C'));
    try testing.expect(':' == toLower(':'));
    try testing.expect('\xab' == toLower('\xab'));
    try testing.expect(!isLower('Z'));

    try testing.expect(isAlphanumeric('Z'));
    try testing.expect(isAlphanumeric('z'));
    try testing.expect(isAlphanumeric('5'));
    try testing.expect(isAlphanumeric('0'));
    try testing.expect(!isAlphanumeric('!'));

    try testing.expect(!isAlphabetic('5'));
    try testing.expect(isAlphabetic('c'));
    try testing.expect(!isAlphabetic('!'));

    try testing.expect(isWhitespace(' '));
    try testing.expect(isWhitespace('\t'));
    try testing.expect(isWhitespace('\r'));
    try testing.expect(isWhitespace('\n'));
    try testing.expect(!isWhitespace('.'));

    try testing.expect(!isHexadecimal('g'));
    try testing.expect(isHexadecimal('b'));
    try testing.expect(isHexadecimal('9'));

    try testing.expect(isPrintable(' '));
    try testing.expect(isPrintable('@'));
    try testing.expect(isPrintable('~'));
    try testing.expect(!isPrintable(control.ESC));
}

/// Lower-cases `ascii_string` byte by byte into the front of `output` and
/// returns the written prefix of `output`.
/// Asserts `output.len >= ascii_string.len`.
pub fn lowerString(output: []u8, ascii_string: []const u8) []u8 {
    std.debug.assert(output.len >= ascii_string.len);
    const lowered = output[0..ascii_string.len];
    var i: usize = 0;
    while (i < lowered.len) : (i += 1) {
        lowered[i] = toLower(ascii_string[i]);
    }
    return lowered;
}

test "lowerString" {
    const input = "aBcDeFgHiJkLmNOPqrst0234+💩!";
    const expected = "abcdefghijklmnopqrst0234+💩!";

    // Non-ASCII bytes (the emoji) must pass through untouched.
    var scratch: [1024]u8 = undefined;
    const lowered = lowerString(&scratch, input);
    try std.testing.expectEqualStrings(expected, lowered);
}

/// Returns a newly allocated lower-case copy of `ascii_string`.
/// The caller owns the returned slice and must free it with `allocator`.
/// Fails only if the allocation fails.
pub fn allocLowerString(allocator: std.mem.Allocator, ascii_string: []const u8) ![]u8 {
    const buffer = try allocator.alloc(u8, ascii_string.len);
    // The buffer is exactly as long as the input, so the slice returned by
    // `lowerString` covers the whole allocation.
    _ = lowerString(buffer, ascii_string);
    return buffer;
}

test "allocLowerString" {
    const input = "aBcDeFgHiJkLmNOPqrst0234+💩!";
    const expected = "abcdefghijklmnopqrst0234+💩!";

    const lowered = try allocLowerString(testing.allocator, input);
    defer testing.allocator.free(lowered);
    try testing.expectEqualStrings(expected, lowered);
}

/// Upper-cases `ascii_string` byte by byte into the front of `output` and
/// returns the written prefix of `output`.
/// Asserts `output.len >= ascii_string.len`.
pub fn upperString(output: []u8, ascii_string: []const u8) []u8 {
    std.debug.assert(output.len >= ascii_string.len);
    const uppered = output[0..ascii_string.len];
    var i: usize = 0;
    while (i < uppered.len) : (i += 1) {
        uppered[i] = toUpper(ascii_string[i]);
    }
    return uppered;
}

test "upperString" {
    const input = "aBcDeFgHiJkLmNOPqrst0234+💩!";
    const expected = "ABCDEFGHIJKLMNOPQRST0234+💩!";

    // Non-ASCII bytes (the emoji) must pass through untouched.
    var scratch: [1024]u8 = undefined;
    const uppered = upperString(&scratch, input);
    try std.testing.expectEqualStrings(expected, uppered);
}

/// Returns a newly allocated upper-case copy of `ascii_string`.
/// The caller owns the returned slice and must free it with `allocator`.
/// Fails only if the allocation fails.
pub fn allocUpperString(allocator: std.mem.Allocator, ascii_string: []const u8) ![]u8 {
    const buffer = try allocator.alloc(u8, ascii_string.len);
    // The buffer is exactly as long as the input, so the slice returned by
    // `upperString` covers the whole allocation.
    _ = upperString(buffer, ascii_string);
    return buffer;
}

test "allocUpperString" {
    const input = "aBcDeFgHiJkLmNOPqrst0234+💩!";
    const expected = "ABCDEFGHIJKLMNOPQRST0234+💩!";

    const uppered = try allocUpperString(testing.allocator, input);
    defer testing.allocator.free(uppered);
    try testing.expectEqualStrings(expected, uppered);
}

/// Reports whether `a` and `b` are equal when ASCII letter case is ignored.
/// Slices of different lengths are never equal.
pub fn eqlInsensitive(a: []const u8, b: []const u8) bool {
    if (a.len != b.len) return false;

    var i: usize = 0;
    while (i < a.len) : (i += 1) {
        if (toLower(a[i]) != toLower(b[i])) return false;
    }
    return true;
}

test "eqlInsensitive" {
    const expect = std.testing.expect;

    // Same text, different case.
    try expect(eqlInsensitive("HEl💩Lo!", "hel💩lo!"));
    // Different lengths.
    try expect(!eqlInsensitive("hElLo!", "hello! "));
    // Same length, different letter.
    try expect(!eqlInsensitive("hElLo!", "helro!"));
}

/// Reports whether `haystack` begins with `needle`, ignoring ASCII letter case.
/// A `needle` longer than `haystack` never matches.
pub fn startsWithInsensitive(haystack: []const u8, needle: []const u8) bool {
    if (needle.len > haystack.len) return false;
    const head = haystack[0..needle.len];
    return eqlInsensitive(head, needle);
}

test "ascii.startsWithInsensitive" {
    const expect = std.testing.expect;

    try expect(startsWithInsensitive("boB", "Bo"));
    // The needle occurs, but not at the start.
    try expect(!startsWithInsensitive("Needle in hAyStAcK", "haystack"));
}

/// Reports whether `haystack` ends with `needle`, ignoring ASCII letter case.
/// A `needle` longer than `haystack` never matches.
pub fn endsWithInsensitive(haystack: []const u8, needle: []const u8) bool {
    if (needle.len > haystack.len) return false;
    const tail_start = haystack.len - needle.len;
    return eqlInsensitive(haystack[tail_start..], needle);
}

test "ascii.endsWithInsensitive" {
    const expect = std.testing.expect;

    try expect(endsWithInsensitive("Needle in HaYsTaCk", "haystack"));
    // The needle occurs, but not at the end.
    try expect(!endsWithInsensitive("BoB", "Bo"));
}

/// Searches `container` for `substr`, ignoring ASCII letter case, considering
/// only matches that begin at `start_index` or later.
/// Returns the index of the first such match, or `null` if there is none.
pub fn indexOfInsensitivePos(container: []const u8, start_index: usize, substr: []const u8) ?usize {
    // TODO boyer-moore algorithm
    if (substr.len > container.len) return null;

    // Last position at which `substr` still fits inside `container`.
    const last_start = container.len - substr.len;
    var pos = start_index;
    while (pos <= last_start) : (pos += 1) {
        const window = container[pos .. pos + substr.len];
        if (eqlInsensitive(window, substr)) return pos;
    }
    return null;
}

/// Searches all of `container` for `substr`, ignoring ASCII letter case.
/// Returns the index of the first match, or `null` if there is none.
pub fn indexOfInsensitive(container: []const u8, substr: []const u8) ?usize {
    const search_from: usize = 0;
    return indexOfInsensitivePos(container, search_from, substr);
}

test "indexOfInsensitive" {
    const expect = std.testing.expect;

    // Match at the end, no match, match at the start, needle too long.
    try expect(indexOfInsensitive("one Two Three Four", "foUr").? == 14);
    try expect(indexOfInsensitive("one two three FouR", "gOur") == null);
    try expect(indexOfInsensitive("foO", "Foo").? == 0);
    try expect(indexOfInsensitive("foo", "fool") == null);

    // With several matches, the first one wins.
    try expect(indexOfInsensitive("FOO foo", "fOo").? == 0);
}

/// Orders two byte strings lexicographically, ignoring ASCII letter case.
/// When one string is a prefix of the other, the shorter one orders first.
/// O(n).
pub fn orderInsensitive(lhs: []const u8, rhs: []const u8) std.math.Order {
    const common_len = if (lhs.len < rhs.len) lhs.len else rhs.len;

    var i: usize = 0;
    while (i < common_len) : (i += 1) {
        const byte_order = std.math.order(toLower(lhs[i]), toLower(rhs[i]));
        if (byte_order != .eq) return byte_order;
    }

    // The common prefix is equal, so the lengths decide.
    return std.math.order(lhs.len, rhs.len);
}

/// Reports whether `lhs` orders strictly before `rhs` under `orderInsensitive`.
pub fn lessThanInsensitive(lhs: []const u8, rhs: []const u8) bool {
    return switch (orderInsensitive(lhs, rhs)) {
        .lt => true,
        .eq, .gt => false,
    };
}