Implement initial Roc tokenizer in Zig

This commit is contained in:
Joshua Warner 2025-01-31 18:33:59 -08:00
parent 5e4ff44483
commit ce8f7065db
No known key found for this signature in database
GPG key ID: 89AD497003F93FDD
8 changed files with 1497 additions and 72 deletions

View file

@ -9,12 +9,16 @@ pub fn build(b: *std.Build) void {
const target = b.standardTargetOptions(.{});
const optimize = b.standardOptimizeOption(.{});
// Zig unicode library - https://codeberg.org/atman/zg
const zg = b.dependency("zg", .{});
const exe = b.addExecutable(.{
.name = "roc",
.root_source_file = b.path("src/main.zig"),
.target = target,
.optimize = optimize,
});
exe.root_module.addImport("GenCatData", zg.module("GenCatData"));
b.installArtifact(exe);
@ -34,6 +38,7 @@ pub fn build(b: *std.Build) void {
.target = target,
.optimize = optimize,
});
all_tests.root_module.addImport("GenCatData", zg.module("GenCatData"));
// Install the test binary so we can run separately
// ```sh

View file

@ -7,6 +7,10 @@
.url = "git+https://github.com/kristoff-it/zig-afl-kit#88c6b71377767c1b8d26979b0adfa12a58d988dd",
.hash = "1220796f7d2d9a2d4d7f8339ee0b14aa4bf133a15ae9ba39c941cc68e08d5c5ce9a2",
},
.zg = .{
.url = "https://codeberg.org/dude_the_builder/zg/archive/v0.13.2.tar.gz",
.hash = "122055beff332830a391e9895c044d33b15ea21063779557024b46169fb1984c6e40",
},
},
.paths = .{
"build.zig",

View file

@ -1,74 +1,79 @@
@0-21 Defs(
Defs {
tags: [
EitherIndex(2147483648),
],
regions: [
@0-17,
],
space_before: [
Slice<roc_parse::ast::CommentOrNewline> { start: 0, length: 0 },
],
space_after: [
Slice<roc_parse::ast::CommentOrNewline> { start: 0, length: 0 },
],
spaces: [],
type_defs: [],
value_defs: [
Annotation(
@0-2 SpaceAfter(
NumLiteral(
"1P",
@0-21 SpaceAfter(
Defs(
Defs {
tags: [
EitherIndex(2147483648),
],
regions: [
@0-17,
],
space_before: [
Slice<roc_parse::ast::CommentOrNewline> { start: 0, length: 0 },
],
space_after: [
Slice<roc_parse::ast::CommentOrNewline> { start: 0, length: 0 },
],
spaces: [],
type_defs: [],
value_defs: [
Annotation(
@0-2 SpaceAfter(
NumLiteral(
"11",
),
[
Newline,
],
),
[
Newline,
],
),
@4-17 Tuple {
elems: [
@5-15 Function(
[
@5-6 Apply(
"",
"I",
[],
),
@7-8 SpaceAfter(
BoundVariable(
"s",
@4-17 Tuple {
elems: [
@5-15 Function(
[
@5-6 Apply(
"",
"I",
[],
),
[
Newline,
],
@7-8 SpaceAfter(
BoundVariable(
"s",
),
[
Newline,
],
),
@10-12 Apply(
"",
"Mw",
[],
),
],
Pure,
@14-15 BoundVariable(
"r",
),
@10-12 Apply(
"",
"Mw",
[],
),
],
Pure,
@14-15 BoundVariable(
"r",
),
],
ext: Some(
@16-17 BoundVariable(
"l",
),
),
],
ext: Some(
@16-17 BoundVariable(
"l",
),
),
},
),
],
},
@18-21 SpaceBefore(
Var {
module_name: "",
ident: "asl",
},
),
],
},
[
Newline,
],
@18-21 SpaceBefore(
Var {
module_name: "",
ident: "asl",
},
[
Newline,
],
),
),
[
Newline,
],
)

View file

@ -1,4 +1,4 @@
1P
11
:(I,s
,Mw->r)l
asl
asl

195
src/check/parse.zig Normal file
View file

@ -0,0 +1,195 @@
const std = @import("std");
const tokenize = @import("tokenize.zig");
/// A byte span into the source buffer.
// NOTE(review): assumed half-open (start inclusive, end exclusive) by
// convention — confirm at use sites once nodes are actually produced.
pub const Region = struct {
    start: usize,
    end: usize,
};
/// A single parse-tree node, stored columnar-style in the Parser's
/// `std.MultiArrayList(Node)`.
pub const Node = struct {
    tag: Tag, // discriminant selecting which `Data` field is active
    data: Data,
    region: Region, // source span this node covers
    /// Kind of syntax node; mirrors the fields of `Data`.
    pub const Tag = enum {
        Unary,
        Binary,
        // TODO
    };
    /// Untagged payload union: it carries no runtime discriminant of its
    /// own, so `Node.tag` must always be consulted to pick the active field.
    pub const Data = union {
        Unary: UnaryOpData,
        Binary: BinaryOpData,
        // Add more node data as needed
    };
    pub const UnaryOpData = struct {
        // TODO
    };
    pub const BinaryOpData = struct {
        // TODO
    };
};
/// A parser diagnostic (error/warning) anchored to a source region.
// NOTE(review): currently unused — `Parser.diagnostics` stores
// `tokenize.Diagnostic` instead; confirm which type is intended.
pub const Diagnostic = struct {
    tag: Tag,
    region: Region,
    /// Diagnostic kind; variants to be filled in as the parser matures.
    pub const Tag = enum {
        // TODO
    };
};
/// Hand-written recursive-descent parser over a tokenized buffer.
/// Owns a growing node list and a diagnostics list.
// NOTE(review): there is no `deinit`, so `nodes`/`diagnostics` storage is
// never freed — confirm callers are expected to use an arena, or add one.
pub const Parser = struct {
    pos: usize, // index of the next token to inspect/consume
    tokens: tokenize.TokenizedBuffer,
    nodes: std.MultiArrayList(Node),
    // NOTE(review): this holds `tokenize.Diagnostic`, not the `Diagnostic`
    // type declared in this file — confirm which one is intended.
    diagnostics: std.ArrayList(tokenize.Diagnostic),
    allocator: std.mem.Allocator,
    /// Create a parser positioned at the first token. Takes the token
    /// buffer by value; the allocator is retained for node/diagnostic growth.
    pub fn init(tokens: tokenize.TokenizedBuffer, allocator: std.mem.Allocator) Parser {
        return Parser{
            .pos = 0,
            .tokens = tokens,
            .nodes = std.MultiArrayList(Node){},
            .diagnostics = std.ArrayList(tokenize.Diagnostic).init(allocator),
            .allocator = allocator,
        };
    }
    /// Consume one token (no-op once past the end of the buffer).
    /// Debug-prints the consumed token's tag.
    pub fn advance(self: *Parser) void {
        if (self.pos >= self.tokens.tokens.len) {
            return;
        }
        std.debug.print("advance {s}\n", .{@tagName(self.tokens.tokens.items(.tag)[self.pos])});
        self.pos += 1;
    }
    /// Look at the current token's tag without consuming it.
    /// Past the end of the buffer, synthesizes `.EndOfFile`.
    pub fn peek(self: *Parser) tokenize.Token.Tag {
        if (self.pos >= self.tokens.tokens.len) {
            return .EndOfFile;
        }
        return self.tokens.tokens.items(.tag)[self.pos];
    }
    // If the next token is a newline, consume it
    // Returns the indent level of the next line if it is a newline, otherwise null
    // NOTE(review): assumes the tokenizer stores the *following* line's indent
    // in the Newline token's `offset` field (see the test below, which calls
    // `pushNewline(indent)`) — confirm against tokenize.zig. The `@intCast`
    // to u16 is unchecked and will trap in safe modes if indent > 65535.
    pub fn consumeNewline(self: *Parser) ?u16 {
        if (self.peek() != .Newline) {
            return null;
        }
        const indent = self.tokens.tokens.items(.offset)[self.pos];
        self.advance();
        return @intCast(indent);
    }
    // Returns the indent level of the next line if the next token is a newline, otherwise null
    // Same as `consumeNewline`, but does not move `pos`.
    pub fn peekNewline(self: *Parser) ?u16 {
        if (self.peek() != .Newline) {
            return null;
        }
        const indent = self.tokens.tokens.items(.offset)[self.pos];
        return @intCast(indent);
    }
    /// Parse the whole token stream as a sequence of top-level statements.
    /// Top-level statements must start at indent 0 (currently asserted
    /// rather than reported as a diagnostic — see TODO).
    pub fn parseFile(self: *Parser) !void {
        while (self.peek() != .EndOfFile) {
            if (self.consumeNewline()) |indent| {
                std.debug.print("parseFile indent {d}\n", .{indent});
                std.debug.assert(indent == 0); // TODO: report an error
            }
            // A trailing newline may have consumed the last token before EOF.
            if (self.peek() == .EndOfFile) {
                break;
            }
            self.parseStmt(0);
        }
    }
    /// Parse one statement at the given enclosing indent level.
    /// Currently only `ident = ...` assignments are recognized; the
    /// expression-statement branch consumes the ident but builds nothing yet.
    pub fn parseStmt(self: *Parser, base_indent: u16) void {
        switch (self.peek()) {
            .LowerIdent => {
                self.advance();
                if (self.peek() == .OpEquals) {
                    self.finishParseAssign(base_indent);
                    std.debug.print("parseStmt assign\n", .{});
                } else {
                    std.debug.print("parseStmt expr\n", .{});
                }
            },
            else => {
                std.debug.panic("todo: emit error, unexpected token {s}", .{@tagName(self.peek())});
            },
        }
    }
    /// Parse one expression. Only bare identifiers and integer literals are
    /// handled; everything else panics (error reporting is TODO).
    pub fn parseExpr(self: *Parser) void {
        switch (self.peek()) {
            .LowerIdent => {
                self.advance();
                std.debug.print("parseExpr {s}\n", .{@tagName(self.peek())});
                // TODO: add node
            },
            .Int => {
                self.advance();
                std.debug.print("parseExpr {s}\n", .{@tagName(self.peek())});
                // TODO: add node
            },
            else => {
                std.debug.panic("todo: emit error", .{});
            },
        }
    }
    /// Finish parsing `ident = ...` after the identifier: consumes the `=`,
    /// then either a same-line expression, or an indented block of statements
    /// terminated by a line at or below `base_indent`.
    pub fn finishParseAssign(self: *Parser, base_indent: u16) void {
        std.debug.assert(self.peek() == .OpEquals);
        self.advance();
        if (self.consumeNewline()) |indent| {
            std.debug.print("startParseAssign indent {d}\n", .{indent});
            if (indent <= base_indent) {
                std.debug.panic("todo: emit error", .{});
            }
            self.parseStmt(indent);
            while (true) {
                if (self.peekNewline()) |i| {
                    // NOTE(review): continuation lines are only required to be
                    // deeper than `base_indent`, not to match `indent` — a line
                    // indented between the two is accepted; confirm intended.
                    if (i <= base_indent) {
                        break;
                    }
                    self.advance();
                } else {
                    break;
                }
                self.parseStmt(indent);
            }
        } else {
            self.parseExpr();
        }
        std.debug.print("finishParseAssign\n", .{});
    }
};
test "Parser advance and peek" {
    // Use the page allocator: token/node storage is never freed yet, so a
    // leak-checking allocator would fail this test.
    const gpa = std.heap.page_allocator;
    var buf = try tokenize.TokenizedBuffer.init(gpa);
    // Hand-build the token stream for:
    //   x =
    //       y = 1
    //       y
    try buf.pushToken(.LowerIdent, 0, 1);
    try buf.pushToken(.OpEquals, 0, 0);
    try buf.pushNewline(4);
    try buf.pushToken(.LowerIdent, 0, 0);
    try buf.pushToken(.OpEquals, 0, 0);
    try buf.pushToken(.Int, 0, 0);
    try buf.pushNewline(4);
    try buf.pushToken(.LowerIdent, 0, 0);
    try buf.pushNewline(0);
    try buf.pushToken(.EndOfFile, 0, 0);
    var p = Parser.init(buf, gpa);
    try p.parseFile();
}

1206
src/check/tokenize.zig Normal file

File diff suppressed because it is too large Load diff

View file

@ -3,6 +3,7 @@ const mem = std.mem;
const Allocator = std.mem.Allocator;
const RocCmd = @import("cli.zig").RocCmd;
const RocOpt = @import("cli.zig").RocOpt;
const syntax = @import("check/syntax.zig");
const usage =
\\Usage:
@ -138,11 +139,20 @@ fn rocVersion(allocator: Allocator, args: []const []const u8) !void {
}
fn rocCheck(allocator: Allocator, opt: RocOpt, args: []const []const u8) !void {
_ = allocator;
var syn = try syntax.Syntax.init(allocator);
defer syn.deinit();
_ = opt;
std.debug.print("TODO roc check\n{}\n{s}\n\n", .{ opt, args });
// Temporary implementation for early testing
const dir_path = args[0];
const fs = std.fs.cwd();
var dir = try fs.openDir(dir_path, .{ .iterate = true });
defer dir.close();
fatal("not implemented", .{});
const success = try syn.tokenizeAndCheckSyntaxFiles(dir);
if (!success) {
fatal("syntax check failed", .{});
}
}
fn rocDocs(allocator: Allocator, opt: RocOpt, args: []const []const u8) !void {