Generate builtins list for the smith fuzzer

This moves one more duplicated copy of the builtins list into generate_keywords.py as the single source of truth, resolving a TODO in the smith fuzzer. This does once more shuffle all of the builtins around in the fuzzer, which makes the existing fuzz corpus mostly meaningless. Fortunately, this should be the last time that this happens: with the new approach we can modify the builtins with minimal changes to the meaning of the fuzz corpus, which is something that I have wanted for a long time.
2025-10-09 16:32:11 +00:00 · 2025-03-15 19:21:55 +01:00 · 2025-03-15 19:21:55 +01:00 · 5bcc7e74c7
commit 5bcc7e74c7
parent 34e347a387
4 changed files with 189 additions and 50 deletions
--- a/tools/generate_keywords.py
+++ b/tools/generate_keywords.py
@ -14,11 +14,12 @@ so that there is a single source of truth. We take the Pygments lexer in
 importable here.
 """

+import hashlib
+import json
 import os
 import sys
-import json

-from typing import Iterable
+from typing import Iterable, List

 sys.path.append(os.path.join(os.path.dirname(__file__), ".."))

@ -26,6 +27,9 @@ from grammar.pygments.rcl import KEYWORDS, BUILTINS, TYPES


 def generate_fuzz_dictionary() -> None:
+    """
+    Take dictionary_base.txt and append all keywords to it.
+    """
    base = open("fuzz/dictionary_base.txt", "r", encoding="utf-8").read()
    with open("fuzz/dictionary.txt", "w", encoding="utf-8") as f:
        f.write("# This file is generated by tools/generate_keywords.py.\n\n")
@ -42,5 +46,78 @@ def generate_fuzz_dictionary() -> None:
        write_section("Builtin types.", TYPES)


+def hash_keyword(slot: int, keyword: str) -> bytes:
+    """
+    Hash a (slot, keyword) pair, used to build the smith dictionaries below.
+    We fix a particular hash function so the output is stable across Python
+    versions.
+
+    The slot is fed to the hash as a single byte, followed by the UTF-8
+    encoding of the keyword. Returns the BLAKE2s digest. Raises OverflowError
+    when the slot does not fit in one unsigned byte.
+    """
+    # Spell out the length and byte order: calling `to_bytes()` without
+    # arguments is only accepted from Python 3.11 on, where these exact values
+    # became the defaults. Passing them explicitly yields the same bytes there
+    # and keeps the function usable on older interpreters too.
+    h = hashlib.blake2s(slot.to_bytes(1, "big"), usedforsecurity=False)
+    h.update(keyword.encode("utf-8"))
+    return h.digest()
+
+
+def _rendezvous_slots(keywords: Iterable[str], num_slots: int = 256) -> List[str]:
+    """
+    Assign one keyword to each of `num_slots` slots using rendezvous hashing.
+
+    For every slot, the keyword with the smallest `hash_keyword(slot, keyword)`
+    wins; ties are broken by the keyword itself. The winner of a slot depends
+    only on the hashes for that slot, so adding a keyword can only change the
+    slots that the new keyword wins, and removing one only changes the slots
+    it used to hold.
+    """
+    candidates = list(keywords)
+    return [
+        min((hash_keyword(slot, keyword), keyword) for keyword in candidates)[1]
+        for slot in range(num_slots)
+    ]
+
+
+def _require_all_placed(
+    keywords: Iterable[str], slots: List[str], message: str
+) -> None:
+    """
+    Raise AssertionError with `message` unless every keyword occurs in `slots`.
+    """
+    placed = set(slots)
+    if any(keyword not in placed for keyword in keywords):
+        # Raised explicitly rather than with `assert`, so that the check is
+        # not stripped when Python runs with -O.
+        raise AssertionError(message)
+
+
+def _format_rust_str_array(name: str, values: List[str], per_line: int) -> str:
+    """
+    Format `values` as a Rust `pub const` array of string literals named
+    `name`, with `per_line` elements on every line.
+    """
+    parts = [
+        "#[rustfmt::skip]\n",
+        f"pub const {name}: [&str; {len(values)}] = [",
+    ]
+    for i, value in enumerate(values):
+        if i % per_line == 0:
+            parts.append("\n    ")
+        # NOTE(review): JSON escaping is used to quote the string; this is
+        # presumably fine because the keywords are plain ASCII identifiers,
+        # but JSON and Rust escapes differ for other characters -- confirm.
+        parts.append(" " + json.dumps(value) + ",")
+    parts.append("\n];\n")
+    return "".join(parts)
+
+
+def generate_smith_builtins() -> None:
+    """
+    Generate a Rust file with the builtins, for use by the smith fuzzer.
+
+    For the smith fuzzer, we generate an array of keywords and then the
+    fuzzer picks one based on a u8 from the seed. However, if we just
+    put the values there in order, then any time we add a builtin, that
+    changes the meaning of the existing fuzz corpus, and the fuzzer has
+    to run for a long time to "learn" the new meaning. It's also not
+    ideal when switching branches. So instead, we use rendezvous hashing
+    to generate a slice of 256 elements, such that the changes to the
+    slice are minimal when the builtins change.
+
+    Writes fuzz/src/builtins.rs. Raises AssertionError when there are more
+    than 256 types or builtins, or when one of them did not end up in any
+    slot.
+    """
+    if len(TYPES) > 256:
+        raise AssertionError("Smith keyword indices are only 8 bits.")
+    if len(BUILTINS) > 256:
+        raise AssertionError("Smith keyword indices are only 8 bits.")
+
+    # Fill up the 256 slots in these lists using rendezvous hashing.
+    types_list = _rendezvous_slots(TYPES)
+    builtins_list = _rendezvous_slots(BUILTINS)
+
+    # There is no theoretical guarantee every type/builtin got placed into
+    # one of the slots, so confirm that here. We have plenty of slots so
+    # it's unlikely to fail, but if it does fail, we should change the salt
+    # for the hash function.
+    _require_all_placed(
+        TYPES, types_list, "Generated slice must contain all types."
+    )
+    _require_all_placed(
+        BUILTINS, builtins_list, "Generated slice must contain all builtins."
+    )
+
+    with open("fuzz/src/builtins.rs", "w", encoding="utf-8") as f:
+        f.write("// This file is generated by tools/generate_keywords.py.\n")
+        f.write("// See that file for why these arrays contain duplicates.\n\n")
+        f.write(_format_rust_str_array("BUILTIN_TYPES", types_list, per_line=8))
+        f.write("\n")
+        f.write(_format_rust_str_array("BUILTINS", builtins_list, per_line=4))
+
+
 if __name__ == "__main__":
+    # Regenerate both derived files: fuzz/dictionary.txt and fuzz/src/builtins.rs.
    generate_fuzz_dictionary()
+    generate_smith_builtins()