RustPython/scripts/generate_opcode_metadata.py

"""
Generate Lib/_opcode_metadata.py for RustPython bytecode.

This script generates opcode metadata that uses CPython 3.14-compatible opcode numbers.
"""

import itertools
import pathlib
import re
import typing

# Two directory levels above this script (presumably the repository root).
ROOT = pathlib.Path(__file__).parents[1]
# Rust source file that is parsed for the instruction enums and deopt().
BYTECODE_FILE = ROOT.joinpath(
    "crates", "compiler-core", "src", "bytecode", "instructions.rs"
)
# Python module that this script (re)generates.
OPCODE_METADATA_FILE = ROOT.joinpath("Lib", "_opcode_metadata.py")


# Opcodes that need to come first, regardless of their opcode ID.
# ``Opcode.__lt__`` orders members of this set ahead of every other opcode,
# which determines the entry order of the generated ``opmap`` (it is built
# from ``sorted(opcodes)``).
PRIORITY_OPMAP = {
    "CACHE",
    "RESERVED",
    "RESUME",
    "INSTRUMENTED_LINE",
    "ENTER_EXECUTOR",
}


def to_snake_case(s: str) -> str:
    """Convert a CamelCase name into an UPPER_SNAKE_CASE name.

    An underscore is inserted before every uppercase letter that directly
    follows a lowercase letter or a digit, and before a trailing run of
    digits. The result is then upper-cased.

    >>> to_snake_case("CallIntrinsic1")
    'CALL_INTRINSIC_1'
    """
    # Word boundaries: "LoadFast" -> "Load_Fast".
    words_split = re.sub(r"(?<=[a-z0-9])([A-Z])", r"_\1", s)
    # Trailing number: "Intrinsic1" -> "Intrinsic_1".
    digits_split = re.sub(r"(\D)(\d+)$", r"\1_\2", words_split)
    return digits_split.upper()


class Opcode(typing.NamedTuple):
    rust_name: str
    id: int
    have_oparg: bool

    @property
    def is_instrumented(self):
        return self.cpython_name.startswith("INSTRUMENTED_")

    @property
    def cpython_name(self):
        return to_snake_case(self.rust_name)

    @classmethod
    def from_str(cls, text: str):
        # Split on commas that are followed by a newline + an uppercase letter (new entry)
        entries = re.split(r",\s*\n\s*(?=[A-Z])", text)
        for entry in entries:
            entry = entry.strip()
            if not entry:
                continue
            have_oparg = "Arg<" in entry  # Hacky but works
            rust_name = re.match(r"(\w+)", entry).group(1)
            id_num = re.findall(r"= (\d+)", entry)[0]
            yield cls(rust_name=rust_name, id=int(id_num), have_oparg=have_oparg)

    def __lt__(self, other: typing.Self) -> bool:
        sprio, oprio = (
            opcode.cpython_name not in PRIORITY_OPMAP for opcode in (self, other)
        )
        return (sprio, self.id) < (oprio, other.id)


def extract_enum_body(text: str, name: str) -> str:
    # Find the start of the enum block
    start_match = re.search(rf"enum\s+{name}\s*\{{", text)
    if not start_match:
        return None

    # Manually track brace depth from that point
    depth = 0
    start = start_match.end() - 1  # position of opening '{'
    for i, ch in enumerate(text[start:], start):
        if ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
            if depth == 0:
                # Return only the inner content (excluding outer braces)
                return text[start + 1 : i]


def build_deopts(text: str) -> dict[str, list[str]]:
    """Map each base opcode to the opcodes that deoptimize to it.

    The mapping is read from the ``match self { ... }`` block of the Rust
    ``fn deopt(self)`` method. Every arm of the form
    ``Self::A | Self::B => Self::Base`` contributes ``A`` and ``B`` to the
    entry for ``Base``. All names are converted with ``to_snake_case``.

    Args:
        text: Rust source code containing the ``deopt`` method.

    Returns:
        A dict from base opcode name to the list of opcode names on the
        left-hand side of its arm(s), in source order. Arms that share a
        target are merged rather than overwriting each other.

    Raises:
        ValueError: If the ``deopt`` method or its ``match self`` block
            cannot be found, or the block's closing brace is missing.
    """
    method = re.search(r"fn deopt\(self\)(.*)", text, re.DOTALL)
    if method is None:
        raise ValueError("Could not find the deopt method")
    raw_body = method.group(1)

    match_start = raw_body.find("match self")
    if match_start == -1:
        raise ValueError("Could not detect a match statement in deopt method")

    # Locate the match block by tracking brace depth from "match self" on.
    brace_depth = 0
    block_start = None
    block_end = None
    for i in range(match_start, len(raw_body)):
        ch = raw_body[i]
        if ch == "{":
            brace_depth += 1
            if block_start is None:
                # First '{' after "match self" opens the match block.
                block_start = i + 1
        elif ch == "}":
            brace_depth -= 1
            if brace_depth == 0:
                block_end = i
                break

    # block_end can only be set after block_start, so one check covers both.
    if block_end is None:
        raise ValueError("Could not find the end of the match block in deopt method")

    match_body = raw_body[block_start:block_end]

    # Left side: one or more "Self::X" joined by '|'. Right side: "Self::Y",
    # optionally preceded by the '{' of a block-bodied arm.
    arm_pattern = re.compile(
        r"((?:Self::\w+\s*\|\s*)*Self::\w+)\s*=>\s*(?:\{\s*)?Self::(\w+)", re.DOTALL
    )
    variants_pattern = re.compile(r"Self::(\w+)")

    deopts: dict[str, list[str]] = {}
    for hit in arm_pattern.finditer(match_body):
        variants = variants_pattern.findall(hit.group(1))
        key = to_snake_case(hit.group(2))
        # Merge so a target that appears in several arms keeps every variant.
        deopts.setdefault(key, []).extend(
            to_snake_case(variant) for variant in variants
        )

    return deopts


# --- Parse the Rust source -------------------------------------------------
contents = BYTECODE_FILE.read_text(encoding="utf-8")

deopts = build_deopts(contents)

# Both enums are parsed as one combined body. Fail with a clear message when
# an enum is missing instead of passing None on to str.join().
enum_bodies = []
for enum_name in ("Instruction", "PseudoInstruction"):
    body = extract_enum_body(contents, enum_name)
    if body is None:
        raise ValueError(f"Could not find enum {enum_name} in {BYTECODE_FILE}")
    enum_bodies.append(body)
enum_body = "\n".join(enum_bodies)
opcodes = list(Opcode.from_str(enum_body))

# One less than the lowest ID among opcodes that take an oparg.
have_oparg = min(opcode.id for opcode in opcodes if opcode.have_oparg) - 1
# Lowest ID among opcodes whose name starts with INSTRUMENTED_.
min_instrumented = min(opcode.id for opcode in opcodes if opcode.is_instrumented)

# --- Generate the output file ----------------------------------------------
# Collect the pieces in a list and join once instead of repeated "+=".
parts = [
    """# This file is generated by scripts/generate_opcode_metadata.py
# for RustPython bytecode format (CPython 3.14 compatible opcode numbers).
# Do not edit!
"""
]

# _specializations: base opcode -> list of opcodes that deopt to it.
parts.append("\n_specializations = {\n")
for key, lst in deopts.items():
    parts.append(f'    "{key}": [\n')
    parts.extend(f'        "{item}",\n' for item in lst)
    parts.append("    ],\n")
parts.append("}\n")

# Every opcode that appears as a value in the deopt table.
specialized = set(itertools.chain.from_iterable(deopts.values()))

# _specialized_opmap: only the specialized opcodes, ordered by name.
parts.append("\n_specialized_opmap = {\n")
for opcode in sorted(opcodes, key=lambda op: op.cpython_name):
    cpython_name = opcode.cpython_name
    if cpython_name in specialized:
        parts.append(f"    '{cpython_name}': {opcode.id},\n")
parts.append("}\n")

# opmap: all remaining opcodes, ordered by Opcode.__lt__.
parts.append("\nopmap = {\n")
for opcode in sorted(opcodes):
    cpython_name = opcode.cpython_name
    if cpython_name not in specialized:
        parts.append(f"    '{cpython_name}': {opcode.id},\n")
parts.append("}\n")

parts.append(
    f"""
HAVE_ARGUMENT = {have_oparg}
MIN_INSTRUMENTED_OPCODE = {min_instrumented}
"""
)

output = "".join(parts)

OPCODE_METADATA_FILE.write_text(output, encoding="utf-8")