Welcome to other chapters of Let’s Understand Chrome V8

Dispatch is responsible for scheduling bytecode, which is equivalent to registering EIP++, and jumps to the next bytecode. Dispatch consists of two parts, one is a dispatch table and the other is the physical register. The table is an array that contains all bytecode addresses. V8 uses the physical register to dispatch bytecode for getting greater efficiency.

1. Dispatch Table

The dispatch table is an array of pointers, which is used to store the address of the bytecode processing program. The array type is Code, and the Code Class is below:

  class Code: public HeapObject {
    public: NEVER_READ_ONLY_SPACE
    using Flags = uint32_t;
    #define CODE_KIND_LIST(V)\
    V(OPTIMIZED_FUNCTION)\
    V(BYTECODE_HANDLER)\
    V(STUB)\
    V(BUILTIN)\
    V(REGEXP)\
    V(WASM_FUNCTION)\
    V(WASM_TO_CAPI_FUNCTION)\
    V(WASM_TO_JS_FUNCTION)\
    V(JS_TO_WASM_FUNCTION)\
    V(JS_TO_JS_FUNCTION)\
    V(WASM_INTERPRETER_ENTRY)\
    V(C_WASM_ENTRY)
    enum Kind {
      #define DEFINE_CODE_KIND_ENUM(name) name,
        CODE_KIND_LIST(DEFINE_CODE_KIND_ENUM)
      #undef DEFINE_CODE_KIND_ENUM
      NUMBER_OF_KINDS
    };
    static
    const char * Kind2String(Kind kind);
    #ifdef ENABLE_DISASSEMBLER
    const char * GetName(Isolate * isolate) const;
    V8_EXPORT_PRIVATE void Disassemble(const char * name, std::ostream & os,
      Address current_pc = kNullAddress);
    #endif
    //................omit................
  };

Note: In the isolate, there is another pointer array that is the same type as the dispatch table. It holds all the Builtins addresses, which can confuse beginners easily.

The dispatch table is maintained by BuildWithMacroAssembler, code is below:

  Code BuildWithMacroAssembler(Isolate * isolate, int32_t builtin_index,
    MacroAssemblerGenerator generator,
    const char * s_name) {
    HandleScope scope(isolate);
    // Canonicalize handles, so that we can share constant pool entries pointing
    // to code targets without dereferencing their handles.
    CanonicalHandleScope canonical(isolate);
    constexpr int kBufferSize = 32 * KB;
    byte buffer[kBufferSize];
    MacroAssembler masm(isolate, BuiltinAssemblerOptions(isolate, builtin_index),
      CodeObjectRequired::kYes,
      ExternalAssemblerBuffer(buffer, kBufferSize));
    masm.set_builtin_index(builtin_index);
    DCHECK(!masm.has_frame());
    generator( & masm);
    int handler_table_offset = 0;
    // JSEntry builtins are a special case and need to generate a handler table.
    DCHECK_EQ(Builtins::KindOf(Builtins::kJSEntry), Builtins::ASM);
    DCHECK_EQ(Builtins::KindOf(Builtins::kJSConstructEntry), Builtins::ASM);
    DCHECK_EQ(Builtins::KindOf(Builtins::kJSRunMicrotasksEntry), Builtins::ASM);
    if (Builtins::IsJSEntryVariant(builtin_index)) {
      handler_table_offset = HandlerTable::EmitReturnTableStart( & masm);
      HandlerTable::EmitReturnEntry( &
        masm, 0, isolate -> builtins() -> js_entry_handler_offset());
    }
    //.....................................................
    //................omit.............................
  }

You can see Chapter 9 regarding how to debug the BuildWithMacroAssembler. When the value of the parameter builtin_index is 65, the function pointer generator points to Generate_InterpreterEnterBytecodeDispatch which is responsible for maintaining the Dispatch table, code is below:

  static void Generate_InterpreterEnterBytecode(MacroAssembler * masm) {
    // Set the return address to the correct point in the interpreter entry
    // trampoline.
    Label builtin_trampoline, trampoline_loaded;
    Smi interpreter_entry_return_pc_offset(
      masm -> isolate() -> heap() -> interpreter_entry_return_pc_offset());
    DCHECK_NE(interpreter_entry_return_pc_offset, Smi::kZero);
    // If the SFI function_data is an InterpreterData, the function will have a
    // custom copy of the interpreter entry trampoline for profiling. If so,
    // get the custom trampoline, otherwise grab the entry address of the global
    // trampoline.
    __ movq(rbx, Operand(rbp, StandardFrameConstants::kFunctionOffset));
    __ LoadTaggedPointerField(
      rbx, FieldOperand(rbx, JSFunction::kSharedFunctionInfoOffset));
    __ LoadTaggedPointerField(
      rbx, FieldOperand(rbx, SharedFunctionInfo::kFunctionDataOffset));
    __ CmpObjectType(rbx, INTERPRETER_DATA_TYPE, kScratchRegister);
    __ j(not_equal, & builtin_trampoline, Label::kNear);
    __ movq(rbx,
      FieldOperand(rbx, InterpreterData::kInterpreterTrampolineOffset));
    __ addq(rbx, Immediate(Code::kHeaderSize - kHeapObjectTag));
    __ jmp( & trampoline_loaded, Label::kNear);
    __ bind( & builtin_trampoline);
    // TODO(jgruber): Replace this by a lookup in the builtin entry table.
    __ movq(rbx,
      __ ExternalReferenceAsOperand(
        ExternalReference::
        address_of_interpreter_entry_trampoline_instruction_start(
          masm -> isolate()),
        kScratchRegister));
    __ bind( & trampoline_loaded);
    __ addq(rbx, Immediate(interpreter_entry_return_pc_offset.value()));
    __ Push(rbx);
    // Initialize dispatch table register.
    __ Move(
      kInterpreterDispatchTableRegister,
      ExternalReference::interpreter_dispatch_table_address(masm -> isolate()));
    // Get the bytecode array pointer from the frame.
    __ movq(kInterpreterBytecodeArrayRegister,
      Operand(rbp, InterpreterFrameConstants::kBytecodeArrayFromFp));
    if (FLAG_debug_code) {
      // Check function data field is actually a BytecodeArray object.
      __ AssertNotSmi(kInterpreterBytecodeArrayRegister);
      __ CmpObjectType(kInterpreterBytecodeArrayRegister, BYTECODE_ARRAY_TYPE,
        rbx);
      __ Assert(
        equal,
        AbortReason::kFunctionDataShouldBeBytecodeArrayOnInterpreterEntry);
    }
    // Get the target bytecode offset from the frame.
    __ movq(kInterpreterBytecodeOffsetRegister,
      Operand(rbp, InterpreterFrameConstants::kBytecodeOffsetFromFp));
    __ SmiUntag(kInterpreterBytecodeOffsetRegister,
      kInterpreterBytecodeOffsetRegister);
    // Dispatch to the target bytecode.
    __ movzxbq(r11, Operand(kInterpreterBytecodeArrayRegister,
      kInterpreterBytecodeOffsetRegister, times_1, 0));
    __ movq(kJavaScriptCallCodeStartRegister,
      Operand(kInterpreterDispatchTableRegister, r11,
        times_system_pointer_size, 0));
    __ jmp(kJavaScriptCallCodeStartRegister);
  }
  //================================separation=========================
  void Builtins::Generate_InterpreterEnterBytecodeDispatch(MacroAssembler * masm) {
    Generate_InterpreterEnterBytecode(masm);
  }

In the above code, the Generate_InterpreterEnterBytecodeDispatch is the entry point of bytecode dispatch, and the Generate_InterpreterEnterBytecode is the key code that implements the bytecode dispatch. Here are two important concepts:

The Generate_InterpreterEnterBytecodeDispatch is a Builtin and its index is 65. (The index may be dissimilar in different V8 versions)
V8 uses a physical register to manage the dispatch table, the name of the register is kInterpreterDispatchTableRegister. Using the physical register can avoid the push and pop of the dispatch table frequently, to improve efficiency.

Here are the key lines of code：

(1) Line 35, loads the dispatch table to the A register. Line 37 of code, take out the table address from the isolate. The implementation of A is in the below three functions.

ExternalReference ExternalReference::interpreter_dispatch_table_address(
  Isolate * isolate) {
  return ExternalReference(isolate -> interpreter() -> dispatch_table_address());
}
interpreter::Interpreter * interpreter() const {
  return interpreter_;
}
Address dispatch_table_address() {
  return reinterpret_cast < Address > ( & dispatch_table_[0]);
}

(2) Line 39 takes out the bytecode array from the stack.

(3) Line 51 takes out the offset of the target bytecode from the bytecode array.

(4) Lines 56 and 58 calculate the target bytecode address and store the address in kJavaScriptCallCodeStartRegister.

(5) Line 61 jumps to register kJavaScriptCallCodeStartRegister and executes bytecode. Figure 1 is the call stack.

2. TailCall

The Dispatch() is the last step in a bytecode, so its alias is the tail call. Let’s look at a few bytecodes：

  IGNITION_HANDLER(StaGlobal, InterpreterAssembler) {
    TNode < Context > context = GetContext();
    //omit..............
    Goto( & end);
    Bind( & no_feedback);
    CallRuntime(Runtime::kStoreGlobalICNoFeedback_Miss, context, value, name);
    Goto( & end);
    Bind( & end);
    Dispatch(); // !!! this is the tail call.
  }
  IGNITION_HANDLER(LdaContextSlot, InterpreterAssembler) {
    TNode < Context > context = CAST(LoadRegisterAtOperandIndex(0));
    TNode < IntPtrT > slot_index = Signed(BytecodeOperandIdx(1));
    TNode < Uint32T > depth = BytecodeOperandUImm(2);
    TNode < Context > slot_context = GetContextAtDepth(context, depth);
    TNode < Object > result = LoadContextElement(slot_context, slot_index);
    SetAccumulator(result);
    Dispatch(); //!!! here also is.
  }
  IGNITION_HANDLER(LdaImmutableContextSlot, InterpreterAssembler) {
    TNode < Context > context = CAST(LoadRegisterAtOperandIndex(0));
    TNode < IntPtrT > slot_index = Signed(BytecodeOperandIdx(1));
    TNode < Uint32T > depth = BytecodeOperandUImm(2);
    TNode < Context > slot_context = GetContextAtDepth(context, depth);
    TNode < Object > result = LoadContextElement(slot_context, slot_index);
    SetAccumulator(result);
    Dispatch(); //!!! here 
  }

The above three bytecodes all end with Dispatch(). Below is the code of Dispatch:

  void InterpreterAssembler::Dispatch() {
    Comment("========= Dispatch");
    DCHECK_IMPLIES(Bytecodes::MakesCallAlongCriticalPath(bytecode_), made_call_);
    TNode < IntPtrT > target_offset = Advance();
    TNode < WordT > target_bytecode = LoadBytecode(target_offset);
    if (Bytecodes::IsStarLookahead(bytecode_, operand_scale_)) {
      target_bytecode = StarDispatchLookahead(target_bytecode);
    }
    DispatchToBytecode(target_bytecode, BytecodeOffset());
  }

The 2nd line is the comment function. When debugging, we can use its output information to locate the execution position. The 4th line takes out the offset of the target(next) bytecode. The 5th line loads the target bytecode. The 9th line calls DspatchToBytecode() which calls DispatchToBytecodeHandlerEntry() eventually.

void InterpreterAssembler::DispatchToBytecodeHandlerEntry(
  TNode < RawPtrT > handler_entry, TNode < IntPtrT > bytecode_offset) {
  // Propagate speculation poisoning.
  TNode < RawPtrT > poisoned_handler_entry =
    UncheckedCast < RawPtrT > (WordPoisonOnSpeculation(handler_entry));
  TailCallBytecodeDispatch(InterpreterDispatchDescriptor {},
    poisoned_handler_entry, GetAccumulatorUnchecked(),
    bytecode_offset, BytecodeArrayTaggedPointer(),
    DispatchTablePointer());
}

In DispatchToBytecodeHandlerEntry, parameter 1 is the target bytecode and 2 is the offset of the bytecode. Parameter 1 is used to load the bytecode handler and 2 is used to load operands. Then call TailCallBytecodeDispatch, which is below:

  template < class...TArgs >
    void CodeAssembler::TailCallBytecodeDispatch(
      const CallInterfaceDescriptor & descriptor, TNode < RawPtrT > target,
        TArgs...args) {
      DCHECK_EQ(descriptor.GetParameterCount(), sizeof...(args));
      auto call_descriptor = Linkage::GetBytecodeDispatchCallDescriptor(
        zone(), descriptor, descriptor.GetStackParameterCount());
      Node * nodes[] = {
        target,
        args...
      };
      CHECK_EQ(descriptor.GetParameterCount() + 1, arraysize(nodes));
      raw_assembler() -> TailCallN(call_descriptor, arraysize(nodes), nodes);
    }

In the above code, the 5th line creates the descriptor, as below:

  CallDescriptor * Linkage::GetBytecodeDispatchCallDescriptor(
    Zone * zone,
    const CallInterfaceDescriptor & descriptor,
      int stack_parameter_count) {
    const int register_parameter_count = descriptor.GetRegisterParameterCount();
    const int parameter_count = register_parameter_count + stack_parameter_count;
    DCHECK_EQ(descriptor.GetReturnCount(), 1);
    LocationSignature::Builder locations(zone, 1, parameter_count);
    locations.AddReturn(regloc(kReturnRegister0, descriptor.GetReturnType(0)));
    for (int i = 0; i < parameter_count; i++) {
      if (i < register_parameter_count) {
        // The first parameters go in registers.
        Register reg = descriptor.GetRegisterParameter(i);
        MachineType type = descriptor.GetParameterType(i);
        locations.AddParam(regloc(reg, type));
      } else {
        int stack_slot = i - register_parameter_count - stack_parameter_count;
        locations.AddParam(LinkageLocation::ForCallerFrameSlot(
          stack_slot, MachineType::AnyTagged()));
      }
    }
    MachineType target_type = MachineType::Pointer();
    LinkageLocation target_loc = LinkageLocation::ForAnyRegister(target_type);
    const CallDescriptor::Flags kFlags =
      CallDescriptor::kCanUseRoots | CallDescriptor::kFixedTargetRegister;
    return new(zone) CallDescriptor( // --
      CallDescriptor::kCallAddress, // kind
      target_type, // target MachineType
      target_loc, // target location
      locations.Build(), // location_sig
      stack_parameter_count, // stack_parameter_count
      Operator::kNoProperties, // properties
      kNoCalleeSaved, // callee-saved registers
      kNoCalleeSaved, // callee-saved fp
      kFlags, // flags
      descriptor.DebugName());
  }

In GetBytecodeDispatchCallDescriptor, parameter 2 is the target bytecode, and parameter 3 is the stack parameter count. This function is used to apply for register resources and create a CallDescriptor. The CallDescriptor gives what parameters to supply when executing a bytecode. The comments on lines 24 to 25 give its layout.

In TailCallBytecodeDispatch, the TailCallN() on the 9th line is below, we see that it’s parameter 1 is CallDescriptor.

  void RawMachineAssembler::TailCallN(CallDescriptor * call_descriptor,
    int input_count, Node *
    const * inputs) {
    // +1 is for target.
    DCHECK_EQ(input_count, call_descriptor -> ParameterCount() + 1);
    Node * tail_call =
      MakeNode(common() -> TailCall(call_descriptor), input_count, inputs);
    schedule() -> AddTailCall(CurrentBlock(), tail_call);
    current_block_ = nullptr;
  }

The 5th line generates a node. In the 7th line AddTailCall() adds the node to the end of the current basic block. So far, the dispatch is completed and the next bytecode is started to execute. Figure 2 shows the function call stack.

Okay, that wraps it up for this share. I’ll see you guys next time, take care!

Please reach out to me if you have any issues.

WeChat: qq9123013 Email: [email protected]

Also published here.

Let's Understand Chrome V8 — Chapter 11: Bytecode Dispatch

1. Dispatch Table

2. TailCall