diff --git a/Backport-JDK-8315743-8315856-8344010-8344382-RISC-V-Use-Zacas-extension-for-cmpxchg.patch b/Backport-JDK-8315743-8315856-8344010-8344382-RISC-V-Use-Zacas-extension-for-cmpxchg.patch new file mode 100644 index 0000000000000000000000000000000000000000..411f0e38b82c2cbc7b413f0d8f5674650274ab75 --- /dev/null +++ b/Backport-JDK-8315743-8315856-8344010-8344382-RISC-V-Use-Zacas-extension-for-cmpxchg.patch @@ -0,0 +1,419 @@ +diff --git a/src/hotspot/cpu/riscv/assembler_riscv.hpp b/src/hotspot/cpu/riscv/assembler_riscv.hpp +index 44c60de74..522550a07 100644 +--- a/src/hotspot/cpu/riscv/assembler_riscv.hpp ++++ b/src/hotspot/cpu/riscv/assembler_riscv.hpp +@@ -851,6 +851,8 @@ enum Aqrl {relaxed = 0b00, rl = 0b01, aq = 0b10, aqrl = 0b11}; + INSN(amomax_d , 0b0101111, 0b011, 0b10100); + INSN(amominu_d, 0b0101111, 0b011, 0b11000); + INSN(amomaxu_d, 0b0101111, 0b011, 0b11100); ++ INSN(amocas_w, 0b0101111, 0b010, 0b00101); ++ INSN(amocas_d, 0b0101111, 0b011, 0b00101); + #undef INSN + + enum operand_size { int8, int16, int32, uint32, int64 }; +diff --git a/src/hotspot/cpu/riscv/globals_riscv.hpp b/src/hotspot/cpu/riscv/globals_riscv.hpp +index cae52c8de..2c18805ec 100644 +--- a/src/hotspot/cpu/riscv/globals_riscv.hpp ++++ b/src/hotspot/cpu/riscv/globals_riscv.hpp +@@ -106,6 +106,7 @@ define_pd_global(intx, InlineSmallCode, 1000); + product(bool, UseZba, false, DIAGNOSTIC, "Use Zba instructions") \ + product(bool, UseZbb, false, DIAGNOSTIC, "Use Zbb instructions") \ + product(bool, UseZbs, false, DIAGNOSTIC, "Use Zbs instructions") \ ++ product(bool, UseZacas, false, EXPERIMENTAL, "Use Zacas instructions") \ + product(bool, UseZfa, false, EXPERIMENTAL, "Use Zfa instructions") \ + product(bool, UseZic64b, false, EXPERIMENTAL, "Use Zic64b instructions") \ + product(bool, UseZicbom, false, EXPERIMENTAL, "Use Zicbom instructions") \ +diff --git a/src/hotspot/cpu/riscv/macroAssembler_riscv.cpp b/src/hotspot/cpu/riscv/macroAssembler_riscv.cpp +index 419783e16..e411b8956 100644 +--- a/src/hotspot/cpu/riscv/macroAssembler_riscv.cpp ++++ b/src/hotspot/cpu/riscv/macroAssembler_riscv.cpp +@@ -3246,20 +3246,29 @@ void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Reg + // oldv holds comparison value + // newv holds value to write in exchange + // addr identifies memory word to compare against/update +- Label retry_load, nope; +- bind(retry_load); +- // Load reserved from the memory location +- lr_d(tmp, addr, Assembler::aqrl); +- // Fail and exit if it is not what we expect +- bne(tmp, oldv, nope); +- // If the store conditional succeeds, tmp will be zero +- sc_d(tmp, newv, addr, Assembler::rl); +- beqz(tmp, succeed); +- // Retry only when the store conditional failed +- j(retry_load); +- +- bind(nope); ++ if (UseZacas) { ++ mv(tmp, oldv); ++ atomic_cas(tmp, newv, addr, Assembler::int64, Assembler::aq, Assembler::rl); ++ beq(tmp, oldv, succeed); ++ } else { ++ Label retry_load, nope; ++ bind(retry_load); ++ // Load reserved from the memory location ++ load_reserved(tmp, addr, int64, Assembler::aqrl); ++ // Fail and exit if it is not what we expect ++ bne(tmp, oldv, nope); ++ // If the store conditional succeeds, tmp will be zero ++ store_conditional(tmp, newv, addr, int64, Assembler::rl); ++ beqz(tmp, succeed); ++ // Retry only when the store conditional failed ++ j(retry_load); ++ ++ bind(nope); ++ } ++ ++ // neither amocas nor lr/sc have an implied barrier in the failing case + membar(AnyAny); ++ + mv(oldv, tmp); + if (fail != nullptr) { + j(*fail); +@@ -3272,18 +3281,19 @@ void 
MacroAssembler::cmpxchg_obj_header(Register oldv, Register newv, Register o + cmpxchgptr(oldv, newv, obj, tmp, succeed, fail); + } + +-void MacroAssembler::load_reserved(Register addr, ++void MacroAssembler::load_reserved(Register dst, ++ Register addr, + enum operand_size size, + Assembler::Aqrl acquire) { + switch (size) { + case int64: +- lr_d(t0, addr, acquire); ++ lr_d(dst, addr, acquire); + break; + case int32: +- lr_w(t0, addr, acquire); ++ lr_w(dst, addr, acquire); + break; + case uint32: +- lr_w(t0, addr, acquire); ++ lr_w(dst, addr, acquire); + zero_extend(t0, t0, 32); + break; + default: +@@ -3291,17 +3301,18 @@ void MacroAssembler::load_reserved(Register addr, + } + } + +-void MacroAssembler::store_conditional(Register addr, ++void MacroAssembler::store_conditional(Register dst, + Register new_val, ++ Register addr, + enum operand_size size, + Assembler::Aqrl release) { + switch (size) { + case int64: +- sc_d(t0, new_val, addr, release); ++ sc_d(dst, new_val, addr, release); + break; + case int32: + case uint32: +- sc_w(t0, new_val, addr, release); ++ sc_w(dst, new_val, addr, release); + break; + default: + ShouldNotReachHere(); +@@ -3309,14 +3320,11 @@ void MacroAssembler::store_conditional(Register addr, + } + + +-void MacroAssembler::cmpxchg_narrow_value_helper(Register addr, Register expected, +- Register new_val, ++void MacroAssembler::cmpxchg_narrow_value_helper(Register addr, Register expected, Register new_val, + enum operand_size size, +- Register tmp1, Register tmp2, Register tmp3) { ++ Register shift, Register mask, Register aligned_addr) { + assert(size == int8 || size == int16, "unsupported operand size"); + +- Register aligned_addr = t1, shift = tmp1, mask = tmp2, not_mask = tmp3; +- + andi(shift, addr, 3); + slli(shift, shift, 3); + +@@ -3331,8 +3339,6 @@ void MacroAssembler::cmpxchg_narrow_value_helper(Register addr, Register expecte + } + sll(mask, mask, shift); + +- xori(not_mask, mask, -1); +- + sll(expected, expected, shift); + andr(expected, expected, mask); + +@@ -3341,7 +3347,7 @@ void MacroAssembler::cmpxchg_narrow_value_helper(Register addr, Register expecte + } + + // cmpxchg_narrow_value will kill t0, t1, expected, new_val and tmps. +-// It's designed to implement compare and swap byte/boolean/char/short by lr.w/sc.w, ++// It's designed to implement compare and swap byte/boolean/char/short by lr.w/sc.w or amocas.w, + // which are forced to work with 4-byte aligned address. 
+ void MacroAssembler::cmpxchg_narrow_value(Register addr, Register expected, + Register new_val, +@@ -3349,21 +3355,47 @@ void MacroAssembler::cmpxchg_narrow_value(Register addr, Register expected, + Assembler::Aqrl acquire, Assembler::Aqrl release, + Register result, bool result_as_bool, + Register tmp1, Register tmp2, Register tmp3) { +- Register aligned_addr = t1, shift = tmp1, mask = tmp2, not_mask = tmp3, old = result, tmp = t0; +- assert_different_registers(addr, old, mask, not_mask, new_val, expected, shift, tmp); +- cmpxchg_narrow_value_helper(addr, expected, new_val, size, tmp1, tmp2, tmp3); ++ assert_different_registers(addr, expected, new_val, result, tmp1, tmp2, tmp3, t0, t1); ++ ++ Register scratch0 = t0, aligned_addr = t1; ++ Register shift = tmp1, mask = tmp2, scratch1 = tmp3; ++ ++ cmpxchg_narrow_value_helper(addr, expected, new_val, size, shift, mask, aligned_addr); + + Label retry, fail, done; + +- bind(retry); +- lr_w(old, aligned_addr, acquire); +- andr(tmp, old, mask); +- bne(tmp, expected, fail); ++ if (UseZacas) { ++ lw(result, aligned_addr); ++ ++ bind(retry); // amocas loads the current value into result ++ notr(scratch1, mask); ++ ++ andr(scratch0, result, scratch1); // scratch0 = word - cas bits ++ orr(scratch1, expected, scratch0); // scratch1 = non-cas bits + cas bits ++ bne(result, scratch1, fail); // cas bits differ, cas failed ++ ++ // result is the same as expected, use as expected value. ++ ++ // scratch0 is still = word - cas bits ++ // Or in the new value to create complete new value. ++ orr(scratch0, scratch0, new_val); ++ ++ mv(scratch1, result); // save our expected value ++ atomic_cas(result, scratch0, aligned_addr, operand_size::int32, acquire, release); ++ bne(scratch1, result, retry); ++ } else { ++ notr(scratch1, mask); ++ bind(retry); ++ ++ lr_w(result, aligned_addr, acquire); ++ andr(scratch0, result, mask); ++ bne(scratch0, expected, fail); + +- andr(tmp, old, not_mask); +- orr(tmp, tmp, new_val); +- sc_w(tmp, tmp, aligned_addr, release); +- bnez(tmp, retry); ++ andr(scratch0, result, scratch1); // scratch1 is ~mask ++ orr(scratch0, scratch0, new_val); ++ sc_w(scratch0, scratch0, aligned_addr, release); ++ bnez(scratch0, retry); ++ } + + if (result_as_bool) { + mv(result, 1); +@@ -3374,10 +3406,10 @@ void MacroAssembler::cmpxchg_narrow_value(Register addr, Register expected, + + bind(done); + } else { +- andr(tmp, old, mask); +- + bind(fail); +- srl(result, tmp, shift); ++ ++ andr(scratch0, result, mask); ++ srl(result, scratch0, shift); + + if (size == int8) { + sign_extend(result, result, 8); +@@ -3397,20 +3429,45 @@ void MacroAssembler::weak_cmpxchg_narrow_value(Register addr, Register expected, + Assembler::Aqrl acquire, Assembler::Aqrl release, + Register result, + Register tmp1, Register tmp2, Register tmp3) { +- Register aligned_addr = t1, shift = tmp1, mask = tmp2, not_mask = tmp3, old = result, tmp = t0; +- assert_different_registers(addr, old, mask, not_mask, new_val, expected, shift, tmp); +- cmpxchg_narrow_value_helper(addr, expected, new_val, size, tmp1, tmp2, tmp3); ++ assert_different_registers(addr, expected, new_val, result, tmp1, tmp2, tmp3, t0, t1); ++ ++ Register scratch0 = t0, aligned_addr = t1; ++ Register shift = tmp1, mask = tmp2, scratch1 = tmp3; ++ ++ cmpxchg_narrow_value_helper(addr, expected, new_val, size, shift, mask, aligned_addr); + + Label fail, done; + +- lr_w(old, aligned_addr, acquire); +- andr(tmp, old, mask); +- bne(tmp, expected, fail); ++ if (UseZacas) { ++ lw(result, aligned_addr); ++ ++ notr(scratch1, 
mask); + +- andr(tmp, old, not_mask); +- orr(tmp, tmp, new_val); +- sc_w(tmp, tmp, aligned_addr, release); +- bnez(tmp, fail); ++ andr(scratch0, result, scratch1); // scratch0 = word - cas bits ++ orr(scratch1, expected, scratch0); // scratch1 = non-cas bits + cas bits ++ bne(result, scratch1, fail); // cas bits differ, cas failed ++ ++ // result is the same as expected, use as expected value. ++ ++ // scratch0 is still = word - cas bits ++ // Or in the new value to create complete new value. ++ orr(scratch0, scratch0, new_val); ++ ++ mv(scratch1, result); // save our expected value ++ atomic_cas(result, scratch0, aligned_addr, operand_size::int32, acquire, release); ++ bne(scratch1, result, fail); // This weak, so just bail-out. ++ } else { ++ notr(scratch1, mask); ++ ++ lr_w(result, aligned_addr, acquire); ++ andr(scratch0, result, mask); ++ bne(scratch0, expected, fail); ++ ++ andr(scratch0, result, scratch1); // scratch1 is ~mask ++ orr(scratch0, scratch0, new_val); ++ sc_w(scratch0, scratch0, aligned_addr, release); ++ bnez(scratch0, fail); ++ } + + // Success + mv(result, 1); +@@ -3433,11 +3490,36 @@ void MacroAssembler::cmpxchg(Register addr, Register expected, + assert_different_registers(expected, t0); + assert_different_registers(new_val, t0); + ++ // NOTE: ++ // Register _result_ may be the same register as _new_val_ or _expected_. ++ // Hence do NOT use _result_ until after 'cas'. ++ // ++ // Register _expected_ may be the same register as _new_val_ and is assumed to be preserved. ++ // Hence do NOT change _expected_ or _new_val_. ++ // ++ // Having _expected_ and _new_val_ being the same register is a very puzzling cas. ++ // ++ // TODO: Address these issues. ++ ++ if (UseZacas) { ++ if (result_as_bool) { ++ mv(t0, expected); ++ atomic_cas(t0, new_val, addr, size, acquire, release); ++ xorr(t0, t0, expected); ++ seqz(result, t0); ++ } else { ++ mv(t0, expected); ++ atomic_cas(t0, new_val, addr, size, acquire, release); ++ mv(result, t0); ++ } ++ return; ++ } ++ + Label retry_load, done, ne_done; + bind(retry_load); +- load_reserved(addr, size, acquire); ++ load_reserved(t0, addr, size, acquire); + bne(t0, expected, ne_done); +- store_conditional(addr, new_val, size, release); ++ store_conditional(t0, new_val, addr, size, release); + bnez(t0, retry_load); + + // equal, succeed +@@ -3464,14 +3546,20 @@ void MacroAssembler::cmpxchg_weak(Register addr, Register expected, + enum operand_size size, + Assembler::Aqrl acquire, Assembler::Aqrl release, + Register result) { ++ + assert_different_registers(addr, t0); + assert_different_registers(expected, t0); + assert_different_registers(new_val, t0); + ++ if (UseZacas) { ++ cmpxchg(addr, expected, new_val, size, acquire, release, result, true); ++ return; ++ } ++ + Label fail, done; +- load_reserved(addr, size, acquire); ++ load_reserved(t0, addr, size, acquire); + bne(t0, expected, fail); +- store_conditional(addr, new_val, size, release); ++ store_conditional(t0, new_val, addr, size, release); + bnez(t0, fail); + + // Success +@@ -3530,6 +3618,24 @@ ATOMIC_XCHGU(xchgalwu, xchgalw) + + #undef ATOMIC_XCHGU + ++void MacroAssembler::atomic_cas(Register prev, Register newv, Register addr, ++ enum operand_size size, Assembler::Aqrl acquire, Assembler::Aqrl release) { ++ switch (size) { ++ case int64: ++ amocas_d(prev, addr, newv, (Assembler::Aqrl)(acquire | release)); ++ break; ++ case int32: ++ amocas_w(prev, addr, newv, (Assembler::Aqrl)(acquire | release)); ++ break; ++ case uint32: ++ amocas_w(prev, addr, newv, 
(Assembler::Aqrl)(acquire | release)); ++ zero_extend(prev, prev, 32); ++ break; ++ default: ++ ShouldNotReachHere(); ++ } ++} ++ + void MacroAssembler::far_jump(Address entry, Register tmp) { + assert(ReservedCodeCacheSize < 4*G, "branch out of range"); + assert(CodeCache::find_blob(entry.target()) != nullptr, +diff --git a/src/hotspot/cpu/riscv/macroAssembler_riscv.hpp b/src/hotspot/cpu/riscv/macroAssembler_riscv.hpp +index c8b34c5ea..692306a73 100644 +--- a/src/hotspot/cpu/riscv/macroAssembler_riscv.hpp ++++ b/src/hotspot/cpu/riscv/macroAssembler_riscv.hpp +@@ -1025,10 +1025,9 @@ public: + enum operand_size size, + Assembler::Aqrl acquire, Assembler::Aqrl release, + Register result); +- void cmpxchg_narrow_value_helper(Register addr, Register expected, +- Register new_val, ++ void cmpxchg_narrow_value_helper(Register addr, Register expected, Register new_val, + enum operand_size size, +- Register tmp1, Register tmp2, Register tmp3); ++ Register shift, Register mask, Register aligned_addr); + void cmpxchg_narrow_value(Register addr, Register expected, + Register new_val, + enum operand_size size, +@@ -1054,6 +1053,9 @@ public: + void atomic_xchgwu(Register prev, Register newv, Register addr); + void atomic_xchgalwu(Register prev, Register newv, Register addr); + ++ void atomic_cas(Register prev, Register newv, Register addr, enum operand_size size, ++ Assembler::Aqrl acquire = Assembler::relaxed, Assembler::Aqrl release = Assembler::relaxed); ++ + static bool far_branches() { + return ReservedCodeCacheSize > branch_range; + } +@@ -1506,8 +1508,8 @@ private: + int bitset_to_regs(unsigned int bitset, unsigned char* regs); + Address add_memory_helper(const Address dst, Register tmp); + +- void load_reserved(Register addr, enum operand_size size, Assembler::Aqrl acquire); +- void store_conditional(Register addr, Register new_val, enum operand_size size, Assembler::Aqrl release); ++ void load_reserved(Register dst, Register addr, enum operand_size size, Assembler::Aqrl acquire); ++ void store_conditional(Register dst, Register new_val, Register addr, enum operand_size size, Assembler::Aqrl release); + + public: + void lightweight_lock(Register obj, Register hdr, Register tmp1, Register tmp2, Label& slow); +diff --git a/src/hotspot/cpu/riscv/vm_version_riscv.hpp b/src/hotspot/cpu/riscv/vm_version_riscv.hpp +index e99b4be9c..20fd260ee 100644 +--- a/src/hotspot/cpu/riscv/vm_version_riscv.hpp ++++ b/src/hotspot/cpu/riscv/vm_version_riscv.hpp +@@ -145,6 +145,7 @@ class VM_Version : public Abstract_VM_Version { + decl(ext_Zifencei , "Zifencei" , RV_NO_FLAG_BIT, true , NO_UPDATE_DEFAULT) \ + decl(ext_Zic64b , "Zic64b" , RV_NO_FLAG_BIT, true , UPDATE_DEFAULT(UseZic64b)) \ + decl(ext_Zihintpause , "Zihintpause" , RV_NO_FLAG_BIT, true , UPDATE_DEFAULT(UseZihintpause)) \ ++ decl(ext_Zacas , "Zacas" , RV_NO_FLAG_BIT, true , UPDATE_DEFAULT(UseZacas)) \ + decl(ext_Zvbc , "Zvbc" , RV_NO_FLAG_BIT, true , UPDATE_DEFAULT(UseZvbc)) \ + decl(ext_Zvkn , "Zvkn" , RV_NO_FLAG_BIT, true , UPDATE_DEFAULT(UseZvkn)) \ + decl(mvendorid , "VendorId" , RV_NO_FLAG_BIT, false, NO_UPDATE_DEFAULT) \ diff --git a/Backport-JDK-8319778-8324881-8319797-8319900-Recursive-lightweight-locking-riscv64-implementation.patch b/Backport-JDK-8319778-8324881-8319797-8319900-Recursive-lightweight-locking-riscv64-implementation.patch new file mode 100644 index 0000000000000000000000000000000000000000..c8deb1fda51db4a8063bad128cf016de340314f6 --- /dev/null +++ 
b/Backport-JDK-8319778-8324881-8319797-8319900-Recursive-lightweight-locking-riscv64-implementation.patch @@ -0,0 +1,2881 @@ +diff --git a/src/hotspot/cpu/riscv/c1_MacroAssembler_riscv.cpp b/src/hotspot/cpu/riscv/c1_MacroAssembler_riscv.cpp +index 6c1dce0de..702bae688 100644 +--- a/src/hotspot/cpu/riscv/c1_MacroAssembler_riscv.cpp ++++ b/src/hotspot/cpu/riscv/c1_MacroAssembler_riscv.cpp +@@ -69,13 +69,12 @@ int C1_MacroAssembler::lock_object(Register hdr, Register obj, Register disp_hdr + bnez(temp, slow_case, true /* is_far */); + } + +- // Load object header +- ld(hdr, Address(obj, hdr_offset)); +- + if (LockingMode == LM_LIGHTWEIGHT) { + lightweight_lock(obj, hdr, temp, t1, slow_case); + } else if (LockingMode == LM_LEGACY) { + Label done; ++ // Load object header ++ ld(hdr, Address(obj, hdr_offset)); + // and mark it as unlocked + ori(hdr, hdr, markWord::unlocked_value); + // save unlocked object header into the displaced header location on the stack +@@ -134,9 +133,6 @@ void C1_MacroAssembler::unlock_object(Register hdr, Register obj, Register disp_ + verify_oop(obj); + + if (LockingMode == LM_LIGHTWEIGHT) { +- ld(hdr, Address(obj, oopDesc::mark_offset_in_bytes())); +- test_bit(temp, hdr, exact_log2(markWord::monitor_value)); +- bnez(temp, slow_case, /* is_far */ true); + lightweight_unlock(obj, hdr, temp, t1, slow_case); + } else if (LockingMode == LM_LEGACY) { + // test if object header is pointing to the displaced header, and if so, restore +diff --git a/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.cpp b/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.cpp +index 1aa2bb778..8fa218ae9 100644 +--- a/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.cpp ++++ b/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.cpp +@@ -32,6 +32,7 @@ + #include "opto/output.hpp" + #include "opto/subnode.hpp" + #include "runtime/stubRoutines.hpp" ++#include "utilities/globalDefinitions.hpp" + + #ifdef PRODUCT + #define BLOCK_COMMENT(str) /* nothing */ +@@ -51,30 +52,35 @@ void C2_MacroAssembler::fast_lock(Register objectReg, Register boxReg, + Register box = boxReg; + Register disp_hdr = tmp1Reg; + Register tmp = tmp2Reg; +- Label cont; + Label object_has_monitor; +- Label count, no_count; ++ // Finish fast lock successfully. MUST branch to with flag == 0 ++ Label locked; ++ // Finish fast lock unsuccessfully. slow_path MUST branch to with flag != 0 ++ Label slow_path; + ++ assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight"); + assert_different_registers(oop, box, tmp, disp_hdr, flag, tmp3Reg, t0); + ++ mv(flag, 1); ++ + // Load markWord from object into displaced_header. 
+ ld(disp_hdr, Address(oop, oopDesc::mark_offset_in_bytes())); + + if (DiagnoseSyncOnValueBasedClasses != 0) { +- load_klass(flag, oop); +- lwu(flag, Address(flag, Klass::access_flags_offset())); +- test_bit(flag, flag, exact_log2(JVM_ACC_IS_VALUE_BASED_CLASS)); +- bnez(flag, cont, true /* is_far */); ++ load_klass(tmp, oop); ++ lwu(tmp, Address(tmp, Klass::access_flags_offset())); ++ test_bit(tmp, tmp, exact_log2(JVM_ACC_IS_VALUE_BASED_CLASS)); ++ bnez(tmp, slow_path); + } + + // Check for existing monitor +- test_bit(t0, disp_hdr, exact_log2(markWord::monitor_value)); +- bnez(t0, object_has_monitor); ++ test_bit(tmp, disp_hdr, exact_log2(markWord::monitor_value)); ++ bnez(tmp, object_has_monitor); + + if (LockingMode == LM_MONITOR) { +- mv(flag, 1); // Set non-zero flag to indicate 'failure' -> take slow-path +- j(cont); +- } else if (LockingMode == LM_LEGACY) { ++ j(slow_path); ++ } else { ++ assert(LockingMode == LM_LEGACY, "must be"); + // Set tmp to be (markWord of object | UNLOCK_VALUE). + ori(tmp, disp_hdr, markWord::unlocked_value); + +@@ -84,39 +90,27 @@ void C2_MacroAssembler::fast_lock(Register objectReg, Register boxReg, + // Compare object markWord with an unlocked value (tmp) and if + // equal exchange the stack address of our box with object markWord. + // On failure disp_hdr contains the possibly locked markWord. +- cmpxchg(/*memory address*/oop, /*expected value*/tmp, /*new value*/box, Assembler::int64, Assembler::aq, +- Assembler::rl, /*result*/disp_hdr); +- mv(flag, zr); +- beq(disp_hdr, tmp, cont); // prepare zero flag and goto cont if we won the cas ++ cmpxchg(/*memory address*/oop, /*expected value*/tmp, /*new value*/box, Assembler::int64, ++ Assembler::aq, Assembler::rl, /*result*/disp_hdr); ++ beq(disp_hdr, tmp, locked); + + assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); + + // If the compare-and-exchange succeeded, then we found an unlocked +- // object, will have now locked it will continue at label cont ++ // object, will have now locked it will continue at label locked + // We did not see an unlocked object so try the fast recursive case. + + // Check if the owner is self by comparing the value in the + // markWord of object (disp_hdr) with the stack pointer. + sub(disp_hdr, disp_hdr, sp); + mv(tmp, (intptr_t) (~(os::vm_page_size()-1) | (uintptr_t)markWord::lock_mask_in_place)); +- // If (mark & lock_mask) == 0 and mark - sp < page_size, we are stack-locking and goto cont, ++ // If (mark & lock_mask) == 0 and mark - sp < page_size, we are stack-locking and goto label locked, + // hence we can store 0 as the displaced header in the box, which indicates that it is a + // recursive lock. + andr(tmp/*==0?*/, disp_hdr, tmp); + sd(tmp/*==0, perhaps*/, Address(box, BasicLock::displaced_header_offset_in_bytes())); +- mv(flag, tmp); // we can use the value of tmp as the result here +- j(cont); +- } else { +- assert(LockingMode == LM_LIGHTWEIGHT, ""); +- Label slow; +- lightweight_lock(oop, disp_hdr, tmp, tmp3Reg, slow); +- +- // Indicate success on completion. +- mv(flag, zr); +- j(count); +- bind(slow); +- mv(flag, 1); // Set non-zero flag to indicate 'failure' -> take slow-path +- j(no_count); ++ beqz(tmp, locked); ++ j(slow_path); + } + + // Handle existing monitor. +@@ -126,35 +120,42 @@ void C2_MacroAssembler::fast_lock(Register objectReg, Register boxReg, + // + // Try to CAS m->owner from NULL to current thread. 
+ add(tmp, disp_hdr, (in_bytes(ObjectMonitor::owner_offset()) - markWord::monitor_value)); +- cmpxchg(/*memory address*/tmp, /*expected value*/zr, /*new value*/xthread, Assembler::int64, Assembler::aq, +- Assembler::rl, /*result*/flag); // cas succeeds if flag == zr(expected) +- +- if (LockingMode != LM_LIGHTWEIGHT) { +- // Store a non-null value into the box to avoid looking like a re-entrant +- // lock. The fast-path monitor unlock code checks for +- // markWord::monitor_value so use markWord::unused_mark which has the +- // relevant bit set, and also matches ObjectSynchronizer::slow_enter. +- mv(tmp, (address)markWord::unused_mark().value()); +- sd(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes())); +- } ++ cmpxchg(/*memory address*/tmp, /*expected value*/zr, /*new value*/xthread, Assembler::int64, ++ Assembler::aq, Assembler::rl, /*result*/tmp3Reg); // cas succeeds if tmp3Reg == zr(expected) + +- beqz(flag, cont); // CAS success means locking succeeded ++ // Store a non-null value into the box to avoid looking like a re-entrant ++ // lock. The fast-path monitor unlock code checks for ++ // markWord::monitor_value so use markWord::unused_mark which has the ++ // relevant bit set, and also matches ObjectSynchronizer::slow_enter. ++ mv(tmp, (address)markWord::unused_mark().value()); ++ sd(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes())); + +- bne(flag, xthread, cont); // Check for recursive locking ++ beqz(tmp3Reg, locked); // CAS success means locking succeeded ++ ++ bne(tmp3Reg, xthread, slow_path); // Check for recursive locking + + // Recursive lock case +- mv(flag, zr); +- increment(Address(disp_hdr, in_bytes(ObjectMonitor::recursions_offset()) - markWord::monitor_value), 1, t0, tmp); ++ increment(Address(disp_hdr, in_bytes(ObjectMonitor::recursions_offset()) - markWord::monitor_value), 1, tmp2Reg, tmp3Reg); + +- bind(cont); +- // zero flag indicates success +- // non-zero flag indicates failure +- bnez(flag, no_count); ++ bind(locked); ++ mv(flag, zr); ++ increment(Address(xthread, JavaThread::held_monitor_count_offset()), 1, tmp2Reg, tmp3Reg); + +- bind(count); +- increment(Address(xthread, JavaThread::held_monitor_count_offset()), 1, t0, tmp); ++#ifdef ASSERT ++ // Check that locked label is reached with flag == 0. ++ Label flag_correct; ++ beqz(flag, flag_correct); ++ stop("Fast Lock Flag != 0"); ++#endif + +- bind(no_count); ++ bind(slow_path); ++#ifdef ASSERT ++ // Check that slow_path label is reached with flag != 0. ++ bnez(flag, flag_correct); ++ stop("Fast Lock Flag == 0"); ++ bind(flag_correct); ++#endif ++ // C2 uses the value of flag (0 vs !0) to determine the continuation. + } + + void C2_MacroAssembler::fast_unlock(Register objectReg, Register boxReg, +@@ -165,19 +166,23 @@ void C2_MacroAssembler::fast_unlock(Register objectReg, Register boxReg, + Register box = boxReg; + Register disp_hdr = tmp1Reg; + Register tmp = tmp2Reg; +- Label cont; + Label object_has_monitor; +- Label count, no_count; ++ // Finish fast lock successfully. MUST branch to with flag == 0 ++ Label unlocked; ++ // Finish fast lock unsuccessfully. slow_path MUST branch to with flag != 0 ++ Label slow_path; + ++ assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight"); + assert_different_registers(oop, box, tmp, disp_hdr, flag, t0); + ++ mv(flag, 1); ++ + if (LockingMode == LM_LEGACY) { + // Find the lock address and load the displaced header from the stack. 
+ ld(disp_hdr, Address(box, BasicLock::displaced_header_offset_in_bytes())); + + // If the displaced header is 0, we have a recursive unlock. +- mv(flag, disp_hdr); +- beqz(disp_hdr, cont); ++ beqz(disp_hdr, unlocked); + } + + // Handle existing monitor. +@@ -186,28 +191,17 @@ void C2_MacroAssembler::fast_unlock(Register objectReg, Register boxReg, + bnez(t0, object_has_monitor); + + if (LockingMode == LM_MONITOR) { +- mv(flag, 1); // Set non-zero flag to indicate 'failure' -> take slow path +- j(cont); +- } else if (LockingMode == LM_LEGACY) { ++ j(slow_path); ++ } else { ++ assert(LockingMode == LM_LEGACY, "must be"); + // Check if it is still a light weight lock, this is true if we + // see the stack address of the basicLock in the markWord of the + // object. + +- cmpxchg(/*memory address*/oop, /*expected value*/box, /*new value*/disp_hdr, Assembler::int64, Assembler::relaxed, +- Assembler::rl, /*result*/tmp); +- xorr(flag, box, tmp); // box == tmp if cas succeeds +- j(cont); +- } else { +- assert(LockingMode == LM_LIGHTWEIGHT, ""); +- Label slow; +- lightweight_unlock(oop, tmp, box, disp_hdr, slow); +- +- // Indicate success on completion. +- mv(flag, zr); +- j(count); +- bind(slow); +- mv(flag, 1); // Set non-zero flag to indicate 'failure' -> take slow path +- j(no_count); ++ cmpxchg(/*memory address*/oop, /*expected value*/box, /*new value*/disp_hdr, Assembler::int64, ++ Assembler::relaxed, Assembler::rl, /*result*/tmp); ++ beq(box, tmp, unlocked); // box == tmp if cas succeeds ++ j(slow_path); + } + + assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); +@@ -217,17 +211,6 @@ void C2_MacroAssembler::fast_unlock(Register objectReg, Register boxReg, + STATIC_ASSERT(markWord::monitor_value <= INT_MAX); + add(tmp, tmp, -(int)markWord::monitor_value); // monitor + +- if (LockingMode == LM_LIGHTWEIGHT) { +- // If the owner is anonymous, we need to fix it -- in an outline stub. +- Register tmp2 = disp_hdr; +- ld(tmp2, Address(tmp, ObjectMonitor::owner_offset())); +- test_bit(t0, tmp2, exact_log2(ObjectMonitor::ANONYMOUS_OWNER)); +- C2HandleAnonOMOwnerStub* stub = new (Compile::current()->comp_arena()) C2HandleAnonOMOwnerStub(tmp, tmp2); +- Compile::current()->output()->add_stub(stub); +- bnez(t0, stub->entry(), /* is_far */ true); +- bind(stub->continuation()); +- } +- + ld(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset())); + + Label notRecursive; +@@ -236,28 +219,304 @@ void C2_MacroAssembler::fast_unlock(Register objectReg, Register boxReg, + // Recursive lock + addi(disp_hdr, disp_hdr, -1); + sd(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset())); +- mv(flag, zr); +- j(cont); ++ j(unlocked); + + bind(notRecursive); +- ld(flag, Address(tmp, ObjectMonitor::EntryList_offset())); ++ ld(t0, Address(tmp, ObjectMonitor::EntryList_offset())); + ld(disp_hdr, Address(tmp, ObjectMonitor::cxq_offset())); +- orr(flag, flag, disp_hdr); // Will be 0 if both are 0. +- bnez(flag, cont); ++ orr(t0, t0, disp_hdr); // Will be 0 if both are 0. 
++ bnez(t0, slow_path); ++ + // need a release store here + la(tmp, Address(tmp, ObjectMonitor::owner_offset())); + membar(MacroAssembler::LoadStore | MacroAssembler::StoreStore); + sd(zr, Address(tmp)); // set unowned + +- bind(cont); +- // zero flag indicates success +- // non-zero flag indicates failure +- bnez(flag, no_count); ++ bind(unlocked); ++ mv(flag, zr); ++ decrement(Address(xthread, JavaThread::held_monitor_count_offset()), 1, tmp1Reg, tmp2Reg); + +- bind(count); +- decrement(Address(xthread, JavaThread::held_monitor_count_offset()), 1, t0, tmp); ++#ifdef ASSERT ++ // Check that unlocked label is reached with flag == 0. ++ Label flag_correct; ++ beqz(flag, flag_correct); ++ stop("Fast Lock Flag != 0"); ++#endif + +- bind(no_count); ++ bind(slow_path); ++#ifdef ASSERT ++ // Check that slow_path label is reached with flag != 0. ++ bnez(flag, flag_correct); ++ stop("Fast Lock Flag == 0"); ++ bind(flag_correct); ++#endif ++ // C2 uses the value of flag (0 vs !0) to determine the continuation. ++} ++ ++void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register tmp1, Register tmp2, Register tmp3) { ++ // Flag register, zero for success; non-zero for failure. ++ Register flag = t1; ++ ++ assert(LockingMode == LM_LIGHTWEIGHT, "must be"); ++ assert_different_registers(obj, tmp1, tmp2, tmp3, flag, t0); ++ ++ mv(flag, 1); ++ ++ // Handle inflated monitor. ++ Label inflated; ++ // Finish fast lock successfully. MUST branch to with flag == 0 ++ Label locked; ++ // Finish fast lock unsuccessfully. slow_path MUST branch to with flag != 0 ++ Label slow_path; ++ ++ if (DiagnoseSyncOnValueBasedClasses != 0) { ++ load_klass(tmp1, obj); ++ lwu(tmp1, Address(tmp1, Klass::access_flags_offset())); ++ test_bit(tmp1, tmp1, exact_log2(JVM_ACC_IS_VALUE_BASED_CLASS)); ++ bnez(tmp1, slow_path); ++ } ++ ++ const Register tmp1_mark = tmp1; ++ ++ { // Lightweight locking ++ ++ // Push lock to the lock stack and finish successfully. MUST branch to with flag == 0 ++ Label push; ++ ++ const Register tmp2_top = tmp2; ++ const Register tmp3_t = tmp3; ++ ++ // Check if lock-stack is full. ++ lwu(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset())); ++ mv(tmp3_t, (unsigned)LockStack::end_offset()); ++ bge(tmp2_top, tmp3_t, slow_path); ++ ++ // Check if recursive. ++ add(tmp3_t, xthread, tmp2_top); ++ ld(tmp3_t, Address(tmp3_t, -oopSize)); ++ beq(obj, tmp3_t, push); ++ ++ // Relaxed normal load to check for monitor. Optimization for monitor case. ++ ld(tmp1_mark, Address(obj, oopDesc::mark_offset_in_bytes())); ++ test_bit(tmp3_t, tmp1_mark, exact_log2(markWord::monitor_value)); ++ bnez(tmp3_t, inflated); ++ ++ // Not inflated ++ assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a la"); ++ ++ // Try to lock. Transition lock-bits 0b01 => 0b00 ++ ori(tmp1_mark, tmp1_mark, markWord::unlocked_value); ++ xori(tmp3_t, tmp1_mark, markWord::unlocked_value); ++ cmpxchg(/*addr*/ obj, /*expected*/ tmp1_mark, /*new*/ tmp3_t, Assembler::int64, ++ /*acquire*/ Assembler::aq, /*release*/ Assembler::relaxed, /*result*/ tmp3_t); ++ bne(tmp1_mark, tmp3_t, slow_path); ++ ++ bind(push); ++ // After successful lock, push object on lock-stack. ++ add(tmp3_t, xthread, tmp2_top); ++ sd(obj, Address(tmp3_t)); ++ addw(tmp2_top, tmp2_top, oopSize); ++ sw(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset())); ++ j(locked); ++ } ++ ++ { // Handle inflated monitor. ++ bind(inflated); ++ ++ // mark contains the tagged ObjectMonitor*. 
++ const Register tmp1_tagged_monitor = tmp1_mark; ++ const uintptr_t monitor_tag = markWord::monitor_value; ++ const Register tmp2_owner_addr = tmp2; ++ const Register tmp3_owner = tmp3; ++ ++ // Compute owner address. ++ la(tmp2_owner_addr, Address(tmp1_tagged_monitor, (in_bytes(ObjectMonitor::owner_offset()) - monitor_tag))); ++ ++ // CAS owner (null => current thread). ++ cmpxchg(/*addr*/ tmp2_owner_addr, /*expected*/ zr, /*new*/ xthread, Assembler::int64, ++ /*acquire*/ Assembler::aq, /*release*/ Assembler::relaxed, /*result*/ tmp3_owner); ++ beqz(tmp3_owner, locked); ++ ++ // Check if recursive. ++ bne(tmp3_owner, xthread, slow_path); ++ ++ // Recursive. ++ increment(Address(tmp1_tagged_monitor, in_bytes(ObjectMonitor::recursions_offset()) - monitor_tag), 1, tmp2, tmp3); ++ } ++ ++ bind(locked); ++ mv(flag, zr); ++ increment(Address(xthread, JavaThread::held_monitor_count_offset()), 1, tmp2, tmp3); ++ ++#ifdef ASSERT ++ // Check that locked label is reached with flag == 0. ++ Label flag_correct; ++ beqz(flag, flag_correct); ++ stop("Fast Lock Flag != 0"); ++#endif ++ ++ bind(slow_path); ++#ifdef ASSERT ++ // Check that slow_path label is reached with flag != 0. ++ bnez(flag, flag_correct); ++ stop("Fast Lock Flag == 0"); ++ bind(flag_correct); ++#endif ++ // C2 uses the value of flag (0 vs !0) to determine the continuation. ++} ++ ++void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register tmp1, Register tmp2, ++ Register tmp3) { ++ // Flag register, zero for success; non-zero for failure. ++ Register flag = t1; ++ ++ assert(LockingMode == LM_LIGHTWEIGHT, "must be"); ++ assert_different_registers(obj, tmp1, tmp2, tmp3, flag, t0); ++ ++ mv(flag, 1); ++ ++ // Handle inflated monitor. ++ Label inflated, inflated_load_monitor; ++ // Finish fast unlock successfully. unlocked MUST branch to with flag == 0 ++ Label unlocked; ++ // Finish fast unlock unsuccessfully. MUST branch to with flag != 0 ++ Label slow_path; ++ ++ const Register tmp1_mark = tmp1; ++ const Register tmp2_top = tmp2; ++ const Register tmp3_t = tmp3; ++ ++ { // Lightweight unlock ++ ++ // Check if obj is top of lock-stack. ++ lwu(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset())); ++ subw(tmp2_top, tmp2_top, oopSize); ++ add(tmp3_t, xthread, tmp2_top); ++ ld(tmp3_t, Address(tmp3_t)); ++ // Top of lock stack was not obj. Must be monitor. ++ bne(obj, tmp3_t, inflated_load_monitor); ++ ++ // Pop lock-stack. ++ DEBUG_ONLY(add(tmp3_t, xthread, tmp2_top);) ++ DEBUG_ONLY(sd(zr, Address(tmp3_t));) ++ sw(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset())); ++ ++ // Check if recursive. ++ add(tmp3_t, xthread, tmp2_top); ++ ld(tmp3_t, Address(tmp3_t, -oopSize)); ++ beq(obj, tmp3_t, unlocked); ++ ++ // Not recursive. ++ // Load Mark. ++ ld(tmp1_mark, Address(obj, oopDesc::mark_offset_in_bytes())); ++ ++ // Check header for monitor (0b10). ++ test_bit(tmp3_t, tmp1_mark, exact_log2(markWord::monitor_value)); ++ bnez(tmp3_t, inflated); ++ ++ // Try to unlock. Transition lock bits 0b00 => 0b01 ++ assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea"); ++ ori(tmp3_t, tmp1_mark, markWord::unlocked_value); ++ cmpxchg(/*addr*/ obj, /*expected*/ tmp1_mark, /*new*/ tmp3_t, Assembler::int64, ++ /*acquire*/ Assembler::relaxed, /*release*/ Assembler::rl, /*result*/ tmp3_t); ++ beq(tmp1_mark, tmp3_t, unlocked); ++ ++ // Compare and exchange failed. ++ // Restore lock-stack and handle the unlock in runtime. 
++ DEBUG_ONLY(add(tmp3_t, xthread, tmp2_top);) ++ DEBUG_ONLY(sd(obj, Address(tmp3_t));) ++ addw(tmp2_top, tmp2_top, oopSize); ++ sd(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset())); ++ j(slow_path); ++ } ++ ++ { // Handle inflated monitor. ++ bind(inflated_load_monitor); ++ ld(tmp1_mark, Address(obj, oopDesc::mark_offset_in_bytes())); ++#ifdef ASSERT ++ test_bit(tmp3_t, tmp1_mark, exact_log2(markWord::monitor_value)); ++ bnez(tmp3_t, inflated); ++ stop("Fast Unlock not monitor"); ++#endif ++ ++ bind(inflated); ++ ++#ifdef ASSERT ++ Label check_done; ++ subw(tmp2_top, tmp2_top, oopSize); ++ mv(tmp3_t, in_bytes(JavaThread::lock_stack_base_offset())); ++ blt(tmp2_top, tmp3_t, check_done); ++ add(tmp3_t, xthread, tmp2_top); ++ ld(tmp3_t, Address(tmp3_t)); ++ bne(obj, tmp3_t, inflated); ++ stop("Fast Unlock lock on stack"); ++ bind(check_done); ++#endif ++ ++ // mark contains the tagged ObjectMonitor*. ++ const Register tmp1_monitor = tmp1_mark; ++ const uintptr_t monitor_tag = markWord::monitor_value; ++ ++ // Untag the monitor. ++ sub(tmp1_monitor, tmp1_mark, monitor_tag); ++ ++ const Register tmp2_recursions = tmp2; ++ Label not_recursive; ++ ++ // Check if recursive. ++ ld(tmp2_recursions, Address(tmp1_monitor, ObjectMonitor::recursions_offset())); ++ beqz(tmp2_recursions, not_recursive); ++ ++ // Recursive unlock. ++ addi(tmp2_recursions, tmp2_recursions, -1); ++ sd(tmp2_recursions, Address(tmp1_monitor, ObjectMonitor::recursions_offset())); ++ j(unlocked); ++ ++ bind(not_recursive); ++ ++ Label release; ++ const Register tmp2_owner_addr = tmp2; ++ ++ // Compute owner address. ++ la(tmp2_owner_addr, Address(tmp1_monitor, ObjectMonitor::owner_offset())); ++ ++ // Check if the entry lists are empty. ++ ld(t0, Address(tmp1_monitor, ObjectMonitor::EntryList_offset())); ++ ld(tmp3_t, Address(tmp1_monitor, ObjectMonitor::cxq_offset())); ++ orr(t0, t0, tmp3_t); ++ beqz(t0, release); ++ ++ // The owner may be anonymous and we removed the last obj entry in ++ // the lock-stack. This loses the information about the owner. ++ // Write the thread to the owner field so the runtime knows the owner. ++ sd(xthread, Address(tmp2_owner_addr)); ++ j(slow_path); ++ ++ bind(release); ++ // Set owner to null. ++ membar(MacroAssembler::LoadStore | MacroAssembler::StoreStore); ++ sd(zr, Address(tmp2_owner_addr)); ++ } ++ ++ bind(unlocked); ++ mv(flag, zr); ++ decrement(Address(xthread, JavaThread::held_monitor_count_offset()), 1, tmp2, tmp3); ++ ++#ifdef ASSERT ++ // Check that unlocked label is reached with flag == 0. ++ Label flag_correct; ++ beqz(flag, flag_correct); ++ stop("Fast Lock Flag != 0"); ++#endif ++ ++ bind(slow_path); ++#ifdef ASSERT ++ // Check that slow_path label is reached with flag != 0. ++ bnez(flag, flag_correct); ++ stop("Fast Lock Flag == 0"); ++ bind(flag_correct); ++#endif ++ // C2 uses the value of flag (0 vs !0) to determine the continuation. + } + + // short string +diff --git a/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.hpp b/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.hpp +index 94d9ee791..c40b96998 100644 +--- a/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.hpp ++++ b/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.hpp +@@ -40,9 +40,11 @@ + bool is_latin, Label& DONE); + public: + // Code used by cmpFastLock and cmpFastUnlock mach instructions in .ad file. +- // See full description in macroAssembler_riscv.cpp. 
+ void fast_lock(Register object, Register box, Register tmp1, Register tmp2, Register tmp3); + void fast_unlock(Register object, Register box, Register tmp1, Register tmp2); ++ // Code used by cmpFastLockLightweight and cmpFastUnlockLightweight mach instructions in .ad file. ++ void fast_lock_lightweight(Register object, Register tmp1, Register tmp2, Register tmp3); ++ void fast_unlock_lightweight(Register object, Register tmp1, Register tmp2, Register tmp3); + + void string_compare(Register str1, Register str2, + Register cnt1, Register cnt2, Register result, +diff --git a/src/hotspot/cpu/riscv/interp_masm_riscv.cpp b/src/hotspot/cpu/riscv/interp_masm_riscv.cpp +index 458c5689c..e2c7b17e0 100644 +--- a/src/hotspot/cpu/riscv/interp_masm_riscv.cpp ++++ b/src/hotspot/cpu/riscv/interp_masm_riscv.cpp +@@ -835,7 +835,6 @@ void InterpreterMacroAssembler::lock_object(Register lock_reg) + } + + if (LockingMode == LM_LIGHTWEIGHT) { +- ld(tmp, Address(obj_reg, oopDesc::mark_offset_in_bytes())); + lightweight_lock(obj_reg, tmp, tmp2, tmp3, slow_case); + j(count); + } else if (LockingMode == LM_LEGACY) { +@@ -932,24 +931,6 @@ void InterpreterMacroAssembler::unlock_object(Register lock_reg) + + if (LockingMode == LM_LIGHTWEIGHT) { + Label slow_case; +- +- // Check for non-symmetric locking. This is allowed by the spec and the interpreter +- // must handle it. +- Register tmp1 = t0; +- Register tmp2 = header_reg; +- // First check for lock-stack underflow. +- lwu(tmp1, Address(xthread, JavaThread::lock_stack_top_offset())); +- mv(tmp2, (unsigned)LockStack::start_offset()); +- ble(tmp1, tmp2, slow_case); +- // Then check if the top of the lock-stack matches the unlocked object. +- subw(tmp1, tmp1, oopSize); +- add(tmp1, xthread, tmp1); +- ld(tmp1, Address(tmp1, 0)); +- bne(tmp1, obj_reg, slow_case); +- +- ld(header_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes())); +- test_bit(t0, header_reg, exact_log2(markWord::monitor_value)); +- bnez(t0, slow_case); + lightweight_unlock(obj_reg, header_reg, swap_reg, tmp_reg, slow_case); + j(count); + +diff --git a/src/hotspot/cpu/riscv/macroAssembler_riscv.cpp b/src/hotspot/cpu/riscv/macroAssembler_riscv.cpp +index e411b8956..17bf4314c 100644 +--- a/src/hotspot/cpu/riscv/macroAssembler_riscv.cpp ++++ b/src/hotspot/cpu/riscv/macroAssembler_riscv.cpp +@@ -47,6 +47,7 @@ + #include "runtime/jniHandles.inline.hpp" + #include "runtime/sharedRuntime.hpp" + #include "runtime/stubRoutines.hpp" ++#include "utilities/globalDefinitions.hpp" + #include "utilities/powerOfTwo.hpp" + #ifdef COMPILER2 + #include "opto/compile.hpp" +@@ -5383,98 +5384,124 @@ void MacroAssembler::test_bit(Register Rd, Register Rs, uint32_t bit_pos) { + } + + // Implements lightweight-locking. +-// Branches to slow upon failure to lock the object. +-// Falls through upon success. + // + // - obj: the object to be locked +-// - hdr: the header, already loaded from obj, will be destroyed +-// - tmp1, tmp2: temporary registers, will be destroyed +-void MacroAssembler::lightweight_lock(Register obj, Register hdr, Register tmp1, Register tmp2, Label& slow) { ++// - tmp1, tmp2, tmp3: temporary registers, will be destroyed ++// - slow: branched to if locking fails ++void MacroAssembler::lightweight_lock(Register obj, Register tmp1, Register tmp2, Register tmp3, Label& slow) { + assert(LockingMode == LM_LIGHTWEIGHT, "only used with new lightweight locking"); +- assert_different_registers(obj, hdr, tmp1, tmp2, t0); +- +- // Check if we would have space on lock-stack for the object. 
+- lwu(tmp1, Address(xthread, JavaThread::lock_stack_top_offset())); +- mv(tmp2, (unsigned)LockStack::end_offset()); +- bge(tmp1, tmp2, slow, /* is_far */ true); +- +- // Load (object->mark() | 1) into hdr +- ori(hdr, hdr, markWord::unlocked_value); +- // Clear lock-bits, into tmp2 +- xori(tmp2, hdr, markWord::unlocked_value); +- +- // Try to swing header from unlocked to locked +- Label success; +- cmpxchgptr(hdr, tmp2, obj, tmp1, success, &slow); +- bind(success); +- +- // After successful lock, push object on lock-stack +- lwu(tmp1, Address(xthread, JavaThread::lock_stack_top_offset())); +- add(tmp2, xthread, tmp1); +- sd(obj, Address(tmp2, 0)); +- addw(tmp1, tmp1, oopSize); +- sw(tmp1, Address(xthread, JavaThread::lock_stack_top_offset())); ++ assert_different_registers(obj, tmp1, tmp2, tmp3, t0); ++ ++ Label push; ++ const Register top = tmp1; ++ const Register mark = tmp2; ++ const Register t = tmp3; ++ ++ // Preload the markWord. It is important that this is the first ++ // instruction emitted as it is part of C1's null check semantics. ++ ld(mark, Address(obj, oopDesc::mark_offset_in_bytes())); ++ ++ // Check if the lock-stack is full. ++ lwu(top, Address(xthread, JavaThread::lock_stack_top_offset())); ++ mv(t, (unsigned)LockStack::end_offset()); ++ bge(top, t, slow, /* is_far */ true); ++ ++ // Check for recursion. ++ add(t, xthread, top); ++ ld(t, Address(t, -oopSize)); ++ beq(obj, t, push); ++ ++ // Check header for monitor (0b10). ++ test_bit(t, mark, exact_log2(markWord::monitor_value)); ++ bnez(t, slow, /* is_far */ true); ++ ++ // Try to lock. Transition lock-bits 0b01 => 0b00 ++ assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a la"); ++ ori(mark, mark, markWord::unlocked_value); ++ xori(t, mark, markWord::unlocked_value); ++ cmpxchg(/*addr*/ obj, /*expected*/ mark, /*new*/ t, Assembler::int64, ++ /*acquire*/ Assembler::aq, /*release*/ Assembler::relaxed, /*result*/ t); ++ bne(mark, t, slow, /* is_far */ true); ++ ++ bind(push); ++ // After successful lock, push object on lock-stack. ++ add(t, xthread, top); ++ sd(obj, Address(t)); ++ addw(top, top, oopSize); ++ sw(top, Address(xthread, JavaThread::lock_stack_top_offset())); + } + + // Implements ligthweight-unlocking. +-// Branches to slow upon failure. +-// Falls through upon success. + // + // - obj: the object to be unlocked +-// - hdr: the (pre-loaded) header of the object +-// - tmp1, tmp2: temporary registers +-void MacroAssembler::lightweight_unlock(Register obj, Register hdr, Register tmp1, Register tmp2, Label& slow) { ++// - tmp1, tmp2, tmp3: temporary registers ++// - slow: branched to if unlocking fails ++void MacroAssembler::lightweight_unlock(Register obj, Register tmp1, Register tmp2, Register tmp3, Label& slow) { + assert(LockingMode == LM_LIGHTWEIGHT, "only used with new lightweight locking"); +- assert_different_registers(obj, hdr, tmp1, tmp2, t0); ++ assert_different_registers(obj, tmp1, tmp2, tmp3, t0); + + #ifdef ASSERT + { +- // The following checks rely on the fact that LockStack is only ever modified by +- // its owning thread, even if the lock got inflated concurrently; removal of LockStack +- // entries after inflation will happen delayed in that case. +- + // Check for lock-stack underflow. 
+ Label stack_ok; + lwu(tmp1, Address(xthread, JavaThread::lock_stack_top_offset())); + mv(tmp2, (unsigned)LockStack::start_offset()); +- bgt(tmp1, tmp2, stack_ok); ++ bge(tmp1, tmp2, stack_ok); + STOP("Lock-stack underflow"); + bind(stack_ok); + } +- { +- // Check if the top of the lock-stack matches the unlocked object. +- Label tos_ok; +- subw(tmp1, tmp1, oopSize); +- add(tmp1, xthread, tmp1); +- ld(tmp1, Address(tmp1, 0)); +- beq(tmp1, obj, tos_ok); +- STOP("Top of lock-stack does not match the unlocked object"); +- bind(tos_ok); +- } +- { +- // Check that hdr is fast-locked. +- Label hdr_ok; +- andi(tmp1, hdr, markWord::lock_mask_in_place); +- beqz(tmp1, hdr_ok); +- STOP("Header is not fast-locked"); +- bind(hdr_ok); +- } + #endif + +- // Load the new header (unlocked) into tmp1 +- ori(tmp1, hdr, markWord::unlocked_value); ++ Label unlocked, push_and_slow; ++ const Register top = tmp1; ++ const Register mark = tmp2; ++ const Register t = tmp3; ++ ++ // Check if obj is top of lock-stack. ++ lwu(top, Address(xthread, JavaThread::lock_stack_top_offset())); ++ subw(top, top, oopSize); ++ add(t, xthread, top); ++ ld(t, Address(t)); ++ bne(obj, t, slow, /* is_far */ true); ++ ++ // Pop lock-stack. ++ DEBUG_ONLY(add(t, xthread, top);) ++ DEBUG_ONLY(sd(zr, Address(t));) ++ sw(top, Address(xthread, JavaThread::lock_stack_top_offset())); ++ ++ // Check if recursive. ++ add(t, xthread, top); ++ ld(t, Address(t, -oopSize)); ++ beq(obj, t, unlocked); ++ ++ // Not recursive. Check header for monitor (0b10). ++ ld(mark, Address(obj, oopDesc::mark_offset_in_bytes())); ++ test_bit(t, mark, exact_log2(markWord::monitor_value)); ++ bnez(t, push_and_slow); + +- // Try to swing header from locked to unlocked +- Label success; +- cmpxchgptr(hdr, tmp1, obj, tmp2, success, &slow); +- bind(success); +- +- // After successful unlock, pop object from lock-stack +- lwu(tmp1, Address(xthread, JavaThread::lock_stack_top_offset())); +- subw(tmp1, tmp1, oopSize); + #ifdef ASSERT +- add(tmp2, xthread, tmp1); +- sd(zr, Address(tmp2, 0)); ++ // Check header not unlocked (0b01). ++ Label not_unlocked; ++ test_bit(t, mark, exact_log2(markWord::unlocked_value)); ++ beqz(t, not_unlocked); ++ stop("lightweight_unlock already unlocked"); ++ bind(not_unlocked); + #endif +- sw(tmp1, Address(xthread, JavaThread::lock_stack_top_offset())); ++ ++ // Try to unlock. Transition lock bits 0b00 => 0b01 ++ assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea"); ++ ori(t, mark, markWord::unlocked_value); ++ cmpxchg(/*addr*/ obj, /*expected*/ mark, /*new*/ t, Assembler::int64, ++ /*acquire*/ Assembler::relaxed, /*release*/ Assembler::rl, /*result*/ t); ++ beq(mark, t, unlocked); ++ ++ bind(push_and_slow); ++ // Restore lock-stack and handle the unlock in runtime. 
++ DEBUG_ONLY(add(t, xthread, top);) ++ DEBUG_ONLY(sd(obj, Address(t));) ++ addw(top, top, oopSize); ++ sw(top, Address(xthread, JavaThread::lock_stack_top_offset())); ++ j(slow); ++ ++ bind(unlocked); + } +diff --git a/src/hotspot/cpu/riscv/macroAssembler_riscv.hpp b/src/hotspot/cpu/riscv/macroAssembler_riscv.hpp +index 692306a73..479d8d1a6 100644 +--- a/src/hotspot/cpu/riscv/macroAssembler_riscv.hpp ++++ b/src/hotspot/cpu/riscv/macroAssembler_riscv.hpp +@@ -1512,8 +1512,8 @@ private: + void store_conditional(Register dst, Register new_val, Register addr, enum operand_size size, Assembler::Aqrl release); + + public: +- void lightweight_lock(Register obj, Register hdr, Register tmp1, Register tmp2, Label& slow); +- void lightweight_unlock(Register obj, Register hdr, Register tmp1, Register tmp2, Label& slow); ++ void lightweight_lock(Register obj, Register tmp1, Register tmp2, Register tmp3, Label& slow); ++ void lightweight_unlock(Register obj, Register tmp1, Register tmp2, Register tmp3, Label& slow); + }; + + #ifdef ASSERT +diff --git a/src/hotspot/cpu/riscv/riscv.ad b/src/hotspot/cpu/riscv/riscv.ad +index b07713b95..ac22dc536 100644 +--- a/src/hotspot/cpu/riscv/riscv.ad ++++ b/src/hotspot/cpu/riscv/riscv.ad +@@ -10279,10 +10279,11 @@ instruct tlsLoadP(javaThread_RegP dst) + // using t1 as the 'flag' register to bridge the BoolNode producers and consumers + instruct cmpFastLock(rFlagsReg cr, iRegP object, iRegP box, iRegPNoSp tmp1, iRegPNoSp tmp2, iRegPNoSp tmp3) + %{ ++ predicate(LockingMode != LM_LIGHTWEIGHT); + match(Set cr (FastLock object box)); + effect(TEMP tmp1, TEMP tmp2, TEMP tmp3); + +- ins_cost(LOAD_COST * 2 + STORE_COST * 3 + ALU_COST * 6 + BRANCH_COST * 3); ++ ins_cost(10 * DEFAULT_COST); + format %{ "fastlock $object,$box\t! kills $tmp1,$tmp2,$tmp3, #@cmpFastLock" %} + + ins_encode %{ +@@ -10295,10 +10296,11 @@ instruct cmpFastLock(rFlagsReg cr, iRegP object, iRegP box, iRegPNoSp tmp1, iReg + // using t1 as the 'flag' register to bridge the BoolNode producers and consumers + instruct cmpFastUnlock(rFlagsReg cr, iRegP object, iRegP box, iRegPNoSp tmp1, iRegPNoSp tmp2) + %{ ++ predicate(LockingMode != LM_LIGHTWEIGHT); + match(Set cr (FastUnlock object box)); + effect(TEMP tmp1, TEMP tmp2); + +- ins_cost(LOAD_COST * 2 + STORE_COST + ALU_COST * 2 + BRANCH_COST * 4); ++ ins_cost(10 * DEFAULT_COST); + format %{ "fastunlock $object,$box\t! kills $tmp1, $tmp2, #@cmpFastUnlock" %} + + ins_encode %{ +@@ -10308,6 +10310,38 @@ instruct cmpFastUnlock(rFlagsReg cr, iRegP object, iRegP box, iRegPNoSp tmp1, iR + ins_pipe(pipe_serial); + %} + ++instruct cmpFastLockLightweight(rFlagsReg cr, iRegP object, iRegP_R10 box, iRegPNoSp tmp1, iRegPNoSp tmp2) ++%{ ++ predicate(LockingMode == LM_LIGHTWEIGHT); ++ match(Set cr (FastLock object box)); ++ effect(TEMP tmp1, TEMP tmp2, USE_KILL box); ++ ++ ins_cost(10 * DEFAULT_COST); ++ format %{ "fastlock $object,$box\t! kills $box,$tmp1,$tmp2 #@cmpFastLockLightweight" %} ++ ++ ins_encode %{ ++ __ fast_lock_lightweight($object$$Register, $box$$Register, $tmp1$$Register, $tmp2$$Register); ++ %} ++ ++ ins_pipe(pipe_serial); ++%} ++ ++instruct cmpFastUnlockLightweight(rFlagsReg cr, iRegP object, iRegP_R10 box, iRegPNoSp tmp1, iRegPNoSp tmp2) ++%{ ++ predicate(LockingMode == LM_LIGHTWEIGHT); ++ match(Set cr (FastUnlock object box)); ++ effect(TEMP tmp1, TEMP tmp2, USE_KILL box); ++ ++ ins_cost(10 * DEFAULT_COST); ++ format %{ "fastunlock $object,$box\t! 
kills $box,$tmp1,$tmp2, #@cmpFastUnlockLightweight" %} ++ ++ ins_encode %{ ++ __ fast_unlock_lightweight($object$$Register, $box$$Register, $tmp1$$Register, $tmp2$$Register); ++ %} ++ ++ ins_pipe(pipe_serial); ++%} ++ + // Tail Call; Jump from runtime stub to Java code. + // Also known as an 'interprocedural jump'. + // Target of jump will eventually return to caller. +diff --git a/src/hotspot/cpu/riscv/sharedRuntime_riscv.cpp b/src/hotspot/cpu/riscv/sharedRuntime_riscv.cpp +index d900e8732..a38c8ec12 100644 +--- a/src/hotspot/cpu/riscv/sharedRuntime_riscv.cpp ++++ b/src/hotspot/cpu/riscv/sharedRuntime_riscv.cpp +@@ -1704,8 +1704,7 @@ nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm, + __ sd(swap_reg, Address(lock_reg, mark_word_offset)); + __ bnez(swap_reg, slow_path_lock); + } else { +- assert(LockingMode == LM_LIGHTWEIGHT, ""); +- __ ld(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes())); ++ assert(LockingMode == LM_LIGHTWEIGHT, "must be"); + __ lightweight_lock(obj_reg, swap_reg, tmp, lock_tmp, slow_path_lock); + } + +@@ -1831,9 +1830,6 @@ nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm, + __ decrement(Address(xthread, JavaThread::held_monitor_count_offset())); + } else { + assert(LockingMode == LM_LIGHTWEIGHT, ""); +- __ ld(old_hdr, Address(obj_reg, oopDesc::mark_offset_in_bytes())); +- __ test_bit(t0, old_hdr, exact_log2(markWord::monitor_value)); +- __ bnez(t0, slow_path_unlock); + __ lightweight_unlock(obj_reg, old_hdr, swap_reg, lock_tmp, slow_path_unlock); + __ decrement(Address(xthread, JavaThread::held_monitor_count_offset())); + } +diff --git a/src/hotspot/cpu/riscv/vm_version_riscv.hpp b/src/hotspot/cpu/riscv/vm_version_riscv.hpp +index 20fd260ee..99a4c3dd4 100644 +--- a/src/hotspot/cpu/riscv/vm_version_riscv.hpp ++++ b/src/hotspot/cpu/riscv/vm_version_riscv.hpp +@@ -259,6 +259,8 @@ class VM_Version : public Abstract_VM_Version { + + constexpr static bool supports_stack_watermark_barrier() { return true; } + ++ constexpr static bool supports_recursive_lightweight_locking() { return true; } ++ + static bool supports_on_spin_wait() { return UseZihintpause; } + + // RISCV64 supports fast class initialization checks +diff --git a/src/hotspot/share/prims/whitebox.cpp b/src/hotspot/share/prims/whitebox.cpp +index 50410d1fa..9271f5c60 100644 +--- a/src/hotspot/share/prims/whitebox.cpp ++++ b/src/hotspot/share/prims/whitebox.cpp +@@ -82,6 +82,7 @@ + #include "runtime/javaCalls.hpp" + #include "runtime/javaThread.inline.hpp" + #include "runtime/jniHandles.inline.hpp" ++#include "runtime/lockStack.hpp" + #include "runtime/os.hpp" + #include "runtime/stackFrameStream.inline.hpp" + #include "runtime/synchronizer.hpp" +@@ -1856,6 +1857,14 @@ WB_ENTRY(jboolean, WB_IsUbsanEnabled(JNIEnv* env)) + return (jboolean) WhiteBox::is_ubsan_enabled(); + WB_END + ++WB_ENTRY(jint, WB_getLockStackCapacity(JNIEnv* env)) ++ return (jint) LockStack::CAPACITY; ++WB_END ++ ++WB_ENTRY(jboolean, WB_supportsRecursiveLightweightLocking(JNIEnv* env)) ++ return (jboolean) VM_Version::supports_recursive_lightweight_locking(); ++WB_END ++ + WB_ENTRY(jboolean, WB_DeflateIdleMonitors(JNIEnv* env, jobject wb)) + log_info(monitorinflation)("WhiteBox initiated DeflateIdleMonitors"); + return ObjectSynchronizer::request_deflate_idle_monitors_from_wb(); +@@ -2782,6 +2791,8 @@ static JNINativeMethod methods[] = { + (void*)&WB_AddModuleExportsToAll }, + {CC"deflateIdleMonitors", CC"()Z", (void*)&WB_DeflateIdleMonitors }, + {CC"isMonitorInflated0", CC"(Ljava/lang/Object;)Z", 
(void*)&WB_IsMonitorInflated }, ++ {CC"getLockStackCapacity", CC"()I", (void*)&WB_getLockStackCapacity }, ++ {CC"supportsRecursiveLightweightLocking", CC"()Z", (void*)&WB_supportsRecursiveLightweightLocking }, + {CC"isAsanEnabled", CC"()Z", (void*)&WB_IsAsanEnabled }, + {CC"isUbsanEnabled", CC"()Z", (void*)&WB_IsUbsanEnabled }, + {CC"forceSafepoint", CC"()V", (void*)&WB_ForceSafepoint }, +diff --git a/src/hotspot/share/runtime/abstract_vm_version.hpp b/src/hotspot/share/runtime/abstract_vm_version.hpp +index 4bf0741a2..fb5db3f47 100644 +--- a/src/hotspot/share/runtime/abstract_vm_version.hpp ++++ b/src/hotspot/share/runtime/abstract_vm_version.hpp +@@ -1,5 +1,5 @@ + /* +- * Copyright (c) 1997, 2023, Oracle and/or its affiliates. All rights reserved. ++ * Copyright (c) 1997, 2024, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it +@@ -183,6 +183,9 @@ class Abstract_VM_Version: AllStatic { + // Does platform support secondary supers table lookup? + constexpr static bool supports_secondary_supers_table() { return false; } + ++ // Is recursive lightweight locking implemented for this platform? ++ constexpr static bool supports_recursive_lightweight_locking() { return false; } ++ + // Does platform support float16 instructions? + static bool supports_float16() { return false; } + +diff --git a/src/hotspot/share/runtime/deoptimization.cpp b/src/hotspot/share/runtime/deoptimization.cpp +index 2058da0ff..4f82836a4 100644 +--- a/src/hotspot/share/runtime/deoptimization.cpp ++++ b/src/hotspot/share/runtime/deoptimization.cpp +@@ -1665,13 +1665,13 @@ bool Deoptimization::relock_objects(JavaThread* thread, GrowableArrayowner()->is_locked(), "object must be locked now"); +- ObjectMonitor* mon = ObjectSynchronizer::inflate(deoptee_thread, obj(), ObjectSynchronizer::inflate_cause_vm_internal); ++ ObjectMonitor* mon = ObjectSynchronizer::inflate_for(deoptee_thread, obj(), ObjectSynchronizer::inflate_cause_vm_internal); + assert(mon->owner() == deoptee_thread, "must be"); + } else { + BasicLock* lock = mon_info->lock(); +- ObjectSynchronizer::enter(obj, lock, deoptee_thread); ++ ObjectSynchronizer::enter_for(obj, lock, deoptee_thread); + assert(mon_info->owner()->is_locked(), "object must be locked now"); + } + } +diff --git a/src/hotspot/share/runtime/lockStack.cpp b/src/hotspot/share/runtime/lockStack.cpp +index b4a3bf1e8..d7dcbdda7 100644 +--- a/src/hotspot/share/runtime/lockStack.cpp ++++ b/src/hotspot/share/runtime/lockStack.cpp +@@ -1,6 +1,7 @@ + /* + * Copyright (c) 2022, Red Hat, Inc. All rights reserved. + * Copyright Amazon.com Inc. or its affiliates. All Rights Reserved. ++ * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+  *
+  * This code is free software; you can redistribute it and/or modify it
+@@ -25,20 +26,30 @@
+ 
+ #include "precompiled.hpp"
+ #include "memory/allocation.hpp"
++#include "runtime/globals.hpp"
+ #include "runtime/lockStack.inline.hpp"
+ #include "runtime/safepoint.hpp"
+ #include "runtime/stackWatermark.hpp"
+ #include "runtime/stackWatermarkSet.inline.hpp"
+ #include "runtime/thread.hpp"
+ #include "utilities/copy.hpp"
++#include "utilities/debug.hpp"
++#include "utilities/globalDefinitions.hpp"
+ #include "utilities/ostream.hpp"
+ 
++#include <type_traits>
++
+ const int LockStack::lock_stack_offset = in_bytes(JavaThread::lock_stack_offset());
+ const int LockStack::lock_stack_top_offset = in_bytes(JavaThread::lock_stack_top_offset());
+ const int LockStack::lock_stack_base_offset = in_bytes(JavaThread::lock_stack_base_offset());
+ 
+ LockStack::LockStack(JavaThread* jt) :
+   _top(lock_stack_base_offset), _base() {
++  // Make sure the layout of the object is compatible with the emitted code's assumptions.
++  STATIC_ASSERT(sizeof(_bad_oop_sentinel) == oopSize);
++  STATIC_ASSERT(sizeof(_base[0]) == oopSize);
++  STATIC_ASSERT(std::is_standard_layout<LockStack>::value);
++  STATIC_ASSERT(offsetof(LockStack, _bad_oop_sentinel) == offsetof(LockStack, _base) - oopSize);
+ #ifdef ASSERT
+   for (int i = 0; i < CAPACITY; i++) {
+     _base[i] = nullptr;
+@@ -62,11 +73,21 @@ uint32_t LockStack::end_offset() {
+ void LockStack::verify(const char* msg) const {
+   assert(LockingMode == LM_LIGHTWEIGHT, "never use lock-stack when light weight locking is disabled");
+   assert((_top <= end_offset()), "lockstack overflow: _top %d end_offset %d", _top, end_offset());
+-  assert((_top >= start_offset()), "lockstack underflow: _top %d end_offset %d", _top, start_offset());
++  assert((_top >= start_offset()), "lockstack underflow: _top %d start_offset %d", _top, start_offset());
+   if (SafepointSynchronize::is_at_safepoint() || (Thread::current()->is_Java_thread() && is_owning_thread())) {
+     int top = to_index(_top);
+     for (int i = 0; i < top; i++) {
+       assert(_base[i] != nullptr, "no zapped before top");
++      if (VM_Version::supports_recursive_lightweight_locking()) {
++        oop o = _base[i];
++        for (; i < top - 1; i++) {
++          // Consecutive entries may be the same
++          if (_base[i + 1] != o) {
++            break;
++          }
++        }
++      }
++
+       for (int j = i + 1; j < top; j++) {
+         assert(_base[i] != _base[j], "entries must be unique: %s", msg);
+       }
+diff --git a/src/hotspot/share/runtime/lockStack.hpp b/src/hotspot/share/runtime/lockStack.hpp
+index 25ab7a8de..45649b86a 100644
+--- a/src/hotspot/share/runtime/lockStack.hpp
++++ b/src/hotspot/share/runtime/lockStack.hpp
+@@ -1,6 +1,7 @@
+ /*
+  * Copyright (c) 2022, Red Hat, Inc. All rights reserved.
+  * Copyright Amazon.com Inc. or its affiliates. All Rights Reserved.
++ * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
+  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ * + * This code is free software; you can redistribute it and/or modify it +@@ -35,9 +36,11 @@ class OopClosure; + class outputStream; + + class LockStack { ++ friend class LockStackTest; + friend class VMStructs; +-private: ++public: + static const int CAPACITY = 8; ++private: + + // TODO: It would be very useful if JavaThread::lock_stack_offset() and friends were constexpr, + // but this is currently not the case because we're using offset_of() which is non-constexpr, +@@ -50,6 +53,9 @@ private: + // We do this instead of a simple index into the array because this allows for + // efficient addressing in generated code. + uint32_t _top; ++ // The _bad_oop_sentinel acts as a sentinel value to elide underflow checks in generated code. ++ // The correct layout is statically asserted in the constructor. ++ const uintptr_t _bad_oop_sentinel = badOopVal; + oop _base[CAPACITY]; + + // Get the owning thread of this lock-stack. +@@ -74,17 +80,35 @@ public: + static uint32_t start_offset(); + static uint32_t end_offset(); + +- // Return true if we have room to push onto this lock-stack, false otherwise. +- inline bool can_push() const; ++ // Returns true if the lock-stack is full. False otherwise. ++ inline bool is_full() const; + + // Pushes an oop on this lock-stack. + inline void push(oop o); + +- // Pops an oop from this lock-stack. +- inline oop pop(); ++ // Get the oldest oop from this lock-stack. ++ // Precondition: This lock-stack must not be empty. ++ inline oop bottom() const; ++ ++ // Is the lock-stack empty. ++ inline bool is_empty() const; ++ ++ // Check if object is recursive. ++ // Precondition: This lock-stack must contain the oop. ++ inline bool is_recursive(oop o) const; ++ ++ // Try recursive enter. ++ // Precondition: This lock-stack must not be full. ++ inline bool try_recursive_enter(oop o); ++ ++ // Try recursive exit. ++ // Precondition: This lock-stack must contain the oop. ++ inline bool try_recursive_exit(oop o); + + // Removes an oop from an arbitrary location of this lock-stack. +- inline void remove(oop o); ++ // Precondition: This lock-stack must contain the oop. ++ // Returns the number of oops removed. ++ inline size_t remove(oop o); + + // Tests whether the oop is on this lock-stack. + inline bool contains(oop o) const; +diff --git a/src/hotspot/share/runtime/lockStack.inline.hpp b/src/hotspot/share/runtime/lockStack.inline.hpp +index b36be2f72..7a9874a92 100644 +--- a/src/hotspot/share/runtime/lockStack.inline.hpp ++++ b/src/hotspot/share/runtime/lockStack.inline.hpp +@@ -1,6 +1,7 @@ + /* + * Copyright (c) 2022, Red Hat, Inc. All rights reserved. + * Copyright Amazon.com Inc. or its affiliates. All Rights Reserved. ++ * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
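Illustrative aside, not part of the backported patch: the _bad_oop_sentinel introduced above only works because it sits exactly one word below _base, which is what the constructor's STATIC_ASSERTs pin down. Below is a minimal standalone sketch of the same layout trick, with made-up names (MiniLockStack, Slot, kBadSlot) standing in for the real LockStack, oop and badOopVal, and a plain index in place of the byte-offset _top.

    #include <cassert>
    #include <cstdint>

    // Illustration only: Slot stands in for oop, kBadSlot for badOopVal.
    using Slot = uintptr_t;
    constexpr Slot kBadSlot = 0xbadbeef;

    struct MiniLockStack {
      static const int CAPACITY = 8;
      // One backing array: slot 0 is the poison sentinel, slots 1..CAPACITY hold entries.
      // This models "_bad_oop_sentinel placed exactly one word below _base".
      Slot _slots[1 + CAPACITY] = { kBadSlot };
      int  _top = 0;                       // number of valid entries (simplified)

      Slot* base() { return _slots + 1; }  // what the real code calls _base

      // "Does the top entry equal s?" with no explicit emptiness check:
      // when _top == 0 this reads the sentinel, which never matches a real entry.
      bool top_matches(Slot s) { return base()[_top - 1] == s; }

      void push(Slot s) { base()[_top++] = s; }
    };

    int main() {
      MiniLockStack ls;
      assert(!ls.top_matches(0x1000));  // empty: the comparison hits kBadSlot
      ls.push(0x1000);
      assert(ls.top_matches(0x1000));
      return 0;
    }

Because an empty stack resolves the "one below top" read to the poison word, emitted code can test "top entry == obj" without a separate underflow branch, which is the stated purpose of the sentinel.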
+ * + * This code is free software; you can redistribute it and/or modify it +@@ -26,14 +27,20 @@ + #ifndef SHARE_RUNTIME_LOCKSTACK_INLINE_HPP + #define SHARE_RUNTIME_LOCKSTACK_INLINE_HPP + ++#include "runtime/lockStack.hpp" ++ + #include "memory/iterator.hpp" + #include "runtime/javaThread.hpp" +-#include "runtime/lockStack.hpp" + #include "runtime/safepoint.hpp" + #include "runtime/stackWatermark.hpp" + #include "runtime/stackWatermarkSet.inline.hpp" ++#include "utilities/align.hpp" ++#include "utilities/globalDefinitions.hpp" + + inline int LockStack::to_index(uint32_t offset) { ++ assert(is_aligned(offset, oopSize), "Bad alignment: %u", offset); ++ assert((offset <= end_offset()), "lockstack overflow: offset %d end_offset %d", offset, end_offset()); ++ assert((offset >= start_offset()), "lockstack underflow: offset %d start_offset %d", offset, start_offset()); + return (offset - lock_stack_base_offset) / oopSize; + } + +@@ -42,8 +49,8 @@ JavaThread* LockStack::get_thread() const { + return reinterpret_cast(addr - lock_stack_offset); + } + +-inline bool LockStack::can_push() const { +- return to_index(_top) < CAPACITY; ++inline bool LockStack::is_full() const { ++ return to_index(_top) == CAPACITY; + } + + inline bool LockStack::is_owning_thread() const { +@@ -61,45 +68,132 @@ inline void LockStack::push(oop o) { + verify("pre-push"); + assert(oopDesc::is_oop(o), "must be"); + assert(!contains(o), "entries must be unique"); +- assert(can_push(), "must have room"); ++ assert(!is_full(), "must have room"); + assert(_base[to_index(_top)] == nullptr, "expect zapped entry"); + _base[to_index(_top)] = o; + _top += oopSize; + verify("post-push"); + } + +-inline oop LockStack::pop() { +- verify("pre-pop"); +- assert(to_index(_top) > 0, "underflow, probably unbalanced push/pop"); ++inline oop LockStack::bottom() const { ++ assert(to_index(_top) > 0, "must contain an oop"); ++ return _base[0]; ++} ++ ++inline bool LockStack::is_empty() const { ++ return to_index(_top) == 0; ++} ++ ++inline bool LockStack::is_recursive(oop o) const { ++ if (!VM_Version::supports_recursive_lightweight_locking()) { ++ return false; ++ } ++ verify("pre-is_recursive"); ++ ++ // This will succeed iff there is a consecutive run of oops on the ++ // lock-stack with a length of at least 2. ++ ++ assert(contains(o), "at least one entry must exist"); ++ int end = to_index(_top); ++ // Start iterating from the top because the runtime code is more ++ // interested in the balanced locking case when the top oop on the ++ // lock-stack matches o. This will cause the for loop to break out ++ // in the first loop iteration if it is non-recursive. ++ for (int i = end - 1; i > 0; i--) { ++ if (_base[i - 1] == o && _base[i] == o) { ++ verify("post-is_recursive"); ++ return true; ++ } ++ if (_base[i] == o) { ++ // o can only occur in one consecutive run on the lock-stack. ++ // Only one of the two oops checked matched o, so this run ++ // must be of length 1 and thus not be recursive. Stop the search. ++ break; ++ } ++ } ++ ++ verify("post-is_recursive"); ++ return false; ++} ++ ++inline bool LockStack::try_recursive_enter(oop o) { ++ if (!VM_Version::supports_recursive_lightweight_locking()) { ++ return false; ++ } ++ verify("pre-try_recursive_enter"); ++ ++ // This will succeed iff the top oop on the stack matches o. ++ // When successful o will be pushed to the lock-stack creating ++ // a consecutive run at least 2 oops that matches o on top of ++ // the lock-stack. 
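Illustrative aside, not part of the backported patch: is_recursive, try_recursive_enter and try_recursive_exit above all rely on one invariant, namely that an object occurs on the lock-stack as at most one consecutive run, and the run length is the recursion depth. A small self-contained model of that bookkeeping follows; RunStack and Obj are made-up types, with plain pointers instead of oops and none of the verify()/VM_Version gating.

    #include <cassert>

    struct Obj {};

    struct RunStack {
      static const int CAPACITY = 8;
      Obj* slots[CAPACITY] = {};
      int  top = 0;   // number of entries

      bool is_full()  const { return top == CAPACITY; }

      void push(Obj* o) { assert(!is_full()); slots[top++] = o; }

      // Succeeds only if o is already the topmost entry: the run on top grows by one.
      bool try_recursive_enter(Obj* o) {
        if (is_full() || top == 0 || slots[top - 1] != o) return false;
        slots[top++] = o;
        return true;
      }

      // Succeeds only if the two topmost entries are both o: the run shrinks by one.
      bool try_recursive_exit(Obj* o) {
        if (top < 2 || slots[top - 1] != o || slots[top - 2] != o) return false;
        slots[--top] = nullptr;
        return true;
      }

      // True iff o occurs in a consecutive run of length >= 2 somewhere on the stack.
      bool is_recursive(Obj* o) const {
        for (int i = top - 1; i > 0; i--) {
          if (slots[i] == o && slots[i - 1] == o) return true;
          if (slots[i] == o) break;  // only one run per object, and this run has length 1
        }
        return false;
      }
    };

    int main() {
      Obj a, b;
      RunStack ls;
      ls.push(&a);                              // [a]
      assert(!ls.is_recursive(&a));
      assert(ls.try_recursive_enter(&a));       // [a, a]  recursion depth 2
      assert(ls.is_recursive(&a));
      ls.push(&b);                              // [a, a, b]
      assert(!ls.try_recursive_exit(&a));       // a is recursive but not on top: unbalanced
      assert(ls.try_recursive_enter(&b));       // [a, a, b, b]
      assert(ls.try_recursive_exit(&b));        // [a, a, b]
      return 0;
    }

The unbalanced case in main() (exiting a while b is on top) is exactly the situation the runtime hands over to monitor inflation rather than handling on the lock-stack.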
++ ++ assert(!is_full(), "precond"); ++ ++ int end = to_index(_top); ++ if (end == 0 || _base[end - 1] != o) { ++ // Topmost oop does not match o. ++ verify("post-try_recursive_enter"); ++ return false; ++ } ++ ++ _base[end] = o; ++ _top += oopSize; ++ verify("post-try_recursive_enter"); ++ return true; ++} ++ ++inline bool LockStack::try_recursive_exit(oop o) { ++ if (!VM_Version::supports_recursive_lightweight_locking()) { ++ return false; ++ } ++ verify("pre-try_recursive_exit"); ++ ++ // This will succeed iff the top two oops on the stack matches o. ++ // When successful the top oop will be popped of the lock-stack. ++ // When unsuccessful the lock may still be recursive, in which ++ // case the locking is unbalanced. This case is handled externally. ++ ++ assert(contains(o), "entries must exist"); ++ ++ int end = to_index(_top); ++ if (end <= 1 || _base[end - 1] != o || _base[end - 2] != o) { ++ // The two topmost oops do not match o. ++ verify("post-try_recursive_exit"); ++ return false; ++ } ++ + _top -= oopSize; +- oop o = _base[to_index(_top)]; +-#ifdef ASSERT +- _base[to_index(_top)] = nullptr; +-#endif +- assert(!contains(o), "entries must be unique: " PTR_FORMAT, p2i(o)); +- verify("post-pop"); +- return o; ++ DEBUG_ONLY(_base[to_index(_top)] = nullptr;) ++ verify("post-try_recursive_exit"); ++ return true; + } + +-inline void LockStack::remove(oop o) { ++inline size_t LockStack::remove(oop o) { + verify("pre-remove"); + assert(contains(o), "entry must be present: " PTR_FORMAT, p2i(o)); ++ + int end = to_index(_top); ++ int inserted = 0; + for (int i = 0; i < end; i++) { +- if (_base[i] == o) { +- int last = end - 1; +- for (; i < last; i++) { +- _base[i] = _base[i + 1]; ++ if (_base[i] != o) { ++ if (inserted != i) { ++ _base[inserted] = _base[i]; + } +- _top -= oopSize; +-#ifdef ASSERT +- _base[to_index(_top)] = nullptr; +-#endif +- break; ++ inserted++; + } + } +- assert(!contains(o), "entries must be unique: " PTR_FORMAT, p2i(o)); ++ ++#ifdef ASSERT ++ for (int i = inserted; i < end; i++) { ++ _base[i] = nullptr; ++ } ++#endif ++ ++ uint32_t removed = end - inserted; ++ _top -= removed * oopSize; ++ assert(!contains(o), "entry must have been removed: " PTR_FORMAT, p2i(o)); + verify("post-remove"); ++ return removed; + } + + inline bool LockStack::contains(oop o) const { +diff --git a/src/hotspot/share/runtime/objectMonitor.cpp b/src/hotspot/share/runtime/objectMonitor.cpp +index ee0f754b8..696803bbe 100644 +--- a/src/hotspot/share/runtime/objectMonitor.cpp ++++ b/src/hotspot/share/runtime/objectMonitor.cpp +@@ -1,5 +1,5 @@ + /* +- * Copyright (c) 1998, 2023, Oracle and/or its affiliates. All rights reserved. ++ * Copyright (c) 1998, 2024, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This code is free software; you can redistribute it and/or modify it +@@ -39,6 +39,7 @@ + #include "prims/jvmtiDeferredUpdates.hpp" + #include "prims/jvmtiExport.hpp" + #include "runtime/atomic.hpp" ++#include "runtime/globals.hpp" + #include "runtime/handles.inline.hpp" + #include "runtime/interfaceSupport.inline.hpp" + #include "runtime/javaThread.inline.hpp" +@@ -53,6 +54,7 @@ + #include "runtime/sharedRuntime.hpp" + #include "services/threadService.hpp" + #include "utilities/dtrace.hpp" ++#include "utilities/globalDefinitions.hpp" + #include "utilities/macros.hpp" + #include "utilities/preserveException.hpp" + #if INCLUDE_JFR +@@ -312,7 +314,70 @@ void ObjectMonitor::ClearSuccOnSuspend::operator()(JavaThread* current) { + // ----------------------------------------------------------------------------- + // Enter support + ++bool ObjectMonitor::enter_for(JavaThread* locking_thread) { ++ // Used by ObjectSynchronizer::enter_for to enter for another thread. ++ // The monitor is private to or already owned by locking_thread which must be suspended. ++ // So this code may only contend with deflation. ++ assert(locking_thread == Thread::current() || locking_thread->is_obj_deopt_suspend(), "must be"); ++ ++ // Block out deflation as soon as possible. ++ add_to_contentions(1); ++ ++ bool success = false; ++ if (!is_being_async_deflated()) { ++ void* prev_owner = try_set_owner_from(nullptr, locking_thread); ++ ++ if (prev_owner == nullptr) { ++ assert(_recursions == 0, "invariant"); ++ success = true; ++ } else if (prev_owner == locking_thread) { ++ _recursions++; ++ success = true; ++ } else if (prev_owner == DEFLATER_MARKER) { ++ // Racing with deflation. ++ prev_owner = try_set_owner_from(DEFLATER_MARKER, locking_thread); ++ if (prev_owner == DEFLATER_MARKER) { ++ // Cancelled deflation. Increment contentions as part of the deflation protocol. ++ add_to_contentions(1); ++ success = true; ++ } else if (prev_owner == nullptr) { ++ // At this point we cannot race with deflation as we have both incremented ++ // contentions, seen contention > 0 and seen a DEFLATER_MARKER. ++ // success will only be false if this races with something other than ++ // deflation. ++ prev_owner = try_set_owner_from(nullptr, locking_thread); ++ success = prev_owner == nullptr; ++ } ++ } else if (LockingMode == LM_LEGACY && locking_thread->is_lock_owned((address)prev_owner)) { ++ assert(_recursions == 0, "must be"); ++ _recursions = 1; ++ set_owner_from_BasicLock(prev_owner, locking_thread); ++ success = true; ++ } ++ assert(success, "Failed to enter_for: locking_thread=" INTPTR_FORMAT ++ ", this=" INTPTR_FORMAT "{owner=" INTPTR_FORMAT "}, observed owner: " INTPTR_FORMAT, ++ p2i(locking_thread), p2i(this), p2i(owner_raw()), p2i(prev_owner)); ++ } else { ++ // Async deflation is in progress and our contentions increment ++ // above lost the race to async deflation. Undo the work and ++ // force the caller to retry. ++ const oop l_object = object(); ++ if (l_object != nullptr) { ++ // Attempt to restore the header/dmw to the object's header so that ++ // we only retry once if the deflater thread happens to be slow. 
++      install_displaced_markword_in_object(l_object);
++    }
++  }
++
++  add_to_contentions(-1);
++
++  assert(!success || owner_raw() == locking_thread, "must be");
++
++  return success;
++}
++
+ bool ObjectMonitor::enter(JavaThread* current) {
++  assert(current == JavaThread::current(), "must be");
+   // The following code is ordered to check the most common cases first
+   // and to reduce RTS->RTO cache line upgrades on SPARC and IA32 processors.
+ 
+diff --git a/src/hotspot/share/runtime/objectMonitor.hpp b/src/hotspot/share/runtime/objectMonitor.hpp
+index d6c0e31f7..a56b6f8fb 100644
+--- a/src/hotspot/share/runtime/objectMonitor.hpp
++++ b/src/hotspot/share/runtime/objectMonitor.hpp
+@@ -1,5 +1,5 @@
+ /*
+- * Copyright (c) 1998, 2023, Oracle and/or its affiliates. All rights reserved.
++ * Copyright (c) 1998, 2024, Oracle and/or its affiliates. All rights reserved.
+  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+  *
+  * This code is free software; you can redistribute it and/or modify it
+@@ -298,6 +298,7 @@ private:
+   int contentions() const;
+   void add_to_contentions(int value);
+   intx recursions() const { return _recursions; }
++  void set_recursions(size_t recursions);
+ 
+   // JVM/TI GetObjectMonitorUsage() needs this:
+   ObjectWaiter* first_waiter() { return _WaitSet; }
+@@ -332,6 +333,7 @@ private:
+     void operator()(JavaThread* current);
+   };
+  public:
++  bool enter_for(JavaThread* locking_thread);
+   bool enter(JavaThread* current);
+   void exit(JavaThread* current, bool not_suspended = true);
+   void wait(jlong millis, bool interruptible, TRAPS);
+diff --git a/src/hotspot/share/runtime/objectMonitor.inline.hpp b/src/hotspot/share/runtime/objectMonitor.inline.hpp
+index 36790925b..b371663ee 100644
+--- a/src/hotspot/share/runtime/objectMonitor.inline.hpp
++++ b/src/hotspot/share/runtime/objectMonitor.inline.hpp
+@@ -1,5 +1,5 @@
+ /*
+- * Copyright (c) 1998, 2023, Oracle and/or its affiliates. All rights reserved.
++ * Copyright (c) 1998, 2024, Oracle and/or its affiliates. All rights reserved.
+  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+  *
+  * This code is free software; you can redistribute it and/or modify it
+@@ -102,6 +102,12 @@ inline void ObjectMonitor::add_to_contentions(int value) {
+   Atomic::add(&_contentions, value);
+ }
+ 
++inline void ObjectMonitor::set_recursions(size_t recursions) {
++  assert(_recursions == 0, "must be");
++  assert(has_owner(), "must be owned");
++  _recursions = checked_cast<intx>(recursions);
++}
++
+ // Clear _owner field; current value must match old_value.
+ inline void ObjectMonitor::release_clear_owner(void* old_value) { + #ifdef ASSERT +diff --git a/src/hotspot/share/runtime/synchronizer.cpp b/src/hotspot/share/runtime/synchronizer.cpp +index cc73082ed..cdcea5436 100644 +--- a/src/hotspot/share/runtime/synchronizer.cpp ++++ b/src/hotspot/share/runtime/synchronizer.cpp +@@ -36,6 +36,7 @@ + #include "oops/oop.inline.hpp" + #include "runtime/atomic.hpp" + #include "runtime/frame.inline.hpp" ++#include "runtime/globals.hpp" + #include "runtime/handles.inline.hpp" + #include "runtime/handshake.hpp" + #include "runtime/interfaceSupport.inline.hpp" +@@ -60,6 +61,7 @@ + #include "utilities/align.hpp" + #include "utilities/dtrace.hpp" + #include "utilities/events.hpp" ++#include "utilities/globalDefinitions.hpp" + #include "utilities/linkedlist.hpp" + #include "utilities/preserveException.hpp" + +@@ -384,6 +386,19 @@ bool ObjectSynchronizer::quick_enter(oop obj, JavaThread* current, + return false; + } + ++ if (LockingMode == LM_LIGHTWEIGHT) { ++ LockStack& lock_stack = current->lock_stack(); ++ if (lock_stack.is_full()) { ++ // Always go into runtime if the lock stack is full. ++ return false; ++ } ++ if (lock_stack.try_recursive_enter(obj)) { ++ // Recursive lock successful. ++ current->inc_held_monitor_count(); ++ return true; ++ } ++ } ++ + const markWord mark = obj->mark(); + + if (mark.has_monitor()) { +@@ -437,8 +452,9 @@ bool ObjectSynchronizer::quick_enter(oop obj, JavaThread* current, + } + + // Handle notifications when synchronizing on value based classes +-void ObjectSynchronizer::handle_sync_on_value_based_class(Handle obj, JavaThread* current) { +- frame last_frame = current->last_frame(); ++void ObjectSynchronizer::handle_sync_on_value_based_class(Handle obj, JavaThread* locking_thread) { ++ assert(locking_thread == Thread::current() || locking_thread->is_obj_deopt_suspend(), "must be"); ++ frame last_frame = locking_thread->last_frame(); + bool bcp_was_adjusted = false; + // Don't decrement bcp if it points to the frame's first instruction. This happens when + // handle_sync_on_value_based_class() is called because of a synchronized method. 
There +@@ -451,9 +467,9 @@ void ObjectSynchronizer::handle_sync_on_value_based_class(Handle obj, JavaThread + } + + if (DiagnoseSyncOnValueBasedClasses == FATAL_EXIT) { +- ResourceMark rm(current); ++ ResourceMark rm; + stringStream ss; +- current->print_active_stack_on(&ss); ++ locking_thread->print_active_stack_on(&ss); + char* base = (char*)strstr(ss.base(), "at"); + char* newline = (char*)strchr(ss.base(), '\n'); + if (newline != nullptr) { +@@ -462,13 +478,13 @@ void ObjectSynchronizer::handle_sync_on_value_based_class(Handle obj, JavaThread + fatal("Synchronizing on object " INTPTR_FORMAT " of klass %s %s", p2i(obj()), obj->klass()->external_name(), base); + } else { + assert(DiagnoseSyncOnValueBasedClasses == LOG_WARNING, "invalid value for DiagnoseSyncOnValueBasedClasses"); +- ResourceMark rm(current); ++ ResourceMark rm; + Log(valuebasedclasses) vblog; + + vblog.info("Synchronizing on object " INTPTR_FORMAT " of klass %s", p2i(obj()), obj->klass()->external_name()); +- if (current->has_last_Java_frame()) { ++ if (locking_thread->has_last_Java_frame()) { + LogStream info_stream(vblog.info()); +- current->print_active_stack_on(&info_stream); ++ locking_thread->print_active_stack_on(&info_stream); + } else { + vblog.info("Cannot find the last Java frame"); + } +@@ -495,38 +511,111 @@ static bool useHeavyMonitors() { + + // ----------------------------------------------------------------------------- + // Monitor Enter/Exit ++ ++void ObjectSynchronizer::enter_for(Handle obj, BasicLock* lock, JavaThread* locking_thread) { ++ // When called with locking_thread != Thread::current() some mechanism must synchronize ++ // the locking_thread with respect to the current thread. Currently only used when ++ // deoptimizing and re-locking locks. See Deoptimization::relock_objects ++ assert(locking_thread == Thread::current() || locking_thread->is_obj_deopt_suspend(), "must be"); ++ if (!enter_fast_impl(obj, lock, locking_thread)) { ++ // Inflated ObjectMonitor::enter_for is required ++ ++ // An async deflation can race after the inflate_for() call and before ++ // enter_for() can make the ObjectMonitor busy. enter_for() returns false ++ // if we have lost the race to async deflation and we simply try again. ++ while (true) { ++ ObjectMonitor* monitor = inflate_for(locking_thread, obj(), inflate_cause_monitor_enter); ++ if (monitor->enter_for(locking_thread)) { ++ return; ++ } ++ assert(monitor->is_being_async_deflated(), "must be"); ++ } ++ } ++} ++ ++void ObjectSynchronizer::enter(Handle obj, BasicLock* lock, JavaThread* current) { ++ assert(current == Thread::current(), "must be"); ++ if (!enter_fast_impl(obj, lock, current)) { ++ // Inflated ObjectMonitor::enter is required ++ ++ // An async deflation can race after the inflate() call and before ++ // enter() can make the ObjectMonitor busy. enter() returns false if ++ // we have lost the race to async deflation and we simply try again. ++ while (true) { ++ ObjectMonitor* monitor = inflate(current, obj(), inflate_cause_monitor_enter); ++ if (monitor->enter(current)) { ++ return; ++ } ++ } ++ } ++} ++ + // The interpreter and compiler assembly code tries to lock using the fast path + // of this algorithm. Make sure to update that code if the following function is + // changed. The implementation is extremely sensitive to race condition. Be careful. 
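Illustrative aside, not part of the backported patch: the retry loops in enter() and enter_for() above only work because the locking side announces itself (by incrementing contentions) before checking whether async deflation has already claimed the monitor, while the deflater marks the monitor before checking for contenders. A deliberately simplified model of that ordering follows; ToyMonitor is made up, and a separate being_deflated flag stands in for the real DEFLATER_MARKER/negative-contentions encoding.

    #include <atomic>
    #include <cassert>

    struct ToyMonitor {
      std::atomic<int>  contentions{0};
      std::atomic<bool> being_deflated{false};
    };

    // Locker side: announce interest first, then check whether deflation already won.
    bool try_enter(ToyMonitor& m) {
      m.contentions.fetch_add(1);        // block out deflation as soon as possible
      if (m.being_deflated.load()) {
        m.contentions.fetch_sub(1);      // lost the race: undo and let the caller retry
        return false;                    // caller re-inflates and tries again
      }
      // ... acquire ownership here (ownership, not the count, keeps the monitor busy) ...
      m.contentions.fetch_sub(1);
      return true;
    }

    // Deflater side: mark first, then back off if anyone has already announced interest.
    bool try_deflate(ToyMonitor& m) {
      m.being_deflated.store(true);
      if (m.contentions.load() > 0) {
        m.being_deflated.store(false);   // deflation cancelled
        return false;
      }
      return true;                       // safe to free the monitor
    }

    int main() {
      ToyMonitor m;
      assert(try_enter(m));              // no deflation in flight: enter succeeds

      m.contentions.fetch_add(1);        // a locker has announced interest...
      assert(!try_deflate(m));           // ...so deflation must back off
      m.contentions.fetch_sub(1);

      assert(try_deflate(m));            // idle monitor: deflation wins
      assert(!try_enter(m));             // late locker sees the marker and retries elsewhere
      return 0;
    }

With sequentially consistent atomics at most one side can win a genuine race: whichever check runs second observes the other side's earlier write, so the loser either retries on a fresh monitor (locker) or cancels deflation (deflater).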
++bool ObjectSynchronizer::enter_fast_impl(Handle obj, BasicLock* lock, JavaThread* locking_thread) { + +-void ObjectSynchronizer::enter(Handle obj, BasicLock* lock, JavaThread* current) { + if (obj->klass()->is_value_based()) { +- handle_sync_on_value_based_class(obj, current); ++ handle_sync_on_value_based_class(obj, locking_thread); + } + +- current->inc_held_monitor_count(); ++ locking_thread->inc_held_monitor_count(); + + if (!useHeavyMonitors()) { + if (LockingMode == LM_LIGHTWEIGHT) { + // Fast-locking does not use the 'lock' argument. +- LockStack& lock_stack = current->lock_stack(); +- if (lock_stack.can_push()) { +- markWord mark = obj()->mark_acquire(); +- while (mark.is_neutral()) { +- // Retry until a lock state change has been observed. cas_set_mark() may collide with non lock bits modifications. +- // Try to swing into 'fast-locked' state. +- assert(!lock_stack.contains(obj()), "thread must not already hold the lock"); +- const markWord locked_mark = mark.set_fast_locked(); +- const markWord old_mark = obj()->cas_set_mark(locked_mark, mark); +- if (old_mark == mark) { +- // Successfully fast-locked, push object to lock-stack and return. +- lock_stack.push(obj()); +- return; +- } +- mark = old_mark; ++ LockStack& lock_stack = locking_thread->lock_stack(); ++ if (lock_stack.is_full()) { ++ // We unconditionally make room on the lock stack by inflating ++ // the least recently locked object on the lock stack. ++ ++ // About the choice to inflate least recently locked object. ++ // First we must chose to inflate a lock, either some lock on ++ // the lock-stack or the lock that is currently being entered ++ // (which may or may not be on the lock-stack). ++ // Second the best lock to inflate is a lock which is entered ++ // in a control flow where there are only a very few locks being ++ // used, as the costly part of inflated locking is inflation, ++ // not locking. But this property is entirely program dependent. ++ // Third inflating the lock currently being entered on when it ++ // is not present on the lock-stack will result in a still full ++ // lock-stack. This creates a scenario where every deeper nested ++ // monitorenter must call into the runtime. ++ // The rational here is as follows: ++ // Because we cannot (currently) figure out the second, and want ++ // to avoid the third, we inflate a lock on the lock-stack. ++ // The least recently locked lock is chosen as it is the lock ++ // with the longest critical section. ++ ++ log_info(monitorinflation)("LockStack capacity exceeded, inflating."); ++ ObjectMonitor* monitor = inflate_for(locking_thread, lock_stack.bottom(), inflate_cause_vm_internal); ++ assert(monitor->owner() == Thread::current(), "must be owner=" PTR_FORMAT " current=" PTR_FORMAT " mark=" PTR_FORMAT, ++ p2i(monitor->owner()), p2i(Thread::current()), monitor->object()->mark_acquire().value()); ++ assert(!lock_stack.is_full(), "must have made room here"); ++ } ++ ++ markWord mark = obj()->mark_acquire(); ++ while (mark.is_neutral()) { ++ // Retry until a lock state change has been observed. cas_set_mark() may collide with non lock bits modifications. ++ // Try to swing into 'fast-locked' state. ++ assert(!lock_stack.contains(obj()), "thread must not already hold the lock"); ++ const markWord locked_mark = mark.set_fast_locked(); ++ const markWord old_mark = obj()->cas_set_mark(locked_mark, mark); ++ if (old_mark == mark) { ++ // Successfully fast-locked, push object to lock-stack and return. 
++ lock_stack.push(obj()); ++ return true; + } ++ mark = old_mark; + } +- // All other paths fall-through to inflate-enter. ++ ++ if (mark.is_fast_locked() && lock_stack.try_recursive_enter(obj())) { ++ // Recursive lock successful. ++ return true; ++ } ++ ++ // Failed to fast lock. ++ return false; + } else if (LockingMode == LM_LEGACY) { + markWord mark = obj->mark(); + if (mark.is_neutral()) { +@@ -534,15 +623,14 @@ void ObjectSynchronizer::enter(Handle obj, BasicLock* lock, JavaThread* current) + // be visible <= the ST performed by the CAS. + lock->set_displaced_header(mark); + if (mark == obj()->cas_set_mark(markWord::from_pointer(lock), mark)) { +- return; ++ return true; + } +- // Fall through to inflate() ... + } else if (mark.has_locker() && +- current->is_lock_owned((address) mark.locker())) { ++ locking_thread->is_lock_owned((address) mark.locker())) { + assert(lock != mark.locker(), "must not re-lock the same lock"); + assert(lock != (BasicLock*) obj->mark().value(), "don't relock with same BasicLock"); + lock->set_displaced_header(markWord::from_pointer(nullptr)); +- return; ++ return true; + } + + // The object header will never be displaced to this lock, +@@ -550,20 +638,15 @@ void ObjectSynchronizer::enter(Handle obj, BasicLock* lock, JavaThread* current) + // must be non-zero to avoid looking like a re-entrant lock, + // and must not look locked either. + lock->set_displaced_header(markWord::unused_mark()); ++ ++ // Failed to fast lock. ++ return false; + } + } else if (VerifyHeavyMonitors) { + guarantee((obj->mark().value() & markWord::lock_mask_in_place) != markWord::locked_value, "must not be lightweight/stack-locked"); + } + +- // An async deflation can race after the inflate() call and before +- // enter() can make the ObjectMonitor busy. enter() returns false if +- // we have lost the race to async deflation and we simply try again. +- while (true) { +- ObjectMonitor* monitor = inflate(current, obj(), inflate_cause_monitor_enter); +- if (monitor->enter(current)) { +- return; +- } +- } ++ return false; + } + + void ObjectSynchronizer::exit(oop object, BasicLock* lock, JavaThread* current) { +@@ -573,15 +656,28 @@ void ObjectSynchronizer::exit(oop object, BasicLock* lock, JavaThread* current) + markWord mark = object->mark(); + if (LockingMode == LM_LIGHTWEIGHT) { + // Fast-locking does not use the 'lock' argument. +- while (mark.is_fast_locked()) { +- // Retry until a lock state change has been observed. cas_set_mark() may collide with non lock bits modifications. +- const markWord unlocked_mark = mark.set_unlocked(); +- const markWord old_mark = object->cas_set_mark(unlocked_mark, mark); +- if (old_mark == mark) { +- current->lock_stack().remove(object); +- return; ++ LockStack& lock_stack = current->lock_stack(); ++ if (mark.is_fast_locked() && lock_stack.try_recursive_exit(object)) { ++ // Recursively unlocked. ++ return; ++ } ++ ++ if (mark.is_fast_locked() && lock_stack.is_recursive(object)) { ++ // This lock is recursive but is not at the top of the lock stack so we're ++ // doing an unbalanced exit. We have to fall thru to inflation below and ++ // let ObjectMonitor::exit() do the unlock. ++ } else { ++ while (mark.is_fast_locked()) { ++ // Retry until a lock state change has been observed. cas_set_mark() may collide with non lock bits modifications. 
++ const markWord unlocked_mark = mark.set_unlocked(); ++ const markWord old_mark = object->cas_set_mark(unlocked_mark, mark); ++ if (old_mark == mark) { ++ size_t recursions = lock_stack.remove(object) - 1; ++ assert(recursions == 0, "must not be recursive here"); ++ return; ++ } ++ mark = old_mark; + } +- mark = old_mark; + } + } else if (LockingMode == LM_LEGACY) { + markWord dhw = lock->displaced_header(); +@@ -631,13 +727,7 @@ void ObjectSynchronizer::exit(oop object, BasicLock* lock, JavaThread* current) + // The ObjectMonitor* can't be async deflated until ownership is + // dropped inside exit() and the ObjectMonitor* must be !is_busy(). + ObjectMonitor* monitor = inflate(current, object, inflate_cause_vm_internal); +- if (LockingMode == LM_LIGHTWEIGHT && monitor->is_owner_anonymous()) { +- // It must be owned by us. Pop lock object from lock stack. +- LockStack& lock_stack = current->lock_stack(); +- oop popped = lock_stack.pop(); +- assert(popped == object, "must be owned by this thread"); +- monitor->set_owner_from_anonymous(current); +- } ++ assert(!monitor->is_owner_anonymous(), "must not be"); + monitor->exit(current); + } + +@@ -1313,15 +1403,28 @@ void ObjectSynchronizer::inflate_helper(oop obj) { + (void)inflate(Thread::current(), obj, inflate_cause_vm_internal); + } + +-// Can be called from non JavaThreads (e.g., VMThread) for FastHashCode +-// calculations as part of JVM/TI tagging. +-static bool is_lock_owned(Thread* thread, oop obj) { +- assert(LockingMode == LM_LIGHTWEIGHT, "only call this with new lightweight locking enabled"); +- return thread->is_Java_thread() ? JavaThread::cast(thread)->lock_stack().contains(obj) : false; ++ObjectMonitor* ObjectSynchronizer::inflate(Thread* current, oop obj, const InflateCause cause) { ++ assert(current == Thread::current(), "must be"); ++ if (LockingMode == LM_LIGHTWEIGHT && current->is_Java_thread()) { ++ return inflate_impl(JavaThread::cast(current), obj, cause); ++ } ++ return inflate_impl(nullptr, obj, cause); ++} ++ ++ObjectMonitor* ObjectSynchronizer::inflate_for(JavaThread* thread, oop obj, const InflateCause cause) { ++ assert(thread == Thread::current() || thread->is_obj_deopt_suspend(), "must be"); ++ return inflate_impl(thread, obj, cause); + } + +-ObjectMonitor* ObjectSynchronizer::inflate(Thread* current, oop object, +- const InflateCause cause) { ++ObjectMonitor* ObjectSynchronizer::inflate_impl(JavaThread* inflating_thread, oop object, const InflateCause cause) { ++ // The JavaThread* inflating_thread parameter is only used by LM_LIGHTWEIGHT and requires ++ // that the inflating_thread == Thread::current() or is suspended throughout the call by ++ // some other mechanism. ++ // Even with LM_LIGHTWEIGHT the thread might be nullptr when called from a non ++ // JavaThread. (As may still be the case from FastHashCode). However it is only ++ // important for the correctness of the LM_LIGHTWEIGHT algorithm that the thread ++ // is set when called from ObjectSynchronizer::enter from the owning thread, ++ // ObjectSynchronizer::enter_for from any thread, or ObjectSynchronizer::exit. + EventJavaMonitorInflate event; + + for (;;) { +@@ -1330,10 +1433,10 @@ ObjectMonitor* ObjectSynchronizer::inflate(Thread* current, oop object, + // The mark can be in one of the following states: + // * inflated - Just return if using stack-locking. 
+ // If using fast-locking and the ObjectMonitor owner +- // is anonymous and the current thread owns the +- // object lock, then we make the current thread the +- // ObjectMonitor owner and remove the lock from the +- // current thread's lock stack. ++ // is anonymous and the inflating_thread owns the ++ // object lock, then we make the inflating_thread ++ // the ObjectMonitor owner and remove the lock from ++ // the inflating_thread's lock stack. + // * fast-locked - Coerce it to inflated from fast-locked. + // * stack-locked - Coerce it to inflated from stack-locked. + // * INFLATING - Busy wait for conversion from stack-locked to +@@ -1345,9 +1448,11 @@ ObjectMonitor* ObjectSynchronizer::inflate(Thread* current, oop object, + ObjectMonitor* inf = mark.monitor(); + markWord dmw = inf->header(); + assert(dmw.is_neutral(), "invariant: header=" INTPTR_FORMAT, dmw.value()); +- if (LockingMode == LM_LIGHTWEIGHT && inf->is_owner_anonymous() && is_lock_owned(current, object)) { +- inf->set_owner_from_anonymous(current); +- JavaThread::cast(current)->lock_stack().remove(object); ++ if (LockingMode == LM_LIGHTWEIGHT && inf->is_owner_anonymous() && ++ inflating_thread != nullptr && inflating_thread->lock_stack().contains(object)) { ++ inf->set_owner_from_anonymous(inflating_thread); ++ size_t removed = inflating_thread->lock_stack().remove(object); ++ inf->set_recursions(removed - 1); + } + return inf; + } +@@ -1367,12 +1472,12 @@ ObjectMonitor* ObjectSynchronizer::inflate(Thread* current, oop object, + } + + // CASE: fast-locked +- // Could be fast-locked either by current or by some other thread. ++ // Could be fast-locked either by the inflating_thread or by some other thread. + // + // Note that we allocate the ObjectMonitor speculatively, _before_ + // attempting to set the object's mark to the new ObjectMonitor. If +- // this thread owns the monitor, then we set the ObjectMonitor's +- // owner to this thread. Otherwise, we set the ObjectMonitor's owner ++ // the inflating_thread owns the monitor, then we set the ObjectMonitor's ++ // owner to the inflating_thread. Otherwise, we set the ObjectMonitor's owner + // to anonymous. If we lose the race to set the object's mark to the + // new ObjectMonitor, then we just delete it and loop around again. + // +@@ -1380,10 +1485,10 @@ ObjectMonitor* ObjectSynchronizer::inflate(Thread* current, oop object, + if (LockingMode == LM_LIGHTWEIGHT && mark.is_fast_locked()) { + ObjectMonitor* monitor = new ObjectMonitor(object); + monitor->set_header(mark.set_unlocked()); +- bool own = is_lock_owned(current, object); ++ bool own = inflating_thread != nullptr && inflating_thread->lock_stack().contains(object); + if (own) { +- // Owned by us. +- monitor->set_owner_from(nullptr, current); ++ // Owned by inflating_thread. ++ monitor->set_owner_from(nullptr, inflating_thread); + } else { + // Owned by somebody else. + monitor->set_owner_anonymous(); +@@ -1393,7 +1498,8 @@ ObjectMonitor* ObjectSynchronizer::inflate(Thread* current, oop object, + if (old_mark == mark) { + // Success! Return inflated monitor. 
+ if (own) { +- JavaThread::cast(current)->lock_stack().remove(object); ++ size_t removed = inflating_thread->lock_stack().remove(object); ++ monitor->set_recursions(removed - 1); + } + // Once the ObjectMonitor is configured and object is associated + // with the ObjectMonitor, it is safe to allow async deflation: +@@ -1403,7 +1509,7 @@ ObjectMonitor* ObjectSynchronizer::inflate(Thread* current, oop object, + // cache lines to avoid false sharing on MP systems ... + OM_PERFDATA_OP(Inflations, inc()); + if (log_is_enabled(Trace, monitorinflation)) { +- ResourceMark rm(current); ++ ResourceMark rm; + lsh.print_cr("inflate(has_locker): object=" INTPTR_FORMAT ", mark=" + INTPTR_FORMAT ", type='%s'", p2i(object), + object->mark().value(), object->klass()->external_name()); +@@ -1502,7 +1608,7 @@ ObjectMonitor* ObjectSynchronizer::inflate(Thread* current, oop object, + // to avoid false sharing on MP systems ... + OM_PERFDATA_OP(Inflations, inc()); + if (log_is_enabled(Trace, monitorinflation)) { +- ResourceMark rm(current); ++ ResourceMark rm; + lsh.print_cr("inflate(has_locker): object=" INTPTR_FORMAT ", mark=" + INTPTR_FORMAT ", type='%s'", p2i(object), + object->mark().value(), object->klass()->external_name()); +@@ -1546,7 +1652,7 @@ ObjectMonitor* ObjectSynchronizer::inflate(Thread* current, oop object, + // cache lines to avoid false sharing on MP systems ... + OM_PERFDATA_OP(Inflations, inc()); + if (log_is_enabled(Trace, monitorinflation)) { +- ResourceMark rm(current); ++ ResourceMark rm; + lsh.print_cr("inflate(neutral): object=" INTPTR_FORMAT ", mark=" + INTPTR_FORMAT ", type='%s'", p2i(object), + object->mark().value(), object->klass()->external_name()); +diff --git a/src/hotspot/share/runtime/synchronizer.hpp b/src/hotspot/share/runtime/synchronizer.hpp +index e983aeb9d..f1a14e362 100644 +--- a/src/hotspot/share/runtime/synchronizer.hpp ++++ b/src/hotspot/share/runtime/synchronizer.hpp +@@ -1,5 +1,5 @@ + /* +- * Copyright (c) 1998, 2023, Oracle and/or its affiliates. All rights reserved. ++ * Copyright (c) 1998, 2024, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it +@@ -92,7 +92,18 @@ class ObjectSynchronizer : AllStatic { + // This is the "slow path" version of monitor enter and exit. + static void enter(Handle obj, BasicLock* lock, JavaThread* current); + static void exit(oop obj, BasicLock* lock, JavaThread* current); ++ // Used to enter a monitor for another thread. This requires that the ++ // locking_thread is suspended, and that entering on a potential ++ // inflated monitor may only contend with deflation. That is the obj being ++ // locked on is either already locked by the locking_thread or cannot ++ // escape the locking_thread. ++ static void enter_for(Handle obj, BasicLock* lock, JavaThread* locking_thread); ++private: ++ // Shared implementation for enter and enter_for. Performs all but ++ // inflated monitor enter. ++ static bool enter_fast_impl(Handle obj, BasicLock* lock, JavaThread* locking_thread); + ++public: + // Used only to handle jni locks or other unmatched monitor enter/exit + // Internally they will use heavy weight monitor. 
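Illustrative aside, not part of the backported patch: when inflate_impl above finds that the inflating thread already fast-locked the object, it removes the object's whole run from the lock-stack and seeds the monitor with removed - 1 recursions. A tiny sketch of that accounting follows; ToyInflatedMonitor is made up, and the free-standing remove_all() merely stands in for the compacting LockStack::remove() that returns a count.

    #include <cassert>
    #include <cstddef>

    // A lock-stack run of length N for one object maps to a monitor that is
    // "held" once with N - 1 recursions.
    struct ToyInflatedMonitor {
      bool   owned = false;
      size_t recursions = 0;   // extra acquisitions beyond the first
    };

    // Compacts the stack, dropping every entry equal to o, and returns how many
    // entries were removed.
    size_t remove_all(const void* stack[], size_t& top, const void* o) {
      size_t kept = 0;
      for (size_t i = 0; i < top; i++) {
        if (stack[i] != o) stack[kept++] = stack[i];
      }
      size_t removed = top - kept;
      top = kept;
      return removed;
    }

    int main() {
      int a, b;
      const void* stack[8] = { &a, &b, &b, &b };   // b locked recursively, depth 3
      size_t top = 4;

      ToyInflatedMonitor mon;
      size_t removed = remove_all(stack, top, &b);
      assert(removed == 3);
      mon.owned = true;
      mon.recursions = removed - 1;                // depth 3 == owned once + 2 recursions
      assert(mon.recursions == 2);
      assert(top == 1 && stack[0] == &a);          // the rest of the stack is compacted
      return 0;
    }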
+ static void jni_enter(Handle obj, JavaThread* current); +@@ -113,6 +124,14 @@ class ObjectSynchronizer : AllStatic { + + // Inflate light weight monitor to heavy weight monitor + static ObjectMonitor* inflate(Thread* current, oop obj, const InflateCause cause); ++ // Used to inflate a monitor as if it was done from the thread JavaThread. ++ static ObjectMonitor* inflate_for(JavaThread* thread, oop obj, const InflateCause cause); ++ ++private: ++ // Shared implementation between the different LockingMode. ++ static ObjectMonitor* inflate_impl(JavaThread* thread, oop obj, const InflateCause cause); ++ ++public: + // This version is only for internal use + static void inflate_helper(oop obj); + static const char* inflate_cause_name(const InflateCause cause); +@@ -193,7 +212,7 @@ class ObjectSynchronizer : AllStatic { + static size_t get_gvars_size(); + static u_char* get_gvars_stw_random_addr(); + +- static void handle_sync_on_value_based_class(Handle obj, JavaThread* current); ++ static void handle_sync_on_value_based_class(Handle obj, JavaThread* locking_thread); + }; + + // ObjectLocker enforces balanced locking and can never throw an +diff --git a/test/hotspot/gtest/runtime/test_lockStack.cpp b/test/hotspot/gtest/runtime/test_lockStack.cpp +new file mode 100644 +index 000000000..43e8959ed +--- /dev/null ++++ b/test/hotspot/gtest/runtime/test_lockStack.cpp +@@ -0,0 +1,427 @@ ++/* ++ * Copyright (c) 2023, 2024, Oracle and/or its affiliates. All rights reserved. ++ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. ++ * ++ * This code is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License version 2 only, as ++ * published by the Free Software Foundation. ++ * ++ * This code is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * version 2 for more details (a copy is included in the LICENSE file that ++ * accompanied this code). ++ * ++ * You should have received a copy of the GNU General Public License version ++ * 2 along with this work; if not, write to the Free Software Foundation, ++ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. ++ * ++ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA ++ * or visit www.oracle.com if you need additional information or have any ++ * questions. 
++ */ ++ ++#include "precompiled.hpp" ++#include "runtime/interfaceSupport.inline.hpp" ++#include "runtime/lockStack.inline.hpp" ++#include "runtime/os.hpp" ++#include "unittest.hpp" ++#include "utilities/globalDefinitions.hpp" ++ ++class LockStackTest : public ::testing::Test { ++public: ++ static void push_raw(LockStack& ls, oop obj) { ++ ls._base[ls.to_index(ls._top)] = obj; ++ ls._top += oopSize; ++ } ++ ++ static void pop_raw(LockStack& ls) { ++ ls._top -= oopSize; ++#ifdef ASSERT ++ ls._base[ls.to_index(ls._top)] = nullptr; ++#endif ++ } ++ ++ static oop at(LockStack& ls, int index) { ++ return ls._base[index]; ++ } ++ ++ static size_t size(LockStack& ls) { ++ return ls.to_index(ls._top); ++ } ++}; ++ ++#define recursive_enter(ls, obj) \ ++ do { \ ++ bool ret = ls.try_recursive_enter(obj); \ ++ EXPECT_TRUE(ret); \ ++ } while (false) ++ ++#define recursive_exit(ls, obj) \ ++ do { \ ++ bool ret = ls.try_recursive_exit(obj); \ ++ EXPECT_TRUE(ret); \ ++ } while (false) ++ ++TEST_VM_F(LockStackTest, is_recursive) { ++ if (LockingMode != LM_LIGHTWEIGHT || !VM_Version::supports_recursive_lightweight_locking()) { ++ return; ++ } ++ ++ JavaThread* THREAD = JavaThread::current(); ++ // the thread should be in vm to use locks ++ ThreadInVMfromNative ThreadInVMfromNative(THREAD); ++ ++ LockStack& ls = THREAD->lock_stack(); ++ ++ EXPECT_TRUE(ls.is_empty()); ++ ++ oop obj0 = Universe::int_mirror(); ++ oop obj1 = Universe::float_mirror(); ++ ++ push_raw(ls, obj0); ++ ++ // 0 ++ EXPECT_FALSE(ls.is_recursive(obj0)); ++ ++ push_raw(ls, obj1); ++ ++ // 0, 1 ++ EXPECT_FALSE(ls.is_recursive(obj0)); ++ EXPECT_FALSE(ls.is_recursive(obj1)); ++ ++ push_raw(ls, obj1); ++ ++ // 0, 1, 1 ++ EXPECT_FALSE(ls.is_recursive(obj0)); ++ EXPECT_TRUE(ls.is_recursive(obj1)); ++ ++ pop_raw(ls); ++ pop_raw(ls); ++ push_raw(ls, obj0); ++ ++ // 0, 0 ++ EXPECT_TRUE(ls.is_recursive(obj0)); ++ ++ push_raw(ls, obj0); ++ ++ // 0, 0, 0 ++ EXPECT_TRUE(ls.is_recursive(obj0)); ++ ++ pop_raw(ls); ++ push_raw(ls, obj1); ++ ++ // 0, 0, 1 ++ EXPECT_TRUE(ls.is_recursive(obj0)); ++ EXPECT_FALSE(ls.is_recursive(obj1)); ++ ++ push_raw(ls, obj1); ++ ++ // 0, 0, 1, 1 ++ EXPECT_TRUE(ls.is_recursive(obj0)); ++ EXPECT_TRUE(ls.is_recursive(obj1)); ++ ++ // Clear stack ++ pop_raw(ls); ++ pop_raw(ls); ++ pop_raw(ls); ++ pop_raw(ls); ++ ++ EXPECT_TRUE(ls.is_empty()); ++} ++ ++TEST_VM_F(LockStackTest, try_recursive_enter) { ++ if (LockingMode != LM_LIGHTWEIGHT || !VM_Version::supports_recursive_lightweight_locking()) { ++ return; ++ } ++ ++ JavaThread* THREAD = JavaThread::current(); ++ // the thread should be in vm to use locks ++ ThreadInVMfromNative ThreadInVMfromNative(THREAD); ++ ++ LockStack& ls = THREAD->lock_stack(); ++ ++ EXPECT_TRUE(ls.is_empty()); ++ ++ oop obj0 = Universe::int_mirror(); ++ oop obj1 = Universe::float_mirror(); ++ ++ ls.push(obj0); ++ ++ // 0 ++ EXPECT_FALSE(ls.is_recursive(obj0)); ++ ++ ls.push(obj1); ++ ++ // 0, 1 ++ EXPECT_FALSE(ls.is_recursive(obj0)); ++ EXPECT_FALSE(ls.is_recursive(obj1)); ++ ++ recursive_enter(ls, obj1); ++ ++ // 0, 1, 1 ++ EXPECT_FALSE(ls.is_recursive(obj0)); ++ EXPECT_TRUE(ls.is_recursive(obj1)); ++ ++ recursive_exit(ls, obj1); ++ pop_raw(ls); ++ recursive_enter(ls, obj0); ++ ++ // 0, 0 ++ EXPECT_TRUE(ls.is_recursive(obj0)); ++ ++ recursive_enter(ls, obj0); ++ ++ // 0, 0, 0 ++ EXPECT_TRUE(ls.is_recursive(obj0)); ++ ++ recursive_exit(ls, obj0); ++ push_raw(ls, obj1); ++ ++ // 0, 0, 1 ++ EXPECT_TRUE(ls.is_recursive(obj0)); ++ EXPECT_FALSE(ls.is_recursive(obj1)); ++ ++ recursive_enter(ls, obj1); ++ ++ // 
0, 0, 1, 1 ++ EXPECT_TRUE(ls.is_recursive(obj0)); ++ EXPECT_TRUE(ls.is_recursive(obj1)); ++ ++ // Clear stack ++ pop_raw(ls); ++ pop_raw(ls); ++ pop_raw(ls); ++ pop_raw(ls); ++ ++ EXPECT_TRUE(ls.is_empty()); ++} ++ ++TEST_VM_F(LockStackTest, contains) { ++ if (LockingMode != LM_LIGHTWEIGHT) { ++ return; ++ } ++ ++ const bool test_recursive = VM_Version::supports_recursive_lightweight_locking(); ++ ++ JavaThread* THREAD = JavaThread::current(); ++ // the thread should be in vm to use locks ++ ThreadInVMfromNative ThreadInVMfromNative(THREAD); ++ ++ LockStack& ls = THREAD->lock_stack(); ++ ++ EXPECT_TRUE(ls.is_empty()); ++ ++ oop obj0 = Universe::int_mirror(); ++ oop obj1 = Universe::float_mirror(); ++ ++ EXPECT_FALSE(ls.contains(obj0)); ++ ++ ls.push(obj0); ++ ++ // 0 ++ EXPECT_TRUE(ls.contains(obj0)); ++ EXPECT_FALSE(ls.contains(obj1)); ++ ++ if (test_recursive) { ++ push_raw(ls, obj0); ++ ++ // 0, 0 ++ EXPECT_TRUE(ls.contains(obj0)); ++ EXPECT_FALSE(ls.contains(obj1)); ++ } ++ ++ push_raw(ls, obj1); ++ ++ // 0, 0, 1 ++ EXPECT_TRUE(ls.contains(obj0)); ++ EXPECT_TRUE(ls.contains(obj1)); ++ ++ if (test_recursive) { ++ push_raw(ls, obj1); ++ ++ // 0, 0, 1, 1 ++ EXPECT_TRUE(ls.contains(obj0)); ++ EXPECT_TRUE(ls.contains(obj1)); ++ } ++ ++ pop_raw(ls); ++ if (test_recursive) { ++ pop_raw(ls); ++ pop_raw(ls); ++ } ++ push_raw(ls, obj1); ++ ++ // 0, 1 ++ EXPECT_TRUE(ls.contains(obj0)); ++ EXPECT_TRUE(ls.contains(obj1)); ++ ++ // Clear stack ++ pop_raw(ls); ++ pop_raw(ls); ++ ++ EXPECT_TRUE(ls.is_empty()); ++} ++ ++TEST_VM_F(LockStackTest, remove) { ++ if (LockingMode != LM_LIGHTWEIGHT) { ++ return; ++ } ++ ++ const bool test_recursive = VM_Version::supports_recursive_lightweight_locking(); ++ ++ JavaThread* THREAD = JavaThread::current(); ++ // the thread should be in vm to use locks ++ ThreadInVMfromNative ThreadInVMfromNative(THREAD); ++ ++ LockStack& ls = THREAD->lock_stack(); ++ ++ EXPECT_TRUE(ls.is_empty()); ++ ++ oop obj0 = Universe::int_mirror(); ++ oop obj1 = Universe::float_mirror(); ++ oop obj2 = Universe::short_mirror(); ++ oop obj3 = Universe::long_mirror(); ++ ++ push_raw(ls, obj0); ++ ++ // 0 ++ { ++ size_t removed = ls.remove(obj0); ++ EXPECT_EQ(removed, 1u); ++ EXPECT_FALSE(ls.contains(obj0)); ++ } ++ ++ if (test_recursive) { ++ push_raw(ls, obj0); ++ push_raw(ls, obj0); ++ ++ // 0, 0 ++ { ++ size_t removed = ls.remove(obj0); ++ EXPECT_EQ(removed, 2u); ++ EXPECT_FALSE(ls.contains(obj0)); ++ } ++ } ++ ++ push_raw(ls, obj0); ++ push_raw(ls, obj1); ++ ++ // 0, 1 ++ { ++ size_t removed = ls.remove(obj0); ++ EXPECT_EQ(removed, 1u); ++ EXPECT_FALSE(ls.contains(obj0)); ++ EXPECT_TRUE(ls.contains(obj1)); ++ ++ ls.remove(obj1); ++ EXPECT_TRUE(ls.is_empty()); ++ } ++ ++ push_raw(ls, obj0); ++ push_raw(ls, obj1); ++ ++ // 0, 1 ++ { ++ size_t removed = ls.remove(obj1); ++ EXPECT_EQ(removed, 1u); ++ EXPECT_FALSE(ls.contains(obj1)); ++ EXPECT_TRUE(ls.contains(obj0)); ++ ++ ls.remove(obj0); ++ EXPECT_TRUE(ls.is_empty()); ++ } ++ ++ if (test_recursive) { ++ push_raw(ls, obj0); ++ push_raw(ls, obj0); ++ push_raw(ls, obj1); ++ ++ // 0, 0, 1 ++ { ++ size_t removed = ls.remove(obj0); ++ EXPECT_EQ(removed, 2u); ++ EXPECT_FALSE(ls.contains(obj0)); ++ EXPECT_TRUE(ls.contains(obj1)); ++ ++ ls.remove(obj1); ++ EXPECT_TRUE(ls.is_empty()); ++ } ++ ++ push_raw(ls, obj0); ++ push_raw(ls, obj1); ++ push_raw(ls, obj1); ++ ++ // 0, 1, 1 ++ { ++ size_t removed = ls.remove(obj1); ++ EXPECT_EQ(removed, 2u); ++ EXPECT_FALSE(ls.contains(obj1)); ++ EXPECT_TRUE(ls.contains(obj0)); ++ ++ ls.remove(obj0); ++ 
EXPECT_TRUE(ls.is_empty()); ++ } ++ ++ push_raw(ls, obj0); ++ push_raw(ls, obj1); ++ push_raw(ls, obj1); ++ push_raw(ls, obj2); ++ push_raw(ls, obj2); ++ push_raw(ls, obj2); ++ push_raw(ls, obj2); ++ push_raw(ls, obj3); ++ ++ // 0, 1, 1, 2, 2, 2, 2, 3 ++ { ++ EXPECT_EQ(size(ls), 8u); ++ ++ size_t removed = ls.remove(obj1); ++ EXPECT_EQ(removed, 2u); ++ ++ EXPECT_TRUE(ls.contains(obj0)); ++ EXPECT_FALSE(ls.contains(obj1)); ++ EXPECT_TRUE(ls.contains(obj2)); ++ EXPECT_TRUE(ls.contains(obj3)); ++ ++ EXPECT_EQ(at(ls, 0), obj0); ++ EXPECT_EQ(at(ls, 1), obj2); ++ EXPECT_EQ(at(ls, 2), obj2); ++ EXPECT_EQ(at(ls, 3), obj2); ++ EXPECT_EQ(at(ls, 4), obj2); ++ EXPECT_EQ(at(ls, 5), obj3); ++ EXPECT_EQ(size(ls), 6u); ++ ++ removed = ls.remove(obj2); ++ EXPECT_EQ(removed, 4u); ++ ++ EXPECT_TRUE(ls.contains(obj0)); ++ EXPECT_FALSE(ls.contains(obj1)); ++ EXPECT_FALSE(ls.contains(obj2)); ++ EXPECT_TRUE(ls.contains(obj3)); ++ ++ EXPECT_EQ(at(ls, 0), obj0); ++ EXPECT_EQ(at(ls, 1), obj3); ++ EXPECT_EQ(size(ls), 2u); ++ ++ removed = ls.remove(obj0); ++ EXPECT_EQ(removed, 1u); ++ ++ EXPECT_FALSE(ls.contains(obj0)); ++ EXPECT_FALSE(ls.contains(obj1)); ++ EXPECT_FALSE(ls.contains(obj2)); ++ EXPECT_TRUE(ls.contains(obj3)); ++ ++ EXPECT_EQ(at(ls, 0), obj3); ++ EXPECT_EQ(size(ls), 1u); ++ ++ removed = ls.remove(obj3); ++ EXPECT_EQ(removed, 1u); ++ ++ EXPECT_TRUE(ls.is_empty()); ++ EXPECT_EQ(size(ls), 0u); ++ } ++ } ++ ++ EXPECT_TRUE(ls.is_empty()); ++} +diff --git a/test/hotspot/jtreg/TEST.groups b/test/hotspot/jtreg/TEST.groups +index 6fb2e2b0b..ff2b0cf00 100644 +--- a/test/hotspot/jtreg/TEST.groups ++++ b/test/hotspot/jtreg/TEST.groups +@@ -1,5 +1,5 @@ + # +-# Copyright (c) 2013, 2023, Oracle and/or its affiliates. All rights reserved. ++# Copyright (c) 2013, 2024, Oracle and/or its affiliates. All rights reserved. + # DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + # + # This code is free software; you can redistribute it and/or modify it +@@ -149,6 +149,7 @@ serviceability_ttf_virtual = \ + tier1_common = \ + sanity/BasicVMTest.java \ + gtest/GTestWrapper.java \ ++ gtest/LockStackGtests.java \ + gtest/MetaspaceGtests.java \ + gtest/LargePageGtests.java \ + gtest/NMTGtests.java \ +diff --git a/test/hotspot/jtreg/gtest/LockStackGtests.java b/test/hotspot/jtreg/gtest/LockStackGtests.java +new file mode 100644 +index 000000000..e426b2c56 +--- /dev/null ++++ b/test/hotspot/jtreg/gtest/LockStackGtests.java +@@ -0,0 +1,32 @@ ++/* ++ * Copyright (c) 2023, 2024, Oracle and/or its affiliates. All rights reserved. ++ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. ++ * ++ * This code is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License version 2 only, as ++ * published by the Free Software Foundation. ++ * ++ * This code is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * version 2 for more details (a copy is included in the LICENSE file that ++ * accompanied this code). ++ * ++ * You should have received a copy of the GNU General Public License version ++ * 2 along with this work; if not, write to the Free Software Foundation, ++ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. ++ * ++ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA ++ * or visit www.oracle.com if you need additional information or have any ++ * questions. 
++ * ++ */ ++ ++/* @test ++ * @summary Run LockStack gtests with LockingMode=2 ++ * @library /test/lib ++ * @modules java.base/jdk.internal.misc ++ * java.xml ++ * @requires vm.flagless ++ * @run main/native GTestWrapper --gtest_filter=LockStackTest* -XX:LockingMode=2 ++ */ +diff --git a/test/hotspot/jtreg/runtime/lockStack/TestLockStackCapacity.java b/test/hotspot/jtreg/runtime/lockStack/TestLockStackCapacity.java +new file mode 100644 +index 000000000..01ba1f4f1 +--- /dev/null ++++ b/test/hotspot/jtreg/runtime/lockStack/TestLockStackCapacity.java +@@ -0,0 +1,108 @@ ++/* ++ * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved. ++ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. ++ * ++ * This code is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License version 2 only, as ++ * published by the Free Software Foundation. ++ * ++ * This code is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * version 2 for more details (a copy is included in the LICENSE file that ++ * accompanied this code). ++ * ++ * You should have received a copy of the GNU General Public License version ++ * 2 along with this work; if not, write to the Free Software Foundation, ++ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. ++ * ++ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA ++ * or visit www.oracle.com if you need additional information or have any ++ * questions. ++ * ++ */ ++ ++/* ++ * @test TestLockStackCapacity ++ * @summary Tests the interaction between recursive lightweight locking and ++ * when the lock stack capacity is exceeded. ++ * @requires vm.flagless ++ * @library /testlibrary /test/lib ++ * @build jdk.test.whitebox.WhiteBox ++ * @run driver jdk.test.lib.helpers.ClassFileInstaller jdk.test.whitebox.WhiteBox ++ * @run main/othervm -Xbootclasspath/a:. -XX:+UnlockDiagnosticVMOptions -XX:+WhiteBoxAPI -Xint -XX:LockingMode=2 TestLockStackCapacity ++ */ ++ ++import jdk.test.lib.Asserts; ++import jdk.test.whitebox.WhiteBox; ++import jtreg.SkippedException; ++ ++public class TestLockStackCapacity { ++ static final WhiteBox WB = WhiteBox.getWhiteBox(); ++ static final int LockingMode = WB.getIntVMFlag("LockingMode").intValue(); ++ static final int LM_LIGHTWEIGHT = 2; ++ ++ static class SynchronizedObject { ++ static final SynchronizedObject OUTER = new SynchronizedObject(); ++ static final SynchronizedObject INNER = new SynchronizedObject(); ++ static final int LockStackCapacity = WB.getLockStackCapacity(); ++ ++ synchronized void runInner(int depth) { ++ assertNotInflated(); ++ if (depth == 1) { ++ return; ++ } else { ++ runInner(depth - 1); ++ } ++ assertNotInflated(); ++ } ++ ++ synchronized void runOuter(int depth, SynchronizedObject inner) { ++ assertNotInflated(); ++ if (depth == 1) { ++ inner.runInner(LockStackCapacity); ++ } else { ++ runOuter(depth - 1, inner); ++ } ++ assertInflated(); ++ } ++ ++ public static void runTest() { ++ // Test Requires a capacity of at least 2. 
++ Asserts.assertGTE(LockStackCapacity, 2); ++ ++ // Just checking ++ OUTER.assertNotInflated(); ++ INNER.assertNotInflated(); ++ ++ synchronized(OUTER) { ++ OUTER.assertNotInflated(); ++ INNER.assertNotInflated(); ++ OUTER.runOuter(LockStackCapacity - 1, INNER); ++ ++ OUTER.assertInflated(); ++ INNER.assertNotInflated(); ++ } ++ } ++ ++ void assertNotInflated() { ++ Asserts.assertFalse(WB.isMonitorInflated(this)); ++ } ++ ++ void assertInflated() { ++ Asserts.assertTrue(WB.isMonitorInflated(this)); ++ } ++ } ++ ++ public static void main(String... args) throws Exception { ++ if (LockingMode != LM_LIGHTWEIGHT) { ++ throw new SkippedException("Test only valid for LM_LIGHTWEIGHT"); ++ } ++ ++ if (!WB.supportsRecursiveLightweightLocking()) { ++ throw new SkippedException("Test only valid if LM_LIGHTWEIGHT supports recursion"); ++ } ++ ++ SynchronizedObject.runTest(); ++ } ++} +diff --git a/test/jdk/com/sun/jdi/EATests.java b/test/jdk/com/sun/jdi/EATests.java +index 8f0a8fabd..70adc9d7f 100644 +--- a/test/jdk/com/sun/jdi/EATests.java ++++ b/test/jdk/com/sun/jdi/EATests.java +@@ -120,7 +120,46 @@ + * -XX:-DoEscapeAnalysis -XX:-EliminateAllocations -XX:+EliminateLocks -XX:+EliminateNestedLocks + * -XX:+IgnoreUnrecognizedVMOptions -XX:+DeoptimizeObjectsALot + * ++ * @bug 8324881 ++ * @comment Regression test for using the wrong thread when logging during re-locking from deoptimization. ++ * ++ * @comment DiagnoseSyncOnValueBasedClasses=2 will cause logging when locking on \@ValueBased objects. ++ * @run driver EATests ++ * -XX:+UnlockDiagnosticVMOptions ++ * -Xms256m -Xmx256m ++ * -Xbootclasspath/a:. ++ * -XX:CompileCommand=dontinline,*::dontinline_* ++ * -XX:+WhiteBoxAPI ++ * -Xbatch ++ * -XX:+DoEscapeAnalysis -XX:+EliminateAllocations -XX:+EliminateLocks -XX:+EliminateNestedLocks ++ * -XX:LockingMode=1 ++ * -XX:DiagnoseSyncOnValueBasedClasses=2 ++ * ++ * @comment Re-lock may inflate monitors when re-locking, which cause monitorinflation trace logging. ++ * @run driver EATests ++ * -XX:+UnlockDiagnosticVMOptions ++ * -Xms256m -Xmx256m ++ * -Xbootclasspath/a:. ++ * -XX:CompileCommand=dontinline,*::dontinline_* ++ * -XX:+WhiteBoxAPI ++ * -Xbatch ++ * -XX:+DoEscapeAnalysis -XX:+EliminateAllocations -XX:+EliminateLocks -XX:+EliminateNestedLocks ++ * -XX:LockingMode=2 ++ * -Xlog:monitorinflation=trace:file=monitorinflation.log ++ * ++ * @comment Re-lock may race with deflation. ++ * @run driver EATests ++ * -XX:+UnlockDiagnosticVMOptions ++ * -Xms256m -Xmx256m ++ * -Xbootclasspath/a:. 
++ * -XX:CompileCommand=dontinline,*::dontinline_* ++ * -XX:+WhiteBoxAPI ++ * -Xbatch ++ * -XX:+DoEscapeAnalysis -XX:+EliminateAllocations -XX:+EliminateLocks -XX:+EliminateNestedLocks ++ * -XX:LockingMode=0 ++ * -XX:GuaranteedAsyncDeflationInterval=1000 + */ ++ + /** + * @test + * @bug 8227745 +@@ -254,12 +293,14 @@ class EATestsTarget { + new EARelockingRecursiveTarget() .run(); + new EARelockingNestedInflatedTarget() .run(); + new EARelockingNestedInflated_02Target() .run(); ++ new EARelockingNestedInflated_03Target() .run(); + new EARelockingArgEscapeLWLockedInCalleeFrameTarget() .run(); + new EARelockingArgEscapeLWLockedInCalleeFrame_2Target() .run(); + new EARelockingArgEscapeLWLockedInCalleeFrameNoRecursiveTarget() .run(); + new EAGetOwnedMonitorsTarget() .run(); + new EAEntryCountTarget() .run(); + new EARelockingObjectCurrentlyWaitingOnTarget() .run(); ++ new EARelockingValueBasedTarget() .run(); + + // Test cases that require deoptimization even though neither + // locks nor allocations are eliminated at the point where +@@ -374,12 +415,14 @@ public class EATests extends TestScaffold { + new EARelockingRecursive() .run(this); + new EARelockingNestedInflated() .run(this); + new EARelockingNestedInflated_02() .run(this); ++ new EARelockingNestedInflated_03() .run(this); + new EARelockingArgEscapeLWLockedInCalleeFrame() .run(this); + new EARelockingArgEscapeLWLockedInCalleeFrame_2() .run(this); + new EARelockingArgEscapeLWLockedInCalleeFrameNoRecursive() .run(this); + new EAGetOwnedMonitors() .run(this); + new EAEntryCount() .run(this); + new EARelockingObjectCurrentlyWaitingOn() .run(this); ++ new EARelockingValueBased() .run(this); + + // Test cases that require deoptimization even though neither + // locks nor allocations are eliminated at the point where +@@ -2013,6 +2056,94 @@ class EARelockingNestedInflated_02Target extends EATestCaseBaseTarget { + + ///////////////////////////////////////////////////////////////////////////// + ++/** ++ * Like {@link EARelockingNestedInflated_02} with the difference that the ++ * inflation of the lock happens because of contention. ++ */ ++class EARelockingNestedInflated_03 extends EATestCaseBaseDebugger { ++ ++ public void runTestCase() throws Exception { ++ BreakpointEvent bpe = resumeTo(TARGET_TESTCASE_BASE_NAME, "dontinline_brkpt", "()V"); ++ printStack(bpe.thread()); ++ @SuppressWarnings("unused") ++ ObjectReference o = getLocalRef(bpe.thread().frame(2), XYVAL_NAME, "l1"); ++ } ++} ++ ++class EARelockingNestedInflated_03Target extends EATestCaseBaseTarget { ++ ++ public XYVal lockInflatedByContention; ++ public boolean doLockNow; ++ public EATestCaseBaseTarget testCase; ++ ++ @Override ++ public void setUp() { ++ super.setUp(); ++ testMethodDepth = 2; ++ lockInflatedByContention = new XYVal(1, 1); ++ testCase = this; ++ } ++ ++ @Override ++ public void warmupDone() { ++ super.warmupDone(); ++ // Use new lock. lockInflatedByContention might have been inflated because of recursion. 
++ lockInflatedByContention = new XYVal(1, 1); ++ // Start thread that tries to enter lockInflatedByContention while the main thread owns it -> inflation ++ DebuggeeWrapper.newThread(() -> { ++ while (true) { ++ synchronized (testCase) { ++ try { ++ if (doLockNow) { ++ doLockNow = false; // reset for main thread ++ testCase.notify(); ++ break; ++ } ++ testCase.wait(); ++ } catch (InterruptedException e) { /* ignored */ } ++ } ++ } ++ synchronized (lockInflatedByContention) { // will block and trigger inflation ++ msg(Thread.currentThread().getName() + ": acquired lockInflatedByContention"); ++ } ++ }, testCaseName + ": Lock Contender (test thread)").start(); ++ } ++ ++ public void dontinline_testMethod() { ++ @SuppressWarnings("unused") ++ XYVal xy = new XYVal(1, 1); // scalar replaced ++ XYVal l1 = lockInflatedByContention; // read by debugger ++ synchronized (l1) { ++ testMethod_inlined(l1); ++ } ++ } ++ ++ public void testMethod_inlined(XYVal l2) { ++ synchronized (l2) { // eliminated nested locking ++ dontinline_notifyOtherThread(); ++ dontinline_brkpt(); ++ } ++ } ++ ++ public void dontinline_notifyOtherThread() { ++ if (!warmupDone) { ++ return; ++ } ++ synchronized (testCase) { ++ doLockNow = true; ++ testCase.notify(); ++ // wait for other thread to reset doLockNow again ++ while (doLockNow) { ++ try { ++ testCase.wait(); ++ } catch (InterruptedException e) { /* ignored */ } ++ } ++ } ++ } ++} ++ ++///////////////////////////////////////////////////////////////////////////// ++ + /** + * Checks if an eliminated lock of an ArgEscape object l1 can be relocked if + * l1 is locked in a callee frame. +@@ -2228,6 +2359,32 @@ class EARelockingObjectCurrentlyWaitingOnTarget extends EATestCaseBaseTarget { + } + } + ++ ++///////////////////////////////////////////////////////////////////////////// ++ ++/** ++ * Test relocking eliminated @ValueBased object. ++ */ ++class EARelockingValueBased extends EATestCaseBaseDebugger { ++ ++ public void runTestCase() throws Exception { ++ BreakpointEvent bpe = resumeTo(TARGET_TESTCASE_BASE_NAME, "dontinline_brkpt", "()V"); ++ printStack(bpe.thread()); ++ @SuppressWarnings("unused") ++ ObjectReference o = getLocalRef(bpe.thread().frame(1), Integer.class.getName(), "l1"); ++ } ++} ++ ++class EARelockingValueBasedTarget extends EATestCaseBaseTarget { ++ ++ public void dontinline_testMethod() { ++ Integer l1 = new Integer(255); ++ synchronized (l1) { ++ dontinline_brkpt(); ++ } ++ } ++} ++ + ///////////////////////////////////////////////////////////////////////////// + // + // Test cases that require deoptimization even though neither locks +diff --git a/test/lib/jdk/test/whitebox/WhiteBox.java b/test/lib/jdk/test/whitebox/WhiteBox.java +index b0e2530f7..9d905b684 100644 +--- a/test/lib/jdk/test/whitebox/WhiteBox.java ++++ b/test/lib/jdk/test/whitebox/WhiteBox.java +@@ -1,5 +1,5 @@ + /* +- * Copyright (c) 2012, 2023, Oracle and/or its affiliates. All rights reserved. ++ * Copyright (c) 2012, 2024, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This code is free software; you can redistribute it and/or modify it +@@ -119,6 +119,10 @@ public class WhiteBox { + return isMonitorInflated0(obj); + } + ++ public native int getLockStackCapacity(); ++ ++ public native boolean supportsRecursiveLightweightLocking(); ++ + public native void forceSafepoint(); + + public native void forceClassLoaderStatsSafepoint(); diff --git a/Backport-JDK-8345351-8356159-RISC-V-Add-Zabha.patch b/Backport-JDK-8345351-8356159-RISC-V-Add-Zabha.patch new file mode 100644 index 0000000000000000000000000000000000000000..0520205d9205245d68b51a3c5e21164bdf65e7c3 --- /dev/null +++ b/Backport-JDK-8345351-8356159-RISC-V-Add-Zabha.patch @@ -0,0 +1,1601 @@ +diff --git a/src/hotspot/cpu/riscv/assembler_riscv.hpp b/src/hotspot/cpu/riscv/assembler_riscv.hpp +index 522550a07..4c167073a 100644 +--- a/src/hotspot/cpu/riscv/assembler_riscv.hpp ++++ b/src/hotspot/cpu/riscv/assembler_riscv.hpp +@@ -818,81 +818,239 @@ public: + + #undef INSN + +-enum Aqrl {relaxed = 0b00, rl = 0b01, aq = 0b10, aqrl = 0b11}; +- +-#define INSN(NAME, op, funct3, funct7) \ +- void NAME(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) { \ +- unsigned insn = 0; \ +- patch((address)&insn, 6, 0, op); \ +- patch((address)&insn, 14, 12, funct3); \ +- patch_reg((address)&insn, 7, Rd); \ +- patch_reg((address)&insn, 15, Rs1); \ +- patch_reg((address)&insn, 20, Rs2); \ +- patch((address)&insn, 31, 27, funct7); \ +- patch((address)&insn, 26, 25, memory_order); \ +- emit(insn); \ +- } +- +- INSN(amoswap_w, 0b0101111, 0b010, 0b00001); +- INSN(amoadd_w, 0b0101111, 0b010, 0b00000); +- INSN(amoxor_w, 0b0101111, 0b010, 0b00100); +- INSN(amoand_w, 0b0101111, 0b010, 0b01100); +- INSN(amoor_w, 0b0101111, 0b010, 0b01000); +- INSN(amomin_w, 0b0101111, 0b010, 0b10000); +- INSN(amomax_w, 0b0101111, 0b010, 0b10100); +- INSN(amominu_w, 0b0101111, 0b010, 0b11000); +- INSN(amomaxu_w, 0b0101111, 0b010, 0b11100); +- INSN(amoswap_d, 0b0101111, 0b011, 0b00001); +- INSN(amoadd_d, 0b0101111, 0b011, 0b00000); +- INSN(amoxor_d, 0b0101111, 0b011, 0b00100); +- INSN(amoand_d, 0b0101111, 0b011, 0b01100); +- INSN(amoor_d, 0b0101111, 0b011, 0b01000); +- INSN(amomin_d, 0b0101111, 0b011, 0b10000); +- INSN(amomax_d , 0b0101111, 0b011, 0b10100); +- INSN(amominu_d, 0b0101111, 0b011, 0b11000); +- INSN(amomaxu_d, 0b0101111, 0b011, 0b11100); +- INSN(amocas_w, 0b0101111, 0b010, 0b00101); +- INSN(amocas_d, 0b0101111, 0b011, 0b00101); +-#undef INSN +- +-enum operand_size { int8, int16, int32, uint32, int64 }; +- +-#define INSN(NAME, op, funct3, funct7) \ +- void NAME(Register Rd, Register Rs1, Aqrl memory_order = relaxed) { \ +- unsigned insn = 0; \ +- uint32_t val = memory_order & 0x3; \ +- patch((address)&insn, 6, 0, op); \ +- patch((address)&insn, 14, 12, funct3); \ +- patch_reg((address)&insn, 7, Rd); \ +- patch_reg((address)&insn, 15, Rs1); \ +- patch((address)&insn, 25, 20, 0b00000); \ +- patch((address)&insn, 31, 27, funct7); \ +- patch((address)&insn, 26, 25, val); \ +- emit(insn); \ +- } +- +- INSN(lr_w, 0b0101111, 0b010, 0b00010); +- INSN(lr_d, 0b0101111, 0b011, 0b00010); +- +-#undef INSN +- +-#define INSN(NAME, op, funct3, funct7) \ +- void NAME(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = relaxed) { \ +- unsigned insn = 0; \ +- uint32_t val = memory_order & 0x3; \ +- patch((address)&insn, 6, 0, op); \ +- patch((address)&insn, 14, 12, funct3); \ +- patch_reg((address)&insn, 7, Rd); \ +- patch_reg((address)&insn, 15, Rs2); \ +- patch_reg((address)&insn, 20, Rs1); \ +- patch((address)&insn, 31, 27, 
funct7); \ +- patch((address)&insn, 26, 25, val); \ +- emit(insn); \ ++ enum Aqrl {relaxed = 0b00, rl = 0b01, aq = 0b10, aqrl = 0b11}; ++ ++ private: ++ ++ enum AmoWidthFunct3 : uint8_t { ++ AMO_WIDTH_BYTE = 0b000, // Zabha extension ++ AMO_WIDTH_HALFWORD = 0b001, // Zabha extension ++ AMO_WIDTH_WORD = 0b010, ++ AMO_WIDTH_DOUBLEWORD = 0b011, ++ AMO_WIDTH_QUADWORD = 0b100, ++ // 0b101 to 0b111 are reserved ++ }; ++ ++ enum AmoOperationFunct5 : uint8_t { ++ AMO_ADD = 0b00000, ++ AMO_SWAP = 0b00001, ++ AMO_LR = 0b00010, ++ AMO_SC = 0b00011, ++ AMO_XOR = 0b00100, ++ AMO_OR = 0b01000, ++ AMO_AND = 0b01100, ++ AMO_MIN = 0b10000, ++ AMO_MAX = 0b10100, ++ AMO_MINU = 0b11000, ++ AMO_MAXU = 0b11100, ++ AMO_CAS = 0b00101 // Zacas ++ }; ++ ++ static constexpr uint32_t OP_AMO_MAJOR = 0b0101111; ++ ++ template ++ void amo_base(Register Rd, Register Rs1, uint8_t Rs2, Aqrl memory_order = aqrl) { ++ assert(width > AMO_WIDTH_HALFWORD || UseZabha, "Must be"); ++ assert(funct5 != AMO_CAS || UseZacas, "Must be"); ++ unsigned insn = 0; ++ patch((address)&insn, 6, 0, OP_AMO_MAJOR); ++ patch_reg((address)&insn, 7, Rd); ++ patch((address)&insn, 14, 12, width); ++ patch_reg((address)&insn, 15, Rs1); ++ patch((address)&insn, 24, 20, Rs2); ++ patch((address)&insn, 26, 25, memory_order); ++ patch((address)&insn, 31, 27, funct5); ++ emit(insn); + } + +- INSN(sc_w, 0b0101111, 0b010, 0b00011); +- INSN(sc_d, 0b0101111, 0b011, 0b00011); +-#undef INSN ++ template ++ void amo_base(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) { ++ amo_base(Rd, Rs1, Rs2->raw_encoding(), memory_order); ++ } ++ ++ public: ++ ++ void amoadd_b(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) { ++ amo_base(Rd, Rs1, Rs2, memory_order); ++ } ++ ++ void amoadd_h(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) { ++ amo_base(Rd, Rs1, Rs2, memory_order); ++ } ++ ++ void amoadd_w(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) { ++ amo_base(Rd, Rs1, Rs2, memory_order); ++ } ++ ++ void amoadd_d(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) { ++ amo_base(Rd, Rs1, Rs2, memory_order); ++ } ++ ++ void amoswap_b(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) { ++ amo_base(Rd, Rs1, Rs2, memory_order); ++ } ++ ++ void amoswap_h(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) { ++ amo_base(Rd, Rs1, Rs2, memory_order); ++ } ++ ++ void amoswap_w(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) { ++ amo_base(Rd, Rs1, Rs2, memory_order); ++ } ++ ++ void amoswap_d(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) { ++ amo_base(Rd, Rs1, Rs2, memory_order); ++ } ++ ++ void amoxor_b(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) { ++ amo_base(Rd, Rs1, Rs2, memory_order); ++ } ++ ++ void amoxor_h(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) { ++ amo_base(Rd, Rs1, Rs2, memory_order); ++ } ++ ++ void amoxor_w(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) { ++ amo_base(Rd, Rs1, Rs2, memory_order); ++ } ++ ++ void amoxor_d(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) { ++ amo_base(Rd, Rs1, Rs2, memory_order); ++ } ++ ++ void amoor_b(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) { ++ amo_base(Rd, Rs1, Rs2, memory_order); ++ } ++ ++ void amoor_h(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) { ++ amo_base(Rd, Rs1, Rs2, memory_order); ++ } ++ ++ void amoor_w(Register Rd, Register Rs1, 
Register Rs2, Aqrl memory_order = aqrl) { ++ amo_base(Rd, Rs1, Rs2, memory_order); ++ } ++ ++ void amoor_d(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) { ++ amo_base(Rd, Rs1, Rs2, memory_order); ++ } ++ ++ void amoand_b(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) { ++ amo_base(Rd, Rs1, Rs2, memory_order); ++ } ++ ++ void amoand_h(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) { ++ amo_base(Rd, Rs1, Rs2, memory_order); ++ } ++ ++ void amoand_w(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) { ++ amo_base(Rd, Rs1, Rs2, memory_order); ++ } ++ ++ void amoand_d(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) { ++ amo_base(Rd, Rs1, Rs2, memory_order); ++ } ++ ++ void amomin_b(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) { ++ amo_base(Rd, Rs1, Rs2, memory_order); ++ } ++ ++ void amomin_h(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) { ++ amo_base(Rd, Rs1, Rs2, memory_order); ++ } ++ ++ void amomin_w(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) { ++ amo_base(Rd, Rs1, Rs2, memory_order); ++ } ++ ++ void amomin_d(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) { ++ amo_base(Rd, Rs1, Rs2, memory_order); ++ } ++ ++ void amominu_b(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) { ++ amo_base(Rd, Rs1, Rs2, memory_order); ++ } ++ ++ void amominu_h(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) { ++ amo_base(Rd, Rs1, Rs2, memory_order); ++ } ++ ++ void amominu_w(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) { ++ amo_base(Rd, Rs1, Rs2, memory_order); ++ } ++ ++ void amominu_d(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) { ++ amo_base(Rd, Rs1, Rs2, memory_order); ++ } ++ ++ void amomax_b(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) { ++ amo_base(Rd, Rs1, Rs2, memory_order); ++ } ++ ++ void amomax_h(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) { ++ amo_base(Rd, Rs1, Rs2, memory_order); ++ } ++ ++ void amomax_w(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) { ++ amo_base(Rd, Rs1, Rs2, memory_order); ++ } ++ ++ void amomax_d(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) { ++ amo_base(Rd, Rs1, Rs2, memory_order); ++ } ++ ++ void amomaxu_b(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) { ++ amo_base(Rd, Rs1, Rs2, memory_order); ++ } ++ ++ void amomaxu_h(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) { ++ amo_base(Rd, Rs1, Rs2, memory_order); ++ } ++ ++ void amomaxu_w(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) { ++ amo_base(Rd, Rs1, Rs2, memory_order); ++ } ++ ++ void amomaxu_d(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) { ++ amo_base(Rd, Rs1, Rs2, memory_order); ++ } ++ ++ protected: ++ ++ void lr_w(Register Rd, Register Rs1, Aqrl memory_order = aqrl) { ++ amo_base(Rd, Rs1, 0, memory_order); ++ } ++ ++ void lr_d(Register Rd, Register Rs1, Aqrl memory_order = aqrl) { ++ amo_base(Rd, Rs1, 0, memory_order); ++ } ++ ++ void sc_w(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) { ++ amo_base(Rd, Rs1, Rs2, memory_order); ++ } ++ ++ void sc_d(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) { ++ amo_base(Rd, Rs1, Rs2, memory_order); ++ } ++ ++ void amocas_b(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) { ++ amo_base(Rd, Rs1, Rs2, 
memory_order); ++ } ++ ++ void amocas_h(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) { ++ amo_base(Rd, Rs1, Rs2, memory_order); ++ } ++ ++ void amocas_w(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) { ++ amo_base(Rd, Rs1, Rs2, memory_order); ++ } ++ ++ void amocas_d(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) { ++ amo_base(Rd, Rs1, Rs2, memory_order); ++ } ++ ++ public: ++ ++ enum operand_size { int8, int16, int32, uint32, int64 }; + + #define INSN(NAME, op, funct5, funct7) \ + void NAME(FloatRegister Rd, FloatRegister Rs1, RoundingMode rm = rne) { \ +diff --git a/src/hotspot/cpu/riscv/gc/z/zBarrierSetAssembler_riscv.cpp b/src/hotspot/cpu/riscv/gc/z/zBarrierSetAssembler_riscv.cpp +index d0a281442..3738f2953 100644 +--- a/src/hotspot/cpu/riscv/gc/z/zBarrierSetAssembler_riscv.cpp ++++ b/src/hotspot/cpu/riscv/gc/z/zBarrierSetAssembler_riscv.cpp +@@ -286,7 +286,7 @@ void ZBarrierSetAssembler::store_barrier_medium(MacroAssembler* masm, + __ relocate(barrier_Relocation::spec(), [&] { + __ li16u(rtmp1, barrier_Relocation::unpatched); + }, ZBarrierRelocationFormatStoreGoodBits); +- __ cmpxchg_weak(rtmp2, zr, rtmp1, ++ __ weak_cmpxchg(rtmp2, zr, rtmp1, + Assembler::int64, + Assembler::relaxed /* acquire */, Assembler::relaxed /* release */, + rtmp3); +diff --git a/src/hotspot/cpu/riscv/globals_riscv.hpp b/src/hotspot/cpu/riscv/globals_riscv.hpp +index 2c18805ec..824d36f0f 100644 +--- a/src/hotspot/cpu/riscv/globals_riscv.hpp ++++ b/src/hotspot/cpu/riscv/globals_riscv.hpp +@@ -107,6 +107,7 @@ define_pd_global(intx, InlineSmallCode, 1000); + product(bool, UseZbb, false, DIAGNOSTIC, "Use Zbb instructions") \ + product(bool, UseZbs, false, DIAGNOSTIC, "Use Zbs instructions") \ + product(bool, UseZacas, false, EXPERIMENTAL, "Use Zacas instructions") \ ++ product(bool, UseZabha, false, EXPERIMENTAL, "Use UseZabha instructions") \ + product(bool, UseZfa, false, EXPERIMENTAL, "Use Zfa instructions") \ + product(bool, UseZic64b, false, EXPERIMENTAL, "Use Zic64b instructions") \ + product(bool, UseZicbom, false, EXPERIMENTAL, "Use Zicbom instructions") \ +diff --git a/src/hotspot/cpu/riscv/macroAssembler_riscv.cpp b/src/hotspot/cpu/riscv/macroAssembler_riscv.cpp +index 17bf4314c..e2cc6cd92 100644 +--- a/src/hotspot/cpu/riscv/macroAssembler_riscv.cpp ++++ b/src/hotspot/cpu/riscv/macroAssembler_riscv.cpp +@@ -3284,7 +3284,7 @@ void MacroAssembler::cmpxchg_obj_header(Register oldv, Register newv, Register o + + void MacroAssembler::load_reserved(Register dst, + Register addr, +- enum operand_size size, ++ Assembler::operand_size size, + Assembler::Aqrl acquire) { + switch (size) { + case int64: +@@ -3305,15 +3305,15 @@ void MacroAssembler::load_reserved(Register dst, + void MacroAssembler::store_conditional(Register dst, + Register new_val, + Register addr, +- enum operand_size size, ++ Assembler::operand_size size, + Assembler::Aqrl release) { + switch (size) { + case int64: +- sc_d(dst, new_val, addr, release); ++ sc_d(dst, addr, new_val, release); + break; + case int32: + case uint32: +- sc_w(dst, new_val, addr, release); ++ sc_w(dst, addr, new_val, release); + break; + default: + ShouldNotReachHere(); +@@ -3322,7 +3322,7 @@ void MacroAssembler::store_conditional(Register dst, + + + void MacroAssembler::cmpxchg_narrow_value_helper(Register addr, Register expected, Register new_val, +- enum operand_size size, ++ Assembler::operand_size size, + Register shift, Register mask, Register aligned_addr) { + assert(size == int8 || size == int16, 
"unsupported operand size"); + +@@ -3352,10 +3352,11 @@ void MacroAssembler::cmpxchg_narrow_value_helper(Register addr, Register expecte + // which are forced to work with 4-byte aligned address. + void MacroAssembler::cmpxchg_narrow_value(Register addr, Register expected, + Register new_val, +- enum operand_size size, ++ Assembler::operand_size size, + Assembler::Aqrl acquire, Assembler::Aqrl release, + Register result, bool result_as_bool, + Register tmp1, Register tmp2, Register tmp3) { ++ assert(!(UseZacas && UseZabha), "Use amocas"); + assert_different_registers(addr, expected, new_val, result, tmp1, tmp2, tmp3, t0, t1); + + Register scratch0 = t0, aligned_addr = t1; +@@ -3388,13 +3389,13 @@ void MacroAssembler::cmpxchg_narrow_value(Register addr, Register expected, + notr(scratch1, mask); + bind(retry); + +- lr_w(result, aligned_addr, acquire); ++ load_reserved(result, aligned_addr, operand_size::int32, acquire); + andr(scratch0, result, mask); + bne(scratch0, expected, fail); + + andr(scratch0, result, scratch1); // scratch1 is ~mask + orr(scratch0, scratch0, new_val); +- sc_w(scratch0, scratch0, aligned_addr, release); ++ store_conditional(scratch0, scratch0, aligned_addr, operand_size::int32, release); + bnez(scratch0, retry); + } + +@@ -3426,10 +3427,11 @@ void MacroAssembler::cmpxchg_narrow_value(Register addr, Register expected, + // failed. + void MacroAssembler::weak_cmpxchg_narrow_value(Register addr, Register expected, + Register new_val, +- enum operand_size size, ++ Assembler::operand_size size, + Assembler::Aqrl acquire, Assembler::Aqrl release, + Register result, + Register tmp1, Register tmp2, Register tmp3) { ++ assert(!(UseZacas && UseZabha), "Use amocas"); + assert_different_registers(addr, expected, new_val, result, tmp1, tmp2, tmp3, t0, t1); + + Register scratch0 = t0, aligned_addr = t1; +@@ -3460,13 +3462,13 @@ void MacroAssembler::weak_cmpxchg_narrow_value(Register addr, Register expected, + } else { + notr(scratch1, mask); + +- lr_w(result, aligned_addr, acquire); ++ load_reserved(result, aligned_addr, operand_size::int32, acquire); + andr(scratch0, result, mask); + bne(scratch0, expected, fail); + + andr(scratch0, result, scratch1); // scratch1 is ~mask + orr(scratch0, scratch0, new_val); +- sc_w(scratch0, scratch0, aligned_addr, release); ++ store_conditional(scratch0, scratch0, aligned_addr, operand_size::int32, release); + bnez(scratch0, fail); + } + +@@ -3483,10 +3485,10 @@ void MacroAssembler::weak_cmpxchg_narrow_value(Register addr, Register expected, + + void MacroAssembler::cmpxchg(Register addr, Register expected, + Register new_val, +- enum operand_size size, ++ Assembler::operand_size size, + Assembler::Aqrl acquire, Assembler::Aqrl release, + Register result, bool result_as_bool) { +- assert(size != int8 && size != int16, "unsupported operand size"); ++ assert((UseZacas && UseZabha) || (size != int8 && size != int16), "unsupported operand size"); + assert_different_registers(addr, t0); + assert_different_registers(expected, t0); + assert_different_registers(new_val, t0); +@@ -3542,12 +3544,12 @@ void MacroAssembler::cmpxchg(Register addr, Register expected, + bind(done); + } + +-void MacroAssembler::cmpxchg_weak(Register addr, Register expected, ++void MacroAssembler::weak_cmpxchg(Register addr, Register expected, + Register new_val, +- enum operand_size size, ++ Assembler::operand_size size, + Assembler::Aqrl acquire, Assembler::Aqrl release, + Register result) { +- ++ assert((UseZacas && UseZabha) || (size != int8 && size != int16), "unsupported 
operand size"); + assert_different_registers(addr, t0); + assert_different_registers(expected, t0); + assert_different_registers(new_val, t0); +@@ -3620,7 +3622,7 @@ ATOMIC_XCHGU(xchgalwu, xchgalw) + #undef ATOMIC_XCHGU + + void MacroAssembler::atomic_cas(Register prev, Register newv, Register addr, +- enum operand_size size, Assembler::Aqrl acquire, Assembler::Aqrl release) { ++ Assembler::operand_size size, Assembler::Aqrl acquire, Assembler::Aqrl release) { + switch (size) { + case int64: + amocas_d(prev, addr, newv, (Assembler::Aqrl)(acquire | release)); +@@ -3632,6 +3634,12 @@ void MacroAssembler::atomic_cas(Register prev, Register newv, Register addr, + amocas_w(prev, addr, newv, (Assembler::Aqrl)(acquire | release)); + zero_extend(prev, prev, 32); + break; ++ case int16: ++ amocas_h(prev, addr, newv, (Assembler::Aqrl)(acquire | release)); ++ break; ++ case int8: ++ amocas_b(prev, addr, newv, (Assembler::Aqrl)(acquire | release)); ++ break; + default: + ShouldNotReachHere(); + } +diff --git a/src/hotspot/cpu/riscv/macroAssembler_riscv.hpp b/src/hotspot/cpu/riscv/macroAssembler_riscv.hpp +index 479d8d1a6..0be049b1b 100644 +--- a/src/hotspot/cpu/riscv/macroAssembler_riscv.hpp ++++ b/src/hotspot/cpu/riscv/macroAssembler_riscv.hpp +@@ -1017,26 +1017,26 @@ public: + void cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp, Label &succeed, Label *fail); + void cmpxchg(Register addr, Register expected, + Register new_val, +- enum operand_size size, ++ Assembler::operand_size size, + Assembler::Aqrl acquire, Assembler::Aqrl release, + Register result, bool result_as_bool = false); +- void cmpxchg_weak(Register addr, Register expected, ++ void weak_cmpxchg(Register addr, Register expected, + Register new_val, +- enum operand_size size, ++ Assembler::operand_size size, + Assembler::Aqrl acquire, Assembler::Aqrl release, + Register result); + void cmpxchg_narrow_value_helper(Register addr, Register expected, Register new_val, +- enum operand_size size, ++ Assembler::operand_size size, + Register shift, Register mask, Register aligned_addr); + void cmpxchg_narrow_value(Register addr, Register expected, + Register new_val, +- enum operand_size size, ++ Assembler::operand_size size, + Assembler::Aqrl acquire, Assembler::Aqrl release, + Register result, bool result_as_bool, + Register tmp1, Register tmp2, Register tmp3); + void weak_cmpxchg_narrow_value(Register addr, Register expected, + Register new_val, +- enum operand_size size, ++ Assembler::operand_size size, + Assembler::Aqrl acquire, Assembler::Aqrl release, + Register result, + Register tmp1, Register tmp2, Register tmp3); +@@ -1053,7 +1053,7 @@ public: + void atomic_xchgwu(Register prev, Register newv, Register addr); + void atomic_xchgalwu(Register prev, Register newv, Register addr); + +- void atomic_cas(Register prev, Register newv, Register addr, enum operand_size size, ++ void atomic_cas(Register prev, Register newv, Register addr, Assembler::operand_size size, + Assembler::Aqrl acquire = Assembler::relaxed, Assembler::Aqrl release = Assembler::relaxed); + + static bool far_branches() { +@@ -1508,8 +1508,8 @@ private: + int bitset_to_regs(unsigned int bitset, unsigned char* regs); + Address add_memory_helper(const Address dst, Register tmp); + +- void load_reserved(Register dst, Register addr, enum operand_size size, Assembler::Aqrl acquire); +- void store_conditional(Register dst, Register new_val, Register addr, enum operand_size size, Assembler::Aqrl release); ++ void load_reserved(Register dst, Register addr, 
Assembler::operand_size size, Assembler::Aqrl acquire); ++ void store_conditional(Register dst, Register new_val, Register addr, Assembler::operand_size size, Assembler::Aqrl release); + + public: + void lightweight_lock(Register obj, Register tmp1, Register tmp2, Register tmp3, Label& slow); +diff --git a/src/hotspot/cpu/riscv/riscv.ad b/src/hotspot/cpu/riscv/riscv.ad +index ac22dc536..6e381d3f0 100644 +--- a/src/hotspot/cpu/riscv/riscv.ad ++++ b/src/hotspot/cpu/riscv/riscv.ad +@@ -2256,48 +2256,6 @@ encode %{ + } + %} + +- enc_class riscv_enc_cmpxchgw(iRegINoSp res, memory mem, iRegI oldval, iRegI newval) %{ +- C2_MacroAssembler _masm(&cbuf); +- __ cmpxchg(as_Register($mem$$base), $oldval$$Register, $newval$$Register, Assembler::int32, +- /*acquire*/ Assembler::relaxed, /*release*/ Assembler::rl, $res$$Register, +- /*result as bool*/ true); +- %} +- +- enc_class riscv_enc_cmpxchgn(iRegINoSp res, memory mem, iRegI oldval, iRegI newval) %{ +- C2_MacroAssembler _masm(&cbuf); +- __ cmpxchg(as_Register($mem$$base), $oldval$$Register, $newval$$Register, Assembler::uint32, +- /*acquire*/ Assembler::relaxed, /*release*/ Assembler::rl, $res$$Register, +- /*result as bool*/ true); +- %} +- +- enc_class riscv_enc_cmpxchg(iRegINoSp res, memory mem, iRegL oldval, iRegL newval) %{ +- C2_MacroAssembler _masm(&cbuf); +- __ cmpxchg(as_Register($mem$$base), $oldval$$Register, $newval$$Register, Assembler::int64, +- /*acquire*/ Assembler::relaxed, /*release*/ Assembler::rl, $res$$Register, +- /*result as bool*/ true); +- %} +- +- enc_class riscv_enc_cmpxchgw_acq(iRegINoSp res, memory mem, iRegI oldval, iRegI newval) %{ +- C2_MacroAssembler _masm(&cbuf); +- __ cmpxchg(as_Register($mem$$base), $oldval$$Register, $newval$$Register, Assembler::int32, +- /*acquire*/ Assembler::aq, /*release*/ Assembler::rl, $res$$Register, +- /*result as bool*/ true); +- %} +- +- enc_class riscv_enc_cmpxchgn_acq(iRegINoSp res, memory mem, iRegI oldval, iRegI newval) %{ +- C2_MacroAssembler _masm(&cbuf); +- __ cmpxchg(as_Register($mem$$base), $oldval$$Register, $newval$$Register, Assembler::uint32, +- /*acquire*/ Assembler::aq, /*release*/ Assembler::rl, $res$$Register, +- /*result as bool*/ true); +- %} +- +- enc_class riscv_enc_cmpxchg_acq(iRegINoSp res, memory mem, iRegL oldval, iRegL newval) %{ +- C2_MacroAssembler _masm(&cbuf); +- __ cmpxchg(as_Register($mem$$base), $oldval$$Register, $newval$$Register, Assembler::int64, +- /*acquire*/ Assembler::aq, /*release*/ Assembler::rl, $res$$Register, +- /*result as bool*/ true); +- %} +- + // compare and branch instruction encodings + + enc_class riscv_enc_j(label lbl) %{ +@@ -5221,18 +5179,20 @@ instruct prefetchalloc( memory mem ) %{ + + // standard CompareAndSwapX when we are using barriers + // these have higher priority than the rules selected by a predicate +-instruct compareAndSwapB(iRegINoSp res, indirect mem, iRegI_R12 oldval, iRegI_R13 newval, +- iRegINoSp tmp1, iRegINoSp tmp2, iRegINoSp tmp3, rFlagsReg cr) ++instruct compareAndSwapB_narrow(iRegINoSp res, indirect mem, iRegI_R12 oldval, iRegI_R13 newval, ++ iRegINoSp tmp1, iRegINoSp tmp2, iRegINoSp tmp3, rFlagsReg cr) + %{ ++ predicate(!UseZabha || !UseZacas); ++ + match(Set res (CompareAndSwapB mem (Binary oldval newval))); + +- ins_cost(LOAD_COST + STORE_COST + ALU_COST * 10 + BRANCH_COST * 4); ++ ins_cost(2 * VOLATILE_REF_COST); + + effect(TEMP_DEF res, USE_KILL oldval, USE_KILL newval, TEMP tmp1, TEMP tmp2, TEMP tmp3, KILL cr); + + format %{ + "cmpxchg $mem, $oldval, $newval\t# (byte) if $mem == $oldval then $mem <-- 
$newval\n\t" +- "mv $res, $res == $oldval\t# $res <-- ($res == $oldval ? 1 : 0), #@compareAndSwapB" ++ "mv $res, $res == $oldval\t# $res <-- ($res == $oldval ? 1 : 0), #@compareAndSwapB_narrow" + %} + + ins_encode %{ +@@ -5244,18 +5204,42 @@ instruct compareAndSwapB(iRegINoSp res, indirect mem, iRegI_R12 oldval, iRegI_R1 + ins_pipe(pipe_slow); + %} + +-instruct compareAndSwapS(iRegINoSp res, indirect mem, iRegI_R12 oldval, iRegI_R13 newval, +- iRegINoSp tmp1, iRegINoSp tmp2, iRegINoSp tmp3, rFlagsReg cr) ++instruct compareAndSwapB(iRegINoSp res, indirect mem, iRegI oldval, iRegI newval) + %{ ++ predicate(UseZabha && UseZacas); ++ ++ match(Set res (CompareAndSwapB mem (Binary oldval newval))); ++ ++ ins_cost(2 * VOLATILE_REF_COST); ++ ++ format %{ ++ "cmpxchg $mem, $oldval, $newval\t# (byte) if $mem == $oldval then $mem <-- $newval\n\t" ++ "mv $res, $res == $oldval\t# $res <-- ($res == $oldval ? 1 : 0), #@compareAndSwapB" ++ %} ++ ++ ins_encode %{ ++ __ cmpxchg(as_Register($mem$$base), $oldval$$Register, $newval$$Register, Assembler::int8, ++ Assembler::relaxed /* acquire */, Assembler::rl /* release */, $res$$Register, ++ true /* result as bool */); ++ %} ++ ++ ins_pipe(pipe_slow); ++%} ++ ++instruct compareAndSwapS_narrow(iRegINoSp res, indirect mem, iRegI_R12 oldval, iRegI_R13 newval, ++ iRegINoSp tmp1, iRegINoSp tmp2, iRegINoSp tmp3, rFlagsReg cr) ++%{ ++ predicate(!UseZabha || !UseZacas); ++ + match(Set res (CompareAndSwapS mem (Binary oldval newval))); + +- ins_cost(LOAD_COST + STORE_COST + ALU_COST * 11 + BRANCH_COST * 4); ++ ins_cost(2 * VOLATILE_REF_COST); + + effect(TEMP_DEF res, USE_KILL oldval, USE_KILL newval, TEMP tmp1, TEMP tmp2, TEMP tmp3, KILL cr); + + format %{ + "cmpxchg $mem, $oldval, $newval\t# (short) if $mem == $oldval then $mem <-- $newval\n\t" +- "mv $res, $res == $oldval\t# $res <-- ($res == $oldval ? 1 : 0), #@compareAndSwapS" ++ "mv $res, $res == $oldval\t# $res <-- ($res == $oldval ? 1 : 0), #@compareAndSwapS_narrow" + %} + + ins_encode %{ +@@ -5267,18 +5251,44 @@ instruct compareAndSwapS(iRegINoSp res, indirect mem, iRegI_R12 oldval, iRegI_R1 + ins_pipe(pipe_slow); + %} + ++instruct compareAndSwapS(iRegINoSp res, indirect mem, iRegI oldval, iRegI newval) ++%{ ++ predicate(UseZabha && UseZacas); ++ ++ match(Set res (CompareAndSwapS mem (Binary oldval newval))); ++ ++ ins_cost(2 * VOLATILE_REF_COST); ++ ++ format %{ ++ "cmpxchg $mem, $oldval, $newval\t# (short) if $mem == $oldval then $mem <-- $newval\n\t" ++ "mv $res, $res == $oldval\t# $res <-- ($res == $oldval ? 1 : 0), #@compareAndSwapS" ++ %} ++ ++ ins_encode %{ ++ __ cmpxchg(as_Register($mem$$base), $oldval$$Register, $newval$$Register, Assembler::int16, ++ Assembler::relaxed /* acquire */, Assembler::rl /* release */, $res$$Register, ++ true /* result as bool */); ++ %} ++ ++ ins_pipe(pipe_slow); ++%} ++ + instruct compareAndSwapI(iRegINoSp res, indirect mem, iRegI oldval, iRegI newval) + %{ + match(Set res (CompareAndSwapI mem (Binary oldval newval))); + +- ins_cost(LOAD_COST + STORE_COST + ALU_COST * 6 + BRANCH_COST * 4); ++ ins_cost(2 * VOLATILE_REF_COST); + + format %{ + "cmpxchg $mem, $oldval, $newval\t# (int) if $mem == $oldval then $mem <-- $newval\n\t" + "mv $res, $res == $oldval\t# $res <-- ($res == $oldval ? 
1 : 0), #@compareAndSwapI" + %} + +- ins_encode(riscv_enc_cmpxchgw(res, mem, oldval, newval)); ++ ins_encode %{ ++ __ cmpxchg(as_Register($mem$$base), $oldval$$Register, $newval$$Register, Assembler::int32, ++ /*acquire*/ Assembler::relaxed, /*release*/ Assembler::rl, $res$$Register, ++ /*result as bool*/ true); ++ %} + + ins_pipe(pipe_slow); + %} +@@ -5287,14 +5297,18 @@ instruct compareAndSwapL(iRegINoSp res, indirect mem, iRegL oldval, iRegL newval + %{ + match(Set res (CompareAndSwapL mem (Binary oldval newval))); + +- ins_cost(LOAD_COST + STORE_COST + ALU_COST * 6 + BRANCH_COST * 4); ++ ins_cost(2 * VOLATILE_REF_COST); + + format %{ + "cmpxchg $mem, $oldval, $newval\t# (long) if $mem == $oldval then $mem <-- $newval\n\t" + "mv $res, $res == $oldval\t# $res <-- ($res == $oldval ? 1 : 0), #@compareAndSwapL" + %} + +- ins_encode(riscv_enc_cmpxchg(res, mem, oldval, newval)); ++ ins_encode %{ ++ __ cmpxchg(as_Register($mem$$base), $oldval$$Register, $newval$$Register, Assembler::int64, ++ /*acquire*/ Assembler::relaxed, /*release*/ Assembler::rl, $res$$Register, ++ /*result as bool*/ true); ++ %} + + ins_pipe(pipe_slow); + %} +@@ -5305,14 +5319,18 @@ instruct compareAndSwapP(iRegINoSp res, indirect mem, iRegP oldval, iRegP newval + + match(Set res (CompareAndSwapP mem (Binary oldval newval))); + +- ins_cost(LOAD_COST + STORE_COST + ALU_COST * 6 + BRANCH_COST * 4); ++ ins_cost(2 * VOLATILE_REF_COST); + + format %{ + "cmpxchg $mem, $oldval, $newval\t# (ptr) if $mem == $oldval then $mem <-- $newval\n\t" + "mv $res, $res == $oldval\t# $res <-- ($res == $oldval ? 1 : 0), #@compareAndSwapP" + %} + +- ins_encode(riscv_enc_cmpxchg(res, mem, oldval, newval)); ++ ins_encode %{ ++ __ cmpxchg(as_Register($mem$$base), $oldval$$Register, $newval$$Register, Assembler::int64, ++ /*acquire*/ Assembler::relaxed, /*release*/ Assembler::rl, $res$$Register, ++ /*result as bool*/ true); ++ %} + + ins_pipe(pipe_slow); + %} +@@ -5321,33 +5339,37 @@ instruct compareAndSwapN(iRegINoSp res, indirect mem, iRegN oldval, iRegN newval + %{ + match(Set res (CompareAndSwapN mem (Binary oldval newval))); + +- ins_cost(LOAD_COST + STORE_COST + ALU_COST * 8 + BRANCH_COST * 4); ++ ins_cost(2 * VOLATILE_REF_COST); + + format %{ + "cmpxchg $mem, $oldval, $newval\t# (narrow oop) if $mem == $oldval then $mem <-- $newval\n\t" + "mv $res, $res == $oldval\t# $res <-- ($res == $oldval ? 
1 : 0), #@compareAndSwapN" + %} + +- ins_encode(riscv_enc_cmpxchgn(res, mem, oldval, newval)); ++ ins_encode %{ ++ __ cmpxchg(as_Register($mem$$base), $oldval$$Register, $newval$$Register, Assembler::uint32, ++ /*acquire*/ Assembler::relaxed, /*release*/ Assembler::rl, $res$$Register, ++ /*result as bool*/ true); ++ %} + + ins_pipe(pipe_slow); + %} + + // alternative CompareAndSwapX when we are eliding barriers +-instruct compareAndSwapBAcq(iRegINoSp res, indirect mem, iRegI_R12 oldval, iRegI_R13 newval, +- iRegINoSp tmp1, iRegINoSp tmp2, iRegINoSp tmp3, rFlagsReg cr) ++instruct compareAndSwapBAcq_narrow(iRegINoSp res, indirect mem, iRegI_R12 oldval, iRegI_R13 newval, ++ iRegINoSp tmp1, iRegINoSp tmp2, iRegINoSp tmp3, rFlagsReg cr) + %{ +- predicate(needs_acquiring_load_reserved(n)); ++ predicate((!UseZabha || !UseZacas) && needs_acquiring_load_reserved(n)); + + match(Set res (CompareAndSwapB mem (Binary oldval newval))); + +- ins_cost(LOAD_COST + STORE_COST + ALU_COST * 10 + BRANCH_COST * 4); ++ ins_cost(2 * VOLATILE_REF_COST); + + effect(TEMP_DEF res, KILL cr, USE_KILL oldval, USE_KILL newval, TEMP tmp1, TEMP tmp2, TEMP tmp3); + + format %{ + "cmpxchg_acq $mem, $oldval, $newval\t# (byte) if $mem == $oldval then $mem <-- $newval\n\t" +- "mv $res, $res == $oldval\t# $res <-- ($res == $oldval ? 1 : 0), #@compareAndSwapBAcq" ++ "mv $res, $res == $oldval\t# $res <-- ($res == $oldval ? 1 : 0), #@compareAndSwapBAcq_narrow" + %} + + ins_encode %{ +@@ -5359,20 +5381,42 @@ instruct compareAndSwapBAcq(iRegINoSp res, indirect mem, iRegI_R12 oldval, iRegI + ins_pipe(pipe_slow); + %} + +-instruct compareAndSwapSAcq(iRegINoSp res, indirect mem, iRegI_R12 oldval, iRegI_R13 newval, +- iRegINoSp tmp1, iRegINoSp tmp2, iRegINoSp tmp3, rFlagsReg cr) ++instruct compareAndSwapBAcq(iRegINoSp res, indirect mem, iRegI oldval, iRegI newval) + %{ +- predicate(needs_acquiring_load_reserved(n)); ++ predicate((UseZabha && UseZacas) && needs_acquiring_load_reserved(n)); ++ ++ match(Set res (CompareAndSwapB mem (Binary oldval newval))); ++ ++ ins_cost(2 * VOLATILE_REF_COST); ++ ++ format %{ ++ "cmpxchg $mem, $oldval, $newval\t# (byte) if $mem == $oldval then $mem <-- $newval\n\t" ++ "mv $res, $res == $oldval\t# $res <-- ($res == $oldval ? 1 : 0), #@compareAndSwapBAcq" ++ %} ++ ++ ins_encode %{ ++ __ cmpxchg(as_Register($mem$$base), $oldval$$Register, $newval$$Register, Assembler::int8, ++ Assembler::aq /* acquire */, Assembler::rl /* release */, $res$$Register, ++ true /* result as bool */); ++ %} ++ ++ ins_pipe(pipe_slow); ++%} ++ ++instruct compareAndSwapSAcq_narrow(iRegINoSp res, indirect mem, iRegI_R12 oldval, iRegI_R13 newval, ++ iRegINoSp tmp1, iRegINoSp tmp2, iRegINoSp tmp3, rFlagsReg cr) ++%{ ++ predicate((!UseZabha || !UseZacas) && needs_acquiring_load_reserved(n)); + + match(Set res (CompareAndSwapS mem (Binary oldval newval))); + +- ins_cost(LOAD_COST + STORE_COST + ALU_COST * 11 + BRANCH_COST * 4); ++ ins_cost(2 * VOLATILE_REF_COST); + + effect(TEMP_DEF res, KILL cr, USE_KILL oldval, USE_KILL newval, TEMP tmp1, TEMP tmp2, TEMP tmp3); + + format %{ + "cmpxchg_acq $mem, $oldval, $newval\t# (short) if $mem == $oldval then $mem <-- $newval\n\t" +- "mv $res, $res == $oldval\t# $res <-- ($res == $oldval ? 1 : 0), #@compareAndSwapSAcq" ++ "mv $res, $res == $oldval\t# $res <-- ($res == $oldval ? 
1 : 0), #@compareAndSwapSAcq_narrow" + %} + + ins_encode %{ +@@ -5384,20 +5428,46 @@ instruct compareAndSwapSAcq(iRegINoSp res, indirect mem, iRegI_R12 oldval, iRegI + ins_pipe(pipe_slow); + %} + ++instruct compareAndSwapSAcq(iRegINoSp res, indirect mem, iRegI oldval, iRegI newval) ++%{ ++ predicate((UseZabha && UseZacas) && needs_acquiring_load_reserved(n)); ++ ++ match(Set res (CompareAndSwapS mem (Binary oldval newval))); ++ ++ ins_cost(2 * VOLATILE_REF_COST); ++ ++ format %{ ++ "cmpxchg $mem, $oldval, $newval\t# (short) if $mem == $oldval then $mem <-- $newval\n\t" ++ "mv $res, $res == $oldval\t# $res <-- ($res == $oldval ? 1 : 0), #@compareAndSwapSAcq" ++ %} ++ ++ ins_encode %{ ++ __ cmpxchg(as_Register($mem$$base), $oldval$$Register, $newval$$Register, Assembler::int16, ++ Assembler::aq /* acquire */, Assembler::rl /* release */, $res$$Register, ++ true /* result as bool */); ++ %} ++ ++ ins_pipe(pipe_slow); ++%} ++ + instruct compareAndSwapIAcq(iRegINoSp res, indirect mem, iRegI oldval, iRegI newval) + %{ + predicate(needs_acquiring_load_reserved(n)); + + match(Set res (CompareAndSwapI mem (Binary oldval newval))); + +- ins_cost(LOAD_COST + STORE_COST + ALU_COST * 6 + BRANCH_COST * 4); ++ ins_cost(2 * VOLATILE_REF_COST); + + format %{ + "cmpxchg_acq $mem, $oldval, $newval\t# (int) if $mem == $oldval then $mem <-- $newval\n\t" + "mv $res, $res == $oldval\t# $res <-- ($res == $oldval ? 1 : 0), #@compareAndSwapIAcq" + %} + +- ins_encode(riscv_enc_cmpxchgw_acq(res, mem, oldval, newval)); ++ ins_encode %{ ++ __ cmpxchg(as_Register($mem$$base), $oldval$$Register, $newval$$Register, Assembler::int32, ++ /*acquire*/ Assembler::aq, /*release*/ Assembler::rl, $res$$Register, ++ /*result as bool*/ true); ++ %} + + ins_pipe(pipe_slow); + %} +@@ -5408,14 +5478,18 @@ instruct compareAndSwapLAcq(iRegINoSp res, indirect mem, iRegL oldval, iRegL new + + match(Set res (CompareAndSwapL mem (Binary oldval newval))); + +- ins_cost(LOAD_COST + STORE_COST + ALU_COST * 6 + BRANCH_COST * 4); ++ ins_cost(2 * VOLATILE_REF_COST); + + format %{ + "cmpxchg_acq $mem, $oldval, $newval\t# (long) if $mem == $oldval then $mem <-- $newval\n\t" + "mv $res, $res == $oldval\t# $res <-- ($res == $oldval ? 1 : 0), #@compareAndSwapLAcq" + %} + +- ins_encode(riscv_enc_cmpxchg_acq(res, mem, oldval, newval)); ++ ins_encode %{ ++ __ cmpxchg(as_Register($mem$$base), $oldval$$Register, $newval$$Register, Assembler::int64, ++ /*acquire*/ Assembler::aq, /*release*/ Assembler::rl, $res$$Register, ++ /*result as bool*/ true); ++ %} + + ins_pipe(pipe_slow); + %} +@@ -5426,14 +5500,18 @@ instruct compareAndSwapPAcq(iRegINoSp res, indirect mem, iRegP oldval, iRegP new + + match(Set res (CompareAndSwapP mem (Binary oldval newval))); + +- ins_cost(LOAD_COST + STORE_COST + ALU_COST * 6 + BRANCH_COST * 4); ++ ins_cost(2 * VOLATILE_REF_COST); + + format %{ + "cmpxchg_acq $mem, $oldval, $newval\t# (ptr) if $mem == $oldval then $mem <-- $newval\n\t" + "mv $res, $res == $oldval\t# $res <-- ($res == $oldval ? 
1 : 0), #@compareAndSwapPAcq" + %} + +- ins_encode(riscv_enc_cmpxchg_acq(res, mem, oldval, newval)); ++ ins_encode %{ ++ __ cmpxchg(as_Register($mem$$base), $oldval$$Register, $newval$$Register, Assembler::int64, ++ /*acquire*/ Assembler::aq, /*release*/ Assembler::rl, $res$$Register, ++ /*result as bool*/ true); ++ %} + + ins_pipe(pipe_slow); + %} +@@ -5444,14 +5522,18 @@ instruct compareAndSwapNAcq(iRegINoSp res, indirect mem, iRegN oldval, iRegN new + + match(Set res (CompareAndSwapN mem (Binary oldval newval))); + +- ins_cost(LOAD_COST + STORE_COST + ALU_COST * 8 + BRANCH_COST * 4); ++ ins_cost(2 * VOLATILE_REF_COST); + + format %{ + "cmpxchg_acq $mem, $oldval, $newval\t# (narrow oop) if $mem == $oldval then $mem <-- $newval\n\t" + "mv $res, $res == $oldval\t# $res <-- ($res == $oldval ? 1 : 0), #@compareAndSwapNAcq" + %} + +- ins_encode(riscv_enc_cmpxchgn_acq(res, mem, oldval, newval)); ++ ins_encode %{ ++ __ cmpxchg(as_Register($mem$$base), $oldval$$Register, $newval$$Register, Assembler::uint32, ++ /*acquire*/ Assembler::aq, /*release*/ Assembler::rl, $res$$Register, ++ /*result as bool*/ true); ++ %} + + ins_pipe(pipe_slow); + %} +@@ -5462,17 +5544,19 @@ instruct compareAndSwapNAcq(iRegINoSp res, indirect mem, iRegN oldval, iRegN new + // no trailing StoreLoad barrier emitted by C2. Unfortunately we + // can't check the type of memory ordering here, so we always emit a + // sc_d(w) with rl bit set. +-instruct compareAndExchangeB(iRegINoSp res, indirect mem, iRegI_R12 oldval, iRegI_R13 newval, +- iRegINoSp tmp1, iRegINoSp tmp2, iRegINoSp tmp3, rFlagsReg cr) ++instruct compareAndExchangeB_narrow(iRegINoSp res, indirect mem, iRegI_R12 oldval, iRegI_R13 newval, ++ iRegINoSp tmp1, iRegINoSp tmp2, iRegINoSp tmp3, rFlagsReg cr) + %{ ++ predicate(!UseZabha || !UseZacas); ++ + match(Set res (CompareAndExchangeB mem (Binary oldval newval))); + +- ins_cost(LOAD_COST + STORE_COST + BRANCH_COST * 3 + ALU_COST * 5); ++ ins_cost(2 * VOLATILE_REF_COST); + + effect(TEMP_DEF res, KILL cr, USE_KILL oldval, USE_KILL newval, TEMP tmp1, TEMP tmp2, TEMP tmp3); + + format %{ +- "cmpxchg $res = $mem, $oldval, $newval\t# (byte, weak) if $mem == $oldval then $mem <-- $newval, #@compareAndExchangeB" ++ "cmpxchg $res = $mem, $oldval, $newval\t# (byte, weak) if $mem == $oldval then $mem <-- $newval, #@compareAndExchangeB_narrow" + %} + + ins_encode %{ +@@ -5484,17 +5568,39 @@ instruct compareAndExchangeB(iRegINoSp res, indirect mem, iRegI_R12 oldval, iReg + ins_pipe(pipe_slow); + %} + +-instruct compareAndExchangeS(iRegINoSp res, indirect mem, iRegI_R12 oldval, iRegI_R13 newval, +- iRegINoSp tmp1, iRegINoSp tmp2, iRegINoSp tmp3, rFlagsReg cr) ++instruct compareAndExchangeB(iRegINoSp res, indirect mem, iRegI oldval, iRegI newval) ++%{ ++ predicate(UseZabha && UseZacas); ++ ++ match(Set res (CompareAndExchangeB mem (Binary oldval newval))); ++ ++ ins_cost(2 * VOLATILE_REF_COST); ++ ++ format %{ ++ "cmpxchg $res = $mem, $oldval, $newval\t# (byte, weak) if $mem == $oldval then $mem <-- $newval, #@compareAndExchangeB" ++ %} ++ ++ ins_encode %{ ++ __ cmpxchg(as_Register($mem$$base), $oldval$$Register, $newval$$Register, Assembler::int8, ++ /*acquire*/ Assembler::relaxed, /*release*/ Assembler::rl, $res$$Register); ++ %} ++ ++ ins_pipe(pipe_slow); ++%} ++ ++instruct compareAndExchangeS_narrow(iRegINoSp res, indirect mem, iRegI_R12 oldval, iRegI_R13 newval, ++ iRegINoSp tmp1, iRegINoSp tmp2, iRegINoSp tmp3, rFlagsReg cr) + %{ ++ predicate(!UseZabha || !UseZacas); ++ + match(Set res (CompareAndExchangeS mem (Binary 
oldval newval))); + +- ins_cost(LOAD_COST + STORE_COST + BRANCH_COST * 3 + ALU_COST * 6); ++ ins_cost(2 * VOLATILE_REF_COST); + + effect(TEMP_DEF res, KILL cr, USE_KILL oldval, USE_KILL newval, TEMP tmp1, TEMP tmp2, TEMP tmp3); + + format %{ +- "cmpxchg $res = $mem, $oldval, $newval\t# (short, weak) if $mem == $oldval then $mem <-- $newval, #@compareAndExchangeS" ++ "cmpxchg $res = $mem, $oldval, $newval\t# (short, weak) if $mem == $oldval then $mem <-- $newval, #@compareAndExchangeS_narrow" + %} + + ins_encode %{ +@@ -5506,13 +5612,31 @@ instruct compareAndExchangeS(iRegINoSp res, indirect mem, iRegI_R12 oldval, iReg + ins_pipe(pipe_slow); + %} + ++instruct compareAndExchangeS(iRegINoSp res, indirect mem, iRegI oldval, iRegI newval) ++%{ ++ predicate(UseZabha && UseZacas); ++ ++ match(Set res (CompareAndExchangeS mem (Binary oldval newval))); ++ ++ ins_cost(2 * VOLATILE_REF_COST); ++ ++ format %{ ++ "cmpxchg $res = $mem, $oldval, $newval\t# (short, weak) if $mem == $oldval then $mem <-- $newval, #@compareAndExchangeS" ++ %} ++ ++ ins_encode %{ ++ __ cmpxchg(as_Register($mem$$base), $oldval$$Register, $newval$$Register, Assembler::int16, ++ /*acquire*/ Assembler::relaxed, /*release*/ Assembler::rl, $res$$Register); ++ %} ++ ++ ins_pipe(pipe_slow); ++%} ++ + instruct compareAndExchangeI(iRegINoSp res, indirect mem, iRegI oldval, iRegI newval) + %{ + match(Set res (CompareAndExchangeI mem (Binary oldval newval))); + +- ins_cost(LOAD_COST + STORE_COST + BRANCH_COST * 3 + ALU_COST); +- +- effect(TEMP_DEF res); ++ ins_cost(2 * VOLATILE_REF_COST); + + format %{ + "cmpxchg $res = $mem, $oldval, $newval\t# (int, weak) if $mem == $oldval then $mem <-- $newval, #@compareAndExchangeI" +@@ -5530,9 +5654,7 @@ instruct compareAndExchangeL(iRegLNoSp res, indirect mem, iRegL oldval, iRegL ne + %{ + match(Set res (CompareAndExchangeL mem (Binary oldval newval))); + +- ins_cost(LOAD_COST + STORE_COST + BRANCH_COST * 3 + ALU_COST); +- +- effect(TEMP_DEF res); ++ ins_cost(2 * VOLATILE_REF_COST); + + format %{ + "cmpxchg $res = $mem, $oldval, $newval\t# (long, weak) if $mem == $oldval then $mem <-- $newval, #@compareAndExchangeL" +@@ -5550,9 +5672,7 @@ instruct compareAndExchangeN(iRegNNoSp res, indirect mem, iRegN oldval, iRegN ne + %{ + match(Set res (CompareAndExchangeN mem (Binary oldval newval))); + +- ins_cost(LOAD_COST + STORE_COST + BRANCH_COST * 3 + ALU_COST * 3); +- +- effect(TEMP_DEF res); ++ ins_cost(2 * VOLATILE_REF_COST); + + format %{ + "cmpxchg $res = $mem, $oldval, $newval\t# (narrow oop, weak) if $mem == $oldval then $mem <-- $newval, #@compareAndExchangeN" +@@ -5569,11 +5689,10 @@ instruct compareAndExchangeN(iRegNNoSp res, indirect mem, iRegN oldval, iRegN ne + instruct compareAndExchangeP(iRegPNoSp res, indirect mem, iRegP oldval, iRegP newval) + %{ + predicate(n->as_LoadStore()->barrier_data() == 0); +- match(Set res (CompareAndExchangeP mem (Binary oldval newval))); + +- ins_cost(LOAD_COST + STORE_COST + BRANCH_COST * 3 + ALU_COST); ++ match(Set res (CompareAndExchangeP mem (Binary oldval newval))); + +- effect(TEMP_DEF res); ++ ins_cost(2 * VOLATILE_REF_COST); + + format %{ + "cmpxchg $res = $mem, $oldval, $newval\t# (ptr, weak) if $mem == $oldval then $mem <-- $newval, #@compareAndExchangeP" +@@ -5587,19 +5706,19 @@ instruct compareAndExchangeP(iRegPNoSp res, indirect mem, iRegP oldval, iRegP ne + ins_pipe(pipe_slow); + %} + +-instruct compareAndExchangeBAcq(iRegINoSp res, indirect mem, iRegI_R12 oldval, iRegI_R13 newval, +- iRegINoSp tmp1, iRegINoSp tmp2, iRegINoSp tmp3, rFlagsReg cr) 
++instruct compareAndExchangeBAcq_narrow(iRegINoSp res, indirect mem, iRegI_R12 oldval, iRegI_R13 newval, ++ iRegINoSp tmp1, iRegINoSp tmp2, iRegINoSp tmp3, rFlagsReg cr) + %{ +- predicate(needs_acquiring_load_reserved(n)); ++ predicate((!UseZabha || !UseZacas) && needs_acquiring_load_reserved(n)); + + match(Set res (CompareAndExchangeB mem (Binary oldval newval))); + +- ins_cost(LOAD_COST + STORE_COST + BRANCH_COST * 3 + ALU_COST * 5); ++ ins_cost(2 * VOLATILE_REF_COST); + + effect(TEMP_DEF res, KILL cr, USE_KILL oldval, USE_KILL newval, TEMP tmp1, TEMP tmp2, TEMP tmp3); + + format %{ +- "cmpxchg_acq $res = $mem, $oldval, $newval\t# (byte, weak) if $mem == $oldval then $mem <-- $newval, #@compareAndExchangeBAcq" ++ "cmpxchg_acq $res = $mem, $oldval, $newval\t# (byte, weak) if $mem == $oldval then $mem <-- $newval, #@compareAndExchangeBAcq_narrow" + %} + + ins_encode %{ +@@ -5611,19 +5730,39 @@ instruct compareAndExchangeBAcq(iRegINoSp res, indirect mem, iRegI_R12 oldval, i + ins_pipe(pipe_slow); + %} + +-instruct compareAndExchangeSAcq(iRegINoSp res, indirect mem, iRegI_R12 oldval, iRegI_R13 newval, +- iRegINoSp tmp1, iRegINoSp tmp2, iRegINoSp tmp3, rFlagsReg cr) ++instruct compareAndExchangeBAcq(iRegINoSp res, indirect mem, iRegI oldval, iRegI newval) + %{ +- predicate(needs_acquiring_load_reserved(n)); ++ predicate((UseZabha && UseZacas) && needs_acquiring_load_reserved(n)); ++ ++ match(Set res (CompareAndExchangeB mem (Binary oldval newval))); ++ ++ ins_cost(2 * VOLATILE_REF_COST); ++ ++ format %{ ++ "cmpxchg_acq $res = $mem, $oldval, $newval\t# (byte, weak) if $mem == $oldval then $mem <-- $newval, #@compareAndExchangeBAcq" ++ %} ++ ++ ins_encode %{ ++ __ cmpxchg(as_Register($mem$$base), $oldval$$Register, $newval$$Register, Assembler::int8, ++ /*acquire*/ Assembler::aq, /*release*/ Assembler::rl, $res$$Register); ++ %} ++ ++ ins_pipe(pipe_slow); ++%} ++ ++instruct compareAndExchangeSAcq_narrow(iRegINoSp res, indirect mem, iRegI_R12 oldval, iRegI_R13 newval, ++ iRegINoSp tmp1, iRegINoSp tmp2, iRegINoSp tmp3, rFlagsReg cr) ++%{ ++ predicate((!UseZabha || !UseZacas) && needs_acquiring_load_reserved(n)); + + match(Set res (CompareAndExchangeS mem (Binary oldval newval))); + +- ins_cost(LOAD_COST + STORE_COST + BRANCH_COST * 3 + ALU_COST * 6); ++ ins_cost(2 * VOLATILE_REF_COST); + + effect(TEMP_DEF res, KILL cr, USE_KILL oldval, USE_KILL newval, TEMP tmp1, TEMP tmp2, TEMP tmp3); + + format %{ +- "cmpxchg_acq $res = $mem, $oldval, $newval\t# (short, weak) if $mem == $oldval then $mem <-- $newval, #@compareAndExchangeSAcq" ++ "cmpxchg_acq $res = $mem, $oldval, $newval\t# (short, weak) if $mem == $oldval then $mem <-- $newval, #@compareAndExchangeSAcq_narrow" + %} + + ins_encode %{ +@@ -5635,15 +5774,33 @@ instruct compareAndExchangeSAcq(iRegINoSp res, indirect mem, iRegI_R12 oldval, i + ins_pipe(pipe_slow); + %} + ++instruct compareAndExchangeSAcq(iRegINoSp res, indirect mem, iRegI oldval, iRegI newval) ++%{ ++ predicate((UseZabha && UseZacas) && needs_acquiring_load_reserved(n)); ++ ++ match(Set res (CompareAndExchangeS mem (Binary oldval newval))); ++ ++ ins_cost(2 * VOLATILE_REF_COST); ++ ++ format %{ ++ "cmpxchg_acq $res = $mem, $oldval, $newval\t# (short, weak) if $mem == $oldval then $mem <-- $newval, #@compareAndExchangeSAcq" ++ %} ++ ++ ins_encode %{ ++ __ cmpxchg(as_Register($mem$$base), $oldval$$Register, $newval$$Register, Assembler::int16, ++ /*acquire*/ Assembler::aq, /*release*/ Assembler::rl, $res$$Register); ++ %} ++ ++ ins_pipe(pipe_slow); ++%} ++ + instruct 
compareAndExchangeIAcq(iRegINoSp res, indirect mem, iRegI oldval, iRegI newval) + %{ + predicate(needs_acquiring_load_reserved(n)); + + match(Set res (CompareAndExchangeI mem (Binary oldval newval))); + +- ins_cost(LOAD_COST + STORE_COST + BRANCH_COST * 3 + ALU_COST); +- +- effect(TEMP_DEF res); ++ ins_cost(2 * VOLATILE_REF_COST); + + format %{ + "cmpxchg_acq $res = $mem, $oldval, $newval\t# (int, weak) if $mem == $oldval then $mem <-- $newval, #@compareAndExchangeIAcq" +@@ -5663,9 +5820,7 @@ instruct compareAndExchangeLAcq(iRegLNoSp res, indirect mem, iRegL oldval, iRegL + + match(Set res (CompareAndExchangeL mem (Binary oldval newval))); + +- ins_cost(LOAD_COST + STORE_COST + BRANCH_COST * 3 + ALU_COST); +- +- effect(TEMP_DEF res); ++ ins_cost(2 * VOLATILE_REF_COST); + + format %{ + "cmpxchg_acq $res = $mem, $oldval, $newval\t# (long, weak) if $mem == $oldval then $mem <-- $newval, #@compareAndExchangeLAcq" +@@ -5685,9 +5840,7 @@ instruct compareAndExchangeNAcq(iRegNNoSp res, indirect mem, iRegN oldval, iRegN + + match(Set res (CompareAndExchangeN mem (Binary oldval newval))); + +- ins_cost(LOAD_COST + STORE_COST + BRANCH_COST * 3 + ALU_COST); +- +- effect(TEMP_DEF res); ++ ins_cost(2 * VOLATILE_REF_COST); + + format %{ + "cmpxchg_acq $res = $mem, $oldval, $newval\t# (narrow oop, weak) if $mem == $oldval then $mem <-- $newval, #@compareAndExchangeNAcq" +@@ -5707,9 +5860,7 @@ instruct compareAndExchangePAcq(iRegPNoSp res, indirect mem, iRegP oldval, iRegP + + match(Set res (CompareAndExchangeP mem (Binary oldval newval))); + +- ins_cost(LOAD_COST + STORE_COST + BRANCH_COST * 3 + ALU_COST); +- +- effect(TEMP_DEF res); ++ ins_cost(2 * VOLATILE_REF_COST); + + format %{ + "cmpxchg_acq $res = $mem, $oldval, $newval\t# (ptr, weak) if $mem == $oldval then $mem <-- $newval, #@compareAndExchangePAcq" +@@ -5723,18 +5874,20 @@ instruct compareAndExchangePAcq(iRegPNoSp res, indirect mem, iRegP oldval, iRegP + ins_pipe(pipe_slow); + %} + +-instruct weakCompareAndSwapB(iRegINoSp res, indirect mem, iRegI_R12 oldval, iRegI_R13 newval, +- iRegINoSp tmp1, iRegINoSp tmp2, iRegINoSp tmp3, rFlagsReg cr) ++instruct weakCompareAndSwapB_narrow(iRegINoSp res, indirect mem, iRegI_R12 oldval, iRegI_R13 newval, ++ iRegINoSp tmp1, iRegINoSp tmp2, iRegINoSp tmp3, rFlagsReg cr) + %{ ++ predicate(!UseZabha || !UseZacas); ++ + match(Set res (WeakCompareAndSwapB mem (Binary oldval newval))); + +- ins_cost(LOAD_COST + STORE_COST + BRANCH_COST * 2 + ALU_COST * 6); ++ ins_cost(2 * VOLATILE_REF_COST); + + effect(TEMP_DEF res, KILL cr, USE_KILL oldval, USE_KILL newval, TEMP tmp1, TEMP tmp2, TEMP tmp3); + + format %{ +- "cmpxchg_weak $mem, $oldval, $newval\t# (byte, weak) if $mem == $oldval then $mem <-- $newval\n\t" +- "# $res == 1 when success, #@weakCompareAndSwapB" ++ "weak_cmpxchg $mem, $oldval, $newval\t# (byte, weak) if $mem == $oldval then $mem <-- $newval\n\t" ++ "# $res == 1 when success, #@weakCompareAndSwapB_narrow" + %} + + ins_encode %{ +@@ -5746,18 +5899,41 @@ instruct weakCompareAndSwapB(iRegINoSp res, indirect mem, iRegI_R12 oldval, iReg + ins_pipe(pipe_slow); + %} + +-instruct weakCompareAndSwapS(iRegINoSp res, indirect mem, iRegI_R12 oldval, iRegI_R13 newval, +- iRegINoSp tmp1, iRegINoSp tmp2, iRegINoSp tmp3, rFlagsReg cr) ++instruct weakCompareAndSwapB(iRegINoSp res, indirect mem, iRegI oldval, iRegI newval) ++%{ ++ predicate(UseZabha && UseZacas); ++ ++ match(Set res (WeakCompareAndSwapB mem (Binary oldval newval))); ++ ++ ins_cost(2 * VOLATILE_REF_COST); ++ ++ format %{ ++ "weak_cmpxchg $mem, $oldval, 
$newval\t# (byte, weak) if $mem == $oldval then $mem <-- $newval\n\t" ++ "# $res == 1 when success, #@weakCompareAndSwapB" ++ %} ++ ++ ins_encode %{ ++ __ weak_cmpxchg(as_Register($mem$$base), $oldval$$Register, $newval$$Register, Assembler::int8, ++ /*acquire*/ Assembler::relaxed, /*release*/ Assembler::rl, $res$$Register); ++ %} ++ ++ ins_pipe(pipe_slow); ++%} ++ ++instruct weakCompareAndSwapS_narrow(iRegINoSp res, indirect mem, iRegI_R12 oldval, iRegI_R13 newval, ++ iRegINoSp tmp1, iRegINoSp tmp2, iRegINoSp tmp3, rFlagsReg cr) + %{ ++ predicate(!UseZabha || !UseZacas); ++ + match(Set res (WeakCompareAndSwapS mem (Binary oldval newval))); + +- ins_cost(LOAD_COST + STORE_COST + BRANCH_COST * 2 + ALU_COST * 7); ++ ins_cost(2 * VOLATILE_REF_COST); + + effect(TEMP_DEF res, KILL cr, USE_KILL oldval, USE_KILL newval, TEMP tmp1, TEMP tmp2, TEMP tmp3); + + format %{ +- "cmpxchg_weak $mem, $oldval, $newval\t# (short, weak) if $mem == $oldval then $mem <-- $newval\n\t" +- "# $res == 1 when success, #@weakCompareAndSwapS" ++ "weak_cmpxchg $mem, $oldval, $newval\t# (short, weak) if $mem == $oldval then $mem <-- $newval\n\t" ++ "# $res == 1 when success, #@weakCompareAndSwapS_narrow" + %} + + ins_encode %{ +@@ -5769,19 +5945,40 @@ instruct weakCompareAndSwapS(iRegINoSp res, indirect mem, iRegI_R12 oldval, iReg + ins_pipe(pipe_slow); + %} + ++instruct weakCompareAndSwapS(iRegINoSp res, indirect mem, iRegI oldval, iRegI newval) ++%{ ++ predicate(UseZabha && UseZacas); ++ ++ match(Set res (WeakCompareAndSwapS mem (Binary oldval newval))); ++ ++ ins_cost(2 * VOLATILE_REF_COST); ++ ++ format %{ ++ "weak_cmpxchg $mem, $oldval, $newval\t# (short, weak) if $mem == $oldval then $mem <-- $newval\n\t" ++ "# $res == 1 when success, #@weakCompareAndSwapS" ++ %} ++ ++ ins_encode %{ ++ __ weak_cmpxchg(as_Register($mem$$base), $oldval$$Register, $newval$$Register, Assembler::int16, ++ /*acquire*/ Assembler::relaxed, /*release*/ Assembler::rl, $res$$Register); ++ %} ++ ++ ins_pipe(pipe_slow); ++%} ++ + instruct weakCompareAndSwapI(iRegINoSp res, indirect mem, iRegI oldval, iRegI newval) + %{ + match(Set res (WeakCompareAndSwapI mem (Binary oldval newval))); + +- ins_cost(LOAD_COST + STORE_COST + BRANCH_COST * 2 + ALU_COST * 2); ++ ins_cost(2 * VOLATILE_REF_COST); + + format %{ +- "cmpxchg_weak $mem, $oldval, $newval\t# (int, weak) if $mem == $oldval then $mem <-- $newval\n\t" ++ "weak_cmpxchg $mem, $oldval, $newval\t# (int, weak) if $mem == $oldval then $mem <-- $newval\n\t" + "# $res == 1 when success, #@weakCompareAndSwapI" + %} + + ins_encode %{ +- __ cmpxchg_weak(as_Register($mem$$base), $oldval$$Register, $newval$$Register, Assembler::int32, ++ __ weak_cmpxchg(as_Register($mem$$base), $oldval$$Register, $newval$$Register, Assembler::int32, + /*acquire*/ Assembler::relaxed, /*release*/ Assembler::rl, $res$$Register); + %} + +@@ -5792,15 +5989,15 @@ instruct weakCompareAndSwapL(iRegINoSp res, indirect mem, iRegL oldval, iRegL ne + %{ + match(Set res (WeakCompareAndSwapL mem (Binary oldval newval))); + +- ins_cost(LOAD_COST + STORE_COST + BRANCH_COST * 2 + ALU_COST * 2); ++ ins_cost(2 * VOLATILE_REF_COST); + + format %{ +- "cmpxchg_weak $mem, $oldval, $newval\t# (long, weak) if $mem == $oldval then $mem <-- $newval\n\t" ++ "weak_cmpxchg $mem, $oldval, $newval\t# (long, weak) if $mem == $oldval then $mem <-- $newval\n\t" + "# $res == 1 when success, #@weakCompareAndSwapL" + %} + + ins_encode %{ +- __ cmpxchg_weak(as_Register($mem$$base), $oldval$$Register, $newval$$Register, Assembler::int64, ++ __ 
weak_cmpxchg(as_Register($mem$$base), $oldval$$Register, $newval$$Register, Assembler::int64, + /*acquire*/ Assembler::relaxed, /*release*/ Assembler::rl, $res$$Register); + %} + +@@ -5811,15 +6008,15 @@ instruct weakCompareAndSwapN(iRegINoSp res, indirect mem, iRegN oldval, iRegN ne + %{ + match(Set res (WeakCompareAndSwapN mem (Binary oldval newval))); + +- ins_cost(LOAD_COST + STORE_COST + BRANCH_COST * 2 + ALU_COST * 4); ++ ins_cost(2 * VOLATILE_REF_COST); + + format %{ +- "cmpxchg_weak $mem, $oldval, $newval\t# (narrow oop, weak) if $mem == $oldval then $mem <-- $newval\n\t" ++ "weak_cmpxchg $mem, $oldval, $newval\t# (narrow oop, weak) if $mem == $oldval then $mem <-- $newval\n\t" + "# $res == 1 when success, #@weakCompareAndSwapN" + %} + + ins_encode %{ +- __ cmpxchg_weak(as_Register($mem$$base), $oldval$$Register, $newval$$Register, Assembler::uint32, ++ __ weak_cmpxchg(as_Register($mem$$base), $oldval$$Register, $newval$$Register, Assembler::uint32, + /*acquire*/ Assembler::relaxed, /*release*/ Assembler::rl, $res$$Register); + %} + +@@ -5829,37 +6026,38 @@ instruct weakCompareAndSwapN(iRegINoSp res, indirect mem, iRegN oldval, iRegN ne + instruct weakCompareAndSwapP(iRegINoSp res, indirect mem, iRegP oldval, iRegP newval) + %{ + predicate(n->as_LoadStore()->barrier_data() == 0); ++ + match(Set res (WeakCompareAndSwapP mem (Binary oldval newval))); + +- ins_cost(LOAD_COST + STORE_COST + BRANCH_COST * 2 + ALU_COST * 2); ++ ins_cost(2 * VOLATILE_REF_COST); + + format %{ +- "cmpxchg_weak $mem, $oldval, $newval\t# (ptr, weak) if $mem == $oldval then $mem <-- $newval\n\t" ++ "weak_cmpxchg $mem, $oldval, $newval\t# (ptr, weak) if $mem == $oldval then $mem <-- $newval\n\t" + "# $res == 1 when success, #@weakCompareAndSwapP" + %} + + ins_encode %{ +- __ cmpxchg_weak(as_Register($mem$$base), $oldval$$Register, $newval$$Register, Assembler::int64, ++ __ weak_cmpxchg(as_Register($mem$$base), $oldval$$Register, $newval$$Register, Assembler::int64, + /*acquire*/ Assembler::relaxed, /*release*/ Assembler::rl, $res$$Register); + %} + + ins_pipe(pipe_slow); + %} + +-instruct weakCompareAndSwapBAcq(iRegINoSp res, indirect mem, iRegI_R12 oldval, iRegI_R13 newval, +- iRegINoSp tmp1, iRegINoSp tmp2, iRegINoSp tmp3, rFlagsReg cr) ++instruct weakCompareAndSwapBAcq_narrow(iRegINoSp res, indirect mem, iRegI_R12 oldval, iRegI_R13 newval, ++ iRegINoSp tmp1, iRegINoSp tmp2, iRegINoSp tmp3, rFlagsReg cr) + %{ +- predicate(needs_acquiring_load_reserved(n)); ++ predicate((!UseZabha || !UseZacas) && needs_acquiring_load_reserved(n)); + + match(Set res (WeakCompareAndSwapB mem (Binary oldval newval))); + +- ins_cost(LOAD_COST + STORE_COST + BRANCH_COST * 2 + ALU_COST * 6); ++ ins_cost(2 * VOLATILE_REF_COST); + + effect(TEMP_DEF res, KILL cr, USE_KILL oldval, USE_KILL newval, TEMP tmp1, TEMP tmp2, TEMP tmp3); + + format %{ +- "cmpxchg_weak_acq $mem, $oldval, $newval\t# (byte, weak) if $mem == $oldval then $mem <-- $newval\n\t" +- "# $res == 1 when success, #@weakCompareAndSwapBAcq" ++ "weak_cmpxchg_acq $mem, $oldval, $newval\t# (byte, weak) if $mem == $oldval then $mem <-- $newval\n\t" ++ "# $res == 1 when success, #@weakCompareAndSwapBAcq_narrow" + %} + + ins_encode %{ +@@ -5871,20 +6069,41 @@ instruct weakCompareAndSwapBAcq(iRegINoSp res, indirect mem, iRegI_R12 oldval, i + ins_pipe(pipe_slow); + %} + +-instruct weakCompareAndSwapSAcq(iRegINoSp res, indirect mem, iRegI_R12 oldval, iRegI_R13 newval, +- iRegINoSp tmp1, iRegINoSp tmp2, iRegINoSp tmp3, rFlagsReg cr) ++instruct weakCompareAndSwapBAcq(iRegINoSp res, 
indirect mem, iRegI oldval, iRegI newval) + %{ +- predicate(needs_acquiring_load_reserved(n)); ++ predicate((UseZabha && UseZacas) && needs_acquiring_load_reserved(n)); ++ ++ match(Set res (WeakCompareAndSwapB mem (Binary oldval newval))); ++ ++ ins_cost(2 * VOLATILE_REF_COST); ++ ++ format %{ ++ "weak_cmpxchg_acq $mem, $oldval, $newval\t# (byte, weak) if $mem == $oldval then $mem <-- $newval\n\t" ++ "# $res == 1 when success, #@weakCompareAndSwapBAcq" ++ %} ++ ++ ins_encode %{ ++ __ weak_cmpxchg(as_Register($mem$$base), $oldval$$Register, $newval$$Register, Assembler::int8, ++ /*acquire*/ Assembler::aq, /*release*/ Assembler::rl, $res$$Register); ++ %} ++ ++ ins_pipe(pipe_slow); ++%} ++ ++instruct weakCompareAndSwapSAcq_narrow(iRegINoSp res, indirect mem, iRegI_R12 oldval, iRegI_R13 newval, ++ iRegINoSp tmp1, iRegINoSp tmp2, iRegINoSp tmp3, rFlagsReg cr) ++%{ ++ predicate((!UseZabha || !UseZacas) && needs_acquiring_load_reserved(n)); + + match(Set res (WeakCompareAndSwapS mem (Binary oldval newval))); + +- ins_cost(LOAD_COST + STORE_COST + BRANCH_COST * 2 + ALU_COST * 7); ++ ins_cost(2 * VOLATILE_REF_COST); + + effect(TEMP_DEF res, KILL cr, USE_KILL oldval, USE_KILL newval, TEMP tmp1, TEMP tmp2, TEMP tmp3); + + format %{ +- "cmpxchg_weak_acq $mem, $oldval, $newval\t# (short, weak) if $mem == $oldval then $mem <-- $newval\n\t" +- "# $res == 1 when success, #@weakCompareAndSwapSAcq" ++ "weak_cmpxchg_acq $mem, $oldval, $newval\t# (short, weak) if $mem == $oldval then $mem <-- $newval\n\t" ++ "# $res == 1 when success, #@weakCompareAndSwapSAcq_narrow" + %} + + ins_encode %{ +@@ -5896,21 +6115,42 @@ instruct weakCompareAndSwapSAcq(iRegINoSp res, indirect mem, iRegI_R12 oldval, i + ins_pipe(pipe_slow); + %} + ++instruct weakCompareAndSwapSAcq(iRegINoSp res, indirect mem, iRegI oldval, iRegI newval) ++%{ ++ predicate((UseZabha && UseZacas) && needs_acquiring_load_reserved(n)); ++ ++ match(Set res (WeakCompareAndSwapS mem (Binary oldval newval))); ++ ++ ins_cost(2 * VOLATILE_REF_COST); ++ ++ format %{ ++ "weak_cmpxchg_acq $mem, $oldval, $newval\t# (short, weak) if $mem == $oldval then $mem <-- $newval\n\t" ++ "# $res == 1 when success, #@weakCompareAndSwapSAcq" ++ %} ++ ++ ins_encode %{ ++ __ weak_cmpxchg(as_Register($mem$$base), $oldval$$Register, $newval$$Register, Assembler::int16, ++ /*acquire*/ Assembler::aq, /*release*/ Assembler::rl, $res$$Register); ++ %} ++ ++ ins_pipe(pipe_slow); ++%} ++ + instruct weakCompareAndSwapIAcq(iRegINoSp res, indirect mem, iRegI oldval, iRegI newval) + %{ + predicate(needs_acquiring_load_reserved(n)); + + match(Set res (WeakCompareAndSwapI mem (Binary oldval newval))); + +- ins_cost(LOAD_COST + STORE_COST + BRANCH_COST * 2 + ALU_COST * 2); ++ ins_cost(2 * VOLATILE_REF_COST); + + format %{ +- "cmpxchg_weak_acq $mem, $oldval, $newval\t# (int, weak) if $mem == $oldval then $mem <-- $newval\n\t" ++ "weak_cmpxchg_acq $mem, $oldval, $newval\t# (int, weak) if $mem == $oldval then $mem <-- $newval\n\t" + "# $res == 1 when success, #@weakCompareAndSwapIAcq" + %} + + ins_encode %{ +- __ cmpxchg_weak(as_Register($mem$$base), $oldval$$Register, $newval$$Register, Assembler::int32, ++ __ weak_cmpxchg(as_Register($mem$$base), $oldval$$Register, $newval$$Register, Assembler::int32, + /*acquire*/ Assembler::aq, /*release*/ Assembler::rl, $res$$Register); + %} + +@@ -5923,15 +6163,15 @@ instruct weakCompareAndSwapLAcq(iRegINoSp res, indirect mem, iRegL oldval, iRegL + + match(Set res (WeakCompareAndSwapL mem (Binary oldval newval))); + +- ins_cost(LOAD_COST + STORE_COST + 
BRANCH_COST * 2 + ALU_COST * 2); ++ ins_cost(2 * VOLATILE_REF_COST); + + format %{ +- "cmpxchg_weak_acq $mem, $oldval, $newval\t# (long, weak) if $mem == $oldval then $mem <-- $newval\n\t" ++ "weak_cmpxchg_acq $mem, $oldval, $newval\t# (long, weak) if $mem == $oldval then $mem <-- $newval\n\t" + "# $res == 1 when success, #@weakCompareAndSwapLAcq" + %} + + ins_encode %{ +- __ cmpxchg_weak(as_Register($mem$$base), $oldval$$Register, $newval$$Register, Assembler::int64, ++ __ weak_cmpxchg(as_Register($mem$$base), $oldval$$Register, $newval$$Register, Assembler::int64, + /*acquire*/ Assembler::aq, /*release*/ Assembler::rl, $res$$Register); + %} + +@@ -5944,15 +6184,15 @@ instruct weakCompareAndSwapNAcq(iRegINoSp res, indirect mem, iRegN oldval, iRegN + + match(Set res (WeakCompareAndSwapN mem (Binary oldval newval))); + +- ins_cost(LOAD_COST + STORE_COST + BRANCH_COST * 2 + ALU_COST * 4); ++ ins_cost(2 * VOLATILE_REF_COST); + + format %{ +- "cmpxchg_weak_acq $mem, $oldval, $newval\t# (narrow oop, weak) if $mem == $oldval then $mem <-- $newval\n\t" ++ "weak_cmpxchg_acq $mem, $oldval, $newval\t# (narrow oop, weak) if $mem == $oldval then $mem <-- $newval\n\t" + "# $res == 1 when success, #@weakCompareAndSwapNAcq" + %} + + ins_encode %{ +- __ cmpxchg_weak(as_Register($mem$$base), $oldval$$Register, $newval$$Register, Assembler::uint32, ++ __ weak_cmpxchg(as_Register($mem$$base), $oldval$$Register, $newval$$Register, Assembler::uint32, + /*acquire*/ Assembler::aq, /*release*/ Assembler::rl, $res$$Register); + %} + +@@ -5965,15 +6205,15 @@ instruct weakCompareAndSwapPAcq(iRegINoSp res, indirect mem, iRegP oldval, iRegP + + match(Set res (WeakCompareAndSwapP mem (Binary oldval newval))); + +- ins_cost(LOAD_COST + STORE_COST + BRANCH_COST * 2 + ALU_COST * 2); ++ ins_cost(2 * VOLATILE_REF_COST); + + format %{ +- "cmpxchg_weak_acq $mem, $oldval, $newval\t# (ptr, weak) if $mem == $oldval then $mem <-- $newval\n\t" ++ "weak_cmpxchg_acq $mem, $oldval, $newval\t# (ptr, weak) if $mem == $oldval then $mem <-- $newval\n\t" + "\t# $res == 1 when success, #@weakCompareAndSwapPAcq" + %} + + ins_encode %{ +- __ cmpxchg_weak(as_Register($mem$$base), $oldval$$Register, $newval$$Register, Assembler::int64, ++ __ weak_cmpxchg(as_Register($mem$$base), $oldval$$Register, $newval$$Register, Assembler::int64, + /*acquire*/ Assembler::aq, /*release*/ Assembler::rl, $res$$Register); + %} + diff --git a/openjdk-21.spec b/openjdk-21.spec index 8a29824872345be6251dc19ecdca45d2a625735b..53c88563df80da88513f1f6f2fc261958b09ef4f 100644 --- a/openjdk-21.spec +++ b/openjdk-21.spec @@ -905,7 +905,7 @@ Name: java-21-%{origin} Version: %{newjavaver}.%{buildver} # This package needs `.rolling` as part of Release so as to not conflict on install with # java-X-openjdk. I.e. 
when latest rolling release is also an LTS release packaged as
-Release: 3
+Release: 4
 
 # java-1.5.0-ibm from jpackage.org set Epoch to 1 for unknown reasons
@@ -1074,6 +1074,9 @@ Patch3017: Backport-JDK-8314125-RISC-V-implement-Base64-intrinsic.patch
 Patch3018: Backport-JDK-8318217-RISC-V-C2-VectorizedHashCode.patch
 Patch3019: Backport-JDK-8317971-RISC-V-implement-copySignF-D-and-signumF-D-intrinsics.patch
 Patch3020: Backport-JDK-8327964-8360179-RISC-V-Only-enable-BigInteger-intrinsics-when-AvoidUnalignedAccess-false.patch
+Patch3021: Backport-JDK-8315743-8315856-8344010-8344382-RISC-V-Use-Zacas-extension-for-cmpxchg.patch
+Patch3022: Backport-JDK-8319778-8324881-8319797-8319900-Recursive-lightweight-locking-riscv64-implementation.patch
+Patch3023: Backport-JDK-8345351-8356159-RISC-V-Add-Zabha.patch
 
 BuildRequires: autoconf
 BuildRequires: automake
@@ -1388,6 +1391,9 @@ pushd %{top_level_dir_name}
 %patch3018 -p1
 %patch3019 -p1
 %patch3020 -p1
+%patch3021 -p1
+%patch3022 -p1
+%patch3023 -p1
 popd
 %endif
 
@@ -1945,6 +1951,9 @@ cjc.mainProgram(args) -- the returns from copy_jdk_configs.lua should not affect
 
 %changelog
+* Fri Nov 14 2025 zhangshihui - 1:21.0.9.10-4
+- RISC-V: add Zacas and Zabha extension support and implement recursive lightweight locking
+
 * Tue Nov 04 2025 panxuefeng - 1:21.0.9.10-3
 - update LoongArch64 port to 21.0.9