blob: 4f5ef3b0fc32fc518e6b093d437fe071008a8ffb [file] [log] [blame]
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a MIT
// license that can be found in the LICENSE file.
/*
* Project: cockroach
* Issue or PR : https://github.com/cockroachdb/cockroach/pull/10214
* Buggy version: 7207111aa3a43df0552509365fdec741a53f873f
* fix commit-id: 27e863d90ab0660494778f1c35966cc5ddc38e32
* Flaky: 3/100
* Description: This goroutine leak is caused by two goroutines acquiring
* coalescedMu and raftMu in opposite orders. The fix is to refactor sendQueuedHeartbeats()
* so that cockroachdb unlocks coalescedMu before locking raftMu.
*/
package main
import (
"os"
"runtime/pprof"
"sync"
"time"
"unsafe"
)
// init hands the Cockroach10214 scenario to register (defined elsewhere in
// this package) under the name "Cockroach10214".
func init() {
	register("Cockroach10214", Cockroach10214)
}
// Store_cockroach10214 is the store side of the lock-order scenario. It owns
// the coalesced-heartbeat mutex (L1) and the replicas that
// sendQueuedHeartbeatsToNode visits.
type Store_cockroach10214 struct {
// coalescedMu pairs the L1 mutex with heartbeatResponses.
// sendQueuedHeartbeats holds L1 for the whole of its loop, and
// maybeCoalesceHeartbeat locks and immediately unlocks it.
coalescedMu struct {
sync.Mutex // L1
// heartbeatResponses is only used for its length in this file: it sets how
// many times sendQueuedHeartbeats calls sendQueuedHeartbeatsToNode.
heartbeatResponses []int
}
// mu groups the replica map. No mutex is declared inside this struct.
mu struct {
// replicas is read by integer key 0..len-1 in sendQueuedHeartbeatsToNode.
replicas map[int]*Replica_cockroach10214
}
}
// sendQueuedHeartbeats locks coalescedMu (L1) and, without releasing it,
// calls sendQueuedHeartbeatsToNode once for every entry in
// heartbeatResponses. L1 is released when the function returns.
//
// Each of those calls goes on to lock a replica's raftMu (L2), so this path
// takes L1 then L2, while tick takes L2 and later L1. That inversion is the
// leak this file reproduces, so the ordering here is deliberate.
func (s *Store_cockroach10214) sendQueuedHeartbeats() {
	// L1 acquire
	s.coalescedMu.Lock()
	// L1 release (deferred until return)
	defer s.coalescedMu.Unlock()

	for range s.coalescedMu.heartbeatResponses {
		// Locks L2 while L1 is still held.
		s.sendQueuedHeartbeatsToNode()
	}
}
// sendQueuedHeartbeatsToNode calls reportUnreachable on the replicas stored
// under keys 0 through len(s.mu.replicas)-1, in ascending key order. Each
// call locks and then releases that replica's raftMu (L2).
func (s *Store_cockroach10214) sendQueuedHeartbeatsToNode() {
	for key := 0; key < len(s.mu.replicas); key++ {
		// L2
		s.mu.replicas[key].reportUnreachable()
	}
}
// Replica_cockroach10214 is the replica side of the lock-order scenario.
type Replica_cockroach10214 struct {
// raftMu (L2) is locked by reportUnreachable and by tick.
raftMu sync.Mutex // L2
// mu (L3) is locked by tickRaftMuLocked, which tick calls while holding raftMu.
mu sync.Mutex // L3
// store is the owning store; maybeCoalesceHeartbeat uses it to reach
// coalescedMu (L1).
store *Store_cockroach10214
}
// reportUnreachable locks raftMu (L2), sleeps for one millisecond with the
// lock held, and releases it on return.
func (r *Replica_cockroach10214) reportUnreachable() {
	// L2 acquire
	r.raftMu.Lock()
	// L2 release (deferred until return, i.e. after the sleep)
	defer r.raftMu.Unlock()

	time.Sleep(time.Millisecond)
}
// tick locks raftMu (L2) and runs tickRaftMuLocked with it held; L2 is
// released on return.
//
// The callee goes on to lock mu (L3) and then the store's coalescedMu (L1),
// so this path orders the locks L2 -> L3 -> L1, the opposite of
// sendQueuedHeartbeats.
func (r *Replica_cockroach10214) tick() {
	// L2 acquire
	r.raftMu.Lock()
	// L2 release (deferred until return)
	defer r.raftMu.Unlock()

	r.tickRaftMuLocked()
}
// tickRaftMuLocked locks mu (L3), calls maybeQuiesceLocked, and releases L3
// on return. Its only caller in this file, tick, already holds raftMu (L2).
func (r *Replica_cockroach10214) tickRaftMuLocked() {
	// L3 acquire
	r.mu.Lock()
	// L3 release (deferred until return)
	defer r.mu.Unlock()

	// The result is not needed: the function returns right after this call
	// whichever value comes back.
	r.maybeQuiesceLocked()
}
// maybeQuiesceLocked calls maybeCoalesceHeartbeat up to two times. It returns
// true as soon as one call reports false, and false if both calls report
// true. In this file it is reached with mu (L3) held by tickRaftMuLocked.
func (r *Replica_cockroach10214) maybeQuiesceLocked() bool {
	for remaining := 2; remaining > 0; remaining-- {
		if r.maybeCoalesceHeartbeat() {
			continue
		}
		return true
	}
	return false
}
// maybeCoalesceHeartbeat locks and immediately unlocks the store's
// coalescedMu (L1), then returns true.
//
// The message type is derived from the receiver's address modulo 3, so it is
// always 0, 1 or 2. The false-returning branch is therefore never taken, and
// every call attempts to take L1.
func (r *Replica_cockroach10214) maybeCoalesceHeartbeat() bool {
	if msgtype := uintptr(unsafe.Pointer(r)) % 3; msgtype > 2 {
		return false
	}
	r.store.coalescedMu.Lock()   // L1 acquire
	r.store.coalescedMu.Unlock() // L1 release
	return true
}
// Cockroach10214 starts 1000 independent instances of the scenario and then,
// on return, sleeps 100ms and writes the "goroutineleak" profile to stdout at
// debug level 2.
//
// Each instance builds its own store with two queued heartbeat responses and
// two replicas (keys 0 and 1), then races sendQueuedHeartbeats (G1) against
// tick on the first replica (G2). Nothing waits for these goroutines; ones
// that deadlock are left blocked.
func Cockroach10214() {
	prof := pprof.Lookup("goroutineleak")
	defer func() {
		// Presumably this pause gives the spawned goroutines time to reach
		// their blocking points before the profile is written.
		time.Sleep(100 * time.Millisecond)
		prof.WriteTo(os.Stdout, 2)
	}()

	for iter := 0; iter < 1000; iter++ {
		go func() {
			store := &Store_cockroach10214{}
			store.coalescedMu.heartbeatResponses = append(store.coalescedMu.heartbeatResponses, 1, 2)

			rp1 := &Replica_cockroach10214{store: store} // L2,3[0]
			rp2 := &Replica_cockroach10214{store: store} // L2,3[1]
			store.mu.replicas = map[int]*Replica_cockroach10214{
				0: rp1,
				1: rp2,
			}

			go store.sendQueuedHeartbeats() // G1
			go rp1.tick()                   // G2
		}()
	}
}
// Example of goroutine leak trace:
//
// G1 G2
//------------------------------------------------------------------------------------
// s.sendQueuedHeartbeats() .
// s.coalescedMu.Lock() [L1] .
// s.sendQueuedHeartbeatsToNode() .
// s.mu.replicas[0].reportUnreachable() .
// s.mu.replicas[0].raftMu.Lock() [L2] .
// . s.mu.replicas[0].tick()
// . s.mu.replicas[0].raftMu.Lock() [L2]
// . s.mu.replicas[0].tickRaftMuLocked()
// . s.mu.replicas[0].mu.Lock() [L3]
// . s.mu.replicas[0].maybeQuiesceLocked()
// . s.mu.replicas[0].maybeCoalesceHeartbeat()
// . s.coalescedMu.Lock() [L1]
//--------------------------------G1,G2 leak------------------------------------------