revert port
@@ -368,7 +368,9 @@ func (p *SyncedPool) RequestOwnership(id CartId) error {
     ok := 0
     all := 0
     remotes := p.GetHealthyRemotes()
+    log.Printf("RequestOwnership start id=%s host=%s healthyRemotes=%d", id, p.Hostname, len(remotes))
     for _, r := range remotes {
+        log.Printf("RequestOwnership sending ConfirmOwner to host=%s id=%s", r.Host, id)
         ctx, cancel := context.WithTimeout(context.Background(), 800*time.Millisecond)
         reply, err := r.ControlClient.ConfirmOwner(ctx, &proto.OwnerChangeRequest{
             CartId: id.String(),
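The hunk cuts off mid-call, so the overall shape of the vote loop is easy to miss. Below is a self-contained sketch of that shape; `voter` and `countVotes` are hypothetical stand-ins for the repo's `ControlClient.ConfirmOwner` and `proto.OwnerChangeRequest`, not its actual API.

package ownership

import (
    "context"
    "time"
)

// voter is a hypothetical stand-in for the diff's r.ControlClient:
// one ConfirmOwner round trip per healthy remote.
type voter interface {
    ConfirmOwner(ctx context.Context, cartID string) (accepted bool, err error)
}

// countVotes mirrors the loop in the hunk above: each RPC gets its own
// 800ms deadline so one slow remote cannot stall the whole claim, and
// every attempt is tallied in all whether or not it yields an accept.
func countVotes(remotes []voter, cartID string) (ok, all int) {
    for _, r := range remotes {
        ctx, cancel := context.WithTimeout(context.Background(), 800*time.Millisecond)
        accepted, err := r.ConfirmOwner(ctx, cartID)
        cancel() // release the timer promptly; deferring inside a loop would leak until return
        all++
        if err != nil || !accepted {
            continue
        }
        ok++
    }
    return ok, all
}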
@@ -377,20 +379,32 @@ func (p *SyncedPool) RequestOwnership(id CartId) error {
         cancel()
         all++
         if err != nil || reply == nil || !reply.Accepted {
-            log.Printf("ConfirmOwner failure from %s for %s: %v (reply=%v)", r.Host, id, err, reply)
+            log.Printf("RequestOwnership negative/failed response from host=%s id=%s err=%v reply=%v", r.Host, id, err, reply)
             continue
         }
         ok++
+        log.Printf("RequestOwnership accept from host=%s id=%s (ok=%d all=%d)", r.Host, id, ok, all)
     }
 
-    // Quorum rule mirrors legacy:
-    // - If fewer than 3 total, require all.
-    // - Else require majority (ok >= all/2).
-    if (all < 3 && ok < all) || ok < (all/2) {
+    // Quorum rule (majority semantics):
+    // - Let N = all remotes + 1 (self)
+    // - We require ok + 1 (implicit self vote) >= floor(N/2)+1
+    //   => ok >= floor(N/2)
+    // - Examples:
+    //     N=2 (all=1): threshold=1 (need 1 remote)
+    //     N=3 (all=2): threshold=1 (need 1 remote; previously required 2)
+    //     N=4 (all=3): threshold=2
+    //     N=5 (all=4): threshold=2
+    // - This change allows faster ownership under partial remote availability in small clusters.
+    log.Printf("RequestOwnership quorum evaluation id=%s host=%s ok=%d all=%d", id, p.Hostname, ok, all)
+    threshold := (all + 1) / 2 // floor(N/2)
+    if ok < threshold {
         p.removeLocalGrain(id)
-        return fmt.Errorf("quorum not reached (ok=%d all=%d)", ok, all)
+        log.Printf("RequestOwnership failed quorum id=%s host=%s ok=%d all=%d threshold=%d", id, p.Hostname, ok, all, threshold)
+        return fmt.Errorf("quorum not reached (ok=%d all=%d threshold=%d)", ok, all, threshold)
     }
     grainSyncCount.Inc()
+    log.Printf("RequestOwnership success id=%s host=%s ok=%d all=%d threshold=%d", id, p.Hostname, ok, all, threshold)
     return nil
 }
 
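The arithmetic in the new comment block can be checked mechanically. Here is a standalone sketch that tables the removed predicate against the new threshold for small clusters, with both expressions copied verbatim from the diff:

package main

import "fmt"

// legacyOK is the removed rule: unanimity below 3 remotes, else ok >= all/2.
func legacyOK(ok, all int) bool {
    return !((all < 3 && ok < all) || ok < (all/2))
}

// majorityOK is the new rule: with N = all+1 voters (self included),
// ok + 1 >= floor(N/2)+1, i.e. ok >= (all+1)/2 in integer division.
func majorityOK(ok, all int) bool {
    return ok >= (all+1)/2
}

func main() {
    for all := 1; all <= 4; all++ {
        for ok := 0; ok <= all; ok++ {
            fmt.Printf("all=%d ok=%d legacy=%v majority=%v\n",
                all, ok, legacyOK(ok, all), majorityOK(ok, all))
        }
    }
}

For all=2, ok=1 this prints legacy=false majority=true, which is exactly the "previously required 2" case called out in the comment.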
@@ -400,7 +414,9 @@ func (p *SyncedPool) removeLocalGrain(id CartId) {
     p.mu.Unlock()
 }
 
-// getGrain returns a local or remote grain; if absent, attempts ownership.
+// getGrain returns a local or remote grain. If absent, it synchronously attempts
+// to acquire ownership before spawning a local grain to eliminate the race where
+// the first mutation applies before peers have installed remote proxies.
 func (p *SyncedPool) getGrain(id CartId) (Grain, error) {
     p.mu.RLock()
     localGrain, isLocal := p.local.grains[id]
@@ -415,10 +431,20 @@ func (p *SyncedPool) getGrain(id CartId) (Grain, error) {
         return remoteGrain, nil
     }
 
-    // Attempt to claim ownership (async semantics preserved)
-    go p.RequestOwnership(id)
+    // Synchronously attempt to claim ownership. If this fails (quorum not reached)
+    // we re-check for a newly appeared remote proxy (another node became owner).
+    if err := p.RequestOwnership(id); err != nil {
+        p.mu.RLock()
+        if rg, ok := p.remoteIndex[id]; ok {
+            p.mu.RUnlock()
+            remoteLookupCount.Inc()
+            return rg, nil
+        }
+        p.mu.RUnlock()
+        return nil, err
+    }
 
-    // Create local grain (lazy spawn) - may be rolled back by quorum failure
+    // Ownership acquired; now lazily spawn the local grain.
     grain, err := p.local.GetGrain(id)
     if err != nil {
         return nil, err
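The new failure path is the subtle part: when the quorum attempt loses, getGrain re-checks remoteIndex before surfacing the error, because the losing round may be exactly the moment a peer became owner. A minimal standalone sketch of that double-check, with pool and grain as hypothetical reductions of the real SyncedPool fields:

package ownership

import "sync"

type grain any

// pool is a hypothetical reduction of SyncedPool to the two fields the
// fallback touches: the lock and the index of remote proxies.
type pool struct {
    mu          sync.RWMutex
    remoteIndex map[string]grain
}

// getAfterFailedClaim re-checks the remote index under a read lock after a
// failed ownership claim; a hit means another node won ownership while our
// own claim was falling short of quorum, so we hand back its proxy instead
// of the quorum error.
func (p *pool) getAfterFailedClaim(id string, claimErr error) (grain, error) {
    p.mu.RLock()
    defer p.mu.RUnlock()
    if g, ok := p.remoteIndex[id]; ok {
        return g, nil
    }
    return nil, claimErr
}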