revert port
Some checks failed
Build and Publish / BuildAndDeploy (push) Successful in 3m2s
Build and Publish / BuildAndDeployAmd64 (push) Has been cancelled

matst80
2025-10-10 12:10:37 +00:00
parent 716f1121aa
commit c30be581cd
6 changed files with 666 additions and 12 deletions

@@ -368,7 +368,9 @@ func (p *SyncedPool) RequestOwnership(id CartId) error {
 	ok := 0
 	all := 0
 	remotes := p.GetHealthyRemotes()
+	log.Printf("RequestOwnership start id=%s host=%s healthyRemotes=%d", id, p.Hostname, len(remotes))
 	for _, r := range remotes {
+		log.Printf("RequestOwnership sending ConfirmOwner to host=%s id=%s", r.Host, id)
 		ctx, cancel := context.WithTimeout(context.Background(), 800*time.Millisecond)
 		reply, err := r.ControlClient.ConfirmOwner(ctx, &proto.OwnerChangeRequest{
 			CartId: id.String(),
@@ -377,20 +379,32 @@ func (p *SyncedPool) RequestOwnership(id CartId) error {
 		cancel()
 		all++
 		if err != nil || reply == nil || !reply.Accepted {
-			log.Printf("ConfirmOwner failure from %s for %s: %v (reply=%v)", r.Host, id, err, reply)
+			log.Printf("RequestOwnership negative/failed response from host=%s id=%s err=%v reply=%v", r.Host, id, err, reply)
 			continue
 		}
 		ok++
+		log.Printf("RequestOwnership accept from host=%s id=%s (ok=%d all=%d)", r.Host, id, ok, all)
 	}
 
-	// Quorum rule mirrors legacy:
-	// - If fewer than 3 total, require all.
-	// - Else require majority (ok >= all/2).
-	if (all < 3 && ok < all) || ok < (all/2) {
+	// Quorum rule (majority semantics):
+	// - Let N = all remotes + 1 (self).
+	// - We require ok + 1 (implicit self vote) >= floor(N/2)+1,
+	//   => ok >= floor(N/2).
+	// - Examples:
+	//     N=2 (all=1): threshold=1 (need 1 remote)
+	//     N=3 (all=2): threshold=1 (need 1 remote; previously required 2)
+	//     N=4 (all=3): threshold=2
+	//     N=5 (all=4): threshold=2
+	// - This change allows faster ownership acquisition under partial remote availability in small clusters.
+
+	log.Printf("RequestOwnership quorum evaluation id=%s host=%s ok=%d all=%d", id, p.Hostname, ok, all)
+	threshold := (all + 1) / 2 // floor(N/2)
+	if ok < threshold {
 		p.removeLocalGrain(id)
-		return fmt.Errorf("quorum not reached (ok=%d all=%d)", ok, all)
+		log.Printf("RequestOwnership failed quorum id=%s host=%s ok=%d all=%d threshold=%d", id, p.Hostname, ok, all, threshold)
+		return fmt.Errorf("quorum not reached (ok=%d all=%d threshold=%d)", ok, all, threshold)
 	}
 
 	grainSyncCount.Inc()
+	log.Printf("RequestOwnership success id=%s host=%s ok=%d all=%d threshold=%d", id, p.Hostname, ok, all, threshold)
 	return nil
 }
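
For reference, the new rule collapses to a single integer expression. The standalone sketch below (illustrative, not part of this commit) tabulates the new threshold against the removed legacy rule for the cluster sizes listed in the comment; newThreshold, legacyAccepts, and minLegacy are hypothetical names, not identifiers from this repository.

// Standalone sketch (illustrative, not from this commit): compares the
// removed legacy quorum rule with the new floor(N/2) threshold.
package main

import "fmt"

// newThreshold mirrors the committed rule: (all + 1) / 2 == floor(N/2),
// where N = all remotes + 1 (self).
func newThreshold(all int) int { return (all + 1) / 2 }

// legacyAccepts mirrors the removed rule: fewer than 3 remotes required
// every ack; otherwise ok >= all/2.
func legacyAccepts(ok, all int) bool {
	return !((all < 3 && ok < all) || ok < all/2)
}

// minLegacy returns the smallest ok the legacy rule accepted.
func minLegacy(all int) int {
	for ok := 0; ok <= all; ok++ {
		if legacyAccepts(ok, all) {
			return ok
		}
	}
	return all
}

func main() {
	for all := 1; all <= 4; all++ {
		fmt.Printf("N=%d (all=%d): new threshold=%d, legacy needed ok>=%d\n",
			all+1, all, newThreshold(all), minLegacy(all))
	}
}

Note the N=4 row: the legacy majority check (ok >= all/2) accepted a single remote ack there, so the new rule tightens that case while relaxing N=3.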
@@ -400,7 +414,9 @@ func (p *SyncedPool) removeLocalGrain(id CartId) {
 	p.mu.Unlock()
 }
 
-// getGrain returns a local or remote grain; if absent, attempts ownership.
+// getGrain returns a local or remote grain. If absent, it synchronously attempts
+// to acquire ownership before spawning a local grain, to eliminate the race where
+// the first mutation applies before peers have installed remote proxies.
 func (p *SyncedPool) getGrain(id CartId) (Grain, error) {
 	p.mu.RLock()
 	localGrain, isLocal := p.local.grains[id]
@@ -415,10 +431,20 @@ func (p *SyncedPool) getGrain(id CartId) (Grain, error) {
 		return remoteGrain, nil
 	}
 
-	// Attempt to claim ownership (async semantics preserved)
-	go p.RequestOwnership(id)
+	// Synchronously attempt to claim ownership. If this fails (quorum not reached),
+	// we re-check for a newly appeared remote proxy (another node became owner).
+	if err := p.RequestOwnership(id); err != nil {
+		p.mu.RLock()
+		if rg, ok := p.remoteIndex[id]; ok {
+			p.mu.RUnlock()
+			remoteLookupCount.Inc()
+			return rg, nil
+		}
+		p.mu.RUnlock()
+		return nil, err
+	}
 
-	// Create local grain (lazy spawn) - may be rolled back by quorum failure
+	// Ownership acquired; now lazily spawn the local grain.
	grain, err := p.local.GetGrain(id)
 	if err != nil {
 		return nil, err
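
Taken together, the new getGrain path is a claim-or-defer sequence. Below is a condensed, self-contained sketch of that control flow; Pool, Grain, and the claim field are simplified stand-ins for the actual SyncedPool types, not code from this repository.

// Condensed sketch (simplified stand-in types, not the actual SyncedPool):
// the claim-or-defer pattern the new getGrain follows. A failed quorum claim
// defers to whichever peer won ownership instead of spawning a local grain
// that peers never registered.
package main

import (
	"errors"
	"fmt"
	"sync"
)

type Grain string

type Pool struct {
	mu     sync.RWMutex
	local  map[string]Grain
	remote map[string]Grain
	claim  func(id string) error // stand-in for RequestOwnership
}

func (p *Pool) getGrain(id string) (Grain, error) {
	p.mu.RLock()
	if g, ok := p.local[id]; ok {
		p.mu.RUnlock()
		return g, nil
	}
	if g, ok := p.remote[id]; ok {
		p.mu.RUnlock()
		return g, nil
	}
	p.mu.RUnlock()

	// Synchronous claim: the first mutation blocks until ownership settles.
	if err := p.claim(id); err != nil {
		// Quorum failed; another node may have become owner in the meantime.
		p.mu.RLock()
		defer p.mu.RUnlock()
		if g, ok := p.remote[id]; ok {
			return g, nil
		}
		return "", err
	}

	// Ownership acquired; lazily spawn the local grain.
	p.mu.Lock()
	defer p.mu.Unlock()
	g := Grain("local:" + id)
	p.local[id] = g
	return g, nil
}

func main() {
	p := &Pool{local: map[string]Grain{}, remote: map[string]Grain{}}
	// Simulate losing the quorum race: the claim fails, but by the time it
	// returns a peer has become owner and a remote proxy has appeared.
	p.claim = func(id string) error {
		p.mu.Lock()
		p.remote[id] = "remote:node-b"
		p.mu.Unlock()
		return errors.New("quorum not reached")
	}
	g, err := p.getGrain("cart-1")
	fmt.Println(g, err) // remote:node-b <nil>
}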