// Copyright 2019 Sorint.lab // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied // See the License for the specific language governing permissions and // limitations under the License. package wal import ( "bytes" "container/ring" "context" "encoding/json" "io" "io/ioutil" "path" "strings" "time" uuid "github.com/satori/go.uuid" "github.com/sorintlab/agola/internal/etcd" "github.com/sorintlab/agola/internal/objectstorage" "github.com/sorintlab/agola/internal/sequence" "github.com/pkg/errors" etcdclientv3 "go.etcd.io/etcd/clientv3" "go.etcd.io/etcd/clientv3/concurrency" etcdclientv3rpc "go.etcd.io/etcd/etcdserver/api/v3rpc/rpctypes" "go.etcd.io/etcd/mvcc/mvccpb" "go.uber.org/zap" ) // TODO(sgotti) handle etcd unwanted changes: // * Etcd cluster rebuild: we cannot rely on etcd header ClusterID since it could be the same as it's generated using the listen urls. We should add our own clusterid key and use it. // * Etcd cluster restored to a previous revision: really bad cause should detect that the revision is smaller than the current one // Storage paths // wals/{walSeq} // // Etcd paths // wals/{walSeq} const ( DefaultEtcdWalsKeepNum = 100 ) var ( ErrCompacted = errors.New("required revision has been compacted") ErrConcurrency = errors.New("wal concurrency error: change groups already updated") ) var ( // Storage paths. Always use path (not filepath) to use the "/" separator storageObjectsPrefix = "data/" storageWalsDir = "wals" storageWalsStatusDir = path.Join(storageWalsDir, "status") storageWalsDataDir = path.Join(storageWalsDir, "data") // etcd paths. Always use path (not filepath) to use the "/" separator etcdWalBaseDir = "walmanager" etcdWalsDir = path.Join(etcdWalBaseDir, "wals") etcdWalsDataKey = path.Join(etcdWalBaseDir, "walsdata") etcdWalSeqKey = path.Join(etcdWalBaseDir, "walseq") etcdLastCommittedStorageWalSeqKey = path.Join(etcdWalBaseDir, "lastcommittedstoragewalseq") etcdSyncLockKey = path.Join(etcdWalBaseDir, "synclock") etcdCheckpointLockKey = path.Join(etcdWalBaseDir, "checkpointlock") etcdWalCleanerLockKey = path.Join(etcdWalBaseDir, "walcleanerlock") etcdChangeGroupsDir = path.Join(etcdWalBaseDir, "changegroups") etcdChangeGroupMinRevisionKey = path.Join(etcdWalBaseDir, "changegroupsminrev") etcdPingKey = path.Join(etcdWalBaseDir, "ping") ) const ( etcdChangeGroupMinRevisionRange = 1000 ) func (w *WalManager) toStorageDataPath(path string) string { return w.basePath + storageObjectsPrefix + path } func (w *WalManager) fromStorageDataPath(path string) string { return strings.TrimPrefix(path, w.basePath+storageObjectsPrefix) } func (w *WalManager) storageWalStatusFile(walSeq string) string { return path.Join(w.basePath, storageWalsStatusDir, walSeq) } func (w *WalManager) storageWalDataFile(walFileID string) string { return path.Join(w.basePath, storageWalsDataDir, walFileID) } func etcdWalKey(walSeq string) string { return path.Join(etcdWalsDir, walSeq) } type ActionType string const ( ActionTypePut ActionType = "put" ActionTypeDelete ActionType = "delete" ) type Action struct { ActionType ActionType DataType string ID string Data []byte } type WalHeader struct { WalDataFileID string PreviousWalSequence string ChangeGroups map[string]int64 } type WalStatus string const ( // WalStatusCommitted represent a wal written to the objectstorage WalStatusCommitted WalStatus = "committed" // WalStatusCommittedStorage represent the .committed marker file written to the objectstorage WalStatusCommittedStorage WalStatus = "committed_storage" // WalStatusCheckpointed mean that all the wal actions have been executed on the objectstorage WalStatusCheckpointed WalStatus = "checkpointed" ) type WalsData struct { LastCommittedWalSequence string Revision int64 `json:"-"` } type WalData struct { WalDataFileID string WalStatus WalStatus WalSequence string PreviousWalSequence string ChangeGroups map[string]int64 } type ChangeGroupsUpdateToken struct { CurRevision int64 `json:"cur_revision"` ChangeGroupsRevisions changeGroupsRevisions `json:"change_groups_revisions"` } type changeGroupsRevisions map[string]int64 func (w *WalManager) ChangesCurrentRevision() (int64, error) { w.changes.Lock() defer w.changes.Unlock() if !w.changes.initialized { return 0, errors.Errorf("wal changes not ready") } return w.changes.revision, nil } func (w *WalManager) GetChangeGroupsUpdateToken(cgNames []string) (*ChangeGroupsUpdateToken, error) { w.changes.Lock() defer w.changes.Unlock() if !w.changes.initialized { return nil, errors.Errorf("wal changes not ready") } revision := w.changes.curRevision() cgr := w.changes.getChangeGroups(cgNames) return &ChangeGroupsUpdateToken{CurRevision: revision, ChangeGroupsRevisions: cgr}, nil } func (w *WalManager) MergeChangeGroupsUpdateTokens(cgts []*ChangeGroupsUpdateToken) *ChangeGroupsUpdateToken { mcgt := &ChangeGroupsUpdateToken{ChangeGroupsRevisions: make(changeGroupsRevisions)} for _, cgt := range cgts { // keep the lower curRevision if cgt.CurRevision != 0 && cgt.CurRevision < mcgt.CurRevision { mcgt.CurRevision = cgt.CurRevision } // keep the lower changegroup revision for cgName, cgRev := range cgt.ChangeGroupsRevisions { if mr, ok := mcgt.ChangeGroupsRevisions[cgName]; ok { if cgRev < mr { mcgt.ChangeGroupsRevisions[cgName] = cgRev } } else { mcgt.ChangeGroupsRevisions[cgName] = cgRev } } } return mcgt } func (w *WalManager) ReadObject(p string, cgNames []string) (io.ReadCloser, *ChangeGroupsUpdateToken, error) { w.changes.Lock() if !w.changes.initialized { w.changes.Unlock() return nil, nil, errors.Errorf("wal changes not ready") } walseq, ok := w.changes.getPut(p) revision := w.changes.curRevision() cgr := w.changes.getChangeGroups(cgNames) actions := w.changes.actions[walseq] w.changes.Unlock() cgt := &ChangeGroupsUpdateToken{CurRevision: revision, ChangeGroupsRevisions: cgr} if ok { for _, action := range actions { if action.ActionType == ActionTypePut { dataPath := w.dataToPathFunc(action.DataType, action.ID) if dataPath == p { w.log.Debugf("reading file from wal: %q", dataPath) return ioutil.NopCloser(bytes.NewReader(action.Data)), cgt, nil } } } return nil, nil, errors.Errorf("no file %s in wal %s", p, walseq) } f, err := w.ost.ReadObject(w.toStorageDataPath(p)) return f, cgt, err } func (w *WalManager) changesList(paths []string, prefix, startWith string, recursive bool) []string { fpaths := []string{} for _, p := range paths { if !recursive && len(p) > len(prefix) { rel := strings.TrimPrefix(p, prefix) skip := strings.Contains(rel, w.ost.Delimiter()) if skip { continue } } if strings.HasPrefix(p, prefix) && p > startWith { fpaths = append(fpaths, p) } } return fpaths } func (w *WalManager) List(prefix, startWith string, recursive bool, doneCh <-chan struct{}) <-chan objectstorage.ObjectInfo { objectCh := make(chan objectstorage.ObjectInfo, 1) prefix = w.toStorageDataPath(prefix) startWith = w.toStorageDataPath(startWith) w.changes.Lock() if !w.changes.initialized { w.changes.Unlock() objectCh <- objectstorage.ObjectInfo{Err: errors.Errorf("wal changes not ready")} return objectCh } changesList := w.changesList(w.changes.pathsOrdered, prefix, startWith, recursive) deletedChangesMap := w.changes.getDeletesMap() w.changes.Unlock() ci := 0 go func(objectCh chan<- objectstorage.ObjectInfo) { defer close(objectCh) for object := range w.ost.List(prefix, startWith, recursive, doneCh) { if object.Err != nil { objectCh <- object return } object.Path = w.fromStorageDataPath(object.Path) for ci < len(changesList) { p := changesList[ci] if p < object.Path { //w.log.Infof("using path from changelist: %q", p) select { // Send object content. case objectCh <- objectstorage.ObjectInfo{Path: p}: // If receives done from the caller, return here. case <-doneCh: return } ci++ } else if p == object.Path { ci++ break } else { break } } if _, ok := deletedChangesMap[object.Path]; ok { continue } //w.log.Infof("using path from objectstorage: %q", object.Path) select { // Send object content. case objectCh <- object: // If receives done from the caller, return here. case <-doneCh: return } } for ci < len(changesList) { //w.log.Infof("using path from changelist: %q", changesList[ci]) objectCh <- objectstorage.ObjectInfo{ Path: changesList[ci], } ci++ } }(objectCh) return objectCh } func (w *WalManager) HasOSTWal(walseq string) (bool, error) { _, err := w.ost.Stat(w.storageWalStatusFile(walseq) + ".committed") if err == objectstorage.ErrNotExist { return false, nil } if err != nil { return false, err } return true, nil } func (w *WalManager) ReadWal(walseq string) (io.ReadCloser, error) { return w.ost.ReadObject(w.storageWalStatusFile(walseq) + ".committed") } func (w *WalManager) ReadWalData(walFileID string) (io.ReadCloser, error) { return w.ost.ReadObject(w.storageWalDataFile(walFileID)) } type WalFile struct { WalSequence string Err error Committed bool Checkpointed bool } func (w *WalManager) ListOSTWals(start string) <-chan *WalFile { walCh := make(chan *WalFile, 1) go func() { doneCh := make(chan struct{}) defer close(doneCh) defer close(walCh) curWal := &WalFile{} var startPath string if start != "" { startPath = w.storageWalStatusFile(start) } for object := range w.ost.List(path.Join(w.basePath, storageWalsStatusDir)+"/", startPath, true, doneCh) { if object.Err != nil { walCh <- &WalFile{ Err: object.Err, } return } name := path.Base(object.Path) ext := path.Ext(name) walSequence := strings.TrimSuffix(name, ext) // wal file refers to another wal, so return the current one if curWal.WalSequence != walSequence { // if this happen something is wrong on the objectstorage if !curWal.Committed && curWal.Checkpointed { walCh <- &WalFile{ Err: errors.Errorf("wal is checkpointed but not committed. this should never happen"), } return } if curWal.WalSequence != "" { // skip not committed wals if curWal.Committed { walCh <- curWal } } curWal = &WalFile{ WalSequence: walSequence, } } if ext == ".committed" { curWal.Committed = true } if ext == ".checkpointed" { curWal.Checkpointed = true } } if curWal.WalSequence != "" { walCh <- curWal } }() return walCh } type ListEtcdWalsElement struct { WalData *WalData Err error } func (w *WalManager) ListEtcdWals(ctx context.Context, revision int64) <-chan *ListEtcdWalsElement { walCh := make(chan *ListEtcdWalsElement, 1) go func() { defer close(walCh) var continuation *etcd.ListPagedContinuation for { listResp, err := w.e.ListPaged(ctx, etcdWalsDir, revision, 10, continuation) if err != nil { walCh <- &ListEtcdWalsElement{ Err: err, } return } resp := listResp.Resp continuation = listResp.Continuation for _, kv := range resp.Kvs { var walData *WalData err := json.Unmarshal(kv.Value, &walData) walCh <- &ListEtcdWalsElement{ WalData: walData, Err: err, } } if !listResp.HasMore { break } } }() return walCh } // FirstAvailableWalData returns the first (the one with smaller sequence) wal // and returns it (or nil if not available) and the etcd revision at the time of // the operation func (w *WalManager) FirstAvailableWalData(ctx context.Context) (*WalData, int64, error) { // list waldata and just get the first if available listResp, err := w.e.ListPaged(ctx, etcdWalsDir, 0, 1, nil) if err != nil { return nil, 0, err } resp := listResp.Resp revision := resp.Header.Revision if len(resp.Kvs) == 0 { return nil, revision, nil } var walData *WalData if err := json.Unmarshal(resp.Kvs[0].Value, &walData); err != nil { return nil, 0, err } return walData, revision, nil } func (w *WalManager) LastCommittedStorageWal(ctx context.Context) (string, int64, error) { resp, err := w.e.Get(ctx, etcdLastCommittedStorageWalSeqKey, 0) if err != nil && err != etcd.ErrKeyNotFound { return "", 0, err } if err == etcd.ErrKeyNotFound { return "", 0, errors.Errorf("no last committedstorage wal on etcd") } lastCommittedStorageWal := string(resp.Kvs[0].Value) revision := resp.Header.Revision return lastCommittedStorageWal, revision, nil } type WatchElement struct { Revision int64 WalData *WalData ChangeGroupsRevisions changeGroupsRevisions Err error } func (w *WalManager) Watch(ctx context.Context, revision int64) <-chan *WatchElement { walCh := make(chan *WatchElement, 1) // TODO(sgotti) if the etcd cluster goes down, watch won't return an error but // wait until it comes back. We have to find a way to detect when the cluster // is down and report an error so our clients can react (i.e. a readdb could // mark itself as not in sync) wctx := etcdclientv3.WithRequireLeader(ctx) wch := w.e.Watch(wctx, etcdWalBaseDir+"/", revision) go func() { defer close(walCh) for wresp := range wch { we := &WatchElement{ChangeGroupsRevisions: make(changeGroupsRevisions)} send := false if wresp.Canceled { err := wresp.Err() switch err { case etcdclientv3rpc.ErrCompacted: we.Err = ErrCompacted default: we.Err = err } walCh <- we return } we.Revision = wresp.Header.Revision for _, ev := range wresp.Events { key := string(ev.Kv.Key) switch { case strings.HasPrefix(key, etcdWalsDir+"/"): send = true switch ev.Type { case mvccpb.PUT: var walData *WalData if err := json.Unmarshal(ev.Kv.Value, &walData); err != nil { we.Err = wresp.Err() walCh <- we return } we.WalData = walData } case strings.HasPrefix(key, etcdChangeGroupsDir+"/"): send = true switch ev.Type { case mvccpb.PUT: changeGroup := path.Base(string(ev.Kv.Key)) we.ChangeGroupsRevisions[changeGroup] = ev.Kv.ModRevision case mvccpb.DELETE: changeGroup := path.Base(string(ev.Kv.Key)) we.ChangeGroupsRevisions[changeGroup] = 0 } case key == etcdPingKey: send = true default: continue } } if send { walCh <- we } } }() return walCh } // WriteWal writes the provided actions in a wal file. The wal will be marked as // "committed" on etcd if the provided group changes aren't changed in the // meantime or a optimistic concurrency error will be returned and the wal won't // be committed // // TODO(sgotti) save inside the wal file also the previous committed wal to // handle possible objectstorage list operation eventual consistency gaps (list // won't report a wal at seq X but a wal at X+n, if this kind of eventual // consistency ever exists) func (w *WalManager) WriteWal(ctx context.Context, actions []*Action, cgt *ChangeGroupsUpdateToken) (*ChangeGroupsUpdateToken, error) { return w.WriteWalAdditionalOps(ctx, actions, cgt, nil, nil) } func (w *WalManager) WriteWalAdditionalOps(ctx context.Context, actions []*Action, cgt *ChangeGroupsUpdateToken, cmp []etcdclientv3.Cmp, then []etcdclientv3.Op) (*ChangeGroupsUpdateToken, error) { if len(actions) == 0 { return nil, errors.Errorf("cannot write wal: actions is empty") } walSequence, err := sequence.IncSequence(ctx, w.e, etcdWalSeqKey) if err != nil { return nil, err } resp, err := w.e.Get(ctx, etcdWalsDataKey, 0) if err != nil { return nil, err } var walsData WalsData if err := json.Unmarshal(resp.Kvs[0].Value, &walsData); err != nil { return nil, err } walsData.Revision = resp.Kvs[0].ModRevision walDataFileID := uuid.NewV4().String() walDataFilePath := w.storageWalDataFile(walDataFileID) walKey := etcdWalKey(walSequence.String()) var buf bytes.Buffer for _, action := range actions { actionj, err := json.Marshal(action) if err != nil { return nil, err } if _, err := buf.Write(actionj); err != nil { return nil, err } } if err := w.ost.WriteObject(walDataFilePath, bytes.NewReader(buf.Bytes())); err != nil { return nil, err } w.log.Debugf("wrote wal file: %s", walDataFilePath) walsData.LastCommittedWalSequence = walSequence.String() walData := &WalData{ WalSequence: walSequence.String(), WalDataFileID: walDataFileID, WalStatus: WalStatusCommitted, } walsDataj, err := json.Marshal(walsData) if err != nil { return nil, err } walDataj, err := json.Marshal(walData) if err != nil { return nil, err } if cmp == nil { cmp = []etcdclientv3.Cmp{} } if then == nil { then = []etcdclientv3.Op{} } getWalsData := etcdclientv3.OpGet(etcdWalsDataKey) getWal := etcdclientv3.OpGet(walKey) //w.log.Infof("cgt: %s", util.Dump(cgt)) if cgt != nil { for cgName, cgRev := range cgt.ChangeGroupsRevisions { cgKey := path.Join(etcdChangeGroupsDir, cgName) if cgRev > 0 { cmp = append(cmp, etcdclientv3.Compare(etcdclientv3.ModRevision(cgKey), "=", cgRev)) } else { cmp = append(cmp, etcdclientv3.Compare(etcdclientv3.CreateRevision(cgKey), "=", 0)) } then = append(then, etcdclientv3.OpPut(cgKey, "")) } if cgt.CurRevision > 0 { cmp = append(cmp, etcdclientv3.Compare(etcdclientv3.ModRevision(etcdChangeGroupMinRevisionKey), "<", cgt.CurRevision+etcdChangeGroupMinRevisionRange)) } } cmp = append(cmp, etcdclientv3.Compare(etcdclientv3.ModRevision(etcdWalsDataKey), "=", walsData.Revision)) cmp = append(cmp, etcdclientv3.Compare(etcdclientv3.Version(walKey), "=", 0)) then = append(then, etcdclientv3.OpPut(etcdWalsDataKey, string(walsDataj))) then = append(then, etcdclientv3.OpPut(walKey, string(walDataj))) // This will only succeed if no one else have concurrently updated the walsData // TODO(sgotti) retry if it failed due to concurrency errors txn := w.e.Client().Txn(ctx).If(cmp...).Then(then...).Else(getWalsData, getWal) tresp, err := txn.Commit() if err != nil { return nil, etcd.FromEtcdError(err) } if !tresp.Succeeded { walsDataRev := tresp.Responses[0].GetResponseRange().Kvs[0].ModRevision walDataCreateRev := tresp.Responses[0].GetResponseRange().Kvs[0].CreateRevision // TODO(sgotti) If the tx failed due to walsdata already updated we could retry if walsDataRev == walsData.Revision && walDataCreateRev == 0 { return nil, errors.Errorf("failed to write committed wal: wals groups already updated") } return nil, ErrConcurrency } ncgt := &ChangeGroupsUpdateToken{CurRevision: tresp.Header.Revision, ChangeGroupsRevisions: make(changeGroupsRevisions)} if cgt != nil { for cgName := range cgt.ChangeGroupsRevisions { ncgt.ChangeGroupsRevisions[cgName] = tresp.Header.Revision } } // try to commit storage right now if err := w.sync(ctx); err != nil { w.log.Errorf("wal sync error: %+v", err) } return ncgt, nil } func (w *WalManager) syncLoop(ctx context.Context) { for { w.log.Debugf("syncer") if err := w.sync(ctx); err != nil { w.log.Errorf("syncer error: %+v", err) } select { case <-ctx.Done(): return default: } time.Sleep(5 * time.Second) } } func (w *WalManager) sync(ctx context.Context) error { session, err := concurrency.NewSession(w.e.Client(), concurrency.WithTTL(5), concurrency.WithContext(ctx)) if err != nil { return err } defer session.Close() m := concurrency.NewMutex(session, etcdSyncLockKey) if err := m.Lock(ctx); err != nil { return err } defer m.Unlock(ctx) resp, err := w.e.List(ctx, etcdWalsDir+"/", "", 0) if err != nil { return err } for _, kv := range resp.Kvs { var walData WalData if err := json.Unmarshal(kv.Value, &walData); err != nil { return err } // wals must be committed and checkpointed in order. // TODO(sgotti) this could be optimized by parallelizing writes of wals that don't have common change groups switch walData.WalStatus { case WalStatusCommitted: walFilePath := w.storageWalStatusFile(walData.WalSequence) w.log.Debugf("syncing committed wal to storage") header := &WalHeader{ WalDataFileID: walData.WalDataFileID, ChangeGroups: walData.ChangeGroups, PreviousWalSequence: walData.PreviousWalSequence, } headerj, err := json.Marshal(header) if err != nil { return err } walFileCommittedPath := walFilePath + ".committed" if err := w.ost.WriteObject(walFileCommittedPath, bytes.NewReader(headerj)); err != nil { return err } w.log.Debugf("updating wal to state %q", WalStatusCommittedStorage) walData.WalStatus = WalStatusCommittedStorage walDataj, err := json.Marshal(walData) if err != nil { return err } cmp := []etcdclientv3.Cmp{} then := []etcdclientv3.Op{} cmp = append(cmp, etcdclientv3.Compare(etcdclientv3.ModRevision(string(kv.Key)), "=", kv.ModRevision)) then = append(then, etcdclientv3.OpPut(string(kv.Key), string(walDataj))) then = append(then, etcdclientv3.OpPut(string(etcdLastCommittedStorageWalSeqKey), string(walData.WalSequence))) // This will only succeed if the no one else have concurrently updated the wal keys in etcd txn := w.e.Client().Txn(ctx).If(cmp...).Then(then...) tresp, err := txn.Commit() if err != nil { return etcd.FromEtcdError(err) } if !tresp.Succeeded { return errors.Errorf("failed to write committedstorage wal: concurrent update") } case WalStatusCheckpointed: walFilePath := w.storageWalStatusFile(walData.WalSequence) w.log.Debugf("checkpointing committed wal to storage") walFileCheckpointedPath := walFilePath + ".checkpointed" if err := w.ost.WriteObject(walFileCheckpointedPath, bytes.NewReader([]byte{})); err != nil { return err } } } return nil } func (w *WalManager) checkpointLoop(ctx context.Context) { for { w.log.Debugf("checkpointer") if err := w.checkpoint(ctx); err != nil { w.log.Errorf("checkpoint error: %v", err) } select { case <-ctx.Done(): return default: } time.Sleep(2 * time.Second) } } func (w *WalManager) checkpoint(ctx context.Context) error { session, err := concurrency.NewSession(w.e.Client(), concurrency.WithTTL(5), concurrency.WithContext(ctx)) if err != nil { return err } defer session.Close() m := concurrency.NewMutex(session, etcdCheckpointLockKey) if err := m.Lock(ctx); err != nil { return err } defer m.Unlock(ctx) resp, err := w.e.List(ctx, etcdWalsDir+"/", "", 0) if err != nil { return err } for _, kv := range resp.Kvs { var walData WalData if err := json.Unmarshal(kv.Value, &walData); err != nil { return err } if walData.WalStatus == WalStatusCommitted { w.log.Warnf("wal %s not yet committed storage", walData.WalSequence) break } if walData.WalStatus == WalStatusCheckpointed { continue } walFilePath := w.storageWalDataFile(walData.WalDataFileID) w.log.Debugf("checkpointing wal: %q", walData.WalSequence) walFile, err := w.ost.ReadObject(walFilePath) if err != nil { return err } dec := json.NewDecoder(walFile) for { var action *Action err := dec.Decode(&action) if err == io.EOF { // all done break } if err != nil { walFile.Close() return err } if err := w.checkpointAction(ctx, action); err != nil { walFile.Close() return err } } walFile.Close() w.log.Debugf("updating wal to state %q", WalStatusCheckpointed) walData.WalStatus = WalStatusCheckpointed walDataj, err := json.Marshal(walData) if err != nil { return err } if _, err := w.e.AtomicPut(ctx, string(kv.Key), walDataj, kv.ModRevision, nil); err != nil { return err } } return nil } func (w *WalManager) checkpointAction(ctx context.Context, action *Action) error { dataPath := w.dataToPathFunc(action.DataType, action.ID) if dataPath == "" { return nil } path := w.toStorageDataPath(dataPath) switch action.ActionType { case ActionTypePut: w.log.Debugf("writing file: %q", path) if err := w.ost.WriteObject(path, bytes.NewReader(action.Data)); err != nil { return err } case ActionTypeDelete: w.log.Debugf("deleting file: %q", path) if err := w.ost.DeleteObject(path); err != nil && err != objectstorage.ErrNotExist { return err } } return nil } func (w *WalManager) walCleanerLoop(ctx context.Context) { for { w.log.Debugf("walcleaner") if err := w.walCleaner(ctx); err != nil { w.log.Errorf("walcleaner error: %v", err) } select { case <-ctx.Done(): return default: } time.Sleep(2 * time.Second) } } // walCleaner will clean already checkpointed wals from etcd // it must always keep at least one wal that is needed for resync operations // from clients func (w *WalManager) walCleaner(ctx context.Context) error { session, err := concurrency.NewSession(w.e.Client(), concurrency.WithTTL(5), concurrency.WithContext(ctx)) if err != nil { return err } defer session.Close() m := concurrency.NewMutex(session, etcdWalCleanerLockKey) if err := m.Lock(ctx); err != nil { return err } defer m.Unlock(ctx) resp, err := w.e.List(ctx, etcdWalsDir+"/", "", 0) if err != nil { return err } if len(resp.Kvs) <= w.etcdWalsKeepNum { return nil } removeCount := len(resp.Kvs) - w.etcdWalsKeepNum for _, kv := range resp.Kvs { var walData WalData if err := json.Unmarshal(kv.Value, &walData); err != nil { return err } if walData.WalStatus != WalStatusCheckpointed { break } // TODO(sgotti) check that the objectstorage returns the wal actions as checkpointed. // With eventual consistent object storages like S3 we shouldn't remove a wal // file from etcd (and so from the cache) until we are sure there're no // eventual consistency issues. The difficult part is how to check them and be // sure that no objects with old data will be returned? Is it enough to read // it back or the result could just be luckily correct but another client may // arrive to a differnt S3 server that is not yet in sync? w.log.Infof("removing wal %q from etcd", walData.WalSequence) if _, err := w.e.AtomicDelete(ctx, string(kv.Key), kv.ModRevision); err != nil { return err } removeCount-- if removeCount == 0 { return nil } } return nil } func (w *WalManager) compactChangeGroupsLoop(ctx context.Context) { for { if err := w.compactChangeGroups(ctx); err != nil { w.log.Errorf("err: %+v", err) } select { case <-ctx.Done(): return default: } time.Sleep(1 * time.Second) } } func (w *WalManager) compactChangeGroups(ctx context.Context) error { resp, err := w.e.Client().Get(ctx, etcdChangeGroupMinRevisionKey) if err != nil { return err } revision := resp.Kvs[0].ModRevision // first update minrevision cmp := etcdclientv3.Compare(etcdclientv3.ModRevision(etcdChangeGroupMinRevisionKey), "=", revision) then := etcdclientv3.OpPut(etcdChangeGroupMinRevisionKey, "") txn := w.e.Client().Txn(ctx).If(cmp).Then(then) tresp, err := txn.Commit() if err != nil { return etcd.FromEtcdError(err) } if !tresp.Succeeded { return errors.Errorf("failed to update change group min revision key due to concurrent update") } revision = tresp.Header.Revision // then remove all the groups keys with modrevision < minrevision resp, err = w.e.List(ctx, etcdChangeGroupsDir, "", 0) if err != nil { return err } for _, kv := range resp.Kvs { if kv.ModRevision < revision-etcdChangeGroupMinRevisionRange { cmp := etcdclientv3.Compare(etcdclientv3.ModRevision(string(kv.Key)), "=", kv.ModRevision) then := etcdclientv3.OpDelete(string(kv.Key)) txn := w.e.Client().Txn(ctx).If(cmp).Then(then) tresp, err := txn.Commit() if err != nil { return etcd.FromEtcdError(err) } if !tresp.Succeeded { w.log.Errorf("failed to update change group min revision key due to concurrent update") } } } return nil } // etcdPingerLoop periodically updates a key. // This is used by watchers to inform the client of the current revision // this is needed since if other users are updating other unwatched keys on // etcd we won't be notified, not updating the known revisions and thus all the // walWrites will fails since the provided changegrouptoken will have an old // revision // TODO(sgotti) use upcoming etcd 3.4 watch RequestProgress??? func (w *WalManager) etcdPingerLoop(ctx context.Context) { for { if err := w.etcdPinger(ctx); err != nil { w.log.Errorf("err: %+v", err) } select { case <-ctx.Done(): return default: } time.Sleep(1 * time.Second) } } func (w *WalManager) etcdPinger(ctx context.Context) error { if _, err := w.e.Put(ctx, etcdPingKey, []byte{}, nil); err != nil { return err } return nil } func (w *WalManager) InitEtcd(ctx context.Context) error { writeWal := func(wal *WalFile) error { w.log.Infof("wal seq: %s", wal.WalSequence) walFile, err := w.ost.ReadObject(w.storageWalStatusFile(wal.WalSequence) + ".committed") if err != nil { return err } dec := json.NewDecoder(walFile) var header *WalHeader if err = dec.Decode(&header); err != nil && err != io.EOF { walFile.Close() return err } walFile.Close() walData := &WalData{ WalSequence: wal.WalSequence, WalDataFileID: header.WalDataFileID, WalStatus: WalStatusCommitted, ChangeGroups: header.ChangeGroups, } if wal.Checkpointed { walData.WalStatus = WalStatusCheckpointed } walDataj, err := json.Marshal(walData) if err != nil { return err } cmp := []etcdclientv3.Cmp{} then := []etcdclientv3.Op{} // only add if it doesn't exist cmp = append(cmp, etcdclientv3.Compare(etcdclientv3.CreateRevision(etcdWalKey(wal.WalSequence)), "=", 0)) then = append(then, etcdclientv3.OpPut(etcdWalKey(wal.WalSequence), string(walDataj))) txn := w.e.Client().Txn(ctx).If(cmp...).Then(then...) tresp, err := txn.Commit() if err != nil { return etcd.FromEtcdError(err) } if !tresp.Succeeded { return errors.Errorf("failed to sync etcd: wal %q already written", wal.WalSequence) } return nil } // Create changegroup min revision if it doesn't exists cmp := []etcdclientv3.Cmp{} then := []etcdclientv3.Op{} cmp = append(cmp, etcdclientv3.Compare(etcdclientv3.CreateRevision(etcdChangeGroupMinRevisionKey), "=", 0)) then = append(then, etcdclientv3.OpPut(etcdChangeGroupMinRevisionKey, "")) txn := w.e.Client().Txn(ctx).If(cmp...).Then(then...) if _, err := txn.Commit(); err != nil { return etcd.FromEtcdError(err) } _, err := w.e.Get(ctx, etcdWalsDataKey, 0) if err != nil && err != etcd.ErrKeyNotFound { return err } if err == nil { return nil } w.log.Infof("no data found in etcd, initializing") // walsdata not found in etcd // if there're some wals in the objectstorage this means etcd has been reset. // So take all the wals in committed or checkpointed state starting from the // first not checkpointed wal and put them in etcd lastCommittedStorageWalsRing := ring.New(100) lastCommittedStorageWalElem := lastCommittedStorageWalsRing lastCommittedStorageWalSequence := "" wroteWals := 0 for wal := range w.ListOSTWals("") { w.log.Infof("wal: %s", wal) if wal.Err != nil { return wal.Err } lastCommittedStorageWalElem.Value = wal lastCommittedStorageWalElem = lastCommittedStorageWalElem.Next() lastCommittedStorageWalSequence = wal.WalSequence if wal.Checkpointed { continue } if err := writeWal(wal); err != nil { return err } wroteWals++ } // if no wal has been written (because all are checkpointed), write at least // the ones in the ring if wroteWals == 0 { var err error lastCommittedStorageWalsRing.Do(func(e interface{}) { if e == nil { return } wal := e.(*WalFile) err = writeWal(wal) if err != nil { return } lastCommittedStorageWalSequence = wal.WalSequence }) if err != nil { return err } } walsData := &WalsData{ LastCommittedWalSequence: lastCommittedStorageWalSequence, } walsDataj, err := json.Marshal(walsData) if err != nil { return err } // save walsdata and lastcommittedstoragewalseq only after writing all the // wals in etcd // in this way if something fails while adding wals to etcd it'll be retried // since waldata doesn't exists cmp = []etcdclientv3.Cmp{} then = []etcdclientv3.Op{} cmp = append(cmp, etcdclientv3.Compare(etcdclientv3.CreateRevision(etcdWalsDataKey), "=", 0)) then = append(then, etcdclientv3.OpPut(etcdWalsDataKey, string(walsDataj))) then = append(then, etcdclientv3.OpPut(etcdLastCommittedStorageWalSeqKey, lastCommittedStorageWalSequence)) txn = w.e.Client().Txn(ctx).If(cmp...).Then(then...) tresp, err := txn.Commit() if err != nil { return etcd.FromEtcdError(err) } if !tresp.Succeeded { return errors.Errorf("failed to sync etcd: waldata already written") } return nil } type CheckpointFunc func(action *Action) error type DataToPathFunc func(dataType string, id string) string func NoOpDataToPath(dataType string, id string) string { return "" } type WalManagerConfig struct { BasePath string E *etcd.Store OST *objectstorage.ObjStorage EtcdWalsKeepNum int CheckpointFunc CheckpointFunc DataToPathFunc DataToPathFunc } type WalManager struct { basePath string log *zap.SugaredLogger e *etcd.Store ost *objectstorage.ObjStorage changes *WalChanges etcdWalsKeepNum int checkpointFunc CheckpointFunc dataToPathFunc DataToPathFunc } func NewWalManager(ctx context.Context, logger *zap.Logger, conf *WalManagerConfig) (*WalManager, error) { if conf.EtcdWalsKeepNum == 0 { conf.EtcdWalsKeepNum = DefaultEtcdWalsKeepNum } if conf.EtcdWalsKeepNum < 1 { return nil, errors.New("etcdWalsKeepNum must be greater than 0") } dataToPathFunc := conf.DataToPathFunc if dataToPathFunc == nil { dataToPathFunc = NoOpDataToPath } w := &WalManager{ basePath: conf.BasePath, log: logger.Sugar(), e: conf.E, ost: conf.OST, etcdWalsKeepNum: conf.EtcdWalsKeepNum, changes: NewWalChanges(), checkpointFunc: conf.CheckpointFunc, dataToPathFunc: dataToPathFunc, } // add trailing slash the basepath if w.basePath != "" && !strings.HasSuffix(w.basePath, "/") { w.basePath = w.basePath + "/" } return w, nil } func (w *WalManager) Run(ctx context.Context, readyCh chan struct{}) error { for { err := w.InitEtcd(ctx) if err == nil { break } w.log.Errorf("failed to initialize etcd: %+v", err) time.Sleep(1 * time.Second) } readyCh <- struct{}{} go w.watcherLoop(ctx) go w.syncLoop(ctx) go w.checkpointLoop(ctx) go w.walCleanerLoop(ctx) go w.compactChangeGroupsLoop(ctx) go w.etcdPingerLoop(ctx) select { case <-ctx.Done(): w.log.Infof("walmanager exiting") return nil } }