diff --git a/docs/configuration.md b/docs/configuration.md index d2c9e05c..2dd8cdc1 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -44,11 +44,42 @@ The option `imageCopyPolicy` (default: `delayed`) defines the image copy strateg ## ImageCopyDeadline +## Cache Configuration + +When caching is enabled, k8s-image-swapper caches the existence of images to reduce strain on the target registry. +This means that if an image is deleted from the target registry, k8s-image-swapper will continue to think it exists until the cache expires. +There are two settings that control this behavior: + +### Cache TTL + +The option `cacheTtlMinutes` (default: `1440` - 24 hours) defines how long image existence information is cached. Set to `0` to disable caching entirely. + +### Cache Jitter + +The option `cacheJitterMaxMinutes` (default: `180` - 3 hours) defines the maximum random time added to the TTL to prevent a cache stampede. When many cache entries expire at the same time, it can cause a sudden spike in registry requests. Adding random jitter helps spread these requests out. + +!!! example + ```yaml + # Cache for 4 hours (240 minutes) with up to 30 minutes of random jitter + cacheTtlMinutes: 240 + cacheJitterMaxMinutes: 30 + + # Disable caching completely + cacheTtlMinutes: 0 + cacheJitterMaxMinutes: 0 + + # Default behavior if not specified: + # cacheTtlMinutes: 1440 # 24 hours + # cacheJitterMaxMinutes: 180 # 3 hours + ``` + +!!! note + The actual cache duration for each entry will be: `cacheTtlMinutes + random(0 to cacheJitterMaxMinutes)` minutes + The option `imageCopyDeadline` (default: `8s`) defines the duration after which the image copy if aborted. This option only applies for `immediate` and `force` image copy strategies. - ## Source This section configures details about the image source. diff --git a/pkg/config/config.go b/pkg/config/config.go index a4bb860d..9d9c1111 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -38,10 +38,12 @@ type Config struct { ListenAddress string - DryRun bool `yaml:"dryRun"` - ImageSwapPolicy string `yaml:"imageSwapPolicy" validate:"oneof=always exists"` - ImageCopyPolicy string `yaml:"imageCopyPolicy" validate:"oneof=delayed immediate force none"` - ImageCopyDeadline time.Duration `yaml:"imageCopyDeadline"` + DryRun bool `yaml:"dryRun"` + ImageSwapPolicy string `yaml:"imageSwapPolicy" validate:"oneof=always exists"` + ImageCopyPolicy string `yaml:"imageCopyPolicy" validate:"oneof=delayed immediate force none"` + ImageCopyDeadline time.Duration `yaml:"imageCopyDeadline"` + CacheTtlMinutes int `yaml:"cacheTtlMinutes"` + CacheJitterMaxMinutes int `yaml:"cacheJitterMaxMinutes"` Source Source `yaml:"source"` Target Registry `yaml:"target"` @@ -164,4 +166,6 @@ func SetViperDefaults(v *viper.Viper) { v.SetDefault("Target.AWS.ECROptions.ImageScanningConfiguration.ImageScanOnPush", true) v.SetDefault("Target.AWS.ECROptions.ImageTagMutability", "MUTABLE") v.SetDefault("Target.AWS.ECROptions.EncryptionConfiguration.EncryptionType", "AES256") + v.SetDefault("CacheTtlMinutes", 1440) // 24 hours + v.SetDefault("CacheJitterMaxMinutes", 180) // 3 hours } diff --git a/pkg/config/config_test.go b/pkg/config/config_test.go index ed10f129..aec25fd3 100644 --- a/pkg/config/config_test.go +++ b/pkg/config/config_test.go @@ -21,6 +21,8 @@ func TestConfigParses(t *testing.T) { name: "should render empty config with defaults", cfg: "", expCfg: Config{ + CacheTtlMinutes: 1440, + CacheJitterMaxMinutes: 180, Target: Registry{ Type: "aws", AWS: AWS{ @@ -46,6 +48,8 @@ source: - jmespath: "obj.metadata.namespace != 'playground'" `, expCfg: Config{ + CacheTtlMinutes: 1440, + CacheJitterMaxMinutes: 180, Target: Registry{ Type: "aws", AWS: AWS{ @@ -85,6 +89,8 @@ target: value: B `, expCfg: Config{ + CacheTtlMinutes: 1440, + CacheJitterMaxMinutes: 180, Target: Registry{ Type: "aws", AWS: AWS{ @@ -129,6 +135,8 @@ source: region: "us-east-1" `, expCfg: Config{ + CacheTtlMinutes: 1440, + CacheJitterMaxMinutes: 180, Target: Registry{ Type: "aws", AWS: AWS{ @@ -178,6 +186,8 @@ target: value: B `, expCfg: Config{ + CacheTtlMinutes: 1440, + CacheJitterMaxMinutes: 180, Target: Registry{ Type: "aws", AWS: AWS{ @@ -207,6 +217,56 @@ target: }, }, }, + { + name: "should render custom cache settings", + cfg: ` +cacheTtlMinutes: 60 +cacheJitterMaxMinutes: 20 +`, + expCfg: Config{ + CacheTtlMinutes: 60, + CacheJitterMaxMinutes: 20, + Target: Registry{ + Type: "aws", + AWS: AWS{ + ECROptions: ECROptions{ + ImageTagMutability: "MUTABLE", + ImageScanningConfiguration: ImageScanningConfiguration{ + ImageScanOnPush: true, + }, + EncryptionConfiguration: EncryptionConfiguration{ + EncryptionType: "AES256", + }, + }, + }, + }, + }, + }, + { + name: "should allow disabling cache", + cfg: ` +cacheTtlMinutes: 0 # Disable cache +cacheJitterMaxMinutes: 0 # No jitter needed when cache is disabled +`, + expCfg: Config{ + CacheTtlMinutes: 0, + CacheJitterMaxMinutes: 0, + Target: Registry{ + Type: "aws", + AWS: AWS{ + ECROptions: ECROptions{ + ImageTagMutability: "MUTABLE", + ImageScanningConfiguration: ImageScanningConfiguration{ + ImageScanOnPush: true, + }, + EncryptionConfiguration: EncryptionConfiguration{ + EncryptionType: "AES256", + }, + }, + }, + }, + }, + }, } for _, test := range tests { diff --git a/pkg/registry/ecr.go b/pkg/registry/ecr.go index c456b5ed..45c881a9 100644 --- a/pkg/registry/ecr.go +++ b/pkg/registry/ecr.go @@ -22,16 +22,19 @@ import ( "github.com/estahn/k8s-image-swapper/pkg/config" "github.com/go-co-op/gocron" "github.com/rs/zerolog/log" + "github.com/spf13/viper" ) type ECRClient struct { - client ecriface.ECRAPI - ecrDomain string - authToken []byte - cache *ristretto.Cache - scheduler *gocron.Scheduler - targetAccount string - options config.ECROptions + client ecriface.ECRAPI + ecrDomain string + authToken []byte + cache *ristretto.Cache + scheduler *gocron.Scheduler + targetAccount string + options config.ECROptions + cacheTtlMinutes int + cacheJitterMaxMinutes int } func NewECRClient(clientConfig config.AWS) (*ECRClient, error) { @@ -78,12 +81,14 @@ func NewECRClient(clientConfig config.AWS) (*ECRClient, error) { scheduler.StartAsync() client := &ECRClient{ - client: ecrClient, - ecrDomain: ecrDomain, - cache: cache, - scheduler: scheduler, - targetAccount: clientConfig.AccountID, - options: clientConfig.ECROptions, + client: ecrClient, + ecrDomain: ecrDomain, + cache: cache, + scheduler: scheduler, + targetAccount: clientConfig.AccountID, + options: clientConfig.ECROptions, + cacheTtlMinutes: viper.GetInt("CacheTtlMinutes"), + cacheJitterMaxMinutes: viper.GetInt("CacheJitterMaxMinutes"), } if err := client.scheduleTokenRenewal(); err != nil { @@ -242,9 +247,11 @@ func (e *ECRClient) PutImage() error { func (e *ECRClient) ImageExists(ctx context.Context, imageRef ctypes.ImageReference) bool { ref := imageRef.DockerReference().String() - if _, found := e.cache.Get(ref); found { - log.Ctx(ctx).Trace().Str("ref", ref).Msg("found in cache") - return true + if e.cacheTtlMinutes > 0 { + if _, found := e.cache.Get(ref); found { + log.Ctx(ctx).Trace().Str("ref", ref).Msg("found in cache") + return true + } } app := "skopeo" @@ -263,7 +270,12 @@ func (e *ECRClient) ImageExists(ctx context.Context, imageRef ctypes.ImageRefere log.Ctx(ctx).Trace().Str("ref", ref).Msg("found in target repository") - e.cache.SetWithTTL(ref, "", 1, 24*time.Hour+time.Duration(rand.Intn(180))*time.Minute) + if e.cacheTtlMinutes > 0 { + // Add random jitter to prevent cache stampede + jitter := time.Duration(rand.Intn(e.cacheJitterMaxMinutes)) * time.Minute + cacheTtl := time.Duration(e.cacheTtlMinutes) * time.Minute + e.cache.SetWithTTL(ref, "", 1, cacheTtl+jitter) + } return true } diff --git a/pkg/registry/ecr_test.go b/pkg/registry/ecr_test.go index 4660b96f..a6674d91 100644 --- a/pkg/registry/ecr_test.go +++ b/pkg/registry/ecr_test.go @@ -1,10 +1,13 @@ package registry import ( + "context" "encoding/base64" "testing" + "time" "github.com/containers/image/v5/transports/alltransports" + "github.com/dgraph-io/ristretto" "github.com/estahn/k8s-image-swapper/pkg/config" "github.com/stretchr/testify/assert" @@ -51,3 +54,57 @@ func TestECRIsOrigin(t *testing.T) { assert.Equal(t, testcase.expected, result) } } + +func TestEcrImageExistsCaching(t *testing.T) { + // Setup a test cache + cache, err := ristretto.NewCache(&ristretto.Config{ + NumCounters: 1e7, // number of keys to track frequency of (10M). + MaxCost: 1 << 30, // maximum cost of cache (1GB). + BufferItems: 64, // number of keys per Get buffer. + }) + assert.NoError(t, err) + + tests := []struct { + name string + cacheTtlMinutes int + expectCached bool + }{ + { + name: "cache disabled when TTL is 0", + cacheTtlMinutes: 0, + expectCached: false, + }, + { + name: "cache enabled with TTL and jitter", + cacheTtlMinutes: 60, + expectCached: true, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + ctx := context.Background() + client := NewDummyECRClient("us-east-1", "12345678912", "", config.ECROptions{}, []byte("")) + + // Setup cache + client.cache = cache + client.cacheTtlMinutes = tc.cacheTtlMinutes + + // Create a test image reference and add to cache. Use 100ms as TTL + imageRef, err := alltransports.ParseImageName("docker://12345678912.dkr.ecr.us-east-1.amazonaws.com/test-project/repo/test-image:latest") + cache.SetWithTTL(imageRef.DockerReference().String(), true, 1, 100*time.Millisecond) + assert.NoError(t, err) + + // Cache should be a hit + exists := client.ImageExists(ctx, imageRef) + assert.Equal(t, tc.expectCached, exists) + + if tc.expectCached { + // Verify cache expiry + time.Sleep(time.Duration(150 * time.Millisecond)) // Use milliseconds for testing + _, found := client.cache.Get(imageRef.DockerReference().String()) + assert.False(t, found, "cache entry should have expired") + } + }) + } +} diff --git a/pkg/registry/gar.go b/pkg/registry/gar.go index c129246b..2d4644e9 100644 --- a/pkg/registry/gar.go +++ b/pkg/registry/gar.go @@ -20,16 +20,19 @@ import ( "google.golang.org/api/transport" "github.com/rs/zerolog/log" + "github.com/spf13/viper" ) type GARAPI interface{} type GARClient struct { - client GARAPI - garDomain string - cache *ristretto.Cache - scheduler *gocron.Scheduler - authToken []byte + client GARAPI + garDomain string + cache *ristretto.Cache + scheduler *gocron.Scheduler + authToken []byte + cacheTtlMinutes int + cacheJitterMaxMinutes int } func NewGARClient(clientConfig config.GCP) (*GARClient, error) { @@ -46,10 +49,12 @@ func NewGARClient(clientConfig config.GCP) (*GARClient, error) { scheduler.StartAsync() client := &GARClient{ - client: nil, - garDomain: clientConfig.GarDomain(), - cache: cache, - scheduler: scheduler, + client: nil, + garDomain: clientConfig.GarDomain(), + cache: cache, + scheduler: scheduler, + cacheTtlMinutes: viper.GetInt("CacheTtlMinutes"), + cacheJitterMaxMinutes: viper.GetInt("CacheJitterMaxMinutes"), } if err := client.scheduleTokenRenewal(); err != nil { @@ -132,9 +137,11 @@ func (e *GARClient) PutImage() error { func (e *GARClient) ImageExists(ctx context.Context, imageRef ctypes.ImageReference) bool { ref := imageRef.DockerReference().String() - if _, found := e.cache.Get(ref); found { - log.Ctx(ctx).Trace().Str("ref", ref).Msg("found in cache") - return true + if e.cacheTtlMinutes > 0 { + if _, found := e.cache.Get(ref); found { + log.Ctx(ctx).Trace().Str("ref", ref).Msg("found in cache") + return true + } } app := "skopeo" @@ -153,7 +160,12 @@ func (e *GARClient) ImageExists(ctx context.Context, imageRef ctypes.ImageRefere log.Ctx(ctx).Trace().Str("ref", ref).Msg("found in target repository") - e.cache.SetWithTTL(ref, "", 1, 24*time.Hour+time.Duration(rand.Intn(180))*time.Minute) + if e.cacheTtlMinutes > 0 { + // Add random jitter to prevent cache stampede + jitter := time.Duration(rand.Intn(e.cacheJitterMaxMinutes)) * time.Minute + cacheTtl := time.Duration(e.cacheTtlMinutes) * time.Minute + e.cache.SetWithTTL(ref, "", 1, cacheTtl+jitter) + } return true } diff --git a/pkg/registry/gar_test.go b/pkg/registry/gar_test.go index 17ddde7e..f6ebd55a 100644 --- a/pkg/registry/gar_test.go +++ b/pkg/registry/gar_test.go @@ -1,10 +1,12 @@ package registry import ( + "context" "testing" + "time" "github.com/containers/image/v5/transports/alltransports" - + "github.com/dgraph-io/ristretto" "github.com/stretchr/testify/assert" ) @@ -36,3 +38,58 @@ func TestGARIsOrigin(t *testing.T) { assert.Equal(t, testcase.expected, result) } } + +func TestGarImageExistsCaching(t *testing.T) { + // Setup a test cache + cache, err := ristretto.NewCache(&ristretto.Config{ + NumCounters: 1e7, // number of keys to track frequency of (10M). + MaxCost: 1 << 30, // maximum cost of cache (1GB). + BufferItems: 64, // number of keys per Get buffer. + }) + assert.NoError(t, err) + + tests := []struct { + name string + cacheTtlMinutes int + expectCached bool + }{ + { + name: "cache disabled when TTL is 0", + cacheTtlMinutes: 0, + expectCached: false, + }, + { + name: "cache enabled with TTL and jitter", + cacheTtlMinutes: 60, + expectCached: true, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + ctx := context.Background() + client, err := NewMockGARClient(nil, "us-central1-docker.pkg.dev/test-project/repo") + assert.NoError(t, err) + + // Setup cache + client.cache = cache + client.cacheTtlMinutes = tc.cacheTtlMinutes + + // Create a test image reference and add to cache. Use 100ms as TTL + imageRef, err := alltransports.ParseImageName("docker://us-central1-docker.pkg.dev/test-project/repo/test-image:latest") + cache.SetWithTTL(imageRef.DockerReference().String(), true, 1, 100*time.Millisecond) + assert.NoError(t, err) + + // Cache should be a hit + exists := client.ImageExists(ctx, imageRef) + assert.Equal(t, tc.expectCached, exists) + + if tc.expectCached { + // Verify cache expiry + time.Sleep(time.Duration(150 * time.Millisecond)) // Use milliseconds for testing + _, found := client.cache.Get(imageRef.DockerReference().String()) + assert.False(t, found, "cache entry should have expired") + } + }) + } +}