Commit e345f28

[filebeat][azure-blob-storage] - Simplified state checkpoint calculation (elastic#40936)
1 parent 4184106

2 files changed (+15, -34 lines)

CHANGELOG-developer.next.asciidoc (+1)

@@ -205,6 +205,7 @@ The list below covers the major changes between 7.0.0-rc2 and main only.
 - Add a configuration option for TCP/UDP network type. {issue}40407[40407] {pull}40623[40623]
 - Added debug logging to parquet reader in x-pack/libbeat/reader. {pull}40651[40651]
 - Added filebeat debug histograms for s3 object size and events per processed s3 object. {pull}40775[40775]
+- Simplified Azure Blob Storage input state checkpoint calculation logic. {issue}40674[40674] {pull}40936[40936]

 ==== Deprecated

x-pack/filebeat/input/azureblobstorage/scheduler.go (+14, -34)

@@ -7,6 +7,8 @@ package azureblobstorage
 import (
 	"context"
 	"fmt"
+	"slices"
+	"sort"
 	"sync"

 	azruntime "github.com/Azure/azure-sdk-for-go/sdk/azcore/runtime"
@@ -190,41 +192,19 @@ func (s *scheduler) fetchBlobPager(batchSize int32) *azruntime.Pager[azblob.ListBlobsFlatResponse]
 // moveToLastSeenJob, moves to the latest job position past the last seen job
 // Jobs are stored in lexicographical order always, hence the latest position can be found either on the basis of job name or timestamp
 func (s *scheduler) moveToLastSeenJob(jobs []*job) []*job {
-	var latestJobs []*job
-	jobsToReturn := make([]*job, 0)
-	counter := 0
-	flag := false
-	ignore := false
-
-	for _, job := range jobs {
-		switch {
-		case job.timestamp().After(s.state.checkpoint().LatestEntryTime):
-			latestJobs = append(latestJobs, job)
-		case job.name() == s.state.checkpoint().BlobName:
-			flag = true
-		case job.name() > s.state.checkpoint().BlobName:
-			flag = true
-			counter--
-		case job.name() <= s.state.checkpoint().BlobName && (!ignore):
-			ignore = true
-		}
-		counter++
-	}
-
-	if flag && (counter < len(jobs)-1) {
-		jobsToReturn = jobs[counter+1:]
-	} else if !flag && !ignore {
-		jobsToReturn = jobs
-	}
-
-	// in a senario where there are some jobs which have a greater timestamp
-	// but lesser alphanumeric order and some jobs have greater alphanumeric order
-	// than the current checkpoint blob name, then we append the latest jobs
-	if len(jobsToReturn) != len(jobs) && len(latestJobs) > 0 {
-		jobsToReturn = append(latestJobs, jobsToReturn...)
-	}
+	cp := s.state.checkpoint()
+	jobs = slices.DeleteFunc(jobs, func(j *job) bool {
+		return !(j.timestamp().After(cp.LatestEntryTime) || j.name() > cp.BlobName)
+	})

-	return jobsToReturn
+	// In a scenario where there are some jobs which have a greater timestamp
+	// but lesser lexicographic order and some jobs have greater lexicographic order
+	// than the current checkpoint blob name, we then sort around the pivot checkpoint
+	// timestamp.
+	sort.SliceStable(jobs, func(i, _ int) bool {
+		return jobs[i].timestamp().After(cp.LatestEntryTime)
+	})
+	return jobs
 }

 func (s *scheduler) isFileSelected(name string) bool {
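The refactor reduces the whole selection to a filter (slices.DeleteFunc, Go 1.21+) plus a stable partition (sort.SliceStable). To make that concrete, here is a minimal, runnable sketch of the same logic; the job and checkpoint types, field names, and blob names below are hypothetical stand-ins for the input's real types, which expose methods like j.timestamp() and j.name() rather than plain fields:

```go
package main

import (
	"fmt"
	"slices"
	"sort"
	"time"
)

// job and checkpoint are hypothetical stand-ins, reduced to the two
// properties the new logic consults.
type job struct {
	name      string
	timestamp time.Time
}

type checkpoint struct {
	BlobName        string
	LatestEntryTime time.Time
}

// moveToLastSeenJob mirrors the committed logic: drop every job that is
// neither newer than the checkpoint timestamp nor lexicographically past
// the checkpoint blob name, then move the timestamp-newer jobs to the front.
func moveToLastSeenJob(jobs []*job, cp checkpoint) []*job {
	// Keep a job only if it is past the checkpoint by time or by name.
	jobs = slices.DeleteFunc(jobs, func(j *job) bool {
		return !(j.timestamp.After(cp.LatestEntryTime) || j.name > cp.BlobName)
	})
	// Partition around the checkpoint timestamp: jobs with a newer
	// timestamp sort ahead of the lexicographically-newer remainder.
	sort.SliceStable(jobs, func(i, _ int) bool {
		return jobs[i].timestamp.After(cp.LatestEntryTime)
	})
	return jobs
}

func main() {
	base := time.Date(2024, 9, 1, 0, 0, 0, 0, time.UTC)
	cp := checkpoint{BlobName: "blob-0003", LatestEntryTime: base}

	jobs := []*job{
		{"blob-0001", base.Add(time.Hour)},  // older name, newer timestamp: kept, moved first
		{"blob-0002", base.Add(-time.Hour)}, // behind the checkpoint on both counts: dropped
		{"blob-0003", base.Add(-time.Hour)}, // the checkpoint blob itself: dropped
		{"blob-0004", base.Add(-time.Hour)}, // newer name, older timestamp: kept
	}

	for _, j := range moveToLastSeenJob(jobs, cp) {
		fmt.Println(j.name)
	}
	// Prints:
	// blob-0001
	// blob-0004
}
```

Note that the comparator consults only the left-hand element, so sort.SliceStable acts as a partition here: jobs newer than the checkpoint timestamp move ahead of those kept purely for their greater blob name, which is the "sort around the pivot" behavior the new comment describes.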
