// Copyright (C) 2025 wangyusong // // This program is free software: you can redistribute it and/or modify // it under the terms of the GNU Affero General Public License as published by // the Free Software Foundation, either version 3 of the License, or // (at your option) any later version. // // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Affero General Public License for more details. // // You should have received a copy of the GNU Affero General Public License // along with this program. If not, see . package block import ( "bufio" "context" "encoding/json" "fmt" "math" "os" "path/filepath" "runtime" "slices" "strconv" "strings" "sync" "sync/atomic" "time" "github.com/benbjohnson/clock" "github.com/pkg/errors" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promauto" "k8s.io/utils/ptr" "github.com/glidea/zenfeed/pkg/component" "github.com/glidea/zenfeed/pkg/llm" "github.com/glidea/zenfeed/pkg/model" "github.com/glidea/zenfeed/pkg/storage/feed/block/chunk" "github.com/glidea/zenfeed/pkg/storage/feed/block/index" "github.com/glidea/zenfeed/pkg/storage/feed/block/index/inverted" "github.com/glidea/zenfeed/pkg/storage/feed/block/index/primary" "github.com/glidea/zenfeed/pkg/storage/feed/block/index/vector" "github.com/glidea/zenfeed/pkg/telemetry" "github.com/glidea/zenfeed/pkg/telemetry/log" telemetrymodel "github.com/glidea/zenfeed/pkg/telemetry/model" "github.com/glidea/zenfeed/pkg/util/heap" "github.com/glidea/zenfeed/pkg/util/retry" runtimeutil "github.com/glidea/zenfeed/pkg/util/runtime" timeutil "github.com/glidea/zenfeed/pkg/util/time" ) var clk = clock.New() // --- Interface code block --- type Block interface { component.Component Reload(c *Config) error Start() time.Time End() time.Time State() State TransformToCold() error ClearOnDisk() error Append(ctx context.Context, feeds ...*model.Feed) error Query(ctx context.Context, query QueryOptions) ([]*FeedVO, error) Exists(ctx context.Context, id uint64) (bool, error) } type Config struct { Dir string FlushInterval time.Duration ForCreate *ForCreateConfig // Copy from ForCreateConfig or load from disk. start time.Time duration time.Duration embeddingLLM string } type ForCreateConfig struct { Start time.Time Duration time.Duration EmbeddingLLM string } func (c *Config) Validate() error { if c.Dir == "" { return errors.New("dir is required") } if c.FlushInterval == 0 { c.FlushInterval = 200 * time.Millisecond } if c.ForCreate != nil { if err := c.validateForCreate(); err != nil { return errors.Wrap(err, "validate for create") } } else { // Load from disk. if err := c.validateForLoad(); err != nil { return errors.Wrap(err, "validate for load") } } return nil } func (c *Config) validateForCreate() error { cfc := c.ForCreate if cfc.Start.IsZero() { return errors.New("start is required") } if cfc.Duration == 0 { cfc.Duration = 25 * time.Hour } if cfc.Duration < timeutil.Day || cfc.Duration > 15*timeutil.Day { return errors.Errorf("duration must be between %s and %s", timeutil.Day, 15*timeutil.Day) } if cfc.EmbeddingLLM == "" { return errors.New("embedding LLM is required") } c.start = cfc.Start c.duration = cfc.Duration c.embeddingLLM = cfc.EmbeddingLLM return nil } func (c *Config) validateForLoad() error { b, err := os.ReadFile(filepath.Join(c.Dir, metadataFilename)) switch { case err == nil: case os.IsNotExist(err): return errors.New("metadata file not found") default: return errors.Wrap(err, "reading metadata file") } var m metadata if err := json.Unmarshal(b, &m); err != nil { return errors.Wrap(err, "unmarshalling metadata") } c.start = m.Start c.duration = m.Duration c.embeddingLLM = m.EmbeddingLLM return nil } func (c *Config) end() time.Time { return c.start.Add(c.duration) } type Dependencies struct { ChunkFactory chunk.Factory PrimaryFactory primary.Factory InvertedFactory inverted.Factory VectorFactory vector.Factory LLMFactory llm.Factory } var ( states = []string{string(StateHot), string(StateCold)} blockInfo = promauto.NewGaugeVec( prometheus.GaugeOpts{ Namespace: model.AppName, Subsystem: "block", Name: "info", Help: "Block info.", }, []string{telemetrymodel.KeyComponent, telemetrymodel.KeyComponentInstance, "state", "dir"}, ) ) // Directory structure (Interface on disk) // // .data // * cold-block-name-1 // *** archive.json // archive metadata, also indicates that this is a cold block // *** metadata.json // metadata // *** index // ***** primary // ***** inverted // ***** vector // *** chunk // ***** 1 // ***** 2 // ***** ... // // * hot-block-name-1 // *** metadata.json // metadata // *** chunk // If there is only a chunk directory, it means it is a hot block // ***** 1 // ***** 2 // ***** ... const ( archiveMetaFilename = "archive.json" metadataFilename = "metadata.json" chunkDirname = "chunk" indexDirname = "index" indexPrimaryFilename = "primary" indexInvertedFilename = "inverted" indexVectorFilename = "vector" // estimatedChunkFeedsLimit is the estimated maximum number of feeds in a chunk. // Due to lock-free concurrency, the actual value may be larger. estimatedChunkFeedsLimit = 5000 ) // archiveMetadata is the metadata of the cold block. type archiveMetadata struct { FeedCount uint32 `json:"feed_count"` } type metadata struct { Start time.Time `json:"start"` Duration time.Duration `json:"duration"` EmbeddingLLM string `json:"embedding_llm"` } var ( chunkFilename = func(chunk uint32) string { return strconv.FormatUint(uint64(chunk), 10) } parseChunkFilename = func(name string) (chunk uint32, err error) { chunk64, err := strconv.ParseUint(name, 10, 32) if err != nil { return 0, errors.Wrap(err, "invalid chunk filename format") } return uint32(chunk64), nil } ) // State is the state of the block. type State string const ( // StateHot means the block is writable. // ALL data is in memory. // It is the head of the block chain. StateHot State = "hot" // StateCold for read only. // It has indexs on disk. StateCold State = "cold" ) // FeedVO is the feed view for query result. type FeedVO struct { *model.Feed `json:",inline"` Vectors [][]float32 `json:"-"` Score float32 `json:"score,omitempty"` // Only exists when SemanticFilter is set. } type FeedVOs []*FeedVO func NewFeedVOHeap(feeds []*FeedVO) *heap.Heap[*FeedVO] { return heap.New(feeds, func(a, b *FeedVO) bool { if a.Score == b.Score { return a.Time.Before(b.Time) } return a.Score < b.Score }) } type QueryOptions struct { Query string Threshold float32 LabelFilters []string labelFilters []LabelFilter Limit int Start, End time.Time } var ( LabelFilterEqual = "=" LabelFilterNotEqual = "!=" NewLabelFilter = func(key, value string, eq bool) string { if eq { return fmt.Sprintf("%s%s%s", key, LabelFilterEqual, value) } return fmt.Sprintf("%s%s%s", key, LabelFilterNotEqual, value) } ParseLabelFilter = func(filter string) (LabelFilter, error) { eq := false parts := strings.Split(filter, LabelFilterNotEqual) if len(parts) != 2 { parts = strings.Split(filter, LabelFilterEqual) eq = true } if len(parts) != 2 { return LabelFilter{}, errors.New("invalid label filter") } return LabelFilter{Label: parts[0], Value: parts[1], Equal: eq}, nil } ) func (q *QueryOptions) Validate() error { //nolint:cyclop if q.Threshold < 0 || q.Threshold > 1 { return errors.New("threshold must be between 0 and 1") } for _, labelFilter := range q.LabelFilters { if labelFilter == "" { return errors.New("label filter is required") } filter, err := ParseLabelFilter(labelFilter) if err != nil { return errors.Wrap(err, "parse label filter") } q.labelFilters = append(q.labelFilters, filter) } if q.Threshold == 0 { q.Threshold = 0.55 } if q.Limit <= 0 { q.Limit = 10 } if q.Limit > 500 { return errors.New("limit must be less than or equal to 500") } if q.Start.IsZero() { q.Start = time.Now().Add(-time.Hour * 24) } if q.End.IsZero() { q.End = time.Now() } if q.End.Before(q.Start) { return errors.New("end time must be after start time") } return nil } func (q *QueryOptions) HitTimeRangeCondition(b Block) bool { bStart, bEnd := b.Start(), b.End() qStart, qEnd := q.Start, q.End if qStart.IsZero() && qEnd.IsZero() { return true } if qStart.IsZero() { return bStart.Before(qEnd) } if qEnd.IsZero() { return !bEnd.Before(qStart) } in := func(t time.Time, s, e time.Time) bool { // [start, end) return !t.Before(s) && t.Before(e) } queryAsBase := in(bStart, qStart, qEnd) || in(bEnd, qStart, qEnd) blockAsBase := in(qStart, bStart, bEnd) || in(qEnd, bStart, bEnd) return queryAsBase || blockAsBase } // LabelFilter defines the matcher for an item. type LabelFilter struct { Label string Equal bool Value string } // --- Factory code block --- type Factory component.Factory[Block, Config, Dependencies] func NewFactory(mockOn ...component.MockOption) Factory { if len(mockOn) > 0 { return component.FactoryFunc[Block, Config, Dependencies]( func(instance string, config *Config, dependencies Dependencies) (Block, error) { m := &mockBlock{} component.MockOptions(mockOn).Apply(&m.Mock) return m, nil }, ) } return component.FactoryFunc[Block, Config, Dependencies](new) } func new(instance string, config *Config, dependencies Dependencies) (Block, error) { if err := config.Validate(); err != nil { return nil, errors.Wrap(err, "validate config") } // New block. block, err := newBlock(instance, config, dependencies) if err != nil { return nil, errors.Wrap(err, "creating block") } // Init block. if createMode := config.ForCreate != nil; createMode { return initBlock(block) } // Load mode. state, err := block.loadState() if err != nil { return nil, errors.Wrap(err, "checking hot on disk") } block.state.Store(state) // Load hot block data from disk. // For cold block, it will be loaded when the query is called. if state == StateHot { if err := block.load(context.Background(), nil); err != nil { return nil, errors.Wrap(err, "decoding from disk") } } return block, nil } func newBlock(instance string, config *Config, dependencies Dependencies) (*block, error) { primaryIndex, vectorIndex, invertedIndex, err := newIndexs(instance, dependencies) if err != nil { return nil, errors.Wrap(err, "creating indexs") } block := &block{ Base: component.New(&component.BaseConfig[Config, Dependencies]{ Name: "FeedBlock", Instance: instance, Config: config, Dependencies: dependencies, }), primaryIndex: primaryIndex, vectorIndex: vectorIndex, invertedIndex: invertedIndex, toWrite: make(chan []*chunk.Feed, 1024), chunks: make(chunkChain, 0, 2), } block.lastDataAccess.Store(clk.Now()) return block, nil } func newIndexs(instance string, dependencies Dependencies) (primary.Index, vector.Index, inverted.Index, error) { primaryIndex, err := dependencies.PrimaryFactory.New(instance, &primary.Config{}, primary.Dependencies{}) if err != nil { return nil, nil, nil, errors.Wrap(err, "creating primary index") } vectorIndex, err := dependencies.VectorFactory.New(instance, &vector.Config{}, vector.Dependencies{}) if err != nil { return nil, nil, nil, errors.Wrap(err, "creating vector index") } invertedIndex, err := dependencies.InvertedFactory.New(instance, &inverted.Config{}, inverted.Dependencies{}) if err != nil { return nil, nil, nil, errors.Wrap(err, "creating inverted index") } return primaryIndex, vectorIndex, invertedIndex, nil } func initBlock(block *block) (Block, error) { // Create metadata file. if err := os.MkdirAll(block.Config().Dir, 0700); err != nil { return nil, errors.Wrap(err, "creating block directory") } metadata := metadata{ Start: block.Config().start, Duration: block.Config().duration, EmbeddingLLM: block.Config().embeddingLLM, } b := runtimeutil.Must1(json.Marshal(metadata)) p := filepath.Join(block.Config().Dir, metadataFilename) if err := os.WriteFile(p, b, 0600); err != nil { return nil, errors.Wrap(err, "creating metadata file") } // Create head chunk. if err := os.MkdirAll(filepath.Join(block.Config().Dir, chunkDirname), 0700); err != nil { return nil, errors.Wrap(err, "creating chunk directory") } id := uint32(0) chunk, err := block.Dependencies().ChunkFactory.New( fmt.Sprintf("%s-%d", block.Instance(), id), &chunk.Config{Path: filepath.Join(block.Config().Dir, chunkDirname, chunkFilename(id))}, chunk.Dependencies{}, ) if err != nil { return nil, errors.Wrap(err, "creating head chunk file") } block.chunks = append(block.chunks, chunk) // Set state to hot. block.state.Store(StateHot) return block, nil } // --- Implementation code block --- type block struct { *component.Base[Config, Dependencies] primaryIndex primary.Index vectorIndex vector.Index invertedIndex inverted.Index toWrite chan []*chunk.Feed chunks chunkChain mu sync.RWMutex lastDataAccess atomic.Value state atomic.Value coldLoaded bool } func (b *block) Run() error { ctx := telemetry.StartWith(b.Context(), append(b.TelemetryLabels(), telemetrymodel.KeyOperation, "Run")...) defer func() { telemetry.End(ctx, nil) }() // Maintain metrics. go b.maintainMetrics(ctx) // Run writing worker. go b.runWritingWorker(ctx) // Run indexs. if err := component.RunUntilReady(ctx, b.primaryIndex, 10*time.Second); err != nil { return errors.Wrap(err, "running primary index") } if err := component.RunUntilReady(ctx, b.vectorIndex, 10*time.Second); err != nil { return errors.Wrap(err, "running vector index") } if err := component.RunUntilReady(ctx, b.invertedIndex, 10*time.Second); err != nil { return errors.Wrap(err, "running inverted index") } // Run chunks. for _, c := range b.chunks { if err := component.RunUntilReady(ctx, c, 10*time.Second); err != nil { return errors.Wrap(err, "running chunk") } } b.MarkReady() tick := clk.Ticker(30 * time.Second) defer tick.Stop() for { NEXT_SELECT: select { case now := <-tick.C: if err := b.reconcileState(ctx, now); err != nil { log.Error(b.Context(), errors.Wrap(err, "reconciling state")) break NEXT_SELECT } case <-b.Context().Done(): return nil } } } func (b *block) Close() error { if err := b.Base.Close(); err != nil { return errors.Wrap(err, "closing base") } // Remove Metrics. blockInfo.DeletePartialMatch(b.TelemetryLabelsID()) // Close indexs. if err := b.primaryIndex.Close(); err != nil { return errors.Wrap(err, "closing primary index") } if err := b.vectorIndex.Close(); err != nil { return errors.Wrap(err, "closing vector index") } if err := b.invertedIndex.Close(); err != nil { return errors.Wrap(err, "closing inverted index") } // Close chunks. for _, c := range b.chunks { if err := c.Close(); err != nil { return errors.Wrap(err, "closing chunk") } } // Reset. if err := b.resetMem(); err != nil { return errors.Wrap(err, "resetting memory") } return nil } func (b *block) Reload(c *Config) error { currentConfig := b.Config() if c.ForCreate != nil { return errors.New("cannot reload the for create config") } if c.Dir != "" && c.Dir != currentConfig.Dir { return errors.New("cannot reload the dir, MUST pass the same dir, or set it to empty for unchange") } if c.Dir == "" { c.Dir = currentConfig.Dir } if c.FlushInterval == 0 { c.FlushInterval = currentConfig.FlushInterval } if c.ForCreate == nil { c.ForCreate = currentConfig.ForCreate } // Validate the config. if err := c.Validate(); err != nil { return errors.Wrap(err, "validate config") } // Set new config. b.SetConfig(c) return nil } func (b *block) Append(ctx context.Context, feeds ...*model.Feed) (err error) { ctx = telemetry.StartWith(ctx, append(b.TelemetryLabels(), telemetrymodel.KeyOperation, "Append")...) defer func() { telemetry.End(ctx, err) }() if b.State() != StateHot { return errors.New("block is not writable") } b.lastDataAccess.Store(clk.Now()) embedded, err := b.fillEmbedding(ctx, feeds) if err != nil { return errors.Wrap(err, "fill embedding") } b.toWrite <- embedded return nil } func (b *block) Query(ctx context.Context, query QueryOptions) (feeds []*FeedVO, err error) { ctx = telemetry.StartWith(ctx, append(b.TelemetryLabels(), telemetrymodel.KeyOperation, "Query")...) defer func() { telemetry.End(ctx, err) }() if err := (&query).Validate(); err != nil { return nil, errors.Wrap(err, "validate query") } b.lastDataAccess.Store(clk.Now()) // Ensure the block is loaded. if err := b.ensureLoaded(ctx); err != nil { return nil, errors.Wrap(err, "ensuring block loaded") } // Apply filters. filterResult, err := b.applyFilters(ctx, &query) if err != nil { return nil, errors.Wrap(err, "applying filters") } if isMatchedNothingFilterResult(filterResult) { return []*FeedVO{}, nil } // Read feeds. b.mu.RLock() chunks := b.chunks b.mu.RUnlock() result := NewFeedVOHeap(make(FeedVOs, 0, query.Limit)) if err := filterResult.forEach(ctx, b.primaryIndex, func(ref primary.FeedRef, score float32) error { if ref.Time.Before(query.Start) || !ref.Time.Before(query.End) { return nil } ck := chunks[ref.Chunk] if ck == nil { log.Error(ctx, errors.Errorf("chunk file not found, data may be corrupted: %d", ref.Chunk)) return nil } feed, err := ck.Read(ctx, ref.Offset) if err != nil { log.Error(ctx, errors.Wrapf(err, "reading chunk file: %d", ref.Chunk)) return nil } result.TryEvictPush(&FeedVO{ Feed: feed.Feed, Vectors: feed.Vectors, Score: score, }) return nil }); err != nil { return nil, errors.Wrap(err, "iterating filter result") } return result.Slice(), nil } func (b *block) Exists(ctx context.Context, id uint64) (bool, error) { ctx = telemetry.StartWith(ctx, append(b.TelemetryLabels(), telemetrymodel.KeyOperation, "Exists")...) defer func() { telemetry.End(ctx, nil) }() // Ensure the block is loaded. if err := b.ensureLoaded(ctx); err != nil { return false, errors.Wrap(err, "ensuring block loaded") } // Search the primary index. _, ok := b.primaryIndex.Search(ctx, id) return ok, nil } func (b *block) TransformToCold() (err error) { ctx := telemetry.StartWith(b.Context(), append(b.TelemetryLabels(), telemetrymodel.KeyOperation, "TransformToCold")...) defer func() { telemetry.End(ctx, err) }() // Dump meta and indexes to disk. config := b.Config() indexDirpath := filepath.Join(config.Dir, indexDirname) if err := b.writeIndexFile(ctx, filepath.Join(indexDirpath, indexPrimaryFilename), b.primaryIndex); err != nil { return errors.Wrap(err, "writing primary index") } if err := b.writeIndexFile(ctx, filepath.Join(indexDirpath, indexInvertedFilename), b.invertedIndex); err != nil { return errors.Wrap(err, "writing inverted index") } if err := b.writeIndexFile(ctx, filepath.Join(indexDirpath, indexVectorFilename), b.vectorIndex); err != nil { return errors.Wrap(err, "writing vector index") } bs := runtimeutil.Must1(json.Marshal(archiveMetadata{ FeedCount: b.primaryIndex.Count(ctx), })) if err := os.WriteFile(filepath.Join(config.Dir, archiveMetaFilename), bs, 0600); err != nil { return errors.Wrap(err, "writing archive metadata") } // Reset memory. b.mu.Lock() defer b.mu.Unlock() if err := b.resetMem(); err != nil { return errors.Wrap(err, "resetting memory") } b.state.Store(StateCold) return nil } func (b *block) ClearOnDisk() error { return os.RemoveAll(b.Config().Dir) } func (b *block) Start() time.Time { return b.Config().start } func (b *block) End() time.Time { return b.Config().end() } func (b *block) State() State { return b.state.Load().(State) } func (b *block) ensureLoaded(ctx context.Context) error { if b.State() != StateCold { return nil } b.mu.RLock() loaded := b.coldLoaded b.mu.RUnlock() if loaded { return nil } b.mu.Lock() defer b.mu.Unlock() if b.coldLoaded { return nil } if err := b.load(ctx, func(ck chunk.File) error { return component.RunUntilReady(ctx, ck, 10*time.Second) }); err != nil { return errors.Wrap(err, "decoding from disk") } b.coldLoaded = true log.Info(ctx, "cold block loaded") return nil } func (b *block) maintainMetrics(ctx context.Context) { _ = timeutil.Tick(ctx, 30*time.Second, func() error { var ( dir = b.Config().Dir state = string(b.State()) ) blockInfo.WithLabelValues(append(b.TelemetryLabelsIDFields(), state, dir)...) for _, s := range states { if s == state { continue } blockInfo.DeleteLabelValues(append(b.TelemetryLabelsIDFields(), s, dir)...) } return nil }) } func (b *block) runWritingWorker(ctx context.Context) { var concurrency = runtime.NumCPU() * 4 // I/O bound. var wg sync.WaitGroup wg.Add(concurrency) for i := range concurrency { go func(i int) { defer wg.Done() workerCtx := telemetry.StartWith(ctx, append(b.TelemetryLabels(), "worker", i, telemetrymodel.KeyOperation, "Run")..., ) defer func() { telemetry.End(workerCtx, nil) }() flushInterval := b.Config().FlushInterval tick := time.NewTimer(flushInterval) defer tick.Stop() buffer := make([]*chunk.Feed, 0) for { select { case <-workerCtx.Done(): gracefulCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second) defer cancel() eof := b.flush(gracefulCtx, buffer) for !eof { eof = b.flush(gracefulCtx, buffer) } return case <-tick.C: _ = b.flush(workerCtx, buffer) flushInterval = b.Config().FlushInterval tick.Reset(flushInterval) } } }(i) } wg.Wait() } func (b *block) flush(ctx context.Context, buffer []*chunk.Feed) (eof bool) { const maxBatch = 1000 buffer = buffer[:0] OUTER: for { select { case feeds := <-b.toWrite: buffer = append(buffer, feeds...) if len(buffer) >= maxBatch { eof = false break OUTER } default: eof = true break OUTER } } if len(buffer) == 0 { return eof } // Append feeds. if err := retry.Backoff(ctx, func() error { return b.append(ctx, buffer) }, &retry.Options{ MinInterval: 100 * time.Millisecond, MaxAttempts: ptr.To(3), }); err != nil { log.Error(ctx, errors.Wrap(err, "append feeds")) } return eof } func (b *block) append(ctx context.Context, feeds []*chunk.Feed) (err error) { ctx = telemetry.StartWith(ctx, append(b.TelemetryLabels(), telemetrymodel.KeyOperation, "append")...) defer func() { telemetry.End(ctx, err) }() // Ensure the head chunk's space is enough. b.mu.RLock() headChunkID, headChunk := b.chunks.head() needGrow := headChunk.Count(ctx)+uint32(len(feeds)) > estimatedChunkFeedsLimit b.mu.RUnlock() if needGrow { b.mu.Lock() headChunkID, headChunk = b.chunks.head() needGrow = headChunk.Count(ctx)+uint32(len(feeds)) > estimatedChunkFeedsLimit if needGrow { if err := b.nextChunk(); err != nil { b.mu.Unlock() return errors.Wrap(err, "creating new chunk") } headChunkID, headChunk = b.chunks.head() } b.mu.Unlock() } // Write to head chunk. if err := headChunk.Append(ctx, feeds, func(feed *chunk.Feed, offset uint64) error { b.primaryIndex.Add(ctx, feed.ID, primary.FeedRef{ // Write primary first, for query. Chunk: headChunkID, Offset: offset, Time: feed.Time, }) b.invertedIndex.Add(ctx, feed.ID, feed.Labels) if len(feed.Vectors) > 0 { if err := b.vectorIndex.Add(ctx, feed.ID, feed.Vectors); err != nil { return errors.Wrap(err, "adding to vector index") } } return nil }); err != nil { return errors.Wrap(err, "writing to head chunk") } return nil } const coldingWindow = 30 * time.Minute func (b *block) reconcileState(ctx context.Context, now time.Time) error { b.mu.Lock() defer b.mu.Unlock() if b.coldLoaded && now.Add(-coldingWindow).After(b.lastDataAccess.Load().(time.Time)) { if err := b.resetMem(); err != nil { return errors.Wrap(err, "resetting memory") } b.coldLoaded = false log.Info(ctx, "block is archived") } return nil } func (b *block) nextChunk() (err error) { ctx := telemetry.StartWith(b.Context(), append(b.TelemetryLabels(), telemetrymodel.KeyOperation, "nextChunk")...) defer func() { telemetry.End(ctx, err) }() oldHeadID, oldHead := b.chunks.head() id := oldHeadID + 1 chunk, err := b.Dependencies().ChunkFactory.New( fmt.Sprintf("%s-%d", b.Instance(), id), &chunk.Config{ Path: filepath.Join(b.Config().Dir, chunkDirname, chunkFilename(id)), }, chunk.Dependencies{}, ) if err != nil { return errors.Wrap(err, "creating new chunk") } if err := component.RunUntilReady(ctx, chunk, 10*time.Second); err != nil { return errors.Wrap(err, "running chunk") } b.chunks = append(b.chunks, chunk) if err := oldHead.EnsureReadonly(ctx); err != nil { return errors.Wrap(err, "ensuring old chunk readonly") } log.Info(ctx, "new chunk created", "chunk", id) return nil } func (b *block) loadState() (State, error) { _, err := os.Stat(filepath.Join(b.Config().Dir, archiveMetaFilename)) switch { case err == nil: return StateCold, nil case os.IsNotExist(err): return StateHot, nil default: return StateCold, errors.Wrap(err, "checking meta file") } } func (b *block) load(ctx context.Context, callback func(chunk chunk.File) error) (err error) { ctx = telemetry.StartWith(ctx, append(b.TelemetryLabels(), telemetrymodel.KeyOperation, "load", "state", b.State())...) defer func() { telemetry.End(ctx, err) }() cold := b.State() == StateCold // List all chunks. if err := b.loadChunks(cold, callback); err != nil { return errors.Wrap(err, "loading chunks") } // Load indexs. b.primaryIndex, b.vectorIndex, b.invertedIndex, err = newIndexs(b.Instance(), b.Dependencies()) if err != nil { return errors.Wrap(err, "creating indexs") } if cold { if err := b.loadIndexs(ctx); err != nil { return errors.Wrap(err, "loading index files") } } else { for id, ck := range b.chunks { if err := b.replayIndex(ctx, uint32(id), ck); err != nil { return errors.Wrapf(err, "replaying index for chunk %d", id) } } } return nil } func (b *block) loadChunks(cold bool, callback func(chunk chunk.File) error) error { // List all chunk ids. ids, err := b.loadChunkIDs() if err != nil { return errors.Wrap(err, "loading chunk IDs") } // Decode each chunk file. for _, id := range ids { p := filepath.Join(b.Config().Dir, chunkDirname, chunkFilename(id)) ck, err := b.Dependencies().ChunkFactory.New( fmt.Sprintf("%s-%d", b.Instance(), id), &chunk.Config{ Path: p, ReadonlyAtFirst: cold, }, chunk.Dependencies{}, ) if err != nil { return errors.Wrapf(err, "creating chunk file %s", p) } if callback != nil { if err := callback(ck); err != nil { return errors.Wrapf(err, "running callback for chunk %d", id) } } b.chunks = append(b.chunks, ck) } return nil } func (b *block) loadChunkIDs() ([]uint32, error) { chunkInfos, err := os.ReadDir(filepath.Join(b.Config().Dir, chunkDirname)) if err != nil { return nil, errors.Wrap(err, "reading chunk directory") } if len(chunkInfos) == 0 { return nil, nil } chunkIDs := make([]uint32, 0, len(chunkInfos)) for _, info := range chunkInfos { if info.IsDir() { continue } chunkID, err := parseChunkFilename(info.Name()) if err != nil { return nil, errors.Wrap(err, "converting chunk file name to int") } chunkIDs = append(chunkIDs, chunkID) } hasGap := false slices.Sort(chunkIDs) prevID := chunkIDs[0] for _, id := range chunkIDs[1:] { if id != prevID+1 { hasGap = true break } prevID = id } if hasGap { // TODO: may be tolerant and fix it. return nil, errors.New("chunk IDs are not continuous, data may be corrupted") } return chunkIDs, nil } func (b *block) loadIndexs(ctx context.Context) error { indexDirpath := filepath.Join(b.Config().Dir, indexDirname) if err := b.decodeIndexFile(ctx, filepath.Join(indexDirpath, indexPrimaryFilename), b.primaryIndex); err != nil { return errors.Wrap(err, "decoding primary index") } if err := b.decodeIndexFile(ctx, filepath.Join(indexDirpath, indexInvertedFilename), b.invertedIndex); err != nil { return errors.Wrap(err, "decoding inverted index") } if err := b.decodeIndexFile(ctx, filepath.Join(indexDirpath, indexVectorFilename), b.vectorIndex); err != nil { return errors.Wrap(err, "decoding vector index") } if err := component.RunUntilReady(ctx, b.primaryIndex, 10*time.Second); err != nil { return errors.Wrap(err, "running primary index") } if err := component.RunUntilReady(ctx, b.invertedIndex, 10*time.Second); err != nil { return errors.Wrap(err, "running inverted index") } if err := component.RunUntilReady(ctx, b.vectorIndex, 10*time.Second); err != nil { return errors.Wrap(err, "running vector index") } return nil } func (b *block) decodeIndexFile(ctx context.Context, path string, idx index.Codec) error { f, err := os.Open(path) if err != nil { return errors.Wrap(err, "opening index file") } defer func() { _ = f.Close() }() buf := bufio.NewReaderSize(f, 1*1024*1024) defer buf.Reset(nil) if err := idx.DecodeFrom(ctx, buf); err != nil { return errors.Wrap(err, "decoding index") } return nil } func (b *block) replayIndex(ctx context.Context, id uint32, ck chunk.File) error { if err := ck.Range(ctx, func(feed *chunk.Feed, offset uint64) error { b.primaryIndex.Add(ctx, feed.ID, primary.FeedRef{ Chunk: id, Offset: offset, Time: feed.Time, }) b.invertedIndex.Add(ctx, feed.ID, feed.Labels) if len(feed.Vectors) > 0 { if err := b.vectorIndex.Add(ctx, feed.ID, feed.Vectors); err != nil { return errors.Wrap(err, "adding to vector index") } } return nil }); err != nil { return errors.Wrapf(err, "replaying index for chunk %d", id) } return nil } func (b *block) resetMem() error { if err := b.closeFiles(); err != nil { return errors.Wrap(err, "closing files") } b.primaryIndex, b.vectorIndex, b.invertedIndex = nil, nil, nil b.chunks = nil return nil } func (b *block) closeFiles() error { if err := b.primaryIndex.Close(); err != nil { return errors.Wrap(err, "closing primary index") } if err := b.vectorIndex.Close(); err != nil { return errors.Wrap(err, "closing vector index") } if err := b.invertedIndex.Close(); err != nil { return errors.Wrap(err, "closing inverted index") } for _, c := range b.chunks { if err := c.Close(); err != nil { return errors.Wrap(err, "closing chunk") } } return nil } func (b *block) applyFilters(ctx context.Context, query *QueryOptions) (res filterResult, err error) { // Apply label filters. labelsResult := b.applyLabelFilters(ctx, query.labelFilters) if isMatchedNothingFilterResult(labelsResult) { return matchedNothingFilterResult, nil } // Apply vector filter. vectorsResult, err := b.applyVectorFilter(ctx, query.Query, query.Threshold, query.Limit) if err != nil { return nil, errors.Wrap(err, "applying vector filter") } if isMatchedNothingFilterResult(vectorsResult) { return matchedNothingFilterResult, nil } // Merge filter results and prepare for reading. return b.mergeFilterResults(labelsResult, vectorsResult), nil } func (b *block) applyLabelFilters(ctx context.Context, filters []LabelFilter) filterResult { if len(filters) == 0 { return matchedAllFilterResult } var allIDs map[uint64]struct{} for _, filter := range filters { ids := b.invertedIndex.Search(ctx, filter.Label, filter.Equal, filter.Value) if len(ids) == 0 { return matchedNothingFilterResult } // initialize allIDs if allIDs == nil { allIDs = ids continue } // merge AND results for id := range allIDs { if _, ok := ids[id]; !ok { delete(allIDs, id) } } if len(allIDs) == 0 { return matchedNothingFilterResult } } // convert to filterResult result := make(filterResult, len(allIDs)) for id := range allIDs { result[id] = 0 } return result } func (b *block) applyVectorFilter( ctx context.Context, query string, threshold float32, limit int, ) (filterResult, error) { if query == "" { return matchedAllFilterResult, nil } llm := b.Dependencies().LLMFactory.Get(b.Config().embeddingLLM) queryVector, err := llm.Embedding(ctx, query) if err != nil { return nil, errors.Wrap(err, "embed query") } ids, err := b.vectorIndex.Search(ctx, queryVector, threshold, limit) if err != nil { return nil, errors.Wrap(err, "applying vector filter") } return ids, nil } func (b *block) mergeFilterResults(x, y filterResult) filterResult { switch { case len(x) > 0 && len(y) > 0: // estimate capacity as the smaller of the two maps result := make(filterResult, int(math.Min(float64(len(x)), float64(len(y))))) for id := range y { if _, ok := x[id]; ok { result[id] = y[id] } } return result case len(x) > 0: return x case len(y) > 0: return y default: if isMatchedNothingFilterResult(x) || isMatchedNothingFilterResult(y) { return matchedNothingFilterResult } return matchedAllFilterResult } } func (b *block) fillEmbedding(ctx context.Context, feeds []*model.Feed) ([]*chunk.Feed, error) { embedded := make([]*chunk.Feed, len(feeds)) llm := b.Dependencies().LLMFactory.Get(b.Config().embeddingLLM) var wg sync.WaitGroup var mu sync.Mutex var errs []error for i, feed := range feeds { wg.Add(1) go func(i int, feed *model.Feed) { // TODO: limit go routines. defer wg.Done() vectors, err := llm.EmbeddingLabels(ctx, feed.Labels) if err != nil { mu.Lock() errs = append(errs, errors.Wrap(err, "fill embedding")) mu.Unlock() return } mu.Lock() embedded[i] = &chunk.Feed{ Feed: feed, Vectors: vectors, } mu.Unlock() }(i, feed) } wg.Wait() if len(errs) > 0 { return nil, errs[0] } return embedded, nil } func (b *block) writeIndexFile(ctx context.Context, path string, idx index.Codec) error { // Ensure index directory. if err := os.MkdirAll(filepath.Dir(path), 0700); err != nil { return errors.Wrap(err, "ensure index directory") } // Create index file. f, err := os.Create(path) if err != nil { return errors.Wrap(err, "creating index file") } defer func() { _ = f.Close() }() // Encode index. w := bufio.NewWriterSize(f, 4*1024) defer func() { _ = w.Flush() }() if err := idx.EncodeTo(ctx, w); err != nil { return errors.Wrap(err, "encoding index") } return nil } type chunkChain []chunk.File func (c chunkChain) head() (uint32, chunk.File) { return uint32(len(c) - 1), c[len(c)-1] } // filterResult is the result of a filter, id -> score. // If the filter is not a vector filter, the score is 0. type filterResult map[uint64]float32 var ( matchedAllFilterResult filterResult = nil isMatchedAllFilterResult = func(r filterResult) bool { return r == nil } matchedNothingFilterResult filterResult = map[uint64]float32{} isMatchedNothingFilterResult = func(r filterResult) bool { return r != nil && len(r) == 0 } ) func (r filterResult) forEach( ctx context.Context, primaryIndex primary.Index, iter func(ref primary.FeedRef, score float32) error, ) error { realFilterResult := r if isMatchedAllFilterResult(r) { ids := primaryIndex.IDs(ctx) realFilterResult = make(filterResult, len(ids)) for id := range ids { realFilterResult[id] = 0 } } for id, score := range realFilterResult { ref, ok := primaryIndex.Search(ctx, id) if !ok { log.Error(ctx, errors.Errorf("feed: %d not found in primary index via other index, may be BUG", id)) continue } if err := iter(ref, score); err != nil { return errors.Wrap(err, "iterating filter result") } } return nil } type mockBlock struct { component.Mock } func (m *mockBlock) Reload(c *Config) error { args := m.Called(c) return args.Error(0) } func (m *mockBlock) TransformToCold() error { args := m.Called() return args.Error(0) } func (m *mockBlock) ClearOnDisk() error { args := m.Called() return args.Error(0) } func (m *mockBlock) Append(ctx context.Context, feeds ...*model.Feed) error { args := m.Called(ctx, feeds) return args.Error(0) } func (m *mockBlock) Query(ctx context.Context, query QueryOptions) ([]*FeedVO, error) { args := m.Called(ctx, query) return args.Get(0).([]*FeedVO), args.Error(1) } func (m *mockBlock) Exists(ctx context.Context, id uint64) (bool, error) { args := m.Called(ctx, id) return args.Bool(0), args.Error(1) } func (m *mockBlock) Start() time.Time { args := m.Called() return args.Get(0).(time.Time) } func (m *mockBlock) End() time.Time { args := m.Called() return args.Get(0).(time.Time) } func (m *mockBlock) State() State { args := m.Called() return args.Get(0).(State) }