This commit is contained in:
glidea
2025-04-19 15:50:26 +08:00
commit 8b33df8a05
109 changed files with 24407 additions and 0 deletions

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,741 @@
// Copyright (C) 2025 wangyusong
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.
package chunk
import (
"bytes"
"context"
"encoding/binary"
"io"
"os"
"sync"
"sync/atomic"
"time"
"github.com/edsrzf/mmap-go"
"github.com/pkg/errors"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
"github.com/glidea/zenfeed/pkg/component"
"github.com/glidea/zenfeed/pkg/model"
"github.com/glidea/zenfeed/pkg/telemetry"
telemetrymodel "github.com/glidea/zenfeed/pkg/telemetry/model"
"github.com/glidea/zenfeed/pkg/util/buffer"
timeutil "github.com/glidea/zenfeed/pkg/util/time"
)
// --- Interface code block ---
// File is the interface for a chunk file.
// Concurrent safe.
type File interface {
	component.Component

	// EnsureReadonly ensures the file is readonly (can not Append).
	// It should be fast when the file already is readonly.
	// It will ensure the writeonly related resources are closed,
	// and open the readonly related resources, such as mmap to save memory.
	EnsureReadonly(ctx context.Context) (err error)
	// Count returns the number of feeds currently stored in the file.
	Count(ctx context.Context) (count uint32)
	// Append appends feeds to the file.
	// onSuccess is called when the feed is appended successfully (synchronously).
	// The offset is the offset of the feed in the file.
	// !!! It doesn't buffer the data between requests, so the caller should buffer the feeds to avoid high I/O.
	Append(ctx context.Context, feeds []*Feed, onSuccess func(feed *Feed, offset uint64) error) (err error)
	// Read reads a feed from the file at the given absolute offset.
	Read(ctx context.Context, offset uint64) (feed *Feed, err error)
	// Range ranges over all feeds in the file, in append order.
	// Iteration stops at the first error returned by iter.
	Range(ctx context.Context, iter func(feed *Feed, offset uint64) (err error)) (err error)
}
// Config for a chunk file.
type Config struct {
	// Path is the path to the chunk file.
	// If the file does not exist, it will be created.
	// If the file exists, it will be reloaded.
	Path string
	// ReadonlyAtFirst indicates whether the file should be readonly at first.
	// If file of path does not exist, it cannot be true.
	ReadonlyAtFirst bool
}
// Validate checks the config and reports whether the file at Path
// already exists. A ReadonlyAtFirst config requires an existing file.
func (c *Config) Validate() (fileExists bool, err error) {
	if c.Path == "" {
		return false, errors.New("path is required")
	}

	fi, statErr := os.Stat(c.Path)
	if statErr == nil {
		// Path exists; it must be a regular file, not a directory.
		if fi.IsDir() {
			return false, errors.New("path is a directory")
		}

		return true, nil
	}
	if os.IsNotExist(statErr) {
		// A readonly file cannot be created from scratch.
		if c.ReadonlyAtFirst {
			return false, errors.New("path does not exist")
		}

		return false, nil
	}

	return false, errors.Wrap(statErr, "stat path")
}
type Dependencies struct{}
// On-disk layout constants. A chunk file starts with a fixed 64-byte header:
// 16 bytes of magic number, a 4-byte little-endian format version, and
// reserved zero padding up to headerBytes. Feed data begins at dataStart.
var (
	headerBytes       = 64
	headerMagicNumber = []byte{0x77, 0x79, 0x73, 0x20, 0x69, 0x73, 0x20,
		0x61, 0x77, 0x65, 0x73, 0x6f, 0x6d, 0x65, 0x00, 0x00}
	headerMagicNumberBytes = 16
	headerVersionStart     = headerMagicNumberBytes
	headerVersion          = uint32(1)
	headerVersionBytes     = 4
	dataStart              = headerBytes
	// header is the pre-built 64-byte header written to newly created files.
	header = func() []byte {
		b := make([]byte, headerBytes)
		copy(b[:headerMagicNumberBytes], headerMagicNumber)
		binary.LittleEndian.PutUint32(b[headerVersionStart:headerVersionStart+headerVersionBytes], headerVersion)
		return b
	}()
)
// Metrics.
var (
	// modes enumerates the values of the "mode" metric label; at any moment a
	// file exports series for exactly one of them (see file.Run).
	modes = []string{"readwrite", "readonly"}

	// feedCount tracks the number of feeds stored in each chunk file.
	feedCount = promauto.NewGaugeVec(
		prometheus.GaugeOpts{
			Namespace: model.AppName,
			Subsystem: "chunk",
			Name:      "feed_count",
			Help:      "Number of feeds in the chunk file.",
		},
		[]string{telemetrymodel.KeyComponent, telemetrymodel.KeyComponentInstance, "mode"},
	)
	// byteSize tracks the on-disk / in-buffer size of each chunk file.
	byteSize = promauto.NewGaugeVec(
		prometheus.GaugeOpts{
			Namespace: model.AppName,
			Subsystem: "chunk",
			Name:      "bytes",
			Help:      "Size of the chunk file.",
		},
		[]string{telemetrymodel.KeyComponent, telemetrymodel.KeyComponentInstance, "mode"},
	)
)
// --- Factory code block ---
type Factory component.Factory[File, Config, Dependencies]
// NewFactory returns a Factory for chunk files.
// With mock options it produces mock files (for tests); otherwise it
// produces real on-disk chunk files via new.
func NewFactory(mockOn ...component.MockOption) Factory {
	if len(mockOn) == 0 {
		return component.FactoryFunc[File, Config, Dependencies](new)
	}

	return component.FactoryFunc[File, Config, Dependencies](
		func(instance string, config *Config, dependencies Dependencies) (File, error) {
			mock := &mockFile{}
			component.MockOptions(mockOn).Apply(&mock.Mock)

			return mock, nil
		},
	)
}
// new creates a new chunk file.
// It will create a new chunk file if the file that path points to does not exist.
// It will open the file if the file exists, and reload it.
// If readonlyAtFirst is true, it will open the file readonly (mmap-backed).
func new(instance string, config *Config, dependencies Dependencies) (File, error) {
	fileExists, err := config.Validate()
	if err != nil {
		return nil, errors.Wrap(err, "validate config")
	}

	// Open (or create) the OS file and build the mode-specific resources:
	// a write buffer + append offset for readwrite, or an mmap for readonly.
	osFile, readWriteBuf, appendOffset, readonlyMmap, count, err := init0(fileExists, config)
	if err != nil {
		return nil, err
	}

	// Seed the atomic state from the validated on-disk contents.
	var rn atomic.Bool
	rn.Store(config.ReadonlyAtFirst)
	var cnt atomic.Uint32
	cnt.Store(count)

	return &file{
		Base: component.New(&component.BaseConfig[Config, Dependencies]{
			Name:         "FeedChunk",
			Instance:     instance,
			Config:       config,
			Dependencies: dependencies,
		}),
		f:            osFile,
		readWriteBuf: readWriteBuf,
		appendOffset: appendOffset,
		readonlyMmap: readonlyMmap,
		readonly:     &rn,
		count:        &cnt,
	}, nil
}
// init0 opens or creates the OS file for config and prepares the resources
// for the requested mode. For readwrite it returns the validated in-memory
// buffer plus the append offset; for readonly it returns an mmap instead
// (readWriteBuf is released to save memory). count is the number of valid
// feeds found on disk. On any error the OS file is closed before returning.
func init0(
	fileExists bool,
	config *Config,
) (
	osFile *os.File,
	readWriteBuf *buffer.Bytes,
	appendOffset uint64,
	readonlyMmap mmap.MMap,
	count uint32,
	err error,
) {
	// Ensure file.
	if fileExists {
		osFile, err = loadFromExisting(config.Path, config.ReadonlyAtFirst)
		if err != nil {
			return nil, nil, 0, nil, 0, errors.Wrap(err, "load from existing")
		}
	} else { // Create new file.
		if config.ReadonlyAtFirst {
			return nil, nil, 0, nil, 0, errors.New("cannot create readonly file")
		}
		osFile, err = createNewOSFile(config.Path)
		if err != nil {
			return nil, nil, 0, nil, 0, errors.Wrap(err, "create new os file")
		}
	}

	// Setup for Read. Scans the whole file, truncating any trailing
	// half-written feed (e.g. from a crash) out of the buffer.
	readWriteBuf, count, err = validateOSFile(osFile)
	if err != nil {
		_ = osFile.Close()
		return nil, nil, 0, nil, 0, errors.Wrap(err, "validate os file")
	}

	if config.ReadonlyAtFirst {
		readWriteBuf = nil // Help GC; readonly mode serves reads from mmap.
		// Note: m, err := ... shadows the named err, but the error is
		// returned explicitly so nothing is lost.
		m, err := mmap.Map(osFile, mmap.RDONLY, 0)
		if err != nil {
			_ = osFile.Close()
			return nil, nil, 0, nil, 0, errors.Wrap(err, "mmap file")
		}
		readonlyMmap = m
	} else {
		// Appends continue where the valid data ends (header included).
		appendOffset = uint64(readWriteBuf.Len())
	}

	return
}
// validateOSFile validates the header and every feed record in f, returning
// a buffer holding header + all fully-valid feed bytes, and the feed count.
// A truncated or checksum-failing trailing record (e.g. after a crash) is
// silently dropped from the buffer; any other decode error is fatal.
func validateOSFile(f *os.File) (readWriteBuf *buffer.Bytes, count uint32, err error) {
	header, err := validateHeader(f)
	if err != nil {
		return nil, 0, errors.Wrap(err, "validate header")
	}
	readWriteBuf = &buffer.Bytes{B: header} // len(header) == cap(header).

	if _, err := f.Seek(int64(dataStart), io.SeekStart); err != nil {
		return nil, 0, errors.Wrap(err, "seek to data start")
	}

	tr := &trackReader{Reader: f}
	// lastSuccessReaded counts data-section bytes up to the end of the last
	// feed that validated cleanly.
	var lastSuccessReaded int
	var p Feed
	for {
		err := p.validateFrom(tr, readWriteBuf)
		switch {
		case err == nil:
			count++
			lastSuccessReaded = tr.Readed()

			continue
		case (errors.Is(err, io.EOF) || errors.Is(err, io.ErrUnexpectedEOF)) ||
			errors.Is(err, errChecksumMismatch):
			// Truncate uncompleted feed if any: keep header plus the bytes
			// of all fully-validated feeds.
			readWriteBuf.B = readWriteBuf.B[:lastSuccessReaded+len(header)]

			return readWriteBuf, count, nil
		default:
			return nil, 0, errors.Wrap(err, "validate payload")
		}
	}
}
// validateHeader reads the fixed-size file header and checks its magic
// number and format version, returning the raw header bytes on success.
func validateHeader(f *os.File) (header []byte, err error) {
	header = make([]byte, headerBytes)
	if _, readErr := f.ReadAt(header, 0); readErr != nil {
		return nil, errors.Wrap(readErr, "read header")
	}

	// The magic number must match exactly.
	if !bytes.Equal(header[:headerMagicNumberBytes], headerMagicNumber) {
		return nil, errors.New("invalid magic number")
	}

	// Only a single on-disk format version is supported.
	got := binary.LittleEndian.Uint32(header[headerVersionStart : headerVersionStart+headerVersionBytes])
	if got != headerVersion {
		return nil, errors.New("invalid version")
	}

	return header, nil
}
// loadFromExisting opens an existing chunk file, read-only when
// readonlyAtFirst is set and read-write otherwise.
func loadFromExisting(path string, readonlyAtFirst bool) (osFile *os.File, err error) {
	mode := os.O_RDWR
	if readonlyAtFirst {
		mode = os.O_RDONLY
	}

	osFile, err = os.OpenFile(path, mode, 0600)
	if err != nil {
		return nil, errors.Wrap(err, "open file")
	}

	return osFile, nil
}
// createNewOSFile creates a brand-new chunk file at path and persists the
// file header. The file handle is closed on any failure.
func createNewOSFile(path string) (osFile *os.File, err error) {
	// O_EXCL guarantees we never clobber an existing chunk file.
	osFile, err = os.OpenFile(path, os.O_RDWR|os.O_CREATE|os.O_EXCL, 0600)
	if err != nil {
		return nil, errors.Wrap(err, "create file")
	}

	closeAndFail := func(wrapped error) (*os.File, error) {
		_ = osFile.Close()
		return nil, wrapped
	}

	// Write the header and flush it to stable storage before handing out.
	if _, err = osFile.Write(header); err != nil {
		return closeAndFail(errors.Wrap(err, "write header"))
	}
	if err = osFile.Sync(); err != nil {
		return closeAndFail(errors.Wrap(err, "sync file"))
	}

	return osFile, nil
}
// --- Implementation code block ---
// file is the on-disk implementation of File. It operates in one of two
// modes: readwrite (all data mirrored in readWriteBuf, appends allowed)
// or readonly (data served from an mmap, appends rejected). The mode
// switch is one-way, via EnsureReadonly.
type file struct {
	*component.Base[Config, Dependencies]

	f *os.File
	// count is the number of feeds currently stored.
	count *atomic.Uint32
	// readonly flags the current mode; checked lock-free on hot paths.
	readonly *atomic.Bool
	// mu guards the mode-specific fields below and the underlying file.
	mu sync.RWMutex

	// Only readwrite.
	readWriteBuf *buffer.Bytes
	appendOffset uint64

	// Only readonly.
	readonlyMmap mmap.MMap
}
// Run marks the component ready and exports feed-count / byte-size metrics
// every 30 seconds until the component context is canceled. Metric series
// for the mode the file is not currently in are deleted so a readwrite ->
// readonly switch does not leave stale series behind.
func (f *file) Run() error {
	f.MarkReady()
	return timeutil.Tick(f.Context(), 30*time.Second, func() error {
		mode := "readwrite"
		// NOTE(review): appendOffset and readonlyMmap are read here without
		// holding f.mu, racing with Append/EnsureReadonly — presumably
		// acceptable for metrics-only staleness, but confirm.
		sizeValue := f.appendOffset
		if f.readonly.Load() {
			mode = "readonly"
			sizeValue = uint64(len(f.readonlyMmap))
		}
		feedCount.WithLabelValues(append(f.TelemetryLabelsIDFields(), mode)...).Set(float64(f.Count(context.Background())))
		byteSize.WithLabelValues(append(f.TelemetryLabelsIDFields(), mode)...).Set(float64(sizeValue))
		// Drop series for every mode other than the current one.
		for _, m := range modes {
			if m == mode {
				continue
			}
			feedCount.DeleteLabelValues(append(f.TelemetryLabelsIDFields(), m)...)
			byteSize.DeleteLabelValues(append(f.TelemetryLabelsIDFields(), m)...)
		}
		return nil
	})
}
// Close stops the background Run loop, removes this instance's metric
// series, unmaps the mmap (if readonly), and closes the OS file.
func (f *file) Close() error {
	// Close Run().
	if err := f.Base.Close(); err != nil {
		return errors.Wrap(err, "closing base")
	}

	// Clean metrics.
	feedCount.DeletePartialMatch(f.TelemetryLabelsID())
	byteSize.DeletePartialMatch(f.TelemetryLabelsID())

	// Unmap if readonly.
	f.mu.Lock()
	defer f.mu.Unlock()
	if f.readonlyMmap != nil {
		if err := f.readonlyMmap.Unmap(); err != nil {
			return errors.Wrap(err, "unmap file")
		}
		f.readonlyMmap = nil
	}

	// Close file and reset state so further use fails fast.
	if err := f.f.Close(); err != nil {
		return errors.Wrap(err, "close file")
	}
	f.f = nil
	f.appendOffset = 0

	return nil
}
// EnsureReadonly switches the file into readonly mode: the write buffer is
// released and reads are served from an mmap of the file. It is idempotent
// and cheap when the file is already readonly.
func (f *file) EnsureReadonly(ctx context.Context) (err error) {
	ctx = telemetry.StartWith(ctx, append(f.TelemetryLabels(), telemetrymodel.KeyOperation, "EnsureReadonly")...)
	defer func() { telemetry.End(ctx, err) }()

	// Fast path - already readonly (lock-free check).
	if f.readonly.Load() {
		return nil
	}

	// Acquire write lock, then re-check: another goroutine may have
	// completed the switch while we waited.
	f.mu.Lock()
	defer f.mu.Unlock()
	if f.readonly.Load() {
		return nil
	}

	// Open mmap BEFORE touching any readwrite state, so a failure here
	// leaves the file fully usable in readwrite mode. (Clearing
	// readWriteBuf first would, on mmap error, leave the file neither
	// readonly nor writable: readonly=false with a nil buffer, so the
	// next Append would panic.)
	m, err := mmap.Map(f.f, mmap.RDONLY, 0)
	if err != nil {
		return errors.Wrap(err, "mmap file")
	}

	// Commit the state switch atomically under the lock.
	f.readWriteBuf = nil // Help GC.
	f.readonlyMmap = m
	f.readonly.Store(true)

	return nil
}
// Count returns the number of feeds currently stored in the file.
func (f *file) Count(ctx context.Context) uint32 {
	labels := append(f.TelemetryLabels(), telemetrymodel.KeyOperation, "Count")
	ctx = telemetry.StartWith(ctx, labels...)
	defer func() { telemetry.End(ctx, nil) }()

	return f.count.Load()
}
// Append encodes feeds into the in-memory buffer, commits them (plus fsync)
// to disk, then invokes onSuccess for each feed with its absolute offset.
// On any failure before commit the buffer is rolled back so no partial data
// is left. Callbacks run AFTER the lock is released, so they may safely
// call back into the file.
func (f *file) Append(ctx context.Context, feeds []*Feed, onSuccess func(feed *Feed, offset uint64) error) (err error) {
	ctx = telemetry.StartWith(ctx, append(f.TelemetryLabels(), telemetrymodel.KeyOperation, "Append")...)
	defer func() { telemetry.End(ctx, err) }()

	// Manual unlock (no defer) because the onSuccess callbacks below must
	// run outside the critical section.
	f.mu.Lock()

	// Precheck.
	if f.readonly.Load() {
		f.mu.Unlock()
		return errors.New("file is readonly")
	}

	// Encode feeds into buffer.
	currentAppendOffset := f.appendOffset
	relativeOffsets, encodedBytesCount, err := f.encodeFeeds(feeds)
	if err != nil {
		// Roll the buffer back to its pre-append length.
		f.readWriteBuf.B = f.readWriteBuf.B[:currentAppendOffset]
		f.mu.Unlock()
		return errors.Wrap(err, "encode feeds")
	}

	// Prepare for commit.
	encodedData := f.readWriteBuf.Bytes()[currentAppendOffset:]
	newAppendOffset := currentAppendOffset + uint64(encodedBytesCount)

	// Commit data and header to file.
	if err = f.commitAppendToFile(encodedData, currentAppendOffset); err != nil {
		f.readWriteBuf.B = f.readWriteBuf.B[:currentAppendOffset]
		f.mu.Unlock()
		return errors.Wrap(err, "commit append to file")
	}

	// Update internal state on successful commit.
	f.appendOffset = newAppendOffset
	f.count.Add(uint32(len(feeds)))
	f.mu.Unlock()

	// Call callbacks after releasing the lock.
	absoluteOffsets := make([]uint64, len(relativeOffsets))
	for i, relOff := range relativeOffsets {
		absoluteOffsets[i] = currentAppendOffset + relOff // Calculate absolute offsets based on append position.
	}
	if err := f.notifySuccess(feeds, absoluteOffsets, onSuccess); err != nil {
		return errors.Wrap(err, "notify success callbacks")
	}

	return nil
}
// Read decodes and returns the feed stored at the given absolute file
// offset. In readonly mode the bytes come from the mmap (lock-free); in
// readwrite mode they come from the in-memory buffer under a read lock.
func (f *file) Read(ctx context.Context, offset uint64) (feed *Feed, err error) {
	ctx = telemetry.StartWith(ctx, append(f.TelemetryLabels(), telemetrymodel.KeyOperation, "Read")...)
	defer func() { telemetry.End(ctx, err) }()

	// Validate offset: feeds can only live at or after the data section.
	if offset < uint64(dataStart) {
		return nil, errors.New("offset too small")
	}

	// Handle readonly mode. The mmap is immutable once set, so no lock is
	// taken here.
	if f.readonly.Load() {
		if offset >= uint64(len(f.readonlyMmap)) {
			return nil, errors.New("offset too large")
		}
		feed, _, err = f.readFeed(ctx, f.readonlyMmap, offset)
		if err != nil {
			return nil, errors.Wrap(err, "read feed")
		}

		return feed, nil
	}

	// Handle readwrite mode.
	f.mu.RLock()
	defer f.mu.RUnlock()
	if offset >= f.appendOffset {
		return nil, errors.New("offset too large")
	}
	feed, _, err = f.readFeed(ctx, f.readWriteBuf.Bytes(), offset)
	if err != nil {
		return nil, errors.Wrap(err, "read feed")
	}

	return feed, nil
}
// Range iterates over every feed in append order, invoking iter with each
// decoded feed and its absolute offset. Iteration stops at the first error
// from iter or from decoding.
func (f *file) Range(ctx context.Context, iter func(feed *Feed, offset uint64) error) (err error) {
	ctx = telemetry.StartWith(ctx, append(f.TelemetryLabels(), telemetrymodel.KeyOperation, "Range")...)
	defer func() { telemetry.End(ctx, err) }()

	// Handle readonly mode (served from the immutable mmap, no lock).
	if f.readonly.Load() {
		// Start from data section.
		offset := uint64(dataStart)
		for offset < uint64(len(f.readonlyMmap)) {
			feed, n, err := f.readFeed(ctx, f.readonlyMmap, offset)
			if err != nil {
				return errors.Wrap(err, "read feed")
			}
			if err := iter(feed, offset); err != nil {
				return errors.Wrap(err, "iterate feed")
			}

			// Move to next feed.
			offset += uint64(n) // G115: Safe conversion as n is uint32
		}

		return nil
	}

	// Handle readwrite mode (read lock held for the whole iteration, so
	// long-running iter callbacks will block appends).
	f.mu.RLock()
	defer f.mu.RUnlock()
	data := f.readWriteBuf.Bytes()
	offset := uint64(dataStart)
	for offset < f.appendOffset { // appendOffset is already checked/maintained correctly.
		feed, n, err := f.readFeed(ctx, data, offset)
		if err != nil {
			return errors.Wrap(err, "read feed")
		}
		if err := iter(feed, offset); err != nil {
			return errors.Wrap(err, "iterate feed")
		}

		// Move to next feed.
		offset += uint64(n)
	}

	return nil
}
const estimatedFeedSize = 4 * 1024
// encodeFeeds encodes a slice of feeds into the internal readWriteBuf.
// It returns the relative offsets of each feed within the newly added data,
// the total number of bytes encoded, and any error encountered.
// Caller must hold f.mu; on error the caller is responsible for truncating
// readWriteBuf back to its previous length.
func (f *file) encodeFeeds(feeds []*Feed) (relativeOffsets []uint64, encodedBytesCount int, err error) {
	relativeOffsets = make([]uint64, len(feeds))
	startOffset := f.readWriteBuf.Len()
	// Pre-grow the buffer with a rough estimate to reduce reallocations.
	f.readWriteBuf.EnsureRemaining(estimatedFeedSize * len(feeds))

	for i, feed := range feeds {
		currentOffsetInBuf := f.readWriteBuf.Len()
		relativeOffsets[i] = uint64(currentOffsetInBuf - startOffset)
		if err := feed.encodeTo(f.readWriteBuf); err != nil {
			return nil, 0, errors.Wrapf(err, "encode feed %d", i)
		}
	}
	encodedBytesCount = f.readWriteBuf.Len() - startOffset

	return relativeOffsets, encodedBytesCount, nil
}
// commitAppendToFile writes the encoded data at the current append offset
// and fsyncs so the append survives a crash.
func (f *file) commitAppendToFile(data []byte, currentAppendOffset uint64) error {
	// Append data at the current offset.
	_, writeErr := f.f.WriteAt(data, int64(currentAppendOffset))
	if writeErr != nil {
		// A partial write is tolerable: the next successful append
		// overwrites the same region.
		return errors.Wrap(writeErr, "write feeds")
	}

	// Flush to stable storage to persist the change.
	if syncErr := f.f.Sync(); syncErr != nil {
		return errors.Wrap(syncErr, "sync file")
	}

	return nil
}
// notifySuccess calls the onSuccess callback for each successfully appended
// feed, pairing it with its absolute offset. A nil callback is a no-op;
// the first callback error aborts the remaining notifications.
func (f *file) notifySuccess(
	feeds []*Feed,
	absoluteOffsets []uint64,
	onSuccess func(feed *Feed, offset uint64) error,
) error {
	if onSuccess == nil {
		return nil // Nothing to notify.
	}

	for i := range feeds {
		if cbErr := onSuccess(feeds[i], absoluteOffsets[i]); cbErr != nil {
			// Surface the first error encountered during callbacks.
			return errors.Wrapf(cbErr, "on success callback for feed %d", i)
		}
	}

	return nil
}
// readFeed decodes a single feed from data starting at offset, returning
// the feed and the number of bytes it occupied (so callers can advance to
// the next record).
func (f *file) readFeed(ctx context.Context, data []byte, offset uint64) (feed *Feed, length int, err error) {
	ctx = telemetry.StartWith(ctx, append(f.TelemetryLabels(), telemetrymodel.KeyOperation, "readFeed")...)
	defer func() { telemetry.End(ctx, err) }()

	// Prepare reader: a section view over [offset, len(data)) so the
	// decoder cannot read past the end of the data.
	r := io.NewSectionReader(bytes.NewReader(data), int64(offset), int64(uint64(len(data))-offset))
	// trackReader counts consumed bytes, which becomes the record length.
	tr := &trackReader{Reader: r}

	// Decode feed.
	feed = &Feed{Feed: &model.Feed{}}
	if err = feed.decodeFrom(tr); err != nil {
		return nil, 0, errors.Wrap(err, "decode feed")
	}

	return feed, tr.Readed(), nil
}
type trackReader struct {
io.Reader
length int
}
func (r *trackReader) Read(p []byte) (n int, err error) {
n, err = r.Reader.Read(p)
r.length += n
return
}
func (r *trackReader) Readed() int {
return r.length
}
// mockFile is a testify-style mock implementation of File, produced by
// NewFactory when mock options are supplied. Each method records the call
// and returns the values configured on the embedded Mock.
type mockFile struct {
	component.Mock
}

// Run returns the configured error.
func (m *mockFile) Run() error {
	args := m.Called()
	return args.Error(0)
}

// Ready returns the configured readiness channel.
func (m *mockFile) Ready() <-chan struct{} {
	args := m.Called()
	return args.Get(0).(<-chan struct{})
}

// Close returns the configured error.
func (m *mockFile) Close() error {
	args := m.Called()
	return args.Error(0)
}

// Append records the arguments and returns the configured error.
func (m *mockFile) Append(ctx context.Context, feeds []*Feed, onSuccess func(feed *Feed, offset uint64) error) error {
	args := m.Called(ctx, feeds, onSuccess)
	return args.Error(0)
}

// Read returns the configured feed and error.
func (m *mockFile) Read(ctx context.Context, offset uint64) (*Feed, error) {
	args := m.Called(ctx, offset)
	return args.Get(0).(*Feed), args.Error(1)
}

// Range records the iterator and returns the configured error.
func (m *mockFile) Range(ctx context.Context, iter func(feed *Feed, offset uint64) error) error {
	args := m.Called(ctx, iter)
	return args.Error(0)
}

// Count returns the configured count.
func (m *mockFile) Count(ctx context.Context) uint32 {
	args := m.Called(ctx)
	return args.Get(0).(uint32)
}

// EnsureReadonly returns the configured error.
func (m *mockFile) EnsureReadonly(ctx context.Context) error {
	args := m.Called(ctx)
	return args.Error(0)
}

View File

@@ -0,0 +1,270 @@
package chunk
import (
"context"
"fmt"
"math/rand"
"os"
"path/filepath"
"strings"
"testing"
"time"
"github.com/glidea/zenfeed/pkg/model"
)
// --- Benchmark Setup ---
// Benchmark sizing knobs.
const (
	benchmarkFeedCount = 10000 // Number of feeds for benchmark setup
	benchmarkBatchSize = 100   // Batch size for append benchmark
)

// Shared benchmark state, populated lazily by setupBenchmarkFile and reset
// by its cleanup function.
var (
	benchmarkFeeds    []*Feed
	benchmarkOffsets  []uint64 // Store offsets for read benchmark
	benchmarkTempPath string
)
// setupBenchmarkFile creates a temporary file and populates it with benchmarkFeeds.
// The file is first filled in read-write mode, then reopened in the mode the
// benchmark wants (readonly => mmap-backed reads). It returns the opened file
// and a cleanup function that removes the temp dir and resets package state.
func setupBenchmarkFile(b *testing.B, readonly bool) (File, func()) {
	b.Helper()

	// Create temp file path only once; reused across benchmark iterations
	// so the expensive population step is not repeated.
	if benchmarkTempPath == "" {
		dir, err := os.MkdirTemp("", "chunk-benchmark")
		if err != nil {
			b.Fatalf("Failed to create temp dir: %v", err)
		}
		benchmarkTempPath = filepath.Join(dir, "benchmark.chunk")
	}
	cleanup := func() {
		os.RemoveAll(filepath.Dir(benchmarkTempPath))
		benchmarkTempPath = "" // Reset path for next potential setup
		benchmarkFeeds = nil   // Clear feeds
		benchmarkOffsets = nil // Clear offsets
	}

	// Generate feeds only once per setup phase if needed.
	if len(benchmarkFeeds) == 0 {
		benchmarkFeeds = generateBenchmarkFeeds(benchmarkFeedCount)
		benchmarkOffsets = make([]uint64, 0, benchmarkFeedCount)
	}

	// Create and populate the file in read-write mode first.
	rwConfig := &Config{Path: benchmarkTempPath}
	rwFile, err := new("benchmark-setup", rwConfig, Dependencies{})
	if err != nil {
		cleanup()
		b.Fatalf("Failed to create benchmark file for setup: %v", err)
	}

	currentOffsetCount := int(rwFile.Count(context.Background()))
	if currentOffsetCount < benchmarkFeedCount { // Only append if not already populated
		appendCount := 0
		onSuccess := func(feed *Feed, offset uint64) error {
			// Collect offsets only during the initial population.
			if len(benchmarkOffsets) < benchmarkFeedCount {
				benchmarkOffsets = append(benchmarkOffsets, offset)
			}
			appendCount++
			return nil
		}
		// Append in fixed-size batches until the target count is reached.
		for i := currentOffsetCount; i < benchmarkFeedCount; i += benchmarkBatchSize {
			end := i + benchmarkBatchSize
			if end > benchmarkFeedCount {
				end = benchmarkFeedCount
			}
			if err := rwFile.Append(context.Background(), benchmarkFeeds[i:end], onSuccess); err != nil {
				rwFile.Close()
				cleanup()
				b.Fatalf("Failed to append feeds during setup: %v", err)
			}
		}
	}

	// Close the read-write file before potentially reopening as readonly.
	if err := rwFile.Close(); err != nil {
		cleanup()
		b.Fatalf("Failed to close rw file during setup: %v", err)
	}

	// Reopen file with the desired mode for the benchmark.
	config := &Config{
		Path:            benchmarkTempPath,
		ReadonlyAtFirst: readonly,
	}
	f, err := new("benchmark", config, Dependencies{})
	if err != nil {
		cleanup()
		b.Fatalf("Failed to open benchmark file in target mode: %v", err)
	}
	if readonly {
		// For read benchmarks, ensure mmap is active if file was just created/populated.
		if err := f.EnsureReadonly(context.Background()); err != nil {
			f.Close()
			cleanup()
			b.Fatalf("Failed to ensure readonly mode: %v", err)
		}
	}

	return f, cleanup
}
// generateBenchmarkFeeds builds count synthetic feeds, each with a large
// (roughly 8-15KB) random label value and two 1024-dim random vectors,
// to approximate realistic chunk payload sizes.
func generateBenchmarkFeeds(count int) []*Feed {
	feeds := make([]*Feed, count)
	// NOTE(review): the comment mentions a fixed seed, but the seed is
	// time-based — runs are NOT reproducible as written; confirm intent.
	rng := rand.New(rand.NewSource(time.Now().UnixNano())) // Use a fixed seed for reproducibility if needed

	// Pre-generate some random characters for building large strings efficiently.
	const letters = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 "
	letterRunes := []rune(letters)
	randString := func(n int) string {
		sb := strings.Builder{}
		sb.Grow(n)
		for i := 0; i < n; i++ {
			sb.WriteRune(letterRunes[rng.Intn(len(letterRunes))])
		}
		return sb.String()
	}

	minLabelSize := 8 * 1024  // 8KB
	maxLabelSize := 15 * 1024 // 15KB

	for i := range count {
		// Generate large label content size.
		largeLabelSize := minLabelSize + rng.Intn(maxLabelSize-minLabelSize+1)
		// Estimate the overhead of other labels and structure (key names, length prefixes etc.).
		// This is a rough estimation, adjust if needed.
		otherLabelsOverhead := 100
		largeContentSize := largeLabelSize - otherLabelsOverhead
		if largeContentSize < 0 {
			largeContentSize = 0
		}
		feeds[i] = &Feed{
			Feed: &model.Feed{
				ID: uint64(i + 1),
				Labels: model.Labels{
					model.Label{Key: "type", Value: fmt.Sprintf("type_%d", rng.Intn(10))},
					model.Label{Key: "source", Value: fmt.Sprintf("source_%d", rng.Intn(5))},
					model.Label{Key: "large_content", Value: randString(largeContentSize)}, // Add large label
				},
				Time: time.Now().Add(-time.Duration(rng.Intn(3600*24*30)) * time.Second), // Random time within the last 30 days
			},
			Vectors: [][]float32{
				generateFloat32Vector(rng, 1024), // Example dimension
				generateFloat32Vector(rng, 1024),
			},
		}
	}

	return feeds
}
func generateFloat32Vector(rng *rand.Rand, dim int) []float32 {
vec := make([]float32, dim)
for i := range vec {
vec[i] = rng.Float32()
}
return vec
}
// --- Benchmarks ---
// BenchmarkAppend measures appending a fixed batch of feeds to a fresh
// read-write chunk file (each Append includes an fsync).
func BenchmarkAppend(b *testing.B) {
	// Setup: Start with an empty file for appending.
	// Note: setupBenchmarkFile(b, false) creates the file but doesn't populate it fully here.
	// We need a fresh file for append benchmark.
	dir, err := os.MkdirTemp("", "chunk-append-benchmark")
	if err != nil {
		b.Fatalf("Failed to create temp dir: %v", err)
	}
	path := filepath.Join(dir, "append_benchmark.chunk")
	cleanup := func() {
		os.RemoveAll(dir)
	}
	defer cleanup()

	config := &Config{Path: path}
	f, err := new("benchmark-append", config, Dependencies{})
	if err != nil {
		b.Fatalf("Failed to create benchmark file for append: %v", err)
	}
	defer f.Close()

	feedsToAppend := generateBenchmarkFeeds(benchmarkBatchSize) // Generate a batch

	b.ResetTimer()
	b.ReportAllocs()

	// Measure appending batches of feeds.
	for i := 0; i < b.N; i++ {
		// Simulate appending new batches. In a real scenario, feeds would differ.
		// For benchmark consistency, we reuse the same batch data.
		err := f.Append(context.Background(), feedsToAppend, nil) // onSuccess is nil for performance
		if err != nil {
			b.Fatalf("Append failed during benchmark: %v", err)
		}
	}
	b.StopTimer() // Stop timer before potential cleanup/close overhead
}
// BenchmarkRead measures random-offset reads from a readonly (mmap-backed)
// chunk file. Offsets are pre-selected outside the timed region.
func BenchmarkRead(b *testing.B) {
	// Setup: Populate a file and make it readonly (mmap).
	f, cleanup := setupBenchmarkFile(b, true)
	defer cleanup()

	if len(benchmarkOffsets) == 0 {
		b.Fatal("Benchmark setup failed: no offsets generated.")
	}

	// Pre-select random offsets to read, so rng work is excluded from timing.
	rng := rand.New(rand.NewSource(42)) // Use a fixed seed for reproducibility
	readIndices := make([]int, b.N)
	for i := 0; i < b.N; i++ {
		readIndices[i] = rng.Intn(len(benchmarkOffsets))
	}

	b.ResetTimer()
	b.ReportAllocs()

	// Measure reading feeds at random valid offsets using mmap.
	for i := 0; i < b.N; i++ {
		offset := benchmarkOffsets[readIndices[i]]
		feed, err := f.Read(context.Background(), offset)
		if err != nil {
			b.Fatalf("Read failed during benchmark at offset %d: %v", offset, err)
		}
		// Prevent compiler optimization by using the result slightly.
		if feed == nil {
			b.Fatal("Read returned nil feed")
		}
	}
	b.StopTimer()
}
// BenchmarkRange measures a full sequential scan over all feeds in a
// read-write (buffer-backed) chunk file.
func BenchmarkRange(b *testing.B) {
	// Setup: Populate a file in read-write mode (readonly=false), so Range
	// iterates the in-memory buffer rather than an mmap.
	f, cleanup := setupBenchmarkFile(b, false)
	defer cleanup()

	b.ResetTimer()
	b.ReportAllocs()

	// Measure ranging over all feeds in the buffer-backed file.
	for i := 0; i < b.N; i++ {
		count := 0
		err := f.Range(context.Background(), func(feed *Feed, offset uint64) (err error) {
			// Minimal operation inside the iterator
			count++
			if feed == nil { // Basic check
				return fmt.Errorf("nil feed encountered at offset %d", offset)
			}
			return nil
		})
		if err != nil {
			b.Fatalf("Range failed during benchmark: %v", err)
		}
		// Optionally verify count, though it adds overhead to the benchmark itself
		// if uint32(count) != f.Count(context.Background()) {
		//  b.Fatalf("Range count mismatch: expected %d, got %d", f.Count(context.Background()), count)
		// }
	}
	b.StopTimer()
}

View File

@@ -0,0 +1,567 @@
// Copyright (C) 2025 wangyusong
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.
package chunk
import (
"context"
"os"
"path/filepath"
"testing"
"time"
. "github.com/onsi/gomega"
"github.com/pkg/errors"
"github.com/glidea/zenfeed/pkg/model"
"github.com/glidea/zenfeed/pkg/test"
)
// TestNew covers the new() constructor: creating a fresh chunk file,
// reloading an existing populated file, and rejecting invalid configs.
func TestNew(t *testing.T) {
	RegisterTestingT(t)

	type givenDetail struct {
		path            string
		readonlyAtFirst bool
		setupFeeds      []*Feed
	}
	type whenDetail struct{}
	type thenExpected struct {
		count uint32
		err   string
	}

	tests := []test.Case[givenDetail, whenDetail, thenExpected]{
		{
			Scenario: "Create New Chunk File",
			Given:    "A valid non-existing file path",
			When:     "Creating a new chunk file",
			Then:     "Should return a valid File instance with count 0",
			GivenDetail: givenDetail{
				readonlyAtFirst: false,
			},
			WhenDetail: whenDetail{},
			ThenExpected: thenExpected{
				count: 0,
			},
		},
		{
			Scenario: "Open Existing Chunk File",
			Given:    "A valid existing chunk file with data",
			When:     "Opening the file in readonly mode",
			Then:     "Should return a valid File instance with correct count",
			GivenDetail: givenDetail{
				readonlyAtFirst: true,
				setupFeeds: []*Feed{
					createTestFeed(1),
					createTestFeed(2),
					createTestFeed(3),
				},
			},
			WhenDetail: whenDetail{},
			ThenExpected: thenExpected{
				count: 3,
			},
		},
		{
			Scenario: "Invalid Configuration",
			Given:    "An invalid configuration with empty path",
			When:     "Creating a new chunk file",
			Then:     "Should return an error",
			GivenDetail: givenDetail{
				path: "", // Empty path
			},
			WhenDetail: whenDetail{},
			ThenExpected: thenExpected{
				err: "validate config: path is required",
			},
		},
	}

	for _, tt := range tests {
		t.Run(tt.Scenario, func(t *testing.T) {
			// Given. Allocate a temp path unless the case deliberately uses
			// an invalid (empty) one.
			if tt.GivenDetail.path == "" && tt.ThenExpected.err == "" {
				tt.GivenDetail.path = createTempFile(t)
				defer cleanupTempFile(tt.GivenDetail.path)
			}
			// Pre-populate the file in read-write mode when the case needs
			// existing data.
			if len(tt.GivenDetail.setupFeeds) > 0 {
				initialFile, err := new("test", &Config{
					Path:            tt.GivenDetail.path,
					ReadonlyAtFirst: false,
				}, Dependencies{})
				Expect(err).NotTo(HaveOccurred())
				err = initialFile.Append(context.Background(), tt.GivenDetail.setupFeeds, nil)
				Expect(err).NotTo(HaveOccurred())
				initialFile.Close()
			}

			// When.
			file, err := new("test", &Config{
				Path:            tt.GivenDetail.path,
				ReadonlyAtFirst: tt.GivenDetail.readonlyAtFirst,
			}, Dependencies{})

			// Then.
			if tt.ThenExpected.err != "" {
				Expect(err).To(HaveOccurred())
				Expect(err.Error()).To(ContainSubstring(tt.ThenExpected.err))
			} else {
				Expect(err).NotTo(HaveOccurred())
				Expect(file).NotTo(BeNil())
				Expect(file.Count(context.Background())).To(Equal(tt.ThenExpected.count))
				file.Close()
			}
		})
	}
}
// TestFileModeSwitching covers EnsureReadonly: switching a read-write file
// to readonly, and the idempotent no-op on an already-readonly file.
// Readonly-ness is verified by asserting that a subsequent Append fails.
func TestFileModeSwitching(t *testing.T) {
	RegisterTestingT(t)

	tests := []struct {
		scenario      string
		given         string
		when          string
		then          string
		initialMode   bool // true for readonly
		expectedError string
	}{
		{
			scenario:      "ReadWrite to ReadOnly Switch",
			given:         "a read-write mode chunk file",
			when:          "calling EnsureReadonly()",
			then:          "file should switch to read-only mode",
			initialMode:   false,
			expectedError: "",
		},
		{
			scenario:      "Already ReadOnly",
			given:         "a read-only mode chunk file",
			when:          "calling EnsureReadonly()",
			then:          "operation should return quickly",
			initialMode:   true,
			expectedError: "",
		},
	}

	for _, tt := range tests {
		t.Run(tt.scenario, func(t *testing.T) {
			// Setup: create the file on disk first, since readonly mode
			// requires an existing file.
			path := createTempFile(t)
			defer cleanupTempFile(path)

			// Create initial file.
			initialConfig := Config{
				Path:            path,
				ReadonlyAtFirst: false,
			}
			initialFile, err := new("test", &initialConfig, Dependencies{})
			Expect(err).NotTo(HaveOccurred())
			initialFile.Close()

			// Open file with specified mode.
			config := Config{
				Path:            path,
				ReadonlyAtFirst: tt.initialMode,
			}
			f, err := new("test", &config, Dependencies{})
			Expect(err).NotTo(HaveOccurred())
			defer f.Close()

			// Execute.
			err = f.EnsureReadonly(context.Background())

			// Verify.
			if tt.expectedError != "" {
				Expect(err).To(HaveOccurred())
				Expect(err.Error()).To(ContainSubstring(tt.expectedError))
			} else {
				Expect(err).NotTo(HaveOccurred())
				// Verify it's now in readonly mode by attempting an append.
				appendErr := f.Append(context.Background(), []*Feed{createTestFeed(1)}, nil)
				Expect(appendErr).To(HaveOccurred())
				Expect(appendErr.Error()).To(ContainSubstring("file is readonly"))
			}
		})
	}
}
// TestAppend covers Append: single-feed and batch appends (verifying each
// feed can be read back at its reported offset) and rejection in readonly
// mode.
func TestAppend(t *testing.T) {
	RegisterTestingT(t)

	type givenDetail struct {
		readonly bool
	}
	type whenDetail struct {
		appendFeeds []*Feed
	}
	type thenExpected struct {
		count uint32
		err   string
	}

	tests := []test.Case[givenDetail, whenDetail, thenExpected]{
		{
			Scenario: "Append Single Feed",
			Given:    "A read-write mode chunk file",
			When:     "Adding a single feed",
			Then:     "Should successfully write the feed",
			GivenDetail: givenDetail{
				readonly: false,
			},
			WhenDetail: whenDetail{
				appendFeeds: []*Feed{createTestFeed(1)},
			},
			ThenExpected: thenExpected{
				count: 1,
			},
		},
		{
			Scenario: "Batch Append Multiple Feeds",
			Given:    "A read-write mode chunk file",
			When:     "Adding multiple feeds at once",
			Then:     "Should write all feeds as a single transaction",
			GivenDetail: givenDetail{
				readonly: false,
			},
			WhenDetail: whenDetail{
				appendFeeds: []*Feed{
					createTestFeed(1),
					createTestFeed(2),
					createTestFeed(3),
				},
			},
			ThenExpected: thenExpected{
				count: 3,
			},
		},
		{
			Scenario: "Append in ReadOnly Mode",
			Given:    "A read-only mode chunk file",
			When:     "Attempting to add a feed",
			Then:     "Should fail with readonly error",
			GivenDetail: givenDetail{
				readonly: true,
			},
			WhenDetail: whenDetail{
				appendFeeds: []*Feed{createTestFeed(1)},
			},
			ThenExpected: thenExpected{
				err: "file is readonly",
			},
		},
	}

	for _, tt := range tests {
		t.Run(tt.Scenario, func(t *testing.T) {
			// Given.
			path := createTempFile(t)
			defer cleanupTempFile(path)
			if tt.GivenDetail.readonly {
				// Create and close initial file for readonly test.
				rwFile, err := new("test", &Config{Path: path}, Dependencies{})
				Expect(err).NotTo(HaveOccurred())
				rwFile.Close()
			}
			f, err := new("test", &Config{
				Path:            path,
				ReadonlyAtFirst: tt.GivenDetail.readonly,
			}, Dependencies{})
			Expect(err).NotTo(HaveOccurred())
			defer f.Close()

			// When. Collect the offsets reported via the onSuccess callback.
			var offsets []uint64
			err = f.Append(context.Background(), tt.WhenDetail.appendFeeds, func(_ *Feed, offset uint64) error {
				offsets = append(offsets, offset)
				return nil
			})

			// Then.
			if tt.ThenExpected.err != "" {
				Expect(err).To(HaveOccurred())
				Expect(err.Error()).To(ContainSubstring(tt.ThenExpected.err))
			} else {
				Expect(err).NotTo(HaveOccurred())
				Expect(f.Count(context.Background())).To(Equal(tt.ThenExpected.count))
				// Verify each feed can be read back.
				for i, offset := range offsets {
					feed, readErr := f.Read(context.Background(), offset)
					Expect(readErr).NotTo(HaveOccurred())
					Expect(feed.ID).To(Equal(tt.WhenDetail.appendFeeds[i].ID))
				}
			}
		})
	}
}
// TestRead verifies Read for valid offsets in both file modes, and checks
// that offsets below dataStart or beyond the append position are rejected
// with descriptive errors.
func TestRead(t *testing.T) {
	RegisterTestingT(t)

	tests := []struct {
		scenario    string
		given       string
		when        string
		then        string
		readonly    bool    // reopen the file in readonly (mmap) mode
		setupFeeds  []*Feed // feeds written before the read attempt
		readOffset  uint64  // offset to read; a dataStart sentinel is swapped for the real first offset
		expectedErr string  // expected error substring; empty means success
	}{
		{
			scenario:    "Read from Valid Offset",
			given:       "a chunk file with feeds",
			when:        "reading with a valid offset",
			then:        "should return the correct feed",
			readonly:    false,
			setupFeeds:  []*Feed{createTestFeed(1)},
			readOffset:  uint64(dataStart), // Will be adjusted in the test
			expectedErr: "",
		},
		{
			scenario:    "Read from ReadOnly Mode",
			given:       "a read-only chunk file with feeds",
			when:        "reading with a valid offset",
			then:        "should return the correct feed using mmap",
			readonly:    true,
			setupFeeds:  []*Feed{createTestFeed(2)},
			readOffset:  uint64(dataStart), // Will be adjusted in the test
			expectedErr: "",
		},
		{
			scenario:    "Read with Small Offset",
			given:       "a chunk file with feeds",
			when:        "reading with an offset smaller than dataStart",
			then:        "should return 'offset too small' error",
			readonly:    false,
			setupFeeds:  []*Feed{createTestFeed(3)},
			readOffset:  uint64(dataStart - 1),
			expectedErr: "offset too small",
		},
		{
			scenario:    "Read with Large Offset",
			given:       "a chunk file with feeds",
			when:        "reading with an offset larger than appendOffset",
			then:        "should return 'offset too large' error",
			readonly:    false,
			setupFeeds:  []*Feed{createTestFeed(4)},
			readOffset:  999999, // Definitely beyond appendOffset
			expectedErr: "offset too large",
		},
	}

	for _, tt := range tests {
		t.Run(tt.scenario, func(t *testing.T) {
			// Setup
			path := createTempFile(t)
			defer cleanupTempFile(path)

			// Create and populate initial file (always read-write, since
			// readonly files cannot be appended to).
			initialConfig := Config{
				Path:            path,
				ReadonlyAtFirst: false,
			}
			initialFile, err := new("test", &initialConfig, Dependencies{})
			Expect(err).NotTo(HaveOccurred())

			var validOffset uint64
			if len(tt.setupFeeds) > 0 {
				// Track the first offset for later reading
				var firstOffset uint64
				err = initialFile.Append(context.Background(), tt.setupFeeds, func(_ *Feed, offset uint64) error {
					if firstOffset == 0 {
						firstOffset = offset
					}
					return nil
				})
				Expect(err).NotTo(HaveOccurred())
				validOffset = firstOffset
			}
			initialFile.Close()

			// Reopen with specified mode
			config := Config{
				Path:            path,
				ReadonlyAtFirst: tt.readonly,
			}
			f, err := new("test", &config, Dependencies{})
			Expect(err).NotTo(HaveOccurred())
			defer f.Close()

			// Use valid offset if needed: the dataStart sentinel means
			// "read the first appended feed wherever it actually landed".
			readOffset := tt.readOffset
			if readOffset == uint64(dataStart) && validOffset > 0 {
				readOffset = validOffset
			}

			// Execute
			feed, err := f.Read(context.Background(), readOffset)

			// Verify
			if tt.expectedErr != "" {
				Expect(err).To(HaveOccurred())
				Expect(err.Error()).To(ContainSubstring(tt.expectedErr))
			} else {
				Expect(err).NotTo(HaveOccurred())
				Expect(feed).NotTo(BeNil())
				Expect(feed.ID).To(Equal(tt.setupFeeds[0].ID))
			}
		})
	}
}
// TestRange verifies Range iteration over all feeds (in both file modes)
// and that an error returned from the iterator stops iteration and is
// propagated to the caller.
func TestRange(t *testing.T) {
	RegisterTestingT(t)

	tests := []struct {
		scenario      string
		given         string
		when          string
		then          string
		readonly      bool    // reopen the file in readonly (mmap) mode
		setupFeeds    []*Feed // feeds written before ranging
		earlyExit     bool    // make the iterator return an error on its first call
		expectedCount int     // how many feeds the iterator should visit
		expectedErr   string  // expected error substring; empty means success
	}{
		{
			scenario: "Range All Feeds",
			given:    "a chunk file with multiple feeds",
			when:     "calling Range()",
			then:     "iterator should visit each feed in sequence",
			readonly: false,
			setupFeeds: []*Feed{
				createTestFeed(1),
				createTestFeed(2),
				createTestFeed(3),
			},
			earlyExit:     false,
			expectedCount: 3,
			expectedErr:   "",
		},
		{
			scenario: "Range with Early Exit",
			given:    "a chunk file with multiple feeds",
			when:     "calling Range() and returning an error from iterator",
			then:     "range should stop and return that error",
			readonly: false,
			setupFeeds: []*Feed{
				createTestFeed(4),
				createTestFeed(5),
				createTestFeed(6),
			},
			earlyExit:     true,
			expectedCount: 1, // Should stop after first feed
			expectedErr:   "early exit",
		},
		{
			scenario: "Range in ReadOnly Mode",
			given:    "a read-only chunk file with feeds",
			when:     "calling Range()",
			then:     "should use mmap and correctly visit all feeds",
			readonly: true,
			setupFeeds: []*Feed{
				createTestFeed(7),
				createTestFeed(8),
			},
			earlyExit:     false,
			expectedCount: 2,
			expectedErr:   "",
		},
	}

	for _, tt := range tests {
		t.Run(tt.scenario, func(t *testing.T) {
			// Setup
			path := createTempFile(t)
			defer cleanupTempFile(path)

			// Create and populate initial file (read-write so we can append).
			initialConfig := Config{
				Path:            path,
				ReadonlyAtFirst: false,
			}
			initialFile, err := new("test", &initialConfig, Dependencies{})
			Expect(err).NotTo(HaveOccurred())
			if len(tt.setupFeeds) > 0 {
				err = initialFile.Append(context.Background(), tt.setupFeeds, nil)
				Expect(err).NotTo(HaveOccurred())
			}
			initialFile.Close()

			// Reopen with specified mode
			config := Config{
				Path:            path,
				ReadonlyAtFirst: tt.readonly,
			}
			f, err := new("test", &config, Dependencies{})
			Expect(err).NotTo(HaveOccurred())
			defer f.Close()

			// Execute: count visits, optionally aborting on the first one.
			visitCount := 0
			err = f.Range(context.Background(), func(feed *Feed, offset uint64) (err error) {
				visitCount++
				if tt.earlyExit && visitCount == 1 {
					return errors.New("early exit")
				}
				return nil
			})

			// Verify
			if tt.expectedErr != "" {
				Expect(err).To(HaveOccurred())
				Expect(err.Error()).To(ContainSubstring(tt.expectedErr))
			} else {
				Expect(err).NotTo(HaveOccurred())
			}
			Expect(visitCount).To(Equal(tt.expectedCount))
		})
	}
}
func createTempFile(t *testing.T) string {
dir, err := os.MkdirTemp("", "chunk-test")
if err != nil {
t.Fatalf("Failed to create temp dir: %v", err)
}
return filepath.Join(dir, "test.chunk")
}
func cleanupTempFile(path string) {
os.RemoveAll(filepath.Dir(path))
}
// createTestFeed builds a minimal Feed with the given ID, a single fixed
// label, the current time, and two 3-dimensional vectors — enough to
// exercise a full encode/append/read round trip.
func createTestFeed(id uint64) *Feed {
	labels := model.Labels{model.Label{Key: "test", Value: "value"}}
	vectors := [][]float32{
		{1.0, 2.0, 3.0},
		{4.0, 5.0, 6.0},
	}

	return &Feed{
		Feed: &model.Feed{
			ID:     id,
			Labels: labels,
			Time:   time.Now(),
		},
		Vectors: vectors,
	}
}

View File

@@ -0,0 +1,296 @@
package chunk
import (
"bytes"
"encoding/binary"
"hash/crc32"
"io"
"math"
"time"
"github.com/pkg/errors"
"github.com/glidea/zenfeed/pkg/model"
binaryutil "github.com/glidea/zenfeed/pkg/util/binary"
"github.com/glidea/zenfeed/pkg/util/buffer"
)
const (
	// feedHeaderSize is the size of the record header (length + checksum).
	feedHeaderSize = 8 // uint32 length + uint32 checksum
)

var (
	// errChecksumMismatch is returned when a record's stored CRC32 does not
	// match the checksum recomputed over its payload bytes.
	errChecksumMismatch = errors.New("checksum mismatch")
	// crc32Table is the precomputed IEEE polynomial table shared by all
	// record checksum calculations.
	crc32Table = crc32.MakeTable(crc32.IEEE)
)

// Feed is the feed model in the chunk file.
type Feed struct {
	*model.Feed
	// Vectors holds the embedding vectors persisted alongside the feed.
	Vectors [][]float32
}
// encodeTo encodes the Feed into the provided buffer, including a length prefix and checksum.
// It writes the record structure: [payloadLen(uint32)][checksum(uint32)][payload...].
// On payload-encoding failure the buffer is rolled back to its original length,
// so no partial record is left behind.
func (f *Feed) encodeTo(buf *buffer.Bytes) error {
	// NOTE(review): the re-slice below assumes EnsureRemaining guarantees
	// cap(buf.B) >= buf.Len()+feedHeaderSize — confirm in the buffer package.
	buf.EnsureRemaining(4 * 1024)

	// 1. Reserve space for length and checksum.
	startOffset := buf.Len()
	headerPos := buf.Len()                   // Position where header starts (== startOffset).
	buf.B = buf.B[:headerPos+feedHeaderSize] // Extend buffer to include header space.
	payloadStartOffset := buf.Len()          // Position where payload starts.

	// 2. Encode the actual payload.
	if err := f.encodePayload(buf); err != nil {
		// If payload encoding fails, revert the buffer to its initial state.
		buf.B = buf.B[:startOffset]
		return errors.Wrap(err, "encode payload")
	}
	payloadEndOffset := buf.Len()

	// 3. Calculate payload length and checksum over the payload bytes only
	// (the header itself is not covered by the checksum).
	payloadLen := uint32(payloadEndOffset - payloadStartOffset)
	payloadSlice := buf.Bytes()[payloadStartOffset:payloadEndOffset]
	checksum := crc32.Checksum(payloadSlice, crc32Table)

	// 4. Write the actual length and checksum into the reserved space.
	binary.LittleEndian.PutUint32(buf.Bytes()[headerPos:headerPos+4], payloadLen)
	binary.LittleEndian.PutUint32(buf.Bytes()[headerPos+4:headerPos+8], checksum)
	return nil
}
// encodePayload encodes the core fields (ID, Time, Labels, Vectors) into the buffer.
// Payload layout: [id(uint64)][time.UnixNano(uint64)][labels][vectors]
// (decodePayload reads these back with little-endian binary.Read).
func (f *Feed) encodePayload(w io.Writer) error {
	// Write ID.
	if err := binaryutil.WriteUint64(w, f.ID); err != nil {
		return errors.Wrap(err, "write id")
	}
	// Write time as nanoseconds since the Unix epoch.
	if err := binaryutil.WriteUint64(w, uint64(f.Time.UnixNano())); err != nil {
		return errors.Wrap(err, "write time")
	}
	// Write labels.
	if err := f.encodeLabels(w); err != nil {
		return errors.Wrap(err, "encode labels")
	}
	// Write vectors.
	if err := f.encodeVectors(w); err != nil {
		return errors.Wrap(err, "encode vectors")
	}
	return nil
}
// encodeLabels writes the label data to the writer.
// Layout: [count(uint32)] then, for each label, [key][value] encoded by
// binaryutil.WriteString.
func (f *Feed) encodeLabels(w io.Writer) error {
	// Guard BEFORE converting to uint32: the original compared the raw int
	// against math.MaxUint32, which does not compile on platforms with a
	// 32-bit int (the untyped constant overflows int), and the conversion
	// had already happened, silently truncating the count. Widening to
	// uint64 makes the comparison portable and keeps the truncation safe.
	if uint64(len(f.Labels)) > math.MaxUint32 {
		return errors.New("too many labels")
	}
	labelsLen := uint32(len(f.Labels))
	if err := binaryutil.WriteUint32(w, labelsLen); err != nil {
		return errors.Wrap(err, "write labels count")
	}
	for i, label := range f.Labels {
		if err := binaryutil.WriteString(w, label.Key); err != nil {
			return errors.Wrapf(err, "write label key index %d", i)
		}
		if err := binaryutil.WriteString(w, label.Value); err != nil {
			return errors.Wrapf(err, "write label value index %d", i)
		}
	}
	return nil
}
// encodeVectors writes the vector data to the writer.
// Layout: [count(uint32)] and, when count > 0, [dimension(uint32)] followed
// by count*dimension little-endian float32 values. Every vector must share
// the dimension of the first one.
func (f *Feed) encodeVectors(w io.Writer) error {
	// Guard BEFORE converting to uint32: comparing the raw int against
	// math.MaxUint32 does not compile on platforms with a 32-bit int
	// (untyped constant overflows int), and converting first would
	// silently truncate. Widening to uint64 is portable and safe.
	if uint64(len(f.Vectors)) > math.MaxUint32 {
		return errors.New("too many vectors")
	}
	vectorCount := uint32(len(f.Vectors))
	if err := binaryutil.WriteUint32(w, vectorCount); err != nil {
		return errors.Wrap(err, "write vectors count")
	}
	if vectorCount == 0 {
		return nil // Nothing more to write if there are no vectors.
	}

	// Write dimension (same portability/truncation guard as above).
	if uint64(len(f.Vectors[0])) > math.MaxUint32 {
		return errors.New("vector dimension exceeds maximum uint32")
	}
	dimension := uint32(len(f.Vectors[0]))
	if err := binaryutil.WriteUint32(w, dimension); err != nil {
		return errors.Wrap(err, "write vector dimension")
	}

	// Write vector data.
	var floatBuf [4]byte
	for i, vec := range f.Vectors {
		// Ensure vector has the correct dimension.
		if uint32(len(vec)) != dimension {
			return errors.Errorf("vector %d has inconsistent dimension %d, expected %d", i, len(vec), dimension)
		}
		for _, val := range vec { // Avoid using binary.Write for performance.
			bits := math.Float32bits(val)
			binary.LittleEndian.PutUint32(floatBuf[:], bits)
			if _, err := w.Write(floatBuf[:]); err != nil {
				return errors.Wrapf(err, "write for vector %d, value %f", i, val)
			}
		}
	}
	return nil
}
// validateFrom reads one record (header + payload) from r into buf and
// verifies the payload's CRC32 checksum against the one stored in the
// header. On success, buf contains [header][payload] appended at its
// previous length; the caller is responsible for skipping the header.
func (f *Feed) validateFrom(r io.Reader, buf *buffer.Bytes) (err error) {
	// 1. Read header (length and checksum).
	var payloadLen, expectedChecksum uint32
	startOffset := buf.Len()
	if _, err := io.CopyN(buf, r, feedHeaderSize); err != nil {
		return errors.Wrap(err, "read header")
	}
	payloadLen = binary.LittleEndian.Uint32(buf.B[startOffset : startOffset+4])
	expectedChecksum = binary.LittleEndian.Uint32(buf.B[startOffset+4:])

	// 2. Read payload, calculate checksum simultaneously: the tee reader
	// feeds every byte copied into buf through the CRC32 hasher as well.
	buf.EnsureRemaining(int(payloadLen))
	limitedReader := io.LimitReader(r, int64(payloadLen))
	checksumWriter := crc32.New(crc32Table)
	teeReader := io.TeeReader(limitedReader, checksumWriter)
	// Read the exact payload length into the buffer.
	if _, err := io.CopyN(buf, teeReader, int64(payloadLen)); err != nil {
		// EOF, may be writing not complete.
		return errors.Wrap(err, "read payload")
	}

	// 3. Verify checksum.
	calculatedChecksum := checksumWriter.Sum32()
	if calculatedChecksum != expectedChecksum {
		return errors.Wrapf(errChecksumMismatch, "expected %x, got %x", expectedChecksum, calculatedChecksum)
	}
	return nil
}
// decodeFrom decodes the feed from the reader, validating length and checksum.
// It expects the format: [payloadLen(uint32)][checksum(uint32)][payload...].
// NOTE(review): indexing buf.B[feedHeaderSize:] assumes the pooled buffer
// returned by buffer.Get() starts empty — confirm in the buffer package.
func (f *Feed) decodeFrom(r io.Reader) (err error) {
	buf := buffer.Get()
	defer buffer.Put(buf)

	if err := f.validateFrom(r, buf); err != nil {
		return errors.Wrap(err, "validate payload")
	}

	// Skip the 8-byte header; the remainder of the buffer is the payload.
	payloadReader := bytes.NewReader(buf.B[feedHeaderSize:])
	if err := f.decodePayload(payloadReader); err != nil {
		return errors.Wrap(err, "decode payload")
	}
	return nil
}
// decodePayload decodes the core fields from the reader.
// Mirrors encodePayload: [id(uint64)][time(int64 UnixNano)][labels][vectors],
// all little-endian.
func (f *Feed) decodePayload(r io.Reader) error {
	f.Feed = &model.Feed{} // Ensure Feed is initialized.
	// Read ID.
	if err := binary.Read(r, binary.LittleEndian, &f.ID); err != nil {
		return errors.Wrap(err, "read id")
	}
	// Read time (stored as nanoseconds since the Unix epoch; normalized to UTC).
	var timestamp int64
	if err := binary.Read(r, binary.LittleEndian, &timestamp); err != nil {
		return errors.Wrap(err, "read time")
	}
	f.Time = time.Unix(0, timestamp).In(time.UTC)
	// Read labels.
	if err := f.decodeLabels(r); err != nil {
		return errors.Wrap(err, "decode labels")
	}
	// Read vectors.
	if err := f.decodeVectors(r); err != nil {
		return errors.Wrap(err, "decode vectors")
	}
	return nil
}
// decodeLabels reads the label count followed by that many key/value
// string pairs, populating f.Labels in stream order.
func (f *Feed) decodeLabels(r io.Reader) error {
	var count uint32
	if err := binary.Read(r, binary.LittleEndian, &count); err != nil {
		return errors.Wrap(err, "read labels count")
	}

	f.Labels = make(model.Labels, count)
	for i := uint32(0); i < count; i++ {
		key, err := binaryutil.ReadString(r)
		if err != nil {
			return errors.Wrapf(err, "read label key index %d", i)
		}
		value, err := binaryutil.ReadString(r)
		if err != nil {
			return errors.Wrapf(err, "read label value index %d", i)
		}
		f.Labels[i] = model.Label{Key: key, Value: value}
	}

	return nil
}
// decodeVectors reads the vector count, the shared dimension, and then
// count*dimension little-endian float32 values into f.Vectors. All vectors
// share one contiguous backing array for better cache locality.
func (f *Feed) decodeVectors(r io.Reader) error {
	var count uint32
	if err := binary.Read(r, binary.LittleEndian, &count); err != nil {
		return errors.Wrap(err, "read vectors count")
	}
	if count == 0 {
		f.Vectors = nil // Ensure vectors is nil if count is 0.
		return nil
	}

	f.Vectors = make([][]float32, count)
	var dim uint32
	if err := binary.Read(r, binary.LittleEndian, &dim); err != nil {
		return errors.Wrap(err, "read vector dimension")
	}

	// One contiguous allocation backs every vector slice.
	backing := make([]float32, uint64(count)*uint64(dim))
	for i := uint32(0); i < count; i++ {
		start := uint64(i) * uint64(dim)
		vec := backing[start : start+uint64(dim)]
		if err := binary.Read(r, binary.LittleEndian, vec); err != nil {
			return errors.Wrapf(err, "read vector data for vector %d", i)
		}
		f.Vectors[i] = vec
	}

	return nil
}

View File

@@ -0,0 +1,14 @@
package index
import (
"context"
"io"
)
// Codec defines interface for encoding and decoding index.
// It allows an index to be persisted to, and restored from, a byte stream.
type Codec interface {
	// EncodeTo encodes the index to the given writer.
	EncodeTo(ctx context.Context, w io.Writer) (err error)
	// DecodeFrom decodes the index from the given reader.
	DecodeFrom(ctx context.Context, r io.Reader) (err error)
}

View File

@@ -0,0 +1,436 @@
package inverted
import (
"bytes"
"context"
"encoding/binary"
"io"
"maps"
"sync"
"github.com/pkg/errors"
"github.com/glidea/zenfeed/pkg/component"
"github.com/glidea/zenfeed/pkg/model"
"github.com/glidea/zenfeed/pkg/storage/feed/block/index"
"github.com/glidea/zenfeed/pkg/telemetry"
telemetrymodel "github.com/glidea/zenfeed/pkg/telemetry/model"
binaryutil "github.com/glidea/zenfeed/pkg/util/binary"
)
// --- Interface code block ---

// Index is an in-memory inverted index mapping label key/value pairs to
// item IDs. It also implements index.Codec for persistence.
type Index interface {
	component.Component
	index.Codec

	// Search returns item IDs matching the given label and value.
	Search(ctx context.Context, label string, eq bool, value string) (ids map[uint64]struct{})
	// Add adds item to the index.
	// If label or value in labels is empty, it will be ignored.
	// If value is too long, it will be ignored,
	// because does not support regex search, so long value is not useful.
	Add(ctx context.Context, id uint64, labels model.Labels)
}

// Config holds configuration for the inverted index (currently empty).
type Config struct{}

// Dependencies holds external dependencies (currently empty).
type Dependencies struct{}

const (
	// maxLabelValueLength is the longest label value Add will index;
	// longer values are silently skipped.
	maxLabelValueLength = 64
)

var (
	// headerMagicNumber identifies the serialized index format
	// (the bytes spell "wys is awesome" padded with two NULs).
	headerMagicNumber = []byte{0x77, 0x79, 0x73, 0x20, 0x69, 0x73, 0x20,
		0x61, 0x77, 0x65, 0x73, 0x6f, 0x6d, 0x65, 0x00, 0x00}
	// headerVersion is the current serialization format version.
	headerVersion = uint8(1)
)
// --- Factory code block ---

// Factory constructs Index instances.
type Factory component.Factory[Index, Config, Dependencies]

// NewFactory returns a factory producing the real inverted index, or a
// mockIndex when mock options are supplied (test-only path).
func NewFactory(mockOn ...component.MockOption) Factory {
	if len(mockOn) > 0 {
		return component.FactoryFunc[Index, Config, Dependencies](
			func(instance string, config *Config, dependencies Dependencies) (Index, error) {
				m := &mockIndex{}
				component.MockOptions(mockOn).Apply(&m.Mock)
				return m, nil
			},
		)
	}
	return component.FactoryFunc[Index, Config, Dependencies](new)
}

// new builds the real inverted index with small pre-sized maps.
func new(instance string, config *Config, dependencies Dependencies) (Index, error) {
	return &idx{
		Base: component.New(&component.BaseConfig[Config, Dependencies]{
			Name:         "FeedInvertedIndex",
			Instance:     instance,
			Config:       config,
			Dependencies: dependencies,
		}),
		m:   make(map[string]map[string]map[uint64]struct{}, 64),
		ids: make(map[uint64]struct{}, 64),
	}, nil
}
// --- Implementation code block ---

// idx is the concrete inverted index. The maps below are guarded by mu.
type idx struct {
	*component.Base[Config, Dependencies]

	// Label -> values -> ids.
	m map[string]map[string]map[uint64]struct{}
	// All ids.
	ids map[uint64]struct{}
	// mu guards m and ids (RLock for Search/EncodeTo, Lock for Add/DecodeFrom).
	mu sync.RWMutex
}
// Search returns the item IDs matching (label, value) under the eq
// semantics. An empty value selects on the presence (eq == false) or
// absence (eq == true) of the label itself.
func (idx *idx) Search(ctx context.Context, label string, eq bool, value string) (ids map[uint64]struct{}) {
	ctx = telemetry.StartWith(ctx, append(idx.TelemetryLabels(), telemetrymodel.KeyOperation, "Search")...)
	defer func() { telemetry.End(ctx, nil) }()

	idx.mu.RLock()
	defer idx.mu.RUnlock()

	if value != "" {
		return idx.searchNonEmptyValue(label, eq, value)
	}

	return idx.searchEmptyValue(label, eq)
}
// Add indexes id under every usable label and records it in the global ID
// set. Labels with an empty key, an empty value, or a value longer than
// maxLabelValueLength are skipped.
func (idx *idx) Add(ctx context.Context, id uint64, labels model.Labels) {
	ctx = telemetry.StartWith(ctx, append(idx.TelemetryLabels(), telemetrymodel.KeyOperation, "Add")...)
	defer func() { telemetry.End(ctx, nil) }()

	idx.mu.Lock()
	defer idx.mu.Unlock()

	// Index each usable label.
	for _, l := range labels {
		unusable := l.Key == "" || l.Value == "" || len(l.Value) > maxLabelValueLength
		if unusable {
			continue
		}

		values, ok := idx.m[l.Key]
		if !ok {
			values = make(map[string]map[uint64]struct{})
			idx.m[l.Key] = values
		}
		set, ok := values[l.Value]
		if !ok {
			set = make(map[uint64]struct{})
			values[l.Value] = set
		}
		set[id] = struct{}{}
	}

	// Record in the global ID set.
	idx.ids[id] = struct{}{}
}
// EncodeTo serializes the index (header, then label/value/ID entries) to w.
// It holds the read lock for the duration, so concurrent Add/DecodeFrom
// calls are excluded while encoding.
func (idx *idx) EncodeTo(ctx context.Context, w io.Writer) (err error) {
	ctx = telemetry.StartWith(ctx, append(idx.TelemetryLabels(), telemetrymodel.KeyOperation, "EncodeTo")...)
	defer func() { telemetry.End(ctx, err) }()

	idx.mu.RLock()
	defer idx.mu.RUnlock()

	if err := idx.writeHeader(w); err != nil {
		return errors.Wrap(err, "write header")
	}
	if err := idx.writeLabels(w); err != nil {
		return errors.Wrap(err, "write labels")
	}
	return nil
}
// DecodeFrom decodes the index from the given reader.
// It reallocates and repopulates the in-memory maps (via readLabels) under
// the write lock, replacing any previous contents.
func (idx *idx) DecodeFrom(ctx context.Context, r io.Reader) (err error) {
	ctx = telemetry.StartWith(ctx, append(idx.TelemetryLabels(), telemetrymodel.KeyOperation, "DecodeFrom")...)
	defer func() { telemetry.End(ctx, err) }()

	idx.mu.Lock()
	defer idx.mu.Unlock()

	// Read header.
	if err := idx.readHeader(r); err != nil {
		return errors.Wrap(err, "read header")
	}
	// Read labels.
	if err := idx.readLabels(r); err != nil {
		return errors.Wrap(err, "read labels")
	}
	return nil
}
// searchEmptyValue implements Search for an empty target value.
// eq == true  -> IDs that do NOT carry the label at all.
// eq == false -> IDs that DO carry the label (with any value).
// Caller must hold at least the read lock.
func (idx *idx) searchEmptyValue(label string, eq bool) map[uint64]struct{} {
	// Collect every ID that has this label, under any value.
	// (Ranging over a missing label yields a nil map, i.e. no iterations.)
	withLabel := make(map[uint64]struct{})
	for _, ids := range idx.m[label] {
		for id := range ids {
			withLabel[id] = struct{}{}
		}
	}

	// !eq: "label != ''" means the label is present.
	if !eq {
		return withLabel
	}

	// eq: "label == ''" means the label is absent — complement within all IDs.
	absent := maps.Clone(idx.ids)
	for id := range withLabel {
		delete(absent, id)
	}

	return absent
}
// searchNonEmptyValue implements Search for a non-empty target value.
// eq == true  -> IDs carrying the exact label=value pair.
// eq == false -> all IDs except those carrying the exact pair.
// Caller must hold at least the read lock.
func (idx *idx) searchNonEmptyValue(label string, eq bool, value string) map[uint64]struct{} {
	// Chained lookup on a missing label yields a nil inner map, and
	// indexing a nil map safely yields nil again.
	matching := idx.m[label][value]

	if eq {
		if matching == nil {
			return make(map[uint64]struct{}) // No such label or value.
		}
		// Clone so callers cannot mutate the index's own set.
		return maps.Clone(matching)
	}

	// Complement: every known ID minus the exact matches.
	result := maps.Clone(idx.ids)
	for id := range matching {
		delete(result, id)
	}

	return result
}
// writeHeader emits the magic number followed by the single version byte.
func (idx *idx) writeHeader(w io.Writer) error {
	if _, err := w.Write(headerMagicNumber); err != nil {
		return errors.Wrap(err, "write header magic number")
	}

	version := []byte{headerVersion}
	if _, err := w.Write(version); err != nil {
		return errors.Wrap(err, "write header version")
	}

	return nil
}
// writeLabels serializes the index body: the total unique ID count, the
// label count, then one entry per label (see writeLabelEntry).
func (idx *idx) writeLabels(w io.Writer) error {
	// Total unique ID count (lets the reader pre-size its ID set).
	totalIDs := uint32(len(idx.ids))
	if err := binary.Write(w, binary.LittleEndian, totalIDs); err != nil {
		return errors.Wrap(err, "write total id count")
	}

	// Number of distinct labels.
	numLabels := uint32(len(idx.m))
	if err := binary.Write(w, binary.LittleEndian, numLabels); err != nil {
		return errors.Wrap(err, "write label count")
	}

	// One entry per label.
	for label, values := range idx.m {
		if err := idx.writeLabelEntry(w, label, values); err != nil {
			return errors.Wrap(err, "write label entry")
		}
	}

	return nil
}
// writeLabelEntry writes one label: its name, its value count, then one
// value entry per distinct value (see writeValueEntry).
func (idx *idx) writeLabelEntry(w io.Writer, label string, values map[string]map[uint64]struct{}) error {
	if err := binaryutil.WriteString(w, label); err != nil {
		return errors.Wrap(err, "write label")
	}

	numValues := uint32(len(values))
	if err := binary.Write(w, binary.LittleEndian, numValues); err != nil {
		return errors.Wrap(err, "write value count for label")
	}

	for value, ids := range values {
		if err := idx.writeValueEntry(w, value, ids); err != nil {
			return errors.Wrap(err, "write value entry")
		}
	}

	return nil
}
// writeValueEntry writes one value: its string, its ID count, then each
// associated ID as a little-endian uint64.
func (idx *idx) writeValueEntry(w io.Writer, value string, ids map[uint64]struct{}) error {
	if err := binaryutil.WriteString(w, value); err != nil {
		return errors.Wrap(err, "write value")
	}

	numIDs := uint32(len(ids))
	if err := binary.Write(w, binary.LittleEndian, numIDs); err != nil {
		return errors.Wrap(err, "write id count for value")
	}

	for id := range ids {
		if err := binary.Write(w, binary.LittleEndian, id); err != nil {
			return errors.Wrap(err, "write id")
		}
	}

	return nil
}
// readHeader consumes and validates the magic number and version byte.
func (idx *idx) readHeader(r io.Reader) error {
	magic := make([]byte, len(headerMagicNumber))
	if _, err := io.ReadFull(r, magic); err != nil {
		return errors.Wrap(err, "read header magic number")
	}
	if !bytes.Equal(magic, headerMagicNumber) {
		return errors.New("invalid magic number")
	}

	var version [1]byte
	if _, err := io.ReadFull(r, version[:]); err != nil {
		return errors.Wrap(err, "read header version")
	}
	if version[0] != headerVersion {
		return errors.New("invalid version")
	}

	return nil
}
// readLabels reads the index body written by writeLabels, reallocating
// (and thus replacing) the in-memory maps. Caller must hold the write lock.
func (idx *idx) readLabels(r io.Reader) error {
	// Total unique ID count, used only to pre-size the ID set.
	var totalIDs uint32
	if err := binary.Read(r, binary.LittleEndian, &totalIDs); err != nil {
		return errors.Wrap(err, "read total id count")
	}
	idx.ids = make(map[uint64]struct{}, totalIDs)

	// Number of distinct labels.
	var numLabels uint32
	if err := binary.Read(r, binary.LittleEndian, &numLabels); err != nil {
		return errors.Wrap(err, "read label count")
	}
	idx.m = make(map[string]map[string]map[uint64]struct{}, numLabels)

	// One entry per label.
	for i := uint32(0); i < numLabels; i++ {
		if err := idx.readLabelEntry(r); err != nil {
			return errors.Wrap(err, "read label entry")
		}
	}

	return nil
}
// readLabelEntry reads one label entry written by writeLabelEntry: the
// label string, its value count, then that many value entries.
func (idx *idx) readLabelEntry(r io.Reader) error {
	label, err := binaryutil.ReadString(r)
	if err != nil {
		return errors.Wrap(err, "read label")
	}

	var numValues uint32
	if err := binary.Read(r, binary.LittleEndian, &numValues); err != nil {
		return errors.Wrap(err, "read value count for label")
	}
	// Pre-size the per-label value map.
	idx.m[label] = make(map[string]map[uint64]struct{}, numValues)

	for i := uint32(0); i < numValues; i++ {
		if err := idx.readValueEntry(r, label); err != nil {
			return errors.Wrap(err, "read value entry")
		}
	}

	return nil
}
// readValueEntry reads one value entry written by writeValueEntry: the
// value string, its ID count, then each ID — populating both the
// label/value map and the global ID set.
func (idx *idx) readValueEntry(r io.Reader, label string) error {
	value, err := binaryutil.ReadString(r)
	if err != nil {
		return errors.Wrap(err, "read value")
	}

	var numIDs uint32
	if err := binary.Read(r, binary.LittleEndian, &numIDs); err != nil {
		return errors.Wrap(err, "read id count for value")
	}
	set := make(map[uint64]struct{}, numIDs) // Pre-sized ID set for this pair.
	idx.m[label][value] = set

	for i := uint32(0); i < numIDs; i++ {
		var id uint64
		if err := binary.Read(r, binary.LittleEndian, &id); err != nil {
			return errors.Wrap(err, "read id")
		}
		set[id] = struct{}{}
		idx.ids[id] = struct{}{} // Also track in the global set of IDs.
	}

	return nil
}
// mockIndex is a mock implementation of Index built on component.Mock,
// used when NewFactory is given mock options (test-only).
type mockIndex struct {
	component.Mock
}

// Search returns the canned ID set configured on the mock.
func (m *mockIndex) Search(ctx context.Context, label string, eq bool, value string) (ids map[uint64]struct{}) {
	args := m.Called(ctx, label, eq, value)
	return args.Get(0).(map[uint64]struct{})
}

// Add records the call; it has no return value.
func (m *mockIndex) Add(ctx context.Context, id uint64, labels model.Labels) {
	m.Called(ctx, id, labels)
}

// EncodeTo returns the canned error configured on the mock.
func (m *mockIndex) EncodeTo(ctx context.Context, w io.Writer) (err error) {
	args := m.Called(ctx, w)
	return args.Error(0)
}

// DecodeFrom returns the canned error configured on the mock.
func (m *mockIndex) DecodeFrom(ctx context.Context, r io.Reader) (err error) {
	args := m.Called(ctx, r)
	return args.Error(0)
}

View File

@@ -0,0 +1,327 @@
package inverted
import (
"bytes"
"context"
"testing"
. "github.com/onsi/gomega"
"github.com/glidea/zenfeed/pkg/model"
"github.com/glidea/zenfeed/pkg/test"
)
// TestAdd verifies that Add populates the label -> value -> IDs map for
// single- and multi-label items, merging with entries added earlier.
func TestAdd(t *testing.T) {
	RegisterTestingT(t)

	type givenDetail struct {
		existingLabels map[uint64]model.Labels // pre-populated id -> labels
	}
	type whenDetail struct {
		id     uint64       // item being added
		labels model.Labels // its labels
	}
	type thenExpected struct {
		// indexState is the minimum expected label -> value -> IDs content.
		indexState map[string]map[string]map[uint64]struct{}
	}

	tests := []test.Case[givenDetail, whenDetail, thenExpected]{
		{
			Scenario: "Add Single Label",
			Given:    "An empty index",
			When:     "Adding an item with a single label",
			Then:     "Should index the item correctly",
			GivenDetail: givenDetail{
				existingLabels: map[uint64]model.Labels{},
			},
			WhenDetail: whenDetail{
				id: 1,
				labels: model.Labels{
					{Key: "category", Value: "tech"},
				},
			},
			ThenExpected: thenExpected{
				indexState: map[string]map[string]map[uint64]struct{}{
					"category": {
						"tech": {1: struct{}{}},
					},
				},
			},
		},
		{
			Scenario: "Add Multiple Labels",
			Given:    "An empty index",
			When:     "Adding an item with multiple labels",
			Then:     "Should index all labels correctly",
			GivenDetail: givenDetail{
				existingLabels: map[uint64]model.Labels{
					1: {model.Label{Key: "category", Value: "tech"}},
					3: {model.Label{Key: "category", Value: "news"}},
				},
			},
			WhenDetail: whenDetail{
				id: 2,
				labels: model.Labels{
					{Key: "category", Value: "tech"},
					{Key: "status", Value: "new"},
					{Key: "author", Value: "john"},
				},
			},
			ThenExpected: thenExpected{
				indexState: map[string]map[string]map[uint64]struct{}{
					"category": {
						"tech": {1: struct{}{}, 2: struct{}{}},
						"news": {3: struct{}{}},
					},
					"status": {
						"new": {2: struct{}{}},
					},
					"author": {
						"john": {2: struct{}{}},
					},
				},
			},
		},
	}

	for _, tt := range tests {
		t.Run(tt.Scenario, func(t *testing.T) {
			// Given. (Named idx0 to avoid shadowing the package's idx type.)
			idx0, err := NewFactory().New("test", &Config{}, Dependencies{})
			Expect(err).NotTo(HaveOccurred())
			for id, labels := range tt.GivenDetail.existingLabels {
				idx0.Add(context.Background(), id, labels)
			}

			// When.
			idx0.Add(context.Background(), tt.WhenDetail.id, tt.WhenDetail.labels)

			// Then: inspect the concrete implementation's internal map.
			invIdx := idx0.(*idx)
			for label, values := range tt.ThenExpected.indexState {
				Expect(invIdx.m).To(HaveKey(label))
				for value, ids := range values {
					Expect(invIdx.m[label]).To(HaveKey(value))
					for id := range ids {
						Expect(invIdx.m[label][value]).To(HaveKey(id))
					}
				}
			}
		})
	}
}
// TestSearch verifies Search for equality and inequality matches over
// existing and missing labels/values.
func TestSearch(t *testing.T) {
	RegisterTestingT(t)

	type givenDetail struct {
		setupLabels map[uint64]model.Labels // id -> labels to index first
	}
	type whenDetail struct {
		searchLabel string
		eq          bool // true = match, false = complement
		searchValue string
	}
	type thenExpected struct {
		want []uint64 // expected result IDs; nil means empty result
	}

	tests := []test.Case[givenDetail, whenDetail, thenExpected]{
		{
			Scenario: "Search Existing Label-Value",
			Given:    "An index with feeds",
			When:     "Searching for existing label and value",
			Then:     "Should return matching item IDs",
			GivenDetail: givenDetail{
				setupLabels: map[uint64]model.Labels{
					1: {model.Label{Key: "category", Value: "tech"}},
					2: {model.Label{Key: "category", Value: "tech"}},
					3: {model.Label{Key: "category", Value: "news"}},
				},
			},
			WhenDetail: whenDetail{
				searchLabel: "category",
				searchValue: "tech",
				eq:          true,
			},
			ThenExpected: thenExpected{
				want: []uint64{1, 2},
			},
		},
		{
			Scenario: "Search Non-Existing Label",
			Given:    "An index with feeds",
			When:     "Searching for non-existing label",
			Then:     "Should return empty result",
			GivenDetail: givenDetail{
				setupLabels: map[uint64]model.Labels{
					1: {model.Label{Key: "category", Value: "tech"}},
				},
			},
			WhenDetail: whenDetail{
				searchLabel: "invalid",
				searchValue: "value",
				eq:          true,
			},
			ThenExpected: thenExpected{
				want: nil,
			},
		},
		{
			Scenario: "Search Non-Existing Value",
			Given:    "An index with feeds",
			When:     "Searching for existing label but non-existing value",
			Then:     "Should return empty result",
			GivenDetail: givenDetail{
				setupLabels: map[uint64]model.Labels{
					1: {model.Label{Key: "category", Value: "tech"}},
				},
			},
			WhenDetail: whenDetail{
				searchLabel: "category",
				searchValue: "invalid",
				eq:          true,
			},
			ThenExpected: thenExpected{
				want: nil,
			},
		},
		// Not equal tests.
		{
			Scenario: "Search Not Matching Label-Value",
			Given:    "An index with multiple feeds",
			When:     "Searching for feeds not matching a label-value pair",
			Then:     "Should return all feeds except those matching the pair",
			GivenDetail: givenDetail{
				setupLabels: map[uint64]model.Labels{
					1: {model.Label{Key: "category", Value: "tech"}, model.Label{Key: "status", Value: "new"}},
					2: {model.Label{Key: "category", Value: "news"}, model.Label{Key: "status", Value: "old"}},
					3: {model.Label{Key: "category", Value: "tech"}, model.Label{Key: "status", Value: "old"}},
				},
			},
			WhenDetail: whenDetail{
				searchLabel: "category",
				searchValue: "tech",
				eq:          false,
			},
			ThenExpected: thenExpected{
				want: []uint64{2},
			},
		},
		{
			Scenario: "Search Not Matching Non-Existing Label",
			Given:    "An index with feeds",
			When:     "Searching for feeds not matching a non-existing label",
			Then:     "Should return all feeds",
			GivenDetail: givenDetail{
				setupLabels: map[uint64]model.Labels{
					1: {model.Label{Key: "category", Value: "tech"}},
					2: {model.Label{Key: "category", Value: "news"}},
				},
			},
			WhenDetail: whenDetail{
				searchLabel: "invalid",
				searchValue: "value",
				eq:          false,
			},
			ThenExpected: thenExpected{
				want: []uint64{1, 2},
			},
		},
	}

	for _, tt := range tests {
		t.Run(tt.Scenario, func(t *testing.T) {
			// Given.
			idx, err := NewFactory().New("test", &Config{}, Dependencies{})
			Expect(err).NotTo(HaveOccurred())
			for id, labels := range tt.GivenDetail.setupLabels {
				idx.Add(context.Background(), id, labels)
			}

			// When.
			result := idx.Search(context.Background(), tt.WhenDetail.searchLabel, tt.WhenDetail.eq, tt.WhenDetail.searchValue)

			// Then: compare the result set against the expected IDs,
			// ignoring order (the result is a map).
			if tt.ThenExpected.want == nil {
				Expect(result).To(BeEmpty())
			} else {
				resultIDs := make([]uint64, 0, len(result))
				for id := range result {
					resultIDs = append(resultIDs, id)
				}
				Expect(resultIDs).To(ConsistOf(tt.ThenExpected.want))
			}
		})
	}
}
func TestEncodeDecode(t *testing.T) {
RegisterTestingT(t)
type givenDetail struct {
setupLabels map[uint64]model.Labels
}
type whenDetail struct{}
type thenExpected struct {
success bool
}
tests := []test.Case[givenDetail, whenDetail, thenExpected]{
{
Scenario: "Encode and Decode Empty Index",
Given: "An empty index",
When: "Encoding and decoding",
Then: "Should restore empty index correctly",
GivenDetail: givenDetail{
setupLabels: map[uint64]model.Labels{},
},
WhenDetail: whenDetail{},
ThenExpected: thenExpected{
success: true,
},
},
{
Scenario: "Encode and Decode Index with Data",
Given: "An index with feeds",
When: "Encoding and decoding",
Then: "Should restore all data correctly",
GivenDetail: givenDetail{
setupLabels: map[uint64]model.Labels{
1: {model.Label{Key: "category", Value: "tech"}, model.Label{Key: "status", Value: "new"}},
2: {model.Label{Key: "category", Value: "news"}, model.Label{Key: "author", Value: "john"}},
},
},
WhenDetail: whenDetail{},
ThenExpected: thenExpected{
success: true,
},
},
}
for _, tt := range tests {
t.Run(tt.Scenario, func(t *testing.T) {
// Given.
original, err := NewFactory().New("test", &Config{}, Dependencies{})
Expect(err).NotTo(HaveOccurred())
for id, labels := range tt.GivenDetail.setupLabels {
original.Add(context.Background(), id, labels)
}
// When.
var buf bytes.Buffer
err = original.EncodeTo(context.Background(), &buf)
Expect(err).NotTo(HaveOccurred())
decoded, err := NewFactory().New("test", &Config{}, Dependencies{})
Expect(err).NotTo(HaveOccurred())
err = decoded.DecodeFrom(context.Background(), &buf)
Expect(err).NotTo(HaveOccurred())
// Then.
origIdx := original.(*idx)
decodedIdx := decoded.(*idx)
Expect(decodedIdx.m).To(Equal(origIdx.m))
})
}
}

View File

@@ -0,0 +1,285 @@
package primary
import (
"bytes"
"context"
"encoding/binary"
"io"
"sync"
"time"
"github.com/pkg/errors"
"github.com/glidea/zenfeed/pkg/component"
"github.com/glidea/zenfeed/pkg/storage/feed/block/index"
telemetry "github.com/glidea/zenfeed/pkg/telemetry"
telemetrymodel "github.com/glidea/zenfeed/pkg/telemetry/model"
)
// --- Interface code block ---

// Index is the primary index of a feed block: it maps a feed ID to the
// physical location (FeedRef) where the feed is stored.
// Implementations must be safe for concurrent use.
type Index interface {
	component.Component
	index.Codec
	// Search returns item location by ID.
	Search(ctx context.Context, id uint64) (ref FeedRef, ok bool)
	// Add adds item location to the index.
	Add(ctx context.Context, id uint64, item FeedRef)
	// IDs returns all item IDs.
	IDs(ctx context.Context) (ids map[uint64]bool)
	// Count returns the number of feeds in the index.
	Count(ctx context.Context) (count uint32)
}

// Config holds the primary index configuration (currently empty).
type Config struct{}

// Dependencies holds the primary index dependencies (currently empty).
type Dependencies struct{}

var (
	// headerMagicNumber identifies encoded primary-index data
	// (the ASCII bytes of "wys is awesome" padded with two NULs to 16 bytes).
	headerMagicNumber = []byte{0x77, 0x79, 0x73, 0x20, 0x69, 0x73, 0x20,
		0x61, 0x77, 0x65, 0x73, 0x6f, 0x6d, 0x65, 0x00, 0x00}
	// headerVersion is the serialization format version.
	headerVersion = uint8(1)
)

// FeedRef locates a feed inside a block: the chunk file it lives in, its byte
// offset within that chunk, and its timestamp (normalized to UTC by Add).
type FeedRef struct {
	Chunk  uint32
	Offset uint64
	Time   time.Time
}
// --- Factory code block ---

// Factory builds primary Index instances.
type Factory component.Factory[Index, Config, Dependencies]

// NewFactory returns the production factory, or a mock-backed factory when
// mockOn options are supplied (test seam).
func NewFactory(mockOn ...component.MockOption) Factory {
	if len(mockOn) > 0 {
		return component.FactoryFunc[Index, Config, Dependencies](
			func(instance string, config *Config, dependencies Dependencies) (Index, error) {
				m := &mockIndex{}
				component.MockOptions(mockOn).Apply(&m.Mock)
				return m, nil
			},
		)
	}
	return component.FactoryFunc[Index, Config, Dependencies](new)
}
// new constructs the real in-memory primary index implementation.
func new(instance string, config *Config, dependencies Dependencies) (Index, error) {
	return &idx{
		Base: component.New(&component.BaseConfig[Config, Dependencies]{
			Name:         "FeedPrimaryIndex",
			Instance:     instance,
			Config:       config,
			Dependencies: dependencies,
		}),
		// Pre-sized for a modest number of feeds; grows as needed.
		m: make(map[uint64]FeedRef, 64),
	}, nil
}
// --- Implementation code block ---

// idx is the in-memory primary index: feed ID -> FeedRef, guarded by mu.
type idx struct {
	*component.Base[Config, Dependencies]

	m  map[uint64]FeedRef
	mu sync.RWMutex
}
// Search returns the stored location for id, and whether it exists.
func (idx *idx) Search(ctx context.Context, id uint64) (ref FeedRef, ok bool) {
	ctx = telemetry.StartWith(ctx, append(idx.TelemetryLabels(), telemetrymodel.KeyOperation, "Search")...)
	defer func() { telemetry.End(ctx, nil) }()

	idx.mu.RLock()
	found, exists := idx.m[id]
	idx.mu.RUnlock()

	return found, exists
}
// Add records the location of feed id, overwriting any existing entry.
func (idx *idx) Add(ctx context.Context, id uint64, item FeedRef) {
	ctx = telemetry.StartWith(ctx, append(idx.TelemetryLabels(), telemetrymodel.KeyOperation, "Add")...)
	defer func() { telemetry.End(ctx, nil) }()

	idx.mu.Lock()
	defer idx.mu.Unlock()

	// Normalize to UTC so stored times compare equal to what DecodeFrom
	// restores (it rebuilds times in UTC from unix nanoseconds).
	item.Time = item.Time.In(time.UTC)
	idx.m[id] = item
}
// IDs returns the set of all indexed feed IDs as a fresh map.
func (idx *idx) IDs(ctx context.Context) (ids map[uint64]bool) {
	ctx = telemetry.StartWith(ctx, append(idx.TelemetryLabels(), telemetrymodel.KeyOperation, "IDs")...)
	defer func() { telemetry.End(ctx, nil) }()

	idx.mu.RLock()
	defer idx.mu.RUnlock()

	// Copy into a new map so callers cannot mutate internal state.
	ids = make(map[uint64]bool, len(idx.m))
	for key := range idx.m {
		ids[key] = true
	}
	return ids
}
// Count returns the number of indexed feeds.
func (idx *idx) Count(ctx context.Context) (count uint32) {
	ctx = telemetry.StartWith(ctx, append(idx.TelemetryLabels(), telemetrymodel.KeyOperation, "Count")...)
	defer func() { telemetry.End(ctx, nil) }()

	idx.mu.RLock()
	n := len(idx.m)
	idx.mu.RUnlock()

	return uint32(n)
}
// EncodeTo serializes the index to w.
//
// Wire format (all integers little-endian):
//
//	magic (16B) | version (1B) | count (uint64) |
//	count entries of: id (uint64) | chunk (uint32) | offset (uint64) | unix-nano time (int64)
//
// Map iteration order is random, so entry order is not stable across calls;
// DecodeFrom does not depend on order. Entries are encoded into a fixed-size
// stack buffer and written in a single call each, avoiding the per-field
// reflection overhead of binary.Write on this hot path (wire format unchanged).
func (idx *idx) EncodeTo(ctx context.Context, w io.Writer) (err error) {
	ctx = telemetry.StartWith(ctx, append(idx.TelemetryLabels(), telemetrymodel.KeyOperation, "EncodeTo")...)
	defer func() { telemetry.End(ctx, err) }()

	idx.mu.RLock()
	defer idx.mu.RUnlock()

	// Write header.
	if _, err := w.Write(headerMagicNumber); err != nil {
		return errors.Wrap(err, "write header magic number")
	}
	if _, err := w.Write([]byte{headerVersion}); err != nil {
		return errors.Wrap(err, "write header version")
	}

	// Write map count.
	var scratch [8]byte
	binary.LittleEndian.PutUint64(scratch[:], uint64(len(idx.m)))
	if _, err := w.Write(scratch[:]); err != nil {
		return errors.Wrap(err, "write map count")
	}

	// Write all key-value pairs.
	var entry [28]byte // 8 (id) + 4 (chunk) + 8 (offset) + 8 (time)
	for id, ref := range idx.m {
		binary.LittleEndian.PutUint64(entry[0:8], id)
		binary.LittleEndian.PutUint32(entry[8:12], ref.Chunk)
		binary.LittleEndian.PutUint64(entry[12:20], ref.Offset)
		binary.LittleEndian.PutUint64(entry[20:28], uint64(ref.Time.UnixNano()))
		if _, err := w.Write(entry[:]); err != nil {
			return errors.Wrap(err, "write entry")
		}
	}
	return nil
}
// DecodeFrom replaces the index contents with data read from r.
// The expected layout is the one produced by EncodeTo:
// header (magic + version), entry count, then that many fixed-size entries.
func (idx *idx) DecodeFrom(ctx context.Context, r io.Reader) (err error) {
	ctx = telemetry.StartWith(ctx, append(idx.TelemetryLabels(), telemetrymodel.KeyOperation, "DecodeFrom")...)
	defer func() { telemetry.End(ctx, err) }()

	idx.mu.Lock()
	defer idx.mu.Unlock()

	// Read header.
	if err := idx.readHeader(r); err != nil {
		return errors.Wrap(err, "read header")
	}

	// Read map count.
	var count uint64
	if err := binary.Read(r, binary.LittleEndian, &count); err != nil {
		return errors.Wrap(err, "read map count")
	}
	// Decode is a full replacement: any previous contents are discarded.
	idx.m = make(map[uint64]FeedRef, count)

	// Read all key-value pairs.
	for range count {
		id, ref, err := idx.readEntry(r)
		if err != nil {
			return errors.Wrap(err, "read entry")
		}
		idx.m[id] = ref
	}
	return nil
}
// readHeader reads and validates the index file header (magic number, then
// a one-byte format version).
func (idx *idx) readHeader(r io.Reader) error {
	magic := make([]byte, len(headerMagicNumber))
	if _, err := io.ReadFull(r, magic); err != nil {
		return errors.Wrap(err, "read magic number")
	}
	if !bytes.Equal(magic, headerMagicNumber) {
		return errors.New("invalid magic number")
	}

	var version [1]byte
	if _, err := io.ReadFull(r, version[:]); err != nil {
		return errors.Wrap(err, "read version")
	}
	if version[0] != headerVersion {
		return errors.New("invalid version")
	}
	return nil
}
// readEntry reads a single key-value pair (feed ID and FeedRef) from the reader.
//
// Entry layout is fixed-size (28 bytes, little-endian):
// id (uint64) | chunk (uint32) | offset (uint64) | unix-nano time (int64).
// Reading the whole entry with one io.ReadFull replaces four reflective
// binary.Read calls per entry; the wire format is unchanged.
func (idx *idx) readEntry(r io.Reader) (id uint64, ref FeedRef, err error) {
	var entry [28]byte
	if _, err := io.ReadFull(r, entry[:]); err != nil {
		return 0, FeedRef{}, errors.Wrap(err, "read entry bytes")
	}

	id = binary.LittleEndian.Uint64(entry[0:8])
	ref.Chunk = binary.LittleEndian.Uint32(entry[8:12])
	ref.Offset = binary.LittleEndian.Uint64(entry[12:20])
	// Times are stored as unix nanoseconds; restore in UTC so decoded values
	// compare equal to what Add stored.
	ref.Time = time.Unix(0, int64(binary.LittleEndian.Uint64(entry[20:28]))).In(time.UTC)
	return id, ref, nil
}
// mockIndex is a testify-style mock implementation of Index, used when
// NewFactory is called with mock options.
type mockIndex struct {
	component.Mock
}

func (m *mockIndex) Search(ctx context.Context, id uint64) (ref FeedRef, ok bool) {
	args := m.Called(ctx, id)
	return args.Get(0).(FeedRef), args.Bool(1)
}

func (m *mockIndex) Add(ctx context.Context, id uint64, item FeedRef) {
	m.Called(ctx, id, item)
}

func (m *mockIndex) IDs(ctx context.Context) (ids map[uint64]bool) {
	args := m.Called(ctx)
	return args.Get(0).(map[uint64]bool)
}

func (m *mockIndex) Count(ctx context.Context) (count uint32) {
	args := m.Called(ctx)
	return args.Get(0).(uint32)
}

func (m *mockIndex) EncodeTo(ctx context.Context, w io.Writer) (err error) {
	args := m.Called(ctx, w)
	return args.Error(0)
}

func (m *mockIndex) DecodeFrom(ctx context.Context, r io.Reader) (err error) {
	args := m.Called(ctx, r)
	return args.Error(0)
}

View File

@@ -0,0 +1,222 @@
package primary
import (
"bytes"
"context"
"testing"
"time"
. "github.com/onsi/gomega"
"github.com/glidea/zenfeed/pkg/test"
)
// TestAdd verifies that Add inserts new entries and overwrites entries with
// the same ID.
func TestAdd(t *testing.T) {
	RegisterTestingT(t)
	type givenDetail struct {
		existingItems map[uint64]FeedRef
	}
	type whenDetail struct {
		id   uint64
		item FeedRef
	}
	type thenExpected struct {
		items map[uint64]FeedRef
	}
	tests := []test.Case[givenDetail, whenDetail, thenExpected]{
		{
			Scenario: "Add Single Feed",
			Given:    "An index with existing item",
			When:     "Adding a single item",
			Then:     "Should store the item correctly",
			GivenDetail: givenDetail{
				existingItems: map[uint64]FeedRef{
					0: {Chunk: 0, Offset: 0},
				},
			},
			WhenDetail: whenDetail{
				id:   1,
				item: FeedRef{Chunk: 1, Offset: 100},
			},
			ThenExpected: thenExpected{
				items: map[uint64]FeedRef{
					0: {Chunk: 0, Offset: 0},
					1: {Chunk: 1, Offset: 100},
				},
			},
		},
		{
			Scenario: "Update Existing Feed",
			Given:    "An index with existing item",
			When:     "Adding item with same ID",
			Then:     "Should update the item reference",
			GivenDetail: givenDetail{
				existingItems: map[uint64]FeedRef{
					1: {Chunk: 1, Offset: 100},
				},
			},
			WhenDetail: whenDetail{
				id:   1,
				item: FeedRef{Chunk: 2, Offset: 200},
			},
			ThenExpected: thenExpected{
				items: map[uint64]FeedRef{
					1: {Chunk: 2, Offset: 200},
				},
			},
		},
	}
	for _, tt := range tests {
		t.Run(tt.Scenario, func(t *testing.T) {
			// Given.
			idx0, err := NewFactory().New("test", &Config{}, Dependencies{})
			Expect(err).NotTo(HaveOccurred())
			for id, item := range tt.GivenDetail.existingItems {
				idx0.Add(context.Background(), id, item)
			}
			// When.
			idx0.Add(context.Background(), tt.WhenDetail.id, tt.WhenDetail.item)
			// Then: inspect internal map state directly.
			primIdx := idx0.(*idx)
			for id, expected := range tt.ThenExpected.items {
				Expect(primIdx.m).To(HaveKey(id))
				Expect(primIdx.m[id]).To(Equal(expected))
			}
		})
	}
}

// TestSearch verifies ID lookups on the primary index for both present and
// absent IDs.
func TestSearch(t *testing.T) {
	RegisterTestingT(t)
	type givenDetail struct {
		feeds map[uint64]FeedRef
	}
	type whenDetail struct {
		searchID uint64
	}
	type thenExpected struct {
		feedRef FeedRef
		found   bool
	}
	tests := []test.Case[givenDetail, whenDetail, thenExpected]{
		{
			Scenario: "Search Existing Feed",
			Given:    "An index with feeds",
			When:     "Searching for existing ID",
			Then:     "Should return correct FeedRef",
			GivenDetail: givenDetail{
				feeds: map[uint64]FeedRef{
					1: {Chunk: 1, Offset: 100},
					2: {Chunk: 2, Offset: 200},
				},
			},
			WhenDetail: whenDetail{
				searchID: 1,
			},
			ThenExpected: thenExpected{
				feedRef: FeedRef{Chunk: 1, Offset: 100},
				found:   true,
			},
		},
		{
			Scenario: "Search Non-Existing Feed",
			Given:    "An index with feeds",
			When:     "Searching for non-existing ID",
			Then:     "Should return empty FeedRef",
			GivenDetail: givenDetail{
				feeds: map[uint64]FeedRef{
					1: {Chunk: 1, Offset: 100},
				},
			},
			WhenDetail: whenDetail{
				searchID: 2,
			},
			ThenExpected: thenExpected{
				feedRef: FeedRef{},
				found:   false,
			},
		},
	}
	for _, tt := range tests {
		t.Run(tt.Scenario, func(t *testing.T) {
			// Given.
			idx, err := NewFactory().New("test", &Config{}, Dependencies{})
			Expect(err).NotTo(HaveOccurred())
			for id, item := range tt.GivenDetail.feeds {
				idx.Add(context.Background(), id, item)
			}
			// When.
			result, ok := idx.Search(context.Background(), tt.WhenDetail.searchID)
			// Then.
			Expect(result).To(Equal(tt.ThenExpected.feedRef))
			Expect(ok).To(Equal(tt.ThenExpected.found))
		})
	}
}

// TestEncodeDecode verifies that the primary index round-trips through
// EncodeTo/DecodeFrom with identical contents (including timestamps).
func TestEncodeDecode(t *testing.T) {
	RegisterTestingT(t)
	type givenDetail struct {
		feeds map[uint64]FeedRef
	}
	type whenDetail struct{}
	type thenExpected struct {
		success bool
	}
	tests := []test.Case[givenDetail, whenDetail, thenExpected]{
		{
			Scenario: "Encode and Decode Index with Data",
			Given:    "An index with feeds",
			When:     "Encoding and decoding",
			Then:     "Should restore all data correctly",
			GivenDetail: givenDetail{
				feeds: map[uint64]FeedRef{
					1: {Chunk: 1, Offset: 100, Time: time.Now()},
					2: {Chunk: 2, Offset: 200, Time: time.Now()},
				},
			},
			WhenDetail: whenDetail{},
			ThenExpected: thenExpected{
				success: true,
			},
		},
	}
	for _, tt := range tests {
		t.Run(tt.Scenario, func(t *testing.T) {
			// Given.
			original, err := NewFactory().New("test", &Config{}, Dependencies{})
			Expect(err).NotTo(HaveOccurred())
			for id, item := range tt.GivenDetail.feeds {
				original.Add(context.Background(), id, item)
			}
			// When: encode into a buffer, then decode into a fresh index.
			var buf bytes.Buffer
			err = original.EncodeTo(context.Background(), &buf)
			Expect(err).NotTo(HaveOccurred())
			decoded, err := NewFactory().New("test", &Config{}, Dependencies{})
			Expect(err).NotTo(HaveOccurred())
			err = decoded.DecodeFrom(context.Background(), &buf)
			Expect(err).NotTo(HaveOccurred())
			// Then: internal maps must be deep-equal (Add normalizes times to
			// UTC, so round-tripped times compare equal).
			origIdx := original.(*idx)
			decodedIdx := decoded.(*idx)
			Expect(decodedIdx.m).To(Equal(origIdx.m))
		})
	}
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,329 @@
package vector
import (
"bytes"
"context"
"testing"
. "github.com/onsi/gomega"
"github.com/glidea/zenfeed/pkg/test"
)
// TestSearch verifies similarity search on the vector index: threshold
// filtering, result limiting, and dimension-mismatch errors.
func TestSearch(t *testing.T) {
	RegisterTestingT(t)
	type givenDetail struct {
		vectors map[uint64][][]float32
	}
	type whenDetail struct {
		q         []float32
		threshold float32
		limit     int
	}
	type thenExpected struct {
		idWithScores map[uint64]float32
		err          string
	}
	tests := []test.Case[givenDetail, whenDetail, thenExpected]{
		{
			Scenario: "Search for similar vectors",
			Given:    "An index with some vectors",
			When:     "Searching for a vector with a threshold",
			Then:     "Should return IDs of similar vectors with scores",
			GivenDetail: givenDetail{
				vectors: map[uint64][][]float32{
					1: {{1.0, 0.0, 0.0}},
					2: {{0.8, 1.0, 0.0}},
					3: {{0.8, 0.1, 0.1} /*0.9847*/, {0.7, 0.1, 0.9} /*0.6116*/},
				},
			},
			WhenDetail: whenDetail{
				q:         []float32{1.0, 0.0, 0.0},
				threshold: 0.9,
				limit:     5,
			},
			ThenExpected: thenExpected{
				idWithScores: map[uint64]float32{
					1: 1.0,
					3: 0.9847,
				},
			},
		},
		{
			Scenario: "Search for similar vectors with strict limit",
			Given:    "An index with some vectors",
			When:     "Searching for a vector with a strict limit",
			Then:     "Should return IDs of similar vectors with scores",
			GivenDetail: givenDetail{
				vectors: map[uint64][][]float32{
					1: {{1.0, 0.0, 0.0}},
					2: {{0.8, 1.0, 0.0}},
					3: {{0.8, 0.1, 0.1} /*0.9847*/, {0.7, 0.1, 0.9} /*0.6116*/},
				},
			},
			WhenDetail: whenDetail{
				q:         []float32{1.0, 0.0, 0.0},
				threshold: 0.9,
				limit:     1,
			},
			ThenExpected: thenExpected{
				idWithScores: map[uint64]float32{
					1: 1.0,
				},
			},
		},
		{
			Scenario: "Search with dimension mismatch",
			Given:    "An index with some vectors",
			When:     "Searching for a vector with different dimension",
			Then:     "Should return an error",
			GivenDetail: givenDetail{
				vectors: map[uint64][][]float32{
					1: {{1.0, 0.0, 0.0}},
				},
			},
			WhenDetail: whenDetail{
				q:         []float32{1.0, 0.0}, // Different dimension.
				threshold: 0.8,
				limit:     10,
			},
			ThenExpected: thenExpected{
				err: "vector dimension mismatch",
			},
		},
	}
	for _, tt := range tests {
		t.Run(tt.Scenario, func(t *testing.T) {
			// Given.
			idx, err := NewFactory().New("test", &Config{}, Dependencies{})
			Expect(err).NotTo(HaveOccurred())
			for id, vectors := range tt.GivenDetail.vectors {
				err := idx.Add(context.Background(), id, vectors)
				Expect(err).NotTo(HaveOccurred())
			}
			// When.
			idWithScores, err := idx.Search(context.Background(), tt.WhenDetail.q, tt.WhenDetail.threshold, tt.WhenDetail.limit)
			// Then.
			if tt.ThenExpected.err != "" {
				Expect(err).To(HaveOccurred())
				Expect(err.Error()).To(ContainSubstring(tt.ThenExpected.err))
			} else {
				Expect(err).NotTo(HaveOccurred())
				Expect(idWithScores).To(HaveLen(len(tt.ThenExpected.idWithScores)))
				// Scores are floats; compare with tolerance.
				for id, score := range tt.ThenExpected.idWithScores {
					Expect(idWithScores).To(HaveKey(id))
					Expect(idWithScores[id]).To(BeNumerically("~", score, 0.01))
				}
			}
		})
	}
}

// TestAdd verifies vector insertion: node creation, layer membership, friend
// linking, and dimension-mismatch rejection.
func TestAdd(t *testing.T) {
	RegisterTestingT(t)
	type givenDetail struct {
		existingVectors map[uint64][][]float32
	}
	type whenDetail struct {
		id      uint64
		vectors [][]float32
	}
	type thenExpected struct {
		err           string
		nodeExists    bool
		layersContain bool
	}
	tests := []test.Case[givenDetail, whenDetail, thenExpected]{
		{
			Scenario: "Add a vector to an empty index",
			Given:    "An empty vector index",
			When:     "Adding a vector",
			Then:     "Should add the vector and update layers",
			GivenDetail: givenDetail{
				existingVectors: map[uint64][][]float32{},
			},
			WhenDetail: whenDetail{
				id:      1,
				vectors: [][]float32{{1.0, 0.0, 0.0}},
			},
			ThenExpected: thenExpected{
				nodeExists:    true,
				layersContain: true,
			},
		},
		{
			Scenario: "Add multiple vectors",
			Given:    "An index with existing vectors",
			When:     "Adding another vector",
			Then:     "Should add the vector and update layers",
			GivenDetail: givenDetail{
				existingVectors: map[uint64][][]float32{
					1: {{1.0, 0.0, 0.0}},
				},
			},
			WhenDetail: whenDetail{
				id:      2,
				vectors: [][]float32{{0.0, 1.0, 0.0}},
			},
			ThenExpected: thenExpected{
				nodeExists:    true,
				layersContain: true,
			},
		},
		{
			Scenario: "Add a vector with dimension mismatch",
			Given:    "An index with existing vectors",
			When:     "Adding a vector with different dimension",
			Then:     "Should return error",
			GivenDetail: givenDetail{
				existingVectors: map[uint64][][]float32{
					1: {{1.0, 0.0, 0.0}},
				},
			},
			WhenDetail: whenDetail{
				id:      2,
				vectors: [][]float32{{1.0, 0.0}}, // Different dimension
			},
			ThenExpected: thenExpected{
				err: "vector dimension mismatch",
			},
		},
	}
	for _, tt := range tests {
		t.Run(tt.Scenario, func(t *testing.T) {
			// Given
			idx0, err := NewFactory().New("test", &Config{}, Dependencies{})
			Expect(err).NotTo(HaveOccurred())
			for id, vectors := range tt.GivenDetail.existingVectors {
				err := idx0.Add(context.Background(), id, vectors)
				Expect(err).NotTo(HaveOccurred())
			}
			// When
			err = idx0.Add(context.Background(), tt.WhenDetail.id, tt.WhenDetail.vectors)
			// Then
			if tt.ThenExpected.err != "" {
				Expect(err).To(HaveOccurred())
				Expect(err.Error()).To(ContainSubstring(tt.ThenExpected.err))
			} else {
				Expect(err).NotTo(HaveOccurred())
				// Inspect internal state under the index's own read lock.
				v := idx0.(*idx)
				v.mu.RLock()
				defer v.mu.RUnlock()
				if tt.ThenExpected.nodeExists {
					Expect(v.m).To(HaveKey(tt.WhenDetail.id))
					node := v.m[tt.WhenDetail.id]
					Expect(node.vectors).To(Equal(tt.WhenDetail.vectors))
				}
				if tt.ThenExpected.layersContain {
					// Every node must appear in the bottom layer (layer 0).
					nodeInLayers := false
					for _, id := range v.layers[0].nodes {
						if id == tt.WhenDetail.id {
							nodeInLayers = true
							break
						}
					}
					Expect(nodeInLayers).To(BeTrue(), "Node should be in layer 0")
					if len(tt.GivenDetail.existingVectors) > 0 {
						// With pre-existing nodes, the new node must be linked
						// to at least one friend on some layer.
						node := v.m[tt.WhenDetail.id]
						hasFriends := false
						for _, friends := range node.friendsOnLayers {
							if len(friends) > 0 {
								hasFriends = true
								break
							}
						}
						Expect(hasFriends).To(BeTrue(), "Node should have friends")
					}
				}
			}
		})
	}
}

// TestEncodeDecode verifies that a vector index round-trips through
// EncodeTo/DecodeFrom, checked behaviorally by comparing search results.
func TestEncodeDecode(t *testing.T) {
	RegisterTestingT(t)
	type givenDetail struct {
		vectors map[uint64][][]float32
	}
	type whenDetail struct{}
	type thenExpected struct {
		err string
	}
	tests := []test.Case[givenDetail, whenDetail, thenExpected]{
		{
			Scenario: "Encode and decode an index with data",
			Given:    "An index with some vectors",
			When:     "Encoding and decoding the index",
			Then:     "Should restore the index correctly",
			GivenDetail: givenDetail{
				vectors: map[uint64][][]float32{
					1: {{1.0, 0.0, 0.0}},
					2: {{0.0, 1.0, 0.0}},
				},
			},
			WhenDetail:   whenDetail{},
			ThenExpected: thenExpected{},
		},
	}
	for _, tt := range tests {
		t.Run(tt.Scenario, func(t *testing.T) {
			// Given.
			original, err := NewFactory().New("test", &Config{}, Dependencies{})
			Expect(err).NotTo(HaveOccurred())
			for id, vectors := range tt.GivenDetail.vectors {
				err := original.Add(context.Background(), id, vectors)
				Expect(err).NotTo(HaveOccurred())
			}
			// When.
			var buf bytes.Buffer
			err = original.EncodeTo(context.Background(), &buf)
			Expect(err).NotTo(HaveOccurred())
			decoded, err := NewFactory().New("test", &Config{}, Dependencies{})
			Expect(err).NotTo(HaveOccurred())
			err = decoded.DecodeFrom(context.Background(), &buf)
			// Then.
			if tt.ThenExpected.err != "" {
				Expect(err).To(HaveOccurred())
				Expect(err.Error()).To(ContainSubstring(tt.ThenExpected.err))
			} else {
				Expect(err).NotTo(HaveOccurred())
				// Verify by searching: both indexes must return the same IDs
				// with the same scores for every stored vector.
				for _, vectors := range tt.GivenDetail.vectors {
					for _, vector := range vectors {
						originalResults, err := original.Search(context.Background(), vector, 0.99, 10)
						Expect(err).NotTo(HaveOccurred())
						decodedResults, err := decoded.Search(context.Background(), vector, 0.99, 10)
						Expect(err).NotTo(HaveOccurred())
						Expect(decodedResults).To(HaveLen(len(originalResults)))
						for id, score := range originalResults {
							Expect(decodedResults).To(HaveKey(id))
							Expect(decodedResults[id]).To(BeNumerically("~", score, 0.000001))
						}
					}
				}
			}
		})
	}
}

643
pkg/storage/feed/feed.go Normal file
View File

@@ -0,0 +1,643 @@
// Copyright (C) 2025 wangyusong
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.
package feed
import (
"context"
"os"
"path/filepath"
"reflect"
"strconv"
"sync"
"time"
"github.com/benbjohnson/clock"
"github.com/pkg/errors"
"github.com/glidea/zenfeed/pkg/component"
"github.com/glidea/zenfeed/pkg/config"
"github.com/glidea/zenfeed/pkg/llm"
"github.com/glidea/zenfeed/pkg/model"
"github.com/glidea/zenfeed/pkg/rewrite"
"github.com/glidea/zenfeed/pkg/storage/feed/block"
"github.com/glidea/zenfeed/pkg/storage/feed/block/chunk"
"github.com/glidea/zenfeed/pkg/storage/feed/block/index/inverted"
"github.com/glidea/zenfeed/pkg/storage/feed/block/index/primary"
"github.com/glidea/zenfeed/pkg/storage/feed/block/index/vector"
"github.com/glidea/zenfeed/pkg/telemetry"
"github.com/glidea/zenfeed/pkg/telemetry/log"
telemetrymodel "github.com/glidea/zenfeed/pkg/telemetry/model"
timeutil "github.com/glidea/zenfeed/pkg/util/time"
)
// clk is the package-level clock. It is a variable (not time.Now directly)
// presumably so tests can substitute a mock clock — confirm against tests.
var clk = clock.New()
// --- Interface code block ---

// Storage is a time-partitioned feed store composed of blocks, with
// hot-reloadable configuration.
type Storage interface {
	component.Component
	config.Watcher
	// Append stores some feeds.
	Append(ctx context.Context, feeds ...*model.Feed) error
	// Query retrieves feeds by query options.
	// Results are sorted by score (if vector query) and time.
	Query(ctx context.Context, query block.QueryOptions) ([]*block.FeedVO, error)
	// Exists checks if a feed exists in the storage.
	// If hintTime is zero, it only checks the head block.
	Exists(ctx context.Context, id uint64, hintTime time.Time) (bool, error)
}
// Config configures the feed storage.
type Config struct {
	Dir           string        // data directory; defaults to ./data/feed
	Retention     time.Duration // how long feeds are kept (validated to 1–15 days)
	BlockDuration time.Duration // time span covered by a single block; defaults to 25h
	EmbeddingLLM  string        // name of the embedding LLM (required)
	FlushInterval time.Duration // block flush interval (passed through to blocks)
}

// subDir is the storage subdirectory name used when Dir is defaulted.
const subDir = "feed"
// Validate applies defaults (Dir, Retention, BlockDuration) and enforces
// configuration invariants. It mutates the receiver.
func (c *Config) Validate() error {
	if c.Dir == "" {
		c.Dir = "./data/" + subDir
	}
	if c.Retention <= 0 {
		c.Retention = 8 * timeutil.Day
	}
	if c.Retention < timeutil.Day || c.Retention > 15*timeutil.Day {
		return errors.New("retention must be between 1 day and 15 days")
	}
	if c.BlockDuration <= 0 {
		c.BlockDuration = 25 * time.Hour
	}
	// Retention shorter than one block would make every block expire at birth.
	if c.Retention < c.BlockDuration {
		return errors.Errorf("retention must be greater than %s", c.BlockDuration)
	}
	if c.EmbeddingLLM == "" {
		return errors.New("embedding LLM is required")
	}
	return nil
}
// From populates the config from the application-level config, replacing any
// previous contents of the receiver.
func (c *Config) From(app *config.App) {
	*c = Config{
		Dir:           app.Storage.Dir,
		Retention:     app.Storage.Feed.Retention,
		BlockDuration: app.Storage.Feed.BlockDuration,
		FlushInterval: app.Storage.Feed.FlushInterval,
		EmbeddingLLM:  app.Storage.Feed.EmbeddingLLM,
	}
}
// Dependencies are the factories and collaborators injected into the storage;
// the sub-index factories are forwarded to blocks it creates/loads.
type Dependencies struct {
	BlockFactory    block.Factory
	LLMFactory      llm.Factory
	ChunkFactory    chunk.Factory
	PrimaryFactory  primary.Factory
	InvertedFactory inverted.Factory
	VectorFactory   vector.Factory
	Rewriter        rewrite.Rewriter
}
// --- Factory code block ---

// Factory builds Storage instances.
type Factory component.Factory[Storage, config.App, Dependencies]

// NewFactory returns the production factory, or a mock-backed factory when
// mockOn options are supplied (test seam).
func NewFactory(mockOn ...component.MockOption) Factory {
	if len(mockOn) > 0 {
		return component.FactoryFunc[Storage, config.App, Dependencies](
			func(instance string, app *config.App, dependencies Dependencies) (Storage, error) {
				m := &mockStorage{}
				component.MockOptions(mockOn).Apply(&m.Mock)
				return m, nil
			},
		)
	}
	return component.FactoryFunc[Storage, config.App, Dependencies](new)
}
// new constructs the real feed storage: validates config, creates the data
// directory, loads existing blocks from disk, and guarantees a head block.
func new(instance string, app *config.App, dependencies Dependencies) (Storage, error) {
	config := &Config{}
	config.From(app)
	if err := config.Validate(); err != nil {
		return nil, errors.Wrap(err, "validate config")
	}

	s := &storage{
		Base: component.New(&component.BaseConfig[Config, Dependencies]{
			Name:         "FeedStorage",
			Instance:     instance,
			Config:       config,
			Dependencies: dependencies,
		}),
		blocks: &blockChain{blocks: make(map[string]block.Block)},
	}
	// 0700: block data is private to the service user.
	if err := os.MkdirAll(config.Dir, 0700); err != nil {
		return nil, errors.Wrap(err, "ensure data dir")
	}
	if err := loadBlocks(config.Dir, s); err != nil {
		return nil, errors.Wrap(err, "load blocks")
	}

	// Ensure head block exists so Append has somewhere to write immediately.
	if len(s.blocks.list(nil)) == 0 {
		if _, err := s.createBlock(clk.Now()); err != nil {
			return nil, errors.Wrap(err, "create head block")
		}
	}
	return s, nil
}
// loadBlocks scans path and loads every block directory found there into s.
func loadBlocks(path string, s *storage) error {
	entries, err := os.ReadDir(path)
	if err != nil {
		return errors.Wrap(err, "read dir")
	}

	// Each subdirectory is one block; non-directories are ignored.
	for _, entry := range entries {
		if !entry.IsDir() {
			continue
		}
		if _, err := s.loadBlock(entry.Name()); err != nil {
			return errors.Wrapf(err, "load block %s", entry.Name())
		}
	}
	return nil
}
// blockChain is the concurrency-safe set of live blocks, keyed by the name
// derived from each block's start time (see blockName).
type blockChain struct {
	blocks map[string]block.Block
	mu     sync.RWMutex
}
// isHead reports whether b's time range covers the current wall-clock time.
func (c *blockChain) isHead(b block.Block) bool {
	now := clk.Now()
	return timeutil.InRange(now, b.Start(), b.End())
}
// head returns the block covering the current time, or nil if none exists.
func (c *blockChain) head() block.Block {
	if b, ok := c.get(clk.Now()); ok {
		return b
	}
	return nil
}
// list returns the blocks matching filter; a nil filter matches every block.
func (c *blockChain) list(filter func(block block.Block) bool) []block.Block {
	c.mu.RLock()
	defer c.mu.RUnlock()

	result := make([]block.Block, 0, len(c.blocks))
	for _, b := range c.blocks {
		if filter == nil || filter(b) {
			result = append(result, b)
		}
	}
	return result
}
// endTime returns the latest End() among all blocks, or the zero time when
// the chain is empty.
func (c *blockChain) endTime() time.Time {
	c.mu.RLock()
	defer c.mu.RUnlock()

	var latest time.Time
	for _, b := range c.blocks {
		if end := b.End(); end.After(latest) {
			latest = end
		}
	}
	return latest
}
// get returns the block whose [Start, End) range contains the given time.
func (c *blockChain) get(time time.Time) (block.Block, bool) {
	c.mu.RLock()
	defer c.mu.RUnlock()

	for _, b := range c.blocks {
		if !timeutil.InRange(time, b.Start(), b.End()) {
			continue
		}
		return b, true
	}
	return nil, false
}
// add registers block under the name derived from its start time,
// replacing any block already registered under that name.
func (c *blockChain) add(block block.Block) {
	c.mu.Lock()
	defer c.mu.Unlock()
	c.blocks[blockName(block.Start())] = block
}
// remove deletes every block whose end time is at or before `before`,
// invoking callback for each removed block. Keys are collected first so the
// callback runs outside the map iteration.
func (c *blockChain) remove(before time.Time, callback func(block block.Block)) {
	c.mu.Lock()
	defer c.mu.Unlock()

	var expired []string
	for key, b := range c.blocks {
		if !b.End().After(before) {
			expired = append(expired, key)
		}
	}

	for _, key := range expired {
		b := c.blocks[key]
		delete(c.blocks, key)
		callback(b)
	}
}
// --- Implementation code block ---

// storage is the real Storage implementation: a chain of time-partitioned
// blocks plus the background reconcile loop in Run.
type storage struct {
	*component.Base[Config, Dependencies]

	blocks *blockChain
}
// Run starts all loaded blocks, marks the storage ready, and then loops
// reconciling the block chain every 30s until the context is canceled.
//
// FIX: the reconcile timer is a one-shot (clk.Timer), and the original code
// only Reset it on success — after the first reconcile error the timer never
// fired again and the loop silently stopped maintaining blocks. The timer is
// now re-armed on both the success and the error path so reconciliation
// retries after failures.
func (s *storage) Run() (err error) {
	ctx := telemetry.StartWith(s.Context(), append(s.TelemetryLabels(), telemetrymodel.KeyOperation, "Run")...)
	defer func() { telemetry.End(ctx, err) }()

	// Run blocks.
	for _, b := range s.blocks.list(nil) {
		if err := component.RunUntilReady(ctx, b, 10*time.Second); err != nil {
			return errors.Wrap(err, "run block")
		}
	}

	// Maintain blocks. Timer(0) makes the first reconcile fire immediately.
	s.MarkReady()
	timer := clk.Timer(0)
	defer timer.Stop()
	for {
		select {
		case now := <-timer.C:
			if err := s.reconcileBlocks(ctx, now); err != nil {
				log.Error(ctx, errors.Wrap(err, "reconcile blocks"))
			} else {
				log.Debug(ctx, "reconcile blocks success")
			}
			// Re-arm unconditionally so a failed reconcile is retried.
			timer.Reset(30 * time.Second)
		case <-ctx.Done():
			return nil
		}
	}
}
// Close shuts down the base component and then closes every block.
//
// FIX: the original returned on the first block-close error, leaving the
// remaining blocks (and their file handles) open. Now every block's Close is
// attempted; the first error encountered is returned.
func (s *storage) Close() error {
	if err := s.Base.Close(); err != nil {
		return errors.Wrap(err, "close base")
	}

	var firstErr error
	for _, b := range s.blocks.list(nil) {
		if err := b.Close(); err != nil && firstErr == nil {
			firstErr = errors.Wrap(err, "close block")
		}
	}
	return firstErr
}
// Reload applies a new application config: it validates the derived storage
// config, rejects changes to immutable fields (Dir), propagates the flush
// interval to every block, and finally swaps in the new config.
func (s *storage) Reload(app *config.App) error {
	// Validate new config.
	newConfig := &Config{}
	newConfig.From(app)
	if err := newConfig.Validate(); err != nil {
		return errors.Wrap(err, "validate config")
	}
	if reflect.DeepEqual(s.Config(), newConfig) {
		log.Debug(s.Context(), "no changes in feed storage config")
		return nil
	}

	// Check immutable fields.
	curConfig := s.Config()
	if newConfig.Dir != curConfig.Dir {
		return errors.New("cannot reload the dir, MUST pass the same dir, or set it to empty for unchange")
	}

	// Reload blocks (only FlushInterval is block-level reloadable here).
	for _, b := range s.blocks.list(nil) {
		if err := b.Reload(&block.Config{
			FlushInterval: newConfig.FlushInterval,
		}); err != nil {
			return errors.Wrapf(err, "reload block %s", blockName(b.Start()))
		}
	}

	// Set config.
	s.SetConfig(newConfig)
	return nil
}
// Append validates, rewrites, and stores feeds into the current head block.
//
// FIX: blocks.head() returns nil when no block covers the current time (the
// window before reconcileBlocks creates the next head); the original code
// dereferenced it unconditionally and would panic. Fail with an explicit
// error instead.
func (s *storage) Append(ctx context.Context, feeds ...*model.Feed) (err error) {
	ctx = telemetry.StartWith(ctx, append(s.TelemetryLabels(), telemetrymodel.KeyOperation, "Append")...)
	defer func() { telemetry.End(ctx, err) }()

	for _, f := range feeds {
		if err := f.Validate(); err != nil {
			return errors.Wrap(err, "validate feed")
		}
	}

	// Rewrite feeds.
	rewritten, err := s.rewrite(ctx, feeds)
	if err != nil {
		return errors.Wrap(err, "rewrite feeds")
	}
	if len(rewritten) == 0 {
		log.Debug(ctx, "no feeds to write after rewrites")
		return nil
	}

	// Append feeds to head block.
	head := s.blocks.head()
	if head == nil {
		return errors.New("no head block available")
	}
	log.Debug(ctx, "append feeds", "count", len(rewritten))
	if err := head.Append(ctx, rewritten...); err != nil {
		return errors.Wrap(err, "append feeds")
	}
	return nil
}
// Query fans the query out to every block whose time range may match,
// merges the per-block results into a bounded heap, and returns them
// sorted newest-first.
func (s *storage) Query(ctx context.Context, query block.QueryOptions) (feeds []*block.FeedVO, err error) {
	ctx = telemetry.StartWith(ctx, append(s.TelemetryLabels(), telemetrymodel.KeyOperation, "Query")...)
	defer func() { telemetry.End(ctx, err) }()

	if err := (&query).Validate(); err != nil {
		return nil, errors.Wrap(err, "validate query")
	}

	// Query matching blocks in parallel, merging into a size-bounded heap.
	merged := block.NewFeedVOHeap(make(block.FeedVOs, 0, query.Limit))
	var (
		mu   sync.Mutex
		wg   sync.WaitGroup
		errs []error
	)
	for _, b := range s.blocks.list(nil) {
		if !query.HitTimeRangeCondition(b) {
			continue // This block cannot contain matching feeds.
		}

		wg.Add(1)
		go func(b block.Block) {
			defer wg.Done()
			result, qerr := b.Query(ctx, query)

			mu.Lock()
			defer mu.Unlock()
			if qerr != nil {
				errs = append(errs, qerr)
				return
			}
			for _, f := range result {
				merged.TryEvictPush(f)
			}
		}(b)
	}
	wg.Wait()

	if len(errs) > 0 {
		return nil, errs[0]
	}

	// Newest first.
	merged.DESCSort()

	return merged.Slice(), nil
}
// Exists reports whether a feed with the given ID is stored.
// A non-zero hintTime narrows the lookup to the block covering that
// instant; with no hint (or no covering block) the head block is used.
func (s *storage) Exists(ctx context.Context, id uint64, hintTime time.Time) (bool, error) {
	if !hintTime.IsZero() {
		if b, ok := s.blocks.get(hintTime); ok {
			return b.Exists(ctx, id)
		}
	}

	// Fallback: consult the head block.
	return s.blocks.head().Exists(ctx, id)
}
// headBlockCreateBuffer is how far before the current head block's end
// the next head block is created, so writes never stall on creation.
const headBlockCreateBuffer = 30 * time.Minute

// reconcileBlocks drives the block lifecycle: create the next head
// block ahead of time, cool down stale hot blocks, and drop blocks
// that fell out of the retention window.
func (s *storage) reconcileBlocks(ctx context.Context, now time.Time) error {
	if err := s.ensureHeadBlock(ctx, now); err != nil {
		return errors.Wrap(err, "ensure head block")
	}

	if err := s.ensureColdBlocks(ctx); err != nil {
		return errors.Wrap(err, "ensure cold blocks")
	}

	s.ensureRemovedExpiredBlocks(ctx, now)

	return nil
}
// ensureHeadBlock creates the next head block shortly before the
// current head stops covering "now", so appends never wait on block
// creation.
func (s *storage) ensureHeadBlock(ctx context.Context, now time.Time) error {
	maxEnd := s.blocks.endTime()
	if !now.After(maxEnd.Add(-headBlockCreateBuffer)) {
		return nil // Current head still has enough runway.
	}

	// Start the new block where the last one ends, unless we have
	// already fallen behind real time.
	nextStart := maxEnd
	if now.After(maxEnd) {
		nextStart = now
	}

	b, err := s.createBlock(nextStart)
	if err != nil {
		return errors.Wrap(err, "create new hot block")
	}
	if err := component.RunUntilReady(ctx, b, 10*time.Second); err != nil {
		return errors.Wrap(err, "run new hot block")
	}
	// createBlock already registered the block via s.blocks.add;
	// adding it again here would register it twice.
	log.Info(ctx, "block created", "name", blockName(b.Start()))

	return nil
}
// ensureColdBlocks transforms hot, non-head blocks to cold once they
// have aged a full block duration past their end (keeping them hot a
// while longer serves recent queries).
func (s *storage) ensureColdBlocks(ctx context.Context) error {
	shouldCool := func(b block.Block) bool {
		if b.State() != block.StateHot || s.blocks.isHead(b) {
			return false
		}
		// Delay cooling for recent queries.
		return clk.Now().After(b.End().Add(s.Config().BlockDuration))
	}

	for _, b := range s.blocks.list(shouldCool) {
		if err := b.TransformToCold(); err != nil {
			return errors.Wrap(err, "transform to cold")
		}
		log.Info(ctx, "block transformed to cold", "name", blockName(b.Start()))
	}

	return nil
}
// ensureRemovedExpiredBlocks closes and deletes every block that falls
// entirely before the retention horizon.
func (s *storage) ensureRemovedExpiredBlocks(ctx context.Context, now time.Time) {
	horizon := now.Add(-s.Config().Retention)
	s.blocks.remove(horizon, func(b block.Block) {
		// Track the two failures independently: the success log must
		// only fire when close AND on-disk cleanup both succeed;
		// reusing one err variable would let a later success mask an
		// earlier close failure.
		closeErr := b.Close()
		if closeErr != nil {
			log.Error(ctx, errors.Wrap(closeErr, "close block"))
		}
		clearErr := b.ClearOnDisk()
		if clearErr != nil {
			log.Error(ctx, errors.Wrap(clearErr, "clear on disk"))
		}
		if closeErr == nil && clearErr == nil {
			log.Info(ctx, "block deleted", "name", blockName(b.Start()))
		}
	})
}
var blockName = func(start time.Time) string {
return strconv.FormatInt(start.Unix(), 10)
}
// createBlock builds a brand-new block starting at start, registers it
// in the block set, and returns it. The caller is responsible for
// running the returned component.
func (s *storage) createBlock(start time.Time) (block.Block, error) {
	cfg := s.Config()
	name := blockName(start)

	b, err := s.Dependencies().BlockFactory.New(
		name,
		&block.Config{
			Dir:           filepath.Join(cfg.Dir, name),
			FlushInterval: cfg.FlushInterval,
			ForCreate: &block.ForCreateConfig{
				Start:        start,
				Duration:     cfg.BlockDuration,
				EmbeddingLLM: cfg.EmbeddingLLM,
			},
		},
		s.blockDependencies(),
	)
	if err != nil {
		return nil, errors.Wrap(err, "create block")
	}

	s.blocks.add(b)

	return b, nil
}
// loadBlock opens an existing block directory by name (no ForCreate
// config) and registers it in the block set.
func (s *storage) loadBlock(name string) (block.Block, error) {
	cfg := &block.Config{Dir: filepath.Join(s.Config().Dir, name)}

	b, err := s.Dependencies().BlockFactory.New(name, cfg, s.blockDependencies())
	if err != nil {
		return nil, errors.Wrap(err, "create block")
	}

	s.blocks.add(b)

	return b, nil
}
// blockDependencies projects the storage-level dependencies down to
// the subset a block needs.
func (s *storage) blockDependencies() block.Dependencies {
	d := s.Dependencies()

	return block.Dependencies{
		ChunkFactory:    d.ChunkFactory,
		PrimaryFactory:  d.PrimaryFactory,
		InvertedFactory: d.InvertedFactory,
		VectorFactory:   d.VectorFactory,
		LLMFactory:      d.LLMFactory,
	}
}
// rewrite applies the configured rewrite rules to every feed in
// parallel. Feeds whose labels rewrite to empty are dropped. The
// returned slice is not guaranteed to preserve the input order.
func (s *storage) rewrite(ctx context.Context, feeds []*model.Feed) ([]*model.Feed, error) {
	var (
		mu        sync.Mutex
		wg        sync.WaitGroup
		errs      []error
		rewritten = make([]*model.Feed, 0, len(feeds))
	)

	for _, feed := range feeds { // TODO: Limit the concurrency & goroutine number.
		wg.Add(1)
		go func(feed *model.Feed) {
			defer wg.Done()

			labels, err := s.Dependencies().Rewriter.Labels(ctx, feed.Labels)
			switch {
			case err != nil:
				mu.Lock()
				errs = append(errs, errors.Wrap(err, "rewrite item"))
				mu.Unlock()

			case len(labels) == 0:
				// Empty labels mean the feed is dropped.
				log.Debug(ctx, "drop feed", "id", feed.ID)

			default:
				feed.Labels = labels
				mu.Lock()
				rewritten = append(rewritten, feed)
				mu.Unlock()
			}
		}(feed)
	}
	wg.Wait()

	if len(errs) > 0 {
		return nil, errs[0]
	}

	return rewritten, nil
}
// mockStorage is a testify-style mock of the feed storage interface,
// built on component.Mock; expectations are set via m.On(...) in tests.
type mockStorage struct {
	component.Mock
}
// Reload records the call and returns the stubbed error.
func (m *mockStorage) Reload(app *config.App) error {
	return m.Called(app).Error(0)
}
// Append records the call (feeds as a single slice argument) and
// returns the stubbed error.
func (m *mockStorage) Append(ctx context.Context, feeds ...*model.Feed) error {
	return m.Called(ctx, feeds).Error(0)
}
// Query records the call and returns the stubbed result and error.
func (m *mockStorage) Query(ctx context.Context, query block.QueryOptions) ([]*block.FeedVO, error) {
	args := m.Called(ctx, query)
	feeds := args.Get(0).([]*block.FeedVO)

	return feeds, args.Error(1)
}
// Exists records the call and returns the stubbed result and error.
func (m *mockStorage) Exists(ctx context.Context, id uint64, hintTime time.Time) (bool, error) {
	args := m.Called(ctx, id, hintTime)
	exists := args.Get(0).(bool)

	return exists, args.Error(1)
}

View File

@@ -0,0 +1,446 @@
// Copyright (C) 2025 wangyusong
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.
// TODO: fix tests
package feed
// import (
// "context"
// "os"
// "testing"
// "time"
//
// "github.com/benbjohnson/clock"
// . "github.com/onsi/gomega"
// "github.com/stretchr/testify/mock"
// "github.com/glidea/zenfeed/pkg/config"
// "github.com/glidea/zenfeed/pkg/storage/feed/block"
// "github.com/glidea/zenfeed/pkg/storage/feed/block/chunk"
// "github.com/glidea/zenfeed/pkg/test"
// timeutil "github.com/glidea/zenfeed/pkg/util/time"
// )
// func TestNew(t *testing.T) {
// RegisterTestingT(t)
// type givenDetail struct {
// now time.Time
// blocksOnDisk []string // Block directory names in format "2006-01-02T15:04:05Z-2006-01-02T15:04:05Z"
// }
// type whenDetail struct {
// app *config.App
// }
// type thenExpected struct {
// storage storage
// storageHotLen int
// storageColdLen int
// blockCalls []func(obj *mock.Mock)
// }
// tests := []test.Case[givenDetail, whenDetail, thenExpected]{
// {
// Scenario: "Create a new storage from an empty directory",
// Given: "just mock a time",
// When: "call New with a config with a data directory",
// Then: "should return a new storage and a hot block created",
// GivenDetail: givenDetail{
// now: timeutil.MustParse("2025-03-03T10:00:00Z"),
// },
// WhenDetail: whenDetail{
// app: &config.App{
// DB: config.DB{
// Dir: "/tmp/TestNew",
// },
// },
// },
// ThenExpected: thenExpected{
// storage: storage{
// config: &Config{
// Dir: "/tmp/TestNew",
// },
// },
// storageHotLen: 1,
// storageColdLen: 0,
// },
// },
// {
// Scenario: "Create a storage from existing directory with blocks",
// Given: "existing blocks on disk",
// GivenDetail: givenDetail{
// now: timeutil.MustParse("2025-03-03T10:00:00Z"),
// blocksOnDisk: []string{
// "2025-03-02T10:00:00Z ~ 2025-03-03T10:00:00Z", // Hot block
// "2025-03-01T10:00:00Z ~ 2025-03-02T10:00:00Z", // Cold block
// "2025-02-28T10:00:00Z ~ 2025-03-01T10:00:00Z", // Cold block
// },
// },
// When: "call New with a config with existing data directory",
// WhenDetail: whenDetail{
// app: &config.App{
// DB: config.DB{
// Dir: "/tmp/TestNew",
// WriteableWindow: 49 * time.Hour,
// },
// },
// },
// Then: "should return a storage with existing blocks loaded",
// ThenExpected: thenExpected{
// storage: storage{
// config: &Config{
// Dir: "/tmp/TestNew",
// Block: BlockConfig{
// WriteableWindow: 49 * time.Hour,
// },
// },
// },
// storageHotLen: 1,
// storageColdLen: 2,
// blockCalls: []func(obj *mock.Mock){
// func(m *mock.Mock) {
// m.On("State").Return(block.StateHot).Once()
// },
// func(m *mock.Mock) {
// m.On("State").Return(block.StateCold).Once()
// },
// func(m *mock.Mock) {
// m.On("State").Return(block.StateCold).Once()
// },
// },
// },
// },
// }
// for _, tt := range tests {
// t.Run(tt.Scenario, func(t *testing.T) {
// // Given.
// c := clock.NewMock()
// c.Set(tt.GivenDetail.now)
// clk = c // Set global clock.
// defer func() { clk = clock.New() }()
// // Create test directories if needed
// if len(tt.GivenDetail.blocksOnDisk) > 0 {
// for _, blockDir := range tt.GivenDetail.blocksOnDisk {
// err := os.MkdirAll(tt.WhenDetail.app.DB.Dir+"/"+blockDir, 0755)
// Expect(err).To(BeNil())
// }
// }
// // When.
// var calls int
// var blockCalls []*mock.Mock
// blockFactory := block.NewFactory(func(obj *mock.Mock) {
// if calls < len(tt.ThenExpected.blockCalls) {
// tt.ThenExpected.blockCalls[calls](obj)
// calls++
// blockCalls = append(blockCalls, obj)
// }
// })
// s, err := new(tt.WhenDetail.app, blockFactory)
// defer os.RemoveAll(tt.WhenDetail.app.DB.Dir)
// // Then.
// Expect(err).To(BeNil())
// Expect(s).NotTo(BeNil())
// storage := s.(*storage)
// Expect(storage.config).To(Equal(tt.ThenExpected.storage.config))
// Expect(len(storage.hot.blocks)).To(Equal(tt.ThenExpected.storageHotLen))
// Expect(len(storage.cold.blocks)).To(Equal(tt.ThenExpected.storageColdLen))
// for _, call := range blockCalls {
// call.AssertExpectations(t)
// }
// })
// }
// }
// func TestAppend(t *testing.T) {
// RegisterTestingT(t)
// type givenDetail struct {
// hotBlocks []func(m *mock.Mock)
// coldBlocks []func(m *mock.Mock)
// }
// type whenDetail struct {
// feeds []*chunk.Feed
// }
// type thenExpected struct {
// err string
// }
// tests := []test.Case[givenDetail, whenDetail, thenExpected]{
// {
// Scenario: "Append feeds to hot block",
// Given: "a storage with one hot block",
// When: "append feeds within hot block time range",
// Then: "should append feeds to hot block successfully",
// GivenDetail: givenDetail{
// hotBlocks: []func(m *mock.Mock){
// func(m *mock.Mock) {
// m.On("Start").Return(timeutil.MustParse("2025-03-02T10:00:00Z")).Twice()
// m.On("End").Return(timeutil.MustParse("2025-03-03T10:00:00Z")).Twice()
// m.On("State").Return(block.StateHot).Twice()
// m.On("Append", mock.Anything, []*chunk.Feed{
// {ID: 1, Time: timeutil.MustParse("2025-03-02T11:00:00Z")},
// {ID: 2, Time: timeutil.MustParse("2025-03-02T12:00:00Z")},
// }).Return(nil)
// },
// },
// },
// WhenDetail: whenDetail{
// feeds: []*chunk.Feed{
// {ID: 1, Time: timeutil.MustParse("2025-03-02T11:00:00Z")},
// {ID: 2, Time: timeutil.MustParse("2025-03-02T12:00:00Z")},
// },
// },
// ThenExpected: thenExpected{
// err: "",
// },
// },
// {
// Scenario: "Append feeds to non-hot block",
// Given: "a storage with hot and cold blocks",
// When: "append feeds with time in cold block range",
// Then: "should return error",
// GivenDetail: givenDetail{
// coldBlocks: []func(m *mock.Mock){
// func(m *mock.Mock) {},
// },
// },
// WhenDetail: whenDetail{
// feeds: []*chunk.Feed{
// {ID: 1, Time: timeutil.MustParse("2025-03-01T11:00:00Z")},
// },
// },
// ThenExpected: thenExpected{
// err: "cannot find hot block",
// },
// },
// }
// for _, tt := range tests {
// t.Run(tt.Scenario, func(t *testing.T) {
// // Given.
// calls := 0
// var blockMocks []*mock.Mock
// blockFactory := block.NewFactory(func(obj *mock.Mock) {
// if calls < len(tt.GivenDetail.hotBlocks) {
// tt.GivenDetail.hotBlocks[calls](obj)
// calls++
// blockMocks = append(blockMocks, obj)
// }
// })
// var hotBlocks blockChain
// for range tt.GivenDetail.hotBlocks {
// block, err := blockFactory.New(nil, nil, nil, nil, nil)
// Expect(err).To(BeNil())
// hotBlocks.add(block)
// }
// blockFactory = block.NewFactory(func(obj *mock.Mock) {
// if calls < len(tt.GivenDetail.coldBlocks) {
// tt.GivenDetail.coldBlocks[calls](obj)
// calls++
// blockMocks = append(blockMocks, obj)
// }
// })
// var coldBlocks blockChain
// for range tt.GivenDetail.coldBlocks {
// block, err := blockFactory.New(nil, nil, nil, nil, nil)
// Expect(err).To(BeNil())
// coldBlocks.add(block)
// }
// s := storage{
// hot: &hotBlocks,
// cold: &coldBlocks,
// }
// // When.
// err := s.Append(context.Background(), tt.WhenDetail.feeds...)
// // Then.
// if tt.ThenExpected.err != "" {
// Expect(err.Error()).To(ContainSubstring(tt.ThenExpected.err))
// } else {
// Expect(err).To(BeNil())
// }
// for _, m := range blockMocks {
// m.AssertExpectations(t)
// }
// })
// }
// }
// func TestQuery(t *testing.T) {
// RegisterTestingT(t)
// type givenDetail struct {
// hotBlocks []func(m *mock.Mock)
// coldBlocks []func(m *mock.Mock)
// }
// type whenDetail struct {
// query block.QueryOptions
// }
// type thenExpected struct {
// feeds []*block.FeedVO
// err string
// }
// tests := []test.Case[givenDetail, whenDetail, thenExpected]{
// {
// Scenario: "Query feeds from hot blocks",
// Given: "a storage with one hot block containing feeds",
// When: "querying with time range within hot block",
// Then: "should return matching feeds from hot block",
// GivenDetail: givenDetail{
// hotBlocks: []func(m *mock.Mock){
// func(m *mock.Mock) {
// m.On("Start").Return(timeutil.MustParse("2025-03-02T10:00:00Z")).Once()
// m.On("End").Return(timeutil.MustParse("2025-03-03T10:00:00Z")).Once()
// m.On("Query", mock.Anything, mock.MatchedBy(func(q block.QueryOptions) bool {
// return q.Start.Equal(timeutil.MustParse("2025-03-02T12:00:00Z")) &&
// q.End.Equal(timeutil.MustParse("2025-03-02T14:00:00Z"))
// })).Return([]*block.FeedVO{
// {ID: 1, Time: timeutil.MustParse("2025-03-02T12:30:00Z")},
// {ID: 2, Time: timeutil.MustParse("2025-03-02T13:00:00Z")},
// }, nil)
// },
// },
// },
// WhenDetail: whenDetail{
// query: block.QueryOptions{
// Start: timeutil.MustParse("2025-03-02T12:00:00Z"),
// End: timeutil.MustParse("2025-03-02T14:00:00Z"),
// Limit: 10,
// },
// },
// ThenExpected: thenExpected{
// feeds: []*block.FeedVO{
// {ID: 2, Time: timeutil.MustParse("2025-03-02T13:00:00Z")},
// {ID: 1, Time: timeutil.MustParse("2025-03-02T12:30:00Z")},
// },
// err: "",
// },
// },
// {
// Scenario: "Query feeds from multiple blocks",
// Given: "a storage with hot and cold blocks containing feeds",
// When: "querying with time range spanning multiple blocks",
// Then: "should return combined and sorted feeds from all matching blocks",
// GivenDetail: givenDetail{
// hotBlocks: []func(m *mock.Mock){
// func(m *mock.Mock) {
// m.On("Start").Return(timeutil.MustParse("2025-03-02T10:00:00Z"))
// m.On("End").Return(timeutil.MustParse("2025-03-03T10:00:00Z"))
// m.On("Query", mock.Anything, mock.MatchedBy(func(q block.QueryOptions) bool {
// return !q.Start.IsZero() && q.End.IsZero()
// })).Return([]*block.FeedVO{
// {ID: 3, Time: timeutil.MustParse("2025-03-02T15:00:00Z")},
// {ID: 4, Time: timeutil.MustParse("2025-03-02T16:00:00Z")},
// }, nil)
// },
// },
// coldBlocks: []func(m *mock.Mock){
// func(m *mock.Mock) {
// m.On("Start").Return(timeutil.MustParse("2025-03-01T10:00:00Z"))
// m.On("End").Return(timeutil.MustParse("2025-03-02T10:00:00Z"))
// m.On("Query", mock.Anything, mock.MatchedBy(func(q block.QueryOptions) bool {
// return !q.Start.IsZero() && q.End.IsZero()
// })).Return([]*block.FeedVO{
// {ID: 1, Time: timeutil.MustParse("2025-03-01T15:00:00Z")},
// {ID: 2, Time: timeutil.MustParse("2025-03-01T16:00:00Z")},
// }, nil)
// },
// },
// },
// WhenDetail: whenDetail{
// query: block.QueryOptions{
// Start: timeutil.MustParse("2025-03-01T12:00:00Z"),
// Limit: 3,
// },
// },
// ThenExpected: thenExpected{
// feeds: []*block.FeedVO{
// {ID: 4, Time: timeutil.MustParse("2025-03-02T16:00:00Z")},
// {ID: 3, Time: timeutil.MustParse("2025-03-02T15:00:00Z")},
// {ID: 2, Time: timeutil.MustParse("2025-03-01T16:00:00Z")},
// },
// err: "",
// },
// },
// }
// for _, tt := range tests {
// t.Run(tt.Scenario, func(t *testing.T) {
// // Given.
// calls := 0
// var blockMocks []*mock.Mock
// blockFactory := block.NewFactory(func(obj *mock.Mock) {
// if calls < len(tt.GivenDetail.hotBlocks) {
// tt.GivenDetail.hotBlocks[calls](obj)
// calls++
// blockMocks = append(blockMocks, obj)
// }
// })
// var hotBlocks blockChain
// for range tt.GivenDetail.hotBlocks {
// block, err := blockFactory.New(nil, nil, nil, nil, nil)
// Expect(err).To(BeNil())
// hotBlocks.add(block)
// }
// blockFactory = block.NewFactory(func(obj *mock.Mock) {
// if calls < len(tt.GivenDetail.hotBlocks)+len(tt.GivenDetail.coldBlocks) {
// tt.GivenDetail.coldBlocks[calls-len(tt.GivenDetail.hotBlocks)](obj)
// calls++
// blockMocks = append(blockMocks, obj)
// }
// })
// var coldBlocks blockChain
// for range tt.GivenDetail.coldBlocks {
// block, err := blockFactory.New(nil, nil, nil, nil, nil)
// Expect(err).To(BeNil())
// coldBlocks.add(block)
// }
// s := storage{
// hot: &hotBlocks,
// cold: &coldBlocks,
// }
// // When.
// feeds, err := s.Query(context.Background(), tt.WhenDetail.query)
// // Then.
// if tt.ThenExpected.err != "" {
// Expect(err).NotTo(BeNil())
// Expect(err.Error()).To(ContainSubstring(tt.ThenExpected.err))
// } else {
// Expect(err).To(BeNil())
// Expect(feeds).To(HaveLen(len(tt.ThenExpected.feeds)))
// // Check feeds match expected
// for i, feed := range feeds {
// Expect(feed.ID).To(Equal(tt.ThenExpected.feeds[i].ID))
// Expect(feed.Time).To(Equal(tt.ThenExpected.feeds[i].Time))
// Expect(feed.Labels).To(Equal(tt.ThenExpected.feeds[i].Labels))
// }
// }
// for _, m := range blockMocks {
// m.AssertExpectations(t)
// }
// })
// }
// }

View File

@@ -0,0 +1,520 @@
// Copyright (C) 2025 wangyusong
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.
package feed
// import (
// "context"
// "os"
// "testing"
// "time"
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
// "github.com/benbjohnson/clock"
// . "github.com/onsi/gomega"
// "github.com/stretchr/testify/mock"
// "github.com/glidea/zenfeed/pkg/config"
// "github.com/glidea/zenfeed/pkg/storage/feed/block"
// "github.com/glidea/zenfeed/pkg/storage/feed/block/chunk"
// "github.com/glidea/zenfeed/pkg/test"
// timeutil "github.com/glidea/zenfeed/pkg/util/time"
// )
// func TestNew(t *testing.T) {
// RegisterTestingT(t)
// type givenDetail struct {
// now time.Time
// blocksOnDisk []string // Block directory names in format "2006-01-02T15:04:05Z-2006-01-02T15:04:05Z"
// }
// type whenDetail struct {
// app *config.App
// }
// type thenExpected struct {
// storage storage
// storageHotLen int
// storageColdLen int
// blockCalls []func(obj *mock.Mock)
// }
// tests := []test.Case[givenDetail, whenDetail, thenExpected]{
// {
// Scenario: "Create a new storage from an empty directory",
// Given: "just mock a time",
// When: "call New with a config with a data directory",
// Then: "should return a new storage and a hot block created",
// GivenDetail: givenDetail{
// now: timeutil.MustParse("2025-03-03T10:00:00Z"),
// },
// WhenDetail: whenDetail{
// app: &config.App{
// DB: config.DB{
// Dir: "/tmp/TestNew",
// },
// },
// },
// ThenExpected: thenExpected{
// storage: storage{
// config: &Config{
// Dir: "/tmp/TestNew",
// },
// },
// storageHotLen: 1,
// storageColdLen: 0,
// },
// },
// {
// Scenario: "Create a storage from existing directory with blocks",
// Given: "existing blocks on disk",
// GivenDetail: givenDetail{
// now: timeutil.MustParse("2025-03-03T10:00:00Z"),
// blocksOnDisk: []string{
// "2025-03-02T10:00:00Z ~ 2025-03-03T10:00:00Z", // Hot block
// "2025-03-01T10:00:00Z ~ 2025-03-02T10:00:00Z", // Cold block
// "2025-02-28T10:00:00Z ~ 2025-03-01T10:00:00Z", // Cold block
// },
// },
// When: "call New with a config with existing data directory",
// WhenDetail: whenDetail{
// app: &config.App{
// DB: config.DB{
// Dir: "/tmp/TestNew",
// WriteableWindow: 49 * time.Hour,
// },
// },
// },
// Then: "should return a storage with existing blocks loaded",
// ThenExpected: thenExpected{
// storage: storage{
// config: &Config{
// Dir: "/tmp/TestNew",
// Block: BlockConfig{
// WriteableWindow: 49 * time.Hour,
// },
// },
// },
// storageHotLen: 1,
// storageColdLen: 2,
// blockCalls: []func(obj *mock.Mock){
// func(m *mock.Mock) {
// m.On("State").Return(block.StateHot).Once()
// },
// func(m *mock.Mock) {
// m.On("State").Return(block.StateCold).Once()
// },
// func(m *mock.Mock) {
// m.On("State").Return(block.StateCold).Once()
// },
// },
// },
// },
// }
// for _, tt := range tests {
// t.Run(tt.Scenario, func(t *testing.T) {
// // Given.
// c := clock.NewMock()
// c.Set(tt.GivenDetail.now)
// clk = c // Set global clock.
// defer func() { clk = clock.New() }()
// // Create test directories if needed
// if len(tt.GivenDetail.blocksOnDisk) > 0 {
// for _, blockDir := range tt.GivenDetail.blocksOnDisk {
// err := os.MkdirAll(tt.WhenDetail.app.DB.Dir+"/"+blockDir, 0755)
// Expect(err).To(BeNil())
// }
// }
// // When.
// var calls int
// var blockCalls []*mock.Mock
// blockFactory := block.NewFactory(func(obj *mock.Mock) {
// if calls < len(tt.ThenExpected.blockCalls) {
// tt.ThenExpected.blockCalls[calls](obj)
// calls++
// blockCalls = append(blockCalls, obj)
// }
// })
// s, err := new(tt.WhenDetail.app, blockFactory)
// defer os.RemoveAll(tt.WhenDetail.app.DB.Dir)
// // Then.
// Expect(err).To(BeNil())
// Expect(s).NotTo(BeNil())
// storage := s.(*storage)
// Expect(storage.config).To(Equal(tt.ThenExpected.storage.config))
// Expect(len(storage.hot.blocks)).To(Equal(tt.ThenExpected.storageHotLen))
// Expect(len(storage.cold.blocks)).To(Equal(tt.ThenExpected.storageColdLen))
// for _, call := range blockCalls {
// call.AssertExpectations(t)
// }
// })
// }
// }
// func TestAppend(t *testing.T) {
// RegisterTestingT(t)
// type givenDetail struct {
// hotBlocks []func(m *mock.Mock)
// coldBlocks []func(m *mock.Mock)
// }
// type whenDetail struct {
// feeds []*chunk.Feed
// }
// type thenExpected struct {
// err string
// }
// tests := []test.Case[givenDetail, whenDetail, thenExpected]{
// {
// Scenario: "Append feeds to hot block",
// Given: "a storage with one hot block",
// When: "append feeds within hot block time range",
// Then: "should append feeds to hot block successfully",
// GivenDetail: givenDetail{
// hotBlocks: []func(m *mock.Mock){
// func(m *mock.Mock) {
// m.On("Start").Return(timeutil.MustParse("2025-03-02T10:00:00Z")).Twice()
// m.On("End").Return(timeutil.MustParse("2025-03-03T10:00:00Z")).Twice()
// m.On("State").Return(block.StateHot).Twice()
// m.On("Append", mock.Anything, []*chunk.Feed{
// {ID: 1, Time: timeutil.MustParse("2025-03-02T11:00:00Z")},
// {ID: 2, Time: timeutil.MustParse("2025-03-02T12:00:00Z")},
// }).Return(nil)
// },
// },
// },
// WhenDetail: whenDetail{
// feeds: []*chunk.Feed{
// {ID: 1, Time: timeutil.MustParse("2025-03-02T11:00:00Z")},
// {ID: 2, Time: timeutil.MustParse("2025-03-02T12:00:00Z")},
// },
// },
// ThenExpected: thenExpected{
// err: "",
// },
// },
// {
// Scenario: "Append feeds to non-hot block",
// Given: "a storage with hot and cold blocks",
// When: "append feeds with time in cold block range",
// Then: "should return error",
// GivenDetail: givenDetail{
// coldBlocks: []func(m *mock.Mock){
// func(m *mock.Mock) {},
// },
// },
// WhenDetail: whenDetail{
// feeds: []*chunk.Feed{
// {ID: 1, Time: timeutil.MustParse("2025-03-01T11:00:00Z")},
// },
// },
// ThenExpected: thenExpected{
// err: "cannot find hot block",
// },
// },
// }
// for _, tt := range tests {
// t.Run(tt.Scenario, func(t *testing.T) {
// // Given.
// calls := 0
// var blockMocks []*mock.Mock
// blockFactory := block.NewFactory(func(obj *mock.Mock) {
// if calls < len(tt.GivenDetail.hotBlocks) {
// tt.GivenDetail.hotBlocks[calls](obj)
// calls++
// blockMocks = append(blockMocks, obj)
// }
// })
// var hotBlocks blockChain
// for range tt.GivenDetail.hotBlocks {
// block, err := blockFactory.New(nil, nil, nil, nil, nil)
// Expect(err).To(BeNil())
// hotBlocks.add(block)
// }
// blockFactory = block.NewFactory(func(obj *mock.Mock) {
// if calls < len(tt.GivenDetail.coldBlocks) {
// tt.GivenDetail.coldBlocks[calls](obj)
// calls++
// blockMocks = append(blockMocks, obj)
// }
// })
// var coldBlocks blockChain
// for range tt.GivenDetail.coldBlocks {
// block, err := blockFactory.New(nil, nil, nil, nil, nil)
// Expect(err).To(BeNil())
// coldBlocks.add(block)
// }
// s := storage{
// hot: &hotBlocks,
// cold: &coldBlocks,
// }
// // When.
// err := s.Append(context.Background(), tt.WhenDetail.feeds...)
// // Then.
// if tt.ThenExpected.err != "" {
// Expect(err.Error()).To(ContainSubstring(tt.ThenExpected.err))
// } else {
// Expect(err).To(BeNil())
// }
// for _, m := range blockMocks {
// m.AssertExpectations(t)
// }
// })
// }
// }
// func TestQuery(t *testing.T) {
// RegisterTestingT(t)
// type givenDetail struct {
// hotBlocks []func(m *mock.Mock)
// coldBlocks []func(m *mock.Mock)
// }
// type whenDetail struct {
// query block.QueryOptions
// }
// type thenExpected struct {
// feeds []*block.FeedVO
// err string
// }
// tests := []test.Case[givenDetail, whenDetail, thenExpected]{
// {
// Scenario: "Query feeds from hot blocks",
// Given: "a storage with one hot block containing feeds",
// When: "querying with time range within hot block",
// Then: "should return matching feeds from hot block",
// GivenDetail: givenDetail{
// hotBlocks: []func(m *mock.Mock){
// func(m *mock.Mock) {
// m.On("Start").Return(timeutil.MustParse("2025-03-02T10:00:00Z")).Once()
// m.On("End").Return(timeutil.MustParse("2025-03-03T10:00:00Z")).Once()
// m.On("Query", mock.Anything, mock.MatchedBy(func(q block.QueryOptions) bool {
// return q.Start.Equal(timeutil.MustParse("2025-03-02T12:00:00Z")) &&
// q.End.Equal(timeutil.MustParse("2025-03-02T14:00:00Z"))
// })).Return([]*block.FeedVO{
// {ID: 1, Time: timeutil.MustParse("2025-03-02T12:30:00Z")},
// {ID: 2, Time: timeutil.MustParse("2025-03-02T13:00:00Z")},
// }, nil)
// },
// },
// },
// WhenDetail: whenDetail{
// query: block.QueryOptions{
// Start: timeutil.MustParse("2025-03-02T12:00:00Z"),
// End: timeutil.MustParse("2025-03-02T14:00:00Z"),
// Limit: 10,
// },
// },
// ThenExpected: thenExpected{
// feeds: []*block.FeedVO{
// {ID: 2, Time: timeutil.MustParse("2025-03-02T13:00:00Z")},
// {ID: 1, Time: timeutil.MustParse("2025-03-02T12:30:00Z")},
// },
// err: "",
// },
// },
// {
// Scenario: "Query feeds from multiple blocks",
// Given: "a storage with hot and cold blocks containing feeds",
// When: "querying with time range spanning multiple blocks",
// Then: "should return combined and sorted feeds from all matching blocks",
// GivenDetail: givenDetail{
// hotBlocks: []func(m *mock.Mock){
// func(m *mock.Mock) {
// m.On("Start").Return(timeutil.MustParse("2025-03-02T10:00:00Z"))
// m.On("End").Return(timeutil.MustParse("2025-03-03T10:00:00Z"))
// m.On("Query", mock.Anything, mock.MatchedBy(func(q block.QueryOptions) bool {
// return !q.Start.IsZero() && q.End.IsZero()
// })).Return([]*block.FeedVO{
// {ID: 3, Time: timeutil.MustParse("2025-03-02T15:00:00Z")},
// {ID: 4, Time: timeutil.MustParse("2025-03-02T16:00:00Z")},
// }, nil)
// },
// },
// coldBlocks: []func(m *mock.Mock){
// func(m *mock.Mock) {
// m.On("Start").Return(timeutil.MustParse("2025-03-01T10:00:00Z"))
// m.On("End").Return(timeutil.MustParse("2025-03-02T10:00:00Z"))
// m.On("Query", mock.Anything, mock.MatchedBy(func(q block.QueryOptions) bool {
// return !q.Start.IsZero() && q.End.IsZero()
// })).Return([]*block.FeedVO{
// {ID: 1, Time: timeutil.MustParse("2025-03-01T15:00:00Z")},
// {ID: 2, Time: timeutil.MustParse("2025-03-01T16:00:00Z")},
// }, nil)
// },
// },
// },
// WhenDetail: whenDetail{
// query: block.QueryOptions{
// Start: timeutil.MustParse("2025-03-01T12:00:00Z"),
// Limit: 3,
// },
// },
// ThenExpected: thenExpected{
// feeds: []*block.FeedVO{
// {ID: 4, Time: timeutil.MustParse("2025-03-02T16:00:00Z")},
// {ID: 3, Time: timeutil.MustParse("2025-03-02T15:00:00Z")},
// {ID: 2, Time: timeutil.MustParse("2025-03-01T16:00:00Z")},
// },
// err: "",
// },
// },
// }
// for _, tt := range tests {
// t.Run(tt.Scenario, func(t *testing.T) {
// // Given.
// calls := 0
// var blockMocks []*mock.Mock
// blockFactory := block.NewFactory(func(obj *mock.Mock) {
// if calls < len(tt.GivenDetail.hotBlocks) {
// tt.GivenDetail.hotBlocks[calls](obj)
// calls++
// blockMocks = append(blockMocks, obj)
// }
// })
// var hotBlocks blockChain
// for range tt.GivenDetail.hotBlocks {
// block, err := blockFactory.New(nil, nil, nil, nil, nil)
// Expect(err).To(BeNil())
// hotBlocks.add(block)
// }
// blockFactory = block.NewFactory(func(obj *mock.Mock) {
// if calls < len(tt.GivenDetail.hotBlocks)+len(tt.GivenDetail.coldBlocks) {
// tt.GivenDetail.coldBlocks[calls-len(tt.GivenDetail.hotBlocks)](obj)
// calls++
// blockMocks = append(blockMocks, obj)
// }
// })
// var coldBlocks blockChain
// for range tt.GivenDetail.coldBlocks {
// block, err := blockFactory.New(nil, nil, nil, nil, nil)
// Expect(err).To(BeNil())
// coldBlocks.add(block)
// }
// s := storage{
// hot: &hotBlocks,
// cold: &coldBlocks,
// }
// // When.
// feeds, err := s.Query(context.Background(), tt.WhenDetail.query)
// // Then.
// if tt.ThenExpected.err != "" {
// Expect(err).NotTo(BeNil())
// Expect(err.Error()).To(ContainSubstring(tt.ThenExpected.err))
// } else {
// Expect(err).To(BeNil())
// Expect(feeds).To(HaveLen(len(tt.ThenExpected.feeds)))
// // Check feeds match expected
// for i, feed := range feeds {
// Expect(feed.ID).To(Equal(tt.ThenExpected.feeds[i].ID))
// Expect(feed.Time).To(Equal(tt.ThenExpected.feeds[i].Time))
// Expect(feed.Labels).To(Equal(tt.ThenExpected.feeds[i].Labels))
// }
// }
// for _, m := range blockMocks {
// m.AssertExpectations(t)
// }
// })
// }
// }