Files
zenfeed/pkg/llm/embedding_spliter_test.go
glidea 8b33df8a05 init
2025-04-19 15:50:26 +08:00

159 lines
4.6 KiB
Go

// Copyright (C) 2025 wangyusong
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.
package llm
import (
"testing"
. "github.com/onsi/gomega"
"github.com/glidea/zenfeed/pkg/model"
"github.com/glidea/zenfeed/pkg/test"
)
func TestEmbeddingSpliter_Split(t *testing.T) {
RegisterTestingT(t)
type givenDetail struct {
maxLabelValueTokens int
overlapTokens int
}
type whenDetail struct {
labels model.Labels
}
type thenExpected struct {
splits []model.Labels
err string
}
tests := []test.Case[givenDetail, whenDetail, thenExpected]{
{
Scenario: "Split labels with all short values",
Given: "an embedding spliter with max token limit",
When: "splitting labels with all values under token limit",
Then: "should return original labels as single split",
GivenDetail: givenDetail{
maxLabelValueTokens: 1024,
},
WhenDetail: whenDetail{
labels: model.Labels{
{Key: "title", Value: "Short title"},
{Key: "description", Value: "Short description"},
},
},
ThenExpected: thenExpected{
splits: []model.Labels{
{
{Key: "title", Value: "Short title"},
{Key: "description", Value: "Short description"},
},
},
},
},
{
Scenario: "Split labels with one long value",
Given: "an embedding spliter with max token limit",
When: "splitting labels with one value exceeding token limit",
Then: "should split the long value and combine with common labels",
GivenDetail: givenDetail{
maxLabelValueTokens: 10, // Small limit to force splitting.
overlapTokens: 1,
},
WhenDetail: whenDetail{
labels: model.Labels{
{Key: "title", Value: "Short title"},
{Key: "content", Value: "This is a long content that exceeds the token limit and needs to be split into multiple parts"},
},
},
ThenExpected: thenExpected{
splits: []model.Labels{
{
{Key: "title", Value: "Short title"},
{Key: "content", Value: "This is a long content that exceeds the "},
},
{
{Key: "title", Value: "Short title"},
{Key: "content", Value: "the token limit and needs to be split in"},
},
{
{Key: "title", Value: "Short title"},
{Key: "content", Value: "t into multiple parts"},
},
},
},
},
{
Scenario: "Handle non-Latin characters",
Given: "an embedding spliter with max token limit",
When: "splitting labels with non-Latin characters",
Then: "should correctly estimate tokens and split accordingly",
GivenDetail: givenDetail{
maxLabelValueTokens: 10, // Small limit to force splitting.
overlapTokens: 2,
},
WhenDetail: whenDetail{
labels: model.Labels{
{Key: "title", Value: "Short title"},
{Key: "content", Value: "中文内容需要被分割因为它超过了令牌限制"}, // Chinese content that needs to be split.
},
},
ThenExpected: thenExpected{
splits: []model.Labels{
{
{Key: "title", Value: "Short title"},
{Key: "content", Value: "中文内容需要"},
},
{
{Key: "title", Value: "Short title"},
{Key: "content", Value: "要被分割因为"},
},
{
{Key: "title", Value: "Short title"},
{Key: "content", Value: "为它超过了令"},
},
{
{Key: "title", Value: "Short title"},
{Key: "content", Value: "令牌限制"},
},
},
},
},
}
for _, tt := range tests {
t.Run(tt.Scenario, func(t *testing.T) {
// Given.
spliter := newEmbeddingSpliter(tt.GivenDetail.maxLabelValueTokens, tt.GivenDetail.overlapTokens)
// When.
splits, err := spliter.Split(tt.WhenDetail.labels)
// Then.
if tt.ThenExpected.err != "" {
Expect(err).NotTo(BeNil())
Expect(err.Error()).To(ContainSubstring(tt.ThenExpected.err))
} else {
Expect(err).To(BeNil())
Expect(len(splits)).To(Equal(len(tt.ThenExpected.splits)))
for i, expectedSplit := range tt.ThenExpected.splits {
Expect(splits[i]).To(Equal(expectedSplit))
}
}
})
}
}