159 lines
4.6 KiB
Go
159 lines
4.6 KiB
Go
// Copyright (C) 2025 wangyusong
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.
|
|
|
|
package llm
|
|
|
|
import (
|
|
"testing"
|
|
|
|
. "github.com/onsi/gomega"
|
|
|
|
"github.com/glidea/zenfeed/pkg/model"
|
|
"github.com/glidea/zenfeed/pkg/test"
|
|
)
|
|
|
|
func TestEmbeddingSpliter_Split(t *testing.T) {
|
|
RegisterTestingT(t)
|
|
|
|
type givenDetail struct {
|
|
maxLabelValueTokens int
|
|
overlapTokens int
|
|
}
|
|
type whenDetail struct {
|
|
labels model.Labels
|
|
}
|
|
type thenExpected struct {
|
|
splits []model.Labels
|
|
err string
|
|
}
|
|
|
|
tests := []test.Case[givenDetail, whenDetail, thenExpected]{
|
|
{
|
|
Scenario: "Split labels with all short values",
|
|
Given: "an embedding spliter with max token limit",
|
|
When: "splitting labels with all values under token limit",
|
|
Then: "should return original labels as single split",
|
|
GivenDetail: givenDetail{
|
|
maxLabelValueTokens: 1024,
|
|
},
|
|
WhenDetail: whenDetail{
|
|
labels: model.Labels{
|
|
{Key: "title", Value: "Short title"},
|
|
{Key: "description", Value: "Short description"},
|
|
},
|
|
},
|
|
ThenExpected: thenExpected{
|
|
splits: []model.Labels{
|
|
{
|
|
{Key: "title", Value: "Short title"},
|
|
{Key: "description", Value: "Short description"},
|
|
},
|
|
},
|
|
},
|
|
},
|
|
{
|
|
Scenario: "Split labels with one long value",
|
|
Given: "an embedding spliter with max token limit",
|
|
When: "splitting labels with one value exceeding token limit",
|
|
Then: "should split the long value and combine with common labels",
|
|
GivenDetail: givenDetail{
|
|
maxLabelValueTokens: 10, // Small limit to force splitting.
|
|
overlapTokens: 1,
|
|
},
|
|
WhenDetail: whenDetail{
|
|
labels: model.Labels{
|
|
{Key: "title", Value: "Short title"},
|
|
{Key: "content", Value: "This is a long content that exceeds the token limit and needs to be split into multiple parts"},
|
|
},
|
|
},
|
|
ThenExpected: thenExpected{
|
|
splits: []model.Labels{
|
|
{
|
|
{Key: "title", Value: "Short title"},
|
|
{Key: "content", Value: "This is a long content that exceeds the "},
|
|
},
|
|
{
|
|
{Key: "title", Value: "Short title"},
|
|
{Key: "content", Value: "the token limit and needs to be split in"},
|
|
},
|
|
{
|
|
{Key: "title", Value: "Short title"},
|
|
{Key: "content", Value: "t into multiple parts"},
|
|
},
|
|
},
|
|
},
|
|
},
|
|
{
|
|
Scenario: "Handle non-Latin characters",
|
|
Given: "an embedding spliter with max token limit",
|
|
When: "splitting labels with non-Latin characters",
|
|
Then: "should correctly estimate tokens and split accordingly",
|
|
GivenDetail: givenDetail{
|
|
maxLabelValueTokens: 10, // Small limit to force splitting.
|
|
overlapTokens: 2,
|
|
},
|
|
WhenDetail: whenDetail{
|
|
labels: model.Labels{
|
|
{Key: "title", Value: "Short title"},
|
|
{Key: "content", Value: "中文内容需要被分割因为它超过了令牌限制"}, // Chinese content that needs to be split.
|
|
},
|
|
},
|
|
ThenExpected: thenExpected{
|
|
splits: []model.Labels{
|
|
{
|
|
{Key: "title", Value: "Short title"},
|
|
{Key: "content", Value: "中文内容需要"},
|
|
},
|
|
{
|
|
{Key: "title", Value: "Short title"},
|
|
{Key: "content", Value: "要被分割因为"},
|
|
},
|
|
{
|
|
{Key: "title", Value: "Short title"},
|
|
{Key: "content", Value: "为它超过了令"},
|
|
},
|
|
{
|
|
{Key: "title", Value: "Short title"},
|
|
{Key: "content", Value: "令牌限制"},
|
|
},
|
|
},
|
|
},
|
|
},
|
|
}
|
|
|
|
for _, tt := range tests {
|
|
t.Run(tt.Scenario, func(t *testing.T) {
|
|
// Given.
|
|
spliter := newEmbeddingSpliter(tt.GivenDetail.maxLabelValueTokens, tt.GivenDetail.overlapTokens)
|
|
|
|
// When.
|
|
splits, err := spliter.Split(tt.WhenDetail.labels)
|
|
|
|
// Then.
|
|
if tt.ThenExpected.err != "" {
|
|
Expect(err).NotTo(BeNil())
|
|
Expect(err.Error()).To(ContainSubstring(tt.ThenExpected.err))
|
|
} else {
|
|
Expect(err).To(BeNil())
|
|
Expect(len(splits)).To(Equal(len(tt.ThenExpected.splits)))
|
|
|
|
for i, expectedSplit := range tt.ThenExpected.splits {
|
|
Expect(splits[i]).To(Equal(expectedSplit))
|
|
}
|
|
}
|
|
})
|
|
}
|
|
}
|