init
This commit is contained in:
158
pkg/llm/embedding_spliter_test.go
Normal file
158
pkg/llm/embedding_spliter_test.go
Normal file
@@ -0,0 +1,158 @@
|
||||
// Copyright (C) 2025 wangyusong
|
||||
//
|
||||
// This program is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU Affero General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU Affero General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU Affero General Public License
|
||||
// along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
|
||||
package llm
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
. "github.com/onsi/gomega"
|
||||
|
||||
"github.com/glidea/zenfeed/pkg/model"
|
||||
"github.com/glidea/zenfeed/pkg/test"
|
||||
)
|
||||
|
||||
func TestEmbeddingSpliter_Split(t *testing.T) {
|
||||
RegisterTestingT(t)
|
||||
|
||||
type givenDetail struct {
|
||||
maxLabelValueTokens int
|
||||
overlapTokens int
|
||||
}
|
||||
type whenDetail struct {
|
||||
labels model.Labels
|
||||
}
|
||||
type thenExpected struct {
|
||||
splits []model.Labels
|
||||
err string
|
||||
}
|
||||
|
||||
tests := []test.Case[givenDetail, whenDetail, thenExpected]{
|
||||
{
|
||||
Scenario: "Split labels with all short values",
|
||||
Given: "an embedding spliter with max token limit",
|
||||
When: "splitting labels with all values under token limit",
|
||||
Then: "should return original labels as single split",
|
||||
GivenDetail: givenDetail{
|
||||
maxLabelValueTokens: 1024,
|
||||
},
|
||||
WhenDetail: whenDetail{
|
||||
labels: model.Labels{
|
||||
{Key: "title", Value: "Short title"},
|
||||
{Key: "description", Value: "Short description"},
|
||||
},
|
||||
},
|
||||
ThenExpected: thenExpected{
|
||||
splits: []model.Labels{
|
||||
{
|
||||
{Key: "title", Value: "Short title"},
|
||||
{Key: "description", Value: "Short description"},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
Scenario: "Split labels with one long value",
|
||||
Given: "an embedding spliter with max token limit",
|
||||
When: "splitting labels with one value exceeding token limit",
|
||||
Then: "should split the long value and combine with common labels",
|
||||
GivenDetail: givenDetail{
|
||||
maxLabelValueTokens: 10, // Small limit to force splitting.
|
||||
overlapTokens: 1,
|
||||
},
|
||||
WhenDetail: whenDetail{
|
||||
labels: model.Labels{
|
||||
{Key: "title", Value: "Short title"},
|
||||
{Key: "content", Value: "This is a long content that exceeds the token limit and needs to be split into multiple parts"},
|
||||
},
|
||||
},
|
||||
ThenExpected: thenExpected{
|
||||
splits: []model.Labels{
|
||||
{
|
||||
{Key: "title", Value: "Short title"},
|
||||
{Key: "content", Value: "This is a long content that exceeds the "},
|
||||
},
|
||||
{
|
||||
{Key: "title", Value: "Short title"},
|
||||
{Key: "content", Value: "the token limit and needs to be split in"},
|
||||
},
|
||||
{
|
||||
{Key: "title", Value: "Short title"},
|
||||
{Key: "content", Value: "t into multiple parts"},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
Scenario: "Handle non-Latin characters",
|
||||
Given: "an embedding spliter with max token limit",
|
||||
When: "splitting labels with non-Latin characters",
|
||||
Then: "should correctly estimate tokens and split accordingly",
|
||||
GivenDetail: givenDetail{
|
||||
maxLabelValueTokens: 10, // Small limit to force splitting.
|
||||
overlapTokens: 2,
|
||||
},
|
||||
WhenDetail: whenDetail{
|
||||
labels: model.Labels{
|
||||
{Key: "title", Value: "Short title"},
|
||||
{Key: "content", Value: "中文内容需要被分割因为它超过了令牌限制"}, // Chinese content that needs to be split.
|
||||
},
|
||||
},
|
||||
ThenExpected: thenExpected{
|
||||
splits: []model.Labels{
|
||||
{
|
||||
{Key: "title", Value: "Short title"},
|
||||
{Key: "content", Value: "中文内容需要"},
|
||||
},
|
||||
{
|
||||
{Key: "title", Value: "Short title"},
|
||||
{Key: "content", Value: "要被分割因为"},
|
||||
},
|
||||
{
|
||||
{Key: "title", Value: "Short title"},
|
||||
{Key: "content", Value: "为它超过了令"},
|
||||
},
|
||||
{
|
||||
{Key: "title", Value: "Short title"},
|
||||
{Key: "content", Value: "令牌限制"},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.Scenario, func(t *testing.T) {
|
||||
// Given.
|
||||
spliter := newEmbeddingSpliter(tt.GivenDetail.maxLabelValueTokens, tt.GivenDetail.overlapTokens)
|
||||
|
||||
// When.
|
||||
splits, err := spliter.Split(tt.WhenDetail.labels)
|
||||
|
||||
// Then.
|
||||
if tt.ThenExpected.err != "" {
|
||||
Expect(err).NotTo(BeNil())
|
||||
Expect(err.Error()).To(ContainSubstring(tt.ThenExpected.err))
|
||||
} else {
|
||||
Expect(err).To(BeNil())
|
||||
Expect(len(splits)).To(Equal(len(tt.ThenExpected.splits)))
|
||||
|
||||
for i, expectedSplit := range tt.ThenExpected.splits {
|
||||
Expect(splits[i]).To(Equal(expectedSplit))
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user