init

2025-04-19 15:50:26 +08:00
commit 8b33df8a05
109 changed files with 24407 additions and 0 deletions
--- a/pkg/llm/embedding_spliter_test.go
+++ b/pkg/llm/embedding_spliter_test.go
@@ -0,0 +1,158 @@
+// Copyright (C) 2025 wangyusong
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Affero General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Affero General Public License for more details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+package llm
+
+import (
+	"testing"
+
+	. "github.com/onsi/gomega"
+
+	"github.com/glidea/zenfeed/pkg/model"
+	"github.com/glidea/zenfeed/pkg/test"
+)
+
+// TestEmbeddingSpliter_Split verifies that Split keeps short label values in a single
+// split and cuts an over-limit value into overlapping chunks that repeat the other labels.
+func TestEmbeddingSpliter_Split(t *testing.T) {
+	type givenDetail struct {
+		maxLabelValueTokens int
+		overlapTokens       int
+	}
+	type whenDetail struct {
+		labels model.Labels
+	}
+	type thenExpected struct {
+		splits []model.Labels
+		err    string
+	}
+
+	tests := []test.Case[givenDetail, whenDetail, thenExpected]{
+		{
+			Scenario: "Split labels with all short values",
+			Given:    "an embedding spliter with max token limit",
+			When:     "splitting labels with all values under token limit",
+			Then:     "should return original labels as single split",
+			GivenDetail: givenDetail{
+				maxLabelValueTokens: 1024,
+			},
+			WhenDetail: whenDetail{
+				labels: model.Labels{
+					{Key: "title", Value: "Short title"},
+					{Key: "description", Value: "Short description"},
+				},
+			},
+			ThenExpected: thenExpected{
+				splits: []model.Labels{
+					{
+						{Key: "title", Value: "Short title"},
+						{Key: "description", Value: "Short description"},
+					},
+				},
+			},
+		},
+		{
+			Scenario: "Split labels with one long value",
+			Given:    "an embedding spliter with max token limit",
+			When:     "splitting labels with one value exceeding token limit",
+			Then:     "should split the long value and combine with common labels",
+			GivenDetail: givenDetail{
+				maxLabelValueTokens: 10, // Small limit to force splitting.
+				overlapTokens:       1,
+			},
+			WhenDetail: whenDetail{
+				labels: model.Labels{
+					{Key: "title", Value: "Short title"},
+					{Key: "content", Value: "This is a long content that exceeds the token limit and needs to be split into multiple parts"},
+				},
+			},
+			ThenExpected: thenExpected{
+				splits: []model.Labels{
+					{
+						{Key: "title", Value: "Short title"},
+						{Key: "content", Value: "This is a long content that exceeds the "},
+					},
+					{
+						{Key: "title", Value: "Short title"},
+						{Key: "content", Value: "the token limit and needs to be split in"},
+					},
+					{
+						{Key: "title", Value: "Short title"},
+						{Key: "content", Value: "t into multiple parts"},
+					},
+				},
+			},
+		},
+		{
+			Scenario: "Handle non-Latin characters",
+			Given:    "an embedding spliter with max token limit",
+			When:     "splitting labels with non-Latin characters",
+			Then:     "should correctly estimate tokens and split accordingly",
+			GivenDetail: givenDetail{
+				maxLabelValueTokens: 10, // Small limit to force splitting.
+				overlapTokens:       2,
+			},
+			WhenDetail: whenDetail{
+				labels: model.Labels{
+					{Key: "title", Value: "Short title"},
+					{Key: "content", Value: "中文内容需要被分割因为它超过了令牌限制"}, // Chinese content that needs to be split.
+				},
+			},
+			ThenExpected: thenExpected{
+				splits: []model.Labels{
+					{
+						{Key: "title", Value: "Short title"},
+						{Key: "content", Value: "中文内容需要"},
+					},
+					{
+						{Key: "title", Value: "Short title"},
+						{Key: "content", Value: "要被分割因为"},
+					},
+					{
+						{Key: "title", Value: "Short title"},
+						{Key: "content", Value: "为它超过了令"},
+					},
+					{
+						{Key: "title", Value: "Short title"},
+						{Key: "content", Value: "令牌限制"},
+					},
+				},
+			},
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.Scenario, func(t *testing.T) {
+			g := NewWithT(t) // Fail this subtest's t, never the parent test's t.
+
+			// Given.
+			spliter := newEmbeddingSpliter(tt.GivenDetail.maxLabelValueTokens, tt.GivenDetail.overlapTokens)
+
+			// When.
+			splits, err := spliter.Split(tt.WhenDetail.labels)
+
+			// Then.
+			if tt.ThenExpected.err != "" {
+				g.Expect(err).To(MatchError(ContainSubstring(tt.ThenExpected.err)))
+				return
+			}
+			g.Expect(err).NotTo(HaveOccurred())
+			g.Expect(splits).To(HaveLen(len(tt.ThenExpected.splits)))
+			for i, expectedSplit := range tt.ThenExpected.splits {
+				g.Expect(splits[i]).To(Equal(expectedSplit))
+			}
+		})
+	}
+}