mirror of
https://github.com/zhayujie/chatgpt-on-wechat.git
synced 2026-02-13 02:49:08 +08:00
141 lines
4.4 KiB
Python
141 lines
4.4 KiB
Python
"""
|
|
Text chunking utilities for memory
|
|
|
|
Splits text into chunks with token limits and overlap
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
from typing import List, Tuple
|
|
from dataclasses import dataclass
|
|
|
|
|
|
@dataclass
class TextChunk:
    """Represents a text chunk with line numbers

    Attributes:
        text: Chunk content; source lines joined with a newline character
        start_line: 1-based number of the first source line in the chunk
        end_line: 1-based number of the last source line in the chunk (inclusive)
    """
    text: str
    start_line: int
    end_line: int


class TextChunker:
    """Chunks text by line count with token estimation

    Tokens are never counted directly. Budgets are converted to character
    budgets using the fixed ``chars_per_token`` ratio, and only the lengths
    of the lines themselves are counted (the newline separators that join
    lines inside a chunk are not).
    """

    def __init__(self, max_tokens: int = 500, overlap_tokens: int = 50):
        """
        Initialize chunker

        Args:
            max_tokens: Maximum tokens per chunk
            overlap_tokens: Overlap tokens between chunks
        """
        self.max_tokens = max_tokens
        self.overlap_tokens = overlap_tokens
        # Rough estimation: ~4 chars per token for English/Chinese mixed
        self.chars_per_token = 4

    def chunk_text(self, text: str) -> List[TextChunk]:
        """
        Chunk text into overlapping segments

        Lines are accumulated until adding the next line would exceed the
        character budget; the accumulated lines are then emitted and the
        next chunk starts with trailing lines of the previous one as
        overlap. A single line longer than the budget is cut into
        fixed-size pieces, each emitted as its own chunk with the same
        line number and no overlap.

        Args:
            text: Input text to chunk

        Returns:
            List of TextChunk objects (empty if the text is blank)

        Raises:
            ValueError: If the character budget derived from ``max_tokens``
                is not positive and the text is not blank
        """
        if not text.strip():
            return []

        max_chars = self.max_tokens * self.chars_per_token
        if max_chars <= 0:
            # A zero budget used to fail inside range() with an opaque
            # message, and a negative one silently discarded every line.
            raise ValueError(
                "max_tokens and chars_per_token must be positive "
                f"(got max_tokens={self.max_tokens}, "
                f"chars_per_token={self.chars_per_token})"
            )
        overlap_chars = self.overlap_tokens * self.chars_per_token

        lines = text.split('\n')
        chunks: List[TextChunk] = []

        current_chunk: List[str] = []
        current_chars = 0
        start_line = 1

        for i, line in enumerate(lines, start=1):
            line_chars = len(line)

            # If single line exceeds max, split it
            if line_chars > max_chars:
                # Save current chunk if exists
                if current_chunk:
                    chunks.append(TextChunk(
                        text='\n'.join(current_chunk),
                        start_line=start_line,
                        end_line=i - 1,
                    ))
                    current_chunk = []
                    current_chars = 0

                # Split long line into multiple chunks
                chunks.extend(
                    TextChunk(text=piece, start_line=i, end_line=i)
                    for piece in self._split_long_line(line, max_chars)
                )

                start_line = i + 1
                continue

            # Check if adding this line would exceed limit
            if current_chunk and current_chars + line_chars > max_chars:
                # Save current chunk
                chunks.append(TextChunk(
                    text='\n'.join(current_chunk),
                    start_line=start_line,
                    end_line=i - 1,
                ))

                # Start new chunk with overlap. The overlap budget is capped
                # so that overlap + the incoming line still fits max_chars;
                # otherwise the new chunk could start out over the limit
                # (and grow without bound when overlap_tokens >= max_tokens).
                overlap_budget = min(overlap_chars, max_chars - line_chars)
                overlap_lines = self._get_overlap_lines(current_chunk, overlap_budget)
                current_chunk = overlap_lines + [line]
                current_chars = sum(len(item) for item in current_chunk)
                start_line = i - len(overlap_lines)
            else:
                # Add line to current chunk
                current_chunk.append(line)
                current_chars += line_chars

        # Save last chunk
        if current_chunk:
            chunks.append(TextChunk(
                text='\n'.join(current_chunk),
                start_line=start_line,
                end_line=len(lines),
            ))

        return chunks

    def _split_long_line(self, line: str, max_chars: int) -> List[str]:
        """Split a single long line into consecutive pieces of at most max_chars"""
        return [line[i:i + max_chars] for i in range(0, len(line), max_chars)]

    def _get_overlap_lines(self, lines: List[str], target_chars: int) -> List[str]:
        """Get last few lines that fit within target_chars for overlap

        Walks backwards from the end of ``lines`` and stops at the first
        line that would push the running total above ``target_chars``.
        The result keeps the original line order.
        """
        overlap: List[str] = []
        chars = 0

        for line in reversed(lines):
            line_chars = len(line)
            if chars + line_chars > target_chars:
                break
            # Collected newest-first; restored to source order below
            overlap.append(line)
            chars += line_chars

        overlap.reverse()
        return overlap

    def chunk_markdown(self, text: str) -> List[TextChunk]:
        """
        Chunk markdown text while respecting structure
        (For future enhancement: respect markdown sections)

        Currently delegates to chunk_text without markdown-specific handling.
        """
        return self.chunk_text(text)
|