chatgpt-on-wechat/agent/memory/chunker.py

"""
Text chunking utilities for memory

Splits text into chunks with token limits and overlap
"""

from __future__ import annotations
from typing import List, Tuple
from dataclasses import dataclass


@dataclass
class TextChunk:
    """Represents a text chunk with line numbers"""
    text: str
    start_line: int
    end_line: int


class TextChunker:
    """Chunks text by line count with token estimation"""

    def __init__(self, max_tokens: int = 500, overlap_tokens: int = 50):
        """
        Initialize chunker

        Args:
            max_tokens: Maximum tokens per chunk
            overlap_tokens: Overlap tokens between chunks
        """
        self.max_tokens = max_tokens
        self.overlap_tokens = overlap_tokens
        # Rough estimation: ~4 chars per token for English/Chinese mixed
        self.chars_per_token = 4

    def chunk_text(self, text: str) -> List[TextChunk]:
        """
        Chunk text into overlapping segments

        Args:
            text: Input text to chunk

        Returns:
            List of TextChunk objects
        """
        if not text.strip():
            return []

        lines = text.split('\n')
        chunks = []

        max_chars = self.max_tokens * self.chars_per_token
        overlap_chars = self.overlap_tokens * self.chars_per_token

        current_chunk = []
        current_chars = 0
        start_line = 1

        for i, line in enumerate(lines, start=1):
            line_chars = len(line)

            # If single line exceeds max, split it
            if line_chars > max_chars:
                # Save current chunk if exists
                if current_chunk:
                    chunks.append(TextChunk(
                        text='\n'.join(current_chunk),
                        start_line=start_line,
                        end_line=i - 1
                    ))
                    current_chunk = []
                    current_chars = 0

                # Split long line into multiple chunks
                for sub_chunk in self._split_long_line(line, max_chars):
                    chunks.append(TextChunk(
                        text=sub_chunk,
                        start_line=i,
                        end_line=i
                    ))

                start_line = i + 1
                continue

            # Check if adding this line would exceed limit
            if current_chars + line_chars > max_chars and current_chunk:
                # Save current chunk
                chunks.append(TextChunk(
                    text='\n'.join(current_chunk),
                    start_line=start_line,
                    end_line=i - 1
                ))

                # Start new chunk with overlap
                overlap_lines = self._get_overlap_lines(current_chunk, overlap_chars)
                current_chunk = overlap_lines + [line]
                current_chars = sum(len(l) for l in current_chunk)
                start_line = i - len(overlap_lines)
            else:
                # Add line to current chunk
                current_chunk.append(line)
                current_chars += line_chars

        # Save last chunk
        if current_chunk:
            chunks.append(TextChunk(
                text='\n'.join(current_chunk),
                start_line=start_line,
                end_line=len(lines)
            ))

        return chunks

    def _split_long_line(self, line: str, max_chars: int) -> List[str]:
        """Split a single long line into multiple chunks"""
        chunks = []
        for i in range(0, len(line), max_chars):
            chunks.append(line[i:i + max_chars])
        return chunks

    def _get_overlap_lines(self, lines: List[str], target_chars: int) -> List[str]:
        """Get last few lines that fit within target_chars for overlap"""
        overlap = []
        chars = 0

        for line in reversed(lines):
            line_chars = len(line)
            if chars + line_chars > target_chars:
                break
            overlap.insert(0, line)
            chars += line_chars

        return overlap

    def chunk_markdown(self, text: str) -> List[TextChunk]:
        """
        Chunk markdown text while respecting structure
        (For future enhancement: respect markdown sections)
        """
        return self.chunk_text(text)