""" Text chunking utilities for memory Splits text into chunks with token limits and overlap """ from __future__ import annotations from typing import List, Tuple from dataclasses import dataclass @dataclass class TextChunk: """Represents a text chunk with line numbers""" text: str start_line: int end_line: int class TextChunker: """Chunks text by line count with token estimation""" def __init__(self, max_tokens: int = 500, overlap_tokens: int = 50): """ Initialize chunker Args: max_tokens: Maximum tokens per chunk overlap_tokens: Overlap tokens between chunks """ self.max_tokens = max_tokens self.overlap_tokens = overlap_tokens # Rough estimation: ~4 chars per token for English/Chinese mixed self.chars_per_token = 4 def chunk_text(self, text: str) -> List[TextChunk]: """ Chunk text into overlapping segments Args: text: Input text to chunk Returns: List of TextChunk objects """ if not text.strip(): return [] lines = text.split('\n') chunks = [] max_chars = self.max_tokens * self.chars_per_token overlap_chars = self.overlap_tokens * self.chars_per_token current_chunk = [] current_chars = 0 start_line = 1 for i, line in enumerate(lines, start=1): line_chars = len(line) # If single line exceeds max, split it if line_chars > max_chars: # Save current chunk if exists if current_chunk: chunks.append(TextChunk( text='\n'.join(current_chunk), start_line=start_line, end_line=i - 1 )) current_chunk = [] current_chars = 0 # Split long line into multiple chunks for sub_chunk in self._split_long_line(line, max_chars): chunks.append(TextChunk( text=sub_chunk, start_line=i, end_line=i )) start_line = i + 1 continue # Check if adding this line would exceed limit if current_chars + line_chars > max_chars and current_chunk: # Save current chunk chunks.append(TextChunk( text='\n'.join(current_chunk), start_line=start_line, end_line=i - 1 )) # Start new chunk with overlap overlap_lines = self._get_overlap_lines(current_chunk, overlap_chars) current_chunk = overlap_lines + [line] current_chars = sum(len(l) for l in current_chunk) start_line = i - len(overlap_lines) else: # Add line to current chunk current_chunk.append(line) current_chars += line_chars # Save last chunk if current_chunk: chunks.append(TextChunk( text='\n'.join(current_chunk), start_line=start_line, end_line=len(lines) )) return chunks def _split_long_line(self, line: str, max_chars: int) -> List[str]: """Split a single long line into multiple chunks""" chunks = [] for i in range(0, len(line), max_chars): chunks.append(line[i:i + max_chars]) return chunks def _get_overlap_lines(self, lines: List[str], target_chars: int) -> List[str]: """Get last few lines that fit within target_chars for overlap""" overlap = [] chars = 0 for line in reversed(lines): line_chars = len(line) if chars + line_chars > target_chars: break overlap.insert(0, line) chars += line_chars return overlap def chunk_markdown(self, text: str) -> List[TextChunk]: """ Chunk markdown text while respecting structure (For future enhancement: respect markdown sections) """ return self.chunk_text(text)