# chatgpt-on-wechat/agent/tools/utils/diff.py

"""
Diff tools for file editing
Provides fuzzy matching and diff generation functionality
"""

import difflib
import re
from typing import Optional, Tuple


def strip_bom(text: str) -> Tuple[str, str]:
    """
    Split a leading BOM (Byte Order Mark, U+FEFF) off the text

    :param text: Text that may start with a BOM
    :return: (BOM, remaining text); the BOM part is '' when the text
             does not start with one
    """
    bom = '\ufeff'
    # Length of the prefix to split off: the whole BOM, or nothing
    prefix_len = len(bom) if text.startswith(bom) else 0
    return text[:prefix_len], text[prefix_len:]


def detect_line_ending(text: str) -> str:
    """
    Report which line ending the text uses

    CRLF is reported as soon as at least one CRLF sequence occurs anywhere
    in the text; every other input (including text without any line break)
    is reported as LF.

    :param text: Text content to inspect
    :return: The CRLF sequence or the LF character
    """
    crlf, lf = '\r\n', '\n'
    return crlf if crlf in text else lf


def normalize_to_lf(text: str) -> str:
    """
    Convert every line ending in the text to LF

    Both CRLF pairs and lone CR characters become a single LF.

    :param text: Text with arbitrary line endings
    :return: Text that only uses LF line endings
    """
    # A CR optionally followed by LF is one line break; a CRLF pair is
    # consumed as a unit so it yields exactly one LF
    return re.sub(r'\r\n?', '\n', text)


def restore_line_endings(text: str, original_ending: str) -> str:
    """
    Re-apply the original line ending to LF-normalized text

    :param text: Text whose line endings are LF
    :param original_ending: Line ending to restore
    :return: Text with every LF turned into CRLF when the original ending
             is CRLF; otherwise the text unchanged
    """
    if original_ending != '\r\n':
        # Any other ending leaves the LF-normalized text untouched
        return text
    return '\r\n'.join(text.split('\n'))


def normalize_for_fuzzy_match(text: str) -> str:
    """
    Normalize whitespace so that texts differing only in spacing compare equal

    Steps, in order:
    1. every run of spaces/tabs becomes a single space (this includes
       leading indentation, which therefore shrinks to at most one space);
    2. spaces directly before a newline are dropped (a trailing space on
       the final line, which has no newline after it, is kept);
    3. per line, leading whitespace characters are replaced by the same
       number of spaces, and whitespace-only lines become empty.

    :param text: Original text
    :return: Normalized text
    """
    collapsed = re.sub(r'[ \t]+', ' ', text)
    collapsed = re.sub(r' +\n', '\n', collapsed)

    def _normalize_line(raw_line: str) -> str:
        body = raw_line.lstrip()
        if not body:
            # Blank or whitespace-only line
            return ''
        # lstrip() also removes non-space whitespace; substitute each
        # removed leading character with one plain space
        leading = len(raw_line) - len(body)
        return ' ' * leading + body

    return '\n'.join(_normalize_line(raw_line) for raw_line in collapsed.split('\n'))


class FuzzyMatchResult:
    """
    Value holder describing the outcome of a text search

    Stores four fields exactly as passed to the constructor:
    found (whether a match exists), index (start offset of the match,
    -1 by default), match_length (length of the matched span, 0 by
    default) and content_for_replacement (the text that index and
    match_length refer to, '' by default).
    """

    def __init__(self, found: bool, index: int = -1, match_length: int = 0, content_for_replacement: str = "") -> None:
        # Text the offsets below are relative to
        self.content_for_replacement: str = content_for_replacement
        # Start offset and size of the matched span
        self.index: int = index
        self.match_length: int = match_length
        # Whether anything was matched at all
        self.found: bool = found


def fuzzy_find_text(content: str, old_text: str) -> FuzzyMatchResult:
    """
    Locate old_text inside content, exactly if possible, otherwise fuzzily

    An exact substring search on the raw inputs is attempted first. If it
    fails, both inputs are passed through normalize_for_fuzzy_match and the
    search is repeated on the normalized strings.

    Note that for a fuzzy hit the returned index and match_length are
    offsets into the normalized content, which is what
    content_for_replacement holds in that case; they are not offsets into
    the original content.

    :param content: Content to search in
    :param old_text: Text to find
    :return: Match result; found is False when neither search succeeds
    """
    exact_pos = content.find(old_text)
    if exact_pos >= 0:
        # Exact hit: offsets refer to the untouched content
        return FuzzyMatchResult(
            found=True,
            index=exact_pos,
            match_length=len(old_text),
            content_for_replacement=content,
        )

    normalized_content = normalize_for_fuzzy_match(content)
    normalized_target = normalize_for_fuzzy_match(old_text)

    fuzzy_pos = normalized_content.find(normalized_target)
    if fuzzy_pos < 0:
        return FuzzyMatchResult(found=False)

    # Fuzzy hit: offsets refer to the normalized content
    return FuzzyMatchResult(
        found=True,
        index=fuzzy_pos,
        match_length=len(normalized_target),
        content_for_replacement=normalized_content,
    )


def generate_diff_string(old_content: str, new_content: str) -> dict:
    """
    Generate a unified diff string between two texts

    Both texts are split on LF and compared with difflib.unified_diff
    (file labels 'original' / 'modified', default context size).

    :param old_content: Old content
    :param new_content: New content
    :return: Dictionary with
             'diff': the unified diff joined with LF ('' when the texts
             are identical), and
             'first_changed_line': 1-based line number, in the new content,
             of the first added or removed line (None when there is no
             difference)
    """
    old_lines = old_content.split('\n')
    new_lines = new_content.split('\n')

    # Generate unified diff
    diff_lines = list(difflib.unified_diff(
        old_lines,
        new_lines,
        lineterm='',
        fromfile='original',
        tofile='modified'
    ))

    # The hunk header "@@ -a,b +c,d @@" gives the new-file line at which the
    # hunk starts, but a hunk begins with context lines. Walk the first
    # hunk and count context lines until the first real change so the
    # reported number points at the change itself, not at its context.
    first_changed_line = None
    new_line_no = None  # new-file line number of the diff line being examined
    for line in diff_lines:
        if new_line_no is None:
            # Still before the first hunk (skipping the ---/+++ file headers)
            header = re.match(r'@@ -\d+,?\d* \+(\d+)', line)
            if header:
                new_line_no = int(header.group(1))
            continue
        if line.startswith(('+', '-')):
            first_changed_line = new_line_no
            break
        # Context line: present in the new file, so advance the counter
        new_line_no += 1

    diff_string = '\n'.join(diff_lines)

    return {
        'diff': diff_string,
        'first_changed_line': first_changed_line
    }