/**
 * Text chunking utility
 * Based on a semantic chunking strategy, splits long text into small chunks suitable for vectorization
 */

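/**
 * A single chunk of text produced by TextChunker. `source` labels where the chunk came
 * from (e.g. 'title' or 'content_chunk_N'), `index` is its position in the output, and
 * `wordCount` is its whitespace-delimited word count.
 */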
export interface TextChunk {
  text: string;
  source: string;
  index: number;
  wordCount: number;
}

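/**
 * Options controlling chunking. Defaults (see TextChunker.defaultOptions): 80 words per
 * chunk, 1 sentence of overlap between consecutive chunks, a 20-character minimum chunk
 * length, and the title emitted as its own chunk.
 */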
export interface ChunkingOptions {
  maxWordsPerChunk?: number;
  overlapSentences?: number;
  minChunkLength?: number;
  includeTitle?: boolean;
}

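/**
 * Splits documents into vectorization-friendly chunks by grouping sentences up to a word
 * budget, with optional sentence overlap between consecutive chunks.
 *
 * @example
 * // Illustrative usage; the input strings and option values are placeholders.
 * const chunker = new TextChunker();
 * const chunks = chunker.chunkText(articleBody, articleTitle, { maxWordsPerChunk: 60 });
 */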
export class TextChunker {
  private readonly defaultOptions: Required<ChunkingOptions> = {
    maxWordsPerChunk: 80,
    overlapSentences: 1,
    minChunkLength: 20,
    includeTitle: true,
  };

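  /**
   * Chunks `content` into TextChunks, optionally preceded by a title chunk.
   * Sentences are grouped up to `maxWordsPerChunk`; if any single sentence exceeds the
   * limit, mixed chunking is used, and if sentence splitting yields nothing, the method
   * falls back to paragraph/word-based chunking.
   */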
  public chunkText(content: string, title?: string, options?: ChunkingOptions): TextChunk[] {
    const opts = { ...this.defaultOptions, ...options };
    const chunks: TextChunk[] = [];

    if (opts.includeTitle && title?.trim() && title.trim().length > 5) {
      chunks.push({
        text: title.trim(),
        source: 'title',
        index: 0,
        wordCount: title.trim().split(/\s+/).length,
      });
    }

    const cleanContent = content.trim();
    if (!cleanContent) {
      return chunks;
    }

    const sentences = this.splitIntoSentences(cleanContent);

    if (sentences.length === 0) {
      return this.fallbackChunking(cleanContent, chunks, opts);
    }

    const hasLongSentences = sentences.some(
      (s: string) => s.split(/\s+/).length > opts.maxWordsPerChunk,
    );

    if (hasLongSentences) {
      return this.mixedChunking(sentences, chunks, opts);
    }

    return this.groupSentencesIntoChunks(sentences, chunks, opts);
  }

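  /**
   * Splits content into sentences on Chinese and English sentence-ending punctuation.
   * If this finds fewer than 3 sentences in text longer than 500 characters, it falls
   * back to more aggressive splitting.
   */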
  private splitIntoSentences(content: string): string[] {
    const processedContent = content
      .replace(/([。!?])\s*/g, '$1\n')
      .replace(/([.!?])\s+(?=[A-Z])/g, '$1\n')
      .replace(/([.!?]["'])\s+(?=[A-Z])/g, '$1\n')
      .replace(/([.!?])\s*$/gm, '$1\n')
      .replace(/([。!?][""])\s*/g, '$1\n')
      .replace(/\n\s*\n/g, '\n');

    const sentences = processedContent
      .split('\n')
      .map((s) => s.trim())
      .filter((s) => s.length > 15);

    if (sentences.length < 3 && content.length > 500) {
      return this.aggressiveSentenceSplitting(content);
    }

    return sentences;
  }

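  /**
   * More aggressive splitting: also breaks on semicolons, colons, and closing
   * parentheses, then hard-splits any remaining over-long sentence into windows of at
   * most 80 words with a 5-word overlap.
   */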
  private aggressiveSentenceSplitting(content: string): string[] {
    const sentences = content
      .replace(/([.!?。!?])/g, '$1\n')
      .replace(/([;;::])/g, '$1\n')
      .replace(/([))])\s*(?=[\u4e00-\u9fa5A-Z])/g, '$1\n')
      .split('\n')
      .map((s) => s.trim())
      .filter((s) => s.length > 15);

    const maxWordsPerChunk = 80;
    const finalSentences: string[] = [];

    for (const sentence of sentences) {
      const words = sentence.split(/\s+/);
      if (words.length <= maxWordsPerChunk) {
        finalSentences.push(sentence);
      } else {
        const overlapWords = 5;
        for (let i = 0; i < words.length; i += maxWordsPerChunk - overlapWords) {
          const chunkWords = words.slice(i, i + maxWordsPerChunk);
          const chunkText = chunkWords.join(' ');
          if (chunkText.length > 15) {
            finalSentences.push(chunkText);
          }
        }
      }
    }

    return finalSentences;
  }

  /**
   * Groups consecutive sentences into chunks of up to maxWordsPerChunk words, re-using
   * the last overlapSentences sentences of each chunk as the start of the next one.
   */
  private groupSentencesIntoChunks(
    sentences: string[],
    existingChunks: TextChunk[],
    options: Required<ChunkingOptions>,
  ): TextChunk[] {
    const chunks = [...existingChunks];
    let chunkIndex = chunks.length;

    let i = 0;
    while (i < sentences.length) {
      let currentChunkText = '';
      let currentWordCount = 0;
      let sentencesUsed = 0;

      while (i + sentencesUsed < sentences.length && currentWordCount < options.maxWordsPerChunk) {
        const sentence = sentences[i + sentencesUsed];
        const sentenceWords = sentence.split(/\s+/).length;

        if (currentWordCount + sentenceWords > options.maxWordsPerChunk && currentWordCount > 0) {
          break;
        }

        currentChunkText += (currentChunkText ? ' ' : '') + sentence;
        currentWordCount += sentenceWords;
        sentencesUsed++;
      }

      if (currentChunkText.trim().length > options.minChunkLength) {
        chunks.push({
          text: currentChunkText.trim(),
          source: `content_chunk_${chunkIndex}`,
          index: chunkIndex,
          wordCount: currentWordCount,
        });
        chunkIndex++;
      }

      i += Math.max(1, sentencesUsed - options.overlapSentences);
    }
    return chunks;
  }

  /**
   * Mixed chunking for sentence lists that contain over-long sentences: sentences within
   * the word limit become individual chunks, while over-long sentences are split into
   * fixed-size word windows.
   */
  private mixedChunking(
    sentences: string[],
    existingChunks: TextChunk[],
    options: Required<ChunkingOptions>,
  ): TextChunk[] {
    const chunks = [...existingChunks];
    let chunkIndex = chunks.length;

    for (const sentence of sentences) {
      const sentenceWords = sentence.split(/\s+/).length;

      if (sentenceWords <= options.maxWordsPerChunk) {
        chunks.push({
          text: sentence.trim(),
          source: `sentence_chunk_${chunkIndex}`,
          index: chunkIndex,
          wordCount: sentenceWords,
        });
        chunkIndex++;
      } else {
        const words = sentence.split(/\s+/);
        for (let i = 0; i < words.length; i += options.maxWordsPerChunk) {
          const chunkWords = words.slice(i, i + options.maxWordsPerChunk);
          const chunkText = chunkWords.join(' ');

          if (chunkText.length > options.minChunkLength) {
            chunks.push({
              text: chunkText,
              source: `long_sentence_chunk_${chunkIndex}_part_${Math.floor(i / options.maxWordsPerChunk)}`,
              index: chunkIndex,
              wordCount: chunkWords.length,
            });
          }
        }
        chunkIndex++;
      }
    }

    return chunks;
  }

  /**
   * Fallback chunking used when sentence splitting yields nothing: splits on blank-line
   * paragraphs (or, failing that, the whole text) into windows of up to 150 words.
   */
  private fallbackChunking(
    content: string,
    existingChunks: TextChunk[],
    options: Required<ChunkingOptions>,
  ): TextChunk[] {
    const chunks = [...existingChunks];
    let chunkIndex = chunks.length;

    const paragraphs = content
      .split(/\n\s*\n/)
      .filter((p) => p.trim().length > options.minChunkLength);

    if (paragraphs.length > 1) {
      paragraphs.forEach((paragraph, index) => {
        const cleanParagraph = paragraph.trim();
        if (cleanParagraph.length > 0) {
          const words = cleanParagraph.split(/\s+/);
          const maxWordsPerChunk = 150;

          for (let i = 0; i < words.length; i += maxWordsPerChunk) {
            const chunkWords = words.slice(i, i + maxWordsPerChunk);
            const chunkText = chunkWords.join(' ');

            if (chunkText.length > options.minChunkLength) {
              chunks.push({
                text: chunkText,
                source: `paragraph_${index}_chunk_${Math.floor(i / maxWordsPerChunk)}`,
                index: chunkIndex,
                wordCount: chunkWords.length,
              });
              chunkIndex++;
            }
          }
        }
      });
    } else {
      const words = content.trim().split(/\s+/);
      const maxWordsPerChunk = 150;

      for (let i = 0; i < words.length; i += maxWordsPerChunk) {
        const chunkWords = words.slice(i, i + maxWordsPerChunk);
        const chunkText = chunkWords.join(' ');

        if (chunkText.length > options.minChunkLength) {
          chunks.push({
            text: chunkText,
            source: `content_chunk_${Math.floor(i / maxWordsPerChunk)}`,
            index: chunkIndex,
            wordCount: chunkWords.length,
          });
          chunkIndex++;
        }
      }
    }

    return chunks;
  }
}