first commit
This commit is contained in:
586
app/chrome-extension/utils/content-indexer.ts
Normal file
586
app/chrome-extension/utils/content-indexer.ts
Normal file
@@ -0,0 +1,586 @@
|
||||
/**
|
||||
* Content index manager
|
||||
* Responsible for automatically extracting, chunking and indexing tab content
|
||||
*/
|
||||
|
||||
import { TextChunker } from './text-chunker';
|
||||
import { VectorDatabase, getGlobalVectorDatabase } from './vector-database';
|
||||
import {
|
||||
SemanticSimilarityEngine,
|
||||
SemanticSimilarityEngineProxy,
|
||||
PREDEFINED_MODELS,
|
||||
type ModelPreset,
|
||||
} from './semantic-similarity-engine';
|
||||
import { TOOL_MESSAGE_TYPES } from '@/common/message-types';
|
||||
|
||||
export interface IndexingOptions {
|
||||
autoIndex?: boolean;
|
||||
maxChunksPerPage?: number;
|
||||
skipDuplicates?: boolean;
|
||||
}
|
||||
|
||||
export class ContentIndexer {
|
||||
private textChunker: TextChunker;
|
||||
private vectorDatabase!: VectorDatabase;
|
||||
private semanticEngine!: SemanticSimilarityEngine | SemanticSimilarityEngineProxy;
|
||||
private isInitialized = false;
|
||||
private isInitializing = false;
|
||||
private initPromise: Promise<void> | null = null;
|
||||
private indexedPages = new Set<string>();
|
||||
private readonly options: Required<IndexingOptions>;
|
||||
|
||||
constructor(options?: IndexingOptions) {
|
||||
this.options = {
|
||||
autoIndex: true,
|
||||
maxChunksPerPage: 50,
|
||||
skipDuplicates: true,
|
||||
...options,
|
||||
};
|
||||
|
||||
this.textChunker = new TextChunker();
|
||||
}
|
||||
|
||||
/**
|
||||
* Get current selected model configuration
|
||||
*/
|
||||
private async getCurrentModelConfig() {
|
||||
try {
|
||||
const result = await chrome.storage.local.get(['selectedModel', 'selectedVersion']);
|
||||
const selectedModel = (result.selectedModel as ModelPreset) || 'multilingual-e5-small';
|
||||
const selectedVersion =
|
||||
(result.selectedVersion as 'full' | 'quantized' | 'compressed') || 'quantized';
|
||||
|
||||
const modelInfo = PREDEFINED_MODELS[selectedModel];
|
||||
|
||||
return {
|
||||
modelPreset: selectedModel,
|
||||
modelIdentifier: modelInfo.modelIdentifier,
|
||||
dimension: modelInfo.dimension,
|
||||
modelVersion: selectedVersion,
|
||||
useLocalFiles: false,
|
||||
maxLength: 256,
|
||||
cacheSize: 1000,
|
||||
forceOffscreen: true,
|
||||
};
|
||||
} catch (error) {
|
||||
console.error('ContentIndexer: Failed to get current model config, using default:', error);
|
||||
return {
|
||||
modelPreset: 'multilingual-e5-small' as const,
|
||||
modelIdentifier: 'Xenova/multilingual-e5-small',
|
||||
dimension: 384,
|
||||
modelVersion: 'quantized' as const,
|
||||
useLocalFiles: false,
|
||||
maxLength: 256,
|
||||
cacheSize: 1000,
|
||||
forceOffscreen: true,
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Initialize content indexer
|
||||
*/
|
||||
public async initialize(): Promise<void> {
|
||||
if (this.isInitialized) return;
|
||||
if (this.isInitializing && this.initPromise) return this.initPromise;
|
||||
|
||||
this.isInitializing = true;
|
||||
this.initPromise = this._doInitialize().finally(() => {
|
||||
this.isInitializing = false;
|
||||
});
|
||||
|
||||
return this.initPromise;
|
||||
}
|
||||
|
||||
private async _doInitialize(): Promise<void> {
|
||||
try {
|
||||
// Get current selected model configuration
|
||||
const engineConfig = await this.getCurrentModelConfig();
|
||||
|
||||
// Use proxy class to reuse engine instance in offscreen
|
||||
this.semanticEngine = new SemanticSimilarityEngineProxy(engineConfig);
|
||||
await this.semanticEngine.initialize();
|
||||
|
||||
this.vectorDatabase = await getGlobalVectorDatabase({
|
||||
dimension: engineConfig.dimension,
|
||||
efSearch: 50,
|
||||
});
|
||||
await this.vectorDatabase.initialize();
|
||||
|
||||
this.setupTabEventListeners();
|
||||
|
||||
this.isInitialized = true;
|
||||
} catch (error) {
|
||||
console.error('ContentIndexer: Initialization failed:', error);
|
||||
this.isInitialized = false;
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Index content of specified tab
|
||||
*/
|
||||
public async indexTabContent(tabId: number): Promise<void> {
|
||||
// Check if semantic engine is ready before attempting to index
|
||||
if (!this.isSemanticEngineReady() && !this.isSemanticEngineInitializing()) {
|
||||
console.log(
|
||||
`ContentIndexer: Skipping tab ${tabId} - semantic engine not ready and not initializing`,
|
||||
);
|
||||
return;
|
||||
}
|
||||
|
||||
if (!this.isInitialized) {
|
||||
// Only initialize if semantic engine is already ready
|
||||
if (!this.isSemanticEngineReady()) {
|
||||
console.log(
|
||||
`ContentIndexer: Skipping tab ${tabId} - ContentIndexer not initialized and semantic engine not ready`,
|
||||
);
|
||||
return;
|
||||
}
|
||||
await this.initialize();
|
||||
}
|
||||
|
||||
try {
|
||||
const tab = await chrome.tabs.get(tabId);
|
||||
if (!tab.url || !this.shouldIndexUrl(tab.url)) {
|
||||
console.log(`ContentIndexer: Skipping tab ${tabId} - URL not indexable`);
|
||||
return;
|
||||
}
|
||||
|
||||
const pageKey = `${tab.url}_${tab.title}`;
|
||||
if (this.options.skipDuplicates && this.indexedPages.has(pageKey)) {
|
||||
console.log(`ContentIndexer: Skipping tab ${tabId} - already indexed`);
|
||||
return;
|
||||
}
|
||||
|
||||
console.log(`ContentIndexer: Starting to index tab ${tabId}: ${tab.title}`);
|
||||
|
||||
const content = await this.extractTabContent(tabId);
|
||||
if (!content) {
|
||||
console.log(`ContentIndexer: No content extracted from tab ${tabId}`);
|
||||
return;
|
||||
}
|
||||
|
||||
const chunks = this.textChunker.chunkText(content.textContent, content.title);
|
||||
console.log(`ContentIndexer: Generated ${chunks.length} chunks for tab ${tabId}`);
|
||||
|
||||
const chunksToIndex = chunks.slice(0, this.options.maxChunksPerPage);
|
||||
if (chunks.length > this.options.maxChunksPerPage) {
|
||||
console.log(
|
||||
`ContentIndexer: Limited chunks from ${chunks.length} to ${this.options.maxChunksPerPage}`,
|
||||
);
|
||||
}
|
||||
|
||||
for (const chunk of chunksToIndex) {
|
||||
try {
|
||||
const embedding = await this.semanticEngine.getEmbedding(chunk.text);
|
||||
const label = await this.vectorDatabase.addDocument(
|
||||
tabId,
|
||||
tab.url!,
|
||||
tab.title || '',
|
||||
chunk,
|
||||
embedding,
|
||||
);
|
||||
console.log(`ContentIndexer: Indexed chunk ${chunk.index} with label ${label}`);
|
||||
} catch (error) {
|
||||
console.error(`ContentIndexer: Failed to index chunk ${chunk.index}:`, error);
|
||||
}
|
||||
}
|
||||
|
||||
this.indexedPages.add(pageKey);
|
||||
|
||||
console.log(
|
||||
`ContentIndexer: Successfully indexed ${chunksToIndex.length} chunks for tab ${tabId}`,
|
||||
);
|
||||
} catch (error) {
|
||||
console.error(`ContentIndexer: Failed to index tab ${tabId}:`, error);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Search content
|
||||
*/
|
||||
public async searchContent(query: string, topK: number = 10) {
|
||||
// Check if semantic engine is ready before attempting to search
|
||||
if (!this.isSemanticEngineReady() && !this.isSemanticEngineInitializing()) {
|
||||
throw new Error(
|
||||
'Semantic engine is not ready yet. Please initialize the semantic engine first.',
|
||||
);
|
||||
}
|
||||
|
||||
if (!this.isInitialized) {
|
||||
// Only initialize if semantic engine is already ready
|
||||
if (!this.isSemanticEngineReady()) {
|
||||
throw new Error(
|
||||
'ContentIndexer not initialized and semantic engine not ready. Please initialize the semantic engine first.',
|
||||
);
|
||||
}
|
||||
await this.initialize();
|
||||
}
|
||||
|
||||
try {
|
||||
const queryEmbedding = await this.semanticEngine.getEmbedding(query);
|
||||
const results = await this.vectorDatabase.search(queryEmbedding, topK);
|
||||
|
||||
console.log(`ContentIndexer: Found ${results.length} results for query: "${query}"`);
|
||||
return results;
|
||||
} catch (error) {
|
||||
console.error('ContentIndexer: Search failed:', error);
|
||||
|
||||
if (error instanceof Error && error.message.includes('not initialized')) {
|
||||
console.log(
|
||||
'ContentIndexer: Attempting to reinitialize semantic engine and retry search...',
|
||||
);
|
||||
try {
|
||||
await this.semanticEngine.initialize();
|
||||
const queryEmbedding = await this.semanticEngine.getEmbedding(query);
|
||||
const results = await this.vectorDatabase.search(queryEmbedding, topK);
|
||||
|
||||
console.log(
|
||||
`ContentIndexer: Retry successful, found ${results.length} results for query: "${query}"`,
|
||||
);
|
||||
return results;
|
||||
} catch (retryError) {
|
||||
console.error('ContentIndexer: Retry after reinitialization also failed:', retryError);
|
||||
throw retryError;
|
||||
}
|
||||
}
|
||||
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Remove tab index
|
||||
*/
|
||||
public async removeTabIndex(tabId: number): Promise<void> {
|
||||
if (!this.isInitialized) {
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
await this.vectorDatabase.removeTabDocuments(tabId);
|
||||
|
||||
for (const pageKey of this.indexedPages) {
|
||||
if (pageKey.includes(`tab_${tabId}_`)) {
|
||||
this.indexedPages.delete(pageKey);
|
||||
}
|
||||
}
|
||||
|
||||
console.log(`ContentIndexer: Removed index for tab ${tabId}`);
|
||||
} catch (error) {
|
||||
console.error(`ContentIndexer: Failed to remove index for tab ${tabId}:`, error);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if semantic engine is ready (checks both local and global state)
|
||||
*/
|
||||
public isSemanticEngineReady(): boolean {
|
||||
return this.semanticEngine && this.semanticEngine.isInitialized;
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if global semantic engine is ready (in background/offscreen)
|
||||
*/
|
||||
public async isGlobalSemanticEngineReady(): Promise<boolean> {
|
||||
try {
|
||||
// Since ContentIndexer runs in background script, directly call the function instead of sending message
|
||||
const { handleGetModelStatus } = await import('@/entrypoints/background/semantic-similarity');
|
||||
const response = await handleGetModelStatus();
|
||||
return (
|
||||
response &&
|
||||
response.success &&
|
||||
response.status &&
|
||||
response.status.initializationStatus === 'ready'
|
||||
);
|
||||
} catch (error) {
|
||||
console.error('ContentIndexer: Failed to check global semantic engine status:', error);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if semantic engine is initializing
|
||||
*/
|
||||
public isSemanticEngineInitializing(): boolean {
|
||||
return (
|
||||
this.isInitializing || (this.semanticEngine && (this.semanticEngine as any).isInitializing)
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Reinitialize content indexer (for model switching)
|
||||
*/
|
||||
public async reinitialize(): Promise<void> {
|
||||
console.log('ContentIndexer: Reinitializing for model switch...');
|
||||
|
||||
this.isInitialized = false;
|
||||
this.isInitializing = false;
|
||||
this.initPromise = null;
|
||||
|
||||
await this.performCompleteDataCleanupForModelSwitch();
|
||||
|
||||
this.indexedPages.clear();
|
||||
console.log('ContentIndexer: Cleared indexed pages cache');
|
||||
|
||||
try {
|
||||
console.log('ContentIndexer: Creating new semantic engine proxy...');
|
||||
const newEngineConfig = await this.getCurrentModelConfig();
|
||||
console.log('ContentIndexer: New engine config:', newEngineConfig);
|
||||
|
||||
this.semanticEngine = new SemanticSimilarityEngineProxy(newEngineConfig);
|
||||
console.log('ContentIndexer: New semantic engine proxy created');
|
||||
|
||||
await this.semanticEngine.initialize();
|
||||
console.log('ContentIndexer: Semantic engine proxy initialization completed');
|
||||
} catch (error) {
|
||||
console.error('ContentIndexer: Failed to create new semantic engine proxy:', error);
|
||||
throw error;
|
||||
}
|
||||
|
||||
console.log(
|
||||
'ContentIndexer: New semantic engine proxy is ready, proceeding with initialization',
|
||||
);
|
||||
|
||||
await this.initialize();
|
||||
|
||||
console.log('ContentIndexer: Reinitialization completed successfully');
|
||||
}
|
||||
|
||||
/**
|
||||
* Perform complete data cleanup for model switching
|
||||
*/
|
||||
private async performCompleteDataCleanupForModelSwitch(): Promise<void> {
|
||||
console.log('ContentIndexer: Starting complete data cleanup for model switch...');
|
||||
|
||||
try {
|
||||
// Clear existing vector database instance
|
||||
if (this.vectorDatabase) {
|
||||
try {
|
||||
console.log('ContentIndexer: Clearing existing vector database instance...');
|
||||
await this.vectorDatabase.clear();
|
||||
console.log('ContentIndexer: Vector database instance cleared successfully');
|
||||
} catch (error) {
|
||||
console.warn('ContentIndexer: Failed to clear vector database instance:', error);
|
||||
}
|
||||
}
|
||||
|
||||
try {
|
||||
const { clearAllVectorData } = await import('./vector-database');
|
||||
await clearAllVectorData();
|
||||
console.log('ContentIndexer: Cleared all vector data for model switch');
|
||||
} catch (error) {
|
||||
console.warn('ContentIndexer: Failed to clear vector data:', error);
|
||||
}
|
||||
|
||||
try {
|
||||
const keysToRemove = [
|
||||
'hnswlib_document_mappings_tab_content_index.dat',
|
||||
'hnswlib_document_mappings_content_index.dat',
|
||||
'hnswlib_document_mappings_vector_index.dat',
|
||||
'vectorDatabaseStats',
|
||||
'lastCleanupTime',
|
||||
];
|
||||
await chrome.storage.local.remove(keysToRemove);
|
||||
console.log('ContentIndexer: Cleared chrome.storage model-related data');
|
||||
} catch (error) {
|
||||
console.warn('ContentIndexer: Failed to clear chrome.storage data:', error);
|
||||
}
|
||||
|
||||
try {
|
||||
const deleteVectorDB = indexedDB.deleteDatabase('VectorDatabaseStorage');
|
||||
await new Promise<void>((resolve) => {
|
||||
deleteVectorDB.onsuccess = () => {
|
||||
console.log('ContentIndexer: VectorDatabaseStorage database deleted');
|
||||
resolve();
|
||||
};
|
||||
deleteVectorDB.onerror = () => {
|
||||
console.warn('ContentIndexer: Failed to delete VectorDatabaseStorage database');
|
||||
resolve(); // Don't block the process
|
||||
};
|
||||
deleteVectorDB.onblocked = () => {
|
||||
console.warn('ContentIndexer: VectorDatabaseStorage database deletion blocked');
|
||||
resolve(); // Don't block the process
|
||||
};
|
||||
});
|
||||
|
||||
// Clean up hnswlib-index database
|
||||
const deleteHnswDB = indexedDB.deleteDatabase('/hnswlib-index');
|
||||
await new Promise<void>((resolve) => {
|
||||
deleteHnswDB.onsuccess = () => {
|
||||
console.log('ContentIndexer: /hnswlib-index database deleted');
|
||||
resolve();
|
||||
};
|
||||
deleteHnswDB.onerror = () => {
|
||||
console.warn('ContentIndexer: Failed to delete /hnswlib-index database');
|
||||
resolve(); // Don't block the process
|
||||
};
|
||||
deleteHnswDB.onblocked = () => {
|
||||
console.warn('ContentIndexer: /hnswlib-index database deletion blocked');
|
||||
resolve(); // Don't block the process
|
||||
};
|
||||
});
|
||||
|
||||
console.log('ContentIndexer: All IndexedDB databases cleared for model switch');
|
||||
} catch (error) {
|
||||
console.warn('ContentIndexer: Failed to clear IndexedDB databases:', error);
|
||||
}
|
||||
|
||||
console.log('ContentIndexer: Complete data cleanup for model switch finished successfully');
|
||||
} catch (error) {
|
||||
console.error('ContentIndexer: Complete data cleanup for model switch failed:', error);
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Manually trigger semantic engine initialization (async, don't wait for completion)
|
||||
* Note: This should only be called after the semantic engine is already initialized
|
||||
*/
|
||||
public startSemanticEngineInitialization(): void {
|
||||
if (!this.isInitialized && !this.isInitializing) {
|
||||
console.log('ContentIndexer: Checking if semantic engine is ready...');
|
||||
|
||||
// Check if global semantic engine is ready before initializing ContentIndexer
|
||||
this.isGlobalSemanticEngineReady()
|
||||
.then((isReady) => {
|
||||
if (isReady) {
|
||||
console.log('ContentIndexer: Starting initialization (semantic engine ready)...');
|
||||
this.initialize().catch((error) => {
|
||||
console.error('ContentIndexer: Background initialization failed:', error);
|
||||
});
|
||||
} else {
|
||||
console.log('ContentIndexer: Semantic engine not ready, skipping initialization');
|
||||
}
|
||||
})
|
||||
.catch((error) => {
|
||||
console.error('ContentIndexer: Failed to check semantic engine status:', error);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Get indexing statistics
|
||||
*/
|
||||
public getStats() {
|
||||
const vectorStats = this.vectorDatabase
|
||||
? this.vectorDatabase.getStats()
|
||||
: {
|
||||
totalDocuments: 0,
|
||||
totalTabs: 0,
|
||||
indexSize: 0,
|
||||
};
|
||||
|
||||
return {
|
||||
...vectorStats,
|
||||
indexedPages: this.indexedPages.size,
|
||||
isInitialized: this.isInitialized,
|
||||
semanticEngineReady: this.isSemanticEngineReady(),
|
||||
semanticEngineInitializing: this.isSemanticEngineInitializing(),
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Clear all indexes
|
||||
*/
|
||||
public async clearAllIndexes(): Promise<void> {
|
||||
if (!this.isInitialized) {
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
await this.vectorDatabase.clear();
|
||||
this.indexedPages.clear();
|
||||
console.log('ContentIndexer: All indexes cleared');
|
||||
} catch (error) {
|
||||
console.error('ContentIndexer: Failed to clear indexes:', error);
|
||||
}
|
||||
}
|
||||
private setupTabEventListeners(): void {
|
||||
chrome.tabs.onUpdated.addListener(async (tabId, changeInfo, tab) => {
|
||||
if (this.options.autoIndex && changeInfo.status === 'complete' && tab.url) {
|
||||
setTimeout(() => {
|
||||
if (!this.isSemanticEngineReady() && !this.isSemanticEngineInitializing()) {
|
||||
console.log(
|
||||
`ContentIndexer: Skipping auto-index for tab ${tabId} - semantic engine not ready`,
|
||||
);
|
||||
return;
|
||||
}
|
||||
|
||||
this.indexTabContent(tabId).catch((error) => {
|
||||
console.error(`ContentIndexer: Auto-indexing failed for tab ${tabId}:`, error);
|
||||
});
|
||||
}, 2000);
|
||||
}
|
||||
});
|
||||
|
||||
chrome.tabs.onRemoved.addListener(async (tabId) => {
|
||||
await this.removeTabIndex(tabId);
|
||||
});
|
||||
|
||||
if (chrome.webNavigation) {
|
||||
chrome.webNavigation.onCommitted.addListener(async (details) => {
|
||||
if (details.frameId === 0) {
|
||||
await this.removeTabIndex(details.tabId);
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
private shouldIndexUrl(url: string): boolean {
|
||||
const excludePatterns = [
|
||||
/^chrome:\/\//,
|
||||
/^chrome-extension:\/\//,
|
||||
/^edge:\/\//,
|
||||
/^about:/,
|
||||
/^moz-extension:\/\//,
|
||||
/^file:\/\//,
|
||||
];
|
||||
|
||||
return !excludePatterns.some((pattern) => pattern.test(url));
|
||||
}
|
||||
|
||||
private async extractTabContent(
|
||||
tabId: number,
|
||||
): Promise<{ textContent: string; title: string } | null> {
|
||||
try {
|
||||
await chrome.scripting.executeScript({
|
||||
target: { tabId },
|
||||
files: ['inject-scripts/web-fetcher-helper.js'],
|
||||
});
|
||||
|
||||
const response = await chrome.tabs.sendMessage(tabId, {
|
||||
action: TOOL_MESSAGE_TYPES.WEB_FETCHER_GET_TEXT_CONTENT,
|
||||
});
|
||||
|
||||
if (response.success && response.textContent) {
|
||||
return {
|
||||
textContent: response.textContent,
|
||||
title: response.title || '',
|
||||
};
|
||||
} else {
|
||||
console.error(
|
||||
`ContentIndexer: Failed to extract content from tab ${tabId}:`,
|
||||
response.error,
|
||||
);
|
||||
return null;
|
||||
}
|
||||
} catch (error) {
|
||||
console.error(`ContentIndexer: Error extracting content from tab ${tabId}:`, error);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let globalContentIndexer: ContentIndexer | null = null;
|
||||
|
||||
/**
|
||||
* Get global ContentIndexer instance
|
||||
*/
|
||||
export function getGlobalContentIndexer(): ContentIndexer {
|
||||
if (!globalContentIndexer) {
|
||||
globalContentIndexer = new ContentIndexer();
|
||||
}
|
||||
return globalContentIndexer;
|
||||
}
|
Reference in New Issue
Block a user