Files
broswer-automation/app/chrome-extension/utils/vector-database.ts
nasir@endelospay.com d97cad1736 first commit
2025-08-12 02:54:17 +05:00

1564 lines
52 KiB
TypeScript

/**
* Vector database manager
* Uses hnswlib-wasm for high-performance vector similarity search
* Implements singleton pattern to avoid duplicate WASM module initialization
*/
import { loadHnswlib } from 'hnswlib-wasm-static';
import type { TextChunk } from './text-chunker';
export interface VectorDocument {
id: string;
tabId: number;
url: string;
title: string;
chunk: TextChunk;
embedding: Float32Array;
timestamp: number;
}
export interface SearchResult {
document: VectorDocument;
similarity: number;
distance: number;
}
export interface VectorDatabaseConfig {
dimension: number;
maxElements: number;
efConstruction: number;
M: number;
efSearch: number;
indexFileName: string;
enableAutoCleanup?: boolean;
maxRetentionDays?: number;
}
let globalHnswlib: any = null;
let globalHnswlibInitPromise: Promise<any> | null = null;
let globalHnswlibInitialized = false;
let syncInProgress = false;
let pendingSyncPromise: Promise<void> | null = null;
const DB_NAME = 'VectorDatabaseStorage';
const DB_VERSION = 1;
const STORE_NAME = 'documentMappings';
/**
* IndexedDB helper functions
*/
class IndexedDBHelper {
private static dbPromise: Promise<IDBDatabase> | null = null;
static async getDB(): Promise<IDBDatabase> {
if (!this.dbPromise) {
this.dbPromise = new Promise((resolve, reject) => {
const request = indexedDB.open(DB_NAME, DB_VERSION);
request.onerror = () => reject(request.error);
request.onsuccess = () => resolve(request.result);
request.onupgradeneeded = (event) => {
const db = (event.target as IDBOpenDBRequest).result;
if (!db.objectStoreNames.contains(STORE_NAME)) {
const store = db.createObjectStore(STORE_NAME, { keyPath: 'id' });
store.createIndex('indexFileName', 'indexFileName', { unique: false });
}
};
});
}
return this.dbPromise;
}
static async saveData(indexFileName: string, data: any): Promise<void> {
const db = await this.getDB();
const transaction = db.transaction([STORE_NAME], 'readwrite');
const store = transaction.objectStore(STORE_NAME);
await new Promise<void>((resolve, reject) => {
const request = store.put({
id: indexFileName,
indexFileName,
data,
timestamp: Date.now(),
});
request.onsuccess = () => resolve();
request.onerror = () => reject(request.error);
});
}
static async loadData(indexFileName: string): Promise<any | null> {
const db = await this.getDB();
const transaction = db.transaction([STORE_NAME], 'readonly');
const store = transaction.objectStore(STORE_NAME);
return new Promise<any | null>((resolve, reject) => {
const request = store.get(indexFileName);
request.onsuccess = () => {
const result = request.result;
resolve(result ? result.data : null);
};
request.onerror = () => reject(request.error);
});
}
static async deleteData(indexFileName: string): Promise<void> {
const db = await this.getDB();
const transaction = db.transaction([STORE_NAME], 'readwrite');
const store = transaction.objectStore(STORE_NAME);
await new Promise<void>((resolve, reject) => {
const request = store.delete(indexFileName);
request.onsuccess = () => resolve();
request.onerror = () => reject(request.error);
});
}
/**
* Clear all IndexedDB data (for complete cleanup during model switching)
*/
static async clearAllData(): Promise<void> {
try {
const db = await this.getDB();
const transaction = db.transaction([STORE_NAME], 'readwrite');
const store = transaction.objectStore(STORE_NAME);
await new Promise<void>((resolve, reject) => {
const request = store.clear();
request.onsuccess = () => {
console.log('IndexedDBHelper: All data cleared from IndexedDB');
resolve();
};
request.onerror = () => reject(request.error);
});
} catch (error) {
console.error('IndexedDBHelper: Failed to clear all data:', error);
throw error;
}
}
/**
* Get all stored keys (for debugging)
*/
static async getAllKeys(): Promise<string[]> {
try {
const db = await this.getDB();
const transaction = db.transaction([STORE_NAME], 'readonly');
const store = transaction.objectStore(STORE_NAME);
return new Promise<string[]>((resolve, reject) => {
const request = store.getAllKeys();
request.onsuccess = () => resolve(request.result as string[]);
request.onerror = () => reject(request.error);
});
} catch (error) {
console.error('IndexedDBHelper: Failed to get all keys:', error);
return [];
}
}
}
/**
* Global hnswlib-wasm initialization function
* Ensures initialization only once across the entire application
*/
async function initializeGlobalHnswlib(): Promise<any> {
if (globalHnswlibInitialized && globalHnswlib) {
return globalHnswlib;
}
if (globalHnswlibInitPromise) {
return globalHnswlibInitPromise;
}
globalHnswlibInitPromise = (async () => {
try {
console.log('VectorDatabase: Initializing global hnswlib-wasm instance...');
globalHnswlib = await loadHnswlib();
globalHnswlibInitialized = true;
console.log('VectorDatabase: Global hnswlib-wasm instance initialized successfully');
return globalHnswlib;
} catch (error) {
console.error('VectorDatabase: Failed to initialize global hnswlib-wasm:', error);
globalHnswlibInitPromise = null;
throw error;
}
})();
return globalHnswlibInitPromise;
}
export class VectorDatabase {
private index: any = null;
private isInitialized = false;
private isInitializing = false;
private initPromise: Promise<void> | null = null;
private documents = new Map<number, VectorDocument>();
private tabDocuments = new Map<number, Set<number>>();
private nextLabel = 0;
private readonly config: VectorDatabaseConfig;
constructor(config?: Partial<VectorDatabaseConfig>) {
this.config = {
dimension: 384,
maxElements: 100000,
efConstruction: 200,
M: 48,
efSearch: 50,
indexFileName: 'tab_content_index.dat',
enableAutoCleanup: true,
maxRetentionDays: 30,
...config,
};
console.log('VectorDatabase: Initialized with config:', {
dimension: this.config.dimension,
efSearch: this.config.efSearch,
M: this.config.M,
efConstruction: this.config.efConstruction,
enableAutoCleanup: this.config.enableAutoCleanup,
maxRetentionDays: this.config.maxRetentionDays,
});
}
/**
* Initialize vector database
*/
public async initialize(): Promise<void> {
if (this.isInitialized) return;
if (this.isInitializing && this.initPromise) return this.initPromise;
this.isInitializing = true;
this.initPromise = this._doInitialize().finally(() => {
this.isInitializing = false;
});
return this.initPromise;
}
private async _doInitialize(): Promise<void> {
try {
console.log('VectorDatabase: Initializing...');
const hnswlib = await initializeGlobalHnswlib();
hnswlib.EmscriptenFileSystemManager.setDebugLogs(true);
this.index = new hnswlib.HierarchicalNSW(
'cosine',
this.config.dimension,
this.config.indexFileName,
);
await this.syncFileSystem('read');
const indexExists = hnswlib.EmscriptenFileSystemManager.checkFileExists(
this.config.indexFileName,
);
if (indexExists) {
console.log('VectorDatabase: Loading existing index...');
try {
await this.index.readIndex(this.config.indexFileName, this.config.maxElements);
this.index.setEfSearch(this.config.efSearch);
await this.loadDocumentMappings();
if (this.documents.size > 0) {
const maxLabel = Math.max(...Array.from(this.documents.keys()));
this.nextLabel = maxLabel + 1;
console.log(
`VectorDatabase: Loaded existing index with ${this.documents.size} documents, next label: ${this.nextLabel}`,
);
} else {
const indexCount = this.index.getCurrentCount();
if (indexCount > 0) {
console.warn(
`VectorDatabase: Index has ${indexCount} vectors but no document mappings found. This may cause label mismatch.`,
);
this.nextLabel = indexCount;
} else {
this.nextLabel = 0;
}
console.log(
`VectorDatabase: No document mappings found, starting with next label: ${this.nextLabel}`,
);
}
} catch (loadError) {
console.warn(
'VectorDatabase: Failed to load existing index, creating new one:',
loadError,
);
this.index.initIndex(
this.config.maxElements,
this.config.M,
this.config.efConstruction,
200,
);
this.index.setEfSearch(this.config.efSearch);
this.nextLabel = 0;
}
} else {
console.log('VectorDatabase: Creating new index...');
this.index.initIndex(
this.config.maxElements,
this.config.M,
this.config.efConstruction,
200,
);
this.index.setEfSearch(this.config.efSearch);
this.nextLabel = 0;
}
this.isInitialized = true;
console.log('VectorDatabase: Initialization completed successfully');
} catch (error) {
console.error('VectorDatabase: Initialization failed:', error);
this.isInitialized = false;
throw error;
}
}
/**
* Add document to vector database
*/
public async addDocument(
tabId: number,
url: string,
title: string,
chunk: TextChunk,
embedding: Float32Array,
): Promise<number> {
if (!this.isInitialized) {
await this.initialize();
}
const documentId = this.generateDocumentId(tabId, chunk.index);
const document: VectorDocument = {
id: documentId,
tabId,
url,
title,
chunk,
embedding,
timestamp: Date.now(),
};
try {
// Validate vector data
if (!embedding || embedding.length !== this.config.dimension) {
const errorMsg = `Invalid embedding dimension: expected ${this.config.dimension}, got ${embedding?.length || 0}`;
console.error('VectorDatabase: Dimension mismatch detected!', {
expectedDimension: this.config.dimension,
actualDimension: embedding?.length || 0,
documentId,
tabId,
url,
title: title.substring(0, 50) + '...',
});
// This might be caused by model switching, suggest reinitialization
console.warn(
'VectorDatabase: This might be caused by model switching. Consider reinitializing the vector database with the correct dimension.',
);
throw new Error(errorMsg);
}
// Check if vector data contains invalid values
for (let i = 0; i < embedding.length; i++) {
if (!isFinite(embedding[i])) {
throw new Error(`Invalid embedding value at index ${i}: ${embedding[i]}`);
}
}
// Ensure we have a clean Float32Array
let cleanEmbedding: Float32Array;
if (embedding instanceof Float32Array) {
cleanEmbedding = embedding;
} else {
cleanEmbedding = new Float32Array(embedding);
}
// Use current nextLabel as label
const label = this.nextLabel++;
console.log(
`VectorDatabase: Adding document with label ${label}, embedding dimension: ${embedding.length}`,
);
// Add vector to index
// According to hnswlib-wasm-static emscripten binding requirements, need to create VectorFloat type
console.log(`VectorDatabase: 🔧 DEBUGGING - About to call addPoint with:`, {
embeddingType: typeof cleanEmbedding,
isFloat32Array: cleanEmbedding instanceof Float32Array,
length: cleanEmbedding.length,
firstFewValues: Array.from(cleanEmbedding.slice(0, 3)),
label: label,
replaceDeleted: false,
});
// Method 1: Try using VectorFloat constructor (if available)
let vectorToAdd;
try {
// Check if VectorFloat constructor exists
if (globalHnswlib && globalHnswlib.VectorFloat) {
console.log('VectorDatabase: Using VectorFloat constructor');
vectorToAdd = new globalHnswlib.VectorFloat();
// Add elements to VectorFloat one by one
for (let i = 0; i < cleanEmbedding.length; i++) {
vectorToAdd.push_back(cleanEmbedding[i]);
}
} else {
// Method 2: Use plain JS array (fallback)
console.log('VectorDatabase: Using plain JS array as fallback');
vectorToAdd = Array.from(cleanEmbedding);
}
// Call addPoint with constructed vector
this.index.addPoint(vectorToAdd, label, false);
// Clean up VectorFloat object (if manually created)
if (vectorToAdd && typeof vectorToAdd.delete === 'function') {
vectorToAdd.delete();
}
} catch (vectorError) {
console.error(
'VectorDatabase: VectorFloat approach failed, trying alternatives:',
vectorError,
);
// Method 3: Try passing Float32Array directly
try {
console.log('VectorDatabase: Trying Float32Array directly');
this.index.addPoint(cleanEmbedding, label, false);
} catch (float32Error) {
console.error('VectorDatabase: Float32Array approach failed:', float32Error);
// Method 4: Last resort - use spread operator
console.log('VectorDatabase: Trying spread operator as last resort');
this.index.addPoint([...cleanEmbedding], label, false);
}
}
console.log(`VectorDatabase: ✅ Successfully added document with label ${label}`);
// Store document mapping
this.documents.set(label, document);
// Update tab document mapping
if (!this.tabDocuments.has(tabId)) {
this.tabDocuments.set(tabId, new Set());
}
this.tabDocuments.get(tabId)!.add(label);
// Save index and mappings
await this.saveIndex();
await this.saveDocumentMappings();
// Check if auto cleanup is needed
if (this.config.enableAutoCleanup) {
await this.checkAndPerformAutoCleanup();
}
console.log(`VectorDatabase: Successfully added document ${documentId} with label ${label}`);
return label;
} catch (error) {
console.error('VectorDatabase: Failed to add document:', error);
console.error('VectorDatabase: Embedding info:', {
type: typeof embedding,
constructor: embedding?.constructor?.name,
length: embedding?.length,
isFloat32Array: embedding instanceof Float32Array,
firstFewValues: embedding ? Array.from(embedding.slice(0, 5)) : null,
});
throw error;
}
}
/**
* Search similar documents
*/
public async search(queryEmbedding: Float32Array, topK: number = 10): Promise<SearchResult[]> {
if (!this.isInitialized) {
await this.initialize();
}
try {
// Validate query vector
if (!queryEmbedding || queryEmbedding.length !== this.config.dimension) {
throw new Error(
`Invalid query embedding dimension: expected ${this.config.dimension}, got ${queryEmbedding?.length || 0}`,
);
}
// Check if query vector contains invalid values
for (let i = 0; i < queryEmbedding.length; i++) {
if (!isFinite(queryEmbedding[i])) {
throw new Error(`Invalid query embedding value at index ${i}: ${queryEmbedding[i]}`);
}
}
console.log(
`VectorDatabase: Searching with query embedding dimension: ${queryEmbedding.length}, topK: ${topK}`,
);
// Check if index is empty
const currentCount = this.index.getCurrentCount();
if (currentCount === 0) {
console.log('VectorDatabase: Index is empty, returning no results');
return [];
}
console.log(`VectorDatabase: Index contains ${currentCount} vectors`);
// Check if document mapping and index are synchronized
const mappingCount = this.documents.size;
if (mappingCount === 0 && currentCount > 0) {
console.warn(
`VectorDatabase: Index has ${currentCount} vectors but document mapping is empty. Attempting to reload mappings...`,
);
await this.loadDocumentMappings();
if (this.documents.size === 0) {
console.error(
'VectorDatabase: Failed to load document mappings. Index and mappings are out of sync.',
);
return [];
}
console.log(
`VectorDatabase: Successfully reloaded ${this.documents.size} document mappings`,
);
}
// Process query vector according to hnswlib-wasm-static emscripten binding requirements
let queryVector;
let searchResult;
try {
// Method 1: Try using VectorFloat constructor (if available)
if (globalHnswlib && globalHnswlib.VectorFloat) {
console.log('VectorDatabase: Using VectorFloat for search query');
queryVector = new globalHnswlib.VectorFloat();
// Add elements to VectorFloat one by one
for (let i = 0; i < queryEmbedding.length; i++) {
queryVector.push_back(queryEmbedding[i]);
}
searchResult = this.index.searchKnn(queryVector, topK, undefined);
// Clean up VectorFloat object
if (queryVector && typeof queryVector.delete === 'function') {
queryVector.delete();
}
} else {
// Method 2: Use plain JS array (fallback)
console.log('VectorDatabase: Using plain JS array for search query');
const queryArray = Array.from(queryEmbedding);
searchResult = this.index.searchKnn(queryArray, topK, undefined);
}
} catch (vectorError) {
console.error(
'VectorDatabase: VectorFloat search failed, trying alternatives:',
vectorError,
);
// Method 3: Try passing Float32Array directly
try {
console.log('VectorDatabase: Trying Float32Array directly for search');
searchResult = this.index.searchKnn(queryEmbedding, topK, undefined);
} catch (float32Error) {
console.error('VectorDatabase: Float32Array search failed:', float32Error);
// Method 4: Last resort - use spread operator
console.log('VectorDatabase: Trying spread operator for search as last resort');
searchResult = this.index.searchKnn([...queryEmbedding], topK, undefined);
}
}
const results: SearchResult[] = [];
console.log(`VectorDatabase: Processing ${searchResult.neighbors.length} search neighbors`);
console.log(`VectorDatabase: Available documents in mapping: ${this.documents.size}`);
console.log(`VectorDatabase: Index current count: ${this.index.getCurrentCount()}`);
for (let i = 0; i < searchResult.neighbors.length; i++) {
const label = searchResult.neighbors[i];
const distance = searchResult.distances[i];
const similarity = 1 - distance; // Convert cosine distance to similarity
console.log(
`VectorDatabase: Processing neighbor ${i}: label=${label}, distance=${distance}, similarity=${similarity}`,
);
// Find corresponding document by label
const document = this.findDocumentByLabel(label);
if (document) {
console.log(`VectorDatabase: Found document for label ${label}: ${document.id}`);
results.push({
document,
similarity,
distance,
});
} else {
console.warn(`VectorDatabase: No document found for label ${label}`);
// Detailed debug information
if (i < 5) {
// Only show detailed info for first 5 neighbors to avoid log spam
console.warn(
`VectorDatabase: Available labels (first 20): ${Array.from(this.documents.keys()).slice(0, 20).join(', ')}`,
);
console.warn(`VectorDatabase: Total available labels: ${this.documents.size}`);
console.warn(
`VectorDatabase: Label type: ${typeof label}, Available label types: ${Array.from(
this.documents.keys(),
)
.slice(0, 3)
.map((k) => typeof k)
.join(', ')}`,
);
}
}
}
console.log(
`VectorDatabase: Found ${results.length} search results out of ${searchResult.neighbors.length} neighbors`,
);
// If no results found but index has data, indicates label mismatch
if (results.length === 0 && searchResult.neighbors.length > 0) {
console.error(
'VectorDatabase: Label mismatch detected! Index has vectors but no matching documents found.',
);
console.error(
'VectorDatabase: This usually indicates the index and document mappings are out of sync.',
);
console.error('VectorDatabase: Consider rebuilding the index to fix this issue.');
// Provide some diagnostic information
const sampleLabels = searchResult.neighbors.slice(0, 5);
const availableLabels = Array.from(this.documents.keys()).slice(0, 5);
console.error('VectorDatabase: Sample search labels:', sampleLabels);
console.error('VectorDatabase: Sample available labels:', availableLabels);
}
return results.sort((a, b) => b.similarity - a.similarity);
} catch (error) {
console.error('VectorDatabase: Search failed:', error);
console.error('VectorDatabase: Query embedding info:', {
type: typeof queryEmbedding,
constructor: queryEmbedding?.constructor?.name,
length: queryEmbedding?.length,
isFloat32Array: queryEmbedding instanceof Float32Array,
firstFewValues: queryEmbedding ? Array.from(queryEmbedding.slice(0, 5)) : null,
});
throw error;
}
}
/**
* Remove all documents for a tab
*/
public async removeTabDocuments(tabId: number): Promise<void> {
if (!this.isInitialized) {
await this.initialize();
}
const documentLabels = this.tabDocuments.get(tabId);
if (!documentLabels) {
return;
}
try {
// Remove documents from mapping (hnswlib-wasm doesn't support direct deletion, only mark as deleted)
for (const label of documentLabels) {
this.documents.delete(label);
}
// Clean up tab mapping
this.tabDocuments.delete(tabId);
// Save changes
await this.saveDocumentMappings();
console.log(`VectorDatabase: Removed ${documentLabels.size} documents for tab ${tabId}`);
} catch (error) {
console.error('VectorDatabase: Failed to remove tab documents:', error);
throw error;
}
}
/**
* Get database statistics
*/
public getStats(): {
totalDocuments: number;
totalTabs: number;
indexSize: number;
isInitialized: boolean;
} {
return {
totalDocuments: this.documents.size,
totalTabs: this.tabDocuments.size,
indexSize: this.calculateStorageSize(),
isInitialized: this.isInitialized,
};
}
/**
* Calculate actual storage size (bytes)
*/
private calculateStorageSize(): number {
let totalSize = 0;
try {
// 1. 计算文档映射的大小
const documentsSize = this.calculateDocumentMappingsSize();
totalSize += documentsSize;
// 2. 计算向量数据的大小
const vectorsSize = this.calculateVectorsSize();
totalSize += vectorsSize;
// 3. 估算索引结构的大小
const indexStructureSize = this.calculateIndexStructureSize();
totalSize += indexStructureSize;
console.log(
`VectorDatabase: Storage size breakdown - Documents: ${documentsSize}, Vectors: ${vectorsSize}, Index: ${indexStructureSize}, Total: ${totalSize} bytes`,
);
} catch (error) {
console.warn('VectorDatabase: Failed to calculate storage size:', error);
// 返回一个基于文档数量的估算值
totalSize = this.documents.size * 1024; // 每个文档估算1KB
}
return totalSize;
}
/**
* Calculate document mappings size
*/
private calculateDocumentMappingsSize(): number {
let size = 0;
// Calculate documents Map size
for (const [label, document] of this.documents.entries()) {
// label (number): 8 bytes
size += 8;
// document object
size += this.calculateObjectSize(document);
}
// Calculate tabDocuments Map size
for (const [tabId, labels] of this.tabDocuments.entries()) {
// tabId (number): 8 bytes
size += 8;
// Set of labels: 8 bytes per label + Set overhead
size += labels.size * 8 + 32; // 32 bytes Set overhead
}
return size;
}
/**
* Calculate vectors data size
*/
private calculateVectorsSize(): number {
const documentCount = this.documents.size;
const dimension = this.config.dimension;
// Each vector: dimension * 4 bytes (Float32)
const vectorSize = dimension * 4;
return documentCount * vectorSize;
}
/**
* Estimate index structure size
*/
private calculateIndexStructureSize(): number {
const documentCount = this.documents.size;
if (documentCount === 0) return 0;
// HNSW index size estimation
// Based on papers and actual testing, HNSW index size is about 20-40% of vector data
const vectorsSize = this.calculateVectorsSize();
const indexOverhead = Math.floor(vectorsSize * 0.3); // 30% overhead
// Additional graph structure overhead
const graphOverhead = documentCount * 64; // About 64 bytes graph structure overhead per node
return indexOverhead + graphOverhead;
}
/**
* Calculate object size (rough estimation)
*/
private calculateObjectSize(obj: any): number {
let size = 0;
try {
const jsonString = JSON.stringify(obj);
// UTF-8 encoding, most characters 1 byte, Chinese etc 3 bytes, average 2 bytes
size = jsonString.length * 2;
} catch (error) {
// If JSON serialization fails, use default estimation
size = 512; // Default 512 bytes
}
return size;
}
/**
* Clear entire database
*/
public async clear(): Promise<void> {
console.log('VectorDatabase: Starting complete database clear...');
try {
// Clear in-memory data structures
this.documents.clear();
this.tabDocuments.clear();
this.nextLabel = 0;
// Clear HNSW index file (in hnswlib-index database)
if (this.isInitialized && this.index) {
try {
console.log('VectorDatabase: Clearing HNSW index file from IndexedDB...');
// 1. First try to physically delete index file (using EmscriptenFileSystemManager)
try {
if (
globalHnswlib &&
globalHnswlib.EmscriptenFileSystemManager.checkFileExists(this.config.indexFileName)
) {
console.log(
`VectorDatabase: Deleting physical index file: ${this.config.indexFileName}`,
);
globalHnswlib.EmscriptenFileSystemManager.deleteFile(this.config.indexFileName);
await this.syncFileSystem('write'); // Ensure deletion is synced to persistent storage
console.log(
`VectorDatabase: Physical index file ${this.config.indexFileName} deleted successfully`,
);
} else {
console.log(
`VectorDatabase: Physical index file ${this.config.indexFileName} does not exist or already deleted`,
);
}
} catch (fileError) {
console.warn(
`VectorDatabase: Failed to delete physical index file ${this.config.indexFileName}:`,
fileError,
);
// Continue with other cleanup operations, don't block the process
}
// 2. Delete index file from IndexedDB
await this.index.deleteIndex(this.config.indexFileName);
console.log('VectorDatabase: HNSW index file cleared from IndexedDB');
// 3. Reinitialize empty index
console.log('VectorDatabase: Reinitializing empty HNSW index...');
this.index.initIndex(
this.config.maxElements,
this.config.M,
this.config.efConstruction,
200,
);
this.index.setEfSearch(this.config.efSearch);
// 4. Force save empty index
await this.forceSaveIndex();
} catch (indexError) {
console.warn('VectorDatabase: Failed to clear HNSW index file:', indexError);
// Continue with other cleanup operations
}
}
// Clear document mappings from IndexedDB (in VectorDatabaseStorage database)
try {
console.log('VectorDatabase: Clearing document mappings from IndexedDB...');
await IndexedDBHelper.deleteData(this.config.indexFileName);
console.log('VectorDatabase: Document mappings cleared from IndexedDB');
} catch (idbError) {
console.warn(
'VectorDatabase: Failed to clear document mappings from IndexedDB, trying chrome.storage fallback:',
idbError,
);
// Clear backup data from chrome.storage
try {
const storageKey = `hnswlib_document_mappings_${this.config.indexFileName}`;
await chrome.storage.local.remove([storageKey]);
console.log('VectorDatabase: Chrome storage fallback cleared');
} catch (storageError) {
console.warn('VectorDatabase: Failed to clear chrome.storage fallback:', storageError);
}
}
// Save empty document mappings to ensure consistency
await this.saveDocumentMappings();
console.log('VectorDatabase: Complete database clear finished successfully');
} catch (error) {
console.error('VectorDatabase: Failed to clear database:', error);
throw error;
}
}
/**
* Force save index and sync filesystem
*/
private async forceSaveIndex(): Promise<void> {
try {
await this.index.writeIndex(this.config.indexFileName);
await this.syncFileSystem('write'); // Force sync
} catch (error) {
console.error('VectorDatabase: Failed to force save index:', error);
}
}
/**
* Check and perform auto cleanup
*/
private async checkAndPerformAutoCleanup(): Promise<void> {
try {
const currentCount = this.documents.size;
const maxElements = this.config.maxElements;
console.log(
`VectorDatabase: Auto cleanup check - current: ${currentCount}, max: ${maxElements}`,
);
// Check if maximum element count is exceeded
if (currentCount >= maxElements) {
console.log('VectorDatabase: Document count reached limit, performing cleanup...');
await this.performLRUCleanup(Math.floor(maxElements * 0.2)); // Clean up 20% of data
}
// Check if there's expired data
if (this.config.maxRetentionDays && this.config.maxRetentionDays > 0) {
await this.performTimeBasedCleanup();
}
} catch (error) {
console.error('VectorDatabase: Auto cleanup failed:', error);
}
}
/**
* Perform LRU-based cleanup (delete oldest documents)
*/
private async performLRUCleanup(cleanupCount: number): Promise<void> {
try {
console.log(
`VectorDatabase: Starting LRU cleanup, removing ${cleanupCount} oldest documents`,
);
// Get all documents and sort by timestamp
const allDocuments = Array.from(this.documents.entries());
allDocuments.sort((a, b) => a[1].timestamp - b[1].timestamp);
// Select documents to delete
const documentsToDelete = allDocuments.slice(0, cleanupCount);
for (const [label, _document] of documentsToDelete) {
await this.removeDocumentByLabel(label);
}
// Save updated index and mappings
await this.saveIndex();
await this.saveDocumentMappings();
console.log(
`VectorDatabase: LRU cleanup completed, removed ${documentsToDelete.length} documents`,
);
} catch (error) {
console.error('VectorDatabase: LRU cleanup failed:', error);
}
}
/**
* Perform time-based cleanup (delete expired documents)
*/
private async performTimeBasedCleanup(): Promise<void> {
try {
const maxRetentionMs = this.config.maxRetentionDays! * 24 * 60 * 60 * 1000;
const cutoffTime = Date.now() - maxRetentionMs;
console.log(
`VectorDatabase: Starting time-based cleanup, removing documents older than ${this.config.maxRetentionDays} days`,
);
const documentsToDelete: number[] = [];
for (const [label, document] of this.documents.entries()) {
if (document.timestamp < cutoffTime) {
documentsToDelete.push(label);
}
}
for (const label of documentsToDelete) {
await this.removeDocumentByLabel(label);
}
// Save updated index and mappings
if (documentsToDelete.length > 0) {
await this.saveIndex();
await this.saveDocumentMappings();
}
console.log(
`VectorDatabase: Time-based cleanup completed, removed ${documentsToDelete.length} expired documents`,
);
} catch (error) {
console.error('VectorDatabase: Time-based cleanup failed:', error);
}
}
/**
* Remove single document by label
*/
private async removeDocumentByLabel(label: number): Promise<void> {
try {
const document = this.documents.get(label);
if (!document) {
console.warn(`VectorDatabase: Document with label ${label} not found`);
return;
}
// Remove vector from HNSW index
if (this.index) {
try {
this.index.markDelete(label);
} catch (indexError) {
console.warn(
`VectorDatabase: Failed to mark delete in index for label ${label}:`,
indexError,
);
}
}
// Remove from memory mapping
this.documents.delete(label);
// Remove from tab mapping
const tabId = document.tabId;
if (this.tabDocuments.has(tabId)) {
this.tabDocuments.get(tabId)!.delete(label);
// If tab has no other documents, delete entire tab mapping
if (this.tabDocuments.get(tabId)!.size === 0) {
this.tabDocuments.delete(tabId);
}
}
console.log(`VectorDatabase: Removed document with label ${label} from tab ${tabId}`);
} catch (error) {
console.error(`VectorDatabase: Failed to remove document with label ${label}:`, error);
}
}
// 私有辅助方法
private generateDocumentId(tabId: number, chunkIndex: number): string {
return `tab_${tabId}_chunk_${chunkIndex}_${Date.now()}`;
}
private findDocumentByLabel(label: number): VectorDocument | null {
return this.documents.get(label) || null;
}
private async syncFileSystem(direction: 'read' | 'write'): Promise<void> {
try {
if (!globalHnswlib) {
return;
}
// If sync operation is already in progress, wait for it to complete
if (syncInProgress && pendingSyncPromise) {
console.log(`VectorDatabase: Sync already in progress, waiting...`);
await pendingSyncPromise;
return;
}
// Mark sync start
syncInProgress = true;
// Create sync Promise with timeout mechanism
pendingSyncPromise = new Promise<void>((resolve, reject) => {
const timeout = setTimeout(() => {
console.warn(`VectorDatabase: Filesystem sync (${direction}) timeout`);
syncInProgress = false;
pendingSyncPromise = null;
reject(new Error('Sync timeout'));
}, 5000); // 5 second timeout
try {
globalHnswlib.EmscriptenFileSystemManager.syncFS(direction === 'read', () => {
clearTimeout(timeout);
console.log(`VectorDatabase: Filesystem sync (${direction}) completed`);
syncInProgress = false;
pendingSyncPromise = null;
resolve();
});
} catch (error) {
clearTimeout(timeout);
console.warn(`VectorDatabase: Failed to sync filesystem (${direction}):`, error);
syncInProgress = false;
pendingSyncPromise = null;
reject(error);
}
});
await pendingSyncPromise;
} catch (error) {
console.warn(`VectorDatabase: Failed to sync filesystem (${direction}):`, error);
syncInProgress = false;
pendingSyncPromise = null;
}
}
private async saveIndex(): Promise<void> {
try {
await this.index.writeIndex(this.config.indexFileName);
// Reduce sync frequency, only sync when necessary
if (this.documents.size % 10 === 0) {
// Sync every 10 documents
await this.syncFileSystem('write');
}
} catch (error) {
console.error('VectorDatabase: Failed to save index:', error);
}
}
private async saveDocumentMappings(): Promise<void> {
try {
// Save document mappings to IndexedDB
const mappingData = {
documents: Array.from(this.documents.entries()),
tabDocuments: Array.from(this.tabDocuments.entries()).map(([tabId, labels]) => [
tabId,
Array.from(labels),
]),
nextLabel: this.nextLabel,
};
try {
// Use IndexedDB to save data, supports larger storage capacity
await IndexedDBHelper.saveData(this.config.indexFileName, mappingData);
console.log('VectorDatabase: Document mappings saved to IndexedDB');
} catch (idbError) {
console.warn(
'VectorDatabase: Failed to save to IndexedDB, falling back to chrome.storage:',
idbError,
);
// Fall back to chrome.storage.local
try {
const storageKey = `hnswlib_document_mappings_${this.config.indexFileName}`;
await chrome.storage.local.set({ [storageKey]: mappingData });
console.log('VectorDatabase: Document mappings saved to chrome.storage.local (fallback)');
} catch (storageError) {
console.error(
'VectorDatabase: Failed to save to both IndexedDB and chrome.storage:',
storageError,
);
}
}
} catch (error) {
console.error('VectorDatabase: Failed to save document mappings:', error);
}
}
public async loadDocumentMappings(): Promise<void> {
try {
// Load document mappings from IndexedDB
if (!globalHnswlib) {
return;
}
let mappingData = null;
try {
// First try to read from IndexedDB
mappingData = await IndexedDBHelper.loadData(this.config.indexFileName);
if (mappingData) {
console.log(`VectorDatabase: Loaded document mappings from IndexedDB`);
}
} catch (idbError) {
console.warn(
'VectorDatabase: Failed to read from IndexedDB, trying chrome.storage:',
idbError,
);
}
// If IndexedDB has no data, try reading from chrome.storage.local (backward compatibility)
if (!mappingData) {
try {
const storageKey = `hnswlib_document_mappings_${this.config.indexFileName}`;
const result = await chrome.storage.local.get([storageKey]);
mappingData = result[storageKey];
if (mappingData) {
console.log(
`VectorDatabase: Loaded document mappings from chrome.storage.local (fallback)`,
);
// Migrate to IndexedDB
try {
await IndexedDBHelper.saveData(this.config.indexFileName, mappingData);
console.log('VectorDatabase: Migrated data from chrome.storage to IndexedDB');
} catch (migrationError) {
console.warn('VectorDatabase: Failed to migrate data to IndexedDB:', migrationError);
}
}
} catch (storageError) {
console.warn('VectorDatabase: Failed to read from chrome.storage.local:', storageError);
}
}
if (mappingData) {
// Restore document mappings
this.documents.clear();
for (const [label, doc] of mappingData.documents) {
this.documents.set(label, doc);
}
// Restore tab mappings
this.tabDocuments.clear();
for (const [tabId, labels] of mappingData.tabDocuments) {
this.tabDocuments.set(tabId, new Set(labels));
}
// Restore nextLabel - use saved value or calculate max label + 1
if (mappingData.nextLabel !== undefined) {
this.nextLabel = mappingData.nextLabel;
} else if (this.documents.size > 0) {
// If no saved nextLabel, calculate max label + 1
const maxLabel = Math.max(...Array.from(this.documents.keys()));
this.nextLabel = maxLabel + 1;
} else {
this.nextLabel = 0;
}
console.log(
`VectorDatabase: Loaded ${this.documents.size} document mappings, next label: ${this.nextLabel}`,
);
} else {
console.log('VectorDatabase: No existing document mappings found');
}
} catch (error) {
console.error('VectorDatabase: Failed to load document mappings:', error);
}
}
}
// Global VectorDatabase singleton
let globalVectorDatabase: VectorDatabase | null = null;
let currentDimension: number | null = null;
/**
* Get global VectorDatabase singleton instance
* If dimension changes, will recreate instance to ensure compatibility
*/
export async function getGlobalVectorDatabase(
config?: Partial<VectorDatabaseConfig>,
): Promise<VectorDatabase> {
const newDimension = config?.dimension || 384;
// If dimension changes, need to recreate vector database
if (globalVectorDatabase && currentDimension !== null && currentDimension !== newDimension) {
console.log(
`VectorDatabase: Dimension changed from ${currentDimension} to ${newDimension}, recreating instance`,
);
// Clean up old instance - this will clean up index files and document mappings
try {
await globalVectorDatabase.clear();
console.log('VectorDatabase: Successfully cleared old instance for dimension change');
} catch (error) {
console.warn('VectorDatabase: Error during cleanup:', error);
}
globalVectorDatabase = null;
currentDimension = null;
}
if (!globalVectorDatabase) {
globalVectorDatabase = new VectorDatabase(config);
currentDimension = newDimension;
console.log(
`VectorDatabase: Created global singleton instance with dimension ${currentDimension}`,
);
}
return globalVectorDatabase;
}
/**
* Synchronous version of getting global VectorDatabase instance (for backward compatibility)
* Note: If dimension change is needed, recommend using async version
*/
export function getGlobalVectorDatabaseSync(
config?: Partial<VectorDatabaseConfig>,
): VectorDatabase {
const newDimension = config?.dimension || 384;
// If dimension changes, log warning but don't clean up (avoid race conditions)
if (globalVectorDatabase && currentDimension !== null && currentDimension !== newDimension) {
console.warn(
`VectorDatabase: Dimension mismatch detected (${currentDimension} vs ${newDimension}). Consider using async version for proper cleanup.`,
);
}
if (!globalVectorDatabase) {
globalVectorDatabase = new VectorDatabase(config);
currentDimension = newDimension;
console.log(
`VectorDatabase: Created global singleton instance with dimension ${currentDimension}`,
);
}
return globalVectorDatabase;
}
/**
* Reset global VectorDatabase instance (mainly for testing or model switching)
*/
export async function resetGlobalVectorDatabase(): Promise<void> {
console.log('VectorDatabase: Starting global instance reset...');
if (globalVectorDatabase) {
try {
console.log('VectorDatabase: Clearing existing global instance...');
await globalVectorDatabase.clear();
console.log('VectorDatabase: Global instance cleared successfully');
} catch (error) {
console.warn('VectorDatabase: Failed to clear during reset:', error);
}
}
// Additional cleanup: ensure all possible IndexedDB data is cleared
try {
console.log('VectorDatabase: Performing comprehensive IndexedDB cleanup...');
// Clear all data in VectorDatabaseStorage database
await IndexedDBHelper.clearAllData();
// Clear index files from hnswlib-index database
try {
console.log('VectorDatabase: Clearing HNSW index files from IndexedDB...');
// Try to clean up possible existing index files
const possibleIndexFiles = ['tab_content_index.dat', 'content_index.dat', 'vector_index.dat'];
// If global hnswlib instance exists, try to delete known index files
if (typeof globalHnswlib !== 'undefined' && globalHnswlib) {
for (const fileName of possibleIndexFiles) {
try {
// 1. First try to physically delete index file (using EmscriptenFileSystemManager)
try {
if (globalHnswlib.EmscriptenFileSystemManager.checkFileExists(fileName)) {
console.log(`VectorDatabase: Deleting physical index file: ${fileName}`);
globalHnswlib.EmscriptenFileSystemManager.deleteFile(fileName);
console.log(`VectorDatabase: Physical index file ${fileName} deleted successfully`);
}
} catch (fileError) {
console.log(
`VectorDatabase: Physical index file ${fileName} not found or failed to delete:`,
fileError,
);
}
// 2. Delete index file from IndexedDB
const tempIndex = new globalHnswlib.HierarchicalNSW('cosine', 384);
await tempIndex.deleteIndex(fileName);
console.log(`VectorDatabase: Deleted IndexedDB index file: ${fileName}`);
} catch (deleteError) {
// File might not exist, this is normal
console.log(`VectorDatabase: Index file ${fileName} not found or already deleted`);
}
}
// 3. Force sync filesystem to ensure deletion takes effect
try {
await new Promise<void>((resolve) => {
const timeout = setTimeout(() => {
console.warn('VectorDatabase: Filesystem sync timeout during cleanup');
resolve(); // Don't block the process
}, 3000);
globalHnswlib.EmscriptenFileSystemManager.syncFS(false, () => {
clearTimeout(timeout);
console.log('VectorDatabase: Filesystem sync completed during cleanup');
resolve();
});
});
} catch (syncError) {
console.warn('VectorDatabase: Failed to sync filesystem during cleanup:', syncError);
}
}
} catch (hnswError) {
console.warn('VectorDatabase: Failed to clear HNSW index files:', hnswError);
}
// Clear possible chrome.storage backup data (only clear vector database related data, preserve user preferences)
const possibleKeys = [
'hnswlib_document_mappings_tab_content_index.dat',
'hnswlib_document_mappings_content_index.dat',
'hnswlib_document_mappings_vector_index.dat',
// Note: Don't clear selectedModel and selectedVersion, these are user preference settings
// Note: Don't clear modelState, this contains model state info and should be handled by model management logic
];
if (possibleKeys.length > 0) {
try {
await chrome.storage.local.remove(possibleKeys);
console.log('VectorDatabase: Chrome storage backup data cleared');
} catch (storageError) {
console.warn('VectorDatabase: Failed to clear chrome.storage backup:', storageError);
}
}
console.log('VectorDatabase: Comprehensive cleanup completed');
} catch (cleanupError) {
console.warn('VectorDatabase: Comprehensive cleanup failed:', cleanupError);
}
globalVectorDatabase = null;
currentDimension = null;
console.log('VectorDatabase: Global singleton instance reset completed');
}
/**
* Specifically for data cleanup during model switching
* Clear all IndexedDB data, including HNSW index files and document mappings
*/
export async function clearAllVectorData(): Promise<void> {
console.log('VectorDatabase: Starting comprehensive vector data cleanup for model switch...');
try {
// 1. Clear global instance
if (globalVectorDatabase) {
try {
await globalVectorDatabase.clear();
} catch (error) {
console.warn('VectorDatabase: Failed to clear global instance:', error);
}
}
// 2. Clear VectorDatabaseStorage database
try {
console.log('VectorDatabase: Clearing VectorDatabaseStorage database...');
await IndexedDBHelper.clearAllData();
} catch (error) {
console.warn('VectorDatabase: Failed to clear VectorDatabaseStorage:', error);
}
// 3. Clear hnswlib-index database and physical files
try {
console.log('VectorDatabase: Clearing hnswlib-index database and physical files...');
// 3.1 First try to physically delete index files (using EmscriptenFileSystemManager)
if (typeof globalHnswlib !== 'undefined' && globalHnswlib) {
const possibleIndexFiles = [
'tab_content_index.dat',
'content_index.dat',
'vector_index.dat',
];
for (const fileName of possibleIndexFiles) {
try {
if (globalHnswlib.EmscriptenFileSystemManager.checkFileExists(fileName)) {
console.log(`VectorDatabase: Deleting physical index file: ${fileName}`);
globalHnswlib.EmscriptenFileSystemManager.deleteFile(fileName);
console.log(`VectorDatabase: Physical index file ${fileName} deleted successfully`);
}
} catch (fileError) {
console.log(
`VectorDatabase: Physical index file ${fileName} not found or failed to delete:`,
fileError,
);
}
}
// Force sync filesystem
try {
await new Promise<void>((resolve) => {
const timeout = setTimeout(() => {
console.warn('VectorDatabase: Filesystem sync timeout during model switch cleanup');
resolve();
}, 3000);
globalHnswlib.EmscriptenFileSystemManager.syncFS(false, () => {
clearTimeout(timeout);
console.log('VectorDatabase: Filesystem sync completed during model switch cleanup');
resolve();
});
});
} catch (syncError) {
console.warn(
'VectorDatabase: Failed to sync filesystem during model switch cleanup:',
syncError,
);
}
}
// 3.2 Delete entire hnswlib-index database
await new Promise<void>((resolve) => {
const deleteRequest = indexedDB.deleteDatabase('/hnswlib-index');
deleteRequest.onsuccess = () => {
console.log('VectorDatabase: Successfully deleted /hnswlib-index database');
resolve();
};
deleteRequest.onerror = () => {
console.warn(
'VectorDatabase: Failed to delete /hnswlib-index database:',
deleteRequest.error,
);
resolve(); // Don't block the process
};
deleteRequest.onblocked = () => {
console.warn('VectorDatabase: Deletion of /hnswlib-index database was blocked');
resolve(); // Don't block the process
};
});
} catch (error) {
console.warn(
'VectorDatabase: Failed to clear hnswlib-index database and physical files:',
error,
);
}
// 4. Clear backup data from chrome.storage
try {
const storageKeys = [
'hnswlib_document_mappings_tab_content_index.dat',
'hnswlib_document_mappings_content_index.dat',
'hnswlib_document_mappings_vector_index.dat',
];
await chrome.storage.local.remove(storageKeys);
console.log('VectorDatabase: Chrome storage backup data cleared');
} catch (error) {
console.warn('VectorDatabase: Failed to clear chrome.storage backup:', error);
}
// 5. Reset global state
globalVectorDatabase = null;
currentDimension = null;
console.log('VectorDatabase: Comprehensive vector data cleanup completed successfully');
} catch (error) {
console.error('VectorDatabase: Comprehensive vector data cleanup failed:', error);
throw error;
}
}