/**
 * TF-IDF Search Engine
 * Implements Term Frequency-Inverse Document Frequency ranking for intelligent search
 */

import { storage } from './storage.js';

// High-frequency English words that carry no search value; tokens found in
// this set are excluded from tokenized text and from search suggestions.
const STOP_WORDS = new Set(
  [
    'a an and are as at be by for from',
    'has he in is it its of on that the',
    'to was will with this but they have had',
    'what when where who which why how',
  ]
    .join(' ')
    .split(' '),
);

/**
 * Tokenize text into searchable terms.
 *
 * The text is NFC-normalized and lower-cased, every character that is not a
 * letter, combining mark, digit, underscore or whitespace is replaced by a
 * space, and the result is split on whitespace. Tokens shorter than three
 * characters and tokens present in STOP_WORDS are discarded.
 *
 * @param {string} text - Raw text to tokenize.
 * @returns {string[]} Terms in input order (duplicates preserved); an empty
 *   array for empty or non-string input.
 */
function tokenize(text) {
  if (typeof text !== 'string' || text.length === 0) return [];

  return text
    // Composed and decomposed spellings of the same word must yield one term.
    .normalize('NFC')
    .toLowerCase()
    // Unicode-aware on purpose: `\w` only covers ASCII and would cut words
    // such as "café" or any non-Latin text into fragments.
    .replace(/[^\p{L}\p{M}\p{N}_\s]/gu, ' ')
    .split(/\s+/)
    .filter((token) => token.length > 2 && !STOP_WORDS.has(token));
}

/**
 * Build and persist the search index for a vault entry.
 *
 * The entry's title, domain, text extract and URL are joined into one string
 * and tokenized. Each distinct term is mapped to its term frequency
 * (occurrences / total number of tokens) and the resulting object is handed
 * to storage.updateSearchIndex.
 *
 * @param {*} vaultId - Identifier forwarded unchanged to storage.updateSearchIndex.
 * @param {{title?: string, domain?: string, textExtract?: string, url?: string}} entry
 *   Entry whose text fields are indexed; missing or falsy fields count as ''.
 * @returns {Promise<Object<string, number>>} Term -> term frequency.
 * @throws Re-throws any error raised while indexing or storing, after logging it.
 */
export async function buildSearchIndex(vaultId, entry) {
  try {
    const searchableText = [entry.title, entry.domain, entry.textExtract, entry.url]
      .map((field) => field || '')
      .join(' ');

    const tokens = tokenize(searchableText);
    const totalTokens = tokens.length;

    // Count in a Map rather than a plain object: tokens come from arbitrary
    // page text, and a word such as "constructor" would otherwise read the
    // inherited Object.prototype member and turn the count into NaN.
    const counts = new Map();
    for (const token of tokens) {
      counts.set(token, (counts.get(token) || 0) + 1);
    }

    // Object.fromEntries defines own data properties, so even a "__proto__"
    // term is stored as a regular key. With no tokens the Map is empty and
    // no division takes place.
    const terms = Object.fromEntries(
      Array.from(counts, ([term, count]) => [term, count / totalTokens]),
    );

    await storage.updateSearchIndex(vaultId, terms);

    return terms;
  } catch (error) {
    console.error('Error building search index:', error);
    throw error;
  }
}


/**
 * Calculate a smoothed IDF (Inverse Document Frequency) for a term.
 *
 * Uses log(1 + N / df), where df is the number of stored search indexes whose
 * `terms` object has the term as an own property with a truthy value. Unlike
 * the plain log(N / df), this stays strictly positive when a term occurs in
 * every document (for example in a single-document vault), so matching
 * documents are not scored as zero, and it cannot go negative when more
 * indexes than documents contain the term.
 *
 * @param {string} term - Term to weigh.
 * @param {number} totalDocuments - Number of documents in the corpus (N).
 * @returns {Promise<number>} 0 when no index contains the term or when
 *   totalDocuments is not a positive number; otherwise a value greater than 0
 *   that grows as the term gets rarer.
 */
async function calculateIDF(term, totalDocuments) {
  if (!(totalDocuments > 0)) return 0;

  const allIndexes = await storage.getAllSearchIndexes();

  // Own-property check: a bare `terms[term]` lookup would report words such
  // as "constructor" as present in every index via Object.prototype.
  const documentsWithTerm = allIndexes.filter(
    (index) =>
      index &&
      index.terms &&
      Object.prototype.hasOwnProperty.call(index.terms, term) &&
      index.terms[term],
  ).length;

  if (documentsWithTerm === 0) return 0;

  // Smoothed IDF = log(1 + N / df)
  return Math.log(1 + totalDocuments / documentsWithTerm);
}

/**
 * Search the vault using TF-IDF ranking.
 *
 * The query is tokenized, an IDF weight is computed once per distinct query
 * term, and every vault entry that has a stored search index is scored as the
 * sum of tf * idf over the query tokens (a token repeated in the query counts
 * each time). The score is doubled when the entry title contains the trimmed,
 * lower-cased query and multiplied by 1.5 when the entry domain contains it.
 *
 * @param {string} query - Raw user query.
 * @returns {Promise<Object[]>} Shallow copies of the matching entries with an
 *   added numeric `searchScore`, sorted by score descending. Only entries
 *   scoring above 0 are returned. Empty array for non-string queries, queries
 *   shorter than two characters after trimming, queries with no usable tokens,
 *   or an empty vault.
 */
export async function searchWithTFIDF(query) {
  if (typeof query !== 'string') return [];

  // Normalized once: the same value drives the length check and both boosts,
  // so surrounding whitespace in the query cannot defeat the boosts.
  const normalizedQuery = query.trim().toLowerCase();
  if (normalizedQuery.length < 2) return [];

  const queryTokens = tokenize(query);
  if (queryTokens.length === 0) return [];

  const vaultEntries = await storage.getAllVaultEntries();
  const totalDocuments = vaultEntries.length;

  if (totalDocuments === 0) return [];

  // IDF is needed once per distinct term; the lookups are independent.
  const uniqueTokens = [...new Set(queryTokens)];
  const idfValues = await Promise.all(
    uniqueTokens.map((token) => calculateIDF(token, totalDocuments)),
  );
  const idfScores = new Map(uniqueTokens.map((token, i) => [token, idfValues[i]]));

  // Index reads are independent of each other; results stay aligned with
  // vaultEntries by position.
  const searchIndexes = await Promise.all(
    vaultEntries.map((entry) => storage.getSearchIndex(entry.id)),
  );

  const scoredResults = [];

  vaultEntries.forEach((entry, position) => {
    const searchIndex = searchIndexes[position];
    if (!searchIndex || !searchIndex.terms) return;

    const { terms } = searchIndex;
    let score = 0;

    for (const token of queryTokens) {
      // Own-property lookup: `terms[token]` alone would return inherited
      // members (e.g. "constructor") and poison the score with NaN.
      const tf = Object.prototype.hasOwnProperty.call(terms, token) ? terms[token] || 0 : 0;
      const idf = idfScores.get(token) || 0;
      score += tf * idf;
    }

    // Boost when the title contains the whole query.
    if (entry.title && entry.title.toLowerCase().includes(normalizedQuery)) {
      score *= 2;
    }

    // Boost when the domain contains the whole query.
    if (entry.domain && entry.domain.toLowerCase().includes(normalizedQuery)) {
      score *= 1.5;
    }

    if (score > 0) {
      scoredResults.push({
        ...entry,
        searchScore: score,
      });
    }
  });

  // Sort by score descending
  scoredResults.sort((a, b) => b.searchScore - a.searchScore);

  return scoredResults;
}

/**
 * Rebuild the search index of every vault entry (maintenance helper).
 *
 * Entries are re-indexed one after another, in the order returned by
 * storage.getAllVaultEntries, and a summary line is logged when done.
 *
 * @returns {Promise<void>}
 */
export async function rebuildSearchIndex() {
  const entries = await storage.getAllVaultEntries();

  // Sequential on purpose: each rebuild finishes before the next one starts.
  for (const current of entries) {
    await buildSearchIndex(current.id, current);
  }

  const summary = `Rebuilt search index for ${entries.length} entries`;
  console.log(summary);
}

/**
 * Get search suggestions for a partial query.
 *
 * Scans the terms of every stored search index and collects the distinct
 * terms that start with the trimmed, lower-cased query, skipping stop words.
 * Scanning stops as soon as `limit` suggestions have been found.
 *
 * @param {string} partialQuery - Text typed so far.
 * @param {number} [limit=10] - Maximum number of suggestions; expected to be a
 *   positive integer.
 * @returns {Promise<string[]>} Up to `limit` terms in the order they were
 *   first encountered. Empty array for non-string input, for queries shorter
 *   than two characters after trimming, or for a non-positive limit.
 */
export async function getSearchSuggestions(partialQuery, limit = 10) {
  if (typeof partialQuery !== 'string' || !(limit > 0)) return [];

  // Indexed terms are lower-case and contain no whitespace, so an untrimmed
  // or mixed-case prefix could never match.
  const prefix = partialQuery.trim().toLowerCase();
  if (prefix.length < 2) return [];

  const allIndexes = await storage.getAllSearchIndexes();
  const suggestions = new Set();

  for (const index of allIndexes) {
    if (!index || !index.terms) continue;

    for (const term of Object.keys(index.terms)) {
      if (term.startsWith(prefix) && !STOP_WORDS.has(term)) {
        suggestions.add(term);
        // Enough collected: no need to walk the remaining indexes.
        if (suggestions.size >= limit) return Array.from(suggestions);
      }
    }
  }

  return Array.from(suggestions);
}
