import axios, {AxiosError} from 'axios';
import {TokenTracker} from "../utils/token-tracker";
import {JINA_API_KEY} from "../config";

const JINA_API_URL = 'https://api.jina.ai/v1/embeddings';
const SIMILARITY_THRESHOLD = 0.93; // Adjustable threshold for cosine similarity

// Fixed request parameters sent with every embedding call.
const JINA_API_CONFIG = {
  MODEL: 'jina-embeddings-v3',
  TASK: 'text-matching',
  DIMENSIONS: 1024,
  EMBEDDING_TYPE: 'float',
  LATE_CHUNKING: false
} as const;

// Types for Jina API
interface JinaEmbeddingRequest {
  model: string;
  task: string;
  late_chunking: boolean;
  dimensions: number;
  embedding_type: string;
  input: string[];
}

interface JinaEmbeddingResponse {
  model: string;
  object: string;
  usage: {
    total_tokens: number;
    prompt_tokens: number;
  };
  data: Array<{
    object: string;
    index: number;
    embedding: number[];
  }>;
}

/**
 * Compute the cosine similarity between two vectors.
 *
 * The vectors are expected to have the same length; this is not checked here.
 *
 * @param vecA - First vector.
 * @param vecB - Second vector.
 * @returns The dot product divided by the product of the magnitudes, or 0 when
 *          either vector has zero magnitude.
 */
function cosineSimilarity(vecA: number[], vecB: number[]): number {
  const dotProduct = vecA.reduce((sum, a, i) => sum + a * vecB[i], 0);
  const normA = Math.sqrt(vecA.reduce((sum, a) => sum + a * a, 0));
  const normB = Math.sqrt(vecB.reduce((sum, b) => sum + b * b, 0));
  const denominator = normA * normB;

  // A zero-magnitude vector has no direction; report "not similar" rather than 0/0 = NaN.
  if (denominator === 0) {
    return 0;
  }
  return dotProduct / denominator;
}

/**
 * Get embeddings for all queries in one batch.
 *
 * @param queries - Texts to embed; sent as a single request.
 * @returns The embeddings in the same order as `queries`, plus the total token
 *          count reported by the API. Returns empty embeddings and zero tokens
 *          when the response does not contain one item per query, or when the
 *          request fails with HTTP 402.
 * @throws Error if `JINA_API_KEY` is not set; any request error other than
 *         HTTP 402 is logged and rethrown.
 */
async function getEmbeddings(queries: string[]): Promise<{ embeddings: number[][], tokens: number }> {
  if (!JINA_API_KEY) {
    throw new Error('JINA_API_KEY is not set');
  }

  const request: JinaEmbeddingRequest = {
    model: JINA_API_CONFIG.MODEL,
    task: JINA_API_CONFIG.TASK,
    late_chunking: JINA_API_CONFIG.LATE_CHUNKING,
    dimensions: JINA_API_CONFIG.DIMENSIONS,
    embedding_type: JINA_API_CONFIG.EMBEDDING_TYPE,
    input: queries
  };

  try {
    const response = await axios.post(
      JINA_API_URL,
      request,
      {
        headers: {
          'Content-Type': 'application/json',
          'Authorization': `Bearer ${JINA_API_KEY}`
        }
      }
    );

    // Pin the expected payload shape so the callbacks below are type-checked.
    const payload: JinaEmbeddingResponse = response.data;

    // Validate response format
    if (!payload.data || payload.data.length !== queries.length) {
      console.error('Invalid response from Jina API:', payload);
      return { embeddings: [], tokens: 0 };
    }

    // Sort a copy by index to restore the original query order without
    // mutating the response object.
    const embeddings = [...payload.data]
      .sort((a, b) => a.index - b.index)
      .map(item => item.embedding);

    return { embeddings, tokens: payload.usage.total_tokens };
  } catch (error) {
    console.error('Error getting embeddings from Jina:', error);
    // HTTP 402 is treated as "no embeddings available" so callers can fall back.
    if (error instanceof AxiosError && error.response?.status === 402) {
      return { embeddings: [], tokens: 0 };
    }
    throw error;
  }
}

/**
 * Remove new queries that are near-duplicates of existing queries or of new
 * queries already accepted earlier in the list.
 *
 * Two queries count as duplicates when the cosine similarity of their
 * embeddings is at least `SIMILARITY_THRESHOLD`.
 *
 * @param newQueries - Candidate queries, checked in order.
 * @param existingQueries - Queries the candidates must differ from.
 * @param tracker - Optional tracker that records the embedding token usage.
 * @returns The accepted queries, in their original order. When no embeddings
 *          could be obtained, all of `newQueries` are returned unfiltered.
 * @throws Rethrows any error raised while obtaining embeddings or comparing them.
 */
export async function dedupQueries(
  newQueries: string[],
  existingQueries: string[],
  tracker?: TokenTracker
): Promise<{ unique_queries: string[] }> {
  try {
    // Nothing to deduplicate: avoid an embedding request altogether.
    if (newQueries.length === 0) {
      return {
        unique_queries: [],
      };
    }

    // Quick return for single new query with no existing queries
    if (newQueries.length === 1 && existingQueries.length === 0) {
      return {
        unique_queries: newQueries,
      };
    }

    // Get embeddings for all queries in one batch
    const allQueries = [...newQueries, ...existingQueries];
    const {embeddings: allEmbeddings, tokens} = await getEmbeddings(allQueries);

    // If embeddings is empty (402 error or invalid response), return all new queries
    if (!allEmbeddings.length) {
      return {
        unique_queries: newQueries,
      };
    }

    // Split embeddings back into new and existing
    const newEmbeddings = allEmbeddings.slice(0, newQueries.length);
    const existingEmbeddings = allEmbeddings.slice(newQueries.length);

    const isNearDuplicate = (a: number[], b: number[]): boolean =>
      cosineSimilarity(a, b) >= SIMILARITY_THRESHOLD;

    const uniqueQueries: string[] = [];
    // Embeddings of the new queries accepted so far, so later candidates are
    // also compared against them.
    const acceptedEmbeddings: number[][] = [];

    // Compare each new query against existing queries and already accepted queries
    for (let i = 0; i < newQueries.length; i++) {
      const candidate = newEmbeddings[i];

      const isDuplicate =
        existingEmbeddings.some(existing => isNearDuplicate(candidate, existing)) ||
        acceptedEmbeddings.some(accepted => isNearDuplicate(candidate, accepted));

      // Add to unique queries if passed all checks
      if (!isDuplicate) {
        uniqueQueries.push(newQueries[i]);
        acceptedEmbeddings.push(candidate);
      }
    }

    // Track token usage from the API
    (tracker ?? new TokenTracker()).trackUsage('dedup', {
      promptTokens: tokens,
      completionTokens: 0,
      totalTokens: tokens
    });
    console.log('Dedup:', uniqueQueries);
    return {
      unique_queries: uniqueQueries,
    };
  } catch (error) {
    console.error('Error in deduplication analysis:', error);
    throw error;
  }
}