// DeepResearch/src/tools/jina-dedup.ts (zhengr, commit 0bcc252: "init")
import axios, {AxiosError} from 'axios';
import {TokenTracker} from "../utils/token-tracker";
import {JINA_API_KEY} from "../config";
/** Endpoint that receives the embedding requests issued by this module. */
const JINA_API_URL = 'https://api.jina.ai/v1/embeddings';

/**
 * Cosine-similarity cutoff: a pair of queries scoring at or above this value
 * is treated as a duplicate. Adjustable.
 */
const SIMILARITY_THRESHOLD = 0.93;

/** Fixed parameters copied into every embedding request body. */
const JINA_API_CONFIG = {
  MODEL: 'jina-embeddings-v3',
  TASK: 'text-matching',
  DIMENSIONS: 1024,
  EMBEDDING_TYPE: 'float',
  LATE_CHUNKING: false,
} as const;
// ---------------------------------------------------------------------------
// Wire types for the Jina embeddings endpoint
// ---------------------------------------------------------------------------

/** JSON body posted to the embeddings endpoint. */
interface JinaEmbeddingRequest {
  model: string;
  task: string;
  late_chunking: boolean;
  dimensions: number;
  embedding_type: string;
  /** Texts to embed; one embedding is expected back per entry. */
  input: string[];
}

/** Token accounting block of an embeddings response. */
interface JinaEmbeddingUsage {
  total_tokens: number;
  prompt_tokens: number;
}

/** A single embedding entry of an embeddings response. */
interface JinaEmbeddingItem {
  object: string;
  /** Position of the corresponding text within the request's `input`. */
  index: number;
  embedding: number[];
}

/** JSON body returned by the embeddings endpoint. */
interface JinaEmbeddingResponse {
  model: string;
  object: string;
  usage: JinaEmbeddingUsage;
  data: Array<JinaEmbeddingItem>;
}
/**
 * Computes the cosine similarity between two vectors:
 * `dot(vecA, vecB) / (|vecA| * |vecB|)`.
 *
 * @param vecA - First vector.
 * @param vecB - Second vector; must have the same length as `vecA`.
 * @returns The cosine similarity, or `0` when either vector has zero
 *   magnitude (including two empty vectors), where the ratio is undefined.
 * @throws RangeError if the two vectors have different lengths.
 */
function cosineSimilarity(vecA: number[], vecB: number[]): number {
  // A length mismatch would otherwise yield NaN or a silently wrong score.
  if (vecA.length !== vecB.length) {
    throw new RangeError(
      `Cannot compare vectors of different lengths (${vecA.length} vs ${vecB.length})`
    );
  }

  // Accumulate the dot product and both squared norms in a single pass.
  let dotProduct = 0;
  let squaredNormA = 0;
  let squaredNormB = 0;
  for (let i = 0; i < vecA.length; i++) {
    const a = vecA[i];
    const b = vecB[i];
    dotProduct += a * b;
    squaredNormA += a * a;
    squaredNormB += b * b;
  }

  const denominator = Math.sqrt(squaredNormA) * Math.sqrt(squaredNormB);
  // Avoid 0/0 = NaN: a zero-magnitude vector is treated as similar to nothing.
  if (denominator === 0) {
    return 0;
  }
  return dotProduct / denominator;
}
/**
 * Fetches embeddings for all queries from the Jina API in one batched request.
 *
 * @param queries - Texts to embed.
 * @returns The embeddings in the same order as `queries`, plus the total token
 *   count reported by the API. Returns `{embeddings: [], tokens: 0}` when
 *   `queries` is empty, when the response does not contain exactly one
 *   embedding per query, or when the API answers with HTTP 402.
 * @throws Error if `JINA_API_KEY` is not set; any other request failure is
 *   logged and rethrown unchanged.
 */
async function getEmbeddings(queries: string[]): Promise<{ embeddings: number[][], tokens: number }> {
  if (!JINA_API_KEY) {
    throw new Error('JINA_API_KEY is not set');
  }

  // Nothing to embed: skip the network round trip entirely.
  if (queries.length === 0) {
    return {embeddings: [], tokens: 0};
  }

  const request: JinaEmbeddingRequest = {
    model: JINA_API_CONFIG.MODEL,
    task: JINA_API_CONFIG.TASK,
    late_chunking: JINA_API_CONFIG.LATE_CHUNKING,
    dimensions: JINA_API_CONFIG.DIMENSIONS,
    embedding_type: JINA_API_CONFIG.EMBEDDING_TYPE,
    input: queries
  };

  try {
    const response = await axios.post<JinaEmbeddingResponse>(
      JINA_API_URL,
      request,
      {
        headers: {
          'Content-Type': 'application/json',
          'Authorization': `Bearer ${JINA_API_KEY}`
        }
      }
    );

    // Validate response format: exactly one embedding per submitted query.
    const items = response.data.data;
    if (!Array.isArray(items) || items.length !== queries.length) {
      console.error('Invalid response from Jina API:', response.data);
      return {embeddings: [], tokens: 0};
    }

    // Restore the original input order via `index`. Sort a copy so the
    // response object itself is not mutated.
    const embeddings = [...items]
      .sort((a, b) => a.index - b.index)
      .map(item => item.embedding);

    return {
      embeddings,
      // A missing usage block must not discard otherwise valid embeddings.
      tokens: response.data.usage?.total_tokens ?? 0
    };
  } catch (error) {
    console.error('Error getting embeddings from Jina:', error);
    // HTTP 402 is handled as a soft failure: callers receive an empty result
    // instead of an exception.
    if (error instanceof AxiosError && error.response?.status === 402) {
      return {embeddings: [], tokens: 0};
    }
    throw error;
  }
}
/**
 * Removes semantic near-duplicates from `newQueries`.
 *
 * A new query is dropped when the cosine similarity of its embedding to any
 * existing query, or to any new query accepted earlier in the list, is at or
 * above `SIMILARITY_THRESHOLD`. The relative order of the kept queries is
 * preserved.
 *
 * @param newQueries - Candidate queries to filter.
 * @param existingQueries - Queries already known; candidates similar to these
 *   are dropped.
 * @param tracker - Optional token tracker that records the embedding token
 *   usage under the `'dedup'` label; a throwaway tracker is used when omitted.
 * @returns The unique queries. When no embeddings could be obtained, all of
 *   `newQueries` is returned unfiltered.
 * @throws Rethrows any error raised while fetching or comparing embeddings,
 *   after logging it.
 */
export async function dedupQueries(
  newQueries: string[],
  existingQueries: string[],
  tracker?: TokenTracker
): Promise<{ unique_queries: string[] }> {
  try {
    // Nothing to deduplicate: do not spend an embedding request on the
    // existing queries alone (or on an empty input).
    if (newQueries.length === 0) {
      return {unique_queries: []};
    }

    // Quick return for single new query with no existing queries
    if (newQueries.length === 1 && existingQueries.length === 0) {
      return {unique_queries: newQueries};
    }

    // Get embeddings for all queries in one batch
    const allQueries = [...newQueries, ...existingQueries];
    const {embeddings: allEmbeddings, tokens} = await getEmbeddings(allQueries);

    // No embeddings available: fall back to returning every new query.
    if (!allEmbeddings.length) {
      return {unique_queries: newQueries};
    }

    // Split embeddings back into new and existing
    const newEmbeddings = allEmbeddings.slice(0, newQueries.length);
    const existingEmbeddings = allEmbeddings.slice(newQueries.length);

    const uniqueQueries: string[] = [];
    // Embeddings of the new queries accepted so far, so that later candidates
    // are also compared against earlier ones from the same batch.
    const acceptedEmbeddings: number[][] = [];

    for (let i = 0; i < newQueries.length; i++) {
      const candidate = newEmbeddings[i];
      const isTooSimilar = (known: number[]): boolean =>
        cosineSimilarity(candidate, known) >= SIMILARITY_THRESHOLD;

      // Existing queries are checked first, then already accepted new ones.
      if (existingEmbeddings.some(isTooSimilar) || acceptedEmbeddings.some(isTooSimilar)) {
        continue;
      }

      uniqueQueries.push(newQueries[i]);
      acceptedEmbeddings.push(candidate);
    }

    // Track token usage from the API
    (tracker ?? new TokenTracker()).trackUsage('dedup', {
      promptTokens: tokens,
      completionTokens: 0,
      totalTokens: tokens
    });
    console.log('Dedup:', uniqueQueries);

    return {unique_queries: uniqueQueries};
  } catch (error) {
    console.error('Error in deduplication analysis:', error);
    throw error;
  }
}