Spaces:
Sleeping
Sleeping
File size: 5,113 Bytes
0bcc252 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 |
import axios, {AxiosError} from 'axios';
import {TokenTracker} from "../utils/token-tracker";
import {JINA_API_KEY} from "../config";
/** Endpoint that embedding requests are POSTed to. */
const JINA_API_URL = 'https://api.jina.ai/v1/embeddings';

/**
 * Cosine-similarity cutoff used by the dedup logic: a pair of queries scoring
 * at or above this value is treated as a duplicate. Adjustable.
 */
const SIMILARITY_THRESHOLD = 0.93;

/** Fixed parameters copied into every embedding request body. */
const JINA_API_CONFIG = {
  MODEL: 'jina-embeddings-v3',
  TASK: 'text-matching',
  DIMENSIONS: 1024,
  EMBEDDING_TYPE: 'float',
  LATE_CHUNKING: false,
} as const;
// Types for Jina API

/**
 * JSON body sent to the Jina embeddings endpoint.
 * Every field except `input` is filled from the module-level request config.
 */
interface JinaEmbeddingRequest {
  /** Texts to embed; one embedding is expected back per entry. */
  input: string[];
  /** Name of the embedding model to use. */
  model: string;
  /** Task identifier passed through to the API. */
  task: string;
  /** Value of the API's `dimensions` option. */
  dimensions: number;
  /** Value of the API's `embedding_type` option. */
  embedding_type: string;
  /** Value of the API's `late_chunking` option. */
  late_chunking: boolean;
}
/** Token accounting block of an embeddings response. */
interface JinaEmbeddingUsage {
  total_tokens: number;
  prompt_tokens: number;
}

/** One embedding entry of an embeddings response. */
interface JinaEmbeddingItem {
  object: string;
  /** Position used to put the entries back into request order. */
  index: number;
  /** The embedding vector itself. */
  embedding: number[];
}

/** JSON body returned by the Jina embeddings endpoint. */
interface JinaEmbeddingResponse {
  model: string;
  object: string;
  usage: JinaEmbeddingUsage;
  data: JinaEmbeddingItem[];
}
/**
 * Computes the cosine similarity between two numeric vectors.
 *
 * If the vectors differ in length, the shorter one is treated as zero-padded,
 * so missing components never produce `NaN`.
 *
 * @param vecA - First vector.
 * @param vecB - Second vector.
 * @returns The cosine of the angle between the vectors, or `0` when either
 *   vector has zero magnitude (including empty vectors), where the similarity
 *   is undefined and a plain division would yield `NaN`.
 */
function cosineSimilarity(vecA: number[], vecB: number[]): number {
  const length = Math.max(vecA.length, vecB.length);

  let dotProduct = 0;
  let squaredNormA = 0;
  let squaredNormB = 0;

  // Single pass: accumulate the dot product and both squared magnitudes.
  for (let i = 0; i < length; i++) {
    const a = vecA[i] ?? 0;
    const b = vecB[i] ?? 0;
    dotProduct += a * b;
    squaredNormA += a * a;
    squaredNormB += b * b;
  }

  const denominator = Math.sqrt(squaredNormA) * Math.sqrt(squaredNormB);

  // A zero-magnitude vector has no direction; report "not similar" instead of NaN.
  if (denominator === 0) {
    return 0;
  }

  return dotProduct / denominator;
}
/**
 * Fetches embeddings for all queries in one batched request to the Jina API.
 *
 * @param queries - Texts to embed.
 * @returns The embeddings in the same order as `queries`, plus the total token
 *   count reported by the API. Returns `{ embeddings: [], tokens: 0 }` when
 *   `queries` is empty, when the response is malformed or has the wrong number
 *   of entries, or when the API answers with HTTP 402.
 * @throws Error if `JINA_API_KEY` is not set.
 * @throws Rethrows any request error other than an HTTP 402 response.
 */
async function getEmbeddings(queries: string[]): Promise<{ embeddings: number[][], tokens: number }> {
  if (!JINA_API_KEY) {
    throw new Error('JINA_API_KEY is not set');
  }

  // Nothing to embed: skip the network round-trip entirely.
  if (queries.length === 0) {
    return {
      embeddings: [],
      tokens: 0
    };
  }

  const request: JinaEmbeddingRequest = {
    model: JINA_API_CONFIG.MODEL,
    task: JINA_API_CONFIG.TASK,
    late_chunking: JINA_API_CONFIG.LATE_CHUNKING,
    dimensions: JINA_API_CONFIG.DIMENSIONS,
    embedding_type: JINA_API_CONFIG.EMBEDDING_TYPE,
    input: queries
  };

  try {
    const response = await axios.post<JinaEmbeddingResponse>(
      JINA_API_URL,
      request,
      {
        headers: {
          'Content-Type': 'application/json',
          'Authorization': `Bearer ${JINA_API_KEY}`
        }
      }
    );

    // Validate response format: one embedding entry per submitted query.
    const items = response.data?.data;
    if (!items || items.length !== queries.length) {
      console.error('Invalid response from Jina API:', response.data);
      return {
        embeddings: [],
        tokens: 0
      };
    }

    // Sort a copy by index to restore the original query order without
    // mutating the response payload.
    const embeddings = [...items]
      .sort((a, b) => a.index - b.index)
      .map(item => item.embedding);

    return {
      embeddings,
      // Fall back to 0 if the API omitted the usage block.
      tokens: response.data.usage?.total_tokens ?? 0
    };
  } catch (error) {
    console.error('Error getting embeddings from Jina:', error);
    // HTTP 402 is handled as "no embeddings available" rather than a failure.
    if (error instanceof AxiosError && error.response?.status === 402) {
      return {
        embeddings: [],
        tokens: 0
      };
    }
    throw error;
  }
}
/**
 * Filters `newQueries` down to those that are not near-duplicates, judged by
 * cosine similarity of their embeddings against `SIMILARITY_THRESHOLD`.
 *
 * A new query is dropped when it is too similar to any existing query or to a
 * new query accepted earlier in the list, so the first of several similar new
 * queries wins.
 *
 * @param newQueries - Candidate queries to filter.
 * @param existingQueries - Queries already known; candidates similar to these are dropped.
 * @param tracker - Optional tracker that receives the embedding token usage
 *   under the `'dedup'` label.
 * @returns The surviving queries, in their original relative order. All of
 *   `newQueries` are returned unchanged when no embeddings could be obtained.
 * @throws Rethrows any error from the embedding request after logging it.
 */
export async function dedupQueries(
  newQueries: string[],
  existingQueries: string[],
  tracker?: TokenTracker
): Promise<{ unique_queries: string[] }> {
  try {
    // No candidates: nothing to deduplicate, so don't spend tokens embedding
    // the existing queries.
    if (newQueries.length === 0) {
      return {
        unique_queries: [],
      };
    }

    // Quick return for single new query with no existing queries
    if (newQueries.length === 1 && existingQueries.length === 0) {
      return {
        unique_queries: newQueries,
      };
    }

    // Get embeddings for all queries in one batch
    const allQueries = [...newQueries, ...existingQueries];
    const {embeddings: allEmbeddings, tokens} = await getEmbeddings(allQueries);

    // No embeddings available (e.g. 402 or invalid response): keep every new query
    if (!allEmbeddings.length) {
      return {
        unique_queries: newQueries,
      };
    }

    // Split embeddings back into new and existing
    const newEmbeddings = allEmbeddings.slice(0, newQueries.length);
    // Every embedding a candidate must stay dissimilar to: starts as the
    // existing queries and grows with each accepted new query.
    const referenceEmbeddings = allEmbeddings.slice(newQueries.length);

    const uniqueQueries: string[] = [];

    for (let i = 0; i < newQueries.length; i++) {
      const candidate = newEmbeddings[i];
      const isDuplicate = referenceEmbeddings.some(
        reference => cosineSimilarity(candidate, reference) >= SIMILARITY_THRESHOLD
      );

      if (!isDuplicate) {
        uniqueQueries.push(newQueries[i]);
        referenceEmbeddings.push(candidate);
      }
    }

    // Track token usage from the API
    (tracker ?? new TokenTracker()).trackUsage('dedup', {
      promptTokens: tokens,
      completionTokens: 0,
      totalTokens: tokens
    });

    console.log('Dedup:', uniqueQueries);
    return {
      unique_queries: uniqueQueries,
    };
  } catch (error) {
    console.error('Error in deduplication analysis:', error);
    throw error;
  }
}
|