amirmmahdavikia commited on
Commit
31f8407
·
verified ·
1 Parent(s): fa38533

Upload retriever.py

Browse files
Files changed (1) hide show
  1. retriever.py +208 -0
retriever.py ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+
3
+ import math
4
+ import numpy as np
5
+ from multiprocessing import Pool, cpu_count
6
+
7
+ """
8
+ All of these algorithms have been taken from the paper:
9
+ Trotman et al., Improvements to BM25 and Language Models Examined
10
+
11
+ Here we implement all the BM25 variations mentioned.
12
+ """
13
+
14
+
15
class BM25:
    """Corpus statistics shared by the BM25 ranking variants.

    The constructor indexes ``corpus`` and then calls ``_calc_idf``. This base
    class leaves ``_calc_idf``, ``get_scores`` and ``get_batch_scores``
    unimplemented (they raise ``NotImplementedError``); concrete variants
    override them.

    :param corpus: iterable of documents. Each document must be a sized
        iterable of hashable tokens, unless ``tokenizer`` is given, in which
        case each document is whatever ``tokenizer`` accepts.
    :param tokenizer: optional callable applied to every document, in a
        process pool, before indexing.
    """

    def __init__(self, corpus, tokenizer=None):
        self.corpus_size = 0  # number of indexed documents
        self.avgdl = 0  # average document length, in tokens
        self.doc_freqs = []  # per document: token -> occurrences in that document
        self.idf = {}  # token -> idf weight, filled in by _calc_idf
        self.doc_len = []  # per document: number of tokens
        self.tokenizer = tokenizer

        if tokenizer:
            corpus = self._tokenize_corpus(corpus)

        nd = self._initialize(corpus)
        self._calc_idf(nd)

    def _initialize(self, corpus):
        """Index ``corpus`` and return the document frequency of every token.

        Appends to ``doc_len`` and ``doc_freqs``, increments ``corpus_size``
        and sets ``avgdl``.

        :param corpus: iterable of tokenized documents.
        :return: dict mapping token -> number of documents containing it.
        """
        nd = {}  # word -> number of documents with word
        total_tokens = 0
        for document in corpus:
            self.doc_len.append(len(document))
            total_tokens += len(document)

            frequencies = {}
            for word in document:
                frequencies[word] = frequencies.get(word, 0) + 1
            self.doc_freqs.append(frequencies)

            # Each distinct word counts once per document, whatever its
            # in-document frequency.
            for word in frequencies:
                nd[word] = nd.get(word, 0) + 1

            self.corpus_size += 1

        # An empty corpus has no meaningful average length; use 0 instead of
        # failing with ZeroDivisionError.
        self.avgdl = total_tokens / self.corpus_size if self.corpus_size else 0.0
        return nd

    def _tokenize_corpus(self, corpus):
        """Apply ``self.tokenizer`` to every document using a process pool.

        :param corpus: iterable of raw documents.
        :return: list with one tokenizer result per document, in input order.
        """
        # The context manager shuts the worker processes down once map()
        # has returned, so the pool is not leaked.
        with Pool(cpu_count()) as pool:
            return pool.map(self.tokenizer, corpus)

    def _calc_idf(self, nd):
        """Fill ``self.idf`` from the document frequencies ``nd`` (abstract)."""
        raise NotImplementedError()

    def get_scores(self, query):
        """Return one score per indexed document for ``query`` (abstract)."""
        raise NotImplementedError()

    def get_batch_scores(self, query, doc_ids):
        """Return scores for the documents selected by ``doc_ids`` (abstract)."""
        raise NotImplementedError()

    def get_top_n(self, query, documents, n=5):
        """Return the ``n`` best-scoring documents for ``query``.

        :param query: query passed through to ``get_scores``.
        :param documents: sequence aligned with the indexed corpus; its items
            are used as dictionary keys, so they must be hashable and equal
            items collapse into a single entry.
        :param n: maximum number of documents to return.
        :return: dict mapping document -> its index in ``documents``, inserted
            in descending score order.
        :raises AssertionError: if ``documents`` does not have exactly
            ``corpus_size`` items.
        """
        assert self.corpus_size == len(documents), "The documents given don't match the index corpus!"

        scores = self.get_scores(query)
        # argsort is ascending: reverse it, then keep the first n indices.
        top_n = np.argsort(scores)[::-1][:n]
        return {documents[i]: i for i in top_n}
76
+
77
+
78
class BM25Okapi(BM25):
    """BM25 with an epsilon floor applied to negative idf values.

    :param corpus: documents to index (see the base class constructor).
    :param tokenizer: optional callable used to tokenize each document.
    :param k1: term-frequency saturation parameter.
    :param b: document-length normalisation parameter.
    :param epsilon: factor applied to the average idf to obtain the value
        that replaces negative idfs.
    """

    def __init__(self, corpus, tokenizer=None, k1=1.5, b=0.75, epsilon=0.25):
        # The parameters are stored before the base constructor runs because
        # it calls _calc_idf, which reads self.epsilon.
        self.k1 = k1
        self.b = b
        self.epsilon = epsilon
        super().__init__(corpus, tokenizer)

    def _calc_idf(self, nd):
        """
        Calculates frequencies of terms in documents and in corpus.
        This algorithm sets a floor on the idf values to eps * average_idf

        :param nd: dict mapping term -> number of documents containing it.
        """
        # collect idf sum to calculate an average idf for epsilon value
        idf_sum = 0
        # collect words with negative idf to set them a special epsilon value.
        # idf can be negative if word is contained in more than half of documents
        negative_idfs = []
        for word, freq in nd.items():
            idf = math.log(self.corpus_size - freq + 0.5) - math.log(freq + 0.5)
            self.idf[word] = idf
            idf_sum += idf
            if idf < 0:
                negative_idfs.append(word)

        # An empty vocabulary (e.g. every document is empty) has no average
        # idf; use 0 instead of dividing by zero.
        self.average_idf = idf_sum / len(self.idf) if self.idf else 0.0

        eps = self.epsilon * self.average_idf
        for word in negative_idfs:
            self.idf[word] = eps

    def _score(self, query, doc_freqs, doc_len):
        """Accumulate the score of every document in ``doc_freqs``.

        :param query: iterable of query terms.
        :param doc_freqs: list of term -> frequency dicts, one per document.
        :param doc_len: numpy array of document lengths aligned with
            ``doc_freqs``.
        :return: numpy array holding one score per document.
        """
        score = np.zeros(len(doc_freqs))
        # Terms absent from the index contribute nothing, so they are skipped
        # rather than scanned for in every document. This also avoids
        # dividing by avgdl when it is 0, which can only happen when the
        # vocabulary is empty.
        terms = [q for q in query if q in self.idf]
        if not terms:
            return score

        # The length normalisation does not depend on the term; compute once.
        norm = self.k1 * (1 - self.b + self.b * doc_len / self.avgdl)
        for q in terms:
            q_freq = np.array([doc.get(q, 0) for doc in doc_freqs])
            score += self.idf[q] * (q_freq * (self.k1 + 1) /
                                    (q_freq + norm))
        return score

    def get_scores(self, query):
        """
        The ATIRE BM25 variant uses an idf function which uses a log(idf) score. To prevent negative idf scores,
        this algorithm also adds a floor to the idf value of epsilon.
        See [Trotman, A., X. Jia, M. Crane, Towards an Efficient and Effective Search Engine] for more info
        :param query: iterable of query terms.
        :return: numpy array with one score per indexed document.
        """
        return self._score(query, self.doc_freqs, np.array(self.doc_len))

    def get_batch_scores(self, query, doc_ids):
        """
        Calculate bm25 scores between query and subset of all docs

        :param query: iterable of query terms.
        :param doc_ids: indices of the documents to score.
        :return: list of scores in the order of ``doc_ids``.
        :raises AssertionError: if an id is not below the number of indexed
            documents.
        """
        assert all(di < len(self.doc_freqs) for di in doc_ids)
        doc_len = np.array(self.doc_len)[doc_ids]
        subset = [self.doc_freqs[di] for di in doc_ids]
        return self._score(query, subset, doc_len).tolist()
135
+
136
+
137
class BM25L(BM25):
    """BM25L variant: adds ``delta`` to the length-normalised term frequency.

    :param corpus: documents to index (see the base class constructor).
    :param tokenizer: optional callable used to tokenize each document.
    :param k1: term-frequency saturation parameter.
    :param b: document-length normalisation parameter.
    :param delta: constant added to the normalised term frequency.
    """

    def __init__(self, corpus, tokenizer=None, k1=1.5, b=0.75, delta=0.5):
        # Algorithm specific parameters
        self.k1 = k1
        self.b = b
        self.delta = delta
        super().__init__(corpus, tokenizer)

    def _calc_idf(self, nd):
        """Set ``idf[word] = log(corpus_size + 1) - log(df + 0.5)``.

        :param nd: dict mapping term -> number of documents containing it.
        """
        for word, freq in nd.items():
            idf = math.log(self.corpus_size + 1) - math.log(freq + 0.5)
            self.idf[word] = idf

    def _score(self, query, doc_freqs, doc_len):
        """Accumulate the score of every document in ``doc_freqs``.

        :param query: iterable of query terms.
        :param doc_freqs: list of term -> frequency dicts, one per document.
        :param doc_len: numpy array of document lengths aligned with
            ``doc_freqs``.
        :return: numpy array holding one score per document.
        """
        score = np.zeros(len(doc_freqs))
        # Terms absent from the index contribute nothing, so they are skipped
        # rather than scanned for in every document. This also avoids
        # dividing by avgdl when it is 0, which can only happen when the
        # vocabulary is empty.
        terms = [q for q in query if q in self.idf]
        if not terms:
            return score

        # The length normalisation does not depend on the term; compute once.
        length_norm = 1 - self.b + self.b * doc_len / self.avgdl
        for q in terms:
            q_freq = np.array([doc.get(q, 0) for doc in doc_freqs])
            ctd = q_freq / length_norm
            # NOTE(review): documents that do not contain the term (ctd == 0)
            # still receive the delta contribution here; the published BM25L
            # formula appears to score them 0 - confirm which is intended.
            score += self.idf[q] * (self.k1 + 1) * (ctd + self.delta) / \
                (self.k1 + ctd + self.delta)
        return score

    def get_scores(self, query):
        """Score every indexed document against ``query``.

        :param query: iterable of query terms.
        :return: numpy array with one score per indexed document.
        """
        return self._score(query, self.doc_freqs, np.array(self.doc_len))

    def get_batch_scores(self, query, doc_ids):
        """
        Calculate bm25 scores between query and subset of all docs

        :param query: iterable of query terms.
        :param doc_ids: indices of the documents to score.
        :return: list of scores in the order of ``doc_ids``.
        :raises AssertionError: if an id is not below the number of indexed
            documents.
        """
        assert all(di < len(self.doc_freqs) for di in doc_ids)
        doc_len = np.array(self.doc_len)[doc_ids]
        subset = [self.doc_freqs[di] for di in doc_ids]
        return self._score(query, subset, doc_len).tolist()
173
+
174
+
175
class BM25Plus(BM25):
    """BM25+ variant: adds ``delta`` to the saturated term-frequency component.

    :param corpus: documents to index (see the base class constructor).
    :param tokenizer: optional callable used to tokenize each document.
    :param k1: term-frequency saturation parameter.
    :param b: document-length normalisation parameter.
    :param delta: constant added to the term-frequency component.
    """

    def __init__(self, corpus, tokenizer=None, k1=1.5, b=0.75, delta=1):
        # Algorithm specific parameters
        self.k1 = k1
        self.b = b
        self.delta = delta
        super().__init__(corpus, tokenizer)

    def _calc_idf(self, nd):
        """Set ``idf[word] = log(corpus_size + 1) - log(df)``.

        :param nd: dict mapping term -> number of documents containing it.
        """
        for word, freq in nd.items():
            idf = math.log(self.corpus_size + 1) - math.log(freq)
            self.idf[word] = idf

    def _score(self, query, doc_freqs, doc_len):
        """Accumulate the score of every document in ``doc_freqs``.

        :param query: iterable of query terms.
        :param doc_freqs: list of term -> frequency dicts, one per document.
        :param doc_len: numpy array of document lengths aligned with
            ``doc_freqs``.
        :return: numpy array holding one score per document.
        """
        score = np.zeros(len(doc_freqs))
        # Terms absent from the index contribute nothing, so they are skipped
        # rather than scanned for in every document. This also avoids
        # dividing by avgdl when it is 0, which can only happen when the
        # vocabulary is empty.
        terms = [q for q in query if q in self.idf]
        if not terms:
            return score

        # The length normalisation does not depend on the term; compute once.
        norm = self.k1 * (1 - self.b + self.b * doc_len / self.avgdl)
        for q in terms:
            q_freq = np.array([doc.get(q, 0) for doc in doc_freqs])
            # NOTE(review): documents that do not contain the term still
            # receive idf * delta here; the published BM25+ formula appears
            # to sum only over terms present in the document - confirm which
            # is intended.
            score += self.idf[q] * (self.delta + (q_freq * (self.k1 + 1)) /
                                    (norm + q_freq))
        return score

    def get_scores(self, query):
        """Score every indexed document against ``query``.

        :param query: iterable of query terms.
        :return: numpy array with one score per indexed document.
        """
        return self._score(query, self.doc_freqs, np.array(self.doc_len))

    def get_batch_scores(self, query, doc_ids):
        """
        Calculate bm25 scores between query and subset of all docs

        :param query: iterable of query terms.
        :param doc_ids: indices of the documents to score.
        :return: list of scores in the order of ``doc_ids``.
        :raises AssertionError: if an id is not below the number of indexed
            documents.
        """
        assert all(di < len(self.doc_freqs) for di in doc_ids)
        doc_len = np.array(self.doc_len)[doc_ids]
        subset = [self.doc_freqs[di] for di in doc_ids]
        return self._score(query, subset, doc_len).tolist()