victormiller
committed on
Commit
•
dc5ac06
1
Parent(s):
d95e4d8
Update common.py
Browse files
common.py
CHANGED
@@ -29,6 +29,53 @@ fig = px.bar(
|
|
29 |
|
30 |
dup_cluster_graph = fig.update_layout(showlegend=False)
|
31 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
32 |
dup_docs_count = {
|
33 |
"80": 382164413,
|
34 |
"90": 660766607,
|
@@ -224,7 +271,7 @@ global_div = Div(
|
|
224 |
H3("Finding Duplicate Pairs"),
|
225 |
P("Multiple bands can create the same document pairs, leading to duplicates. The simplest way to eliminate these duplicate pairs is to call distinct() before the compute(). However, we found that Dask is not very efficient when it comes to distributed distinct execution. Additionally, since we process each band separately, this approach wouldn’t remove duplicates across different bands."),
|
226 |
P("To address this, we use a Bloom filter with a capacity of 64 billion and a false positive rate of 0.001 to remove duplicates. One way we parallelize the Bloom filter execution is by partitioning pairs horizontally and running one filter per partition, as shown in the table below. There is a high chance that duplicates from different bands will have the same pairs in the same horizontal partition. This step reduces the number of pairs by nearly ninefold."),
|
227 |
-
|
228 |
P("The resulting unique pairs are then used to identify clusters of near-duplicates by finding connected components in a graph, where the vertices represent documents and the edges represent matches. This step produced 1.9 TB of unique pairs."),
|
229 |
),
|
230 |
Section(
|
|
|
29 |
|
30 |
dup_cluster_graph = fig.update_layout(showlegend=False)
|
31 |
|
32 |
+
|
33 |
+
bloom_filter_table_info = pd.DataFrame(
|
34 |
+
{
|
35 |
+
"Bloom Filter": [
|
36 |
+
"BF 0",
|
37 |
+
"BF 8 ",
|
38 |
+
],
|
39 |
+
"Band 0": [
|
40 |
+
"""
|
41 |
+
(A,B)
|
42 |
+
(C,D)
|
43 |
+
(E,K)
|
44 |
+
""",
|
45 |
+
"(B,K)",
|
46 |
+
],
|
47 |
+
"Band 1": [
|
48 |
+
"""
|
49 |
+
(A,B)
|
50 |
+
(C,D)
|
51 |
+
(F,K)
|
52 |
+
""",
|
53 |
+
"(B,K)",
|
54 |
+
],
|
55 |
+
"....": [
|
56 |
+
"...",
|
57 |
+
"...",
|
58 |
+
],
|
59 |
+
"Band 8": [
|
60 |
+
"""
|
61 |
+
(A,B)
|
62 |
+
(C,D)
|
63 |
+
(D,E)
|
64 |
+
""",
|
65 |
+
"""
|
66 |
+
(E,K)
|
67 |
+
(B,K)
|
68 |
+
""",
|
69 |
+
],
|
70 |
+
|
71 |
+
}
|
72 |
+
)
|
73 |
+
|
74 |
+
table_html_bloom_filter = bloom_filter_table_info.to_html(index=False, border=0)
|
75 |
+
table_div_bloom_examples = Div(NotStr(table_html_bloom_filter), style="margin: 40px;")
|
76 |
+
|
77 |
+
|
78 |
+
|
79 |
dup_docs_count = {
|
80 |
"80": 382164413,
|
81 |
"90": 660766607,
|
|
|
271 |
H3("Finding Duplicate Pairs"),
|
272 |
P("Multiple bands can create the same document pairs, leading to duplicates. The simplest way to eliminate these duplicate pairs is to call distinct() before the compute(). However, we found that Dask is not very efficient when it comes to distributed distinct execution. Additionally, since we process each band separately, this approach wouldn’t remove duplicates across different bands."),
|
273 |
P("To address this, we use a Bloom filter with a capacity of 64 billion and a false positive rate of 0.001 to remove duplicates. One way we parallelize the Bloom filter execution is by partitioning pairs horizontally and running one filter per partition, as shown in the table below. There is a high chance that duplicates from different bands will have the same pairs in the same horizontal partition. This step reduces the number of pairs by nearly ninefold."),
|
274 |
+
table_div_bloom_examples,
|
275 |
P("The resulting unique pairs are then used to identify clusters of near-duplicates by finding connected components in a graph, where the vertices represent documents and the edges represent matches. This step produced 1.9 TB of unique pairs."),
|
276 |
),
|
277 |
Section(
|