{"data":{"1":{"x":["1B","10B","100B","350B","1T"],"y":[0.994974,0.9515081,0.60887281,0.1741474885714285,0.006232416],"label":"1"},"2":{"x":["1B","10B","100B","350B","1T"],"y":[0.005008,0.047331,0.30282154,0.3071204342857143,0.032470074],"label":"2"},"3":{"x":["1B","10B","100B","350B","1T"],"y":[0.000018,0.0011439,0.0745482,0.2680183371428571,0.083742993],"label":"3"},"4-8":{"x":["1B","10B","100B","350B","1T"],"y":[0,0.000017,0.01375745,0.25064894285714273,0.8176358810000001],"label":"4-8"},"8-16":{"x":["1B","10B","100B","350B","1T"],"y":[0,0,0,0.00006479714285714286,0.05991048400000001],"label":"8-16"},"16-32":{"x":["1B","10B","100B","350B","1T"],"y":[0,0,0,0,0.000008152000000000001],"label":"16-32"}},"layout":{"title":{"text":"Sampling from 1000 identical buckets with 200B tokens each"},"xaxis":{"title":{"text":"Sample size"}},"yaxis":{"title":{"text":"Dataset fraction"}},"barmode":"stack","legend":{"title":{"text":"# duplicates","font":{"size":14,"weight":"bold"}},"font":{"size":14},"bgcolor":"white 0.9","orientation":"v","xanchor":"left","yanchor":"bottom","x":0.01,"y":0}}}