stats
Browse files
data/plots/stats/index.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"files": {"lines_ended_with_punct": {"file": "lines_ended_with_punct.json"}, "lines_chars": {"file": "lines_chars.json"}, "short_lines": {"file": "short_lines.json"}}, "settings": {"defaultMetric": "lines_ended_with_punct", "autoSetXRange": false, "slider": null}}
|
data/plots/stats/lines_chars.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"data": {"dedup_minhash_CC-MAIN-2013-48_output": {"x": [0.0, 0.001, 0.002, 0.003, 0.004, 0.005, 0.006, 0.007, 0.008, 0.009, 0.01, 0.011, 0.012, 0.013, 0.014, 0.015, 0.016, 0.017, 0.018, 0.019, 0.02, 0.021, 0.022, 0.023, 0.024, 0.025, 0.026, 0.027, 0.028, 0.029, 0.03, 0.031, 0.032, 0.033, 0.034, 0.035, 0.036, 0.037, 0.038, 0.039, 0.04, 0.041, 0.042, 0.043, 0.044, 0.045, 0.046, 0.047, 0.048, 0.049, 0.05, 0.051, 0.052, 0.053, 0.054, 0.055, 0.056, 0.057, 0.058, 0.059, 0.06, 0.061, 0.062, 0.063, 0.064, 0.065, 0.066, 0.067, 0.068, 0.069, 0.07, 0.071, 0.072, 0.073, 0.074, 0.075, 0.076, 0.077, 0.078, 0.079, 0.08, 0.081, 0.082, 0.083, 0.084, 0.085, 0.086, 0.087, 0.088, 0.089, 0.09, 0.091, 0.092, 0.093, 0.094, 0.095, 0.096, 0.097, 0.098, 0.099, 0.1, 0.101, 0.102, 0.103, 0.104, 0.105, 0.106, 0.107, 0.108, 0.109, 0.11, 0.111, 0.112, 0.113, 0.114, 0.115, 0.116, 0.117, 0.118, 0.119, 0.12, 0.121, 0.122, 0.123, 0.124, 0.125, 0.126, 0.127, 0.128, 0.129, 0.13, 0.131, 0.132, 0.133, 0.134, 0.135, 0.136, 0.137, 0.138, 0.139, 0.14, 0.141, 0.142, 0.143, 0.144, 0.145, 0.146, 0.147, 0.148, 0.149, 0.15, 0.151, 0.152, 0.153, 0.154, 0.155, 0.156, 0.157, 0.158, 0.159, 0.16, 0.161, 0.162, 0.163, 0.164, 0.165, 0.166, 0.167, 0.168, 0.169, 0.17, 0.171, 0.172, 0.173, 0.174, 0.175, 0.176, 0.177, 0.178, 0.179, 0.18, 0.181, 0.182, 0.183, 0.184, 0.185, 0.186, 0.187, 0.188, 0.189, 0.19, 0.191, 0.192, 0.193, 0.194, 0.195, 0.196, 0.197, 0.198, 0.199, 0.2, 0.201, 0.202, 0.203, 0.204, 0.205, 0.206, 0.207, 0.208, 0.209, 0.21, 0.211, 0.212, 0.213, 0.214], "y": [0.6466472940022213, 0.03231325980506087, 0.026914735455119968, 0.020298687734401643, 0.013574441621834098, 0.012193887306090957, 0.010018975523319932, 0.00887587850550379, 0.008118828235109352, 0.007613864798418536, 0.0065650511898363165, 0.005891302774828053, 0.005419509648047217, 0.0052656926780564935, 0.0050729137567429375, 0.004870469563720918, 0.004832523030048389, 0.004708642076711118, 0.004558435480588148, 0.004339932645466852, 0.004172501557349811, 0.004081527657494663, 0.0038154882331146197, 0.003583145631500271, 0.003555127625964064, 0.0034263200166197542, 0.0033463652786734762, 0.003295744828402691, 0.0031896148799152044, 0.003146704082174516, 0.0030041695305205213, 0.002920341162278633, 0.002917859030244553, 0.0028979267578496675, 0.0028639666786561165, 0.0027759638156296396, 0.0026954073487054026, 0.002629217161129933, 0.0025760393627028222, 0.002537641532296825, 0.00243339198686546, 0.00234411044976082, 0.002249225311548939, 0.0022023656673903964, 0.0021078190017286167, 0.0020784094979308795, 0.002033505472950703, 0.001921508666928118, 0.001820831887303383, 0.0017377932883450662, 0.0016942807616264193, 0.0016616745726332758, 0.0015997717040257624, 0.0015408022641857982, 0.0015168835373119353, 0.001494356308699602, 0.0014717914720261463, 0.0014365903268155555, 0.0014339953705981082, 0.001398418144776293, 0.0014069927827122062, 0.0013781473998313054, 0.0014019909105829234, 0.0013076698932878788, 0.0013129350218450183, 0.001289505199765747, 0.0012632171650411711, 0.0012389975736783287, 0.0012368539141943504, 0.0012112052165088557, 0.001196575680732232, 0.00115810263420399, 0.0011600958614434787, 0.0011372301602810436, 0.0011091369386225913, 0.0010831121603258724, 0.001050242714904872, 0.0010321156294438626, 0.001003646327174186, 0.000986083362630013, 0.0009655117531960458, 0.0009480992208963626, 0.0009355757365425946, 0.000898531796337005, 0.0008740113404851831, 0.0008746130694631419, 0.0008356511181403085, 0.0008392238839469389, 0.0008216233113416435, 0.0008028568888415529, 0.0007880017046981945, 0.0007720558867822858, 0.0007540040174435213, 0.0007247449458902738, 0.0007236919201788459, 0.0007242936491568047, 0.00070375964778396, 0.0006917250682247837, 0.0006850684414061143, 0.0006729586457246931, 0.0006596077840262317, 0.0006464073545722602, 0.0006236920856543148, 0.0006224134115761523, 0.0006058658646822849, 0.000602970043975858, 0.0006033461245870823, 0.0005822856103585237, 0.0005735229321169984, 0.0005659261037702683, 0.0005582540593012934, 0.0005454297104585461, 0.000532981442227023, 0.0005288069474424337, 0.0005057532059743865, 0.0004960127181436781, 0.0004844670433790933, 0.0004760804457487923, 0.0004731846250423655, 0.0004521993269360517, 0.0004409169085993239, 0.0004284686403678008, 0.0004106048113346484, 0.000406505532672304, 0.0003895442971060898, 0.00038130813172027847, 0.00037656951601885275, 0.00035490727281233534, 0.0003407666418303031, 0.0003345613117451028, 0.00031692313107868495, 0.0003125982040496059, 0.0003018799066297145, 0.0002905222721707418, 0.00027882616516166725, 0.00026664115335800124, 0.00025460657379882486, 0.00024253438617852607, 0.00023486234170955113, 0.00021929260440486673, 0.000221436263888845, 0.00020936407626854622, 0.00019789361762620627, 0.0001880026975510082, 0.00018345212215519461, 0.00017183123126836495, 0.00016840889770622418, 0.00015968382752582131, 0.0001462577497051152, 0.00014283541614297442, 0.00013520097973512192, 0.00013369665729022487, 0.00012418181782625107, 0.0001203457955917636, 0.00011444132999554269, 0.00011158311735023831, 0.0001056786517540174, 9.969897003555165e-05, 9.729205412371638e-05, 8.886784843229293e-05, 8.333946344729628e-05, 8.473096170882606e-05, 7.468960938913827e-05, 7.28468143941394e-05, 7.047750654342656e-05, 6.288067819669648e-05, 6.021050585700422e-05, 6.066180259047334e-05, 5.78788060674138e-05, 5.547189015557853e-05, 4.892808752027639e-05, 4.8476790786807274e-05, 4.7799845686603605e-05, 4.712290058639993e-05, 4.253471712946394e-05, 4.054148988997536e-05, 4.035344958436323e-05, 3.70063321444673e-05, 3.396007919355079e-05, 3.4035295315795644e-05, 3.3283134093347116e-05, 3.1741203587327647e-05, 2.918385543100267e-05, 2.775474910835048e-05, 2.609999441896373e-05, 2.493414452416852e-05, 2.4294807485087274e-05, 2.211353993998656e-05, 2.437002360733213e-05, 2.0195528822742827e-05, 2.1286162595293186e-05, 2.0120312700497976e-05, 1.8315125766621523e-05, 1.966901596702886e-05, 1.7600572605295425e-05, 1.7901437094274834e-05, 1.5494521182439563e-05, 1.7111667810703888e-05, 1.6246682404888087e-05, 1.4817576082235892e-05, 1.3990198737542517e-05, 1.4742359959991041e-05, 1.3463685881828553e-05, 1.1733715070196951e-05, 1.00789603808102e-05, 1.0567865175401741e-05, 8.574637935913157e-06, 7.972908957954337e-06, 6.167722024077884e-06, 4.776223762548117e-06, 3.6103738677529076e-06, 1.9556191783661582e-06, 7.145531613260964e-07, 1.504322444897045e-07, 3.7608061122426123e-08], "label": "dedup_minhash_CC-MAIN-2013-48_output"}, "dedup_minhash_independent_output_CC-MAIN-2013-48": {"x": [0.0, 0.001, 0.002, 0.003, 0.004, 0.005, 0.006, 0.007, 0.008, 0.009, 0.01, 0.011, 0.012, 0.013, 0.014, 0.015, 0.016, 0.017, 0.018, 0.019, 0.02, 0.021, 0.022, 0.023, 0.024, 0.025, 0.026, 0.027, 0.028, 0.029, 0.03, 0.031, 0.032, 0.033, 0.034, 0.035, 0.036, 0.037, 0.038, 0.039, 0.04, 0.041, 0.042, 0.043, 0.044, 0.045, 0.046, 0.047, 0.048, 0.049, 0.05, 0.051, 0.052, 0.053, 0.054, 0.055, 0.056, 0.057, 0.058, 0.059, 0.06, 0.061, 0.062, 0.063, 0.064, 0.065, 0.066, 0.067, 0.068, 0.069, 0.07, 0.071, 0.072, 0.073, 0.074, 0.075, 0.076, 0.077, 0.078, 0.079, 0.08, 0.081, 0.082, 0.083, 0.084, 0.085, 0.086, 0.087, 0.088, 0.089, 0.09, 0.091, 0.092, 0.093, 0.094, 0.095, 0.096, 0.097, 0.098, 0.099, 0.1, 0.101, 0.102, 0.103, 0.104, 0.105, 0.106, 0.107, 0.108, 0.109, 0.11, 0.111, 0.112, 0.113, 0.114, 0.115, 0.116, 0.117, 0.118, 0.119, 0.12, 0.121, 0.122, 0.123, 0.124, 0.125, 0.126, 0.127, 0.128, 0.129, 0.13, 0.131, 0.132, 0.133, 0.134, 0.135, 0.136, 0.137, 0.138, 0.139, 0.14, 0.141, 0.142, 0.143, 0.144, 0.145, 0.146, 0.147, 0.148, 0.149, 0.15, 0.151, 0.152, 0.153, 0.154, 0.155, 0.156, 0.157, 0.158, 0.159, 0.16, 0.161, 0.162, 0.163, 0.164, 0.165, 0.166, 0.167, 0.168, 0.169, 0.17, 0.171, 0.172, 0.173, 0.174, 0.175, 0.176, 0.177, 0.178, 0.179, 0.18, 0.181, 0.182, 0.183, 0.184, 0.185, 0.186, 0.187, 0.188, 0.189, 0.19, 0.191, 0.192, 0.193, 0.194, 0.195, 0.196, 0.197, 0.198, 0.199, 0.2, 0.201, 0.202, 0.203, 0.204, 0.205, 0.206, 0.207, 0.208, 0.209, 0.21, 0.211, 0.212, 0.213, 0.214], "y": [0.7461295023573521, 0.03923197491864997, 0.027093721163589636, 0.01896118967684044, 0.011304646971105552, 0.008925572241091017, 0.007504260519578182, 0.006426802599126767, 0.005721209004034303, 0.00529004385232618, 0.004656354121909244, 0.0041525653199315235, 0.003686809717127249, 0.003527193866427187, 0.0033563126921369183, 0.0032123963159037387, 0.0031380682676708247, 0.0030132465809708953, 0.002994769820579724, 0.0027932638063908216, 0.0026722787127158873, 0.002596436398563373, 0.0024877148217081575, 0.0023661661223322063, 0.0022732255051054664, 0.0021510045531463744, 0.0020497931920179634, 0.0019845914818834446, 0.0019045187297582371, 0.0018441789675987565, 0.0017506543733806013, 0.0016874490497051065, 0.0016672746817807517, 0.0016045175264940176, 0.00154860512478108, 0.0014927063039284077, 0.0014655513738276017, 0.0014330523752123334, 0.0013801073914674008, 0.0013461959833845237, 0.0012873908584350412, 0.0012197581743130033, 0.001178479149536109, 0.0011417293416577487, 0.001095520464604489, 0.0010757467320579657, 0.0010408031785949128, 0.001003632364048323, 0.0009649133314314697, 0.0009376904970293362, 0.0009140054767263576, 0.0008853702328566153, 0.0008646118879408452, 0.0008380613061218526, 0.0008181585554028071, 0.0008032671421217199, 0.0007854830056040876, 0.0007711755693144156, 0.0007587422917413784, 0.000742547115874806, 0.0007318793501362798, 0.0007214696207427975, 0.0007262976165671719, 0.0006779565444522335, 0.0006752267915388742, 0.0006675400246286185, 0.0006498713254232427, 0.0006409758619493603, 0.0006276666188892002, 0.0006168155115370902, 0.0006040019698766198, 0.0005926959037056165, 0.0005832639962512479, 0.0005728338955673675, 0.0005632254369295479, 0.0005515255258108459, 0.000537523658877147, 0.000524750859797473, 0.0005169826077256245, 0.0005085081509199716, 0.0005001627122868406, 0.0004890060355787574, 0.00047632151209080903, 0.00046869585905174796, 0.00045600454513366676, 0.0004561675154568524, 0.0004403050706667841, 0.00043439739645130494, 0.00042622850900162504, 0.000418575694242033, 0.00041155438948478536, 0.0004023533566549298, 0.0003971043541623259, 0.00038764528498742645, 0.0003829666786259722, 0.0003800196319483654, 0.0003718100019178891, 0.00036168547058998163, 0.00035721736756264217, 0.00035247764732999334, 0.0003467872668787617, 0.000341327761052043, 0.0003342928754345298, 0.00033044949197940195, 0.00032412760152582596, 0.00031802300483649743, 0.00031472285579198835, 0.00031593155235561517, 0.00030959608104177364, 0.00029647017959519745, 0.00029466392517989, 0.00028797535149914635, 0.0002833171164280903, 0.00028307945137344463, 0.0002725271229471749, 0.000268147295511561, 0.0002610716673132514, 0.00025832154310949384, 0.00025168050243967933, 0.0002442857240251312, 0.00023742738959106916, 0.00023062337859806897, 0.0002202951343661795, 0.00021643817005078617, 0.00020936933228260933, 0.00020107821709054025, 0.00019655579062213894, 0.00018682510424193008, 0.00017733208291636696, 0.00017215098472509038, 0.00016502782351585167, 0.0001593985569358146, 0.00015193587421993916, 0.00014672761430813163, 0.0001387963919130975, 0.000131795458446248, 0.00012521553164762808, 0.00011750839344697417, 0.00011457492762963278, 0.00010679988512765153, 0.00010390716189110654, 9.622039498085085e-05, 9.321902486218211e-05, 8.896142516895745e-05, 8.500939483170585e-05, 7.946840384339432e-05, 7.726830448038827e-05, 7.346566360605684e-05, 6.687215594717143e-05, 6.544616561929714e-05, 6.262813711421223e-05, 5.8655735486562434e-05, 5.768470397758137e-05, 5.310116363798544e-05, 5.166159244984568e-05, 4.883677351462804e-05, 4.531933070587146e-05, 4.537365414693334e-05, 4.273217682529954e-05, 3.8365930249951115e-05, 3.687882605088221e-05, 3.46855171180089e-05, 3.342928754345298e-05, 3.10662178572613e-05, 2.9504418926732318e-05, 2.898834623664448e-05, 2.6808618164036636e-05, 2.659132439978913e-05, 2.4975202028198265e-05, 2.3705391593376877e-05, 2.228619169563532e-05, 2.0819458786964625e-05, 2.0894153518424705e-05, 1.948853448094862e-05, 1.8598988133560372e-05, 1.716620737555335e-05, 1.712546479475694e-05, 1.5733426617546326e-05, 1.5848863929802813e-05, 1.4477197042990403e-05, 1.3974705213168034e-05, 1.3064787575381583e-05, 1.2813541660470399e-05, 1.2039432625338642e-05, 1.112951498755219e-05, 1.0844316921977332e-05, 1.0891849932906476e-05, 9.526973476226799e-06, 9.954770574589085e-06, 9.004110356006226e-06, 9.167080679191859e-06, 8.644217558971286e-06, 7.591700888397407e-06, 7.897270244370468e-06, 7.449101855609978e-06, 7.40835927481357e-06, 6.681783250610956e-06, 6.641040669814547e-06, 6.349052174106954e-06, 5.961997656541076e-06, 5.724332601895361e-06, 5.7854464730899736e-06, 5.432344106187769e-06, 4.868738405170787e-06, 4.291551843888337e-06, 3.911287756455193e-06, 3.823012164729642e-06, 2.790866784553966e-06, 2.0778716206168217e-06, 1.5685893606617183e-06, 1.0932592513702884e-06, 5.703961311497157e-07, 2.512459149111843e-07, 8.148516159281653e-08, 2.7161720530938842e-08], "label": "dedup_minhash_independent_output_CC-MAIN-2013-48"}}, "layout": {"title": {"text": "Histograms of selected metrics"}, "xaxis": {"title": {"text": "Fraction of chars in duplicated lines"}, "range": [0, 0.05]}, "yaxis": {"range": [0, 0.015]}, "shapes": [{"type": "rect", "x0": 0.01, "y0": 0.0, "x1": 1.0, "y1": 1.0, "xref": "x", "yref": "y", "line": {"color": "rgba(255, 0, 0, 1)", "width": 1, "dash": "dashdot"}, "fillcolor": "rgba(255, 0, 0, 0.2)"}], "annotations": [{"text": "Filtered out", "x": 0.03, "y": 0.007, "xref": "x", "yref": "y", "font": {"size": 10}, "showarrow": false}], "legend": {"xanchor": "right", "yanchor": "top", "x": 1, "y": 1}}}
|
data/plots/stats/lines_ended_with_punct.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"data": {"dedup_minhash_CC-MAIN-2013-48_output": {"x": [0.0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2, 0.21, 0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3, 0.31, 0.32, 0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4, 0.41, 0.42, 0.43, 0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5, 0.51, 0.52, 0.53, 0.54, 0.55, 0.56, 0.57, 0.58, 0.59, 0.6, 0.61, 0.62, 0.63, 0.64, 0.65, 0.66, 0.67, 0.68, 0.69, 0.7, 0.71, 0.72, 0.73, 0.74, 0.75, 0.76, 0.77, 0.78, 0.79, 0.8, 0.81, 0.82, 0.83, 0.84, 0.85, 0.86, 0.87, 0.88, 0.89, 0.9, 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98, 0.99, 1.0], "y": [0.09923783023978446, 0.0023246097068606976, 0.005683604176580645, 0.010645583805882734, 0.014533319649527074, 0.01273737632313301, 0.016450414906929556, 0.018963070709119012, 0.018166602690997746, 0.015815707777192788, 0.021298142003336983, 0.015874090184057568, 0.022973458352926936, 0.010206609600894942, 0.021533996427986615, 0.012769286039409137, 0.008821142965540544, 0.021334551327078604, 0.014339424064493175, 0.010260792374125262, 0.02374217879167766, 0.011738500962522281, 0.01213997843863096, 0.012267654800464345, 0.006444150328435723, 0.024761639856443023, 0.006712626907210182, 0.011331923931286044, 0.005890698610179995, 0.01786756627818681, 0.009625560290184186, 0.008643970921586728, 0.00607720733962471, 0.03398538519995862, 0.0027106385306711806, 0.006115491499810285, 0.008968167639475551, 0.003539953683290554, 0.009910497932936576, 0.007056059447013991, 0.016438790920977032, 0.004623646644625839, 0.0060941183643491905, 0.01184229190805967, 0.007977729042862623, 0.0025024191952054582, 0.007618735360573982, 0.0053696440656758205, 0.0037428109865265628, 0.0014390869576515319, 0.046351093946441844, 0.0013059360734017965, 0.0031416259324077217, 0.004219544397495624, 0.003321685224485229, 0.004613710011472874, 0.006987177956063058, 0.007918296727589227, 0.003943193505656551, 0.002411902091691276, 0.013621774170475803, 0.0038953476796068005, 0.006052834465853285, 0.0014426116501661686, 0.004535004377555613, 0.0024094648043141336, 0.0008326898582184936, 0.024016504860157262, 0.0018992093177271447, 0.0031317267959836354, 0.0037732583303764036, 0.007510294820655583, 0.0013834418119949262, 0.003858600885305268, 0.0014603850996548688, 0.015595264508112473, 0.0011334136238289913, 0.0028665124326216593, 0.00389324786278957, 0.00220597005668718, 0.010919909874362334, 0.0017753576222394299, 0.003224493703230564, 0.00881323115574705, 0.0009990253475262452, 0.002172335490882614, 0.006386667843064041, 0.0012925497411919527, 0.005112041538276305, 0.003495745039942078, 0.0032573408377286686, 0.002969103482834916, 0.002916308088573123, 0.0024018154716227945, 0.00203787222112211, 0.0015711129400348936, 0.0009821893162595228, 0.0005898235452684697, 0.00020630700229288748, 6.250704704148372e-05, 0.07793380125001345], "label": "dedup_minhash_CC-MAIN-2013-48_output"}, "dedup_minhash_independent_output_CC-MAIN-2013-48": {"x": [0.0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2, 0.21, 0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3, 0.31, 0.32, 0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4, 0.41, 0.42, 0.43, 0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5, 0.51, 0.52, 0.53, 0.54, 0.55, 0.56, 0.57, 0.58, 0.59, 0.6, 0.61, 0.62, 0.63, 0.64, 0.65, 0.66, 0.67, 0.68, 0.69, 0.7, 0.71, 0.72, 0.73, 0.74, 0.75, 0.76, 0.77, 0.78, 0.79, 0.8, 0.81, 0.82, 0.83, 0.84, 0.85, 0.86, 0.87, 0.88, 0.89, 0.9, 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98, 0.99, 1.0], "y": [0.06067509358546734, 0.0009799804812990203, 0.0026737044044083383, 0.005808853255240398, 0.006741240415215013, 0.008724732875685022, 0.009743229952027291, 0.009872445819423053, 0.01106695659611478, 0.009423368837497772, 0.011172627367546767, 0.012111283247844397, 0.014611088455179262, 0.006845361004181006, 0.014111630543109146, 0.008897054694108885, 0.006074457983460097, 0.016450345955852708, 0.009697472373096828, 0.007471016007265733, 0.01825137922045675, 0.008258270733384525, 0.009280860835360696, 0.00699831913987174, 0.006388784674624179, 0.022580052986324536, 0.005147598447805017, 0.008542253282681348, 0.004359446247626963, 0.013727226085817829, 0.00781696506092693, 0.0068599653547816945, 0.0047715840131586015, 0.03250753708233631, 0.002092073223548686, 0.005294777394477788, 0.007839306725765787, 0.0030599514031395, 0.012102390095802636, 0.0037830435330670916, 0.016230424017058535, 0.004138361671983294, 0.0060345203878230174, 0.01177132415362417, 0.00864272958304307, 0.006216245286731404, 0.004476675397094876, 0.005461075258231163, 0.0037623880316122998, 0.0014248420547641391, 0.05676415319990976, 0.001335210232618477, 0.00344726585207751, 0.005088528336995526, 0.004190462760741711, 0.005782765316108721, 0.008691743904961304, 0.011070322123837005, 0.0052469801017762776, 0.0033493119977459802, 0.018287774513178114, 0.0029657234259630816, 0.011583323735346866, 0.00218993869028347, 0.006901092783451883, 0.0036858851671062705, 0.001235665401545532, 0.035913621113376405, 0.003116893412581199, 0.004969259473756566, 0.00641501539371983, 0.01285602353992869, 0.0024502877560197735, 0.006510406007745745, 0.0026760160800155235, 0.025953331620880305, 0.0032719932477315323, 0.003991243916723654, 0.0074002583365481486, 0.004290680697359101, 0.019380612357429106, 0.003359592155152052, 0.005958398269887583, 0.015279475461767401, 0.0019716553215520373, 0.004173512739304319, 0.011743155706446027, 0.002714077138983241, 0.010242885036428609, 0.007541909658901386, 0.006095093087777178, 0.00498867074981102, 0.006088328037103209, 0.004421956675663617, 0.00398499559353835, 0.0029186196359437273, 0.0019253266227656806, 0.0010902746039453775, 0.0003822559587858155, 0.00011331969807340876, 0.12302239211375945], "label": "dedup_minhash_independent_output_CC-MAIN-2013-48"}}, "layout": {"title": {"text": "Histograms of selected metrics"}, "xaxis": {"title": {"text": "Fraction of lines ended with punctuation"}, "range": [0, 1.0]}, "yaxis": {"title": {"text": "Document Frequency"}, "range": [0, 0.15]}, "shapes": [{"type": "rect", "x0": 0.0, "y0": 0.0, "x1": 0.12, "y1": 0.15, "xref": "x", "yref": "y", "line": {"color": "rgba(255, 0, 0, 1)", "width": 1, "dash": "dashdot"}, "fillcolor": "rgba(255, 0, 0, 0.2)"}], "annotations": [{"text": "Filtered out", "x": 0.065, "y": 0.075, "xref": "x", "yref": "y", "font": {"size": 10}, "showarrow": false}], "legend": {"xanchor": "right", "yanchor": "top", "x": 1, "y": 1}}}
|
data/plots/stats/short_lines.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"data": {"dedup_minhash_CC-MAIN-2013-48_output": {"x": [0.0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2, 0.21, 0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3, 0.31, 0.32, 0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4, 0.41, 0.42, 0.43, 0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5, 0.51, 0.52, 0.53, 0.54, 0.55, 0.56, 0.57, 0.58, 0.59, 0.6, 0.61, 0.62, 0.63, 0.64, 0.65, 0.66, 0.67, 0.68, 0.69, 0.7, 0.71, 0.72, 0.73, 0.74, 0.75, 0.76, 0.77, 0.78, 0.79, 0.8, 0.81, 0.82, 0.83, 0.84, 0.85, 0.86, 0.87, 0.88, 0.89, 0.9, 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98, 0.99, 1.0], "y": [0.20005652256911238, 0.0003341708477706676, 0.0010070496475063757, 0.0025293793432695417, 0.004265927851119088, 0.0041117787986970486, 0.005760097503494039, 0.006554990659002385, 0.008228619655792777, 0.007417002959204347, 0.009123479090492843, 0.00821032125209977, 0.012847391725664332, 0.004227081240000326, 0.014260605940389276, 0.006264803474206919, 0.003883348726365484, 0.016472125512950565, 0.008186810803092566, 0.004969291491433516, 0.020817508932564496, 0.006430389028937088, 0.009028199902411013, 0.007840903479182735, 0.003843864670855776, 0.025294655857459634, 0.004751997947579048, 0.008740300018077173, 0.004562152009264092, 0.01632960044648105, 0.009635159452777238, 0.008322361477990564, 0.0052592536958557074, 0.03763467934466067, 0.0029662912281689826, 0.006354870616974552, 0.009465974212074674, 0.004233643167554171, 0.011688555319655302, 0.008870638647660975, 0.01938502139919569, 0.00717931119483964, 0.008553603804988062, 0.015333012386331947, 0.011541043188244865, 0.004116690870180213, 0.011519632556054892, 0.008228057204859592, 0.006390455012680831, 0.0030878181264661925, 0.04465354203662658, 0.0030727444414567883, 0.00614410150394505, 0.008267953724386969, 0.006652444657359204, 0.008488134516365129, 0.011559716559226664, 0.010805507354552157, 0.007818705415686298, 0.006199671656143897, 0.01498909238905271, 0.008111967332249852, 0.009845853572348982, 0.0042120450517198005, 0.01031887480715901, 0.0065605776716053735, 0.003250478936343791, 0.02296862127490153, 0.006068808072355788, 0.007736400095796643, 0.00790033579445613, 0.011510295870563991, 0.005311149168624402, 0.008055497258557906, 0.005195471760032334, 0.011810457185241303, 0.004744348614887709, 0.007779033876532196, 0.007478722574939369, 0.006945894057567153, 0.009251567916343899, 0.0063061623661605825, 0.006096443161539695, 0.007190410226587858, 0.004318910729025276, 0.005255654009883313, 0.005824291903335083, 0.003628108492885352, 0.006369831811797318, 0.004534891887368975, 0.004269490040362604, 0.004108929047302236, 0.0033001621021086173, 0.002574150437551204, 0.0022865130303195173, 0.0019165703031981747, 0.0013306089210042538, 0.0005063933235124401, 0.00019318314718519744, 1.9948259763688864e-05, 0.00014488736038889807], "label": "dedup_minhash_CC-MAIN-2013-48_output"}, "dedup_minhash_independent_output_CC-MAIN-2013-48": {"x": [0.0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2, 0.21, 0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3, 0.31, 0.32, 0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4, 0.41, 0.42, 0.43, 0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5, 0.51, 0.52, 0.53, 0.54, 0.55, 0.56, 0.57, 0.58, 0.59, 0.6, 0.61, 0.62, 0.63, 0.64, 0.65, 0.66, 0.67, 0.68, 0.69, 0.7, 0.71, 0.72, 0.73, 0.74, 0.75, 0.76, 0.77, 0.78, 0.79, 0.8, 0.81, 0.82, 0.83, 0.84, 0.85, 0.86, 0.87, 0.88, 0.89, 0.9, 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98, 0.99, 1.0], "y": [0.2652386539901629, 0.00033649837985535167, 0.0012706804879484871, 0.0035209063182581714, 0.005307981141622359, 0.00740548680284793, 0.009500932353164746, 0.010030809196607059, 0.012873163934651379, 0.010554886453893698, 0.012086092782771979, 0.013970788307218424, 0.01771782409318065, 0.005777190098467862, 0.018364590133795704, 0.007956603865692736, 0.004664261073572068, 0.021268476237266576, 0.009584145875977515, 0.006537194248605481, 0.024689137422723745, 0.0074713423614691, 0.010422114685490414, 0.006912637563396004, 0.005894052099456987, 0.029571641070751824, 0.004882530844211693, 0.009071321039661146, 0.0043682442130260745, 0.015606679545872687, 0.00869906647739936, 0.007355799375385251, 0.005053037317379327, 0.03819209697941507, 0.002454632346352101, 0.005805297354232875, 0.008761964451052512, 0.003672742611374831, 0.013766748939654795, 0.004717538397271788, 0.01802941076884563, 0.005255002975942392, 0.0071109997276302215, 0.012823075363480393, 0.009749743438002832, 0.007349306286547421, 0.005530860665384548, 0.006456373989950738, 0.005038147406850692, 0.0020644215038592064, 0.045923740813299155, 0.0019986543328347818, 0.004757843141387655, 0.005983534342592772, 0.00481339134641914, 0.006517558604036213, 0.007932297276587771, 0.009456038253064025, 0.005506438492499224, 0.004140027438229648, 0.012337208744171348, 0.0035501354165972596, 0.009343133296790727, 0.0027142131199013103, 0.006482169570108566, 0.004154645386922144, 0.0017665485037815613, 0.017853057116201, 0.003610558537541544, 0.004675540690725952, 0.005043042719901201, 0.007465250416339576, 0.0028165251626569745, 0.004766953862898327, 0.0029729780079421017, 0.009586328369712535, 0.0034640322992755073, 0.0033724627490473516, 0.004139796270668929, 0.0038257075461114692, 0.006276403244885452, 0.0032975168660532204, 0.0037036918683275013, 0.004827784926596821, 0.002499370068397042, 0.0032287513157853586, 0.0038816161005758385, 0.002207269458291457, 0.004003944534471367, 0.00303153819030883, 0.0025808770306880377, 0.001965903328717688, 0.0018933711070192943, 0.0013434710733912132, 0.0013168256124954497, 0.001120761525776611, 0.0006088749577949225, 0.00029790699530716255, 8.133698614340846e-05, 2.002319018576761e-05, 9.981679290908466e-05], "label": "dedup_minhash_independent_output_CC-MAIN-2013-48"}}, "layout": {"title": {"text": "Histograms of selected metrics"}, "xaxis": {"title": {"text": "Fraction of lines shorter than 30 chars"}}, "yaxis": {"range": [0.0, 0.1]}, "shapes": [{"type": "rect", "x0": 0.67, "y0": 0.0, "x1": 1.0, "y1": 0.1, "xref": "x", "yref": "y", "line": {"color": "rgba(255, 0, 0, 1)", "width": 1, "dash": "dashdot"}, "fillcolor": "rgba(255, 0, 0, 0.2)", "showarrow": false}], "annotations": [{"text": "Filtered out", "x": 0.83, "y": 0.05, "xref": "x", "yref": "y", "font": {"size": 10}, "showarrow": false}], "legend": {"xanchor": "right", "yanchor": "top", "x": 1, "y": 1}}}
|
src/plotting.js
CHANGED
@@ -21,7 +21,9 @@ const TASK_ID_TO_NAME = {
|
|
21 |
"mmlu/acc_norm": "MMLU",
|
22 |
|
23 |
// Stats
|
24 |
-
|
|
|
|
|
25 |
};
|
26 |
|
27 |
const DATASET_ID_TO_NAME = {
|
@@ -31,6 +33,9 @@ const DATASET_ID_TO_NAME = {
|
|
31 |
"red-pajama-v2_jsonl-deduplicated-extract": "RedPajamaV2",
|
32 |
"dolma-sample": "Dolma1.6",
|
33 |
dedup_minhash_independent_output: "Individual Dedup MinHash",
|
|
|
|
|
|
|
34 |
};
|
35 |
|
36 |
const DEFAULT_SETTINGS = {
|
@@ -148,9 +153,8 @@ const init_ablation_plot = function () {
|
|
148 |
async function updatePlot(dropdown, slider) {
|
149 |
const metricName = dropdown?.value ?? settings.defaultMetric;
|
150 |
const sliderValue = parseInt(slider?.value ?? 0);
|
151 |
-
console.log(indexMapping)
|
152 |
console.log(metricName)
|
153 |
-
console.log(
|
154 |
const metricData = await fetch(
|
155 |
`data/plots/${plotName}/${indexMapping[metricName]["file"]}`
|
156 |
).then((response) => response.json());
|
@@ -163,7 +167,7 @@ const init_ablation_plot = function () {
|
|
163 |
const trace = {
|
164 |
x: x,
|
165 |
y: y,
|
166 |
-
name:
|
167 |
...plotSettings,
|
168 |
};
|
169 |
traces.push(trace);
|
@@ -220,6 +224,8 @@ const createAblationPlottingElements = (
|
|
220 |
);
|
221 |
// Dropdown
|
222 |
let dropdown = undefined
|
|
|
|
|
223 |
if (metricOptions.length > 1) {
|
224 |
const dropdownLabel = document.createElement("label");
|
225 |
dropdownLabel.textContent = "Metric:";
|
@@ -238,7 +244,6 @@ const createAblationPlottingElements = (
|
|
238 |
dropdownContainer.appendChild(dropdown);
|
239 |
controls.appendChild(dropdownContainer);
|
240 |
}
|
241 |
-
console.log(dropdown)
|
242 |
|
243 |
let slider = undefined;
|
244 |
if (settings.slider !== null) {
|
|
|
21 |
"mmlu/acc_norm": "MMLU",
|
22 |
|
23 |
// Stats
|
24 |
+
"lines_ended_with_punct": "Lines Ended With Punctuation",
|
25 |
+
"lines_chars": "Lines Chars",
|
26 |
+
"short_lines": "Short Lines",
|
27 |
};
|
28 |
|
29 |
const DATASET_ID_TO_NAME = {
|
|
|
33 |
"red-pajama-v2_jsonl-deduplicated-extract": "RedPajamaV2",
|
34 |
"dolma-sample": "Dolma1.6",
|
35 |
dedup_minhash_independent_output: "Individual Dedup MinHash",
|
36 |
+
dedup_minhash_independent_output: "Individual Dedup MinHash",
|
37 |
+
"dedup_minhash_CC-MAIN-2013-48_output": "Full MinHash CC-MAIN-2013-48",
|
38 |
+
"dedup_minhash_independent_output_CC-MAIN-2013-48": "Independent MinHash CC-MAIN-2013-48",
|
39 |
};
|
40 |
|
41 |
const DEFAULT_SETTINGS = {
|
|
|
153 |
async function updatePlot(dropdown, slider) {
|
154 |
const metricName = dropdown?.value ?? settings.defaultMetric;
|
155 |
const sliderValue = parseInt(slider?.value ?? 0);
|
|
|
156 |
console.log(metricName)
|
157 |
+
console.log(indexMapping[metricName])
|
158 |
const metricData = await fetch(
|
159 |
`data/plots/${plotName}/${indexMapping[metricName]["file"]}`
|
160 |
).then((response) => response.json());
|
|
|
167 |
const trace = {
|
168 |
x: x,
|
169 |
y: y,
|
170 |
+
name: DATASET_ID_TO_NAME[key] ?? key,
|
171 |
...plotSettings,
|
172 |
};
|
173 |
traces.push(trace);
|
|
|
224 |
);
|
225 |
// Dropdown
|
226 |
let dropdown = undefined
|
227 |
+
console.log(plotElement)
|
228 |
+
console.log(metricOptions)
|
229 |
if (metricOptions.length > 1) {
|
230 |
const dropdownLabel = document.createElement("label");
|
231 |
dropdownLabel.textContent = "Metric:";
|
|
|
244 |
dropdownContainer.appendChild(dropdown);
|
245 |
controls.appendChild(dropdownContainer);
|
246 |
}
|
|
|
247 |
|
248 |
let slider = undefined;
|
249 |
if (settings.slider !== null) {
|