TxT360 / data /topic_charts.json
mylibrar's picture
Update topic analysis result (#5)
fa3890f verified
[
[
"Number of Document of Each Topic",
{
"type": "pie",
"kwargs": {
"x": [
324853053,
127033069,
233531055,
123094708,
267497100,
148588074,
581871647,
165387460,
390492627,
244588996,
170281196,
914696921,
281274506,
686870899,
19458015,
1411221902,
366116749
],
"labels": [
"Sports",
"Society & Social Issues & Human Rights",
"Shopping & Commodity",
"Religion & Spirituality",
"Politics & Government",
"Personal Development & Human Resources & Career",
"Natural Science & Formal Science & Technology",
"Law & Justice",
"Health & Wellness & Medicine",
"Food & Drink & Cooking",
"Environment",
"Entertainment & Travel & Hobby",
"Education",
"Daily Life & Home & Lifestyle",
"Culture & Cultural geography",
"Business & Economics & Finance",
"Arts"
],
"autopct": "%1.1f%%",
"colors": [
[
1.0,
0.4980392156862745,
0.054901960784313725,
1.0
],
[
1.0,
0.7333333333333333,
0.47058823529411764,
1.0
],
[
0.17254901960784313,
0.6274509803921569,
0.17254901960784313,
1.0
],
[
0.596078431372549,
0.8745098039215686,
0.5411764705882353,
1.0
],
[
0.8392156862745098,
0.15294117647058825,
0.1568627450980392,
1.0
],
[
1.0,
0.596078431372549,
0.5882352941176471,
1.0
],
[
0.5803921568627451,
0.403921568627451,
0.7411764705882353,
1.0
],
[
0.7725490196078432,
0.6901960784313725,
0.8352941176470589,
1.0
],
[
0.5490196078431373,
0.33725490196078434,
0.29411764705882354,
1.0
],
[
0.7686274509803922,
0.611764705882353,
0.5803921568627451,
1.0
],
[
0.8901960784313725,
0.4666666666666667,
0.7607843137254902,
1.0
],
[
0.9686274509803922,
0.7137254901960784,
0.8235294117647058,
1.0
],
[
0.4980392156862745,
0.4980392156862745,
0.4980392156862745,
1.0
],
[
0.7803921568627451,
0.7803921568627451,
0.7803921568627451,
1.0
],
[
0.7372549019607844,
0.7411764705882353,
0.13333333333333333,
1.0
],
[
0.8588235294117647,
0.8588235294117647,
0.5529411764705883,
1.0
],
[
0.09019607843137255,
0.7450980392156863,
0.8117647058823529,
1.0
]
],
"pctdistance": 1.2,
"labeldistance": 1.5
},
"comment": "As shown in the graph above, over 20% of the documents are related to Business & Economics & Finance, which makes it the largest topic group in dataset. On the contrary, the group of Culture & Cultural geography contains the smallest number of documents among all topics."
}
],
[
"Fraction of Words Corrected in Lines",
{
"type": "barh",
"kwargs": {
"y": [
"Sports",
"Society & Social Issues & Human Rights",
"Shopping & Commodity",
"Religion & Spirituality",
"Politics & Government",
"Personal Development & Human Resources & Career",
"Natural Science & Formal Science & Technology",
"Law & Justice",
"Health & Wellness & Medicine",
"Food & Drink & Cooking",
"Environment",
"Entertainment & Travel & Hobby",
"Education",
"Daily Life & Home & Lifestyle",
"Culture & Cultural geography",
"Business & Economics & Finance",
"Arts"
],
"width": [
0.005660839019938058,
0.005601641737204916,
0.010656858389603374,
0.006108459524594901,
0.005077341851036456,
0.004333818728677237,
0.00812686384284095,
0.005099914065389049,
0.005922873834475705,
0.008028764588273587,
0.005868815973653353,
0.007446294346393395,
0.006845364607248323,
0.007812665071102337,
0.007692180748283549,
0.006834288663313659,
0.007850315335340054
],
"color": [
[
1.0,
0.4980392156862745,
0.054901960784313725,
1.0
],
[
1.0,
0.7333333333333333,
0.47058823529411764,
1.0
],
[
0.17254901960784313,
0.6274509803921569,
0.17254901960784313,
1.0
],
[
0.596078431372549,
0.8745098039215686,
0.5411764705882353,
1.0
],
[
0.8392156862745098,
0.15294117647058825,
0.1568627450980392,
1.0
],
[
1.0,
0.596078431372549,
0.5882352941176471,
1.0
],
[
0.5803921568627451,
0.403921568627451,
0.7411764705882353,
1.0
],
[
0.7725490196078432,
0.6901960784313725,
0.8352941176470589,
1.0
],
[
0.5490196078431373,
0.33725490196078434,
0.29411764705882354,
1.0
],
[
0.7686274509803922,
0.611764705882353,
0.5803921568627451,
1.0
],
[
0.8901960784313725,
0.4666666666666667,
0.7607843137254902,
1.0
],
[
0.9686274509803922,
0.7137254901960784,
0.8235294117647058,
1.0
],
[
0.4980392156862745,
0.4980392156862745,
0.4980392156862745,
1.0
],
[
0.7803921568627451,
0.7803921568627451,
0.7803921568627451,
1.0
],
[
0.7372549019607844,
0.7411764705882353,
0.13333333333333333,
1.0
],
[
0.8588235294117647,
0.8588235294117647,
0.5529411764705883,
1.0
],
[
0.09019607843137255,
0.7450980392156863,
0.8117647058823529,
1.0
]
]
},
"x_label": "Metrics",
"subplots_adjust": {
"left": 0.37,
"right": 0.98
},
"comment": "In average, documents related to Shopping & Commodity have larger fraction of words corrected in lines."
}
],
[
"Fraction of Lines Ending with Ellipsis",
{
"type": "barh",
"kwargs": {
"y": [
"Sports",
"Society & Social Issues & Human Rights",
"Shopping & Commodity",
"Religion & Spirituality",
"Politics & Government",
"Personal Development & Human Resources & Career",
"Natural Science & Formal Science & Technology",
"Law & Justice",
"Health & Wellness & Medicine",
"Food & Drink & Cooking",
"Environment",
"Entertainment & Travel & Hobby",
"Education",
"Daily Life & Home & Lifestyle",
"Culture & Cultural geography",
"Business & Economics & Finance",
"Arts"
],
"width": [
0.013698353704877283,
0.011988367184873385,
0.010239788510367555,
0.013182844498032174,
0.012825014289657984,
0.016784713501187303,
0.013729740175749594,
0.012272497721678627,
0.011805768817329271,
0.013464839491767208,
0.012785021526251267,
0.015677345947523093,
0.011127706885026923,
0.012810749078485683,
0.013244961193298873,
0.012872493046687979,
0.014188113777531883
],
"color": [
[
1.0,
0.4980392156862745,
0.054901960784313725,
1.0
],
[
1.0,
0.7333333333333333,
0.47058823529411764,
1.0
],
[
0.17254901960784313,
0.6274509803921569,
0.17254901960784313,
1.0
],
[
0.596078431372549,
0.8745098039215686,
0.5411764705882353,
1.0
],
[
0.8392156862745098,
0.15294117647058825,
0.1568627450980392,
1.0
],
[
1.0,
0.596078431372549,
0.5882352941176471,
1.0
],
[
0.5803921568627451,
0.403921568627451,
0.7411764705882353,
1.0
],
[
0.7725490196078432,
0.6901960784313725,
0.8352941176470589,
1.0
],
[
0.5490196078431373,
0.33725490196078434,
0.29411764705882354,
1.0
],
[
0.7686274509803922,
0.611764705882353,
0.5803921568627451,
1.0
],
[
0.8901960784313725,
0.4666666666666667,
0.7607843137254902,
1.0
],
[
0.9686274509803922,
0.7137254901960784,
0.8235294117647058,
1.0
],
[
0.4980392156862745,
0.4980392156862745,
0.4980392156862745,
1.0
],
[
0.7803921568627451,
0.7803921568627451,
0.7803921568627451,
1.0
],
[
0.7372549019607844,
0.7411764705882353,
0.13333333333333333,
1.0
],
[
0.8588235294117647,
0.8588235294117647,
0.5529411764705883,
1.0
],
[
0.09019607843137255,
0.7450980392156863,
0.8117647058823529,
1.0
]
]
},
"x_label": "Metrics",
"subplots_adjust": {
"left": 0.37,
"right": 0.98
},
"comment": "Compared with other topics, Personal Development & Human Resources & Career in average contain more lines ending with ellipsis."
}
],
[
"Fraction of Lines Starting with Bullet Point",
{
"type": "barh",
"kwargs": {
"y": [
"Sports",
"Society & Social Issues & Human Rights",
"Shopping & Commodity",
"Religion & Spirituality",
"Politics & Government",
"Personal Development & Human Resources & Career",
"Natural Science & Formal Science & Technology",
"Law & Justice",
"Health & Wellness & Medicine",
"Food & Drink & Cooking",
"Environment",
"Entertainment & Travel & Hobby",
"Education",
"Daily Life & Home & Lifestyle",
"Culture & Cultural geography",
"Business & Economics & Finance",
"Arts"
],
"width": [
0.05924759002174845,
0.06636489569195865,
0.1111156447572103,
0.057900707172324956,
0.05498350949228654,
0.04950217629831486,
0.09247477225558454,
0.06597399742617387,
0.08548870827846955,
0.09873316891194645,
0.06547543788491705,
0.0735152711822082,
0.08847503034590092,
0.07390893089349196,
0.058802087892367495,
0.08333351946410401,
0.06067125030474924
],
"color": [
[
1.0,
0.4980392156862745,
0.054901960784313725,
1.0
],
[
1.0,
0.7333333333333333,
0.47058823529411764,
1.0
],
[
0.17254901960784313,
0.6274509803921569,
0.17254901960784313,
1.0
],
[
0.596078431372549,
0.8745098039215686,
0.5411764705882353,
1.0
],
[
0.8392156862745098,
0.15294117647058825,
0.1568627450980392,
1.0
],
[
1.0,
0.596078431372549,
0.5882352941176471,
1.0
],
[
0.5803921568627451,
0.403921568627451,
0.7411764705882353,
1.0
],
[
0.7725490196078432,
0.6901960784313725,
0.8352941176470589,
1.0
],
[
0.5490196078431373,
0.33725490196078434,
0.29411764705882354,
1.0
],
[
0.7686274509803922,
0.611764705882353,
0.5803921568627451,
1.0
],
[
0.8901960784313725,
0.4666666666666667,
0.7607843137254902,
1.0
],
[
0.9686274509803922,
0.7137254901960784,
0.8235294117647058,
1.0
],
[
0.4980392156862745,
0.4980392156862745,
0.4980392156862745,
1.0
],
[
0.7803921568627451,
0.7803921568627451,
0.7803921568627451,
1.0
],
[
0.7372549019607844,
0.7411764705882353,
0.13333333333333333,
1.0
],
[
0.8588235294117647,
0.8588235294117647,
0.5529411764705883,
1.0
],
[
0.09019607843137255,
0.7450980392156863,
0.8117647058823529,
1.0
]
]
},
"x_label": "Metrics",
"subplots_adjust": {
"left": 0.37,
"right": 0.98
},
"comment": "Shopping & Commodity related documents have higher percentage of lines starting with bullet point."
}
],
[
"Number of Lines with Toxic Words",
{
"type": "barh",
"kwargs": {
"y": [
"Sports",
"Society & Social Issues & Human Rights",
"Shopping & Commodity",
"Religion & Spirituality",
"Politics & Government",
"Personal Development & Human Resources & Career",
"Natural Science & Formal Science & Technology",
"Law & Justice",
"Health & Wellness & Medicine",
"Food & Drink & Cooking",
"Environment",
"Entertainment & Travel & Hobby",
"Education",
"Daily Life & Home & Lifestyle",
"Culture & Cultural geography",
"Business & Economics & Finance",
"Arts"
],
"width": [
0.19922507238988454,
0.8737279739340943,
0.2723972492651994,
0.2602153376081773,
0.26610157268994694,
1.0438407391968751,
0.13075538461491662,
0.43004878362603793,
0.7763741362522576,
0.15141952256920013,
0.1365380766999076,
0.7216673095153012,
0.10996786534219351,
1.0588212632953606,
0.13198319561373553,
0.1422862362860352,
0.45226715918424154
],
"color": [
[
1.0,
0.4980392156862745,
0.054901960784313725,
1.0
],
[
1.0,
0.7333333333333333,
0.47058823529411764,
1.0
],
[
0.17254901960784313,
0.6274509803921569,
0.17254901960784313,
1.0
],
[
0.596078431372549,
0.8745098039215686,
0.5411764705882353,
1.0
],
[
0.8392156862745098,
0.15294117647058825,
0.1568627450980392,
1.0
],
[
1.0,
0.596078431372549,
0.5882352941176471,
1.0
],
[
0.5803921568627451,
0.403921568627451,
0.7411764705882353,
1.0
],
[
0.7725490196078432,
0.6901960784313725,
0.8352941176470589,
1.0
],
[
0.5490196078431373,
0.33725490196078434,
0.29411764705882354,
1.0
],
[
0.7686274509803922,
0.611764705882353,
0.5803921568627451,
1.0
],
[
0.8901960784313725,
0.4666666666666667,
0.7607843137254902,
1.0
],
[
0.9686274509803922,
0.7137254901960784,
0.8235294117647058,
1.0
],
[
0.4980392156862745,
0.4980392156862745,
0.4980392156862745,
1.0
],
[
0.7803921568627451,
0.7803921568627451,
0.7803921568627451,
1.0
],
[
0.7372549019607844,
0.7411764705882353,
0.13333333333333333,
1.0
],
[
0.8588235294117647,
0.8588235294117647,
0.5529411764705883,
1.0
],
[
0.09019607843137255,
0.7450980392156863,
0.8117647058823529,
1.0
]
]
},
"x_label": "Metrics",
"subplots_adjust": {
"left": 0.37,
"right": 0.98
},
"comment": "Daily Life & Home & Lifestyle in average has more lines with toxic words."
}
],
[
"Number of Toxic Words",
{
"type": "barh",
"kwargs": {
"y": [
"Sports",
"Society & Social Issues & Human Rights",
"Shopping & Commodity",
"Religion & Spirituality",
"Politics & Government",
"Personal Development & Human Resources & Career",
"Natural Science & Formal Science & Technology",
"Law & Justice",
"Health & Wellness & Medicine",
"Food & Drink & Cooking",
"Environment",
"Entertainment & Travel & Hobby",
"Education",
"Daily Life & Home & Lifestyle",
"Culture & Cultural geography",
"Business & Economics & Finance",
"Arts"
],
"width": [
0.2733295968131166,
1.5669363935464709,
0.44824271872535326,
0.372924147153426,
0.3350402266043258,
2.0669407694186814,
0.19540095755860054,
0.5992540667835397,
1.458653655962626,
0.1993768722121906,
0.18759733165134687,
1.4411351199901983,
0.1523272713524915,
2.455736465842033,
0.18513111435056454,
0.22537730852195914,
0.781900764665645
],
"color": [
[
1.0,
0.4980392156862745,
0.054901960784313725,
1.0
],
[
1.0,
0.7333333333333333,
0.47058823529411764,
1.0
],
[
0.17254901960784313,
0.6274509803921569,
0.17254901960784313,
1.0
],
[
0.596078431372549,
0.8745098039215686,
0.5411764705882353,
1.0
],
[
0.8392156862745098,
0.15294117647058825,
0.1568627450980392,
1.0
],
[
1.0,
0.596078431372549,
0.5882352941176471,
1.0
],
[
0.5803921568627451,
0.403921568627451,
0.7411764705882353,
1.0
],
[
0.7725490196078432,
0.6901960784313725,
0.8352941176470589,
1.0
],
[
0.5490196078431373,
0.33725490196078434,
0.29411764705882354,
1.0
],
[
0.7686274509803922,
0.611764705882353,
0.5803921568627451,
1.0
],
[
0.8901960784313725,
0.4666666666666667,
0.7607843137254902,
1.0
],
[
0.9686274509803922,
0.7137254901960784,
0.8235294117647058,
1.0
],
[
0.4980392156862745,
0.4980392156862745,
0.4980392156862745,
1.0
],
[
0.7803921568627451,
0.7803921568627451,
0.7803921568627451,
1.0
],
[
0.7372549019607844,
0.7411764705882353,
0.13333333333333333,
1.0
],
[
0.8588235294117647,
0.8588235294117647,
0.5529411764705883,
1.0
],
[
0.09019607843137255,
0.7450980392156863,
0.8117647058823529,
1.0
]
]
},
"x_label": "Metrics",
"subplots_adjust": {
"left": 0.37,
"right": 0.98
},
"comment": "Daily Life & Home & Lifestyle in average has more toxic words."
}
],
[
"Word Count",
{
"type": "barh",
"kwargs": {
"y": [
"Sports",
"Society & Social Issues & Human Rights",
"Shopping & Commodity",
"Religion & Spirituality",
"Politics & Government",
"Personal Development & Human Resources & Career",
"Natural Science & Formal Science & Technology",
"Law & Justice",
"Health & Wellness & Medicine",
"Food & Drink & Cooking",
"Environment",
"Entertainment & Travel & Hobby",
"Education",
"Daily Life & Home & Lifestyle",
"Culture & Cultural geography",
"Business & Economics & Finance",
"Arts"
],
"width": [
528.940000185253,
630.8575219496587,
331.12940931560473,
652.5414443568118,
639.8122070257958,
745.046993327338,
616.6008528251936,
561.0260046983005,
739.9628602078574,
427.55716142683707,
611.8977898886733,
470.5665158383101,
557.2392759050832,
448.4545774765747,
666.4168803960733,
555.4812271358867,
506.7156364621822
],
"color": [
[
1.0,
0.4980392156862745,
0.054901960784313725,
1.0
],
[
1.0,
0.7333333333333333,
0.47058823529411764,
1.0
],
[
0.17254901960784313,
0.6274509803921569,
0.17254901960784313,
1.0
],
[
0.596078431372549,
0.8745098039215686,
0.5411764705882353,
1.0
],
[
0.8392156862745098,
0.15294117647058825,
0.1568627450980392,
1.0
],
[
1.0,
0.596078431372549,
0.5882352941176471,
1.0
],
[
0.5803921568627451,
0.403921568627451,
0.7411764705882353,
1.0
],
[
0.7725490196078432,
0.6901960784313725,
0.8352941176470589,
1.0
],
[
0.5490196078431373,
0.33725490196078434,
0.29411764705882354,
1.0
],
[
0.7686274509803922,
0.611764705882353,
0.5803921568627451,
1.0
],
[
0.8901960784313725,
0.4666666666666667,
0.7607843137254902,
1.0
],
[
0.9686274509803922,
0.7137254901960784,
0.8235294117647058,
1.0
],
[
0.4980392156862745,
0.4980392156862745,
0.4980392156862745,
1.0
],
[
0.7803921568627451,
0.7803921568627451,
0.7803921568627451,
1.0
],
[
0.7372549019607844,
0.7411764705882353,
0.13333333333333333,
1.0
],
[
0.8588235294117647,
0.8588235294117647,
0.5529411764705883,
1.0
],
[
0.09019607843137255,
0.7450980392156863,
0.8117647058823529,
1.0
]
]
},
"x_label": "Metrics",
"subplots_adjust": {
"left": 0.37,
"right": 0.98
},
"comment": "Documents in the topic of Personal Development & Human Resources & Career in average contain more words than other topics."
}
],
[
"Mean Word Length",
{
"type": "barh",
"kwargs": {
"y": [
"Sports",
"Society & Social Issues & Human Rights",
"Shopping & Commodity",
"Religion & Spirituality",
"Politics & Government",
"Personal Development & Human Resources & Career",
"Natural Science & Formal Science & Technology",
"Law & Justice",
"Health & Wellness & Medicine",
"Food & Drink & Cooking",
"Environment",
"Entertainment & Travel & Hobby",
"Education",
"Daily Life & Home & Lifestyle",
"Culture & Cultural geography",
"Business & Economics & Finance",
"Arts"
],
"width": [
4.8591533713129555,
5.180746132747496,
4.95975994177285,
4.875042818402709,
5.168670579970495,
4.654983410185081,
5.237515458154388,
5.1004147156966715,
5.205703704499496,
4.880593401877592,
5.08581294318828,
4.914944728270949,
5.264151733240911,
4.9967250103431935,
5.143653278714547,
5.172399304913307,
4.948735274753513
],
"color": [
[
1.0,
0.4980392156862745,
0.054901960784313725,
1.0
],
[
1.0,
0.7333333333333333,
0.47058823529411764,
1.0
],
[
0.17254901960784313,
0.6274509803921569,
0.17254901960784313,
1.0
],
[
0.596078431372549,
0.8745098039215686,
0.5411764705882353,
1.0
],
[
0.8392156862745098,
0.15294117647058825,
0.1568627450980392,
1.0
],
[
1.0,
0.596078431372549,
0.5882352941176471,
1.0
],
[
0.5803921568627451,
0.403921568627451,
0.7411764705882353,
1.0
],
[
0.7725490196078432,
0.6901960784313725,
0.8352941176470589,
1.0
],
[
0.5490196078431373,
0.33725490196078434,
0.29411764705882354,
1.0
],
[
0.7686274509803922,
0.611764705882353,
0.5803921568627451,
1.0
],
[
0.8901960784313725,
0.4666666666666667,
0.7607843137254902,
1.0
],
[
0.9686274509803922,
0.7137254901960784,
0.8235294117647058,
1.0
],
[
0.4980392156862745,
0.4980392156862745,
0.4980392156862745,
1.0
],
[
0.7803921568627451,
0.7803921568627451,
0.7803921568627451,
1.0
],
[
0.7372549019607844,
0.7411764705882353,
0.13333333333333333,
1.0
],
[
0.8588235294117647,
0.8588235294117647,
0.5529411764705883,
1.0
],
[
0.09019607843137255,
0.7450980392156863,
0.8117647058823529,
1.0
]
]
},
"x_label": "Metrics",
"subplots_adjust": {
"left": 0.37,
"right": 0.98
},
"comment": "There is no significant variance in the average word length for different topic groups. However, Education related data contain longer words than others in general."
}
],
[
"Number of Sentences",
{
"type": "barh",
"kwargs": {
"y": [
"Sports",
"Society & Social Issues & Human Rights",
"Shopping & Commodity",
"Religion & Spirituality",
"Politics & Government",
"Personal Development & Human Resources & Career",
"Natural Science & Formal Science & Technology",
"Law & Justice",
"Health & Wellness & Medicine",
"Food & Drink & Cooking",
"Environment",
"Entertainment & Travel & Hobby",
"Education",
"Daily Life & Home & Lifestyle",
"Culture & Cultural geography",
"Business & Economics & Finance",
"Arts"
],
"width": [
23.889495217396032,
28.667429824906456,
17.00780648209721,
32.56295546840243,
27.016352794105057,
41.47359763206837,
27.639144560346658,
24.82906568611671,
34.94192956170719,
22.905121737365487,
28.89842860864097,
22.790945701674666,
26.620081949410658,
22.43558780468875,
30.96235911011478,
25.786870409555195,
27.291027835495175
],
"color": [
[
1.0,
0.4980392156862745,
0.054901960784313725,
1.0
],
[
1.0,
0.7333333333333333,
0.47058823529411764,
1.0
],
[
0.17254901960784313,
0.6274509803921569,
0.17254901960784313,
1.0
],
[
0.596078431372549,
0.8745098039215686,
0.5411764705882353,
1.0
],
[
0.8392156862745098,
0.15294117647058825,
0.1568627450980392,
1.0
],
[
1.0,
0.596078431372549,
0.5882352941176471,
1.0
],
[
0.5803921568627451,
0.403921568627451,
0.7411764705882353,
1.0
],
[
0.7725490196078432,
0.6901960784313725,
0.8352941176470589,
1.0
],
[
0.5490196078431373,
0.33725490196078434,
0.29411764705882354,
1.0
],
[
0.7686274509803922,
0.611764705882353,
0.5803921568627451,
1.0
],
[
0.8901960784313725,
0.4666666666666667,
0.7607843137254902,
1.0
],
[
0.9686274509803922,
0.7137254901960784,
0.8235294117647058,
1.0
],
[
0.4980392156862745,
0.4980392156862745,
0.4980392156862745,
1.0
],
[
0.7803921568627451,
0.7803921568627451,
0.7803921568627451,
1.0
],
[
0.7372549019607844,
0.7411764705882353,
0.13333333333333333,
1.0
],
[
0.8588235294117647,
0.8588235294117647,
0.5529411764705883,
1.0
],
[
0.09019607843137255,
0.7450980392156863,
0.8117647058823529,
1.0
]
]
},
"x_label": "Metrics",
"subplots_adjust": {
"left": 0.37,
"right": 0.98
},
"comment": "Documents in the topic of Personal Development & Human Resources & Career usually contain more sentences."
}
],
[
"Symbol to Word Ratio",
{
"type": "barh",
"kwargs": {
"y": [
"Sports",
"Society & Social Issues & Human Rights",
"Shopping & Commodity",
"Religion & Spirituality",
"Politics & Government",
"Personal Development & Human Resources & Career",
"Natural Science & Formal Science & Technology",
"Law & Justice",
"Health & Wellness & Medicine",
"Food & Drink & Cooking",
"Environment",
"Entertainment & Travel & Hobby",
"Education",
"Daily Life & Home & Lifestyle",
"Culture & Cultural geography",
"Business & Economics & Finance",
"Arts"
],
"width": [
0.0029673437245102876,
0.002369316991198444,
0.0026953845515368074,
0.0030995856207761256,
0.002515345366978788,
0.003716508288521279,
0.002910243583180489,
0.0021063347433407133,
0.0023350751882016177,
0.00325952936332765,
0.0026651973287582483,
0.0037352572697365097,
0.002278588397824893,
0.0033945285429091187,
0.002321581720070917,
0.002557382224711868,
0.0035588078008559885
],
"color": [
[
1.0,
0.4980392156862745,
0.054901960784313725,
1.0
],
[
1.0,
0.7333333333333333,
0.47058823529411764,
1.0
],
[
0.17254901960784313,
0.6274509803921569,
0.17254901960784313,
1.0
],
[
0.596078431372549,
0.8745098039215686,
0.5411764705882353,
1.0
],
[
0.8392156862745098,
0.15294117647058825,
0.1568627450980392,
1.0
],
[
1.0,
0.596078431372549,
0.5882352941176471,
1.0
],
[
0.5803921568627451,
0.403921568627451,
0.7411764705882353,
1.0
],
[
0.7725490196078432,
0.6901960784313725,
0.8352941176470589,
1.0
],
[
0.5490196078431373,
0.33725490196078434,
0.29411764705882354,
1.0
],
[
0.7686274509803922,
0.611764705882353,
0.5803921568627451,
1.0
],
[
0.8901960784313725,
0.4666666666666667,
0.7607843137254902,
1.0
],
[
0.9686274509803922,
0.7137254901960784,
0.8235294117647058,
1.0
],
[
0.4980392156862745,
0.4980392156862745,
0.4980392156862745,
1.0
],
[
0.7803921568627451,
0.7803921568627451,
0.7803921568627451,
1.0
],
[
0.7372549019607844,
0.7411764705882353,
0.13333333333333333,
1.0
],
[
0.8588235294117647,
0.8588235294117647,
0.5529411764705883,
1.0
],
[
0.09019607843137255,
0.7450980392156863,
0.8117647058823529,
1.0
]
]
},
"x_label": "Metrics",
"subplots_adjust": {
"left": 0.37,
"right": 0.98
},
"comment": "Documents related to Entertainment & Travel & Hobby usually have higher percentage of symbols."
}
],
[
"Fraction of Words with Alpha Character",
{
"type": "barh",
"kwargs": {
"y": [
"Sports",
"Society & Social Issues & Human Rights",
"Shopping & Commodity",
"Religion & Spirituality",
"Politics & Government",
"Personal Development & Human Resources & Career",
"Natural Science & Formal Science & Technology",
"Law & Justice",
"Health & Wellness & Medicine",
"Food & Drink & Cooking",
"Environment",
"Entertainment & Travel & Hobby",
"Education",
"Daily Life & Home & Lifestyle",
"Culture & Cultural geography",
"Business & Economics & Finance",
"Arts"
],
"width": [
0.9543530503658745,
0.966243732434154,
0.9437721848599528,
0.9641198106485631,
0.9655064629815291,
0.9789937421507563,
0.9480065698252734,
0.9637242361370412,
0.9640004505795688,
0.950377474345678,
0.9627294216362635,
0.9531905135921064,
0.9586824669836848,
0.9522644098234544,
0.9526614781429045,
0.9564103310368344,
0.9600895447178572
],
"color": [
[
1.0,
0.4980392156862745,
0.054901960784313725,
1.0
],
[
1.0,
0.7333333333333333,
0.47058823529411764,
1.0
],
[
0.17254901960784313,
0.6274509803921569,
0.17254901960784313,
1.0
],
[
0.596078431372549,
0.8745098039215686,
0.5411764705882353,
1.0
],
[
0.8392156862745098,
0.15294117647058825,
0.1568627450980392,
1.0
],
[
1.0,
0.596078431372549,
0.5882352941176471,
1.0
],
[
0.5803921568627451,
0.403921568627451,
0.7411764705882353,
1.0
],
[
0.7725490196078432,
0.6901960784313725,
0.8352941176470589,
1.0
],
[
0.5490196078431373,
0.33725490196078434,
0.29411764705882354,
1.0
],
[
0.7686274509803922,
0.611764705882353,
0.5803921568627451,
1.0
],
[
0.8901960784313725,
0.4666666666666667,
0.7607843137254902,
1.0
],
[
0.9686274509803922,
0.7137254901960784,
0.8235294117647058,
1.0
],
[
0.4980392156862745,
0.4980392156862745,
0.4980392156862745,
1.0
],
[
0.7803921568627451,
0.7803921568627451,
0.7803921568627451,
1.0
],
[
0.7372549019607844,
0.7411764705882353,
0.13333333333333333,
1.0
],
[
0.8588235294117647,
0.8588235294117647,
0.5529411764705883,
1.0
],
[
0.09019607843137255,
0.7450980392156863,
0.8117647058823529,
1.0
]
]
},
"x_label": "Metrics",
"subplots_adjust": {
"left": 0.37,
"right": 0.98
},
"comment": "The fraction of words with alpha character seems to be relatively consistent across different topics."
}
],
[
"Number of Stop Words",
{
"type": "barh",
"kwargs": {
"y": [
"Sports",
"Society & Social Issues & Human Rights",
"Shopping & Commodity",
"Religion & Spirituality",
"Politics & Government",
"Personal Development & Human Resources & Career",
"Natural Science & Formal Science & Technology",
"Law & Justice",
"Health & Wellness & Medicine",
"Food & Drink & Cooking",
"Environment",
"Entertainment & Travel & Hobby",
"Education",
"Daily Life & Home & Lifestyle",
"Culture & Cultural geography",
"Business & Economics & Finance",
"Arts"
],
"width": [
107.18473168851517,
138.47942025237538,
61.60918090315654,
149.6609807547535,
142.7609030677342,
156.05911428665533,
121.44399604677766,
120.49043039901574,
151.07195976327614,
83.29154811200092,
130.22922543954883,
93.5245517591504,
114.12362959051823,
85.78707441498406,
158.3108779081525,
113.40855796468499,
104.68147387870529
],
"color": [
[
1.0,
0.4980392156862745,
0.054901960784313725,
1.0
],
[
1.0,
0.7333333333333333,
0.47058823529411764,
1.0
],
[
0.17254901960784313,
0.6274509803921569,
0.17254901960784313,
1.0
],
[
0.596078431372549,
0.8745098039215686,
0.5411764705882353,
1.0
],
[
0.8392156862745098,
0.15294117647058825,
0.1568627450980392,
1.0
],
[
1.0,
0.596078431372549,
0.5882352941176471,
1.0
],
[
0.5803921568627451,
0.403921568627451,
0.7411764705882353,
1.0
],
[
0.7725490196078432,
0.6901960784313725,
0.8352941176470589,
1.0
],
[
0.5490196078431373,
0.33725490196078434,
0.29411764705882354,
1.0
],
[
0.7686274509803922,
0.611764705882353,
0.5803921568627451,
1.0
],
[
0.8901960784313725,
0.4666666666666667,
0.7607843137254902,
1.0
],
[
0.9686274509803922,
0.7137254901960784,
0.8235294117647058,
1.0
],
[
0.4980392156862745,
0.4980392156862745,
0.4980392156862745,
1.0
],
[
0.7803921568627451,
0.7803921568627451,
0.7803921568627451,
1.0
],
[
0.7372549019607844,
0.7411764705882353,
0.13333333333333333,
1.0
],
[
0.8588235294117647,
0.8588235294117647,
0.5529411764705883,
1.0
],
[
0.09019607843137255,
0.7450980392156863,
0.8117647058823529,
1.0
]
]
},
"x_label": "Metrics",
"subplots_adjust": {
"left": 0.37,
"right": 0.98
},
"comment": "Culture & Cultural geography contains more stop words in average."
}
],
[
"Has Curly Bracket",
{
"type": "barh",
"kwargs": {
"y": [
"Sports",
"Society & Social Issues & Human Rights",
"Shopping & Commodity",
"Religion & Spirituality",
"Politics & Government",
"Personal Development & Human Resources & Career",
"Natural Science & Formal Science & Technology",
"Law & Justice",
"Health & Wellness & Medicine",
"Food & Drink & Cooking",
"Environment",
"Entertainment & Travel & Hobby",
"Education",
"Daily Life & Home & Lifestyle",
"Culture & Cultural geography",
"Business & Economics & Finance",
"Arts"
],
"width": [
0.005196103236253101,
0.006631383517940514,
0.010188400853154199,
0.00940792678106032,
0.011743005812025626,
0.009386123411223433,
0.027600317153793196,
0.0069009343271853865,
0.009196375940793372,
0.010313546566910966,
0.007981867827613802,
0.007428252838734548,
0.010558184039615734,
0.011781598276738173,
0.01233861727416697,
0.00927681818248878,
0.00859682603594844
],
"color": [
[
1.0,
0.4980392156862745,
0.054901960784313725,
1.0
],
[
1.0,
0.7333333333333333,
0.47058823529411764,
1.0
],
[
0.17254901960784313,
0.6274509803921569,
0.17254901960784313,
1.0
],
[
0.596078431372549,
0.8745098039215686,
0.5411764705882353,
1.0
],
[
0.8392156862745098,
0.15294117647058825,
0.1568627450980392,
1.0
],
[
1.0,
0.596078431372549,
0.5882352941176471,
1.0
],
[
0.5803921568627451,
0.403921568627451,
0.7411764705882353,
1.0
],
[
0.7725490196078432,
0.6901960784313725,
0.8352941176470589,
1.0
],
[
0.5490196078431373,
0.33725490196078434,
0.29411764705882354,
1.0
],
[
0.7686274509803922,
0.611764705882353,
0.5803921568627451,
1.0
],
[
0.8901960784313725,
0.4666666666666667,
0.7607843137254902,
1.0
],
[
0.9686274509803922,
0.7137254901960784,
0.8235294117647058,
1.0
],
[
0.4980392156862745,
0.4980392156862745,
0.4980392156862745,
1.0
],
[
0.7803921568627451,
0.7803921568627451,
0.7803921568627451,
1.0
],
[
0.7372549019607844,
0.7411764705882353,
0.13333333333333333,
1.0
],
[
0.8588235294117647,
0.8588235294117647,
0.5529411764705883,
1.0
],
[
0.09019607843137255,
0.7450980392156863,
0.8117647058823529,
1.0
]
]
},
"x_label": "Metrics",
"subplots_adjust": {
"left": 0.37,
"right": 0.98
},
"comment": "Natural Science & Formal Science & Technology has a significantly higher rate in percentage of documents that contain curly bracket. It might be related to the coding data."
}
],
[
"Number of Document Duplication",
{
"type": "barh",
"kwargs": {
"y": [
"Sports",
"Society & Social Issues & Human Rights",
"Shopping & Commodity",
"Religion & Spirituality",
"Politics & Government",
"Personal Development & Human Resources & Career",
"Natural Science & Formal Science & Technology",
"Law & Justice",
"Health & Wellness & Medicine",
"Food & Drink & Cooking",
"Environment",
"Entertainment & Travel & Hobby",
"Education",
"Daily Life & Home & Lifestyle",
"Culture & Cultural geography",
"Business & Economics & Finance",
"Arts"
],
"width": [
7.537686235012851,
7.1025928374603,
5.966075852310092,
7.145161033242794,
7.434088750868701,
6.676430767922868,
6.555241123477528,
7.261966584407307,
6.24676702026438,
6.911925072867955,
7.12188790945537,
6.103017919746556,
6.569012504105153,
5.991152694037777,
8.832861317045957,
6.079587144899626,
6.927007100677604
],
"color": [
[
1.0,
0.4980392156862745,
0.054901960784313725,
1.0
],
[
1.0,
0.7333333333333333,
0.47058823529411764,
1.0
],
[
0.17254901960784313,
0.6274509803921569,
0.17254901960784313,
1.0
],
[
0.596078431372549,
0.8745098039215686,
0.5411764705882353,
1.0
],
[
0.8392156862745098,
0.15294117647058825,
0.1568627450980392,
1.0
],
[
1.0,
0.596078431372549,
0.5882352941176471,
1.0
],
[
0.5803921568627451,
0.403921568627451,
0.7411764705882353,
1.0
],
[
0.7725490196078432,
0.6901960784313725,
0.8352941176470589,
1.0
],
[
0.5490196078431373,
0.33725490196078434,
0.29411764705882354,
1.0
],
[
0.7686274509803922,
0.611764705882353,
0.5803921568627451,
1.0
],
[
0.8901960784313725,
0.4666666666666667,
0.7607843137254902,
1.0
],
[
0.9686274509803922,
0.7137254901960784,
0.8235294117647058,
1.0
],
[
0.4980392156862745,
0.4980392156862745,
0.4980392156862745,
1.0
],
[
0.7803921568627451,
0.7803921568627451,
0.7803921568627451,
1.0
],
[
0.7372549019607844,
0.7411764705882353,
0.13333333333333333,
1.0
],
[
0.8588235294117647,
0.8588235294117647,
0.5529411764705883,
1.0
],
[
0.09019607843137255,
0.7450980392156863,
0.8117647058823529,
1.0
]
]
},
"x_label": "Metrics",
"subplots_adjust": {
"left": 0.37,
"right": 0.98
},
"comment": "Culture & Cultural geography related documents have a higher number of duplication count."
}
],
[
"Number of Dump Duplication",
{
"type": "barh",
"kwargs": {
"y": [
"Sports",
"Society & Social Issues & Human Rights",
"Shopping & Commodity",
"Religion & Spirituality",
"Politics & Government",
"Personal Development & Human Resources & Career",
"Natural Science & Formal Science & Technology",
"Law & Justice",
"Health & Wellness & Medicine",
"Food & Drink & Cooking",
"Environment",
"Entertainment & Travel & Hobby",
"Education",
"Daily Life & Home & Lifestyle",
"Culture & Cultural geography",
"Business & Economics & Finance",
"Arts"
],
"width": [
3.192987467475025,
3.256401260367881,
2.059270656744132,
3.2623413672665764,
3.4332213994095637,
3.184122448481296,
2.718956481479841,
3.2133479527407944,
2.675923289583647,
3.1140861341121004,
3.18235214298119,
2.64155309428444,
2.727249994708017,
2.398892457664013,
3.9119670737225767,
2.418408296500489,
3.0585330664563504
],
"color": [
[
1.0,
0.4980392156862745,
0.054901960784313725,
1.0
],
[
1.0,
0.7333333333333333,
0.47058823529411764,
1.0
],
[
0.17254901960784313,
0.6274509803921569,
0.17254901960784313,
1.0
],
[
0.596078431372549,
0.8745098039215686,
0.5411764705882353,
1.0
],
[
0.8392156862745098,
0.15294117647058825,
0.1568627450980392,
1.0
],
[
1.0,
0.596078431372549,
0.5882352941176471,
1.0
],
[
0.5803921568627451,
0.403921568627451,
0.7411764705882353,
1.0
],
[
0.7725490196078432,
0.6901960784313725,
0.8352941176470589,
1.0
],
[
0.5490196078431373,
0.33725490196078434,
0.29411764705882354,
1.0
],
[
0.7686274509803922,
0.611764705882353,
0.5803921568627451,
1.0
],
[
0.8901960784313725,
0.4666666666666667,
0.7607843137254902,
1.0
],
[
0.9686274509803922,
0.7137254901960784,
0.8235294117647058,
1.0
],
[
0.4980392156862745,
0.4980392156862745,
0.4980392156862745,
1.0
],
[
0.7803921568627451,
0.7803921568627451,
0.7803921568627451,
1.0
],
[
0.7372549019607844,
0.7411764705882353,
0.13333333333333333,
1.0
],
[
0.8588235294117647,
0.8588235294117647,
0.5529411764705883,
1.0
],
[
0.09019607843137255,
0.7450980392156863,
0.8117647058823529,
1.0
]
]
},
"x_label": "Metrics",
"subplots_adjust": {
"left": 0.37,
"right": 0.98
},
"comment": "In average, Culture & Cultural geography related documents are duplicated across a higher number of common crawl dumps. Duplication of Shopping & Commodity appears in less dumps than others."
}
],
[
"Number of Year Duplication",
{
"type": "barh",
"kwargs": {
"y": [
"Sports",
"Society & Social Issues & Human Rights",
"Shopping & Commodity",
"Religion & Spirituality",
"Politics & Government",
"Personal Development & Human Resources & Career",
"Natural Science & Formal Science & Technology",
"Law & Justice",
"Health & Wellness & Medicine",
"Food & Drink & Cooking",
"Environment",
"Entertainment & Travel & Hobby",
"Education",
"Daily Life & Home & Lifestyle",
"Culture & Cultural geography",
"Business & Economics & Finance",
"Arts"
],
"width": [
1.5100191962794944,
1.5532559399946482,
1.286292351995755,
1.5752399851340482,
1.583747083613243,
1.5616323218510795,
1.4345138525025949,
1.5322301884314566,
1.4292428061644298,
1.514165600483515,
1.5310395693955543,
1.420375959699989,
1.4311913003590877,
1.3601090413935268,
1.6918824453573502,
1.372621704109578,
1.5093211072952033
],
"color": [
[
1.0,
0.4980392156862745,
0.054901960784313725,
1.0
],
[
1.0,
0.7333333333333333,
0.47058823529411764,
1.0
],
[
0.17254901960784313,
0.6274509803921569,
0.17254901960784313,
1.0
],
[
0.596078431372549,
0.8745098039215686,
0.5411764705882353,
1.0
],
[
0.8392156862745098,
0.15294117647058825,
0.1568627450980392,
1.0
],
[
1.0,
0.596078431372549,
0.5882352941176471,
1.0
],
[
0.5803921568627451,
0.403921568627451,
0.7411764705882353,
1.0
],
[
0.7725490196078432,
0.6901960784313725,
0.8352941176470589,
1.0
],
[
0.5490196078431373,
0.33725490196078434,
0.29411764705882354,
1.0
],
[
0.7686274509803922,
0.611764705882353,
0.5803921568627451,
1.0
],
[
0.8901960784313725,
0.4666666666666667,
0.7607843137254902,
1.0
],
[
0.9686274509803922,
0.7137254901960784,
0.8235294117647058,
1.0
],
[
0.4980392156862745,
0.4980392156862745,
0.4980392156862745,
1.0
],
[
0.7803921568627451,
0.7803921568627451,
0.7803921568627451,
1.0
],
[
0.7372549019607844,
0.7411764705882353,
0.13333333333333333,
1.0
],
[
0.8588235294117647,
0.8588235294117647,
0.5529411764705883,
1.0
],
[
0.09019607843137255,
0.7450980392156863,
0.8117647058823529,
1.0
]
]
},
"x_label": "Metrics",
"subplots_adjust": {
"left": 0.37,
"right": 0.98
},
"comment": "In average, Culture & Cultural geography related documents are duplicated across more years than other topics."
}
],
[
"Maximum Span of Year Duplication",
{
"type": "barh",
"kwargs": {
"y": [
"Sports",
"Society & Social Issues & Human Rights",
"Shopping & Commodity",
"Religion & Spirituality",
"Politics & Government",
"Personal Development & Human Resources & Career",
"Natural Science & Formal Science & Technology",
"Law & Justice",
"Health & Wellness & Medicine",
"Food & Drink & Cooking",
"Environment",
"Entertainment & Travel & Hobby",
"Education",
"Daily Life & Home & Lifestyle",
"Culture & Cultural geography",
"Business & Economics & Finance",
"Arts"
],
"width": [
1.6027068922144314,
1.6463481331778262,
1.326032869589871,
1.6654945962421066,
1.7014688495688364,
1.6394770282842483,
1.5158579087803534,
1.622844253125358,
1.4877078793090759,
1.5888212812321287,
1.6172138231869126,
1.4887702131010017,
1.492519460686565,
1.41459098414941,
1.822498749230073,
1.4288667063218525,
1.592900066967436
],
"color": [
[
1.0,
0.4980392156862745,
0.054901960784313725,
1.0
],
[
1.0,
0.7333333333333333,
0.47058823529411764,
1.0
],
[
0.17254901960784313,
0.6274509803921569,
0.17254901960784313,
1.0
],
[
0.596078431372549,
0.8745098039215686,
0.5411764705882353,
1.0
],
[
0.8392156862745098,
0.15294117647058825,
0.1568627450980392,
1.0
],
[
1.0,
0.596078431372549,
0.5882352941176471,
1.0
],
[
0.5803921568627451,
0.403921568627451,
0.7411764705882353,
1.0
],
[
0.7725490196078432,
0.6901960784313725,
0.8352941176470589,
1.0
],
[
0.5490196078431373,
0.33725490196078434,
0.29411764705882354,
1.0
],
[
0.7686274509803922,
0.611764705882353,
0.5803921568627451,
1.0
],
[
0.8901960784313725,
0.4666666666666667,
0.7607843137254902,
1.0
],
[
0.9686274509803922,
0.7137254901960784,
0.8235294117647058,
1.0
],
[
0.4980392156862745,
0.4980392156862745,
0.4980392156862745,
1.0
],
[
0.7803921568627451,
0.7803921568627451,
0.7803921568627451,
1.0
],
[
0.7372549019607844,
0.7411764705882353,
0.13333333333333333,
1.0
],
[
0.8588235294117647,
0.8588235294117647,
0.5529411764705883,
1.0
],
[
0.09019607843137255,
0.7450980392156863,
0.8117647058823529,
1.0
]
]
},
"x_label": "Metrics",
"subplots_adjust": {
"left": 0.37,
"right": 0.98
},
"comment": "In average, Culture & Cultural geography related documents are duplicated across a wider span of years."
}
],
[
"Language Score",
{
"type": "barh",
"kwargs": {
"y": [
"Sports",
"Society & Social Issues & Human Rights",
"Shopping & Commodity",
"Religion & Spirituality",
"Politics & Government",
"Personal Development & Human Resources & Career",
"Natural Science & Formal Science & Technology",
"Law & Justice",
"Health & Wellness & Medicine",
"Food & Drink & Cooking",
"Environment",
"Entertainment & Travel & Hobby",
"Education",
"Daily Life & Home & Lifestyle",
"Culture & Cultural geography",
"Business & Economics & Finance",
"Arts"
],
"width": [
0.940489806219048,
0.9341014072001546,
0.8833469805761418,
0.9326888501156927,
0.9414304493962583,
0.9514325652491805,
0.8825959914278214,
0.9474163424125213,
0.9228861253995115,
0.9051492112749342,
0.9259433469236898,
0.9106329146251756,
0.9205018098890236,
0.8984234924235204,
0.922120043098531,
0.9144863004649139,
0.9163656720680041
],
"color": [
[
1.0,
0.4980392156862745,
0.054901960784313725,
1.0
],
[
1.0,
0.7333333333333333,
0.47058823529411764,
1.0
],
[
0.17254901960784313,
0.6274509803921569,
0.17254901960784313,
1.0
],
[
0.596078431372549,
0.8745098039215686,
0.5411764705882353,
1.0
],
[
0.8392156862745098,
0.15294117647058825,
0.1568627450980392,
1.0
],
[
1.0,
0.596078431372549,
0.5882352941176471,
1.0
],
[
0.5803921568627451,
0.403921568627451,
0.7411764705882353,
1.0
],
[
0.7725490196078432,
0.6901960784313725,
0.8352941176470589,
1.0
],
[
0.5490196078431373,
0.33725490196078434,
0.29411764705882354,
1.0
],
[
0.7686274509803922,
0.611764705882353,
0.5803921568627451,
1.0
],
[
0.8901960784313725,
0.4666666666666667,
0.7607843137254902,
1.0
],
[
0.9686274509803922,
0.7137254901960784,
0.8235294117647058,
1.0
],
[
0.4980392156862745,
0.4980392156862745,
0.4980392156862745,
1.0
],
[
0.7803921568627451,
0.7803921568627451,
0.7803921568627451,
1.0
],
[
0.7372549019607844,
0.7411764705882353,
0.13333333333333333,
1.0
],
[
0.8588235294117647,
0.8588235294117647,
0.5529411764705883,
1.0
],
[
0.09019607843137255,
0.7450980392156863,
0.8117647058823529,
1.0
]
]
},
"x_label": "Metrics",
"subplots_adjust": {
"left": 0.37,
"right": 0.98
},
"comment": "Average language scores of different topic groups are mostly consistent. No significant differences are obeserved."
}
],
[
"Fraction of Duplicate Lines",
{
"type": "barh",
"kwargs": {
"y": [
"Sports",
"Society & Social Issues & Human Rights",
"Shopping & Commodity",
"Religion & Spirituality",
"Politics & Government",
"Personal Development & Human Resources & Career",
"Natural Science & Formal Science & Technology",
"Law & Justice",
"Health & Wellness & Medicine",
"Food & Drink & Cooking",
"Environment",
"Entertainment & Travel & Hobby",
"Education",
"Daily Life & Home & Lifestyle",
"Culture & Cultural geography",
"Business & Economics & Finance",
"Arts"
],
"width": [
0.012600997185367828,
0.01188115899050692,
0.019039766660027862,
0.0124898927764339,
0.011646428662052831,
0.010610017211082174,
0.0159476139009855,
0.012597314331886177,
0.015094734040349217,
0.014975673115722092,
0.012534733023571196,
0.01610487136667016,
0.015238474263765327,
0.01591887690664154,
0.01433554300372473,
0.015517507810570494,
0.015401894047378658
],
"color": [
[
1.0,
0.4980392156862745,
0.054901960784313725,
1.0
],
[
1.0,
0.7333333333333333,
0.47058823529411764,
1.0
],
[
0.17254901960784313,
0.6274509803921569,
0.17254901960784313,
1.0
],
[
0.596078431372549,
0.8745098039215686,
0.5411764705882353,
1.0
],
[
0.8392156862745098,
0.15294117647058825,
0.1568627450980392,
1.0
],
[
1.0,
0.596078431372549,
0.5882352941176471,
1.0
],
[
0.5803921568627451,
0.403921568627451,
0.7411764705882353,
1.0
],
[
0.7725490196078432,
0.6901960784313725,
0.8352941176470589,
1.0
],
[
0.5490196078431373,
0.33725490196078434,
0.29411764705882354,
1.0
],
[
0.7686274509803922,
0.611764705882353,
0.5803921568627451,
1.0
],
[
0.8901960784313725,
0.4666666666666667,
0.7607843137254902,
1.0
],
[
0.9686274509803922,
0.7137254901960784,
0.8235294117647058,
1.0
],
[
0.4980392156862745,
0.4980392156862745,
0.4980392156862745,
1.0
],
[
0.7803921568627451,
0.7803921568627451,
0.7803921568627451,
1.0
],
[
0.7372549019607844,
0.7411764705882353,
0.13333333333333333,
1.0
],
[
0.8588235294117647,
0.8588235294117647,
0.5529411764705883,
1.0
],
[
0.09019607843137255,
0.7450980392156863,
0.8117647058823529,
1.0
]
]
},
"x_label": "Metrics",
"subplots_adjust": {
"left": 0.37,
"right": 0.98
},
"comment": "In average, Shopping & Commodity has a larger fraction of duplicate lines than others."
}
],
[
"Fraction of Characters in Duplicate Lines",
{
"type": "barh",
"kwargs": {
"y": [
"Sports",
"Society & Social Issues & Human Rights",
"Shopping & Commodity",
"Religion & Spirituality",
"Politics & Government",
"Personal Development & Human Resources & Career",
"Natural Science & Formal Science & Technology",
"Law & Justice",
"Health & Wellness & Medicine",
"Food & Drink & Cooking",
"Environment",
"Entertainment & Travel & Hobby",
"Education",
"Daily Life & Home & Lifestyle",
"Culture & Cultural geography",
"Business & Economics & Finance",
"Arts"
],
"width": [
0.004938858081287135,
0.004205415457276368,
0.007552357256944613,
0.004582746666516553,
0.0044753076683352235,
0.003940081675446834,
0.006179645047614322,
0.0050437770645133185,
0.005686946797304247,
0.005994693646977406,
0.0046510979989690445,
0.006342709242367984,
0.005829011670104205,
0.006381457735701225,
0.005068730018848793,
0.005971977053954138,
0.005856182489324971
],
"color": [
[
1.0,
0.4980392156862745,
0.054901960784313725,
1.0
],
[
1.0,
0.7333333333333333,
0.47058823529411764,
1.0
],
[
0.17254901960784313,
0.6274509803921569,
0.17254901960784313,
1.0
],
[
0.596078431372549,
0.8745098039215686,
0.5411764705882353,
1.0
],
[
0.8392156862745098,
0.15294117647058825,
0.1568627450980392,
1.0
],
[
1.0,
0.596078431372549,
0.5882352941176471,
1.0
],
[
0.5803921568627451,
0.403921568627451,
0.7411764705882353,
1.0
],
[
0.7725490196078432,
0.6901960784313725,
0.8352941176470589,
1.0
],
[
0.5490196078431373,
0.33725490196078434,
0.29411764705882354,
1.0
],
[
0.7686274509803922,
0.611764705882353,
0.5803921568627451,
1.0
],
[
0.8901960784313725,
0.4666666666666667,
0.7607843137254902,
1.0
],
[
0.9686274509803922,
0.7137254901960784,
0.8235294117647058,
1.0
],
[
0.4980392156862745,
0.4980392156862745,
0.4980392156862745,
1.0
],
[
0.7803921568627451,
0.7803921568627451,
0.7803921568627451,
1.0
],
[
0.7372549019607844,
0.7411764705882353,
0.13333333333333333,
1.0
],
[
0.8588235294117647,
0.8588235294117647,
0.5529411764705883,
1.0
],
[
0.09019607843137255,
0.7450980392156863,
0.8117647058823529,
1.0
]
]
},
"x_label": "Metrics",
"subplots_adjust": {
"left": 0.37,
"right": 0.98
},
"comment": "Shopping & Commodity usually has a larger fraction of characters in duplicate lines than others."
}
],
[
"Fraction of Characters in Most Common Bigram",
{
"type": "barh",
"kwargs": {
"y": [
"Sports",
"Society & Social Issues & Human Rights",
"Shopping & Commodity",
"Religion & Spirituality",
"Politics & Government",
"Personal Development & Human Resources & Career",
"Natural Science & Formal Science & Technology",
"Law & Justice",
"Health & Wellness & Medicine",
"Food & Drink & Cooking",
"Environment",
"Entertainment & Travel & Hobby",
"Education",
"Daily Life & Home & Lifestyle",
"Culture & Cultural geography",
"Business & Economics & Finance",
"Arts"
],
"width": [
0.026404273123764796,
0.025634991525195914,
0.03614706934891397,
0.026733992014165146,
0.02377814829063671,
0.019649114365205896,
0.03137691450183766,
0.0270495750357038,
0.027673178183087933,
0.029942339233414595,
0.027350265679715224,
0.030526314564882247,
0.030614040432541026,
0.03509742016691783,
0.027540083176404263,
0.029519105783701725,
0.028834560229748462
],
"color": [
[
1.0,
0.4980392156862745,
0.054901960784313725,
1.0
],
[
1.0,
0.7333333333333333,
0.47058823529411764,
1.0
],
[
0.17254901960784313,
0.6274509803921569,
0.17254901960784313,
1.0
],
[
0.596078431372549,
0.8745098039215686,
0.5411764705882353,
1.0
],
[
0.8392156862745098,
0.15294117647058825,
0.1568627450980392,
1.0
],
[
1.0,
0.596078431372549,
0.5882352941176471,
1.0
],
[
0.5803921568627451,
0.403921568627451,
0.7411764705882353,
1.0
],
[
0.7725490196078432,
0.6901960784313725,
0.8352941176470589,
1.0
],
[
0.5490196078431373,
0.33725490196078434,
0.29411764705882354,
1.0
],
[
0.7686274509803922,
0.611764705882353,
0.5803921568627451,
1.0
],
[
0.8901960784313725,
0.4666666666666667,
0.7607843137254902,
1.0
],
[
0.9686274509803922,
0.7137254901960784,
0.8235294117647058,
1.0
],
[
0.4980392156862745,
0.4980392156862745,
0.4980392156862745,
1.0
],
[
0.7803921568627451,
0.7803921568627451,
0.7803921568627451,
1.0
],
[
0.7372549019607844,
0.7411764705882353,
0.13333333333333333,
1.0
],
[
0.8588235294117647,
0.8588235294117647,
0.5529411764705883,
1.0
],
[
0.09019607843137255,
0.7450980392156863,
0.8117647058823529,
1.0
]
]
},
"x_label": "Metrics",
"subplots_adjust": {
"left": 0.37,
"right": 0.98
}
}
],
[
"Fraction of Characters in Most Common 3-gram",
{
"type": "barh",
"kwargs": {
"y": [
"Sports",
"Society & Social Issues & Human Rights",
"Shopping & Commodity",
"Religion & Spirituality",
"Politics & Government",
"Personal Development & Human Resources & Career",
"Natural Science & Formal Science & Technology",
"Law & Justice",
"Health & Wellness & Medicine",
"Food & Drink & Cooking",
"Environment",
"Entertainment & Travel & Hobby",
"Education",
"Daily Life & Home & Lifestyle",
"Culture & Cultural geography",
"Business & Economics & Finance",
"Arts"
],
"width": [
0.026165437832849362,
0.026251631875192687,
0.03811434473394529,
0.027814565987299922,
0.02458896408514931,
0.020185288853227328,
0.031209395373852387,
0.027345772022684685,
0.026970288190643604,
0.030974020712503342,
0.027787662286871063,
0.03143649261443422,
0.030952890587447934,
0.035409395984874435,
0.028486665111510972,
0.029371087024795153,
0.030651728333515618
],
"color": [
[
1.0,
0.4980392156862745,
0.054901960784313725,
1.0
],
[
1.0,
0.7333333333333333,
0.47058823529411764,
1.0
],
[
0.17254901960784313,
0.6274509803921569,
0.17254901960784313,
1.0
],
[
0.596078431372549,
0.8745098039215686,
0.5411764705882353,
1.0
],
[
0.8392156862745098,
0.15294117647058825,
0.1568627450980392,
1.0
],
[
1.0,
0.596078431372549,
0.5882352941176471,
1.0
],
[
0.5803921568627451,
0.403921568627451,
0.7411764705882353,
1.0
],
[
0.7725490196078432,
0.6901960784313725,
0.8352941176470589,
1.0
],
[
0.5490196078431373,
0.33725490196078434,
0.29411764705882354,
1.0
],
[
0.7686274509803922,
0.611764705882353,
0.5803921568627451,
1.0
],
[
0.8901960784313725,
0.4666666666666667,
0.7607843137254902,
1.0
],
[
0.9686274509803922,
0.7137254901960784,
0.8235294117647058,
1.0
],
[
0.4980392156862745,
0.4980392156862745,
0.4980392156862745,
1.0
],
[
0.7803921568627451,
0.7803921568627451,
0.7803921568627451,
1.0
],
[
0.7372549019607844,
0.7411764705882353,
0.13333333333333333,
1.0
],
[
0.8588235294117647,
0.8588235294117647,
0.5529411764705883,
1.0
],
[
0.09019607843137255,
0.7450980392156863,
0.8117647058823529,
1.0
]
]
},
"x_label": "Metrics",
"subplots_adjust": {
"left": 0.37,
"right": 0.98
}
}
],
[
"Fraction of Characters in Most Common 4-gram",
{
"type": "barh",
"kwargs": {
"y": [
"Sports",
"Society & Social Issues & Human Rights",
"Shopping & Commodity",
"Religion & Spirituality",
"Politics & Government",
"Personal Development & Human Resources & Career",
"Natural Science & Formal Science & Technology",
"Law & Justice",
"Health & Wellness & Medicine",
"Food & Drink & Cooking",
"Environment",
"Entertainment & Travel & Hobby",
"Education",
"Daily Life & Home & Lifestyle",
"Culture & Cultural geography",
"Business & Economics & Finance",
"Arts"
],
"width": [
0.026986818387229945,
0.027131471441621514,
0.04096520359337113,
0.02892589321318727,
0.025567532544329325,
0.02099809142740805,
0.03184294279840072,
0.027798282452682368,
0.027173562606014456,
0.03262575837410923,
0.028962796310066586,
0.03275942719001153,
0.03150508247840716,
0.03652360609065789,
0.029967601450189167,
0.029689817413511087,
0.032849409839897196
],
"color": [
[
1.0,
0.4980392156862745,
0.054901960784313725,
1.0
],
[
1.0,
0.7333333333333333,
0.47058823529411764,
1.0
],
[
0.17254901960784313,
0.6274509803921569,
0.17254901960784313,
1.0
],
[
0.596078431372549,
0.8745098039215686,
0.5411764705882353,
1.0
],
[
0.8392156862745098,
0.15294117647058825,
0.1568627450980392,
1.0
],
[
1.0,
0.596078431372549,
0.5882352941176471,
1.0
],
[
0.5803921568627451,
0.403921568627451,
0.7411764705882353,
1.0
],
[
0.7725490196078432,
0.6901960784313725,
0.8352941176470589,
1.0
],
[
0.5490196078431373,
0.33725490196078434,
0.29411764705882354,
1.0
],
[
0.7686274509803922,
0.611764705882353,
0.5803921568627451,
1.0
],
[
0.8901960784313725,
0.4666666666666667,
0.7607843137254902,
1.0
],
[
0.9686274509803922,
0.7137254901960784,
0.8235294117647058,
1.0
],
[
0.4980392156862745,
0.4980392156862745,
0.4980392156862745,
1.0
],
[
0.7803921568627451,
0.7803921568627451,
0.7803921568627451,
1.0
],
[
0.7372549019607844,
0.7411764705882353,
0.13333333333333333,
1.0
],
[
0.8588235294117647,
0.8588235294117647,
0.5529411764705883,
1.0
],
[
0.09019607843137255,
0.7450980392156863,
0.8117647058823529,
1.0
]
]
},
"x_label": "Metrics",
"subplots_adjust": {
"left": 0.37,
"right": 0.98
}
}
],
[
"Fraction of Characters in Duplicate 5-grams",
{
"type": "barh",
"kwargs": {
"y": [
"Sports",
"Society & Social Issues & Human Rights",
"Shopping & Commodity",
"Religion & Spirituality",
"Politics & Government",
"Personal Development & Human Resources & Career",
"Natural Science & Formal Science & Technology",
"Law & Justice",
"Health & Wellness & Medicine",
"Food & Drink & Cooking",
"Environment",
"Entertainment & Travel & Hobby",
"Education",
"Daily Life & Home & Lifestyle",
"Culture & Cultural geography",
"Business & Economics & Finance",
"Arts"
],
"width": [
0.017172412387336047,
0.018706184899018846,
0.020367380081370155,
0.01912555365329135,
0.017726236260087368,
0.014196179855798982,
0.026453126704962582,
0.02113702754713442,
0.022750991771259714,
0.017193015331520775,
0.018542560337896252,
0.019254844137973823,
0.022992720412462874,
0.02142410388811584,
0.019425070816460523,
0.021273316544081922,
0.016721728689196018
],
"color": [
[
1.0,
0.4980392156862745,
0.054901960784313725,
1.0
],
[
1.0,
0.7333333333333333,
0.47058823529411764,
1.0
],
[
0.17254901960784313,
0.6274509803921569,
0.17254901960784313,
1.0
],
[
0.596078431372549,
0.8745098039215686,
0.5411764705882353,
1.0
],
[
0.8392156862745098,
0.15294117647058825,
0.1568627450980392,
1.0
],
[
1.0,
0.596078431372549,
0.5882352941176471,
1.0
],
[
0.5803921568627451,
0.403921568627451,
0.7411764705882353,
1.0
],
[
0.7725490196078432,
0.6901960784313725,
0.8352941176470589,
1.0
],
[
0.5490196078431373,
0.33725490196078434,
0.29411764705882354,
1.0
],
[
0.7686274509803922,
0.611764705882353,
0.5803921568627451,
1.0
],
[
0.8901960784313725,
0.4666666666666667,
0.7607843137254902,
1.0
],
[
0.9686274509803922,
0.7137254901960784,
0.8235294117647058,
1.0
],
[
0.4980392156862745,
0.4980392156862745,
0.4980392156862745,
1.0
],
[
0.7803921568627451,
0.7803921568627451,
0.7803921568627451,
1.0
],
[
0.7372549019607844,
0.7411764705882353,
0.13333333333333333,
1.0
],
[
0.8588235294117647,
0.8588235294117647,
0.5529411764705883,
1.0
],
[
0.09019607843137255,
0.7450980392156863,
0.8117647058823529,
1.0
]
]
},
"x_label": "Metrics",
"subplots_adjust": {
"left": 0.37,
"right": 0.98
}
}
],
[
"Fraction of Characters in Duplicate 6-grams",
{
"type": "barh",
"kwargs": {
"y": [
"Sports",
"Society & Social Issues & Human Rights",
"Shopping & Commodity",
"Religion & Spirituality",
"Politics & Government",
"Personal Development & Human Resources & Career",
"Natural Science & Formal Science & Technology",
"Law & Justice",
"Health & Wellness & Medicine",
"Food & Drink & Cooking",
"Environment",
"Entertainment & Travel & Hobby",
"Education",
"Daily Life & Home & Lifestyle",
"Culture & Cultural geography",
"Business & Economics & Finance",
"Arts"
],
"width": [
0.011970605317388902,
0.013247085951653395,
0.014520268748996269,
0.013346559736076019,
0.012676990872510209,
0.009987587475557972,
0.019267682560495096,
0.014804574416538653,
0.01599622943881697,
0.012044805932442022,
0.013103480807140754,
0.013565938336254593,
0.015985684726478346,
0.014952398432378033,
0.014038548162484649,
0.01499523284606334,
0.01191912942692566
],
"color": [
[
1.0,
0.4980392156862745,
0.054901960784313725,
1.0
],
[
1.0,
0.7333333333333333,
0.47058823529411764,
1.0
],
[
0.17254901960784313,
0.6274509803921569,
0.17254901960784313,
1.0
],
[
0.596078431372549,
0.8745098039215686,
0.5411764705882353,
1.0
],
[
0.8392156862745098,
0.15294117647058825,
0.1568627450980392,
1.0
],
[
1.0,
0.596078431372549,
0.5882352941176471,
1.0
],
[
0.5803921568627451,
0.403921568627451,
0.7411764705882353,
1.0
],
[
0.7725490196078432,
0.6901960784313725,
0.8352941176470589,
1.0
],
[
0.5490196078431373,
0.33725490196078434,
0.29411764705882354,
1.0
],
[
0.7686274509803922,
0.611764705882353,
0.5803921568627451,
1.0
],
[
0.8901960784313725,
0.4666666666666667,
0.7607843137254902,
1.0
],
[
0.9686274509803922,
0.7137254901960784,
0.8235294117647058,
1.0
],
[
0.4980392156862745,
0.4980392156862745,
0.4980392156862745,
1.0
],
[
0.7803921568627451,
0.7803921568627451,
0.7803921568627451,
1.0
],
[
0.7372549019607844,
0.7411764705882353,
0.13333333333333333,
1.0
],
[
0.8588235294117647,
0.8588235294117647,
0.5529411764705883,
1.0
],
[
0.09019607843137255,
0.7450980392156863,
0.8117647058823529,
1.0
]
]
},
"x_label": "Metrics",
"subplots_adjust": {
"left": 0.37,
"right": 0.98
}
}
],
[
"Fraction of Characters in Duplicate 7-grams",
{
"type": "barh",
"kwargs": {
"y": [
"Sports",
"Society & Social Issues & Human Rights",
"Shopping & Commodity",
"Religion & Spirituality",
"Politics & Government",
"Personal Development & Human Resources & Career",
"Natural Science & Formal Science & Technology",
"Law & Justice",
"Health & Wellness & Medicine",
"Food & Drink & Cooking",
"Environment",
"Entertainment & Travel & Hobby",
"Education",
"Daily Life & Home & Lifestyle",
"Culture & Cultural geography",
"Business & Economics & Finance",
"Arts"
],
"width": [
0.009039855940169648,
0.010073325457247666,
0.010696329689399887,
0.009986471165253209,
0.009836605907258674,
0.007629540534323915,
0.014862665767002146,
0.011114675308487159,
0.011994499294618745,
0.00907677139620476,
0.009828884274472392,
0.010105882592285087,
0.011911397850057279,
0.011114070775684791,
0.010705940851157975,
0.011289206913404862,
0.009020791758628242
],
"color": [
[
1.0,
0.4980392156862745,
0.054901960784313725,
1.0
],
[
1.0,
0.7333333333333333,
0.47058823529411764,
1.0
],
[
0.17254901960784313,
0.6274509803921569,
0.17254901960784313,
1.0
],
[
0.596078431372549,
0.8745098039215686,
0.5411764705882353,
1.0
],
[
0.8392156862745098,
0.15294117647058825,
0.1568627450980392,
1.0
],
[
1.0,
0.596078431372549,
0.5882352941176471,
1.0
],
[
0.5803921568627451,
0.403921568627451,
0.7411764705882353,
1.0
],
[
0.7725490196078432,
0.6901960784313725,
0.8352941176470589,
1.0
],
[
0.5490196078431373,
0.33725490196078434,
0.29411764705882354,
1.0
],
[
0.7686274509803922,
0.611764705882353,
0.5803921568627451,
1.0
],
[
0.8901960784313725,
0.4666666666666667,
0.7607843137254902,
1.0
],
[
0.9686274509803922,
0.7137254901960784,
0.8235294117647058,
1.0
],
[
0.4980392156862745,
0.4980392156862745,
0.4980392156862745,
1.0
],
[
0.7803921568627451,
0.7803921568627451,
0.7803921568627451,
1.0
],
[
0.7372549019607844,
0.7411764705882353,
0.13333333333333333,
1.0
],
[
0.8588235294117647,
0.8588235294117647,
0.5529411764705883,
1.0
],
[
0.09019607843137255,
0.7450980392156863,
0.8117647058823529,
1.0
]
]
},
"x_label": "Metrics",
"subplots_adjust": {
"left": 0.37,
"right": 0.98
}
}
],
[
"Fraction of Characters in Duplicate 8-grams",
{
"type": "barh",
"kwargs": {
"y": [
"Sports",
"Society & Social Issues & Human Rights",
"Shopping & Commodity",
"Religion & Spirituality",
"Politics & Government",
"Personal Development & Human Resources & Career",
"Natural Science & Formal Science & Technology",
"Law & Justice",
"Health & Wellness & Medicine",
"Food & Drink & Cooking",
"Environment",
"Entertainment & Travel & Hobby",
"Education",
"Daily Life & Home & Lifestyle",
"Culture & Cultural geography",
"Business & Economics & Finance",
"Arts"
],
"width": [
0.007195970331280814,
0.008036785010913946,
0.008249465083499188,
0.007850453980445486,
0.007992494679173406,
0.006182227979778321,
0.011985128922160077,
0.008779317883993009,
0.009467708596743243,
0.0072104637314519765,
0.007673480063984403,
0.007904073310509803,
0.00934269422506397,
0.008657166799636231,
0.008485033120385031,
0.00893995298657962,
0.0070980585511939785
],
"color": [
[
1.0,
0.4980392156862745,
0.054901960784313725,
1.0
],
[
1.0,
0.7333333333333333,
0.47058823529411764,
1.0
],
[
0.17254901960784313,
0.6274509803921569,
0.17254901960784313,
1.0
],
[
0.596078431372549,
0.8745098039215686,
0.5411764705882353,
1.0
],
[
0.8392156862745098,
0.15294117647058825,
0.1568627450980392,
1.0
],
[
1.0,
0.596078431372549,
0.5882352941176471,
1.0
],
[
0.5803921568627451,
0.403921568627451,
0.7411764705882353,
1.0
],
[
0.7725490196078432,
0.6901960784313725,
0.8352941176470589,
1.0
],
[
0.5490196078431373,
0.33725490196078434,
0.29411764705882354,
1.0
],
[
0.7686274509803922,
0.611764705882353,
0.5803921568627451,
1.0
],
[
0.8901960784313725,
0.4666666666666667,
0.7607843137254902,
1.0
],
[
0.9686274509803922,
0.7137254901960784,
0.8235294117647058,
1.0
],
[
0.4980392156862745,
0.4980392156862745,
0.4980392156862745,
1.0
],
[
0.7803921568627451,
0.7803921568627451,
0.7803921568627451,
1.0
],
[
0.7372549019607844,
0.7411764705882353,
0.13333333333333333,
1.0
],
[
0.8588235294117647,
0.8588235294117647,
0.5529411764705883,
1.0
],
[
0.09019607843137255,
0.7450980392156863,
0.8117647058823529,
1.0
]
]
},
"x_label": "Metrics",
"subplots_adjust": {
"left": 0.37,
"right": 0.98
}
}
],
[
"Fraction of Characters in Duplicate 9-grams",
{
"type": "barh",
"kwargs": {
"y": [
"Sports",
"Society & Social Issues & Human Rights",
"Shopping & Commodity",
"Religion & Spirituality",
"Politics & Government",
"Personal Development & Human Resources & Career",
"Natural Science & Formal Science & Technology",
"Law & Justice",
"Health & Wellness & Medicine",
"Food & Drink & Cooking",
"Environment",
"Entertainment & Travel & Hobby",
"Education",
"Daily Life & Home & Lifestyle",
"Culture & Cultural geography",
"Business & Economics & Finance",
"Arts"
],
"width": [
0.006007801998269901,
0.0066553421544330435,
0.006529544209051376,
0.006431532192110704,
0.006749247730086534,
0.0052160144431644155,
0.009999607112669078,
0.007230897967718682,
0.007732210045591141,
0.005943279041721623,
0.006205408840055294,
0.006469113514028088,
0.007626168747361047,
0.006984803950948357,
0.006992627523875565,
0.007390774121782952,
0.005766236221861412
],
"color": [
[
1.0,
0.4980392156862745,
0.054901960784313725,
1.0
],
[
1.0,
0.7333333333333333,
0.47058823529411764,
1.0
],
[
0.17254901960784313,
0.6274509803921569,
0.17254901960784313,
1.0
],
[
0.596078431372549,
0.8745098039215686,
0.5411764705882353,
1.0
],
[
0.8392156862745098,
0.15294117647058825,
0.1568627450980392,
1.0
],
[
1.0,
0.596078431372549,
0.5882352941176471,
1.0
],
[
0.5803921568627451,
0.403921568627451,
0.7411764705882353,
1.0
],
[
0.7725490196078432,
0.6901960784313725,
0.8352941176470589,
1.0
],
[
0.5490196078431373,
0.33725490196078434,
0.29411764705882354,
1.0
],
[
0.7686274509803922,
0.611764705882353,
0.5803921568627451,
1.0
],
[
0.8901960784313725,
0.4666666666666667,
0.7607843137254902,
1.0
],
[
0.9686274509803922,
0.7137254901960784,
0.8235294117647058,
1.0
],
[
0.4980392156862745,
0.4980392156862745,
0.4980392156862745,
1.0
],
[
0.7803921568627451,
0.7803921568627451,
0.7803921568627451,
1.0
],
[
0.7372549019607844,
0.7411764705882353,
0.13333333333333333,
1.0
],
[
0.8588235294117647,
0.8588235294117647,
0.5529411764705883,
1.0
],
[
0.09019607843137255,
0.7450980392156863,
0.8117647058823529,
1.0
]
]
},
"x_label": "Metrics",
"subplots_adjust": {
"left": 0.37,
"right": 0.98
}
}
],
[
"Fraction of Characters in Duplicate 10-grams",
{
"type": "barh",
"kwargs": {
"y": [
"Sports",
"Society & Social Issues & Human Rights",
"Shopping & Commodity",
"Religion & Spirituality",
"Politics & Government",
"Personal Development & Human Resources & Career",
"Natural Science & Formal Science & Technology",
"Law & Justice",
"Health & Wellness & Medicine",
"Food & Drink & Cooking",
"Environment",
"Entertainment & Travel & Hobby",
"Education",
"Daily Life & Home & Lifestyle",
"Culture & Cultural geography",
"Business & Economics & Finance",
"Arts"
],
"width": [
0.0051710981353331464,
0.005682713875756294,
0.005363391346741175,
0.005464661863661183,
0.005846752796603754,
0.0045485380381742845,
0.00859601801316329,
0.006136039855302813,
0.0065548495409889435,
0.005091836417990565,
0.005250172827665216,
0.005493532455475418,
0.0064690246645603714,
0.00581859783771439,
0.005913783298441542,
0.006306495977016168,
0.004847123500711834
],
"color": [
[
1.0,
0.4980392156862745,
0.054901960784313725,
1.0
],
[
1.0,
0.7333333333333333,
0.47058823529411764,
1.0
],
[
0.17254901960784313,
0.6274509803921569,
0.17254901960784313,
1.0
],
[
0.596078431372549,
0.8745098039215686,
0.5411764705882353,
1.0
],
[
0.8392156862745098,
0.15294117647058825,
0.1568627450980392,
1.0
],
[
1.0,
0.596078431372549,
0.5882352941176471,
1.0
],
[
0.5803921568627451,
0.403921568627451,
0.7411764705882353,
1.0
],
[
0.7725490196078432,
0.6901960784313725,
0.8352941176470589,
1.0
],
[
0.5490196078431373,
0.33725490196078434,
0.29411764705882354,
1.0
],
[
0.7686274509803922,
0.611764705882353,
0.5803921568627451,
1.0
],
[
0.8901960784313725,
0.4666666666666667,
0.7607843137254902,
1.0
],
[
0.9686274509803922,
0.7137254901960784,
0.8235294117647058,
1.0
],
[
0.4980392156862745,
0.4980392156862745,
0.4980392156862745,
1.0
],
[
0.7803921568627451,
0.7803921568627451,
0.7803921568627451,
1.0
],
[
0.7372549019607844,
0.7411764705882353,
0.13333333333333333,
1.0
],
[
0.8588235294117647,
0.8588235294117647,
0.5529411764705883,
1.0
],
[
0.09019607843137255,
0.7450980392156863,
0.8117647058823529,
1.0
]
]
},
"x_label": "Metrics",
"subplots_adjust": {
"left": 0.37,
"right": 0.98
}
}
],
[
"Number of Document of Each Topic in Duplication Bucket 1-1",
{
"type": "pie",
"kwargs": {
"x": [
132249226,
47101525,
108551234,
42778158,
106867576,
52904902,
254436283,
65155001,
160648797,
96650903,
67150855,
409977727,
110689452,
314681138,
6571908,
632953103,
144137883
],
"labels": [
"Sports",
"Society & Social Issues & Human Rights",
"Shopping & Commodity",
"Religion & Spirituality",
"Politics & Government",
"Personal Development & Human Resources & Career",
"Natural Science & Formal Science & Technology",
"Law & Justice",
"Health & Wellness & Medicine",
"Food & Drink & Cooking",
"Environment",
"Entertainment & Travel & Hobby",
"Education",
"Daily Life & Home & Lifestyle",
"Culture & Cultural geography",
"Business & Economics & Finance",
"Arts"
],
"autopct": "%1.1f%%",
"colors": [
[
1.0,
0.4980392156862745,
0.054901960784313725,
1.0
],
[
1.0,
0.7333333333333333,
0.47058823529411764,
1.0
],
[
0.17254901960784313,
0.6274509803921569,
0.17254901960784313,
1.0
],
[
0.596078431372549,
0.8745098039215686,
0.5411764705882353,
1.0
],
[
0.8392156862745098,
0.15294117647058825,
0.1568627450980392,
1.0
],
[
1.0,
0.596078431372549,
0.5882352941176471,
1.0
],
[
0.5803921568627451,
0.403921568627451,
0.7411764705882353,
1.0
],
[
0.7725490196078432,
0.6901960784313725,
0.8352941176470589,
1.0
],
[
0.5490196078431373,
0.33725490196078434,
0.29411764705882354,
1.0
],
[
0.7686274509803922,
0.611764705882353,
0.5803921568627451,
1.0
],
[
0.8901960784313725,
0.4666666666666667,
0.7607843137254902,
1.0
],
[
0.9686274509803922,
0.7137254901960784,
0.8235294117647058,
1.0
],
[
0.4980392156862745,
0.4980392156862745,
0.4980392156862745,
1.0
],
[
0.7803921568627451,
0.7803921568627451,
0.7803921568627451,
1.0
],
[
0.7372549019607844,
0.7411764705882353,
0.13333333333333333,
1.0
],
[
0.8588235294117647,
0.8588235294117647,
0.5529411764705883,
1.0
],
[
0.09019607843137255,
0.7450980392156863,
0.8117647058823529,
1.0
]
],
"pctdistance": 1.2,
"labeldistance": 1.5
}
}
],
[
"Number of Document of Each Topic in Duplication Bucket 2-5",
{
"type": "pie",
"kwargs": {
"x": [
104341527,
43192514,
75077276,
41770802,
83866134,
49842746,
190845342,
53891858,
130879713,
77844628,
54343851,
284749532,
98624858,
215519319,
6476944,
461711335,
117251313
],
"labels": [
"Sports",
"Society & Social Issues & Human Rights",
"Shopping & Commodity",
"Religion & Spirituality",
"Politics & Government",
"Personal Development & Human Resources & Career",
"Natural Science & Formal Science & Technology",
"Law & Justice",
"Health & Wellness & Medicine",
"Food & Drink & Cooking",
"Environment",
"Entertainment & Travel & Hobby",
"Education",
"Daily Life & Home & Lifestyle",
"Culture & Cultural geography",
"Business & Economics & Finance",
"Arts"
],
"autopct": "%1.1f%%",
"colors": [
[
1.0,
0.4980392156862745,
0.054901960784313725,
1.0
],
[
1.0,
0.7333333333333333,
0.47058823529411764,
1.0
],
[
0.17254901960784313,
0.6274509803921569,
0.17254901960784313,
1.0
],
[
0.596078431372549,
0.8745098039215686,
0.5411764705882353,
1.0
],
[
0.8392156862745098,
0.15294117647058825,
0.1568627450980392,
1.0
],
[
1.0,
0.596078431372549,
0.5882352941176471,
1.0
],
[
0.5803921568627451,
0.403921568627451,
0.7411764705882353,
1.0
],
[
0.7725490196078432,
0.6901960784313725,
0.8352941176470589,
1.0
],
[
0.5490196078431373,
0.33725490196078434,
0.29411764705882354,
1.0
],
[
0.7686274509803922,
0.611764705882353,
0.5803921568627451,
1.0
],
[
0.8901960784313725,
0.4666666666666667,
0.7607843137254902,
1.0
],
[
0.9686274509803922,
0.7137254901960784,
0.8235294117647058,
1.0
],
[
0.4980392156862745,
0.4980392156862745,
0.4980392156862745,
1.0
],
[
0.7803921568627451,
0.7803921568627451,
0.7803921568627451,
1.0
],
[
0.7372549019607844,
0.7411764705882353,
0.13333333333333333,
1.0
],
[
0.8588235294117647,
0.8588235294117647,
0.5529411764705883,
1.0
],
[
0.09019607843137255,
0.7450980392156863,
0.8117647058823529,
1.0
]
],
"pctdistance": 1.2,
"labeldistance": 1.5
}
}
],
[
"Number of Document of Each Topic in Duplication Bucket 6-10",
{
"type": "pie",
"kwargs": {
"x": [
38443961,
16376927,
24393877,
18121006,
33219823,
22183319,
61039668,
20168641,
46703585,
32303976,
22128963,
101878274,
33435189,
76066340,
2712172,
150381514,
48762315
],
"labels": [
"Sports",
"Society & Social Issues & Human Rights",
"Shopping & Commodity",
"Religion & Spirituality",
"Politics & Government",
"Personal Development & Human Resources & Career",
"Natural Science & Formal Science & Technology",
"Law & Justice",
"Health & Wellness & Medicine",
"Food & Drink & Cooking",
"Environment",
"Entertainment & Travel & Hobby",
"Education",
"Daily Life & Home & Lifestyle",
"Culture & Cultural geography",
"Business & Economics & Finance",
"Arts"
],
"autopct": "%1.1f%%",
"colors": [
[
1.0,
0.4980392156862745,
0.054901960784313725,
1.0
],
[
1.0,
0.7333333333333333,
0.47058823529411764,
1.0
],
[
0.17254901960784313,
0.6274509803921569,
0.17254901960784313,
1.0
],
[
0.596078431372549,
0.8745098039215686,
0.5411764705882353,
1.0
],
[
0.8392156862745098,
0.15294117647058825,
0.1568627450980392,
1.0
],
[
1.0,
0.596078431372549,
0.5882352941176471,
1.0
],
[
0.5803921568627451,
0.403921568627451,
0.7411764705882353,
1.0
],
[
0.7725490196078432,
0.6901960784313725,
0.8352941176470589,
1.0
],
[
0.5490196078431373,
0.33725490196078434,
0.29411764705882354,
1.0
],
[
0.7686274509803922,
0.611764705882353,
0.5803921568627451,
1.0
],
[
0.8901960784313725,
0.4666666666666667,
0.7607843137254902,
1.0
],
[
0.9686274509803922,
0.7137254901960784,
0.8235294117647058,
1.0
],
[
0.4980392156862745,
0.4980392156862745,
0.4980392156862745,
1.0
],
[
0.7803921568627451,
0.7803921568627451,
0.7803921568627451,
1.0
],
[
0.7372549019607844,
0.7411764705882353,
0.13333333333333333,
1.0
],
[
0.8588235294117647,
0.8588235294117647,
0.5529411764705883,
1.0
],
[
0.09019607843137255,
0.7450980392156863,
0.8117647058823529,
1.0
]
],
"pctdistance": 1.2,
"labeldistance": 1.5
}
}
],
[
"Number of Document of Each Topic in Duplication Bucket 11-100",
{
"type": "pie",
"kwargs": {
"x": [
47907124,
19868534,
24683580,
19990446,
42239133,
23300618,
73293979,
25410832,
51086693,
36943522,
25974730,
115373947,
37581053,
78644079,
3587872,
162190029,
54743888
],
"labels": [
"Sports",
"Society & Social Issues & Human Rights",
"Shopping & Commodity",
"Religion & Spirituality",
"Politics & Government",
"Personal Development & Human Resources & Career",
"Natural Science & Formal Science & Technology",
"Law & Justice",
"Health & Wellness & Medicine",
"Food & Drink & Cooking",
"Environment",
"Entertainment & Travel & Hobby",
"Education",
"Daily Life & Home & Lifestyle",
"Culture & Cultural geography",
"Business & Economics & Finance",
"Arts"
],
"autopct": "%1.1f%%",
"colors": [
[
1.0,
0.4980392156862745,
0.054901960784313725,
1.0
],
[
1.0,
0.7333333333333333,
0.47058823529411764,
1.0
],
[
0.17254901960784313,
0.6274509803921569,
0.17254901960784313,
1.0
],
[
0.596078431372549,
0.8745098039215686,
0.5411764705882353,
1.0
],
[
0.8392156862745098,
0.15294117647058825,
0.1568627450980392,
1.0
],
[
1.0,
0.596078431372549,
0.5882352941176471,
1.0
],
[
0.5803921568627451,
0.403921568627451,
0.7411764705882353,
1.0
],
[
0.7725490196078432,
0.6901960784313725,
0.8352941176470589,
1.0
],
[
0.5490196078431373,
0.33725490196078434,
0.29411764705882354,
1.0
],
[
0.7686274509803922,
0.611764705882353,
0.5803921568627451,
1.0
],
[
0.8901960784313725,
0.4666666666666667,
0.7607843137254902,
1.0
],
[
0.9686274509803922,
0.7137254901960784,
0.8235294117647058,
1.0
],
[
0.4980392156862745,
0.4980392156862745,
0.4980392156862745,
1.0
],
[
0.7803921568627451,
0.7803921568627451,
0.7803921568627451,
1.0
],
[
0.7372549019607844,
0.7411764705882353,
0.13333333333333333,
1.0
],
[
0.8588235294117647,
0.8588235294117647,
0.5529411764705883,
1.0
],
[
0.09019607843137255,
0.7450980392156863,
0.8117647058823529,
1.0
]
],
"pctdistance": 1.2,
"labeldistance": 1.5
}
}
],
[
"Number of Document of Each Topic in Duplication Bucket 101-1000",
{
"type": "pie",
"kwargs": {
"x": [
1879583,
484835,
792159,
425638,
1281577,
350055,
2198979,
744933,
1141913,
825604,
669062,
2659131,
922347,
1892942,
106472,
3840419,
1189540
],
"labels": [
"Sports",
"Society & Social Issues & Human Rights",
"Shopping & Commodity",
"Religion & Spirituality",
"Politics & Government",
"Personal Development & Human Resources & Career",
"Natural Science & Formal Science & Technology",
"Law & Justice",
"Health & Wellness & Medicine",
"Food & Drink & Cooking",
"Environment",
"Entertainment & Travel & Hobby",
"Education",
"Daily Life & Home & Lifestyle",
"Culture & Cultural geography",
"Business & Economics & Finance",
"Arts"
],
"autopct": "%1.1f%%",
"colors": [
[
1.0,
0.4980392156862745,
0.054901960784313725,
1.0
],
[
1.0,
0.7333333333333333,
0.47058823529411764,
1.0
],
[
0.17254901960784313,
0.6274509803921569,
0.17254901960784313,
1.0
],
[
0.596078431372549,
0.8745098039215686,
0.5411764705882353,
1.0
],
[
0.8392156862745098,
0.15294117647058825,
0.1568627450980392,
1.0
],
[
1.0,
0.596078431372549,
0.5882352941176471,
1.0
],
[
0.5803921568627451,
0.403921568627451,
0.7411764705882353,
1.0
],
[
0.7725490196078432,
0.6901960784313725,
0.8352941176470589,
1.0
],
[
0.5490196078431373,
0.33725490196078434,
0.29411764705882354,
1.0
],
[
0.7686274509803922,
0.611764705882353,
0.5803921568627451,
1.0
],
[
0.8901960784313725,
0.4666666666666667,
0.7607843137254902,
1.0
],
[
0.9686274509803922,
0.7137254901960784,
0.8235294117647058,
1.0
],
[
0.4980392156862745,
0.4980392156862745,
0.4980392156862745,
1.0
],
[
0.7803921568627451,
0.7803921568627451,
0.7803921568627451,
1.0
],
[
0.7372549019607844,
0.7411764705882353,
0.13333333333333333,
1.0
],
[
0.8588235294117647,
0.8588235294117647,
0.5529411764705883,
1.0
],
[
0.09019607843137255,
0.7450980392156863,
0.8117647058823529,
1.0
]
],
"pctdistance": 1.2,
"labeldistance": 1.5
}
}
],
[
"Number of Document of Each Topic in Duplication Bucket 1001-30000000",
{
"type": "pie",
"kwargs": {
"x": [
31632,
8734,
32929,
8658,
22857,
6434,
57396,
16195,
31926,
20363,
13735,
58310,
21607,
67081,
2647,
145502,
31810
],
"labels": [
"Sports",
"Society & Social Issues & Human Rights",
"Shopping & Commodity",
"Religion & Spirituality",
"Politics & Government",
"Personal Development & Human Resources & Career",
"Natural Science & Formal Science & Technology",
"Law & Justice",
"Health & Wellness & Medicine",
"Food & Drink & Cooking",
"Environment",
"Entertainment & Travel & Hobby",
"Education",
"Daily Life & Home & Lifestyle",
"Culture & Cultural geography",
"Business & Economics & Finance",
"Arts"
],
"autopct": "%1.1f%%",
"colors": [
[
1.0,
0.4980392156862745,
0.054901960784313725,
1.0
],
[
1.0,
0.7333333333333333,
0.47058823529411764,
1.0
],
[
0.17254901960784313,
0.6274509803921569,
0.17254901960784313,
1.0
],
[
0.596078431372549,
0.8745098039215686,
0.5411764705882353,
1.0
],
[
0.8392156862745098,
0.15294117647058825,
0.1568627450980392,
1.0
],
[
1.0,
0.596078431372549,
0.5882352941176471,
1.0
],
[
0.5803921568627451,
0.403921568627451,
0.7411764705882353,
1.0
],
[
0.7725490196078432,
0.6901960784313725,
0.8352941176470589,
1.0
],
[
0.5490196078431373,
0.33725490196078434,
0.29411764705882354,
1.0
],
[
0.7686274509803922,
0.611764705882353,
0.5803921568627451,
1.0
],
[
0.8901960784313725,
0.4666666666666667,
0.7607843137254902,
1.0
],
[
0.9686274509803922,
0.7137254901960784,
0.8235294117647058,
1.0
],
[
0.4980392156862745,
0.4980392156862745,
0.4980392156862745,
1.0
],
[
0.7803921568627451,
0.7803921568627451,
0.7803921568627451,
1.0
],
[
0.7372549019607844,
0.7411764705882353,
0.13333333333333333,
1.0
],
[
0.8588235294117647,
0.8588235294117647,
0.5529411764705883,
1.0
],
[
0.09019607843137255,
0.7450980392156863,
0.8117647058823529,
1.0
]
],
"pctdistance": 1.2,
"labeldistance": 1.5
}
}
]
]