TxT360 / data /topic_charts.json
mylibrar's picture
topic-analysis (#1)
a711d2f verified
raw
history blame
158 kB
[
[
"Number of Document of Each Topic",
{
"type": "pie",
"kwargs": {
"x": [
535838,
206990,
368022,
200460,
435310,
250450,
933732,
271801,
639890,
387594,
271359,
1473798,
459519,
1101903,
31659,
2254859,
591041
],
"labels": [
"Sports",
"Society & Social Issues & Human Rights",
"Shopping & Commodity",
"Religion & Spirituality",
"Politics & Government",
"Personal Development & Human Resources & Career",
"Natural Science & Formal Science & Technology",
"Law & Justice",
"Health & Wellness & Medicine",
"Food & Drink & Cooking",
"Environment",
"Entertainment & Travel & Hobby",
"Education",
"Daily Life & Home & Lifestyle",
"Culture & Cultural geography",
"Business & Economics & Finance",
"Arts"
],
"autopct": "%1.1f%%",
"colors": [
[
1.0,
0.4980392156862745,
0.054901960784313725,
1.0
],
[
1.0,
0.7333333333333333,
0.47058823529411764,
1.0
],
[
0.17254901960784313,
0.6274509803921569,
0.17254901960784313,
1.0
],
[
0.596078431372549,
0.8745098039215686,
0.5411764705882353,
1.0
],
[
0.8392156862745098,
0.15294117647058825,
0.1568627450980392,
1.0
],
[
1.0,
0.596078431372549,
0.5882352941176471,
1.0
],
[
0.5803921568627451,
0.403921568627451,
0.7411764705882353,
1.0
],
[
0.7725490196078432,
0.6901960784313725,
0.8352941176470589,
1.0
],
[
0.5490196078431373,
0.33725490196078434,
0.29411764705882354,
1.0
],
[
0.7686274509803922,
0.611764705882353,
0.5803921568627451,
1.0
],
[
0.8901960784313725,
0.4666666666666667,
0.7607843137254902,
1.0
],
[
0.9686274509803922,
0.7137254901960784,
0.8235294117647058,
1.0
],
[
0.4980392156862745,
0.4980392156862745,
0.4980392156862745,
1.0
],
[
0.7803921568627451,
0.7803921568627451,
0.7803921568627451,
1.0
],
[
0.7372549019607844,
0.7411764705882353,
0.13333333333333333,
1.0
],
[
0.8588235294117647,
0.8588235294117647,
0.5529411764705883,
1.0
],
[
0.09019607843137255,
0.7450980392156863,
0.8117647058823529,
1.0
]
],
"pctdistance": 1.2,
"labeldistance": 1.5
},
"comment": "As shown in the graph above, over 20% of the documents are related to Business & Economics & Finance, which makes it the largest topic group in dataset. On the contrary, the group of Culture & Cultural geography contains the smallest number of documents among all topics."
}
],
[
"Fraction of Words Corrected in Lines",
{
"type": "barh",
"kwargs": {
"y": [
"Sports",
"Society & Social Issues & Human Rights",
"Shopping & Commodity",
"Religion & Spirituality",
"Politics & Government",
"Personal Development & Human Resources & Career",
"Natural Science & Formal Science & Technology",
"Law & Justice",
"Health & Wellness & Medicine",
"Food & Drink & Cooking",
"Environment",
"Entertainment & Travel & Hobby",
"Education",
"Daily Life & Home & Lifestyle",
"Culture & Cultural geography",
"Business & Economics & Finance",
"Arts"
],
"width": [
0.005599317351029421,
0.005491440909735792,
0.010611897213357221,
0.0061721529486005915,
0.005040363960665401,
0.0042498218252128035,
0.008174887952855342,
0.005098232906967347,
0.005905725848762689,
0.008048438948020924,
0.005920233062429675,
0.00738773833987446,
0.006788916830535338,
0.007824824620615435,
0.007817009319252808,
0.006894261391191716,
0.007759051322619051
],
"color": [
[
1.0,
0.4980392156862745,
0.054901960784313725,
1.0
],
[
1.0,
0.7333333333333333,
0.47058823529411764,
1.0
],
[
0.17254901960784313,
0.6274509803921569,
0.17254901960784313,
1.0
],
[
0.596078431372549,
0.8745098039215686,
0.5411764705882353,
1.0
],
[
0.8392156862745098,
0.15294117647058825,
0.1568627450980392,
1.0
],
[
1.0,
0.596078431372549,
0.5882352941176471,
1.0
],
[
0.5803921568627451,
0.403921568627451,
0.7411764705882353,
1.0
],
[
0.7725490196078432,
0.6901960784313725,
0.8352941176470589,
1.0
],
[
0.5490196078431373,
0.33725490196078434,
0.29411764705882354,
1.0
],
[
0.7686274509803922,
0.611764705882353,
0.5803921568627451,
1.0
],
[
0.8901960784313725,
0.4666666666666667,
0.7607843137254902,
1.0
],
[
0.9686274509803922,
0.7137254901960784,
0.8235294117647058,
1.0
],
[
0.4980392156862745,
0.4980392156862745,
0.4980392156862745,
1.0
],
[
0.7803921568627451,
0.7803921568627451,
0.7803921568627451,
1.0
],
[
0.7372549019607844,
0.7411764705882353,
0.13333333333333333,
1.0
],
[
0.8588235294117647,
0.8588235294117647,
0.5529411764705883,
1.0
],
[
0.09019607843137255,
0.7450980392156863,
0.8117647058823529,
1.0
]
]
},
"x_label": "Metrics",
"subplots_adjust": {
"left": 0.37,
"right": 0.98
},
"comment": "In average, documents related to Shopping & Commodity have larger fraction of words corrected in lines."
}
],
[
"Fraction of Lines Ending with Ellipsis",
{
"type": "barh",
"kwargs": {
"y": [
"Sports",
"Society & Social Issues & Human Rights",
"Shopping & Commodity",
"Religion & Spirituality",
"Politics & Government",
"Personal Development & Human Resources & Career",
"Natural Science & Formal Science & Technology",
"Law & Justice",
"Health & Wellness & Medicine",
"Food & Drink & Cooking",
"Environment",
"Entertainment & Travel & Hobby",
"Education",
"Daily Life & Home & Lifestyle",
"Culture & Cultural geography",
"Business & Economics & Finance",
"Arts"
],
"width": [
0.013608683903284204,
0.01187771888948645,
0.010704198151112872,
0.013181499370177098,
0.012342863597933462,
0.01669603038717465,
0.013958760786106517,
0.011481605295821474,
0.011727508302172751,
0.013890752469918237,
0.012950109439490815,
0.015828153615401713,
0.011233498318616135,
0.013063106813702607,
0.013101045053120094,
0.012854514904197168,
0.014225441730661032
],
"color": [
[
1.0,
0.4980392156862745,
0.054901960784313725,
1.0
],
[
1.0,
0.7333333333333333,
0.47058823529411764,
1.0
],
[
0.17254901960784313,
0.6274509803921569,
0.17254901960784313,
1.0
],
[
0.596078431372549,
0.8745098039215686,
0.5411764705882353,
1.0
],
[
0.8392156862745098,
0.15294117647058825,
0.1568627450980392,
1.0
],
[
1.0,
0.596078431372549,
0.5882352941176471,
1.0
],
[
0.5803921568627451,
0.403921568627451,
0.7411764705882353,
1.0
],
[
0.7725490196078432,
0.6901960784313725,
0.8352941176470589,
1.0
],
[
0.5490196078431373,
0.33725490196078434,
0.29411764705882354,
1.0
],
[
0.7686274509803922,
0.611764705882353,
0.5803921568627451,
1.0
],
[
0.8901960784313725,
0.4666666666666667,
0.7607843137254902,
1.0
],
[
0.9686274509803922,
0.7137254901960784,
0.8235294117647058,
1.0
],
[
0.4980392156862745,
0.4980392156862745,
0.4980392156862745,
1.0
],
[
0.7803921568627451,
0.7803921568627451,
0.7803921568627451,
1.0
],
[
0.7372549019607844,
0.7411764705882353,
0.13333333333333333,
1.0
],
[
0.8588235294117647,
0.8588235294117647,
0.5529411764705883,
1.0
],
[
0.09019607843137255,
0.7450980392156863,
0.8117647058823529,
1.0
]
]
},
"x_label": "Metrics",
"subplots_adjust": {
"left": 0.37,
"right": 0.98
},
"comment": "Compared with other topics, Personal Development & Human Resources & Career in average contain more lines ending with ellipsis."
}
],
[
"Fraction of Lines Starting with Bullet Point",
{
"type": "barh",
"kwargs": {
"y": [
"Sports",
"Society & Social Issues & Human Rights",
"Shopping & Commodity",
"Religion & Spirituality",
"Politics & Government",
"Personal Development & Human Resources & Career",
"Natural Science & Formal Science & Technology",
"Law & Justice",
"Health & Wellness & Medicine",
"Food & Drink & Cooking",
"Environment",
"Entertainment & Travel & Hobby",
"Education",
"Daily Life & Home & Lifestyle",
"Culture & Cultural geography",
"Business & Economics & Finance",
"Arts"
],
"width": [
0.05958846605103529,
0.06540916901994907,
0.10871161367473074,
0.057639202535687495,
0.05391125998418046,
0.048856823399157104,
0.0919025139411848,
0.06361059519326412,
0.08348033701472354,
0.09887120370776314,
0.0654760782941809,
0.07275273301463199,
0.08648053868877607,
0.0728023788334523,
0.059507615068158916,
0.08230576538579888,
0.06015758928408362
],
"color": [
[
1.0,
0.4980392156862745,
0.054901960784313725,
1.0
],
[
1.0,
0.7333333333333333,
0.47058823529411764,
1.0
],
[
0.17254901960784313,
0.6274509803921569,
0.17254901960784313,
1.0
],
[
0.596078431372549,
0.8745098039215686,
0.5411764705882353,
1.0
],
[
0.8392156862745098,
0.15294117647058825,
0.1568627450980392,
1.0
],
[
1.0,
0.596078431372549,
0.5882352941176471,
1.0
],
[
0.5803921568627451,
0.403921568627451,
0.7411764705882353,
1.0
],
[
0.7725490196078432,
0.6901960784313725,
0.8352941176470589,
1.0
],
[
0.5490196078431373,
0.33725490196078434,
0.29411764705882354,
1.0
],
[
0.7686274509803922,
0.611764705882353,
0.5803921568627451,
1.0
],
[
0.8901960784313725,
0.4666666666666667,
0.7607843137254902,
1.0
],
[
0.9686274509803922,
0.7137254901960784,
0.8235294117647058,
1.0
],
[
0.4980392156862745,
0.4980392156862745,
0.4980392156862745,
1.0
],
[
0.7803921568627451,
0.7803921568627451,
0.7803921568627451,
1.0
],
[
0.7372549019607844,
0.7411764705882353,
0.13333333333333333,
1.0
],
[
0.8588235294117647,
0.8588235294117647,
0.5529411764705883,
1.0
],
[
0.09019607843137255,
0.7450980392156863,
0.8117647058823529,
1.0
]
]
},
"x_label": "Metrics",
"subplots_adjust": {
"left": 0.37,
"right": 0.98
},
"comment": "Shopping & Commodity related documents have higher percentage of lines starting with bullet point."
}
],
[
"Number of Lines with Toxic Words",
{
"type": "barh",
"kwargs": {
"y": [
"Sports",
"Society & Social Issues & Human Rights",
"Shopping & Commodity",
"Religion & Spirituality",
"Politics & Government",
"Personal Development & Human Resources & Career",
"Natural Science & Formal Science & Technology",
"Law & Justice",
"Health & Wellness & Medicine",
"Food & Drink & Cooking",
"Environment",
"Entertainment & Travel & Hobby",
"Education",
"Daily Life & Home & Lifestyle",
"Culture & Cultural geography",
"Business & Economics & Finance",
"Arts"
],
"width": [
0.18993986988604764,
0.8879124595391081,
0.25990565781393504,
0.26195250922877383,
0.25880866508924677,
1.059369135555999,
0.13686689542609656,
0.41953855946078195,
0.8275813030364594,
0.15215921815095176,
0.13490615752563948,
0.7103062970637767,
0.10924031432867846,
0.983178192635831,
0.1341482674752835,
0.14871528552339636,
0.44260888838506973
],
"color": [
[
1.0,
0.4980392156862745,
0.054901960784313725,
1.0
],
[
1.0,
0.7333333333333333,
0.47058823529411764,
1.0
],
[
0.17254901960784313,
0.6274509803921569,
0.17254901960784313,
1.0
],
[
0.596078431372549,
0.8745098039215686,
0.5411764705882353,
1.0
],
[
0.8392156862745098,
0.15294117647058825,
0.1568627450980392,
1.0
],
[
1.0,
0.596078431372549,
0.5882352941176471,
1.0
],
[
0.5803921568627451,
0.403921568627451,
0.7411764705882353,
1.0
],
[
0.7725490196078432,
0.6901960784313725,
0.8352941176470589,
1.0
],
[
0.5490196078431373,
0.33725490196078434,
0.29411764705882354,
1.0
],
[
0.7686274509803922,
0.611764705882353,
0.5803921568627451,
1.0
],
[
0.8901960784313725,
0.4666666666666667,
0.7607843137254902,
1.0
],
[
0.9686274509803922,
0.7137254901960784,
0.8235294117647058,
1.0
],
[
0.4980392156862745,
0.4980392156862745,
0.4980392156862745,
1.0
],
[
0.7803921568627451,
0.7803921568627451,
0.7803921568627451,
1.0
],
[
0.7372549019607844,
0.7411764705882353,
0.13333333333333333,
1.0
],
[
0.8588235294117647,
0.8588235294117647,
0.5529411764705883,
1.0
],
[
0.09019607843137255,
0.7450980392156863,
0.8117647058823529,
1.0
]
]
},
"x_label": "Metrics",
"subplots_adjust": {
"left": 0.37,
"right": 0.98
},
"comment": "Personal Development & Human Resources & Career in average has more lines with toxic words."
}
],
[
"Number of Toxic Words",
{
"type": "barh",
"kwargs": {
"y": [
"Sports",
"Society & Social Issues & Human Rights",
"Shopping & Commodity",
"Religion & Spirituality",
"Politics & Government",
"Personal Development & Human Resources & Career",
"Natural Science & Formal Science & Technology",
"Law & Justice",
"Health & Wellness & Medicine",
"Food & Drink & Cooking",
"Environment",
"Entertainment & Travel & Hobby",
"Education",
"Daily Life & Home & Lifestyle",
"Culture & Cultural geography",
"Business & Economics & Finance",
"Arts"
],
"width": [
0.2548288848495254,
1.5926663123822407,
0.4235181592404802,
0.38067444876783396,
0.32550136684202063,
2.0770772609303254,
0.20720185235163838,
0.590086129190106,
1.571774836299989,
0.20227609302517582,
0.18648727331689754,
1.453566228207665,
0.15104924932374938,
2.337839174591593,
0.20351242932499447,
0.24778267732040007,
0.7902395942075084
],
"color": [
[
1.0,
0.4980392156862745,
0.054901960784313725,
1.0
],
[
1.0,
0.7333333333333333,
0.47058823529411764,
1.0
],
[
0.17254901960784313,
0.6274509803921569,
0.17254901960784313,
1.0
],
[
0.596078431372549,
0.8745098039215686,
0.5411764705882353,
1.0
],
[
0.8392156862745098,
0.15294117647058825,
0.1568627450980392,
1.0
],
[
1.0,
0.596078431372549,
0.5882352941176471,
1.0
],
[
0.5803921568627451,
0.403921568627451,
0.7411764705882353,
1.0
],
[
0.7725490196078432,
0.6901960784313725,
0.8352941176470589,
1.0
],
[
0.5490196078431373,
0.33725490196078434,
0.29411764705882354,
1.0
],
[
0.7686274509803922,
0.611764705882353,
0.5803921568627451,
1.0
],
[
0.8901960784313725,
0.4666666666666667,
0.7607843137254902,
1.0
],
[
0.9686274509803922,
0.7137254901960784,
0.8235294117647058,
1.0
],
[
0.4980392156862745,
0.4980392156862745,
0.4980392156862745,
1.0
],
[
0.7803921568627451,
0.7803921568627451,
0.7803921568627451,
1.0
],
[
0.7372549019607844,
0.7411764705882353,
0.13333333333333333,
1.0
],
[
0.8588235294117647,
0.8588235294117647,
0.5529411764705883,
1.0
],
[
0.09019607843137255,
0.7450980392156863,
0.8117647058823529,
1.0
]
]
},
"x_label": "Metrics",
"subplots_adjust": {
"left": 0.37,
"right": 0.98
},
"comment": "Daily Life & Home & Lifestyle in average has more toxic words."
}
],
[
"Word Count",
{
"type": "barh",
"kwargs": {
"y": [
"Sports",
"Society & Social Issues & Human Rights",
"Shopping & Commodity",
"Religion & Spirituality",
"Politics & Government",
"Personal Development & Human Resources & Career",
"Natural Science & Formal Science & Technology",
"Law & Justice",
"Health & Wellness & Medicine",
"Food & Drink & Cooking",
"Environment",
"Entertainment & Travel & Hobby",
"Education",
"Daily Life & Home & Lifestyle",
"Culture & Cultural geography",
"Business & Economics & Finance",
"Arts"
],
"width": [
524.2469683001206,
634.8099570027538,
332.5969724636027,
654.5120023944927,
634.4970021364086,
747.0358714314234,
624.2853688210322,
570.2685052667209,
746.3173279782462,
427.6056492102561,
603.5602799243807,
470.1159928294108,
559.1577497339609,
450.07929463845727,
682.6580435263274,
559.7302638435485,
514.1515901604118
],
"color": [
[
1.0,
0.4980392156862745,
0.054901960784313725,
1.0
],
[
1.0,
0.7333333333333333,
0.47058823529411764,
1.0
],
[
0.17254901960784313,
0.6274509803921569,
0.17254901960784313,
1.0
],
[
0.596078431372549,
0.8745098039215686,
0.5411764705882353,
1.0
],
[
0.8392156862745098,
0.15294117647058825,
0.1568627450980392,
1.0
],
[
1.0,
0.596078431372549,
0.5882352941176471,
1.0
],
[
0.5803921568627451,
0.403921568627451,
0.7411764705882353,
1.0
],
[
0.7725490196078432,
0.6901960784313725,
0.8352941176470589,
1.0
],
[
0.5490196078431373,
0.33725490196078434,
0.29411764705882354,
1.0
],
[
0.7686274509803922,
0.611764705882353,
0.5803921568627451,
1.0
],
[
0.8901960784313725,
0.4666666666666667,
0.7607843137254902,
1.0
],
[
0.9686274509803922,
0.7137254901960784,
0.8235294117647058,
1.0
],
[
0.4980392156862745,
0.4980392156862745,
0.4980392156862745,
1.0
],
[
0.7803921568627451,
0.7803921568627451,
0.7803921568627451,
1.0
],
[
0.7372549019607844,
0.7411764705882353,
0.13333333333333333,
1.0
],
[
0.8588235294117647,
0.8588235294117647,
0.5529411764705883,
1.0
],
[
0.09019607843137255,
0.7450980392156863,
0.8117647058823529,
1.0
]
]
},
"x_label": "Metrics",
"subplots_adjust": {
"left": 0.37,
"right": 0.98
},
"comment": "Documents in the topic of Personal Development & Human Resources & Career in average contain more words than other topics."
}
],
[
"Mean Word Length",
{
"type": "barh",
"kwargs": {
"y": [
"Sports",
"Society & Social Issues & Human Rights",
"Shopping & Commodity",
"Religion & Spirituality",
"Politics & Government",
"Personal Development & Human Resources & Career",
"Natural Science & Formal Science & Technology",
"Law & Justice",
"Health & Wellness & Medicine",
"Food & Drink & Cooking",
"Environment",
"Entertainment & Travel & Hobby",
"Education",
"Daily Life & Home & Lifestyle",
"Culture & Cultural geography",
"Business & Economics & Finance",
"Arts"
],
"width": [
4.851116616082301,
5.17314698008811,
4.951553714759433,
4.8636771295932055,
5.165523097115738,
4.64498800138652,
5.233981234962708,
5.094122002544284,
5.191578081429402,
4.872407702558401,
5.077044932121297,
4.911569182027774,
5.25771470252484,
4.990336313339119,
5.138998450653204,
5.165914329275205,
4.943227231080612
],
"color": [
[
1.0,
0.4980392156862745,
0.054901960784313725,
1.0
],
[
1.0,
0.7333333333333333,
0.47058823529411764,
1.0
],
[
0.17254901960784313,
0.6274509803921569,
0.17254901960784313,
1.0
],
[
0.596078431372549,
0.8745098039215686,
0.5411764705882353,
1.0
],
[
0.8392156862745098,
0.15294117647058825,
0.1568627450980392,
1.0
],
[
1.0,
0.596078431372549,
0.5882352941176471,
1.0
],
[
0.5803921568627451,
0.403921568627451,
0.7411764705882353,
1.0
],
[
0.7725490196078432,
0.6901960784313725,
0.8352941176470589,
1.0
],
[
0.5490196078431373,
0.33725490196078434,
0.29411764705882354,
1.0
],
[
0.7686274509803922,
0.611764705882353,
0.5803921568627451,
1.0
],
[
0.8901960784313725,
0.4666666666666667,
0.7607843137254902,
1.0
],
[
0.9686274509803922,
0.7137254901960784,
0.8235294117647058,
1.0
],
[
0.4980392156862745,
0.4980392156862745,
0.4980392156862745,
1.0
],
[
0.7803921568627451,
0.7803921568627451,
0.7803921568627451,
1.0
],
[
0.7372549019607844,
0.7411764705882353,
0.13333333333333333,
1.0
],
[
0.8588235294117647,
0.8588235294117647,
0.5529411764705883,
1.0
],
[
0.09019607843137255,
0.7450980392156863,
0.8117647058823529,
1.0
]
]
},
"x_label": "Metrics",
"subplots_adjust": {
"left": 0.37,
"right": 0.98
},
"comment": "There is no significant variance in the average word length for different topic groups. However, Education related data contain longer words than others in general."
}
],
[
"Number of Sentences",
{
"type": "barh",
"kwargs": {
"y": [
"Sports",
"Society & Social Issues & Human Rights",
"Shopping & Commodity",
"Religion & Spirituality",
"Politics & Government",
"Personal Development & Human Resources & Career",
"Natural Science & Formal Science & Technology",
"Law & Justice",
"Health & Wellness & Medicine",
"Food & Drink & Cooking",
"Environment",
"Entertainment & Travel & Hobby",
"Education",
"Daily Life & Home & Lifestyle",
"Culture & Cultural geography",
"Business & Economics & Finance",
"Arts"
],
"width": [
23.816802839664227,
28.88356925455336,
17.167653020743327,
32.65256909109049,
26.743545978727802,
41.80899580754642,
28.010818950191275,
25.435358957472562,
35.18096235290441,
22.968376703457743,
28.56101327024348,
22.844366731397383,
26.802678452904015,
22.603309910218957,
30.825168198616506,
26.01027691753675,
27.965867004150304
],
"color": [
[
1.0,
0.4980392156862745,
0.054901960784313725,
1.0
],
[
1.0,
0.7333333333333333,
0.47058823529411764,
1.0
],
[
0.17254901960784313,
0.6274509803921569,
0.17254901960784313,
1.0
],
[
0.596078431372549,
0.8745098039215686,
0.5411764705882353,
1.0
],
[
0.8392156862745098,
0.15294117647058825,
0.1568627450980392,
1.0
],
[
1.0,
0.596078431372549,
0.5882352941176471,
1.0
],
[
0.5803921568627451,
0.403921568627451,
0.7411764705882353,
1.0
],
[
0.7725490196078432,
0.6901960784313725,
0.8352941176470589,
1.0
],
[
0.5490196078431373,
0.33725490196078434,
0.29411764705882354,
1.0
],
[
0.7686274509803922,
0.611764705882353,
0.5803921568627451,
1.0
],
[
0.8901960784313725,
0.4666666666666667,
0.7607843137254902,
1.0
],
[
0.9686274509803922,
0.7137254901960784,
0.8235294117647058,
1.0
],
[
0.4980392156862745,
0.4980392156862745,
0.4980392156862745,
1.0
],
[
0.7803921568627451,
0.7803921568627451,
0.7803921568627451,
1.0
],
[
0.7372549019607844,
0.7411764705882353,
0.13333333333333333,
1.0
],
[
0.8588235294117647,
0.8588235294117647,
0.5529411764705883,
1.0
],
[
0.09019607843137255,
0.7450980392156863,
0.8117647058823529,
1.0
]
]
},
"x_label": "Metrics",
"subplots_adjust": {
"left": 0.37,
"right": 0.98
},
"comment": "Documents in the topic of Personal Development & Human Resources & Career usually contain more sentences."
}
],
[
"Symbol to Word Ratio",
{
"type": "barh",
"kwargs": {
"y": [
"Sports",
"Society & Social Issues & Human Rights",
"Shopping & Commodity",
"Religion & Spirituality",
"Politics & Government",
"Personal Development & Human Resources & Career",
"Natural Science & Formal Science & Technology",
"Law & Justice",
"Health & Wellness & Medicine",
"Food & Drink & Cooking",
"Environment",
"Entertainment & Travel & Hobby",
"Education",
"Daily Life & Home & Lifestyle",
"Culture & Cultural geography",
"Business & Economics & Finance",
"Arts"
],
"width": [
0.0029508316364481296,
0.002339527014691741,
0.002746622681352375,
0.0031207893125393786,
0.0024594503072570637,
0.003732116125668388,
0.0029521717963945683,
0.002009846839273012,
0.0023335875319153666,
0.0032912280108721562,
0.0026740153080243275,
0.0037401276658117497,
0.0022685436825723537,
0.0034624173472893424,
0.0022837896768252673,
0.002565854536163215,
0.0035536009817103663
],
"color": [
[
1.0,
0.4980392156862745,
0.054901960784313725,
1.0
],
[
1.0,
0.7333333333333333,
0.47058823529411764,
1.0
],
[
0.17254901960784313,
0.6274509803921569,
0.17254901960784313,
1.0
],
[
0.596078431372549,
0.8745098039215686,
0.5411764705882353,
1.0
],
[
0.8392156862745098,
0.15294117647058825,
0.1568627450980392,
1.0
],
[
1.0,
0.596078431372549,
0.5882352941176471,
1.0
],
[
0.5803921568627451,
0.403921568627451,
0.7411764705882353,
1.0
],
[
0.7725490196078432,
0.6901960784313725,
0.8352941176470589,
1.0
],
[
0.5490196078431373,
0.33725490196078434,
0.29411764705882354,
1.0
],
[
0.7686274509803922,
0.611764705882353,
0.5803921568627451,
1.0
],
[
0.8901960784313725,
0.4666666666666667,
0.7607843137254902,
1.0
],
[
0.9686274509803922,
0.7137254901960784,
0.8235294117647058,
1.0
],
[
0.4980392156862745,
0.4980392156862745,
0.4980392156862745,
1.0
],
[
0.7803921568627451,
0.7803921568627451,
0.7803921568627451,
1.0
],
[
0.7372549019607844,
0.7411764705882353,
0.13333333333333333,
1.0
],
[
0.8588235294117647,
0.8588235294117647,
0.5529411764705883,
1.0
],
[
0.09019607843137255,
0.7450980392156863,
0.8117647058823529,
1.0
]
]
},
"x_label": "Metrics",
"subplots_adjust": {
"left": 0.37,
"right": 0.98
},
"comment": "Documents related to Daily Life & Home & Lifestyle usually have higher percentage of symbols."
}
],
[
"Fraction of Words with Alpha Character",
{
"type": "barh",
"kwargs": {
"y": [
"Sports",
"Society & Social Issues & Human Rights",
"Shopping & Commodity",
"Religion & Spirituality",
"Politics & Government",
"Personal Development & Human Resources & Career",
"Natural Science & Formal Science & Technology",
"Law & Justice",
"Health & Wellness & Medicine",
"Food & Drink & Cooking",
"Environment",
"Entertainment & Travel & Hobby",
"Education",
"Daily Life & Home & Lifestyle",
"Culture & Cultural geography",
"Business & Economics & Finance",
"Arts"
],
"width": [
0.9554513833362817,
0.9672667625084445,
0.945038227724378,
0.9650443058450766,
0.9662993498435797,
0.9795101768513954,
0.949647348401343,
0.9644024275136092,
0.9651040360235426,
0.9515637138100507,
0.9638773263904938,
0.9544175710037947,
0.9602638724414636,
0.9533095901329957,
0.9536863995733356,
0.9573400271816177,
0.9613916720605239
],
"color": [
[
1.0,
0.4980392156862745,
0.054901960784313725,
1.0
],
[
1.0,
0.7333333333333333,
0.47058823529411764,
1.0
],
[
0.17254901960784313,
0.6274509803921569,
0.17254901960784313,
1.0
],
[
0.596078431372549,
0.8745098039215686,
0.5411764705882353,
1.0
],
[
0.8392156862745098,
0.15294117647058825,
0.1568627450980392,
1.0
],
[
1.0,
0.596078431372549,
0.5882352941176471,
1.0
],
[
0.5803921568627451,
0.403921568627451,
0.7411764705882353,
1.0
],
[
0.7725490196078432,
0.6901960784313725,
0.8352941176470589,
1.0
],
[
0.5490196078431373,
0.33725490196078434,
0.29411764705882354,
1.0
],
[
0.7686274509803922,
0.611764705882353,
0.5803921568627451,
1.0
],
[
0.8901960784313725,
0.4666666666666667,
0.7607843137254902,
1.0
],
[
0.9686274509803922,
0.7137254901960784,
0.8235294117647058,
1.0
],
[
0.4980392156862745,
0.4980392156862745,
0.4980392156862745,
1.0
],
[
0.7803921568627451,
0.7803921568627451,
0.7803921568627451,
1.0
],
[
0.7372549019607844,
0.7411764705882353,
0.13333333333333333,
1.0
],
[
0.8588235294117647,
0.8588235294117647,
0.5529411764705883,
1.0
],
[
0.09019607843137255,
0.7450980392156863,
0.8117647058823529,
1.0
]
]
},
"x_label": "Metrics",
"subplots_adjust": {
"left": 0.37,
"right": 0.98
},
"comment": "The fraction of words with alpha character seems to be relatively consistent across different topics."
}
],
[
"Number of Stop Words",
{
"type": "barh",
"kwargs": {
"y": [
"Sports",
"Society & Social Issues & Human Rights",
"Shopping & Commodity",
"Religion & Spirituality",
"Politics & Government",
"Personal Development & Human Resources & Career",
"Natural Science & Formal Science & Technology",
"Law & Justice",
"Health & Wellness & Medicine",
"Food & Drink & Cooking",
"Environment",
"Entertainment & Travel & Hobby",
"Education",
"Daily Life & Home & Lifestyle",
"Culture & Cultural geography",
"Business & Economics & Finance",
"Arts"
],
"width": [
106.35206163056745,
139.14766896951542,
61.915719712408496,
150.11113937942733,
141.82980634490363,
156.21242563385906,
122.75635942647355,
122.98374178167114,
152.60597915266686,
83.42474857711936,
128.65106740517174,
93.49815985637109,
114.57335387655353,
86.25348147704472,
162.30932752139992,
114.21801717978818,
106.4132116046095
],
"color": [
[
1.0,
0.4980392156862745,
0.054901960784313725,
1.0
],
[
1.0,
0.7333333333333333,
0.47058823529411764,
1.0
],
[
0.17254901960784313,
0.6274509803921569,
0.17254901960784313,
1.0
],
[
0.596078431372549,
0.8745098039215686,
0.5411764705882353,
1.0
],
[
0.8392156862745098,
0.15294117647058825,
0.1568627450980392,
1.0
],
[
1.0,
0.596078431372549,
0.5882352941176471,
1.0
],
[
0.5803921568627451,
0.403921568627451,
0.7411764705882353,
1.0
],
[
0.7725490196078432,
0.6901960784313725,
0.8352941176470589,
1.0
],
[
0.5490196078431373,
0.33725490196078434,
0.29411764705882354,
1.0
],
[
0.7686274509803922,
0.611764705882353,
0.5803921568627451,
1.0
],
[
0.8901960784313725,
0.4666666666666667,
0.7607843137254902,
1.0
],
[
0.9686274509803922,
0.7137254901960784,
0.8235294117647058,
1.0
],
[
0.4980392156862745,
0.4980392156862745,
0.4980392156862745,
1.0
],
[
0.7803921568627451,
0.7803921568627451,
0.7803921568627451,
1.0
],
[
0.7372549019607844,
0.7411764705882353,
0.13333333333333333,
1.0
],
[
0.8588235294117647,
0.8588235294117647,
0.5529411764705883,
1.0
],
[
0.09019607843137255,
0.7450980392156863,
0.8117647058823529,
1.0
]
]
},
"x_label": "Metrics",
"subplots_adjust": {
"left": 0.37,
"right": 0.98
},
"comment": "Culture & Cultural geography contains more stop words in average."
}
],
[
"Has Curly Bracket",
{
"type": "barh",
"kwargs": {
"y": [
"Sports",
"Society & Social Issues & Human Rights",
"Shopping & Commodity",
"Religion & Spirituality",
"Politics & Government",
"Personal Development & Human Resources & Career",
"Natural Science & Formal Science & Technology",
"Law & Justice",
"Health & Wellness & Medicine",
"Food & Drink & Cooking",
"Environment",
"Entertainment & Travel & Hobby",
"Education",
"Daily Life & Home & Lifestyle",
"Culture & Cultural geography",
"Business & Economics & Finance",
"Arts"
],
"width": [
0.005337434075224228,
0.0067539494661577855,
0.01028199401122759,
0.009842362566097974,
0.011575658725965403,
0.00931123976841685,
0.02773600990434086,
0.006582021405366427,
0.009203144290424917,
0.01040779785032792,
0.008158933368710822,
0.007557345036429687,
0.010752547772779798,
0.011963847997509762,
0.012824157427587732,
0.009383291815585807,
0.008704979857573332
],
"color": [
[
1.0,
0.4980392156862745,
0.054901960784313725,
1.0
],
[
1.0,
0.7333333333333333,
0.47058823529411764,
1.0
],
[
0.17254901960784313,
0.6274509803921569,
0.17254901960784313,
1.0
],
[
0.596078431372549,
0.8745098039215686,
0.5411764705882353,
1.0
],
[
0.8392156862745098,
0.15294117647058825,
0.1568627450980392,
1.0
],
[
1.0,
0.596078431372549,
0.5882352941176471,
1.0
],
[
0.5803921568627451,
0.403921568627451,
0.7411764705882353,
1.0
],
[
0.7725490196078432,
0.6901960784313725,
0.8352941176470589,
1.0
],
[
0.5490196078431373,
0.33725490196078434,
0.29411764705882354,
1.0
],
[
0.7686274509803922,
0.611764705882353,
0.5803921568627451,
1.0
],
[
0.8901960784313725,
0.4666666666666667,
0.7607843137254902,
1.0
],
[
0.9686274509803922,
0.7137254901960784,
0.8235294117647058,
1.0
],
[
0.4980392156862745,
0.4980392156862745,
0.4980392156862745,
1.0
],
[
0.7803921568627451,
0.7803921568627451,
0.7803921568627451,
1.0
],
[
0.7372549019607844,
0.7411764705882353,
0.13333333333333333,
1.0
],
[
0.8588235294117647,
0.8588235294117647,
0.5529411764705883,
1.0
],
[
0.09019607843137255,
0.7450980392156863,
0.8117647058823529,
1.0
]
]
},
"x_label": "Metrics",
"subplots_adjust": {
"left": 0.37,
"right": 0.98
},
"comment": "Natural Science & Formal Science & Technology has a significantly higher rate in percentage of documents that contain curly bracket. It might be related to the coding data."
}
],
[
"Number of Document Duplication",
{
"type": "barh",
"kwargs": {
"y": [
"Sports",
"Society & Social Issues & Human Rights",
"Shopping & Commodity",
"Religion & Spirituality",
"Politics & Government",
"Personal Development & Human Resources & Career",
"Natural Science & Formal Science & Technology",
"Law & Justice",
"Health & Wellness & Medicine",
"Food & Drink & Cooking",
"Environment",
"Entertainment & Travel & Hobby",
"Education",
"Daily Life & Home & Lifestyle",
"Culture & Cultural geography",
"Business & Economics & Finance",
"Arts"
],
"width": [
12.971086410444947,
7.7029131842117975,
6.170992495013885,
7.104888755861518,
8.650198708966025,
6.623561589139549,
6.508078335111145,
9.093410252353744,
6.089149697604276,
7.057779532190901,
7.702302116384568,
6.5227466722033824,
6.954972482095409,
6.535254918082626,
9.99308253577182,
5.590145547903439,
6.865564317873041
],
"color": [
[
1.0,
0.4980392156862745,
0.054901960784313725,
1.0
],
[
1.0,
0.7333333333333333,
0.47058823529411764,
1.0
],
[
0.17254901960784313,
0.6274509803921569,
0.17254901960784313,
1.0
],
[
0.596078431372549,
0.8745098039215686,
0.5411764705882353,
1.0
],
[
0.8392156862745098,
0.15294117647058825,
0.1568627450980392,
1.0
],
[
1.0,
0.596078431372549,
0.5882352941176471,
1.0
],
[
0.5803921568627451,
0.403921568627451,
0.7411764705882353,
1.0
],
[
0.7725490196078432,
0.6901960784313725,
0.8352941176470589,
1.0
],
[
0.5490196078431373,
0.33725490196078434,
0.29411764705882354,
1.0
],
[
0.7686274509803922,
0.611764705882353,
0.5803921568627451,
1.0
],
[
0.8901960784313725,
0.4666666666666667,
0.7607843137254902,
1.0
],
[
0.9686274509803922,
0.7137254901960784,
0.8235294117647058,
1.0
],
[
0.4980392156862745,
0.4980392156862745,
0.4980392156862745,
1.0
],
[
0.7803921568627451,
0.7803921568627451,
0.7803921568627451,
1.0
],
[
0.7372549019607844,
0.7411764705882353,
0.13333333333333333,
1.0
],
[
0.8588235294117647,
0.8588235294117647,
0.5529411764705883,
1.0
],
[
0.09019607843137255,
0.7450980392156863,
0.8117647058823529,
1.0
]
]
},
"x_label": "Metrics",
"subplots_adjust": {
"left": 0.37,
"right": 0.98
},
"comment": "Sports related documents have a higher number of duplication count."
}
],
[
"Number of Dump Duplication",
{
"type": "barh",
"kwargs": {
"y": [
"Sports",
"Society & Social Issues & Human Rights",
"Shopping & Commodity",
"Religion & Spirituality",
"Politics & Government",
"Personal Development & Human Resources & Career",
"Natural Science & Formal Science & Technology",
"Law & Justice",
"Health & Wellness & Medicine",
"Food & Drink & Cooking",
"Environment",
"Entertainment & Travel & Hobby",
"Education",
"Daily Life & Home & Lifestyle",
"Culture & Cultural geography",
"Business & Economics & Finance",
"Arts"
],
"width": [
3.8719109133730716,
3.26455384318083,
2.2721848150382313,
3.265644018756859,
3.444853093197951,
3.1923417847873825,
2.7517906637022187,
3.3698330764051643,
2.710181437434559,
3.1639266861716124,
3.206342888940481,
2.7590002157690536,
2.8303421621304015,
2.6106544768459656,
3.9413752803310276,
2.4888664878823907,
3.094817449212491
],
"color": [
[
1.0,
0.4980392156862745,
0.054901960784313725,
1.0
],
[
1.0,
0.7333333333333333,
0.47058823529411764,
1.0
],
[
0.17254901960784313,
0.6274509803921569,
0.17254901960784313,
1.0
],
[
0.596078431372549,
0.8745098039215686,
0.5411764705882353,
1.0
],
[
0.8392156862745098,
0.15294117647058825,
0.1568627450980392,
1.0
],
[
1.0,
0.596078431372549,
0.5882352941176471,
1.0
],
[
0.5803921568627451,
0.403921568627451,
0.7411764705882353,
1.0
],
[
0.7725490196078432,
0.6901960784313725,
0.8352941176470589,
1.0
],
[
0.5490196078431373,
0.33725490196078434,
0.29411764705882354,
1.0
],
[
0.7686274509803922,
0.611764705882353,
0.5803921568627451,
1.0
],
[
0.8901960784313725,
0.4666666666666667,
0.7607843137254902,
1.0
],
[
0.9686274509803922,
0.7137254901960784,
0.8235294117647058,
1.0
],
[
0.4980392156862745,
0.4980392156862745,
0.4980392156862745,
1.0
],
[
0.7803921568627451,
0.7803921568627451,
0.7803921568627451,
1.0
],
[
0.7372549019607844,
0.7411764705882353,
0.13333333333333333,
1.0
],
[
0.8588235294117647,
0.8588235294117647,
0.5529411764705883,
1.0
],
[
0.09019607843137255,
0.7450980392156863,
0.8117647058823529,
1.0
]
]
},
"x_label": "Metrics",
"subplots_adjust": {
"left": 0.37,
"right": 0.98
},
"comment": "On average, Culture & Cultural geography related documents are duplicated across a higher number of Common Crawl dumps. Duplication of Shopping & Commodity appears in fewer dumps than others."
}
],
[
"Number of Year Duplication",
{
"type": "barh",
"kwargs": {
"y": [
"Sports",
"Society & Social Issues & Human Rights",
"Shopping & Commodity",
"Religion & Spirituality",
"Politics & Government",
"Personal Development & Human Resources & Career",
"Natural Science & Formal Science & Technology",
"Law & Justice",
"Health & Wellness & Medicine",
"Food & Drink & Cooking",
"Environment",
"Entertainment & Travel & Hobby",
"Education",
"Daily Life & Home & Lifestyle",
"Culture & Cultural geography",
"Business & Economics & Finance",
"Arts"
],
"width": [
1.4484135130393887,
1.4291463355717668,
1.229893865040677,
1.4503641624264192,
1.452872665456801,
1.442735076861649,
1.3276539735170263,
1.4222795353953812,
1.328097016674741,
1.406236938652301,
1.4158586964132385,
1.3305229074812153,
1.3344910656577857,
1.2914712093532734,
1.5438579866704571,
1.2835525414227675,
1.404339123681775
],
"color": [
[
1.0,
0.4980392156862745,
0.054901960784313725,
1.0
],
[
1.0,
0.7333333333333333,
0.47058823529411764,
1.0
],
[
0.17254901960784313,
0.6274509803921569,
0.17254901960784313,
1.0
],
[
0.596078431372549,
0.8745098039215686,
0.5411764705882353,
1.0
],
[
0.8392156862745098,
0.15294117647058825,
0.1568627450980392,
1.0
],
[
1.0,
0.596078431372549,
0.5882352941176471,
1.0
],
[
0.5803921568627451,
0.403921568627451,
0.7411764705882353,
1.0
],
[
0.7725490196078432,
0.6901960784313725,
0.8352941176470589,
1.0
],
[
0.5490196078431373,
0.33725490196078434,
0.29411764705882354,
1.0
],
[
0.7686274509803922,
0.611764705882353,
0.5803921568627451,
1.0
],
[
0.8901960784313725,
0.4666666666666667,
0.7607843137254902,
1.0
],
[
0.9686274509803922,
0.7137254901960784,
0.8235294117647058,
1.0
],
[
0.4980392156862745,
0.4980392156862745,
0.4980392156862745,
1.0
],
[
0.7803921568627451,
0.7803921568627451,
0.7803921568627451,
1.0
],
[
0.7372549019607844,
0.7411764705882353,
0.13333333333333333,
1.0
],
[
0.8588235294117647,
0.8588235294117647,
0.5529411764705883,
1.0
],
[
0.09019607843137255,
0.7450980392156863,
0.8117647058823529,
1.0
]
]
},
"x_label": "Metrics",
"subplots_adjust": {
"left": 0.37,
"right": 0.98
},
"comment": "On average, Culture & Cultural geography related documents are duplicated across more years than other topics."
}
],
[
"Maximum Span of Year Duplication",
{
"type": "barh",
"kwargs": {
"y": [
"Sports",
"Society & Social Issues & Human Rights",
"Shopping & Commodity",
"Religion & Spirituality",
"Politics & Government",
"Personal Development & Human Resources & Career",
"Natural Science & Formal Science & Technology",
"Law & Justice",
"Health & Wellness & Medicine",
"Food & Drink & Cooking",
"Environment",
"Entertainment & Travel & Hobby",
"Education",
"Daily Life & Home & Lifestyle",
"Culture & Cultural geography",
"Business & Economics & Finance",
"Arts"
],
"width": [
1.4810222492619038,
1.4615875163051355,
1.2437028221138953,
1.4792776613788288,
1.49291539362753,
1.4697025354362148,
1.3531216666024084,
1.4549983259811405,
1.3486599259247682,
1.432375114165854,
1.4460216908228583,
1.352992743917416,
1.3557807185339452,
1.309416527589089,
1.582425218737168,
1.3025537295236642,
1.4325029904862776
],
"color": [
[
1.0,
0.4980392156862745,
0.054901960784313725,
1.0
],
[
1.0,
0.7333333333333333,
0.47058823529411764,
1.0
],
[
0.17254901960784313,
0.6274509803921569,
0.17254901960784313,
1.0
],
[
0.596078431372549,
0.8745098039215686,
0.5411764705882353,
1.0
],
[
0.8392156862745098,
0.15294117647058825,
0.1568627450980392,
1.0
],
[
1.0,
0.596078431372549,
0.5882352941176471,
1.0
],
[
0.5803921568627451,
0.403921568627451,
0.7411764705882353,
1.0
],
[
0.7725490196078432,
0.6901960784313725,
0.8352941176470589,
1.0
],
[
0.5490196078431373,
0.33725490196078434,
0.29411764705882354,
1.0
],
[
0.7686274509803922,
0.611764705882353,
0.5803921568627451,
1.0
],
[
0.8901960784313725,
0.4666666666666667,
0.7607843137254902,
1.0
],
[
0.9686274509803922,
0.7137254901960784,
0.8235294117647058,
1.0
],
[
0.4980392156862745,
0.4980392156862745,
0.4980392156862745,
1.0
],
[
0.7803921568627451,
0.7803921568627451,
0.7803921568627451,
1.0
],
[
0.7372549019607844,
0.7411764705882353,
0.13333333333333333,
1.0
],
[
0.8588235294117647,
0.8588235294117647,
0.5529411764705883,
1.0
],
[
0.09019607843137255,
0.7450980392156863,
0.8117647058823529,
1.0
]
]
},
"x_label": "Metrics",
"subplots_adjust": {
"left": 0.37,
"right": 0.98
},
"comment": "On average, Culture & Cultural geography related documents are duplicated across a wider span of years."
}
],
[
"Language Score",
{
"type": "barh",
"kwargs": {
"y": [
"Sports",
"Society & Social Issues & Human Rights",
"Shopping & Commodity",
"Religion & Spirituality",
"Politics & Government",
"Personal Development & Human Resources & Career",
"Natural Science & Formal Science & Technology",
"Law & Justice",
"Health & Wellness & Medicine",
"Food & Drink & Cooking",
"Environment",
"Entertainment & Travel & Hobby",
"Education",
"Daily Life & Home & Lifestyle",
"Culture & Cultural geography",
"Business & Economics & Finance",
"Arts"
],
"width": [
0.9413726660750416,
0.9347130364355554,
0.8847180050664069,
0.9336572405289453,
0.9420075430577804,
0.9522977107155225,
0.8831956938165678,
0.9481278901144439,
0.9241279717677588,
0.9066709862541587,
0.9270825804900252,
0.9117954084131167,
0.921528771738386,
0.8992133008305735,
0.9224377655046135,
0.9152426551412108,
0.9178671893764959
],
"color": [
[
1.0,
0.4980392156862745,
0.054901960784313725,
1.0
],
[
1.0,
0.7333333333333333,
0.47058823529411764,
1.0
],
[
0.17254901960784313,
0.6274509803921569,
0.17254901960784313,
1.0
],
[
0.596078431372549,
0.8745098039215686,
0.5411764705882353,
1.0
],
[
0.8392156862745098,
0.15294117647058825,
0.1568627450980392,
1.0
],
[
1.0,
0.596078431372549,
0.5882352941176471,
1.0
],
[
0.5803921568627451,
0.403921568627451,
0.7411764705882353,
1.0
],
[
0.7725490196078432,
0.6901960784313725,
0.8352941176470589,
1.0
],
[
0.5490196078431373,
0.33725490196078434,
0.29411764705882354,
1.0
],
[
0.7686274509803922,
0.611764705882353,
0.5803921568627451,
1.0
],
[
0.8901960784313725,
0.4666666666666667,
0.7607843137254902,
1.0
],
[
0.9686274509803922,
0.7137254901960784,
0.8235294117647058,
1.0
],
[
0.4980392156862745,
0.4980392156862745,
0.4980392156862745,
1.0
],
[
0.7803921568627451,
0.7803921568627451,
0.7803921568627451,
1.0
],
[
0.7372549019607844,
0.7411764705882353,
0.13333333333333333,
1.0
],
[
0.8588235294117647,
0.8588235294117647,
0.5529411764705883,
1.0
],
[
0.09019607843137255,
0.7450980392156863,
0.8117647058823529,
1.0
]
]
},
"x_label": "Metrics",
"subplots_adjust": {
"left": 0.37,
"right": 0.98
},
"comment": "Average language scores of different topic groups are mostly consistent. No significant differences are observed."
}
],
[
"Fraction of Duplicate Lines",
{
"type": "barh",
"kwargs": {
"y": [
"Sports",
"Society & Social Issues & Human Rights",
"Shopping & Commodity",
"Religion & Spirituality",
"Politics & Government",
"Personal Development & Human Resources & Career",
"Natural Science & Formal Science & Technology",
"Law & Justice",
"Health & Wellness & Medicine",
"Food & Drink & Cooking",
"Environment",
"Entertainment & Travel & Hobby",
"Education",
"Daily Life & Home & Lifestyle",
"Culture & Cultural geography",
"Business & Economics & Finance",
"Arts"
],
"width": [
0.01235014200385377,
0.012056246914591116,
0.018525258494333133,
0.012726935235443207,
0.01165333793386552,
0.010444387257042395,
0.016149995700960602,
0.012705431934865763,
0.01519943556613772,
0.014809953345215319,
0.012686293057054212,
0.01603496888664195,
0.01596207137084465,
0.016014032499666292,
0.013610478505124169,
0.01580386009616988,
0.015060041023072804
],
"color": [
[
1.0,
0.4980392156862745,
0.054901960784313725,
1.0
],
[
1.0,
0.7333333333333333,
0.47058823529411764,
1.0
],
[
0.17254901960784313,
0.6274509803921569,
0.17254901960784313,
1.0
],
[
0.596078431372549,
0.8745098039215686,
0.5411764705882353,
1.0
],
[
0.8392156862745098,
0.15294117647058825,
0.1568627450980392,
1.0
],
[
1.0,
0.596078431372549,
0.5882352941176471,
1.0
],
[
0.5803921568627451,
0.403921568627451,
0.7411764705882353,
1.0
],
[
0.7725490196078432,
0.6901960784313725,
0.8352941176470589,
1.0
],
[
0.5490196078431373,
0.33725490196078434,
0.29411764705882354,
1.0
],
[
0.7686274509803922,
0.611764705882353,
0.5803921568627451,
1.0
],
[
0.8901960784313725,
0.4666666666666667,
0.7607843137254902,
1.0
],
[
0.9686274509803922,
0.7137254901960784,
0.8235294117647058,
1.0
],
[
0.4980392156862745,
0.4980392156862745,
0.4980392156862745,
1.0
],
[
0.7803921568627451,
0.7803921568627451,
0.7803921568627451,
1.0
],
[
0.7372549019607844,
0.7411764705882353,
0.13333333333333333,
1.0
],
[
0.8588235294117647,
0.8588235294117647,
0.5529411764705883,
1.0
],
[
0.09019607843137255,
0.7450980392156863,
0.8117647058823529,
1.0
]
]
},
"x_label": "Metrics",
"subplots_adjust": {
"left": 0.37,
"right": 0.98
},
"comment": "On average, Shopping & Commodity has a larger fraction of duplicate lines than others."
}
],
[
"Fraction of Characters in Duplicate Lines",
{
"type": "barh",
"kwargs": {
"y": [
"Sports",
"Society & Social Issues & Human Rights",
"Shopping & Commodity",
"Religion & Spirituality",
"Politics & Government",
"Personal Development & Human Resources & Career",
"Natural Science & Formal Science & Technology",
"Law & Justice",
"Health & Wellness & Medicine",
"Food & Drink & Cooking",
"Environment",
"Entertainment & Travel & Hobby",
"Education",
"Daily Life & Home & Lifestyle",
"Culture & Cultural geography",
"Business & Economics & Finance",
"Arts"
],
"width": [
0.00501345725991589,
0.004299959504074716,
0.0073601226054879785,
0.004651424152553605,
0.00450348053495509,
0.003909584113418541,
0.0063485903557626774,
0.00521503913729261,
0.005782503341128245,
0.005962335751386622,
0.004749891704712697,
0.006420052544922626,
0.0063561887111620065,
0.006466672218067342,
0.004978436253072214,
0.006108371322424041,
0.0057332990952240126
],
"color": [
[
1.0,
0.4980392156862745,
0.054901960784313725,
1.0
],
[
1.0,
0.7333333333333333,
0.47058823529411764,
1.0
],
[
0.17254901960784313,
0.6274509803921569,
0.17254901960784313,
1.0
],
[
0.596078431372549,
0.8745098039215686,
0.5411764705882353,
1.0
],
[
0.8392156862745098,
0.15294117647058825,
0.1568627450980392,
1.0
],
[
1.0,
0.596078431372549,
0.5882352941176471,
1.0
],
[
0.5803921568627451,
0.403921568627451,
0.7411764705882353,
1.0
],
[
0.7725490196078432,
0.6901960784313725,
0.8352941176470589,
1.0
],
[
0.5490196078431373,
0.33725490196078434,
0.29411764705882354,
1.0
],
[
0.7686274509803922,
0.611764705882353,
0.5803921568627451,
1.0
],
[
0.8901960784313725,
0.4666666666666667,
0.7607843137254902,
1.0
],
[
0.9686274509803922,
0.7137254901960784,
0.8235294117647058,
1.0
],
[
0.4980392156862745,
0.4980392156862745,
0.4980392156862745,
1.0
],
[
0.7803921568627451,
0.7803921568627451,
0.7803921568627451,
1.0
],
[
0.7372549019607844,
0.7411764705882353,
0.13333333333333333,
1.0
],
[
0.8588235294117647,
0.8588235294117647,
0.5529411764705883,
1.0
],
[
0.09019607843137255,
0.7450980392156863,
0.8117647058823529,
1.0
]
]
},
"x_label": "Metrics",
"subplots_adjust": {
"left": 0.37,
"right": 0.98
},
"comment": "Shopping & Commodity usually has a larger fraction of characters in duplicate lines than others."
}
],
[
"Fraction of Characters in Most Common Bigram",
{
"type": "barh",
"kwargs": {
"y": [
"Sports",
"Society & Social Issues & Human Rights",
"Shopping & Commodity",
"Religion & Spirituality",
"Politics & Government",
"Personal Development & Human Resources & Career",
"Natural Science & Formal Science & Technology",
"Law & Justice",
"Health & Wellness & Medicine",
"Food & Drink & Cooking",
"Environment",
"Entertainment & Travel & Hobby",
"Education",
"Daily Life & Home & Lifestyle",
"Culture & Cultural geography",
"Business & Economics & Finance",
"Arts"
],
"width": [
0.02614438964212445,
0.02549244163757135,
0.03593997020714517,
0.026520648762908574,
0.023796693998532542,
0.019517664362790295,
0.03146900938445295,
0.026900122790576828,
0.027486920194835916,
0.029735671266585457,
0.02724062185263462,
0.030402249730981233,
0.03031798250174187,
0.034936591389516845,
0.02730012031746535,
0.029329317288923955,
0.028440195287636943
],
"color": [
[
1.0,
0.4980392156862745,
0.054901960784313725,
1.0
],
[
1.0,
0.7333333333333333,
0.47058823529411764,
1.0
],
[
0.17254901960784313,
0.6274509803921569,
0.17254901960784313,
1.0
],
[
0.596078431372549,
0.8745098039215686,
0.5411764705882353,
1.0
],
[
0.8392156862745098,
0.15294117647058825,
0.1568627450980392,
1.0
],
[
1.0,
0.596078431372549,
0.5882352941176471,
1.0
],
[
0.5803921568627451,
0.403921568627451,
0.7411764705882353,
1.0
],
[
0.7725490196078432,
0.6901960784313725,
0.8352941176470589,
1.0
],
[
0.5490196078431373,
0.33725490196078434,
0.29411764705882354,
1.0
],
[
0.7686274509803922,
0.611764705882353,
0.5803921568627451,
1.0
],
[
0.8901960784313725,
0.4666666666666667,
0.7607843137254902,
1.0
],
[
0.9686274509803922,
0.7137254901960784,
0.8235294117647058,
1.0
],
[
0.4980392156862745,
0.4980392156862745,
0.4980392156862745,
1.0
],
[
0.7803921568627451,
0.7803921568627451,
0.7803921568627451,
1.0
],
[
0.7372549019607844,
0.7411764705882353,
0.13333333333333333,
1.0
],
[
0.8588235294117647,
0.8588235294117647,
0.5529411764705883,
1.0
],
[
0.09019607843137255,
0.7450980392156863,
0.8117647058823529,
1.0
]
]
},
"x_label": "Metrics",
"subplots_adjust": {
"left": 0.37,
"right": 0.98
}
}
],
[
"Fraction of Characters in Most Common 3-gram",
{
"type": "barh",
"kwargs": {
"y": [
"Sports",
"Society & Social Issues & Human Rights",
"Shopping & Commodity",
"Religion & Spirituality",
"Politics & Government",
"Personal Development & Human Resources & Career",
"Natural Science & Formal Science & Technology",
"Law & Justice",
"Health & Wellness & Medicine",
"Food & Drink & Cooking",
"Environment",
"Entertainment & Travel & Hobby",
"Education",
"Daily Life & Home & Lifestyle",
"Culture & Cultural geography",
"Business & Economics & Finance",
"Arts"
],
"width": [
0.025877442339206684,
0.026073122171118526,
0.03794965393907832,
0.02756936824343807,
0.024589084236341825,
0.019970321326854976,
0.031104349287997282,
0.027138921074492478,
0.02674544851177018,
0.03082668946385283,
0.027642774270487825,
0.031311152209273344,
0.030596143210215625,
0.0352048856850328,
0.028135774349846692,
0.029182052353507664,
0.03023015052666528
],
"color": [
[
1.0,
0.4980392156862745,
0.054901960784313725,
1.0
],
[
1.0,
0.7333333333333333,
0.47058823529411764,
1.0
],
[
0.17254901960784313,
0.6274509803921569,
0.17254901960784313,
1.0
],
[
0.596078431372549,
0.8745098039215686,
0.5411764705882353,
1.0
],
[
0.8392156862745098,
0.15294117647058825,
0.1568627450980392,
1.0
],
[
1.0,
0.596078431372549,
0.5882352941176471,
1.0
],
[
0.5803921568627451,
0.403921568627451,
0.7411764705882353,
1.0
],
[
0.7725490196078432,
0.6901960784313725,
0.8352941176470589,
1.0
],
[
0.5490196078431373,
0.33725490196078434,
0.29411764705882354,
1.0
],
[
0.7686274509803922,
0.611764705882353,
0.5803921568627451,
1.0
],
[
0.8901960784313725,
0.4666666666666667,
0.7607843137254902,
1.0
],
[
0.9686274509803922,
0.7137254901960784,
0.8235294117647058,
1.0
],
[
0.4980392156862745,
0.4980392156862745,
0.4980392156862745,
1.0
],
[
0.7803921568627451,
0.7803921568627451,
0.7803921568627451,
1.0
],
[
0.7372549019607844,
0.7411764705882353,
0.13333333333333333,
1.0
],
[
0.8588235294117647,
0.8588235294117647,
0.5529411764705883,
1.0
],
[
0.09019607843137255,
0.7450980392156863,
0.8117647058823529,
1.0
]
]
},
"x_label": "Metrics",
"subplots_adjust": {
"left": 0.37,
"right": 0.98
}
}
],
[
"Fraction of Characters in Most Common 4-gram",
{
"type": "barh",
"kwargs": {
"y": [
"Sports",
"Society & Social Issues & Human Rights",
"Shopping & Commodity",
"Religion & Spirituality",
"Politics & Government",
"Personal Development & Human Resources & Career",
"Natural Science & Formal Science & Technology",
"Law & Justice",
"Health & Wellness & Medicine",
"Food & Drink & Cooking",
"Environment",
"Entertainment & Travel & Hobby",
"Education",
"Daily Life & Home & Lifestyle",
"Culture & Cultural geography",
"Business & Economics & Finance",
"Arts"
],
"width": [
0.026649882448510714,
0.026904261739744192,
0.04086332064828129,
0.0286899321496711,
0.025495383586610822,
0.020748509542508307,
0.03171918073481819,
0.027563495776633813,
0.02693813171261885,
0.03243470539147362,
0.0287992899739741,
0.032589127100319,
0.031139178077804624,
0.03630423964958027,
0.029809457289325606,
0.029522356378146167,
0.032375986416410006
],
"color": [
[
1.0,
0.4980392156862745,
0.054901960784313725,
1.0
],
[
1.0,
0.7333333333333333,
0.47058823529411764,
1.0
],
[
0.17254901960784313,
0.6274509803921569,
0.17254901960784313,
1.0
],
[
0.596078431372549,
0.8745098039215686,
0.5411764705882353,
1.0
],
[
0.8392156862745098,
0.15294117647058825,
0.1568627450980392,
1.0
],
[
1.0,
0.596078431372549,
0.5882352941176471,
1.0
],
[
0.5803921568627451,
0.403921568627451,
0.7411764705882353,
1.0
],
[
0.7725490196078432,
0.6901960784313725,
0.8352941176470589,
1.0
],
[
0.5490196078431373,
0.33725490196078434,
0.29411764705882354,
1.0
],
[
0.7686274509803922,
0.611764705882353,
0.5803921568627451,
1.0
],
[
0.8901960784313725,
0.4666666666666667,
0.7607843137254902,
1.0
],
[
0.9686274509803922,
0.7137254901960784,
0.8235294117647058,
1.0
],
[
0.4980392156862745,
0.4980392156862745,
0.4980392156862745,
1.0
],
[
0.7803921568627451,
0.7803921568627451,
0.7803921568627451,
1.0
],
[
0.7372549019607844,
0.7411764705882353,
0.13333333333333333,
1.0
],
[
0.8588235294117647,
0.8588235294117647,
0.5529411764705883,
1.0
],
[
0.09019607843137255,
0.7450980392156863,
0.8117647058823529,
1.0
]
]
},
"x_label": "Metrics",
"subplots_adjust": {
"left": 0.37,
"right": 0.98
}
}
],
[
"Fraction of Characters in Duplicate 5-grams",
{
"type": "barh",
"kwargs": {
"y": [
"Sports",
"Society & Social Issues & Human Rights",
"Shopping & Commodity",
"Religion & Spirituality",
"Politics & Government",
"Personal Development & Human Resources & Career",
"Natural Science & Formal Science & Technology",
"Law & Justice",
"Health & Wellness & Medicine",
"Food & Drink & Cooking",
"Environment",
"Entertainment & Travel & Hobby",
"Education",
"Daily Life & Home & Lifestyle",
"Culture & Cultural geography",
"Business & Economics & Finance",
"Arts"
],
"width": [
0.016989093741368057,
0.01874268974292254,
0.02001249239006167,
0.01893345653851295,
0.017576185062959156,
0.013966567341084396,
0.026648000310062814,
0.021239561601745963,
0.022547189937081085,
0.016903077473431387,
0.018277127900190513,
0.019079382613460993,
0.023467347573746446,
0.021192854307303135,
0.019157826340526964,
0.021183180653813184,
0.016589870490142093
],
"color": [
[
1.0,
0.4980392156862745,
0.054901960784313725,
1.0
],
[
1.0,
0.7333333333333333,
0.47058823529411764,
1.0
],
[
0.17254901960784313,
0.6274509803921569,
0.17254901960784313,
1.0
],
[
0.596078431372549,
0.8745098039215686,
0.5411764705882353,
1.0
],
[
0.8392156862745098,
0.15294117647058825,
0.1568627450980392,
1.0
],
[
1.0,
0.596078431372549,
0.5882352941176471,
1.0
],
[
0.5803921568627451,
0.403921568627451,
0.7411764705882353,
1.0
],
[
0.7725490196078432,
0.6901960784313725,
0.8352941176470589,
1.0
],
[
0.5490196078431373,
0.33725490196078434,
0.29411764705882354,
1.0
],
[
0.7686274509803922,
0.611764705882353,
0.5803921568627451,
1.0
],
[
0.8901960784313725,
0.4666666666666667,
0.7607843137254902,
1.0
],
[
0.9686274509803922,
0.7137254901960784,
0.8235294117647058,
1.0
],
[
0.4980392156862745,
0.4980392156862745,
0.4980392156862745,
1.0
],
[
0.7803921568627451,
0.7803921568627451,
0.7803921568627451,
1.0
],
[
0.7372549019607844,
0.7411764705882353,
0.13333333333333333,
1.0
],
[
0.8588235294117647,
0.8588235294117647,
0.5529411764705883,
1.0
],
[
0.09019607843137255,
0.7450980392156863,
0.8117647058823529,
1.0
]
]
},
"x_label": "Metrics",
"subplots_adjust": {
"left": 0.37,
"right": 0.98
}
}
],
[
"Fraction of Characters in Duplicate 6-grams",
{
"type": "barh",
"kwargs": {
"y": [
"Sports",
"Society & Social Issues & Human Rights",
"Shopping & Commodity",
"Religion & Spirituality",
"Politics & Government",
"Personal Development & Human Resources & Career",
"Natural Science & Formal Science & Technology",
"Law & Justice",
"Health & Wellness & Medicine",
"Food & Drink & Cooking",
"Environment",
"Entertainment & Travel & Hobby",
"Education",
"Daily Life & Home & Lifestyle",
"Culture & Cultural geography",
"Business & Economics & Finance",
"Arts"
],
"width": [
0.011946923836249373,
0.013297904416841108,
0.014258151562338789,
0.013153098583726782,
0.012601072651000291,
0.009837626317910313,
0.01949595975959962,
0.014924056163499448,
0.015889641140216917,
0.011840004108930956,
0.012940087820238557,
0.013424858515603134,
0.016468963654372438,
0.014839192401791004,
0.01388493355575309,
0.01498376261489033,
0.011812586464028073
],
"color": [
[
1.0,
0.4980392156862745,
0.054901960784313725,
1.0
],
[
1.0,
0.7333333333333333,
0.47058823529411764,
1.0
],
[
0.17254901960784313,
0.6274509803921569,
0.17254901960784313,
1.0
],
[
0.596078431372549,
0.8745098039215686,
0.5411764705882353,
1.0
],
[
0.8392156862745098,
0.15294117647058825,
0.1568627450980392,
1.0
],
[
1.0,
0.596078431372549,
0.5882352941176471,
1.0
],
[
0.5803921568627451,
0.403921568627451,
0.7411764705882353,
1.0
],
[
0.7725490196078432,
0.6901960784313725,
0.8352941176470589,
1.0
],
[
0.5490196078431373,
0.33725490196078434,
0.29411764705882354,
1.0
],
[
0.7686274509803922,
0.611764705882353,
0.5803921568627451,
1.0
],
[
0.8901960784313725,
0.4666666666666667,
0.7607843137254902,
1.0
],
[
0.9686274509803922,
0.7137254901960784,
0.8235294117647058,
1.0
],
[
0.4980392156862745,
0.4980392156862745,
0.4980392156862745,
1.0
],
[
0.7803921568627451,
0.7803921568627451,
0.7803921568627451,
1.0
],
[
0.7372549019607844,
0.7411764705882353,
0.13333333333333333,
1.0
],
[
0.8588235294117647,
0.8588235294117647,
0.5529411764705883,
1.0
],
[
0.09019607843137255,
0.7450980392156863,
0.8117647058823529,
1.0
]
]
},
"x_label": "Metrics",
"subplots_adjust": {
"left": 0.37,
"right": 0.98
}
}
],
[
"Fraction of Characters in Duplicate 7-grams",
{
"type": "barh",
"kwargs": {
"y": [
"Sports",
"Society & Social Issues & Human Rights",
"Shopping & Commodity",
"Religion & Spirituality",
"Politics & Government",
"Personal Development & Human Resources & Career",
"Natural Science & Formal Science & Technology",
"Law & Justice",
"Health & Wellness & Medicine",
"Food & Drink & Cooking",
"Environment",
"Entertainment & Travel & Hobby",
"Education",
"Daily Life & Home & Lifestyle",
"Culture & Cultural geography",
"Business & Economics & Finance",
"Arts"
],
"width": [
0.009095872215498261,
0.010146405377015994,
0.010487557518535542,
0.009868429864354638,
0.009802808055168035,
0.007541438580109868,
0.015129318997269138,
0.011302686364124783,
0.011969695536420487,
0.00892604076557906,
0.009759568234633746,
0.010070709856859254,
0.012419860047704056,
0.011070038486862109,
0.010547925069683646,
0.011327481696653985,
0.008957792606056945
],
"color": [
[
1.0,
0.4980392156862745,
0.054901960784313725,
1.0
],
[
1.0,
0.7333333333333333,
0.47058823529411764,
1.0
],
[
0.17254901960784313,
0.6274509803921569,
0.17254901960784313,
1.0
],
[
0.596078431372549,
0.8745098039215686,
0.5411764705882353,
1.0
],
[
0.8392156862745098,
0.15294117647058825,
0.1568627450980392,
1.0
],
[
1.0,
0.596078431372549,
0.5882352941176471,
1.0
],
[
0.5803921568627451,
0.403921568627451,
0.7411764705882353,
1.0
],
[
0.7725490196078432,
0.6901960784313725,
0.8352941176470589,
1.0
],
[
0.5490196078431373,
0.33725490196078434,
0.29411764705882354,
1.0
],
[
0.7686274509803922,
0.611764705882353,
0.5803921568627451,
1.0
],
[
0.8901960784313725,
0.4666666666666667,
0.7607843137254902,
1.0
],
[
0.9686274509803922,
0.7137254901960784,
0.8235294117647058,
1.0
],
[
0.4980392156862745,
0.4980392156862745,
0.4980392156862745,
1.0
],
[
0.7803921568627451,
0.7803921568627451,
0.7803921568627451,
1.0
],
[
0.7372549019607844,
0.7411764705882353,
0.13333333333333333,
1.0
],
[
0.8588235294117647,
0.8588235294117647,
0.5529411764705883,
1.0
],
[
0.09019607843137255,
0.7450980392156863,
0.8117647058823529,
1.0
]
]
},
"x_label": "Metrics",
"subplots_adjust": {
"left": 0.37,
"right": 0.98
}
}
],
[
"Fraction of Characters in Duplicate 8-grams",
{
"type": "barh",
"kwargs": {
"y": [
"Sports",
"Society & Social Issues & Human Rights",
"Shopping & Commodity",
"Religion & Spirituality",
"Politics & Government",
"Personal Development & Human Resources & Career",
"Natural Science & Formal Science & Technology",
"Law & Justice",
"Health & Wellness & Medicine",
"Food & Drink & Cooking",
"Environment",
"Entertainment & Travel & Hobby",
"Education",
"Daily Life & Home & Lifestyle",
"Culture & Cultural geography",
"Business & Economics & Finance",
"Arts"
],
"width": [
0.0073128979182685684,
0.008109486840255576,
0.00813393016693516,
0.007792984494504855,
0.008002590936702558,
0.006117199534770664,
0.012284551039331444,
0.009023639757214827,
0.009496488981527608,
0.007086539674993228,
0.0076650824522217454,
0.007877075837565403,
0.00987496717434344,
0.008652258777583252,
0.008392133389867372,
0.00901948167673584,
0.007053229339496676
],
"color": [
[
1.0,
0.4980392156862745,
0.054901960784313725,
1.0
],
[
1.0,
0.7333333333333333,
0.47058823529411764,
1.0
],
[
0.17254901960784313,
0.6274509803921569,
0.17254901960784313,
1.0
],
[
0.596078431372549,
0.8745098039215686,
0.5411764705882353,
1.0
],
[
0.8392156862745098,
0.15294117647058825,
0.1568627450980392,
1.0
],
[
1.0,
0.596078431372549,
0.5882352941176471,
1.0
],
[
0.5803921568627451,
0.403921568627451,
0.7411764705882353,
1.0
],
[
0.7725490196078432,
0.6901960784313725,
0.8352941176470589,
1.0
],
[
0.5490196078431373,
0.33725490196078434,
0.29411764705882354,
1.0
],
[
0.7686274509803922,
0.611764705882353,
0.5803921568627451,
1.0
],
[
0.8901960784313725,
0.4666666666666667,
0.7607843137254902,
1.0
],
[
0.9686274509803922,
0.7137254901960784,
0.8235294117647058,
1.0
],
[
0.4980392156862745,
0.4980392156862745,
0.4980392156862745,
1.0
],
[
0.7803921568627451,
0.7803921568627451,
0.7803921568627451,
1.0
],
[
0.7372549019607844,
0.7411764705882353,
0.13333333333333333,
1.0
],
[
0.8588235294117647,
0.8588235294117647,
0.5529411764705883,
1.0
],
[
0.09019607843137255,
0.7450980392156863,
0.8117647058823529,
1.0
]
]
},
"x_label": "Metrics",
"subplots_adjust": {
"left": 0.37,
"right": 0.98
}
}
],
[
"Fraction of Characters in Duplicate 9-grams",
{
"type": "barh",
"kwargs": {
"y": [
"Sports",
"Society & Social Issues & Human Rights",
"Shopping & Commodity",
"Religion & Spirituality",
"Politics & Government",
"Personal Development & Human Resources & Career",
"Natural Science & Formal Science & Technology",
"Law & Justice",
"Health & Wellness & Medicine",
"Food & Drink & Cooking",
"Environment",
"Entertainment & Travel & Hobby",
"Education",
"Daily Life & Home & Lifestyle",
"Culture & Cultural geography",
"Business & Economics & Finance",
"Arts"
],
"width": [
0.006148310898840968,
0.00676529697013875,
0.00643290688721836,
0.006434352312383364,
0.0067735701471297,
0.005172565516416477,
0.010288525380088334,
0.007482544617336476,
0.00780204339328974,
0.005852660603046196,
0.006240040171999708,
0.006465362460507409,
0.008165651028577293,
0.006986331812620781,
0.006928899525750178,
0.007487698229960434,
0.005728220555701086
],
"color": [
[
1.0,
0.4980392156862745,
0.054901960784313725,
1.0
],
[
1.0,
0.7333333333333333,
0.47058823529411764,
1.0
],
[
0.17254901960784313,
0.6274509803921569,
0.17254901960784313,
1.0
],
[
0.596078431372549,
0.8745098039215686,
0.5411764705882353,
1.0
],
[
0.8392156862745098,
0.15294117647058825,
0.1568627450980392,
1.0
],
[
1.0,
0.596078431372549,
0.5882352941176471,
1.0
],
[
0.5803921568627451,
0.403921568627451,
0.7411764705882353,
1.0
],
[
0.7725490196078432,
0.6901960784313725,
0.8352941176470589,
1.0
],
[
0.5490196078431373,
0.33725490196078434,
0.29411764705882354,
1.0
],
[
0.7686274509803922,
0.611764705882353,
0.5803921568627451,
1.0
],
[
0.8901960784313725,
0.4666666666666667,
0.7607843137254902,
1.0
],
[
0.9686274509803922,
0.7137254901960784,
0.8235294117647058,
1.0
],
[
0.4980392156862745,
0.4980392156862745,
0.4980392156862745,
1.0
],
[
0.7803921568627451,
0.7803921568627451,
0.7803921568627451,
1.0
],
[
0.7372549019607844,
0.7411764705882353,
0.13333333333333333,
1.0
],
[
0.8588235294117647,
0.8588235294117647,
0.5529411764705883,
1.0
],
[
0.09019607843137255,
0.7450980392156863,
0.8117647058823529,
1.0
]
]
},
"x_label": "Metrics",
"subplots_adjust": {
"left": 0.37,
"right": 0.98
}
}
],
[
"Fraction of Characters in Duplicate 10-grams",
{
"type": "barh",
"kwargs": {
"y": [
"Sports",
"Society & Social Issues & Human Rights",
"Shopping & Commodity",
"Religion & Spirituality",
"Politics & Government",
"Personal Development & Human Resources & Career",
"Natural Science & Formal Science & Technology",
"Law & Justice",
"Health & Wellness & Medicine",
"Food & Drink & Cooking",
"Environment",
"Entertainment & Travel & Hobby",
"Education",
"Daily Life & Home & Lifestyle",
"Culture & Cultural geography",
"Business & Economics & Finance",
"Arts"
],
"width": [
0.005325364079705381,
0.005797357629820572,
0.005283647214644124,
0.005467491111249268,
0.005879825006312822,
0.004529332536092203,
0.008882676950579147,
0.006399831899960353,
0.006645377495475746,
0.005021569571200667,
0.005306020206939719,
0.00550360123328725,
0.007013864844056383,
0.005835955446724545,
0.005845555947354781,
0.00641447975612288,
0.004809876486057196
],
"color": [
[
1.0,
0.4980392156862745,
0.054901960784313725,
1.0
],
[
1.0,
0.7333333333333333,
0.47058823529411764,
1.0
],
[
0.17254901960784313,
0.6274509803921569,
0.17254901960784313,
1.0
],
[
0.596078431372549,
0.8745098039215686,
0.5411764705882353,
1.0
],
[
0.8392156862745098,
0.15294117647058825,
0.1568627450980392,
1.0
],
[
1.0,
0.596078431372549,
0.5882352941176471,
1.0
],
[
0.5803921568627451,
0.403921568627451,
0.7411764705882353,
1.0
],
[
0.7725490196078432,
0.6901960784313725,
0.8352941176470589,
1.0
],
[
0.5490196078431373,
0.33725490196078434,
0.29411764705882354,
1.0
],
[
0.7686274509803922,
0.611764705882353,
0.5803921568627451,
1.0
],
[
0.8901960784313725,
0.4666666666666667,
0.7607843137254902,
1.0
],
[
0.9686274509803922,
0.7137254901960784,
0.8235294117647058,
1.0
],
[
0.4980392156862745,
0.4980392156862745,
0.4980392156862745,
1.0
],
[
0.7803921568627451,
0.7803921568627451,
0.7803921568627451,
1.0
],
[
0.7372549019607844,
0.7411764705882353,
0.13333333333333333,
1.0
],
[
0.8588235294117647,
0.8588235294117647,
0.5529411764705883,
1.0
],
[
0.09019607843137255,
0.7450980392156863,
0.8117647058823529,
1.0
]
]
},
"x_label": "Metrics",
"subplots_adjust": {
"left": 0.37,
"right": 0.98
}
}
]
]