mylibrar
committed on
Commit
·
cb27b88
1
Parent(s):
d40cd38
Fix issues in topic analysis
Browse files
- data/topic_charts.json +0 -0
- main.py +1 -3
- results.py +11 -13
data/topic_charts.json
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
main.py
CHANGED
@@ -355,9 +355,7 @@ def main():
|
|
355 |
Li(
|
356 |
A(
|
357 |
"Topic Analysis",
|
358 |
-
href="
|
359 |
-
hx_get="/results#section5",
|
360 |
-
hx_target="#inner-text",
|
361 |
)
|
362 |
)
|
363 |
),
|
|
|
355 |
Li(
|
356 |
A(
|
357 |
"Topic Analysis",
|
358 |
+
href="#section55",
|
|
|
|
|
359 |
)
|
360 |
)
|
361 |
),
|
results.py
CHANGED
@@ -977,11 +977,11 @@ llama_div = Div(
|
|
977 |
|
978 |
with open(os.path.join(os.path.dirname(__file__), "data", "topic_charts.json"), 'r') as f:
|
979 |
topic_charts = json.load(f)
|
980 |
-
topic_graphs =
|
981 |
|
982 |
-
for title, data in topic_charts
|
983 |
if data["type"] == "barh":
|
984 |
-
topic_graphs
|
985 |
x=data["kwargs"]["width"],
|
986 |
y=data["kwargs"]['y'],
|
987 |
orientation='h',
|
@@ -989,23 +989,23 @@ for title, data in topic_charts.items():
|
|
989 |
"rgb(" + ", ".join(str(val * 255) for val in rgb) + ')'
|
990 |
for rgb in data["kwargs"]["color"]
|
991 |
]
|
992 |
-
))
|
993 |
else:
|
994 |
-
|
995 |
values=data["kwargs"]['x'],
|
996 |
labels=data["kwargs"]["labels"],
|
997 |
marker_colors=[
|
998 |
"rgb(" + ", ".join(str(val * 255) for val in rgb) + ')'
|
999 |
for rgb in data["kwargs"]["colors"]
|
1000 |
]
|
1001 |
-
))
|
1002 |
|
1003 |
cluster_div = Div(
|
1004 |
Section(
|
1005 |
H2("Topic Analysis"),
|
1006 |
-
P("We tried to classify data into topic groups and looked for correlations between topics and statistics of data.
|
1007 |
H3("Methodology"),
|
1008 |
-
P("We took the ", A("common crawl", href="https://commoncrawl.org/"), " data and clustered them into 17 topic groups using ", A("BERTopic", href="https://maartengr.github.io/BERTopic/index.html"), ".
|
1009 |
H3("Cluster Groups"),
|
1010 |
P("We grouped data into the following 17 clusters"),
|
1011 |
Ul(*(
|
@@ -1013,11 +1013,9 @@ cluster_div = Div(
|
|
1013 |
for topic_name in ("Arts", "Business & Economics & Finance", "Culture & Cultural geography", "Daily Life & Home & Lifestyle", "Education", "Entertainment & Travel & Hobby", "Environment", "Food & Drink & Cooking", "Health & Wellness & Medicine", "Law & Justice", "Natural Science & Formal Science & Technology", "Personal Development & Human Resources & Career", "Politics & Government", "Religion & Spirituality", "Shopping & Commodity", "Society & Social Issues & Human Rights", "Sports")
|
1014 |
)),
|
1015 |
H3("Results Analysis"),
|
1016 |
-
H3("Number of document of each topic"),
|
1017 |
-
plotly2fasthtml(topic_count_graph),
|
1018 |
*(
|
1019 |
-
Section(
|
1020 |
-
for title,
|
1021 |
)
|
1022 |
)
|
1023 |
)
|
@@ -1046,7 +1044,7 @@ def results():
|
|
1046 |
),
|
1047 |
Section(
|
1048 |
cluster_div,
|
1049 |
-
id="
|
1050 |
),
|
1051 |
id="inner-text"
|
1052 |
)
|
|
|
977 |
|
978 |
with open(os.path.join(os.path.dirname(__file__), "data", "topic_charts.json"), 'r') as f:
|
979 |
topic_charts = json.load(f)
|
980 |
+
topic_graphs = []
|
981 |
|
982 |
+
for title, data in topic_charts:
|
983 |
if data["type"] == "barh":
|
984 |
+
topic_graphs.append(go.Figure(go.Bar(
|
985 |
x=data["kwargs"]["width"],
|
986 |
y=data["kwargs"]['y'],
|
987 |
orientation='h',
|
|
|
989 |
"rgb(" + ", ".join(str(val * 255) for val in rgb) + ')'
|
990 |
for rgb in data["kwargs"]["color"]
|
991 |
]
|
992 |
+
)))
|
993 |
else:
|
994 |
+
topic_graphs.append(go.Figure(go.Pie(
|
995 |
values=data["kwargs"]['x'],
|
996 |
labels=data["kwargs"]["labels"],
|
997 |
marker_colors=[
|
998 |
"rgb(" + ", ".join(str(val * 255) for val in rgb) + ')'
|
999 |
for rgb in data["kwargs"]["colors"]
|
1000 |
]
|
1001 |
+
)))
|
1002 |
|
1003 |
cluster_div = Div(
|
1004 |
Section(
|
1005 |
H2("Topic Analysis"),
|
1006 |
+
P("We tried to classify data into topic groups and looked for correlations between topics and statistics of data. Data from different topic groups should manifest different characteristics of distribution, which can give us some insight into the composition of dataset."),
|
1007 |
H3("Methodology"),
|
1008 |
+
P("We took the ", A("common crawl", href="https://commoncrawl.org/"), " data and clustered them into 17 topic groups using ", A("BERTopic", href="https://maartengr.github.io/BERTopic/index.html"), ". We collected and aggregated a series of metrics which include quality signals and other useful metadata. For each topic group, we calculated average scores and generated the corresponding bar charts over different metrics for comparison and analysis."),
|
1009 |
H3("Cluster Groups"),
|
1010 |
P("We grouped data into the following 17 clusters"),
|
1011 |
Ul(*(
|
|
|
1013 |
for topic_name in ("Arts", "Business & Economics & Finance", "Culture & Cultural geography", "Daily Life & Home & Lifestyle", "Education", "Entertainment & Travel & Hobby", "Environment", "Food & Drink & Cooking", "Health & Wellness & Medicine", "Law & Justice", "Natural Science & Formal Science & Technology", "Personal Development & Human Resources & Career", "Politics & Government", "Religion & Spirituality", "Shopping & Commodity", "Society & Social Issues & Human Rights", "Sports")
|
1014 |
)),
|
1015 |
H3("Results Analysis"),
|
|
|
|
|
1016 |
*(
|
1017 |
+
Section(H4(title), plotly2fasthtml(topic_graphs[i]))
|
1018 |
+
for i, (title, _) in enumerate(topic_charts)
|
1019 |
)
|
1020 |
)
|
1021 |
)
|
|
|
1044 |
),
|
1045 |
Section(
|
1046 |
cluster_div,
|
1047 |
+
id="section55"
|
1048 |
),
|
1049 |
id="inner-text"
|
1050 |
)
|