victormiller
committed on
Commit
•
be782f3
1
Parent(s):
e04322e
Update web.py
Browse files
web.py
CHANGED
@@ -708,25 +708,19 @@ def web_data():
|
|
708 |
P("""
|
709 |
There is minimal variation among existing pipeline implementations. We simply compute the mean word length as follows:
|
710 |
"""),
|
711 |
-
|
712 |
-
Code("""
|
713 |
words = text.split()
|
714 |
word_count = len(words)
|
715 |
character_count = sum(len(word) for word in words)
|
716 |
mean_word_length = character_count / word_count
|
717 |
-
"""),
|
718 |
-
cls="code-block",
|
719 |
-
),
|
720 |
P("""
|
721 |
It's worth noting that Dolma used the median word length instead of the mean in their codes.
|
722 |
"""),
|
723 |
-
|
724 |
-
Code("""
|
725 |
from statistics import median
|
726 |
median_word_length = median(len(word) for word in words)
|
727 |
-
"""),
|
728 |
-
cls="code-block",
|
729 |
-
),
|
730 |
H5("Number of Sentences"),
|
731 |
P("""
|
732 |
The only publicly available implementation of this quality signal is from RedPajama V2, which uses regular expressions
|
|
|
708 |
P("""
|
709 |
There is minimal variation among existing pipeline implementations. We simply compute the mean word length as follows:
|
710 |
"""),
|
711 |
+
D_code("""
|
|
|
712 |
words = text.split()
|
713 |
word_count = len(words)
|
714 |
character_count = sum(len(word) for word in words)
|
715 |
mean_word_length = character_count / word_count
|
716 |
+
""", block="block", language="python"),
|
|
|
|
|
717 |
P("""
|
718 |
It's worth noting that Dolma used the median word length instead of the mean in their codes.
|
719 |
"""),
|
720 |
+
D_code("""
|
|
|
721 |
from statistics import median
|
722 |
median_word_length = median(len(word) for word in words)
|
723 |
+
""", block="block", language="python"),
|
|
|
|
|
724 |
H5("Number of Sentences"),
|
725 |
P("""
|
726 |
The only publicly available implementation of this quality signal is from RedPajama V2, which uses regular expressions
|