File size: 1,340 Bytes
2a68adc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
from nltk import FreqDist
from nltk.corpus import brown
from nltk.stem.lancaster import LancasterStemmer
import json

# Word-length bounds (inclusive) and the cap on how many Brown-corpus words
# the commented-out generator below would consider.
WORD_LIMIT = 10000
MIN_WORD_SIZE, MAX_WORD_SIZE = 4, 10

# --- Alternative word source (kept for reference): most-common Brown-corpus
# --- words, deduplicated by Lancaster stem, shortest surface form per stem.
# stem = LancasterStemmer()
# frequency_list = FreqDist(i.lower() for i in brown.words())
# words = [
#     w.lower()
#     for w, _ in frequency_list.most_common()[:WORD_LIMIT]
#     if w.isalpha() and len(w) >= MIN_WORD_SIZE and len(w) <= MAX_WORD_SIZE
# ]
# stem_to_words = {}
# for word in words:
#     stemmed = stem.stem(word)
#     if stemmed not in stem_to_words:
#         stem_to_words[stemmed] = set()
#     stem_to_words[stemmed].add(word)

# final_words = []
# for stem, words in stem_to_words.items():
#     shortest = min(words, key=len)
#     final_words.append(shortest)

# with open("words.json", "w") as f:
#     f.write(json.dumps(final_words))

# Current word source: Jeopardy answers, lowercased, with a leading English
# article removed, restricted to single alphabetic words within the size bounds.
with open("jeopardy.json", "r") as f:
    jeopardy = json.load(f)

answers = set()
for row in jeopardy:
    answer = row["answer"].lower()
    # Strip a leading article BEFORE the isalpha() filter. In the original
    # order these branches were dead code: any answer like "the sound"
    # contains a space, so isalpha() rejected it before the prefix could be
    # removed, silently dropping every article-prefixed answer.
    if answer.startswith("the "):
        answer = answer[4:]
    elif answer.startswith("a "):
        answer = answer[2:]
    # Keep only single alphabetic words (no spaces, digits, punctuation).
    if not answer.isalpha():
        continue
    if not (MIN_WORD_SIZE <= len(answer) <= MAX_WORD_SIZE):
        continue
    answers.add(answer)

# Persist the deduplicated word list (order is unspecified: set iteration).
with open("words.json", "w") as f:
    f.write(json.dumps(list(answers)))