# schoolQuest / parse.py
# araeyn's picture
# yeah
# 5de72d6
import json
import os
# Configuration
name = "chs.json"  # JSON file that is read, cleaned and rewritten in place
outputFolder = "database"  # directory that receives the extracted .txt files
# Keys removed from every item before the JSON is written back.
deleteKeys = [
    "images",
    "tags",
    "html",
]
# Maps an item's "type" to the key holding the content to extract.
typeScrape = {
    "article": "text",
    "event": "description",
    "list": "items",
}


def _entryText(pair):
    """Return the text written for one entry of a "list" item.

    The result starts with a newline plus the entry's title (when it has one).
    "summary", "fsElementContent", "fsElementFooterContent" and
    "fsElementHeaderContent" follow, each on its own line, unless the field
    merely repeats the title (compared with spaces removed). Occurrences of
    the title inside the summary are stripped. Returns "" when nothing
    applies.

    A missing title is treated as an empty string, so an entry that only has
    body fields is still rendered instead of raising KeyError.
    """
    title = pair.get("title", "")
    text = "\n" + title if "title" in pair else ""
    bareTitle = title.replace(" ", "")

    if "summary" in pair and pair["summary"].replace(" ", "") != bareTitle:
        text += "\n" + pair["summary"].replace(title, "")

    for field in (
        "fsElementContent",
        "fsElementFooterContent",
        "fsElementHeaderContent",
    ):
        if field in pair and pair[field].replace(" ", "") != bareTitle:
            text += "\n" + pair[field]
    return text


# presumably the file holds a JSON array of objects - verify against the data
with open(name, "r", encoding="utf-8") as sourceFile:
    data = json.load(sourceFile)

# Create the output directory if needed; an existing directory is fine.
os.makedirs(outputFolder, exist_ok=True)

# Running counter used in output file names; it is shared by all item types,
# so numbers are unique across the whole run rather than per type.
k = 0

for item in data:
    # Items are mutated in place, so `data` itself reflects the deletions.
    for key in deleteKeys:
        if key in item:
            del item[key]

    if "type" not in item:
        continue
    itemType = item["type"]
    # Only string types that appear in typeScrape produce an output file.
    if not isinstance(itemType, str) or itemType not in typeScrape:
        continue
    scrapeText = typeScrape[itemType]

    k += 1
    try:
        # NOTE(review): append mode means running the script again adds to
        # files left by an earlier run - confirm that this is intended.
        with open(
            f"{outputFolder}/chs-{itemType}-{k}.txt", "a", encoding="utf-8"
        ) as file:
            if itemType == "list":
                # The list's own title comes first, then one block per entry.
                file.write(item.get("title", ""))
                for pair in item[scrapeText]:
                    text = _entryText(pair)
                    if text:
                        file.write(text)
            else:
                text = item[scrapeText]
                if text:
                    file.write(text)
    except (KeyError, TypeError, AttributeError, UnicodeError, OSError) as err:
        # Best effort: a malformed item must not stop the rest of the run, but
        # unlike a bare `except` this reports the problem and still lets
        # KeyboardInterrupt / SystemExit through.
        print(f"skipping {itemType} item {k}: {err!r}")

# Write the cleaned items back over the input file.
with open(name, "w", encoding="utf-8") as targetFile:
    json.dump(data, targetFile, indent=6)