"""Strip bulky keys from a JSON dump and export each item's text to a file.

The script loads the list of items stored in ``name``, removes every key
listed in ``deleteKeys`` from each item, and, for every item whose "type"
is a key of ``typeScrape``, appends the item's text to
``<outputFolder>/chs-<type>-<n>.txt``.  ``n`` is a single running counter
shared by all types.  Finally the trimmed items are written back over the
input file.
"""

import json
import os
import sys

# Configuration
name = "chs.json"
outputFolder = "database"
deleteKeys = [
    "images",
    "tags",
    "html",
]
# Maps an item's "type" to the key of that item which holds its content.
typeScrape = {
    "article": "text",
    "event": "description",
    "list": "items",
}

# Optional body fields of one "list" entry, in the order they are emitted
# after the entry's title and summary.
_ENTRY_BODY_KEYS = (
    "fsElementContent",
    "fsElementFooterContent",
    "fsElementHeaderContent",
)

# Failures that abandon the export of one item without stopping the run:
# missing keys, values of an unexpected type, and file-system or encoding
# errors while writing.
_ITEM_ERRORS = (KeyError, TypeError, AttributeError, OSError, UnicodeError)


def _entry_text(entry):
    """Return the text to append for one entry of a "list" item.

    Every emitted field is preceded by a newline.  A field is skipped when
    it equals the entry's title once spaces are removed from both, and the
    summary has verbatim occurrences of the title removed.  An entry without
    a "title" key is compared against an empty title instead of raising
    ``KeyError``.
    """
    title = entry.get("title", "")
    squashed_title = title.replace(" ", "")
    pieces = []
    if "title" in entry:
        pieces.append(title)
    if "summary" in entry:
        summary = entry["summary"]
        if summary.replace(" ", "") != squashed_title:
            # Only strip the title when there is one to strip.
            pieces.append(summary.replace(title, "") if title else summary)
    for key in _ENTRY_BODY_KEYS:
        if key in entry:
            body = entry[key]
            if body.replace(" ", "") != squashed_title:
                pieces.append(body)
    return "".join("\n" + piece for piece in pieces)


def _export_item(item, item_type, path):
    """Append the text of ``item`` to the file at ``path``.

    The file is opened in append mode before any content is read from the
    item, so it is created even when the item turns out to have no usable
    content.  Errors propagate to the caller; anything written before the
    error stays in the file.
    """
    content_key = typeScrape[item_type]
    with open(path, "a", encoding="utf-8") as out:
        if item_type == "list":
            if "title" in item:
                out.write(item["title"])
            for entry in item[content_key]:
                text = _entry_text(entry)
                if text:
                    out.write(text)
        else:
            text = item[content_key]
            if text:
                out.write(text)


def main(input_path=name, output_dir=outputFolder):
    """Trim the items in ``input_path`` and export their text to ``output_dir``.

    Args:
        input_path: JSON file holding a list of item objects.  It is
            rewritten in place with the ``deleteKeys`` removed.
        output_dir: Directory receiving the text files.  It is created when
            missing; an existing directory is reused and its files are
            appended to.

    Returns:
        The number of items whose "type" matched a key of ``typeScrape``,
        i.e. the last value of the counter used in the output file names.
    """
    with open(input_path, "r", encoding="utf-8") as source:
        data = json.load(source)

    try:
        os.mkdir(output_dir)
    except FileExistsError:
        pass  # Re-running the script appends to the existing export.

    exported = 0
    for index, item in enumerate(data):
        for key in deleteKeys:
            item.pop(key, None)

        item_type = item.get("type")
        # The isinstance check keeps unhashable values (JSON arrays or
        # objects) away from the dict membership test.
        if not isinstance(item_type, str) or item_type not in typeScrape:
            continue

        # The counter advances for every matching item, even if its export
        # fails, so file numbers do not depend on earlier failures.
        exported += 1
        path = f"{output_dir}/chs-{item_type}-{exported}.txt"
        try:
            _export_item(item, item_type, path)
        except _ITEM_ERRORS as error:
            print(
                f"incomplete export for item {index} ({path}): {error!r}",
                file=sys.stderr,
            )

    with open(input_path, "w", encoding="utf-8") as target:
        json.dump(data, target, indent=6)
    return exported


if __name__ == "__main__":
    main()