{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Build page-level text data from TEI XML\n",
    "\n",
    "Reads every volume file matched by `XML_GLOB`, groups the text of each TEI `<seg>` by the page number taken from its `corresp` attribute, and writes one record per page (`vol`, `page`, `text`) to `OUTPUT_PATH`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import xml.etree.ElementTree as ET\n",
    "from glob import glob\n",
    "from pathlib import Path"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Input: one XML file per volume; the file name (before the first \".\") is the volume number.\n",
    "XML_GLOB = \"../../kouigenjimonogatari.github.io/xml/lw/*.xml\"\n",
    "# Output: a JSON list of {\"vol\", \"page\", \"text\"} records.\n",
    "OUTPUT_PATH = \"../data.json\"\n",
    "# ElementTree path matching every <seg> element in the TEI namespace.\n",
    "TEI_SEG_PATH = \".//{http://www.tei-c.org/ns/1.0}seg\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def page_number(corresp):\n",
    "    \"\"\"Return the integer at the start of the last path component of `corresp`.\n",
    "\n",
    "    Everything from the first \"-\" of that component onwards is ignored.\n",
    "    Raises ValueError if the leading part is not an integer.\n",
    "    \"\"\"\n",
    "    return int(corresp.split(\"/\")[-1].split(\"-\")[0])\n",
    "\n",
    "\n",
    "def extract_pages(root):\n",
    "    \"\"\"Group the text of every TEI <seg> below `root` by page number.\n",
    "\n",
    "    Returns a dict mapping page number -> list of seg texts, with pages and\n",
    "    texts in document order. Raises KeyError if a <seg> has no `corresp`.\n",
    "    \"\"\"\n",
    "    pages = {}\n",
    "    for seg in root.findall(TEI_SEG_PATH):\n",
    "        page = page_number(seg.attrib[\"corresp\"])\n",
    "        # seg.text only holds the text before the first child element and is\n",
    "        # None when a seg starts with a child; itertext() yields the complete\n",
    "        # text content and is identical to seg.text for segs without children.\n",
    "        pages.setdefault(page, []).append(\"\".join(seg.itertext()))\n",
    "    return pages"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = []\n",
    "\n",
    "for file in sorted(glob(XML_GLOB)):\n",
    "    # Path.name strips the directory regardless of the OS path separator.\n",
    "    vol = int(Path(file).name.split(\".\")[0])\n",
    "    root = ET.parse(file).getroot()\n",
    "\n",
    "    for page, texts in extract_pages(root).items():\n",
    "        data.append({\"vol\": vol, \"page\": page, \"text\": \" \".join(texts)})\n",
    "\n",
    "# Guard against a wrong relative path silently overwriting the output with [].\n",
    "assert data, f\"no pages extracted; check XML_GLOB={XML_GLOB!r}\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ensure_ascii=False writes non-ASCII text verbatim, so the file encoding must\n",
    "# be fixed explicitly instead of depending on the platform default.\n",
    "with open(OUTPUT_PATH, \"w\", encoding=\"utf-8\") as f:\n",
    "    json.dump(data, f, ensure_ascii=False, indent=4)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}