{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Download \"aozora\" texts listed in the Genji `info.json`\n",
    "\n",
    "Reads `https://genji.dl.itc.u-tokyo.ac.jp/data/info.json`, collects every URL stored under an `aozora` metadata label, and saves the text of each page's `div.main_text` element to `./data/text/<name>.txt`. Files that already exist are skipped, so the notebook can be re-run to resume an interrupted download.\n",
    "\n",
    "Requires the third-party packages `requests` and `beautifulsoup4` (imported as `bs4`) in the kernel's environment."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "import requests\n",
    "from bs4 import BeautifulSoup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Configuration: everything a reader might want to tune.\n",
    "INFO_URL = \"https://genji.dl.itc.u-tokyo.ac.jp/data/info.json\"\n",
    "OUTPUT_DIR = \"./data/text\"\n",
    "REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled server would hang the notebook"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def iter_aozora_urls(info):\n",
    "    \"\"\"Yield the URLs stored under the \"aozora\" metadata label of every member.\n",
    "\n",
    "    Args:\n",
    "        info: Parsed info.json. Read as info[\"selections\"][i][\"members\"][j][\"metadata\"],\n",
    "            where each metadata item is a dict with \"label\" and \"value\" keys.\n",
    "\n",
    "    Yields:\n",
    "        Each URL obtained by splitting the \"value\" of a member's \"aozora\" entry on \", \".\n",
    "        If a member has several \"aozora\" entries, only the last one is used.\n",
    "    \"\"\"\n",
    "    for selection in info[\"selections\"]:\n",
    "        for member in selection[\"members\"]:\n",
    "            aozora_urls = []\n",
    "            for metadata in member[\"metadata\"]:\n",
    "                if metadata[\"label\"] == \"aozora\":\n",
    "                    aozora_urls = metadata[\"value\"].split(\", \")\n",
    "            yield from aozora_urls\n",
    "\n",
    "\n",
    "def fetch_main_text(aozora_url):\n",
    "    \"\"\"Download a page and return the stripped text of its div.main_text element.\n",
    "\n",
    "    Args:\n",
    "        aozora_url: URL of the HTML page to download.\n",
    "\n",
    "    Returns:\n",
    "        The element's text with surrounding whitespace removed, or None when the\n",
    "        page contains no div with class \"main_text\".\n",
    "\n",
    "    Raises:\n",
    "        requests.HTTPError: If the server answers with an error status.\n",
    "    \"\"\"\n",
    "    page = requests.get(aozora_url, timeout=REQUEST_TIMEOUT)\n",
    "    # Fail loudly on 4xx/5xx instead of parsing an error page as if it were the text.\n",
    "    page.raise_for_status()\n",
    "    # Decode with the encoding detected from the body rather than the header-derived one.\n",
    "    page.encoding = page.apparent_encoding\n",
    "\n",
    "    soup = BeautifulSoup(page.text, \"html.parser\")\n",
    "    main_div = soup.find(\"div\", class_=\"main_text\")\n",
    "    # find() returns None when nothing matches; let the caller decide what to do.\n",
    "    if main_div is None:\n",
    "        return None\n",
    "    return main_div.get_text().strip()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "info_response = requests.get(INFO_URL, timeout=REQUEST_TIMEOUT)\n",
    "info_response.raise_for_status()\n",
    "\n",
    "for aozora_url in iter_aozora_urls(info_response.json()):\n",
    "    # File name = last path segment of the URL, cut at its first \".\".\n",
    "    filename = aozora_url.split(\"/\")[-1].split(\".\")[0]\n",
    "    opath = f\"{OUTPUT_DIR}/{filename}.txt\"\n",
    "\n",
    "    # Already saved by a previous run; do not download again.\n",
    "    if os.path.exists(opath):\n",
    "        continue\n",
    "\n",
    "    txt = fetch_main_text(aozora_url)\n",
    "    if txt is None:\n",
    "        print(f\"skipped, no div.main_text found: {aozora_url}\")\n",
    "        continue\n",
    "\n",
    "    os.makedirs(os.path.dirname(opath), exist_ok=True)\n",
    "    # Explicit UTF-8: the platform default encoding may be unable to represent the text.\n",
    "    with open(opath, \"w\", encoding=\"utf-8\") as f:\n",
    "        f.write(txt)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}