Spaces:
Sleeping
Sleeping
Maki
committed on
Commit
·
08d41d2
unverified
·
0
Parent(s):
Initial commit
Browse files- .SourceSageignore +54 -0
- .dockerignore +56 -0
- .github/workflows/sync-to-hf.yml +32 -0
- .github/workflows/sync-to-report-gh.yml +52 -0
- .gitignore +208 -0
- Dockerfile +28 -0
- LICENSE +21 -0
- README.md +174 -0
- app.py +431 -0
- docker-compose.dev.yml +25 -0
- docker-compose.yml +27 -0
- requirements.txt +4 -0
- theme.py +44 -0
.SourceSageignore
ADDED
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# バージョン管理システム関連
|
2 |
+
.git/
|
3 |
+
.gitignore
|
4 |
+
|
5 |
+
# キャッシュファイル
|
6 |
+
__pycache__/
|
7 |
+
.pytest_cache/
|
8 |
+
**/__pycache__/**
|
9 |
+
*.pyc
|
10 |
+
|
11 |
+
# ビルド・配布関連
|
12 |
+
build/
|
13 |
+
dist/
|
14 |
+
*.egg-info/
|
15 |
+
|
16 |
+
# 一時ファイル・出力
|
17 |
+
output/
|
18 |
+
output.md
|
19 |
+
test_output/
|
20 |
+
.SourceSageAssets/
|
21 |
+
.SourceSageAssetsDemo/
|
22 |
+
|
23 |
+
# アセット
|
24 |
+
*.png
|
25 |
+
*.svg
|
26 |
+
*.jpg
|
27 |
+
*.jpeg
|
28 |
+
assets/
|
29 |
+
|
30 |
+
# その他
|
31 |
+
LICENSE
|
32 |
+
example/
|
33 |
+
package-lock.json
|
34 |
+
.DS_Store
|
35 |
+
|
36 |
+
# 特定のディレクトリを除外
|
37 |
+
tests/temp/
|
38 |
+
docs/drafts/
|
39 |
+
|
40 |
+
# パターンの例外(除外対象から除外)
|
41 |
+
!docs/important.md
|
42 |
+
!.github/workflows/
|
43 |
+
repository_summary.md
|
44 |
+
|
45 |
+
# Terraform関連
|
46 |
+
.terraform
|
47 |
+
*.terraform.lock.hcl
|
48 |
+
*.backup
|
49 |
+
*.tfstate
|
50 |
+
|
51 |
+
# Python仮想環境
|
52 |
+
venv
|
53 |
+
.venv
|
54 |
+
|
.dockerignore
ADDED
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Git関連
|
2 |
+
.git
|
3 |
+
.gitignore
|
4 |
+
|
5 |
+
# Python関連
|
6 |
+
__pycache__/
|
7 |
+
*.py[cod]
|
8 |
+
*$py.class
|
9 |
+
*.so
|
10 |
+
.Python
|
11 |
+
build/
|
12 |
+
develop-eggs/
|
13 |
+
dist/
|
14 |
+
downloads/
|
15 |
+
eggs/
|
16 |
+
.eggs/
|
17 |
+
lib/
|
18 |
+
lib64/
|
19 |
+
parts/
|
20 |
+
sdist/
|
21 |
+
var/
|
22 |
+
wheels/
|
23 |
+
*.egg-info/
|
24 |
+
.installed.cfg
|
25 |
+
*.egg
|
26 |
+
|
27 |
+
# 仮想環境
|
28 |
+
venv/
|
29 |
+
env/
|
30 |
+
ENV/
|
31 |
+
|
32 |
+
# IDE関連
|
33 |
+
.vscode/
|
34 |
+
.idea/
|
35 |
+
*.swp
|
36 |
+
*.swo
|
37 |
+
|
38 |
+
# OS関連
|
39 |
+
.DS_Store
|
40 |
+
Thumbs.db
|
41 |
+
|
42 |
+
# ログファイル
|
43 |
+
*.log
|
44 |
+
|
45 |
+
# 一時ファイル
|
46 |
+
*.tmp
|
47 |
+
*.temp
|
48 |
+
|
49 |
+
# Docker関連
|
50 |
+
Dockerfile*
|
51 |
+
docker-compose*
|
52 |
+
.dockerignore
|
53 |
+
|
54 |
+
# ドキュメント
|
55 |
+
README.md
|
56 |
+
LICENSE
|
.github/workflows/sync-to-hf.yml
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
name: Sync to Hugging Face
|
2 |
+
|
3 |
+
on:
|
4 |
+
push:
|
5 |
+
branches:
|
6 |
+
- main
|
7 |
+
- master
|
8 |
+
workflow_dispatch:
|
9 |
+
|
10 |
+
jobs:
|
11 |
+
sync-to-hf:
|
12 |
+
runs-on: ubuntu-latest
|
13 |
+
steps:
|
14 |
+
- name: Checkout repository
|
15 |
+
uses: actions/checkout@v4
|
16 |
+
with:
|
17 |
+
fetch-depth: 0
|
18 |
+
lfs: true
|
19 |
+
|
20 |
+
- name: Push to Hugging Face Hub
|
21 |
+
env:
|
22 |
+
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
23 |
+
run: |
|
24 |
+
# Git設定
|
25 |
+
git config --global user.email "[email protected]"
|
26 |
+
git config --global user.name "GitHub Action"
|
27 |
+
|
28 |
+
# Hugging Face Hubにリモートを追加
|
29 |
+
git remote add hf https://huggingface.co/spaces/MakiAi/wikipedia-to-markdown
|
30 |
+
|
31 |
+
# 強制プッシュでHugging Faceに同期
|
32 |
+
git push --force https://user:$HF_TOKEN@huggingface.co/spaces/MakiAi/wikipedia-to-markdown HEAD:main
|
.github/workflows/sync-to-report-gh.yml
ADDED
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
name: 📊 デイリーレポートハブ同期 v2.3 (YUKIHIKO PR版 - 完全リモート実行)
|
2 |
+
on:
|
3 |
+
push:
|
4 |
+
branches: [main, master]
|
5 |
+
pull_request:
|
6 |
+
types: [opened, synchronize, closed]
|
7 |
+
|
8 |
+
env:
|
9 |
+
WEEK_START_DAY: 1
|
10 |
+
AUTO_APPROVE: true
|
11 |
+
AUTO_MERGE: true
|
12 |
+
CREATE_PR: true
|
13 |
+
# リモートスクリプトの設定
|
14 |
+
SCRIPTS_BASE_URL: https://raw.githubusercontent.com/Sunwood-ai-labsII/daily-report-hub_dev/main/.github/scripts
|
15 |
+
|
16 |
+
jobs:
|
17 |
+
sync-data:
|
18 |
+
runs-on: ubuntu-latest
|
19 |
+
steps:
|
20 |
+
- name: 📥 現在のリポジトリをチェックアウト
|
21 |
+
uses: actions/checkout@v4
|
22 |
+
with:
|
23 |
+
fetch-depth: 0
|
24 |
+
|
25 |
+
- name: 📅 週情報を計算
|
26 |
+
run: curl -LsSf ${SCRIPTS_BASE_URL}/calculate-week-info.sh | sh -s -- ${{ env.WEEK_START_DAY }}
|
27 |
+
|
28 |
+
- name: 🔍 Git活動を分析
|
29 |
+
run: curl -LsSf ${SCRIPTS_BASE_URL}/analyze-git-activity.sh | sh
|
30 |
+
|
31 |
+
- name: 📝 Markdownレポートを生成
|
32 |
+
run: curl -LsSf ${SCRIPTS_BASE_URL}/generate-markdown-reports.sh | sh
|
33 |
+
|
34 |
+
- name: 📂 レポートハブをクローン
|
35 |
+
env:
|
36 |
+
GITHUB_TOKEN: ${{ secrets.GH_PAT }}
|
37 |
+
REPORT_HUB_REPO: ${{ vars.REPORT_HUB_REPO || 'Sunwood-ai-labsII/daily-report-hub' }}
|
38 |
+
run: |
|
39 |
+
git config --global user.name "GitHub Actions Bot"
|
40 |
+
git config --global user.email "[email protected]"
|
41 |
+
git clone https://x-access-token:${GITHUB_TOKEN}@github.com/${REPORT_HUB_REPO}.git daily-report-hub
|
42 |
+
|
43 |
+
- name: 🏗️ Docusaurus構造を作成
|
44 |
+
run: curl -LsSf ${SCRIPTS_BASE_URL}/create-docusaurus-structure.sh | sh
|
45 |
+
|
46 |
+
- name: 🚀 YUKIHIKO権限でPR作成&自動承認
|
47 |
+
env:
|
48 |
+
GITHUB_TOKEN_ORIGINAL: ${{ secrets.GH_PAT }} # 承認用
|
49 |
+
YUKIHIKO_TOKEN: ${{ secrets.GH_PAT_YUKIHIKO }} # PR作成用
|
50 |
+
GITHUB_TOKEN: ${{ secrets.GH_PAT }} # デフォルト
|
51 |
+
REPORT_HUB_REPO: ${{ vars.REPORT_HUB_REPO || 'Sunwood-ai-labsII/daily-report-hub' }}
|
52 |
+
run: curl -LsSf ${SCRIPTS_BASE_URL}/sync-to-hub-gh.sh | sh
|
.gitignore
ADDED
@@ -0,0 +1,208 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Byte-compiled / optimized / DLL files
|
2 |
+
__pycache__/
|
3 |
+
*.py[codz]
|
4 |
+
*$py.class
|
5 |
+
|
6 |
+
# C extensions
|
7 |
+
*.so
|
8 |
+
|
9 |
+
# Distribution / packaging
|
10 |
+
.Python
|
11 |
+
build/
|
12 |
+
develop-eggs/
|
13 |
+
dist/
|
14 |
+
downloads/
|
15 |
+
eggs/
|
16 |
+
.eggs/
|
17 |
+
lib/
|
18 |
+
lib64/
|
19 |
+
parts/
|
20 |
+
sdist/
|
21 |
+
var/
|
22 |
+
wheels/
|
23 |
+
share/python-wheels/
|
24 |
+
*.egg-info/
|
25 |
+
.installed.cfg
|
26 |
+
*.egg
|
27 |
+
MANIFEST
|
28 |
+
|
29 |
+
# PyInstaller
|
30 |
+
# Usually these files are written by a python script from a template
|
31 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
32 |
+
*.manifest
|
33 |
+
*.spec
|
34 |
+
|
35 |
+
# Installer logs
|
36 |
+
pip-log.txt
|
37 |
+
pip-delete-this-directory.txt
|
38 |
+
|
39 |
+
# Unit test / coverage reports
|
40 |
+
htmlcov/
|
41 |
+
.tox/
|
42 |
+
.nox/
|
43 |
+
.coverage
|
44 |
+
.coverage.*
|
45 |
+
.cache
|
46 |
+
nosetests.xml
|
47 |
+
coverage.xml
|
48 |
+
*.cover
|
49 |
+
*.py.cover
|
50 |
+
.hypothesis/
|
51 |
+
.pytest_cache/
|
52 |
+
cover/
|
53 |
+
|
54 |
+
# Translations
|
55 |
+
*.mo
|
56 |
+
*.pot
|
57 |
+
|
58 |
+
# Django stuff:
|
59 |
+
*.log
|
60 |
+
local_settings.py
|
61 |
+
db.sqlite3
|
62 |
+
db.sqlite3-journal
|
63 |
+
|
64 |
+
# Flask stuff:
|
65 |
+
instance/
|
66 |
+
.webassets-cache
|
67 |
+
|
68 |
+
# Scrapy stuff:
|
69 |
+
.scrapy
|
70 |
+
|
71 |
+
# Sphinx documentation
|
72 |
+
docs/_build/
|
73 |
+
|
74 |
+
# PyBuilder
|
75 |
+
.pybuilder/
|
76 |
+
target/
|
77 |
+
|
78 |
+
# Jupyter Notebook
|
79 |
+
.ipynb_checkpoints
|
80 |
+
|
81 |
+
# IPython
|
82 |
+
profile_default/
|
83 |
+
ipython_config.py
|
84 |
+
|
85 |
+
# pyenv
|
86 |
+
# For a library or package, you might want to ignore these files since the code is
|
87 |
+
# intended to run in multiple environments; otherwise, check them in:
|
88 |
+
# .python-version
|
89 |
+
|
90 |
+
# pipenv
|
91 |
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
92 |
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
93 |
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
94 |
+
# install all needed dependencies.
|
95 |
+
#Pipfile.lock
|
96 |
+
|
97 |
+
# UV
|
98 |
+
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
|
99 |
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
100 |
+
# commonly ignored for libraries.
|
101 |
+
#uv.lock
|
102 |
+
|
103 |
+
# poetry
|
104 |
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
105 |
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
106 |
+
# commonly ignored for libraries.
|
107 |
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
108 |
+
#poetry.lock
|
109 |
+
#poetry.toml
|
110 |
+
|
111 |
+
# pdm
|
112 |
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
113 |
+
# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
|
114 |
+
# https://pdm-project.org/en/latest/usage/project/#working-with-version-control
|
115 |
+
#pdm.lock
|
116 |
+
#pdm.toml
|
117 |
+
.pdm-python
|
118 |
+
.pdm-build/
|
119 |
+
|
120 |
+
# pixi
|
121 |
+
# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
|
122 |
+
#pixi.lock
|
123 |
+
# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
|
124 |
+
# in the .venv directory. It is recommended not to include this directory in version control.
|
125 |
+
.pixi
|
126 |
+
|
127 |
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
128 |
+
__pypackages__/
|
129 |
+
|
130 |
+
# Celery stuff
|
131 |
+
celerybeat-schedule
|
132 |
+
celerybeat.pid
|
133 |
+
|
134 |
+
# SageMath parsed files
|
135 |
+
*.sage.py
|
136 |
+
|
137 |
+
# Environments
|
138 |
+
.env
|
139 |
+
.envrc
|
140 |
+
.venv
|
141 |
+
env/
|
142 |
+
venv/
|
143 |
+
ENV/
|
144 |
+
env.bak/
|
145 |
+
venv.bak/
|
146 |
+
|
147 |
+
# Spyder project settings
|
148 |
+
.spyderproject
|
149 |
+
.spyproject
|
150 |
+
|
151 |
+
# Rope project settings
|
152 |
+
.ropeproject
|
153 |
+
|
154 |
+
# mkdocs documentation
|
155 |
+
/site
|
156 |
+
|
157 |
+
# mypy
|
158 |
+
.mypy_cache/
|
159 |
+
.dmypy.json
|
160 |
+
dmypy.json
|
161 |
+
|
162 |
+
# Pyre type checker
|
163 |
+
.pyre/
|
164 |
+
|
165 |
+
# pytype static type analyzer
|
166 |
+
.pytype/
|
167 |
+
|
168 |
+
# Cython debug symbols
|
169 |
+
cython_debug/
|
170 |
+
|
171 |
+
# PyCharm
|
172 |
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
173 |
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
174 |
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
175 |
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
176 |
+
#.idea/
|
177 |
+
|
178 |
+
# Abstra
|
179 |
+
# Abstra is an AI-powered process automation framework.
|
180 |
+
# Ignore directories containing user credentials, local state, and settings.
|
181 |
+
# Learn more at https://abstra.io/docs
|
182 |
+
.abstra/
|
183 |
+
|
184 |
+
# Visual Studio Code
|
185 |
+
# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
|
186 |
+
# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
|
187 |
+
# and can be added to the global gitignore or merged into this file. However, if you prefer,
|
188 |
+
# you could uncomment the following to ignore the entire vscode folder
|
189 |
+
# .vscode/
|
190 |
+
|
191 |
+
# Ruff stuff:
|
192 |
+
.ruff_cache/
|
193 |
+
|
194 |
+
# PyPI configuration file
|
195 |
+
.pypirc
|
196 |
+
|
197 |
+
# Cursor
|
198 |
+
# Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
|
199 |
+
# exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
|
200 |
+
# refer to https://docs.cursor.com/context/ignore-files
|
201 |
+
.cursorignore
|
202 |
+
.cursorindexingignore
|
203 |
+
|
204 |
+
# Marimo
|
205 |
+
marimo/_static/
|
206 |
+
marimo/_lsp/
|
207 |
+
__marimo__/
|
208 |
+
.SourceSageAssets/
|
Dockerfile
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Python 3.11をベースイメージとして使用
|
2 |
+
FROM python:3.11-slim
|
3 |
+
|
4 |
+
# 作業ディレクトリを設定
|
5 |
+
WORKDIR /app
|
6 |
+
|
7 |
+
# システムパッケージの更新とクリーンアップ
|
8 |
+
RUN apt-get update && apt-get install -y \
|
9 |
+
&& rm -rf /var/lib/apt/lists/*
|
10 |
+
|
11 |
+
# 依存関係ファイルをコピー
|
12 |
+
COPY requirements.txt .
|
13 |
+
|
14 |
+
# Python依存関係をインストール
|
15 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
16 |
+
|
17 |
+
# アプリケーションファイルをコピー
|
18 |
+
COPY . .
|
19 |
+
|
20 |
+
# ポート7861を公開
|
21 |
+
EXPOSE 7861
|
22 |
+
|
23 |
+
# 非rootユーザーを作成してセキュリティを向上
|
24 |
+
RUN useradd -m -u 1000 appuser && chown -R appuser:appuser /app
|
25 |
+
USER appuser
|
26 |
+
|
27 |
+
# アプリケーションを起動
|
28 |
+
CMD ["python", "app.py"]
|
LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
MIT License
|
2 |
+
|
3 |
+
Copyright (c) 2025 Maki
|
4 |
+
|
5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6 |
+
of this software and associated documentation files (the "Software"), to deal
|
7 |
+
in the Software without restriction, including without limitation the rights
|
8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9 |
+
copies of the Software, and to permit persons to whom the Software is
|
10 |
+
furnished to do so, subject to the following conditions:
|
11 |
+
|
12 |
+
The above copyright notice and this permission notice shall be included in all
|
13 |
+
copies or substantial portions of the Software.
|
14 |
+
|
15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21 |
+
SOFTWARE.
|
README.md
ADDED
@@ -0,0 +1,174 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
license: mit
|
3 |
+
title: wikipedia to markdown
|
4 |
+
sdk: gradio
|
5 |
+
emoji: 📚
|
6 |
+
colorFrom: yellow
|
7 |
+
colorTo: gray
|
8 |
+
thumbnail: >-
|
9 |
+
https://cdn-uploads.huggingface.co/production/uploads/64e0ef4a4c78e1eba5178d7a/vJQZ24fctExV3dax_BGU-.jpeg
|
10 |
+
sdk_version: 5.42.0
|
11 |
+
---
|
12 |
+
|
13 |
+
<div align="center">
|
14 |
+
|
15 |
+

|
16 |
+
|
17 |
+
# 📚 Wikipedia to Markdown Converter
|
18 |
+
|
19 |
+
*WikipediaページをMarkdown形式に変換するWebアプリケーション*
|
20 |
+
|
21 |
+
[](https://python.org)
|
22 |
+
[](https://gradio.app)
|
23 |
+
[](LICENSE)
|
24 |
+
[](https://huggingface.co/spaces/MakiAi/wikipedia-to-markdown)
|
25 |
+
|
26 |
+
</div>
|
27 |
+
|
28 |
+
---
|
29 |
+
|
30 |
+
## 🌟 概要
|
31 |
+
|
32 |
+
**Wikipedia to Markdown Converter** は、Wikipediaの記事を整形されたMarkdownドキュメントに変換するWebアプリケーションです。単体処理と一括処理に対応し、複数のダウンロード形式を提供します。
|
33 |
+
|
34 |
+
### ✨ **主要機能**
|
35 |
+
|
36 |
+
- 🔄 **単体・一括処理** - 1つまたは複数のWikipediaページを同時変換
|
37 |
+
- 📊 **詳細分析** - 文字数、成功率、ファイル情報を表示
|
38 |
+
- 🗜️ **複数形式** - 個別ファイル、結合文書、ZIPダウンロード
|
39 |
+
- 🌐 **多言語対応** - 全てのWikipedia言語版に対応
|
40 |
+
- 🎨 **使いやすいUI** - 直感的で美しいインターフェース
|
41 |
+
|
42 |
+
---
|
43 |
+
|
44 |
+
## 🚀 使い方
|
45 |
+
|
46 |
+
### 🌐 **オンラインで試す(推奨)**
|
47 |
+
**[🚀 デモサイトはこちら](https://huggingface.co/spaces/MakiAi/wikipedia-to-markdown)**
|
48 |
+
|
49 |
+
### 💻 **ローカルで実行**
|
50 |
+
|
51 |
+
```bash
|
52 |
+
# リポジトリをクローン
|
53 |
+
git clone https://github.com/your-username/wikipedia-to-markdown.git
|
54 |
+
cd wikipedia-to-markdown
|
55 |
+
|
56 |
+
# 依存関係をインストール
|
57 |
+
pip install -r requirements.txt
|
58 |
+
|
59 |
+
# アプリケーションを起動
|
60 |
+
python app.py
|
61 |
+
```
|
62 |
+
|
63 |
+
### 🐳 **Dockerで実行**
|
64 |
+
|
65 |
+
```bash
|
66 |
+
# Docker Composeを使用
|
67 |
+
docker-compose up -d
|
68 |
+
|
69 |
+
# ブラウザで http://localhost:7860 にアクセス
|
70 |
+
```
|
71 |
+
|
72 |
+
---
|
73 |
+
|
74 |
+
## 📋 操作方法
|
75 |
+
|
76 |
+
### 🔗 **単体処理**
|
77 |
+
1. WikipediaのURLを入力
|
78 |
+
2. 「✨ 変換する」ボタンをクリック
|
79 |
+
3. 生成されたMarkdownをコピーまたはダウンロード
|
80 |
+
|
81 |
+
### 📚 **一括処理**
|
82 |
+
1. 複数のURLを1行に1つずつ入力
|
83 |
+
2. 「🚀 一括変換する」ボタンをクリック
|
84 |
+
3. 処理結果を確認し、必要な形式でダウンロード
|
85 |
+
|
86 |
+
### 📊 **処理結果の表示例**
|
87 |
+
```
|
88 |
+
============================================================
|
89 |
+
📊 処理結果サマリー
|
90 |
+
============================================================
|
91 |
+
🔗 処理対象URL数: 3
|
92 |
+
✅ 成功: 2
|
93 |
+
❌ 失敗: 1
|
94 |
+
|
95 |
+
✅ 処理成功: https://ja.wikipedia.org/wiki/Python
|
96 |
+
📄 ページタイトル: Python
|
97 |
+
📊 文字数: 15,432 文字
|
98 |
+
💾 ファイル名: Python.md
|
99 |
+
```
|
100 |
+
|
101 |
+
---
|
102 |
+
|
103 |
+
## 📦 ダウンロード形式
|
104 |
+
|
105 |
+
| 形式 | 説明 | 用途 |
|
106 |
+
|------|------|------|
|
107 |
+
| **📄 個別ファイル** | 各ページを別々のMarkdownファイル | 個別編集・管理 |
|
108 |
+
| **📚 結合文書** | 全ページを1つのファイルに結合 | 一括閲覧・印刷 |
|
109 |
+
| **🗜️ ZIPアーカイブ** | 全ファイルを圧縮してまとめて | 大量ファイルの管理 |
|
110 |
+
|
111 |
+
---
|
112 |
+
|
113 |
+
## 🔧 技術仕様
|
114 |
+
|
115 |
+
### **使用技術**
|
116 |
+
- **Python 3.8+** - メイン言語
|
117 |
+
- **Gradio** - Webインターフェース
|
118 |
+
- **BeautifulSoup4** - HTML解析
|
119 |
+
- **html2text** - Markdown変換
|
120 |
+
- **Requests** - HTTP通信
|
121 |
+
|
122 |
+
### **処理フロー**
|
123 |
+
1. **URL検証** - 入力URLの妥当性チェック
|
124 |
+
2. **HTML取得** - Wikipediaページの取得
|
125 |
+
3. **コンテンツ抽出** - 主要コンテンツの抽出
|
126 |
+
4. **クリーンアップ** - 不要部分(脚注、編集リンク等)の削除
|
127 |
+
5. **Markdown変換** - 整形されたMarkdownに変換
|
128 |
+
6. **ファイル生成** - 各種形式でのファイル出力
|
129 |
+
|
130 |
+
---
|
131 |
+
|
132 |
+
## 📁 プロジェクト構成
|
133 |
+
|
134 |
+
```
|
135 |
+
wikipedia-to-markdown/
|
136 |
+
├── app.py # メインアプリケーション
|
137 |
+
├── theme.py # UIテーマ設定
|
138 |
+
├── requirements.txt # Python依存関係
|
139 |
+
├── docker-compose.yml # Docker設定
|
140 |
+
├── .github/workflows/ # CI/CD設定
|
141 |
+
└── README.md # このファイル
|
142 |
+
```
|
143 |
+
|
144 |
+
---
|
145 |
+
|
146 |
+
## 🛠️ カスタマイズ
|
147 |
+
|
148 |
+
### **テーマ変更**
|
149 |
+
`theme.py`を編集してUIの色やスタイルを変更できます。
|
150 |
+
|
151 |
+
### **処理ロジック拡張**
|
152 |
+
`app.py`の`scrape_wikipedia_to_markdown_final()`関数を編集して、変換処理をカスタマイズできます。
|
153 |
+
|
154 |
+
---
|
155 |
+
|
156 |
+
## 📄 ライセンス
|
157 |
+
|
158 |
+
このプロジェクトは[MITライセンス](LICENSE)の下で公開されています。
|
159 |
+
|
160 |
+
---
|
161 |
+
|
162 |
+
## 🤝 コントリビューション
|
163 |
+
|
164 |
+
バグ報告や機能提案は[GitHub Issues](https://github.com/your-username/wikipedia-to-markdown/issues)でお願いします。
|
165 |
+
|
166 |
+
---
|
167 |
+
|
168 |
+
<div align="center">
|
169 |
+
|
170 |
+
**🌟 このプロジェクトが役に立ったらスターをお願いします!**
|
171 |
+
|
172 |
+
*© 2025 Wikipedia to Markdown Converter*
|
173 |
+
|
174 |
+
</div>
|
app.py
ADDED
@@ -0,0 +1,431 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import requests
|
2 |
+
from bs4 import BeautifulSoup
|
3 |
+
import html2text
|
4 |
+
import re
|
5 |
+
import gradio as gr
|
6 |
+
from theme import create_zen_theme
|
7 |
+
import tempfile
|
8 |
+
import os
|
9 |
+
import zipfile
|
10 |
+
from urllib.parse import urlparse, unquote
|
11 |
+
|
12 |
+
def scrape_wikipedia_to_markdown_final(url: str, timeout: float = 30.0) -> str:
    """Fetch a Wikipedia page and convert its article body to Markdown.

    Processing steps:
      1. Download the page and read its title from ``<h1 id="firstHeading">``.
      2. Replace every ``<dt>`` inside the article body with an ``<h4>`` so
         definition-list terms (e.g. character lists) become headings.
      3. Convert the article body to Markdown with html2text, wrapping disabled.
      4. Drop everything from the first ``## 脚注`` (footnotes) heading onward.
      5. Remove the ``[編集]`` (edit) links.
      6. Prepend the page title as an H1 heading.

    Args:
        url: URL of the Wikipedia page to scrape.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The final Markdown text, which always starts with ``# <title>``.
        No exception is raised on failure; instead a message string is
        returned that starts with ``エラー:``, ``HTTPリクエストエラー:`` or
        ``予期せぬエラーが発生しました:``.
    """
    try:
        # 1. Fetch and parse the HTML.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        # requests never times out by default; without an explicit timeout a
        # stalled connection would block this call (and the UI) forever.
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()  # turn HTTP error statuses into exceptions
        response.encoding = response.apparent_encoding  # auto-detect the charset
        soup = BeautifulSoup(response.text, 'html.parser')

        # Page title, with a generic fallback when the heading is missing.
        title_tag = soup.find('h1', id='firstHeading')
        page_title = title_tag.get_text(strip=True) if title_tag else "Wikipedia ページ"

        # 2. Locate the main article container.
        content_div = soup.find('div', class_='mw-parser-output')
        if not content_div:
            return "エラー: コンテンツエリアが見つかりませんでした。"

        # 3. Pre-process the HTML: promote <dt> terms to <h4> headings.
        for dt_tag in content_div.find_all('dt'):
            h4_tag = soup.new_tag('h4')
            h4_tag.extend(dt_tag.contents)
            dt_tag.replace_with(h4_tag)

        # 4. First-pass HTML -> Markdown conversion.
        converter = html2text.HTML2Text()
        converter.body_width = 0  # disable hard line wrapping
        full_markdown_text = converter.handle(str(content_div))

        # 5. Keep only the text before the footnotes heading (if present).
        body_text = full_markdown_text.partition("\n## 脚注")[0]

        # 6. Strip the "[[編集](...)]" edit links emitted next to headings.
        cleaned_body = re.sub(r'\[\[編集\]\(.+?\)]\n', '', body_text)

        # 7. Join the title and the cleaned body.
        return f"# {page_title}\n\n{cleaned_body.strip()}"

    except requests.exceptions.RequestException as e:
        # Covers connection errors, timeouts and HTTP error statuses.
        return f"HTTPリクエストエラー: {e}"
    except Exception as e:
        # Top-level boundary: callers expect a message string, not an exception.
        return f"予期せぬエラーが発生しました: {e}"
|
77 |
+
|
78 |
+
def get_filename_from_url(url):
    """Derive a Markdown file name from the last path segment of a URL.

    The segment is percent-decoded and every character in ``<>:"/\\|?*`` is
    replaced with ``_``.

    Args:
        url: Page URL, e.g. ``https://ja.wikipedia.org/wiki/Python``.

    Returns:
        ``"<page name>.md"``, or ``"wikipedia_page.md"`` when the URL cannot
        be parsed or has no usable last path segment (e.g. a trailing slash).
    """
    fallback = "wikipedia_page.md"
    try:
        # Last path segment, percent-decoded (e.g. "%E6%97%A5" -> "日").
        page_name = unquote(urlparse(url).path.split('/')[-1])
    except (AttributeError, TypeError, ValueError):
        # Non-string or malformed input: keep the best-effort fallback name.
        return fallback

    # Replace characters that are not allowed in file names.
    safe_filename = re.sub(r'[<>:"/\\|?*]', '_', page_name)
    if not safe_filename.strip():
        # An empty segment would otherwise yield the hidden file name ".md".
        return fallback
    return f"{safe_filename}.md"
|
91 |
+
|
92 |
+
def create_download_file(content, filename):
    """Write ``content`` to a UTF-8 file in the system temp directory.

    Args:
        content: Text to write.
        filename: Name of the file to create inside the temp directory.

    Returns:
        The path of the written file, or ``None`` if anything went wrong
        (the error is printed).
    """
    try:
        target_path = os.path.join(tempfile.gettempdir(), filename)
        with open(target_path, 'w', encoding='utf-8') as handle:
            handle.write(content)
    except Exception as e:
        print(f"ファイル作成エラー: {e}")
        return None
    else:
        return target_path
|
106 |
+
|
107 |
+
def create_zip_file(file_paths, zip_filename="wikipedia_export.zip"):
    """Bundle files into a deflate-compressed ZIP in the system temp directory.

    Paths that do not exist are skipped. Each file is stored under its base
    name only, so the archive has no directory structure.

    Args:
        file_paths: Iterable of paths of the files to include.
        zip_filename: Name of the archive to create inside the temp directory.

    Returns:
        The path of the archive, or ``None`` if anything went wrong
        (the error is printed).
    """
    try:
        archive_path = os.path.join(tempfile.gettempdir(), zip_filename)
        with zipfile.ZipFile(archive_path, 'w', zipfile.ZIP_DEFLATED) as archive:
            for source in file_paths:
                if not os.path.exists(source):
                    continue
                # Store by base name so entries sit at the archive root.
                archive.write(source, os.path.basename(source))
    except Exception as e:
        print(f"ZIP作成エラー: {e}")
        return None
    else:
        return archive_path
|
124 |
+
|
125 |
+
def process_wikipedia_url(url):
    """Validate a single Wikipedia URL, convert it and prepare a download file.

    Args:
        url: URL typed by the user.

    Returns:
        A ``(text, file_path)`` tuple. ``text`` is the generated Markdown, or
        a validation/error message. ``file_path`` is the path of the file
        offered for download, or ``None`` when validation or conversion
        failed (or the file could not be written).
    """
    # Tolerate missing input and whitespace left over from copy/paste.
    url = (url or "").strip()
    if not url:
        return "URLを入力してください。", None

    try:
        parsed = urlparse(url)
        scheme, host = parsed.scheme, parsed.hostname or ""
    except ValueError:
        # e.g. unbalanced IPv6 brackets; treat as an invalid URL.
        scheme, host = "", ""

    # Only plain web URLs are accepted.
    if scheme not in ("http", "https"):
        return "有効なURLを入力してください(http://またはhttps://から始まるURL)。", None

    # Check the real host name rather than searching the whole URL: a
    # substring test would also accept "https://example.com/?q=wikipedia.org"
    # and make the server fetch arbitrary sites.
    if host != "wikipedia.org" and not host.endswith(".wikipedia.org"):
        return "WikipediaのURLを入力してください。", None

    # Run the scraper; failures come back as message strings.
    markdown_content = scrape_wikipedia_to_markdown_final(url)

    # Prefixes of every failure message the scraper can return, including the
    # generic "unexpected error" one, so an error is never saved as a page.
    error_prefixes = ("エラー:", "HTTP", "予期せぬエラー")
    if markdown_content.startswith(error_prefixes):
        return markdown_content, None

    filename = get_filename_from_url(url)
    file_path = create_download_file(markdown_content, filename)
    return markdown_content, file_path
|
148 |
+
|
149 |
+
def process_multiple_urls(urls_text, progress=gr.Progress()):
    """Convert several Wikipedia URLs (one per line) to Markdown in one run.

    Args:
        urls_text: Multi-line text with one URL per line; blank lines are
            ignored.
        progress: Gradio progress tracker, updated once per URL.

    Returns:
        A ``(report, batch_file, individual_files, zip_file)`` tuple:
        ``report`` is a summary followed by a per-URL log, ``batch_file`` is
        the path of a single Markdown file with all converted pages (or
        ``None``), ``individual_files`` is the list of per-page file paths,
        and ``zip_file`` is the path of a ZIP of those files (or ``None``).
    """
    if not urls_text or not urls_text.strip():
        return "URLリストを入力してください。", None, [], None

    # One URL per line. The text is known to be non-blank here, so at least
    # one URL always remains.
    urls = [line.strip() for line in urls_text.strip().split('\n') if line.strip()]

    # Prefixes of every failure message the scraper can return, including the
    # generic "unexpected error" one, so an error is never counted as success.
    error_prefixes = ("エラー:", "HTTP", "予期せぬエラー")

    results = []
    all_content = []
    individual_files = []
    total_urls = len(urls)
    success_count = 0

    for position, url in enumerate(urls, start=1):
        progress(position / total_urls, f"処理中: {position}/{total_urls}")

        # Validate the URL before any network access.
        try:
            parsed = urlparse(url)
            scheme, host = parsed.scheme, parsed.hostname or ""
        except ValueError:
            # e.g. unbalanced IPv6 brackets; treat as an invalid URL.
            scheme, host = "", ""

        if scheme not in ("http", "https"):
            results.append(f"❌ 無効なURL: {url}")
            continue

        # Check the real host name rather than searching the whole URL, so
        # that the server cannot be made to fetch arbitrary sites.
        if host != "wikipedia.org" and not host.endswith(".wikipedia.org"):
            results.append(f"❌ Wikipedia以外のURL: {url}")
            continue

        # Scrape and record the outcome; one bad URL must not abort the batch.
        try:
            markdown_content = scrape_wikipedia_to_markdown_final(url)
            if markdown_content.startswith(error_prefixes):
                results.append(f"❌ 処理失敗: {url}\n   エラー: {markdown_content}")
                continue

            # The generated Markdown starts with "# <title>".
            title_match = re.match(r'^# (.+)', markdown_content)
            page_title = title_match.group(1) if title_match else "不明なページ"

            char_count = len(markdown_content)
            filename = get_filename_from_url(url)

            results.append(f"✅ 処理成功: {url}")
            results.append(f"   📄 ページタイトル: {page_title}")
            results.append(f"   📊 文字数: {char_count:,} 文字")
            results.append(f"   💾 ファイル名: {filename}")

            all_content.append(markdown_content)
            success_count += 1

            # Per-page download file.
            file_path = create_download_file(markdown_content, filename)
            if file_path:
                individual_files.append(file_path)
        except Exception as e:
            results.append(f"❌ 処理エラー: {url}")
            results.append(f"   エラー内容: {str(e)}")

    # Summary shown above the per-URL log.
    summary = [
        "=" * 60,
        "📊 処理結果サマリー",
        "=" * 60,
        f"🔗 処理対象URL数: {total_urls}",
        f"✅ 成功: {success_count}",
        f"❌ 失敗: {total_urls - success_count}",
        "",
    ]
    final_result = "\n".join(summary + results)

    # Combined document: pages separated by a ruler line. The separator is
    # built first so join() puts it BETWEEN pages; joining with only "\n\n"
    # glued a single ruler to the first page's title.
    batch_file_path = None
    if all_content:
        separator = "\n\n" + "=" * 80 + "\n\n"
        combined_content = separator.join(all_content)
        batch_file_path = create_download_file(combined_content, "wikipedia_batch_export.md")

    # ZIP archive with the individual files.
    zip_file_path = None
    if individual_files:
        zip_file_path = create_zip_file(individual_files, "wikipedia_export.zip")

    return final_result, batch_file_path, individual_files, zip_file_path
|
234 |
+
|
235 |
+
# Gradioインターフェースの作成
|
236 |
+
def create_interface():
|
237 |
+
"""Gradioインターフェースを作成する関数"""
|
238 |
+
theme = create_zen_theme()
|
239 |
+
|
240 |
+
with gr.Blocks(theme=theme, title="Wikipedia to Markdown Converter") as demo:
|
241 |
+
# ヘッダー
|
242 |
+
gr.HTML("""
|
243 |
+
<div style='text-align: center; margin-bottom: 2rem; padding: 2rem; background: linear-gradient(135deg, #d4a574 0%, #ffffff 50%, #f5f2ed 100%); color: #3d405b; border-radius: 12px;'>
|
244 |
+
<h1 style='font-size: 3rem; margin-bottom: 0.5rem; text-shadow: 1px 1px 2px rgba(0,0,0,0.1);'>📚 Wikipedia to Markdown Converter</h1>
|
245 |
+
<p style='font-size: 1.2rem; opacity: 0.8;'>WikipediaのURLを入力して、Markdown形式に変換します</p>
|
246 |
+
</div>
|
247 |
+
""")
|
248 |
+
|
249 |
+
# タブの作成
|
250 |
+
with gr.Tabs():
|
251 |
+
# 単体処理タブ
|
252 |
+
with gr.TabItem("🔗 単体処理"):
|
253 |
+
with gr.Row():
|
254 |
+
with gr.Column(scale=1):
|
255 |
+
url_input = gr.Textbox(
|
256 |
+
label="🔗 Wikipedia URL",
|
257 |
+
placeholder="https://ja.wikipedia.org/wiki/...",
|
258 |
+
value="https://ja.wikipedia.org/wiki/Python"
|
259 |
+
)
|
260 |
+
convert_btn = gr.Button("✨ 変換する", variant="primary")
|
261 |
+
|
262 |
+
with gr.Column(scale=1):
|
263 |
+
output_text = gr.Textbox(
|
264 |
+
label="📝 変換されたMarkdown",
|
265 |
+
lines=20,
|
266 |
+
max_lines=50,
|
267 |
+
show_copy_button=True
|
268 |
+
)
|
269 |
+
download_file = gr.File(
|
270 |
+
label="📥 マークダウンファイルをダウンロード",
|
271 |
+
visible=False
|
272 |
+
)
|
273 |
+
|
274 |
+
# ボタンクリック時の処理
|
275 |
+
def update_single_output(url):
    """Convert one Wikipedia URL and build the updates for the single-page tab.

    Returns a pair: the text returned by process_wikipedia_url, and a
    gr.update for the download component. The download component is shown
    with the generated file when a file path came back, and hidden otherwise.
    """
    markdown_text, saved_path = process_wikipedia_url(url)
    if not saved_path:
        # No file was produced, so keep the download widget hidden.
        return markdown_text, gr.update(visible=False)
    return markdown_text, gr.update(value=saved_path, visible=True)
|
281 |
+
|
282 |
+
convert_btn.click(
|
283 |
+
fn=update_single_output,
|
284 |
+
inputs=url_input,
|
285 |
+
outputs=[output_text, download_file]
|
286 |
+
)
|
287 |
+
|
288 |
+
# 使用例
|
289 |
+
def example_process(url):
    """Return only the converted text for an example URL, dropping the file path."""
    markdown_text, _unused_path = process_wikipedia_url(url)
    return markdown_text
|
292 |
+
|
293 |
+
gr.Examples(
|
294 |
+
examples=[
|
295 |
+
["https://ja.wikipedia.org/wiki/Python"],
|
296 |
+
["https://ja.wikipedia.org/wiki/JavaScript"],
|
297 |
+
["https://ja.wikipedia.org/wiki/HTML"]
|
298 |
+
],
|
299 |
+
inputs=url_input,
|
300 |
+
outputs=output_text,
|
301 |
+
fn=example_process,
|
302 |
+
cache_examples=False
|
303 |
+
)
|
304 |
+
|
305 |
+
# 一括処理タブ
|
306 |
+
with gr.TabItem("📋 一括処理"):
|
307 |
+
with gr.Row():
|
308 |
+
with gr.Column(scale=1):
|
309 |
+
urls_input = gr.Textbox(
|
310 |
+
label="📋 Wikipedia URLリスト(1行に1つずつ)",
|
311 |
+
placeholder="https://ja.wikipedia.org/wiki/Python\nhttps://ja.wikipedia.org/wiki/JavaScript\nhttps://ja.wikipedia.org/wiki/HTML",
|
312 |
+
lines=10,
|
313 |
+
value="https://ja.wikipedia.org/wiki/Python\nhttps://ja.wikipedia.org/wiki/JavaScript"
|
314 |
+
)
|
315 |
+
batch_convert_btn = gr.Button("🚀 一括変換する", variant="primary")
|
316 |
+
|
317 |
+
with gr.Column(scale=1):
|
318 |
+
batch_output_text = gr.Textbox(
|
319 |
+
label="📝 一括変換結果",
|
320 |
+
lines=15,
|
321 |
+
max_lines=30,
|
322 |
+
show_copy_button=True
|
323 |
+
)
|
324 |
+
batch_download_file = gr.File(
|
325 |
+
label="📥 全体をまとめてダウンロード",
|
326 |
+
visible=False
|
327 |
+
)
|
328 |
+
zip_download_file = gr.File(
|
329 |
+
label="🗜️ ZIPファイルでダウンロード",
|
330 |
+
visible=False
|
331 |
+
)
|
332 |
+
|
333 |
+
# 個別ダウンロードエリア
|
334 |
+
individual_downloads = gr.Column(visible=False)
|
335 |
+
with individual_downloads:
|
336 |
+
gr.Markdown("### 📥 個別ダウンロード")
|
337 |
+
individual_file_1 = gr.File(label="", visible=False)
|
338 |
+
individual_file_2 = gr.File(label="", visible=False)
|
339 |
+
individual_file_3 = gr.File(label="", visible=False)
|
340 |
+
individual_file_4 = gr.File(label="", visible=False)
|
341 |
+
individual_file_5 = gr.File(label="", visible=False)
|
342 |
+
|
343 |
+
# 一括処理ボタンクリック時の処理
|
344 |
+
def update_batch_output(urls_text):
    """Run the batch conversion and build one update per wired output component.

    The returned list is ordered as: result text, combined-file download,
    ZIP download, visibility of the individual-download column, and then the
    five individual file slots.
    """
    content, batch_file_path, individual_files, zip_file_path = process_multiple_urls(urls_text)

    def _download_update(path):
        # Show the component with its file when a path exists; hide it otherwise.
        if path:
            return gr.update(value=path, visible=True)
        return gr.update(visible=False)

    # Prepare the list of return values.
    outputs = [
        content,
        _download_update(batch_file_path),
        _download_update(zip_file_path),
        # The individual-download column is shown only when files were created.
        gr.update(visible=bool(individual_files)),
    ]

    # Individual files: only the first five are displayed.
    slot_count = 5
    shown_files = individual_files[:slot_count]
    for path in shown_files:
        outputs.append(
            gr.update(value=path, visible=True, label=f"📄 {os.path.basename(path)}")
        )
    # Hide whichever slots are left over.
    outputs.extend(
        gr.update(visible=False) for _ in range(slot_count - len(shown_files))
    )

    return outputs
|
377 |
+
|
378 |
+
batch_convert_btn.click(
|
379 |
+
fn=update_batch_output,
|
380 |
+
inputs=urls_input,
|
381 |
+
outputs=[
|
382 |
+
batch_output_text,
|
383 |
+
batch_download_file,
|
384 |
+
zip_download_file,
|
385 |
+
individual_downloads,
|
386 |
+
individual_file_1,
|
387 |
+
individual_file_2,
|
388 |
+
individual_file_3,
|
389 |
+
individual_file_4,
|
390 |
+
individual_file_5
|
391 |
+
]
|
392 |
+
)
|
393 |
+
|
394 |
+
gr.Markdown("### 💡 一括処理の使い方")
|
395 |
+
gr.Markdown("1. テキストエリアに変換したいWikipediaのURLを1行に1つずつ入力します")
|
396 |
+
gr.Markdown("2. 「🚀 一括変換する」ボタンをクリックします")
|
397 |
+
gr.Markdown("3. 処理の進行状況が表示され、完了後に結果が表示されます")
|
398 |
+
gr.Markdown("4. 各URLの処理結果(成功/失敗)が明確に表示されます")
|
399 |
+
|
400 |
+
gr.Markdown("---")
|
401 |
+
gr.Markdown("### 🎯 基本的な使用方法")
|
402 |
+
gr.Markdown("- **単体処理**: 1つのWikipediaページを変換したい場合")
|
403 |
+
gr.Markdown("- **一括処理**: 複数のWikipediaページを一度に変換したい場合")
|
404 |
+
gr.Markdown("- 生成されたMarkdownは右側のテキストエリアからコピーできます")
|
405 |
+
gr.Markdown("- **📥 ダウンロード機能**: 変換が成功すると、マークダウンファイルとして直接ダウンロードできます")
|
406 |
+
gr.Markdown(" - 単体処理: ページ名に基づいたファイル名で個別ダウンロード")
|
407 |
+
gr.Markdown(" - 一括処理: 各URLごとの個別ダウンロード + 全体をまとめた一括ダウンロード + **🗜️ ZIPファイル**")
|
408 |
+
gr.Markdown(" - 個別ダウンロード: 成功した各ページを個別のファイルとしてダウンロード可能(最大5つまで表示)")
|
409 |
+
gr.Markdown(" - **ZIPダウンロード**: 複数のMarkdownファイルを1つのZIPファイルにまとめてダウンロード")
|
410 |
+
|
411 |
+
# ZENテーマの説明
|
412 |
+
gr.HTML("""
|
413 |
+
<div style='text-align: center; margin-top: 2rem; padding: 1.5rem; background: #ffffff; border-radius: 12px;'>
|
414 |
+
<h3 style='color: #3d405b; margin-top: 0;'>🧘♀️ ZENテーマ</h3>
|
415 |
+
<p style='color: #8b7355;'>和モダンなデザインで、使いやすさと美しさを追求しました</p>
|
416 |
+
</div>
|
417 |
+
""")
|
418 |
+
|
419 |
+
return demo
|
420 |
+
|
421 |
+
if __name__ == "__main__":
    # Server settings used when the file is run directly as a script.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
        "debug": True,
    }

    # Build the interface, then start the application.
    demo = create_interface()
    demo.launch(**launch_options)
|
docker-compose.dev.yml
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
version: '3.8'
|
2 |
+
|
3 |
+
services:
|
4 |
+
wikipedia-converter-dev:
|
5 |
+
build:
|
6 |
+
context: .
|
7 |
+
dockerfile: Dockerfile
|
8 |
+
ports:
|
9 |
+
- "7861:7860"
|
10 |
+
environment:
  - PYTHONUNBUFFERED=1
  - GRADIO_SERVER_NAME=0.0.0.0
  # Keep this equal to the container-side port of the "7861:7860" mapping.
  # app.py passes server_port=7860 explicitly, so a different value here
  # would be ignored today and would break the mapping if app.py ever
  # started honouring the variable.
  - GRADIO_SERVER_PORT=7860
|
14 |
+
volumes:
|
15 |
+
# 開発時にコードの変更をリアルタイムで反映
|
16 |
+
- .:/app
|
17 |
+
- /app/__pycache__
|
18 |
+
restart: unless-stopped
|
19 |
+
command: python app.py
|
20 |
+
networks:
|
21 |
+
- wikipedia-dev-network
|
22 |
+
|
23 |
+
networks:
|
24 |
+
wikipedia-dev-network:
|
25 |
+
driver: bridge
|
docker-compose.yml
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
version: '3.8'
|
2 |
+
|
3 |
+
services:
|
4 |
+
wikipedia-converter:
|
5 |
+
build:
|
6 |
+
context: .
|
7 |
+
dockerfile: Dockerfile
|
8 |
+
ports:
|
9 |
+
- "7861:7860"
|
10 |
+
environment:
|
11 |
+
- PYTHONUNBUFFERED=1
|
12 |
+
# volumes:
|
13 |
+
# 開発時にコードの変更を反映させたい場合はコメントアウト
|
14 |
+
# - .:/app
|
15 |
+
restart: unless-stopped
|
16 |
+
healthcheck:
  # The check runs inside the container, where the app listens on 7860;
  # 7861 is only the host-side published port ("7861:7860").
  test: ["CMD", "curl", "-f", "http://localhost:7860"]
  interval: 30s
  timeout: 10s
  retries: 3
  start_period: 40s
|
22 |
+
networks:
|
23 |
+
- wikipedia-network
|
24 |
+
|
25 |
+
networks:
|
26 |
+
wikipedia-network:
|
27 |
+
driver: bridge
|
requirements.txt
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
requests>=2.31.0
|
2 |
+
beautifulsoup4>=4.12.0
|
3 |
+
html2text>=2020.1.16
|
4 |
+
gradio>=5.42.0
|
theme.py
ADDED
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
|
3 |
+
def create_zen_theme():
    """Create the ZEN theme.

    A theme with a modern-Japanese design that aims for both usability
    and beauty. Returns the result of calling ``.set(...)`` on a
    ``gr.Theme`` built from the hues, sizes and font stacks below.
    """
    # Font stacks, listed in order of preference.
    sans_stack = [
        "Hiragino Sans",
        "Noto Sans JP",
        "Yu Gothic",
        "system-ui",
        "sans-serif",
    ]
    mono_stack = [
        "SF Mono",
        "Monaco",
        "monospace",
    ]

    base_theme = gr.Theme(
        primary_hue="amber",
        secondary_hue="stone",
        neutral_hue="slate",
        text_size="md",
        spacing_size="lg",
        radius_size="sm",
        font=sans_stack,
        font_mono=mono_stack,
    )

    # Per-component overrides applied on top of the base theme.
    overrides = {
        "body_background_fill": "#ffffff",
        "body_text_color": "#3d405b",
        "button_primary_background_fill": "#d4a574",
        "button_primary_background_fill_hover": "#c19660",
        "button_primary_text_color": "#ffffff",
        "button_secondary_background_fill": "#f5f2ed",
        "button_secondary_text_color": "#3d405b",
        "input_background_fill": "#ffffff",
        "input_border_color": "#d4c4a8",
        "input_border_color_focus": "#d4a574",
        "block_background_fill": "#ffffff",
        "block_border_color": "#e8e2d5",
        "block_border_width": "3px",
        "panel_background_fill": "#ffffff",
        "panel_border_color": "#e8e2d5",
        "slider_color": "#d4a574",
    }
    return base_theme.set(**overrides)
|