diff --git a/.dockerignore b/.dockerignore
new file mode 120000
index 0000000000000000000000000000000000000000..3e4e48b0b5fe6b468434d6767749b399319f2da2
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1 @@
+.gitignore
\ No newline at end of file
diff --git a/.env.dev b/.env.dev
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/.env.production b/.env.production
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000000000000000000000000000000000000..437d7b55881e32d37e8c2700c5dd81d73faa0f61
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,40 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+*.jar filter=lfs diff=lfs merge=lfs -text
+backend_vespa/src/main/application/model/* filter=lfs diff=lfs merge=lfs -text
+backend_vespa/multilingual-e5-small/*.json filter=lfs diff=lfs merge=lfs -text
+backend_vespa/multilingual-e5-small-quantized/*.json filter=lfs diff=lfs merge=lfs -text
+data/* filter=lfs diff=lfs merge=lfs -text
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..9f8cc3b5568870dbd148cb2ced38785754e0491e
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,513 @@
+## .gitignore.d/Archives.gitignore START
+# It's better to unpack these files and commit the raw source because
+# git has its own built in compression methods.
+*.7z
+*.jar
+*.rar
+*.zip
+*.gz
+*.gzip
+*.tgz
+*.bzip
+*.bzip2
+*.bz2
+*.xz
+*.lzma
+*.cab
+*.xar
+
+# Packing-only formats
+*.iso
+*.tar
+
+# Package management formats
+*.dmg
+*.xpi
+*.gem
+*.egg
+*.deb
+*.rpm
+*.msi
+*.msm
+*.msp
+*.txz
+## .gitignore.d/Archives.gitignore END
+## .gitignore.d/Backup.gitignore START
+*.bak
+*.gho
+*.ori
+*.orig
+*.tmp
+## .gitignore.d/Backup.gitignore END
+## .gitignore.d/GPG.gitignore START
+secring.*
+## .gitignore.d/GPG.gitignore END
+## .gitignore.d/Linux.gitignore START
+*~
+
+# temporary files which can be created if a process still has a handle open of a deleted file
+.fuse_hidden*
+
+# KDE directory preferences
+.directory
+
+# Linux trash folder which might appear on any partition or disk
+.Trash-*
+
+# .nfs files are created when an open file is removed but is still being accessed
+.nfs*
+## .gitignore.d/Linux.gitignore END
+## .gitignore.d/Local.gitignore START
+tmp
+.ruff_cache
+**/__pycache__
+
+backend_vespa/multilingual-e5-*/*
+backend_vespa/multilingual-e5-*-quantized/*
+## .gitignore.d/Local.gitignore END
+## .gitignore.d/Node.gitignore START
+# Logs
+logs
+*.log
+npm-debug.log*
+yarn-debug.log*
+yarn-error.log*
+lerna-debug.log*
+.pnpm-debug.log*
+
+# Diagnostic reports (https://nodejs.org/api/report.html)
+report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
+
+# Runtime data
+pids
+*.pid
+*.seed
+*.pid.lock
+
+# Directory for instrumented libs generated by jscoverage/JSCover
+lib-cov
+
+# Coverage directory used by tools like istanbul
+coverage
+*.lcov
+
+# nyc test coverage
+.nyc_output
+
+# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
+.grunt
+
+# Bower dependency directory (https://bower.io/)
+bower_components
+
+# node-waf configuration
+.lock-wscript
+
+# Compiled binary addons (https://nodejs.org/api/addons.html)
+build/Release
+
+# Dependency directories
+node_modules/
+jspm_packages/
+
+# Snowpack dependency directory (https://snowpack.dev/)
+web_modules/
+
+# TypeScript cache
+*.tsbuildinfo
+
+# Optional npm cache directory
+.npm
+
+# Optional eslint cache
+.eslintcache
+
+# Optional stylelint cache
+.stylelintcache
+
+# Microbundle cache
+.rpt2_cache/
+.rts2_cache_cjs/
+.rts2_cache_es/
+.rts2_cache_umd/
+
+# Optional REPL history
+.node_repl_history
+
+# Output of 'npm pack'
+*.tgz
+
+# Yarn Integrity file
+.yarn-integrity
+
+# dotenv environment variable files
+# .env
+.env.development.local
+.env.test.local
+.env.production.local
+.env.local
+
+# parcel-bundler cache (https://parceljs.org/)
+.cache
+.parcel-cache
+
+# Next.js build output
+.next
+out
+
+# Nuxt.js build / generate output
+.nuxt
+dist
+
+# Gatsby files
+.cache/
+# Comment in the public line in if your project uses Gatsby and not Next.js
+# https://nextjs.org/blog/next-9-1#public-directory-support
+# public
+
+# vuepress build output
+.vuepress/dist
+
+# vuepress v2.x temp and cache directory
+.temp
+.cache
+
+# Docusaurus cache and generated files
+.docusaurus
+
+# Serverless directories
+.serverless/
+
+# FuseBox cache
+.fusebox/
+
+# DynamoDB Local files
+.dynamodb/
+
+# TernJS port file
+.tern-port
+
+# Stores VSCode versions used for testing VSCode extensions
+.vscode-test
+
+# yarn v2
+.yarn/cache
+.yarn/unplugged
+.yarn/build-state.yml
+.yarn/install-state.gz
+.pnp.*
+## .gitignore.d/Node.gitignore END
+## .gitignore.d/Python.gitignore START
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+# lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+## .gitignore.d/Python.gitignore END
+## .gitignore.d/Tags.gitignore START
+# Ignore tags created by etags, ctags, gtags (GNU global) and cscope
+TAGS
+.TAGS
+!TAGS/
+tags
+.tags
+!tags/
+gtags.files
+GTAGS
+GRTAGS
+GPATH
+GSYMS
+cscope.files
+cscope.out
+cscope.in.out
+cscope.po.out
+## .gitignore.d/Tags.gitignore END
+## .gitignore.d/Terraform.gitignore START
+# Local .terraform directories
+**/.terraform/*
+
+# .tfstate files
+*.tfstate
+*.tfstate.*
+
+# Crash log files
+crash.log
+crash.*.log
+
+# Exclude all .tfvars files, which are likely to contain sensitive data, such as
+# password, private keys, and other secrets. These should not be part of version
+# control as they are data points which are potentially sensitive and subject
+# to change depending on the environment.
+*.tfvars
+*.tfvars.json
+
+# Ignore override files as they are usually used to override resources locally and so
+# are not checked in
+override.tf
+override.tf.json
+*_override.tf
+*_override.tf.json
+
+# Include override files you do wish to add to version control using negated pattern
+# !example_override.tf
+
+# Include tfplan files to ignore the plan output of command: terraform plan -out=tfplan
+# example: *tfplan*
+
+# Ignore CLI configuration files
+.terraformrc
+terraform.rc
+## .gitignore.d/Terraform.gitignore END
+## .gitignore.d/Vim.gitignore START
+# Swap (the `!*.svg` line keeps vector files; comment it out if you don't need them)
+[._]*.s[a-v][a-z]
+!*.svg
+[._]*.sw[a-p]
+[._]s[a-rt-v][a-z]
+[._]ss[a-gi-z]
+[._]sw[a-p]
+
+# Session
+Session.vim
+Sessionx.vim
+
+# Temporary
+.netrwhist
+*~
+# Auto-generated tag files
+tags
+# Persistent undo
+[._]*.un~
+## .gitignore.d/Vim.gitignore END
+## .gitignore.d/VisualStudioCode.gitignore START
+.vscode/*
+!.vscode/settings.json
+!.vscode/tasks.json
+!.vscode/launch.json
+!.vscode/extensions.json
+!.vscode/*.code-snippets
+
+# Local History for Visual Studio Code
+.history/
+
+# Built Visual Studio Code Extensions
+*.vsix
+## .gitignore.d/VisualStudioCode.gitignore END
+## .gitignore.d/Windows.gitignore START
+# Windows thumbnail cache files
+Thumbs.db
+Thumbs.db:encryptable
+ehthumbs.db
+ehthumbs_vista.db
+
+# Dump file
+*.stackdump
+
+# Folder config file
+[Dd]esktop.ini
+
+# Recycle Bin used on file shares
+$RECYCLE.BIN/
+
+# Windows Installer files
+*.cab
+*.msi
+*.msix
+*.msm
+*.msp
+
+# Windows shortcuts
+*.lnk
+## .gitignore.d/Windows.gitignore END
+## .gitignore.d/Xcode.gitignore START
+## User settings
+xcuserdata/
+
+## Xcode 8 and earlier
+*.xcscmblueprint
+*.xccheckout
+## .gitignore.d/Xcode.gitignore END
+## .gitignore.d/macOS.gitignore START
+# General
+.DS_Store
+.AppleDouble
+.LSOverride
+
+# Icon must end with two \r
+Icon
+
+
+# Thumbnails
+._*
+
+# Files that might appear in the root of a volume
+.DocumentRevisions-V100
+.fseventsd
+.Spotlight-V100
+.TemporaryItems
+.Trashes
+.VolumeIcon.icns
+.com.apple.timemachine.donotpresent
+
+# Directories potentially created on remote AFP share
+.AppleDB
+.AppleDesktop
+Network Trash Folder
+Temporary Items
+.apdisk
+## .gitignore.d/macOS.gitignore END
diff --git a/.gitignore.d/Archives.gitignore b/.gitignore.d/Archives.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..4ed9ab83509eded4b836072188c7b47462a19a41
--- /dev/null
+++ b/.gitignore.d/Archives.gitignore
@@ -0,0 +1,32 @@
+# It's better to unpack these files and commit the raw source because
+# git has its own built in compression methods.
+*.7z
+*.jar
+*.rar
+*.zip
+*.gz
+*.gzip
+*.tgz
+*.bzip
+*.bzip2
+*.bz2
+*.xz
+*.lzma
+*.cab
+*.xar
+
+# Packing-only formats
+*.iso
+*.tar
+
+# Package management formats
+*.dmg
+*.xpi
+*.gem
+*.egg
+*.deb
+*.rpm
+*.msi
+*.msm
+*.msp
+*.txz
diff --git a/.gitignore.d/Backup.gitignore b/.gitignore.d/Backup.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..825ce52db53d71679a1bdbc940d1fabb3726364a
--- /dev/null
+++ b/.gitignore.d/Backup.gitignore
@@ -0,0 +1,5 @@
+*.bak
+*.gho
+*.ori
+*.orig
+*.tmp
diff --git a/.gitignore.d/GPG.gitignore b/.gitignore.d/GPG.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..cf2aca6b32260288a72cb28c6cd1a459ef827882
--- /dev/null
+++ b/.gitignore.d/GPG.gitignore
@@ -0,0 +1 @@
+secring.*
diff --git a/.gitignore.d/Linux.gitignore b/.gitignore.d/Linux.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..b56bf65d85583b03eeccfaa2a927084583a33e91
--- /dev/null
+++ b/.gitignore.d/Linux.gitignore
@@ -0,0 +1,13 @@
+*~
+
+# temporary files which can be created if a process still has a handle open of a deleted file
+.fuse_hidden*
+
+# KDE directory preferences
+.directory
+
+# Linux trash folder which might appear on any partition or disk
+.Trash-*
+
+# .nfs files are created when an open file is removed but is still being accessed
+.nfs*
diff --git a/.gitignore.d/Local.gitignore b/.gitignore.d/Local.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..2eda963a0d2806f566a5aadae817df33c9ceb996
--- /dev/null
+++ b/.gitignore.d/Local.gitignore
@@ -0,0 +1,6 @@
+tmp
+.ruff_cache
+**/__pycache__
+
+backend_vespa/multilingual-e5-*/*
+backend_vespa/multilingual-e5-*-quantized/*
diff --git a/.gitignore.d/Node.gitignore b/.gitignore.d/Node.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..208d50c7c3479339b177ea73facfc279e0e3581c
--- /dev/null
+++ b/.gitignore.d/Node.gitignore
@@ -0,0 +1,130 @@
+# Logs
+logs
+*.log
+npm-debug.log*
+yarn-debug.log*
+yarn-error.log*
+lerna-debug.log*
+.pnpm-debug.log*
+
+# Diagnostic reports (https://nodejs.org/api/report.html)
+report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
+
+# Runtime data
+pids
+*.pid
+*.seed
+*.pid.lock
+
+# Directory for instrumented libs generated by jscoverage/JSCover
+lib-cov
+
+# Coverage directory used by tools like istanbul
+coverage
+*.lcov
+
+# nyc test coverage
+.nyc_output
+
+# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
+.grunt
+
+# Bower dependency directory (https://bower.io/)
+bower_components
+
+# node-waf configuration
+.lock-wscript
+
+# Compiled binary addons (https://nodejs.org/api/addons.html)
+build/Release
+
+# Dependency directories
+node_modules/
+jspm_packages/
+
+# Snowpack dependency directory (https://snowpack.dev/)
+web_modules/
+
+# TypeScript cache
+*.tsbuildinfo
+
+# Optional npm cache directory
+.npm
+
+# Optional eslint cache
+.eslintcache
+
+# Optional stylelint cache
+.stylelintcache
+
+# Microbundle cache
+.rpt2_cache/
+.rts2_cache_cjs/
+.rts2_cache_es/
+.rts2_cache_umd/
+
+# Optional REPL history
+.node_repl_history
+
+# Output of 'npm pack'
+*.tgz
+
+# Yarn Integrity file
+.yarn-integrity
+
+# dotenv environment variable files
+# .env
+.env.development.local
+.env.test.local
+.env.production.local
+.env.local
+
+# parcel-bundler cache (https://parceljs.org/)
+.cache
+.parcel-cache
+
+# Next.js build output
+.next
+out
+
+# Nuxt.js build / generate output
+.nuxt
+dist
+
+# Gatsby files
+.cache/
+# Comment in the public line in if your project uses Gatsby and not Next.js
+# https://nextjs.org/blog/next-9-1#public-directory-support
+# public
+
+# vuepress build output
+.vuepress/dist
+
+# vuepress v2.x temp and cache directory
+.temp
+.cache
+
+# Docusaurus cache and generated files
+.docusaurus
+
+# Serverless directories
+.serverless/
+
+# FuseBox cache
+.fusebox/
+
+# DynamoDB Local files
+.dynamodb/
+
+# TernJS port file
+.tern-port
+
+# Stores VSCode versions used for testing VSCode extensions
+.vscode-test
+
+# yarn v2
+.yarn/cache
+.yarn/unplugged
+.yarn/build-state.yml
+.yarn/install-state.gz
+.pnp.*
diff --git a/.gitignore.d/Python.gitignore b/.gitignore.d/Python.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..170c4b5c6af48bad89cf4b3f903c8db19a637f6d
--- /dev/null
+++ b/.gitignore.d/Python.gitignore
@@ -0,0 +1,160 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+# lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
diff --git a/.gitignore.d/Tags.gitignore b/.gitignore.d/Tags.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..1fde25752466d4681ec2b1b9b1da6a636adeb1c9
--- /dev/null
+++ b/.gitignore.d/Tags.gitignore
@@ -0,0 +1,16 @@
+# Ignore tags created by etags, ctags, gtags (GNU global) and cscope
+TAGS
+.TAGS
+!TAGS/
+tags
+.tags
+!tags/
+gtags.files
+GTAGS
+GRTAGS
+GPATH
+GSYMS
+cscope.files
+cscope.out
+cscope.in.out
+cscope.po.out
diff --git a/.gitignore.d/Terraform.gitignore b/.gitignore.d/Terraform.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..6304eb3c1114d4e1a6a7dfa39bbde83e9f8715b6
--- /dev/null
+++ b/.gitignore.d/Terraform.gitignore
@@ -0,0 +1,34 @@
+# Local .terraform directories
+**/.terraform/*
+
+# .tfstate files
+*.tfstate
+*.tfstate.*
+
+# Crash log files
+crash.log
+crash.*.log
+
+# Exclude all .tfvars files, which are likely to contain sensitive data, such as
+# password, private keys, and other secrets. These should not be part of version
+# control as they are data points which are potentially sensitive and subject
+# to change depending on the environment.
+*.tfvars
+*.tfvars.json
+
+# Ignore override files as they are usually used to override resources locally and so
+# are not checked in
+override.tf
+override.tf.json
+*_override.tf
+*_override.tf.json
+
+# Include override files you do wish to add to version control using negated pattern
+# !example_override.tf
+
+# Include tfplan files to ignore the plan output of command: terraform plan -out=tfplan
+# example: *tfplan*
+
+# Ignore CLI configuration files
+.terraformrc
+terraform.rc
diff --git a/.gitignore.d/Vim.gitignore b/.gitignore.d/Vim.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..19fa63264c56a2e0e98aa45e12ab2f0ec50f5f42
--- /dev/null
+++ b/.gitignore.d/Vim.gitignore
@@ -0,0 +1,19 @@
+# Swap (the `!*.svg` line keeps vector files; comment it out if you don't need them)
+[._]*.s[a-v][a-z]
+!*.svg
+[._]*.sw[a-p]
+[._]s[a-rt-v][a-z]
+[._]ss[a-gi-z]
+[._]sw[a-p]
+
+# Session
+Session.vim
+Sessionx.vim
+
+# Temporary
+.netrwhist
+*~
+# Auto-generated tag files
+tags
+# Persistent undo
+[._]*.un~
diff --git a/.gitignore.d/VisualStudioCode.gitignore b/.gitignore.d/VisualStudioCode.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..45fce1d71cdbd692d33284611adab75e61afe235
--- /dev/null
+++ b/.gitignore.d/VisualStudioCode.gitignore
@@ -0,0 +1,12 @@
+.vscode/*
+!.vscode/settings.json
+!.vscode/tasks.json
+!.vscode/launch.json
+!.vscode/extensions.json
+!.vscode/*.code-snippets
+
+# Local History for Visual Studio Code
+.history/
+
+# Built Visual Studio Code Extensions
+*.vsix
diff --git a/.gitignore.d/Windows.gitignore b/.gitignore.d/Windows.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..84bffec74db3273565f4c9b3e9ffcc216cec1611
--- /dev/null
+++ b/.gitignore.d/Windows.gitignore
@@ -0,0 +1,24 @@
+# Windows thumbnail cache files
+Thumbs.db
+Thumbs.db:encryptable
+ehthumbs.db
+ehthumbs_vista.db
+
+# Dump file
+*.stackdump
+
+# Folder config file
+[Dd]esktop.ini
+
+# Recycle Bin used on file shares
+$RECYCLE.BIN/
+
+# Windows Installer files
+*.cab
+*.msi
+*.msix
+*.msm
+*.msp
+
+# Windows shortcuts
+*.lnk
diff --git a/.gitignore.d/Xcode.gitignore b/.gitignore.d/Xcode.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..f87d2f2e7403c1eb1b23798cf06842e6d3d701d5
--- /dev/null
+++ b/.gitignore.d/Xcode.gitignore
@@ -0,0 +1,6 @@
+## User settings
+xcuserdata/
+
+## Xcode 8 and earlier
+*.xcscmblueprint
+*.xccheckout
diff --git a/.gitignore.d/macOS.gitignore b/.gitignore.d/macOS.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..6509c911748f2a51894b0b52660c8eb26b818e24
--- /dev/null
+++ b/.gitignore.d/macOS.gitignore
@@ -0,0 +1,27 @@
+# General
+.DS_Store
+.AppleDouble
+.LSOverride
+
+# Icon must end with two \r
+Icon
+
+
+# Thumbnails
+._*
+
+# Files that might appear in the root of a volume
+.DocumentRevisions-V100
+.fseventsd
+.Spotlight-V100
+.TemporaryItems
+.Trashes
+.VolumeIcon.icns
+.com.apple.timemachine.donotpresent
+
+# Directories potentially created on remote AFP share
+.AppleDB
+.AppleDesktop
+Network Trash Folder
+Temporary Items
+.apdisk
diff --git a/.lfsconfig b/.lfsconfig
new file mode 100644
index 0000000000000000000000000000000000000000..cfd8e7657f7658401f6511db20c79ec8212ebfea
--- /dev/null
+++ b/.lfsconfig
@@ -0,0 +1,2 @@
+[lfs]
+ url = https://huggingface.co/spaces/hicustomer/govsearch.git/info/lfs
diff --git a/.node-version b/.node-version
new file mode 100644
index 0000000000000000000000000000000000000000..bc78e9f2695ea604365801e2a3283916cd040d4d
--- /dev/null
+++ b/.node-version
@@ -0,0 +1 @@
+20.12.1
diff --git a/.npmrc b/.npmrc
new file mode 100644
index 0000000000000000000000000000000000000000..cffe8cdef132f31903a4971117f33f60cd9a56e6
--- /dev/null
+++ b/.npmrc
@@ -0,0 +1 @@
+save-exact=true
diff --git a/.python-version b/.python-version
new file mode 100644
index 0000000000000000000000000000000000000000..9919bf8c903117ecc6cf607b756d1e2095a2e575
--- /dev/null
+++ b/.python-version
@@ -0,0 +1 @@
+3.10.13
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..0f6df9dc88e6b4b9df1c2459ef6da3b8129e3d10
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,56 @@
+# Stage 1: install Node dependencies and build the frontend (output is read from /app/build below).
+FROM node:20.12.1 AS frontend
+WORKDIR /app
+
+# Bind-mount the npm manifests from the build context for `npm ci` instead of copying them.
+RUN --mount=type=bind,source=package-lock.json,target=package-lock.json \
+    --mount=type=bind,source=package.json,target=package.json \
+    --mount=type=bind,source=.npmrc,target=.npmrc \
+    npm ci
+
+COPY . /app
+
+RUN make build_frontend
+
+
+# Stage 2: run `make package` for backend_vespa on a Maven image (output is read from /app/target below).
+FROM library/maven:3.9.6-eclipse-temurin-17-focal AS backend_vespa
+WORKDIR /app
+
+RUN apt-get update && apt-get install -y --no-install-recommends build-essential
+
+COPY ./backend_vespa /app
+
+RUN make package
+
+
+# Final stage: Vespa base image plus the rye-managed Python environment and both build outputs.
+FROM vespaengine/vespa:8.324.16
+
+USER root
+RUN chown vespa:vespa /opt/vespa && rm /etc/yum.repos.d/epel.repo
+
+ENV HOME=/opt/vespa
+ENV RYE_HOME="$HOME/.rye"
+ENV PATH="$HOME/app/.venv/bin:$RYE_HOME/shims:$HOME/.local/bin:$PATH"
+USER vespa
+WORKDIR $HOME/app
+
+# Run piped commands with pipefail so a failed or truncated installer download aborts
+# the build instead of being masked by the exit status of the trailing `bash`.
+SHELL ["/bin/bash", "-o", "pipefail", "-c"]
+RUN curl -sSf https://rye-up.com/get | \
+    RYE_NO_AUTO_INSTALL=1 RYE_INSTALL_OPTION="--yes" bash
+# Sync non-dev dependencies without re-locking; project metadata is bind-mounted, not copied.
+RUN --mount=type=bind,source=pyproject.toml,target=pyproject.toml \
+    --mount=type=bind,source=requirements.lock,target=requirements.lock \
+    --mount=type=bind,source=requirements-dev.lock,target=requirements-dev.lock \
+    --mount=type=bind,source=.python-version,target=.python-version \
+    --mount=type=bind,source=README.md,target=README.md \
+    rye sync --no-dev --no-lock
+
+COPY --chown=vespa . $HOME/app
+COPY --chown=vespa --from=frontend /app/build $HOME/app/build
+COPY --chown=vespa --from=backend_vespa /app/target $HOME/app/backend_vespa/target
+
+ENTRYPOINT ["scripts/start-container.sh"]
diff --git a/LICENSE.txt b/LICENSE.txt
new file mode 100644
index 0000000000000000000000000000000000000000..be3f7b28e564e7dd05eaf59d64adba1a4065ac0e
--- /dev/null
+++ b/LICENSE.txt
@@ -0,0 +1,661 @@
+ GNU AFFERO GENERAL PUBLIC LICENSE
+ Version 3, 19 November 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ Preamble
+
+ The GNU Affero General Public License is a free, copyleft license for
+software and other kinds of works, specifically designed to ensure
+cooperation with the community in the case of network server software.
+
+ The licenses for most software and other practical works are designed
+to take away your freedom to share and change the works. By contrast,
+our General Public Licenses are intended to guarantee your freedom to
+share and change all versions of a program--to make sure it remains free
+software for all its users.
+
+ When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+them if you wish), that you receive source code or can get it if you
+want it, that you can change the software or use pieces of it in new
+free programs, and that you know you can do these things.
+
+ Developers that use our General Public Licenses protect your rights
+with two steps: (1) assert copyright on the software, and (2) offer
+you this License which gives you legal permission to copy, distribute
+and/or modify the software.
+
+ A secondary benefit of defending all users' freedom is that
+improvements made in alternate versions of the program, if they
+receive widespread use, become available for other developers to
+incorporate. Many developers of free software are heartened and
+encouraged by the resulting cooperation. However, in the case of
+software used on network servers, this result may fail to come about.
+The GNU General Public License permits making a modified version and
+letting the public access it on a server without ever releasing its
+source code to the public.
+
+ The GNU Affero General Public License is designed specifically to
+ensure that, in such cases, the modified source code becomes available
+to the community. It requires the operator of a network server to
+provide the source code of the modified version running there to the
+users of that server. Therefore, public use of a modified version, on
+a publicly accessible server, gives the public access to the source
+code of the modified version.
+
+ An older license, called the Affero General Public License and
+published by Affero, was designed to accomplish similar goals. This is
+a different license, not a version of the Affero GPL, but Affero has
+released a new version of the Affero GPL which permits relicensing under
+this license.
+
+ The precise terms and conditions for copying, distribution and
+modification follow.
+
+ TERMS AND CONDITIONS
+
+ 0. Definitions.
+
+ "This License" refers to version 3 of the GNU Affero General Public License.
+
+ "Copyright" also means copyright-like laws that apply to other kinds of
+works, such as semiconductor masks.
+
+ "The Program" refers to any copyrightable work licensed under this
+License. Each licensee is addressed as "you". "Licensees" and
+"recipients" may be individuals or organizations.
+
+ To "modify" a work means to copy from or adapt all or part of the work
+in a fashion requiring copyright permission, other than the making of an
+exact copy. The resulting work is called a "modified version" of the
+earlier work or a work "based on" the earlier work.
+
+ A "covered work" means either the unmodified Program or a work based
+on the Program.
+
+ To "propagate" a work means to do anything with it that, without
+permission, would make you directly or secondarily liable for
+infringement under applicable copyright law, except executing it on a
+computer or modifying a private copy. Propagation includes copying,
+distribution (with or without modification), making available to the
+public, and in some countries other activities as well.
+
+ To "convey" a work means any kind of propagation that enables other
+parties to make or receive copies. Mere interaction with a user through
+a computer network, with no transfer of a copy, is not conveying.
+
+ An interactive user interface displays "Appropriate Legal Notices"
+to the extent that it includes a convenient and prominently visible
+feature that (1) displays an appropriate copyright notice, and (2)
+tells the user that there is no warranty for the work (except to the
+extent that warranties are provided), that licensees may convey the
+work under this License, and how to view a copy of this License. If
+the interface presents a list of user commands or options, such as a
+menu, a prominent item in the list meets this criterion.
+
+ 1. Source Code.
+
+ The "source code" for a work means the preferred form of the work
+for making modifications to it. "Object code" means any non-source
+form of a work.
+
+ A "Standard Interface" means an interface that either is an official
+standard defined by a recognized standards body, or, in the case of
+interfaces specified for a particular programming language, one that
+is widely used among developers working in that language.
+
+ The "System Libraries" of an executable work include anything, other
+than the work as a whole, that (a) is included in the normal form of
+packaging a Major Component, but which is not part of that Major
+Component, and (b) serves only to enable use of the work with that
+Major Component, or to implement a Standard Interface for which an
+implementation is available to the public in source code form. A
+"Major Component", in this context, means a major essential component
+(kernel, window system, and so on) of the specific operating system
+(if any) on which the executable work runs, or a compiler used to
+produce the work, or an object code interpreter used to run it.
+
+ The "Corresponding Source" for a work in object code form means all
+the source code needed to generate, install, and (for an executable
+work) run the object code and to modify the work, including scripts to
+control those activities. However, it does not include the work's
+System Libraries, or general-purpose tools or generally available free
+programs which are used unmodified in performing those activities but
+which are not part of the work. For example, Corresponding Source
+includes interface definition files associated with source files for
+the work, and the source code for shared libraries and dynamically
+linked subprograms that the work is specifically designed to require,
+such as by intimate data communication or control flow between those
+subprograms and other parts of the work.
+
+ The Corresponding Source need not include anything that users
+can regenerate automatically from other parts of the Corresponding
+Source.
+
+ The Corresponding Source for a work in source code form is that
+same work.
+
+ 2. Basic Permissions.
+
+ All rights granted under this License are granted for the term of
+copyright on the Program, and are irrevocable provided the stated
+conditions are met. This License explicitly affirms your unlimited
+permission to run the unmodified Program. The output from running a
+covered work is covered by this License only if the output, given its
+content, constitutes a covered work. This License acknowledges your
+rights of fair use or other equivalent, as provided by copyright law.
+
+ You may make, run and propagate covered works that you do not
+convey, without conditions so long as your license otherwise remains
+in force. You may convey covered works to others for the sole purpose
+of having them make modifications exclusively for you, or provide you
+with facilities for running those works, provided that you comply with
+the terms of this License in conveying all material for which you do
+not control copyright. Those thus making or running the covered works
+for you must do so exclusively on your behalf, under your direction
+and control, on terms that prohibit them from making any copies of
+your copyrighted material outside their relationship with you.
+
+ Conveying under any other circumstances is permitted solely under
+the conditions stated below. Sublicensing is not allowed; section 10
+makes it unnecessary.
+
+ 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
+
+ No covered work shall be deemed part of an effective technological
+measure under any applicable law fulfilling obligations under article
+11 of the WIPO copyright treaty adopted on 20 December 1996, or
+similar laws prohibiting or restricting circumvention of such
+measures.
+
+ When you convey a covered work, you waive any legal power to forbid
+circumvention of technological measures to the extent such circumvention
+is effected by exercising rights under this License with respect to
+the covered work, and you disclaim any intention to limit operation or
+modification of the work as a means of enforcing, against the work's
+users, your or third parties' legal rights to forbid circumvention of
+technological measures.
+
+ 4. Conveying Verbatim Copies.
+
+ You may convey verbatim copies of the Program's source code as you
+receive it, in any medium, provided that you conspicuously and
+appropriately publish on each copy an appropriate copyright notice;
+keep intact all notices stating that this License and any
+non-permissive terms added in accord with section 7 apply to the code;
+keep intact all notices of the absence of any warranty; and give all
+recipients a copy of this License along with the Program.
+
+ You may charge any price or no price for each copy that you convey,
+and you may offer support or warranty protection for a fee.
+
+ 5. Conveying Modified Source Versions.
+
+ You may convey a work based on the Program, or the modifications to
+produce it from the Program, in the form of source code under the
+terms of section 4, provided that you also meet all of these conditions:
+
+ a) The work must carry prominent notices stating that you modified
+ it, and giving a relevant date.
+
+ b) The work must carry prominent notices stating that it is
+ released under this License and any conditions added under section
+ 7. This requirement modifies the requirement in section 4 to
+ "keep intact all notices".
+
+ c) You must license the entire work, as a whole, under this
+ License to anyone who comes into possession of a copy. This
+ License will therefore apply, along with any applicable section 7
+ additional terms, to the whole of the work, and all its parts,
+ regardless of how they are packaged. This License gives no
+ permission to license the work in any other way, but it does not
+ invalidate such permission if you have separately received it.
+
+ d) If the work has interactive user interfaces, each must display
+ Appropriate Legal Notices; however, if the Program has interactive
+ interfaces that do not display Appropriate Legal Notices, your
+ work need not make them do so.
+
+ A compilation of a covered work with other separate and independent
+works, which are not by their nature extensions of the covered work,
+and which are not combined with it such as to form a larger program,
+in or on a volume of a storage or distribution medium, is called an
+"aggregate" if the compilation and its resulting copyright are not
+used to limit the access or legal rights of the compilation's users
+beyond what the individual works permit. Inclusion of a covered work
+in an aggregate does not cause this License to apply to the other
+parts of the aggregate.
+
+ 6. Conveying Non-Source Forms.
+
+ You may convey a covered work in object code form under the terms
+of sections 4 and 5, provided that you also convey the
+machine-readable Corresponding Source under the terms of this License,
+in one of these ways:
+
+ a) Convey the object code in, or embodied in, a physical product
+ (including a physical distribution medium), accompanied by the
+ Corresponding Source fixed on a durable physical medium
+ customarily used for software interchange.
+
+ b) Convey the object code in, or embodied in, a physical product
+ (including a physical distribution medium), accompanied by a
+ written offer, valid for at least three years and valid for as
+ long as you offer spare parts or customer support for that product
+ model, to give anyone who possesses the object code either (1) a
+ copy of the Corresponding Source for all the software in the
+ product that is covered by this License, on a durable physical
+ medium customarily used for software interchange, for a price no
+ more than your reasonable cost of physically performing this
+ conveying of source, or (2) access to copy the
+ Corresponding Source from a network server at no charge.
+
+ c) Convey individual copies of the object code with a copy of the
+ written offer to provide the Corresponding Source. This
+ alternative is allowed only occasionally and noncommercially, and
+ only if you received the object code with such an offer, in accord
+ with subsection 6b.
+
+ d) Convey the object code by offering access from a designated
+ place (gratis or for a charge), and offer equivalent access to the
+ Corresponding Source in the same way through the same place at no
+ further charge. You need not require recipients to copy the
+ Corresponding Source along with the object code. If the place to
+ copy the object code is a network server, the Corresponding Source
+ may be on a different server (operated by you or a third party)
+ that supports equivalent copying facilities, provided you maintain
+ clear directions next to the object code saying where to find the
+ Corresponding Source. Regardless of what server hosts the
+ Corresponding Source, you remain obligated to ensure that it is
+ available for as long as needed to satisfy these requirements.
+
+ e) Convey the object code using peer-to-peer transmission, provided
+ you inform other peers where the object code and Corresponding
+ Source of the work are being offered to the general public at no
+ charge under subsection 6d.
+
+ A separable portion of the object code, whose source code is excluded
+from the Corresponding Source as a System Library, need not be
+included in conveying the object code work.
+
+ A "User Product" is either (1) a "consumer product", which means any
+tangible personal property which is normally used for personal, family,
+or household purposes, or (2) anything designed or sold for incorporation
+into a dwelling. In determining whether a product is a consumer product,
+doubtful cases shall be resolved in favor of coverage. For a particular
+product received by a particular user, "normally used" refers to a
+typical or common use of that class of product, regardless of the status
+of the particular user or of the way in which the particular user
+actually uses, or expects or is expected to use, the product. A product
+is a consumer product regardless of whether the product has substantial
+commercial, industrial or non-consumer uses, unless such uses represent
+the only significant mode of use of the product.
+
+ "Installation Information" for a User Product means any methods,
+procedures, authorization keys, or other information required to install
+and execute modified versions of a covered work in that User Product from
+a modified version of its Corresponding Source. The information must
+suffice to ensure that the continued functioning of the modified object
+code is in no case prevented or interfered with solely because
+modification has been made.
+
+ If you convey an object code work under this section in, or with, or
+specifically for use in, a User Product, and the conveying occurs as
+part of a transaction in which the right of possession and use of the
+User Product is transferred to the recipient in perpetuity or for a
+fixed term (regardless of how the transaction is characterized), the
+Corresponding Source conveyed under this section must be accompanied
+by the Installation Information. But this requirement does not apply
+if neither you nor any third party retains the ability to install
+modified object code on the User Product (for example, the work has
+been installed in ROM).
+
+ The requirement to provide Installation Information does not include a
+requirement to continue to provide support service, warranty, or updates
+for a work that has been modified or installed by the recipient, or for
+the User Product in which it has been modified or installed. Access to a
+network may be denied when the modification itself materially and
+adversely affects the operation of the network or violates the rules and
+protocols for communication across the network.
+
+ Corresponding Source conveyed, and Installation Information provided,
+in accord with this section must be in a format that is publicly
+documented (and with an implementation available to the public in
+source code form), and must require no special password or key for
+unpacking, reading or copying.
+
+ 7. Additional Terms.
+
+ "Additional permissions" are terms that supplement the terms of this
+License by making exceptions from one or more of its conditions.
+Additional permissions that are applicable to the entire Program shall
+be treated as though they were included in this License, to the extent
+that they are valid under applicable law. If additional permissions
+apply only to part of the Program, that part may be used separately
+under those permissions, but the entire Program remains governed by
+this License without regard to the additional permissions.
+
+ When you convey a copy of a covered work, you may at your option
+remove any additional permissions from that copy, or from any part of
+it. (Additional permissions may be written to require their own
+removal in certain cases when you modify the work.) You may place
+additional permissions on material, added by you to a covered work,
+for which you have or can give appropriate copyright permission.
+
+ Notwithstanding any other provision of this License, for material you
+add to a covered work, you may (if authorized by the copyright holders of
+that material) supplement the terms of this License with terms:
+
+ a) Disclaiming warranty or limiting liability differently from the
+ terms of sections 15 and 16 of this License; or
+
+ b) Requiring preservation of specified reasonable legal notices or
+ author attributions in that material or in the Appropriate Legal
+ Notices displayed by works containing it; or
+
+ c) Prohibiting misrepresentation of the origin of that material, or
+ requiring that modified versions of such material be marked in
+ reasonable ways as different from the original version; or
+
+ d) Limiting the use for publicity purposes of names of licensors or
+ authors of the material; or
+
+ e) Declining to grant rights under trademark law for use of some
+ trade names, trademarks, or service marks; or
+
+ f) Requiring indemnification of licensors and authors of that
+ material by anyone who conveys the material (or modified versions of
+ it) with contractual assumptions of liability to the recipient, for
+ any liability that these contractual assumptions directly impose on
+ those licensors and authors.
+
+ All other non-permissive additional terms are considered "further
+restrictions" within the meaning of section 10. If the Program as you
+received it, or any part of it, contains a notice stating that it is
+governed by this License along with a term that is a further
+restriction, you may remove that term. If a license document contains
+a further restriction but permits relicensing or conveying under this
+License, you may add to a covered work material governed by the terms
+of that license document, provided that the further restriction does
+not survive such relicensing or conveying.
+
+ If you add terms to a covered work in accord with this section, you
+must place, in the relevant source files, a statement of the
+additional terms that apply to those files, or a notice indicating
+where to find the applicable terms.
+
+ Additional terms, permissive or non-permissive, may be stated in the
+form of a separately written license, or stated as exceptions;
+the above requirements apply either way.
+
+ 8. Termination.
+
+ You may not propagate or modify a covered work except as expressly
+provided under this License. Any attempt otherwise to propagate or
+modify it is void, and will automatically terminate your rights under
+this License (including any patent licenses granted under the third
+paragraph of section 11).
+
+ However, if you cease all violation of this License, then your
+license from a particular copyright holder is reinstated (a)
+provisionally, unless and until the copyright holder explicitly and
+finally terminates your license, and (b) permanently, if the copyright
+holder fails to notify you of the violation by some reasonable means
+prior to 60 days after the cessation.
+
+ Moreover, your license from a particular copyright holder is
+reinstated permanently if the copyright holder notifies you of the
+violation by some reasonable means, this is the first time you have
+received notice of violation of this License (for any work) from that
+copyright holder, and you cure the violation prior to 30 days after
+your receipt of the notice.
+
+ Termination of your rights under this section does not terminate the
+licenses of parties who have received copies or rights from you under
+this License. If your rights have been terminated and not permanently
+reinstated, you do not qualify to receive new licenses for the same
+material under section 10.
+
+ 9. Acceptance Not Required for Having Copies.
+
+ You are not required to accept this License in order to receive or
+run a copy of the Program. Ancillary propagation of a covered work
+occurring solely as a consequence of using peer-to-peer transmission
+to receive a copy likewise does not require acceptance. However,
+nothing other than this License grants you permission to propagate or
+modify any covered work. These actions infringe copyright if you do
+not accept this License. Therefore, by modifying or propagating a
+covered work, you indicate your acceptance of this License to do so.
+
+ 10. Automatic Licensing of Downstream Recipients.
+
+ Each time you convey a covered work, the recipient automatically
+receives a license from the original licensors, to run, modify and
+propagate that work, subject to this License. You are not responsible
+for enforcing compliance by third parties with this License.
+
+ An "entity transaction" is a transaction transferring control of an
+organization, or substantially all assets of one, or subdividing an
+organization, or merging organizations. If propagation of a covered
+work results from an entity transaction, each party to that
+transaction who receives a copy of the work also receives whatever
+licenses to the work the party's predecessor in interest had or could
+give under the previous paragraph, plus a right to possession of the
+Corresponding Source of the work from the predecessor in interest, if
+the predecessor has it or can get it with reasonable efforts.
+
+ You may not impose any further restrictions on the exercise of the
+rights granted or affirmed under this License. For example, you may
+not impose a license fee, royalty, or other charge for exercise of
+rights granted under this License, and you may not initiate litigation
+(including a cross-claim or counterclaim in a lawsuit) alleging that
+any patent claim is infringed by making, using, selling, offering for
+sale, or importing the Program or any portion of it.
+
+ 11. Patents.
+
+ A "contributor" is a copyright holder who authorizes use under this
+License of the Program or a work on which the Program is based. The
+work thus licensed is called the contributor's "contributor version".
+
+ A contributor's "essential patent claims" are all patent claims
+owned or controlled by the contributor, whether already acquired or
+hereafter acquired, that would be infringed by some manner, permitted
+by this License, of making, using, or selling its contributor version,
+but do not include claims that would be infringed only as a
+consequence of further modification of the contributor version. For
+purposes of this definition, "control" includes the right to grant
+patent sublicenses in a manner consistent with the requirements of
+this License.
+
+ Each contributor grants you a non-exclusive, worldwide, royalty-free
+patent license under the contributor's essential patent claims, to
+make, use, sell, offer for sale, import and otherwise run, modify and
+propagate the contents of its contributor version.
+
+ In the following three paragraphs, a "patent license" is any express
+agreement or commitment, however denominated, not to enforce a patent
+(such as an express permission to practice a patent or covenant not to
+sue for patent infringement). To "grant" such a patent license to a
+party means to make such an agreement or commitment not to enforce a
+patent against the party.
+
+ If you convey a covered work, knowingly relying on a patent license,
+and the Corresponding Source of the work is not available for anyone
+to copy, free of charge and under the terms of this License, through a
+publicly available network server or other readily accessible means,
+then you must either (1) cause the Corresponding Source to be so
+available, or (2) arrange to deprive yourself of the benefit of the
+patent license for this particular work, or (3) arrange, in a manner
+consistent with the requirements of this License, to extend the patent
+license to downstream recipients. "Knowingly relying" means you have
+actual knowledge that, but for the patent license, your conveying the
+covered work in a country, or your recipient's use of the covered work
+in a country, would infringe one or more identifiable patents in that
+country that you have reason to believe are valid.
+
+ If, pursuant to or in connection with a single transaction or
+arrangement, you convey, or propagate by procuring conveyance of, a
+covered work, and grant a patent license to some of the parties
+receiving the covered work authorizing them to use, propagate, modify
+or convey a specific copy of the covered work, then the patent license
+you grant is automatically extended to all recipients of the covered
+work and works based on it.
+
+ A patent license is "discriminatory" if it does not include within
+the scope of its coverage, prohibits the exercise of, or is
+conditioned on the non-exercise of one or more of the rights that are
+specifically granted under this License. You may not convey a covered
+work if you are a party to an arrangement with a third party that is
+in the business of distributing software, under which you make payment
+to the third party based on the extent of your activity of conveying
+the work, and under which the third party grants, to any of the
+parties who would receive the covered work from you, a discriminatory
+patent license (a) in connection with copies of the covered work
+conveyed by you (or copies made from those copies), or (b) primarily
+for and in connection with specific products or compilations that
+contain the covered work, unless you entered into that arrangement,
+or that patent license was granted, prior to 28 March 2007.
+
+ Nothing in this License shall be construed as excluding or limiting
+any implied license or other defenses to infringement that may
+otherwise be available to you under applicable patent law.
+
+ 12. No Surrender of Others' Freedom.
+
+ If conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot convey a
+covered work so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you may
+not convey it at all. For example, if you agree to terms that obligate you
+to collect a royalty for further conveying from those to whom you convey
+the Program, the only way you could satisfy both those terms and this
+License would be to refrain entirely from conveying the Program.
+
+ 13. Remote Network Interaction; Use with the GNU General Public License.
+
+ Notwithstanding any other provision of this License, if you modify the
+Program, your modified version must prominently offer all users
+interacting with it remotely through a computer network (if your version
+supports such interaction) an opportunity to receive the Corresponding
+Source of your version by providing access to the Corresponding Source
+from a network server at no charge, through some standard or customary
+means of facilitating copying of software. This Corresponding Source
+shall include the Corresponding Source for any work covered by version 3
+of the GNU General Public License that is incorporated pursuant to the
+following paragraph.
+
+ Notwithstanding any other provision of this License, you have
+permission to link or combine any covered work with a work licensed
+under version 3 of the GNU General Public License into a single
+combined work, and to convey the resulting work. The terms of this
+License will continue to apply to the part which is the covered work,
+but the work with which it is combined will remain governed by version
+3 of the GNU General Public License.
+
+ 14. Revised Versions of this License.
+
+ The Free Software Foundation may publish revised and/or new versions of
+the GNU Affero General Public License from time to time. Such new versions
+will be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+ Each version is given a distinguishing version number. If the
+Program specifies that a certain numbered version of the GNU Affero General
+Public License "or any later version" applies to it, you have the
+option of following the terms and conditions either of that numbered
+version or of any later version published by the Free Software
+Foundation. If the Program does not specify a version number of the
+GNU Affero General Public License, you may choose any version ever published
+by the Free Software Foundation.
+
+ If the Program specifies that a proxy can decide which future
+versions of the GNU Affero General Public License can be used, that proxy's
+public statement of acceptance of a version permanently authorizes you
+to choose that version for the Program.
+
+ Later license versions may give you additional or different
+permissions. However, no additional obligations are imposed on any
+author or copyright holder as a result of your choosing to follow a
+later version.
+
+ 15. Disclaimer of Warranty.
+
+ THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
+APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
+HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
+OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
+IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
+ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+ 16. Limitation of Liability.
+
+ IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
+THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
+GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
+USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
+DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
+PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
+EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGES.
+
+ 17. Interpretation of Sections 15 and 16.
+
+ If the disclaimer of warranty and limitation of liability provided
+above cannot be given local legal effect according to their terms,
+reviewing courts shall apply local law that most closely approximates
+an absolute waiver of all civil liability in connection with the
+Program, unless a warranty or assumption of liability accompanies a
+copy of the Program in return for a fee.
+
+ END OF TERMS AND CONDITIONS
+
+ How to Apply These Terms to Your New Programs
+
+ If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+ To do so, attach the following notices to the program. It is safest
+to attach them to the start of each source file to most effectively
+state the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+ <one line to give the program's name and a brief idea of what it does.>
+ Copyright (C) <year> <name of author>
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+Also add information on how to contact you by electronic and paper mail.
+
+ If your software can interact with users remotely through a computer
+network, you should also make sure that it provides a way for users to
+get its source. For example, if your program is a web application, its
+interface could display a "Source" link that leads users to an archive
+of the code. There are many ways you could offer source, and different
+solutions will be better for different programs; see section 13 for the
+specific requirements.
+
+ You should also get your employer (if you work as a programmer) or school,
+if any, to sign a "copyright disclaimer" for the program, if necessary.
+For more information on this, and how to apply and follow the GNU AGPL, see
+<https://www.gnu.org/licenses/>.
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..cfd01ca9f19da41662de240dd55878163f281763
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,69 @@
+# Run every recipe with bash; abort on the first failing command (-e) and
+# treat a failure anywhere in a pipeline as a failure (-o pipefail).
+SHELL=/bin/bash -eo pipefail
+
+# None of these targets produces a file of the same name, so all of them are
+# declared phony. Otherwise a stray file or directory named e.g. `setup`
+# could make the target look up to date.
+.PHONY: list \
+	setup setup_frontend setup_backend \
+	build build_frontend build_backend \
+	serve_frontend serve_backend \
+	fix fix_frontend fix_backend \
+	check check_frontend check_backend \
+	codegen_gitignore codegen_config \
+	codegen_graphql_backend codegen_graphql_frontend
+
+# Print the names of all targets defined in this Makefile.
+# `grep -E` replaces the obsolescent `egrep`, which warns on GNU grep >= 3.8.
+list:
+	@LC_ALL=C $(MAKE) -pRrq -f $(lastword $(MAKEFILE_LIST)) : 2>/dev/null | \
+	  awk -v RS= -F: '/^# File/,/^# Finished Make data base/ {if ($$1 !~ "^[#.]") {print $$1}}' | \
+	  sort | grep -E -v -e '^[^[:alnum:]]' -e '^$@$$'
+
+# Install frontend and backend dependencies.
+setup: setup_frontend setup_backend
+
+setup_frontend:
+	npm ci
+
+setup_backend:
+	rye sync
+
+# Build the frontend bundle, then the Docker images.
+build: build_frontend build_backend
+
+build_frontend:
+	npx remix vite:build
+
+build_backend: build_frontend
+	docker compose build
+
+# Development servers: frontend on port 7861, backend on port 7860.
+serve_frontend:
+	npx remix vite:dev --host 0.0.0.0 --port 7861
+
+serve_backend:
+	rye run uvicorn backend:app \
+	  --host 0.0.0.0 --port 7860 --log-level debug --reload
+
+# Apply automatic lint and format fixes.
+fix: fix_frontend fix_backend
+
+fix_frontend:
+	npx biome check --apply .
+
+fix_backend:
+	rye lint
+	rye fmt
+
+# Static type checks.
+check: check_frontend check_backend
+
+check_frontend:
+	npx tsc
+
+check_backend:
+	rye run pyright
+
+# Regenerate .gitignore by concatenating the fragments in .gitignore.d/.
+codegen_gitignore:
+	for i in .gitignore.d/*.gitignore; do \
+	  { echo "## $$i START"; cat $$i; echo "## $$i END"; }; \
+	done > .gitignore;
+
+# Render config.jsonnet into files, then pretty-print the compose files.
+# NOTE(review): each yq command is run twice on the same file; confirm the
+# second pass is needed.
+codegen_config:
+	jsonnet -m . config.jsonnet
+	yq -i e -P '.' docker-compose.yml && yq -i e -P '.' docker-compose.yml
+	yq -i e -P '.' docker-compose.vespa.yml && yq -i e -P '.' docker-compose.vespa.yml
+
+# Generate Python types from the GraphQL schema, then rewrite the `typing`
+# import in the first ten lines to `typing_extensions` (gsed if available).
+codegen_graphql_backend:
+	python -m gql_schema_codegen \
+	  -p ./schema/graphql_* \
+	  -t ./backend/generated/schema_types.py
+	$$(command -v gsed &>/dev/null && echo "gsed" || echo "sed") \
+	  -i '1,10 s/^from typing import/from typing_extensions import/' \
+	  backend/generated/schema_types.py
+
+# Generate TypeScript types from the GraphQL schema.
+codegen_graphql_frontend:
+	npx graphql-codegen --config codegen.ts
diff --git a/README.md b/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..742b823822196270323414f4a6f217cb250764ab
--- /dev/null
+++ b/README.md
@@ -0,0 +1,88 @@
+---
+title: Govsearch
+emoji: 🏆
+colorFrom: purple
+colorTo: pink
+sdk: docker
+pinned: true
+license: agpl-3.0
+app_port: 7860
+---
+
+# Govsearch
+
+Govsearch is an unofficial search application for Japanese government
+documents. Like [Govbot](https://www.govbot.go.jp/), it aims to
+improve accessibility to government documents, but it takes a search
+approach rather than a chatbot one. Internally, it uses Vespa as the
+search engine, Starlette for the backend, and Remix for the frontend, and
+it relies on documents provided by Govbot.
+
+## Usage
+
+To run the application, you need to have Docker installed. Then run
+the following commands:
+
+```sh
+echo "OPENAI_API_KEY=sk-xxx" > .env
+docker build -t govsearch . && docker run -it -p 7860:7860 govsearch
+open http://localhost:7860
+```
+
+To develop the application, run the following commands:
+
+```sh
+echo "OPENAI_API_KEY=sk-xxx" > .env
+docker compose up -d
+(cd backend_vespa && make deploy && make feed)
+open http://localhost:7861
+```
+
+## References
+
+- Govbot
+ - https://govbot.go.jp/
+ - https://www.soumu.go.jp/main_sosiki/hyouka/soudan_n/kyotsucb_top.html#faq
+- Backend
+ - https://www.starlette.io/
+ - https://ariadnegraphql.org/docs/intro
+ - https://docs.pydantic.dev/latest/why/
+ - https://github.com/openai/openai-python
+ - https://rye-up.com/
+- Search
+ - https://docs.vespa.ai/en/text-matching.html
+- Graphql
+ - https://graphql.org/learn/
+ - https://the-guild.dev/graphql/codegen/docs/guides/react-vue
+ - https://the-guild.dev/graphql/ws/get-started
+ - https://github.com/enisdenjo/graphql-ws
+ - https://github.com/sauldom102/gql_schema_codegen
+- Frontend
+ - https://remix.run/docs/en/main/file-conventions/vite-config
+ - https://biomejs.dev/linter/rules/
+- UI Libraries
+ - https://tailwindcss.com/docs/utility-first
+ - https://ui.shadcn.com/docs
+- UI Issues
+ - https://github.com/radix-ui/primitives/issues/2783
+ - https://github.com/shadcn-ui/ui/issues/3256
+ - https://github.com/pacocoursey/cmdk/issues/206
+ - https://dninomiya.github.io/form-guide/stop-enter-submit
+- Typescript
+ - https://www.typescriptlang.org/cheatsheets
+ - https://www.typescriptlang.org/docs/handbook/modules/introduction.html
+ - https://www.typescriptlang.org/docs/handbook/declaration-merging.html
+ - https://www.typescriptlang.org/docs/handbook/utility-types.html
+ - https://www.typescriptlang.org/docs/handbook/2/types-from-types.html
+ - https://www.typescriptlang.org/docs/handbook/2/narrowing.html
+- Python
+ - https://docs.python.org/3/glossary.html
+ - https://docs.python.org/3/tutorial/index.html
+ - https://docs.python.org/3/library/ast.html
+ - https://docs.python.org/3/library/typing.html
+ - https://docs.python.org/3/library/asyncio.html
+- Docker
+ - https://github.com/vespa-engine/docker-image/tree/master
+ - https://docs.docker.com/build/building/multi-stage/
+- Git
+ - https://github.com/git-lfs/git-lfs/wiki/Tutorial
diff --git a/backend/__init__.py b/backend/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e7a06401b175df22f4c5b9397986b51cfb0d9b29
--- /dev/null
+++ b/backend/__init__.py
@@ -0,0 +1,33 @@
+from starlette.applications import Starlette
+from starlette.middleware import Middleware
+from starlette.responses import FileResponse
+from starlette.routing import Mount, Route, WebSocketRoute
+from starlette.staticfiles import StaticFiles
+from content_size_limit_asgi import ContentSizeLimitMiddleware
+
+from .graphql import graphql_app
+from .logger import init_logger
+from .cache import cache_questions, cache_generate_summary
+
+# Configure structlog and the root logging handler before the app is built.
+init_logger()
+
+# ASGI application: GraphQL over HTTP and WebSocket, plus the built frontend.
+# NOTE(review): debug=True is hard-coded; confirm it is intended for the
+# deployed image as well as for local development.
+app = Starlette(
+ debug=True,
+ # Both caches are connected on startup and disconnected on shutdown.
+ on_startup=[cache_questions.connect, cache_generate_summary.connect],
+ on_shutdown=[cache_questions.disconnect, cache_generate_summary.disconnect],
+ middleware=[
+ # Limits request content size to 1024 * 32 (presumably bytes, i.e. 32 KiB).
+ Middleware(
+ ContentSizeLimitMiddleware,
+ max_content_size=1024 * 32,
+ ),
+ ],
+ routes=[
+ Route(
+ "/graphql", graphql_app.handle_request, methods=["GET", "POST", "OPTIONS"]
+ ),
+ WebSocketRoute("/ws/graphql", graphql_app.handle_websocket),
+ Mount("/assets", StaticFiles(directory="build/client/assets"), name="assets"),
+ Mount("/favicon.ico", FileResponse("build/client/favicon.ico")),
+ # Catch-all: every other path is answered with the frontend's index.html.
+ Route("/{path:path}", FileResponse("build/client/index.html")),
+ ],
+)
diff --git a/backend/cache.py b/backend/cache.py
new file mode 100644
index 0000000000000000000000000000000000000000..2381d3411a8b19daddd7dc727045f427927dfd5c
--- /dev/null
+++ b/backend/cache.py
@@ -0,0 +1,4 @@
+from caches import Cache
+
+# Module-level caches: connected/disconnected by the Starlette app lifecycle
+# hooks and read/written by the resolvers.
+# NOTE(review): "locmem://" presumably selects an in-process memory backend,
+# so entries would not be shared across processes; confirm.
+cache_questions = Cache("locmem://questions")
+cache_generate_summary = Cache("locmem://generate_summary")
diff --git a/backend/data.py b/backend/data.py
new file mode 100644
index 0000000000000000000000000000000000000000..7dde4c3a7766c8fad617614f9bef36ba608d11be
--- /dev/null
+++ b/backend/data.py
@@ -0,0 +1,117 @@
+from backend.generated.schema_types import Question
+
+
+questions: list[Question] = [
+ {
+ "id": "M-A-b-10",
+ "docId": "M-A-b-10",
+ "categoryMajor": "子育て",
+ "categoryMedium": "児童手当",
+ "categoryMinor": None,
+ "question": "児童手当の申請期限を教えてください。",
+ },
+ {
+ "id": "M-A-b-67",
+ "docId": "M-A-b-67",
+ "categoryMajor": "子育て",
+ "categoryMedium": "ひとり親家庭への支援",
+ "categoryMinor": None,
+ "question": "児童扶養手当について教えてください。",
+ },
+ {
+ "id": "M-B-b-39",
+ "docId": "M-B-b-39",
+ "categoryMajor": "マイナンバー",
+ "categoryMedium": "マイナンバーカード",
+ "categoryMinor": None,
+ "question": "マイナンバーカードに記録されている電子証明書とは何ですか。",
+ },
+ {
+ "id": "M-B-e-1",
+ "docId": "M-B-e-1",
+ "categoryMajor": "マイナンバー",
+ "categoryMedium": "マイナ保険証",
+ "categoryMinor": None,
+ "question": "マイナンバーカードを健康保険証として利用するメリットは何ですか。",
+ },
+ {
+ "id": "M-C-g-1",
+ "docId": "M-C-g-1",
+ "categoryMajor": "医療保険",
+ "categoryMedium": "出産育児一時金",
+ "categoryMinor": None,
+ "question": "出産育児一時金について教えてください。",
+ },
+ {
+ "id": "M-D-d-01",
+ "docId": "M-D-d-01",
+ "categoryMajor": "年金",
+ "categoryMedium": "公的年金を受け取る",
+ "categoryMinor": None,
+ "question": "年金は「いつから」「いくら」受け取れますか。",
+ },
+]
+
+shot_user = """
+## 質問
+不妊治療について教えて下さい
+
+## 参考資料
+[
+ {
+ "categoryMajor": "子育て",
+ "categoryMedium": "不妊症・不育症に関する支援、相談",
+ "categoryMinor": null,
+ "question": "不妊治療を行っている医療機関を教えてください。",
+ "answer": "不妊治療を行っている医療機関は、<こちら,https://funin-fuiku.cfa.go.jp/clinic/>から検索することができます。\n(都道府県名、治療内容、その他の条件から検索できます。)"
+ },
+ {
+ "categoryMajor": "子育て",
+ "categoryMedium": "不妊症・不育症に関する支援、相談",
+ "categoryMinor": null,
+ "question": "働きながら不妊治療を続けることができるか不安です。不妊治療の内容や支援情報について教えてください。",
+ "answer": "働きながら不妊治療を受ける方、それを支える上司や同僚の皆さん向けに、「不妊治療と仕事との両立サポートハンドブック」を作成しています。不妊治療の内容や職場での配慮のポイント、不妊治療の方にとって役立つ情報などをまとめてありますので、詳しくは、<こちら,https://www.mhlw.go.jp/stf/newpage_14408.html>からご覧ください。\n"
+ },
+ {
+ "categoryMajor": "子育て",
+ "categoryMedium": "不妊症・不育症に関する支援、相談",
+ "categoryMinor": null,
+ "question": "不妊症・不育症について相談できる相談窓口を教えてください。",
+ "answer": "不妊症や不育症に悩む夫婦に対し、医学的・専門的な相談や情報提供は、都道府県、指定都市、中核市が設置している性と健康の相談センターで行っております。\n一覧は、<こちら,https://www.cfa.go.jp/policies/boshihoken/seitokenkogaiyo/>からご覧いただけます。"
+ },
+ {
+ "categoryMajor": "子育て",
+ "categoryMedium": "不妊症・不育症に関する支援、相談",
+ "categoryMinor": null,
+ "question": "不妊治療連絡カードとは何ですか。",
+ "answer": "不妊治療連絡カードは、不妊治療を受ける労働者の方が主治医等から診療に基づき治療や検査に必要な配慮事項について、企業の人事労務担当者に的確に伝達するためのカードで、厚生労働省において作成し、活用をお勧めしているものです。\n仕事と不妊治療との両立を行う従業員と企業の方をつなぐツールとしてお役立てください。\n\n不妊治療連絡カードの内容や様式、記載例については、<こちら,https://www.mhlw.go.jp/bunya/koyoukintou/pamphlet/dl/30b.pdf>をご覧ください。"
+ },
+ {
+ "categoryMajor": "子育て",
+ "categoryMedium": "不妊症・不育症に関する支援、相談",
+ "categoryMinor": null,
+ "question": "不妊治療にかかる費用が不安です。治療費の助成はありますか。",
+ "answer": "保険適用となっている不妊治療については、具体的には、タイミング法などの「一般不妊治療」や、採卵・採精から胚移植に至るまでの基本的な「生殖補助医療」が含まれています。令和4年4月から有効性・安全性が示された不妊治療が保険適用となっています。\nまた、これに加えてそれぞれ自治体による助成を受けられる場合があります。申請手続等に関してはお住まいの都道府県又は市区町村にご確認ください。"
+ }
+]
+"""
+
+shot_assistant = """
+不妊治療に関しては、医療機関での治療、助成金の申請、両立支援など多岐にわたる情報が提供されています。これらを通じて、患者の治療と日常生活のサポートが図られています。
+
+- 不妊治療を行っている医療機関は、[こちら](https://funin-fuiku.cfa.go.jp/clinic/)から検索できます。
+- 働きながらの不妊治療支援については、「不妊治療と仕事との両立サポートハンドブック」が[こちら](https://www.mhlw.go.jp/stf/newpage_14408.html)で提供されています。
+- 不妊症や不育症に関する相談は、都道府県や指定都市が設置する性と健康の相談センターで行われており、一覧は[こちら](https://www.cfa.go.jp/policies/boshihoken/seitokenkogaiyo/)から参照できます。
+- 不妊治療連絡カードは、働きながら治療を受ける人が企業の人事労務担当者に必要な配慮事項を伝えるためのツールです。詳細は[こちら](https://www.mhlw.go.jp/bunya/koyoukintou/pamphlet/dl/30b.pdf)。
+- 不妊治療にかかる費用の助成については、保険適用となる治療もあり、自治体による助成も受けられます。詳しくはお住まいの自治体に確認してください。
+- 不妊治療と仕事の両立支援に関する情報は、[厚生労働省のページ](https://www.mhlw.go.jp/stf/newpage_14408.html)で提供されています。
+- 無職で不妊治療中の方には、雇用保険の基本手当の受給が可能です。詳細は[こちら](https://www.mhlw.go.jp/stf/seisakunitsuite/bunya/0000139508.html)。
+
+## 参考資料
+
+- [不妊治療を行っている医療機関の検索ページ](https://funin-fuiku.cfa.go.jp/clinic/)
+- [不妊治療と仕事との両立サポートハンドブック](https://www.mhlw.go.jp/stf/newpage_14408.html)
+- [性と健康の相談センター一覧](https://www.cfa.go.jp/policies/boshihoken/seitokenkogaiyo/)
+- [不妊治療連絡カードについて](https://www.mhlw.go.jp/bunya/koyoukintou/pamphlet/dl/30b.pdf)
+- [雇用保険の基本手当について](https://www.mhlw.go.jp/stf/seisakunitsuite/bunya/0000139508.html)
+"""
diff --git a/backend/generated/schema_types.py b/backend/generated/schema_types.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ae58026da9353ebf2254657acceb4f566e5fb8a
--- /dev/null
+++ b/backend/generated/schema_types.py
@@ -0,0 +1,104 @@
+from typing_extensions import ClassVar, List, Optional, TypedDict
+
+
+Query = TypedDict(
+ "Query",
+ {
+ "questions": "QuestionsQueryResult",
+ "answers": "AnswersQueryResult",
+ },
+)
+
+
+QuestionsParams = TypedDict(
+ "QuestionsParams",
+ {
+ "query": Optional[str],
+ },
+)
+
+
+QuestionsQueryResult = ClassVar[Optional["QuestionsPayload"]]
+
+
+AnswersParams = TypedDict(
+ "AnswersParams",
+ {
+ "query": Optional[str],
+ },
+)
+
+
+AnswersQueryResult = ClassVar[Optional["AnswersPayload"]]
+
+
+Subscription = TypedDict(
+ "Subscription",
+ {
+ "generateSummary": "GenerateSummarySubscriptionResult",
+ },
+)
+
+
+GenerateSummaryParams = TypedDict(
+ "GenerateSummaryParams",
+ {
+ "query": str,
+ "docIds": List[str],
+ },
+)
+
+
+GenerateSummarySubscriptionResult = ClassVar[Optional["GenerateSummaryPayload"]]
+
+
+AnswersPayload = TypedDict(
+ "AnswersPayload",
+ {
+ "answers": Optional[List["Answer"]],
+ },
+)
+
+
+QuestionsPayload = TypedDict(
+ "QuestionsPayload",
+ {
+ "questions": Optional[List["Question"]],
+ },
+)
+
+
+GenerateSummaryPayload = TypedDict(
+ "GenerateSummaryPayload",
+ {
+ "summary": str,
+ },
+)
+
+
+Answer = TypedDict(
+ "Answer",
+ {
+ "id": str,
+ "docId": str,
+ "categoryMajor": Optional[str],
+ "categoryMedium": Optional[str],
+ "categoryMinor": Optional[str],
+ "question": str,
+ "answer": str,
+ "score": Optional[float],
+ },
+)
+
+
+Question = TypedDict(
+ "Question",
+ {
+ "id": str,
+ "docId": str,
+ "categoryMajor": Optional[str],
+ "categoryMedium": Optional[str],
+ "categoryMinor": Optional[str],
+ "question": str,
+ },
+)
diff --git a/backend/graphql.py b/backend/graphql.py
new file mode 100644
index 0000000000000000000000000000000000000000..a5f12334b03ecc6fb013efac2f47d628a388c634
--- /dev/null
+++ b/backend/graphql.py
@@ -0,0 +1,15 @@
+import logging
+from ariadne import load_schema_from_path, make_executable_schema
+from ariadne.asgi import GraphQL
+from ariadne.asgi.handlers import GraphQLTransportWSHandler
+from .resolver import query, subscription
+
+
+# Load the SDL file and bind the Query and Subscription resolvers to it.
+# The path is relative, so it is resolved against the working directory.
+type_defs = load_schema_from_path("schema/graphql_qa.graphql")
+schema = make_executable_schema(type_defs, query, subscription)
+# ASGI GraphQL application; backend/__init__.py mounts its HTTP and WebSocket
+# handlers.
+# NOTE(review): debug=True is hard-coded; confirm it is intended outside
+# development.
+graphql_app = GraphQL(
+ schema,
+ debug=True,
+ logger=logging.getLogger("graphql"),
+ websocket_handler=GraphQLTransportWSHandler(),
+)
diff --git a/backend/logger.py b/backend/logger.py
new file mode 100644
index 0000000000000000000000000000000000000000..c4be7355014c4afd6ae837e8a66496e96012f8d6
--- /dev/null
+++ b/backend/logger.py
@@ -0,0 +1,26 @@
+import logging
+import structlog
+import sys
+from pythonjsonlogger import jsonlogger
+
+
+def init_logger():
+    """Set up structlog on top of stdlib logging with JSON output on stdout.
+
+    structlog events pass through the processor chain below and are handed to
+    the standard ``logging`` module as keyword arguments. A stream handler
+    with a JSON formatter is then added to the root logger.
+    """
+    processor_chain = [
+        structlog.stdlib.filter_by_level,
+        structlog.stdlib.add_logger_name,
+        structlog.stdlib.add_log_level,
+        structlog.stdlib.PositionalArgumentsFormatter(),
+        structlog.processors.StackInfoRenderer(),
+        structlog.processors.format_exc_info,
+        structlog.processors.UnicodeDecoder(),
+        structlog.stdlib.render_to_log_kwargs,
+    ]
+    structlog.configure(
+        processors=processor_chain,
+        logger_factory=structlog.stdlib.LoggerFactory(),
+        wrapper_class=structlog.stdlib.BoundLogger,
+        cache_logger_on_first_use=True,
+    )
+
+    # Route everything that reaches the root logger to stdout as JSON.
+    stdout_handler = logging.StreamHandler(sys.stdout)
+    stdout_handler.setFormatter(jsonlogger.JsonFormatter())
+    logging.getLogger().addHandler(stdout_handler)
diff --git a/backend/resolver.py b/backend/resolver.py
new file mode 100644
index 0000000000000000000000000000000000000000..ff3034147359149b37733d19ff6201a97940ae4c
--- /dev/null
+++ b/backend/resolver.py
@@ -0,0 +1,308 @@
+import asyncio
+import json
+from annotated_types import Len
+import structlog
+from typing import Optional
+from openai.types.chat import ChatCompletionMessageParam
+from pydantic import BaseModel, Field, validator
+from typing_extensions import Annotated, AsyncGenerator, Iterable, Unpack
+from ariadne import ObjectType, SubscriptionType
+from graphql import GraphQLResolveInfo
+from vespa.application import Vespa
+from vespa.io import VespaQueryResponse
+from openai import AsyncOpenAI
+
+from .data import questions as data_questions, shot_user, shot_assistant
+from .cache import cache_questions, cache_generate_summary
+from .settings import VESPA_APP_URL, OPENAI_API_KEY
+from .generated.schema_types import (
+ Answer,
+ AnswersParams,
+ AnswersQueryResult,
+ GenerateSummaryParams,
+ GenerateSummarySubscriptionResult,
+ Question,
+ QuestionsParams,
+ QuestionsQueryResult,
+)
+
+
+# Shared service clients and logger, created once at import time.
+clientVespa = Vespa(url=VESPA_APP_URL)
+# OPENAI_API_KEY is loaded with cast=Secret in settings; str() presumably
+# unwraps it to the plain key the client expects.
+clientOpenAI = AsyncOpenAI(
+ api_key=str(OPENAI_API_KEY),
+)
+logger = structlog.get_logger("qa")
+# Resolver registry for the GraphQL ``Query`` type, filled by the
+# @query.field decorators in this module.
+query = ObjectType("Query")
+
+
+# Shape of the ``fields`` object of one Vespa hit (see QaModel.fields).
+# The three category fields are optional; all others are required.
+class QaFieldModel(BaseModel):
+ sddocname: str
+ documentid: str
+ doc_id: str
+ category_major: Optional[str] = None
+ category_medium: Optional[str] = None
+ category_minor: Optional[str] = None
+ question: str
+ answer: str
+
+
+# One hit of a Vespa query response; the resolvers validate every element of
+# ``res.hits`` against this model in strict mode.
+class QaModel(BaseModel):
+ id: str
+ # Exposed to clients as ``score`` by the ``answers`` resolver.
+ relevance: float
+ source: str
+ fields: QaFieldModel
+
+
+class AnswersParamsModel(BaseModel):
+    """Validated arguments of the ``answers`` GraphQL query.
+
+    ``query`` is an optional string of at most 1024 characters. It defaults
+    to ``None`` because in Pydantic v2 an ``Optional[str]`` annotation without
+    a default is still a *required* field. Without the default, validating
+    params that lack a ``query`` key raises a "Field required" error instead
+    of reaching the resolver's empty-query branch.
+    """
+
+    query: Optional[str] = Field(default=None, strict=True, max_length=1024)
+
+
+@query.field("answers")
+async def resolve_answer(
+    _, info: GraphQLResolveInfo, **params: Unpack[AnswersParams]
+) -> AnswersQueryResult:
+    """Resolve the ``answers`` query with a nearest-neighbor search in Vespa.
+
+    The query text is embedded by Vespa and matched against both the answer
+    and the question embeddings. Up to 20 hits are returned with their
+    relevance as ``score``. A missing or empty query, or a failed Vespa
+    request, yields an empty answer list (a warning is logged).
+
+    Raises:
+        pydantic.ValidationError: If the params or a Vespa hit fail strict
+            validation.
+    """
+    assert info is not None, "Prevent type check error"
+
+    validated = AnswersParamsModel.model_validate(params, strict=True)
+    user_query = validated.query
+    if not user_query:
+        logger.warning("Query is empty", params=validated)
+        return {"answers": []}
+
+    # Escape the characters that would break out of the quoted embed() text;
+    # the backslash must be handled first so later escapes are not doubled.
+    escaped = user_query
+    for raw, quoted in (("\\", "\\\\"), ('"', '\\"'), (":", "\\:"), (")", "\\)")):
+        escaped = escaped.replace(raw, quoted)
+
+    annotation = "{targetHits:100,approximate:false}"
+    nn_answer = f"({annotation}nearestNeighbor(answer_embedding_me5s, q))"
+    nn_question = f"({annotation}nearestNeighbor(question_embedding_me5s, q))"
+    yql = " ".join(["select * from qa where", nn_answer, "or", nn_question])
+
+    async with clientVespa.asyncio() as sess:
+        res: VespaQueryResponse = await sess.query(
+            yql=yql,
+            lang="ja",
+            hits=20,
+            ranking="semantic",
+            body={
+                "input.query(q)": f'embed(multilingual-e5-small, "query: {escaped}")',
+            },
+        )
+
+    if not res.is_successful():
+        logger.warning("Vespa query failed", json=res.json, status=res.status_code)
+        return {"answers": []}
+
+    # Validate every hit before building any payload entry.
+    validated_hits = [QaModel.model_validate(raw_hit, strict=True) for raw_hit in res.hits]
+    answers: list[Answer] = []
+    for item in validated_hits:
+        doc = item.fields
+        answers.append(
+            Answer(
+                id=doc.doc_id,
+                docId=doc.doc_id,
+                categoryMajor=doc.category_major,
+                categoryMedium=doc.category_medium,
+                categoryMinor=doc.category_minor,
+                question=doc.question,
+                answer=doc.answer,
+                score=item.relevance,
+            )
+        )
+
+    return {"answers": answers}
+
+
+class QuestionsParamsModel(BaseModel):
+    """Validated arguments of the ``questions`` GraphQL query.
+
+    ``query`` is an optional string of at most 1024 characters. It defaults
+    to ``None`` because in Pydantic v2 an ``Optional[str]`` annotation without
+    a default is still a *required* field. The resolver handles a ``None``
+    query explicitly, so params lacking the key must validate rather than
+    raise a "Field required" error.
+    """
+
+    query: Optional[str] = Field(default=None, strict=True, max_length=1024)
+
+
+@query.field("questions")
+async def resolve_question(
+    _, info: GraphQLResolveInfo, **params: Unpack[QuestionsParams]
+) -> QuestionsQueryResult:
+    """Resolve the ``questions`` query: questions related to the given text.
+
+    With no query (``None`` or an empty string) the static questions from
+    ``backend.data`` are returned. Otherwise the result is served from
+    ``cache_questions`` when present, or fetched from Vespa by combining user
+    input matching, a ``matches`` condition on the question field and a
+    nearest-neighbor search on the question embedding. If the Vespa request
+    fails, the static questions are returned and nothing is cached.
+
+    Raises:
+        pydantic.ValidationError: If the params or a Vespa hit fail strict
+            validation.
+    """
+    assert info is not None, "Prevent type check error"
+
+    params_parsed = QuestionsParamsModel.model_validate(params, strict=True)
+    questions: list[Question] = data_questions
+
+    query = params_parsed.query
+    # Treat an empty string like a missing query (as ``resolve_answer`` does).
+    # Previously only ``None`` was caught here, so "" reached the cache and
+    # Vespa as ``question matches ""``.
+    if not query:
+        logger.warning("Query is empty", params=params_parsed)
+        return {"questions": questions}
+    # Escape characters that would break out of the quoted YQL / embed() text;
+    # the backslash is replaced first so later escapes are not doubled.
+    query_parsed = (
+        query.replace("\\", "\\\\")
+        .replace('"', '\\"')
+        .replace(":", "\\:")
+        .replace(")", "\\)")
+    )
+
+    # The cache is keyed by the raw (unescaped) query text.
+    cached_questions = await cache_questions.get(query)
+    if isinstance(cached_questions, list):
+        return {"questions": cached_questions}
+
+    base = "select * from qa where"
+    anno = "{targetHits:100,approximate:false}"
+    # userInput() receives the raw text through the @condQuery parameter, so
+    # only the two inlined conditions need the escaped form.
+    cond01 = "({targetHits:100}userInput(@condQuery))"
+    cond02 = f'(question matches "{query_parsed}")'
+    cond03 = f"({anno}nearestNeighbor(question_embedding_me5s, q))"
+
+    async with clientVespa.asyncio() as sess:
+        res: VespaQueryResponse = await sess.query(
+            yql=f"{base} {cond01} or {cond02} or {cond03}",
+            lang="ja",
+            hits=20,
+            ranking="question_semantic",
+            body={
+                "condQuery": query,
+                "input.query(q)": f'embed(multilingual-e5-small, "query: {query_parsed}")',
+            },
+        )
+
+    if not res.is_successful():
+        logger.warning("Vespa query failed", json=res.json, status=res.status_code)
+        return {"questions": questions}
+
+    hits = [QaModel.model_validate(hit, strict=True) for hit in res.hits]
+    questions = [
+        Question(
+            id=hit.fields.doc_id,
+            docId=hit.fields.doc_id,
+            categoryMajor=hit.fields.category_major,
+            categoryMedium=hit.fields.category_medium,
+            categoryMinor=hit.fields.category_minor,
+            question=hit.fields.question,
+        )
+        for hit in hits
+    ]
+
+    await cache_questions.set(query, questions)
+    return {"questions": questions}
+
+
+# Resolver registry for the GraphQL ``Subscription`` type, filled by the
+# @subscription.source / @subscription.field decorators below.
+subscription = SubscriptionType()
+
+
+class GenerateSummaryParamsModel(BaseModel):
+    """Validated arguments of the ``generateSummary`` subscription.
+
+    ``query`` is a string of at most 1024 characters. ``docIds`` holds at
+    most 10 document ids, each at most 1024 characters long.
+    """
+
+    query: str = Field(strict=True, max_length=1024)
+    docIds: Annotated[list[str], Len(max_length=10)]
+
+    @validator("docIds", each_item=True)
+    def check_max_length(cls, v):
+        """Return the id unchanged unless it is longer than 1024 characters."""
+        if len(v) <= 1024:
+            return v
+        raise ValueError("string length exceeds maximum of 1024")
+
+
+@subscription.source("generateSummary")
+async def generate_generate_summary(
+    _, info: GraphQLResolveInfo, **params: Unpack[GenerateSummaryParams]
+) -> AsyncGenerator[str, str]:
+    """Stream an LLM-written answer and summary for ``generateSummary``.
+
+    Flow: validate the arguments; replay a cached summary if one exists for
+    the same query and set of document ids; otherwise fetch the documents
+    from Vespa, ask the OpenAI chat model to answer from those documents,
+    yield each streamed text fragment, and finally cache the full text.
+    The generator ends without yielding when the query or ``docIds`` is
+    empty, the Vespa request fails, or no document is found.
+
+    Args:
+        _: Parent value; unused.
+        info: GraphQL resolve info; only asserted to be present.
+        **params: ``query`` and ``docIds``, validated by
+            ``GenerateSummaryParamsModel``.
+
+    Yields:
+        Text fragments of the summary, in order.
+
+    Raises:
+        pydantic.ValidationError: If the params or a Vespa hit fail strict
+            validation.
+    """
+    assert info is not None, "Prevent type check error"
+
+    params_parsed = GenerateSummaryParamsModel.model_validate(params, strict=True)
+    if not params_parsed.query:
+        logger.warning("No query found", params=params_parsed)
+        return
+
+    doc_ids = params_parsed.docIds or []
+    if not doc_ids:
+        logger.warning("No docIds found", params=params_parsed)
+        return
+
+    # Sorting makes the cache key independent of the order of the ids.
+    key = params_parsed.query + "|" + "|".join(sorted(doc_ids))
+    cached_summary = await cache_generate_summary.get(key)
+    if isinstance(cached_summary, str):
+        # Replay the cached text one character at a time with a short pause,
+        # presumably so that it reaches the client like a live stream.
+        for char in cached_summary:
+            yield char
+            await asyncio.sleep(0.05)
+        return
+
+    # Quote and escape every id before inlining it into the YQL `in (...)` list.
+    query_in = ", ".join(
+        ['"' + x.replace("\\", "\\\\").replace('"', '\\"') + '"' for x in doc_ids]
+    )
+    answers = []
+    async with clientVespa.asyncio() as sess:
+        # NOTE(review): hits=5 caps the result at five documents although up
+        # to ten docIds are accepted; confirm this limit is intentional.
+        res: VespaQueryResponse = await sess.query(
+            yql=f"select * from qa where doc_id in ({query_in})",
+            lang="ja",
+            hits=5,
+        )
+
+    if not res.is_successful():
+        logger.warning("Vespa query failed", json=res.json, status=res.status_code)
+        return
+
+    hits = [QaModel.model_validate(hit, strict=True) for hit in res.hits]
+    answers = [
+        {
+            "docId": hit.fields.doc_id,
+            "answer": hit.fields.answer,
+            "score": hit.relevance,
+        }
+        for hit in hits
+    ]
+
+    if not answers:
+        logger.warning("No answers found", params=params_parsed)
+        return
+
+    system = """あなたには質問(question)と参考資料(references)が与えられます。
+あなたの仕事は以下の2つです。
+
+- 与えられた参考資料にかかれている情報のみを使って質問に回答する。
+- 参考資料の要約をわかりやすくまとめる。
+
+以下のルールに従ってください:
+
+- 回答には参考資料に書かれている正確な情報のみを反映してください。
+- 回答や要約には外部の情報や暗黙の知識は反映しないでください。
+
+以下のフォーマットで出力してください:
+
+```
+### 回答
+
+ここに回答を書いてください。
+
+### 要約
+
+ここに要約を書いてください。
+```
+"""
+    user = f"""
+## 質問
+{params_parsed.query}
+
+## 参考資料
+{json.dumps(answers, ensure_ascii=False, indent=2)}
+"""
+    # System instruction, one worked example (user + assistant), then the
+    # real request.
+    messages: Iterable[ChatCompletionMessageParam] = [
+        {"role": "system", "name": "instruction", "content": system},
+        {"role": "user", "name": "info", "content": shot_user},
+        {"role": "assistant", "name": "summary", "content": shot_assistant},
+        {"role": "user", "name": "info", "content": user},
+    ]
+    # Use the structured logger at debug level rather than print(), so the
+    # full prompt is not dumped to stdout unconditionally.
+    logger.debug("OpenAI chat completions", messages=messages)
+    stream = await clientOpenAI.chat.completions.create(
+        messages=messages,
+        model="gpt-4-turbo-2024-04-09",
+        stream=True,
+    )
+
+    summary = ""
+    async for chunk in stream:
+        # A chunk may carry no choices; skip it instead of raising IndexError.
+        if not chunk.choices:
+            continue
+        content = chunk.choices[0].delta.content or ""
+        summary += content
+        # FIXME: sanitize to return only elements that are not dangerous as markdown
+        yield content
+
+    # Cached only after the stream completed, so partial output is never stored.
+    await cache_generate_summary.set(key, summary)
+    return
+
+
+# Field resolver for the ``generateSummary`` subscription: wraps one text
+# fragment (``summary``) in the payload dict with a "summary" key.
+@subscription.field("generateSummary")
+def resolve_generate_summary(
+ summary: str, info: GraphQLResolveInfo, **params: Unpack[GenerateSummaryParams]
+) -> GenerateSummarySubscriptionResult:
+ # The assertion only references the otherwise unused arguments.
+ assert info and params, "Prevent type check error"
+ return {"summary": summary}
diff --git a/backend/settings.py b/backend/settings.py
new file mode 100644
index 0000000000000000000000000000000000000000..9f1258bf5e1b46f2d4659265c572e94199dc06c3
--- /dev/null
+++ b/backend/settings.py
@@ -0,0 +1,9 @@
+from starlette.config import Config
+from starlette.datastructures import Secret
+
+# Settings are read through Starlette's Config, which is given ".env" as its
+# env file.
+config = Config(env_file=".env")
+
+# Base URL of the Vespa application; defaults to a local instance.
+VESPA_APP_URL = config("VESPA_APP_URL", default="http://localhost:4080")
+# Loaded as a Secret. The placeholder default lets the module import without
+# a key being configured.
+OPENAI_API_KEY = config(
+ "OPENAI_API_KEY", default="__OPENAI_API_KEY_NOT_SET__", cast=Secret
+)
diff --git a/backend_vespa/Makefile b/backend_vespa/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..8e474846deaf9f6109b87901d70ec2cb73ed979d
--- /dev/null
+++ b/backend_vespa/Makefile
@@ -0,0 +1,50 @@
+# Run every recipe with bash; abort on the first failing command (-e) and
+# treat a failure anywhere in a pipeline as a failure (-o pipefail).
+SHELL=/bin/bash -eo pipefail
+
+# All targets are commands rather than files. The previous list named
+# targets that do not exist in this file (`setup`, `convert_to_onix`) and
+# omitted most of the real ones.
+.PHONY: list clean package deploy feed check_reindex \
+	convert_to_onix_base copy_model_to_app_base \
+	convert_to_onix_small copy_model_to_app_small \
+	remove_model_from_app
+
+# Print the names of all targets defined in this Makefile.
+# `grep -E` replaces the obsolescent `egrep`, which warns on GNU grep >= 3.8.
+list:
+	@LC_ALL=C $(MAKE) -pRrq -f $(lastword $(MAKEFILE_LIST)) : 2>/dev/null | \
+	  awk -v RS= -F: '/^# File/,/^# Finished Make data base/ {if ($$1 !~ "^[#.]") {print $$1}}' | \
+	  sort | grep -E -v -e '^[^[:alnum:]]' -e '^$@$$'
+
+# Remove the Maven build output.
+clean:
+	rm -rf target
+
+# Build the Vespa application package from a clean tree.
+package: clean
+	mvn package
+
+# Package, then deploy to the local Vespa target (waits up to 300 seconds).
+deploy: package
+	vespa deploy -t local --wait 300
+
+# Feed the JSONL document set into Vespa.
+feed:
+	vespa feed - < ../data/000936301.jsonl
+
+# Show the reindexing status of the deployed application.
+check_reindex:
+	vespa curl -s deploy \
+	  /application/v2/tenant/default/application/default/environment/prod/region/default/instance/default/reindexing | jq .
+
+# Export multilingual-e5-base to ONNX and quantize it (AVX2).
+convert_to_onix_base:
+	rye run python ../scripts/export_hf_model_from_hf.py \
+	  --hf_model intfloat/multilingual-e5-base --output_dir multilingual-e5-base
+	rye run optimum-cli onnxruntime quantize \
+	  --onnx_model multilingual-e5-base --avx2 -o multilingual-e5-base-quantized
+
+# Copy the quantized base model and tokenizer into the application package.
+copy_model_to_app_base:
+	cp -a multilingual-e5-base-quantized/model_quantized.onnx \
+	  src/main/application/model/multilingual-e5-base-model_quantized.onnx
+	cp -a multilingual-e5-base-quantized/tokenizer.json \
+	  src/main/application/model/multilingual-e5-base-tokenizer.json
+
+# Export multilingual-e5-small to ONNX and quantize it (AVX2).
+convert_to_onix_small:
+	rye run python ../scripts/export_hf_model_from_hf.py \
+	  --hf_model intfloat/multilingual-e5-small --output_dir multilingual-e5-small
+	rye run optimum-cli onnxruntime quantize \
+	  --onnx_model multilingual-e5-small --avx2 -o multilingual-e5-small-quantized
+
+# Copy the quantized small model and tokenizer into the application package.
+copy_model_to_app_small:
+	cp -a multilingual-e5-small-quantized/model_quantized.onnx \
+	  src/main/application/model/multilingual-e5-small-model_quantized.onnx
+	cp -a multilingual-e5-small-quantized/tokenizer.json \
+	  src/main/application/model/multilingual-e5-small-tokenizer.json
+
+# Delete every model file from the application package.
+remove_model_from_app:
+	rm -f src/main/application/model/*
diff --git a/backend_vespa/multilingual-e5-small-quantized/model_quantized.onnx b/backend_vespa/multilingual-e5-small-quantized/model_quantized.onnx
new file mode 100644
index 0000000000000000000000000000000000000000..fc07dea98e84b0d4915da60c384f22516c9cb3af
--- /dev/null
+++ b/backend_vespa/multilingual-e5-small-quantized/model_quantized.onnx
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:15609ad1718cd4565e6e94f02abdee21ae0840ea44fb54d31cc07a6908072f60
+size 118322572
diff --git a/backend_vespa/multilingual-e5-small-quantized/ort_config.json b/backend_vespa/multilingual-e5-small-quantized/ort_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..a479562c1aff2e263998a0c074c9f196508d3388
--- /dev/null
+++ b/backend_vespa/multilingual-e5-small-quantized/ort_config.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5ca2232004d70b9312da2e771e157746e1794c5dca52d35beae0d94274d9e04c
+size 831
diff --git a/backend_vespa/multilingual-e5-small-quantized/sentencepiece.bpe.model b/backend_vespa/multilingual-e5-small-quantized/sentencepiece.bpe.model
new file mode 100644
index 0000000000000000000000000000000000000000..7a3f40a75f870bc1f21700cd414dc2acc431583c
--- /dev/null
+++ b/backend_vespa/multilingual-e5-small-quantized/sentencepiece.bpe.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865
+size 5069051
diff --git a/backend_vespa/multilingual-e5-small-quantized/special_tokens_map.json b/backend_vespa/multilingual-e5-small-quantized/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..24c989f33e4c90e04b572df270ce5476ab1bdcae
--- /dev/null
+++ b/backend_vespa/multilingual-e5-small-quantized/special_tokens_map.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8c785abebea9ae3257b61681b4e6fd8365ceafde980c21970d001e834cf10835
+size 964
diff --git a/backend_vespa/multilingual-e5-small-quantized/tokenizer.json b/backend_vespa/multilingual-e5-small-quantized/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..fa8740f5f873cae8af943d8efd879ecc48b21944
--- /dev/null
+++ b/backend_vespa/multilingual-e5-small-quantized/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:46877b3f800bf28b4cda6ab2197210ef1fe17c878b20e3a41819ee0ea7d27b76
+size 11082630
diff --git a/backend_vespa/multilingual-e5-small-quantized/tokenizer_config.json b/backend_vespa/multilingual-e5-small-quantized/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..6f2faa4ec1ef0dc90ca98c5b434dc6b981e276b3
--- /dev/null
+++ b/backend_vespa/multilingual-e5-small-quantized/tokenizer_config.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:abb78061a3b9cc3aa7623520f08cc4c2d10942ca1531bdf8f303937b2e7a3b56
+size 1172
diff --git a/backend_vespa/multilingual-e5-small/model.onnx b/backend_vespa/multilingual-e5-small/model.onnx
new file mode 100644
index 0000000000000000000000000000000000000000..393c423c11479a71c9e9e3e3fa68f1b4619863b4
--- /dev/null
+++ b/backend_vespa/multilingual-e5-small/model.onnx
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:66775c81c6056fe3370488807401f814a30d9467b18d01b3f0b116fb5b34a0c8
+size 470869701
diff --git a/backend_vespa/multilingual-e5-small/sentencepiece.bpe.model b/backend_vespa/multilingual-e5-small/sentencepiece.bpe.model
new file mode 100644
index 0000000000000000000000000000000000000000..7a3f40a75f870bc1f21700cd414dc2acc431583c
--- /dev/null
+++ b/backend_vespa/multilingual-e5-small/sentencepiece.bpe.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865
+size 5069051
diff --git a/backend_vespa/multilingual-e5-small/special_tokens_map.json b/backend_vespa/multilingual-e5-small/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..41a246cfab18535444cfbea9525e44f4b9077023
--- /dev/null
+++ b/backend_vespa/multilingual-e5-small/special_tokens_map.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:06e405a36dfe4b9604f484f6a1e619af1a7f7d09e34a8555eb0b77b66318067f
+size 280
diff --git a/backend_vespa/multilingual-e5-small/tokenizer.json b/backend_vespa/multilingual-e5-small/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..f57df2f13f5ce2bd0fff6b74f3d7204610d21e92
--- /dev/null
+++ b/backend_vespa/multilingual-e5-small/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2b3f6fc9286922cf30646a1957c81e5655f977d2204c9631b7624f21d6c641b5
+size 17082757
diff --git a/backend_vespa/multilingual-e5-small/tokenizer_config.json b/backend_vespa/multilingual-e5-small/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..6f2faa4ec1ef0dc90ca98c5b434dc6b981e276b3
--- /dev/null
+++ b/backend_vespa/multilingual-e5-small/tokenizer_config.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:abb78061a3b9cc3aa7623520f08cc4c2d10942ca1531bdf8f303937b2e7a3b56
+size 1172
diff --git a/backend_vespa/pom.xml b/backend_vespa/pom.xml
new file mode 100644
index 0000000000000000000000000000000000000000..42259074fb82b43863c22ccf1e76722a9fce0b13
--- /dev/null
+++ b/backend_vespa/pom.xml
@@ -0,0 +1,74 @@
+
+
+ 4.0.0
+ jp.hicustomer
+ govsearch
+ 0.0.1
+ container-plugin
+
+ false
+ UTF-8
+ true
+ 8.324.16
+
+
+
+
+ com.yahoo.vespa
+ lucene-linguistics
+ ${vespa_version}
+
+
+ com.yahoo.vespa
+ linguistics
+ ${vespa_version}
+ provided
+
+
+ com.yahoo.vespa
+ application
+ ${vespa_version}
+ provided
+
+
+
+
+
+
+ com.yahoo.vespa
+ bundle-plugin
+ ${vespa_version}
+ true
+
+ false
+
+
+
+ com.yahoo.vespa
+ vespa-application-maven-plugin
+ ${vespa_version}
+
+
+
+ packageApplication
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-compiler-plugin
+ 3.11.0
+
+ 17
+ 17
+
+
+
+
+
diff --git a/backend_vespa/src/main/application/components/kuromoji-linguistics-2.0.3.jar b/backend_vespa/src/main/application/components/kuromoji-linguistics-2.0.3.jar
new file mode 100644
index 0000000000000000000000000000000000000000..6a4fcdc438650aa5a4c1c073c4cec1ffc566fa36
--- /dev/null
+++ b/backend_vespa/src/main/application/components/kuromoji-linguistics-2.0.3.jar
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0eb48ee595d6df45687964229b322d670638a712cc808e1d71bdd802e31423d6
+size 13179612
diff --git a/backend_vespa/src/main/application/model/.keep b/backend_vespa/src/main/application/model/.keep
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/backend_vespa/src/main/application/model/multilingual-e5-small-model_quantized.onnx b/backend_vespa/src/main/application/model/multilingual-e5-small-model_quantized.onnx
new file mode 100644
index 0000000000000000000000000000000000000000..fc07dea98e84b0d4915da60c384f22516c9cb3af
--- /dev/null
+++ b/backend_vespa/src/main/application/model/multilingual-e5-small-model_quantized.onnx
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:15609ad1718cd4565e6e94f02abdee21ae0840ea44fb54d31cc07a6908072f60
+size 118322572
diff --git a/backend_vespa/src/main/application/model/multilingual-e5-small-tokenizer.json b/backend_vespa/src/main/application/model/multilingual-e5-small-tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..fa8740f5f873cae8af943d8efd879ecc48b21944
--- /dev/null
+++ b/backend_vespa/src/main/application/model/multilingual-e5-small-tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:46877b3f800bf28b4cda6ab2197210ef1fe17c878b20e3a41819ee0ea7d27b76
+size 11082630
diff --git a/backend_vespa/src/main/application/schemas/qa.sd b/backend_vespa/src/main/application/schemas/qa.sd
new file mode 100644
index 0000000000000000000000000000000000000000..8e8ecc0b7b4c3c51687f9e3d4db2d6c9b707aa0f
--- /dev/null
+++ b/backend_vespa/src/main/application/schemas/qa.sd
@@ -0,0 +1,116 @@
+schema qa { # Q&A search schema: lexical match fields, e5 embeddings and two semantic rank profiles
+ document qa { # fields fed with each document
+ field lang type string {
+ indexing: set_language # sets the document language from this field's value
+ }
+
+ field doc_id type string {
+ indexing: attribute | summary
+ attribute: fast-search # build a lookup structure over the attribute
+ rank: filter # used for filtering only, contributes nothing to ranking
+ }
+
+ field category_major type string {
+ indexing: summary | attribute
+ }
+
+ field category_medium type string {
+ indexing: summary | attribute
+ }
+
+ field category_minor type string {
+ indexing: summary | attribute
+ }
+
+ field question type string { # raw question text; the searchable variants are the question_* fields below
+ indexing: summary | attribute
+ }
+
+ field answer type string {
+ indexing: summary | index
+ index: enable-bm25 # keep the statistics needed by the bm25() rank feature
+ }
+ } # end of document qa; the fields below are derived from `question` / `answer` via `input`
+
+ field question_exact type string { # copy of `question` for whole-value matching
+ indexing: input question | index
+ match: word # no tokenization: the whole value is matched as a single token
+ }
+
+ field question_bm25 type string { # copy of `question` indexed as text for BM25 ranking
+ indexing: input question | index
+ index: enable-bm25
+ }
+
+ field question_ngram type string { # copy of `question` indexed as character n-grams
+ indexing: input question | index
+ match {
+ gram
+ gram-size: 2
+ }
+ }
+
+ field question_embedding_me5s type tensor(x[384]) { # 384-dim embedding of "passage: " + question; embedder id multilingual-e5-small is presumably declared in services.xml - TODO confirm
+ indexing: {
+ "passage: " . input question | embed multilingual-e5-small | attribute
+ }
+ attribute {
+ distance-metric: angular
+ }
+ }
+
+ field answer_embedding_me5s type tensor(x[384]) { # 384-dim embedding of "passage: " + answer; NOTE(review): attribute only (no `index`), so no HNSW index is built for either embedding - confirm exact search is intended
+ indexing: {
+ "passage: " . input answer | embed multilingual-e5-small | attribute
+ }
+ attribute {
+ distance-metric: angular
+ }
+ }
+
+ fieldset default { # default search target: BM25-enabled question text plus answer text
+ fields: question_bm25, answer
+ }
+
+ fieldset question { # targets the raw `question` attribute only
+ fields: question
+ }
+
+ rank-profile semantic inherits default { # equal-weight blend of question and answer similarity to query tensor q (384 dims, same shape as the document embeddings)
+ inputs {
+ query(q) tensor(x[384])
+ }
+ function question_semantic() { # cos() of the angular distance = cosine similarity, clamped at 0; NOTE(review): presumably relies on a nearestNeighbor operator over this field in the query - verify
+ expression: max(cos(distance(field, question_embedding_me5s)), 0)
+ }
+ function answer_semantic() { # same computation against the answer embedding
+ expression: max(cos(distance(field, answer_embedding_me5s)), 0)
+ }
+ first-phase { # score = 0.5 * question similarity + 0.5 * answer similarity
+ expression {
+ question_semantic * 0.5 + answer_semantic * 0.5
+ }
+ }
+ match-features {
+ question_semantic
+ answer_semantic
+ }
+ }
+
+ rank-profile question_semantic inherits default { # question-only variant of `semantic`
+ inputs {
+ query(q) tensor(x[384])
+ }
+ function question_semantic() { # NOTE(review): identical to the function in `semantic`; inheriting from `semantic` would avoid the duplication
+ expression: max(cos(distance(field, question_embedding_me5s)), 0)
+ }
+ first-phase { # score = question similarity only
+ expression {
+ question_semantic
+ }
+ }
+ match-features {
+ question_semantic
+ }
+ }
+}
diff --git a/backend_vespa/src/main/application/services.xml b/backend_vespa/src/main/application/services.xml
new file mode 100644
index 0000000000000000000000000000000000000000..fb226d5025e54880609a0bbd0ad7972d75042efe
--- /dev/null
+++ b/backend_vespa/src/main/application/services.xml
@@ -0,0 +1,54 @@
+
+
+
+
+
+
+ search
+ true
+ false
+ true
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 2
+
+
+
+
+
+
+
+
+
+ 0.95
+ 0.8
+
+
+
+
diff --git a/backend_vespa/src/main/application/validation-overrides.xml b/backend_vespa/src/main/application/validation-overrides.xml
new file mode 100644
index 0000000000000000000000000000000000000000..d9b531c6581894841a232d071108d290a0c0e877
--- /dev/null
+++ b/backend_vespa/src/main/application/validation-overrides.xml
@@ -0,0 +1,4 @@
+
+
+ indexing-change
+
diff --git a/biome.json b/biome.json
new file mode 100644
index 0000000000000000000000000000000000000000..77e5ea94303fc6e7e40008bedfdd0ebe195a03ef
--- /dev/null
+++ b/biome.json
@@ -0,0 +1,36 @@
+{
+ "$schema": "https://biomejs.dev/schemas/1.6.3/schema.json",
+ "organizeImports": {
+ "enabled": true
+ },
+ "files": {
+ "include": [
+ "*.ts",
+ "*.json",
+ "*.yml",
+ "frontend/**/*.ts",
+ "frontend/**/*.tsx",
+ "scripts/**/*.ts"
+ ],
+ "ignore": [
+ "backend_vespa/multilingual-e5*",
+ "backend_vespa/src/main/application/model",
+ "backend_vespa/target"
+ ]
+ },
+ "formatter": {
+ "indentStyle": "space"
+ },
+ "javascript": {
+ "formatter": {
+ "semicolons": "asNeeded"
+ }
+ },
+ "linter": {
+ "enabled": true,
+ "ignore": ["frontend/generated/*.ts"],
+ "rules": {
+ "recommended": true
+ }
+ }
+}
diff --git a/codegen.ts b/codegen.ts
new file mode 100644
index 0000000000000000000000000000000000000000..740b24283a9820c2a2c8bbe39270a8d80b074765
--- /dev/null
+++ b/codegen.ts
@@ -0,0 +1,20 @@
+import type { CodegenConfig } from "@graphql-codegen/cli"
+
+const config: CodegenConfig = { // GraphQL Code Generator settings for the frontend's typed operations
+ overwrite: true, // regenerate output files even if they already exist
+ schema: "schema/graphql_*.graphql", // glob of the GraphQL schema files
+ documents: "frontend/**/*.{ts,tsx}", // frontend sources scanned for GraphQL operations
+ ignoreNoDocuments: true, // do not fail when no operations are found
+ generates: {
+ "frontend/generated/": { // output directory (excluded from linting in biome.json)
+ preset: "client",
+ plugins: [], // nothing beyond what the client preset provides
+ config: {
+ enumsAsTypes: true, // emit GraphQL enums as string-literal union types
+ strictScalars: true, // error on custom scalars that have no explicit mapping
+ },
+ },
+ },
+}
+
+export default config
diff --git a/components.json b/components.json
new file mode 100644
index 0000000000000000000000000000000000000000..1b1db8a48fff1055bac2004e1efffd2e43df265b
--- /dev/null
+++ b/components.json
@@ -0,0 +1,17 @@
+{
+ "$schema": "https://ui.shadcn.com/schema.json",
+ "style": "default",
+ "rsc": false,
+ "tsx": true,
+ "tailwind": {
+ "config": "tailwind.config.ts",
+ "css": "frontend/tailwind.css",
+ "baseColor": "slate",
+ "cssVariables": true,
+ "prefix": ""
+ },
+ "aliases": {
+ "components": "~/components",
+ "utils": "~/lib/utils"
+ }
+}
diff --git a/config.jsonnet b/config.jsonnet
new file mode 100644
index 0000000000000000000000000000000000000000..23e0e8a349fb8bbf839b5e996965c4a238dfd0d4
--- /dev/null
+++ b/config.jsonnet
@@ -0,0 +1,137 @@
+local docker_compose = { // base service definition extended by the frontend and backend services
+ image: 'govsearch:latest', // image built from the Dockerfile in the repository root (`build: '.'`)
+ build: '.',
+ entrypoint: ['/usr/bin/env'], // each service's `command` is run as arguments to env(1)
+ environment: {
+ TZ: 'UTC',
+ },
+};
+local docker_compose_frontend = docker_compose { // frontend service: Remix Vite dev server
+ command: [
+ 'npx',
+ 'remix',
+ 'vite:dev',
+ '--host',
+ '0.0.0.0', // listen on all interfaces so the published port is reachable from the host
+ '--port',
+ '7861',
+ ],
+ ports: ['7861:7861'],
+ depends_on: ['backend'], // started after the backend service
+};
+local docker_compose_backend = docker_compose { // backend service: uvicorn serving `backend:app`
+ command: [
+ 'uvicorn',
+ 'backend:app',
+ '--host',
+ '0.0.0.0',
+ '--port',
+ '7860',
+ '--log-level',
+ 'debug',
+ '--reload', // development setting: restart on source changes
+ ],
+ environment+: { // `+:` merges into the base environment (TZ is kept) instead of replacing it
+ OPENAI_API_KEY: null, // NOTE(review): a null value is presumably resolved from the host environment by Compose - confirm
+ },
+ ports: ['7860:7860'],
+ depends_on: ['vespa'], // started after the vespa service
+};
+local docker_compose_vespa = { // Vespa service; standalone, does not extend the docker_compose base
+ image: 'vespaengine/vespa:8.324.16', // same version as vespa_version in backend_vespa/pom.xml
+ volumes: [
+ 'vespa:/opt/vespa/var', // named volume `vespa` mounted at /opt/vespa/var
+ ],
+ ports: [
+ '4080:4080',
+ '19071:19071',
+ '19092:19092',
+ ],
+ // NOTE: https://github.com/vespa-engine/vespa/blob/master/vespabase/src/vespa.service.in
+ ulimits: { // file-descriptor and process limits; see the vespa.service.in reference above
+ nofile: { soft: 32768, hard: 262144 },
+ nproc: { soft: 32768, hard: 409600 },
+ },
+};
+
+local tsconfig_compiler_options = { // compiler options shared by tsconfig.json and tsconfig.scripts.json
+ allowJs: false,
+ allowSyntheticDefaultImports: true,
+ allowUnreachableCode: false,
+ esModuleInterop: true,
+ experimentalDecorators: false,
+ forceConsistentCasingInFileNames: true,
+ incremental: true,
+ isolatedModules: true,
+ noEmit: true, // type-check only; tsc writes no output files
+ noFallthroughCasesInSwitch: true,
+ noImplicitAny: true,
+ noUncheckedIndexedAccess: true, // indexed access yields `T | undefined`
+ resolveJsonModule: true,
+ skipLibCheck: false, // dependency .d.ts files are type-checked as well
+ strict: true,
+ strictNullChecks: true, // already implied by `strict`; spelled out explicitly
+ target: 'ES2022',
+};
+local tsconfig = { // tsconfig.json: type-checking configuration for the frontend
+ compilerOptions: tsconfig_compiler_options { // shared options plus frontend-specific ones
+ baseUrl: '.',
+ jsx: 'react-jsx',
+ module: 'ESNext',
+ moduleResolution: 'Bundler',
+ lib: ['DOM', 'DOM.Iterable', 'ES2022'],
+ paths: {
+ '~/*': ['./frontend/*'], // `~/` import alias resolves into frontend/
+ },
+ },
+ include: [
+ 'env.d.ts',
+ 'frontend/**/*.ts',
+ 'frontend/**/*.tsx',
+ ],
+};
+local tsconfig_scripts = { // tsconfig.scripts.json: type-checking configuration for scripts/
+ compilerOptions: tsconfig_compiler_options { // shared options plus script-specific ones
+ baseUrl: '.',
+ module: 'commonjs',
+ moduleResolution: 'Node',
+ lib: ['DOM', 'DOM.Iterable', 'ES2022'],
+ paths: {
+ '~/*': ['./scripts/*'], // here the `~/` alias resolves into scripts/, not frontend/
+ },
+ },
+ include: [
+ 'scripts/**/*.ts',
+ ],
+};
+
+{ // output map: each key is a file name, each value that file's content
+ 'docker-compose.yml': std.manifestYamlDoc( // full stack: frontend, backend and vespa
+ {
+ version: '3.4',
+ services: {
+ frontend: docker_compose_frontend,
+ backend: docker_compose_backend,
+ vespa: docker_compose_vespa,
+ },
+ volumes: {
+ vespa: null, // declares the named volume with default settings
+ },
+ },
+ indent_array_in_object=true, // indent list items nested under a key
+ ),
+ 'docker-compose.vespa.yml': std.manifestYamlDoc( // vespa service only
+ {
+ version: '3.4',
+ services: {
+ vespa: docker_compose_vespa,
+ },
+ volumes: {
+ vespa: null,
+ },
+ },
+ indent_array_in_object=true,
+ ),
+ 'tsconfig.json': tsconfig, // NOTE(review): object value, while the YAML entries above are strings - confirm the jsonnet invocation handles both
+ 'tsconfig.scripts.json': tsconfig_scripts,
+}
diff --git a/data/.keep b/data/.keep
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/data/000936301.csv b/data/000936301.csv
new file mode 100644
index 0000000000000000000000000000000000000000..3f305247096fc3eebe459d27e0c74452563bde4f
--- /dev/null
+++ b/data/000936301.csv
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d324ae1afd894466a994aeb79fa1fa04e5ba18c59dc11c9f3da7586f274082a2
+size 1081969
diff --git a/data/000936301.jsonl b/data/000936301.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..992b25d6adeea471845f5ca9e407886847547a6c
--- /dev/null
+++ b/data/000936301.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0e9480be772b4097ee242d4874d0597c3e7a97f0b6cf66296ed15237caf94e84
+size 1284040
diff --git a/data/000936301_lf.csv b/data/000936301_lf.csv
new file mode 100644
index 0000000000000000000000000000000000000000..975d2852410a1262692cdf83add721c557155a98
--- /dev/null
+++ b/data/000936301_lf.csv
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1553f16ff38e7e7e1667f18df97493610980712ce917cef6ce482021a184874a
+size 1080510
diff --git a/data/000936301_normalized.jsonl b/data/000936301_normalized.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..df3a4e62b37256dab146e2ab7da689e0d15bae10
--- /dev/null
+++ b/data/000936301_normalized.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e3a5d96de2ddc133bc233166bd72159a20eaefac77ea83c578d865a455f2e79d
+size 1223209
diff --git a/data/000936301_q.json b/data/000936301_q.json
new file mode 100644
index 0000000000000000000000000000000000000000..9df883b79824f4ddcf245b052d37f2b54adcca80
--- /dev/null
+++ b/data/000936301_q.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:94c82213e7d5bebc04ea7d91ecf2eede80b426e640f1c27ab14b1ceb5439278f
+size 5060
diff --git a/docker-compose.vespa.yml b/docker-compose.vespa.yml
new file mode 100644
index 0000000000000000000000000000000000000000..4148f0b8d94913905512fa76b9a22cce61bbb50b
--- /dev/null
+++ b/docker-compose.vespa.yml
@@ -0,0 +1,19 @@
+services:
+ vespa:
+ image: vespaengine/vespa:8.324.16
+ ports:
+ - 4080:4080
+ - 19071:19071
+ - 19092:19092
+ ulimits:
+ nofile:
+ hard: 262144
+ soft: 32768
+ nproc:
+ hard: 409600
+ soft: 32768
+ volumes:
+ - vespa:/opt/vespa/var
+version: "3.4"
+volumes:
+ vespa: null
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000000000000000000000000000000000000..3768541c504553f97d0372f8ca45436425cd923b
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,60 @@
+services:
+ backend:
+ build: .
+ command:
+ - uvicorn
+ - backend:app
+ - --host
+ - 0.0.0.0
+ - --port
+ - "7860"
+ - --log-level
+ - debug
+ - --reload
+ depends_on:
+ - vespa
+ entrypoint:
+ - /usr/bin/env
+ environment:
+ OPENAI_API_KEY: null
+ TZ: UTC
+ image: govsearch:latest
+ ports:
+ - 7860:7860
+ frontend:
+ build: .
+ command:
+ - npx
+ - remix
+ - vite:dev
+ - --host
+ - 0.0.0.0
+ - --port
+ - "7861"
+ depends_on:
+ - backend
+ entrypoint:
+ - /usr/bin/env
+ environment:
+ TZ: UTC
+ image: govsearch:latest
+ ports:
+ - 7861:7861
+ vespa:
+ image: vespaengine/vespa:8.324.16
+ ports:
+ - 4080:4080
+ - 19071:19071
+ - 19092:19092
+ ulimits:
+ nofile:
+ hard: 262144
+ soft: 32768
+ nproc:
+ hard: 409600
+ soft: 32768
+ volumes:
+ - vespa:/opt/vespa/var
+version: "3.4"
+volumes:
+ vespa: null
diff --git a/env.d.ts b/env.d.ts
new file mode 100644
index 0000000000000000000000000000000000000000..e15f0afe79a21c1600ee23b6ecceca459c47567e
--- /dev/null
+++ b/env.d.ts
@@ -0,0 +1,10 @@
+/// <reference types="@remix-run/node" />
+/// <reference types="vite/client" />
+
+interface ImportMetaEnv { // typed environment variables available on import.meta.env
+ readonly VITE_APP_TITLE: string
+}
+
+interface ImportMeta { // types import.meta.env with the ImportMetaEnv shape declared in this file
+ readonly env: ImportMetaEnv
+}
diff --git a/frontend/components/answer-list.tsx b/frontend/components/answer-list.tsx
new file mode 100644
index 0000000000000000000000000000000000000000..86001694e3d0b5480c5cac7a9e21f1203c508d8e
--- /dev/null
+++ b/frontend/components/answer-list.tsx
@@ -0,0 +1,74 @@
+import { Badge } from "~/components/ui/badge"
+import { ScrollArea } from "~/components/ui/scroll-area"
+import type { AnswersQuery } from "~/generated/graphql"
+import { cn } from "~/lib/utils"
+
+type AnswerListProps = { // props accepted by AnswerList
+ items: NonNullable<AnswersQuery["answers"]> // the `answers` field of the AnswersQuery result with null/undefined excluded
+}
+
+export function AnswerList({ items }: AnswerListProps) {
+ return (
+
+ {0 < items.length ? (
+