Commit b50d8a8 · Initial commit

Files changed:

- .github/workflows/format.yaml +24 -0
- .github/workflows/lint.yaml +24 -0
- .gitignore +171 -0
- .pre-commit-config.yaml +12 -0
- LICENSE +193 -0
- README.md +94 -0
- app.py +74 -0
- pyproject.toml +51 -0
- requirements.txt +285 -0
- src/ucl_module_chat/__init__.py +0 -0
- src/ucl_module_chat/chains/rag_chain.py +84 -0
- src/ucl_module_chat/conf/config.yaml +29 -0
- src/ucl_module_chat/data_processing/document_conversion.py +227 -0
- src/ucl_module_chat/data_processing/document_embedding.py +48 -0
- src/ucl_module_chat/data_processing/document_scraping.py +94 -0
- src/ucl_module_chat/data_processing/document_templates.py +40 -0
- src/ucl_module_chat/setup.py +44 -0
- src/ucl_module_chat/utils/resolve_paths.py +17 -0
- uv.lock +0 -0
.github/workflows/format.yaml
ADDED
@@ -0,0 +1,24 @@
name: Check code formatting with Black

on: [push, pull_request]

jobs:
  build:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v4
      - name: Set up uv
        # Install a specific uv version using the installer
        run: curl -LsSf https://astral.sh/uv/0.3.3/install.sh | sh
      - name: "Set up Python"
        uses: actions/setup-python@v5
        with:
          python-version-file: "pyproject.toml"
      - name: Install the project
        run: |
          uv sync --all-extras --dev
      - name: Check formatting with Black
        run: |
          uv run black --check .
        continue-on-error: true
.github/workflows/lint.yaml
ADDED
@@ -0,0 +1,24 @@
name: Lint code with Ruff

on: [push, pull_request]

jobs:
  build:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v4
      - name: Set up uv
        # Install a specific uv version using the installer
        run: curl -LsSf https://astral.sh/uv/0.3.3/install.sh | sh
      - name: "Set up Python"
        uses: actions/setup-python@v5
        with:
          python-version-file: "pyproject.toml"
      - name: Install the project
        run: |
          uv sync --all-extras --dev
      - name: Lint with Ruff
        run: |
          uv run ruff check --output-format=github .
        continue-on-error: true
.gitignore
ADDED
@@ -0,0 +1,171 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

# Manually added
draft_notebooks/
.env
outputs
module_html
module_catalogue_html
module_md
module_catalogue_md
.pre-commit-config.yaml
ADDED
@@ -0,0 +1,12 @@
repos:
  - repo: https://github.com/psf/black
    rev: 24.8.0
    hooks:
      - id: black
        language_version: python3

  - repo: https://github.com/astral-sh/ruff-pre-commit
    rev: v0.6.2
    hooks:
      - id: ruff
        args: [--fix, --exit-non-zero-on-fix]
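For reference, these hooks can be enabled locally with the standard pre-commit commands (a sketch; `uv run` assumes the dev dependencies from `pyproject.toml` are installed):

```bash
# Install the git hook, then run every hook once over the whole repo (sketch)
uv run pre-commit install
uv run pre-commit run --all-files
```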
LICENSE
ADDED
@@ -0,0 +1,193 @@
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   ---

   Copyright 2024 Joseph Farrington

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
README.md
ADDED
@@ -0,0 +1,94 @@
# Chat with the 2024/2025 UCL module catalogue

## NOTE

This is a demonstration developed for educational purposes only and is not affiliated with or endorsed by University College London (UCL). The model may provide incorrect or outdated information. Interactions should therefore not be used to inform decisions such as programme choices or module selection.

Please refer to the official [UCL module catalogue](https://www.ucl.ac.uk/module-catalogue) for accurate and up-to-date information.

The code is licensed under the Apache License 2.0, but the module catalogue content is copyright UCL.

## Get started

### Hugging Face Space

The easiest way to chat with the model is using the Hugging Face Space.

### Local use

You can use the code snippet below to run the app locally. This project uses [uv](https://docs.astral.sh/uv/) to manage dependencies, and the snippet assumes that you have [uv installed](https://docs.astral.sh/uv/getting-started/installation/).

The app requires an [OpenAI API key](https://help.openai.com/en/articles/4936850-where-do-i-find-my-openai-api-key) to run locally.

```bash
# Clone repo and install dependencies in venv
git clone https://github.com/joefarrington/ucl_module_chat.git
cd ucl_module_chat

uv venv
source .venv/bin/activate
uv pip install .

# Add API keys as environment variables
# Alternatively, create a .env file in the main directory
export OPENAI_API_KEY=<Your API key>

# Run the app
python app.py
```

One advantage of LangChain is that you could easily substitute the embedding model and/or LLM for alternatives, including locally hosted models using [Hugging Face](https://python.langchain.com/docs/integrations/providers/huggingface/), [llama.cpp](https://python.langchain.com/docs/integrations/providers/llamacpp/) or [Ollama](https://python.langchain.com/docs/integrations/providers/ollama/).

### Rerun scraping and embedding

The repository includes the vectorstore with pages from the module catalogue embedded using OpenAI's [text-embedding-3-small](https://platform.openai.com/docs/guides/embeddings).

The process for downloading the pages from the module catalogue, converting the pages to markdown documents, and embedding the documents can be re-run using the script `setup.py`, as sketched below. There is no need to run this script unless you want to change the way data is extracted from the HTML pages to markdown, or embed the documents using an alternative model.
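A minimal sketch of re-running the pipeline, assuming it is invoked from the repository root with the environment set up as above:

```bash
# Re-run scraping, conversion, and embedding end to end (sketch)
python src/ucl_module_chat/setup.py
```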
## Implementation details

### Document scraping

The documents in the vectorstore used to provide context to the LLM are based on the publicly available webpages describing each module offered by UCL.

The URLs for the individual module catalogue pages are identified from the module catalogue search page. The module pages are then visited in sequence and the HTML is downloaded for each page.

There are more efficient ways to scrape the content from the module catalogue (e.g. [scrapy](https://scrapy.org/)). The current method is designed to minimise the effect on the server: there is a long wait time between requests, and the raw HTML is saved so that alternative methods of extracting the content can be considered without needing to request additional data from the server.

### Document conversion

The raw HTML for each module page is converted to a markdown document using [Beautiful Soup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/) to parse the HTML and a [Jinja](https://jinja.palletsprojects.com/en/stable/intro/) template to format the extracted information.

### Document embedding

The module pages are relatively short documents, and therefore each is treated as a single chunk and embedded as a whole.

Each page is embedded using [text-embedding-3-small](https://platform.openai.com/docs/guides/embeddings). [FAISS](https://faiss.ai/) is used to store and search the embedded documents.

### Q&A using RAG

The chat interface is a simple [Gradio](https://www.gradio.app/) app, and uses OpenAI's [gpt-4o-mini](https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence/) as the underlying LLM.

At each turn of the conversation the following steps are performed, managed using [LangChain](https://python.langchain.com/docs/introduction/), as sketched after this list:

* Call the LLM to rephrase the user's query, given the conversation history, so that it includes relevant context from the conversation.

* Embed the rephrased query and retrieve relevant documents from the vectorstore.

* Call the LLM with the current user input, retrieved documents for context, and the conversation history. Output the result as the LLM's response in the chat interface.
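A minimal sketch of a single turn through these steps, assuming a populated vectorstore at the default path from `conf/config.yaml` and an `OPENAI_API_KEY` in the environment:

```python
# Sketch: one conversational turn through the RAG chain
from langchain_core.messages import AIMessage, HumanMessage
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

from ucl_module_chat.chains.rag_chain import build_rag_chain

chain = build_rag_chain(
    llm=ChatOpenAI(model="gpt-4o-mini"),
    embedding_model=OpenAIEmbeddings(model="text-embedding-3-small"),
    vectorstore_dir="data/module_catalogue_vectorstore",
)

# History from an earlier turn; the chain rephrases the follow-up
# question into a standalone query before retrieval
history = [
    HumanMessage("What are the prerequisites for taking Supervised Learning?"),
    AIMessage("Supervised Learning (COMP0078) requires ..."),
]
result = chain.invoke(
    {"input": "Which term is it taught in?", "chat_history": history}
)
print(result["answer"])
```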
|
77 |
+
|
78 |
+
## Potential extensions
|
79 |
+
|
80 |
+
* Add [course descriptions](https://www.ucl.ac.uk/prospective-students/undergraduate/undergraduate-courses) to the vectorstore so that the app is more useful to potential applicants and can explain, for example, which modules are mandatory on certain courses.
|
81 |
+
|
82 |
+
* Provide links to the module catalogue for modules suggested by the application, either within the conversation or as a separate interface element.
|
83 |
+
|
84 |
+
* Use a agent-based approach to avoid unnecessary retrieval steps and/or support more complex queries that require multiple retrieval steps.
|
85 |
+
|
86 |
+
* Use a LangGraph app to manage the conversation history and state.
|
87 |
+
|
88 |
+
## Useful resources
|
89 |
+
|
90 |
+
* [UCL module catalogue](https://www.ucl.ac.uk/module-catalogue?collection=drupal-module-catalogue&facetsort=alpha&num_ranks=20&daat=10000&sort=title)
|
91 |
+
|
92 |
+
* [Langchain official tutorials](https://python.langchain.com/docs/tutorials/)
|
93 |
+
|
94 |
+
* Hands-On Large Language Models [book](https://learning.oreilly.com/library/view/hands-on-large-language/9781098150952/) and [GitHub repository](https://github.com/HandsOnLLM/Hands-On-Large-Language-Models)
|
app.py
ADDED
@@ -0,0 +1,74 @@
import gradio as gr
import hydra
import omegaconf
from dotenv import load_dotenv
from langchain_core.messages import AIMessage, BaseMessage, HumanMessage

from ucl_module_chat.chains.rag_chain import build_rag_chain
from ucl_module_chat.utils.resolve_paths import get_abs_path_using_repo_root

load_dotenv()

# Text paragraph to be added below the title
description = """
<b>NOTE</b>: This is a demonstration developed for educational purposes only
and is not affiliated with or endorsed by University College London (UCL).
The model may provide incorrect or outdated information. Interactions should
therefore not be used to inform decisions such as programme choices or module selection.

Please refer to the official [UCL module catalogue](https://www.ucl.ac.uk/module-catalogue)
for accurate and up-to-date information.
"""

examples = [
    "When can I take a module on medical statistics?",
    "What are the prerequisites for taking Supervised Learning?",
    "What is the difference between the two modules on Trauma for \
paediatric dentistry?",
]


def convert_history(history: list[dict]) -> list[BaseMessage]:
    """Convert conversation history into Langchain messages."""
    lc_history = []
    for msg in history:
        if msg["role"] == "user":
            lc_history.append(HumanMessage(msg["content"]))
        elif msg["role"] == "assistant":
            lc_history.append(AIMessage(msg["content"]))
    return lc_history


@hydra.main(
    version_base=None, config_path="src/ucl_module_chat/conf", config_name="config"
)
def main(cfg: omegaconf.DictConfig) -> None:
    """Run the UCL module chatbot in a Gradio interface."""

    vectorstore_dir = get_abs_path_using_repo_root(cfg.vectorstore.dir)
    llm = hydra.utils.instantiate(cfg.models.llm)
    embedding_model = hydra.utils.instantiate(cfg.models.embedding)
    rag_chain = build_rag_chain(
        llm=llm, embedding_model=embedding_model, vectorstore_dir=vectorstore_dir
    )

    def chat(input: str, history: list[dict] | None = None) -> str:
        # history may be None on the first turn
        result = rag_chain.invoke(
            {"input": input, "chat_history": convert_history(history or [])},
        )
        return result["answer"]

    with gr.Blocks(fill_height=True) as module_chat:
        gr.Markdown("# Chat with the module catalogue")
        gr.Markdown(description)
        gr.ChatInterface(
            fn=chat,
            type="messages",
            examples=examples,
        )

    module_chat.launch()


if __name__ == "__main__":
    main()
pyproject.toml
ADDED
@@ -0,0 +1,51 @@
[project]
name = "ucl-module-chat"
version = "0.1.0"
description = "Unofficial chat app for the UCL module catalogue"
readme = "README.md"
requires-python = ">=3.9"
dependencies = [
    "beautifulsoup4>=4.12.3",
    "langchain>=0.3.3",
    "tqdm>=4.66.5",
    "langchain-community>=0.3.2",
    "faiss-cpu>=1.9.0",
    "langchain-openai>=0.2.3",
    "gradio>=4.44.1",
    "jinja2>=3.1.4",
    "loguru>=0.7.2",
    "hydra-core>=1.3.2",
    "gitpython>=3.1.43",
]

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.hatch.build.targets.wheel]
packages = ["src/ucl_module_chat"]

[tool.uv]
dev-dependencies = [
    "jupyter>=1.1.1",
    "black>=24.8.0",
    "ruff>=0.6.9",
    "pre-commit>=4.0.1",
]

[tool.black]
line-length = 88

[tool.ruff]
line-length = 88
fix = true
exclude = ["src/ucl_module_chat/data_processing/document_templates.py"]

[tool.ruff.lint]
select = ["E", "F", "I"]
fixable = ["ALL"]
unfixable = []

[tool.ruff.lint.isort]
force-single-line = false
combine-as-imports = true
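For development work (rather than just running the app), the dev dependencies declared under `[tool.uv]` can be installed with uv, mirroring the CI workflows above:

```bash
# Install runtime and dev dependencies into the project venv (sketch)
uv sync --all-extras --dev
```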
requirements.txt
ADDED
@@ -0,0 +1,285 @@
# This file was autogenerated by uv via the following command:
#    uv pip compile pyproject.toml -o requirements.txt
aiofiles==23.2.1
    # via gradio
aiohappyeyeballs==2.4.3
    # via aiohttp
aiohttp==3.10.10
    # via
    #   langchain
    #   langchain-community
aiosignal==1.3.1
    # via aiohttp
annotated-types==0.7.0
    # via pydantic
antlr4-python3-runtime==4.9.3
    # via
    #   hydra-core
    #   omegaconf
anyio==4.6.2.post1
    # via
    #   gradio
    #   httpx
    #   openai
    #   starlette
attrs==24.2.0
    # via aiohttp
beautifulsoup4==4.12.3
    # via ucl-module-chat (pyproject.toml)
certifi==2024.8.30
    # via
    #   httpcore
    #   httpx
    #   requests
charset-normalizer==3.4.0
    # via requests
click==8.1.7
    # via
    #   typer
    #   uvicorn
dataclasses-json==0.6.7
    # via langchain-community
distro==1.9.0
    # via openai
faiss-cpu==1.9.0
    # via ucl-module-chat (pyproject.toml)
fastapi==0.115.3
    # via gradio
ffmpy==0.4.0
    # via gradio
filelock==3.16.1
    # via huggingface-hub
frozenlist==1.5.0
    # via
    #   aiohttp
    #   aiosignal
fsspec==2024.10.0
    # via
    #   gradio-client
    #   huggingface-hub
gitdb==4.0.11
    # via gitpython
gitpython==3.1.43
    # via ucl-module-chat (pyproject.toml)
gradio==5.3.0
    # via ucl-module-chat (pyproject.toml)
gradio-client==1.4.2
    # via gradio
greenlet==3.1.1
    # via sqlalchemy
h11==0.14.0
    # via
    #   httpcore
    #   uvicorn
httpcore==1.0.6
    # via httpx
httpx==0.27.2
    # via
    #   gradio
    #   gradio-client
    #   langsmith
    #   openai
huggingface-hub==0.26.1
    # via
    #   gradio
    #   gradio-client
hydra-core==1.3.2
    # via ucl-module-chat (pyproject.toml)
idna==3.10
    # via
    #   anyio
    #   httpx
    #   requests
    #   yarl
jinja2==3.1.4
    # via
    #   ucl-module-chat (pyproject.toml)
    #   gradio
jiter==0.6.1
    # via openai
jsonpatch==1.33
    # via langchain-core
jsonpointer==3.0.0
    # via jsonpatch
langchain==0.3.4
    # via
    #   ucl-module-chat (pyproject.toml)
    #   langchain-community
langchain-community==0.3.3
    # via ucl-module-chat (pyproject.toml)
langchain-core==0.3.12
    # via
    #   langchain
    #   langchain-community
    #   langchain-openai
    #   langchain-text-splitters
langchain-openai==0.2.3
    # via ucl-module-chat (pyproject.toml)
langchain-text-splitters==0.3.0
    # via langchain
langsmith==0.1.137
    # via
    #   langchain
    #   langchain-community
    #   langchain-core
loguru==0.7.2
    # via ucl-module-chat (pyproject.toml)
markdown-it-py==3.0.0
    # via rich
markupsafe==2.1.5
    # via
    #   gradio
    #   jinja2
marshmallow==3.23.0
    # via dataclasses-json
mdurl==0.1.2
    # via markdown-it-py
multidict==6.1.0
    # via
    #   aiohttp
    #   yarl
mypy-extensions==1.0.0
    # via typing-inspect
numpy==1.26.4
    # via
    #   faiss-cpu
    #   gradio
    #   langchain
    #   langchain-community
    #   pandas
omegaconf==2.3.0
    # via hydra-core
openai==1.52.2
    # via langchain-openai
orjson==3.10.10
    # via
    #   gradio
    #   langsmith
packaging==24.1
    # via
    #   faiss-cpu
    #   gradio
    #   gradio-client
    #   huggingface-hub
    #   hydra-core
    #   langchain-core
    #   marshmallow
pandas==2.2.3
    # via gradio
pillow==10.4.0
    # via gradio
propcache==0.2.0
    # via yarl
pydantic==2.9.2
    # via
    #   fastapi
    #   gradio
    #   langchain
    #   langchain-core
    #   langsmith
    #   openai
    #   pydantic-settings
pydantic-core==2.23.4
    # via pydantic
pydantic-settings==2.6.0
    # via langchain-community
pydub==0.25.1
    # via gradio
pygments==2.18.0
    # via rich
python-dateutil==2.9.0.post0
    # via pandas
python-dotenv==1.0.1
    # via pydantic-settings
python-multipart==0.0.12
    # via gradio
pytz==2024.2
    # via pandas
pyyaml==6.0.2
    # via
    #   gradio
    #   huggingface-hub
    #   langchain
    #   langchain-community
    #   langchain-core
    #   omegaconf
regex==2024.9.11
    # via tiktoken
requests==2.32.3
    # via
    #   huggingface-hub
    #   langchain
    #   langchain-community
    #   langsmith
    #   requests-toolbelt
    #   tiktoken
requests-toolbelt==1.0.0
    # via langsmith
rich==13.9.3
    # via typer
ruff==0.7.1
    # via gradio
semantic-version==2.10.0
    # via gradio
shellingham==1.5.4
    # via typer
six==1.16.0
    # via python-dateutil
smmap==5.0.1
    # via gitdb
sniffio==1.3.1
    # via
    #   anyio
    #   httpx
    #   openai
soupsieve==2.6
    # via beautifulsoup4
sqlalchemy==2.0.36
    # via
    #   langchain
    #   langchain-community
starlette==0.41.0
    # via
    #   fastapi
    #   gradio
tenacity==9.0.0
    # via
    #   langchain
    #   langchain-community
    #   langchain-core
tiktoken==0.8.0
    # via langchain-openai
tomlkit==0.12.0
    # via gradio
tqdm==4.66.5
    # via
    #   ucl-module-chat (pyproject.toml)
    #   huggingface-hub
    #   openai
typer==0.12.5
    # via gradio
typing-extensions==4.12.2
    # via
    #   fastapi
    #   gradio
    #   gradio-client
    #   huggingface-hub
    #   langchain-core
    #   openai
    #   pydantic
    #   pydantic-core
    #   sqlalchemy
    #   typer
    #   typing-inspect
typing-inspect==0.9.0
    # via dataclasses-json
tzdata==2024.2
    # via pandas
urllib3==2.2.3
    # via requests
uvicorn==0.32.0
    # via gradio
websockets==12.0
    # via gradio-client
yarl==1.16.0
    # via aiohttp
src/ucl_module_chat/__init__.py
ADDED
File without changes
src/ucl_module_chat/chains/rag_chain.py
ADDED
@@ -0,0 +1,84 @@
from pathlib import Path

from dotenv import load_dotenv
from langchain.chains import create_history_aware_retriever, create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_community.vectorstores import FAISS
from langchain_core.embeddings.embeddings import Embeddings
from langchain_core.language_models import BaseChatModel
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

load_dotenv()

context_prompt = """Given a chat history and the latest user question
which might reference context in the chat history,
formulate a standalone question which can be understood
without the chat history. Do NOT answer the question,
just reformulate it if needed and otherwise return it as is."""

rag_prompt = """You are an assistant for question-answering tasks related
to university courses (modules) at University College London (UCL).
Use the following pieces of retrieved context to answer the question.
The context is from entries in the UCL module catalogue,
which is available publicly on the internet.
The first time you refer to a module in the conversation, refer to
it by its full name followed by the module code in brackets, e.g.
Supervised Learning (COMP0078).
If you don't know the answer, say that you don't know.
Use five sentences maximum and keep the answer concise.
Use professional British English and avoid using slang.
Do not refer to the context directly in your answer, but you
should use it to answer the question.
You can ask the user if they would like to know more about a
specific area if you think it may be helpful.
\n\n
{context}"""


def build_rag_chain(
    llm: BaseChatModel,
    embedding_model: Embeddings,
    vectorstore_dir: str | Path,
    context_system_prompt: str = context_prompt,
    rag_system_prompt: str = rag_prompt,
):
    """Build a RAG chain for the UCL module chatbot."""

    contextualize_q_prompt = ChatPromptTemplate.from_messages(
        [
            ("system", context_system_prompt),
            MessagesPlaceholder("chat_history"),
            ("human", "{input}"),
        ]
    )
    vectorstore = FAISS.load_local(
        vectorstore_dir,
        embeddings=embedding_model,
        allow_dangerous_deserialization=True,
    )
    retriever = vectorstore.as_retriever()

    history_aware_retriever = create_history_aware_retriever(
        llm, retriever, contextualize_q_prompt
    )

    qa_prompt = ChatPromptTemplate.from_messages(
        [
            ("system", rag_system_prompt),
            MessagesPlaceholder("chat_history"),
            ("human", "{input}"),
        ]
    )

    # Stuff documents chain combines documents from retriever | qa_prompt
    # | llm | StrOutputParser
    question_answer_chain = create_stuff_documents_chain(llm, qa_prompt).with_config(
        tags=["qa"]
    )

    # Full RAG chain is history-aware retriever | question answer chain
    rag_chain = create_retrieval_chain(
        history_aware_retriever, question_answer_chain
    ).with_config(tags=["rag"])

    return rag_chain
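A usage sketch, assuming the vectorstore in `data/module_catalogue_vectorstore` exists and `OPENAI_API_KEY` is set (model names match `conf/config.yaml`). `create_retrieval_chain` returns a dict containing both the answer and the retrieved documents:

```python
# Sketch: build the chain and ask a one-off question with no history
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

chain = build_rag_chain(
    llm=ChatOpenAI(model="gpt-4o-mini"),
    embedding_model=OpenAIEmbeddings(model="text-embedding-3-small"),
    vectorstore_dir="data/module_catalogue_vectorstore",
)
result = chain.invoke(
    {"input": "When can I take a module on medical statistics?", "chat_history": []}
)
print(result["answer"])
print(f"{len(result['context'])} documents were retrieved as context")
```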
src/ucl_module_chat/conf/config.yaml
ADDED
@@ -0,0 +1,29 @@
defaults:
  - override hydra/job_logging: disabled

# All paths are specified relative to the root of the project

setup:
  scrape_documents:
    index_page_url: "https://search.ucl.ac.uk/s/search.html?collection=drupal-module-catalogue&facetsort=alpha&num_ranks=10000&daat=10000&form=ucl&start_rank=0"
    output_dir: 'data/module_html'
    regex_url_pattern: 'https://www.ucl.ac.uk/module-catalogue/modules/[a-zA-Z0-9-]+[A-Z]{4}\d{4}'
    wait_time_seconds: 2
  convert_documents:
    input_dir: ${setup.scrape_documents.output_dir}
    output_dir: 'data/module_md'
  embed_documents:
    input_dir: ${setup.convert_documents.output_dir}
    output_dir: ${vectorstore.dir}

vectorstore:
  dir: 'data/module_catalogue_vectorstore'

models:
  embedding:
    _target_: langchain_openai.OpenAIEmbeddings
    model: text-embedding-3-small

  llm:
    _target_: langchain_openai.ChatOpenAI
    model: gpt-4o-mini
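Because `app.py` and the data-processing scripts are Hydra entry points, any of these values can be overridden on the command line without editing this file (a sketch; `gpt-4o` is just an illustrative substitute model):

```bash
# Swap the chat model for a single run using Hydra's key=value overrides (sketch)
python app.py models.llm.model=gpt-4o
```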
src/ucl_module_chat/data_processing/document_conversion.py
ADDED
@@ -0,0 +1,227 @@
import re
from pathlib import Path

import hydra
import jinja2
import omegaconf
from bs4 import BeautifulSoup
from loguru import logger
from tqdm import tqdm

from ucl_module_chat.data_processing.document_templates import module_template
from ucl_module_chat.utils.resolve_paths import get_abs_path_using_repo_root


def _extract_module_info_from_html(module_html: str) -> dict:
    """Parse HTML content for a UCL module page and extract key information."""

    soup = BeautifulSoup(module_html, "html.parser")

    # Extract the module title and code from the og:title meta tag
    pattern = r"""
    (?P<module_title>.*?)               # Capture the module title
    \s*                                 # Optional whitespace
    \((?P<module_code>[A-Z]{4}\d{4})\)  # Capture the alphanumeric code
    """
    og_title = soup.find("meta", attrs={"name": "og:title"})["content"]
    match = re.search(pattern, og_title, re.VERBOSE)
    module_title = match.group("module_title").strip()
    module_code = match.group("module_code").strip()

    url = soup.find("meta", attrs={"property": "og:url"})["content"]

    faculty = soup.find("meta", attrs={"name": "ucl:sanitized_faculty"})["content"]

    teaching_department = soup.find(
        "meta", attrs={"name": "ucl:sanitized_teaching_department"}
    )["content"]

    level = soup.find("meta", attrs={"name": "ucl:sanitized_level"})["content"]

    teaching_term = soup.find(
        "meta", attrs={"name": "ucl:sanitized_intended_teaching_term"}
    )["content"]

    credit_value = soup.find("meta", attrs={"name": "ucl:sanitized_credit_value"})[
        "content"
    ]

    sanitized_subject = soup.find("meta", attrs={"name": "ucl:sanitized_subject"})[
        "content"
    ]

    sanitized_keywords = soup.find("meta", attrs={"name": "ucl:sanitized_keywords"})[
        "content"
    ]

    restrictions = (
        soup.find("dt", string="Restrictions")
        .find_next_sibling("dd")
        .get_text()
        .strip()
        .replace("\n", " ")
    )

    alternative_credit_options = (
        soup.find("h2", string="Alternative credit options")
        .find_next("p")
        .get_text()
        .strip()
    )

    description = soup.find("div", class_="module-description").get_text()

    # Deliveries - there may be multiple deliveries for a module
    potential_deliveries = soup.find_all("div", class_="box tagged box--bar-thick")
    deliveries = [
        d
        for d in potential_deliveries
        if d.find("h3", string="Teaching and assessment") is not None
    ]
    collated_d = []
    for d in deliveries:
        delivery_info = {}

        # Info from the header
        header = d.find("h2").get_text()
        # Might need to modify this regex pattern if some modules are different
        pattern = r"""
        Intended\steaching\sterm:             # Matches 'Intended teaching term:'
        \s*                                   # Optional whitespace
        (?P<term>[\w\s,\(\)]+)                # Capture the term
        \s*                                   # Optional whitespace
        (?P<type>Undergraduate|Postgraduate)  # Matches UG or PG
        \s*                                   # Optional whitespace
        \(FHEQ\sLevel\s(?P<fheq_level>\d+)\)  # Matches 'FHEQ Level X'
        """  # and captures level number

        # Search for matches in the header string
        match = re.search(pattern, header, re.VERBOSE)

        if match:
            # Extracted values from the regex groups
            delivery_info["teaching_term"] = match.group("term").strip()
            delivery_info["type"] = match.group("type").strip()
            delivery_info["fheq_level"] = match.group("fheq_level").strip()

        # Info from the table for this delivery
        col_1 = d.find("section", class_="middle-split__column1")

        delivery_info["mode_of_study"] = (
            col_1.find("dt", string="Mode of study").find_next("dd").text.strip()
        )

        assessment_methods = (
            col_1.find("dt", string="Methods of assessment")
            .find_next("dd")
            .find_all("div")
        )
        delivery_info["methods_of_assessment"] = ", ".join(
            [" ".join(method.text.strip().split()) for method in assessment_methods]
        )

        delivery_info["mark_scheme"] = (
            col_1.find("dt", string="Mark scheme").find_next("dd").text.strip()
        )

        col_2 = d.find("section", class_="middle-split__column2")

        email = col_2.find("a", href=re.compile(r"^mailto:"))
        delivery_info["contact_email"] = email.text.strip() if email else None

        delivery_info["number_of_students_prior_year"] = (
            col_2.find("dt", string="Number of students on module in previous year")
            .find_next("dd")
            .text.strip()
        )

        collated_d.append(delivery_info)

    info = {
        "module_title": module_title,
        "module_code": module_code,
        "url": url,
        "faculty": faculty,
        "teaching_department": teaching_department,
        "level": level,
        "teaching_term": teaching_term,
        "credit_value": credit_value,
        "subject": sanitized_subject,
        "keywords": sanitized_keywords,
        "alternative_credit_options": alternative_credit_options,
        "description": description,
        "restrictions": restrictions,
        "deliveries": collated_d,
    }

    return info


def _module_info_to_markdown(module_info: dict, template: jinja2.Template) -> str:
    """Process module information dictionary into markdown document using template."""
    return template.render(module_info)


def _convert_module_html_to_markdown(
    module_html: str, extract_function: callable, markdown_template: jinja2.Template
) -> str:
    """Convert a single UCL module HTML page to a markdown document."""
    module_info = extract_function(module_html)
    module_markdown = _module_info_to_markdown(module_info, markdown_template)
    return module_markdown


def convert_all_documents_html_to_markdown(
    input_dir: str | Path,
    output_dir: str | Path,
    extract_function: callable = _extract_module_info_from_html,
    markdown_template: jinja2.Template = module_template,
):
    """Convert all UCL module HTML pages in a directory to markdown documents."""

    input_dir = Path(input_dir)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    logger.info("""Converting HTML module files to markdown documents""")

    all_module_html_files = list(input_dir.glob("*.html"))

    n_modules = len(all_module_html_files)

    logger.info(
        f"Identified {n_modules} HTML module files to convert to markdown documents"
    )

    errors = 0
    for module_html_path in tqdm(all_module_html_files):
        try:
            with open(module_html_path, "r") as f:
                module_html = f.read()

            module_markdown = _convert_module_html_to_markdown(
                module_html, extract_function, markdown_template
            )

            output_path = output_dir / f"{module_html_path.stem}.md"
            with open(output_path, "w") as f:
                f.write(module_markdown)
        except Exception as e:
            errors += 1
            logger.error(f"Error converting {module_html_path.stem}: {e}")
    logger.info(f"{n_modules - errors} HTML files successfully converted to markdown")
    logger.info(f"{errors} HTML files could not be converted.")


@hydra.main(version_base=None, config_path="../conf", config_name="config")
def main(cfg: omegaconf.DictConfig) -> None:
    """Run the document conversion process."""
    cfg = cfg.setup.convert_documents
    cfg.input_dir = get_abs_path_using_repo_root(cfg.input_dir)
    cfg.output_dir = get_abs_path_using_repo_root(cfg.output_dir)

    convert_all_documents_html_to_markdown(**cfg)


if __name__ == "__main__":
    main()
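The real template lives in `document_templates.py` (listed in this commit but not shown here); a minimal sketch of the rendering step with hypothetical template text, using field names from the `info` dictionary built above:

```python
# Sketch: rendering a module dict with a hypothetical Jinja template
import jinja2

template = jinja2.Template(
    "# {{ module_title }} ({{ module_code }})\n\n"
    "Faculty: {{ faculty }}\n\n{{ description }}"
)
# Illustrative values only; real dicts come from _extract_module_info_from_html
print(
    template.render(
        module_title="Supervised Learning",
        module_code="COMP0078",
        faculty="...",
        description="...",
    )
)
```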
src/ucl_module_chat/data_processing/document_embedding.py
ADDED
@@ -0,0 +1,48 @@
from pathlib import Path

import hydra
import omegaconf
from dotenv import load_dotenv
from langchain_community.vectorstores import FAISS
from langchain_core.embeddings.embeddings import Embeddings
from loguru import logger

from ucl_module_chat.utils.resolve_paths import get_abs_path_using_repo_root

load_dotenv()


def embed_documents(input_dir: str | Path, embedding_model: Embeddings) -> FAISS:
    """Create a FAISS vectorstore from a directory of markdown documents."""
    input_dir = Path(input_dir)

    all_module_document_paths = list(input_dir.glob("*.md"))

    module_docs = []
    for module_md_path in all_module_document_paths:
        with open(module_md_path, "r") as f:
            module_md = f.read()
        module_docs.append(module_md)

    logger.info(f"Embedding {len(module_docs)} documents")
    vectorstore = FAISS.from_texts(module_docs, embedding=embedding_model)
    logger.info(f"Vectorstore created with {vectorstore.index.ntotal} vectors")
    return vectorstore


@hydra.main(version_base=None, config_path="../conf", config_name="config")
def main(cfg: omegaconf.DictConfig) -> None:
    """Run the document embedding process."""
    embedding_model = hydra.utils.instantiate(cfg.models.embedding)
    cfg.setup.embed_documents.input_dir = get_abs_path_using_repo_root(
        cfg.setup.embed_documents.input_dir
    )
    cfg.vectorstore.dir = get_abs_path_using_repo_root(cfg.vectorstore.dir)
    vectorstore = embed_documents(cfg.setup.embed_documents.input_dir, embedding_model)
    vectorstore.save_local(cfg.vectorstore.dir)
    logger.info(f"Vectorstore saved to {cfg.vectorstore.dir}")


if __name__ == "__main__":
    main()
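Editor's note: the index written by save_local is read back at query time with FAISS.load_local. A hedged sketch, assuming an OpenAI embedding model and the path below; whichever model is configured under cfg.models.embedding must be reused here, since FAISS stores raw vectors, not the model that produced them.

from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings

# Assumed model and path; must match cfg.models.embedding and cfg.vectorstore.dir
embedding_model = OpenAIEmbeddings()
vectorstore = FAISS.load_local(
    "models/vectorstore",
    embedding_model,
    allow_dangerous_deserialization=True,  # opt-in: the docstore is pickle-backed
)
docs = vectorstore.similarity_search("modules about machine learning", k=3)
for doc in docs:
    print(doc.page_content[:80])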
src/ucl_module_chat/data_processing/document_scraping.py
ADDED
@@ -0,0 +1,94 @@
import re
import time
from pathlib import Path

import hydra
import omegaconf
import requests
from bs4 import BeautifulSoup
from loguru import logger
from tqdm import tqdm

from ucl_module_chat.utils.resolve_paths import get_abs_path_using_repo_root


def _get_index_page_html(index_page_url: str) -> str:
    """Get the HTML content of the index page."""
    response = requests.get(index_page_url)
    response.raise_for_status()
    return response.text


def _get_module_urls_from_index_page(index_page_html: str, regex_url_pattern: str):
    """Extract module URLs from the index page HTML using regex."""
    soup = BeautifulSoup(index_page_html, "html.parser")
    pattern = re.compile(regex_url_pattern)

    module_urls = []
    for cite_tag in soup.find_all("cite"):
        url = cite_tag.get("data-url")  # skip <cite> tags without a data-url attribute
        if url and pattern.match(url):
            module_urls.append(url)

    return module_urls


def _save_module_page_html(module_url: str, output_dir: str | Path):
    """Save the HTML content of a module page to a text file."""
    output_dir = Path(output_dir)

    # Send a GET request to fetch the HTML content
    response = requests.get(module_url)
    response.raise_for_status()  # Raise an exception for HTTP errors

    # Extract the part of the URL after "/modules/" for the filename
    module_id = module_url.split("/modules/")[1]

    # Save the HTML content to a text file
    file_path = output_dir / f"{module_id}.html"
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(response.text)


def scrape_documents(
    index_page_url: str | Path,
    output_dir: str | Path,
    regex_url_pattern: str,
    wait_time_seconds: int = 2,
):
    """Scrape module pages and save HTML content to text files."""
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    logger.info(f"Identifying module pages from {index_page_url}")
    index_page_html = _get_index_page_html(index_page_url)

    module_urls = _get_module_urls_from_index_page(index_page_html, regex_url_pattern)
    n_modules = len(module_urls)
    logger.info(f"Identified {n_modules} module pages to save to {output_dir}.")

    errors = 0
    for url in tqdm(module_urls):
        try:
            _save_module_page_html(url, output_dir)
            time.sleep(wait_time_seconds)  # Pause to avoid abusing the server
        except requests.exceptions.RequestException as e:
            logger.error(f"Error saving HTML for {url}: {e}")
            errors += 1

    logger.info(f"{n_modules - errors} module pages successfully saved")
    logger.info(f"{errors} module pages could not be saved.")


@hydra.main(version_base=None, config_path="../conf", config_name="config")
def main(cfg: omegaconf.DictConfig) -> None:
    """Run the document scraping process."""
    cfg = cfg.setup.scrape_documents
    cfg.output_dir = get_abs_path_using_repo_root(cfg.output_dir)
    scrape_documents(**cfg)


if __name__ == "__main__":
    main()
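Editor's note: the regex_url_pattern that drives _get_module_urls_from_index_page comes from the Hydra config, not the code. Purely as an illustration (the configured value lives in config.yaml), a pattern of roughly this shape keeps catalogue module pages and drops everything else:

import re

# Illustrative pattern, not the one shipped in config.yaml
pattern = re.compile(
    r"https://www\.ucl\.ac\.uk/module-catalogue/modules/[A-Za-z0-9-]+"
)

urls = [
    "https://www.ucl.ac.uk/module-catalogue/modules/example-module-EXMP0001",  # kept
    "https://www.ucl.ac.uk/module-catalogue",  # dropped: not a module page
]
module_urls = [url for url in urls if pattern.match(url)]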
src/ucl_module_chat/data_processing/document_templates.py
ADDED
@@ -0,0 +1,40 @@
from jinja2 import Template

module_template = Template(
    """
# {{ module_title }} ({{ module_code }})

## Key information

**Course Code:** {{ module_code }} \\
**Subject Area:** {{ subject }} \\
**Keywords:** {{ keywords }} \\
**Module catalogue URL:** {{ url }}

**Faculty:** {{ faculty }} \\
**Teaching Department:** {{ teaching_department }} \\
**Credit Value:** {{ credit_value }} \\
**Restrictions:** {{ restrictions }}

## Alternative credit options
{{ alternative_credit_options }}

## Description
{{ description }}

## Module deliveries for 2024/25 academic year
{% for delivery in deliveries %}
### {{ delivery.type }} (FHEQ Level {{ delivery.fheq_level }})

#### Teaching and assessment
**Intended teaching term:** {{ delivery.teaching_term }} \\
**Mode of study:** {{ delivery.mode_of_study }} \\
**Methods of assessment:** {{ delivery.methods_of_assessment }} \\
**Mark scheme:** {{ delivery.mark_scheme }}

#### Other information
**Number of students on module in previous year:** {{ delivery.number_of_students_prior_year }} \\
**Who to contact for more information:** {{ delivery.contact_email }}
{% endfor %}
"""
)
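Editor's note: rendering module_template with a module-info dictionary produces the markdown that the conversion step writes to disk. A short sketch with entirely made-up field values, showing the schema the template expects:

from ucl_module_chat.data_processing.document_templates import module_template

# All values below are fabricated for illustration
module_info = {
    "module_title": "Introductory Example Module",
    "module_code": "EXMP0001",
    "subject": "Examples",
    "keywords": "EXAMPLE, TEMPLATE",
    "url": "https://www.ucl.ac.uk/module-catalogue/modules/EXMP0001",
    "faculty": "Faculty of Examples",
    "teaching_department": "Department of Examples",
    "credit_value": "15",
    "restrictions": "None",
    "alternative_credit_options": "None",
    "description": "A made-up module used to show the rendered layout.",
    "deliveries": [
        {
            "type": "Undergraduate",
            "fheq_level": 5,
            "teaching_term": "Term 1",
            "mode_of_study": "In person",
            "methods_of_assessment": "100% coursework",
            "mark_scheme": "Numeric marks",
            "number_of_students_prior_year": 42,
            "contact_email": "examples@ucl.ac.uk",
        }
    ],
}
print(module_template.render(module_info))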
src/ucl_module_chat/setup.py
ADDED
@@ -0,0 +1,44 @@
import hydra
import omegaconf
from dotenv import load_dotenv
from loguru import logger

from ucl_module_chat.data_processing.document_conversion import (
    convert_all_documents_html_to_markdown,
)
from ucl_module_chat.data_processing.document_embedding import embed_documents
from ucl_module_chat.data_processing.document_scraping import scrape_documents
from ucl_module_chat.utils.resolve_paths import get_abs_path_using_repo_root

load_dotenv()


@hydra.main(version_base=None, config_path="conf", config_name="config")
def main(cfg: omegaconf.DictConfig) -> None:
    """Scrape module catalogue, convert HTML to markdown, and embed in vectorstore."""

    cfg.setup.scrape_documents.output_dir = get_abs_path_using_repo_root(
        cfg.setup.scrape_documents.output_dir
    )
    cfg.setup.convert_documents.input_dir = get_abs_path_using_repo_root(
        cfg.setup.convert_documents.input_dir
    )
    cfg.setup.convert_documents.output_dir = get_abs_path_using_repo_root(
        cfg.setup.convert_documents.output_dir
    )
    cfg.setup.embed_documents.input_dir = get_abs_path_using_repo_root(
        cfg.setup.embed_documents.input_dir
    )
    cfg.vectorstore.dir = get_abs_path_using_repo_root(cfg.vectorstore.dir)

    scrape_documents(**cfg.setup.scrape_documents)
    convert_all_documents_html_to_markdown(**cfg.setup.convert_documents)

    embedding_model = hydra.utils.instantiate(cfg.models.embedding)
    vectorstore = embed_documents(cfg.setup.embed_documents.input_dir, embedding_model)
    vectorstore.save_local(cfg.vectorstore.dir)
    logger.info(f"Vectorstore saved to {cfg.vectorstore.dir}")


if __name__ == "__main__":
    main()
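Editor's note: the same pipeline can be driven without the @hydra.main CLI by composing the config in-process, which is handy for re-running a single stage. A sketch under stated assumptions: it is executed from a module sitting next to the conf directory (initialize resolves config_path relative to the caller), and it skips the path-resolution step above, so the configured directories must already be valid from the working directory.

from hydra import compose, initialize

from ucl_module_chat.data_processing.document_scraping import scrape_documents

# "conf" is an assumed relative location for the Hydra config tree
with initialize(version_base=None, config_path="conf"):
    cfg = compose(config_name="config")

# Re-run only the scraping stage of the pipeline
scrape_documents(**cfg.setup.scrape_documents)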
src/ucl_module_chat/utils/resolve_paths.py
ADDED
@@ -0,0 +1,17 @@
import os
from pathlib import Path

from git import Repo


def get_abs_path_using_repo_root(path: str | Path) -> Path:
    """Takes a path relative to the repo root and returns the absolute path."""

    # Initialize the repo (this will automatically find the root if
    # you're in a subdirectory)
    repo = Repo(os.getcwd(), search_parent_directories=True)

    # Get the root directory
    repo_root = repo.git.rev_parse("--show-toplevel")
    abs_path = Path(repo_root) / path
    return abs_path
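Editor's note: a quick usage sketch. Wherever inside the repo the process is started, the helper resolves against the git root; the subdirectory below is an assumption for illustration.

from ucl_module_chat.utils.resolve_paths import get_abs_path_using_repo_root

# "data/markdown" is an assumed repo subdirectory
markdown_dir = get_abs_path_using_repo_root("data/markdown")
print(markdown_dir)  # e.g. /home/user/ucl-module-chat/data/markdown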
uv.lock
ADDED
The diff for this file is too large to render.
See raw diff