joefarrington committed
Commit b50d8a8 · 0 Parent(s)

Initial commit
.github/workflows/format.yaml ADDED
@@ -0,0 +1,24 @@
+ name: Check code formatting with Black
+
+ on: [push, pull_request]
+
+ jobs:
+   build:
+     runs-on: ubuntu-latest
+
+     steps:
+       - uses: actions/checkout@v4
+       - name: Set up uv
+         # Install a specific uv version using the installer
+         run: curl -LsSf https://astral.sh/uv/0.3.3/install.sh | sh
+       - name: "Set up Python"
+         uses: actions/setup-python@v5
+         with:
+           python-version-file: "pyproject.toml"
+       - name: Install the project
+         run: |
+           uv sync --all-extras --dev
+       - name: Check formatting with Black
+         run: |
+           uv run black --check .
+         continue-on-error: true
.github/workflows/lint.yaml ADDED
@@ -0,0 +1,24 @@
+ name: Lint code with Ruff
+
+ on: [push, pull_request]
+
+ jobs:
+   build:
+     runs-on: ubuntu-latest
+
+     steps:
+       - uses: actions/checkout@v4
+       - name: Set up uv
+         # Install a specific uv version using the installer
+         run: curl -LsSf https://astral.sh/uv/0.3.3/install.sh | sh
+       - name: "Set up Python"
+         uses: actions/setup-python@v5
+         with:
+           python-version-file: "pyproject.toml"
+       - name: Install the project
+         run: |
+           uv sync --all-extras --dev
+       - name: Lint with Ruff
+         run: |
+           uv run ruff check --output-format=github .
+         continue-on-error: true
.gitignore ADDED
@@ -0,0 +1,171 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+ .pdm.toml
+ .pdm-python
+ .pdm-build/
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
+
+ # Manually added
+ draft_notebooks/
+ .env
+ outputs
+ module_html
+ module_catalogue_html
+ module_md
+ module_catalogue_md
.pre-commit-config.yaml ADDED
@@ -0,0 +1,12 @@
+ repos:
+   - repo: https://github.com/psf/black
+     rev: 24.8.0
+     hooks:
+       - id: black
+         language_version: python3
+
+   - repo: https://github.com/astral-sh/ruff-pre-commit
+     rev: v0.6.2
+     hooks:
+       - id: ruff
+         args: [--fix, --exit-non-zero-on-fix]
LICENSE ADDED
@@ -0,0 +1,193 @@
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ ---
+
+ Copyright 2024 Joseph Farrington
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+
README.md ADDED
@@ -0,0 +1,94 @@
+ # Chat with the 2024/2025 UCL module catalogue
+
+ ## NOTE
+ This is a demonstration developed for educational purposes only and is not affiliated with or endorsed by University College London (UCL). The model may provide incorrect or outdated information. Interactions should therefore not be used to inform decisions such as programme choices or module selection.
+
+ Please refer to the official [UCL module catalogue](https://www.ucl.ac.uk/module-catalogue) for accurate and up-to-date information.
+
+ The code is licensed under the Apache License 2.0, but the module catalogue content is copyright UCL.
+
+ ## Get started
+
+ ### Hugging Face Space
+
+ The easiest way to chat with the model is using the Hugging Face Space.
+
+ ### Local use
+
+ You can use the code snippet below to run the app locally. This project uses [uv](https://docs.astral.sh/uv/) to manage dependencies, and the snippet assumes that you have [uv installed](https://docs.astral.sh/uv/getting-started/installation/).
+
+ The app requires an [OpenAI API key](https://help.openai.com/en/articles/4936850-where-do-i-find-my-openai-api-key) to run locally.
+
+ ```bash
+ # Clone repo and install dependencies in venv
+ git clone https://github.com/joefarrington/ucl_module_chat.git
+ cd ucl_module_chat
+
+ uv venv
+ source .venv/bin/activate
+ uv pip install .
+
+ # Add API keys as environment variables
+ # Alternatively, create a .env file in the main directory
+ export OPENAI_API_KEY=<Your API key>
+
+ # Run the app
+ python app.py
+ ```
+ One advantage of LangChain is that you could easily substitute the embedding model and/or LLM for alternatives, including locally hosted models using [Hugging Face](https://python.langchain.com/docs/integrations/providers/huggingface/), [llama.cpp](https://python.langchain.com/docs/integrations/providers/llamacpp/) or [Ollama](https://python.langchain.com/docs/integrations/providers/ollama/).
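+
+ For example, a minimal sketch of swapping in locally hosted Ollama models (this assumes an Ollama server is running and the `langchain-ollama` package, which is not a dependency of this project, is installed; the model names are illustrative):
+
+ ```python
+ from langchain_ollama import ChatOllama, OllamaEmbeddings
+
+ # Hypothetical local replacements for the hosted OpenAI models
+ llm = ChatOllama(model="llama3.1")
+ embedding_model = OllamaEmbeddings(model="nomic-embed-text")
+ ```
+
+ Note that swapping the embedding model would require re-running the embedding step below, because the bundled vectorstore was built with text-embedding-3-small.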
+
+ ### Rerun scraping and embedding
+
+ The repository includes the vectorstore with pages from the module catalogue embedded using OpenAI's [text-embedding-3-small](https://platform.openai.com/docs/guides/embeddings).
+
+ The process for downloading the pages from the module catalogue, converting the pages to markdown documents, and embedding the documents can be re-run using the script `setup.py`. There is no need to run this script unless you want to change the way data is extracted from the HTML pages to markdown, or embed the documents using an alternative model.
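+
+ For example, a minimal invocation (a sketch assuming the virtual environment above is active and `OPENAI_API_KEY` is set):
+
+ ```bash
+ uv run python src/ucl_module_chat/setup.py
+ ```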
+
+ ## Implementation details
+
+ ### Document scraping
+
+ The documents in the vectorstore used to provide context to the LLM are based on the publicly available webpages describing each module offered by UCL.
+
+ The URLs for the individual module catalogue pages are identified from the module catalogue search page. The module pages are then visited in sequence and the HTML is downloaded for each page.
+
+ There are more efficient ways to scrape the content from the module catalogue (e.g. [scrapy](https://scrapy.org/)). The current method is designed to minimise the effect on the server. There is a long wait time between requests, and the raw HTML is saved so that alternative methods of extracting the content can be considered without needing to request additional data from the server.
+
+ ### Document conversion
+
+ The raw HTML for each module page is converted to a markdown document using [Beautiful Soup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/) to parse the HTML and a [Jinja](https://jinja.palletsprojects.com/en/stable/intro/) template to format the extracted information.
+
+ ### Document embedding
+
+ The module pages are relatively short documents and therefore each is treated as a single chunk and embedded as a whole.
+
+ Each page is embedded using [text-embedding-3-small](https://platform.openai.com/docs/guides/embeddings). [FAISS](https://faiss.ai/) is used to store and search the embedded documents.
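+
+ In outline, the embedding step reduces to a few lines (a sketch mirroring `src/ucl_module_chat/data_processing/document_embedding.py`; `module_docs` stands for the list of markdown documents read from disk):
+
+ ```python
+ from langchain_community.vectorstores import FAISS
+ from langchain_openai import OpenAIEmbeddings
+
+ # Each document is embedded whole, as a single chunk
+ embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")
+ vectorstore = FAISS.from_texts(module_docs, embedding=embedding_model)
+ vectorstore.save_local("data/module_catalogue_vectorstore")
+ ```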
+
+ ### Q&A using RAG
+
+ The chat interface is a simple [Gradio](https://www.gradio.app/) app, and uses OpenAI's [gpt-4o-mini](https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence/) as the underlying LLM.
+
+ At each turn of the conversation the following steps are performed, managed using [LangChain](https://python.langchain.com/docs/introduction/):
+
+ * Call the LLM to rephrase the user's query, given the conversation history, so that it includes relevant context from the conversation.
+
+ * Embed the rephrased query and retrieve relevant documents from the vectorstore.
+
+ * Call the LLM with the current user input, retrieved documents for context, and conversation history. Output the result as the LLM's response in the chat interface (see the sketch after this list).
+
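+ In code, a single turn reduces to one chain invocation (a sketch; `rag_chain` is built in `src/ucl_module_chat/chains/rag_chain.py`, and `lc_history` stands for the conversation history converted to LangChain messages):
+
+ ```python
+ result = rag_chain.invoke(
+     {"input": "What are the prerequisites?", "chat_history": lc_history}
+ )
+ print(result["answer"])  # shown as the response in the chat interface
+ ```
+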
+ ## Potential extensions
+
+ * Add [course descriptions](https://www.ucl.ac.uk/prospective-students/undergraduate/undergraduate-courses) to the vectorstore so that the app is more useful to potential applicants and can explain, for example, which modules are mandatory on certain courses.
+
+ * Provide links to the module catalogue for modules suggested by the application, either within the conversation or as a separate interface element.
+
+ * Use an agent-based approach to avoid unnecessary retrieval steps and/or support more complex queries that require multiple retrieval steps.
+
+ * Use a LangGraph app to manage the conversation history and state.
+
+ ## Useful resources
+
+ * [UCL module catalogue](https://www.ucl.ac.uk/module-catalogue?collection=drupal-module-catalogue&facetsort=alpha&num_ranks=20&daat=10000&sort=title)
+
+ * [LangChain official tutorials](https://python.langchain.com/docs/tutorials/)
+
+ * Hands-On Large Language Models [book](https://learning.oreilly.com/library/view/hands-on-large-language/9781098150952/) and [GitHub repository](https://github.com/HandsOnLLM/Hands-On-Large-Language-Models)
app.py ADDED
@@ -0,0 +1,74 @@
+ import gradio as gr
+ import hydra
+ import omegaconf
+ from dotenv import load_dotenv
+ from langchain_core.messages import AIMessage, BaseMessage, HumanMessage
+
+ from ucl_module_chat.chains.rag_chain import build_rag_chain
+ from ucl_module_chat.utils.resolve_paths import get_abs_path_using_repo_root
+
+ load_dotenv()
+
+ # Text paragraph to be added below the title
+ description = """
+ <b>NOTE</b>: This is a demonstration developed for educational purposes only
+ and is not affiliated with or endorsed by University College London (UCL).
+ The model may provide incorrect or outdated information. Interactions should
+ therefore not be used to inform decisions such as programme choices or module selection.
+
+ Please refer to the official [UCL module catalogue](https://www.ucl.ac.uk/module-catalogue)
+ for accurate and up-to-date information.
+ """
+
+ examples = [
+     "When can I take a module on medical statistics?",
+     "What are the prerequisites for taking Supervised Learning?",
+     "What is the difference between the two modules on Trauma for \
+ paediatric dentistry?",
+ ]
+
+
+ def convert_history(history: list[dict]) -> list[BaseMessage]:
+     """Convert conversation history into LangChain messages."""
+     lc_history = []
+     for msg in history:
+         if msg["role"] == "user":
+             lc_history.append(HumanMessage(msg["content"]))
+         elif msg["role"] == "assistant":
+             lc_history.append(AIMessage(msg["content"]))
+     return lc_history
+
+
+ @hydra.main(
+     version_base=None, config_path="src/ucl_module_chat/conf", config_name="config"
+ )
+ def main(cfg: omegaconf.DictConfig) -> None:
+     """Run the UCL module chatbot in a Gradio interface."""
+
+     vectorstore_dir = get_abs_path_using_repo_root(cfg.vectorstore.dir)
+     llm = hydra.utils.instantiate(cfg.models.llm)
+     embedding_model = hydra.utils.instantiate(cfg.models.embedding)
+     rag_chain = build_rag_chain(
+         llm=llm, embedding_model=embedding_model, vectorstore_dir=vectorstore_dir
+     )
+
+     def chat(input: str, history: list[dict] | None = None) -> str:
+         # Guard against a missing history so convert_history never receives None
+         result = rag_chain.invoke(
+             {"input": input, "chat_history": convert_history(history or [])},
+         )
+         return result["answer"]
+
+     with gr.Blocks(fill_height=True) as module_chat:
+         gr.Markdown("# Chat with the module catalogue")
+         gr.Markdown(description)
+         gr.ChatInterface(
+             fn=chat,
+             type="messages",
+             examples=examples,
+         )
+
+     module_chat.launch()
+
+
+ if __name__ == "__main__":
+     main()
pyproject.toml ADDED
@@ -0,0 +1,51 @@
+ [project]
+ name = "ucl-module-chat"
+ version = "0.1.0"
+ description = "Unofficial chat app for the UCL module catalogue"
+ readme = "README.md"
+ requires-python = ">=3.10"  # str | Path annotations in the code require 3.10+
+ dependencies = [
+     "beautifulsoup4>=4.12.3",
+     "langchain>=0.3.3",
+     "tqdm>=4.66.5",
+     "langchain-community>=0.3.2",
+     "faiss-cpu>=1.9.0",
+     "langchain-openai>=0.2.3",
+     "gradio>=4.44.1",
+     "jinja2>=3.1.4",
+     "loguru>=0.7.2",
+     "hydra-core>=1.3.2",
+     "gitpython>=3.1.43",
+ ]
+
+ [build-system]
+ requires = ["hatchling"]
+ build-backend = "hatchling.build"
+
+ [tool.hatch.build.targets.wheel]
+ packages = ["src/ucl_module_chat"]
+
+ [tool.uv]
+ dev-dependencies = [
+     "jupyter>=1.1.1",
+     "black>=24.8.0",
+     "ruff>=0.6.9",
+     "pre-commit>=4.0.1",
+ ]
+
+ [tool.black]
+ line-length = 88
+
+ [tool.ruff]
+ line-length = 88
+ fix = true
+ exclude = ["src/ucl_module_chat/data_processing/document_templates.py"]
+
+ [tool.ruff.lint]
+ select = ["E", "F", "I"]
+ fixable = ["ALL"]
+ unfixable = []
+
+ [tool.ruff.lint.isort]
+ force-single-line = false
+ combine-as-imports = true
requirements.txt ADDED
@@ -0,0 +1,285 @@
+ # This file was autogenerated by uv via the following command:
+ #    uv pip compile pyproject.toml -o requirements.txt
+ aiofiles==23.2.1
+     # via gradio
+ aiohappyeyeballs==2.4.3
+     # via aiohttp
+ aiohttp==3.10.10
+     # via
+     #   langchain
+     #   langchain-community
+ aiosignal==1.3.1
+     # via aiohttp
+ annotated-types==0.7.0
+     # via pydantic
+ antlr4-python3-runtime==4.9.3
+     # via
+     #   hydra-core
+     #   omegaconf
+ anyio==4.6.2.post1
+     # via
+     #   gradio
+     #   httpx
+     #   openai
+     #   starlette
+ attrs==24.2.0
+     # via aiohttp
+ beautifulsoup4==4.12.3
+     # via ucl-module-chat (pyproject.toml)
+ certifi==2024.8.30
+     # via
+     #   httpcore
+     #   httpx
+     #   requests
+ charset-normalizer==3.4.0
+     # via requests
+ click==8.1.7
+     # via
+     #   typer
+     #   uvicorn
+ dataclasses-json==0.6.7
+     # via langchain-community
+ distro==1.9.0
+     # via openai
+ faiss-cpu==1.9.0
+     # via ucl-module-chat (pyproject.toml)
+ fastapi==0.115.3
+     # via gradio
+ ffmpy==0.4.0
+     # via gradio
+ filelock==3.16.1
+     # via huggingface-hub
+ frozenlist==1.5.0
+     # via
+     #   aiohttp
+     #   aiosignal
+ fsspec==2024.10.0
+     # via
+     #   gradio-client
+     #   huggingface-hub
+ gitdb==4.0.11
+     # via gitpython
+ gitpython==3.1.43
+     # via ucl-module-chat (pyproject.toml)
+ gradio==5.3.0
+     # via ucl-module-chat (pyproject.toml)
+ gradio-client==1.4.2
+     # via gradio
+ greenlet==3.1.1
+     # via sqlalchemy
+ h11==0.14.0
+     # via
+     #   httpcore
+     #   uvicorn
+ httpcore==1.0.6
+     # via httpx
+ httpx==0.27.2
+     # via
+     #   gradio
+     #   gradio-client
+     #   langsmith
+     #   openai
+ huggingface-hub==0.26.1
+     # via
+     #   gradio
+     #   gradio-client
+ hydra-core==1.3.2
+     # via ucl-module-chat (pyproject.toml)
+ idna==3.10
+     # via
+     #   anyio
+     #   httpx
+     #   requests
+     #   yarl
+ jinja2==3.1.4
+     # via
+     #   ucl-module-chat (pyproject.toml)
+     #   gradio
+ jiter==0.6.1
+     # via openai
+ jsonpatch==1.33
+     # via langchain-core
+ jsonpointer==3.0.0
+     # via jsonpatch
+ langchain==0.3.4
+     # via
+     #   ucl-module-chat (pyproject.toml)
+     #   langchain-community
+ langchain-community==0.3.3
+     # via ucl-module-chat (pyproject.toml)
+ langchain-core==0.3.12
+     # via
+     #   langchain
+     #   langchain-community
+     #   langchain-openai
+     #   langchain-text-splitters
+ langchain-openai==0.2.3
+     # via ucl-module-chat (pyproject.toml)
+ langchain-text-splitters==0.3.0
+     # via langchain
+ langsmith==0.1.137
+     # via
+     #   langchain
+     #   langchain-community
+     #   langchain-core
+ loguru==0.7.2
+     # via ucl-module-chat (pyproject.toml)
+ markdown-it-py==3.0.0
+     # via rich
+ markupsafe==2.1.5
+     # via
+     #   gradio
+     #   jinja2
+ marshmallow==3.23.0
+     # via dataclasses-json
+ mdurl==0.1.2
+     # via markdown-it-py
+ multidict==6.1.0
+     # via
+     #   aiohttp
+     #   yarl
+ mypy-extensions==1.0.0
+     # via typing-inspect
+ numpy==1.26.4
+     # via
+     #   faiss-cpu
+     #   gradio
+     #   langchain
+     #   langchain-community
+     #   pandas
+ omegaconf==2.3.0
+     # via hydra-core
+ openai==1.52.2
+     # via langchain-openai
+ orjson==3.10.10
+     # via
+     #   gradio
+     #   langsmith
+ packaging==24.1
+     # via
+     #   faiss-cpu
+     #   gradio
+     #   gradio-client
+     #   huggingface-hub
+     #   hydra-core
+     #   langchain-core
+     #   marshmallow
+ pandas==2.2.3
+     # via gradio
+ pillow==10.4.0
+     # via gradio
+ propcache==0.2.0
+     # via yarl
+ pydantic==2.9.2
+     # via
+     #   fastapi
+     #   gradio
+     #   langchain
+     #   langchain-core
+     #   langsmith
+     #   openai
+     #   pydantic-settings
+ pydantic-core==2.23.4
+     # via pydantic
+ pydantic-settings==2.6.0
+     # via langchain-community
+ pydub==0.25.1
+     # via gradio
+ pygments==2.18.0
+     # via rich
+ python-dateutil==2.9.0.post0
+     # via pandas
+ python-dotenv==1.0.1
+     # via pydantic-settings
+ python-multipart==0.0.12
+     # via gradio
+ pytz==2024.2
+     # via pandas
+ pyyaml==6.0.2
+     # via
+     #   gradio
+     #   huggingface-hub
+     #   langchain
+     #   langchain-community
+     #   langchain-core
+     #   omegaconf
+ regex==2024.9.11
+     # via tiktoken
+ requests==2.32.3
+     # via
+     #   huggingface-hub
+     #   langchain
+     #   langchain-community
+     #   langsmith
+     #   requests-toolbelt
+     #   tiktoken
+ requests-toolbelt==1.0.0
+     # via langsmith
+ rich==13.9.3
+     # via typer
+ ruff==0.7.1
+     # via gradio
+ semantic-version==2.10.0
+     # via gradio
+ shellingham==1.5.4
+     # via typer
+ six==1.16.0
+     # via python-dateutil
+ smmap==5.0.1
+     # via gitdb
+ sniffio==1.3.1
+     # via
+     #   anyio
+     #   httpx
+     #   openai
+ soupsieve==2.6
+     # via beautifulsoup4
+ sqlalchemy==2.0.36
+     # via
+     #   langchain
+     #   langchain-community
+ starlette==0.41.0
+     # via
+     #   fastapi
+     #   gradio
+ tenacity==9.0.0
+     # via
+     #   langchain
+     #   langchain-community
+     #   langchain-core
+ tiktoken==0.8.0
+     # via langchain-openai
+ tomlkit==0.12.0
+     # via gradio
+ tqdm==4.66.5
+     # via
+     #   ucl-module-chat (pyproject.toml)
+     #   huggingface-hub
+     #   openai
+ typer==0.12.5
+     # via gradio
+ typing-extensions==4.12.2
+     # via
+     #   fastapi
+     #   gradio
+     #   gradio-client
+     #   huggingface-hub
+     #   langchain-core
+     #   openai
+     #   pydantic
+     #   pydantic-core
+     #   sqlalchemy
+     #   typer
+     #   typing-inspect
+ typing-inspect==0.9.0
+     # via dataclasses-json
+ tzdata==2024.2
+     # via pandas
+ urllib3==2.2.3
+     # via requests
+ uvicorn==0.32.0
+     # via gradio
+ websockets==12.0
+     # via gradio-client
+ yarl==1.16.0
+     # via aiohttp
src/ucl_module_chat/__init__.py ADDED
File without changes
src/ucl_module_chat/chains/rag_chain.py ADDED
@@ -0,0 +1,84 @@
+ from pathlib import Path
+
+ from dotenv import load_dotenv
+ from langchain.chains import create_history_aware_retriever, create_retrieval_chain
+ from langchain.chains.combine_documents import create_stuff_documents_chain
+ from langchain_community.vectorstores import FAISS
+ from langchain_core.embeddings.embeddings import Embeddings
+ from langchain_core.language_models import BaseChatModel
+ from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
+
+ load_dotenv()
+
+ context_prompt = """Given a chat history and the latest user question
+ which might reference context in the chat history,
+ formulate a standalone question which can be understood
+ without the chat history. Do NOT answer the question,
+ just reformulate it if needed and otherwise return it as is."""
+
+ rag_prompt = """You are an assistant for question-answering tasks related
+ to university courses (modules) at University College London (UCL).
+ Use the following pieces of retrieved context to answer the question.
+ The context is from entries in the UCL module catalogue,
+ which is available publicly on the internet.
+ The first time you refer to a module in the conversation, refer to
+ it by its full name followed by the module code in brackets, e.g.
+ Supervised Learning (COMP0078).
+ If you don't know the answer, say that you don't know.
+ Use five sentences maximum and keep the answer concise.
+ Use professional British English and avoid using slang.
+ Do not refer to the context directly in your answer, but you
+ should use it to answer the question.
+ You can ask the user if they would like to know more about a
+ specific area if you think it may be helpful.
+ \n\n
+ {context}"""
+
+
+ def build_rag_chain(
+     llm: BaseChatModel,
+     embedding_model: Embeddings,
+     vectorstore_dir: str | Path,
+     context_system_prompt: str = context_prompt,
+     rag_system_prompt: str = rag_prompt,
+ ):
+     """Build a RAG chain for the UCL module chatbot."""
+
+     contextualize_q_prompt = ChatPromptTemplate.from_messages(
+         [
+             ("system", context_system_prompt),
+             MessagesPlaceholder("chat_history"),
+             ("human", "{input}"),
+         ]
+     )
+     vectorstore = FAISS.load_local(
+         vectorstore_dir,
+         embeddings=embedding_model,
+         allow_dangerous_deserialization=True,
+     )
+     retriever = vectorstore.as_retriever()
+
+     history_aware_retriever = create_history_aware_retriever(
+         llm, retriever, contextualize_q_prompt
+     )
+
+     qa_prompt = ChatPromptTemplate.from_messages(
+         [
+             ("system", rag_system_prompt),
+             MessagesPlaceholder("chat_history"),
+             ("human", "{input}"),
+         ]
+     )
+
+     # Stuff documents chain combines documents from retriever | qa_prompt
+     # | llm | StrOutputParser
+     question_answer_chain = create_stuff_documents_chain(llm, qa_prompt).with_config(
+         tags=["qa"]
+     )
+
+     # Full rag chain is history aware retriever | question answer chain
+     rag_chain = create_retrieval_chain(
+         history_aware_retriever, question_answer_chain
+     ).with_config(tags=["rag"])
+
+     return rag_chain
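A minimal usage sketch for `build_rag_chain`, mirroring how `app.py` wires it up (the model names and vectorstore path follow the defaults in `src/ucl_module_chat/conf/config.yaml`):

```python
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

from ucl_module_chat.chains.rag_chain import build_rag_chain

chain = build_rag_chain(
    llm=ChatOpenAI(model="gpt-4o-mini"),
    embedding_model=OpenAIEmbeddings(model="text-embedding-3-small"),
    vectorstore_dir="data/module_catalogue_vectorstore",
)
# One conversational turn; chat_history is empty for the first question
result = chain.invoke({"input": "Tell me about COMP0078", "chat_history": []})
print(result["answer"])
```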
src/ucl_module_chat/conf/config.yaml ADDED
@@ -0,0 +1,29 @@
+ defaults:
+   - override hydra/job_logging: disabled
+
+ # All paths are specified relative to the root of the project
+
+ setup:
+   scrape_documents:
+     index_page_url: "https://search.ucl.ac.uk/s/search.html?collection=drupal-module-catalogue&facetsort=alpha&num_ranks=10000&daat=10000&form=ucl&start_rank=0"
+     output_dir: 'data/module_html'
+     regex_url_pattern: 'https://www.ucl.ac.uk/module-catalogue/modules/[a-zA-Z0-9-]+[A-Z]{4}\d{4}'
+     wait_time_seconds: 2
+   convert_documents:
+     input_dir: ${setup.scrape_documents.output_dir}
+     output_dir: 'data/module_md'
+   embed_documents:
+     input_dir: ${setup.convert_documents.output_dir}
+     output_dir: ${vectorstore.dir}
+
+ vectorstore:
+   dir: 'data/module_catalogue_vectorstore'
+
+ models:
+   embedding:
+     _target_: langchain_openai.OpenAIEmbeddings
+     model: text-embedding-3-small
+
+   llm:
+     _target_: langchain_openai.ChatOpenAI
+     model: gpt-4o-mini
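The `_target_` entries above are resolved by Hydra at runtime; a sketch of what the `hydra.utils.instantiate` calls in `app.py` do with this config (`cfg` stands for the loaded `DictConfig`):

```python
import hydra

# Equivalent to ChatOpenAI(model="gpt-4o-mini") and
# OpenAIEmbeddings(model="text-embedding-3-small") given the YAML above
llm = hydra.utils.instantiate(cfg.models.llm)
embedding_model = hydra.utils.instantiate(cfg.models.embedding)
```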
src/ucl_module_chat/data_processing/document_conversion.py ADDED
@@ -0,0 +1,227 @@
+ import re
+ from pathlib import Path
+
+ import hydra
+ import jinja2
+ import omegaconf
+ from bs4 import BeautifulSoup
+ from loguru import logger
+ from tqdm import tqdm
+
+ from ucl_module_chat.data_processing.document_templates import module_template
+ from ucl_module_chat.utils.resolve_paths import get_abs_path_using_repo_root
+
+
+ def _extract_module_info_from_html(module_html: str) -> dict:
+     """Parse HTML content for a UCL module page and extract key information."""
+
+     soup = BeautifulSoup(module_html, "html.parser")
+
+     # Extract the module title and code from the og:title meta tag
+     pattern = r"""
+         (?P<module_title>.*?)                # Capture the module title
+         \s*                                  # Optional whitespace
+         \((?P<module_code>[A-Z]{4}\d{4})\)   # Capture the alphanumeric code
+     """
+     og_title = soup.find("meta", attrs={"name": "og:title"})["content"]
+     match = re.search(pattern, og_title, re.VERBOSE)
+     module_title = match.group("module_title").strip()
+     module_code = match.group("module_code").strip()
+
+     url = soup.find("meta", attrs={"property": "og:url"})["content"]
+
+     faculty = soup.find("meta", attrs={"name": "ucl:sanitized_faculty"})["content"]
+
+     teaching_department = soup.find(
+         "meta", attrs={"name": "ucl:sanitized_teaching_department"}
+     )["content"]
+
+     level = soup.find("meta", attrs={"name": "ucl:sanitized_level"})["content"]
+
+     teaching_term = soup.find(
+         "meta", attrs={"name": "ucl:sanitized_intended_teaching_term"}
+     )["content"]
+
+     credit_value = soup.find("meta", attrs={"name": "ucl:sanitized_credit_value"})[
+         "content"
+     ]
+
+     sanitized_subject = soup.find("meta", attrs={"name": "ucl:sanitized_subject"})[
+         "content"
+     ]
+
+     sanitized_keywords = soup.find("meta", attrs={"name": "ucl:sanitized_keywords"})[
+         "content"
+     ]
+
+     restrictions = (
+         soup.find("dt", string="Restrictions")
+         .find_next_sibling("dd")
+         .get_text()
+         .strip()
+         .replace("\n", " ")
+     )
+
+     alternative_credit_options = (
+         soup.find("h2", string="Alternative credit options")
+         .find_next("p")
+         .get_text()
+         .strip()
+     )
+
+     description = soup.find("div", class_="module-description").get_text()
+
+     # Deliveries - there may be multiple deliveries for a module
+     potential_deliveries = soup.find_all("div", class_="box tagged box--bar-thick")
+     deliveries = [
+         d
+         for d in potential_deliveries
+         if d.find("h3", string="Teaching and assessment") is not None
+     ]
+     collated_d = []
+     for d in deliveries:
+         delivery_info = {}
+
+         # Info from the header
+         header = d.find("h2").get_text()
+         # Might need to modify this regex pattern if some modules are different
+         pattern = r"""
+             Intended\steaching\sterm:              # Matches 'Intended teaching term:'
+             \s*                                    # Optional whitespace
+             (?P<term>[\w\s,\(\)]+)                 # Capture the term
+             \s*                                    # Optional whitespace
+             (?P<type>Undergraduate|Postgraduate)   # Matches UG or PG
+             \s*                                    # Optional whitespace
+             \(FHEQ\sLevel\s(?P<fheq_level>\d+)\)   # Matches 'FHEQ Level X'
+         """  # and captures level number
+
+         # Search for matches in the header string
+         match = re.search(pattern, header, re.VERBOSE)
+
+         if match:
+             # Extracted values from the regex groups
+             delivery_info["teaching_term"] = match.group("term").strip()
+             delivery_info["type"] = match.group("type").strip()
+             delivery_info["fheq_level"] = match.group("fheq_level").strip()
+
+         # Info from the table for this delivery
+         col_1 = d.find("section", class_="middle-split__column1")
+
+         delivery_info["mode_of_study"] = (
+             col_1.find("dt", string="Mode of study").find_next("dd").text.strip()
+         )
+
+         assessment_methods = (
+             col_1.find("dt", string="Methods of assessment")
+             .find_next("dd")
+             .find_all("div")
+         )
+         delivery_info["methods_of_assessment"] = ", ".join(
+             [" ".join(method.text.strip().split()) for method in assessment_methods]
+         )
+
+         delivery_info["mark_scheme"] = (
+             col_1.find("dt", string="Mark scheme").find_next("dd").text.strip()
+         )
+
+         col_2 = d.find("section", class_="middle-split__column2")
+
+         email = col_2.find("a", href=re.compile(r"^mailto:"))
+         delivery_info["contact_email"] = email.text.strip() if email else None
+
+         delivery_info["number_of_students_prior_year"] = (
+             col_2.find("dt", string="Number of students on module in previous year")
+             .find_next("dd")
+             .text.strip()
+         )
+
+         collated_d.append(delivery_info)
+
+     info = {
+         "module_title": module_title,
+         "module_code": module_code,
+         "url": url,
+         "faculty": faculty,
+         "teaching_department": teaching_department,
+         "level": level,
+         "teaching_term": teaching_term,
+         "credit_value": credit_value,
+         "subject": sanitized_subject,
+         "keywords": sanitized_keywords,
+         "alternative_credit_options": alternative_credit_options,
+         "description": description,
+         "restrictions": restrictions,
+         "deliveries": collated_d,
+     }
+
+     return info
+
+
+ def _module_info_to_markdown(module_info: dict, template: jinja2.Template) -> str:
+     """Process module information dictionary into markdown document using template."""
+     return template.render(module_info)
+
+
+ def _convert_module_html_to_markdown(
+     module_html: str, extract_function: callable, markdown_template: jinja2.Template
+ ) -> None:
+     """Convert a single UCL module HTML page to a markdown document."""
+     module_info = extract_function(module_html)
+     module_markdown = _module_info_to_markdown(module_info, markdown_template)
+     return module_markdown
+
+
+ def convert_all_documents_html_to_markdown(
+     input_dir: str | Path,
+     output_dir: str | Path,
+     extract_function: callable = _extract_module_info_from_html,
+     markdown_template: jinja2.Template = module_template,
+ ):
+     """Convert all UCL module HTML pages in a directory to markdown documents."""
+
+     input_dir = Path(input_dir)
+     output_dir = Path(output_dir)
+     output_dir.mkdir(parents=True, exist_ok=True)
+
+     logger.info("""Converting HTML module files to markdown documents""")
+
+     all_module_html_files = list(input_dir.glob("*.html"))
+
+     n_modules = len(all_module_html_files)
+
+     logger.info(
+         f"Identified {n_modules} HTML module files to convert to markdown documents"
+     )
+
+     errors = 0
+     for module_html_path in tqdm(all_module_html_files):
+         try:
+             with open(module_html_path, "r") as f:
+                 module_html = f.read()
+
+             module_markdown = _convert_module_html_to_markdown(
+                 module_html, extract_function, markdown_template
+             )
+
+             output_path = output_dir / f"{module_html_path.stem}.md"
+             with open(output_path, "w") as f:
+                 f.write(module_markdown)
+         except Exception as e:
+             errors += 1
+             logger.error(f"Error converting {module_html_path.stem}: {e}")
+     logger.info(f"{n_modules - errors} HTML files successfully converted to markdown")
+     logger.info(f"{errors} HTML files could not be converted.")
+
+
+ @hydra.main(version_base=None, config_path="../conf", config_name="config")
+ def main(cfg: omegaconf.DictConfig) -> None:
+     """Run the document conversion process."""
+     cfg = cfg.setup.convert_documents
+     cfg.input_dir = get_abs_path_using_repo_root(cfg.input_dir)
+     cfg.output_dir = get_abs_path_using_repo_root(cfg.output_dir)
+
+     convert_all_documents_html_to_markdown(**cfg)
+
+
+ if __name__ == "__main__":
+     main()
src/ucl_module_chat/data_processing/document_embedding.py ADDED
@@ -0,0 +1,48 @@
+ from pathlib import Path
+
+ import hydra
+ import omegaconf
+ from dotenv import load_dotenv
+ from langchain_community.vectorstores import FAISS
+ from langchain_core.embeddings.embeddings import Embeddings
+ from loguru import logger
+
+ from ucl_module_chat.utils.resolve_paths import get_abs_path_using_repo_root
+
+ load_dotenv()
+
+
+ def embed_documents(input_dir: str | Path, embedding_model: Embeddings) -> FAISS:
+     """Create a FAISS vectorstore from a directory of markdown documents."""
+     input_dir = Path(input_dir)
+
+     all_module_document_paths = list(input_dir.glob("*.md"))
+
+     module_docs = []
+
+     for module_md_path in all_module_document_paths:
+         with open(module_md_path, "r") as f:
+             module_md = f.read()
+             module_docs.append(module_md)
+
+     logger.info(f"Embedding {len(module_docs)} documents")
+     vectorstore = FAISS.from_texts(module_docs, embedding=embedding_model)
+     logger.info(f"Vectorstore created with {vectorstore.index.ntotal} vectors")
+     return vectorstore
+
+
+ @hydra.main(version_base=None, config_path="../conf", config_name="config")
+ def main(cfg: omegaconf.DictConfig) -> None:
+     """Run the document embedding process."""
+     embedding_model = hydra.utils.instantiate(cfg.models.embedding)
+     cfg.setup.embed_documents.input_dir = get_abs_path_using_repo_root(
+         cfg.setup.embed_documents.input_dir
+     )
+     cfg.vectorstore.dir = get_abs_path_using_repo_root(cfg.vectorstore.dir)
+     vectorstore = embed_documents(cfg.setup.embed_documents.input_dir, embedding_model)
+     vectorstore.save_local(cfg.vectorstore.dir)
+     logger.info(f"Vectorstore saved to {cfg.vectorstore.dir}")
+
+
+ if __name__ == "__main__":
+     main()
src/ucl_module_chat/data_processing/document_scraping.py ADDED
@@ -0,0 +1,94 @@
+ import re
+ import time
+ from pathlib import Path
+
+ import hydra
+ import omegaconf
+ import requests
+ from bs4 import BeautifulSoup
+ from loguru import logger
+ from tqdm import tqdm
+
+ from ucl_module_chat.utils.resolve_paths import get_abs_path_using_repo_root
+
+
+ def _get_index_page_html(index_page_url: str):
+     """Get the HTML content of the index page."""
+     response = requests.get(index_page_url)
+     index_page_html = response.text
+     return index_page_html
+
+
+ def _get_module_urls_from_index_page(index_page_html: str, regex_url_pattern: str):
+     """Extract module URLs from the index page HTML using regex."""
+     # Compile the pattern once and reuse it for every candidate URL
+     soup = BeautifulSoup(index_page_html, "html.parser")
+     pattern = re.compile(regex_url_pattern)
+
+     module_urls = []
+     for cite_tag in soup.find_all("cite"):
+         url = cite_tag["data-url"]
+         if pattern.match(url):
+             module_urls.append(url)
+
+     return module_urls
+
+
+ def _save_module_page_html(module_url: str, output_dir: str | Path):
+     """Save the HTML content of a module page to a text file."""
+     output_dir = Path(output_dir)
+
+     # Send a GET request to fetch the HTML content
+     response = requests.get(module_url)
+     response.raise_for_status()  # Raise an exception for HTTP errors
+
+     # Extract the part of the URL after "/modules/" for the filename
+     module_id = module_url.split("/modules/")[1]
+
+     # Save the HTML content to a text file
+     file_path = output_dir / f"{module_id}.html"
+     with open(file_path, "w", encoding="utf-8") as file:
+         file.write(response.text)
+
+
+ def scrape_documents(
+     index_page_url: str | Path,
+     output_dir: str | Path,
+     regex_url_pattern: str,
+     wait_time_seconds: int = 2,
+ ):
+     """Scrape module pages and save HTML content to text files."""
+     output_dir = Path(output_dir)
+     output_dir.mkdir(parents=True, exist_ok=True)
+
+     logger.info(f"Identifying module pages from {index_page_url}")
+     index_page_html = _get_index_page_html(index_page_url)
+
+     module_urls = _get_module_urls_from_index_page(index_page_html, regex_url_pattern)
+     n_modules = len(module_urls)
+     logger.info(f"Identified {len(module_urls)} module pages to save to {output_dir}.")
+
+     errors = 0
+
+     for url in tqdm(module_urls):
+         try:
+             _save_module_page_html(url, output_dir)
+             time.sleep(wait_time_seconds)  # Pause to avoid abusing the server
+         except requests.exceptions.RequestException as e:
+             logger.error(f"Error saving HTML for {url}: {e}")
+             errors += 1
+
+     logger.info(f"{n_modules - errors} module pages successfully saved")
+     logger.info(f"{errors} module pages could not be saved.")
+
+
+ @hydra.main(version_base=None, config_path="../conf", config_name="config")
+ def main(cfg: omegaconf.DictConfig) -> None:
+     """Run the document scraping process."""
+     cfg = cfg.setup.scrape_documents
+     cfg.output_dir = get_abs_path_using_repo_root(cfg.output_dir)
+     scrape_documents(**cfg)
+
+
+ if __name__ == "__main__":
+     main()
src/ucl_module_chat/data_processing/document_templates.py ADDED
@@ -0,0 +1,40 @@
+ from jinja2 import Template
+
+ module_template = Template(
+     """
+ # {{ module_title }} ({{module_code}})
+
+ ## Key information
+
+ **Course Code:** {{ module_code }} \\
+ **Subject Area:** {{ subject }} \\
+ **Keywords:** {{ keywords }} \\
+ **Module catalogue URL:** {{ url }}
+
+ **Faculty:** {{ faculty }} \\
+ **Teaching Department:** {{ teaching_department }} \\
+ **Credit Value:** {{ credit_value }} \\
+ **Restrictions:** {{ restrictions }}
+
+ ## Alternative credit options
+ {{ alternative_credit_options }}
+
+ ## Description
+ {{ description }}
+
+ ## Module deliveries for 2024/25 academic year
+ {% for delivery in deliveries %}
+ ### {{delivery.type}} (FHEQ Level {{delivery.fheq_level}})
+
+ #### Teaching and assessment
+ **Intended teaching term:** {{ delivery.teaching_term }} \\
+ **Mode of study:** {{ delivery.mode_of_study }} \\
+ **Methods of assessment:** {{ delivery.methods_of_assessment }} \\
+ **Mark scheme:** {{ delivery.mark_scheme }}
+
+ #### Other information
+ **Number of students on module in previous year:** {{ delivery.number_of_students_prior_year }} \\
+ **Who to contact for more information:** {{ delivery.contact_email }}
+ {% endfor %}
+ """
+ )
src/ucl_module_chat/setup.py ADDED
@@ -0,0 +1,44 @@
+ import hydra
+ import omegaconf
+ from dotenv import load_dotenv
+ from loguru import logger
+
+ from ucl_module_chat.data_processing.document_conversion import (
+     convert_all_documents_html_to_markdown,
+ )
+ from ucl_module_chat.data_processing.document_embedding import embed_documents
+ from ucl_module_chat.data_processing.document_scraping import scrape_documents
+ from ucl_module_chat.utils.resolve_paths import get_abs_path_using_repo_root
+
+ load_dotenv()
+
+
+ @hydra.main(version_base=None, config_path="conf", config_name="config")
+ def main(cfg: omegaconf.DictConfig) -> None:
+     """Scrape module catalogue, convert HTML to markdown, and embed in vectorstore."""
+
+     cfg.setup.scrape_documents.output_dir = get_abs_path_using_repo_root(
+         cfg.setup.scrape_documents.output_dir
+     )
+     cfg.setup.convert_documents.input_dir = get_abs_path_using_repo_root(
+         cfg.setup.convert_documents.input_dir
+     )
+     cfg.setup.convert_documents.output_dir = get_abs_path_using_repo_root(
+         cfg.setup.convert_documents.output_dir
+     )
+     cfg.setup.embed_documents.input_dir = get_abs_path_using_repo_root(
+         cfg.setup.embed_documents.input_dir
+     )
+     cfg.vectorstore.dir = get_abs_path_using_repo_root(cfg.vectorstore.dir)
+
+     scrape_documents(**cfg.setup.scrape_documents)
+     convert_all_documents_html_to_markdown(**cfg.setup.convert_documents)
+
+     embedding_model = hydra.utils.instantiate(cfg.models.embedding)
+     vectorstore = embed_documents(cfg.setup.embed_documents.input_dir, embedding_model)
+     vectorstore.save_local(cfg.vectorstore.dir)
+     logger.info(f"Vectorstore saved to {cfg.vectorstore.dir}")
+
+
+ if __name__ == "__main__":
+     main()
src/ucl_module_chat/utils/resolve_paths.py ADDED
@@ -0,0 +1,17 @@
+ import os
+ from pathlib import Path
+
+ from git import Repo
+
+
+ def get_abs_path_using_repo_root(path: str | Path) -> Path:
+     """Takes path relative to the repo root and returns the absolute path."""
+
+     # Initialize the repo (this will automatically find the root if
+     # you're in a subdirectory)
+     repo = Repo(os.getcwd(), search_parent_directories=True)
+
+     # Get the root directory
+     repo_root = repo.git.rev_parse("--show-toplevel")
+     abs_path = Path(repo_root) / path
+     return abs_path
uv.lock ADDED
The diff for this file is too large to render. See raw diff