Added vectorstore
Browse files- .env.template +2 -0
- backend/app/__init__.py +9 -1
- backend/app/vectorstore.py +44 -0
- backend/tests/test_vectorstore.py +35 -0
- pyproject.toml +21 -8
- pytest.ini +2 -1
- static/data/langchain_rag_tutorial.html +0 -0
- test_local.sh +1 -1
.env.template
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
# OpenAI API Key - Required for embeddings and LLM calls
|
2 |
+
OPENAI_API_KEY=your_key_here
|
backend/app/__init__.py
CHANGED
@@ -1 +1,9 @@
|
|
1 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from pathlib import Path
|
3 |
+
from dotenv import load_dotenv, find_dotenv
|
4 |
+
|
5 |
+
# Load environment variables from the .env file located by find_dotenv().
load_dotenv(find_dotenv())

# Fail fast at package import time when the OpenAI key is unavailable.
# An empty value (e.g. a bare `OPENAI_API_KEY=` line in .env) is as unusable
# as a missing one, so both cases are rejected here rather than surfacing
# later as an authentication error from the API client.
if not os.getenv("OPENAI_API_KEY"):
    raise ValueError("OPENAI_API_KEY is not set")
|
backend/app/vectorstore.py
ADDED
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Early prototype of the vector store: downloads the LangChain RAG tutorial page, splits it into chunks, and indexes them in an in-memory Qdrant collection so the rest of the app has something to query.
|
3 |
+
"""
|
4 |
+
import os
|
5 |
+
import requests
|
6 |
+
import nltk
|
7 |
+
|
8 |
+
from langchain_community.vectorstores import Qdrant
|
9 |
+
from langchain_openai.embeddings import OpenAIEmbeddings
|
10 |
+
|
11 |
+
# Imports used only by the loading/splitting steps below; grouped at the top
# of this section instead of being scattered between statements.
from langchain_community.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# NLTK resources are fetched when this module is imported.
# NOTE(review): presumably required by the HTML parsing behind
# DirectoryLoader -- confirm before removing.
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")

# Create static/data directory if it doesn't exist (path is relative to the
# current working directory).
os.makedirs("static/data", exist_ok=True)

# Download and save the webpage.
url = "https://python.langchain.com/docs/tutorials/rag/"
# A timeout keeps a stalled connection from hanging the import indefinitely.
response = requests.get(url, timeout=30)
# Raise on 4xx/5xx so an error page is never written to disk and indexed.
response.raise_for_status()
with open("static/data/langchain_rag_tutorial.html", "w", encoding="utf-8") as f:
    f.write(response.text)

# Load HTML files from static/data directory
loader = DirectoryLoader("static/data", glob="*.html")
documents = loader.load()

# Split documents into overlapping chunks before embedding them.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
split_chunks = text_splitter.split_documents(documents)

# Build an in-memory Qdrant collection from the chunks; other modules access
# it through the module-level name `vector_db`.
# NOTE(review): the collection name does not describe the indexed content
# (the LangChain RAG tutorial) and looks copied from elsewhere -- confirm
# before renaming, since it is a runtime identifier.
vector_db = Qdrant.from_documents(
    split_chunks,
    embedding_model,
    location=":memory:",
    collection_name="extending_context_window_llama_3",
)
|
backend/tests/test_vectorstore.py
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pytest
|
2 |
+
import os
|
3 |
+
from langchain.schema import Document
|
4 |
+
from backend.app import vectorstore
|
5 |
+
|
6 |
+
def test_directory_creation():
    """Test that static/data is a directory and the tutorial HTML file exists in it."""
    # isdir/isfile (rather than exists) also verify the path type, so a stray
    # file named "static/data" or a directory named like the HTML file fails.
    assert os.path.isdir("static/data")
    assert os.path.isfile("static/data/langchain_rag_tutorial.html")
|
10 |
+
|
11 |
+
def test_html_content():
    """Check that the saved tutorial page contains the expected keywords."""
    html_path = "static/data/langchain_rag_tutorial.html"
    with open(html_path, "r", encoding="utf-8") as html_file:
        page_text = html_file.read()

    # Each marker is expected somewhere in the LangChain RAG tutorial page.
    for marker in ("RAG", "LangChain"):
        assert marker in page_text
|
19 |
+
|
20 |
+
def test_vector_store_similarity_search():
    """Query the module-level vector store and sanity-check the top two hits."""
    hits = vectorstore.vector_db.similarity_search("What is RAG?", k=2)

    # Exactly k results come back, and the first one is a Document.
    assert len(hits) == 2
    assert isinstance(hits[0], Document)

    # The retrieved text, lower-cased, should mention the topic asked about.
    merged_text = " ".join(hit.page_content for hit in hits).lower()
    for keyword in ("rag", "retrieval"):
        assert keyword in merged_text
|
pyproject.toml
CHANGED
@@ -1,9 +1,7 @@
|
|
1 |
[project]
|
2 |
-
name = "
|
3 |
version = "0.1.0"
|
4 |
-
description = "
|
5 |
-
readme = "README.md"
|
6 |
-
requires-python = ">=3.12"
|
7 |
dependencies = [
|
8 |
"chainlit>=2.0.4",
|
9 |
"numpy>=2.2.2",
|
@@ -14,9 +12,24 @@ dependencies = [
|
|
14 |
"fastapi>=0.110.0",
|
15 |
"uvicorn>=0.27.1",
|
16 |
"pytest>=8.0.0",
|
17 |
-
"httpx>=0.26.0"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
18 |
]
|
19 |
|
20 |
-
[
|
21 |
-
|
22 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
[project]
|
2 |
+
name = "backend"
|
3 |
version = "0.1.0"
|
4 |
+
description = "Backend for the application"
|
|
|
|
|
5 |
dependencies = [
|
6 |
"chainlit>=2.0.4",
|
7 |
"numpy>=2.2.2",
|
|
|
12 |
"fastapi>=0.110.0",
|
13 |
"uvicorn>=0.27.1",
|
14 |
"pytest>=8.0.0",
|
15 |
+
"httpx>=0.26.0",
|
16 |
+
"langchain>=0.3.15",
|
17 |
+
"langchain-community>=0.3.15",
|
18 |
+
"langchain-openai>=0.3.2",
|
19 |
+
"requests>=2.31.0",
|
20 |
+
"python-dotenv>=1.0.0",
|
21 |
+
"openai>=1.12.0",
|
22 |
+
"pytest-dotenv>=0.5.2",
|
23 |
+
"unstructured",
|
24 |
+
"qdrant-client>=1.6.0",
|
25 |
]
|
26 |
|
27 |
+
[project.optional-dependencies]
|
28 |
+
test = [
|
29 |
+
"pytest>=7.4.0",
|
30 |
+
"pytest-asyncio>=0.21.1",
|
31 |
+
]
|
32 |
+
|
33 |
+
[build-system]
|
34 |
+
requires = ["setuptools>=61.0"]
|
35 |
+
build-backend = "setuptools.build_meta"
|
pytest.ini
CHANGED
@@ -1,2 +1,3 @@
|
|
1 |
[pytest]
|
2 |
-
pythonpath = .
|
|
|
|
1 |
[pytest]
|
2 |
+
pythonpath = .
|
3 |
+
env_files = .env
|
static/data/langchain_rag_tutorial.html
ADDED
The diff for this file is too large to render.
See raw diff
|
|
test_local.sh
CHANGED
@@ -30,6 +30,6 @@ echo -e "${BLUE}${DIVIDER}${NC}"
|
|
30 |
|
31 |
# Run backend tests
|
32 |
echo -e "${YELLOW}Running backend tests...${NC}"
|
33 |
-
docker run simplify-test pytest backend/tests
|
34 |
|
35 |
echo -e "\n${GREEN}✨ Testing complete!${NC}\n"
|
|
|
30 |
|
31 |
# Run backend tests
|
32 |
echo -e "${YELLOW}Running backend tests...${NC}"
|
33 |
+
docker run --env-file .env simplify-test pytest backend/tests
|
34 |
|
35 |
echo -e "\n${GREEN}✨ Testing complete!${NC}\n"
|