Rsr2425 committed
Commit ed91833 · 1 Parent(s): 50f8987

Added vectorstore

.env.template ADDED
@@ -0,0 +1,2 @@
+ # OpenAI API Key - Required for embeddings and LLM calls
+ OPENAI_API_KEY=your_key_here
backend/app/__init__.py CHANGED
@@ -1 +1,9 @@
- # Empty file to make the directory a Python package
+ import os
+ from pathlib import Path
+ from dotenv import load_dotenv, find_dotenv
+
+ # Load environment variables from .env file in project root
+ load_dotenv(find_dotenv())
+
+ if os.getenv("OPENAI_API_KEY") is None:
+     raise ValueError("OPENAI_API_KEY is not set")
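
The package now fails fast: importing backend.app without a key raises immediately. A minimal sketch of how that surfaces to a caller, assuming the project root is on the Python path and neither the environment nor a .env file provides OPENAI_API_KEY (the sketch is illustrative, not part of the commit):

import importlib

try:
    importlib.import_module("backend.app")  # runs the load_dotenv + key check above
except ValueError as exc:
    print(f"Startup check failed: {exc}")   # -> "OPENAI_API_KEY is not set"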
backend/app/vectorstore.py ADDED
@@ -0,0 +1,44 @@
+ """
+ Super early version of a vector store. Just want to make something available for the rest of the app to use.
+ """
+ import os
+ import requests
+ import nltk
+
+ from langchain_community.vectorstores import Qdrant
+ from langchain_openai.embeddings import OpenAIEmbeddings
+
+ nltk.download('punkt_tab')
+ nltk.download('averaged_perceptron_tagger_eng')
+
+ embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")
+
+ # Create static/data directory if it doesn't exist
+ os.makedirs("static/data", exist_ok=True)
+
+ # Download and save the webpage
+ url = "https://python.langchain.com/docs/tutorials/rag/"
+ response = requests.get(url)
+ with open("static/data/langchain_rag_tutorial.html", "w", encoding="utf-8") as f:
+     f.write(response.text)
+
+ from langchain_community.document_loaders import DirectoryLoader
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+ # Load HTML files from static/data directory
+ loader = DirectoryLoader("static/data", glob="*.html")
+ documents = loader.load()
+
+ # Split documents into chunks
+ text_splitter = RecursiveCharacterTextSplitter(
+     chunk_size=1000,
+     chunk_overlap=200
+ )
+ split_chunks = text_splitter.split_documents(documents)
+
+ vector_db = Qdrant.from_documents(
+     split_chunks,
+     embedding_model,
+     location=":memory:",
+     collection_name="extending_context_window_llama_3",
+ )
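
With the module-level vector_db in place, the rest of the app can import and query it directly. A rough usage sketch, assuming the import-time download and indexing above succeed; as_retriever is the stock LangChain helper and is not part of this commit:

from backend.app.vectorstore import vector_db

# Plain similarity search, the same call the new tests exercise.
docs = vector_db.similarity_search("What is RAG?", k=2)
for doc in docs:
    print(doc.page_content[:200])

# Or wrap the store as a retriever for downstream chains.
retriever = vector_db.as_retriever(search_kwargs={"k": 2})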
backend/tests/test_vectorstore.py ADDED
@@ -0,0 +1,35 @@
+ import pytest
+ import os
+ from langchain.schema import Document
+ from backend.app import vectorstore
+
+ def test_directory_creation():
+     """Test that the static/data directory is created"""
+     assert os.path.exists("static/data")
+     assert os.path.exists("static/data/langchain_rag_tutorial.html")
+
+ def test_html_content():
+     """Test that the HTML content was downloaded and contains expected content"""
+     with open("static/data/langchain_rag_tutorial.html", "r", encoding="utf-8") as f:
+         content = f.read()
+
+     # Check for some expected content from the LangChain RAG tutorial
+     assert "RAG" in content
+     assert "LangChain" in content
+
+ def test_vector_store_similarity_search():
+     """Test that the vector store can perform similarity search"""
+     # Test query
+     query = "What is RAG?"
+
+     # Perform similarity search
+     results = vectorstore.vector_db.similarity_search(query, k=2)
+
+     # Verify we get results
+     assert len(results) == 2
+     assert isinstance(results[0], Document)
+
+     # Verify the results contain relevant content
+     combined_content = " ".join([doc.page_content for doc in results]).lower()
+     assert "rag" in combined_content
+     assert "retrieval" in combined_content
pyproject.toml CHANGED
@@ -1,9 +1,7 @@
  [project]
- name = "simplify"
+ name = "backend"
  version = "0.1.0"
- description = "LLM System to generate quizzes that simplify the learning process of tools and frameworks"
- readme = "README.md"
- requires-python = ">=3.12"
+ description = "Backend for the application"
  dependencies = [
      "chainlit>=2.0.4",
      "numpy>=2.2.2",
@@ -14,9 +12,24 @@ dependencies = [
      "fastapi>=0.110.0",
      "uvicorn>=0.27.1",
      "pytest>=8.0.0",
-     "httpx>=0.26.0"
+     "httpx>=0.26.0",
+     "langchain>=0.3.15",
+     "langchain-community>=0.3.15",
+     "langchain-openai>=0.3.2",
+     "requests>=2.31.0",
+     "python-dotenv>=1.0.0",
+     "openai>=1.12.0",
+     "pytest-dotenv>=0.5.2",
+     "unstructured",
+     "qdrant-client>=1.6.0",
  ]

- [tool.pytest.ini_options]
- testpaths = ["backend/tests"]
- python_files = ["test_*.py"]
+ [project.optional-dependencies]
+ test = [
+     "pytest>=7.4.0",
+     "pytest-asyncio>=0.21.1",
+ ]
+
+ [build-system]
+ requires = ["setuptools>=61.0"]
+ build-backend = "setuptools.build_meta"
pytest.ini CHANGED
@@ -1,2 +1,3 @@
  [pytest]
- pythonpath = .
+ pythonpath = .
+ env_files = .env
static/data/langchain_rag_tutorial.html ADDED
The diff for this file is too large to render. See raw diff
 
test_local.sh CHANGED
@@ -30,6 +30,6 @@ echo -e "${BLUE}${DIVIDER}${NC}"

  # Run backend tests
  echo -e "${YELLOW}Running backend tests...${NC}"
- docker run simplify-test pytest backend/tests
+ docker run --env-file .env simplify-test pytest backend/tests

  echo -e "\n${GREEN}✨ Testing complete!${NC}\n"