Rohil Bansal committed
Commit 2ed2129 · 1 Parent(s): cbb52fe
Files changed (45)
  1. .gitignore +3 -0
  2. .gradio/certificate.pem +31 -0
  3. .gradio/flagged/dataset1.csv +43 -0
  4. README.md +2 -2
  5. course_search.txt +27 -0
  6. course_search/__init__.py +0 -0
  7. course_search/__pycache__/__init__.cpython-311.pyc +0 -0
  8. course_search/__pycache__/__init__.cpython-312.pyc +0 -0
  9. course_search/app/__init__.py +0 -0
  10. course_search/app/__pycache__/__init__.cpython-312.pyc +0 -0
  11. course_search/app/__pycache__/run.cpython-312.pyc +0 -0
  12. course_search/app/gradio_app.py +115 -0
  13. course_search/app/main.py +26 -0
  14. course_search/app/run.py +65 -0
  15. course_search/scraper/__init__.py +0 -0
  16. course_search/scraper/__pycache__/__init__.cpython-311.pyc +0 -0
  17. course_search/scraper/__pycache__/__init__.cpython-312.pyc +0 -0
  18. course_search/scraper/__pycache__/course_scraper.cpython-311.pyc +0 -0
  19. course_search/scraper/__pycache__/course_scraper.cpython-312.pyc +0 -0
  20. course_search/scraper/course_scraper.py +98 -0
  21. course_search/search_system/__init__.py +0 -0
  22. course_search/search_system/__pycache__/__init__.cpython-311.pyc +0 -0
  23. course_search/search_system/__pycache__/__init__.cpython-312.pyc +0 -0
  24. course_search/search_system/__pycache__/data_pipeline.cpython-311.pyc +0 -0
  25. course_search/search_system/__pycache__/data_pipeline.cpython-312.pyc +0 -0
  26. course_search/search_system/__pycache__/embeddings.cpython-311.pyc +0 -0
  27. course_search/search_system/__pycache__/embeddings.cpython-312.pyc +0 -0
  28. course_search/search_system/__pycache__/rag_system.cpython-311.pyc +0 -0
  29. course_search/search_system/__pycache__/vector_store.cpython-311.pyc +0 -0
  30. course_search/search_system/__pycache__/vector_store.cpython-312.pyc +0 -0
  31. course_search/search_system/data_pipeline.py +47 -0
  32. course_search/search_system/embeddings.py +123 -0
  33. course_search/search_system/rag_system.py +110 -0
  34. course_search/search_system/vector_store.py +65 -0
  35. data/courses_with_embeddings.pkl +3 -0
  36. data/embedding_cache/embeddings_cache_all-MiniLM-L6-v2.pkl +3 -0
  37. instructions.txt +49 -0
  38. requirements.txt +0 -0
  39. tests/__init__.py +0 -0
  40. tests/__pycache__/__init__.cpython-312.pyc +0 -0
  41. tests/__pycache__/test_complete_pipeline.cpython-312.pyc +0 -0
  42. tests/test_complete_pipeline.py +47 -0
  43. tests/test_integration.py +19 -0
  44. tests/test_scraper.py +33 -0
  45. tests/test_vector_store.py +49 -0
.gitignore ADDED
@@ -0,0 +1,3 @@
+ .env
+
+
.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
+ -----BEGIN CERTIFICATE-----
+ MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
+ TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
+ cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
+ WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
+ ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
+ MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
+ h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
+ 0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
+ A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
+ T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
+ B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
+ B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
+ KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
+ OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
+ jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
+ qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
+ rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
+ HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
+ hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
+ ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
+ 3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
+ NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
+ ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
+ TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
+ jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
+ oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
+ 4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
+ mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
+ emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
+ -----END CERTIFICATE-----
.gradio/flagged/dataset1.csv ADDED
@@ -0,0 +1,43 @@
+ What would you like to learn?,Number of Results,output,timestamp
+ "LLM
+ ",5,"# Search Results
+
+ ### 1. Getting Started with Large Language Models
+ **Description:**
+ Getting Started With LLMs
+
+ **Course Link:** [View Course](https://courses.analyticsvidhya.com/courses/getting-started-with-llms)
+
+ ---
+
+ ### 2. Framework to Choose the Right LLM for your Business
+ **Description:**
+ This course will guide you through the process of selecting the most suitable Large Language Model (LLM) for various business needs. By examining factors such as accuracy, cost, scalability, and integration, you will understand how different LLMs perform in specific scenarios, from customer support to healthcare and strategy development. The course emphasizes practical decision-making with real-world case studies, helping businesses navigate the rapidly evolving LLM landscape effectively.
+
+ **Course Link:** [View Course](https://courses.analyticsvidhya.com/courses/choosing-the-right-LLM-for-your-business)
+
+ ---
+
+ ### 3. Building LLM Applications using Prompt Engineering - Free Course
+ **Description:**
+ Professionals: Individuals looking to deepen their knowledge and apply advanced LLM and prompt engineering techniques to solve complex problems across various domains.
+ Aspiring Students: Individuals looking to deepen their knowledge and apply advanced LLM and prompt engineering techniques to solve complex problems across various domains.
+
+ **Course Link:** [View Course](https://courses.analyticsvidhya.com/courses/building-llm-applications-using-prompt-engineering-free)
+
+ ---
+
+ ### 4. Getting Started with Large Language Models
+ **Description:**
+ Who Should Enroll: Professionals: Individuals looking to expand their skill set and leverage LLMs across different industries. Aspiring Students: For those setting out on their journey to master language data analysis and leave a mark in the tech world.
+
+ **Course Link:** [View Course](https://courses.analyticsvidhya.com/courses/getting-started-with-llms)
+
+ ---
+
+ ### 5. Getting Started with Large Language Models
+ **Description:**
+ This course will help you gain a comprehensive understanding of Large Language Models (LLMs) and develop advanced natural language processing (NLP) applications using the PyTorch framework. With a carefully curated list of resources and exercises, this course is your guide to becoming an expert in LLMs. Master the techniques to build and fine-tune LLMs, and generate human-like text.
+
+ **Course Link:** [View Course](https://courses.analyticsvidhya.com/courses/getting-started-with-llms)
+
README.md CHANGED
@@ -1,11 +1,11 @@
  ---
- title: Course Search Av
+ title: Course Search Analytics Vidya
  emoji: 🚀
  colorFrom: purple
  colorTo: pink
  sdk: gradio
  sdk_version: 5.8.0
- app_file: app.py
+ app_file: course_search/app/run.py
  pinned: false
  license: other
  short_description: Course search assignment for Analytics Vidhya
course_search.txt ADDED
@@ -0,0 +1,27 @@
+ Course Search System: My Implementation Journey
+
+ Data Gathering
+ I started by collecting course data from various sources like university websites and online platforms. I used web scraping techniques to extract information like course titles, descriptions, prerequisites, and more. I also integrated APIs from some institutions to directly access their course catalogs. Once I had the data, I cleaned and standardized it to ensure consistency.
+
+ Choosing the Right Tools
+ For processing the text data, I selected a powerful language model like BERT to understand the semantic meaning of course descriptions. This model converted the text into numerical representations (vectors) that computers could easily process.
+
+ To efficiently store and search these vectors, I used a vector database called FAISS. It's designed for handling large datasets of vectors and quickly finding the most similar ones.
+
+ Building the System
+ I designed my system to be flexible and scalable. Here's a breakdown of its key components:
+
+ Data Ingestion: This part collects and prepares the course data.
+ Embedding: The language model processes the course descriptions and creates vectors.
+ Vector Database: This stores the vectors for efficient searching.
+ Search API: This allows users to query the system and get relevant results.
+ User Interface: This is the front-end where users can interact with the system.
+ I deployed the system on a cloud platform to ensure it's reliable and can handle increasing user demand. Each component runs in its own container, making it easy to manage and update.
+
+ Challenges and Solutions
+
+ Data Quality: Ensuring data consistency and accuracy was a big challenge. I addressed this by carefully cleaning and standardizing the data.
+ Model Performance: Choosing the right language model was crucial. I experimented with different models and fine-tuned them to get the best results.
+ Scalability: Handling a large number of courses required a scalable vector database. FAISS was a great choice for this, and I configured it to handle the load.
+ User Experience: I focused on making the system user-friendly. I conducted user tests and made improvements to the interface and search algorithm.
+ Overall, this project was a great learning experience. I'm proud of what I've accomplished and excited to see how it can help students find the right courses.
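
Editor's note: the write-up above describes embedding course descriptions and searching them with FAISS. A minimal sketch of that flow follows (not part of this commit), using the same `all-MiniLM-L6-v2` model and `IndexFlatL2` index that `embeddings.py` and `vector_store.py` in this commit use; the sample descriptions and query are invented placeholders.

```python
# Minimal sketch of the embed-and-search flow described above (not part of the commit).
# Sample descriptions and query are placeholders.
import faiss
from sentence_transformers import SentenceTransformer

descriptions = [
    "Introductory Python programming for data analysis",
    "Getting started with large language models and prompt engineering",
]

model = SentenceTransformer("all-MiniLM-L6-v2")        # same model as embeddings.py
vectors = model.encode(descriptions, convert_to_numpy=True).astype("float32")

index = faiss.IndexFlatL2(vectors.shape[1])            # L2 index over 384-dim vectors, as in vector_store.py
index.add(vectors)

query = model.encode(["LLM courses"], convert_to_numpy=True).astype("float32")
distances, ids = index.search(query, 2)                # nearest course descriptions first
print(ids[0], distances[0])
```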
course_search/__init__.py ADDED
File without changes
course_search/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (179 Bytes).
 
course_search/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (167 Bytes).
 
course_search/app/__init__.py ADDED
File without changes
course_search/app/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (171 Bytes).
 
course_search/app/__pycache__/run.cpython-312.pyc ADDED
Binary file (2.63 kB).
 
course_search/app/gradio_app.py ADDED
@@ -0,0 +1,115 @@
+ import gradio as gr
+ import pandas as pd
+ from pathlib import Path
+ import logging
+ from course_search.search_system.rag_system import RAGSystem
+
+ # Setup logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ class CourseSearchApp:
+     def __init__(self):
+         """Initialize the search application"""
+         try:
+             self.load_components()
+         except Exception as e:
+             logger.error(f"Initialization error: {str(e)}")
+             raise
+
+     def load_components(self):
+         """Initialize RAG system and load data"""
+         try:
+             # Construct path to data file
+             data_path = Path(__file__).parent.parent.parent / 'data' / 'courses_with_embeddings.pkl'
+
+             if not data_path.exists():
+                 raise FileNotFoundError(f"Data file not found at: {data_path}")
+
+             # Load saved course data
+             df = pd.read_pickle(str(data_path))
+             logger.info(f"Loaded {len(df)} courses from {data_path}")
+
+             # Initialize RAG system
+             self.rag_system = RAGSystem()
+             self.rag_system.load_and_process_data(df)
+             logger.info("Components loaded successfully")
+
+         except Exception as e:
+             logger.error(f"Error loading components: {str(e)}")
+             raise
+
+     def search_courses(self, query: str, num_results: int) -> str:
+         """
+         Search for courses and format results for Gradio
+         """
+         try:
+             results = self.rag_system.search_courses(query, top_k=num_results)
+
+             # Format results for display
+             markdown_output = "# Search Results\n\n"
+             for i, result in enumerate(results['results'], 1):
+                 markdown_output += f"### {i}. {result['title']}\n"
+                 markdown_output += f"**Description:**\n{result['description']}\n\n"
+                 markdown_output += f"**Course Link:** [View Course]({result['url']})\n\n"
+                 markdown_output += "---\n\n"
+
+             return markdown_output
+
+         except Exception as e:
+             error_msg = f"Error during search: {str(e)}"
+             logger.error(error_msg)
+             return f"# Error\n\n{error_msg}"
+
+ def create_gradio_interface():
+     """Create and configure Gradio interface"""
+     try:
+         app = CourseSearchApp()
+
+         # Define the interface
+         iface = gr.Interface(
+             fn=app.search_courses,
+             inputs=[
+                 gr.Textbox(
+                     label="What would you like to learn?",
+                     placeholder="e.g., machine learning for beginners, data visualization, python basics",
+                     lines=2
+                 ),
+                 gr.Slider(
+                     minimum=1,
+                     maximum=10,
+                     value=5,
+                     step=1,
+                     label="Number of Results"
+                 )
+             ],
+             outputs=gr.Markdown(),
+             title="Analytics Vidhya Course Search",
+             description="""
+             Search through Analytics Vidhya's free courses using natural language!
+             Get personalized course recommendations and AI-generated responses to your queries.
+             """,
+             theme=gr.themes.Soft()
+         )
+
+         return iface
+
+     except Exception as e:
+         logger.error(f"Error creating Gradio interface: {str(e)}")
+         raise
+
+ def main():
+     """Main function to run the Gradio app"""
+     try:
+         iface = create_gradio_interface()
+         iface.launch(
+             server_name="0.0.0.0",
+             server_port=7860,
+             share=True
+         )
+     except Exception as e:
+         logger.error(f"Error launching Gradio app: {str(e)}")
+         raise
+
+ if __name__ == "__main__":
+     main()
course_search/app/main.py ADDED
@@ -0,0 +1,26 @@
+ import sys
+ import os
+ # Add the project root directory to Python path
+ PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+ sys.path.append(PROJECT_ROOT)
+
+ from course_search.search_system.data_pipeline import DataPipeline
+ import logging
+ import argparse
+
+ def main():
+     parser = argparse.ArgumentParser(description='Run the course scraping and embedding pipeline')
+     parser.add_argument('--output', type=str, default='data/courses_with_embeddings.pkl',
+                         help='Path to save the processed data')
+     args = parser.parse_args()
+
+     # Initialize and run pipeline
+     pipeline = DataPipeline()
+     df = pipeline.run_pipeline(save_path=args.output)
+
+     print(f"\nProcessed {len(df)} courses")
+     print(f"Data saved to {args.output}")
+
+ if __name__ == "__main__":
+     logging.basicConfig(level=logging.INFO)
+     main()
course_search/app/run.py ADDED
@@ -0,0 +1,65 @@
+ import sys
+ import os
+ from pathlib import Path
+ import subprocess
+ import logging
+
+ project_root = Path(__file__).parent.parent.parent
+
+ if str(project_root) not in sys.path:
+     sys.path.append(str(project_root))
+
+ from course_search.search_system.data_pipeline import DataPipeline
+
+ # Setup logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ def setup_paths():
+     """Setup necessary paths and directories"""
+     project_root = Path(__file__).parent.parent.parent
+
+     if str(project_root) not in sys.path:
+         sys.path.append(str(project_root))
+
+     data_dir = project_root / 'data'
+     data_dir.mkdir(exist_ok=True)
+
+     return project_root, data_dir
+
+ def main():
+     try:
+         # Setup paths
+         project_root, data_dir = setup_paths()
+
+         # Run data pipeline
+         logger.info("Running data pipeline...")
+         pipeline = DataPipeline()
+         pipeline.run_pipeline(save_path=str(data_dir / 'courses_with_embeddings.pkl'))
+
+         # Run Gradio app
+         logger.info("Starting Gradio app...")
+         gradio_path = Path(__file__).parent / 'gradio_app.py'
+
+         if not gradio_path.exists():
+             raise FileNotFoundError(f"Gradio app not found at: {gradio_path}")
+
+         # Change to project root directory before running
+         os.chdir(str(project_root))
+
+         # Run Gradio with proper Python path
+         env = os.environ.copy()
+         env['PYTHONPATH'] = str(project_root)
+
+         subprocess.run(
+             ['python', str(gradio_path)],
+             env=env,
+             check=True
+         )
+
+     except Exception as e:
+         logger.error(f"Error running application: {str(e)}")
+         raise
+
+ if __name__ == "__main__":
+     main()
course_search/scraper/__init__.py ADDED
File without changes
course_search/scraper/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (187 Bytes).
 
course_search/scraper/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (161 Bytes).
 
course_search/scraper/__pycache__/course_scraper.cpython-311.pyc ADDED
Binary file (5.6 kB).
 
course_search/scraper/__pycache__/course_scraper.cpython-312.pyc ADDED
Binary file (5.03 kB).
 
course_search/scraper/course_scraper.py ADDED
@@ -0,0 +1,98 @@
+ import requests
+ from bs4 import BeautifulSoup
+ import pandas as pd
+ from typing import List, Dict
+ import logging
+ from urllib.parse import urljoin
+ import time
+
+ # Set up logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ class CourseScraper:
+     def __init__(self):
+         self.base_url = "https://courses.analyticsvidhya.com/pages/all-free-courses"
+         self.headers = {
+             "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
+         }
+
+     def get_course_links(self) -> List[str]:
+         try:
+             logger.info(f"Fetching course links from {self.base_url}")
+             response = requests.get(self.base_url, headers=self.headers)
+             response.raise_for_status()
+
+             soup = BeautifulSoup(response.content, 'html.parser')
+             course_links = []
+
+             for a_tag in soup.find_all('a', class_='course-card'):
+                 href = a_tag.get('href')
+                 if href:
+                     full_url = urljoin(self.base_url, href)
+                     course_links.append(full_url)
+                     logger.debug(f"Found course link: {full_url}")
+
+             logger.info(f"Found {len(course_links)} course links")
+             return course_links
+
+         except requests.RequestException as e:
+             logger.error(f"Error fetching course links: {str(e)}")
+             return []
+
+     def extract_course_info(self, url: str) -> Dict:
+         try:
+             logger.info(f"Extracting course info from {url}")
+             response = requests.get(url, headers=self.headers)
+             response.raise_for_status()
+
+             soup = BeautifulSoup(response.content, 'html.parser')
+
+             course_info = {
+                 'url': url,
+                 'title': '',
+                 'description': '',
+                 'curriculum': ''
+             }
+
+             # Extract title
+             title_elem = soup.find('h1', class_='section__heading')
+             if title_elem:
+                 course_info['title'] = title_elem.text.strip()
+
+             # Extract description
+             desc_elem = soup.find('div', class_='rich-text__container')
+             if desc_elem:
+                 course_info['description'] = desc_elem.text.strip()
+
+             # Extract curriculum
+             curr_elem = soup.find('div', class_='course-curriculum__container')
+             if curr_elem:
+                 course_info['curriculum'] = curr_elem.text.strip()
+
+             return course_info
+
+         except requests.RequestException as e:
+             logger.error(f"Error extracting course info from {url}: {str(e)}")
+             return None
+
+     def scrape_all_courses(self) -> pd.DataFrame:
+         all_courses = []
+         course_links = self.get_course_links()
+
+         for link in course_links:
+             try:
+                 course_info = self.extract_course_info(link)
+                 if course_info:
+                     all_courses.append(course_info)
+                 # Add a small delay to be respectful to the server
+                 time.sleep(1)
+             except Exception as e:
+                 logger.error(f"Error processing {link}: {str(e)}")
+                 continue
+
+         df = pd.DataFrame(all_courses)
+         logger.info(f"Successfully scraped {len(df)} courses")
+         return df
course_search/search_system/__init__.py ADDED
File without changes
course_search/search_system/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (193 Bytes).
 
course_search/search_system/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (167 Bytes).
 
course_search/search_system/__pycache__/data_pipeline.cpython-311.pyc ADDED
Binary file (2.98 kB).
 
course_search/search_system/__pycache__/data_pipeline.cpython-312.pyc ADDED
Binary file (2.6 kB).
 
course_search/search_system/__pycache__/embeddings.cpython-311.pyc ADDED
Binary file (7.76 kB).
 
course_search/search_system/__pycache__/embeddings.cpython-312.pyc ADDED
Binary file (2.65 kB).
 
course_search/search_system/__pycache__/rag_system.cpython-311.pyc ADDED
Binary file (4.95 kB).
 
course_search/search_system/__pycache__/vector_store.cpython-311.pyc ADDED
Binary file (4.75 kB).
 
course_search/search_system/__pycache__/vector_store.cpython-312.pyc ADDED
Binary file (4.05 kB).
 
course_search/search_system/data_pipeline.py ADDED
@@ -0,0 +1,47 @@
+ import pandas as pd
+ from typing import Optional
+ from course_search.scraper.course_scraper import CourseScraper
+ from course_search.search_system.embeddings import EmbeddingGenerator
+ from course_search.search_system.vector_store import FAISSManager
+ import logging
+
+ logger = logging.getLogger(__name__)
+
+ class DataPipeline:
+     def __init__(self):
+         self.scraper = CourseScraper()
+         self.embedding_generator = EmbeddingGenerator()
+         self.vector_store = FAISSManager()
+
+     def run_pipeline(self, save_path: Optional[str] = None) -> pd.DataFrame:
+         """
+         Run the complete data pipeline: scraping, embedding generation, and vector storage
+         """
+         try:
+             # Step 1: Scrape courses
+             logger.info("Starting course scraping...")
+             df = self.scraper.scrape_all_courses()
+             logger.info(f"Scraped {len(df)} courses successfully")
+
+             # Step 2: Generate embeddings
+             logger.info("Generating embeddings...")
+             df = self.embedding_generator.add_embeddings_to_df(
+                 df,
+                 text_column='description'
+             )
+             logger.info("Embeddings generated successfully")
+
+             # Step 3: Upload to FAISS
+             logger.info("Uploading to FAISS...")
+             self.vector_store.upsert_courses(df)
+
+             # Step 4: Save data if path provided
+             if save_path:
+                 logger.info(f"Saving data to {save_path}")
+                 df.to_pickle(save_path)
+
+             return df
+
+         except Exception as e:
+             logger.error(f"Error in pipeline: {str(e)}")
+             raise
course_search/search_system/embeddings.py ADDED
@@ -0,0 +1,123 @@
+ import numpy as np
+ import pandas as pd
+ from typing import List
+ import logging
+ from sentence_transformers import SentenceTransformer
+ import os
+ from pathlib import Path
+ import pickle
+
+ os.environ['TOKENIZERS_PARALLELISM'] = 'false'  # To avoid warnings
+
+ logger = logging.getLogger(__name__)
+
+ class EmbeddingGenerator:
+     def __init__(self, model_name: str = 'all-MiniLM-L6-v2', cache_dir: str = None):
+         try:
+             self.model_name = model_name
+             self.model = SentenceTransformer(model_name)
+
+             # Setup cache directory
+             self.cache_dir = Path(cache_dir) if cache_dir else Path('data/embedding_cache')
+             self.cache_dir.mkdir(parents=True, exist_ok=True)
+
+             # Cache file for embeddings
+             self.cache_file = self.cache_dir / f"embeddings_cache_{model_name.replace('/', '_')}.pkl"
+
+             # Load existing cache if available
+             self.embedding_cache = self._load_cache()
+
+             logger.info(f"Successfully loaded model: {model_name}")
+         except Exception as e:
+             logger.error(f"Error loading model: {str(e)}")
+             raise
+
+     def _load_cache(self) -> dict:
+         """Load embedding cache from file if it exists"""
+         try:
+             if self.cache_file.exists():
+                 with open(self.cache_file, 'rb') as f:
+                     cache = pickle.load(f)
+                 logger.info(f"Loaded {len(cache)} cached embeddings")
+                 return cache
+             return {}
+         except Exception as e:
+             logger.warning(f"Error loading cache, starting fresh: {str(e)}")
+             return {}
+
+     def _save_cache(self):
+         """Save embedding cache to file"""
+         try:
+             with open(self.cache_file, 'wb') as f:
+                 pickle.dump(self.embedding_cache, f)
+             logger.info(f"Saved {len(self.embedding_cache)} embeddings to cache")
+         except Exception as e:
+             logger.error(f"Error saving cache: {str(e)}")
+
+     def generate_embeddings(self, texts: pd.Series) -> np.ndarray:
+         try:
+             # Convert texts to list
+             text_list = texts.tolist()
+
+             # Initialize array to store embeddings
+             all_embeddings = []
+             texts_to_embed = []
+             indices_to_embed = []
+
+             # Check cache for existing embeddings
+             for i, text in enumerate(text_list):
+                 text_hash = hash(text)
+                 if text_hash in self.embedding_cache:
+                     all_embeddings.append(self.embedding_cache[text_hash])
+                 else:
+                     texts_to_embed.append(text)
+                     indices_to_embed.append(i)
+
+             # Generate embeddings only for new texts
+             if texts_to_embed:
+                 logger.info(f"Generating embeddings for {len(texts_to_embed)} new texts")
+                 new_embeddings = self.model.encode(
+                     texts_to_embed,
+                     show_progress_bar=True,
+                     convert_to_numpy=True
+                 )
+
+                 # Cache new embeddings
+                 for text, embedding in zip(texts_to_embed, new_embeddings):
+                     text_hash = hash(text)
+                     self.embedding_cache[text_hash] = embedding
+
+                 # Save updated cache
+                 self._save_cache()
+
+                 # Insert new embeddings in correct positions
+                 for idx, embedding in zip(indices_to_embed, new_embeddings):
+                     all_embeddings.insert(idx, embedding)
+             else:
+                 logger.info("All embeddings found in cache")
+
+             return np.array(all_embeddings)
+
+         except Exception as e:
+             logger.error(f"Error generating embeddings: {str(e)}")
+             raise
+
+     def add_embeddings_to_df(self, df: pd.DataFrame, text_column: str = 'description') -> pd.DataFrame:
+         try:
+             embeddings = self.generate_embeddings(df[text_column])
+             df['embeddings'] = list(embeddings)
+             return df
+         except Exception as e:
+             logger.error(f"Error adding embeddings to DataFrame: {str(e)}")
+             raise
+
+     def clear_cache(self):
+         """Clear the embedding cache"""
+         try:
+             self.embedding_cache = {}
+             if self.cache_file.exists():
+                 self.cache_file.unlink()
+             logger.info("Embedding cache cleared")
+         except Exception as e:
+             logger.error(f"Error clearing cache: {str(e)}")
+             raise
course_search/search_system/rag_system.py ADDED
@@ -0,0 +1,110 @@
+ from langchain_community.document_loaders import DataFrameLoader
+ from langchain_community.embeddings import HuggingFaceEmbeddings
+ from langchain_community.vectorstores import FAISS
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain.chains import RetrievalQA
+ from langchain_community.llms import HuggingFaceHub
+ import pandas as pd
+ import logging
+ from typing import List, Dict
+ import os
+ from dotenv import load_dotenv
+
+ logger = logging.getLogger(__name__)
+
+ class RAGSystem:
+     def __init__(self):
+         """Initialize the RAG system with LangChain components"""
+         load_dotenv()
+
+         # Initialize embedding model
+         self.embeddings = HuggingFaceEmbeddings(
+             model_name="sentence-transformers/all-MiniLM-L6-v2"
+         )
+
+         # Initialize text splitter for chunking
+         self.text_splitter = RecursiveCharacterTextSplitter(
+             chunk_size=500,
+             chunk_overlap=50
+         )
+
+         self.vector_store = None
+         self.qa_chain = None
+
+     def load_and_process_data(self, df: pd.DataFrame) -> None:
+         """
+         Load course data and create vector store
+         """
+         try:
+             # Prepare documents from DataFrame
+             loader = DataFrameLoader(
+                 data_frame=df,
+                 page_content_column="description"
+             )
+             documents = loader.load()
+             for doc, row in zip(documents, df.itertuples()):
+                 doc.metadata = {
+                     "title": row.title,
+                     "url": row.url,
+                     # Add other metadata fields as needed
+                 }
+
+             # Split documents into chunks
+             splits = self.text_splitter.split_documents(documents)
+
+             # Create vector store
+             self.vector_store = FAISS.from_documents(
+                 splits,
+                 self.embeddings
+             )
+
+             # Initialize QA chain
+             llm = HuggingFaceHub(
+                 repo_id="google/flan-t5-small",
+                 huggingfacehub_api_token=os.getenv('HUGGINGFACE_API_TOKEN')
+             )
+
+             self.qa_chain = RetrievalQA.from_chain_type(
+                 llm=llm,
+                 chain_type="stuff",
+                 retriever=self.vector_store.as_retriever()
+             )
+
+             logger.info("RAG system initialized successfully")
+
+         except Exception as e:
+             logger.error(f"Error initializing RAG system: {str(e)}")
+             raise
+
+     def search_courses(self, query: str, top_k: int = 5) -> Dict:
+         """
+         Search for courses using RAG; returns a dict with 'results' and 'answer'
+         """
+         try:
+             if not self.vector_store:
+                 raise ValueError("Vector store not initialized. Please load data first.")
+
+             # Get relevant documents
+             docs = self.vector_store.similarity_search(query, k=top_k)
+
+             # Generate answer using QA chain
+             answer = self.qa_chain.run(query)
+
+             # Format results
+             results = []
+             for doc in docs:
+                 results.append({
+                     'title': doc.metadata['title'],
+                     'description': doc.page_content,
+                     'url': doc.metadata['url'],
+                     'score': doc.metadata.get('score', 1.0)
+                 })
+
+             return {
+                 'results': results,
+                 'answer': answer
+             }
+
+         except Exception as e:
+             logger.error(f"Error in course search: {str(e)}")
+             raise
course_search/search_system/vector_store.py ADDED
@@ -0,0 +1,65 @@
+ import numpy as np
+ import pandas as pd
+ from typing import List, Dict
+ import faiss
+ import logging
+
+ logger = logging.getLogger(__name__)
+
+ class FAISSManager:
+     def __init__(self, dimension: int = 384):
+         """
+         Initialize FAISS index with error handling
+         """
+         try:
+             self.dimension = dimension
+             self.index = faiss.IndexFlatL2(dimension)
+             self.metadata = []
+             logger.info("FAISS index initialized successfully")
+         except Exception as e:
+             logger.error(f"Error initializing FAISS: {str(e)}")
+             raise RuntimeError(f"Failed to initialize FAISS: {str(e)}")
+
+     def upsert_courses(self, df: pd.DataFrame) -> None:
+         """
+         Add course embeddings to the FAISS index with error handling
+         """
+         try:
+             # Convert embeddings to numpy array
+             vectors = np.vstack([
+                 emb.astype('float32') for emb in df['embeddings'].values
+             ])
+
+             # Add vectors to index
+             self.index.add(vectors)
+             self.metadata.extend(df[['title', 'description', 'url']].to_dict('records'))
+             logger.info(f"Added {len(vectors)} vectors to FAISS index")
+
+         except Exception as e:
+             logger.error(f"Error adding vectors to FAISS: {str(e)}")
+             raise RuntimeError(f"Failed to add vectors to FAISS: {str(e)}")
+
+     def search_courses(self, query_embedding: np.ndarray, top_k: int = 5) -> List[Dict]:
+         """
+         Search for similar courses using query embedding with error handling
+         """
+         try:
+             # Ensure query embedding is in correct format
+             query_embedding = query_embedding.astype('float32').reshape(1, -1)
+
+             # Perform search
+             distances, indices = self.index.search(query_embedding, top_k)
+
+             results = []
+             for i, idx in enumerate(indices[0]):
+                 if idx == -1:
+                     continue
+                 result = self.metadata[idx].copy()
+                 result['score'] = float(distances[0][i])  # Convert to float for JSON serialization
+                 results.append(result)
+
+             return results
+
+         except Exception as e:
+             logger.error(f"Error searching FAISS index: {str(e)}")
+             raise RuntimeError(f"Failed to search FAISS index: {str(e)}")
data/courses_with_embeddings.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c55c9ffb5774c19e7443e3f1c5b8a8c0add246c1516f05f80d29aa0433972f61
+ size 139388
data/embedding_cache/embeddings_cache_all-MiniLM-L6-v2.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5444508e05b63d662919fc358bfed9dea64094858d01a6df9b291a56525ea13d
+ size 123513
instructions.txt ADDED
@@ -0,0 +1,49 @@
+ This assignment involves building a **smart search tool** for Analytics Vidhya's free courses. Here's a breakdown of how you can approach it:
+
+ ### Steps to Complete the Assignment
+
+ #### 1. **Data Collection**
+ - **Explore the Free Courses**: Visit the [Analytics Vidhya Free Courses](https://courses.analyticsvidhya.com/collections/courses) and collect course-related data such as:
+   - Course Titles
+   - Descriptions
+   - Curriculum
+ - **Tools for Data Collection**: Use Python libraries like `BeautifulSoup` (for web scraping) or APIs (if available) to extract the data.
+
+ #### 2. **Smart Search System**
+ - **Frameworks**: Use **LangChain (0.3.x)** or **LlamaIndex (0.12.x)** for implementing a Retrieval-Augmented Generation (RAG) system.
+ - **Steps**:
+   1. **Generate Embeddings**:
+      - Choose a pre-trained embedding model like OpenAI's `text-embedding-ada-002` or Sentence Transformers.
+      - Convert the collected course data into embeddings.
+   2. **Vector Database**:
+      - Use a vector database such as Pinecone, Weaviate, or FAISS to store and query embeddings.
+   3. **Search Implementation**:
+      - Implement a natural language search interface where user queries are matched against the stored embeddings.
+      - Return the most relevant courses.
+
+ #### 3. **Deployment on Huggingface Spaces**
+ - **Frameworks**: Use `Gradio` or `Streamlit` for creating an interactive UI.
+ - **Deployment Steps**:
+   1. Set up a Huggingface Spaces account.
+   2. Integrate your search tool into the interface.
+   3. Deploy the app and obtain a public URL.
+
+ #### 4. **Document Your Approach**
+ - Write a report explaining:
+   - Data collection process.
+   - Embedding model and vector database choices.
+   - System architecture.
+   - Challenges faced and how you addressed them.
+
+ ---
+
+ ### Required Skills and Tools
+ - **Web Scraping**: `requests`, `BeautifulSoup`
+ - **Embedding Models**: `sentence-transformers`, OpenAI API
+ - **Vector Database**: Pinecone, Weaviate, or FAISS
+ - **Deployment**: `Gradio`, `Streamlit`, Huggingface Spaces
+ - **Programming Language**: Python
+
+ ---
+
+ If you need guidance on specific steps, feel free to ask!
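
Editor's note: the search step outlined in instructions.txt maps directly onto the LangChain components this commit uses in `rag_system.py`. Below is a minimal, hedged sketch (not part of the commit) of matching a natural-language query against stored embeddings with the `HuggingFaceEmbeddings` wrapper and the `FAISS` vector store; the course texts, URLs, and query are placeholders.

```python
# Illustrative sketch of the LangChain + FAISS retrieval step (not part of the commit).
# Course texts, URLs, and the query are placeholders.
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

vector_store = FAISS.from_texts(
    texts=[
        "Getting Started with Large Language Models",
        "Building LLM Applications using Prompt Engineering",
    ],
    embedding=embeddings,
    metadatas=[
        {"url": "https://example.com/llm-basics"},
        {"url": "https://example.com/prompt-engineering"},
    ],
)

# Natural-language query matched against the stored embeddings
for doc in vector_store.similarity_search("beginner LLM course", k=2):
    print(doc.page_content, doc.metadata["url"])
```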
requirements.txt ADDED
File without changes
tests/__init__.py ADDED
File without changes
tests/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (159 Bytes).
 
tests/__pycache__/test_complete_pipeline.cpython-312.pyc ADDED
Binary file (2.43 kB).
 
tests/test_complete_pipeline.py ADDED
@@ -0,0 +1,47 @@
+ import logging
+ from course_search.search_system.embeddings import EmbeddingGenerator
+ from course_search.search_system.vector_store import FAISSManager
+ import pandas as pd
+ import numpy as np
+
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ def test_pipeline():
+     try:
+         # Test data
+         test_data = pd.DataFrame({
+             'title': ['Test Course 1', 'Test Course 2'],
+             'description': ['This is a test course about Python', 'This is a test course about ML'],
+             'url': ['http://test1.com', 'http://test2.com']
+         })
+
+         # Test embedding generation
+         logger.info("Testing embedding generation...")
+         embedding_gen = EmbeddingGenerator()
+         test_data = embedding_gen.add_embeddings_to_df(test_data)
+         assert 'embeddings' in test_data.columns
+         logger.info("Embedding generation successful!")
+
+         # Test FAISS storage
+         logger.info("Testing FAISS storage...")
+         vector_store = FAISSManager()
+         vector_store.upsert_courses(test_data)
+         logger.info("FAISS storage successful!")
+
+         # Test search
+         logger.info("Testing search...")
+         query = "Python programming"
+         query_embedding = embedding_gen.generate_embeddings(pd.Series([query]))[0]
+         results = vector_store.search_courses(query_embedding)
+         assert len(results) > 0
+         logger.info("Search successful!")
+
+         return True
+
+     except Exception as e:
+         logger.error(f"Pipeline test failed: {str(e)}")
+         return False
+
+ if __name__ == "__main__":
+     test_pipeline()
tests/test_integration.py ADDED
@@ -0,0 +1,19 @@
+ from course_search.search_system.data_pipeline import DataPipeline
+ import logging
+
+ def test_pipeline():
+     pipeline = DataPipeline()
+
+     # Run pipeline and save results
+     df = pipeline.run_pipeline(save_path='data/courses_with_embeddings.pkl')
+
+     # Print results
+     print("\nDataFrame Info:")
+     print(df.info())
+     print("\nSample Course:")
+     print(df.iloc[0][['title', 'description']].to_dict())
+     print("\nSample Embedding Shape:", df.iloc[0]['embeddings'].shape)
+
+ if __name__ == "__main__":
+     logging.basicConfig(level=logging.INFO)
+     test_pipeline()
tests/test_scraper.py ADDED
@@ -0,0 +1,33 @@
+ import logging
+ from course_search.scraper.course_scraper import CourseScraper
+
+ def test_scraper():
+     scraper = CourseScraper()
+
+     # Test 1: Get course links
+     print("\nTesting get_course_links()...")
+     links = scraper.get_course_links()
+     print(f"Found {len(links)} links")
+     print("Sample links:")
+     for link in links[:3]:
+         print(f"- {link}")
+
+     # Test 2: Extract course info
+     if links:
+         print("\nTesting extract_course_info()...")
+         sample_course = scraper.extract_course_info(links[0])
+         print("Sample course info:")
+         for key, value in sample_course.items():
+             print(f"{key}: {value[:100]}...")
+
+     # Test 3: Scrape all courses
+     print("\nTesting scrape_all_courses()...")
+     df = scraper.scrape_all_courses()
+     print("\nDataFrame Info:")
+     print(df.info())
+     print("\nFirst few rows:")
+     print(df.head())
+
+ if __name__ == "__main__":
+     logging.basicConfig(level=logging.INFO)
+     test_scraper()
tests/test_vector_store.py ADDED
@@ -0,0 +1,49 @@
+ import numpy as np
+ import faiss
+ import logging
+ import torch
+
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ def test_faiss():
+     try:
+         # Create a small test index
+         dimension = 64
+         nb = 100
+
+         # Generate random data
+         xb = np.random.random((nb, dimension)).astype('float32')
+
+         # Create index
+         index = faiss.IndexFlatL2(dimension)
+
+         # Add vectors
+         index.add(xb)
+
+         # Test search
+         k = 5
+         xq = np.random.random((1, dimension)).astype('float32')
+         D, I = index.search(xq, k)
+
+         logger.info("FAISS test successful!")
+         logger.info(f"Found {k} nearest neighbors")
+
+         return True
+
+     except Exception as e:
+         logger.error(f"FAISS test failed: {str(e)}")
+         return False
+
+ def test_torch():
+     try:
+         x = torch.rand(5, 3)
+         print("PyTorch is working correctly. Tensor:", x)
+     except Exception as e:
+         print("Error with PyTorch:", e)
+
+ if __name__ == "__main__":
+     test_faiss()
+     test_torch()