Spaces · Sleeping
Rohil Bansal committed · Commit 2ed2129
1 Parent(s): cbb52fe
- .gitignore +3 -0
- .gradio/certificate.pem +31 -0
- .gradio/flagged/dataset1.csv +43 -0
- README.md +2 -2
- course_search.txt +27 -0
- course_search/__init__.py +0 -0
- course_search/__pycache__/__init__.cpython-311.pyc +0 -0
- course_search/__pycache__/__init__.cpython-312.pyc +0 -0
- course_search/app/__init__.py +0 -0
- course_search/app/__pycache__/__init__.cpython-312.pyc +0 -0
- course_search/app/__pycache__/run.cpython-312.pyc +0 -0
- course_search/app/gradio_app.py +115 -0
- course_search/app/main.py +26 -0
- course_search/app/run.py +65 -0
- course_search/scraper/__init__.py +0 -0
- course_search/scraper/__pycache__/__init__.cpython-311.pyc +0 -0
- course_search/scraper/__pycache__/__init__.cpython-312.pyc +0 -0
- course_search/scraper/__pycache__/course_scraper.cpython-311.pyc +0 -0
- course_search/scraper/__pycache__/course_scraper.cpython-312.pyc +0 -0
- course_search/scraper/course_scraper.py +98 -0
- course_search/search_system/__init__.py +0 -0
- course_search/search_system/__pycache__/__init__.cpython-311.pyc +0 -0
- course_search/search_system/__pycache__/__init__.cpython-312.pyc +0 -0
- course_search/search_system/__pycache__/data_pipeline.cpython-311.pyc +0 -0
- course_search/search_system/__pycache__/data_pipeline.cpython-312.pyc +0 -0
- course_search/search_system/__pycache__/embeddings.cpython-311.pyc +0 -0
- course_search/search_system/__pycache__/embeddings.cpython-312.pyc +0 -0
- course_search/search_system/__pycache__/rag_system.cpython-311.pyc +0 -0
- course_search/search_system/__pycache__/vector_store.cpython-311.pyc +0 -0
- course_search/search_system/__pycache__/vector_store.cpython-312.pyc +0 -0
- course_search/search_system/data_pipeline.py +47 -0
- course_search/search_system/embeddings.py +123 -0
- course_search/search_system/rag_system.py +110 -0
- course_search/search_system/vector_store.py +65 -0
- data/courses_with_embeddings.pkl +3 -0
- data/embedding_cache/embeddings_cache_all-MiniLM-L6-v2.pkl +3 -0
- instructions.txt +49 -0
- requirements.txt +0 -0
- tests/__init__.py +0 -0
- tests/__pycache__/__init__.cpython-312.pyc +0 -0
- tests/__pycache__/test_complete_pipeline.cpython-312.pyc +0 -0
- tests/test_complete_pipeline.py +47 -0
- tests/test_integration.py +19 -0
- tests/test_scraper.py +33 -0
- tests/test_vector_store.py +49 -0
.gitignore
ADDED
@@ -0,0 +1,3 @@
.env


.gradio/certificate.pem
ADDED
@@ -0,0 +1,31 @@
-----BEGIN CERTIFICATE-----
MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
-----END CERTIFICATE-----

.gradio/flagged/dataset1.csv
ADDED
@@ -0,0 +1,43 @@
What would you like to learn?,Number of Results,output,timestamp
"LLM
",5,"# Search Results

### 1. Getting Started with Large Language Models
**Description:**
Getting Started With LLMs

**Course Link:** [View Course](https://courses.analyticsvidhya.com/courses/getting-started-with-llms)

---

### 2. Framework to Choose the Right LLM for your Business
**Description:**
This course will guide you through the process of selecting the most suitable Large Language Model (LLM) for various business needs. By examining factors such as accuracy, cost, scalability, and integration, you will understand how different LLMs perform in specific scenarios, from customer support to healthcare and strategy development. The course emphasizes practical decision-making with real-world case studies, helping businesses navigate the rapidly evolving LLM landscape effectively.

**Course Link:** [View Course](https://courses.analyticsvidhya.com/courses/choosing-the-right-LLM-for-your-business)

---

### 3. Building LLM Applications using Prompt Engineering - Free Course
**Description:**
Professionals: Individuals looking to deepen their knowledge and apply advanced LLM and prompt engineering techniques to solve complex problems across various domains.
Aspiring Students: Individuals looking to deepen their knowledge and apply advanced LLM and prompt engineering techniques to solve complex problems across various domains.

**Course Link:** [View Course](https://courses.analyticsvidhya.com/courses/building-llm-applications-using-prompt-engineering-free)

---

### 4. Getting Started with Large Language Models
**Description:**
Who Should Enroll: Professionals: Individuals looking to expand their skill set and leverage LLMs across different industries. Aspiring Students: For those setting out on their journey to master language data analysis and leave a mark in the tech world.

**Course Link:** [View Course](https://courses.analyticsvidhya.com/courses/getting-started-with-llms)

---

### 5. Getting Started with Large Language Models
**Description:**
This course will help you gain a comprehensive understanding of Large Language Models (LLMs) and develop advanced natural language processing (NLP) applications using the PyTorch framework. With a carefully curated list of resources and exercises, this course is your guide to becoming an expert in LLMs. Master the techniques to build and fine-tune LLMs, and generate human-like text.

**Course Link:** [View Course](https://courses.analyticsvidhya.com/courses/getting-started-with-llms)

README.md
CHANGED
@@ -1,11 +1,11 @@
 ---
-title: Course Search
+title: Course Search Analytics Vidya
 emoji: 🚀
 colorFrom: purple
 colorTo: pink
 sdk: gradio
 sdk_version: 5.8.0
-app_file: app.py
+app_file: course_search/app/run.py
 pinned: false
 license: other
 short_description: Course search assignment for Analytics Vidhya

course_search.txt
ADDED
@@ -0,0 +1,27 @@
Course Search System: My Implementation Journey

Data Gathering
I started by collecting course data from various sources like university websites and online platforms. I used web scraping techniques to extract information like course titles, descriptions, prerequisites, and more. I also integrated APIs from some institutions to directly access their course catalogs. Once I had the data, I cleaned and standardized it to ensure consistency.

Choosing the Right Tools
For processing the text data, I selected a powerful language model like BERT to understand the semantic meaning of course descriptions. This model converted the text into numerical representations (vectors) that computers could easily process.

To efficiently store and search these vectors, I used a vector database called FAISS. It's designed for handling large datasets of vectors and quickly finding the most similar ones.

Building the System
I designed my system to be flexible and scalable. Here's a breakdown of its key components:

Data Ingestion: This part collects and prepares the course data.
Embedding: The language model processes the course descriptions and creates vectors.
Vector Database: This stores the vectors for efficient searching.
Search API: This allows users to query the system and get relevant results.
User Interface: This is the front-end where users can interact with the system.
I deployed the system on a cloud platform to ensure it's reliable and can handle increasing user demand. Each component runs in its own container, making it easy to manage and update.

Challenges and Solutions

Data Quality: Ensuring data consistency and accuracy was a big challenge. I addressed this by carefully cleaning and standardizing the data.
Model Performance: Choosing the right language model was crucial. I experimented with different models and fine-tuned them to get the best results.
Scalability: Handling a large number of courses required a scalable vector database. FAISS was a great choice for this, and I configured it to handle the load.
User Experience: I focused on making the system user-friendly. I conducted user tests and made improvements to the interface and search algorithm.
Overall, this project was a great learning experience. I'm proud of what I've accomplished and excited to see how it can help students find the right courses.

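A minimal sketch of the embed-and-search flow described in course_search.txt, assuming the sentence-transformers all-MiniLM-L6-v2 model and flat L2 FAISS index used elsewhere in this commit; the sample texts and query are illustrative placeholders, not real catalog data:

# Sketch: encode course descriptions with a sentence-transformer and
# retrieve the nearest one for a query via a flat L2 FAISS index.
# The texts below are placeholders, not real catalog entries.
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

descriptions = [
    "Getting Started with Large Language Models",
    "Building LLM Applications using Prompt Engineering",
]

model = SentenceTransformer("all-MiniLM-L6-v2")
vectors = model.encode(descriptions, convert_to_numpy=True).astype("float32")

index = faiss.IndexFlatL2(vectors.shape[1])  # 384-dim for this model
index.add(vectors)

query = model.encode(["prompt engineering basics"], convert_to_numpy=True).astype("float32")
distances, ids = index.search(query, 1)
print(descriptions[ids[0][0]], float(distances[0][0]))

A flat L2 index gives exact nearest-neighbour results at the cost of a linear scan per query, which is more than adequate at the scale of a free-course catalog.
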
course_search/__init__.py
ADDED
File without changes

course_search/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (179 Bytes)

course_search/__pycache__/__init__.cpython-312.pyc
ADDED
Binary file (167 Bytes)

course_search/app/__init__.py
ADDED
File without changes

course_search/app/__pycache__/__init__.cpython-312.pyc
ADDED
Binary file (171 Bytes)

course_search/app/__pycache__/run.cpython-312.pyc
ADDED
Binary file (2.63 kB)

course_search/app/gradio_app.py
ADDED
@@ -0,0 +1,115 @@
import gradio as gr
import pandas as pd
from pathlib import Path
import logging
from course_search.search_system.rag_system import RAGSystem

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class CourseSearchApp:
    def __init__(self):
        """Initialize the search application"""
        try:
            self.load_components()
        except Exception as e:
            logger.error(f"Initialization error: {str(e)}")
            raise

    def load_components(self):
        """Initialize RAG system and load data"""
        try:
            # Construct path to data file
            data_path = Path(__file__).parent.parent.parent / 'data' / 'courses_with_embeddings.pkl'

            if not data_path.exists():
                raise FileNotFoundError(f"Data file not found at: {data_path}")

            # Load saved course data
            df = pd.read_pickle(str(data_path))
            logger.info(f"Loaded {len(df)} courses from {data_path}")

            # Initialize RAG system
            self.rag_system = RAGSystem()
            self.rag_system.load_and_process_data(df)
            logger.info("Components loaded successfully")

        except Exception as e:
            logger.error(f"Error loading components: {str(e)}")
            raise

    def search_courses(self, query: str, num_results: int) -> str:
        """
        Search for courses and format results for Gradio
        """
        try:
            results = self.rag_system.search_courses(query, top_k=num_results)

            # Format results for display
            markdown_output = "# Search Results\n\n"
            for i, result in enumerate(results['results'], 1):
                markdown_output += f"### {i}. {result['title']}\n"
                markdown_output += f"**Description:**\n{result['description']}\n\n"
                markdown_output += f"**Course Link:** [View Course]({result['url']})\n\n"
                markdown_output += "---\n\n"

            return markdown_output

        except Exception as e:
            error_msg = f"Error during search: {str(e)}"
            logger.error(error_msg)
            return f"# Error\n\n{error_msg}"

def create_gradio_interface():
    """Create and configure Gradio interface"""
    try:
        app = CourseSearchApp()

        # Define the interface
        iface = gr.Interface(
            fn=app.search_courses,
            inputs=[
                gr.Textbox(
                    label="What would you like to learn?",
                    placeholder="e.g., machine learning for beginners, data visualization, python basics",
                    lines=2
                ),
                gr.Slider(
                    minimum=1,
                    maximum=10,
                    value=5,
                    step=1,
                    label="Number of Results"
                )
            ],
            outputs=gr.Markdown(),
            title="Analytics Vidhya Course Search",
            description="""
            Search through Analytics Vidhya's free courses using natural language!
            Get personalized course recommendations and AI-generated responses to your queries.
            """,
            theme=gr.themes.Soft()
        )

        return iface

    except Exception as e:
        logger.error(f"Error creating Gradio interface: {str(e)}")
        raise

def main():
    """Main function to run the Gradio app"""
    try:
        iface = create_gradio_interface()
        iface.launch(
            server_name="0.0.0.0",
            server_port=7860,
            share=True
        )
    except Exception as e:
        logger.error(f"Error launching Gradio app: {str(e)}")
        raise

if __name__ == "__main__":
    main()

course_search/app/main.py
ADDED
@@ -0,0 +1,26 @@
import sys
import os
# Add the project root directory to Python path
PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.append(PROJECT_ROOT)

from course_search.search_system.data_pipeline import DataPipeline
import logging
import argparse

def main():
    parser = argparse.ArgumentParser(description='Run the course scraping and embedding pipeline')
    parser.add_argument('--output', type=str, default='data/courses_with_embeddings.pkl',
                        help='Path to save the processed data')
    args = parser.parse_args()

    # Initialize and run pipeline
    pipeline = DataPipeline()
    df = pipeline.run_pipeline(save_path=args.output)

    print(f"\nProcessed {len(df)} courses")
    print(f"Data saved to {args.output}")

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    main()

course_search/app/run.py
ADDED
@@ -0,0 +1,65 @@
import sys
import os
from pathlib import Path
import subprocess
import logging

project_root = Path(__file__).parent.parent.parent

if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

from course_search.search_system.data_pipeline import DataPipeline

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def setup_paths():
    """Setup necessary paths and directories"""
    project_root = Path(__file__).parent.parent.parent

    if str(project_root) not in sys.path:
        sys.path.append(str(project_root))

    data_dir = project_root / 'data'
    data_dir.mkdir(exist_ok=True)

    return project_root, data_dir

def main():
    try:
        # Setup paths
        project_root, data_dir = setup_paths()

        # Run data pipeline
        logger.info("Running data pipeline...")
        pipeline = DataPipeline()
        pipeline.run_pipeline(save_path=str(data_dir / 'courses_with_embeddings.pkl'))

        # Run Gradio app
        logger.info("Starting Gradio app...")
        gradio_path = Path(__file__).parent / 'gradio_app.py'

        if not gradio_path.exists():
            raise FileNotFoundError(f"Gradio app not found at: {gradio_path}")

        # Change to project root directory before running
        os.chdir(str(project_root))

        # Run Gradio with proper Python path
        env = os.environ.copy()
        env['PYTHONPATH'] = str(project_root)

        subprocess.run(
            ['python', str(gradio_path)],
            env=env,
            check=True
        )

    except Exception as e:
        logger.error(f"Error running application: {str(e)}")
        raise

if __name__ == "__main__":
    main()

course_search/scraper/__init__.py
ADDED
File without changes

course_search/scraper/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (187 Bytes)

course_search/scraper/__pycache__/__init__.cpython-312.pyc
ADDED
Binary file (161 Bytes)

course_search/scraper/__pycache__/course_scraper.cpython-311.pyc
ADDED
Binary file (5.6 kB)

course_search/scraper/__pycache__/course_scraper.cpython-312.pyc
ADDED
Binary file (5.03 kB)

course_search/scraper/course_scraper.py
ADDED
@@ -0,0 +1,98 @@
import requests
from bs4 import BeautifulSoup
import pandas as pd
from typing import List, Dict
import logging
from urllib.parse import urljoin
import time

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class CourseScraper:
    def __init__(self):
        self.base_url = "https://courses.analyticsvidhya.com/pages/all-free-courses"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        }

    def get_course_links(self) -> List[str]:
        try:
            logger.info(f"Fetching course links from {self.base_url}")
            response = requests.get(self.base_url, headers=self.headers)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')
            course_links = []

            for a_tag in soup.find_all('a', class_='course-card'):
                href = a_tag.get('href')
                if href:
                    full_url = urljoin(self.base_url, href)
                    course_links.append(full_url)
                    logger.debug(f"Found course link: {full_url}")

            logger.info(f"Found {len(course_links)} course links")
            return course_links

        except requests.RequestException as e:
            logger.error(f"Error fetching course links: {str(e)}")
            return []


    def extract_course_info(self, url: str) -> Dict:
        try:
            logger.info(f"Extracting course info from {url}")
            response = requests.get(url, headers=self.headers)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')

            course_info = {
                'url': url,
                'title': '',
                'description': '',
                'curriculum': ''
            }

            # Extract title
            title_elem = soup.find('h1', class_='section__heading')
            if title_elem:
                course_info['title'] = title_elem.text.strip()

            # Extract description
            desc_elem = soup.find('div', class_='rich-text__container')
            if desc_elem:
                course_info['description'] = desc_elem.text.strip()

            # Extract curriculum
            curr_elem = soup.find('div', class_='course-curriculum__container')
            if curr_elem:
                course_info['curriculum'] = curr_elem.text.strip()

            return course_info

        except requests.RequestException as e:
            logger.error(f"Error extracting course info from {url}: {str(e)}")
            return None


    def scrape_all_courses(self) -> pd.DataFrame:
        all_courses = []
        course_links = self.get_course_links()

        for link in course_links:
            try:
                course_info = self.extract_course_info(link)
                if course_info:
                    all_courses.append(course_info)
                # Add a small delay to be respectful to the server
                time.sleep(1)
            except Exception as e:
                logger.error(f"Error processing {link}: {str(e)}")
                continue

        df = pd.DataFrame(all_courses)
        logger.info(f"Successfully scraped {len(df)} courses")
        return df

course_search/search_system/__init__.py
ADDED
File without changes

course_search/search_system/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (193 Bytes)

course_search/search_system/__pycache__/__init__.cpython-312.pyc
ADDED
Binary file (167 Bytes)

course_search/search_system/__pycache__/data_pipeline.cpython-311.pyc
ADDED
Binary file (2.98 kB)

course_search/search_system/__pycache__/data_pipeline.cpython-312.pyc
ADDED
Binary file (2.6 kB)

course_search/search_system/__pycache__/embeddings.cpython-311.pyc
ADDED
Binary file (7.76 kB)

course_search/search_system/__pycache__/embeddings.cpython-312.pyc
ADDED
Binary file (2.65 kB)

course_search/search_system/__pycache__/rag_system.cpython-311.pyc
ADDED
Binary file (4.95 kB)

course_search/search_system/__pycache__/vector_store.cpython-311.pyc
ADDED
Binary file (4.75 kB)

course_search/search_system/__pycache__/vector_store.cpython-312.pyc
ADDED
Binary file (4.05 kB)

course_search/search_system/data_pipeline.py
ADDED
@@ -0,0 +1,47 @@
import pandas as pd
from typing import Optional
from course_search.scraper.course_scraper import CourseScraper
from course_search.search_system.embeddings import EmbeddingGenerator
from course_search.search_system.vector_store import FAISSManager
import logging

logger = logging.getLogger(__name__)

class DataPipeline:
    def __init__(self):
        self.scraper = CourseScraper()
        self.embedding_generator = EmbeddingGenerator()
        self.vector_store = FAISSManager()

    def run_pipeline(self, save_path: Optional[str] = None) -> pd.DataFrame:
        """
        Run the complete data pipeline: scraping, embedding generation, and vector storage
        """
        try:
            # Step 1: Scrape courses
            logger.info("Starting course scraping...")
            df = self.scraper.scrape_all_courses()
            logger.info(f"Scraped {len(df)} courses successfully")

            # Step 2: Generate embeddings
            logger.info("Generating embeddings...")
            df = self.embedding_generator.add_embeddings_to_df(
                df,
                text_column='description'
            )
            logger.info("Embeddings generated successfully")

            # Step 3: Upload to FAISS
            logger.info("Uploading to FAISS...")
            self.vector_store.upsert_courses(df)

            # Step 4: Save data if path provided
            if save_path:
                logger.info(f"Saving data to {save_path}")
                df.to_pickle(save_path)

            return df

        except Exception as e:
            logger.error(f"Error in pipeline: {str(e)}")
            raise

course_search/search_system/embeddings.py
ADDED
@@ -0,0 +1,123 @@
import numpy as np
import pandas as pd
from typing import List
import logging
from sentence_transformers import SentenceTransformer
import os
from pathlib import Path
import pickle

os.environ['TOKENIZERS_PARALLELISM'] = 'false'  # To avoid warnings

logger = logging.getLogger(__name__)

class EmbeddingGenerator:
    def __init__(self, model_name: str = 'all-MiniLM-L6-v2', cache_dir: str = None):
        try:
            self.model_name = model_name
            self.model = SentenceTransformer(model_name)

            # Setup cache directory
            self.cache_dir = Path(cache_dir) if cache_dir else Path('data/embedding_cache')
            self.cache_dir.mkdir(parents=True, exist_ok=True)

            # Cache file for embeddings
            self.cache_file = self.cache_dir / f"embeddings_cache_{model_name.replace('/', '_')}.pkl"

            # Load existing cache if available
            self.embedding_cache = self._load_cache()

            logger.info(f"Successfully loaded model: {model_name}")
        except Exception as e:
            logger.error(f"Error loading model: {str(e)}")
            raise

    def _load_cache(self) -> dict:
        """Load embedding cache from file if it exists"""
        try:
            if self.cache_file.exists():
                with open(self.cache_file, 'rb') as f:
                    cache = pickle.load(f)
                logger.info(f"Loaded {len(cache)} cached embeddings")
                return cache
            return {}
        except Exception as e:
            logger.warning(f"Error loading cache, starting fresh: {str(e)}")
            return {}

    def _save_cache(self):
        """Save embedding cache to file"""
        try:
            with open(self.cache_file, 'wb') as f:
                pickle.dump(self.embedding_cache, f)
            logger.info(f"Saved {len(self.embedding_cache)} embeddings to cache")
        except Exception as e:
            logger.error(f"Error saving cache: {str(e)}")

    def generate_embeddings(self, texts: pd.Series) -> np.ndarray:
        try:
            # Convert texts to list
            text_list = texts.tolist()

            # Initialize array to store embeddings
            all_embeddings = []
            texts_to_embed = []
            indices_to_embed = []

            # Check cache for existing embeddings
            for i, text in enumerate(text_list):
                text_hash = hash(text)
                if text_hash in self.embedding_cache:
                    all_embeddings.append(self.embedding_cache[text_hash])
                else:
                    texts_to_embed.append(text)
                    indices_to_embed.append(i)

            # Generate embeddings only for new texts
            if texts_to_embed:
                logger.info(f"Generating embeddings for {len(texts_to_embed)} new texts")
                new_embeddings = self.model.encode(
                    texts_to_embed,
                    show_progress_bar=True,
                    convert_to_numpy=True
                )

                # Cache new embeddings
                for text, embedding in zip(texts_to_embed, new_embeddings):
                    text_hash = hash(text)
                    self.embedding_cache[text_hash] = embedding

                # Save updated cache
                self._save_cache()

                # Insert new embeddings in correct positions
                for idx, embedding in zip(indices_to_embed, new_embeddings):
                    all_embeddings.insert(idx, embedding)
            else:
                logger.info("All embeddings found in cache")

            return np.array(all_embeddings)

        except Exception as e:
            logger.error(f"Error generating embeddings: {str(e)}")
            raise

    def add_embeddings_to_df(self, df: pd.DataFrame, text_column: str = 'description') -> pd.DataFrame:
        try:
            embeddings = self.generate_embeddings(df[text_column])
            df['embeddings'] = list(embeddings)
            return df
        except Exception as e:
            logger.error(f"Error adding embeddings to DataFrame: {str(e)}")
            raise

    def clear_cache(self):
        """Clear the embedding cache"""
        try:
            self.embedding_cache = {}
            if self.cache_file.exists():
                self.cache_file.unlink()
            logger.info("Embedding cache cleared")
        except Exception as e:
            logger.error(f"Error clearing cache: {str(e)}")
            raise

course_search/search_system/rag_system.py
ADDED
@@ -0,0 +1,110 @@
from langchain_community.document_loaders import DataFrameLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain_community.llms import HuggingFaceHub
import pandas as pd
import logging
from typing import List, Dict
import os
from dotenv import load_dotenv

logger = logging.getLogger(__name__)

class RAGSystem:
    def __init__(self):
        """Initialize the RAG system with LangChain components"""
        load_dotenv()

        # Initialize embedding model
        self.embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2"
        )

        # Initialize text splitter for chunking
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=500,
            chunk_overlap=50
        )

        self.vector_store = None
        self.qa_chain = None

    def load_and_process_data(self, df: pd.DataFrame) -> None:
        """
        Load course data and create vector store
        """
        try:
            # Prepare documents from DataFrame
            loader = DataFrameLoader(
                data_frame=df,
                page_content_column="description"
            )
            documents = loader.load()
            for doc, row in zip(documents, df.itertuples()):
                doc.metadata = {
                    "title": row.title,
                    "url": row.url,
                    # Add other metadata fields as needed
                }

            # Split documents into chunks
            splits = self.text_splitter.split_documents(documents)

            # Create vector store
            self.vector_store = FAISS.from_documents(
                splits,
                self.embeddings
            )

            # Initialize QA chain
            llm = HuggingFaceHub(
                repo_id="google/flan-t5-small",
                huggingfacehub_api_token=os.getenv('HUGGINGFACE_API_TOKEN')
            )

            self.qa_chain = RetrievalQA.from_chain_type(
                llm=llm,
                chain_type="stuff",
                retriever=self.vector_store.as_retriever()
            )

            logger.info("RAG system initialized successfully")

        except Exception as e:
            logger.error(f"Error initializing RAG system: {str(e)}")
            raise

    def search_courses(self, query: str, top_k: int = 5) -> List[Dict]:
        """
        Search for courses using RAG
        """
        try:
            if not self.vector_store:
                raise ValueError("Vector store not initialized. Please load data first.")

            # Get relevant documents
            docs = self.vector_store.similarity_search(query, k=top_k)

            # Generate answer using QA chain
            answer = self.qa_chain.run(query)

            # Format results
            results = []
            for doc in docs:
                results.append({
                    'title': doc.metadata['title'],
                    'description': doc.page_content,
                    'url': doc.metadata['url'],
                    'score': doc.metadata.get('score', 1.0)
                })

            return {
                'results': results,
                'answer': answer
            }

        except Exception as e:
            logger.error(f"Error in course search: {str(e)}")
            raise

course_search/search_system/vector_store.py
ADDED
@@ -0,0 +1,65 @@
import numpy as np
import pandas as pd
from typing import List, Dict
import faiss
import logging

logger = logging.getLogger(__name__)

class FAISSManager:
    def __init__(self, dimension: int = 384):
        """
        Initialize FAISS index with error handling
        """
        try:
            self.dimension = dimension
            self.index = faiss.IndexFlatL2(dimension)
            self.metadata = []
            logger.info("FAISS index initialized successfully")
        except Exception as e:
            logger.error(f"Error initializing FAISS: {str(e)}")
            raise RuntimeError(f"Failed to initialize FAISS: {str(e)}")

    def upsert_courses(self, df: pd.DataFrame) -> None:
        """
        Add course embeddings to the FAISS index with error handling
        """
        try:
            # Convert embeddings to numpy array
            vectors = np.vstack([
                emb.astype('float32') for emb in df['embeddings'].values
            ])

            # Add vectors to index
            self.index.add(vectors)
            self.metadata.extend(df[['title', 'description', 'url']].to_dict('records'))
            logger.info(f"Added {len(vectors)} vectors to FAISS index")

        except Exception as e:
            logger.error(f"Error adding vectors to FAISS: {str(e)}")
            raise RuntimeError(f"Failed to add vectors to FAISS: {str(e)}")

    def search_courses(self, query_embedding: np.ndarray, top_k: int = 5) -> List[Dict]:
        """
        Search for similar courses using query embedding with error handling
        """
        try:
            # Ensure query embedding is in correct format
            query_embedding = query_embedding.astype('float32').reshape(1, -1)

            # Perform search
            distances, indices = self.index.search(query_embedding, top_k)

            results = []
            for i, idx in enumerate(indices[0]):
                if idx == -1:
                    continue
                result = self.metadata[idx].copy()
                result['score'] = float(distances[0][i])  # Convert to float for JSON serialization
                results.append(result)

            return results

        except Exception as e:
            logger.error(f"Error searching FAISS index: {str(e)}")
            raise RuntimeError(f"Failed to search FAISS index: {str(e)}")

data/courses_with_embeddings.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c55c9ffb5774c19e7443e3f1c5b8a8c0add246c1516f05f80d29aa0433972f61
size 139388

data/embedding_cache/embeddings_cache_all-MiniLM-L6-v2.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5444508e05b63d662919fc358bfed9dea64094858d01a6df9b291a56525ea13d
size 123513

instructions.txt
ADDED
@@ -0,0 +1,49 @@
This assignment involves building a **smart search tool** for Analytics Vidhya's free courses. Here's a breakdown of how you can approach it:

### Steps to Complete the Assignment

#### 1. **Data Collection**
- **Explore the Free Courses**: Visit the [Analytics Vidhya Free Courses](https://courses.analyticsvidhya.com/collections/courses) and collect course-related data such as:
  - Course Titles
  - Descriptions
  - Curriculum
- **Tools for Data Collection**: Use Python libraries like `BeautifulSoup` (for web scraping) or APIs (if available) to extract the data.

#### 2. **Smart Search System**
- **Frameworks**: Use **LangChain (0.3.x)** or **LlamaIndex (0.12.x)** for implementing a Retrieval-Augmented Generation (RAG) system.
- **Steps**:
  1. **Generate Embeddings**:
     - Choose a pre-trained embedding model like OpenAI's `text-embedding-ada-002` or Sentence Transformers.
     - Convert the collected course data into embeddings.
  2. **Vector Database**:
     - Use a vector database such as Pinecone, Weaviate, or FAISS to store and query embeddings.
  3. **Search Implementation**:
     - Implement a natural language search interface where user queries are matched against the stored embeddings.
     - Return the most relevant courses.

#### 3. **Deployment on Huggingface Spaces**
- **Frameworks**: Use `Gradio` or `Streamlit` for creating an interactive UI.
- **Deployment Steps**:
  1. Set up a Huggingface Spaces account.
  2. Integrate your search tool into the interface.
  3. Deploy the app and obtain a public URL.

#### 4. **Document Your Approach**
- Write a report explaining:
  - Data collection process.
  - Embedding model and vector database choices.
  - System architecture.
  - Challenges faced and how you addressed them.

---

### Required Skills and Tools
- **Web Scraping**: `requests`, `BeautifulSoup`
- **Embedding Models**: `sentence-transformers`, OpenAI API
- **Vector Database**: Pinecone, Weaviate, or FAISS
- **Deployment**: `Gradio`, `Streamlit`, Huggingface Spaces
- **Programming Language**: Python

---

If you need guidance on specific steps, feel free to ask!

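As a rough illustration of the deployment step in instructions.txt, a minimal Gradio app of the kind Spaces can serve might look like the sketch below; answer_query is a hypothetical placeholder for whatever retrieval function the system exposes, not the repository's actual entry point:

# Sketch: a single Gradio Interface wrapping a placeholder search function,
# suitable for a Huggingface Space that serves a Gradio SDK app.
import gradio as gr

def answer_query(query: str, num_results: int) -> str:
    # Placeholder: a real app would query the vector store / RAG chain here.
    return f"Top {int(num_results)} matches for: {query}"

demo = gr.Interface(
    fn=answer_query,
    inputs=[
        gr.Textbox(label="What would you like to learn?"),
        gr.Slider(1, 10, value=5, step=1, label="Number of Results"),
    ],
    outputs=gr.Markdown(),
    title="Course Search",
)

if __name__ == "__main__":
    demo.launch()  # Gradio defaults to port 7860, which Spaces expects

On Spaces, the public URL comes from the Space itself once the app binds its default port, so no extra deployment configuration is usually needed beyond the README frontmatter.
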
requirements.txt
ADDED
File without changes

tests/__init__.py
ADDED
File without changes

tests/__pycache__/__init__.cpython-312.pyc
ADDED
Binary file (159 Bytes)

tests/__pycache__/test_complete_pipeline.cpython-312.pyc
ADDED
Binary file (2.43 kB)

tests/test_complete_pipeline.py
ADDED
@@ -0,0 +1,47 @@
import logging
from course_search.search_system.embeddings import EmbeddingGenerator
from course_search.search_system.vector_store import FAISSManager
import pandas as pd
import numpy as np

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def test_pipeline():
    try:
        # Test data
        test_data = pd.DataFrame({
            'title': ['Test Course 1', 'Test Course 2'],
            'description': ['This is a test course about Python', 'This is a test course about ML'],
            'url': ['http://test1.com', 'http://test2.com']
        })

        # Test embedding generation
        logger.info("Testing embedding generation...")
        embedding_gen = EmbeddingGenerator()
        test_data = embedding_gen.add_embeddings_to_df(test_data)
        assert 'embeddings' in test_data.columns
        logger.info("Embedding generation successful!")

        # Test FAISS storage
        logger.info("Testing FAISS storage...")
        vector_store = FAISSManager()
        vector_store.upsert_courses(test_data)
        logger.info("FAISS storage successful!")

        # Test search
        logger.info("Testing search...")
        query = "Python programming"
        query_embedding = embedding_gen.generate_embeddings(pd.Series([query]))[0]
        results = vector_store.search_courses(query_embedding)
        assert len(results) > 0
        logger.info("Search successful!")

        return True

    except Exception as e:
        logger.error(f"Pipeline test failed: {str(e)}")
        return False

if __name__ == "__main__":
    test_pipeline()

tests/test_integration.py
ADDED
@@ -0,0 +1,19 @@
from course_search.search_system.data_pipeline import DataPipeline
import logging

def test_pipeline():
    pipeline = DataPipeline()

    # Run pipeline and save results
    df = pipeline.run_pipeline(save_path='data/courses_with_embeddings.pkl')

    # Print results
    print("\nDataFrame Info:")
    print(df.info())
    print("\nSample Course:")
    print(df.iloc[0][['title', 'description']].to_dict())
    print("\nSample Embedding Shape:", df.iloc[0]['embeddings'].shape)

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    test_pipeline()

tests/test_scraper.py
ADDED
@@ -0,0 +1,33 @@
import logging
from course_search.scraper.course_scraper import CourseScraper

def test_scraper():
    scraper = CourseScraper()

    # Test 1: Get course links
    print("\nTesting get_course_links()...")
    links = scraper.get_course_links()
    print(f"Found {len(links)} links")
    print("Sample links:")
    for link in links[:3]:
        print(f"- {link}")

    # Test 2: Extract course info
    if links:
        print("\nTesting extract_course_info()...")
        sample_course = scraper.extract_course_info(links[0])
        print("Sample course info:")
        for key, value in sample_course.items():
            print(f"{key}: {value[:100]}...")

    # Test 3: Scrape all courses
    print("\nTesting scrape_all_courses()...")
    df = scraper.scrape_all_courses()
    print("\nDataFrame Info:")
    print(df.info())
    print("\nFirst few rows:")
    print(df.head())

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    test_scraper()

tests/test_vector_store.py
ADDED
@@ -0,0 +1,49 @@
import numpy as np
import faiss
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def test_faiss():
    try:
        # Create a small test index
        dimension = 64
        nb = 100

        # Generate random data
        xb = np.random.random((nb, dimension)).astype('float32')

        # Create index
        index = faiss.IndexFlatL2(dimension)

        # Add vectors
        index.add(xb)

        # Test search
        k = 5
        xq = np.random.random((1, dimension)).astype('float32')
        D, I = index.search(xq, k)

        logger.info("FAISS test successful!")
        logger.info(f"Found {k} nearest neighbors")

        return True

    except Exception as e:
        logger.error(f"FAISS test failed: {str(e)}")
        return False

import torch

def test_torch():
    try:
        x = torch.rand(5, 3)
        print("PyTorch is working correctly. Tensor:", x)
    except Exception as e:
        print("Error with PyTorch:", e)


if __name__ == "__main__":
    test_faiss()
    test_torch()