Abinivesh commited on
Commit
1ff5897
·
verified ·
1 Parent(s): 81da1c3

Upload 3 files

Browse files
Files changed (3) hide show
  1. README.md +21 -13
  2. app_using_shiny.py +205 -0
  3. app_using_streamlit.py +89 -0
README.md CHANGED
@@ -1,13 +1,21 @@
1
- ---
2
- title: SmartSearchTool AnalyticsVidhya
3
- emoji: 🏆
4
- colorFrom: gray
5
- colorTo: pink
6
- sdk: streamlit
7
- sdk_version: 1.40.0
8
- app_file: app.py
9
- pinned: false
10
- short_description: A smart search tool to display relevant free courses
11
- ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
1
+ # Smart Search Tool for Analytics Vidhya Courses
2
+ # Goal
3
+ To create a smart search tool that enables users to find relevant free courses on Analytics Vidhya’s platform quickly.
4
+
5
+ # Project Approach
6
+ # Data Collection
7
+ I began by scraping the free courses' titles and relevant metadata, such as course links and images, from Analytics Vidhya’s platform using BeautifulSoup.
8
+
9
+ # Model Selection
10
+ Originally, I used the Groq API for generating embeddings and conducting searches. However, I found the results less suitable, leading me to switch to a more refined solution using BERT (Bidirectional Encoder Representations from Transformers). I leveraged a pre-trained BERT model (bert-base-uncased from Hugging Face) for generating embeddings.
11
+
12
+ # Relevance Matching
13
+ To match user queries with relevant courses, I calculated cosine similarity between the user’s query embedding and the course title embeddings. This similarity score enables ranking courses based on relevance, ensuring the most suitable courses are shown first.
14
+
15
+ # Interface
16
+ The application uses both Streamlit and Shiny for flexible, user-friendly interfaces. These interfaces display course details dynamically, including title, image, link, and relevance score. In the end, I concluded that Shiny retrieves results faster and displays them more interactively than Streamlit.
17
+
18
+ # Deployment on Hugging Face Spaces
19
+ I deployed the tool on Hugging Face Spaces, providing an accessible, visually appealing interface for public use, enhanced with custom CSS for style and responsiveness.
20
+
21
+ BERT model: google-bert/bert-base-uncased
app_using_shiny.py ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from shiny import App, ui, render
2
+ import requests
3
+ from bs4 import BeautifulSoup
4
+ import pandas as pd
5
+ import torch
6
+ from transformers import BertTokenizer, BertModel
7
+ import numpy as np
8
+ from sklearn.metrics.pairwise import cosine_similarity
9
+
10
# Step 1: Scrape the free courses from Analytics Vidhya.
# Each course card exposes its title/thumbnail via an <img> tag; the course
# page URL lives in the nearest preceding <a> tag.
url = "https://courses.analyticsvidhya.com/pages/all-free-courses"
response = requests.get(url, timeout=30)  # timeout: never hang the app start on a stalled connection
soup = BeautifulSoup(response.content, 'html.parser')

courses = []

# Extracting course title, image, and course link
for course_card in soup.find_all('header', class_='course-card__img-container'):
    img_tag = course_card.find('img', class_='course-card__img')
    if not img_tag:
        continue

    title = img_tag.get('alt')
    image_url = img_tag.get('src')

    link_tag = course_card.find_previous('a')
    if not link_tag:
        # A card without a link is unusable. Previously a missing link tag
        # leaked the previous iteration's course_link into this course
        # (or raised NameError on the very first card) — skip instead.
        continue

    course_link = link_tag.get('href')
    if not course_link.startswith('http'):
        course_link = 'https://courses.analyticsvidhya.com' + course_link

    courses.append({
        'title': title,
        'image_url': image_url,
        'course_link': course_link
    })

# Step 2: Create DataFrame (columns: title, image_url, course_link)
df = pd.DataFrame(courses)
39
+
40
# Load pre-trained BERT model and tokenizer (bert-base-uncased).
# from_pretrained() downloads and caches the weights on first use; per the
# transformers docs the returned model is in eval mode by default, so dropout
# is disabled for inference.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
43
+
44
# Function to generate embeddings using BERT
def get_bert_embedding(text):
    """Embed *text* as a (1, hidden_size) numpy array.

    Tokenizes the input, runs a forward pass with gradient tracking
    disabled, and mean-pools the final hidden states over the token axis.
    """
    encoded = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    # Average across the sequence dimension -> one fixed-size vector.
    return hidden_states.mean(dim=1).numpy()
50
+
51
# Create embeddings for course titles (one (1, hidden_size) array per row).
# Pass the function directly to apply(); the lambda wrapper was redundant.
df['embedding'] = df['title'].apply(get_bert_embedding)
53
+
54
# Function to perform search using BERT-based similarity
def search_courses(query):
    """Rank courses by cosine similarity between *query* and each title.

    Returns up to 10 dicts with keys: title, image_url, course_link, score
    (score is the raw cosine similarity; higher means more relevant).
    """
    query_embedding = get_bert_embedding(query)
    course_embeddings = np.vstack(df['embedding'].values)

    # Compute cosine similarity between query embedding and course embeddings
    similarities = cosine_similarity(query_embedding, course_embeddings).flatten()

    # Rank on a copy: the original wrote df['score'] = ... into the shared
    # module-level DataFrame, which races when two sessions search at once.
    ranked = df.assign(score=similarities).sort_values(by='score', ascending=False).head(10)
    return ranked[['title', 'image_url', 'course_link', 'score']].to_dict(orient='records')
68
+
69
# Shiny UI and Server
# Page-level stylesheet: Poppins font, flex card grid, hover lift effect.
_PAGE_CSS = """
@import url('https://fonts.googleapis.com/css2?family=Poppins:wght@300;500;700&display=swap');

body {
    font-family: 'Poppins', sans-serif;
    background-color: #f4f6f9;
}

.container {
    padding: 20px;
}

h2 {
    color: #ff6f61;
    font-weight: 700;
    text-align: center;
}

.result-container {
    display: flex;
    flex-wrap: wrap;
    gap: 20px;
    justify-content: center;
}

.course-card {
    background-color: #ffffff;
    border-radius: 12px;
    box-shadow: 0 4px 10px rgba(0, 0, 0, 0.15);
    overflow: hidden;
    width: calc(50% - 10px);
    transition: transform 0.3s, box-shadow 0.3s;
}

.course-card:hover {
    transform: scale(1.05);
    box-shadow: 0 6px 20px rgba(0, 0, 0, 0.2);
}

.course-image {
    width: 100%;
    height: 180px;
    object-fit: cover;
    border-top-left-radius: 12px;
    border-top-right-radius: 12px;
}

.course-info {
    padding: 15px;
}

.course-info h3 {
    font-size: 20px;
    color: #333;
    margin-top: 0;
}

.course-info p {
    color: #666;
    font-size: 16px;
    margin-bottom: 10px;
}

.course-link {
    background-color: #ff6f61;
    color: white;
    padding: 8px 12px;
    text-decoration: none;
    border-radius: 6px;
    font-size: 15px;
    display: inline-block;
    margin-top: 10px;
    transition: background-color 0.2s;
}

.course-link:hover {
    background-color: #e85a50;
}

.no-results {
    text-align: center;
    color: #888;
    font-style: italic;
}
"""

# Page layout: styled heading, query box, status line, and the result cards.
app_ui = ui.page_fluid(
    ui.tags.style(_PAGE_CSS),
    ui.h2("Analytics Vidhya Smart Course Search"),
    ui.input_text("query", "Enter your search query", placeholder="e.g., machine learning, data science, python"),
    ui.output_text("search_info"),
    ui.output_ui("results"),
)
163
+
164
def server(input, output, session):
    """Wire the search box to the ranked-results card grid."""

    def _course_card(item):
        # One result card: thumbnail, title, relevance %, external link.
        relevance_score = round(item['score'] * 100, 2)
        return ui.div(
            ui.img(src=item['image_url'], class_="course-image"),
            ui.div(
                ui.h3(item['title']),
                ui.p(f"Relevance: {relevance_score}%"),
                ui.a("View Course", href=item['course_link'], target="_blank", class_="course-link"),
                class_="course-info"
            ),
            class_="course-card"
        )

    @output
    @render.ui
    def results():
        # Guard clauses: prompt when empty, message when nothing matched.
        if not input.query():
            return ui.p("Enter a search query to get started!", class_="no-results")

        matches = search_courses(input.query())
        if not matches:
            return ui.p("No results found.", class_="no-results")

        return ui.div(*[_course_card(m) for m in matches], class_="result-container")

    @output
    @render.text
    def search_info():
        # Status line above the results grid.
        return f"Results for '{input.query()}'" if input.query() else "Search for courses by typing a query above."

app = App(app_ui, server)
app_using_streamlit.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ from bs4 import BeautifulSoup
3
+ import pandas as pd
4
+ import streamlit as st
5
+ import torch
6
+ from transformers import BertTokenizer, BertModel
7
+ import numpy as np
8
+ from sklearn.metrics.pairwise import cosine_similarity
9
+
10
# Step 1: Scrape the free courses from Analytics Vidhya.
# Each course card exposes its title/thumbnail via an <img> tag; the course
# page URL lives in the nearest preceding <a> tag.
url = "https://courses.analyticsvidhya.com/pages/all-free-courses"
response = requests.get(url, timeout=30)  # timeout: never hang the app start on a stalled connection
soup = BeautifulSoup(response.content, 'html.parser')

courses = []

# Extracting course title, image, and course link
for course_card in soup.find_all('header', class_='course-card__img-container'):
    img_tag = course_card.find('img', class_='course-card__img')
    if not img_tag:
        continue

    title = img_tag.get('alt')
    image_url = img_tag.get('src')

    link_tag = course_card.find_previous('a')
    if not link_tag:
        # A card without a link is unusable. Previously a missing link tag
        # leaked the previous iteration's course_link into this course
        # (or raised NameError on the very first card) — skip instead.
        continue

    course_link = link_tag.get('href')
    if not course_link.startswith('http'):
        course_link = 'https://courses.analyticsvidhya.com' + course_link

    courses.append({
        'title': title,
        'image_url': image_url,
        'course_link': course_link
    })

# Step 2: Create DataFrame (columns: title, image_url, course_link)
df = pd.DataFrame(courses)
39
+
40
# Load pre-trained BERT model and tokenizer (bert-base-uncased).
# from_pretrained() downloads and caches the weights on first use; per the
# transformers docs the returned model is in eval mode by default, so dropout
# is disabled for inference.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
43
+
44
# Function to generate embeddings using BERT
def get_bert_embedding(text):
    """Embed *text* as a (1, hidden_size) numpy array.

    The input is tokenized, pushed through BERT under torch.no_grad(),
    and the last-layer hidden states are mean-pooled over the tokens.
    """
    token_batch = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    with torch.no_grad():
        last_hidden = model(**token_batch).last_hidden_state
    # Mean over the token dimension yields a single sentence vector.
    return last_hidden.mean(dim=1).numpy()
50
+
51
# Create embeddings for course titles (one (1, hidden_size) array per row).
# Pass the function directly to apply(); the lambda wrapper was redundant.
df['embedding'] = df['title'].apply(get_bert_embedding)
53
+
54
# Function to perform search using BERT-based similarity
def search_courses(query):
    """Rank courses by cosine similarity between *query* and each title.

    Returns up to 10 dicts with keys: title, image_url, course_link, score
    (score is the raw cosine similarity; higher means more relevant).
    """
    query_embedding = get_bert_embedding(query)
    course_embeddings = np.vstack(df['embedding'].values)

    # Compute cosine similarity between query embedding and course embeddings
    similarities = cosine_similarity(query_embedding, course_embeddings).flatten()

    # Rank on a copy: the original wrote df['score'] = ... into the shared
    # module-level DataFrame, which races when two reruns search at once.
    ranked = df.assign(score=similarities).sort_values(by='score', ascending=False).head(10)
    return ranked[['title', 'image_url', 'course_link', 'score']].to_dict(orient='records')
68
+
69
# Streamlit Interface
st.title("Analytics Vidhya Smart Course Search")
st.write("Find the most relevant courses from Analytics Vidhya based on your query.")

query = st.text_input("Enter your search query", placeholder="e.g., machine learning, data science, python")

if query:
    matches = search_courses(query)
    if not matches:
        st.write("No results found.")
    else:
        # Each hit renders as: thumbnail, linked title, relevance %, divider.
        for course in matches:
            st.image(course['image_url'], width=300)
            st.markdown(f"### [{course['title']}]({course['course_link']})")
            st.write(f"Relevance: {round(course['score'] * 100, 2)}%")
            st.markdown("---")