import pandas as pd
from sentence_transformers import SentenceTransformer, util
from transformers import pipeline
import torch
import gradio as gr
import os
# Use the relative path where the CSV is uploaded
csv_file_path = os.path.join(os.getcwd(), 'Analytics_Vidhya_Free_Course_data.csv')
# Load the dataset
df = pd.read_csv(csv_file_path, encoding='Windows-1252')
# Load the pre-trained model for embeddings (using SentenceTransformers)
model = SentenceTransformer('multi-qa-mpnet-base-dot-v1')
# Combine title, description, instructor, rating, and category into a full text for each course
df['full_text'] = df.iloc[:, 0] + " " + df.iloc[:, 1] + " " + df['Instructor Name'] + " " + df['Rating'].astype(str) + " " + df['Category']
# Convert full course texts into embeddings
course_embeddings = model.encode(df['full_text'].tolist(), convert_to_tensor=True)
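# Note: course_embeddings is a 2-D tensor with one row per course, in the same order as df.
# Because df keeps its default RangeIndex from read_csv, row positions and index labels line up,
# which is what the index-based filtering in search_courses below relies on.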
# Function to expand the query using paraphrasing
def expand_query(query):
    paraphraser = pipeline('text2text-generation', model='Vamsi/T5_Paraphrase_Paws')
    expanded_queries = paraphraser(query, num_return_sequences=3, max_length=50, do_sample=True)
    return [q['generated_text'] for q in expanded_queries]
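# Performance note: the paraphrase pipeline above is rebuilt on every call. A possible
# optimization (a sketch, not required for correctness) is to create it once at module
# level, next to the SentenceTransformer, and reuse it inside expand_query:
#   paraphraser = pipeline('text2text-generation', model='Vamsi/T5_Paraphrase_Paws')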
# Function to search for the most relevant courses
def search_courses(query, level_filter=None, category_filter=None, top_k=3):
    # Step 1: Expand the query using paraphrasing
    expanded_queries = expand_query(query)
    # Step 2: Initialize a list to store all similarities
    all_similarities = []
    for expanded_query in expanded_queries:
        # Convert each expanded query into an embedding
        query_embedding = model.encode(expanded_query, convert_to_tensor=True)
        # Compute cosine similarities between the query embedding and course embeddings
        similarities = util.pytorch_cos_sim(query_embedding, course_embeddings)[0]
        # Append to the list of all similarities
        all_similarities.append(similarities)
    # Step 3: Convert the list of tensors to a single tensor by taking the maximum similarity for each course
    aggregated_similarities = torch.max(torch.stack(all_similarities), dim=0)[0]
    # Step 4: Apply filters
    filtered_df = df.copy()
    if level_filter:
        filtered_df = filtered_df[filtered_df['Level of Difficulty'] == level_filter]
    if category_filter:
        filtered_df = filtered_df[filtered_df['Category'] == category_filter]
    if filtered_df.empty:
        return "<p>No matching courses found.</p>"
    # Select the similarities for the filtered rows (convert the pandas index to NumPy for tensor indexing)
    filtered_similarities = aggregated_similarities[filtered_df.index.to_numpy()]
    # Step 5: Get top_k most similar courses
    top_results = filtered_similarities.topk(k=min(top_k, len(filtered_similarities)))
    # Prepare the output as clickable links
    results = []
    for idx in top_results.indices:
        idx = int(idx)
        course_title = filtered_df.iloc[idx]['Course Title']
        course_description = filtered_df.iloc[idx, 1]
        course_url = filtered_df.iloc[idx, -1]
        # Format the result as a clickable hyperlink using raw HTML
        course_link = f'<a href="{course_url}" target="_blank">{course_title}</a>'
        results.append(f"<strong>{course_link}</strong><br>{course_description}<br><br>")
    # Combine all results into an HTML formatted list
    return "<ol>" + "".join([f"<li>{result}</li>" for result in results]) + "</ol>"
# Create Gradio UI
def create_gradio_interface():
    with gr.Blocks() as demo:
        gr.Markdown("# Analytics Vidhya Free Courses")
        gr.Markdown("Enter your query and use filters to narrow down the search.")
        # Input elements
        query = gr.Textbox(label="Search for a course", placeholder="Enter course topic or description")
        # Filters (in a collapsible form)
        with gr.Accordion("Filters", open=False):
            level_filter = gr.Dropdown(choices=["Beginner", "Intermediate", "Advanced"], label="Course Level", multiselect=False)
            category_filter = gr.Dropdown(choices=["Data Science", "Machine Learning", "Deep Learning", "AI", "NLP"], label="Category", multiselect=False)
        # Search button
        search_button = gr.Button("Search")
        # Output HTML for displaying results
        output = gr.HTML(label="Search Results")
        # On button click, trigger the search function
        search_button.click(fn=search_courses, inputs=[query, level_filter, category_filter], outputs=output)
    return demo
# Launch Gradio interface
demo = create_gradio_interface()
demo.launch(share=True, debug=True)