import pandas as pd
from sentence_transformers import SentenceTransformer, util
from transformers import pipeline
import torch
import gradio as gr
import os

# Build the path to the CSV file in the current working directory
csv_file_path = os.path.join(os.getcwd(), 'Analytics_Vidhya_Free_Course_data.csv')

# Load the dataset
df = pd.read_csv(csv_file_path, encoding='Windows-1252')
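# Columns this script relies on (inferred from how they are accessed below):
# column 0 = course title, column 1 = course description, plus 'Instructor Name',
# 'Rating', 'Category', 'Level of Difficulty', and the course URL in the last column.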

# Load the pre-trained model for embeddings (using SentenceTransformers)
model = SentenceTransformer('multi-qa-mpnet-base-dot-v1')

# Combine title, description, instructor, rating, and category into one text per course
df['full_text'] = df.iloc[:, 0] + " " + df.iloc[:, 1] + " " + df['Instructor Name'] + " " + df['Rating'].astype(str) + " " + df['Category']

# Convert full course texts into embeddings
course_embeddings = model.encode(df['full_text'].tolist(), convert_to_tensor=True)
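# course_embeddings is a 2-D tensor of shape (num_courses, embedding_dim);
# for this mpnet-base model the embedding dimension should be 768.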

# Load the paraphrasing model once at startup so it is not re-created on every query
paraphraser = pipeline('text2text-generation', model='Vamsi/T5_Paraphrase_Paws')

# Function to expand the query using paraphrasing
def expand_query(query):
    expanded_queries = paraphraser(query, num_return_sequences=3, max_length=50, do_sample=True)
    return [q['generated_text'] for q in expanded_queries]

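# Illustrative only (actual paraphrases vary because do_sample=True): expand_query()
# returns a list of three reworded versions of the input, e.g. for
# "free python course for beginners" it might yield variants such as
# "a free beginner course in python".
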
# Function to search for the most relevant courses
def search_courses(query, level_filter=None, category_filter=None, top_k=3):
    # Step 1: Expand the query using paraphrasing
    expanded_queries = expand_query(query)

    # Step 2: Collect similarity scores for each expanded query
    all_similarities = []

    for expanded_query in expanded_queries:
        # Convert each expanded query into an embedding
        query_embedding = model.encode(expanded_query, convert_to_tensor=True)

        # Compute cosine similarities between the query embedding and course embeddings
        similarities = util.pytorch_cos_sim(query_embedding, course_embeddings)[0]

        # Append to the list of all similarities
        all_similarities.append(similarities)

    # Step 3: Convert the list of tensors to a single tensor by taking the maximum similarity for each course
    aggregated_similarities = torch.max(torch.stack(all_similarities), dim=0)[0]
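    # Worked example: with two paraphrases and three courses, stacking
    # [[0.2, 0.7, 0.1], [0.4, 0.3, 0.5]] and taking the max over dim 0
    # gives [0.4, 0.7, 0.5], i.e. the best score each course achieved.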

    # Step 4: Apply filters
    filtered_df = df.copy()
    if level_filter:
        filtered_df = filtered_df[filtered_df['Level of Difficulty'] == level_filter]
    if category_filter:
        filtered_df = filtered_df[filtered_df['Category'] == category_filter]
    
    if filtered_df.empty:
        return "<p>No matching courses found.</p>"
    
    # Select the similarities for the filtered rows; convert the pandas index to a
    # NumPy array so it can index the tensor (df keeps its default RangeIndex,
    # so index labels match positions in course_embeddings)
    filtered_similarities = aggregated_similarities[filtered_df.index.to_numpy()]

    # Step 5: Get top_k most similar courses
    top_results = filtered_similarities.topk(k=min(top_k, len(filtered_similarities)))

    # Prepare the output as clickable links
    results = []
    for idx in top_results.indices:
        row = filtered_df.iloc[int(idx)]
        course_title = row['Course Title']
        course_description = row.iloc[1]  # description column (second column)
        course_url = row.iloc[-1]         # course URL (last column)

        # Format the result as a clickable hyperlink using raw HTML
        course_link = f'<a href="{course_url}" target="_blank">{course_title}</a>'
        results.append(f"<strong>{course_link}</strong><br>{course_description}<br><br>")

    # Combine all results into an HTML formatted list
    return "<ol>" + "".join([f"<li>{result}</li>" for result in results]) + "</ol>"

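# Quick sanity check outside the UI (assumes the CSV above is present locally):
# print(search_courses("machine learning basics", level_filter="Beginner"))
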
# Create Gradio UI
def create_gradio_interface():
    with gr.Blocks() as demo:
        gr.Markdown("#  Analytics Vidhya Free Courses")
        gr.Markdown("Enter your query and use filters to narrow down the search.")

        # Input elements
        query = gr.Textbox(label="Search for a course", placeholder="Enter course topic or description")

        # Filters (in a collapsible form)
        with gr.Accordion("Filters", open=False):
            level_filter = gr.Dropdown(choices=["Beginner", "Intermediate", "Advanced"], label="Course Level", multiselect=False)
            category_filter = gr.Dropdown(choices=["Data Science", "Machine Learning", "Deep Learning", "AI", "NLP"], label="Category", multiselect=False)

        # Search button
        search_button = gr.Button("Search")

        # Output HTML for displaying results
        output = gr.HTML(label="Search Results")

        # On button click, trigger the search function
        search_button.click(fn=search_courses, inputs=[query, level_filter, category_filter], outputs=output)

    return demo

# Launch Gradio interface
demo = create_gradio_interface()
demo.launch(share=True, debug=True)