File size: 4,346 Bytes
c0032bb
58bda3d
 
 
 
 
c0032bb
58bda3d
 
c0032bb
58bda3d
 
 
 
 
 
 
 
 
5feda0d
 
 
58bda3d
 
c0032bb
5feda0d
58bda3d
c0032bb
5feda0d
58bda3d
c0032bb
3baa867
5feda0d
3baa867
 
58bda3d
c0032bb
 
5feda0d
c0032bb
 
58bda3d
c0032bb
5feda0d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58bda3d
0f9515d
67df04a
 
c0032bb
58bda3d
 
0f9515d
58bda3d
 
5feda0d
58bda3d
5feda0d
4215f3c
58bda3d
5feda0d
58bda3d
e89f25d
 
5feda0d
 
 
 
e89f25d
5feda0d
0f9515d
5feda0d
 
 
 
0f9515d
 
 
 
 
c0032bb
5feda0d
c0032bb
 
5feda0d
0f9515d
c0032bb
5feda0d
c0032bb
58bda3d
0f9515d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import gradio as gr
import torch
import pandas as pd
import numpy as np
from torch_geometric.data import Data
from torch_geometric.nn import GATConv
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Define the GATConv model architecture
class ModeratelySimplifiedGATConvModel(torch.nn.Module):
    """Two-layer graph attention network built from ``GATConv`` layers.

    The first layer is created with two attention heads and the second layer
    is sized to take ``hidden_channels * 2`` input features; a ReLU and a
    dropout with p=0.45 sit between the two layers.

    Args:
        in_channels: Size of each input node feature vector.
        hidden_channels: Per-head output size of the first attention layer.
        out_channels: Size of each output node embedding.
    """

    def __init__(self, in_channels: int, hidden_channels: int, out_channels: int):
        super().__init__()
        # Attribute names are kept as-is so existing state dicts still match.
        self.conv1 = GATConv(in_channels, hidden_channels, heads=2)
        self.dropout1 = torch.nn.Dropout(0.45)
        # Input width is doubled to match the two heads of conv1.
        self.conv2 = GATConv(hidden_channels * 2, out_channels, heads=1)

    def forward(self, x, edge_index, edge_attr=None):
        """Compute node embeddings for a graph.

        Args:
            x: Node feature matrix passed to the first attention layer.
            edge_index: Graph connectivity passed to both attention layers.
            edge_attr: Optional edge features, forwarded to both layers.
                NOTE(review): the layers are built without ``edge_dim``, so
                GATConv presumably ignores these values -- confirm.

        Returns:
            The output of the second attention layer.
        """
        x = self.conv1(x, edge_index, edge_attr)
        # These three steps must rebind ``x``; a bare ``x is f(x)`` only
        # evaluates an identity comparison and throws the result away.
        x = torch.relu(x)
        x = self.dropout1(x)
        return self.conv2(x, edge_index, edge_attr)

# Load the graph dataset consumed by the GNN.
# NOTE(review): torch.load unpickles the file, so it must come from a trusted
# source. Recent PyTorch releases default to weights_only=True, which may
# reject a pickled graph object -- confirm against the deployed version.
data = torch.load("graph_data.pt", map_location=torch.device("cpu"))

# Load the BERT-based sentence transformer model
model_bert = SentenceTransformer("all-mpnet-base-v2")

# Load the video metadata. Every request reads `df`, so a failed load is
# reported and then re-raised instead of leaving `df` undefined.
try:
    df = pd.read_json("combined_data.json.gz", orient='records', lines=True, compression='gzip')
except Exception as e:
    print(f"Error reading JSON file: {e}")
    raise

# Generate GNN-based embeddings
# NOTE(review): `gatconv_model` is not defined anywhere in the visible file.
# It presumably is a ModeratelySimplifiedGATConvModel with trained weights
# loaded; it must be constructed before this point -- the checkpoint path and
# layer sizes are not shown here, so they are not guessed.
# eval() is required because torch.no_grad() alone does not disable dropout.
gatconv_model.eval()
with torch.no_grad():
    all_video_embeddings = gatconv_model(data.x, data.edge_index, data.edge_attr).cpu()

# Function to find the most similar video and recommend the top 10 based on GNN embeddings
def _top_k_by_dot_product(embeddings, query_index, k=10):
    """Return the indices of the rows scoring highest against one row.

    Scores are dot products between ``embeddings[query_index]`` and every row
    of the 2-D tensor *embeddings*. The query row itself is excluded. At most
    ``k`` indices are returned, best first; fewer when there are not enough
    other rows.
    """
    k = min(k, embeddings.shape[0] - 1)
    if k <= 0:
        return []
    # The matrix-vector product yields a fresh tensor, so masking the query
    # row below does not modify the caller's embeddings.
    scores = embeddings @ embeddings[query_index]
    scores[query_index] = -float("inf")
    return torch.topk(scores, k).indices.tolist()


def get_similar_and_recommend(input_text):
    """Find the video closest to *input_text* and recommend related videos.

    The text is encoded with the module-level ``model_bert`` and compared by
    cosine similarity against the vectors in ``df["embeddings"]``. The best
    match is then used as the query row into the module-level
    ``all_video_embeddings`` to pick up to 10 other videos by dot product.

    Args:
        input_text: Free-form search text entered by the user.

    Returns:
        A dict with three keys: ``"search_context"`` (the input text and the
        applied weight), ``"most_similar_video"`` (all columns of the best
        match) and ``"top_10_recommended_videos"`` (a list of row dicts with
        the ``"text_for_embedding"`` and ``"embeddings"`` columns removed).
    """
    # Find the most similar video based on input text
    embeddings_matrix = np.array(df["embeddings"].tolist())
    input_embedding = model_bert.encode([input_text])[0]
    similarities = cosine_similarity([input_embedding], embeddings_matrix)[0]

    # Raise the weight by 0.1 for every keyword that occurs in at least one
    # title. This is a case-insensitive substring match; comparing a keyword
    # against whole titles would almost never succeed.
    lowered_titles = df["title"].str.lower()
    weight = 1.0
    for keyword in input_text.split():
        if lowered_titles.str.contains(keyword.lower(), regex=False, na=False).any():
            weight += 0.1

    # The weight scales every score by the same positive factor, so it cannot
    # change which video wins; it is reported back in the search context.
    weighted_similarities = similarities * weight
    most_similar_index = int(np.argmax(weighted_similarities))

    # Get all features of the most similar video
    most_similar_video_features = df.iloc[most_similar_index].to_dict()

    # Recommend the top 10 videos based on GNN embeddings and dot product
    top_indices = _top_k_by_dot_product(all_video_embeddings, most_similar_index, k=10)
    top_10_recommended_videos_features = [df.iloc[idx].to_dict() for idx in top_indices]

    # Exclude unwanted features for recommended videos
    for recommended_video in top_10_recommended_videos_features:
        recommended_video.pop("text_for_embedding", None)
        recommended_video.pop("embeddings", None)

    # Create the output JSON with all features and the search context
    return {
        "search_context": {
            "input_text": input_text,
            "weight": weight,  # The applied weight based on user input
        },
        "most_similar_video": most_similar_video_features,
        "top_10_recommended_videos": top_10_recommended_videos_features,
    }

# Expose the recommender through a Gradio UI: a single text box feeds
# get_similar_and_recommend and the returned dict is rendered as JSON.
interface = gr.Interface(
    get_similar_and_recommend,
    gr.Textbox(label="Enter Text to Find Most Similar Video"),
    gr.JSON(),
    title="Video Recommendation System with GNN-based Recommendations",
    description="Enter text to find the most similar video and get top 10 recommended videos with search context and user-influenced weight factor.",
)

# Start the app.
interface.launch()