Ankan Ghosh commited on
Commit
cd5a7c4
·
verified ·
1 Parent(s): d6a4fef

Upload 11 files

Browse files
app.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import streamlit as st
import pandas as pd
from main import init_qdrant, load_data, input_ratings, recommend_movies


@st.cache_resource
def _get_qdrant():
    """Create the Qdrant client and upload the rating vectors exactly once.

    Streamlit re-executes this script on every widget interaction; caching
    the client avoids rebuilding the whole index on each rerun.
    """
    client = init_qdrant()
    load_data(client)
    return client


@st.cache_data
def _read_csv(path):
    """Read a CSV file into a DataFrame, cached across script reruns."""
    return pd.read_csv(path)


def _clear_selections():
    """Reset the stored ratings and the movie multiselect.

    Runs as a button callback, i.e. before the script reruns, which is the
    point at which Streamlit allows widget state to be overwritten.
    """
    st.session_state["user_ratings"] = {}
    st.session_state["selected_movies"] = []


# Initialize Qdrant and load data (cached, see _get_qdrant)
qdrant = _get_qdrant()

# Load movies and ratings
movies = _read_csv("./data/ml-latest-small/movies.csv")
ratings = _read_csv("./data/ml-latest-small/ratings.csv")

# Initialize session state to store user ratings:
# maps movie title -> (movie id, rating)
if "user_ratings" not in st.session_state:
    st.session_state["user_ratings"] = {}

# Streamlit app interface
st.title("Movie Recommendation System")

# Movie selection and rating
movie_titles = movies["title"].tolist()

# Movie search and selection using multiselect; the explicit key lets
# _clear_selections() reset the widget.
selected_movies = st.multiselect(
    "Search and select movies to rate", movie_titles, key="selected_movies"
)

if selected_movies:
    for movie in selected_movies:
        # The bundled README documents ratings as 0.5 - 5.0 stars in
        # half-star steps, so 0.0 is not offered.
        rating = st.slider(f"Rate {movie}", 0.5, 5.0, 0.5, 0.5)
        if st.button(f"Add {movie}"):
            # Store native Python types rather than NumPy scalars.
            movie_id = int(movies[movies.title == movie].movieId.iloc[0])
            st.session_state["user_ratings"][movie] = (movie_id, float(rating))
            st.write(f"Added: {movie} with a rating of {rating}")
else:
    st.write("Select movies to rate from the dropdown.")

# Clear button to reset all inputs
st.button("Clear Selections", on_click=_clear_selections)

# Display current ratings
if st.session_state["user_ratings"]:
    st.write("Current Movie Ratings:")
    for movie, (movie_id, rating) in st.session_state["user_ratings"].items():
        st.write(f"{movie}: {rating}")

# Get recommendations
if st.button("Get Recommendations"):
    if st.session_state["user_ratings"]:
        final_ratings = input_ratings(st.session_state["user_ratings"], ratings)
        recommendations = recommend_movies(qdrant, movies, final_ratings)

        if recommendations:
            st.header("Recommended Movies for You")
            for movie in recommendations:
                st.write(movie)
        else:
            st.info("No recommendations found based on your ratings.")
    else:
        st.warning("Please rate at least one movie to get recommendations.")
data/ml-latest-small/.ipynb_checkpoints/links-checkpoint.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/ml-latest-small/.ipynb_checkpoints/movies-checkpoint.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/ml-latest-small/.ipynb_checkpoints/tags-checkpoint.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/ml-latest-small/README.txt ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Summary
2
+ =======
3
+
4
+ This dataset (ml-latest-small) describes 5-star rating and free-text tagging activity from [MovieLens](http://movielens.org), a movie recommendation service. It contains 100836 ratings and 3683 tag applications across 9742 movies. These data were created by 610 users between March 29, 1996 and September 24, 2018. This dataset was generated on September 26, 2018.
5
+
6
+ Users were selected at random for inclusion. All selected users had rated at least 20 movies. No demographic information is included. Each user is represented by an id, and no other information is provided.
7
+
8
+ The data are contained in the files `links.csv`, `movies.csv`, `ratings.csv` and `tags.csv`. More details about the contents and use of all these files follow.
9
+
10
+ This is a *development* dataset. As such, it may change over time and is not an appropriate dataset for shared research results. See available *benchmark* datasets if that is your intent.
11
+
12
+ This and other GroupLens data sets are publicly available for download at <http://grouplens.org/datasets/>.
13
+
14
+
15
+ Usage License
16
+ =============
17
+
18
+ Neither the University of Minnesota nor any of the researchers involved can guarantee the correctness of the data, its suitability for any particular purpose, or the validity of results based on the use of the data set. The data set may be used for any research purposes under the following conditions:
19
+
20
+ * The user may not state or imply any endorsement from the University of Minnesota or the GroupLens Research Group.
21
+ * The user must acknowledge the use of the data set in publications resulting from the use of the data set (see below for citation information).
22
+ * The user may redistribute the data set, including transformations, so long as it is distributed under these same license conditions.
23
+ * The user may not use this information for any commercial or revenue-bearing purposes without first obtaining permission from a faculty member of the GroupLens Research Project at the University of Minnesota.
24
+ * The executable software scripts are provided "as is" without warranty of any kind, either expressed or implied, including, but not limited to, the implied warranties of merchantability and fitness for a particular purpose. The entire risk as to the quality and performance of them is with you. Should the program prove defective, you assume the cost of all necessary servicing, repair or correction.
25
+
26
+ In no event shall the University of Minnesota, its affiliates or employees be liable to you for any damages arising out of the use or inability to use these programs (including but not limited to loss of data or data being rendered inaccurate).
27
+
28
+ If you have any further questions or comments, please email <grouplens-info@umn.edu>
29
+
30
+
31
+ Citation
32
+ ========
33
+
34
+ To acknowledge use of the dataset in publications, please cite the following paper:
35
+
36
+ > F. Maxwell Harper and Joseph A. Konstan. 2015. The MovieLens Datasets: History and Context. ACM Transactions on Interactive Intelligent Systems (TiiS) 5, 4: 19:1–19:19. <https://doi.org/10.1145/2827872>
37
+
38
+
39
+ Further Information About GroupLens
40
+ ===================================
41
+
42
+ GroupLens is a research group in the Department of Computer Science and Engineering at the University of Minnesota. Since its inception in 1992, GroupLens's research projects have explored a variety of fields including:
43
+
44
+ * recommender systems
45
+ * online communities
46
+ * mobile and ubiquitous technologies
47
+ * digital libraries
48
+ * local geographic information systems
49
+
50
+ GroupLens Research operates a movie recommender based on collaborative filtering, MovieLens, which is the source of these data. We encourage you to visit <http://movielens.org> to try it out! If you have exciting ideas for experimental work to conduct on MovieLens, send us an email at <grouplens-info@cs.umn.edu> - we are always interested in working with external collaborators.
51
+
52
+
53
+ Content and Use of Files
54
+ ========================
55
+
56
+ Formatting and Encoding
57
+ -----------------------
58
+
59
+ The dataset files are written as [comma-separated values](http://en.wikipedia.org/wiki/Comma-separated_values) files with a single header row. Columns that contain commas (`,`) are escaped using double-quotes (`"`). These files are encoded as UTF-8. If accented characters in movie titles or tag values (e.g. Misérables, Les (1995)) display incorrectly, make sure that any program reading the data, such as a text editor, terminal, or script, is configured for UTF-8.
60
+
61
+
62
+ User Ids
63
+ --------
64
+
65
+ MovieLens users were selected at random for inclusion. Their ids have been anonymized. User ids are consistent between `ratings.csv` and `tags.csv` (i.e., the same id refers to the same user across the two files).
66
+
67
+
68
+ Movie Ids
69
+ ---------
70
+
71
+ Only movies with at least one rating or tag are included in the dataset. These movie ids are consistent with those used on the MovieLens web site (e.g., id `1` corresponds to the URL <https://movielens.org/movies/1>). Movie ids are consistent between `ratings.csv`, `tags.csv`, `movies.csv`, and `links.csv` (i.e., the same id refers to the same movie across these four data files).
72
+
73
+
74
+ Ratings Data File Structure (ratings.csv)
75
+ -----------------------------------------
76
+
77
+ All ratings are contained in the file `ratings.csv`. Each line of this file after the header row represents one rating of one movie by one user, and has the following format:
78
+
79
+ userId,movieId,rating,timestamp
80
+
81
+ The lines within this file are ordered first by userId, then, within user, by movieId.
82
+
83
+ Ratings are made on a 5-star scale, with half-star increments (0.5 stars - 5.0 stars).
84
+
85
+ Timestamps represent seconds since midnight Coordinated Universal Time (UTC) of January 1, 1970.
86
+
87
+
88
+ Tags Data File Structure (tags.csv)
89
+ -----------------------------------
90
+
91
+ All tags are contained in the file `tags.csv`. Each line of this file after the header row represents one tag applied to one movie by one user, and has the following format:
92
+
93
+ userId,movieId,tag,timestamp
94
+
95
+ The lines within this file are ordered first by userId, then, within user, by movieId.
96
+
97
+ Tags are user-generated metadata about movies. Each tag is typically a single word or short phrase. The meaning, value, and purpose of a particular tag is determined by each user.
98
+
99
+ Timestamps represent seconds since midnight Coordinated Universal Time (UTC) of January 1, 1970.
100
+
101
+
102
+ Movies Data File Structure (movies.csv)
103
+ ---------------------------------------
104
+
105
+ Movie information is contained in the file `movies.csv`. Each line of this file after the header row represents one movie, and has the following format:
106
+
107
+ movieId,title,genres
108
+
109
+ Movie titles are entered manually or imported from <https://www.themoviedb.org/>, and include the year of release in parentheses. Errors and inconsistencies may exist in these titles.
110
+
111
+ Genres are a pipe-separated list, and are selected from the following:
112
+
113
+ * Action
114
+ * Adventure
115
+ * Animation
116
+ * Children's
117
+ * Comedy
118
+ * Crime
119
+ * Documentary
120
+ * Drama
121
+ * Fantasy
122
+ * Film-Noir
123
+ * Horror
124
+ * Musical
125
+ * Mystery
126
+ * Romance
127
+ * Sci-Fi
128
+ * Thriller
129
+ * War
130
+ * Western
131
+ * (no genres listed)
132
+
133
+
134
+ Links Data File Structure (links.csv)
135
+ ---------------------------------------
136
+
137
+ Identifiers that can be used to link to other sources of movie data are contained in the file `links.csv`. Each line of this file after the header row represents one movie, and has the following format:
138
+
139
+ movieId,imdbId,tmdbId
140
+
141
+ movieId is an identifier for movies used by <https://movielens.org>. E.g., the movie Toy Story has the link <https://movielens.org/movies/1>.
142
+
143
+ imdbId is an identifier for movies used by <http://www.imdb.com>. E.g., the movie Toy Story has the link <http://www.imdb.com/title/tt0114709/>.
144
+
145
+ tmdbId is an identifier for movies used by <https://www.themoviedb.org>. E.g., the movie Toy Story has the link <https://www.themoviedb.org/movie/862>.
146
+
147
+ Use of the resources listed above is subject to the terms of each provider.
148
+
149
+
150
+ Cross-Validation
151
+ ----------------
152
+
153
+ Prior versions of the MovieLens dataset included either pre-computed cross-folds or scripts to perform this computation. We no longer bundle either of these features with the dataset, since most modern toolkits provide this as a built-in feature. If you wish to learn about standard approaches to cross-fold computation in the context of recommender systems evaluation, see [LensKit](http://lenskit.org) for tools, documentation, and open-source code examples.
data/ml-latest-small/links.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/ml-latest-small/movies.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/ml-latest-small/ratings.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/ml-latest-small/tags.csv ADDED
The diff for this file is too large to render. See raw diff
 
main.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from qdrant_client import QdrantClient, models
3
+ from collections import defaultdict
4
+
5
# Read the MovieLens tables once at import time; `ratings` is the frame
# that load_data() below normalizes and uploads.
tags, movies, ratings = map(
    pd.read_csv,
    (
        "./data/ml-latest-small/tags.csv",
        "./data/ml-latest-small/movies.csv",
        "./data/ml-latest-small/ratings.csv",
    ),
)
9
+
10
+ # Initialize Qdrant client and create collections
11
+ def init_qdrant():
12
+ qdrant = QdrantClient(":memory:") # Use in-memory for simplicity
13
+ qdrant.create_collection(
14
+ "movielens", vectors_config={}, sparse_vectors_config={"ratings": models.SparseVectorParams()}
15
+ )
16
+ return qdrant
17
+
18
+ # Load data and upload to Qdrant
19
+ def load_data(qdrant):
20
+ ratings['normalized_rating'] = (ratings.rating - ratings.rating.mean(axis=0)) / ratings.rating.std()
21
+
22
+ user_sparse_vectors = defaultdict(lambda: {"values": [], "indices": []})
23
+ for row in ratings.itertuples():
24
+ user_sparse_vectors[row.userId]["values"].append(row.normalized_rating)
25
+ user_sparse_vectors[row.userId]["indices"].append(row.movieId)
26
+
27
+ def data_generator():
28
+ for user_id, vector in user_sparse_vectors.items():
29
+ yield models.PointStruct(
30
+ id=user_id, vector={"ratings": vector}, payload={}
31
+ )
32
+
33
+ qdrant.upload_points("movielens", data_generator())
34
+
35
+ # Function to input and normalize ratings
36
+ def input_ratings(user_ratings, ratings):
37
+ final_ratings = {}
38
+
39
+ mean_rating = ratings.rating.mean()
40
+ std_rating = ratings.rating.std()
41
+
42
+ for movie_id, user_rating in user_ratings.values():
43
+ normalized_input_rating = (user_rating - mean_rating) / std_rating
44
+ final_ratings[movie_id] = normalized_input_rating
45
+
46
+ return final_ratings
47
+
48
+ # Search and recommendation function
49
+ def recommend_movies(qdrant, movies, my_ratings):
50
+ def to_vector(ratings):
51
+ vector = models.SparseVector(values=[], indices=[])
52
+ for movieId, rating in ratings.items():
53
+ vector.values.append(rating)
54
+ vector.indices.append(movieId)
55
+ return vector
56
+
57
+ user_vector = to_vector(my_ratings)
58
+
59
+ results = qdrant.search(
60
+ "movielens",
61
+ query_vector=models.NamedSparseVector(name="ratings", vector=user_vector),
62
+ with_vectors=True,
63
+ limit=20,
64
+ )
65
+
66
+ movie_scores = defaultdict(lambda: 0)
67
+ for user in results:
68
+ user_scores = user.vector["ratings"]
69
+ for idx, rating in zip(user_scores.indices, user_scores.values):
70
+ if idx in my_ratings:
71
+ continue
72
+ movie_scores[idx] += rating
73
+
74
+ top_movies = sorted(movie_scores.items(), key=lambda x: x[1], reverse=True)
75
+ recommended_movies = [movies[movies.movieId == movieId].title.values[0] for movieId, score in top_movies[:5]]
76
+
77
+ return recommended_movies
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ qdrant-client==1.10.1
2
+ pandas==2.2.2
3
+ numpy==1.26.4