Let's start with what you should already have:
Instructions: complete Task 0 and then focus either on Tasks 1 & 2 or on Task 3. Leave the last 15 minutes for reporting on the achieved results.
In the subsequent code, we assume the use of the LensKit framework, the MovieLens_latest_small dataset, and the Popularity, ItemKNN, UserKNN, and FunkSVD recommendation algorithms.
from lenskit import batch, topn, util
from lenskit import crossfold as xf
from lenskit.algorithms import Recommender, funksvd, item_knn, user_knn, basic
from lenskit import topn
import pandas as pd
# Load the movie metadata and index it by movieId so titles and popularity
# can be looked up directly from the item ids found in the ratings.
moviesDF = pd.read_csv("movies.csv", sep=",")
moviesDF["movieId"] = moviesDF["movieId"].astype(int)
moviesDF = moviesDF.set_index("movieId")

# Load the ratings; LensKit requires the "user", "item", "rating" column names.
df = pd.read_csv("ratings.csv", sep=",")
df.columns = ["user", "item", "rating", "timestamp"]

# Raw popularity of a movie = fraction of all users who rated it.
ratingCounts = df.groupby("item")["user"].count() / df["user"].nunique()

# The assignment aligns on movieId; movies without any rating get NaN.
# Fill ONLY this column with 0 -- a blanket moviesDF.fillna(0) would also
# overwrite missing titles/genres with the integer 0.
moviesDF["RatingCount"] = ratingCounts
moviesDF["RatingCount"] = moviesDF["RatingCount"].fillna(0)

# Attach title and popularity to every rating row. The lookups are positional
# (.values) because the result is indexed by movieId, not by df's row index.
movieTitles = moviesDF.loc[df["item"], "title"]
df["movieTitle"] = movieTitles.values
ratingCount = moviesDF.loc[df["item"], "RatingCount"]
df["RawMoviePopularity"] = ratingCount.values

df.head()
| | user | item | rating | timestamp | movieTitle | RawMoviePopularity |
|---|---|---|---|---|---|---|
| 0 | 1 | 1 | 4.0 | 964982703 | Toy Story (1995) | 0.352459 |
| 1 | 1 | 3 | 4.0 | 964981247 | Grumpier Old Men (1995) | 0.085246 |
| 2 | 1 | 6 | 4.0 | 964982224 | Heat (1995) | 0.167213 |
| 3 | 1 | 47 | 5.0 | 964983815 | Seven (a.k.a. Se7en) (1995) | 0.332787 |
| 4 | 1 | 50 | 5.0 | 964982931 | Usual Suspects, The (1995) | 0.334426 |
# Length of every recommendation list produced later on.
top_k = 10

# The four recommenders under comparison, configured exactly once here.
pop_alg = basic.PopScore(score_method='quantile')
iKNN = item_knn.ItemItem(nnbrs=10, feedback="implicit")
uKNN = user_knn.UserUser(nnbrs=10, feedback="implicit")
funkSVD = funksvd.FunkSVD(features=50, iterations=5, lrate=0.01)

# Keep names and instances together so the two parallel lists cannot drift.
named_algs = {
    "pop_alg": pop_alg,
    "iKNN": iKNN,
    "uKNN": uKNN,
    "funkSVD": funkSVD,
}
anames = list(named_algs)
algs = list(named_algs.values())

# Randomly sampled train/test split; to simplify things only the first
# (and only requested) partition is used.
train, test = next(iter(xf.partition_users(df, 1, xf.SampleFrac(0.2))))
print(train.shape, test.shape)
(80672, 6) (20164, 6)
def _fit_recommender(algorithm):
    """Clone *algorithm*, adapt the clone with ``Recommender.adapt`` and fit it on ``train``."""
    # Work on a clone so the configured template objects in `algs` stay unfitted.
    recommender = Recommender.adapt(util.clone(algorithm))
    recommender.fit(train)
    return recommender


# One fitted recommender per entry of `algs`, in the same order.
trained_algs = [_fit_recommender(alg) for alg in algs]
trained_algs
[<lenskit.algorithms.ranking.TopN at 0x1bcae324640>, <lenskit.algorithms.ranking.TopN at 0x1bc9ea4b730>, <lenskit.algorithms.ranking.TopN at 0x1bcae326440>, <lenskit.algorithms.ranking.TopN at 0x1bcae3247c0>]
# Produce top_k recommendations for every user that appears in the test set.
users = test["user"].unique()

recs_per_algorithm = []
for name, model in zip(anames, trained_algs):
    recs = batch.recommend(model, users, top_k, n_jobs=1)
    # Tag the rows with the algorithm name and enrich them with movie metadata.
    # Lookups go through movieId, so the values are attached positionally.
    recs = recs.assign(
        Algorithm=name,
        movieTitle=moviesDF.loc[recs["item"], "title"].values,
        RawMoviePopularity=moviesDF.loc[recs["item"], "RatingCount"].values,
    )
    recs_per_algorithm.append(recs)

# Single long table holding the lists of all algorithms.
all_recs = pd.concat(recs_per_algorithm, ignore_index=True)
all_recs.tail()
| | item | score | user | rank | Algorithm | movieTitle | RawMoviePopularity |
|---|---|---|---|---|---|---|---|
| 24395 | 3030 | 4.551667 | 610 | 6 | funkSVD | Yojimbo (1961) | 0.021311 |
| 24396 | 6460 | 4.542326 | 610 | 7 | funkSVD | Trial, The (Procès, Le) (1962) | 0.008197 |
| 24397 | 720 | 4.540106 | 610 | 8 | funkSVD | Wallace & Gromit: The Best of Aardman Animatio... | 0.044262 |
| 24398 | 1262 | 4.539869 | 610 | 9 | funkSVD | Great Escape, The (1963) | 0.070492 |
| 24399 | 92535 | 4.498750 | 610 | 10 | funkSVD | Louis C.K.: Live at the Beacon Theater (2011) | 0.016393 |
# Score every recommendation list against the held-out test ratings.
rla = topn.RecListAnalysis()
for metric in (topn.hit, topn.dcg):
    rla.add_metric(metric)

# Pass only the recommendation columns; the metadata columns added for
# inspection (movieTitle, RawMoviePopularity) are left out of the analysis.
rec_columns = ["item", "score", "user", "rank", "Algorithm"]
results = rla.compute(all_recs[rec_columns], test)
results.head()
| Algorithm | user | nrecs | hit | dcg |
|---|---|---|---|---|
| pop_alg | 1 | 10 | 1.0 | 11.21530 |
| | 2 | 10 | 0.0 | 0.00000 |
| | 3 | 10 | 0.0 | 0.00000 |
| | 4 | 10 | 1.0 | 0.63093 |
| | 5 | 10 | 1.0 | 3.90309 |
# Average the per-user metrics for each algorithm. After reset_index the
# `user` id is an ordinary column, so its mean shows up in the output too;
# it carries no meaning.
per_user_results = results.reset_index()
per_user_results.groupby("Algorithm").mean()
| Algorithm | user | nrecs | hit | dcg |
|---|---|---|---|---|
| funkSVD | 305.5 | 10.0 | 0.170492 | 0.502527 |
| iKNN | 305.5 | 10.0 | 0.685246 | 4.374455 |
| pop_alg | 305.5 | 10.0 | 0.631148 | 3.845440 |
| uKNN | 305.5 | 10.0 | 0.795082 | 5.530651 |
# Popularity of the average movie: mean of the per-movie rating fractions.
averageMoviePopularity = ratingCounts.mean()

# Popularity inside user profiles: first the mean popularity of the movies
# each user rated in the training data, then the mean of those per-user values.
perUserAveragePopularity = (
    train.groupby("user")["RawMoviePopularity"].mean()
)
averageProfilePopularity = perUserAveragePopularity.mean()

(averageProfilePopularity, averageMoviePopularity)
(0.13122333740265585, 0.016999683055613626)
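User profiles themselves are already considerably biased towards popular movies: the average popularity of the movies in a user profile (≈0.131) is roughly 7.7 times the average popularity of a movie (≈0.017), i.e., there is a big "long tail" of not very popular movies.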