Let's start with what you should already have:
Instructions: complete Task 0 and then focus either on Tasks 1 & 2, or on Task 3. Leave last 15 minutes for reporting on the achieved results.
In the following code, we assume use of the LensKit framework, the MovieLens-latest-small dataset, and the Popularity, ItemKNN, UserKNN, and FunkSVD recommendation algorithms.
from lenskit import batch, topn, util
from lenskit import crossfold as xf
from lenskit.algorithms import Recommender, funksvd, item_knn, user_knn, basic
from lenskit import topn
import pandas as pd
C:\Users\lpesk\AppData\Roaming\Python\Python38\site-packages\pandas\core\computation\expressions.py:20: UserWarning: Pandas requires version '2.7.3' or newer of 'numexpr' (version '2.7.1' currently installed). from pandas.core.computation.check import NUMEXPR_INSTALLED
# --- Load and join the MovieLens data ---------------------------------------
# Movie catalogue, indexed by movieId so titles can be looked up by item id.
moviesDF = pd.read_csv("movies.csv", sep=",")
moviesDF["movieId"] = moviesDF["movieId"].astype(int)
moviesDF.set_index("movieId", inplace=True)

# Ratings; LensKit requires the column names "user", "item", "rating".
df = pd.read_csv("ratings.csv", sep=",")
df.columns = ["user", "item", "rating", "timestamp"]

# Raw popularity of a movie = fraction of all users who rated it.
totalUsers = df["user"].unique().shape[0]
ratingCounts = df.groupby("item")["user"].count() / totalUsers
moviesDF["RatingCount"] = ratingCounts
moviesDF.fillna(0, inplace=True)  # movies with no ratings get popularity 0

# Annotate every rating row with the movie's title and raw popularity.
df["movieTitle"] = moviesDF.title.loc[df.item].values
df["RawMoviePopularity"] = moviesDF.RatingCount.loc[df.item].values
df.head()
user | item | rating | timestamp | movieTitle | RawMoviePopularity | |
---|---|---|---|---|---|---|
0 | 1 | 1 | 4.0 | 964982703 | Toy Story (1995) | 0.352459 |
1 | 1 | 3 | 4.0 | 964981247 | Grumpier Old Men (1995) | 0.085246 |
2 | 1 | 6 | 4.0 | 964982224 | Heat (1995) | 0.167213 |
3 | 1 | 47 | 5.0 | 964983815 | Seven (a.k.a. Se7en) (1995) | 0.332787 |
4 | 1 | 50 | 5.0 | 964982931 | Usual Suspects, The (1995) | 0.334426 |
top_k = 10  # length of each recommendation list

# One non-personalized popularity baseline plus three personalized models,
# all with modest hyper-parameters.
pop_alg = basic.PopScore(score_method="quantile")
iKNN = item_knn.ItemItem(nnbrs=10)
uKNN = user_knn.UserUser(nnbrs=10)
funkSVD = funksvd.FunkSVD(features=20, iterations=10, lrate=0.01)

named_algs = [
    ("pop_alg", pop_alg),
    ("iKNN", iKNN),
    ("uKNN", uKNN),
    ("funkSVD", funkSVD),
]
algs = [alg for _, alg in named_algs]
anames = [name for name, _ in named_algs]
# partition_users with 1 partition yields a single (train, test) fold where
# 20% of each test user's ratings are held out; take that fold directly.
train, test = next(iter(xf.partition_users(df, 1, xf.SampleFrac(0.2))))
print(train.shape, test.shape)
(80672, 6) (20164, 6)
# Fit every algorithm on the training fold. Each one is cloned first (so the
# pristine configured instance is kept) and wrapped in a top-N Recommender
# adapter so that all of them expose the same recommend() interface.
trained_algs = []
for alg in algs:
    recommender = Recommender.adapt(util.clone(alg))
    recommender.fit(train)
    trained_algs.append(recommender)
trained_algs
Numba is using threading layer omp - consider TBB
BLAS using multiple threads - can cause oversubscription
found 2 potential runtime problems - see https://boi.st/lkpy-perf
C:\Users\lpesk\AppData\Roaming\Python\Python38\site-packages\lenskit\algorithms\item_knn.py:119: NumbaTypeSafetyWarning: unsafe cast from uint64 to int64. Precision may be lost.
b = blocks[bi]
[<lenskit.algorithms.ranking.TopN at 0x249c3092a00>, <lenskit.algorithms.ranking.TopN at 0x249a3740790>, <lenskit.algorithms.ranking.TopN at 0x249c6494ac0>, <lenskit.algorithms.ranking.TopN at 0x249c68848b0>]
# Produce top-k recommendation lists for every test user with each trained
# model, annotating each list with the algorithm name, movie titles, and
# raw movie popularity for later inspection.
users = test.user.unique()
all_recs = []
for name, model in zip(anames, trained_algs):
    recs = batch.recommend(model, users, top_k)
    recs["Algorithm"] = name
    recs["movieTitle"] = moviesDF.title.loc[recs.item].values
    recs["RawMoviePopularity"] = moviesDF.RatingCount.loc[recs.item].values
    all_recs.append(recs)
all_recs = pd.concat(all_recs, ignore_index=True)
all_recs.tail()
item | score | user | rank | Algorithm | movieTitle | RawMoviePopularity | |
---|---|---|---|---|---|---|---|
24385 | 1204 | 4.511610 | 610 | 6 | funkSVD | Lawrence of Arabia (1962) | 0.073770 |
24386 | 951 | 4.497771 | 610 | 7 | funkSVD | His Girl Friday (1940) | 0.022951 |
24387 | 1178 | 4.493613 | 610 | 8 | funkSVD | Paths of Glory (1957) | 0.019672 |
24388 | 720 | 4.492577 | 610 | 9 | funkSVD | Wallace & Gromit: The Best of Aardman Animatio... | 0.044262 |
24389 | 3508 | 4.481829 | 610 | 10 | funkSVD | Outlaw Josey Wales, The (1976) | 0.029508 |
# Score the recommendation lists against the held-out test ratings using
# hit rate and DCG, computed per (algorithm, user) pair.
rla = topn.RecListAnalysis()
for metric in (topn.hit, topn.dcg):
    rla.add_metric(metric)
results = rla.compute(
    all_recs[["item", "score", "user", "rank", "Algorithm"]], test
)
results.head()
nrecs | hit | dcg | ||
---|---|---|---|---|
Algorithm | user | |||
pop_alg | 1 | 10 | 1.0 | 12.905686 |
2 | 10 | 0.0 | 0.000000 | |
3 | 10 | 0.0 | 0.000000 | |
4 | 10 | 1.0 | 0.630930 | |
5 | 10 | 1.0 | 1.261860 |
# Average the evaluation metrics per algorithm. Select the metric columns
# explicitly: without this, groupby().mean() also averages the `user` id
# column, which is meaningless (it showed up as ~305.5 for every algorithm).
results.reset_index().groupby("Algorithm")[["nrecs", "hit", "dcg"]].mean()
user | nrecs | hit | dcg | |
---|---|---|---|---|
Algorithm | ||||
funkSVD | 305.500000 | 10.0 | 0.314754 | 1.477916 |
iKNN | 305.500000 | 10.0 | 0.009836 | 0.025508 |
pop_alg | 305.500000 | 10.0 | 0.629508 | 3.975884 |
uKNN | 305.914614 | 10.0 | 0.009852 | 0.018492 |
# Popularity bias check: compare the mean popularity of the movies inside
# user profiles against the mean popularity over the whole catalogue.
profilePopularityPerUser = train.groupby("user")["RawMoviePopularity"].mean()
averageProfilePopularity = profilePopularityPerUser.mean()
averageProfilePopularity
averageMoviePopularity = ratingCounts.mean()
(averageProfilePopularity, averageMoviePopularity)
(0.13084868707489364, 0.016999683055613626)
User profiles themselves are already considerably biased towards popular movies — i.e., there is a large "long tail" of not-very-popular movies that appear in profiles far less often than their share of the catalogue would suggest.