Let's start with what you should already have:
Instructions: complete Task 0 and then focus either on Tasks 1 & 2, or on Task 3. Leave last 15 minutes for reporting on the achieved results.
In the following code, we assume use of the LensKit framework, the MovieLens-latest-small dataset, and the Popularity, ItemKNN, UserKNN, and FunkSVD recommendation algorithms.
from lenskit import batch, topn, util
from lenskit import crossfold as xf
from lenskit.algorithms import Recommender, funksvd, item_knn, user_knn, basic
from lenskit import topn
import pandas as pd
C:\Users\lpesk\AppData\Roaming\Python\Python38\site-packages\pandas\core\computation\expressions.py:20: UserWarning: Pandas requires version '2.7.3' or newer of 'numexpr' (version '2.7.1' currently installed). from pandas.core.computation.check import NUMEXPR_INSTALLED
# --- Load and join the MovieLens data ---------------------------------------
# Movie catalogue, indexed by movieId so titles can be looked up by item id.
moviesDF = pd.read_csv("movies.csv", sep=",")
moviesDF["movieId"] = moviesDF["movieId"].astype(int)
moviesDF.set_index("movieId", inplace=True)

# Ratings; LensKit requires the column names "user", "item", "rating".
df = pd.read_csv("ratings.csv", sep=",")
df.columns = ["user", "item", "rating", "timestamp"]

# Raw popularity of a movie = fraction of all users who rated it.
totalUsers = df["user"].unique().shape[0]
ratingCounts = df.groupby("item")["user"].count() / totalUsers
moviesDF["RatingCount"] = ratingCounts
moviesDF.fillna(0, inplace=True)  # movies with no ratings get popularity 0

# Annotate every rating row with the movie's title and raw popularity.
df["movieTitle"] = moviesDF.title.loc[df.item].values
df["RawMoviePopularity"] = moviesDF.RatingCount.loc[df.item].values
df.head()
user | item | rating | timestamp | movieTitle | RawMoviePopularity | |
---|---|---|---|---|---|---|
0 | 1 | 1 | 4.0 | 964982703 | Toy Story (1995) | 0.352459 |
1 | 1 | 3 | 4.0 | 964981247 | Grumpier Old Men (1995) | 0.085246 |
2 | 1 | 6 | 4.0 | 964982224 | Heat (1995) | 0.167213 |
3 | 1 | 47 | 5.0 | 964983815 | Seven (a.k.a. Se7en) (1995) | 0.332787 |
4 | 1 | 50 | 5.0 | 964982931 | Usual Suspects, The (1995) | 0.334426 |
top_k = 10  # length of each recommendation list

# One non-personalized popularity baseline plus three personalized models,
# all with modest hyper-parameters.
pop_alg = basic.PopScore(score_method="quantile")
iKNN = item_knn.ItemItem(nnbrs=10)
uKNN = user_knn.UserUser(nnbrs=10)
funkSVD = funksvd.FunkSVD(features=20, iterations=10, lrate=0.01)

named_algs = [
    ("pop_alg", pop_alg),
    ("iKNN", iKNN),
    ("uKNN", uKNN),
    ("funkSVD", funkSVD),
]
algs = [alg for _, alg in named_algs]
anames = [name for name, _ in named_algs]
# partition_users with 1 partition yields a single (train, test) fold where
# 20% of each test user's ratings are held out; take that fold directly.
train, test = next(iter(xf.partition_users(df, 1, xf.SampleFrac(0.2))))
print(train.shape, test.shape)
(80672, 6) (20164, 6)
# Fit every algorithm on the training fold. Each one is cloned first (so the
# pristine configured instance is kept) and wrapped in a top-N Recommender
# adapter so that all of them expose the same recommend() interface.
trained_algs = []
for alg in algs:
    recommender = Recommender.adapt(util.clone(alg))
    recommender.fit(train)
    trained_algs.append(recommender)
trained_algs
Numba is using threading layer omp - consider TBB
BLAS using multiple threads - can cause oversubscription
found 2 potential runtime problems - see https://boi.st/lkpy-perf
C:\Users\lpesk\AppData\Roaming\Python\Python38\site-packages\lenskit\algorithms\item_knn.py:119: NumbaTypeSafetyWarning: unsafe cast from uint64 to int64. Precision may be lost.
b = blocks[bi]
[<lenskit.algorithms.ranking.TopN at 0x249c3092a00>, <lenskit.algorithms.ranking.TopN at 0x249a3740790>, <lenskit.algorithms.ranking.TopN at 0x249c6494ac0>, <lenskit.algorithms.ranking.TopN at 0x249c68848b0>]
# Produce top-k recommendation lists for every test user with each trained
# model, annotating each list with the algorithm name, movie titles, and
# raw movie popularity for later inspection.
users = test.user.unique()
all_recs = []
for name, model in zip(anames, trained_algs):
    recs = batch.recommend(model, users, top_k)
    recs["Algorithm"] = name
    recs["movieTitle"] = moviesDF.title.loc[recs.item].values
    recs["RawMoviePopularity"] = moviesDF.RatingCount.loc[recs.item].values
    all_recs.append(recs)
all_recs = pd.concat(all_recs, ignore_index=True)
all_recs.tail()
item | score | user | rank | Algorithm | movieTitle | RawMoviePopularity | |
---|---|---|---|---|---|---|---|
24385 | 1204 | 4.511610 | 610 | 6 | funkSVD | Lawrence of Arabia (1962) | 0.073770 |
24386 | 951 | 4.497771 | 610 | 7 | funkSVD | His Girl Friday (1940) | 0.022951 |
24387 | 1178 | 4.493613 | 610 | 8 | funkSVD | Paths of Glory (1957) | 0.019672 |
24388 | 720 | 4.492577 | 610 | 9 | funkSVD | Wallace & Gromit: The Best of Aardman Animatio... | 0.044262 |
24389 | 3508 | 4.481829 | 610 | 10 | funkSVD | Outlaw Josey Wales, The (1976) | 0.029508 |
# Score the recommendation lists against the held-out test ratings using
# hit rate and DCG, computed per (algorithm, user) pair.
rla = topn.RecListAnalysis()
for metric in (topn.hit, topn.dcg):
    rla.add_metric(metric)
results = rla.compute(
    all_recs[["item", "score", "user", "rank", "Algorithm"]], test
)
results.head()
nrecs | hit | dcg | ||
---|---|---|---|---|
Algorithm | user | |||
pop_alg | 1 | 10 | 1.0 | 12.905686 |
2 | 10 | 0.0 | 0.000000 | |
3 | 10 | 0.0 | 0.000000 | |
4 | 10 | 1.0 | 0.630930 | |
5 | 10 | 1.0 | 1.261860 |
# Average the evaluation metrics per algorithm. Select the metric columns
# explicitly: without this, groupby().mean() also averages the `user` id
# column, which is meaningless (it showed up as ~305.5 for every algorithm).
results.reset_index().groupby("Algorithm")[["nrecs", "hit", "dcg"]].mean()
user | nrecs | hit | dcg | |
---|---|---|---|---|
Algorithm | ||||
funkSVD | 305.500000 | 10.0 | 0.314754 | 1.477916 |
iKNN | 305.500000 | 10.0 | 0.009836 | 0.025508 |
pop_alg | 305.500000 | 10.0 | 0.629508 | 3.975884 |
uKNN | 305.914614 | 10.0 | 0.009852 | 0.018492 |
# Popularity bias check: compare the mean popularity of the movies inside
# user profiles against the mean popularity over the whole catalogue.
profilePopularityPerUser = train.groupby("user")["RawMoviePopularity"].mean()
averageProfilePopularity = profilePopularityPerUser.mean()
averageProfilePopularity
averageMoviePopularity = ratingCounts.mean()
(averageProfilePopularity, averageMoviePopularity)
(0.13084868707489364, 0.016999683055613626)
User profiles themselves are already considerably biased towards popular movies — i.e., there is a large "long tail" of not-very-popular movies that appear in profiles far less often than their share of the catalogue would suggest.