Labs 4: Biases and fairness in your favorite algorithms¶

  • Let's expand your work on recommender systems frameworks, this time focusing on evaluating the biases and/or fairness of a selection of recommendation algorithms

Task 0: evaluate the performance of RS¶

Let's start with what you should already have:

  • select your favorite RS framework (it can be the same one you have worked with so far, or a different one), a set of at least two recommendation algorithms, and an arbitrary recommendation dataset.
  • train (ideally, fine-tune) the recommendation algorithms and evaluate them w.r.t. some accuracy metric (e.g., precision@top-k, DCG, MAP, ...)
  • note the results

Task 1: to what extent do individual algorithms suffer from popularity bias?¶

  • expand the evaluation to incorporate a notion of popularity bias. Probably the simplest metric is the popularity lift defined in https://arxiv.org/pdf/2303.00400: $$lift = \frac{AP_{rec}(U) - AP_p(U)}{AP_p(U)},$$ where $AP_{rec}(U)$ denotes the mean popularity of recommended items and $AP_p(U)$ the mean popularity of items in user profiles.
  • Is the best recommendation algorithm from Task 0 also the one with the highest popularity lift?
  • alternatively, if your framework includes any other notion of popularity bias, feel free to use it
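As a sanity check of the lift formula, here is a minimal stand-alone sketch; all item ids, popularity values, profiles, and recommendation lists below are made up for illustration:

```python
import pandas as pd

# Hypothetical per-item popularity: fraction of users who rated the item.
popularity = pd.Series({1: 0.35, 2: 0.08, 3: 0.17, 4: 0.33})

# Toy user profiles and toy top-k recommendation lists (item ids only).
profiles = {"u1": [1, 2], "u2": [2, 3, 4]}
recs = {"u1": [1, 4], "u2": [1, 3]}

def mean_popularity(item_lists):
    # Average item popularity per user, then average over users.
    per_user = [popularity.loc[items].mean() for items in item_lists.values()]
    return sum(per_user) / len(per_user)

ap_p = mean_popularity(profiles)   # AP_p(U): popularity in user profiles
ap_rec = mean_popularity(recs)     # AP_rec(U): popularity of recommendations
lift = (ap_rec - ap_p) / ap_p
print(ap_p, ap_rec, lift)          # positive lift = recs more popular than profiles
```

A positive lift means the recommender amplifies the popularity already present in the profiles; zero means it merely mirrors it.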

Task 2: de-biased evaluation¶

  • use the inverse propensity score (IPS) as defined in https://dl.acm.org/doi/pdf/10.1145/3240323.3240355. In particular, discount every "hit" in the recommendations by the item's propensity, $POP_i^{(\gamma+1)/2}$ (i.e., divide the hit's contribution by this factor). Try different values of $\gamma$; the typical range is 0-4.
  • When popularity de-biasing is employed, does the ordering of best-performing algorithms change?
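A minimal sketch of such IPS-weighted hit counting, assuming the propensity of observing feedback on item $i$ is its popularity raised to $(\gamma+1)/2$; the item popularities and hit flags below are hypothetical:

```python
import pandas as pd

# Hypothetical per-item popularity and one user's top-k list with relevance flags.
item_pop = pd.Series({10: 0.40, 20: 0.05, 30: 0.20})
recs = pd.DataFrame({"item": [10, 20, 30], "hit": [1, 0, 1]})

def ips_hit_score(recs, item_pop, gamma):
    # Assumed propensity of item i: popularity^((gamma+1)/2).
    # Each hit is up-weighted by the inverse of its propensity, so hits on
    # unpopular items count more than hits on blockbusters.
    propensity = item_pop.loc[recs["item"]].values ** ((gamma + 1) / 2)
    return float((recs["hit"].values / propensity).sum())

for gamma in [0, 1, 2, 4]:
    print(gamma, ips_hit_score(recs, item_pop, gamma))
```

Larger $\gamma$ makes the propensity more skewed towards popular items and thus penalizes popularity-heavy recommenders more strongly.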

Task 3: fairness w.r.t. subgroups¶

  • Can your recommendation algorithms provide equally good recommendations for various subgroups of your users? Let's try it on something simple: frequent users vs. occasional users.
  • Group your users by the volume of feedback they provided, e.g., as the top 20% (frequent users) and the bottom 50% (occasional users).
  • Split the data and evaluate for both groups separately.
  • Are the recommendations equally good for both subgroups?
  • Are the same recommending algorithms best for both subgroups?
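The grouping step above could be sketched as follows; the interaction log is a toy stand-in for your dataset, and the 20%/50% cut-offs are the ones suggested above:

```python
import pandas as pd

# Toy interaction log: one row per rating; user 1 is heavy, users 4 and 5 are light.
df = pd.DataFrame({
    "user": [1, 1, 1, 1, 1, 2, 2, 2, 3, 3, 4, 5],
    "item": list(range(12)),
})

# Rank users by the volume of feedback they provided.
counts = df.groupby("user").size().sort_values(ascending=False)
n_users = len(counts)
frequent = set(counts.index[: max(1, int(0.2 * n_users))])    # top 20%
occasional = set(counts.index[-max(1, int(0.5 * n_users)):])  # bottom 50%

# Evaluate each subgroup on its own slice of the data.
freq_df = df[df["user"].isin(frequent)]
occ_df = df[df["user"].isin(occasional)]
print(frequent, occasional)
```

With the real dataset, you would then run the same accuracy evaluation separately on `freq_df` and `occ_df` and compare the per-group results.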

Instructions: complete Task 0 first and then focus either on Tasks 1 & 2 or on Task 3. Leave the last 15 minutes for reporting on the achieved results.

Task 0: basic RS performance¶

In the subsequent code, we assume the LensKit framework, the MovieLens latest-small dataset, and the Popularity, ItemKNN, UserKNN, and FunkSVD recommendation algorithms.

In [1]:
from lenskit import batch, topn, util
from lenskit import crossfold as xf
from lenskit.algorithms import Recommender, funksvd, item_knn, user_knn, basic

import pandas as pd
In [2]:
moviesDF = pd.read_csv("movies.csv", sep=",")
moviesDF.movieId = moviesDF.movieId.astype(int)
moviesDF.set_index("movieId", inplace=True)

df = pd.read_csv("ratings.csv", sep=",")
df.columns = ["user","item","rating","timestamp"] # LensKit requires "user", "item", "rating" column names

ratingCounts = df.groupby("item")["user"].count()/df.user.unique().shape[0] # fraction of users who rated the movie
moviesDF["RatingCount"] = ratingCounts
moviesDF.fillna(0, inplace=True)


movieTitles = moviesDF.title.loc[df.item]
df["movieTitle"] = movieTitles.values

ratingCount = moviesDF.RatingCount.loc[df.item]
df["RawMoviePopularity"] = ratingCount.values
df.head()
Out[2]:
user item rating timestamp movieTitle RawMoviePopularity
0 1 1 4.0 964982703 Toy Story (1995) 0.352459
1 1 3 4.0 964981247 Grumpier Old Men (1995) 0.085246
2 1 6 4.0 964982224 Heat (1995) 0.167213
3 1 47 5.0 964983815 Seven (a.k.a. Se7en) (1995) 0.332787
4 1 50 5.0 964982931 Usual Suspects, The (1995) 0.334426
In [12]:
top_k = 10

pop_alg = basic.PopScore(score_method='quantile')
iKNN = item_knn.ItemItem(nnbrs = 10, feedback="implicit")
uKNN = user_knn.UserUser(nnbrs = 10, feedback="implicit")
funkSVD = funksvd.FunkSVD(features = 50, iterations = 5, lrate=0.01)
algs = [pop_alg, iKNN, uKNN, funkSVD]
anames = ["pop_alg", "iKNN", "uKNN", "funkSVD"]
In [13]:
for train,test in xf.partition_users(df, 1, xf.SampleFrac(0.2)): # per-user split: 20% of each user's ratings held out as test
    print(train.shape, test.shape)
    break #to simplify things, only focus on one partition
(80672, 6) (20164, 6)
In [14]:
trained_algs = []
for a in algs:
    a_clone = util.clone(a)
    aR = Recommender.adapt(a_clone)
    aR.fit(train)
    trained_algs.append(aR)
trained_algs
Out[14]:
[<lenskit.algorithms.ranking.TopN at 0x1bcae324640>,
 <lenskit.algorithms.ranking.TopN at 0x1bc9ea4b730>,
 <lenskit.algorithms.ranking.TopN at 0x1bcae326440>,
 <lenskit.algorithms.ranking.TopN at 0x1bcae3247c0>]
In [15]:
all_recs = []
users = test.user.unique()
for (a,name) in zip(trained_algs,anames):
    recs_a = batch.recommend(a, users, top_k, n_jobs=1)
    recs_a["Algorithm"] = name
    
    movieTitles = moviesDF.title.loc[recs_a.item]
    recs_a["movieTitle"] = movieTitles.values

    ratingCount = moviesDF.RatingCount.loc[recs_a.item]
    recs_a["RawMoviePopularity"] = ratingCount.values
    
    all_recs.append(recs_a)
    
all_recs = pd.concat(all_recs, ignore_index=True)
all_recs.tail()      
Out[15]:
item score user rank Algorithm movieTitle RawMoviePopularity
24395 3030 4.551667 610 6 funkSVD Yojimbo (1961) 0.021311
24396 6460 4.542326 610 7 funkSVD Trial, The (Procès, Le) (1962) 0.008197
24397 720 4.540106 610 8 funkSVD Wallace & Gromit: The Best of Aardman Animatio... 0.044262
24398 1262 4.539869 610 9 funkSVD Great Escape, The (1963) 0.070492
24399 92535 4.498750 610 10 funkSVD Louis C.K.: Live at the Beacon Theater (2011) 0.016393
In [16]:
rla = topn.RecListAnalysis()
rla.add_metric(topn.hit)
rla.add_metric(topn.dcg)

results = rla.compute(all_recs[["item","score","user","rank","Algorithm"]], test)
results.head()
Out[16]:
nrecs hit dcg
Algorithm user
pop_alg 1 10 1.0 11.21530
2 10 0.0 0.00000
3 10 0.0 0.00000
4 10 1.0 0.63093
5 10 1.0 3.90309
In [17]:
results.reset_index().groupby("Algorithm").mean()
Out[17]:
user nrecs hit dcg
Algorithm
funkSVD 305.5 10.0 0.170492 0.502527
iKNN 305.5 10.0 0.685246 4.374455
pop_alg 305.5 10.0 0.631148 3.845440
uKNN 305.5 10.0 0.795082 5.530651

Task 0 results:¶

  • It seems that simple popularity-based recommendation performed comparably to iKNN, while uKNN was best overall; FunkSVD did not converge well enough (note that no hyperparameter tuning was performed)

Task 1: Evaluating popularity bias¶

In [18]:
# average popularity of items in user profiles (per user, then over users)
perUserAveragePopularity = train.groupby("user")["RawMoviePopularity"].mean()
averageProfilePopularity = perUserAveragePopularity.mean()

averageMoviePopularity = ratingCounts.mean()
(averageProfilePopularity, averageMoviePopularity)
Out[18]:
(0.13122333740265585, 0.016999683055613626)

Observation 1:¶

User profiles themselves are already considerably biased towards popular movies: the average popularity in profiles (0.131) is almost 8x the catalogue average (0.017), i.e., the long tail of less popular movies is strongly underrepresented in user profiles.
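One way to continue is to compare the mean popularity of each algorithm's recommendations against this profile baseline to obtain the lift. A sketch, where `all_recs` and `averageProfilePopularity` are small made-up stand-ins for the notebook's variables of the same names:

```python
import pandas as pd

# Made-up stand-ins for the notebook's `all_recs` and `averageProfilePopularity`.
all_recs = pd.DataFrame({
    "Algorithm": ["pop_alg", "pop_alg", "iKNN", "iKNN"],
    "RawMoviePopularity": [0.40, 0.30, 0.15, 0.05],
})
averageProfilePopularity = 0.131

# Popularity lift per algorithm: (AP_rec - AP_p) / AP_p.
ap_rec = all_recs.groupby("Algorithm")["RawMoviePopularity"].mean()
lift = (ap_rec - averageProfilePopularity) / averageProfilePopularity
print(lift.sort_values(ascending=False))
```

(With equal-length recommendation lists per user, as here with top-k = 10, the flat mean over all rows equals the per-user average; otherwise average per user first.)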
