# Download the ml-100k archive quietly (-q) to movie-lense.zip and, if that
# succeeds, unpack it; `unzip -o` overwrites existing files without prompting.
! wget -q http://files.grouplens.org/datasets/movielens/ml-100k.zip -O movie-lense.zip\
    && unzip -o movie-lense.zip

Archive:  movie-lense.zip
  inflating: ml-100k/allbut.pl       
  inflating: ml-100k/mku.sh          
  inflating: ml-100k/README          
  inflating: ml-100k/u.data          
  inflating: ml-100k/u.genre         
  inflating: ml-100k/u.info          
  inflating: ml-100k/u.item          
  inflating: ml-100k/u.occupation    
  inflating: ml-100k/u.user          
  inflating: ml-100k/u1.base         
  inflating: ml-100k/u1.test         
  inflating: ml-100k/u2.base         
  inflating: ml-100k/u2.test         
  inflating: ml-100k/u3.base         
  inflating: ml-100k/u3.test         
  inflating: ml-100k/u4.base         
  inflating: ml-100k/u4.test         
  inflating: ml-100k/u5.base         
  inflating: ml-100k/u5.test         
  inflating: ml-100k/ua.base         
  inflating: ml-100k/ua.test         
  inflating: ml-100k/ub.base         
  inflating: ml-100k/ub.test

!head ml-100k/u.data

196	242	3	881250949
186	302	3	891717742
22	377	1	878887116
244	51	2	880606923
166	346	1	886397596
298	474	4	884182806
115	265	2	881171488
253	465	5	891628467
305	451	3	886324817
6	86	3	883603013

!head -n7 ml-100k/u.item

1|Toy Story (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)|0|0|0|1|1|1|0|0|0|0|0|0|0|0|0|0|0|0|0
2|GoldenEye (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?GoldenEye%20(1995)|0|1|1|0|0|0|0|0|0|0|0|0|0|0|0|0|1|0|0
3|Four Rooms (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Four%20Rooms%20(1995)|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|1|0|0
4|Get Shorty (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Get%20Shorty%20(1995)|0|1|0|0|0|1|0|0|1|0|0|0|0|0|0|0|0|0|0
5|Copycat (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Copycat%20(1995)|0|0|0|0|0|0|1|0|1|0|0|0|0|0|0|0|1|0|0
6|Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)|01-Jan-1995||http://us.imdb.com/Title?Yao+a+yao+yao+dao+waipo+qiao+(1995)|0|0|0|0|0|0|0|0|1|0|0|0|0|0|0|0|0|0|0
7|Twelve Monkeys (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Twelve%20Monkeys%20(1995)|0|0|0|0|0|0|0|0|1|0|0|0|0|0|0|1|0|0|0

!head ml-100k/u.user

1|24|M|technician|85711
2|53|F|other|94043
3|23|M|writer|32067
4|24|M|technician|43537
5|33|F|other|15213
6|42|M|executive|98101
7|57|M|administrator|91344
8|36|M|administrator|05201
9|29|M|student|01002
10|53|M|lawyer|90703

#| echo: false
! python -m site

sys.path = [
    '/home/jovyan/work/lecture-slides',
    '/opt/conda/lib/python311.zip',
    '/opt/conda/lib/python3.11',
    '/opt/conda/lib/python3.11/lib-dynload',
    '/opt/conda/lib/python3.11/site-packages',
]
USER_BASE: '/home/jovyan/.local' (exists)
USER_SITE: '/home/jovyan/.local/lib/python3.11/site-packages' (doesn't exist)
ENABLE_USER_SITE: True

#| code-fold: true
#| code-summary: "Reloading if module movielense.py changes"
%load_ext autoreload
%autoreload 2

%%writefile movielense.py
import numpy as np
import pandas as pd
from pathlib import Path

Overwriting movielense.py

%%writefile -a movielense.py
def import_users(user_filename):
    """
    Imports MovieLens user data into a Pandas DataFrame

    user_filename: location of the pipe-separated user file, e.g.
        `ml-100k/u.user` from
        http://files.grouplens.org/datasets/movielens/ml-100k.zip

    Returns a DataFrame indexed by `user id` with the columns `age` (int),
    `gender`, `occupation` and `zip code` (all str). Zip codes are kept as
    strings, so values such as `05201` keep their leading zero.

    Lines with fewer than five fields are dropped. A file without any data
    lines yields an empty DataFrame. A line with more than five fields
    makes the DataFrame constructor raise a ValueError.
    """
    column_names = 'user id | age | gender | occupation | zip code'.split(' | ')

    # Skip blank lines (e.g. the one after the trailing newline) up front, so
    # that a file without data gives an empty frame instead of a shape error.
    users_list = [
        line.split('|')
        for line in Path(user_filename).read_text().split('\n')
        if line
    ]

    # Short rows are padded with None by the constructor; dropna() removes them.
    return pd.DataFrame(
        users_list,
        columns=column_names,
    ).dropna().astype(
        {'user id': 'int', 'age': 'int', 'gender': 'str', 'occupation': 'str', 'zip code': 'str'}
    ).set_index('user id')

Appending to movielense.py

%%writefile -a movielense.py
def import_movies(movies_filename):
    """
    Imports MovieLens movies data into a Pandas DataFrame

    movies_filename: location of the pipe-separated movie file, read as
        ISO-8859-1, e.g. `ml-100k/u.item` from
        http://files.grouplens.org/datasets/movielens/ml-100k.zip

    Returns a DataFrame indexed by `movie id` with the columns
    `movie title` and `IMDb URL` (str), `release date` and
    `video release date` (datetime64[ns]) and one boolean column per genre
    flag (`unknown` ... `Western`).

    Lines with fewer fields than columns are dropped. A file without any
    data lines yields an empty DataFrame.
    """
    # Leading descriptive fields and the dtype each one is converted to.
    info_types = {
        'movie id': 'int',
        'movie title': 'str',
        'release date': 'datetime64[ns]',
        'video release date': 'datetime64[ns]',
        'IMDb URL': 'str',
    }
    # Trailing fields: one 0/1 flag per genre.
    genre_columns = (
        'unknown | Action | Adventure | Animation | '
        'Children\'s | Comedy | Crime | Documentary | Drama | Fantasy | '
        'Film-Noir | Horror | Musical | Mystery | Romance | Sci-Fi | '
        'Thriller | War | Western'
    ).split(' | ')

    # Skip blank lines (e.g. the one after the trailing newline) up front, so
    # that a file without data gives an empty frame instead of a shape error.
    text = Path(movies_filename).read_text(encoding="ISO-8859-1")
    movies_list = [line.split('|') for line in text.split('\n') if line]

    # Short rows are padded with None by the constructor; dropna() removes them.
    movies = pd.DataFrame(
        movies_list,
        columns=list(info_types) + genre_columns,
    ).dropna()

    # The flags arrive as the strings '0'/'1'. Convert them to int before
    # bool, because in Python the non-empty string '0' is truthy.
    flags_as_int = dict.fromkeys(genre_columns, int)
    flags_as_bool = dict.fromkeys(genre_columns, bool)

    return (
        movies
        .astype({**info_types, **flags_as_int})
        .astype(flags_as_bool)
        .set_index('movie id')
    )

Appending to movielense.py

%%writefile -a movielense.py
def import_ratings(data_filename, movies=None):
    """
    Imports MovieLens ratings data into a Pandas DataFrame

    data_filename: location of the tab-separated ratings file, e.g.
        `ml-100k/u.data` from
        http://files.grouplens.org/datasets/movielens/ml-100k.zip
    movies: optional DataFrame resulting from `import_movies`; if given, its
        `movie title` column is looked up by `movie id` and appended to the
        index

    Returns a DataFrame with the single integer column `rating`, indexed by
    (`user id`, `movie id`) or, when `movies` is given, by
    (`user id`, `movie id`, `movie title`). The `item id` field of the file
    is exposed as `movie id`. The timestamp field is not part of the result,
    so it is discarded without being parsed.

    Lines with fewer than four fields are dropped. A file without any data
    lines yields an empty DataFrame.
    """
    # Skip blank lines (e.g. the one after the trailing newline) up front, so
    # that a file without data gives an empty frame instead of a shape error.
    ratings_list = [
        line.split('\t')
        for line in Path(data_filename).read_text().split('\n')
        if line
    ]

    # dropna() runs before the timestamp is discarded so that rows lacking
    # that field are still rejected as incomplete.
    ratings = pd.DataFrame(
        ratings_list,
        columns='user id | item id | rating | timestamp'.split(' | ')
    ).dropna().drop(columns=['timestamp']).astype(
        {'user id': 'int', 'item id': 'int', 'rating': 'int'}
    ).rename(columns={'item id': 'movie id'}).set_index(['user id', 'movie id'])

    if movies is not None:
        ratings = ratings.join(movies['movie title'], on='movie id').set_index('movie title', append=True)

    return ratings

Appending to movielense.py

import pandas as pd
import altair as alt
import numpy as np
import movielense as ml

# Load the three MovieLens tables through the helper module.
# Passing `movies` to import_ratings adds the movie title to the ratings index.
users = ml.import_users('ml-100k/u.user')
movies = ml.import_movies('ml-100k/u.item')
ratings = ml.import_ratings('ml-100k/u.data', movies)

users.head()

movies.head(3)

ratings.head()

# Pivot the long ratings table into a matrix: `user id` moves from the row
# index to the columns, leaving one row per (movie id, movie title) and NaN
# wherever a user has not rated a movie.
R_all = ratings.unstack(['user id'])
R_all

#| code-fold: true
#| code-summary: "Create a small subset of data"
I = 16  # number of users (columns) kept in the subset
M = 15  # number of movies (rows) kept in the subset

# retrieve movies/users combination that is not *too* sparse:
# rank users (axis=0, one sum per column) and movies (axis=1, one sum per row)
# by the sum of their ratings, take the 70 largest of each and keep the last
# I users / M movies of those 70
top_users = R_all.agg('sum', axis=0).nlargest(70).tail(I).index
top_movies = R_all.agg('sum', axis=1).nlargest(70).tail(M).index

# M x I ratings matrix used for the rest of the notebook
R = R_all.loc[top_movies, top_users]
R

#| code-fold: true
#| code-summary: "Visualize missing rating values"

def long(x):
    """Return `x` in long format: `x.stack()` followed by `reset_index()`.

    NOTE(review): pandas >= 2.1 emits a FutureWarning for this `stack()` call.
    Adopting `future_stack=True` presumably changes how missing values and
    row order are handled, so check the charts before switching.
    """
    stacked = x.stack()
    return stacked.reset_index()

# https://altair-viz.github.io/user_guide/encoding.html#encoding-data-types
# one rectangle per rating present in the long table
alt.Chart(long(R)).mark_rect().encode(
    x='user id:O',
    y='movie title:O',
    color=alt.Color('rating:O', scale=alt.Scale(scheme='yellowgreenblue')),
)

/tmp/ipykernel_2933879/2524743408.py:4: FutureWarning: The previous implementation of stack is deprecated and will be removed in a future version of pandas. See the What's New notes for pandas 2.1.0 for details. Specify future_stack=True to adopt the new implementation and silence this warning.
  long = lambda x: x.stack().reset_index()

#| code-fold: show
# number of latent factors
K = 3

# initialize U and V with random values
# (the seed is fixed, so the same starting values are drawn on every run)
np.random.seed(42)

# one row of K factors per user (U: I x K) and per movie (V: M x K),
# drawn uniformly between 0 and 1
U = np.random.uniform(0, 1, size=K*I).reshape((I, K))
V = np.random.uniform(0, 1, size=K*M).reshape((M, K))

# zero-filled stand-ins for the "previous" factors, used later to measure
# how much U and V change
Uold = np.zeros_like(U)
Vold = np.zeros_like(V)

U.shape

(16, 3)

#| code-fold: show
# calculate RMSE
def rmse(X, Y):
    """Root mean square of the element-wise difference between X and Y.

    The mean is taken with `nanmean`, so entries whose difference is NaN
    (e.g. unobserved ratings) do not contribute.
    """
    squared_residuals = (X - Y) ** 2
    return np.sqrt(np.nanmean(squared_residuals))

# RMSE history as (iteration, rmse) pairs; entry 0 is the error of the initial
# U and V, where np.inner(V, U) holds one predicted rating per (movie, user)
error = [(0, rmse(R, np.inner(V,U)))]

#| code-fold: show
# calculate maximum magnitude of relative updates
def max_update(X, Y):
    """Largest absolute entry of (X - Y) / Y, i.e. the change relative to Y.

    X and Y must offer `.ravel()` on their quotient (NumPy arrays do).
    There is no guard against zeros in Y.
    """
    relative_change = (X - Y) / Y
    # for a 1-D vector the infinity norm is the maximum absolute value
    return np.linalg.norm(relative_change.ravel(), np.inf)

# update history as (iteration, maximum update) pairs; entry 0 compares the
# zero-filled Uold/Vold with the initial U/V
update = [(0, max(max_update(Uold, U), max_update(Vold, V)))]

#| code-fold: show
rate = 0.1            # learning rate (step size)
max_iterations = 300  # maximum number of sweeps over the observed ratings
threshold = 0.001     # stop once the maximum relative update falls below this

# The observed (movie m, user i) positions do not change between sweeps, so
# locate them once and read the ratings from a plain array.
R_values = R.to_numpy()
observed = list(zip(*np.where(~np.isnan(R_values))))

# t = 0 is the initial state recorded before the loop; run up to
# max_iterations sweeps after it.
for t in range(1, max_iterations + 1):

    for m, i in observed:

        # Gradient step on the squared error of this single rating.
        # The step for V[m] deliberately uses the U[i] that was just updated.
        U[i] = U[i] + rate*V[m]*(R_values[m, i] - np.inner(V[m], U[i]))
        V[m] = V[m] + rate*U[i]*(R_values[m, i] - np.inner(V[m], U[i]))

    # compute error after one sweep of updates
    error.append((t, rmse(R, np.inner(V, U))))

    # keep track of how much U and V changed during this sweep
    largest_change = max(max_update(Uold, U), max_update(Vold, V))
    update.append((t, largest_change))
    Uold = U.copy()
    Vold = V.copy()

    # terminate early once a full sweep barely moves U and V
    if largest_change < threshold:
        break

# NOTE: this rebinds the two history lists to DataFrames, so the cell cannot
# be re-run without first re-running the cells that initialise them.
error = pd.DataFrame(error, columns=['iteration', 'rmse'])
update = pd.DataFrame(update , columns=['iteration', 'maximum update'])

# base encodings for the two history tables; both y-axes use a log10 scale
f_rmse = alt.Chart(error).encode(x='iteration:Q', y=alt.Y('rmse:Q', scale=alt.Scale(type='log', base=10, domain=[0.1, 3])))
# (a linear y-axis variant would encode y='maximum update:Q' directly)
f_update = alt.Chart(update).encode(x='iteration:Q', y=alt.Y('maximum update:Q', scale=alt.Scale(type='log', base=10)))

# side by side: each panel layers a line and filled points over the same encoding
alt.hconcat(
    alt.layer(f_rmse.mark_line(), f_rmse.mark_point(filled=True), title='Root Mean Square Error'),
    alt.layer(f_update.mark_line(), f_update.mark_point(filled=True), title='Maximum Relative Update')
)

#| echo: false
# Rone is DataFrame of all ones with R's structure
Rone = pd.DataFrame().reindex_like(R).replace(np.nan, 1)
# multiplying by Rone copies DataFrame structure
# (Rhat holds a predicted rating for every movie/user pair)
Rhat = np.inner(V, U) * Rone
# predictions kept only where a rating was observed; NaN elsewhere
Rhat_if_obs = Rhat.where(~np.isnan(R), np.nan)

# Put four frames side by side, distinguished by renaming the top-level
# 'rating' column: observed data, fit at observed positions, fit everywhere,
# and the deviation computed as fit minus observed.
R_compare = \
    R.rename(columns={'rating':'observed'})\
    .join(Rhat_if_obs.rename(columns={'rating':'fit'}))\
    .join(Rhat.rename(columns={'rating':'fit/prediction'}))\
    .join((Rhat_if_obs-R).rename(columns={'rating':'deviation'}))

long(R_compare).head(3)

/tmp/ipykernel_2933879/2524743408.py:4: FutureWarning: The previous implementation of stack is deprecated and will be removed in a future version of pandas. See the What's New notes for pandas 2.1.0 for details. Specify future_stack=True to adopt the new implementation and silence this warning.
  long = lambda x: x.stack().reset_index()

#| echo: false

# create new base plot
# (one rectangle per user/movie pair; the charts below only add a title and
# a color encoding on top of it)
base = alt.Chart(long(R_compare)).mark_rect().encode(
    x='user id:O',
    y='movie title:O',
    tooltip=['user id', 'movie title', 'fit/prediction', 'observed', 'deviation']
)

# raw ratings data
# (ordinal color scale over the five rating values)
f_raw = base\
    .properties(title='Ratings Data')\
    .encode(color=alt.Color('observed:O', scale=alt.Scale(scheme='yellowgreenblue', domain=[1,2,3,4,5])))

# fit and predicted ratings
# (quantitative color scale with its domain fixed to [1, 5])
f_all = base\
    .properties(title='Ratings Fit and Predictions')\
    .encode(color=alt.Color('fit/prediction:Q', scale=alt.Scale(scheme='yellowgreenblue', domain=[1, 5])))

# deviation between fit and ratings data
# (the `deviation` column is computed as fit minus observed, so the title has
# to state that order for the diverging color scale to be read correctly)
f_err = base\
    .properties(title='Deviation: Fit - Data')\
    .encode(color=alt.Color('deviation:Q', scale=alt.Scale(scheme='redblue', domain=[-2, 2])))

# point selection driven by mouseover that picks the nearest mark;
# empty=False is set so the condition below is false while nothing is selected
nearest = alt.selection_point(nearest=True, on='mouseover', empty=False) 

# overlay of unfilled black squares whose opacity is 1 for the selected
# user/movie pair and 0 for all others
selectors = base.mark_square(filled=False, size=350).encode(
    x='user id:N',
    y='movie title:N',
    color=alt.value('black'),
    opacity=alt.condition(nearest, alt.value(1), alt.value(0))
).add_params(
    nearest
)

# left: fit/predictions without a legend; right: raw ratings with the y-axis
# labels hidden; each panel gets the selection overlay and its own color scale
alt.hconcat(
    alt.layer(f_all.encode(color=alt.Color('fit/prediction:Q', legend=None, scale=alt.Scale(scheme='yellowgreenblue', domain=[1, 5]))),
              selectors),
    alt.layer(f_raw.encode(y=alt.Y('movie title:O', axis=alt.Axis(labels=False))),
              selectors),
).resolve_scale(color='independent')

/tmp/ipykernel_2933879/2524743408.py:4: FutureWarning: The previous implementation of stack is deprecated and will be removed in a future version of pandas. See the What's New notes for pandas 2.1.0 for details. Specify future_stack=True to adopt the new implementation and silence this warning.
  long = lambda x: x.stack().reset_index()

#| echo: false
# overlay of unfilled black squares whose opacity is 1 for the pair chosen by
# the `nearest` selection and 0 for all others
selectors = base.mark_square(filled=False, size=350).encode(
    x='user id:N',
    y='movie title:N',
    color=alt.value('black'),
    opacity=alt.condition(nearest, alt.value(1), alt.value(0))
).add_params(
    nearest
)

# left: fit/predictions without a legend; right: deviation chart with the
# y-axis labels hidden; each panel gets the overlay and its own color scale
alt.hconcat(
    alt.layer(f_all.encode(color=alt.Color('fit/prediction:Q', legend=None, scale=alt.Scale(scheme='yellowgreenblue', domain=[1, 5]))),
              selectors),
    alt.layer(f_err.encode(y=alt.Y('movie title:N', axis=alt.Axis(labels=False))),
              selectors),
).resolve_scale(color='independent')

#| echo: false
# Wrap the factor matrices in DataFrames: rows labelled like R's movies
# (V) and R's user ids (U), columns labelled ('affinity', k) for k = 0..K-1.
# NOTE(review): this rebinds V and U to DataFrames; the earlier cells index
# them as V[m] / U[i], so presumably the initialization cell has to be re-run
# before re-running those cells - confirm.
V = pd.DataFrame(V, index=R.index, 
                 columns=pd.MultiIndex.from_product([['affinity'], range(0, K)], names=[None, 'k']))
U = pd.DataFrame(U, index=R.columns.get_level_values(level='user id'),
                 columns=pd.MultiIndex.from_product([['affinity'], range(0, K)], names=[None, 'k']))
# heatmaps of the factors: movies x k on the left, users x k on the right
alt.hconcat(
    alt.Chart(long(V)).mark_rect().encode(x='k:N', y='movie title:N', color='affinity:Q'),
    alt.Chart(long(U)).mark_rect().encode(x='k:N', y='user id:N', color='affinity:Q')
)

/tmp/ipykernel_2933879/2524743408.py:4: FutureWarning: The previous implementation of stack is deprecated and will be removed in a future version of pandas. See the What's New notes for pandas 2.1.0 for details. Specify future_stack=True to adopt the new implementation and silence this warning.
  long = lambda x: x.stack().reset_index()
/tmp/ipykernel_2933879/2524743408.py:4: FutureWarning: The previous implementation of stack is deprecated and will be removed in a future version of pandas. See the What's New notes for pandas 2.1.0 for details. Specify future_stack=True to adopt the new implementation and silence this warning.
  long = lambda x: x.stack().reset_index()

		rating
	user id	1	2	3	4	5	6	7	8	9	10	...	934	935	936	937	938	939	940	941	942	943
movie id	movie title
1	Toy Story (1995)	5.0	4.0	NaN	NaN	4.0	4.0	NaN	NaN	NaN	4.0	...	2.0	3.0	4.0	NaN	4.0	NaN	NaN	5.0	NaN	NaN
2	GoldenEye (1995)	3.0	NaN	NaN	NaN	3.0	NaN	NaN	NaN	NaN	NaN	...	4.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	5.0
3	Four Rooms (1995)	4.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	4.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN
4	Get Shorty (1995)	3.0	NaN	NaN	NaN	NaN	NaN	5.0	NaN	NaN	4.0	...	5.0	NaN	NaN	NaN	NaN	NaN	2.0	NaN	NaN	NaN
5	Copycat (1995)	3.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
1678	Mat' i syn (1997)	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
1679	B. Monkey (1998)	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
1680	Sliding Doors (1998)	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
1681	You So Crazy (1994)	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
1682	Scream of Stone (Schrei aus Stein) (1991)	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

		rating
	user id	883	716	387	85	339	178	389	271	1	650	727	312	269	328	299	301
movie id	movie title
132	Wizard of Oz, The (1939)	NaN	5.0	NaN	5.0	5.0	NaN	5.0	5.0	4.0	4.0	2.0	5.0	5.0	5.0	NaN	4.0
238	Raising Arizona (1987)	4.0	4.0	5.0	2.0	5.0	4.0	5.0	4.0	4.0	4.0	2.0	3.0	5.0	NaN	4.0	NaN
748	Saint, The (1997)	5.0	NaN	NaN	NaN	NaN	4.0	NaN	NaN	NaN	NaN	4.0	NaN	NaN	3.0	NaN	NaN
196	Dead Poets Society (1989)	NaN	5.0	2.0	4.0	4.0	4.0	3.0	4.0	5.0	4.0	4.0	NaN	1.0	NaN	NaN	4.0
197	Graduate, The (1967)	4.0	5.0	2.0	5.0	5.0	2.0	5.0	4.0	5.0	4.0	3.0	4.0	5.0	NaN	3.0	5.0
185	Psycho (1960)	5.0	5.0	NaN	NaN	4.0	NaN	5.0	3.0	4.0	3.0	NaN	5.0	5.0	4.0	3.0	NaN
194	Sting, The (1973)	3.0	5.0	3.0	4.0	4.0	4.0	4.0	5.0	4.0	4.0	NaN	4.0	5.0	3.0	3.0	4.0
742	Ransom (1996)	NaN	NaN	2.0	NaN	NaN	3.0	NaN	3.0	NaN	3.0	NaN	NaN	NaN	4.0	4.0	4.0
82	Jurassic Park (1993)	3.0	5.0	4.0	3.0	4.0	5.0	4.0	NaN	5.0	3.0	3.0	NaN	2.0	4.0	NaN	5.0
97	Dances with Wolves (1990)	NaN	4.0	2.0	2.0	4.0	5.0	NaN	5.0	3.0	3.0	NaN	5.0	NaN	3.0	4.0	4.0
475	Trainspotting (1996)	NaN	NaN	3.0	NaN	5.0	NaN	5.0	NaN	NaN	NaN	NaN	NaN	5.0	NaN	4.0	NaN
268	Chasing Amy (1997)	NaN	NaN	3.0	4.0	NaN	4.0	NaN	NaN	5.0	NaN	4.0	NaN	5.0	NaN	NaN	NaN
186	Blues Brothers, The (1980)	NaN	3.0	2.0	3.0	4.0	NaN	2.0	4.0	4.0	4.0	5.0	3.0	2.0	4.0	3.0	4.0
496	It's a Wonderful Life (1946)	2.0	5.0	3.0	4.0	5.0	NaN	4.0	5.0	NaN	4.0	NaN	5.0	5.0	NaN	3.0	5.0
111	Truth About Cats & Dogs, The (1996)	NaN	4.0	NaN	NaN	NaN	4.0	3.0	4.0	5.0	NaN	3.0	NaN	1.0	NaN	3.0	1.0

	movie id	movie title	user id	observed	fit	fit/prediction	deviation
0	132	Wizard of Oz, The (1939)	1	4.0	4.435839	4.435839	0.435839
1	132	Wizard of Oz, The (1939)	85	5.0	4.677426	4.677426	-0.322574
2	132	Wizard of Oz, The (1939)	178	NaN	NaN	3.566235	NaN

Watched	Recommend	Recommend
5 (observed) / 5.5 (fit/prediction)	5.8 (fit/prediction)	4.3 (fit/prediction)

Watched	Recommend	Recommend
5 (observed) / 5.2 (fit/prediction)	5.8 (fit/prediction)	4.6 (fit/prediction)

The Blues Brothers (1980)	Dances with Wolves (1990)	It's a Wonderful Life (1946)

Movie Recommender System: Latent Factor Analysis¶

Movie Recommender System: Alternating Least Squares¶

MovieLens Data¶

Metadata: Rating, User, and Movie Raw Data¶

Raw Data: Ratings¶

Raw Data: Movie information¶

Raw Data: User information¶

Raw Data: Users and Movie Genres¶

Import Data: Custom Module¶

¶

User Module¶

Import Users¶

Import Movies¶

Import Ratings¶

Transforming Data¶

Visualize Ratings Matrix¶

Create Small Subset of Data¶

Visualizing Missing Values¶

Scalar Ratings: Users and Movies¶

Ratings Matrix: Users and Movies Matrices¶

Ratings Matrix: Users and Movies Matrices¶

Optimize to find best $U$ and $V$¶

Preparing to Optimize: Part 1¶

Preparing to Optimize: Part 2¶

Preparing to Optimize: Part 3¶

Compute Solutions: $U$ and $V$¶

Monitoring Optimization Progress¶

Visualize Results¶

Recommending Movies¶

Recommendation: User id 85¶

Recommendation: User id 727¶

Visualizing Errors¶

Comparing Users or Comparing Movies¶

Matrix Factors: $V$ and $U$¶

What can be improved?¶

	age	gender	occupation	zip code
user id
1	24	M	technician	85711
2	53	F	other	94043
3	23	M	writer	32067
4	24	M	technician	43537
5	33	F	other	15213

	movie title	release date	video release date	IMDb URL	unknown	Action	Adventure	Animation	Children's	Comedy	...	Fantasy	Film-Noir	Horror	Musical	Mystery	Romance	Sci-Fi	Thriller	War	Western
movie id
1	Toy Story (1995)	1995-01-01	NaT	http://us.imdb.com/M/title-exact?Toy%20Story%2...	False	False	False	True	True	True	...	False	False	False	False	False	False	False	False	False	False
2	GoldenEye (1995)	1995-01-01	NaT	http://us.imdb.com/M/title-exact?GoldenEye%20(...	False	True	True	False	False	False	...	False	False	False	False	False	False	False	True	False	False
3	Four Rooms (1995)	1995-01-01	NaT	http://us.imdb.com/M/title-exact?Four%20Rooms%...	False	False	False	False	False	False	...	False	False	False	False	False	False	False	True	False	False