Scenario 4 • Music Genre Recommendation | Bayesian Analysis Portfolio

Analysis Code


import pandas as pd
import pymc as pm
import pytensor
import requests, zipfile, io
import xarray as xr
import numpy as np
import arviz as az

# Clear PyTensor's BLAS linker flags before any model is compiled.
# NOTE(review): presumably this sidesteps BLAS linking problems on the
# author's machine -- confirm it is still required.
pytensor.config.blas__ldflags = ''

# Zipped tab-separated (user, song, play count) triplets.
url = "http://labrosa.ee.columbia.edu/~dpwe/tmp/train_triplets.txt.zip"

# (connect, read) timeouts: the read timeout bounds the wait between chunks,
# not the whole download, so a large archive is fine but a stalled server
# can no longer hang the script indefinitely.
response = requests.get(url, timeout=(10, 120))
# Fail with a clear HTTP error instead of passing an error page to ZipFile,
# which would otherwise surface as an unrelated BadZipFile exception.
response.raise_for_status()

with zipfile.ZipFile(io.BytesIO(response.content)) as z:
    with z.open('train_triplets.txt') as f:
        # The file has no header row, so column names are supplied here.
        df = pd.read_csv(f, sep='\t', names=['user', 'song', 'count'])

# Keep only rows whose user AND song are both among the 100 most frequent
# (by number of rows) in the full table.
frequent_users = df['user'].value_counts().head(100).index
frequent_songs = df['song'].value_counts().head(100).index
df = df[df['user'].isin(frequent_users) & df['song'].isin(frequent_songs)]

# User x song table of play counts; pairs that never occur are filled with 0.
pivoted = df.pivot_table(index='user', columns='song', values='count', fill_value=0)

# Narrow to the 50 most frequent users and songs *within the filtered data*,
# ordered by frequency, and drop down to a plain NumPy array.
row_order = df['user'].value_counts().head(50).index
col_order = df['song'].value_counts().head(50).index
play_counts = pivoted.reindex(index=row_order, columns=col_order, fill_value=0).values

# Scale by the global maximum; the epsilon guards against an all-zero matrix.
scale = play_counts.max() + 1e-8
play_counts_norm = play_counts / scale

# Size of the shared latent space for users and songs.
n_latent = 3
# Take the matrix dimensions from the data so the priors always match the
# observed array instead of relying on a hard-coded 50 x 50.
n_users, n_songs = play_counts_norm.shape

with pm.Model() as abc_model:
    # Standard-normal priors on the latent user and song embeddings.
    user_prefs = pm.Normal('user_prefs', mu=0, sigma=1, shape=(n_users, n_latent))
    song_features = pm.Normal('song_features', mu=0, sigma=1, shape=(n_songs, n_latent))
    # Sigmoid of the user/song dot product keeps the expected value in (0, 1),
    # the same range as the max-normalised play counts.
    mu = pm.math.sigmoid(pm.math.dot(user_prefs, song_features.T))
    # Gaussian observation noise around the predicted normalised count.
    pm.Normal('observed_counts', mu=mu, sigma=0.1, observed=play_counts_norm)
    # Sequential Monte Carlo with 500 draws and 4 cores; the fixed seed makes
    # the saved trace reproducible from run to run.
    trace = pm.sample_smc(500, cores=4, random_seed=42)

# Stack the chain and draw dimensions into a single "sample" dimension.
posterior = az.extract(trace, combined=True)


def _sample_axis_first(data_array):
    """Return *data_array* as a float64 ndarray with the ``sample`` axis first.

    Transposing by dimension name avoids guessing the axis order from a
    hard-coded expected shape, so it keeps working when the number of draws
    or chains changes.
    """
    reordered = data_array.transpose('sample', ...)
    return np.asarray(reordered.values, dtype=np.float64)


def _as_draw_dataset(var_name, entity_dim, samples):
    """Wrap a (draw, entity, latent) array in a Dataset with integer coords."""
    dims = ['draw', entity_dim, 'latent']
    # Coordinates are sized from the array itself rather than fixed constants.
    coords = {dim: np.arange(size) for dim, size in zip(dims, samples.shape)}
    return xr.Dataset({var_name: (dims, samples)}, coords=coords)


user_prefs_samples = _sample_axis_first(posterior['user_prefs'])
song_features_samples = _sample_axis_first(posterior['song_features'])

user_prefs_ds = _as_draw_dataset('user_prefs', 'user', user_prefs_samples)
song_features_ds = _as_draw_dataset('song_features', 'song', song_features_samples)

# Both datasets share the "draw" and "latent" coordinates, so they merge
# cleanly into one file.
posterior_ds = xr.merge([user_prefs_ds, song_features_ds])
posterior_ds.to_netcdf('music_trace.nc')

Interactive Visualization

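Insights: t-SNE embeddings of the inferred user-preference vectors cluster users by preference. Sequential Monte Carlo (SMC) sampling via `pm.sample_smc` keeps inference fast and reliable (~10–15 s). Note that the model above uses an explicit Normal likelihood rather than a simulator, so this is likelihood-based SMC rather than ABC in the strict sense. The posterior draws are saved as NetCDF (`music_trace.nc`) for reproducibility. The full code is available on GitHub.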

← Previous: Churn Prediction | Back to Home