Data Mining Tutorial
Welcome to the companion website of the “Mining Big Data to Extract Patterns and Predict Real-Life Outcomes” article published in Psychological Methods in 2016.
These examples are aimed at readers who have a basic familiarity with R; if you have not previously used R, we recommend that you start by reading the official introduction to this powerful language for statistical programming.
Please see the original paper for the detailed description of this code. (Lines starting with "#" are comments and will not be executed by R.)
If you are looking for other interesting data sets that can be used in social science studies, you should consider the following resources (please email michalk@stanford.edu if you have additional suggestions):
You should also consider the following resources:
We welcome feedback and suggestions: michalk@stanford.edu
First, you need to download the sample data set. Next, unzip it. Then, you can read it into R:
# Read in the three data files
# (provide a full path, e.g., "~/Desktop/sample_dataset/users-likes.csv")
users <- read.csv("users.csv")
likes <- read.csv("likes.csv")
ul <- read.csv("users-likes.csv")

# You can check what's inside each object using the following commands:
head(users)
head(likes)
head(ul)
tail(ul)
tail(users)
tail(likes)
dim(ul)
dim(users)
dim(likes)

# Match the entries in ul with the rows of the users and likes dictionaries
ul$user_row <- match(ul$userid, users$userid)
ul$like_row <- match(ul$likeid, likes$likeid)

# ... and inspect what happened:
head(ul)
Next, let's build user-footprint Matrix M:
# Install the Matrix package - run only once
install.packages("Matrix")

# Load the Matrix package.
# library() is preferred over require() for loading: it fails loudly
# if the package is missing instead of silently returning FALSE.
library(Matrix)

# Construct the sparse user-Like Matrix M
M <- sparseMatrix(i = ul$user_row, j = ul$like_row, x = 1)

# Check the dimensions of M
dim(M)

# Save user IDs as row names in M
rownames(M) <- users$userid

# Save Like names as column names in M
colnames(M) <- likes$name

# Remove the ul and likes objects (they won't be needed anymore)
rm(ul, likes)
Next, trim matrix M to remove users and Likes that are too rare.
# Remove users/Likes occurring less than 50/150 times.
# The filtering is repeated until the matrix stops shrinking, because
# dropping rows can push columns below the threshold (and vice versa).
repeat {
  i <- sum(dim(M))                             # remember the current size of M
  M <- M[rowSums(M) >= 50, colSums(M) >= 150]  # keep only rows/columns meeting the thresholds
  if (sum(dim(M)) == i) break                  # stop once the size is stable
}

# Check the new size of M
dim(M)

# Drop from the users object the users that were removed from M
users <- users[match(rownames(M), users$userid), ]

# Check the new size of users
dim(users)
Using SVD:
# Preset the random number generator in R
# for the comparability of the results
set.seed(seed = 68)

# Install the irlba package (run only once)
install.packages("irlba")

# Load irlba and extract 5 SVD dimensions
library(irlba)
Msvd <- irlba(M, nv = 5)

# User SVD scores are here:
u <- Msvd$u

# Like SVD scores are here:
v <- Msvd$v

# The scree plot of the singular values:
plot(Msvd$d)
You should also varimax-rotate the resulting SVD space.
# First obtain the rotated V matrix.
# (unclass() is needed so the result is saved as a plain matrix
# rather than an object of class "loadings")
v_rot <- unclass(varimax(Msvd$v)$loadings)

# The cross-product of M and v_rot gives u_rot:
u_rot <- as.matrix(M %*% v_rot)
Using LDA:
# Install the topicmodels package (run only once)
install.packages("topicmodels")

# Load it
library(topicmodels)

# Conduct the LDA analysis; see the text for details on setting
# the alpha and delta parameters.
# WARNING: this may take quite some time!
Mlda <- LDA(M, control = list(alpha = 10, delta = .1, seed = 68), k = 5, method = "Gibbs")

# Extract user LDA cluster memberships
gamma <- Mlda@gamma

# Extract Like LDA cluster memberships.
# Betas are stored as logarithms, so exp() is used to convert
# the logs into probabilities.
beta <- exp(Mlda@beta)

# The log-likelihood of the model is stored here:
Mlda@loglikelihood

# ... and can also be accessed using the logLik() function:
logLik(Mlda)

# Estimate the log-likelihood for the 2, 3, 4, and 5 cluster solutions:
lg <- list()
for (i in 2:5) {
  Mlda <- LDA(M, k = i, control = list(alpha = 10, delta = .1, seed = 68), method = "Gibbs")
  lg[[i]] <- logLik(Mlda)
}
plot(2:5, unlist(lg))
First, we investigate the relationship between LDA clusters / SVD dimensions and psycho-demographic traits using correlation.
# Correlate user traits with their SVD scores.
# users[, -1] excludes the column holding the user IDs.
cor(u_rot, users[, -1], use = "pairwise")

# LDA version
cor(gamma, users[, -1], use = "pairwise")
The resulting correlation matrix can be presented as a heatmap:
# You need to install the ggplot2 and reshape2 packages first (run only once).
# NOTE: the package names must be passed as a single character vector;
# in install.packages("ggplot2", "reshape2") the second argument would
# be interpreted as the library path (lib) and reshape2 would never
# be installed.
install.packages(c("ggplot2", "reshape2"))

# Load these libraries
library(ggplot2)
library(reshape2)

# Get the correlations, rounded to 2 decimal places
x <- round(cor(u_rot, users[, -1], use = "p"), 2)

# Reshape the correlation matrix into long format for ggplot2
y <- melt(x)
colnames(y) <- c("SVD", "Trait", "r")

# Produce the heatmap
qplot(x = SVD, y = Trait, data = y, fill = r, geom = "tile") +
  scale_fill_gradient2(limits = range(x), breaks = c(min(x), 0, max(x))) +
  theme(axis.text = element_text(size = 12),
        axis.title = element_text(size = 14, face = "bold"),
        panel.background = element_rect(fill = 'white', colour = 'white')) +
  labs(x = expression('SVD'[rot]), y = NULL)
You can also print the Likes with the highest and lowest varimax-rotated SVD and LDA scores:
# SVD: collect the Likes with the highest (top) and lowest (bottom)
# scores on each of the 5 varimax-rotated dimensions
top <- list()
bottom <- list()
for (i in 1:5) {
  f <- order(v_rot[, i])
  temp <- tail(f, n = 10)
  top[[i]] <- colnames(M)[temp]
  temp <- head(f, n = 10)
  bottom[[i]] <- colnames(M)[temp]
}

# LDA: Likes with the highest membership probability in each cluster
top <- list()
for (i in 1:5) {
  f <- order(beta[i, ])
  temp <- tail(f, n = 10)
  top[[i]] <- colnames(M)[temp]
}

# Split users into 10 groups.
# (Use TRUE rather than the abbreviation T, which can be reassigned.)
folds <- sample(1:10, size = nrow(users), replace = TRUE)

# Take users from group 1 and assign them to the TEST subset
test <- folds == 1

# Extract SVD dimensions from the TRAINING subset;
# the training set can be accessed using !test
Msvd <- irlba(M[!test, ], nv = 50)

# Rotate Like SVD scores (V)
v_rot <- unclass(varimax(Msvd$v)$loadings)

# Rotate user SVD scores *for the entire sample*
u_rot <- as.data.frame(as.matrix(M %*% v_rot))

# Build a linear regression model for openness
# using the TRAINING subset
fit_o <- glm(users$ope ~ ., data = u_rot, subset = !test)

# Inspect the regression coefficients
coef(fit_o)

# Do the same for gender;
# use family = "binomial" for a logistic regression model
fit_g <- glm(users$gender ~ ., data = u_rot, subset = !test, family = "binomial")
Next, examine the accuracy of the models:
# Compute the predictions for the TEST subset
pred_o <- predict(fit_o, u_rot[test, ])
pred_g <- predict(fit_g, u_rot[test, ], type = "response")

# Correlate predicted and actual values for the TEST subset
r <- cor(users$ope[test], pred_o)
r

# Compute the Area Under the Curve for gender
# (remember to install the ROCR library first)
library(ROCR)
temp <- prediction(pred_g, users$gender[test])
auc <- performance(temp, "auc")@y.values
auc
You can also examine the accuracy across different values of k.
# Choose which values of k are to be included in the analysis
ks <- c(2:10, 15, 20, 30, 40, 50)

# Preset an empty list to hold the results
rs <- list()

# Run the code below for each k in ks
for (k in ks) {
  # Varimax-rotate Like SVD dimensions 1 to k
  v_rot <- unclass(varimax(Msvd$v[, 1:k])$loadings)

  # This code is exactly like the one discussed earlier
  u_rot <- as.data.frame(as.matrix(M %*% v_rot))
  fit_o <- glm(users$ope ~ ., data = u_rot, subset = !test)
  pred_o <- predict(fit_o, u_rot[test, ])

  # Save the resulting correlation coefficient as the
  # element of rs named after k
  rs[[as.character(k)]] <- cor(users$ope[test], pred_o)
}

# Check the results
rs
The results of this analysis can be presented as a plot:
# Convert rs into a data frame suitable for plotting
data <- data.frame(k = ks, r = as.numeric(rs))

# Plot accuracy (r) against the number of SVD dimensions (k)
ggplot(data = data, aes(x = k, y = r, group = 1)) +
  theme_light() +
  stat_smooth(colour = "red", linetype = "dashed", size = 1, se = F) +
  geom_point(colour = "red", size = 2, shape = 21, fill = "white") +
  scale_y_continuous(breaks = seq(0, .5, by = 0.05))
Also, let's compute prediction accuracy (for openness) across all 10 cross-validation folds:
# Preallocate the prediction vector.
# NOTE: rep() has no argument called "n" -- rep(NA, n = nrow(users))
# silently returns a single NA; the desired length must be passed as
# the second (times) argument.
pred_o <- rep(NA, nrow(users))

# Cross-validate over all 10 folds: train on 9 folds, predict the 10th
for (i in 1:10) {
  test <- folds == i
  Msvd <- irlba(M[!test, ], nv = 50)
  v_rot <- unclass(varimax(Msvd$v)$loadings)
  u_rot <- as.data.frame(as.matrix(M %*% v_rot))
  fit_o <- glm(users$ope ~ ., data = u_rot, subset = !test)
  pred_o[test] <- predict(fit_o, u_rot[test, ])
}

# Overall cross-validated accuracy for openness
r <- cor(users$ope, pred_o)
Finally, the following code can be used to estimate user LDA cluster scores in a cross-validated way:
# Fit the LDA model on the TRAINING subset only
Mlda <- LDA(M[!test, ], control = list(alpha = 1, delta = .1, seed = 68), k = 50, method = "Gibbs")

# Estimate cluster memberships for *all* users (including the TEST
# subset) from the trained model
temp <- posterior(Mlda, M)
gamma <- as.data.frame(temp$topics)
Below, you will find a complete code that can be used to build prediction models for all of the traits.
# Load libraries.
# (library() is preferred over require() for loading: it fails loudly
# when a package is missing instead of silently returning FALSE.)
library(Matrix)
library(ROCR)
library(topicmodels)
library(irlba)

# Load files
users <- read.csv("users.csv")
likes <- read.csv("likes.csv")
ul <- read.csv("users-likes.csv")

# Construct the sparse user-Like matrix
ul$user_row <- match(ul$userid, users$userid)
ul$like_row <- match(ul$likeid, likes$likeid)
M <- sparseMatrix(i = ul$user_row, j = ul$like_row, x = 1)
rownames(M) <- users$userid
colnames(M) <- likes$name
rm(ul, likes)

# Matrix trimming: repeat until the size stabilizes
repeat {
  i <- sum(dim(M))
  M <- M[rowSums(M) >= 50, colSums(M) >= 150]
  if (sum(dim(M)) == i) break
}
users <- users[match(rownames(M), users$userid), ]

# Start predictions
set.seed(seed = 68)
n_folds <- 10                 # set the number of folds
k <- 50                       # set k
vars <- colnames(users)[-1]   # choose the variables to predict

folds <- sample(1:n_folds, size = nrow(users), replace = TRUE)

# Preallocate one full-length prediction vector per variable *before*
# the fold loop.
# BUG FIX 1: the original re-created results[[var]] inside the fold
# loop, wiping out the predictions of all previous folds, so the final
# accuracies were computed on the last fold only.
# BUG FIX 2: rep() has no "n" argument -- rep(NA, n = nrow(users))
# silently returns a single NA; the length must be the second argument.
results <- list()
for (var in vars) results[[var]] <- rep(NA, nrow(users))

for (fold in 1:n_folds) {
  print(paste("Cross-validated predictions, fold:", fold))
  test <- folds == fold

  # If you want to use SVD:
  Msvd <- irlba(M[!test, ], nv = k)
  v_rot <- unclass(varimax(Msvd$v)$loadings)
  predictors <- as.data.frame(as.matrix(M %*% v_rot))

  # If you want to use LDA, comment out the SVD lines above,
  # and uncomment the two lines below:
  # Mlda <- LDA(M[!test, ], control = list(alpha = 1, delta = .1, seed = 68), k = k, method = "Gibbs")
  # predictors <- as.data.frame(posterior(Mlda, M, control = list(alpha = 1, delta = .1))$topics)

  for (var in vars) {
    # Check whether the variable is dichotomous
    if (length(unique(na.omit(users[, var]))) == 2) {
      # Logistic regression for dichotomous variables
      fit <- glm(users[, var] ~ ., data = predictors, subset = !test, family = "binomial")
      results[[var]][test] <- predict(fit, predictors[test, ], type = "response")
    } else {
      # Linear regression for continuous variables
      fit <- glm(users[, var] ~ ., data = predictors, subset = !test)
      results[[var]][test] <- predict(fit, predictors[test, ])
    }
    print(paste(" Variable", var, "done."))
  }
}

# Compute the accuracy of a prediction:
# AUC for dichotomous variables, Pearson r otherwise.
compute_accuracy <- function(ground_truth, predicted) {
  if (length(unique(na.omit(ground_truth))) == 2) {
    f <- which(!is.na(ground_truth))
    temp <- prediction(predicted[f], ground_truth[f])
    return(performance(temp, "auc")@y.values)
  } else {
    return(cor(ground_truth, predicted, use = "pairwise"))
  }
}

accuracies <- list()
for (var in vars) accuracies[[var]] <- compute_accuracy(users[, var], results[[var]])
This section presents the code used to produce the User-Movie Matrix X discussed in the text and the examples of the SVD/LDA analyses:
# Generate the toy user-movies matrix
library(Matrix)
M <- sparseMatrix(i = c(3, 4, 7, 3, 4, 5, 7, 1, 2, 7, 1, 2, 7, 5, 6, 5, 6),
                  j = c(1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 6, 6),
                  x = 1)
rownames(M) <- c("Noah", "Emma", "Mason", "Sophia", "William", "James", "Tom")
colnames(M) <- c("True Romance", "Pretty Woman", "Aliens", "Star Wars", "Due Date", "Hangover")
M

# SVD analysis
library(irlba)
Msvd <- irlba(M, nu = 3, nv = 3)   # svd() would work as well with small data

# Varimax rotation
Msvd$v_rot <- unclass(varimax(Msvd$v)$loadings)
Msvd$u_rot <- as.matrix(M %*% Msvd$v_rot)
# or, in fact:
Msvd$u_rot <- M %*% as.matrix(sweep(Msvd$v_rot, 2, Msvd$d, "/"))

# Round and print the results
Msvd <- lapply(Msvd, round, 2)
Msvd

# SVD on a centered matrix; svd() would work as well with small data
Mc <- scale(M, center = TRUE, scale = F)
Mcsvd <- irlba(Mc, nu = 3, nv = 3)

# Varimax rotation
Mcsvd$v_rot <- unclass(varimax(Mcsvd$v)$loadings)
Mcsvd$u_rot <- as.matrix(Mc %*% Mcsvd$v_rot)

# Round and print the results
Mcsvd <- lapply(Mcsvd, round, 2)
Mcsvd

# LDA analysis
library(topicmodels)

# Default alpha
Mlda <- LDA(M, k = 3, method = "Gibbs")
gamma <- Mlda@gamma
beta <- exp(Mlda@beta)
str(Mlda)

# Print the results
round(gamma, 2)
round(beta, 2)

# Low alpha
Mlda2 <- LDA(M, k = 3, control = list(alpha = .01, delta = .01), method = "Gibbs")
gamma2 <- Mlda2@gamma
beta2 <- exp(Mlda2@beta)
round(posterior(Mlda2, M)$topics, 2)

# Print the results
round(gamma2, 2)
round(beta2, 2)