######################################################################
# Read in the two files from the Transparency Project package. #
# * Before starting, change the text in Folder to reflect where #
# you stored the two data files. #
# Load in the packages necessary to use the function ddply(). #
# * The R package "plyr" requires other packages. You will have to #
# load all of them into R for the ddply() commands to work. #
######################################################################
Folder <- "C:/Laboratory/Transparency/LiveMonth/";
require(plyr);
############################################################
# The file LiveMonthLiveAction.csv contains the actual #
# betting information associated with live action product #
# for each participant for each calendar day with at #
# least one transaction from February 1, 2005 through #
# February 28, 2007 or until the account was closed. #
# * UserID is a numeric ID number bwin assigned to each #
# player at the time of registration. The ID numbers in #
# this data set range from 1324368 to 1405181. #
# * Date is the date of the activity. #
# * Bets is the number of bets placed that day. #
# * Stake is the total amount bet that day (in Euros). #
# * Winnings is the total amount won that day (in Euros). #
# Winnings may include payouts from bets made on a #
# previous day. For example, if a bet is placed on #
# Tuesday for a game played on Wednesday, then the #
# winnings will be added to Wednesday's total. #
# This data file has 849928 rows and 32277 UserID's. #
############################################################
Live1 <- read.csv(paste0(Folder,"LiveMonthLiveAction.csv"),
header=TRUE,stringsAsFactors=FALSE);
Live1$Date <- as.Date(Live1$Date,format="%Y-%m-%d");
c(nrow(Live1),length(unique(Live1$UserID)))
############################################################
# The file LiveMonthAnalyticRoster.csv contains #
# demographic data and other information for the 530 #
# players in the analytic dataset. #
# * UserID is a numeric ID number bwin assigned to each #
# player at the time of registration. The ID numbers in #
# this data set range from 1324368 to 1405181. #
# * CountryID is a numeric code representing playersâ€™ #
# countries of residence. Most are the ISO 3166-1 codes. #
# The exceptions are 530 for Netherlands Antilles and #
# 736 for Sudan. #
# * LanguageID is a numeric code representing playersâ€™ #
# primary languages. The numbers range from 1 to 20. See #
# the table below. #
# * Gender is coded as 0 for female and 1 for male. #
# * Age is the age in years on the date of registration. #
# * Dep1Date is the date of the player's first deposit #
# with bwin.com in YYYY-MM-DD format. #
# * ReasonCA is the reason for self-exclusion. #
# -- A 1 means unsatisfied with the service. #
# -- A 2 means not interested. #
# -- A 3 means gambling-related problems. #
# Group denotes which players went into which partition #
# when Braverman and Shaffer partitioned the 530 players #
# to test for reliability. #
# * There are 274 players in Group A. #
# * There are 256 players in Group B. #
# Split describes the results of the cluster analyses #
# performed on the two partitions. #
# * Clusters A1 through A4 come from the 274. #
# * Clusters B1 through B4 come from the 256. #
# This file has 530 rows, one for each UserID. #
############################################################
Roster <- read.csv(paste0(Folder,"LiveMonthAnalyticRoster.csv"),
header=TRUE,stringsAsFactors=FALSE);
Roster$Dep1Date <- as.Date(Roster$Dep1Date,format="%Y-%m-%d");
nR <- nrow(Roster); nR
######################################################################
# Create Live2 with the data for the users who had at least four #
# days of live action sports betting activity. #
# * There are 21,996 users and 831,868 rows of data. #
# Create the Labrie series with the measures for the 21,996. #
######################################################################
# Create Labrie1 with the measures covering the entire #
# study period. #
# * StakesSP is the total amount bet. #
# * PrizesSP is the total amount won. #
# * BetsSP is the total number of bets. #
# * DaysSP is the number of days within the study period #
# with at least one live action sports bet. #
# * BPDaySP = BetsSP / DaysSP is the average number of #
# bets the user places on an active betting day. #
# * Period is the number of days between the first live #
# action sports bet and the last live action sports bet, #
# inclusive. #
# * Liv1Date is the date of the user's first live action #
# sports bet. #
# * DayNum is a placeholder variable to be used later. #
# * InR is equal to 1 if the user is one of the 530 in #
# Roster and equal to 0 otherwise. #
############################################################
Labrie1 <- ddply(Live1,~UserID,summarise,
StakeSP=sum(Stake),PrizesSP=sum(Winnings),
BetsSP=sum(Bets),DaysSP=sum(Stake>0),
Period=1+as.numeric(difftime(max(Date),min(Date),
units="days")),
Liv1Date=min(Date));
Labrie1 <- subset(Labrie1,Labrie1$DaysSP>=4);
nL <- nrow(Labrie1); nL
Labrie1 <- merge(Labrie1,data.frame(UserID=Roster$UserID,
DayNum=0,InR=1),
all.x=TRUE,all.y=FALSE,by=c("UserID"));
Labrie1$InR[is.na(Labrie1$InR)] = 0;
Labrie1$BPDaySP = Labrie1$BetsSP / Labrie1$DaysSP;
Labrie1 <- Labrie1[,c("UserID","StakeSP","PrizesSP","BetsSP",
"DaysSP","BPDaySP","Period","Liv1Date",
"DayNum","InR")];
Live2 <- merge(Live1,Labrie1[,c("UserID","InR")],
all.x=FALSE,all.y=TRUE,by=c("UserID"));
Live2 <- Live2[order(Live2$UserID,Live2$Date),];
c(nL,length(unique(Live2$UserID)),nrow(Live2))
############################################################
# Add in the day numbers for each betting day. #
# * The day of the first bet is DayNum = 1. #
# * The next day is DayNum = 2. #
# Map2 links rows of Live2 to users for easy reference. #
# * Rows is the number of rows of Live2 the user has. #
# * Top is the user's top row in Live2. #
# * Bot is the user's bottom row in Live2. #
############################################################
Scrap1 <- table(Live2$UserID);
Map2 <- data.frame(UserID=as.numeric(rownames(Scrap1)),
Rows=as.numeric(Scrap1),Top=0,Bot=0);
Map2$Bot <- cumsum(Map2$Rows);
Map2$Top <- c(1,1+Map2$Bot[1:(nL-1)]);
i1 <- nL; while (i1 > 0) {
Scrap1 <- Map2$Top[i1]:Map2$Bot[i1];
Live2$DayNum[Scrap1] = 1 + as.numeric(difftime(Live2$Date[Scrap1],
Labrie1$Liv1Date[i1],
units="days"));
i1 = i1 - 1;
}
############################################################
# Create the data set containing only the rows in Labrie1 #
# corresponding each user's DayNum = 1 through 31. #
# * Month1 has all 21,996 people and 159,044 rows. #
# Create the dataset containing only rows of Month1 with #
# at least one live action sports bet. #
# * Act1 contains 21,966 people and 158,024 rows. #
# Labrie2 is Labrie1 with measures covering those #
# first 31 days. #
# * Stake1M is the total amount bet on live action sports #
# bets over the first 31 days. #
# * Bets1M is the total number of bets on live action #
# sports over the first 31 days. #
# * Days1M is the number of days out of those first 31 #
# with at least one live action sports bet. #
# * BPDay1M = Bets1M / Days1M is the number of bets per #
# active day over the course of the study period. #
# * Small indicates if the account closer had at least one #
# day during that first month where 0.00 < Stake < 0.10. #
# * StakeSD1M is the standard deviation of the values of #
# Stake over the active days (Stake > 0) within those #
# first 31 days. #
# -- If the user only has one active day over that first #
# month, then we set StakeSD1M to 0. #
############################################################
Month1 <- subset(Live2,Live2$DayNum<=31);
c(length(unique(Month1$UserID)),nrow(Month1))
Scrap1 <- ddply(Month1,~UserID,summarise,
Days1M=sum(Stake>0),BetDime=sum(Stake>0.1));
nL - sum(Scrap1$UserID==Labrie1$UserID)
(1:nL)[Scrap1$Days1M!=Scrap1$BetDime]
Act1 <- subset(Month1,Month1$Stake>0);
Act1 <- Act1[order(Act1$UserID,Act1$Date),];
nA <- nrow(Act1); nA
Scrap2 <- ddply(Act1,~UserID,summarise,
Rows=length(Stake),Stake1M=sum(Stake),
Bets1M=sum(Bets),StakeSD1M=sd(Stake));
nL - sum(Scrap2$UserID==Labrie1$UserID)
MapA1 <- data.frame(UserID=Scrap2$UserID,
Rows=Scrap2$Rows,
Top=0,Bot=cumsum(Scrap2$Rows));
MapA1$Top <- c(1,1+MapA1$Bot[1:(nL-1)]);
Labrie2 <- cbind(Labrie1,Days1M=Scrap1$Days1M,
Scrap2[,c("Stake1M","Bets1M","StakeSD1M")]);
Labrie2$BPDay1M <- Labrie2$Bets1M / Labrie2$Days1M;
Labrie2$Small <- (Scrap1$Days1M!=Scrap1$BetDime);
Labrie2$StakeSD1M[is.na(Labrie2$StakeSD1M)] = 0;
############################################################
# Build the table with the values for sum(1:n) and #
# sum((1:n)^2) for n's from 1 to 31. #
# * SumI = sum(1:n) = n*(n+1)/2. #
# * SumI2 = sum((1:n)^2) = n(n+1)(2n+1)/6. #
# * Disc is the discriminant for the matrix #
# matrix(c(SumI2,SumI,SumI,n),2,2). #
# Calculate the trajectory slopes. #
# * Each user's trajectory slope was based on a linear #
# regression lm(log(Stake)~ActDayNum). #
# * ActDayNum was the number of days up to that point #
# where the user had bet at least 0.10 on live action #
# sports bets. #
# * For 21,914 users, each active day had Stake > 0.10. #
# For them, ActDayNum was sequential: 1 for the first #
# active day, 2 for the second active day, etc. #
# For those where ActDayNum was sequential, there is a #
# closed form solution for the slope. #
# * n is the number of active days. #
# * SumY = sum(log(Stake)). #
# * SumIY = sum((1:n)*log(Stake)). #
# * To get the formula for the slope, start with #
# beta = (X'X)^-1 X'Y and fill in #
# X = cbind(rep(1,n),1:n) and Y = log(Stake). #
# If there is only one active day in the first month, #
# then we set the slope to 0. #
############################################################
CombTable <- data.frame(N=1:31,SumI=0.5*(1:31)*(2:32),
SumI2=(1:31)*(2:32)*(2*(1:31)+1)/6,Disc=0);
CombTable$Disc = CombTable$N*CombTable$SumI2 - (CombTable$SumI)^2;
Scrap1 <- ddply(Act1,~UserID,summarise,
Days1M=length(Stake),SumY=sum(log(Stake)),
SumIY=sum((1:length(Stake))*log(Stake)));
Scrap1$Slope1M <- Scrap1$Days1M*Scrap1$SumIY -
Scrap1$SumY*CombTable$SumI[Scrap1$Days1M];
Scrap1$Slope1M <- Scrap1$Slope1M / CombTable$Disc[Scrap1$Days1M];
Scrap1$Slope1M[is.na(Scrap1$Slope1M)] = 0;
############################################################
# For the remaining 82 users, we'll have to #
# calculate the slope of the regression line honestly. #
# * ActDayNum will be mostly sequential, but there'll be #
# a hitch where Stake < 0.10. #
# * We still have n, the number of active days. #
# * We have the matrix X = cbind(rep(1,n),ActDayNum) and #
# the vector Y = log(Stake). #
# * To get the formula for the slope, #
# use beta = (X'X)^-1 X'Y. #
# * If there is only one active day in that first month, #
# set the slope equal to 0. #
# The slopes are in Labrie3$Slope1M. #
############################################################
Scrap2 <- (1:nL)[Labrie2$Small>0]; length(Scrap2)
i1 <- length(Scrap2); while (i1 > 0) {
j1 <- Scrap2[i1]; Scrap1$Slope1M[j1] = 0;
Scrap3 <- MapA1$Top[j1]:MapA1$Bot[j1];
Scrap4 <- data.frame(Stake=Act1$Stake[Scrap3],
BetDime=(Act1$Stake[Scrap3]>0.10),
ActDayNum=0,Y=0);
Scrap4$ActDayNum <- cumsum(Scrap4$BetDime);
Scrap4$Y <- log(Scrap4$Stake);
X <- cbind(rep(1,MapA1$Rows[j1]),Scrap4$ActDayNum);
XPrimeX <- t(X) %*% X;
XY <- t(X) %*% Scrap4$Y;
if (Scrap4$ActDayNum[nrow(Scrap4)] > 1) {
beta <- solve(XPrimeX,XY);
Scrap1$Slope1M[j1] = beta[2];
}
i1 = i1 - 1;
}
nL - sum(Scrap1$UserID==Labrie2$UserID)
Labrie3 <- cbind(Labrie2,Slope1M=Scrap1$Slope1M);
######################################################################
# Generate the z-scores and the clusters described in Braverman and #
# Shaffer (2010) (DOI: 10.1093/eurpub/ckp232). #
######################################################################
# Generate the z-scores for the variables BPDay1M, #
# StakesSD1M, Days1M, and Slope1M. #
# * We do not know what dataset Braverman and Shaffer used #
# to obtain the means and standard deviations. #
# * These means and standard deviations are reconstructed #
# from z-scores found in a previous version of this #
# Transparency project package. #
# Variable Mean SD #
# BPDay1M 4.24605667 5.211621 #
# StakesSD1M 41.30182060 154.314391 #
# Days1M 5.36156039 5.682126 #
# Slope1M -0.07336332 0.718154 #
############################################################
MeanSD <- data.frame(Variable=c("BPDay1M","StakesSD1M",
"Days1M","Slope1M"),
Mean=c(4.24605667, 41.30182060,5.36156039,-0.07336332),
SD=c( 5.211621,154.314391, 5.682126, 0.718154));
Scrap1 <- data.frame(ZBPDay1M=
(Labrie3$BPDay1M-MeanSD$Mean[1])/MeanSD$SD[1],
ZStakeSD1M=
(Labrie3$StakeSD1M-MeanSD$Mean[2])/MeanSD$SD[2],
ZDays1M=
(Labrie3$Days1M-MeanSD$Mean[3])/MeanSD$SD[3],
ZSlope1M=
(Labrie3$Slope1M-MeanSD$Mean[4])/MeanSD$SD[4],
stringsAsFactors=FALSE);
Labrie4 <- cbind(Labrie3,Scrap1);
############################################################
# Create Julia, the data set with just the 530 users in #
# the analytic data set. #
# * Braverman and Shaffer performed a k-means cluster #
# analysis with k = 4. #
# * We will use the cluster centers listed in Table 1 of #
# the manuscript to replicate the cluster assignments. #
# * Braverman and Shaffer's groups are in the variable #
# Julia$Cluster. #
############################################################
Centers <- data.frame(C1=c( 2.63489,1.78653, 4.40874, 0.26706),
C2=c(-0.54361,0.03928, 0.15649,-2.48611),
C3=c( 2.39258,1.89973, 0.26157, 0.14316),
C4=c( 0.27904,0.00430,-0.04460, 0.22323));
Julia <- subset(Labrie4,Labrie4$InR==1)
Scrap1 <- Julia[,c("ZDays1M","ZBPDay1M","ZStakeSD1M","ZSlope1M")];
Scrap2 <- data.frame(D1=rep(0,nR),D2=0,D3=0,D4=0,Min=0,Cluster=0);
k1 <- 4; while (k1 > 0) {
j1 <- 4; while (j1 > 0) {
Scrap2[,k1] = Scrap2[,k1] + (Scrap1[,j1]-Centers[j1,k1])^2;
j1 = j1 - 1;
}
k1 = k1 - 1;
}
Scrap2$Min = apply(Scrap2[,1:4],1,min);
Scrap2$Cluster[Scrap2$D1==Scrap2$Min] = 1;
Scrap2$Cluster[Scrap2$D2==Scrap2$Min] = 2;
Scrap2$Cluster[Scrap2$D3==Scrap2$Min] = 3;
Scrap2$Cluster[Scrap2$D4==Scrap2$Min] = 4;
Julia$Cluster <- Scrap2$Cluster;
############################################################
# Replicate Table 2 from the manuscript. #
############################################################
Scrap1 <- ddply(Julia,~Cluster,summarise,
n=length(Period),
MeanPeriod=mean(Period),
MeanStake=mean(StakeSP),
MeanBets=mean(BetsSP),
MeanBPD=mean(BPDaySP),
MeanNetLoss=mean(StakeSP)-mean(PrizesSP)
); round(t(Scrap1),digits=2)
# [,1] [,2] [,3] [,4]
# Cluster 1.00 2.00 3.00 4.00
# n 15.00 22.00 115.00 378.00
# MeanPeriod 447.33 337.86 425.78 360.33
# MeanStake 74085.80 6317.81 23660.03 5444.43
# MeanBets 1165.67 165.91 1995.76 317.58
# MeanBPD 10.65 5.42 12.64 4.92
# MeanNetLoss 4308.62 809.87 1705.85 408.42