###################################################################### # Read in the two files from the Transparency Project package. # # * Before starting, change the text in Folder to reflect where # # you stored the two data files. # # Load in the packages necessary to use the function ddply(). # # * The R package "plyr" requires other packages. You will have to # # load all of them into R for the ddply() commands to work. # ###################################################################### Folder <- "C:/Laboratory/Transparency/LiveMonth/"; require(plyr); ############################################################ # The file LiveMonthLiveAction.csv contains the actual # # betting information associated with live action product # # for each participant for each calendar day with at # # least one transaction from February 1, 2005 through # # February 28, 2007 or until the account was closed. # # * UserID is a numeric ID number bwin assigned to each # # player at the time of registration. The ID numbers in # # this data set range from 1324368 to 1405181. # # * Date is the date of the activity. # # * Bets is the number of bets placed that day. # # * Stake is the total amount bet that day (in Euros). # # * Winnings is the total amount won that day (in Euros). # # Winnings may include payouts from bets made on a # # previous day. For example, if a bet is placed on # # Tuesday for a game played on Wednesday, then the # # winnings will be added to Wednesday's total. # # This data file has 849928 rows and 32277 UserID's. # ############################################################ Live1 <- read.csv(paste0(Folder,"LiveMonthLiveAction.csv"), header=TRUE,stringsAsFactors=FALSE); Live1$Date <- as.Date(Live1$Date,format="%Y-%m-%d"); c(nrow(Live1),length(unique(Live1$UserID))) ############################################################ # The file LiveMonthAnalyticRoster.csv contains # # demographic data and other information for the 530 # # players in the analytic dataset. # # * UserID is a numeric ID number bwin assigned to each # # player at the time of registration. The ID numbers in # # this data set range from 1324368 to 1405181. # # * CountryID is a numeric code representing players’ # # countries of residence. Most are the ISO 3166-1 codes. # # The exceptions are 530 for Netherlands Antilles and # # 736 for Sudan. # # * LanguageID is a numeric code representing players’ # # primary languages. The numbers range from 1 to 20. See # # the table below. # # * Gender is coded as 0 for female and 1 for male. # # * Age is the age in years on the date of registration. # # * Dep1Date is the date of the player's first deposit # # with bwin.com in YYYY-MM-DD format. # # * ReasonCA is the reason for self-exclusion. # # -- A 1 means unsatisfied with the service. # # -- A 2 means not interested. # # -- A 3 means gambling-related problems. # # Group denotes which players went into which partition # # when Braverman and Shaffer partitioned the 530 players # # to test for reliability. # # * There are 274 players in Group A. # # * There are 256 players in Group B. # # Split describes the results of the cluster analyses # # performed on the two partitions. # # * Clusters A1 through A4 come from the 274. # # * Clusters B1 through B4 come from the 256. # # This file has 530 rows, one for each UserID. # ############################################################ Roster <- read.csv(paste0(Folder,"LiveMonthAnalyticRoster.csv"), header=TRUE,stringsAsFactors=FALSE); Roster$Dep1Date <- as.Date(Roster$Dep1Date,format="%Y-%m-%d"); nR <- nrow(Roster); nR ###################################################################### # Create Live2 with the data for the users who had at least four # # days of live action sports betting activity. # # * There are 21,996 users and 831,868 rows of data. # # Create the Labrie series with the measures for the 21,996. # ###################################################################### # Create Labrie1 with the measures covering the entire # # study period. # # * StakesSP is the total amount bet. # # * PrizesSP is the total amount won. # # * BetsSP is the total number of bets. # # * DaysSP is the number of days within the study period # # with at least one live action sports bet. # # * BPDaySP = BetsSP / DaysSP is the average number of # # bets the user places on an active betting day. # # * Period is the number of days between the first live # # action sports bet and the last live action sports bet, # # inclusive. # # * Liv1Date is the date of the user's first live action # # sports bet. # # * DayNum is a placeholder variable to be used later. # # * InR is equal to 1 if the user is one of the 530 in # # Roster and equal to 0 otherwise. # ############################################################ Labrie1 <- ddply(Live1,~UserID,summarise, StakeSP=sum(Stake),PrizesSP=sum(Winnings), BetsSP=sum(Bets),DaysSP=sum(Stake>0), Period=1+as.numeric(difftime(max(Date),min(Date), units="days")), Liv1Date=min(Date)); Labrie1 <- subset(Labrie1,Labrie1$DaysSP>=4); nL <- nrow(Labrie1); nL Labrie1 <- merge(Labrie1,data.frame(UserID=Roster$UserID, DayNum=0,InR=1), all.x=TRUE,all.y=FALSE,by=c("UserID")); Labrie1$InR[is.na(Labrie1$InR)] = 0; Labrie1$BPDaySP = Labrie1$BetsSP / Labrie1$DaysSP; Labrie1 <- Labrie1[,c("UserID","StakeSP","PrizesSP","BetsSP", "DaysSP","BPDaySP","Period","Liv1Date", "DayNum","InR")]; Live2 <- merge(Live1,Labrie1[,c("UserID","InR")], all.x=FALSE,all.y=TRUE,by=c("UserID")); Live2 <- Live2[order(Live2$UserID,Live2$Date),]; c(nL,length(unique(Live2$UserID)),nrow(Live2)) ############################################################ # Add in the day numbers for each betting day. # # * The day of the first bet is DayNum = 1. # # * The next day is DayNum = 2. # # Map2 links rows of Live2 to users for easy reference. # # * Rows is the number of rows of Live2 the user has. # # * Top is the user's top row in Live2. # # * Bot is the user's bottom row in Live2. # ############################################################ Scrap1 <- table(Live2$UserID); Map2 <- data.frame(UserID=as.numeric(rownames(Scrap1)), Rows=as.numeric(Scrap1),Top=0,Bot=0); Map2$Bot <- cumsum(Map2$Rows); Map2$Top <- c(1,1+Map2$Bot[1:(nL-1)]); i1 <- nL; while (i1 > 0) { Scrap1 <- Map2$Top[i1]:Map2$Bot[i1]; Live2$DayNum[Scrap1] = 1 + as.numeric(difftime(Live2$Date[Scrap1], Labrie1$Liv1Date[i1], units="days")); i1 = i1 - 1; } ############################################################ # Create the data set containing only the rows in Labrie1 # # corresponding each user's DayNum = 1 through 31. # # * Month1 has all 21,996 people and 159,044 rows. # # Create the dataset containing only rows of Month1 with # # at least one live action sports bet. # # * Act1 contains 21,966 people and 158,024 rows. # # Labrie2 is Labrie1 with measures covering those # # first 31 days. # # * Stake1M is the total amount bet on live action sports # # bets over the first 31 days. # # * Bets1M is the total number of bets on live action # # sports over the first 31 days. # # * Days1M is the number of days out of those first 31 # # with at least one live action sports bet. # # * BPDay1M = Bets1M / Days1M is the number of bets per # # active day over the course of the study period. # # * Small indicates if the account closer had at least one # # day during that first month where 0.00 < Stake < 0.10. # # * StakeSD1M is the standard deviation of the values of # # Stake over the active days (Stake > 0) within those # # first 31 days. # # -- If the user only has one active day over that first # # month, then we set StakeSD1M to 0. # ############################################################ Month1 <- subset(Live2,Live2$DayNum<=31); c(length(unique(Month1$UserID)),nrow(Month1)) Scrap1 <- ddply(Month1,~UserID,summarise, Days1M=sum(Stake>0),BetDime=sum(Stake>0.1)); nL - sum(Scrap1$UserID==Labrie1$UserID) (1:nL)[Scrap1$Days1M!=Scrap1$BetDime] Act1 <- subset(Month1,Month1$Stake>0); Act1 <- Act1[order(Act1$UserID,Act1$Date),]; nA <- nrow(Act1); nA Scrap2 <- ddply(Act1,~UserID,summarise, Rows=length(Stake),Stake1M=sum(Stake), Bets1M=sum(Bets),StakeSD1M=sd(Stake)); nL - sum(Scrap2$UserID==Labrie1$UserID) MapA1 <- data.frame(UserID=Scrap2$UserID, Rows=Scrap2$Rows, Top=0,Bot=cumsum(Scrap2$Rows)); MapA1$Top <- c(1,1+MapA1$Bot[1:(nL-1)]); Labrie2 <- cbind(Labrie1,Days1M=Scrap1$Days1M, Scrap2[,c("Stake1M","Bets1M","StakeSD1M")]); Labrie2$BPDay1M <- Labrie2$Bets1M / Labrie2$Days1M; Labrie2$Small <- (Scrap1$Days1M!=Scrap1$BetDime); Labrie2$StakeSD1M[is.na(Labrie2$StakeSD1M)] = 0; ############################################################ # Build the table with the values for sum(1:n) and # # sum((1:n)^2) for n's from 1 to 31. # # * SumI = sum(1:n) = n*(n+1)/2. # # * SumI2 = sum((1:n)^2) = n(n+1)(2n+1)/6. # # * Disc is the discriminant for the matrix # # matrix(c(SumI2,SumI,SumI,n),2,2). # # Calculate the trajectory slopes. # # * Each user's trajectory slope was based on a linear # # regression lm(log(Stake)~ActDayNum). # # * ActDayNum was the number of days up to that point # # where the user had bet at least 0.10 on live action # # sports bets. # # * For 21,914 users, each active day had Stake > 0.10. # # For them, ActDayNum was sequential: 1 for the first # # active day, 2 for the second active day, etc. # # For those where ActDayNum was sequential, there is a # # closed form solution for the slope. # # * n is the number of active days. # # * SumY = sum(log(Stake)). # # * SumIY = sum((1:n)*log(Stake)). # # * To get the formula for the slope, start with # # beta = (X'X)^-1 X'Y and fill in # # X = cbind(rep(1,n),1:n) and Y = log(Stake). # # If there is only one active day in the first month, # # then we set the slope to 0. # ############################################################ CombTable <- data.frame(N=1:31,SumI=0.5*(1:31)*(2:32), SumI2=(1:31)*(2:32)*(2*(1:31)+1)/6,Disc=0); CombTable$Disc = CombTable$N*CombTable$SumI2 - (CombTable$SumI)^2; Scrap1 <- ddply(Act1,~UserID,summarise, Days1M=length(Stake),SumY=sum(log(Stake)), SumIY=sum((1:length(Stake))*log(Stake))); Scrap1$Slope1M <- Scrap1$Days1M*Scrap1$SumIY - Scrap1$SumY*CombTable$SumI[Scrap1$Days1M]; Scrap1$Slope1M <- Scrap1$Slope1M / CombTable$Disc[Scrap1$Days1M]; Scrap1$Slope1M[is.na(Scrap1$Slope1M)] = 0; ############################################################ # For the remaining 82 users, we'll have to # # calculate the slope of the regression line honestly. # # * ActDayNum will be mostly sequential, but there'll be # # a hitch where Stake < 0.10. # # * We still have n, the number of active days. # # * We have the matrix X = cbind(rep(1,n),ActDayNum) and # # the vector Y = log(Stake). # # * To get the formula for the slope, # # use beta = (X'X)^-1 X'Y. # # * If there is only one active day in that first month, # # set the slope equal to 0. # # The slopes are in Labrie3$Slope1M. # ############################################################ Scrap2 <- (1:nL)[Labrie2$Small>0]; length(Scrap2) i1 <- length(Scrap2); while (i1 > 0) { j1 <- Scrap2[i1]; Scrap1$Slope1M[j1] = 0; Scrap3 <- MapA1$Top[j1]:MapA1$Bot[j1]; Scrap4 <- data.frame(Stake=Act1$Stake[Scrap3], BetDime=(Act1$Stake[Scrap3]>0.10), ActDayNum=0,Y=0); Scrap4$ActDayNum <- cumsum(Scrap4$BetDime); Scrap4$Y <- log(Scrap4$Stake); X <- cbind(rep(1,MapA1$Rows[j1]),Scrap4$ActDayNum); XPrimeX <- t(X) %*% X; XY <- t(X) %*% Scrap4$Y; if (Scrap4$ActDayNum[nrow(Scrap4)] > 1) { beta <- solve(XPrimeX,XY); Scrap1$Slope1M[j1] = beta[2]; } i1 = i1 - 1; } nL - sum(Scrap1$UserID==Labrie2$UserID) Labrie3 <- cbind(Labrie2,Slope1M=Scrap1$Slope1M); ###################################################################### # Generate the z-scores and the clusters described in Braverman and # # Shaffer (2010) (DOI: 10.1093/eurpub/ckp232). # ###################################################################### # Generate the z-scores for the variables BPDay1M, # # StakesSD1M, Days1M, and Slope1M. # # * We do not know what dataset Braverman and Shaffer used # # to obtain the means and standard deviations. # # * These means and standard deviations are reconstructed # # from z-scores found in a previous version of this # # Transparency project package. # # Variable Mean SD # # BPDay1M 4.24605667 5.211621 # # StakesSD1M 41.30182060 154.314391 # # Days1M 5.36156039 5.682126 # # Slope1M -0.07336332 0.718154 # ############################################################ MeanSD <- data.frame(Variable=c("BPDay1M","StakesSD1M", "Days1M","Slope1M"), Mean=c(4.24605667, 41.30182060,5.36156039,-0.07336332), SD=c( 5.211621,154.314391, 5.682126, 0.718154)); Scrap1 <- data.frame(ZBPDay1M= (Labrie3$BPDay1M-MeanSD$Mean[1])/MeanSD$SD[1], ZStakeSD1M= (Labrie3$StakeSD1M-MeanSD$Mean[2])/MeanSD$SD[2], ZDays1M= (Labrie3$Days1M-MeanSD$Mean[3])/MeanSD$SD[3], ZSlope1M= (Labrie3$Slope1M-MeanSD$Mean[4])/MeanSD$SD[4], stringsAsFactors=FALSE); Labrie4 <- cbind(Labrie3,Scrap1); ############################################################ # Create Julia, the data set with just the 530 users in # # the analytic data set. # # * Braverman and Shaffer performed a k-means cluster # # analysis with k = 4. # # * We will use the cluster centers listed in Table 1 of # # the manuscript to replicate the cluster assignments. # # * Braverman and Shaffer's groups are in the variable # # Julia$Cluster. # ############################################################ Centers <- data.frame(C1=c( 2.63489,1.78653, 4.40874, 0.26706), C2=c(-0.54361,0.03928, 0.15649,-2.48611), C3=c( 2.39258,1.89973, 0.26157, 0.14316), C4=c( 0.27904,0.00430,-0.04460, 0.22323)); Julia <- subset(Labrie4,Labrie4$InR==1) Scrap1 <- Julia[,c("ZDays1M","ZBPDay1M","ZStakeSD1M","ZSlope1M")]; Scrap2 <- data.frame(D1=rep(0,nR),D2=0,D3=0,D4=0,Min=0,Cluster=0); k1 <- 4; while (k1 > 0) { j1 <- 4; while (j1 > 0) { Scrap2[,k1] = Scrap2[,k1] + (Scrap1[,j1]-Centers[j1,k1])^2; j1 = j1 - 1; } k1 = k1 - 1; } Scrap2$Min = apply(Scrap2[,1:4],1,min); Scrap2$Cluster[Scrap2$D1==Scrap2$Min] = 1; Scrap2$Cluster[Scrap2$D2==Scrap2$Min] = 2; Scrap2$Cluster[Scrap2$D3==Scrap2$Min] = 3; Scrap2$Cluster[Scrap2$D4==Scrap2$Min] = 4; Julia$Cluster <- Scrap2$Cluster; ############################################################ # Replicate Table 2 from the manuscript. # ############################################################ Scrap1 <- ddply(Julia,~Cluster,summarise, n=length(Period), MeanPeriod=mean(Period), MeanStake=mean(StakeSP), MeanBets=mean(BetsSP), MeanBPD=mean(BPDaySP), MeanNetLoss=mean(StakeSP)-mean(PrizesSP) ); round(t(Scrap1),digits=2) # [,1] [,2] [,3] [,4] # Cluster 1.00 2.00 3.00 4.00 # n 15.00 22.00 115.00 378.00 # MeanPeriod 447.33 337.86 425.78 360.33 # MeanStake 74085.80 6317.81 23660.03 5444.43 # MeanBets 1165.67 165.91 1995.76 317.58 # MeanBPD 10.65 5.42 12.64 4.92 # MeanNetLoss 4308.62 809.87 1705.85 408.42