######################################################################
# Read in the two files from the Transparency Project package.       #
# * Before starting, change the text in Folder to reflect where      #
#   you stored the two data files.                                   #
# Load in the packages necessary to use the function ddply().        #
# * The R package "plyr" requires other packages. You will have to   #
#   load all of them into R for the ddply() commands to work.        #
######################################################################
Folder <- "C:/Laboratory/Transparency/LiveMonth/";
require(plyr);
############################################################
# The file LiveMonthLiveAction.csv contains the actual     #
#  betting information associated with live action product #
#  for each participant for each calendar day with at      #
#  least one transaction from February 1, 2005 through     #
#  February 28, 2007 or until the account was closed.      #
# * UserID is a numeric ID number bwin assigned to each    #
#   player at the time of registration. The ID numbers in  #
#   this data set range from 1324368 to 1405181.           #
# * Date is the date of the activity.                      #
# * Bets is the number of bets placed that day.            #
# * Stake is the total amount bet that day (in Euros).     #
# * Winnings is the total amount won that day (in Euros).  #
#   Winnings may include payouts from bets made on a       #
#   previous day. For example, if a bet is placed on       #
#   Tuesday for a game played on Wednesday, then the       #
#   winnings will be added to Wednesday's total.           #
# This data file has 849928 rows and 32277 UserID's.       #
############################################################
Live1 <- read.csv(paste0(Folder,"LiveMonthLiveAction.csv"),
                  header=TRUE,stringsAsFactors=FALSE);
   Live1$Date <- as.Date(Live1$Date,format="%Y-%m-%d");
   c(nrow(Live1),length(unique(Live1$UserID)))
############################################################
# The file LiveMonthAnalyticRoster.csv contains            #
#  demographic data and other information for the 530      #
#  players in the analytic dataset.                        #
# * UserID is a numeric ID number bwin assigned to each    #
#   player at the time of registration. The ID numbers in  #
#   this data set range from 1324368 to 1405181.           #
# * CountryID is a numeric code representing players’      #
#   countries of residence. Most are the ISO 3166-1 codes. #
#   The exceptions are 530 for Netherlands Antilles and    #
#   736 for Sudan.                                         #
# * LanguageID is a numeric code representing players’     #
#   primary languages. The numbers range from 1 to 20. See #
#   the table below.                                       #
# * Gender is coded as 0 for female and 1 for male.        #
# * Age is the age in years on the date of registration.   #
# * Dep1Date is the date of the player's first deposit     #
#   with bwin.com in YYYY-MM-DD format.                    #
# * ReasonCA is the reason for self-exclusion.             #
# -- A 1 means unsatisfied with the service.               #
# -- A 2 means not interested.                             #
# -- A 3 means gambling-related problems.                  #
# Group denotes which players went into which partition    #
#  when Braverman and Shaffer partitioned the 530 players  #
#  to test for reliability.                                #
# * There are 274 players in Group A.                      #
# * There are 256 players in Group B.                      #
# Split describes the results of the cluster analyses      #
#  performed on the two partitions.                        #
# * Clusters A1 through A4 come from the 274.              #
# * Clusters B1 through B4 come from the 256.              #
# This file has 530 rows, one for each UserID.             #
############################################################
Roster <- read.csv(paste0(Folder,"LiveMonthAnalyticRoster.csv"),
                   header=TRUE,stringsAsFactors=FALSE);
   Roster$Dep1Date <- as.Date(Roster$Dep1Date,format="%Y-%m-%d");
   nR <- nrow(Roster); nR
######################################################################
# Create Live2 with the data for the users who had at least four     #
#  days of live action sports betting activity.                      #
# * There are 21,996 users and 831,868 rows of data.                 #
# Create the Labrie series with the measures for the 21,996.         #
######################################################################
# Create Labrie1 with the measures covering the entire     #
#  study period.                                           #
# * StakesSP is the total amount bet.                      #
# * PrizesSP is the total amount won.                      #
# * BetsSP is the total number of bets.                    #
# * DaysSP is the number of days within the study period   #
#   with at least one live action sports bet.              #
# * BPDaySP = BetsSP / DaysSP is the average number of     #
#   bets the user places on an active betting day.         #
# * Period is the number of days between the first live    #
#   action sports bet and the last live action sports bet, #
#   inclusive.                                             #
# * Liv1Date is the date of the user's first live action   #
#   sports bet.                                            #
# * DayNum is a placeholder variable to be used later.     #
# * InR is equal to 1 if the user is one of the 530 in     #
#   Roster and equal to 0 otherwise.                       #
############################################################
Labrie1 <- ddply(Live1,~UserID,summarise,
                 StakeSP=sum(Stake),PrizesSP=sum(Winnings),
                 BetsSP=sum(Bets),DaysSP=sum(Stake>0),
                 Period=1+as.numeric(difftime(max(Date),min(Date),
                                              units="days")),
                 Liv1Date=min(Date));
   Labrie1 <- subset(Labrie1,Labrie1$DaysSP>=4);
   nL <- nrow(Labrie1); nL
Labrie1 <- merge(Labrie1,data.frame(UserID=Roster$UserID,
                                    DayNum=0,InR=1),
                 all.x=TRUE,all.y=FALSE,by=c("UserID"));
   Labrie1$InR[is.na(Labrie1$InR)] = 0;
   Labrie1$BPDaySP = Labrie1$BetsSP / Labrie1$DaysSP;
   Labrie1 <- Labrie1[,c("UserID","StakeSP","PrizesSP","BetsSP",
                         "DaysSP","BPDaySP","Period","Liv1Date",
                         "DayNum","InR")];
Live2 <- merge(Live1,Labrie1[,c("UserID","InR")],
               all.x=FALSE,all.y=TRUE,by=c("UserID"));
   Live2 <- Live2[order(Live2$UserID,Live2$Date),];
   c(nL,length(unique(Live2$UserID)),nrow(Live2))
############################################################
# Add in the day numbers for each betting day.             #
# * The day of the first bet is DayNum = 1.                #
# * The next day is DayNum = 2.                            #
# Map2 links rows of Live2 to users for easy reference.    #
# * Rows is the number of rows of Live2 the user has.      #
# * Top is the user's top row in Live2.                    #
# * Bot is the user's bottom row in Live2.                 #
############################################################
Scrap1 <- table(Live2$UserID);
Map2 <- data.frame(UserID=as.numeric(rownames(Scrap1)),
                   Rows=as.numeric(Scrap1),Top=0,Bot=0);
   Map2$Bot <- cumsum(Map2$Rows);
   Map2$Top <- c(1,1+Map2$Bot[1:(nL-1)]);
i1 <- nL; while (i1 > 0) {
   Scrap1 <- Map2$Top[i1]:Map2$Bot[i1];
   Live2$DayNum[Scrap1] = 1 + as.numeric(difftime(Live2$Date[Scrap1],
                                                Labrie1$Liv1Date[i1],
                                                units="days"));
   i1 = i1 - 1;
}
############################################################
# Create the data set containing only the rows in Labrie1  #
#  corresponding each user's DayNum = 1 through 31.        #
# * Month1 has all 21,996 people and 159,044 rows.         #
# Create the dataset containing only rows of Month1 with   #
#  at least one live action sports bet.                    #
# * Act1 contains 21,966 people and 158,024 rows.          #
# Labrie2 is Labrie1 with measures covering those          #
#  first 31 days.                                          #
# * Stake1M is the total amount bet on live action sports  #
#   bets over the first 31 days.                           #
# * Bets1M is the total number of bets on live action      #
#   sports over the first 31 days.                         #
# * Days1M is the number of days out of those first 31     #
#   with at least one live action sports bet.              #
# * BPDay1M = Bets1M / Days1M is the number of bets per    #
#   active day over the course of the study period.        #
# * Small indicates if the account closer had at least one #
#   day during that first month where 0.00 < Stake < 0.10. #
# * StakeSD1M is the standard deviation of the values of   #
#   Stake over the active days (Stake > 0) within those    #
#   first 31 days.                                         #
# -- If the user only has one active day over that first   #
#    month, then we set StakeSD1M to 0.                    #
############################################################
Month1 <- subset(Live2,Live2$DayNum<=31);
   c(length(unique(Month1$UserID)),nrow(Month1))
Scrap1 <- ddply(Month1,~UserID,summarise,
                Days1M=sum(Stake>0),BetDime=sum(Stake>0.1));
   nL - sum(Scrap1$UserID==Labrie1$UserID)
   (1:nL)[Scrap1$Days1M!=Scrap1$BetDime]
Act1 <- subset(Month1,Month1$Stake>0);
   Act1 <- Act1[order(Act1$UserID,Act1$Date),];
   nA <- nrow(Act1); nA
Scrap2 <- ddply(Act1,~UserID,summarise,
                Rows=length(Stake),Stake1M=sum(Stake),
                Bets1M=sum(Bets),StakeSD1M=sd(Stake));
   nL - sum(Scrap2$UserID==Labrie1$UserID)
MapA1 <- data.frame(UserID=Scrap2$UserID,
                    Rows=Scrap2$Rows,
                    Top=0,Bot=cumsum(Scrap2$Rows));
   MapA1$Top <- c(1,1+MapA1$Bot[1:(nL-1)]);
Labrie2 <- cbind(Labrie1,Days1M=Scrap1$Days1M,
                 Scrap2[,c("Stake1M","Bets1M","StakeSD1M")]);
   Labrie2$BPDay1M <- Labrie2$Bets1M / Labrie2$Days1M;
   Labrie2$Small <- (Scrap1$Days1M!=Scrap1$BetDime);
   Labrie2$StakeSD1M[is.na(Labrie2$StakeSD1M)] = 0;
############################################################
# Build the table with the values for sum(1:n) and         #
#  sum((1:n)^2) for n's from 1 to 31.                      #
# * SumI = sum(1:n) = n*(n+1)/2.                           #
# * SumI2 = sum((1:n)^2) = n(n+1)(2n+1)/6.                 #
# * Disc is the discriminant for the matrix                #
#    matrix(c(SumI2,SumI,SumI,n),2,2).                     #
# Calculate the trajectory slopes.                         #
# * Each user's trajectory slope was based on a linear     #
#   regression lm(log(Stake)~ActDayNum).                   #
# * ActDayNum was the number of days up to that point      #
#   where the user had bet at least 0.10 on live action    #
#   sports bets.                                           #
# * For 21,914 users, each active day had Stake > 0.10.    #
#   For them, ActDayNum was sequential: 1 for the first    #
#   active day, 2 for the second active day, etc.          #
# For those where ActDayNum was sequential, there is a     #
#   closed form solution for the slope.                    #
# * n is the number of active days.                        #
# * SumY = sum(log(Stake)).                                #
# * SumIY = sum((1:n)*log(Stake)).                         #
# * To get the formula for the slope, start with           #
#   beta = (X'X)^-1 X'Y and fill in                        #
#   X = cbind(rep(1,n),1:n) and Y = log(Stake).            #
# If there is only one active day in the first month,      #
#  then we set the slope to 0.                             #
############################################################
CombTable <- data.frame(N=1:31,SumI=0.5*(1:31)*(2:32),
                        SumI2=(1:31)*(2:32)*(2*(1:31)+1)/6,Disc=0);
   CombTable$Disc = CombTable$N*CombTable$SumI2 - (CombTable$SumI)^2;
Scrap1 <- ddply(Act1,~UserID,summarise,
                Days1M=length(Stake),SumY=sum(log(Stake)),
                SumIY=sum((1:length(Stake))*log(Stake)));
Scrap1$Slope1M <- Scrap1$Days1M*Scrap1$SumIY -
                            Scrap1$SumY*CombTable$SumI[Scrap1$Days1M];
   Scrap1$Slope1M <- Scrap1$Slope1M / CombTable$Disc[Scrap1$Days1M];
   Scrap1$Slope1M[is.na(Scrap1$Slope1M)] = 0;
############################################################
# For the remaining 82 users, we'll have to                #
#  calculate the slope of the regression line honestly.    #
# * ActDayNum will be mostly sequential, but there'll be   #
#   a hitch where Stake < 0.10.                            #
# * We still have n, the number of active days.            #
# * We have the matrix X = cbind(rep(1,n),ActDayNum) and   #
#   the vector Y = log(Stake).                             #
# * To get the formula for the slope,                      #
#   use beta = (X'X)^-1 X'Y.                               #
# * If there is only one active day in that first month,   #
#   set the slope equal to 0.                              #
# The slopes are in Labrie3$Slope1M.                       #
############################################################
Scrap2 <- (1:nL)[Labrie2$Small>0]; length(Scrap2)
i1 <- length(Scrap2); while (i1 > 0) {
   j1 <- Scrap2[i1]; Scrap1$Slope1M[j1] = 0;
   Scrap3 <- MapA1$Top[j1]:MapA1$Bot[j1];
   Scrap4 <- data.frame(Stake=Act1$Stake[Scrap3],
                        BetDime=(Act1$Stake[Scrap3]>0.10),
                        ActDayNum=0,Y=0);
      Scrap4$ActDayNum <- cumsum(Scrap4$BetDime);
      Scrap4$Y <- log(Scrap4$Stake);
   X <- cbind(rep(1,MapA1$Rows[j1]),Scrap4$ActDayNum);
   XPrimeX <- t(X) %*% X;
   XY <- t(X) %*% Scrap4$Y;
   if (Scrap4$ActDayNum[nrow(Scrap4)] > 1) {
      beta <- solve(XPrimeX,XY);
      Scrap1$Slope1M[j1] = beta[2];
   }
   i1 = i1 - 1;
}
nL - sum(Scrap1$UserID==Labrie2$UserID)
Labrie3 <- cbind(Labrie2,Slope1M=Scrap1$Slope1M);
######################################################################
# Generate the z-scores and the clusters described in Braverman and  #
#  Shaffer (2010) (DOI: 10.1093/eurpub/ckp232).                      #
######################################################################
# Generate the z-scores for the variables BPDay1M,         #
#  StakesSD1M, Days1M, and Slope1M.                        #
# * We do not know what dataset Braverman and Shaffer used #
#   to obtain the means and standard deviations.           #
# * These means and standard deviations are reconstructed  #
#   from z-scores found in a previous version of this      #
#   Transparency project package.                          #
#                          Variable        Mean         SD #
#                           BPDay1M  4.24605667   5.211621 #
#                        StakesSD1M 41.30182060 154.314391 #
#                            Days1M  5.36156039   5.682126 #
#                           Slope1M -0.07336332   0.718154 #
############################################################
MeanSD <- data.frame(Variable=c("BPDay1M","StakesSD1M",
                                "Days1M","Slope1M"),
             Mean=c(4.24605667, 41.30182060,5.36156039,-0.07336332),
               SD=c(  5.211621,154.314391,  5.682126,   0.718154));
Scrap1 <- data.frame(ZBPDay1M=
                        (Labrie3$BPDay1M-MeanSD$Mean[1])/MeanSD$SD[1],
                     ZStakeSD1M=
                      (Labrie3$StakeSD1M-MeanSD$Mean[2])/MeanSD$SD[2],
                     ZDays1M=
                         (Labrie3$Days1M-MeanSD$Mean[3])/MeanSD$SD[3],
                     ZSlope1M=
                        (Labrie3$Slope1M-MeanSD$Mean[4])/MeanSD$SD[4],
                    stringsAsFactors=FALSE);
Labrie4 <- cbind(Labrie3,Scrap1);
############################################################
# Create Julia, the data set with just the 530 users in    #
#  the analytic data set.                                  #
# * Braverman and Shaffer performed a k-means cluster      #
#   analysis with k = 4.                                   #
# * We will use the cluster centers listed in Table 1 of   #
#   the manuscript to replicate the cluster assignments.   #
# * Braverman and Shaffer's groups are in the variable     #
#   Julia$Cluster.                                         #
############################################################
Centers <- data.frame(C1=c( 2.63489,1.78653, 4.40874, 0.26706),
                      C2=c(-0.54361,0.03928, 0.15649,-2.48611),
                      C3=c( 2.39258,1.89973, 0.26157, 0.14316),
                      C4=c( 0.27904,0.00430,-0.04460, 0.22323));
Julia <- subset(Labrie4,Labrie4$InR==1)
   Scrap1 <- Julia[,c("ZDays1M","ZBPDay1M","ZStakeSD1M","ZSlope1M")];
   Scrap2 <- data.frame(D1=rep(0,nR),D2=0,D3=0,D4=0,Min=0,Cluster=0);
k1 <- 4; while (k1 > 0) {
   j1 <- 4; while (j1 > 0) {
      Scrap2[,k1] = Scrap2[,k1] + (Scrap1[,j1]-Centers[j1,k1])^2;
      j1 = j1 - 1;
   }
   k1 = k1 - 1;
}
Scrap2$Min = apply(Scrap2[,1:4],1,min);
   Scrap2$Cluster[Scrap2$D1==Scrap2$Min] = 1;
   Scrap2$Cluster[Scrap2$D2==Scrap2$Min] = 2;
   Scrap2$Cluster[Scrap2$D3==Scrap2$Min] = 3;
   Scrap2$Cluster[Scrap2$D4==Scrap2$Min] = 4;
Julia$Cluster <- Scrap2$Cluster;
############################################################
# Replicate Table 2 from the manuscript.                   #
############################################################
Scrap1 <- ddply(Julia,~Cluster,summarise,
                n=length(Period),
                MeanPeriod=mean(Period),
                MeanStake=mean(StakeSP),
                MeanBets=mean(BetsSP),
                MeanBPD=mean(BPDaySP),
                MeanNetLoss=mean(StakeSP)-mean(PrizesSP)
               ); round(t(Scrap1),digits=2)
#                 [,1]    [,2]     [,3]    [,4]
# Cluster         1.00    2.00     3.00    4.00
# n              15.00   22.00   115.00  378.00
# MeanPeriod    447.33  337.86   425.78  360.33
# MeanStake   74085.80 6317.81 23660.03 5444.43
# MeanBets     1165.67  165.91  1995.76  317.58
# MeanBPD        10.65    5.42    12.64    4.92
# MeanNetLoss  4308.62  809.87  1705.85  408.42