# Reproducibility check for the PopTrendsB data set.
#
# The script reads the three CSV files (daily fixed-odds bets, daily
# live-action bets, per-user aggregates), prints the descriptive statistics
# that are compared against Section 2.1, and then rebuilds the per-user totals
# from the daily files to compare them with the totals stored in Aggs.
# Bare expressions are left unassigned on purpose so that their values are
# printed when the script is run at top level.

Folder <- "C:/Laboratory/Transparency/PopTrendsB/"

# library() stops with an error when a package is missing; require() would
# only return FALSE and let the script fail later at the first ddply() call.
# plyr is attached before dplyr, the same order as in the original script.
library(plyr)
library(dplyr)

######################################################################
# Read in the three data files.                                      #
######################################################################

# Daily fixed-odds records: one row per UserID and DateBet.
Raw <- read.csv(paste0(Folder, "PopTrendsBData1Fixed.csv"),
                header = TRUE, stringsAsFactors = FALSE)
Fixed <- data.frame(UserID = Raw$UserID,
                    DateBet = as.Date(Raw$DateBet),
                    StakeF = Raw$StakeF, WinF = Raw$WinF,
                    BetsF = Raw$BetsF, stringsAsFactors = FALSE)
nFixed <- nrow(Fixed)
# First and last betting dates in the fixed-odds file.
c(min(Fixed$DateBet), max(Fixed$DateBet))

# Daily live-action records, same layout with the L suffix.
Raw <- read.csv(paste0(Folder, "PopTrendsBData2Live.csv"),
                header = TRUE, stringsAsFactors = FALSE)
Live <- data.frame(UserID = Raw$UserID,
                   DateBet = as.Date(Raw$DateBet),
                   StakeL = Raw$StakeL, WinL = Raw$WinL,
                   BetsL = Raw$BetsL, stringsAsFactors = FALSE)
nLive <- nrow(Live)
# First and last betting dates in the live-action file.
c(min(Live$DateBet), max(Live$DateBet))

# Per-user aggregates: demographics, key dates, and totals for fixed odds (F),
# live action (L) and the A-suffixed columns.
Raw <- read.csv(paste0(Folder, "PopTrendsBData3Aggs.csv"),
                header = TRUE, stringsAsFactors = FALSE)
Aggs <- data.frame(UserID = Raw$UserID, CountryID = Raw$CountryID,
                   Gender = Raw$Gender, BirthYear = Raw$BirthYear,
                   DateReg = as.Date(Raw$DateReg),
                   TimeReg = Raw$TimeReg,
                   Date1Dep = as.Date(Raw$Date1Dep),
                   Date1Bet = as.Date(Raw$Date1Bet),
                   Date1Spo = as.Date(Raw$Date1Spo),
                   StakeF = Raw$StakeF, StakeL = Raw$StakeL,
                   StakeA = Raw$StakeA,
                   WinF = Raw$WinF, WinL = Raw$WinL,
                   WinA = Raw$WinA,
                   BetsF = Raw$BetsF, BetsL = Raw$BetsL,
                   BetsA = Raw$BetsA,
                   DaysF = Raw$DaysF, DaysL = Raw$DaysL,
                   DaysA = Raw$DaysA,
                   stringsAsFactors = FALSE)
nAggs <- nrow(Aggs)
# Ranges of user IDs, birth years and registration dates.
quantile(probs = c(0, 1), x = Aggs$UserID)
quantile(probs = c(0, 1), x = Aggs$BirthYear, na.rm = TRUE)
c(min(Aggs$DateReg), max(Aggs$DateReg))

############################################################
# The statistics in Section 2.1 check out.                 #
############################################################

# The gender counts and percentages check out.
# Printed as: n, count and percent with Gender == 1, count and percent with
# Gender == 0.
c(nAggs, sum(Aggs$Gender == 1),
  round(100 * sum(Aggs$Gender == 1) / nAggs, digits = 1),
  sum(Aggs$Gender == 0),
  round(100 * sum(Aggs$Gender == 0) / nAggs, digits = 1))

# The minimum and maximum ages check out.
# The mean and standard deviation are off because here I'm using
# birth years and the original analyses used birth dates.
# Printed as: minimum age, maximum age, mean age, standard deviation, with
# age computed as 2005 minus the birth year.
c(2005 - max(Aggs$BirthYear, na.rm = TRUE),
  2005 - min(Aggs$BirthYear, na.rm = TRUE),
  2005 - mean(Aggs$BirthYear, na.rm = TRUE),
  sd(Aggs$BirthYear, na.rm = TRUE))

# The number of countries and the percentage of Germans check out.
# CountryID 276 is the code counted as German here.
c(length(unique(Aggs$CountryID)),
  sum(Aggs$CountryID == 276),
  round(100 * sum(Aggs$CountryID == 276) / nAggs, digits = 1))

# The counts and percentages of people who played fixed odds
# and live action sports betting check out.
# Printed as: count and percent who placed both kinds of bets, who placed no
# live-action bets, and who placed no fixed-odds bets.
c(sum(Aggs$BetsF * Aggs$BetsL > 0),
  round(100 * sum(Aggs$BetsF * Aggs$BetsL > 0) / nAggs, digits = 1),
  sum(Aggs$BetsL == 0),
  round(100 * sum(Aggs$BetsL == 0) / nAggs, digits = 1),
  sum(Aggs$BetsF == 0),
  round(100 * sum(Aggs$BetsF == 0) / nAggs, digits = 1))

############################################################
# Construct daily aggregates from Fixed and Live to show   #
# that they match the numbers in Aggs.                     #
# * We have to remove data from before subscribers' dates  #
#   of first deposit.                                      #
############################################################

# --- Fixed odds ---
# Attach each user's first-deposit date and keep only bets on or after it.
FixedC <- merge(Fixed, Aggs[, c("UserID", "Date1Dep")],
                all.x = TRUE, all.y = FALSE, by = c("UserID"))
FixedC <- subset(FixedC, FixedC$DateBet >= FixedC$Date1Dep)
# Per-user totals rebuilt from the daily records.
AggsF2 <- ddply(FixedC, ~UserID, summarise,
                StakeF = sum(StakeF), WinF = sum(WinF),
                BetsF = sum(BetsF))
# Stored totals for users with at least one fixed-odds bet. The stake column
# is named StakeF in Aggs; selecting "StakesF" fails with
# "undefined columns selected".
AggsF1 <- subset(Aggs[, c("UserID", "StakeF", "WinF", "BetsF")],
                 Aggs$BetsF > 0)
# The differences below are taken row by row, so put both tables in the same
# UserID order first instead of relying on the order of the input file.
AggsF1 <- AggsF1[order(AggsF1$UserID), ]
AggsF2 <- AggsF2[order(AggsF2$UserID), ]
# Minimum and maximum difference; both are 0 when the totals agree.
quantile(AggsF1$StakeF - AggsF2$StakeF, c(0, 1))
quantile(AggsF1$WinF - AggsF2$WinF, c(0, 1))
quantile(AggsF1$BetsF - AggsF2$BetsF, c(0, 1))

# --- Live action ---
# Same procedure for the live-action records.
LiveC <- merge(Live, Aggs[, c("UserID", "Date1Dep")],
               all.x = TRUE, all.y = FALSE, by = c("UserID"))
LiveC <- subset(LiveC, LiveC$DateBet >= LiveC$Date1Dep)
# The winnings total must be named WinL (it was mislabelled WinF), otherwise
# AggsL2$WinL is NULL and the WinL comparison below is empty.
AggsL2 <- ddply(LiveC, ~UserID, summarise,
                StakeL = sum(StakeL), WinL = sum(WinL),
                BetsL = sum(BetsL))
# The stake column is named StakeL in Aggs, not "StakesL".
AggsL1 <- subset(Aggs[, c("UserID", "StakeL", "WinL", "BetsL")],
                 Aggs$BetsL > 0)
# Align both tables on UserID before the row-by-row comparison.
AggsL1 <- AggsL1[order(AggsL1$UserID), ]
AggsL2 <- AggsL2[order(AggsL2$UserID), ]
quantile(AggsL1$StakeL - AggsL2$StakeL, c(0, 1))
quantile(AggsL1$WinL - AggsL2$WinL, c(0, 1))
quantile(AggsL1$BetsL - AggsL2$BetsL, c(0, 1))