# This script is made freely available under license CC-BY-4.0 # by Prof. Patrick E Meyer. Liege, Belgium # # The impact of COVID-19 vaccines on all-cause mortality in EU in 2021: a machine-learning perspective # Rscript of the statistical analysis of Patrick E. Meyer. December 2021. # ####################################################################### ##### STATISTICAL Analysis ############################################ ####################################################################### ## This section on data loading has been updated but the statistical analysis remains the same library(randomForest) dataFolder<-"~/covid/" #folder of the .csv data files and results.txt dataFile<-c("data0to14.csv","data15to44.csv","data45to64.csv") #names of the 3 input files read.csv(paste(dataFolder,dataFile[1],sep=""))->data0to14 read.csv(paste(dataFolder,dataFile[2],sep=""))->data15to44 read.csv(paste(dataFolder,dataFile[3],sep=""))->data45to64 myfile<-paste(dataFolder,"results.txt",sep="") ################## 2021 data analysis ########## ################## Correlation Table ########### meth<-"pearson" cormat<-matrix(nrow=3,ncol=7) cormat[1,]<-c(cor(data0to14[,1],data0to14[,5:9],method=meth),cor(data0to14[,9],data0to14[,7:8],method=meth)) cormat[2,]<-c(cor(data15to44[,1],data15to44[,5:9],method=meth),cor(data15to44[,9],data15to44[,7:8],method=meth)) cormat[3,]<-c(cor(data45to64[,1],data45to64[,5:9],method=meth),cor(data45to64[,9],data45to64[,7:8],method=meth)) colnames(cormat)<-c(names(data0to14)[c(5:9,7:8)]) rownames(cormat)<-c("0-14","15-44","45-64") write("Pearson correlation table \n",file=myfile,append=F) write.table(format(cormat,digits=2),file=myfile,sep="\t",quote=F,row.names=T,append=T) ################### Strong Relevance Table ############## set.seed(100) #oob MSE has a random component, CV is generally not necessary with boosting, anyway here it does not change results strongrel<-matrix(ncol=6,nrow=7) full<-ZscoreCurrent~ZscorePast1Y+ZscorePast2Y+CovDeathRate+DoseRate+CaseRate f19<-ZscoreCurrent~ZscorePast1Y+CaseRate+CovDeathRate+DoseRate f20<-ZscoreCurrent~ZscorePast2Y+CaseRate+CovDeathRate+DoseRate fCDR<-ZscoreCurrent~ZscorePast1Y+ZscorePast2Y+DoseRate+CaseRate fcr<-ZscoreCurrent~ZscorePast1Y+ZscorePast2Y+CovDeathRate+DoseRate fdr<-ZscoreCurrent~ZscorePast1Y+ZscorePast2Y+CovDeathRate+CaseRate mydata<-data0to14 strongrel[1,1]<-glm(full,data=mydata)[[11]] strongrel[1,2]<-glm(f19,data=mydata)[[11]] strongrel[1,3]<-glm(f20,data=mydata)[[11]] strongrel[1,4]<-glm(fCDR,data=mydata)[[11]] strongrel[1,5]<-glm(fcr,data=mydata)[[11]] strongrel[1,6]<-glm(fdr,data=mydata)[[11]] strongrel[2,1]<-randomForest(full,mydata)$mse[500] strongrel[2,2]<-randomForest(f19,mydata)$mse[500] strongrel[2,3]<-randomForest(f20,mydata)$mse[500] strongrel[2,4]<-randomForest(fCDR,mydata)$mse[500] strongrel[2,5]<-randomForest(fcr,mydata)$mse[500] strongrel[2,6]<-randomForest(fdr,mydata)$mse[500] mydata<-data15to44 strongrel[3,1]<-glm(full,data=mydata)[[11]] strongrel[3,2]<-glm(f19,data=mydata)[[11]] strongrel[3,3]<-glm(f20,data=mydata)[[11]] strongrel[3,4]<-glm(fCDR,data=mydata)[[11]] strongrel[3,5]<-glm(fcr,data=mydata)[[11]] strongrel[3,6]<-glm(fdr,data=mydata)[[11]] strongrel[4,1]<-randomForest(full,mydata)$mse[500] strongrel[4,2]<-randomForest(f19,mydata)$mse[500] strongrel[4,3]<-randomForest(f20,mydata)$mse[500] strongrel[4,4]<-randomForest(fCDR,mydata)$mse[500] strongrel[4,5]<-randomForest(fcr,mydata)$mse[500] strongrel[4,6]<-randomForest(fdr,mydata)$mse[500] mydata<-data45to64 strongrel[5,1]<-glm(full,data=mydata)[[11]] strongrel[5,2]<-glm(f19,data=mydata)[[11]] strongrel[5,3]<-glm(f20,data=mydata)[[11]] strongrel[5,4]<-glm(fCDR,data=mydata)[[11]] strongrel[5,5]<-glm(fcr,data=mydata)[[11]] strongrel[5,6]<-glm(fdr,data=mydata)[[11]] strongrel[6,1]<-randomForest(full,mydata)$mse[500] strongrel[6,2]<-randomForest(f19,mydata)$mse[500] strongrel[6,3]<-randomForest(f20,mydata)$mse[500] strongrel[6,4]<-randomForest(fCDR,mydata)$mse[500] strongrel[6,5]<-randomForest(fcr,mydata)$mse[500] strongrel[6,6]<-randomForest(fdr,mydata)$mse[500] #Difference of values less than epsilon are considered ties (by defaut, epsilon is one half procent) #This was suggested for robustness of the sign rank difference test replaceCloseValuesWithTies<-function(mat,refcol,lineomit,epsilon=0.005){ refvalues<-matrix(mat[,refcol],nc=(NCOL(mat)-1),nr=NROW(mat)) diff<-(mat[,-refcol]-refvalues)/refvalues diff<-diff[-lineomit,] mat[-lineomit,-refcol][(diff^2)<=(epsilon^2)]<-refvalues[-lineomit,][(diff^2)<=(epsilon^2)] mat } strongrel<-replaceCloseValuesWithTies(strongrel,1,7,0.005) strongrel[7,]<-c(NA,as.numeric(sapply(data.frame(strongrel)[-7,-1],wilcox.test,strongrel[-7,1],alternative="greater",paired=T)[3,])) colnames(strongrel)<-c("All vars","-2019","-2020","-CovDthRate","-CaseRate","-DoseRate") rownames(strongrel)<-c("AIC-0-14","MSE-0-14","AIC-15-44","MSE-15-44","AIC-45-64","MSE-45-64","p-values") write("\n strong relevance table \n",file=myfile,append=T) write.table(format(strongrel,digits=3),file=myfile,sep="\t",quote=F,row.names=T,append=T) ################################################################################## ###################### Model with only strong rel variables ####################### set.seed(50) mat20<-matrix(ncol=5,nrow=6) f3<-ZscoreCurrent~ZscorePast1Y+DoseRate+CovDeathRate f4<-ZscoreCurrent~ZscorePast1Y+CaseRate+CovDeathRate f5<-ZscoreCurrent~ZscorePast1Y+DoseRate f6<-ZscoreCurrent~ZscorePast1Y+CaseRate mydata<-data0to14 mat20[1,1]<-strongrel[1,1] mat20[2,1]<-strongrel[2,1] mat20[1,2]<-glm(f3,data=mydata)[[11]] mat20[2,2]<-randomForest(f3,mydata)$mse[500] mat20[1,3]<-glm(f4,data=mydata)[[11]] mat20[2,3]<-randomForest(f4,mydata)$mse[500] mat20[1,4]<-glm(f5,data=mydata)[[11]] mat20[2,4]<-randomForest(f5,mydata)$mse[500] mat20[1,5]<-glm(f6,data=mydata)[[11]] mat20[2,5]<-randomForest(f6,mydata)$mse[500] mydata<-data15to44 mat20[3,1]<-strongrel[3,1] mat20[4,1]<-strongrel[4,1] mat20[3,2]<-glm(f3,data=mydata)[[11]] mat20[4,2]<-randomForest(f3,mydata)$mse[500] mat20[3,3]<-glm(f4,data=mydata)[[11]] mat20[4,3]<-randomForest(f4,mydata)$mse[500] mat20[3,4]<-glm(f5,data=mydata)[[11]] mat20[4,4]<-randomForest(f5,mydata)$mse[500] mat20[3,5]<-glm(f6,data=mydata)[[11]] mat20[4,5]<-randomForest(f6,mydata)$mse[500] mydata<-data45to64 mat20[5,1]<-strongrel[5,1] mat20[6,1]<-strongrel[6,1] mat20[5,2]<-glm(f3,data=mydata)[[11]] mat20[6,2]<-randomForest(f3,mydata)$mse[500] mat20[5,3]<-glm(f4,data=mydata)[[11]] mat20[6,3]<-randomForest(f4,mydata)$mse[500] mat20[5,4]<-glm(f5,data=mydata)[[11]] mat20[6,4]<-randomForest(f5,mydata)$mse[500] mat20[5,5]<-glm(f6,data=mydata)[[11]] mat20[6,5]<-randomForest(f6,mydata)$mse[500] #mat20[7,]<-c(NA,as.numeric(sapply(data.frame(mat20)[-7,-1],wilcox.test,mat20[-7,1],alternative="greater",paired=T)[3,])) colnames(mat20)<-c("All vars","All strong","CR vs DR","2020+DR","2020+CR") rownames(mat20)<-c("AIC-0-14","MSE-0-14","AIC-15-44","MSE-15-44","AIC-45-64","MSE-45-64") write("\n relevance inversion table \n",file=myfile,append=T) write.table(format(mat20,digits=2),file=myfile,sep="\t",quote=F,row.names=T,append=T) ################ Error Ratios ##################################################### set.seed(50) mat19<-matrix(ncol=5,nrow=6) f3<-ZscoreCurrent~ZscorePast2Y+DoseRate+CovDeathRate f4<-ZscoreCurrent~ZscorePast2Y+CaseRate+CovDeathRate f5<-ZscoreCurrent~ZscorePast2Y+DoseRate f6<-ZscoreCurrent~ZscorePast2Y+CaseRate mydata<-data0to14 mat19[1,2]<-glm(f3,data=mydata)[[11]] mat19[2,2]<-randomForest(f3,mydata)$mse[500] mat19[1,3]<-glm(f4,data=mydata)[[11]] mat19[2,3]<-randomForest(f4,mydata)$mse[500] mat19[1,4]<-glm(f5,data=mydata)[[11]] mat19[2,4]<-randomForest(f5,mydata)$mse[500] mat19[1,5]<-glm(f6,data=mydata)[[11]] mat19[2,5]<-randomForest(f6,mydata)$mse[500] mydata<-data15to44 mat19[3,2]<-glm(f3,data=mydata)[[11]] mat19[4,2]<-randomForest(f3,mydata)$mse[500] mat19[3,3]<-glm(f4,data=mydata)[[11]] mat19[4,3]<-randomForest(f4,mydata)$mse[500] mat19[3,4]<-glm(f5,data=mydata)[[11]] mat19[4,4]<-randomForest(f5,mydata)$mse[500] mat19[3,5]<-glm(f6,data=mydata)[[11]] mat19[4,5]<-randomForest(f6,mydata)$mse[500] mydata<-data45to64 mat19[5,2]<-glm(f3,data=mydata)[[11]] mat19[6,2]<-randomForest(f3,mydata)$mse[500] mat19[5,3]<-glm(f4,data=mydata)[[11]] mat19[6,3]<-randomForest(f4,mydata)$mse[500] mat19[5,4]<-glm(f5,data=mydata)[[11]] mat19[6,4]<-randomForest(f5,mydata)$mse[500] mat19[5,5]<-glm(f6,data=mydata)[[11]] mat19[6,5]<-randomForest(f6,mydata)$mse[500] set.seed(50) mat1920<-matrix(ncol=5,nrow=6) f3<-ZscoreCurrent~ZscorePast1Y+ZscorePast2Y+DoseRate+CovDeathRate f4<-ZscoreCurrent~ZscorePast1Y+ZscorePast2Y+CaseRate+CovDeathRate f5<-ZscoreCurrent~ZscorePast1Y+ZscorePast2Y+DoseRate f6<-ZscoreCurrent~ZscorePast1Y+ZscorePast2Y+CaseRate mydata<-data0to14 mat1920[1,2]<-glm(f3,data=mydata)[[11]] mat1920[2,2]<-randomForest(f3,mydata)$mse[500] mat1920[1,3]<-glm(f4,data=mydata)[[11]] mat1920[2,3]<-randomForest(f4,mydata)$mse[500] mat1920[1,4]<-glm(f5,data=mydata)[[11]] mat1920[2,4]<-randomForest(f5,mydata)$mse[500] mat1920[1,5]<-glm(f6,data=mydata)[[11]] mat1920[2,5]<-randomForest(f6,mydata)$mse[500] mydata<-data15to44 mat1920[3,2]<-glm(f3,data=mydata)[[11]] mat1920[4,2]<-randomForest(f3,mydata)$mse[500] mat1920[3,3]<-glm(f4,data=mydata)[[11]] mat1920[4,3]<-randomForest(f4,mydata)$mse[500] mat1920[3,4]<-glm(f5,data=mydata)[[11]] mat1920[4,4]<-randomForest(f5,mydata)$mse[500] mat1920[3,5]<-glm(f6,data=mydata)[[11]] mat1920[4,5]<-randomForest(f6,mydata)$mse[500] mydata<-data45to64 mat1920[5,2]<-glm(f3,data=mydata)[[11]] mat1920[6,2]<-randomForest(f3,mydata)$mse[500] mat1920[5,3]<-glm(f4,data=mydata)[[11]] mat1920[6,3]<-randomForest(f4,mydata)$mse[500] mat1920[5,4]<-glm(f5,data=mydata)[[11]] mat1920[6,4]<-randomForest(f5,mydata)$mse[500] mat1920[5,5]<-glm(f6,data=mydata)[[11]] mat1920[6,5]<-randomForest(f6,mydata)$mse[500] ratio1<-matrix(ncol=6,nrow=6) colnames(ratio1)<-c("20 CDR (DR / CR)","20 (CR / DR)","19 CDR (DR / CR)","19 (CR / DR)","1920 CDR (DR / CR)","1920 (CR / DR)") rownames(ratio1)<-c("AIC-0-14","MSE-0-14","AIC-15-44","MSE-15-44","AIC-45-64","MSE-45-64") ratio1[,1]<-mat20[,3]/mat20[,2] ratio1[,2]<-mat20[,4]/mat20[,5] ratio1[,3]<-mat19[,3]/mat19[,2] ratio1[,4]<-mat19[,4]/mat19[,5] ratio1[,5]<-mat1920[,3]/mat1920[,2] ratio1[,6]<-mat1920[,4]/mat1920[,5] write("\n relevance inversion using error-ratio \n",file=myfile,append=T) write.table(format(ratio1,digits=2,scientific=F),file=myfile,sep="\t",quote=F,row.names=T,append=T)