diff --git a/survey.R b/survey.R new file mode 100644 index 0000000000000000000000000000000000000000..7a4c406bd38c111476a3f1cf8c49f61dcf0b23e1 --- /dev/null +++ b/survey.R @@ -0,0 +1,131 @@ +library(survey) +options(na.action="na.pass") + +# from survey package, R/surveyrep.R +# slightly patched to use numeric NAs in matrices and avoid over-decorating result +svymean <- function(x,design, na.rm=FALSE, rho=NULL, + return.replicates=FALSE,deff=FALSE,...) +{ + # if (!exists(".Generic",inherits=FALSE)) + # .Deprecated("svymean") + if (!inherits(design,"svyrep.design")) stop("design is not a replicate survey design") + + if (inherits(x,"formula")){ + ## do the right thing with factors + mf<-model.frame(x,design$variables,na.action=na.pass) + xx<-lapply(attr(terms(x),"variables")[-1], + function(tt) model.matrix(eval(bquote(~0+.(tt))),mf)) + cols<-sapply(xx,NCOL) + x<-matrix(data=as.numeric(NA), nrow=NROW(xx[[1]]),ncol=sum(cols)) + scols<-c(0,cumsum(cols)) + for(i in 1:length(xx)){ + x[,scols[i]+1:cols[i]]<-xx[[i]] + } + colnames(x)<-do.call("c",lapply(xx,colnames)) + } else { + if(typeof(x) %in% c("expression","symbol")) + x<-eval(x, design$variables) + else { + if(is.data.frame(x) && any(sapply(x,is.factor))){ + xx<-lapply(x, function(xi) {if (is.factor(xi)) 0+(outer(xi,levels(xi),"==")) else xi}) + cols<-sapply(xx,NCOL) + scols<-c(0,cumsum(cols)) + cn<-character(sum(cols)) + for(i in 1:length(xx)) + cn[scols[i]+1:cols[i]]<-paste(names(x)[i],levels(x[[i]]),sep="") + x<-matrix(nrow=NROW(xx[[1]]),ncol=sum(cols)) + for(i in 1:length(xx)){ + x[,scols[i]+1:cols[i]]<-xx[[i]] + } + colnames(x)<-cn + } + } + } + + x<-as.matrix(x) + + if (na.rm){ + nas<-rowSums(is.na(x)) + design<-design[nas==0,] + x<-x[nas==0,,drop=FALSE] + } + + wts<-design$repweights + scale<-design$scale + rscales<-design$rscales + if (!is.null(rho)) .NotYetUsed("rho") + + if (!design$combined.weights) + pw<-design$pweights + else + pw<-1 + + rval<-colSums(design$pweights*x)/sum(design$pweights) + + if (getOption("survey.drop.replicates") && !is.null(design$selfrep) && all(design$selfrep)){ + v<-matrix(0,length(rval),length(rval)) + repmeans<-NULL + } else { + if (inherits(wts, "repweights_compressed")){ + repmeans<-matrix(ncol=NCOL(x), nrow=ncol(wts$weights)) + for(i in 1:ncol(wts$weights)){ + wi<-wts$weights[wts$index,i] + repmeans[i,]<-t(colSums(wi*x*pw)/sum(pw*wi)) + } + } else { + repmeans<-matrix(data=as.numeric(NA), ncol=NCOL(x), nrow=ncol(wts)) + for(i in 1:ncol(wts)){ + repmeans[i,]<-t(colSums(wts[,i]*x*pw)/sum(pw*wts[,i])) + } + } + repmeans<-drop(repmeans) + v <- svrVar(repmeans, scale, rscales,mse=design$mse, coef=rval) +} +# this is easy to fix, but for now... +return(v) + attr(rval,"var") <-v + attr(rval, "statistic")<-"mean" + if (return.replicates){ + attr(repmeans,"scale")<-design$scale + attr(repmeans,"rscales")<-design$rscales + attr(repmeans,"mse")<-design$mse + rval<-list(mean=rval, replicates=repmeans) + } + if (is.character(deff) || deff){ + nobs<-length(design$pweights) + npop<-sum(design$pweights) + vsrs<-unclass(svyvar(x,design,na.rm=na.rm, return.replicates=FALSE,estimate.only=TRUE))/length(design$pweights) + if (deff!="replace") + vsrs<-vsrs*(npop-nobs)/npop + attr(rval,"deff") <- v/vsrs + } + class(rval)<-"svrepstat" + rval +} + +source("harness.R") +for (s in c("alabama", "california", "acs3yr" )) { + svydata <- readRDS(paste0(s,".rds")) + names(svydata) <- tolower(names(svydata)) + print("loaded") + svydsgn <- svrepdesign( + weight = ~pwgtp , + repweights = 'pwgtp[1-9]' , + scale = 4 / 80 , + rscales = rep( 1 , 80 ) , + mse = TRUE , + data = svydata) + print("initialized") + for (r in 1:5) { + timing <- system.time({ + agep <- svymean(~agep, svydsgn, se=TRUE) + relp <- svymean(~relp, svydsgn, se=TRUE) + print(agep) + print(relp) + })[[3]] + + log.result("survey", sys, conf, s, r, timing) + clearResultRecycler() + } +} +