From 0a35fe431f8ac3ed7eab8e3f4a75efc75a033c9b 2015-11-11 10:54:06 From: Hannes Muehleisen Date: 2015-11-11 10:54:06 Subject: [PATCH] Import --- diff --git a/hmda-dump.sql b/hmda-dump.sql new file mode 100644 index 0000000000000000000000000000000000000000..048775542f4946637861dd2b5d5c70dddc93eb9b --- /dev/null +++ b/hmda-dump.sql @@ -0,0 +1,35 @@ +COPY SELECT * FROM hmda_06 SAMPLE 10000 INTO '/scratch/hannes/hmda/hmda_06_1.csv'; +COPY SELECT * FROM hmda_06 SAMPLE 100000 INTO '/scratch/hannes/hmda/hmda_06_2.csv'; +COPY SELECT * FROM hmda_06 SAMPLE 1000000 INTO '/scratch/hannes/hmda/hmda_06_3.csv'; +COPY SELECT * FROM hmda_06 SAMPLE 10000000 INTO '/scratch/hannes/hmda/hmda_06_4.csv'; +COPY SELECT * FROM hmda_06 INTO '/scratch/hannes/hmda/hmda_06_f.csv'; + +COPY SELECT * FROM hmda_07 SAMPLE 10000 INTO '/scratch/hannes/hmda/hmda_07_1.csv'; +COPY SELECT * FROM hmda_07 SAMPLE 100000 INTO '/scratch/hannes/hmda/hmda_07_2.csv'; +COPY SELECT * FROM hmda_07 SAMPLE 1000000 INTO '/scratch/hannes/hmda/hmda_07_3.csv'; +COPY SELECT * FROM hmda_07 SAMPLE 10000000 INTO '/scratch/hannes/hmda/hmda_07_4.csv'; +COPY SELECT * FROM hmda_07 INTO '/scratch/hannes/hmda/hmda_07_f.csv'; + +COPY SELECT * FROM hmda_08 SAMPLE 10000 INTO '/scratch/hannes/hmda/hmda_08_1.csv'; +COPY SELECT * FROM hmda_08 SAMPLE 100000 INTO '/scratch/hannes/hmda/hmda_08_2.csv'; +COPY SELECT * FROM hmda_08 SAMPLE 1000000 INTO '/scratch/hannes/hmda/hmda_08_3.csv'; +COPY SELECT * FROM hmda_08 SAMPLE 10000000 INTO '/scratch/hannes/hmda/hmda_08_4.csv'; +COPY SELECT * FROM hmda_08 INTO '/scratch/hannes/hmda/hmda_08_f.csv'; + +COPY SELECT * FROM hmda_09 SAMPLE 10000 INTO '/scratch/hannes/hmda/hmda_09_1.csv'; +COPY SELECT * FROM hmda_09 SAMPLE 100000 INTO '/scratch/hannes/hmda/hmda_09_2.csv'; +COPY SELECT * FROM hmda_09 SAMPLE 1000000 INTO '/scratch/hannes/hmda/hmda_09_3.csv'; +COPY SELECT * FROM hmda_09 SAMPLE 10000000 INTO '/scratch/hannes/hmda/hmda_09_4.csv'; +COPY SELECT * FROM hmda_09 INTO '/scratch/hannes/hmda/hmda_09_f.csv'; + +COPY SELECT * FROM hmda_10 SAMPLE 10000 INTO '/scratch/hannes/hmda/hmda_10_1.csv'; +COPY SELECT * FROM hmda_10 SAMPLE 100000 INTO '/scratch/hannes/hmda/hmda_10_2.csv'; +COPY SELECT * FROM hmda_10 SAMPLE 1000000 INTO '/scratch/hannes/hmda/hmda_10_3.csv'; +COPY SELECT * FROM hmda_10 SAMPLE 10000000 INTO '/scratch/hannes/hmda/hmda_10_4.csv'; +COPY SELECT * FROM hmda_10 INTO '/scratch/hannes/hmda/hmda_10_f.csv'; + +COPY SELECT * FROM hmda_11 SAMPLE 10000 INTO '/scratch/hannes/hmda/hmda_11_1.csv'; +COPY SELECT * FROM hmda_11 SAMPLE 100000 INTO '/scratch/hannes/hmda/hmda_11_2.csv'; +COPY SELECT * FROM hmda_11 SAMPLE 1000000 INTO '/scratch/hannes/hmda/hmda_11_3.csv'; +COPY SELECT * FROM hmda_11 SAMPLE 10000000 INTO '/scratch/hannes/hmda/hmda_11_4.csv'; +COPY SELECT * FROM hmda_11 INTO '/scratch/hannes/hmda/hmda_11_f.csv'; diff --git a/hmda-queries.sql b/hmda-queries.sql new file mode 100644 index 0000000000000000000000000000000000000000..32e69199879a48be71133fb723575b3887a787d3 --- /dev/null +++ b/hmda-queries.sql @@ -0,0 +1,192 @@ +SELECT actiontype, + propertytype, + loanpurpose, + count(*) AS num_records +FROM hmda_11 +GROUP BY actiontype, + propertytype, + loanpurpose; + + +SELECT actiontype, + propertytype, + loanpurpose, + count(*) AS num_records +FROM hmda_06 +GROUP BY actiontype, + propertytype, + loanpurpose +ORDER BY actiontype, + propertytype, + loanpurpose; + + +SELECT actiontype, + propertytype, + loanpurpose, + count(*) AS num_records +FROM hmda_07 +GROUP BY actiontype, + propertytype, + loanpurpose +ORDER BY actiontype, + propertytype, + loanpurpose; + + +SELECT actiontype, + propertytype, + loanpurpose, + count(*) AS num_records +FROM hmda_08 +GROUP BY actiontype, + propertytype, + loanpurpose +ORDER BY actiontype, + propertytype, + loanpurpose; + + +SELECT actiontype, + propertytype, + loanpurpose, + count(*) AS num_records +FROM hmda_09 +GROUP BY actiontype, + propertytype, + loanpurpose +ORDER BY actiontype, + propertytype, + loanpurpose; + + +SELECT actiontype, + propertytype, + loanpurpose, + count(*) AS num_records +FROM hmda_10 +GROUP BY actiontype, + propertytype, + loanpurpose +ORDER BY actiontype, + propertytype, + loanpurpose; + + +SELECT actiontype, + propertytype, + loanpurpose, + count(*) AS num_records +FROM hmda_11 +GROUP BY actiontype, + propertytype, + loanpurpose +ORDER BY actiontype, + propertytype, + loanpurpose; + + +SELECT sum((loanpurpose = '1' + AND propertytype IN ('1', '2'))) AS home_purchase, + sum((loanpurpose = '3' + AND propertytype IN ('1', '2'))) AS refinance, + sum((loanpurpose = '2' + AND propertytype IN ('1', '2'))) AS home_improvement, + sum((propertytype = '3')) AS multifamily, + count(*) AS total +FROM hmda_11 +WHERE actiontype = '1'; + + +SELECT sum((loanpurpose = '1' + AND propertytype IN ('1', '2'))) AS home_purchase, + sum((loanpurpose = '3' + AND propertytype IN ('1', '2'))) AS refinance, + sum((loanpurpose = '2' + AND propertytype IN ('1', '2'))) AS home_improvement, + sum((propertytype = '3')) AS multifamily, + count(*) AS total +FROM hmda_06 +WHERE actiontype = '1' +UNION +SELECT sum((loanpurpose = '1' + AND propertytype IN ('1', '2'))) AS home_purchase, + sum((loanpurpose = '3' + AND propertytype IN ('1', '2'))) AS refinance, + sum((loanpurpose = '2' + AND propertytype IN ('1', '2'))) AS home_improvement, + sum((propertytype = '3')) AS multifamily, + count(*) AS total +FROM hmda_07 +WHERE actiontype = '1' +UNION +SELECT sum((loanpurpose = '1' + AND propertytype IN ('1', '2'))) AS home_purchase, + sum((loanpurpose = '3' + AND propertytype IN ('1', '2'))) AS refinance, + sum((loanpurpose = '2' + AND propertytype IN ('1', '2'))) AS home_improvement, + sum((propertytype = '3')) AS multifamily, + count(*) AS total +FROM hmda_08 +WHERE actiontype = '1' +UNION +SELECT sum((loanpurpose = '1' + AND propertytype IN ('1', '2'))) AS home_purchase, + sum((loanpurpose = '3' + AND propertytype IN ('1', '2'))) AS refinance, + sum((loanpurpose = '2' + AND propertytype IN ('1', '2'))) AS home_improvement, + sum((propertytype = '3')) AS multifamily, + count(*) AS total +FROM hmda_09 +WHERE actiontype = '1' +UNION +SELECT sum((loanpurpose = '1' + AND propertytype IN ('1', '2'))) AS home_purchase, + sum((loanpurpose = '3' + AND propertytype IN ('1', '2'))) AS refinance, + sum((loanpurpose = '2' + AND propertytype IN ('1', '2'))) AS home_improvement, + sum((propertytype = '3')) AS multifamily, + count(*) AS total +FROM hmda_10 +WHERE actiontype = '1' +UNION +SELECT sum((loanpurpose = '1' + AND propertytype IN ('1', '2'))) AS home_purchase, + sum((loanpurpose = '3' + AND propertytype IN ('1', '2'))) AS refinance, + sum((loanpurpose = '2' + AND propertytype IN ('1', '2'))) AS home_improvement, + sum((propertytype = '3')) AS multifamily, + count(*) AS total +FROM hmda_11 +WHERE actiontype = '1'; + +SELECT race, + count(*) +FROM hmda_11 +WHERE actiontype = '1' + AND loanpurpose = '1' + AND occupancy = '1' + AND lienstatus = '1' + AND propertytype IN ('1', + '2') +GROUP BY race +ORDER BY race; + + +SELECT ethnicity, + count(*) +FROM hmda_11 +WHERE actiontype = '1' + AND loanpurpose = '1' + AND occupancy = '1' + AND lienstatus = '1' + AND propertytype IN ('1', + '2') + AND race = '5' +GROUP BY ethnicity +ORDER BY ethnicity; + diff --git a/plots.R b/plots.R new file mode 100755 index 0000000000000000000000000000000000000000..d838069df785b02610289fa6e33cb3e3232f3055 --- /dev/null +++ b/plots.R @@ -0,0 +1,56 @@ +#!R -f + +(function(lp) { +np <- lp[!(lp %in% installed.packages()[,"Package"])] +if(length(np)) install.packages(np,repos=c("https://cran.rstudio.com/")) +x <- lapply(lp,function(x){library(x,character.only=TRUE)}) +})(c("ggplot2", "ggthemes", "gtable", "reshape2")) + +theme <- theme_few(base_size = 24) + +theme(axis.title.y=element_text(vjust=0.9), + axis.title.x=element_text(vjust=-0.1), + axis.ticks.x=element_blank(), + text=element_text(family="serif"), + legend.position = "none") + +dd <- read.table("results.csv", sep="\t", header=F) +names(dd) <- c("sys", "dataset", "nrows", "loadtime", "querytime", "readtime") +ddm <- melt(dd[c(1,2,4,5,6)], id.vars=c("sys","dataset")) +ddm$sys <- factor(ddm$sys, c("MonetDBLite", "MonetDB", "SQLite")) + +print(ddm) + +xlf <- function(value) ifelse(value == 0, "DNF", ifelse(value < 10, + paste0(round(value, 1), "s"), paste0(round(value), "s"))) + +plt <- function(ddp, fname, xmax, title) { + pdf(fname, width=10, height=5) + p <- ggplot(ddp,aes(x=dataset, y=value, fill=sys)) + + geom_bar(stat="identity", position = "dodge", width=.7) + + geom_text(aes(label = xlf(value), family="serif"), size = 5, vjust=-.4, + position = position_dodge(width=.7)) + + scale_x_discrete(labels=xlabel) + + scale_y_continuous(limits=c(0, xmax)) + + scale_fill_manual(values=c( "#1f78b4", "#a6cee3", "#b2df8a")) + + xlab("# Tuples") + ylab("Time (s)") + theme + ggtitle(title) + + # haaaack (secondary x axis patched in using gtable) + axis <- ggplot(ddp,aes(x=dataset, y=value, fill=sys)) + + geom_text(aes(label=sys, y=0, family="serif"), angle=90, size = 5, hjust = 0.55, + position = position_dodge(width=.7)) + annotation <- gtable_filter(ggplotGrob(axis), "panel", trim=TRUE) + annotation[["grobs"]][[1]][["children"]][c(1,3)] <- NULL + g <- ggplotGrob(p) + g <- gtable_add_rows(g, unit(6, "line"), pos=3) + g <- gtable_add_grob(g, annotation, t=4, b=4, l=4, r=4) + grid.draw(g) + + dev.off() +} + +xlabel <- c("1"="60K", "2"="600K", "3"="6M", "4"="60M", "f"="128M") +plt(ddm[ddm$variable=='loadtime',], "load.pdf", 10000, "Loading from CSV files") +plt(ddm[ddm$variable=='querytime',], "query.pdf", 70, "Run HMDA analysis") + +xlabel <- c("1"="10K", "2"="100K", "3"="1M", "4"="10M", "f"="14M") +plt(ddm[ddm$variable=='readtime',], "read.pdf", 1500, "Convert table to data.frame") diff --git a/testdriver.R b/testdriver.R new file mode 100644 index 0000000000000000000000000000000000000000..570933f03ebb6c67cc1301915c4108121f631edd --- /dev/null +++ b/testdriver.R @@ -0,0 +1,59 @@ +library(MonetDB.R) +library(RSQLite) +library(RPostgreSQL) + +con1 <- dbConnect(MonetDB.R(), "hmda") +con2 <- dbConnect(MonetDBLite()) +con3 <- dbConnect(SQLite(), ":memory:") +con4 <- dbConnect(SQLite(), tempfile()) + +# PostgreSQL can't run queries +# con5 <- dbConnect(dbDriver("PostgreSQL"), host="localhost", port=5432) + +ds <- c("1"=60000, "2"=600000, "3"=6000000, "4"=60000000, "f"=128818169) +tb <- sprintf("hmda_%02d", 6:11) + +mtime <- function(x) system.time(x)[[3]] + +results <- data.frame(sys=character(), dataset=character(), nrows=integer(), ltime=numeric(), qtime=numeric(), dtime=numeric()) + +run <- function(con, sys) { + for (d in names(ds)) { + # create tables + lapply(tb, function(t) { + if (dbExistsTable(con, t)) dbRemoveTable(con, t) + dbSendQuery(con, paste0('CREATE TABLE ', t, '( "asofyear" DOUBLE PRECISION, "respondentid" VARCHAR(255), "agencycode" VARCHAR(255), "loantype" DOUBLE PRECISION, "propertytype" VARCHAR(255), "loanpurpose" DOUBLE PRECISION, "occupancy" DOUBLE PRECISION, "loanamount" DOUBLE PRECISION, "preapproval" VARCHAR(255), "actiontype" DOUBLE PRECISION, "msa_md" VARCHAR(255), "statecode" VARCHAR(255), "countycode" VARCHAR(255), "censustractnumber" VARCHAR(255), "applicantethnicity" VARCHAR(255), "coapplicantethnicity" VARCHAR(255), "applicantrace1" VARCHAR(255), "applicantrace2" VARCHAR(255), "applicantrace3" VARCHAR(255), "applicantrace4" VARCHAR(255), "applicantrace5" VARCHAR(255), "coapplicantrace1" VARCHAR(255), "coapplicantrace2" VARCHAR(255), "coapplicantrace3" VARCHAR(255), "coapplicantrace4" VARCHAR(255), "coapplicantrace5" VARCHAR(255), "applicantsex" DOUBLE PRECISION, "coapplicantsex" DOUBLE PRECISION, "applicantincome" VARCHAR(255), "purchasertype" VARCHAR(255), "denialreason1" VARCHAR(255), "denialreason2" VARCHAR(255), "denialreason3" VARCHAR(255), "ratespread" VARCHAR(255), "hoepastatus" VARCHAR(255), "lienstatus" VARCHAR(255), "editstatus" VARCHAR(255), "applicationdateindicator" DOUBLE PRECISION, "sequencenumber" DOUBLE PRECISION, "population" DOUBLE PRECISION, "minoritypopulationpct" DOUBLE PRECISION, "hudmedianfamilyincome" DOUBLE PRECISION, "tracttomsa_mdincomepct" DOUBLE PRECISION, "numberofowneroccupiedunits" DOUBLE PRECISION, "numberof1to4familyunits" DOUBLE PRECISION, "activityyear" DOUBLE PRECISION, "fedtaxid" VARCHAR(255), "respondentname_ts" VARCHAR(255), "respondentmailingaddress" VARCHAR(255), "respondentcity_ts" VARCHAR(255), "respondentstate_ts" VARCHAR(255), "respondentzipcode" VARCHAR(255), "parentname_ts" VARCHAR(255), "parentaddress" VARCHAR(255), "parentcity_ts" VARCHAR(255), "parentstate_ts" VARCHAR(255), "parentzipcode" VARCHAR(255), "respondentname_panel" VARCHAR(255), "respondentcity_panel" VARCHAR(255), "respondentstate_panel" VARCHAR(255), "assets_panel" DOUBLE PRECISION, "otherlendercode_panel" DOUBLE PRECISION, "regioncode_panel" DOUBLE PRECISION, "larcount" DOUBLE PRECISION, "validityerror" VARCHAR(255), "app_min_cnt" INTEGER, "co_min_cnt" INTEGER, "appwhite" INTEGER, "cowhite" INTEGER, "race" INTEGER, "ethnicity" VARCHAR(255))')) + }) + + # load data + ltime <- mtime( + for (t in tb) { + fname <- paste0(getwd(), "/data/", t, "_", d, ".csv") + if (inherits(con, "SQLiteConnection")) dbWriteTable(con, name = t, value = fname, row.names = FALSE, header = FALSE, append=T, sep="|", overwrite=F) + if (inherits(con, "MonetDBConnection")) dbSendQuery(con, paste0("COPY INTO ", t, " FROM '", fname, "'", collapse="\n")) + if (inherits(con, "PostgreSQLConnection")) dbGetQuery(con, paste0("COPY ", t, " FROM '", fname, "' WITH (FORMAT CSV, DELIMITER \'|\', NULL \'null\')", collapse="\n")) + }) + + trows <- dbGetQuery(con, paste0("SELECT SUM(rws) AS rws FROM (", paste0("SELECT COUNT(*) AS rws FROM ", tb, collapse=" UNION ALL "), ") AS t"))$rws[[1]] + + # make sure we have loaded as many rows as expected + stopifnot(trows == ds[d]) + + dump <- dbGetQuery(con, paste0(readLines("hmda-queries.sql"), collapse="\n")) + dump <- dbGetQuery(con, paste0(readLines("hmda-queries.sql"), collapse="\n")) + qtime <- mtime(dbGetQuery(con, paste0(readLines("hmda-queries.sql"), collapse="\n"))) + dump <- dbReadTable(con, "hmda_11") + dump <- dbReadTable(con, "hmda_11") + dtime <- mtime(dbReadTable(con, "hmda_11")) + + # clean up + lapply(tb, function(x) dbRemoveTable(con, x)) + + results <<- rbind(results, data.frame(sys=sys, dataset=d, nrows=ds[d], ltime=ltime, qtime=qtime, dtime=dtime)) + print(results) + write.table(results, "results.csv", row.names=F, quote=F, sep="\t", col.names=F) + } +} +run(con2, 'MonetDBLite') +run(con4, 'SQLite-file') +run(con1, 'MonetDB')