Changeset - 0a35fe431f8a
[Not reviewed]
0 0 4
Hannes Muehleisen - 9 years ago 2015-11-11 10:54:06
hannes@muehleisen.org
Import
4 files changed with 342 insertions and 0 deletions:
0 comments (0 inline, 0 general)
hmda-dump.sql
Show inline comments
 
new file 100644
 
COPY SELECT * FROM hmda_06 SAMPLE 10000 INTO '/scratch/hannes/hmda/hmda_06_1.csv';
 
COPY SELECT * FROM hmda_06 SAMPLE 100000 INTO '/scratch/hannes/hmda/hmda_06_2.csv';
 
COPY SELECT * FROM hmda_06 SAMPLE 1000000 INTO '/scratch/hannes/hmda/hmda_06_3.csv';
 
COPY SELECT * FROM hmda_06 SAMPLE 10000000 INTO '/scratch/hannes/hmda/hmda_06_4.csv';
 
COPY SELECT * FROM hmda_06 INTO '/scratch/hannes/hmda/hmda_06_f.csv';
 

	
 
COPY SELECT * FROM hmda_07 SAMPLE 10000 INTO '/scratch/hannes/hmda/hmda_07_1.csv';
 
COPY SELECT * FROM hmda_07 SAMPLE 100000 INTO '/scratch/hannes/hmda/hmda_07_2.csv';
 
COPY SELECT * FROM hmda_07 SAMPLE 1000000 INTO '/scratch/hannes/hmda/hmda_07_3.csv';
 
COPY SELECT * FROM hmda_07 SAMPLE 10000000 INTO '/scratch/hannes/hmda/hmda_07_4.csv';
 
COPY SELECT * FROM hmda_07 INTO '/scratch/hannes/hmda/hmda_07_f.csv';
 

	
 
COPY SELECT * FROM hmda_08 SAMPLE 10000 INTO '/scratch/hannes/hmda/hmda_08_1.csv';
 
COPY SELECT * FROM hmda_08 SAMPLE 100000 INTO '/scratch/hannes/hmda/hmda_08_2.csv';
 
COPY SELECT * FROM hmda_08 SAMPLE 1000000 INTO '/scratch/hannes/hmda/hmda_08_3.csv';
 
COPY SELECT * FROM hmda_08 SAMPLE 10000000 INTO '/scratch/hannes/hmda/hmda_08_4.csv';
 
COPY SELECT * FROM hmda_08 INTO '/scratch/hannes/hmda/hmda_08_f.csv';
 

	
 
COPY SELECT * FROM hmda_09 SAMPLE 10000 INTO '/scratch/hannes/hmda/hmda_09_1.csv';
 
COPY SELECT * FROM hmda_09 SAMPLE 100000 INTO '/scratch/hannes/hmda/hmda_09_2.csv';
 
COPY SELECT * FROM hmda_09 SAMPLE 1000000 INTO '/scratch/hannes/hmda/hmda_09_3.csv';
 
COPY SELECT * FROM hmda_09 SAMPLE 10000000 INTO '/scratch/hannes/hmda/hmda_09_4.csv';
 
COPY SELECT * FROM hmda_09 INTO '/scratch/hannes/hmda/hmda_09_f.csv';
 

	
 
COPY SELECT * FROM hmda_10 SAMPLE 10000 INTO '/scratch/hannes/hmda/hmda_10_1.csv';
 
COPY SELECT * FROM hmda_10 SAMPLE 100000 INTO '/scratch/hannes/hmda/hmda_10_2.csv';
 
COPY SELECT * FROM hmda_10 SAMPLE 1000000 INTO '/scratch/hannes/hmda/hmda_10_3.csv';
 
COPY SELECT * FROM hmda_10 SAMPLE 10000000 INTO '/scratch/hannes/hmda/hmda_10_4.csv';
 
COPY SELECT * FROM hmda_10 INTO '/scratch/hannes/hmda/hmda_10_f.csv';
 

	
 
COPY SELECT * FROM hmda_11 SAMPLE 10000 INTO '/scratch/hannes/hmda/hmda_11_1.csv';
 
COPY SELECT * FROM hmda_11 SAMPLE 100000 INTO '/scratch/hannes/hmda/hmda_11_2.csv';
 
COPY SELECT * FROM hmda_11 SAMPLE 1000000 INTO '/scratch/hannes/hmda/hmda_11_3.csv';
 
COPY SELECT * FROM hmda_11 SAMPLE 10000000 INTO '/scratch/hannes/hmda/hmda_11_4.csv';
 
COPY SELECT * FROM hmda_11 INTO '/scratch/hannes/hmda/hmda_11_f.csv';
hmda-queries.sql
Show inline comments
 
new file 100644
 
SELECT actiontype,
 
       propertytype,
 
       loanpurpose,
 
       count(*) AS num_records
 
FROM hmda_11
 
GROUP BY actiontype,
 
         propertytype,
 
         loanpurpose;
 

	
 

	
 
SELECT actiontype,
 
       propertytype,
 
       loanpurpose,
 
       count(*) AS num_records
 
FROM hmda_06
 
GROUP BY actiontype,
 
         propertytype,
 
         loanpurpose
 
ORDER BY actiontype,
 
         propertytype,
 
         loanpurpose;
 

	
 

	
 
SELECT actiontype,
 
       propertytype,
 
       loanpurpose,
 
       count(*) AS num_records
 
FROM hmda_07
 
GROUP BY actiontype,
 
         propertytype,
 
         loanpurpose
 
ORDER BY actiontype,
 
         propertytype,
 
         loanpurpose;
 

	
 

	
 
SELECT actiontype,
 
       propertytype,
 
       loanpurpose,
 
       count(*) AS num_records
 
FROM hmda_08
 
GROUP BY actiontype,
 
         propertytype,
 
         loanpurpose
 
ORDER BY actiontype,
 
         propertytype,
 
         loanpurpose;
 

	
 

	
 
SELECT actiontype,
 
       propertytype,
 
       loanpurpose,
 
       count(*) AS num_records
 
FROM hmda_09
 
GROUP BY actiontype,
 
         propertytype,
 
         loanpurpose
 
ORDER BY actiontype,
 
         propertytype,
 
         loanpurpose;
 

	
 

	
 
SELECT actiontype,
 
       propertytype,
 
       loanpurpose,
 
       count(*) AS num_records
 
FROM hmda_10
 
GROUP BY actiontype,
 
         propertytype,
 
         loanpurpose
 
ORDER BY actiontype,
 
         propertytype,
 
         loanpurpose;
 

	
 

	
 
SELECT actiontype,
 
       propertytype,
 
       loanpurpose,
 
       count(*) AS num_records
 
FROM hmda_11
 
GROUP BY actiontype,
 
         propertytype,
 
         loanpurpose
 
ORDER BY actiontype,
 
         propertytype,
 
         loanpurpose;
 

	
 

	
 
SELECT sum((loanpurpose = '1'
 
            AND propertytype IN ('1', '2'))) AS home_purchase,
 
       sum((loanpurpose = '3'
 
            AND propertytype IN ('1', '2'))) AS refinance,
 
       sum((loanpurpose = '2'
 
            AND propertytype IN ('1', '2'))) AS home_improvement,
 
       sum((propertytype = '3')) AS multifamily,
 
       count(*) AS total
 
FROM hmda_11
 
WHERE actiontype = '1';
 

	
 

	
 
SELECT sum((loanpurpose = '1'
 
            AND propertytype IN ('1', '2'))) AS home_purchase,
 
       sum((loanpurpose = '3'
 
            AND propertytype IN ('1', '2'))) AS refinance,
 
       sum((loanpurpose = '2'
 
            AND propertytype IN ('1', '2'))) AS home_improvement,
 
       sum((propertytype = '3')) AS multifamily,
 
       count(*) AS total
 
FROM hmda_06
 
WHERE actiontype = '1'
 
UNION
 
SELECT sum((loanpurpose = '1'
 
            AND propertytype IN ('1', '2'))) AS home_purchase,
 
       sum((loanpurpose = '3'
 
            AND propertytype IN ('1', '2'))) AS refinance,
 
       sum((loanpurpose = '2'
 
            AND propertytype IN ('1', '2'))) AS home_improvement,
 
       sum((propertytype = '3')) AS multifamily,
 
       count(*) AS total
 
FROM hmda_07
 
WHERE actiontype = '1'
 
UNION
 
SELECT sum((loanpurpose = '1'
 
            AND propertytype IN ('1', '2'))) AS home_purchase,
 
       sum((loanpurpose = '3'
 
            AND propertytype IN ('1', '2'))) AS refinance,
 
       sum((loanpurpose = '2'
 
            AND propertytype IN ('1', '2'))) AS home_improvement,
 
       sum((propertytype = '3')) AS multifamily,
 
       count(*) AS total
 
FROM hmda_08
 
WHERE actiontype = '1'
 
UNION
 
SELECT sum((loanpurpose = '1'
 
            AND propertytype IN ('1', '2'))) AS home_purchase,
 
       sum((loanpurpose = '3'
 
            AND propertytype IN ('1', '2'))) AS refinance,
 
       sum((loanpurpose = '2'
 
            AND propertytype IN ('1', '2'))) AS home_improvement,
 
       sum((propertytype = '3')) AS multifamily,
 
       count(*) AS total
 
FROM hmda_09
 
WHERE actiontype = '1'
 
UNION
 
SELECT sum((loanpurpose = '1'
 
            AND propertytype IN ('1', '2'))) AS home_purchase,
 
       sum((loanpurpose = '3'
 
            AND propertytype IN ('1', '2'))) AS refinance,
 
       sum((loanpurpose = '2'
 
            AND propertytype IN ('1', '2'))) AS home_improvement,
 
       sum((propertytype = '3')) AS multifamily,
 
       count(*) AS total
 
FROM hmda_10
 
WHERE actiontype = '1'
 
UNION
 
SELECT sum((loanpurpose = '1'
 
            AND propertytype IN ('1', '2'))) AS home_purchase,
 
       sum((loanpurpose = '3'
 
            AND propertytype IN ('1', '2'))) AS refinance,
 
       sum((loanpurpose = '2'
 
            AND propertytype IN ('1', '2'))) AS home_improvement,
 
       sum((propertytype = '3')) AS multifamily,
 
       count(*) AS total
 
FROM hmda_11
 
WHERE actiontype = '1';
 

	
 
SELECT race,
 
       count(*)
 
FROM hmda_11
 
WHERE actiontype = '1'
 
  AND loanpurpose = '1'
 
  AND occupancy = '1'
 
  AND lienstatus = '1'
 
  AND propertytype IN ('1',
 
                       '2')
 
GROUP BY race
 
ORDER BY race;
 

	
 

	
 
SELECT ethnicity,
 
       count(*)
 
FROM hmda_11
 
WHERE actiontype = '1'
 
  AND loanpurpose = '1'
 
  AND occupancy = '1'
 
  AND lienstatus = '1'
 
  AND propertytype IN ('1',
 
                       '2')
 
  AND race = '5'
 
GROUP BY ethnicity
 
ORDER BY ethnicity;
 

	
plots.R
Show inline comments
 
new file 100755
 
#!R -f
 

	
 
(function(lp) {
 
np <- lp[!(lp %in% installed.packages()[,"Package"])]
 
if(length(np)) install.packages(np,repos=c("https://cran.rstudio.com/"))
 
x <- lapply(lp,function(x){library(x,character.only=TRUE)}) 
 
})(c("ggplot2", "ggthemes", "gtable", "reshape2"))
 

	
 
theme <- theme_few(base_size = 24) + 
 
theme(axis.title.y=element_text(vjust=0.9), 
 
  axis.title.x=element_text(vjust=-0.1),
 
  axis.ticks.x=element_blank(),
 
  text=element_text(family="serif"),
 
  legend.position = "none")
 

	
 
dd <- read.table("results.csv", sep="\t", header=F)
 
names(dd) <- c("sys", "dataset", "nrows", "loadtime", "querytime", "readtime")
 
ddm <- melt(dd[c(1,2,4,5,6)], id.vars=c("sys","dataset"))
 
ddm$sys <- factor(ddm$sys, c("MonetDBLite", "MonetDB", "SQLite"))
 

	
 
print(ddm)
 

	
 
xlf <- function(value) ifelse(value == 0, "DNF", ifelse(value < 10, 
 
  paste0(round(value, 1), "s"), paste0(round(value), "s")))
 

	
 
plt <- function(ddp, fname, xmax, title) {
 
  pdf(fname, width=10, height=5)
 
  p <- ggplot(ddp,aes(x=dataset, y=value, fill=sys)) + 
 
    geom_bar(stat="identity", position = "dodge", width=.7)  +
 
    geom_text(aes(label = xlf(value), family="serif"), size = 5, vjust=-.4, 
 
      position = position_dodge(width=.7)) + 
 
    scale_x_discrete(labels=xlabel) +
 
    scale_y_continuous(limits=c(0, xmax)) + 
 
    scale_fill_manual(values=c( "#1f78b4", "#a6cee3", "#b2df8a")) +
 
    xlab("# Tuples") + ylab("Time (s)") + theme + ggtitle(title)
 

	
 
  # haaaack (secondary x axis patched in using gtable)
 
  axis <- ggplot(ddp,aes(x=dataset, y=value, fill=sys)) + 
 
    geom_text(aes(label=sys, y=0, family="serif"), angle=90, size = 5,  hjust = 0.55, 
 
      position = position_dodge(width=.7))
 
  annotation <- gtable_filter(ggplotGrob(axis), "panel", trim=TRUE)
 
  annotation[["grobs"]][[1]][["children"]][c(1,3)] <- NULL
 
  g <- ggplotGrob(p)
 
  g <- gtable_add_rows(g, unit(6, "line"), pos=3)
 
  g <- gtable_add_grob(g, annotation, t=4, b=4, l=4, r=4)
 
  grid.draw(g)
 

	
 
  dev.off()
 
}
 

	
 
xlabel <- c("1"="60K", "2"="600K", "3"="6M", "4"="60M", "f"="128M")
 
plt(ddm[ddm$variable=='loadtime',], "load.pdf", 10000, "Loading from CSV files")
 
plt(ddm[ddm$variable=='querytime',], "query.pdf", 70, "Run HMDA analysis")
 

	
 
xlabel <- c("1"="10K", "2"="100K", "3"="1M", "4"="10M", "f"="14M")
 
plt(ddm[ddm$variable=='readtime',], "read.pdf", 1500, "Convert table to data.frame")
testdriver.R
Show inline comments
 
new file 100644
 
library(MonetDB.R)
 
library(RSQLite)
 
library(RPostgreSQL)
 

	
 
con1 <- dbConnect(MonetDB.R(), "hmda")
 
con2 <- dbConnect(MonetDBLite())
 
con3 <- dbConnect(SQLite(), ":memory:")
 
con4 <- dbConnect(SQLite(), tempfile())
 

	
 
# PostgreSQL can't run queries
 
# con5 <- dbConnect(dbDriver("PostgreSQL"), host="localhost", port=5432)
 

	
 
ds <- c("1"=60000, "2"=600000, "3"=6000000, "4"=60000000, "f"=128818169)
 
tb <- sprintf("hmda_%02d", 6:11)
 

	
 
mtime <- function(x) system.time(x)[[3]]
 

	
 
results <- data.frame(sys=character(), dataset=character(), nrows=integer(), ltime=numeric(), qtime=numeric(), dtime=numeric())
 

	
 
run <- function(con, sys) {
 
	for (d in names(ds)) {
 
		# create tables
 
		lapply(tb, function(t) {
 
			if (dbExistsTable(con, t)) dbRemoveTable(con, t)
 
			dbSendQuery(con, paste0('CREATE TABLE ', t, '( "asofyear" DOUBLE PRECISION, "respondentid" VARCHAR(255), "agencycode" VARCHAR(255), "loantype" DOUBLE PRECISION, "propertytype" VARCHAR(255), "loanpurpose" DOUBLE PRECISION, "occupancy" DOUBLE PRECISION, "loanamount" DOUBLE PRECISION, "preapproval" VARCHAR(255), "actiontype" DOUBLE PRECISION, "msa_md" VARCHAR(255), "statecode" VARCHAR(255), "countycode" VARCHAR(255), "censustractnumber" VARCHAR(255), "applicantethnicity" VARCHAR(255), "coapplicantethnicity" VARCHAR(255), "applicantrace1" VARCHAR(255), "applicantrace2" VARCHAR(255), "applicantrace3" VARCHAR(255), "applicantrace4" VARCHAR(255), "applicantrace5" VARCHAR(255), "coapplicantrace1" VARCHAR(255), "coapplicantrace2" VARCHAR(255), "coapplicantrace3" VARCHAR(255), "coapplicantrace4" VARCHAR(255), "coapplicantrace5" VARCHAR(255), "applicantsex" DOUBLE PRECISION, "coapplicantsex" DOUBLE PRECISION, "applicantincome" VARCHAR(255), "purchasertype" VARCHAR(255), "denialreason1" VARCHAR(255), "denialreason2" VARCHAR(255), "denialreason3" VARCHAR(255), "ratespread" VARCHAR(255), "hoepastatus" VARCHAR(255), "lienstatus" VARCHAR(255), "editstatus" VARCHAR(255), "applicationdateindicator" DOUBLE PRECISION, "sequencenumber" DOUBLE PRECISION, "population" DOUBLE PRECISION, "minoritypopulationpct" DOUBLE PRECISION, "hudmedianfamilyincome" DOUBLE PRECISION, "tracttomsa_mdincomepct" DOUBLE PRECISION, "numberofowneroccupiedunits" DOUBLE PRECISION, "numberof1to4familyunits" DOUBLE PRECISION, "activityyear" DOUBLE PRECISION, "fedtaxid" VARCHAR(255), "respondentname_ts" VARCHAR(255), "respondentmailingaddress" VARCHAR(255), "respondentcity_ts" VARCHAR(255), "respondentstate_ts" VARCHAR(255), "respondentzipcode" VARCHAR(255), "parentname_ts" VARCHAR(255), "parentaddress" VARCHAR(255), "parentcity_ts" VARCHAR(255), "parentstate_ts" VARCHAR(255), "parentzipcode" VARCHAR(255), "respondentname_panel" VARCHAR(255), "respondentcity_panel" VARCHAR(255), "respondentstate_panel" VARCHAR(255), "assets_panel" DOUBLE PRECISION, "otherlendercode_panel" DOUBLE PRECISION, "regioncode_panel" DOUBLE PRECISION, "larcount" DOUBLE PRECISION, "validityerror" VARCHAR(255), "app_min_cnt" INTEGER, "co_min_cnt" INTEGER, "appwhite" INTEGER, "cowhite" INTEGER, "race" INTEGER, "ethnicity" VARCHAR(255))'))
 
		})
 

	
 
		# load data
 
		ltime <- mtime(
 
		for (t in tb) {
 
			fname <- paste0(getwd(), "/data/", t, "_", d, ".csv")
 
			if (inherits(con, "SQLiteConnection")) dbWriteTable(con, name = t, value = fname, row.names = FALSE, header = FALSE, append=T, sep="|", overwrite=F)
 
			if (inherits(con, "MonetDBConnection"))  dbSendQuery(con, paste0("COPY INTO ", t, " FROM '", fname, "'", collapse="\n"))
 
			if (inherits(con, "PostgreSQLConnection"))  dbGetQuery(con, paste0("COPY ", t, " FROM '", fname, "' WITH (FORMAT CSV, DELIMITER \'|\', NULL \'null\')", collapse="\n"))
 
		})
 

	
 
		trows <- dbGetQuery(con, paste0("SELECT SUM(rws) AS rws FROM (", paste0("SELECT COUNT(*) AS rws FROM ", tb, collapse=" UNION ALL "), ") AS t"))$rws[[1]]
 

	
 
		# make sure we have loaded as many rows as expected
 
		stopifnot(trows == ds[d])
 

	
 
		dump  <- dbGetQuery(con, paste0(readLines("hmda-queries.sql"), collapse="\n"))
 
		dump  <- dbGetQuery(con, paste0(readLines("hmda-queries.sql"), collapse="\n"))
 
		qtime <- mtime(dbGetQuery(con, paste0(readLines("hmda-queries.sql"), collapse="\n")))
 
		dump  <- dbReadTable(con, "hmda_11")
 
		dump  <- dbReadTable(con, "hmda_11")
 
		dtime <- mtime(dbReadTable(con, "hmda_11"))
 

	
 
		# clean up
 
		lapply(tb, function(x) dbRemoveTable(con, x))
 

	
 
		results <<- rbind(results, data.frame(sys=sys, dataset=d, nrows=ds[d], ltime=ltime, qtime=qtime, dtime=dtime))
 
		print(results)
 
		write.table(results, "results.csv", row.names=F, quote=F, sep="\t", col.names=F)
 
	}
 
}
 
run(con2, 'MonetDBLite')
 
run(con4, 'SQLite-file')
 
run(con1, 'MonetDB')
0 comments (0 inline, 0 general)