diff --git a/vldb-protocols2.R b/vldb-protocols2.R index b4739f52396efd062bbbefe4a9544002843b0d20..b2c86a746ccb0b7bd6f044b795b843c6511ddb74 100644 --- a/vldb-protocols2.R +++ b/vldb-protocols2.R @@ -1,7 +1,9 @@ - library(dplyr) library(ggplot2) library(ggthemes) +library(ggrepel) +library(stringr) + theme <- theme_few(base_size = 24) + theme(axis.title.y=element_text(vjust=0.9), @@ -10,31 +12,276 @@ theme(axis.title.y=element_text(vjust=0.9), text=element_text(family="serif")) -read.table("12.csv", header=T, sep=",", stringsAsFactors=F, na.strings="-1") -> dd3 +read.table("11.csv", header=T, sep=",", stringsAsFactors=F, na.strings="-1") -> dd3 sysnames <- c("mariadb-default"="MySQL", "mariadb-compress"="MySQL+C", "oracle-default"="DBMS X", "postgres-default" = "PostgreSQL", "db2-default" = "DB2", "monetdb-default"="MonetDB", "hbase-default" = "HBase", "hive-default" = "Hive", "mongodb-default" = "MongoDB") -dd3 %>% filter(network=="unlimited", protocol == "native", timeout != 1, system %in% c("hive-default", "hbase-default", "mariadb-default", "mariadb-compress", "db2-default", "oracle-default", "postgres-default", "monetdb-default", "mongodb-default"), tuple %in% c(1, 1000000)) %>% mutate(system=sysnames[system]) %>% select(system, tuple, time, bytes) %>% group_by(system, tuple) %>% summarise_each(funs(mean)) -> dd4 +dd3 %>% filter(network=="unlimited", protocol == "native", timeout != 1, system %in% c("hive-default", "hbase-default", "mariadb-default", "mariadb-compress", "db2-default", "oracle-default", "postgres-default", "monetdb-default", "mongodb-default"), tuple %in% c(1, 1000000)) %>% mutate(system=sysnames[system]) %>% select(system, tuple, time, bytes) %>% group_by(system, tuple) %>% summarise_each(funs(mean,sd,se=sd(.)/sqrt(n()))) -> dd4 + +dd5 <- dd4 %>% filter(tuple==1000000) %>% rename(time_1m=time_mean, bytes_1m=bytes_mean) %>% left_join(dd4 %>% filter(tuple == 1) %>% rename(time_1=time_mean, bytes_1=bytes_mean), by="system") %>% mutate(tuple_time_ms=((time_1m-time_1)/1000000)*1000, tuple_bytes=(bytes_1m-bytes_1)/1000000) + + +pdf("total-time.pdf", width=10, height=5) + +ggplot(dd4 %>% filter(tuple == 1000000), aes(y=time_mean, x=reorder(system, -time_mean), label=round(time_mean, 1))) + geom_bar(stat="identity", position="dodge", width=.5) + geom_errorbar(aes(ymax = time_mean + time_se, ymin=time_mean - time_se), width=0.2, size=1) + theme + xlab("") + ylab("Wall clock time (s)") + coord_flip() + geom_text(size=7, hjust=-.8, family="serif") + scale_y_continuous(limits=c(0, 31)) + + +dev.off() + +pdf("total-bytes.pdf", width=10, height=5) + +ggplot(dd4 %>% filter(tuple == 1000000) %>% mutate(bytes_mean_mb=(bytes_mean/1024/1024)), aes(y=bytes_mean_mb, x=reorder(system, -bytes_mean_mb), label=round(bytes_mean_mb, 2))) + geom_hline(yintercept=120, linetype="dashed") + geom_bar(stat="identity", position="dodge", width=.5) + theme + xlab("") + ylab("Bytes transferred (MB)") + coord_flip() + geom_text(size=7, hjust=-.2, family="serif") + scale_y_continuous(limits=c(0, 400)) + + +dev.off() + + + +# pdf("time-per-tuple.pdf", width=10, height=5) + +# ggplot(dd5, aes( y=tuple_time_ms, x=reorder(system, -tuple_time_ms), label=round(tuple_time_ms, 4))) + geom_bar(stat="identity", position="dodge", width=.5) + theme + xlab("") + ylab("Time per tuple (ms)") + coord_flip() + guides(fill=FALSE) + geom_text(size=7, hjust=-.2, family="serif") + scale_y_continuous(limits=c(0, .03)) + + +# dev.off() + + +# pdf("bytes-per-tuple.pdf", width=10, height=5) + + +# ggplot(dd5, aes( y=tuple_bytes, x=reorder(system, -tuple_bytes), label=round(tuple_bytes))) + geom_bar(stat="identity", position="dodge", width=.5) + theme + xlab("") + ylab("Bytes per tuple") + coord_flip() + guides(fill=FALSE) + geom_text(size=7, hjust=-.2, family="serif") + scale_y_continuous(limits=c(0, 410)) + + +# dev.off() + + + +# ggplot(dd5, aes(y=tuple_time_ms, x=tuple_bytes, label=system)) + geom_point() + geom_text(size=7, hjust=-.1, family="serif") + theme + + + +read.table("13.csv", header=T, sep=",", stringsAsFactors=F, na.strings="-1") -> dd3 +ntuples <- 1000000 + +sysnames <- c("mariadb-default"="MySQL", "mariadb-compress"="MySQL+C", "oracle-default"="DBMS X", "postgres-default" = "PostgreSQL", "db2-default" = "DB2", "monetdb-prot9"="MonetDB", "hbase-default" = "HBase", "hive-default" = "Hive", "mongodb-default" = "MongoDB") + +dd3 %>% filter(protocol == "odbc-noprint", timeout != 1, system %in% c("hive-default", "hbase-default", "mariadb-default", "mariadb-compress", "db2-default", "oracle-default", "postgres-default", "monetdb-prot9", "mongodb-default"), tuple %in% c(1, ntuples)) %>% mutate(system=sysnames[system]) %>% select(system, tuple, network, time, bytes) %>% group_by(system, tuple, network) %>% summarise_each(funs(mean,sd,se=sd(.)/sqrt(n()))) -> dd4 + + + +dd5 <- as.data.frame(dd4 %>% filter(tuple==ntuples) %>% rename(time_1m=time_mean, bytes_1m=bytes_mean) %>% left_join(dd4 %>% filter(tuple == 1) %>% rename(time_1=time_mean, bytes_1=bytes_mean), by=c("system", "network"))) %>% mutate(tuple_time_ms=((time_1m-time_1)/ntuples)*1000, tuple_bytes=(bytes_1m-bytes_1)/ntuples, adj_time= time_1m-time_1, adj_bytes=bytes_1m-bytes_1) + + + +pdf("total-time-noprint.pdf", width=10, height=3.5) + +ggplot(dd4 %>% filter(tuple == 1000000, network=="unlimited"), aes(y=time_mean, x=reorder(system, -time_mean), label=round(time_mean, 1))) + geom_hline(yintercept=0.237, linetype="dashed") + geom_bar(stat="identity", position="dodge", width=.5) + theme + xlab("") + ylab("Wall clock time (s)") + coord_flip() + geom_text(size=7, hjust=-.8, family="serif") + scale_y_continuous(limits=c(0, 6)) + + +dev.off() + +# pdf("time-per-tuple5.pdf", width=10, height=5) + +# ggplot(dd5 %>% filter(network == "unlimited"), aes( y=tuple_time_ms, x=reorder(system, -tuple_time_ms), label=round(tuple_time_ms, 4))) + geom_bar(stat="identity", position="dodge", width=.5) + theme + xlab("") + ylab("Time per tuple (ms)") + coord_flip() + guides(fill=FALSE) + geom_text(size=7, hjust=-.2, family="serif") + +# dev.off() + + +# pdf("time-per-tuple6.pdf", width=10, height=5) + +# ggplot(dd5 %>% filter(network == "10mbitethhd"), aes( y=tuple_time_ms, x=reorder(system, -tuple_time_ms), label=round(tuple_time_ms, 4))) + geom_bar(stat="identity", position="dodge", width=.5) + theme + xlab("") + ylab("Time per tuple (ms)") + coord_flip() + guides(fill=FALSE) + geom_text(size=7, hjust=-.2, family="serif") + +# dev.off() + + + +# pdf("bytes-per-tuple5.pdf", width=10, height=5) + + +# ggplot(dd5 %>% filter(network == "unlimited"), aes( y=tuple_bytes, x=reorder(system, -tuple_bytes), label=round(tuple_bytes))) + geom_bar(stat="identity", position="dodge", width=.5) + theme + xlab("") + ylab("Bytes per tuple") + coord_flip() + guides(fill=FALSE) + geom_text(size=7, hjust=-.2, family="serif") + +# dev.off() + + + +# ggplot(dd5, aes( x=network, y=tuple_time_ms, fill=system)) +geom_bar(stat="identity", position="dodge", width=.5) + + + +read.table("43.csv", header=T, sep=",", stringsAsFactors=F, na.strings="-1") -> dd3 + +ntuples <- 1000000 + +sysnames <- c("mariadb-default"="MySQL", "mariadb-compress"="MySQL+C", "oracle-default"="DBMS X", "postgres-default" = "PostgreSQL", "db2-default" = "DB2", "monetdb-prot9"="MonetDB", "monetdb-prot10-none-none"="MonetDB++", "monetdb-prot10-snappy-none"="MonetDB++S", "hbase-default" = "HBase", "hive-default" = "Hive", "mongodb-default" = "MongoDB") + +dd3 %>% filter(protocol == "odbc-noprint", timeout != 1, system %in% c("mariadb-default", "db2-default", "oracle-default", "postgres-default", "monetdb-prot9", "mariadb-compress"), tuple %in% c(1, ntuples)) %>% mutate(system=sysnames[system]) %>% select(system, tuple, throughput, latency, time, bytes) %>% group_by(system, tuple, throughput, latency) %>% summarise_each(funs(mean,sd,se=sd(.)/sqrt(n()))) -> dd4 + + + +dd5 <- as.data.frame(dd4 %>% filter(tuple==ntuples) %>% rename(time_1m=time_mean, bytes_1m=bytes_mean) %>% left_join(dd4 %>% filter(tuple == 1) %>% rename(time_1=time_mean, bytes_1=bytes_mean), by=c("system", "throughput", "latency"))) %>% mutate(tuple_time_ms=((time_1m-time_1)/ntuples)*1000, tuple_bytes=(bytes_1m-bytes_1)/ntuples, adj_time= time_1m-time_1, adj_bytes=bytes_1m-bytes_1) + + +pdf("latency.pdf", width=10, height=5) + +ggplot(dd5 %>% filter(is.na(throughput)), aes(color=system, y=adj_time, x=latency, label=round(adj_time, 1))) + geom_line(size=1.5) + geom_point(size=3) + scale_y_log10() + scale_x_log10(breaks=c(0.1,1,10,100), limits=c(0.1, 500)) + theme + xlab("Latency (ms, log)") + ylab("Wall clock time (s, log)") + theme(legend.position = "none") + geom_text_repel(data = dd5 %>% filter(is.na(throughput), latency==100 | (system=="DBMS X" & latency==10)), aes(label = system), size = 8, nudge_x = .3, segment.color = NA, family="serif") + + +dev.off() + + +pdf("throughput.pdf", width=10, height=5) + +ggplot(dd5 %>% filter(is.na(latency)), aes(color=system, y=adj_time, x=throughput, label=round(adj_time, 1))) + geom_line(size=1.5) + geom_point(size=3) + scale_y_log10() + scale_x_log10(breaks=c(10,100,1000), limits=c(10, 3500)) + theme + xlab("Throughput (Mb/s, log)") + ylab("Wall clock time (s, log)") + theme(legend.position = "none") + + geom_text_repel(data = dd5 %>% filter(is.na(latency), throughput==1000), aes(label = system), size = 8, nudge_x = .35, segment.color = NA, family="serif") +dev.off() + + +read.table("44.csv", header=T, sep=",", stringsAsFactors=F, na.strings="-1") -> dd3 + +compnames <- c("snappy"="Sy", "xz"="XZ", "lz4" = "LZ4", "gzip" = "GZ") + + +dd3 %>% filter(grepl("netcat-prot-", dd3$system, fixed=T), tuple == 1000000, bin_chunksize == 100000, is.na(latency)) %>% select(bin_compress, bin_orientation, throughput, bytes, time) %>% group_by(bin_compress, bin_orientation, throughput) %>% summarise_each(funs(mean,sd,se=sd(.)/sqrt(n()))) %>% mutate(bytes_mean_mb=(bytes_mean/1024/1024), bin_compress_name=ifelse(bin_compress == "", "None", compnames[bin_compress]), throughput=ifelse(is.na(throughput), "Unlimited", throughput)) -> dd4 + + + + + +dd4$throughput <- factor(dd4$throughput, levels=c("Unlimited", "1000", "100", "10")) + +pdf("compression.pdf", width=10, height=6) + +ggplot(dd4 %>% filter(bin_orientation == "col"), aes(y=time_mean, x=bytes_mean_mb, label=bin_compress_name)) + geom_point(size=3) + theme + scale_y_log10() + ylab("Wall clock time (s, log)") + xlab("Bytes transferred (MB)") + theme(legend.position = "none") + facet_wrap(~throughput) + geom_text_repel(family="serif", size=8, point.padding=unit(2,"mm")) + + +dev.off() + + +# for table + +xtable::xtable(dd4 %>% filter(is.na(throughput)) %>% select(bin_compress, bin_orientation, bytes_mean) %>% mutate(bytes_mean_mb=round(bytes_mean/1024/1024,1), ratio=round(110271223/bytes_mean, 2)) %>% select(bin_compress, bin_orientation,bytes_mean_mb, ratio)) + + + + + +read.table("47.csv", header=T, sep=",", stringsAsFactors=F, na.strings="-1") -> dd3 + + +ntuples <- 10000000 + +sysnames <- c("monetdb-prot10-strings-none-none"="Null-Terminated", "monetdb-prot10-strings-snappy-none"="Null-Terminated + Sy","monetdb-prot10-none-none"="Fixed Witdth", "monetdb-prot10-snappy-none"="Fixed Width + Sy") + +dd3 %>% filter(network=="unlimitedms-unlimitedmb/s", protocol == "odbc-noprint", timeout != 1, system %in% c(names(sysnames)), tuple %in% c(1,ntuples)) %>% mutate(system=sysnames[system]) %>% select(system, tuple, time, bytes) %>% group_by(system, tuple) %>% summarise_each(funs(mean,sd,se=sd(.)/sqrt(n()))) -> dd4 -dd5 <- dd4 %>% filter(tuple==1000000) %>% rename(time_1m=time, bytes_1m=bytes) %>% left_join(dd4 %>% filter(tuple == 1) %>% rename(time_1=time, bytes_1=bytes), by="system") %>% mutate(tuple_time_ms=((time_1m-time_1)/1000000)*1000, tuple_bytes=(bytes_1m-bytes_1)/1000000) + dd5 <- as.data.frame(dd4 %>% filter(tuple==ntuples) %>% rename(time_1m=time_mean, bytes_1m=bytes_mean) %>% left_join(dd4 %>% filter(tuple == 1) %>% rename(time_1=time_mean, bytes_1=bytes_mean), by=c("system"))) %>% mutate(tuple_time_ms=((time_1m-time_1)/ntuples)*1000, tuple_bytes=(bytes_1m-bytes_1)/ntuples, adj_time= time_1m-time_1, adj_bytes=bytes_1m-bytes_1) -pdf("time-per-tuple.pdf", width=10, height=5) +pdf("total-time-strings-noprint.pdf", width=10, height=3) -ggplot(dd5, aes( y=tuple_time_ms, x=reorder(system, -tuple_time_ms), label=round(tuple_time_ms, 4))) + geom_bar(stat="identity", position="dodge", width=.5) + theme + xlab("") + ylab("Time per tuple (ms)") + coord_flip() + guides(fill=FALSE) + geom_text(size=7, hjust=-.2, family="serif") + scale_y_continuous(limits=c(0, .03)) +ggplot(dd5 %>% filter(tuple.x == ntuples), aes(y=adj_time, x=reorder(system, -adj_time), label=round(adj_time, 2))) + geom_bar(stat="identity", position="dodge", width=.5) + theme + xlab("") + ylab("Wall clock time (s)") + coord_flip() + geom_text(size=7, hjust=-.4, family="serif") + scale_y_continuous(limits=c(0, 4)) dev.off() +pdf("total-bytes-strings.pdf", width=10, height=3) + +ggplot(dd4 %>% filter(tuple == ntuples) %>% mutate(bytes_mean_mb=(bytes_mean/1024/1024)), aes(y=bytes_mean_mb, x=reorder(system, -bytes_mean_mb), label=round(bytes_mean_mb, 2))) + geom_bar(stat="identity", position="dodge", width=.5) + theme + xlab("") + ylab("Bytes transferred (MB)") + coord_flip() + geom_text(size=7, hjust=-.2, family="serif") + scale_y_continuous(limits=c(0, 840)) -pdf("bytes-per-tuple.pdf", width=10, height=5) + +dev.off() -ggplot(dd5, aes( y=tuple_bytes, x=reorder(system, -tuple_bytes), label=round(tuple_bytes))) + geom_bar(stat="identity", position="dodge", width=.5) + theme + xlab("") + ylab("Bytes per tuple") + coord_flip() + guides(fill=FALSE) + geom_text(size=7, hjust=-.2, family="serif") + scale_y_continuous(limits=c(0, 410)) +sysnames <- c("monetdb-prot10-intsonly-pfor"="PFOR-DELTA", "monetdb-prot10-intsonly-none"="None","monetdb-prot10-intsonly-pfor-snappy"="PFOR + Snappy", "monetdb-prot10-intsonly-none-snappy"="Snappy") +dd3 %>% filter(network=="unlimitedms-unlimitedmb/s", protocol == "odbc-noprint", timeout != 1, system %in% c(names(sysnames)), tuple %in% c(1,ntuples)) %>% mutate(system=sysnames[system]) %>% select(system, tuple, time, bytes) %>% group_by(system, tuple) %>% summarise_each(funs(mean,sd,se=sd(.)/sqrt(n()))) -> dd4 + + dd5 <- as.data.frame(dd4 %>% filter(tuple==ntuples) %>% rename(time_1m=time_mean, bytes_1m=bytes_mean) %>% left_join(dd4 %>% filter(tuple == 1) %>% rename(time_1=time_mean, bytes_1=bytes_mean), by=c("system"))) %>% mutate(tuple_time_ms=((time_1m-time_1)/ntuples)*1000, tuple_bytes=(bytes_1m-bytes_1)/ntuples, adj_time= time_1m-time_1, adj_bytes=bytes_1m-bytes_1) + + +pdf("total-time-ints-noprint.pdf", width=10, height=3) + + +ggplot(dd5 %>% filter(tuple.x == ntuples), aes(y=adj_time, x=reorder(system, -adj_time), label=round(adj_time, 3))) + geom_bar(stat="identity", position="dodge", width=.5) + theme + xlab("") + ylab("Wall clock time (s)") + coord_flip() + geom_text(size=7, hjust=-.4, family="serif") + scale_y_continuous(limits=c(0, 1.2)) dev.off() +pdf("total-bytes-ints.pdf", width=10, height=3) + +ggplot(dd4 %>% filter(tuple == ntuples) %>% mutate(bytes_mean_mb=(bytes_mean/1024/1024)), aes(y=bytes_mean_mb, x=reorder(system, -bytes_mean_mb), label=round(bytes_mean_mb, 2))) + geom_bar(stat="identity", position="dodge", width=.5) + theme + xlab("") + ylab("Bytes transferred (MB)") + coord_flip() + geom_text(size=7, hjust=-.2, family="serif") + scale_y_continuous(limits=c(0, 220)) + + +dev.off() + + + + +sysnames <- c("monetdb-prot10"="New Protocol", "netcat-csv"="netcat") + +dd3 %>% filter(network=="unlimitedms-unlimitedmb/s", timeout != 1, system %in% c(names(sysnames)), tuple %in% c(1,ntuples)) %>% mutate(system=sysnames[system]) %>% select(system, tuple, time, bytes) %>% group_by(system, tuple) %>% summarise_each(funs(mean,sd,se=sd(.)/sqrt(n()))) -> dd4 + + dd5 <- as.data.frame(dd4 %>% filter(tuple==ntuples) %>% rename(time_1m=time_mean, bytes_1m=bytes_mean) %>% left_join(dd4 %>% filter(tuple == 1) %>% rename(time_1=time_mean, bytes_1=bytes_mean), by=c("system"))) %>% mutate(tuple_time_ms=((time_1m-time_1)/ntuples)*1000, tuple_bytes=(bytes_1m-bytes_1)/ntuples, adj_time= time_1m-time_1, adj_bytes=bytes_1m-bytes_1) + + +pdf("total-time-shootout-noprint.pdf", width=10, height=3) + + +ggplot(dd5 %>% filter(tuple.x == ntuples), aes(y=adj_time, x=reorder(system, -adj_time), label=round(adj_time, 2))) + geom_bar(stat="identity", position="dodge", width=.5) + theme + xlab("") + ylab("Wall clock time (s)") + coord_flip() + geom_text(size=7, hjust=-.4, family="serif") #+ scale_y_continuous(limits=c(0, 4)) + +dev.off() + +pdf("total-bytes-shootout.pdf", width=10, height=3) + +ggplot(dd4 %>% filter(tuple == ntuples) %>% mutate(bytes_mean_mb=(bytes_mean/1024/1024)), aes(y=bytes_mean_mb, x=reorder(system, -bytes_mean_mb), label=round(bytes_mean_mb, 2))) + geom_bar(stat="identity", position="dodge", width=.5) + theme + xlab("") + ylab("Bytes transferred (MB)") + coord_flip() + geom_text(size=7, hjust=-.2, family="serif") # + scale_y_continuous(limits=c(0, 840)) + + +dev.off() + + + + +read.table("48.csv", header=T, sep=",", stringsAsFactors=F, na.strings="-1") -> dd3 + +ntuples <- 1000000 + +dd3 %>% filter(grepl("monetdb-prot10-", dd3$system, fixed=T)) %>% mutate(system2 = gsub("monetdb-prot10-","", system, fixed=T), compression=str_split_fixed(system2, "-", 2)[,1], chunksize=as.integer(str_split_fixed(system2, "-", 2)[,2])) %>% filter(timeout != 1, tuple %in% c(1,ntuples)) %>% select(compression, chunksize, tuple, time, bytes) %>% group_by(compression, chunksize, tuple) %>% summarise_each(funs(mean,sd,se=sd(.)/sqrt(n()))) -> dd4 + + + dd5 <- as.data.frame(dd4 %>% filter(tuple==ntuples) %>% rename(time_1m=time_mean, bytes_1m=bytes_mean) %>% left_join(dd4 %>% filter(tuple == 1) %>% rename(time_1=time_mean, bytes_1=bytes_mean), by=c("compression", "chunksize"))) %>% mutate(tuple_time_ms=((time_1m-time_1)/ntuples)*1000, tuple_bytes=(bytes_1m-bytes_1)/ntuples, adj_time= time_1m-time_1, adj_bytes=bytes_1m-bytes_1) + +dd5 %>% select(compression, chunksize, adj_time, adj_bytes) -> dd6 + +print(xtable::xtable(dd6 %>% filter(compression=="snappy")%>% full_join(dd6 %>% filter(compression =="none"), by="chunksize") %>% select(chunksize, time_uncomp=adj_time.y, bytes_uncom=adj_bytes.y, time_snappy=adj_time.x, bytes_snappy=adj_bytes.x) %>% mutate(ratio=round(bytes_uncom/bytes_snappy, 2), time_uncomp=round(time_uncomp, 2), time_snappy=round(time_snappy, 2), bytes_uncom=round(bytes_uncom/1024/1024, 1)) %>% select(chunksize, time_uncomp, time_snappy, bytes_uncom, ratio)), include.rownames=FALSE) + +# ggplot(dd5, aes(y=adj_time, x=chunksize/1024, color=compression)) + theme + xlab("Chunk Size (KB)") + ylab("Wall clock time (s)") + geom_line() + geom_point() + scale_x_log10() + + +# ggplot(dd5, aes(y=adj_bytes/1024, x=chunksize/1024, color=compression)) + theme + xlab("Chunk Size (KB)") + ylab("Data transferred (KB)") + geom_line() + geom_point() + scale_x_log10() + + + + + + + +read.table("49.csv", header=T, sep=",", stringsAsFactors=F, na.strings="-1") -> dd3 + +ntuples <- 1000000 + +dd3 %>% filter(grepl("monetdb-prot10-", dd3$system, fixed=T)) %>% mutate(system2 = gsub("monetdb-prot10-","", system, fixed=T), compression=str_split_fixed(system2, "-", 2)[,1], stringmethod=str_split_fixed(system2, "-", 2)[,2]) %>% filter(timeout != 1, tuple %in% c(1,ntuples)) %>% select(compression, stringmethod, tuple, time, bytes) %>% group_by(compression, stringmethod, tuple) %>% summarise_each(funs(mean,sd,se=sd(.)/sqrt(n()))) -> dd4 + + + dd5 <- as.data.frame(dd4 %>% filter(tuple==ntuples) %>% rename(time_1m=time_mean, bytes_1m=bytes_mean) %>% left_join(dd4 %>% filter(tuple == 1) %>% rename(time_1=time_mean, bytes_1=bytes_mean), by=c("compression", "stringmethod"))) %>% mutate(tuple_time_ms=((time_1m-time_1)/ntuples)*1000, tuple_bytes=(bytes_1m-bytes_1)/ntuples, adj_time= time_1m-time_1, adj_bytes=bytes_1m-bytes_1) + + + # ggplot(dd5 %>% filter(compression == "snappy"), aes(y=adj_time, x=reorder(stringmethod, -adj_time), label=round(adj_time, 2))) + geom_bar(stat="identity", position="dodge", width=.5) + theme + xlab("") + ylab("Wall clock time") + coord_flip() + geom_text(size=7, hjust=-.2, family="serif") + + + # ggplot(dd5 %>% filter(compression == "snappy"), aes(y=adj_bytes, x=reorder(stringmethod, -adj_bytes), label=round(adj_bytes, 2))) + geom_bar(stat="identity", position="dodge", width=.5) + theme + xlab("") + ylab("Wall clock time") + coord_flip() + geom_text(size=7, hjust=-.2, family="serif") + + + +dd5 %>% select(compression, stringmethod, adj_time, adj_bytes) -> dd6 +print(xtable::xtable(dd6 %>% filter(compression=="snappy")%>% full_join(dd6 %>% filter(compression =="none"), by="stringmethod") %>% select(stringmethod, time_uncomp=adj_time.y, bytes_uncom=adj_bytes.y, time_snappy=adj_time.x, bytes_snappy=adj_bytes.x) %>% mutate(ratio=round(bytes_uncom/bytes_snappy, 2), time_uncomp=round(time_uncomp, 2), time_snappy=round(time_snappy, 2), bytes_uncom=round(bytes_uncom/1024/1024, 1)) %>% select(stringmethod, time_uncomp, time_snappy, bytes_uncom, ratio)), include.rownames=FALSE)