Files
@ 78f3a9ffe84c
Branch filter:
Location: DA/protocols/vldb-protocols2.R
78f3a9ffe84c
18.8 KiB
text/S-plus
much stuff
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 | library(dplyr)
library(ggplot2)
library(ggthemes)
library(ggrepel)
library(stringr)
theme <- theme_few(base_size = 24) +
theme(axis.title.y=element_text(vjust=0.9),
axis.title.x=element_text(vjust=-0.1),
axis.ticks.x=element_blank(),
text=element_text(family="serif"))
read.table("11.csv", header=T, sep=",", stringsAsFactors=F, na.strings="-1") -> dd3
sysnames <- c("mariadb-default"="MySQL", "mariadb-compress"="MySQL+C", "oracle-default"="DBMS X", "postgres-default" = "PostgreSQL", "db2-default" = "DB2", "monetdb-default"="MonetDB", "hbase-default" = "HBase", "hive-default" = "Hive", "mongodb-default" = "MongoDB")
dd3 %>% filter(network=="unlimited", protocol == "native", timeout != 1, system %in% c("hive-default", "hbase-default", "mariadb-default", "mariadb-compress", "db2-default", "oracle-default", "postgres-default", "monetdb-default", "mongodb-default"), tuple %in% c(1, 1000000)) %>% mutate(system=sysnames[system]) %>% select(system, tuple, time, bytes) %>% group_by(system, tuple) %>% summarise_each(funs(mean,sd,se=sd(.)/sqrt(n()))) -> dd4
dd5 <- dd4 %>% filter(tuple==1000000) %>% rename(time_1m=time_mean, bytes_1m=bytes_mean) %>% left_join(dd4 %>% filter(tuple == 1) %>% rename(time_1=time_mean, bytes_1=bytes_mean), by="system") %>% mutate(tuple_time_ms=((time_1m-time_1)/1000000)*1000, tuple_bytes=(bytes_1m-bytes_1)/1000000)
pdf("total-time.pdf", width=10, height=5)
ggplot(dd4 %>% filter(tuple == 1000000), aes(y=time_mean, x=reorder(system, -time_mean), label=round(time_mean, 1))) + geom_bar(stat="identity", position="dodge", width=.5) + geom_errorbar(aes(ymax = time_mean + time_se, ymin=time_mean - time_se), width=0.2, size=1) + theme + xlab("") + ylab("Wall clock time (s)") + coord_flip() + geom_text(size=7, hjust=-.8, family="serif") + scale_y_continuous(limits=c(0, 31))
dev.off()
pdf("total-bytes.pdf", width=10, height=5)
ggplot(dd4 %>% filter(tuple == 1000000) %>% mutate(bytes_mean_mb=(bytes_mean/1024/1024)), aes(y=bytes_mean_mb, x=reorder(system, -bytes_mean_mb), label=round(bytes_mean_mb, 2))) + geom_hline(yintercept=120, linetype="dashed") + geom_bar(stat="identity", position="dodge", width=.5) + theme + xlab("") + ylab("Bytes transferred (MB)") + coord_flip() + geom_text(size=7, hjust=-.2, family="serif") + scale_y_continuous(limits=c(0, 400))
dev.off()
# pdf("time-per-tuple.pdf", width=10, height=5)
# ggplot(dd5, aes( y=tuple_time_ms, x=reorder(system, -tuple_time_ms), label=round(tuple_time_ms, 4))) + geom_bar(stat="identity", position="dodge", width=.5) + theme + xlab("") + ylab("Time per tuple (ms)") + coord_flip() + guides(fill=FALSE) + geom_text(size=7, hjust=-.2, family="serif") + scale_y_continuous(limits=c(0, .03))
# dev.off()
# pdf("bytes-per-tuple.pdf", width=10, height=5)
# ggplot(dd5, aes( y=tuple_bytes, x=reorder(system, -tuple_bytes), label=round(tuple_bytes))) + geom_bar(stat="identity", position="dodge", width=.5) + theme + xlab("") + ylab("Bytes per tuple") + coord_flip() + guides(fill=FALSE) + geom_text(size=7, hjust=-.2, family="serif") + scale_y_continuous(limits=c(0, 410))
# dev.off()
# ggplot(dd5, aes(y=tuple_time_ms, x=tuple_bytes, label=system)) + geom_point() + geom_text(size=7, hjust=-.1, family="serif") + theme
read.table("13.csv", header=T, sep=",", stringsAsFactors=F, na.strings="-1") -> dd3
ntuples <- 1000000
sysnames <- c("mariadb-default"="MySQL", "mariadb-compress"="MySQL+C", "oracle-default"="DBMS X", "postgres-default" = "PostgreSQL", "db2-default" = "DB2", "monetdb-prot9"="MonetDB", "hbase-default" = "HBase", "hive-default" = "Hive", "mongodb-default" = "MongoDB")
dd3 %>% filter(protocol == "odbc-noprint", timeout != 1, system %in% c("hive-default", "hbase-default", "mariadb-default", "mariadb-compress", "db2-default", "oracle-default", "postgres-default", "monetdb-prot9", "mongodb-default"), tuple %in% c(1, ntuples)) %>% mutate(system=sysnames[system]) %>% select(system, tuple, network, time, bytes) %>% group_by(system, tuple, network) %>% summarise_each(funs(mean,sd,se=sd(.)/sqrt(n()))) -> dd4
dd5 <- as.data.frame(dd4 %>% filter(tuple==ntuples) %>% rename(time_1m=time_mean, bytes_1m=bytes_mean) %>% left_join(dd4 %>% filter(tuple == 1) %>% rename(time_1=time_mean, bytes_1=bytes_mean), by=c("system", "network"))) %>% mutate(tuple_time_ms=((time_1m-time_1)/ntuples)*1000, tuple_bytes=(bytes_1m-bytes_1)/ntuples, adj_time= time_1m-time_1, adj_bytes=bytes_1m-bytes_1)
pdf("total-time-noprint.pdf", width=10, height=3.5)
ggplot(dd4 %>% filter(tuple == 1000000, network=="unlimited"), aes(y=time_mean, x=reorder(system, -time_mean), label=round(time_mean, 1))) + geom_hline(yintercept=0.237, linetype="dashed") + geom_bar(stat="identity", position="dodge", width=.5) + theme + xlab("") + ylab("Wall clock time (s)") + coord_flip() + geom_text(size=7, hjust=-.8, family="serif") + scale_y_continuous(limits=c(0, 6))
dev.off()
# pdf("time-per-tuple5.pdf", width=10, height=5)
# ggplot(dd5 %>% filter(network == "unlimited"), aes( y=tuple_time_ms, x=reorder(system, -tuple_time_ms), label=round(tuple_time_ms, 4))) + geom_bar(stat="identity", position="dodge", width=.5) + theme + xlab("") + ylab("Time per tuple (ms)") + coord_flip() + guides(fill=FALSE) + geom_text(size=7, hjust=-.2, family="serif")
# dev.off()
# pdf("time-per-tuple6.pdf", width=10, height=5)
# ggplot(dd5 %>% filter(network == "10mbitethhd"), aes( y=tuple_time_ms, x=reorder(system, -tuple_time_ms), label=round(tuple_time_ms, 4))) + geom_bar(stat="identity", position="dodge", width=.5) + theme + xlab("") + ylab("Time per tuple (ms)") + coord_flip() + guides(fill=FALSE) + geom_text(size=7, hjust=-.2, family="serif")
# dev.off()
# pdf("bytes-per-tuple5.pdf", width=10, height=5)
# ggplot(dd5 %>% filter(network == "unlimited"), aes( y=tuple_bytes, x=reorder(system, -tuple_bytes), label=round(tuple_bytes))) + geom_bar(stat="identity", position="dodge", width=.5) + theme + xlab("") + ylab("Bytes per tuple") + coord_flip() + guides(fill=FALSE) + geom_text(size=7, hjust=-.2, family="serif")
# dev.off()
# ggplot(dd5, aes( x=network, y=tuple_time_ms, fill=system)) +geom_bar(stat="identity", position="dodge", width=.5)
read.table("43.csv", header=T, sep=",", stringsAsFactors=F, na.strings="-1") -> dd3
ntuples <- 1000000
sysnames <- c("mariadb-default"="MySQL", "mariadb-compress"="MySQL+C", "oracle-default"="DBMS X", "postgres-default" = "PostgreSQL", "db2-default" = "DB2", "monetdb-prot9"="MonetDB", "monetdb-prot10-none-none"="MonetDB++", "monetdb-prot10-snappy-none"="MonetDB++S", "hbase-default" = "HBase", "hive-default" = "Hive", "mongodb-default" = "MongoDB")
dd3 %>% filter(protocol == "odbc-noprint", timeout != 1, system %in% c("mariadb-default", "db2-default", "oracle-default", "postgres-default", "monetdb-prot9", "mariadb-compress"), tuple %in% c(1, ntuples)) %>% mutate(system=sysnames[system]) %>% select(system, tuple, throughput, latency, time, bytes) %>% group_by(system, tuple, throughput, latency) %>% summarise_each(funs(mean,sd,se=sd(.)/sqrt(n()))) -> dd4
dd5 <- as.data.frame(dd4 %>% filter(tuple==ntuples) %>% rename(time_1m=time_mean, bytes_1m=bytes_mean) %>% left_join(dd4 %>% filter(tuple == 1) %>% rename(time_1=time_mean, bytes_1=bytes_mean), by=c("system", "throughput", "latency"))) %>% mutate(tuple_time_ms=((time_1m-time_1)/ntuples)*1000, tuple_bytes=(bytes_1m-bytes_1)/ntuples, adj_time= time_1m-time_1, adj_bytes=bytes_1m-bytes_1)
pdf("latency.pdf", width=10, height=5)
ggplot(dd5 %>% filter(is.na(throughput)), aes(color=system, y=adj_time, x=latency, label=round(adj_time, 1))) + geom_line(size=1.5) + geom_point(size=3) + scale_y_log10() + scale_x_log10(breaks=c(0.1,1,10,100), limits=c(0.1, 500)) + theme + xlab("Latency (ms, log)") + ylab("Wall clock time (s, log)") + theme(legend.position = "none") + geom_text_repel(data = dd5 %>% filter(is.na(throughput), latency==100 | (system=="DBMS X" & latency==10)), aes(label = system), size = 8, nudge_x = .3, segment.color = NA, family="serif")
dev.off()
pdf("throughput.pdf", width=10, height=5)
ggplot(dd5 %>% filter(is.na(latency)), aes(color=system, y=adj_time, x=throughput, label=round(adj_time, 1))) + geom_line(size=1.5) + geom_point(size=3) + scale_y_log10() + scale_x_log10(breaks=c(10,100,1000), limits=c(10, 3500)) + theme + xlab("Throughput (Mb/s, log)") + ylab("Wall clock time (s, log)") + theme(legend.position = "none") +
geom_text_repel(data = dd5 %>% filter(is.na(latency), throughput==1000), aes(label = system), size = 8, nudge_x = .35, segment.color = NA, family="serif")
dev.off()
read.table("44.csv", header=T, sep=",", stringsAsFactors=F, na.strings="-1") -> dd3
compnames <- c("snappy"="Sy", "xz"="XZ", "lz4" = "LZ4", "gzip" = "GZ")
dd3 %>% filter(grepl("netcat-prot-", dd3$system, fixed=T), tuple == 1000000, bin_chunksize == 100000, is.na(latency)) %>% select(bin_compress, bin_orientation, throughput, bytes, time) %>% group_by(bin_compress, bin_orientation, throughput) %>% summarise_each(funs(mean,sd,se=sd(.)/sqrt(n()))) %>% mutate(bytes_mean_mb=(bytes_mean/1024/1024), bin_compress_name=ifelse(bin_compress == "", "None", compnames[bin_compress]), throughput=ifelse(is.na(throughput), "Unlimited", throughput)) -> dd4
dd4$throughput <- factor(dd4$throughput, levels=c("Unlimited", "1000", "100", "10"))
pdf("compression.pdf", width=10, height=6)
ggplot(dd4 %>% filter(bin_orientation == "col"), aes(y=time_mean, x=bytes_mean_mb, label=bin_compress_name)) + geom_point(size=3) + theme + scale_y_log10() + ylab("Wall clock time (s, log)") + xlab("Bytes transferred (MB)") + theme(legend.position = "none") + facet_wrap(~throughput) + geom_text_repel(family="serif", size=8, point.padding=unit(2,"mm"))
dev.off()
# for table
xtable::xtable(dd4 %>% filter(is.na(throughput)) %>% select(bin_compress, bin_orientation, bytes_mean) %>% mutate(bytes_mean_mb=round(bytes_mean/1024/1024,1), ratio=round(110271223/bytes_mean, 2)) %>% select(bin_compress, bin_orientation,bytes_mean_mb, ratio))
read.table("47.csv", header=T, sep=",", stringsAsFactors=F, na.strings="-1") -> dd3
ntuples <- 10000000
sysnames <- c("monetdb-prot10-strings-none-none"="Null-Terminated", "monetdb-prot10-strings-snappy-none"="Null-Terminated + Sy","monetdb-prot10-none-none"="Fixed Witdth", "monetdb-prot10-snappy-none"="Fixed Width + Sy")
dd3 %>% filter(network=="unlimitedms-unlimitedmb/s", protocol == "odbc-noprint", timeout != 1, system %in% c(names(sysnames)), tuple %in% c(1,ntuples)) %>% mutate(system=sysnames[system]) %>% select(system, tuple, time, bytes) %>% group_by(system, tuple) %>% summarise_each(funs(mean,sd,se=sd(.)/sqrt(n()))) -> dd4
dd5 <- as.data.frame(dd4 %>% filter(tuple==ntuples) %>% rename(time_1m=time_mean, bytes_1m=bytes_mean) %>% left_join(dd4 %>% filter(tuple == 1) %>% rename(time_1=time_mean, bytes_1=bytes_mean), by=c("system"))) %>% mutate(tuple_time_ms=((time_1m-time_1)/ntuples)*1000, tuple_bytes=(bytes_1m-bytes_1)/ntuples, adj_time= time_1m-time_1, adj_bytes=bytes_1m-bytes_1)
pdf("total-time-strings-noprint.pdf", width=10, height=3)
ggplot(dd5 %>% filter(tuple.x == ntuples), aes(y=adj_time, x=reorder(system, -adj_time), label=round(adj_time, 2))) + geom_bar(stat="identity", position="dodge", width=.5) + theme + xlab("") + ylab("Wall clock time (s)") + coord_flip() + geom_text(size=7, hjust=-.4, family="serif") + scale_y_continuous(limits=c(0, 4))
dev.off()
pdf("total-bytes-strings.pdf", width=10, height=3)
ggplot(dd4 %>% filter(tuple == ntuples) %>% mutate(bytes_mean_mb=(bytes_mean/1024/1024)), aes(y=bytes_mean_mb, x=reorder(system, -bytes_mean_mb), label=round(bytes_mean_mb, 2))) + geom_bar(stat="identity", position="dodge", width=.5) + theme + xlab("") + ylab("Bytes transferred (MB)") + coord_flip() + geom_text(size=7, hjust=-.2, family="serif") + scale_y_continuous(limits=c(0, 840))
dev.off()
sysnames <- c("monetdb-prot10-intsonly-pfor"="PFOR-DELTA", "monetdb-prot10-intsonly-none"="None","monetdb-prot10-intsonly-pfor-snappy"="PFOR + Snappy", "monetdb-prot10-intsonly-none-snappy"="Snappy")
dd3 %>% filter(network=="unlimitedms-unlimitedmb/s", protocol == "odbc-noprint", timeout != 1, system %in% c(names(sysnames)), tuple %in% c(1,ntuples)) %>% mutate(system=sysnames[system]) %>% select(system, tuple, time, bytes) %>% group_by(system, tuple) %>% summarise_each(funs(mean,sd,se=sd(.)/sqrt(n()))) -> dd4
dd5 <- as.data.frame(dd4 %>% filter(tuple==ntuples) %>% rename(time_1m=time_mean, bytes_1m=bytes_mean) %>% left_join(dd4 %>% filter(tuple == 1) %>% rename(time_1=time_mean, bytes_1=bytes_mean), by=c("system"))) %>% mutate(tuple_time_ms=((time_1m-time_1)/ntuples)*1000, tuple_bytes=(bytes_1m-bytes_1)/ntuples, adj_time= time_1m-time_1, adj_bytes=bytes_1m-bytes_1)
pdf("total-time-ints-noprint.pdf", width=10, height=3)
ggplot(dd5 %>% filter(tuple.x == ntuples), aes(y=adj_time, x=reorder(system, -adj_time), label=round(adj_time, 3))) + geom_bar(stat="identity", position="dodge", width=.5) + theme + xlab("") + ylab("Wall clock time (s)") + coord_flip() + geom_text(size=7, hjust=-.4, family="serif") + scale_y_continuous(limits=c(0, 1.2))
dev.off()
pdf("total-bytes-ints.pdf", width=10, height=3)
ggplot(dd4 %>% filter(tuple == ntuples) %>% mutate(bytes_mean_mb=(bytes_mean/1024/1024)), aes(y=bytes_mean_mb, x=reorder(system, -bytes_mean_mb), label=round(bytes_mean_mb, 2))) + geom_bar(stat="identity", position="dodge", width=.5) + theme + xlab("") + ylab("Bytes transferred (MB)") + coord_flip() + geom_text(size=7, hjust=-.2, family="serif") + scale_y_continuous(limits=c(0, 220))
dev.off()
sysnames <- c("monetdb-prot10"="New Protocol", "netcat-csv"="netcat")
dd3 %>% filter(network=="unlimitedms-unlimitedmb/s", timeout != 1, system %in% c(names(sysnames)), tuple %in% c(1,ntuples)) %>% mutate(system=sysnames[system]) %>% select(system, tuple, time, bytes) %>% group_by(system, tuple) %>% summarise_each(funs(mean,sd,se=sd(.)/sqrt(n()))) -> dd4
dd5 <- as.data.frame(dd4 %>% filter(tuple==ntuples) %>% rename(time_1m=time_mean, bytes_1m=bytes_mean) %>% left_join(dd4 %>% filter(tuple == 1) %>% rename(time_1=time_mean, bytes_1=bytes_mean), by=c("system"))) %>% mutate(tuple_time_ms=((time_1m-time_1)/ntuples)*1000, tuple_bytes=(bytes_1m-bytes_1)/ntuples, adj_time= time_1m-time_1, adj_bytes=bytes_1m-bytes_1)
pdf("total-time-shootout-noprint.pdf", width=10, height=3)
ggplot(dd5 %>% filter(tuple.x == ntuples), aes(y=adj_time, x=reorder(system, -adj_time), label=round(adj_time, 2))) + geom_bar(stat="identity", position="dodge", width=.5) + theme + xlab("") + ylab("Wall clock time (s)") + coord_flip() + geom_text(size=7, hjust=-.4, family="serif") #+ scale_y_continuous(limits=c(0, 4))
dev.off()
pdf("total-bytes-shootout.pdf", width=10, height=3)
ggplot(dd4 %>% filter(tuple == ntuples) %>% mutate(bytes_mean_mb=(bytes_mean/1024/1024)), aes(y=bytes_mean_mb, x=reorder(system, -bytes_mean_mb), label=round(bytes_mean_mb, 2))) + geom_bar(stat="identity", position="dodge", width=.5) + theme + xlab("") + ylab("Bytes transferred (MB)") + coord_flip() + geom_text(size=7, hjust=-.2, family="serif") # + scale_y_continuous(limits=c(0, 840))
dev.off()
read.table("48.csv", header=T, sep=",", stringsAsFactors=F, na.strings="-1") -> dd3
ntuples <- 1000000
dd3 %>% filter(grepl("monetdb-prot10-", dd3$system, fixed=T)) %>% mutate(system2 = gsub("monetdb-prot10-","", system, fixed=T), compression=str_split_fixed(system2, "-", 2)[,1], chunksize=as.integer(str_split_fixed(system2, "-", 2)[,2])) %>% filter(timeout != 1, tuple %in% c(1,ntuples)) %>% select(compression, chunksize, tuple, time, bytes) %>% group_by(compression, chunksize, tuple) %>% summarise_each(funs(mean,sd,se=sd(.)/sqrt(n()))) -> dd4
dd5 <- as.data.frame(dd4 %>% filter(tuple==ntuples) %>% rename(time_1m=time_mean, bytes_1m=bytes_mean) %>% left_join(dd4 %>% filter(tuple == 1) %>% rename(time_1=time_mean, bytes_1=bytes_mean), by=c("compression", "chunksize"))) %>% mutate(tuple_time_ms=((time_1m-time_1)/ntuples)*1000, tuple_bytes=(bytes_1m-bytes_1)/ntuples, adj_time= time_1m-time_1, adj_bytes=bytes_1m-bytes_1)
dd5 %>% select(compression, chunksize, adj_time, adj_bytes) -> dd6
print(xtable::xtable(dd6 %>% filter(compression=="snappy")%>% full_join(dd6 %>% filter(compression =="none"), by="chunksize") %>% select(chunksize, time_uncomp=adj_time.y, bytes_uncom=adj_bytes.y, time_snappy=adj_time.x, bytes_snappy=adj_bytes.x) %>% mutate(ratio=round(bytes_uncom/bytes_snappy, 2), time_uncomp=round(time_uncomp, 2), time_snappy=round(time_snappy, 2), bytes_uncom=round(bytes_uncom/1024/1024, 1)) %>% select(chunksize, time_uncomp, time_snappy, bytes_uncom, ratio)), include.rownames=FALSE)
# ggplot(dd5, aes(y=adj_time, x=chunksize/1024, color=compression)) + theme + xlab("Chunk Size (KB)") + ylab("Wall clock time (s)") + geom_line() + geom_point() + scale_x_log10()
# ggplot(dd5, aes(y=adj_bytes/1024, x=chunksize/1024, color=compression)) + theme + xlab("Chunk Size (KB)") + ylab("Data transferred (KB)") + geom_line() + geom_point() + scale_x_log10()
read.table("49.csv", header=T, sep=",", stringsAsFactors=F, na.strings="-1") -> dd3
ntuples <- 1000000
dd3 %>% filter(grepl("monetdb-prot10-", dd3$system, fixed=T)) %>% mutate(system2 = gsub("monetdb-prot10-","", system, fixed=T), compression=str_split_fixed(system2, "-", 2)[,1], stringmethod=str_split_fixed(system2, "-", 2)[,2]) %>% filter(timeout != 1, tuple %in% c(1,ntuples)) %>% select(compression, stringmethod, tuple, time, bytes) %>% group_by(compression, stringmethod, tuple) %>% summarise_each(funs(mean,sd,se=sd(.)/sqrt(n()))) -> dd4
dd5 <- as.data.frame(dd4 %>% filter(tuple==ntuples) %>% rename(time_1m=time_mean, bytes_1m=bytes_mean) %>% left_join(dd4 %>% filter(tuple == 1) %>% rename(time_1=time_mean, bytes_1=bytes_mean), by=c("compression", "stringmethod"))) %>% mutate(tuple_time_ms=((time_1m-time_1)/ntuples)*1000, tuple_bytes=(bytes_1m-bytes_1)/ntuples, adj_time= time_1m-time_1, adj_bytes=bytes_1m-bytes_1)
# ggplot(dd5 %>% filter(compression == "snappy"), aes(y=adj_time, x=reorder(stringmethod, -adj_time), label=round(adj_time, 2))) + geom_bar(stat="identity", position="dodge", width=.5) + theme + xlab("") + ylab("Wall clock time") + coord_flip() + geom_text(size=7, hjust=-.2, family="serif")
# ggplot(dd5 %>% filter(compression == "snappy"), aes(y=adj_bytes, x=reorder(stringmethod, -adj_bytes), label=round(adj_bytes, 2))) + geom_bar(stat="identity", position="dodge", width=.5) + theme + xlab("") + ylab("Wall clock time") + coord_flip() + geom_text(size=7, hjust=-.2, family="serif")
dd5 %>% select(compression, stringmethod, adj_time, adj_bytes) -> dd6
print(xtable::xtable(dd6 %>% filter(compression=="snappy")%>% full_join(dd6 %>% filter(compression =="none"), by="stringmethod") %>% select(stringmethod, time_uncomp=adj_time.y, bytes_uncom=adj_bytes.y, time_snappy=adj_time.x, bytes_snappy=adj_bytes.x) %>% mutate(ratio=round(bytes_uncom/bytes_snappy, 2), time_uncomp=round(time_uncomp, 2), time_snappy=round(time_snappy, 2), bytes_uncom=round(bytes_uncom/1024/1024, 1)) %>% select(stringmethod, time_uncomp, time_snappy, bytes_uncom, ratio)), include.rownames=FALSE)
|