# Files @ e23e3482a0b7
# Branch filter:
#
# Location: DA/protocols/vldb-protocols.R
#
# Hannes Muehleisen
# more stuff
# library(dplyr)
# library(ggplot2)


# read.table("~/Desktop/secondrun.csv", header=F,  sep=",", stringsAsFactors=F) -> dd
# names(dd) <- c('system', 'network', 'tuple', 'run', 'time', "bytes")



# # dd %>% filter(network == "unlimited") %>% select(system,tuple,time) %>%  group_by(system,tuple) %>% summarise_each(funs(mean,sd,se=sd(.)/sqrt(n()))) -> df
# # limits <- aes(ymax = mean + se, ymin=mean - se)

# # p <- ggplot(df, aes(color=system, y=mean, x=tuple)) + scale_x_log10() + scale_y_log10() + geom_point() + geom_errorbar(limits, width=0.25)
# # print(p)





# dd %>% select(system,network,tuple,time) %>%  group_by(system,network, tuple) %>% summarise_each(funs(mean,sd,se=sd(.)/sqrt(n()))) -> df
# limits <- aes(ymax = mean + se, ymin=mean - se)

# p <- ggplot(df, aes(color=system, y=mean, x=tuple)) + scale_x_log10() + scale_y_log10() + geom_point() + geom_errorbar(limits, width=0.25) + geom_line() + facet_grid( . ~ network)
# print(p)




# dd %>% filter(network=="unlimited") %>% select(system,tuple,bytes) %>%  group_by(system, tuple) %>% summarise_each(funs(mean,sd,se=sd(.)/sqrt(n()))) -> df2

# p <- ggplot(df2, aes(color=system, y=mean, x=tuple)) + scale_x_log10() + scale_y_log10() + geom_point() + geom_errorbar(limits, width=0.25)
# print(p)




# read.table("~/Desktop/test.csv", header=F,  sep=",", stringsAsFactors=F) -> dd2
# names(dd2) <- c('system', 'db', 'protocol', 'network', 'throughput', 'latency', 'tuple', 'run', 'time', "bytes")



# dd2 %>% filter(network=="unlimited", tuple == 10000000) %>% select(db,protocol,time,bytes) %>%  group_by(db,protocol) %>% summarise_each(funs(mean,sd,se=sd(.)/sqrt(n()))) -> df3


# ggplot(df3, aes(fill=db, y=time_mean, x=db)) + geom_bar(stat="identity") + facet_grid( . ~ protocol)



# ggplot(df3, aes(fill=db, y=time_mean, x=protocol)) + geom_bar(stat="identity") + facet_grid( . ~ db)


# ggplot(df3, aes(fill=db, y=bytes_mean, x=protocol)) + geom_bar(stat="identity") + facet_grid( . ~ db)




library(dplyr)
library(ggplot2)
library(ggthemes)

# Shared look for every figure below: the ggthemes "few" theme at base size 24,
# with the vertical justification of both axis titles adjusted and the x-axis
# tick marks removed.
# The variable keeps the name `theme`; calls written as `theme(...)` still reach
# the ggplot2 function because R skips non-function bindings when it resolves
# a call.
theme <- local({
  base_theme <- theme_few(base_size = 24)
  axis_tweaks <- theme(
    axis.title.y = element_text(vjust = 0.9),
    axis.title.x = element_text(vjust = -0.1),
    axis.ticks.x = element_blank()
  )
  base_theme + axis_tweaks
})


# Load the benchmark results: comma-separated, with a header row. The literal
# string -1 is treated as a missing value (NA) in every column.
dd3 <- read.table(
  "13.csv",
  header = TRUE,
  sep = ",",
  stringsAsFactors = FALSE,
  na.strings = "-1"
)

# Dump the column layout so a changed CSV schema is noticed right away.
str(dd3)

# Summary per (system, tuple) that feeds the time / bytes / packets / memory
# plots below. Rows are kept only when ALL of these hold:
#   * more than one tuple, "unlimited" network, "native" protocol,
#   * timeout is not 1 (rows whose timeout is NA are dropped by filter()),
#   * bin_chunksize is missing, or the system is the one chunked netcat
#     configuration listed explicitly,
#   * the system name does not contain the literal text "netcat-csv-".
# summarise_each() then applies mean, sd and the standard error
# (sd / sqrt(n)) to every non-grouping column, yielding columns named
# <column>_mean, <column>_sd and <column>_se.
# NOTE(review): summarise_each()/funs() are deprecated in current dplyr; they
# are kept here on purpose because moving to across() may change how
# non-numeric columns are handled -- confirm before migrating.
df4 <- dd3 %>%
  filter(
    tuple > 1,
    network == "unlimited",
    protocol == "native",
    timeout != 1,
    is.na(bin_chunksize) |
      system %in% c("netcat-prot-col-chunk-100000-snappy"),
    !grepl("netcat-csv-", system, fixed = TRUE)
  ) %>%
  group_by(system, tuple) %>%
  summarise_each(funs(mean, sd, se = sd(.) / sqrt(n())))

# Axis annotation for the tuple-count scale: every break, keyed by its decimal
# value written as a string, is displayed as the text "10^k" for k = 2..7.
tuplelabels <- setNames(
  paste0("10^", 2:7),
  c("100", "1000", "10000", "100000", "1000000", "10000000")
)
# Numeric break positions, recovered from the names of the label vector.
tuplebreaks <- as.numeric(names(tuplelabels))


# time per db as tuples increase
# Line plot of mean wall clock time against tuple count (both axes log10), one
# line per system, with error bars of +/- one standard error.
# The plot is print()ed explicitly: a ggplot object is only drawn when it is
# printed, and top-level auto-printing does not happen when this script is run
# through source(), which would leave the PDF without any page.
pdf("protocols-time.pdf", width = 10, height = 5)

print(
  ggplot(
    df4,
    aes(color = system, y = time_mean, x = tuple, label = round(time_mean, 2))
  ) +
    geom_line(size = 1.5) +
    geom_point(size = 2) +
    scale_x_log10(breaks = tuplebreaks, labels = tuplelabels) +
    scale_y_log10() +
    geom_errorbar(
      aes(ymax = time_mean + time_se, ymin = time_mean - time_se),
      width = 0.1,
      size = 1
    ) +
    theme +
    ggtitle("Wall clock time") +
    xlab("Tuples (#, log)") +
    ylab("Wall clock time (s, log)")
)

dev.off()


# Bar chart of mean wall clock time for the 1,000,000-tuple runs only, one
# dodged bar per system (linear y axis).
# print() is required so the figure is also written when the script is run via
# source(), where top-level expressions are not auto-printed.
pdf("protocols-time2.pdf", width = 10, height = 5)

print(
  ggplot(
    df4 %>% filter(tuple == 1000000),
    aes(fill = system, y = time_mean, x = tuple, label = round(time_mean, 2))
  ) +
    geom_bar(stat = "identity", position = "dodge") +
    scale_x_log10(breaks = tuplebreaks, labels = tuplelabels) +
    theme +
    ggtitle("Wall clock time") +
    xlab("Tuples (#, log)") +
    ylab("Wall clock time (s)")
)

dev.off()




# bytes per db
# Line plot of the mean number of bytes transferred against tuple count (both
# axes log10), one line per system.
# print() is required so the figure is also written when the script is run via
# source(), where top-level expressions are not auto-printed.
pdf("protocols-bytes1.pdf", width = 10, height = 5)

print(
  ggplot(df4, aes(color = system, y = bytes_mean, x = tuple)) +
    geom_line(size = 1.5) +
    geom_point(size = 2) +
    scale_x_log10(breaks = tuplebreaks, labels = tuplelabels) +
    scale_y_log10() +
    theme +
    ggtitle("Bytes Transferred") +
    xlab("Tuples (#, log)") +
    ylab("Bytes (#, log)")
)

dev.off()


# Bar chart of the mean number of bytes transferred, restricted to runs with
# more than 1000 tuples; bars are dodged per system (linear y axis).
# print() is required so the figure is also written when the script is run via
# source(), where top-level expressions are not auto-printed.
pdf("protocols-bytes2.pdf", width = 10, height = 5)

print(
  ggplot(
    df4 %>% filter(tuple > 1000),
    aes(fill = system, y = bytes_mean, x = tuple)
  ) +
    geom_bar(stat = "identity", position = "dodge") +
    scale_x_log10(breaks = tuplebreaks, labels = tuplelabels) +
    theme +
    xlab("Tuples (#, log)") +
    ylab("Bytes (#)") +
    ggtitle("Bytes transferred")
)

dev.off()


# packets
# Line plot of the mean number of packets sent against tuple count (both axes
# log10), one line per system.
# print() is required so the figure is also written when the script is run via
# source(), where top-level expressions are not auto-printed.
pdf("protocols-packets.pdf", width = 10, height = 5)

print(
  ggplot(df4, aes(color = system, y = packets_mean, x = tuple)) +
    geom_line(size = 1.5) +
    geom_point(size = 2) +
    scale_x_log10(breaks = tuplebreaks, labels = tuplelabels) +
    scale_y_log10() +
    theme +
    xlab("Tuples (#, log)") +
    ylab("Packets sent (#, log)") +
    ggtitle("Packets sent")
)

dev.off()


# client memory
# Line plot of the mean of the memory_max_kb column against tuple count (both
# axes log10), one line per system.
# print() is required so the figure is also written when the script is run via
# source(), where top-level expressions are not auto-printed.
pdf("protocols-memory.pdf", width = 10, height = 5)

print(
  ggplot(df4, aes(color = system, y = memory_max_kb_mean, x = tuple)) +
    geom_line(size = 1.5) +
    geom_point(size = 2) +
    scale_x_log10(breaks = tuplebreaks, labels = tuplelabels) +
    scale_y_log10() +
    ylab("Max memory (KB, log)") +
    theme +
    xlab("Tuples (#, log)") +
    ggtitle("Memory Footprint")
)

dev.off()


# protocols

# Summary per (db, protocol, tuple) used by the "Wrapper overhead" plots.
# Excluded: the "mariadb-compress" system, the mongodb / hbase / netcat
# databases, anything not on the "unlimited" network, rows with timeout equal
# to 1, and rows that have a bin_chunksize value.
# Each comma-separated condition must hold; filter() treats them as a logical
# AND and drops rows for which a condition evaluates to NA.
df5 <- dd3 %>%
  filter(
    system != "mariadb-compress",
    network == "unlimited",
    db != "mongodb",
    db != "hbase",
    db != "netcat",
    timeout != 1,
    is.na(bin_chunksize)
  ) %>%
  group_by(db, protocol, tuple) %>%
  summarise_each(funs(mean, sd, se = sd(.) / sqrt(n())))



# Wrapper overhead, wall clock time: mean time against tuple count (both axes
# log10), one line per protocol and one facet per db. The x-axis tick labels
# are blanked by the trailing theme() call.
# print() is required so the figure is also written when the script is run via
# source(), where top-level expressions are not auto-printed.
pdf("protocols-wrapper-time.pdf", width = 10, height = 5)

print(
  ggplot(df5, aes(color = protocol, y = time_mean, x = tuple)) +
    geom_line(size = 1.5) +
    geom_point(size = 2) +
    scale_x_log10() +
    scale_y_log10() +
    ylab("Wall clock time (s, log)") +
    theme +
    xlab("Tuples (#, log)") +
    ggtitle("Wrapper overhead") +
    facet_grid(~ db) +
    theme(axis.text.x = element_blank())
)

dev.off()


# Wrapper overhead, bytes: mean bytes against tuple count (both axes log10),
# one line per protocol and one facet per db. The x-axis tick labels are
# blanked by the trailing theme() call.
# print() is required so the figure is also written when the script is run via
# source(), where top-level expressions are not auto-printed.
pdf("protocols-wrapper-bytes.pdf", width = 10, height = 5)

print(
  ggplot(df5, aes(color = protocol, y = bytes_mean, x = tuple)) +
    geom_line(size = 1.5) +
    geom_point(size = 2) +
    scale_x_log10() +
    scale_y_log10() +
    ylab("Bytes (#, log)") +
    theme +
    xlab("Tuples (#, log)") +
    ggtitle("Wrapper overhead") +
    facet_grid(~ db) +
    theme(axis.text.x = element_blank())
)

dev.off()


# Wrapper overhead, memory: mean of the memory_max_kb column against tuple
# count (both axes log10), one line per protocol and one facet per db. The
# x-axis tick labels are blanked by the trailing theme() call.
# print() is required so the figure is also written when the script is run via
# source(), where top-level expressions are not auto-printed.
pdf("protocols-wrapper-memory.pdf", width = 10, height = 5)

print(
  ggplot(df5, aes(color = protocol, y = memory_max_kb_mean, x = tuple)) +
    geom_line(size = 1.5) +
    geom_point(size = 2) +
    scale_x_log10() +
    scale_y_log10() +
    ylab("Max memory (KB, log)") +
    theme +
    xlab("Tuples (#, log)") +
    ggtitle("Wrapper overhead") +
    facet_grid(~ db) +
    theme(axis.text.x = element_blank())
)

dev.off()

# networks

# Summary per (system, network, tuple) used by the "Network speed impact"
# plots: native protocol only, netcat excluded, rows with timeout equal to 1
# dropped, and restricted to the three listed network profiles.
df6 <- dd3 %>%
  filter(
    protocol == "native",
    db != "netcat",
    timeout != 1,
    network %in% c("unlimited", "gigabitethld", "10mbitethhd")
  ) %>%
  group_by(system, network, tuple) %>%
  summarise_each(funs(mean, sd, se = sd(.) / sqrt(n())))

# Network speed impact, lines: mean wall clock time against tuple count (both
# axes log10), one line per network profile and one facet per system. The
# x-axis tick labels are blanked by the trailing theme() call.
# print() is required so the figure is also written when the script is run via
# source(), where top-level expressions are not auto-printed.
pdf("protocols-network.pdf", width = 10, height = 5)

print(
  ggplot(df6, aes(color = network, y = time_mean, x = tuple)) +
    geom_line(size = 1.5) +
    geom_point(size = 2) +
    scale_x_log10() +
    scale_y_log10() +
    ylab("Wall clock time (s, log)") +
    theme +
    xlab("Tuples (#, log)") +
    ggtitle("Network speed impact") +
    facet_grid(~ system) +
    theme(axis.text.x = element_blank())
)

dev.off()



# Network speed impact, bars: mean wall clock time per tuple count (log10 x
# axis, linear y axis), bars dodged per system and one facet per network
# profile. The x-axis tick labels are blanked by the trailing theme() call.
# print() is required so the figure is also written when the script is run via
# source(), where top-level expressions are not auto-printed.
pdf("protocols-network2.pdf", width = 10, height = 5)

print(
  ggplot(df6, aes(fill = system, y = time_mean, x = tuple)) +
    geom_bar(stat = "identity", position = "dodge") +
    scale_x_log10() +
    ylab("Wall clock time (s)") +
    theme +
    xlab("Tuples (#, log)") +
    ggtitle("Network speed impact") +
    facet_grid(~ network) +
    theme(axis.text.x = element_blank())
)

dev.off()