Changeset - c0e3a977dfe5
[Not reviewed]
0 1 2
Hannes Muehleisen - 9 years ago 2016-08-02 14:42:28
hannes@muehleisen.org
more things
3 files changed with 97 insertions and 7 deletions:
pmjc.jar
bin+new file
0 comments (0 inline, 0 general)
convert.py
Show inline comments
 
new file 100644
 
import numpy
 
import pandas
 
from datetime import datetime
 
import time
 

	
 

	
 
def datetoint(col):
 
	return numpy.array([time.mktime(datetime.strptime(x, '%Y-%m-%d').timetuple()) for x in col], dtype=numpy.int64)
 

	
 

	
 
cfile = pandas.read_csv('lineitem-1m.tbl', sep='|', header=None, names=['l_orderkey','l_partkey','l_suppkey','l_linenumber','l_quantity','l_extendedprice','l_discount','l_tax','l_returnflag','l_linestatus','l_shipdate','l_commitdate','l_receiptdate','l_shipinstruct','l_shipmode','l_comment'])
 

	
 
cfile.l_orderkey = cfile.l_orderkey.astype(numpy.int32)
 
cfile.l_partkey = cfile.l_partkey.astype(numpy.int32)
 
cfile.l_suppkey = cfile.l_suppkey.astype(numpy.int16)
 
cfile.l_linenumber = cfile.l_linenumber.astype(numpy.int8)
 
cfile.l_quantity = cfile.l_quantity.astype(numpy.int8)
 
cfile.l_shipdate = datetoint(cfile.l_shipdate)
 
cfile.l_commitdate = datetoint(cfile.l_commitdate)
 
cfile.l_receiptdate = datetoint(cfile.l_receiptdate)
 

	
 

	
 
chunkbytes=1000000
 

	
 
tmp = chunkbytes
 
tmp -= 16
 
row = 0
 
startrow = 0
 

	
 
rowsize = 0
 

	
 
while True:
 
	rowsize = 0
 
	for i in range(len(cfile.columns)):
 
		col = cfile[cfile.columns[i]]
 
		if col.dtype == numpy.object:
 
			rowsize += len(col[row]) + 1
 
		else:
 
			rowsize += col.dtype.itemsize
 
	if (tmp < rowsize):
 
		break
 
	tmp -= rowsize
 
	row += 1
 

	
 

	
 
out = open('out.col', 'wb+')
 

	
 
# message length
 
numpy.array(chunkbytes - tmp, dtype=numpy.int64).tofile(out)
 
# row count
 
numpy.array(row, dtype=numpy.int64).tofile(out)
 

	
 
for i in range(len(cfile.columns)):
 
	col = cfile[cfile.columns[i]]
 
	if col.dtype == numpy.object:
 
		for o in range(startrow, (startrow + row)):
 
			out.write(col[o])
 
			out.write('\0')
 
	else:
 
		numpy.array(col[startrow:(startrow + row)]).tofile(out)
 

	
 
out.close()
 

	
 

	
 

	
 
out = open('out.row', 'wb+')
 

	
 
# header
 
numpy.array(chunkbytes - tmp, dtype=numpy.int64).tofile(out)
 
for r in range(startrow, startrow+row):
 
	for i in range(len(cfile.columns)):
 
		col = cfile[cfile.columns[i]]
 
		if col.dtype == numpy.object:
 
			out.write(col[r])
 
			out.write('\0')
 
		else:
 
			numpy.array(col[r]).tofile(out)
 

	
 
out.close()
pmjc.jar
Show inline comments
 
new file 100644
 
binary diff not shown
vldb-protocols.py
Show inline comments
 
@@ -4,24 +4,25 @@ import sys
 
import csv
 
import re
 
import json
 
import subprocess
 

	
 
systems = [
 
	{'name':'hive-default', 'db':'hive'}, 
 
	{'name':'netcat-csv-lz4', 'db':'netcat', 'compress': 'lz4'},
 
	{'name':'netcat-csv-lz4-heavy', 'db':'netcat', 'compress': 'lz4-heavy'},
 
	{'name':'netcat-csv-gzip', 'db':'netcat', 'compress': 'gzip'},
 
	{'name':'netcat-csv-xz', 'db':'netcat', 'compress': 'xz'},
 
	{'name':'netcat-csv', 'db':'netcat'},
 
	{'name':'mariadb-compress', 'db':'mariadb', 'compress': True},
 
	{'name':'db2-default', 'db':'db2'}, 
 
	{'name':'oracle-default', 'db':'oracle'}, 
 
	{'name':'postgres-default', 'db':'postgres'}, 
 
	{'name':'mariadb-default', 'db':'mariadb'}, 
 
	{'name':'monetdb-default', 'db':'monetdb'}, 
 
	{'name':'hbase-default', 'db':'hbase'}, 
 
	{'name':'hbase-default', 'db':'hbase'},
 
	{'name':'mongodb-default', 'db':'mongodb'}]
 

	
 
networks = [
 
	{'name':'unlimited', 'throughput': -1, 'latency':-1}]#, 
 
	#{'name':'gigabitethld', 'throughput': 1000, 'latency':0.3},
 
	#{'name':'10mbitethhd', 'throughput': 10, 'latency':150}]
 
@@ -120,34 +121,34 @@ for r in range(nruns):
 
				querycmd = ""
 
				jdbcflags = ''
 
				odbcdriver = ''
 
				odbccmd = None
 
				if system['db'] == 'postgres':
 
					querycmd = 'psql %s --host 127.0.0.1 -w -t -A -c "%s" > /dev/null' % ('--set=sslcompression=1 --set=sslmode=require --set=keepalives=0' if 'compress' in system else '', query)
 
					jdbcflags = '-u jdbc:postgresql://127.0.0.1/user -n user -d org.postgresql.Driver'
 
					jdbcflags = 'org.postgresql.Driver jdbc:postgresql://127.0.0.1/user user user'
 
					odbccmd = 'isql PostgreSQL -d, < query > /dev/null'
 
				elif system['db'] == 'mariadb':
 
					querycmd = 'mysql %s --host=127.0.0.1 user --skip-column-names --batch -e "%s" > /dev/null'  % ('--compress' if 'compress' in system else '', query)
 
					jdbcflags = '-u jdbc:mysql://127.0.0.1/user -n user -d org.mariadb.jdbc.Driver'
 
					jdbcflags = 'org.mariadb.jdbc.Driver jdbc:mysql://127.0.0.1/user user null'
 
					odbccmd = 'isql MySQL -d, < query > /dev/null'
 
				elif system['db'] == 'monetdb':
 
					querycmd = 'mclient -h 127.0.0.1 -p 50001 -fcsv -s "%s" > /dev/null' % query
 
					jdbcflags = '-u jdbc:monetdb://127.0.0.1:50001/database -n monetdb -p monetdb -d nl.cwi.monetdb.jdbc.MonetDriver'
 
					jdbcflags = 'nl.cwi.monetdb.jdbc.MonetDriver jdbc:monetdb://127.0.0.1:50001/database monetdb monetdb'
 
					odbccmd = 'isql MonetDB -d, < query > /dev/null'
 
				elif system['db'] == 'db2':
 
					db2qfile = open("db2query", "w")
 
					db2qfile.write("connect to remotedb user user using user; \n" + query + ";\n")
 
					db2qfile.close()
 
					querycmd = 'db2 -tf db2query > /dev/null;'
 
					jdbcflags = '-u jdbc:db2://127.0.0.1:50000/db -d com.ibm.db2.jcc.DB2Driver -n user -p user'
 
					jdbcflags = 'com.ibm.db2.jcc.DB2Driver jdbc:db2://127.0.0.1:50000/db user user'
 
					os.environ['DB2INSTANCE'] = 'user'
 
					odbccmd = 'isql DB2_SAMPLE -d, user user < query > /dev/null'
 
				elif system['db'] == 'oracle':
 
					os.environ['TNS_ADMIN'] = '/home/user/oracleconfig'
 
					querycmd = 'sqlplus system/oracle@//127.0.0.1:49161/XE @query-oracle.sql %d > /dev/null' % tuple
 
					jdbcflags = '-u jdbc:oracle:thin:@127.0.0.1:49161:XE -d oracle.jdbc.driver.OracleDriver -n system -p oracle'
 
					jdbcflags = 'oracle.jdbc.driver.OracleDriver jdbc:oracle:thin:@127.0.0.1:49161:XE system oracle'
 
					odbccmd = 'isql Oracle -d, < query > /dev/null'
 
					# for JDBC/ODBCV
 
					query = "SELECT * FROM lineitem where rownum < %d;" % tuple
 
				elif system['db'] == 'mongodb':
 
					querycmd = 'mongoexport -d lineitem  -c things --csv --fields "l_orderkey,l_partkey,l_suppkey,l_linenumber,l_quantity,l_extendedprice,l_discount,l_tax,l_returnflag,l_linestatus,l_shipdate,l_commitdate,l_receiptdate,l_shipinstruct,l_shipmode,l_comment" --limit %d > /dev/null 2> /dev/null' % tuple
 
					jdbcflags = None
 
@@ -157,12 +158,17 @@ for r in range(nruns):
 
					os.environ['HBASE_HEAPSIZE'] = '10g'
 
					query = "scan 'lineitem',{LIMIT=>%d}" % tuple
 
					querycmd = 'hbase shell < query > /dev/null 2> /dev/null'
 
					jdbcflags = None
 
					odbccmd = None
 
					odbcdriver = None
 
				elif system['db'] == 'hive':
 
					querycmd = None
 
					jdbcflags = 'org.apache.hive.jdbc.HiveDriver jdbc:hive2://localhost:10000 user null'
 
					odbccmd = None
 
					odbcdriver = None
 
				elif system['db'] == 'netcat':
 
					# open netcat for listening in a separate process
 
					filename = '/home/user/lineitem-%d.csv' % tuple
 
					compress_cmd = ''
 
					if 'compress' in system:
 
						if system['compress'] == 'lz4': 
 
@@ -182,13 +188,18 @@ for r in range(nruns):
 

	
 
				qfile = open("query", "w")
 
				qfile.write(query)
 
				qfile.write("\n")
 
				qfile.close()
 

	
 
				jdbccmd = 'java -Xmx10G -Djava.security.egd=file:/dev/./urandom -Djava.ext.dirs=/home/user/java/ -jar /home/user/java/sqlline.jar --fastConnect=true --outputformat=csv  --isolation=TRANSACTION_SERIALIZABLE --silent=true --showHeader=false %s < query > /dev/null' % jdbcflags
 
				jdbccmd = 'java -Xmx10G -Djava.security.egd=file:/dev/./urandom -cp /home/user/java/pmjc.jar:/home/user/java/db2jcc4.jar:/home/user/java/monetdb-jdbc-2.23.jar:/home/user/java/mariadb-java-client-1.4.6.jar:/home/user/java/ojdbc6_g.jar:/home/user/java/postgresql-9.4.1209.jar:/home/user/java/hive-jdbc-2.1.0-standalone.jar:/home/user/java/hadoop-common-2.6.4.jar %s "%s" 1000' % (jdbcflags, query)
 
				
 
				# special case for hive
 
				if querycmd is None:
 
					querycmd = jdbccmd
 
					jdbccmd = None
 

	
 
				# getting caches hot
 
				benchmark_command(querycmd, system, 'native', network, tuple, r, True)
 

	
 
				# native client
 
				benchmark_command(querycmd, system, 'native', network, tuple, r, False)
0 comments (0 inline, 0 general)