Changeset - c0e3a977dfe5
[Not reviewed]
0 1 2
Hannes Muehleisen - 9 years ago 2016-08-02 14:42:28
hannes@muehleisen.org
more things
3 files changed with 96 insertions and 6 deletions:
pmjc.jar
bin+new file
0 comments (0 inline, 0 general)
convert.py
Show inline comments
 
new file 100644
 
import numpy
 
import pandas
 
from datetime import datetime
 
import time
 

	
 

	
 
def datetoint(col):
 
	return numpy.array([time.mktime(datetime.strptime(x, '%Y-%m-%d').timetuple()) for x in col], dtype=numpy.int64)
 

	
 

	
 
cfile = pandas.read_csv('lineitem-1m.tbl', sep='|', header=None, names=['l_orderkey','l_partkey','l_suppkey','l_linenumber','l_quantity','l_extendedprice','l_discount','l_tax','l_returnflag','l_linestatus','l_shipdate','l_commitdate','l_receiptdate','l_shipinstruct','l_shipmode','l_comment'])
 

	
 
cfile.l_orderkey = cfile.l_orderkey.astype(numpy.int32)
 
cfile.l_partkey = cfile.l_partkey.astype(numpy.int32)
 
cfile.l_suppkey = cfile.l_suppkey.astype(numpy.int16)
 
cfile.l_linenumber = cfile.l_linenumber.astype(numpy.int8)
 
cfile.l_quantity = cfile.l_quantity.astype(numpy.int8)
 
cfile.l_shipdate = datetoint(cfile.l_shipdate)
 
cfile.l_commitdate = datetoint(cfile.l_commitdate)
 
cfile.l_receiptdate = datetoint(cfile.l_receiptdate)
 

	
 

	
 
chunkbytes=1000000
 

	
 
tmp = chunkbytes
 
tmp -= 16
 
row = 0
 
startrow = 0
 

	
 
rowsize = 0
 

	
 
while True:
 
	rowsize = 0
 
	for i in range(len(cfile.columns)):
 
		col = cfile[cfile.columns[i]]
 
		if col.dtype == numpy.object:
 
			rowsize += len(col[row]) + 1
 
		else:
 
			rowsize += col.dtype.itemsize
 
	if (tmp < rowsize):
 
		break
 
	tmp -= rowsize
 
	row += 1
 

	
 

	
 
out = open('out.col', 'wb+')
 

	
 
# message length
 
numpy.array(chunkbytes - tmp, dtype=numpy.int64).tofile(out)
 
# row count
 
numpy.array(row, dtype=numpy.int64).tofile(out)
 

	
 
for i in range(len(cfile.columns)):
 
	col = cfile[cfile.columns[i]]
 
	if col.dtype == numpy.object:
 
		for o in range(startrow, (startrow + row)):
 
			out.write(col[o])
 
			out.write('\0')
 
	else:
 
		numpy.array(col[startrow:(startrow + row)]).tofile(out)
 

	
 
out.close()
 

	
 

	
 

	
 
out = open('out.row', 'wb+')
 

	
 
# header
 
numpy.array(chunkbytes - tmp, dtype=numpy.int64).tofile(out)
 
for r in range(startrow, startrow+row):
 
	for i in range(len(cfile.columns)):
 
		col = cfile[cfile.columns[i]]
 
		if col.dtype == numpy.object:
 
			out.write(col[r])
 
			out.write('\0')
 
		else:
 
			numpy.array(col[r]).tofile(out)
 

	
 
out.close()
pmjc.jar
Show inline comments
 
new file 100644
 
binary diff not shown
vldb-protocols.py
Show inline comments
 
@@ -7,6 +7,7 @@ import json
 
import subprocess
 

	
 
systems = [
 
	{'name':'hive-default', 'db':'hive'}, 
 
	{'name':'netcat-csv-lz4', 'db':'netcat', 'compress': 'lz4'},
 
	{'name':'netcat-csv-lz4-heavy', 'db':'netcat', 'compress': 'lz4-heavy'},
 
	{'name':'netcat-csv-gzip', 'db':'netcat', 'compress': 'gzip'},
 
@@ -123,28 +124,28 @@ for r in range(nruns):
 
				odbccmd = None
 
				if system['db'] == 'postgres':
 
					querycmd = 'psql %s --host 127.0.0.1 -w -t -A -c "%s" > /dev/null' % ('--set=sslcompression=1 --set=sslmode=require --set=keepalives=0' if 'compress' in system else '', query)
 
					jdbcflags = '-u jdbc:postgresql://127.0.0.1/user -n user -d org.postgresql.Driver'
 
					jdbcflags = 'org.postgresql.Driver jdbc:postgresql://127.0.0.1/user user user'
 
					odbccmd = 'isql PostgreSQL -d, < query > /dev/null'
 
				elif system['db'] == 'mariadb':
 
					querycmd = 'mysql %s --host=127.0.0.1 user --skip-column-names --batch -e "%s" > /dev/null'  % ('--compress' if 'compress' in system else '', query)
 
					jdbcflags = '-u jdbc:mysql://127.0.0.1/user -n user -d org.mariadb.jdbc.Driver'
 
					jdbcflags = 'org.mariadb.jdbc.Driver jdbc:mysql://127.0.0.1/user user null'
 
					odbccmd = 'isql MySQL -d, < query > /dev/null'
 
				elif system['db'] == 'monetdb':
 
					querycmd = 'mclient -h 127.0.0.1 -p 50001 -fcsv -s "%s" > /dev/null' % query
 
					jdbcflags = '-u jdbc:monetdb://127.0.0.1:50001/database -n monetdb -p monetdb -d nl.cwi.monetdb.jdbc.MonetDriver'
 
					jdbcflags = 'nl.cwi.monetdb.jdbc.MonetDriver jdbc:monetdb://127.0.0.1:50001/database monetdb monetdb'
 
					odbccmd = 'isql MonetDB -d, < query > /dev/null'
 
				elif system['db'] == 'db2':
 
					db2qfile = open("db2query", "w")
 
					db2qfile.write("connect to remotedb user user using user; \n" + query + ";\n")
 
					db2qfile.close()
 
					querycmd = 'db2 -tf db2query > /dev/null;'
 
					jdbcflags = '-u jdbc:db2://127.0.0.1:50000/db -d com.ibm.db2.jcc.DB2Driver -n user -p user'
 
					jdbcflags = 'com.ibm.db2.jcc.DB2Driver jdbc:db2://127.0.0.1:50000/db user user'
 
					os.environ['DB2INSTANCE'] = 'user'
 
					odbccmd = 'isql DB2_SAMPLE -d, user user < query > /dev/null'
 
				elif system['db'] == 'oracle':
 
					os.environ['TNS_ADMIN'] = '/home/user/oracleconfig'
 
					querycmd = 'sqlplus system/oracle@//127.0.0.1:49161/XE @query-oracle.sql %d > /dev/null' % tuple
 
					jdbcflags = '-u jdbc:oracle:thin:@127.0.0.1:49161:XE -d oracle.jdbc.driver.OracleDriver -n system -p oracle'
 
					jdbcflags = 'oracle.jdbc.driver.OracleDriver jdbc:oracle:thin:@127.0.0.1:49161:XE system oracle'
 
					odbccmd = 'isql Oracle -d, < query > /dev/null'
 
					# for JDBC/ODBCV
 
					query = "SELECT * FROM lineitem where rownum < %d;" % tuple
 
@@ -160,6 +161,11 @@ for r in range(nruns):
 
					jdbcflags = None
 
					odbccmd = None
 
					odbcdriver = None
 
				elif system['db'] == 'hive':
 
					querycmd = None
 
					jdbcflags = 'org.apache.hive.jdbc.HiveDriver jdbc:hive2://localhost:10000 user null'
 
					odbccmd = None
 
					odbcdriver = None
 
				elif system['db'] == 'netcat':
 
					# open netcat for listening in a separate process
 
					filename = '/home/user/lineitem-%d.csv' % tuple
 
@@ -185,7 +191,12 @@ for r in range(nruns):
 
				qfile.write("\n")
 
				qfile.close()
 

	
 
				jdbccmd = 'java -Xmx10G -Djava.security.egd=file:/dev/./urandom -Djava.ext.dirs=/home/user/java/ -jar /home/user/java/sqlline.jar --fastConnect=true --outputformat=csv  --isolation=TRANSACTION_SERIALIZABLE --silent=true --showHeader=false %s < query > /dev/null' % jdbcflags
 
				jdbccmd = 'java -Xmx10G -Djava.security.egd=file:/dev/./urandom -cp /home/user/java/pmjc.jar:/home/user/java/db2jcc4.jar:/home/user/java/monetdb-jdbc-2.23.jar:/home/user/java/mariadb-java-client-1.4.6.jar:/home/user/java/ojdbc6_g.jar:/home/user/java/postgresql-9.4.1209.jar:/home/user/java/hive-jdbc-2.1.0-standalone.jar:/home/user/java/hadoop-common-2.6.4.jar %s "%s" 1000' % (jdbcflags, query)
 
				
 
				# special case for hive
 
				if querycmd is None:
 
					querycmd = jdbccmd
 
					jdbccmd = None
 

	
 
				# getting caches hot
 
				benchmark_command(querycmd, system, 'native', network, tuple, r, True)
0 comments (0 inline, 0 general)