Files @ 3d604507d726
Branch filter:

Location: DA/protocols/genprotocol.py

Mark Raasveldt
Add pmodbc.c and add new protocol tests to vldb-protocols.py.
import numpy
import pandas
from datetime import datetime
import time


def datetoint(col):
	"""Convert 'YYYY-MM-DD' date strings into integer epoch timestamps.

	Every entry of ``col`` is parsed with ``datetime.strptime`` and turned
	into seconds since the epoch by ``time.mktime``, which interprets the
	parsed date as local time. The result is a ``numpy.int64`` array holding
	one timestamp per input entry, in input order.
	"""
	stamps = []
	for text in col:
		parsed = datetime.strptime(text, '%Y-%m-%d')
		stamps.append(time.mktime(parsed.timetuple()))
	return numpy.array(stamps, dtype=numpy.int64)

# Chunk sizes (maximum bytes per protocol message) to generate files for.
chunkbyte_choices = [100000, 1000000, 10000000, 100000000]
chunkbytes = 1000000

# Row counts of the lineitem CSV files to convert; substituted into `filename`.
tuples = [1,100,1000,10000,100000,1000000]
filename = '/home/user/lineitem-%d.csv'

# Column names of the header-less, '|'-separated lineitem CSV files.
_LINEITEM_COLUMNS = ['l_orderkey','l_partkey','l_suppkey','l_linenumber','l_quantity','l_extendedprice','l_discount','l_tax','l_returnflag','l_linestatus','l_shipdate','l_commitdate','l_receiptdate','l_shipinstruct','l_shipmode','l_comment']

# Bytes reserved per message for the int64 length and the int64 row count.
# NOTE(review): the row protocol only writes the 8-byte length, yet its length
# field has always counted 16 header bytes; kept as-is -- confirm with readers.
_HEADER_BYTES = 16


def _to_bytes(value):
	"""Return a string cell as the bytes that are written to the output files."""
	if isinstance(value, bytes):
		return value
	return value.encode('utf-8')


def _load_lineitem(path):
	"""Read one lineitem CSV and narrow its columns to the protocol types."""
	cfile = pandas.read_csv(path, sep='|', header=None, names=_LINEITEM_COLUMNS)

	cfile.l_orderkey = cfile.l_orderkey.astype(numpy.int32)
	cfile.l_partkey = cfile.l_partkey.astype(numpy.int32)
	cfile.l_suppkey = cfile.l_suppkey.astype(numpy.int16)
	cfile.l_linenumber = cfile.l_linenumber.astype(numpy.int8)
	cfile.l_quantity = cfile.l_quantity.astype(numpy.int8)
	cfile.l_shipdate = datetoint(cfile.l_shipdate)
	cfile.l_commitdate = datetoint(cfile.l_commitdate)
	cfile.l_receiptdate = datetoint(cfile.l_receiptdate)
	return cfile


def _split_columns(cfile):
	"""Return one ``(is_string, data)`` pair per column, in column order.

	String columns become a list of null-terminated ``bytes`` (one per row);
	all other columns become a standalone numpy array of their dtype.
	"""
	columns = []
	for name in cfile.columns:
		series = cfile[name]
		# kind 'O' covers numpy object columns (the removed `numpy.object`
		# alias) as well as pandas' dedicated string dtypes.
		if series.dtype.kind == 'O':
			columns.append((True, [_to_bytes(value) + b'\0' for value in series]))
		else:
			columns.append((False, numpy.array(series.values)))
	return columns


def _row_sizes(columns, nrows):
	"""Return the serialized size in bytes of every row as a list of ints."""
	sizes = numpy.zeros(nrows, dtype=numpy.int64)
	for is_string, data in columns:
		if is_string:
			# the stored bytes already include the trailing null terminator
			sizes += numpy.array([len(value) for value in data], dtype=numpy.int64)
		else:
			sizes += data.dtype.itemsize
	return sizes.tolist()


def _rows_in_chunk(sizes, startrow, budget):
	"""Return ``(row_count, bytes_used)`` for the chunk starting at ``startrow``.

	Rows are taken greedily, in order, for as long as the next row still fits
	in the remaining ``budget`` bytes.
	"""
	used = 0
	count = 0
	while startrow + count < len(sizes):
		rowsize = sizes[startrow + count]
		if budget - used < rowsize:
			break
		used += rowsize
		count += 1
	return count, used


for chunkbytes in chunkbyte_choices:
	for ntuples in tuples:
		cfile = _load_lineitem(filename % ntuples)
		columns = _split_columns(cfile)
		nrows = len(cfile)
		sizes = _row_sizes(columns, nrows)

		col_path = 'lineitem-%dtpl-%dchunksize.col' % (ntuples, chunkbytes)
		row_path = 'lineitem-%dtpl-%dchunksize.row' % (ntuples, chunkbytes)

		# `with` guarantees both files are closed even if a chunk fails
		with open(col_path, 'wb+') as out_col, open(row_path, 'wb+') as out_row:
			startrow = 0

			while startrow < nrows:
				print(startrow)
				# for the current chunk, compute the amount of rows to transmit
				row, used = _rows_in_chunk(sizes, startrow, chunkbytes - _HEADER_BYTES)
				if row == 0:
					# without this guard the loop would never advance
					raise ValueError('row %d (%d bytes) does not fit in a %d byte chunk' % (startrow, sizes[startrow], chunkbytes))
				endrow = startrow + row
				message_length = numpy.array(_HEADER_BYTES + used, dtype=numpy.int64)

				# write column-protocol: message length, row count, then each
				# column's values for the rows of this chunk
				message_length.tofile(out_col)
				numpy.array(row, dtype=numpy.int64).tofile(out_col)
				for is_string, data in columns:
					if is_string:
						out_col.write(b''.join(data[startrow:endrow]))
					else:
						data[startrow:endrow].tofile(out_col)

				# write row-protocol: message length, then the rows one by one
				message_length.tofile(out_row)
				for r in range(startrow, endrow):
					for is_string, data in columns:
						if is_string:
							out_row.write(data[r])
						else:
							data[r:r + 1].tofile(out_row)

				startrow = endrow