diff --git a/genprotocol.py b/genprotocol.py new file mode 100644 index 0000000000000000000000000000000000000000..868bda532597d669c3437ec596b602b21b8ee08c --- /dev/null +++ b/genprotocol.py @@ -0,0 +1,85 @@ +import numpy +import pandas +from datetime import datetime +import time + + +def datetoint(col): + return numpy.array([time.mktime(datetime.strptime(x, '%Y-%m-%d').timetuple()) for x in col], dtype=numpy.int64) + +chunkbyte_choices = [100000, 1000000, 10000000, 100000000] +chunkbytes = 1000000 + +tuples = [1,100,1000,10000,100000,1000000] +filename = '/home/user/lineitem-%d.csv' + +for chunkbytes in chunkbyte_choices: + for tuple in tuples: + cfile = pandas.read_csv(filename % tuple, sep='|', header=None, names=['l_orderkey','l_partkey','l_suppkey','l_linenumber','l_quantity','l_extendedprice','l_discount','l_tax','l_returnflag','l_linestatus','l_shipdate','l_commitdate','l_receiptdate','l_shipinstruct','l_shipmode','l_comment']) + + cfile.l_orderkey = cfile.l_orderkey.astype(numpy.int32) + cfile.l_partkey = cfile.l_partkey.astype(numpy.int32) + cfile.l_suppkey = cfile.l_suppkey.astype(numpy.int16) + cfile.l_linenumber = cfile.l_linenumber.astype(numpy.int8) + cfile.l_quantity = cfile.l_quantity.astype(numpy.int8) + cfile.l_shipdate = datetoint(cfile.l_shipdate) + cfile.l_commitdate = datetoint(cfile.l_commitdate) + cfile.l_receiptdate = datetoint(cfile.l_receiptdate) + + out_col = open('lineitem-%dtpl-%dchunksize.col' % (tuple, chunkbytes), 'wb+') + out_row = open('lineitem-%dtpl-%dchunksize.row' % (tuple, chunkbytes), 'wb+') + + startrow = 0 + + while startrow < len(cfile): + print(startrow) + tmp = chunkbytes - 16 + row = 0 + rowsize = 0 + # for the current chunk, compute the amount of rows to transmit + while startrow + row < len(cfile): + rowsize = 0 + for i in range(len(cfile.columns)): + col = cfile[cfile.columns[i]] + if col.dtype == numpy.object: + rowsize += len(col[row]) + 1 + else: + rowsize += col.dtype.itemsize + if (tmp < rowsize): + break + tmp -= rowsize + row += 1 + + # write column-protocol + + # message length + numpy.array(chunkbytes - tmp, dtype=numpy.int64).tofile(out_col) + # row count + numpy.array(row, dtype=numpy.int64).tofile(out_col) + + for i in range(len(cfile.columns)): + col = cfile[cfile.columns[i]] + if col.dtype == numpy.object: + for o in range(startrow, (startrow + row)): + out_col.write(col[o]) + out_col.write('\0') + else: + numpy.array(col[startrow:(startrow + row)]).tofile(out_col) + + # write row-protocol + + # message length + numpy.array(chunkbytes - tmp, dtype=numpy.int64).tofile(out_row) + for r in range(startrow, startrow+row): + for i in range(len(cfile.columns)): + col = cfile[cfile.columns[i]] + if col.dtype == numpy.object: + out_row.write(col[r]) + out_row.write('\0') + else: + numpy.array(col[r]).tofile(out_row) + + startrow += row + + out_col.close() + out_row.close()