"""Convert pipe-separated ``lineitem`` CSV files into chunked binary dumps.

For every combination of chunk size and tuple count, two files are written
to the current working directory:

* ``lineitem-<tuples>tpl-<chunkbytes>chunksize.col`` -- per chunk an int64
  message length, an int64 row count, then each column's values for the
  chunk stored back to back (column-major).
* ``lineitem-<tuples>tpl-<chunkbytes>chunksize.row`` -- per chunk an int64
  message length, then the rows of the chunk one after another (row-major).

Numeric cells are written as their raw native-endian bytes; text cells are
written UTF-8 encoded and NUL-terminated.
"""
import time
from datetime import datetime

import numpy
import pandas

# Bytes subtracted from every chunk budget for the chunk header (the column
# protocol writes an int64 message length plus an int64 row count).
HEADER_BYTES = 16

COLUMN_NAMES = [
    'l_orderkey', 'l_partkey', 'l_suppkey', 'l_linenumber', 'l_quantity',
    'l_extendedprice', 'l_discount', 'l_tax', 'l_returnflag', 'l_linestatus',
    'l_shipdate', 'l_commitdate', 'l_receiptdate', 'l_shipinstruct',
    'l_shipmode', 'l_comment',
]

chunkbyte_choices = [100000, 1000000, 10000000, 100000000]
tuples = [1, 100, 1000, 10000, 100000, 1000000]
filename = '/home/user/lineitem-%d.csv'


def datetoint(col):
    """Convert an iterable of ``'YYYY-MM-DD'`` strings to int64 timestamps.

    Each date is interpreted by ``time.mktime`` (i.e. as local time) and the
    resulting seconds-since-epoch value is truncated to an integer.

    Returns:
        A ``numpy.int64`` array with one entry per input string.

    Raises:
        ValueError: if an entry does not match ``'%Y-%m-%d'``.
    """
    return numpy.array(
        [time.mktime(datetime.strptime(x, '%Y-%m-%d').timetuple())
         for x in col],
        dtype=numpy.int64)


def load_lineitem(path):
    """Read one lineitem CSV and narrow its columns to the output dtypes.

    Key columns become int32/int16, line number and quantity become int8 and
    the three date columns become int64 timestamps (see ``datetoint``).
    All other columns keep whatever dtype ``pandas.read_csv`` inferred.
    """
    frame = pandas.read_csv(path, sep='|', header=None, names=COLUMN_NAMES)
    frame.l_orderkey = frame.l_orderkey.astype(numpy.int32)
    frame.l_partkey = frame.l_partkey.astype(numpy.int32)
    frame.l_suppkey = frame.l_suppkey.astype(numpy.int16)
    frame.l_linenumber = frame.l_linenumber.astype(numpy.int8)
    frame.l_quantity = frame.l_quantity.astype(numpy.int8)
    frame.l_shipdate = datetoint(frame.l_shipdate)
    frame.l_commitdate = datetoint(frame.l_commitdate)
    frame.l_receiptdate = datetoint(frame.l_receiptdate)
    return frame


def _prepare_columns(frame):
    """Return ``[(is_text, values), ...]`` with one entry per column.

    Text columns are encoded once into a list of NUL-terminated ``bytes`` so
    that sizing and writing agree on the exact byte count; numeric columns
    are kept as NumPy arrays.
    """
    prepared = []
    for name in frame.columns:
        values = frame[name].to_numpy()
        if values.dtype.kind == 'O':
            encoded = [text.encode('utf-8') + b'\0' for text in values]
            prepared.append((True, encoded))
        else:
            prepared.append((False, values))
    return prepared


def _row_sizes(columns, row_count):
    """Return the serialized size in bytes of every row as a list of ints."""
    sizes = numpy.zeros(row_count, dtype=numpy.int64)
    for is_text, values in columns:
        if is_text:
            sizes += numpy.fromiter(
                (len(cell) for cell in values),
                dtype=numpy.int64, count=row_count)
        else:
            sizes += values.dtype.itemsize
    return sizes.tolist()


def _rows_that_fit(sizes, start, budget):
    """Greedily take rows from ``start`` while they fit into ``budget`` bytes.

    Returns:
        ``(row_count, payload_bytes)`` for the chunk beginning at ``start``.
    """
    used = 0
    stop = start
    total = len(sizes)
    while stop < total and used + sizes[stop] <= budget:
        used += sizes[stop]
        stop += 1
    return stop - start, used


def _column_chunk(columns, start, stop):
    """Serialize rows ``start:stop`` column by column."""
    parts = []
    for is_text, values in columns:
        if is_text:
            parts.extend(values[start:stop])
        else:
            parts.append(values[start:stop].tobytes())
    return b''.join(parts)


def _row_chunk(columns, start, stop):
    """Serialize rows ``start:stop`` row by row."""
    cells = []
    for is_text, values in columns:
        if is_text:
            cells.append(values[start:stop])
        else:
            # Split the raw column bytes into one fixed-width cell per row.
            raw = values[start:stop].tobytes()
            width = values.dtype.itemsize
            cells.append(
                [raw[pos:pos + width] for pos in range(0, len(raw), width)])
    return b''.join(b''.join(row) for row in zip(*cells))


def write_protocol_files(frame, chunkbytes, col_path, row_path):
    """Write ``frame`` as chunked column- and row-protocol files.

    Args:
        frame: DataFrame whose columns are either numeric or text.
        chunkbytes: maximum size of a chunk, including ``HEADER_BYTES``.
        col_path: destination of the column-major dump.
        row_path: destination of the row-major dump.

    Raises:
        ValueError: if a single row does not fit into one chunk (previously
            this case looped forever without making progress).
    """
    columns = _prepare_columns(frame)
    total_rows = len(frame)
    sizes = _row_sizes(columns, total_rows)
    budget = chunkbytes - HEADER_BYTES

    with open(col_path, 'wb') as out_col, open(row_path, 'wb') as out_row:
        start = 0
        while start < total_rows:
            print(start)
            count, used = _rows_that_fit(sizes, start, budget)
            if count == 0:
                raise ValueError(
                    'row %d needs %d bytes but a chunk of %d bytes has room '
                    'for only %d' % (start, sizes[start], chunkbytes, budget))
            stop = start + count
            message_length = HEADER_BYTES + used

            # Column protocol: message length, row count, column payloads.
            out_col.write(numpy.array(
                [message_length, count], dtype=numpy.int64).tobytes())
            out_col.write(_column_chunk(columns, start, stop))

            # Row protocol: message length, row payloads.
            # NOTE(review): the length still counts HEADER_BYTES (16) even
            # though only 8 header bytes are written here; kept unchanged
            # for compatibility with existing readers -- confirm intent.
            out_row.write(numpy.array(
                [message_length], dtype=numpy.int64).tobytes())
            out_row.write(_row_chunk(columns, start, stop))

            start = stop


def main():
    """Convert every configured input file for every configured chunk size."""
    for chunk_size in chunkbyte_choices:
        for tuple_count in tuples:
            frame = load_lineitem(filename % tuple_count)
            write_protocol_files(
                frame,
                chunk_size,
                'lineitem-%dtpl-%dchunksize.col' % (tuple_count, chunk_size),
                'lineitem-%dtpl-%dchunksize.row' % (tuple_count, chunk_size))


if __name__ == '__main__':
    main()