"""Serialize the first chunk of a '|'-separated lineitem table to binary files.

Reads ``lineitem-1m.tbl``, determines how many leading rows fit into a
fixed byte budget, and writes those rows twice: column-major to ``out.col``
and row-major to ``out.row``.  Numeric values are written in their native
binary representation; text values are written as NUL-terminated bytes.
"""
import time
from datetime import datetime

import numpy
import pandas

LINEITEM_COLUMNS = [
    'l_orderkey', 'l_partkey', 'l_suppkey', 'l_linenumber', 'l_quantity',
    'l_extendedprice', 'l_discount', 'l_tax', 'l_returnflag',
    'l_linestatus', 'l_shipdate', 'l_commitdate', 'l_receiptdate',
    'l_shipinstruct', 'l_shipmode', 'l_comment',
]

# Total size budget of one chunk, header included.
CHUNK_BYTES = 1000000
# Bytes reserved for the header: two int64 fields (message length, row count).
HEADER_BYTES = 16


def datetoint(col):
    """Convert an iterable of 'YYYY-MM-DD' strings to an int64 array.

    Each value is the result of ``time.mktime`` on the parsed date, i.e.
    seconds since the epoch with the date interpreted in local time.
    """
    return numpy.array(
        [time.mktime(datetime.strptime(x, '%Y-%m-%d').timetuple())
         for x in col],
        dtype=numpy.int64)


def is_text_column(col):
    """Return True when *col* holds text rather than fixed-width numbers.

    Any non-numeric dtype counts as text.  This replaces the original
    ``col.dtype == numpy.object`` check: ``numpy.object`` no longer exists
    in current NumPy, and pandas may store strings in a dedicated string
    dtype instead of ``object``.
    """
    return not pandas.api.types.is_numeric_dtype(col.dtype)


def encode_text(value):
    """Return *value* as bytes (UTF-8 for ``str``; ``bytes`` pass through)."""
    if isinstance(value, bytes):
        return value
    return value.encode('utf-8')


def rows_that_fit(frame, budget, startrow=0):
    """Count how many consecutive rows of *frame* fit into *budget* bytes.

    A row costs the sum of the item sizes of its numeric columns plus, for
    every text column, the encoded length of the value and one terminating
    NUL byte.

    Returns a tuple ``(nrows, used_bytes)``.  Counting stops at the first
    row that does not fit or at the end of the table, whichever comes
    first (the original loop ran past the end of small tables).
    """
    fixed_size = 0
    text_columns = []
    for _, col in frame.items():
        if is_text_column(col):
            text_columns.append(col)
        else:
            # Numeric widths are identical for every row; sum them once.
            fixed_size += col.dtype.itemsize

    nrows = 0
    used = 0
    for r in range(startrow, len(frame)):
        rowsize = fixed_size + sum(
            len(encode_text(col.iloc[r])) + 1 for col in text_columns)
        if used + rowsize > budget:
            break
        used += rowsize
        nrows += 1
    return nrows, used


def encode_columnar(frame, startrow, nrows, length):
    """Encode rows ``[startrow, startrow + nrows)`` column by column.

    Layout: int64 *length*, int64 *nrows*, then each column in order --
    numeric columns as one contiguous native array, text columns as a
    sequence of NUL-terminated values.
    """
    parts = [
        numpy.array(length, dtype=numpy.int64).tobytes(),
        numpy.array(nrows, dtype=numpy.int64).tobytes(),
    ]
    stop = startrow + nrows
    for _, col in frame.items():
        chunk = col.iloc[startrow:stop]
        if is_text_column(col):
            parts.extend(encode_text(value) + b'\0' for value in chunk)
        else:
            parts.append(chunk.to_numpy().tobytes())
    return b''.join(parts)


def encode_rowwise(frame, startrow, nrows, length):
    """Encode rows ``[startrow, startrow + nrows)`` row by row.

    Layout: int64 *length*, then for every row each field in column order
    -- numeric fields in native binary form, text fields NUL-terminated.
    Unlike the columnar layout, no row count is stored in the header.
    """
    parts = [numpy.array(length, dtype=numpy.int64).tobytes()]
    stop = startrow + nrows
    # Slice each column once so the per-row loop only indexes plain arrays.
    columns = [
        (is_text_column(col), col.iloc[startrow:stop].to_numpy())
        for _, col in frame.items()
    ]
    for r in range(nrows):
        for is_text, values in columns:
            if is_text:
                parts.append(encode_text(values[r]))
                parts.append(b'\0')
            else:
                parts.append(values[r].tobytes())
    return b''.join(parts)


def main():
    """Load the table, narrow its dtypes and write ``out.col`` / ``out.row``."""
    # NOTE(review): assumes each line has exactly 16 fields (no trailing
    # '|'); with an extra field pandas would treat the first one as index.
    cfile = pandas.read_csv('lineitem-1m.tbl', sep='|', header=None,
                            names=LINEITEM_COLUMNS)

    cfile.l_orderkey = cfile.l_orderkey.astype(numpy.int32)
    cfile.l_partkey = cfile.l_partkey.astype(numpy.int32)
    cfile.l_suppkey = cfile.l_suppkey.astype(numpy.int16)
    cfile.l_linenumber = cfile.l_linenumber.astype(numpy.int8)
    cfile.l_quantity = cfile.l_quantity.astype(numpy.int8)
    cfile.l_shipdate = datetoint(cfile.l_shipdate)
    cfile.l_commitdate = datetoint(cfile.l_commitdate)
    cfile.l_receiptdate = datetoint(cfile.l_receiptdate)

    startrow = 0
    nrows, used = rows_that_fit(cfile, CHUNK_BYTES - HEADER_BYTES, startrow)
    # The stored message length covers the payload plus the 16 header bytes.
    length = HEADER_BYTES + used

    with open('out.col', 'wb') as out:
        out.write(encode_columnar(cfile, startrow, nrows, length))

    with open('out.row', 'wb') as out:
        out.write(encode_rowwise(cfile, startrow, nrows, length))


if __name__ == '__main__':
    main()