Files
@ e23e3482a0b7
Branch filter:
Location: DA/protocols/genprotocol.py - annotation
e23e3482a0b7
2.7 KiB
text/x-python
more stuff
b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 b01cb2bb9139 | import numpy
import pandas
from datetime import datetime
import time
def datetoint(col):
    """Convert 'YYYY-MM-DD' date strings to integer epoch seconds.

    Each string in *col* is parsed with the ``%Y-%m-%d`` format and turned
    into a timestamp with ``time.mktime``, which interprets the date as
    midnight in the machine's local time zone.

    Returns a ``numpy.int64`` array with one entry per element of *col*.
    Raises whatever ``datetime.strptime`` raises for a malformed entry.
    """
    stamps = []
    for text in col:
        parsed = datetime.strptime(text, '%Y-%m-%d')
        stamps.append(time.mktime(parsed.timetuple()))
    # mktime yields floats; the int64 dtype stores them as whole seconds.
    return numpy.array(stamps, dtype=numpy.int64)
# Per-message byte budgets to generate files for.
chunkbyte_choices = [100000, 1000000, 10000000, 100000000]
# Table sizes; each value is substituted into `filename` and the output names.
tuples = [1, 100, 1000, 10000, 100000, 1000000]
# Input path template; %d is replaced by an entry of `tuples`.
filename = '/home/user/lineitem-%d.csv'

# Column names assigned to the header-less, '|'-separated input file.
LINEITEM_COLUMNS = [
    'l_orderkey', 'l_partkey', 'l_suppkey', 'l_linenumber', 'l_quantity',
    'l_extendedprice', 'l_discount', 'l_tax', 'l_returnflag', 'l_linestatus',
    'l_shipdate', 'l_commitdate', 'l_receiptdate', 'l_shipinstruct',
    'l_shipmode', 'l_comment',
]

# Integer columns are narrowed to these widths before being serialized.
INT_COLUMN_TYPES = (
    ('l_orderkey', numpy.int32),
    ('l_partkey', numpy.int32),
    ('l_suppkey', numpy.int16),
    ('l_linenumber', numpy.int8),
    ('l_quantity', numpy.int8),
)

# Date columns are replaced by int64 epoch seconds (see datetoint).
DATE_COLUMNS = ('l_shipdate', 'l_commitdate', 'l_receiptdate')

# Bytes reserved out of every chunk budget: an int64 message length plus an
# int64 row count.
HEADER_BYTES = 16


def _load_lineitem(path):
    """Read one lineitem CSV and convert its columns to their wire types."""
    cfile = pandas.read_csv(path, sep='|', header=None,
                            names=LINEITEM_COLUMNS)
    for name, dtype in INT_COLUMN_TYPES:
        cfile[name] = cfile[name].astype(dtype)
    for name in DATE_COLUMNS:
        cfile[name] = datetoint(cfile[name])
    return cfile


def _as_bytes(value):
    """Return *value* as bytes, encoding text as UTF-8."""
    if isinstance(value, bytes):
        return value
    if isinstance(value, str):
        return value.encode('utf-8')
    raise TypeError('expected a string value, got %r' % (value,))


def _serialize_columns(cfile):
    """Prepare every column of *cfile* for writing.

    Returns ``(columns, row_sizes)``. ``columns`` holds, per DataFrame
    column, either a list of NUL-terminated byte strings (text columns) or a
    numpy array (fixed-width columns). ``row_sizes`` is an int64 array with
    the number of bytes each row occupies on the wire.
    """
    columns = []
    row_sizes = numpy.zeros(len(cfile), dtype=numpy.int64)
    for name in cfile.columns:
        col = cfile[name]
        if pandas.api.types.is_string_dtype(col.dtype):
            # Sizes are taken from the encoded bytes so the message length
            # matches what is actually written.
            encoded = [_as_bytes(value) + b'\0' for value in col]
            row_sizes += numpy.array([len(item) for item in encoded],
                                     dtype=numpy.int64)
            columns.append(encoded)
        else:
            values = numpy.asarray(col)
            row_sizes += values.dtype.itemsize
            columns.append(values)
    return columns, row_sizes


def write_protocols(cfile, out_col, out_row, chunkbytes):
    """Write *cfile* to two binary streams in chunks of at most *chunkbytes*.

    Rows are packed greedily: a chunk takes consecutive rows while their
    combined size fits into ``chunkbytes - HEADER_BYTES``.

    Column protocol (``out_col``), per chunk: int64 message length, int64 row
    count, then each column's values for the chunk back to back. Text values
    are NUL-terminated; numeric values are raw native-endian bytes.

    Row protocol (``out_row``), per chunk: int64 message length, then the
    rows one after another, each row being its column values in order.

    The message length is ``HEADER_BYTES`` plus the payload size.
    NOTE(review): the row protocol reuses that same value although it writes
    no row-count field; kept as in the original, confirm against the reader.

    Both streams must be opened in binary mode. Raises ``ValueError`` if a
    single row does not fit into the chunk budget (the original looped
    forever in that case).
    """
    columns, row_sizes = _serialize_columns(cfile)
    # prefix[i] is the total size of rows [0, i); chunk ends are found by
    # searching it instead of re-adding row sizes one at a time.
    prefix = numpy.concatenate(
        ([0], numpy.cumsum(row_sizes, dtype=numpy.int64)))
    budget = chunkbytes - HEADER_BYTES
    total_rows = len(cfile)

    start = 0
    while start < total_rows:
        print(start)
        # Largest `end` with prefix[end] <= prefix[start] + budget.
        limit = prefix[start] + budget
        end = int(numpy.searchsorted(prefix, limit, side='right')) - 1
        if end <= start:
            raise ValueError(
                'row %d needs %d bytes but a chunk holds only %d'
                % (start, int(row_sizes[start]), budget))

        payload = int(prefix[end] - prefix[start])
        length_field = numpy.array([HEADER_BYTES + payload],
                                   dtype=numpy.int64).tobytes()
        count_field = numpy.array([end - start],
                                  dtype=numpy.int64).tobytes()

        # Column protocol: header, then one contiguous run per column.
        out_col.write(length_field)
        out_col.write(count_field)
        for column in columns:
            if isinstance(column, list):
                out_col.write(b''.join(column[start:end]))
            else:
                out_col.write(column[start:end].tobytes())

        # Row protocol: header, then the rows with their fields interleaved.
        out_row.write(length_field)
        for r in range(start, end):
            out_row.write(b''.join(
                column[r] if isinstance(column, list)
                else column[r:r + 1].tobytes()
                for column in columns))

        start = end


def main():
    """Generate the .col and .row files for every table and chunk size."""
    for ntuples in tuples:
        # Load and convert each table once; it is reused for all chunk sizes.
        cfile = _load_lineitem(filename % ntuples)
        for chunkbytes in chunkbyte_choices:
            col_path = 'lineitem-%dtpl-%dchunksize.col' % (ntuples, chunkbytes)
            row_path = 'lineitem-%dtpl-%dchunksize.row' % (ntuples, chunkbytes)
            with open(col_path, 'wb') as out_col, \
                    open(row_path, 'wb') as out_row:
                write_protocols(cfile, out_col, out_row, chunkbytes)


if __name__ == '__main__':
    main()
|