Files
@ c0e3a977dfe5
Branch filter:
Location: DA/protocols/convert.py - annotation
c0e3a977dfe5
2.0 KiB
text/x-python
more things
c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 | import numpy
import pandas
from datetime import datetime
import time
def datetoint(col):
return numpy.array([time.mktime(datetime.strptime(x, '%Y-%m-%d').timetuple()) for x in col], dtype=numpy.int64)
cfile = pandas.read_csv('lineitem-1m.tbl', sep='|', header=None, names=['l_orderkey','l_partkey','l_suppkey','l_linenumber','l_quantity','l_extendedprice','l_discount','l_tax','l_returnflag','l_linestatus','l_shipdate','l_commitdate','l_receiptdate','l_shipinstruct','l_shipmode','l_comment'])
cfile.l_orderkey = cfile.l_orderkey.astype(numpy.int32)
cfile.l_partkey = cfile.l_partkey.astype(numpy.int32)
cfile.l_suppkey = cfile.l_suppkey.astype(numpy.int16)
cfile.l_linenumber = cfile.l_linenumber.astype(numpy.int8)
cfile.l_quantity = cfile.l_quantity.astype(numpy.int8)
cfile.l_shipdate = datetoint(cfile.l_shipdate)
cfile.l_commitdate = datetoint(cfile.l_commitdate)
cfile.l_receiptdate = datetoint(cfile.l_receiptdate)
chunkbytes=1000000
tmp = chunkbytes
tmp -= 16
row = 0
startrow = 0
rowsize = 0
while True:
rowsize = 0
for i in range(len(cfile.columns)):
col = cfile[cfile.columns[i]]
if col.dtype == numpy.object:
rowsize += len(col[row]) + 1
else:
rowsize += col.dtype.itemsize
if (tmp < rowsize):
break
tmp -= rowsize
row += 1
out = open('out.col', 'wb+')
# message length
numpy.array(chunkbytes - tmp, dtype=numpy.int64).tofile(out)
# row count
numpy.array(row, dtype=numpy.int64).tofile(out)
for i in range(len(cfile.columns)):
col = cfile[cfile.columns[i]]
if col.dtype == numpy.object:
for o in range(startrow, (startrow + row)):
out.write(col[o])
out.write('\0')
else:
numpy.array(col[startrow:(startrow + row)]).tofile(out)
out.close()
out = open('out.row', 'wb+')
# header
numpy.array(chunkbytes - tmp, dtype=numpy.int64).tofile(out)
for r in range(startrow, startrow+row):
for i in range(len(cfile.columns)):
col = cfile[cfile.columns[i]]
if col.dtype == numpy.object:
out.write(col[r])
out.write('\0')
else:
numpy.array(col[r]).tofile(out)
out.close()
|