Files
@ e23e3482a0b7
Branch filter:
Location: DA/protocols/convert.py - annotation
e23e3482a0b7
2.0 KiB
text/x-python
more stuff
c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 | import numpy
import pandas
from datetime import datetime
import time
def datetoint(col):
    """Convert 'YYYY-MM-DD' date strings to integer timestamps.

    Each string is parsed with the format '%Y-%m-%d' and turned into seconds
    since the epoch with time.mktime(), which interprets the parsed date in
    the machine's local time zone.

    col -- iterable of date strings (a list, a pandas Series, a generator).

    Returns a one-dimensional numpy array of dtype int64 holding one value
    per input element, in input order.

    Raises ValueError if a string does not match '%Y-%m-%d'.
    """
    # Parse every distinct date string only once; strptime() is comparatively
    # expensive and repeated inputs always map to the same timestamp.
    parsed = {}
    stamps = []
    for text in col:
        stamp = parsed.get(text)
        if stamp is None:
            stamp = time.mktime(datetime.strptime(text, '%Y-%m-%d').timetuple())
            parsed[text] = stamp
        stamps.append(stamp)
    return numpy.array(stamps, dtype=numpy.int64)
# Names assigned to the columns of 'lineitem-1m.tbl', in file order (the file
# is read without a header row).
LINEITEM_COLUMNS = [
    'l_orderkey', 'l_partkey', 'l_suppkey', 'l_linenumber',
    'l_quantity', 'l_extendedprice', 'l_discount', 'l_tax',
    'l_returnflag', 'l_linestatus',
    'l_shipdate', 'l_commitdate', 'l_receiptdate',
    'l_shipinstruct', 'l_shipmode', 'l_comment',
]

# Load the '|'-separated table into a DataFrame.
cfile = pandas.read_csv(
    'lineitem-1m.tbl',
    sep='|',
    header=None,
    names=LINEITEM_COLUMNS,
)

# Cast the integer columns to fixed-width types; the width chosen here is
# what later determines how many bytes each value occupies on output.
for name, dtype in (
    ('l_orderkey', numpy.int32),
    ('l_partkey', numpy.int32),
    ('l_suppkey', numpy.int16),
    ('l_linenumber', numpy.int8),
    ('l_quantity', numpy.int8),
):
    cfile[name] = cfile[name].astype(dtype)

# Replace the three date columns with the int64 arrays produced by
# datetoint().
for name in ('l_shipdate', 'l_commitdate', 'l_receiptdate'):
    cfile[name] = datetoint(cfile[name])
# Work out how many rows, starting at row 0, fit into one chunk of
# `chunkbytes` bytes.
chunkbytes = 1000000
# Bytes still free in the chunk. 16 bytes are reserved up front, which is the
# size of two int64 header fields.
tmp = chunkbytes - 16
row = 0        # number of rows accepted into the chunk so far
startrow = 0   # index of the first row of the chunk
rowsize = 0    # encoded size in bytes of the row currently being measured

# Fetch each column once instead of once per row.
columns = [cfile[name] for name in cfile.columns]
total_rows = len(cfile)

# Stop at the end of the table as well as when the chunk is full, so a table
# smaller than one chunk does not index past its last row.
while row < total_rows:
    rowsize = 0
    for col in columns:
        # `object` replaces the `numpy.object` alias, which no longer exists
        # in current NumPy releases.
        if col.dtype == object:
            value = col[row]
            if not isinstance(value, bytes):
                # Count bytes as they will be written, not characters.
                value = value.encode('utf-8')
            # String bytes plus one terminating NUL byte.
            rowsize += len(value) + 1
        else:
            rowsize += col.dtype.itemsize
    if tmp < rowsize:
        # The next row no longer fits; `row` is the final row count.
        break
    tmp -= rowsize
    row += 1
# Write rows [startrow, startrow + row) to 'out.col' column by column: two
# int64 header fields, then all values of the first column, then all values
# of the second column, and so on.
with open('out.col', 'wb+') as out:
    # message length
    numpy.array(chunkbytes - tmp, dtype=numpy.int64).tofile(out)
    # row count
    numpy.array(row, dtype=numpy.int64).tofile(out)
    for name in cfile.columns:
        col = cfile[name]
        # `object` replaces the `numpy.object` alias, which no longer exists
        # in current NumPy releases.
        if col.dtype == object:
            # String column: each value is written NUL-terminated.
            for o in range(startrow, startrow + row):
                value = col[o]
                if not isinstance(value, bytes):
                    # The file is opened in binary mode, so text has to be
                    # encoded before it can be written.
                    value = value.encode('utf-8')
                out.write(value + b'\0')
        else:
            # Fixed-width column: dump the raw values of the row range.
            numpy.array(col[startrow:(startrow + row)]).tofile(out)
# Write rows [startrow, startrow + row) to 'out.row' row by row: one int64
# header field, then for every row the values of all columns in column order.
with open('out.row', 'wb+') as out:
    # header
    # NOTE(review): only this single int64 is written here, while the length
    # value was computed with 16 header bytes reserved -- confirm against the
    # reader of this format.
    numpy.array(chunkbytes - tmp, dtype=numpy.int64).tofile(out)
    # Fetch each column once instead of once per row.
    columns = [cfile[name] for name in cfile.columns]
    for r in range(startrow, startrow + row):
        for col in columns:
            # `object` replaces the `numpy.object` alias, which no longer
            # exists in current NumPy releases.
            if col.dtype == object:
                value = col[r]
                if not isinstance(value, bytes):
                    # The file is opened in binary mode, so text has to be
                    # encoded before it can be written.
                    value = value.encode('utf-8')
                # String value followed by a terminating NUL byte.
                out.write(value + b'\0')
            else:
                # Fixed-width value written in the column's own dtype.
                numpy.array(col[r]).tofile(out)
|