Files
@ 232ee0cc752e
Branch filter:
Location: DA/protocols/convert.py - annotation
232ee0cc752e
2.0 KiB
text/x-python
snappy it is?
c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 | import numpy
import pandas
from datetime import datetime
import time
def datetoint(col):
    """Convert an iterable of 'YYYY-MM-DD' strings to integer timestamps.

    Each entry is parsed with the ``%Y-%m-%d`` format and converted with
    ``time.mktime``, which interprets the date as midnight in the local
    timezone of the machine running the script.

    Returns a ``numpy.int64`` array with one epoch-seconds value per entry.
    """
    stamps = []
    for text in col:
        parsed = datetime.strptime(text, '%Y-%m-%d')
        stamps.append(time.mktime(parsed.timetuple()))
    return numpy.array(stamps, dtype=numpy.int64)
# Column names for the header-less, pipe-separated lineitem table.
lineitem_columns = [
    'l_orderkey', 'l_partkey', 'l_suppkey', 'l_linenumber',
    'l_quantity', 'l_extendedprice', 'l_discount', 'l_tax',
    'l_returnflag', 'l_linestatus',
    'l_shipdate', 'l_commitdate', 'l_receiptdate',
    'l_shipinstruct', 'l_shipmode', 'l_comment',
]
cfile = pandas.read_csv('lineitem-1m.tbl', sep='|', header=None,
                        names=lineitem_columns)

# Narrow the integer columns to fixed-width types so that each value takes
# a known number of bytes when dumped in binary form.
for column_name, narrow_type in (
        ('l_orderkey', numpy.int32),
        ('l_partkey', numpy.int32),
        ('l_suppkey', numpy.int16),
        ('l_linenumber', numpy.int8),
        ('l_quantity', numpy.int8),
):
    cfile[column_name] = cfile[column_name].astype(narrow_type)

# Replace the textual date columns with int64 timestamps.
for column_name in ('l_shipdate', 'l_commitdate', 'l_receiptdate'):
    cfile[column_name] = datetoint(cfile[column_name])
def _as_bytes(value):
    """Return ``value`` as bytes, UTF-8 encoding it when it is text."""
    return value if isinstance(value, bytes) else value.encode('utf-8')


# Byte budget for one output chunk ("message").
chunkbytes = 1000000
# Remaining budget; 16 bytes are reserved up front for the two int64 header
# fields (message length and row count) written to the columnar file.
tmp = chunkbytes - 16
row = 0
startrow = 0
rowsize = 0

# Resolve the columns once instead of on every row.  ``numpy.object`` was
# only an alias for the builtin ``object`` and has been removed from NumPy,
# so the dtype is compared against ``object`` directly.
columns = [cfile[name] for name in cfile.columns]
is_text = [col.dtype == object for col in columns]

# Count how many leading rows fit into the budget.  The loop is bounded by
# the table length so a table smaller than one chunk no longer indexes past
# its last row.
total_rows = len(cfile)
while row < total_rows:
    rowsize = 0
    for col, text in zip(columns, is_text):
        if text:
            # Strings are stored NUL-terminated: encoded bytes plus one.
            rowsize += len(_as_bytes(col[row])) + 1
        else:
            rowsize += col.dtype.itemsize
    if tmp < rowsize:
        break
    tmp -= rowsize
    row += 1

# Column-major dump: header, then every column's values back to back.
with open('out.col', 'wb+') as out:
    # message length
    numpy.array(chunkbytes - tmp, dtype=numpy.int64).tofile(out)
    # row count
    numpy.array(row, dtype=numpy.int64).tofile(out)
    for col, text in zip(columns, is_text):
        if text:
            for o in range(startrow, startrow + row):
                # The file is binary, so text must be written as bytes.
                out.write(_as_bytes(col[o]))
                out.write(b'\0')
        else:
            numpy.array(col[startrow:(startrow + row)]).tofile(out)

# Row-major dump: header, then each row's fields in column order.
with open('out.row', 'wb+') as out:
    # header
    # NOTE(review): this length still includes the 16 bytes reserved above,
    # although only this single int64 is written here - confirm the reader
    # expects that.
    numpy.array(chunkbytes - tmp, dtype=numpy.int64).tofile(out)
    for r in range(startrow, startrow + row):
        for col, text in zip(columns, is_text):
            if text:
                out.write(_as_bytes(col[r]))
                out.write(b'\0')
            else:
                numpy.array(col[r]).tofile(out)
|