Files
@ 5f874ae74f0f
Branch filter:
Location: DA/protocols/convert.py - annotation
5f874ae74f0f
2.0 KiB
text/x-python
new pictures
c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 | import numpy
import pandas
from datetime import datetime
import time
def datetoint(col):
    """Convert 'YYYY-MM-DD' date strings to an int64 array of epoch seconds.

    Each entry of *col* is parsed with ``datetime.strptime`` and converted to
    seconds since the epoch by ``time.mktime``.  ``time.mktime`` interprets
    the broken-down time as local time, so the resulting values depend on the
    timezone of the machine running the conversion.

    col -- iterable of date strings in '%Y-%m-%d' format.
    Returns a one-dimensional numpy array with dtype int64.
    """
    stamps = []
    for text in col:
        parsed = datetime.strptime(text, '%Y-%m-%d')
        stamps.append(time.mktime(parsed.timetuple()))
    # mktime yields floats; the int64 dtype drops the (zero) fractional part.
    return numpy.array(stamps, dtype=numpy.int64)
# Load the pipe-separated lineitem table.  The file has no header row, so the
# sixteen column names are supplied explicitly.
cfile = pandas.read_csv(
    'lineitem-1m.tbl',
    sep='|',
    header=None,
    names=[
        'l_orderkey', 'l_partkey', 'l_suppkey', 'l_linenumber',
        'l_quantity', 'l_extendedprice', 'l_discount', 'l_tax',
        'l_returnflag', 'l_linestatus', 'l_shipdate', 'l_commitdate',
        'l_receiptdate', 'l_shipinstruct', 'l_shipmode', 'l_comment',
    ],
)

# Cast the integer-like columns to compact fixed-width integer types; these
# widths determine how many bytes each value occupies in the binary output.
for _name, _dtype in (
    ('l_orderkey', numpy.int32),
    ('l_partkey', numpy.int32),
    ('l_suppkey', numpy.int16),
    ('l_linenumber', numpy.int8),
    ('l_quantity', numpy.int8),
):
    cfile[_name] = cfile[_name].astype(_dtype)

# Replace the three date-string columns with int64 epoch seconds.
for _name in ('l_shipdate', 'l_commitdate', 'l_receiptdate'):
    cfile[_name] = datetoint(cfile[_name])
# Byte budget for one output chunk.  16 bytes are reserved up front, matching
# two int64 header fields (message length and row count).
chunkbytes=1000000
tmp = chunkbytes
tmp -= 16
row = 0
startrow = 0
rowsize = 0

# Split the columns once: fixed-width columns contribute a constant number of
# bytes per row, string (object dtype) columns a per-row length.  The builtin
# `object` is used because the `numpy.object` alias was removed in NumPy 1.24.
_string_cols = [cfile[name] for name in cfile.columns
                if cfile[name].dtype == object]
_fixed_width = sum(cfile[name].dtype.itemsize for name in cfile.columns
                   if cfile[name].dtype != object)

# Count how many leading rows fit into the remaining budget.  After the loop,
# `row` is the number of rows that fit and `tmp` the unused bytes.  The loop
# also stops at the end of the table, so an input smaller than the budget no
# longer indexes past the last row.
while row < len(cfile):
    rowsize = _fixed_width
    for col in _string_cols:
        value = col[row]
        # Measure strings in bytes, since that is what ends up in the file.
        if not isinstance(value, bytes):
            value = value.encode('utf-8')
        # +1 for the NUL terminator written after each string.
        rowsize += len(value) + 1
    if tmp < rowsize:
        break
    tmp -= rowsize
    row += 1
# Column-major dump: a 16-byte header followed by every column in turn, each
# holding the values of rows startrow .. startrow+row-1.
# The `with` block closes the file even if a write fails.
with open('out.col', 'wb+') as out:
    # message length
    numpy.array(chunkbytes - tmp, dtype=numpy.int64).tofile(out)
    # row count
    numpy.array(row, dtype=numpy.int64).tofile(out)
    for name in cfile.columns:
        col = cfile[name]
        # String columns have object dtype (`numpy.object` was removed in
        # NumPy 1.24, so compare against the builtin `object`).
        if col.dtype == object:
            for o in range(startrow, (startrow + row)):
                value = col[o]
                # The file is opened in binary mode, which only accepts bytes.
                if not isinstance(value, bytes):
                    value = value.encode('utf-8')
                # Each string is written NUL-terminated.
                out.write(value + b'\0')
        else:
            # Fixed-width columns are written as one contiguous raw array.
            numpy.array(col[startrow:(startrow + row)]).tofile(out)
# Row-major dump: an 8-byte header followed by the rows
# startrow .. startrow+row-1, each written field by field in column order.
# The `with` block closes the file even if a write fails.
with open('out.row', 'wb+') as out:
    # header
    # NOTE(review): chunkbytes - tmp includes the 16 bytes reserved for the
    # header, but only this single int64 (8 bytes) is written here and no row
    # count follows -- confirm this is what the reader of out.row expects.
    numpy.array(chunkbytes - tmp, dtype=numpy.int64).tofile(out)
    # Look the columns up once instead of once per row.
    _columns = [cfile[name] for name in cfile.columns]
    for r in range(startrow, startrow+row):
        for col in _columns:
            # String columns have object dtype (`numpy.object` was removed in
            # NumPy 1.24, so compare against the builtin `object`).
            if col.dtype == object:
                value = col[r]
                # The file is opened in binary mode, which only accepts bytes.
                if not isinstance(value, bytes):
                    value = value.encode('utf-8')
                # Each string is written NUL-terminated.
                out.write(value + b'\0')
            else:
                # A single scalar, written with the column's own dtype width.
                numpy.array(col[r]).tofile(out)
|