Files
@ 5dbb8a045bb1
Branch filter:
Location: DA/protocols/convert.py - annotation
5dbb8a045bb1
2.0 KiB
text/x-python
jdbc fixes
c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 c0e3a977dfe5 | import numpy
import pandas
from datetime import datetime
import time
def datetoint(col):
    """Convert an iterable of 'YYYY-MM-DD' strings to epoch seconds.

    Each string is parsed with ``datetime.strptime`` and turned into a
    timestamp via ``time.mktime``, which interprets the date in the
    machine's local time zone. The result is a ``numpy.int64`` array with
    one entry per input element.
    """
    seconds = []
    for text in col:
        parsed = datetime.strptime(text, '%Y-%m-%d')
        seconds.append(time.mktime(parsed.timetuple()))
    return numpy.array(seconds, dtype=numpy.int64)
# Load the '|'-separated lineitem table; the file has no header row, so the
# column names are supplied here.
cfile = pandas.read_csv(
    'lineitem-1m.tbl',
    sep='|',
    header=None,
    names=[
        'l_orderkey',
        'l_partkey',
        'l_suppkey',
        'l_linenumber',
        'l_quantity',
        'l_extendedprice',
        'l_discount',
        'l_tax',
        'l_returnflag',
        'l_linestatus',
        'l_shipdate',
        'l_commitdate',
        'l_receiptdate',
        'l_shipinstruct',
        'l_shipmode',
        'l_comment',
    ],
)

# Narrow the integer columns to fixed-width types so that their on-disk
# size (dtype.itemsize) is as small as intended.
cfile['l_orderkey'] = cfile['l_orderkey'].astype(numpy.int32)
cfile['l_partkey'] = cfile['l_partkey'].astype(numpy.int32)
cfile['l_suppkey'] = cfile['l_suppkey'].astype(numpy.int16)
cfile['l_linenumber'] = cfile['l_linenumber'].astype(numpy.int8)
cfile['l_quantity'] = cfile['l_quantity'].astype(numpy.int8)

# Replace the textual dates with int64 timestamps.
cfile['l_shipdate'] = datetoint(cfile['l_shipdate'])
cfile['l_commitdate'] = datetoint(cfile['l_commitdate'])
cfile['l_receiptdate'] = datetoint(cfile['l_receiptdate'])
# Size budget for one output chunk, in bytes.
chunkbytes = 1000000
# Bytes still free in the chunk. 16 bytes are set aside up front, matching the
# two int64 values (message length, row count) that open 'out.col'.
tmp = chunkbytes - 16
row = 0
startrow = 0
rowsize = 0
# Resolve the column objects once instead of on every row.
_columns = [cfile[name] for name in cfile.columns]
# Greedily count how many leading rows fit in the budget. The loop also stops
# at the end of the table, so a table smaller than one chunk no longer indexes
# past its last row.
while row < len(cfile):
    rowsize = 0
    for col in _columns:
        # numpy.object_ replaces the removed numpy.object alias.
        if col.dtype == numpy.object_:
            # Strings are stored NUL-terminated, hence the extra byte. Count
            # encoded bytes rather than characters.
            value = col[row]
            if not isinstance(value, bytes):
                value = value.encode('utf-8')
            rowsize += len(value) + 1
        else:
            rowsize += col.dtype.itemsize
    if tmp < rowsize:
        break
    tmp -= rowsize
    row += 1
# Column-major dump: header, then every column's values for the selected rows
# written back to back. The context manager closes the file even if a write
# fails part-way through.
with open('out.col', 'wb+') as out:
    # message length
    numpy.array(chunkbytes - tmp, dtype=numpy.int64).tofile(out)
    # row count
    numpy.array(row, dtype=numpy.int64).tofile(out)
    for name in cfile.columns:
        col = cfile[name]
        # numpy.object_ replaces the removed numpy.object alias.
        if col.dtype == numpy.object_:
            # String column: each value is written NUL-terminated.
            for o in range(startrow, startrow + row):
                value = col[o]
                # The file is binary, so text must be encoded first; values
                # that are already bytes are written unchanged.
                if not isinstance(value, bytes):
                    value = value.encode('utf-8')
                out.write(value)
                out.write(b'\0')
        else:
            # Fixed-width column: dump the raw values in one call.
            numpy.array(col[startrow:(startrow + row)]).tofile(out)
# Row-major dump: header, then for each selected row every field in column
# order. The context manager closes the file even if a write fails part-way
# through.
with open('out.row', 'wb+') as out:
    # header
    # NOTE(review): only the message length is written here, with no row
    # count as in 'out.col' -- confirm this is what the reader expects.
    numpy.array(chunkbytes - tmp, dtype=numpy.int64).tofile(out)
    # Look the columns up once rather than once per row.
    _row_columns = [cfile[name] for name in cfile.columns]
    for r in range(startrow, startrow + row):
        for col in _row_columns:
            # numpy.object_ replaces the removed numpy.object alias.
            if col.dtype == numpy.object_:
                value = col[r]
                # The file is binary, so text must be encoded first; values
                # that are already bytes are written unchanged.
                if not isinstance(value, bytes):
                    value = value.encode('utf-8')
                out.write(value)
                # String fields are NUL-terminated.
                out.write(b'\0')
            else:
                numpy.array(col[r]).tofile(out)
|