DA/protocols Files · convert.py · Centrum Wiskunde & Informatica (CWI)

Files @ c0e3a977dfe5

Branch filter:

Location: DA/protocols/convert.py

c0e3a977dfe5 2.0 KiB text/x-python Show Annotation Show as Raw Download as Raw

Hannes Muehleisen

more things

import numpy
import pandas
from datetime import datetime
import time


def datetoint(col):
	"""Convert 'YYYY-MM-DD' date strings into whole-second timestamps.

	Every entry of ``col`` is parsed with ``datetime.strptime`` and fed
	through ``time.mktime``, so midnight of each date is interpreted in the
	local timezone of the machine running the script.

	Returns a ``numpy.int64`` array holding one timestamp per entry.
	"""
	stamps = []
	for text in col:
		parsed = datetime.strptime(text, '%Y-%m-%d')
		stamps.append(time.mktime(parsed.timetuple()))
	# mktime yields floats; the int64 dtype truncates them to whole seconds.
	return numpy.array(stamps, dtype=numpy.int64)


# Names assigned to the '|'-separated fields of the input, in file order.
_COLUMN_NAMES = [
	'l_orderkey', 'l_partkey', 'l_suppkey', 'l_linenumber',
	'l_quantity', 'l_extendedprice', 'l_discount', 'l_tax',
	'l_returnflag', 'l_linestatus',
	'l_shipdate', 'l_commitdate', 'l_receiptdate',
	'l_shipinstruct', 'l_shipmode', 'l_comment',
]

# Load the headerless table into a DataFrame.
cfile = pandas.read_csv('lineitem-1m.tbl', sep='|', header=None, names=_COLUMN_NAMES)

# Narrow the integer columns to the smallest fixed-width type chosen for each.
for _name, _dtype in (
	('l_orderkey', numpy.int32),
	('l_partkey', numpy.int32),
	('l_suppkey', numpy.int16),
	('l_linenumber', numpy.int8),
	('l_quantity', numpy.int8),
):
	cfile[_name] = cfile[_name].astype(_dtype)

# Replace the date strings by int64 timestamps produced by datetoint().
for _name in ('l_shipdate', 'l_commitdate', 'l_receiptdate'):
	cfile[_name] = datetoint(cfile[_name])


# Size budget, in bytes, of one output chunk.
chunkbytes=1000000

# Bytes still unused in the chunk. 16 bytes are reserved before any row is
# counted (the column-format output starts with two int64 header fields).
tmp = chunkbytes
tmp -= 16
# Number of rows that fit into the chunk, and the first row of the chunk.
row = 0
startrow = 0

rowsize = 0

# Classify the columns once instead of re-inspecting every dtype per row.
# The builtin ``object`` is used because the ``numpy.object`` alias no longer
# exists in current NumPy releases.
_text_cols = []
_fixed_width = 0
for _name in cfile.columns:
	_col = cfile[_name]
	if _col.dtype == object:
		_text_cols.append(_col)
	else:
		_fixed_width += _col.dtype.itemsize

# Greedily take rows from the top of the table while they still fit.
# Bounding the loop by the table length stops cleanly when every row fits,
# rather than indexing past the last row.
while row < len(cfile):
	rowsize = _fixed_width
	for _col in _text_cols:
		_val = _col[row]
		# Count the bytes that will actually be written; byte strings are
		# measured as they are, text is measured in its UTF-8 encoding.
		if not isinstance(_val, bytes):
			_val = _val.encode('utf-8')
		# +1 for the NUL terminator that follows every string value.
		rowsize += len(_val) + 1
	if tmp < rowsize:
		break
	tmp -= rowsize
	row += 1


# Write rows [startrow, startrow + row) in column-major layout: two int64
# header fields, then the values of each column in turn. String columns are
# written as NUL-terminated byte strings; all other columns as raw arrays.
# The ``with`` block closes the file even if a write fails.
with open('out.col', 'wb+') as out:
	# message length
	numpy.array(chunkbytes - tmp, dtype=numpy.int64).tofile(out)
	# row count
	numpy.array(row, dtype=numpy.int64).tofile(out)

	for name in cfile.columns:
		col = cfile[name]
		# Builtin ``object`` instead of the removed ``numpy.object`` alias.
		if col.dtype == object:
			for o in range(startrow, (startrow + row)):
				val = col[o]
				# The file is binary: encode text, pass byte strings through.
				if not isinstance(val, bytes):
					val = val.encode('utf-8')
				out.write(val)
				out.write(b'\0')
		else:
			numpy.array(col[startrow:(startrow + row)]).tofile(out)



# Write rows [startrow, startrow + row) in row-major layout: one int64 header
# field, then for every row the values of all columns in column order.
# String values are NUL-terminated byte strings; other values are written in
# their column's fixed-width binary form.
# The ``with`` block closes the file even if a write fails.
with open('out.row', 'wb+') as out:
	# header
	# NOTE(review): the length written here was computed with 16 bytes
	# reserved for headers, while this file emits a single 8-byte field;
	# confirm that readers of this format expect that.
	numpy.array(chunkbytes - tmp, dtype=numpy.int64).tofile(out)

	# Resolve each column and its kind once, not once per row.
	# Builtin ``object`` instead of the removed ``numpy.object`` alias.
	_fields = []
	for name in cfile.columns:
		col = cfile[name]
		_fields.append((col, col.dtype == object))

	for r in range(startrow, startrow+row):
		for col, _is_text in _fields:
			if _is_text:
				val = col[r]
				# The file is binary: encode text, pass byte strings through.
				if not isinstance(val, bytes):
					val = val.encode('utf-8')
				out.write(val)
				out.write(b'\0')
			else:
				numpy.array(col[r]).tofile(out)