"""Convert a '|'-separated lineitem table into two binary chunk files.

The first rows of ``lineitem-1m.tbl`` that fit into one fixed-size chunk are
written twice:

* ``out.col`` -- column-major: an int64 message length, an int64 row count,
  then every column in turn.
* ``out.row`` -- row-major: an int64 message length, then every row in turn.

Fixed-width columns are written as their raw native-endian bytes; text
columns are written as UTF-8 bytes followed by a NUL terminator.
"""
import time
from datetime import datetime

import numpy
import pandas


INPUT_PATH = 'lineitem-1m.tbl'

LINEITEM_COLUMNS = [
    'l_orderkey', 'l_partkey', 'l_suppkey', 'l_linenumber', 'l_quantity',
    'l_extendedprice', 'l_discount', 'l_tax', 'l_returnflag', 'l_linestatus',
    'l_shipdate', 'l_commitdate', 'l_receiptdate', 'l_shipinstruct',
    'l_shipmode', 'l_comment',
]

# Narrow integer types used for the on-disk representation.
INTEGER_TYPES = {
    'l_orderkey': numpy.int32,
    'l_partkey': numpy.int32,
    'l_suppkey': numpy.int16,
    'l_linenumber': numpy.int8,
    'l_quantity': numpy.int8,
}

DATE_COLUMNS = ['l_shipdate', 'l_commitdate', 'l_receiptdate']

# Total size budget of one chunk, header included.
CHUNK_BYTES = 1000000
# Space reserved for the header: two int64 values (message length, row count).
HEADER_BYTES = 16


def datetoint(col):
    """Convert an iterable of 'YYYY-MM-DD' strings to int64 epoch seconds.

    NOTE(review): ``time.mktime`` interprets each date as local midnight, so
    the values depend on the time zone of the machine running the script.
    Kept as-is so existing outputs do not shift; confirm whether UTC was
    intended.
    """
    return numpy.array(
        [time.mktime(datetime.strptime(x, '%Y-%m-%d').timetuple())
         for x in col],
        dtype=numpy.int64)


def load_lineitem(path):
    """Read the '|'-separated table at *path* and narrow its column types."""
    # NOTE(review): input lines that end with a trailing '|' would give
    # pandas one field more than there are names -- confirm the input has
    # no trailing delimiter (or pass index_col=False).
    frame = pandas.read_csv(path, sep='|', header=None,
                            names=LINEITEM_COLUMNS)
    frame = frame.astype(INTEGER_TYPES)
    for name in DATE_COLUMNS:
        frame[name] = datetoint(frame[name])
    return frame


def _is_text(col):
    """Return True when *col* holds strings rather than fixed-width values."""
    dtype = col.dtype
    # ``numpy.object`` no longer exists; the builtin ``object`` is the
    # equivalent dtype.  Newer pandas may also use a dedicated string dtype.
    return dtype == object or pandas.api.types.is_string_dtype(dtype)


def _encode(value):
    """Return the on-disk bytes of one text cell, NUL terminator included."""
    return value.encode('utf-8') + b'\0'


def row_sizes(frame):
    """Return an int64 array holding the serialized byte size of each row."""
    sizes = numpy.zeros(len(frame), dtype=numpy.int64)
    for _, col in frame.items():
        if _is_text(col):
            # Measure encoded bytes (not characters) so the size estimate
            # matches exactly what the writers emit.
            sizes += numpy.fromiter((len(_encode(v)) for v in col),
                                    dtype=numpy.int64, count=len(col))
        else:
            sizes += col.dtype.itemsize
    return sizes


def chunk_extent(frame, budget, startrow=0):
    """Find how many rows starting at *startrow* fit into *budget* bytes.

    Returns ``(nrows, nbytes)``: the number of consecutive rows whose
    serialized sizes sum to at most *budget*, and that sum.  Stops at the end
    of the table instead of indexing past it.
    """
    remaining = max(len(frame) - startrow, 0)
    # Smallest possible row: all fixed-width cells plus one NUL per text cell.
    floor = sum(1 if _is_text(col) else col.dtype.itemsize
                for _, col in frame.items())
    if floor:
        # Only rows that could possibly fit need to be measured.
        limit = max(0, min(remaining, budget // floor))
    else:
        limit = remaining
    fitted = numpy.cumsum(row_sizes(frame.iloc[startrow:startrow + limit]))
    nrows = int(numpy.searchsorted(fitted, budget, side='right'))
    nbytes = int(fitted[nrows - 1]) if nrows else 0
    return nrows, nbytes


def write_columnar(out, frame, startrow, nrows, message_length):
    """Write rows [startrow, startrow + nrows) to *out* column by column.

    *out* is a binary file object.  Layout: int64 message length, int64 row
    count, then each column's values in table column order.
    """
    stop = startrow + nrows
    out.write(numpy.int64(message_length).tobytes())
    out.write(numpy.int64(nrows).tobytes())
    for _, col in frame.items():
        window = col.iloc[startrow:stop]
        if _is_text(col):
            out.write(b''.join(_encode(v) for v in window))
        else:
            out.write(window.to_numpy().tobytes())


def write_rowwise(out, frame, startrow, nrows, message_length):
    """Write rows [startrow, startrow + nrows) to *out* row by row.

    *out* is a binary file object.  Layout: int64 message length, then for
    each row every cell in table column order.

    NOTE(review): unlike the columnar file, no row count is written here even
    though *message_length* accounts for a 16-byte header; kept unchanged --
    confirm against the reader.
    """
    stop = startrow + nrows
    out.write(numpy.int64(message_length).tobytes())
    cells = []
    for _, col in frame.items():
        window = col.iloc[startrow:stop]
        if _is_text(col):
            cells.append([_encode(v) for v in window])
        else:
            cells.append([v.tobytes() for v in window.to_numpy()])
    for record in zip(*cells):
        out.write(b''.join(record))


def main():
    """Convert the first chunk of the input table into out.col and out.row."""
    frame = load_lineitem(INPUT_PATH)
    startrow = 0
    nrows, nbytes = chunk_extent(frame, CHUNK_BYTES - HEADER_BYTES, startrow)
    message_length = HEADER_BYTES + nbytes

    with open('out.col', 'wb') as out:
        write_columnar(out, frame, startrow, nrows, message_length)

    with open('out.row', 'wb') as out:
        write_rowwise(out, frame, startrow, nrows, message_length)


if __name__ == '__main__':
    main()