diff --git a/ruby/red-arrow-format/benchmark/file-writer.yaml b/ruby/red-arrow-format/benchmark/file-writer.yaml new file mode 100644 index 00000000000..37b89f5bff7 --- /dev/null +++ b/ruby/red-arrow-format/benchmark/file-writer.yaml @@ -0,0 +1,89 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +prelude: | + Warning[:experimental] = false + + require "arrow" + require "arrow-format" + + seed = 29 + random = Random.new(seed) + + n_columns = 100 + n_rows = 10000 + max_uint32 = 2 ** 32 - 1 + arrays = n_columns.times.collect do |i| + if i.even? + Arrow::UInt32Array.new(n_rows.times.collect {random.rand(max_uint32)}) + else + Arrow::BinaryArray.new(n_rows.times.collect {random.bytes(random.rand(10))}) + end + end + columns = arrays.collect.with_index {|array, i| [i, array]} + red_arrow_table = Arrow::Table.new(columns) + + fields = arrays.collect.with_index do |array, i| + case array + when Arrow::UInt32Array + type = ArrowFormat::UInt32Type.singleton + when Arrow::BinaryArray + type = ArrowFormat::BinaryType.singleton + end + ArrowFormat::Field.new(i.to_s, type) + end + schema = ArrowFormat::Schema.new(fields) + def convert_buffer(buffer) + return nil if buffer.nil? + IO::Buffer.for(buffer.data.to_s.dup) + end + columns = fields.zip(arrays).collect do |field, array| + case array + when Arrow::UInt32Array + field.type.build_array(n_rows, + convert_buffer(array.null_bitmap), + convert_buffer(array.data_buffer)) + when Arrow::BinaryArray + field.type.build_array(n_rows, + convert_buffer(array.null_bitmap), + convert_buffer(array.offsets_buffer), + convert_buffer(array.data_buffer)) + end + end + red_arrow_format_record_batch = + ArrowFormat::RecordBatch.new(schema, n_rows, columns) + + GC.start + GC.disable +benchmark: + "Arrow::Table#save": | + buffer = Arrow::ResizableBuffer.new(4096) + red_arrow_table.save(buffer, format: :arrow_file) + "Arrow::RecordBatchFileWriter": | + buffer = Arrow::ResizableBuffer.new(4096) + Arrow::BufferOutputStream.open(buffer) do |output| + schema = red_arrow_table.schema + Arrow::RecordBatchFileWriter.open(output, schema) do |writer| + writer.write_table(red_arrow_table) + end + end + "ArrowFormat::FileWriter": | + output = +"".b + writer = ArrowFormat::FileWriter.new(output) + writer.start(red_arrow_format_record_batch.schema) + writer.write_record_batch(red_arrow_format_record_batch) + writer.finish diff --git a/ruby/red-arrow-format/benchmark/streaming-writer.yaml b/ruby/red-arrow-format/benchmark/streaming-writer.yaml new file mode 100644 index 00000000000..824e71dff67 --- /dev/null +++ b/ruby/red-arrow-format/benchmark/streaming-writer.yaml @@ -0,0 +1,89 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +prelude: | + Warning[:experimental] = false + + require "arrow" + require "arrow-format" + + seed = 29 + random = Random.new(seed) + + n_columns = 100 + n_rows = 10000 + max_uint32 = 2 ** 32 - 1 + arrays = n_columns.times.collect do |i| + if i.even? + Arrow::UInt32Array.new(n_rows.times.collect {random.rand(max_uint32)}) + else + Arrow::BinaryArray.new(n_rows.times.collect {random.bytes(random.rand(10))}) + end + end + columns = arrays.collect.with_index {|array, i| [i, array]} + red_arrow_table = Arrow::Table.new(columns) + + fields = arrays.collect.with_index do |array, i| + case array + when Arrow::UInt32Array + type = ArrowFormat::UInt32Type.singleton + when Arrow::BinaryArray + type = ArrowFormat::BinaryType.singleton + end + ArrowFormat::Field.new(i.to_s, type) + end + schema = ArrowFormat::Schema.new(fields) + def convert_buffer(buffer) + return nil if buffer.nil? + IO::Buffer.for(buffer.data.to_s.dup) + end + columns = fields.zip(arrays).collect do |field, array| + case array + when Arrow::UInt32Array + field.type.build_array(n_rows, + convert_buffer(array.null_bitmap), + convert_buffer(array.data_buffer)) + when Arrow::BinaryArray + field.type.build_array(n_rows, + convert_buffer(array.null_bitmap), + convert_buffer(array.offsets_buffer), + convert_buffer(array.data_buffer)) + end + end + red_arrow_format_record_batch = + ArrowFormat::RecordBatch.new(schema, n_rows, columns) + + GC.start + GC.disable +benchmark: + "Arrow::Table#save": | + buffer = Arrow::ResizableBuffer.new(4096) + red_arrow_table.save(buffer, format: :arrow_streaming) + "Arrow::RecordBatchStreamWriter": | + buffer = Arrow::ResizableBuffer.new(4096) + Arrow::BufferOutputStream.open(buffer) do |output| + schema = red_arrow_table.schema + Arrow::RecordBatchStreamWriter.open(output, schema) do |writer| + writer.write_table(red_arrow_table) + end + end + "ArrowFormat::StreamingWriter": | + output = +"".b + writer = ArrowFormat::StreamingWriter.new(output) + writer.start(red_arrow_format_record_batch.schema) + writer.write_record_batch(red_arrow_format_record_batch) + writer.finish diff --git a/ruby/red-arrow-format/lib/arrow-format/array.rb b/ruby/red-arrow-format/lib/arrow-format/array.rb index cb71a4d2550..9a248d279fd 100644 --- a/ruby/red-arrow-format/lib/arrow-format/array.rb +++ b/ruby/red-arrow-format/lib/arrow-format/array.rb @@ -140,8 +140,8 @@ def slice_offsets_buffer(id, buffer, buffer_type) end class NullArray < Array - def initialize(type, size) - super(type, size, nil) + def initialize(size) + super(NullType.singleton, size, nil) end def each_buffer @@ -186,6 +186,10 @@ def element_size end class BooleanArray < PrimitiveArray + def initialize(size, validity_buffer, values_buffer) + super(BooleanType.singleton, size, validity_buffer, values_buffer) + end + def to_a return [] if empty? @@ -209,51 +213,120 @@ def clear_cache end class IntArray < PrimitiveArray + def initialize(size, validity_buffer, values_buffer) + super(self.class.type, size, validity_buffer, values_buffer) + end end class Int8Array < IntArray + class << self + def type + Int8Type.singleton + end + end end class UInt8Array < IntArray + class << self + def type + UInt8Type.singleton + end + end end class Int16Array < IntArray + class << self + def type + Int16Type.singleton + end + end end class UInt16Array < IntArray + class << self + def type + UInt16Type.singleton + end + end end class Int32Array < IntArray + class << self + def type + Int32Type.singleton + end + end end class UInt32Array < IntArray + class << self + def type + UInt32Type.singleton + end + end end class Int64Array < IntArray + class << self + def type + Int64Type.singleton + end + end end class UInt64Array < IntArray + class << self + def type + UInt64Type.singleton + end + end end class FloatingPointArray < PrimitiveArray + def initialize(size, validity_buffer, values_buffer) + super(self.class.type, size, validity_buffer, values_buffer) + end end class Float32Array < FloatingPointArray + class << self + def type + Float32Type.singleton + end + end end class Float64Array < FloatingPointArray + class << self + def type + Float64Type.singleton + end + end end class TemporalArray < PrimitiveArray end class DateArray < TemporalArray + def initialize(size, validity_buffer, values_buffer) + super(self.class.type, size, validity_buffer, values_buffer) + end end class Date32Array < DateArray + class << self + def type + Date32Type.singleton + end + end end class Date64Array < DateArray + class << self + def type + Date64Type.singleton + end + end end class TimeArray < TemporalArray @@ -318,8 +391,8 @@ class DurationArray < TemporalArray end class VariableSizeBinaryArray < Array - def initialize(type, size, validity_buffer, offsets_buffer, values_buffer) - super(type, size, validity_buffer) + def initialize(size, validity_buffer, offsets_buffer, values_buffer) + super(self.class.type, size, validity_buffer) @offsets_buffer = offsets_buffer @values_buffer = values_buffer end @@ -364,18 +437,38 @@ def offset_size end class BinaryArray < VariableSizeBinaryArray + class << self + def type + BinaryType.singleton + end + end end class LargeBinaryArray < VariableSizeBinaryArray + class << self + def type + LargeBinaryType.singleton + end + end end class VariableSizeUTF8Array < VariableSizeBinaryArray end class UTF8Array < VariableSizeUTF8Array + class << self + def type + UTF8Type.singleton + end + end end class LargeUTF8Array < VariableSizeUTF8Array + class << self + def type + LargeUTF8Type.singleton + end + end end class FixedSizeBinaryArray < Array diff --git a/ruby/red-arrow-format/lib/arrow-format/type.rb b/ruby/red-arrow-format/lib/arrow-format/type.rb index 17674af30c7..38523cf00bf 100644 --- a/ruby/red-arrow-format/lib/arrow-format/type.rb +++ b/ruby/red-arrow-format/lib/arrow-format/type.rb @@ -33,7 +33,7 @@ def name end def build_array(size) - NullArray.new(self, size) + NullArray.new(size) end def to_flatbuffers @@ -56,7 +56,7 @@ def name end def build_array(size, validity_buffer, values_buffer) - BooleanArray.new(self, size, validity_buffer, values_buffer) + BooleanArray.new(size, validity_buffer, values_buffer) end def to_flatbuffers @@ -107,7 +107,7 @@ def buffer_type end def build_array(size, validity_buffer, values_buffer) - Int8Array.new(self, size, validity_buffer, values_buffer) + Int8Array.new(size, validity_buffer, values_buffer) end end @@ -131,7 +131,7 @@ def buffer_type end def build_array(size, validity_buffer, values_buffer) - UInt8Array.new(self, size, validity_buffer, values_buffer) + UInt8Array.new(size, validity_buffer, values_buffer) end end @@ -155,7 +155,7 @@ def buffer_type end def build_array(size, validity_buffer, values_buffer) - Int16Array.new(self, size, validity_buffer, values_buffer) + Int16Array.new(size, validity_buffer, values_buffer) end end @@ -179,7 +179,7 @@ def buffer_type end def build_array(size, validity_buffer, values_buffer) - UInt16Array.new(self, size, validity_buffer, values_buffer) + UInt16Array.new(size, validity_buffer, values_buffer) end end @@ -203,7 +203,7 @@ def buffer_type end def build_array(size, validity_buffer, values_buffer) - Int32Array.new(self, size, validity_buffer, values_buffer) + Int32Array.new(size, validity_buffer, values_buffer) end end @@ -227,7 +227,7 @@ def buffer_type end def build_array(size, validity_buffer, values_buffer) - UInt32Array.new(self, size, validity_buffer, values_buffer) + UInt32Array.new(size, validity_buffer, values_buffer) end end @@ -251,7 +251,7 @@ def buffer_type end def build_array(size, validity_buffer, values_buffer) - Int64Array.new(self, size, validity_buffer, values_buffer) + Int64Array.new(size, validity_buffer, values_buffer) end end @@ -275,7 +275,7 @@ def buffer_type end def build_array(size, validity_buffer, values_buffer) - UInt64Array.new(self, size, validity_buffer, values_buffer) + UInt64Array.new(size, validity_buffer, values_buffer) end end @@ -313,7 +313,7 @@ def buffer_type end def build_array(size, validity_buffer, values_buffer) - Float32Array.new(self, size, validity_buffer, values_buffer) + Float32Array.new(size, validity_buffer, values_buffer) end end @@ -337,7 +337,7 @@ def buffer_type end def build_array(size, validity_buffer, values_buffer) - Float64Array.new(self, size, validity_buffer, values_buffer) + Float64Array.new(size, validity_buffer, values_buffer) end end @@ -378,7 +378,7 @@ def buffer_type end def build_array(size, validity_buffer, values_buffer) - Date32Array.new(self, size, validity_buffer, values_buffer) + Date32Array.new(size, validity_buffer, values_buffer) end end @@ -402,7 +402,7 @@ def buffer_type end def build_array(size, validity_buffer, values_buffer) - Date64Array.new(self, size, validity_buffer, values_buffer) + Date64Array.new(size, validity_buffer, values_buffer) end end @@ -628,8 +628,7 @@ def encoding end def build_array(size, validity_buffer, offsets_buffer, values_buffer) - BinaryArray.new(self, - size, + BinaryArray.new(size, validity_buffer, offsets_buffer, values_buffer) @@ -660,8 +659,7 @@ def encoding end def build_array(size, validity_buffer, offsets_buffer, values_buffer) - LargeBinaryArray.new(self, - size, + LargeBinaryArray.new(size, validity_buffer, offsets_buffer, values_buffer) @@ -692,7 +690,7 @@ def encoding end def build_array(size, validity_buffer, offsets_buffer, values_buffer) - UTF8Array.new(self, size, validity_buffer, offsets_buffer, values_buffer) + UTF8Array.new(size, validity_buffer, offsets_buffer, values_buffer) end def to_flatbuffers @@ -720,8 +718,7 @@ def encoding end def build_array(size, validity_buffer, offsets_buffer, values_buffer) - LargeUTF8Array.new(self, - size, + LargeUTF8Array.new(size, validity_buffer, offsets_buffer, values_buffer)