    # (tail of the int64 variant, truncated above)
    return datetime.time(hour, minute, second, milisec)

def expected_data_transform_int32(data):
    # TIME comes back as an int32 carrying four fractional digits; the
    # "milisec" name is kept from the source, but the value is scaled to
    # microseconds for datetime.time.
    milisec = data % (10**4)
    milisec *= 10**2  # 1/10**4-second units -> microseconds
    data //= 10**4
    second = data % 60
    data //= 60
    minute = data % 60
    hour = data // 60
    return datetime.time(hour, minute, second, milisec)

iterate_over_test_chunk([pyarrow.int64(), pyarrow.int64()],
                        column_meta_int64, time_generator_int64,
                        expected_data_transform_int64)
iterate_over_test_chunk([pyarrow.int32(), pyarrow.int32()],
                        column_meta_int32, time_generator_int32,
                        expected_data_transform_int32)
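# A quick sanity check of the int32 decoding above (illustrative values only):
# 13:59:07.1234 is encoded as ((13*60 + 59)*60 + 7) * 10**4 + 1234.
encoded = ((13 * 60 + 59) * 60 + 7) * 10**4 + 1234
assert expected_data_transform_int32(encoded) == datetime.time(13, 59, 7, 123400)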
# (snippet truncated above; the enclosing constructor is assumed to be
#  sa.Table, and the attribute name below is a guess)
self.sa_table = sa.Table(
    'unit_test_table',
    self.sa_meta,
    sa.Column('real_col', sa.REAL),
    sa.Column('bigint_col', sa.BIGINT),
    sa.Column('int_col', sa.INTEGER),
    sa.Column('smallint_col', sa.SMALLINT),
    sa.Column('bool_col', sa.BOOLEAN),
    sa.Column('str_col', sa.VARCHAR),
    sa.Column('timestamp_col', sa.TIMESTAMP),
    sa.Column('plaintext_col', sa.TEXT),
)
self.expected_datatypes = [
    pa.float32(),        # REAL
    pa.int64(),          # BIGINT
    pa.int32(),          # INTEGER
    pa.int16(),          # SMALLINT
    pa.bool_(),          # BOOLEAN
    pa.string(),         # VARCHAR
    pa.timestamp('ns'),  # TIMESTAMP
    pa.string(),         # TEXT
]
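# The expectations above suggest a simple lookup table. A minimal sketch of
# such a SQLAlchemy-to-Arrow mapping (names and coverage here are assumptions,
# not the project's actual converter):
import pyarrow as pa
import sqlalchemy as sa

SA_TO_ARROW = {
    sa.REAL: pa.float32(),
    sa.BIGINT: pa.int64(),
    sa.INTEGER: pa.int32(),
    sa.SMALLINT: pa.int16(),
    sa.BOOLEAN: pa.bool_(),
    sa.VARCHAR: pa.string(),
    sa.TIMESTAMP: pa.timestamp('ns'),
    sa.TEXT: pa.string(),
}

def arrow_type_for(column):
    # type(column.type) is the SQLAlchemy type class, e.g. sa.INTEGER.
    return SA_TO_ARROW[type(column.type)]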
def get_arrow_type(self, dt, is_list):
    """Map a dtype to the corresponding pyarrow DataType."""
    if dt == dtypes.bool:
        arrow_type = pa.bool_()
    elif dt == dtypes.int8:
        arrow_type = pa.int8()
    elif dt == dtypes.int16:
        arrow_type = pa.int16()
    elif dt == dtypes.int32:
        arrow_type = pa.int32()
    elif dt == dtypes.int64:
        arrow_type = pa.int64()
    elif dt == dtypes.uint8:
        arrow_type = pa.uint8()
    elif dt == dtypes.uint16:
        arrow_type = pa.uint16()
    elif dt == dtypes.uint32:
        arrow_type = pa.uint32()
    elif dt == dtypes.uint64:
        arrow_type = pa.uint64()
    elif dt == dtypes.float16:
        arrow_type = pa.float16()
    elif dt == dtypes.float32:
        arrow_type = pa.float32()
    elif dt == dtypes.float64:
        arrow_type = pa.float64()
    # (the chain continues in the original; snippet truncated here)
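# The ladder above can be expressed as a dict lookup instead; a sketch assuming
# the same dtypes module (list handling and the remaining branches omitted):
_DTYPE_TO_ARROW = {
    dtypes.bool: pa.bool_(),
    dtypes.int8: pa.int8(),
    dtypes.int16: pa.int16(),
    dtypes.int32: pa.int32(),
    dtypes.int64: pa.int64(),
    dtypes.uint8: pa.uint8(),
    dtypes.uint16: pa.uint16(),
    dtypes.uint32: pa.uint32(),
    dtypes.uint64: pa.uint64(),
    dtypes.float16: pa.float16(),
    dtypes.float32: pa.float32(),
    dtypes.float64: pa.float64(),
}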
# (CREATE statement truncated above; the leading columns are assumed from the
#  `names` list below)
create = ('''CREATE TABLE all_types (
    boolean_ BOOLEAN,
    smallint_ SMALLINT,
    int_ INT,
    bigint_ BIGINT,
    float_ FLOAT,
    double_ DOUBLE,
    varchar_ VARCHAR(40),
    text_ TEXT,
    time_ TIME,
    timestamp_ TIMESTAMP,
    date_ DATE
);''')
# skipping decimal for now
c.execute(create)
names = ['boolean_', 'smallint_', 'int_', 'bigint_',
         'float_', 'double_', 'varchar_', 'text_',
         'time_', 'timestamp_', 'date_']
columns = [pa.array([True, False, None], type=pa.bool_()),
           pa.array([1, 0, None]).cast(pa.int16()),
           pa.array([1, 0, None]).cast(pa.int32()),
           pa.array([1, 0, None]),
           pa.array([1.0, 1.1, None]).cast(pa.float32()),
           pa.array([1.0, 1.1, None]),
           # no fixed-width string
           pa.array(['a', 'b', None]),
           pa.array(['a', 'b', None]),
           (pa.array([1, 2, None]).cast(pa.int32())
            .cast(pa.time32('s'))),
           pa.array([datetime.datetime(2016, 1, 1, 12, 12, 12),
                     datetime.datetime(2017, 1, 1), None]),
           pa.array([datetime.date(2016, 1, 1),
                     datetime.date(2017, 1, 1), None])]
table = pa.Table.from_arrays(columns, names=names)
con.load_table_arrow("all_types", table)
c.execute('drop table if exists all_types;')
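# Before loading, the Arrow schema can be spot-checked against the DDL; a
# small hedged example using only the `table` built above:
assert table.num_rows == 3
assert table.schema.field('int_').type == pa.int32()
assert table.schema.field('time_').type == pa.time32('s')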
def test_convert_json():
    """
    Test converting a JSON file to Parquet.
    """
    schema = pa.schema([
        pa.field("foo", pa.int32()),
        pa.field("bar", pa.int64())
    ])

    input_path = "{}/tests/fixtures/simple_json.txt".format(os.getcwd())
    expected_file = "{}/tests/fixtures/simple.parquet".format(os.getcwd())

    with tempfile.NamedTemporaryFile() as f:
        output_file = f.name
        client.convert_json(input_path, output_file, schema)
        output = pq.ParquetFile(output_file)
        expected = pq.ParquetFile(expected_file)
        assert output.metadata.num_columns == expected.metadata.num_columns
        assert output.metadata.num_rows == expected.metadata.num_rows
        assert output.schema.equals(expected.schema)
        assert (output.read_row_group(0).to_pydict()
                == expected.read_row_group(0).to_pydict())
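# For reference, a minimal sketch of what a convert_json like the one under
# test might do with pyarrow's own JSON reader (an assumption about the
# implementation, not the project's code):
import pyarrow.json as pa_json
import pyarrow.parquet as pq

def convert_json_sketch(input_path, output_file, schema):
    # Read newline-delimited JSON, coercing columns to the caller's schema.
    table = pa_json.read_json(
        input_path,
        parse_options=pa_json.ParseOptions(explicit_schema=schema))
    pq.write_table(table, output_file)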
def to_arrow_type(dt):
    """Convert a Spark data type to the corresponding pyarrow type."""
    import pyarrow as pa
    if type(dt) == BooleanType:
        arrow_type = pa.bool_()
    elif type(dt) == ByteType:
        arrow_type = pa.int8()
    elif type(dt) == ShortType:
        arrow_type = pa.int16()
    elif type(dt) == IntegerType:
        arrow_type = pa.int32()
    elif type(dt) == LongType:
        arrow_type = pa.int64()
    elif type(dt) == FloatType:
        arrow_type = pa.float32()
    elif type(dt) == DoubleType:
        arrow_type = pa.float64()
    elif type(dt) == DecimalType:
        arrow_type = pa.decimal128(dt.precision, dt.scale)
    elif type(dt) == StringType:
        arrow_type = pa.string()
    elif type(dt) == BinaryType:
        arrow_type = pa.binary()
    elif type(dt) == DateType:
        arrow_type = pa.date32()
    elif type(dt) == TimestampType:
        # Timestamps should be in UTC; JVM Arrow timestamps require a
        # timezone to be read back correctly.
        arrow_type = pa.timestamp('us', tz='UTC')
# (tail of a truncated mapping from Arrow type ids to Python types)
    pa.string().id: str,
    # Use any list type here; only LIST is important
    pa.list_(pa.string()).id: list,
    pa.duration("ns").id: datetime.timedelta,
}
_string_type_map = {"date64[ms]": pa.date64(), "string": pa.string()}
_examples = {
    pa.null(): pa.array([None, None], type=pa.null()),
    pa.bool_(): pa.array([None, True], type=pa.bool_()),
    pa.int8(): pa.array([None, -1], type=pa.int8()),
    pa.uint8(): pa.array([None, 1], type=pa.uint8()),
    pa.int16(): pa.array([None, -1], type=pa.int16()),
    pa.uint16(): pa.array([None, 1], type=pa.uint16()),
    pa.int32(): pa.array([None, -1], type=pa.int32()),
    pa.uint32(): pa.array([None, 1], type=pa.uint32()),
    pa.int64(): pa.array([None, -1], type=pa.int64()),
    pa.uint64(): pa.array([None, 1], type=pa.uint64()),
    pa.float16(): pa.array([None, np.float16(-0.1)], type=pa.float16()),
    pa.float32(): pa.array([None, -0.1], type=pa.float32()),
    pa.float64(): pa.array([None, -0.1], type=pa.float64()),
    pa.date32(): pa.array([None, datetime.date(2010, 9, 8)], type=pa.date32()),
    pa.date64(): pa.array([None, datetime.date(2010, 9, 8)], type=pa.date64()),
    pa.timestamp("s"): pa.array(
        [None, datetime.datetime(2013, 12, 11, 10, 9, 8)], type=pa.timestamp("s")
    ),
    pa.timestamp("ms"): pa.array(
        [None, datetime.datetime(2013, 12, 11, 10, 9, 8, 1000)],
        type=pa.timestamp("ms")
    ),
    pa.timestamp("us"): pa.array(
        [None, datetime.datetime(2013, 12, 11, 10, 9, 8, 7)], type=pa.timestamp("us")
    ),
    # (further example entries truncated in this excerpt)
}
# (opening of this snippet is truncated; the def line below is assumed)
def _from_jvm_int_type(jvm_type):
    """
    Convert a JVM Arrow integer type to its pyarrow equivalent.

    Parameters
    ----------
    jvm_type: org.apache.arrow.vector.types.pojo.ArrowType$Int

    Returns
    -------
    typ: pyarrow.DataType
    """
    if jvm_type.isSigned:
        if jvm_type.bitWidth == 8:
            return pa.int8()
        elif jvm_type.bitWidth == 16:
            return pa.int16()
        elif jvm_type.bitWidth == 32:
            return pa.int32()
        elif jvm_type.bitWidth == 64:
            return pa.int64()
    else:
        if jvm_type.bitWidth == 8:
            return pa.uint8()
        elif jvm_type.bitWidth == 16:
            return pa.uint16()
        elif jvm_type.bitWidth == 32:
            return pa.uint32()
        elif jvm_type.bitWidth == 64:
            return pa.uint64()
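# The mapping can be exercised without a JVM via a stand-in object exposing
# the same two attributes (purely illustrative; assumes the function name
# reconstructed above):
from types import SimpleNamespace

assert _from_jvm_int_type(SimpleNamespace(isSigned=True, bitWidth=32)) == pa.int32()
assert _from_jvm_int_type(SimpleNamespace(isSigned=False, bitWidth=16)) == pa.uint16()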
# (snippet truncated above; this is the tail of the timestamp branch, with the
#  enclosing loop and try reconstructed from context)
    for t in _col:
        try:
            _converted_col.append(pd.to_datetime(t, format=date_format))
        except pd.errors.OutOfBoundsDatetime:
            # Clamp timestamps pandas cannot represent to its maximum value.
            _converted_col.append(pd.Timestamp.max)
    array_data.append(pa.Array.from_pandas(pd.to_datetime(_converted_col),
                                           type=pa.timestamp('ns')))
elif column.type.id == pa.date32().id:
    # list() so pa.array sees a sized sequence rather than a lazy map object.
    _converted_col = list(map(_date_converter, _col))
    array_data.append(pa.array(_converted_col, type=pa.date32()))
# Float types are ambiguous for conversions; we need to specify the exact type.
elif column.type.id == pa.float64().id:
    array_data.append(pa.array(_col, type=pa.float64()))
elif column.type.id == pa.float32().id:
    # Python doesn't have a native float32 type,
    # and PyArrow cannot cast float64 -> float32.
    _col = pd.to_numeric(_col, downcast='float')
    array_data.append(pa.Array.from_pandas(_col, type=pa.float32()))
elif column.type.id == pa.int32().id:
    # PyArrow 0.8.0 can cast int64 -> int32.
    _col64 = pa.array(_col, type=pa.int64())
    array_data.append(_col64.cast(pa.int32()))
elif column.type.id == pa.bool_().id:
    _col = list(map(_boolean_converter, _col))
    array_data.append(pa.array(_col, type=column.type))
else:
    array_data.append(pa.array(_col, type=column.type))

if isinstance(field_aliases, dict):
    schema_names.append(field_aliases.get(column.name, column.name))
else:
    schema_names.append(column.name)

return pa.RecordBatch.from_arrays(array_data, schema_names)
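# For context, RecordBatch.from_arrays pairs each array with a name; a minimal
# standalone example of the call that ends the function above:
import pyarrow as pa

arrays = [pa.array([1, 2], type=pa.int32()), pa.array(['a', 'b'])]
batch = pa.RecordBatch.from_arrays(arrays, ['id', 'label'])
assert batch.schema.names == ['id', 'label']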