def _from_jvm_timestamp_type(jvm_type):
    """
    Convert a JVM timestamp type to its Python equivalent.

    Parameters
    ----------
    jvm_type: org.apache.arrow.vector.types.pojo.ArrowType$Timestamp

    Returns
    -------
    typ: pyarrow.DataType
    """
    time_unit = jvm_type.getUnit().toString()
    timezone = jvm_type.getTimezone()
    if time_unit == 'SECOND':
        return pa.timestamp('s', tz=timezone)
    elif time_unit == 'MILLISECOND':
        return pa.timestamp('ms', tz=timezone)
    elif time_unit == 'MICROSECOND':
        return pa.timestamp('us', tz=timezone)
    elif time_unit == 'NANOSECOND':
        return pa.timestamp('ns', tz=timezone)
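# A minimal usage sketch for the helper above, assuming a JVM started via
# jpype with the Arrow Java jars on the classpath. The jar path below is a
# placeholder, and automatic Java-to-Python string conversion is assumed:
import jpype
import pyarrow as pa

jpype.startJVM(classpath=["path/to/arrow-vector.jar"])  # placeholder classpath
Timestamp = jpype.JClass("org.apache.arrow.vector.types.pojo.ArrowType$Timestamp")
TimeUnit = jpype.JClass("org.apache.arrow.vector.types.TimeUnit")

jvm_type = Timestamp(TimeUnit.MICROSECOND, "UTC")
assert _from_jvm_timestamp_type(jvm_type) == pa.timestamp("us", tz="UTC")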
# Excerpt of a dtype-to-schema helper: branches for the narrower signed
# integer dtypes precede this point but are not part of the snippet.
elif vartype == np.int64:
    fields.append(pa.field(varname, pa.int64()))
elif vartype == np.uint8:
    fields.append(pa.field(varname, pa.uint8()))
elif vartype == np.uint16:
    fields.append(pa.field(varname, pa.uint16()))
elif vartype == np.uint32:
    fields.append(pa.field(varname, pa.uint32()))
elif vartype == np.uint64:
    fields.append(pa.field(varname, pa.uint64()))
elif vartype == np.bool_:
    fields.append(pa.field(varname, pa.bool_()))
elif vartype == object or vartype.name == 'category':
    fields.append(pa.field(varname, pa.string()))
elif np.issubdtype(vartype, np.datetime64):
    fields.append(pa.field(varname, pa.timestamp('ns')))

assert len(dtypes) == len(fields)
schema = pa.schema(fields)
return schema
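# For comparison, pyarrow can infer an equivalent schema directly from a
# DataFrame. Note that, unlike the hand-rolled mapping above (which maps
# categoricals to pa.string()), inference yields a dictionary type for them.
# Column names here are made up:
import numpy as np
import pandas as pd
import pyarrow as pa

df = pd.DataFrame({
    "a": np.array([1, 2], dtype=np.int64),
    "b": np.array([True, False]),
    "c": pd.Series(["x", "y"], dtype="category"),
})
schema = pa.Schema.from_pandas(df, preserve_index=False)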
# Excerpt of an Arrow-to-numba type mapping; entries before uint8 (e.g. the
# signed integers) are not part of the snippet.
_typ_map = {
    pa.uint8(): types.uint8,
    pa.uint16(): types.uint16,
    pa.uint32(): types.uint32,
    pa.uint64(): types.uint64,
    # float types (TODO: float16?)
    pa.float32(): types.float32,
    pa.float64(): types.float64,
    # String
    pa.string(): string_type,
    # date
    pa.date32(): types.NPDatetime('ns'),
    pa.date64(): types.NPDatetime('ns'),
    # time (TODO: time32, time64, ...)
    pa.timestamp('ns'): types.NPDatetime('ns'),
    pa.timestamp('us'): types.NPDatetime('ns'),
    pa.timestamp('ms'): types.NPDatetime('ns'),
    pa.timestamp('s'): types.NPDatetime('ns'),
}
if pa_typ not in _typ_map:
    raise ValueError("Arrow data type {} not supported yet".format(pa_typ))
return _typ_map[pa_typ]
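# A lookup sketch for the mapping above, assuming ``types`` is numba's type
# module (``numba.core.types``). PyArrow DataType instances hash and compare
# by value, so a plain dict lookup is all that is needed:
import pyarrow as pa
from numba.core import types

demo_map = {pa.float64(): types.float64, pa.timestamp('ns'): types.NPDatetime('ns')}
assert demo_map[pa.float64()] is types.float64
assert pa.list_(pa.int32()) not in demo_map  # would hit the ValueError branch above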
def _parquet_bytes_to_dict(column: str, index_buffer: bytes):
    reader = pa.BufferReader(index_buffer)
    # This could be done much more efficiently but would take a lot more
    # time to implement, so it will only be done on request.
    table = pq.read_table(reader)
    if ARROW_LARGER_EQ_0150:
        column_type = table.schema.field(column).type
    else:
        column_type = table.schema.field_by_name(column).type

    # `datetime.datetime` objects have a precision of up to microseconds only,
    # so arrow parses the type to `pa.timestamp("us")`. Since the values are
    # normalized to `numpy.datetime64[ns]` anyway, we do not care about this
    # and load the column type as `pa.timestamp("ns")`.
    if column_type == pa.timestamp("us"):
        column_type = pa.timestamp("ns")

    df = _fix_pyarrow_07992_table(table).to_pandas()  # Could eventually be phased out

    index_dct = dict(
        zip(df[column].values, (list(x) for x in df[_PARTITION_COLUMN_NAME].values))
    )
    return index_dct, column_type
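# Why the "us" -> "ns" normalization above is safe: Arrow infers microsecond
# precision from ``datetime.datetime`` values, and the cast to nanoseconds is
# lossless for in-range data. A small sketch:
import datetime
import pyarrow as pa

arr = pa.array([datetime.datetime(2020, 1, 2, 3, 4, 5)])
assert arr.type == pa.timestamp("us")
assert arr.cast(pa.timestamp("ns")).type == pa.timestamp("ns")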
# Excerpt of a mapping from Arrow type ids to Python types; the name below is
# a stand-in, since the original assignment is not part of the snippet.
_arrow_id_to_python = {
    pa.null().id: str,
    pa.bool_().id: bool,
    pa.int8().id: int,
    pa.uint8().id: int,
    pa.int16().id: int,
    pa.uint16().id: int,
    pa.int32().id: int,
    pa.uint32().id: int,
    pa.int64().id: int,
    pa.uint64().id: int,
    pa.float16().id: float,
    pa.float32().id: float,
    pa.float64().id: float,
    pa.date32().id: datetime.date,
    pa.date64().id: datetime.date,
    pa.timestamp("ms").id: datetime.datetime,
    pa.binary().id: bytes,
    pa.string().id: str,
    # Use any list type here, only LIST is important
    pa.list_(pa.string()).id: list,
    pa.duration("ns").id: datetime.timedelta,
}
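# Note on the id-keyed mapping above: ``.id`` identifies the type class, not
# its parameters, so a single ``pa.timestamp("ms")`` entry and a single list
# entry cover every unit and value type:
import pyarrow as pa

assert pa.timestamp("s").id == pa.timestamp("ns").id
assert pa.list_(pa.int8()).id == pa.list_(pa.string()).id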
_string_type_map = {"date64[ms]": pa.date64(), "string": pa.string()}
_examples = {
    pa.null(): pa.array([None, None], type=pa.null()),
    pa.bool_(): pa.array([None, True], type=pa.bool_()),
    pa.int8(): pa.array([None, -1], type=pa.int8()),
    pa.uint8(): pa.array([None, 1], type=pa.uint8()),
    pa.int16(): pa.array([None, -1], type=pa.int16()),
    pa.uint16(): pa.array([None, 1], type=pa.uint16()),
    pa.uint32(): pa.array([None, 1], type=pa.uint32()),
    pa.int64(): pa.array([None, -1], type=pa.int64()),
    pa.uint64(): pa.array([None, 1], type=pa.uint64()),
    pa.float16(): pa.array([None, np.float16(-0.1)], type=pa.float16()),
    pa.float32(): pa.array([None, -0.1], type=pa.float32()),
    pa.float64(): pa.array([None, -0.1], type=pa.float64()),
    pa.date32(): pa.array([None, datetime.date(2010, 9, 8)], type=pa.date32()),
    pa.date64(): pa.array([None, datetime.date(2010, 9, 8)], type=pa.date64()),
    pa.timestamp("s"): pa.array(
        [None, datetime.datetime(2013, 12, 11, 10, 9, 8)], type=pa.timestamp("s")
    ),
    pa.timestamp("ms"): pa.array(
        [None, datetime.datetime(2013, 12, 11, 10, 9, 8, 1000)], type=pa.timestamp("ms")
    ),
    pa.timestamp("us"): pa.array(
        [None, datetime.datetime(2013, 12, 11, 10, 9, 8, 7)], type=pa.timestamp("us")
    ),
    pa.timestamp("ns"): pa.array(
        [None, datetime.datetime(2013, 12, 11, 10, 9, 8, 7)], type=pa.timestamp("ns")
    ),
    pa.binary(): pa.array([None, b"122"], type=pa.binary()),
    pa.string(): pa.array([None, "🤔"], type=pa.string()),
    pa.duration("s"): pa.array(
        [None, datetime.timedelta(seconds=9)], type=pa.duration("s")
    ),
    pa.duration("ms"): pa.array(
        [None, datetime.timedelta(milliseconds=8)], type=pa.duration("ms")
    ),
    pa.duration("us"): pa.array(
        [None, datetime.timedelta(microseconds=7)], type=pa.duration("us")
    ),
    pa.duration("ns"): pa.array(
        # Value assumed by analogy with the entries above; the original
        # snippet is cut off at this entry.
        [None, datetime.timedelta(microseconds=6)], type=pa.duration("ns")
    ),
}
# Excerpt of a rows-to-Arrow conversion loop; the surrounding function header
# is not part of the snippet. ``column_data`` and ``array_data`` are
# initialized here so the excerpt stands alone.
schema_names = []
column_data = {}
array_data = []
for row in data:
    for column in schema.names:
        _col = column_data.get(column, [])
        _col.append(row.get(column))
        column_data[column] = _col
for column in schema:
    _col = column_data.get(column.name)
    if pa.types.is_timestamp(column.type):
        _converted_col = []
        for t in _col:
            try:
                _converted_col.append(pd.to_datetime(t, format=date_format))
            except pd.errors.OutOfBoundsDatetime:
                # Clamp values outside the representable nanosecond range.
                _converted_col.append(pd.Timestamp.max)
        array_data.append(
            pa.Array.from_pandas(pd.to_datetime(_converted_col), type=pa.timestamp('ns'))
        )
    elif column.type.id == pa.date32().id:
        _converted_col = [_date_converter(d) for d in _col]
        array_data.append(pa.array(_converted_col, type=pa.date32()))
    # Float types are ambiguous for conversions; specify the exact type.
    elif column.type.id == pa.float64().id:
        array_data.append(pa.array(_col, type=pa.float64()))
    elif column.type.id == pa.float32().id:
        # Python has no native float32 type, and older PyArrow could not
        # cast float64 -> float32, so downcast via pandas first.
        _col = pd.to_numeric(_col, downcast='float')
        array_data.append(pa.Array.from_pandas(_col, type=pa.float32()))
    elif column.type.id == pa.int32().id:
        # PyArrow 0.8.0 can cast int64 -> int32
        _col64 = pa.array(_col, type=pa.int64())
        array_data.append(_col64.cast(pa.int32()))
    elif column.type.id == pa.bool_().id:
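# A standalone sketch of the float32 branch above: pandas downcasts to numpy
# float32 first, then PyArrow ingests the values at the exact requested type.
# The input values here are made up; any float-like sequence works:
import pandas as pd
import pyarrow as pa

values = ["0.5", "1.25", None]
downcast = pd.to_numeric(values, downcast="float")  # numpy float32 ndarray
arr = pa.Array.from_pandas(pd.Series(downcast), type=pa.float32())
assert arr.type == pa.float32()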