How to use the pyarrow.timestamp function in pyarrow

To help you get started, we’ve selected a few examples of pyarrow.timestamp, based on popular ways it is used in public projects.

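pa.timestamp(unit, tz=None) constructs a timestamp DataType: unit is one of 's', 'ms', 'us' or 'ns', and tz is an optional time zone string. A minimal sketch of typical calls:

import pyarrow as pa

t1 = pa.timestamp('ns')            # timestamp[ns], time-zone-naive
t2 = pa.timestamp('ms', tz='UTC')  # timestamp[ms, tz=UTC]
# Integers are interpreted as epoch values in the type's unit.
arr = pa.array([0, 1577836800], type=pa.timestamp('s'))
print(t1, t2, arr.type)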

github apache / arrow / python / pyarrow / jvm.py View on Github
import pyarrow as pa

# Enclosing `def` reconstructed so the excerpt stands alone.
def _from_jvm_timestamp_type(jvm_type):
    """
    Convert a JVM timestamp type to its Python equivalent.

    Parameters
    ----------
    jvm_type: org.apache.arrow.vector.types.pojo.ArrowType$Timestamp

    Returns
    -------
    typ: pyarrow.DataType
    """
    time_unit = jvm_type.getUnit().toString()
    timezone = jvm_type.getTimezone()
    if time_unit == 'SECOND':
        return pa.timestamp('s', tz=timezone)
    elif time_unit == 'MILLISECOND':
        return pa.timestamp('ms', tz=timezone)
    elif time_unit == 'MICROSECOND':
        return pa.timestamp('us', tz=timezone)
    elif time_unit == 'NANOSECOND':
        return pa.timestamp('ns', tz=timezone)
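The if/elif ladder above could equally be written as a lookup table; a minimal sketch (the dict and function names are ours, not from the source):

import pyarrow as pa

# JVM TimeUnit names -> Arrow unit strings accepted by pa.timestamp.
_JVM_UNIT_TO_ARROW = {
    'SECOND': 's',
    'MILLISECOND': 'ms',
    'MICROSECOND': 'us',
    'NANOSECOND': 'ns',
}

def jvm_timestamp_to_arrow(jvm_type):
    # Same conversion as above, via a table instead of an if/elif chain.
    unit = _JVM_UNIT_TO_ARROW[jvm_type.getUnit().toString()]
    return pa.timestamp(unit, tz=jvm_type.getTimezone())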
github kylebarron / medicare_utils / medicare_utils / parquet.py View on Github
import numpy as np
import pyarrow as pa

# Hypothetical name for the enclosing function; `dtypes` maps column names
# to numpy dtypes. Earlier dtype branches are omitted from this excerpt,
# so the chain opens with `if` here.
def _schema_from_dtypes(dtypes):
    fields = []
    for varname, vartype in dtypes.items():
        if vartype == np.int64:
            fields.append(pa.field(varname, pa.int64()))
        elif vartype == np.uint8:
            fields.append(pa.field(varname, pa.uint8()))
        elif vartype == np.uint16:
            fields.append(pa.field(varname, pa.uint16()))
        elif vartype == np.uint32:
            fields.append(pa.field(varname, pa.uint32()))
        elif vartype == np.uint64:
            fields.append(pa.field(varname, pa.uint64()))
        elif vartype == np.bool_:
            fields.append(pa.field(varname, pa.bool_()))
        elif (vartype == object) | (vartype.name == 'category'):
            fields.append(pa.field(varname, pa.string()))
        elif np.issubdtype(vartype, np.datetime64):
            fields.append(pa.field(varname, pa.timestamp('ns')))

    assert len(dtypes) == len(fields)
    schema = pa.schema(fields)
    return schema
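For example, fed a pandas DataFrame's dtypes, a mapping like this turns every datetime64 column into a pa.timestamp('ns') field (function name as sketched above):

import pandas as pd

df = pd.DataFrame({
    "id": [1, 2],
    "ts": pd.to_datetime(["2020-01-01", "2020-01-02"]),
})
schema = _schema_from_dtypes(dict(df.dtypes))
print(schema.field("ts").type)  # timestamp[ns]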
github IntelPython / sdc / sdc / io / parquet_pio.py View on Github
import pyarrow as pa
from numba import types

# Hypothetical name for the enclosing function; `string_type` is sdc's
# internal string type, and signed-integer entries are omitted here.
def _pa_typ_to_numba_typ(pa_typ):
    _typ_map = {
        pa.uint8(): types.uint8,
        pa.uint16(): types.uint16,
        pa.uint32(): types.uint32,
        pa.uint64(): types.uint64,
        # float types (TODO: float16?)
        pa.float32(): types.float32,
        pa.float64(): types.float64,
        # String
        pa.string(): string_type,
        # date
        pa.date32(): types.NPDatetime('ns'),
        pa.date64(): types.NPDatetime('ns'),
        # time (TODO: time32, time64, ...)
        pa.timestamp('ns'): types.NPDatetime('ns'),
        pa.timestamp('us'): types.NPDatetime('ns'),
        pa.timestamp('ms'): types.NPDatetime('ns'),
        pa.timestamp('s'): types.NPDatetime('ns'),
    }
    if pa_typ not in _typ_map:
        raise ValueError("Arrow data type {} not supported yet".format(pa_typ))
    return _typ_map[pa_typ]
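The lookup works because pyarrow DataType objects hash and compare by value, so pa.timestamp('ns') constructed in two places is the same dict key. A quick check:

import pyarrow as pa

assert pa.timestamp('ns') == pa.timestamp('ns')
assert hash(pa.timestamp('ns')) == hash(pa.timestamp('ns'))
assert pa.timestamp('ns') != pa.timestamp('us')             # unit is part of the type
assert pa.timestamp('ns') != pa.timestamp('ns', tz='UTC')   # so is the time zone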
github JDASoftwareGroup / kartothek / kartothek / core / index.py View on Github
import pyarrow as pa
import pyarrow.parquet as pq

# Excerpt: ARROW_LARGER_EQ_0150, _fix_pyarrow_07992_table and
# _PARTITION_COLUMN_NAME come from the surrounding kartothek modules.
def _parquet_bytes_to_dict(column: str, index_buffer: bytes):
    reader = pa.BufferReader(index_buffer)
    # This could be done much more efficiently, but it would take a lot more
    # time to implement, so it will only be done on request.
    table = pq.read_table(reader)
    if ARROW_LARGER_EQ_0150:
        column_type = table.schema.field(column).type
    else:
        column_type = table.schema.field_by_name(column).type

    # `datetime.datetime` objects have at most microsecond precision, so arrow
    # parses the type as `pa.timestamp("us")`. Since the values are normalized
    # to `numpy.datetime64[ns]` anyway, we do not care about this and load the
    # column type as `pa.timestamp("ns")`.
    if column_type == pa.timestamp("us"):
        column_type = pa.timestamp("ns")

    df = _fix_pyarrow_07992_table(table).to_pandas()  # Could eventually be phased out

    index_dct = dict(
        zip(df[column].values, (list(x) for x in df[_PARTITION_COLUMN_NAME].values))
    )
    return index_dct, column_type
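The microsecond default is easy to observe: an array built from datetime.datetime values is inferred as timestamp[us] and must be cast explicitly to get nanoseconds. A minimal sketch:

import datetime
import pyarrow as pa

arr = pa.array([datetime.datetime(2020, 1, 2, 3, 4, 5)])
print(arr.type)                           # timestamp[us]
print(arr.cast(pa.timestamp("ns")).type)  # timestamp[ns]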
github xhochy / fletcher / fletcher / base.py View on Github
import datetime

import numpy as np
import pyarrow as pa

# Dict name reconstructed for this excerpt: maps an Arrow type id to the
# Python type of its scalar values.
_python_type_map = {
    pa.null().id: str,
    pa.bool_().id: bool,
    pa.int8().id: int,
    pa.uint8().id: int,
    pa.int16().id: int,
    pa.uint16().id: int,
    pa.int32().id: int,
    pa.uint32().id: int,
    pa.int64().id: int,
    pa.uint64().id: int,
    pa.float16().id: float,
    pa.float32().id: float,
    pa.float64().id: float,
    pa.date32().id: datetime.date,
    pa.date64().id: datetime.date,
    pa.timestamp("ms").id: datetime.datetime,
    pa.binary().id: bytes,
    pa.string().id: str,
    # Use any list type here, only LIST is important
    pa.list_(pa.string()).id: list,
    pa.duration("ns").id: datetime.timedelta,
}

_string_type_map = {"date64[ms]": pa.date64(), "string": pa.string()}

_examples = {
    pa.null(): pa.array([None, None], type=pa.null()),
    pa.bool_(): pa.array([None, True], type=pa.bool_()),
    pa.int8(): pa.array([None, -1], type=pa.int8()),
    pa.uint8(): pa.array([None, 1], type=pa.uint8()),
    pa.int16(): pa.array([None, -1], type=pa.int16()),
    pa.uint16(): pa.array([None, 1], type=pa.uint16()),
pa.uint32(): pa.array([None, 1], type=pa.uint32()),
    pa.int64(): pa.array([None, -1], type=pa.int64()),
    pa.uint64(): pa.array([None, 1], type=pa.uint64()),
    pa.float16(): pa.array([None, np.float16(-0.1)], type=pa.float16()),
    pa.float32(): pa.array([None, -0.1], type=pa.float32()),
    pa.float64(): pa.array([None, -0.1], type=pa.float64()),
    pa.date32(): pa.array([None, datetime.date(2010, 9, 8)], type=pa.date32()),
    pa.date64(): pa.array([None, datetime.date(2010, 9, 8)], type=pa.date64()),
    pa.timestamp("s"): pa.array(
        [None, datetime.datetime(2013, 12, 11, 10, 9, 8)], type=pa.timestamp("s")
    ),
    pa.timestamp("ms"): pa.array(
        [None, datetime.datetime(2013, 12, 11, 10, 9, 8, 1000)], type=pa.timestamp("ms")
    ),
    pa.timestamp("us"): pa.array(
        [None, datetime.datetime(2013, 12, 11, 10, 9, 8, 7)], type=pa.timestamp("us")
    ),
    pa.timestamp("ns"): pa.array(
        [None, datetime.datetime(2013, 12, 11, 10, 9, 8, 7)], type=pa.timestamp("ns")
    ),
    pa.binary(): pa.array([None, b"122"], type=pa.binary()),
    pa.string(): pa.array([None, "🤔"], type=pa.string()),
    pa.duration("s"): pa.array(
        [None, datetime.timedelta(seconds=9)], type=pa.duration("s")
    ),
    pa.duration("ms"): pa.array(
        [None, datetime.timedelta(milliseconds=8)], type=pa.duration("ms")
    ),
    pa.duration("us"): pa.array(
        [None, datetime.timedelta(microseconds=7)], type=pa.duration("us")
    ),
    pa.duration("ns"): pa.array(
github andrewgross / json2parquet / json2parquet / client.py View on Github
    # Excerpt from the function that converts rows of dicts into Arrow arrays;
    # `data`, `schema`, `column_data`, `array_data` and `date_format` are
    # defined earlier in the enclosing function.
    schema_names = []
    for row in data:
        for column in schema.names:
            _col = column_data.get(column, [])
            _col.append(row.get(column))
            column_data[column] = _col
    for column in schema:
        _col = column_data.get(column.name)
        if isinstance(column.type, pa.lib.TimestampType):
            _converted_col = []
            for t in _col:
                try:
                    _converted_col.append(pd.to_datetime(t, format=date_format))
                except pd._libs.tslib.OutOfBoundsDatetime:
                    _converted_col.append(pd.Timestamp.max)
            array_data.append(pa.Array.from_pandas(pd.to_datetime(_converted_col), type=pa.timestamp('ns')))
        elif column.type.id == pa.date32().id:
            _converted_col = map(_date_converter, _col)
            array_data.append(pa.array(_converted_col, type=pa.date32()))
        # Float types are ambiguous for conversions, so the exact type must be specified
        elif column.type.id == pa.float64().id:
            array_data.append(pa.array(_col, type=pa.float64()))
        elif column.type.id == pa.float32().id:
            # Python doesn't have a native float32 type
            # and PyArrow cannot cast float64 -> float32
            _col = pd.to_numeric(_col, downcast='float')
            array_data.append(pa.Array.from_pandas(_col, type=pa.float32()))
        elif column.type.id == pa.int32().id:
            # PyArrow 0.8.0 can cast int64 -> int32
            _col64 = pa.array(_col, type=pa.int64())
            array_data.append(_col64.cast(pa.int32()))
        elif column.type.id == pa.bool_().id: