import datetime
import random

import pyarrow


def test_iterate_over_string_chunk():
    # Seed with a float; random.seed() rejects datetime objects on
    # newer Python versions.
    random.seed(datetime.datetime.now().timestamp())
    column_meta = [
        {"logicalType": "TEXT"},
        {"logicalType": "TEXT"},
    ]
    field_foo = pyarrow.field("column_foo", pyarrow.string(), True, column_meta[0])
    field_bar = pyarrow.field("column_bar", pyarrow.string(), True, column_meta[1])
    pyarrow.schema([field_foo, field_bar])

    def str_generator():
        return str(random.randint(-100, 100))

    iterate_over_test_chunk([pyarrow.string(), pyarrow.string()],
                            column_meta, str_generator)
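# Quick standalone check (not part of the test above) that
# pyarrow.field(name, type, nullable, metadata) really carries the
# metadata; pyarrow stores metadata keys and values as bytes.
field = pyarrow.field("column_foo", pyarrow.string(), True, {"logicalType": "TEXT"})
assert field.metadata == {b"logicalType": b"TEXT"}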
pa.field("bool", pa.bool_()),
pa.field("byte", pa.binary()),
pa.field("date", pa.date32()),
pa.field("datetime64", pa.timestamp("us")),
pa.field("float32", pa.float64()),
pa.field("float64", pa.float64()),
pa.field("int16", pa.int64()),
pa.field("int32", pa.int64()),
pa.field("int64", pa.int64()),
pa.field("int8", pa.int64()),
pa.field("null", pa.null()),
pa.field("uint16", pa.uint64()),
pa.field("uint32", pa.uint64()),
pa.field("uint64", pa.uint64()),
pa.field("uint8", pa.uint64()),
pa.field("unicode", pa.string()),
]
expected_schema = pa.schema(fields)
assert actual_schema.remove_metadata() == expected_schema
)
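# Why remove_metadata() matters here (illustration, not from the test):
# strict schema comparison can be tripped up by metadata alone.
with_meta = pa.schema([pa.field("x", pa.int64())], metadata={"origin": "test"})
without_meta = pa.schema([pa.field("x", pa.int64())])
assert not with_meta.equals(without_meta, check_metadata=True)
assert with_meta.remove_metadata().equals(without_meta, check_metadata=True)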
xfail_list_setitem_not_implemented = pytest.mark.xfail_by_type_filter(
[pa.types.is_list], "__setitem__ is not implemented for lists"
)
xfail_missing_list_dict_encode = pytest.mark.xfail_by_type_filter(
[pa.types.is_list],
"ArrowNotImplementedError: dictionary-encode not implemented for list",
)
xfail_bool_too_few_uniques = pytest.mark.xfail_by_type_filter(
[pa.types.is_boolean], "Test requires at least 3 unique values"
)
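# Sketch (an assumption, not fletcher's actual plugin code) of how a
# custom "xfail_by_type_filter" mark could be honored from a conftest
# hook: if any type predicate matches the parametrized pyarrow type,
# attach an ordinary xfail marker with the stored reason. "pa_type" is
# a hypothetical parametrize argument name.
import pytest

def pytest_collection_modifyitems(config, items):
    for item in items:
        marker = item.get_closest_marker("xfail_by_type_filter")
        if marker is None or not hasattr(item, "callspec"):
            continue
        predicates, reason = marker.args
        pa_type = item.callspec.params.get("pa_type")
        if pa_type is not None and any(pred(pa_type) for pred in predicates):
            item.add_marker(pytest.mark.xfail(reason=reason))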
# Parametrization data for the dtype tests; "FletcherTestType" is the
# test suite's own container bundling sample data for one pyarrow type.
import string
from random import choices

test_types = [
FletcherTestType(
pa.string(),
["🙈", "Ö", "Č", "a", "B"] * 20,
[None, "A"],
["B", "B", None, None, "A", "A", "B", "C"],
["B", "C", "A"],
["B", None, "A"],
lambda: choices(list(string.ascii_letters), k=10),
),
FletcherTestType(
pa.bool_(),
[True, False, True, True, False] * 20,
[None, False],
[True, True, None, None, False, False, True, False],
[True, False, False],
[True, None, False],
lambda: choices([True, False], k=10),
),
]
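# Sanity check (assumed, not part of the suite) that the sample data
# above round-trips through pyarrow with the declared types.
arr = pa.array(["B", "B", None, None, "A", "A", "B", "C"], type=pa.string())
assert arr.null_count == 2
assert pa.array([True, None, False], type=pa.bool_()).type == pa.bool_()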
def get_pyarrow_translated_schema(string_schema):
    """
    Convert a string schema (a list of {'name': ..., 'type': ...} dicts)
    to a pyarrow schema for writing to parquet.

    :param string_schema: list of field dicts with 'name' and 'type' keys
    :return: pyarrow schema
    """
    # Types mapped to None (BYTE, GEOGRAPHY, RECORD) have no direct
    # pyarrow translation here. The source listed 'NUMERIC' twice with
    # the same value; the duplicate key is dropped.
    type_conversions = {
        'STRING': pa.string(),
        'NUMERIC': pa.int64(),
        'BYTE': None,
        'INTEGER': pa.int64(),
        'FLOAT': pa.float64(),
        'BOOLEAN': pa.bool_(),
        'TIMESTAMP': pa.timestamp('us'),
        'DATE': pa.date32(),
        'TIME': pa.time64('us'),
        'DATETIME': pa.timestamp('us'),
        'GEOGRAPHY': None,
        'RECORD': None,
    }
    pa_schema_list = []
    for field in string_schema:
        field_type = field['type']
        # Minimal completion of the truncated loop body: look up the
        # converted type and collect one pa.field per input field.
        pa_schema_list.append(pa.field(field['name'], type_conversions[field_type]))
    return pa.schema(pa_schema_list)
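# Example call (illustrative input, not from the original project):
example_schema = get_pyarrow_translated_schema([
    {'name': 'user_id', 'type': 'INTEGER'},
    {'name': 'email', 'type': 'STRING'},
    {'name': 'signup_ts', 'type': 'TIMESTAMP'},
])
assert example_schema.field('signup_ts').type == pa.timestamp('us')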
"IndexCol": {
IndexValue: ["partition_label"]
},
"SecondIndexCol": {
Value: ["partition_label"]
}
}
"""
if self.primary_indices_loaded:
return self
indices = _construct_dynamic_index_from_partitions(
partitions=self.partitions,
table_meta=self.table_meta,
default_dtype=pa.string() if self.metadata_version == 3 else None,
partition_keys=self.partition_keys,
)
combined_indices = self.indices.copy()
combined_indices.update(indices)
return self.copy(indices=combined_indices)
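# The copy()/update() pair above merges the freshly built indices into
# the existing ones without mutating self.indices in place; a minimal
# stand-alone illustration:
existing = {"IndexCol": {"a": ["p1"]}}
fresh = {"SecondIndexCol": {"b": ["p2"]}}
merged = existing.copy()
merged.update(fresh)
assert set(merged) == {"IndexCol", "SecondIndexCol"}
assert "SecondIndexCol" not in existing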
# Division-inference fragment over parquet row-group statistics. It
# assumes "import pandas as pd", "import pyarrow as pa", and
# "from distutils.version import LooseVersion" in the enclosing module;
# the outer "if divisions_name_in_schema:" is reconstructed from the
# dangling elif below.
if divisions_name_in_schema:
    if min_maxs:
        # We have min/max pairs: all the mins, plus the final max.
        divisions = [mn for mn, mx in min_maxs] + [min_maxs[-1][1]]
        if LooseVersion(pa.__version__) < LooseVersion("0.13.1"):
            # Handle conversion to pandas timestamp divisions
            index_field = pa_schema.field_by_name(divisions_name)
            if pa.types.is_timestamp(index_field.type):
                time_unit = index_field.type.unit
                divisions_ns = [_to_ns(d, time_unit) for d in divisions]
                divisions = [pd.Timestamp(ns) for ns in divisions_ns]
            # Handle encoding of bytes strings
            if index_field.type == pa.string():
                # Parquet strings are always encoded as utf-8
                encoding = "utf-8"
                divisions = [d.decode(encoding).strip() for d in divisions]
    else:  # pragma: no cover
        if infer_divisions is True:
            raise ValueError(
                (
                    "Unable to infer divisions for index of '{index_name}' "
                    "because it is not known to be "
                    "sorted across partitions"
                ).format(index_name=divisions_name_in_schema)
            )
        divisions = (None,) * (len(pa_pieces) + 1)
elif pa_pieces:
    # Body truncated in the source; pieces without statistics get
    # unknown divisions.
    divisions = (None,) * (len(pa_pieces) + 1)
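# Hypothetical helper in the spirit of "_to_ns" above (an assumption,
# not dask's implementation): scale an integer timestamp from its unit
# to nanoseconds so it can seed pd.Timestamp.
import pandas as pd

_NS_PER_UNIT = {"s": 10**9, "ms": 10**6, "us": 10**3, "ns": 1}

def _to_ns_sketch(value, unit):
    return value * _NS_PER_UNIT[unit]

assert pd.Timestamp(_to_ns_sketch(1, "s")) == pd.Timestamp("1970-01-01 00:00:01")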
print("----pyarrow version---")
print(pa.__version__)
writer = None
try:
for batch in iterator:
if is_dev:
print(batch.to_pandas())
if writer is None:
writer = pa.RecordBatchStreamWriter(stream, batch.schema)
writer.write_batch(batch)
# if iterator is empty, we should write default schema
if writer is None:
if is_dev:
print("----dump empty arrow---")
rb = pa.RecordBatch.from_arrays([[]], schema=pa.schema([('value', pa.string())]))
writer = pa.RecordBatchStreamWriter(stream, rb.schema)
writer.write_batch(rb)
finally:
if writer is not None:
writer.close()
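# Round-trip sketch (stand-alone, illustrative names): bytes produced by
# a RecordBatchStreamWriter can be read back with pyarrow's IPC reader.
import io
import pyarrow as pa

buf = io.BytesIO()
writer = pa.RecordBatchStreamWriter(buf, pa.schema([('value', pa.string())]))
writer.write_batch(
    pa.RecordBatch.from_arrays([pa.array(["a", "b"])], names=["value"])
)
writer.close()

reader = pa.ipc.open_stream(buf.getvalue())
assert reader.read_all().num_rows == 2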
# Fragment of a string-concatenation kernel (assumes "import numpy as np"
# and "import pyarrow as pa"): build a new Arrow string array from merged
# validity, offset, and data buffers. The enclosing "if len(a) > 0:" is
# reconstructed from the trailing "return a".
if len(a) > 0:
    result_offsets = np.empty(len(a) + 1, dtype=np.int32)
    result_offsets[0] = 0
    # The output size is the sum of both inputs' data-buffer extents.
    total_size = (offsets_a[-1] - offsets_a[0]) + (offsets_b[-1] - offsets_b[0])
    result_data = np.empty(total_size, dtype=np.uint8)
    _merge_string_data(
        len(a),
        valid,
        offsets_a,
        data_a,
        offsets_b,
        data_b,
        result_offsets,
        result_data,
    )
    # An Arrow string array is exactly these three buffers:
    # validity bitmap, int32 offsets, and utf-8 data.
    buffers = [pa.py_buffer(x) for x in [valid, result_offsets, result_data]]
    return pa.Array.from_buffers(pa.string(), len(a), buffers)
return a
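# Stand-alone illustration (assumed data) of pa.Array.from_buffers for
# strings: offsets [0, 2, 5] over data b"hihey" yield ["hi", "hey"].
import numpy as np
import pyarrow as pa

offsets = np.array([0, 2, 5], dtype=np.int32)
data = np.frombuffer(b"hihey", dtype=np.uint8)
arr = pa.Array.from_buffers(
    pa.string(),
    2,
    [None, pa.py_buffer(offsets), pa.py_buffer(data)],  # None = all valid
)
assert arr.to_pylist() == ["hi", "hey"]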
"""
if not isinstance(string, str):
raise TypeError(
"'construct_from_string' expects a string, got "
)
# Remove fletcher specific naming from the arrow type string.
if string.startswith("fletcher_chunked["):
string = string[len("fletcher_chunked[") : -1]
else:
raise TypeError(
f"Cannot construct a 'FletcherChunkedDtype' from '{string}'"
)
if string == "list":
return cls(pa.list_(pa.string()))
try:
type_for_alias = pa.type_for_alias(string)
except (ValueError, KeyError):
# pandas API expects a TypeError
raise TypeError(string)
return cls(type_for_alias)
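# Expected behavior sketch, assuming the class above is
# FletcherChunkedDtype (named in its own error message):
# construct_from_string("fletcher_chunked[string]") strips the prefix,
# resolves pa.type_for_alias("string") -> pa.string(), and wraps it.
dtype = FletcherChunkedDtype.construct_from_string("fletcher_chunked[string]")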
# javascript (the head of this field list is truncated in the source)
fields = [
    pa.field('operation', pa.string()),
    pa.field('value', pa.string()),
    pa.field('arguments', pa.string()),
    pa.field('time_stamp', pa.string(), nullable=False),
]
PQ_SCHEMAS['javascript'] = pa.schema(fields)
# javascript_cookies
fields = [
    pa.field('crawl_id', pa.uint32()),
    pa.field('visit_id', pa.int64()),
    pa.field('instance_id', pa.uint32(), nullable=False),
    pa.field('extension_session_uuid', pa.string()),
    pa.field('event_ordinal', pa.int64()),
    pa.field('record_type', pa.string()),
    pa.field('change_cause', pa.string()),
    pa.field('expiry', pa.string()),
    pa.field('is_http_only', pa.bool_()),
    pa.field('is_host_only', pa.bool_()),
    pa.field('is_session', pa.bool_()),
    pa.field('host', pa.string()),
    pa.field('is_secure', pa.bool_()),
    pa.field('name', pa.string()),
    pa.field('path', pa.string()),
    pa.field('value', pa.string()),
    pa.field('same_site', pa.string()),
    pa.field('first_party_domain', pa.string()),
    pa.field('store_id', pa.string()),
    pa.field('time_stamp', pa.string()),
]
PQ_SCHEMAS['javascript_cookies'] = pa.schema(fields)
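# Usage sketch (output file name assumed): materialize an empty table
# with one of these schemas and write it to parquet.
import pyarrow as pa
import pyarrow.parquet as pq

schema = PQ_SCHEMAS['javascript_cookies']
empty = pa.Table.from_pydict(
    {f.name: pa.array([], type=f.type) for f in schema}, schema=schema
)
pq.write_table(empty, 'javascript_cookies.parquet')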