How to use the pyarrow.parquet module in pyarrow

To help you get started, we’ve selected a few pyarrow.parquet examples based on common ways it is used in public projects.

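Most of the snippets below follow the same basic pattern: build a pyarrow.Table (often from a pandas DataFrame), write it with pq.write_table, and read it back with pq.read_table or pq.ParquetDataset. A minimal sketch of that round trip (the file name and column names are placeholders):

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})
table = pa.Table.from_pandas(df)          # pandas -> Arrow table
pq.write_table(table, "example.parquet")  # write a Parquet file

# Read back only the columns you need.
subset = pq.read_table("example.parquet", columns=["a"])
print(subset.to_pandas())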

From JDASoftwareGroup/kartothek: tests/serialization/test_parquet.py (view on GitHub)
def test_index_metadata(store):
    key = "test.parquet"
    df = pd.DataFrame({"a": [1]})
    table = pa.Table.from_pandas(df)
    meta = b"""{
        "pandas_version": "0.20.3",
        "index_columns": ["__index_level_0__"],
        "columns": [
            {"metadata": null, "name": "a", "numpy_type": "int64", "pandas_type": "int64"}
        ]
    }"""
    table = table.replace_schema_metadata({b"pandas": meta})
    buf = pa.BufferOutputStream()
    pq.write_table(table, buf)
    store.put(key, buf.getvalue().to_pybytes())
    pdt.assert_frame_equal(DataFrameSerializer.restore_dataframe(store, key), df)
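
The test above writes Parquet to an in-memory pa.BufferOutputStream instead of a file; the same bytes can be read back with pa.BufferReader. A minimal self-contained sketch of that in-memory round trip (without the test's custom metadata):

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

df = pd.DataFrame({"a": [1]})
buf = pa.BufferOutputStream()
pq.write_table(pa.Table.from_pandas(df), buf)

# Turn the written bytes back into a table without touching disk.
roundtripped = pq.read_table(pa.BufferReader(buf.getvalue()))
print(roundtripped.to_pandas())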

From awslabs/aws-data-wrangler: awswrangler/pandas.py (view on GitHub)
        :param columns: Names of columns to read from the file
        :param filters: List of filters to apply, like ``[[('x', '=', 0), ...], ...]``.
        :param procs_cpu_bound: Number of cores used for CPU bound tasks
        """
        session = session_primitives.session
        is_file: bool = session.s3.does_object_exists(path=path)
        if is_file is False:
            path = path[:-1] if path[-1] == "/" else path
        procs_cpu_bound = (
            procs_cpu_bound
            if procs_cpu_bound is not None
            else session_primitives.procs_cpu_bound
            if session_primitives.procs_cpu_bound is not None
            else 1
        )
        use_threads: bool = True if procs_cpu_bound > 1 else False
        logger.debug(f"Reading Parquet: {path}")
        if is_file is True:
            client_s3 = session.boto3_session.client(service_name="s3", use_ssl=True, config=session.botocore_config)
            bucket, key = path.replace("s3://", "").split("/", 1)
            obj = client_s3.get_object(Bucket=bucket, Key=key)
            table = pq.ParquetFile(source=BytesIO(obj["Body"].read())).read(columns=columns, use_threads=use_threads)
        else:
            fs: S3FileSystem = s3.get_fs(session_primitives=session_primitives)
            fs = pa.filesystem._ensure_filesystem(fs)
            fs.invalidate_cache()
            table = pq.read_table(source=path, columns=columns, filters=filters, filesystem=fs, use_threads=use_threads)
        # Check whether integer columns lost their dtype in the conversion (happens when a column has null values)
        integers = [field.name for field in table.schema if str(field.type).startswith("int")]
        logger.debug(f"Converting to Pandas: {path}")
        df = table.to_pandas(use_threads=use_threads, integer_object_nulls=True)
        for c in integers:
            if not str(df[c].dtype).startswith("int"):
                df[c] = df[c].astype("Int64")
        logger.debug(f"Done: {path}")
        return df
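
The integer check at the end works around a pandas quirk: an integer column that contains nulls loses its integer dtype when converted back to pandas, unless it is recast to the nullable Int64 extension dtype. A minimal local sketch of the same fix (the column name is made up):

import pyarrow as pa

# An int64 column with a null becomes float64 when converted to pandas...
table = pa.table({"a": pa.array([1, None, 3], type=pa.int64())})
df = table.to_pandas()
print(df["a"].dtype)   # float64

# ...unless it is recast to pandas' nullable Int64 dtype, as above.
df["a"] = df["a"].astype("Int64")
print(df["a"].dtype)   # Int64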

From JDASoftwareGroup/kartothek: kartothek/io_components/metapartition.py (view on GitHub)
        Returns
        -------
        pd.DataFrame
          A DataFrame with relevant parquet metadata
        """
        if not isinstance(table_name, str):
            raise TypeError("Expecting a string for parameter `table_name`.")

        if callable(store):
            store = store()

        data = {}
        if table_name in self.files:
            with store.open(self.files[table_name]) as fd:  # type: ignore
                pq_metadata = pa.parquet.ParquetFile(fd).metadata
            try:
                metadata_dict = pq_metadata.to_dict()
            except AttributeError:  # No data in file
                metadata_dict = None
                data = {
                    "partition_label": self.label,
                    "serialized_size": pq_metadata.serialized_size,
                    "number_rows_total": pq_metadata.num_rows,
                    "number_row_groups": pq_metadata.num_row_groups,
                    "row_group_id": [0],
                    "number_rows_per_row_group": [0],
                    "row_group_compressed_size": [0],
                    "row_group_uncompressed_size": [0],
                }

            if metadata_dict:
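
The fields collected above come from pq.ParquetFile(...).metadata, a FileMetaData object that exposes file-level counts and per-row-group statistics. A minimal sketch against a local file (the path is a placeholder):

import pyarrow.parquet as pq

md = pq.ParquetFile("example.parquet").metadata   # FileMetaData
print(md.num_rows, md.num_row_groups, md.serialized_size)

for i in range(md.num_row_groups):
    rg = md.row_group(i)                          # RowGroupMetaData
    print(i, rg.num_rows, rg.total_byte_size)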

From dask/dask: dask/dataframe/io/parquet.py (view on GitHub)
def _write_partition_pyarrow(
    df, path, fs, filename, write_index, partition_on, metadata_path=None, **kwargs
):
    import pyarrow as pa
    from pyarrow import parquet

    t = pa.Table.from_pandas(df, preserve_index=write_index)

    if partition_on:
        parquet.write_to_dataset(
            t,
            path,
            partition_cols=partition_on,
            preserve_index=write_index,
            filesystem=fs,
            **kwargs
        )
    else:
        with fs.open(filename, "wb") as fil:
            parquet.write_table(t, fil, **kwargs)

    if metadata_path is not None:
        with fs.open(metadata_path, "wb") as fil:
            # Get only arguments specified in the function
            kwargs_meta = {
                k: v for k, v in kwargs.items() if k in _pyarrow_write_metadata_kwargs
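
The partition_on branch above relies on parquet.write_to_dataset, which writes one directory per distinct partition value instead of a single file. A minimal standalone sketch (the output path and column names are placeholders):

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

df = pd.DataFrame({"year": [2023, 2023, 2024], "value": [1.0, 2.0, 3.0]})
table = pa.Table.from_pandas(df, preserve_index=False)

# Produces out/year=2023/... and out/year=2024/... part files.
pq.write_to_dataset(table, root_path="out", partition_cols=["year"])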

From JDASoftwareGroup/kartothek: kartothek/core/common_metadata.py (view on GitHub)
def _schema2bytes(schema):
    buf = pa.BufferOutputStream()
    pq.write_metadata(schema, buf, version="2.0", coerce_timestamps="us")
    return buf.getvalue().to_pybytes()
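
pq.write_metadata serializes just an Arrow schema into a metadata-only Parquet footer, which is what lets _schema2bytes round-trip a schema through plain bytes. The reverse direction is pq.read_schema; a minimal sketch with a throwaway schema:

import pyarrow as pa
import pyarrow.parquet as pq

schema = pa.schema([("a", pa.int64()), ("b", pa.string())])

buf = pa.BufferOutputStream()
pq.write_metadata(schema, buf)                              # schema -> footer bytes
restored = pq.read_schema(pa.BufferReader(buf.getvalue()))  # bytes -> schema
assert restored.equals(schema)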

From dask/dask: dask/dataframe/io/parquet/arrow.py (view on GitHub)
if "_metadata" in fns and "validate_schema" not in dataset_kwargs:
            dataset_kwargs["validate_schema"] = False
        if "_metadata" in fns or gather_statistics is not False:
            # Let arrow do its thing (use _metadata or scan files)
            dataset = pq.ParquetDataset(
                paths, filesystem=fs, filters=filters, **dataset_kwargs
            )
        else:
            # Use _common_metadata file if it is available.
            # Otherwise, just use 0th file
            if "_common_metadata" in fns:
                dataset = pq.ParquetDataset(
                    base + fs.sep + "_common_metadata", filesystem=fs, **dataset_kwargs
                )
            else:
                dataset = pq.ParquetDataset(
                    allpaths[0], filesystem=fs, **dataset_kwargs
                )
            parts = [base + fs.sep + fn for fn in fns]
    else:
        # There is only one file to read
        dataset = pq.ParquetDataset(paths, filesystem=fs, **dataset_kwargs)
    return parts, dataset
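
The _metadata and _common_metadata files that this code looks for are dataset-level footers that let readers plan a scan without opening every part file. A minimal sketch of producing them, following the pattern in the pyarrow documentation (the table contents and the out path are placeholders):

import pyarrow as pa
import pyarrow.parquet as pq

table = pa.table({"year": [2023, 2024], "value": [1.0, 2.0]})

# Collect per-file row-group metadata while writing the dataset...
metadata_collector = []
pq.write_to_dataset(table, root_path="out", metadata_collector=metadata_collector)

# ...then write _common_metadata (schema only) and _metadata (schema plus row groups).
pq.write_metadata(table.schema, "out/_common_metadata")
pq.write_metadata(table.schema, "out/_metadata", metadata_collector=metadata_collector)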

From horovod/horovod: horovod/spark/common/store.py (view on GitHub)
    def get_parquet_dataset(self, path):
        return pq.ParquetDataset(self.get_localized_path(path), filesystem=self.get_filesystem())
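
pq.ParquetDataset accepts a single file or a directory of part files and exposes them as one logical table; a minimal sketch (the path is a placeholder):

import pyarrow.parquet as pq

dataset = pq.ParquetDataset("path/to/dataset")  # a file or a directory of part files
table = dataset.read()                          # or dataset.read(columns=[...]) for a subset
df = table.to_pandas()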

From bmoscon/cryptostore: cryptostore/data/parquet.py (view on GitHub)
                    file_name += f"{data_type}-"
                elif var == "exchange":
                    file_name += f"{exchange}-"
                elif var == "pair":
                    file_name += f"{pair}-"
                else:
                    print(var)
                    raise ValueError("Invalid file format specified for parquet file")
            file_name = file_name[:-1] + ".parquet"
        else:
            file_name = f'{exchange}-{data_type}-{pair}-{int(timestamp)}.parquet'

        if self.path:
            file_name = os.path.join(self.path, file_name)

        pq.write_table(self.data, file_name)
        self.data = None

        if self._write:
            for func, bucket, prefix, kwargs in zip(self._write, self.bucket, self.prefix, self.kwargs):
                path = f'{exchange}/{data_type}/{pair}/{exchange}-{data_type}-{pair}-{int(timestamp)}.parquet'
                if prefix:
                    path = f"{prefix}/{path}"
                func(bucket, path, file_name, **kwargs)
            if self.del_file:
                os.remove(file_name)

From vaexio/vaex: packages/vaex-arrow/vaex_arrow/export.py (view on GitHub)
def export_parquet(dataset, path, column_names=None, byteorder="=", shuffle=False, selection=False, progress=None, virtual=True, sort=None, ascending=True):
    table = _export_table(dataset, column_names, byteorder, shuffle, selection, progress, virtual, sort, ascending)
    pq.write_table(table, path)