How to use the pyarrow.parquet.write_table function in pyarrow

To help you get started, we’ve selected a few examples of pyarrow.parquet.write_table, based on popular ways it is used in public projects.

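As a quick orientation, here is a minimal sketch of the core pattern the examples below share: convert your data to a pyarrow.Table and pass it, together with a destination path or file object, to pyarrow.parquet.write_table. The file name and column values here are placeholders.

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

# Build an Arrow table from an in-memory pandas DataFrame
df = pd.DataFrame({"id": [1, 2, 3], "value": ["a", "b", "c"]})
table = pa.Table.from_pandas(df)

# Write it out as Parquet; compression and row_group_size are optional
pq.write_table(table, "example.parquet", compression="snappy", row_group_size=1000)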

github quantumblacklabs / kedro / tests / contrib / io / gcs / test_parquet_gcs.py
def write_table(table, where, filesystem, **kwargs):  # pylint: disable=unused-argument
    path = str(filesystem.tmp_path / FILENAME)
    filesystem.files[str(where)] = path
    pq.write_table(table, path)
github quantumblacklabs / kedro / tests / contrib / io / parquet / test_parquet_s3.py
def mocked_s3_object_versioned(
    tmp_path, mocked_s3_bucket, dummy_dataframe, save_version
):
    """Create versioned test data and add it to mocked S3 bucket."""
    table = pa.Table.from_pandas(dummy_dataframe)
    temporary_path = tmp_path / FILENAME
    pq.write_table(table, str(temporary_path))

    mocked_s3_bucket.put_object(
        Bucket=BUCKET_NAME,
        Key="{0}/{1}/{0}".format(FILENAME, save_version),
        Body=temporary_path.read_bytes(),
    )
    return mocked_s3_bucket
github HumanCellAtlas / table-testing / create_data / converters.py
def convert_to_parquet(df, row_group_size, compression):
    """Convert a dataframe of expression values to a parquet file."""

    path = _get_temp_path(".parquet")
    qcs = fake_qc_values(NUM_QC_VALUES, df.index, seed=df.values.sum())
    full_df = pandas.concat([df, qcs], axis=1)
    table = pyarrow.Table.from_pandas(full_df)
    pyarrow.parquet.write_table(table, path, row_group_size=row_group_size,
                                compression=compression)

    return path
github gojek / feast / sdk / python / feast / client.py
            table = json.read_json(filename)
        else:
            table = pq.read_table(file_path)
    else:
        raise ValueError(f"Unknown data source provided for ingestion: {source}")

    # Ensure that PyArrow table is initialised
    assert isinstance(table, pa.lib.Table)

    # Write table as parquet file with a specified row_group_size
    dir_path = tempfile.mkdtemp()
    tmp_table_name = f"{int(time.time())}.parquet"
    dest_path = f"{dir_path}/{tmp_table_name}"
    row_group_size = min(ceil(table.num_rows / max_workers), chunk_size)
    pq.write_table(table=table, where=dest_path, row_group_size=row_group_size)

    # Remove table from memory
    del table

    return dir_path, dest_path
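
The Feast snippet sizes row groups so the file can later be split across max_workers. If you want to confirm how a file was actually chunked, the row-group layout can be read back from the file's metadata; the path below is a placeholder.

import pyarrow.parquet as pq

# Inspect the row-group layout of a previously written Parquet file
pf = pq.ParquetFile("example.parquet")
print(pf.metadata.num_row_groups)          # number of row groups written
print(pf.metadata.row_group(0).num_rows)   # rows in the first row group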
github cldellow / csv2parquet / csv2parquet / csv2parquet.py
                columns = [[] for x in range(len(column_names))]

            if rownum == max_rows:
                break

    if columns and any(columns):
        add_arrays(columns)

    data = [
        pa.array([item.as_py() for sublist in arr for item in sublist], type=types[idx][0]) if keep[idx] else None
        for idx, arr in enumerate(arrs)]
    data = [x for x in data if x is not None]
    batch = pa.RecordBatch.from_arrays(data, [column_names[x] for x in range(len(arrs)) if keep[x]])
    table = pa.Table.from_batches([batch])

    pq.write_table(table,
                   output_file,
                   version='1.0',
                   compression=codec,
                   use_dictionary=True,
                   row_group_size=row_group_size)
github awslabs / aws-data-wrangler / awswrangler / pandas.py
def _write_parquet_to_s3_retrying(fs: Any, path: str, table: pa.Table, compression: str) -> None:
        with fs.open(path, "wb") as f:
            pq.write_table(table, f, compression=compression, coerce_timestamps="ms", flavor="spark")
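
The aws-data-wrangler snippet passes an open binary file object from an s3fs-style filesystem instead of a local path, which write_table accepts directly. As an alternative sketch (assuming a recent pyarrow with the filesystem keyword and AWS credentials in the environment; bucket and key are placeholders), the same write can go through pyarrow.fs:

import pyarrow as pa
import pyarrow.fs as pafs
import pyarrow.parquet as pq

table = pa.table({"id": [1, 2, 3]})  # placeholder data

# Assumes credentials are configured in the environment and the bucket exists
s3 = pafs.S3FileSystem(region="us-east-1")
pq.write_table(table, "my-bucket/path/data.parquet", filesystem=s3, compression="snappy")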
github andrewgross / json2parquet / json2parquet / client.py
def write_parquet(data, destination, **kwargs):
    """
    data: PyArrow record batch
    destination: Output file name

    **kwargs: defined at https://arrow.apache.org/docs/python/generated/pyarrow.parquet.write_table.html
    """
    try:
        table = pa.Table.from_batches(data)
    except TypeError:
        table = pa.Table.from_batches([data])
    pq.write_table(table, destination, **kwargs)
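
A possible usage of the wrapper above, with a hypothetical record batch built from Python lists; any keyword accepted by pyarrow.parquet.write_table (compression, row_group_size, use_dictionary, and so on) is passed straight through.

import pyarrow as pa

# Hypothetical record batch standing in for json2parquet's parsed data
batch = pa.RecordBatch.from_arrays(
    [pa.array([1, 2, 3]), pa.array(["x", "y", "z"])],
    names=["id", "label"],
)
write_parquet(batch, "output.parquet", compression="snappy", row_group_size=10000)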
github metaspace2020 / metaspace / metaspace / mol-db / app / isotope_storage.py
            p = p.centroids(instrument_model).trimmed(ISOTOPIC_PEAK_N)
            p.sortByMass()
            masses.append(p.masses)
            intensities.append(p.intensities)

        df = pd.DataFrame({
            'mf': valid_mfs,
            'mzs': masses,
            'intensities': intensities
        })
        df['adduct'] = adduct
        if existing_df is not None and not existing_df.empty:
            df = pd.concat([existing_df, df])

        table = pyarrow.Table.from_pandas(df)
        pyarrow.parquet.write_table(table, fn)

        logger.info('wrote {} NEW isotope patterns to {}'.format(len(valid_mfs), fn))
github ICB-DCM / pyABC / pyabc / storage / dataframe_bytes_storage.py
def df_to_bytes_parquet_(df: pd.DataFrame) -> bytes:
    """
    pyarrow parquet is the standard conversion method of pandas
    DataFrames since pyabc 0.9.14, because msgpack became
    deprecated in pandas 0.25.0.
    """
    b = BytesIO()
    table = pyarrow.Table.from_pandas(df)
    parquet.write_table(table, b)
    b.seek(0)
    return b.read()
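
For symmetry, a minimal sketch of the reverse conversion, reading the Parquet bytes back into a DataFrame; the function name is illustrative and not part of pyabc.

from io import BytesIO

import pandas as pd
import pyarrow.parquet as parquet


def bytes_to_df_parquet_(b: bytes) -> pd.DataFrame:
    """Illustrative inverse of df_to_bytes_parquet_ above."""
    return parquet.read_table(BytesIO(b)).to_pandas()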
github holoviz / spatialpandas / spatialpandas / dask.py
def write_concatted_part(part_df, part_output_path, md_list):
            with filesystem.open(part_output_path, 'wb') as f:
                pq.write_table(
                    pa.Table.from_pandas(part_df),
                    f, compression=compression, metadata_collector=md_list
                )
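
The metadata_collector list in the spatialpandas snippet gathers one FileMetaData object per written part; those entries are typically combined afterwards into a dataset-level _metadata file. A minimal end-to-end sketch of that pattern (file names are placeholders):

import pyarrow as pa
import pyarrow.parquet as pq

table = pa.table({"id": [1, 2, 3]})  # placeholder data
md_list = []

# Write two parts, collecting each part's FileMetaData
pq.write_table(table, "part-0.parquet", metadata_collector=md_list)
pq.write_table(table, "part-1.parquet", metadata_collector=md_list)

# Record each part's path relative to the dataset root
md_list[0].set_file_path("part-0.parquet")
md_list[1].set_file_path("part-1.parquet")

# Combine the collected metadata into a dataset-level _metadata file
pq.write_metadata(table.schema, "_metadata", metadata_collector=md_list)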