def write_table(table, where, filesystem, **kwargs):  # pylint: disable=unused-argument
    # Write to a local temp file and record it in the mocked filesystem
    # under the requested ``where`` key.
    path = str(filesystem.tmp_path / FILENAME)
    filesystem.files[str(where)] = path
    pq.write_table(table, path)
def mocked_s3_object_versioned(
    tmp_path, mocked_s3_bucket, dummy_dataframe, save_version
):
    """Create versioned test data and add it to the mocked S3 bucket."""
    table = pa.Table.from_pandas(dummy_dataframe)
    temporary_path = tmp_path / FILENAME
    pq.write_table(table, str(temporary_path))

    mocked_s3_bucket.put_object(
        Bucket=BUCKET_NAME,
        Key="{0}/{1}/{0}".format(FILENAME, save_version),
        Body=temporary_path.read_bytes(),
    )
    return mocked_s3_bucket
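A minimal sketch of how the versioned object written by this fixture could be read back, assuming (as the snippet implies) that mocked_s3_bucket is a boto3 client and that BUCKET_NAME and FILENAME are the same module-level constants:

from io import BytesIO

import pyarrow.parquet as pq


def read_versioned_table(mocked_s3_bucket, save_version):
    # The key layout mirrors the fixture: "<filename>/<save_version>/<filename>".
    obj = mocked_s3_bucket.get_object(
        Bucket=BUCKET_NAME, Key="{0}/{1}/{0}".format(FILENAME, save_version)
    )
    # Parquet readers need a seekable buffer, so wrap the streamed bytes.
    return pq.read_table(BytesIO(obj["Body"].read()))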
def convert_to_parquet(df, row_group_size, compression):
    """Convert a dataframe of expression values to a parquet file."""
    path = _get_temp_path(".parquet")
    qcs = fake_qc_values(NUM_QC_VALUES, df.index, seed=df.values.sum())
    full_df = pandas.concat([df, qcs], axis=1)
    table = pyarrow.Table.from_pandas(full_df)
    pyarrow.parquet.write_table(table, path, row_group_size=row_group_size,
                                compression=compression)
    return path
        table = json.read_json(filename)
    else:
        table = pq.read_table(file_path)
else:
    raise ValueError(f"Unknown data source provided for ingestion: {source}")

# Ensure that PyArrow table is initialised
assert isinstance(table, pa.lib.Table)

# Write table as parquet file with a specified row_group_size
dir_path = tempfile.mkdtemp()
tmp_table_name = f"{int(time.time())}.parquet"
dest_path = f"{dir_path}/{tmp_table_name}"
row_group_size = min(ceil(table.num_rows / max_workers), chunk_size)
pq.write_table(table=table, where=dest_path, row_group_size=row_group_size)

# Remove table from memory
del table

return dir_path, dest_path
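The row_group_size above aims for roughly one row group per worker while never exceeding chunk_size rows per group. A hedged sketch (not part of the snippet) of how such a file could then be consumed group-by-group in parallel:

from multiprocessing import Pool

import pyarrow.parquet as pq


def _count_rows_in_group(args):
    path, index = args
    # Each worker opens the file and reads only its own row group.
    return pq.ParquetFile(path).read_row_group(index).num_rows


def process_row_groups(dest_path, max_workers):
    num_groups = pq.ParquetFile(dest_path).num_row_groups
    with Pool(max_workers) as pool:
        return pool.map(_count_rows_in_group,
                        [(dest_path, i) for i in range(num_groups)])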
        columns = [[] for x in range(len(column_names))]
    if rownum == max_rows:
        break
if columns and any(columns):
    add_arrays(columns)

data = [
    pa.array([item.as_py() for sublist in arr for item in sublist], type=types[idx][0]) if keep[idx] else None
    for idx, arr in enumerate(arrs)]
data = [x for x in data if x is not None]
batch = pa.RecordBatch.from_arrays(data, [column_names[x] for x in range(len(arrs)) if keep[x]])
table = pa.Table.from_batches([batch])
pq.write_table(table,
               output_file,
               version='1.0',
               compression=codec,
               use_dictionary=True,
               row_group_size=row_group_size)
def _write_parquet_to_s3_retrying(fs: Any, path: str, table: pa.Table, compression: str) -> None:
    with fs.open(path, "wb") as f:
        pq.write_table(table, f, compression=compression, coerce_timestamps="ms", flavor="spark")
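A usage sketch for the helper above with an s3fs filesystem; the bucket and key are hypothetical, and the retry wiring implied by the function name is not shown:

import pandas as pd
import pyarrow as pa
import s3fs

fs = s3fs.S3FileSystem()  # credentials resolved from the environment
table = pa.Table.from_pandas(
    pd.DataFrame({"id": [1, 2], "ts": pd.to_datetime(["2021-01-01", "2021-01-02"])})
)
_write_parquet_to_s3_retrying(fs, "my-bucket/path/data.parquet", table, "snappy")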
def write_parquet(data, destination, **kwargs):
    """
    data: a PyArrow RecordBatch, or a list of RecordBatches
    destination: output file name
    **kwargs: options defined at https://arrow.apache.org/docs/python/generated/pyarrow.parquet.write_table.html
    """
    try:
        table = pa.Table.from_batches(data)
    except TypeError:
        # A single RecordBatch was passed in; wrap it in a list.
        table = pa.Table.from_batches([data])
    pq.write_table(table, destination, **kwargs)
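A quick usage sketch: because of the try/except fallback, the helper accepts either a single RecordBatch or a list of them (file names below are illustrative):

import pyarrow as pa

batch = pa.RecordBatch.from_arrays(
    [pa.array([1, 2, 3]), pa.array(["a", "b", "c"])],
    names=["id", "label"],
)

# A single batch triggers the TypeError fallback and gets wrapped in a list.
write_parquet(batch, "single_batch.parquet", compression="snappy")

# A list of batches is passed straight through to Table.from_batches.
write_parquet([batch, batch], "two_batches.parquet", compression="snappy")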
    p = p.centroids(instrument_model).trimmed(ISOTOPIC_PEAK_N)
    p.sortByMass()
    masses.append(p.masses)
    intensities.append(p.intensities)

df = pd.DataFrame({
    'mf': valid_mfs,
    'mzs': masses,
    'intensities': intensities
})
df['adduct'] = adduct
if existing_df is not None and not existing_df.empty:
    df = pd.concat([existing_df, df])
table = pyarrow.Table.from_pandas(df)
pyarrow.parquet.write_table(table, fn)
logger.info('wrote {} NEW isotope patterns to {}'.format(len(valid_mfs), fn))
def df_to_bytes_parquet_(df: pd.DataFrame) -> bytes:
    """
    pyarrow parquet is the standard conversion method of pandas
    DataFrames since pyabc 0.9.14, because msgpack became
    deprecated in pandas 0.25.0.
    """
    b = BytesIO()
    table = pyarrow.Table.from_pandas(df)
    parquet.write_table(table, b)
    b.seek(0)
    return b.read()
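A round-trip sketch for the helper above; the inverse function name is hypothetical, but it uses only standard pyarrow calls:

from io import BytesIO

import pandas as pd
import pyarrow.parquet as parquet


def bytes_to_df_parquet_(b: bytes) -> pd.DataFrame:
    # Parquet readers need a seekable file-like object, not raw bytes.
    return parquet.read_table(BytesIO(b)).to_pandas()


df = pd.DataFrame({"x": [1.0, 2.0], "y": ["a", "b"]})
assert bytes_to_df_parquet_(df_to_bytes_parquet_(df)).equals(df)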
def write_concatted_part(part_df, part_output_path, md_list):
    with filesystem.open(part_output_path, 'wb') as f:
        pq.write_table(
            pa.Table.from_pandas(part_df),
            f, compression=compression, metadata_collector=md_list
        )
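The metadata_collector argument above appends one FileMetaData entry per written part; a sketch (directory and path arguments are hypothetical) of how those entries could later be merged into a dataset-level _metadata file:

import pyarrow as pa
import pyarrow.parquet as pq


def write_dataset_metadata(schema: pa.Schema, output_dir: str, md_list, part_rel_paths):
    # write_table does not record where each part lives, so attach the
    # relative path of every part before merging the collected entries.
    for md, rel_path in zip(md_list, part_rel_paths):
        md.set_file_path(rel_path)
    pq.write_metadata(schema, f"{output_dir}/_metadata", metadata_collector=md_list)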