import collections

import blosc

# dataset_pb2 and tags_from_example come from the surrounding project.


def read_split_metadata(split_file, subset_tags=None):
    """
    :param split_file: Path to a blosc-compressed, serialized dataset_pb2.Examples file.
    :param subset_tags: Optional list of tag names. For example, ['NOVELCLASS', 'NOVELMODEL', 'NOVELVIEW']
    :return: A dict mapping each tag to its examples if subset_tags is given, otherwise a list of all examples.
    """
    # Make sure the cpp implementation is used for parsing protocol buffers. Otherwise it will be too slow.
    from google.protobuf.internal import api_implementation
    protobuf_impl_name = api_implementation._default_implementation_type
    assert 'cpp' == protobuf_impl_name, protobuf_impl_name

    with open(split_file, mode='rb') as f:
        compressed = f.read()
    decompressed = blosc.decompress(compressed)
    examples = dataset_pb2.Examples()
    examples.ParseFromString(decompressed)

    if subset_tags is not None:
        assert isinstance(subset_tags, (list, tuple))
        experiment_names = subset_tags
        ret = collections.defaultdict(list)
        for example in examples.examples:
            tags = tags_from_example(example)
            for name in experiment_names:
                if name in tags:
                    ret[name].append(example)
    else:
        ret = list(examples.examples)
    return ret
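# Usage sketch (the file path and tag names below are hypothetical examples):
#
#     by_tag = read_split_metadata('splits/validation.bin', subset_tags=['NOVELCLASS'])
#     novel_class_examples = by_tag['NOVELCLASS']
#     all_examples = read_split_metadata('splits/validation.bin')  # no tags: flat list of examples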
def _load_blosc(self, ix, src=None, dst=None):
    """ Load data from a blosc-packed file """
    file_name = self._get_file_name(ix, src)
    with open(file_name, 'rb') as f:
        data = dill.loads(blosc.decompress(f.read()))
        components = tuple(dst or self.components)
        try:
            item = tuple(data[i] for i in components)
        except Exception as e:
            raise KeyError('Cannot find components in the corresponding file', e)
    return item
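# A possible writer counterpart (a sketch, not part of the original class): dill-serialize
# the component data and blosc-compress it so that _load_blosc() above can read it back.
import blosc
import dill

def _dump_blosc(data, file_name):
    """ Save data to a blosc-packed file (sketch; mirrors _load_blosc above). """
    with open(file_name, 'wb') as f:
        f.write(blosc.compress(dill.dumps(data), typesize=1, cname='lz4'))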
elif is_object_dtype(dtype):
    return np.array(values, dtype=object)

dtype = pandas_dtype(dtype).base

if not as_is_ext:
    values = values.encode('latin1')

if compress:
    if compress == u'zlib':
        _check_zlib()
        decompress = zlib.decompress
    elif compress == u'blosc':
        _check_blosc()
        decompress = blosc.decompress
    else:
        raise ValueError("compress must be one of 'zlib' or 'blosc'")

    try:
        return np.frombuffer(
            _move_into_mutable_buffer(decompress(values)),
            dtype=dtype,
        )
    except _BadMove as e:
        # Pull the decompressed data off of the `_BadMove` exception.
        # We don't just store this in the locals because we want to
        # minimize the risk of giving users access to a `bytes` object
        # whose data is also given to a mutable buffer.
        values = e.args[0]
        if len(values) > 1:
            # The empty string and single characters are memoized in many
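# For context, a sketch of the mirror-image compression step (a hypothetical helper,
# not the pandas implementation): the encoder compresses the array's raw bytes with
# zlib or blosc, which is what `decompress(values)` above later undoes.
import zlib

import blosc
import numpy as np

def compress_values(arr, compress):
    """Return the compressed raw bytes of a numpy array (sketch)."""
    raw = arr.tobytes()
    if compress == 'zlib':
        return zlib.compress(raw)
    elif compress == 'blosc':
        return blosc.compress(raw, typesize=arr.dtype.itemsize)
    raise ValueError("compress must be one of 'zlib' or 'blosc'")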
displacements = [numpy.int64(sum(sizes[:i])) for i in range(len(sizes))]
np_type = get_np_dtype(dtype)
mpi_type = get_mpi_type(dtype)
data_shape = data.shape

if not compress:
    gdata = numpy.empty(numpy.int64(sum(sizes)), dtype=np_type)
    mpi_comm.Gatherv([data.flatten(), size, mpi_type], [gdata, (sizes, displacements), mpi_type], root=root)
else:
    data = blosc.compress(data, typesize=mpi_type.size, cname='blosclz')
    data = mpi_comm.gather(data, root=0)
    gdata = numpy.empty(0, dtype=np_type)
    if comm.rank == 0:
        for blosc_data in data:
            gdata = numpy.concatenate((gdata, numpy.frombuffer(blosc.decompress(blosc_data), dtype=np_type)))

if len(data_shape) == 1:
    return gdata
else:
    if shape == 0:
        num_lines = data_shape[0]
        if num_lines > 0:
            return gdata.reshape((num_lines, gdata.size // num_lines))
        else:
            return gdata.reshape((0, gdata.shape[1]))
    if shape == 1:
        num_columns = data_shape[1]
        if num_columns > 0:
            return gdata.reshape((gdata.size // num_columns, num_columns))
        else:
            return gdata.reshape((gdata.shape[0], 0))
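# Round-trip sketch of the compressed branch above, without MPI (names here are
# illustrative): each rank would blosc-compress its flattened block like this, and
# rank 0 rebuilds it with numpy.frombuffer(blosc.decompress(...)) exactly as in the
# loop over the gathered list.
import blosc
import numpy

block = numpy.arange(12, dtype=numpy.float64)
payload = blosc.compress(block.tobytes(), typesize=block.dtype.itemsize, cname='blosclz')
restored = numpy.frombuffer(blosc.decompress(payload), dtype=block.dtype)
assert numpy.array_equal(block, restored)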
volume_empty = True  # abort if the volume doesn't exist in S3
for offset in xyz_range(step):
    if args.get('test'):
        # Enable Test Mode.
        # This is where the cube downsamples are all taken from 0/0/0,
        # so that the entire frame doesn't have to be populated to test
        # the code paths that downsample cubes.
        cube = offset  # use target 0/0/0
    else:
        cube = target + offset

    try:
        obj_key = HashedKey(parent_iso, col_id, exp_id, chan_id, resolution, t, cube.morton, version=version)
        data = s3.get(obj_key)
        data = blosc.decompress(data)

        # DP ???: Check to see if the buffer is all zeros?
        data = Buffer.frombuffer(data, dtype=np_types[data_type])
        data.resize(dim)

        # log.debug("Downloaded cube {}".format(cube))
        volume[offset * dim: (offset + 1) * dim] = data
        volume_empty = False
    except Exception as e:  # TODO: Create custom exception for S3 download
        # log.exception("Problem downloading cubes {}".format(cube))
        # log.debug("No cube at {}".format(cube))
        # Eat the error; we don't care if the cube doesn't exist.
        # If the cube doesn't exist, blank data will be used for downsampling.
        # If none of the cubes exist, then the downsample is finished.
        pass
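# Hypothetical upload counterpart (a sketch, not the actual implementation): a cube
# would be blosc-compressed before being stored under its HashedKey, which is what
# lets the loop above call blosc.decompress() on the downloaded bytes. The `s3.put`
# call is assumed to mirror the `s3.get` used above.
def put_cube(s3, obj_key, cube_data):
    """Compress a numpy cube and store it in the object store (sketch)."""
    payload = blosc.compress(cube_data.tobytes(), typesize=cube_data.dtype.itemsize)
    s3.put(obj_key, payload)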
# Get bit depth
try:
    bit_depth = resource.get_bit_depth()
except ValueError:
    return BossParserError("Unsupported data type provided to parser: {}".format(resource.get_data_type()),
                           ErrorCodes.TYPE_ERROR)

# Make sure the cutout request is under 500MB UNCOMPRESSED
if is_too_large(req, bit_depth):
    return BossParserError("Cutout request is over 500MB when uncompressed. Reduce cutout dimensions.",
                           ErrorCodes.REQUEST_TOO_LARGE)

try:
    # Decompress
    raw_data = blosc.decompress(stream.read())
    data_mat = np.fromstring(raw_data, dtype=resource.get_numpy_data_type())
except MemoryError:
    return BossParserError("Ran out of memory decompressing data.",
                           ErrorCodes.BOSS_SYSTEM_ERROR)
except:
    return BossParserError("Failed to decompress data. Verify the datatype/bitdepth of your data "
                           "matches the channel.", ErrorCodes.DATATYPE_DOES_NOT_MATCH)

# Reshape and return
try:
    if req.time_request:
        # Time series request (even if a single time point) - get a 4D matrix
        parsed_data = np.reshape(data_mat,
                                 (len(req.get_time()),
                                  req.get_z_span(),
                                  req.get_y_span(),
                                  req.get_x_span()))
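# Sketch of the kind of check is_too_large() above performs (assumed logic, not the
# actual implementation): the uncompressed size is the product of the requested spans
# and the bytes per voxel, and anything over 500MB is rejected.
def cutout_too_large(x_span, y_span, z_span, time_points, bit_depth, limit_bytes=500 * 1024 * 1024):
    uncompressed_bytes = x_span * y_span * z_span * time_points * (bit_depth // 8)
    return uncompressed_bytes > limit_bytes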
def FetchFindMissingHashRecords(self, request_iterator, context):
    """Determine data tensor hash records existing on the server and not on the client.
    """
    for idx, request in enumerate(request_iterator):
        if idx == 0:
            commit = request.commit
            hBytes, offset = bytearray(request.total_byte_size), 0
        size = len(request.hashs)
        hBytes[offset: offset + size] = request.hashs
        offset += size

    uncompBytes = blosc.decompress(hBytes)
    c_hashs_raw = chunks.deserialize_record_pack(uncompBytes)
    c_hashset = set([chunks.deserialize_ident(raw).digest for raw in c_hashs_raw])

    with tempfile.TemporaryDirectory() as tempD:
        tmpDF = os.path.join(tempD, 'test.lmdb')
        tmpDB = lmdb.open(path=tmpDF, **c.LMDB_SETTINGS)
        commiting.unpack_commit_ref(self.env.refenv, tmpDB, commit)
        s_hashes_schemas = queries.RecordQuery(tmpDB).data_hash_to_schema_hash()
        s_hashes = set(s_hashes_schemas.keys())
        tmpDB.close()

    c_missing = list(s_hashes.difference(c_hashset))
    c_hash_schemas_raw = [chunks.serialize_ident(c_mis, s_hashes_schemas[c_mis]) for c_mis in c_missing]
    raw_pack = chunks.serialize_record_pack(c_hash_schemas_raw)
    err = hangar_service_pb2.ErrorProto(code=0, message='OK')
    response_pb = hangar_service_pb2.FindMissingHashRecordsReply
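# Sketch of the client-side chunking that the reassembly loop above expects
# (illustrative only; the helper below is hypothetical, not hangar's API): the
# serialized record pack is blosc-compressed once and streamed as fixed-size
# slices whose total length fills `total_byte_size`.
def iter_hash_chunks(raw_pack, chunk_size=32_000):
    compressed = blosc.compress(raw_pack, typesize=1)
    total = len(compressed)
    for start in range(0, total, chunk_size):
        yield compressed[start:start + chunk_size], total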
    self.start(first_start=False)
    continue
m0, m1 = self.recv_socket.recv_multipart()
self.last_packet_time = time.time()
abs_pos = msgpack.loads(m0)
if self.last_pos > abs_pos:
    print('restart because last not good')
    self.start(first_start=False)
    continue
if self.compress is None:
    buf = memoryview(m1)
elif self.compress == 'blosc':
    buf = blosc.decompress(m1)
chunk = np.frombuffer(buf, dtype=np_array.dtype).reshape(-1, n).transpose()
# print('recv', abs_pos, chunk.shape)
new = chunk.shape[1]

head = abs_pos % half_size + half_size
tail = head - new
np_array[:, tail:head] = chunk
head2 = abs_pos % half_size
tail2 = max(head2 - new, 0)
new2 = head2 - tail2
if new2 != 0:
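# Sketch of the matching sender (socket setup assumed; the msgpack, blosc, and pyzmq
# calls are standard APIs): the absolute position goes in the first frame, the
# blosc-compressed chunk bytes in the second, which is what the receive loop above unpacks.
import blosc
import msgpack

def send_chunk(send_socket, abs_pos, chunk):
    """Send one (position, compressed samples) pair over a zmq socket (sketch)."""
    m0 = msgpack.dumps(abs_pos)
    m1 = blosc.compress(chunk.transpose().tobytes(), typesize=chunk.dtype.itemsize)
    send_socket.send_multipart([m0, m1])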
In order to prevent errors or malicious behavior, the cryptographic hash
of every tensor is calculated and compared to what the client "said" it
is. If an error is detected, no sample in the entire stream will be
saved to disk.
"""
for idx, request in enumerate(request_iterator):
    if idx == 0:
        uncomp_nbytes = request.uncomp_nbytes
        comp_nbytes = request.comp_nbytes
        dBytes, offset = bytearray(comp_nbytes), 0
    size = len(request.raw_data)
    dBytes[offset: offset + size] = request.raw_data
    offset += size

uncompBytes = blosc.decompress(dBytes)
if uncomp_nbytes != len(uncompBytes):
    msg = f'ERROR: uncomp_nbytes sent: {uncomp_nbytes} != received {len(uncompBytes)}'
    context.set_details(msg)
    context.set_code(grpc.StatusCode.DATA_LOSS)
    err = hangar_service_pb2.ErrorProto(code=15, message=msg)
    reply = hangar_service_pb2.PushDataReply(error=err)
    return reply

unpacked_records = chunks.deserialize_record_pack(uncompBytes)
received_data = []
for record in unpacked_records:
    data = chunks.deserialize_record(record)
    schema_hash = data.schema
    received_hash = array_hash_digest(data.array)
    if received_hash != data.digest:
        msg = f'HASH MANGLED, received: {received_hash} != expected digest: {data.digest}'
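# Generic stand-in for the tensor digest comparison above (not hangar's actual
# array_hash_digest): hash the array's dtype, shape, and raw bytes and compare
# the resulting hex digests.
import hashlib

import numpy as np

def simple_array_digest(arr):
    """Return a blake2b hex digest over an array's dtype, shape, and data (sketch)."""
    hasher = hashlib.blake2b(digest_size=20)
    hasher.update(str(arr.dtype).encode())
    hasher.update(str(arr.shape).encode())
    hasher.update(np.ascontiguousarray(arr).tobytes())
    return hasher.hexdigest()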
if blosc_args['shuffle'] is None:
    blosc_args['shuffle'] = DEFAULT_SHUFFLE
if blosc_args['cname'] is None:
    blosc_args['cname'] = DEFAULT_CNAME
_check_blosc_args(blosc_args)

offsets_pos = (BLOSCPACK_HEADER_LENGTH +
               (METADATA_HEADER_LENGTH + metadata_header['max_meta_size'] +
                CHECKSUMS_LOOKUP[metadata_header['meta_checksum']].size
                if metadata is not None else 0))

# seek to the final offset
original_fp.seek(offsets[-1], 0)

# decompress the last chunk
compressed, blosc_header, digest = _read_compressed_chunk_fp(original_fp, checksum_impl)
# TODO check digest
decompressed = blosc.decompress(compressed)

# figure out how many bytes we need to read to rebuild the last chunk
ultimo_length = len(decompressed)
bytes_to_read = bloscpack_header.chunk_size - ultimo_length

if new_size <= bytes_to_read:
    # special case
    # must squeeze data into last chunk
    fill_up = new_content_fp.read(new_size)
    # seek back to the position of the original last chunk
    original_fp.seek(offsets[-1], 0)
    # write the chunk that has been filled up
    compressed = _compress_chunk_str(decompressed + fill_up, blosc_args)
    digest = checksum_impl(compressed)
    _write_compressed_chunk(original_fp, compressed, digest)
    # return 0 to indicate that no new chunks have been written
    # build the new header
    bloscpack_header.last_chunk += new_size
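# For comparison, python-blosc's pack_array()/unpack_array() round-trip covers the
# simple "whole array in memory" case without the chunk/offset bookkeeping handled
# by the bloscpack code above (which exists for appendable, chunked files):
import blosc
import numpy as np

arr = np.arange(1_000_000, dtype=np.int64)
packed = blosc.pack_array(arr, cname='lz4')
assert np.array_equal(blosc.unpack_array(packed), arr)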