import base64
import csv
import sys

import numpy as np
import zarr
from tqdm import tqdm

# `phase` is expected to be set by the surrounding script.
FIELDNAMES = ['image_id', 'image_w', 'image_h',
              'num_boxes', 'boxes', 'features']

if phase == 'trainval':
    infiles = [
        'raw/trainval_36/trainval_resnet101_faster_rcnn_genome_36.tsv',
    ]
elif phase == 'test':
    infiles = [
        'raw/test2015_36/test2015_resnet101_faster_rcnn_genome_36.tsv',
    ]
else:
    raise SystemExit('Unrecognised phase')

# The base64-encoded feature fields far exceed csv's default field size limit.
csv.field_size_limit(sys.maxsize)

# Read the tsv and append to files
boxes = zarr.open_group(phase + '_boxes.zarr', mode='w')
features = zarr.open_group(phase + '.zarr', mode='w')
image_size = {}
for infile in infiles:
    with open(infile, "r") as tsv_in_file:
        reader = csv.DictReader(
            tsv_in_file, delimiter='\t', fieldnames=FIELDNAMES)
        print('Converting ' + infile + ' to zarr...')
        for item in tqdm(reader):
            item['image_id'] = str(item['image_id'])
            item['image_h'] = int(item['image_h'])
            item['image_w'] = int(item['image_w'])
            item['num_boxes'] = int(item['num_boxes'])
            # Decode the base64 float32 blobs into (num_boxes, dim) arrays.
            # base64.decodestring was removed in Python 3.9; decodebytes
            # is the supported equivalent.
            for field in ['boxes', 'features']:
                encoded_str = base64.decodebytes(
                    item[field].encode('utf-8'))
                item[field] = np.frombuffer(
                    encoded_str, dtype=np.float32
                ).reshape((item['num_boxes'], -1))
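The snippet ends right after decoding; the original presumably goes on to store each image's arrays in the two groups opened earlier. A minimal sketch of that continuation, assuming one dataset per image_id (this step is inferred, not shown in the snippet):

            # Assumed continuation of the loop body above: store each
            # image's arrays under its image_id and record the image size.
            boxes.create_dataset(item['image_id'], data=item['boxes'])
            features.create_dataset(item['image_id'], data=item['features'])
            image_size[item['image_id']] = (item['image_w'], item['image_h'])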
import zarr


# The original snippet begins mid-statement; the class wrapper, the __init__
# signature, and the start of arr_kwargs are reconstructed from the names the
# body uses, so treat them as assumptions.
class ZarrArrayWriter(object):
    def __init__(self, filename, array_name, data_element_shape, dtype,
                 length=None, append=False, kwargs=None):
        self.array_name = array_name
        self.data_element_shape = data_element_shape
        self.length = length
        self.arr_kwargs = {'name': array_name,
                           'dtype': dtype}
        if self.length is None:
            self.arr_kwargs['shape'] = (1,) + self.data_element_shape
        else:
            self.arr_kwargs['shape'] = (self.length,) + self.data_element_shape
        if kwargs is not None:
            self.arr_kwargs.update(kwargs)

        # Open the file for writing.
        self.group = None
        self.write_mode = 'a' if append else 'w'
        try:
            self.group = zarr.open_group(filename, self.write_mode)
        except Exception:
            print("Error: failed to open file %s" % filename)
            raise

        # Open an array interface (reuse the array if it already exists;
        # otherwise create it from the kwargs assembled above).
        try:
            self.storage_array = self.group[self.array_name]
            self.storage_array_ptr = len(self.storage_array)
        except KeyError:
            self.storage_array = self.group.create_dataset(**self.arr_kwargs)
            self.storage_array_ptr = 0
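A writer like this usually pairs the constructor with an append-style write. A minimal sketch, assuming elements arrive one at a time and the array grows on demand; the method name and resize policy are illustrative, not from the original:

    def write(self, element):
        # Grow the backing zarr array when the pointer runs past the end,
        # then store the element at the current position and advance.
        if self.storage_array_ptr >= self.storage_array.shape[0]:
            self.storage_array.resize(
                (self.storage_array_ptr + 1,) + self.data_element_shape)
        self.storage_array[self.storage_array_ptr] = element
        self.storage_array_ptr += 1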
import shutil

import numpy as np
import zarr


def collect_zarr(file_name, out_dir, num_procs):
    final_zarr_file = '%s/%s' % (out_dir, file_name)

    # seed w/ job0
    job_zarr_file = '%s/job0/%s' % (out_dir, file_name)
    shutil.copytree(job_zarr_file, final_zarr_file)

    # open final
    final_zarr_open = zarr.open_group(final_zarr_file)

    for pi in range(1, num_procs):
        # open job
        job_zarr_file = '%s/job%d/%s' % (out_dir, pi, file_name)
        job_zarr_open = zarr.open_group(job_zarr_file, 'r')

        # append to final
        for key in final_zarr_open.keys():
            if key in ['percentiles', 'target_ids', 'target_labels']:
                # once is enough
                pass
            elif key[-4:] == '_pct':
                # incremental mean across jobs:
                # mean_k = mean_{k-1} + (x_k - mean_{k-1}) / k
                u_k1 = np.array(final_zarr_open[key])
                x_k = np.array(job_zarr_open[key])
                final_zarr_open[key] = u_k1 + (x_k - u_k1) / (pi + 1)
            else:
                # append
                final_zarr_open[key].append(job_zarr_open[key])
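collect_zarr merges the per-process Zarr stores written by a sharded run: metadata keys are kept from job0, *_pct keys are combined with the incremental mean shown in the comment, and everything else is concatenated. A hypothetical call, assuming eight workers wrote output/job0 through output/job7:

collect_zarr('sad.zarr', 'output', num_procs=8)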
import zarr


def initialize_output_zarr(out_dir, sad_stats, snps, target_ids, target_labels):
    """Initialize an output Zarr file for SAD stats."""
    num_targets = len(target_ids)
    num_snps = len(snps)

    sad_out = zarr.open_group('%s/sad.zarr' % out_dir, 'w')

    # write SNPs
    sad_out.create_dataset('snp', data=[snp.rsid for snp in snps],
                           chunks=(32768,))

    # write targets
    sad_out.create_dataset('target_ids', data=target_ids, compressor=None)
    sad_out.create_dataset('target_labels', data=target_labels,
                           compressor=None)

    # initialize SAD stats
    for sad_stat in sad_stats:
        sad_out.create_dataset(sad_stat,
                               shape=(num_snps, num_targets),
                               chunks=(128, num_targets),
                               dtype='float16')

    return sad_out
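A hypothetical call, assuming snps is a list of objects exposing an .rsid attribute (as the comprehension above requires) and 'SAD' is one of the requested stat names:

sad_out = initialize_output_zarr('output', ['SAD'], snps,
                                 target_ids, target_labels)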