    if gray:
        # grayscale branch; the opening "if gray:" and the "-gray" flag are
        # reconstructed from the gray/color call sites in the __main__ block below
        args = ["pdftoppm", "-gray", "-r", str(dpi),
                "-aa", "no", "-aaVector", "no", "-cropbox",
                join(pdf_dir, pdfname), join(output_dir, doc_id + "-page")]
    else:
        args = ["pdftoppm", "-jpeg", "-r", str(dpi), "-cropbox",
                join(pdf_dir, pdfname), join(output_dir, doc_id + "-page")]
    retcode = call(args)
    if retcode != 0:
        raise ValueError("Bad return code for <%s> (%d)" % (" ".join(args), retcode))
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Cache rasterized page images for a dataset')
    parser.add_argument("dataset", choices=datasets.DATASETS.keys(), help="target dataset")
    parser.add_argument("color", choices=["gray", "color"], help="kind of images to render")
    args = parser.parse_args()

    dataset = datasets.get_dataset(args.dataset)
    print("Running on dataset: " + dataset.name)
    if args.color == "gray":
        get_images(dataset.pdf_dir, dataset.page_images_gray_dir,
                   dataset.IMAGE_DPI, True)
    elif args.color == "color":
        get_images(dataset.pdf_dir, dataset.page_images_color_dir,
                   dataset.COLOR_IMAGE_DPI, False)
    else:
        exit(1)
raise ValueError("Output dir must be empty")
ignore_docs = set()
if args.ignore_docs_in is not None:
if isdir(args.ignore_docs_in):
for filename in listdir(args.ignore_docs_in):
if isdir(join(args.ignore_docs_in, filename)):
for sub_filename in listdir(join(args.ignore_docs_in, filename)):
ignore_docs.add(sub_filename.split("-page-")[0])
else:
ignore_docs.add(filename.split("-page-")[0])
else:
raise ValueError()
print("Found %d documents in %s, ignoring" % (len(ignore_docs), args.ignore_docs_in))

dataset = datasets.get_dataset(args.dataset)
pages_to_annotated = dataset.get_annotated_pages_map()
if dataset.has_annotations():
    annotations = dataset.get_annotations("all")
else:
    annotations = {}
annotated_docs = annotations.keys()
all_docs = dataset.get_doc_ids("all")
missing_docs = list(set(all_docs) - set(annotated_docs) - ignore_docs)
image_file_map = dataset.get_color_image_file_map()
print("%d missing documents" % len(missing_docs))

if args.groups:
    size = len(missing_docs) / args.groups
    groups = [missing_docs[round(i * size):round(i * size + size)] for i in range(args.groups)]
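    # The round() arithmetic spreads any remainder across the groups rather than
    # piling it onto the last one. Tiny worked example (not from the source):
    # 10 docs into 3 groups -> size = 10/3, slices [0:3], [3:7], [7:10],
    # i.e. group sizes 3, 4, 3.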
    for i, group in enumerate(groups):
        # decode the serialized options from this rank and the other rank;
        # this_t / other_t are uint8 tensors prepared earlier in the function
        # (not shown in this fragment)
        other_str = bytearray(other_t.cpu().storage().tolist()).decode(encoding="utf-8")
        this_str = bytearray(this_t.cpu().storage().tolist()).decode(encoding="utf-8")
        if this_str != other_str:  # guard assumed; the original check is outside this fragment
            raise ValueError(
                "Rank {} opt is different from rank {}:\n".format(state.world_rank, other) +
                utils.diff_str(this_str, other_str))

    # in case of downloading, to avoid race, let rank 0 download.
    if state.world_rank == 0:
        train_dataset = datasets.get_dataset(state, 'train')
        test_dataset = datasets.get_dataset(state, 'test')

    if not dummy and state.distributed:
        utils.distributed.barrier()

    if state.world_rank != 0:
        train_dataset = datasets.get_dataset(state, 'train')
        test_dataset = datasets.get_dataset(state, 'test')

    state.opt.train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=state.batch_size,
        num_workers=state.num_workers, pin_memory=True, shuffle=True)
    state.opt.test_loader = torch.utils.data.DataLoader(
        test_dataset, batch_size=state.test_batch_size,
        num_workers=state.num_workers, pin_memory=True, shuffle=True)

    if not dummy:
        logging.info('train dataset size:\t{}'.format(len(train_dataset)))
        logging.info('test dataset size:\t{}'.format(len(test_dataset)))
        logging.info('datasets built!')
        state.vis_queue = utils.multiprocessing.FixSizeProcessQueue(2)
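
# The block above uses a common pattern for multi-process data setup: rank 0
# materializes (downloads/caches) the dataset first, every rank synchronizes on a
# barrier, and only then do the remaining ranks load from the populated cache.
# Minimal stand-alone sketch with plain torch.distributed (names are illustrative
# and it assumes the default process group is already initialized):
import torch.distributed as dist

def build_dataset_rank0_first(rank, make_dataset):
    if rank == 0:
        dataset = make_dataset()   # any download/caching happens on rank 0 only
    dist.barrier()                 # wait until rank 0 has finished writing files
    if rank != 0:
        dataset = make_dataset()   # other ranks now read from the local cache
    return dataset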

from nni.nas.pytorch.callbacks import ArchitectureCheckpoint, LRSchedulerCallback
from nni.nas.pytorch.darts import DartsTrainer
from utils import accuracy

logger = logging.getLogger('nni')

if __name__ == "__main__":
    parser = ArgumentParser("darts")
    parser.add_argument("--layers", default=8, type=int)
    parser.add_argument("--batch-size", default=64, type=int)
    parser.add_argument("--log-frequency", default=10, type=int)
    parser.add_argument("--epochs", default=50, type=int)
    parser.add_argument("--unrolled", default=False, action="store_true")
    args = parser.parse_args()

    dataset_train, dataset_valid = datasets.get_dataset("cifar10")

    model = CNN(32, 3, 16, 10, args.layers)
    criterion = nn.CrossEntropyLoss()
    optim = torch.optim.SGD(model.parameters(), 0.025, momentum=0.9, weight_decay=3.0E-4)
    lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optim, args.epochs, eta_min=0.001)

    trainer = DartsTrainer(model,
                           loss=criterion,
                           metrics=lambda output, target: accuracy(output, target, topk=(1,)),
                           optimizer=optim,
                           num_epochs=args.epochs,
                           dataset_train=dataset_train,
                           dataset_valid=dataset_valid,
                           batch_size=args.batch_size,
                           log_frequency=args.log_frequency,
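                           # the trailing arguments are cut off in this fragment; the
                           # completion below is an assumption based on the otherwise
                           # unused --unrolled flag and callback imports above
                           unrolled=args.unrolled,
                           callbacks=[LRSchedulerCallback(lr_scheduler),
                                      ArchitectureCheckpoint("./checkpoints")])
    trainer.train()
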
def format_data(config):
    """Post-process the dataset and generate custom output files."""
    prog_bar = progressbar.ProgressBar()
    config['stage'] = 'post_format'
    dataset = get_dataset(config['data_name'])(**config)
    prog_bar.max_value = dataset.data_length
    test_set = dataset.get_test_set()
    idx = 0
    while True:
        try:
            data = next(test_set)
            dataset.format_data(data)
            prog_bar.update(idx)
            idx += 1
        except dataset.end_set:
            break
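
# format_data above and extract_aug_feat below drive the test split the same way:
# get_test_set() returns a generator, and the dataset signals exhaustion by raising
# its own end_set exception, which is caught to leave the while-loop cleanly.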

def extract_aug_feat(config):
    """Extract augmented features."""
    prog_bar = progressbar.ProgressBar()
    config['stage'] = 'aug'
    dataset = get_dataset(config['data_name'])(**config)
    prog_bar.max_value = dataset.data_length
    test_set = dataset.get_test_set()
    model = get_model('aug_model')(config['pretrained']['loc_model'], **(config['aug_feat']))
    idx = 0
    while True:
        try:
            data = next(test_set)
            dump_path = data['dump_path'].decode('utf-8')
            aug_f = h5py.File(dump_path, 'a')
            if 'aug_feat' not in aug_f or config['aug_feat']['overwrite']:
                aug_feat, _ = model.run_test_data(data['dump_data'])
                if 'aug_feat' in aug_f:
                    del aug_f['aug_feat']
                if aug_feat.dtype == np.uint8:
                    _ = aug_f.create_dataset('aug_feat', data=aug_feat, dtype='uint8')
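                # the loop body is truncated here; a plausible completion mirroring
                # format_data above (the non-uint8 branch and file close are assumptions)
                else:
                    _ = aug_f.create_dataset('aug_feat', data=aug_feat)
            aug_f.close()
            prog_bar.update(idx)
            idx += 1
        except dataset.end_set:
            break
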

def main():
    # create target output dir if it doesn't exist yet
    if not os.path.isdir(args.output_dir):
        os.mkdir(args.output_dir)

    # enable mixed-precision computation if desired
    if args.amp:
        mixed_precision.enable_mixed_precision()

    # set the RNG seeds (probably more hidden elsewhere...)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    # get the dataset
    dataset = get_dataset(args.dataset)
    encoder_size = get_encoder_size(dataset)

    # get a helper object for tensorboard logging
    log_dir = os.path.join(args.output_dir, args.run_name)
    stat_tracker = StatTracker(log_dir=log_dir)

    # get dataloaders for training and testing
    train_loader, test_loader, num_classes = \
        build_dataset(dataset=dataset,
                      batch_size=args.batch_size,
                      input_dir=args.input_dir,
                      labeled_only=args.classifiers)

    torch_device = torch.device('cuda')
    checkpointer = Checkpointer(args.output_dir)
    if args.cpt_load_path:

from nni.nas.pytorch import enas
from nni.nas.pytorch.callbacks import (ArchitectureCheckpoint,
                                       LRSchedulerCallback)
from utils import accuracy, reward_accuracy

logger = logging.getLogger('nni')

if __name__ == "__main__":
    parser = ArgumentParser("enas")
    parser.add_argument("--batch-size", default=128, type=int)
    parser.add_argument("--log-frequency", default=10, type=int)
    parser.add_argument("--search-for", choices=["macro", "micro"], default="macro")
    args = parser.parse_args()

    dataset_train, dataset_valid = datasets.get_dataset("cifar10")

    if args.search_for == "macro":
        model = GeneralNetwork()
        num_epochs = 310
        mutator = None
    elif args.search_for == "micro":
        model = MicroNetwork(num_layers=6, out_channels=20, num_nodes=5, dropout_rate=0.1, use_aux_heads=True)
        num_epochs = 150
        mutator = enas.EnasMutator(model, tanh_constant=1.1, cell_exit_extra_step=True)
    else:
        raise AssertionError

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), 0.05, momentum=0.9, weight_decay=1.0E-4)
    lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs, eta_min=0.001)

    trainer = enas.EnasTrainer(model,
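                               # the call is cut off here; a plausible completion
                               # mirroring the DartsTrainer call earlier on this page
                               # (keyword names are assumptions based on the imports
                               # and variables defined in this snippet)
                               loss=criterion,
                               metrics=lambda output, target: accuracy(output, target, topk=(1,)),
                               reward_function=reward_accuracy,
                               optimizer=optimizer,
                               batch_size=args.batch_size,
                               num_epochs=num_epochs,
                               dataset_train=dataset_train,
                               dataset_valid=dataset_valid,
                               log_frequency=args.log_frequency,
                               mutator=mutator,
                               callbacks=[LRSchedulerCallback(lr_scheduler),
                                          ArchitectureCheckpoint("./checkpoints")])
    trainer.train()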