import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import smdebug.pytorch as smd

# Net, get_dataloaders, ScriptSimulator, and SagemakerSimulator are helpers
# defined elsewhere in the test suite.


def test_pytorch(script_mode: bool = False, use_loss_module=False):
    smd.del_hook()

    sim_class = ScriptSimulator if script_mode else SagemakerSimulator
    with sim_class() as sim:
        trainloader, testloader = get_dataloaders()
        net = Net()
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

        if script_mode:
            hook = smd.Hook(out_dir=sim.out_dir)
            hook.register_module(net)
            hook.register_loss(criterion)

        for i, data in enumerate(trainloader, 0):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(inputs)
            if use_loss_module:
                loss = criterion(outputs, labels)
            else:
                loss = F.cross_entropy(outputs, labels)
            loss.backward()
            optimizer.step()
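# Once a run like the one above finishes, the tensors the hook wrote can be
# read back with smdebug's trial API. A minimal sketch; the path is an
# assumption standing in for the hook's out_dir:
from smdebug.trials import create_trial

trial = create_trial("/tmp/smdebug_outputs")  # hypothetical out_dir
print(trial.tensor_names())  # names of all saved tensors
print(trial.steps())         # steps at which tensors were saved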
import shutil

import torch
import torch.nn.functional as F
import torch.optim as optim
import smdebug.pytorch as smd

# Net, dataset, and out_dir are defined elsewhere in the test module.


def run(rank, size, include_workers="one", num_epochs=10, batch_size=128, num_batches=10):
    """Distributed function to be implemented later."""
    torch.manual_seed(1234)
    device = torch.device("cpu")
    model = Net().to(device)
    optimizer = optim.SGD(model.parameters(), lr=1)

    shutil.rmtree(out_dir, ignore_errors=True)

    hook = smd.Hook(
        out_dir=out_dir,
        save_config=smd.SaveConfig(save_steps=[0, 1, 5]),
        save_all=True,
        include_workers=include_workers,
    )

    hook.register_module(model)

    for epoch in range(num_epochs):
        epoch_loss = 0.0
        for _ in range(num_batches):
            optimizer.zero_grad()
            data, target = dataset(batch_size)
            output = model(data)
            loss = F.mse_loss(output, target)
            epoch_loss += loss.item()
            loss.backward()
            optimizer.step()
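# run() expects a rank and a world size, so it is meant to be launched once per
# worker. A minimal launcher sketch (an assumption, not part of the snippet
# above), following the standard PyTorch distributed pattern with a gloo backend:
import os

import torch.distributed as dist
import torch.multiprocessing as mp


def init_process(rank, size, fn, backend="gloo"):
    # Join the process group, then hand control to the training function.
    os.environ["MASTER_ADDR"] = "127.0.0.1"
    os.environ["MASTER_PORT"] = "29500"
    dist.init_process_group(backend, rank=rank, world_size=size)
    fn(rank, size)


if __name__ == "__main__":
    world_size = 2
    processes = []
    for rank in range(world_size):
        p = mp.Process(target=init_process, args=(rank, world_size, run))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()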
import os
from datetime import datetime

# PT_Hook (assumed here to be smdebug.pytorch.Hook imported under an alias),
# simple_pt_model, verify_files, and SMDEBUG_PT_HOOK_TESTS_DIR come from the
# surrounding test module.


def helper_pytorch_tests(collection, register_loss, save_config):
    coll_name, coll_regex = collection

    run_id = "trial_" + coll_name + "-" + datetime.now().strftime("%Y%m%d-%H%M%S%f")
    trial_dir = os.path.join(SMDEBUG_PT_HOOK_TESTS_DIR, run_id)

    hook = PT_Hook(
        out_dir=trial_dir,
        include_collections=[coll_name],
        save_config=save_config,
        export_tensorboard=True,
    )

    simple_pt_model(hook, register_loss=register_loss)
    hook.close()

    saved_scalars = ["scalar/pt_num_steps", "scalar/pt_before_train", "scalar/pt_after_train"]
    verify_files(trial_dir, save_config, saved_scalars)
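# A hypothetical invocation; the collection tuple and save interval are
# illustrative values, not taken from the test suite:
from smdebug.pytorch import SaveConfig

helper_pytorch_tests(
    collection=("losses", r".*loss.*"),
    register_loss=True,
    save_config=SaveConfig(save_interval=10),
)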
def test_tensorboard_dir_script_specify_tensorboard_dir():
    """In script mode, passing `export_tensorboard` and `tensorboard_dir` works."""
    with ScriptSimulator(tensorboard_dir="/tmp/tensorboard_dir") as sim:
        hook = smd.Hook(
            out_dir=sim.out_dir, export_tensorboard=True, tensorboard_dir=sim.tensorboard_dir
        )
        assert hook.tensorboard_dir == sim.tensorboard_dir
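# The exported summaries can then be inspected with a stock TensorBoard
# install, e.g. (shell command, assuming the directory used above):
#   tensorboard --logdir /tmp/tensorboard_dir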
def create_hook(output_dir, module, trial_id="trial-resnet", save_interval=100):
    # With the following SaveConfig, tensors are saved every `save_interval`
    # steps while training.
    save_config = SaveConfig(save_interval=save_interval)

    # Create a hook that logs the weights, biases, and gradients of the model
    # (the default collections) while training.
    hook = Hook(out_dir=output_dir, save_config=save_config)
    return hook
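# A minimal usage sketch for the factory above; the model and output path are
# assumptions for illustration:
model = Net()
hook = create_hook("/tmp/resnet_trial", model, save_interval=100)
hook.register_module(model)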
def create_hook(output_dir, module=None, hook_type="saveall", save_steps=None):
    # Create a hook that logs weights, biases, gradients and inputs/outputs of the model
    if hook_type == "saveall":
        hook = Hook(
            out_dir=output_dir,
            save_config=SaveConfig(save_steps=save_steps),
            save_all=True,
            export_tensorboard=True,
        )
    elif hook_type == "module-input-output":
        # The names of input and output tensors of a module are in the following format:
        # Inputs : _input_, and
        # Output : _output
        # In order to log the inputs and output of a module, we create a collection as follows:
        assert module is not None

        # Create a hook that logs weights, biases, gradients and inputs/outputs of the model
        hook = Hook(
            out_dir=output_dir,
            save_config=SaveConfig(save_steps=save_steps),
            include_collections=["weights", "gradients", "biases", "l_mod"],
        )
        hook.get_collection("l_mod").add_module_tensors(module, inputs=True, outputs=True)
    return hook
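# A second variant of the same factory appears below; it hardcodes the save
# steps instead of taking them as a parameter.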
def create_hook(output_dir, module=None, hook_type="saveall"):
    # Create a hook that logs weights, biases, gradients and inputs/outputs of the model
    # every 10 steps while training.
    if hook_type == "saveall":
        hook = Hook(
            out_dir=output_dir,
            save_config=SaveConfig(save_steps=[i * 10 for i in range(20)]),
            save_all=True,
        )
    elif hook_type == "module-input-output":
        # The names of input and output tensors of a module are in the following format:
        # Inputs : _input_, and
        # Output : _output
        # In order to log the inputs and output of a module, we create a collection as follows:
        assert module is not None

        # Create a hook that logs weights, biases, gradients and inputs/outputs of the model
        # every 5 steps from steps 0-95 while training.
        hook = Hook(
            out_dir=output_dir,
            save_config=SaveConfig(save_steps=[i * 5 for i in range(20)]),
            include_collections=["weights", "gradients", "biases", "l_mod"],
        )
        hook.get_collection("l_mod").add_module_tensors(module, inputs=True, outputs=True)
    elif hook_type == "weights-bias-gradients":
        save_config = SaveConfig(save_steps=[i * 5 for i in range(20)])
        # Create a hook that logs ONLY weights, biases, and gradients every 5 steps
        # (from steps 0-95) while training the model.
        hook = Hook(out_dir=output_dir, save_config=save_config)
    return hook
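# Hypothetical usage of the factory above; the model and path are assumptions:
model = Net()
hook = create_hook("/tmp/demo_trial", module=model, hook_type="module-input-output")
hook.register_module(model)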