Replies: 1 comment
-
Hi, I believe I have accomplished a similar thing using:

```python
import jax
import jax.numpy as jnp
import jax.tree_util as jtu

def train_step(state, batch):
    ...

def initialization(model, learning_rate, input_size, seed, weight_decay):
    # keep this function as written
    ...

# create the init functions -- assume we have a list of learning rates, seeds, and weight decays
initializers = [
    jtu.Partial(initialization, model=model, learning_rate=lr, input_size=input_size, seed=seed, weight_decay=wd)
    for lr, seed, wd in zip(learning_rates, seeds, weight_decays)
]

# vmap over the initializers: lax.switch picks the initializer matching each index
states = jax.vmap(jtu.Partial(jax.lax.switch, branches=initializers))(jnp.arange(len(learning_rates)))

# vectorise the stepping function over the first dimension of the states
fn_step = jax.vmap(train_step, in_axes=(0, None))
```

This is quite a crude example, but I think you can play around with it to achieve what you are asking for.

Note: I did not test this code, but I think the idea is clear.
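To make this concrete, here is a self-contained sketch of the same pattern. Everything specific in it (the toy flax model, the MSE loss, the hyperparameter grids) is made up for illustration and is not from either post. Two details differ from the snippet above: each run's learning rate and weight decay are baked into its own `optax.adamw` transformation rather than stored in its state, so the train step also dispatches on the run index with `jax.lax.switch`; and each run is kept as a plain `(params, opt_state)` tuple rather than a flax `TrainState`, since `lax.switch` requires every branch to return exactly the same pytree structure and a `TrainState` carries its optimizer as a static field.

```python
import jax
import jax.numpy as jnp
import jax.tree_util as jtu
import flax.linen as nn
import optax


# Toy model and hyperparameter grid -- placeholders, not the original setup.
class MLP(nn.Module):
    @nn.compact
    def __call__(self, x):
        return nn.Dense(1)(nn.relu(nn.Dense(32)(x)))


model = MLP()
input_size = 8
learning_rates = [1e-3, 1e-2, 1e-1]
weight_decays = [1e-4, 1e-4, 1e-3]
seeds = [0, 1, 2]

# One optax transformation per run; lr and weight decay are baked into it.
optimizers = [optax.adamw(lr, weight_decay=wd)
              for lr, wd in zip(learning_rates, weight_decays)]


def initialization(model, tx, input_size, seed):
    """Create one run's parameters and optimizer state."""
    params = model.init(jax.random.PRNGKey(seed), jnp.ones((1, input_size)))
    return params, tx.init(params)


def make_train_step(model, tx):
    """Build a step function with this run's optimizer closed over."""
    def train_step(params, opt_state, batch):
        x, y = batch

        def loss_fn(p):
            return jnp.mean((model.apply(p, x) - y) ** 2)

        grads = jax.grad(loss_fn)(params)
        updates, opt_state = tx.update(grads, opt_state, params)
        return optax.apply_updates(params, updates), opt_state

    return train_step


initializers = [
    jtu.Partial(initialization, model=model, tx=tx, input_size=input_size, seed=seed)
    for tx, seed in zip(optimizers, seeds)
]
step_fns = [make_train_step(model, tx) for tx in optimizers]

run_ids = jnp.arange(len(optimizers))

# lax.switch picks the initializer matching each run id; vmapping over the ids
# stacks the resulting (params, opt_state) pytrees along a leading "run" axis.
params, opt_states = jax.vmap(jtu.Partial(jax.lax.switch, branches=initializers))(run_ids)


# The step also dispatches on the run id so each run uses its own optimizer.
def step_one(run_id, params, opt_state, batch):
    return jax.lax.switch(run_id, step_fns, params, opt_state, batch)


# Vectorise over runs; the batch itself is shared across all runs.
fn_step = jax.vmap(step_one, in_axes=(0, 0, 0, None))

batch = (jnp.ones((16, input_size)), jnp.zeros((16, 1)))
params, opt_states = fn_step(run_ids, params, opt_states, batch)
```

One caveat: when the switch index is batched by `vmap`, JAX evaluates all branches and selects the results, so the work grows with the number of runs times the number of branches.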
-
Hi,
I made a script to perform a grid search over hyperparameters using vmap. I want to pass parameters such as the seed, weight decay, and learning rate to train a model using optax and flax. This works fine for variables like the seed that are not changed inside vmap. However, the learning rate is modified inside the optax optimizer, which results in a side effect.
How can you pass variables to vmap that are changed during execution? Is this even possible?
My code is as follows:
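As an illustration of what is being asked (not the actual script; the model, loss, and grid values below are placeholders), one way to make the learning rate vmap-friendly is `optax.inject_hyperparams`, which moves it and the weight decay into the optimizer state so they can be passed to `jax.vmap` as ordinary batched arrays instead of being baked into the transformation:

```python
import jax
import jax.numpy as jnp
import flax.linen as nn
import optax


# Toy model -- a placeholder, not the actual script.
class MLP(nn.Module):
    @nn.compact
    def __call__(self, x):
        return nn.Dense(1)(nn.relu(nn.Dense(32)(x)))


model = MLP()
input_size = 8

# inject_hyperparams stores learning_rate and weight_decay in the optimizer
# state as arrays, so they are no longer fixed at construction time.
tx = optax.inject_hyperparams(optax.adamw)(learning_rate=1e-3, weight_decay=1e-4)


def init_run(seed, learning_rate, weight_decay):
    params = model.init(jax.random.PRNGKey(seed), jnp.ones((1, input_size)))
    opt_state = tx.init(params)
    # Overwrite the placeholder hyperparameters with this run's values.
    opt_state.hyperparams['learning_rate'] = learning_rate
    opt_state.hyperparams['weight_decay'] = weight_decay
    return params, opt_state


def train_step(params, opt_state, batch):
    x, y = batch

    def loss_fn(p):
        return jnp.mean((model.apply(p, x) - y) ** 2)

    grads = jax.grad(loss_fn)(params)
    # update() reads learning_rate/weight_decay from opt_state.hyperparams,
    # so each run in the vmapped batch uses its own values.
    updates, opt_state = tx.update(grads, opt_state, params)
    return optax.apply_updates(params, updates), opt_state


seeds = jnp.arange(3)
learning_rates = jnp.array([1e-3, 1e-2, 1e-1])
weight_decays = jnp.array([1e-4, 1e-4, 1e-3])

params, opt_states = jax.vmap(init_run)(seeds, learning_rates, weight_decays)
fn_step = jax.vmap(train_step, in_axes=(0, 0, None))

batch = (jnp.ones((16, input_size)), jnp.zeros((16, 1)))
params, opt_states = fn_step(params, opt_states, batch)
```

With this approach there is no need for `lax.switch`: all runs share one transformation, and the per-run hyperparameters travel in the batched optimizer state.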