Fatal Python error: Aborted #3165

GBX-Engineer · 2024-03-21T08:07:08Z

Describe the bug

After successfully installing Flower, I followed the example and ran it, which produced this error. (Versions: python=1.10.0, pytorch=2.2.1, Flower=1.7.0)

Steps/Code to Reproduce

from collections import OrderedDict
from typing import Dict, List, Optional, Tuple

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, random_split
from torchvision.datasets import CIFAR10

import flwr as fl

# Device used for training and evaluation; try "cuda" to train on GPU.
DEVICE = torch.device("cpu")
# Use the `__version__` string attributes: `torch.version` is a submodule,
# so formatting it would print a module repr rather than a version number.
print(
    f"Training on {DEVICE} using PyTorch {torch.__version__} "
    f"and Flower {fl.__version__}"
)

# Number of simulated clients; the training set is split into this many parts.
NUM_CLIENTS = 10

def load_datasets(num_clients: int):
    """Download CIFAR-10 and build per-client train/validation loaders.

    The training set is split into ``num_clients`` partitions to simulate
    different local datasets. Each partition is then split again into a
    training subset and a validation subset (10 % of the partition).

    Args:
        num_clients: Number of partitions (simulated clients) to create.

    Returns:
        A tuple ``(trainloaders, valloaders, testloader)`` where the first two
        are lists with one ``DataLoader`` per client and the last is a single
        ``DataLoader`` over the CIFAR-10 test set.
    """
    # Download and transform CIFAR-10 (train and test)
    transform = transforms.Compose(
        [transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]
    )
    trainset = CIFAR10("./dataset", train=True, download=True, transform=transform)
    testset = CIFAR10("./dataset", train=False, download=True, transform=transform)

    # random_split needs integer lengths that sum to len(trainset). Spread the
    # remainder over the first partitions so any client count works, not only
    # counts that divide the dataset size evenly.
    base_size, remainder = divmod(len(trainset), num_clients)
    partition_lengths = [
        base_size + (1 if index < remainder else 0) for index in range(num_clients)
    ]
    partitions = random_split(
        trainset, partition_lengths, torch.Generator().manual_seed(42)
    )

    # Split each partition into train/val and create DataLoader
    trainloaders = []
    valloaders = []
    for partition in partitions:
        len_val = len(partition) // 10  # 10 % validation set
        len_train = len(partition) - len_val
        ds_train, ds_val = random_split(
            partition, [len_train, len_val], torch.Generator().manual_seed(42)
        )
        trainloaders.append(DataLoader(ds_train, batch_size=32, shuffle=True))
        valloaders.append(DataLoader(ds_val, batch_size=32))
    testloader = DataLoader(testset, batch_size=32)
    return trainloaders, valloaders, testloader

# Build the loaders once at module level; client_fn indexes into
# trainloaders/valloaders by client id.
trainloaders, valloaders, testloader = load_datasets(NUM_CLIENTS)

class Net(nn.Module):
    """Small convolutional network: two conv/pool stages and three linear layers.

    The first linear layer expects ``16 * 5 * 5`` features, which is what the
    two 5x5 convolutions and 2x2 poolings produce for 3-channel 32x32 inputs.
    The final layer emits 10 logits.
    """

    def __init__(self) -> None:
        # Must be the real constructor (`__init__`), otherwise the layers are
        # never registered and forward() fails on the first attribute access.
        super().__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run the forward pass and return the raw (un-normalised) logits."""
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        # Flatten the feature maps to (batch, 16 * 5 * 5) for the linear layers.
        x = x.view(-1, 16 * 5 * 5)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

def get_parameters(net) -> List[np.ndarray]:
    """Return every tensor in ``net.state_dict()`` as a NumPy array.

    The arrays are returned in state_dict order, which is the order
    ``set_parameters`` relies on when pairing them back with their keys.
    """
    return [tensor.cpu().numpy() for tensor in net.state_dict().values()]

def set_parameters(net, parameters: List[np.ndarray]):
    """Load a list of NumPy arrays into ``net`` in state_dict order.

    Args:
        net: Module whose ``state_dict`` keys define the expected order.
        parameters: Arrays to load, paired positionally with those keys.

    Raises:
        RuntimeError: Propagated from ``load_state_dict(strict=True)`` when
            keys are missing or shapes do not match.
    """
    keys = net.state_dict().keys()
    # torch.tensor keeps each array's dtype and shape. torch.Tensor(v) would
    # force float32 and misread 0-d arrays (e.g. BatchNorm's
    # num_batches_tracked) as a size argument.
    state_dict = OrderedDict(
        (key, torch.tensor(array)) for key, array in zip(keys, parameters)
    )
    net.load_state_dict(state_dict, strict=True)

def train(net, trainloader, epochs: int):
    """Train the network on the training set.

    Uses cross-entropy loss with an Adam optimizer (default settings) and
    prints the loss and accuracy after every epoch.

    Args:
        net: Model to train; updated in place.
        trainloader: Iterable of ``(images, labels)`` batches with a
            ``dataset`` attribute used to normalise the epoch loss.
        epochs: Number of passes over ``trainloader``.
    """
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(net.parameters())
    net.train()
    for epoch in range(epochs):
        correct, total, epoch_loss = 0, 0, 0.0
        for images, labels in trainloader:
            images, labels = images.to(DEVICE), labels.to(DEVICE)
            optimizer.zero_grad()
            outputs = net(images)
            # Reuse the outputs already computed instead of running a second
            # forward pass just for the loss.
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            # Metrics: accumulate a plain float so the autograd graph of each
            # batch is not kept alive through the running sum.
            epoch_loss += loss.item()
            total += labels.size(0)
            correct += (torch.max(outputs.data, 1)[1] == labels).sum().item()
        epoch_loss /= len(trainloader.dataset)
        epoch_acc = correct / total
        print(f"Epoch {epoch+1}: train loss {epoch_loss}, accuracy {epoch_acc}")

def test(net, testloader):
    """Evaluate the network on the entire test set.

    Args:
        net: Model to evaluate; switched to eval mode.
        testloader: Iterable of ``(images, labels)`` batches with a
            ``dataset`` attribute used to normalise the summed loss.

    Returns:
        A tuple ``(loss, accuracy)``: the summed per-batch cross-entropy loss
        divided by the dataset size, and the fraction of correct predictions.
    """
    criterion = torch.nn.CrossEntropyLoss()
    correct, total, loss = 0, 0, 0.0
    net.eval()
    with torch.no_grad():
        for images, labels in testloader:
            images, labels = images.to(DEVICE), labels.to(DEVICE)
            outputs = net(images)
            loss += criterion(outputs, labels).item()
            # Index of the largest logit per sample is the predicted class.
            predicted = outputs.argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    loss /= len(testloader.dataset)
    accuracy = correct / total
    return loss, accuracy

class FlowerClient(fl.client.NumPyClient):
    """Flower NumPy client holding one model and its local data loaders."""

    def __init__(self, cid, net, trainloader, valloader):
        # Must be the real constructor (`__init__`); client_fn passes these
        # four arguments when it instantiates the client.
        self.cid = cid
        self.net = net
        self.trainloader = trainloader
        self.valloader = valloader

    def get_parameters(self, config):
        """Return the current local model parameters as NumPy arrays."""
        print(f"[Client {self.cid}] get_parameters")
        return get_parameters(self.net)

    def fit(self, parameters, config):
        """Load the given parameters, train for one epoch, return the update.

        Returns:
            ``(parameters, num_examples, metrics)``.
        """
        print(f"[Client {self.cid}] fit, config: {config}")
        set_parameters(self.net, parameters)
        train(self.net, self.trainloader, epochs=1)
        # Report the number of training examples, not the number of batches
        # (len(DataLoader)), so the count reflects how much data was used.
        return get_parameters(self.net), len(self.trainloader.dataset), {}

    def evaluate(self, parameters, config):
        """Load the given parameters and evaluate on the local validation set.

        Returns:
            ``(loss, num_examples, metrics)`` with accuracy in ``metrics``.
        """
        print(f"[Client {self.cid}] evaluate, config: {config}")
        set_parameters(self.net, parameters)
        loss, accuracy = test(self.net, self.valloader)
        # Same as in fit(): count examples rather than batches.
        return (
            float(loss),
            len(self.valloader.dataset),
            {"accuracy": float(accuracy)},
        )

def client_fn(cid) -> FlowerClient:
    """Create the Flower client for the given client id.

    Args:
        cid: Client id; converted with ``int`` and used as the index into the
            module-level ``trainloaders`` and ``valloaders`` lists.

    Returns:
        A ``FlowerClient`` with a freshly initialised ``Net`` on ``DEVICE``
        and that client's train/validation loaders.
    """
    partition_index = int(cid)  # parse once, use for both loader lists
    net = Net().to(DEVICE)
    trainloader = trainloaders[partition_index]
    valloader = valloaders[partition_index]
    return FlowerClient(cid, net, trainloader, valloader)

# Create FedAvg strategy
strategy = fl.server.strategy.FedAvg(
    fraction_fit=1.0,  # Sample 100% of available clients for training
    fraction_evaluate=0.5,  # Sample 50% of available clients for evaluation
    min_fit_clients=10,  # Never sample less than 10 clients for training
    min_evaluate_clients=5,  # Never sample less than 5 clients for evaluation
    min_available_clients=10,  # Wait until all 10 clients are available
)

# Specify the resources each of your clients need. By default, each
# client will be allocated 1x CPU and 0x GPUs.
# (An earlier `client_resources = None` / `{"num_gpus": 1}` assignment was
# dropped: it was always overwritten here before being used.)
client_resources = {"num_cpus": 1, "num_gpus": 0.0}
if DEVICE.type == "cuda":
    # here we are assigning an entire GPU for each client.
    client_resources = {"num_cpus": 1, "num_gpus": 1.0}
    # Refer to our documentation for more details about Flower Simulations
    # and how to setup these client_resources.

# Start simulation
fl.simulation.start_simulation(
    client_fn=client_fn,
    num_clients=NUM_CLIENTS,
    config=fl.server.ServerConfig(num_rounds=5),
    strategy=strategy,
    client_resources=client_resources,
)

Expected Results

Solve this problem

Actual Results

None

The text was updated successfully, but these errors were encountered:

jafermarq · 2024-03-21T08:32:38Z

@GBX-Engineer, what example were you following?

GBX-Engineer added the bug Something isn't working label Mar 21, 2024

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Fatal Python error: Aborted #3165

Fatal Python error: Aborted #3165

GBX-Engineer commented Mar 21, 2024

jafermarq commented Mar 21, 2024

Fatal Python error: Aborted #3165

Fatal Python error: Aborted #3165

Comments

GBX-Engineer commented Mar 21, 2024

Describe the bug

Steps/Code to Reproduce

Create FedAvg strategy

Specify the resources each of your clients need. By default, each

client will be allocated 1x CPU and 0x GPUs

Start simulation

Expected Results

Actual Results

jafermarq commented Mar 21, 2024