[Cherry-pick] Apply IOU to test_parallel_executor_seresnext_base_gpu (#43925)

* [Cherry-pick] Apply IOU to test_parallel_executor_seresnext_base_gpu (#43812)
1. Fix the conflict between #43812 and the current release/2.3 branch
2. test_parallel_executor_seresnext_base_gpu failed on 2 P100 GPUs with the `470.82` driver.
zhhsplendid committed Jun 30, 2022
1 parent 83520fd commit fde34eb
Showing 11 changed files with 243 additions and 163 deletions.
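The core change, visible in parallel_executor_test_base.py below, replaces exact first/last-loss checks with an area-below-loss comparison: half weight on the first and last fetched losses, full weight on each intermediate one, i.e. a trapezoidal approximation of the area under the loss curve with unit step. A minimal, runnable sketch of that accumulation (a standalone helper, not the test's literal code):

    import numpy as np

    def area_below_loss(losses):
        # Trapezoidal rule with unit step between iterations, mirroring the
        # accumulation in check_network_convergence: 0.5 * first loss plus
        # every intermediate loss plus 0.5 * last loss.
        losses = np.asarray(losses, dtype=np.float64)
        return 0.5 * losses[0] + losses[1:-1].sum() + 0.5 * losses[-1]

    # Example with a made-up decaying loss curve:
    print(area_below_loss([1.0, 0.8, 0.65, 0.55, 0.5]))  # 2.75

Comparing areas tolerates small per-iteration jitter between two otherwise equivalent runs, which is presumably what made the exact loss comparison fail on the 2x P100 setup noted above.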
35 changes: 24 additions & 11 deletions python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
@@ -32,6 +32,7 @@


class TestParallelExecutorBase(unittest.TestCase):

@classmethod
def check_network_convergence(cls,
method,
@@ -52,6 +53,7 @@ def check_network_convergence(cls,
optimizer=fluid.optimizer.Adam,
use_fast_executor=False,
enable_sequential_execution=False):

def run_executor(exe, binary, feed, fetch_list):
if feed_data_reader is None:
res = exe.run(binary, feed=feed, fetch_list=fetch_list)
@@ -66,8 +68,8 @@ def run_executor(exe, binary, feed, fetch_list):
feed_data_reader, FeedDataReader
), "feed_data_reader must be type of FeedDataReader"

paddle.seed(1)
paddle.framework.random._manual_program_seed(1)
paddle.seed(0)
paddle.framework.random._manual_program_seed(0)
main = fluid.Program()
startup = fluid.Program()

@@ -101,28 +103,39 @@ def run_executor(exe, binary, feed, fetch_list):
) if use_device == DeviceType.XPU else int(
os.environ.get('CPU_NUM', multiprocessing.cpu_count()))

area_below_loss = 0
begin = time.time()
first_loss, = run_executor(
exe=exe, binary=binary, feed=feed_dict, fetch_list=[loss.name])
first_loss, = run_executor(exe=exe,
binary=binary,
feed=feed_dict,
fetch_list=[loss.name])
area_below_loss += 0.5 * first_loss.mean()
for _ in range(iter):
run_executor(exe=exe, binary=binary, feed=feed_dict, fetch_list=[])
last_loss, = run_executor(
exe=exe, binary=binary, feed=feed_dict, fetch_list=[loss.name])
mid_loss = run_executor(exe=exe,
binary=binary,
feed=feed_dict,
fetch_list=[loss.name])
area_below_loss += mid_loss[0].mean()
last_loss, = run_executor(exe=exe,
binary=binary,
feed=feed_dict,
fetch_list=[loss.name])
area_below_loss += 0.5 * last_loss.mean()
end = time.time()

if batch_size is not None:
print("%.4f Instance per second" % (
(batch_size * iter + 2) / (end - begin)))
print("%.4f Instance per second" % ((batch_size * iter + 2) /
(end - begin)))

avg_last_loss_val = np.array(last_loss).mean()
avg_first_loss_val = np.array(first_loss).mean()
if math.isnan(float(avg_last_loss_val)) or math.isnan(
float(avg_first_loss_val)):
sys.exit("got NaN loss, training failed.")

print(first_loss, last_loss)
print(first_loss, last_loss, area_below_loss)
# self.assertGreater(first_loss[0], last_loss[0])
return first_loss, last_loss
return first_loss, last_loss, area_below_loss

@classmethod
def check_pass_conflict(cls,
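Since check_network_convergence now returns three values, each caller in the files below unpacks a third element, discarding it as `_` where only the endpoint losses are compared. A schematic call (this exact argument combination is illustrative, not taken from the diff):

    # `_` receives the loss-curve area when the caller does not need it.
    first_loss, last_loss, _ = self.check_network_convergence(
        simple_fc_net,                    # a model function, as in the tests below
        feed_dict={"image": img, "label": label},
        use_device=DeviceType.CUDA,
        optimizer=fluid.optimizer.Adam)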
18 changes: 12 additions & 6 deletions python/paddle/fluid/tests/unittests/seresnext_test_base.py
@@ -21,6 +21,7 @@


class TestResnetBase(TestParallelExecutorBase):

def _compare_result_with_origin_model(self,
check_func,
use_device,
@@ -29,7 +30,7 @@ def _compare_result_with_origin_model(self,
if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
return

func_1_first_loss, func_1_last_loss = self.check_network_convergence(
func_1_first_loss, func_1_last_loss, func_1_loss_area = self.check_network_convergence(
seresnext_net.model,
feed_dict=seresnext_net.feed_dict(use_device),
iter=seresnext_net.iter(use_device),
@@ -38,7 +39,7 @@ def _compare_result_with_origin_model(self,
use_reduce=False,
optimizer=seresnext_net.optimizer)

func_2_first_loss, func_2_last_loss = check_func(
func_2_first_loss, func_2_last_loss, func_2_loss_area = check_func(
seresnext_net.model,
feed_dict=seresnext_net.feed_dict(use_device),
iter=seresnext_net.iter(use_device),
@@ -51,7 +52,12 @@ def _compare_result_with_origin_model(self,
for loss in zip(func_1_last_loss, func_2_last_loss):
self.assertAlmostEquals(loss[0], loss[1], delta=delta2)
else:
self.assertAlmostEquals(
np.mean(func_1_first_loss), func_2_first_loss[0], delta=1e-5)
self.assertAlmostEquals(
np.mean(func_1_last_loss), func_2_last_loss[0], delta=delta2)
np.testing.assert_allclose(func_1_loss_area,
func_2_loss_area,
rtol=delta2)
self.assertAlmostEquals(np.mean(func_1_first_loss),
func_2_first_loss[0],
delta=1e-5)
self.assertAlmostEquals(np.mean(func_1_last_loss),
func_2_last_loss[0],
delta=delta2)
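Note that the new loss-area assertion uses np.testing.assert_allclose with a relative tolerance (rtol), whereas the endpoint checks keep assertAlmostEquals-style absolute deltas. A small numeric illustration with made-up values:

    import numpy as np

    a, b = 2.75, 2.76
    # Relative check: passes when |a - b| <= atol + rtol * |b|; atol defaults to 0.
    np.testing.assert_allclose(a, b, rtol=1e-2)  # 0.01 <= 0.0276, passes
    # Absolute check, assertAlmostEqual-style: |a - b| <= delta.
    assert abs(a - b) <= 1e-2                    # also passes for these values

A relative tolerance scales with the magnitude of the accumulated area, so the same delta2 stays meaningful whether the curve integrates to 3 or 300.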
30 changes: 16 additions & 14 deletions python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py
@@ -26,6 +26,7 @@


class TestFuseAllReduceOpsBase(TestParallelExecutorBase):

@classmethod
def setUpClass(cls):
os.environ['CPU_NUM'] = str(4)
@@ -47,15 +48,15 @@ def compare_fuse_all_reduce_ops(self,
img, label = init_feed_dict()
feed_dict_data = {"image": img, "label": label}

not_fuse_op_first_loss, not_fuse_op_last_loss = self.check_network_convergence(
not_fuse_op_first_loss, not_fuse_op_last_loss, _ = self.check_network_convergence(
model,
feed_dict=feed_dict_data,
get_data_from_feeder=get_data_from_feeder,
use_device=use_device,
fuse_all_reduce_ops=False,
fuse_all_optimizer_ops=fuse_all_optimizer_ops,
optimizer=optimizer)
fuse_op_first_loss, fuse_op_last_loss = self.check_network_convergence(
fuse_op_first_loss, fuse_op_last_loss, _ = self.check_network_convergence(
model,
feed_dict=feed_dict_data,
get_data_from_feeder=get_data_from_feeder,
@@ -77,13 +78,13 @@ def optimizer(self, learning_rate=1e-3):


class TestFuseAllReduceOps(TestFuseAllReduceOpsBase):

def _decorate_compare_fused_all_reduce(self, model, use_device):
self.compare_fuse_all_reduce_ops(
model,
use_device,
init_feed_dict=init_data,
optimizer=self.optimizer,
fuse_all_optimizer_ops=True)
self.compare_fuse_all_reduce_ops(model,
use_device,
init_feed_dict=init_data,
optimizer=self.optimizer,
fuse_all_optimizer_ops=True)

def test_simple_fc_with_fuse_all_reduce(self):
self._decorate_compare_fused_all_reduce(simple_fc_net, DeviceType.CUDA)
@@ -101,16 +102,17 @@ def test_batchnorm_fc_with_fuse_all_reduce(self):


class TestFuseAllReduceOpsAndOptiOps(TestFuseAllReduceOps):

def _decorate_compare_fused_all_reduce(self, model, use_device):
self.compare_fuse_all_reduce_ops(
model,
use_device,
init_feed_dict=init_data,
optimizer=self.optimizer,
fuse_all_optimizer_ops=True)
self.compare_fuse_all_reduce_ops(model,
use_device,
init_feed_dict=init_data,
optimizer=self.optimizer,
fuse_all_optimizer_ops=True)


class TestFuseAllReduceOpsWithSparseGrad(TestFuseAllReduceOpsBase):

@classmethod
def setUpClass(cls):
os.environ['CPU_NUM'] = str(4)
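The fuse-pass files in this commit all follow one A/B pattern: run training with the pass disabled, run it again with the pass enabled, and assert the losses match. A condensed sketch of that pattern under the new three-value return (the method name and tolerance here are hypothetical, not from the diff):

    def _compare_with_and_without_pass(self, model, use_device, feed_dict,
                                       optimizer):
        # Baseline run: pass disabled; the loss-curve area is unused here.
        base_first, base_last, _ = self.check_network_convergence(
            model, feed_dict=feed_dict, use_device=use_device,
            fuse_all_reduce_ops=False, optimizer=optimizer)
        # Candidate run: the only difference is the pass being enabled.
        fuse_first, fuse_last, _ = self.check_network_convergence(
            model, feed_dict=feed_dict, use_device=use_device,
            fuse_all_reduce_ops=True, optimizer=optimizer)
        # Enabling the pass must not change training results.
        for a, b in zip(base_first, fuse_first):
            self.assertAlmostEqual(a, b, delta=1e-6)
        for a, b in zip(base_last, fuse_last):
            self.assertAlmostEqual(a, b, delta=1e-6)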
@@ -21,6 +21,7 @@


class TestMNIST(TestParallelExecutorBase):

@classmethod
def setUpClass(cls):
os.environ['CPU_NUM'] = str(4)
@@ -41,19 +42,23 @@ def _optimizer(learning_rate=1e-6):
# FIXME (liuwei12)
# the new memory optimize strategy will crash this unittest
# add enable_inplace=False here to force pass the unittest
not_fuse_op_first_loss, not_fuse_op_last_loss = self.check_network_convergence(
not_fuse_op_first_loss, not_fuse_op_last_loss, _ = self.check_network_convergence(
model,
feed_dict={"image": img,
"label": label},
feed_dict={
"image": img,
"label": label
},
use_device=use_device,
fuse_elewise_add_act_ops=False,
use_ir_memory_optimize=False,
enable_inplace=False,
optimizer=_optimizer)
fuse_op_first_loss, fuse_op_last_loss = self.check_network_convergence(
fuse_op_first_loss, fuse_op_last_loss, _ = self.check_network_convergence(
model,
feed_dict={"image": img,
"label": label},
feed_dict={
"image": img,
"label": label
},
use_device=use_device,
fuse_elewise_add_act_ops=True,
use_ir_memory_optimize=False,
81 changes: 48 additions & 33 deletions python/paddle/fluid/tests/unittests/test_fuse_optimizer_pass.py
@@ -24,6 +24,7 @@


class TestFuseOptimizationOps(TestParallelExecutorBase):

@classmethod
def setUpClass(cls):
os.environ['CPU_NUM'] = str(4)
@@ -41,14 +42,14 @@ def _compare_fused_optimizer_ops(self,
if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
return

not_fuse_op_first_loss, not_fuse_op_last_loss = self.check_network_convergence(
not_fuse_op_first_loss, not_fuse_op_last_loss, _ = self.check_network_convergence(
model,
feed_dict=feed_dict,
get_data_from_feeder=get_data_from_feeder,
use_device=use_device,
fuse_all_optimizer_ops=False,
optimizer=optimizer)
fuse_op_first_loss, fuse_op_last_loss = self.check_network_convergence(
fuse_op_first_loss, fuse_op_last_loss, _ = self.check_network_convergence(
model,
feed_dict=feed_dict,
get_data_from_feeder=get_data_from_feeder,
@@ -63,36 +64,41 @@ def _compare_fused_optimizer_ops(self,

def _decorate_compare_fused_optimizer_ops(self, model, use_device,
optimizer):
self._compare_fused_optimizer_ops(
model,
use_device,
feed_dict=self._get_feed_dict(),
optimizer=optimizer)
self._compare_fused_optimizer_ops(model,
use_device,
feed_dict=self._get_feed_dict(),
optimizer=optimizer)


class TestFuseAdamOps(TestFuseOptimizationOps):

def optimizer(self, learning_rate=1e-4):
return fluid.optimizer.Adam(learning_rate=learning_rate)

def test_batchnorm_fc_with_fuse_op(self):
self._decorate_compare_fused_optimizer_ops(
fc_with_batchnorm, DeviceType.CUDA, optimizer=self.optimizer)
self._decorate_compare_fused_optimizer_ops(
fc_with_batchnorm, DeviceType.CPU, optimizer=self.optimizer)
self._decorate_compare_fused_optimizer_ops(fc_with_batchnorm,
DeviceType.CUDA,
optimizer=self.optimizer)
self._decorate_compare_fused_optimizer_ops(fc_with_batchnorm,
DeviceType.CPU,
optimizer=self.optimizer)


class TestFuseSGDOps(TestFuseAdamOps):

def optimizer(self, learning_rate=1e-3):
return fluid.optimizer.SGD(learning_rate=learning_rate)


class TestFuseMomentumOps(TestFuseAdamOps):

def optimizer(self, learning_rate=1e-3):
return fluid.optimizer.Momentum(
learning_rate=learning_rate, momentum=0.1)
return fluid.optimizer.Momentum(learning_rate=learning_rate,
momentum=0.1)


class TestSpareFuseAdamOps(TestFuseOptimizationOps):

@classmethod
def setUpClass(cls):
os.environ['CPU_NUM'] = str(4)
@@ -120,24 +126,29 @@ def optimizer(self, learning_rate=1e-4):

def test_simple_bow_net_with_fuse_op(self):
model = partial(bow_net, dict_dim=self.word_dict_len, is_sparse=True)
self._decorate_compare_fused_optimizer_ops(
model, DeviceType.CUDA, optimizer=self.optimizer)
self._decorate_compare_fused_optimizer_ops(
model, DeviceType.CPU, optimizer=self.optimizer)
self._decorate_compare_fused_optimizer_ops(model,
DeviceType.CUDA,
optimizer=self.optimizer)
self._decorate_compare_fused_optimizer_ops(model,
DeviceType.CPU,
optimizer=self.optimizer)


class TestSpareFuseSGDOps(TestSpareFuseAdamOps):

def optimizer(self, learning_rate=1e-3):
return fluid.optimizer.SGD(learning_rate=learning_rate)


class TestSpareFuseMomentumOps(TestSpareFuseAdamOps):

def optimizer(self, learning_rate=1e-3):
return fluid.optimizer.Momentum(
learning_rate=learning_rate, momentum=0.1)
return fluid.optimizer.Momentum(learning_rate=learning_rate,
momentum=0.1)


class TestPassConflictBase(TestFuseAdamOps):

def _compare_fused_optimizer_ops(self,
model,
use_device,
@@ -147,36 +158,40 @@ def _compare_fused_optimizer_ops(self,
if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
return

self.check_pass_conflict(
model,
feed_dict=feed_dict,
get_data_from_feeder=get_data_from_feeder,
use_device=use_device,
fuse_all_optimizer_ops=True,
optimizer=optimizer,
enable_sequential_execution=True)
self.check_pass_conflict(model,
feed_dict=feed_dict,
get_data_from_feeder=get_data_from_feeder,
use_device=use_device,
fuse_all_optimizer_ops=True,
optimizer=optimizer,
enable_sequential_execution=True)


class TestFuseAdamOpsPassConflict(TestPassConflictBase):

def optimizer(self, learning_rate=1e-4):
return fluid.optimizer.Adam(learning_rate=learning_rate)

def test_batchnorm_fc_with_fuse_op(self):
self._decorate_compare_fused_optimizer_ops(
fc_with_batchnorm, DeviceType.CPU, optimizer=self.optimizer)
self._decorate_compare_fused_optimizer_ops(
fc_with_batchnorm, DeviceType.CUDA, optimizer=self.optimizer)
self._decorate_compare_fused_optimizer_ops(fc_with_batchnorm,
DeviceType.CPU,
optimizer=self.optimizer)
self._decorate_compare_fused_optimizer_ops(fc_with_batchnorm,
DeviceType.CUDA,
optimizer=self.optimizer)


class TestFuseSGDOpsPassConflict(TestFuseAdamOpsPassConflict):

def optimizer(self, learning_rate=1e-3):
return fluid.optimizer.SGD(learning_rate=learning_rate)


class TestFuseMomentumOpsPassConflict(TestFuseAdamOpsPassConflict):

def optimizer(self, learning_rate=1e-3):
return fluid.optimizer.Momentum(
learning_rate=learning_rate, momentum=0.1)
return fluid.optimizer.Momentum(learning_rate=learning_rate,
momentum=0.1)


if __name__ == '__main__':
