@@ -83,6 +83,7 @@ def should_end(self) -> bool:
         return bool(self.t[0] > 0)

     def _signal_handler(self, signum, frame):
+        print("Signal received")
         self.t[0] = 1


@@ -152,6 +153,8 @@ def float_list(x):
                        help='file where to save the dllogger log from the experiment')
     train.add_argument('--workspace', type=str, default='./',
                        help='path to directory where results will be stored')
+    train.add_argument('--logdir', type=str, default=None,
+                       help="path to directory where logs will be stored")
     train.add_argument('--no-metrics', action='store_true',
                        help='do not calculate evaluation metrics (for benchmarking)')
     train.add_argument('--benchmark-iters', type=int, default=None,
@@ -199,13 +202,18 @@ def load_model(args, model):
     file = list(glob.glob(
         f"{args.workspace}/{args.model_prefix}_*.params"))
     if len(file) == 0:
-        return 0
+        return -1
+
+    file = [x for x in sorted(file) if "best.params" not in x]
+
+    if len(file) == 0:
+        return -1

-    file = [x for x in sorted(file) if "best.params" not in x][-1]
+    file = file[-1]

     epoch = re.match(f".*{args.model_prefix}_([0-9]*)\.params", file)
     if epoch is None:
-        return 0
+        return -1

     epoch = int(epoch.group(1))
     model.load_parameters(file)
@@ -427,6 +435,8 @@ def transform_data(images, labels):

             durations.append(time.time() - tic)
             tic = time.time()
+        else:
+            break

     durations = durations[min(len(durations) // 10, 100):]
     dllogger_epoch_data = {
@@ -453,8 +463,8 @@ def transform_data(images, labels):
         accuracy = score.get('accuracy', -1)
         save_checkpoint(net, epoch, accuracy, best_accuracy,
                         model_prefix, args.workspace,
-                        args.save_frequency, kvstore,
-                        force_save=should_break)
+                        args.save_frequency if args.mode == "train_val" else -1,
+                        kvstore, force_save=should_break)
         best_accuracy = max(best_accuracy, accuracy)
         global_metrics.update_dict(dllogger_epoch_data)
         dllogger.log(step=(epoch,), data=dllogger_epoch_data)
@@ -473,6 +483,11 @@ def fit(args, model, data_loader):
     # select gpu for horovod process
     if 'horovod' in args.kv_store:
         args.gpus = [args.gpus[hvd.local_rank()]]
+        ctx = mx.gpu(hvd.local_rank())
+
+        tensor1 = mx.nd.zeros(shape=(1,), dtype='float32', ctx=ctx)
+        tensor2 = mx.nd.zeros(shape=(1,), dtype='float32', ctx=ctx)
+        tensor1, tensor2 = hvd.grouped_allreduce([tensor1, tensor2])

     if args.amp:
         amp.init()
@@ -516,11 +531,12 @@ def fit(args, model, data_loader):
             tic = time.time()
         return

-    start_epoch = load_model(args, model)
+    start_epoch = load_model(args, model) + 1
     if start_epoch == 0:
         # all initializers should be specified in the model definition.
         # if not, this will raise an error
         model.initialize(mx.init.Initializer())
+    logging.info(f"starting epoch {start_epoch}")

     # devices for training
     devs = list(map(mx.gpu, args.gpus))
@@ -598,7 +614,7 @@ def fit(args, model, data_loader):
             optimizer=args.optimizer,
             optimizer_params=optimizer_params,
             lr_scheduler=lr_scheduler,
-            model_prefix=os.path.join(args.workspace, args.model_prefix),
+            model_prefix=args.model_prefix,
        )
    elif args.mode == 'val':
        for epoch in range(args.num_epochs):  # loop for benchmarking
0 commit comments