diff --git a/.jenkins/check/config/filter_linklint.txt b/.jenkins/check/config/filter_linklint.txt index e46649dcbd5033fa6557330c739195a5b9ccce3c..8f5359c3c654e5e2e34c64fdc6cab66ab26af89b 100644 --- a/.jenkins/check/config/filter_linklint.txt +++ b/.jenkins/check/config/filter_linklint.txt @@ -7,4 +7,11 @@ http://vllab.ucmerced.edu/wlai24/LapSRN/results/* https://github.com/tensorflow/tpu/blob/master/models/official/efficientnet/efficientnet_model.pyPaper https://www.mindspore.cn/install/en https://www.mindspore.cn/resources/hub/details/en?mindspore/1.3/resnest50_imagenet2012 -https://mindspore.cn/resources/hub/details/en?MindSpore/ascend/1.2/mobilenetv2_v1.2_imagenet2012 \ No newline at end of file +https://mindspore.cn/resources/hub/details/en?MindSpore/ascend/1.2/mobilenetv2_v1.2_imagenet2012 +https://arxiv.org/abs/1909.13719AugMix +https://arxiv.org/abs/1805.09501Learning +https://arxiv.org/abs/1805.09501policy +https://arxiv.org/abs/1906.11172RandAugment +https://arxiv.org/abs/2104.00298Acc +https://github.com/google/automl/tree/master/efficientnetv2paper +https://github.com/google-research/augmix/blob/master/imagenet.pyFrom \ No newline at end of file diff --git a/OWNERS b/OWNERS index 0cd9b621a7bfe427b94fe74b6f75ed0f01921904..8d29f4f1f05cb79f8c37835e41939722535c2667 100644 --- a/OWNERS +++ b/OWNERS @@ -10,3 +10,4 @@ approvers: - baochong - luoyang42 - wang_hua_2019 +- zhangyifan999 diff --git a/benchmark/ascend/bert/pretrain_config_Ascend_Boost.yaml b/benchmark/ascend/bert/pretrain_config_Ascend_Boost.yaml index 684780dcccff5a3ba4aa26e14039a5b8d4013030..9e4dbb8dc45b48b540e9e65eab01aee6c95decd2 100644 --- a/benchmark/ascend/bert/pretrain_config_Ascend_Boost.yaml +++ b/benchmark/ascend/bert/pretrain_config_Ascend_Boost.yaml @@ -13,7 +13,7 @@ enable_profiling: False # ============================================================================== description: 'run_pretrain' -distribute: 'true' +distribute: 'false' epoch_size: 40 device_id: 0 device_num: 1 @@ -21,16 +21,18 @@ enable_save_ckpt: 'false' enable_lossscale: 'true' do_shuffle: 'true' enable_data_sink: 'true' -data_sink_steps: 100 +data_sink_steps: 1 accumulation_steps: 1 allreduce_post_accumulation: 'true' -save_checkpoint_path: './' -load_checkpoint_path: '' +save_checkpoint_path: '' +load_checkpoint_path: '/home/bertlarge/Bert/msdata/new_ckpt.ckpt' save_checkpoint_steps: 10000 -train_steps: 17000 -save_checkpoint_num: 5 -data_dir: '' +train_steps: 6300 +save_checkpoint_num: 1 +data_dir: '/data4/PCL/new_train_data' schema_dir: '' +dataset_format: "tfrecord" +num_samples: None # optional; can be set by the user to limit the number of samples when bert_network is base # ============================================================================== # pretrain related @@ -39,19 +41,17 @@ batch_size: 32 bert_network: 'large_boost' loss_scale_value: 65536 scale_factor: 2 -scale_window: 1000 +scale_window: 6000 optimizer: 'Lamb' enable_global_norm: False # pretrain_eval related -train_with_eval: 'false' -eval_data_dir: "" +train_with_eval: 'true' +eval_data_dir: "/home/bertlarge/Bert/new_eval_data" schema_file: "" eval_ckpt: "" -eval_samples: 300000 +eval_samples: 150000 # bucket list, default: [] -bucket_list: [128, 256, 384, 512] -# use packed dataset and model, which is incompatible with bucket -use_packed: False +bucket_list: [1, 512] # optimizer related AdamWeightDecay: learning_rate: 0.00003 # 3e-5 @@ -63,15 +63,15 @@ AdamWeightDecay: warmup_steps: 10000 Lamb: - learning_rate: 0.00035 # 3.5e-4 + learning_rate: 0.0007 
end_learning_rate: 1.0e-9 - power: 1.2 + power: 1.8 warmup_steps: 0 weight_decay: 0.0166629 decay_filter: ['layernorm', 'bias'] - eps: 0.000001 # 1e-6, - beta1: 0.86 - beta2: 0.98 + eps: 0.000001 + beta1: 0.85 + beta2: 0.97 Momentum: learning_rate: 0.00002 # 2e-5 @@ -128,7 +128,7 @@ nezha_net_cfg: dtype: mstype.float32 compute_type: mstype.float16 # large -large_batch_size: 24 +large_batch_size: 25 large_net_cfg: seq_length: 512 vocab_size: 30522 @@ -146,7 +146,7 @@ large_net_cfg: dtype: mstype.float32 compute_type: mstype.float16 # Accelerated large network which is only supported in Ascend yet. -large_boost_batch_size: 24 +large_boost_batch_size: 25 large_boost_net_cfg: seq_length: 512 vocab_size: 30522 @@ -200,3 +200,4 @@ enable_lossscale: ["true", "false"] do_shuffle: ["true", "false"] enable_data_sink: ["true", "false"] allreduce_post_accumulation: ["true", "false"] +dataset_format: ["tfrecord", "mindrecord"] diff --git a/benchmark/ascend/bert/pretrain_eval.py b/benchmark/ascend/bert/pretrain_eval.py index edb10579c2e658dfc162a5de6f6640071c919607..83c66a4f35b48c1236cb727f857e6d901afcd02b 100644 --- a/benchmark/ascend/bert/pretrain_eval.py +++ b/benchmark/ascend/bert/pretrain_eval.py @@ -24,7 +24,7 @@ from mindspore.train.serialization import load_checkpoint, load_param_into_net from src.utils import BertMetric from src.model_utils.config import config as cfg, bert_net_cfg from src.bert_for_pre_training import BertPretrainEval -from src.dataset import create_eval_dataset +from src.dataset import create_eval_dataset, CreateEvalDatasetInput def bert_predict(): @@ -33,7 +33,8 @@ def bert_predict(): ''' devid = int(os.getenv('DEVICE_ID')) context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=devid) - dataset = create_eval_dataset(cfg.batch_size, 1, data_dir=cfg.eval_data_dir) + inputs = CreateEvalDatasetInput(batchsize=cfg.batch_size, device_num=1, data_dir=cfg.eval_data_dir) + dataset = create_eval_dataset(inputs) net_for_pretraining = BertPretrainEval(bert_net_cfg) net_for_pretraining.set_train(False) param_dict = load_checkpoint(cfg.eval_ckpt) diff --git a/benchmark/ascend/bert/run_pretrain.py b/benchmark/ascend/bert/run_pretrain.py index 2bd22fbb6ce62e98961f8455b5bdb4a40cf9b007..b0fa3e1d1f21ac6ec3f4e17e4030f5a0655b3640 100644 --- a/benchmark/ascend/bert/run_pretrain.py +++ b/benchmark/ascend/bert/run_pretrain.py @@ -1,4 +1,4 @@ -# Copyright 2020-2021 Huawei Technologies Co., Ltd +# Copyright 2020-2022 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
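The Lamb block above now starts at learning_rate 0.0007 and decays with power 1.8 toward end_learning_rate 1.0e-9, with no warmup. A minimal sketch of the implied schedule, assuming the usual linear-warmup-then-polynomial-decay rule used for BERT pretraining (the function name and the step counts in the example are illustrative, not part of this change):

def lamb_lr(step, total_steps, lr0=0.0007, lr_end=1.0e-9, power=1.8, warmup_steps=0):
    # Linear warmup (skipped here, since warmup_steps is 0), then polynomial decay to lr_end.
    if warmup_steps and step < warmup_steps:
        return lr0 * step / warmup_steps
    frac = min(step, total_steps) / total_steps
    return (lr0 - lr_end) * (1.0 - frac) ** power + lr_end

# With total_steps=6300 (the new train_steps): lamb_lr(0, 6300) -> 7.0e-4,
# lamb_lr(3150, 6300) -> ~2.0e-4, lamb_lr(6300, 6300) -> 1.0e-9.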
@@ -16,17 +16,20 @@ #################pre_train bert example on zh-wiki######################## python run_pretrain.py """ +import datetime import os import mindspore.communication.management as D from mindspore.communication.management import get_rank import mindspore.common.dtype as mstype from mindspore import context +from mindspore import ops, Tensor, nn from mindspore.train.model import Model from mindspore.context import ParallelMode from mindspore.nn.wrap.loss_scale import DynamicLossScaleUpdateCell from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor from mindspore.train.serialization import load_checkpoint, load_param_into_net from mindspore.train.train_thor import ConvertModelUtils +from mindspore.communication.management import GlobalComm from mindspore.nn.optim import Lamb, Momentum, AdamWeightDecay, thor from mindspore import log as logger from mindspore.common import set_seed @@ -35,9 +38,10 @@ from src import BertNetworkWithLoss, BertNetworkMatchBucket, \ BertTrainOneStepWithLossScaleCell, \ BertTrainAccumulationAllReduceEachWithLossScaleCell, \ BertTrainAccumulationAllReducePostWithLossScaleCell, \ + BertTrainOneStepWithLossScaleCellForAdam, \ BertPretrainEval, \ AdamWeightDecayForBert, AdamWeightDecayOp -from src.dataset import create_bert_dataset, create_eval_dataset +from src.dataset import create_bert_dataset, create_eval_dataset, CreateEvalDatasetInput from src.utils import LossCallBack, BertLearningRate, EvalCallBack, BertMetric from src.model_utils.config import config as cfg, bert_net_cfg from src.model_utils.moxing_adapter import moxing_wrapper @@ -45,11 +49,27 @@ from src.model_utils.device_adapter import get_device_id, get_device_num _current_dir = os.path.dirname(os.path.realpath(__file__)) +os.environ["GLOG_v"] = "1" +print(os.getenv("GLOG_v")) +print(os.getenv("P128")) + + +class AllreduceSync(nn.Cell): + def __init__(self): + super(AllreduceSync, self).__init__() + self.allreduce = ops.AllReduce(ops.ReduceOp.SUM, GlobalComm.WORLD_COMM_GROUP) + + def construct(self, x): + y = self.allreduce(x) + return y + + def _set_bert_all_reduce_split(): """set bert all_reduce fusion split, support num_hidden_layers is 12 and 24.""" device_target = context.get_context('device_target') enable_graph_kernel = context.get_context('enable_graph_kernel') device_num = context.get_auto_parallel_context('device_num') + print("device_num:", device_num) if bert_net_cfg.num_hidden_layers == 12: if bert_net_cfg.use_relative_positions: context.set_auto_parallel_context(all_reduce_fusion_config=[29, 58, 87, 116, 145, 174, 203, 217]) @@ -150,18 +170,6 @@ def _check_compute_type(args_opt): logger.warning(warning_message) -def _check_accumulation_steps(args_opt): - if args_opt.accumulation_steps > 1: - logger.info("accumulation steps: {}".format(args_opt.accumulation_steps)) - logger.info("global batch size: {}".format(args_opt.batch_size * cfg.accumulation_steps)) - if args_opt.enable_data_sink == "true": - args_opt.data_sink_steps *= cfg.accumulation_steps - logger.info("data sink steps: {}".format(args_opt.data_sink_steps)) - if args_opt.enable_save_ckpt == "true": - args_opt.save_checkpoint_steps *= cfg.accumulation_steps - logger.info("save checkpoint steps: {}".format(args_opt.save_checkpoint_steps)) - - def modelarts_pre_process(): '''modelarts pre process function.''' cfg.device_id = get_device_id() @@ -170,6 +178,40 @@ def modelarts_pre_process(): cfg.save_checkpoint_path = os.path.join(cfg.output_path, cfg.save_checkpoint_path) +def 
InitNetWithGrads(net_with_loss, optimizer): + '''init net with grads''' + if cfg.enable_lossscale == "true": + update_cell = DynamicLossScaleUpdateCell(loss_scale_value=cfg.loss_scale_value, + scale_factor=cfg.scale_factor, + scale_window=cfg.scale_window) + accumulation_steps = cfg.accumulation_steps + enable_global_norm = cfg.enable_global_norm + if accumulation_steps <= 1: + if cfg.optimizer == 'AdamWeightDecay' and cfg.device_target == 'GPU': + net_with_grads = BertTrainOneStepWithLossScaleCellForAdam(net_with_loss, optimizer=optimizer, + scale_update_cell=update_cell) + else: + net_with_grads = BertTrainOneStepWithLossScaleCell(net_with_loss, optimizer=optimizer, + scale_update_cell=update_cell) + else: + allreduce_post = cfg.distribute == "false" or cfg.allreduce_post_accumulation == "true" + net_with_accumulation = (BertTrainAccumulationAllReducePostWithLossScaleCell if allreduce_post else + BertTrainAccumulationAllReduceEachWithLossScaleCell) + net_with_grads = net_with_accumulation(net_with_loss, optimizer=optimizer, + scale_update_cell=update_cell, + accumulation_steps=accumulation_steps, + enable_global_norm=enable_global_norm) + else: + net_with_grads = BertTrainOneStepCell(net_with_loss, optimizer=optimizer, enable_clip_grad=True) + if cfg.optimizer == "Thor": + net_with_grads = BertTrainOneStepCell(net_with_loss, optimizer=optimizer, sens=cfg.Thor.loss_scale, + enable_clip_grad=False) + + if cfg.bucket_list: + net_with_grads = BertNetworkMatchBucket(net_with_grads, bert_net_cfg.seq_length, cfg.bucket_list) + return net_with_grads + + @moxing_wrapper(pre_process=modelarts_pre_process) def run_pretrain(): """pre-train bert_clue""" @@ -177,29 +219,39 @@ context.set_context(reserve_class_name_in_scope=False) _set_graph_kernel_context(cfg.device_target) ckpt_save_dir = cfg.save_checkpoint_path + rank = 0 + device_num = 1 if cfg.distribute == "true": if cfg.device_target == 'Ascend': D.init() device_num = cfg.device_num - rank = cfg.device_id % device_num + rank = int(os.getenv("RANK_ID")) + else: + D.init() + device_num = D.get_group_size() + rank = D.get_rank() ckpt_save_dir = os.path.join(cfg.save_checkpoint_path, 'ckpt_' + str(get_rank())) context.reset_auto_parallel_context() context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, device_num=device_num) _set_bert_all_reduce_split() - else: - rank = 0 - device_num = 1 - _check_compute_type(cfg) - _check_accumulation_steps(cfg) + print(cfg, flush=True) + if cfg.accumulation_steps > 1: + logger.info("accumulation steps: {}".format(cfg.accumulation_steps)) + logger.info("global batch size: {}".format(cfg.batch_size * cfg.accumulation_steps)) + if cfg.enable_data_sink == "true": + cfg.data_sink_steps *= cfg.accumulation_steps + logger.info("data sink steps: {}".format(cfg.data_sink_steps)) + if cfg.enable_save_ckpt == "true": + cfg.save_checkpoint_steps *= cfg.accumulation_steps + logger.info("save checkpoint steps: {}".format(cfg.save_checkpoint_steps)) ds = create_bert_dataset(device_num, rank, cfg.do_shuffle, cfg.data_dir, cfg.schema_dir, cfg.batch_size, - cfg.bucket_list, cfg.use_packed) + cfg.bucket_list, cfg.dataset_format, cfg.num_samples) net_with_loss = BertNetworkWithLoss(bert_net_cfg, True) - - new_repeat_count = cfg.epoch_size * ds.get_dataset_size() // cfg.data_sink_steps + new_repeat_count = cfg.epoch_size * ds.get_dataset_size() // cfg.data_sink_steps # e.g. 1,000,000 samples -> 1,000,000/100 sink repeats if cfg.train_steps > 0: train_steps = cfg.train_steps * cfg.accumulation_steps 
new_repeat_count = min(new_repeat_count, train_steps // cfg.data_sink_steps) @@ -220,43 +272,31 @@ def run_pretrain(): param_dict = load_checkpoint(cfg.load_checkpoint_path) load_param_into_net(net_with_loss, param_dict) - if cfg.enable_lossscale == "true": - update_cell = DynamicLossScaleUpdateCell(loss_scale_value=cfg.loss_scale_value, - scale_factor=cfg.scale_factor, - scale_window=cfg.scale_window) - accumulation_steps = cfg.accumulation_steps - enable_global_norm = cfg.enable_global_norm - if accumulation_steps <= 1: - net_with_grads = BertTrainOneStepWithLossScaleCell(net_with_loss, optimizer=optimizer, - scale_update_cell=update_cell) - else: - allreduce_post = cfg.distribute == "false" or cfg.allreduce_post_accumulation == "true" - net_with_accumulation = (BertTrainAccumulationAllReducePostWithLossScaleCell if allreduce_post else - BertTrainAccumulationAllReduceEachWithLossScaleCell) - net_with_grads = net_with_accumulation(net_with_loss, optimizer=optimizer, - scale_update_cell=update_cell, - accumulation_steps=accumulation_steps, - enable_global_norm=enable_global_norm) - else: - net_with_grads = BertTrainOneStepCell(net_with_loss, optimizer=optimizer, enable_clip_grad=True) - if cfg.optimizer == "Thor": - net_with_grads = BertTrainOneStepCell(net_with_loss, optimizer=optimizer, sens=cfg.Thor.loss_scale, - enable_clip_grad=False) - - if cfg.bucket_list: - net_with_grads = BertNetworkMatchBucket(net_with_grads, bert_net_cfg.seq_length, cfg.bucket_list) + net_with_grads = InitNetWithGrads(net_with_loss, optimizer) model = Model(net_with_grads) if cfg.train_with_eval == 'true': net_eval = BertPretrainEval(bert_net_cfg, network=net_with_loss.bert) - eval_ds = create_eval_dataset(cfg.batch_size, device_num, rank, cfg.eval_data_dir, - cfg.schema_dir, cfg.use_packed) + inputs = CreateEvalDatasetInput(batchsize=cfg.batch_size, device_num=device_num, rank=rank, + data_dir=cfg.eval_data_dir, schema_dir=cfg.schema_dir, + dataset_format=cfg.dataset_format, num_samples=cfg.num_samples) + eval_ds = create_eval_dataset(inputs) model = Model(net_with_grads, eval_network=net_eval, metrics={'bert_acc': BertMetric(cfg.batch_size)}) eval_callback = EvalCallBack(model, eval_ds, device_num * cfg.batch_size, cfg.eval_samples) callback.append(eval_callback) model = ConvertModelUtils().convert_to_thor_model(model, network=net_with_grads, optimizer=optimizer) + new_repeat_count = 160 + model.build(ds, eval_ds, cfg.data_sink_steps, new_repeat_count) + sync = AllreduceSync() + import numpy as np + sync(Tensor(np.ones(1), mstype.float32)) + print('using build>>>>>>>>>>>>>>>>>>>>>>>>>>') + + model.eval(eval_ds, dataset_sink_mode=(cfg.enable_data_sink == "true")) + print(f'new_repeat_count: {new_repeat_count}') + print(f'train start: {datetime.datetime.utcnow()}') model.train(new_repeat_count, ds, callbacks=callback, dataset_sink_mode=(cfg.enable_data_sink == "true"), sink_size=cfg.data_sink_steps) @@ -264,3 +304,4 @@ def run_pretrain(): if __name__ == '__main__': set_seed(0) run_pretrain() + print(">>>>>>>>>>>>>>>>>end") diff --git a/benchmark/ascend/bert/scripts/ascend_distributed_launcher/get_distribute_pretrain_cmd.py b/benchmark/ascend/bert/scripts/ascend_distributed_launcher/get_distribute_pretrain_cmd.py index c6c1945f7980e007e920e08fd4cd5ba16d787923..db85549506aba5795572eb22ba7d2453eb7085e7 100644 --- a/benchmark/ascend/bert/scripts/ascend_distributed_launcher/get_distribute_pretrain_cmd.py +++ b/benchmark/ascend/bert/scripts/ascend_distributed_launcher/get_distribute_pretrain_cmd.py @@ -15,6 +15,7 @@ 
"""distribute pretrain script""" import os import json +import configparser import multiprocessing from argparse import ArgumentParser @@ -35,12 +36,12 @@ def parse_args(): parser.add_argument("--run_script_dir", type=str, default="", help="Run script path, it is better to use absolute path") + parser.add_argument("--hyper_parameter_config_dir", type=str, default="", + help="Hyper Parameter config path, it is better to use absolute path") parser.add_argument("--data_dir", type=str, default="", help="Data path, it is better to use absolute path") parser.add_argument("--hccl_config_dir", type=str, default="", help="Hccl config path, it is better to use absolute path") - parser.add_argument("--config", type=str, default="", - help="Path to the config yaml file, it is better to use absolute path") parser.add_argument("--cmd_file", type=str, default="distributed_cmd.sh", help="Path of the generated cmd file.") parser.add_argument("--hccl_time_out", type=int, default=120, @@ -90,7 +91,7 @@ def make_dirs(cmd, logic_id): return cmd -def print_info(rank_id, device_id, logic_id, cmdopt, data_dir, cur_dir): +def print_info(rank_id, device_id, logic_id, cmdopt, epoch_size, data_dir, cur_dir): """ Print some information about scripts. """ @@ -99,6 +100,7 @@ def print_info(rank_id, device_id, logic_id, cmdopt, data_dir, cur_dir): print("device_id:", device_id) print("logic_id", logic_id) print("core_nums:", cmdopt) + print("epoch_size:", epoch_size) print("data_dir:", data_dir) print("log_file_dir: " + cur_dir + "/LOG" + str(logic_id) + "/pretraining_log.txt") @@ -113,6 +115,9 @@ def distribute_pretrain(): run_script = args.run_script_dir data_dir = args.data_dir + cf = configparser.ConfigParser() + cf.read(args.hyper_parameter_config_dir) + cfg = dict(cf.items("config")) print("hccl_config_dir:", args.hccl_config_dir) print("hccl_time_out:", args.hccl_time_out) @@ -170,11 +175,15 @@ def distribute_pretrain(): cmd = make_dirs(cmd, logic_id) print_info(rank_id=rank_id, device_id=device_id, logic_id=logic_id, cmdopt=cmdopt, cur_dir=cur_dir, - data_dir=data_dir) + epoch_size=str(cfg['epoch_size']), data_dir=data_dir) run_cmd = 'taskset -c ' + cmdopt + ' nohup python ' + run_script + " " + opt = " ".join(["--" + key + "=" + str(cfg[key]) for key in cfg.keys()]) + if ('device_id' in opt) or ('device_num' in opt) or ('data_dir' in opt): + raise ValueError("hyper_parameter_config.ini can not setting 'device_id'," + " 'device_num' or 'data_dir'! 
") + run_cmd += opt run_cmd += " --data_dir=" + data_dir - run_cmd += " --config_path=" + args.config run_cmd += ' --device_id=' + str(logic_id) + ' --device_num=' \ + str(rank_size) + ' >./pretraining_log.txt 2>&1 &' diff --git a/benchmark/ascend/bert/scripts/ascend_distributed_launcher/hyper_parameter_config.ini b/benchmark/ascend/bert/scripts/ascend_distributed_launcher/hyper_parameter_config.ini new file mode 100644 index 0000000000000000000000000000000000000000..c70edb2e3e4f11c70b0a1c2677b4a606585f1342 --- /dev/null +++ b/benchmark/ascend/bert/scripts/ascend_distributed_launcher/hyper_parameter_config.ini @@ -0,0 +1,12 @@ +[config] +distribute=true +epoch_size=40 +enable_save_ckpt=false +do_shuffle=false +enable_data_sink=true +data_sink_steps=100 +accumulation_steps=1 +allreduce_post_accumulation=true +save_checkpoint_path=./ +save_checkpoint_num=1 +config_path=../../pretrain_config_Ascend_Boost.yaml diff --git a/benchmark/ascend/bert/scripts/run_distributed_pretrain_ascend.sh b/benchmark/ascend/bert/scripts/run_distributed_pretrain_ascend.sh index fc8c719c32457340048ffca5f51bd105d078b4f1..937928b45a5b3972de12abd739954cad3125e112 100644 --- a/benchmark/ascend/bert/scripts/run_distributed_pretrain_ascend.sh +++ b/benchmark/ascend/bert/scripts/run_distributed_pretrain_ascend.sh @@ -17,16 +17,21 @@ echo "==============================================================================================================" echo "Please run the script as: " echo "bash scripts/run_distributed_pretrain_ascend.sh DATA_DIR RANK_TABLE_FILE" -echo "for example: bash scripts/run_distributed_pretrain_ascend.sh /path/dataset /path/hccl.json /path/config.yaml" +echo "for example: bash scripts/run_distributed_pretrain_ascend.sh /path/dataset /path/hccl.json" echo "It is better to use absolute path." 
+echo "For hyper parameter, please note that you should customize the scripts: + '{CUR_DIR}/scripts/ascend_distributed_launcher/hyper_parameter_config.ini' " echo "==============================================================================================================" + +export MS_ASCEND_CHECK_OVERFLOW_MODE="INFNAN_MODE" + CUR_DIR=`pwd` -ulimit -s 102400 +ulimit -s 302400 python ${CUR_DIR}/scripts/ascend_distributed_launcher/get_distribute_pretrain_cmd.py \ --run_script_dir=${CUR_DIR}/run_pretrain.py \ + --hyper_parameter_config_dir=${CUR_DIR}/scripts/ascend_distributed_launcher/hyper_parameter_config.ini \ --data_dir=$1 \ --hccl_config_dir=$2 \ - --config=$3 \ --hccl_time_out=600 \ --hccn_config_file='/etc/hccn.conf' \ --cmd_file=distributed_cmd.sh diff --git a/benchmark/ascend/bert/scripts/run_standalone_pretrain_ascend.sh b/benchmark/ascend/bert/scripts/run_standalone_pretrain_ascend.sh index 8d0b9b9617fb1ce188cbeb6c4d15e312b4f52dae..85ef8b2d862f76fb18edee8272e16c063593c081 100644 --- a/benchmark/ascend/bert/scripts/run_standalone_pretrain_ascend.sh +++ b/benchmark/ascend/bert/scripts/run_standalone_pretrain_ascend.sh @@ -26,7 +26,7 @@ DATA_DIR=$3 SCHEMA_DIR=$4 ulimit -s 102400 -mkdir -p ms_log +mkdir -p ms_log PROJECT_DIR=$(cd "$(dirname "$0")" || exit; pwd) CUR_DIR=`pwd` export GLOG_log_dir=${CUR_DIR}/ms_log diff --git a/benchmark/ascend/bert/src/adam.py b/benchmark/ascend/bert/src/adam.py index 522c4bf1e0f73b98f9a6425b93b36ced2aca4a6b..7e47adddd075f2268608143d318382c07f49ab26 100644 --- a/benchmark/ascend/bert/src/adam.py +++ b/benchmark/ascend/bert/src/adam.py @@ -191,11 +191,10 @@ def _run_off_load_opt(opt, beta1_power, beta2_power, beta1, beta2, eps, lr, grad def _check_param_value(beta1, beta2, eps, prim_name): """Check the type of inputs.""" - assert isinstance(beta1, float) and 0 <= beta1 <= 1.0, "beta1 should be float and between 0 and 1" - assert isinstance(beta2, float) and 0 <= beta2 <= 1.0, "beta2 should be float and between 0 and 1" + assert isinstance(beta1, float) and 0 <= beta1 <= 1, "beta1 should between 0 and 1" + assert isinstance(beta2, float) and 0 <= beta2 <= 1, "beta2 should between 0 and 1" assert isinstance(eps, float) and eps > 0, "eps should be bigger than 0" - class AdamWeightDecayForBert(Optimizer): """ Implements the Adam algorithm to fix the weight decay. 
diff --git a/benchmark/ascend/bert/src/bert_for_pre_training.py b/benchmark/ascend/bert/src/bert_for_pre_training.py index e098f868385da1f258995ff7b1fb5da368c6c028..bb7df924b8e73af5318a61a6b7dc143c82eeb887 100644 --- a/benchmark/ascend/bert/src/bert_for_pre_training.py +++ b/benchmark/ascend/bert/src/bert_for_pre_training.py @@ -14,20 +14,20 @@ # ============================================================================ """Bert for pretraining.""" import numpy as np - +import mindspore as ms import mindspore.nn as nn -from mindspore import amp, ops from mindspore.common.initializer import initializer, TruncatedNormal from mindspore.ops import operations as P from mindspore.ops import functional as F from mindspore.ops import composite as C from mindspore.common.tensor import Tensor from mindspore.common.parameter import Parameter +from mindspore.common.api import jit from mindspore.common import dtype as mstype from mindspore.nn.wrap.grad_reducer import DistributedGradReducer from mindspore.context import ParallelMode from mindspore.communication.management import get_group_size -from mindspore import context +from mindspore import context, amp, ops from .bert_model import BertModel GRADIENT_CLIP_TYPE = 1 @@ -165,9 +165,9 @@ class BertPreTraining(nn.Cell): self.cls2 = GetNextSentenceOutput(config) def construct(self, input_ids, input_mask, token_type_id, - masked_lm_positions, next_sentence_starts=None): + masked_lm_positions): sequence_output, pooled_output, embedding_table = \ - self.bert(input_ids, token_type_id, input_mask, next_sentence_starts) + self.bert(input_ids, token_type_id, input_mask) prediction_scores = self.cls1(sequence_output, embedding_table, masked_lm_positions) @@ -188,7 +188,6 @@ class BertPretrainingLoss(nn.Cell): def __init__(self, config): super(BertPretrainingLoss, self).__init__() - self.use_packed = config.use_packed self.vocab_size = config.vocab_size self.onehot = P.OneHot() self.on_value = Tensor(1.0, mstype.float32) @@ -201,12 +200,10 @@ class BertPretrainingLoss(nn.Cell): self.cast = P.Cast() def construct(self, prediction_scores, seq_relationship_score, masked_lm_ids, - masked_lm_weights, next_sentence_labels, next_sentence_weights=None): + masked_lm_weights, next_sentence_labels): """Defines the computation performed.""" label_ids = self.reshape(masked_lm_ids, self.last_idx) label_weights = self.cast(self.reshape(masked_lm_weights, self.last_idx), mstype.float32) - if self.use_packed: - label_weights = F.minimum(label_weights, 1.0) one_hot_labels = self.onehot(label_ids, self.vocab_size, self.on_value, self.off_value) per_example_loss = self.neg(self.reduce_sum(prediction_scores * one_hot_labels, self.last_idx)) @@ -219,16 +216,10 @@ class BertPretrainingLoss(nn.Cell): one_hot_labels = self.onehot(labels, 2, self.on_value, self.off_value) per_example_loss = self.neg(self.reduce_sum( one_hot_labels * seq_relationship_score, self.last_idx)) - if self.use_packed: - weights = self.cast(self.reshape(next_sentence_weights, self.last_idx), mstype.float32) - numerator = self.reduce_sum(weights * per_example_loss, ()) - denominator = F.maximum(self.reduce_sum(weights, ()), 1e-5) - next_sentence_loss = numerator / denominator - else: - next_sentence_loss = self.reduce_mean(per_example_loss, self.last_idx) + next_sentence_loss = self.reduce_mean(per_example_loss, self.last_idx) # total_loss - total_loss = masked_lm_loss + next_sentence_loss + total_loss = masked_lm_loss + next_sentence_loss #* tmp return total_loss @@ -251,6 +242,7 @@ class 
BertNetworkWithLoss(nn.Cell): self.bert = BertPreTraining(config, is_training, use_one_hot_embeddings) self.loss = BertPretrainingLoss(config) self.cast = P.Cast() + self.print = ops.Print() def construct(self, input_ids, @@ -259,14 +251,13 @@ class BertNetworkWithLoss(nn.Cell): next_sentence_labels, masked_lm_positions, masked_lm_ids, - masked_lm_weights, - next_sentence_starts=None, - next_sentence_weights=None): + masked_lm_weights): """Get pre-training loss""" prediction_scores, seq_relationship_score = \ - self.bert(input_ids, input_mask, token_type_id, masked_lm_positions, next_sentence_starts) + self.bert(input_ids, input_mask, token_type_id, masked_lm_positions) total_loss = self.loss(prediction_scores, seq_relationship_score, - masked_lm_ids, masked_lm_weights, next_sentence_labels, next_sentence_weights) + masked_lm_ids, masked_lm_weights, next_sentence_labels) + return self.cast(total_loss, mstype.float32) @@ -289,10 +280,16 @@ class BertTrainOneStepCell(nn.TrainOneStepCell): self.cast = P.Cast() self.hyper_map = C.HyperMap() self.enable_clip_grad = enable_clip_grad + self.enable_tuple_broaden = True def set_sens(self, value): self.sens = value + @jit + def clip_grads(self, grads): + grads = self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads) + return grads + def construct(self, input_ids, input_mask, @@ -321,7 +318,7 @@ class BertTrainOneStepCell(nn.TrainOneStepCell): self.cast(F.tuple_to_array((self.sens,)), mstype.float32)) if self.enable_clip_grad: - grads = self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads) + grads = self.clip_grads(grads) grads = self.grad_reducer(grads) self.optimizer(grads) return loss @@ -370,6 +367,12 @@ class BertTrainOneStepWithLossScaleCell(nn.TrainOneStepWithLossScaleCell): self.loss_scaling_manager = scale_update_cell if scale_update_cell: self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32)) + self.enable_tuple_broaden = True + + @jit + def clip_grads(self, grads): + grads = self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads) + return grads def construct(self, input_ids, @@ -379,25 +382,27 @@ class BertTrainOneStepWithLossScaleCell(nn.TrainOneStepWithLossScaleCell): masked_lm_positions, masked_lm_ids, masked_lm_weights, - next_sentence_starts=None, - next_sentence_weights=None, sens=None): """Defines the computation performed.""" weights = self.weights + + if sens is None: + scaling_sens = self.loss_scale + else: + scaling_sens = sens + flag = ops.cast(ops.sum(masked_lm_weights), ms.bool_) + flag_float = ops.cast(flag, ms.float32) + if not flag: + return flag_float.unsqueeze(-1), flag, scaling_sens.value() loss = self.network(input_ids, input_mask, token_type_id, next_sentence_labels, masked_lm_positions, masked_lm_ids, - masked_lm_weights, - next_sentence_starts, - next_sentence_weights) - if sens is None: - scaling_sens = self.loss_scale - else: - scaling_sens = sens + masked_lm_weights) status, scaling_sens = self.start_overflow_check(loss, scaling_sens) + grads = self.grad(self.network, weights)(input_ids, input_mask, token_type_id, @@ -405,15 +410,12 @@ class BertTrainOneStepWithLossScaleCell(nn.TrainOneStepWithLossScaleCell): masked_lm_positions, masked_lm_ids, masked_lm_weights, - next_sentence_starts, - next_sentence_weights, self.cast(scaling_sens, mstype.float32)) # apply grad reducer on grads grads = self.grad_reducer(grads) degree_sens = self.cast(scaling_sens * self.degree, mstype.float32) grads 
= self.hyper_map(F.partial(grad_scale, degree_sens), grads) - grads = self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads) cond = self.get_overflow_status(status, grads) overflow = cond @@ -421,7 +423,7 @@ class BertTrainOneStepWithLossScaleCell(nn.TrainOneStepWithLossScaleCell): overflow = self.loss_scaling_manager(self.loss_scale, cond) if not overflow: self.optimizer(grads) - return (loss, cond, scaling_sens.value()) + return loss, cond, scaling_sens.value() class BertTrainOneStepWithLossScaleCellForAdam(nn.TrainOneStepWithLossScaleCell): @@ -449,6 +451,12 @@ class BertTrainOneStepWithLossScaleCellForAdam(nn.TrainOneStepWithLossScaleCell) self.loss_scaling_manager = scale_update_cell if scale_update_cell: self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32)) + self.enable_tuple_broaden = True + + @jit + def clip_grads(self, grads): + grads = self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads) + return grads def construct(self, input_ids, @@ -486,13 +494,13 @@ class BertTrainOneStepWithLossScaleCellForAdam(nn.TrainOneStepWithLossScaleCell) # apply grad reducer on grads grads = self.grad_reducer(grads) grads = self.hyper_map(F.partial(grad_scale, scaling_sens * self.degree), grads) - grads = self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads) + grads = self.clip_grads(grads) cond = self.get_overflow_status(status, grads) overflow = cond if self.loss_scaling_manager is not None: overflow = self.loss_scaling_manager(scaling_sens, cond) self.optimizer(grads, overflow) - return (loss, cond, scaling_sens.value()) + return loss, cond, scaling_sens.value() cast = P.Cast() add_grads = C.MultitypeFuncGraph("add_grads") @@ -612,7 +620,7 @@ class BertTrainAccumulationAllReducePostWithLossScaleCell(nn.Cell): scaling_sens = self.loss_scale else: scaling_sens = sens - + # alloc status and clear should be right before grad operation # update accumulation parameters is_accu_step = self.not_equal(self.local_step, self.accumulation_steps) self.local_step = self.select(is_accu_step, self.local_step + self.one, self.one) @@ -634,8 +642,6 @@ class BertTrainAccumulationAllReducePostWithLossScaleCell(nn.Cell): mean_loss = F.depend(mean_loss, accu_succ) overflow = ops.logical_not(amp.all_finite(grads)) - - overflow = self.logical_or(self.not_equal(self.accu_overflow, self.zero), overflow) accu_overflow = self.select(overflow, self.one, self.zero) self.accu_overflow = self.select(is_accu_step, accu_overflow, self.zero) @@ -659,7 +665,7 @@ class BertTrainAccumulationAllReducePostWithLossScaleCell(nn.Cell): if not overflow: self.optimizer(grads) - return (mean_loss, overflow, scaling_sens.value()) + return mean_loss, overflow, scaling_sens.value() class BertTrainAccumulationAllReduceEachWithLossScaleCell(nn.Cell): @@ -755,9 +761,6 @@ class BertTrainAccumulationAllReduceEachWithLossScaleCell(nn.Cell): mean_loss = self.accu_loss / self.local_step is_accu_step = self.not_equal(self.local_step, self.accumulation_steps) - # alloc status and clear should be right before gradoperation - init = self.alloc_status() - self.clear_before_grad(init) grads = self.grad(self.network, weights)(input_ids, input_mask, token_type_id, @@ -777,6 +780,7 @@ class BertTrainAccumulationAllReduceEachWithLossScaleCell(nn.Cell): overflow = ops.logical_not(amp.all_finite(grads)) if self.reducer_flag: overflow = self.allreduce(overflow.to(mstype.float32)) >= self.base + overflow = 
self.logical_or(self.not_equal(self.accu_overflow, self.zero), overflow) accu_overflow = self.select(overflow, self.one, self.zero) self.accu_overflow = self.select(is_accu_step, accu_overflow, self.zero) overflow = self.reshape(overflow, (())) @@ -814,7 +818,19 @@ class BertNetworkMatchBucket(nn.Cell): self.network = network if not bucket_list or not isinstance(bucket_list, list): bucket_list = [seq_length] - self.bucket_list = [bucket for bucket in bucket_list if bucket <= seq_length] + self.bucket_list = [bucket for bucket in bucket_list if bucket < seq_length] + + if network.reducer_flag: + reuse_attr = 'reuse_communication_node' + if not network.grad_reducer.split_fusion: + hccl_op = network.grad_reducer.allreduce + network.grad_reducer.allreduce = hccl_op.add_prim_attr(reuse_attr, getattr(hccl_op, 'fusion')) + else: + new_op_list = [] + for hccl_op in network.grad_reducer.op_list: + new_op = hccl_op.add_prim_attr(reuse_attr, getattr(hccl_op, 'fusion')) + new_op_list.append(new_op) + network.grad_reducer.op_list = new_op_list def construct(self, input_ids, @@ -826,20 +842,6 @@ class BertNetworkMatchBucket(nn.Cell): masked_lm_weights, sentence_flag): """Switch network according to sentence length.""" - for bucket in self.bucket_list: - if sentence_flag == bucket: - input_ids = input_ids[:, :bucket] - input_mask = input_mask[:, :bucket] - token_type_id = token_type_id[:, :bucket] - loss = self.network(input_ids, - input_mask, - token_type_id, - next_sentence_labels, - masked_lm_positions, - masked_lm_ids, - masked_lm_weights) - return loss - loss = self.network(input_ids, input_mask, token_type_id, @@ -860,7 +862,6 @@ class BertPretrainEval(nn.Cell): self.network = BertPreTraining(config, False, False) else: self.network = network - self.use_packed = config.use_packed self.argmax = P.Argmax(axis=-1, output_type=mstype.int32) self.equal = P.Equal() self.sum = P.ReduceSum() @@ -880,18 +881,14 @@ class BertPretrainEval(nn.Cell): next_sentence_labels, masked_lm_positions, masked_lm_ids, - masked_lm_weights, - next_sentence_starts=None, - next_sentence_weights=None): + masked_lm_weights): """Calculate prediction scores""" bs, _ = self.shape(input_ids) - mlm, _ = self.network(input_ids, input_mask, token_type_id, masked_lm_positions, next_sentence_starts) - index = self.argmax(mlm) + mlm, _ = self.network(input_ids, input_mask, token_type_id, masked_lm_positions) + _, index = mlm.argmax_with_value(axis=-1) index = self.reshape(index, (bs, -1)) eval_acc = self.equal(index, masked_lm_ids) eval_acc = self.cast(eval_acc, mstype.float32) - if self.use_packed: - masked_lm_weights = F.minimum(masked_lm_weights, 1.0) real_acc = eval_acc * masked_lm_weights acc = self.sum(real_acc) total = self.sum(masked_lm_weights) diff --git a/benchmark/ascend/bert/src/bert_model.py b/benchmark/ascend/bert/src/bert_model.py index 0ce562f1b0e37c5f1b2ef0e2c89d5b1680084379..23e17f6d6f6f87fd53ef4f358000effd4b284667 100644 --- a/benchmark/ascend/bert/src/bert_model.py +++ b/benchmark/ascend/bert/src/bert_model.py @@ -19,13 +19,14 @@ import copy import numpy as np import mindspore.common.dtype as mstype import mindspore.nn as nn +import mindspore.ops as ops import mindspore.ops.functional as F from mindspore.common.initializer import TruncatedNormal, initializer -import mindspore.ops as ops from mindspore.ops import operations as P from mindspore.ops import composite as C from mindspore.common.tensor import Tensor from mindspore.common.parameter import Parameter +from mindspore.ops.operations.nn_ops import 
FlashAttentionScore class BertConfig: @@ -70,8 +71,7 @@ class BertConfig: initializer_range=0.02, use_relative_positions=False, dtype=mstype.float32, - compute_type=mstype.float32, - use_packed=False): + compute_type=mstype.float32): self.seq_length = seq_length self.vocab_size = vocab_size self.hidden_size = hidden_size @@ -87,7 +87,6 @@ class BertConfig: self.use_relative_positions = use_relative_positions self.dtype = dtype self.compute_type = compute_type - self.use_packed = use_packed class EmbeddingLookup(nn.Cell): @@ -357,6 +356,122 @@ class SaturateCast(nn.Cell): return self.cast(out, self.dst_type) +class BertFlashAttention(nn.Cell): + """ + Apply multi-headed attention from "from_tensor" to "to_tensor". + + Args: + from_tensor_width (int): Size of last dim of from_tensor. + to_tensor_width (int): Size of last dim of to_tensor. + num_attention_heads (int): Number of attention heads. Default: 1. + size_per_head (int): Size of each attention head. Default: 512. + query_act (str): Activation function for the query transform. Default: None. + key_act (str): Activation function for the key transform. Default: None. + value_act (str): Activation function for the value transform. Default: None. + has_attention_mask (bool): Specifies whether to use attention mask. Default: False. + attention_probs_dropout_prob (float): The dropout probability for + BertFlashAttention. Default: 0.0. + use_one_hot_embeddings (bool): Specifies whether to use one hot encoding form. Default: False. + initializer_range (float): Initialization value of TruncatedNormal. Default: 0.02. + use_relative_positions (bool): Specifies whether to use relative positions. Default: False. + compute_type (:class:`mindspore.dtype`): Compute type in BertFlashAttention. Default: mstype.float32. + """ + def __init__(self, + from_tensor_width, + to_tensor_width, + num_attention_heads=1, + size_per_head=512, + query_act=None, + key_act=None, + value_act=None, + has_attention_mask=False, + attention_probs_dropout_prob=0.0, + use_one_hot_embeddings=False, + initializer_range=0.02, + use_relative_positions=False, + compute_type=mstype.float32): + + super(BertFlashAttention, self).__init__() + self.num_attention_heads = num_attention_heads + self.size_per_head = size_per_head + self.has_attention_mask = has_attention_mask + self.use_relative_positions = use_relative_positions + + self.scores_mul = 1.0 / math.sqrt(float(self.size_per_head)) + self.reshape = P.Reshape() + self.shape_from_2d = (-1, from_tensor_width) + self.shape_to_2d = (-1, to_tensor_width) + weight = TruncatedNormal(initializer_range) + units = num_attention_heads * size_per_head + self.query_layer = nn.Dense(from_tensor_width, + units, + activation=query_act, + weight_init=weight).to_float(compute_type) + self.key_layer = nn.Dense(to_tensor_width, + units, + activation=key_act, + weight_init=weight).to_float(compute_type) + self.value_layer = nn.Dense(to_tensor_width, + units, + activation=value_act, + weight_init=weight).to_float(compute_type) + + self.matmul_trans_b = P.BatchMatMul(transpose_b=True) + self.multiply = P.Mul() + self.transpose = P.Transpose() + self.trans_shape = (0, 2, 1, 3) + self.trans_shape_relative = (2, 0, 1, 3) + self.trans_shape_position = (1, 2, 0, 3) + self.multiply_data = -10000.0 + self.matmul = P.BatchMatMul() + + self.softmax = nn.Softmax() + self.dropout = nn.Dropout(p=attention_probs_dropout_prob) + + self.shape_return = (-1, num_attention_heads * size_per_head) + + self.cast_compute_type = SaturateCast(dst_type=compute_type) + 
self.flash_attention = FlashAttentionScore(head_num=num_attention_heads, + input_layout="BNSD", + sparse_mode=0, + scale_value=1 / math.sqrt(size_per_head)) + if self.use_relative_positions: + self._generate_relative_positions_embeddings = \ + RelaPosEmbeddingsGenerator(depth=size_per_head, + max_relative_position=16, + initializer_range=initializer_range, + use_one_hot_embeddings=use_one_hot_embeddings) + + def construct(self, from_tensor, to_tensor, attention_mask): + """reshape 2d/3d input tensors to 2d""" + shape_from = F.shape(attention_mask)[2] # seq length + from_tensor = F.depend(from_tensor, shape_from) + from_tensor_2d = self.reshape(from_tensor, self.shape_from_2d) + to_tensor_2d = self.reshape(to_tensor, self.shape_to_2d) + query_out = self.query_layer(from_tensor_2d) + key_out = self.key_layer(to_tensor_2d) + value_out = self.value_layer(to_tensor_2d) + # reshape to (batch, seq, heads, head_dim), then transpose to the "BNSD" layout + query_layer = self.reshape(query_out, (-1, shape_from, self.num_attention_heads, self.size_per_head)) + query_layer = self.transpose(query_layer, self.trans_shape) + key_layer = self.reshape(key_out, (-1, shape_from, self.num_attention_heads, self.size_per_head)) + key_layer = self.transpose(key_layer, self.trans_shape) + # incoming attention_mask is (batch_size, 1, seq_length), e.g. (25, 1, 512); + # it is dropped here, so the flash attention below runs without a mask + attention_mask = None + value_layer = self.reshape(value_out, (-1, shape_from, self.num_attention_heads, self.size_per_head)) + value_layer = self.transpose(value_layer, self.trans_shape) + _, _, _, context_layer = self.flash_attention(query_layer.astype(mstype.float16), + key_layer.astype(mstype.float16), + value_layer.astype(mstype.float16), + None, None, None, attention_mask, None) + + context_layer = self.transpose(context_layer, self.trans_shape) + context_layer = self.reshape(context_layer, self.shape_return) + + return context_layer + + class BertAttention(nn.Cell): """ Apply multi-headed attention from "from_tensor" to "to_tensor". 
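For context, a standalone sketch of the FlashAttentionScore call pattern that BertFlashAttention above relies on; the operator construction and the four-output unpacking mirror the diff, the tensor sizes are illustrative, and the op itself requires an Ascend device:

import math
import numpy as np
import mindspore as ms
from mindspore.ops.operations.nn_ops import FlashAttentionScore

b, n, s, d = 2, 16, 512, 64  # batch, heads, sequence, head_dim ("BNSD" layout)
fa = FlashAttentionScore(head_num=n, input_layout="BNSD", sparse_mode=0,
                         scale_value=1 / math.sqrt(d))
q = ms.Tensor(np.random.randn(b, n, s, d), ms.float16)
k = ms.Tensor(np.random.randn(b, n, s, d), ms.float16)
v = ms.Tensor(np.random.randn(b, n, s, d), ms.float16)
# The optional inputs after value (shift/drop/padding masks, attention mask, prefix)
# are all passed as None here, as in the cell above, so the attention is unmasked.
_, _, _, context_layer = fa(q, k, v, None, None, None, None, None)
print(context_layer.shape)  # (2, 16, 512, 64)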
@@ -492,7 +607,9 @@ class BertAttention(nn.Cell): attention_scores = self.multiply(self.scores_mul, attention_scores) if self.has_attention_mask: + # attention_mask: (batch_size, 1, seq_length) attention_mask = self.expand_dims(attention_mask, 1) + # after expand_dims: (batch_size, 1, 1, seq_length) multiply_out = self.sub(self.cast(F.tuple_to_array((1.0,)), self.get_dtype(attention_scores)), self.cast(attention_mask, self.get_dtype(attention_scores))) @@ -569,8 +686,7 @@ class BertSelfAttention(nn.Cell): "of attention heads (%d)" % (hidden_size, num_attention_heads)) self.size_per_head = int(hidden_size / num_attention_heads) - - self.attention = BertAttention( + self.attention = BertFlashAttention( from_tensor_width=hidden_size, to_tensor_width=hidden_size, num_attention_heads=num_attention_heads, @@ -738,24 +854,14 @@ class CreateAttentionMaskFromInputMask(nn.Cell): """ def __init__(self, config): super(CreateAttentionMaskFromInputMask, self).__init__() - self.use_packed = config.use_packed self.input_mask = None self.cast = P.Cast() self.reshape = P.Reshape() - self.tile = P.Tile() - self.transpose = P.Transpose() def construct(self, input_mask): seq_length = F.shape(input_mask)[1] - if self.use_packed: - mask_tile = self.reshape(self.tile(input_mask, (1, seq_length)), (-1, seq_length)) - reshape_mask = F.broadcast_to(self.reshape(input_mask, (1, -1)), (seq_length, -1)) - transpose_mask = self.transpose(reshape_mask, (1, 0)) - attention_mask = self.reshape(self.cast(mask_tile == transpose_mask, mstype.float32), - (-1, seq_length, seq_length)) - else: - attention_mask = self.cast(self.reshape(input_mask, (-1, 1, seq_length)), mstype.float32) + attention_mask = self.cast(self.reshape(input_mask, (-1, 1, seq_length)), mstype.float32) return attention_mask @@ -781,7 +887,6 @@ class BertModel(nn.Cell): self.hidden_size = config.hidden_size self.num_hidden_layers = config.num_hidden_layers self.embedding_size = config.hidden_size - self.use_packed = config.use_packed self.token_type_ids = None self.last_idx = self.num_hidden_layers - 1 @@ -822,7 +927,6 @@ class BertModel(nn.Cell): self.dtype = config.dtype self.cast_compute_type = SaturateCast(dst_type=config.compute_type) self.slice = P.StridedSlice() - self.gather = P.Gather() self.squeeze_1 = P.Squeeze(axis=1) self.dense = nn.Dense(self.hidden_size, self.hidden_size, @@ -830,7 +934,7 @@ class BertModel(nn.Cell): weight_init=TruncatedNormal(config.initializer_range)).to_float(config.compute_type) self._create_attention_mask_from_input_mask = CreateAttentionMaskFromInputMask(config) - def construct(self, input_ids, token_type_ids, input_mask, next_sentence_starts=None): + def construct(self, input_ids, token_type_ids, input_mask): """Bidirectional Encoder Representations from Transformers.""" # embedding embedding_tables = self.bert_embedding_lookup.embedding_table @@ -849,18 +953,11 @@ class BertModel(nn.Cell): # pooler batch_size = P.Shape()(input_ids)[0] - if self.use_packed: - slices = [] - for i in range(batch_size): - slices.append(sequence_output[i][next_sentence_starts[i]]) - sequence_slice = F.stack(slices) - first_token = F.reshape(sequence_slice, (-1, self.hidden_size)) - else: - sequence_slice = self.slice(sequence_output, - (0, 0, 0), - (batch_size, 1, self.hidden_size), - (1, 1, 1)) - first_token = self.squeeze_1(sequence_slice) + sequence_slice = self.slice(sequence_output, + (0, 0, 0), + (batch_size, 1, self.hidden_size), + (1, 1, 1)) + first_token = self.squeeze_1(sequence_slice) pooled_output = self.dense(first_token) pooled_output = self.cast(pooled_output, self.dtype) diff --git 
a/benchmark/ascend/bert/src/dataset.py b/benchmark/ascend/bert/src/dataset.py index a6d70ca67cfa0107bac6c9148411953a08c5418f..37aff4e3683f25bdb6e1bae4e37e0a7e97a2c3bb 100644 --- a/benchmark/ascend/bert/src/dataset.py +++ b/benchmark/ascend/bert/src/dataset.py @@ -1,22 +1,7 @@ -# Copyright 2020-2022 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -""" -Data operations, will be used in run_pretrain.py -""" import os import math +from dataclasses import field, dataclass + import numpy as np import mindspore.common.dtype as mstype import mindspore.dataset as ds @@ -27,42 +12,48 @@ from mindspore import log as logger class BucketDatasetGenerator: """ Provide data distribution of different gears for the bert network. - Args: - dataset (Dataset): The training dataset. + dataset (Dataset): The training dataset batch_size (Int): The training batchsize. - bucket_list (List): List of different sentence lengths, such as [128, 256, 512]. Default: None. - valid_dataset_len (Int): Prevent communication failure at the end of the dataset. Default: 0.35. + bucket_list (List): List of different sentence lengths, such as [128, 256, 512]. Default: None """ - def __init__(self, dataset, batch_size, bucket_list=None, valid_dataset_len=0.35): + def __init__(self, dataset, batch_size, bucket_list=None, train_steps=None): self.dataset = dataset self.batch_size = batch_size self.bucket_list = bucket_list - bucket_size = len(bucket_list) - self.random_list = np.random.binomial(n=(bucket_size - 1), p=0.55, size=self.__len__()) - self.random_list = (self.random_list + 2) % bucket_size + self.train_steps = 20000 + self.random_list_strategy3() self.random_list = [bucket_list[i] for i in self.random_list] - valid_dataset_len = int(valid_dataset_len * self.__len__()) - self.random_list = self.random_list[:valid_dataset_len] + [bucket_list[-1]] * self.__len__() self._init_variables() + self.target_clip_length = bucket_list[0] + self.clipped_length_from = bucket_list[-1] def _init_variables(self): self.data_bucket = {bucket: [] for bucket in self.bucket_list} self.iter = 0 self.remaining_data = [] - self.remaining_data_size = 1 self.stage = 0 + def random_list_strategy3(self, p128=0.5): + p128 = os.getenv("P128", p128) + print("p128: ", p128) + p128 = int(float(p128) * 100) + p512 = 100 - p128 + rpt = self.__len__() // 100 + rmd = self.__len__() % 100 + self.random_list = np.array(([0] * p128 + [1] * p512) * rpt + [0] * rmd) + def __next__(self): if self.stage != 0: return self._process_remaining_data() for item in self.iterator: - for seq_length in self.bucket_list: - if np.sum(item[1]) <= seq_length: - self.data_bucket[seq_length].append(item) - break + if np.sum(item[1]) < 384: + item = self.clip_data(item) + self.data_bucket.get(self.bucket_list[0]).append(item) + else: + self.data_bucket.get(self.bucket_list[-1]).append(item) for key in self.data_bucket.keys(): data = self.data_bucket[key] if len(data) >= 
self.batch_size and self.random_list[self.iter] == key: @@ -72,8 +63,18 @@ class BucketDatasetGenerator: self.stage = 1 return self._process_remaining_data() + @staticmethod + def clip_data(item): + item_clip = item[:3] + item_clip.append(np.array([0], np.int64)) + masked_lm_positions = np.array([0] * 76) + masked_lm_ids = np.array([0] * 76) + masked_lm_weights = np.array([0.0] * 76, np.float32) + item_clip += [masked_lm_positions, masked_lm_ids, masked_lm_weights] + return item_clip + def _package_data(self, data, key): - """package a set of data.""" + """Package a set of data.""" arr = data[0] for i in range(1, self.batch_size): current_data = data[i] @@ -118,22 +119,36 @@ def create_bert_dataset(device_num=1, rank=0, do_shuffle="true", data_dir=None, schema_dir=None, batch_size=32, - bucket_list=None, use_packed=False): + bucket_list=None, dataset_format="mindrecord", num_samples=None): """create train dataset""" # apply repeat operations files = os.listdir(data_dir) data_files = [] for file_name in files: - if use_packed or "tfrecord" in file_name: + condition1 = dataset_format == "tfrecord" and "tfrecord" in file_name + condition2 = dataset_format == "mindrecord" and "mindrecord" in file_name and "mindrecord.db" not in file_name + if condition1 or condition2: data_files.append(os.path.join(data_dir, file_name)) - columns_list = ["input_ids", "input_mask", "segment_ids", "next_sentence_labels", "masked_lm_positions", - "masked_lm_ids", "masked_lm_weights"] - if use_packed: - columns_list.extend(["next_sentence_positions", "next_sentence_weights"]) - data_set = ds.TFRecordDataset(data_files, schema_dir if schema_dir != "" else None, - columns_list=columns_list, - shuffle=ds.Shuffle.FILES if do_shuffle == "true" else False, - num_shards=device_num, shard_id=rank, shard_equal_rows=True) + if dataset_format == "mindrecord": + if str(num_samples).lower() != "none": + data_set = ds.MindDataset(data_files, + columns_list=["input_ids", "input_mask", "segment_ids", "next_sentence_labels", + "masked_lm_positions", "masked_lm_ids", "masked_lm_weights"], + shuffle=False, num_shards=device_num, shard_id=rank, num_samples=num_samples) + else: + data_set = ds.MindDataset(data_files, + columns_list=["input_ids", "input_mask", "segment_ids", "next_sentence_labels", + "masked_lm_positions", "masked_lm_ids", "masked_lm_weights"], + shuffle=ds.Shuffle.FILES if do_shuffle == "true" else False, + num_shards=device_num, shard_id=rank) + elif dataset_format == "tfrecord": + data_set = ds.TFRecordDataset(data_files, schema_dir if schema_dir != "" else None, + columns_list=["input_ids", "input_mask", "segment_ids", "next_sentence_labels", + "masked_lm_positions", "masked_lm_ids", "masked_lm_weights"], + shuffle=ds.Shuffle.FILES if do_shuffle == "true" else False, + num_shards=device_num, shard_id=rank, shard_equal_rows=True) + else: + raise NotImplementedError("Only 'tfrecord' and 'mindrecord' dataset formats are supported.") if bucket_list: bucket_dataset = BucketDatasetGenerator(data_set, batch_size, bucket_list=bucket_list) data_set = ds.GeneratorDataset(bucket_dataset, @@ -152,9 +167,6 @@ def create_bert_dataset(device_num=1, rank=0, do_shuffle="true", data_dir=None, data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids") data_set = data_set.map(operations=type_cast_op, input_columns="input_mask") data_set = data_set.map(operations=type_cast_op, input_columns="input_ids") - if use_packed: - data_set = data_set.map(operations=type_cast_op, 
input_columns="next_sentence_positions") - data_set = data_set.map(operations=type_cast_op, input_columns="next_sentence_weights") # apply batch operations logger.info("data size: {}".format(data_set.get_dataset_size())) logger.info("repeat count: {}".format(data_set.get_repeat_count())) @@ -169,10 +181,12 @@ def create_ner_dataset(batch_size=1, assessment_method="accuracy", data_file_pat dataset = ds.MindDataset([data_file_path], columns_list=["input_ids", "input_mask", "segment_ids", "label_ids"], shuffle=do_shuffle) - else: + elif dataset_format == "tfrecord": dataset = ds.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None, columns_list=["input_ids", "input_mask", "segment_ids", "label_ids"], shuffle=do_shuffle) + else: + raise NotImplementedError("Only supported dataset_format for tfrecord or mindrecord.") if assessment_method == "Spearman_correlation": type_cast_op_float = C.TypeCast(mstype.float32) dataset = dataset.map(operations=type_cast_op_float, input_columns="label_ids") @@ -186,13 +200,20 @@ def create_ner_dataset(batch_size=1, assessment_method="accuracy", data_file_pat return dataset -def create_classification_dataset(batch_size=1, assessment_method="accuracy", - data_file_path=None, schema_file_path=None, do_shuffle=True): +def create_classification_dataset(batch_size=1, assessment_method="accuracy", data_file_path=None, + schema_file_path=None, dataset_format="mindrecord", do_shuffle=True): """create finetune or evaluation dataset""" type_cast_op = C.TypeCast(mstype.int32) - data_set = ds.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None, + if dataset_format == "mindrecord": + data_set = ds.MindDataset([data_file_path], columns_list=["input_ids", "input_mask", "segment_ids", "label_ids"], shuffle=do_shuffle) + elif dataset_format == "tfrecord": + data_set = ds.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None, + columns_list=["input_ids", "input_mask", "segment_ids", "label_ids"], + shuffle=do_shuffle) + else: + raise NotImplementedError("Only supported dataset_format for tfrecord or mindrecord.") if assessment_method == "Spearman_correlation": type_cast_op_float = C.TypeCast(mstype.float32) data_set = data_set.map(operations=type_cast_op_float, input_columns="label_ids") @@ -212,19 +233,34 @@ def generator_squad(data_features): def create_squad_dataset(batch_size=1, data_file_path=None, schema_file_path=None, - is_training=True, do_shuffle=True): + is_training=True, do_shuffle=True, dataset_format="mindrecord"): """create finetune or evaluation dataset""" type_cast_op = C.TypeCast(mstype.int32) if is_training: - data_set = ds.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None, + if dataset_format == "mindrecord": + data_set = ds.MindDataset([data_file_path], columns_list=["input_ids", "input_mask", "segment_ids", "start_positions", "end_positions", "unique_ids", "is_impossible"], shuffle=do_shuffle) + elif dataset_format == "tfrecord": + data_set = ds.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None, + columns_list=["input_ids", "input_mask", "segment_ids", "start_positions", + "end_positions", "unique_ids", "is_impossible"], + shuffle=do_shuffle) + else: + raise NotImplementedError("Only supported dataset_format for tfrecord or mindrecord.") data_set = data_set.map(operations=type_cast_op, input_columns="start_positions") data_set = data_set.map(operations=type_cast_op, 
input_columns="end_positions") else: - data_set = ds.GeneratorDataset(generator_squad(data_file_path), shuffle=do_shuffle, - column_names=["input_ids", "input_mask", "segment_ids", "unique_ids"]) + if dataset_format == "mindrecord": + data_set = ds.MindDataset([data_file_path], + columns_list=["input_ids", "input_mask", "segment_ids", "unique_ids"], + shuffle=do_shuffle) + elif dataset_format == "tfrecord": + data_set = ds.GeneratorDataset(generator_squad(data_file_path), shuffle=do_shuffle, + column_names=["input_ids", "input_mask", "segment_ids", "unique_ids"]) + else: + raise NotImplementedError("Only supported dataset_format for tfrecord or mindrecord.") data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids") data_set = data_set.map(operations=type_cast_op, input_columns="input_mask") data_set = data_set.map(operations=type_cast_op, input_columns="input_ids") @@ -234,22 +270,54 @@ def create_squad_dataset(batch_size=1, data_file_path=None, schema_file_path=Non return data_set -def create_eval_dataset(batchsize=32, device_num=1, rank=0, data_dir=None, schema_dir=None, use_packed=False): +@dataclass +class CreateEvalDatasetInput: + batchsize: int = field(default=32) + device_num: int = field(default=1) + rank: int = field(default=0) + data_dir: str = field(default=None) + schema_dir: str = field(default=None) + dataset_format: str = field(default="mindrecord") + num_samples: int = field(default=None) + + +def create_eval_dataset(inputs=CreateEvalDatasetInput()): """create evaluation dataset""" + batchsize = inputs.batchsize + device_num = inputs.device_num + rank = inputs.rank + data_dir = inputs.data_dir + schema_dir = inputs.schema_dir + dataset_format = inputs.dataset_format + num_samples = inputs.num_samples data_files = [] if os.path.isdir(data_dir): files = os.listdir(data_dir) for file_name in files: - if use_packed or "tfrecord" in file_name: + condition1 = dataset_format == "tfrecord" and "tfrecord" in file_name + condition2 = dataset_format == "mindrecord" and "mindrecord" in file_name \ + and "mindrecord.db" not in file_name + if condition1 or condition2: data_files.append(os.path.join(data_dir, file_name)) else: data_files.append(data_dir) - columns_list = ["input_ids", "input_mask", "segment_ids", "next_sentence_labels", - "masked_lm_positions", "masked_lm_ids", "masked_lm_weights"] - if use_packed: - columns_list.extend(["next_sentence_positions", "next_sentence_weights"]) - data_set = ds.TFRecordDataset(data_files, schema_dir if schema_dir != "" else None, - columns_list=columns_list, shard_equal_rows=True) + if dataset_format == "mindrecord": + if str(num_samples).lower() != "none": + data_set = ds.MindDataset(data_files, + columns_list=["input_ids", "input_mask", "segment_ids", "next_sentence_labels", + "masked_lm_positions", "masked_lm_ids", "masked_lm_weights"], + num_samples=num_samples) + else: + data_set = ds.MindDataset(data_files, + columns_list=["input_ids", "input_mask", "segment_ids", "next_sentence_labels", + "masked_lm_positions", "masked_lm_ids", "masked_lm_weights"]) + elif dataset_format == "tfrecord": + data_set = ds.TFRecordDataset(data_files, schema_dir if schema_dir != "" else None, + columns_list=["input_ids", "input_mask", "segment_ids", "next_sentence_labels", + "masked_lm_positions", "masked_lm_ids", "masked_lm_weights"], + shard_equal_rows=True) + else: + raise NotImplementedError("Only supported dataset_format for tfrecord or mindrecord.") ori_dataset_size = data_set.get_dataset_size() print("origin eval size: ", 
diff --git a/benchmark/ascend/bert/src/model_utils/config.py b/benchmark/ascend/bert/src/model_utils/config.py
index b0687b79203e72c096fafaf93ffd287c98bca658..089d3c477947dc918ab8d338a28c2d07f394c894 100644
--- a/benchmark/ascend/bert/src/model_utils/config.py
+++ b/benchmark/ascend/bert/src/model_utils/config.py
@@ -113,13 +113,16 @@ def merge(args, cfg):
 
 
 def parse_dtype(dtype):
-    if dtype not in ["mstype.float32", "mstype.float16"]:
+    # bfloat16 is accepted in addition to float32/float16 (CR 773491)
+    if dtype not in ["mstype.float32", "mstype.float16", "mstype.bfloat16"]:
         raise ValueError("Not supported dtype")
 
     if dtype == "mstype.float32":
         return mstype.float32
     if dtype == "mstype.float16":
         return mstype.float16
+    if dtype == "mstype.bfloat16":
+        return mstype.bfloat16
     return None
 
 def extra_operations(cfg):
@@ -157,7 +160,6 @@ def extra_operations(cfg):
             _bert_net_cfg = cfg.large_boost_net_cfg
         else:
             pass
-        _bert_net_cfg.use_packed = cfg.use_packed
         cfg.bert_net_cfg = BertConfig(**_bert_net_cfg.__dict__)
     elif cfg.description == 'run_ner':
         cfg.optimizer_cfg.AdamWeightDecay.decay_filter = \
@@ -194,7 +196,7 @@ def get_config():
         current_dir = os.path.dirname(os.path.abspath(__file__))
         return os.path.join(current_dir, path_relative)
     parser = argparse.ArgumentParser(description="default name", add_help=False)
-    parser.add_argument("--config_path", type=get_abs_path, default="../../pretrain_config_Ascend_Boost.yaml",
+    parser.add_argument("--config_path", type=get_abs_path, default="../../pretrain_config.yaml",
                         help="Config file path")
     path_args, _ = parser.parse_known_args()
     default, helper, choices = parse_yaml(path_args.config_path)
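A quick sketch of what the parse_dtype() extension permits. The assertions mirror the branches added above; the mstype.float64 call simply stands in for any still-unsupported spelling.

# Sketch only: the extended parse_dtype() now accepts the bfloat16 spelling
# used by yaml net_cfg entries, e.g. compute_type: mstype.bfloat16.
import mindspore.common.dtype as mstype
from src.model_utils.config import parse_dtype

assert parse_dtype("mstype.bfloat16") is mstype.bfloat16
assert parse_dtype("mstype.float16") is mstype.float16
try:
    parse_dtype("mstype.float64")      # anything else is still rejected
except ValueError as err:
    print(err)                         # -> Not supported dtype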
diff --git a/benchmark/ascend/bert/src/utils.py b/benchmark/ascend/bert/src/utils.py
index 682cbfc18119e8538a8e837c0866af63221079df..1eba868952028a251acc07cd8de85d541a0bd885 100644
--- a/benchmark/ascend/bert/src/utils.py
+++ b/benchmark/ascend/bert/src/utils.py
@@ -20,6 +20,7 @@ Functional Cells used in Bert finetune and evaluation.
 import os
 import math
 import collections
+import datetime
 import numpy as np
 import mindspore.nn as nn
 from mindspore import log as logger
@@ -84,6 +85,7 @@ def make_directory(path: str):
             raise TypeError("No write permission on the directory.")
     return real_path
 
+
 class LossCallBack(Callback):
     """
     Monitor the loss in training.
@@ -96,24 +98,24 @@ class LossCallBack(Callback):
     def __init__(self, dataset_size=-1):
         super(LossCallBack, self).__init__()
         self._dataset_size = dataset_size
+
     def step_end(self, run_context):
         """
         Print loss after each step
         """
         cb_params = run_context.original_args()
-        loss, is_overflow, loss_scale = [output.asnumpy().item() for output in cb_params.net_outputs]
         if self._dataset_size > 0:
             percent, epoch_num = math.modf(cb_params.cur_step_num / self._dataset_size)
             if percent == 0:
                 percent = 1
                 epoch_num -= 1
-            print("epoch: {}, current epoch percent: {}, step: {}, loss: {}, overflow: {}, loss scale: {}"
-                  .format(int(epoch_num), "%.3f" % percent, cb_params.cur_step_num, loss, is_overflow,
-                          int(loss_scale)), flush=True)
+            print("time: {}, epoch: {}, current epoch percent: {}, step: {}, outputs are {}"
+                  .format(datetime.datetime.utcnow(), int(epoch_num), "%.3f" % percent, cb_params.cur_step_num,
+                          str(cb_params.net_outputs)),
+                  flush=True)
         else:
-            print("epoch: {}, step: {}, loss: {}, overflow: {}, loss scale: {}"
-                  .format(cb_params.cur_epoch_num, cb_params.cur_step_num, loss, is_overflow,
-                          int(loss_scale)), flush=True)
+            print("epoch: {}, step: {}, outputs are {}".format(cb_params.cur_epoch_num, cb_params.cur_step_num,
+                                                               str(cb_params.net_outputs)), flush=True)
 
 
 def LoadNewestCkpt(load_finetune_checkpoint_dir, prefix):
@@ -149,6 +151,8 @@ class BertLearningRate(LearningRateSchedule):
         self.greater = P.Greater()
         self.one = Tensor(np.array([1.0]).astype(np.float32))
         self.cast = P.Cast()
+        from mindspore import ops
+        self.print = ops.Print()
 
     def construct(self, global_step):
         decay_lr = self.decay_lr(global_step)
@@ -176,6 +180,7 @@ def convert_labels_to_index(label_list):
             label2id[sub_label] = index
     return label2id
 
+
 def _get_poly_lr(global_step, lr_init, lr_end, lr_max, warmup_steps, total_steps, poly_power):
     """
     generate learning rate array
@@ -252,13 +257,15 @@ class EvalCallBack(Callback):
         if num_samples < self.eval_samples:
             return
         self.last_eval_step = cb_params.cur_step_num
-        total_sumples = cb_params.cur_step_num * self.global_batch
+        total_samples = cb_params.cur_step_num * self.global_batch
         res = self.model.eval(self.eval_ds, dataset_sink_mode=True)
         res = res['bert_acc']
         print("====================================", flush=True)
-        print("Accuracy is: ", "%.6f" % res, ", current samples is: ", total_sumples)
+        print(f"Time: {datetime.datetime.utcnow()} Accuracy is: ", "%.6f" % res, ", current samples is: ",
+              total_samples)
         print("====================================", flush=True)
 
+
 class BertMetric(Metric):
     """
     The metric of bert network.
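The EvalCallBack hunk above gates evaluation on consumed samples rather than on epochs. A small worked sketch of that cadence, with illustrative numbers (a global batch of 200 and eval_samples of 150000; neither is asserted by this patch):

# Sketch only: the samples-based trigger used by EvalCallBack above.
def should_eval(last_eval_step, cur_step, global_batch, eval_samples):
    """True once enough new samples have been consumed since the last eval."""
    num_samples = (cur_step - last_eval_step) * global_batch
    return num_samples >= eval_samples

assert not should_eval(0, 749, 200, 150000)   # 149,800 samples: keep training
assert should_eval(0, 750, 200, 150000)       # 150,000 samples: run evaluation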
diff --git a/benchmark/ascend/resnet/config/resnet50_imagenet2012_Boost_config.yaml b/benchmark/ascend/resnet/config/resnet50_imagenet2012_Boost_config.yaml
index 8a426a6b50d51b11bb6ab822bcf2629490e2bab6..21755da8794bff33d0388d02b4a659fbf6e17d90 100644
--- a/benchmark/ascend/resnet/config/resnet50_imagenet2012_Boost_config.yaml
+++ b/benchmark/ascend/resnet/config/resnet50_imagenet2012_Boost_config.yaml
@@ -5,10 +5,10 @@ data_url: ""
 train_url: ""
 checkpoint_url: ""
 # Path for local
-run_distribute: False
+run_distribute: True
 enable_profiling: False
-data_path: "/cache/data"
-output_path: "/cache/train"
+data_path: "/data/resnet_tc/Imagenet2012/train"
+output_dir: "../outputs"
 load_path: "/cache/checkpoint_path/"
 device_target: "Ascend"
 checkpoint_path: "./checkpoint/"
@@ -20,46 +20,49 @@ optimizer: "LARS"
 infer_label: ""
 class_num: 1001
 batch_size: 256
+eval_batch_size: 250
 loss_scale: 1024
-momentum: 0.85
-weight_decay: 5.0e-5
-epoch_size: 60
-pretrain_epoch_size: 0
+momentum: 0.9
+weight_decay: 5.0e-05
+epoch_size: 36
+start_epoch: 0
+resume_ckpt: ""
 save_checkpoint: False
 save_checkpoint_epochs: 5
 keep_checkpoint_max: 10
-warmup_epochs: 1
+warmup_epochs: 5
 lr_decay_mode: "poly"
 use_label_smooth: True
 label_smooth_factor: 0.1
 lr_init: 0
-lr_max: 13.01
-lr_end: 0.0
+lr_max: 7.6
+lr_end: 0.0001
 lars_epsilon: 0.0
 lars_coefficient: 0.001
 
 net_name: "resnet50"
 dataset: "imagenet2012"
-device_num: 1
+device_num: 8
 pre_trained: ""
-run_eval: False
-eval_dataset_path: ""
+run_eval: True
+eval_dataset_path: "/data/resnet_tc/Imagenet2012/val"
 parameter_server: False
 filter_weight: False
-save_best_ckpt: True
-eval_start_epoch: 30
-eval_interval: 1
+save_best_ckpt: False
+eval_start_epoch: 3
+eval_interval: 4
 enable_cache: False
 cache_session_id: ""
 mode_name: "GRAPH"
 boost_mode: "O1"
 conv_init: "TruncatedNormal"
-dense_init: "RandomNormal"
+dense_init: "TruncatedNormal"
 all_reduce_fusion_config:
     - 85
     - 160
 train_image_size: 192
 eval_image_size: 224
+max_device_memory: "30GB"
 
 # Export options
 device_id: 0
@@ -80,6 +83,18 @@ has_trained_step: 0
 result_path: ''
 label_path: ''
 
+# prediction
+img_path: ''
+
+# lite inference
+enable_predict: False
+enable_predict_lite_backend: False
+enable_predict_lite_mindir: False
+
+# lite mindir inference
+mindir_path: 'net.mindir'
+
+
 ---
 # Help description for each configuration
 enable_modelarts: "Whether training on modelarts, default: False"
@@ -92,8 +107,9 @@ device_target: "Target device type, available: [Ascend, GPU, CPU]"
 enable_profiling: "Whether enable profiling while training, default: False"
 num_classes: "Class for dataset"
 batch_size: "Batch size for training and evaluation"
 epoch_size: "Total training epochs."
 checkpoint_path: "The location of the checkpoint file."
 checkpoint_file_path: "The location of the checkpoint file."
 save_graphs: "Whether save graphs during training, default: False."
 save_graphs_path: "Path to save graphs."
+img_path: "image file path."
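The LARS settings above imply a linear-warmup, polynomial-decay schedule (lr_decay_mode: "poly"). A sketch of that shape follows; the repository's own LR generator is authoritative, and the decay power used here is an assumption, not a value taken from this config.

# Sketch only: linear warmup then polynomial decay, the shape suggested by
# lr_init=0, lr_max=7.6, lr_end=0.0001, warmup_epochs=5 above.
# power=2.0 is an assumed exponent, not a value from this config file.
import numpy as np

def poly_lr(lr_init, lr_max, lr_end, warmup_epochs, total_epochs, steps_per_epoch, power=2.0):
    warmup_steps = warmup_epochs * steps_per_epoch
    total_steps = total_epochs * steps_per_epoch
    lr = np.empty(total_steps, dtype=np.float32)
    for i in range(total_steps):
        if i < warmup_steps:
            # ramp linearly from lr_init up to lr_max
            lr[i] = lr_init + (lr_max - lr_init) * (i + 1) / warmup_steps
        else:
            # decay polynomially from lr_max down to lr_end
            frac = 1.0 - (i - warmup_steps) / (total_steps - warmup_steps)
            lr[i] = lr_end + (lr_max - lr_end) * frac ** power
    return lr

# ImageNet with a global batch of 8 * 256 = 2048 gives roughly 625 steps/epoch:
schedule = poly_lr(0.0, 7.6, 0.0001, warmup_epochs=5, total_epochs=36, steps_per_epoch=625)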
diff --git a/benchmark/ascend/resnet/config/resnet50_imagenet2012_Boost_config_16p.yaml b/benchmark/ascend/resnet/config/resnet50_imagenet2012_Boost_config_16p.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1f11aadef5b56e2c59eb09e05ac8d6b32d6a9e21 --- /dev/null +++ b/benchmark/ascend/resnet/config/resnet50_imagenet2012_Boost_config_16p.yaml @@ -0,0 +1,114 @@ +# Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing) +enable_modelarts: False +# Url for modelarts +data_url: "" +train_url: "" +checkpoint_url: "" +# Path for local +run_distribute: True +enable_profiling: False +data_path: "/data/resnet_tc/Imagenet2012/train" +output_dir: "../outputs" +load_path: "/cache/checkpoint_path/" +device_target: "Ascend" +checkpoint_path: "./checkpoint/" +checkpoint_file_path: "" + +# ============================================================================== +# Training options +optimizer: "LARS" +infer_label: "" +class_num: 1001 +batch_size: 192 +eval_batch_size: 125 +loss_scale: 1024 +momentum: 0.9 +weight_decay: 5.0e-05 +epoch_size: 37 +start_epoch: 0 +resume_ckpt: "" +save_checkpoint: False +save_checkpoint_epochs: 5 +keep_checkpoint_max: 10 +warmup_epochs: 5 +lr_decay_mode: "poly" +use_label_smooth: True +label_smooth_factor: 0.1 +lr_init: 0 +lr_max: 11.0 +lr_end: 0.0001 +lars_epsilon: 0.0 +lars_coefficient: 0.001 + +net_name: "resnet50" +dataset: "imagenet2012" +device_num: 8 +pre_trained: "" +run_eval: True +eval_dataset_path: "/data/resnet_tc/Imagenet2012/val" +parameter_server: False +filter_weight: False +save_best_ckpt: False +eval_start_epoch: 4 +eval_interval: 4 +enable_cache: False +cache_session_id: "" +mode_name: "GRAPH" +boost_mode: "O1" +conv_init: "TruncatedNormal" +dense_init: "TruncatedNormal" +all_reduce_fusion_config: + - 85 + - 160 +train_image_size: 192 +eval_image_size: 224 +max_device_memory: "30GB" + +# Export options +device_id: 0 +width: 224 +height: 224 +file_name: "resnet50" +file_format: "MINDIR" +ckpt_file: "" +network_dataset: "resnet50_imagenet2012" + +# Retrain options +save_graphs: False +save_graphs_path: "./graphs" +has_trained_epoch: 0 +has_trained_step: 0 + +# postprocess resnet inference +result_path: '' +label_path: '' + +# prediction +img_path: '' + +# lite inference +enable_predict: False +enable_predict_lite_backend: False +enable_predict_lite_mindir: False + +# lite mindir inference +mindir_path: 'net.mindir' + + +# # Help description for each configuration +# enable_modelarts: "Whether training on modelarts, default: False" +# data_url: "Dataset url for obs" +# checkpoint_url: "The location of checkpoint for obs" +# data_path: "Dataset path for local" +# output_path: "Training output path for local" +# load_path: "The location of checkpoint for obs" +# device_target: "Target device type, available: [Ascend, GPU, CPU]" +# enable_profiling: "Whether enable profiling while training, default: False" +# num_classes: "Class for dataset" +# batch_size: "Batch size for training and evaluation" +# epoch_size: 37 +# checkpoint_path: "The location of the checkpoint file." +# checkpoint_file_path: "The location of the checkpoint file." +# save_graphs: "Whether save graphs during training, default: False." +# save_graphs_path: "Path to save graphs." +# img_path: "image file path." 
diff --git a/benchmark/ascend/resnet/config/resnet50_imagenet2012_Boost_config_32p.yaml b/benchmark/ascend/resnet/config/resnet50_imagenet2012_Boost_config_32p.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bc866eee638a3590fa0509ba1395953375064523 --- /dev/null +++ b/benchmark/ascend/resnet/config/resnet50_imagenet2012_Boost_config_32p.yaml @@ -0,0 +1,114 @@ +# Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing) +enable_modelarts: False +# Url for modelarts +data_url: "" +train_url: "" +checkpoint_url: "" +# Path for local +run_distribute: True +enable_profiling: False +data_path: "/data/resnet_tc/Imagenet2012/train" +output_dir: "../outputs" +load_path: "/cache/checkpoint_path/" +device_target: "Ascend" +checkpoint_path: "./checkpoint/" +checkpoint_file_path: "" + +# ============================================================================== +# Training options +optimizer: "LARS" +infer_label: "" +class_num: 1001 +batch_size: 96 +eval_batch_size: 125 +loss_scale: 1024 +momentum: 0.9 +weight_decay: 5.0e-05 +epoch_size: 37 +start_epoch: 0 +resume_ckpt: "" +save_checkpoint: False +save_checkpoint_epochs: 5 +keep_checkpoint_max: 10 +warmup_epochs: 5 +lr_decay_mode: "poly" +use_label_smooth: True +label_smooth_factor: 0.1 +lr_init: 0 +lr_max: 11.0 +lr_end: 0.0001 +lars_epsilon: 0.0 +lars_coefficient: 0.001 + +net_name: "resnet50" +dataset: "imagenet2012" +device_num: 32 +pre_trained: "" +run_eval: True +eval_dataset_path: "/data/resnet_tc/Imagenet2012/val" +parameter_server: False +filter_weight: False +save_best_ckpt: False +eval_start_epoch: 4 +eval_interval: 4 +enable_cache: False +cache_session_id: "" +mode_name: "GRAPH" +boost_mode: "O1" +conv_init: "TruncatedNormal" +dense_init: "TruncatedNormal" +all_reduce_fusion_config: + - 85 + - 160 +train_image_size: 192 +eval_image_size: 224 +max_device_memory: "30GB" + +# Export options +device_id: 0 +width: 224 +height: 224 +file_name: "resnet50" +file_format: "MINDIR" +ckpt_file: "" +network_dataset: "resnet50_imagenet2012" + +# Retrain options +save_graphs: False +save_graphs_path: "./graphs" +has_trained_epoch: 0 +has_trained_step: 0 + +# postprocess resnet inference +result_path: '' +label_path: '' + +# prediction +img_path: '' + +# lite inference +enable_predict: False +enable_predict_lite_backend: False +enable_predict_lite_mindir: False + +# lite mindir inference +mindir_path: 'net.mindir' + + +# # Help description for each configuration +# enable_modelarts: "Whether training on modelarts, default: False" +# data_url: "Dataset url for obs" +# checkpoint_url: "The location of checkpoint for obs" +# data_path: "Dataset path for local" +# output_path: "Training output path for local" +# load_path: "The location of checkpoint for obs" +# device_target: "Target device type, available: [Ascend, GPU, CPU]" +# enable_profiling: "Whether enable profiling while training, default: False" +# num_classes: "Class for dataset" +# batch_size: "Batch size for training and evaluation" +# epoch_size: 37 +# checkpoint_path: "The location of the checkpoint file." +# checkpoint_file_path: "The location of the checkpoint file." +# save_graphs: "Whether save graphs during training, default: False." +# save_graphs_path: "Path to save graphs." +# img_path: "image file path." 
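Taken together, the three Boost configs trade per-device batch size against device count. A quick check of the implied global batch sizes (note the 16p yaml keeps device_num: 8 per node; its 2-node launch script overrides --device_num to the 16-rank world size):

# Sketch only: global batch sizes implied by the three Boost configs above.
configs = {
    "Boost_config (8p)":      {"ranks": 8,  "batch_size": 256},
    "Boost_config_16p (16p)": {"ranks": 16, "batch_size": 192},
    "Boost_config_32p (32p)": {"ranks": 32, "batch_size": 96},
}
for name, cfg in configs.items():
    print(f"{name}: global batch = {cfg['ranks'] * cfg['batch_size']}")
# Boost_config (8p): global batch = 2048
# Boost_config_16p (16p): global batch = 3072
# Boost_config_32p (32p): global batch = 3072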
diff --git a/benchmark/ascend/resnet/eval.py b/benchmark/ascend/resnet/eval.py deleted file mode 100644 index d7be56d476fe4318afc1c9c167245c3687ad5ba1..0000000000000000000000000000000000000000 --- a/benchmark/ascend/resnet/eval.py +++ /dev/null @@ -1,113 +0,0 @@ -# Copyright 2020-2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -"""eval resnet.""" -import os -import mindspore as ms -from mindspore import Tensor -from mindspore.nn.optim import Momentum -from mindspore.common import set_seed -from mindspore.nn.loss import SoftmaxCrossEntropyWithLogits -from mindspore.train.model import Model -from src.CrossEntropySmooth import CrossEntropySmooth -from src.model_utils.config import config -from src.model_utils.moxing_adapter import moxing_wrapper - -set_seed(1) - -if config.net_name in ("resnet18", "resnet34", "resnet50", "resnet152"): - if config.net_name == "resnet18": - from src.resnet import resnet18 as resnet - elif config.net_name == "resnet34": - from src.resnet import resnet34 as resnet - elif config.net_name == "resnet50": - from src.resnet import resnet50 as resnet - else: - from src.resnet import resnet152 as resnet - if config.dataset == "cifar10": - from src.dataset import create_dataset1 as create_dataset - else: - from src.dataset import create_dataset2 as create_dataset -elif config.net_name == "resnet101": - from src.resnet import resnet101 as resnet - from src.dataset import create_dataset3 as create_dataset -else: - from src.resnet import se_resnet50 as resnet - from src.dataset import create_dataset4 as create_dataset - - -def init_group_params(net): - decayed_params = [] - no_decayed_params = [] - for param in net.trainable_params(): - if 'beta' not in param.name and 'gamma' not in param.name and 'bias' not in param.name: - decayed_params.append(param) - else: - no_decayed_params.append(param) - - group_params = [{'params': decayed_params, 'weight_decay': config.weight_decay}, - {'params': no_decayed_params}, - {'order_params': net.trainable_params()}] - return group_params - - -@moxing_wrapper() -def eval_net(): - """eval net""" - target = config.device_target - - # init context - ms.set_context(mode=ms.GRAPH_MODE, device_target=target, save_graphs=False) - if target == "Ascend": - device_id = int(os.getenv('DEVICE_ID')) - ms.set_context(device_id=device_id) - - # create dataset - dataset = create_dataset(dataset_path=config.data_path, do_train=False, batch_size=config.batch_size, - eval_image_size=config.eval_image_size, - target=target) - - # define net - net = resnet(class_num=config.class_num) - - # load checkpoint - param_dict = ms.load_checkpoint(config.checkpoint_file_path) - ms.load_param_into_net(net, param_dict) - net.set_train(False) - - # define loss, model - if config.dataset == "imagenet2012": - if not config.use_label_smooth: - config.label_smooth_factor = 0.0 - loss = CrossEntropySmooth(sparse=True, reduction='mean', - 
smooth_factor=config.label_smooth_factor, - num_classes=config.class_num) - else: - loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean') - - #Currently, boost inference only supports scenarios with optimizers - #Optimizer waiting for decoupling in boost model - group_params = init_group_params(net) - opt = Momentum(group_params, Tensor(0.0), config.momentum, loss_scale=config.loss_scale) - - # define model, add boostmode for eval scenarios with train.py - model = Model(net, loss_fn=loss, boost_level=config.boost_mode, - optimizer=opt, metrics={'top_1_accuracy', 'top_5_accuracy'}) - - # eval model - res = model.eval(dataset) - print("result:", res, "ckpt=", config.checkpoint_file_path) - -if __name__ == '__main__': - eval_net() diff --git a/benchmark/ascend/resnet/export.py b/benchmark/ascend/resnet/export.py deleted file mode 100644 index 26786983602eb0698f36c9b4e629000544e2d220..0000000000000000000000000000000000000000 --- a/benchmark/ascend/resnet/export.py +++ /dev/null @@ -1,63 +0,0 @@ -# Copyright 2020-2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -""" -##############export checkpoint file into air and onnx models################# -python export.py -""" -import os - -import mindspore as ms -from src.model_utils.config import config -from src.model_utils.moxing_adapter import moxing_wrapper - -ms.set_context(mode=ms.GRAPH_MODE, device_target=config.device_target) -if config.device_target != "GPU": - ms.set_context(device_id=config.device_id) - -def modelarts_pre_process(): - '''modelarts pre process function.''' - config.file_name = os.path.join(config.output_path, config.file_name) - -@moxing_wrapper(pre_process=modelarts_pre_process) -def run_export(): - """run export.""" - if config.network_dataset in ['resnet18_cifar10', 'resnet18_imagenet2012']: - from src.resnet import resnet18 as resnet - elif config.network_dataset == 'resnet34_imagenet2012': - from src.resnet import resnet34 as resnet - elif config.network_dataset in ['resnet50_cifar10', 'resnet50_imagenet2012']: - from src.resnet import resnet50 as resnet - elif config.network_dataset == 'resnet101_imagenet2012': - from src.resnet import resnet101 as resnet - elif config.network_dataset == 'resnet152_imagenet2012': - from src.resnet import resnet152 as resnet - elif config.network_dataset == 'se-resnet50_imagenet2012': - from src.resnet import se_resnet50 as resnet - else: - raise ValueError("network and dataset is not support.") - - net = resnet(config.class_num) - - assert config.checkpoint_file_path is not None, "checkpoint_path is None." 
- - param_dict = ms.load_checkpoint(config.checkpoint_file_path) - ms.load_param_into_net(net, param_dict) - - input_arr = ms.numpy.zeros([config.batch_size, 3, config.height, config.width], ms.float32) - ms.export(net, input_arr, file_name=config.file_name, file_format=config.file_format) - - -if __name__ == '__main__': - run_export() diff --git a/benchmark/ascend/resnet/scripts/run_distribute_train.sh b/benchmark/ascend/resnet/scripts/run_distribute_train.sh index c6577b16dfdd5ce2d2a634ce474169f57f9bf89a..6dc9722819dc8b49af989d400cbe5e249850fdb5 100644 --- a/benchmark/ascend/resnet/scripts/run_distribute_train.sh +++ b/benchmark/ascend/resnet/scripts/run_distribute_train.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright 2020-2021 Huawei Technologies Co., Ltd +# Copyright 2024 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -18,10 +18,12 @@ CURPATH="$(dirname "$0")" # shellcheck source=/dev/null . ${CURPATH}/cache_util.sh -if [ $# != 3 ] && [ $# != 4 ] && [ $# != 5 ] +if [ $# != 3 ] && [ $# != 4 ] && [ $# != 5 ] && [ $# != 6 ] then - echo "Usage: bash run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](optional)" - echo " bash run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional)" + echo "Usage: bash run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH]" + echo "Usage: bash run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH] [RESUME_CKPT](optional)" + echo "Usage: bash run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional)" + echo "Usage: bash run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional) [RESUME_CKPT](optional)" exit 1 fi @@ -39,13 +41,13 @@ CONFIG_FILE=$(get_real_path $3) str="Boost_" if [[ $CONFIG_FILE =~ $str ]] then - export MS_DISABLE_REF_MODE=1 + export MS_DISABLE_REF_MODE=0 export MS_ENABLE_FORMAT_MODE=0 fi if [ $# == 4 ] then - PATH3=$(get_real_path $4) + RESUME_CKPT=$(get_real_path $4) fi if [ $# == 5 ] @@ -54,6 +56,13 @@ then EVAL_DATASET_PATH=$(get_real_path $5) fi +if [ $# == 6 ] +then + RUN_EVAL=$4 + EVAL_DATASET_PATH=$(get_real_path $5) + RESUME_CKPT=$(get_real_path $6) +fi + if [ ! -f $PATH1 ] then echo "error: RANK_TABLE_FILE=$PATH1 is not a file" @@ -66,9 +75,9 @@ then exit 1 fi -if [ $# == 4 ] && [ ! -f $PATH3 ] +if [ $# == 4 ] && [ ! 
-f $RESUME_CKPT ] then - echo "error: PRETRAINED_CKPT_PATH=$PATH3 is not a file" + echo "error: RESUME_CKPT=$RESUME_CKPT is not a file" exit 1 fi @@ -101,6 +110,7 @@ do start=`expr $i \* $avg` end=`expr $start \+ $gap` cmdopt=$start"-"$end + echo "773491: $cmdopt" export DEVICE_ID=${i} export RANK_ID=$((rank_start + i)) rm -rf ./train_parallel$i @@ -115,20 +125,31 @@ do if [ $# == 3 ] then taskset -c $cmdopt python train.py --run_distribute=True --device_num=$RANK_SIZE --data_path=$PATH2 \ - --config_path=$CONFIG_FILE --output_path './output' &> log & + --config_path=$CONFIG_FILE --output_dir '../outputs' &> log.txt & fi - + if [ $# == 4 ] then - taskset -c $cmdopt python train.py --run_distribute=True --device_num=$RANK_SIZE --data_path=$PATH2 --pre_trained=$PATH3 \ - --config_path=$CONFIG_FILE --output_path './output' &> log & + taskset -c $cmdopt python train.py --run_distribute=True --device_num=$RANK_SIZE --data_path=$PATH2 --resume_ckpt=$RESUME_CKPT \ + --config_path=$CONFIG_FILE --output_dir '../outputs' &> log.txt & fi if [ $# == 5 ] then taskset -c $cmdopt python train.py --run_distribute=True --device_num=$RANK_SIZE --data_path=$PATH2 \ - --run_eval=$RUN_EVAL --eval_dataset_path=$EVAL_DATASET_PATH --enable_cache=True \ - --cache_session_id=$CACHE_SESSION_ID --config_path=$CONFIG_FILE --output_path './output' &> log & + --run_eval=$RUN_EVAL --eval_dataset_path=$EVAL_DATASET_PATH --enable_cache=False \ + --cache_session_id=$CACHE_SESSION_ID --config_path=$CONFIG_FILE --output_dir '../outputs' &> log.txt & + if [ "x${RUN_EVAL}" == "xTrue" ] + then + echo -e "\nWhen training run is done, remember to shut down the cache server via \"cache_admin --stop\"" + fi + fi + + if [ $# == 6 ] + then + taskset -c $cmdopt python train.py --run_distribute=True --device_num=$RANK_SIZE --data_path=$PATH2 \ + --run_eval=$RUN_EVAL --eval_dataset_path=$EVAL_DATASET_PATH --enable_cache=True --resume_ckpt=$RESUME_CKPT \ + --cache_session_id=$CACHE_SESSION_ID --config_path=$CONFIG_FILE --output_dir '../outputs' &> log.txt & if [ "x${RUN_EVAL}" == "xTrue" ] then echo -e "\nWhen training run is done, remember to shut down the cache server via \"cache_admin --stop\"" diff --git a/benchmark/ascend/resnet/scripts/run_distribute_train_2node_16p.sh b/benchmark/ascend/resnet/scripts/run_distribute_train_2node_16p.sh new file mode 100644 index 0000000000000000000000000000000000000000..6df3a9d69b16c00015b63c365b5e0d6283fc3664 --- /dev/null +++ b/benchmark/ascend/resnet/scripts/run_distribute_train_2node_16p.sh @@ -0,0 +1,135 @@ +#!/bin/bash +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +CURPATH="$(dirname "$0")" +# shellcheck source=/dev/null +. 
${CURPATH}/cache_util.sh
+
+if [ $# != 3 ] && [ $# != 4 ] && [ $# != 5 ] && [ $# != 6 ]
+then
+    echo "Usage: bash run_distribute_train_2node_16p.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH] [RUN_EVAL] [EVAL_DATASET_PATH]"
+    echo "Usage: bash run_distribute_train_2node_16p.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH] [RUN_EVAL] [EVAL_DATASET_PATH] [SERVER_ID](optional)"
+    echo "RUN_EVAL toggles evaluation during training; EVAL_DATASET_PATH is required when it is True."
+    echo "SERVER_ID (default 0) selects which node of the 2-node (16p) job this launch runs on."
+    exit 1
+fi
+
+get_real_path(){
+    if [ "${1:0:1}" == "/" ]; then
+        echo "$1"
+    else
+        echo "$(realpath -m $PWD/$1)"
+    fi
+}
+
+PATH1=$(get_real_path $1)
+PATH2=$(get_real_path $2)
+CONFIG_FILE=$(get_real_path $3)
+str="Boost_"
+if [[ $CONFIG_FILE =~ $str ]]
+then
+    export MS_DISABLE_REF_MODE=0
+    export MS_ENABLE_FORMAT_MODE=0
+fi
+
+if [ $# == 5 ]
+then
+    RUN_EVAL=$4
+    EVAL_DATASET_PATH=$(get_real_path $5)
+fi
+
+export SERVER_ID=0
+
+if [ $# == 6 ]
+then
+    RUN_EVAL=$4
+    EVAL_DATASET_PATH=$(get_real_path $5)
+    export SERVER_ID=$6
+fi
+
+if [ ! -f $PATH1 ]
+then
+    echo "error: RANK_TABLE_FILE=$PATH1 is not a file"
+exit 1
+fi
+
+if [ ! -d $PATH2 ]
+then
+    echo "error: DATASET_PATH=$PATH2 is not a directory"
+exit 1
+fi
+
+if [ "x${RUN_EVAL}" == "xTrue" ] && [ ! -d $EVAL_DATASET_PATH ]
+then
+    echo "error: EVAL_DATASET_PATH=$EVAL_DATASET_PATH is not a directory"
+    exit 1
+fi
+
+
+ulimit -u unlimited
+export DEVICE_NUM=8
+export RANK_SIZE=16
+export RANK_TABLE_FILE=$PATH1
+offset=0
+
+rank_start=$((DEVICE_NUM * SERVER_ID))
+
+cpus=`cat /proc/cpuinfo| grep "processor"| wc -l`
+avg=`expr $cpus \/ $DEVICE_NUM`
+gap=`expr $avg \- 1`
+
+for((i=0; i<${DEVICE_NUM}; i++))
+do
+    start=`expr $i \* $avg`
+    end=`expr $start \+ $gap`
+    cmdopt=$start"-"$end
+    echo "CPU affinity for device $i: $cmdopt"
+    export DEVICE_ID=$((offset + i))
+    export RANK_ID=$((rank_start + i))
+    rm -rf ./train_parallel$i
+    mkdir ./train_parallel$i
+    cp ../*.py ./train_parallel$i
+    cp *.sh ./train_parallel$i
+    cp -r ../config/*.yaml ./train_parallel$i
+    cp -r ../src ./train_parallel$i
+    cd ./train_parallel$i || exit
+    echo "start training for rank $RANK_ID, device $DEVICE_ID"
+    env > env.log
+
+    if [ $# == 5 ]
+    then
+        taskset -c $cmdopt python train.py --run_distribute=True --device_num=$RANK_SIZE --data_path=$PATH2 \
+        --run_eval=$RUN_EVAL --eval_dataset_path=$EVAL_DATASET_PATH --enable_cache=False \
+        --cache_session_id=$CACHE_SESSION_ID --config_path=$CONFIG_FILE --output_dir '../outputs' &> log.txt &
+        if [ "x${RUN_EVAL}" == "xTrue" ]
+        then
+            echo -e "\nWhen training run is done, remember to shut down the cache server via \"cache_admin --stop\""
+        fi
+    fi
+
+    if [ $# == 6 ]
+    then
+        taskset -c $cmdopt python train.py --run_distribute=True --device_num=$RANK_SIZE --data_path=$PATH2 \
+        --run_eval=$RUN_EVAL --eval_dataset_path=$EVAL_DATASET_PATH --enable_cache=False \
+        --cache_session_id=$CACHE_SESSION_ID --config_path=$CONFIG_FILE --output_dir '../outputs' &> log.txt &
+        if [ "x${RUN_EVAL}" == "xTrue" ]
+        then
+            echo -e "\nWhen training run is done, remember to shut down the cache server via \"cache_admin --stop\""
+        fi
+    fi
+    cd ..
+done
+
diff --git a/benchmark/ascend/resnet/scripts/run_distribute_train_4p.sh b/benchmark/ascend/resnet/scripts/run_distribute_train_4p.sh
new file mode 100644
index 0000000000000000000000000000000000000000..6c6d1680be98d8bcd65cdfcc8f6d6258a7d72c10
--- /dev/null
+++ b/benchmark/ascend/resnet/scripts/run_distribute_train_4p.sh
@@ -0,0 +1,154 @@
+#!/bin/bash
+# Copyright 2024 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+CURPATH="$(dirname "$0")"
+# shellcheck source=/dev/null
+. ${CURPATH}/cache_util.sh
+
+if [ $# != 3 ] && [ $# != 4 ] && [ $# != 5 ] && [ $# != 6 ]
+then
+    echo "Usage: bash run_distribute_train_4p.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH]"
+    echo "Usage: bash run_distribute_train_4p.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH] [RESUME_CKPT](optional)"
+    echo "Usage: bash run_distribute_train_4p.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional)"
+    echo "Usage: bash run_distribute_train_4p.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional) [RESUME_CKPT](optional)"
+    exit 1
+fi
+
+get_real_path(){
+    if [ "${1:0:1}" == "/" ]; then
+        echo "$1"
+    else
+        echo "$(realpath -m $PWD/$1)"
+    fi
+}
+
+PATH1=$(get_real_path $1)
+PATH2=$(get_real_path $2)
+CONFIG_FILE=$(get_real_path $3)
+str="Boost_"
+if [[ $CONFIG_FILE =~ $str ]]
+then
+    export MS_DISABLE_REF_MODE=0
+    export MS_ENABLE_FORMAT_MODE=0
+fi
+
+if [ $# == 4 ]
+then
+    RESUME_CKPT=$(get_real_path $4)
+fi
+
+if [ $# == 5 ]
+then
+    RUN_EVAL=$4
+    EVAL_DATASET_PATH=$(get_real_path $5)
+fi
+
+if [ $# == 6 ]
+then
+    RUN_EVAL=$4
+    EVAL_DATASET_PATH=$(get_real_path $5)
+    RESUME_CKPT=$(get_real_path $6)
+fi
+
+if [ ! -f $PATH1 ]
+then
+    echo "error: RANK_TABLE_FILE=$PATH1 is not a file"
+exit 1
+fi
+
+if [ ! -d $PATH2 ]
+then
+    echo "error: DATASET_PATH=$PATH2 is not a directory"
+exit 1
+fi
+
+if [ $# == 4 ] && [ ! -f $RESUME_CKPT ]
+then
+    echo "error: RESUME_CKPT=$RESUME_CKPT is not a file"
+exit 1
+fi
+
+if [ "x${RUN_EVAL}" == "xTrue" ] && [ !
-d $EVAL_DATASET_PATH ] +then + echo "error: EVAL_DATASET_PATH=$EVAL_DATASET_PATH is not a directory" + exit 1 +fi + + +ulimit -u unlimited +export DEVICE_NUM=4 +export RANK_SIZE=4 +export RANK_TABLE_FILE=$PATH1 + +export SERVER_ID=0 +rank_start=$((DEVICE_NUM * SERVER_ID)) + +cpus=`cat /proc/cpuinfo| grep "processor"| wc -l` +avg=`expr $cpus \/ $DEVICE_NUM` +gap=`expr $avg \- 1` + +for((i=0; i<${DEVICE_NUM}; i++)) +do + start=`expr $i \* $avg` + end=`expr $start \+ $gap` + cmdopt=$start"-"$end + echo "773491: $cmdopt" + export DEVICE_ID=$((4 + i)) + export RANK_ID=$((rank_start + i)) + rm -rf ./train_parallel$i + mkdir ./train_parallel$i + cp ../*.py ./train_parallel$i + cp *.sh ./train_parallel$i + cp -r ../config/*.yaml ./train_parallel$i + cp -r ../src ./train_parallel$i + cd ./train_parallel$i || exit + echo "start training for rank $RANK_ID, device $DEVICE_ID" + env > env.log + if [ $# == 3 ] + then + taskset -c $cmdopt python train.py --run_distribute=True --device_num=$RANK_SIZE --data_path=$PATH2 \ + --config_path=$CONFIG_FILE --output_dir '../outputs' &> log.txt & + fi + + if [ $# == 4 ] + then + taskset -c $cmdopt python train.py --run_distribute=True --device_num=$RANK_SIZE --data_path=$PATH2 --resume_ckpt=$RESUME_CKPT \ + --config_path=$CONFIG_FILE --output_dir '../outputs' &> log.txt & + fi + + if [ $# == 5 ] + then + taskset -c $cmdopt python train.py --run_distribute=True --device_num=$RANK_SIZE --data_path=$PATH2 \ + --run_eval=$RUN_EVAL --eval_dataset_path=$EVAL_DATASET_PATH --enable_cache=False \ + --cache_session_id=$CACHE_SESSION_ID --config_path=$CONFIG_FILE --output_dir '../outputs' &> log.txt & + if [ "x${RUN_EVAL}" == "xTrue" ] + then + echo -e "\nWhen training run is done, remember to shut down the cache server via \"cache_admin --stop\"" + fi + fi + + if [ $# == 6 ] + then + taskset -c $cmdopt python train.py --run_distribute=True --device_num=$RANK_SIZE --data_path=$PATH2 \ + --run_eval=$RUN_EVAL --eval_dataset_path=$EVAL_DATASET_PATH --enable_cache=True --resume_ckpt=$RESUME_CKPT \ + --cache_session_id=$CACHE_SESSION_ID --config_path=$CONFIG_FILE --output_dir '../outputs' &> log.txt & + if [ "x${RUN_EVAL}" == "xTrue" ] + then + echo -e "\nWhen training run is done, remember to shut down the cache server via \"cache_admin --stop\"" + fi + fi + cd .. +done diff --git a/benchmark/ascend/resnet/scripts/run_distribute_train_multi_server.sh b/benchmark/ascend/resnet/scripts/run_distribute_train_multi_server.sh new file mode 100644 index 0000000000000000000000000000000000000000..c8f66898c9c6f5ce2c0b145852eb71dc705394fd --- /dev/null +++ b/benchmark/ascend/resnet/scripts/run_distribute_train_multi_server.sh @@ -0,0 +1,134 @@ +#!/bin/bash +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +CURPATH="$(dirname "$0")" +# shellcheck source=/dev/null +. 
${CURPATH}/cache_util.sh
+
+if [ $# != 3 ] && [ $# != 4 ] && [ $# != 5 ] && [ $# != 6 ] && [ $# != 7 ]
+then
+    echo "Usage: bash run_distribute_train_multi_server.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH] [RUN_EVAL] [EVAL_DATASET_PATH] [SERVER_ID] [RANK_SIZE]"
+    echo "RUN_EVAL and EVAL_DATASET_PATH control evaluation during training."
+    echo "SERVER_ID is the index of this server within the multi-server job (default 0)."
+    echo "RANK_SIZE is the total number of devices across all servers (7th argument)."
+    exit 1
+fi
+
+get_real_path(){
+    if [ "${1:0:1}" == "/" ]; then
+        echo "$1"
+    else
+        echo "$(realpath -m $PWD/$1)"
+    fi
+}
+
+PATH1=$(get_real_path $1)
+PATH2=$(get_real_path $2)
+CONFIG_FILE=$(get_real_path $3)
+str="Boost_"
+if [[ $CONFIG_FILE =~ $str ]]
+then
+    export MS_DISABLE_REF_MODE=0
+    export MS_ENABLE_FORMAT_MODE=0
+fi
+
+if [ $# == 5 ]
+then
+    RUN_EVAL=$4
+    EVAL_DATASET_PATH=$(get_real_path $5)
+fi
+
+export SERVER_ID=0
+
+if [ $# == 7 ]
+then
+    RUN_EVAL=$4
+    EVAL_DATASET_PATH=$(get_real_path $5)
+    export SERVER_ID=$6
+fi
+
+if [ ! -f $PATH1 ]
+then
+    echo "error: RANK_TABLE_FILE=$PATH1 is not a file"
+exit 1
+fi
+
+if [ ! -d $PATH2 ]
+then
+    echo "error: DATASET_PATH=$PATH2 is not a directory"
+exit 1
+fi
+
+if [ "x${RUN_EVAL}" == "xTrue" ] && [ ! -d $EVAL_DATASET_PATH ]
+then
+    echo "error: EVAL_DATASET_PATH=$EVAL_DATASET_PATH is not a directory"
+    exit 1
+fi
+
+
+ulimit -u unlimited
+export DEVICE_NUM=8
+export RANK_SIZE=$7
+export RANK_TABLE_FILE=$PATH1
+
+rank_start=$((DEVICE_NUM * SERVER_ID))
+
+cpus=`cat /proc/cpuinfo| grep "processor"| wc -l`
+avg=`expr $cpus \/ $DEVICE_NUM`
+gap=`expr $avg \- 1`
+
+for((i=0; i<${DEVICE_NUM}; i++))
+do
+    start=`expr $i \* $avg`
+    end=`expr $start \+ $gap`
+    cmdopt=$start"-"$end
+    echo "CPU affinity for device $i: $cmdopt"
+    export DEVICE_ID=${i}
+    export RANK_ID=$((rank_start + i))
+    rm -rf ./train_parallel$i
+    mkdir ./train_parallel$i
+    cp ../*.py ./train_parallel$i
+    cp *.sh ./train_parallel$i
+    cp -r ../config/*.yaml ./train_parallel$i
+    cp -r ../src ./train_parallel$i
+    cd ./train_parallel$i || exit
+    echo "start training for rank $RANK_ID, device $DEVICE_ID"
+    env > env.log
+
+    if [ $# == 5 ]
+    then
+        taskset -c $cmdopt python train.py --run_distribute=True --device_num=$RANK_SIZE --data_path=$PATH2 \
+        --run_eval=$RUN_EVAL --eval_dataset_path=$EVAL_DATASET_PATH --enable_cache=False \
+        --cache_session_id=$CACHE_SESSION_ID --config_path=$CONFIG_FILE --output_dir '../outputs' &> log.txt &
+        if [ "x${RUN_EVAL}" == "xTrue" ]
+        then
+            echo -e "\nWhen training run is done, remember to shut down the cache server via \"cache_admin --stop\""
+        fi
+    fi
+
+    if [ $# == 7 ]
+    then
+        taskset -c $cmdopt python train.py --run_distribute=True --device_num=$RANK_SIZE --data_path=$PATH2 \
+        --run_eval=$RUN_EVAL --eval_dataset_path=$EVAL_DATASET_PATH --enable_cache=False \
+        --cache_session_id=$CACHE_SESSION_ID --config_path=$CONFIG_FILE --output_dir '../outputs' &> log.txt &
+        if [ "x${RUN_EVAL}" == "xTrue" ]
+        then
+            echo -e "\nWhen training run is done, remember to shut down the cache server via \"cache_admin --stop\""
+        fi
+    fi
+    cd ..
+done + diff --git a/benchmark/ascend/resnet/scripts/run_infer_310.sh b/benchmark/ascend/resnet/scripts/run_infer_310.sh deleted file mode 100644 index a733fd4745860aff7490ca8ba78fd633eca8a549..0000000000000000000000000000000000000000 --- a/benchmark/ascend/resnet/scripts/run_infer_310.sh +++ /dev/null @@ -1,145 +0,0 @@ -#!/bin/bash -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ - -if [[ $# -lt 5 || $# -gt 6 ]]; then - echo "Usage: bash run_infer_310.sh [MINDIR_PATH] [NET_TYPE] [DATASET] [DATA_PATH] [CONFIG_PATH] [DEVICE_ID] - NET_TYPE can choose from [resnet18, resnet34, se-resnet50, resnet50, resnet101, resnet152] - DATASET can choose from [cifar10, imagenet] - DEVICE_ID is optional, it can be set by environment variable device_id, otherwise the value is zero" -exit 1 -fi - -get_real_path(){ - if [ "${1:0:1}" == "/" ]; then - echo "$1" - else - echo "$(realpath -m $PWD/$1)" - fi -} -model=$(get_real_path $1) -if [ $2 == 'resnet18' ] || [ $2 == 'resnet34' ] || [ $2 == 'se-resnet50' ] || [ $2 == 'resnet50' ] || [ $2 == 'resnet152' ] || [ $2 == 'resnet101' ]; then - network=$2 -else - echo "NET_TYPE can choose from [resnet18, se-resnet50]" - exit 1 -fi - -if [ $3 == 'cifar10' ] || [ $3 == 'imagenet' ]; then - dataset=$3 -else - echo "DATASET can choose from [cifar10, imagenet]" - exit 1 -fi - -data_path=$(get_real_path $4) -config_path=$(get_real_path $5) - -device_id=0 -if [ $# == 6 ]; then - device_id=$6 -fi - -echo "mindir name: "$model -echo "dataset path: "$data_path -echo "network: "$network -echo "dataset: "$dataset -echo "device id: "$device_id - -export ASCEND_HOME=/usr/local/Ascend/ -if [ -d ${ASCEND_HOME}/ascend-toolkit ]; then - export PATH=$ASCEND_HOME/fwkacllib/bin:$ASCEND_HOME/fwkacllib/ccec_compiler/bin:$ASCEND_HOME/ascend-toolkit/latest/fwkacllib/ccec_compiler/bin:$ASCEND_HOME/ascend-toolkit/latest/atc/bin:$PATH - export LD_LIBRARY_PATH=$ASCEND_HOME/fwkacllib/lib64:/usr/local/lib:$ASCEND_HOME/ascend-toolkit/latest/atc/lib64:$ASCEND_HOME/ascend-toolkit/latest/fwkacllib/lib64:$ASCEND_HOME/driver/lib64:$ASCEND_HOME/add-ons:$LD_LIBRARY_PATH - export TBE_IMPL_PATH=$ASCEND_HOME/ascend-toolkit/latest/opp/op_impl/built-in/ai_core/tbe - export PYTHONPATH=$ASCEND_HOME/fwkacllib/python/site-packages:${TBE_IMPL_PATH}:$ASCEND_HOME/ascend-toolkit/latest/fwkacllib/python/site-packages:$PYTHONPATH - export ASCEND_OPP_PATH=$ASCEND_HOME/ascend-toolkit/latest/opp -else - export PATH=$ASCEND_HOME/fwkacllib/bin:$ASCEND_HOME/fwkacllib/ccec_compiler/bin:$ASCEND_HOME/atc/ccec_compiler/bin:$ASCEND_HOME/atc/bin:$PATH - export LD_LIBRARY_PATH=$ASCEND_HOME/fwkacllib/lib64:/usr/local/lib:$ASCEND_HOME/atc/lib64:$ASCEND_HOME/acllib/lib64:$ASCEND_HOME/driver/lib64:$ASCEND_HOME/add-ons:$LD_LIBRARY_PATH - export PYTHONPATH=$ASCEND_HOME/fwkacllib/python/site-packages:$ASCEND_HOME/atc/python/site-packages:$PYTHONPATH - export ASCEND_OPP_PATH=$ASCEND_HOME/opp -fi - -function compile_app() 
-{ - cd ../ascend310_infer/src/ || exit - if [ -f "Makefile" ]; then - make clean - fi - bash build.sh &> build.log -} - -function preprocess_data() -{ - if [ -d preprocess_Result ]; then - rm -rf ./preprocess_Result - fi - mkdir preprocess_Result - python ../preprocess.py --data_path=$data_path --output_path=./preprocess_Result --config_path=$config_path &> preprocess.log -} - -function infer() -{ - cd - || exit - if [ -d result_Files ]; then - rm -rf ./result_Files - fi - if [ -d time_Result ]; then - rm -rf ./time_Result - fi - mkdir result_Files - mkdir time_Result - ../ascend310_infer/src/main --mindir_path=$model --dataset_path=$data_path --network=$network --dataset=$dataset --device_id=$device_id &> infer.log -} - -function cal_acc() -{ - if [ "x${dataset}" == "xcifar10" ] || [ "x${dataset}" == "xCifar10" ]; then - python ../postprocess.py --dataset=$dataset --label_path=./preprocess_Result/label --result_path=result_Files --config_path=$config_path &> acc.log - else - python ../create_imagenet2012_label.py --img_path=$data_path - python ../postprocess.py --dataset=$dataset --result_path=./result_Files --label_path=./imagenet_label.json --config_path=$config_path &> acc.log - fi - if [ $? -ne 0 ]; then - echo "calculate accuracy failed" - exit 1 - fi -} - -if [ "x${dataset}" == "xcifar10" ] || [ "x${dataset}" == "xCifar10" ]; then - if [ $2 == 'resnet18' ]; then - CONFIG_PATH=resnet18_cifar10_config.yaml - else - CONFIG_PATH=resnet50_cifar10_config.yaml - fi - preprocess_data ${CONFIG_PATH} - data_path=./preprocess_Result/img_data -fi - -compile_app -if [ $? -ne 0 ]; then - echo "compile app code failed" - exit 1 -fi -infer -if [ $? -ne 0 ]; then - echo " execute inference failed" - exit 1 -fi -cal_acc -if [ $? -ne 0 ]; then - echo "calculate accuracy failed" - exit 1 -fi \ No newline at end of file diff --git a/benchmark/ascend/resnet/scripts/run_standalone_train.sh b/benchmark/ascend/resnet/scripts/run_standalone_train.sh deleted file mode 100644 index de5274f5a7945457293fd98618d2ec031d0d0c1a..0000000000000000000000000000000000000000 --- a/benchmark/ascend/resnet/scripts/run_standalone_train.sh +++ /dev/null @@ -1,110 +0,0 @@ -#!/bin/bash -# Copyright 2020-2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ - -CURPATH="$(dirname "$0")" -# shellcheck source=/dev/null -. 
${CURPATH}/cache_util.sh - -if [ $# != 2 ] && [ $# != 3 ] && [ $# != 4 ] -then - echo "Usage: bash run_standalone_train.sh [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](optional)" - echo "bash run_standalone_train.sh [DATASET_PATH] [CONFIG_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional)" -exit 1 -fi - -get_real_path(){ - if [ "${1:0:1}" == "/" ]; then - echo "$1" - else - echo "$(realpath -m $PWD/$1)" - fi -} - -PATH1=$(get_real_path $1) -CONFIG_FILE=$(get_real_path $2) -if [ $# == 3 ] -then - PATH2=$(get_real_path $3) -fi - -if [ $# == 4 ] -then - RUN_EVAL=$2 - EVAL_DATASET_PATH=$(get_real_path $4) -fi - -if [ ! -d $PATH1 ] -then - echo "error: DATASET_PATH=$PATH1 is not a directory" -exit 1 -fi - -if [ $# == 3 ] && [ ! -f $PATH2 ] -then - echo "error: PRETRAINED_CKPT_PATH=$PATH2 is not a file" -exit 1 -fi - -if [ "x${RUN_EVAL}" == "xTrue" ] && [ ! -d $EVAL_DATASET_PATH ] -then - echo "error: EVAL_DATASET_PATH=$EVAL_DATASET_PATH is not a directory" - exit 1 -fi - -if [ "x${RUN_EVAL}" == "xTrue" ] -then - bootup_cache_server - CACHE_SESSION_ID=$(generate_cache_session) -fi - -ulimit -u unlimited -export DEVICE_NUM=1 -export RANK_ID=0 -export RANK_SIZE=1 - -if [ -d "train" ]; -then - rm -rf ./train -fi -mkdir ./train -cp ../config/*.yaml ./train -cp ../*.py ./train -cp *.sh ./train -cp -r ../src ./train -cd ./train || exit -echo "start training for device $DEVICE_ID" -env > env.log -if [ $# == 2 ] -then - python train.py --data_path=$PATH1 --config_path=$CONFIG_FILE --output_path './output' &> log & -fi - -if [ $# == 3 ] -then - python train.py --data_path=$PATH1 --pre_trained=$PATH2 --config_path=$CONFIG_FILE --output_path './output' &> log & -fi - -if [ $# == 4 ] -then - python train.py --data_path=$PATH1 --run_eval=$RUN_EVAL --eval_dataset_path=$EVAL_DATASET_PATH \ - --enable_cache=True --cache_session_id=$CACHE_SESSION_ID \ - --config_path=$CONFIG_FILE --output_path './output' &> log & - if [ "x${RUN_EVAL}" == "xTrue" ] - then - echo -e "\nWhen training run is done, remember to shut down the cache server via \"cache_admin --stop\"" - fi -fi -cd .. diff --git a/benchmark/ascend/resnet/src/CrossEntropySmooth.py b/benchmark/ascend/resnet/src/CrossEntropySmooth.py index 1634033c2c4a554fb2c729a3da81bcc2153b3fbd..2077b4a4063bf4e66ac71738fe9352a73afe0cb2 100644 --- a/benchmark/ascend/resnet/src/CrossEntropySmooth.py +++ b/benchmark/ascend/resnet/src/CrossEntropySmooth.py @@ -1,4 +1,4 @@ -# Copyright 2020 Huawei Technologies Co., Ltd +# Copyright 2024 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/benchmark/ascend/resnet/src/eval_callback.py b/benchmark/ascend/resnet/src/callback.py similarity index 41% rename from benchmark/ascend/resnet/src/eval_callback.py rename to benchmark/ascend/resnet/src/callback.py index 5e68632ada737bd4c95f00058cd503208eede667..5be6f467481c8720399df21d1f647581a7b7c514 100644 --- a/benchmark/ascend/resnet/src/eval_callback.py +++ b/benchmark/ascend/resnet/src/callback.py @@ -1,4 +1,4 @@ -# Copyright 2021 Huawei Technologies Co., Ltd +# Copyright 2024 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -17,11 +17,85 @@
 import os
 import stat
 import time
-from mindspore import save_checkpoint
-from mindspore import log as logger
+import numpy as np
+import mindspore as ms
 from mindspore.train.callback import Callback
 
 
+class LossCallBack(Callback):
+    """
+    Monitor the loss during training.
+    If the loss is NaN or INF, training is terminated.
+    """
+
+    def __init__(self, epoch_size, logger, lr, per_print_time=1, global_steps=0):
+        super(LossCallBack, self).__init__()
+        self.epoch_size = epoch_size
+        self.logger = logger
+        self.lr = lr
+        self.global_steps = global_steps
+        self.per_print_time = per_print_time
+        self.step_start_time = time.time()
+        self.epoch_start_time = time.time()
+
+    def on_train_step_end(self, run_context):
+        cb_params = run_context.original_args()
+        loss = cb_params.net_outputs
+        data_sink_mode = cb_params.get('dataset_sink_mode', True)
+        if not data_sink_mode:
+            if isinstance(loss, (tuple, list)):
+                if isinstance(loss[0], ms.Tensor) and isinstance(loss[0].asnumpy(), np.ndarray):
+                    loss = loss[0]
+
+            if isinstance(loss, ms.Tensor) and isinstance(loss.asnumpy(), np.ndarray):
+                loss = np.mean(loss.asnumpy())
+
+            cur_step_in_epoch = (cb_params.cur_step_num - 1) % cb_params.batch_num + 1
+            cur_epoch_num = cb_params.cur_epoch_num
+            if isinstance(loss, float) and (np.isnan(loss) or np.isinf(loss)):
+                raise ValueError("epoch: {} step: {}. Invalid loss, terminating training.".format(
+                    cb_params.cur_epoch_num, cur_step_in_epoch))
+
+            if self.per_print_time != 0 and cur_step_in_epoch % self.per_print_time == 0:
+                # pylint: disable=line-too-long
+                per_step_time = 1000 * (time.time() - self.step_start_time) / self.per_print_time
+                log_info = "epoch: [%s/%s] step: [%s/%s], lr: %.6f, loss: %.6f, per step time: %.3f ms" % (
+                    cur_epoch_num, self.epoch_size, cur_step_in_epoch, cb_params.batch_num, self.lr[self.global_steps],
+                    loss, per_step_time)
+                self.logger.info(log_info)
+                self.step_start_time = time.time()
+        self.global_steps += 1
+
+    def on_train_epoch_begin(self, run_context):
+        self.epoch_start_time = time.time()
+        self.step_start_time = time.time()
+
+    def on_train_epoch_end(self, run_context):
+        cb_params = run_context.original_args()
+        loss = cb_params.net_outputs
+        cur_epoch_num = cb_params.cur_epoch_num
+        if isinstance(loss, (tuple, list)):
+            if isinstance(loss[0], ms.Tensor) and isinstance(loss[0].asnumpy(), np.ndarray):
+                loss = loss[0]
+
+        if isinstance(loss, ms.Tensor) and isinstance(loss.asnumpy(), np.ndarray):
+            loss = np.mean(loss.asnumpy())
+
+        epoch_time = time.time() - self.epoch_start_time
+        log_info = 'epoch: [%s/%s] loss: %.6f, epoch time: %.3f s, per step time: %.3f ms' % (
+            cur_epoch_num, self.epoch_size, loss, epoch_time, epoch_time * 1000 / cb_params.batch_num)
+        self.logger.info(log_info)
+
+
+class ResumeCallback(Callback):
+    def __init__(self, start_epoch=0):
+        super(ResumeCallback, self).__init__()
+        self.start_epoch = start_epoch
+
+    def on_train_epoch_begin(self, run_context):
+        run_context.original_args().cur_epoch_num += self.start_epoch
+
+
 class EvalCallBack(Callback):
     """
     Evaluation callback when training.
@@ -42,18 +116,20 @@ class EvalCallBack(Callback): >>> EvalCallBack(eval_function, eval_param_dict) """ - def __init__(self, eval_function, eval_param_dict, interval=1, eval_start_epoch=1, save_best_ckpt=True, - ckpt_directory="./", best_ckpt_name="best.ckpt", metrics_name="acc"): + def __init__(self, eval_function, eval_param_dict, interval=1, eval_start_epoch=1, rank_id=0, save_best_ckpt=True, + ckpt_directory="./", best_ckpt_name="best.ckpt", metrics_name="acc", logger=None): super(EvalCallBack, self).__init__() self.eval_param_dict = eval_param_dict self.eval_function = eval_function self.eval_start_epoch = eval_start_epoch + self.logger = logger if interval < 1: raise ValueError("interval should >= 1.") self.interval = interval self.save_best_ckpt = save_best_ckpt self.best_res = 0 self.best_epoch = 0 + self.rank_id = rank_id if not os.path.isdir(ckpt_directory): os.makedirs(ckpt_directory) self.best_ckpt_path = os.path.join(ckpt_directory, best_ckpt_name) @@ -65,11 +141,11 @@ class EvalCallBack(Callback): os.chmod(file_name, stat.S_IWRITE) os.remove(file_name) except OSError: - logger.warning("OSError, failed to remove the older ckpt file %s.", file_name) + self.logger.warning("OSError, failed to remove the older ckpt file %s.", file_name) except ValueError: - logger.warning("ValueError, failed to remove the older ckpt file %s.", file_name) + self.logger.warning("ValueError, failed to remove the older ckpt file %s.", file_name) - def epoch_end(self, run_context): + def on_train_epoch_end(self, run_context): """Callback when epoch end.""" cb_params = run_context.original_args() cur_epoch = cb_params.cur_epoch_num @@ -77,19 +153,8 @@ class EvalCallBack(Callback): eval_start = time.time() res = self.eval_function(self.eval_param_dict) eval_cost = time.time() - eval_start - print("epoch: {}, {}: {}, eval_cost:{:.2f}".format(cur_epoch, self.metrics_name, res, eval_cost), - flush=True) - if res >= self.best_res: - self.best_res = res - self.best_epoch = cur_epoch - print("update best result: {}".format(res), flush=True) - if self.save_best_ckpt: - if os.path.exists(self.best_ckpt_path): - self.remove_ckpoint_file(self.best_ckpt_path) - save_checkpoint(cb_params.train_network, self.best_ckpt_path) - print("update best checkpoint at: {}".format(self.best_ckpt_path), flush=True) - - def end(self, run_context): - print("End training, the best {0} is: {1}, the best {0} epoch is {2}".format(self.metrics_name, - self.best_res, - self.best_epoch), flush=True) + self.logger.info("epoch: {}, {}: {}, eval_cost:{:.2f}".format(cur_epoch, self.metrics_name, res, eval_cost)) + + def on_train_end(self, run_context): + self.logger.info("End training, the best %s is: %s, the best %s epoch is %s" % ( + self.metrics_name, self.best_res, self.metrics_name, self.best_epoch)) diff --git a/benchmark/ascend/resnet/src/data_split.py b/benchmark/ascend/resnet/src/data_split.py new file mode 100644 index 0000000000000000000000000000000000000000..dcdf649aee44277f149e2852e1a4cc10e652377c --- /dev/null +++ b/benchmark/ascend/resnet/src/data_split.py @@ -0,0 +1,61 @@ +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +""" +cpu_cut_data. +""" +import os +import shutil + + +def generate_data(): + dirs = [] + path = "./" + _ = None + for _, j, _ in os.walk(path): + if len(j).__trunc__() > 0: + dirs.append(j) + + train_folder = os.path.exists("./train") + if not train_folder: + os.makedirs("./train") + test_folder = os.path.exists("./test") + if not test_folder: + os.makedirs("./test") + + for di in dirs[0]: + files = os.listdir(di) + train_set = files[: int(len(files) * 3 / 4)] + test_set = files[int(len(files) * 3 / 4):] + for file in train_set: + fname = "./train/" + di + "/" + folder = os.path.exists(fname) + if not folder: + os.makedirs(fname) + src_file = "./" + di + "/" + file + dst_file = fname + file + shutil.copyfile(src_file, dst_file) + + for file in test_set: + fname = "./test/" + di + "/" + folder = os.path.exists(fname) + if not folder: + os.makedirs(fname) + src_file = "./" + di + "/" + file + dst_file = fname + file + shutil.copyfile(src_file, dst_file) + + +if __name__ == '__main__': + generate_data() diff --git a/benchmark/ascend/resnet/src/dataset.py b/benchmark/ascend/resnet/src/dataset.py index 16f3af464c85ea2aa0d1b87c6e719bdb2466f3e4..8bf586c2e6a081fb5f8682880b572bc0590439f9 100644 --- a/benchmark/ascend/resnet/src/dataset.py +++ b/benchmark/ascend/resnet/src/dataset.py @@ -1,4 +1,4 @@ -# Copyright 2020-2022 Huawei Technologies Co., Ltd +# Copyright 2024 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,74 +16,16 @@ create train or eval dataset. """ import multiprocessing +from io import BytesIO +import numpy as np +from PIL import Image import mindspore as ms import mindspore.dataset as ds from mindspore.communication.management import init, get_rank, get_group_size -def create_dataset1(dataset_path, do_train, batch_size=32, train_image_size=224, eval_image_size=224, - target="Ascend", distribute=False, enable_cache=False, cache_session_id=None): - """ - create a train or evaluate cifar10 dataset for resnet50 - Args: - dataset_path(string): the path of dataset. - do_train(bool): whether dataset is used for train or eval. - repeat_num(int): the repeat times of dataset. Default: 1 - batch_size(int): the batch size of dataset. Default: 32 - target(str): the device target. Default: Ascend - distribute(bool): data for distribute or not. Default: False - enable_cache(bool): whether tensor caching service is used for eval. Default: False - cache_session_id(int): If enable_cache, cache session_id need to be provided. 
Default: None - Returns: - dataset - """ - device_num, rank_id = _get_rank_info(distribute) - ds.config.set_prefetch_size(64) - if device_num == 1: - data_set = ds.Cifar10Dataset(dataset_path, num_parallel_workers=get_num_parallel_workers(12), shuffle=True) - else: - data_set = ds.Cifar10Dataset(dataset_path, num_parallel_workers=get_num_parallel_workers(12), shuffle=True, - num_shards=device_num, shard_id=rank_id) - - # define map operations - trans = [] - if do_train: - trans += [ - ds.vision.RandomCrop((32, 32), (4, 4, 4, 4)), - ds.vision.RandomHorizontalFlip(prob=0.5) - ] - - trans += [ - ds.vision.Resize((train_image_size, train_image_size)), - ds.vision.Rescale(1.0 / 255.0, 0.0), - ds.vision.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010]), - ds.vision.HWC2CHW() - ] - - type_cast_op = ds.transforms.transforms.TypeCast(ms.int32) - - data_set = data_set.map(operations=type_cast_op, input_columns="label", - num_parallel_workers=get_num_parallel_workers(8)) - # only enable cache for eval - if do_train: - enable_cache = False - if enable_cache: - if not cache_session_id: - raise ValueError("A cache session_id must be provided to use cache.") - eval_cache = ds.DatasetCache(session_id=int(cache_session_id), size=0) - data_set = data_set.map(operations=trans, input_columns="image", - num_parallel_workers=get_num_parallel_workers(8), cache=eval_cache) - else: - data_set = data_set.map(operations=trans, input_columns="image", - num_parallel_workers=get_num_parallel_workers(8)) - - # apply batch operations - data_set = data_set.batch(batch_size, drop_remainder=True) - - return data_set - -def create_dataset2(dataset_path, do_train, batch_size=32, train_image_size=224, eval_image_size=224, - target="Ascend", distribute=False, enable_cache=False, cache_session_id=None): +def create_dataset(dataset_path, do_train, batch_size=32, train_image_size=224, eval_image_size=224, + target="Ascend", distribute=False, enable_cache=False, cache_session_id=None, drop_remainder=True): """ create a train or eval imagenet2012 dataset for resnet50 @@ -104,9 +46,9 @@ def create_dataset2(dataset_path, do_train, batch_size=32, train_image_size=224, ds.config.set_prefetch_size(64) if device_num == 1: - data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=get_num_parallel_workers(12), shuffle=True) + data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=get_num_parallel_workers(24), shuffle=True) else: - data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=get_num_parallel_workers(12), shuffle=True, + data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=get_num_parallel_workers(24), shuffle=True, #12 num_shards=device_num, shard_id=rank_id) # Computed from random subset of ImageNet training images @@ -120,6 +62,31 @@ def create_dataset2(dataset_path, do_train, batch_size=32, train_image_size=224, ds.vision.RandomHorizontalFlip(prob=0.5) ] else: + batch_per_step = batch_size * device_num + print("eval batch per step:{}".format(batch_per_step)) + if batch_per_step < 50000: + if 50000 % batch_per_step == 0: + num_padded = 0 + else: + num_padded = batch_per_step - (50000 % batch_per_step) + else: + num_padded = batch_per_step - 50000 + print("eval padded samples:{}".format(num_padded)) + + if num_padded != 0: + white_io = BytesIO() + Image.new('RGB', (224, 224), (255, 255, 255)).save(white_io, 'JPEG') + padded_sample = { + "image": np.array(bytearray(white_io.getvalue()), dtype="uint8"), + "label": np.array(-1, np.int32) + } + sample = [padded_sample 
for x in range(num_padded)] + ds_pad = ds.PaddedDataset(sample) + ds_imagefolder = ds.ImageFolderDataset(dataset_path, num_parallel_workers=24) + data_set = ds_pad + ds_imagefolder + distributeSampler = ds.DistributedSampler(num_shards=device_num, + shard_id=rank_id, shuffle=False, num_samples=None) + data_set.use_sampler(distributeSampler) trans = [ ds.vision.Decode(), ds.vision.Resize(256), @@ -148,214 +115,10 @@ def create_dataset2(dataset_path, do_train, batch_size=32, train_image_size=224, cache=eval_cache) else: data_set = data_set.map(operations=type_cast_op, input_columns="label", - num_parallel_workers=get_num_parallel_workers(12)) - - # apply batch operations - data_set = data_set.batch(batch_size, drop_remainder=True) - - return data_set - -def create_dataset_pynative(dataset_path, do_train, batch_size=32, train_image_size=224, - eval_image_size=224, target="Ascend", distribute=False, enable_cache=False, - cache_session_id=None): - """ - create a train or eval imagenet2012 dataset for resnet50 benchmark - - Args: - dataset_path(string): the path of dataset. - do_train(bool): whether dataset is used for train or eval. - repeat_num(int): the repeat times of dataset. Default: 1 - batch_size(int): the batch size of dataset. Default: 32 - target(str): the device target. Default: Ascend - distribute(bool): data for distribute or not. Default: False - enable_cache(bool): whether tensor caching service is used for eval. Default: False - cache_session_id(int): If enable_cache, cache session_id need to be provided. Default: None - - Returns: - dataset - """ - device_num, rank_id = _get_rank_info(distribute) - - if device_num == 1: - data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=get_num_parallel_workers(8), shuffle=True) - else: - data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=get_num_parallel_workers(2), shuffle=True, - num_shards=device_num, shard_id=rank_id) - - # Computed from random subset of ImageNet training images - mean = [0.485 * 255, 0.456 * 255, 0.406 * 255] - std = [0.229 * 255, 0.224 * 255, 0.225 * 255] - - # define map operations - if do_train: - trans = [ - ds.vision.RandomCropDecodeResize(train_image_size, scale=(0.08, 1.0), ratio=(0.75, 1.333)), - ds.vision.RandomHorizontalFlip(prob=0.5), - ds.vision.Normalize(mean=mean, std=std), - ds.vision.HWC2CHW() - ] - else: - trans = [ - ds.vision.Decode(), - ds.vision.Resize(256), - ds.vision.CenterCrop(eval_image_size), - ds.vision.Normalize(mean=mean, std=std), - ds.vision.HWC2CHW() - ] - - type_cast_op = ds.transforms.transforms.TypeCast(ms.int32) - - data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=4) - # only enable cache for eval - if do_train: - enable_cache = False - if enable_cache: - if not cache_session_id: - raise ValueError("A cache session_id must be provided to use cache.") - eval_cache = ds.DatasetCache(session_id=int(cache_session_id), size=0) - data_set = data_set.map(operations=type_cast_op, input_columns="label", - num_parallel_workers=get_num_parallel_workers(2), - cache=eval_cache) - else: - data_set = data_set.map(operations=type_cast_op, input_columns="label", - num_parallel_workers=get_num_parallel_workers(2)) - - # apply batch operations - data_set = data_set.batch(batch_size, drop_remainder=True) - - return data_set - -def create_dataset3(dataset_path, do_train, batch_size=32, train_image_size=224, eval_image_size=224, - target="Ascend", distribute=False, enable_cache=False, cache_session_id=None): - """ - create a train or 
eval imagenet2012 dataset for resnet101 - Args: - dataset_path(string): the path of dataset. - do_train(bool): whether dataset is used for train or eval. - repeat_num(int): the repeat times of dataset. Default: 1 - batch_size(int): the batch size of dataset. Default: 32 - target(str): the device target. Default: Ascend - distribute(bool): data for distribute or not. Default: False - enable_cache(bool): whether tensor caching service is used for eval. Default: False - cache_session_id(int): If enable_cache, cache session_id need to be provided. Default: None - - Returns: - dataset - """ - device_num, rank_id = _get_rank_info(distribute) - if device_num == 1: - data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=get_num_parallel_workers(8), shuffle=True) - else: - data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=get_num_parallel_workers(8), shuffle=True, - num_shards=device_num, shard_id=rank_id) - - mean = [0.475 * 255, 0.451 * 255, 0.392 * 255] - std = [0.275 * 255, 0.267 * 255, 0.278 * 255] - - # define map operations - if do_train: - trans = [ - ds.vision.RandomCropDecodeResize(train_image_size, scale=(0.08, 1.0), ratio=(0.75, 1.333)), - ds.vision.RandomHorizontalFlip(rank_id / (rank_id + 1)), - ds.vision.Normalize(mean=mean, std=std), - ds.vision.HWC2CHW() - ] - else: - trans = [ - ds.vision.Decode(), - ds.vision.Resize(256), - ds.vision.CenterCrop(eval_image_size), - ds.vision.Normalize(mean=mean, std=std), - ds.vision.HWC2CHW() - ] - - type_cast_op = ds.transforms.transforms.TypeCast(ms.int32) - - data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=get_num_parallel_workers(8)) - # only enable cache for eval - if do_train: - enable_cache = False - if enable_cache: - if not cache_session_id: - raise ValueError("A cache session_id must be provided to use cache.") - eval_cache = ds.DatasetCache(session_id=int(cache_session_id), size=0) - data_set = data_set.map(operations=type_cast_op, input_columns="label", - num_parallel_workers=get_num_parallel_workers(8), - cache=eval_cache) - else: - data_set = data_set.map(operations=type_cast_op, input_columns="label", - num_parallel_workers=get_num_parallel_workers(8)) - - # apply batch operations - data_set = data_set.batch(batch_size, drop_remainder=True) - - return data_set - -def create_dataset4(dataset_path, do_train, batch_size=32, train_image_size=224, eval_image_size=224, - target="Ascend", distribute=False, enable_cache=False, cache_session_id=None): - """ - create a train or eval imagenet2012 dataset for se-resnet50 - - Args: - dataset_path(string): the path of dataset. - do_train(bool): whether dataset is used for train or eval. - repeat_num(int): the repeat times of dataset. Default: 1 - batch_size(int): the batch size of dataset. Default: 32 - target(str): the device target. Default: Ascend - distribute(bool): data for distribute or not. Default: False - enable_cache(bool): whether tensor caching service is used for eval. Default: False - cache_session_id(int): If enable_cache, cache session_id need to be provided. 
Default: None - - Returns: - dataset - """ - device_num, rank_id = _get_rank_info(distribute) - ds.config.set_prefetch_size(64) - if device_num == 1: - data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=get_num_parallel_workers(12), shuffle=True) - else: - data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=get_num_parallel_workers(12), shuffle=True, - num_shards=device_num, shard_id=rank_id) - - # Computed from random subset of ImageNet training images - mean = [123.68, 116.78, 103.94] - std = [1.0, 1.0, 1.0] - - # define map operations - if do_train: - trans = [ - ds.vision.RandomCropDecodeResize(train_image_size, scale=(0.08, 1.0), ratio=(0.75, 1.333)), - ds.vision.RandomHorizontalFlip(prob=0.5), - ds.vision.Normalize(mean=mean, std=std), - ds.vision.HWC2CHW() - ] - else: - trans = [ - ds.vision.Decode(), - ds.vision.Resize(292), - ds.vision.CenterCrop(eval_image_size), - ds.vision.Normalize(mean=mean, std=std), - ds.vision.HWC2CHW() - ] - - type_cast_op = ds.transforms.transforms.TypeCast(ms.int32) - data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=get_num_parallel_workers(12)) - # only enable cache for eval - if do_train: - enable_cache = False - if enable_cache: - if not cache_session_id: - raise ValueError("A cache session_id must be provided to use cache.") - eval_cache = ds.DatasetCache(session_id=int(cache_session_id), size=0) - data_set = data_set.map(operations=type_cast_op, input_columns="label", - num_parallel_workers=get_num_parallel_workers(12), - cache=eval_cache) - else: - data_set = data_set.map(operations=type_cast_op, input_columns="label", - num_parallel_workers=get_num_parallel_workers(12)) + num_parallel_workers=get_num_parallel_workers(12)) #12 # apply batch operations - data_set = data_set.batch(batch_size, drop_remainder=True) + data_set = data_set.batch(batch_size, drop_remainder=drop_remainder) return data_set @@ -372,6 +135,7 @@ def _get_rank_info(distribute): device_num = 1 return device_num, rank_id + def get_num_parallel_workers(num_parallel_workers): """ Get num_parallel_workers used in dataset operations. diff --git a/benchmark/ascend/resnet/src/dataset_infer.py b/benchmark/ascend/resnet/src/dataset_infer.py index aa26e0a75c02c3865ea61f066e429a1612e2105f..9561aedaa1ed790a161009301f47ba0a4721b654 100644 --- a/benchmark/ascend/resnet/src/dataset_infer.py +++ b/benchmark/ascend/resnet/src/dataset_infer.py @@ -1,4 +1,4 @@ -# Copyright 2021-2022 Huawei Technologies Co., Ltd +# Copyright 2024 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
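Note: the eval branch of the consolidated create_dataset above pads the 50000-image validation set so it splits evenly across devices, labels the pad records -1, and shards without shuffling. A minimal sketch of the same padding arithmetic with toy sizes (the real pipeline pads with white 224x224 JPEGs):

import numpy as np
import mindspore.dataset as ds

total, batch_per_step = 10, 4                                  # toy stand-ins for 50000 and batch*devices
num_padded = (batch_per_step - total % batch_per_step) % batch_per_step

real = [{"image": np.zeros(8, np.uint8), "label": np.array(i, np.int32)} for i in range(total)]
pad = [{"image": np.zeros(8, np.uint8), "label": np.array(-1, np.int32)} for _ in range(num_padded)]
data = ds.PaddedDataset(real + pad)                            # real + padded rows, now evenly divisible
data.use_sampler(ds.DistributedSampler(num_shards=2, shard_id=0, shuffle=False))
print(num_padded, data.get_dataset_size())                     # 2 pads; 6 rows on each of the 2 shards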
@@ -89,7 +89,8 @@ class ImgDataset: return len(self.data) -def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32, target="Ascend", distribute=False): +def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32, image_size=224, + target="Ascend", distribute=False): """ create a train or eval imagenet2012 dataset for resnet50 @@ -123,7 +124,6 @@ def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32, target=" num_parallel_workers=8, shuffle=True, num_shards=device_num, shard_id=rank_id) - image_size = 224 # Computed from random subset of ImageNet training images mean = [0.485 * 255, 0.456 * 255, 0.406 * 255] std = [0.229 * 255, 0.224 * 255, 0.225 * 255] @@ -164,7 +164,8 @@ def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32, target=" return data_set -def create_dataset2(dataset_path, do_train, repeat_num=1, batch_size=32, target="Ascend", distribute=False): +def create_dataset2(dataset_path, do_train, repeat_num=1, batch_size=32, image_size=224, + target="Ascend", distribute=False): """ create a train or eval imagenet2012 dataset for resnet101 Args: @@ -196,7 +197,6 @@ def create_dataset2(dataset_path, do_train, repeat_num=1, batch_size=32, target= data_set = ds.GeneratorDataset(source=dataset_generator, column_names=["label", "image", "filename"], num_parallel_workers=8, shuffle=True, num_shards=device_num, shard_id=rank_id) - image_size = 224 mean = [0.475 * 255, 0.451 * 255, 0.392 * 255] std = [0.275 * 255, 0.267 * 255, 0.278 * 255] @@ -233,7 +233,8 @@ def create_dataset2(dataset_path, do_train, repeat_num=1, batch_size=32, target= return data_set -def create_dataset3(dataset_path, do_train, repeat_num=1, batch_size=32, target="Ascend", distribute=False): +def create_dataset3(dataset_path, do_train, repeat_num=1, batch_size=32, image_size=224, + target="Ascend", distribute=False): """ create a train or eval imagenet2012 dataset for se-resnet50 @@ -265,7 +266,6 @@ def create_dataset3(dataset_path, do_train, repeat_num=1, batch_size=32, target= data_set = ds.GeneratorDataset(source=dataset_generator, column_names=["label", "image", "filename"], num_parallel_workers=8, shuffle=True, num_shards=device_num, shard_id=rank_id) - image_size = 224 # Computed from random subset of ImageNet training images mean = [123.68, 116.78, 103.94] std = [1.0, 1.0, 1.0] diff --git a/benchmark/ascend/resnet/src/logger.py b/benchmark/ascend/resnet/src/logger.py new file mode 100644 index 0000000000000000000000000000000000000000..e52a3be6d3aca92e516c7a502e4690aea79b7263 --- /dev/null +++ b/benchmark/ascend/resnet/src/logger.py @@ -0,0 +1,87 @@ +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ======================================================================================= +"""Custom Logger.""" +import os +import sys +import logging + + +class LOGGER(logging.Logger): + """ + Logger. + + Args: + logger_name: String. Logger name. + rank: Integer. Rank id. 
+ """ + def __init__(self, logger_name, rank=0, param_server=False): + super(LOGGER, self).__init__(logger_name) + self.rank = rank + if rank % 8 == 0 or param_server or self.use_server(): + console = logging.StreamHandler(sys.stdout) + console.setLevel(logging.INFO) + formatter = logging.Formatter('%(asctime)s:%(levelname)s:%(message)s') + console.setFormatter(formatter) + self.addHandler(console) + self.log_fn = None + + @staticmethod + def use_server(): + worked = os.getenv('MS_WORKER_NUM', None) + server = os.getenv('MS_SERVER_NUM', None) + if worked is not None and server is not None: + return True + return False + + def setup_logging_file(self, log_dir): + """Setup logging file.""" + if not os.path.exists(log_dir): + os.makedirs(log_dir, exist_ok=True) + log_name = 'log.txt' + self.log_fn = os.path.join(log_dir, log_name) + fh = logging.FileHandler(self.log_fn) + fh.setLevel(logging.INFO) + formatter = logging.Formatter('%(asctime)s:%(levelname)s:%(message)s') + fh.setFormatter(formatter) + self.addHandler(fh) + + def info(self, msg, *args, **kwargs): + if self.isEnabledFor(logging.INFO): + self._log(logging.INFO, msg, args, **kwargs) + + def save_args(self, args): + self.info('Args:') + args_dict = vars(args) + for key in args_dict.keys(): + self.info('--> %s: %s', key, args_dict[key]) + self.info('') + + def important_info(self, msg, *args, **kwargs): + if self.isEnabledFor(logging.INFO) and self.rank == 0: + line_width = 2 + important_msg = '\n' + important_msg += ('*'*70 + '\n')*line_width + important_msg += ('*'*line_width + '\n')*2 + important_msg += '*'*line_width + ' '*8 + msg + '\n' + important_msg += ('*'*line_width + '\n')*2 + important_msg += ('*'*70 + '\n')*line_width + self.info(important_msg, *args, **kwargs) + + +def get_logger(path, rank, param_server=False): + """Get Logger.""" + logger = LOGGER('resnet', rank, param_server=param_server) + logger.setup_logging_file(os.path.join(path, 'rank_' + str(rank))) + return logger diff --git a/benchmark/ascend/resnet/src/lr_generator.py b/benchmark/ascend/resnet/src/lr_generator.py index d28c2acd0fd459bc226d005ffff21f7531bd7e10..fee7f7db4d477c4a724170a421d6efa5ab8d0f42 100644 --- a/benchmark/ascend/resnet/src/lr_generator.py +++ b/benchmark/ascend/resnet/src/lr_generator.py @@ -1,4 +1,4 @@ -# Copyright 2020 Huawei Technologies Co., Ltd +# Copyright 2024 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -17,7 +17,7 @@ import math import numpy as np -def _generate_steps_lr(lr_init, lr_max, total_steps, warmup_steps): +def _generate_steps_lr(lr_init, lr_max, total_steps, warmup_steps, start_steps): """ Applies three steps decay to generate learning rate array. @@ -45,10 +45,10 @@ def _generate_steps_lr(lr_init, lr_max, total_steps, warmup_steps): else: lr = lr_max * 0.001 lr_each_step.append(lr) - return lr_each_step + return lr_each_step[start_steps:] -def _generate_step_lr(lr_init, lr_max, total_steps, warmup_steps): +def _generate_step_lr(lr_max, total_steps, start_steps): """ Applies three steps decay to generate learning rate array. 
@@ -75,10 +75,10 @@ def _generate_step_lr(lr_init, lr_max, total_steps, warmup_steps): else: lr = 0.00005 lr_each_step.append(lr) - return lr_each_step + return lr_each_step[start_steps:] -def _generate_poly_lr(lr_init, lr_end, lr_max, total_steps, warmup_steps): +def _generate_poly_lr(lr_init, lr_end, lr_max, total_steps, warmup_steps, start_steps): """ Applies polynomial decay to generate learning rate array. @@ -102,14 +102,14 @@ def _generate_poly_lr(lr_init, lr_end, lr_max, total_steps, warmup_steps): lr = float(lr_init) + inc_each_step * float(i) else: base = (1.0 - (float(i) - float(warmup_steps)) / (float(total_steps) - float(warmup_steps))) - lr = float(lr_max) * base * base + lr = (float(lr_max) - float(lr_end)) * base * base + float(lr_end) # 773491 if lr < 0.0: lr = 0.0 lr_each_step.append(lr) - return lr_each_step + return lr_each_step[start_steps:] -def _generate_cosine_lr(lr_init, lr_end, lr_max, total_steps, warmup_steps): +def _generate_cosine_lr(lr_init, lr_max, total_steps, warmup_steps, start_steps): """ Applies cosine decay to generate learning rate array. @@ -135,10 +135,10 @@ def _generate_cosine_lr(lr_init, lr_end, lr_max, total_steps, warmup_steps): decayed = linear_decay * cosine_decay + 0.00001 lr = lr_max * decayed lr_each_step.append(lr) - return lr_each_step + return lr_each_step[start_steps:] -def _generate_liner_lr(lr_init, lr_end, lr_max, total_steps, warmup_steps): +def _generate_liner_lr(lr_init, lr_end, lr_max, total_steps, warmup_steps, start_steps): """ Applies liner decay to generate learning rate array. @@ -159,11 +159,10 @@ def _generate_liner_lr(lr_init, lr_end, lr_max, total_steps, warmup_steps): else: lr = lr_max - (lr_max - lr_end) * (i - warmup_steps) / (total_steps - warmup_steps) lr_each_step.append(lr) - return lr_each_step - + return lr_each_step[start_steps:] -def get_lr(lr_init, lr_end, lr_max, warmup_epochs, total_epochs, steps_per_epoch, lr_decay_mode): +def get_lr(lr_init, lr_end, lr_max, warmup_epochs, total_epochs, start_epoch, steps_per_epoch, lr_decay_mode): """ generate learning rate array @@ -179,21 +178,20 @@ def get_lr(lr_init, lr_end, lr_max, warmup_epochs, total_epochs, steps_per_epoch Returns: np.array, learning rate array """ - lr_each_step = [] total_steps = steps_per_epoch * total_epochs warmup_steps = steps_per_epoch * warmup_epochs + start_steps = steps_per_epoch * start_epoch if lr_decay_mode == 'steps': - lr_each_step = _generate_steps_lr(lr_init, lr_max, total_steps, warmup_steps) + lr_each_step = _generate_steps_lr(lr_init, lr_max, total_steps, warmup_steps, start_steps) elif lr_decay_mode == 'step': - warmup_steps = warmup_epochs - lr_each_step = _generate_step_lr(lr_init, lr_max, total_steps, warmup_steps) + lr_each_step = _generate_step_lr(lr_max, total_steps, start_steps) elif lr_decay_mode == 'poly': - lr_each_step = _generate_poly_lr(lr_init, lr_end, lr_max, total_steps, warmup_steps) + lr_each_step = _generate_poly_lr(lr_init, lr_end, lr_max, total_steps, warmup_steps, start_steps) elif lr_decay_mode == 'cosine': - lr_each_step = _generate_cosine_lr(lr_init, lr_end, lr_max, total_steps, warmup_steps) + lr_each_step = _generate_cosine_lr(lr_init, lr_max, total_steps, warmup_steps, start_steps) else: - lr_each_step = _generate_liner_lr(lr_init, lr_end, lr_max, total_steps, warmup_steps) + lr_each_step = _generate_liner_lr(lr_init, lr_end, lr_max, total_steps, warmup_steps, start_steps) lr_each_step = np.array(lr_each_step).astype(np.float32) return lr_each_step diff --git a/benchmark/ascend/resnet/src/metric.py 
b/benchmark/ascend/resnet/src/metric.py
index 75472e1b3054124bc7c0368f2db51dd1d555319d..b56d3ffd8cdc8e4e34023e666603b7a8cd14a3ac 100644
--- a/benchmark/ascend/resnet/src/metric.py
+++ b/benchmark/ascend/resnet/src/metric.py
@@ -1,4 +1,4 @@
-# Copyright 2021 Huawei Technologies Co., Ltd
+# Copyright 2024 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -18,6 +18,7 @@ import mindspore as ms
 from mindspore.communication.management import GlobalComm
 import mindspore.ops as ops
 import mindspore.nn as nn
+import mindspore.train as train
 
 class ClassifyCorrectCell(nn.Cell):
     r"""
@@ -61,7 +62,7 @@ class ClassifyCorrectCell(nn.Cell):
         return (total_correct,)
 
 
-class DistAccuracy(nn.Metric):
+class DistAccuracy(train.Metric):
     r"""
     Calculates the accuracy for classification data in distributed mode.
     The accuracy class creates two local variables, correct number and total number that are used to compute the
@@ -90,11 +91,11 @@ class DistAccuracy(nn.Metric):
         self.clear()
         self.batch_size = batch_size
         self.device_num = device_num
+        self._total_num = 50000  # TODO: derive from the eval dataset instead of hard-coding the ImageNet val set size
 
     def clear(self):
         """Clears the internal evaluation result."""
         self._correct_num = 0
-        self._total_num = 0
 
     def update(self, *inputs):
         """
@@ -113,7 +114,7 @@
             raise ValueError('Distribute accuracy needs 1 input (y_correct), but got {}'.format(len(inputs)))
         y_correct = self._convert_data(inputs[0])
         self._correct_num += y_correct
-        self._total_num += self.batch_size * self.device_num
+        # _total_num stays fixed: padded eval samples are labeled -1 and can never be counted as correct
 
     def eval(self):
         """
@@ -125,7 +126,7 @@
         Raises:
             RuntimeError: If the sample size is 0.
         """
-        if self._total_num == 0:
-            raise RuntimeError('Accuracy can not be calculated, because the number of samples is 0.')
+        print("DistAccuracy total sample count:", self._total_num)
         return self._correct_num / self._total_num
diff --git a/benchmark/ascend/resnet/src/model_utils/config.py b/benchmark/ascend/resnet/src/model_utils/config.py
index 7cabf17cc9fbf5599f5de0ffe1959ca77ad2ffed..5d51ec9b75474043d7d2deb997616f1115154bc1 100644
--- a/benchmark/ascend/resnet/src/model_utils/config.py
+++ b/benchmark/ascend/resnet/src/model_utils/config.py
@@ -1,4 +1,4 @@
-# Copyright 2021 Huawei Technologies Co., Ltd
+# Copyright 2024 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
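Note: the fixed _total_num above only works because of the padded eval set built in dataset.py: an argmax prediction is never negative, so pad rows labeled -1 can never be counted as correct, and dividing by the fixed real-sample count yields exact top-1 accuracy. A pure-numpy illustration with toy sizes:

import numpy as np

labels = np.array([0, 1, 1, -1, -1])           # last two rows are padding
preds = np.array([0, 1, 0, 2, 1])              # argmax predictions are always >= 0
correct = int(np.sum(preds == labels))         # pad rows can never match
total_real = int(np.sum(labels != -1))         # the fixed _total_num plays this role
print(correct / total_real)                    # 0.666..., padding excluded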
@@ -18,7 +18,7 @@ import os import ast import argparse -from pprint import pprint, pformat +from pprint import pformat import yaml _config_path = "./config/resnet50_cifar10_config.yaml" @@ -123,8 +123,8 @@ def get_config(): default, helper, choices = parse_yaml(path_args.config_path) args = parse_cli_to_yaml(parser=parser, cfg=default, helper=helper, choices=choices, cfg_path=path_args.config_path) final_config = merge(args, default) - pprint(final_config) print("Please check the above information for the configurations", flush=True) return Config(final_config) + config = get_config() diff --git a/benchmark/ascend/resnet/src/model_utils/device_adapter.py b/benchmark/ascend/resnet/src/model_utils/device_adapter.py index 9c3d21d5e47c22617170887df9da97beff668495..1515acc01bb48d15e88a46e74555fb553c7c5e09 100644 --- a/benchmark/ascend/resnet/src/model_utils/device_adapter.py +++ b/benchmark/ascend/resnet/src/model_utils/device_adapter.py @@ -1,4 +1,4 @@ -# Copyright 2021 Huawei Technologies Co., Ltd +# Copyright 2024 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/benchmark/ascend/resnet/src/model_utils/local_adapter.py b/benchmark/ascend/resnet/src/model_utils/local_adapter.py index 769fa6dc78e59eb66dbc8e6773accdc1d08b649e..98df5674af518978db1a220a51496cec62c60602 100644 --- a/benchmark/ascend/resnet/src/model_utils/local_adapter.py +++ b/benchmark/ascend/resnet/src/model_utils/local_adapter.py @@ -1,4 +1,4 @@ -# Copyright 2021 Huawei Technologies Co., Ltd +# Copyright 2024 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/benchmark/ascend/resnet/src/model_utils/moxing_adapter.py b/benchmark/ascend/resnet/src/model_utils/moxing_adapter.py index e5d77145e28c14f892ff8249c7b57b0ac23796db..5bc226c978f7007281840ac6d1ed32e5c28225c6 100644 --- a/benchmark/ascend/resnet/src/model_utils/moxing_adapter.py +++ b/benchmark/ascend/resnet/src/model_utils/moxing_adapter.py @@ -1,4 +1,4 @@ -# Copyright 2021 Huawei Technologies Co., Ltd +# Copyright 2024 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
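Note: for reference, get_config layers its sources: the YAML file supplies defaults and command-line flags override them before merge builds the final Config. A simplified stand-alone sketch of that precedence, with a plain dict standing in for the real parse_yaml/parse_cli_to_yaml/merge helpers:

import argparse

defaults = {"batch_size": 32, "epoch_size": 90}            # stand-in for the YAML defaults

parser = argparse.ArgumentParser()
for key, value in defaults.items():
    parser.add_argument("--" + key, type=type(value), default=value)
args = parser.parse_args(["--batch_size", "256"])          # simulated command line

final_config = {**defaults, **vars(args)}                  # CLI values win over YAML
print(final_config)                                        # {'batch_size': 256, 'epoch_size': 90}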
@@ -89,14 +89,14 @@ def moxing_wrapper(pre_process=None, post_process=None): sync_data(config.checkpoint_url, config.load_path) print("Preload downloaded: ", os.listdir(config.load_path)) if config.train_url: - sync_data(config.train_url, config.output_path) - print("Workspace downloaded: ", os.listdir(config.output_path)) + sync_data(config.train_url, config.output_dir) + print("Workspace downloaded: ", os.listdir(config.output_dir)) - ms.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) + ms.set_context(save_graphs_path=os.path.join(config.output_dir, str(get_rank_id()))) config.device_num = get_device_num() config.device_id = get_device_id() - if not os.path.exists(config.output_path): - os.makedirs(config.output_path) + if not os.path.exists(config.output_dir): + os.makedirs(config.output_dir) if pre_process: pre_process() @@ -110,6 +110,6 @@ def moxing_wrapper(pre_process=None, post_process=None): if config.train_url: print("Start to copy output directory") - sync_data(config.output_path, config.train_url) + sync_data(config.output_dir, config.train_url) return wrapped_func return wrapper diff --git a/benchmark/ascend/resnet/src/momentum.py b/benchmark/ascend/resnet/src/momentum.py index 65783bc37ef085a396843c454d4072701089fe52..d63a36fdeeb9407e098b54dfff2c5884c63d2de8 100644 --- a/benchmark/ascend/resnet/src/momentum.py +++ b/benchmark/ascend/resnet/src/momentum.py @@ -1,4 +1,4 @@ -# Copyright 2021 Huawei Technologies Co., Ltd +# Copyright 2024 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -94,8 +94,8 @@ class Momentum(Optimizer): tuple[bool], all elements are True. Raises: - AssertionError: If the momentum is less than 0.0. - AssertionError: If the momentum is not a float or use_nesterov is not a bool. + ValueError: If the momentum is less than 0.0. + TypeError: If the momentum is not a float or use_nesterov is not a bool. 
Supported Platforms: ``GPU`` @@ -121,7 +121,7 @@ class Momentum(Optimizer): """ def __init__(self, params, learning_rate, momentum, weight_decay=0.0, loss_scale=1.0, use_nesterov=False): super(Momentum, self).__init__(learning_rate, params, weight_decay, loss_scale) - assert isinstance(momentum, float) and momentum >= 0.0, "momentum should be equal or bigger than 0" + assert isinstance(momentum, float) and momentum >= 0, "momentum should be equal or bigger than 0" assert isinstance(use_nesterov, bool), "use_nesterov should be bool" self.momentum = Parameter(Tensor(momentum, ms.float32), name="momentum") self.params = self.parameters @@ -131,15 +131,12 @@ class Momentum(Optimizer): self.opt = ops.FusedWeightScaleApplyMomentum() def construct(self, gradients): - ''' - Momentum construct - ''' params = self.params moments = self.moments weight_decay = Tensor(0.0, ms.float32) scale = Tensor(1.0, ms.float32) if self.exec_weight_decay: - weight_decay = self.weight_decay_tensor + weight_decay = self.weight_decay if self.need_scale: scale = self.reciprocal_scale lr = self.get_lr() diff --git a/benchmark/ascend/resnet/src/resnet.py b/benchmark/ascend/resnet/src/resnet.py index 6d14733d48e508a65cbf4256c18012ba7abeb995..c4e6363951379e3024530313dd2d1ee009f949e9 100644 --- a/benchmark/ascend/resnet/src/resnet.py +++ b/benchmark/ascend/resnet/src/resnet.py @@ -1,4 +1,4 @@ -# Copyright 2020-2021 Huawei Technologies Co., Ltd +# Copyright 2024 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -24,9 +24,6 @@ from src.model_utils.config import config def conv_variance_scaling_initializer(in_channel, out_channel, kernel_size): - ''' - Initializer for conv - ''' fan_in = in_channel * kernel_size * kernel_size scale = 1.0 scale /= max(1., fan_in) @@ -111,9 +108,6 @@ def kaiming_uniform(inputs_shape, a=0., mode='fan_in', nonlinearity='leaky_relu' def _conv3x3(in_channel, out_channel, stride=1, use_se=False, res_base=False): - ''' - Create Conv2d with 3x3 kernel - ''' if use_se: weight = conv_variance_scaling_initializer(in_channel, out_channel, kernel_size=3) else: @@ -129,9 +123,6 @@ def _conv3x3(in_channel, out_channel, stride=1, use_se=False, res_base=False): def _conv1x1(in_channel, out_channel, stride=1, use_se=False, res_base=False): - ''' - Create Conv2d with 1x1 kernel - ''' if use_se: weight = conv_variance_scaling_initializer(in_channel, out_channel, kernel_size=1) else: @@ -147,9 +138,6 @@ def _conv1x1(in_channel, out_channel, stride=1, use_se=False, res_base=False): def _conv7x7(in_channel, out_channel, stride=1, use_se=False, res_base=False): - ''' - Create Conv2d with 7x7 kernel - ''' if use_se: weight = conv_variance_scaling_initializer(in_channel, out_channel, kernel_size=7) else: diff --git a/benchmark/ascend/resnet/src/resnet_gpu_benchmark.py b/benchmark/ascend/resnet/src/resnet_gpu_benchmark.py index 67ec6ffa67598dee896fbf1d59a204d97ab85f6f..869780617f873983a059a8044039d991e8115bea 100644 --- a/benchmark/ascend/resnet/src/resnet_gpu_benchmark.py +++ b/benchmark/ascend/resnet/src/resnet_gpu_benchmark.py @@ -1,4 +1,4 @@ -# Copyright 2020 Huawei Technologies Co., Ltd +# Copyright 2024 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
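Note: the construct() above folds weight decay and loss-scale correction into the gradient before handing it to the fused kernel. A plain-numpy reading of the apply-momentum semantics (a sketch of the update rule under those assumptions, not the fused op's exact implementation):

import numpy as np

def momentum_step(w, v, g, lr, momentum, weight_decay=0.0, scale=1.0, use_nesterov=False):
    g = g * scale + weight_decay * w        # decay and loss-scale folded into the gradient
    v[:] = momentum * v + g                 # momentum accumulator, updated in place
    if use_nesterov:
        w -= lr * (g + momentum * v)
    else:
        w -= lr * v
    return w

w, v = np.ones(3), np.zeros(3)
print(momentum_step(w, v, np.full(3, 0.5), lr=0.1, momentum=0.9))   # -> [0.95 0.95 0.95]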
diff --git a/benchmark/ascend/resnet/src/util.py b/benchmark/ascend/resnet/src/util.py new file mode 100644 index 0000000000000000000000000000000000000000..c072cf59d7dee68060bc0801f6f7ee5fd720588f --- /dev/null +++ b/benchmark/ascend/resnet/src/util.py @@ -0,0 +1,144 @@ +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +import os +import numpy as np +import mindspore as ms +import mindspore.nn as nn +from mindspore.communication.management import GlobalComm +import mindspore.ops as ops +from src.callback import EvalCallBack +from src.resnet import conv_variance_scaling_initializer + + +def filter_checkpoint_parameter_by_list(origin_dict, param_filter, cfg): + """remove useless parameters according to filter_list""" + for key in list(origin_dict.keys()): + for name in param_filter: + if name in key: + cfg.logger.info("Delete parameter from checkpoint: %s", key) + del origin_dict[key] + break + + +def apply_eval(eval_param): + eval_model = eval_param["model"] + eval_ds = eval_param["dataset"] + metrics_name = eval_param["metrics_name"] + res = eval_model.eval(eval_ds, dataset_sink_mode=True) + return res[metrics_name] + + +def init_group_params(net, cfg): + decayed_params = [] + no_decayed_params = [] + for param in net.trainable_params(): + if 'beta' not in param.name and 'gamma' not in param.name and 'bias' not in param.name: + decayed_params.append(param) + else: + no_decayed_params.append(param) + + group_params = [{'params': decayed_params, 'weight_decay': cfg.weight_decay}, + {'params': no_decayed_params}, + {'order_params': net.trainable_params()}] + return group_params + + +def eval_callback(model, cfg, eval_dataset): + eval_param_dict = {"model": model, "dataset": eval_dataset, "metrics_name": "acc"} + eval_cb = EvalCallBack(apply_eval, eval_param_dict, interval=cfg.eval_interval, + eval_start_epoch=cfg.eval_start_epoch, rank_id=cfg.rank_id, + save_best_ckpt=cfg.save_best_ckpt, ckpt_directory=cfg.save_ckpt_dir, + best_ckpt_name="best_acc.ckpt", metrics_name="acc", logger=cfg.logger) + return eval_cb + + +def set_output_dir(cfg): + """set save ckpt dir""" + cfg.output_dir = os.path.realpath(os.path.join(cfg.output_dir, cfg.net_name, cfg.dataset)) + cfg.save_ckpt_dir = os.path.join(cfg.output_dir, 'ckpt') + cfg.log_dir = os.path.join(cfg.output_dir, 'log') + return cfg + + +def set_golden_output_dir(cfg): + """set save ckpt dir""" + cfg.output_dir = os.path.realpath(os.path.join(cfg.output_dir, cfg.net_name, cfg.dataset, cfg.comp_algo)) + cfg.save_ckpt_dir = os.path.join(cfg.output_dir, 'ckpt') + cfg.log_dir = os.path.join(cfg.output_dir, 'log') + return cfg + + +def init_weight(net, cfg): + """init_weight""" + + if cfg.pre_trained: + if not os.path.isfile(cfg.pre_trained): + cfg.logger.warning("There is not ckpt file: %s", cfg.pre_trained) + else: + param_dict = ms.load_checkpoint(cfg.pre_trained) + if cfg.filter_weight: + filter_list = [x.name for x in 
net.end_point.get_parameters()] + filter_checkpoint_parameter_by_list(param_dict, filter_list) + ms.load_param_into_net(net, param_dict) + cfg.logger.info("Pre trained ckpt mode: %s loading", cfg.pre_trained) + else: + for _, cell in net.cells_and_names(): + if isinstance(cell, nn.Conv2d): + if cfg.conv_init == "XavierUniform": + cell.weight.set_data(ms.common.initializer.initializer(ms.common.initializer.XavierUniform(), + cell.weight.shape, + cell.weight.dtype)) + elif cfg.conv_init == "TruncatedNormal": + weight = conv_variance_scaling_initializer(cell.in_channels, + cell.out_channels, + cell.kernel_size[0]) + cell.weight.set_data(weight) + if isinstance(cell, nn.Dense): + if cfg.dense_init == "TruncatedNormal": + cell.weight.set_data(ms.common.initializer.initializer(ms.common.initializer.TruncatedNormal(), + cell.weight.shape, + cell.weight.dtype)) + elif cfg.dense_init == "RandomNormal": + in_channel = cell.in_channels + out_channel = cell.out_channels + weight = np.random.normal(loc=0, scale=0.01, size=out_channel * in_channel) + weight = ms.Tensor(np.reshape(weight, (out_channel, in_channel)), dtype=cell.weight.dtype) + cell.weight.set_data(weight) + + +class AllreduceSync(nn.Cell): + def __init__(self,): + super(AllreduceSync, self).__init__() + self.allreduce = ops.AllReduce(ops.ReduceOp.SUM, GlobalComm.WORLD_COMM_GROUP) + + def construct(self, x): + y = self.allreduce(x) + return y + + +def reset_weight(model, orign_params): + train_parameters = ms.ParameterTuple(model._train_network.get_parameters()) + for idx, params in enumerate(train_parameters): + if "global_step" in params.name: + print("before global_step is", params.name, params.asnumpy(), flush=True) + params.set_data(orign_params[idx]) + if "global_step" in params.name: + print("after global_step is", params.name, params.asnumpy(), flush=True) + + +def pre_build(model, train_dataset, val_dataset, sink_size, epoch): + model.build(train_dataset, val_dataset, sink_size=sink_size, epoch=epoch) + asyn = AllreduceSync() + asyn(ms.Tensor(np.ones(32).astype(np.float32))) diff --git a/benchmark/ascend/resnet/train.py b/benchmark/ascend/resnet/train.py index bfe937f699ff0a862a6a18c1c9b0d4067cec70fa..6791a1bec473198bd3469e42a9009b2f17c698d4 100644 --- a/benchmark/ascend/resnet/train.py +++ b/benchmark/ascend/resnet/train.py @@ -1,4 +1,4 @@ -# Copyright 2020-2021 Huawei Technologies Co., Ltd +# Copyright 2024 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,117 +13,40 @@ # limitations under the License. 
# ============================================================================ """train resnet.""" -import datetime -import glob import os -import numpy as np - import mindspore as ms -from mindspore import Tensor -from mindspore.nn.optim import Momentum, thor, LARS -from mindspore.train.model import Model -from mindspore.context import ParallelMode +import mindspore.nn as nn +import mindspore.log as logger from mindspore.train.train_thor import ConvertModelUtils -from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor -from mindspore.nn.loss import SoftmaxCrossEntropyWithLogits -from mindspore.train.loss_scale_manager import FixedLossScaleManager +from mindspore.train.callback import ModelCheckpoint, CheckpointConfig from mindspore.communication.management import init, get_rank -from mindspore.common import set_seed from mindspore.parallel import set_algo_parameters -import mindspore.nn as nn -import mindspore.log as logger +from src.logger import get_logger from src.lr_generator import get_lr, warmup_cosine_annealing_lr from src.CrossEntropySmooth import CrossEntropySmooth -from src.eval_callback import EvalCallBack +from src.callback import LossCallBack, ResumeCallback +from src.util import eval_callback, init_weight, init_group_params, set_output_dir, pre_build from src.metric import DistAccuracy, ClassifyCorrectCell from src.model_utils.config import config from src.model_utils.moxing_adapter import moxing_wrapper -from src.model_utils.device_adapter import get_rank_id, get_device_num -from src.resnet import conv_variance_scaling_initializer - - -set_seed(1) - - -class LossCallBack(LossMonitor): - """ - Monitor the loss in training. - If the loss in NAN or INF terminating training. - """ - - def __init__(self, has_trained_epoch=0): - super(LossCallBack, self).__init__() - self.has_trained_epoch = has_trained_epoch - - def step_end(self, run_context): - cb_params = run_context.original_args() - loss = cb_params.net_outputs - - if isinstance(loss, (tuple, list)): - if isinstance(loss[0], Tensor) and isinstance(loss[0].asnumpy(), np.ndarray): - loss = loss[0] - - if isinstance(loss, Tensor) and isinstance(loss.asnumpy(), np.ndarray): - loss = np.mean(loss.asnumpy()) - - cur_step_in_epoch = (cb_params.cur_step_num - 1) % cb_params.batch_num + 1 - - if isinstance(loss, float) and (np.isnan(loss) or np.isinf(loss)): - raise ValueError("epoch: {} step: {}. 
Invalid loss, terminating training.".format( - cb_params.cur_epoch_num, cur_step_in_epoch)) - if self._per_print_times != 0 and cb_params.cur_step_num % self._per_print_times == 0: - print("epoch: %s step: %s, loss is %s" % (cb_params.cur_epoch_num + int(self.has_trained_epoch), - cur_step_in_epoch, loss), flush=True) - - -if config.net_name in ("resnet18", "resnet34", "resnet50", "resnet152"): - if config.net_name == "resnet18": - from src.resnet import resnet18 as resnet - elif config.net_name == "resnet34": - from src.resnet import resnet34 as resnet - elif config.net_name == "resnet50": - from src.resnet import resnet50 as resnet - else: - from src.resnet import resnet152 as resnet - if config.dataset == "cifar10": - from src.dataset import create_dataset1 as create_dataset - else: - if config.mode_name == "GRAPH": - from src.dataset import create_dataset2 as create_dataset - else: - from src.dataset import create_dataset_pynative as create_dataset -elif config.net_name == "resnet101": - from src.resnet import resnet101 as resnet - from src.dataset import create_dataset3 as create_dataset +from src.model_utils.device_adapter import get_device_num + +from src.dataset import create_dataset +if config.net_name == "resnet18": + from src.resnet import resnet18 as resnet +elif config.net_name == "resnet34": + from src.resnet import resnet34 as resnet +elif config.net_name == "resnet50": + from src.resnet import resnet50 as resnet else: - from src.resnet import se_resnet50 as resnet - from src.dataset import create_dataset4 as create_dataset - - -def filter_checkpoint_parameter_by_list(origin_dict, param_filter): - """remove useless parameters according to filter_list""" - for key in list(origin_dict.keys()): - for name in param_filter: - if name in key: - print("Delete parameter from checkpoint: ", key) - del origin_dict[key] - break - - -def apply_eval(eval_param): - eval_model = eval_param["model"] - eval_ds = eval_param["dataset"] - metrics_name = eval_param["metrics_name"] - res = eval_model.eval(eval_ds) - return res[metrics_name] - + from src.resnet import resnet152 as resnet def set_graph_kernel_context(run_platform, net_name): if run_platform == "GPU" and net_name == "resnet101": ms.set_context(enable_graph_kernel=True) ms.set_context(graph_kernel_flags="--enable_parallel_fusion --enable_expand_ops=Conv2D") - +ms.set_seed(1) def set_parameter(): """set_parameter""" @@ -134,7 +57,7 @@ def set_parameter(): # init context if config.mode_name == 'GRAPH': if target == "Ascend": - rank_save_graphs_path = os.path.join(config.save_graphs_path, "soma", str(os.getenv('DEVICE_ID'))) + rank_save_graphs_path = os.path.join(config.save_graphs_path, "soma", str(os.getenv('DEVICE_ID', '0'))) ms.set_context(mode=ms.GRAPH_MODE, device_target=target, save_graphs=config.save_graphs, save_graphs_path=rank_save_graphs_path) else: @@ -142,15 +65,15 @@ def set_parameter(): set_graph_kernel_context(target, config.net_name) else: ms.set_context(mode=ms.PYNATIVE_MODE, device_target=target, save_graphs=False) - + set_ascend_max_device_memory() if config.parameter_server: ms.set_ps_context(enable_ps=True) if config.run_distribute: if target == "Ascend": - device_id = int(os.getenv('DEVICE_ID')) + device_id = int(os.getenv('DEVICE_ID', '0')) ms.set_context(device_id=device_id) - ms.set_auto_parallel_context(device_num=config.device_num, parallel_mode=ParallelMode.DATA_PARALLEL, - gradients_mean=True) + ms.set_auto_parallel_context(device_num=config.device_num, parallel_mode=ms.ParallelMode.DATA_PARALLEL, + 
gradients_mean=True, parameter_broadcast=False) set_algo_parameters(elementwise_op_strategy_follow=True) if config.net_name == "resnet50" or config.net_name == "se-resnet50": if config.boost_mode not in ["O1", "O2"]: @@ -162,92 +85,28 @@ def set_parameter(): else: init() ms.set_auto_parallel_context(device_num=get_device_num(), - parallel_mode=ParallelMode.DATA_PARALLEL, + parallel_mode=ms.ParallelMode.DATA_PARALLEL, gradients_mean=True) if config.net_name == "resnet50": ms.set_auto_parallel_context(all_reduce_fusion_config=config.all_reduce_fusion_config) - - -def load_pre_trained_checkpoint(): - """ - Load checkpoint according to pre_trained path. - """ - param_dict = None - if config.pre_trained: - if os.path.isdir(config.pre_trained): - ckpt_save_dir = os.path.join(config.output_path, config.checkpoint_path, "ckpt_0") - ckpt_pattern = os.path.join(ckpt_save_dir, "*.ckpt") - ckpt_files = glob.glob(ckpt_pattern) - if not ckpt_files: - logger.warning(f"There is no ckpt file in {ckpt_save_dir}, " - f"pre_trained is unsupported.") - else: - ckpt_files.sort(key=os.path.getmtime, reverse=True) - time_stamp = datetime.datetime.now() - print(f"time stamp {time_stamp.strftime('%Y.%m.%d-%H:%M:%S')}" - f" pre trained ckpt model {ckpt_files[0]} loading", - flush=True) - param_dict = ms.load_checkpoint(ckpt_files[0]) - elif os.path.isfile(config.pre_trained): - param_dict = ms.load_checkpoint(config.pre_trained) - else: - print(f"Invalid pre_trained {config.pre_trained} parameter.") - return param_dict - - -def init_weight(net, param_dict): - """init_weight""" - if config.pre_trained: - if param_dict: - if param_dict.get("epoch_num") and param_dict.get("step_num"): - config.has_trained_epoch = int(param_dict["epoch_num"].data.asnumpy()) - config.has_trained_step = int(param_dict["step_num"].data.asnumpy()) - else: - config.has_trained_epoch = 0 - config.has_trained_step = 0 - - if config.filter_weight: - filter_list = [x.name for x in net.end_point.get_parameters()] - filter_checkpoint_parameter_by_list(param_dict, filter_list) - ms.load_param_into_net(net, param_dict) - else: - for _, cell in net.cells_and_names(): - if isinstance(cell, nn.Conv2d): - if config.conv_init == "XavierUniform": - cell.weight.set_data(ms.common.initializer.initializer(ms.common.initializer.XavierUniform(), - cell.weight.shape, - cell.weight.dtype)) - elif config.conv_init == "TruncatedNormal": - weight = conv_variance_scaling_initializer(cell.in_channels, - cell.out_channels, - cell.kernel_size[0]) - cell.weight.set_data(weight) - if isinstance(cell, nn.Dense): - if config.dense_init == "TruncatedNormal": - cell.weight.set_data(ms.common.initializer.initializer(ms.common.initializer.TruncatedNormal(), - cell.weight.shape, - cell.weight.dtype)) - elif config.dense_init == "RandomNormal": - in_channel = cell.in_channels - out_channel = cell.out_channels - weight = np.random.normal(loc=0, scale=0.01, size=out_channel * in_channel) - weight = Tensor(np.reshape(weight, (out_channel, in_channel)), dtype=cell.weight.dtype) - cell.weight.set_data(weight) + config.rank_id = get_rank() if config.run_distribute else 0 def init_lr(step_size): """init lr""" if config.optimizer == "Thor": from src.lr_generator import get_thor_lr - lr = get_thor_lr(0, config.lr_init, config.lr_decay, config.lr_end_epoch, step_size, decay_epochs=39) + lr = get_thor_lr(config.start_epoch * step_size, config.lr_init, config.lr_decay, config.lr_end_epoch, + step_size, decay_epochs=39) else: if config.net_name in ("resnet18", "resnet34", "resnet50", 
"resnet152", "se-resnet50"): + config.lr_max = config.lr_max #/ 8 * config.device_num lr = get_lr(lr_init=config.lr_init, lr_end=config.lr_end, lr_max=config.lr_max, - warmup_epochs=config.warmup_epochs, total_epochs=config.epoch_size, steps_per_epoch=step_size, - lr_decay_mode=config.lr_decay_mode) + warmup_epochs=config.warmup_epochs, total_epochs=config.epoch_size, + start_epoch=config.start_epoch, steps_per_epoch=step_size, lr_decay_mode=config.lr_decay_mode) else: lr = warmup_cosine_annealing_lr(config.lr, step_size, config.warmup_epochs, config.epoch_size, - config.pretrain_epoch_size * step_size) + config.start_epoch * step_size) return lr @@ -258,52 +117,15 @@ def init_loss_scale(): loss = CrossEntropySmooth(sparse=True, reduction="mean", smooth_factor=config.label_smooth_factor, num_classes=config.class_num) else: - loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean') + loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean') return loss -def init_group_params(net): - decayed_params = [] - no_decayed_params = [] - for param in net.trainable_params(): - if 'beta' not in param.name and 'gamma' not in param.name and 'bias' not in param.name: - decayed_params.append(param) - else: - no_decayed_params.append(param) - - group_params = [{'params': decayed_params, 'weight_decay': config.weight_decay}, - {'params': no_decayed_params}, - {'order_params': net.trainable_params()}] - return group_params - - -def run_eval(target, model, ckpt_save_dir, cb): - """run_eval""" - if config.run_eval: - if config.eval_dataset_path is None or (not os.path.isdir(config.eval_dataset_path)): - raise ValueError("{} is not a existing path.".format(config.eval_dataset_path)) - eval_dataset = create_dataset(dataset_path=config.eval_dataset_path, do_train=False, - batch_size=config.batch_size, train_image_size=config.train_image_size, - eval_image_size=config.eval_image_size, - target=target, enable_cache=config.enable_cache, - cache_session_id=config.cache_session_id) - eval_param_dict = {"model": model, "dataset": eval_dataset, "metrics_name": "acc"} - eval_cb = EvalCallBack(apply_eval, eval_param_dict, interval=config.eval_interval, - eval_start_epoch=config.eval_start_epoch, save_best_ckpt=config.save_best_ckpt, - ckpt_directory=ckpt_save_dir, best_ckpt_name="best_acc.ckpt", - metrics_name="acc") - cb += [eval_cb] - - -def set_save_ckpt_dir(): - """set save ckpt dir""" - ckpt_save_dir = os.path.join(config.output_path, config.checkpoint_path) - if config.enable_modelarts and config.run_distribute: - ckpt_save_dir = ckpt_save_dir + "ckpt_" + str(get_rank_id()) + "/" - else: - if config.run_distribute: - ckpt_save_dir = ckpt_save_dir + "ckpt_" + str(get_rank()) + "/" - return ckpt_save_dir +def set_ascend_max_device_memory(): + if ms.get_context("enable_ge") and ms.get_context("mode") == ms.GRAPH_MODE and \ + hasattr(config, "max_device_memory"): + logger.warning("When encountering a memory shortage situation in 1980B, reduce the max_device_memory.") + ms.set_context(max_device_memory=config.max_device_memory) @moxing_wrapper() @@ -311,7 +133,8 @@ def train_net(): """train net""" target = config.device_target set_parameter() - ckpt_param_dict = load_pre_trained_checkpoint() + set_output_dir(config) + config.logger = get_logger(config.log_dir, config.rank_id, config.parameter_server) dataset = create_dataset(dataset_path=config.data_path, do_train=True, batch_size=config.batch_size, train_image_size=config.train_image_size, eval_image_size=config.eval_image_size, target=target, @@ 
-320,65 +143,94 @@ def train_net(): net = resnet(class_num=config.class_num) if config.parameter_server: net.set_param_ps() + init_weight(net, config) + + if config.resume_ckpt: + resume_param = ms.load_checkpoint(config.resume_ckpt, + choice_func=lambda x: not x.startswith(('learning_rate', 'global_step'))) + config.start_epoch = int(resume_param.get('epoch_num', ms.Tensor(0, ms.int32)).asnumpy().item()) - init_weight(net=net, param_dict=ckpt_param_dict) - lr = Tensor(init_lr(step_size=step_size)) + lr = ms.Tensor(init_lr(step_size=step_size)) # define opt - group_params = init_group_params(net) - opt = Momentum(group_params, lr, config.momentum, loss_scale=config.loss_scale) + group_params = init_group_params(net, config) + opt = nn.Momentum(group_params, lr, config.momentum, loss_scale=config.loss_scale) if config.optimizer == "LARS": - opt = LARS(opt, epsilon=config.lars_epsilon, coefficient=config.lars_coefficient, - lars_filter=lambda x: 'beta' not in x.name and 'gamma' not in x.name and 'bias' not in x.name) + opt = nn.LARS(opt, epsilon=config.lars_epsilon, coefficient=config.lars_coefficient, + lars_filter=lambda x: 'beta' not in x.name and 'gamma' not in x.name and 'bias' not in x.name) loss = init_loss_scale() - loss_scale = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False) + loss_scale = ms.FixedLossScaleManager(config.loss_scale, drop_overflow_update=False) dist_eval_network = ClassifyCorrectCell(net) if config.run_distribute else None metrics = {"acc"} if config.run_distribute: metrics = {'acc': DistAccuracy(batch_size=config.batch_size, device_num=config.device_num)} if (config.net_name not in ("resnet18", "resnet34", "resnet50", "resnet101", "resnet152", "se-resnet50")) or \ - config.parameter_server or target == "CPU": - ## fp32 training - model = Model(net, loss_fn=loss, optimizer=opt, metrics=metrics, eval_network=dist_eval_network) + config.parameter_server or target == "CPU": + # fp32 training + model = ms.Model(net, loss_fn=loss, optimizer=opt, metrics=metrics, eval_network=dist_eval_network) else: - model = Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics=metrics, - amp_level="O2", boost_level=config.boost_mode, keep_batchnorm_fp32=False, - eval_network=dist_eval_network) + model = ms.Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics=metrics, + amp_level="O3", boost_level=config.boost_mode, + eval_network=dist_eval_network, + boost_config_dict={"boost": {"mode": "manual", "less_bn": True, "grad_freeze": False, + "adasum": False, "grad_accumulation": False, + "dim_reduce": False}}) if config.optimizer == "Thor" and config.dataset == "imagenet2012": from src.lr_generator import get_thor_damping - damping = get_thor_damping(0, config.damping_init, config.damping_decay, 70, step_size) + damping = get_thor_damping(step_size * config.start_epoch, config.damping_init, config.damping_decay, 70, + step_size) split_indices = [26, 53] - opt = thor(net, lr, Tensor(damping), config.momentum, config.weight_decay, config.loss_scale, - config.batch_size, split_indices=split_indices, frequency=config.frequency) + opt = nn.thor(net, lr, ms.Tensor(damping), config.momentum, config.weight_decay, config.loss_scale, + config.batch_size, split_indices=split_indices, frequency=config.frequency) model = ConvertModelUtils().convert_to_thor_model(model=model, network=net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics={'acc'}, - amp_level="O2", keep_batchnorm_fp32=False) + amp_level="O3") 
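For context on the `get_thor_damping` call above: the schedule now starts at `step_size * config.start_epoch` so that a resumed run continues the damping decay instead of restarting it. Below is a minimal sketch of such a schedule; the exponential-decay form and the numeric values in the usage comment are assumptions for illustration, not the exact formula in `src.lr_generator`:

```python
import numpy as np

def thor_damping_sketch(global_step, damping_init, decay_rate, total_epochs, steps_per_epoch):
    """Assumed exponentially decaying damping schedule for the THOR optimizer."""
    total_steps = total_epochs * steps_per_epoch
    steps = np.arange(total_steps)
    # Damping regularizes THOR's second-order updates; decay it once per epoch.
    damping = damping_init * decay_rate ** (steps / steps_per_epoch)
    # Resume-aware: drop the portion of the schedule already consumed.
    return damping[global_step:].astype(np.float32)

# e.g. thor_damping_sketch(step_size * start_epoch, 0.07, 0.87, 70, step_size)
```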
config.run_eval = False - logger.warning("Thor optimizer not support evaluation while training.") + config.logger.warning("Thor optimizer does not support evaluation while training.") + + # load resume param + if config.resume_ckpt: + ms.load_param_into_net(net, resume_param) + ms.load_param_into_net(opt, resume_param) + config.logger.info('resume training from epoch: %s', config.start_epoch) # define callbacks - time_cb = TimeMonitor(data_size=step_size) - loss_cb = LossCallBack(config.has_trained_epoch) - cb = [time_cb, loss_cb] - ckpt_save_dir = set_save_ckpt_dir() - if config.save_checkpoint: - ckpt_append_info = [{"epoch_num": config.has_trained_epoch, "step_num": config.has_trained_step}] + loss_cb = LossCallBack(config.epoch_size, config.logger, lr, per_print_time=1) + resume_cb = ResumeCallback(config.start_epoch) + cb = [loss_cb, resume_cb] + if config.save_checkpoint and config.rank_id == 0: + ckpt_append_info = [{"epoch_num": 0, "step_num": 0}] config_ck = CheckpointConfig(save_checkpoint_steps=config.save_checkpoint_epochs * step_size, keep_checkpoint_max=config.keep_checkpoint_max, append_info=ckpt_append_info) - ckpt_cb = ModelCheckpoint(prefix="resnet", directory=ckpt_save_dir, config=config_ck) + ckpt_cb = ModelCheckpoint(prefix=config.net_name, directory=config.save_ckpt_dir, config=config_ck) cb += [ckpt_cb] - run_eval(target, model, ckpt_save_dir, cb) + + eval_dataset = None + if config.run_eval: + eval_dataset = create_dataset(dataset_path=config.eval_dataset_path, do_train=False, + batch_size=config.eval_batch_size, train_image_size=config.train_image_size, + eval_image_size=config.eval_image_size, + target=target, enable_cache=False, + cache_session_id=config.cache_session_id, + distribute=config.run_distribute, drop_remainder=False) + eval_cb = eval_callback(model, config, eval_dataset) + cb.append(eval_cb) + # train model if config.net_name == "se-resnet50": config.epoch_size = config.train_epoch_size dataset_sink_mode = (not config.parameter_server) and target != "CPU" - config.pretrain_epoch_size = config.has_trained_epoch - model.train(config.epoch_size - config.pretrain_epoch_size, dataset, callbacks=cb, - sink_size=dataset.get_dataset_size(), dataset_sink_mode=dataset_sink_mode) + config.logger.save_args(config) + sink_size = dataset.get_dataset_size() + new_repeat_count = config.epoch_size * dataset.get_dataset_size() // sink_size + pre_build(model, dataset, eval_dataset, sink_size=sink_size, epoch=new_repeat_count) + config.logger.info("Build end, start training!") + + model.train(new_repeat_count, dataset, callbacks=cb, + sink_size=sink_size, dataset_sink_mode=dataset_sink_mode) - if config.run_eval and config.enable_cache: - print("Remember to shut down the cache server via \"cache_admin --stop\"") + config.logger.info("If run_eval and enable_cache are set, remember to shut down the cache server via \"cache_admin --stop\"") if __name__ == '__main__': diff --git a/official/audio/DeepSpeech2/eval.py b/official/audio/DeepSpeech2/eval.py index 166f0379aec9f9b841e49de6f138399f0295e6cb..4f6cbd0524f055fbf8ce8859b06a2077ff8a8584 100644 --- a/official/audio/DeepSpeech2/eval.py +++ b/official/audio/DeepSpeech2/eval.py @@ -24,7 +24,7 @@ from src.config import eval_config from src.deepspeech2 import DeepSpeechModel, PredictWithSoftmax from src.dataset import create_dataset from src.greedydecoder import MSGreedyDecoder -from mindspore import context +import mindspore from mindspore.train.serialization import load_checkpoint, load_param_into_net parser = 
argparse.ArgumentParser(description='DeepSpeech evaluation') @@ -36,7 +36,7 @@ parser.add_argument('--device_target', type=str, default="GPU", choices=("GPU", args = parser.parse_args() if __name__ == '__main__': - context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, save_graphs=False) + mindspore.set_context(mode=0, device_target=args.device_target, save_graphs=False) config = eval_config with open(config.DataConfig.labels_path) as label_file: labels = json.load(label_file) diff --git a/official/audio/DeepSpeech2/export.py b/official/audio/DeepSpeech2/export.py index 1b914ba1e43e1f6aca43b8baa3e3d008d3af9891..b9b704f1cbf5a6ffcdce3f7b058fe934a947a271 100644 --- a/official/audio/DeepSpeech2/export.py +++ b/official/audio/DeepSpeech2/export.py @@ -18,7 +18,8 @@ export checkpoint file to mindir model import json import argparse import numpy as np -from mindspore import context, Tensor +import mindspore +from mindspore import Tensor from mindspore.train.serialization import load_checkpoint, load_param_into_net, export from src.deepspeech2 import DeepSpeechModel from src.config import train_config @@ -33,7 +34,7 @@ args = parser.parse_args() if __name__ == '__main__': config = train_config - context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, save_graphs=False) + mindspore.set_context(mode=0, device_target=args.device_target, save_graphs=False) with open(config.DataConfig.labels_path) as label_file: labels = json.load(label_file) diff --git a/official/audio/DeepSpeech2/quick_start.py b/official/audio/DeepSpeech2/quick_start.py index 00b53b9b7dfd1a03c1977b74537c5f479b46eb6e..7bd0585a2e9351fa59f28f2c55c0ef3e53230451 100644 --- a/official/audio/DeepSpeech2/quick_start.py +++ b/official/audio/DeepSpeech2/quick_start.py @@ -20,7 +20,7 @@ from src.qs_config import quickstart_config from src.deepspeech2 import DeepSpeechModel, PredictWithSoftmax from src.dataset import create_dataset from src.greedydecoder import MSGreedyDecoder -from mindspore import context +import mindspore from mindspore.train.serialization import load_checkpoint, load_param_into_net parser = argparse.ArgumentParser(description='DeepSpeech evaluation') @@ -32,7 +32,7 @@ parser.add_argument('--device_target', type=str, default="CPU", choices=("GPU", args = parser.parse_args() if __name__ == '__main__': - context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, save_graphs=False) + mindspore.set_context(mode=0, device_target=args.device_target, save_graphs=False) config = quickstart_config with open(config.DataConfig.labels_path) as label_file: labels = json.load(label_file) diff --git a/official/audio/DeepSpeech2/train.py b/official/audio/DeepSpeech2/train.py index 014a4e0891db2243ac098bf7beca9d0d601b222e..4c01d37727f23bdb1d982c02a525acb050b00540 100644 --- a/official/audio/DeepSpeech2/train.py +++ b/official/audio/DeepSpeech2/train.py @@ -18,9 +18,10 @@ import argparse import json import os -from mindspore import context, Tensor, ParameterTuple +import mindspore +from mindspore import Tensor, ParameterTuple from mindspore.communication.management import init, get_rank, get_group_size -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.nn import TrainOneStepCell from mindspore.nn.optim import Adam from mindspore.train import Model @@ -47,15 +48,15 @@ if __name__ == '__main__': group_size = 1 config = train_config data_sink = (args.device_target != "CPU") - context.set_context(mode=context.GRAPH_MODE, 
device_target=args.device_target, save_graphs=False) + mindspore.set_context(mode=0, device_target=args.device_target, save_graphs=False) if args.device_target == "GPU": - context.set_context(enable_graph_kernel=True) + mindspore.set_context(enable_graph_kernel=True) if args.is_distributed: init() rank_id = get_rank() group_size = get_group_size() - context.reset_auto_parallel_context() - context.set_auto_parallel_context(device_num=get_group_size(), parallel_mode=ParallelMode.DATA_PARALLEL, + mindspore.reset_auto_parallel_context() + mindspore.set_auto_parallel_context(device_num=get_group_size(), parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True) with open(config.DataConfig.labels_path) as label_file: diff --git a/official/audio/EcapaTDNN/eval.py b/official/audio/EcapaTDNN/eval.py index a3d8fe83a2dfa1be5ca19b374b1ee476ef50b5a3..4d24d41e13ef2fd760af532d73ae2ea95ac276bb 100644 --- a/official/audio/EcapaTDNN/eval.py +++ b/official/audio/EcapaTDNN/eval.py @@ -21,14 +21,15 @@ import pickle import numpy as np from scipy.spatial.distance import cosine from sklearn.metrics.pairwise import cosine_similarity +import mindspore from mindspore import Tensor -from mindspore import context, load_checkpoint, load_param_into_net +from mindspore import load_checkpoint, load_param_into_net from src.ecapa_tdnn import ECAPA_TDNN from src.reader import DatasetGenerator from src.metrics import get_EER_from_scores from src.model_utils.config import config as hparams -context.set_context(mode=context.GRAPH_MODE, device_target="Ascend") +mindspore.set_context(mode=0, device_target="Ascend") excluded_set = {2302, 2303, 2304, 2305, 2306, 2307, 2308, 2309, 2310, 2311, 2312, 2313, 2314, 2315, 2316, 2317, 2318, 2319, 2320, 2321, 2322, 2323, 2324, 2325, 2326, 2327, 2328, 2329, @@ -191,7 +192,7 @@ def compute_embeddings(embedder, dataloader, startidx=0, dur=50000, exc_set=None if __name__ == "__main__": - context.set_context(device_id=hparams.device_id) + mindspore.set_context(device_id=hparams.device_id) in_channels = hparams.in_channels channels = hparams.channels emb_size = hparams.emb_size diff --git a/official/audio/EcapaTDNN/export.py b/official/audio/EcapaTDNN/export.py index 8c6002a4141798641617ff9745dacd273d31a549..5dba563c0f1fe663eec7df1f75d48cde24138462 100644 --- a/official/audio/EcapaTDNN/export.py +++ b/official/audio/EcapaTDNN/export.py @@ -20,8 +20,8 @@ import os import sys from hyperpyyaml import load_hyperpyyaml import numpy as np -import mindspore as ms -from mindspore import Tensor, load_checkpoint, load_param_into_net, export, context +import mindspore +from mindspore import Tensor, load_checkpoint, load_param_into_net, export from src.ecapa_tdnn import ECAPA_TDNN def modelarts_pre_process(): @@ -29,7 +29,7 @@ def modelarts_pre_process(): config.file_name = os.path.join(config.output_path, config.file_name) def run_export(hparams): - context.set_context(mode=context.GRAPH_MODE, device_target="Ascend") + mindspore.set_context(mode=0, device_target="Ascend") in_channels = hparams["in_channels"] channels = hparams["channels"] @@ -43,7 +43,7 @@ def run_export(hparams): load_param_into_net(net, param_dict) file_name = hparams["file_name"] file_format = hparams["file_format"] - input_arr = Tensor(np.ones([1, hparams["length"], hparams["channel"]]), ms.float32) + input_arr = Tensor(np.ones([1, hparams["length"], hparams["channel"]]), mindspore.float32) export(net, input_arr, file_name=file_name, file_format=file_format) if __name__ == '__main__': diff --git 
a/official/audio/EcapaTDNN/modelart/ecapatdnn-modelart.py b/official/audio/EcapaTDNN/modelart/ecapatdnn-modelart.py index 6901c07eb72f7cdfdad9dc050229cd3ba026473e..a35462e96f69bb37baafe933fba5551cee9723f5 100644 --- a/official/audio/EcapaTDNN/modelart/ecapatdnn-modelart.py +++ b/official/audio/EcapaTDNN/modelart/ecapatdnn-modelart.py @@ -22,17 +22,16 @@ import ast from datetime import datetime import math import numpy as np -import mindspore as ms +import mindspore import mindspore.nn as nn -from mindspore import Tensor +from mindspore import Tensor, ParallelMode import mindspore.dataset as ds import mindspore.ops as ops from mindspore.nn import FixedLossScaleUpdateCell -from mindspore import context, load_checkpoint, load_param_into_net, export +from mindspore import load_checkpoint, load_param_into_net, export from mindspore.train.callback import ModelCheckpoint from mindspore.train.callback import CheckpointConfig from mindspore.train.callback import RunContext, _InternalCallbackParam -from mindspore.context import ParallelMode from mindspore.communication.management import init, get_rank, get_group_size from src.ecapa_tdnn import ECAPA_TDNN, Classifier from src.reader import DatasetGeneratorBatch as DatasetGenerator @@ -49,7 +48,7 @@ args, unknown = parser.parse_known_args() def save_ckpt_to_air(save_ckpt_path, path): - context.set_context(mode=context.GRAPH_MODE, device_target="Ascend") + mindspore.set_context(mode=0, device_target="Ascend") in_channels = 80 channels = 1024 @@ -60,7 +59,7 @@ def save_ckpt_to_air(save_ckpt_path, path): # assert config.ckpt_file is not None, "config.ckpt_file is None." param_dict = load_checkpoint(path) load_param_into_net(net, param_dict) - input_arr = Tensor(np.ones([1, 301, 80]), ms.float32) + input_arr = Tensor(np.ones([1, 301, 80]), mindspore.float32) export(net, input_arr, file_name=save_ckpt_path+'ecapatdnn', file_format="AIR") @@ -87,12 +86,12 @@ def create_dataset(cfg, data_home, shuffle=False): class CorrectLabelNum(nn.Cell): def __init__(self): super(CorrectLabelNum, self).__init__() - self.argmax = ms.ops.Argmax(axis=1) - self.sum = ms.ops.ReduceSum() + self.argmax = ops.Argmax(axis=1) + self.sum = ops.ReduceSum() def construct(self, output, target): output = self.argmax(output) - correct = self.sum((output == target).astype(ms.dtype.float32)) + correct = self.sum((output == target).astype(mindspore.float32)) return correct @@ -105,7 +104,7 @@ class BuildTrainNetwork(nn.Cell): self.criterion = my_criterion self.lossfunc = lossfunc # Initialize self.output - self.output = ms.Parameter(Tensor(np.ones((train_batch_size, class_num_)), ms.float32), requires_grad=False) + self.output = mindspore.Parameter(Tensor(np.ones((train_batch_size, class_num_)), mindspore.float32), requires_grad=False) self.depth = class_num_ def construct(self, input_data, label): @@ -219,17 +218,17 @@ def train(): # init distributed if hparams.run_distribute: device_id = int(os.getenv('DEVICE_ID', '0')) - context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=device_id) + mindspore.set_context(mode=0, device_target="Ascend", device_id=device_id) init() hparams.rank = get_rank() hparams.group_size = get_group_size() - context.reset_auto_parallel_context() - context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, device_num=8, + mindspore.reset_auto_parallel_context() + mindspore.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, device_num=8, parameter_broadcast=True) 
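The distributed branch above shows the migration pattern that recurs throughout this patch: `mindspore.context` calls become module-level `mindspore.set_context` / `mindspore.set_auto_parallel_context` calls, with `mode=0` standing in for the old `GRAPH_MODE` constant. A self-contained sketch of the resulting 8-device data-parallel initialization, with device target and id assumed:

```python
import os
import mindspore
from mindspore import ParallelMode
from mindspore.communication.management import init, get_rank, get_group_size

# mode=0 selects GRAPH_MODE; the device id usually comes from the launcher env.
device_id = int(os.getenv('DEVICE_ID', '0'))
mindspore.set_context(mode=0, device_target="Ascend", device_id=device_id)

init()  # initialize collective communication (HCCL on Ascend)
rank, group_size = get_rank(), get_group_size()

mindspore.reset_auto_parallel_context()
mindspore.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL,
                                    gradients_mean=True,
                                    device_num=group_size,
                                    parameter_broadcast=True)
```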
else: hparams.rank = 0 hparams.group_size = 1 - context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=hparams.device_id) + mindspore.set_context(mode=0, device_target="Ascend", device_id=hparams.device_id) data_dir = args.data_url in_channels = hparams.in_channels channels = hparams.channels diff --git a/official/audio/EcapaTDNN/src/ecapa_tdnn.py b/official/audio/EcapaTDNN/src/ecapa_tdnn.py index f3f39137c9107357c7abeb112f69ecf8137bbee5..3a01234e8ad4bf7836470c58fe1dff157f7c3d75 100644 --- a/official/audio/EcapaTDNN/src/ecapa_tdnn.py +++ b/official/audio/EcapaTDNN/src/ecapa_tdnn.py @@ -14,13 +14,13 @@ # ============================================================================ import numpy as np -import mindspore as ms +import mindspore import mindspore.nn as nn import mindspore.ops as ops from mindspore import Tensor from mindspore.common.initializer import initializer, XavierUniform -ms.set_seed(0) +mindspore.set_seed(0) class MyBatchNorm1d(nn.Cell): def __init__( @@ -94,8 +94,8 @@ class Res2NetBlock(nn.Cell): ] ) self.scale = scale - self.cat = ms.ops.Concat(axis=1) - self.split = ms.ops.Split(1, scale) + self.cat = ops.Concat(axis=1) + self.split = ops.Split(1, scale) self.print = ops.operations.Print() def construct(self, x): y = [] @@ -131,12 +131,12 @@ class SEBlock(nn.Cell): in_channels=in_channels, out_channels=se_channels, kernel_size=1, has_bias=True, weight_init='he_uniform', bias_init='truncatedNormal' ) - self.relu = ms.nn.ReLU() - self.conv2 = ms.nn.Conv1d( + self.relu = nn.ReLU() + self.conv2 = nn.Conv1d( in_channels=se_channels, out_channels=out_channels, kernel_size=1, has_bias=True, weight_init='he_uniform', bias_init='truncatedNormal' ) - self.sigmoid = ms.nn.Sigmoid() + self.sigmoid = nn.Sigmoid() self.print = ops.operations.Print() def construct(self, x, lengths=None): s = x.mean((2), True) @@ -202,7 +202,7 @@ class SERes2NetBlock(nn.Cell): self.shortcut = None if in_channels != out_channels: - self.shortcut = Conv1d( + self.shortcut = nn.Conv1d( in_channels=in_channels, out_channels=out_channels, kernel_size=1, @@ -247,14 +247,14 @@ class AttentiveStatisticsPooling(nn.Cell): in_channels=attention_channels, out_channels=channels, kernel_size=1, has_bias=bias, weight_init='he_uniform', bias_init='truncatedNormal' ) - self.sqrt = ms.ops.Sqrt() - self.pow = ms.ops.Pow() - self.expandDim = ms.ops.ExpandDims() - self.softmax = ms.ops.Softmax(axis=2) - self.cat = ms.ops.Concat(axis=1) + self.sqrt = ops.Sqrt() + self.pow = ops.Pow() + self.expandDim = ops.ExpandDims() + self.softmax = ops.Softmax(axis=2) + self.cat = ops.Concat(axis=1) self.print = ops.operations.Print() - self.ones = ms.ops.Ones() - self.tile = ms.ops.Tile() + self.ones = ops.Ones() + self.tile = ops.Tile() def construct(self, x, lengths=None): def _compute_statistics(x, m, dim=2, eps=self.eps): mean = (m * x).sum(dim) @@ -372,11 +372,11 @@ class ECAPA_TDNN(nn.Cell): weight_init='he_uniform', bias_init='truncatedNormal' ) - self.expandDim = ms.ops.ExpandDims() - self.softmax = ms.ops.Softmax(axis=2) - self.cat = ms.ops.Concat(axis=1) + self.expandDim = ops.ExpandDims() + self.softmax = ops.Softmax(axis=2) + self.cat = ops.Concat(axis=1) self.print = ops.operations.Print() - self.transpose = ms.ops.Transpose() + self.transpose = ops.Transpose() def construct(self, x, lengths=None): # Minimize transpose for efficiency @@ -438,14 +438,14 @@ class Classifier(nn.Cell): input_size = lin_neurons input_size = lin_neurons # Final Layer - tensor1 = initializer(XavierUniform(), [out_neurons, 
input_size], ms.float32) - self.weight = ms.Parameter( + tensor1 = initializer(XavierUniform(), [out_neurons, input_size], mindspore.float32) + self.weight = mindspore.Parameter( tensor1 ) - self.norm = ms.ops.L2Normalize(axis=1) + self.norm = ops.L2Normalize(axis=1) self.print = ops.operations.Print() - self.matmul = ms.ops.MatMul() - self.expand_dims = ms.ops.ExpandDims() + self.matmul = ops.MatMul() + self.expand_dims = ops.ExpandDims() def construct(self, x): """Returns the output probabilities over speakers. @@ -461,7 +461,7 @@ class Classifier(nn.Cell): return output if __name__ == '__main__': - input_feats = Tensor(np.ones([1, 32, 60]), ms.float32) + input_feats = Tensor(np.ones([1, 32, 60]), mindspore.float32) compute_embedding = ECAPA_TDNN(32, channels=[256, 256, 256, 256, 768], lin_neurons=192) outputs = compute_embedding(input_feats) print(outputs.shape_) diff --git a/official/audio/EcapaTDNN/src/model_utils/moxing_adapter.py b/official/audio/EcapaTDNN/src/model_utils/moxing_adapter.py index 7d40450e9370729a5e1a3bdf21e9be0355098aa5..df07d48166148c096337ab563cdcdf8df4d0a337 100644 --- a/official/audio/EcapaTDNN/src/model_utils/moxing_adapter.py +++ b/official/audio/EcapaTDNN/src/model_utils/moxing_adapter.py @@ -17,7 +17,7 @@ import os import functools -from mindspore import context +import mindspore from src.model_utils.config import config _global_sync_count = 0 @@ -92,7 +92,7 @@ def moxing_wrapper(pre_process=None, post_process=None): sync_data(config.train_url, config.output_path) print("Workspace downloaded: ", os.listdir(config.output_path)) - context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) + mindspore.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) config.device_num = get_device_num() config.device_id = get_device_id() if not os.path.exists(config.output_path): diff --git a/official/audio/EcapaTDNN/src/util.py b/official/audio/EcapaTDNN/src/util.py index c02e831176c52d38f2d9b30996c4aba9c3a97fb5..b321b978df88e7300bf0a109ff69c1d9996c2de5 100644 --- a/official/audio/EcapaTDNN/src/util.py +++ b/official/audio/EcapaTDNN/src/util.py @@ -14,7 +14,7 @@ # ============================================================================ import math -import mindspore as ms +import mindspore import mindspore.nn as nn import mindspore.numpy as np @@ -48,8 +48,8 @@ class AdditiveAngularMargin(nn.Cell): self.sin_m = math.sin(self.margin) self.th = math.cos(math.pi - self.margin) self.mm = math.sin(math.pi - self.margin) * self.margin - self.sqrt = ms.ops.Sqrt() - self.pow = ms.ops.Pow() + self.sqrt = mindspore.ops.Sqrt() + self.pow = mindspore.ops.Pow() def construct(self, outputs, targets): """ diff --git a/official/audio/EcapaTDNN/train.py b/official/audio/EcapaTDNN/train.py index 7cd48094bde82819ee221e8e4a181af986edd0ed..56117e3a0ca63993db95ff1c3418d7dd672cd3b8 100644 --- a/official/audio/EcapaTDNN/train.py +++ b/official/audio/EcapaTDNN/train.py @@ -20,17 +20,17 @@ import time from datetime import datetime import math import numpy as np -import mindspore as ms +import mindspore import mindspore.nn as nn from mindspore import Tensor import mindspore.dataset as ds import mindspore.ops as ops from mindspore.nn import FixedLossScaleUpdateCell -from mindspore import context, load_checkpoint, load_param_into_net +from mindspore import load_checkpoint, load_param_into_net from mindspore.train.callback import ModelCheckpoint from mindspore.train.callback import CheckpointConfig from mindspore.train.callback import 
RunContext, _InternalCallbackParam -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.communication.management import init, get_rank, get_group_size from src.ecapa_tdnn import ECAPA_TDNN, Classifier from src.reader import DatasetGeneratorBatch as DatasetGenerator @@ -63,12 +63,12 @@ def create_dataset(cfg, data_home, shuffle=False): class CorrectLabelNum(nn.Cell): def __init__(self): super(CorrectLabelNum, self).__init__() - self.argmax = ms.ops.Argmax(axis=1) - self.sum = ms.ops.ReduceSum() + self.argmax = ops.Argmax(axis=1) + self.sum = ops.ReduceSum() def construct(self, output, target): output = self.argmax(output) - correct = self.sum((output == target).astype(ms.dtype.float32)) + correct = self.sum((output == target).astype(mindspore.float32)) return correct @@ -81,7 +81,7 @@ class BuildTrainNetwork(nn.Cell): self.criterion = my_criterion self.lossfunc = lossfunc # Initialize self.output - self.output = ms.Parameter(Tensor(np.ones((train_batch_size, class_num_)), ms.float32), requires_grad=False) + self.output = mindspore.Parameter(Tensor(np.ones((train_batch_size, class_num_)), mindspore.float32), requires_grad=False) self.depth = class_num_ def construct(self, input_data, label): @@ -191,17 +191,17 @@ def train(): # init distributed if hparams.run_distribute: device_id = int(os.getenv('DEVICE_ID', '0')) - context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=device_id) + mindspore.set_context(mode=0, device_target="Ascend", device_id=device_id) init() hparams.rank = get_rank() hparams.group_size = get_group_size() - context.reset_auto_parallel_context() - context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, device_num=8, + mindspore.reset_auto_parallel_context() + mindspore.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, device_num=8, parameter_broadcast=True) else: hparams.rank = 0 hparams.group_size = 1 - context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=hparams.device_id) + mindspore.set_context(mode=0, device_target="Ascend", device_id=hparams.device_id) data_dir = hparams.train_data_path in_channels = hparams.in_channels channels = hparams.channels diff --git a/official/audio/LPCNet/eval.py b/official/audio/LPCNet/eval.py index 67006a9e75b4ea59781856971b8ba99c51f47bff..1ac5a4645ae4ccd3b0c54f52a1faaf7642909bd2 100644 --- a/official/audio/LPCNet/eval.py +++ b/official/audio/LPCNet/eval.py @@ -19,7 +19,7 @@ from pathlib import Path import numpy as np import mindspore import mindspore.numpy as mnp -from mindspore import context, load_checkpoint +from mindspore import load_checkpoint from src import lpcnet from src.ulaw import lin2ulaw, ulaw2lin @@ -92,7 +92,7 @@ if __name__ == "__main__": device_id = args.device_id # NOTE: fails without max_call_depth due to RNN - context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", + mindspore.set_context(mode=0, device_target="Ascend", max_call_depth=5000, device_id=device_id) _model = lpcnet.WithLossLPCNet() diff --git a/official/audio/LPCNet/export.py b/official/audio/LPCNet/export.py index 9fe594d1131fa4783cdb27e0190dc8567565888f..d19682a7206d2639c66ac53bde9635c449c757fc 100644 --- a/official/audio/LPCNet/export.py +++ b/official/audio/LPCNet/export.py @@ -17,7 +17,7 @@ from argparse import ArgumentParser import numpy as np import mindspore -from mindspore import context, export, load_checkpoint +from mindspore import export, load_checkpoint from 
src import lpcnet @@ -40,7 +40,7 @@ if __name__ == "__main__": f.write(f"#define MAXLEN {args.max_len}") # NOTE: fails without max_call_depth due to RNN - context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, + mindspore.set_context(mode=0, device_target=args.device_target, max_call_depth=30000, device_id=args.device_id) model = lpcnet.WithLossLPCNet() diff --git a/official/audio/LPCNet/src/rnns/rnns.py b/official/audio/LPCNet/src/rnns/rnns.py index f12b6022e65e699198a36547938dbbab81dd49f6..8b973a2691b3af9ca90e3eaed365e813c28d9f6f 100644 --- a/official/audio/LPCNet/src/rnns/rnns.py +++ b/official/audio/LPCNet/src/rnns/rnns.py @@ -16,6 +16,7 @@ '''RNN operators module, include RNN, GRU, LSTM''' import math import numpy as np +import mindspore import mindspore.nn as nn import mindspore.ops as P import mindspore.numpy as mnp @@ -23,7 +24,6 @@ from mindspore.common import dtype as mstype from mindspore.ops.primitive import constexpr from mindspore import Tensor, Parameter, ParameterTuple from mindspore import log as logger -from mindspore import context from .rnn_cells import rnn_relu_cell, rnn_tanh_cell, gru_cell, lstm_cell @constexpr @@ -248,7 +248,7 @@ class _RNNBase(nn.Cell): def __init__(self, mode, input_size, hidden_size, num_layers=1, has_bias=True, batch_first=False, dropout=0, bidirectional=False): super().__init__() - is_ascend = context.get_context("device_target") == "Ascend" + is_ascend = mindspore.get_context("device_target") == "Ascend" if not 0 <= dropout <= 1: raise ValueError("dropout should be a number in range [0, 1] " "representing the probability of an element being " diff --git a/official/audio/LPCNet/train.py b/official/audio/LPCNet/train.py index 54503dedb0da7705dbfb6572c91232e49892ee9d..a23b772dd3ef700c2bc9a42ee374a2f26b4c0e7e 100644 --- a/official/audio/LPCNet/train.py +++ b/official/audio/LPCNet/train.py @@ -19,7 +19,7 @@ from pathlib import Path import mindspore import mindspore.dataset as ds import mindspore.numpy as np -from mindspore import Model, context, nn, ops +from mindspore import Model, nn, ops from mindspore.train.callback import (CheckpointConfig, LossMonitor, ModelCheckpoint, TimeMonitor) @@ -117,7 +117,7 @@ if __name__ == "__main__": if retrain: input_model = args.retrain - context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, + mindspore.set_context(mode=0, device_target=args.device_target, max_call_depth=5000) # NOTE: fails without max_call_depth due to RNN ds.config.set_prefetch_size(16) diff --git a/official/audio/LPCNet/train_lpcnet_parallel.py b/official/audio/LPCNet/train_lpcnet_parallel.py index a0b8f09c7c8724f676af238a424cf214d325332c..7cebfc0faf31788eb662a4cd8e8d93398ce8233b 100644 --- a/official/audio/LPCNet/train_lpcnet_parallel.py +++ b/official/audio/LPCNet/train_lpcnet_parallel.py @@ -20,9 +20,8 @@ from pathlib import Path import mindspore import mindspore.dataset as ds import mindspore.numpy as np -from mindspore import Model, context, nn, ops +from mindspore import Model, nn, ops, ParallelMode from mindspore.communication import get_group_size, init -from mindspore.context import ParallelMode from mindspore.train.callback import (CheckpointConfig, LossMonitor, ModelCheckpoint, TimeMonitor) @@ -120,12 +119,12 @@ if __name__ == "__main__": input_model = args.retrain device_id = int(os.getenv('DEVICE_ID')) - context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, + mindspore.set_context(mode=0, device_target=args.device_target, max_call_depth=5000) # NOTE: fails 
without max_call_depth due to RNN - context.set_context(device_id=device_id) + mindspore.set_context(device_id=device_id) init() - context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, + mindspore.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=False, parameter_broadcast=True) ds.config.set_prefetch_size(16) diff --git a/official/audio/MELGAN/README.md b/official/audio/MELGAN/README.md index 448fa1bb5d57e140d2f7777fa10e3379f0642e13..4185a79a59de82d5220dd46f58de302259a598ca 100644 --- a/official/audio/MELGAN/README.md +++ b/official/audio/MELGAN/README.md @@ -46,7 +46,7 @@ Dataset used: [LJ Speech]() - Dataset size:2.6GB,13,100 short audio clips of a single speaker reading passages from 7 non-fiction books. - Data format:Each audio file is a single-channel 16-bit PCM WAV with a sample rate of 22050 Hz - - The audio data needs to be processed to a mel-spectrum, and you can refer to the script in [mel-spectrogram data creation](https://github.com/seungwonpark/melgan/blob/master/preprocess.py). Non CUDA environment needs to delete `. cuda()` in `utils/stfy.py`. To save data in the `npy` format, `preprocess.py` also needs to be modified. As follows: + - The audio data needs to be processed into a mel-spectrogram; you can refer to the script in [mel-spectrogram data creation](https://github.com/seungwonpark/melgan/blob/master/preprocess.py). In a non-CUDA environment, delete `.cuda()` in `utils/stft.py`. To save data in the `npy` format, `preprocess.py` also needs to be modified, as follows: ``` # 37 - 38 lines diff --git a/official/audio/MELGAN/README_CN.md b/official/audio/MELGAN/README_CN.md index 03a3b7badd31889dc4666668db587d7ce3afd340..5d80d749a4a672ce67e1c1d17d4ca158284f8af1 100644 --- a/official/audio/MELGAN/README_CN.md +++ b/official/audio/MELGAN/README_CN.md @@ -46,7 +46,7 @@ MelGAN模型是非自回归全卷积模型。它的参数比同类模型少得 - Dataset size:2.6GB,包含13,100条只有一个说话人的短语音。语音的内容来自7本纪实书籍。 - 数据格式:每条语音文件都是单声道、16-bit以及采样率为22050。 - - 语音需要被处理为Mel谱, 可以参考脚本[Mel谱处理脚本](https://github.com/seungwonpark/melgan/blob/master/preprocess.py)。非CUDA环境需删除`utils/stfy.py`中的`.cuda()`,因为要保存`npy`格式的数据,所以`preproccess.py`也需要修改以下,参考代码如下: + - 语音需要被处理为Mel谱, 可以参考脚本[Mel谱处理脚本](https://github.com/seungwonpark/melgan/blob/master/preprocess.py)。非CUDA环境需删除`utils/stft.py`中的`.cuda()`,因为要保存`npy`格式的数据,所以`preprocess.py`也需要修改一下,参考代码如下: ``` # 37 - 38 行 diff --git a/official/audio/MELGAN/eval.py b/official/audio/MELGAN/eval.py index 7ac894034e1f619251fe3ec435d7496765057222..033bee0df01eef98f6e82acd1cf5e12d1395ebfd 100644 --- a/official/audio/MELGAN/eval.py +++ b/official/audio/MELGAN/eval.py @@ -17,19 +17,19 @@ import os import numpy as np from scipy.io.wavfile import write +import mindspore from mindspore import Model from mindspore.train.serialization import load_checkpoint, load_param_into_net from mindspore.common.tensor import Tensor -import mindspore.context as context from src.model import Generator from src.model_utils.config import config as cfg -context.set_context(mode=context.GRAPH_MODE, device_target="Ascend") +mindspore.set_context(mode=0, device_target="Ascend") if __name__ == '__main__': - context.set_context(device_id=cfg.device_id) + mindspore.set_context(device_id=cfg.device_id) if not os.path.exists(cfg.output_path): os.mkdir(cfg.output_path) diff --git a/official/audio/MELGAN/export.py b/official/audio/MELGAN/export.py index 0ae2e1a9f1e171852592827da97a4ef1c9682b45..dc5aea4debfb46cde94698ed54dcede2d829c807 100644 --- a/official/audio/MELGAN/export.py +++ 
b/official/audio/MELGAN/export.py @@ -16,7 +16,7 @@ import argparse import numpy as np -import mindspore as ms +import mindspore from mindspore import Tensor from mindspore.train.serialization import load_checkpoint, load_param_into_net, export @@ -35,5 +35,5 @@ if __name__ == '__main__': param_dict = load_checkpoint(args_opt.checkpoint_path) load_param_into_net(net, param_dict) - input_arr = Tensor(np.random.uniform(0.0, 1.0, size=[1, 80, 240]), ms.float32) + input_arr = Tensor(np.random.uniform(0.0, 1.0, size=[1, 80, 240]), mindspore.float32) export(net, input_arr, file_name=args_opt.model_name, file_format=args_opt.format) diff --git a/official/audio/MELGAN/modelarts/train_modelarts.py b/official/audio/MELGAN/modelarts/train_modelarts.py index 681db402e3a735ce3bf93e031dc84484c89f7fc9..3a13b099e8426223dc3fe4701d7ca5081b60c493 100644 --- a/official/audio/MELGAN/modelarts/train_modelarts.py +++ b/official/audio/MELGAN/modelarts/train_modelarts.py @@ -17,15 +17,14 @@ import os import time import numpy as np -import mindspore as ms +import mindspore import mindspore.common.dtype as mstype -import mindspore.context as context import mindspore.dataset as de import mindspore.nn as nn from mindspore.common import set_seed from mindspore.common.tensor import Tensor from mindspore.communication.management import init, get_rank, get_group_size -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.train.callback import RunContext, ModelCheckpoint, CheckpointConfig, _InternalCallbackParam from mindspore.train.loss_scale_manager import DynamicLossScaleManager from mindspore.train.serialization import load_checkpoint, load_param_into_net, export @@ -74,17 +73,17 @@ def train(): # init distributed if cfg.run_distribute: device_id = int(os.getenv('DEVICE_ID', '0')) - context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=device_id) + mindspore.set_context(mode=0, device_target="Ascend", device_id=device_id) init() cfg.rank = get_rank() cfg.group_size = get_group_size() - context.reset_auto_parallel_context() - context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, device_num=8, + mindspore.reset_auto_parallel_context() + mindspore.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, device_num=8, parameter_broadcast=True) else: cfg.rank = 0 cfg.group_size = 1 - context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=cfg.device_id) + mindspore.set_context(mode=0, device_target="Ascend", device_id=cfg.device_id) # get network and init net_D = MultiDiscriminator() net_G = Generator(alpha=cfg.leaky_alpha) @@ -169,7 +168,7 @@ def train(): duration = time.perf_counter() - epoch_t print('finish in {:.2f}mins'.format(duration / 60)) - input_arr = Tensor(np.random.uniform(0.0, 1.0, size=[1, 80, 240]), ms.float32) + input_arr = Tensor(np.random.uniform(0.0, 1.0, size=[1, 80, 240]), mindspore.float32) export(net_G, input_arr, file_name=os.path.join(cfg.train_url, 'melgan_final'), file_format="AIR") diff --git a/official/audio/MELGAN/src/model_utils/moxing_adapter.py b/official/audio/MELGAN/src/model_utils/moxing_adapter.py index aabd5ac6cf1bde3ca20f3d6ea9cf3d5310169f1e..32c4e5ab6b8c2c6fc3e6c3ceca743bd75b671f23 100644 --- a/official/audio/MELGAN/src/model_utils/moxing_adapter.py +++ b/official/audio/MELGAN/src/model_utils/moxing_adapter.py @@ -17,7 +17,7 @@ import os import functools -from mindspore import context +import mindspore from 
src.model_utils.config import config _global_sync_count = 0 @@ -92,7 +92,7 @@ def moxing_wrapper(pre_process=None, post_process=None): sync_data(config.train_url, config.output_path) print("Workspace downloaded: ", os.listdir(config.output_path)) - context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) + mindspore.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) config.device_num = get_device_num() config.device_id = get_device_id() if not os.path.exists(config.output_path): diff --git a/official/audio/MELGAN/src/trainonestep.py b/official/audio/MELGAN/src/trainonestep.py index 7f5f1b387d7ba2eb8328eef68dba021ce07f1af4..2eaa0bc170c1f95228e24a83fefb7d8b6afe541a 100644 --- a/official/audio/MELGAN/src/trainonestep.py +++ b/official/audio/MELGAN/src/trainonestep.py @@ -18,7 +18,7 @@ import mindspore.common.dtype as mstype from mindspore.common.tensor import Tensor from mindspore.ops import functional as F from mindspore.ops import operations as P -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.nn.cell import Cell from mindspore.common.parameter import ParameterTuple from mindspore.ops.operations import ReduceSum, \ diff --git a/official/audio/MELGAN/train.py b/official/audio/MELGAN/train.py index 0c6f3a9fc8dd7473cefbe828b5a6f6b6a73e7ecc..20a4127b4b99219c8dff381d471b430bda8e0a9b 100644 --- a/official/audio/MELGAN/train.py +++ b/official/audio/MELGAN/train.py @@ -16,14 +16,13 @@ import time import os -import mindspore.nn as nn +import mindspore +from mindspore import ParallelMode, nn from mindspore.common import set_seed import mindspore.common.dtype as mstype from mindspore.common.tensor import Tensor -from mindspore.context import ParallelMode from mindspore.communication.management import init, get_rank, get_group_size import mindspore.dataset as de -import mindspore.context as context from mindspore.train.loss_scale_manager import DynamicLossScaleManager from mindspore.train.callback import RunContext, ModelCheckpoint, CheckpointConfig, _InternalCallbackParam from mindspore.train.serialization import load_checkpoint, load_param_into_net @@ -67,17 +66,17 @@ def train(): # init distributed if cfg.run_distribute: device_id = int(os.getenv('DEVICE_ID', '0')) - context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=device_id) + mindspore.set_context(mode=0, device_target="Ascend", device_id=device_id) init() cfg.rank = get_rank() cfg.group_size = get_group_size() - context.reset_auto_parallel_context() - context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, device_num=8, + mindspore.reset_auto_parallel_context() + mindspore.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, device_num=8, parameter_broadcast=True) else: cfg.rank = 0 cfg.group_size = 1 - context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=cfg.device_id) + mindspore.set_context(mode=0, device_target="Ascend", device_id=cfg.device_id) # get network and init net_D = MultiDiscriminator() net_G = Generator(alpha=cfg.leaky_alpha) diff --git a/official/audio/Tacotron2/README.md b/official/audio/Tacotron2/README.md index 53283ba9b019691a9454d9b9a7c8e51337f86611..51c939693002b061c2d6f4d6a4d5c54f134e39ef 100644 --- a/official/audio/Tacotron2/README.md +++ b/official/audio/Tacotron2/README.md @@ -76,8 +76,8 @@ After installing MindSpore via the official website, you can start training and # example: 
bash run_standalone_train.sh /path/ljspeech.hdf5 0 # run distributed training - bash run_distributed_train.sh [DATASET_PATH] [RANK_TABLE_PATH] [DATANAME] [RANK_SIZE] [DEVICE_BEGIN] - # example: bash run_distributed_train.sh /path/ljspeech.h5 ../hccl_8p_01234567_127.0.0.1.json 8 0 + bash run_distribute_train.sh [DATASET_PATH] [RANK_TABLE_PATH] [DATANAME] [RANK_SIZE] [DEVICE_BEGIN] + # example: bash run_distribute_train.sh /path/ljspeech.h5 ../hccl_8p_01234567_127.0.0.1.json 8 0 # run evaluation bash run_eval.sh [OUTPUT_PATH] [MODEL_CKPT] [DEVICE_ID] text is set in config.py( can modify text of ljspeech_config.yaml) @@ -246,7 +246,7 @@ Parameters for both training and evaluation can be set in [DATASET]_config.yaml ```bash cd scripts - bash run_distributed_train.sh [DATASET_PATH] [RANK_TABLE_PATH] [DATANAME] [RANK_SIZE] [DEVICE_BEGIN] + bash run_distribute_train.sh [DATASET_PATH] [RANK_TABLE_PATH] [DATANAME] [RANK_SIZE] [DEVICE_BEGIN] ``` Note: `DATASET_PATH` is the directory contains hdf5 file. diff --git a/official/audio/Tacotron2/README_CN.md b/official/audio/Tacotron2/README_CN.md index e104f6e4872f367bd2ab838ea118ee80fc74952c..aaf135f92cb8ce673929f52cde79a20551ee576d 100644 --- a/official/audio/Tacotron2/README_CN.md +++ b/official/audio/Tacotron2/README_CN.md @@ -76,8 +76,8 @@ Tacotron2实质上是一种包含编码器和解码器的序列到序列模型 # 示例:bash run_standalone_train.sh /path/ljspeech.hdf5 0 # 运行分布式训练 - bash run_distributed_train.sh [DATASET_PATH] [RANK_TABLE_PATH] [DATANAME] [RANK_SIZE] [DEVICE_BEGIN] - # 示例:bash run_distributed_train.sh /path/ljspeech.h5 ../hccl_8p_01234567_127.0.0.1.json 8 0 + bash run_distribute_train.sh [DATASET_PATH] [RANK_TABLE_PATH] [DATANAME] [RANK_SIZE] [DEVICE_BEGIN] + # 示例:bash run_distribute_train.sh /path/ljspeech.h5 ../hccl_8p_01234567_127.0.0.1.json 8 0 # 运行评估 bash run_eval.sh [OUTPUT_PATH] [MODEL_CKPT] [DEVICE_ID] text is set in config.py( can modify text of ljspeech_config.yaml) @@ -246,7 +246,7 @@ tacotron2/ ```bash cd scripts - bash run_distributed_train.sh [DATASET_PATH] [RANK_TABLE_PATH] [DATANAME] [RANK_SIZE] [DEVICE_BEGIN] + bash run_distribute_train.sh [DATASET_PATH] [RANK_TABLE_PATH] [DATANAME] [RANK_SIZE] [DEVICE_BEGIN] ``` 注:`DATASET_PATH`是包含HDF5文件的目录。 diff --git a/official/audio/Tacotron2/eval.py b/official/audio/Tacotron2/eval.py index e04016a47747cb848f4a289c8a7d183a18615a9c..c6150ddc388ea496f815cf288985749660f36020 100644 --- a/official/audio/Tacotron2/eval.py +++ b/official/audio/Tacotron2/eval.py @@ -24,7 +24,6 @@ import matplotlib.pylab as plt import numpy as np import mindspore -from mindspore import context from mindspore.train.serialization import load_checkpoint, load_param_into_net from mindspore import Tensor @@ -39,7 +38,7 @@ from model_utils.device_adapter import get_device_id, get_device_num matplotlib.use('Agg') -context.set_context(mode=context.GRAPH_MODE, save_graphs=False, device_target=config.device_target) +mindspore.set_context(mode=0, save_graphs=False, device_target=config.device_target) def load_model(ckpt_pth): diff --git a/official/audio/Tacotron2/model_utils/moxing_adapter.py b/official/audio/Tacotron2/model_utils/moxing_adapter.py index 25838a7da99a27a1bb744684c1f75f80f5704688..189ff0667a1a783691749e55e41f1562c100b9c9 100644 --- a/official/audio/Tacotron2/model_utils/moxing_adapter.py +++ b/official/audio/Tacotron2/model_utils/moxing_adapter.py @@ -17,7 +17,7 @@ import os import functools -from mindspore import context +import mindspore from .config import config _global_sync_count = 0 @@ -92,7 +92,7 @@ def moxing_wrapper(pre_process=None, 
post_process=None): sync_data(config.train_url, config.output_path) print("Workspace downloaded: ", os.listdir(config.output_path)) - context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) + mindspore.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) config.device_num = get_device_num() config.device_id = get_device_id() if not os.path.exists(config.output_path): diff --git a/official/audio/Tacotron2/src/tacotron2.py b/official/audio/Tacotron2/src/tacotron2.py index e685cf0f8d0ca516cc18a92674488e9a12b6ca44..314bb0f302f14b80d5cd48d027198af40c62d5aa 100644 --- a/official/audio/Tacotron2/src/tacotron2.py +++ b/official/audio/Tacotron2/src/tacotron2.py @@ -22,8 +22,7 @@ from mindspore.ops import operations as P from mindspore.ops import composite as C from mindspore.ops import functional as F from mindspore.ops import Argmax as indexArgmax -from mindspore import context -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.communication.management import get_group_size from mindspore.nn.wrap.grad_reducer import DistributedGradReducer from mindspore.parallel._utils import _get_gradients_mean @@ -1112,7 +1111,7 @@ class TrainStepWrap(nn.Cell): dtype=mindspore.float32)) self.reducer_flag = False - self.parallel_mode = context.get_auto_parallel_context("parallel_mode") + self.parallel_mode = mindspore.get_auto_parallel_context("parallel_mode") if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]: self.reducer_flag = True self.is_distributed = True @@ -1168,7 +1167,7 @@ class TrainStepWrap(nn.Cell): overflow = ops.logical_not(amp.all_finite(grads)) if self.reducer_flag: - overflow = self.allreduce(overflow.to(mstype.float32)) >= self.base + overflow = self.all_reduce(overflow.to(mstype.float32)) >= self.base overflow = self.loss_scaling_manager(self.loss_scale, overflow) diff --git a/official/audio/Tacotron2/train.py b/official/audio/Tacotron2/train.py index 9b6bc2ad6551b48129f569ef3420c711eeecf525..a47dd3a8ce317f57e34f1722e065ed5e11ea5322 100644 --- a/official/audio/Tacotron2/train.py +++ b/official/audio/Tacotron2/train.py @@ -20,12 +20,11 @@ import numpy as np import mindspore import mindspore.dataset as ds -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.communication import management as MultiDevice from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor from mindspore.train.serialization import load_checkpoint, load_param_into_net from mindspore.nn.wrap.loss_scale import DynamicLossScaleUpdateCell -from mindspore import context from mindspore import Model from mindspore import Tensor from mindspore import dtype as mstype @@ -53,7 +53,7 @@ mindspore.common.set_seed(1024) time_stamp_init = False time_stamp_first = 0 -context.set_context(mode=context.GRAPH_MODE, save_graphs=False, device_target=config.device_target, max_call_depth=8000) +mindspore.set_context(mode=0, save_graphs=False, device_target=config.device_target, max_call_depth=8000) def prepare_dataloaders(dataset_path, rank_id, group_size): @@ -195,9 +195,9 @@ def _build_training_pipeline(pre_dataset, run_distribute=False): def set_parallel_env(): '''set parallel context''' - context.reset_auto_parallel_context() + mindspore.reset_auto_parallel_context() MultiDevice.init() - context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, + 
mindspore.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, device_num=MultiDevice.get_group_size(), gradients_mean=True) diff --git a/official/cv/Arcface/eval_ijbc.py b/official/cv/Arcface/eval_ijbc.py index 26f5180b15892eedcfc758fc3780958f5176f10b..493da95e9686fc6862782f49415d9c36c5a602bd 100644 --- a/official/cv/Arcface/eval_ijbc.py +++ b/official/cv/Arcface/eval_ijbc.py @@ -40,8 +40,9 @@ import cv2 from skimage import transform as trans +import mindspore from mindspore.train.serialization import load_checkpoint, load_param_into_net -from mindspore import Tensor, context +from mindspore import Tensor from mindspore import dtype as mstype import mindspore.ops as ops import mindspore.nn as nn @@ -156,8 +157,8 @@ def divideIntoNstrand(listTemp, n): def read_template_media_list(path): ijb_meta = pd.read_csv(path, sep=' ', header=None).values - templates = ijb_meta[:, 1].astype(np.int) - media = ijb_meta[:, 2].astype(np.int) + templates = ijb_meta[:, 1].astype(np.int_) + media = ijb_meta[:, 2].astype(np.int_) return templates, media @@ -166,9 +167,9 @@ def read_template_media_list(path): def read_template_pair_list(path): pairs = pd.read_csv(path, sep=' ', header=None).values - t1 = pairs[:, 0].astype(np.int) - t2 = pairs[:, 1].astype(np.int) - label = pairs[:, 2].astype(np.int) + t1 = pairs[:, 0].astype(np.int_) + t2 = pairs[:, 1].astype(np.int_) + label = pairs[:, 2].astype(np.int_) return t1, t2, label @@ -348,7 +349,7 @@ def read_score(path): def main(): - context.set_context(mode=context.GRAPH_MODE, device_id=0) + mindspore.set_context(mode=0, device_id=0) # # Step1: Load Meta Data # In[ ]: diff --git a/official/cv/Arcface/eval_ijbc_onnx.py b/official/cv/Arcface/eval_ijbc_onnx.py index 317c51f4b5680755e3fb5e146dd1ec07e45c3901..b6ba4fbb18126cb95f70080ada08596bd01d6625 100644 --- a/official/cv/Arcface/eval_ijbc_onnx.py +++ b/official/cv/Arcface/eval_ijbc_onnx.py @@ -148,16 +148,16 @@ def divideIntoNstrand(listTemp, n): def read_template_media_list(path): ijb_meta = pd.read_csv(path, sep=' ', header=None).values - templates = ijb_meta[:, 1].astype(np.int) - media = ijb_meta[:, 2].astype(np.int) + templates = ijb_meta[:, 1].astype(np.int_) + media = ijb_meta[:, 2].astype(np.int_) return templates, media def read_template_pair_list(path): pairs = pd.read_csv(path, sep=' ', header=None).values - t1 = pairs[:, 0].astype(np.int) - t2 = pairs[:, 1].astype(np.int) - label = pairs[:, 2].astype(np.int) + t1 = pairs[:, 0].astype(np.int_) + t2 = pairs[:, 1].astype(np.int_) + label = pairs[:, 2].astype(np.int_) return t1, t2, label diff --git a/official/cv/Arcface/export.py b/official/cv/Arcface/export.py index 52ca1d9b26c68d3bb2163205fc401ce26cbc5fbc..f6bd1a0ee220eb3d5c83a951ca0fecf98de25362 100644 --- a/official/cv/Arcface/export.py +++ b/official/cv/Arcface/export.py @@ -19,8 +19,9 @@ python export.py import argparse import numpy as np +import mindspore from mindspore import dtype as mstype -from mindspore import Tensor, load_checkpoint, load_param_into_net, export, context +from mindspore import Tensor, load_checkpoint, load_param_into_net, export from src.iresnet import iresnet100 @@ -36,9 +37,9 @@ parser.add_argument('--dataset_name', type=str, default='MS1MV2', choices=['MS1M help='dataset name.') args = parser.parse_args() -context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target) +mindspore.set_context(mode=0, device_target=args.device_target) if args.device_target == "Ascend": - context.set_context(device_id=args.device_id) + 
mindspore.set_context(device_id=args.device_id) if __name__ == '__main__': if args.dataset_name != 'MS1MV2': diff --git a/official/cv/Arcface/modelarts/start.py b/official/cv/Arcface/modelarts/start.py index db6b7ce2e0bed416c97ff0197c2a8149804365f8..5317ef683624f99d13ccc5de921b2e406b5e47c5 100644 --- a/official/cv/Arcface/modelarts/start.py +++ b/official/cv/Arcface/modelarts/start.py @@ -19,10 +19,11 @@ import os import glob import argparse import numpy as np +import mindspore import mindspore.nn as nn import mindspore.ops as ops from mindspore import export -from mindspore import context, Tensor +from mindspore import Tensor from mindspore import dtype as mstype from mindspore.parallel import set_algo_parameters from mindspore.train.model import Model, ParallelMode @@ -114,15 +115,15 @@ if __name__ == "__main__": ckpt_save_path = CKPT_PATH train_epoch = args.epochs target = args.device_target - context.set_context(mode=context.GRAPH_MODE, + mindspore.set_context(mode=0, device_target=target, save_graphs=False) if args.device_num > 1: device_id = int(os.getenv('DEVICE_ID')) - context.set_context(device_id=device_id) + mindspore.set_context(device_id=device_id) else: - context.set_context(device_id=args.device_id) + mindspore.set_context(device_id=args.device_id) if args.device_num > 1: - context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, + mindspore.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, ) cost_model_context.set_cost_model_context(device_memory_capacity=32.0 * 1024.0 * 1024.0 * 1024.0, diff --git a/official/cv/Arcface/train.py b/official/cv/Arcface/train.py index 80c2a858489828acefd301634b84173cc3c19a67..0dda876a6be0b23bd5068782b2132d0889537564 100644 --- a/official/cv/Arcface/train.py +++ b/official/cv/Arcface/train.py @@ -18,9 +18,10 @@ python train.py import argparse import os import numpy as np + import mindspore import mindspore.nn as nn -from mindspore import context, Tensor +from mindspore import Tensor import mindspore.ops as ops from mindspore.train.model import Model, ParallelMode from mindspore import dtype as mstype @@ -99,14 +100,14 @@ class MyNetWithLoss(nn.Cell): if __name__ == "__main__": train_epoch = args.epochs target = args.device_target - context.set_context(mode=context.GRAPH_MODE, + mindspore.set_context(mode=0, device_target=target, save_graphs=False) device_id = args.device_id if args.device_num > 1: if target == 'Ascend': device_id = int(os.getenv('DEVICE_ID')) - context.set_context(device_id=device_id) - context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, + mindspore.set_context(device_id=device_id) + mindspore.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, ) cost_model_context.set_cost_model_context(device_memory_capacity=32.0 * 1024.0 * 1024.0 * 1024.0, @@ -116,7 +117,7 @@ if __name__ == "__main__": init() elif target == 'GPU': init() - context.set_auto_parallel_context(device_num=args.device_num, + mindspore.set_auto_parallel_context(device_num=args.device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, auto_parallel_search_mode="recursive_programming") diff --git a/official/cv/Arcface/val.py b/official/cv/Arcface/val.py index 1e4243c3261487e6e055c54df5bea2cd3811e4e9..f2d4b010b85294702aa8ef1584986200be9d37b0 100644 --- a/official/cv/Arcface/val.py +++ b/official/cv/Arcface/val.py @@ -28,9 +28,8 @@ from sklearn.decomposition import PCA from sklearn.model_selection import KFold import matplotlib.pyplot 
as plt from scipy import interpolate -import mindspore as ms +import mindspore from mindspore.train.serialization import load_checkpoint, load_param_into_net -from mindspore import context from src.iresnet import iresnet100 @@ -251,7 +250,7 @@ def test(data_set, backbone, batch_size, nfolds=10): time0 = datetime.datetime.now() img = ((_data / 255) - 0.5) / 0.5 - net_out = backbone(ms.Tensor(img, ms.float32)) + net_out = backbone(mindspore.Tensor(img, mindspore.float32)) _embeddings = net_out.asnumpy() time_now = datetime.datetime.now() diff = time_now - time0 @@ -305,7 +304,7 @@ def main(): parser.add_argument('--max', default='', type=str, help='') parser.add_argument('--nfolds', default=10, type=int, help='') args = parser.parse_args() - context.set_context(device_id=args.device_id, mode=context.GRAPH_MODE, + mindspore.set_context(device_id=args.device_id, mode=0, device_target=args.device_target) image_size = [112, 112] time0 = datetime.datetime.now() diff --git a/official/cv/CRNN/eval.py b/official/cv/CRNN/eval.py index 635ae6124e930a6c8d7e976500c038396aac0999..8e50e63a7d5358ff97acde4aa311c39963e10384 100644 --- a/official/cv/CRNN/eval.py +++ b/official/cv/CRNN/eval.py @@ -13,7 +13,7 @@ # limitations under the License. # ============================================================================ """Warpctc evaluation""" -from mindspore import context +import mindspore from mindspore.common import set_seed from mindspore.train.model import Model from mindspore.train.serialization import load_checkpoint, load_param_into_net @@ -29,14 +29,14 @@ from src.model_utils.device_adapter import get_device_id set_seed(1) -context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, save_graphs=False) +mindspore.set_context(mode=0, device_target=config.device_target, save_graphs=False) @moxing_wrapper(pre_process=None) def crnn_eval(): if config.device_target == 'Ascend': device_id = get_device_id() - context.set_context(device_id=device_id) + mindspore.set_context(device_id=device_id) config.batch_size = 1 max_text_length = config.max_text_length diff --git a/official/cv/CRNN/export.py b/official/cv/CRNN/export.py index 9a89c30bb6bf30454b3610958a0b18f93ce484a4..10986006d9afbfccb579bb96250b4dd6d95baabc 100644 --- a/official/cv/CRNN/export.py +++ b/official/cv/CRNN/export.py @@ -16,14 +16,14 @@ """ export model for CRNN """ import os import numpy as np -import mindspore as ms -from mindspore import Tensor, context, load_checkpoint, export +import mindspore +from mindspore import Tensor, load_checkpoint, export from src.crnn import crnn from src.model_utils.moxing_adapter import moxing_wrapper from src.model_utils.config import config from src.model_utils.device_adapter import get_device_id -context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, save_graphs=False) +mindspore.set_context(mode=0, device_target=config.device_target, save_graphs=False) def modelarts_pre_process(): config.file_name = os.path.join(config.output_path, config.file_name) @@ -32,7 +32,7 @@ def modelarts_pre_process(): @moxing_wrapper(pre_process=modelarts_pre_process) def model_export(): if config.device_target == "Ascend": - context.set_context(device_id=get_device_id()) + mindspore.set_context(device_id=get_device_id()) config.batch_size = 1 net = crnn(config, full_precision=config.device_target != 'Ascend') @@ -40,7 +40,7 @@ def model_export(): load_checkpoint(config.ckpt_file, net=net) net.set_train(False) - input_data = Tensor(np.zeros([1, 3, config.image_height, 
config.image_width]), ms.float32) + input_data = Tensor(np.zeros([1, 3, config.image_height, config.image_width]), mindspore.float32) export(net, input_data, file_name=config.file_name, file_format=config.file_format) diff --git a/official/cv/CRNN/modelarts/start.py b/official/cv/CRNN/modelarts/start.py index 29826dcf56d44364513f143d5c410483d237b1af..0c271214db43b715eb2f0cd70144d1460d913609 100644 --- a/official/cv/CRNN/modelarts/start.py +++ b/official/cv/CRNN/modelarts/start.py @@ -19,11 +19,11 @@ import glob import shutil import numpy as np import mindspore.nn as nn -import mindspore as ms -from mindspore import context, Tensor, export +import mindspore +from mindspore import Tensor, export from mindspore.common import set_seed from mindspore.train.model import Model -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.nn.wrap import WithLossCell from mindspore.train.callback import TimeMonitor, LossMonitor, CheckpointConfig, ModelCheckpoint from mindspore.train.serialization import load_checkpoint @@ -40,7 +40,7 @@ from src.model_utils.device_adapter import get_rank_id, get_device_num, get_devi set_seed(1) -context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, save_graphs=False) +mindspore.set_context(mode=0, device_target=config.device_target, save_graphs=False) CKPT_OUTPUT_PATH = config.train_url CKPT_OUTPUT_FILE_PATH = os.path.join(CKPT_OUTPUT_PATH, 'ckpt_0') @@ -60,7 +60,7 @@ def modelarts_pre_process(): def train(): if config.device_target == 'Ascend': device_id = get_device_id() - context.set_context(device_id=device_id) + mindspore.set_context(device_id=device_id) if config.model_version == 'V1' and config.device_target != 'Ascend': raise ValueError("model version V1 is only supported on Ascend, pls check the config.") @@ -75,8 +75,8 @@ def train(): init() device_num = get_group_size() rank = get_rank() - context.reset_auto_parallel_context() - context.set_auto_parallel_context(device_num=device_num, + mindspore.reset_auto_parallel_context() + mindspore.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True) else: @@ -152,7 +152,7 @@ def model_trans(): net.set_train(False) input_data = Tensor( - np.zeros([1, 3, config.image_height, config.image_width]), ms.float32) + np.zeros([1, 3, config.image_height, config.image_width]), mindspore.float32) export(net, input_data, file_name='crnn', file_format='AIR') shutil.copy('crnn.air', CKPT_OUTPUT_PATH) diff --git a/official/cv/CRNN/src/crnn_for_train.py b/official/cv/CRNN/src/crnn_for_train.py index aedf04f2bd30f8e293509c08f7c16fa43ff3e9e7..cd2ad6629f73480dd0812f9ef0f8a26ecc9930e6 100644 --- a/official/cv/CRNN/src/crnn_for_train.py +++ b/official/cv/CRNN/src/crnn_for_train.py @@ -14,8 +14,8 @@ # ============================================================================ """Automatic differentiation with grad clip.""" import numpy as np -from mindspore import context -from mindspore.context import ParallelMode +import mindspore +from mindspore import ParallelMode from mindspore.common import dtype as mstype from mindspore.ops import composite as C from mindspore.ops import functional as F @@ -85,7 +85,7 @@ class TrainOneStepCellWithGradClip(Cell): self.cast = P.Cast() self.concat = P.Concat(axis=0) self.ten = Tensor(np.array([10.0]).astype(np.float32)) - parallel_mode = context.get_auto_parallel_context("parallel_mode") + parallel_mode = mindspore.get_auto_parallel_context("parallel_mode") if parallel_mode in 
(ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL): self.reducer_flag = True if self.reducer_flag: diff --git a/official/cv/CRNN/src/model_utils/moxing_adapter.py b/official/cv/CRNN/src/model_utils/moxing_adapter.py index c2d2282402b6a2950af74b66f282550aac75cb14..344dfc034e1e553b2b5da61517cdc4b179d34b1a 100644 --- a/official/cv/CRNN/src/model_utils/moxing_adapter.py +++ b/official/cv/CRNN/src/model_utils/moxing_adapter.py @@ -17,7 +17,7 @@ import os import functools -from mindspore import context +import mindspore from .config import config @@ -101,7 +101,7 @@ def moxing_wrapper(pre_process=None, post_process=None): sync_data(config.train_url, config.output_path) print('Workspace downloaded: ', os.listdir(config.output_path)) - context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) + mindspore.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) config.device_num = get_device_num() config.device_id = get_device_id() if not os.path.exists(config.output_path): diff --git a/official/cv/CRNN/train.py b/official/cv/CRNN/train.py index 5b08088554b8cacb068332ec4dbb327d0c1cfb05..a1dc2e0b82ff4e4199de7720cbf6f828eba3c438 100644 --- a/official/cv/CRNN/train.py +++ b/official/cv/CRNN/train.py @@ -14,11 +14,11 @@ # ============================================================================ """crnn training""" import os +import mindspore import mindspore.nn as nn -from mindspore import context from mindspore.common import set_seed from mindspore.train.model import Model -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.nn.wrap import WithLossCell from mindspore.train.callback import CheckpointConfig, ModelCheckpoint from mindspore.communication.management import init, get_group_size, get_rank @@ -39,7 +39,7 @@ from src.model_utils.lr_scheduler import cosine_decay_lr_with_start_step set_seed(1) -context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, save_graphs=False) +mindspore.set_context(mode=0, device_target=config.device_target, save_graphs=False) def apply_eval(eval_param): @@ -69,7 +69,7 @@ def train(): if config.device_target == 'Ascend': device_id = get_device_id() - context.set_context(device_id=device_id) + mindspore.set_context(device_id=device_id) if config.model_version == 'V1' and config.device_target != 'Ascend': raise ValueError("model version V1 is only supported on Ascend, pls check the config.") @@ -86,8 +86,8 @@ def train(): # lr_scale = 1 device_num = get_group_size() rank = get_rank() - context.reset_auto_parallel_context() - context.set_auto_parallel_context(device_num=device_num, + mindspore.reset_auto_parallel_context() + mindspore.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True) else: diff --git a/official/cv/CTPN/README.md b/official/cv/CTPN/README.md index a4b601bb0aa0bf9f247414691b02304ad37867dd..47be063e7836a0daaf09b57dfed71dd20e3e356f 100644 --- a/official/cv/CTPN/README.md +++ b/official/cv/CTPN/README.md @@ -282,7 +282,7 @@ ICDAR2013, SCUT-FORU to improve precision and recall, and when doing Finetune, w ### Result -Training result will be stored in the example path. Checkpoints will be stored at `ckpt_path` by default, and training log will be redirected to `./log`, also the loss will be redirected to `./loss_0.log` like followings. +Training result will be stored in the example path. 
Checkpoints will be stored at `ckpt_path` by default, the training log will be redirected to `./log`, and the loss will also be redirected to `./loss_0.log`, like the following. ```python 377 epoch: 1 step: 229 ,rpn_loss: 0.00355 @@ -391,7 +391,7 @@ You can add `run_eval` to start shell and set it True, if you want evaluation wh ### Result -Evaluation result will be stored in the example path, you can find result like the followings in `log`. +Evaluation results will be stored in the example path; you can find results like the following in `log`. ```text {"precision": 0.90791, "recall": 0.86118, "hmean": 0.88393} @@ -547,7 +547,7 @@ bash eval_res.sh ### Result -Evaluation result will be stored in the example path, you can find result like the followings in `log`. +Evaluation results will be stored in the example path; you can find results like the following in `log`. ```text {"precision": 0.88913, "recall": 0.86082, "hmean": 0.87475} diff --git a/official/cv/CTPN/eval.py b/official/cv/CTPN/eval.py index 47af42290bc6c399577e46449f431209d88f9743..e45ce18407f0dcebcad974e6e5c3563a8a026ec0 100644 --- a/official/cv/CTPN/eval.py +++ b/official/cv/CTPN/eval.py @@ -15,7 +15,7 @@ """Evaluation for CTPN""" import os -from mindspore import context +import mindspore from mindspore.train.serialization import load_checkpoint, load_param_into_net from mindspore.common import set_seed from src.ctpn import CTPN @@ -28,7 +28,7 @@ from src.model_utils.device_adapter import get_device_id set_seed(1) -context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, device_id=get_device_id()) +mindspore.set_context(mode=0, device_target=config.device_target, device_id=get_device_id()) def modelarts_pre_process(): pass diff --git a/official/cv/CTPN/export.py b/official/cv/CTPN/export.py index ebd87ce0a1af19dc8ea1dc44da4fd2a213c44fb7..9c42196a3b40480a9bbc6c57998a6ca02d691835 100644 --- a/official/cv/CTPN/export.py +++ b/official/cv/CTPN/export.py @@ -15,18 +15,18 @@ """export checkpoint file into air, onnx, mindir models""" import os import numpy as np -import mindspore as ms -from mindspore import Tensor, load_checkpoint, load_param_into_net, export, context +import mindspore +from mindspore import Tensor, load_checkpoint, load_param_into_net, export from src.ctpn import CTPN_Infer from src.model_utils.config import config from src.model_utils.moxing_adapter import moxing_wrapper -context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target) +mindspore.set_context(mode=0, device_target=config.device_target) if config.device_target == "Ascend": - context.set_context(device_id=config.device_id) + mindspore.set_context(device_id=config.device_id) def modelarts_pre_process(): @@ -50,7 +50,7 @@ def model_export(): load_param_into_net(net, param_dict_new) - img = Tensor(np.zeros([config.test_batch_size, 3, config.img_height, config.img_width]), ms.float16) + img = Tensor(np.zeros([config.test_batch_size, 3, config.img_height, config.img_width]), mindspore.float16) export(net, img, file_name=config.file_name, file_format=config.file_format) diff --git a/official/cv/CTPN/src/CTPN/bbox_assign_sample.py b/official/cv/CTPN/src/CTPN/bbox_assign_sample.py index 93d348c7b136ecd9709f6206fa445ccee2789939..94f47c12ddad25559bf4ce60c7f668dd2118d8bd 100644 --- a/official/cv/CTPN/src/CTPN/bbox_assign_sample.py +++ b/official/cv/CTPN/src/CTPN/bbox_assign_sample.py @@ -15,15 +15,15 @@ """CTPN positive and negative sample screening for RPN.""" import numpy as np +import mindspore import mindspore.nn as nn -from
mindspore import context from mindspore.ops import operations as P from mindspore.common.tensor import Tensor import mindspore.common.dtype as mstype from src.CTPN.BoundingBoxEncode import BoundingBoxEncode -if context.get_context("device_target") == "Ascend": +if mindspore.get_context("device_target") == "Ascend": mtype = mstype.float16 nptype = np.float16 else: diff --git a/official/cv/CTPN/src/CTPN/proposal_generator.py b/official/cv/CTPN/src/CTPN/proposal_generator.py index f1dd795488543053aaff2de8aa0e270a98baa3b0..78a3d9aeb91e85a9b5c2c1876b7b6fb8fabb6043 100644 --- a/official/cv/CTPN/src/CTPN/proposal_generator.py +++ b/official/cv/CTPN/src/CTPN/proposal_generator.py @@ -15,14 +15,15 @@ """CTPN proposal generator.""" import numpy as np +import mindspore import mindspore.nn as nn import mindspore.common.dtype as mstype from mindspore.ops import operations as P -from mindspore import Tensor, context +from mindspore import Tensor from src.CTPN.BoundingBoxDecode import BoundingBoxDecode -if context.get_context("device_target") == "Ascend": +if mindspore.get_context("device_target") == "Ascend": mtype = mstype.float16 nptype = np.float16 else: diff --git a/official/cv/CTPN/src/CTPN/rpn.py b/official/cv/CTPN/src/CTPN/rpn.py index 903c182076cd8dea9da2dcb80e822370bcc49b2e..1eca50b499448e6e36a88ecf92b1c6aaec9d6912 100644 --- a/official/cv/CTPN/src/CTPN/rpn.py +++ b/official/cv/CTPN/src/CTPN/rpn.py @@ -14,15 +14,16 @@ # ============================================================================ """RPN for fasterRCNN""" import numpy as np +import mindspore import mindspore.nn as nn import mindspore.common.dtype as mstype from mindspore.ops import operations as P -from mindspore import Tensor, context +from mindspore import Tensor from mindspore.ops import functional as F from src.CTPN.bbox_assign_sample import BboxAssignSample -if context.get_context("device_target") == "Ascend": +if mindspore.get_context("device_target") == "Ascend": mtype = mstype.float16 nptype = np.float16 else: diff --git a/official/cv/CTPN/src/ctpn.py b/official/cv/CTPN/src/ctpn.py index 5ae25fa22c4120d8bf3e2a402049306d820c3fc3..d0e90977f1032fa96fceb4dd780c8f648a107ed4 100644 --- a/official/cv/CTPN/src/ctpn.py +++ b/official/cv/CTPN/src/ctpn.py @@ -15,7 +15,7 @@ """CPTN network definition.""" import numpy as np -from mindspore import context +import mindspore import mindspore.nn as nn from mindspore import Tensor, Parameter from mindspore.common import dtype as mstype @@ -26,7 +26,7 @@ from src.CTPN.proposal_generator import Proposal from src.CTPN.vgg16 import VGG16FeatureExtraction from src.weight_init import lstm_default_state -if context.get_context("device_target") == "Ascend": +if mindspore.get_context("device_target") == "Ascend": mtype = mstype.float16 nptype = np.float16 else: @@ -113,7 +113,7 @@ class CTPN(nn.Cell): self.transpose = P.Transpose() self.cast = P.Cast() self.is_training = is_training - self.device_target = context.get_context("device_target") + self.device_target = mindspore.get_context("device_target") # rpn block self.rpn_with_loss = RPN(config, diff --git a/official/cv/CTPN/src/model_utils/moxing_adapter.py b/official/cv/CTPN/src/model_utils/moxing_adapter.py index c2d2282402b6a2950af74b66f282550aac75cb14..344dfc034e1e553b2b5da61517cdc4b179d34b1a 100644 --- a/official/cv/CTPN/src/model_utils/moxing_adapter.py +++ b/official/cv/CTPN/src/model_utils/moxing_adapter.py @@ -17,7 +17,7 @@ import os import functools -from mindspore import context +import mindspore from .config import config @@ -101,7 
+101,7 @@ def moxing_wrapper(pre_process=None, post_process=None): sync_data(config.train_url, config.output_path) print('Workspace downloaded: ', os.listdir(config.output_path)) - context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) + mindspore.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) config.device_num = get_device_num() config.device_id = get_device_id() if not os.path.exists(config.output_path): diff --git a/official/cv/CTPN/src/text_connector/connect_text_lines.py b/official/cv/CTPN/src/text_connector/connect_text_lines.py index 171beca9a7856a5922011a44f050d2e4e3554403..87fa238921b20b980fff8db5ba4d07022f1416c8 100644 --- a/official/cv/CTPN/src/text_connector/connect_text_lines.py +++ b/official/cv/CTPN/src/text_connector/connect_text_lines.py @@ -52,7 +52,7 @@ def connect_text_lines(text_proposals, scores, size): text_lines = clip_boxes(text_lines, size) - text_recs = np.zeros((len(text_lines), 9), np.float) + text_recs = np.zeros((len(text_lines), 9), np.float_) index = 0 for line in text_lines: xmin, ymin, xmax, ymax = line[0], line[1], line[2], line[3] diff --git a/official/cv/CTPN/src/text_connector/detector.py b/official/cv/CTPN/src/text_connector/detector.py index 7e5d724d440bc0cd8d6eab816bc053b51c30a8b6..707876a6773f837102b034cd02ffd6f0e26d8796 100644 --- a/official/cv/CTPN/src/text_connector/detector.py +++ b/official/cv/CTPN/src/text_connector/detector.py @@ -44,9 +44,9 @@ def filter_boxes(boxes): Returns: boxes(numpy.array): Text boxes after filter. """ - heights = np.zeros((len(boxes), 1), np.float) - widths = np.zeros((len(boxes), 1), np.float) - scores = np.zeros((len(boxes), 1), np.float) + heights = np.zeros((len(boxes), 1), np.float_) + widths = np.zeros((len(boxes), 1), np.float_) + scores = np.zeros((len(boxes), 1), np.float_) index = 0 for box in boxes: widths[index] = abs(box[2] - box[0]) diff --git a/official/cv/CTPN/train.py b/official/cv/CTPN/train.py index 7e932179fb7f532260cf46e23b85210e609aa8a9..f52df437b6f7c905cbca953374231db594457e51 100644 --- a/official/cv/CTPN/train.py +++ b/official/cv/CTPN/train.py @@ -18,12 +18,13 @@ import os import ast import operator import numpy as np +import mindspore import mindspore.common.dtype as mstype -from mindspore import context, Tensor, Parameter +from mindspore import Tensor, Parameter from mindspore.communication.management import init, get_group_size, get_rank from mindspore.train.callback import CheckpointConfig, ModelCheckpoint, TimeMonitor from mindspore.train import Model -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.train.serialization import load_checkpoint, load_param_into_net from mindspore.nn import Momentum from mindspore.common import set_seed @@ -41,11 +42,11 @@ from src.model_utils.device_adapter import get_device_id set_seed(1) -context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, device_id=get_device_id()) +mindspore.set_context(mode=0, device_target=config.device_target, device_id=get_device_id()) # Set mempool block size in PYNATIVE_MODE for improving memory utilization, which will not take effect in GRAPH_MODE -if context.get_context("mode") == context.PYNATIVE_MODE: - context.set_context(mempool_block_size="20GB") +if mindspore.get_context("mode") == 1: + mindspore.set_context(mempool_block_size="20GB") binOps = { ast.Add: operator.add, @@ -95,10 +96,10 @@ def train(): config.weight_decay = arithmeticeval(config.weight_decay) if 
config.run_distribute: init() - context.reset_auto_parallel_context() + mindspore.reset_auto_parallel_context() rank = get_rank() device_num = get_group_size() - context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, + mindspore.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True) else: rank = 0 diff --git a/official/cv/CycleGAN/eval.py b/official/cv/CycleGAN/eval.py index 8135ccdbde1af03803c3fd06d4204ccecd811536..5a0dcc1c164ab644842b11ca00d4096536703004 100644 --- a/official/cv/CycleGAN/eval.py +++ b/official/cv/CycleGAN/eval.py @@ -16,7 +16,7 @@ """Cycle GAN test.""" import os -import mindspore as ms +import mindspore from src.models.cycle_gan import get_generator from src.utils.args import get_args from src.dataset.cyclegan_dataset import create_dataset @@ -27,12 +27,12 @@ from src.utils.tools import save_image, load_ckpt def predict(): """Predict function.""" args = get_args("predict") - ms.set_context(mode=ms.GRAPH_MODE, device_target=args.platform, - save_graphs=args.save_graphs, device_id=args.device_id) + mindspore.set_context(mode=0, device_target=args.platform, save_graphs=args.save_graphs, device_id=args.device_id, + jit_config={"jit_level": "O2"}) args.rank = 0 args.device_num = 1 if args.platform == "GPU": - ms.set_context(enable_graph_kernel=True) + mindspore.set_context(enable_graph_kernel=True) G_A = get_generator(args) G_B = get_generator(args) G_A.set_train(True) @@ -50,7 +50,7 @@ def predict(): reporter = Reporter(args) reporter.start_predict("A to B") for data in ds.create_dict_iterator(output_numpy=True): - img_A = ms.Tensor(data["image"]) + img_A = mindspore.Tensor(data["image"]) path_A = data["image_name"][0] path_B = path_A[0:-4] + "_fake_B.jpg" fake_B = G_A(img_A) @@ -63,7 +63,7 @@ def predict(): reporter.dataset_size = args.dataset_size reporter.start_predict("B to A") for data in ds.create_dict_iterator(output_numpy=True): - img_B = ms.Tensor(data["image"]) + img_B = mindspore.Tensor(data["image"]) path_B = data["image_name"][0] path_A = path_B[0:-4] + "_fake_A.jpg" fake_A = G_B(img_B) diff --git a/official/cv/CycleGAN/export.py b/official/cv/CycleGAN/export.py index 66d1c52e78046a44ef5f066a7957fbe255806b68..6da610d1deb94d24f79e689598b3552cee86c9e4 100644 --- a/official/cv/CycleGAN/export.py +++ b/official/cv/CycleGAN/export.py @@ -16,7 +16,7 @@ """export file.""" import numpy as np -import mindspore as ms +import mindspore from src.models.cycle_gan import get_generator from src.utils.args import get_args from src.utils.tools import load_ckpt, enable_batch_statistics @@ -24,7 +24,7 @@ from src.utils.tools import load_ckpt, enable_batch_statistics if __name__ == '__main__': args = get_args("export") - ms.set_context(mode=ms.GRAPH_MODE, device_target=args.platform) + mindspore.set_context(mode=0, device_target=args.platform) G_A = get_generator(args) G_B = get_generator(args) # Use BatchNorm2d with batchsize=1, affine=False, use_batch_statistics=True instead of InstanceNorm2d @@ -34,8 +34,8 @@ if __name__ == '__main__': load_ckpt(args, G_A, G_B) input_shp = [args.export_batch_size, 3, args.image_size, args.image_size] - input_array = ms.Tensor(np.random.uniform(-1.0, 1.0, size=input_shp).astype(np.float32)) + input_array = mindspore.Tensor(np.random.uniform(-1.0, 1.0, size=input_shp).astype(np.float32)) G_A_file = f"{args.export_file_name}_AtoB" - ms.export(G_A, input_array, file_name=G_A_file, file_format=args.export_file_format) + mindspore.export(G_A, 
input_array, file_name=G_A_file, file_format=args.export_file_format) G_B_file = f"{args.export_file_name}_BtoA" - ms.export(G_B, input_array, file_name=G_B_file, file_format=args.export_file_format) + mindspore.export(G_B, input_array, file_name=G_B_file, file_format=args.export_file_format) diff --git a/official/cv/CycleGAN/src/models/cycle_gan.py b/official/cv/CycleGAN/src/models/cycle_gan.py index 521bf800e07cf9487627163f10fefc5ceaf4c5f9..762761f1bdc0e081f2c7ea5c302c4778d19c2966 100644 --- a/official/cv/CycleGAN/src/models/cycle_gan.py +++ b/official/cv/CycleGAN/src/models/cycle_gan.py @@ -15,10 +15,9 @@ """Cycle GAN network.""" -import mindspore as ms +import mindspore import mindspore.nn as nn -from mindspore import context -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.parallel._auto_parallel_context import auto_parallel_context from mindspore.communication.management import get_group_size import mindspore.ops as ops @@ -176,17 +175,17 @@ class TrainOneStepG(nn.Cell): self.G.D_B.set_train(False) self.grad = ops.GradOperation(get_by_list=True, sens_param=True) self.sens = sens - self.weights = ms.ParameterTuple(generator.trainable_params()) + self.weights = mindspore.ParameterTuple(generator.trainable_params()) self.net = WithLossCell(G) self.reducer_flag = False self.grad_reducer = None - self.parallel_mode = context.get_auto_parallel_context("parallel_mode") + self.parallel_mode = mindspore.get_auto_parallel_context("parallel_mode") if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]: self.reducer_flag = True if self.reducer_flag: - mean = context.get_auto_parallel_context("gradients_mean") + mean = mindspore.get_auto_parallel_context("gradients_mean") if auto_parallel_context().get_device_num_is_set(): - degree = context.get_auto_parallel_context("device_num") + degree = mindspore.get_auto_parallel_context("device_num") else: degree = get_group_size() self.grad_reducer = nn.DistributedGradReducer(optimizer.parameters, mean, degree) @@ -224,16 +223,16 @@ class TrainOneStepD(nn.Cell): self.D.set_train() self.grad = ops.GradOperation(get_by_list=True, sens_param=True) self.sens = sens - self.weights = ms.ParameterTuple(D.trainable_params()) + self.weights = mindspore.ParameterTuple(D.trainable_params()) self.reducer_flag = False self.grad_reducer = None - self.parallel_mode = context.get_auto_parallel_context("parallel_mode") + self.parallel_mode = mindspore.get_auto_parallel_context("parallel_mode") if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]: self.reducer_flag = True if self.reducer_flag: - mean = context.get_auto_parallel_context("gradients_mean") + mean = mindspore.get_auto_parallel_context("gradients_mean") if auto_parallel_context().get_device_num_is_set(): - degree = context.get_auto_parallel_context("device_num") + degree = mindspore.get_auto_parallel_context("device_num") else: degree = get_group_size() self.grad_reducer = nn.DistributedGradReducer(optimizer.parameters, mean, degree) diff --git a/official/cv/CycleGAN/train.py b/official/cv/CycleGAN/train.py index d777ec9ce447d5363de961ffe2f1f3b97bcbb247..a81ae1604b06cee87cabc1706f51357440570b6a 100644 --- a/official/cv/CycleGAN/train.py +++ b/official/cv/CycleGAN/train.py @@ -20,7 +20,7 @@ Example: python train.py --dataroot ./data/horse2zebra --model ResNet """ -import mindspore as ms +import mindspore import mindspore.nn as nn from mindspore.communication.management import init, get_rank, get_group_size 
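Note on the recurring pattern in these hunks: the deprecated `mindspore.context` module and the `ms` import alias are replaced by top-level `mindspore` functions, graph mode is passed as the literal `0` (`GRAPH_MODE == 0`, `PYNATIVE_MODE == 1`), and `ParallelMode` is imported straight from the `mindspore` package. A minimal sketch of the new-style calls, assuming MindSpore 2.x on a CPU target (the `jit_config` argument mirrors the one these hunks add):

```python
# Minimal sketch of the context-API migration applied throughout this patch.
# Assumes MindSpore 2.x; GRAPH_MODE == 0 and PYNATIVE_MODE == 1.
import mindspore
from mindspore import ParallelMode  # was: from mindspore.context import ParallelMode

# was: context.set_context(mode=context.GRAPH_MODE, device_target=...)
mindspore.set_context(mode=0, device_target="CPU", jit_config={"jit_level": "O2"})
assert mindspore.get_context("mode") == 0   # graph mode
print(ParallelMode.DATA_PARALLEL)           # parallel modes now live at top level
```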
from src.utils.args import get_args @@ -30,26 +30,27 @@ from src.dataset.cyclegan_dataset import create_dataset from src.models.losses import DiscriminatorLoss, GeneratorLoss from src.models.cycle_gan import get_generator, get_discriminator, Generator, TrainOneStepG, TrainOneStepD -ms.set_seed(1) +mindspore.set_seed(1) def train(): """Train function.""" args = get_args("train") if args.device_num > 1: - ms.set_context(mode=ms.GRAPH_MODE, device_target=args.platform, save_graphs=args.save_graphs) + mindspore.set_context(mode=0, device_target=args.platform, save_graphs=args.save_graphs, + jit_config={"jit_level": "O2"}) init() - ms.reset_auto_parallel_context() - ms.set_auto_parallel_context(parallel_mode=ms.ParallelMode.DATA_PARALLEL, gradients_mean=True) + mindspore.reset_auto_parallel_context() + mindspore.set_auto_parallel_context(parallel_mode=mindspore.ParallelMode.DATA_PARALLEL, gradients_mean=True) args.rank = get_rank() args.group_size = get_group_size() else: - ms.set_context(mode=ms.GRAPH_MODE, device_target=args.platform, - save_graphs=args.save_graphs, device_id=args.device_id) + mindspore.set_context(mode=0, device_target=args.platform, save_graphs=args.save_graphs, + device_id=args.device_id, jit_config={"jit_level": "O2"}) args.rank = 0 args.device_num = 1 if args.platform == "GPU": - ms.set_context(enable_graph_kernel=True) + mindspore.set_context(enable_graph_kernel=True) if args.need_profiler: from mindspore.profiler.profiling import Profiler profiler = Profiler(output_path=args.outputs_dir, is_detail=True, is_show_op_path=True) diff --git a/official/cv/DBNet/eval.py b/official/cv/DBNet/eval.py index 1147f93126d5c16b9e7352761e7337d339003344..1c897109328186b58fcb148a4b2cc1b2ce09fc26 100644 --- a/official/cv/DBNet/eval.py +++ b/official/cv/DBNet/eval.py @@ -16,7 +16,7 @@ import os import sys -import mindspore as ms +import mindspore from src.datasets.load import create_dataset from src.utils.eval_utils import WithEval @@ -59,7 +59,7 @@ def evaluate(cfg, path): eval_net = WithEval(eval_net, cfg) eval_net.model.set_train(False) cfg.logger.info(f"infer {p}") - ms.load_checkpoint(p, eval_net.model) + mindspore.load_checkpoint(p, eval_net.model) metrics, fps = eval_net.eval(val_dataset, show_imgs=cfg.eval.show_images) params = sum([param.size for param in eval_net.model.get_parameters()]) / (1024 ** 2) cfg.logger.info(f"Param: {params} M") diff --git a/official/cv/DBNet/export.py b/official/cv/DBNet/export.py index 926d5c3aa5f81dd4fad1252fcf04c65271610300..6bc848e215ae305cf92ab5512458437b73b4e799 100644 --- a/official/cv/DBNet/export.py +++ b/official/cv/DBNet/export.py @@ -16,7 +16,7 @@ import os import sys -import mindspore as ms +import mindspore from src.utils.env import init_env from src.modules.model import get_dbnet @@ -32,14 +32,14 @@ def export(): init_env(config) config.backbone.pretrained = False eval_net = get_dbnet(config.net, config, isTrain=False) - ms.load_checkpoint(config.ckpt_path, eval_net) + mindspore.load_checkpoint(config.ckpt_path, eval_net) eval_net.set_train(False) if not config.dataset.offload: - inp = ms.ops.ones((1, 3, *config.eval.eval_size), ms.float32) + inp = mindspore.ops.ones((1, 3, *config.eval.eval_size), mindspore.float32) else: - inp = ms.ops.ones((1, *config.eval.eval_size, 3), ms.float32) + inp = mindspore.ops.ones((1, *config.eval.eval_size, 3), mindspore.float32) file_name = config.net + '_' + config.backbone.initializer - ms.export(eval_net, inp, file_name=file_name, file_format='MINDIR') + mindspore.export(eval_net, inp, 
file_name=file_name, file_format='MINDIR') print("MINDIR saved at", file_name+".mindir") diff --git a/official/cv/DBNet/src/model_utils/moxing_adapter.py b/official/cv/DBNet/src/model_utils/moxing_adapter.py index ed75cf910663ec2f6a455eb8b7d15e31f1ddfeb7..6e96d5c085aac2eaeca41930e67dd42b1f0c49ca 100644 --- a/official/cv/DBNet/src/model_utils/moxing_adapter.py +++ b/official/cv/DBNet/src/model_utils/moxing_adapter.py @@ -17,7 +17,7 @@ import os import functools -from mindspore import context +import mindspore from .config import config _global_sync_count = 0 @@ -116,7 +116,7 @@ def moxing_wrapper(pre_process=None, post_process=None): config.eval.gt_dir = os.path.join(config.data_path, "test_gts") config.backbone.backbone_ckpt = os.path.join(config.data_path, config.backbone.backbone_ckpt) - context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) + mindspore.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) config.device_num = get_device_num() config.device_id = get_device_id() if not os.path.exists(config.output_dir): diff --git a/official/cv/DBNet/src/modules/backbone/__init__.py b/official/cv/DBNet/src/modules/backbone/__init__.py index 415403f763341455a41f3e7640aceee312c2ea52..02eb50367e2b636b817c5e1236e15be155b31d3b 100644 --- a/official/cv/DBNet/src/modules/backbone/__init__.py +++ b/official/cv/DBNet/src/modules/backbone/__init__.py @@ -13,7 +13,7 @@ # limitations under the License. # ============================================================================ -import mindspore as ms +import mindspore from .resnet import ResNet, Bottleneck, BasicBlock from .mobilenetv3 import MobileNetV3 @@ -21,21 +21,21 @@ from .mobilenetv3 import MobileNetV3 def mobilenetv3(pretrained=True, backbone_ckpt=None, **kwargs): model = MobileNetV3(**kwargs) if pretrained: - ms.load_checkpoint(backbone_ckpt, model) + mindspore.load_checkpoint(backbone_ckpt, model) return model def resnet18(pretrained=True, backbone_ckpt=None, **kwargs): model = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs) if pretrained: - ms.load_checkpoint(backbone_ckpt, model) + mindspore.load_checkpoint(backbone_ckpt, model) return model def deformable_resnet18(pretrained=True, backbone_ckpt=None, **kwargs): model = ResNet(BasicBlock, [2, 2, 2, 2], dcn=True, **kwargs) if pretrained: - ms.load_checkpoint(backbone_ckpt, model) + mindspore.load_checkpoint(backbone_ckpt, model) return model @@ -46,7 +46,7 @@ def resnet50(pretrained=True, backbone_ckpt=None, **kwargs): """ model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs) if pretrained: - ms.load_checkpoint(backbone_ckpt, model) + mindspore.load_checkpoint(backbone_ckpt, model) return model @@ -57,7 +57,7 @@ def deformable_resnet50(pretrained=True, backbone_ckpt=None, **kwargs): """ model = ResNet(Bottleneck, [3, 4, 6, 3], dcn=True, **kwargs) if pretrained: - ms.load_checkpoint(backbone_ckpt, model) + mindspore.load_checkpoint(backbone_ckpt, model) return model diff --git a/official/cv/DBNet/src/modules/data_offload.py b/official/cv/DBNet/src/modules/data_offload.py index 3dc8d19c56820d1059b008d5f02a507427466db5..2fb4e359da2b07f6aa6faf9fe6aa5b85fda156c5 100644 --- a/official/cv/DBNet/src/modules/data_offload.py +++ b/official/cv/DBNet/src/modules/data_offload.py @@ -13,7 +13,7 @@ # limitations under the License. 
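The DBNet `export.py` hunk above condenses the export recipe these models share: restore a checkpoint into the eval network, put it in inference mode, then trace it with a dummy input whose shape gets baked into the exported file. A toy sketch of that flow, assuming MindSpore 2.x on CPU; the `nn.Dense` net and file name are hypothetical stand-ins, not the real DBNet graph:

```python
# Toy sketch of the load-then-export flow from the hunks above (hypothetical
# stand-in network; a real run would call mindspore.load_checkpoint first).
import numpy as np
import mindspore
from mindspore import Tensor, nn

mindspore.set_context(mode=0, device_target="CPU")

net = nn.Dense(3, 2)                          # stand-in for the eval network
net.set_train(False)                          # export in inference mode
dummy = Tensor(np.zeros((1, 3), np.float32))  # input shape fixed at export time
mindspore.export(net, dummy, file_name="toy_net", file_format="MINDIR")
print("MINDIR saved at toy_net.mindir")
```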
# ============================================================================ """Supported dataset operations applied on devices""" -import mindspore as ms +import mindspore from mindspore import ops, nn from mindspore.dataset.engine.offload import RandomColorAdjust @@ -24,8 +24,8 @@ class Normalize(nn.Cell): """ def __init__(self, mean, std): super(Normalize, self).__init__(auto_prefix=False) - self.mean = ms.Tensor(mean, ms.float32) - self.std = ms.Tensor(std, ms.float32) + self.mean = mindspore.Tensor(mean, mindspore.float32) + self.std = mindspore.Tensor(std, mindspore.float32) def construct(self, img): img = (img - self.mean.reshape((1, 1, 1, -1))) / self.std.reshape((1, 1, 1, -1)) diff --git a/official/cv/DBNet/src/modules/loss.py b/official/cv/DBNet/src/modules/loss.py index 7da24573470d72b9408b1beec54b684125365634..b6afa69a2dfcbafe63b5c1d7362e8201f74b7e2d 100644 --- a/official/cv/DBNet/src/modules/loss.py +++ b/official/cv/DBNet/src/modules/loss.py @@ -16,7 +16,7 @@ """Loss functions.""" from mindspore import nn, ops -import mindspore as ms +import mindspore import mindspore.numpy as mnp @@ -140,15 +140,15 @@ class BalanceCrossEntropyLoss(nn.LossBase): pred = pred.squeeze(axis=1) gt = gt.squeeze(axis=1) - pos = (gt * mask).astype(ms.float32) - neg = ((1 - gt) * mask).astype(ms.float32) + pos = (gt * mask).astype(mindspore.float32) + neg = ((1 - gt) * mask).astype(mindspore.float32) - positive_count = pos.sum(axis=(1, 2), keepdims=True).astype(ms.int32) - negative_count = neg.sum(axis=(1, 2), keepdims=True).astype(ms.int32) + positive_count = pos.sum(axis=(1, 2), keepdims=True).astype(mindspore.int32) + negative_count = neg.sum(axis=(1, 2), keepdims=True).astype(mindspore.int32) negative_count = self.min(negative_count, positive_count * self.negative_ratio).squeeze(axis=(1, 2)) - loss = self.bceloss(pred.astype(ms.float32), gt.astype(ms.float32)) + loss = self.bceloss(pred.astype(mindspore.float32), gt.astype(mindspore.float32)) positive_loss = loss * pos N = loss.shape[0] @@ -159,13 +159,13 @@ class BalanceCrossEntropyLoss(nn.LossBase): neg_index = self.stack((batch_iter, negative_count)) min_neg_score = self.unsqueeze(self.gather(negative_value, neg_index), 1) - masked_neg_loss = self.cast(negative_loss >= min_neg_score, ms.float32) # filter out losses less than topk loss. + masked_neg_loss = self.cast(negative_loss >= min_neg_score, mindspore.float32) # filter out losses less than topk loss. 
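For readers tracing the `BalanceCrossEntropyLoss` hunk in progress here: it performs hard negative mining, keeping every positive-pixel loss but only the largest negative losses, capped at `negative_ratio` times the positive count (the cast and `stop_gradient` lines below build that top-k mask). A NumPy sketch of the selection rule, illustrative only, since the real cell runs on Tensors inside a graph:

```python
# NumPy sketch of the balanced-BCE negative selection shown in this hunk
# (illustrative; negative_ratio=3.0 is an assumed default, not taken verbatim).
import numpy as np

def balance_bce(loss, gt, mask, negative_ratio=3.0, eps=1e-6):
    pos = (gt * mask).astype(np.float32)        # positive (text) pixels
    neg = ((1 - gt) * mask).astype(np.float32)  # candidate negative pixels
    n_pos = int(pos.sum())
    n_neg = min(int(neg.sum()), int(n_pos * negative_ratio))
    hardest = np.sort((loss * neg).ravel())[::-1][:n_neg]  # keep top-k negatives
    return (float((loss * pos).sum()) + float(hardest.sum())) / (n_pos + n_neg + eps)

loss = np.random.rand(4, 4).astype(np.float32)        # per-pixel BCE values
gt = (np.random.rand(4, 4) > 0.7).astype(np.float32)  # binary text mask
print(balance_bce(loss, gt, mask=np.ones((4, 4), np.float32)))
```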
masked_neg_loss = ops.stop_gradient(masked_neg_loss) masked_neg_loss = masked_neg_loss * negative_loss balance_loss = (positive_loss.sum() + masked_neg_loss.sum()) / \ - ((positive_count + negative_count).astype(ms.float32).sum() + self.eps) + ((positive_count + negative_count).astype(mindspore.float32).sum() + self.eps) return balance_loss diff --git a/official/cv/DBNet/src/utils/callback.py b/official/cv/DBNet/src/utils/callback.py index 55c45ade8d9f573d1cb52981d1530008aa0f4eb6..5ea153a662d253d7807c6c5ae9c0a5b7a595f3b9 100644 --- a/official/cv/DBNet/src/utils/callback.py +++ b/official/cv/DBNet/src/utils/callback.py @@ -17,7 +17,7 @@ import os import time import numpy as np -import mindspore as ms +import mindspore from mindspore.train.callback import Callback from src.datasets.load import create_dataset @@ -100,10 +100,10 @@ class DBNetMonitor(Callback): def handle_loss(self, net_outputs): """Handle loss""" if isinstance(net_outputs, (tuple, list)): - if isinstance(net_outputs[0], ms.Tensor) and isinstance(net_outputs[0].asnumpy(), np.ndarray): + if isinstance(net_outputs[0], mindspore.Tensor) and isinstance(net_outputs[0].asnumpy(), np.ndarray): loss = net_outputs[0].asnumpy() - elif isinstance(net_outputs, ms.Tensor) and isinstance(net_outputs.asnumpy(), np.ndarray): + elif isinstance(net_outputs, mindspore.Tensor) and isinstance(net_outputs.asnumpy(), np.ndarray): loss = float(np.mean(net_outputs.asnumpy())) return loss @@ -196,11 +196,11 @@ class DBNetMonitor(Callback): f'best fmeasure is: {cur_f:.2f}, ' f'e2e cost: {time.time() - self.train_start:.2f} s, ' f'current train time: {sum(self.train_time_list):.2f} s') - if ms.context.get_context("enable_ge"): + if mindspore.get_context("enable_ge"): from mindspore.train.callback import _set_cur_net _set_cur_net(cb_params.train_network) cb_params.train_network.exec_checkpoint_graph() - ms.save_checkpoint(self.eval_net.model, + mindspore.save_checkpoint(self.eval_net.model, os.path.join(self.save_ckpt_dir, f"best_rank{self.config.rank_id}.ckpt")) self.max_f = cur_f if self.early_stop and isinstance(self.stop_value, dict) and self.stop_value: @@ -213,7 +213,7 @@ class DBNetMonitor(Callback): f"best recall: {metrics['recall'].avg:.2f}, " f"precision: {metrics['precision'].avg:.2f}, " f"fmeasure: {metrics['fmeasure'].avg:.2f}") - ms.save_checkpoint(self.eval_net.model, + mindspore.save_checkpoint(self.eval_net.model, os.path.join(self.save_ckpt_dir, f"best_rank{self.config.rank_id}.ckpt")) run_context.request_stop() e2e_time = time.time() - self.epoch_start_time diff --git a/official/cv/DBNet/src/utils/env.py b/official/cv/DBNet/src/utils/env.py index c633c102e8d887cdf27914440fef160e9acf186f..79822ec3da9b215f3e7e4758b974d19b9511bba3 100644 --- a/official/cv/DBNet/src/utils/env.py +++ b/official/cv/DBNet/src/utils/env.py @@ -15,29 +15,29 @@ """Environ setting.""" import os import cv2 -import mindspore as ms +import mindspore from mindspore.communication.management import init, get_rank, get_group_size def init_env(cfg): os.environ["OPENBLAS_NUM_THREADS"] = "1" cv2.setNumThreads(2) - ms.set_seed(cfg.seed) + mindspore.set_seed(cfg.seed) if cfg.device_target != "None": if cfg.device_target not in ["Ascend", "GPU", "CPU"]: raise ValueError(f"Invalid device_target: {cfg.device_target}, " f"should be in ['None', 'Ascend', 'GPU', 'CPU']") - ms.set_context(device_target=cfg.device_target) + mindspore.set_context(device_target=cfg.device_target) if cfg.context_mode not in ["graph", "pynative"]: raise ValueError(f"Invalid context_mode: {cfg.context_mode}, " f"should be in ['graph', 'pynative']") - context_mode = ms.GRAPH_MODE if cfg.context_mode == "graph" else ms.PYNATIVE_MODE - ms.set_context(mode=context_mode) - ms.set_context(ascend_config={'atomic_clean_policy': 0}) + context_mode = 0 if cfg.context_mode == "graph" else 1 + mindspore.set_context(mode=context_mode) + mindspore.set_context(ascend_config={'atomic_clean_policy': 0}) - cfg.device_target = ms.get_context("device_target") + cfg.device_target = mindspore.get_context("device_target") if cfg.device_target == "CPU": print("run on CPU !!!") cfg.device_id = 0 @@ -45,7 +45,7 @@ cfg.rank_id = 0 if cfg.device_target == 'Ascend': - ms.set_context(device_id=cfg.device_id, ascend_config={"precision_mode": "allow_fp32_to_fp16"}) + mindspore.set_context(device_id=cfg.device_id, ascend_config={"precision_mode": "allow_fp32_to_fp16"}) if cfg.device_num > 1: init() @@ -53,13 +53,13 @@ if cfg.device_num != group_size: raise ValueError(f"the setting device_num: {cfg.device_num} not equal to the real group_size: {group_size}") cfg.rank_id = get_rank() - ms.set_auto_parallel_context(parallel_mode=ms.ParallelMode.DATA_PARALLEL, gradients_mean=True) + mindspore.set_auto_parallel_context(parallel_mode=mindspore.ParallelMode.DATA_PARALLEL, gradients_mean=True) if hasattr(cfg, "all_reduce_fusion_config"): - ms.set_auto_parallel_context(all_reduce_fusion_config=cfg.all_reduce_fusion_config) + mindspore.set_auto_parallel_context(all_reduce_fusion_config=cfg.all_reduce_fusion_config) cpu_affinity(cfg.rank_id, cfg.device_num) else: if hasattr(cfg, "device_id") and isinstance(cfg.device_id, int) and cfg.device_target == 'Ascend': - ms.set_context(device_id=cfg.device_id) + mindspore.set_context(device_id=cfg.device_id) cfg.device_num = 1 cfg.rank_id = 0 diff --git a/official/cv/DBNet/src/utils/eval_utils.py b/official/cv/DBNet/src/utils/eval_utils.py index e516e975eaf860aa01d100ec9052d3b11837aade..a5ec3a8e32d977ce88e6d14d44d4ee9cdfcc9c8c 100644 --- a/official/cv/DBNet/src/utils/eval_utils.py +++ b/official/cv/DBNet/src/utils/eval_utils.py @@ -21,7 +21,7 @@ import numpy as np import cv2 from tqdm.auto import tqdm -import mindspore as ms +import mindspore from .metric import QuadMetric from .post_process import SegDetectorRepresenter @@ -39,7 +39,7 @@ class WithEval: config.eval.dest) def once_eval(self, batch): start = time.time() - img = ms.Tensor(batch['img']) + img = mindspore.Tensor(batch['img']) preds = self.model(img).asnumpy() boxes, scores = self.post_process({'binary': preds}) cur_time = time.time() - start diff --git a/official/cv/DBNet/src/utils/post_process.py b/official/cv/DBNet/src/utils/post_process.py index f7fe02ad04643b6555cc40874f24fc3ac78f3714..aa2f6b4d18e24a227b55f67d7d5bf4abb3cdaa88 100644 --- a/official/cv/DBNet/src/utils/post_process.py +++ b/official/cv/DBNet/src/utils/post_process.py @@ -19,7 +19,7 @@ import numpy as np from shapely.geometry import Polygon import pyclipper -import mindspore as ms +import mindspore import mindspore.ops as ops @@ -52,7 +52,7 @@ class SegDetectorRepresenter: dest_dict = {'binary': 0, 'thresh': 1, 'thresh_binary': 2} idx = dest_dict[self.dest] pred = pred[idx][:, 0, :, :] - if isinstance(pred, ms.Tensor): + if isinstance(pred, mindspore.Tensor): pred = pred.asnumpy() segmentation = self.binarize(pred) boxes_batch = [] diff --git a/official/cv/DBNet/train.py b/official/cv/DBNet/train.py index 72951125e2cf4299cd548c220b24316c49a232f8..30a5f3f7eb65d7893a0f5241bc8a130e7faa4ddf 100644 --- a/official/cv/DBNet/train.py +++
b/official/cv/DBNet/train.py @@ -16,7 +16,7 @@ import os import sys -import mindspore as ms +import mindspore from mindspore import nn from mindspore.train.callback import CheckpointConfig, ModelCheckpoint import src.modules.loss as loss @@ -71,17 +71,17 @@ def train(): net = get_dbnet(config.net, config, isTrain=True) if config.device_num > 1: params_num = len(net.trainable_params()) - ms.set_auto_parallel_context(all_reduce_fusion_config=[params_num // 2, params_num // 3 * 2, params_num - 1]) + mindspore.set_auto_parallel_context(all_reduce_fusion_config=[params_num // 2, params_num // 3 * 2, params_num - 1]) if config.train.pretrained_ckpt: - ms.load_checkpoint(net, config.train.pretrained_ckpt) + mindspore.load_checkpoint(net, config.train.pretrained_ckpt) config.logger.info("load pretrained checkpoint: %s", config.train.pretrained_ckpt) if config.train.resume_ckpt: - resume_param = ms.load_checkpoint(config.train.resume_ckpt, + resume_param = mindspore.load_checkpoint(config.train.resume_ckpt, choice_func=lambda x: not x.startswith(('learning_rate', 'global_step'))) - config.train.start_epoch_num = int(resume_param.get('epoch_num', ms.Tensor(0, ms.int32)).asnumpy().item()) + config.train.start_epoch_num = int(resume_param.get('epoch_num', mindspore.Tensor(0, mindspore.int32)).asnumpy().item()) - lr = ms.Tensor(warmup_polydecay(base_lr=config.optimizer.lr.base_lr, + lr = mindspore.Tensor(warmup_polydecay(base_lr=config.optimizer.lr.base_lr, target_lr=config.optimizer.lr.target_lr, warmup_epoch=config.optimizer.lr.warmup_epoch, total_epoch=config.train.total_epochs, @@ -107,8 +107,8 @@ def train(): bce_scale=config.loss.bce_scale, bce_replace=config.loss.bce_replace) if config.mix_precision: # only resnet run with float16 - net.to_float(ms.float32) - net.backbone.to_float(ms.float16) + net.to_float(mindspore.float32) + net.backbone.to_float(mindspore.float16) net_with_loss = WithLossCell(net, criterion) train_net = TrainOneStepCell(net_with_loss, optimizer=opt, scale_sense=nn.FixedLossScaleUpdateCell(1024.), clip_grad=config.train.clip_grad, force_update=config.train.force_update) @@ -123,11 +123,11 @@ def train(): cb_default.append(ModelCheckpoint(config=ckpt_config, directory=config.save_ckpt_dir, prefix=config.net + '-' + config.backbone.initializer)) if config.train.resume_ckpt: - ms.load_param_into_net(train_net, resume_param) + mindspore.load_param_into_net(train_net, resume_param) cb_default.append(ResumeCallback(config.train.start_epoch_num)) config.logger.info("Resume train from epoch: %s", config.train.start_epoch_num) cb_default.append(DBNetMonitor(config, net, lr.asnumpy(), per_print_times=config.per_print_times)) - model = ms.Model(train_net) + model = mindspore.Model(train_net) config.logger.save_args(config) if config.run_profiler: model.train(3, train_dataset, callbacks=cb_default, sink_size=20, diff --git a/official/cv/DeepLabV3P/eval.py b/official/cv/DeepLabV3P/eval.py index 9bceefc5dfdae8cd14ea60f8281b9230c935bda9..194f3c1b224e9ba343bc2230ad49706c3bc62411 100644 --- a/official/cv/DeepLabV3P/eval.py +++ b/official/cv/DeepLabV3P/eval.py @@ -17,10 +17,10 @@ import os import argparse import numpy as np import cv2 +import mindspore from mindspore import Tensor import mindspore.common.dtype as mstype import mindspore.nn as nn -from mindspore import context from mindspore.train.serialization import load_checkpoint, load_param_into_net from src.deeplab_v3plus import DeepLabV3Plus @@ -156,7 +156,7 @@ def eval_batch_scales(args, eval_net, img_lst, scales, def net_eval(): 
"""net_eval""" args = parse_args() - context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, save_graphs=False, + mindspore.set_context(mode=0, device_target=args.device_target, save_graphs=False, device_id=args.device_id) # data list with open(args.data_lst) as f: diff --git a/official/cv/DeepLabV3P/export.py b/official/cv/DeepLabV3P/export.py index 3ffeafc555ed32aa2aa96938736740cbbd3b52c9..b36ccf42bef8b3886e672ccc6b303d5cf2dffb34 100644 --- a/official/cv/DeepLabV3P/export.py +++ b/official/cv/DeepLabV3P/export.py @@ -15,12 +15,13 @@ """export MINDIR file.""" import argparse import numpy as np +import mindspore import mindspore.nn as nn import mindspore.ops as ops -from mindspore import Tensor, context, load_checkpoint, load_param_into_net, export +from mindspore import Tensor, load_checkpoint, load_param_into_net, export from src.deeplab_v3plus import DeepLabV3Plus -context.set_context(mode=context.GRAPH_MODE, device_target='Ascend') +mindspore.set_context(mode=0, device_target='Ascend') class BuildEvalNetwork(nn.Cell): """BuildEvalNetwork""" diff --git a/official/cv/DeepLabV3P/train.py b/official/cv/DeepLabV3P/train.py index 8a57aa73fa7abc2f2746239465a170aec08850d0..49591835fd67d227457b9976855142c341f34966 100644 --- a/official/cv/DeepLabV3P/train.py +++ b/official/cv/DeepLabV3P/train.py @@ -16,9 +16,9 @@ import os import argparse import ast -from mindspore import context +import mindspore from mindspore.train.model import Model -from mindspore.context import ParallelMode +from mindspore import ParallelMode import mindspore.nn as nn from mindspore.train.callback import ModelCheckpoint, CheckpointConfig from mindspore.train.serialization import load_checkpoint, load_param_into_net @@ -100,9 +100,9 @@ def parse_args(): def train(): """train""" args = parse_args() - context.set_context(mode=context.GRAPH_MODE, save_graphs=False, device_target=args.device_target) + mindspore.set_context(mode=0, save_graphs=False, device_target=args.device_target) if args.device_target != "CPU": - context.set_context(device_id=args.device_id) + mindspore.set_context(device_id=args.device_id) # init multicards training if args.modelArts_mode: @@ -116,7 +116,7 @@ def train(): args.rank = get_rank() args.group_size = get_group_size() parallel_mode = ParallelMode.DATA_PARALLEL - context.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, + mindspore.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=args.group_size) local_data_url = os.path.join(local_data_url, str(device_id)) # download dataset from obs to cache @@ -131,7 +131,7 @@ def train(): args.group_size = get_group_size() parallel_mode = ParallelMode.DATA_PARALLEL - context.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, + mindspore.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=args.group_size) data_file = args.data_file ckpt_file = args.ckpt_pre_trained diff --git a/official/cv/DeepLabv3/eval.py b/official/cv/DeepLabv3/eval.py index 5a407a6013bf7e991682ccb4be139a59b1d071e5..a34c290e0c9666da7d16ad9045b8631fa0487dac 100644 --- a/official/cv/DeepLabv3/eval.py +++ b/official/cv/DeepLabv3/eval.py @@ -18,11 +18,11 @@ import os import time import numpy as np import cv2 +import mindspore from mindspore import Tensor import mindspore.common.dtype as mstype import mindspore.nn as nn import mindspore.ops as ops -from mindspore import context from mindspore.train.serialization import load_checkpoint, 
load_param_into_net from src.nets import net_factory @@ -187,7 +187,7 @@ def net_eval(): config.scales = config.scales_list[config.scales_type] args = config - context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, save_graphs=False, + mindspore.set_context(mode=0, device_target=args.device_target, save_graphs=False, device_id=get_device_id()) # data list diff --git a/official/cv/DeepLabv3/export.py b/official/cv/DeepLabv3/export.py index 0375e4048ad452ec37ad32128fe52bc08e4a0aa8..af2433be5f1de43531f92f64ecda958c90241061 100644 --- a/official/cv/DeepLabv3/export.py +++ b/official/cv/DeepLabv3/export.py @@ -16,9 +16,10 @@ import os import numpy as np +import mindspore import mindspore.nn as nn import mindspore.ops as ops -from mindspore import Tensor, context, load_checkpoint, load_param_into_net, export +from mindspore import Tensor, load_checkpoint, load_param_into_net, export from src.nets import net_factory from model_utils.config import config @@ -48,9 +49,9 @@ def modelarts_pre_process(): @moxing_wrapper(pre_process=modelarts_pre_process) def run_export(): '''run export.''' - context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target) + mindspore.set_context(mode=0, device_target=config.device_target) if config.device_target == "Ascend": - context.set_context(device_id=config.device_id) + mindspore.set_context(device_id=config.device_id) if config.export_model == 'deeplab_v3_s16': network = net_factory.nets_map['deeplab_v3_s16']('eval', config.num_classes, 16, config.freeze_bn) diff --git a/official/cv/DeepLabv3/model_utils/moxing_adapter.py b/official/cv/DeepLabv3/model_utils/moxing_adapter.py index 25838a7da99a27a1bb744684c1f75f80f5704688..189ff0667a1a783691749e55e41f1562c100b9c9 100644 --- a/official/cv/DeepLabv3/model_utils/moxing_adapter.py +++ b/official/cv/DeepLabv3/model_utils/moxing_adapter.py @@ -17,7 +17,7 @@ import os import functools -from mindspore import context +import mindspore from .config import config _global_sync_count = 0 @@ -92,7 +92,7 @@ def moxing_wrapper(pre_process=None, post_process=None): sync_data(config.train_url, config.output_path) print("Workspace downloaded: ", os.listdir(config.output_path)) - context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) + mindspore.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) config.device_num = get_device_num() config.device_id = get_device_id() if not os.path.exists(config.output_path): diff --git a/official/cv/DeepLabv3/modelarts/train_start.py b/official/cv/DeepLabv3/modelarts/train_start.py index 0a0f3f3e4c29ef61b44a312e6e0d42ac42adb0a6..227230d45ace13114c64717d87286d5aaf704a91 100644 --- a/official/cv/DeepLabv3/modelarts/train_start.py +++ b/official/cv/DeepLabv3/modelarts/train_start.py @@ -19,9 +19,10 @@ import glob import argparse import moxing as mox import numpy as np -from mindspore import context, export, Tensor +import mindspore +from mindspore import export, Tensor from mindspore.train.model import Model -from mindspore.context import ParallelMode +from mindspore import ParallelMode import mindspore.nn as nn import mindspore.ops as ops from mindspore.train.callback import ModelCheckpoint, CheckpointConfig @@ -157,9 +158,9 @@ def get_device_id(): def train(args, train_url, data_file, ckpt_pre_trained): if args.device_target == "CPU": - context.set_context(mode=context.GRAPH_MODE, save_graphs=False, device_target="CPU") + mindspore.set_context(mode=0, save_graphs=False, device_target="CPU") 
else: - context.set_context(mode=context.GRAPH_MODE, save_graphs=False, + mindspore.set_context(mode=0, save_graphs=False, device_target="Ascend", device_id=get_device_id()) # init multicards training @@ -169,7 +170,7 @@ def train(args, train_url, data_file, ckpt_pre_trained): args.group_size = get_group_size() parallel_mode = ParallelMode.DATA_PARALLEL - context.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=args.group_size) + mindspore.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=args.group_size) # dataset dataset = data_generator.SegDataset(image_mean=args.image_mean, @@ -257,7 +258,7 @@ def train(args, train_url, data_file, ckpt_pre_trained): def export_air(args, train_url): '''run export.''' - context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target) + mindspore.set_context(mode=0, device_target=args.device_target) ckpt_list = glob.glob(train_url + "/*.ckpt") ckpt_list.sort(key=os.path.getmtime) ckpt_model = ckpt_list[-1] diff --git a/official/cv/DeepLabv3/train.py b/official/cv/DeepLabv3/train.py index 1a115a66f5ad8009bf0a34aaada4f87a7729a894..0f374d33596ddc6b63176e30287c52c9c56e79e9 100644 --- a/official/cv/DeepLabv3/train.py +++ b/official/cv/DeepLabv3/train.py @@ -16,9 +16,9 @@ import os import time -from mindspore import context +import mindspore from mindspore.train.model import Model -from mindspore.context import ParallelMode +from mindspore import ParallelMode import mindspore.nn as nn from mindspore.train.callback import ModelCheckpoint, CheckpointConfig from mindspore.train.serialization import load_checkpoint, load_param_into_net @@ -107,9 +107,9 @@ def train(): args = config if args.device_target == "CPU": - context.set_context(mode=context.GRAPH_MODE, save_graphs=False, device_target="CPU") + mindspore.set_context(mode=0, save_graphs=False, device_target="CPU") else: - context.set_context(mode=context.GRAPH_MODE, save_graphs=False, + mindspore.set_context(mode=0, save_graphs=False, device_target="Ascend", device_id=get_device_id()) # init multicards training @@ -119,7 +119,7 @@ def train(): args.group_size = get_group_size() parallel_mode = ParallelMode.DATA_PARALLEL - context.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=args.group_size) + mindspore.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=args.group_size) # dataset dataset = data_generator.SegDataset(image_mean=args.image_mean, diff --git a/official/cv/DeepText/README.md b/official/cv/DeepText/README.md index 22867dfc4785b4d5d1f9677fc9b9b857df95a279..73854bf83d77abdb0c2a3c860c892a0f64d923f5 100644 --- a/official/cv/DeepText/README.md +++ b/official/cv/DeepText/README.md @@ -273,7 +273,7 @@ Here we used 4 datasets for training, and 1 datasets for Evaluation. ### Result -Training result will be stored in the example path. Checkpoints will be stored at `ckpt_path` by default, and training log will be redirected to `./log`, also the loss will be redirected to `./loss_0.log` like followings. +Training result will be stored in the example path. Checkpoints will be stored at `ckpt_path` by default, the training log will be redirected to `./log`, and the loss will also be redirected to `./loss_0.log`, like the following.
```python 469 epoch: 1 step: 982 ,rpn_loss: 0.03940, rcnn_loss: 0.48169, rpn_cls_loss: 0.02910, rpn_reg_loss: 0.00344, rcnn_cls_loss: 0.41943, rcnn_reg_loss: 0.06223, total_loss: 0.52109 @@ -306,7 +306,7 @@ You can start training using python or shell scripts. The usage of shell scripts ### Result -Evaluation result will be stored in the example path, you can find result like the followings in `log`. +Evaluation results will be stored in the example path; you can find results like the following in `log`. ```python ======================================== diff --git a/official/cv/DeepText/eval.py b/official/cv/DeepText/eval.py index 9aa93a432d72d02b7f3c29398e3b58a4a6dca740..d88905a51499a1ec7fe4b3609000aef6ef0ec8ce 100644 --- a/official/cv/DeepText/eval.py +++ b/official/cv/DeepText/eval.py @@ -18,8 +18,8 @@ import os import time import numpy as np +import mindspore import mindspore.common.dtype as mstype -from mindspore import context from mindspore.common import set_seed from mindspore.train.serialization import load_checkpoint, load_param_into_net @@ -33,7 +33,7 @@ from model_utils.device_adapter import get_device_id, get_device_num set_seed(1) -context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, device_id=get_device_id()) +mindspore.set_context(mode=0, device_target=config.device_target, device_id=get_device_id()) def deeptext_eval_test(dataset_path='', ckpt_path=''): @@ -51,7 +51,7 @@ def deeptext_eval_test(dataset_path='', ckpt_path=''): print("\n========================================\n", flush=True) print("Processing, please wait a moment.", flush=True) - device_type = "Ascend" if context.get_context("device_target") == "Ascend" else "Others" + device_type = "Ascend" if mindspore.get_context("device_target") == "Ascend" else "Others" if device_type == "Ascend": net.to_float(mstype.float16) diff --git a/official/cv/DeepText/export.py b/official/cv/DeepText/export.py index 4f668b707b6f6cc792eab52eba3ae3aaa677113e..c74305d6246a81a9c3b4106dc42f6c976ba23e64 100644 --- a/official/cv/DeepText/export.py +++ b/official/cv/DeepText/export.py @@ -16,8 +16,8 @@ import os import numpy as np -import mindspore as ms -from mindspore import Tensor, load_checkpoint, load_param_into_net, export, context +import mindspore +from mindspore import Tensor, load_checkpoint, load_param_into_net, export from src.Deeptext.deeptext_vgg16 import Deeptext_VGG16_Infer @@ -36,8 +36,8 @@ def modelarts_pre_process(): def run_export(): '''run export.''' config.test_batch_size = config.export_batch_size - context.set_context(mode=context.GRAPH_MODE, device_target=config.export_device_target) - context.set_context(device_id=get_device_id()) + mindspore.set_context(mode=0, device_target=config.export_device_target) + mindspore.set_context(device_id=get_device_id()) net = Deeptext_VGG16_Infer(config=config) net.set_train(False) @@ -50,7 +50,7 @@ def run_export(): load_param_into_net(net, param_dict_new) - img_data = Tensor(np.zeros([config.test_batch_size, 3, config.img_height, config.img_width]), ms.float32) + img_data = Tensor(np.zeros([config.test_batch_size, 3, config.img_height, config.img_width]), mindspore.float32) export(net, img_data, file_name=config.file_name, file_format=config.file_format) diff --git a/official/cv/DeepText/model_utils/moxing_adapter.py b/official/cv/DeepText/model_utils/moxing_adapter.py index 25838a7da99a27a1bb744684c1f75f80f5704688..189ff0667a1a783691749e55e41f1562c100b9c9 100644 --- a/official/cv/DeepText/model_utils/moxing_adapter.py +++
b/official/cv/DeepText/model_utils/moxing_adapter.py @@ -17,7 +17,7 @@ import os import functools -from mindspore import context +import mindspore from .config import config _global_sync_count = 0 @@ -92,7 +92,7 @@ def moxing_wrapper(pre_process=None, post_process=None): sync_data(config.train_url, config.output_path) print("Workspace downloaded: ", os.listdir(config.output_path)) - context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) + mindspore.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) config.device_num = get_device_num() config.device_id = get_device_id() if not os.path.exists(config.output_path): diff --git a/official/cv/DeepText/src/Deeptext/proposal_generator.py b/official/cv/DeepText/src/Deeptext/proposal_generator.py index 2b484be8895a5d26926ec0d7f4577fdc279f3147..ff78039647bb39f4acc7977b0d03bdbef8c7220a 100644 --- a/official/cv/DeepText/src/Deeptext/proposal_generator.py +++ b/official/cv/DeepText/src/Deeptext/proposal_generator.py @@ -15,11 +15,12 @@ """Deeptext proposal generator.""" import numpy as np +import mindspore import mindspore.nn as nn import mindspore.common.dtype as mstype from mindspore.ops import functional as F from mindspore.ops import operations as P -from mindspore import context, Tensor +from mindspore import Tensor class Proposal(nn.Cell): @@ -113,7 +114,7 @@ class Proposal(nn.Cell): cfg = config self.topK_stage1 = () self.topK_shape = () - self.exec_mode = context.get_context("mode") + self.exec_mode = mindspore.get_context("mode") total_max_topk_input = 0 if not self.training_local: self.num_pre = cfg.rpn_nms_pre @@ -148,7 +149,7 @@ class Proposal(nn.Cell): bbox_pred_list = bbox_pred_list + (rpn_bbox_pred_i,) proposals, masks = self.get_bboxes_single(cls_score_list, bbox_pred_list, anchor_list) - if self.exec_mode == context.PYNATIVE_MODE: + if self.exec_mode == 1: proposals = F.stop_gradient(proposals) masks = F.stop_gradient(masks) proposals_tuple += (proposals,) diff --git a/official/cv/DeepText/train.py b/official/cv/DeepText/train.py index ad6b95bffe27da67389ce86f521886437e3fedff..d00d1f60bac2c26411de02e63c1d3455405daf79 100644 --- a/official/cv/DeepText/train.py +++ b/official/cv/DeepText/train.py @@ -28,11 +28,12 @@ from model_utils.config import config from model_utils.moxing_adapter import moxing_wrapper from model_utils.device_adapter import get_device_id, get_device_num, get_rank_id +import mindspore import mindspore.common.dtype as mstype -from mindspore import context, Tensor, Parameter +from mindspore import Tensor, Parameter from mindspore.common import set_seed from mindspore.communication.management import init, get_group_size, get_rank -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.nn import Momentum, TrainOneStepWithLossScaleCell from mindspore.train import Model from mindspore.train.callback import CheckpointConfig, ModelCheckpoint, TimeMonitor @@ -42,7 +43,7 @@ np.set_printoptions(threshold=np.inf) set_seed(1001) -context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, device_id=get_device_id()) +mindspore.set_context(mode=0, device_target=config.device_target, device_id=get_device_id()) def modelarts_pre_process(): @@ -102,15 +103,15 @@ def modelarts_pre_process(): @moxing_wrapper(pre_process=modelarts_pre_process) def run_train(): - device_type = "Ascend" if context.get_context("device_target") == "Ascend" else "GPU" + device_type = "Ascend" if mindspore.get_context("device_target") == 
"Ascend" else "GPU" if device_type == "GPU": - context.set_context(enable_graph_kernel=True) + mindspore.set_context(enable_graph_kernel=True) if config.run_distribute: init() - context.reset_auto_parallel_context() + mindspore.reset_auto_parallel_context() rank = get_rank() device_num = get_group_size() - context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, + mindspore.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True) else: rank = get_rank_id() diff --git a/official/cv/EDSR/export.py b/official/cv/EDSR/export.py index 2aadb54c111d2e210d227f1fd406003b8af9a9a2..1e5458442e700b863835b9f33f074c103cf90de8 100644 --- a/official/cv/EDSR/export.py +++ b/official/cv/EDSR/export.py @@ -19,8 +19,8 @@ python export.py import os import numpy as np -import mindspore as ms -from mindspore import Tensor, export, context +import mindspore +from mindspore import Tensor, export from src.utils import init_net from model_utils.config import config @@ -28,9 +28,9 @@ from model_utils.device_adapter import get_device_id from model_utils.moxing_adapter import moxing_wrapper -context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target) +mindspore.set_context(mode=0, device_target=config.device_target) if config.device_target == "Ascend": - context.set_context(device_id=get_device_id()) + mindspore.set_context(device_id=get_device_id()) MAX_HR_SIZE = 2040 @@ -48,7 +48,7 @@ def run_export(): net = init_net(cfg) max_lr_size = MAX_HR_SIZE // cfg.scale - input_arr = Tensor(np.ones([1, cfg.n_colors, max_lr_size, max_lr_size]), ms.float32) + input_arr = Tensor(np.ones([1, cfg.n_colors, max_lr_size, max_lr_size]), mindspore.float32) file_name = os.path.splitext(os.path.basename(cfg.pre_trained))[0] file_name = file_name + f"_InputSize{max_lr_size}" file_path = os.path.join(cfg.output_path, file_name) diff --git a/official/cv/EDSR/model_utils/moxing_adapter.py b/official/cv/EDSR/model_utils/moxing_adapter.py index b9cab7332413a22993c1cfb86ac0a916aa73efc3..60c1b6779e46f90c89ed700b9bb81414bb430e9a 100644 --- a/official/cv/EDSR/model_utils/moxing_adapter.py +++ b/official/cv/EDSR/model_utils/moxing_adapter.py @@ -17,7 +17,7 @@ import os import functools -from mindspore import context +import mindspore from .config import config @@ -95,7 +95,7 @@ def moxing_wrapper(pre_process=None, post_process=None): sync_data(config.train_url, config.output_path) print("Workspace downloaded: ", os.listdir(config.output_path)) - context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) + mindspore.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) config.device_num = get_device_num() config.device_id = get_device_id() if not os.path.exists(config.output_path): diff --git a/official/cv/EDSR/src/utils.py b/official/cv/EDSR/src/utils.py index 65eea98a77a15d8e1cddd551f211eef08f2556c0..3bb96b4c9bff4c809ac34c30fead978917cacfcd 100644 --- a/official/cv/EDSR/src/utils.py +++ b/official/cv/EDSR/src/utils.py @@ -18,9 +18,9 @@ import os import time -from mindspore import context +import mindspore from mindspore.communication.management import init -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.train.serialization import load_checkpoint from model_utils.config import config @@ -33,21 +33,21 @@ def init_env(cfg): """ init env for mindspore """ - context.set_context(mode=context.GRAPH_MODE, 
device_target=cfg.device_target) + mindspore.set_context(mode=0, device_target=cfg.device_target) device_num = get_device_num() if cfg.device_target == "Ascend": - context.set_context(device_id=get_device_id()) + mindspore.set_context(device_id=get_device_id()) if device_num > 1: init() - context.reset_auto_parallel_context() - context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, + mindspore.reset_auto_parallel_context() + mindspore.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True) elif cfg.device_target == "GPU": - context.set_context(enable_graph_kernel=True) + mindspore.set_context(enable_graph_kernel=True) if device_num > 1: init() - context.reset_auto_parallel_context() - context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, + mindspore.reset_auto_parallel_context() + mindspore.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True) elif cfg.device_target == "CPU": pass diff --git a/official/cv/Efficientnet/efficientnet-b0/eval.py b/official/cv/Efficientnet/efficientnet-b0/eval.py index 00b0689874c1e8306d0c159957b0a6546956beb3..173714967b47519a56a771d7a04daaeefc2a3020 100644 --- a/official/cv/Efficientnet/efficientnet-b0/eval.py +++ b/official/cv/Efficientnet/efficientnet-b0/eval.py @@ -15,8 +15,8 @@ """evaluate imagenet""" import time +import mindspore from mindspore import nn -from mindspore import context from mindspore.train.model import Model from mindspore.train.serialization import load_checkpoint, load_param_into_net @@ -33,7 +33,7 @@ if __name__ == '__main__': else: raise NotImplementedError("This model currently not supported") - context.set_context(mode=context.GRAPH_MODE, device_target=config.platform) + mindspore.set_context(mode=0, device_target=config.platform) if model_name == 'efficientnet_b0': net = efficientnet_b0(num_classes=config.num_classes, diff --git a/official/cv/Efficientnet/efficientnet-b0/export.py b/official/cv/Efficientnet/efficientnet-b0/export.py index d6a5fceef37eff303287a52e185a5449a2cc188a..769745cf442def4c75dcf19bbd50f5dee0b8c980 100644 --- a/official/cv/Efficientnet/efficientnet-b0/export.py +++ b/official/cv/Efficientnet/efficientnet-b0/export.py @@ -15,11 +15,12 @@ """export file""" import numpy as np -from mindspore import Tensor, context, load_checkpoint, load_param_into_net, export +import mindspore +from mindspore import Tensor, load_checkpoint, load_param_into_net, export from src.efficientnet import efficientnet_b0 from src.config import config -context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target) +mindspore.set_context(mode=0, device_target=config.device_target) if __name__ == "__main__": diff --git a/official/cv/Efficientnet/efficientnet-b0/src/transform_utils.py b/official/cv/Efficientnet/efficientnet-b0/src/transform_utils.py index 4e86cf9fbd7141941f019e3e733f6dcf57612295..ef3092cf9915826fa261a020492d07811b79ff78 100644 --- a/official/cv/Efficientnet/efficientnet-b0/src/transform_utils.py +++ b/official/cv/Efficientnet/efficientnet-b0/src/transform_utils.py @@ -421,7 +421,7 @@ def skew(img, v, **__): matrix.append([p1[0], p1[1], 1, 0, 0, 0, -p2[0] * p1[0], -p2[0] * p1[1]]) matrix.append([0, 0, 0, p1[0], p1[1], 1, -p2[1] * p1[0], -p2[1] * p1[1]]) - A = np.matrix(matrix, dtype=np.float) + A = np.matrix(matrix, dtype=np.float_) B = np.array(original_plane).reshape(8) perspective_skew_coefficients_matrix = 
np.dot(np.linalg.pinv(A), B) perspective_skew_coefficients_matrix = np.array(perspective_skew_coefficients_matrix).reshape(8) diff --git a/official/cv/Efficientnet/efficientnet-b0/train.py b/official/cv/Efficientnet/efficientnet-b0/train.py index 32c2a93dd8f3b2fc878fd64a03ad67e69659981b..f600e7575ef9efb0b4b007d9f086b18b7f5b9bb9 100644 --- a/official/cv/Efficientnet/efficientnet-b0/train.py +++ b/official/cv/Efficientnet/efficientnet-b0/train.py @@ -19,10 +19,10 @@ import os import numpy as np import mindspore from mindspore import nn -from mindspore import Tensor, context +from mindspore import Tensor from mindspore.communication.management import get_group_size, get_rank, init from mindspore.nn import SGD, RMSProp -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.train.callback import (CheckpointConfig, LossMonitor, ModelCheckpoint, TimeMonitor) from mindspore.train.loss_scale_manager import FixedLossScaleManager @@ -107,14 +107,14 @@ if __name__ == '__main__': summary_dir = local_path + "/train/summary/" rank_id, rank_size = 0, 1 - context.set_context(mode=context.GRAPH_MODE) + mindspore.set_context(mode=0) if config.platform == "GPU": dataset_sink_mode = True - context.set_context(device_target='GPU', enable_graph_kernel=True) + mindspore.set_context(device_target='GPU', enable_graph_kernel=True) elif config.platform == "CPU": dataset_sink_mode = False - context.set_context(device_target='CPU') + mindspore.set_context(device_target='CPU') else: raise NotImplementedError("Training only supported for CPU and GPU.") @@ -123,10 +123,10 @@ if __name__ == '__main__': init("nccl") else: raise NotImplementedError("Distributed Training only supported for GPU.") - context.reset_auto_parallel_context() + mindspore.reset_auto_parallel_context() rank_id = get_rank() rank_size = get_group_size() - context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, + mindspore.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, device_num=rank_size) summary_dir += "thread_num_" + str(rank_id) + "/" diff --git a/official/cv/Efficientnet/efficientnet-b1/eval.py b/official/cv/Efficientnet/efficientnet-b1/eval.py index 74cd2144da2928ac4d15e9f7ded6d5a0ec66517f..c9950c0060bf66d00cc0b9a31c7e584fea48d029 100644 --- a/official/cv/Efficientnet/efficientnet-b1/eval.py +++ b/official/cv/Efficientnet/efficientnet-b1/eval.py @@ -17,8 +17,9 @@ import ast import timeit import argparse +import mindspore import mindspore.nn as nn -from mindspore import context, Model +from mindspore import Model from mindspore.common import set_seed from mindspore.train.serialization import load_checkpoint, load_param_into_net @@ -62,7 +63,7 @@ def parse_args(): @moxing_wrapper(config) def main(): """Main function for model evaluation.""" - context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, save_graphs=False) + mindspore.set_context(mode=0, device_target=config.device_target, save_graphs=False) dataset = create_imagenet(dataset_path=config.data_path, do_train=False, repeat_num=1, input_size=config.input_size, batch_size=config.batchsize, target=config.device_target, distribute=config.run_distribute) diff --git a/official/cv/Efficientnet/efficientnet-b1/export.py b/official/cv/Efficientnet/efficientnet-b1/export.py index e3ff757914048552bc2104332d953a6d0e66308c..3aa6470330650ccdd00de22a373ab48853b87b69 100644 --- a/official/cv/Efficientnet/efficientnet-b1/export.py +++ 
b/official/cv/Efficientnet/efficientnet-b1/export.py @@ -15,7 +15,8 @@ """export efficientnet IR.""" import argparse import numpy as np -from mindspore import context, Tensor, load_checkpoint, load_param_into_net, export +import mindspore +from mindspore import Tensor, load_checkpoint, load_param_into_net, export from src.models.effnet import EfficientNet from src.config import efficientnet_b1_config_ascend as config @@ -30,7 +31,7 @@ parser.add_argument("--file_format", type=str, choices=["AIR", "ONNX", "MINDIR"] args_opt = parser.parse_args() if __name__ == "__main__": - context.set_context(mode=context.GRAPH_MODE, device_target="Ascend") + mindspore.set_context(mode=0, device_target="Ascend") net = EfficientNet(width_coeff=config.width_coeff, depth_coeff=config.depth_coeff, dropout_rate=config.dropout_rate, drop_connect_rate=config.drop_connect_rate, diff --git a/official/cv/Efficientnet/efficientnet-b1/src/model_utils/moxing_adapter.py b/official/cv/Efficientnet/efficientnet-b1/src/model_utils/moxing_adapter.py index fe745699185661307c7acb02f7c61d73865f3096..50b6b969d05893fbabe82214620ac5b2b1139882 100644 --- a/official/cv/Efficientnet/efficientnet-b1/src/model_utils/moxing_adapter.py +++ b/official/cv/Efficientnet/efficientnet-b1/src/model_utils/moxing_adapter.py @@ -16,7 +16,7 @@ import os import time import functools -from mindspore import context +import mindspore from src.config import show_config @@ -106,7 +106,7 @@ def moxing_wrapper(config, pre_process=None, post_process=None): sync_data(config.eval_data_url, config.eval_data_path) print("Workspace downloaded: ", os.listdir(config.eval_data_path), flush=True) - context.set_context(save_graphs_path=os.path.join(config.train_path, str(get_rank_id()))) + mindspore.set_context(save_graphs_path=os.path.join(config.train_path, str(get_rank_id()))) config.device_num = get_device_num() config.device_id = get_device_id() if not os.path.exists(config.train_path): diff --git a/official/cv/Efficientnet/efficientnet-b1/train.py b/official/cv/Efficientnet/efficientnet-b1/train.py index c73fffe621ad7e2d4ec2e190a757f8609c6f40b7..430467c52c1dda9dc149dc6495d8a79fb6f9a4ae 100644 --- a/official/cv/Efficientnet/efficientnet-b1/train.py +++ b/official/cv/Efficientnet/efficientnet-b1/train.py @@ -17,8 +17,8 @@ import os import ast import argparse +import mindspore import mindspore.nn as nn -from mindspore import context from mindspore.train.model import Model, ParallelMode from mindspore.train.serialization import load_checkpoint, load_param_into_net from mindspore.communication.management import init @@ -77,13 +77,13 @@ def parse_args(): @moxing_wrapper(config) def main(): - context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target) + mindspore.set_context(mode=0, device_target=config.device_target) if config.run_distribute: init() device_id = int(os.getenv("DEVICE_ID")) device_num = int(os.getenv("RANK_SIZE")) parallel_mode = ParallelMode.DATA_PARALLEL - context.set_auto_parallel_context(parallel_mode=parallel_mode, + mindspore.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=device_num) else: diff --git a/official/cv/Efficientnet/efficientnet-b2/eval.py b/official/cv/Efficientnet/efficientnet-b2/eval.py index 728393eb2ccc8c3b5b9a0722b500fab98b40401f..b3f7e2131da28df1fb4a752c81555b88947cccc2 100644 --- a/official/cv/Efficientnet/efficientnet-b2/eval.py +++ b/official/cv/Efficientnet/efficientnet-b2/eval.py @@ -16,7 +16,8 @@ import os import ast import argparse -from mindspore import 
context, nn +import mindspore +from mindspore import nn from mindspore.train.model import Model from mindspore.common import set_seed from mindspore.train.serialization import load_checkpoint, load_param_into_net @@ -42,20 +43,20 @@ if __name__ == '__main__': parser.add_argument('--device_target', type=str, choices=["Ascend", "GPU"], default="Ascend", help='Device target') args_opt = parser.parse_args() - context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.device_target, save_graphs=False) + mindspore.set_context(mode=0, device_target=args_opt.device_target, save_graphs=False) if args_opt.run_modelarts: import moxing as mox device_id = int(os.getenv('DEVICE_ID')) device_num = int(os.getenv('RANK_SIZE')) - context.set_context(device_id=device_id) + mindspore.set_context(device_id=device_id) local_data_url = '/cache/data/' local_train_url = '/cache/ckpt/' mox.file.copy_parallel(args_opt.data_url, local_data_url) mox.file.copy_parallel(args_opt.train_url, local_train_url) else: - context.set_context(device_id=args_opt.device_id) + mindspore.set_context(device_id=args_opt.device_id) if args_opt.device_target == "GPU": config = config_gpu diff --git a/official/cv/Efficientnet/efficientnet-b2/export.py b/official/cv/Efficientnet/efficientnet-b2/export.py index d758e080a81d9c986b9a1138a32ac33e55febb4a..005e05e465ffb2fa1e7532ed80288e21d0ab1bac 100644 --- a/official/cv/Efficientnet/efficientnet-b2/export.py +++ b/official/cv/Efficientnet/efficientnet-b2/export.py @@ -17,7 +17,8 @@ efficientnet export. """ import argparse import numpy as np -from mindspore import context, Tensor, load_checkpoint, load_param_into_net, export +import mindspore +from mindspore import Tensor, load_checkpoint, load_param_into_net, export from src.models.effnet import EfficientNet parser = argparse.ArgumentParser(description='Image classification') @@ -30,7 +31,7 @@ parser.add_argument('--device_target', type=str, choices=["Ascend", "GPU"], defa args_opt = parser.parse_args() if __name__ == '__main__': - context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.device_target) + mindspore.set_context(mode=0, device_target=args_opt.device_target) net = EfficientNet(1.1, 1.2, dropout_rate=0.3) diff --git a/official/cv/Efficientnet/efficientnet-b2/train.py b/official/cv/Efficientnet/efficientnet-b2/train.py index f5c3b8796c1c5b486a46e24667d7c874e348f84b..2236ff007b181c359b61837f0933fa868e9f4c92 100644 --- a/official/cv/Efficientnet/efficientnet-b2/train.py +++ b/official/cv/Efficientnet/efficientnet-b2/train.py @@ -17,7 +17,7 @@ import os import ast import argparse -from mindspore import context +import mindspore from mindspore import Tensor from mindspore.nn import SGD, RMSProp from mindspore.train.model import Model, ParallelMode @@ -53,7 +53,7 @@ if __name__ == '__main__': parser.add_argument('--device_target', type=str, choices=["Ascend", "GPU"], default="Ascend", help='Device target') args_opt = parser.parse_args() - context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.device_target, save_graphs=False) + mindspore.set_context(mode=0, device_target=args_opt.device_target, save_graphs=False) # init distributed if args_opt.run_modelarts: @@ -61,36 +61,36 @@ if __name__ == '__main__': device_id = int(os.getenv('DEVICE_ID')) device_num = int(os.getenv('RANK_SIZE')) - context.set_context(device_id=device_id) + mindspore.set_context(device_id=device_id) local_data_url = '/cache/data' local_train_url = '/cache/ckpt' if device_num > 1: init() - 
context.set_auto_parallel_context(device_num=device_num, parallel_mode='data_parallel', gradients_mean=True) + mindspore.set_auto_parallel_context(device_num=device_num, parallel_mode='data_parallel', gradients_mean=True) local_data_url = os.path.join(local_data_url, str(device_id)) mox.file.copy_parallel(args_opt.data_url, local_data_url) else: if args_opt.run_distribute: if args_opt.device_target == "GPU": init() - context.reset_auto_parallel_context() + mindspore.reset_auto_parallel_context() device_id = get_rank() device_num = get_group_size() print("run distribute......", "deviceNum:", device_num, ",rank_id:", device_id) - context.set_auto_parallel_context(device_num=device_num, + mindspore.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True) else: device_id = int(os.getenv('DEVICE_ID')) device_num = int(os.getenv('RANK_SIZE')) - context.set_context(device_id=device_id) + mindspore.set_context(device_id=device_id) init() - context.reset_auto_parallel_context() - context.set_auto_parallel_context(device_num=device_num, - parallel_mode=context.ParallelMode.DATA_PARALLEL, + mindspore.reset_auto_parallel_context() + mindspore.set_auto_parallel_context(device_num=device_num, + parallel_mode=mindspore.ParallelMode.DATA_PARALLEL, gradients_mean=True) else: - context.set_context(device_id=args_opt.device_id) + mindspore.set_context(device_id=args_opt.device_id) device_num = 1 device_id = 0 diff --git a/official/cv/Efficientnet/efficientnet-b3/eval.py b/official/cv/Efficientnet/efficientnet-b3/eval.py index de055d85d81fe57b2ce251eeb958f22e6d5039d0..d56c3e8b21afb00b48c4088fa82bb97303911303 100644 --- a/official/cv/Efficientnet/efficientnet-b3/eval.py +++ b/official/cv/Efficientnet/efficientnet-b3/eval.py @@ -16,7 +16,8 @@ import os import ast import argparse -from mindspore import context, nn +import mindspore +from mindspore import nn from mindspore.train.model import Model from mindspore.common import set_seed from mindspore.train.serialization import load_checkpoint, load_param_into_net @@ -42,20 +43,20 @@ if __name__ == '__main__': parser.add_argument('--run_modelarts', type=ast.literal_eval, default=False, help='Run distribute') args_opt = parser.parse_args() - context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.device_target, save_graphs=False) + mindspore.set_context(mode=0, device_target=args_opt.device_target, save_graphs=False) if args_opt.run_modelarts: import moxing as mox device_id = int(os.getenv('DEVICE_ID')) device_num = int(os.getenv('RANK_SIZE')) - context.set_context(device_id=device_id) + mindspore.set_context(device_id=device_id) local_data_url = '/cache/data/' local_train_url = '/cache/ckpt/' mox.file.copy_parallel(args_opt.data_url, local_data_url) mox.file.copy_parallel(args_opt.train_url, local_train_url) else: - context.set_context(device_id=args_opt.device_id) + mindspore.set_context(device_id=args_opt.device_id) # create dataset if args_opt.run_modelarts: diff --git a/official/cv/Efficientnet/efficientnet-b3/export.py b/official/cv/Efficientnet/efficientnet-b3/export.py index cfaec24bc49c2e66c86832e5b8eb47c095ea39ed..2494270f41c911fb21a5fe0988dec33c62e2f237 100644 --- a/official/cv/Efficientnet/efficientnet-b3/export.py +++ b/official/cv/Efficientnet/efficientnet-b3/export.py @@ -17,7 +17,8 @@ efficientnet export. 
""" import argparse import numpy as np -from mindspore import context, Tensor, load_checkpoint, load_param_into_net, export +import mindspore +from mindspore import Tensor, load_checkpoint, load_param_into_net, export from src.models.effnet import EfficientNet @@ -31,7 +32,7 @@ parser.add_argument("--device_target", type=str, choices=["Ascend", "GPU"], defa args_opt = parser.parse_args() if __name__ == '__main__': - context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.device_target) + mindspore.set_context(mode=0, device_target=args_opt.device_target) net = EfficientNet() diff --git a/official/cv/Efficientnet/efficientnet-b3/infer/util/classification_task_metric.py b/official/cv/Efficientnet/efficientnet-b3/infer/util/classification_task_metric.py index 9e689cde780b1a8c7722b29656b0f73f7cdcaedd..ce60a7353792ddd6d92f16dd6d86bbd3bf75580b 100644 --- a/official/cv/Efficientnet/efficientnet-b3/infer/util/classification_task_metric.py +++ b/official/cv/Efficientnet/efficientnet-b3/infer/util/classification_task_metric.py @@ -55,7 +55,7 @@ def load_statistical_predict_result(filepath): data_vec = np.zeros((len(label_list)), dtype=np.float32) if n_label != 0: for ind, cls_ind in enumerate(label_list): - data_vec[ind] = np.int(cls_ind) + data_vec[ind] = np.int_(cls_ind) return data_vec, n_label diff --git a/official/cv/Efficientnet/efficientnet-b3/modelarts/train_start.py b/official/cv/Efficientnet/efficientnet-b3/modelarts/train_start.py index bdc6324570d4d35bad8494d845ef4b344b3abca5..b04303192b19105aa7982db523eb2f93aab30d20 100644 --- a/official/cv/Efficientnet/efficientnet-b3/modelarts/train_start.py +++ b/official/cv/Efficientnet/efficientnet-b3/modelarts/train_start.py @@ -18,11 +18,11 @@ import ast import argparse import numpy as np -from mindspore import context +import mindspore from mindspore import Tensor from mindspore.nn import SGD, RMSProp from mindspore.train.model import Model -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.train.callback import ModelCheckpoint, CheckpointConfig from mindspore.communication.management import init from mindspore.train.loss_scale_manager import FixedLossScaleManager @@ -55,33 +55,33 @@ if __name__ == '__main__': parser.add_argument('--resume', type=str, default='', help='resume training with existed checkpoint') args_opt = parser.parse_args() - context.set_context(mode=context.GRAPH_MODE, device_target="Ascend") + mindspore.set_context(mode=0, device_target="Ascend") # init distributed if args_opt.run_modelarts: import moxing as mox device_id = int(os.getenv('DEVICE_ID')) device_num = int(os.getenv('RANK_SIZE')) - context.set_context(device_id=device_id) + mindspore.set_context(device_id=device_id) local_data_url = '/cache/data' local_train_url = '/cache/ckpt' if device_num > 1: init() - context.set_auto_parallel_context(device_num=device_num, parallel_mode='data_parallel', gradients_mean=True) + mindspore.set_auto_parallel_context(device_num=device_num, parallel_mode='data_parallel', gradients_mean=True) local_data_url = os.path.join(local_data_url, str(device_id)) mox.file.copy_parallel(args_opt.data_url, local_data_url) else: if args_opt.run_distribute: device_id = int(os.getenv('DEVICE_ID')) device_num = int(os.getenv('RANK_SIZE')) - context.set_context(device_id=device_id) + mindspore.set_context(device_id=device_id) init() - context.reset_auto_parallel_context() - context.set_auto_parallel_context(device_num=device_num, + mindspore.reset_auto_parallel_context() + 
mindspore.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True) else: - context.set_context(device_id=args_opt.device_id) + mindspore.set_context(device_id=args_opt.device_id) device_num = 1 device_id = 0 diff --git a/official/cv/Efficientnet/efficientnet-b3/train.py b/official/cv/Efficientnet/efficientnet-b3/train.py index dfa9d8d4d4af9f95cb56d49c733ac22832a68b91..35cbd4d3d3fd38c00fd01f190e90fd32e8533515 100644 --- a/official/cv/Efficientnet/efficientnet-b3/train.py +++ b/official/cv/Efficientnet/efficientnet-b3/train.py @@ -17,7 +17,7 @@ import os import ast import argparse -from mindspore import context +import mindspore from mindspore import Tensor from mindspore.nn import SGD, RMSProp from mindspore.train.model import Model @@ -53,7 +53,7 @@ if __name__ == '__main__': parser.add_argument('--resume', type=str, default='', help='resume training with existed checkpoint') args_opt = parser.parse_args() - context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.device_target, save_graphs=False) + mindspore.set_context(mode=0, device_target=args_opt.device_target, save_graphs=False) # init distributed if args_opt.run_modelarts: @@ -62,27 +62,27 @@ if __name__ == '__main__': device_id = int(os.getenv('DEVICE_ID')) rank = int(os.getenv('RANK_ID')) device_num = int(os.getenv('RANK_SIZE')) - context.set_context(device_id=device_id) + mindspore.set_context(device_id=device_id) local_data_url = '/cache/data' local_train_url = '/cache/ckpt' if device_num > 1: init() - context.set_auto_parallel_context(device_num=device_num, parallel_mode='data_parallel', gradients_mean=True) + mindspore.set_auto_parallel_context(device_num=device_num, parallel_mode='data_parallel', gradients_mean=True) local_data_url = os.path.join(local_data_url, str(device_id)) mox.file.copy_parallel(args_opt.data_url, local_data_url) else: if args_opt.run_distribute: if os.getenv('DEVICE_ID', "not_set").isdigit(): - context.set_context(device_id=int(os.getenv("DEVICE_ID"))) + mindspore.set_context(device_id=int(os.getenv("DEVICE_ID"))) init() rank = get_rank() device_num = get_group_size() - context.reset_auto_parallel_context() - context.set_auto_parallel_context(device_num=device_num, - parallel_mode=context.ParallelMode.DATA_PARALLEL, + mindspore.reset_auto_parallel_context() + mindspore.set_auto_parallel_context(device_num=device_num, + parallel_mode=mindspore.ParallelMode.DATA_PARALLEL, gradients_mean=True) else: - context.set_context(device_id=args_opt.device_id) + mindspore.set_context(device_id=args_opt.device_id) device_num = 1 rank = 0 diff --git a/official/cv/Efficientnet/efficientnetv2/eval.py b/official/cv/Efficientnet/efficientnetv2/eval.py index 56d6f376f348d1b165774b5cccd8d4d80158838f..134c5d91df85e442cf80e57ced25668b6dc6204c 100644 --- a/official/cv/Efficientnet/efficientnetv2/eval.py +++ b/official/cv/Efficientnet/efficientnetv2/eval.py @@ -13,9 +13,8 @@ # limitations under the License. 
# ============================================================================ """eval""" - +import mindspore from mindspore import Model -from mindspore import context from mindspore import nn from mindspore.common import set_seed @@ -30,13 +29,13 @@ set_seed(args.seed) def main(): mode = { - 0: context.GRAPH_MODE, - 1: context.PYNATIVE_MODE + 0: 0, + 1: 1 } - context.set_context(mode=mode[args.graph_mode], device_target=args.device_target) - context.set_context(enable_graph_kernel=False) + mindspore.set_context(mode=mode[args.graph_mode], device_target=args.device_target) + mindspore.set_context(enable_graph_kernel=False) if args.device_target == "Ascend": - context.set_context(enable_auto_mixed_precision=True) + mindspore.set_context(enable_auto_mixed_precision=True) set_device(args) # get model diff --git a/official/cv/Efficientnet/efficientnetv2/export.py b/official/cv/Efficientnet/efficientnetv2/export.py index 3d94ed906faabe940a596b4438bd006570dac9ed..1cb6f26d692ba379d3a4de568a2ffc9d273046b8 100644 --- a/official/cv/Efficientnet/efficientnetv2/export.py +++ b/official/cv/Efficientnet/efficientnetv2/export.py @@ -18,7 +18,9 @@ python export.py """ import numpy as np -from mindspore import Tensor, load_checkpoint, load_param_into_net, export, context + +import mindspore +from mindspore import Tensor, load_checkpoint, load_param_into_net, export from mindspore import dtype as mstype from src.args import args @@ -26,10 +28,10 @@ from src.tools.cell import cast_amp from src.tools.criterion import get_criterion, NetWithLoss from src.tools.get_misc import get_model -context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target) +mindspore.set_context(mode=0, device_target=args.device_target) if args.device_target in ["Ascend", "GPU"]: - context.set_context(device_id=args.device_id) + mindspore.set_context(device_id=args.device_id) if __name__ == '__main__': net = get_model(args) diff --git a/official/cv/Efficientnet/efficientnetv2/src/tools/get_misc.py b/official/cv/Efficientnet/efficientnetv2/src/tools/get_misc.py index f1f4168aeb294e8a1e57bff90be467f729d005e5..e3e7b05b4cc7b3abecfbd9aacabb2c247f9c9875 100644 --- a/official/cv/Efficientnet/efficientnetv2/src/tools/get_misc.py +++ b/official/cv/Efficientnet/efficientnetv2/src/tools/get_misc.py @@ -14,11 +14,10 @@ # ============================================================================ """misc functions for program""" import os - -from mindspore import context +import mindspore from mindspore import nn from mindspore.communication.management import init, get_rank -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.train.serialization import load_checkpoint, load_param_into_net from src import models, data @@ -35,16 +34,16 @@ def set_device(args): if device_target == "Ascend": if device_num > 1: - context.set_context(device_id=int(os.environ["DEVICE_ID"])) + mindspore.set_context(device_id=int(os.environ["DEVICE_ID"])) init(backend_name='hccl') - context.reset_auto_parallel_context() - context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, + mindspore.reset_auto_parallel_context() + mindspore.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True) - # context.set_auto_parallel_context(pipeline_stages=2, full_batch=True) + # mindspore.set_auto_parallel_context(pipeline_stages=2, full_batch=True) rank = get_rank() else: - context.set_context(device_id=args.device_id) + 
mindspore.set_context(device_id=args.device_id) else: raise ValueError("Unsupported platform.") diff --git a/official/cv/Efficientnet/efficientnetv2/train.py b/official/cv/Efficientnet/efficientnetv2/train.py index 59b1e8847da4c17921666805ae5b10ad33d88c29..1b2c3762fe8e1b6686bd7241cae0d338f08c17c2 100644 --- a/official/cv/Efficientnet/efficientnetv2/train.py +++ b/official/cv/Efficientnet/efficientnetv2/train.py @@ -20,7 +20,8 @@ Acc: ImageNet1k-84.9% (pretrained on ImageNet22k) """ import os -from mindspore import Model, nn, context, set_seed +import mindspore +from mindspore import Model, nn, set_seed from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor from src.args import args @@ -34,13 +35,13 @@ from src.tools.optimizer import get_optimizer def main(): set_seed(args.seed) mode = { - 0: context.GRAPH_MODE, - 1: context.PYNATIVE_MODE + 0: 0, # GRAPH_MODE + 1: 1 # PYNATIVE_MODE } - context.set_context(mode=mode[args.graph_mode], device_target=args.device_target) - context.set_context(enable_graph_kernel=False) + mindspore.set_context(mode=mode[args.graph_mode], device_target=args.device_target) + mindspore.set_context(enable_graph_kernel=False) if args.device_target == "Ascend": - context.set_context(enable_auto_mixed_precision=True) + mindspore.set_context(enable_auto_mixed_precision=True) rank = set_device(args) # get model and cast amp_level diff --git a/official/cv/FasterRCNN/README.md b/official/cv/FasterRCNN/README.md index 58a1897b9dc9b614937a3edde1e04fc6ac51b1ba..130130c80e61431c252a6fb0603173dee63b8870 100644 --- a/official/cv/FasterRCNN/README.md +++ b/official/cv/FasterRCNN/README.md @@ -82,7 +82,7 @@ Dataset used: [FaceMaskDetection]() - Docker base image - - [Ascend Hub](ascend.huawei.com/ascendhub/#/home) + - [Ascend Hub](https://www.hiascend.com/developer/ascendhub) - Install [MindSpore](https://www.mindspore.cn/install/en). 
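Reviewer note: the pattern repeated across the files above is the move from the deprecated `mindspore.context` module to the package-level API. A minimal sketch of the before/after, assuming MindSpore 2.x (where `set_context`, `get_context`, `set_auto_parallel_context`, and `ParallelMode` are exported at the top level, and the mode constants `GRAPH_MODE`/`PYNATIVE_MODE` equal 0/1):

```python
# Sketch of the context -> top-level API migration (MindSpore 2.x assumed).
import mindspore
from mindspore import ParallelMode  # was: from mindspore.context import ParallelMode

# was: context.set_context(mode=context.GRAPH_MODE, device_target=...)
# The models pass "Ascend"/"GPU"; "CPU" here keeps the sketch runnable anywhere.
mindspore.set_context(mode=0, device_target="CPU")  # 0 == GRAPH_MODE, 1 == PYNATIVE_MODE

# was: context.set_auto_parallel_context(...); this only records config,
# communication init (init()) still happens separately in the scripts.
mindspore.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL,
                                    gradients_mean=True, device_num=8)

# was: context.get_context("mode") == context.PYNATIVE_MODE
assert mindspore.get_context("mode") == 0  # reads back the integer mode constant
```

The bare integer literals keep the scripts independent of the `context` module at some cost in readability; the `# GRAPH_MODE` / `# PYNATIVE_MODE` comments added in `efficientnetv2/train.py` are worth carrying to every other `mode=0` site.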
diff --git a/official/cv/FasterRCNN/README_CN.md b/official/cv/FasterRCNN/README_CN.md index 676b27231976d34369334b0f11406f711df07f2a..ed214c0af5c75e1ca2bcc2a1b41dbd08a607c44a 100644 --- a/official/cv/FasterRCNN/README_CN.md +++ b/official/cv/FasterRCNN/README_CN.md @@ -116,7 +116,7 @@ Faster R-CNN是一个两阶段目标检测网络，该网络采用RPN，可以 - 获取基础镜像 - - [Ascend Hub](https://ascend.huawei.com/ascendhub/#/home) + - [Ascend Hub](https://www.hiascend.com/developer/ascendhub) - 安装[MindSpore](https://www.mindspore.cn/install)。 @@ -625,7 +625,7 @@ bash run_infer_cpp.sh [MINDIR_PATH] [DATA_PATH] [ANNO_PATH] [DEVICE_TYPE] [IMAGE | 上传日期 | 2020/8/31 | 2021/2/10 |2022/8/10| | MindSpore版本 | 1.0.0 |1.2.0 |1.7.0| | 数据集 | COCO 2017 |COCO 2017 |FaceMaskDetection| -| 训练参数 | epoch=12, batch_size=2 |epoch=12, batch_size=2 |epoch=20,batch_size=2| +| 训练参数 | epoch=12, batch_size=2 |epoch=20, batch_size=2 |epoch=20,batch_size=2| | 优化器 | SGD |SGD |SGD| | 损失函数 | Softmax交叉熵，Sigmoid交叉熵，SmoothL1Loss |Softmax交叉熵，Sigmoid交叉熵，SmoothL1Loss |Softmax交叉熵，Sigmoid交叉熵，SmoothL1Loss| | 速度 | 1卡：190毫秒/步；8卡：200毫秒/步 | 1卡：320毫秒/步；8卡：335毫秒/步 |1卡：7328毫秒/步| diff --git a/official/cv/FasterRCNN/eval.py b/official/cv/FasterRCNN/eval.py index fc60bafb23fa39dae174e6561f25da2114e38550..80d237846c0c2b0e5ecc2c7d76b8ca499b726ebc 100644 --- a/official/cv/FasterRCNN/eval.py +++ b/official/cv/FasterRCNN/eval.py @@ -20,7 +20,7 @@ from collections import defaultdict import numpy as np from pycocotools.coco import COCO -import mindspore as ms +import mindspore from mindspore.common import set_seed, Parameter from src.dataset import data_to_mindrecord_byte_image, create_fasterrcnn_dataset, parse_json_annos_from_txt @@ -29,7 +29,7 @@ from src.model_utils.config import config from src.model_utils.moxing_adapter import moxing_wrapper from src.model_utils.device_adapter import get_device_id from src.FasterRcnn.faster_rcnn import Faster_Rcnn -ms.context.set_context(max_call_depth=2000) +mindspore.set_context(max_call_depth=2000) def fasterrcnn_eval(dataset_path, ckpt_path, anno_path): """FasterRcnn evaluation.""" @@ -39,7 +39,7 @@ def fasterrcnn_eval(dataset_path, ckpt_path, anno_path): net = Faster_Rcnn(config) try: - param_dict = ms.load_checkpoint(ckpt_path) + param_dict = mindspore.load_checkpoint(ckpt_path) except RuntimeError as ex: ex = str(ex) print("Traceback:\n", ex, flush=True) @@ -60,12 +60,12 @@ def fasterrcnn_eval(dataset_path, ckpt_path, anno_path): for key, value in param_dict.items(): tensor = value.asnumpy().astype(np.float32) param_dict[key] = Parameter(tensor, key) - ms.load_param_into_net(net, param_dict) + mindspore.load_param_into_net(net, param_dict) net.set_train(False) - device_type = "Ascend" if ms.get_context("device_target") == "Ascend" else "Others" + device_type = "Ascend" if mindspore.get_context("device_target") == "Ascend" else "Others" if device_type == "Ascend": - net.to_float(ms.float16) + net.to_float(mindspore.float16) eval_iter = 0 total = ds.get_dataset_size() @@ -199,6 +199,7 @@ def eval_fasterrcnn(): if __name__ == '__main__': set_seed(1) - ms.set_context(mode=ms.GRAPH_MODE, device_target=config.device_target, device_id=get_device_id()) + mindspore.set_context(mode=0, device_target=config.device_target, device_id=get_device_id(), + jit_config={"jit_level": "O2"}) eval_fasterrcnn() diff --git a/official/cv/FasterRCNN/export.py b/official/cv/FasterRCNN/export.py index e525cc4b77eed85dc823a451bbd968318b7893b8..da6507f111ec0333058a1c10fe182489a0cddd96 100644 --- a/official/cv/FasterRCNN/export.py +++ b/official/cv/FasterRCNN/export.py @@ -15,16 
+15,16 @@ """export checkpoint file into air, onnx, mindir models""" import numpy as np -import mindspore as ms +import mindspore from mindspore import Tensor from src.model_utils.config import config from src.model_utils.moxing_adapter import moxing_wrapper from src.model_utils.device_adapter import get_device_id from src.FasterRcnn.faster_rcnn import FasterRcnn_Infer -ms.set_context(mode=ms.GRAPH_MODE, device_target=config.device_target, max_call_depth=2000) +mindspore.set_context(mode=0, device_target=config.device_target, max_call_depth=2000) if config.device_target == "Ascend": - ms.set_context(device_id=get_device_id()) + mindspore.set_context(device_id=get_device_id()) def modelarts_pre_process(): @@ -40,7 +40,7 @@ def export_fasterrcnn(): net = FasterRcnn_Infer(config=config) try: - param_dict = ms.load_checkpoint(config.ckpt_file) + param_dict = mindspore.load_checkpoint(config.ckpt_file) except RuntimeError as ex: ex = str(ex) print("Traceback:\n", ex, flush=True) @@ -53,22 +53,22 @@ def export_fasterrcnn(): key = key.replace("ncek", "neck") param_dict_new["network." + key] = value - ms.load_param_into_net(net, param_dict_new) + mindspore.load_param_into_net(net, param_dict_new) - device_type = "Ascend" if ms.get_context("device_target") == "Ascend" else "Others" + device_type = "Ascend" if mindspore.get_context("device_target") == "Ascend" else "Others" if device_type == "Ascend": - net.to_float(ms.float16) + net.to_float(mindspore.float16) - img = Tensor(np.zeros([config.test_batch_size, 3, config.img_height, config.img_width]), ms.float32) - img_metas = Tensor(np.random.uniform(0.0, 1.0, size=[config.test_batch_size, 4]), ms.float32) + img = Tensor(np.zeros([config.test_batch_size, 3, config.img_height, config.img_width]), mindspore.float32) + img_metas = Tensor(np.random.uniform(0.0, 1.0, size=[config.test_batch_size, 4]), mindspore.float32) if not config.restore_bbox: print("[WARNING] When parameter 'restore_bbox' set to False, " "ascend310_infer of this project provided will not be available " "and need to complete 310 infer function by yourself.") - ms.export(net, img, file_name=config.file_name, file_format=config.file_format) + mindspore.export(net, img, file_name=config.file_name, file_format=config.file_format) else: - ms.export(net, img, img_metas, file_name=config.file_name, file_format=config.file_format) + mindspore.export(net, img, img_metas, file_name=config.file_name, file_format=config.file_format) if __name__ == '__main__': diff --git a/official/cv/FasterRCNN/scripts/run_distribute_train_ascend.sh b/official/cv/FasterRCNN/scripts/run_distribute_train_ascend.sh index 86588bf774e0ab70aae3eed4e1acc60a4d6da7a7..8087e1f88701b48b668a4737dd5ebf9b0f4023b3 100644 --- a/official/cv/FasterRCNN/scripts/run_distribute_train_ascend.sh +++ b/official/cv/FasterRCNN/scripts/run_distribute_train_ascend.sh @@ -96,6 +96,7 @@ export HCCL_CONNECT_TIMEOUT=600 export DEVICE_NUM=8 export RANK_SIZE=8 export RANK_TABLE_FILE=$PATH1 +export MS_ASCEND_CHECK_OVERFLOW_MODE="SATURATION_MODE" for((i=0; i<${DEVICE_NUM}; i++)) do diff --git a/official/cv/FasterRCNN/scripts/run_distribute_train_ascend_msrun.sh b/official/cv/FasterRCNN/scripts/run_distribute_train_ascend_msrun.sh new file mode 100644 index 0000000000000000000000000000000000000000..730440d274e319242e222a39dd79f5955541bceb --- /dev/null +++ b/official/cv/FasterRCNN/scripts/run_distribute_train_ascend_msrun.sh @@ -0,0 +1,96 @@ +#!/bin/bash +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 
2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +if [ $# -le 2 ] +then + echo "Usage: bash run_distribute_train_ascend_msrun.sh [PRETRAINED_PATH] [BACKBONE] [COCO_ROOT] [MINDRECORD_DIR](option)" +exit 1 +fi + +if [ $2 != "resnet_v1_50" ] && [ $2 != "resnet_v1.5_50" ] && [ $2 != "resnet_v1_101" ] && [ $2 != "resnet_v1_152" ] && [ $2 != "inception_resnet_v2" ] +then + echo "error: the selected backbone must be resnet_v1_50, resnet_v1.5_50, resnet_v1_101, resnet_v1_152, inception_resnet_v2" +exit 1 +fi + +get_real_path(){ + if [ "${1:0:1}" == "/" ]; then + echo "$1" + else + echo "$(realpath -m $PWD/$1)" + fi +} + +PATH1=$(get_real_path $1) +PATH2=$(get_real_path $3) +echo $PATH1 +echo $PATH2 + +if [ ! -f $PATH1 ] +then + echo "error: PRETRAINED_PATH=$PATH1 is not a file" +exit 1 +fi + +if [ ! -d $PATH2 ] +then + echo "error: COCO_ROOT=$PATH2 is not a dir" +exit 1 +fi + +mindrecord_dir=$PATH2/MindRecord_COCO_TRAIN/ +if [ $# -eq 4 ] +then + mindrecord_dir=$(get_real_path $4) + if [ ! -d $mindrecord_dir ] + then + echo "error: mindrecord_dir=$mindrecord_dir is not a dir" + exit 1 + fi +fi +echo $mindrecord_dir + +BASE_PATH=$(cd ./"`dirname $0`" || exit; pwd) +if [ $# -ge 1 ]; then + if [ $2 == 'resnet_v1.5_50' ]; then + CONFIG_FILE="${BASE_PATH}/../default_config.yaml" + elif [ $2 == 'resnet_v1_101' ]; then + CONFIG_FILE="${BASE_PATH}/../default_config_101.yaml" + elif [ $2 == 'resnet_v1_152' ]; then + CONFIG_FILE="${BASE_PATH}/../default_config_152.yaml" + elif [ $2 == 'resnet_v1_50' ]; then + CONFIG_FILE="${BASE_PATH}/../default_config.yaml" + elif [ $2 == 'inception_resnet_v2' ]; then + CONFIG_FILE="${BASE_PATH}/../default_config_InceptionResnetV2.yaml" + else + echo "Unrecognized parameter" + exit 1 + fi +else + CONFIG_FILE="${BASE_PATH}/../default_config.yaml" +fi + +ulimit -u unlimited +export HCCL_CONNECT_TIMEOUT=600 +export DEVICE_NUM=8 +export RANK_SIZE=8 +export MS_ASCEND_CHECK_OVERFLOW_MODE="SATURATION_MODE" + +echo "Start training..." 
+msrun --bind_core=True --worker_num=8 --local_worker_num=8 --master_port=8118 \ + --log_dir=msrun_log --join=True --cluster_time_out=300 \ + ${BASE_PATH}/../train.py --config_path=$CONFIG_FILE --coco_root=$PATH2 --mindrecord_dir=$mindrecord_dir \ + --run_distribute=True --device_num=$DEVICE_NUM --pre_trained=$PATH1 --backbone=$2 &> log.txt & diff --git a/official/cv/FasterRCNN/scripts/run_distribute_train_gpu.sh b/official/cv/FasterRCNN/scripts/run_distribute_train_gpu.sh index 8b27d1c67045df723712934d7eea82a5de73f9ff..d7af4ca64d193b2713d27dc7928443a75132dbcc 100644 --- a/official/cv/FasterRCNN/scripts/run_distribute_train_gpu.sh +++ b/official/cv/FasterRCNN/scripts/run_distribute_train_gpu.sh @@ -97,4 +97,5 @@ mpirun -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout --all --pre_trained=$PRETRAINED_PATH \ --backbone=$3 \ --coco_root=$PATH3 \ + --base_lr=0.008 \ --mindrecord_dir=$mindrecord_dir > train.log 2>&1 & \ No newline at end of file diff --git a/official/cv/FasterRCNN/scripts/run_eval_ascend.sh b/official/cv/FasterRCNN/scripts/run_eval_ascend.sh index d276546174af5532e7d89df4ac8276edf8b492fe..d9b2be113aacc44c9811fb762a30e45f2f00274e 100644 --- a/official/cv/FasterRCNN/scripts/run_eval_ascend.sh +++ b/official/cv/FasterRCNN/scripts/run_eval_ascend.sh @@ -95,6 +95,7 @@ export DEVICE_NUM=1 export RANK_SIZE=$DEVICE_NUM export DEVICE_ID=$5 export RANK_ID=0 +export MS_ASCEND_CHECK_OVERFLOW_MODE="SATURATION_MODE" if [ -d "eval" ]; then diff --git a/official/cv/FasterRCNN/scripts/run_standalone_train_ascend.sh b/official/cv/FasterRCNN/scripts/run_standalone_train_ascend.sh index 565f1c56245b75d9a9cd6ad781504c41b89f18fd..828f5133e125583a73247bc1c9c433bb6806a3ce 100644 --- a/official/cv/FasterRCNN/scripts/run_standalone_train_ascend.sh +++ b/official/cv/FasterRCNN/scripts/run_standalone_train_ascend.sh @@ -88,6 +88,7 @@ export DEVICE_NUM=1 export DEVICE_ID=$4 export RANK_ID=0 export RANK_SIZE=1 +export MS_ASCEND_CHECK_OVERFLOW_MODE="SATURATION_MODE" if [ -d "train" ]; then diff --git a/official/cv/FasterRCNN/src/FasterRcnn/bbox_assign_sample.py b/official/cv/FasterRCNN/src/FasterRcnn/bbox_assign_sample.py index a49572c6aee2e115d8bfe6dd98923189fa3cceae..9bf64569551fd9818609d6cf5009cdc200f46cc7 100644 --- a/official/cv/FasterRCNN/src/FasterRcnn/bbox_assign_sample.py +++ b/official/cv/FasterRCNN/src/FasterRcnn/bbox_assign_sample.py @@ -15,7 +15,7 @@ """FasterRcnn positive and negative sample screening for RPN.""" import numpy as np -import mindspore as ms +import mindspore import mindspore.nn as nn import mindspore.ops as ops from mindspore.common.tensor import Tensor @@ -46,7 +46,7 @@ class BboxAssignSample(nn.Cell): super(BboxAssignSample, self).__init__() cfg = config self.dtype = np.float32 - self.ms_type = ms.float32 + self.ms_type = mindspore.float32 self.batch_size = batch_size self.neg_iou_thr = Tensor(cfg.neg_iou_thr, self.ms_type) @@ -99,11 +99,11 @@ class BboxAssignSample(nn.Cell): self.check_anchor_two = Tensor(np.full((self.num_bboxes, 4), -2, dtype=self.dtype)) def construct(self, gt_bboxes_i, gt_labels_i, valid_mask, bboxes, gt_valids): - gt_bboxes_i = self.select(self.cast(self.tile(self.reshape(self.cast(gt_valids, ms.int32), - (self.num_gts, 1)), (1, 4)), ms.bool_), gt_bboxes_i, + gt_bboxes_i = self.select(self.cast(self.tile(self.reshape(self.cast(gt_valids, mindspore.int32), + (self.num_gts, 1)), (1, 4)), mindspore.bool_), gt_bboxes_i, self.check_gt_one) - bboxes = self.select(self.cast(self.tile(self.reshape(self.cast(valid_mask, ms.int32), - 
(self.num_bboxes, 1)), (1, 4)), ms.bool_), bboxes, + bboxes = self.select(self.cast(self.tile(self.reshape(self.cast(valid_mask, mindspore.int32), + (self.num_bboxes, 1)), (1, 4)), mindspore.bool_), bboxes, self.check_anchor_two) overlaps = self.iou(bboxes, gt_bboxes_i) max_overlaps_w_gt_index, max_overlaps_w_gt = self.max_gt(overlaps) @@ -115,7 +115,8 @@ class BboxAssignSample(nn.Cell): pos_sample_iou_mask = self.greaterequal(max_overlaps_w_gt, self.pos_iou_thr) assigned_gt_inds3 = self.select(pos_sample_iou_mask, - max_overlaps_w_gt_index + self.assigned_gt_ones, assigned_gt_inds2) + max_overlaps_w_gt_index.astype(mindspore.int32) + self.assigned_gt_ones, + assigned_gt_inds2) assigned_gt_inds4 = assigned_gt_inds3 for j in range(self.num_gts): max_overlaps_w_ac_j = max_overlaps_w_ac[j:j + 1:1] @@ -133,10 +134,10 @@ class BboxAssignSample(nn.Cell): pos_check_valid = self.cast(self.greater(assigned_gt_inds5, 0), self.ms_type) pos_check_valid = self.sum_inds(pos_check_valid, -1) valid_pos_index = self.less(self.range_pos_size, pos_check_valid) - pos_index = pos_index * self.reshape(self.cast(valid_pos_index, ms.int32), (self.num_expected_pos, 1)) + pos_index = pos_index * self.reshape(self.cast(valid_pos_index, mindspore.int32), (self.num_expected_pos, 1)) pos_assigned_gt_index = self.gatherND(assigned_gt_inds5, pos_index) - self.assigned_pos_ones - pos_assigned_gt_index = pos_assigned_gt_index * self.cast(valid_pos_index, ms.int32) + pos_assigned_gt_index = pos_assigned_gt_index * self.cast(valid_pos_index, mindspore.int32) pos_assigned_gt_index = self.reshape(pos_assigned_gt_index, (self.num_expected_pos, 1)) neg_index, valid_neg_index = self.random_choice_with_mask_neg(self.equal(assigned_gt_inds5, 0)) @@ -144,13 +145,7 @@ class BboxAssignSample(nn.Cell): num_pos = self.cast(self.logicalnot(valid_pos_index), self.ms_type) num_pos = self.sum_inds(num_pos, -1) unvalid_pos_index = self.less(self.range_pos_size, num_pos) - valid_neg_index = self.logicaland( - self.cast(self.concat(( - self.cast(self.check_neg_mask, ms.int32), - self.cast(unvalid_pos_index, ms.int32) - )), ms.bool_), - self.cast(valid_neg_index, ms.bool_) - ) + valid_neg_index = self.logicaland(self.concat((self.check_neg_mask, unvalid_pos_index)), valid_neg_index) pos_bboxes_ = self.gatherND(bboxes, pos_index) pos_gt_bboxes_ = self.gatherND(gt_bboxes_i, pos_assigned_gt_index) @@ -158,15 +153,15 @@ class BboxAssignSample(nn.Cell): pos_bbox_targets_ = self.bounding_box_encode(pos_bboxes_, pos_gt_bboxes_) - valid_pos_index = self.cast(valid_pos_index, ms.int32) - valid_neg_index = self.cast(valid_neg_index, ms.int32) + valid_pos_index = self.cast(valid_pos_index, mindspore.int32) + valid_neg_index = self.cast(valid_neg_index, mindspore.int32) total_index = self.concat((pos_index, neg_index)) - pos_index = self.cast(pos_index, ms.int64) + pos_index = self.cast(pos_index, mindspore.int64) bbox_targets_total = self.scatterNd(pos_index, pos_bbox_targets_, (self.num_bboxes, 4)) bbox_weights_total = self.scatterNd(pos_index, valid_pos_index, (self.num_bboxes,)) labels_total = self.scatterNd(pos_index, pos_gt_labels, (self.num_bboxes,)) total_valid_index = self.concat((valid_pos_index, valid_neg_index)) label_weights_total = self.scatterNd(total_index, total_valid_index, (self.num_bboxes,)) - return bbox_targets_total, self.cast(bbox_weights_total, ms.bool_), \ - labels_total, self.cast(label_weights_total, ms.bool_) + return bbox_targets_total, self.cast(bbox_weights_total, mindspore.bool_), \ + labels_total, 
self.cast(label_weights_total, mindspore.bool_) diff --git a/official/cv/FasterRCNN/src/FasterRcnn/bbox_assign_sample_stage2.py b/official/cv/FasterRCNN/src/FasterRcnn/bbox_assign_sample_stage2.py index 7602adcc53ae5f30bb56657a26e4c56cbbe8b5ec..942527051d3c3c903a768d98b3277a1c43f71b04 100644 --- a/official/cv/FasterRCNN/src/FasterRcnn/bbox_assign_sample_stage2.py +++ b/official/cv/FasterRCNN/src/FasterRcnn/bbox_assign_sample_stage2.py @@ -15,7 +15,7 @@ """FasterRcnn tpositive and negative sample screening for Rcnn.""" import numpy as np -import mindspore as ms +import mindspore import mindspore.nn as nn import mindspore.ops as ops from mindspore.common.tensor import Tensor @@ -46,7 +46,7 @@ class BboxAssignSampleForRcnn(nn.Cell): super(BboxAssignSampleForRcnn, self).__init__() cfg = config self.dtype = np.float32 - self.ms_type = ms.float32 + self.ms_type = mindspore.float32 self.batch_size = batch_size self.neg_iou_thr = cfg.neg_iou_thr_stage2 self.pos_iou_thr = cfg.pos_iou_thr_stage2 @@ -109,12 +109,12 @@ class BboxAssignSampleForRcnn(nn.Cell): self.scalar_min_pos_iou = Tensor(self.min_pos_iou, dtype=self.ms_type) def construct(self, gt_bboxes_i, gt_labels_i, valid_mask, bboxes, gt_valids): - gt_bboxes_i = self.select(self.cast(self.tile(self.reshape(self.cast(gt_valids, ms.int32), \ - (self.num_gts, 1)), (1, 4)), ms.bool_), \ + gt_bboxes_i = self.select(self.cast(self.tile(self.reshape(self.cast(gt_valids, mindspore.int32), \ + (self.num_gts, 1)), (1, 4)), mindspore.bool_), \ gt_bboxes_i, self.check_gt_one) - bboxes = self.select(self.cast(self.tile(self.reshape(self.cast(valid_mask, ms.int32), \ - (self.num_bboxes, 1)), (1, 4)), ms.bool_), \ - self.cast(bboxes, ms.float16), self.check_anchor_two) + bboxes = self.select(self.cast(self.tile(self.reshape(self.cast(valid_mask, mindspore.int32), \ + (self.num_bboxes, 1)), (1, 4)), mindspore.bool_), \ + bboxes, self.check_anchor_two) overlaps = self.iou(bboxes, gt_bboxes_i) @@ -129,8 +129,9 @@ class BboxAssignSampleForRcnn(nn.Cell): assigned_gt_inds2 = self.select(neg_sample_iou_mask, self.assigned_gt_zeros, self.assigned_gt_inds) pos_sample_iou_mask = self.greaterequal(max_overlaps_w_gt, self.scalar_pos_iou_thr) - assigned_gt_inds3 = self.select(pos_sample_iou_mask, \ - max_overlaps_w_gt_index + self.assigned_gt_ones, assigned_gt_inds2) + assigned_gt_inds3 = self.select(pos_sample_iou_mask, + max_overlaps_w_gt_index.astype(mindspore.int32) + self.assigned_gt_ones, + assigned_gt_inds2) for j in range(self.num_gts): max_overlaps_w_ac_j = max_overlaps_w_ac[j:j+1:1] @@ -153,10 +154,10 @@ class BboxAssignSampleForRcnn(nn.Cell): pos_check_valid = self.cast(self.greater(assigned_gt_inds5, 0), self.ms_type) pos_check_valid = self.sum_inds(pos_check_valid, -1) valid_pos_index = self.less(self.range_pos_size, pos_check_valid) - pos_index = pos_index * self.reshape(self.cast(valid_pos_index, ms.int32), (self.num_expected_pos, 1)) + pos_index = pos_index * self.reshape(self.cast(valid_pos_index, mindspore.int32), (self.num_expected_pos, 1)) num_pos = self.sum_inds(self.cast(self.logicalnot(valid_pos_index), self.ms_type), -1) - valid_pos_index = self.cast(valid_pos_index, ms.int32) + valid_pos_index = self.cast(valid_pos_index, mindspore.int32) pos_index = self.reshape(pos_index, self.reshape_shape_pos) valid_pos_index = self.reshape(valid_pos_index, self.reshape_shape_pos) pos_index = pos_index * valid_pos_index @@ -171,16 +172,10 @@ class BboxAssignSampleForRcnn(nn.Cell): neg_index, valid_neg_index = 
self.random_choice_with_mask_neg(self.equal(assigned_gt_inds5, 0)) unvalid_pos_index = self.less(self.range_pos_size, num_pos) - valid_neg_index = self.logicaland( - self.cast(self.concat(( - self.cast(self.check_neg_mask, ms.int32), - self.cast(unvalid_pos_index, ms.int32) - )), ms.bool_), - self.cast(valid_neg_index, ms.bool_) - ) + valid_neg_index = self.logicaland(self.concat((self.check_neg_mask, unvalid_pos_index)), valid_neg_index) neg_index = self.reshape(neg_index, self.reshape_shape_neg) - valid_neg_index = self.cast(valid_neg_index, ms.int32) + valid_neg_index = self.cast(valid_neg_index, mindspore.int32) valid_neg_index = self.reshape(valid_neg_index, self.reshape_shape_neg) neg_index = neg_index * valid_neg_index diff --git a/official/cv/FasterRCNN/src/FasterRcnn/faster_rcnn.py b/official/cv/FasterRCNN/src/FasterRcnn/faster_rcnn.py index 9e78ec349eef3201142dd48777de6060ad348756..551eaedd1709478d23ddf8894e4344c871239e2d 100644 --- a/official/cv/FasterRCNN/src/FasterRcnn/faster_rcnn.py +++ b/official/cv/FasterRCNN/src/FasterRcnn/faster_rcnn.py @@ -15,10 +15,9 @@ """FasterRcnn""" import numpy as np -import mindspore as ms +import mindspore import mindspore.ops as ops import mindspore.nn as nn -from mindspore import context from mindspore.ops import functional as F from mindspore.ops.primitive import constexpr from mindspore.common.tensor import Tensor @@ -63,7 +62,7 @@ class Faster_Rcnn(nn.Cell): def __init__(self, config): super(Faster_Rcnn, self).__init__() self.dtype = np.float32 - self.ms_type = ms.float32 + self.ms_type = mindspore.float32 self.train_batch_size = config.batch_size self.without_bg_loss = config.without_bg_loss self.num_classes = config.num_classes @@ -161,7 +160,7 @@ class Faster_Rcnn(nn.Cell): # Init tensor self.init_tensor(config) - self.device_type = "Ascend" if context.get_context("device_target") == "Ascend" else "Others" + self.device_type = "Ascend" if mindspore.get_context("device_target") == "Ascend" else "Others" def roi_init(self, config): """ @@ -281,16 +280,16 @@ class Faster_Rcnn(nn.Cell): labels_tuple = () mask_tuple = () if self.training: - gt_labels = self.cast(gt_labels, ms.int32) - gt_valids = self.cast(gt_valids, ms.int32) + gt_labels = self.cast(gt_labels, mindspore.int32) + gt_valids = self.cast(gt_valids, mindspore.int32) for i in range(self.train_batch_size): gt_bboxes_i = self.squeeze(gt_bboxes[i:i + 1:1, ::]) gt_labels_i = self.squeeze(gt_labels[i:i + 1:1, ::]) - gt_labels_i = self.cast(gt_labels_i, ms.uint8) + gt_labels_i = self.cast(gt_labels_i, mindspore.uint8) gt_valids_i = self.squeeze(gt_valids[i:i + 1:1, ::]) - gt_valids_i = self.cast(gt_valids_i, ms.bool_) + gt_valids_i = self.cast(gt_valids_i, mindspore.bool_) bboxes, deltas, labels, mask = self.bbox_assigner_sampler_for_rcnn(gt_bboxes_i, gt_labels_i, @@ -306,7 +305,7 @@ class Faster_Rcnn(nn.Cell): rcnn_labels = self.concat(labels_tuple) bbox_targets = ops.stop_gradient(bbox_targets) rcnn_labels = ops.stop_gradient(rcnn_labels) - rcnn_labels = self.cast(rcnn_labels, ms.int32) + rcnn_labels = self.cast(rcnn_labels, mindspore.int32) else: mask_tuple += proposal_mask bbox_targets = proposal_mask @@ -326,29 +325,29 @@ class Faster_Rcnn(nn.Cell): else: bboxes_all = bboxes_tuple[0] if self.device_type == "Ascend": - bboxes_all = self.cast(bboxes_all, ms.float16) + bboxes_all = self.cast(bboxes_all, mindspore.float16) rois = self.concat_1((self.roi_align_index_test_tensor, bboxes_all)) - rois = self.cast(rois, ms.float32) + rois = self.cast(rois, mindspore.float32) rois = 
ops.stop_gradient(rois) if self.training: roi_feats = self.roi_align(rois, - self.cast(x[0], ms.float32), - self.cast(x[1], ms.float32), - self.cast(x[2], ms.float32), - self.cast(x[3], ms.float32)) + self.cast(x[0], mindspore.float32), + self.cast(x[1], mindspore.float32), + self.cast(x[2], mindspore.float32), + self.cast(x[3], mindspore.float32)) else: roi_feats = self.roi_align_test(rois, - self.cast(x[0], ms.float32), - self.cast(x[1], ms.float32), - self.cast(x[2], ms.float32), - self.cast(x[3], ms.float32)) + self.cast(x[0], mindspore.float32), + self.cast(x[1], mindspore.float32), + self.cast(x[2], mindspore.float32), + self.cast(x[3], mindspore.float32)) roi_feats = self.cast(roi_feats, self.ms_type) rcnn_masks = self.concat(mask_tuple) rcnn_masks = ops.stop_gradient(rcnn_masks) - rcnn_mask_squeeze = self.squeeze(self.cast(rcnn_masks, ms.bool_)) + rcnn_mask_squeeze = self.squeeze(self.cast(rcnn_masks, mindspore.bool_)) rcnn_loss, rcnn_cls_loss, rcnn_reg_loss, _ = self.rcnn(roi_feats, bbox_targets, rcnn_labels, @@ -375,7 +374,7 @@ class Faster_Rcnn(nn.Cell): img_metas_all = self.split(img_metas) scores_all = self.split(scores) - mask_all = self.split(self.cast(mask_logits, ms.int32)) + mask_all = self.split(self.cast(mask_logits, mindspore.int32)) boxes_all_with_batchsize = () for i in range(self.test_batch_size): @@ -403,7 +402,7 @@ class Faster_Rcnn(nn.Cell): for i in range(self.test_batch_size): bboxes = boxes_all[i] scores = scores_all[i] - masks = self.cast(mask_all[i], ms.bool_) + masks = self.cast(mask_all[i], mindspore.bool_) res_boxes_tuple = () res_labels_tuple = () @@ -421,7 +420,7 @@ class Faster_Rcnn(nn.Cell): cls_mask = self.greater(_cls_scores, self.test_score_thresh) _mask = self.logicand(_mask_o, cls_mask) - _reg_mask = self.cast(self.tile(self.cast(_mask, ms.int32), (1, 4)), ms.bool_) + _reg_mask = self.cast(self.tile(self.cast(_mask, mindspore.int32), (1, 4)), mindspore.bool_) _bboxes = self.select(_reg_mask, _bboxes, self.test_box_zeros) _cls_scores = self.select(_mask, _cls_scores, self.test_score_zeros) @@ -506,7 +505,7 @@ def generator_img_meta(n, ori_h, ori_w, in_h, in_w): resize_scale = width_scale if width_scale < height_scale else height_scale img_metas.append([ori_h, ori_w, resize_scale, resize_scale]) - img_metas = Tensor(np.array(img_metas), ms.float32) + img_metas = Tensor(np.array(img_metas), mindspore.float32) return img_metas diff --git a/official/cv/FasterRCNN/src/FasterRcnn/fpn_neck.py b/official/cv/FasterRCNN/src/FasterRcnn/fpn_neck.py index b22da232f057ebd9a8721d84c525cb188a70b65d..a1dd2fa80c04c6e347d54d7823fb4085ec37e6cc 100644 --- a/official/cv/FasterRCNN/src/FasterRcnn/fpn_neck.py +++ b/official/cv/FasterRCNN/src/FasterRcnn/fpn_neck.py @@ -15,7 +15,7 @@ """FasterRcnn feature pyramid network.""" import numpy as np -import mindspore as ms +import mindspore import mindspore.ops as ops import mindspore.nn as nn from mindspore.common.tensor import Tensor @@ -30,7 +30,7 @@ def bias_init_zeros(shape): def _conv(in_channels, out_channels, kernel_size=3, stride=1, padding=0, pad_mode='pad'): """Conv2D wrapper.""" shape = (out_channels, in_channels, kernel_size, kernel_size) - weights = ms.common.initializer.initializer("XavierUniform", shape=shape, dtype=ms.float32).init_data() + weights = mindspore.common.initializer.initializer("XavierUniform", shape=shape, dtype=mindspore.float32).init_data() shape_bias = (out_channels,) biass = bias_init_zeros(shape_bias) return nn.Conv2d(in_channels, out_channels, diff --git 
a/official/cv/FasterRCNN/src/FasterRcnn/proposal_generator.py b/official/cv/FasterRCNN/src/FasterRcnn/proposal_generator.py index 5317ca51c35a6ba8f3ee3de144f4e0b06f4f4623..9068334fde3c15576a8a11ea99b6b6ecb52a2021 100644 --- a/official/cv/FasterRCNN/src/FasterRcnn/proposal_generator.py +++ b/official/cv/FasterRCNN/src/FasterRcnn/proposal_generator.py @@ -15,7 +15,7 @@ """FasterRcnn proposal generator.""" import numpy as np -import mindspore as ms +import mindspore import mindspore.ops as ops import mindspore.nn as nn from mindspore import Tensor @@ -103,7 +103,7 @@ class Proposal(nn.Cell): self.set_train_local(config, training=True) self.dtype = np.float32 - self.ms_type = ms.float32 + self.ms_type = mindspore.float32 self.multi_10 = Tensor(10.0, self.ms_type) @@ -183,21 +183,13 @@ class Proposal(nn.Cell): mlvl_proposals = mlvl_proposals + (proposals,) mlvl_mask = mlvl_mask + (mask_valid,) - proposals = self.concat_axis0( - tuple(self.cast(proposal, ms.int64) for proposal in mlvl_proposals) - ) - masks = self.concat_axis0( - tuple(self.cast(mask, ms.int64) for mask in mlvl_mask) - ) + proposals = self.concat_axis0(mlvl_proposals) + masks = self.concat_axis0(mlvl_mask) _, _, _, _, scores = self.split(proposals) scores = self.squeeze(scores) topk_mask = self.cast(self.topK_mask, self.ms_type) - scores_using = self.cast(self.select( - self.cast(masks, ms.bool_), - self.cast(scores, ms.bool_), - self.cast(topk_mask, ms.bool_) - ), ms.int32) + scores_using = self.select(masks, scores, topk_mask) _, topk_inds = self.topKv2(scores_using, self.max_num) diff --git a/official/cv/FasterRCNN/src/FasterRcnn/rcnn.py b/official/cv/FasterRCNN/src/FasterRcnn/rcnn.py index fa02da335afc218b02c580b5a671ce4872484f4c..be6f00d95a6428ddc318a0bc8afede11b221da29 100644 --- a/official/cv/FasterRCNN/src/FasterRcnn/rcnn.py +++ b/official/cv/FasterRCNN/src/FasterRcnn/rcnn.py @@ -15,7 +15,7 @@ """FasterRcnn Rcnn network.""" import numpy as np -import mindspore as ms +import mindspore import mindspore.ops as ops import mindspore.nn as nn from mindspore.common.tensor import Tensor @@ -27,20 +27,20 @@ class DenseNoTranpose(nn.Cell): def __init__(self, input_channels, output_channels, weight_init): super(DenseNoTranpose, self).__init__() - self.weight = Parameter(ms.common.initializer.initializer(weight_init, \ - [input_channels, output_channels], ms.float32)) - self.bias = Parameter(ms.common.initializer.initializer("zeros", \ - [output_channels], ms.float32)) + self.weight = Parameter(mindspore.common.initializer.initializer(weight_init, \ + [input_channels, output_channels], mindspore.float32)) + self.bias = Parameter(mindspore.common.initializer.initializer("zeros", \ + [output_channels], mindspore.float32)) self.matmul = ops.MatMul(transpose_b=False) self.bias_add = ops.BiasAdd() self.cast = ops.Cast() - self.device_type = "Ascend" if ms.get_context("device_target") == "Ascend" else "Others" + self.device_type = "Ascend" if mindspore.get_context("device_target") == "Ascend" else "Others" def construct(self, x): if self.device_type == "Ascend": - x = self.cast(x, ms.float16) - weight = self.cast(self.weight, ms.float16) + x = self.cast(x, mindspore.float16) + weight = self.cast(self.weight, mindspore.float16) output = self.bias_add(self.matmul(x, weight), self.bias) else: output = self.bias_add(self.matmul(x, self.weight), self.bias) @@ -78,7 +78,7 @@ class Rcnn(nn.Cell): super(Rcnn, self).__init__() cfg = config self.dtype = np.float32 - self.ms_type = ms.float32 + self.ms_type = mindspore.float32 
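
The `proposal_generator.py` hunk above removes the bool/int round-trip around `Select`: the deleted lines cast `scores` to `bool_` before selecting and then back to `int32`, which collapses every kept score to 0 or 1 before `topKv2` ranks them, while the patched form selects on the raw tensors and keeps the float magnitudes. A minimal sketch of the difference (values are illustrative, assuming MindSpore 2.x eager execution):

```python
import numpy as np
import mindspore
from mindspore import Tensor, ops

select = ops.Select()
cast = ops.Cast()
masks = Tensor(np.array([True, False, True]))                  # valid-proposal mask
scores = Tensor(np.array([0.9, 0.2, 0.6]), mindspore.float32)  # raw objectness
topk_mask = Tensor(np.array([-1.0, -1.0, -1.0]), mindspore.float32)

# Patched form: bool condition, float branches, ranking survives.
print(select(masks, scores, topk_mask))  # [ 0.9 -1.   0.6]

# Removed form: casting the branches to bool_ first loses the magnitudes.
old = cast(select(masks, cast(scores, mindspore.bool_),
                  cast(topk_mask, mindspore.bool_)), mindspore.int32)
print(old)  # [1 1 1] -- every score collapsed before top-k
```
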
self.rcnn_loss_cls_weight = Tensor(np.array(cfg.rcnn_loss_cls_weight).astype(self.dtype)) self.rcnn_loss_reg_weight = Tensor(np.array(cfg.rcnn_loss_reg_weight).astype(self.dtype)) self.rcnn_fc_out_channels = cfg.rcnn_fc_out_channels @@ -94,17 +94,17 @@ class Rcnn(nn.Cell): self.test_batch_size = cfg.test_batch_size shape_0 = (self.rcnn_fc_out_channels, representation_size) - weights_0 = ms.common.initializer.initializer("XavierUniform", shape=shape_0[::-1], \ + weights_0 = mindspore.common.initializer.initializer("XavierUniform", shape=shape_0[::-1], \ dtype=self.ms_type).init_data() shape_1 = (self.rcnn_fc_out_channels, self.rcnn_fc_out_channels) - weights_1 = ms.common.initializer.initializer("XavierUniform", shape=shape_1[::-1], \ + weights_1 = mindspore.common.initializer.initializer("XavierUniform", shape=shape_1[::-1], \ dtype=self.ms_type).init_data() self.shared_fc_0 = DenseNoTranpose(representation_size, self.rcnn_fc_out_channels, weights_0) self.shared_fc_1 = DenseNoTranpose(self.rcnn_fc_out_channels, self.rcnn_fc_out_channels, weights_1) - cls_weight = ms.common.initializer.initializer('Normal', shape=[num_classes, self.rcnn_fc_out_channels][::-1], + cls_weight = mindspore.common.initializer.initializer('Normal', shape=[num_classes, self.rcnn_fc_out_channels][::-1], dtype=self.ms_type).init_data() - reg_weight = ms.common.initializer.initializer('Normal', shape=[self.num_classes_fronted * 4, + reg_weight = mindspore.common.initializer.initializer('Normal', shape=[self.num_classes_fronted * 4, self.rcnn_fc_out_channels][::-1], dtype=self.ms_type).init_data() self.cls_scores = DenseNoTranpose(self.rcnn_fc_out_channels, num_classes, cls_weight) @@ -126,8 +126,8 @@ class Rcnn(nn.Cell): self.gather = ops.GatherNd() self.argmax = ops.ArgMaxWithValue(axis=1) - self.on_value = Tensor(1.0, ms.float32) - self.off_value = Tensor(0.0, ms.float32) + self.on_value = Tensor(1.0, mindspore.float32) + self.off_value = Tensor(0.0, mindspore.float32) self.value = Tensor(1.0, self.ms_type) self.num_bboxes = (cfg.num_expected_pos_stage2 + cfg.num_expected_neg_stage2) * batch_size @@ -151,7 +151,7 @@ class Rcnn(nn.Cell): x_reg = self.reg_scores(x) if self.training: - bbox_weights = self.cast(self.logicaland(self.greater(labels, 0), mask), ms.int32) * labels + bbox_weights = self.cast(self.logicaland(self.greater(labels, 0), mask), mindspore.int32) * labels labels = self.onehot(labels, self.num_classes, self.on_value, self.off_value) bbox_targets = self.tile(self.expandims(bbox_targets, 1), (1, self.num_classes_fronted, 1)) diff --git a/official/cv/FasterRCNN/src/FasterRcnn/roi_align.py b/official/cv/FasterRCNN/src/FasterRcnn/roi_align.py index e3c4c0490b09f85cbb00f468d0fd071b4473a06f..4fd33e28709a3c8cbfa70681c733924cf6e720af 100644 --- a/official/cv/FasterRCNN/src/FasterRcnn/roi_align.py +++ b/official/cv/FasterRCNN/src/FasterRcnn/roi_align.py @@ -15,7 +15,7 @@ """FasterRcnn ROIAlign module.""" import numpy as np -import mindspore as ms +import mindspore import mindspore.ops as ops import mindspore.nn as nn from mindspore.nn import layer as L @@ -99,7 +99,7 @@ class SingleRoIExtractor(nn.Cell): _mode_16 = False self.dtype = np.float16 if _mode_16 else np.float32 - self.ms_dtype = ms.float16 if _mode_16 else ms.float32 + self.ms_dtype = mindspore.float16 if _mode_16 else mindspore.float32 self.set_train_local(cfg, training=True) def set_train_local(self, config, training=True): @@ -158,7 +158,7 @@ class SingleRoIExtractor(nn.Cell): target_lvls = self.log2(scale / self.finest_scale + self.epslion) 
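
`_c_map_roi_levels` here implements the standard FPN routing rule: an RoI of scale s is sent to level floor(log2(s / finest_scale)), clamped to the valid range, which is why the lines that follow floor the result and cast it to `int32` before clamping. A minimal NumPy sketch of the same rule (the `finest_scale=56` default and the (x1, y1, x2, y2) box layout are assumptions, not read from this config):

```python
import numpy as np

def map_roi_levels(rois, num_levels=4, finest_scale=56, eps=1e-6):
    """Route each (x1, y1, x2, y2) RoI to an FPN level index in [0, num_levels)."""
    scale = np.sqrt((rois[:, 2] - rois[:, 0]) * (rois[:, 3] - rois[:, 1]))
    lvls = np.floor(np.log2(scale / finest_scale + eps))
    return np.clip(lvls, 0, num_levels - 1).astype(np.int32)

rois = np.array([[0, 0, 32, 32], [0, 0, 512, 512]], np.float32)
print(map_roi_levels(rois))  # [0 3]: small boxes pool from the finest map, large from the coarsest
```
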
target_lvls = ops.Floor()(target_lvls) - target_lvls = self.cast(target_lvls, ms.int32) + target_lvls = self.cast(target_lvls, mindspore.int32) target_lvls = self.clamp(target_lvls, self.zeros, self.max_levels) return target_lvls @@ -168,11 +168,11 @@ class SingleRoIExtractor(nn.Cell): res = self.res_ target_lvls = self._c_map_roi_levels(rois) for i in range(self.num_levels): - mask = self.equal(target_lvls, ops.ScalarToTensor()(i, ms.int32)) + mask = self.equal(target_lvls, ops.ScalarToTensor()(i, mindspore.int32)) mask = ops.Reshape()(mask, (-1, 1, 1, 1)) roi_feats_t = self.roi_layers[i](feats[i], rois) - mask = self.cast(ops.Tile()(self.cast(mask, ms.int32),\ - (1, 256, self.out_size, self.out_size)), ms.bool_) + mask = self.cast(ops.Tile()(self.cast(mask, mindspore.int32),\ + (1, 256, self.out_size, self.out_size)), mindspore.bool_) res = self.select(mask, roi_feats_t, res) return res diff --git a/official/cv/FasterRCNN/src/FasterRcnn/rpn.py b/official/cv/FasterRCNN/src/FasterRcnn/rpn.py index eea59e0c30d342d41f9e27dd88eee8f7175ff686..908ca651288e14ccdc03e042af01a67404098fd6 100644 --- a/official/cv/FasterRCNN/src/FasterRcnn/rpn.py +++ b/official/cv/FasterRCNN/src/FasterRcnn/rpn.py @@ -14,7 +14,7 @@ # ============================================================================ """RPN for fasterRCNN""" import numpy as np -import mindspore as ms +import mindspore import mindspore.ops as ops import mindspore.nn as nn from mindspore import Tensor @@ -99,8 +99,8 @@ class RPN(nn.Cell): super(RPN, self).__init__() cfg_rpn = config self.dtype = np.float32 - self.ms_type = ms.float32 - self.device_type = "Ascend" if ms.get_context("device_target") == "Ascend" else "Others" + self.ms_type = mindspore.float32 + self.device_type = "Ascend" if mindspore.get_context("device_target") == "Ascend" else "Others" self.num_bboxes = cfg_rpn.num_bboxes self.slice_index = () self.feature_anchor_shape = () @@ -115,7 +115,7 @@ class RPN(nn.Cell): self.batch_size = batch_size self.test_batch_size = cfg_rpn.test_batch_size self.num_layers = 5 - self.real_ratio = ms.Tensor(np.ones((1, 1), self.dtype)) + self.real_ratio = mindspore.Tensor(np.ones((1, 1), self.dtype)) self.rpn_convs_list = nn.layer.CellList(self._make_rpn_layer(self.num_layers, in_channels, feat_channels, num_anchors, cls_out_channels)) @@ -124,7 +124,7 @@ class RPN(nn.Cell): self.reshape = ops.Reshape() self.concat = ops.Concat(axis=0) self.fill = ops.Fill() - self.placeh1 = ms.Tensor(np.ones((1,), self.dtype)) + self.placeh1 = mindspore.Tensor(np.ones((1,), self.dtype)) self.trans_shape = (0, 2, 3, 1) @@ -143,9 +143,9 @@ class RPN(nn.Cell): self.cast = ops.Cast() self.tile = ops.Tile() self.zeros_like = ops.ZerosLike() - self.loss = ms.Tensor(np.zeros((1,), self.dtype)) - self.clsloss = ms.Tensor(np.zeros((1,), self.dtype)) - self.regloss = ms.Tensor(np.zeros((1,), self.dtype)) + self.loss = mindspore.Tensor(np.zeros((1,), self.dtype)) + self.clsloss = mindspore.Tensor(np.zeros((1,), self.dtype)) + self.regloss = mindspore.Tensor(np.zeros((1,), self.dtype)) def _make_rpn_layer(self, num_layers, in_channels, feat_channels, num_anchors, cls_out_channels): """ @@ -165,25 +165,25 @@ class RPN(nn.Cell): shp_weight_conv = (feat_channels, in_channels, 3, 3) shp_bias_conv = (feat_channels,) - weight_conv = ms.common.initializer.initializer('Normal', shape=shp_weight_conv, dtype=self.ms_type).init_data() - bias_conv = ms.common.initializer.initializer(0, shape=shp_bias_conv, dtype=self.ms_type).init_data() + weight_conv = 
mindspore.common.initializer.initializer('Normal', shape=shp_weight_conv, dtype=self.ms_type).init_data() + bias_conv = mindspore.common.initializer.initializer(0, shape=shp_bias_conv, dtype=self.ms_type).init_data() shp_weight_cls = (num_anchors * cls_out_channels, feat_channels, 1, 1) shp_bias_cls = (num_anchors * cls_out_channels,) - weight_cls = ms.common.initializer.initializer('Normal', shape=shp_weight_cls, dtype=self.ms_type).init_data() - bias_cls = ms.common.initializer.initializer(0, shape=shp_bias_cls, dtype=self.ms_type).init_data() + weight_cls = mindspore.common.initializer.initializer('Normal', shape=shp_weight_cls, dtype=self.ms_type).init_data() + bias_cls = mindspore.common.initializer.initializer(0, shape=shp_bias_cls, dtype=self.ms_type).init_data() shp_weight_reg = (num_anchors * 4, feat_channels, 1, 1) shp_bias_reg = (num_anchors * 4,) - weight_reg = ms.common.initializer.initializer('Normal', shape=shp_weight_reg, dtype=self.ms_type).init_data() - bias_reg = ms.common.initializer.initializer(0, shape=shp_bias_reg, dtype=self.ms_type).init_data() + weight_reg = mindspore.common.initializer.initializer('Normal', shape=shp_weight_reg, dtype=self.ms_type).init_data() + bias_reg = mindspore.common.initializer.initializer(0, shape=shp_bias_reg, dtype=self.ms_type).init_data() for i in range(num_layers): rpn_reg_cls_block = RpnRegClsBlock(in_channels, feat_channels, num_anchors, cls_out_channels, \ weight_conv, bias_conv, weight_cls, \ bias_cls, weight_reg, bias_reg) if self.device_type == "Ascend": - rpn_reg_cls_block.to_float(ms.float16) + rpn_reg_cls_block.to_float(mindspore.float16) rpn_layer.append(rpn_reg_cls_block) for i in range(1, num_layers): @@ -235,7 +235,7 @@ class RPN(nn.Cell): for j in range(self.num_layers): res = self.cast(self.CheckValid(anchor_list[j], self.squeeze(img_metas[i:i + 1:1, ::])), - ms.int32) + mindspore.int32) multi_level_flags = multi_level_flags + (res,) anchor_list_tuple = anchor_list_tuple + (anchor_list[j],) @@ -249,7 +249,7 @@ class RPN(nn.Cell): bbox_target, bbox_weight, label, label_weight = self.get_targets(gt_bboxes_i, gt_labels_i, self.cast(valid_flag_list, - ms.bool_), + mindspore.bool_), anchor_using_list, gt_valids_i) bbox_target = self.cast(bbox_target, self.ms_type) diff --git a/official/cv/FasterRCNN/src/convert_checkpoint.py b/official/cv/FasterRCNN/src/convert_checkpoint.py index cadff45269f39f4adc058e492b0520606f653985..abf0fb8ca35a1b8614045c44c1c1c55e0aa19640 100644 --- a/official/cv/FasterRCNN/src/convert_checkpoint.py +++ b/official/cv/FasterRCNN/src/convert_checkpoint.py @@ -15,7 +15,7 @@ """ convert pretrain model to faster_rcnn backbone pretrain model """ -import mindspore as ms +import mindspore from mindspore.common.parameter import Parameter from mindspore.common.tensor import Tensor from model_utils.config import config @@ -32,7 +32,7 @@ def load_weights(model_path, use_fp16_weight): Returns: parameter list(list): pretrain model weight list. 
""" - ms_ckpt = ms.load_checkpoint(model_path) + ms_ckpt = mindspore.load_checkpoint(model_path) weights = {} for msname in ms_ckpt: if msname.startswith("layer") or msname.startswith("conv1") or msname.startswith("bn"): @@ -45,9 +45,9 @@ def load_weights(model_path, use_fp16_weight): param_name = param_name.replace("down_sample_layer.1", "bn_down_sample") weights[param_name] = ms_ckpt[msname].data.asnumpy() if use_fp16_weight: - dtype = ms.float16 + dtype = mindspore.float16 else: - dtype = ms.float32 + dtype = mindspore.float32 parameter_dict = {} for name in weights: parameter_dict[name] = Parameter(Tensor(weights[name], dtype), name=name) @@ -58,4 +58,4 @@ def load_weights(model_path, use_fp16_weight): if __name__ == "__main__": parameter_list = load_weights(config.ckpt_file, use_fp16_weight=False) - ms.save_checkpoint(parameter_list, "backbone.ckpt") + mindspore.save_checkpoint(parameter_list, "backbone.ckpt") diff --git a/official/cv/FasterRCNN/src/dataset.py b/official/cv/FasterRCNN/src/dataset.py index 21a058d4de975df41dabef2bcb6be56a9f04fef5..e3acdaf2bcfaa2c124bd243b9c73fc998ac17592 100644 --- a/official/cv/FasterRCNN/src/dataset.py +++ b/official/cv/FasterRCNN/src/dataset.py @@ -21,7 +21,7 @@ import numpy as np from numpy import random import cv2 -import mindspore as ms +import mindspore import mindspore.dataset as de from mindspore.mindrecord import FileWriter @@ -553,7 +553,7 @@ def create_fasterrcnn_dataset(config, mindrecord_file, batch_size=2, device_num= de.config.set_prefetch_size(8) ds = de.MindDataset(mindrecord_file, columns_list=["image", "annotation"], num_shards=device_num, shard_id=rank_id, num_parallel_workers=4, shuffle=is_training) - decode = ms.dataset.vision.Decode() + decode = mindspore.dataset.vision.Decode() ds = ds.map(input_columns=["image"], operations=decode) compose_map_func = (lambda image, annotation: preprocess_fn(image, annotation, is_training, config=config)) diff --git a/official/cv/FasterRCNN/src/detecteval.py b/official/cv/FasterRCNN/src/detecteval.py index a6766af9712fc1e54b0fe6fadbee1b5b010d6635..63d9b21a33a7ddc38d5a35d4350311ec3f1bccc5 100644 --- a/official/cv/FasterRCNN/src/detecteval.py +++ b/official/cv/FasterRCNN/src/detecteval.py @@ -499,8 +499,8 @@ class DetectEval(COCOeval): assert (tps.shape[0]) == 1 assert (fps.shape[0]) == 1 - tp_sum = np.cumsum(tps, axis=1).astype(dtype=np.float) - fp_sum = np.cumsum(fps, axis=1).astype(dtype=np.float) + tp_sum = np.cumsum(tps, axis=1).astype(dtype=np.float_) + fp_sum = np.cumsum(fps, axis=1).astype(dtype=np.float_) ids = catIds[k0] label = labels[ids] diff --git a/official/cv/FasterRCNN/src/eval_callback.py b/official/cv/FasterRCNN/src/eval_callback.py index 00ea880bc91c2b5ed58e00a9994c56a53eb97b2a..fbf0b1717742d8529e3534aa5967bc3a1e1ee8ec 100644 --- a/official/cv/FasterRCNN/src/eval_callback.py +++ b/official/cv/FasterRCNN/src/eval_callback.py @@ -15,7 +15,7 @@ import os import shutil -import mindspore as ms +import mindspore from mindspore.train.callback import Callback @@ -71,7 +71,7 @@ class EvalCallBack(Callback): shutil.rmtree(self.best_ckpt_path) os.mkdir(self.best_ckpt_path) - ms.save_checkpoint(cb_params.train_network, os.path.join(self.best_ckpt_path, "best.ckpt")) + mindspore.save_checkpoint(cb_params.train_network, os.path.join(self.best_ckpt_path, "best.ckpt")) print("update best result: {} in the {} th epoch".format(self.best_res, self.best_epoch), flush=True) diff --git a/official/cv/FasterRCNN/src/eval_utils.py b/official/cv/FasterRCNN/src/eval_utils.py index 
8c0158098d13bfcccce2bac96daa512ad3b947df..0705134be15451c4f75fdd663e14b85565a1cd27 100644 --- a/official/cv/FasterRCNN/src/eval_utils.py +++ b/official/cv/FasterRCNN/src/eval_utils.py @@ -21,7 +21,7 @@ import numpy as np from pycocotools.coco import COCO from pycocotools.cocoeval import COCOeval -import mindspore as ms +import mindspore from mindspore.common import Parameter from src.dataset import data_to_mindrecord_byte_image, create_fasterrcnn_dataset, parse_json_annos_from_txt from src.util import bbox2result_1image, results2json @@ -55,17 +55,17 @@ def apply_eval(net, config, dataset_path, ckpt_path, anno_path): raise RuntimeError("CheckPoint file {} is not valid.".format(ckpt_path)) ds = create_fasterrcnn_dataset(config, dataset_path, batch_size=config.test_batch_size, is_training=False) - param_dict = ms.load_checkpoint(ckpt_path) + param_dict = mindspore.load_checkpoint(ckpt_path) if config.device_target == "GPU": for key, value in param_dict.items(): tensor = value.asnumpy().astype(np.float32) param_dict[key] = Parameter(tensor, key) - ms.load_param_into_net(net, param_dict) + mindspore.load_param_into_net(net, param_dict) net.set_train(False) - device_type = "Ascend" if ms.get_context("device_target") == "Ascend" else "Others" + device_type = "Ascend" if mindspore.get_context("device_target") == "Ascend" else "Others" if device_type == "Ascend": - net.to_float(ms.float16) + net.to_float(mindspore.float16) eval_iter = 0 total = ds.get_dataset_size() diff --git a/official/cv/FasterRCNN/src/model_utils/moxing_adapter.py b/official/cv/FasterRCNN/src/model_utils/moxing_adapter.py index 830d19a6fc99de8d602703971d5ac5b24e060d11..9c6d88e5d6e9831a21d75dc52fcc166584f8e61e 100644 --- a/official/cv/FasterRCNN/src/model_utils/moxing_adapter.py +++ b/official/cv/FasterRCNN/src/model_utils/moxing_adapter.py @@ -17,7 +17,7 @@ import os import functools -from mindspore import context +import mindspore from mindspore.profiler import Profiler from .config import config @@ -93,7 +93,7 @@ def moxing_wrapper(pre_process=None, post_process=None): sync_data(config.train_url, config.output_path) print("Workspace downloaded: ", os.listdir(config.output_path)) - context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) + mindspore.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) config.device_num = get_device_num() config.device_id = get_device_id() if not os.path.exists(config.output_path): diff --git a/official/cv/FasterRCNN/src/network_define.py b/official/cv/FasterRCNN/src/network_define.py index 9638c6b70f61edc04e0b29e824e5b4c2b56f48b1..c2fcf55dfd2b04d911a3e20631d63783af816ee8 100644 --- a/official/cv/FasterRCNN/src/network_define.py +++ b/official/cv/FasterRCNN/src/network_define.py @@ -15,7 +15,7 @@ """FasterRcnn training network wrapper.""" import time -import mindspore as ms +import mindspore import mindspore.ops as ops import mindspore.nn as nn from mindspore.train.callback import Callback @@ -136,7 +136,7 @@ class TrainOneStepCell(nn.TrainOneStepWithLossScaleCell): def __init__(self, network, optimizer, scale_sense=1, grad_clip=False): if isinstance(scale_sense, (int, float)): - scale_sense = ms.Tensor(scale_sense, ms.float32) + scale_sense = mindspore.Tensor(scale_sense, mindspore.float32) super(TrainOneStepCell, self).__init__(network, optimizer, scale_sense) self.grad_clip = grad_clip diff --git a/official/cv/FasterRCNN/src/quick_start.py b/official/cv/FasterRCNN/src/quick_start.py index 
9574fbaf9a30b25910ab1952df7f3de39e02c2c9..ca3ce2bd7dde9e564d0a898ed177c28bee7a202e 100644 --- a/official/cv/FasterRCNN/src/quick_start.py +++ b/official/cv/FasterRCNN/src/quick_start.py @@ -19,7 +19,7 @@ import cv2 import numpy as np from tqdm import tqdm -import mindspore as ms +import mindspore from mindspore.common.tensor import Tensor import mindspore.ops.operations as P @@ -27,7 +27,7 @@ from src.model_utils.config import config from src.maskrcnn.mask_rcnn_r50 import Mask_Rcnn_Resnet50 from src.model_utils.device_adapter import get_device_id random.seed(1) -ms.set_context(mode=ms.GRAPH_MODE, device_target=config.device_target, device_id=get_device_id()) +mindspore.set_context(mode=0, device_target=config.device_target, device_id=get_device_id()) def rescale_with_tuple(img, scale): h, w = img.shape[:2] @@ -118,8 +118,8 @@ def save_result(img, boxes, labels, img_metas_, save_name): def det(): net = Mask_Rcnn_Resnet50(config) - param_dict = ms.load_checkpoint(config.ckpt_path) - ms.load_param_into_net(net, param_dict) + param_dict = mindspore.load_checkpoint(config.ckpt_path) + mindspore.load_param_into_net(net, param_dict) net.set_train(False) image_list = os.listdir(config.image_folder) max_num = config.num_gts diff --git a/official/cv/FasterRCNN/train.py b/official/cv/FasterRCNN/train.py index 0bf8196579793b2f3c9563a9c92ef2fb91e69bea..7556a7a74139455d3f592bc4a7558b8d331bef97 100644 --- a/official/cv/FasterRCNN/train.py +++ b/official/cv/FasterRCNN/train.py @@ -19,14 +19,14 @@ import os import time from pprint import pprint import numpy as np -import mindspore as ms +import mindspore import mindspore.ops as ops import mindspore.nn as nn from mindspore import Tensor, Parameter, ParameterTuple from mindspore.communication.management import init, get_rank, get_group_size from mindspore.train.callback import CheckpointConfig, ModelCheckpoint, TimeMonitor from mindspore.train import Model -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.nn import SGD, Adam from mindspore.common import set_seed from mindspore.train.callback import SummaryCollector @@ -64,7 +64,7 @@ class TrainOneStepCellCPU(nn.Cell): self.optimizer = optimizer self.grad = ops.GradOperation(get_by_list=True, sens_param=True) - self.sens = Tensor([sens,], ms.float32) + self.sens = Tensor([sens,], mindspore.float32) self.reduce_flag = reduce_flag if reduce_flag: self.grad_reducer = DistributedGradReducer(optimizer.parameters, mean, degree) @@ -146,14 +146,14 @@ def load_ckpt_to_network(net): "rcnn.reg_scores.bias", "accum.rcnn.cls_scores.weight", "accum.rcnn.cls_scores.bias", "accum.rcnn.reg_scores.weight", "accum.rcnn.reg_scores.bias" ] - param_dict = ms.load_checkpoint(load_path, choice_func=lambda x: not x.startswith(tuple(param_not_load))) + param_dict = mindspore.load_checkpoint(load_path, choice_func=lambda x: not x.startswith(tuple(param_not_load))) for key, val in param_dict.items(): # Correct previous misspellings key = key.replace("ncek", "neck") new_param[key] = val else: print(f"\n[{rank}]", "===> Loading from checkpoint:", load_path) - param_dict = ms.load_checkpoint(load_path) + param_dict = mindspore.load_checkpoint(load_path) key_mapping = {'down_sample_layer.1.beta': 'bn_down_sample.beta', 'down_sample_layer.1.gamma': 'bn_down_sample.gamma', 'down_sample_layer.0.weight': 'conv_down_sample.weight', @@ -182,7 +182,7 @@ def load_ckpt_to_network(net): new_param = param_dict try: - ms.load_param_into_net(net, new_param) + mindspore.load_param_into_net(net, new_param) 
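
The `load_ckpt_to_network` hunk touched here only swaps the `ms` alias for `mindspore`; the key-rewriting logic around it is unchanged. For context, this is the remapping the function applies before `load_param_into_net`. The mapping entries below are copied from the surrounding diff, while the single-pass loop is a simplified sketch rather than the function's exact control flow:

```python
# Sketch of the legacy-key remapping done before load_param_into_net.
key_mapping = {
    'down_sample_layer.1.beta': 'bn_down_sample.beta',
    'down_sample_layer.1.gamma': 'bn_down_sample.gamma',
    'down_sample_layer.0.weight': 'conv_down_sample.weight',
}

def remap_legacy_keys(param_dict):
    new_param = {}
    for key, val in param_dict.items():
        key = key.replace("ncek", "neck")  # corrects a historical misspelling
        for old, new in key_mapping.items():
            key = key.replace(old, new)
        new_param[key] = val
    return new_param
```
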
except RuntimeError as ex: ex = str(ex) print("Traceback:\n", ex, flush=True) @@ -196,10 +196,10 @@ def load_ckpt_to_network(net): def set_ascend_max_device_memory(): """Set the maximum memory on 910B used by MindSpore""" - if ms.get_context("enable_ge") and ms.get_context("mode") == ms.GRAPH_MODE and \ + if mindspore.get_context("enable_ge") and mindspore.get_context("mode") == 0 and \ hasattr(config, "max_device_memory"): print("[WARNING] When encountering a memory shortage situation in 910B, reduce the max_device_memory.") - ms.set_context(max_device_memory=config.max_device_memory) + mindspore.set_context(max_device_memory=config.max_device_memory) @moxing_wrapper(pre_process=modelarts_pre_process) @@ -213,10 +213,10 @@ def train_fasterrcnn(): net = net.set_train() net = load_ckpt_to_network(net) - device_type = "Ascend" if ms.get_context("device_target") == "Ascend" else "Others" + device_type = "Ascend" if mindspore.get_context("device_target") == "Ascend" else "Others" print(f"\n[{rank}]", "===> Device type:", device_type, "\n") if device_type == "Ascend": - net.to_float(ms.float16) + net.to_float(mindspore.float16) # single card, original base_lr is for 8 cards if not config.run_distribute: @@ -227,9 +227,9 @@ def train_fasterrcnn(): if config.lr_type.lower() not in ("dynamic", "multistep"): raise ValueError("Optimize type should be 'dynamic' or 'dynamic'") if config.lr_type.lower() == "dynamic": - lr = Tensor(dynamic_lr(config, dataset_size), ms.float32) + lr = Tensor(dynamic_lr(config, dataset_size), mindspore.float32) else: - lr = Tensor(multistep_lr(config, dataset_size), ms.float32) + lr = Tensor(multistep_lr(config, dataset_size), mindspore.float32) if config.opt_type.lower() not in ("sgd", "adam"): raise ValueError("Optimize type should be 'SGD' or 'Adam'") @@ -287,19 +287,21 @@ def train_fasterrcnn(): if __name__ == '__main__': set_seed(1) - ms.set_context(mode=ms.GRAPH_MODE, device_target=config.device_target, device_id=get_device_id()) + mindspore.set_context(mode=0, device_target=config.device_target, device_id=get_device_id(), + ascend_config={"ge_options": {"global": {"ge.exec.memoryOptimizationPolicy": ""}}}, + jit_config={"jit_level": "O2"}) set_ascend_max_device_memory() local_path = '/'.join(os.path.realpath(__file__).split('/')[:-1]) summary_dir = local_path + "/train/summary/" if config.device_target == "GPU": - ms.set_context(enable_graph_kernel=True) + mindspore.set_context(enable_graph_kernel=True) if config.run_distribute: init() rank = get_rank() device_num = get_group_size() - ms.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, - gradients_mean=True) + mindspore.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, + gradients_mean=True) summary_dir += "thread_num_" + str(rank) + "/" else: rank = 0 diff --git a/official/cv/Inception/inceptionv3/README.md b/official/cv/Inception/inceptionv3/README.md index 75542628c67d183cef872a721202741c591050c2..e710f3a29cfa3f560b14cfa2ca641ee68a9e43c4 100644 --- a/official/cv/Inception/inceptionv3/README.md +++ b/official/cv/Inception/inceptionv3/README.md @@ -332,7 +332,7 @@ bash scripts/run_standalone_train_cpu.sh DATA_PATH ### Result -Training result will be stored in the example path. Checkpoints will be stored at `./ckpt` by default, and training log will be redirected to `./log.txt` like followings. +Training result will be stored in the example path. 
Checkpoints will be stored at `./ckpt` by default, and training log will be redirected to `./log.txt` like following. #### Ascend @@ -391,7 +391,7 @@ You can start training using python or shell scripts. The usage of shell scripts ### Result -Evaluation result will be stored in the example path, you can find result like the followings in `eval.log`. +Evaluation result will be stored in the example path, you can find result like the following in `eval.log`. ```python metric: {'Loss': 1.778, 'Top1-Acc':0.788, 'Top5-Acc':0.942} diff --git a/official/cv/Inception/inceptionv3/eval.py b/official/cv/Inception/inceptionv3/eval.py index 35e0a436b08b423e9f5d70e46e855afe207f6ee4..508a0cbf3014e9ac43be5d92fce86372b2851d40 100644 --- a/official/cv/Inception/inceptionv3/eval.py +++ b/official/cv/Inception/inceptionv3/eval.py @@ -24,8 +24,8 @@ from src.dataset import create_dataset_imagenet, create_dataset_cifar10 from src.inception_v3 import InceptionV3 from src.loss import CrossEntropy_Val +import mindspore import mindspore.nn as nn -from mindspore import context from mindspore.train.model import Model from mindspore.train.serialization import load_checkpoint, load_param_into_net @@ -98,10 +98,10 @@ def eval_inceptionv3(): if config.platform == 'Ascend': device_id = int(os.getenv('DEVICE_ID')) - context.set_context(device_id=device_id) + mindspore.set_context(device_id=device_id) create_dataset = DS_DICT[config.ds_type] - context.set_context(mode=context.GRAPH_MODE, device_target=config.platform) + mindspore.set_context(mode=0, device_target=config.platform) net = InceptionV3(num_classes=config.num_classes, is_training=False) ckpt = load_checkpoint(config.checkpoint) load_param_into_net(net, ckpt) diff --git a/official/cv/Inception/inceptionv3/export.py b/official/cv/Inception/inceptionv3/export.py index 31f09ec2407303239445dbb0e89847f559fff4b2..622d884d935fe1d008918a07cca840b4200ee395 100644 --- a/official/cv/Inception/inceptionv3/export.py +++ b/official/cv/Inception/inceptionv3/export.py @@ -20,15 +20,15 @@ from src.model_utils.moxing_adapter import moxing_wrapper from src.model_utils.device_adapter import get_device_id from src.inception_v3 import InceptionV3 -import mindspore as ms -from mindspore import Tensor, load_checkpoint, load_param_into_net, export, context +import mindspore +from mindspore import Tensor, load_checkpoint, load_param_into_net, export config.batch_size = 1 -context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target) +mindspore.set_context(mode=0, device_target=config.device_target) if config.device_target == "Ascend": - context.set_context(device_id=get_device_id()) + mindspore.set_context(device_id=get_device_id()) def modelarts_process(): pass @@ -41,7 +41,7 @@ def export_inceptionv3(): load_param_into_net(net, param_dict) input_arr = Tensor(np.random.uniform(0.0, 1.0, size=[config.batch_size, 3, config.width, \ - config.height]), ms.float32) + config.height]), mindspore.float32) export(net, input_arr, file_name=config.file_name, file_format=config.file_format) if __name__ == '__main__': diff --git a/official/cv/Inception/inceptionv3/modelarts/train_start.py b/official/cv/Inception/inceptionv3/modelarts/train_start.py index b4db0ea4747b476260f069cb8c2d0f7349b2ae44..453519ecf0feae9ad507a13e1a304247a16dc9b7 100644 --- a/official/cv/Inception/inceptionv3/modelarts/train_start.py +++ b/official/cv/Inception/inceptionv3/modelarts/train_start.py @@ -18,10 +18,9 @@ import argparse import glob import moxing as mox import numpy as np -import mindspore as ms 
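
Every file in this patch follows the same import migration seen in this hunk: drop the `ms` alias and the `mindspore.context` module in favor of top-level `mindspore` entry points, and spell `GRAPH_MODE` as its numeric value. A minimal before/after sketch (assuming a MindSpore version that exports `set_context` at the top level; `device_target` is set to CPU here only so the snippet runs anywhere):

```python
# Old style, removed throughout this patch:
#   import mindspore as ms
#   from mindspore import context
#   context.set_context(mode=context.GRAPH_MODE, device_target="CPU")
#   x = ms.Tensor([1.0], ms.float32)

# New style, as applied in the hunks above:
import mindspore
from mindspore import Tensor

mindspore.set_context(mode=0, device_target="CPU")  # 0 == GRAPH_MODE
x = Tensor([1.0], mindspore.float32)
```
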
+import mindspore from mindspore import Model from mindspore import Tensor -from mindspore import context from mindspore.common import set_seed from mindspore.common.initializer import XavierUniform, initializer from mindspore.communication import init, get_rank, get_group_size @@ -75,7 +74,7 @@ def frozen_to_air(network, args): param_dict_t = load_checkpoint(args.get("ckpt_file")) load_param_into_net(network, param_dict_t) input_arr = Tensor(np.random.uniform(0.0, 1.0, size=[args.get("batch_size"), 3, args.get("width"), \ - args.get("height")]), ms.float32) + args.get("height")]), mindspore.float32) export(network, input_arr, file_name=args.get("file_name"), file_format=args.get("file_format")) @@ -95,11 +94,11 @@ if __name__ == '__main__': config.dataset_path = os.path.join(config.dataset_path, "train") if config.platform == "GPU": - context.set_context(enable_graph_kernel=True) + mindspore.set_context(enable_graph_kernel=True) - context.set_context(mode=context.GRAPH_MODE, device_target=config.platform, save_graphs=False) + mindspore.set_context(mode=0, device_target=config.platform, save_graphs=False) if os.getenv('DEVICE_ID', "not_set").isdigit(): - context.set_context(device_id=int(os.getenv('DEVICE_ID'))) + mindspore.set_context(device_id=int(os.getenv('DEVICE_ID'))) # init distributed if config.is_distributed: @@ -107,7 +106,7 @@ if __name__ == '__main__': config.rank = get_rank() config.group_size = get_group_size() parallel_mode = ParallelMode.DATA_PARALLEL - context.set_auto_parallel_context(parallel_mode=parallel_mode, device_num=config.group_size, + mindspore.set_auto_parallel_context(parallel_mode=parallel_mode, device_num=config.group_size, gradients_mean=True) else: config.rank = 0 diff --git a/official/cv/Inception/inceptionv3/src/model_utils/moxing_adapter.py b/official/cv/Inception/inceptionv3/src/model_utils/moxing_adapter.py index 830d19a6fc99de8d602703971d5ac5b24e060d11..9c6d88e5d6e9831a21d75dc52fcc166584f8e61e 100644 --- a/official/cv/Inception/inceptionv3/src/model_utils/moxing_adapter.py +++ b/official/cv/Inception/inceptionv3/src/model_utils/moxing_adapter.py @@ -17,7 +17,7 @@ import os import functools -from mindspore import context +import mindspore from mindspore.profiler import Profiler from .config import config @@ -93,7 +93,7 @@ def moxing_wrapper(pre_process=None, post_process=None): sync_data(config.train_url, config.output_path) print("Workspace downloaded: ", os.listdir(config.output_path)) - context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) + mindspore.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) config.device_num = get_device_num() config.device_id = get_device_id() if not os.path.exists(config.output_path): diff --git a/official/cv/Inception/inceptionv3/train.py b/official/cv/Inception/inceptionv3/train.py index 9b2fe4e96d2bb10bb203b89e35c0532adef638c2..7467232515c1412618a9645c11b20bbd7973776e 100644 --- a/official/cv/Inception/inceptionv3/train.py +++ b/official/cv/Inception/inceptionv3/train.py @@ -24,11 +24,10 @@ from src.inception_v3 import InceptionV3 from src.lr_generator import get_lr from src.loss import CrossEntropy -import mindspore as ms +import mindspore import mindspore.log as logger from mindspore import Tensor -from mindspore import context -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.communication.management import init, get_rank, get_group_size from mindspore.nn.optim.rmsprop import RMSProp from 
mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor @@ -103,10 +102,10 @@ def modelarts_pre_process(): def set_ascend_max_device_memory(): - if ms.get_context("enable_ge") and ms.get_context("mode") == ms.GRAPH_MODE and \ + if mindspore.get_context("enable_ge") and mindspore.get_context("mode") == 0 and \ hasattr(config, "max_device_memory"): logger.warning("When encountering a memory shortage situation in 1980B, reduce the max_device_memory.") - ms.set_context(max_device_memory=config.max_device_memory) + mindspore.set_context(max_device_memory=config.max_device_memory) @moxing_wrapper(pre_process=modelarts_pre_process) @@ -114,11 +113,11 @@ def train_inceptionv3(): create_dataset = DS_DICT[config.ds_type] if config.platform == "GPU": - context.set_context(enable_graph_kernel=True) + mindspore.set_context(enable_graph_kernel=True) - context.set_context(mode=context.GRAPH_MODE, device_target=config.platform, save_graphs=False) + mindspore.set_context(mode=0, device_target=config.platform, save_graphs=False) if os.getenv('DEVICE_ID', "not_set").isdigit(): - context.set_context(device_id=int(os.getenv('DEVICE_ID'))) + mindspore.set_context(device_id=int(os.getenv('DEVICE_ID'))) set_ascend_max_device_memory() # init distributed if config.is_distributed: @@ -126,7 +125,7 @@ def train_inceptionv3(): config.rank = get_rank() config.group_size = get_group_size() parallel_mode = ParallelMode.DATA_PARALLEL - context.set_auto_parallel_context(parallel_mode=parallel_mode, device_num=config.group_size, + mindspore.set_auto_parallel_context(parallel_mode=parallel_mode, device_num=config.group_size, gradients_mean=True) else: config.rank = 0 diff --git a/official/cv/Inception/inceptionv4/eval.py b/official/cv/Inception/inceptionv4/eval.py index af369ffe2b4354751d42a86b60be60e01ffd6806..5387448991beb65f6bf4115ad38202182a2be7b0 100644 --- a/official/cv/Inception/inceptionv4/eval.py +++ b/official/cv/Inception/inceptionv4/eval.py @@ -22,8 +22,8 @@ from src.model_utils.device_adapter import get_device_id, get_device_num from src.dataset import create_dataset_imagenet, create_dataset_cifar10 from src.inceptionv4 import Inceptionv4 +import mindspore import mindspore.nn as nn -from mindspore import context from mindspore.nn.loss import SoftmaxCrossEntropyWithLogits from mindspore.train.model import Model from mindspore.train.serialization import load_checkpoint, load_param_into_net @@ -96,11 +96,11 @@ def inception_v4_eval(): if config.platform == 'Ascend': device_id = int(os.getenv('DEVICE_ID', '0')) - context.set_context(device_id=device_id) + mindspore.set_context(device_id=device_id) create_dataset = DS_DICT[config.ds_type] - context.set_context(mode=context.GRAPH_MODE, device_target=config.platform) + mindspore.set_context(mode=0, device_target=config.platform) net = Inceptionv4(classes=config.num_classes) ckpt = load_checkpoint(config.checkpoint_path) load_param_into_net(net, ckpt) diff --git a/official/cv/Inception/inceptionv4/export.py b/official/cv/Inception/inceptionv4/export.py index 5557e035e45080ec93f6e1b11c7126fc1dbfa8e7..825ea99c3cc5dc76ae0bc8fe02d39a68e6934eba 100644 --- a/official/cv/Inception/inceptionv4/export.py +++ b/official/cv/Inception/inceptionv4/export.py @@ -20,16 +20,16 @@ from src.model_utils.device_adapter import get_device_id from src.model_utils.moxing_adapter import moxing_wrapper from src.inceptionv4 import Inceptionv4 -import mindspore as ms +import mindspore from mindspore import Tensor -from mindspore.train.serialization import 
load_checkpoint, load_param_into_net, export, context +from mindspore.train.serialization import load_checkpoint, load_param_into_net, export config.batch_size = 1 -context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target) +mindspore.set_context(mode=0, device_target=config.device_target) if config.device_target == "Ascend": - context.set_context(device_id=get_device_id()) + mindspore.set_context(device_id=get_device_id()) def modelarts_process(): pass @@ -41,7 +41,7 @@ def export_inceptionv4(): param_dict = load_checkpoint(config.ckpt_file) load_param_into_net(net, param_dict) - input_arr = Tensor(np.ones([config.batch_size, 3, config.width, config.height]), ms.float32) + input_arr = Tensor(np.ones([config.batch_size, 3, config.width, config.height]), mindspore.float32) export(net, input_arr, file_name=config.file_name, file_format=config.file_format) if __name__ == '__main__': diff --git a/official/cv/Inception/inceptionv4/infer/sdk/classification_task_metric.py b/official/cv/Inception/inceptionv4/infer/sdk/classification_task_metric.py index 09d0184a85bc4df6a82ff33c368bee7c05ab2ccb..4fdf1ec2b3c16a4129b52cc388fe7cadb1a34f2f 100644 --- a/official/cv/Inception/inceptionv4/infer/sdk/classification_task_metric.py +++ b/official/cv/Inception/inceptionv4/infer/sdk/classification_task_metric.py @@ -48,7 +48,7 @@ def load_statistical_predict_result(filepath): data_vec = np.zeros((n_label), dtype=np.float32) if n_label != 0: for ind, cls_ind in enumerate(temp): - data_vec[ind] = np.int(cls_ind) + data_vec[ind] = np.int_(cls_ind) return data_vec, n_label diff --git a/official/cv/Inception/inceptionv4/modelarts/train_start.py b/official/cv/Inception/inceptionv4/modelarts/train_start.py index 7cdd8445834cb8d91b33eccf1b632c10ce9c9b0a..dbb6c311f5ba5c05a48ba6ec8b82ccc3c53e0fa2 100644 --- a/official/cv/Inception/inceptionv4/modelarts/train_start.py +++ b/official/cv/Inception/inceptionv4/modelarts/train_start.py @@ -20,9 +20,9 @@ import glob import moxing as mox import numpy as np +import mindspore from mindspore import Model from mindspore import Tensor -from mindspore import context from mindspore.common import set_seed from mindspore.common.initializer import XavierUniform, initializer from mindspore.communication import init, get_rank, get_group_size @@ -162,16 +162,16 @@ if __name__ == '__main__': print('epoch_size: {} batch_size: {} class_num {}'.format(config.epoch_size, config.batch_size, config.num_classes)) - context.set_context(mode=context.GRAPH_MODE, device_target=config.platform) + mindspore.set_context(mode=0, device_target=config.platform) if config.platform == "Ascend": - context.set_context(device_id=get_device_id()) - context.set_context(enable_graph_kernel=False) + mindspore.set_context(device_id=get_device_id()) + mindspore.set_context(enable_graph_kernel=False) if device_num > 1: init() config.rank = get_rank() config.group_size = get_group_size() - context.set_auto_parallel_context(device_num=device_num, + mindspore.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, all_reduce_fusion_config=[200, 400]) diff --git a/official/cv/Inception/inceptionv4/src/model_utils/moxing_adapter.py b/official/cv/Inception/inceptionv4/src/model_utils/moxing_adapter.py index 830d19a6fc99de8d602703971d5ac5b24e060d11..9c6d88e5d6e9831a21d75dc52fcc166584f8e61e 100644 --- a/official/cv/Inception/inceptionv4/src/model_utils/moxing_adapter.py +++ b/official/cv/Inception/inceptionv4/src/model_utils/moxing_adapter.py @@ -17,7 
+17,7 @@ import os import functools -from mindspore import context +import mindspore from mindspore.profiler import Profiler from .config import config @@ -93,7 +93,7 @@ def moxing_wrapper(pre_process=None, post_process=None): sync_data(config.train_url, config.output_path) print("Workspace downloaded: ", os.listdir(config.output_path)) - context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) + mindspore.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) config.device_num = get_device_num() config.device_id = get_device_id() if not os.path.exists(config.output_path): diff --git a/official/cv/Inception/inceptionv4/train.py b/official/cv/Inception/inceptionv4/train.py index acc1138eeb98fa446f72cb2bedcd8de4684bba4a..442052e38cae02cd960d50076b0f4e142c4b026a 100644 --- a/official/cv/Inception/inceptionv4/train.py +++ b/official/cv/Inception/inceptionv4/train.py @@ -24,9 +24,9 @@ from src.model_utils.device_adapter import get_device_id, get_device_num from src.dataset import create_dataset_imagenet, create_dataset_cifar10 from src.inceptionv4 import Inceptionv4 +import mindspore from mindspore import Model from mindspore import Tensor -from mindspore import context from mindspore.common import set_seed from mindspore.common.initializer import XavierUniform, initializer from mindspore.communication import init, get_rank, get_group_size @@ -35,7 +35,7 @@ from mindspore.nn.loss import SoftmaxCrossEntropyWithLogits from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor, LossMonitor from mindspore.train.loss_scale_manager import FixedLossScaleManager from mindspore.train.serialization import load_checkpoint, load_param_into_net -from mindspore.context import ParallelMode +from mindspore import ParallelMode os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'python' @@ -151,16 +151,16 @@ def inception_v4_train(): """ print('epoch_size: {} batch_size: {} class_num {}'.format(config.epoch_size, config.batch_size, config.num_classes)) - context.set_context(mode=context.GRAPH_MODE, device_target=config.platform) + mindspore.set_context(mode=0, device_target=config.platform) if config.platform == "Ascend": - context.set_context(device_id=get_device_id()) - context.set_context(enable_graph_kernel=False) + mindspore.set_context(device_id=get_device_id()) + mindspore.set_context(enable_graph_kernel=False) if device_num > 1: init() config.rank = get_rank() config.group_size = get_group_size() - context.set_auto_parallel_context(device_num=device_num, + mindspore.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, all_reduce_fusion_config=[200, 400]) diff --git a/official/cv/Inception/xception/eval.py b/official/cv/Inception/xception/eval.py index 7eebabc1140cdbc9040d4a64ccc9a499542c1cdd..15224453b0524b0a0c02efa4c3c93b8dceafef57 100644 --- a/official/cv/Inception/xception/eval.py +++ b/official/cv/Inception/xception/eval.py @@ -15,7 +15,8 @@ """eval Xception.""" import time import os -from mindspore import context, nn +import mindspore +from mindspore import nn from mindspore.train.model import Model from mindspore.common import set_seed from mindspore.train.serialization import load_checkpoint, load_param_into_net @@ -95,8 +96,8 @@ def run_eval(): else: raise ValueError("Unsupported device_target.") - context.set_context(device_id=args_opt.device_id) - context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.device_target, save_graphs=False) 
+ mindspore.set_context(device_id=args_opt.device_id) + mindspore.set_context(mode=0, device_target=args_opt.device_target, save_graphs=False) # create dataset dataset = create_dataset(args_opt.test_data_dir, do_train=False, batch_size=config.batch_size, device_num=1, rank=0) diff --git a/official/cv/Inception/xception/export.py b/official/cv/Inception/xception/export.py index 7df6f040321310b7773154fbaea2f2f2ac5edd00..67efbc22314fa225b01f3b67007eceffe1729167 100644 --- a/official/cv/Inception/xception/export.py +++ b/official/cv/Inception/xception/export.py @@ -16,7 +16,8 @@ import os import numpy as np -from mindspore import Tensor, context, load_checkpoint, load_param_into_net, export +import mindspore +from mindspore import Tensor, load_checkpoint, load_param_into_net, export from src.Xception import xception from src.model_utils.config import config as args, config_gpu, config_ascend @@ -39,8 +40,8 @@ def run_export(): else: raise ValueError("Unsupported device_target.") - context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target) - context.set_context(device_id=args.device_id) + mindspore.set_context(mode=0, device_target=args.device_target) + mindspore.set_context(device_id=args.device_id) net = xception(class_num=config.class_num) # load checkpoint diff --git a/official/cv/Inception/xception/src/model_utils/moxing_adapter.py b/official/cv/Inception/xception/src/model_utils/moxing_adapter.py index 09cb0f0cf0fb88ba809d5ba9a40432b644d789b3..a6d8a3fce9707a33120d15cb8043bf891f8c07b3 100644 --- a/official/cv/Inception/xception/src/model_utils/moxing_adapter.py +++ b/official/cv/Inception/xception/src/model_utils/moxing_adapter.py @@ -17,7 +17,7 @@ import os import functools -from mindspore import context +import mindspore from mindspore.profiler import Profiler from src.model_utils.config import config @@ -94,7 +94,7 @@ def moxing_wrapper(pre_process=None, post_process=None): sync_data(config.train_url, config.output_path) print("Workspace downloaded: ", os.listdir(config.output_path)) - context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) + mindspore.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) config.device_num = get_device_num() config.device_id = get_device_id() if not os.path.exists(config.output_path): diff --git a/official/cv/Inception/xception/train.py b/official/cv/Inception/xception/train.py index b80783c4400af5f065bc76731c67a3d1ad855774..7758ff006cdc6f54f90bae400d664f3eb5b1bc40 100644 --- a/official/cv/Inception/xception/train.py +++ b/official/cv/Inception/xception/train.py @@ -15,11 +15,10 @@ """train Xception.""" import os import time - -from mindspore import context +import mindspore from mindspore import Tensor from mindspore.nn.optim.momentum import Momentum -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.train.model import Model from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor, LossMonitor from mindspore.train.serialization import load_checkpoint, load_param_into_net @@ -106,18 +105,18 @@ def run_train(): # init distributed if args_opt.is_distributed: - context.set_context(device_id=get_device_id(), mode=context.GRAPH_MODE, device_target=args_opt.device_target, + mindspore.set_context(device_id=get_device_id(), mode=0, device_target=args_opt.device_target, save_graphs=False) init() rank = get_rank_id() group_size = get_device_num() parallel_mode = ParallelMode.DATA_PARALLEL - 
context.set_auto_parallel_context(parallel_mode=parallel_mode, device_num=group_size, gradients_mean=True) + mindspore.set_auto_parallel_context(parallel_mode=parallel_mode, device_num=group_size, gradients_mean=True) else: rank = 0 group_size = 1 device_id = get_device_id() - context.set_context(device_id=device_id, mode=context.GRAPH_MODE, device_target=args_opt.device_target, + mindspore.set_context(device_id=device_id, mode=0, device_target=args_opt.device_target, save_graphs=False) # define network net = xception(class_num=config.class_num) diff --git a/official/cv/MaskRCNN/maskrcnn_mobilenetv1/eval.py b/official/cv/MaskRCNN/maskrcnn_mobilenetv1/eval.py index 11d608a58f8f3a1218eb209de98a486853776f99..3ca708dfa356ac5f315d8434e6d094ba6188e6b4 100644 --- a/official/cv/MaskRCNN/maskrcnn_mobilenetv1/eval.py +++ b/official/cv/MaskRCNN/maskrcnn_mobilenetv1/eval.py @@ -18,7 +18,8 @@ import os import time import numpy as np from pycocotools.coco import COCO -from mindspore import context, Tensor +import mindspore +from mindspore import Tensor from mindspore.train.serialization import load_checkpoint, load_param_into_net from mindspore.common import set_seed @@ -164,12 +165,12 @@ def modelarts_process(): @moxing_wrapper(pre_process=modelarts_process) def eval_(): device_target = config.device_target - context.set_context(mode=context.GRAPH_MODE, device_target=device_target) + mindspore.set_context(mode=0, device_target=device_target) if config.device_target == "Ascend": - context.set_context(device_id=config.device_id) + mindspore.set_context(device_id=config.device_id) else: - context.set_context(device_id=get_device_id()) + mindspore.set_context(device_id=get_device_id()) config.mindrecord_dir = os.path.join(config.coco_root, config.mindrecord_dir) print('\nconfig:\n', config) diff --git a/official/cv/MaskRCNN/maskrcnn_mobilenetv1/export.py b/official/cv/MaskRCNN/maskrcnn_mobilenetv1/export.py index 81f868ed76fdfa43c1347a437ea368e6ccc86ad3..3e0459ddde5060c54c5be5f314262961e9c36006 100644 --- a/official/cv/MaskRCNN/maskrcnn_mobilenetv1/export.py +++ b/official/cv/MaskRCNN/maskrcnn_mobilenetv1/export.py @@ -15,7 +15,8 @@ """export checkpoint file into air, mindir models""" import re import numpy as np -from mindspore import Tensor, context, load_checkpoint, export, load_param_into_net +import mindspore +from mindspore import Tensor, load_checkpoint, export, load_param_into_net from src.model_utils.config import config from src.model_utils.device_adapter import get_device_id from src.model_utils.moxing_adapter import moxing_wrapper @@ -33,9 +34,9 @@ def config_(cfg): config = config_(config) -context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target) +mindspore.set_context(mode=0, device_target=config.device_target) if config.device_target == "Ascend": - context.set_context(device_id=get_device_id()) + mindspore.set_context(device_id=get_device_id()) def modelarts_process(): pass diff --git a/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/dataset.py b/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/dataset.py index f817f75551d03b26cf0f59fc92e3cd3d79cc9f3e..96e742f98cff487fb313718594f3398f0af9e5b0 100644 --- a/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/dataset.py +++ b/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/dataset.py @@ -22,10 +22,10 @@ import cv2 import numpy as np from numpy import random +import mindspore import mindspore.dataset as de import mindspore.dataset.vision as C from mindspore.mindrecord import FileWriter -from mindspore import context from 
src.model_utils.config import config @@ -315,7 +315,7 @@ def flip_column(img, img_shape, gt_bboxes, gt_label, gt_num, gt_mask): def transpose_column(img, img_shape, gt_bboxes, gt_label, gt_num, gt_mask): """transpose operation for image""" - if context.get_context("device_target") == "CPU" or context.get_context("device_target") == "GPU": + if mindspore.get_context("device_target") == "CPU" or mindspore.get_context("device_target") == "GPU": platform_dtype = np.float32 else: platform_dtype = np.float16 diff --git a/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/bbox_assign_sample.py b/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/bbox_assign_sample.py index e0345dfc3352539a82a95f55a77231d883acb982..a0c1446e94351bed4977a92b8ee43362e3f85d90 100644 --- a/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/bbox_assign_sample.py +++ b/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/bbox_assign_sample.py @@ -15,11 +15,11 @@ """MaskRcnn positive and negative sample screening for RPN.""" import numpy as np +import mindspore import mindspore.nn as nn from mindspore.ops import operations as P from mindspore.common.tensor import Tensor import mindspore.common.dtype as mstype -from mindspore import context class BboxAssignSample(nn.Cell): @@ -48,7 +48,7 @@ class BboxAssignSample(nn.Cell): cfg = config self.batch_size = batch_size - if context.get_context("device_target") == "CPU" or context.get_context("device_target") == "GPU": + if mindspore.get_context("device_target") == "CPU" or mindspore.get_context("device_target") == "GPU": self.cast_type = mstype.float32 else: self.cast_type = mstype.float16 @@ -98,7 +98,7 @@ class BboxAssignSample(nn.Cell): self.check_neg_mask = Tensor(np.array(np.ones(self.num_expected_neg - self.num_expected_pos), dtype=np.bool_)) - if context.get_context("device_target") == "CPU" or context.get_context("device_target") == "GPU": + if mindspore.get_context("device_target") == "CPU" or mindspore.get_context("device_target") == "GPU": self.check_gt_one = Tensor(np.array(-1 * np.ones((self.num_gts, 4)), dtype=np.float32)) self.check_anchor_two = Tensor(np.array(-2 * np.ones((self.num_bboxes, 4)), dtype=np.float32)) self.range_pos_size = Tensor(np.arange(self.num_expected_pos).astype(np.float32)) @@ -124,8 +124,9 @@ class BboxAssignSample(nn.Cell): assigned_gt_inds2 = self.select(neg_sample_iou_mask, self.assigned_gt_zeros, self.assigned_gt_inds) pos_sample_iou_mask = self.greaterequal(max_overlaps_w_gt, self.pos_iou_thr) - assigned_gt_inds3 = self.select(pos_sample_iou_mask, \ - max_overlaps_w_gt_index + self.assigned_gt_ones, assigned_gt_inds2) + assigned_gt_inds3 = self.select(pos_sample_iou_mask, + max_overlaps_w_gt_index.astype(mstype.int32) + self.assigned_gt_ones, + assigned_gt_inds2) assigned_gt_inds4 = assigned_gt_inds3 for j in range(self.num_gts): max_overlaps_w_ac_j = max_overlaps_w_ac[j:j+1:1] diff --git a/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/bbox_assign_sample_stage2.py b/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/bbox_assign_sample_stage2.py index 07c5bc62115426cf410795b4aedcba4160ab0f3b..96c79a9c9cd03da3a7404001f305a511509ba6d0 100644 --- a/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/bbox_assign_sample_stage2.py +++ b/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/bbox_assign_sample_stage2.py @@ -15,12 +15,12 @@ """MaskRcnn tpositive and negative sample screening for Rcnn.""" import numpy as 
np +import mindspore import mindspore.nn as nn import mindspore.common.dtype as mstype from mindspore.ops import operations as P from mindspore.ops import function as F from mindspore.common.tensor import Tensor -from mindspore import context class BboxAssignSampleForRcnn(nn.Cell): @@ -81,7 +81,7 @@ class BboxAssignSampleForRcnn(nn.Cell): self.tile = P.Tile() # Check - if context.get_context("device_target") == "CPU" or context.get_context("device_target") == "GPU": + if mindspore.get_context("device_target") == "CPU" or mindspore.get_context("device_target") == "GPU": self.cast_type = mstype.float32 self.np_cast_type = np.float32 self.int_cast_type = np.int32 @@ -148,8 +148,9 @@ class BboxAssignSampleForRcnn(nn.Cell): assigned_gt_inds2 = self.select(neg_sample_iou_mask, self.assigned_gt_zeros, self.assigned_gt_inds) pos_sample_iou_mask = self.greaterequal(max_overlaps_w_gt, self.scalar_pos_iou_thr) - assigned_gt_inds3 = self.select(pos_sample_iou_mask, \ - max_overlaps_w_gt_index + self.assigned_gt_ones, assigned_gt_inds2) + assigned_gt_inds3 = self.select(pos_sample_iou_mask, + max_overlaps_w_gt_index.astype(mstype.int32) + self.assigned_gt_ones, + assigned_gt_inds2) for j in range(self.num_gts): max_overlaps_w_ac_j = max_overlaps_w_ac[j:j+1:1] @@ -214,6 +215,7 @@ class BboxAssignSampleForRcnn(nn.Cell): # normalized box coordinate boxes = boxes / self.image_h_w box_ids = F.range(self.start, self.limit, self.delta) + box_ids = self.cast(box_ids, mstype.int32) pos_masks_fb = self.expand_dims(pos_masks_fb, -1) boxes = self.cast(boxes, mstype.float32) pos_masks_fb = self.crop_and_resize(pos_masks_fb, boxes, box_ids, self.mask_shape) diff --git a/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/fpn_neck.py b/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/fpn_neck.py index 2ad832bbd6825ee8e843df141866ef68303b0de2..5e512961f0c0b7f3cecc91e9eec11bb538c009bd 100644 --- a/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/fpn_neck.py +++ b/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/fpn_neck.py @@ -15,12 +15,12 @@ """MaskRcnn feature pyramid network.""" import numpy as np +import mindspore import mindspore.nn as nn from mindspore.ops import operations as P from mindspore.common.tensor import Tensor from mindspore.common import dtype as mstype from mindspore.common.initializer import initializer -from mindspore import context def bias_init_zeros(shape): @@ -67,7 +67,7 @@ class FeatPyramidNeck(nn.Cell): out_channels, num_outs): super(FeatPyramidNeck, self).__init__() - if context.get_context("device_target") == "CPU" or context.get_context("device_target") == "GPU": + if mindspore.get_context("device_target") == "CPU" or mindspore.get_context("device_target") == "GPU": self.platform_mstype = mstype.float32 else: self.platform_mstype = mstype.float16 diff --git a/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/mask_rcnn_mobilenetv1.py b/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/mask_rcnn_mobilenetv1.py index 1ad4867ca1245f7e6308424f5a63711d29d8acc1..f1f9ed3550d83cf0a233acc527a359f501856ef9 100644 --- a/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/mask_rcnn_mobilenetv1.py +++ b/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/mask_rcnn_mobilenetv1.py @@ -15,12 +15,12 @@ """MaskRcnn based on mobilenetv1.""" import numpy as np +import mindspore import mindspore.nn as nn from mindspore.ops import operations as P from mindspore.common.tensor import 
Tensor import mindspore.common.dtype as mstype from mindspore.ops import functional as F -from mindspore import context from .mobilenetv1 import MobileNetV1_FeatureSelector from .bbox_assign_sample_stage2 import BboxAssignSampleForRcnn from .fpn_neck import FeatPyramidNeck @@ -484,7 +484,7 @@ class Mask_Rcnn_Mobilenetv1(nn.Cell): return mask_fb_pred_all def init_datatype(self): - self.platform = context.get_context("device_target") + self.platform = mindspore.get_context("device_target") if self.platform == "CPU" or self.platform == "GPU": self.platform_dtype = np.float32 self.platform_mstype = mstype.float32 diff --git a/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/proposal_generator.py b/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/proposal_generator.py index 21f77b1cc44a6d7d18b94cc59c986df38d3170e2..718557eb67e442506244e5b70f7b469987fd6d40 100644 --- a/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/proposal_generator.py +++ b/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/proposal_generator.py @@ -15,11 +15,11 @@ """MaskRcnn proposal generator.""" import numpy as np +import mindspore import mindspore.nn as nn import mindspore.common.dtype as mstype from mindspore.ops import operations as P from mindspore import Tensor -from mindspore import context class Proposal(nn.Cell): """ @@ -50,7 +50,7 @@ class Proposal(nn.Cell): ): super(Proposal, self).__init__() - if context.get_context("device_target") == "CPU" or context.get_context("device_target") == "GPU": + if mindspore.get_context("device_target") == "CPU" or mindspore.get_context("device_target") == "GPU": self.platform_dtype = np.float32 self.platform_mstype = mstype.float32 else: @@ -112,7 +112,7 @@ class Proposal(nn.Cell): self.multi_10 = Tensor(10.0, self.platform_mstype) - self.platform = context.get_context("device_target") + self.platform = mindspore.get_context("device_target") def set_train_local(self, config, training=True): """Set training flag.""" diff --git a/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/rcnn_cls.py b/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/rcnn_cls.py index 2f9fbc214c3b8a612c2a05e3d2cedae27783f756..f6885ece554d3904116806137eb9f8a04dd8e184 100644 --- a/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/rcnn_cls.py +++ b/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/rcnn_cls.py @@ -15,13 +15,13 @@ """MaskRcnn Rcnn classification and box regression network.""" import numpy as np +import mindspore import mindspore.common.dtype as mstype import mindspore.nn as nn from mindspore.ops import operations as P from mindspore.common.tensor import Tensor from mindspore.common.initializer import initializer from mindspore.common.parameter import Parameter -from mindspore import context class DenseNoTranpose(nn.Cell): """Dense method""" @@ -41,7 +41,7 @@ class FpnCls(nn.Cell): """dense layer of classification and box head""" def __init__(self, input_channels, output_channels, num_classes, pool_size): super(FpnCls, self).__init__() - if context.get_context("device_target") == "CPU" or context.get_context("device_target") == "GPU": + if mindspore.get_context("device_target") == "CPU" or mindspore.get_context("device_target") == "GPU": self.platform_mstype = mstype.float32 else: self.platform_mstype = mstype.float16 @@ -105,7 +105,7 @@ class RcnnCls(nn.Cell): ): super(RcnnCls, self).__init__() cfg = config - if context.get_context("device_target") == "CPU" 
or context.get_context("device_target") == "GPU": + if mindspore.get_context("device_target") == "CPU" or mindspore.get_context("device_target") == "GPU": self.platform_mstype = mstype.float32 self.platform_dtype = np.float32 else: diff --git a/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/rcnn_mask.py b/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/rcnn_mask.py index 71fd749b6ddb2f5a83dba62acdf87a87466174c3..e07b8a02037648b9d564e08924ecb694013f2579 100644 --- a/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/rcnn_mask.py +++ b/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/rcnn_mask.py @@ -15,12 +15,12 @@ """MaskRcnn Rcnn for mask network.""" import numpy as np +import mindspore import mindspore.common.dtype as mstype import mindspore.nn as nn from mindspore.ops import operations as P from mindspore.common.tensor import Tensor from mindspore.common.initializer import initializer -from mindspore import context def _conv(in_channels, out_channels, kernel_size=1, stride=1, padding=0, pad_mode='pad'): """Conv2D wrapper.""" @@ -46,7 +46,7 @@ class FpnMask(nn.Cell): """conv layers of mask head""" def __init__(self, input_channels, output_channels, num_classes): super(FpnMask, self).__init__() - self.platform = context.get_context("device_target") + self.platform = mindspore.get_context("device_target") if self.platform == "CPU" or self.platform == "GPU": self.platform_mstype = mstype.float32 else: @@ -120,7 +120,7 @@ class RcnnMask(nn.Cell): ): super(RcnnMask, self).__init__() cfg = config - self.platform = context.get_context("device_target") + self.platform = mindspore.get_context("device_target") if self.platform == "CPU" or self.platform == "GPU": self.platform_dtype = np.float32 self.platform_mstype = mstype.float32 diff --git a/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/rpn.py b/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/rpn.py index 8df441ba0db7b974739bf4a9ab5998bbbc13a408..eb2697697939d110cf4acb8bef0ee0d4637b62e8 100644 --- a/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/rpn.py +++ b/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/rpn.py @@ -14,13 +14,13 @@ # ============================================================================ """RPN for MaskRCNN""" import numpy as np +import mindspore import mindspore.nn as nn import mindspore.common.dtype as mstype from mindspore.ops import operations as P from mindspore import Tensor from mindspore.ops import functional as F from mindspore.common.initializer import initializer -from mindspore import context from .bbox_assign_sample import BboxAssignSample @@ -101,7 +101,7 @@ class RPN(nn.Cell): cls_out_channels): super(RPN, self).__init__() cfg_rpn = config - if context.get_context("device_target") == "CPU" or context.get_context("device_target") == "GPU": + if mindspore.get_context("device_target") == "CPU" or mindspore.get_context("device_target") == "GPU": self.platform_dtype = np.float32 self.platform_mstype = mstype.float32 else: diff --git a/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/model_utils/moxing_adapter.py b/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/model_utils/moxing_adapter.py index 830d19a6fc99de8d602703971d5ac5b24e060d11..9c6d88e5d6e9831a21d75dc52fcc166584f8e61e 100644 --- a/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/model_utils/moxing_adapter.py +++ b/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/model_utils/moxing_adapter.py @@ -17,7 +17,7 @@ 
import os import functools -from mindspore import context +import mindspore from mindspore.profiler import Profiler from .config import config @@ -93,7 +93,7 @@ def moxing_wrapper(pre_process=None, post_process=None): sync_data(config.train_url, config.output_path) print("Workspace downloaded: ", os.listdir(config.output_path)) - context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) + mindspore.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) config.device_num = get_device_num() config.device_id = get_device_id() if not os.path.exists(config.output_path): diff --git a/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/network_define.py b/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/network_define.py index 077d42460ebfd9fb1cae8c61b1eb59bdff3fed32..c12852e1c92c65dc588d5830a4d198a44f1c751b 100644 --- a/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/network_define.py +++ b/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/network_define.py @@ -16,6 +16,7 @@ import time import numpy as np +import mindspore import mindspore.nn as nn from mindspore.common.tensor import Tensor from mindspore.ops import functional as F @@ -23,7 +24,6 @@ from mindspore.ops import composite as C from mindspore import ParameterTuple from mindspore.train.callback import Callback from mindspore.nn.wrap.grad_reducer import DistributedGradReducer -from mindspore import context from src.maskrcnn_mobilenetv1.mask_rcnn_mobilenetv1 import Mask_Rcnn_Mobilenetv1 time_stamp_init = False @@ -167,7 +167,7 @@ class TrainOneStepCell(nn.Cell): self.optimizer = optimizer self.grad = C.GradOperation(get_by_list=True, sens_param=True) - if context.get_context("device_target") == "CPU" or context.get_context("device_target") == "GPU": + if mindspore.get_context("device_target") == "CPU" or mindspore.get_context("device_target") == "GPU": self.sens = Tensor((np.ones((1,)) * sens).astype(np.float32)) else: self.sens = Tensor((np.ones((1,)) * sens).astype(np.float16)) diff --git a/official/cv/MaskRCNN/maskrcnn_mobilenetv1/train.py b/official/cv/MaskRCNN/maskrcnn_mobilenetv1/train.py index 80300601fbc70f57c3922a7473e2d7641c415d7a..a4eeddf2ffd24971422cbf323be6f9959e261543 100644 --- a/official/cv/MaskRCNN/maskrcnn_mobilenetv1/train.py +++ b/official/cv/MaskRCNN/maskrcnn_mobilenetv1/train.py @@ -18,12 +18,13 @@ import os import time +import mindspore import mindspore.common.dtype as mstype -from mindspore import context, Tensor +from mindspore import Tensor from mindspore.communication.management import init, get_rank from mindspore.train.callback import CheckpointConfig, ModelCheckpoint, TimeMonitor from mindspore.train import Model -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.train.serialization import load_checkpoint, load_param_into_net from mindspore.nn import Momentum from mindspore.common import set_seed @@ -116,13 +117,13 @@ def create_mindrecord_files(rank, mindrecord_file, mindrecord_dir, prefix): while not os.path.exists(mindrecord_file+".db"): time.sleep(5) -context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target) +mindspore.set_context(mode=0, device_target=config.device_target) if config.device_target == "Ascend": - context.set_context(device_id=config.device_id) + mindspore.set_context(device_id=config.device_id) # Set mempool block size for improving memory utilization, which will not take effect in GRAPH_MODE -if context.get_context("mode") == context.PYNATIVE_MODE: - 
context.set_context(mempool_block_size="28GB") +if mindspore.get_context("mode") == 1: + mindspore.set_context(mempool_block_size="28GB") @moxing_wrapper(pre_process=modelarts_pre_process) def train_maskrcnn_mobilenetv1(): @@ -132,13 +133,13 @@ def train_maskrcnn_mobilenetv1(): device_num = get_device_num() if config.device_target == "Ascend": rank = get_rank_id() - context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, + mindspore.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True) init() elif config.device_target == "GPU": init() rank = get_rank() - context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, + mindspore.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True) else: rank = 0 diff --git a/official/cv/MaskRCNN/maskrcnn_resnet50/README.md b/official/cv/MaskRCNN/maskrcnn_resnet50/README.md index 0d53f6878007a655737322b30e7ae19aebfd3775..0de91cf8f854c0f17457040a8861797434bf2448 100644 --- a/official/cv/MaskRCNN/maskrcnn_resnet50/README.md +++ b/official/cv/MaskRCNN/maskrcnn_resnet50/README.md @@ -67,7 +67,7 @@ Note that you can run the scripts based on the dataset mentioned in original pap - Framework - [MindSpore](https://gitee.com/mindspore/mindspore) - Docker base image - - [Ascend Hub](https://ascend.huawei.com/ascendhub/#/home) + - [Ascend Hub](https://www.hiascend.com/developer/ascendhub) - For more information, please check the resources below: - [MindSpore Tutorials](https://www.mindspore.cn/tutorials/en/master/index.html) - [MindSpore Python API](https://www.mindspore.cn/docs/en/master/api_python/mindspore.html) diff --git a/official/cv/MaskRCNN/maskrcnn_resnet50/README_CN.md b/official/cv/MaskRCNN/maskrcnn_resnet50/README_CN.md index e033c138e9e18c18ba13f8c5dc689caaf513e909..9dd2156c858b17ff8a015479eeaf273d467a284f 100644 --- a/official/cv/MaskRCNN/maskrcnn_resnet50/README_CN.md +++ b/official/cv/MaskRCNN/maskrcnn_resnet50/README_CN.md @@ -65,7 +65,7 @@ MaskRCNN是一个两级目标检测网络,作为FasterRCNN的扩展模型, - 框架 - [MindSpore](https://gitee.com/mindspore/mindspore) - 获取基础镜像 - - [Ascend Hub](https://ascend.huawei.com/ascendhub/#/home) + - [Ascend Hub](https://www.hiascend.com/developer/ascendhub) - 如需查看详情,请参见如下资源: - [MindSpore教程](https://www.mindspore.cn/tutorials/zh-CN/master/index.html) - [MindSpore Python API](https://www.mindspore.cn/docs/zh-CN/master/api_python/mindspore.html) diff --git a/official/cv/MaskRCNN/maskrcnn_resnet50/eval.py b/official/cv/MaskRCNN/maskrcnn_resnet50/eval.py index 64625732ff86280c7197a1ea9311facad8450e90..532b351cac83d2c897e573096ea143b93984a519 100644 --- a/official/cv/MaskRCNN/maskrcnn_resnet50/eval.py +++ b/official/cv/MaskRCNN/maskrcnn_resnet50/eval.py @@ -26,7 +26,8 @@ from src.dataset import data_to_mindrecord_byte_image, create_maskrcnn_dataset from src.util import coco_eval, bbox2result_1image, results2json, get_seg_masks from pycocotools.coco import COCO -from mindspore import context, Tensor +import mindspore +from mindspore import Tensor from mindspore.train.serialization import load_checkpoint, load_param_into_net from mindspore.common import set_seed @@ -161,7 +162,7 @@ def modelarts_process(): @moxing_wrapper(pre_process=modelarts_process) def eval_(): device_target = config.device_target - context.set_context(mode=context.GRAPH_MODE, device_target=device_target, device_id=get_device_id()) + mindspore.set_context(mode=0, 
device_target=device_target, device_id=get_device_id()) config.mindrecord_dir = os.path.join(config.coco_root, config.mindrecord_dir) print('\neval.py config:\n', config) diff --git a/official/cv/MaskRCNN/maskrcnn_resnet50/export.py b/official/cv/MaskRCNN/maskrcnn_resnet50/export.py index 7bdaa768ea0356d525671f2dded273c9bd8a2377..5fefa87bf8cab4cdd5b51ffa0f6cd8e52b0382b2 100644 --- a/official/cv/MaskRCNN/maskrcnn_resnet50/export.py +++ b/official/cv/MaskRCNN/maskrcnn_resnet50/export.py @@ -19,15 +19,16 @@ from src.model_utils.config import config from src.model_utils.device_adapter import get_device_id from src.model_utils.moxing_adapter import moxing_wrapper from src.maskrcnn.mask_rcnn_r50 import MaskRcnn_Infer -from mindspore import Tensor, context, load_checkpoint, load_param_into_net, export +import mindspore +from mindspore import Tensor, load_checkpoint, load_param_into_net, export if not config.enable_modelarts: config.ckpt_file = config.ckpt_file_local -context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target) +mindspore.set_context(mode=0, device_target=config.device_target) if config.device_target == "Ascend": - context.set_context(device_id=get_device_id()) + mindspore.set_context(device_id=get_device_id()) def modelarts_process(): pass diff --git a/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/bbox_assign_sample.py b/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/bbox_assign_sample.py index e4421110c07ae78e6e0cd7ce2a3e9e1c528591e8..a49fb28f0b75d28a05e9dbca8945988e930bf0cb 100644 --- a/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/bbox_assign_sample.py +++ b/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/bbox_assign_sample.py @@ -15,10 +15,10 @@ """MaskRcnn positive and negative sample screening for RPN.""" import numpy as np +import mindspore import mindspore.nn as nn from mindspore.ops import operations as P from mindspore.common.tensor import Tensor -from mindspore import context import mindspore.common.dtype as mstype @@ -47,7 +47,7 @@ class BboxAssignSample(nn.Cell): super(BboxAssignSample, self).__init__() cfg = config - if context.get_context("device_target") == "Ascend": + if mindspore.get_context("device_target") == "Ascend": self.cast_type = mstype.float16 self.np_cast_type = np.float16 else: @@ -121,8 +121,9 @@ class BboxAssignSample(nn.Cell): assigned_gt_inds2 = self.select(neg_sample_iou_mask, self.assigned_gt_zeros, self.assigned_gt_inds) pos_sample_iou_mask = self.greaterequal(max_overlaps_w_gt, self.pos_iou_thr) - assigned_gt_inds3 = self.select(pos_sample_iou_mask, \ - max_overlaps_w_gt_index + self.assigned_gt_ones, assigned_gt_inds2) + assigned_gt_inds3 = self.select(pos_sample_iou_mask, + max_overlaps_w_gt_index.astype(mstype.int32) + self.assigned_gt_ones, + assigned_gt_inds2) assigned_gt_inds4 = assigned_gt_inds3 for j in range(self.num_gts): max_overlaps_w_ac_j = max_overlaps_w_ac[j:j+1:1] diff --git a/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/bbox_assign_sample_stage2.py b/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/bbox_assign_sample_stage2.py index e97ee0a832da2b0c92de87095209168c849cd7ff..599585d23eb3c3cb217ffb93bfb368c06ed989f2 100644 --- a/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/bbox_assign_sample_stage2.py +++ b/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/bbox_assign_sample_stage2.py @@ -15,12 +15,12 @@ """MaskRcnn tpositive and negative sample screening for Rcnn.""" import numpy as np +import mindspore import mindspore.nn as nn import mindspore.common.dtype as mstype 
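# ---------------------------------------------------------------------------
# [Editorial note, not part of the patch] Every hunk in this change set applies
# the same mechanical migration: the deprecated `from mindspore import context`
# accessors are replaced by their top-level equivalents (`mindspore.get_context`,
# `mindspore.set_context`, `mindspore.set_auto_parallel_context`). A minimal,
# hedged sketch of the pattern, assuming a MindSpore release that exports these
# functions at package level; `select_cast_type` is an illustrative name only:
import mindspore
import mindspore.common.dtype as mstype

def select_cast_type():
    """fp32 on CPU/GPU, fp16 on Ascend -- the dtype rule the patched models use."""
    # Old form: context.get_context("device_target")
    if mindspore.get_context("device_target") in ("CPU", "GPU"):
        return mstype.float32
    return mstype.float16
# ---------------------------------------------------------------------------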
from mindspore.ops import operations as P from mindspore.ops import function as F from mindspore.common.tensor import Tensor -from mindspore import context class BboxAssignSampleForRcnn(nn.Cell): @@ -44,7 +44,7 @@ class BboxAssignSampleForRcnn(nn.Cell): super(BboxAssignSampleForRcnn, self).__init__() cfg = config - if context.get_context("device_target") == "Ascend": + if mindspore.get_context("device_target") == "Ascend": self.cast_type = mstype.float16 self.np_cast_type = np.float16 else: @@ -146,8 +146,9 @@ class BboxAssignSampleForRcnn(nn.Cell): assigned_gt_inds2 = self.select(neg_sample_iou_mask, self.assigned_gt_zeros, self.assigned_gt_inds) pos_sample_iou_mask = self.greaterequal(max_overlaps_w_gt, self.scalar_pos_iou_thr) - assigned_gt_inds3 = self.select(pos_sample_iou_mask, \ - max_overlaps_w_gt_index + self.assigned_gt_ones, assigned_gt_inds2) + assigned_gt_inds3 = self.select(pos_sample_iou_mask, + max_overlaps_w_gt_index.astype(mstype.int32) + self.assigned_gt_ones, + assigned_gt_inds2) for j in range(self.num_gts): max_overlaps_w_ac_j = max_overlaps_w_ac[j:j+1:1] @@ -212,6 +213,7 @@ class BboxAssignSampleForRcnn(nn.Cell): # normalized box coordinate boxes = boxes / self.image_h_w box_ids = F.range(self.start, self.limit, self.delta) + box_ids = self.cast(box_ids, mstype.int32) pos_masks_fb = self.expand_dims(pos_masks_fb, -1) boxes = self.cast(boxes, mstype.float32) pos_masks_fb = self.crop_and_resize(pos_masks_fb, boxes, box_ids, self.mask_shape) diff --git a/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/fpn_neck.py b/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/fpn_neck.py index 68590ec9db4630ce763821e8d1f685c8dc90f5a4..94b0d7ffa789169bb6585e8dcef2df44fbc1a5da 100644 --- a/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/fpn_neck.py +++ b/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/fpn_neck.py @@ -15,12 +15,12 @@ """MaskRcnn feature pyramid network.""" import numpy as np +import mindspore import mindspore.nn as nn from mindspore.ops import operations as P from mindspore.common.tensor import Tensor from mindspore.common import dtype as mstype from mindspore.common.initializer import initializer -from mindspore import context def bias_init_zeros(shape): @@ -69,7 +69,7 @@ class FeatPyramidNeck(nn.Cell): feature_shapes): super(FeatPyramidNeck, self).__init__() - if context.get_context("device_target") == "Ascend": + if mindspore.get_context("device_target") == "Ascend": self.cast_type = mstype.float16 else: self.cast_type = mstype.float32 diff --git a/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/mask_rcnn_r50.py b/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/mask_rcnn_r50.py index 6c72d8c08438ef18f7677bd9bb88294c855d55af..d36b68dd34cd2c36f9241c6d9d121d332f032bd7 100644 --- a/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/mask_rcnn_r50.py +++ b/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/mask_rcnn_r50.py @@ -15,12 +15,12 @@ """MaskRcnn based on ResNet50.""" import numpy as np +import mindspore import mindspore.nn as nn from mindspore.ops import operations as P from mindspore.common.tensor import Tensor import mindspore.common.dtype as mstype from mindspore.ops import functional as F -from mindspore import context from .resnet50 import ResNetFea, ResidualBlockUsing from .bbox_assign_sample_stage2 import BboxAssignSampleForRcnn from .fpn_neck import FeatPyramidNeck @@ -54,7 +54,7 @@ class Mask_Rcnn_Resnet50(nn.Cell): def __init__(self, config): super(Mask_Rcnn_Resnet50, self).__init__() - if context.get_context("device_target") == 
"Ascend": + if mindspore.get_context("device_target") == "Ascend": self.cast_type = mstype.float16 self.np_cast_type = np.float16 else: diff --git a/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/proposal_generator.py b/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/proposal_generator.py index f556d0e176b50a63ed8f803dfda66d95b338af20..f3f698de118d2c78c9a0c1a102e0e50ff4e7265f 100644 --- a/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/proposal_generator.py +++ b/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/proposal_generator.py @@ -15,11 +15,11 @@ """MaskRcnn proposal generator.""" import numpy as np +import mindspore import mindspore.nn as nn import mindspore.common.dtype as mstype from mindspore.ops import operations as P from mindspore import Tensor -from mindspore import context class Proposal(nn.Cell): @@ -52,7 +52,7 @@ class Proposal(nn.Cell): super(Proposal, self).__init__() cfg = config - if context.get_context("device_target") == "Ascend": + if mindspore.get_context("device_target") == "Ascend": self.cast_type = mstype.float16 self.np_cast_type = np.float16 else: diff --git a/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/rcnn_cls.py b/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/rcnn_cls.py index 28a46e70eeafdb2c3a7ea2c44f694cf460eee3a9..ad8ce0de63211b31a6de6e699fa8b1e6e3d927e6 100644 --- a/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/rcnn_cls.py +++ b/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/rcnn_cls.py @@ -15,13 +15,13 @@ """MaskRcnn Rcnn classification and box regression network.""" import numpy as np +import mindspore import mindspore.common.dtype as mstype import mindspore.nn as nn from mindspore.ops import operations as P from mindspore.common.tensor import Tensor from mindspore.common.initializer import initializer from mindspore.common.parameter import Parameter -from mindspore import context class DenseNoTranpose(nn.Cell): @@ -45,7 +45,7 @@ class FpnCls(nn.Cell): def __init__(self, input_channels, output_channels, num_classes, pool_size): super(FpnCls, self).__init__() - if context.get_context("device_target") == "Ascend": + if mindspore.get_context("device_target") == "Ascend": self.cast_type = mstype.float16 else: self.cast_type = mstype.float32 @@ -112,7 +112,7 @@ class RcnnCls(nn.Cell): super(RcnnCls, self).__init__() cfg = config - if context.get_context("device_target") == "Ascend": + if mindspore.get_context("device_target") == "Ascend": self.cast_type = mstype.float16 self.np_cast_type = np.float16 else: diff --git a/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/rcnn_mask.py b/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/rcnn_mask.py index c34df040feb58010a17cba1947565d4d1669e6c3..e54bd16faabcbce89edb223b9ed3426805870b34 100644 --- a/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/rcnn_mask.py +++ b/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/rcnn_mask.py @@ -15,11 +15,11 @@ """MaskRcnn Rcnn for mask network.""" import numpy as np +import mindspore import mindspore.common.dtype as mstype import mindspore.nn as nn from mindspore.ops import operations as P from mindspore.common.tensor import Tensor -from mindspore import context def _conv(in_channels, out_channels, kernel_size=1, stride=1, padding=0, pad_mode='pad', gain=1): @@ -59,7 +59,7 @@ class FpnMask(nn.Cell): def __init__(self, input_channels, output_channels, num_classes): super(FpnMask, self).__init__() - if context.get_context("device_target") == "Ascend": + if mindspore.get_context("device_target") == "Ascend": self.cast_type = 
mstype.float16 else: self.cast_type = mstype.float32 @@ -136,7 +136,7 @@ class RcnnMask(nn.Cell): super(RcnnMask, self).__init__() cfg = config - if context.get_context("device_target") == "Ascend": + if mindspore.get_context("device_target") == "Ascend": self.cast_type = mstype.float16 self.np_cast_type = np.float16 else: diff --git a/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/resnet50.py b/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/resnet50.py index cb943c174506345249339d51298ccc8d217e31df..358ba0a9ae7596c1fae79e7818880f9426114f89 100644 --- a/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/resnet50.py +++ b/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/resnet50.py @@ -15,14 +15,14 @@ """Resnet50 backbone.""" import numpy as np +import mindspore import mindspore.nn as nn from mindspore.ops import operations as P from mindspore.common.tensor import Tensor from mindspore.ops import functional as F import mindspore.common.dtype as mstype -from mindspore import context -if context.get_context("device_target") == "Ascend": +if mindspore.get_context("device_target") == "Ascend": ms_cast_type = mstype.float16 else: ms_cast_type = mstype.float32 diff --git a/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/rpn.py b/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/rpn.py index edc062581e08b46bd6af759f25f41e63a3e78623..cbb8dbfb00d7e7a919d6d2ef66dfce3c3a166cb4 100644 --- a/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/rpn.py +++ b/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/rpn.py @@ -14,10 +14,11 @@ # ============================================================================ """RPN for MaskRCNN""" import numpy as np +import mindspore import mindspore.nn as nn import mindspore.common.dtype as mstype from mindspore.ops import operations as P -from mindspore import Tensor, context +from mindspore import Tensor from mindspore.ops import functional as F from mindspore.common.initializer import initializer from .bbox_assign_sample import BboxAssignSample @@ -100,7 +101,7 @@ class RPN(nn.Cell): super(RPN, self).__init__() cfg_rpn = config - if context.get_context("device_target") == "Ascend": + if mindspore.get_context("device_target") == "Ascend": self.cast_type = mstype.float16 self.np_cast_type = np.float16 else: diff --git a/official/cv/MaskRCNN/maskrcnn_resnet50/src/model_utils/moxing_adapter.py b/official/cv/MaskRCNN/maskrcnn_resnet50/src/model_utils/moxing_adapter.py index 830d19a6fc99de8d602703971d5ac5b24e060d11..9c6d88e5d6e9831a21d75dc52fcc166584f8e61e 100644 --- a/official/cv/MaskRCNN/maskrcnn_resnet50/src/model_utils/moxing_adapter.py +++ b/official/cv/MaskRCNN/maskrcnn_resnet50/src/model_utils/moxing_adapter.py @@ -17,7 +17,7 @@ import os import functools -from mindspore import context +import mindspore from mindspore.profiler import Profiler from .config import config @@ -93,7 +93,7 @@ def moxing_wrapper(pre_process=None, post_process=None): sync_data(config.train_url, config.output_path) print("Workspace downloaded: ", os.listdir(config.output_path)) - context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) + mindspore.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) config.device_num = get_device_num() config.device_id = get_device_id() if not os.path.exists(config.output_path): diff --git a/official/cv/MaskRCNN/maskrcnn_resnet50/train.py b/official/cv/MaskRCNN/maskrcnn_resnet50/train.py index 
19728e310936d092540edbc66f7a447f98ff6005..0de706e7f9c46f603b858354fa406987bf4f3a3b 100644 --- a/official/cv/MaskRCNN/maskrcnn_resnet50/train.py +++ b/official/cv/MaskRCNN/maskrcnn_resnet50/train.py @@ -26,12 +26,13 @@ from src.network_define import LossCallBack, WithLossCell, TrainOneStepCell, Los from src.dataset import data_to_mindrecord_byte_image, create_maskrcnn_dataset from src.lr_schedule import dynamic_lr +import mindspore import mindspore.common.dtype as mstype -from mindspore import context, Tensor, Parameter +from mindspore import Tensor, Parameter from mindspore.communication.management import init, get_rank, get_group_size from mindspore.train.callback import CheckpointConfig, ModelCheckpoint, TimeMonitor from mindspore.train import Model -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.train.serialization import load_checkpoint, load_param_into_net from mindspore.nn import Momentum from mindspore.common import set_seed @@ -152,7 +153,7 @@ def load_pretrained_ckpt(net, load_path, device_target): @moxing_wrapper(pre_process=modelarts_pre_process) def train_maskrcnn(): device_target = config.device_target - context.set_context(mode=context.GRAPH_MODE, device_target=device_target, device_id=get_device_id()) + mindspore.set_context(mode=0, device_target=device_target, device_id=get_device_id()) config.mindrecord_dir = os.path.join(config.coco_root, config.mindrecord_dir) print('\ntrain.py config:\n', config) @@ -164,7 +165,7 @@ def train_maskrcnn(): rank = get_rank() dataset_sink_mode_flag = device_target == 'Ascend' device_num = get_group_size() - context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, + mindspore.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True) else: rank = 0 diff --git a/official/cv/MobileNet/mobilenetv1/README.md b/official/cv/MobileNet/mobilenetv1/README.md index 93b2a871f24c6a3592fcfc0450b513afb1a3936b..685c53126e0e11f2a98b996996718f71ab20c24d 100644 --- a/official/cv/MobileNet/mobilenetv1/README.md +++ b/official/cv/MobileNet/mobilenetv1/README.md @@ -335,7 +335,7 @@ You can start training using python or shell scripts.If the train method is trai ### Result -Inference result will be stored in the example path, you can find result like the followings in `eval/log`. +Inference results will be stored in the example path; you can find results like the following in `eval/log`.
```shell Ascend diff --git a/official/cv/MobileNet/mobilenetv1/eval.py b/official/cv/MobileNet/mobilenetv1/eval.py index 30c8a5f30fa4606b7f4f7279a674f51adab5e7d7..ce88276f61e716a3a7fcf5123dc0ed576de7d665 100644 --- a/official/cv/MobileNet/mobilenetv1/eval.py +++ b/official/cv/MobileNet/mobilenetv1/eval.py @@ -14,7 +14,7 @@ # ============================================================================ """eval mobilenet_v1.""" import os -import mindspore as ms +import mindspore from mindspore.nn.loss import SoftmaxCrossEntropyWithLogits from src.CrossEntropySmooth import CrossEntropySmooth from src.mobilenet_v1 import mobilenet_v1 as mobilenet @@ -22,7 +22,7 @@ from src.model_utils.config import config from src.model_utils.moxing_adapter import moxing_wrapper, modelarts_process -ms.set_seed(1) +mindspore.set_seed(1) if config.dataset == 'cifar10': from src.dataset import create_dataset1 as create_dataset @@ -39,10 +39,10 @@ def eval_mobilenetv1(): target = config.device_target # init context - ms.set_context(mode=ms.GRAPH_MODE, device_target=target, save_graphs=False) + mindspore.set_context(mode=0, device_target=target, save_graphs=False) if target == "Ascend": device_id = int(os.getenv('DEVICE_ID', '0')) - ms.set_context(device_id=device_id) + mindspore.set_context(device_id=device_id) # create dataset dataset = create_dataset(dataset_path=config.dataset_path, do_train=False, batch_size=config.batch_size, @@ -53,8 +53,8 @@ def eval_mobilenetv1(): net = mobilenet(class_num=config.class_num) # load checkpoint - param_dict = ms.load_checkpoint(config.checkpoint_path) - ms.load_param_into_net(net, param_dict) + param_dict = mindspore.load_checkpoint(config.checkpoint_path) + mindspore.load_param_into_net(net, param_dict) net.set_train(False) # define loss, model @@ -67,7 +67,7 @@ def eval_mobilenetv1(): loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean') # define model - model = ms.Model(net, loss_fn=loss, metrics={'top_1_accuracy', 'top_5_accuracy'}) + model = mindspore.Model(net, loss_fn=loss, metrics={'top_1_accuracy', 'top_5_accuracy'}) # eval model res = model.eval(dataset) diff --git a/official/cv/MobileNet/mobilenetv1/export.py b/official/cv/MobileNet/mobilenetv1/export.py index 679c4f5d1c160b43d4706be3f72dd049bdb5d862..bb6a9471aa54fd67832b56eefb0a5328365cd308 100644 --- a/official/cv/MobileNet/mobilenetv1/export.py +++ b/official/cv/MobileNet/mobilenetv1/export.py @@ -15,7 +15,7 @@ import numpy as np -import mindspore as ms +import mindspore from src.mobilenet_v1 import mobilenet_v1 as mobilenet from src.model_utils.config import config @@ -23,7 +23,7 @@ from src.model_utils.device_adapter import get_device_id from src.model_utils.moxing_adapter import moxing_wrapper, modelarts_export_preprocess -ms.set_context(mode=ms.GRAPH_MODE, device_target=config.device_target) +mindspore.set_context(mode=0, device_target=config.device_target) @moxing_wrapper(pre_process=modelarts_export_preprocess) @@ -31,13 +31,13 @@ def export_mobilenetv1(): """ export_mobilenetv1 """ target = config.device_target if target != "GPU": - ms.set_context(device_id=get_device_id()) + mindspore.set_context(device_id=get_device_id()) network = mobilenet(class_num=config.class_num) - ms.load_checkpoint(config.ckpt_file, net=network) + mindspore.load_checkpoint(config.ckpt_file, net=network) network.set_train(False) - input_data = ms.numpy.zeros([config.batch_size, 3, config.height, config.width]).astype(np.float32) - ms.export(network, input_data, file_name=config.file_name, 
file_format=config.file_format) + input_data = mindspore.numpy.zeros([config.batch_size, 3, config.height, config.width]).astype(np.float32) + mindspore.export(network, input_data, file_name=config.file_name, file_format=config.file_format) if __name__ == '__main__': diff --git a/official/cv/MobileNet/mobilenetv1/src/CrossEntropySmooth.py b/official/cv/MobileNet/mobilenetv1/src/CrossEntropySmooth.py index f8283eb7609a5a1cfd8f39a792d24063dd744ad1..7c2c5d00c8e5973529ffbf436adac3051636189a 100644 --- a/official/cv/MobileNet/mobilenetv1/src/CrossEntropySmooth.py +++ b/official/cv/MobileNet/mobilenetv1/src/CrossEntropySmooth.py @@ -13,7 +13,7 @@ # limitations under the License. # ============================================================================ """define loss function for network""" -import mindspore as ms +import mindspore import mindspore.nn as nn import mindspore.ops as ops @@ -24,8 +24,8 @@ class CrossEntropySmooth(nn.LossBase): super(CrossEntropySmooth, self).__init__() self.onehot = ops.OneHot() self.sparse = sparse - self.on_value = ms.Tensor(1.0 - smooth_factor, ms.float32) - self.off_value = ms.Tensor(1.0 * smooth_factor / (num_classes - 1), ms.float32) + self.on_value = mindspore.Tensor(1.0 - smooth_factor, mindspore.float32) + self.off_value = mindspore.Tensor(1.0 * smooth_factor / (num_classes - 1), mindspore.float32) self.ce = nn.SoftmaxCrossEntropyWithLogits(reduction=reduction) def construct(self, logit, label): diff --git a/official/cv/MobileNet/mobilenetv1/src/dataset.py b/official/cv/MobileNet/mobilenetv1/src/dataset.py index 476d04b1dee1a88cf99c7323a2605e401d6ef9b4..46f33dd90ba8564258f49f1defe6bd0fad8095d0 100644 --- a/official/cv/MobileNet/mobilenetv1/src/dataset.py +++ b/official/cv/MobileNet/mobilenetv1/src/dataset.py @@ -17,7 +17,7 @@ create train or eval dataset. 
""" import os from multiprocessing import cpu_count -import mindspore as ms +import mindspore import mindspore.dataset as ds import mindspore.communication as comm @@ -58,7 +58,7 @@ def create_dataset1(dataset_path, do_train, device_num=1, batch_size=32, target= ds.vision.HWC2CHW() ] - type_cast_op = ds.transforms.transforms.TypeCast(ms.int32) + type_cast_op = ds.transforms.transforms.TypeCast(mindspore.int32) data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=THREAD_NUM) data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=THREAD_NUM) @@ -112,7 +112,7 @@ def create_dataset2(dataset_path, do_train, device_num=1, batch_size=32, target= ds.vision.HWC2CHW() ] - type_cast_op = ds.transforms.transforms.TypeCast(ms.int32) + type_cast_op = ds.transforms.transforms.TypeCast(mindspore.int32) data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=THREAD_NUM) data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=THREAD_NUM) diff --git a/official/cv/MobileNet/mobilenetv1/src/model_utils/moxing_adapter.py b/official/cv/MobileNet/mobilenetv1/src/model_utils/moxing_adapter.py index 31233d6fce26aafb3b52fa910e03f63a6fd31db4..d8cbe4824274656937fe899c6bcf5eee19d0efe4 100644 --- a/official/cv/MobileNet/mobilenetv1/src/model_utils/moxing_adapter.py +++ b/official/cv/MobileNet/mobilenetv1/src/model_utils/moxing_adapter.py @@ -20,7 +20,7 @@ import functools import zipfile import time -import mindspore as ms +import mindspore from .config import config _global_sync_count = 0 @@ -95,7 +95,7 @@ def moxing_wrapper(pre_process=None, post_process=None): sync_data(config.train_url, config.output_path) print("Workspace downloaded: ", os.listdir(config.output_path)) - ms.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) + mindspore.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) config.device_num = get_device_num() config.device_id = get_device_id() if not os.path.exists(config.output_path): @@ -105,7 +105,7 @@ def moxing_wrapper(pre_process=None, post_process=None): pre_process() if config.enable_profiling: - profiler = ms.profiler.Profiler() + profiler = mindspore.profiler.Profiler() run_func(*args, **kwargs) diff --git a/official/cv/MobileNet/mobilenetv1/train.py b/official/cv/MobileNet/mobilenetv1/train.py index e8737c295096b9fc2ed8ac4eb71004bd4193503d..8436b1aacf5cdae40d9da798e36214d15d6e181f 100644 --- a/official/cv/MobileNet/mobilenetv1/train.py +++ b/official/cv/MobileNet/mobilenetv1/train.py @@ -15,7 +15,7 @@ """train mobilenet_v1.""" import os -import mindspore as ms +import mindspore import mindspore.nn as nn import mindspore.communication as comm import mindspore.common.initializer as weight_init @@ -29,7 +29,7 @@ from src.model_utils.moxing_adapter import moxing_wrapper, modelarts_process from src.model_utils.device_adapter import get_device_num -ms.set_seed(1) +mindspore.set_seed(1) if config.dataset == 'cifar10': from src.dataset import create_dataset1 as create_dataset @@ -40,8 +40,8 @@ else: def init_weigth(net): # init weight if config.pre_trained: - param_dict = ms.load_checkpoint(config.pre_trained) - ms.load_param_into_net(net, param_dict) + param_dict = mindspore.load_checkpoint(config.pre_trained) + mindspore.load_param_into_net(net, param_dict) else: for _, cell in net.cells_and_names(): if isinstance(cell, nn.Conv2d): @@ -63,26 +63,26 @@ def train_mobilenetv1(): ckpt_save_dir = 
config.save_checkpoint_path # init context - ms.set_context(mode=ms.GRAPH_MODE, device_target=target, save_graphs=False) + mindspore.set_context(mode=0, device_target=target, save_graphs=False) # Set mempool block size in PYNATIVE_MODE for improving memory utilization, which will not take effect in GRAPH_MODE - if ms.get_context("mode") == ms.PYNATIVE_MODE: - ms.set_context(mempool_block_size="31GB") + if mindspore.get_context("mode") == 1: + mindspore.set_context(mempool_block_size="31GB") if config.parameter_server: - ms.set_ps_context(enable_ps=True) + mindspore.set_ps_context(enable_ps=True) device_id = int(os.getenv('DEVICE_ID', '0')) if config.run_distribute: if target == "Ascend": - ms.set_context(device_id=device_id) - ms.set_auto_parallel_context(device_num=get_device_num(), parallel_mode=ms.ParallelMode.DATA_PARALLEL, + mindspore.set_context(device_id=device_id) + mindspore.set_auto_parallel_context(device_num=get_device_num(), parallel_mode=mindspore.ParallelMode.DATA_PARALLEL, gradients_mean=True) comm.init() - ms.set_auto_parallel_context(all_reduce_fusion_config=[75]) + mindspore.set_auto_parallel_context(all_reduce_fusion_config=[75]) # GPU target else: comm.init() - ms.set_auto_parallel_context(device_num=comm.get_group_size(), parallel_mode=ms.ParallelMode.DATA_PARALLEL, + mindspore.set_auto_parallel_context(device_num=comm.get_group_size(), parallel_mode=mindspore.ParallelMode.DATA_PARALLEL, gradients_mean=True) ckpt_save_dir = config.save_checkpoint_path + "ckpt_" + str(comm.get_rank()) + "/" @@ -102,7 +102,7 @@ def train_mobilenetv1(): lr = get_lr(lr_init=config.lr_init, lr_end=config.lr_end, lr_max=config.lr_max, warmup_epochs=config.warmup_epochs, total_epochs=config.epoch_size, steps_per_epoch=step_size, lr_decay_mode=config.lr_decay_mode) - lr = ms.Tensor(lr) + lr = mindspore.Tensor(lr) # define opt decayed_params = [] @@ -129,12 +129,12 @@ def train_mobilenetv1(): smooth_factor=config.label_smooth_factor, num_classes=config.class_num) else: loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean') - loss_scale = ms.FixedLossScaleManager(config.loss_scale, drop_overflow_update=False) + loss_scale = mindspore.FixedLossScaleManager(config.loss_scale, drop_overflow_update=False) if target == "Ascend": - model = ms.Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics={'acc'}, + model = mindspore.Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics={'acc'}, amp_level="O2", keep_batchnorm_fp32=False) else: - model = ms.Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'}) + model = mindspore.Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'}) # define callbacks time_cb = TimeMonitor(data_size=step_size) diff --git a/official/cv/MobileNet/mobilenetv2/README.md b/official/cv/MobileNet/mobilenetv2/README.md index 66be81f6e5c5805377a20c4943c0c593b9d51b7c..bf808e4170669cac84de04b78f1d119615c2d85a 100644 --- a/official/cv/MobileNet/mobilenetv2/README.md +++ b/official/cv/MobileNet/mobilenetv2/README.md @@ -312,7 +312,7 @@ You can start training using python or shell scripts. The usage of shell scripts ### Result -Training result will be stored in the example path. Checkpoints will be stored at `. /checkpoint` by default, and training log will be redirected to `./train.log` like followings with the platform CPU and GPU. +Training result will be stored in the example path. Checkpoints will be stored at `. 
/checkpoint` by default, and training log will be redirected to `./train.log` like the following on the CPU and GPU platforms. ```log epoch: [ 0/200], step:[ 624/ 625], loss:[5.258/5.258], time:[140412.236], lr:[0.100] @@ -352,7 +352,7 @@ You can start training using python or shell scripts.If the train method is trai ### Result -Inference result will be stored in the example path, you can find result like the followings in `eval.log`. +Inference results will be stored in the example path; you can find results like the following in `eval.log`. ```log result: {'acc': 0.71976314102564111} ckpt=./ckpt_0/mobilenet-200_625.ckpt diff --git a/official/cv/MobileNet/mobilenetv2/eval.py b/official/cv/MobileNet/mobilenetv2/eval.py index f348ca8ec4f7327f0542fa451ff45f791733e7db..bd39c52b09b324d317fa85ab166c4c52220d23a3 100644 --- a/official/cv/MobileNet/mobilenetv2/eval.py +++ b/official/cv/MobileNet/mobilenetv2/eval.py @@ -16,7 +16,7 @@ eval. """ import os -import mindspore as ms +import mindspore import mindspore.nn as nn from src.dataset import create_dataset from src.models import define_net, load_ckpt @@ -29,7 +29,7 @@ config.is_training = config.is_training_eval @moxing_wrapper(pre_process=modelarts_process) def eval_mobilenetv2(): - ms.set_context(mode=ms.GRAPH_MODE, device_target=config.platform, save_graphs=False) + mindspore.set_context(mode=0, device_target=config.platform, save_graphs=False) config.dataset_path = os.path.join(config.dataset_path, 'validation_preprocess') print('\nconfig: \n', config) if not config.device_id: @@ -46,7 +46,7 @@ def eval_mobilenetv2(): net.set_train(False) loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean') - model = ms.Model(net, loss_fn=loss, metrics={'acc'}) + model = mindspore.Model(net, loss_fn=loss, metrics={'acc'}) res = model.eval(dataset) print(f"result:{res}\npretrain_ckpt={config.pretrain_ckpt}") diff --git a/official/cv/MobileNet/mobilenetv2/export.py b/official/cv/MobileNet/mobilenetv2/export.py index 095fbc824c42378597d415465d608ae2fa5a815d..66ed700fb3051ee580753aa97b292d7702924ea5 100644 --- a/official/cv/MobileNet/mobilenetv2/export.py +++ b/official/cv/MobileNet/mobilenetv2/export.py @@ -16,7 +16,7 @@ mobilenetv2 export file.
""" import numpy as np -import mindspore as ms +import mindspore from src.models import define_net, load_ckpt from src.model_utils.config import config from src.model_utils.moxing_adapter import moxing_wrapper @@ -29,13 +29,13 @@ config.is_training = config.is_training_export def export_mobilenetv2(): """ export_mobilenetv2 """ print('\nconfig: \n', config) - ms.set_context(mode=ms.GRAPH_MODE, device_target=config.platform, save_graphs=False) + mindspore.set_context(mode=0, device_target=config.platform, save_graphs=False) _, _, net = define_net(config, config.is_training) load_ckpt(net, config.ckpt_file) input_shp = [config.batch_size, 3, config.image_height, config.image_width] - input_array = ms.Tensor(np.random.uniform(-1.0, 1.0, size=input_shp).astype(np.float32)) - ms.export(net, input_array, file_name=config.file_name, file_format=config.file_format) + input_array = mindspore.Tensor(np.random.uniform(-1.0, 1.0, size=input_shp).astype(np.float32)) + mindspore.export(net, input_array, file_name=config.file_name, file_format=config.file_format) if __name__ == '__main__': diff --git a/official/cv/MobileNet/mobilenetv2/golden_stick/quantization/simqat/eval.py b/official/cv/MobileNet/mobilenetv2/golden_stick/quantization/simqat/eval.py index 799308e9df64115efbd2854f170fc31711281c2c..7787481e2771bbf494938ef251798fc11aa15c6b 100644 --- a/official/cv/MobileNet/mobilenetv2/golden_stick/quantization/simqat/eval.py +++ b/official/cv/MobileNet/mobilenetv2/golden_stick/quantization/simqat/eval.py @@ -15,7 +15,7 @@ """ eval. """ -import mindspore as ms +import mindspore import mindspore.nn as nn from mindspore.train.serialization import load_checkpoint, load_param_into_net from simqat import create_simqat @@ -33,9 +33,9 @@ config.is_training = config.is_training_eval def eval_mobilenetv2(): """eval_mobilenetv2 """ if config.mode_name == "GRAPH": - ms.set_context(mode=ms.GRAPH_MODE, device_target=config.platform, save_graphs=False) + mindspore.set_context(mode=0, device_target=config.platform, save_graphs=False) else: - ms.set_context(mode=ms.PYNATIVE_MODE, device_target=config.platform, save_graphs=False) + mindspore.set_context(mode=1, device_target=config.platform, save_graphs=False) config.dataset_path = config.dataset_path print('\nconfig: \n', config) if not config.device_id: @@ -48,7 +48,7 @@ def eval_mobilenetv2(): net.set_train(False) loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean') - model = ms.Model(net, loss_fn=loss, metrics={'acc'}) + model = mindspore.Model(net, loss_fn=loss, metrics={'acc'}) dataset = create_dataset_cifar10(dataset_path=config.dataset_path, do_train=False, config=config) step_size = dataset.get_dataset_size() if step_size == 0: diff --git a/official/cv/MobileNet/mobilenetv2/golden_stick/quantization/simqat/train.py b/official/cv/MobileNet/mobilenetv2/golden_stick/quantization/simqat/train.py index ed1cea70dcbac49f8c293cce6acca2297f85c0f2..047625382c51e6aa3fff625476ade989cf973925 100644 --- a/official/cv/MobileNet/mobilenetv2/golden_stick/quantization/simqat/train.py +++ b/official/cv/MobileNet/mobilenetv2/golden_stick/quantization/simqat/train.py @@ -19,7 +19,7 @@ import time import random import numpy as np -import mindspore as ms +import mindspore import mindspore.communication as comm import mindspore.nn as nn from simqat import create_simqat @@ -33,7 +33,7 @@ from src.model_utils.moxing_adapter import moxing_wrapper, modelarts_process from src.model_utils.device_adapter import get_device_id -ms.set_seed(1) +mindspore.set_seed(1) 
@moxing_wrapper(pre_process=modelarts_process) def train_mobilenetv2(): @@ -41,16 +41,16 @@ def train_mobilenetv2(): if config.platform != "GPU": raise NotImplementedError("SimQAT only support running on GPU now!") if config.mode_name == "GRAPH": - ms.set_context(mode=ms.GRAPH_MODE, device_target=config.platform, + mindspore.set_context(mode=0, device_target=config.platform, save_graphs=False, enable_graph_kernel=True) else: - ms.set_context(mode=ms.PYNATIVE_MODE, device_target=config.platform, + mindspore.set_context(mode=1, device_target=config.platform, save_graphs=False, enable_graph_kernel=True) if config.run_distribute: comm.init() config.rank_id = comm.get_rank() config.rank_size = comm.get_group_size() - ms.set_auto_parallel_context(parallel_mode=ms.ParallelMode.DATA_PARALLEL, + mindspore.set_auto_parallel_context(parallel_mode=mindspore.ParallelMode.DATA_PARALLEL, gradients_mean=True) config.train_dataset_path = config.dataset_path config.eval_dataset_path = config.dataset_path @@ -88,7 +88,7 @@ def train_mobilenetv2(): epoch_size = config.epoch_size # get learning rate - lr = ms.Tensor(get_lr(global_step=0, + lr = mindspore.Tensor(get_lr(global_step=0, lr_init=config.lr_init, lr_end=config.lr_end, lr_max=config.lr_max, @@ -100,7 +100,7 @@ def train_mobilenetv2(): eval_dataset = None if config.pretrain_ckpt == "" or config.freeze_layer != "backbone": opt = nn.Momentum(net.trainable_params(), lr, config.momentum, config.weight_decay) - model = ms.Model(net, loss_fn=loss, optimizer=opt, metrics=metrics, eval_network=dist_eval_network, + model = mindspore.Model(net, loss_fn=loss, optimizer=opt, metrics=metrics, eval_network=dist_eval_network, boost_level=config.boost_mode) cb = config_ckpoint(config, lr, step_size, model, eval_dataset) print("============== Starting Training ==============") @@ -127,15 +127,15 @@ def train_mobilenetv2(): epoch_start = time.time() losses = [] for j in idx_list: - feature = ms.Tensor(np.load(os.path.join(features_path, "feature_{}.npy".format(j)))) - label = ms.Tensor(np.load(os.path.join(features_path, "label_{}.npy".format(j)))) + feature = mindspore.Tensor(np.load(os.path.join(features_path, "feature_{}.npy".format(j)))) + label = mindspore.Tensor(np.load(os.path.join(features_path, "label_{}.npy".format(j)))) losses.append(network(feature, label).asnumpy()) epoch_mseconds = (time.time()-epoch_start) * 1000 per_step_mseconds = epoch_mseconds / step_size print("epoch[{}/{}], iter[{}] cost: {:5.3f}, per step time: {:5.3f}, avg loss: {:5.3f}"\ .format(epoch + 1, epoch_size, step_size, epoch_mseconds, per_step_mseconds, np.mean(np.array(losses)))) if (epoch + 1) % config.save_checkpoint_epochs == 0: - ms.save_checkpoint(net, os.path.join(save_ckpt_path, "mobilenetv2_{}.ckpt".format(epoch + 1))) + mindspore.save_checkpoint(net, os.path.join(save_ckpt_path, "mobilenetv2_{}.ckpt".format(epoch + 1))) print("total cost {:5.4f} s".format(time.time() - start)) diff --git a/official/cv/MobileNet/mobilenetv2/scripts/run_distribute_train_msrun.sh b/official/cv/MobileNet/mobilenetv2/scripts/run_distribute_train_msrun.sh new file mode 100644 index 0000000000000000000000000000000000000000..636e337f609a2a21c453537a905a565ebc66b130 --- /dev/null +++ b/official/cv/MobileNet/mobilenetv2/scripts/run_distribute_train_msrun.sh @@ -0,0 +1,45 @@ +#!/bin/bash +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +echo "==============================================================================================================" +echo "Please run the script as: " +echo "bash scripts/run_distribute_train_msrun.sh [DATA_PATH] [RANK_SIZE]" +echo "For example: bash scripts/run_distribute_train_msrun.sh /path/dataset 8" +echo "It is better to use the absolute path." +echo "==============================================================================================================" + +DATA_PATH=$1 +RANK_SIZE=$2 +export DATA_PATH=${DATA_PATH} +export RANK_SIZE=${RANK_SIZE} +export HCCL_CONNECT_TIMEOUT=600 +ulimit -s 302400 + +EXEC_PATH=$(pwd) +CONFIG_PATH=${EXEC_PATH}/default_config.yaml + +if [ ! -d "${DATA_PATH}" ] +then + echo "ERROR: ${DATA_PATH} is not a valid path for dataset, please check." + exit 1 +fi + +env > env.log +echo "start training" +msrun --bind_core=True --worker_num=8 --local_worker_num=8 --master_port 8118 \ + --log_dir=msrun_log --join=True --cluster_time_out=300 \ + train.py --run_distribute True --config_path=${CONFIG_PATH} --platform Ascend --dataset_path=${DATA_PATH} --rank_size ${RANK_SIZE} &> log.txt & + diff --git a/official/cv/MobileNet/mobilenetv2/src/dataset.py b/official/cv/MobileNet/mobilenetv2/src/dataset.py index 3204020f5a4921035b86b15c78db8774be772ae2..c4dec72745e2f1240f9ac321b98b9cdaa00276a4 100644 --- a/official/cv/MobileNet/mobilenetv2/src/dataset.py +++ b/official/cv/MobileNet/mobilenetv2/src/dataset.py @@ -18,7 +18,7 @@ create train or eval dataset. import os import numpy as np -import mindspore as ms +import mindspore import mindspore.dataset as ds @@ -70,7 +70,7 @@ def create_dataset(dataset_path, do_train, config, enable_cache=False, cache_ses else: trans = [decode_op, resize_op, center_crop, normalize_op, change_swap_op] - type_cast_op = ds.transforms.transforms.TypeCast(ms.int32) + type_cast_op = ds.transforms.transforms.TypeCast(mindspore.int32) data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=num_workers) data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=num_workers) @@ -133,7 +133,7 @@ def create_dataset_cifar10(dataset_path, do_train, config, enable_cache=False, c ds.vision.HWC2CHW() ] - type_cast_op = ds.transforms.transforms.TypeCast(ms.int32) + type_cast_op = ds.transforms.transforms.TypeCast(mindspore.int32) data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=num_workers) @@ -168,7 +168,7 @@ def extract_features(net, dataset_path, config): raise ValueError("The step_size of dataset is zero. 
Check if the images count of train dataset is more \ than batch_size in config.py") - model = ms.Model(net) + model = mindspore.Model(net) for i, data in enumerate(dataset.create_dict_iterator(output_numpy=True)): features_path = os.path.join(features_folder, f"feature_{i}.npy") @@ -176,7 +176,7 @@ def extract_features(net, dataset_path, config): if not os.path.exists(features_path) or not os.path.exists(label_path): image = data["image"] label = data["label"] - features = model.predict(ms.Tensor(image)) + features = model.predict(mindspore.Tensor(image)) np.save(features_path, features.asnumpy()) np.save(label_path, label) print(f"Complete the batch {i + 1}/{step_size}") diff --git a/official/cv/MobileNet/mobilenetv2/src/metric.py b/official/cv/MobileNet/mobilenetv2/src/metric.py index bf10de06d15a8447badc33f46906f5ef2c38bf9b..3a6ee94d139456183f4c679f6baa1fbd82e2aed6 100644 --- a/official/cv/MobileNet/mobilenetv2/src/metric.py +++ b/official/cv/MobileNet/mobilenetv2/src/metric.py @@ -14,7 +14,7 @@ # ============================================================================ """evaluation metric.""" -import mindspore as ms +import mindspore import mindspore.communication as comm import mindspore.ops as ops import mindspore.nn as nn @@ -56,9 +56,9 @@ class ClassifyCorrectCell(nn.Cell): def construct(self, data, label): outputs = self._network(data) y_pred = self.argmax(outputs) - y_pred = self.cast(y_pred, ms.int32) + y_pred = self.cast(y_pred, mindspore.int32) y_correct = self.equal(y_pred, label) - y_correct = self.cast(y_correct, ms.float32) + y_correct = self.cast(y_correct, mindspore.float32) y_correct = self.reduce_sum(y_correct) if self.run_distribute: y_correct = self.allreduce(y_correct) diff --git a/official/cv/MobileNet/mobilenetv2/src/mobilenetV2.py b/official/cv/MobileNet/mobilenetv2/src/mobilenetV2.py index c762e8ea99fda3a8169a1fa9dd746d6511124d77..c4a50ba13f8b976718903d00819b49f2208c7834 100644 --- a/official/cv/MobileNet/mobilenetv2/src/mobilenetV2.py +++ b/official/cv/MobileNet/mobilenetv2/src/mobilenetV2.py @@ -14,7 +14,7 @@ # ============================================================================ """MobileNetV2 model define""" import numpy as np -import mindspore as ms +import mindspore import mindspore.nn as nn import mindspore.ops as ops @@ -211,17 +211,17 @@ class MobileNetV2Backbone(nn.Cell): for _, m in self.cells_and_names(): if isinstance(m, nn.Conv2d): n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels - m.weight.set_data(ms.Tensor(np.random.normal(0, np.sqrt(2. / n), + m.weight.set_data(mindspore.Tensor(np.random.normal(0, np.sqrt(2. 
/ n), m.weight.data.shape).astype("float32"))) if m.bias is not None: m.bias.set_data( - ms.Tensor(np.zeros(m.bias.data.shape, dtype="float32"), dtype=ms.float32)) + mindspore.Tensor(np.zeros(m.bias.data.shape, dtype="float32"), dtype=mindspore.float32)) elif isinstance(m, nn.BatchNorm2d): m.gamma.set_data( - ms.Tensor(np.ones(m.gamma.data.shape, dtype="float32"))) + mindspore.Tensor(np.ones(m.gamma.data.shape, dtype="float32"))) m.beta.set_data( - ms.Tensor(np.zeros(m.beta.data.shape, dtype="float32"), dtype=ms.float32)) + mindspore.Tensor(np.zeros(m.beta.data.shape, dtype="float32"), dtype=mindspore.float32)) @property def get_features(self): @@ -280,11 +280,11 @@ class MobileNetV2Head(nn.Cell): self.init_parameters_data() for _, m in self.cells_and_names(): if isinstance(m, nn.Dense): - m.weight.set_data(ms.Tensor(np.random.normal( + m.weight.set_data(mindspore.Tensor(np.random.normal( 0, 0.01, m.weight.data.shape).astype("float32"))) if m.bias is not None: m.bias.set_data( - ms.Tensor(np.zeros(m.bias.data.shape, dtype="float32"), dtype=ms.float32)) + mindspore.Tensor(np.zeros(m.bias.data.shape, dtype="float32"), dtype=mindspore.float32)) class MobileNetV2Combine(nn.Cell): diff --git a/official/cv/MobileNet/mobilenetv2/src/mobilenetV2_fusion.py b/official/cv/MobileNet/mobilenetv2/src/mobilenetV2_fusion.py index 6141916cd7d201ccce6b8a5cb7d9b3bf611bf258..acd3788c2b9112f0f973fbfa333dd26746f8d044 100644 --- a/official/cv/MobileNet/mobilenetv2/src/mobilenetV2_fusion.py +++ b/official/cv/MobileNet/mobilenetv2/src/mobilenetV2_fusion.py @@ -15,7 +15,7 @@ # """MobileNetV2 Quant model define""" import numpy as np -import minspore as ms +import mindspore import mindspore.nn as nn import mindspore.ops as ops @@ -215,25 +215,25 @@ class mobilenetV2(nn.Cell): for _, m in self.cells_and_names(): if isinstance(m, nn.Conv2d): n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels - w = ms.Tensor(np.random.normal(0, np.sqrt(2. / n), m.weight.data.shape).astype("float32")) + w = mindspore.Tensor(np.random.normal(0, np.sqrt(2. / n), m.weight.data.shape).astype("float32")) m.weight.set_data(w) if m.bias is not None: - m.bias.set_data(ms.numpy.zeros(m.bias.data.shape, dtype="float32")) + m.bias.set_data(mindspore.numpy.zeros(m.bias.data.shape, dtype="float32")) elif isinstance(m, nn.Conv2dBnAct): n = m.conv.kernel_size[0] * m.conv.kernel_size[1] * m.conv.out_channels - w = ms.Tensor(np.random.normal(0, np.sqrt(2. / n), m.conv.weight.data.shape).astype("float32")) + w = mindspore.Tensor(np.random.normal(0, np.sqrt(2. 
/ n), m.conv.weight.data.shape).astype("float32")) m.conv.weight.set_data(w) if m.conv.bias is not None: - m.conv.bias.set_data(ms.numpy.zeros(m.conv.bias.data.shape, dtype="float32")) + m.conv.bias.set_data(mindspore.numpy.zeros(m.conv.bias.data.shape, dtype="float32")) elif isinstance(m, nn.BatchNorm2d): - m.gamma.set_data(ms.Tensor(np.ones(m.gamma.data.shape, dtype="float32"))) - m.beta.set_data(ms.numpy.zeros(m.beta.data.shape, dtype="float32")) + m.gamma.set_data(mindspore.Tensor(np.ones(m.gamma.data.shape, dtype="float32"))) + m.beta.set_data(mindspore.numpy.zeros(m.beta.data.shape, dtype="float32")) elif isinstance(m, nn.Dense): - m.weight.set_data(ms.Tensor(np.random.normal(0, 0.01, m.weight.data.shape).astype("float32"))) + m.weight.set_data(mindspore.Tensor(np.random.normal(0, 0.01, m.weight.data.shape).astype("float32"))) if m.bias is not None: - m.bias.set_data(ms.numpy.zeros(m.bias.data.shape, dtype="float32")) + m.bias.set_data(mindspore.numpy.zeros(m.bias.data.shape, dtype="float32")) elif isinstance(m, nn.DenseBnAct): m.dense.weight.set_data( - ms.Tensor(np.random.normal(0, 0.01, m.dense.weight.data.shape).astype("float32"))) + mindspore.Tensor(np.random.normal(0, 0.01, m.dense.weight.data.shape).astype("float32"))) if m.dense.bias is not None: - m.dense.bias.set_data(ms.numpy.zeros(m.dense.bias.data.shape, dtype="float32")) + m.dense.bias.set_data(mindspore.numpy.zeros(m.dense.bias.data.shape, dtype="float32")) diff --git a/official/cv/MobileNet/mobilenetv2/src/model_utils/moxing_adapter.py b/official/cv/MobileNet/mobilenetv2/src/model_utils/moxing_adapter.py index 12a72538d97fbca9d5f35c7b9e4bd75a04737712..8f604f1428362ae9621430e7943cc73f1a994a09 100644 --- a/official/cv/MobileNet/mobilenetv2/src/model_utils/moxing_adapter.py +++ b/official/cv/MobileNet/mobilenetv2/src/model_utils/moxing_adapter.py @@ -18,7 +18,7 @@ import os import functools import time -import mindspore as ms +import mindspore from .config import config _global_sync_count = 0 @@ -92,7 +92,7 @@ def moxing_wrapper(pre_process=None, post_process=None): sync_data(config.train_url, config.output_path) print("Workspace downloaded: ", os.listdir(config.output_path)) - ms.context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) + mindspore.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) config.device_num = get_device_num() config.device_id = get_device_id() if not os.path.exists(config.output_path): @@ -102,7 +102,7 @@ def moxing_wrapper(pre_process=None, post_process=None): pre_process() if config.enable_profiling: - profiler = ms.profiler.Profiler() + profiler = mindspore.profiler.Profiler() run_func(*args, **kwargs) diff --git a/official/cv/MobileNet/mobilenetv2/src/models.py b/official/cv/MobileNet/mobilenetv2/src/models.py index 9e4324f90755446668220f6c4cccacdc5593f4a2..03b778cffff062d823efccb3be1a1b2a73af4924 100644 --- a/official/cv/MobileNet/mobilenetv2/src/models.py +++ b/official/cv/MobileNet/mobilenetv2/src/models.py @@ -14,7 +14,7 @@ # ============================================================================ import time import numpy as np -import mindspore as ms +import mindspore import mindspore.nn as nn import mindspore.ops as ops from mindspore.train.callback import Callback @@ -40,15 +40,15 @@ class CrossEntropyWithLabelSmooth(nn.LossBase): def __init__(self, smooth_factor=0., num_classes=1000): super(CrossEntropyWithLabelSmooth, self).__init__() self.onehot = ops.OneHot() - self.on_value = ms.Tensor(1.0 - smooth_factor, 
ms.float32) - self.off_value = ms.Tensor(1.0 * smooth_factor / - (num_classes - 1), ms.float32) + self.on_value = mindspore.Tensor(1.0 - smooth_factor, mindspore.float32) + self.off_value = mindspore.Tensor(1.0 * smooth_factor / + (num_classes - 1), mindspore.float32) self.ce = nn.SoftmaxCrossEntropyWithLogits() self.mean = ops.ReduceMean(False) self.cast = ops.Cast() def construct(self, logit, label): - one_hot_label = self.onehot(self.cast(label, ms.int32), ops.shape(logit)[1], + one_hot_label = self.onehot(self.cast(label, mindspore.int32), ops.shape(logit)[1], self.on_value, self.off_value) out_loss = self.ce(logit, one_hot_label) out_loss = self.mean(out_loss, 0) @@ -66,7 +66,7 @@ class Monitor(Callback): None Examples: - >>> Monitor(100,lr_init=ms.Tensor([0.05]*100).asnumpy()) + >>> Monitor(100,lr_init=mindspore.Tensor([0.05]*100).asnumpy()) """ def __init__(self, lr_init=None, model=None, eval_dataset=None): @@ -107,9 +107,9 @@ class Monitor(Callback): step_mseconds = (time.time() - self.step_time) * 1000 step_loss = cb_params.net_outputs - if isinstance(step_loss, (tuple, list)) and isinstance(step_loss[0], ms.Tensor): + if isinstance(step_loss, (tuple, list)) and isinstance(step_loss[0], mindspore.Tensor): step_loss = step_loss[0] - if isinstance(step_loss, ms.Tensor): + if isinstance(step_loss, mindspore.Tensor): step_loss = np.mean(step_loss.asnumpy()) self.losses.append(step_loss) @@ -123,8 +123,8 @@ class Monitor(Callback): def load_ckpt(network, pretrain_ckpt_path, trainable=True): """load checkpoint into network.""" - param_dict = ms.load_checkpoint(pretrain_ckpt_path) - ms.load_param_into_net(network, param_dict) + param_dict = mindspore.load_checkpoint(pretrain_ckpt_path) + mindspore.load_param_into_net(network, param_dict) if not trainable: for param in network.get_parameters(): param.requires_grad = False diff --git a/official/cv/MobileNet/mobilenetv2/train.py b/official/cv/MobileNet/mobilenetv2/train.py index d6c657445b3d4c21edabf8e0e4635f533573e0b3..283c5fcffee2d4ca5fc15e771d134d786aba0fd1 100644 --- a/official/cv/MobileNet/mobilenetv2/train.py +++ b/official/cv/MobileNet/mobilenetv2/train.py @@ -19,7 +19,7 @@ import time import random import numpy as np -import mindspore as ms +import mindspore import mindspore.communication as comm import mindspore.nn as nn @@ -33,7 +33,7 @@ from src.model_utils.moxing_adapter import moxing_wrapper, modelarts_process from src.model_utils.device_adapter import get_device_id -ms.set_seed(1) +mindspore.set_seed(1) @moxing_wrapper(pre_process=modelarts_process) @@ -41,12 +41,12 @@ def train_mobilenetv2(): """ train_mobilenetv2 """ if config.platform == "CPU": config.run_distribute = False - ms.set_context(mode=ms.GRAPH_MODE, device_target=config.platform, save_graphs=False) + mindspore.set_context(mode=0, device_target=config.platform, save_graphs=False) if config.run_distribute: comm.init() config.rank_id = comm.get_rank() config.rank_size = comm.get_group_size() - ms.set_auto_parallel_context(parallel_mode=ms.ParallelMode.DATA_PARALLEL, + mindspore.set_auto_parallel_context(parallel_mode=mindspore.ParallelMode.DATA_PARALLEL, gradients_mean=True) config.train_dataset_path = os.path.join(config.dataset_path, 'train') config.eval_dataset_path = os.path.join(config.dataset_path, 'validation_preprocess') @@ -61,7 +61,7 @@ def train_mobilenetv2(): enable_cache=config.enable_cache, cache_session_id=config.cache_session_id) step_size = dataset.get_dataset_size() if config.platform == "GPU": - ms.set_context(enable_graph_kernel=True) + 
mindspore.set_context(enable_graph_kernel=True) if config.pretrain_ckpt: if config.freeze_layer == "backbone": load_ckpt(backbone_net, config.pretrain_ckpt, trainable=False) @@ -84,7 +84,7 @@ def train_mobilenetv2(): epoch_size = config.epoch_size # get learning rate - lr = ms.Tensor(get_lr(global_step=0, + lr = mindspore.Tensor(get_lr(global_step=0, lr_init=config.lr_init, lr_end=config.lr_end, lr_max=config.lr_max, @@ -100,10 +100,10 @@ def train_mobilenetv2(): eval_dataset = create_dataset(dataset_path=config.eval_dataset_path, do_train=False, config=config) if config.pretrain_ckpt == "" or config.freeze_layer != "backbone": if config.platform == "Ascend": - loss_scale = ms.FixedLossScaleManager(config.loss_scale, drop_overflow_update=False) + loss_scale = mindspore.FixedLossScaleManager(config.loss_scale, drop_overflow_update=False) group_params = build_params_groups(net) opt = nn.Momentum(group_params, lr, config.momentum, loss_scale=config.loss_scale) - model = ms.Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, + model = mindspore.Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics=metrics, eval_network=dist_eval_network, amp_level="O2", keep_batchnorm_fp32=False, boost_level=config.boost_mode, @@ -111,7 +111,7 @@ def train_mobilenetv2(): else: opt = nn.Momentum(net.trainable_params(), lr, config.momentum, config.weight_decay) - model = ms.Model(net, loss_fn=loss, optimizer=opt, metrics=metrics, eval_network=dist_eval_network, + model = mindspore.Model(net, loss_fn=loss, optimizer=opt, metrics=metrics, eval_network=dist_eval_network, boost_level=config.boost_mode) cb = config_ckpoint(config, lr, step_size, model, eval_dataset) print("============== Starting Training ==============") @@ -138,15 +138,15 @@ def train_mobilenetv2(): epoch_start = time.time() losses = [] for j in idx_list: - feature = ms.Tensor(np.load(os.path.join(features_path, "feature_{}.npy".format(j)))) - label = ms.Tensor(np.load(os.path.join(features_path, "label_{}.npy".format(j)))) + feature = mindspore.Tensor(np.load(os.path.join(features_path, "feature_{}.npy".format(j)))) + label = mindspore.Tensor(np.load(os.path.join(features_path, "label_{}.npy".format(j)))) losses.append(network(feature, label).asnumpy()) epoch_mseconds = (time.time()-epoch_start) * 1000 per_step_mseconds = epoch_mseconds / step_size print("epoch[{}/{}], iter[{}] cost: {:5.3f}, per step time: {:5.3f}, avg loss: {:5.3f}"\ .format(epoch + 1, epoch_size, step_size, epoch_mseconds, per_step_mseconds, np.mean(np.array(losses)))) if (epoch + 1) % config.save_checkpoint_epochs == 0: - ms.save_checkpoint(net, os.path.join(save_ckpt_path, "mobilenetv2_{}.ckpt".format(epoch + 1))) + mindspore.save_checkpoint(net, os.path.join(save_ckpt_path, "mobilenetv2_{}.ckpt".format(epoch + 1))) print("total cost {:5.4f} s".format(time.time() - start)) if config.enable_cache: diff --git a/official/cv/MobileNet/mobilenetv3/Readme.md b/official/cv/MobileNet/mobilenetv3/Readme.md index f0d5116e0063116e2d63e3cc833a0626006a2265..c14f0f44f5b0f7d6b75b06a545bb05c191351a35 100644 --- a/official/cv/MobileNet/mobilenetv3/Readme.md +++ b/official/cv/MobileNet/mobilenetv3/Readme.md @@ -105,7 +105,7 @@ You can start training using python or shell scripts. The usage of shell scripts ### Result -Training result will be stored in the example path. Checkpoints will be stored at `. /checkpoint` by default, and training log will be redirected to `./train/train.log` like followings. 
+Training results will be stored in the example path. Checkpoints will be stored at `./checkpoint` by default, and the training log will be redirected to `./train/train.log`, like the following.

```bash
epoch: [ 0/200], step:[ 624/ 625], loss:[5.258/5.258], time:[140412.236], lr:[0.100]
@@ -140,7 +140,7 @@ You can start training using python or shell scripts. The usage of shell scripts

### Result

-Inference result will be stored in the example path, you can find result like the followings in `val.log`.
+Inference results will be stored in the example path; you can find results like the following in `val.log`.

```bash
result: {'acc': 0.71976314102564111} ckpt=/path/to/checkpoint/mobilenet-200_625.ckpt
diff --git a/official/cv/MobileNet/mobilenetv3/eval.py b/official/cv/MobileNet/mobilenetv3/eval.py
index 38657813ee230bb82da4de5efe9663cd2a92abf5..33a85755b2ab7f0528ac275d9f8a0cf4b4e44e52 100644
--- a/official/cv/MobileNet/mobilenetv3/eval.py
+++ b/official/cv/MobileNet/mobilenetv3/eval.py
@@ -16,7 +16,7 @@ eval.
 """
 import argparse
-import mindspore as ms
+import mindspore
 from mindspore import nn
 from src.dataset import create_dataset
 from src.dataset import create_dataset_cifar
@@ -36,7 +36,7 @@ if __name__ == '__main__':
     config = None
     if args_opt.device_target == "GPU":
         config = config_gpu
-        ms.set_context(mode=ms.GRAPH_MODE,
+        mindspore.set_context(mode=0,
                        device_target="GPU",
                        save_graphs=False)
         dataset = create_dataset(dataset_path=args_opt.dataset_path,
                                  do_train=False,
@@ -45,7 +45,7 @@ if __name__ == '__main__':
                                  batch_size=config.batch_size)
     elif args_opt.device_target == "CPU":
         config = config_cpu
-        ms.set_context(mode=ms.GRAPH_MODE,
+        mindspore.set_context(mode=0,
                        device_target="CPU",
                        save_graphs=False)
         dataset = create_dataset_cifar(dataset_path=args_opt.dataset_path,
                                        do_train=False,
@@ -59,10 +59,10 @@ if __name__ == '__main__':
     step_size = dataset.get_dataset_size()
     if args_opt.checkpoint_path:
-        param_dict = ms.load_checkpoint(args_opt.checkpoint_path)
-        ms.load_param_into_net(net, param_dict)
+        param_dict = mindspore.load_checkpoint(args_opt.checkpoint_path)
+        mindspore.load_param_into_net(net, param_dict)
     net.set_train(False)
-    model = ms.Model(net, loss_fn=loss, metrics={'acc'})
+    model = mindspore.Model(net, loss_fn=loss, metrics={'acc'})
     res = model.eval(dataset)
     print("result:", res, "ckpt=", args_opt.checkpoint_path)
diff --git a/official/cv/MobileNet/mobilenetv3/export.py b/official/cv/MobileNet/mobilenetv3/export.py
index 6dabadfd7d640d58c583dbab30c1dd9ce6e4b4b0..f40a0790071b577b2f379fba0bc823852f87298d 100644
--- a/official/cv/MobileNet/mobilenetv3/export.py
+++ b/official/cv/MobileNet/mobilenetv3/export.py
@@ -17,7 +17,7 @@ mobilenetv3 export mindir.
""" import argparse import numpy as np -import mindspore as ms +import mindspore from src.config import config_gpu from src.config import config_cpu from src.config import config_ascend @@ -35,20 +35,20 @@ if __name__ == '__main__': cfg = None if args_opt.device_target == "GPU": cfg = config_gpu - ms.set_context(mode=ms.GRAPH_MODE, device_target="GPU") + mindspore.set_context(mode=0, device_target="GPU") elif args_opt.device_target == "CPU": cfg = config_cpu - ms.set_context(mode=ms.GRAPH_MODE, device_target="CPU") + mindspore.set_context(mode=0, device_target="CPU") elif args_opt.device_target == "Ascend": cfg = config_ascend - ms.set_context(mode=ms.GRAPH_MODE, device_target="Ascend") + mindspore.set_context(mode=0, device_target="Ascend") else: raise ValueError("Unsupported device_target.") net = mobilenet_v3_large(num_classes=cfg.num_classes, activation="Softmax") - param_dict = ms.load_checkpoint(args_opt.checkpoint_path) - ms.load_param_into_net(net, param_dict) + param_dict = mindspore.load_checkpoint(args_opt.checkpoint_path) + mindspore.load_param_into_net(net, param_dict) input_shp = [1, 3, cfg.image_height, cfg.image_width] - input_array = ms.Tensor(np.random.uniform(-1.0, 1.0, size=input_shp).astype(np.float32)) - ms.export(net, input_array, file_name=args_opt.file_name, file_format=args_opt.file_format) + input_array = mindspore.Tensor(np.random.uniform(-1.0, 1.0, size=input_shp).astype(np.float32)) + mindspore.export(net, input_array, file_name=args_opt.file_name, file_format=args_opt.file_format) diff --git a/official/cv/MobileNet/mobilenetv3/infer_onnx.py b/official/cv/MobileNet/mobilenetv3/infer_onnx.py index b478067afbb3322fd3cbc0fa100fc691ca33a89f..75fc717cbd41666a99f8f2bbe1d2f5cc919695d1 100644 --- a/official/cv/MobileNet/mobilenetv3/infer_onnx.py +++ b/official/cv/MobileNet/mobilenetv3/infer_onnx.py @@ -18,7 +18,7 @@ import argparse import onnxruntime import numpy as np -import mindspore as ms +import mindspore from mindspore import Tensor from mindspore import ops from src.dataset import create_dataset @@ -64,7 +64,7 @@ if __name__ == '__main__': model_predict = session.run(None, inputs) model_predict = np.expand_dims(np.squeeze(model_predict), axis=0) - input_x = Tensor(model_predict[0], ms.float16) + input_x = Tensor(model_predict[0], mindspore.float16) _, k_label = topk(input_x, k) if k_label[0] == labels: correct_top1 = correct_top1 + 1 diff --git a/official/cv/MobileNet/mobilenetv3/src/dataset.py b/official/cv/MobileNet/mobilenetv3/src/dataset.py index 43c2b7b63df5ca30e35effe3dba1a5ae74bbc9c4..006d88100cd654c2e7b2c03ddd5b5c906d6f48b9 100644 --- a/official/cv/MobileNet/mobilenetv3/src/dataset.py +++ b/official/cv/MobileNet/mobilenetv3/src/dataset.py @@ -15,7 +15,7 @@ """ create train or eval dataset. 
""" -import mindspore as ms +import mindspore import mindspore.dataset as ds @@ -67,7 +67,7 @@ def create_dataset(dataset_path, do_train, config, device_target, batch_size=32, else: trans = [decode_op, resize_op, center_crop, normalize_op, change_swap_op] - type_cast_op = ds.transforms.transforms.TypeCast(ms.int32) + type_cast_op = ds.transforms.transforms.TypeCast(mindspore.int32) data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=8) data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8) @@ -117,7 +117,7 @@ def create_dataset_cifar(dataset_path, ds.vision.HWC2CHW() ] - type_cast_op = ds.transforms.transforms.TypeCast(ms.int32) + type_cast_op = ds.transforms.transforms.TypeCast(mindspore.int32) data_set = data_set.map(operations=type_cast_op, input_columns="label", diff --git a/official/cv/MobileNet/mobilenetv3/src/mobilenetV3.py b/official/cv/MobileNet/mobilenetv3/src/mobilenetV3.py index d243e70e6bb022ded146de3b3c65549c8a84c0d4..58361ded6cf0d46dec7f14de8689434c16732b92 100644 --- a/official/cv/MobileNet/mobilenetv3/src/mobilenetV3.py +++ b/official/cv/MobileNet/mobilenetv3/src/mobilenetV3.py @@ -15,7 +15,7 @@ """MobileNetV3 model define""" from functools import partial import numpy as np -import mindspore as ms +import mindspore import mindspore.nn as nn import mindspore.ops as ops @@ -339,21 +339,21 @@ class MobileNetV3(nn.Cell): for _, m in self.cells_and_names(): if isinstance(m, (nn.Conv2d)): n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels - m.weight.set_data(ms.Tensor(np.random.normal(0, np.sqrt(2. / n), + m.weight.set_data(mindspore.Tensor(np.random.normal(0, np.sqrt(2. / n), m.weight.data.shape).astype("float32"))) if m.bias is not None: m.bias.set_data( - ms.numpy.zeros(m.bias.data.shape, dtype="float32")) + mindspore.numpy.zeros(m.bias.data.shape, dtype="float32")) elif isinstance(m, nn.BatchNorm2d): m.gamma.set_data( - ms.Tensor(np.ones(m.gamma.data.shape, dtype="float32"))) + mindspore.Tensor(np.ones(m.gamma.data.shape, dtype="float32"))) m.beta.set_data( - ms.numpy.zeros(m.beta.data.shape, dtype="float32")) + mindspore.numpy.zeros(m.beta.data.shape, dtype="float32")) elif isinstance(m, nn.Dense): - m.weight.set_data(ms.Tensor(np.random.normal( + m.weight.set_data(mindspore.Tensor(np.random.normal( 0, 0.01, m.weight.data.shape).astype("float32"))) if m.bias is not None: - m.bias.set_data(ms.numpy.zeros(m.bias.data.shape, dtype="float32")) + m.bias.set_data(mindspore.numpy.zeros(m.bias.data.shape, dtype="float32")) def mobilenet_v3(model_name, **kwargs): diff --git a/official/cv/MobileNet/mobilenetv3/train.py b/official/cv/MobileNet/mobilenetv3/train.py index 20f51ba9a7c65681622360674e975a39315f68a9..32a4dd57ff92ec9f172824dbdfd8aca1f7408947 100644 --- a/official/cv/MobileNet/mobilenetv3/train.py +++ b/official/cv/MobileNet/mobilenetv3/train.py @@ -19,7 +19,7 @@ import argparse import ast import numpy as np -import mindspore as ms +import mindspore import mindspore.nn as nn import mindspore.ops as ops from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, Callback @@ -32,7 +32,7 @@ from src.config import config_gpu from src.config import config_cpu from src.mobilenetV3 import mobilenet_v3_large -ms.set_seed(1) +mindspore.set_seed(1) parser = argparse.ArgumentParser(description='Image classification') parser.add_argument('--dataset_path', type=str, default=None, help='Dataset path') @@ -42,16 +42,16 @@ parser.add_argument('--run_distribute', type=ast.literal_eval, default=False, 
he args_opt = parser.parse_args() if args_opt.device_target == "GPU": - ms.set_context(mode=ms.GRAPH_MODE, + mindspore.set_context(mode=0, device_target="GPU", save_graphs=False) if args_opt.run_distribute: init() - ms.set_auto_parallel_context(device_num=get_group_size(), - parallel_mode=ms.ParallelMode.DATA_PARALLEL, + mindspore.set_auto_parallel_context(device_num=get_group_size(), + parallel_mode=mindspore.ParallelMode.DATA_PARALLEL, gradients_mean=True) elif args_opt.device_target == "CPU": - ms.set_context(mode=ms.GRAPH_MODE, + mindspore.set_context(mode=0, device_target="CPU", save_graphs=False) else: @@ -76,14 +76,14 @@ class CrossEntropyWithLabelSmooth(nn.LossBase): def __init__(self, smooth_factor=0., num_classes=1000): super(CrossEntropyWithLabelSmooth, self).__init__() self.onehot = ops.OneHot() - self.on_value = ms.Tensor(1.0 - smooth_factor, ms.float32) - self.off_value = ms.Tensor(1.0 * smooth_factor / (num_classes - 1), ms.float32) + self.on_value = mindspore.Tensor(1.0 - smooth_factor, mindspore.float32) + self.off_value = mindspore.Tensor(1.0 * smooth_factor / (num_classes - 1), mindspore.float32) self.ce = nn.SoftmaxCrossEntropyWithLogits() self.mean = ops.ReduceMean(False) self.cast = ops.Cast() def construct(self, logit, label): - one_hot_label = self.onehot(self.cast(label, ms.int32), ops.shape(logit)[1], + one_hot_label = self.onehot(self.cast(label, mindspore.int32), ops.shape(logit)[1], self.on_value, self.off_value) out_loss = self.ce(logit, one_hot_label) out_loss = self.mean(out_loss, 0) @@ -101,7 +101,7 @@ class Monitor(Callback): None Examples: - >>> Monitor(100,lr_init=ms.Tensor([0.05]*100).asnumpy()) + >>> Monitor(100,lr_init=mindspore.Tensor([0.05]*100).asnumpy()) """ def __init__(self, lr_init=None): @@ -130,9 +130,9 @@ class Monitor(Callback): step_mseconds = (time.time() - self.step_time) * 1000 step_loss = cb_params.net_outputs - if isinstance(step_loss, (tuple, list)) and isinstance(step_loss[0], ms.Tensor): + if isinstance(step_loss, (tuple, list)) and isinstance(step_loss[0], mindspore.Tensor): step_loss = step_loss[0] - if isinstance(step_loss, ms.Tensor): + if isinstance(step_loss, mindspore.Tensor): step_loss = np.mean(step_loss.asnumpy()) self.losses.append(step_loss) @@ -148,7 +148,7 @@ if __name__ == '__main__': config_ = None if args_opt.device_target == "GPU": config_ = config_gpu - ms.set_context(enable_graph_kernel=True) + mindspore.set_context(enable_graph_kernel=True) elif args_opt.device_target == "CPU": config_ = config_cpu else: @@ -183,12 +183,12 @@ if __name__ == '__main__': step_size = dataset.get_dataset_size() # resume if args_opt.pre_trained: - param_dict = ms.load_checkpoint(args_opt.pre_trained) - ms.load_param_into_net(net, param_dict) + param_dict = mindspore.load_checkpoint(args_opt.pre_trained) + mindspore.load_param_into_net(net, param_dict) # define optimizer - loss_scale = ms.FixedLossScaleManager( + loss_scale = mindspore.FixedLossScaleManager( config_.loss_scale, drop_overflow_update=False) - lr = ms.Tensor(get_lr(global_step=0, + lr = mindspore.Tensor(get_lr(global_step=0, lr_init=0, lr_end=0, lr_max=config_.lr, @@ -198,7 +198,7 @@ if __name__ == '__main__': opt = nn.Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, config_.momentum, config_.weight_decay, config_.loss_scale) # define model - model = ms.Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale) + model = mindspore.Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale) cb = [Monitor(lr_init=lr.asnumpy())] 
if args_opt.run_distribute and args_opt.device_target != "CPU": diff --git a/official/cv/OCRNet/eval.py b/official/cv/OCRNet/eval.py index 08d0f7617f4470cc08d5ccaeca1357120461285f..312610065a67ae48fa2b8c9824ebfbdfaff09b22 100644 --- a/official/cv/OCRNet/eval.py +++ b/official/cv/OCRNet/eval.py @@ -18,7 +18,8 @@ import argparse import ast import numpy as np -from mindspore import context, DatasetHelper +import mindspore +from mindspore import DatasetHelper from mindspore import ops as P from mindspore.dataset import engine as de from mindspore.train.serialization import load_param_into_net, load_checkpoint @@ -113,7 +114,7 @@ def testval(dataset, helper, model, num_classes=19, ignore_label=255, scales=Non def main(): """Inference process.""" # Set context - context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target) + mindspore.set_context(mode=0, device_target=config.device_target) # Initialize network net = get_seg_model(config) param_dict = load_checkpoint(ckpt_file_name=config.checkpoint_path) diff --git a/official/cv/OCRNet/export.py b/official/cv/OCRNet/export.py index 7f0279a1165158cc3370b2b5128ac232453eff6b..cdd49101c5b8ea233eb13490548b4c33d7c89305 100644 --- a/official/cv/OCRNet/export.py +++ b/official/cv/OCRNet/export.py @@ -17,7 +17,8 @@ import argparse import numpy as np -from mindspore import Tensor, context, load_checkpoint, load_param_into_net, export +import mindspore +from mindspore import Tensor, load_checkpoint, load_param_into_net, export from src.config import config_hrnetv2_w48 as config from src.seg_hrnet_ocr import get_seg_model @@ -35,7 +36,7 @@ def main(): args = parser.parse_args() - context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, device_id=args.device_id) + mindspore.set_context(mode=0, device_target=args.device_target, device_id=args.device_id) net = get_seg_model(config) params_dict = load_checkpoint(args.checkpoint_file) diff --git a/official/cv/OCRNet/src/basedataset.py b/official/cv/OCRNet/src/basedataset.py index 104ee830f131389cf35e7e5b033cbd1df131b8ec..cb676d79336365f1dfe97f1c906640925deee33e 100644 --- a/official/cv/OCRNet/src/basedataset.py +++ b/official/cv/OCRNet/src/basedataset.py @@ -89,14 +89,14 @@ class BaseDataset: def multi_scale_aug(self, image, label=None, rand_scale=1, rand_crop=True): """Augment feature into different scales.""" - long_size = np.int(self.base_size * rand_scale + 0.5) + long_size = np.int_(self.base_size * rand_scale + 0.5) h, w, _ = image.shape if h > w: new_h = long_size - new_w = np.int(w * long_size / h + 0.5) + new_w = np.int_(w * long_size / h + 0.5) else: new_w = long_size - new_h = np.int(h * long_size / w + 0.5) + new_h = np.int_(h * long_size / w + 0.5) image = cv2.resize(image, (new_w, new_h), interpolation=cv2.INTER_LINEAR) # image = cv2.resize(image, (new_w, new_h), interpolation=cv2.INTER_NEAREST) @@ -156,8 +156,8 @@ class BaseDataset: batch, _, ori_height, ori_width = image.shape assert batch == 1, "only supporting batchsize 1." 
image = image.asnumpy()[0].transpose((1, 2, 0)).copy() - stride_h = np.int(self.crop_size[0] * 2.0 / 3.0) - stride_w = np.int(self.crop_size[1] * 2.0 / 3.0) + stride_h = np.int_(self.crop_size[0] * 2.0 / 3.0) + stride_w = np.int_(self.crop_size[1] * 2.0 / 3.0) final_pred = Tensor(np.zeros([1, self.num_classes, ori_height, ori_width]), dtype=dtype.float32) padvalue = -1.0 * np.array(self.mean) / np.array(self.std) @@ -178,10 +178,10 @@ class BaseDataset: new_img = self.pad_image(new_img, height, width, self.crop_size, padvalue) new_h, new_w = new_img.shape[:-1] - rows = np.int(np.ceil(1.0 * (new_h - - self.crop_size[0]) / stride_h)) + 1 - cols = np.int(np.ceil(1.0 * (new_w - - self.crop_size[1]) / stride_w)) + 1 + rows = np.int_(np.ceil(1.0 * (new_h - + self.crop_size[0]) / stride_h)) + 1 + cols = np.int_(np.ceil(1.0 * (new_w - + self.crop_size[1]) / stride_w)) + 1 preds = Tensor(np.zeros([1, self.num_classes, new_h, new_w]), dtype=dtype.float32) count = Tensor(np.zeros([1, 1, new_h, new_w]), dtype=dtype.float32) diff --git a/official/cv/OCRNet/src/cityscapes.py b/official/cv/OCRNet/src/cityscapes.py index 3dbe7a51944fad9b74a09a53a9a6f25a5ddb92a6..18b3a335d3f71c697103130e34ee4a92208fceb3 100644 --- a/official/cv/OCRNet/src/cityscapes.py +++ b/official/cv/OCRNet/src/cityscapes.py @@ -118,8 +118,8 @@ class Cityscapes(BaseDataset): batch, _, ori_height, ori_width = image.shape assert batch == 1, "only supporting batchsize 1." image = image.asnumpy()[0].transpose((1, 2, 0)).copy() - stride_h = np.int(self.crop_size[0] * 1.0) - stride_w = np.int(self.crop_size[1] * 1.0) + stride_h = np.int_(self.crop_size[0] * 1.0) + stride_w = np.int_(self.crop_size[1] * 1.0) final_pred = Tensor(np.zeros([1, self.num_classes, ori_height, ori_width]), dtype=dtype.float32) for scale in scales: @@ -137,10 +137,10 @@ class Cityscapes(BaseDataset): preds = preds[:, :, 0:height, 0:width] else: new_h, new_w = new_img.shape[:-1] - rows = np.int(np.ceil(1.0 * (new_h - - self.crop_size[0]) / stride_h)) + 1 - cols = np.int(np.ceil(1.0 * (new_w - - self.crop_size[1]) / stride_w)) + 1 + rows = np.int_(np.ceil(1.0 * (new_h - + self.crop_size[0]) / stride_h)) + 1 + cols = np.int_(np.ceil(1.0 * (new_w - + self.crop_size[1]) / stride_w)) + 1 preds = np.zeros([1, self.num_classes, new_h, new_w]).astype(np.float32) count = np.zeros([1, 1, new_h, new_w]).astype(np.float32) diff --git a/official/cv/OCRNet/src/model_utils/moxing_adapter.py b/official/cv/OCRNet/src/model_utils/moxing_adapter.py index 08a6797de103a5949f2ed7a62f19a8a4803cbe02..33381866d6105655024d52f11dea206d9945dd85 100644 --- a/official/cv/OCRNet/src/model_utils/moxing_adapter.py +++ b/official/cv/OCRNet/src/model_utils/moxing_adapter.py @@ -16,7 +16,7 @@ """Moxing adapter for ModelArts""" import os import functools -from mindspore import context +import mindspore from src.config import show_config @@ -101,7 +101,7 @@ def moxing_wrapper(config, pre_process=None, post_process=None): sync_data(config.eval_data_url, config.eval_data_path) print("Workspace downloaded: ", os.listdir(config.eval_data_path)) - context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) + mindspore.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) config.device_num = get_device_num() config.device_id = get_device_id() if not os.path.exists(config.output_path): diff --git a/official/cv/OCRNet/src/seg_hrnet.py b/official/cv/OCRNet/src/seg_hrnet.py index 
49d3740abfc3d438b5cc02a9ca93224984b8decc..3fbcc063270ef2fc27fac9cd3f6e7e751cdaed9f 100644 --- a/official/cv/OCRNet/src/seg_hrnet.py +++ b/official/cv/OCRNet/src/seg_hrnet.py @@ -340,7 +340,7 @@ class HighResolutionNet(nn.Cell): self.stage4, pre_stage_channels = self._make_stage( self.stage4_cfg, num_channels, multi_scale_output=True) - last_inp_channels = np.int(np.sum(pre_stage_channels)) + last_inp_channels = np.int_(np.sum(pre_stage_channels)) self.last_layer = nn.SequentialCell([ diff --git a/official/cv/OCRNet/src/seg_hrnet_ocr.py b/official/cv/OCRNet/src/seg_hrnet_ocr.py index 6cbd664a71daab6758c18d4cb5e2a5fd7c43859a..275e623f98c8617360c73df338562a880ae25228 100644 --- a/official/cv/OCRNet/src/seg_hrnet_ocr.py +++ b/official/cv/OCRNet/src/seg_hrnet_ocr.py @@ -565,7 +565,7 @@ class HighResolutionNet(nn.Cell): self.stage4, pre_stage_channels = self._make_stage( self.stage4_cfg, num_channels, multi_scale_output=True) - last_inp_channels = np.int(np.sum(pre_stage_channels)) + last_inp_channels = np.int_(np.sum(pre_stage_channels)) ocr_mid_channels = config.model.ocr.mid_channels ocr_key_channels = config.model.ocr.key_channels diff --git a/official/cv/OCRNet/train.py b/official/cv/OCRNet/train.py index df6b6f37f687cc631c2da89a3e036db74e3a420b..1697afe8494ccb95259b8fb36c791f0e9914dbea 100644 --- a/official/cv/OCRNet/train.py +++ b/official/cv/OCRNet/train.py @@ -19,11 +19,12 @@ import ast import os import numpy as np -from mindspore import context, Model +import mindspore +from mindspore import Model from mindspore import dataset as de from mindspore.common import set_seed from mindspore.communication.management import init, get_rank, get_group_size -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.nn import SGD from mindspore.train.callback import LossMonitor, TimeMonitor, ModelCheckpoint, CheckpointConfig from mindspore.train.loss_scale_manager import FixedLossScaleManager @@ -109,13 +110,13 @@ def parse_args(): @moxing_wrapper(config) def main(): """Training process.""" - context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target) + mindspore.set_context(mode=0, device_target=config.device_target) if config.run_distribute: init() device_id = int(os.getenv("DEVICE_ID")) if config.device_target == "Ascend" else get_rank() device_num = int(os.getenv("RANK_SIZE")) if config.device_target == "Ascend" else get_group_size() parallel_mode = ParallelMode.DATA_PARALLEL - context.set_auto_parallel_context(parallel_mode=parallel_mode, + mindspore.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=device_num) else: diff --git a/official/cv/OpenPose/eval.py b/official/cv/OpenPose/eval.py index 081ed9b30aca07fbcfc70e83c0d87e6ca2c01de4..2226d720e47d89c1965c096d904b915d0f5d0146 100644 --- a/official/cv/OpenPose/eval.py +++ b/official/cv/OpenPose/eval.py @@ -23,7 +23,8 @@ from scipy.ndimage.filters import gaussian_filter from tqdm import tqdm from pycocotools.coco import COCO as LoadAnn from pycocotools.cocoeval import COCOeval as MapEval -from mindspore import context, Tensor +import mindspore +from mindspore import Tensor from mindspore.train.serialization import load_checkpoint, load_param_into_net from mindspore.communication.management import init from mindspore.common import dtype as mstype @@ -36,8 +37,8 @@ from src.model_utils.device_adapter import get_device_id, get_rank_id, get_devic warnings.filterwarnings("ignore") devid = get_device_id() -context.set_context(mode=context.GRAPH_MODE, - 
device_target=config.device_target, save_graphs=False, device_id=devid) +mindspore.set_context(mode=0, device_target=config.device_target, save_graphs=False, device_id=devid, + jit_config={"jit_level": "O2"}) show_gt = 0 diff --git a/official/cv/OpenPose/export.py b/official/cv/OpenPose/export.py index 5e83cbe8acdbea3316f7af8a9a4efbf2af82d5db..5941341bf4fed782039b50546564432124588338 100644 --- a/official/cv/OpenPose/export.py +++ b/official/cv/OpenPose/export.py @@ -15,15 +15,15 @@ """export""" import os import numpy as np +import mindspore from mindspore import Tensor -from mindspore import context from mindspore.train.serialization import load_checkpoint, load_param_into_net, export from src.openposenet import OpenPoseNet from src.model_utils.config import config from src.model_utils.moxing_adapter import moxing_wrapper -context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, device_id=config.device_id) +mindspore.set_context(mode=0, device_target=config.device_target, device_id=config.device_id) def modelarts_pre_process(): @@ -32,7 +32,7 @@ def modelarts_pre_process(): @moxing_wrapper(pre_process=None) def model_export(): - context.set_context(mode=context.GRAPH_MODE, save_graphs=False) + mindspore.set_context(mode=0, save_graphs=False) # define net net = OpenPoseNet() diff --git a/official/cv/OpenPose/modelarts/train_start.py b/official/cv/OpenPose/modelarts/train_start.py index 282d111aa47b111e35f16fa964919249981c16d2..58bd855a6b746ef133402c9f7cad94844ffcc191 100644 --- a/official/cv/OpenPose/modelarts/train_start.py +++ b/official/cv/OpenPose/modelarts/train_start.py @@ -17,11 +17,11 @@ import os import argparse import glob from ast import literal_eval as liter +import mindspore from mindspore import Tensor -from mindspore import context from mindspore import export from mindspore.common import set_seed -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.communication.management import init from mindspore.train import Model from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor @@ -103,7 +103,7 @@ if __name__ == "__main__": if args_opt.vgg_path: config.vgg_path = os.path.join("/cache/data/", args_opt.vgg_path) - context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", save_graphs=False) + mindspore.set_context(mode=0, device_target="Ascend", save_graphs=False) config.lr = liter(config.lr) config.outputs_dir = config.save_model_path @@ -111,7 +111,7 @@ if __name__ == "__main__": if device_num > 1: init() - context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, + mindspore.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True) config.rank = get_rank_id() config.outputs_dir = os.path.join(config.outputs_dir, "ckpt_{}/".format(config.rank)) diff --git a/benchmark/ascend/resnet/scripts/run_infer.sh b/official/cv/OpenPose/scripts/run_distribute_train_msrun.sh similarity index 42% rename from benchmark/ascend/resnet/scripts/run_infer.sh rename to official/cv/OpenPose/scripts/run_distribute_train_msrun.sh index b73e956c18ac711d986c5a25905ad3477f53f803..6c1ba6e91e89c097e9663d7acb70dd38dd099acb 100644 --- a/benchmark/ascend/resnet/scripts/run_infer.sh +++ b/official/cv/OpenPose/scripts/run_distribute_train_msrun.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright 2021 Huawei Technologies Co., Ltd +# Copyright 2024 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 
(the "License");
 # you may not use this file except in compliance with the License.
@@ -13,55 +13,19 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ============================================================================
-
-if [ $# != 3 ]
+if [ $# != 4 ]
 then
-    echo "Usage: bash run_eval.sh [DATASET_PATH] [CHECKPOINT_PATH] [CONFIG_PATH]"
-exit 1
-fi
-
-get_real_path(){
-    if [ "${1:0:1}" == "/" ]; then
-        echo "$1"
-    else
-        echo "$(realpath -m $PWD/$1)"
-    fi
-}
-
-PATH1=$(get_real_path $1)
-PATH2=$(get_real_path $2)
-CONFIG_FILE=$(get_real_path $3)
-
-
-if [ ! -d $PATH1 ]
-then
-    echo "error: DATASET_PATH=$PATH1 is not a directory"
-exit 1
-fi
-
-if [ ! -f $PATH2 ]
-then
-    echo "error: CHECKPOINT_PATH=$PATH2 is not a file"
+    echo "Usage: bash scripts/run_distribute_train_msrun.sh [IMAGEPATH_TRAIN] [JSONPATH_TRAIN] [MASKPATH_TRAIN] [VGG_PATH]"
 exit 1
 fi
+export DEVICE_NUM=8
+export RANK_SIZE=8
 ulimit -u unlimited
-export DEVICE_NUM=1
-export DEVICE_ID=0
-export RANK_SIZE=$DEVICE_NUM
-export RANK_ID=0
-
-if [ -d "infer" ];
-then
-    rm -rf ./infer
-fi
-mkdir ./infer
-cp ../config/*.yaml ./infer
-cp ../*.py ./infer
-cp *.sh ./infer
-cp -r ../src ./infer
-cd ./infer || exit
 env > env.log
-echo "start evaluation for device $DEVICE_ID"
-python infer.py --data_path=$PATH1 --checkpoint_file_path=$PATH2 --config_path=$CONFIG_FILE &> log &
-cd ..
+
+echo "start training"
+msrun --bind_core=True --worker_num=8 --local_worker_num=8 --master_port 8118 \
+    --log_dir=msrun_log --join=True --cluster_time_out=300 \
+    train.py --imgpath_train=$1 --jsonpath_train=$2 --maskpath_train=$3 --vgg_path=$4 &> log.txt &
+
diff --git a/official/cv/OpenPose/src/loss.py b/official/cv/OpenPose/src/loss.py
index e10effb0c62a671a4793c02afabfb88c454c616f..8de7bab201b77eb2025f76adbc9dc422d44c994e 100644
--- a/official/cv/OpenPose/src/loss.py
+++ b/official/cv/OpenPose/src/loss.py
@@ -12,19 +12,19 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
# ============================================================================ +import mindspore import mindspore.nn as nn import mindspore.common.dtype as mstype from mindspore.ops import operations as P from mindspore.nn.cell import Cell from mindspore.ops import functional as F from mindspore.ops import composite as C -from mindspore.context import ParallelMode, get_auto_parallel_context +from mindspore import ParallelMode, get_auto_parallel_context from mindspore.communication.management import get_group_size -from mindspore import context from mindspore.nn.wrap.grad_reducer import DistributedGradReducer from src.model_utils.config import config -context.set_context(mode=context.GRAPH_MODE) +mindspore.set_context(mode=0) time_stamp_init = False time_stamp_first = 0 grad_scale = C.MultitypeFuncGraph("grad_scale") diff --git a/official/cv/OpenPose/src/model_utils/moxing_adapter.py b/official/cv/OpenPose/src/model_utils/moxing_adapter.py index c2d2282402b6a2950af74b66f282550aac75cb14..344dfc034e1e553b2b5da61517cdc4b179d34b1a 100644 --- a/official/cv/OpenPose/src/model_utils/moxing_adapter.py +++ b/official/cv/OpenPose/src/model_utils/moxing_adapter.py @@ -17,7 +17,7 @@ import os import functools -from mindspore import context +import mindspore from .config import config @@ -101,7 +101,7 @@ def moxing_wrapper(pre_process=None, post_process=None): sync_data(config.train_url, config.output_path) print('Workspace downloaded: ', os.listdir(config.output_path)) - context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) + mindspore.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) config.device_num = get_device_num() config.device_id = get_device_id() if not os.path.exists(config.output_path): diff --git a/official/cv/OpenPose/src/openposenet.py b/official/cv/OpenPose/src/openposenet.py index a3ea966afde6074aeb441d4dc397071074ef0f5e..396d37e489a5eb6d06bdb441183fc7275a8ca009 100644 --- a/official/cv/OpenPose/src/openposenet.py +++ b/official/cv/OpenPose/src/openposenet.py @@ -12,13 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================ +import mindspore import mindspore.nn as nn from mindspore.nn import Conv2d, ReLU from mindspore.train.serialization import load_checkpoint, load_param_into_net from mindspore.ops import operations as P -from mindspore import context -context.set_context(mode=context.GRAPH_MODE) +mindspore.set_context(mode=0) time_stamp_init = False time_stamp_first = 0 diff --git a/official/cv/OpenPose/train.py b/official/cv/OpenPose/train.py index 8b8f56edf23f9a9ad8c99718e38da0b42b91a3c2..59b4c257e81907002b437a7e1f2486fbe856bbac 100644 --- a/official/cv/OpenPose/train.py +++ b/official/cv/OpenPose/train.py @@ -15,8 +15,8 @@ import os from ast import literal_eval as liter import mindspore -from mindspore import context -from mindspore.context import ParallelMode +from mindspore import set_seed +from mindspore import ParallelMode from mindspore.communication.management import init from mindspore.train import Model from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor @@ -31,8 +31,8 @@ from src.model_utils.config import config from src.model_utils.device_adapter import get_rank_id, get_device_num -mindspore.common.seed.set_seed(1) -context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", save_graphs=False) +set_seed(1) +mindspore.set_context(mode=0, device_target="Ascend", save_graphs=False, jit_config={"jit_level": "O2"}) def modelarts_pre_process(): @@ -48,8 +48,8 @@ def train(): if device_num > 1: init() - context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, - gradients_mean=True) + mindspore.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, + gradients_mean=True) config.rank = get_rank_id() config.outputs_dir = os.path.join(config.outputs_dir, "ckpt_{}/".format(config.rank)) else: diff --git a/official/cv/PVNet/eval.py b/official/cv/PVNet/eval.py index a9b2088f3285a0d15fb761391d4af347afaeabc9..5c06ec08a431657b915e2356af359bbf4fb3f43c 100644 --- a/official/cv/PVNet/eval.py +++ b/official/cv/PVNet/eval.py @@ -20,7 +20,6 @@ import numpy as np import mindspore import mindspore.dataset.transforms as C import mindspore.dataset.vision as V -from mindspore import context from model_utils.config import config as cfg from model_utils.data_file_utils import read_pickle, read_rgb_np @@ -38,7 +37,7 @@ def test(args): assert seg_dim == 2 # set graph mode and parallel mode - context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, device_id=args.rank) + mindspore.set_context(mode=0, device_target=args.device_target, device_id=args.rank) # load model parameters net = Resnet18_8s(ver_dim=args.vote_num * 2) diff --git a/official/cv/PVNet/export.py b/official/cv/PVNet/export.py index 6b2d40416e1a2c8048e5091274560bbdaa398d48..732b49a015f25f1e33cda5b28c5d693c005a993f 100644 --- a/official/cv/PVNet/export.py +++ b/official/cv/PVNet/export.py @@ -14,21 +14,21 @@ # ============================================================================ """export to mindir""" import numpy as np -import mindspore as ms -from mindspore import context, Tensor +import mindspore +from mindspore import Tensor from src.model_reposity import Resnet18_8s from model_utils.config import config as cfg -context.set_context(mode=context.GRAPH_MODE, device_target=cfg.device_target) +mindspore.set_context(mode=0, device_target=cfg.device_target) if cfg.device_target == "Ascend": - context.set_context(device_id=cfg.rank) + 
mindspore.set_context(device_id=cfg.rank) if __name__ == "__main__": net = Resnet18_8s(ver_dim=cfg.vote_num * 2) - param_dict = ms.load_checkpoint(cfg.ckpt_file) - ms.load_param_into_net(net, param_dict) + param_dict = mindspore.load_checkpoint(cfg.ckpt_file) + mindspore.load_param_into_net(net, param_dict) net.set_train(False) - input_data = Tensor(np.zeros([1, 3, cfg.img_height, cfg.img_width]), ms.float32) - ms.export(net, input_data, file_name=cfg.file_name, file_format=cfg.file_format) + input_data = Tensor(np.zeros([1, 3, cfg.img_height, cfg.img_width]), mindspore.float32) + mindspore.export(net, input_data, file_name=cfg.file_name, file_format=cfg.file_format) diff --git a/official/cv/PVNet/modelarts/start_train.py b/official/cv/PVNet/modelarts/start_train.py index 1d18a9b001286bb33e8719664b2abdb479b857d2..77e43e9d16b920ed1e86ab3aaf918db391eeef99 100644 --- a/official/cv/PVNet/modelarts/start_train.py +++ b/official/cv/PVNet/modelarts/start_train.py @@ -22,7 +22,6 @@ import numpy as np import mindspore -import mindspore.context as context from mindspore import Tensor from mindspore import nn from mindspore.communication import get_rank, init, get_group_size @@ -56,7 +55,7 @@ def export_AIR(args_opt): print("checkpoint path", ckpt_model) # if args.device_target == "Ascend": - # context.set_context(device_id=args.rank) + # mindspore.set_context(device_id=args.rank) net = Resnet18_8s(ver_dim=args.vote_num * 2) param_dict = mindspore.load_checkpoint(ckpt_model) mindspore.load_param_into_net(net, param_dict) @@ -172,7 +171,7 @@ class Train: def network_init(argvs): """ init distribute training """ - context.set_context(mode=context.GRAPH_MODE, + mindspore.set_context(mode=0, device_target=argvs.device_target, save_graphs=False, device_id=int(os.getenv('DEVICE_ID', '0')), @@ -182,9 +181,9 @@ def network_init(argvs): init() argvs.rank = get_rank() argvs.group_size = get_group_size() - context.reset_auto_parallel_context() - parallel_mode = context.ParallelMode.DATA_PARALLEL - context.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=argvs.group_size) + mindspore.reset_auto_parallel_context() + parallel_mode = mindspore.ParallelMode.DATA_PARALLEL + mindspore.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=argvs.group_size) def parse_args(): diff --git a/official/cv/PVNet/src/loss_scale.py b/official/cv/PVNet/src/loss_scale.py index 3e5dfcfe3097022806c211559e503652c90d6783..b6bba92271940d88c62975ab5344f0667010ae80 100644 --- a/official/cv/PVNet/src/loss_scale.py +++ b/official/cv/PVNet/src/loss_scale.py @@ -13,8 +13,8 @@ # limitations under the License. 
# ============================================================================ """dynamic loss scale """ -import mindspore.context as context -from mindspore.context import ParallelMode +import mindspore +from mindspore import ParallelMode from mindspore import nn from mindspore.nn import Cell from mindspore import Tensor, RowTensor @@ -248,7 +248,7 @@ class TrainOneStepWithLossScaleCell(nn.TrainOneStepWithLossScaleCell): self.less_equal = P.LessEqual() self.allreduce = P.AllReduce() self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE) - self.gpu_target = (context.get_context("device_target") == "GPU") + self.gpu_target = (mindspore.get_context("device_target") == "GPU") self.loss_scaling_manager = None self.clip_gradients = ClipGradients() if isinstance(scale_sense, Cell): diff --git a/official/cv/PVNet/train.py b/official/cv/PVNet/train.py index 65d0d7c7e088431677aee30a7d7b762eb2b5b0b2..c23fc031d9b4d12c077b11e00429669d11aba911 100644 --- a/official/cv/PVNet/train.py +++ b/official/cv/PVNet/train.py @@ -17,7 +17,6 @@ import os import time import mindspore -import mindspore.context as context from mindspore import Tensor from mindspore import nn from mindspore.communication import get_rank, init, get_group_size @@ -140,7 +139,7 @@ class Train: def network_init(argvs): """ init distribute training """ - context.set_context(mode=context.GRAPH_MODE, + mindspore.set_context(mode=0, device_target=argvs.device_target, save_graphs=False, device_id=int(os.getenv('DEVICE_ID', '0')), @@ -150,9 +149,9 @@ def network_init(argvs): init() argvs.rank = get_rank() argvs.group_size = get_group_size() - context.reset_auto_parallel_context() - parallel_mode = context.ParallelMode.DATA_PARALLEL - context.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=argvs.group_size) + mindspore.reset_auto_parallel_context() + parallel_mode = mindspore.ParallelMode.DATA_PARALLEL + mindspore.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=argvs.group_size) if __name__ == '__main__': diff --git a/official/cv/PointNet/eval.py b/official/cv/PointNet/eval.py index 67eb612be42f0e6261b300af2b581344c448efdb..358d9b08618f4ac75158e01510362ef3cd281227 100644 --- a/official/cv/PointNet/eval.py +++ b/official/cv/PointNet/eval.py @@ -20,7 +20,7 @@ import random import math import numpy as np import mindspore -from mindspore import load_checkpoint, load_param_into_net, context +from mindspore import load_checkpoint, load_param_into_net import mindspore.dataset as ds import mindspore.ops as ops from src.dataset import ShapeNetDataset @@ -100,18 +100,18 @@ if __name__ == "__main__": local_data_url = './cache/data' local_train_url = './cache/ckpt' device_target = args.device_target - context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target) - context.set_context(save_graphs=False) + mindspore.set_context(mode=0, device_target=args.device_target) + mindspore.set_context(save_graphs=False) if device_target == "Ascend": - context.set_context(device_id=device_id) + mindspore.set_context(device_id=device_id) else: raise ValueError("Unsupported platform.") import moxing as mox mox.file.copy_parallel(src_url=args.data_url, dst_url=local_data_url) else: - context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target) - context.set_context(save_graphs=False) + mindspore.set_context(mode=0, device_target=args.device_target) + mindspore.set_context(save_graphs=False) if not os.path.exists(local_train_url): 
os.makedirs(local_train_url) diff --git a/official/cv/PointNet/preprocess.py b/official/cv/PointNet/preprocess.py index a689e2171ae073917a48ceaf63dca541a48dc17b..be42b4b7e044e61929f9791edeac62cc05732058 100644 --- a/official/cv/PointNet/preprocess.py +++ b/official/cv/PointNet/preprocess.py @@ -15,7 +15,7 @@ """pre process for 310 inference""" import os import argparse -from mindspore import context +import mindspore import mindspore.dataset as ds import numpy as np from src.dataset import ShapeNetDataset @@ -31,7 +31,7 @@ parser.add_argument( '--batchSize', type=int, default=1, help='input batch size') args = parser.parse_args() -context.set_context(mode=context.PYNATIVE_MODE, device_target="Ascend", device_id=args.device_id) +mindspore.set_context(mode=1, device_target="Ascend", device_id=args.device_id) if __name__ == '__main__': dataset_generator = ShapeNetDataset( root=args.dataset_path, diff --git a/official/cv/PointNet/src/export.py b/official/cv/PointNet/src/export.py index cd4da5c6e43c4b2ae80dbc570f66ca9dbd227e22..5a3c39662c748e5e3a0a9b6823117792d8727b62 100644 --- a/official/cv/PointNet/src/export.py +++ b/official/cv/PointNet/src/export.py @@ -16,7 +16,8 @@ import os import argparse import numpy as np -from mindspore import Tensor, export, load_checkpoint, context +import mindspore +from mindspore import Tensor, export, load_checkpoint from src.network import PointNetDenseCls parser = argparse.ArgumentParser(description='MindSpore Pointnet Segmentation') parser.add_argument( @@ -28,7 +29,7 @@ parser.add_argument('--file_format', type=str, default='MINDIR', help="export fi parser.add_argument('--feature_transform', action='store_true', help="use feature transform") args = parser.parse_args() -context.set_context(mode=context.PYNATIVE_MODE, device_target=args.device_target) +mindspore.set_context(mode=1, device_target=args.device_target) num_classes = 4 classifier = PointNetDenseCls(k=num_classes, feature_transform=args.feature_transform) if not os.path.exists('./mindir'): diff --git a/official/cv/PointNet/src/preprocess.py b/official/cv/PointNet/src/preprocess.py index d7c0ffd30e6c1646db512966a6d5dd66ea1b8cbb..ab2ca7222712a3c2cdb0b458ef4c4ac4666c9612 100644 --- a/official/cv/PointNet/src/preprocess.py +++ b/official/cv/PointNet/src/preprocess.py @@ -15,7 +15,7 @@ """pre process for 310 inference""" import os import argparse -from mindspore import context +import mindspore import mindspore.dataset as ds import numpy as np from src.dataset import ShapeNetDataset @@ -32,7 +32,7 @@ parser.add_argument( '--batchSize', type=int, default=1, help='input batch size') args = parser.parse_args() -context.set_context(mode=context.PYNATIVE_MODE, device_target="Ascend", device_id=args.device_id) +mindspore.set_context(mode=1, device_target="Ascend", device_id=args.device_id) if __name__ == '__main__': dataset_generator = ShapeNetDataset( root=args.dataset_path, diff --git a/official/cv/PointNet/train.py b/official/cv/PointNet/train.py index 35472fbe1725f6fb3f38f8ce62a250d58873b645..9420ab993a9fe0f139ec343e9fc228e65c3a3ebc 100644 --- a/official/cv/PointNet/train.py +++ b/official/cv/PointNet/train.py @@ -21,8 +21,7 @@ import math import numpy as np import mindspore import mindspore.nn as nn -from mindspore import context -from mindspore.context import ParallelMode +from mindspore import ParallelMode import mindspore.dataset as ds from mindspore import save_checkpoint import mindspore.ops as ops @@ -128,14 +127,14 @@ if __name__ == "__main__": device_target = args.device_target num_shards = 
int(os.getenv("RANK_SIZE")) shard_id = int(os.getenv("DEVICE_ID")) - context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target) - context.set_context(save_graphs=False) + mindspore.set_context(mode=0, device_target=args.device_target) + mindspore.set_context(save_graphs=False) if device_target == "Ascend": - context.set_context(device_id=device_id) + mindspore.set_context(device_id=device_id) if device_num > 1: args.learning_rate *= 2 - context.reset_auto_parallel_context() - context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, + mindspore.reset_auto_parallel_context() + mindspore.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True) init() local_data_url = os.path.join(local_data_url, str(device_id)) @@ -147,15 +146,15 @@ if __name__ == "__main__": mox.file.copy_parallel(src_url=args.data_url, dst_url=local_data_url) else: # run on the local server - context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target) - context.set_context(save_graphs=False) + mindspore.set_context(mode=0, device_target=args.device_target) + mindspore.set_context(save_graphs=False) if args.device_target == "GPU": - context.set_context(enable_graph_kernel=True) + mindspore.set_context(enable_graph_kernel=True) if device_num > 1: args.learning_rate = args.learning_rate * 2 - context.reset_auto_parallel_context() - context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, + mindspore.reset_auto_parallel_context() + mindspore.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True) init() shard_id = get_rank() @@ -187,7 +186,7 @@ if __name__ == "__main__": num_classes = dataset_generator.num_seg_classes classifier = PointNetDenseCls(k=num_classes, feature_transform=args.feature_transform) classifier.set_train(True) - if context.get_context("device_target") == "Ascend": + if mindspore.get_context("device_target") == "Ascend": classifier.to_float(mindspore.float16) for _, cell in classifier.cells_and_names(): if isinstance(cell, nn.LogSoftmax): diff --git a/official/cv/PointNet2/eval.py b/official/cv/PointNet2/eval.py index e70bc48978564aefb50e697e5dc1ab6000a5255a..efaffbf5d3f6fb3e9bb307e60eca78b35cdcea03 100644 --- a/official/cv/PointNet2/eval.py +++ b/official/cv/PointNet2/eval.py @@ -19,8 +19,8 @@ import ast import os import time +import mindspore import mindspore.dataset as ds -from mindspore import context from mindspore.nn.metrics import Accuracy from mindspore.train import Model from mindspore.train.serialization import load_checkpoint, load_param_into_net @@ -70,10 +70,10 @@ def run_eval(): pretrained_ckpt_path = args.pretrained_ckpt if args.platform == "Ascend": - context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=device_id) - context.set_context(max_call_depth=2048) + mindspore.set_context(mode=0, device_target="Ascend", device_id=device_id) + mindspore.set_context(max_call_depth=2048) else: - context.set_context(mode=context.GRAPH_MODE, device_target="GPU", max_call_depth=2000, device_id=device_id) + mindspore.set_context(mode=0, device_target="GPU", max_call_depth=2000, device_id=device_id) print(args) diff --git a/official/cv/PointNet2/export.py b/official/cv/PointNet2/export.py index b1f9d7166005bd9a3c723e71890d0e7c416ee578..510072c339e3bd2295275582fe79d3e840e81005 100644 --- a/official/cv/PointNet2/export.py +++ 
b/official/cv/PointNet2/export.py @@ -21,8 +21,9 @@ import ast import os import numpy as np +import mindspore import mindspore.common.dtype as mstype -from mindspore import Tensor, load_checkpoint, load_param_into_net, export, context +from mindspore import Tensor, load_checkpoint, load_param_into_net, export from src.pointnet2 import PointNet2 @@ -38,9 +39,9 @@ parser.add_argument('--num_category', default=40, type=int, choices=[10, 40], he parser.add_argument('--use_normals', action='store_true', default=False, help='use normals') # channels = 6 if true args = parser.parse_args() -context.set_context(mode=context.GRAPH_MODE, device_target="Ascend") -context.set_context(device_id=int(os.getenv('DEVICE_ID', '0'))) -context.set_context(max_call_depth=2048) +mindspore.set_context(mode=0, device_target="Ascend") +mindspore.set_context(device_id=int(os.getenv('DEVICE_ID', '0'))) +mindspore.set_context(max_call_depth=2048) if args.enable_modelarts: import moxing as mox diff --git a/official/cv/PointNet2/src/pointnet2_utils.py b/official/cv/PointNet2/src/pointnet2_utils.py index 828a00abfee15154cd6484d0cb33ff33029ae143..6fdddc0601a91fb47f4ece689102f75a2ba07fb8 100644 --- a/official/cv/PointNet2/src/pointnet2_utils.py +++ b/official/cv/PointNet2/src/pointnet2_utils.py @@ -15,7 +15,7 @@ """network definition utils""" import numpy as np -import mindspore as ms +import mindspore import mindspore.nn as nn import mindspore.numpy as mnp import mindspore.ops as P @@ -28,14 +28,14 @@ from src.layers import Conv2d @constexpr def generate_tensor_fps(B, N): """generate tensor""" - farthest = Tensor(np.random.randint(N, size=(B,)), ms.int32) + farthest = Tensor(np.random.randint(N, size=(B,)), mindspore.int32) return farthest @constexpr def generate_tensor_batch_indices(B): """generate tensor""" - return Tensor(np.arange(B), ms.int32) + return Tensor(np.arange(B), mindspore.int32) def square_distance(src, dst): @@ -91,16 +91,16 @@ def farthest_point_sample(xyz, npoint): centroids: sampled pointcloud index, [B, npoint] """ B, N, _ = xyz.shape - centroids = mnp.zeros((npoint, B), ms.int32) - distance = mnp.ones((B, N), ms.int32) * 1e9 + centroids = mnp.zeros((npoint, B), mindspore.int32) + distance = mnp.ones((B, N), mindspore.int32) * 1e9 farthest = generate_tensor_fps(B, N) batch_indices = generate_tensor_batch_indices(B) for i in range(npoint): - centroids = P.Cast()(centroids, ms.float32) - farthest = P.Cast()(farthest, ms.float32) + centroids = P.Cast()(centroids, mindspore.float32) + farthest = P.Cast()(farthest, mindspore.float32) centroids[i] = farthest - centroids = P.Cast()(centroids, ms.int32) - farthest = P.Cast()(farthest, ms.int32) + centroids = P.Cast()(centroids, mindspore.int32) + farthest = P.Cast()(farthest, mindspore.int32) index = P.Concat(-1)((batch_indices.reshape(batch_indices.shape + (1,)), farthest.reshape(farthest.shape + (1,)))) centroid = P.GatherNd()(xyz, index).reshape((B, 1, 3)) @@ -122,15 +122,15 @@ def query_ball_point(radius, nsample, xyz, new_xyz): """ B, N, _ = xyz.shape _, S, _ = new_xyz.shape - group_idx = mnp.arange(0, N, 1, ms.int32).view(1, 1, N) + group_idx = mnp.arange(0, N, 1, mindspore.int32).view(1, 1, N) group_idx = P.Tile()(group_idx, (B, S, 1)) sqrdists = square_distance(new_xyz, xyz) idx = sqrdists > radius ** 2 group_idx = P.Select()(idx, -1 * P.OnesLike()(group_idx), group_idx) - group_idx = P.Cast()(group_idx, ms.float32) + group_idx = P.Cast()(group_idx, mindspore.float32) group_idx, _ = P.TopK()(group_idx, nsample) - group_idx = P.Cast()(group_idx, 
ms.int32) + group_idx = P.Cast()(group_idx, mindspore.int32) group_first = group_idx[:, :, 0].view(B, S, 1) group_first = P.Tile()(group_first, (1, 1, nsample)) # [B, S, nsample] diff --git a/official/cv/PointNet2/train.py b/official/cv/PointNet2/train.py index 3698219fe1ef7973aefefa68b76e8cefe35a5bbd..e998e45f60263b8b1badad7c13ecb712f5b02de2 100644 --- a/official/cv/PointNet2/train.py +++ b/official/cv/PointNet2/train.py @@ -23,8 +23,8 @@ import time import mindspore import mindspore.dataset as ds import mindspore.nn as nn -from mindspore import Model, Tensor, context, load_checkpoint, load_param_into_net -from mindspore.context import ParallelMode +from mindspore import Model, Tensor, load_checkpoint, load_param_into_net +from mindspore import ParallelMode from mindspore.profiler import Profiler from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor from mindspore.communication.management import init, get_rank @@ -76,29 +76,29 @@ def content_init(args, device_id, device_num): raise ValueError("Unsupported platform {}".format(args.platform)) if _platform == "ascend": - context.set_context(mode=context.GRAPH_MODE, + mindspore.set_context(mode=0, device_target="Ascend", device_id=device_id) - context.set_context(max_call_depth=2048) + mindspore.set_context(max_call_depth=2048) if device_num > 1: init() - context.set_auto_parallel_context( + mindspore.set_auto_parallel_context( parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True) else: - context.set_context(mode=context.GRAPH_MODE, + mindspore.set_context(mode=0, device_target="GPU", max_call_depth=2048) if device_num > 1: mindspore.dataset.config.set_enable_shared_mem(False) - context.set_auto_parallel_context( - parallel_mode=context.ParallelMode.DATA_PARALLEL, + mindspore.set_auto_parallel_context( + parallel_mode=mindspore.ParallelMode.DATA_PARALLEL, gradients_mean=True, device_num=device_num) mindspore.common.set_seed(1234) init() else: - context.set_context(device_id=device_id) + mindspore.set_context(device_id=device_id) def get_data_url(args, rank_id=0): diff --git a/official/cv/ResNet/README.md b/official/cv/ResNet/README.md index 035dea1f05efdd169f308ee44b2a264c9858283e..c3f919be56066725f1f0ace0e334c2a5bed8f3a9 100644 --- a/official/cv/ResNet/README.md +++ b/official/cv/ResNet/README.md @@ -17,26 +17,31 @@ - [Usage](#usage) - [Running on Ascend](#running-on-ascend) - [Running on GPU](#running-on-gpu) - - [Running parameter server mode training](#running-parameter-server-mode-training) + - [Running parameter server mode training](#running-parameter-server-mode-training) - [Evaluation while training](#evaluation-while-training) - - [Result](#result) - - [Evaluation Process](#evaluation-process) - - [Usage](#usage-1) - - [Running on Ascend](#running-on-ascend-1) - - [Running on GPU](#running-on-gpu-1) - - [Result](#result-1) - - [Prediction Process](#prediction-process) - - [Prediction](#prediction) - - [Inference Process](#inference-process) - - [Export MindIR](#export-mindir) - - [Infer on Ascend310](#infer-on-ascend310) - - [result](#result-2) + - [Resume Process](#resume-process) + - [Usage](#usage-1) + - [Running on Ascend](#running-on-ascend-1) + - [Result](#result) + - [Evaluation Process](#evaluation-process) + - [Usage](#usage-2) + - [Running on Ascend](#running-on-ascend-2) + - [Running on GPU](#running-on-gpu-1) + - [Result](#result-1) + - [Prediction Process](#prediction-process) + - [Prediction](#prediction) + - [Inference Process](#inference-process) + - [Export 
MindIR](#export-mindir) + - [Infer on Ascend310](#infer-on-ascend310) + - [result](#result-2) - [Apply algorithm in MindSpore Golden Stick](#apply-algorithm-in-mindspore-golden-stick) - [Training Process](#training-process-1) - [Running on GPU](#running-on-gpu-2) - - [Evaluation Process](#evaluation-process-1) - - [Running on GPU](#running-on-gpu-3) - - [Result](#result-3) + - [Running on Ascend](#running-on-ascend-3) + - [Evaluation Process](#evaluation-process-1) + - [Running on GPU](#running-on-gpu-3) + - [Running on Ascend](#running-on-ascend-4) + - [Result](#result-3) - [Model Description](#model-description) - [Performance](#performance) - [Evaluation Performance](#evaluation-performance) @@ -46,17 +51,20 @@ - [ResNet50 on ImageNet2012](#resnet50-on-imagenet2012) - [ResNet34 on ImageNet2012](#resnet34-on-imagenet2012) - [ResNet101 on ImageNet2012](#resnet101-on-imagenet2012) + - [ResNet152 on ImageNet2012](#resnet152-on-imagenet2012) - [SE-ResNet50 on ImageNet2012](#se-resnet50-on-imagenet2012) - - [Inference Performance](#inference-performance) - - [ResNet18 on CIFAR-10](#resnet18-on-cifar-10-1) - - [ResNet18 on ImageNet2012](#resnet18-on-imagenet2012-1) - - [ResNet34 on ImageNet2012](#resnet34-on-imagenet2012-1) - - [ResNet50 on CIFAR-10](#resnet50-on-cifar-10-1) - - [ResNet50 on ImageNet2012](#resnet50-on-imagenet2012-1) - - [ResNet101 on ImageNet2012](#resnet101-on-imagenet2012-1) - - [SE-ResNet50 on ImageNet2012](#se-resnet50-on-imagenet2012-1) + - [Inference Performance](#inference-performance) + - [ResNet18 on CIFAR-10](#resnet18-on-cifar-10-1) + - [ResNet18 on ImageNet2012](#resnet18-on-imagenet2012-1) + - [ResNet34 on ImageNet2012](#resnet34-on-imagenet2012-1) + - [ResNet50 on CIFAR-10](#resnet50-on-cifar-10-1) + - [ResNet50 on ImageNet2012](#resnet50-on-imagenet2012-1) + - [ResNet101 on ImageNet2012](#resnet101-on-imagenet2012-1) + - [ResNet152 on ImageNet2012](#resnet152-on-imagenet2012-1) + - [SE-ResNet50 on ImageNet2012](#se-resnet50-on-imagenet2012-1) - [Description of Random Situation](#description-of-random-situation) - [ModelZoo Homepage](#modelzoo-homepage) +- [FAQ](#faq) # [ResNet Description](#contents) @@ -1466,7 +1474,7 @@ Refer to the [ModelZoo FAQ](https://gitee.com/mindspore/models#FAQ) for some com - **Q: How to use `boost` to get the best performance?** - **A**: We provide the `boost_level` in the `Model` interface, when you set it to `O1` or `O2` mode, the network will automatically speed up. The high-performance mode has been fully verified on resnet50, you can use the `resnet50_imagenet2012_Boost_config.yaml` to experience this mode. Meanwhile, in `O1` or `O2` mode, it is recommended to set the following environment variables: ` export ENV_FUSION_CLEAR=1; export DATASET_ENABLE_NUMA=True; export ENV_SINGLE_EVAL=1; export SKT_ENABLE=1;`. + **A**: We provide the `boost_level` in the `Model` interface, when you set it to `O1` or `O2` mode, the network will automatically speed up. The high-performance mode has been fully verified on resnet50, you can use the `resnet50_imagenet2012_Boost_config.yaml` to experience this mode. - **Q: How to use to preprocess imagenet2012 dataset?** @@ -1474,7 +1482,7 @@ Refer to the [ModelZoo FAQ](https://gitee.com/mindspore/models#FAQ) for some com - **Q: How to solve the memory shortage caused by accumulation operators such as ReduceMean and BiasAddGrad on 910B?** - **A**: Suggested adding `ms.set_context(ascend_config={"atomic_clean_policy": 0})` in `train.py`. 
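For reference, a minimal sketch of how the suggested setting slots into a typical `train.py` context block; the device-id handling mirrors the scripts in this patch and is illustrative only:

```python
import os
import mindspore

# Standard Ascend context init, as used by the training scripts in this patch.
mindspore.set_context(mode=0, device_target="Ascend",
                      device_id=int(os.getenv("DEVICE_ID", "0")))
# Workaround from the FAQ: relax the atomic clean policy to reduce memory
# pressure from accumulation ops (ReduceMean, BiasAddGrad) on 910B.
mindspore.set_context(ascend_config={"atomic_clean_policy": 0})
```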
If the problem still hasn't been resolved, please go to the [MindSpore community](https://gitee.com/mindspore/mindspore/issues) to submit an issue. + **A**: Suggested adding `mindspore.set_context(ascend_config={"atomic_clean_policy": 0})` in `train.py`. If the problem still hasn't been resolved, please go to the [MindSpore community](https://gitee.com/mindspore/mindspore/issues) to submit an issue. - **Q: How to solve the problem of `out of memory`?** diff --git a/official/cv/ResNet/README_CN.md b/official/cv/ResNet/README_CN.md index 9392125923e87ed5bbb68a8e5583a030d0b4d130..dc59a8b825fce1ce7ec43189ec7f42d5bac23cd8 100644 --- a/official/cv/ResNet/README_CN.md +++ b/official/cv/ResNet/README_CN.md @@ -16,44 +16,45 @@ - [脚本及样例代码](#脚本及样例代码) - [脚本参数](#脚本参数) - [训练过程](#训练过程) - - [用法](#用法) - - [Ascend处理器环境运行](#ascend处理器环境运行) - - [GPU处理器环境运行](#gpu处理器环境运行) - - [运行参数服务器模式训练](#运行参数服务器模式训练) - - [训练时推理](#训练时推理) - - [迁移训练过程](#迁移训练过程) - - [迁移数据集处理](#迁移数据集处理) - - [迁移训练Ckpt获取](#迁移训练ckpt获取) - - [用法](#用法-1) - - [结果](#结果) - - [迁移训练推理过程](#迁移训练推理过程) - - [用法](#用法-2) - - [续训过程](#续训过程) - - [用法](#用法-3) - - [Ascend处理器环境运行](#ascend处理器环境运行-1) - - [结果](#结果-1) - - [评估过程](#评估过程) - - [用法](#用法-4) - - [Ascend处理器环境运行](#ascend处理器环境运行-2) - - [GPU处理器环境运行](#gpu处理器环境运行-1) - - [结果](#结果-2) - - [预测过程](#预测过程) - - [预测](#预测) - - [推理过程](#推理过程) - - [导出MindIR](#导出mindir) - - [ONNX的导出与推理](#onnx的导出与推理) - - [执行推理](#执行推理) - - [结果](#结果-3) + - [用法](#用法) + - [Ascend处理器环境运行](#ascend处理器环境运行) + - [GPU处理器环境运行](#gpu处理器环境运行) + - [运行参数服务器模式训练](#运行参数服务器模式训练) + - [训练时推理](#训练时推理) + - [迁移训练过程](#迁移训练过程) + - [迁移数据集处理](#迁移数据集处理) + - [迁移训练Ckpt获取](#迁移训练ckpt获取) + - [用法](#用法-1) + - [结果](#结果) + - [迁移训练推理过程](#迁移训练推理过程) + - [用法](#用法-2) + - [续训过程](#续训过程) + - [用法](#用法-3) + - [Ascend处理器环境运行](#ascend处理器环境运行-1) + - [结果](#结果-1) + - [评估过程](#评估过程) + - [用法](#用法-4) + - [Ascend处理器环境运行](#ascend处理器环境运行-2) + - [GPU处理器环境运行](#gpu处理器环境运行-1) + - [结果](#结果-2) + - [预测过程](#预测过程) + - [预测](#预测) + - [推理过程](#推理过程) + - [导出MindIR](#导出mindir) + - [ONNX的导出与推理](#onnx的导出与推理) + - [执行推理](#执行推理) + - [结果](#结果-3) - [应用MindSpore Golden Stick模型压缩算法](#应用mindspore-golden-stick模型压缩算法) + - [mindspore\_gs环境安装参考gloden-stick](#mindspore_gs环境安装参考gloden-stick) - [训练过程](#训练过程-1) - [GPU处理器环境运行](#gpu处理器环境运行-2) - [Ascend处理器环境运行](#ascend处理器环境运行-3) - - [评估过程](#评估过程-1) - - [GPU处理器环境运行](#gpu处理器环境运行-3) - - [Ascend处理器环境运行](#ascend处理器环境运行-4) - - [结果](#结果-4) - - [GPU结果](#gpu结果) - - [Ascend结果](#ascend结果) + - [评估过程](#评估过程-1) + - [GPU处理器环境运行](#gpu处理器环境运行-3) + - [Ascend处理器环境运行](#ascend处理器环境运行-4) + - [结果](#结果-4) + - [GPU结果](#gpu结果) + - [Ascend结果](#ascend结果) - [模型描述](#模型描述) - [性能](#性能) - [评估性能](#评估性能) @@ -1415,7 +1416,7 @@ result:{'top_1_accuracy': 0.928385416666666} prune_rate=0.45 ckpt=~/resnet50_cif - **Q: 如何使用`boost`功能获取最优的性能?** - **A**: 我们在`Model`中提供了`boost_level`的入参,当你将其设置为O1或者O2模式时,框架会自动对网络的性能进行优化。当前这个模式已在resnet50上充分验证,你可以使用`resnet50_imagenet2012_Boost_config.yaml`来体验该模式。同时,在O1或者O2模式下,建议设置以下环境变量:`export ENV_FUSION_CLEAR=1;export DATASET_ENABLE_NUMA=True;export ENV_SINGLE_EVAL=1;export SKT_ENABLE=1;`来获取更好的性能。 + **A**: 我们在`Model`中提供了`boost_level`的入参,当你将其设置为O1或者O2模式时,框架会自动对网络的性能进行优化。当前这个模式已在resnet50上充分验证,你可以使用`resnet50_imagenet2012_Boost_config.yaml`来体验该模式。 - **Q: 如何使用对ImageNet2012数据集进行预处理?** @@ -1423,7 +1424,7 @@ result:{'top_1_accuracy': 0.928385416666666} prune_rate=0.45 ckpt=~/resnet50_cif - **Q: 如何解决910B硬件上因ReduceMean、BiasAddGrad等累加算子导致的内存不足?** - **A**: 建议在`train.py`中添加`ms.set_context(ascend_config={"atomic_clean_policy": 0})`,如果还是没有解决问题,请到[MindSpore社区](https://gitee.com/mindspore/mindspore/issues)提issue。 + **A**: 
建议在`train.py`中添加`mindspore.set_context(ascend_config={"atomic_clean_policy": 0})`,如果还是没有解决问题,请到[MindSpore社区](https://gitee.com/mindspore/mindspore/issues)提issue。 - **Q: 遇到`out of memory`如何解决?** diff --git a/official/cv/ResNet/eval.py b/official/cv/ResNet/eval.py index 239ae7d86d1a8516b72fd89b3d77f621c484e2d4..582b25334b7ecdebb3013089b6ff3d46a03e639c 100644 --- a/official/cv/ResNet/eval.py +++ b/official/cv/ResNet/eval.py @@ -14,7 +14,7 @@ # ============================================================================ """eval resnet.""" import os -import mindspore as ms +import mindspore from mindspore import Tensor from mindspore.nn.optim import Momentum from mindspore.nn.loss import SoftmaxCrossEntropyWithLogits @@ -22,7 +22,7 @@ from src.CrossEntropySmooth import CrossEntropySmooth from src.model_utils.config import config from src.model_utils.moxing_adapter import moxing_wrapper -ms.set_seed(1) +mindspore.set_seed(1) if config.net_name in ("resnet18", "resnet34", "resnet50", "resnet152"): if config.net_name == "resnet18": @@ -66,10 +66,10 @@ def eval_net(): """eval net""" target = config.device_target # init context - ms.set_context(mode=ms.GRAPH_MODE, device_target=target, save_graphs=False) + mindspore.set_context(mode=0, device_target=target, save_graphs=False, jit_config={"jit_level": "O2"}) if target == "Ascend": device_id = int(os.getenv('DEVICE_ID', '0')) - ms.set_context(device_id=device_id) + mindspore.set_context(device_id=device_id) # create dataset dataset = create_dataset(dataset_path=config.data_path, do_train=False, batch_size=config.batch_size, @@ -80,8 +80,8 @@ def eval_net(): net = resnet(class_num=config.class_num) # load checkpoint - param_dict = ms.load_checkpoint(config.checkpoint_file_path) - ms.load_param_into_net(net, param_dict) + param_dict = mindspore.load_checkpoint(config.checkpoint_file_path) + mindspore.load_param_into_net(net, param_dict) net.set_train(False) # define loss, model @@ -100,8 +100,8 @@ def eval_net(): opt = Momentum(group_params, Tensor(0.0), config.momentum, loss_scale=config.loss_scale) # define model, add boostmode for eval scenarios with train.py - model = ms.Model(net, loss_fn=loss, boost_level=config.boost_mode, - optimizer=opt, metrics={'top_1_accuracy', 'top_5_accuracy'}) + model = mindspore.Model(net, loss_fn=loss, boost_level=config.boost_mode, + optimizer=opt, metrics={'top_1_accuracy', 'top_5_accuracy'}) # eval model res = model.eval(dataset) diff --git a/official/cv/ResNet/export.py b/official/cv/ResNet/export.py index 106992af23b98c183fadabdb02f9ca95249c5c85..2fe07032dd8b847b72a7120695ff1c23310b650f 100644 --- a/official/cv/ResNet/export.py +++ b/official/cv/ResNet/export.py @@ -18,13 +18,13 @@ python export.py """ import os -import mindspore as ms +import mindspore from src.model_utils.config import config from src.model_utils.moxing_adapter import moxing_wrapper -ms.set_context(mode=ms.GRAPH_MODE, device_target=config.device_target) +mindspore.set_context(mode=0, device_target=config.device_target) if config.device_target != "GPU": - ms.set_context(device_id=config.device_id) + mindspore.set_context(device_id=config.device_id) def modelarts_pre_process(): @@ -54,10 +54,10 @@ def run_export(): assert config.checkpoint_file_path is not None, "checkpoint_path is None." 
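For reference, the export sequence that the hunk below migrates, pulled out as a standalone sketch; the network argument and input shape are placeholders, since the real script reads them from `config`:

```python
import mindspore
from mindspore import nn

def export_mindir(net: nn.Cell, ckpt_path: str, file_name: str,
                  batch_size: int = 1, height: int = 224, width: int = 224):
    """Load a checkpoint into `net` and export it as MINDIR (sketch)."""
    param_dict = mindspore.load_checkpoint(ckpt_path)
    mindspore.load_param_into_net(net, param_dict)
    # A zero-filled dummy input fixes the graph's input shape for export.
    input_arr = mindspore.numpy.zeros([batch_size, 3, height, width],
                                      mindspore.float32)
    mindspore.export(net, input_arr, file_name=file_name, file_format="MINDIR")
```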
- param_dict = ms.load_checkpoint(config.checkpoint_file_path) - ms.load_param_into_net(net, param_dict) - input_arr = ms.numpy.zeros([config.batch_size, 3, config.height, config.width], ms.float32) - ms.export(net, input_arr, file_name=config.file_name, file_format=config.file_format) + param_dict = mindspore.load_checkpoint(config.checkpoint_file_path) + mindspore.load_param_into_net(net, param_dict) + input_arr = mindspore.numpy.zeros([config.batch_size, 3, config.height, config.width], mindspore.float32) + mindspore.export(net, input_arr, file_name=config.file_name, file_format=config.file_format) if __name__ == '__main__': run_export() diff --git a/official/cv/ResNet/fine_tune.py b/official/cv/ResNet/fine_tune.py index a43f2fe79a64708290dbf5fc9213b319755da037..0799aa8a767fa952aafd96519606d217cec37731 100644 --- a/official/cv/ResNet/fine_tune.py +++ b/official/cv/ResNet/fine_tune.py @@ -15,7 +15,7 @@ """train resnet34.""" import os -import mindspore as ms +import mindspore import mindspore.nn as nn from mindspore.train.model import Model @@ -29,7 +29,7 @@ from src.util import eval_callback, set_output_dir from src.logger import get_logger -ms.set_context(mode=ms.GRAPH_MODE, device_target=config.device_target, save_graphs=False) +mindspore.set_context(mode=0, device_target=config.device_target, save_graphs=False) def import_data(): @@ -77,7 +77,7 @@ def init_weight(net, param_dict): if config.filter_weight: filter_list = [x.name for x in net.end_point.get_parameters()] filter_checkpoint_parameter_by_list(param_dict, filter_list) - ms.load_param_into_net(net, param_dict) + mindspore.load_param_into_net(net, param_dict) def eval_net(net, dataset): @@ -87,7 +87,7 @@ def eval_net(net, dataset): loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean') # define model - model = ms.Model(net, loss_fn=loss, metrics={'top_1_accuracy', 'top_5_accuracy'}) + model = mindspore.Model(net, loss_fn=loss, metrics={'top_1_accuracy', 'top_5_accuracy'}) # eval model res = model.eval(dataset) @@ -99,7 +99,7 @@ def finetune_train(): config.logger = get_logger(config.log_dir, 0) dataset_train, data_val = import_data() - ckpt_param_dict = ms.load_checkpoint(config.checkpoint_path) + ckpt_param_dict = mindspore.load_checkpoint(config.checkpoint_path) net = resnet34(class_num=1001) init_weight(net=net, param_dict=ckpt_param_dict) config.logger.info("net parameter:") @@ -133,7 +133,7 @@ def finetune_train(): # define callbacks step_size = dataset_train.get_dataset_size() time_cb = TimeMonitor(data_size=step_size) - lr = ms.Tensor([config.learning_rate] * step_size * config.epoch_size) + lr = mindspore.Tensor([config.learning_rate] * step_size * config.epoch_size) loss_cb = LossCallBack(config.epoch_size, config.logger, lr, per_print_time=10) cb = [time_cb, loss_cb] diff --git a/official/cv/ResNet/golden_stick/ghost/eval.py b/official/cv/ResNet/golden_stick/ghost/eval.py index 916f37e1808eb27c6c26b951ca14c95f46eaf815..c579ee434ce889ca4a3a8dc5d96beb57ecc323da 100644 --- a/official/cv/ResNet/golden_stick/ghost/eval.py +++ b/official/cv/ResNet/golden_stick/ghost/eval.py @@ -15,7 +15,7 @@ """eval resnet.""" import os import numpy as np -import mindspore as ms +import mindspore from mindspore.nn.loss import SoftmaxCrossEntropyWithLogits from mindspore_gs import GhostAlgo from src.CrossEntropySmooth import CrossEntropySmooth @@ -27,7 +27,7 @@ if config.dataset == "cifar10": else: from src.dataset import create_dataset2 as create_dataset -ms.set_seed(1) +mindspore.set_seed(1) def eval_net(): @@ -35,10 +35,10 @@ 
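One pattern worth flagging before the hunk below: the patch passes `mode` as a bare integer. Assuming a MindSpore 2.x build that exports the mode constants at package top level, `0` and `1` are the values behind `GRAPH_MODE` and `PYNATIVE_MODE`, so the literal and named forms are interchangeable:

```python
import mindspore

# GRAPH_MODE == 0 and PYNATIVE_MODE == 1, so these two calls are equivalent
# (device target is illustrative):
mindspore.set_context(mode=0, device_target="GPU")
mindspore.set_context(mode=mindspore.GRAPH_MODE, device_target="GPU")
```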
def eval_net(): target = config.device_target # init context - ms.set_context(mode=ms.GRAPH_MODE, device_target=target, save_graphs=False) + mindspore.set_context(mode=0, device_target=target, save_graphs=False) if target == "Ascend": device_id = int(os.getenv('DEVICE_ID')) - ms.set_context(device_id=device_id) + mindspore.set_context(device_id=device_id) # create dataset dataset = create_dataset(dataset_path=config.data_path, do_train=False, batch_size=config.batch_size, @@ -48,9 +48,9 @@ def eval_net(): net = resnet(class_num=config.class_num) algo = GhostAlgo({}) net = algo.apply(net) - param_dict = ms.load_checkpoint(config.checkpoint_file_path) + param_dict = mindspore.load_checkpoint(config.checkpoint_file_path) # load checkpoint - ms.load_param_into_net(net, param_dict) + mindspore.load_param_into_net(net, param_dict) net.set_train(False) # define loss, model @@ -64,7 +64,7 @@ def eval_net(): loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean') # define model - model = ms.Model(net, loss_fn=loss, metrics={'top_1_accuracy'}) + model = mindspore.Model(net, loss_fn=loss, metrics={'top_1_accuracy'}) # eval model res = model.eval(dataset) diff --git a/official/cv/ResNet/golden_stick/ghost/train.py b/official/cv/ResNet/golden_stick/ghost/train.py index b073627085b7a5ab102b0508142e46440a52c605..dcde2bfb09a875cafcaf92491db0f839cb20f364 100644 --- a/official/cv/ResNet/golden_stick/ghost/train.py +++ b/official/cv/ResNet/golden_stick/ghost/train.py @@ -15,7 +15,7 @@ """train resnet.""" import os -import mindspore as ms +import mindspore import mindspore.nn as nn import mindspore.common.initializer as weight_init from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor @@ -33,7 +33,7 @@ if config.dataset == "cifar10": else: from src.dataset import create_dataset2 as create_dataset -ms.set_seed(1) +mindspore.set_seed(1) def filter_checkpoint_parameter_by_list(origin_dict, param_filter): @@ -54,27 +54,27 @@ def set_parameter(): # init context if config.mode_name == "GRAPH": - ms.set_context(mode=ms.GRAPH_MODE, device_target=target, save_graphs=False) + mindspore.set_context(mode=0, device_target=target, save_graphs=False) else: - ms.set_context(mode=ms.PYNATIVE_MODE, device_target=target, save_graphs=False) + mindspore.set_context(mode=1, device_target=target, save_graphs=False) if config.run_distribute: if target == "Ascend": device_id = int(os.getenv('DEVICE_ID')) - ms.set_context(device_id=device_id) - ms.set_auto_parallel_context(device_num=config.device_num, parallel_mode=ms.ParallelMode.DATA_PARALLEL, + mindspore.set_context(device_id=device_id) + mindspore.set_auto_parallel_context(device_num=config.device_num, parallel_mode=mindspore.ParallelMode.DATA_PARALLEL, gradients_mean=True) set_algo_parameters(elementwise_op_strategy_follow=True) if config.boost_mode not in ["O1", "O2"]: - ms.set_auto_parallel_context(all_reduce_fusion_config=config.all_reduce_fusion_config) + mindspore.set_auto_parallel_context(all_reduce_fusion_config=config.all_reduce_fusion_config) init() else: # GPU target init() - ms.set_auto_parallel_context(device_num=config.device_num, - parallel_mode=ms.ParallelMode.DATA_PARALLEL, + mindspore.set_auto_parallel_context(device_num=config.device_num, + parallel_mode=mindspore.ParallelMode.DATA_PARALLEL, gradients_mean=True) - ms.set_auto_parallel_context(all_reduce_fusion_config=config.all_reduce_fusion_config) + mindspore.set_auto_parallel_context(all_reduce_fusion_config=config.all_reduce_fusion_config) def 
init_weight(net, param_dict): @@ -90,7 +90,7 @@ def init_weight(net, param_dict): if config.filter_weight: filter_list = [x.name for x in net.end_point.get_parameters()] filter_checkpoint_parameter_by_list(param_dict, filter_list) - ms.load_param_into_net(net, param_dict) + mindspore.load_param_into_net(net, param_dict) else: for _, cell in net.cells_and_names(): if isinstance(cell, nn.Conv2d): @@ -106,11 +106,11 @@ def init_weight(net, param_dict): def load_fp32_ckpt(net): if config.fp32_ckpt: if os.path.isfile(config.fp32_ckpt): - ckpt = ms.load_checkpoint(config.fp32_ckpt) + ckpt = mindspore.load_checkpoint(config.fp32_ckpt) if config.filter_weight: filter_list = [x.name for x in net.end_point.get_parameters()] filter_checkpoint_parameter_by_list(ckpt, filter_list) - ms.load_param_into_net(net, ckpt) + mindspore.load_param_into_net(net, ckpt) else: print(f"Invalid fp32_ckpt {config.fp32_ckpt} parameter.") @@ -118,7 +118,7 @@ def load_fp32_ckpt(net): def load_pretrained_ckpt(net): if config.pre_trained: if os.path.isfile(config.pre_trained): - ckpt = ms.load_checkpoint(config.pre_trained) + ckpt = mindspore.load_checkpoint(config.pre_trained) if ckpt.get("epoch_num") and ckpt.get("step_num"): config.has_trained_epoch = int(ckpt["epoch_num"].data.asnumpy()) config.has_trained_step = int(ckpt["step_num"].data.asnumpy()) @@ -134,7 +134,7 @@ def load_pretrained_ckpt(net): if config.filter_weight: filter_list = [x.name for x in net.end_point.get_parameters()] filter_checkpoint_parameter_by_list(ckpt, filter_list) - not_load_param, _ = ms.load_param_into_net(net, ckpt) + not_load_param, _ = mindspore.load_param_into_net(net, ckpt) if not_load_param: raise RuntimeError("Load param into net fail.") else: @@ -207,7 +207,7 @@ def train_net(): loss_scale=config.loss_scale ) kf_loss_fn = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean') - loss_scale = ms.FixedLossScaleManager(config.loss_scale, drop_overflow_update=False) + loss_scale = mindspore.FixedLossScaleManager(config.loss_scale, drop_overflow_update=False) ckpt_save_dir = set_save_ckpt_dir() config_ck = CheckpointConfig(save_checkpoint_steps=5 * step_size, keep_checkpoint_max=config.keep_checkpoint_max) @@ -217,7 +217,7 @@ def train_net(): loss_cb = LossMonitor() metrics = {"acc"} cb = [loss_cb, time_cb, ckpt_cb] - model = ms.Model(net, loss_fn=kf_loss_fn, optimizer=optimizer, loss_scale_manager=loss_scale, metrics=metrics, + model = mindspore.Model(net, loss_fn=kf_loss_fn, optimizer=optimizer, loss_scale_manager=loss_scale, metrics=metrics, boost_level=config.boost_mode, boost_config_dict={"grad_freeze": {"total_steps": config.epoch_size * step_size}}) model.train(config.epoch_size, dataset, callbacks=cb, dataset_sink_mode=True) diff --git a/official/cv/ResNet/golden_stick/pruner/scop/eval.py b/official/cv/ResNet/golden_stick/pruner/scop/eval.py index cb866278e00b59584bc5fd8653620a907677e678..ed5d545e8b21965f58a42e154800f0082de5f2ba 100644 --- a/official/cv/ResNet/golden_stick/pruner/scop/eval.py +++ b/official/cv/ResNet/golden_stick/pruner/scop/eval.py @@ -15,7 +15,7 @@ """eval resnet.""" import os import numpy as np -import mindspore as ms +import mindspore from mindspore.nn.loss import SoftmaxCrossEntropyWithLogits from mindspore_gs import PrunerKfCompressAlgo, PrunerFtCompressAlgo from mindspore_gs.pruner.scop.scop_pruner import KfConv2d, MaskedConv2dbn @@ -28,7 +28,7 @@ if config.dataset == "cifar10": else: from src.dataset import create_dataset2 as create_dataset -ms.set_seed(1) +mindspore.set_seed(1) def eval_net(): @@ 
-36,10 +36,10 @@ def eval_net(): target = config.device_target # init context - ms.set_context(mode=ms.GRAPH_MODE, device_target=target, save_graphs=False) + mindspore.set_context(mode=0, device_target=target, save_graphs=False) if target == "Ascend": device_id = int(os.getenv('DEVICE_ID')) - ms.set_context(device_id=device_id) + mindspore.set_context(device_id=device_id) # create dataset dataset = create_dataset(dataset_path=config.data_path, do_train=False, batch_size=config.batch_size, @@ -49,7 +49,7 @@ def eval_net(): net = resnet(class_num=config.class_num) net = PrunerKfCompressAlgo({}).apply(net) out_index = [] - param_dict = ms.load_checkpoint(config.checkpoint_file_path) + param_dict = mindspore.load_checkpoint(config.checkpoint_file_path) for key in param_dict.keys(): if 'out_index' in key: out_index.append(param_dict[key]) @@ -60,7 +60,7 @@ def eval_net(): net = ft_algo._recover_conv(net) # load checkpoint - ms.load_param_into_net(net, param_dict) + mindspore.load_param_into_net(net, param_dict) net.set_train(False) # define loss, model @@ -74,7 +74,7 @@ def eval_net(): loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean') # define model - model = ms.Model(net, loss_fn=loss, metrics={'top_1_accuracy'}) + model = mindspore.Model(net, loss_fn=loss, metrics={'top_1_accuracy'}) # eval model res = model.eval(dataset) diff --git a/official/cv/ResNet/golden_stick/pruner/scop/infer.py b/official/cv/ResNet/golden_stick/pruner/scop/infer.py index 70597796de7b0c5333f675a74529b0710d46ac5d..f30157e88bf6ddb6ef4d84c29b32618abf740d17 100644 --- a/official/cv/ResNet/golden_stick/pruner/scop/infer.py +++ b/official/cv/ResNet/golden_stick/pruner/scop/infer.py @@ -14,7 +14,7 @@ # ============================================================================ """infer scop_resnet mindir.""" import datetime -import mindspore as ms +import mindspore import mindspore.nn as nn from src.model_utils.config import config from src.model_utils.moxing_adapter import moxing_wrapper @@ -32,7 +32,7 @@ def infer_net(): raise ValueError("Currently only support GPU.") # init context - ms.set_context(mode=ms.GRAPH_MODE, device_target=target, save_graphs=False) + mindspore.set_context(mode=0, device_target=target, save_graphs=False) # create dataset dataset = create_dataset(dataset_path=config.data_path, do_train=False, batch_size=config.batch_size, @@ -40,7 +40,7 @@ def infer_net(): step_size = dataset.get_dataset_size() # load mindir - graph = ms.load(config.mindir_path) + graph = mindspore.load(config.mindir_path) net = nn.GraphCell(graph) print("start infer") @@ -49,7 +49,7 @@ def infer_net(): for _, data in enumerate(data_loader): images = data["image"] start_time = datetime.datetime.now() - net(ms.Tensor(images)) + net(mindspore.Tensor(images)) end_time = datetime.datetime.now() total_time += (end_time - start_time).microseconds diff --git a/official/cv/ResNet/golden_stick/pruner/scop/train.py b/official/cv/ResNet/golden_stick/pruner/scop/train.py index 6e1cd12c5bfb05ef3251672bdfdaf6751a7e8f8a..13d7d667778c2185651a264ca6872187cfb9fb5f 100644 --- a/official/cv/ResNet/golden_stick/pruner/scop/train.py +++ b/official/cv/ResNet/golden_stick/pruner/scop/train.py @@ -16,7 +16,7 @@ import os import numpy as np -import mindspore as ms +import mindspore import mindspore.nn as nn import mindspore.ops as ops from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor @@ -36,7 +36,7 @@ if config.dataset == "cifar10": else: from src.dataset import create_dataset2 as 
create_dataset -ms.set_seed(1) +mindspore.set_seed(1) class NetWithLossCell(nn.WithLossCell): @@ -74,27 +74,27 @@ def set_parameter(): # init context if config.mode_name == "GRAPH": - ms.set_context(mode=ms.GRAPH_MODE, device_target=target, save_graphs=False) + mindspore.set_context(mode=0, device_target=target, save_graphs=False) else: - ms.set_context(mode=ms.PYNATIVE_MODE, device_target=target, save_graphs=False) + mindspore.set_context(mode=1, device_target=target, save_graphs=False) if config.run_distribute: if target == "Ascend": device_id = int(os.getenv('DEVICE_ID')) - ms.set_context(device_id=device_id) - ms.set_auto_parallel_context(device_num=config.device_num, parallel_mode=ms.ParallelMode.DATA_PARALLEL, + mindspore.set_context(device_id=device_id) + mindspore.set_auto_parallel_context(device_num=config.device_num, parallel_mode=mindspore.ParallelMode.DATA_PARALLEL, gradients_mean=True) set_algo_parameters(elementwise_op_strategy_follow=True) if config.boost_mode not in ["O1", "O2"]: - ms.set_auto_parallel_context(all_reduce_fusion_config=config.all_reduce_fusion_config) + mindspore.set_auto_parallel_context(all_reduce_fusion_config=config.all_reduce_fusion_config) init() else: # GPU target init() - ms.set_auto_parallel_context(device_num=config.device_num, - parallel_mode=ms.ParallelMode.DATA_PARALLEL, + mindspore.set_auto_parallel_context(device_num=config.device_num, + parallel_mode=mindspore.ParallelMode.DATA_PARALLEL, gradients_mean=True) - ms.set_auto_parallel_context(all_reduce_fusion_config=config.all_reduce_fusion_config) + mindspore.set_auto_parallel_context(all_reduce_fusion_config=config.all_reduce_fusion_config) def init_weight(net, param_dict): @@ -110,12 +110,12 @@ def init_weight(net, param_dict): if config.filter_weight: filter_list = [x.name for x in net.end_point.get_parameters()] filter_checkpoint_parameter_by_list(param_dict, filter_list) - ms.load_param_into_net(net, param_dict) + mindspore.load_param_into_net(net, param_dict) else: for _, cell in net.cells_and_names(): if isinstance(cell, nn.Conv2d): if config.conv_init == "XavierUniform": - cell.weight.set_data(ms.common.initializer.initializer(ms.common.initializer.XavierUniform(), + cell.weight.set_data(mindspore.common.initializer.initializer(mindspore.common.initializer.XavierUniform(), cell.weight.shape, cell.weight.dtype)) elif config.conv_init == "TruncatedNormal": @@ -125,25 +125,25 @@ def init_weight(net, param_dict): cell.weight.set_data(weight) if isinstance(cell, nn.Dense): if config.dense_init == "TruncatedNormal": - cell.weight.set_data(ms.common.initializer.initializer(ms.common.initializer.TruncatedNormal(), + cell.weight.set_data(mindspore.common.initializer.initializer(mindspore.common.initializer.TruncatedNormal(), cell.weight.shape, cell.weight.dtype)) elif config.dense_init == "RandomNormal": in_channel = cell.in_channels out_channel = cell.out_channels weight = np.random.normal(loc=0, scale=0.01, size=out_channel * in_channel) - weight = ms.Tensor(np.reshape(weight, (out_channel, in_channel)), dtype=cell.weight.dtype) + weight = mindspore.Tensor(np.reshape(weight, (out_channel, in_channel)), dtype=cell.weight.dtype) cell.weight.set_data(weight) def load_fp32_ckpt(net): if config.fp32_ckpt: if os.path.isfile(config.fp32_ckpt): - ckpt = ms.load_checkpoint(config.fp32_ckpt) + ckpt = mindspore.load_checkpoint(config.fp32_ckpt) if config.filter_weight: filter_list = [x.name for x in net.end_point.get_parameters()] filter_checkpoint_parameter_by_list(ckpt, filter_list) - 
ms.load_param_into_net(net, ckpt) + mindspore.load_param_into_net(net, ckpt) else: print(f"Invalid fp32_ckpt {config.fp32_ckpt} parameter.") @@ -151,7 +151,7 @@ def load_fp32_ckpt(net): def load_pretrained_ckpt(net): if config.pre_trained: if os.path.isfile(config.pre_trained): - ckpt = ms.load_checkpoint(config.pre_trained) + ckpt = mindspore.load_checkpoint(config.pre_trained) if ckpt.get("epoch_num") and ckpt.get("step_num"): config.has_trained_epoch = int(ckpt["epoch_num"].data.asnumpy()) config.has_trained_step = int(ckpt["step_num"].data.asnumpy()) @@ -167,7 +167,7 @@ def load_pretrained_ckpt(net): if config.filter_weight: filter_list = [x.name for x in net.end_point.get_parameters()] filter_checkpoint_parameter_by_list(ckpt, filter_list) - not_load_param, _ = ms.load_param_into_net(net, ckpt) + not_load_param, _ = mindspore.load_param_into_net(net, ckpt) if not_load_param: raise RuntimeError("Load param into net fail.") else: @@ -247,7 +247,7 @@ def train_net(): if config.pre_trained: train_ft(net) else: - model = ms.Model(net, loss_fn=kf_loss_fn, optimizer=optimizer) + model = mindspore.Model(net, loss_fn=kf_loss_fn, optimizer=optimizer) model.train(config.epoch_kf, dataset, callbacks=cb, dataset_sink_mode=False) train_ft(net) @@ -261,7 +261,7 @@ def train_ft(net): algo_ft = PrunerFtCompressAlgo({'prune_rate': config.prune_rate}) net = algo_ft.apply(net) load_pretrained_ckpt(net) - lr_ft_new = ms.Tensor(get_lr(lr_init=config.lr_init, + lr_ft_new = mindspore.Tensor(get_lr(lr_init=config.lr_init, lr_end=config.lr_end_ft, lr_max=config.lr_max_ft, warmup_epochs=config.warmup_epochs, @@ -279,7 +279,7 @@ def train_ft(net): metrics = {"acc"} loss_scale = FixedLossScaleManager(1024, drop_overflow_update=False) ft_loss_fn = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean') - model_ft = ms.Model(net, loss_fn=ft_loss_fn, optimizer=optimizer_ft, loss_scale_manager=loss_scale, + model_ft = mindspore.Model(net, loss_fn=ft_loss_fn, optimizer=optimizer_ft, loss_scale_manager=loss_scale, metrics=metrics, amp_level="O2", boost_level="O0", keep_batchnorm_fp32=False) diff --git a/official/cv/ResNet/golden_stick/pruner/uni_pruning/eval.py b/official/cv/ResNet/golden_stick/pruner/uni_pruning/eval.py index 256685f0c73ba262cac766a2d92f56018bf37552..989abf9ca0ebfe9222036a9e458610eca5594404 100644 --- a/official/cv/ResNet/golden_stick/pruner/uni_pruning/eval.py +++ b/official/cv/ResNet/golden_stick/pruner/uni_pruning/eval.py @@ -16,7 +16,7 @@ import os import json import numpy as np -import mindspore as ms +import mindspore from mindspore.nn.loss import SoftmaxCrossEntropyWithLogits from mindspore_gs.pruner.uni_pruning import UniPruner @@ -29,7 +29,7 @@ if config.dataset == "cifar10": else: from src.dataset import create_dataset2 as create_dataset -ms.set_seed(1) +mindspore.set_seed(1) def eval_net(): @@ -37,12 +37,12 @@ def eval_net(): target = config.device_target # init context - ms.set_context(mode=ms.GRAPH_MODE, device_target=target, save_graphs=False) + mindspore.set_context(mode=0, device_target=target, save_graphs=False) if target == "Ascend": device_id = int(os.getenv('DEVICE_ID')) else: device_id = config.device_id - ms.set_context(device_id=device_id) + mindspore.set_context(device_id=device_id) # create dataset dataset = create_dataset(dataset_path=config.data_path, do_train=False, batch_size=config.batch_size, @@ -73,7 +73,7 @@ def eval_net(): else: mask = None tag = 'original' - ms.load_checkpoint(config.checkpoint_file_path, net) + 
mindspore.load_checkpoint(config.checkpoint_file_path, net) algo.prune_by_mask(net, mask, config, tag) @@ -88,7 +88,7 @@ def eval_net(): loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean') # define model - model = ms.Model(net, loss_fn=loss, metrics={'top_1_accuracy'}) + model = mindspore.Model(net, loss_fn=loss, metrics={'top_1_accuracy'}) # eval model res = model.eval(dataset) diff --git a/official/cv/ResNet/golden_stick/pruner/uni_pruning/export.py b/official/cv/ResNet/golden_stick/pruner/uni_pruning/export.py index 60b8e0672a3e25263bd396fc67b19ae9e0b8a976..d2e5ee532d024e69187976f5c0115b4777eff44b 100644 --- a/official/cv/ResNet/golden_stick/pruner/uni_pruning/export.py +++ b/official/cv/ResNet/golden_stick/pruner/uni_pruning/export.py @@ -17,7 +17,7 @@ pruning masks (.json format) are obtained during training in the experiment directory.""" import os import numpy as np -import mindspore as ms +import mindspore from mindspore_gs.pruner.uni_pruning import UniPruner #pylint: disable=ungrouped-imports from src.resnet import resnet18, resnet50 @@ -30,12 +30,12 @@ def export(): target = config.device_target # init context - ms.set_context(mode=ms.GRAPH_MODE, device_target=target, save_graphs=False) + mindspore.set_context(mode=0, device_target=target, save_graphs=False) if target == "Ascend": device_id = int(os.getenv('DEVICE_ID')) else: device_id = config.device_id - ms.set_context(device_id=device_id) + mindspore.set_context(device_id=device_id) # define net if config.net_name == 'resnet18': @@ -60,8 +60,8 @@ def export(): ckpt_path=config.checkpoint_file_path, mask_path=config.mask_path) inputs = np.random.uniform(0.0, 1.0, size=input_size).astype(np.float32) - inputs = ms.Tensor(inputs) - ms.export(net_deploy, inputs, file_name=f"{save_path}_pruned.mindir", file_format="MINDIR") + inputs = mindspore.Tensor(inputs) + mindspore.export(net_deploy, inputs, file_name=f"{save_path}_pruned.mindir", file_format="MINDIR") if __name__ == '__main__': diff --git a/official/cv/ResNet/golden_stick/pruner/uni_pruning/train.py b/official/cv/ResNet/golden_stick/pruner/uni_pruning/train.py index 4bb6544233ec5ce3c0055801ba6136e68c46cab5..bf3512640d4c3030eb62d3583272fb9b05392883 100644 --- a/official/cv/ResNet/golden_stick/pruner/uni_pruning/train.py +++ b/official/cv/ResNet/golden_stick/pruner/uni_pruning/train.py @@ -16,8 +16,8 @@ import os import numpy as np -import mindspore as ms -from mindspore import context, nn +import mindspore +from mindspore import nn from mindspore.train.model import ParallelMode from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor from mindspore.communication.management import init, get_rank @@ -39,7 +39,7 @@ else: else: from src.dataset import create_dataset_pynative as create_dataset -ms.set_seed(1) +mindspore.set_seed(1) def filter_checkpoint_parameter_by_list(origin_dict, param_filter): @@ -58,10 +58,10 @@ def init_env(args): device_num = 1 if args.mode_name == 'GRAPH' and args.device_target == "GPU": print('GPU GRAPH MODE') - context.set_context(mode=context.GRAPH_MODE, + mindspore.set_context(mode=0, device_target=args.device_target, device_id=args.device_id) if args.device_num > 1: - context.set_auto_parallel_context(parallel_mode=ParallelMode.AUTO_PARALLEL, + mindspore.set_auto_parallel_context(parallel_mode=ParallelMode.AUTO_PARALLEL, gradients_mean=True) init("nccl") rank = get_rank() @@ -72,19 +72,19 @@ def init_env(args): device_num = int(os.getenv('RANK_SIZE')) device_id = int(os.getenv('DEVICE_ID')) rank 
= int(os.getenv('RANK_ID')) - context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target) - context.set_context(max_call_depth=2000) + mindspore.set_context(mode=0, device_target=args.device_target) + mindspore.set_context(max_call_depth=2000) if device_num > 1: os.environ['MINDSPORE_HCCL_CONFIG_PATH'] = os.getenv('RANK_TABLE_FILE') - context.set_context(device_id=device_id) + mindspore.set_context(device_id=device_id) if device_num > 1: - context.set_auto_parallel_context(device_num=device_num, + mindspore.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True) init() else: print(f'Single node pynative mode on {args.device_target}') - context.set_context(mode=context.PYNATIVE_MODE, device_target=args.device_target, + mindspore.set_context(mode=1, device_target=args.device_target, device_id=args.device_id) return rank @@ -95,7 +95,7 @@ def init_weight(net): for _, cell in net.cells_and_names(): if isinstance(cell, nn.Conv2d): if config.conv_init == "XavierUniform": - cell.weight.set_data(ms.common.initializer.initializer(ms.common.initializer.XavierUniform(), + cell.weight.set_data(mindspore.common.initializer.initializer(mindspore.common.initializer.XavierUniform(), cell.weight.shape, cell.weight.dtype)) elif config.conv_init == "TruncatedNormal": @@ -105,14 +105,14 @@ def init_weight(net): cell.weight.set_data(weight) if isinstance(cell, nn.Dense): if config.dense_init == "TruncatedNormal": - cell.weight.set_data(ms.common.initializer.initializer(ms.common.initializer.TruncatedNormal(), + cell.weight.set_data(mindspore.common.initializer.initializer(mindspore.common.initializer.TruncatedNormal(), cell.weight.shape, cell.weight.dtype)) elif config.dense_init == "RandomNormal": in_channel = cell.in_channels out_channel = cell.out_channels weight = np.random.normal(loc=0, scale=0.01, size=out_channel * in_channel) - weight = ms.Tensor(np.reshape(weight, (out_channel, in_channel)), dtype=cell.weight.dtype) + weight = mindspore.Tensor(np.reshape(weight, (out_channel, in_channel)), dtype=cell.weight.dtype) cell.weight.set_data(weight) @@ -155,7 +155,7 @@ def load_pretrained_ckpt(net): """load checkpoint""" if config.pre_trained: if os.path.isfile(config.pre_trained): - ckpt = ms.load_checkpoint(config.pre_trained) + ckpt = mindspore.load_checkpoint(config.pre_trained) if ckpt.get("epoch_num") and ckpt.get("step_num"): config.has_trained_epoch = int(ckpt["epoch_num"].data.asnumpy()) config.has_trained_step = int(ckpt["step_num"].data.asnumpy()) @@ -166,7 +166,7 @@ def load_pretrained_ckpt(net): if config.filter_weight: filter_list = [x.name for x in net.end_point.get_parameters()] filter_checkpoint_parameter_by_list(ckpt, filter_list) - ms.load_param_into_net(net, ckpt) + mindspore.load_param_into_net(net, ckpt) else: print(f"Invalid pre_trained {config.pre_trained} parameter.") @@ -218,7 +218,7 @@ def train_net(): if config.pre_trained: lr = lr[config.has_trained_epoch * step_size:] - lr = ms.Tensor(lr) + lr = mindspore.Tensor(lr) # define optimizer group_params = init_group_params(net) if config.optimizer == 'Momentum': @@ -230,7 +230,7 @@ def train_net(): metrics = {"acc"} if config.run_distribute: metrics = {'acc': DistAccuracy(batch_size=config.batch_size, device_num=config.device_num)} - model = ms.Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics=metrics, + model = mindspore.Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics=metrics, amp_level="O2", 
boost_level="O0", keep_batchnorm_fp32=False) # define callbacks diff --git a/official/cv/ResNet/golden_stick/quantization/simqat/eval.py b/official/cv/ResNet/golden_stick/quantization/simqat/eval.py index fcee1eaa17434f79cb58c816678921974f3d8ebb..9910c1cb5e7164cc7f011a537e3f76513349012c 100644 --- a/official/cv/ResNet/golden_stick/quantization/simqat/eval.py +++ b/official/cv/ResNet/golden_stick/quantization/simqat/eval.py @@ -14,7 +14,7 @@ # ============================================================================ """eval resnet.""" import os -import mindspore as ms +import mindspore import mindspore.log as logger from mindspore.nn.loss import SoftmaxCrossEntropyWithLogits from src.CrossEntropySmooth import CrossEntropySmooth @@ -26,7 +26,7 @@ if config.dataset == "cifar10": else: from src.dataset import create_dataset2 as create_dataset -ms.set_seed(1) +mindspore.set_seed(1) def get_comp_algo(): @@ -44,10 +44,10 @@ def eval_net(): target = config.device_target # init context - ms.set_context(mode=ms.GRAPH_MODE, device_target=target, save_graphs=False) + mindspore.set_context(mode=0, device_target=target, save_graphs=False) if target == "Ascend": device_id = int(os.getenv('DEVICE_ID')) - ms.set_context(device_id=device_id) + mindspore.set_context(device_id=device_id) # create dataset dataset = create_dataset(dataset_path=config.data_path, do_train=False, batch_size=config.batch_size, @@ -59,8 +59,8 @@ def eval_net(): net = algo.apply(net) # load checkpoint - param_dict = ms.load_checkpoint(config.checkpoint_file_path) - ms.load_param_into_net(net, param_dict) + param_dict = mindspore.load_checkpoint(config.checkpoint_file_path) + mindspore.load_param_into_net(net, param_dict) net.set_train(False) # define loss, model @@ -74,7 +74,7 @@ def eval_net(): loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean') # define model - model = ms.Model(net, loss_fn=loss, metrics={'top_1_accuracy', 'top_5_accuracy'}) + model = mindspore.Model(net, loss_fn=loss, metrics={'top_1_accuracy', 'top_5_accuracy'}) # eval model res = model.eval(dataset) diff --git a/official/cv/ResNet/golden_stick/quantization/simqat/train.py b/official/cv/ResNet/golden_stick/quantization/simqat/train.py index a9924332982924b017ba7f247b66a489ccf7ef59..85357462894e3b109c99fe0ad210c885aebd2273 100644 --- a/official/cv/ResNet/golden_stick/quantization/simqat/train.py +++ b/official/cv/ResNet/golden_stick/quantization/simqat/train.py @@ -16,7 +16,7 @@ import os import numpy as np -import mindspore as ms +import mindspore import mindspore.nn as nn from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor from mindspore.communication.management import init, get_rank @@ -35,7 +35,7 @@ if config.dataset == "cifar10": else: from src.dataset import create_dataset2 as create_dataset -ms.set_seed(1) +mindspore.set_seed(1) class LossCallBack(LossMonitor): @@ -53,10 +53,10 @@ class LossCallBack(LossMonitor): loss = cb_params.net_outputs if isinstance(loss, (tuple, list)): - if isinstance(loss[0], ms.Tensor) and isinstance(loss[0].asnumpy(), np.ndarray): + if isinstance(loss[0], mindspore.Tensor) and isinstance(loss[0].asnumpy(), np.ndarray): loss = loss[0] - if isinstance(loss, ms.Tensor) and isinstance(loss.asnumpy(), np.ndarray): + if isinstance(loss, mindspore.Tensor) and isinstance(loss.asnumpy(), np.ndarray): loss = np.mean(loss.asnumpy()) cur_step_in_epoch = (cb_params.cur_step_num - 1) % cb_params.batch_num + 1 @@ -88,27 +88,27 @@ def set_parameter(): # init context if 
config.mode_name == "GRAPH": - ms.set_context(mode=ms.GRAPH_MODE, device_target=target, save_graphs=False) + mindspore.set_context(mode=0, device_target=target, save_graphs=False) else: - ms.set_context(mode=ms.PYNATIVE_MODE, device_target=target, save_graphs=False) + mindspore.set_context(mode=1, device_target=target, save_graphs=False) if config.run_distribute: if target == "Ascend": device_id = int(os.getenv('DEVICE_ID')) - ms.set_context(device_id=device_id) - ms.set_auto_parallel_context(device_num=config.device_num, parallel_mode=ms.ParallelMode.DATA_PARALLEL, + mindspore.set_context(device_id=device_id) + mindspore.set_auto_parallel_context(device_num=config.device_num, parallel_mode=mindspore.ParallelMode.DATA_PARALLEL, gradients_mean=True) set_algo_parameters(elementwise_op_strategy_follow=True) if config.boost_mode not in ["O1", "O2"]: - ms.set_auto_parallel_context(all_reduce_fusion_config=config.all_reduce_fusion_config) + mindspore.set_auto_parallel_context(all_reduce_fusion_config=config.all_reduce_fusion_config) init() # GPU target else: init() - ms.set_auto_parallel_context(device_num=config.device_num, - parallel_mode=ms.ParallelMode.DATA_PARALLEL, + mindspore.set_auto_parallel_context(device_num=config.device_num, + parallel_mode=mindspore.ParallelMode.DATA_PARALLEL, gradients_mean=True) - ms.set_auto_parallel_context(all_reduce_fusion_config=config.all_reduce_fusion_config) + mindspore.set_auto_parallel_context(all_reduce_fusion_config=config.all_reduce_fusion_config) def init_weight(net): @@ -116,7 +116,7 @@ def init_weight(net): for _, cell in net.cells_and_names(): if isinstance(cell, nn.Conv2d): if config.conv_init == "XavierUniform": - cell.weight.set_data(ms.common.initializer.initializer(ms.common.initializer.XavierUniform(), + cell.weight.set_data(mindspore.common.initializer.initializer(mindspore.common.initializer.XavierUniform(), cell.weight.shape, cell.weight.dtype)) elif config.conv_init == "TruncatedNormal": @@ -126,25 +126,25 @@ def init_weight(net): cell.weight.set_data(weight) if isinstance(cell, nn.Dense): if config.dense_init == "TruncatedNormal": - cell.weight.set_data(ms.common.initializer.initializer(ms.common.initializer.TruncatedNormal(), + cell.weight.set_data(mindspore.common.initializer.initializer(mindspore.common.initializer.TruncatedNormal(), cell.weight.shape, cell.weight.dtype)) elif config.dense_init == "RandomNormal": in_channel = cell.in_channels out_channel = cell.out_channels weight = np.random.normal(loc=0, scale=0.01, size=out_channel * in_channel) - weight = ms.Tensor(np.reshape(weight, (out_channel, in_channel)), dtype=cell.weight.dtype) + weight = mindspore.Tensor(np.reshape(weight, (out_channel, in_channel)), dtype=cell.weight.dtype) cell.weight.set_data(weight) def load_fp32_ckpt(net): if config.fp32_ckpt: if os.path.isfile(config.fp32_ckpt): - ckpt = ms.load_checkpoint(config.fp32_ckpt) + ckpt = mindspore.load_checkpoint(config.fp32_ckpt) if config.filter_weight: filter_list = [x.name for x in net.end_point.get_parameters()] filter_checkpoint_parameter_by_list(ckpt, filter_list) - ms.load_param_into_net(net, ckpt) + mindspore.load_param_into_net(net, ckpt) else: print(f"Invalid fp32_ckpt {config.fp32_ckpt} parameter.") @@ -152,7 +152,7 @@ def load_fp32_ckpt(net): def load_pretrained_ckpt(net): if config.pre_trained: if os.path.isfile(config.pre_trained): - ckpt = ms.load_checkpoint(config.pre_trained) + ckpt = mindspore.load_checkpoint(config.pre_trained) if ckpt.get("epoch_num") and ckpt.get("step_num"): 
config.has_trained_epoch = int(ckpt["epoch_num"].data.asnumpy()) config.has_trained_step = int(ckpt["step_num"].data.asnumpy()) @@ -168,7 +168,7 @@ def load_pretrained_ckpt(net): if config.filter_weight: filter_list = [x.name for x in net.end_point.get_parameters()] filter_checkpoint_parameter_by_list(ckpt, filter_list) - not_load_param, _ = ms.load_param_into_net(net, ckpt) + not_load_param, _ = mindspore.load_param_into_net(net, ckpt) if not_load_param: raise RuntimeError("Load param into net fail.") else: @@ -239,7 +239,7 @@ def train_net(): lr_decay_mode='cosine') if config.pre_trained: lr = lr[config.has_trained_epoch * step_size:] - lr = ms.Tensor(lr) + lr = mindspore.Tensor(lr) # define opt group_params = init_group_params(net) opt = nn.Momentum(group_params, lr, config.momentum, weight_decay=config.weight_decay, loss_scale=config.loss_scale) @@ -250,7 +250,7 @@ def train_net(): metrics = {"acc"} if config.run_distribute: metrics = {'acc': DistAccuracy(batch_size=config.batch_size, device_num=config.device_num)} - model = ms.Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics=metrics, + model = mindspore.Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics=metrics, amp_level="O0", boost_level=config.boost_mode, keep_batchnorm_fp32=False, boost_config_dict={"grad_freeze": {"total_steps": config.epoch_size * step_size}}) diff --git a/official/cv/ResNet/golden_stick/quantization/slb/eval.py b/official/cv/ResNet/golden_stick/quantization/slb/eval.py index 1f078f61650d2fcf6ad04e8645c1404cc595ea85..bbbb659e81137226b9578cd155a6e7e6590db970 100644 --- a/official/cv/ResNet/golden_stick/quantization/slb/eval.py +++ b/official/cv/ResNet/golden_stick/quantization/slb/eval.py @@ -14,7 +14,7 @@ # ============================================================================ """eval resnet.""" -import mindspore as ms +import mindspore import mindspore.log as logger from mindspore.nn.loss import SoftmaxCrossEntropyWithLogits from slb import create_slb @@ -26,7 +26,7 @@ if config.dataset == "cifar10": else: from src.dataset import create_dataset2 as create_dataset -ms.set_seed(1) +mindspore.set_seed(1) def eval_net(): """eval net""" @@ -36,9 +36,9 @@ def eval_net(): # init context if config.mode_name == "GRAPH": - ms.set_context(mode=ms.GRAPH_MODE, device_target=target, save_graphs=False) + mindspore.set_context(mode=0, device_target=target, save_graphs=False) else: - ms.set_context(mode=ms.PYNATIVE_MODE, device_target=target, save_graphs=False) + mindspore.set_context(mode=1, device_target=target, save_graphs=False) # create dataset dataset = create_dataset(dataset_path=config.data_path, do_train=False, batch_size=config.batch_size, @@ -50,15 +50,15 @@ def eval_net(): net = algo.apply(net) # load checkpoint - param_dict = ms.load_checkpoint(config.checkpoint_file_path) - ms.load_param_into_net(net, param_dict) + param_dict = mindspore.load_checkpoint(config.checkpoint_file_path) + mindspore.load_param_into_net(net, param_dict) net.set_train(False) # define loss loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean') # define model - model = ms.Model(net, loss_fn=loss, metrics={'top_1_accuracy', 'top_5_accuracy'}) + model = mindspore.Model(net, loss_fn=loss, metrics={'top_1_accuracy', 'top_5_accuracy'}) # eval model res = model.eval(dataset) diff --git a/official/cv/ResNet/golden_stick/quantization/slb/train.py b/official/cv/ResNet/golden_stick/quantization/slb/train.py index 
b7c6a80e9eba9dbb4d5a0b035dc69c8a26bb23bc..7efbdda6c98183deebe9dbeb2ea34e2982f570f3 100644 --- a/official/cv/ResNet/golden_stick/quantization/slb/train.py +++ b/official/cv/ResNet/golden_stick/quantization/slb/train.py @@ -16,7 +16,7 @@ import os import numpy as np -import mindspore as ms +import mindspore import mindspore.nn as nn from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor from mindspore.communication.management import init, get_rank @@ -34,7 +34,7 @@ if config.dataset == "cifar10": else: from src.dataset import create_dataset2 as create_dataset -ms.set_seed(1) +mindspore.set_seed(1) class LossCallBack(LossMonitor): @@ -52,10 +52,10 @@ class LossCallBack(LossMonitor): loss = cb_params.net_outputs if isinstance(loss, (tuple, list)): - if isinstance(loss[0], ms.Tensor) and isinstance(loss[0].asnumpy(), np.ndarray): + if isinstance(loss[0], mindspore.Tensor) and isinstance(loss[0].asnumpy(), np.ndarray): loss = loss[0] - if isinstance(loss, ms.Tensor) and isinstance(loss.asnumpy(), np.ndarray): + if isinstance(loss, mindspore.Tensor) and isinstance(loss.asnumpy(), np.ndarray): loss = np.mean(loss.asnumpy()) cur_step_in_epoch = (cb_params.cur_step_num - 1) % cb_params.batch_num + 1 @@ -84,18 +84,18 @@ def set_parameter(): # init context if config.mode_name == "GRAPH": - ms.set_context(mode=ms.GRAPH_MODE, device_target=target, save_graphs=False) + mindspore.set_context(mode=0, device_target=target, save_graphs=False) else: - ms.set_context(mode=ms.PYNATIVE_MODE, device_target=target, save_graphs=False) + mindspore.set_context(mode=1, device_target=target, save_graphs=False) if config.run_distribute: # GPU target init() - ms.set_auto_parallel_context(device_num=config.device_num, - parallel_mode=ms.ParallelMode.DATA_PARALLEL, + mindspore.set_auto_parallel_context(device_num=config.device_num, + parallel_mode=mindspore.ParallelMode.DATA_PARALLEL, gradients_mean=True) # Allreduce is not supported for network with dynamic control flow. 
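        # With size-mode fusion and a 0 MB threshold, gradient allreduces are effectively left unfused here.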
- ms.set_auto_parallel_context(comm_fusion={"allreduce": {"mode": "size", "config": 0}}) + mindspore.set_auto_parallel_context(comm_fusion={"allreduce": {"mode": "size", "config": 0}}) def init_weight(net): @@ -103,7 +103,7 @@ def init_weight(net): for _, cell in net.cells_and_names(): if isinstance(cell, nn.Conv2d): if config.conv_init == "XavierUniform": - cell.weight.set_data(ms.common.initializer.initializer(ms.common.initializer.XavierUniform(), + cell.weight.set_data(mindspore.common.initializer.initializer(mindspore.common.initializer.XavierUniform(), cell.weight.shape, cell.weight.dtype)) elif config.conv_init == "TruncatedNormal": @@ -113,14 +113,14 @@ def init_weight(net): cell.weight.set_data(weight) if isinstance(cell, nn.Dense): if config.dense_init == "TruncatedNormal": - cell.weight.set_data(ms.common.initializer.initializer(ms.common.initializer.TruncatedNormal(), + cell.weight.set_data(mindspore.common.initializer.initializer(mindspore.common.initializer.TruncatedNormal(), cell.weight.shape, cell.weight.dtype)) elif config.dense_init == "RandomNormal": in_channel = cell.in_channels out_channel = cell.out_channels weight = np.random.normal(loc=0, scale=0.01, size=out_channel * in_channel) - weight = ms.Tensor(np.reshape(weight, (out_channel, in_channel)), dtype=cell.weight.dtype) + weight = mindspore.Tensor(np.reshape(weight, (out_channel, in_channel)), dtype=cell.weight.dtype) cell.weight.set_data(weight) @@ -128,7 +128,7 @@ def get_pretrained_epoch(net): """get_pretrained_epoch""" if config.pre_trained: if os.path.isfile(config.pre_trained): - ckpt = ms.load_checkpoint(config.pre_trained) + ckpt = mindspore.load_checkpoint(config.pre_trained) if ckpt.get("epoch_num") and ckpt.get("step_num"): config.has_trained_epoch = int(ckpt["epoch_num"].data.asnumpy()) config.has_trained_step = int(ckpt["step_num"].data.asnumpy()) @@ -151,7 +151,7 @@ def load_pretrained_ckpt(net): """load_pretrained_ckpt""" if config.pre_trained: if os.path.isfile(config.pre_trained): - ckpt = ms.load_checkpoint(config.pre_trained) + ckpt = mindspore.load_checkpoint(config.pre_trained) if ckpt.get("epoch_num") and ckpt.get("step_num"): config.has_trained_epoch = int(ckpt["epoch_num"].data.asnumpy()) config.has_trained_step = int(ckpt["step_num"].data.asnumpy()) @@ -166,7 +166,7 @@ def load_pretrained_ckpt(net): if config.filter_weight: filter_list = [x.name for x in net.end_point.get_parameters()] filter_checkpoint_parameter_by_list(ckpt, filter_list) - not_load_param, _ = ms.load_param_into_net(net, ckpt) + not_load_param, _ = mindspore.load_param_into_net(net, ckpt) if not_load_param: raise RuntimeError("Load param into net fail.") else: @@ -237,7 +237,7 @@ def train_net(): lr_decay_mode=config.lr_decay_mode) if config.pre_trained: lr = lr[config.has_trained_epoch * step_size:] - lr = ms.Tensor(lr) + lr = mindspore.Tensor(lr) # define optimizer group_params = init_group_params(net) if config.optimizer == 'Momentum': @@ -251,7 +251,7 @@ def train_net(): metrics = {"acc"} if config.run_distribute: metrics = {'acc': DistAccuracy(batch_size=config.batch_size, device_num=config.device_num)} - model = ms.Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics=metrics, + model = mindspore.Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics=metrics, amp_level="O0", boost_level=config.boost_mode, keep_batchnorm_fp32=False, eval_network=dist_eval_network, boost_config_dict={"grad_freeze": {"total_steps": config.epoch_size * step_size}}) diff --git 
a/official/cv/ResNet/gpu_resnet_benchmark.py b/official/cv/ResNet/gpu_resnet_benchmark.py index 43ac12d17f6a94581a4d8762fc5ef7c4f60e762f..ee2d7d94ba4bcbac5a907d7068f6b9bf9ed9a30f 100644 --- a/official/cv/ResNet/gpu_resnet_benchmark.py +++ b/official/cv/ResNet/gpu_resnet_benchmark.py @@ -16,7 +16,7 @@ import os import time import numpy as np -import mindspore as ms +import mindspore import mindspore.nn as nn import mindspore.dataset as ds from mindspore.train.callback import Callback, ModelCheckpoint, CheckpointConfig @@ -27,7 +27,7 @@ from src.momentum import Momentum as MomentumWeightDecay from src.model_utils.config import config from src.model_utils.moxing_adapter import moxing_wrapper -ms.set_seed(1) +mindspore.set_seed(1) class MyTimeMonitor(Callback): def __init__(self, batch_size, sink_size, dataset_size, mode): @@ -45,10 +45,10 @@ class MyTimeMonitor(Callback): loss = cb_params.net_outputs if isinstance(loss, (tuple, list)): - if isinstance(loss[0], ms.Tensor) and isinstance(loss[0].asnumpy(), np.ndarray): + if isinstance(loss[0], mindspore.Tensor) and isinstance(loss[0].asnumpy(), np.ndarray): loss = loss[0] - if isinstance(loss, ms.Tensor) and isinstance(loss.asnumpy(), np.ndarray): + if isinstance(loss, mindspore.Tensor) and isinstance(loss.asnumpy(), np.ndarray): loss = np.mean(loss.asnumpy()) @@ -148,17 +148,17 @@ def train(): device_num = 1 # init context if config.mode_name == "GRAPH": - mode = ms.GRAPH_MODE + mode = 0 all_reduce_fusion_config = [85, 160] else: - mode = ms.PYNATIVE_MODE + mode = 1 all_reduce_fusion_config = [30, 90, 160] - ms.set_context(mode=mode, device_target=dev, save_graphs=False) + mindspore.set_context(mode=mode, device_target=dev, save_graphs=False) ckpt_save_dir = os.path.join(config.output_dir, config.checkpoint_path) if config.run_distribute: init() device_num = get_group_size() - ms.set_auto_parallel_context(device_num=device_num, parallel_mode=ms.ParallelMode.DATA_PARALLEL, + mindspore.set_auto_parallel_context(device_num=device_num, parallel_mode=mindspore.ParallelMode.DATA_PARALLEL, gradients_mean=True, all_reduce_fusion_config=all_reduce_fusion_config) ckpt_save_dir = ckpt_save_dir + "ckpt_" + str(get_rank()) + "/" @@ -176,18 +176,18 @@ def train(): # init weight for _, cell in net.cells_and_names(): if isinstance(cell, nn.Conv2d): - cell.weight.set_data(ms.common.initializer.initializer(ms.common.initializer.XavierUniform(), + cell.weight.set_data(mindspore.common.initializer.initializer(mindspore.common.initializer.XavierUniform(), cell.weight.shape, cell.weight.dtype)) if isinstance(cell, nn.Dense): - cell.weight.set_data(ms.common.initializer.initializer(ms.common.initializer.TruncatedNormal(), + cell.weight.set_data(mindspore.common.initializer.initializer(mindspore.common.initializer.TruncatedNormal(), cell.weight.shape, cell.weight.dtype)) # init lr lr = get_liner_lr(lr_init=0, lr_end=0, lr_max=0.8, warmup_epochs=0, total_epochs=epoch_size, steps_per_epoch=step_size) - lr = ms.Tensor(lr) + lr = mindspore.Tensor(lr) # define opt decayed_params = [] @@ -201,18 +201,18 @@ def train(): # define loss, model loss = CrossEntropySmooth(sparse=True, reduction='mean', smooth_factor=0.1, num_classes=1001) opt = nn.Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, 0.9, 1e-4) - loss_scale = ms.FixedLossScaleManager(1024, drop_overflow_update=False) - model = ms.Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'}) + loss_scale = mindspore.FixedLossScaleManager(1024, drop_overflow_update=False) + model = mindspore.Model(net, 
loss_fn=loss, optimizer=opt, metrics={'acc'}) # Mixed precision if compute_type == "fp16": - if mode == ms.PYNATIVE_MODE: + if mode == 1: opt = MomentumWeightDecay(filter(lambda x: x.requires_grad, net.get_parameters()), lr, 0.9, 1e-4, 1024) else: opt = nn.Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, 0.9, 1e-4, 1024) - model = ms.Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics={'acc'}, + model = mindspore.Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics={'acc'}, amp_level="O2", keep_batchnorm_fp32=False) # define callbacks - if mode == ms.PYNATIVE_MODE: + if mode == 1: print_per_steps = 1 time_cb = MyTimeMonitor(total_batch, print_per_steps, step_size, mode) cb = [time_cb] @@ -222,7 +222,7 @@ def train(): cb += [ckpt_cb] # train model print("========START RESNET50 GPU BENCHMARK========") - if mode == ms.GRAPH_MODE: + if mode == 0: model.train(int(epoch_size * step_size / print_per_steps), dataset, \ callbacks=cb, sink_size=print_per_steps, dataset_sink_mode=True) else: @@ -237,23 +237,23 @@ def eval_(): total_batch = int(config.batch_size) # init context if config.mode_name == "GRAPH": - mode = ms.GRAPH_MODE + mode = 0 else: - mode = ms.PYNATIVE_MODE - ms.set_context(mode=mode, device_target=dev, save_graphs=False) + mode = 1 + mindspore.set_context(mode=mode, device_target=dev, save_graphs=False) # create dataset dataset = create_dataset(dataset_path=config.data_path, do_train=False, repeat_num=1, batch_size=total_batch, target=dev, dtype=compute_type) # define net net = resnet(class_num=1001, dtype=compute_type) # load checkpoint - param_dict = ms.load_checkpoint(ckpt_dir) - ms.load_param_into_net(net, param_dict) + param_dict = mindspore.load_checkpoint(ckpt_dir) + mindspore.load_param_into_net(net, param_dict) net.set_train(False) # define loss, model loss = CrossEntropySmooth(sparse=True, reduction='mean', smooth_factor=0.1, num_classes=1001) # define model - model = ms.Model(net, loss_fn=loss, metrics={'top_1_accuracy', 'top_5_accuracy'}) + model = mindspore.Model(net, loss_fn=loss, metrics={'top_1_accuracy', 'top_5_accuracy'}) # eval model print("========START EVAL RESNET50 ON GPU ========") res = model.eval(dataset) diff --git a/official/cv/ResNet/infer.py b/official/cv/ResNet/infer.py index 0fde653ebb2ee5da3fde5a0860a30a914b223b04..ae87c7678d9cf8e756c9af5ef0fe4d876b22f53d 100644 --- a/official/cv/ResNet/infer.py +++ b/official/cv/ResNet/infer.py @@ -15,7 +15,7 @@ """train resnet.""" import os import numpy as np -import mindspore as ms +import mindspore from src.model_utils.config import config from src.model_utils.moxing_adapter import moxing_wrapper @@ -54,10 +54,10 @@ def infer_net(): target = config.device_target # init context - ms.set_context(mode=ms.GRAPH_MODE, device_target=target, save_graphs=False) + mindspore.set_context(mode=0, device_target=target, save_graphs=False) if target == "Ascend": device_id = int(os.getenv('DEVICE_ID', '0')) - ms.set_context(device_id=device_id) + mindspore.set_context(device_id=device_id) # create dataset dataset = create_dataset(dataset_path=config.data_path, do_train=False, batch_size=config.batch_size, @@ -68,8 +68,8 @@ def infer_net(): net = resnet(class_num=config.class_num) # load checkpoint - param_dict = ms.load_checkpoint(config.checkpoint_file_path) - ms.load_param_into_net(net, param_dict) + param_dict = mindspore.load_checkpoint(config.checkpoint_file_path) + mindspore.load_param_into_net(net, param_dict) net.set_train(False) print("start infer") 
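The infer.py hunks above and below only swap the ms alias for the fully spelled mindspore module; the load-checkpoint-then-predict flow itself is unchanged. A minimal, self-contained sketch of that flow under the new import style follows (the toy Dense network and the toy.ckpt path are placeholders for illustration, not part of this patch):

import numpy as np
import mindspore
from mindspore import nn

# Toy stand-in for the ResNet built in infer.py; any nn.Cell works the same way.
net = nn.Dense(12, 4)
mindspore.save_checkpoint(net, "toy.ckpt")  # stands in for config.checkpoint_file_path

# Same pattern as infer.py after the rename: load the parameter dict,
# push it into the network, then switch to inference mode.
param_dict = mindspore.load_checkpoint("toy.ckpt")
mindspore.load_param_into_net(net, param_dict)
net.set_train(False)

# One synthetic batch in place of the dataset iterator used in the next hunk.
images = np.random.rand(2, 12).astype(np.float32)
res = net(mindspore.Tensor(images)).asnumpy()
predict_id = np.argmax(res, axis=1)
print(predict_id)

Since import mindspore as ms merely aliased the module, each renamed call is behavior-identical to its ms.-prefixed predecessor.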
@@ -81,7 +81,7 @@ def infer_net(): images = data["image"] label = data["label"] file_name = data["filename"] - res = net(ms.Tensor(images)) + res = net(mindspore.Tensor(images)) res = res.asnumpy() predict_id = np.argmax(res, axis=1) predict_negative, only_file = show_predict_info(label.tolist(), predict_id.tolist(), diff --git a/official/cv/ResNet/infer/ResNet152/sdk/classification_task_metric.py b/official/cv/ResNet/infer/ResNet152/sdk/classification_task_metric.py index 4c82151c1ad0b0fd8322b8e704e670a871963c27..a830f7d2cfd87c1a4ba6d8e6d79cd514ce49880b 100644 --- a/official/cv/ResNet/infer/ResNet152/sdk/classification_task_metric.py +++ b/official/cv/ResNet/infer/ResNet152/sdk/classification_task_metric.py @@ -69,7 +69,7 @@ def load_statistical_predict_result(filepath): data_vec = np.zeros((len(temp)), dtype=np.float32) if n_label != 0: for ind, cls_ind in enumerate(temp): - data_vec[ind] = np.int(cls_ind) + data_vec[ind] = np.int_(cls_ind) return data_vec, n_label diff --git a/official/cv/ResNet/modelarts/ResNet152/train_start.py b/official/cv/ResNet/modelarts/ResNet152/train_start.py index f8f33ff4c630d52a6984f880a67abbadd3c768eb..a1eed503d9ea5747a49d9483580e105b10feeabd 100644 --- a/official/cv/ResNet/modelarts/ResNet152/train_start.py +++ b/official/cv/ResNet/modelarts/ResNet152/train_start.py @@ -19,7 +19,7 @@ import os import numpy as np import moxing as mox -import mindspore as ms +import mindspore import mindspore.nn as nn from mindspore.train.train_thor import ConvertModelUtils from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor @@ -39,7 +39,7 @@ from src.model_utils.device_adapter import get_rank_id, get_device_num from src.resnet import conv_variance_scaling_initializer -ms.set_seed(1) +mindspore.set_seed(1) class LossCallBack(LossMonitor): @@ -58,10 +58,10 @@ class LossCallBack(LossMonitor): loss = cb_params.net_outputs if isinstance(loss, (tuple, list)): - if isinstance(loss[0], ms.Tensor) and isinstance(loss[0].asnumpy(), np.ndarray): + if isinstance(loss[0], mindspore.Tensor) and isinstance(loss[0].asnumpy(), np.ndarray): loss = loss[0] - if isinstance(loss, ms.Tensor) and isinstance(loss.asnumpy(), np.ndarray): + if isinstance(loss, mindspore.Tensor) and isinstance(loss.asnumpy(), np.ndarray): loss = np.mean(loss.asnumpy()) cur_step_in_epoch = (cb_params.cur_step_num - 1) % cb_params.batch_num + 1 @@ -118,8 +118,8 @@ def apply_eval(eval_param): def set_graph_kernel_context(run_platform, net_name): if run_platform == "GPU" and net_name == "resnet101": - ms.set_context(enable_graph_kernel=True) - ms.set_context(graph_kernel_flags="--enable_parallel_fusion --enable_expand_ops=Conv2D") + mindspore.set_context(enable_graph_kernel=True) + mindspore.set_context(graph_kernel_flags="--enable_parallel_fusion --enable_expand_ops=Conv2D") def set_parameter(): @@ -134,37 +134,37 @@ def set_parameter(): if config.mode_name == 'GRAPH': if target == "Ascend": rank_save_graphs_path = os.path.join(config.save_graphs_path, "soma", str(os.getenv('DEVICE_ID', '0'))) - ms.set_context(mode=ms.GRAPH_MODE, device_target=target, save_graphs=config.save_graphs, + mindspore.set_context(mode=0, device_target=target, save_graphs=config.save_graphs, save_graphs_path=rank_save_graphs_path) else: - ms.set_context(mode=ms.GRAPH_MODE, device_target=target, save_graphs=config.save_graphs) + mindspore.set_context(mode=0, device_target=target, save_graphs=config.save_graphs) set_graph_kernel_context(target, config.net_name) else: - 
ms.set_context(mode=ms.PYNATIVE_MODE, device_target=target, save_graphs=False) + mindspore.set_context(mode=1, device_target=target, save_graphs=False) if config.parameter_server: - ms.set_ps_context(enable_ps=True) + mindspore.set_ps_context(enable_ps=True) if config.run_distribute: if target == "Ascend": device_id = int(os.getenv('DEVICE_ID', '0')) - ms.set_context(device_id=device_id) - ms.set_auto_parallel_context(device_num=config.device_num, parallel_mode=ms.ParallelMode.DATA_PARALLEL, + mindspore.set_context(device_id=device_id) + mindspore.set_auto_parallel_context(device_num=config.device_num, parallel_mode=mindspore.ParallelMode.DATA_PARALLEL, gradients_mean=True) set_algo_parameters(elementwise_op_strategy_follow=True) if config.net_name == "resnet50" or config.net_name == "se-resnet50": if config.boost_mode not in ["O1", "O2"]: - ms.set_auto_parallel_context(all_reduce_fusion_config=config.all_reduce_fusion_config) + mindspore.set_auto_parallel_context(all_reduce_fusion_config=config.all_reduce_fusion_config) elif config.net_name in ["resnet101", "resnet152"]: - ms.set_auto_parallel_context(all_reduce_fusion_config=config.all_reduce_fusion_config) + mindspore.set_auto_parallel_context(all_reduce_fusion_config=config.all_reduce_fusion_config) init() # GPU target else: init() - ms.set_auto_parallel_context(device_num=get_device_num(), - parallel_mode=ms.ParallelMode.DATA_PARALLEL, + mindspore.set_auto_parallel_context(device_num=get_device_num(), + parallel_mode=mindspore.ParallelMode.DATA_PARALLEL, gradients_mean=True) if config.net_name == "resnet50": - ms.set_auto_parallel_context(all_reduce_fusion_config=config.all_reduce_fusion_config) + mindspore.set_auto_parallel_context(all_reduce_fusion_config=config.all_reduce_fusion_config) def load_pre_trained_checkpoint(): @@ -187,9 +187,9 @@ def load_pre_trained_checkpoint(): print(f"time stamp {time_stamp.strftime('%Y.%m.%d-%H:%M:%S')}" f" pre trained ckpt model {ckpt_files[0]} loading", flush=True) - param_dict = ms.load_checkpoint(ckpt_files[0]) + param_dict = mindspore.load_checkpoint(ckpt_files[0]) elif os.path.isfile(config.pre_trained): - param_dict = ms.load_checkpoint(config.pre_trained) + param_dict = mindspore.load_checkpoint(config.pre_trained) else: print(f"Invalid pre_trained {config.pre_trained} parameter.") return param_dict @@ -209,12 +209,12 @@ def init_weight(net, param_dict): if config.filter_weight: filter_list = [x.name for x in net.end_point.get_parameters()] filter_checkpoint_parameter_by_list(param_dict, filter_list) - ms.load_param_into_net(net, param_dict) + mindspore.load_param_into_net(net, param_dict) else: for _, cell in net.cells_and_names(): if isinstance(cell, nn.Conv2d): if config.conv_init == "XavierUniform": - cell.weight.set_data(ms.common.initializer.initializer(ms.common.initializer.XavierUniform(), + cell.weight.set_data(mindspore.common.initializer.initializer(mindspore.common.initializer.XavierUniform(), cell.weight.shape, cell.weight.dtype)) elif config.conv_init == "TruncatedNormal": @@ -224,14 +224,14 @@ def init_weight(net, param_dict): cell.weight.set_data(weight) if isinstance(cell, nn.Dense): if config.dense_init == "TruncatedNormal": - cell.weight.set_data(ms.common.initializer.initializer(ms.common.initializer.TruncatedNormal(), + cell.weight.set_data(mindspore.common.initializer.initializer(mindspore.common.initializer.TruncatedNormal(), cell.weight.shape, cell.weight.dtype)) elif config.dense_init == "RandomNormal": in_channel = cell.in_channels out_channel = cell.out_channels 
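                    # RandomNormal: sample a zero-mean, std-0.01 weight vector and reshape it to the (out_channel, in_channel) Dense layout.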
weight = np.random.normal(loc=0, scale=0.01, size=out_channel * in_channel) - weight = ms.Tensor(np.reshape(weight, (out_channel, in_channel)), dtype=cell.weight.dtype) + weight = mindspore.Tensor(np.reshape(weight, (out_channel, in_channel)), dtype=cell.weight.dtype) cell.weight.set_data(weight) @@ -323,13 +323,13 @@ def _export_air(ckpt_dir): if not ckpt_file: return net = resnet(config.class_num) - param_dict = ms.load_checkpoint(ckpt_file) - ms.load_param_into_net(net, param_dict) + param_dict = mindspore.load_checkpoint(ckpt_file) + mindspore.load_param_into_net(net, param_dict) - input_arr = ms.numpy.zeros([1, 3, 304, 304], ms.float32) + input_arr = mindspore.numpy.zeros([1, 3, 304, 304], mindspore.float32) print("Start export air.") - ms.export(net, input_arr, file_name=config.file_name, file_format="AIR") + mindspore.export(net, input_arr, file_name=config.file_name, file_format="AIR") file_name = config.file_name + ".air" mox.file.copy(file_name, os.path.join(config.output_dir, file_name)) @@ -352,7 +352,7 @@ def train_net(): net.set_param_ps() init_weight(net=net, param_dict=ckpt_param_dict) - lr = ms.Tensor(init_lr(step_size=step_size)) + lr = mindspore.Tensor(init_lr(step_size=step_size)) # define opt group_params = init_group_params(net) opt = nn.Momentum(group_params, lr, config.momentum, loss_scale=config.loss_scale) @@ -360,7 +360,7 @@ def train_net(): opt = nn.LARS(opt, epsilon=config.lars_epsilon, coefficient=config.lars_coefficient, lars_filter=lambda x: 'beta' not in x.name and 'gamma' not in x.name and 'bias' not in x.name) loss = init_loss_scale() - loss_scale = ms.FixedLossScaleManager(config.loss_scale, drop_overflow_update=False) + loss_scale = mindspore.FixedLossScaleManager(config.loss_scale, drop_overflow_update=False) dist_eval_network = ClassifyCorrectCell(net) if config.run_distribute else None metrics = {"acc"} if config.run_distribute: @@ -368,9 +368,9 @@ def train_net(): if (config.net_name not in ("resnet18", "resnet34", "resnet50", "resnet101", "resnet152", "se-resnet50")) or \ config.parameter_server or target == "CPU": ## fp32 training - model = ms.Model(net, loss_fn=loss, optimizer=opt, metrics=metrics, eval_network=dist_eval_network) + model = mindspore.Model(net, loss_fn=loss, optimizer=opt, metrics=metrics, eval_network=dist_eval_network) else: - model = ms.Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics=metrics, + model = mindspore.Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics=metrics, amp_level="O2", # boost_level=config.boost_mode, keep_batchnorm_fp32=False, eval_network=dist_eval_network) @@ -379,7 +379,7 @@ def train_net(): from src.lr_generator import get_thor_damping damping = get_thor_damping(0, config.damping_init, config.damping_decay, 70, step_size) split_indices = [26, 53] - opt = nn.thor(net, lr, ms.Tensor(damping), config.momentum, config.weight_decay, config.loss_scale, + opt = nn.thor(net, lr, mindspore.Tensor(damping), config.momentum, config.weight_decay, config.loss_scale, config.batch_size, split_indices=split_indices, frequency=config.frequency) model = ConvertModelUtils().convert_to_thor_model(model=model, network=net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics={'acc'}, diff --git a/official/cv/ResNet/modelarts/ResNet18/modelarts_train.py b/official/cv/ResNet/modelarts/ResNet18/modelarts_train.py index 2900c32a0b264aaf33af576918e4529cd4ba44bd..424875e791ccb66737c99536e27e800bef290da8 100644 --- 
a/official/cv/ResNet/modelarts/ResNet18/modelarts_train.py +++ b/official/cv/ResNet/modelarts/ResNet18/modelarts_train.py @@ -17,7 +17,7 @@ import os import argparse import ast import moxing as mox -import mindspore as ms +import mindspore import mindspore.nn as nn import mindspore.log as logger @@ -93,7 +93,7 @@ args_opt = parser.parse_args() CKPT_OUTPUT_PATH = "./output" -ms.set_seed(1) +mindspore.set_seed(1) if config.optimizer == "Thor": if args_opt.device_target == "Ascend": @@ -122,7 +122,7 @@ def apply_eval(eval_param): def set_graph_kernel_context(run_platform, net_name): if run_platform == "GPU" and net_name == "resnet101": - ms.set_context(enable_graph_kernel=True, + mindspore.set_context(enable_graph_kernel=True, graph_kernel_flags="--enable_parallel_fusion") @@ -141,12 +141,12 @@ def _export_air(ckpt_dir): if not ckpt_file: return net = resnet(config.class_num) - param_dict = ms.load_checkpoint(ckpt_file) - ms.load_param_into_net(net, param_dict) + param_dict = mindspore.load_checkpoint(ckpt_file) + mindspore.load_param_into_net(net, param_dict) - input_arr = ms.numpy.zeros([1, 3, 304, 304], ms.float32) + input_arr = mindspore.numpy.zeros([1, 3, 304, 304], mindspore.float32) file_path = os.path.join(args_opt.train_url, "resnet") - ms.export(net, input_arr, file_name=file_path, file_format="AIR") + mindspore.export(net, input_arr, file_name=file_path, file_format="AIR") def set_config(): @@ -156,58 +156,58 @@ def set_config(): def init_context(target): if args_opt.mode == 'GRAPH': - ms.set_context(mode=ms.GRAPH_MODE, device_target=target, save_graphs=False) + mindspore.set_context(mode=0, device_target=target, save_graphs=False) set_graph_kernel_context(target, args_opt.net) else: - ms.set_context(mode=ms.PYNATIVE_MODE, device_target=target, save_graphs=False) + mindspore.set_context(mode=1, device_target=target, save_graphs=False) if args_opt.parameter_server: - ms.set_ps_context(enable_ps=True) + mindspore.set_ps_context(enable_ps=True) if args_opt.run_distribute: if target == "Ascend": device_id = int(os.getenv('DEVICE_ID', '0')) - ms.set_context(device_id=device_id) - ms.set_auto_parallel_context( + mindspore.set_context(device_id=device_id) + mindspore.set_auto_parallel_context( device_num=args_opt.device_num, - parallel_mode=ms.ParallelMode.DATA_PARALLEL, + parallel_mode=mindspore.ParallelMode.DATA_PARALLEL, gradients_mean=True) set_algo_parameters(elementwise_op_strategy_follow=True) if args_opt.net == "resnet50" or args_opt.net == "se-resnet50": - ms.set_auto_parallel_context( + mindspore.set_auto_parallel_context( all_reduce_fusion_config=[85, 160]) elif args_opt.net == "resnet101": - ms.set_auto_parallel_context( + mindspore.set_auto_parallel_context( all_reduce_fusion_config=[80, 210, 313]) init() # GPU target else: init() - ms.set_auto_parallel_context( + mindspore.set_auto_parallel_context( device_num=get_group_size(), - parallel_mode=ms.ParallelMode.DATA_PARALLEL, + parallel_mode=mindspore.ParallelMode.DATA_PARALLEL, gradients_mean=True) if args_opt.net == "resnet50": - ms.set_auto_parallel_context( + mindspore.set_auto_parallel_context( all_reduce_fusion_config=[85, 160]) def init_weight(net): if args_opt.pre_trained: - param_dict = ms.load_checkpoint(args_opt.pre_trained) + param_dict = mindspore.load_checkpoint(args_opt.pre_trained) if args_opt.filter_weight: filter_list = [x.name for x in net.end_point.get_parameters()] filter_checkpoint_parameter_by_list(param_dict, filter_list) - ms.load_param_into_net(net, param_dict) + mindspore.load_param_into_net(net, 
param_dict) else: for _, cell in net.cells_and_names(): if isinstance(cell, nn.Conv2d): cell.weight.set_data( - ms.common.initializer.initializer(ms.common.initializer.XavierUniform(), + mindspore.common.initializer.initializer(mindspore.common.initializer.XavierUniform(), cell.weight.shape, cell.weight.dtype)) if isinstance(cell, nn.Dense): cell.weight.set_data( - ms.common.initializer.initializer(ms.common.initializer.TruncatedNormal(), + mindspore.common.initializer.initializer(mindspore.common.initializer.TruncatedNormal(), cell.weight.shape, cell.weight.dtype)) @@ -229,7 +229,7 @@ def init_lr(step_size): lr = warmup_cosine_annealing_lr( config.lr, step_size, config.warmup_epochs, config.epoch_size, config.pretrain_epoch_size * step_size) - return ms.Tensor(lr) + return mindspore.Tensor(lr) def define_opt(net, lr): @@ -260,7 +260,7 @@ def define_model(net, opt, target): num_classes=config.class_num) else: loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean') - loss_scale = ms.FixedLossScaleManager(config.loss_scale, drop_overflow_update=False) + loss_scale = mindspore.FixedLossScaleManager(config.loss_scale, drop_overflow_update=False) dist_eval_network = ClassifyCorrectCell( net) if args_opt.run_distribute else None @@ -272,10 +272,10 @@ def define_model(net, opt, target): "se-resnet50")) or args_opt.parameter_server \ or target == "CPU": # fp32 training - model = ms.Model(net, loss_fn=loss, optimizer=opt, metrics=metrics, + model = mindspore.Model(net, loss_fn=loss, optimizer=opt, metrics=metrics, eval_network=dist_eval_network) else: - model = ms.Model(net, loss_fn=loss, optimizer=opt, + model = mindspore.Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics=metrics, amp_level="O2", keep_batchnorm_fp32=False, eval_network=dist_eval_network) @@ -345,7 +345,7 @@ def main(): damping = get_thor_damping(0, config.damping_init, config.damping_decay, 70, step_size) split_indices = [26, 53] - opt = nn.thor(net, lr, ms.Tensor(damping), config.momentum, + opt = nn.thor(net, lr, mindspore.Tensor(damping), config.momentum, config.weight_decay, config.loss_scale, config.batch_size, split_indices=split_indices, frequency=config.frequency) diff --git a/official/cv/ResNet/predict.py b/official/cv/ResNet/predict.py index 5a898b7333ea3921b1a38016ba3cffb3c24ffabb..2ebde600a8472cc658a35deab84af7afb05090d4 100644 --- a/official/cv/ResNet/predict.py +++ b/official/cv/ResNet/predict.py @@ -26,11 +26,11 @@ import time import numpy as np from PIL import Image -import mindspore as ms +import mindspore import mindspore.dataset as ds from src.model_utils.config import config -ms.set_seed(1) +mindspore.set_seed(1) if config.net_name in ("resnet18", "resnet34", "resnet50", "resnet152"): if config.net_name == "resnet18": @@ -56,10 +56,10 @@ def create_model(): # load checkpoint if config.checkpoint_file_path: - param_dict = ms.load_checkpoint(config.checkpoint_file_path) - ms.load_param_into_net(net, param_dict) + param_dict = mindspore.load_checkpoint(config.checkpoint_file_path) + mindspore.load_param_into_net(net, param_dict) net.set_train(False) - ms_model = ms.Model(net) + ms_model = mindspore.Model(net) return ms_model @@ -72,7 +72,7 @@ def read_image(img_path): ds.vision.HWC2CHW()] for transform in transform_list: img = transform(img) - img = ms.Tensor(np.expand_dims(img, axis=0), ms.float32) + img = mindspore.Tensor(np.expand_dims(img, axis=0), mindspore.float32) return img @@ -102,7 +102,7 @@ def predict_mindir(data_input): raise RuntimeError("Only support single 
input in this net.") inputs[0].set_data_from_numpy(data_input.asnumpy()) outputs = lite_mode_input.predict(inputs) - return ms.Tensor(outputs[0].get_data_to_numpy()) + return mindspore.Tensor(outputs[0].get_data_to_numpy()) def _get_lite_context(l_context): lite_context_properties = { @@ -111,18 +111,18 @@ def predict_mindir(data_input): "gpu": ["device_id", "precision_mode"], "ascend": ["device_id", "precision_mode", "provider", "rank_id"] } - lite_device_target = ms.get_context('device_target').lower() + lite_device_target = mindspore.get_context('device_target').lower() if lite_device_target not in ['cpu', 'gpu', 'ascend']: raise RuntimeError(f"Device target should be in ['cpu', 'gpu', 'ascend'], but got {lite_device_target}") l_context.target = [lite_device_target] l_context_device_dict = {'cpu': l_context.cpu, 'gpu': l_context.gpu, 'ascend': l_context.ascend} for single_property in lite_context_properties.get(lite_device_target): try: - context_value = ms.get_context(single_property) + context_value = mindspore.get_context(single_property) if context_value: setattr(l_context_device_dict.get(lite_device_target), single_property, context_value) except ValueError: - print(f'For set lite context, fail to get parameter {single_property} from ms.context.' + print(f'For set lite context, fail to get parameter {single_property} from mindspore.context.' f' Will use default value') return l_context @@ -149,10 +149,10 @@ def predict_net(data_input): """predict net""" target = config.device_target # init context - ms.set_context(mode=ms.GRAPH_MODE, device_target=target, save_graphs=False) + mindspore.set_context(mode=0, device_target=target, save_graphs=False) if target == "Ascend": device_id = int(os.getenv('DEVICE_ID')) - ms.set_context(device_id=device_id) + mindspore.set_context(device_id=device_id) # model predict ms_model = create_model() diff --git a/official/cv/ResNet/scripts/run_distribute_train.sh b/official/cv/ResNet/scripts/run_distribute_train.sh index 7cb80caaac679acc3aee021c3896268701311c9b..a9a464c674a8d0206d738dff434846829625ce84 100644 --- a/official/cv/ResNet/scripts/run_distribute_train.sh +++ b/official/cv/ResNet/scripts/run_distribute_train.sh @@ -97,6 +97,7 @@ ulimit -u unlimited export DEVICE_NUM=8 export RANK_SIZE=8 export RANK_TABLE_FILE=$PATH1 +export MS_ASCEND_CHECK_OVERFLOW_MODE="SATURATION_MODE" export SERVER_ID=0 rank_start=$((DEVICE_NUM * SERVER_ID)) diff --git a/benchmark/ascend/resnet/scripts/run_standalone_train_gpu.sh b/official/cv/ResNet/scripts/run_distribute_train_msrun.sh similarity index 34% rename from benchmark/ascend/resnet/scripts/run_standalone_train_gpu.sh rename to official/cv/ResNet/scripts/run_distribute_train_msrun.sh index b65d9df4719fb51532a49f5dfa75ffdc0275f162..156daf70ac985c242d513e35d5299706896644ee 100644 --- a/benchmark/ascend/resnet/scripts/run_standalone_train_gpu.sh +++ b/official/cv/ResNet/scripts/run_distribute_train_msrun.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright 2020-2021 Huawei Technologies Co., Ltd +# Copyright 2024 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -18,11 +18,13 @@ CURPATH="$(dirname "$0")" # shellcheck source=/dev/null . 
${CURPATH}/cache_util.sh -if [ $# != 2 ] && [ $# != 3 ] && [ $# != 4 ] -then - echo "Usage: bash run_standalone_train_gpu.sh [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](optional)" - echo " bash run_standalone_train_gpu.sh [DATASET_PATH] [CONFIG_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional)" -exit 1 +if [ $# != 2 ] && [ $# != 3 ] && [ $# != 4 ] && [ $# != 5 ] +then + echo "Usage: bash run_distribute_train_msrun.sh [DATASET_PATH] [CONFIG_PATH]" + echo "Usage: bash run_distribute_train_msrun.sh [DATASET_PATH] [CONFIG_PATH] [RESUME_CKPT](optional)" + echo "Usage: bash run_distribute_train_msrun.sh [DATASET_PATH] [CONFIG_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional)" + echo "Usage: bash run_distribute_train_msrun.sh [DATASET_PATH] [CONFIG_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional) [RESUME_CKPT](optional)" + exit 1 fi get_real_path(){ @@ -35,10 +37,16 @@ get_real_path(){ PATH1=$(get_real_path $1) CONFIG_FILE=$(get_real_path $2) +str="Boost_" +if [[ $CONFIG_FILE =~ $str ]] +then + export MS_DISABLE_REF_MODE=1 + export MS_ENABLE_FORMAT_MODE=0 +fi if [ $# == 3 ] -then - PATH2=$(get_real_path $3) +then + RESUME_CKPT=$(get_real_path $3) fi if [ $# == 4 ] @@ -47,19 +55,25 @@ then EVAL_DATASET_PATH=$(get_real_path $4) fi +if [ $# == 5 ] +then + RUN_EVAL=$3 + EVAL_DATASET_PATH=$(get_real_path $4) + RESUME_CKPT=$(get_real_path $5) +fi + if [ ! -d $PATH1 ] then echo "error: DATASET_PATH=$PATH1 is not a directory" exit 1 -fi +fi -if [ $# == 3 ] && [ ! -f $PATH2 ] +if [ $# == 3 ] && [ ! -f $RESUME_CKPT ] then - echo "error: PRETRAINED_CKPT_PATH=$PATH2 is not a file" + echo "error: RESUME_CKPT=$RESUME_CKPT is not a file" exit 1 fi - if [ "x${RUN_EVAL}" == "xTrue" ] && [ ! -d $EVAL_DATASET_PATH ] then echo "error: EVAL_DATASET_PATH=$EVAL_DATASET_PATH is not a directory" @@ -73,43 +87,52 @@ then fi ulimit -u unlimited -export DEVICE_NUM=1 -export DEVICE_ID=0 -export RANK_ID=0 -export RANK_SIZE=1 +export DEVICE_NUM=8 +export RANK_SIZE=8 +ulimit -u unlimited +export MS_ASCEND_CHECK_OVERFLOW_MODE="SATURATION_MODE" -if [ -d "train" ]; -then - rm -rf ./train -fi -mkdir ./train -cp ../config/*.yaml ./train -cp ../*.py ./train -cp *.sh ./train -cp -r ../src ./train -cd ./train || exit -echo "start training for device $DEVICE_ID" +cd .. 
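# Snapshot the launch environment to env.log before spawning the msrun workers.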
env > env.log +echo "start training" if [ $# == 2 ] -then - python train.py --device_target="GPU" --data_path=$PATH1 \ - --config_path=$CONFIG_FILE --output_path './output' &> log & +then + msrun --bind_core=True --worker_num=8 --local_worker_num=8 --master_port=8118 \ + --log_dir=msrun_log --join=True --cluster_time_out=300 \ + train.py --run_distribute=True --device_num=$RANK_SIZE --data_path=$PATH1 \ + --config_path=$CONFIG_FILE --output_dir './outputs' &> log.txt & fi if [ $# == 3 ] then - python train.py --device_target="GPU" --data_path=$PATH1 --pre_trained=$PATH2 \ - --config_path=$CONFIG_FILE --output_path './output' &> log & + msrun --bind_core=True --worker_num=8 --local_worker_num=8 --master_port=8118 \ + --log_dir=msrun_log --join=True --cluster_time_out=300 \ + train.py --run_distribute=True --device_num=$RANK_SIZE --data_path=$PATH1 --resume_ckpt=$RESUME_CKPT \ + --config_path=$CONFIG_FILE --output_dir './outputs' &> log.txt & fi if [ $# == 4 ] then - python train.py --device_target="GPU" --data_path=$PATH1 --run_eval=$RUN_EVAL \ - --eval_dataset_path=$EVAL_DATASET_PATH --enable_cache=True --cache_session_id=$CACHE_SESSION_ID \ - --config_path=$CONFIG_FILE --output_path './output' &> log & - if [ "x${RUN_EVAL}" == "xTrue" ] - then - echo -e "\nWhen training run is done, remember to shut down the cache server via \"cache_admin --stop\"" - fi + msrun --bind_core=True --worker_num=8 --local_worker_num=8 --master_port=8118 \ + --log_dir=msrun_log --join=True --cluster_time_out=300 \ + train.py --run_distribute=True --device_num=$RANK_SIZE --data_path=$PATH1 \ + --run_eval=$RUN_EVAL --eval_dataset_path=$EVAL_DATASET_PATH --enable_cache=True \ + --cache_session_id=$CACHE_SESSION_ID --config_path=$CONFIG_FILE --output_dir './outputs' &> log.txt & + if [ "x${RUN_EVAL}" == "xTrue" ] + then + echo -e "\nWhen training run is done, remember to shut down the cache server via \"cache_admin --stop\"" + fi +fi + +if [ $# == 5 ] +then + msrun --bind_core=True --worker_num=8 --local_worker_num=8 --master_port=8118 \ + --log_dir=msrun_log --join=True --cluster_time_out=300 \ + train.py --run_distribute=True --device_num=$RANK_SIZE --data_path=$PATH1 \ + --run_eval=$RUN_EVAL --eval_dataset_path=$EVAL_DATASET_PATH --enable_cache=True --resume_ckpt=$RESUME_CKPT \ + --cache_session_id=$CACHE_SESSION_ID --config_path=$CONFIG_FILE --output_dir './outputs' &> log.txt & + if [ "x${RUN_EVAL}" == "xTrue" ] + then + echo -e "\nWhen training run is done, remember to shut down the cache server via \"cache_admin --stop\"" + fi fi -cd .. diff --git a/official/cv/ResNet/src/CrossEntropySmooth.py b/official/cv/ResNet/src/CrossEntropySmooth.py index 1634033c2c4a554fb2c729a3da81bcc2153b3fbd..e129832d91de01411c1abd11a257ed759a554d97 100644 --- a/official/cv/ResNet/src/CrossEntropySmooth.py +++ b/official/cv/ResNet/src/CrossEntropySmooth.py @@ -13,7 +13,7 @@ # limitations under the License. 
# ============================================================================ """define loss function for network""" -import mindspore as ms +import mindspore import mindspore.nn as nn from mindspore import Tensor from mindspore.nn.loss import LossBase @@ -26,8 +26,8 @@ class CrossEntropySmooth(LossBase): super(CrossEntropySmooth, self).__init__() self.onehot = ops.OneHot() self.sparse = sparse - self.on_value = Tensor(1.0 - smooth_factor, ms.float32) - self.off_value = Tensor(1.0 * smooth_factor / (num_classes - 1), ms.float32) + self.on_value = Tensor(1.0 - smooth_factor, mindspore.float32) + self.off_value = Tensor(1.0 * smooth_factor / (num_classes - 1), mindspore.float32) self.ce = nn.SoftmaxCrossEntropyWithLogits(reduction=reduction) def construct(self, logit, label): diff --git a/official/cv/ResNet/src/callback.py b/official/cv/ResNet/src/callback.py index c34fa6fd558430046b4698eda50747d889a0d669..21536bf354f346e4a21a5b807729459e5c51c3c7 100644 --- a/official/cv/ResNet/src/callback.py +++ b/official/cv/ResNet/src/callback.py @@ -18,7 +18,7 @@ import os import stat import time import numpy as np -import mindspore as ms +import mindspore from mindspore import save_checkpoint from mindspore.train.callback import Callback @@ -45,10 +45,10 @@ class LossCallBack(Callback): data_sink_mode = cb_params.get('dataset_sink_mode', True) if not data_sink_mode: if isinstance(loss, (tuple, list)): - if isinstance(loss[0], ms.Tensor) and isinstance(loss[0].asnumpy(), np.ndarray): + if isinstance(loss[0], mindspore.Tensor) and isinstance(loss[0].asnumpy(), np.ndarray): loss = loss[0] - if isinstance(loss, ms.Tensor) and isinstance(loss.asnumpy(), np.ndarray): + if isinstance(loss, mindspore.Tensor) and isinstance(loss.asnumpy(), np.ndarray): loss = np.mean(loss.asnumpy()) cur_step_in_epoch = (cb_params.cur_step_num - 1) % cb_params.batch_num + 1 @@ -76,10 +76,10 @@ class LossCallBack(Callback): loss = cb_params.net_outputs cur_epoch_num = cb_params.cur_epoch_num if isinstance(loss, (tuple, list)): - if isinstance(loss[0], ms.Tensor) and isinstance(loss[0].asnumpy(), np.ndarray): + if isinstance(loss[0], mindspore.Tensor) and isinstance(loss[0].asnumpy(), np.ndarray): loss = loss[0] - if isinstance(loss, ms.Tensor) and isinstance(loss.asnumpy(), np.ndarray): + if isinstance(loss, mindspore.Tensor) and isinstance(loss.asnumpy(), np.ndarray): loss = np.mean(loss.asnumpy()) epoch_time = time.time() - self.epoch_start_time @@ -156,7 +156,7 @@ class EvalCallBack(Callback): eval_cost = time.time() - eval_start self.logger.info("epoch: {}, {}: {}, eval_cost:{:.2f}".format(cur_epoch, self.metrics_name, res, eval_cost)) if res >= self.best_res: - if ms.context.get_context("enable_ge") and int(os.getenv('MS_DISABLE_REF_MODE')) == 1: + if mindspore.get_context("enable_ge") and int(os.getenv('MS_DISABLE_REF_MODE', default="0")) == 1: from mindspore.train.callback import _set_cur_net _set_cur_net(cb_params.train_network) cb_params.train_network.exec_checkpoint_graph() diff --git a/official/cv/ResNet/src/dataset.py b/official/cv/ResNet/src/dataset.py index 614c0eb1457ed1150618f9be88b129c5badd4a30..1d4ddc87c02f81c8609773dacd6b59c58360d6a9 100644 --- a/official/cv/ResNet/src/dataset.py +++ b/official/cv/ResNet/src/dataset.py @@ -16,7 +16,7 @@ create train or eval dataset. 
""" import multiprocessing -import mindspore as ms +import mindspore import mindspore.dataset as ds from mindspore.communication.management import init, get_rank, get_group_size @@ -61,7 +61,7 @@ def create_dataset1(dataset_path, do_train, batch_size=32, train_image_size=224, ds.vision.HWC2CHW() ] - type_cast_op = ds.transforms.transforms.TypeCast(ms.int32) + type_cast_op = ds.transforms.transforms.TypeCast(mindspore.int32) data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=get_num_parallel_workers(8)) @@ -129,7 +129,7 @@ def create_dataset2(dataset_path, do_train, batch_size=32, train_image_size=224, ] trans_norm = [ds.vision.Normalize(mean=mean, std=std), ds.vision.HWC2CHW()] - type_cast_op = ds.transforms.transforms.TypeCast(ms.int32) + type_cast_op = ds.transforms.transforms.TypeCast(mindspore.int32) if device_num == 1: trans_work_num = 24 else: @@ -202,7 +202,7 @@ def create_dataset3(dataset_path, do_train, batch_size=32, train_image_size=224, ds.vision.HWC2CHW() ] - type_cast_op = ds.transforms.transforms.TypeCast(ms.int32) + type_cast_op = ds.transforms.transforms.TypeCast(mindspore.int32) data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=get_num_parallel_workers(8)) # only enable cache for eval @@ -272,7 +272,7 @@ def create_dataset4(dataset_path, do_train, batch_size=32, train_image_size=224, ds.vision.HWC2CHW() ] - type_cast_op = ds.transforms.transforms.TypeCast(ms.int32) + type_cast_op = ds.transforms.transforms.TypeCast(mindspore.int32) data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=get_num_parallel_workers(12)) # only enable cache for eval if do_train: diff --git a/official/cv/ResNet/src/dataset_infer.py b/official/cv/ResNet/src/dataset_infer.py index 92ff4aab1a900d6c1bdd40287e29cd5028caf453..75d48204ae83eec355407589d1983f301a573c7f 100644 --- a/official/cv/ResNet/src/dataset_infer.py +++ b/official/cv/ResNet/src/dataset_infer.py @@ -17,7 +17,7 @@ create train or eval dataset. 
""" import os import numpy as np -import mindspore as ms +import mindspore import mindspore.dataset as ds from mindspore.communication.management import init, get_rank, get_group_size from src.model_utils.config import config @@ -145,7 +145,7 @@ def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32, image_si ds.vision.HWC2CHW() ] - type_cast_op = ds.transforms.transforms.TypeCast(ms.int32) + type_cast_op = ds.transforms.transforms.TypeCast(mindspore.int32) data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=8) data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8) @@ -217,7 +217,7 @@ def create_dataset2(dataset_path, do_train, repeat_num=1, batch_size=32, image_s ds.vision.HWC2CHW() ] - type_cast_op = ds.transforms.transforms.TypeCast(ms.int32) + type_cast_op = ds.transforms.transforms.TypeCast(mindspore.int32) data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=8) data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8) @@ -287,7 +287,7 @@ def create_dataset3(dataset_path, do_train, repeat_num=1, batch_size=32, image_s ds.vision.HWC2CHW() ] - type_cast_op = ds.transforms.transforms.TypeCast(ms.int32) + type_cast_op = ds.transforms.transforms.TypeCast(mindspore.int32) data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=12) data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=12) if do_train: diff --git a/official/cv/ResNet/src/metric.py b/official/cv/ResNet/src/metric.py index 7babb94557b5403775e229aafa88dae5ce4fa7c7..89a82696f313f7088e350d961686cacc00e5f608 100644 --- a/official/cv/ResNet/src/metric.py +++ b/official/cv/ResNet/src/metric.py @@ -14,7 +14,7 @@ # ============================================================================ """evaluation metric.""" -import mindspore as ms +import mindspore from mindspore.communication.management import GlobalComm import mindspore.ops as ops import mindspore.nn as nn @@ -54,9 +54,9 @@ class ClassifyCorrectCell(nn.Cell): def construct(self, data, label): outputs = self._network(data) y_pred = self.argmax(outputs) - y_pred = self.cast(y_pred, ms.int32) + y_pred = self.cast(y_pred, mindspore.int32) y_correct = self.equal(y_pred, label) - y_correct = self.cast(y_correct, ms.float32) + y_correct = self.cast(y_correct, mindspore.float32) y_correct = self.reduce_sum(y_correct) total_correct = self.allreduce(y_correct) return (total_correct,) diff --git a/official/cv/ResNet/src/model_utils/moxing_adapter.py b/official/cv/ResNet/src/model_utils/moxing_adapter.py index 8ad202e15345511605daf77735303ec4fdebf284..81f98ee9fbb9938bb28dfeda6bf4a1749dd55055 100644 --- a/official/cv/ResNet/src/model_utils/moxing_adapter.py +++ b/official/cv/ResNet/src/model_utils/moxing_adapter.py @@ -17,7 +17,7 @@ import os import functools -import mindspore as ms +import mindspore from src.model_utils.config import config _global_sync_count = 0 @@ -92,7 +92,7 @@ def moxing_wrapper(pre_process=None, post_process=None): sync_data(config.train_url, config.output_dir) print("Workspace downloaded: ", os.listdir(config.output_dir)) - ms.set_context(save_graphs_path=os.path.join(config.output_dir, str(get_rank_id()))) + mindspore.set_context(save_graphs_path=os.path.join(config.output_dir, str(get_rank_id()))) config.device_num = get_device_num() config.device_id = get_device_id() if not os.path.exists(config.output_dir): diff --git 
a/official/cv/ResNet/src/momentum.py b/official/cv/ResNet/src/momentum.py index de24e0f48577a6576a74fae4ae0caf899eac2405..6d4f7fcfbd5ab9d02bb181f52219aac739187310 100644 --- a/official/cv/ResNet/src/momentum.py +++ b/official/cv/ResNet/src/momentum.py @@ -13,7 +13,7 @@ # limitations under the License. # ============================================================================ """momentum""" -import mindspore as ms +import mindspore import mindspore.ops as ops from mindspore.common.parameter import Parameter from mindspore.common.tensor import Tensor @@ -123,7 +123,7 @@ class Momentum(Optimizer): super(Momentum, self).__init__(learning_rate, params, weight_decay, loss_scale) assert isinstance(momentum, float) and momentum >= 0, "momentum should be equal or bigger than 0" assert isinstance(use_nesterov, bool), "use_nesterov should be bool" - self.momentum = Parameter(Tensor(momentum, ms.float32), name="momentum") + self.momentum = Parameter(Tensor(momentum, mindspore.float32), name="momentum") self.params = self.parameters self.moments = self.params.clone(prefix="moments", init='zeros') self.hyper_map = ops.HyperMap() @@ -133,8 +133,8 @@ class Momentum(Optimizer): def construct(self, gradients): params = self.params moments = self.moments - weight_decay = Tensor(0.0, ms.float32) - scale = Tensor(1.0, ms.float32) + weight_decay = Tensor(0.0, mindspore.float32) + scale = Tensor(1.0, mindspore.float32) if self.exec_weight_decay: weight_decay = self.weight_decay if self.need_scale: diff --git a/official/cv/ResNet/src/resnet_gpu_benchmark.py b/official/cv/ResNet/src/resnet_gpu_benchmark.py index 67ec6ffa67598dee896fbf1d59a204d97ab85f6f..282fe1d16926055849dd9292f778c807d980aaff 100644 --- a/official/cv/ResNet/src/resnet_gpu_benchmark.py +++ b/official/cv/ResNet/src/resnet_gpu_benchmark.py @@ -15,7 +15,7 @@ """ResNet.""" import numpy as np from scipy.stats import truncnorm -import mindspore as ms +import mindspore import mindspore.nn as nn from mindspore.ops import operations as P from mindspore.common.tensor import Tensor @@ -35,7 +35,7 @@ def _conv_variance_scaling_initializer(in_channel, out_channel, kernel_size): mu, sigma = 0, stddev weight = truncnorm(-2, 2, loc=mu, scale=sigma).rvs(out_channel * in_channel * kernel_size * kernel_size) weight = np.reshape(weight, (out_channel, kernel_size, kernel_size, in_channel)) - return Tensor(weight, dtype=ms.float32) + return Tensor(weight, dtype=mindspore.float32) def _weight_variable(shape, factor=0.01): init_value = np.random.randn(*shape).astype(np.float32) * factor diff --git a/official/cv/ResNet/src/util.py b/official/cv/ResNet/src/util.py index bef16d1396e7c857e4ef8d07411c2455c5293f4c..d4e4ef817cdeda1cec5496c570ea3ccb4c9dbff1 100644 --- a/official/cv/ResNet/src/util.py +++ b/official/cv/ResNet/src/util.py @@ -1,6 +1,6 @@ import os import numpy as np -import mindspore as ms +import mindspore import mindspore.nn as nn from src.callback import EvalCallBack from src.resnet import conv_variance_scaling_initializer @@ -71,17 +71,17 @@ def init_weight(net, cfg): if not os.path.isfile(cfg.pre_trained): cfg.logger.warning("There is not ckpt file: %s", cfg.pre_trained) else: - param_dict = ms.load_checkpoint(cfg.pre_trained) + param_dict = mindspore.load_checkpoint(cfg.pre_trained) if cfg.filter_weight: filter_list = [x.name for x in net.end_point.get_parameters()] filter_checkpoint_parameter_by_list(param_dict, filter_list) - ms.load_param_into_net(net, param_dict) + mindspore.load_param_into_net(net, param_dict) cfg.logger.info("Pre trained ckpt mode: 
%s loading", cfg.pre_trained) else: for _, cell in net.cells_and_names(): if isinstance(cell, nn.Conv2d): if cfg.conv_init == "XavierUniform": - cell.weight.set_data(ms.common.initializer.initializer(ms.common.initializer.XavierUniform(), + cell.weight.set_data(mindspore.common.initializer.initializer(mindspore.common.initializer.XavierUniform(), cell.weight.shape, cell.weight.dtype)) elif cfg.conv_init == "TruncatedNormal": @@ -91,12 +91,12 @@ def init_weight(net, cfg): cell.weight.set_data(weight) if isinstance(cell, nn.Dense): if cfg.dense_init == "TruncatedNormal": - cell.weight.set_data(ms.common.initializer.initializer(ms.common.initializer.TruncatedNormal(), + cell.weight.set_data(mindspore.common.initializer.initializer(mindspore.common.initializer.TruncatedNormal(), cell.weight.shape, cell.weight.dtype)) elif cfg.dense_init == "RandomNormal": in_channel = cell.in_channels out_channel = cell.out_channels weight = np.random.normal(loc=0, scale=0.01, size=out_channel * in_channel) - weight = ms.Tensor(np.reshape(weight, (out_channel, in_channel)), dtype=cell.weight.dtype) + weight = mindspore.Tensor(np.reshape(weight, (out_channel, in_channel)), dtype=cell.weight.dtype) cell.weight.set_data(weight) diff --git a/official/cv/ResNet/train.py b/official/cv/ResNet/train.py index c5d10d2efd81452f53645d805d9cc4fa3463944e..1e5315ba1219b63960ba6055e82b86e65f6bff4a 100644 --- a/official/cv/ResNet/train.py +++ b/official/cv/ResNet/train.py @@ -15,7 +15,7 @@ """train resnet.""" import os -import mindspore as ms +import mindspore import mindspore.nn as nn import mindspore.log as logger from mindspore.train.train_thor import ConvertModelUtils @@ -33,7 +33,7 @@ from src.model_utils.config import config from src.model_utils.moxing_adapter import moxing_wrapper from src.model_utils.device_adapter import get_device_num -ms.set_seed(1) +mindspore.set_seed(1) if config.net_name in ("resnet18", "resnet34", "resnet50", "resnet152"): if config.net_name == "resnet18": @@ -58,8 +58,8 @@ else: def set_graph_kernel_context(run_platform, net_name): if run_platform == "GPU" and net_name == "resnet101": - ms.set_context(enable_graph_kernel=True) - ms.set_context(graph_kernel_flags="--enable_parallel_fusion --enable_expand_ops=Conv2D") + mindspore.set_context(enable_graph_kernel=True) + mindspore.set_context(graph_kernel_flags="--enable_parallel_fusion --enable_expand_ops=Conv2D") def set_parameter(): @@ -72,37 +72,37 @@ def set_parameter(): if config.mode_name == 'GRAPH': if target == "Ascend": rank_save_graphs_path = os.path.join(config.save_graphs_path, "soma", str(os.getenv('DEVICE_ID', '0'))) - ms.set_context(mode=ms.GRAPH_MODE, device_target=target, save_graphs=config.save_graphs, - save_graphs_path=rank_save_graphs_path) + mindspore.set_context(mode=0, device_target=target, save_graphs=config.save_graphs, + save_graphs_path=rank_save_graphs_path, jit_config={"jit_level": "O2"}) else: - ms.set_context(mode=ms.GRAPH_MODE, device_target=target, save_graphs=config.save_graphs) + mindspore.set_context(mode=0, device_target=target, save_graphs=config.save_graphs, + jit_config={"jit_level": "O2"}) set_graph_kernel_context(target, config.net_name) else: - ms.set_context(mode=ms.PYNATIVE_MODE, device_target=target, save_graphs=False) + mindspore.set_context(mode=1, device_target=target, save_graphs=False) set_ascend_max_device_memory() if config.parameter_server: - ms.set_ps_context(enable_ps=True) + mindspore.set_ps_context(enable_ps=True) if config.run_distribute: if target == "Ascend": device_id = 
int(os.getenv('DEVICE_ID', '0')) - ms.set_context(device_id=device_id) - ms.set_auto_parallel_context(device_num=config.device_num, parallel_mode=ms.ParallelMode.DATA_PARALLEL, - gradients_mean=True) + mindspore.set_context(device_id=device_id) + mindspore.set_auto_parallel_context(device_num=config.device_num, gradients_mean=True, + parallel_mode=mindspore.ParallelMode.DATA_PARALLEL) set_algo_parameters(elementwise_op_strategy_follow=True) if config.net_name == "resnet50" or config.net_name == "se-resnet50": if config.boost_mode not in ["O1", "O2"]: - ms.set_auto_parallel_context(all_reduce_fusion_config=config.all_reduce_fusion_config) + mindspore.set_auto_parallel_context(all_reduce_fusion_config=config.all_reduce_fusion_config) elif config.net_name in ["resnet101", "resnet152"]: - ms.set_auto_parallel_context(all_reduce_fusion_config=config.all_reduce_fusion_config) + mindspore.set_auto_parallel_context(all_reduce_fusion_config=config.all_reduce_fusion_config) init() # GPU target else: init() - ms.set_auto_parallel_context(device_num=get_device_num(), - parallel_mode=ms.ParallelMode.DATA_PARALLEL, - gradients_mean=True) + mindspore.set_auto_parallel_context(device_num=get_device_num(), gradients_mean=True, + parallel_mode=mindspore.ParallelMode.DATA_PARALLEL) if config.net_name == "resnet50": - ms.set_auto_parallel_context(all_reduce_fusion_config=config.all_reduce_fusion_config) + mindspore.set_auto_parallel_context(all_reduce_fusion_config=config.all_reduce_fusion_config) config.rank_id = get_rank() if config.run_distribute else 0 @@ -136,10 +136,10 @@ def init_loss_scale(): def set_ascend_max_device_memory(): - if ms.get_context("enable_ge") and ms.get_context("mode") == ms.GRAPH_MODE and \ + if mindspore.get_context("enable_ge") and mindspore.get_context("mode") == 0 and \ hasattr(config, "max_device_memory"): logger.warning("When encountering a memory shortage situation in 1980B, reduce the max_device_memory.") - ms.set_context(max_device_memory=config.max_device_memory) + mindspore.set_context(max_device_memory=config.max_device_memory) @moxing_wrapper() @@ -160,11 +160,11 @@ def train_net(): init_weight(net, config) if config.resume_ckpt: - resume_param = ms.load_checkpoint(config.resume_ckpt, - choice_func=lambda x: not x.startswith(('learning_rate', 'global_step'))) - config.start_epoch = int(resume_param.get('epoch_num', ms.Tensor(0, ms.int32)).asnumpy().item()) + resume_param = mindspore.load_checkpoint(config.resume_ckpt, choice_func=\ + lambda x: not x.startswith(('learning_rate', 'global_step'))) + config.start_epoch = int(resume_param.get('epoch_num', mindspore.Tensor(0, mindspore.int32)).asnumpy().item()) - lr = ms.Tensor(init_lr(step_size=step_size)) + lr = mindspore.Tensor(init_lr(step_size=step_size)) # define opt group_params = init_group_params(net, config) opt = nn.Momentum(group_params, lr, config.momentum, loss_scale=config.loss_scale) @@ -172,7 +172,7 @@ def train_net(): opt = nn.LARS(opt, epsilon=config.lars_epsilon, coefficient=config.lars_coefficient, lars_filter=lambda x: 'beta' not in x.name and 'gamma' not in x.name and 'bias' not in x.name) loss = init_loss_scale() - loss_scale = ms.FixedLossScaleManager(config.loss_scale, drop_overflow_update=False) + loss_scale = mindspore.FixedLossScaleManager(config.loss_scale, drop_overflow_update=False) dist_eval_network = ClassifyCorrectCell(net) if config.run_distribute else None metrics = {"acc"} if config.run_distribute: @@ -180,19 +180,18 @@ def train_net(): if (config.net_name not in ("resnet18", "resnet34", 
"resnet50", "resnet101", "resnet152", "se-resnet50")) or \ config.parameter_server or target == "CPU": # fp32 training - model = ms.Model(net, loss_fn=loss, optimizer=opt, metrics=metrics, eval_network=dist_eval_network) + model = mindspore.Model(net, loss_fn=loss, optimizer=opt, metrics=metrics, eval_network=dist_eval_network) else: - model = ms.Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics=metrics, - amp_level="O3", boost_level=config.boost_mode, - eval_network=dist_eval_network, - boost_config_dict={"grad_freeze": {"total_steps": config.epoch_size * step_size}}) + model = mindspore.Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics=metrics, + amp_level="O3", boost_level=config.boost_mode, eval_network=dist_eval_network, + boost_config_dict={"grad_freeze": {"total_steps": config.epoch_size * step_size}}) if config.optimizer == "Thor" and config.dataset == "imagenet2012": from src.lr_generator import get_thor_damping damping = get_thor_damping(step_size * config.start_epoch, config.damping_init, config.damping_decay, 70, step_size) split_indices = [26, 53] - opt = nn.thor(net, lr, ms.Tensor(damping), config.momentum, config.weight_decay, config.loss_scale, + opt = nn.thor(net, lr, mindspore.Tensor(damping), config.momentum, config.weight_decay, config.loss_scale, config.batch_size, split_indices=split_indices, frequency=config.frequency) model = ConvertModelUtils().convert_to_thor_model(model=model, network=net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics={'acc'}, @@ -202,8 +201,8 @@ def train_net(): # load resume param if config.resume_ckpt: - ms.load_param_into_net(net, resume_param) - ms.load_param_into_net(opt, resume_param) + mindspore.load_param_into_net(net, resume_param) + mindspore.load_param_into_net(opt, resume_param) config.logger.info('resume train from epoch: %s', config.start_epoch) # define callbacks diff --git a/official/cv/RetinaFace_ResNet50/eval.py b/official/cv/RetinaFace_ResNet50/eval.py index 6e79c5f29e3e6ce5c79e9996f35ce97a97a28905..677df951620bcba48775f43f75b5044b4d262e2b 100644 --- a/official/cv/RetinaFace_ResNet50/eval.py +++ b/official/cv/RetinaFace_ResNet50/eval.py @@ -20,7 +20,7 @@ import datetime import numpy as np import cv2 -import mindspore as ms +import mindspore from mindspore import Tensor from mindspore.common import set_seed @@ -148,8 +148,8 @@ class DetectionEngine: keep = self._nms(dets, self.nms_thresh) dets = dets[keep, :] - dets[:, 2:4] = (dets[:, 2:4].astype(np.int) - dets[:, 0:2].astype(np.int)).astype(np.float) # int - dets[:, 0:4] = dets[:, 0:4].astype(np.int).astype(np.float) # int + dets[:, 2:4] = (dets[:, 2:4].astype(np.int_) - dets[:, 0:2].astype(np.int_)).astype(np.float_) # int + dets[:, 0:4] = dets[:, 0:4].astype(np.int_).astype(np.float_) # int # add to result @@ -157,7 +157,7 @@ class DetectionEngine: if event_name not in self.results.keys(): self.results[event_name] = {} self.results[event_name][img_name[:-4]] = {'img_path': image_path, - 'bboxes': dets[:, :5].astype(np.float).tolist()} + 'bboxes': dets[:, :5].astype(np.float_).tolist()} def _get_gt_boxes(self): from scipy.io import loadmat @@ -182,7 +182,7 @@ class DetectionEngine: for event in self.results: for name in self.results[event].keys(): - bbox = np.array(self.results[event][name]['bboxes']).astype(np.float) + bbox = np.array(self.results[event][name]['bboxes']).astype(np.float_) if bbox.shape[0] <= 0: continue max_score = max(max_score, np.max(bbox[:, -1])) @@ -191,7 +191,7 @@ class 
DetectionEngine: length = max_score - min_score for event in self.results: for name in self.results[event].keys(): - bbox = np.array(self.results[event][name]['bboxes']).astype(np.float) + bbox = np.array(self.results[event][name]['bboxes']).astype(np.float_) if bbox.shape[0] <= 0: continue bbox[:, -1] -= min_score @@ -227,7 +227,7 @@ class DetectionEngine: - image_pr = np.zeros((section_num, 2), dtype=np.float) + image_pr = np.zeros((section_num, 2), dtype=np.float_) for section in range(section_num): _thresh = 1 - (section + 1)/section_num over_score_index = np.where(predict[:, 4] >= _thresh)[0] @@ -254,7 +254,7 @@ class DetectionEngine: for _set in range(len(sets)): gt_list = set_gts[_set] count_gt = 0 - pr_curve = np.zeros((section_num, 2), dtype=np.float) + pr_curve = np.zeros((section_num, 2), dtype=np.float_) for i, _ in enumerate(event_list): event = str(event_list[i][0][0]) image_list = file_list[i][0] @@ -263,7 +263,7 @@ class DetectionEngine: event_gt_box_list = facebox_list[i][0] for j, _ in enumerate(image_list): - predict = np.array(event_predict_dict[str(image_list[j][0][0])]['bboxes']).astype(np.float) + predict = np.array(event_predict_dict[str(image_list[j][0][0])]['bboxes']).astype(np.float_) gt_boxes = event_gt_box_list[j][0].astype('float') keep_index = event_gt_index_list[j][0] count_gt += len(keep_index) @@ -296,7 +296,7 @@ class DetectionEngine: def val(): - ms.set_context(mode=ms.GRAPH_MODE, device_target='GPU', save_graphs=False) + mindspore.set_context(mode=0, device_target='GPU', save_graphs=False) cfg = cfg_res50 @@ -307,10 +307,10 @@ def val(): # load checkpoint assert cfg['val_model'] is not None, 'val_model is None.' - param_dict = ms.load_checkpoint(cfg['val_model']) + param_dict = mindspore.load_checkpoint(cfg['val_model']) print('Load trained model done. 
{}'.format(cfg['val_model'])) network.init_parameters_data() - ms.load_param_into_net(network, param_dict) + mindspore.load_param_into_net(network, param_dict) # testing dataset testset_folder = cfg['val_dataset_folder'] diff --git a/official/cv/RetinaFace_ResNet50/export.py b/official/cv/RetinaFace_ResNet50/export.py index f2d531895e3002b1239b115f6491c556b6e75c81..2a3a54ea0ea7f8dfc9e867d822855031eba3f44c 100644 --- a/official/cv/RetinaFace_ResNet50/export.py +++ b/official/cv/RetinaFace_ResNet50/export.py @@ -15,7 +15,7 @@ """EXPORT ONNX MODEL WITH CKPT MODEL BASED ON MINDSPORE""" from __future__ import print_function import numpy as np -import mindspore as ms +import mindspore from mindspore import Tensor, export from src.network import RetinaFace, resnet50 from src.config import cfg_res50 @@ -24,7 +24,7 @@ from src.config import cfg_res50 def export_ONNX_model(): cfg = cfg_res50 - ms.set_context(mode=ms.GRAPH_MODE, device_target=cfg.get('device')) + mindspore.set_context(mode=0, device_target=cfg.get('device')) # build network backbone = resnet50(1001) @@ -33,9 +33,9 @@ def export_ONNX_model(): network.set_train(False) # load checkpoint into network - param_dict = ms.load_checkpoint(cfg['ckpt_model']) + param_dict = mindspore.load_checkpoint(cfg['ckpt_model']) network.init_parameters_data() - ms.load_param_into_net(network, param_dict) + mindspore.load_param_into_net(network, param_dict) # build input data input_data = Tensor(np.ones([1, 3, 2176, 2176]).astype(np.float32)) diff --git a/official/cv/RetinaFace_ResNet50/src/loss.py b/official/cv/RetinaFace_ResNet50/src/loss.py index 01fc8a34c60bc1a66fa3b63793c9039d365f5095..6a4b67babf43cd10137b12ce150f6bf23defb212 100644 --- a/official/cv/RetinaFace_ResNet50/src/loss.py +++ b/official/cv/RetinaFace_ResNet50/src/loss.py @@ -14,7 +14,7 @@ # ============================================================================ """Loss.""" import numpy as np -import mindspore as ms +import mindspore import mindspore.nn as nn import mindspore.ops as ops from mindspore import Tensor @@ -26,8 +26,8 @@ class SoftmaxCrossEntropyWithLogits(nn.Cell): self.log_softmax = ops.LogSoftmax() self.neg = ops.Neg() self.one_hot = ops.OneHot() - self.on_value = Tensor(1.0, ms.float32) - self.off_value = Tensor(0.0, ms.float32) + self.on_value = Tensor(1.0, mindspore.float32) + self.off_value = Tensor(0.0, mindspore.float32) self.reduce_sum = ops.ReduceSum() def construct(self, logits, labels): @@ -61,12 +61,12 @@ class MultiBoxLoss(nn.Cell): self.exp = ops.Exp() self.concat = ops.Concat(axis=1) self.reduce_sum2 = ops.ReduceSum(keep_dims=True) - self.idx = Tensor(np.reshape(np.arange(batch_size * num_boxes), (-1, 1)), ms.int32) + self.idx = Tensor(np.reshape(np.arange(batch_size * num_boxes), (-1, 1)), mindspore.int32) def construct(self, loc_data, loc_t, conf_data, conf_t, landm_data, landm_t): # landm loss - mask_pos1 = ops.cast(self.less(0.0, ops.cast(conf_t, ms.float32)), ms.float32) + mask_pos1 = ops.cast(self.less(0.0, ops.cast(conf_t, mindspore.float32)), mindspore.float32) N1 = self.maximum(self.reduce_sum(mask_pos1), 1) mask_pos_idx1 = self.tile(self.expand_dims(mask_pos1, -1), (1, 1, 10)) @@ -74,8 +74,8 @@ class MultiBoxLoss(nn.Cell): loss_landm = loss_landm / N1 # Localization Loss - mask_pos = ops.cast(self.notequal(0, conf_t), ms.float32) - conf_t = ops.cast(mask_pos, ms.int32) + mask_pos = ops.cast(self.notequal(0, conf_t), mindspore.float32) + conf_t = ops.cast(mask_pos, mindspore.int32) N = self.maximum(self.reduce_sum(mask_pos), 1) mask_pos_idx = 
self.tile(self.expand_dims(mask_pos, -1), (1, 1, 4)) @@ -95,17 +95,17 @@ class MultiBoxLoss(nn.Cell): # hard example mining num_matched_boxes = ops.reshape(self.reduce_sum(mask_pos, 1), (-1,)) - neg_masked_cross_entropy = ops.cast(loss_c * (1 - mask_pos), ms.float32) + neg_masked_cross_entropy = ops.cast(loss_c * (1 - mask_pos), mindspore.float32) _, loss_idx = self.sort_descend(neg_masked_cross_entropy, self.num_boxes) - _, relative_position = self.sort(ops.cast(loss_idx, ms.float32), self.num_boxes) - relative_position = ops.cast(relative_position, ms.float32) + _, relative_position = self.sort(ops.cast(loss_idx, mindspore.float32), self.num_boxes) + relative_position = ops.cast(relative_position, mindspore.float32) relative_position = relative_position[:, ::-1] - relative_position = ops.cast(relative_position, ms.int32) + relative_position = ops.cast(relative_position, mindspore.int32) num_neg_boxes = self.minimum(num_matched_boxes * self.neg_pre_positive, self.num_boxes - 1) tile_num_neg_boxes = self.tile(self.expand_dims(num_neg_boxes, -1), (1, self.num_boxes)) - top_k_neg_mask = ops.cast(self.less(relative_position, tile_num_neg_boxes), ms.float32) + top_k_neg_mask = ops.cast(self.less(relative_position, tile_num_neg_boxes), mindspore.float32) cross_entropy = self.cross_entropy(batch_conf, conf_t) cross_entropy = ops.reshape(cross_entropy, conf_t_shape) diff --git a/official/cv/RetinaFace_ResNet50/src/network.py b/official/cv/RetinaFace_ResNet50/src/network.py index b417e615d638b9e27abb00cd32063a27109cd17b..56cdc94ef50540920cb90958c4747e6109ad4a5c 100644 --- a/official/cv/RetinaFace_ResNet50/src/network.py +++ b/official/cv/RetinaFace_ResNet50/src/network.py @@ -17,7 +17,7 @@ import math from functools import reduce import numpy as np -import mindspore as ms +import mindspore import mindspore.nn as nn import mindspore.ops as ops from mindspore import Tensor @@ -496,20 +496,20 @@ class TrainingWrapper(nn.Cell): def __init__(self, network, optimizer, sens=1.0): super(TrainingWrapper, self).__init__(auto_prefix=False) self.network = network - self.weights = ms.ParameterTuple(network.trainable_params()) + self.weights = mindspore.ParameterTuple(network.trainable_params()) self.optimizer = optimizer self.grad = ops.GradOperation(get_by_list=True, sens_param=True) self.sens = sens self.reducer_flag = False self.grad_reducer = None - self.parallel_mode = ms.get_auto_parallel_context("parallel_mode") - class_list = [ms.ParallelMode.DATA_PARALLEL, ms.ParallelMode.HYBRID_PARALLEL] + self.parallel_mode = mindspore.get_auto_parallel_context("parallel_mode") + class_list = [mindspore.ParallelMode.DATA_PARALLEL, mindspore.ParallelMode.HYBRID_PARALLEL] if self.parallel_mode in class_list: self.reducer_flag = True if self.reducer_flag: - mean = ms.get_auto_parallel_context("gradients_mean") + mean = mindspore.get_auto_parallel_context("gradients_mean") if auto_parallel_context().get_device_num_is_set(): - degree = ms.get_auto_parallel_context("device_num") + degree = mindspore.get_auto_parallel_context("device_num") else: degree = get_group_size() self.grad_reducer = nn.DistributedGradReducer(optimizer.parameters, mean, degree) diff --git a/official/cv/RetinaFace_ResNet50/train.py b/official/cv/RetinaFace_ResNet50/train.py index e318bd377a14716be3ed2c6c685d8f2b0a37a18e..7e23e50a2a65b66f8f2dfa372f732c570509a9e6 100644 --- a/official/cv/RetinaFace_ResNet50/train.py +++ b/official/cv/RetinaFace_ResNet50/train.py @@ -16,9 +16,9 @@ from __future__ import print_function import math import argparse -import 
mindspore as ms +import mindspore -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.train import Model from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor from mindspore.communication.management import init, get_rank, get_group_size @@ -31,13 +31,13 @@ from src.lr_schedule import adjust_learning_rate def train(cfg, args): - ms.set_context(mode=ms.GRAPH_MODE, device_target='GPU', save_graphs=False) - if ms.get_context("device_target") == "GPU": + mindspore.set_context(mode=0, device_target='GPU', save_graphs=False) + if mindspore.get_context("device_target") == "GPU": # Enable graph kernel - ms.set_context(enable_graph_kernel=True, graph_kernel_flags="--enable_parallel_fusion") + mindspore.set_context(enable_graph_kernel=True, graph_kernel_flags="--enable_parallel_fusion") if args.is_distributed: init("nccl") - ms.set_auto_parallel_context(device_num=get_group_size(), parallel_mode=ParallelMode.DATA_PARALLEL, + mindspore.set_auto_parallel_context(device_num=get_group_size(), parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True) cfg['ckpt_path'] = cfg['ckpt_path'] + "ckpt_" + str(get_rank()) + "/" @@ -65,8 +65,8 @@ def train(cfg, args): if cfg['pretrain'] and cfg['resume_net'] is None: pretrained_res50 = cfg['pretrain_path'] - param_dict_res50 = ms.load_checkpoint(pretrained_res50) - ms.load_param_into_net(backbone, param_dict_res50) + param_dict_res50 = mindspore.load_checkpoint(pretrained_res50) + mindspore.load_param_into_net(backbone, param_dict_res50) print('Load resnet50 from [{}] done.'.format(pretrained_res50)) net = RetinaFace(phase='train', backbone=backbone) @@ -74,8 +74,8 @@ def train(cfg, args): if cfg['resume_net'] is not None: pretrain_model_path = cfg['resume_net'] - param_dict_retinaface = ms.load_checkpoint(pretrain_model_path) - ms.load_param_into_net(net, param_dict_retinaface) + param_dict_retinaface = mindspore.load_checkpoint(pretrain_model_path) + mindspore.load_param_into_net(net, param_dict_retinaface) print('Resume Model from [{}] Done.'.format(cfg['resume_net'])) net = RetinaFaceWithLossCell(net, multibox_loss, cfg) @@ -84,9 +84,9 @@ def train(cfg, args): warmup_epoch=cfg['warmup_epoch']) if cfg['optim'] == 'momentum': - opt = ms.nn.Momentum(net.trainable_params(), lr, momentum) + opt = mindspore.nn.Momentum(net.trainable_params(), lr, momentum) elif cfg['optim'] == 'sgd': - opt = ms.nn.SGD(params=net.trainable_params(), learning_rate=lr, momentum=momentum, + opt = mindspore.nn.SGD(params=net.trainable_params(), learning_rate=lr, momentum=momentum, weight_decay=weight_decay, loss_scale=1) else: raise ValueError('optim is not define.') @@ -113,7 +113,7 @@ if __name__ == '__main__': arg, _ = parser.parse_known_args() config = cfg_res50 - ms.common.seed.set_seed(config['seed']) + mindspore.common.seed.set_seed(config['seed']) print('train config:\n', config) train(cfg=config, args=arg) diff --git a/official/cv/RetinaNet/eval.py b/official/cv/RetinaNet/eval.py index d18bda3ee1ce730ddbf0a62d492975e90b1efc3a..be4d99c29195a51f905e2dedec10c8abca97de3c 100644 --- a/official/cv/RetinaNet/eval.py +++ b/official/cv/RetinaNet/eval.py @@ -21,7 +21,7 @@ import json import numpy as np from pycocotools.coco import COCO from pycocotools.cocoeval import COCOeval -from mindspore import context, Tensor +from mindspore import Tensor from mindspore.train.serialization import load_checkpoint, load_param_into_net from src.retinanet import retinanet50, resnet50, retinanetInferWithDecoder from 
src.dataset import create_retinanet_dataset, data_to_mindrecord_byte_image, voc_data_to_mindrecord, \ @@ -30,6 +30,7 @@ from src.box_utils import default_boxes from src.model_utils.config import config from src.model_utils.moxing_adapter import moxing_wrapper from src.model_utils.device_adapter import get_device_id, get_device_num +import mindspore def apply_nms(all_boxes, all_scores, thres, max_boxes): @@ -161,7 +162,7 @@ def modelarts_pre_process(): @moxing_wrapper(pre_process=modelarts_pre_process) def retinanet_eval(): - context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, device_id=get_device_id()) + mindspore.set_context(mode=0, device_target=config.device_target, device_id=get_device_id()) prefix = "retinanet_eval.mindrecord" mindrecord_dir = config.mindrecord_dir mindrecord_file = os.path.join(mindrecord_dir, prefix + "0") diff --git a/official/cv/RetinaNet/eval_onnx.py b/official/cv/RetinaNet/eval_onnx.py index 863f663dd2c2d23c1ab9199d81284d2cb7f4c914..ded56eb68a79f4e767a98c9afe78c9292a1c57ce 100644 --- a/official/cv/RetinaNet/eval_onnx.py +++ b/official/cv/RetinaNet/eval_onnx.py @@ -21,13 +21,14 @@ import numpy as np import onnxruntime as ort from pycocotools.coco import COCO from pycocotools.cocoeval import COCOeval -from mindspore import context from src.dataset import create_retinanet_dataset, data_to_mindrecord_byte_image, voc_data_to_mindrecord from src.model_utils.config import config from src.model_utils.moxing_adapter import moxing_wrapper from src.model_utils.device_adapter import get_device_id, get_device_num +import mindspore + def create_session(onnx_path, target_device): """Create onnxruntime session""" if target_device == 'GPU': @@ -162,7 +163,7 @@ def modelarts_pre_process(): @moxing_wrapper(pre_process=modelarts_pre_process) def retinanet_eval(): - context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, device_id=get_device_id()) + mindspore.set_context(mode=0, device_target=config.device_target, device_id=get_device_id()) prefix = "retinanet_eval.mindrecord" mindrecord_dir = config.mindrecord_dir mindrecord_file = os.path.join(mindrecord_dir, prefix + "0") diff --git a/official/cv/RetinaNet/export.py b/official/cv/RetinaNet/export.py index 9acd45892cce8fd8dd6378e8249e2911adbe314e..e3506ecc7a18176776c4885b3da5570d7c450c7d 100644 --- a/official/cv/RetinaNet/export.py +++ b/official/cv/RetinaNet/export.py @@ -15,8 +15,9 @@ """export for retinanet""" import os import numpy as np +import mindspore import mindspore.common.dtype as mstype -from mindspore import context, Tensor +from mindspore import Tensor from mindspore.train.serialization import load_checkpoint, load_param_into_net, export from src.retinanet import retinanet50, resnet50, retinanetInferWithDecoder from src.model_utils.config import config @@ -30,7 +31,7 @@ def modelarts_pre_process(): @moxing_wrapper(pre_process=modelarts_pre_process) def model_export(): - context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, device_id=config.device_id) + mindspore.set_context(mode=0, device_target=config.device_target, device_id=config.device_id) backbone = resnet50(config.num_classes) net = retinanet50(backbone, config) diff --git a/official/cv/RetinaNet/modelarts/train_start.py b/official/cv/RetinaNet/modelarts/train_start.py index a1b74ceb7af31ec12e2fbad16b63f8353d82d888..9cc3742dddb2e214a161e2590e64255227165bff 100644 --- a/official/cv/RetinaNet/modelarts/train_start.py +++ b/official/cv/RetinaNet/modelarts/train_start.py @@ -19,8 
+19,8 @@ import os import subprocess import time import moxing as mox -from mindspore import context from src.model_utils.device_adapter import get_device_id, get_device_num, get_rank_id +import mindspore _CACHE_DATA_URL = "/cache/data_url" _CACHE_TRAIN_URL = "/cache/train_url" @@ -189,7 +189,7 @@ def download_data(args): sync_data(args.train_url, args.output_path) print("Workspace downloaded: ", os.listdir(args.output_path)) - context.set_context(save_graphs_path=os.path.join(args.output_path, str(get_rank_id()))) + mindspore.set_context(save_graphs_path=os.path.join(args.output_path, str(get_rank_id()))) args.device_num = get_device_num() args.device_id = get_device_id() # create output dir diff --git a/official/cv/RetinaNet/src/model_utils/moxing_adapter.py b/official/cv/RetinaNet/src/model_utils/moxing_adapter.py index c2d2282402b6a2950af74b66f282550aac75cb14..344dfc034e1e553b2b5da61517cdc4b179d34b1a 100644 --- a/official/cv/RetinaNet/src/model_utils/moxing_adapter.py +++ b/official/cv/RetinaNet/src/model_utils/moxing_adapter.py @@ -17,7 +17,7 @@ import os import functools -from mindspore import context +import mindspore from .config import config @@ -101,7 +101,7 @@ def moxing_wrapper(pre_process=None, post_process=None): sync_data(config.train_url, config.output_path) print('Workspace downloaded: ', os.listdir(config.output_path)) - context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) + mindspore.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) config.device_num = get_device_num() config.device_id = get_device_id() if not os.path.exists(config.output_path): diff --git a/official/cv/RetinaNet/src/retinanet.py b/official/cv/RetinaNet/src/retinanet.py index ee36151b4c04e0556ebb8ca70cd08f44b7469201..247ae0318f3a14061ff4b3a03a9e3045d5a43617 100644 --- a/official/cv/RetinaNet/src/retinanet.py +++ b/official/cv/RetinaNet/src/retinanet.py @@ -16,10 +16,10 @@ """retinanet based resnet.""" import mindspore.common.dtype as mstype -import mindspore as ms +import mindspore import mindspore.nn as nn -from mindspore import context, Tensor -from mindspore.context import ParallelMode +from mindspore import Tensor +from mindspore import ParallelMode from mindspore.parallel._auto_parallel_context import auto_parallel_context from mindspore.communication.management import get_group_size from mindspore.ops import operations as P @@ -292,19 +292,19 @@ class TrainingWrapper(nn.Cell): super(TrainingWrapper, self).__init__(auto_prefix=False) self.network = network self.network.set_grad() - self.weights = ms.ParameterTuple(network.trainable_params()) + self.weights = mindspore.ParameterTuple(network.trainable_params()) self.optimizer = optimizer self.grad = C.GradOperation(get_by_list=True, sens_param=True) self.sens = sens self.reducer_flag = False self.grad_reducer = None - self.parallel_mode = context.get_auto_parallel_context("parallel_mode") + self.parallel_mode = mindspore.get_auto_parallel_context("parallel_mode") if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]: self.reducer_flag = True if self.reducer_flag: - mean = context.get_auto_parallel_context("gradients_mean") + mean = mindspore.get_auto_parallel_context("gradients_mean") if auto_parallel_context().get_device_num_is_set(): - degree = context.get_auto_parallel_context("device_num") + degree = mindspore.get_auto_parallel_context("device_num") else: degree = get_group_size() self.grad_reducer = nn.DistributedGradReducer(optimizer.parameters, 
mean, degree) diff --git a/official/cv/RetinaNet/train.py b/official/cv/RetinaNet/train.py index fcb40b81f586ac74701205acc2b7f43004d0c0c6..7c8090149d629ded3a7ba4c7929828107bf55be6 100644 --- a/official/cv/RetinaNet/train.py +++ b/official/cv/RetinaNet/train.py @@ -18,12 +18,13 @@ import os import ast import time +import mindspore import mindspore.nn as nn -from mindspore import context, Tensor +from mindspore import Tensor from mindspore.communication.management import init, get_rank from mindspore.train.callback import CheckpointConfig, ModelCheckpoint, LossMonitor, TimeMonitor, Callback from mindspore.train import Model -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.train.serialization import load_checkpoint, load_param_into_net from mindspore.common import set_seed from src.retinanet import retinanetWithLossCell, TrainingWrapper, retinanet50, resnet50 @@ -117,7 +118,7 @@ def modelarts_pre_process(): def set_graph_kernel_context(device_target): if device_target == "GPU": # Enable graph kernel for default model ssd300 on GPU back-end. - context.set_context(enable_graph_kernel=True, + mindspore.set_context(enable_graph_kernel=True, graph_kernel_flags="--enable_parallel_fusion --enable_expand_ops=Conv2D") @@ -126,10 +127,10 @@ def main(): config.lr_init = ast.literal_eval(config.lr_init) config.lr_end_rate = ast.literal_eval(config.lr_end_rate) device_id = get_device_id() - context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target) + mindspore.set_context(mode=0, device_target=config.device_target) if config.device_target == "Ascend": - if context.get_context("mode") == context.PYNATIVE_MODE: - context.set_context(mempool_block_size="31GB") + if mindspore.get_context("mode") == 1: + mindspore.set_context(mempool_block_size="31GB") elif config.device_target == "GPU": set_graph_kernel_context(config.device_target) elif config.device_target == "CPU": @@ -141,12 +142,12 @@ def main(): init() device_num = get_device_num() rank = get_rank() - context.reset_auto_parallel_context() - context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True) + mindspore.reset_auto_parallel_context() + mindspore.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True) else: rank = 0 device_num = 1 - context.set_context(device_id=device_id) + mindspore.set_context(device_id=device_id) mindrecord_file = os.path.join(config.mindrecord_dir, "retinanet.mindrecord0") diff --git a/official/cv/SSD/README.md b/official/cv/SSD/README.md index 21b66dbd6f065fd5a85df80ae6511da7d33451dc..020ad730cddf593acbf43229a03afbea2b31f676 100644 --- a/official/cv/SSD/README.md +++ b/official/cv/SSD/README.md @@ -351,7 +351,8 @@ We need five or seven parameters for this scripts. - `PRE_TRAINED :` the path of pretrained checkpoint file, it is better to use absolute path. - `PRE_TRAINED_EPOCH_SIZE :` the epoch num of pretrained. -Training result will be stored in the current path, whose folder name begins with "LOG". Under this, you can find checkpoint file together with result like the followings in log +Training result will be stored in the current path, whose folder name begins with "LOG". Under this, you can find checkpoint file together with result like the following +in log ```shell epoch: 1 step: 458, loss is 3.1681802 @@ -388,7 +389,8 @@ We need five or seven parameters for this scripts. - `PRE_TRAINED :` the path of pretrained checkpoint file, it is better to use absolute path. 
- `PRE_TRAINED_EPOCH_SIZE :` the epoch num of pretrained. -Training result will be stored in the current path, whose folder name is "LOG". Under this, you can find checkpoint files together with result like the followings in log +Training result will be stored in the current path, whose folder name is "LOG". Under this, you can find checkpoint files together with result like the following +in log ```shell epoch: 1 step: 1, loss is 420.11783 @@ -431,7 +433,8 @@ We need four parameters for this scripts. > checkpoint can be produced in training process. -Inference result will be stored in the example path, whose folder name begins with "eval". Under this, you can find result like the followings in log. +Inference result will be stored in the example path, whose folder name begins with "eval". Under this, you can find result like the following +in log. ```shell Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.238 @@ -467,7 +470,8 @@ We need four parameters for this scripts. > checkpoint can be produced in training process. -Inference result will be stored in the example path, whose folder name begins with "eval". Under this, you can find result like the followings in log. +Inference result will be stored in the example path, whose folder name begins with "eval". Under this, you can find result like the following +in log. ```shell Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.224 diff --git a/official/cv/SSD/config/ssd_mobilenet_v1_fpn_config.yaml b/official/cv/SSD/config/ssd_mobilenet_v1_fpn_config.yaml index 099ea3758203638f96c6cae7886c69047b21b8f5..6fa43d8a50492f5ddfc0eaabfa45243fb115e70a 100644 --- a/official/cv/SSD/config/ssd_mobilenet_v1_fpn_config.yaml +++ b/official/cv/SSD/config/ssd_mobilenet_v1_fpn_config.yaml @@ -23,7 +23,7 @@ match_threshold: 0.5 nms_threshold: 0.6 min_score: 0.1 max_boxes: 100 -all_reduce_fusion_config: [29, 58, 89] +all_reduce_fusion_config: [29, 58, 89, 201] # learning rate settings lr_init: 0.01333 diff --git a/official/cv/SSD/eval.py b/official/cv/SSD/eval.py index 0fe0e6ce47e67ea23337ad40af4ca4a16fad90a3..b6923d7c451e64162c86ede306260072f80b0778 100644 --- a/official/cv/SSD/eval.py +++ b/official/cv/SSD/eval.py @@ -16,7 +16,7 @@ """Evaluation for SSD""" import os -import mindspore as ms +import mindspore from mindspore import Tensor from src.ssd import SSD300, SsdInferWithDecoder, ssd_mobilenet_v2, ssd_mobilenet_v1_fpn, ssd_mobilenet_v1, ssd_resnet50_fpn, ssd_vgg16 from src.dataset import create_ssd_dataset, create_mindrecord @@ -45,9 +45,9 @@ def ssd_eval(dataset_path, ckpt_path, anno_json): net = SsdInferWithDecoder(net, Tensor(default_boxes), config) print("Load Checkpoint!") - param_dict = ms.load_checkpoint(ckpt_path) + param_dict = mindspore.load_checkpoint(ckpt_path) net.init_parameters_data() - ms.load_param_into_net(net, param_dict) + mindspore.load_param_into_net(net, param_dict) net.set_train(False) total = ds.get_dataset_size() * batch_size @@ -77,7 +77,7 @@ def eval_net(): else: raise ValueError('SSD eval only support dataset mode is coco and voc!') - ms.set_context(mode=ms.GRAPH_MODE, device_target=config.device_target, device_id=config.device_id) + mindspore.set_context(mode=0, device_target=config.device_target, device_id=config.device_id) mindrecord_file = create_mindrecord(config.dataset, "ssd_eval.mindrecord", False) diff --git a/official/cv/SSD/eval_onnx.py b/official/cv/SSD/eval_onnx.py index a736ab41ca6117d8159ab2b26690561d14a2dc2c..2a39265fc1f3c0451438cb7bfe5062f217bb3255 100644 --- 
a/official/cv/SSD/eval_onnx.py +++ b/official/cv/SSD/eval_onnx.py @@ -17,12 +17,13 @@ import os import numpy as np import onnxruntime as ort -from mindspore import context from src.dataset import create_ssd_dataset, create_mindrecord from src.eval_utils import COCOMetrics from src.model_utils.config import config +import mindspore + def create_session(onnx_path, target_device): """Create onnxruntime session""" @@ -95,7 +96,7 @@ def eval_net(): else: raise ValueError('SSD eval only support dataset mode is coco and voc!') - context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, device_id=config.device_id) + mindspore.set_context(mode=0, device_target=config.device_target, device_id=config.device_id) mindrecord_file = create_mindrecord(config.dataset, "ssd_eval.mindrecord", False) diff --git a/official/cv/SSD/export.py b/official/cv/SSD/export.py index 9917d18d94ac51508e710afe39ce35276f50ee94..65386714513f33f3231c5bd06c020c72f126382e 100644 --- a/official/cv/SSD/export.py +++ b/official/cv/SSD/export.py @@ -16,16 +16,16 @@ import os import numpy as np -import mindspore as ms +import mindspore from mindspore import Tensor from src.ssd import SSD300, SsdInferWithDecoder, ssd_mobilenet_v2, ssd_mobilenet_v1_fpn, ssd_mobilenet_v1, ssd_resnet50_fpn, ssd_vgg16 from src.model_utils.config import config from src.model_utils.moxing_adapter import moxing_wrapper from src.box_utils import default_boxes -ms.set_context(mode=ms.GRAPH_MODE, device_target=config.device_target) +mindspore.set_context(mode=0, device_target=config.device_target) if config.device_target == "Ascend": - ms.set_context(device_id=config.device_id) + mindspore.set_context(device_id=config.device_id) def modelarts_pre_process(): '''modelarts pre process function.''' @@ -56,14 +56,14 @@ def run_export(): net = SsdInferWithDecoder(net, Tensor(default_boxes), config) - param_dict = ms.load_checkpoint(config.checkpoint_file_path) + param_dict = mindspore.load_checkpoint(config.checkpoint_file_path) net.init_parameters_data() - ms.load_param_into_net(net, param_dict) + mindspore.load_param_into_net(net, param_dict) net.set_train(False) input_shp = [config.batch_size, 3] + config.img_shape - input_array = Tensor(np.random.uniform(-1.0, 1.0, size=input_shp), ms.float32) - ms.export(net, input_array, file_name=config.file_name, file_format=config.file_format) + input_array = Tensor(np.random.uniform(-1.0, 1.0, size=input_shp), mindspore.float32) + mindspore.export(net, input_array, file_name=config.file_name, file_format=config.file_format) if __name__ == '__main__': run_export() diff --git a/official/cv/SSD/infer_ssd_mobilenet_v1_fpn_onnx.py b/official/cv/SSD/infer_ssd_mobilenet_v1_fpn_onnx.py index 346d6bf4eaa9b687e238088b627e72f9b79e9f56..ee181c2035bd227fbbc1a79bb33ff4fc314323d5 100644 --- a/official/cv/SSD/infer_ssd_mobilenet_v1_fpn_onnx.py +++ b/official/cv/SSD/infer_ssd_mobilenet_v1_fpn_onnx.py @@ -17,12 +17,13 @@ import os import numpy as np import onnxruntime as ort -from mindspore import context from src.dataset import create_ssd_dataset, create_mindrecord from src.eval_utils import COCOMetrics from src.model_utils.config import config +import mindspore + def create_session(onnx_path, target_device): if target_device == 'GPU': @@ -96,7 +97,7 @@ def eval_net(): else: raise ValueError('SSD eval only support dataset mode is coco and voc!') - context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, device_id=config.device_id) + mindspore.set_context(mode=0, 
device_target=config.device_target, device_id=config.device_id) mindrecord_file = create_mindrecord(config.dataset, "ssd_eval.mindrecord", False) diff --git a/official/cv/SSD/scripts/run_distribute_train.sh b/official/cv/SSD/scripts/run_distribute_train.sh index 893f4a76db748e5d14b823f63a163a1747f27ccf..37fe46d37a076ecb0b05b08de202b1f3954bc83d 100644 --- a/official/cv/SSD/scripts/run_distribute_train.sh +++ b/official/cv/SSD/scripts/run_distribute_train.sh @@ -17,7 +17,7 @@ echo "==============================================================================================================" echo "Please run the script as: " echo "bash run_distribute_train.sh DEVICE_NUM EPOCH_SIZE LR DATASET RANK_TABLE_FILE CONFIG_PATH PRE_TRAINED PRE_TRAINED_EPOCH_SIZE" -echo "for example: bash run_distribute_train.sh 8 500 0.2 coco /data/hccl.json /config_path /opt/ssd-300.ckpt(optional) 200(optional)" +echo "for example: bash run_distribute_train.sh 8 500 0.05 coco /data/hccl.json /config_path /opt/ssd-300.ckpt(optional) 200(optional)" echo "It is better to use absolute path." echo "=================================================================================================================" diff --git a/official/cv/SSD/scripts/run_distribute_train_gpu.sh b/official/cv/SSD/scripts/run_distribute_train_gpu.sh index 0778ad70ff9414d874f0d1e3ce2ffc9d77c499f5..0ff4b1818130aded3bdd9bd5c351148704c69422 100644 --- a/official/cv/SSD/scripts/run_distribute_train_gpu.sh +++ b/official/cv/SSD/scripts/run_distribute_train_gpu.sh @@ -17,7 +17,7 @@ echo "==============================================================================================================" echo "Please run the script as: " echo "bash run_distribute_train_gpu.sh DEVICE_NUM EPOCH_SIZE LR DATASET CONFIG_PATH PRE_TRAINED PRE_TRAINED_EPOCH_SIZE" -echo "for example: bash run_distribute_train_gpu.sh 8 500 0.2 coco /config_path /opt/ssd-300.ckpt(optional) 200(optional)" +echo "for example: bash run_distribute_train_gpu.sh 8 500 0.05 coco /config_path /opt/ssd-300.ckpt(optional) 200(optional)" echo "It is better to use absolute path." 
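These launch scripts ultimately drive a data-parallel context setup in `train.py`. A minimal sketch of that initialization, assuming the Ascend backend and the top-level `mindspore` API adopted throughout this patch (the device target is illustrative):

```python
import mindspore
from mindspore import ParallelMode
from mindspore.communication.management import init, get_group_size

# Graph mode (0 == GRAPH_MODE) on the chosen backend; "Ascend" is illustrative.
mindspore.set_context(mode=0, device_target="Ascend")
init()  # bring up HCCL (Ascend) / NCCL (GPU) communication
mindspore.reset_auto_parallel_context()
mindspore.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL,
                                    gradients_mean=True,
                                    device_num=get_group_size())
```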
echo "=================================================================================================================" diff --git a/official/cv/SSD/src/model_utils/moxing_adapter.py b/official/cv/SSD/src/model_utils/moxing_adapter.py index 72b124bd07b46f04de7575b604bcaa10a6588184..c2cadef73f3f026f3f7a51738db65aeb79d4e612 100644 --- a/official/cv/SSD/src/model_utils/moxing_adapter.py +++ b/official/cv/SSD/src/model_utils/moxing_adapter.py @@ -17,7 +17,7 @@ import os import functools -import mindspore as ms +import mindspore from src.model_utils.config import config _global_sync_count = 0 @@ -92,7 +92,7 @@ def moxing_wrapper(pre_process=None, post_process=None): sync_data(config.train_url, config.output_path) print("Workspace downloaded: ", os.listdir(config.output_path)) - ms.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) + mindspore.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) config.device_num = get_device_num() config.device_id = get_device_id() if not os.path.exists(config.output_path): diff --git a/official/cv/SSD/src/ssd.py b/official/cv/SSD/src/ssd.py index faf9804dd9cca58037015892efb4e00ec213c07f..a522f12c6e1793e40c63cc7b255eb7b313627712 100644 --- a/official/cv/SSD/src/ssd.py +++ b/official/cv/SSD/src/ssd.py @@ -15,10 +15,10 @@ """SSD net based MobilenetV2.""" -import mindspore as ms +import mindspore import mindspore.nn as nn from mindspore import Tensor -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.parallel._auto_parallel_context import auto_parallel_context from mindspore.communication.management import get_group_size import mindspore.ops as ops @@ -333,8 +333,8 @@ class SSD300(nn.Cell): pred_loc, pred_label = self.multi_box(multi_feature) if not self.is_training: pred_label = self.activation(pred_label) - pred_loc = ops.cast(pred_loc, ms.float32) - pred_label = ops.cast(pred_label, ms.float32) + pred_loc = ops.cast(pred_loc, mindspore.float32) + pred_label = ops.cast(pred_label, mindspore.float32) return pred_loc, pred_label @@ -364,8 +364,8 @@ class SsdMobilenetV1Fpn(nn.Cell): pred_loc, pred_label = self.multi_box(features) if not self.training: pred_label = self.activation(pred_label) - pred_loc = ops.cast(pred_loc, ms.float32) - pred_label = ops.cast(pred_label, ms.float32) + pred_loc = ops.cast(pred_loc, mindspore.float32) + pred_label = ops.cast(pred_label, mindspore.float32) return pred_loc, pred_label @@ -413,8 +413,8 @@ class SsdMobilenetV1Feature(nn.Cell): pred_loc, pred_label = self.multi_box(multi_feature) if not self.training: pred_label = self.activation(pred_label) - pred_loc = ops.cast(pred_loc, ms.float32) - pred_label = ops.cast(pred_label, ms.float32) + pred_loc = ops.cast(pred_loc, mindspore.float32) + pred_label = ops.cast(pred_label, mindspore.float32) return pred_loc, pred_label @@ -443,8 +443,8 @@ class SsdResNet50Fpn(nn.Cell): pred_loc, pred_label = self.multi_box(features) if not self.training: pred_label = self.activation(pred_label) - pred_loc = ops.cast(pred_loc, ms.float32) - pred_label = ops.cast(pred_label, ms.float32) + pred_loc = ops.cast(pred_loc, mindspore.float32) + pred_label = ops.cast(pred_label, mindspore.float32) return pred_loc, pred_label @@ -465,8 +465,8 @@ class SigmoidFocalClassificationLoss(nn.Cell): self.sigmoid = ops.Sigmoid() self.pow = ops.Pow() self.onehot = ops.OneHot() - self.on_value = Tensor(1.0, ms.float32) - self.off_value = Tensor(0.0, ms.float32) + self.on_value = Tensor(1.0, mindspore.float32) + 
self.off_value = Tensor(0.0, mindspore.float32) self.gamma = gamma self.alpha = alpha @@ -474,7 +474,7 @@ class SigmoidFocalClassificationLoss(nn.Cell): label = self.onehot(label, ops.shape(logits)[-1], self.on_value, self.off_value) sigmiod_cross_entropy = self.sigmiod_cross_entropy(logits, label) sigmoid = self.sigmoid(logits) - label = ops.cast(label, ms.float32) + label = ops.cast(label, mindspore.float32) p_t = label * sigmoid + (1 - label) * (1 - sigmoid) modulating_factor = self.pow(1 - p_t, self.gamma) alpha_weight_factor = label * self.alpha + (1 - label) * (1 - self.alpha) @@ -505,8 +505,8 @@ class SSDWithLossCell(nn.Cell): def construct(self, x, gt_loc, gt_label, num_matched_boxes): pred_loc, pred_label = self.network(x) - mask = ops.cast(self.less(0, gt_label), ms.float32) - num_matched_boxes = self.reduce_sum(ops.cast(num_matched_boxes, ms.float32)) + mask = ops.cast(self.less(0, gt_label), mindspore.float32) + num_matched_boxes = self.reduce_sum(ops.cast(num_matched_boxes, mindspore.float32)) # Localization Loss mask_loc = self.tile(self.expand_dims(mask, -1), (1, 1, 4)) @@ -543,20 +543,20 @@ class TrainingWrapper(nn.Cell): super(TrainingWrapper, self).__init__(auto_prefix=False) self.network = network self.network.set_grad() - self.weights = ms.ParameterTuple(network.trainable_params()) + self.weights = mindspore.ParameterTuple(network.trainable_params()) self.optimizer = optimizer self.grad = ops.GradOperation(get_by_list=True, sens_param=True) self.sens = sens self.reducer_flag = False self.grad_reducer = None self.use_global_norm = use_global_norm - self.parallel_mode = ms.get_auto_parallel_context("parallel_mode") + self.parallel_mode = mindspore.get_auto_parallel_context("parallel_mode") if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]: self.reducer_flag = True if self.reducer_flag: - mean = ms.get_auto_parallel_context("gradients_mean") + mean = mindspore.get_auto_parallel_context("gradients_mean") if auto_parallel_context().get_device_num_is_set(): - degree = ms.get_auto_parallel_context("device_num") + degree = mindspore.get_auto_parallel_context("device_num") else: degree = get_group_size() self.grad_reducer = nn.DistributedGradReducer(optimizer.parameters, mean, degree) @@ -760,8 +760,8 @@ class SSD300VGG16(nn.Cell): pred_loc, pred_label = self.multi_box(multi_feature) if not self.training: pred_label = self.activation(pred_label) - pred_loc = ops.cast(pred_loc, ms.float32) - pred_label = ops.cast(pred_label, ms.float32) + pred_loc = ops.cast(pred_loc, mindspore.float32) + pred_label = ops.cast(pred_label, mindspore.float32) return pred_loc, pred_label diff --git a/official/cv/SSD/train.py b/official/cv/SSD/train.py index 8e8e0d55fa0e29d480479b6d749a4988b7827dbd..16bff8c78bd425d713b137110ba99ab038932684 100644 --- a/official/cv/SSD/train.py +++ b/official/cv/SSD/train.py @@ -16,13 +16,13 @@ """Train SSD and get checkpoint files.""" import os -import mindspore as ms +import mindspore import mindspore.nn as nn from mindspore import Tensor from mindspore.communication.management import init, get_rank from mindspore.train.callback import CheckpointConfig, ModelCheckpoint, LossMonitor, TimeMonitor from mindspore.train import Model -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.common import set_seed, dtype import mindspore.log as logger from src.ssd import SSD300, SsdInferWithDecoder, SSDWithLossCell, TrainingWrapper, ssd_mobilenet_v2, \ @@ -51,40 +51,40 @@ def ssd_model_build(): ssd = 
ssd_mobilenet_v1_fpn(config=config) init_net_param(ssd) if config.feature_extractor_base_param != "": - param_dict = ms.load_checkpoint(config.feature_extractor_base_param) + param_dict = mindspore.load_checkpoint(config.feature_extractor_base_param) for x in list(param_dict.keys()): param_dict["network.feature_extractor.mobilenet_v1." + x] = param_dict[x] del param_dict[x] - ms.load_param_into_net(ssd.feature_extractor.mobilenet_v1.network, param_dict) + mindspore.load_param_into_net(ssd.feature_extractor.mobilenet_v1.network, param_dict) elif config.model_name == "ssd_mobilenet_v1": ssd = ssd_mobilenet_v1(config=config) init_net_param(ssd) if config.feature_extractor_base_param != "": - param_dict = ms.load_checkpoint(config.feature_extractor_base_param) + param_dict = mindspore.load_checkpoint(config.feature_extractor_base_param) for x in list(param_dict.keys()): param_dict["network.feature_extractor.mobilenet_v1." + x] = param_dict[x] del param_dict[x] - ms.load_param_into_net(ssd.feature_extractor.mobilenet_v1.network, param_dict) + mindspore.load_param_into_net(ssd.feature_extractor.mobilenet_v1.network, param_dict) elif config.model_name == "ssd_resnet50_fpn": ssd = ssd_resnet50_fpn(config=config) init_net_param(ssd) if config.feature_extractor_base_param != "": - param_dict = ms.load_checkpoint(config.feature_extractor_base_param) + param_dict = mindspore.load_checkpoint(config.feature_extractor_base_param) for x in list(param_dict.keys()): param_dict["network.feature_extractor.resnet." + x] = param_dict[x] del param_dict[x] - ms.load_param_into_net(ssd.feature_extractor.resnet, param_dict) + mindspore.load_param_into_net(ssd.feature_extractor.resnet, param_dict) elif config.model_name == "ssd_vgg16": ssd = ssd_vgg16(config=config) init_net_param(ssd) if config.feature_extractor_base_param != "": - param_dict = ms.load_checkpoint(config.feature_extractor_base_param) + param_dict = mindspore.load_checkpoint(config.feature_extractor_base_param) from src.vgg16 import ssd_vgg_key_mapper for k in ssd_vgg_key_mapper: v = ssd_vgg_key_mapper[k] param_dict["network.backbone." + v + ".weight"] = param_dict[k + ".weight"] del param_dict[k + ".weight"] - ms.load_param_into_net(ssd.backbone, param_dict) + mindspore.load_param_into_net(ssd.backbone, param_dict) else: raise ValueError(f'config.model: {config.model_name} is not supported') return ssd @@ -93,23 +93,23 @@ def ssd_model_build(): def set_graph_kernel_context(device_target, model): if device_target == "GPU" and model == "ssd300": # Enable graph kernel for default model ssd300 on GPU back-end. - ms.set_context(enable_graph_kernel=True, + mindspore.set_context(enable_graph_kernel=True, graph_kernel_flags="--enable_parallel_fusion --enable_expand_ops=Conv2D") if device_target == "GPU" and model == "ssd_mobilenet_v1": # Enable graph kernel for default model ssd300 on GPU back-end. 
- ms.context.set_context(enable_graph_kernel=True, + mindspore.set_context(enable_graph_kernel=True, graph_kernel_flags="--enable_parallel_fusion --enable_expand_ops=Conv2D") def set_ascend_pynative_mempool_block_size(): - if ms.get_context("mode") == ms.PYNATIVE_MODE and config.device_target == "Ascend": - ms.set_context(mempool_block_size="31GB") + if mindspore.get_context("mode") == 1 and config.device_target == "Ascend": + mindspore.set_context(mempool_block_size="31GB") def set_ascend_max_device_memory(): - if ms.get_context("enable_ge") and ms.get_context("mode") == ms.GRAPH_MODE: + if mindspore.get_context("enable_ge") and mindspore.get_context("mode") == 0: logger.warning("When encountering a memory shortage situation in 1980B, reduce the max_device_memory.") - ms.set_context(max_device_memory="50GB") + mindspore.set_context(max_device_memory="50GB") @moxing_wrapper() @@ -126,22 +126,22 @@ def train_net(): loss_scale = float(config.loss_scale) if config.device_target == "CPU": loss_scale = 1.0 - ms.set_context(mode=ms.GRAPH_MODE, device_target="CPU") + mindspore.set_context(mode=0, device_target="CPU") else: - ms.set_context(mode=ms.GRAPH_MODE, device_target=config.device_target, device_id=config.device_id) + mindspore.set_context(mode=0, device_target=config.device_target, device_id=config.device_id) # Only works on ascend chip of 1980B - ms.set_context(ascend_config={"precision_mode": "allow_fp32_to_fp16"}) + mindspore.set_context(ascend_config={"precision_mode": "allow_fp32_to_fp16"}) set_graph_kernel_context(config.device_target, config.model_name) set_ascend_pynative_mempool_block_size() set_ascend_max_device_memory() if config.run_distribute: device_num = config.device_num - ms.reset_auto_parallel_context() - ms.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, + mindspore.reset_auto_parallel_context() + mindspore.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, device_num=device_num) init() if config.all_reduce_fusion_config: - ms.set_auto_parallel_context(all_reduce_fusion_config=config.all_reduce_fusion_config) + mindspore.set_auto_parallel_context(all_reduce_fusion_config=config.all_reduce_fusion_config) rank = get_rank() mindrecord_file = create_mindrecord(config.dataset, "ssd.mindrecord", True) @@ -167,10 +167,10 @@ def train_net(): ckpoint_cb = ModelCheckpoint(prefix="ssd", directory=ckpt_save_dir, config=ckpt_config) if config.pre_trained: - param_dict = ms.load_checkpoint(config.pre_trained) + param_dict = mindspore.load_checkpoint(config.pre_trained) if config.filter_weight: filter_checkpoint_parameter_by_list(param_dict, config.checkpoint_filter_list) - ms.load_param_into_net(net, param_dict, True) + mindspore.load_param_into_net(net, param_dict, True) lr = Tensor(get_lr(global_step=config.pre_trained_epoch_size * dataset_size, lr_init=config.lr_init, lr_end=config.lr_end_rate * config.lr, lr_max=config.lr, diff --git a/official/cv/ShuffleNet/shufflenetv1/eval.py b/official/cv/ShuffleNet/shufflenetv1/eval.py index 83ba8464cb0259a903825ae543f775145ee0dc3d..00e50cbf6e79447e92a252d280d654bfd655af71 100644 --- a/official/cv/ShuffleNet/shufflenetv1/eval.py +++ b/official/cv/ShuffleNet/shufflenetv1/eval.py @@ -14,7 +14,8 @@ # ============================================================================ """test ShuffleNetV1""" import time -from mindspore import context, nn +import mindspore +from mindspore import nn from mindspore.train.model import Model from mindspore.common import set_seed 
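Several entry points in this patch, including the ShuffleNetV1 eval below, resolve a device id before configuring the context. A minimal sketch of that recurring pattern, assuming the `DEVICE_ID` environment variable convention these scripts already use:

```python
import os
import mindspore

# Fall back to device 0 when the launcher did not export DEVICE_ID.
device_id = int(os.getenv('DEVICE_ID', '0'))
mindspore.set_context(mode=0, device_target="Ascend", device_id=device_id)
```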
from mindspore.train.serialization import load_checkpoint, load_param_into_net @@ -30,7 +31,7 @@ set_seed(1) @moxing_wrapper(pre_process=None) def test(): - context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, save_graphs=False, + mindspore.set_context(mode=0, device_target=config.device_target, save_graphs=False, device_id=get_device_id()) # create dataset diff --git a/official/cv/ShuffleNet/shufflenetv1/export.py b/official/cv/ShuffleNet/shufflenetv1/export.py index 5f5709d8c8b77f9e06e263f72ca20f7390bfbe49..f9618edd7c1877b3b8735972f0fafe42e0397ee1 100644 --- a/official/cv/ShuffleNet/shufflenetv1/export.py +++ b/official/cv/ShuffleNet/shufflenetv1/export.py @@ -18,14 +18,14 @@ suggest run as python export.py --file_name [file name] --ckpt_path [ckpt path] """ import os import numpy as np -import mindspore as ms -from mindspore import Tensor, load_checkpoint, load_param_into_net, export, context +import mindspore +from mindspore import Tensor, load_checkpoint, load_param_into_net, export from src.model_utils.config import config from src.shufflenetv1 import ShuffleNetV1 from src.model_utils.moxing_adapter import moxing_wrapper -context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target) +mindspore.set_context(mode=0, device_target=config.device_target) def modelarts_pre_process(): @@ -33,7 +33,7 @@ def modelarts_pre_process(): if config.device_target == "Ascend": - context.set_context(device_id=config.device_id) + mindspore.set_context(device_id=config.device_id) @moxing_wrapper(pre_process=modelarts_pre_process) @@ -44,7 +44,7 @@ def model_export(): load_param_into_net(net, param_dict) image_height, image_width = (224, 224) - input_arr = Tensor(np.ones([config.batch_size, 3, image_height, image_width]), ms.float32) + input_arr = Tensor(np.ones([config.batch_size, 3, image_height, image_width]), mindspore.float32) export(net, input_arr, file_name=config.file_name, file_format=config.file_format) diff --git a/official/cv/ShuffleNet/shufflenetv1/infer_shufflenetv1_onnx.py b/official/cv/ShuffleNet/shufflenetv1/infer_shufflenetv1_onnx.py index b561b17fa2d8111017ea2a50b007deb47c8aab15..6636f6c2b7af30518645a00626bda4835057f575 100644 --- a/official/cv/ShuffleNet/shufflenetv1/infer_shufflenetv1_onnx.py +++ b/official/cv/ShuffleNet/shufflenetv1/infer_shufflenetv1_onnx.py @@ -15,7 +15,7 @@ """test ShuffleNetV1""" import onnxruntime import numpy as np -import mindspore as ms +import mindspore from mindspore import Tensor from mindspore import ops from src.dataset import create_dataset @@ -51,7 +51,7 @@ def test(): model_predict = np.expand_dims(np.squeeze(model_predict), axis=0) for predict, label in zip(model_predict[0], labels): cnt = cnt + 1 - input_x = Tensor(predict, ms.float16) + input_x = Tensor(predict, mindspore.float16) _, k_label = topk(input_x, k) if k_label[0] == label: correct_top1 = correct_top1 + 1 diff --git a/official/cv/ShuffleNet/shufflenetv1/src/model_utils/moxing_adapter.py b/official/cv/ShuffleNet/shufflenetv1/src/model_utils/moxing_adapter.py index c2d2282402b6a2950af74b66f282550aac75cb14..344dfc034e1e553b2b5da61517cdc4b179d34b1a 100644 --- a/official/cv/ShuffleNet/shufflenetv1/src/model_utils/moxing_adapter.py +++ b/official/cv/ShuffleNet/shufflenetv1/src/model_utils/moxing_adapter.py @@ -17,7 +17,7 @@ import os import functools -from mindspore import context +import mindspore from .config import config @@ -101,7 +101,7 @@ def moxing_wrapper(pre_process=None, post_process=None): sync_data(config.train_url, config.output_path) 
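The moxing adapter change that follows is the core substitution repeated across this patch: calls on the deprecated `mindspore.context` module become calls on the top-level `mindspore` package. A minimal before/after sketch (the `save_graphs_path` value is illustrative):

```python
# Before: module-level context API (deprecated style).
from mindspore import context
context.set_context(mode=context.GRAPH_MODE, save_graphs_path="/tmp/graphs")

# After: top-level API, as adopted throughout this patch (0 == GRAPH_MODE).
import mindspore
mindspore.set_context(mode=0, save_graphs_path="/tmp/graphs")
```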
print('Workspace downloaded: ', os.listdir(config.output_path)) - context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) + mindspore.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) config.device_num = get_device_num() config.device_id = get_device_id() if not os.path.exists(config.output_path): diff --git a/official/cv/ShuffleNet/shufflenetv1/train.py b/official/cv/ShuffleNet/shufflenetv1/train.py index e758ce9c66a8573ce6d5db38be3e8d8a8bc217db..4a4b5ef165a4a069531f70a74b568f6985f6e88c 100644 --- a/official/cv/ShuffleNet/shufflenetv1/train.py +++ b/official/cv/ShuffleNet/shufflenetv1/train.py @@ -15,11 +15,12 @@ """train ShuffleNetV1""" import os import time -from mindspore import context, nn +import mindspore +from mindspore import nn from mindspore import Tensor from mindspore.common import set_seed from mindspore.nn.optim.momentum import Momentum -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.train.model import Model from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor, LossMonitor, Callback from mindspore.train.serialization import load_checkpoint, load_param_into_net, save_checkpoint @@ -110,24 +111,24 @@ def modelarts_pre_process(): @moxing_wrapper(pre_process=modelarts_pre_process) def train(): - context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, save_graphs=False) + mindspore.set_context(mode=0, device_target=config.device_target, save_graphs=False) # init distributed if config.is_distributed: if os.getenv('DEVICE_ID', "not_set").isdigit(): - context.set_context(device_id=get_device_id()) + mindspore.set_context(device_id=get_device_id()) init() rank = get_rank() group_size = get_group_size() parallel_mode = ParallelMode.DATA_PARALLEL - context.set_auto_parallel_context(parallel_mode=parallel_mode, device_num=group_size, gradients_mean=True) + mindspore.set_auto_parallel_context(parallel_mode=parallel_mode, device_num=group_size, gradients_mean=True) else: rank = 0 group_size = 1 - context.set_context(device_id=config.device_id) + mindspore.set_context(device_id=config.device_id) if config.device_target == "GPU": - context.set_context(enable_graph_kernel=True) + mindspore.set_context(enable_graph_kernel=True) # define network net = ShuffleNetV1(model_size=config.model_size, n_class=config.num_classes) diff --git a/official/cv/ShuffleNet/shufflenetv2/cpu_transfer.py b/official/cv/ShuffleNet/shufflenetv2/cpu_transfer.py index e7e5c36ada21a8fe9a27d34fddb93e44edd9dbfc..43e2960c18508568e1759cfea937eaf28670b117 100644 --- a/official/cv/ShuffleNet/shufflenetv2/cpu_transfer.py +++ b/official/cv/ShuffleNet/shufflenetv2/cpu_transfer.py @@ -16,7 +16,7 @@ import argparse import ast import time -from mindspore import context +import mindspore from mindspore import Tensor from mindspore.common import set_seed from mindspore.nn.optim.momentum import Momentum @@ -58,10 +58,10 @@ if __name__ == '__main__': help='run platform(Default:Ascend)') args_opt = parser.parse_args() if args_opt.use_pynative_mode: - context.set_context(mode=context.PYNATIVE_MODE, device_target=args_opt.platform, + mindspore.set_context(mode=1, device_target=args_opt.platform, device_id=config_cpu.device_id) else: - context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.platform, + mindspore.set_context(mode=0, device_target=args_opt.platform, device_id=config_cpu.device_id, save_graphs=False) # define network diff --git 
a/official/cv/ShuffleNet/shufflenetv2/eval.py b/official/cv/ShuffleNet/shufflenetv2/eval.py index d0e20b91f277135e6fb322b4655f0d3977908654..71f22a4c51f32454df0e61b8e00250fccabbb373 100644 --- a/official/cv/ShuffleNet/shufflenetv2/eval.py +++ b/official/cv/ShuffleNet/shufflenetv2/eval.py @@ -17,10 +17,8 @@ import argparse import ast import os import time - +import mindspore import mindspore.nn as nn - -from mindspore import context from mindspore.train.model import Model from mindspore.train.serialization import load_checkpoint, load_param_into_net @@ -88,10 +86,10 @@ if __name__ == '__main__': print('device_id = ', device_id) if args_opt.use_pynative_mode: - context.set_context(mode=context.PYNATIVE_MODE, device_target=args_opt.platform, + mindspore.set_context(mode=1, device_target=args_opt.platform, device_id=device_id) else: - context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.platform, + mindspore.set_context(mode=0, device_target=args_opt.platform, device_id=device_id, save_graphs=False) dataset_path = args_opt.dataset_path diff --git a/official/cv/ShuffleNet/shufflenetv2/export.py b/official/cv/ShuffleNet/shufflenetv2/export.py index 30666f7591f2d99b609da9bbaa0a869c1e73a631..e7b0f60196d7398c0085589cf37e37b5888a4dc2 100644 --- a/official/cv/ShuffleNet/shufflenetv2/export.py +++ b/official/cv/ShuffleNet/shufflenetv2/export.py @@ -17,8 +17,8 @@ import argparse import ast import numpy as np -import mindspore as ms -from mindspore import context, Tensor, load_checkpoint, load_param_into_net, export +import mindspore +from mindspore import Tensor, load_checkpoint, load_param_into_net, export from src.config import config_gpu as cfg from src.shufflenetv2 import ShuffleNetV2 @@ -44,14 +44,14 @@ if __name__ == '__main__': if args.overwrite_config: cfg.num_classes = args.num_classes - context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target) + mindspore.set_context(mode=0, device_target=args.device_target) if args.device_target == "Ascend" or args.device_target == "GPU": - context.set_context(device_id=args.device_id) + mindspore.set_context(device_id=args.device_id) net = ShuffleNetV2(n_class=cfg.num_classes) ckpt = load_checkpoint(args.ckpt_file) load_param_into_net(net, ckpt) net.set_train(False) - input_data = Tensor(np.ones([args.batch_size, 3, args.height, args.width]), ms.float32) + input_data = Tensor(np.ones([args.batch_size, 3, args.height, args.width]), mindspore.float32) export(net, input_data, file_name=args.file_name, file_format=args.file_format) diff --git a/official/cv/ShuffleNet/shufflenetv2/infer_shufflenetv2_onnx.py b/official/cv/ShuffleNet/shufflenetv2/infer_shufflenetv2_onnx.py index f753c2946928a6ad25947eced989a34ad33fb18a..07378afb6fb5dd38a5380fb27d9c12012ab9d4c6 100644 --- a/official/cv/ShuffleNet/shufflenetv2/infer_shufflenetv2_onnx.py +++ b/official/cv/ShuffleNet/shufflenetv2/infer_shufflenetv2_onnx.py @@ -17,7 +17,7 @@ import argparse import numpy as np import onnxruntime -import mindspore as ms +import mindspore from mindspore import Tensor from mindspore import ops from src.dataset import create_dataset @@ -54,7 +54,7 @@ def test(onnx_path, onnx_dataset_path, device_target, device_id): model_predict = session.run(None, inputs) model_predict = np.expand_dims(np.squeeze(model_predict), axis=0) - input_x = Tensor(model_predict[0], ms.float16) + input_x = Tensor(model_predict[0], mindspore.float16) _, k_label = topk(input_x, k) if k_label[0] == labels: correct_top1 = correct_top1 + 1 diff --git 
a/official/cv/ShuffleNet/shufflenetv2/modelarts/train_start.py b/official/cv/ShuffleNet/shufflenetv2/modelarts/train_start.py index 592709420d4537ff4f7c81a1a835063d9b159d37..d90af02916ff7e6ef4c54b0d2c94b09485defdcc 100644 --- a/official/cv/ShuffleNet/shufflenetv2/modelarts/train_start.py +++ b/official/cv/ShuffleNet/shufflenetv2/modelarts/train_start.py @@ -19,10 +19,9 @@ import os import time import numpy as np +import mindspore import mindspore.nn as nn - -from mindspore import context -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore import Tensor from mindspore.communication.management import init, get_rank, get_group_size from mindspore.nn.optim.momentum import Momentum @@ -60,7 +59,7 @@ def export_models(checkpoint_path): if args_opt.export_mindir_model: export(network, input_data, file_name=output_file, file_format="AIR") - if args_opt.export_air_model and context.get_context("device_target") == "Ascend": + if args_opt.export_air_model and mindspore.get_context("device_target") == "Ascend": export(network, input_data, file_name=output_file, file_format="AIR") if args_opt.export_onnx_model: export(network, input_data, file_name=output_file, file_format="ONNX") @@ -166,10 +165,10 @@ if __name__ == '__main__': set_seed(config.random_seed) if args_opt.use_pynative_mode: - context.set_context(mode=context.PYNATIVE_MODE, device_target=args_opt.platform) + mindspore.set_context(mode=1, device_target=args_opt.platform) print('mode = PYNATIVE_MODE') else: - context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.platform, save_graphs=False) + mindspore.set_context(mode=0, device_target=args_opt.platform, save_graphs=False) print('mode = GRAPH_MODE') # init distributed @@ -187,13 +186,13 @@ if __name__ == '__main__': device_id = get_rank() config.group_size = get_group_size() - context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, + mindspore.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, device_num=config.group_size, gradients_mean=True) else: device_id = args_opt.device_id config.group_size = 1 - context.set_context(device_id=device_id) + mindspore.set_context(device_id=device_id) rank_id = device_id config.rank = rank_id print('rank_id = ', rank_id, ' group_size = ', config.group_size) diff --git a/official/cv/ShuffleNet/shufflenetv2/train.py b/official/cv/ShuffleNet/shufflenetv2/train.py index c132ab257f1f56cb0e706035c0b26bad2265d396..649410de6fd13adb84c3bbcb03ee2be224876ee9 100644 --- a/official/cv/ShuffleNet/shufflenetv2/train.py +++ b/official/cv/ShuffleNet/shufflenetv2/train.py @@ -18,9 +18,9 @@ import ast import os import time +import mindspore import mindspore.nn as nn -from mindspore import context -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore import Tensor from mindspore.communication.management import init, get_rank, get_group_size from mindspore.nn.optim.momentum import Momentum @@ -103,15 +103,15 @@ if __name__ == '__main__': set_seed(config.random_seed) if args_opt.use_pynative_mode: - context.set_context(mode=context.PYNATIVE_MODE, device_target=args_opt.platform) + mindspore.set_context(mode=1, device_target=args_opt.platform) print('mode = PYNATIVE_MODE') else: - context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.platform, save_graphs=False) + mindspore.set_context(mode=0, device_target=args_opt.platform, save_graphs=False) print('mode = GRAPH_MODE') # Set mempool block size in PYNATIVE_MODE for improving 
memory utilization, which will not take effect in GRAPH_MODE - if context.get_context("mode") == context.PYNATIVE_MODE: - context.set_context(mempool_block_size="25GB") + if mindspore.get_context("mode") == 1: + mindspore.set_context(mempool_block_size="25GB") # init distributed if args_opt.is_distributed: @@ -128,13 +128,13 @@ device_id = get_rank() config.group_size = get_group_size() - context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, + mindspore.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, device_num=config.group_size, gradients_mean=True) else: device_id = args_opt.device_id config.group_size = 1 - context.set_context(device_id=device_id) + mindspore.set_context(device_id=device_id) rank_id = device_id config.rank = rank_id print('rank_id = ', rank_id, ' group_size = ', config.group_size) diff --git a/official/cv/SwinTransformer/eval.py b/official/cv/SwinTransformer/eval.py index f26694e15847a535f5cb222e004b1c1009921b2f..9f1eaebccb321802ea634aebac47ce90e1c36f98 100644 --- a/official/cv/SwinTransformer/eval.py +++ b/official/cv/SwinTransformer/eval.py @@ -15,7 +15,6 @@ """eval""" from mindspore import Model -from mindspore import context from mindspore import nn from mindspore.common import set_seed @@ -25,18 +24,20 @@ from src.tools.criterion import get_criterion, NetWithLoss from src.tools.get_misc import get_dataset, set_device, get_model, pretrained, get_train_one_step from src.tools.optimizer import get_optimizer +import mindspore + set_seed(args.seed) def main(): mode = { - 0: context.GRAPH_MODE, - 1: context.PYNATIVE_MODE + 0: 0, + 1: 1 } - context.set_context(mode=mode[args.graph_mode], device_target=args.device_target) - context.set_context(enable_graph_kernel=False) + mindspore.set_context(mode=mode[args.graph_mode], device_target=args.device_target) + mindspore.set_context(enable_graph_kernel=False) if args.device_target == "Ascend": - context.set_context(enable_auto_mixed_precision=True) + mindspore.set_context(enable_auto_mixed_precision=True) set_device(args) # get model diff --git a/official/cv/SwinTransformer/export.py b/official/cv/SwinTransformer/export.py index 692a104e4aa4b1584e09937e0c333ea820398919..3aa941a0457db7c00ad616b6002682db75865d6e 100644 --- a/official/cv/SwinTransformer/export.py +++ b/official/cv/SwinTransformer/export.py @@ -16,9 +16,9 @@ ##############export checkpoint file into air, onnx or mindir model################# python export.py """ - import numpy as np -from mindspore import Tensor, load_checkpoint, load_param_into_net, export, context +import mindspore +from mindspore import Tensor, load_checkpoint, load_param_into_net, export from mindspore import dtype as mstype from src.args import args @@ -26,10 +26,10 @@ from src.tools.cell import cast_amp from src.tools.criterion import get_criterion, NetWithLoss from src.tools.get_misc import get_model -context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target) +mindspore.set_context(mode=0, device_target=args.device_target) if args.device_target in ["Ascend", "GPU"]: - context.set_context(device_id=args.device_id) + mindspore.set_context(device_id=args.device_id) if __name__ == '__main__': net = get_model(args)
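The get_misc.py diff that follows migrates the shared distributed-setup helper in the same way. The pattern it implements, distilled into a minimal sketch (assuming a multi-card job whose launcher has already provisioned the rank environment; `init_data_parallel` is an illustrative name, not a function from the repo):

```python
import mindspore
from mindspore import ParallelMode
from mindspore.communication.management import init, get_rank, get_group_size

def init_data_parallel(device_target="Ascend"):
    """Start the HCCL/NCCL collectives and enable data-parallel training."""
    init(backend_name="hccl" if device_target == "Ascend" else "nccl")
    mindspore.reset_auto_parallel_context()
    mindspore.set_auto_parallel_context(device_num=get_group_size(),
                                        parallel_mode=ParallelMode.DATA_PARALLEL,
                                        gradients_mean=True)
    return get_rank()
```

diff --git a/official/cv/SwinTransformer/src/tools/get_misc.py b/official/cv/SwinTransformer/src/tools/get_misc.py index 73ae63120028bca3a5f0acf10182af5861b5b849..abdf60a69f2ae2868e01896e64c1c4f830756bff 100644 --- a/official/cv/SwinTransformer/src/tools/get_misc.py +++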
b/official/cv/SwinTransformer/src/tools/get_misc.py @@ -15,10 +15,10 @@ """misc functions for program""" import os -from mindspore import context +import mindspore from mindspore import nn from mindspore.communication.management import init, get_rank -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.train.serialization import load_checkpoint, load_param_into_net from src import models, data @@ -35,25 +35,25 @@ def set_device(args): if device_target == "Ascend": if device_num > 1: - context.set_context(device_id=int(os.environ["DEVICE_ID"])) + mindspore.set_context(device_id=int(os.environ["DEVICE_ID"])) init(backend_name='hccl') - context.reset_auto_parallel_context() - context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, + mindspore.reset_auto_parallel_context() + mindspore.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True) - # context.set_auto_parallel_context(pipeline_stages=2, full_batch=True) + # mindspore.set_auto_parallel_context(pipeline_stages=2, full_batch=True) rank = get_rank() else: - context.set_context(device_id=args.device_id) + mindspore.set_context(device_id=args.device_id) elif device_target == "GPU": if device_num > 1: init(backend_name='nccl') - context.reset_auto_parallel_context() - context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, + mindspore.reset_auto_parallel_context() + mindspore.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True) rank = get_rank() else: - context.set_context(device_id=args.device_id) + mindspore.set_context(device_id=args.device_id) else: raise ValueError("Unsupported platform.") diff --git a/official/cv/SwinTransformer/train.py b/official/cv/SwinTransformer/train.py index c4174fc82d22bb973ce3cbb6e2bc5d0fa4cdcd4a..feff1525a76b8c937c459854a5f3ed2b7228f7d4 100644 --- a/official/cv/SwinTransformer/train.py +++ b/official/cv/SwinTransformer/train.py @@ -15,8 +15,8 @@ """train""" import os +import mindspore from mindspore import Model -from mindspore import context from mindspore import nn from mindspore.common import set_seed from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor @@ -33,14 +33,14 @@ def main(): assert args.crop, f"{args.arch} is only for evaluation" set_seed(args.seed) mode = { - 0: context.GRAPH_MODE, - 1: context.PYNATIVE_MODE + 0: 0, + 1: 1 } - context.set_context(mode=mode[args.graph_mode], device_target=args.device_target) + mindspore.set_context(mode=mode[args.graph_mode], device_target=args.device_target) if args.device_target == "GPU": - context.set_context(enable_graph_kernel=True) + mindspore.set_context(enable_graph_kernel=True) if args.device_target == "Ascend": - context.set_context(enable_auto_mixed_precision=True) + mindspore.set_context(enable_auto_mixed_precision=True) rank = set_device(args) # get model and cast amp_level diff --git a/official/cv/Unet/eval.py b/official/cv/Unet/eval.py index d34b80a7a132a2597425419f4964e08215415682..b5ee1ae8b64fb7eeee5593430450f605b2ea0bad 100644 --- a/official/cv/Unet/eval.py +++ b/official/cv/Unet/eval.py @@ -14,7 +14,8 @@ # ============================================================================ import logging -from mindspore import context, Model +import mindspore +from mindspore import Model from mindspore.train.serialization import load_checkpoint, load_param_into_net from 
src.data_loader import create_dataset, create_multi_class_dataset @@ -59,10 +60,10 @@ def test_net(data_dir, if __name__ == '__main__': logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s') - context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, save_graphs=False) + mindspore.set_context(mode=0, device_target=config.device_target, save_graphs=False, jit_config={"jit_level": "O2"}) if config.device_target == "Ascend": device_id = get_device_id() - context.set_context(device_id=device_id) + mindspore.set_context(device_id=device_id) test_net(data_dir=config.data_path, ckpt_path=config.checkpoint_file_path, cross_valid_ind=config.cross_valid_ind) diff --git a/official/cv/Unet/export.py b/official/cv/Unet/export.py index 300fe1c0290e95a1c759c87ee5a857b982c466f8..b32da933349afc01adcd2479867f386848a776eb 100644 --- a/official/cv/Unet/export.py +++ b/official/cv/Unet/export.py @@ -16,7 +16,8 @@ import os import numpy as np -from mindspore import Tensor, export, load_checkpoint, load_param_into_net, context +import mindspore +from mindspore import Tensor, export, load_checkpoint, load_param_into_net from src.unet_medical.unet_model import UNetMedical from src.unet_nested import NestedUNet, UNet @@ -26,9 +27,9 @@ from src.model_utils.device_adapter import get_device_id from src.model_utils.moxing_adapter import moxing_wrapper -context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target) +mindspore.set_context(mode=0, device_target=config.device_target) if config.device_target == "Ascend": - context.set_context(device_id=get_device_id()) + mindspore.set_context(device_id=get_device_id()) def modelarts_pre_process(): '''modelarts pre process function.''' diff --git a/official/cv/Unet/golden_stick/pruner/uni_pruning/eval.py b/official/cv/Unet/golden_stick/pruner/uni_pruning/eval.py index 4e87deff266c4985d51126e7a29a658ef22fb114..819449cc722e1fa61072ee49f8faa06bee8dc86c 100644 --- a/official/cv/Unet/golden_stick/pruner/uni_pruning/eval.py +++ b/official/cv/Unet/golden_stick/pruner/uni_pruning/eval.py @@ -16,7 +16,8 @@ import logging import os import json -from mindspore import context, Model +import mindspore +from mindspore import Model from mindspore.train.serialization import load_checkpoint, load_param_into_net from mindspore_gs.pruner.uni_pruning import UniPruner @@ -83,7 +84,7 @@ def test_net(data_dir, if __name__ == '__main__': logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s') - context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, save_graphs=False) + mindspore.set_context(mode=0, device_target=config.device_target, save_graphs=False) assert config.device_target == "GPU" test_net(data_dir=config.data_path, ckpt_path=config.checkpoint_file_path, diff --git a/official/cv/Unet/golden_stick/pruner/uni_pruning/train.py b/official/cv/Unet/golden_stick/pruner/uni_pruning/train.py index 0c776aeabae10cd498dfd8fd55b468c40a46b6aa..3aa899ee9607f11d9b08ecadf3c620fca549725a 100644 --- a/official/cv/Unet/golden_stick/pruner/uni_pruning/train.py +++ b/official/cv/Unet/golden_stick/pruner/uni_pruning/train.py @@ -17,10 +17,10 @@ import logging import mindspore import mindspore.nn as nn -from mindspore import Model, context +from mindspore import Model from mindspore.communication.management import init, get_rank, get_group_size from mindspore.train.callback import CheckpointConfig, ModelCheckpoint -from mindspore.context import ParallelMode +from mindspore import ParallelMode from 
mindspore.train.serialization import load_checkpoint, load_param_into_net from mindspore_gs.pruner.uni_pruning import UniPruner @@ -47,7 +47,7 @@ def train_net(cross_valid_ind=1, group_size = get_group_size() rank = get_rank() parallel_mode = ParallelMode.DATA_PARALLEL - context.set_auto_parallel_context(parallel_mode=parallel_mode, + mindspore.set_auto_parallel_context(parallel_mode=parallel_mode, device_num=group_size, gradients_mean=False) net = UNetMedical(n_channels=config.num_channels, n_classes=config.num_classes) @@ -121,7 +121,7 @@ def train_net(cross_valid_ind=1, if __name__ == '__main__': logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s') # to keep GetNext from timeout, set op_timeout=600 - context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, save_graphs=False, op_timeout=600) + mindspore.set_context(mode=0, device_target=config.device_target, save_graphs=False, op_timeout=600) assert config.device_target == "GPU" epoch_size = config.epoch_size if not config.run_distribute else config.distribute_epochs batchsize = config.batch_size diff --git a/official/cv/Unet/postprocess.py b/official/cv/Unet/postprocess.py index 67def617c84f07193552924990106ec26b61cdf8..8fd6d2da3c8f111a4501ce8e9b038a09f8ba9302 100644 --- a/official/cv/Unet/postprocess.py +++ b/official/cv/Unet/postprocess.py @@ -39,7 +39,7 @@ if __name__ == '__main__': mask = cv2.imread(os.path.join(config.data_path, f, "mask.png"), cv2.IMREAD_GRAYSCALE) mask = cv2.resize(mask, img_size) mask = mask.astype(np.float32) / 255 - mask = (mask > 0.5).astype(np.int) + mask = (mask > 0.5).astype(np.int_) mask = (np.arange(2) == mask[..., None]).astype(int) mask = mask.transpose(2, 0, 1).astype(np.float32) label = mask.reshape(1, 2, 96, 96) diff --git a/benchmark/ascend/resnet/scripts/run_eval.sh b/official/cv/Unet/scripts/run_distribute_train_msrun.sh similarity index 43% rename from benchmark/ascend/resnet/scripts/run_eval.sh rename to official/cv/Unet/scripts/run_distribute_train_msrun.sh index 97a7ba85c712c0d61b8fe480accdfa19a51fad16..b0f9d697d0b23e6eb92d18d53f5b563e30b25145 100644 --- a/benchmark/ascend/resnet/scripts/run_eval.sh +++ b/official/cv/Unet/scripts/run_distribute_train_msrun.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright 2020-2021 Huawei Technologies Co., Ltd +# Copyright 2024 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,13 +14,18 @@ # limitations under the License. # ============================================================================ -if [ $# != 3 ] -then - echo "Usage: bash run_eval.sh [DATASET_PATH] [CHECKPOINT_PATH] [CONFIG_PATH]" -exit 1 +if [ $# != 2 ] +then + echo "==============================================================================================================" + echo "Usage: bash scripts/run_distribute_train_msrun.sh [DATASET] [CONFIG_PATH]" + echo "Please run the script as: " + echo "bash scripts/run_distribute_train_msrun.sh [DATASET] [CONFIG_PATH]" + echo "for example: bash run_distribute_train_msrun.sh /absolute/path/to/data /absolute/path/to/config" + echo "==============================================================================================================" + exit 1 fi -get_real_path(){ +get_real_path() { if [ "${1:0:1}" == "/" ]; then echo "$1" else @@ -28,40 +33,12 @@ get_real_path(){ fi } -PATH1=$(get_real_path $1) -PATH2=$(get_real_path $2) -CONFIG_FILE=$(get_real_path $3) - - -if [ ! 
-d $PATH1 ] -then - echo "error: DATASET_PATH=$PATH1 is not a directory" -exit 1 -fi - -if [ ! -f $PATH2 ] -then - echo "error: CHECKPOINT_PATH=$PATH2 is not a file" -exit 1 -fi - +DATASET=$(get_real_path $1) +CONFIG_PATH=$(get_real_path $2) ulimit -u unlimited -export DEVICE_NUM=1 -export DEVICE_ID=0 -export RANK_SIZE=$DEVICE_NUM -export RANK_ID=0 - -if [ -d "eval" ]; -then - rm -rf ./eval -fi -mkdir ./eval -cp ../*.py ./eval -cp *.sh ./eval -cp -r ../config/*.yaml ./eval -cp -r ../src ./eval -cd ./eval || exit env > env.log -echo "start evaluation for device $DEVICE_ID" -python eval.py --data_path=$PATH1 --checkpoint_file_path=$PATH2 --config_path=$CONFIG_FILE &> log & -cd .. + +echo "start training" +msrun --bind_core=True --worker_num=8 --local_worker_num=8 --master_port 8118 \ + --log_dir=msrun_log --join=True --cluster_time_out=300 \ + train.py --run_distribute=True --data_path=$DATASET --config_path=$CONFIG_PATH --output_path './output' &> log.txt & diff --git a/official/cv/Unet/src/data_loader.py b/official/cv/Unet/src/data_loader.py index f2fc3eb15e81f670d24a3e4006ba16b63d98398c..e90863f6622a4866c393464a046a1d7b193ef98b 100644 --- a/official/cv/Unet/src/data_loader.py +++ b/official/cv/Unet/src/data_loader.py @@ -32,7 +32,7 @@ def _load_multipage_tiff(path): def _get_val_train_indices(length, fold, ratio=0.8): assert 0 < ratio <= 1, "Train/total data ratio must be in range (0.0, 1.0]" np.random.seed(0) - indices = np.arange(0, length, 1, dtype=np.int) + indices = np.arange(0, length, 1, dtype=np.int_) np.random.shuffle(indices) if fold is not None: @@ -49,7 +49,7 @@ def data_post_process(img, mask): img = np.expand_dims(img, axis=0) - mask = (mask > 0.5).astype(np.int) + mask = (mask > 0.5).astype(np.int_) mask = (np.arange(mask.max() + 1) == mask[..., None]).astype(int) mask = mask.transpose(2, 0, 1).astype(np.float32) return img, mask @@ -238,9 +238,9 @@ def preprocess_img_mask(img, mask, num_classes, img_size, augment=False, eval_re img = img.transpose(2, 0, 1) if num_classes == 2: mask = mask.astype(np.float32) / mask.max() - mask = (mask > 0.5).astype(np.int) + mask = (mask > 0.5).astype(np.int_) else: - mask = mask.astype(np.int) + mask = mask.astype(np.int_) mask = (np.arange(num_classes) == mask[..., None]).astype(int) mask = mask.transpose(2, 0, 1).astype(np.float32) return img, mask diff --git a/official/cv/Unet/src/model_utils/moxing_adapter.py b/official/cv/Unet/src/model_utils/moxing_adapter.py index aabd5ac6cf1bde3ca20f3d6ea9cf3d5310169f1e..a5337e6885a3dc69ef76387c6ffc6286ac36df91 100644 --- a/official/cv/Unet/src/model_utils/moxing_adapter.py +++ b/official/cv/Unet/src/model_utils/moxing_adapter.py @@ -17,8 +17,8 @@ import os import functools -from mindspore import context from src.model_utils.config import config +import mindspore _global_sync_count = 0 @@ -92,7 +92,7 @@ def moxing_wrapper(pre_process=None, post_process=None): sync_data(config.train_url, config.output_path) print("Workspace downloaded: ", os.listdir(config.output_path)) - context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) + mindspore.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) config.device_num = get_device_num() config.device_id = get_device_id() if not os.path.exists(config.output_path):
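The data_loader.py hunk above, like postprocess.py before it, replaces `np.int` with `np.int_`: the former was only an alias for the builtin `int`, deprecated in NumPy 1.20 and removed in 1.24, so the old casts raise `AttributeError` on a current NumPy. A small sketch of the equivalent usage:

```python
import numpy as np

# np.int was merely an alias for the builtin int and is gone since NumPy 1.24;
# np.int_ is the actual NumPy scalar type that keeps the old dtype semantics.
mask = np.random.rand(4, 4)
binary = (mask > 0.5).astype(np.int_)          # previously .astype(np.int)
indices = np.arange(0, 16, 1, dtype=np.int_)   # previously dtype=np.int
```

diff --git a/official/cv/Unet/train.py b/official/cv/Unet/train.py index 37143d3a4f6393774003dc618da43cc0125ee214..ecfeb72c38f5ba0e931b5d82b8c421299d3d71b3 100644 ---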
a/official/cv/Unet/train.py +++ b/official/cv/Unet/train.py @@ -17,10 +17,10 @@ import logging import mindspore import mindspore.nn as nn -from mindspore import Model, context +from mindspore import Model from mindspore.communication.management import init, get_rank, get_group_size from mindspore.train.callback import CheckpointConfig, ModelCheckpoint -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.train.serialization import load_checkpoint, load_param_into_net from src.unet_medical import UNetMedical @@ -50,9 +50,7 @@ def train_net(cross_valid_ind=1, group_size = get_group_size() rank = get_rank() parallel_mode = ParallelMode.DATA_PARALLEL - context.set_auto_parallel_context(parallel_mode=parallel_mode, - device_num=group_size, - gradients_mean=False) + mindspore.set_auto_parallel_context(parallel_mode=parallel_mode, device_num=group_size, gradients_mean=False) need_slice = False if config.model_name == 'unet_medical': net = UNetMedical(n_channels=config.num_channels, n_classes=config.num_classes) @@ -130,10 +128,11 @@ def train_net(cross_valid_ind=1, if __name__ == '__main__': logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s') # to keep GetNext from timeout, set op_timeout=600 - context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, save_graphs=False, op_timeout=600) + mindspore.set_context(mode=0, device_target=config.device_target, save_graphs=False, op_timeout=600, + jit_config={"jit_level": "O2"}) if config.device_target == "Ascend": device_id = get_device_id() - context.set_context(device_id=device_id) + mindspore.set_context(device_id=device_id) epoch_size = config.epochs if not config.run_distribute else config.distribute_epochs batchsize = config.batch_size if config.device_target == 'GPU' and config.run_distribute: diff --git a/official/cv/VGG/vgg16/eval.py b/official/cv/VGG/vgg16/eval.py index f179ad24207d7eeb9987d15226d052da76641184..8c436f925ccc248937ad538642d0afe8eb0ddfe4 100644 --- a/official/cv/VGG/vgg16/eval.py +++ b/official/cv/VGG/vgg16/eval.py @@ -20,7 +20,8 @@ import glob import numpy as np import mindspore.nn as nn -from mindspore import Tensor, context +import mindspore +from mindspore import Tensor from mindspore.communication.management import init, get_rank, get_group_size from mindspore.train.model import Model from mindspore.train.serialization import load_checkpoint, load_param_into_net @@ -139,10 +140,10 @@ def run_eval(): config.group_size = get_device_num() _enable_graph_kernel = config.device_target == "GPU" - context.set_context(mode=context.GRAPH_MODE, enable_graph_kernel=_enable_graph_kernel, + mindspore.set_context(mode=0, enable_graph_kernel=_enable_graph_kernel, device_target=config.device_target, save_graphs=False) if os.getenv('DEVICE_ID', "not_set").isdigit() and config.device_target == "Ascend": - context.set_context(device_id=int(os.getenv('DEVICE_ID'))) + mindspore.set_context(device_id=int(os.getenv('DEVICE_ID'))) config.outputs_dir = os.path.join(config.log_path, datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S')) diff --git a/official/cv/VGG/vgg16/export.py b/official/cv/VGG/vgg16/export.py index c0dc7fe0b139fcac9c6be221d415f982b873a854..2ab51df3faccbbe66abd7e59f6f88785b124fe38 100644 --- a/official/cv/VGG/vgg16/export.py +++ b/official/cv/VGG/vgg16/export.py @@ -16,7 +16,8 @@ import os import numpy as np -from mindspore import Tensor, context +import mindspore +from mindspore import Tensor import mindspore.common.dtype as mstype from 
mindspore.train.serialization import load_checkpoint, export @@ -35,10 +36,10 @@ def modelarts_pre_process(): @moxing_wrapper(pre_process=modelarts_pre_process) def run_export(): - context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target) + mindspore.set_context(mode=0, device_target=config.device_target) if config.device_target == "Ascend": config.device_id = get_device_id() - context.set_context(device_id=config.device_id) + mindspore.set_context(device_id=config.device_id) if config.dataset == "cifar10": net = vgg16(num_classes=config.num_classes, args=config) diff --git a/official/cv/VGG/vgg16/fine_tune.py b/official/cv/VGG/vgg16/fine_tune.py index 881264fb394e9f9fed75d6bc720f7df0471b18fc..4dce2b341197987d1d52140f9e86c4be59a78804 100644 --- a/official/cv/VGG/vgg16/fine_tune.py +++ b/official/cv/VGG/vgg16/fine_tune.py @@ -13,7 +13,7 @@ # limitations under the License. # ============================================================================ -import mindspore as ms +import mindspore import mindspore.nn as nn from mindspore.train import Model from mindspore.train.callback import LossMonitor, TimeMonitor @@ -21,8 +21,8 @@ from model_utils.config import get_config from src.vgg import Vgg from src.dataset import create_dataset -ms.set_context(mode=ms.GRAPH_MODE, device_target="CPU", save_graphs=False) -ms.set_seed(21) +mindspore.set_context(mode=0, device_target="CPU", save_graphs=False) +mindspore.set_seed(21) def import_data(train_dataset_path="./datasets/train/", eval_dataset_path="./datasets/test/", batch_size=32): @@ -78,7 +78,7 @@ def init_weight(net, param_dict): has_trained_epoch = int(param_dict["epoch_num"].data.asnumpy()) has_trained_step = int(param_dict["step_num"].data.asnumpy()) - ms.load_param_into_net(net, param_dict) + mindspore.load_param_into_net(net, param_dict) print("has_trained_epoch:", has_trained_epoch) print("has_trained_step:", has_trained_step) return has_trained_epoch, has_trained_step @@ -114,8 +114,8 @@ def eval_net(model_config, checkpoint_path='./vgg16.ckpt', net.classifier[6] = head # load checkpoint - param_dict = ms.load_checkpoint(checkpoint_path) - ms.load_param_into_net(net, param_dict) + param_dict = mindspore.load_checkpoint(checkpoint_path) + mindspore.load_param_into_net(net, param_dict) net.set_train(False) # define loss @@ -123,7 +123,7 @@ def eval_net(model_config, checkpoint_path='./vgg16.ckpt', loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean') # define model - model = ms.Model(net, loss_fn=loss, metrics={'top_1_accuracy', 'top_5_accuracy'}) + model = mindspore.Model(net, loss_fn=loss, metrics={'top_1_accuracy', 'top_5_accuracy'}) # eval step res = model.eval(data_val) @@ -176,7 +176,7 @@ def finetune_train(model_config, eval_dataset_path=eval_dataset_path, batch_size=batch_size) - ckpt_param_dict = ms.load_checkpoint(finetune_checkpoint_path) + ckpt_param_dict = mindspore.load_checkpoint(finetune_checkpoint_path) net = Vgg(cfg['16'], num_classes=1000, args=model_config, batch_norm=True) init_weight(net=net, param_dict=ckpt_param_dict) print("net parameter:") @@ -210,7 +210,7 @@ def finetune_train(model_config, # do training model.train(num_epochs, dataset_train, callbacks=callbacks, dataset_sink_mode=True) - ms.save_checkpoint(net, save_checkpoint_path) + mindspore.save_checkpoint(net, save_checkpoint_path) if __name__ == '__main__': diff --git a/official/cv/VGG/vgg16/model_utils/moxing_adapter.py b/official/cv/VGG/vgg16/model_utils/moxing_adapter.py index 
e6e15074e91e53e91470f59cf1ec283056fe7a3e..953ec5214d7dc666ed93fdb9e0fd7b8b17d39d31 100644 --- a/official/cv/VGG/vgg16/model_utils/moxing_adapter.py +++ b/official/cv/VGG/vgg16/model_utils/moxing_adapter.py @@ -17,8 +17,8 @@ import os import functools -from mindspore import context from .config import get_config +import mindspore config = get_config() @@ -94,7 +94,7 @@ def moxing_wrapper(pre_process=None, post_process=None): sync_data(config.train_url, config.output_path) print("Workspace downloaded: ", os.listdir(config.output_path)) - context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) + mindspore.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) config.device_num = get_device_num() config.device_id = get_device_id() if not os.path.exists(config.output_path): diff --git a/official/cv/VGG/vgg16/modelarts/start.py b/official/cv/VGG/vgg16/modelarts/start.py index 23f345dc1e9397b9c1b172941dd4b3910bd095e2..56b61d8b04d100e49988aabf569784ab2f52dd52 100644 --- a/official/cv/VGG/vgg16/modelarts/start.py +++ b/official/cv/VGG/vgg16/modelarts/start.py @@ -21,15 +21,15 @@ import time import numpy as np import moxing as mox +import mindspore import mindspore.nn as nn from mindspore import Tensor -from mindspore import context import mindspore.common.dtype as mstype from mindspore.communication.management import init, get_rank, get_group_size from mindspore.nn.optim.momentum import Momentum from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor from mindspore.train.model import Model -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.train.serialization import load_param_into_net, load_checkpoint, export from mindspore.train.loss_scale_manager import FixedLossScaleManager from mindspore.common import set_seed @@ -125,10 +125,10 @@ def _get_last_ckpt(ckpt_dir): def run_export(ckpt_dir): ckpt_file = _get_last_ckpt(ckpt_dir) - context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target) + mindspore.set_context(mode=0, device_target=config.device_target) if config.device_target == "Ascend": config.device_id = get_device_id() - context.set_context(device_id=config.device_id) + mindspore.set_context(device_id=config.device_id) if config.dataset == "cifar10": net = vgg16(num_classes=config.num_classes, args=config) @@ -150,7 +150,7 @@ def run_train(): config.per_batch_size = config.batch_size _enable_graph_kernel = config.device_target == "GPU" - context.set_context(mode=context.GRAPH_MODE, + mindspore.set_context(mode=0, enable_graph_kernel=_enable_graph_kernel, device_target=config.device_target) config.rank = get_rank_id() config.device_id = get_device_id() @@ -159,7 +159,7 @@ def run_train(): if config.is_distributed: if config.device_target == "Ascend": init() - context.set_context(device_id=config.device_id) + mindspore.set_context(device_id=config.device_id) elif config.device_target == "GPU": if not config.enable_modelarts: init() @@ -168,12 +168,12 @@ def run_train(): init() device_num = config.group_size - context.reset_auto_parallel_context() - context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, + mindspore.reset_auto_parallel_context() + mindspore.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, all_reduce_fusion_config=[2, 18]) else: if config.device_target == "Ascend": - 
context.set_context(device_id=config.device_id) + mindspore.set_context(device_id=config.device_id) # select for master rank save ckpt or all rank save, compatible for model parallel config.rank_save_ckpt_flag = 0 diff --git a/official/cv/VGG/vgg16/src/data_split.py b/official/cv/VGG/vgg16/src/data_split.py index d643942dac48796a30c831989bd2f7c1900ce357..d93792e14da2d6226735e8827d25c22bd03630fc 100644 --- a/official/cv/VGG/vgg16/src/data_split.py +++ b/official/cv/VGG/vgg16/src/data_split.py @@ -16,7 +16,7 @@ import os import shutil import multiprocessing -import mindspore as ms +import mindspore import mindspore.dataset as ds @@ -72,7 +72,7 @@ def create_dataset(dataset_path, do_train, batch_size=32, train_image_size=224, ] trans_norm = [ds.vision.Normalize(mean=mean, std=std), ds.vision.HWC2CHW()] - type_cast_op = ds.transforms.TypeCast(ms.int32) + type_cast_op = ds.transforms.TypeCast(mindspore.int32) trans_work_num = 24 data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=get_num_parallel_workers(trans_work_num)) diff --git a/official/cv/VGG/vgg16/src/dataset.py b/official/cv/VGG/vgg16/src/dataset.py index 6f1be68c49d734960191049491ca72572666cbc0..420e728e9b03e8bb3e38cc682faa26989e8a2d2a 100644 --- a/official/cv/VGG/vgg16/src/dataset.py +++ b/official/cv/VGG/vgg16/src/dataset.py @@ -18,7 +18,7 @@ dataset processing. import os import multiprocessing from PIL import Image, ImageFile -import mindspore as ms +import mindspore from mindspore.common import dtype as mstype import mindspore.dataset as de import mindspore.dataset.transforms as C @@ -214,7 +214,7 @@ def create_dataset(dataset_path, do_train, batch_size=32, train_image_size=224, ] trans_norm = [de.vision.Normalize(mean=mean, std=std), de.vision.HWC2CHW()] - type_cast_op = de.transforms.TypeCast(ms.int32) + type_cast_op = de.transforms.TypeCast(mindspore.int32) trans_work_num = 24 data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=get_num_parallel_workers(trans_work_num)) diff --git a/official/cv/VGG/vgg16/train.py b/official/cv/VGG/vgg16/train.py index eddb70c3f1e0ac7e3a53cf9ae624333e69163fce..abb1ab61583934e97cf0c1c8849cf9cbfd65531f 100644 --- a/official/cv/VGG/vgg16/train.py +++ b/official/cv/VGG/vgg16/train.py @@ -19,14 +19,14 @@ import datetime import os import time +import mindspore import mindspore.nn as nn from mindspore import Tensor -from mindspore import context from mindspore.communication.management import init, get_rank, get_group_size from mindspore.nn.optim.momentum import Momentum from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor from mindspore.train.model import Model -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.train.serialization import load_param_into_net, load_checkpoint from mindspore.train.loss_scale_manager import FixedLossScaleManager from mindspore.common import set_seed @@ -120,7 +120,7 @@ def run_train(): config.per_batch_size = config.batch_size _enable_graph_kernel = config.device_target == "GPU" - context.set_context(mode=context.GRAPH_MODE, + mindspore.set_context(mode=0, enable_graph_kernel=_enable_graph_kernel, device_target=config.device_target) config.rank = get_rank_id() config.device_id = get_device_id() @@ -129,7 +129,7 @@ def run_train(): if config.is_distributed: if config.device_target == "Ascend": init() - context.set_context(device_id=config.device_id) + mindspore.set_context(device_id=config.device_id) elif 
config.device_target == "GPU": if not config.enable_modelarts: init() @@ -138,12 +138,12 @@ def run_train(): init() device_num = config.group_size - context.reset_auto_parallel_context() - context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, + mindspore.reset_auto_parallel_context() + mindspore.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, all_reduce_fusion_config=[15, 18]) else: if config.device_target == "Ascend": - context.set_context(device_id=config.device_id) + mindspore.set_context(device_id=config.device_id) # select for master rank save ckpt or all rank save, compatible for model parallel config.rank_save_ckpt_flag = 0 diff --git a/official/cv/VGG/vgg19/eval.py b/official/cv/VGG/vgg19/eval.py index 0eed89da48439590a795c31afa963f0680e28621..e2a6cba6b9a257ab39c1ea8feee647c6872a3046 100644 --- a/official/cv/VGG/vgg19/eval.py +++ b/official/cv/VGG/vgg19/eval.py @@ -20,7 +20,8 @@ import glob import numpy as np import mindspore.nn as nn -from mindspore import Tensor, context +import mindspore +from mindspore import Tensor from mindspore.communication.management import init, get_rank, get_group_size from mindspore.nn.optim.momentum import Momentum from mindspore.train.model import Model @@ -135,10 +136,10 @@ def run_eval(): _enable_graph_kernel = config.device_target == "GPU" - context.set_context(mode=context.GRAPH_MODE, enable_graph_kernel=_enable_graph_kernel, + mindspore.set_context(mode=0, enable_graph_kernel=_enable_graph_kernel, device_target=config.device_target, save_graphs=False) if os.getenv('DEVICE_ID', "not_set").isdigit() and config.device_target == "Ascend": - context.set_context(device_id=int(os.getenv('DEVICE_ID'))) + mindspore.set_context(device_id=int(os.getenv('DEVICE_ID'))) config.outputs_dir = os.path.join(config.log_path, datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S')) diff --git a/official/cv/VGG/vgg19/export.py b/official/cv/VGG/vgg19/export.py index f008d9984aa9624c874891f80c878b956b9ff807..3a5d9f014529c0b44c619529efc2896ba63201b6 100644 --- a/official/cv/VGG/vgg19/export.py +++ b/official/cv/VGG/vgg19/export.py @@ -16,7 +16,8 @@ import os import numpy as np -from mindspore import Tensor, context +import mindspore +from mindspore import Tensor import mindspore.common.dtype as mstype from mindspore.train.serialization import load_checkpoint, export @@ -37,10 +38,10 @@ def run_export(): '''run_export function.''' config.image_size = list(map(int, config.image_size.split(','))) - context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target) + mindspore.set_context(mode=0, device_target=config.device_target) if config.device_target == "Ascend": config.device_id = get_device_id() - context.set_context(device_id=config.device_id) + mindspore.set_context(device_id=config.device_id) if config.dataset == "cifar10": net = vgg19(num_classes=config.num_classes, args=config) diff --git a/official/cv/VGG/vgg19/model_utils/moxing_adapter.py b/official/cv/VGG/vgg19/model_utils/moxing_adapter.py index e6e15074e91e53e91470f59cf1ec283056fe7a3e..953ec5214d7dc666ed93fdb9e0fd7b8b17d39d31 100644 --- a/official/cv/VGG/vgg19/model_utils/moxing_adapter.py +++ b/official/cv/VGG/vgg19/model_utils/moxing_adapter.py @@ -17,8 +17,8 @@ import os import functools -from mindspore import context from .config import get_config +import mindspore config = get_config() @@ -94,7 +94,7 @@ def moxing_wrapper(pre_process=None, post_process=None): 
sync_data(config.train_url, config.output_path) print("Workspace downloaded: ", os.listdir(config.output_path)) - context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) + mindspore.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) config.device_num = get_device_num() config.device_id = get_device_id() if not os.path.exists(config.output_path): diff --git a/official/cv/VGG/vgg19/modelarts/train_modelarts.py b/official/cv/VGG/vgg19/modelarts/train_modelarts.py index 32d5e96f8e5d4c6c017f108a61fcf13dfd045761..5084a60813dbe8632b7435dbf6bb360b137aab55 100644 --- a/official/cv/VGG/vgg19/modelarts/train_modelarts.py +++ b/official/cv/VGG/vgg19/modelarts/train_modelarts.py @@ -19,14 +19,14 @@ import datetime import os import time +import mindspore import mindspore.nn as nn from mindspore import Tensor -from mindspore import context from mindspore.communication.management import init, get_rank, get_group_size from mindspore.nn.optim.momentum import Momentum from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor from mindspore.train.model import Model -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.train.serialization import load_param_into_net, load_checkpoint, export from mindspore.train.loss_scale_manager import FixedLossScaleManager from mindspore.common import set_seed @@ -133,7 +133,7 @@ def run_train(): config.per_batch_size = config.batch_size _enable_graph_kernel = config.device_target == "GPU" - context.set_context(mode=context.GRAPH_MODE, + mindspore.set_context(mode=0, enable_graph_kernel=_enable_graph_kernel, device_target=config.device_target) config.device_id = get_device_id() @@ -141,7 +141,7 @@ def run_train(): if config.is_distributed: if config.device_target == "Ascend": init() - context.set_context(device_id=config.device_id) + mindspore.set_context(device_id=config.device_id) elif config.device_target == "GPU": if not config.enable_modelarts: init() @@ -151,12 +151,12 @@ def run_train(): config.rank = get_rank() config.group_size = get_group_size() device_num = config.group_size - context.reset_auto_parallel_context() - context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, + mindspore.reset_auto_parallel_context() + mindspore.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, all_reduce_fusion_config=[2, 18]) else: if config.device_target == "Ascend": - context.set_context(device_id=config.device_id) + mindspore.set_context(device_id=config.device_id) # select for master rank save ckpt or all rank save, compatible for model parallel config.rank_save_ckpt_flag = 0 diff --git a/official/cv/VGG/vgg19/train.py b/official/cv/VGG/vgg19/train.py index 49b8a620f3313fcd90169faf9a2ef7e013ddc024..4384d8a193adf50ea80e9493a398b7392b2bb861 100644 --- a/official/cv/VGG/vgg19/train.py +++ b/official/cv/VGG/vgg19/train.py @@ -19,14 +19,14 @@ import datetime import os import time +import mindspore import mindspore.nn as nn from mindspore import Tensor -from mindspore import context from mindspore.communication.management import init, get_rank, get_group_size from mindspore.nn.optim.momentum import Momentum from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor from mindspore.train.model import Model -from mindspore.context import ParallelMode +from mindspore import ParallelMode from 
mindspore.train.serialization import load_param_into_net, load_checkpoint from mindspore.train.loss_scale_manager import FixedLossScaleManager from mindspore.common import set_seed @@ -121,7 +121,7 @@ def run_train(): config.per_batch_size = config.batch_size _enable_graph_kernel = config.device_target == "GPU" - context.set_context(mode=context.GRAPH_MODE, + mindspore.set_context(mode=0, enable_graph_kernel=_enable_graph_kernel, device_target=config.device_target) config.device_id = get_device_id() @@ -129,7 +129,7 @@ def run_train(): if config.is_distributed: if config.device_target == "Ascend": init() - context.set_context(device_id=config.device_id) + mindspore.set_context(device_id=config.device_id) elif config.device_target == "GPU": if not config.enable_modelarts: init() @@ -139,12 +139,12 @@ def run_train(): init() config.rank = get_rank() config.group_size = get_group_size() device_num = config.group_size - context.reset_auto_parallel_context() - context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, + mindspore.reset_auto_parallel_context() + mindspore.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, all_reduce_fusion_config=[2, 18]) else: if config.device_target == "Ascend": - context.set_context(device_id=config.device_id) + mindspore.set_context(device_id=config.device_id) # select for master rank save ckpt or all rank save, compatible for model parallel config.rank_save_ckpt_flag = 0 diff --git a/official/cv/VIT/README.md b/official/cv/VIT/README.md index 0e1cab66284514e165830ba508b45eec401fb75f..d018a3aa0e6a0aad4a1a7909c6256cd7781b2920 100644 --- a/official/cv/VIT/README.md +++ b/official/cv/VIT/README.md @@ -395,7 +395,7 @@ Current batch_ Size can only be set to 1. Before running the command below, you should modify the config file. The items you should modify are batch_size and val_data_path. - Inference result will be stored in the example path, you can find result like the followings in acc.log. + Inference result will be stored in the example path, you can find result like the following in acc.log. ```shell cd scripts @@ -458,8 +458,8 @@ If you need to use the trained model to perform inference on multiple hardware p lrs = ... ... # Set context - context.set_context(mode=context.GRAPH_HOME, device_target=args.device_target) - context.set_context(device_id=args.device_id) + mindspore.set_context(mode=0, device_target=args.device_target) + mindspore.set_context(device_id=args.device_id) # Load unseen dataset for inference dataset = dataset.create_dataset(args.data_path, 1, False)
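The README fragment above outlines device setup for standalone inference with a trained checkpoint. As a self-contained sketch (assuming an `args` namespace carrying `device_target`, `device_id` and `data_path`, plus the repo-local `src.dataset` module; the model- and checkpoint-loading half of the snippet is elided in the README and stays elided here):

```python
import mindspore
from src import dataset  # repo-local module used by the README snippet

def setup_for_inference(args):
    # mode=0 is GRAPH_MODE; device_id pins the process to a single card.
    mindspore.set_context(mode=0, device_target=args.device_target)
    mindspore.set_context(device_id=args.device_id)
    # Load the unseen dataset for inference (arguments as in the README snippet).
    return dataset.create_dataset(args.data_path, 1, False)
```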
diff --git a/official/cv/VIT/README_CN.md b/official/cv/VIT/README_CN.md index 06b2267e28c3c2edab315c1f972f9bbe24db046e..31cfa9afd462f80d245baad3435269f0ba236611 100644 --- a/official/cv/VIT/README_CN.md +++ b/official/cv/VIT/README_CN.md @@ -461,8 +461,8 @@ python export.py --config_path=[CONFIG_PATH] lrs = ... ... # 设置上下文 - context.set_context(mode=context.GRAPH_HOME, device_target=args.device_target) - context.set_context(device_id=args.device_id) + mindspore.set_context(mode=0, device_target=args.device_target) + mindspore.set_context(device_id=args.device_id) # 加载未知数据集进行推理 dataset = dataset.create_dataset(args.data_path, 1, False) diff --git a/official/cv/VIT/eval.py b/official/cv/VIT/eval.py index 7dae46f0ead224d4da2fea7a6d052a6779734e46..25ba24ddd3c2823564a76cb127b7e388bdbfc478 100644 --- a/official/cv/VIT/eval.py +++ b/official/cv/VIT/eval.py @@ -17,7 +17,7 @@ import os import numpy as np -import mindspore as ms +import mindspore from mindspore.train.model import Model, ParallelMode from mindspore.communication.management import init from mindspore.profiler.profiling import Profiler @@ -72,17 +72,17 @@ def eval_net(): np.random.seed(args.seed) args.logger = get_logger(args.save_checkpoint_path, rank=local_rank) - ms.set_context(device_id=device_id, - mode=ms.GRAPH_MODE, + mindspore.set_context(device_id=device_id, + mode=0, device_target="Ascend", save_graphs=False) if args.auto_tune: - ms.set_context(auto_tune_mode='GA') + mindspore.set_context(auto_tune_mode='GA') elif args.device_num == 1: pass else: - ms.set_auto_parallel_context(device_num=device_num, + mindspore.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True) @@ -97,7 +97,7 @@ def eval_net(): net = get_network(backbone_name=args.backbone, args=args) if os.path.isfile(args.pretrained): - ms.load_checkpoint(args.pretrained, net, strict_load=False) + mindspore.load_checkpoint(args.pretrained, net, strict_load=False) # evaluation dataset eval_dataset = get_dataset(dataset_name=args.dataset_name, diff --git a/official/cv/VIT/export.py b/official/cv/VIT/export.py index 6d4703c16d1f38bb8c9fe615d570f5579b8aeed2..632920610c8ade3702d1f7517835eebe341953cc 100644 --- a/official/cv/VIT/export.py +++ b/official/cv/VIT/export.py @@ -18,14 +18,14 @@ python export.py """ import os -import mindspore as ms +import mindspore from src.model_utils.config import config from src.model_utils.moxing_adapter import moxing_wrapper from src.vit import get_network -ms.set_context(mode=ms.GRAPH_MODE, device_target=config.device_target) +mindspore.set_context(mode=0, device_target=config.device_target) if config.device_target == "Ascend": - ms.set_context(device_id=config.device_id) + mindspore.set_context(device_id=config.device_id) def modelarts_pre_process(): '''modelarts pre process function.''' @@ -38,14 +38,14 @@ def run_export(): assert config.pretrained is not None, "checkpoint_path is None."
- param_dict = ms.load_checkpoint(config.pretrained) - ms.load_param_into_net(net, param_dict) + param_dict = mindspore.load_checkpoint(config.pretrained) + mindspore.load_param_into_net(net, param_dict) config.height = config.train_image_size config.width = config.train_image_size - input_arr = ms.numpy.zeros([config.batch_size, 3, config.height, config.width], ms.float32) - ms.export(net, input_arr, file_name=config.file_name, file_format=config.file_format) + input_arr = mindspore.numpy.zeros([config.batch_size, 3, config.height, config.width], mindspore.float32) + mindspore.export(net, input_arr, file_name=config.file_name, file_format=config.file_format) if __name__ == '__main__': run_export() diff --git a/official/cv/VIT/modelarts/train_modelarts.py b/official/cv/VIT/modelarts/train_modelarts.py index c3af183f65adcc0a4fa90415af8a5316da200b33..d3ecd6d4ca636e34d4aa7d53b828747af2093f76 100644 --- a/official/cv/VIT/modelarts/train_modelarts.py +++ b/official/cv/VIT/modelarts/train_modelarts.py @@ -21,7 +21,7 @@ import socket import glob import numpy as np import moxing as mox -import mindspore as ms +import mindspore from mindspore import Tensor from mindspore.train.model import Model, ParallelMode from mindspore.train.callback import ModelCheckpoint, CheckpointConfig @@ -84,11 +84,11 @@ def filter_checkpoint_parameter_by_list(origin_dict, param_filter): def frozen_to_air(network, args): - param_dict_t = ms.load_checkpoint(args.get("ckpt_file")) - ms.load_param_into_net(network, param_dict_t) + param_dict_t = mindspore.load_checkpoint(args.get("ckpt_file")) + mindspore.load_param_into_net(network, param_dict_t) input_arr = Tensor(np.random.uniform(0.0, 1.0, size=[args.get("batch_size"), 3, args.get("width"), \ - args.get("height")]), ms.float32) - ms.export(network, input_arr, file_name=args.get("file_name"), file_format=args.get("file_format")) + args.get("height")]), mindspore.float32) + mindspore.export(network, input_arr, file_name=args.get("file_name"), file_format=args.get("file_format")) if __name__ == '__main__': @@ -113,16 +113,16 @@ if __name__ == '__main__': config.batch_size = config.batch_size config.dataset_path = os.path.join(config.data_path, "train") - ms.set_context(device_id=device_id, - mode=ms.GRAPH_MODE, + mindspore.set_context(device_id=device_id, + mode=0, device_target="Ascend", save_graphs=False) if args_opt.auto_tune: - ms.set_context(auto_tune_mode='GA') + mindspore.set_context(auto_tune_mode='GA') elif args_opt.device_num == 1: pass else: - ms.set_auto_parallel_context(device_num=device_num, + mindspore.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True) @@ -151,10 +151,10 @@ if __name__ == '__main__': print("warning!!!, no split point") if os.path.isfile(config.ckpt_path): - ckpt = ms.load_checkpoint(config.ckpt_path) + ckpt = mindspore.load_checkpoint(config.ckpt_path) filter_list = [x.name for x in net.head.get_parameters()] filter_checkpoint_parameter_by_list(ckpt, filter_list) - ms.load_param_into_net(net, ckpt) + mindspore.load_param_into_net(net, ckpt) # loss if not args_opt.use_label_smooth: diff --git a/official/cv/VIT/src/autoaugment.py b/official/cv/VIT/src/autoaugment.py index 737e1945a8551c3cd99f8886dce7f20ee02555e1..fc5426f4f29152527f94982bda53df44ea11265c 100644 --- a/official/cv/VIT/src/autoaugment.py +++ b/official/cv/VIT/src/autoaugment.py @@ -207,7 +207,7 @@ class SubPolicy(): "translateY": np.linspace(0, 150 / 331, 10), "rotate": np.linspace(0, 30, 10), "color": np.linspace(0.0, 
0.9, 10), - "posterize": np.round(np.linspace(8, 4, 10), 0).astype(np.int), + "posterize": np.round(np.linspace(8, 4, 10), 0).astype(np.int_), "solarize": np.linspace(256, 0, 10), "contrast": np.linspace(0.0, 0.9, 10), "sharpness": np.linspace(0.0, 0.9, 10), diff --git a/official/cv/VIT/src/cross_entropy.py b/official/cv/VIT/src/cross_entropy.py index 8fc5953d75d8e5ed08d7306eca5430ff150a95ae..aa5c29cf1917af2588dbe5f7ae299903b679ffee 100644 --- a/official/cv/VIT/src/cross_entropy.py +++ b/official/cv/VIT/src/cross_entropy.py @@ -14,7 +14,7 @@ # ============================================================================ """loss functions""" -import mindspore as ms +import mindspore import mindspore.ops as ops from mindspore import nn from mindspore import Tensor @@ -29,8 +29,8 @@ class CrossEntropySmooth(Loss): self.aux_factor = aux_factor self.onehot = ops.OneHot() self.sparse = sparse - self.on_value = Tensor(1.0 - smooth_factor, ms.float32) - self.off_value = Tensor(1.0 * smooth_factor / (num_classes - 1), ms.float32) + self.on_value = Tensor(1.0 - smooth_factor, mindspore.float32) + self.off_value = Tensor(1.0 * smooth_factor / (num_classes - 1), mindspore.float32) self.ce = nn.SoftmaxCrossEntropyWithLogits(reduction=reduction) def construct(self, logits, label): @@ -52,14 +52,14 @@ class CrossEntropySmoothMixup(Loss): """CrossEntropy""" def __init__(self, reduction='mean', smooth_factor=0., num_classes=1000): super().__init__() - self.on_value = Tensor(1.0 - smooth_factor, ms.float32) + self.on_value = Tensor(1.0 - smooth_factor, mindspore.float32) self.off_value = 1.0 * smooth_factor / (num_classes - 2) self.cross_entropy = nn.SoftmaxCrossEntropyWithLogits(reduction=reduction) def construct(self, logit, label): off_label = ops.Select()(ops.Equal()(label, 0.0), \ - ops.Fill()(ms.float32, label.shape, self.off_value), \ - ops.Fill()(ms.float32, label.shape, 0.0)) + ops.Fill()(mindspore.float32, label.shape, self.off_value), \ + ops.Fill()(mindspore.float32, label.shape, 0.0)) label = self.on_value * label + off_label loss = self.cross_entropy(logit, label) @@ -71,8 +71,8 @@ class CrossEntropyIgnore(Loss): def __init__(self, num_classes=21, ignore_label=255): super().__init__() self.one_hot = ops.OneHot(axis=-1) - self.on_value = Tensor(1.0, ms.float32) - self.off_value = Tensor(0.0, ms.float32) + self.on_value = Tensor(1.0, mindspore.float32) + self.off_value = Tensor(0.0, mindspore.float32) self.cast = ops.Cast() self.ce = nn.SoftmaxCrossEntropyWithLogits() self.not_equal = ops.NotEqual() @@ -85,12 +85,12 @@ class CrossEntropyIgnore(Loss): self.reshape = ops.Reshape() def construct(self, logits, labels): - labels_int = self.cast(labels, ms.int32) + labels_int = self.cast(labels, mindspore.int32) labels_int = self.reshape(labels_int, (-1,)) logits_ = self.transpose(logits, (0, 2, 3, 1)) logits_ = self.reshape(logits_, (-1, self.num_cls)) weights = self.not_equal(labels_int, self.ignore_label) - weights = self.cast(weights, ms.float32) + weights = self.cast(weights, mindspore.float32) one_hot_labels = self.one_hot(labels_int, self.num_cls, self.on_value, self.off_value) loss = self.ce(logits_, one_hot_labels) loss = self.mul(weights, loss) diff --git a/official/cv/VIT/src/dataset.py b/official/cv/VIT/src/dataset.py index 4e3c5d76628e08b027ef43178c02b479b2e8d23c..b882c0fe0a502b6b75a587f8f5e0f6b6caa122be 100644 --- a/official/cv/VIT/src/dataset.py +++ b/official/cv/VIT/src/dataset.py @@ -20,7 +20,7 @@ from io import BytesIO from PIL import Image import numpy as np -import mindspore as ms 
+import mindspore import mindspore.dataset.engine as de import mindspore.dataset.vision as vision import mindspore.dataset.transforms as transforms @@ -134,7 +134,7 @@ def create_dataset(dataset_path, ] ds = ds.map(input_columns="image", num_parallel_workers=num_workers, operations=c_trans) - type_cast_op = transforms.TypeCast(ms.int32) + type_cast_op = transforms.TypeCast(mindspore.int32) ds = ds.map(input_columns="label", num_parallel_workers=1, operations=type_cast_op) if do_train and mixup > 0: diff --git a/official/cv/VIT/src/eval_engine.py b/official/cv/VIT/src/eval_engine.py index f692227891d000146bfb1646493b586ada3960ea..86eb82ccaaeb11260a199b461146ad8724e2a01f 100644 --- a/official/cv/VIT/src/eval_engine.py +++ b/official/cv/VIT/src/eval_engine.py @@ -14,7 +14,7 @@ # ============================================================================ """eval engine""" -import mindspore as ms +import mindspore from mindspore import Tensor from src.metric import ClassifyCorrectWithCache, ClassifyCorrectCell, DistAccuracy @@ -53,12 +53,12 @@ class ImageNetCacheEvelEngine(BasicEvalEngine): self.args = args def compile(self, sink_size=-1): - index = Tensor(0, ms.int32) + index = Tensor(0, mindspore.int32) self.dist_eval_network.set_train(False) self.dist_eval_network.compile(index) def eval(self): - index = Tensor(0, ms.int32) + index = Tensor(0, mindspore.int32) output = self.dist_eval_network(index) output = output.asnumpy() / 50000 self.outputs = {"acc": output} diff --git a/official/cv/VIT/src/metric.py b/official/cv/VIT/src/metric.py index 2c86926cb501670c02c214b479b3783e67261f8c..5cb2a7834f3af7bd2f2a6a6790af49eaa9c4614c 100644 --- a/official/cv/VIT/src/metric.py +++ b/official/cv/VIT/src/metric.py @@ -16,7 +16,7 @@ import numpy as np -import mindspore as ms +import mindspore import mindspore.ops as ops from mindspore.communication.management import GlobalComm import mindspore.nn as nn @@ -35,7 +35,7 @@ class ClassifyCorrectWithCache(nn.Cell): self.allreduce = ops.AllReduce(ops.ReduceOp.SUM, GlobalComm.WORLD_COMM_GROUP) self.assign_add = ops.AssignAdd() self.assign = ops.Assign() - self._correct_num = Parameter(Tensor(0.0, ms.float32), name="correct_num", requires_grad=False) + self._correct_num = Parameter(Tensor(0.0, mindspore.float32), name="correct_num", requires_grad=False) # save data to parameter pdata = [] plabel = [] @@ -44,11 +44,11 @@ class ClassifyCorrectWithCache(nn.Cell): pdata.append(batch["image"]) plabel.append(batch["label"]) step_num = step_num + 1 - pdata = Tensor(np.array(pdata), ms.float32) - plabel = Tensor(np.array(plabel), ms.int32) + pdata = Tensor(np.array(pdata), mindspore.float32) + plabel = Tensor(np.array(plabel), mindspore.int32) self._data = Parameter(pdata, name="pdata", requires_grad=False) self._label = Parameter(plabel, name="plabel", requires_grad=False) - self._step_num = Tensor(step_num, ms.int32) + self._step_num = Tensor(step_num, mindspore.int32) def construct(self, index): self._correct_num = 0 @@ -57,9 +57,9 @@ class ClassifyCorrectWithCache(nn.Cell): label = self._label[index] outputs = self._network(data) y_pred = self.argmax(outputs) - y_pred = ops.cast(y_pred, ms.int32) + y_pred = ops.cast(y_pred, mindspore.int32) y_correct = self.equal(y_pred, label) - y_correct = ops.cast(y_correct, ms.float32) + y_correct = ops.cast(y_correct, mindspore.float32) y_correct_sum = self.reduce_sum(y_correct) self._correct_num += y_correct_sum #self.assign(self._correct_num, y_correct_sum) index = index + 1 @@ -80,9 +80,9 @@ class 
ClassifyCorrectCell(nn.Cell): def construct(self, data, label): outputs = self._network(data) y_pred = self.argmax(outputs) - y_pred = ops.cast(y_pred, ms.int32) + y_pred = ops.cast(y_pred, mindspore.int32) y_correct = self.equal(y_pred, label) - y_correct = ops.cast(y_correct, ms.float32) + y_correct = ops.cast(y_correct, mindspore.float32) y_correct = self.reduce_sum(y_correct) total_correct = self.allreduce(y_correct) return (total_correct,) diff --git a/official/cv/VIT/src/model_utils/moxing_adapter.py b/official/cv/VIT/src/model_utils/moxing_adapter.py index 77f40e59c197f271b71da7c71e7dd43a442090df..88179b79b8b50062fd63b15db814ef8480aae74b 100644 --- a/official/cv/VIT/src/model_utils/moxing_adapter.py +++ b/official/cv/VIT/src/model_utils/moxing_adapter.py @@ -17,7 +17,7 @@ import os import functools -import mindspore as ms +import mindspore from .config import config _global_sync_count = 0 @@ -92,7 +92,7 @@ def moxing_wrapper(pre_process=None, post_process=None): sync_data(config.train_url, config.output_path) print("Workspace downloaded: ", os.listdir(config.output_path)) - ms.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) + mindspore.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) config.device_num = get_device_num() config.device_id = get_device_id() if not os.path.exists(config.output_path): diff --git a/official/cv/VIT/src/optimizer.py b/official/cv/VIT/src/optimizer.py index 0a04acbdc780803def4a88e1a0505115b9dbc1a2..ecee958c855f472f7804a307dbc4cce6fc455642 100644 --- a/official/cv/VIT/src/optimizer.py +++ b/official/cv/VIT/src/optimizer.py @@ -16,7 +16,7 @@ import numpy as np -import mindspore as ms +import mindspore import mindspore.ops as ops from mindspore import jit @@ -59,7 +59,7 @@ def scale_grad(gradients, reciprocal_scale): _adam_opt = ops.MultitypeFuncGraph("adam_opt") -_scaler_one = Tensor(1, ms.int32) +_scaler_one = Tensor(1, mindspore.int32) @_adam_opt.register("Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", @@ -91,15 +91,15 @@ def _update_run_op(beta1_power, beta2_power, beta1, beta2, eps, lr, weight_decay op_sqrt = ops.Sqrt() op_reshape = ops.Reshape() - param_fp32 = ops.cast(param, ms.float32) - m_fp32 = ops.cast(m, ms.float32) - v_fp32 = ops.cast(v, ms.float32) - gradient_fp32 = ops.cast(gradient, ms.float32) + param_fp32 = ops.cast(param, mindspore.float32) + m_fp32 = ops.cast(m, mindspore.float32) + v_fp32 = ops.cast(v, mindspore.float32) + gradient_fp32 = ops.cast(gradient, mindspore.float32) - next_m = op_mul(beta1, m_fp32) + op_mul(ops.cast(ops.tuple_to_array((1.0,)), ms.float32) + next_m = op_mul(beta1, m_fp32) + op_mul(ops.cast(ops.tuple_to_array((1.0,)), mindspore.float32) - beta1, gradient_fp32) - next_v = op_mul(beta2, v_fp32) + op_mul(ops.cast(ops.tuple_to_array((1.0,)), ms.float32) + next_v = op_mul(beta2, v_fp32) + op_mul(ops.cast(ops.tuple_to_array((1.0,)), mindspore.float32) - beta2, op_square(gradient_fp32)) regulate_m = next_m / (_scaler_one - beta1_power) @@ -135,10 +135,10 @@ class AdamW(Optimizer): self.moments1 = self.parameters.clone(prefix="adam_m", init='zeros') self.moments2 = self.parameters.clone(prefix="adam_v", init='zeros') self.hyper_map = ops.HyperMap() - self.beta1_power = Parameter(initializer(1, [1], ms.float32), name="beta1_power") - self.beta2_power = Parameter(initializer(1, [1], ms.float32), name="beta2_power") + self.beta1_power = Parameter(initializer(1, [1], mindspore.float32),
name="beta1_power") + self.beta2_power = Parameter(initializer(1, [1], mindspore.float32), name="beta2_power") - self.reciprocal_scale = Tensor(1.0 / loss_scale, ms.float32) + self.reciprocal_scale = Tensor(1.0 / loss_scale, mindspore.float32) self.clip = clip @jit diff --git a/official/cv/VIT/src/vit.py b/official/cv/VIT/src/vit.py index f67911bcaa6d744b8b5cbb2a786515744719806b..9da3e1bb3d53767871b463530096e18d38f405bf 100644 --- a/official/cv/VIT/src/vit.py +++ b/official/cv/VIT/src/vit.py @@ -18,7 +18,7 @@ from importlib import import_module from easydict import EasyDict as edict import numpy as np -import mindspore as ms +import mindspore import mindspore.ops as ops from mindspore.common.initializer import initializer from mindspore.common.parameter import Parameter @@ -35,39 +35,39 @@ class VitConfig: self.configs = configs # network init - self.network_norm = ms.nn.LayerNorm((configs.normalized_shape,)) - self.network_init = ms.common.initializer.Normal(sigma=1.0) + self.network_norm = mindspore.nn.LayerNorm((configs.normalized_shape,)) + self.network_init = mindspore.common.initializer.Normal(sigma=1.0) self.network_dropout_rate = 0.1 self.network_pool = 'cls' self.network = ViT # stem - self.stem_init = ms.common.initializer.XavierUniform() + self.stem_init = mindspore.common.initializer.XavierUniform() self.stem = VitStem # body - self.body_norm = ms.nn.LayerNorm + self.body_norm = mindspore.nn.LayerNorm self.body_drop_path_rate = 0.1 self.body = Transformer # body attention - self.attention_init = ms.common.initializer.XavierUniform() - self.attention_activation = ms.nn.Softmax() + self.attention_init = mindspore.common.initializer.XavierUniform() + self.attention_activation = mindspore.nn.Softmax() self.attention_dropout_rate = 0.1 self.attention = Attention # body feedforward - self.feedforward_init = ms.common.initializer.XavierUniform() - self.feedforward_activation = ms.nn.GELU() + self.feedforward_init = mindspore.common.initializer.XavierUniform() + self.feedforward_activation = mindspore.nn.GELU() self.feedforward_dropout_rate = 0.1 self.feedforward = FeedForward # head self.head = origin_head - self.head_init = ms.common.initializer.XavierUniform() + self.head_init = mindspore.common.initializer.XavierUniform() self.head_dropout_rate = 0.1 - self.head_norm = ms.nn.LayerNorm((configs.normalized_shape,)) - self.head_activation = ms.nn.GELU() + self.head_norm = mindspore.nn.LayerNorm((configs.normalized_shape,)) + self.head_activation = mindspore.nn.GELU() class DropPath(Cell): @@ -86,7 +86,7 @@ class DropPath(Cell): def construct(self, x): if self.training: x_shape = self.shape(x) # B N C - mask = self.ones((x_shape[0], 1, 1), ms.float32) + mask = self.ones((x_shape[0], 1, 1), mindspore.float32) x = self.dropout(mask)*x return x @@ -236,7 +236,7 @@ class ViT(Cell): else: x += self.pos_embedding[:, :seq_len] - y = ops.cast(x, ms.float32) + y = ops.cast(x, mindspore.float32) y = self.dropout(y) x = ops.cast(y, x.dtype) @@ -302,7 +302,7 @@ class Attention(Cell): if self.softmax_nz: q = self.reshape(q, (bs, seq_len, h, d)) q = self.transpose(q, (0, 2, 1, 3)) - q = ops.cast(q, ms.float32) + q = ops.cast(q, mindspore.float32) q = self.mul(q, self.scale) k = self.reshape(k, (bs, seq_len, h, d)) @@ -323,7 +323,7 @@ class Attention(Cell): v = self.transpose(v, (0, 2, 1, 3)) attn_scores = self.q_matmul_k(q, k) #bs x h x seq_len x seq_len - attn_scores = ops.cast(attn_scores, ms.float32) + attn_scores = ops.cast(attn_scores, mindspore.float32) attn_scores = self.mul(attn_scores, 
self.scale) attn_scores = ops.cast(attn_scores, x.dtype) attn_scores = self.activation(attn_scores) @@ -334,7 +334,7 @@ class Attention(Cell): out = self.to_out(out) out = self.reshape(out, (bs, seq_len, d_model)) #out = self.dropout(out) - y = ops.cast(out, ms.float32) + y = ops.cast(out, mindspore.float32) y = self.dropout(y) out = ops.cast(y, out.dtype) #out = self.reshape(out, (bs, seq_len, d_model)) @@ -361,12 +361,12 @@ class FeedForward(Cell): def construct(self, x): y = self.ff1(x) - y = ops.cast(y, ms.float32) + y = ops.cast(y, mindspore.float32) y = self.activation(y) y = self.dropout(y) y = ops.cast(y, x.dtype) y = self.ff2(y) - y = ops.cast(y, ms.float32) + y = ops.cast(y, mindspore.float32) y = self.dropout(y) y = ops.cast(y, x.dtype) return y diff --git a/official/cv/VIT/train.py b/official/cv/VIT/train.py index 02cec0302829029e797b711a252a9abea67943ef..abf623450943e6087e902d9633ae02dfb027296d 100644 --- a/official/cv/VIT/train.py +++ b/official/cv/VIT/train.py @@ -19,7 +19,7 @@ import time import socket import numpy as np -import mindspore as ms +import mindspore from mindspore import Tensor from mindspore.train.model import Model, ParallelMode from mindspore.train.callback import ModelCheckpoint, CheckpointConfig @@ -105,17 +105,17 @@ def train_setcontext(): np.random.seed(args.seed) args.logger = get_logger(args.save_checkpoint_path, rank=local_rank) - ms.set_context(device_id=device_id, - mode=ms.GRAPH_MODE, + mindspore.set_context(device_id=device_id, + mode=0, device_target="Ascend", save_graphs=False) if args.auto_tune: - ms.set_context(auto_tune_mode='GA') + mindspore.set_context(auto_tune_mode='GA') elif args.device_num == 1: pass else: - ms.set_auto_parallel_context(device_num=device_num, + mindspore.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True) @@ -152,7 +152,7 @@ def train_net(): print("warning!!!, no split point") if os.path.isfile(args.pretrained): - ms.load_checkpoint(args.pretrained, net, strict_load=False) + mindspore.load_checkpoint(args.pretrained, net, strict_load=False) # loss if not args.use_label_smooth: diff --git a/official/cv/WGAN/eval.py b/official/cv/WGAN/eval.py index 97532cd597d6f4193b48236fccaaf2e1134f7ae2..61836ab6d09c14d0a28a2a64877b1b98af400f62 100644 --- a/official/cv/WGAN/eval.py +++ b/official/cv/WGAN/eval.py @@ -15,11 +15,11 @@ """ test WGAN """ import os import json +import mindspore import mindspore.common.dtype as mstype import mindspore.ops as ops from mindspore import Tensor from mindspore.train.serialization import load_checkpoint, load_param_into_net -from mindspore import context import numpy as np from PIL import Image @@ -31,8 +31,8 @@ from src.args import get_args if __name__ == "__main__": args_opt = get_args('eval') - context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.device_target) - context.set_context(device_id=args_opt.device_id) + mindspore.set_context(mode=0, device_target=args_opt.device_target) + mindspore.set_context(device_id=args_opt.device_id) with open(args_opt.config, 'r') as gencfg: generator_config = json.loads(gencfg.read()) diff --git a/official/cv/WGAN/export.py b/official/cv/WGAN/export.py index 1d9e0d0076458be97f158f311a56d9e4b4b8eb91..b4d03851a627ee9168eb063fde2b190ed46b1267 100644 --- a/official/cv/WGAN/export.py +++ b/official/cv/WGAN/export.py @@ -19,8 +19,9 @@ python export.py """ import json import numpy as np +import mindspore import mindspore.common.dtype as mstype -from mindspore import Tensor, load_checkpoint, 
load_param_into_net, export, context +from mindspore import Tensor, load_checkpoint, load_param_into_net, export from src.args import get_args from src.dcgan_model import DcganG @@ -28,8 +29,8 @@ from src.dcgannobn_model import DcgannobnG if __name__ == '__main__': args_opt = get_args('export') - context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.device_target) - context.set_context(device_id=args_opt.device_id) + mindspore.set_context(mode=0, device_target=args_opt.device_target) + mindspore.set_context(device_id=args_opt.device_id) with open(args_opt.config, 'r') as gencfg: generator_config = json.loads(gencfg.read()) diff --git a/official/cv/WGAN/modelarts/start.py b/official/cv/WGAN/modelarts/start.py index f096e114e7b62623e3db0c4e20b9909b618b8e30..968455f6b4705745b28ec9193cc55495f92b2156 100644 --- a/official/cv/WGAN/modelarts/start.py +++ b/official/cv/WGAN/modelarts/start.py @@ -18,13 +18,13 @@ import os import random import json import numpy as np +import mindspore from mindspore import Tensor, export import mindspore.nn as nn import mindspore.dataset as ds import mindspore.ops as ops import mindspore.common.dtype as mstype from mindspore.common import initializer as init -from mindspore import context from mindspore.train.serialization import load_checkpoint, load_param_into_net, save_checkpoint from PIL import Image from src.dataset import create_dataset @@ -41,14 +41,14 @@ if __name__ == '__main__': # init context target = args_opt.device_target - context.set_context(mode=context.GRAPH_MODE, device_target=target) + mindspore.set_context(mode=0, device_target=target) # whether train on modelarts or local server if not args_opt.is_modelarts: if args_opt.experiment is None: args_opt.experiment = 'samples' os.system('mkdir {0}'.format(args_opt.experiment)) - context.set_context(device_id=int(args_opt.device_id)) + mindspore.set_context(device_id=int(args_opt.device_id)) dataset = create_dataset(args_opt.dataroot, args_opt.dataset, args_opt.batchSize, args_opt.imageSize, 1, args_opt.workers, target) @@ -58,7 +58,7 @@ if __name__ == '__main__': if args_opt.experiment is None: args_opt.experiment = '/cache/train_output' os.system('mkdir {0}'.format(args_opt.experiment)) - context.set_context(device_id=int(os.getenv('DEVICE_ID'))) + mindspore.set_context(device_id=int(os.getenv('DEVICE_ID'))) data_name = 'LSUN-bedroom.zip' local_data_url = '/cache/data_path/' mox.file.copy_parallel(src_url=args_opt.data_url, dst_url=local_data_url) diff --git a/official/cv/WGAN/src/cell.py b/official/cv/WGAN/src/cell.py index 2cc78f46fcdda1bdaff840ecea0d19722cb2edab..3e53c9ce0445cc9ba481452ed72fa4e8f758c475 100644 --- a/official/cv/WGAN/src/cell.py +++ b/official/cv/WGAN/src/cell.py @@ -20,7 +20,7 @@ import mindspore.ops.operations as P import mindspore.ops.functional as F from mindspore.parallel._utils import (_get_device_num, _get_gradients_mean, _get_parallel_mode) -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.nn.wrap.grad_reducer import DistributedGradReducer diff --git a/official/cv/WGAN/train.py b/official/cv/WGAN/train.py index ff6ca84ab0f5191c7a4e43ed9e67a741b44e5ffc..803eaff395ee282a7808ec174ca9d36f03a8a716 100644 --- a/official/cv/WGAN/train.py +++ b/official/cv/WGAN/train.py @@ -17,13 +17,13 @@ import os import random import json import time +import mindspore from mindspore import Tensor import mindspore.nn as nn import mindspore.dataset as ds import mindspore.ops as ops from mindspore.common import initializer as init import 
mindspore.common.dtype as mstype -from mindspore import context from mindspore.train.serialization import load_checkpoint, load_param_into_net, save_checkpoint from PIL import Image import numpy as np @@ -41,14 +41,14 @@ if __name__ == '__main__': # init context target = args_opt.device_target - context.set_context(mode=context.GRAPH_MODE, device_target=target) + mindspore.set_context(mode=0, device_target=target) # whether train on modelarts or local server if not args_opt.is_modelarts: if args_opt.experiment is None: args_opt.experiment = 'samples' os.system('mkdir {0}'.format(args_opt.experiment)) - context.set_context(device_id=int(args_opt.device_id)) + mindspore.set_context(device_id=int(args_opt.device_id)) dataset = create_dataset(args_opt.dataroot, args_opt.dataset, args_opt.batchSize, args_opt.imageSize, 1, args_opt.workers, target) @@ -57,7 +57,7 @@ if __name__ == '__main__': if args_opt.experiment is None: args_opt.experiment = '/cache/train_output' os.system('mkdir {0}'.format(args_opt.experiment)) - context.set_context(device_id=int(os.getenv('DEVICE_ID'))) + mindspore.set_context(device_id=int(os.getenv('DEVICE_ID'))) data_name = 'LSUN-bedroom.zip' local_data_url = '/cache/data_path/' mox.file.copy_parallel(src_url=args_opt.data_url, dst_url=local_data_url) diff --git a/official/cv/YOLOX/eval.py b/official/cv/YOLOX/eval.py index 0c17dbd8e44cf374da36971316e908156486d717..74f98243f1e796a1bc25ac465c400f6acbf57014 100644 --- a/official/cv/YOLOX/eval.py +++ b/official/cv/YOLOX/eval.py @@ -19,8 +19,8 @@ import os import datetime import shutil from model_utils.config import config -from mindspore.context import ParallelMode -from mindspore import context +import mindspore +from mindspore import ParallelMode from mindspore.communication.management import init, get_group_size, get_rank from src.logger import get_logger @@ -36,7 +36,7 @@ def run_test(): config.annFile = os.path.join(config.data_dir, 'annotations/instances_val2017.json') devid = int(os.getenv('DEVICE_ID', '0')) - context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, save_graphs=False, device_id=devid) + mindspore.set_context(mode=0, device_target=config.device_target, save_graphs=False, device_id=devid) # logger config.log_dir = os.path.join( @@ -52,8 +52,8 @@ def run_test(): config.rank = get_rank() config.group_size = get_group_size() device_num = get_group_size() - context.reset_auto_parallel_context() - context.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=device_num) + mindspore.reset_auto_parallel_context() + mindspore.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=device_num) # ------------------network create---------------------------------------------------------------------------- config.logger.info('Begin Creating Network....') if config.backbone == "yolox_darknet53": diff --git a/official/cv/YOLOX/export.py b/official/cv/YOLOX/export.py index 956cdc4dc6779cbf384d4147c9b4f9f75a5f0c95..575a81182464796754a8b883e96b7926f6b1bea1 100644 --- a/official/cv/YOLOX/export.py +++ b/official/cv/YOLOX/export.py @@ -19,8 +19,8 @@ python export.py import os import numpy as np -import mindspore as ms -from mindspore import Tensor, export, context +import mindspore +from mindspore import Tensor, export from model_utils.config import config from src.yolox import DetectionBlock @@ -33,10 +33,10 @@ def run_export(): Returns:None """ - context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target) + 
mindspore.set_context(mode=0, device_target=config.device_target) if config.device_target == "Ascend": device_id = int(os.getenv('DEVICE_ID', '0')) - context.set_context(device_id=device_id) + mindspore.set_context(device_id=device_id) if config.backbone == "yolox_darknet53": backbone = "yolofpn" else: @@ -45,7 +45,7 @@ def run_export(): network.set_train(False) assert config.val_ckpt is not None, "config.ckpt_file is None." network = load_weights(network, config.val_ckpt) - input_arr = Tensor(np.ones([config.export_bs, 3, config.input_size[0], config.input_size[1]]), ms.float32) + input_arr = Tensor(np.ones([config.export_bs, 3, config.input_size[0], config.input_size[1]]), mindspore.float32) file_name = backbone export(network, input_arr, file_name=file_name, file_format=config.file_format) diff --git a/official/cv/YOLOX/model_utils/moxing_adapter.py b/official/cv/YOLOX/model_utils/moxing_adapter.py index 7730180dd247fbdbf5cd9da552af9cc0fdd37a0e..5ed81870f32445d677c03318e40e585f4396f854 100644 --- a/official/cv/YOLOX/model_utils/moxing_adapter.py +++ b/official/cv/YOLOX/model_utils/moxing_adapter.py @@ -17,7 +17,7 @@ import os import functools -from mindspore import context +import mindspore from .config import config _global_sync_count = 0 @@ -92,7 +92,7 @@ def moxing_wrapper(pre_process=None, post_process=None): sync_data(config.train_url, config.output_path) print("Workspace downloaded: ", os.listdir(config.output_path)) - context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) + mindspore.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) config.device_num = get_device_num() config.device_id = get_device_id() if not os.path.exists(config.output_path): diff --git a/official/cv/YOLOX/predict.py b/official/cv/YOLOX/predict.py index a3364306def205706fbf6a267bc77db4ee2fefd5..27a8d0664690ad4782bb17ec3822bdc5299b9a7f 100644 --- a/official/cv/YOLOX/predict.py +++ b/official/cv/YOLOX/predict.py @@ -16,7 +16,8 @@ import os import cv2 import numpy as np -from mindspore import Tensor, context +import mindspore +from mindspore import Tensor from model_utils.config import config from src.transform import preproc from src.util import load_weights, DetectionEngine @@ -39,7 +40,7 @@ LABELS = ['person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'teddy bear', 'hair drier', 'toothbrush'] -context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target) +mindspore.set_context(mode=0, device_target=config.device_target) class YoloxPredict: diff --git a/official/cv/YOLOX/train.py b/official/cv/YOLOX/train.py index 15fae3a9e4b5568d245b4b5ca4ffc1e518e563d4..749e7ea9672f4fd4f6c9ac021664e917dd57e68b 100644 --- a/official/cv/YOLOX/train.py +++ b/official/cv/YOLOX/train.py @@ -21,11 +21,11 @@ import datetime import mindspore from mindspore import DynamicLossScaleManager -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.common import set_seed from mindspore.train.callback import CheckpointConfig, ModelCheckpoint, SummaryCollector from mindspore.communication.management import init, get_rank, get_group_size -from mindspore import context, Model, load_checkpoint, load_param_into_net +from mindspore import Model, load_checkpoint, load_param_into_net from mindspore.profiler.profiling import Profiler from mindspore.common.tensor import Tensor @@ -71,9 +71,9 @@ def set_default(cfg): def set_graph_kernel_context(): - if context.get_context("device_target") == "GPU": - 
context.set_context(enable_graph_kernel=True) - context.set_context(graph_kernel_flags="--enable_parallel_fusion " + if mindspore.get_context("device_target") == "GPU": + mindspore.set_context(enable_graph_kernel=True) + mindspore.set_context(graph_kernel_flags="--enable_parallel_fusion " "--enable_trans_op_optimize " "--disable_cluster_ops=ReduceMax,Reshape " "--enable_expand_ops=Conv2D") @@ -82,7 +82,7 @@ def set_graph_kernel_context(): def network_init(cfg): """ Network init """ device_id = int(os.getenv('DEVICE_ID', '0')) - context.set_context(mode=context.GRAPH_MODE, + mindspore.set_context(mode=0, device_target=cfg.device_target, save_graphs=cfg.save_graphs, device_id=device_id, save_graphs_path="ir_path", max_call_depth=2000) set_graph_kernel_context() @@ -100,8 +100,8 @@ def network_init(cfg): init() cfg.rank = get_rank() cfg.group_size = get_group_size() - context.reset_auto_parallel_context() - context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, + mindspore.reset_auto_parallel_context() + mindspore.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, device_num=cfg.group_size) # select for master rank save ckpt or all rank save, compatible for model parallel @@ -115,13 +115,13 @@ def network_init(cfg): def parallel_init(args): - context.reset_auto_parallel_context() + mindspore.reset_auto_parallel_context() parallel_mode = ParallelMode.STAND_ALONE degree = 1 if args.is_distributed: parallel_mode = ParallelMode.DATA_PARALLEL degree = get_group_size() - context.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=degree) + mindspore.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=degree) def modelarts_pre_process(cfg): @@ -314,7 +314,7 @@ def run_train(cfg): base_network = DetectionBlock(cfg, backbone=backbone) # syc bn only support distributed training in graph mode - if cfg.use_syc_bn and cfg.is_distributed and context.get_context('mode') == context.GRAPH_MODE: + if cfg.use_syc_bn and cfg.is_distributed and mindspore.get_context('mode') == 0: cfg.logger.info("Using Synchronized batch norm layer...") use_syc_bn(base_network) default_recurisive_init(base_network) diff --git a/official/cv/YOLOv3/convert_weight.py b/official/cv/YOLOv3/convert_weight.py index 98b22996f1add5b27592d03b05e3be432b343ede..6e7abf51ef9fc7824ca6100255a369cfcb6a3e17 100644 --- a/official/cv/YOLOv3/convert_weight.py +++ b/official/cv/YOLOv3/convert_weight.py @@ -15,7 +15,7 @@ """Convert weight to mindspore ckpt.""" import os import numpy as np -import mindspore as ms +import mindspore from src.yolo import YOLOV3DarkNet53 from model_utils.config import config @@ -61,14 +61,14 @@ def convert(weights_file, output_file): index += weight.size param_list.append({'name': weight.name, 'type': weight.dtype, 'shape': weight.shape, - 'data': ms.Tensor(weight_data)}) - param_list.append({'name': mean.name, 'type': mean.dtype, 'shape': mean.shape, 'data': ms.Tensor(mean_data)}) - param_list.append({'name': var.name, 'type': var.dtype, 'shape': var.shape, 'data': ms.Tensor(var_data)}) + 'data': mindspore.Tensor(weight_data)}) + param_list.append({'name': mean.name, 'type': mean.dtype, 'shape': mean.shape, 'data': mindspore.Tensor(mean_data)}) + param_list.append({'name': var.name, 'type': var.dtype, 'shape': var.shape, 'data': mindspore.Tensor(var_data)}) param_list.append({'name': gamma.name, 'type': gamma.dtype, 'shape': gamma.shape, - 'data': ms.Tensor(gamma_data)}) - 
param_list.append({'name': beta.name, 'type': beta.dtype, 'shape': beta.shape, 'data': ms.Tensor(beta_data)}) + 'data': mindspore.Tensor(gamma_data)}) + param_list.append({'name': beta.name, 'type': beta.dtype, 'shape': beta.shape, 'data': mindspore.Tensor(beta_data)}) - ms.save_checkpoint(param_list, output_file) + mindspore.save_checkpoint(param_list, output_file) if __name__ == "__main__": diff --git a/official/cv/YOLOv3/eval.py b/official/cv/YOLOv3/eval.py index 64da549fe78588f6bf34e8fda4bcb42b15aa1631..95e2d47378c8295f6f6ef7ed2455d83cffce9500 100644 --- a/official/cv/YOLOv3/eval.py +++ b/official/cv/YOLOv3/eval.py @@ -17,7 +17,7 @@ import os import datetime import time -import mindspore as ms +import mindspore from src.yolo import YOLOV3DarkNet53 from src.logger import get_logger @@ -37,7 +37,7 @@ def conver_testing_shape(args): def load_parameters(network, file_name): config.logger.info("yolov3 pretrained network model: %s", file_name) - param_dict = ms.load_checkpoint(file_name) + param_dict = mindspore.load_checkpoint(file_name) param_dict_new = {} for key, values in param_dict.items(): if key.startswith('moments.'): @@ -46,7 +46,7 @@ def load_parameters(network, file_name): param_dict_new[key[13:]] = values else: param_dict_new[key] = values - ms.load_param_into_net(network, param_dict_new) + mindspore.load_param_into_net(network, param_dict_new) config.logger.info('load_model %s success', file_name) @@ -58,7 +58,7 @@ def run_test(): config.annFile = os.path.join(config.data_dir, 'annotations/instances_val2014.json') devid = int(os.getenv('DEVICE_ID')) if os.getenv('DEVICE_ID') else 0 - ms.set_context(mode=ms.GRAPH_MODE, device_target=config.device_target, save_graphs=False, device_id=devid) + mindspore.set_context(mode=0, device_target=config.device_target, save_graphs=False, device_id=devid) # logger config.outputs_dir = os.path.join(config.log_path, @@ -66,9 +66,9 @@ def run_test(): rank_id = int(os.environ.get('RANK_ID')) if os.environ.get('RANK_ID') else 0 config.logger = get_logger(config.outputs_dir, rank_id) - ms.reset_auto_parallel_context() - parallel_mode = ms.ParallelMode.STAND_ALONE - ms.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=1) + mindspore.reset_auto_parallel_context() + parallel_mode = mindspore.ParallelMode.STAND_ALONE + mindspore.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=1) config.logger.info('Creating Network....') network = YOLOV3DarkNet53(is_training=False) diff --git a/official/cv/YOLOv3/eval_onnx.py b/official/cv/YOLOv3/eval_onnx.py index e06ec68e60866e850ac8df0295b964a7c90f9a8c..dc6d6db2b2de9b6badf7c48cc66641238ea65bdd 100644 --- a/official/cv/YOLOv3/eval_onnx.py +++ b/official/cv/YOLOv3/eval_onnx.py @@ -16,7 +16,7 @@ import os import datetime import time import onnxruntime -import mindspore as ms +import mindspore from src.logger import get_logger from src.yolo_dataset import create_yolo_dataset from src.util import DetectionEngine @@ -35,7 +35,7 @@ def run_test(): config.annFile = os.path.join(config.data_dir, 'annotations/instances_val2014.json') devid = int(os.getenv('DEVICE_ID')) if os.getenv('DEVICE_ID') else 0 - ms.set_context(mode=ms.GRAPH_MODE, device_target=config.device_target, save_graphs=False, device_id=devid) + mindspore.set_context(mode=0, device_target=config.device_target, save_graphs=False, device_id=devid) # logger config.outputs_dir = os.path.join(config.log_path, @@ -43,9 +43,9 @@ def run_test(): rank_id = int(os.environ.get('RANK_ID')) if 
os.environ.get('RANK_ID') else 0 config.logger = get_logger(config.outputs_dir, rank_id) - ms.reset_auto_parallel_context() - parallel_mode = ms.ParallelMode.STAND_ALONE - ms.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=1) + mindspore.reset_auto_parallel_context() + parallel_mode = mindspore.ParallelMode.STAND_ALONE + mindspore.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=1) print(config.device_target) if config.device_target == 'GPU': providers = ['CUDAExecutionProvider'] diff --git a/official/cv/YOLOv3/export.py b/official/cv/YOLOv3/export.py index 33c0555e7d9173d6ef1bf07ad36b90b74b9907c5..7a477ef679c56edf08c5336c99ba47586c784aef 100644 --- a/official/cv/YOLOv3/export.py +++ b/official/cv/YOLOv3/export.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================ -import mindspore as ms +import mindspore from src.yolo import YOLOV3DarkNet53 from model_utils.config import config @@ -21,20 +21,20 @@ from model_utils.moxing_adapter import moxing_wrapper, modelarts_export_preproce @moxing_wrapper(pre_process=modelarts_export_preprocess) def run_export(): - ms.set_context(mode=ms.GRAPH_MODE, device_target=config.device_target) + mindspore.set_context(mode=0, device_target=config.device_target) if config.device_target == "Ascend": - ms.set_context(device_id=config.device_id) + mindspore.set_context(device_id=config.device_id) network = YOLOV3DarkNet53(is_training=False) - param_dict = ms.load_checkpoint(config.ckpt_file) - ms.load_param_into_net(network, param_dict) + param_dict = mindspore.load_checkpoint(config.ckpt_file) + mindspore.load_param_into_net(network, param_dict) network.set_train(False) shape = [config.batch_size, 3] + config.test_img_shape - input_data = ms.numpy.zeros(shape, ms.float32) + input_data = mindspore.numpy.zeros(shape, mindspore.float32) - ms.export(network, input_data, file_name=config.file_name, file_format=config.file_format) + mindspore.export(network, input_data, file_name=config.file_name, file_format=config.file_format) if __name__ == "__main__": diff --git a/official/cv/YOLOv3/model_utils/moxing_adapter.py b/official/cv/YOLOv3/model_utils/moxing_adapter.py index 24a6d90e850993228c44c53c712a9c3a5176744a..120e57e8943417ad2ba617d87b34dfd75d03dde1 100644 --- a/official/cv/YOLOv3/model_utils/moxing_adapter.py +++ b/official/cv/YOLOv3/model_utils/moxing_adapter.py @@ -18,7 +18,7 @@ import os import time import functools -import mindspore as ms +import mindspore from .config import config _global_sync_count = 0 @@ -155,7 +155,7 @@ def moxing_wrapper(pre_process=None, post_process=None): sync_data(config.train_url, config.output_path) print("Workspace downloaded: ", os.listdir(config.output_path)) - ms.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) + mindspore.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) config.device_num = get_device_num() config.device_id = get_device_id() if not os.path.exists(config.output_path): diff --git a/official/cv/YOLOv3/src/initializer.py b/official/cv/YOLOv3/src/initializer.py index a5c6c283d3023bbccbd30a89eeaf0333a8177fb8..5ba4bfbbadb40398cb3476d2e3f6c42325e37103 100644 --- a/official/cv/YOLOv3/src/initializer.py +++ b/official/cv/YOLOv3/src/initializer.py @@ -16,7 +16,7 @@ import math from functools import reduce import
numpy as np -import mindspore as ms +import mindspore import mindspore.nn as nn from .util import load_backbone @@ -137,7 +137,7 @@ def _calculate_fan_in_and_fan_out(arr): return fan_in, fan_out -class KaimingUniform(ms.common.initializer.Initializer): +class KaimingUniform(mindspore.common.initializer.Initializer): """Kaiming uniform initializer.""" def __init__(self, a=0, mode='fan_in', nonlinearity='leaky_relu'): super(KaimingUniform, self).__init__() @@ -154,20 +154,20 @@ def default_recurisive_init(custom_cell): """Initialize parameter.""" for _, cell in custom_cell.cells_and_names(): if isinstance(cell, nn.Conv2d): - cell.weight.set_data(ms.common.initializer.initializer(KaimingUniform(a=math.sqrt(5)), + cell.weight.set_data(mindspore.common.initializer.initializer(KaimingUniform(a=math.sqrt(5)), cell.weight.shape, cell.weight.dtype)) if cell.bias is not None: fan_in, _ = _calculate_fan_in_and_fan_out(cell.weight) bound = 1 / math.sqrt(fan_in) - cell.bias.set_data(ms.common.initializer.initializer(ms.common.initializer.Uniform(bound), + cell.bias.set_data(mindspore.common.initializer.initializer(mindspore.common.initializer.Uniform(bound), cell.bias.shape, cell.bias.dtype)) elif isinstance(cell, nn.Dense): - cell.weight.set_data(ms.common.initializer.initializer(KaimingUniform(a=math.sqrt(5)), + cell.weight.set_data(mindspore.common.initializer.initializer(KaimingUniform(a=math.sqrt(5)), cell.weight.shape, cell.weight.dtype)) if cell.bias is not None: fan_in, _ = _calculate_fan_in_and_fan_out(cell.weight) bound = 1 / math.sqrt(fan_in) - cell.bias.set_data(ms.common.initializer.initializer(ms.common.initializer.Uniform(bound), + cell.bias.set_data(mindspore.common.initializer.initializer(mindspore.common.initializer.Uniform(bound), cell.bias.shape, cell.bias.dtype)) elif isinstance(cell, (nn.BatchNorm2d, nn.BatchNorm1d)): pass @@ -182,7 +182,7 @@ def load_yolov3_params(args, network): args.logger.info('Not load pre-trained backbone, please be careful') if args.resume_yolov3: - param_dict = ms.load_checkpoint(args.resume_yolov3) + param_dict = mindspore.load_checkpoint(args.resume_yolov3) param_dict_new = {} for key, values in param_dict.items(): if key.startswith('moments.'): @@ -195,5 +195,5 @@ def load_yolov3_params(args, network): args.logger.info('in resume {}'.format(key)) args.logger.info('resume finished') - ms.load_param_into_net(network, param_dict_new) + mindspore.load_param_into_net(network, param_dict_new) args.logger.info('load_model {} success'.format(args.resume_yolov3)) diff --git a/official/cv/YOLOv3/src/util.py b/official/cv/YOLOv3/src/util.py index 3ac480b37865f6891b992a2ae7215de168f71984..376f9f70395d4e73d44e30ba0260933e59ac321f 100644 --- a/official/cv/YOLOv3/src/util.py +++ b/official/cv/YOLOv3/src/util.py @@ -20,7 +20,7 @@ import numpy as np from pycocotools.coco import COCO from pycocotools.cocoeval import COCOeval -import mindspore as ms +import mindspore from .yolo import YoloLossBlock @@ -62,9 +62,9 @@ class AverageMeter: def load_backbone(net, ckpt_path, args): """Load darknet53 backbone checkpoint.""" - param_dict = ms.load_checkpoint(ckpt_path) + param_dict = mindspore.load_checkpoint(ckpt_path) net.init_parameters_data() - ms.load_param_into_net(net, param_dict) + mindspore.load_param_into_net(net, param_dict) param_not_load = [] for _, param in net.parameters_and_names(): @@ -148,7 +148,7 @@ def keep_loss_fp32(network): """Keep loss of network with float32""" for _, cell in network.cells_and_names(): if isinstance(cell, (YoloLossBlock,)): - 
cell.to_float(ms.float32) + cell.to_float(mindspore.float32) def cpu_affinity(rank_id, device_num): diff --git a/official/cv/YOLOv3/src/yolo.py b/official/cv/YOLOv3/src/yolo.py index 23e7bc08501cd072973c798bae3eb68dcfdcf012..86c98d41deef2f21329bc4cca2301c36937c4f6e 100644 --- a/official/cv/YOLOv3/src/yolo.py +++ b/official/cv/YOLOv3/src/yolo.py @@ -13,7 +13,7 @@ # limitations under the License. # ============================================================================ """YOLOv3 based on DarkNet.""" -import mindspore as ms +import mindspore import mindspore.nn as nn import mindspore.ops as ops @@ -178,7 +178,7 @@ class DetectionBlock(nn.Cell): idx = (6, 7, 8) else: raise KeyError("Invalid scale value for DetectionBlock") - self.anchors = ms.Tensor([self.config.anchor_scales[i] for i in idx], ms.float32) + self.anchors = mindspore.Tensor([self.config.anchor_scales[i] for i in idx], mindspore.float32) self.num_anchors_per_scale = 3 self.num_attrib = 4+1+self.config.num_classes self.lambda_coord = 1 @@ -200,8 +200,8 @@ class DetectionBlock(nn.Cell): range_x = range(grid_size[1]) range_y = range(grid_size[0]) - grid_x = ops.Cast()(ops.tuple_to_array(range_x), ms.float32) - grid_y = ops.Cast()(ops.tuple_to_array(range_y), ms.float32) + grid_x = ops.Cast()(ops.tuple_to_array(range_x), mindspore.float32) + grid_y = ops.Cast()(ops.tuple_to_array(range_y), mindspore.float32) # Tensor of shape [grid_size[0], grid_size[1], 1, 1] representing the coordinate of x/y axis for each grid # [batch, gridx, gridy, 1, 1] grid_x = self.tile(self.reshape(grid_x, (1, 1, -1, 1, 1)), (1, grid_size[0], 1, 1, 1)) @@ -215,7 +215,7 @@ class DetectionBlock(nn.Cell): # gridsize1 is x # gridsize0 is y box_xy = (self.sigmoid(box_xy) + grid) / ops.Cast()(ops.tuple_to_array((grid_size[1], - grid_size[0])), ms.float32) + grid_size[0])), mindspore.float32) # box_wh is w->h box_wh = ops.Exp()(box_wh) * self.anchors / input_shape @@ -278,8 +278,8 @@ class YoloLossBlock(nn.Cell): idx = (6, 7, 8) else: raise KeyError("Invalid scale value for DetectionBlock") - self.anchors = ms.Tensor([self.config.anchor_scales[i] for i in idx], ms.float32) - self.ignore_threshold = ms.Tensor(self.config.ignore_threshold, ms.float32) + self.anchors = mindspore.Tensor([self.config.anchor_scales[i] for i in idx], mindspore.float32) + self.ignore_threshold = mindspore.Tensor(self.config.ignore_threshold, mindspore.float32) self.concat = ops.Concat(axis=-1) self.iou = Iou() self.reduce_max = ops.ReduceMax(keep_dims=False) @@ -299,7 +299,7 @@ class YoloLossBlock(nn.Cell): class_probs = y_true[:, :, :, :, 5:] grid_shape = ops.Shape()(prediction)[1:3] - grid_shape = ops.Cast()(ops.tuple_to_array(grid_shape[::-1]), ms.float32) + grid_shape = ops.Cast()(ops.tuple_to_array(grid_shape[::-1]), mindspore.float32) pred_boxes = self.concat((pred_xy, pred_wh)) true_xy = y_true[:, :, :, :, :2] * grid_shape - grid @@ -323,7 +323,7 @@ class YoloLossBlock(nn.Cell): # ignore_mask IOU too small ignore_mask = best_iou < self.ignore_threshold - ignore_mask = ops.Cast()(ignore_mask, ms.float32) + ignore_mask = ops.Cast()(ignore_mask, mindspore.float32) ignore_mask = ops.ExpandDims()(ignore_mask, -1) # ignore_mask backpro will cause a lot maximunGrad and minimumGrad time consume. 
# so we turn off its gradient @@ -373,7 +373,7 @@ class YOLOV3DarkNet53(nn.Cell): def construct(self, x): input_shape = ops.shape(x)[2:4] - input_shape = ops.cast(self.tenser_to_array(input_shape), ms.float32) + input_shape = ops.cast(self.tenser_to_array(input_shape), mindspore.float32) big_object_output, medium_object_output, small_object_output = self.feature_map(x) if not self.keep_detect: return big_object_output, medium_object_output, small_object_output @@ -397,7 +397,7 @@ class YoloWithLossCell(nn.Cell): def construct(self, x, y_true_0, y_true_1, y_true_2, gt_0, gt_1, gt_2): input_shape = ops.shape(x)[2:4] - input_shape = ops.cast(self.tenser_to_array(input_shape), ms.float32) + input_shape = ops.cast(self.tenser_to_array(input_shape), mindspore.float32) yolo_out = self.yolo_network(x) loss_l = self.loss_big(*yolo_out[0], y_true_0, gt_0, input_shape) loss_m = self.loss_me(*yolo_out[1], y_true_1, gt_1, input_shape) diff --git a/official/cv/YOLOv3/train.py b/official/cv/YOLOv3/train.py index 6e1351a4ca3e70175e7d8ed2acbf06c25f388504..d762c7ca6a20852d9a011e1f3dcc8c91535cc9d6 100644 --- a/official/cv/YOLOv3/train.py +++ b/official/cv/YOLOv3/train.py @@ -17,7 +17,7 @@ import os import time import datetime -import mindspore as ms +import mindspore import mindspore.nn as nn import mindspore.communication as comm @@ -34,7 +34,7 @@ from model_utils.config import config # only useful for huawei cloud modelarts. from model_utils.moxing_adapter import moxing_wrapper, modelarts_pre_process -ms.set_seed(1) +mindspore.set_seed(1) def conver_training_shape(args): @@ -43,9 +43,9 @@ def conver_training_shape(args): def set_graph_kernel_context(): - if ms.get_context("device_target") == "GPU": - ms.set_context(enable_graph_kernel=True) - ms.set_context(graph_kernel_flags="--enable_parallel_fusion " + if mindspore.get_context("device_target") == "GPU": + mindspore.set_context(enable_graph_kernel=True) + mindspore.set_context(graph_kernel_flags="--enable_parallel_fusion " "--enable_trans_op_optimize " "--disable_cluster_ops=ReduceMax,Reshape " "--enable_expand_ops=Conv2D") @@ -53,22 +53,22 @@ def set_graph_kernel_context(): def network_init(args): device_id = int(os.getenv('DEVICE_ID', '0')) - ms.set_context(mode=ms.GRAPH_MODE, device_target=args.device_target, save_graphs=False, device_id=device_id) + mindspore.set_context(mode=0, device_target=args.device_target, save_graphs=False, device_id=device_id) set_graph_kernel_context() # Set mempool block size for improving memory utilization, which will not take effect in GRAPH_MODE - if ms.get_context("mode") == ms.PYNATIVE_MODE: - ms.set_context(mempool_block_size="31GB") + if mindspore.get_context("mode") == 1: + mindspore.set_context(mempool_block_size="31GB") # Since the default max memory pool available size on ascend is 30GB, # which does not meet the requirements and needs to be adjusted larger. 
- if ms.get_context("device_target") == "Ascend": - ms.set_context(max_device_memory="31GB") + if mindspore.get_context("device_target") == "Ascend": + mindspore.set_context(max_device_memory="31GB") profiler = None if args.need_profiler: profiling_dir = os.path.join("profiling", datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S')) - profiler = ms.profiler.Profiler(output_path=profiling_dir) + profiler = mindspore.profiler.Profiler(output_path=profiling_dir) # init distributed if args.is_distributed: @@ -94,13 +94,13 @@ def network_init(args): def parallel_init(args): - ms.reset_auto_parallel_context() - parallel_mode = ms.ParallelMode.STAND_ALONE + mindspore.reset_auto_parallel_context() + parallel_mode = mindspore.ParallelMode.STAND_ALONE degree = 1 if args.is_distributed: - parallel_mode = ms.ParallelMode.DATA_PARALLEL + parallel_mode = mindspore.ParallelMode.DATA_PARALLEL degree = comm.get_group_size() - ms.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=degree) + mindspore.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=degree) @moxing_wrapper(pre_process=modelarts_pre_process) @@ -135,13 +135,13 @@ def run_train(): config.steps_per_epoch = ds.get_dataset_size() lr = get_lr(config) - opt = nn.Momentum(params=get_param_groups(network), momentum=config.momentum, learning_rate=ms.Tensor(lr), + opt = nn.Momentum(params=get_param_groups(network), momentum=config.momentum, learning_rate=mindspore.Tensor(lr), weight_decay=config.weight_decay, loss_scale=config.loss_scale) - is_gpu = ms.get_context("device_target") == "GPU" + is_gpu = mindspore.get_context("device_target") == "GPU" if is_gpu: loss_scale_value = 1.0 - loss_scale = ms.FixedLossScaleManager(loss_scale_value, drop_overflow_update=False) - network = ms.build_train_network(network, optimizer=opt, loss_scale_manager=loss_scale, + loss_scale = mindspore.FixedLossScaleManager(loss_scale_value, drop_overflow_update=False) + network = mindspore.build_train_network(network, optimizer=opt, loss_scale_manager=loss_scale, level="O2", keep_batchnorm_fp32=False) keep_loss_fp32(network) else: @@ -158,14 +158,14 @@ def run_train(): images = data["image"] input_shape = images.shape[2:4] config.logger.info('iter[{}], shape{}'.format(step_idx, input_shape[0])) - images = ms.Tensor.from_numpy(images) + images = mindspore.Tensor.from_numpy(images) - batch_y_true_0 = ms.Tensor.from_numpy(data['bbox1']) - batch_y_true_1 = ms.Tensor.from_numpy(data['bbox2']) - batch_y_true_2 = ms.Tensor.from_numpy(data['bbox3']) - batch_gt_box0 = ms.Tensor.from_numpy(data['gt_box1']) - batch_gt_box1 = ms.Tensor.from_numpy(data['gt_box2']) - batch_gt_box2 = ms.Tensor.from_numpy(data['gt_box3']) + batch_y_true_0 = mindspore.Tensor.from_numpy(data['bbox1']) + batch_y_true_1 = mindspore.Tensor.from_numpy(data['bbox2']) + batch_y_true_2 = mindspore.Tensor.from_numpy(data['bbox3']) + batch_gt_box0 = mindspore.Tensor.from_numpy(data['gt_box1']) + batch_gt_box1 = mindspore.Tensor.from_numpy(data['gt_box2']) + batch_gt_box2 = mindspore.Tensor.from_numpy(data['gt_box3']) loss = network(images, batch_y_true_0, batch_y_true_1, batch_y_true_2, batch_gt_box0, batch_gt_box1, batch_gt_box2) @@ -196,7 +196,7 @@ def run_train(): if not os.path.exists(ckpt_path): os.makedirs(ckpt_path, exist_ok=True) ckpt_name = os.path.join(ckpt_path, "yolov3_{}_{}.ckpt".format(epoch_idx + 1, config.steps_per_epoch)) - ms.save_checkpoint(network, ckpt_name) + mindspore.save_checkpoint(network, ckpt_name) ckpt_list = 
[os.path.join(ckpt_path, f) for f in os.listdir(ckpt_path)] ckpt_list = sorted(ckpt_list, key=os.path.getmtime) for i in range(len(ckpt_list) - config.max_checkpoint_num): diff --git a/official/cv/YOLOv4/eval.py b/official/cv/YOLOv4/eval.py index 6a4749f6bd75dc32d396b5ac72a1bafeca7d029a..d435fa2a70470627e114f43955dbacbdc3d60929 100644 --- a/official/cv/YOLOv4/eval.py +++ b/official/cv/YOLOv4/eval.py @@ -17,8 +17,8 @@ import os import datetime import time -from mindspore.context import ParallelMode -from mindspore import context +import mindspore +from mindspore import ParallelMode from mindspore.train.serialization import load_checkpoint, load_param_into_net from src.yolo import YOLOV4CspDarkNet53 @@ -90,7 +90,7 @@ def modelarts_pre_process(): def run_eval(): start_time = time.time() device_id = int(os.getenv('DEVICE_ID')) if os.getenv('DEVICE_ID') else 0 - context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, device_id=device_id) + mindspore.set_context(mode=0, device_target=config.device_target, device_id=device_id) # logger config.outputs_dir = os.path.join(config.log_path, @@ -98,9 +98,9 @@ rank_id = int(os.environ.get('RANK_ID')) if os.environ.get('RANK_ID') else 0 config.logger = get_logger(config.outputs_dir, rank_id) - context.reset_auto_parallel_context() + mindspore.reset_auto_parallel_context() parallel_mode = ParallelMode.STAND_ALONE - context.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=1) + mindspore.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=1) config.logger.info('Creating Network....') network = YOLOV4CspDarkNet53() diff --git a/official/cv/YOLOv4/export.py b/official/cv/YOLOv4/export.py index 6904fbd53d3d87519800a85be7f7e651cc385a81..6173b3b6214336e5eef1663c59a5230988a0147a 100644 --- a/official/cv/YOLOv4/export.py +++ b/official/cv/YOLOv4/export.py @@ -16,7 +16,7 @@ import os import numpy as np import mindspore -from mindspore import context, Tensor +from mindspore import Tensor from mindspore.train.serialization import export, load_checkpoint, load_param_into_net from src.yolo import YOLOV4CspDarkNet53 @@ -32,9 +32,9 @@ def modelarts_pre_process(): @moxing_wrapper(pre_process=modelarts_pre_process) def run_export(): - context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target) + mindspore.set_context(mode=0, device_target=config.device_target) if config.device_target == "Ascend": - context.set_context(device_id=config.device_id) + mindspore.set_context(device_id=config.device_id) ts_shape = config.testing_shape network = YOLOV4CspDarkNet53() diff --git a/official/cv/YOLOv4/infer/README.md b/official/cv/YOLOv4/infer/README.md index 673cfedfee18ae6fe70619657c97a84d14a7a0c9..4ba63230e7395993e62e66a12b6d14538e8f33b0 100644 --- a/official/cv/YOLOv4/infer/README.md +++ b/official/cv/YOLOv4/infer/README.md @@ -77,7 +77,7 @@ warmup_epochs: 4 MindSpore supports data parallelism and auto parallelism. Auto parallelism is a distributed parallel mode in which MindSpore fuses data parallelism, model parallelism, and hybrid parallelism; it can automatically build a cost model and select a parallel mode for the user. Related code example: ```shell -context.set_auto_parallel_context(parallel_mode = ParallelMode.DATA_PARALLEL, device_num = device_num) +mindspore.set_auto_parallel_context(parallel_mode = ParallelMode.DATA_PARALLEL, device_num = device_num) ```
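For a fuller picture, the snippet below sketches the complete data-parallel initialization sequence in the style of the training scripts in this repository (a minimal sketch, assuming graph mode on Ascend; `init` and `get_group_size` come from `mindspore.communication.management`):

```python
import mindspore
from mindspore import ParallelMode
from mindspore.communication.management import init, get_group_size

# Graph mode (mode=0) on an Ascend device.
mindspore.set_context(mode=0, device_target="Ascend")
# Initialize the collective-communication backend before querying the group size.
init()
device_num = get_group_size()
# Reset any stale parallel configuration, then enable data parallelism
# with gradient averaging across all devices.
mindspore.reset_auto_parallel_context()
mindspore.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL,
                                    gradients_mean=True,
                                    device_num=device_num)
```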
### Mixed-Precision Training @@ -92,7 +92,7 @@ context.set_auto_parallel_context(parallel_mode = ParallelMode.DATA_PARALLEL, de 1. For hardware environment preparation, see the ["Driver and Firmware Installation and Upgrade Guide"](https://support.huawei.com/enterprise/zh/category/ai-computing-platform-pid-1557196528909) for each hardware product. The firmware and driver matching the CANN version must be installed on the hardware devices. -2. Python3 and Docker must be installed on the host machine, and you need to log in to the [Ascend Hub Center](https://ascend.huawei.com/ascendhub/#/home) to obtain the image. +2. Python3 and Docker must be installed on the host machine, and you need to log in to the [Ascend Hub Center](https://www.hiascend.com/developer/ascendhub) to obtain the image. The images supported by the current model are listed in the table below. **Table 1** Image list diff --git a/official/cv/YOLOv4/model_utils/moxing_adapter.py b/official/cv/YOLOv4/model_utils/moxing_adapter.py index 25838a7da99a27a1bb744684c1f75f80f5704688..189ff0667a1a783691749e55e41f1562c100b9c9 100644 --- a/official/cv/YOLOv4/model_utils/moxing_adapter.py +++ b/official/cv/YOLOv4/model_utils/moxing_adapter.py @@ -17,7 +17,7 @@ import os import functools -from mindspore import context +import mindspore from .config import config _global_sync_count = 0 @@ -92,7 +92,7 @@ def moxing_wrapper(pre_process=None, post_process=None): sync_data(config.train_url, config.output_path) print("Workspace downloaded: ", os.listdir(config.output_path)) - context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) + mindspore.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) config.device_num = get_device_num() config.device_id = get_device_id() if not os.path.exists(config.output_path): diff --git a/official/cv/YOLOv4/modelarts/modelarts.py b/official/cv/YOLOv4/modelarts/modelarts.py index bb0c131000d5b5f298a5275350554e54c733d350..4fe575b09250b85779ee9c6300918fe8a73c9a2f 100644 --- a/official/cv/YOLOv4/modelarts/modelarts.py +++ b/official/cv/YOLOv4/modelarts/modelarts.py @@ -20,11 +20,10 @@ import datetime import numpy as np import mindspore -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.nn.optim.momentum import Momentum from mindspore import Tensor import mindspore.nn as nn -from mindspore import context from mindspore.communication.management import init, get_rank, get_group_size from mindspore.train.serialization import export, load_checkpoint, load_param_into_net from mindspore.train.callback import ModelCheckpoint, RunContext @@ -82,7 +81,7 @@ def set_default(): config.ann_val_file = os.path.join(args_opt.data_url, 'annotations/instances_val2017.json') device_id = int(os.getenv('DEVICE_ID', '0')) - context.set_context(mode=context.GRAPH_MODE, + mindspore.set_context(mode=0, device_target=config.device_target, save_graphs=False, device_id=device_id) if config.need_profiler: @@ -208,13 +207,13 @@ def run_train(): profiler = set_default() loss_meter = AverageMeter('loss') - context.reset_auto_parallel_context() + mindspore.reset_auto_parallel_context() parallel_mode = ParallelMode.STAND_ALONE degree = 1 if config.is_distributed: parallel_mode = ParallelMode.DATA_PARALLEL degree = get_group_size() - context.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=degree) + mindspore.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=degree) network = YOLOV4CspDarkNet53() if config.run_eval: diff --git a/official/cv/YOLOv4/src/coco_visual.py b/official/cv/YOLOv4/src/coco_visual.py index ea9459295092e0128612dde1c6398a0ef588bf33..a15a70ee2ff2739a1287c8426d141eb5d0217904 100644 --- a/official/cv/YOLOv4/src/coco_visual.py +++ b/official/cv/YOLOv4/src/coco_visual.py @@ -551,8 +551,8 @@ class DetectEval(COCOeval): assert (tps.shape[0]) == 1 assert (fps.shape[0]) == 1 - tp_sum = np.cumsum(tps, axis=1).astype(dtype=np.float) - fp_sum = np.cumsum(fps, axis=1).astype(dtype=np.float) + tp_sum = np.cumsum(tps, axis=1).astype(dtype=np.float_) + fp_sum = np.cumsum(fps, axis=1).astype(dtype=np.float_) ids = catIds[k0] label
= labels[ids] diff --git a/official/cv/YOLOv4/src/yolo.py b/official/cv/YOLOv4/src/yolo.py index 357e37e73a91bb530af2043d521e33625edc47e7..40a174eb1e4214ec9e5c040c50550e3e2625833d 100644 --- a/official/cv/YOLOv4/src/yolo.py +++ b/official/cv/YOLOv4/src/yolo.py @@ -13,11 +13,10 @@ # limitations under the License. # ============================================================================ """YOLOv4 based on DarkNet.""" -import mindspore as ms +import mindspore import mindspore.nn as nn from mindspore.common.tensor import Tensor -from mindspore import context -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.parallel._auto_parallel_context import auto_parallel_context from mindspore.communication.management import get_group_size from mindspore.ops import operations as P @@ -237,7 +236,7 @@ class DetectionBlock(nn.Cell): self.offset_x_y = 0.025 else: raise KeyError("Invalid scale value for DetectionBlock") - self.anchors = Tensor([self.config.anchor_scales[i] for i in idx], ms.float32) + self.anchors = Tensor([self.config.anchor_scales[i] for i in idx], mindspore.float32) self.num_anchors_per_scale = 3 self.num_attrib = 4+1+self.config.num_classes self.lambda_coord = 1 @@ -262,8 +261,8 @@ class DetectionBlock(nn.Cell): range_x = range(grid_size[1]) range_y = range(grid_size[0]) - grid_x = P.Cast()(F.tuple_to_array(range_x), ms.float32) - grid_y = P.Cast()(F.tuple_to_array(range_y), ms.float32) + grid_x = P.Cast()(F.tuple_to_array(range_x), mindspore.float32) + grid_y = P.Cast()(F.tuple_to_array(range_y), mindspore.float32) # Tensor of shape [grid_size[0], grid_size[1], 1, 1] representing the coordinate of x/y axis for each grid # [batch, gridx, gridy, 1, 1] grid_x = self.tile(self.reshape(grid_x, (1, 1, -1, 1, 1)), (1, grid_size[0], 1, 1, 1)) @@ -279,7 +278,7 @@ class DetectionBlock(nn.Cell): # gridsize1 is x # gridsize0 is y box_xy = (self.scale_x_y * self.sigmoid(box_xy) - self.offset_x_y + grid) / \ - P.Cast()(F.tuple_to_array((grid_size[1], grid_size[0])), ms.float32) + P.Cast()(F.tuple_to_array((grid_size[1], grid_size[0])), mindspore.float32) # box_wh is w->h box_wh = P.Exp()(box_wh) * self.anchors / input_shape box_confidence = self.sigmoid(box_confidence) @@ -342,8 +341,8 @@ class YoloLossBlock(nn.Cell): idx = (6, 7, 8) else: raise KeyError("Invalid scale value for DetectionBlock") - self.anchors = Tensor([self.config.anchor_scales[i] for i in idx], ms.float32) - self.ignore_threshold = Tensor(self.config.ignore_threshold, ms.float32) + self.anchors = Tensor([self.config.anchor_scales[i] for i in idx], mindspore.float32) + self.ignore_threshold = Tensor(self.config.ignore_threshold, mindspore.float32) self.concat = P.Concat(axis=-1) self.iou = Iou() self.reduce_max = P.ReduceMax(keep_dims=False) @@ -372,7 +371,7 @@ class YoloLossBlock(nn.Cell): true_boxes = y_true[:, :, :, :, :4] grid_shape = P.Shape()(prediction)[1:3] - grid_shape = P.Cast()(F.tuple_to_array(grid_shape[::-1]), ms.float32) + grid_shape = P.Cast()(F.tuple_to_array(grid_shape[::-1]), mindspore.float32) pred_boxes = self.concat((pred_xy, pred_wh)) true_wh = y_true[:, :, :, :, 2:4] @@ -396,7 +395,7 @@ class YoloLossBlock(nn.Cell): # ignore_mask IOU too small ignore_mask = best_iou < self.ignore_threshold - ignore_mask = P.Cast()(ignore_mask, ms.float32) + ignore_mask = P.Cast()(ignore_mask, mindspore.float32) ignore_mask = P.ExpandDims()(ignore_mask, -1) # ignore_mask backpro will cause a lot maximunGrad and minimumGrad time consume. 
# so we turn off its gradient @@ -438,7 +437,7 @@ class YOLOV4CspDarkNet53(nn.Cell): super(YOLOV4CspDarkNet53, self).__init__() self.config = default_config self.keep_detect = self.config.keep_detect - self.test_img_shape = Tensor(tuple(self.config.test_img_shape), ms.float32) + self.test_img_shape = Tensor(tuple(self.config.test_img_shape), mindspore.float32) # YOLOv4 network self.feature_map = YOLOv4(backbone=CspDarkNet53(ResidualBlock, detect=True), @@ -497,13 +496,13 @@ class TrainingWrapper(nn.Cell): self.sens = sens self.reducer_flag = False self.grad_reducer = None - self.parallel_mode = context.get_auto_parallel_context("parallel_mode") + self.parallel_mode = mindspore.get_auto_parallel_context("parallel_mode") if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]: self.reducer_flag = True if self.reducer_flag: - mean = context.get_auto_parallel_context("gradients_mean") + mean = mindspore.get_auto_parallel_context("gradients_mean") if auto_parallel_context().get_device_num_is_set(): - degree = context.get_auto_parallel_context("device_num") + degree = mindspore.get_auto_parallel_context("device_num") else: degree = get_group_size() self.grad_reducer = nn.DistributedGradReducer(optimizer.parameters, mean, degree) @@ -549,9 +548,9 @@ class Giou(nn.Cell): union = box_p_area + box_gt_area - intersection union = union + self.eps c_area = c_area + self.eps - iou = self.div(self.cast(intersection, ms.float32), self.cast(union, ms.float32)) + iou = self.div(self.cast(intersection, mindspore.float32), self.cast(union, mindspore.float32)) res_mid0 = c_area - union - res_mid1 = self.div(self.cast(res_mid0, ms.float32), self.cast(c_area, ms.float32)) + res_mid1 = self.div(self.cast(res_mid0, mindspore.float32), self.cast(c_area, mindspore.float32)) giou = iou - res_mid1 giou = C.clip_by_value(giou, -1.0, 1.0) return giou diff --git a/official/cv/YOLOv4/test.py b/official/cv/YOLOv4/test.py index 26be396ee310c7f98f91a55b8f221940b775c1a8..1235ab62ea9d0578d35d49b70d9c6dcd929f3b89 100644 --- a/official/cv/YOLOv4/test.py +++ b/official/cv/YOLOv4/test.py @@ -22,9 +22,9 @@ from collections import defaultdict import json import numpy as np -from mindspore import context +import mindspore from mindspore import Tensor -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.communication.management import init, get_rank, get_group_size from mindspore.train.serialization import load_checkpoint, load_param_into_net @@ -37,7 +37,7 @@ from model_utils.moxing_adapter import moxing_wrapper from model_utils.device_adapter import get_device_id, get_device_num devid = int(os.getenv('DEVICE_ID')) -context.set_context(mode=context.GRAPH_MODE, device_target="Davinci", save_graphs=False, device_id=devid) +mindspore.set_context(mode=0, device_target="Davinci", save_graphs=False, device_id=devid) config.data_root = os.path.join(config.data_dir, 'test2017') config.nms_thresh = config.test_nms_thresh @@ -288,12 +288,12 @@ def run_test(): config.logger = get_logger(config.outputs_dir, config.rank) - context.reset_auto_parallel_context() + mindspore.reset_auto_parallel_context() if config.is_distributed: parallel_mode = ParallelMode.DATA_PARALLEL else: parallel_mode = ParallelMode.STAND_ALONE - context.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=1) + mindspore.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=1) config.logger.info('Creating Network....') network = 
YOLOV4CspDarkNet53() diff --git a/official/cv/YOLOv4/train.py b/official/cv/YOLOv4/train.py index eaf55b6cb2541ccf0ee8ade873f502dd301f9e37..988fb3b959303a497c7b5f925487ffa8b5a25a16 100644 --- a/official/cv/YOLOv4/train.py +++ b/official/cv/YOLOv4/train.py @@ -17,15 +17,14 @@ import os import time import datetime -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.nn.optim.momentum import Momentum from mindspore import Tensor import mindspore.nn as nn -from mindspore import context from mindspore.communication.management import init, get_rank, get_group_size from mindspore.train.callback import ModelCheckpoint, RunContext from mindspore.train.callback import CheckpointConfig -import mindspore as ms +import mindspore from mindspore.common import set_seed from mindspore.profiler.profiling import Profiler @@ -55,7 +54,7 @@ def set_default(): config.ann_val_file = os.path.join(config.data_dir, 'annotations/instances_val2017.json') device_id = int(os.getenv('DEVICE_ID', '0')) - context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, save_graphs=False, + mindspore.set_context(mode=0, device_target=config.device_target, save_graphs=False, device_id=device_id, ascend_config={"precision_mode": "allow_fp32_to_fp16"}) if config.need_profiler: @@ -180,13 +179,13 @@ def get_network(net, cfg, learning_rate): def run_train(): profiler = set_default() loss_meter = AverageMeter('loss') - context.reset_auto_parallel_context() + mindspore.reset_auto_parallel_context() parallel_mode = ParallelMode.STAND_ALONE degree = 1 if config.is_distributed: parallel_mode = ParallelMode.DATA_PARALLEL degree = get_group_size() - context.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=degree) + mindspore.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=degree) network = YOLOV4CspDarkNet53() if config.run_eval: @@ -234,7 +233,7 @@ def run_train(): data_val_root = config.data_val_root ann_val_file = config.ann_val_file save_ckpt_path = os.path.join(config.outputs_dir, 'ckpt_' + str(config.rank) + '/') - input_val_shape = Tensor(tuple(config.test_img_shape), ms.float32) + input_val_shape = Tensor(tuple(config.test_img_shape), mindspore.float32) # init detection engine eval_dataset, eval_data_size = create_yolo_dataset(data_val_root, ann_val_file, is_training=False, batch_size=config.per_batch_size, max_epoch=1, device_num=1, @@ -263,7 +262,7 @@ def run_train(): batch_gt_box1 = Tensor.from_numpy(data['gt_box2']) batch_gt_box2 = Tensor.from_numpy(data['gt_box3']) - input_shape = Tensor(tuple(input_shape[::-1]), ms.float32) + input_shape = Tensor(tuple(input_shape[::-1]), mindspore.float32) loss = network(images, batch_y_true_0, batch_y_true_1, batch_y_true_2, batch_gt_box0, batch_gt_box1, batch_gt_box2, input_shape) loss_meter.update(loss.asnumpy()) diff --git a/official/cv/YOLOv5/eval.py b/official/cv/YOLOv5/eval.py index 6f451173b77d2a357821e4098be2dae94e391a30..3f4bd88d21ac8de6fbac6e2e694a4890e24c45d7 100644 --- a/official/cv/YOLOv5/eval.py +++ b/official/cv/YOLOv5/eval.py @@ -17,9 +17,8 @@ import os import time import shutil -import mindspore as ms -from mindspore import context -from mindspore.context import ParallelMode +import mindspore +from mindspore import ParallelMode from mindspore.communication.management import init, get_group_size, get_rank from src.yolo import YOLOV5 @@ -37,7 +36,7 @@ def eval_preprocess(): config.val_img_dir = os.path.join(config.data_dir, 
config.val_img_dir) config.val_ann_file = os.path.join(config.data_dir, config.val_ann_file) device_id = int(os.getenv('DEVICE_ID', '0')) - ms.set_context(mode=ms.GRAPH_MODE, device_target=config.device_target, device_id=device_id) + mindspore.set_context(mode=0, device_target=config.device_target, device_id=device_id) parallel_mode = ParallelMode.STAND_ALONE config.eval_parallel = config.is_distributed and config.eval_parallel device_num = 1 @@ -47,8 +46,8 @@ def eval_preprocess(): config.group_size = get_group_size() device_num = get_group_size() parallel_mode = ParallelMode.DATA_PARALLEL - context.reset_auto_parallel_context() - context.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=device_num) + mindspore.reset_auto_parallel_context() + mindspore.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=device_num) # logger module is managed by config, it is used in other function. e.x. config.logger.info("xxx") config.logger = get_logger(config.output_dir, device_id) @@ -56,7 +55,7 @@ def eval_preprocess(): def load_parameters(network, filename): config.logger.info("yolov5 pretrained network model: %s", filename) - param_dict = ms.load_checkpoint(filename) + param_dict = mindspore.load_checkpoint(filename) param_dict_new = {} for key, values in param_dict.items(): if key.startswith('moments.'): @@ -65,7 +64,7 @@ def load_parameters(network, filename): param_dict_new[key[13:]] = values else: param_dict_new[key] = values - ms.load_param_into_net(network, param_dict_new) + mindspore.load_param_into_net(network, param_dict_new) config.logger.info('load_model %s success', filename) diff --git a/official/cv/YOLOv5/eval_onnx.py b/official/cv/YOLOv5/eval_onnx.py index 71aa246a4436134600f6f987b19413b5f02fca6e..58edd578946783303cdf21435897973482634e48 100644 --- a/official/cv/YOLOv5/eval_onnx.py +++ b/official/cv/YOLOv5/eval_onnx.py @@ -20,8 +20,8 @@ import time import numpy as np import onnxruntime as ort -from mindspore.context import ParallelMode -from mindspore import context +import mindspore +from mindspore import ParallelMode from eval import DetectionEngine from model_utils.config import config @@ -63,9 +63,9 @@ def run_eval(): rank_id = int(os.getenv('DEVICE_ID', '0')) config.logger = get_logger(config.outputs_dir, rank_id) - context.reset_auto_parallel_context() + mindspore.reset_auto_parallel_context() parallel_mode = ParallelMode.STAND_ALONE - context.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=1) + mindspore.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=1) ds = create_yolo_dataset(data_root, ann_file, batch_size=config.per_batch_size, device_num=1, rank=rank_id, config=config, is_training=False, shuffle=False) diff --git a/official/cv/YOLOv5/export.py b/official/cv/YOLOv5/export.py index b11392730f1a4c3b70cc39a3d9e2b85a974f365e..24785c4c8e3f127657b120ff8af75ded3e4fe913 100644 --- a/official/cv/YOLOv5/export.py +++ b/official/cv/YOLOv5/export.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
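load_parameters above drops optimizer state and the training wrapper's name prefix before loading a checkpoint into the bare network. A minimal sketch of that key-filtering idiom, assuming the 'yolo_network.' prefix that YoloWithLossCell adds (thirteen characters, hence key[13:] in the original):

```python
import mindspore

def load_inference_params(network, ckpt_path):
    param_dict = mindspore.load_checkpoint(ckpt_path)
    filtered = {}
    for key, value in param_dict.items():
        if key.startswith('moments.'):
            # Momentum accumulator slots: optimizer state, not weights.
            continue
        if key.startswith('yolo_network.'):
            # Strip the training-wrapper prefix so names match the bare net.
            key = key[len('yolo_network.'):]
        filtered[key] = value
    mindspore.load_param_into_net(network, filtered)
```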
# ============================================================================ -import mindspore as ms +import mindspore from src.yolo import YOLOV5s_Infer @@ -22,9 +22,9 @@ from model_utils.moxing_adapter import moxing_wrapper, modelarts_export_preproce @moxing_wrapper(pre_process=modelarts_export_preprocess, pre_args=[config]) def run_export(): - ms.set_context(mode=ms.GRAPH_MODE, device_target=config.device_target) + mindspore.set_context(mode=0, device_target=config.device_target) if config.device_target == "Ascend": - ms.set_context(device_id=config.device_id) + mindspore.set_context(device_id=config.device_id) dict_version = {'yolov5s': 0, 'yolov5m': 1, 'yolov5l': 2, 'yolov5x': 3} config.file_name = config.file_name + '_' + config.yolov5_version @@ -32,12 +32,12 @@ def run_export(): network = YOLOV5s_Infer(config.testing_shape[0], version=dict_version[config.yolov5_version]) network.set_train(False) - param_dict = ms.load_checkpoint(config.ckpt_file) - ms.load_param_into_net(network, param_dict) + param_dict = mindspore.load_checkpoint(config.ckpt_file) + mindspore.load_param_into_net(network, param_dict) - input_data = ms.numpy.zeros([config.batch_size, config.testing_shape[0], config.testing_shape[1], 3], ms.int8) + input_data = mindspore.numpy.zeros([config.batch_size, config.testing_shape[0], config.testing_shape[1], 3], mindspore.int8) - ms.export(network, input_data, file_name=config.file_name, file_format=config.file_format) + mindspore.export(network, input_data, file_name=config.file_name, file_format=config.file_format) print('==========success export===============') if __name__ == "__main__": diff --git a/official/cv/YOLOv5/model_utils/moxing_adapter.py b/official/cv/YOLOv5/model_utils/moxing_adapter.py index a2f802f598fac7fb32d13c4d4c556251153766be..3ef2093192a05908680b9cbfea1e8b562bdc253f 100644 --- a/official/cv/YOLOv5/model_utils/moxing_adapter.py +++ b/official/cv/YOLOv5/model_utils/moxing_adapter.py @@ -17,7 +17,7 @@ import os import functools -import mindspore as ms +import mindspore from .config import config _global_sync_count = 0 @@ -151,7 +152,7 @@ def moxing_wrapper(pre_process=None, post_process=None, **kwargs): sync_data(config.train_url, config.output_path) print("Workspace downloaded: ", os.listdir(config.output_path)) - ms.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) + mindspore.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) config.device_num = get_device_num() config.device_id = get_device_id() if not os.path.exists(config.output_path): diff --git a/official/cv/YOLOv5/modelarts/train_start.py b/official/cv/YOLOv5/modelarts/train_start.py index 81f98c8c83eccffca4b481cfd32688c4fdf346ae..8d510884bd0fdf5fb048f5f169ad0532817a5e0c 100644 --- a/official/cv/YOLOv5/modelarts/train_start.py +++ b/official/cv/YOLOv5/modelarts/train_start.py @@ -16,11 +16,11 @@ import os import time import numpy as np -import mindspore as ms +import mindspore import mindspore.nn as nn import mindspore.communication as comm from mindspore.train.serialization import export, load_checkpoint, load_param_into_net -from mindspore import context, Tensor +from mindspore import Tensor from src.yolo import YOLOV5, YoloWithLossCell, YOLOV5s_Infer from src.logger import get_logger @@ -36,14 +36,14 @@ from model_utils.device_adapter import get_device_id from model_utils.moxing_adapter import moxing_wrapper, modelarts_pre_process -ms.set_seed(1) +mindspore.set_seed(1) def init_distribute(): comm.init()
config.rank = comm.get_rank() config.group_size = comm.get_group_size() - ms.set_auto_parallel_context(parallel_mode=ms.ParallelMode.DATA_PARALLEL, gradients_mean=True, + mindspore.set_auto_parallel_context(parallel_mode=mindspore.ParallelMode.DATA_PARALLEL, gradients_mean=True, device_num=config.group_size) @@ -57,7 +57,7 @@ def train_preprocess(): if config.pretrained_checkpoint: config.pretrained_checkpoint = os.path.join(config.load_path, config.pretrained_checkpoint) device_id = get_device_id() - context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, device_id=device_id) + mindspore.set_context(mode=0, device_target=config.device_target, device_id=device_id) if config.is_distributed: # init distributed @@ -81,7 +81,7 @@ def export_models(ckpt_path): outputs_path = os.path.join(config.output_dir, 'yolov5') param_dict = load_checkpoint(ckpt_path) load_param_into_net(net, param_dict) - input_arr = Tensor(np.zeros([1, 12, config.testing_shape[0] // 2, config.testing_shape[1] // 2]), ms.float32) + input_arr = Tensor(np.zeros([1, 12, config.testing_shape[0] // 2, config.testing_shape[1] // 2]), mindspore.float32) export(net, input_arr, file_name=outputs_path, file_format=config.file_format) config.logger.info("export best model finished....") @@ -105,7 +105,7 @@ def run_train(): steps_per_epoch = ds.get_dataset_size() lr = get_lr(config, steps_per_epoch) - opt = nn.Momentum(params=get_param_groups(network), momentum=config.momentum, learning_rate=ms.Tensor(lr), + opt = nn.Momentum(params=get_param_groups(network), momentum=config.momentum, learning_rate=mindspore.Tensor(lr), weight_decay=config.weight_decay, loss_scale=config.loss_scale) network = nn.TrainOneStepCell(network, opt, config.loss_scale // 2) network.set_train() @@ -118,7 +118,7 @@ def run_train(): for step_idx, data in enumerate(data_loader): images = data[0] input_shape = images.shape[2:4] - input_shape = ms.Tensor(tuple(input_shape[::-1]), ms.float32) + input_shape = mindspore.Tensor(tuple(input_shape[::-1]), mindspore.float32) loss = network(images, data[2], data[3], data[4], data[5], data[6], data[7], input_shape) loss_meter.update(loss.asnumpy()) @@ -140,7 +140,7 @@ def run_train(): loss_meter.reset() if config.rank == 0: ckpt_name = os.path.join(config.output_dir, "yolov5_{}_{}.ckpt".format(epoch_idx + 1, steps_per_epoch)) - ms.save_checkpoint(network, ckpt_name) + mindspore.save_checkpoint(network, ckpt_name) export_models(ckpt_name) config.logger.info('==========end training===============') diff --git a/official/cv/YOLOv5/scripts/run_distribute_eval.sh b/official/cv/YOLOv5/scripts/run_distribute_eval.sh index 1d9245d80cd6dbd5d5ca971307aace5e04ad2da5..26551aae015b0656b3fb8467ede8c562d9b0e22b 100644 --- a/official/cv/YOLOv5/scripts/run_distribute_eval.sh +++ b/official/cv/YOLOv5/scripts/run_distribute_eval.sh @@ -84,7 +84,6 @@ do cp ../*.yaml $dir_path cp -r ../src $dir_path cp -r ../model_utils $dir_path - cp -r ../third_party $dir_path cd $dir_path || exit env > env.log echo "start inferring for rank $RANK_ID, device $DEVICE_ID" diff --git a/official/cv/YOLOv5/scripts/run_distribute_train.sh b/official/cv/YOLOv5/scripts/run_distribute_train.sh index e476e06b19cf601b63328cca5b1971c8ecbff866..cb38aae52aa7984daf4b29695a620624b17c8b49 100644 --- a/official/cv/YOLOv5/scripts/run_distribute_train.sh +++ b/official/cv/YOLOv5/scripts/run_distribute_train.sh @@ -66,7 +66,6 @@ do cp ../*.yaml ./train_parallel$i cp -r ../src ./train_parallel$i cp -r ../model_utils ./train_parallel$i - cp -r 
../third_party ./train_parallel$i cd ./train_parallel$i || exit echo "start training for rank $RANK_ID, device $DEVICE_ID" env > env.log diff --git a/official/cv/YOLOv5/scripts/run_distribute_train_gpu.sh b/official/cv/YOLOv5/scripts/run_distribute_train_gpu.sh index 26f81ca7dcc92e0a1333fe11bad18a68ccc061a8..b92fe1b65789cb7da8c875bf1bf422912df6b1ee 100644 --- a/official/cv/YOLOv5/scripts/run_distribute_train_gpu.sh +++ b/official/cv/YOLOv5/scripts/run_distribute_train_gpu.sh @@ -48,7 +48,6 @@ cp ../*.py ./distribute_train cp ../*.yaml ./distribute_train cp -r ../src ./distribute_train cp -r ../model_utils ./distribute_train -cp -r ../third_party ./distribute_train cd ./distribute_train || exit mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \ diff --git a/official/cv/YOLOv5/scripts/run_eval.sh b/official/cv/YOLOv5/scripts/run_eval.sh index 1a5651383e4714bd073e98a15792dcc7090198af..e43870ccf9ee97cf9f2ba0db1fbc1cc0d39987d6 100644 --- a/official/cv/YOLOv5/scripts/run_eval.sh +++ b/official/cv/YOLOv5/scripts/run_eval.sh @@ -58,7 +58,6 @@ cp ../*.py ./eval cp ../*.yaml ./eval cp -r ../src ./eval cp -r ../model_utils ./eval -cp -r ../third_party ./eval cd ./eval || exit env > env.log echo "start inferring for device $DEVICE_ID" diff --git a/official/cv/YOLOv5/scripts/run_standalone_train.sh b/official/cv/YOLOv5/scripts/run_standalone_train.sh index 260028a4378c94cc8ebee80285379b1dd548269a..2f014e3bbe484d7ddb479275744533e32a02640e 100644 --- a/official/cv/YOLOv5/scripts/run_standalone_train.sh +++ b/official/cv/YOLOv5/scripts/run_standalone_train.sh @@ -53,7 +53,6 @@ cp ../*.py ./train cp ../*.yaml ./train cp -r ../src ./train cp -r ../model_utils ./train -cp -r ../third_party ./train cd ./train || exit echo "start training for device $DEVICE_ID" env > env.log diff --git a/official/cv/YOLOv5/scripts/run_standalone_train_gpu.sh b/official/cv/YOLOv5/scripts/run_standalone_train_gpu.sh index 0e70ab18256effc63c156c29ce9f5c05facfa95f..c8839c18bd3d84aeb48dc2dd1cb391c944e6019e 100644 --- a/official/cv/YOLOv5/scripts/run_standalone_train_gpu.sh +++ b/official/cv/YOLOv5/scripts/run_standalone_train_gpu.sh @@ -55,7 +55,6 @@ cp ../*.py ./train cp ../*.yaml ./train cp -r ../src ./train cp -r ../model_utils ./train -cp -r ../third_party ./train cd ./train || exit echo "======start training======" env > env.log diff --git a/official/cv/YOLOv5/src/initializer.py b/official/cv/YOLOv5/src/initializer.py index ff65b133c7b536292405e86d7ebc6112d9fef428..e92b97cda20aaa5eec16bb389862a1b2735b7d85 100644 --- a/official/cv/YOLOv5/src/initializer.py +++ b/official/cv/YOLOv5/src/initializer.py @@ -14,7 +14,7 @@ # ============================================================================ """Parameter init.""" import math -import mindspore as ms +import mindspore from mindspore import nn @@ -22,14 +22,14 @@ def default_recurisive_init(custom_cell): """Initialize parameter.""" for _, cell in custom_cell.cells_and_names(): if isinstance(cell, (nn.Conv2d, nn.Dense)): - cell.weight.set_data(ms.common.initializer.initializer(ms.common.initializer.HeUniform(math.sqrt(5)), + cell.weight.set_data(mindspore.common.initializer.initializer(mindspore.common.initializer.HeUniform(math.sqrt(5)), cell.weight.shape, cell.weight.dtype)) def load_yolov5_params(args, network): """Load yolov5 backbone parameter from checkpoint.""" if args.resume_yolov5: - param_dict = load_checkpoint(args.resume_yolov5) + param_dict = mindspore.load_checkpoint(args.resume_yolov5) param_dict_new = {} for key, 
values in param_dict.items(): if key.startswith('moments.'): @@ -42,11 +42,11 @@ def load_yolov5_params(args, network): args.logger.info('in resume {}'.format(key)) args.logger.info('resume finished') - load_param_into_net(network, param_dict_new) + mindspore.load_param_into_net(network, param_dict_new) args.logger.info('load_model {} success'.format(args.resume_yolov5)) if args.pretrained_checkpoint: - param_dict = load_checkpoint(args.pretrained_checkpoint) + param_dict = mindspore.load_checkpoint(args.pretrained_checkpoint) param_dict_new = {} for key, values in param_dict.items(): if key.startswith('moments.'): @@ -62,11 +62,11 @@ def load_yolov5_params(args, network): args.logger.info('in load {}'.format(key)) args.logger.info('pretrained finished') - load_param_into_net(network, param_dict_new) + mindspore.load_param_into_net(network, param_dict_new) args.logger.info('load_model {} success'.format(args.pretrained_backbone)) if args.pretrained_backbone: - param_dict = load_checkpoint(args.pretrained_backbone) + param_dict = mindspore.load_checkpoint(args.pretrained_backbone) param_dict_new = {} for key, values in param_dict.items(): if key.startswith('moments.'): @@ -79,5 +79,5 @@ def load_yolov5_params(args, network): args.logger.info('in resume {}'.format(key)) args.logger.info('pretrained finished') - load_param_into_net(network, param_dict_new) + mindspore.load_param_into_net(network, param_dict_new) args.logger.info('load_model {} success'.format(args.pretrained_backbone)) diff --git a/official/cv/YOLOv5/src/util.py b/official/cv/YOLOv5/src/util.py index 5bc8fd781e59f6687c9d478163fd616c1e9d99b3..cdcf2ea92a31ca8bcc1f1ae7dffc312a4bc5723a 100644 --- a/official/cv/YOLOv5/src/util.py +++ b/official/cv/YOLOv5/src/util.py @@ -24,7 +24,7 @@ import numpy as np from pycocotools.coco import COCO from pycocotools.cocoeval import COCOeval -import mindspore as ms +import mindspore import mindspore.nn as nn from mindspore import Tensor, ops @@ -143,7 +143,7 @@ def keep_loss_fp32(network): """Keep loss of network with float32""" for _, cell in network.cells_and_names(): if isinstance(cell, (YoloLossBlock,)): - cell.to_float(ms.float32) + cell.to_float(mindspore.float32) class Redirct: @@ -458,7 +458,7 @@ class EvalWrapper: self.dataset = dataset self.per_batch_size = config.per_batch_size self.device_num = config.group_size - self.input_shape = Tensor(tuple(config.test_img_shape), ms.float32) + self.input_shape = Tensor(tuple(config.test_img_shape), mindspore.float32) self.engine = engine self.eval_parallel = config.eval_parallel if config.eval_parallel: @@ -477,7 +477,7 @@ class EvalWrapper: def inference(self): for index, data in enumerate(self.dataset.create_dict_iterator(output_numpy=True, num_epochs=1)): image = data["image"] - image = ms.Tensor(image) + image = mindspore.Tensor(image) image_shape_ = data["image_shape"] image_id_ = data["img_id"] output_big, output_me, output_small = self.network(image, self.input_shape) diff --git a/official/cv/YOLOv5/src/yolo.py b/official/cv/YOLOv5/src/yolo.py index f9eeec6981253afdb876c712adf8f5fc1d1af05a..143903ffa8b1336c4d874af7baac547a31f6f8e3 100644 --- a/official/cv/YOLOv5/src/yolo.py +++ b/official/cv/YOLOv5/src/yolo.py @@ -14,7 +14,7 @@ # ============================================================================ """YOLOv5 based on DarkNet.""" import numpy as np -import mindspore as ms +import mindspore import mindspore.nn as nn import mindspore.ops as ops @@ -139,7 +139,7 @@ class DetectionBlock(nn.Cell): self.offset_x_y = 0.025 else: raise 
KeyError("Invalid scale value for DetectionBlock") - self.anchors = ms.Tensor([self.config.anchor_scales[i] for i in idx], ms.float32) + self.anchors = mindspore.Tensor([self.config.anchor_scales[i] for i in idx], mindspore.float32) self.num_anchors_per_scale = 3 self.num_attrib = 4+1+self.config.num_classes self.lambda_coord = 1 @@ -166,8 +166,8 @@ class DetectionBlock(nn.Cell): grid_size[1])) prediction = self.transpose(prediction, (0, 3, 4, 1, 2)) - grid_x = ms.numpy.arange(grid_size[1]) - grid_y = ms.numpy.arange(grid_size[0]) + grid_x = mindspore.numpy.arange(grid_size[1]) + grid_y = mindspore.numpy.arange(grid_size[0]) # Tensor of shape [grid_size[0], grid_size[1], 1, 1] representing the coordinate of x/y axis for each grid # [batch, gridx, gridy, 1, 1] grid_x = self.tile(self.reshape(grid_x, (1, 1, -1, 1, 1)), (1, grid_size[0], 1, 1, 1)) @@ -183,7 +183,7 @@ class DetectionBlock(nn.Cell): # gridsize1 is x # gridsize0 is y box_xy = (self.scale_x_y * self.sigmoid(box_xy) - self.offset_x_y + grid) / \ - ops.cast(ops.tuple_to_array((grid_size[1], grid_size[0])), ms.float32) + ops.cast(ops.tuple_to_array((grid_size[1], grid_size[0])), mindspore.float32) # box_wh is w->h box_wh = self.exp(box_wh) * self.anchors / input_shape @@ -250,8 +250,8 @@ class YoloLossBlock(nn.Cell): idx = (6, 7, 8) else: raise KeyError("Invalid scale value for DetectionBlock") - self.anchors = ms.Tensor([self.config.anchor_scales[i] for i in idx], ms.float32) - self.ignore_threshold = ms.Tensor(self.config.ignore_threshold, ms.float32) + self.anchors = mindspore.Tensor([self.config.anchor_scales[i] for i in idx], mindspore.float32) + self.ignore_threshold = mindspore.Tensor(self.config.ignore_threshold, mindspore.float32) self.concat = ops.Concat(axis=-1) self.iou = Iou() self.reduce_max = ops.ReduceMax(keep_dims=False) @@ -281,7 +281,7 @@ class YoloLossBlock(nn.Cell): true_boxes = y_true[:, :, :, :, :4] grid_shape = prediction.shape[1:3] - grid_shape = ops.cast(self.tuple_to_array(grid_shape[::-1]), ms.float32) + grid_shape = ops.cast(self.tuple_to_array(grid_shape[::-1]), mindspore.float32) pred_boxes = self.concat((pred_xy, pred_wh)) true_wh = y_true[:, :, :, :, 2:4] @@ -304,7 +304,7 @@ class YoloLossBlock(nn.Cell): # ignore_mask IOU too small ignore_mask = best_iou < self.ignore_threshold - ignore_mask = ops.cast(ignore_mask, ms.float32) + ignore_mask = ops.cast(ignore_mask, mindspore.float32) ignore_mask = self.expand_dims(ignore_mask, -1) # ignore_mask backpro will cause a lot maximunGrad and minimumGrad time consume. 
# so we turn off its gradient @@ -353,9 +353,9 @@ class YOLOV5(nn.Cell): self.detect_1 = DetectionBlock('l', is_training=is_training) self.detect_2 = DetectionBlock('m', is_training=is_training) self.detect_3 = DetectionBlock('s', is_training=is_training) - self.mean = ms.Tensor(np.array([0.485 * 255, 0.456 * 255, 0.406 * 255], + self.mean = mindspore.Tensor(np.array([0.485 * 255, 0.456 * 255, 0.406 * 255], dtype=np.float32)).reshape((1, 1, 1, 3)) - self.std = ms.Tensor(np.array([0.229 * 255, 0.224 * 255, 0.225 * 255], + self.std = mindspore.Tensor(np.array([0.229 * 255, 0.224 * 255, 0.225 * 255], dtype=np.float32)).reshape((1, 1, 1, 3)) def construct(self, x, input_shape): @@ -432,9 +432,9 @@ class GIou(nn.Cell): union = box_p_area + box_gt_area - intersection union = union + self.eps c_area = c_area + self.eps - iou = self.div(ops.cast(intersection, ms.float32), ops.cast(union, ms.float32)) + iou = self.div(ops.cast(intersection, mindspore.float32), ops.cast(union, mindspore.float32)) res_mid0 = c_area - union - res_mid1 = self.div(ops.cast(res_mid0, ms.float32), ops.cast(c_area, ms.float32)) + res_mid1 = self.div(ops.cast(res_mid0, mindspore.float32), ops.cast(c_area, mindspore.float32)) giou = iou - res_mid1 giou = ops.clip_by_value(giou, -1.0, 1.0) return giou diff --git a/official/cv/YOLOv5/train.py b/official/cv/YOLOv5/train.py index d2318227c40b71545cc0953124bb868cc7023123..e7af2b662bcbcf46523a6e27d7d1c72c2bf016b5 100644 --- a/official/cv/YOLOv5/train.py +++ b/official/cv/YOLOv5/train.py @@ -16,7 +16,7 @@ import os import time from collections import deque -import mindspore as ms +import mindspore import mindspore.nn as nn import mindspore.communication as comm from mindspore import load_checkpoint, Parameter, save_checkpoint @@ -35,14 +35,14 @@ from model_utils.device_adapter import get_device_id from model_utils.moxing_adapter import moxing_wrapper, modelarts_pre_process, modelarts_post_process -ms.set_seed(1) +mindspore.set_seed(1) def init_distribute(): comm.init() config.rank = comm.get_rank() config.group_size = comm.get_group_size() - ms.set_auto_parallel_context(parallel_mode=ms.ParallelMode.DATA_PARALLEL, gradients_mean=True, + mindspore.set_auto_parallel_context(parallel_mode=mindspore.ParallelMode.DATA_PARALLEL, gradients_mean=True, device_num=config.group_size) @@ -56,9 +56,9 @@ def train_preprocess(): device_id = get_device_id() if config.device_target == "Ascend": device_id = get_device_id() - ms.set_context(mode=ms.GRAPH_MODE, device_target=config.device_target, device_id=device_id) + mindspore.set_context(mode=0, device_target=config.device_target, device_id=device_id) else: - ms.set_context(mode=ms.GRAPH_MODE, device_target=config.device_target) + mindspore.set_context(mode=0, device_target=config.device_target) if config.is_distributed: # init distributed @@ -95,7 +95,7 @@ def load_parameters(val_network, train_network): param_dict_new[key[13:]] = values else: param_dict_new[key] = values - ms.load_param_into_net(val_network, param_dict_new) + mindspore.load_param_into_net(val_network, param_dict_new) config.logger.info('Load train network success') @@ -146,7 +146,7 @@ def run_train(): steps_per_epoch = ds.get_dataset_size() lr = get_lr(config, steps_per_epoch) - opt = nn.Momentum(params=get_param_groups(network), momentum=config.momentum, learning_rate=ms.Tensor(lr), + opt = nn.Momentum(params=get_param_groups(network), momentum=config.momentum, learning_rate=mindspore.Tensor(lr), weight_decay=config.weight_decay, loss_scale=config.loss_scale) network = 
nn.TrainOneStepCell(network, opt, config.loss_scale // 2) network.set_train() @@ -163,7 +163,7 @@ def run_train(): for step_idx, data in enumerate(data_loader): images = data[0] input_shape = images.shape[1:3] - input_shape = ms.Tensor(input_shape, ms.float32) + input_shape = mindspore.Tensor(input_shape, mindspore.float32) loss = network(images, data[2], data[3], data[4], data[5], data[6], data[7], input_shape) loss_meter.update(loss.asnumpy()) @@ -185,7 +185,7 @@ def run_train(): loss_meter.reset() if config.rank == 0 and (epoch_idx % config.save_ckpt_interval == 0): ckpt_name = os.path.join(config.output_dir, "yolov5_{}_{}.ckpt".format(epoch_idx + 1, steps_per_epoch)) - ms.save_checkpoint(network, ckpt_name) + mindspore.save_checkpoint(network, ckpt_name) if len(ckpt_queue) == config.save_ckpt_max_num: ckpt_to_remove = ckpt_queue.popleft() os.remove(ckpt_to_remove) diff --git a/official/nlp/Bert/export.py b/official/nlp/Bert/export.py index 942ffeb2e5e3c4aaa97abf10a9dd42f56f269d9b..65a3233b1da7de98bbceacbb233bf1df8891dfdc 100644 --- a/official/nlp/Bert/export.py +++ b/official/nlp/Bert/export.py @@ -17,8 +17,9 @@ import os import shutil import numpy as np +import mindspore import mindspore.common.dtype as mstype -from mindspore import Tensor, context, load_checkpoint, export +from mindspore import Tensor, load_checkpoint, export from src.finetune_eval_model import BertCLSModel, BertSquadModel, BertNERModel from src.bert_for_finetune import BertNER @@ -40,9 +41,9 @@ def modelarts_pre_process(): @moxing_wrapper(pre_process=modelarts_pre_process) def run_export(): '''export function''' - context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target) + mindspore.set_context(mode=0, device_target=args.device_target) if args.device_target == "Ascend": - context.set_context(device_id=args.device_id) + mindspore.set_context(device_id=args.device_id) if args.description == "run_ner": label_list = [] diff --git a/official/nlp/Bert/modelarts/train_start.py b/official/nlp/Bert/modelarts/train_start.py index 8b9413cde0412b6c4abd953bc10b718cf8889407..30149e9039c4420062f722a68dfdbae35d36e0ab 100644 --- a/official/nlp/Bert/modelarts/train_start.py +++ b/official/nlp/Bert/modelarts/train_start.py @@ -19,9 +19,10 @@ Bert finetune and evaluation script. 
import os import collections import shutil +import mindspore import mindspore.common.dtype as mstype from mindspore import log as logger -from mindspore import Tensor, context, load_checkpoint, export +from mindspore import Tensor, load_checkpoint, export from mindspore.nn.wrap.loss_scale import DynamicLossScaleUpdateCell from mindspore.nn.optim import AdamWeightDecay, Lamb, Momentum from mindspore.train.model import Model @@ -153,9 +154,9 @@ def _get_last_ckpt(ckpt_dir): def run_export(ckpt_dir): '''export function''' ckpt_file = _get_last_ckpt(ckpt_dir) - context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.device_target) + mindspore.set_context(mode=0, device_target=args_opt.device_target) if args_opt.device_target == "Ascend": - context.set_context(device_id=args_opt.device_id) + mindspore.set_context(device_id=args_opt.device_id) if args_opt.description == "run_ner": label_list = [] @@ -218,10 +219,10 @@ def run_squad(): load_finetune_checkpoint_path = args_opt.load_finetune_checkpoint_path target = args_opt.device_target if target == "Ascend": - context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=args_opt.device_id) + mindspore.set_context(mode=0, device_target="Ascend", device_id=args_opt.device_id) elif target == "GPU": - context.set_context(mode=context.GRAPH_MODE, device_target="GPU") - context.set_context(enable_graph_kernel=True) + mindspore.set_context(mode=0, device_target="GPU") + mindspore.set_context(enable_graph_kernel=True) if bert_net_cfg.compute_type != mstype.float32: logger.warning('GPU only support fp32 temporarily, run with fp32.') bert_net_cfg.compute_type = mstype.float32 diff --git a/official/nlp/Bert/pretrain_config_Ascend_Boost.yaml b/official/nlp/Bert/pretrain_config_Ascend_Boost.yaml index 0a96802354e7f6c2321496bfa5383d500748e210..032ac0db46937a8f3c2d07489b59564610a3bbce 100644 --- a/official/nlp/Bert/pretrain_config_Ascend_Boost.yaml +++ b/official/nlp/Bert/pretrain_config_Ascend_Boost.yaml @@ -14,6 +14,7 @@ enable_profiling: False # ============================================================================== description: 'run_pretrain' distribute: 'false' +max_device_memory: "28.5GB" epoch_size: 40 device_id: 0 device_num: 1 diff --git a/official/nlp/Bert/pretrain_eval.py b/official/nlp/Bert/pretrain_eval.py index 2537b82294d75932337f11cde06090c5bcb1b621..84c87a9a4427751d68adaa4f7a2597db6a5ca6b2 100644 --- a/official/nlp/Bert/pretrain_eval.py +++ b/official/nlp/Bert/pretrain_eval.py @@ -18,7 +18,7 @@ Bert evaluation script. """ import os -from mindspore import context +import mindspore from mindspore.train.model import Model from mindspore.train.serialization import load_checkpoint, load_param_into_net from src.utils import BertMetric @@ -32,7 +32,7 @@ def bert_predict(): Predict function ''' devid = int(os.getenv('DEVICE_ID')) - context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=devid) + mindspore.set_context(mode=0, device_target="Ascend", device_id=devid, jit_config={"jit_level": "O2"}) dataset = create_eval_dataset(cfg.batch_size, 1, data_dir=cfg.eval_data_dir, dataset_format=cfg.dataset_format) net_for_pretraining = BertPretrainEval(bert_net_cfg) net_for_pretraining.set_train(False) diff --git a/official/nlp/Bert/quick_start.py b/official/nlp/Bert/quick_start.py index 6dd08e514870ef033f479fdc70d39ebcdb8f83ff..9ca98af7f6b8215c0fd0b7e1f86c0e36ada4a394 100644 --- a/official/nlp/Bert/quick_start.py +++ b/official/nlp/Bert/quick_start.py @@ -17,7 +17,7 @@ Bert quick start script. 
''' -import mindspore as ms +import mindspore from mindspore.train.model import Model from mindspore.ops import operations as P from mindspore.train.serialization import load_checkpoint, load_param_into_net @@ -59,9 +59,9 @@ def convert_single_example(text, max_seq_length, tokenizer): input_mask.append(0) segment_ids.append(0) - input_ids = ms.Tensor([input_ids,], dtype=ms.int32) - input_mask = ms.Tensor([input_mask,], dtype=ms.int32) - segment_ids = ms.Tensor([segment_ids,], dtype=ms.int32) + input_ids = mindspore.Tensor([input_ids,], dtype=mindspore.int32) + input_mask = mindspore.Tensor([input_mask,], dtype=mindspore.int32) + segment_ids = mindspore.Tensor([segment_ids,], dtype=mindspore.int32) return input_ids, input_mask, segment_ids diff --git a/official/nlp/Bert/run_classifier.py b/official/nlp/Bert/run_classifier.py index cebf106eba26713641d376ee081c641280b9655f..e72dcfb67982e7e95d6fb432a3c812e7431ae39f 100644 --- a/official/nlp/Bert/run_classifier.py +++ b/official/nlp/Bert/run_classifier.py @@ -19,9 +19,8 @@ Bert finetune and evaluation script. import os from tqdm import tqdm -import mindspore as ms +import mindspore import mindspore.common.dtype as mstype -from mindspore import context from mindspore import log as logger from mindspore.nn.wrap.loss_scale import DynamicLossScaleUpdateCell from mindspore.nn.optim import AdamWeightDecay, Lamb, Momentum @@ -81,7 +80,7 @@ def do_train(dataset=None, network=None, load_checkpoint_path="", save_checkpoin param_dict = load_checkpoint(load_checkpoint_path) load_param_into_net(network, param_dict) - if ms.get_context("device_target") == "CPU": + if mindspore.get_context("device_target") == "CPU": netwithgrads = BertFinetuneCellCPU(network, optimizer=optimizer) else: update_cell = DynamicLossScaleUpdateCell(loss_scale_value=2**32, scale_factor=2, scale_window=1000) @@ -171,18 +170,19 @@ def run_classifier(): load_finetune_checkpoint_path = args_opt.load_finetune_checkpoint_path target = args_opt.device_target if target == "Ascend": - context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=args_opt.device_id) + mindspore.set_context(mode=0, device_target="Ascend", device_id=args_opt.device_id, + jit_config={"jit_level": "O2"}) elif target == "GPU": - context.set_context(mode=context.GRAPH_MODE, device_target="GPU") - context.set_context(enable_graph_kernel=True) + mindspore.set_context(mode=0, device_target="GPU") + mindspore.set_context(enable_graph_kernel=True) if bert_net_cfg.compute_type != mstype.float32: logger.warning('GPU only support fp32 temporarily, run with fp32.') bert_net_cfg.compute_type = mstype.float32 elif target == "CPU": if args_opt.use_pynative_mode: - context.set_context(mode=context.PYNATIVE_MODE, device_target="CPU", device_id=args_opt.device_id) + mindspore.set_context(mode=1, device_target="CPU", device_id=args_opt.device_id) else: - context.set_context(mode=context.GRAPH_MODE, device_target="CPU", device_id=args_opt.device_id) + mindspore.set_context(mode=0, device_target="CPU", device_id=args_opt.device_id) else: raise Exception("Target error, CPU or GPU or Ascend is supported.") diff --git a/official/nlp/Bert/run_ner.py b/official/nlp/Bert/run_ner.py index f020586f7ce0fdb18b5735378b338989627f53cc..ec7acf369b5e580839939496d41b176bd05c4723 100644 --- a/official/nlp/Bert/run_ner.py +++ b/official/nlp/Bert/run_ner.py @@ -20,9 +20,8 @@ Bert finetune and evaluation script. 
import os import time from tqdm import tqdm -import mindspore as ms +import mindspore import mindspore.common.dtype as mstype -from mindspore import context from mindspore import log as logger from mindspore.nn.wrap.loss_scale import DynamicLossScaleUpdateCell from mindspore.nn.optim import AdamWeightDecay, Lamb, Momentum @@ -80,7 +79,7 @@ def do_train(dataset=None, network=None, load_checkpoint_path="", save_checkpoin param_dict = load_checkpoint(load_checkpoint_path) load_param_into_net(network, param_dict) - if ms.get_context("device_target") == "CPU": + if mindspore.get_context("device_target") == "CPU": netwithgrads = BertFinetuneCellCPU(network, optimizer=optimizer) else: update_cell = DynamicLossScaleUpdateCell(loss_scale_value=2**32, scale_factor=2, scale_window=1000) @@ -125,7 +124,7 @@ def do_eval(dataset=None, network=None, use_crf="", with_lstm="", num_class=41, model = Model(net_for_pretraining) if assessment_method == "clue_benchmark": - if ms.get_context("device_target") == "CPU": + if mindspore.get_context("device_target") == "CPU": from src.cluener_evaluation_cpu import submit else: from src.cluener_evaluation import submit @@ -201,18 +200,19 @@ def run_ner(): load_finetune_checkpoint_path = args_opt.load_finetune_checkpoint_path target = args_opt.device_target if target == "Ascend": - context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=args_opt.device_id) + mindspore.set_context(mode=0, device_target="Ascend", device_id=args_opt.device_id, + jit_config={"jit_level": "O2"}) elif target == "GPU": - context.set_context(mode=context.GRAPH_MODE, device_target="GPU") - context.set_context(enable_graph_kernel=True) + mindspore.set_context(mode=0, device_target="GPU") + mindspore.set_context(enable_graph_kernel=True) if bert_net_cfg.compute_type != mstype.float32: logger.warning('GPU only support fp32 temporarily, run with fp32.') bert_net_cfg.compute_type = mstype.float32 elif target == "CPU": if args_opt.use_pynative_mode: - context.set_context(mode=context.PYNATIVE_MODE, device_target="CPU", device_id=args_opt.device_id) + mindspore.set_context(mode=1, device_target="CPU", device_id=args_opt.device_id) else: - context.set_context(mode=context.GRAPH_MODE, device_target="CPU", device_id=args_opt.device_id) + mindspore.set_context(mode=0, device_target="CPU", device_id=args_opt.device_id) else: raise Exception("Target error, CPU or GPU or Ascend is supported.") label_list = [] diff --git a/official/nlp/Bert/run_pretrain.py b/official/nlp/Bert/run_pretrain.py index 1d1ba0d259d53761567b3a1a46548c33bf00742d..6042d8d5c247c3a398b122708df6daf450a9cb7c 100644 --- a/official/nlp/Bert/run_pretrain.py +++ b/official/nlp/Bert/run_pretrain.py @@ -17,12 +17,12 @@ python run_pretrain.py """ import os +import mindspore import mindspore.communication.management as D from mindspore.communication.management import get_rank import mindspore.common.dtype as mstype -from mindspore import context from mindspore.train.model import Model -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.nn.wrap.loss_scale import DynamicLossScaleUpdateCell from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor from mindspore.train.serialization import load_checkpoint, load_param_into_net @@ -30,6 +30,7 @@ from mindspore.train.train_thor import ConvertModelUtils from mindspore.nn.optim import Lamb, Momentum, AdamWeightDecay, thor from mindspore import log as logger from mindspore.common import set_seed +from 
mindspore._c_expression import MSContext from src import BertNetworkWithLoss, BertNetworkMatchBucket, \ BertTrainOneStepCell, \ BertTrainOneStepWithLossScaleCell, \ @@ -48,25 +49,25 @@ _current_dir = os.path.dirname(os.path.realpath(__file__)) def _set_bert_all_reduce_split(): """set bert all_reduce fusion split, support num_hidden_layers is 12 and 24.""" - device_target = context.get_context('device_target') - enable_graph_kernel = context.get_context('enable_graph_kernel') - device_num = context.get_auto_parallel_context('device_num') + device_target = mindspore.get_context('device_target') + enable_graph_kernel = mindspore.get_context('enable_graph_kernel') + device_num = mindspore.get_auto_parallel_context('device_num') if bert_net_cfg.num_hidden_layers == 12: if bert_net_cfg.use_relative_positions: - context.set_auto_parallel_context(all_reduce_fusion_config=[29, 58, 87, 116, 145, 174, 203, 217]) + mindspore.set_auto_parallel_context(all_reduce_fusion_config=[29, 58, 87, 116, 145, 174, 203, 217]) else: - context.set_auto_parallel_context(all_reduce_fusion_config=[28, 55, 82, 109, 136, 163, 190, 205]) + mindspore.set_auto_parallel_context(all_reduce_fusion_config=[28, 55, 82, 109, 136, 163, 190, 205]) if device_target == 'GPU' and enable_graph_kernel and device_num == 8: - context.set_auto_parallel_context(all_reduce_fusion_config=[180, 205]) + mindspore.set_auto_parallel_context(all_reduce_fusion_config=[180, 205]) elif device_target == 'GPU' and enable_graph_kernel and device_num == 16: - context.set_auto_parallel_context(all_reduce_fusion_config=[120, 205]) + mindspore.set_auto_parallel_context(all_reduce_fusion_config=[120, 205]) elif bert_net_cfg.num_hidden_layers == 24: if bert_net_cfg.use_relative_positions: - context.set_auto_parallel_context(all_reduce_fusion_config=[30, 90, 150, 210, 270, 330, 390, 421]) + mindspore.set_auto_parallel_context(all_reduce_fusion_config=[30, 90, 150, 210, 270, 330, 390, 421]) else: - context.set_auto_parallel_context(all_reduce_fusion_config=[38, 93, 148, 203, 258, 313, 368, 397]) + mindspore.set_auto_parallel_context(all_reduce_fusion_config=[38, 93, 148, 203, 258, 313, 368, 397]) if device_target == 'Ascend' and enable_graph_kernel and device_num == 8: - context.set_auto_parallel_context(all_reduce_fusion_config=[ + mindspore.set_auto_parallel_context(all_reduce_fusion_config=[ 0, 1, 2, 3, 4, 5, 10, 15, 20, 25, 30, 35, 40, 50, 70, 93, 148, 203, 258, 313, 368, 397]) @@ -103,7 +104,7 @@ def _get_optimizer(args_opt, network): {'order_params': params}] if args_opt.enable_lossscale == "true" and args_opt.device_target == 'GPU': optimizer = AdamWeightDecayForBert(group_params, learning_rate=lr_schedule, eps=cfg.AdamWeightDecay.eps) - elif context.get_context("mode") == context.PYNATIVE_MODE and args_opt.device_target == 'GPU': + elif mindspore.get_context("mode") == 1 and args_opt.device_target == 'GPU': optimizer = AdamWeightDecayOp(group_params, learning_rate=lr_schedule, eps=cfg.AdamWeightDecay.eps) else: optimizer = AdamWeightDecay(group_params, learning_rate=lr_schedule, eps=cfg.AdamWeightDecay.eps) @@ -131,12 +132,12 @@ def _set_graph_kernel_context(device_target): """Add suitable graph kernel context for different configs.""" if device_target == 'GPU': if cfg.bert_network == 'base': - context.set_context(enable_graph_kernel=True, - graph_kernel_flags="--enable_stitch_fusion=true " - "--enable_parallel_fusion=true " - "--enable_cluster_ops=BatchMatMul") + mindspore.set_context(enable_graph_kernel=True, + 
graph_kernel_flags="--enable_stitch_fusion=true " + "--enable_parallel_fusion=true " + "--enable_cluster_ops=BatchMatMul") else: - context.set_context(enable_graph_kernel=True) + mindspore.set_context(enable_graph_kernel=True) else: logger.warning('Graph kernel only supports GPU back-end now, run with graph kernel off.') @@ -159,6 +160,14 @@ def modelarts_pre_process(): cfg.save_checkpoint_path = os.path.join(cfg.output_path, cfg.save_checkpoint_path) +def set_ascend_max_device_memory(config): + is_ascend910b_ge = mindspore.get_context("enable_ge") and mindspore.get_context("mode") == 0 and \ + MSContext.get_instance().get_ascend_soc_version() != 'ascend910' + if is_ascend910b_ge and hasattr(config, "max_device_memory"): + logger.warning("When device memory is insufficient on Ascend 910B (1980B), reduce max_device_memory.") + mindspore.set_context(max_device_memory=config.max_device_memory) + + def InitNetWithGrads(net_with_loss, optimizer): '''init net with grads''' if cfg.enable_lossscale == "true": @@ -196,14 +205,16 @@ def run_pretrain(): """pre-train bert_clue""" - context.set_context(mode=context.GRAPH_MODE, device_target=cfg.device_target, device_id=cfg.device_id) - context.set_context(reserve_class_name_in_scope=False) + mindspore.set_context(mode=0, device_target=cfg.device_target, device_id=cfg.device_id, + jit_config={"jit_level": "O2"}) + mindspore.set_context(reserve_class_name_in_scope=False) _set_graph_kernel_context(cfg.device_target) ckpt_save_dir = cfg.save_checkpoint_path rank = 0 device_num = 1 if cfg.distribute == "true": if cfg.device_target == 'Ascend': + set_ascend_max_device_memory(cfg) D.init() device_num = cfg.device_num rank = cfg.device_id % device_num @@ -213,9 +224,9 @@ def run_pretrain(): rank = D.get_rank() ckpt_save_dir = os.path.join(cfg.save_checkpoint_path, 'ckpt_' + str(get_rank())) - context.reset_auto_parallel_context() - context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, - device_num=device_num) + mindspore.reset_auto_parallel_context() + mindspore.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, + device_num=device_num) _set_bert_all_reduce_split() _check_compute_type(cfg) diff --git a/official/nlp/Bert/run_squad.py b/official/nlp/Bert/run_squad.py index 47f96eff2bb24af5ee0c6d0fe069c3aeb3ccf00e..b1cb2c22a7f6b4a615712ca77c461669afbab474 100644 --- a/official/nlp/Bert/run_squad.py +++ b/official/nlp/Bert/run_squad.py @@ -18,8 +18,8 @@ Bert finetune and evaluation script.
''' import os import collections +import mindspore import mindspore.common.dtype as mstype -from mindspore import context from mindspore import log as logger from mindspore.nn.wrap.loss_scale import DynamicLossScaleUpdateCell from mindspore.nn.optim import AdamWeightDecay, Lamb, Momentum @@ -158,18 +158,19 @@ def run_squad(): load_finetune_checkpoint_path = args_opt.load_finetune_checkpoint_path target = args_opt.device_target if target == "Ascend": - context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=args_opt.device_id) + mindspore.set_context(mode=0, device_target="Ascend", device_id=args_opt.device_id, + jit_config={"jit_level": "O2"}) elif target == "GPU": - context.set_context(mode=context.GRAPH_MODE, device_target="GPU") - context.set_context(enable_graph_kernel=True) + mindspore.set_context(mode=0, device_target="GPU") + mindspore.set_context(enable_graph_kernel=True) if bert_net_cfg.compute_type != mstype.float32: logger.warning('GPU only support fp32 temporarily, run with fp32.') bert_net_cfg.compute_type = mstype.float32 elif target == "CPU": if args_opt.use_pynative_mode: - context.set_context(mode=context.PYNATIVE_MODE, device_target="CPU", device_id=args_opt.device_id) + mindspore.set_context(mode=1, device_target="CPU", device_id=args_opt.device_id) else: - context.set_context(mode=context.GRAPH_MODE, device_target="CPU", device_id=args_opt.device_id) + mindspore.set_context(mode=0, device_target="CPU", device_id=args_opt.device_id) else: raise Exception("Target error, CPU or GPU or Ascend is supported.") diff --git a/official/nlp/Bert/scripts/run_distributed_pretrain_ascend_msrun.sh b/official/nlp/Bert/scripts/run_distributed_pretrain_ascend_msrun.sh new file mode 100644 index 0000000000000000000000000000000000000000..f55364d99618fb5b59fd06487057c5b0c237340f --- /dev/null +++ b/official/nlp/Bert/scripts/run_distributed_pretrain_ascend_msrun.sh @@ -0,0 +1,33 @@ +#!/bin/bash +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +echo "==============================================================================================================" +echo "Please run the script as: " +echo "bash run_distributed_pretrain_ascend_msrun.sh DATA_DIR" +echo "for example: bash run_distributed_pretrain_ascend_msrun.sh /path/dataset" +echo "It is better to use absolute path." +echo "==============================================================================================================" +export RANK_SIZE=8 +export DEPLOY_MODE=0 +export GE_USE_STATIC_MEMORY=1 +ulimit -s 302400 +cd .. 
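The msrun invocation that follows replaces per-process rank-table launching: the launcher spawns all eight workers itself and provides the rank environment, so run_pretrain.py only performs the usual data-parallel setup. A minimal sketch of that setup under such a launcher, covering the data-parallel path only:

```python
import mindspore
from mindspore import ParallelMode
from mindspore.communication.management import init, get_rank, get_group_size

def init_data_parallel():
    # The launcher exports the rank environment; init() reads it,
    # so no rank table file is passed explicitly.
    init()
    device_num = get_group_size()
    mindspore.reset_auto_parallel_context()
    mindspore.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL,
                                        gradients_mean=True,
                                        device_num=device_num)
    return get_rank(), device_num
```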
+msrun --bind_core=True --worker_num=8 --local_worker_num=8 \ + --master_port=8118 --log_dir=msrun_log --join=True --cluster_time_out=300 \ + run_pretrain.py --data_dir=$1 --distribute=true --epoch_size=40 \ + --enable_save_ckpt=true --do_shuffle=true --enable_data_sink=true \ + --data_sink_steps=100 --accumulation_steps=1 --allreduce_post_accumulation=true \ + --save_checkpoint_path=./ckpt --save_checkpoint_num=1 --config_path=../../pretrain_config.yaml &> log.txt & diff --git a/official/nlp/Bert/src/bert_for_pre_training.py b/official/nlp/Bert/src/bert_for_pre_training.py index 16ba3407b2225a5e73235fc8baf8e857383a51d4..0a7725d885c6761fa5e39aa870219c2fa8e43171 100644 --- a/official/nlp/Bert/src/bert_for_pre_training.py +++ b/official/nlp/Bert/src/bert_for_pre_training.py @@ -15,6 +15,7 @@ """Bert for pretraining.""" import numpy as np +import mindspore import mindspore.nn as nn from mindspore.common.initializer import initializer, TruncatedNormal from mindspore.ops import operations as P @@ -25,9 +26,10 @@ from mindspore.common.parameter import Parameter from mindspore.common.api import jit from mindspore.common import dtype as mstype from mindspore.nn.wrap.grad_reducer import DistributedGradReducer -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.communication.management import get_group_size -from mindspore import context, amp, ops +from mindspore import amp, ops +from mindspore._c_expression import MSContext from .bert_model import BertModel GRADIENT_CLIP_TYPE = 1 @@ -366,6 +368,7 @@ class BertTrainOneStepWithLossScaleCell(nn.TrainOneStepWithLossScaleCell): if scale_update_cell: self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32)) self.enable_tuple_broaden = True + self.ascend_910a_target = (MSContext.get_instance().get_ascend_soc_version() == 'ascend910') @jit def clip_grads(self, grads): @@ -402,11 +405,15 @@ class BertTrainOneStepWithLossScaleCell(nn.TrainOneStepWithLossScaleCell): masked_lm_positions, masked_lm_ids, masked_lm_weights, - self.cast(scaling_sens, - mstype.float32)) + self.cast(scaling_sens, mstype.float32)) # apply grad reducer on grads grads = self.grad_reducer(grads) - degree_sens = self.cast(scaling_sens * self.degree, mstype.float32) + if not self.ascend_910a_target and self.reducer_flag: + scaling_sens = self.cast(scaling_sens, mstype.float32) + scaling_sens = F.depend(scaling_sens, grads) + degree_sens = self.allreduce(scaling_sens) + else: + degree_sens = self.cast(scaling_sens * self.degree, mstype.float32) grads = self.hyper_map(F.partial(grad_scale, degree_sens), grads) grads = self.clip_grads(grads) @@ -565,7 +572,7 @@ class BertTrainAccumulationAllReducePostWithLossScaleCell(nn.Cell): self.grad = C.GradOperation(get_by_list=True, sens_param=True) self.reducer_flag = False - self.parallel_mode = context.get_auto_parallel_context("parallel_mode") + self.parallel_mode = mindspore.get_auto_parallel_context("parallel_mode") if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]: self.reducer_flag = True self.grad_reducer = F.identity @@ -697,7 +704,7 @@ class BertTrainAccumulationAllReduceEachWithLossScaleCell(nn.Cell): self.grad = C.GradOperation(get_by_list=True, sens_param=True) self.reducer_flag = False - self.parallel_mode = context.get_auto_parallel_context("parallel_mode") + self.parallel_mode = mindspore.get_auto_parallel_context("parallel_mode") if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]: 
self.reducer_flag = True self.grad_reducer = F.identity @@ -877,7 +884,7 @@ class BertPretrainEval(nn.Cell): self.cast = P.Cast() self.allreduce = P.AllReduce() self.reduce_flag = False - parallel_mode = context.get_auto_parallel_context("parallel_mode") + parallel_mode = mindspore.get_auto_parallel_context("parallel_mode") if parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]: self.reduce_flag = True diff --git a/official/nlp/Bert/src/finetune_eval_model.py b/official/nlp/Bert/src/finetune_eval_model.py index 7b0f62622ff724f85f6f25d40bf6975875270871..5690fbbd16861e01fbc9d6bb0d8967bf057ea3f0 100644 --- a/official/nlp/Bert/src/finetune_eval_model.py +++ b/official/nlp/Bert/src/finetune_eval_model.py @@ -16,10 +16,10 @@ ''' Bert finetune and evaluation model script. ''' +import mindspore import mindspore.nn as nn from mindspore.common.initializer import TruncatedNormal from mindspore.ops import operations as P -from mindspore import context from .bert_model import BertModel @@ -76,7 +76,7 @@ class BertSquadModel(nn.Cell): self.dtype = config.dtype self.log_softmax = P.LogSoftmax(axis=1) self.is_training = is_training - self.gpu_target = context.get_context("device_target") == "GPU" + self.gpu_target = mindspore.get_context("device_target") == "GPU" self.cast = P.Cast() self.reshape = P.Reshape() self.transpose = P.Transpose() diff --git a/official/nlp/Bert/src/model_utils/moxing_adapter.py b/official/nlp/Bert/src/model_utils/moxing_adapter.py index 09cb0f0cf0fb88ba809d5ba9a40432b644d789b3..a6d8a3fce9707a33120d15cb8043bf891f8c07b3 100644 --- a/official/nlp/Bert/src/model_utils/moxing_adapter.py +++ b/official/nlp/Bert/src/model_utils/moxing_adapter.py @@ -17,7 +17,7 @@ import os import functools -from mindspore import context +import mindspore from mindspore.profiler import Profiler from src.model_utils.config import config @@ -94,7 +94,7 @@ def moxing_wrapper(pre_process=None, post_process=None): sync_data(config.train_url, config.output_path) print("Workspace downloaded: ", os.listdir(config.output_path)) - context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) + mindspore.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) config.device_num = get_device_num() config.device_id = get_device_id() if not os.path.exists(config.output_path): diff --git a/official/nlp/Bert_thor/README.md b/official/nlp/Bert_thor/README.md index cd9178cd3d31b890d37e87edfbd4942315d9a44e..1280c635fd561b74e9cf848cc29aaac86e08771b 100644 --- a/official/nlp/Bert_thor/README.md +++ b/official/nlp/Bert_thor/README.md @@ -146,7 +146,8 @@ We need five parameters for this scripts. - `SCHEMA_DIR`:Schema path, it is better to use absolute path - `RANK_TABLE_FILE`: rank table file with JSON format -Training result will be stored in the current path, whose folder name begins with the file name that the user defines. Under this, you can find checkpoint file together with result like the followings in log. +Training result will be stored in the current path, whose folder name begins with the file name that the user defines. Under this, you can find checkpoint file together with result like the following +in log. ```shell ... @@ -192,7 +193,8 @@ We need two parameters in evaluation_config.py for this scripts. > checkpoint can be produced in training process. -Inference result will be stored in the example path, you can find result like the followings in log. 
+Inference result will be stored in the example path, you can find result like the following +in log. ```shell step: 1000 Accuracy: [0.27491578] diff --git a/official/nlp/Bert_thor/pretrain_eval.py b/official/nlp/Bert_thor/pretrain_eval.py index a4f824d7308e0572a80763522da8a11e310b67b9..5d2bcc97f2e2859dcba6426369e0bbd11c512b2d 100644 --- a/official/nlp/Bert_thor/pretrain_eval.py +++ b/official/nlp/Bert_thor/pretrain_eval.py @@ -22,11 +22,11 @@ import os from src import BertModel, GetMaskedLMOutput from src.evaluation_config import cfg, bert_net_cfg +import mindspore import mindspore.common.dtype as mstype import mindspore.dataset as de import mindspore.dataset.transforms as C import mindspore.nn as nn -from mindspore import context from mindspore.common.parameter import Parameter from mindspore.common.tensor import Tensor from mindspore.nn.metrics import Metric @@ -135,7 +135,7 @@ def bert_predict(): Predict function ''' devid = int(os.getenv('DEVICE_ID')) - context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=devid) + mindspore.set_context(mode=0, device_target="Ascend", device_id=devid) dataset = get_enwiki_512_dataset(cfg.batch_size, 1) net_for_pretraining = BertPretrainEva(bert_net_cfg) net_for_pretraining.set_train(False) diff --git a/official/nlp/Bert_thor/run_pretrain.py b/official/nlp/Bert_thor/run_pretrain.py index a91ce1fe110fac9792d075e637c69790d1a2549e..34f43e74d33edd5cd709849e85cdcb6220696e60 100644 --- a/official/nlp/Bert_thor/run_pretrain.py +++ b/official/nlp/Bert_thor/run_pretrain.py @@ -19,13 +19,13 @@ python run_pretrain.py import argparse import os +import mindspore import mindspore.common.dtype as mstype import mindspore.communication.management as D -from mindspore import context from mindspore import log as logger from mindspore.nn.wrap.loss_scale import DynamicLossScaleUpdateCell from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.train.serialization import load_checkpoint, load_param_into_net from mindspore.common import set_seed from mindspore.train.model import Model @@ -113,18 +113,18 @@ def run_pretrain(): parser.add_argument("--schema_dir", type=str, default="", help="Schema path, it is better to use absolute path") args_opt = parser.parse_args() - context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.device_target, + mindspore.set_context(mode=0, device_target=args_opt.device_target, device_id=args_opt.device_id, save_graphs=False) - context.set_context(reserve_class_name_in_scope=False) + mindspore.set_context(reserve_class_name_in_scope=False) ckpt_save_dir = args_opt.save_checkpoint_path if args_opt.distribute == "true": D.init() device_num = D.get_group_size() rank = D.get_rank() ckpt_save_dir = args_opt.save_checkpoint_path + 'ckpt_' + str(rank) + '/' - context.reset_auto_parallel_context() + mindspore.reset_auto_parallel_context() _set_bert_all_reduce_split() - context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, + mindspore.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, device_num=device_num) else: diff --git a/official/nlp/Bert_thor/src/bert_for_pre_training.py b/official/nlp/Bert_thor/src/bert_for_pre_training.py index ee9c81a67a2d3080dbeb76d88177efcc3a5aef90..07d0b1165e64e88475df0ee28430459ae46d3742 100644 --- a/official/nlp/Bert_thor/src/bert_for_pre_training.py +++ 
b/official/nlp/Bert_thor/src/bert_for_pre_training.py @@ -15,6 +15,7 @@ """Bert for pretraining.""" import numpy as np +import mindspore import mindspore.nn as nn from mindspore import amp, ops from mindspore.common.initializer import initializer, TruncatedNormal @@ -25,9 +26,8 @@ from mindspore.common.tensor import Tensor from mindspore.common.parameter import Parameter from mindspore.common import dtype as mstype from mindspore.nn.wrap.grad_reducer import DistributedGradReducer -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.communication.management import get_group_size -from mindspore import context from .bert_model import BertModel GRADIENT_CLIP_TYPE = 1 @@ -546,7 +546,7 @@ class BertTrainAccumulationAllReducePostWithLossScaleCell(nn.Cell): self.grad = C.GradOperation(get_by_list=True, sens_param=True) self.reducer_flag = False - self.parallel_mode = context.get_auto_parallel_context("parallel_mode") + self.parallel_mode = mindspore.get_auto_parallel_context("parallel_mode") if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]: self.reducer_flag = True self.grad_reducer = F.identity @@ -678,7 +678,7 @@ class BertTrainAccumulationAllReduceEachWithLossScaleCell(nn.Cell): self.grad = C.GradOperation(get_by_list=True, sens_param=True) self.reducer_flag = False - self.parallel_mode = context.get_auto_parallel_context("parallel_mode") + self.parallel_mode = mindspore.get_auto_parallel_context("parallel_mode") if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]: self.reducer_flag = True self.grad_reducer = F.identity diff --git a/official/nlp/GPT/eval.py b/official/nlp/GPT/eval.py index a66971c327f89667642f7f177ed53623d8682c8e..fb5ef805c02047a1dac666f5a397291ca87bf5f4 100644 --- a/official/nlp/GPT/eval.py +++ b/official/nlp/GPT/eval.py @@ -20,7 +20,7 @@ GPT evaluation script. 
import math import argparse import numpy as np -from mindspore import context +import mindspore import mindspore.common.dtype as mstype from mindspore.common.tensor import Tensor from mindspore.nn.transformer.loss import CrossEntropyLoss @@ -31,7 +31,7 @@ from src.dataset import create_dataset from src.gpt import GPT, EvalNet, GPTWithLoss from src.utils import GPTConfig -context.set_context(mode=context.GRAPH_MODE) +mindspore.set_context(mode=0) def ppl_score(probs, length, is_logsoftmax=True): """ calculate perplexity with prob or log_prob inputs """ diff --git a/official/nlp/GPT/src/gpt_wrapcell.py b/official/nlp/GPT/src/gpt_wrapcell.py index b8da50b7981ca684535491883403c119b33ea1fe..11eb6cb65399eec1ca5a044cf09a34f41d9a5466 100644 --- a/official/nlp/GPT/src/gpt_wrapcell.py +++ b/official/nlp/GPT/src/gpt_wrapcell.py @@ -14,13 +14,13 @@ # ============================================================================ """GPT training wrapper""" - +import mindspore import mindspore.nn as nn from mindspore.ops import operations as P from mindspore.ops import composite as C from mindspore.ops import functional as F -from mindspore import context, amp, ops -from mindspore.context import ParallelMode +from mindspore import amp, ops +from mindspore import ParallelMode from mindspore.nn.wrap.grad_reducer import DistributedGradReducer from mindspore.communication.management import get_group_size from mindspore.common.tensor import Tensor @@ -85,7 +85,7 @@ class GPTTrainOneStepWithLossScaleCell(nn.Cell): sens_param=True) self.reducer_flag = False self.allreduce = P.AllReduce() - self.parallel_mode = context.get_auto_parallel_context("parallel_mode") + self.parallel_mode = mindspore.get_auto_parallel_context("parallel_mode") if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]: self.reducer_flag = True self.grad_reducer = F.identity diff --git a/official/nlp/GPT/train.py b/official/nlp/GPT/train.py index cf80978739ac970a9fea17b9646b989b718a80b4..f22c514e134d3f2a1ae1289c260b0fb4ab4794b2 100644 --- a/official/nlp/GPT/train.py +++ b/official/nlp/GPT/train.py @@ -20,10 +20,10 @@ GPT train script import os import argparse -from mindspore import context +import mindspore from mindspore.train.model import Model import mindspore.communication.management as D -from mindspore.context import ParallelMode +from mindspore import ParallelMode import mindspore.nn as nn from mindspore.train.callback import TimeMonitor, LossMonitor, ModelCheckpoint, CheckpointConfig from mindspore.nn.wrap.loss_scale import DynamicLossScaleUpdateCell @@ -56,15 +56,15 @@ def run_train(): args_opt = parser.parse_args() device_id = int(os.getenv("DEVICE_ID", '0')) - context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=device_id) + mindspore.set_context(mode=0, device_target="Ascend", device_id=device_id) if args_opt.distribute == "true": D.init() device_num = args_opt.device_num rank = device_id % device_num print("device_id is {}, rank_id is {}".format(device_id, rank)) - context.reset_auto_parallel_context() - context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, + mindspore.reset_auto_parallel_context() + mindspore.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, device_num=device_num) else: diff --git a/official/nlp/LSTM/eval.py b/official/nlp/LSTM/eval.py index b0866c70f3f60b48dd0e3b9d36342ded21ee3e38..b5422a7610c0b6d0ba7c0dee318ef33dfcf31520 100644 --- a/official/nlp/LSTM/eval.py +++ 
b/official/nlp/LSTM/eval.py @@ -22,7 +22,8 @@ from src.model_utils.config import config from src.model_utils.moxing_adapter import moxing_wrapper from src.dataset import lstm_create_dataset, convert_to_mindrecord from src.lstm import SentimentNet -from mindspore import Tensor, nn, Model, context +import mindspore +from mindspore import Tensor, nn, Model from mindspore.nn import Accuracy, Recall, F1 from mindspore.train.serialization import load_checkpoint, load_param_into_net @@ -34,8 +35,8 @@ def eval_lstm(): """ eval lstm """ print('\neval.py config: \n', config) - context.set_context( - mode=context.GRAPH_MODE, + mindspore.set_context( + mode=0, save_graphs=False, device_target=config.device_target) diff --git a/official/nlp/LSTM/export.py b/official/nlp/LSTM/export.py index 8cd6128d881386f3e680272337da6411c8ab6aee..c29e1bb165cae32cc3ca2fc2fc9477a495f53a92 100644 --- a/official/nlp/LSTM/export.py +++ b/official/nlp/LSTM/export.py @@ -19,7 +19,8 @@ python export.py import os import numpy as np -from mindspore import Tensor, context +import mindspore +from mindspore import Tensor from mindspore import export, load_checkpoint, load_param_into_net from src.lstm import SentimentNet @@ -33,8 +34,8 @@ def modelarts_process(): @moxing_wrapper(pre_process=modelarts_process) def export_lstm(): """ export lstm """ - context.set_context( - mode=context.GRAPH_MODE, + mindspore.set_context( + mode=0, save_graphs=False, device_target=config.device_target, device_id=get_device_id()) diff --git a/official/nlp/LSTM/modelarts/data_process.py b/official/nlp/LSTM/modelarts/data_process.py index f359a770b6191c6088e1502db59a5bdb4a505740..534e6be327b38e915eaa6bf781940f9eddcf2862 100644 --- a/official/nlp/LSTM/modelarts/data_process.py +++ b/official/nlp/LSTM/modelarts/data_process.py @@ -11,8 +11,8 @@ import time import moxing as mox import numpy as np +import mindspore import mindspore.dataset as ds -from mindspore import context from src.model_utils.device_adapter import get_device_id, get_device_num, get_rank_id @@ -143,7 +143,7 @@ def download_data(): sync_data(config.train_url, config.output_path) print("Workspace downloaded: ", os.listdir(config.output_path)) - context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) + mindspore.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) config.device_num = get_device_num() config.device_id = get_device_id() # create output dir diff --git a/official/nlp/LSTM/modelarts/train_start.py b/official/nlp/LSTM/modelarts/train_start.py index 5e271f8ecaf6e3e0d6d51d2241747eaf08e3807d..a9aa47f4b5b76caaad09e830d8e8a781c5ea6c05 100644 --- a/official/nlp/LSTM/modelarts/train_start.py +++ b/official/nlp/LSTM/modelarts/train_start.py @@ -26,11 +26,12 @@ import time import moxing as mox import numpy as np +import mindspore import mindspore.nn as nn -from mindspore import Tensor, context, export +from mindspore import Tensor, export from mindspore.common import set_seed from mindspore.communication.management import init, get_rank -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.nn.metrics import Accuracy from mindspore.profiler import Profiler from mindspore.train import Model @@ -202,7 +203,7 @@ def download_data(): sync_data(config.train_url, config.output_path) print("Workspace downloaded: ", os.listdir(config.output_path)) - context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) + 
mindspore.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) config.device_num = get_device_num() config.device_id = get_device_id() # create output dir @@ -334,8 +335,8 @@ def train_lstm(): # set context device_target = config.device_target _enable_graph_kernel = config.enable_graph_kernel and device_target == "GPU" - context.set_context( - mode=context.GRAPH_MODE, + mindspore.set_context( + mode=0, save_graphs=False, enable_graph_kernel=_enable_graph_kernel, graph_kernel_flags="--enable_cluster_ops=MatMul", @@ -345,18 +346,18 @@ def train_lstm(): device_num = config.device_num rank = 0 if device_num > 1 or config.distribute: - context.reset_auto_parallel_context() - context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, + mindspore.reset_auto_parallel_context() + mindspore.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True) if device_target == "Ascend": - context.set_context(device_id=get_device_id()) + mindspore.set_context(device_id=get_device_id()) init() rank = get_rank() elif device_target == "GPU": init() else: - context.set_context(device_id=get_device_id()) + mindspore.set_context(device_id=get_device_id()) # dataset preprocess if config.preprocess == 'true': diff --git a/official/nlp/LSTM/src/model_utils/device_adapter.py b/official/nlp/LSTM/src/model_utils/device_adapter.py index 7c5d7f837ddaa8f53cf8dc5573cac0e36881e7b1..825c667a291403bd129078ad476d6fa4af5e1bcc 100644 --- a/official/nlp/LSTM/src/model_utils/device_adapter.py +++ b/official/nlp/LSTM/src/model_utils/device_adapter.py @@ -15,6 +15,7 @@ """Device adapter for ModelArts""" +import mindspore from .config import config if config.enable_modelarts: diff --git a/official/nlp/LSTM/src/model_utils/moxing_adapter.py b/official/nlp/LSTM/src/model_utils/moxing_adapter.py index 830d19a6fc99de8d602703971d5ac5b24e060d11..9c6d88e5d6e9831a21d75dc52fcc166584f8e61e 100644 --- a/official/nlp/LSTM/src/model_utils/moxing_adapter.py +++ b/official/nlp/LSTM/src/model_utils/moxing_adapter.py @@ -17,7 +17,7 @@ import os import functools -from mindspore import context +import mindspore from mindspore.profiler import Profiler from .config import config @@ -93,7 +93,7 @@ def moxing_wrapper(pre_process=None, post_process=None): sync_data(config.train_url, config.output_path) print("Workspace downloaded: ", os.listdir(config.output_path)) - context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) + mindspore.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) config.device_num = get_device_num() config.device_id = get_device_id() if not os.path.exists(config.output_path): diff --git a/official/nlp/LSTM/train.py b/official/nlp/LSTM/train.py index ca85cb94b4c6ca4b5125c480a2d5e8a6729c1b41..22f0b7215840a4c5343f6f69e2f656848d9e47fa 100644 --- a/official/nlp/LSTM/train.py +++ b/official/nlp/LSTM/train.py @@ -26,13 +26,14 @@ from src.eval_callback import EvalCallBack, apply_eval from src.lr_schedule import get_lr from src.lstm import SentimentNet -from mindspore import Tensor, nn, Model, context +import mindspore +from mindspore import Tensor, nn, Model from mindspore.common import set_seed from mindspore.nn import Accuracy from mindspore.train.callback import LossMonitor, CheckpointConfig, ModelCheckpoint, TimeMonitor from mindspore.train.serialization import load_param_into_net, load_checkpoint from mindspore.communication.management import init, 
get_rank -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.common import JitConfig set_seed(1) @@ -48,8 +49,8 @@ def train_lstm(): print('\ntrain.py config: \n', config) _enable_graph_kernel = config.enable_graph_kernel == "true" and config.device_target == "GPU" - context.set_context( - mode=context.GRAPH_MODE, + mindspore.set_context( + mode=0, save_graphs=False, enable_graph_kernel=_enable_graph_kernel, device_target=config.device_target) @@ -61,8 +62,8 @@ def train_lstm(): init() device_num = config.device_num # get_device_num() rank = get_rank() - context.reset_auto_parallel_context() - context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, \ + mindspore.reset_auto_parallel_context() + mindspore.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, \ device_num=device_num) if config.preprocess == "true": diff --git a/official/nlp/Pangu_alpha/predict.py b/official/nlp/Pangu_alpha/predict.py index d8098fa59ee52c3db907e6bb9448cda240191b2c..a5e6c8091f0184f8b7f99787563313f42c41abaa 100644 --- a/official/nlp/Pangu_alpha/predict.py +++ b/official/nlp/Pangu_alpha/predict.py @@ -22,11 +22,12 @@ import requests import numpy as np from tqdm import tqdm +import mindspore import mindspore.common.dtype as mstype import mindspore.communication.management as D -from mindspore import context, Tensor +from mindspore import Tensor from mindspore import export -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.parallel import set_algo_parameters from mindspore.parallel._cost_model_context import _set_multi_subgraphs from mindspore.train.model import Model @@ -51,15 +52,15 @@ def set_auto_parallel_context(args_opt): """Set the auto parallel context""" rank = 0 device_num = 1 - context.reset_auto_parallel_context() - context.set_auto_parallel_context( + mindspore.reset_auto_parallel_context() + mindspore.set_auto_parallel_context( strategy_ckpt_load_file=args_opt.strategy_load_ckpt_path) if args_opt.distribute == "true": D.init() device_num = D.get_group_size() rank = D.get_rank() print("rank_id is {}, device_num is {}".format(rank, device_num)) - context.set_auto_parallel_context( + mindspore.set_auto_parallel_context( parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL, gradients_mean=False, full_batch=True, @@ -76,10 +77,10 @@ def load_model(args_opt): The main function for load model """ # Set execution mode - context.set_context(save_graphs=False, - mode=context.GRAPH_MODE, + mindspore.set_context(save_graphs=False, + mode=0, device_target=args_opt.device_target) - context.set_context(max_device_memory="30GB") + mindspore.set_context(max_device_memory="30GB") # Set parallel context rank, device_num = set_auto_parallel_context(args_opt) diff --git a/official/nlp/Pangu_alpha/src/callbacks.py b/official/nlp/Pangu_alpha/src/callbacks.py index 448ef881170aef31365ed1072a80765fcdbf3e86..e1a3d99eef66cd81b2d0fb3f4a0011b6a9eb0a55 100644 --- a/official/nlp/Pangu_alpha/src/callbacks.py +++ b/official/nlp/Pangu_alpha/src/callbacks.py @@ -19,9 +19,9 @@ Callbacks import time import math import numpy as np +import mindspore from mindspore.train.callback import Callback -from mindspore import context -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.communication.management import get_rank class LossCallBack(Callback): @@ -80,9 +80,9 @@ class EvalCallBack(Callback): self.pplMetric = ppl_metric self.has_trained_step = 
has_trained_step self.pplMetric.clear() - self.parallel_mode = context.get_auto_parallel_context("parallel_mode") - self.strategy_ckpt_save_file = context.get_auto_parallel_context("strategy_ckpt_save_file") - self.strategy_ckpt_load_file = context.get_auto_parallel_context("strategy_ckpt_load_file") + self.parallel_mode = mindspore.get_auto_parallel_context("parallel_mode") + self.strategy_ckpt_save_file = mindspore.get_auto_parallel_context("strategy_ckpt_save_file") + self.strategy_ckpt_load_file = mindspore.get_auto_parallel_context("strategy_ckpt_load_file") def step_end(self, run_context): """ @@ -94,7 +94,7 @@ class EvalCallBack(Callback): return self.pplMetric.clear() if self.parallel_mode in (ParallelMode.SEMI_AUTO_PARALLEL, ParallelMode.AUTO_PARALLEL): - context.set_auto_parallel_context(strategy_ckpt_save_file="", + mindspore.set_auto_parallel_context(strategy_ckpt_save_file="", strategy_ckpt_load_file=self.strategy_ckpt_save_file) rank_id = 0 if self.parallel_mode in (ParallelMode.SEMI_AUTO_PARALLEL, @@ -109,5 +109,5 @@ class EvalCallBack(Callback): out_str = "{} == Rank: {} == EvalCallBack model.eval(): {}; eval_time: {}s". \ format(time_str, rank_id, out.values(), eval_time) print(out_str) - context.set_auto_parallel_context(strategy_ckpt_save_file=self.strategy_ckpt_save_file, + mindspore.set_auto_parallel_context(strategy_ckpt_save_file=self.strategy_ckpt_save_file, strategy_ckpt_load_file=self.strategy_ckpt_load_file) diff --git a/official/nlp/Pangu_alpha/src/dataset.py b/official/nlp/Pangu_alpha/src/dataset.py index e624b59b38e62ce1250501d88dcfe952f22a6d21..021995df12b411b3004b4bc1ab70aa8eb5143af4 100644 --- a/official/nlp/Pangu_alpha/src/dataset.py +++ b/official/nlp/Pangu_alpha/src/dataset.py @@ -18,11 +18,11 @@ Create dataset for training and evaluating import os import numpy as np +import mindspore import mindspore.dataset as ds import mindspore.dataset.transforms as C import mindspore.common.dtype as mstype -from mindspore import context def get_input_data_batch_slice_map(input_ids, eod_id, rank, dis, eod_reset): """ @@ -90,8 +90,8 @@ def create_dataset(batch_size, data_path, device_num=1, rank=0, drop=True, full_ # Control the size of data queue in the consideration of the memory ds.config.set_prefetch_size(1) - is_data_parallel = context.get_auto_parallel_context( - "parallel_mode") == context.ParallelMode.DATA_PARALLEL + is_data_parallel = mindspore.get_auto_parallel_context( + "parallel_mode") == mindspore.ParallelMode.DATA_PARALLEL # Get path for source data files home_path = os.path.join(os.getcwd(), data_path) diff --git a/official/nlp/Pangu_alpha/src/metrics.py b/official/nlp/Pangu_alpha/src/metrics.py index 4d9e8ca5e8cfb1899d54efbceaa7c7be1f6d6aaf..ff2f17a9723fb72dfa33fb27a5be386bbc82ea86 100644 --- a/official/nlp/Pangu_alpha/src/metrics.py +++ b/official/nlp/Pangu_alpha/src/metrics.py @@ -17,8 +17,8 @@ Eval metrics """ import math +import mindspore from mindspore.nn.metrics import Metric -from mindspore import context from mindspore.communication.management import get_rank, get_group_size class PPLMetric(Metric): @@ -30,7 +30,7 @@ class PPLMetric(Metric): super(PPLMetric, self).__init__() self.clear() self.data_length = data_length - pipeline_stages = context.get_auto_parallel_context("pipeline_stages") + pipeline_stages = mindspore.get_auto_parallel_context("pipeline_stages") per_stage_device_num = get_group_size() // pipeline_stages stage_id = get_rank() // per_stage_device_num self.is_last_stage = (stage_id == pipeline_stages - 1) diff --git 
a/official/nlp/Pangu_alpha/src/pangu_alpha.py b/official/nlp/Pangu_alpha/src/pangu_alpha.py index 00f594b3d9a199a9e1476e472ffbef7e72caf088..244dde1fd235ebcee83f28606997b0a76dd654a5 100644 --- a/official/nlp/Pangu_alpha/src/pangu_alpha.py +++ b/official/nlp/Pangu_alpha/src/pangu_alpha.py @@ -23,7 +23,6 @@ from mindspore import Tensor, Parameter from mindspore.ops import operations as P from mindspore.ops import functional as F from mindspore.nn import Cell -from mindspore.ops._tracefunc import trace from mindformers.modules.transformer import VocabEmbedding, TransformerEncoder, TransformerEncoderLayer, \ AttentionMask, MoEConfig @@ -306,7 +305,6 @@ class PanguAlpha_Model(Cell): self.load_embedding_from_ckpt(config.load_ckpt_path) self.run_type = config.run_type - @trace def construct_blocks(self, hidden_state, encoder_masks, init_reset, batch_valid_length): if self.blocks is not None: for i in range(self.num_layers - 1): @@ -404,7 +402,6 @@ class PanguAlphaModel(nn.Cell): parallel_config=copied_parallel_config) self.head.pipeline_stage = config.parallel_config.pipeline_stage - 1 self.backbone = PanguAlpha_Model(config) - self.backbone.embedding.word_embedding.embedding_table.add_pipeline_stage(self.head.pipeline_stage) def construct(self, input_ids, input_position, attention_mask, init_reset=True, batch_valid_length=None): diff --git a/official/nlp/Pangu_alpha/src/pangu_alpha_wrapcell.py b/official/nlp/Pangu_alpha/src/pangu_alpha_wrapcell.py index 72cdec353cfc8ad001908150d83d3b2654939b74..d9379b611fbfa1c373ffa88070457ffa87911057 100644 --- a/official/nlp/Pangu_alpha/src/pangu_alpha_wrapcell.py +++ b/official/nlp/Pangu_alpha/src/pangu_alpha_wrapcell.py @@ -14,7 +14,7 @@ # ============================================================================ """GPT training wrapper""" -import mindspore as ms +import mindspore import mindspore.nn as nn from mindspore.ops import operations as P from mindspore.ops import composite as C @@ -22,8 +22,8 @@ from mindspore.ops import functional as F from mindspore.common.tensor import Tensor import mindspore.common.dtype as mstype from mindspore.nn.wrap.loss_scale import TrainOneStepWithLossScaleCell -from mindspore import context, Parameter -from mindspore.context import ParallelMode +from mindspore import Parameter +from mindspore import ParallelMode from mindspore.nn.wrap.grad_reducer import DistributedGradReducer from mindspore.communication.management import get_group_size from mindspore.ops.operations.math_ops import NPUGetFloatStatusV2, NPUClearFloatStatusV2 @@ -189,7 +189,7 @@ class PanguAlphaTrainPipelineWithLossScaleCell(nn.Cell): sens_param=True) self.reducer_flag = False self.allreduce = P.AllReduce() - self.parallel_mode = context.get_auto_parallel_context("parallel_mode") + self.parallel_mode = mindspore.get_auto_parallel_context("parallel_mode") if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]: self.reducer_flag = True self.grad_reducer = F.identity @@ -237,7 +237,7 @@ class PanguAlphaTrainPipelineWithLossScaleCell(nn.Cell): else: scaling_sens = sens # alloc status and clear should be right before gradoperation - init = Tensor([0]*8, dtype=ms.int32) + init = Tensor([0]*8, dtype=mindspore.int32) status_clear = self.clear_before_grad(init) scaling_sens = F.depend(scaling_sens, status_clear) grads = self.grad(self.network, weights)(input_ids, diff --git a/official/nlp/Pangu_alpha/src/utils.py b/official/nlp/Pangu_alpha/src/utils.py index 
8b20c0428733bb458bd364648b04b94aab93bcc5..60fe0741b7c11cb0adbf803efc62fdd26260d71e 100644 --- a/official/nlp/Pangu_alpha/src/utils.py +++ b/official/nlp/Pangu_alpha/src/utils.py @@ -21,8 +21,8 @@ import os import time import hashlib import numpy as np +import mindspore import mindspore.nn as nn -from mindspore import context from mindspore.ops import operations as P from mindspore.ops import composite as C from mindspore.ops import functional as F @@ -30,7 +30,7 @@ import mindspore.common.dtype as mstype from mindspore.common.tensor import Tensor from mindspore.nn.learning_rate_schedule import LearningRateSchedule, PolynomialDecayLR, WarmUpLR, CosineDecayLR from mindspore.parallel._auto_parallel_context import auto_parallel_context -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.communication.management import get_rank, get_group_size, create_group from mindspore.nn import AdamWeightDecay from mindspore.common import Parameter, ParameterTuple @@ -145,8 +145,8 @@ class GlobalNorm(nn.Cell): def __init__(self, params, config): super(GlobalNorm, self).__init__() self.hyper_map = C.HyperMap() - self.is_pipeline = context.get_auto_parallel_context("pipeline_stages") > 1 - self.is_data_parallel = context.get_auto_parallel_context("parallel_mode") == ParallelMode.DATA_PARALLEL + self.is_pipeline = mindspore.get_auto_parallel_context("pipeline_stages") > 1 + self.is_data_parallel = mindspore.get_auto_parallel_context("parallel_mode") == ParallelMode.DATA_PARALLEL self.config = config self.group_size = 1 if self.is_data_parallel: @@ -154,7 +154,7 @@ class GlobalNorm(nn.Cell): else: self.merge_op = P.AllReduce() if self.is_pipeline: - if context.get_auto_parallel_context("enable_parallel_optimizer"): + if mindspore.get_auto_parallel_context("enable_parallel_optimizer"): self.group_size = get_group_size() // config.parallel_config.pipeline_stage else: self.group_size = config.parallel_config.model_parallel diff --git a/official/nlp/Pangu_alpha/train.py b/official/nlp/Pangu_alpha/train.py index d672a1cac36cd20d112ac9e662bc46b0d7eb42be..356c4a832cdf93c188b4929ab889b956aa1d961c 100644 --- a/official/nlp/Pangu_alpha/train.py +++ b/official/nlp/Pangu_alpha/train.py @@ -21,11 +21,10 @@ import json import glob import os import math - -from mindspore import context +import mindspore from mindspore.train.model import Model import mindspore.communication.management as D -from mindspore.context import ParallelMode +from mindspore import ParallelMode import mindspore.nn as nn from mindspore.train.callback import TimeMonitor from mindspore.nn.wrap.loss_scale import DynamicLossScaleUpdateCell @@ -107,14 +106,14 @@ def set_parallel_context(args_opt): device_num = D.get_group_size() rank = D.get_rank() print("rank_id is {}, device_num is {}".format(rank, device_num)) - context.reset_auto_parallel_context() - context.set_auto_parallel_context( + mindspore.reset_auto_parallel_context() + mindspore.set_auto_parallel_context( parallel_mode=args_opt.parallel_mode, gradients_mean=False, search_mode=args_opt.search_mode, full_batch=bool(args_opt.full_batch), strategy_ckpt_load_file=args_opt.strategy_load_ckpt_path, enable_parallel_optimizer=bool(args_opt.optimizer_shard), strategy_ckpt_save_file='strategy.ckpt', enable_alltoall=bool(args_opt.enable_alltoall)) set_algo_parameters(elementwise_op_strategy_follow=True) - if context.get_auto_parallel_context("parallel_mode") == ParallelMode.AUTO_PARALLEL: + if mindspore.get_auto_parallel_context("parallel_mode") == 
ParallelMode.AUTO_PARALLEL: set_algo_parameters(elementwise_op_strategy_follow=False, fully_use_devices=False) _set_multi_subgraphs() return rank, device_num @@ -137,9 +136,9 @@ def cal_model_property(args_opt, device_num): model_parallel_num = min(args_opt.op_level_model_parallel_num, device_num) data_parallel_num = int(device_num / model_parallel_num) batch_size = args_opt.per_batch_size * data_parallel_num - if (context.get_auto_parallel_context("parallel_mode") == ParallelMode.DATA_PARALLEL or - (context.get_auto_parallel_context("parallel_mode") == ParallelMode.AUTO_PARALLEL and - context.get_auto_parallel_context("search_mode") == "recursive_programming")): + if (mindspore.get_auto_parallel_context("parallel_mode") == ParallelMode.DATA_PARALLEL or + (mindspore.get_auto_parallel_context("parallel_mode") == ParallelMode.AUTO_PARALLEL and + mindspore.get_auto_parallel_context("search_mode") == "recursive_programming")): batch_size = args_opt.per_batch_size return model_parallel_num, data_parallel_num, batch_size @@ -147,16 +146,16 @@ def cal_model_property(args_opt, device_num): def run_train(args_opt): r"""The main training process.""" # Set execution mode - context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.device_target, max_device_memory="30GB") + mindspore.set_context(mode=0, device_target=args_opt.device_target, max_device_memory="30GB") # Set parallel context rank = 0 device_num = 1 if args_opt.distribute == "true": rank, device_num = set_parallel_context(args_opt) - context.set_context(save_graphs=False, save_graphs_path="./graphs_of_device_id_" + str(rank)) + mindspore.set_context(save_graphs=False, save_graphs_path="./graphs_of_device_id_" + str(rank)) if args_opt.parallel_mode == "data_parallel": # in avoid of the loop call depth - context.set_context(max_call_depth=10000) + mindspore.set_context(max_call_depth=10000) # env variable prepare group_info_file = os.getenv("GROUP_INFO_FILE") @@ -414,22 +413,22 @@ def set_pipeline_parallel_context(args_opt): device_num = D.get_group_size() rank_id = D.get_rank() print("rank_id is {}, device_num is {}".format(rank_id, device_num)) - context.reset_auto_parallel_context() - context.set_auto_parallel_context( + mindspore.reset_auto_parallel_context() + mindspore.set_auto_parallel_context( parallel_mode=args_opt.parallel_mode, gradients_mean=False, search_mode=args_opt.search_mode, full_batch=bool(args_opt.full_batch), loss_repeated_mean=True, device_num=device_num, enable_parallel_optimizer=bool(args_opt.optimizer_shard), pipeline_stages=args_opt.stage_num, enable_alltoall=bool(args_opt.enable_alltoall)) set_algo_parameters(elementwise_op_strategy_follow=True) - if context.get_auto_parallel_context("parallel_mode") == ParallelMode.AUTO_PARALLEL: + if mindspore.get_auto_parallel_context("parallel_mode") == ParallelMode.AUTO_PARALLEL: set_algo_parameters(elementwise_op_strategy_follow=False, fully_use_devices=False) _set_multi_subgraphs() return rank_id, device_num def cal_model_property_pipeline(args_opt, device_num): - is_auto_parallel = (context.get_auto_parallel_context("parallel_mode") == ParallelMode.AUTO_PARALLEL and - context.get_auto_parallel_context("search_mode") == "recursive_programming") + is_auto_parallel = (mindspore.get_auto_parallel_context("parallel_mode") == ParallelMode.AUTO_PARALLEL and + mindspore.get_auto_parallel_context("search_mode") == "recursive_programming") # in order to make sure data_parallel_num is always non-zero, set model_parallel_num to 1 model_parallel_num = 1 if is_auto_parallel 
else args_opt.op_level_model_parallel_num stage_device_num = int(device_num / args_opt.stage_num) @@ -443,8 +442,8 @@ def cal_model_property_pipeline(args_opt, device_num): def run_train_pipeline(args_opt): r"""The main training process in pipeline.""" - context.set_context(save_graphs=False, mode=context.GRAPH_MODE, device_target=args_opt.device_target) - context.set_context(max_device_memory="30GB") + mindspore.set_context(save_graphs=False, mode=0, device_target=args_opt.device_target) + mindspore.set_context(max_device_memory="30GB") rank_id = 0 device_num = 1 if args_opt.distribute == "true": @@ -500,7 +499,7 @@ def run_train_pipeline(args_opt): ds = create_dataset(config.batch_size * parallel_config.micro_batch_num * micro_batch_interleaved, data_path=cache_url, device_num=stage_device_num, rank=rank_id % stage_device_num, eod_reset=True, data_start_index=0, - full_batch=context.get_auto_parallel_context("full_batch"), + full_batch=mindspore.get_auto_parallel_context("full_batch"), column_name=args_opt.data_column_name) epoch_num = args_opt.epoch_size step_per_epoch = ds.get_dataset_size() @@ -552,7 +551,7 @@ if __name__ == "__main__": raise ValueError("The alltoall communication is only effective when applying moe") os.environ['HCCL_CONNECT_TIMEOUT'] = str(opt.hccl_connect_time) if opt.atomic_clean_policy == 1: - context.set_context(ascend_config={"atomic_clean_policy": 1}) + mindspore.set_context(ascend_config={"atomic_clean_policy": 1}) if opt.stage_num > 1: run_train_pipeline(opt) diff --git a/official/nlp/Transformer/eval.py b/official/nlp/Transformer/eval.py index e3e6f367f7f688c9b9d3e9c97d8117ed455054d9..3c695a80ec4749b0af83bf5d6b82b7addbda562a 100644 --- a/official/nlp/Transformer/eval.py +++ b/official/nlp/Transformer/eval.py @@ -17,7 +17,7 @@ import os import numpy as np -import mindspore as ms +import mindspore import mindspore.nn as nn from mindspore.common.parameter import Parameter from mindspore.common.tensor import Tensor @@ -30,8 +30,8 @@ from src.model_utils.config import config from src.model_utils.moxing_adapter import moxing_wrapper from src.model_utils.device_adapter import get_device_id -config.dtype = ms.float32 -config.compute_type = ms.float16 +config.dtype = mindspore.float32 +config.compute_type = mindspore.float16 config.batch_size = config.batch_size_ev config.hidden_dropout_prob = config.hidden_dropout_prob_ev config.attention_probs_dropout_prob = config.attention_probs_dropout_prob_ev @@ -45,7 +45,7 @@ def load_test_data(batch_size=1, data_file=None): "target_sos_ids", "target_sos_mask", "target_eos_ids", "target_eos_mask"], shuffle=False) - type_cast_op = deC.TypeCast(ms.int32) + type_cast_op = deC.TypeCast(mindspore.int32) data_set = data_set.map(operations=type_cast_op, input_columns="source_eos_ids") data_set = data_set.map(operations=type_cast_op, input_columns="source_eos_mask") data_set = data_set.map(operations=type_cast_op, input_columns="target_sos_ids") @@ -81,7 +81,7 @@ def load_weights(model_path): ms_ckpt = np.load(model_path) is_npz = True else: - ms_ckpt = ms.load_checkpoint(model_path) + ms_ckpt = mindspore.load_checkpoint(model_path) is_npz = False weights = {} @@ -111,14 +111,14 @@ def run_transformer_eval(): """ Transformer evaluation. 
""" - ms.set_context(mode=ms.GRAPH_MODE, device_target=config.device_target, reserve_class_name_in_scope=False, + mindspore.set_context(mode=0, device_target=config.device_target, reserve_class_name_in_scope=False, device_id=get_device_id()) dataset = load_test_data(batch_size=config.batch_size, data_file=config.data_file) tfm_model = TransformerModel(config=config, is_training=False, use_one_hot_embeddings=False) parameter_dict = load_weights(config.model_file) - ms.load_param_into_net(tfm_model, parameter_dict) + mindspore.load_param_into_net(tfm_model, parameter_dict) tfm_infer = TransformerInferCell(tfm_model) model = Model(tfm_infer) @@ -129,8 +129,8 @@ def run_transformer_eval(): for batch in dataset.create_dict_iterator(output_numpy=True, num_epochs=1): source_sents.append(batch["source_eos_ids"]) target_sents.append(batch["target_eos_ids"]) - source_ids = Tensor(batch["source_eos_ids"], ms.int32) - source_mask = Tensor(batch["source_eos_mask"], ms.int32) + source_ids = Tensor(batch["source_eos_ids"], mindspore.int32) + source_mask = Tensor(batch["source_eos_mask"], mindspore.int32) predicted_ids = model.predict(source_ids, source_mask) predictions.append(predicted_ids.asnumpy()) diff --git a/official/nlp/Transformer/eval_onnx.py b/official/nlp/Transformer/eval_onnx.py index dd650fde7b8621cb0a46613dafe2a85bdd1bad28..c5639ce3c6e3a236348df9f6ce75ffda7d1b2b55 100644 --- a/official/nlp/Transformer/eval_onnx.py +++ b/official/nlp/Transformer/eval_onnx.py @@ -16,7 +16,7 @@ import os -import mindspore as ms +import mindspore import onnxruntime as ort from eval import load_test_data @@ -79,8 +79,8 @@ def run_transformer_eval(): def main(): """Main function""" - config.dtype = ms.float32 - config.compute_type = ms.float16 + config.dtype = mindspore.float32 + config.compute_type = mindspore.float16 config.batch_size = config.batch_size_ev config.hidden_dropout_prob = config.hidden_dropout_prob_ev config.attention_probs_dropout_prob = config.attention_probs_dropout_prob_ev diff --git a/official/nlp/Transformer/export.py b/official/nlp/Transformer/export.py index f76fea6ed7d48bef04fde2dc8c2ad24fee217dff..5cd3e499130777e53e877274302d42d9dd911202 100644 --- a/official/nlp/Transformer/export.py +++ b/official/nlp/Transformer/export.py @@ -16,7 +16,7 @@ import numpy as np -import mindspore as ms +import mindspore from mindspore import Tensor from src.transformer_model import TransformerModel @@ -30,9 +30,9 @@ config.batch_size = config.batch_size_ev config.hidden_dropout_prob = config.hidden_dropout_prob_ev config.attention_probs_dropout_prob = config.attention_probs_dropout_prob_ev -ms.set_context(mode=ms.GRAPH_MODE, device_target=config.device_target) +mindspore.set_context(mode=0, device_target=config.device_target) if config.device_target == "Ascend": - ms.set_context(device_id=get_device_id()) + mindspore.set_context(device_id=get_device_id()) def modelarts_pre_process(): pass @@ -43,12 +43,12 @@ def export_transformer(): tfm_model = TransformerModel(config=config, is_training=False, use_one_hot_embeddings=False) parameter_dict = load_weights(config.model_file) - ms.load_param_into_net(tfm_model, parameter_dict) + mindspore.load_param_into_net(tfm_model, parameter_dict) source_ids = Tensor(np.ones((config.batch_size, config.seq_length)).astype(np.int32)) source_mask = Tensor(np.ones((config.batch_size, config.seq_length)).astype(np.int32)) - ms.export(tfm_model, source_ids, source_mask, file_name=config.file_name, file_format=config.file_format) + mindspore.export(tfm_model, source_ids, 
source_mask, file_name=config.file_name, file_format=config.file_format) if __name__ == '__main__': export_transformer() diff --git a/official/nlp/Transformer/mindspore_hub_conf.py b/official/nlp/Transformer/mindspore_hub_conf.py index cf984efc4b5b26780832f2b0290d30ca7fa976c9..98574ae6e833ad65f46055080d5f19a9c0b01ec0 100644 --- a/official/nlp/Transformer/mindspore_hub_conf.py +++ b/official/nlp/Transformer/mindspore_hub_conf.py @@ -17,7 +17,7 @@ Transformer hub interface for transformer large ''' from src.transformer_model import TransformerModel from src.transformer_model import TransformerConfig -import mindspore as ms +import mindspore transformer_net_cfg_large = TransformerConfig( batch_size=96, seq_length=128, @@ -32,8 +32,8 @@ transformer_net_cfg_large = TransformerConfig( max_position_embeddings=128, initializer_range=0.02, label_smoothing=0.1, - dtype=ms.float32, - compute_type=ms.float16 + dtype=mindspore.float32, + compute_type=mindspore.float16 ) def create_network(name, *args, **kwargs): ''' diff --git a/official/nlp/Transformer/modelarts/train_modelarts.py b/official/nlp/Transformer/modelarts/train_modelarts.py index 2292379737cfd2c9e9262bed827db703ed658a6e..9c80131332e25fb79bf525e80a154766967f8020 100644 --- a/official/nlp/Transformer/modelarts/train_modelarts.py +++ b/official/nlp/Transformer/modelarts/train_modelarts.py @@ -20,8 +20,9 @@ import time import ast import numpy as np from easydict import EasyDict as edict +import mindspore import mindspore.common.dtype as mstype -from mindspore import Tensor, context +from mindspore import Tensor from mindspore.nn.optim import Adam from mindspore.train.model import Model from mindspore.train.loss_scale_manager import DynamicLossScaleManager @@ -30,7 +31,7 @@ from mindspore.train.callback import Callback, TimeMonitor from mindspore.train.serialization import load_checkpoint, load_param_into_net, export import mindspore.communication.management as D from mindspore.communication.management import get_rank -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.common import set_seed from src.transformer_model import TransformerModel from src.transformer_for_train import TransformerTrainOneStepCell, TransformerNetworkWithLoss, \ @@ -137,16 +138,16 @@ def run_transformer_train(): Transformer training. 
""" if config.device_target == "Ascend": - context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, device_id=get_device_id()) + mindspore.set_context(mode=0, device_target=config.device_target, device_id=get_device_id()) else: - context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target) - context.set_context(reserve_class_name_in_scope=False) + mindspore.set_context(mode=0, device_target=config.device_target) + mindspore.set_context(reserve_class_name_in_scope=False) # Set mempool block size in PYNATIVE_MODE for improving memory utilization, which will not take effect in GRAPH_MODE if config.device_target == "GPU": # Enable graph kernel - context.set_context(enable_graph_kernel=True, graph_kernel_flags="--enable_parallel_fusion") + mindspore.set_context(enable_graph_kernel=True, graph_kernel_flags="--enable_parallel_fusion") if config.distribute == "true": if config.device_target == "Ascend": device_num = config.device_num @@ -156,8 +157,8 @@ def run_transformer_train(): device_num = D.get_group_size() rank = get_rank() config.device_id = rank - context.reset_auto_parallel_context() - context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, + mindspore.reset_auto_parallel_context() + mindspore.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, device_num=device_num) rank_id = config.device_id % device_num save_ckpt_path = os.path.join(config.save_checkpoint_path, 'ckpt_' + str(get_rank()) + '/') diff --git a/official/nlp/Transformer/src/beam_search.py b/official/nlp/Transformer/src/beam_search.py index 09e2f6e9a9faa9254a57eca892246c8191b3b013..7d32e4c084294166b88b2bf7173202a74d63686e 100644 --- a/official/nlp/Transformer/src/beam_search.py +++ b/official/nlp/Transformer/src/beam_search.py @@ -15,7 +15,7 @@ """Transformer beam search module.""" import numpy as np -import mindspore as ms +import mindspore import mindspore.ops as ops import mindspore.nn as nn from mindspore.common.tensor import Tensor @@ -28,22 +28,22 @@ class LengthPenalty(nn.Cell): Args: weight (float): Weight of length penalty. Default: 1.0. - compute_type (:class:`mindspore.dtype`): Compute type in Transformer. Default: ms.float32. + compute_type (:class:`mindspore.dtype`): Compute type in Transformer. Default: mindspore.float32. """ def __init__(self, weight=1.0, - compute_type=ms.float32): + compute_type=mindspore.float32): super(LengthPenalty, self).__init__() self.weight = weight self.add = ops.Add() self.pow = ops.Pow() self.div = ops.RealDiv() self.cast = ops.Cast() - self.five = Tensor(5.0, ms.float32) - self.six = Tensor(6.0, ms.float32) + self.five = Tensor(5.0, mindspore.float32) + self.six = Tensor(6.0, mindspore.float32) def construct(self, length_tensor): - length_tensor = self.cast(length_tensor, ms.float32) + length_tensor = self.cast(length_tensor, mindspore.float32) output = self.add(length_tensor, self.five) output = self.div(output, self.six) output = self.pow(output, self.weight) @@ -56,11 +56,11 @@ class TileBeam(nn.Cell): Args: beam_width (int): beam width setting. Default: 4. - compute_type (:class:`mindspore.dtype`): Compute type in Transformer. Default: ms.float32. + compute_type (:class:`mindspore.dtype`): Compute type in Transformer. Default: mindspore.float32. 
""" def __init__(self, beam_width, - compute_type=ms.float32): + compute_type=mindspore.float32): super(TileBeam, self).__init__() self.beam_width = beam_width self.expand = ops.ExpandDims() @@ -89,10 +89,10 @@ class Mod(nn.Cell): Mod function. Args: - compute_type (:class:`mindspore.dtype`): Compute type in Transformer. Default: ms.float32. + compute_type (:class:`mindspore.dtype`): Compute type in Transformer. Default: mindspore.float32. """ def __init__(self, - compute_type=ms.float32): + compute_type=mindspore.float32): super(Mod, self).__init__() self.compute_type = compute_type self.floor_div = ops.FloorDiv() @@ -120,7 +120,7 @@ class BeamSearchDecoder(nn.Cell): max_decode_length (int): max decode length. Default: 128. sos_id (int): Id of sequence start token. Default: 1. eos_id (int): Id of sequence end token. Default: 2. - compute_type (:class:`mindspore.dtype`): Compute type in Transformer. Default: ms.float32. + compute_type (:class:`mindspore.dtype`): Compute type in Transformer. Default: mindspore.float32. """ def __init__(self, batch_size, @@ -132,7 +132,7 @@ class BeamSearchDecoder(nn.Cell): max_decode_length=128, sos_id=1, eos_id=2, - compute_type=ms.float32): + compute_type=mindspore.float32): super(BeamSearchDecoder, self).__init__(auto_prefix=False) self.seq_length = seq_length self.batch_size = batch_size @@ -148,23 +148,23 @@ class BeamSearchDecoder(nn.Cell): self.shape_flat = (-1,) self.shape = ops.Shape() - self.zero_tensor = Tensor(np.zeros([batch_size, beam_width]), ms.float32) - self.ninf_tensor = Tensor(np.full([batch_size, beam_width], -INF), ms.float32) + self.zero_tensor = Tensor(np.zeros([batch_size, beam_width]), mindspore.float32) + self.ninf_tensor = Tensor(np.full([batch_size, beam_width], -INF), mindspore.float32) self.select = ops.Select() self.flat_shape = (batch_size, beam_width * vocab_size) self.topk = ops.TopK(sorted=True) self.floor_div = ops.FloorDiv() - self.vocab_size_tensor = Tensor(self.vocab_size, ms.int32) + self.vocab_size_tensor = Tensor(self.vocab_size, mindspore.int32) self.real_div = ops.RealDiv() self.mod = Mod() self.equal = ops.Equal() - self.eos_ids = Tensor(np.full([batch_size, beam_width], eos_id), ms.int32) + self.eos_ids = Tensor(np.full([batch_size, beam_width], eos_id), mindspore.int32) beam_ids = np.tile(np.arange(beam_width).reshape((1, beam_width)), [batch_size, 1]) - self.beam_ids = Tensor(beam_ids, ms.int32) + self.beam_ids = Tensor(beam_ids, mindspore.int32) batch_ids = np.arange(batch_size*beam_width).reshape((batch_size, beam_width)) // beam_width - self.batch_ids = Tensor(batch_ids, ms.int32) + self.batch_ids = Tensor(batch_ids, mindspore.int32) self.concat = ops.Concat(axis=-1) self.gather_nd = ops.GatherNd() @@ -174,14 +174,14 @@ class BeamSearchDecoder(nn.Cell): self.zeroslike = ops.ZerosLike() # init inputs and states - self.start_ids = Tensor(np.full([batch_size * beam_width, 1], sos_id), ms.int32) - self.init_seq = Tensor(np.full([batch_size, beam_width, 1], sos_id), ms.int32) + self.start_ids = Tensor(np.full([batch_size * beam_width, 1], sos_id), mindspore.int32) + self.init_seq = Tensor(np.full([batch_size, beam_width, 1], sos_id), mindspore.int32) init_scores = np.tile(np.array([[0.] 
+ [-INF]*(beam_width-1)]), [batch_size, 1])
-        self.init_scores = Tensor(init_scores, ms.float32)
+        self.init_scores = Tensor(init_scores, mindspore.float32)
         self.init_finished = Tensor(np.zeros([batch_size, beam_width], dtype=np.bool_))
         self.init_length = Tensor(np.zeros([batch_size, beam_width], dtype=np.int32))
         self.length_penalty = LengthPenalty(weight=length_penalty_weight)
-        self.one = Tensor(1, ms.int32)
+        self.one = Tensor(1, mindspore.int32)
 
     def one_step(self, cur_input_ids, enc_states, enc_attention_mask, state_log_probs,
                  state_seq, state_finished, state_length):
@@ -207,7 +207,7 @@ class BeamSearchDecoder(nn.Cell):
         beam_indices = self.zeroslike(topk_indices)
         for _ in range(self.beam_width - 1):
             temp = self.sub(temp, self.vocab_size_tensor)
-            res = self.cast(self.greater_equal(temp, 0), ms.int32)
+            res = self.cast(self.greater_equal(temp, 0), mindspore.int32)
             beam_indices = beam_indices + res
         word_indices = topk_indices - beam_indices * self.vocab_size_tensor
         #======================================================================
diff --git a/official/nlp/Transformer/src/dataset.py b/official/nlp/Transformer/src/dataset.py
index 4728db94e0737ead8e0b69107864b167db4ab11c..59300331ae9ffe2cf9c5ba967ca8f7ef1eb67cde 100644
--- a/official/nlp/Transformer/src/dataset.py
+++ b/official/nlp/Transformer/src/dataset.py
@@ -14,7 +14,7 @@
 # ============================================================================
 """Data operations, will be used in train.py."""
 
-import mindspore as ms
+import mindspore
 import mindspore.dataset as de
 
 from .model_utils.config import config
@@ -33,7 +33,7 @@ def create_transformer_dataset(rank_size=1, rank_id=0, do_shuffle="true", datase
                              "target_sos_ids", "target_sos_mask",
                              "target_eos_ids", "target_eos_mask"],
                             shuffle=(do_shuffle == "true"), num_shards=rank_size, shard_id=rank_id)
-    type_cast_op = de.transforms.transforms.TypeCast(ms.int32)
+    type_cast_op = de.transforms.transforms.TypeCast(mindspore.int32)
     ds = ds.map(operations=type_cast_op, input_columns="source_eos_ids")
     ds = ds.map(operations=type_cast_op, input_columns="source_eos_mask")
     ds = ds.map(operations=type_cast_op, input_columns="target_sos_ids")
diff --git a/official/nlp/Transformer/src/model_utils/device_adapter.py b/official/nlp/Transformer/src/model_utils/device_adapter.py
index 7c5d7f837ddaa8f53cf8dc5573cac0e36881e7b1..825c667a291403bd129078ad476d6fa4af5e1bcc 100644
--- a/official/nlp/Transformer/src/model_utils/device_adapter.py
+++ b/official/nlp/Transformer/src/model_utils/device_adapter.py
@@ -15,6 +15,7 @@
 
 """Device adapter for ModelArts"""
 
+import mindspore
 from .config import config
 
 if config.enable_modelarts:
diff --git a/official/nlp/Transformer/src/model_utils/moxing_adapter.py b/official/nlp/Transformer/src/model_utils/moxing_adapter.py
index a35a2590061811e000a1b0bedb79703c2214cb43..636fc02c15428c786070eb4919c196ba49b3818c 100644
--- a/official/nlp/Transformer/src/model_utils/moxing_adapter.py
+++ b/official/nlp/Transformer/src/model_utils/moxing_adapter.py
@@ -17,7 +17,7 @@
 import os
 import functools
 
-import mindspore as ms
+import mindspore
 from mindspore.profiler import Profiler
 
 from .config import config
@@ -93,7 +93,7 @@ def moxing_wrapper(pre_process=None, post_process=None):
             sync_data(config.train_url, config.output_path)
             print("Workspace downloaded: ", os.listdir(config.output_path))
 
-            ms.set_context(save_graphs_path=os.path.join(config.output_path,
+            mindspore.set_context(save_graphs_path=os.path.join(config.output_path,
str(get_rank_id()))) config.device_num = get_device_num() config.device_id = get_device_id() if not os.path.exists(config.output_path): diff --git a/official/nlp/Transformer/src/transformer_for_train.py b/official/nlp/Transformer/src/transformer_for_train.py index a3019301f19e31c5c97e41e01dcfdeab286ae8ef..6647350aa01206cda1bb7ed2e870b4c7deca3f36 100644 --- a/official/nlp/Transformer/src/transformer_for_train.py +++ b/official/nlp/Transformer/src/transformer_for_train.py @@ -18,14 +18,14 @@ from mindspore import jit from mindspore.common.initializer import initializer from mindspore import amp -import mindspore as ms +import mindspore import mindspore.ops as ops import mindspore.nn as nn from mindspore.common.tensor import Tensor from mindspore.common.parameter import Parameter from mindspore.nn.wrap.grad_reducer import DistributedGradReducer from mindspore.communication.management import get_group_size -from mindspore.context import ParallelMode +from mindspore import ParallelMode from .transformer_model import TransformerModel @@ -73,8 +73,8 @@ class TransformerTrainingLoss(nn.Cell): super(TransformerTrainingLoss, self).__init__(auto_prefix=False) self.vocab_size = config.vocab_size self.onehot = ops.OneHot() - self.on_value = Tensor(float(1 - config.label_smoothing), ms.float32) - self.off_value = Tensor(config.label_smoothing / float(self.vocab_size - 1), ms.float32) + self.on_value = Tensor(float(1 - config.label_smoothing), mindspore.float32) + self.off_value = Tensor(config.label_smoothing / float(self.vocab_size - 1), mindspore.float32) self.reduce_sum = ops.ReduceSum() self.reduce_mean = ops.ReduceMean() self.reshape = ops.Reshape() @@ -88,13 +88,13 @@ class TransformerTrainingLoss(nn.Cell): """Defines the computation performed.""" flat_shape = (self.batch_size * seq_length,) label_ids = self.reshape(label_ids, flat_shape) - label_weights = self.cast(self.reshape(label_weights, flat_shape), ms.float32) + label_weights = self.cast(self.reshape(label_weights, flat_shape), mindspore.float32) one_hot_labels = self.onehot(label_ids, self.vocab_size, self.on_value, self.off_value) per_example_loss = self.neg(self.reduce_sum(prediction_scores * one_hot_labels, self.last_idx)) numerator = self.reduce_sum(label_weights * per_example_loss, ()) denominator = self.reduce_sum(label_weights, ()) + \ - self.cast(ops.tuple_to_array((1e-5,)), ms.float32) + self.cast(ops.tuple_to_array((1e-5,)), mindspore.float32) loss = numerator / denominator return loss @@ -129,7 +129,7 @@ class TransformerNetworkWithLoss(nn.Cell): prediction_scores = self.transformer(source_ids, source_mask, target_ids, target_mask) seq_length = self.shape(source_ids)[1] total_loss = self.loss(prediction_scores, label_ids, label_weights, seq_length) - return self.cast(total_loss, ms.float32) + return self.cast(total_loss, mindspore.float32) class TransformerTrainOneStepCell(nn.TrainOneStepCell): @@ -188,7 +188,7 @@ class TransformerTrainOneStepCell(nn.TrainOneStepCell): label_ids, label_weights, self.cast(ops.tuple_to_array((self.sens,)), - ms.float32)) + mindspore.float32)) grads = self.clip_grads(grads) # apply grad reducer on grads grads = self.grad_reducer(grads) @@ -234,7 +234,7 @@ class TransformerTrainOneStepWithLossScaleCell(nn.TrainOneStepWithLossScaleCell) self.loss_scale = None self.loss_scaling_manager = scale_update_cell if scale_update_cell: - self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=ms.float32)) + self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), 
dtype=mindspore.float32)) self.enable_tuple_broaden = True @jit @@ -282,7 +282,7 @@ class TransformerTrainOneStepWithLossScaleCell(nn.TrainOneStepWithLossScaleCell) label_ids, label_weights, self.cast(scaling_sens, - ms.float32)) + mindspore.float32)) # apply grad reducer on grads grads = self.grad_reducer(grads) @@ -304,21 +304,21 @@ add_grads = ops.MultitypeFuncGraph("add_grads") @add_grads.register("Tensor", "Tensor") def _add_grads(accu_grad, grad): - return accu_grad + cast(grad, ms.float32) + return accu_grad + cast(grad, mindspore.float32) update_accu_grads = ops.MultitypeFuncGraph("update_accu_grads") @update_accu_grads.register("Tensor", "Tensor") def _update_accu_grads(accu_grad, grad): succ = True - return ops.depend(succ, ops.assign(accu_grad, cast(grad, ms.float32))) + return ops.depend(succ, ops.assign(accu_grad, cast(grad, mindspore.float32))) accumulate_accu_grads = ops.MultitypeFuncGraph("accumulate_accu_grads") @accumulate_accu_grads.register("Tensor", "Tensor") def _accumulate_accu_grads(accu_grad, grad): succ = True - return ops.depend(succ, ops.assign_add(accu_grad, cast(grad, ms.float32))) + return ops.depend(succ, ops.assign_add(accu_grad, cast(grad, mindspore.float32))) zeroslike = ops.ZerosLike() @@ -361,14 +361,14 @@ class TransformerTrainAccumulationAllReducePostWithLossScaleCell(nn.Cell): self.enable_global_norm = enable_global_norm self.one = Tensor(np.array([1]).astype(np.int32)) self.zero = Tensor(np.array([0]).astype(np.int32)) - self.local_step = Parameter(initializer(0, [1], ms.int32)) + self.local_step = Parameter(initializer(0, [1], mindspore.int32)) self.accu_grads = self.weights.clone(prefix="accu_grads", init='zeros') - self.accu_overflow = Parameter(initializer(0, [1], ms.int32)) - self.accu_loss = Parameter(initializer(0, [1], ms.float32)) + self.accu_overflow = Parameter(initializer(0, [1], mindspore.int32)) + self.accu_loss = Parameter(initializer(0, [1], mindspore.float32)) self.grad = ops.GradOperation(get_by_list=True, sens_param=True) self.reducer_flag = False - self.parallel_mode = ms.get_auto_parallel_context("parallel_mode") + self.parallel_mode = mindspore.get_auto_parallel_context("parallel_mode") if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]: self.reducer_flag = True self.grad_reducer = ops.identity @@ -382,7 +382,7 @@ class TransformerTrainAccumulationAllReducePostWithLossScaleCell(nn.Cell): self.overflow_reducer = ops.AllReduce() self.cast = ops.Cast() self.reduce_sum = ops.ReduceSum(keep_dims=False) - self.base = Tensor(1, ms.float32) + self.base = Tensor(1, mindspore.float32) self.less_equal = ops.LessEqual() self.logical_or = ops.LogicalOr() self.not_equal = ops.NotEqual() @@ -392,7 +392,7 @@ class TransformerTrainAccumulationAllReducePostWithLossScaleCell(nn.Cell): self.loss_scale = None self.loss_scaling_manager = scale_update_cell if scale_update_cell: - self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=ms.float32)) + self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mindspore.float32)) self.enable_tuple_broaden = True @jit @@ -455,7 +455,7 @@ class TransformerTrainAccumulationAllReducePostWithLossScaleCell(nn.Cell): label_ids, label_weights, self.cast(scaling_sens, - ms.float32)) + mindspore.float32)) accu_succ = self.clip_accumlate_hyper_map(grads) mean_loss = ops.depend(mean_loss, accu_succ) diff --git a/official/nlp/Transformer/src/transformer_model.py b/official/nlp/Transformer/src/transformer_model.py index 
c359d7d68970d2ad41f37643a70117dead8b6f0d..8c99c7002bfad02496d7c16b0389994098f9e0de 100644 --- a/official/nlp/Transformer/src/transformer_model.py +++ b/official/nlp/Transformer/src/transformer_model.py @@ -17,7 +17,7 @@ import math import copy import numpy as np -import mindspore as ms +import mindspore import mindspore.ops as ops import mindspore.nn as nn from mindspore.common.tensor import Tensor @@ -53,8 +53,8 @@ class TransformerConfig: beam_width (int): beam width setting. Default: 4 max_decode_length (int): max decode length in evaluation. Default: 80 length_penalty_weight (float): normalize scores of translations according to their length. Default: 1.0 - dtype (:class:`mindspore.dtype`): Data type of the input. Default: ms.float32. - compute_type (:class:`mindspore.dtype`): Compute type in Transformer. Default: ms.float32. + dtype (:class:`mindspore.dtype`): Data type of the input. Default: mindspore.float32. + compute_type (:class:`mindspore.dtype`): Compute type in Transformer. Default: mindspore.float32. """ def __init__(self, batch_size, @@ -73,8 +73,8 @@ class TransformerConfig: beam_width=4, max_decode_length=80, length_penalty_weight=1.0, - dtype=ms.float32, - compute_type=ms.float32): + dtype=mindspore.float32, + compute_type=mindspore.float32): self.batch_size = batch_size self.seq_length = seq_length self.vocab_size = vocab_size @@ -119,8 +119,8 @@ class EmbeddingLookup(nn.Cell): self.shape_flat = (-1,) self.gather = ops.Gather() self.one_hot = ops.OneHot() - self.on_value = Tensor(1.0, ms.float32) - self.off_value = Tensor(0.0, ms.float32) + self.on_value = Tensor(1.0, mindspore.float32) + self.off_value = Tensor(0.0, mindspore.float32) self.array_mul = ops.MatMul() self.reshape = ops.Reshape() self.shape = ops.Shape() @@ -185,14 +185,14 @@ class EmbeddingPostprocessor(nn.Cell): max_position_embeddings=128, dropout_prob=0.1): super(EmbeddingPostprocessor, self).__init__() - self.scores_mul = Tensor([math.sqrt(float(embedding_size))], dtype=ms.float32) + self.scores_mul = Tensor([math.sqrt(float(embedding_size))], dtype=mindspore.float32) self.multiply = ops.Mul() self.add = ops.Add() self.dropout = nn.Dropout(p=dropout_prob) self.use_dropout = dropout_prob > 0 self.expand_dims = ops.ExpandDims() self.position_embedding_table = Tensor(position_encoding(max_position_embeddings, embedding_size), - ms.float32) + mindspore.float32) self.shape = ops.Shape() def construct(self, word_embeddings): @@ -216,7 +216,7 @@ class CastWrapper(nn.Cell): """ Cast wrapper. """ - def __init__(self, src_type=ms.float32, dst_type=ms.float32): + def __init__(self, src_type=mindspore.float32, dst_type=mindspore.float32): super(CastWrapper, self).__init__() self.cast = ops.Cast() self.dst_type = dst_type @@ -237,7 +237,7 @@ class LayerPreprocess(nn.Cell): self.get_dtype = ops.DType() def construct(self, input_tensor): - output = self.cast(input_tensor, ms.float32) + output = self.cast(input_tensor, mindspore.float32) output = self.layernorm(output) output = self.cast(output, self.get_dtype(input_tensor)) return output @@ -284,7 +284,7 @@ class MultiheadAttention(nn.Cell): initializer_range (float): Initialization value of TruncatedNormal. Default: 0.02. do_return_2d_tensor (bool): True for return 2d tensor. False for return 3d tensor. Default: False. - compute_type (:class:`mindspore.dtype`): Compute type in MultiheadAttention. Default: ms.float32. + compute_type (:class:`mindspore.dtype`): Compute type in MultiheadAttention. Default: mindspore.float32. 
""" def __init__(self, batch_size, @@ -302,7 +302,7 @@ class MultiheadAttention(nn.Cell): use_one_hot_embeddings=False, initializer_range=0.02, do_return_2d_tensor=True, - compute_type=ms.float32): + compute_type=mindspore.float32): super(MultiheadAttention, self).__init__() self.batch_size = batch_size self.num_attention_heads = num_attention_heads @@ -398,7 +398,7 @@ class MultiheadAttention(nn.Cell): adder = self.multiply(multiply_out, self.multiply_data) attention_scores = self.add(adder, attention_scores) - attention_scores = self.softmax_cast(attention_scores, ms.float32) + attention_scores = self.softmax_cast(attention_scores, mindspore.float32) attention_probs = self.softmax(attention_scores) attention_probs = self.softmax_cast(attention_probs, self.get_dtype(key_layer)) if self.use_dropout: @@ -431,7 +431,7 @@ class SelfAttention(nn.Cell): hidden_dropout_prob (float): The dropout probability for hidden outputs. Default: 0.1. has_attention_mask (bool): Specifies whether has attention mask. Default: True. is_encdec_att (bool): Specifies whether query sequence and memory sequence are different. Default: False. - compute_type (:class:`mindspore.dtype`): Compute type in MultiheadAttention. Default: ms.float32. + compute_type (:class:`mindspore.dtype`): Compute type in MultiheadAttention. Default: mindspore.float32. """ def __init__(self, batch_size, @@ -443,7 +443,7 @@ class SelfAttention(nn.Cell): hidden_dropout_prob=0.1, has_attention_mask=True, is_encdec_att=False, - compute_type=ms.float32): + compute_type=mindspore.float32): super(SelfAttention, self).__init__() if hidden_size % num_attention_heads != 0: raise ValueError("The hidden size (%d) is not a multiple of the number " @@ -496,7 +496,7 @@ class FeedForward(nn.Cell): hidden_act (str): name of the activation function. Default: relu initializer_range (float): Initialization value of TruncatedNormal. Default: 0.02. hidden_dropout_prob (float): The dropout probability for hidden outputs. Default: 0.1. - compute_type (:class:`mindspore.dtype`): Compute type in FeedForward. Default: ms.float32. + compute_type (:class:`mindspore.dtype`): Compute type in FeedForward. Default: mindspore.float32. """ def __init__(self, in_channels, @@ -505,7 +505,7 @@ class FeedForward(nn.Cell): hidden_act="relu", initializer_range=0.02, hidden_dropout_prob=0.1, - compute_type=ms.float32): + compute_type=mindspore.float32): super(FeedForward, self).__init__() self.conv1 = nn.Dense(in_channels, @@ -551,7 +551,7 @@ class EncoderCell(nn.Cell): initializer_range (float): Initialization value of TruncatedNormal. Default: 0.1. hidden_dropout_prob (float): The dropout probability for hidden outputs. Default: 0.1. hidden_act (str): Activation function. Default: "relu". - compute_type (:class:`mindspore.dtype`): Compute type in attention. Default: ms.float32. + compute_type (:class:`mindspore.dtype`): Compute type in attention. Default: mindspore.float32. """ def __init__(self, batch_size, @@ -563,7 +563,7 @@ class EncoderCell(nn.Cell): initializer_range=0.02, hidden_dropout_prob=0.1, hidden_act="relu", - compute_type=ms.float32): + compute_type=mindspore.float32): super(EncoderCell, self).__init__() self.attention = SelfAttention( batch_size=batch_size, @@ -609,7 +609,7 @@ class TransformerEncoder(nn.Cell): initializer_range (float): Initialization value of TruncatedNormal. Default: 0.02. hidden_dropout_prob (float): The dropout probability for hidden outputs. Default: 0.1.. hidden_act (str): Activation function used in the encoder cells. Default: "gelu". 
- compute_type (:class:`mindspore.dtype`): Compute type. Default: ms.float32. + compute_type (:class:`mindspore.dtype`): Compute type. Default: mindspore.float32. """ def __init__(self, batch_size, @@ -622,7 +622,7 @@ class TransformerEncoder(nn.Cell): initializer_range=0.02, hidden_dropout_prob=0.1, hidden_act="relu", - compute_type=ms.float32): + compute_type=mindspore.float32): super(TransformerEncoder, self).__init__() self.num_hidden_layers = num_hidden_layers self.batch_size = batch_size @@ -679,7 +679,7 @@ class DecoderCell(nn.Cell): initializer_range (float): Initialization value of TruncatedNormal. Default: 0.02. hidden_dropout_prob (float): The dropout probability for hidden outputs. Default: 0.1. hidden_act (str): Activation function. Default: "relu". - compute_type (:class:`mindspore.dtype`): Compute type in attention. Default: ms.float32. + compute_type (:class:`mindspore.dtype`): Compute type in attention. Default: mindspore.float32. """ def __init__(self, batch_size, @@ -691,7 +691,7 @@ class DecoderCell(nn.Cell): initializer_range=0.02, hidden_dropout_prob=0.1, hidden_act="relu", - compute_type=ms.float32): + compute_type=mindspore.float32): super(DecoderCell, self).__init__() self.self_attention = SelfAttention( batch_size=batch_size, @@ -751,7 +751,7 @@ class TransformerDecoder(nn.Cell): initializer_range (float): Initialization value of TruncatedNormal. Default: 0.02. hidden_dropout_prob (float): The dropout probability for hidden outputs. Default: 0.1. hidden_act (str): Activation function used in the encoder cells. Default: "gelu". - compute_type (:class:`mindspore.dtype`): Compute type. Default: ms.float32. + compute_type (:class:`mindspore.dtype`): Compute type. Default: mindspore.float32. """ def __init__(self, batch_size, @@ -764,7 +764,7 @@ class TransformerDecoder(nn.Cell): initializer_range=0.02, hidden_dropout_prob=0.1, hidden_act="relu", - compute_type=ms.float32): + compute_type=mindspore.float32): super(TransformerDecoder, self).__init__() self.num_hidden_layers = num_hidden_layers @@ -825,7 +825,7 @@ class CreateAttentionMaskFromInputMask(nn.Cell): shape_right = (input_shape[0], 1, input_shape[1]) shape_left = input_shape + (1,) - input_mask = self.cast(input_mask, ms.float32) + input_mask = self.cast(input_mask, mindspore.float32) mask_left = self.reshape(input_mask, shape_left) mask_right = self.reshape(input_mask, shape_right) attention_mask = self.batch_matmul(mask_left, mask_right) @@ -841,14 +841,14 @@ class PredLogProbs(nn.Cell): batch_size (int): Batch size. seq_length (int): Length of input sequence. width (int): Hidden size. - compute_type (:class:`mindspore.dtype`): Compute type. Default: ms.float32. - dtype (:class:`mindspore.dtype`): Compute type to compute log_softmax. Default: ms.float32. + compute_type (:class:`mindspore.dtype`): Compute type. Default: mindspore.float32. + dtype (:class:`mindspore.dtype`): Compute type to compute log_softmax. Default: mindspore.float32. """ def __init__(self, batch_size, width, - compute_type=ms.float32, - dtype=ms.float32): + compute_type=mindspore.float32, + dtype=mindspore.float32): super(PredLogProbs, self).__init__() self.batch_size = batch_size self.width = width @@ -896,7 +896,7 @@ class TransformerDecoderStep(nn.Cell): initializer_range (float): Initialization value of TruncatedNormal. Default: 0.02. hidden_dropout_prob (float): The dropout probability for hidden outputs. Default: 0.1. hidden_act (str): Activation function used in the encoder cells. Default: "gelu". 
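The `CreateAttentionMaskFromInputMask` hunk just above turns a `(batch, seq)` validity mask into a `(batch, seq, seq)` attention mask with a batched outer product; the cast to float32 is presumably there so `BatchMatMul` receives a floating-point input. The same computation in plain NumPy, as a sketch of what the cell produces:

```python
import numpy as np

def attention_mask_from_input_mask(input_mask):
    """(batch, seq) 0/1 validity mask -> (batch, seq, seq) attention mask."""
    m = input_mask.astype(np.float32)
    # mask_left (B, S, 1) times mask_right (B, 1, S): entry [b, i, j] is 1
    # only when positions i and j are both valid tokens.
    return m[:, :, None] * m[:, None, :]

print(attention_mask_from_input_mask(np.array([[1, 1, 0]])))
# [[[1. 1. 0.]
#   [1. 1. 0.]
#   [0. 0. 0.]]]
```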
- compute_type (:class:`mindspore.dtype`): Compute type. Default: ms.float32. + compute_type (:class:`mindspore.dtype`): Compute type. Default: mindspore.float32. embedding_lookup (:class:`EmbeddingLookup`): Embedding lookup module. embedding_processor (:class:`EmbeddingPostprocessor`) Embedding postprocessor module. projection (:class:`PredLogProbs`): PredLogProbs module @@ -913,7 +913,7 @@ class TransformerDecoderStep(nn.Cell): initializer_range=0.02, hidden_dropout_prob=0.3, hidden_act="relu", - compute_type=ms.float32, + compute_type=mindspore.float32, embedding_lookup=None, embedding_processor=None, projection=None): @@ -945,7 +945,7 @@ class TransformerDecoderStep(nn.Cell): self.multiply = ops.Mul() ones = np.ones(shape=(max_decode_length, max_decode_length)) - self.future_mask = Tensor(np.tril(ones), dtype=ms.float32) + self.future_mask = Tensor(np.tril(ones), dtype=mindspore.float32) self.cast_compute_type = CastWrapper(dst_type=compute_type) @@ -985,7 +985,7 @@ class TransformerDecoderStep(nn.Cell): @constexpr def convert_np_to_tensor_encoder(seq_length): ones = np.ones(shape=(seq_length, seq_length)) - return Tensor(np.tril(ones), dtype=ms.float32) + return Tensor(np.tril(ones), dtype=mindspore.float32) class TransformerModel(nn.Cell): @@ -1099,7 +1099,7 @@ class TransformerModel(nn.Cell): self.tfm_decoder.add_flags(loop_can_unroll=True) self.tile_beam = TileBeam(beam_width=self.beam_width) ones = np.ones(shape=(self.batch_size, self.max_decode_length)) - self.encdec_mask = Tensor(ones, ms.float32) + self.encdec_mask = Tensor(ones, mindspore.float32) self.cast = ops.Cast() self.dtype = config.dtype diff --git a/official/nlp/Transformer/train.py b/official/nlp/Transformer/train.py index f2057680ebdabfd91f26714c87c81c00e5326947..01c55c94f5352a1c177cc0ac7a8b803fe37d0bda 100644 --- a/official/nlp/Transformer/train.py +++ b/official/nlp/Transformer/train.py @@ -18,7 +18,7 @@ import os import time from easydict import EasyDict as edict -import mindspore as ms +import mindspore from mindspore.common.tensor import Tensor from mindspore.nn.optim import Adam from mindspore.train.model import Model @@ -27,7 +27,7 @@ from mindspore.train.callback import CheckpointConfig, ModelCheckpoint from mindspore.train.callback import Callback, TimeMonitor import mindspore.communication.management as D from mindspore.communication.management import get_rank -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.common import set_seed from src.transformer_for_train import TransformerTrainOneStepCell, TransformerNetworkWithLoss, \ @@ -49,8 +49,8 @@ def get_ms_timestamp(): time_stamp_init = False time_stamp_first = 0 -config.dtype = ms.float32 -config.compute_type = ms.float16 +config.dtype = mindspore.float32 +config.compute_type = mindspore.float16 config.lr_schedule = edict({ 'learning_rate': 2.0, 'warmup_steps': 8000, @@ -114,18 +114,18 @@ def run_transformer_train(): Transformer training. 
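`TransformerDecoderStep` above stores a `future_mask` built with `np.tril`, and `convert_np_to_tensor_encoder` repeats the construction under `@constexpr` so the tensor is folded into the graph at compile time. The mask itself is simply a lower-triangular matrix:

```python
import numpy as np

def future_mask(length):
    """Causal mask: row i may attend to columns 0..i, never to future positions."""
    return np.tril(np.ones((length, length), dtype=np.float32))

print(future_mask(4))
# [[1. 0. 0. 0.]
#  [1. 1. 0. 0.]
#  [1. 1. 1. 0.]
#  [1. 1. 1. 1.]]
```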
""" if config.device_target == "Ascend": - ms.set_context(mode=ms.GRAPH_MODE, device_target=config.device_target, device_id=get_device_id()) + mindspore.set_context(mode=0, device_target=config.device_target, device_id=get_device_id()) else: - ms.set_context(mode=ms.GRAPH_MODE, device_target=config.device_target) - ms.set_context(reserve_class_name_in_scope=False) + mindspore.set_context(mode=0, device_target=config.device_target) + mindspore.set_context(reserve_class_name_in_scope=False) # Set mempool block size in PYNATIVE_MODE for improving memory utilization, which will not take effect in GRAPH_MODE - if ms.get_context("mode") == ms.PYNATIVE_MODE: - ms.set_context(mempool_block_size="31GB") + if mindspore.get_context("mode") == 1: + mindspore.set_context(mempool_block_size="31GB") if config.device_target == "GPU": # Enable graph kernel - ms.set_context(enable_graph_kernel=True, graph_kernel_flags="--enable_parallel_fusion") + mindspore.set_context(enable_graph_kernel=True, graph_kernel_flags="--enable_parallel_fusion") if config.distribute == "true": if config.device_target == "Ascend": device_num = config.device_num @@ -135,8 +135,8 @@ def run_transformer_train(): device_num = D.get_group_size() rank = get_rank() config.device_id = rank - ms.reset_auto_parallel_context() - ms.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, + mindspore.reset_auto_parallel_context() + mindspore.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, device_num=device_num) rank_id = config.device_id % device_num save_ckpt_path = os.path.join(config.save_checkpoint_path, 'ckpt_' + str(get_rank()) + '/') @@ -154,8 +154,8 @@ def run_transformer_train(): netwithloss = TransformerNetworkWithLoss(config, True) if config.checkpoint_path: - parameter_dict = ms.load_checkpoint(config.checkpoint_path) - ms.load_param_into_net(netwithloss, parameter_dict) + parameter_dict = mindspore.load_checkpoint(config.checkpoint_path) + mindspore.load_param_into_net(netwithloss, parameter_dict) hidden_size = config.hidden_size learning_rate = config.lr_schedule.learning_rate if config.device_target == "Ascend" else 1.0 @@ -165,7 +165,7 @@ def run_transformer_train(): warmup_steps=config.lr_schedule.warmup_steps, hidden_size=hidden_size, start_decay_step=config.lr_schedule.start_decay_step, - min_lr=config.lr_schedule.min_lr), ms.float32) + min_lr=config.lr_schedule.min_lr), mindspore.float32) if config.device_target == "GPU" and config.transformer_network == "large": optimizer = Adam(netwithloss.trainable_params(), lr, beta2=config.optimizer_adam_beta2) diff --git a/official/recommend/DeepFM/eval.py b/official/recommend/DeepFM/eval.py index 0f95f1aa29c45be3679e69470af7e3a077c857f1..6856de25f0dbf4e398bff0aab5974d98e6def120 100644 --- a/official/recommend/DeepFM/eval.py +++ b/official/recommend/DeepFM/eval.py @@ -17,7 +17,7 @@ import os import sys import time -from mindspore import context +import mindspore from mindspore.train.model import Model from mindspore.train.serialization import load_checkpoint, load_param_into_net @@ -30,7 +30,7 @@ from src.model_utils.device_adapter import get_device_id sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) device_id = get_device_id() # int(os.getenv('DEVICE_ID', '0')) -context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, device_id=device_id) +mindspore.set_context(mode=0, device_target=config.device_target, device_id=device_id) def add_write(file_path, 
print_str): with open(file_path, 'a+', encoding='utf-8') as file_out: diff --git a/official/recommend/DeepFM/export.py b/official/recommend/DeepFM/export.py index 88cd2ffa3b544f26382fce0fd0d606065bc3aefd..916d2cc7df7ce09fed40e7266c59278f04650fb6 100644 --- a/official/recommend/DeepFM/export.py +++ b/official/recommend/DeepFM/export.py @@ -15,7 +15,8 @@ """export ckpt to model""" import numpy as np -from mindspore import context, Tensor +import mindspore +from mindspore import Tensor from mindspore.train.serialization import export, load_checkpoint from src.deepfm import ModelBuilder @@ -24,9 +25,9 @@ from src.model_utils.device_adapter import get_device_id from src.model_utils.moxing_adapter import moxing_wrapper -context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target) +mindspore.set_context(mode=0, device_target=config.device_target) if config.device_target == "Ascend": - context.set_context(device_id=get_device_id()) + mindspore.set_context(device_id=get_device_id()) def modelarts_process(): pass diff --git a/official/recommend/DeepFM/modelart/start.py b/official/recommend/DeepFM/modelart/start.py index 69ddc52ed1d08eec5f144893c78bbdac06462367..3ca12c64b19a0f8f2df6884bab2ce7c5573953be 100644 --- a/official/recommend/DeepFM/modelart/start.py +++ b/official/recommend/DeepFM/modelart/start.py @@ -16,8 +16,8 @@ import os import sys -from mindspore import context -from mindspore.context import ParallelMode +import mindspore +from mindspore import ParallelMode from mindspore.communication.management import init, get_rank, get_group_size from mindspore.train.model import Model from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor @@ -49,11 +49,11 @@ def train_deepfm(): if config.rank_size > 1: if config.device_target == "Ascend": device_id = int(os.getenv('DEVICE_ID')) - context.set_context(mode=context.GRAPH_MODE, + mindspore.set_context(mode=0, device_target=config.device_target, device_id=device_id) - context.reset_auto_parallel_context() - context.set_auto_parallel_context( + mindspore.reset_auto_parallel_context() + mindspore.set_auto_parallel_context( parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, all_reduce_fusion_config=[9, 11]) @@ -61,13 +61,13 @@ def train_deepfm(): rank_id = int(os.environ.get('RANK_ID')) elif config.device_target == "GPU": init() - context.set_context(mode=context.GRAPH_MODE, + mindspore.set_context(mode=0, enable_graph_kernel=True, device_target=config.device_target) - context.set_context( + mindspore.set_context( graph_kernel_flags="--enable_cluster_ops=MatMul") - context.reset_auto_parallel_context() - context.set_auto_parallel_context( + mindspore.reset_auto_parallel_context() + mindspore.set_auto_parallel_context( device_num=get_group_size(), parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True) @@ -78,17 +78,17 @@ def train_deepfm(): else: if config.device_target == "Ascend": device_id = int(os.getenv('DEVICE_ID')) - context.set_context(mode=context.GRAPH_MODE, + mindspore.set_context(mode=0, device_target=config.device_target, device_id=device_id) elif config.device_target == "GPU": - context.set_context(mode=context.GRAPH_MODE, + mindspore.set_context(mode=0, enable_graph_kernel=True, device_target=config.device_target) - context.set_context( + mindspore.set_context( graph_kernel_flags="--enable_cluster_ops=MatMul") else: - context.set_context(mode=context.GRAPH_MODE, + mindspore.set_context(mode=0, device_target=config.device_target) config.rank_size = None rank_id = None diff 
--git a/official/recommend/DeepFM/src/deepfm.py b/official/recommend/DeepFM/src/deepfm.py index 26ac9a5ec5f749992ca2e56fba136c7141d52922..c564ceb6b72dd3b8ccab5300f89239888d8d9409 100644 --- a/official/recommend/DeepFM/src/deepfm.py +++ b/official/recommend/DeepFM/src/deepfm.py @@ -28,7 +28,7 @@ from mindspore.nn.metrics import Metric from mindspore import nn, Tensor, ParameterTuple, Parameter from mindspore.common.initializer import Uniform, initializer from mindspore.train.callback import ModelCheckpoint, CheckpointConfig -from mindspore.context import ParallelMode, get_auto_parallel_context +from mindspore import ParallelMode, get_auto_parallel_context from mindspore.communication.management import get_group_size from mindspore.nn.wrap.grad_reducer import DistributedGradReducer diff --git a/official/recommend/DeepFM/src/model_utils/device_adapter.py b/official/recommend/DeepFM/src/model_utils/device_adapter.py index 7c5d7f837ddaa8f53cf8dc5573cac0e36881e7b1..825c667a291403bd129078ad476d6fa4af5e1bcc 100644 --- a/official/recommend/DeepFM/src/model_utils/device_adapter.py +++ b/official/recommend/DeepFM/src/model_utils/device_adapter.py @@ -15,6 +15,7 @@ """Device adapter for ModelArts""" +import mindspore from .config import config if config.enable_modelarts: diff --git a/official/recommend/DeepFM/src/model_utils/moxing_adapter.py b/official/recommend/DeepFM/src/model_utils/moxing_adapter.py index 830d19a6fc99de8d602703971d5ac5b24e060d11..9c6d88e5d6e9831a21d75dc52fcc166584f8e61e 100644 --- a/official/recommend/DeepFM/src/model_utils/moxing_adapter.py +++ b/official/recommend/DeepFM/src/model_utils/moxing_adapter.py @@ -17,7 +17,7 @@ import os import functools -from mindspore import context +import mindspore from mindspore.profiler import Profiler from .config import config @@ -93,7 +93,7 @@ def moxing_wrapper(pre_process=None, post_process=None): sync_data(config.train_url, config.output_path) print("Workspace downloaded: ", os.listdir(config.output_path)) - context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) + mindspore.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) config.device_num = get_device_num() config.device_id = get_device_id() if not os.path.exists(config.output_path): diff --git a/official/recommend/DeepFM/train.py b/official/recommend/DeepFM/train.py index 2cb40957b0d11688422fa4b66178514fa89607bc..4f57f702e64521b9b17497fad22c9e7ec916353f 100644 --- a/official/recommend/DeepFM/train.py +++ b/official/recommend/DeepFM/train.py @@ -16,8 +16,8 @@ import os import sys -from mindspore import context -from mindspore.context import ParallelMode +import mindspore +from mindspore import ParallelMode from mindspore.communication.management import init, get_rank, get_group_size from mindspore.train.model import Model from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor @@ -45,23 +45,23 @@ def modelarts_pre_process(): def train_deepfm(): """ train_deepfm """ if config.device_target == "Ascend": - context.set_context(ascend_config={"op_precision_mode": "op_precision.ini"}) + mindspore.set_context(ascend_config={"op_precision_mode": "op_precision.ini"}) if config.rank_size > 1: if config.device_target == "Ascend": device_id = int(os.getenv('DEVICE_ID')) - context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, device_id=device_id) - context.reset_auto_parallel_context() - context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, + 
mindspore.set_context(mode=0, device_target=config.device_target, device_id=device_id) + mindspore.reset_auto_parallel_context() + mindspore.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, all_reduce_fusion_config=[9, 11]) init() rank_id = int(os.environ.get('RANK_ID')) elif config.device_target == "GPU": init() - context.set_context(mode=context.GRAPH_MODE, enable_graph_kernel=True, device_target=config.device_target) - context.set_context(graph_kernel_flags="--enable_cluster_ops=MatMul") - context.reset_auto_parallel_context() - context.set_auto_parallel_context(device_num=get_group_size(), + mindspore.set_context(mode=0, enable_graph_kernel=True, device_target=config.device_target) + mindspore.set_context(graph_kernel_flags="--enable_cluster_ops=MatMul") + mindspore.reset_auto_parallel_context() + mindspore.set_auto_parallel_context(device_num=get_group_size(), parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True) rank_id = get_rank() @@ -71,12 +71,12 @@ def train_deepfm(): else: if config.device_target == "Ascend": device_id = int(os.getenv('DEVICE_ID')) - context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, device_id=device_id) + mindspore.set_context(mode=0, device_target=config.device_target, device_id=device_id) elif config.device_target == "GPU": - context.set_context(mode=context.GRAPH_MODE, enable_graph_kernel=True, device_target=config.device_target) - context.set_context(graph_kernel_flags="--enable_cluster_ops=MatMul") + mindspore.set_context(mode=0, enable_graph_kernel=True, device_target=config.device_target) + mindspore.set_context(graph_kernel_flags="--enable_cluster_ops=MatMul") else: - context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, enable_graph_kernel=True) + mindspore.set_context(mode=0, device_target=config.device_target, enable_graph_kernel=True) config.rank_size = None rank_id = None diff --git a/official/recommend/Wide_and_Deep/eval.py b/official/recommend/Wide_and_Deep/eval.py index d8c448fcb01bb1ab7fc913aa2ef70af72a156f10..b936350f9d10c5ea2eee96ecc4082af8a70d2b29 100644 --- a/official/recommend/Wide_and_Deep/eval.py +++ b/official/recommend/Wide_and_Deep/eval.py @@ -17,7 +17,8 @@ import os -from mindspore import Model, context +import mindspore +from mindspore import Model from mindspore.train.serialization import load_checkpoint, load_param_into_net,\ build_searched_strategy, merge_sliced_parameter @@ -118,7 +119,7 @@ def modelarts_pre_process(): @moxing_wrapper(pre_process=modelarts_pre_process) def eval_wide_and_deep(): - context.set_context(mode=context.GRAPH_MODE, device_target=cfg.device_target) + mindspore.set_context(mode=0, device_target=cfg.device_target) test_eval(cfg) if __name__ == "__main__": diff --git a/official/recommend/Wide_and_Deep/export.py b/official/recommend/Wide_and_Deep/export.py index 55867a8af24460c040176be613764a76cc64aa2a..5af23215991109856577eb96a5d9eb1b62d36b23 100644 --- a/official/recommend/Wide_and_Deep/export.py +++ b/official/recommend/Wide_and_Deep/export.py @@ -16,7 +16,9 @@ ##############export checkpoint file into air, mindir and onnx models################# """ import numpy as np -from mindspore import Tensor, context, load_checkpoint, export, load_param_into_net + +import mindspore +from mindspore import Tensor, load_checkpoint, export, load_param_into_net from eval import ModelBuilder from src.model_utils.device_adapter import get_device_id @@ -24,9 +26,9 @@ from src.model_utils.config import config from 
src.model_utils.moxing_adapter import moxing_wrapper -context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target) +mindspore.set_context(mode=0, device_target=config.device_target) if config.device_target == "Ascend": - context.set_context(device_id=get_device_id()) + mindspore.set_context(device_id=get_device_id()) def modelarts_pre_process(): pass diff --git a/official/recommend/Wide_and_Deep/modelart/start.py b/official/recommend/Wide_and_Deep/modelart/start.py index dedbb8f815d247a185d5a610ebd13dac0de5c94f..38299bd3cf8939df760bb1478e17c57b49059d66 100644 --- a/official/recommend/Wide_and_Deep/modelart/start.py +++ b/official/recommend/Wide_and_Deep/modelart/start.py @@ -14,7 +14,9 @@ # ============================================================================ """wideanddeep modelarts""" import os -from mindspore import Model, context + +import mindspore +from mindspore import Model from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor import moxing as mox from src.wide_and_deep import PredictWithSigmoid, TrainStepWrap, NetWithLossClass, WideDeepModel @@ -104,11 +106,11 @@ def modelarts_pre_process(): def train_wide_and_deep(): """train wide and deep""" enable_graph_kernel_ = config.device_target == "GPU" - context.set_context(mode=context.GRAPH_MODE, + mindspore.set_context(mode=0, enable_graph_kernel=enable_graph_kernel_, device_target=config.device_target) if enable_graph_kernel_: - context.set_context(graph_kernel_flags="--enable_cluster_ops=MatMul") + mindspore.set_context(graph_kernel_flags="--enable_cluster_ops=MatMul") test_train(config) diff --git a/official/recommend/Wide_and_Deep/src/callbacks.py b/official/recommend/Wide_and_Deep/src/callbacks.py index c10e221ad6b598495b7ce84b239a278e97724a03..90fdfa42f00c90cd1e116da5395c479f2dfb1630 100644 --- a/official/recommend/Wide_and_Deep/src/callbacks.py +++ b/official/recommend/Wide_and_Deep/src/callbacks.py @@ -15,9 +15,10 @@ callbacks """ import time + +import mindspore from mindspore.train.callback import Callback -from mindspore import context -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.communication.management import get_rank def add_write(file_path, out_str): @@ -57,7 +58,7 @@ class LossCallBack(Callback): cur_step_in_epoch = (cb_params.cur_step_num - 1) % cb_params.batch_num + 1 cur_num = cb_params.cur_step_num rank_id = 0 - parallel_mode = context.get_auto_parallel_context("parallel_mode") + parallel_mode = mindspore.get_auto_parallel_context("parallel_mode") if parallel_mode in (ParallelMode.SEMI_AUTO_PARALLEL, ParallelMode.AUTO_PARALLEL, ParallelMode.DATA_PARALLEL): rank_id = get_rank() @@ -107,9 +108,9 @@ class EvalCallBack(Callback): epoch end """ self.aucMetric.clear() - parallel_mode = context.get_auto_parallel_context("parallel_mode") + parallel_mode = mindspore.get_auto_parallel_context("parallel_mode") if parallel_mode in (ParallelMode.SEMI_AUTO_PARALLEL, ParallelMode.AUTO_PARALLEL): - context.set_auto_parallel_context(strategy_ckpt_save_file="", + mindspore.set_auto_parallel_context(strategy_ckpt_save_file="", strategy_ckpt_load_file=self.config.stra_ckpt) rank_id = 0 if parallel_mode in (ParallelMode.SEMI_AUTO_PARALLEL, ParallelMode.AUTO_PARALLEL, diff --git a/official/recommend/Wide_and_Deep/src/model_utils/device_adapter.py b/official/recommend/Wide_and_Deep/src/model_utils/device_adapter.py index 7c5d7f837ddaa8f53cf8dc5573cac0e36881e7b1..825c667a291403bd129078ad476d6fa4af5e1bcc 100644 --- 
a/official/recommend/Wide_and_Deep/src/model_utils/device_adapter.py +++ b/official/recommend/Wide_and_Deep/src/model_utils/device_adapter.py @@ -15,6 +15,7 @@ """Device adapter for ModelArts""" +import mindspore from .config import config if config.enable_modelarts: diff --git a/official/recommend/Wide_and_Deep/src/model_utils/moxing_adapter.py b/official/recommend/Wide_and_Deep/src/model_utils/moxing_adapter.py index 830d19a6fc99de8d602703971d5ac5b24e060d11..9c6d88e5d6e9831a21d75dc52fcc166584f8e61e 100644 --- a/official/recommend/Wide_and_Deep/src/model_utils/moxing_adapter.py +++ b/official/recommend/Wide_and_Deep/src/model_utils/moxing_adapter.py @@ -17,7 +17,7 @@ import os import functools -from mindspore import context +import mindspore from mindspore.profiler import Profiler from .config import config @@ -93,7 +93,7 @@ def moxing_wrapper(pre_process=None, post_process=None): sync_data(config.train_url, config.output_path) print("Workspace downloaded: ", os.listdir(config.output_path)) - context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) + mindspore.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) config.device_num = get_device_num() config.device_id = get_device_id() if not os.path.exists(config.output_path): diff --git a/official/recommend/Wide_and_Deep/src/wide_and_deep.py b/official/recommend/Wide_and_Deep/src/wide_and_deep.py index 4c3a5c44680dd27aeadab774cb37ab6347b2b947..5ccab1a0f73638df46719e3b8fdf3bb91018082b 100644 --- a/official/recommend/Wide_and_Deep/src/wide_and_deep.py +++ b/official/recommend/Wide_and_Deep/src/wide_and_deep.py @@ -14,14 +14,15 @@ # ============================================================================ """wide and deep model""" import numpy as np -from mindspore import nn, context +import mindspore +from mindspore import nn from mindspore import Parameter, ParameterTuple import mindspore.common.dtype as mstype import mindspore.ops as ops from mindspore.nn import Dropout from mindspore.nn.optim import Adam, FTRL from mindspore.common.initializer import Uniform, initializer -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.nn.wrap.grad_reducer import DistributedGradReducer from mindspore.communication.management import get_group_size @@ -137,7 +138,7 @@ class WideDeepModel(nn.Cell): self.batch_size = config.batch_size host_device_mix = bool(config.host_device_mix) parameter_server = bool(config.parameter_server) - parallel_mode = context.get_auto_parallel_context("parallel_mode") + parallel_mode = mindspore.get_auto_parallel_context("parallel_mode") is_auto_parallel = parallel_mode in (ParallelMode.SEMI_AUTO_PARALLEL, ParallelMode.AUTO_PARALLEL) if is_auto_parallel: self.batch_size = self.batch_size * get_group_size() @@ -275,7 +276,7 @@ class NetWithLossClass(nn.Cell): host_device_mix = bool(config.host_device_mix) parameter_server = bool(config.parameter_server) sparse = config.sparse - parallel_mode = context.get_auto_parallel_context("parallel_mode") + parallel_mode = mindspore.get_auto_parallel_context("parallel_mode") is_auto_parallel = parallel_mode in (ParallelMode.SEMI_AUTO_PARALLEL, ParallelMode.AUTO_PARALLEL) self.no_l2loss = (is_auto_parallel if (host_device_mix or config.field_slice) else parameter_server) @@ -332,7 +333,7 @@ class TrainStepWrap(nn.Cell): def __init__(self, network, sens=1024.0, host_device_mix=False, parameter_server=False, sparse=False, cache_enable=False): super(TrainStepWrap, self).__init__() - 
parallel_mode = context.get_auto_parallel_context("parallel_mode")
+        parallel_mode = mindspore.get_auto_parallel_context("parallel_mode")
         is_auto_parallel = parallel_mode in (ParallelMode.SEMI_AUTO_PARALLEL, ParallelMode.AUTO_PARALLEL)
         self.network = network
         self.network.set_train()
@@ -377,8 +378,8 @@ class TrainStepWrap(nn.Cell):
         self.reducer_flag = parallel_mode in (ParallelMode.DATA_PARALLEL,
                                               ParallelMode.HYBRID_PARALLEL)
         if self.reducer_flag:
-            mean = context.get_auto_parallel_context("gradients_mean")
-            degree = context.get_auto_parallel_context("device_num")
+            mean = mindspore.get_auto_parallel_context("gradients_mean")
+            degree = mindspore.get_auto_parallel_context("device_num")
             self.grad_reducer_w = DistributedGradReducer(self.optimizer_w.parameters, mean, degree)
             self.grad_reducer_d = DistributedGradReducer(self.optimizer_d.parameters, mean, degree)
@@ -409,8 +410,8 @@ class PredictWithSigmoid(nn.Cell):
         super(PredictWithSigmoid, self).__init__()
         self.network = network
         self.sigmoid = ops.Sigmoid()
-        parallel_mode = context.get_auto_parallel_context("parallel_mode")
-        full_batch = context.get_auto_parallel_context("full_batch")
+        parallel_mode = mindspore.get_auto_parallel_context("parallel_mode")
+        full_batch = mindspore.get_auto_parallel_context("full_batch")
         is_auto_parallel = parallel_mode in (ParallelMode.SEMI_AUTO_PARALLEL, ParallelMode.AUTO_PARALLEL)
         if is_auto_parallel and full_batch:
             self.sigmoid.shard(((1, 1),))
diff --git a/official/recommend/Wide_and_Deep/train.py b/official/recommend/Wide_and_Deep/train.py
index 5d018831d9dccada5730bcdcb6b1ba3b0f7cd7e8..3d1d7894dc50eac21c723001321db2f38de53087 100644
--- a/official/recommend/Wide_and_Deep/train.py
+++ b/official/recommend/Wide_and_Deep/train.py
@@ -13,7 +13,9 @@
 # limitations under the License.
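The `TrainStepWrap` hunk above shows the migrated wiring for gradient reduction: a `DistributedGradReducer` is created only under data or hybrid parallelism, with its averaging flag and device count read from the auto-parallel context. A condensed sketch using the same API names as this patch:

```python
import mindspore
from mindspore import ParallelMode
from mindspore.nn.wrap.grad_reducer import DistributedGradReducer

def build_grad_reducer(parameters):
    """Return a DistributedGradReducer, or None outside data/hybrid parallelism."""
    parallel_mode = mindspore.get_auto_parallel_context("parallel_mode")
    if parallel_mode not in (ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL):
        return None  # single-device training consumes gradients as-is
    mean = mindspore.get_auto_parallel_context("gradients_mean")
    degree = mindspore.get_auto_parallel_context("device_num")
    return DistributedGradReducer(parameters, mean, degree)
```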
""" test_training """ import os -from mindspore import Model, context + +import mindspore +from mindspore import Model from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor from src.wide_and_deep import PredictWithSigmoid, TrainStepWrap, NetWithLossClass, WideDeepModel from src.callbacks import LossCallBack @@ -93,12 +95,12 @@ def modelarts_pre_process(): @moxing_wrapper(pre_process=modelarts_pre_process) def train_wide_and_deep(): _enable_graph_kernel = config.device_target == "GPU" - context.set_context(mode=context.GRAPH_MODE, + mindspore.set_context(mode=0, enable_graph_kernel=_enable_graph_kernel, device_target=config.device_target) if config.device_target == "Ascend": - context.set_context(ascend_config={"op_precision_mode": "op_precision.ini"}) + mindspore.set_context(ascend_config={"op_precision_mode": "op_precision.ini"}) if _enable_graph_kernel: - context.set_context(graph_kernel_flags="--enable_cluster_ops=MatMul") + mindspore.set_context(graph_kernel_flags="--enable_cluster_ops=MatMul") test_train(config) if __name__ == "__main__": diff --git a/official/recommend/Wide_and_Deep/train_and_eval.py b/official/recommend/Wide_and_Deep/train_and_eval.py index 88b0a16beb20db5a83f49e37883a967a4fb0054f..61a4689c3d493d12be3431a9d649ccc77104030a 100644 --- a/official/recommend/Wide_and_Deep/train_and_eval.py +++ b/official/recommend/Wide_and_Deep/train_and_eval.py @@ -14,7 +14,8 @@ """ test_training """ import os -from mindspore import Model, context +import mindspore +from mindspore import Model from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor from src.wide_and_deep import PredictWithSigmoid, TrainStepWrap, NetWithLossClass, WideDeepModel @@ -109,12 +110,12 @@ def modelarts_pre_process(): @moxing_wrapper(pre_process=modelarts_pre_process) def train_wide_and_deep(): if cfg.device_target == "Ascend": - context.set_context(ascend_config={"op_precision_mode": "op_precision.ini"}) + mindspore.set_context(ascend_config={"op_precision_mode": "op_precision.ini"}) _enable_graph_kernel = cfg.device_target == "GPU" - context.set_context(mode=context.GRAPH_MODE, + mindspore.set_context(mode=0, enable_graph_kernel=_enable_graph_kernel, device_target=cfg.device_target) if _enable_graph_kernel: - context.set_context(graph_kernel_flags="--enable_cluster_ops=MatMul") + mindspore.set_context(graph_kernel_flags="--enable_cluster_ops=MatMul") test_train_eval(cfg) if __name__ == "__main__": diff --git a/official/recommend/Wide_and_Deep/train_and_eval_auto_parallel.py b/official/recommend/Wide_and_Deep/train_and_eval_auto_parallel.py index e82281710c4bb456cee960ffab02dd95727cdb9a..7c3124ae90db2c02d9f55ed9bb95538f16798a86 100644 --- a/official/recommend/Wide_and_Deep/train_and_eval_auto_parallel.py +++ b/official/recommend/Wide_and_Deep/train_and_eval_auto_parallel.py @@ -17,10 +17,12 @@ import os import sys + +import mindspore import mindspore.dataset as ds -from mindspore import Model, context +from mindspore import Model from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.parallel import set_algo_parameters from mindspore.communication.management import get_rank, get_group_size, init @@ -86,7 +88,7 @@ def train_and_eval(config): sparse = config.sparse print("epochs is {}".format(epochs)) if config.full_batch: - context.set_auto_parallel_context(full_batch=True) + mindspore.set_auto_parallel_context(full_batch=True) 
ds.config.set_seed(1) if config.field_slice: compute_manual_shape(config, get_group_size()) @@ -132,7 +134,7 @@ def train_and_eval(config): ckpoint_cb = ModelCheckpoint(prefix='widedeep_train', directory=os.path.join(config.ckpt_path, 'ckpt_' + str(get_rank())), config=ckptconfig) - context.set_auto_parallel_context(strategy_ckpt_save_file=config.stra_ckpt) + mindspore.set_auto_parallel_context(strategy_ckpt_save_file=config.stra_ckpt) callback_list = [TimeMonitor( ds_train.get_dataset_size()), eval_callback, callback] if not host_device_mix: @@ -148,25 +150,25 @@ def modelarts_pre_process(): @moxing_wrapper(pre_process=modelarts_pre_process) def train_wide_and_deep(): """ train_wide_and_deep """ - context.set_context(mode=context.GRAPH_MODE, + mindspore.set_context(mode=0, device_target=cfg.device_target) if cfg.device_target == "Ascend": - context.set_context(ascend_config={"op_precision_mode": "op_precision.ini"}) + mindspore.set_context(ascend_config={"op_precision_mode": "op_precision.ini"}) if cfg.device_target == "GPU": - context.set_context(enable_graph_kernel=True) - context.set_context(max_device_memory="24GB") + mindspore.set_context(enable_graph_kernel=True) + mindspore.set_context(max_device_memory="24GB") init() if cfg.sparse: if cfg.use_sp: - context.set_auto_parallel_context(parallel_mode=ParallelMode.AUTO_PARALLEL, enable_alltoall=True, + mindspore.set_auto_parallel_context(parallel_mode=ParallelMode.AUTO_PARALLEL, enable_alltoall=True, search_mode="sharding_propagation", gradients_mean=True, strategy_ckpt_save_file='strategy.ckpt') set_algo_parameters(elementwise_op_strategy_follow=False, fully_use_devices=False) else: - context.set_auto_parallel_context( + mindspore.set_auto_parallel_context( parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL, gradients_mean=True) else: - context.set_auto_parallel_context( + mindspore.set_auto_parallel_context( parallel_mode=ParallelMode.AUTO_PARALLEL, gradients_mean=True, search_mode="dynamic_programming") train_and_eval(cfg) diff --git a/official/recommend/Wide_and_Deep/train_and_eval_distribute.py b/official/recommend/Wide_and_Deep/train_and_eval_distribute.py index faa2ce17d365d0afd18ebd4a1b3b52a106fda41c..693079014b1ceb4350a3ea95979cfd99b7fe366b 100644 --- a/official/recommend/Wide_and_Deep/train_and_eval_distribute.py +++ b/official/recommend/Wide_and_Deep/train_and_eval_distribute.py @@ -17,9 +17,11 @@ import os import sys -from mindspore import Model, context + +import mindspore +from mindspore import Model from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.communication.management import get_rank, get_group_size, init from mindspore.common import set_seed @@ -124,17 +126,17 @@ def modelarts_pre_process(): @moxing_wrapper(pre_process=modelarts_pre_process) def train_wide_and_deep(): """ train_wide_and_deep """ - context.set_context(mode=context.GRAPH_MODE, device_target=cfg.device_target) + mindspore.set_context(mode=0, device_target=cfg.device_target) if cfg.device_target == "Ascend": - context.set_context(ascend_config={"op_precision_mode": "op_precision.ini"}) + mindspore.set_context(ascend_config={"op_precision_mode": "op_precision.ini"}) _enable_graph_kernel = cfg.device_target == "GPU" if _enable_graph_kernel: - context.set_context(enable_graph_kernel=True) - context.set_context(graph_kernel_flags="--enable_cluster_ops=MatMul") + mindspore.set_context(enable_graph_kernel=True) + 
mindspore.set_context(graph_kernel_flags="--enable_cluster_ops=MatMul") init() - context.set_context(save_graphs_path='./graphs_of_device_id_'+str(get_rank())) - context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, + mindspore.set_context(save_graphs_path='./graphs_of_device_id_'+str(get_rank())) + mindspore.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, device_num=get_group_size(), all_reduce_fusion_config=[6, 12]) train_and_eval(cfg) diff --git a/official/recommend/Wide_and_Deep/train_and_eval_parameter_server_distribute.py b/official/recommend/Wide_and_Deep/train_and_eval_parameter_server_distribute.py index eab55ccc632c98867a20be21befd07b1e19190d4..293695ad6aca21dcb77b2e0525d6d3546a86533f 100644 --- a/official/recommend/Wide_and_Deep/train_and_eval_parameter_server_distribute.py +++ b/official/recommend/Wide_and_Deep/train_and_eval_parameter_server_distribute.py @@ -17,10 +17,12 @@ import os import sys + +import mindspore import mindspore.dataset as ds -from mindspore import Model, context +from mindspore import Model from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.communication.management import get_rank, get_group_size, init from mindspore.common import set_seed @@ -88,7 +90,7 @@ def train_and_eval(config): config.full_batch = True print("epochs is {}".format(epochs)) if config.full_batch and os.getenv("MS_ROLE") == "MS_WORKER": - context.set_auto_parallel_context(full_batch=True) + mindspore.set_auto_parallel_context(full_batch=True) ds.config.set_seed(1) ds_train = create_dataset(data_path, train_mode=True, batch_size=batch_size*get_group_size(), data_type=dataset_type) @@ -116,7 +118,7 @@ def train_and_eval(config): if cache_enable: config.stra_ckpt = os.path.join( config.stra_ckpt + "-{}".format(get_rank()), "strategy.ckpt") - context.set_auto_parallel_context( + mindspore.set_auto_parallel_context( strategy_ckpt_save_file=config.stra_ckpt) eval_callback = EvalCallBack(model, ds_eval, auc_metric, config) @@ -149,7 +151,7 @@ def modelarts_pre_process(): cfg.ckpt_path = cfg.output_path -context.set_context(mode=context.GRAPH_MODE, device_target=cfg.device_target) +mindspore.set_context(mode=0, device_target=cfg.device_target) cache_enable = cfg.vocab_cache_size > 0 @@ -157,26 +159,26 @@ cache_enable = cfg.vocab_cache_size > 0 def train_wide_and_deep(): """ train_wide_and_deep """ if cache_enable and cfg.device_target != "GPU": - context.set_context(max_device_memory="24GB") - context.set_ps_context(enable_ps=True) + mindspore.set_context(max_device_memory="24GB") + mindspore.set_ps_context(enable_ps=True) if cfg.device_target == "Ascend": - context.set_context(ascend_config={"op_precision_mode": "op_precision.ini"}) + mindspore.set_context(ascend_config={"op_precision_mode": "op_precision.ini"}) init() - context.set_context( + mindspore.set_context( save_graphs_path='./graphs_of_device_id_'+str(get_rank())) if cache_enable: if os.getenv("MS_ROLE") == "MS_WORKER": - context.set_auto_parallel_context( + mindspore.set_auto_parallel_context( parallel_mode=ParallelMode.AUTO_PARALLEL, gradients_mean=True, search_mode="dynamic_programming") else: - context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, + mindspore.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, device_num=get_group_size(), 
search_mode="dynamic_programming") cfg.sparse = True if cfg.device_target == "GPU": - context.set_context(enable_graph_kernel=True) - context.set_context(graph_kernel_flags="--enable_cluster_ops=MatMul") + mindspore.set_context(enable_graph_kernel=True) + mindspore.set_context(graph_kernel_flags="--enable_cluster_ops=MatMul") train_and_eval(cfg) diff --git a/official/recommend/Wide_and_Deep/train_and_eval_parameter_server_standalone.py b/official/recommend/Wide_and_Deep/train_and_eval_parameter_server_standalone.py index 5601767d8042c296507740c36d8b859bd1369090..9fe99aa38c00cc2b0a407e0fc2f7853bc1bf2959 100644 --- a/official/recommend/Wide_and_Deep/train_and_eval_parameter_server_standalone.py +++ b/official/recommend/Wide_and_Deep/train_and_eval_parameter_server_standalone.py @@ -17,7 +17,9 @@ import os import sys -from mindspore import Model, context + +import mindspore +from mindspore import Model from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor from mindspore.common import set_seed @@ -120,23 +122,23 @@ def train_and_eval(config): def modelarts_pre_process(): cfg.ckpt_path = cfg.output_path -context.set_context(mode=context.GRAPH_MODE, device_target=cfg.device_target) +mindspore.set_context(mode=0, device_target=cfg.device_target) cache_enable = cfg.vocab_cache_size > 0 @moxing_wrapper(pre_process=modelarts_pre_process) def train_wide_and_deep(): """ train_wide_and_deep """ - context.set_ps_context(enable_ps=True) + mindspore.set_ps_context(enable_ps=True) if cfg.device_target == "Ascend": - context.set_context(ascend_config={"op_precision_mode": "op_precision.ini"}) + mindspore.set_context(ascend_config={"op_precision_mode": "op_precision.ini"}) init() if not cache_enable: cfg.sparse = True if cfg.device_target == "GPU": - context.set_context(enable_graph_kernel=True) - context.set_context(graph_kernel_flags="--enable_cluster_ops=MatMul") + mindspore.set_context(enable_graph_kernel=True) + mindspore.set_context(graph_kernel_flags="--enable_cluster_ops=MatMul") train_and_eval(cfg) diff --git a/official/recommend/Wide_and_Deep/train_distribute.py b/official/recommend/Wide_and_Deep/train_distribute.py index c22d19d460a2c2ec5c0b9590b1f4a455ce449000..935e8569885299eaf4241e4a10ea21493f5d6559 100644 --- a/official/recommend/Wide_and_Deep/train_distribute.py +++ b/official/recommend/Wide_and_Deep/train_distribute.py @@ -17,9 +17,11 @@ import os import sys -from mindspore import Model, context + +import mindspore +from mindspore import Model from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.communication.management import get_rank, get_group_size, init from mindspore.common import set_seed @@ -120,18 +122,18 @@ def modelarts_pre_process(): def train_wide_and_deep(): """ train_wide_and_deep """ - context.set_context(mode=context.GRAPH_MODE, device_target=cfg.device_target, save_graphs=True) + mindspore.set_context(mode=0, device_target=cfg.device_target, save_graphs=True) if cfg.device_target == "Ascend": - context.set_context(ascend_config={"op_precision_mode": "op_precision.ini"}) + mindspore.set_context(ascend_config={"op_precision_mode": "op_precision.ini"}) _enable_graph_kernel = cfg.device_target == "GPU" if _enable_graph_kernel: - context.set_context(enable_graph_kernel=True) - context.set_context(graph_kernel_flags="--enable_cluster_ops=MatMul") + mindspore.set_context(enable_graph_kernel=True) + 
mindspore.set_context(graph_kernel_flags="--enable_cluster_ops=MatMul") init() - context.set_context(save_graphs_path='./graphs_of_device_id_'+str(get_rank())) - context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, + mindspore.set_context(save_graphs_path='./graphs_of_device_id_'+str(get_rank())) + mindspore.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, device_num=get_group_size(), all_reduce_fusion_config=[6, 12]) train_and_eval(cfg) diff --git a/official/recommend/Wide_and_Deep_Multitable/eval.py b/official/recommend/Wide_and_Deep_Multitable/eval.py index b5de832d80ef9014cceb7d0f97a5da6f3a77cc7c..065743c9103872cbad57e8cc2d14273bc0edfa37 100644 --- a/official/recommend/Wide_and_Deep_Multitable/eval.py +++ b/official/recommend/Wide_and_Deep_Multitable/eval.py @@ -16,7 +16,9 @@ import os import sys -from mindspore import Model, context + +import mindspore +from mindspore import Model from mindspore.train.serialization import load_checkpoint, load_param_into_net from src.wide_and_deep import PredictWithSigmoid, TrainStepWrap, NetWithLossClass, WideDeepModel @@ -91,5 +93,5 @@ if __name__ == "__main__": wide_and_deep_config = WideDeepConfig() wide_and_deep_config.argparse_init() compute_emb_dim(wide_and_deep_config) - context.set_context(mode=context.GRAPH_MODE, device_target="Davinci") + mindspore.set_context(mode=0, device_target="Davinci") train_and_eval(wide_and_deep_config) diff --git a/official/recommend/Wide_and_Deep_Multitable/src/wide_and_deep.py b/official/recommend/Wide_and_Deep_Multitable/src/wide_and_deep.py index 5460f1efeccf4bdeae8726cd4cfb5bfe3d298536..86af6769da9e21742d57afadfe8e8cabfa6a746e 100644 --- a/official/recommend/Wide_and_Deep_Multitable/src/wide_and_deep.py +++ b/official/recommend/Wide_and_Deep_Multitable/src/wide_and_deep.py @@ -15,8 +15,9 @@ """wide and deep model""" import numpy as np +import mindspore import mindspore.common.dtype as mstype -from mindspore import nn, context +from mindspore import nn from mindspore import Tensor, Parameter, ParameterTuple from mindspore.ops import functional as F from mindspore.ops import composite as C @@ -24,7 +25,7 @@ from mindspore.ops import operations as P from mindspore.nn import Dropout, Flatten from mindspore.nn.optim import Adam, FTRL from mindspore.common.initializer import Uniform, initializer -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.nn.wrap.grad_reducer import DistributedGradReducer @@ -550,13 +551,13 @@ class TrainStepWrap(nn.Cell): self.reducer_flag = False self.grad_reducer_w = None self.grad_reducer_d = None - parallel_mode = context.get_auto_parallel_context("parallel_mode") + parallel_mode = mindspore.get_auto_parallel_context("parallel_mode") if parallel_mode in (ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL): self.reducer_flag = True if self.reducer_flag: - mean = context.get_auto_parallel_context("gradients_mean") - degree = context.get_auto_parallel_context("device_num") + mean = mindspore.get_auto_parallel_context("gradients_mean") + degree = mindspore.get_auto_parallel_context("device_num") self.grad_reducer_w = DistributedGradReducer( self.optimizer_w.parameters, mean, degree) self.grad_reducer_d = DistributedGradReducer( diff --git a/official/recommend/Wide_and_Deep_Multitable/train_and_eval.py b/official/recommend/Wide_and_Deep_Multitable/train_and_eval.py index 0a8df002bd67b9bc923cc5972cfea24a2732c4bd..72b5971fac52aace92c3bcbd64122e9333c667f0 100644 
--- a/official/recommend/Wide_and_Deep_Multitable/train_and_eval.py
+++ b/official/recommend/Wide_and_Deep_Multitable/train_and_eval.py
@@ -16,7 +16,9 @@
 import os
 import sys
-from mindspore import Model, context
+
+import mindspore
+from mindspore import Model
 from mindspore.train.callback import ModelCheckpoint, CheckpointConfig
 from mindspore.train.callback import TimeMonitor
@@ -105,5 +107,5 @@ if __name__ == "__main__":
     wide_and_deep_config = WideDeepConfig()
     wide_and_deep_config.argparse_init()
     compute_emb_dim(wide_and_deep_config)
-    context.set_context(mode=context.GRAPH_MODE, device_target="Davinci")
+    mindspore.set_context(mode=0, device_target="Davinci")
     train_and_eval(wide_and_deep_config)
diff --git a/official/recommend/Wide_and_Deep_Multitable/train_and_eval_distribute.py b/official/recommend/Wide_and_Deep_Multitable/train_and_eval_distribute.py
index 5372c46d74d04dd3f501e452535fb5b4d1b9806f..434b75fb14a419204363db6171263bc7be1e2a58 100644
--- a/official/recommend/Wide_and_Deep_Multitable/train_and_eval_distribute.py
+++ b/official/recommend/Wide_and_Deep_Multitable/train_and_eval_distribute.py
@@ -16,10 +16,12 @@
 import os
 import sys
-from mindspore import Model, context
+
+import mindspore
+from mindspore import Model
 from mindspore.train.callback import ModelCheckpoint, CheckpointConfig
 from mindspore.train.callback import TimeMonitor
-from mindspore.context import ParallelMode
+from mindspore import ParallelMode
 from mindspore.communication.management import get_rank, get_group_size, init
 from mindspore.common import set_seed
@@ -113,8 +115,8 @@ if __name__ == "__main__":
     wide_and_deep_config = WideDeepConfig()
     wide_and_deep_config.argparse_init()
     compute_emb_dim(wide_and_deep_config)
-    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
+    mindspore.set_context(mode=0, device_target="Ascend")
     init()
-    context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True,
+    mindspore.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True,
                                       device_num=get_group_size())
     train_and_eval(wide_and_deep_config)
diff --git a/research/cv/Alexnet/train.py b/research/cv/Alexnet/train.py
index ded98799b40810e7bf7a88d5759d5417338f8454..385e92b1f1012abf1a9434a88ec9cc5484ee0975 100644
--- a/research/cv/Alexnet/train.py
+++ b/research/cv/Alexnet/train.py
@@ -55,7 +55,9 @@ def train_alexnet():
     device_target = config.device_target
     context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target)
     context.set_context(save_graphs=False)
-    if device_target == "GPU":
+    if device_target == "Ascend":
+        context.set_context(max_device_memory="56GB")
+    elif device_target == "GPU":
         context.set_context(enable_graph_kernel=True)
         context.set_context(graph_kernel_flags="--enable_cluster_ops=MatMul")
diff --git a/research/cv/RepVGG/README.md b/research/cv/RepVGG/README.md
index 859fec1c12b1d6b8e2aa25bb67e26da1d34f3ca7..37b2b5b691eb316bb8322839d8dc64d3ff5d823c 100644
--- a/research/cv/RepVGG/README.md
+++ b/research/cv/RepVGG/README.md
@@ -358,7 +358,7 @@ We need several parameters for these scripts.
 Training result will be stored in the current path, whose folder name is "LOG". Under this, you can find checkpoint files together with result like the
-followings in log.
+following in log.
 ```log
 epoch: 1 step: 1000, loss is 6.1155386
@@ -611,7 +611,6 @@ Typical outputs for folder with image:
 | Accuracy | 75.05% |
 | Model for inference | 15M (14.33M after re-parametrization)(.ckpt file) |
 | configuration | RepVGG-B0_experiment.yaml |
-| Scripts | |
 ## [Description of Random Situation](#contents)
diff --git a/research/cv/ResNeXt/README.md b/research/cv/ResNeXt/README.md
index 997b4f070e4d817959292729d96bb8ce480b5862..729735d81fd1a68337e4d2c10a6ab9692b6b6655 100644
--- a/research/cv/ResNeXt/README.md
+++ b/research/cv/ResNeXt/README.md
@@ -257,7 +257,7 @@ bash scripts/run_eval.sh 0 /opt/npu/datasets/classification/val /resnext_100.ckp
 #### Result
-Evaluation result will be stored in the scripts path. Under this, you can find result like the followings in log.
+Evaluation result will be stored in the scripts path. Under this, you can find result like the following in log.
 ```log
 acc=78.16%(TOP1)
diff --git a/research/cv/ResidualAttentionNet/README.md b/research/cv/ResidualAttentionNet/README.md
index 070246a6a87af535922885346c34d7ee2ea2129f..8469c9f98276ab883198aead165298c13a929d07 100644
--- a/research/cv/ResidualAttentionNet/README.md
+++ b/research/cv/ResidualAttentionNet/README.md
@@ -399,7 +399,7 @@ Current batch_ Size can only be set to 1.
 # example: bash run_infer_310.sh cifar10-300.mindir cifar10 /data/cifar10/ ../config/cifar10_Ascend_1p_config.yaml 0
 ```
-- Inference result will be stored in the example path, you can find result like the followings in acc.log.
+- Inference result will be stored in the example path, you can find result like the following in acc.log.
 ```bash
 Total data:10000, top1 accuracy:0.9514, top5 accuracy:0.9978.
diff --git a/research/cv/TinySAM/README.md b/research/cv/TinySAM/README.md
index bd0e17c077d17d03598b7475f25e466c5bc8b466..d01ba5f57034af42f5aebf6ca94a569a37544f3d 100644
--- a/research/cv/TinySAM/README.md
+++ b/research/cv/TinySAM/README.md
@@ -54,6 +54,10 @@ SNN-MLP
 After installing MindSpore via the official website, you can start evaluation as follows:
+### Download
+
+Download ckpts from [modelzoo](https://download-mindspore.osinfra.cn/model_zoo/research/cv/TinySAM/tinysam_mindspore.ckpt).
+
 ### Launch
 ```bash
diff --git a/research/cv/east/README.md b/research/cv/east/README.md
index d73a3aac6a4c6bdf17f7ac5f462b907c5baf2134..f9b125a59f5d955a747f4b29a21fe9140503794c 100644
--- a/research/cv/east/README.md
+++ b/research/cv/east/README.md
@@ -294,7 +294,7 @@ You can start training using python or shell scripts. The usage of shell scripts
 ### Result
-Evaluation result will be stored in the output file of evaluation script, you can find result like the followings in `log`.
+Evaluation result will be stored in the output file of evaluation script, you can find result like the following in `log`.
 ```python
 Calculated {"precision": 0.8329088130412634, "recall": 0.7871930669234473, "hmean": 0.8094059405940593, "AP": 0}
diff --git a/research/cv/eppmvsnet/README.md b/research/cv/eppmvsnet/README.md
index 32df6e627b2fd2b2d690352aaa1f49e062d77162..1d4bf6183bfacb4205606211cb6fdb786b103879 100644
--- a/research/cv/eppmvsnet/README.md
+++ b/research/cv/eppmvsnet/README.md
@@ -111,7 +111,7 @@ Parameters for EPP-MVSNet evaluation can be set in validate.py.
 ```
 Evaluation result will be stored in "./results/blendedmvs/val/metrics.txt". You can find the result like the
- followings in log.
+ following in log.
 ```python
 stage3_l1_loss:1.1738
diff --git a/research/cv/googlenet/README.md b/research/cv/googlenet/README.md
index 2366fd16bcc90d858d54141ec51e9471f58a30bb..9dcd3c773fd2a10954b7523bfd66b77524d46bf8 100644
--- a/research/cv/googlenet/README.md
+++ b/research/cv/googlenet/README.md
@@ -509,7 +509,7 @@ Current batch_ Size can only be set to 1.
 Before running the command below, you should modify the cifar10 config file. The items you should modify are batch_size and val_data_path. LABEL_FILE is only useful for imagenet,you can set any value.
- Inference result will be stored in the example path, you can find result like the followings in acc.log.
+ Inference result will be stored in the example path, you can find result like the following in acc.log.
 ```shell
 # Ascend310 inference
diff --git a/research/cv/googlenet/train.py b/research/cv/googlenet/train.py
index 653cb6db10c0a8d716ffef30ed84659ba97879ab..f3f8676ed9bc185d2174f9ae527fe4a790b2ca9c 100644
--- a/research/cv/googlenet/train.py
+++ b/research/cv/googlenet/train.py
@@ -173,6 +173,7 @@ def run_train():
     if cfg.device_target == "Ascend":
         device_id = get_device_id()
         context.set_context(device_id=device_id)
+        context.set_context(max_device_memory="56GB")
     if device_num > 1:
         context.reset_auto_parallel_context()
         context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
diff --git a/research/cv/llnet/README.md b/research/cv/llnet/README.md
index 18fc609e6531ad844419dc9284dfb07a37dbb5a2..8d1f2b3427d79d073e12fd6cc6b83bc2dd2d8f73 100644
--- a/research/cv/llnet/README.md
+++ b/research/cv/llnet/README.md
@@ -156,7 +156,7 @@ bash run_eval.sh 5 ../dataset ./ckpt_5/llnet-rank5-286_408.ckpt
 ### Result
-Evaluation result will be stored in the scripts path. Under this, you can find result like the followings in log.
+Evaluation result will be stored in the scripts path. Under this, you can find result like the following in log.
 PSNR=21.593(dB) SSIM=0.617
 ## Inference Process
diff --git a/research/cv/nasnet/README.md b/research/cv/nasnet/README.md
index 2d768697fe3a99eabea0edea969b6cf8469bd132..de8b2e7bbfcbd356f276e6848d88d29e77b01ed5 100644
--- a/research/cv/nasnet/README.md
+++ b/research/cv/nasnet/README.md
@@ -182,7 +182,7 @@ bash run_eval_for_gpu.sh 0 /dataset ./ckpt_0/nasnet-a-mobile-rank0-248_10009.ckp
 ### Result
-Evaluation result will be stored in the ./eval path. Under this, you can find result like the followings in `eval.log`.
+Evaluation result will be stored in the ./eval path. Under this, you can find result like the following in `eval.log`.
 acc=74.39%(TOP1,Ascend)
 acc=73.5%(TOP1,GPU)
diff --git a/research/cv/osnet/README.md b/research/cv/osnet/README.md
index bc24c8dd00973ad5df11f87512e9e193e91ab654..2bbafb4798fd01f59b72fd6304301ff6cdda05c5 100644
--- a/research/cv/osnet/README.md
+++ b/research/cv/osnet/README.md
@@ -349,7 +349,7 @@ You can start evaluating using python or shell scripts. The usage of shell scrip
 ### Result
-Evaluation result will be stored in the output file of evaluation script, you can find result like the followings in `eval.log`.
+Evaluation result will be stored in the output file of evaluation script, you can find result like the following in `eval.log`.
 ```python
 ** Results **
diff --git a/research/cv/pnasnet/README.md b/research/cv/pnasnet/README.md
index 951a5b352c65852919154bb0eb9e50e99653b574..9f406d147ac6509ea8b61bc1937fe44ad8dd225a 100644
--- a/research/cv/pnasnet/README.md
+++ b/research/cv/pnasnet/README.md
@@ -203,7 +203,7 @@ You can find checkpoint file together with result in log.
 ### Result
-Evaluation result will be stored in the scripts path. Under this, you can find result like the followings in log.
+Evaluation result will be stored in the scripts path. Under this, you can find result like the following in log.
 - running on Ascend
diff --git a/research/cv/proxylessnas/README.md b/research/cv/proxylessnas/README.md
index ddca8cd22e5fbe7b39b5870c2ca6746a9052539e..fb55b1cc087abeced11c92d49b1248aa2df92b54 100644
--- a/research/cv/proxylessnas/README.md
+++ b/research/cv/proxylessnas/README.md
@@ -143,7 +143,7 @@ bash run_eval_for_ascend.sh 0 /dataset ./train_parallel3/ckpt_3/proxylessnas-mob
 ### Result
-Evaluation result will be stored in the scripts path. Under this, you can find result like the followings in log.
+Evaluation result will be stored in the scripts path. Under this, you can find result like the following in log.
 acc=75.04%(TOP1)
 ## Inference Process
diff --git a/research/cv/repvgg/__init__.py b/research/cv/repvgg/__init__.py
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..602527cd720c8d268599dbaef190ba1cf1eb6f2b 100644
--- a/research/cv/repvgg/__init__.py
+++ b/research/cv/repvgg/__init__.py
@@ -0,0 +1,14 @@
+# Copyright 2022 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
diff --git a/research/cv/repvgg/eval.py b/research/cv/repvgg/eval.py
index 5907eea9dcb564e2089198e0b35b1210a453b998..87848ba75f7e0df551a9dd7412d0eb720216a707 100644
--- a/research/cv/repvgg/eval.py
+++ b/research/cv/repvgg/eval.py
@@ -12,45 +12,64 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ============================================================================
-"""eval"""
+"""Evaluation script.
Need training config.""" +import os +from functools import reduce + +import mindspore as ms from mindspore import Model from mindspore import context from mindspore import nn from mindspore.common import set_seed +from mindspore.train.callback import TimeMonitor -from src.args import args -from src.tools.cell import cast_amp +from src.tools.amp import cast_amp from src.tools.criterion import get_criterion, NetWithLoss -from src.tools.get_misc import get_dataset, set_device, get_model, pretrained, get_train_one_step +from src.tools.utils import pretrained, get_train_one_step +from src.dataset import create_dataset_imagenet from src.tools.optimizer import get_optimizer +from src.repvgg import get_model, switch_net_to_deploy -set_seed(args.seed) - - -def main(): - context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target) - context.set_context(enable_graph_kernel=False) - if args.device_target == "Ascend": - context.set_context(enable_auto_mixed_precision=True) - set_device(args) - # get model +def eval_ckpt(args): + print('=== Use checkpoint ===') net = get_model(args) - cast_amp(net) + cast_amp(net, args) criterion = get_criterion(args) net_with_loss = NetWithLoss(net, criterion) if args.pretrained: pretrained(args, net) - data = get_dataset(args, training=False) - batch_num = data.val_dataset.get_dataset_size() + print( + 'Number of parameters (before deploy):', + sum( + reduce(lambda x, y: x * y, params.shape) + for params in net.trainable_params() + ) + ) + switch_net_to_deploy(net) + print( + 'Number of parameters (after deploy):', + sum( + reduce(lambda x, y: x * y, params.shape) + for params in net.trainable_params() + ) + ) + cast_amp(net, args) + net.set_train(False) + + data = create_dataset_imagenet( + str(args.dataset_path), args, training=False + ) + batch_num = data.get_dataset_size() optimizer = get_optimizer(args, net, batch_num) - # save a yaml file to read to record parameters net_with_loss = get_train_one_step(args, net_with_loss, optimizer) - eval_network = nn.WithEvalCell(net, criterion, args.amp_level in ["O2", "O3", "auto"]) + eval_network = nn.WithEvalCell( + net, criterion, args.amp_level in ['O2', 'O3', 'auto'] + ) eval_indexes = [0, 1, 2] eval_metrics = {'Loss': nn.Loss(), 'Top1-Acc': nn.Top1CategoricalAccuracy(), @@ -58,10 +77,58 @@ def main(): model = Model(net_with_loss, metrics=eval_metrics, eval_network=eval_network, eval_indexes=eval_indexes) - print(f"=> begin eval") - results = model.eval(data.val_dataset) - print(f"=> eval results:{results}") - print(f"=> eval success") + + print('=> begin eval') + results = model.eval(data, callbacks=[TimeMonitor()]) + return results + + +def eval_mindir(args): + print('=== Use MINDIR model ===') + data = create_dataset_imagenet( + str(args.dataset_path), args, training=False + ) + iterator = data.create_dict_iterator(num_epochs=1) + + graph = ms.load(str(args.pretrained)) + net = nn.GraphCell(graph) + metrics = { + 'Top1-Acc': nn.Top1CategoricalAccuracy(), + 'Top5-Acc': nn.Top5CategoricalAccuracy(), + } + print('=> begin eval') + for batch in iterator: + y_pred = net(batch['image']) + for metric in metrics.values(): + metric.update(y_pred, batch['label']) + + return {name: metric.eval() for name, metric in metrics.items()} + + +def main(): + """Entry point.""" + from src.config import run_args + args = run_args() + + set_seed(args.seed) + context.set_context(mode=context.GRAPH_MODE, + device_target=args.device_target) + context.set_context(enable_graph_kernel=False) + if args.device_target == 
'Ascend':
+        context.set_context(enable_auto_mixed_precision=True)
+
+    os.environ["RANK_SIZE"] = '0'
+
+    # get model
+    if args.pretrained.suffix == '.ckpt':
+        results = eval_ckpt(args)
+    elif args.pretrained.suffix == '.mindir':
+        results = eval_mindir(args)
+    else:
+        raise ValueError('Incorrect checkpoint format')
+
+    print(f'=> eval results: {results}')
+    print('=> eval success')


 if __name__ == '__main__':
diff --git a/research/cv/repvgg/export.py b/research/cv/repvgg/export.py
index e7b990331a9e57eb3c01a0196f8a4904d03fcd36..93a589bf8ec9304f2e677ab827b9457a280c62ff 100644
--- a/research/cv/repvgg/export.py
+++ b/research/cv/repvgg/export.py
@@ -12,37 +12,58 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ============================================================================
-"""
-##############export checkpoint file into air, onnx or mindir model#################
-python export.py
-"""
-
+"""Export model to MINDIR, AIR or ONNX format. Need training config."""
 import numpy as np
-from mindspore import Tensor, load_checkpoint, load_param_into_net, export, context
+from mindspore import (
+    Tensor, load_checkpoint, load_param_into_net, export, context
+)
 from mindspore import dtype as mstype
+from mindspore import nn

-from src.args import args
-from src.tools.cell import cast_amp
-from src.tools.criterion import get_criterion, NetWithLoss
-from src.tools.get_misc import get_model
+from src.config import run_args
+from src.tools.amp import cast_amp
+from src.repvgg import get_model, switch_net_to_deploy

-context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target)
-if args.device_target in ["Ascend", "GPU"]:
-    context.set_context(device_id=args.device_id)

+class NetWithSoftmax(nn.Cell):
+    """Network with softmax at the end."""

-if __name__ == '__main__':
+    def __init__(self, net):
+        super().__init__()
+        self.net = net
+        self.softmax = nn.Softmax()
+
+    def construct(self, x):
+        return self.softmax(self.net(x))
+
+
+def main():
+    """Entry point."""
+    args = run_args()
+    context.set_context(mode=context.GRAPH_MODE,
+                        device_target=args.device_target)
+    if args.device_target in ['Ascend', 'GPU']:
+        context.set_context(device_id=args.device_id)
     net = get_model(args)
-    criterion = get_criterion(args)
-    cast_amp(net)
-    net_with_loss = NetWithLoss(net, criterion)
-    assert args.pretrained is not None, "checkpoint_path is None."
+    cast_amp(net, args)
+    assert args.pretrained is not None, 'checkpoint_path is None.'
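For reference, the `eval_ckpt` hunk above prints the trainable-parameter count before and after `switch_net_to_deploy` folds RepVGG's parallel branches into single convolutions. A minimal sketch of that counting idiom, assuming `net` is any `mindspore.nn.Cell`; the explicit initial value `1` in `reduce` also guards against zero-dimensional parameters:

```python
from functools import reduce

def count_params(net):
    """Sum the element counts of all trainable parameters of `net`."""
    return sum(
        reduce(lambda x, y: x * y, p.shape, 1)
        for p in net.trainable_params()
    )
```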
-    param_dict = load_checkpoint(args.pretrained)
+    param_dict = load_checkpoint(str(args.pretrained))
     load_param_into_net(net, param_dict)
-
+    switch_net_to_deploy(net)
+    cast_amp(net, args)
+    net = NetWithSoftmax(net)
     net.set_train(False)
     net.to_float(mstype.float32)
-    input_arr = Tensor(np.zeros([1, 3, args.image_size, args.image_size], np.float32))
-    export(net, input_arr, file_name=args.arch, file_format=args.file_format)
+    input_arr = Tensor(
+        np.zeros([1, 3, args.image_size, args.image_size], np.float32)
+    )
+    export(
+        net, input_arr, file_name=str(args.export_path),
+        file_format=args.file_format
+    )
+
+
+if __name__ == '__main__':
+    main()
diff --git a/research/cv/repvgg/infer_onnx.py b/research/cv/repvgg/infer_onnx.py
index d01128fb6c3bb6edadae3ab52fd7f41d3e6f1684..e6e9ae1276a6de7dc1fbbd0c2bf4f8245cb0ed62 100644
--- a/research/cv/repvgg/infer_onnx.py
+++ b/research/cv/repvgg/infer_onnx.py
@@ -1,3 +1,5 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
 # Copyright 2022 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -12,21 +14,95 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ============================================================================
-"""eval"""
+"""
+Run prediction on a folder or a single image, print the results and save
+them to a JSON file.
+"""
+import argparse
+import json
+from pathlib import Path
+
+from PIL import Image
 import onnxruntime as ort
-from mindspore import nn

-from src.args import args
-from src.data.imagenet import create_dataset_imagenet
+from src.dataset import get_transforms
+
+
+def parse_args():
+    """
+    Create and parse command-line arguments.
+
+    Returns
+    -------
+    argparse.Namespace
+        Parsed command-line arguments.
+    """
+    parser = argparse.ArgumentParser(
+        description=__doc__, add_help=False,
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+
+    parser.add_argument('-h', '--help', action='help',
+                        default=argparse.SUPPRESS,
+                        help='Show this help message and exit.')
+    parser.add_argument('data', type=Path,
+                        help='Path to dataset for prediction.')
+    parser.add_argument('-c', '--onnx_path', type=Path,
+                        help='Path to ONNX file.')
+    parser.add_argument(
+        '-o', '--output', type=Path, default=Path('predictions.json'),
+        help='Path to output JSON file.'
+    )
+    parser.add_argument('--image_size', type=int, default=224,
+                        help='Image size.')
+    parser.add_argument(
+        '--device_target', default='CPU', choices=['GPU', 'CPU'],
+        help='Target computation platform.'
+ ) -def create_session(onnx_path, target_device): - if target_device == 'GPU': + return parser.parse_args() + + +def data_loader(path: Path, image_size: int): + """Load image or images from folder in generator.""" + preprocess = get_transforms(image_size=image_size, + training=False) + + def apply(img): + for p in preprocess: + img = p(img) + return img + extensions = ('.png', '.jpg', '.jpeg') + if path.is_dir(): + print('=' * 5, ' Load directory ', '=' * 5) + for item in path.iterdir(): + if item.is_dir(): + continue + if item.suffix.lower() not in extensions: + continue + image = Image.open(str(item)) + image = image.convert('RGB') + image = apply(image) + yield str(item), image[None] + else: + print('=' * 5, ' Load single image ', '=' * 5) + assert path.suffix.lower() in extensions + + image = Image.open(str(path)) + image = image.convert('RGB') + image = apply(image) + yield str(path), image[None] + + +def create_session(onnx_path, device_target): + """Create ONNX inference session.""" + if device_target == 'GPU': providers = ['CUDAExecutionProvider'] - elif target_device == 'CPU': + elif device_target == 'CPU': providers = ['CPUExecutionProvider'] else: raise ValueError( - f'Unsupported target device {target_device}, ' + f'Unsupported target device {device_target}, ' f'Expected one of: "CPU", "GPU"' ) session = ort.InferenceSession(onnx_path, providers=providers) @@ -34,23 +110,25 @@ def create_session(onnx_path, target_device): return session, input_name -def run_eval(onnx_path, data_dir, target_device): - session, input_name = create_session(onnx_path, target_device) - args.batch_size = 1 - dataset = create_dataset_imagenet(data_dir, args, training=False) - metrics = { - 'top-1 accuracy': nn.Top1CategoricalAccuracy(), - 'top-5 accuracy': nn.Top5CategoricalAccuracy(), - } - for batch in dataset.create_dict_iterator(num_epochs=1, output_numpy=True): - y_pred = session.run(None, {input_name: batch['image']})[0] - for metric in metrics.values(): - metric.update(y_pred, batch['label']) - return {name: metric.eval() for name, metric in metrics.items()} +def main(): + """Entry point.""" + args = parse_args() + loader = data_loader(args.data, args.image_size) + session, input_name = create_session( + str(args.onnx_path), args.device_target + ) -if __name__ == '__main__': + d = {} - results = run_eval(args.onnx_path, args.dataset_path, args.device_target) - for name, value in results.items(): - print(f'{name}: {value:.5f}') + for (name, img) in loader: + res = session.run(None, {input_name: img})[0].argmax() + print(name, f'(class: {res})') + d[name] = int(res) + + with args.output.open(mode='w') as f: + json.dump(d, f, indent=1) + + +if __name__ == '__main__': + main() diff --git a/research/cv/repvgg/requriments.txt b/research/cv/repvgg/requriments.txt index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..b209b4fd44203c147ea0c0ba14f7dd907b1d0920 100644 --- a/research/cv/repvgg/requriments.txt +++ b/research/cv/repvgg/requriments.txt @@ -0,0 +1,6 @@ +numpy==1.21.6 +onnxruntime-gpu==1.13.1 +PyYAML==6.0 +matplotlib==3.5.3 +Pillow==9.2.0 +tqdm==4.64.1 diff --git a/research/cv/repvgg/scripts/run_infer_onnx.sh b/research/cv/repvgg/scripts/run_infer_onnx.sh index f1695ae29af7a1d6af4cb85b37601285927ddfe0..d8ac18ee32a7702aaddb38b195d5e9a968642da4 100644 --- a/research/cv/repvgg/scripts/run_infer_onnx.sh +++ b/research/cv/repvgg/scripts/run_infer_onnx.sh @@ -13,9 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
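The rewritten `infer_onnx.py` above builds an `onnxruntime` session for the chosen provider and takes the argmax of the raw model output per image. A stripped-down sketch of the same flow, with `model.onnx` as a placeholder path and a zero array standing in for a preprocessed NCHW image:

```python
import numpy as np
import onnxruntime as ort

session = ort.InferenceSession('model.onnx',
                               providers=['CPUExecutionProvider'])
input_name = session.get_inputs()[0].name

img = np.zeros((1, 3, 224, 224), dtype=np.float32)  # preprocessed image stand-in
class_id = session.run(None, {input_name: img})[0].argmax()
print(int(class_id))
```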
 # ============================================================================
-
-if [[ $# -lt 2 || $# -gt 4 ]]; then
-    echo "Usage: bash run_infer_onnx.sh [ONNX_PATH] [DATASET_PATH] [DEVICE_TARGET(optional)] [DEVICE_ID(optional)]"
+if [[ $# -lt 3 || $# -gt 4 ]]; then
+    echo "Usage: bash scripts/run_infer_onnx.sh [ONNX_PATH] [DATA_PATH] [OUTPUT_PATH] [DEVICE_TARGET(optional)]"
     exit 1
 fi
@@ -23,32 +22,29 @@
 get_real_path(){
     if [ "${1:0:1}" == "/" ]; then
         echo "$1"
     else
-        echo "$(realpath -m $PWD/$1)"
+        realpath -m "$PWD"/"$1"
     fi
 }

-onnx_path=$(get_real_path $1)
-dataset_path=$(get_real_path $2)
-if [ $# -eq 3 ]; then
-    device_target=$3
+
+ONNX_PATH=$(get_real_path "$1")
+DATA_PATH=$(get_real_path "$2")
+OUTPUT_PATH=$(get_real_path "$3")
+
+if [ ! -f "$ONNX_PATH" ]
+then
+    echo "error: ONNX_PATH=$ONNX_PATH is not a file"
+    exit 1
 fi
+
 if [ $# -eq 4 ]; then
-    device_id=$4
+    DEVICE_TARGET="$4"
+else
+    DEVICE_TARGET=CPU
 fi

-echo "onnx_path: "$onnx_path
-echo "dataset_path: "$dataset_path
-echo "device_target: "$device_target
-echo "device_id: "$device_id
+echo "onnx_path: $ONNX_PATH"
+echo "dataset_path: $DATA_PATH"
+echo "output_path: $OUTPUT_PATH"
+echo "device_target: $DEVICE_TARGET"

-function infer()
-{
-    python ./infer_onnx.py --onnx_path=$onnx_path \
-                           --dataset_path=$dataset_path \
-                           --device_target=$device_target \
-                           --device_id=$device_id &> infer_onnx.log
-}
-infer
-if [ $? -ne 0 ]; then
-    echo " execute inference failed"
-    exit 1
-fi
\ No newline at end of file
+python ./infer_onnx.py "$DATA_PATH" --onnx_path "$ONNX_PATH" --device_target "$DEVICE_TARGET" --output "$OUTPUT_PATH" &> infer_onnx.log &
\ No newline at end of file
diff --git a/research/cv/repvgg/src/tools/__init__.py b/research/cv/repvgg/src/tools/__init__.py
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..69dbde1462bbc9868e2589fd1a701c02bbeb8742 100644
--- a/research/cv/repvgg/src/tools/__init__.py
+++ b/research/cv/repvgg/src/tools/__init__.py
@@ -0,0 +1,15 @@
+# Copyright 2022 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""Init training tools."""
diff --git a/research/cv/repvgg/src/tools/callback.py b/research/cv/repvgg/src/tools/callback.py
index 2b6cb4bed0c3789dc45665ac24cf5e70c896b9e8..eeb059180339b5c195ff711130e08863f6ccc19b 100644
--- a/research/cv/repvgg/src/tools/callback.py
+++ b/research/cv/repvgg/src/tools/callback.py
@@ -12,39 +12,325 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
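Because the rewritten launcher above backgrounds `infer_onnx.py` and redirects its output to `infer_onnx.log`, the per-image results land in the JSON file passed via `--output`. A small sketch of consuming that file, assuming the default `predictions.json` name; the format is a plain `{image_path: class_id}` mapping:

```python
import json
from pathlib import Path

preds = json.loads(Path('predictions.json').read_text())
for path, class_id in sorted(preds.items()):
    print(f'{path}: class {class_id}')
```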
 # ============================================================================
-"""callback function"""
+"""Custom training and evaluation callbacks."""
+import time

-from mindspore.train.callback import Callback
+from pathlib import Path
+from operator import lt, gt

-from src.args import args
+import mindspore as ms
+from mindspore._checkparam import Validator
+from mindspore import Callback, SummaryCollector, SummaryRecord, RunContext


-class EvaluateCallBack(Callback):
-    """EvaluateCallBack"""
+class BestCheckpointSavingCallback(Callback):
+    """Callback to save the best model checkpoints during training."""

-    def __init__(self, model, eval_dataset, src_url, train_url, total_epochs, save_freq=50):
-        super(EvaluateCallBack, self).__init__()
-        self.model = model
-        self.eval_dataset = eval_dataset
-        self.src_url = src_url
-        self.train_url = train_url
-        self.total_epochs = total_epochs
-        self.save_freq = save_freq
-        self.best_acc = 0.
+    def __init__(
+        self,
+        ckpt_dir,
+        target_metric='acc',
+        best_is_max=True,
+        prefix='',
+        buffer=5
+    ):
+        """
+        Initialize the checkpoint-saving callback.
+
+        Parameters
+        ----------
+        ckpt_dir: str
+            Directory to save checkpoints to.
+        target_metric: str
+            Name of the metric listed in the `metrics` parameter of Model.
+        best_is_max: bool
+            Flag choosing whether a higher or lower metric value is better.
+            For example:
+            - if `target_metric=loss` then `best_is_max` should be False
+            - if `target_metric=acc` then `best_is_max` should be True
+        prefix: str
+            Prefix of the saved checkpoint file.
+        buffer: int
+            Max number of saved checkpoints.
+        """
+        self.ckpt_dir = Path(ckpt_dir)
+        self._make_dir()
+        self.target_metric = target_metric
+        self.best_is_max = best_is_max
+        self.prefix = prefix
+        if best_is_max:
+            self.best_metric = float('-inf')
+            self.compare = lt
+        else:
+            self.best_metric = float('inf')
+            self.compare = gt
+
+        self.current_ckpt = []
+        self.buffer_size = buffer
+
+    def _make_dir(self):
+        """Create a checkpoint directory."""
+        if not self.ckpt_dir.exists():
+            self.ckpt_dir.mkdir(parents=True)
+            print(f'Directory created: {self.ckpt_dir}')
+        else:
+            print(f'Warning! Directory already exists: {self.ckpt_dir}')
+
+    def _save_checkpoint(self, network, epoch):
+        """
+        Save a checkpoint.
+
+        Parameters
+        ----------
+        network
+            Network to save the checkpoint for.
+        epoch: int
+            Current epoch number, used in the checkpoint file name.
+        """
+        # TODO: May not work with ModelArts or distributed training.
+        if not float('-inf') < self.best_metric < float('inf'):
+            return
+        ckpt_name = f'epoch={epoch}_' \
+                    f'{self.target_metric}={self.best_metric:.3f}.ckpt'
+        if self.prefix:
+            ckpt_name = f'{self.prefix}_{ckpt_name}'
+        ms.save_checkpoint(network, str(self.ckpt_dir / ckpt_name))
+        self.current_ckpt.append(self.ckpt_dir / ckpt_name)
+        if len(self.current_ckpt) > self.buffer_size:
+            removed = self.current_ckpt[0]
+            removed.unlink()
+            del self.current_ckpt[0]
+
+    def on_eval_end(self, run_context: RunContext):
+        """
+        Check the metrics and save a checkpoint if needed after evaluation
+        completes.
+
+        Parameters
+        ----------
+        run_context: RunContext
+
+        """
+        cb_params = run_context.original_args()
+        metrics = {k: v for k, v in cb_params.eval_results.items() if v != 0}
+        if self.target_metric not in metrics:
+            raise KeyError(
+                f'Target metric {self.target_metric} is not in '
+                'cb_params.metrics.'
+            )
+        # If the new metric is better than the previous "best"
+        if self.compare(self.best_metric, metrics[self.target_metric]):
+            self.best_metric = metrics[self.target_metric]
+            self._save_checkpoint(
+                cb_params.network, epoch=cb_params.cur_epoch_num
+            )
-    def epoch_end(self, run_context):
+
+class SummaryCallbackWithEval(SummaryCollector):
+    """
+    Callback that collects common information like SummaryCollector.
+
+    Additionally, this callback collects:
+    - learning rate
+    - validation loss
+    - validation accuracy
+    """
+
+    def __init__(
+        self,
+        summary_dir,
+        collect_freq=10,
+        collect_specified_data=None,
+        keep_default_action=True,
+        custom_lineage_data=None,
+        collect_tensor_freq=None,
+        max_file_size=None,
+        export_options=None
+    ):
+        super().__init__(
+            summary_dir,
+            collect_freq,
+            collect_specified_data,
+            keep_default_action,
+            custom_lineage_data,
+            collect_tensor_freq,
+            max_file_size,
+            export_options
+        )
+        self.entered_count = 0
+
+    def on_train_epoch_end(self, run_context: RunContext):
         """
-        Test when epoch end, save best model with best.ckpt.
+        Collect the learning rate after a training epoch.
+
+        Parameters
+        ----------
+        run_context: RunContext
         """
         cb_params = run_context.original_args()
-        if cb_params.cur_epoch_num > self.total_epochs * 0.8:
-            cur_epoch_num = cb_params.cur_epoch_num
-            result = self.model.eval(self.eval_dataset)
-            if result["acc"] > self.best_acc:
-                self.best_acc = result["acc"]
-            print("epoch: %s acc: %s, best acc is %s" %
-                  (cb_params.cur_epoch_num, result["acc"], self.best_acc), flush=True)
-            if args.run_modelarts:
-                import moxing as mox
-                if cur_epoch_num % self.save_freq == 0:
-                    mox.file.copy_parallel(src_url=self.src_url, dst_url=self.train_url)
+        optimizer = cb_params.get('optimizer')
+        if optimizer is None:
+            optimizer = getattr(cb_params.network, 'optimizer', None)
+        if optimizer is None:
+            print('Warning: There is no optimizer found!')
+        else:
+            global_step = optimizer.global_step
+            lr = optimizer.learning_rate(global_step)
+            self._record.add_value('scalar', 'Train/learning_rate',
+                                   ms.Tensor(lr))
+            self._record.record(cb_params.cur_epoch_num)
+        super().on_train_epoch_end(run_context)
+
+    def on_eval_end(self, run_context: RunContext):
+        """
+        Collect metrics after evaluation completes.
+
+        Parameters
+        ----------
+        run_context: RunContext
+        """
+        cb_params = run_context.original_args()
+        metrics = {k: v for k, v in cb_params.eval_results.items() if v != 0}
+        print(
+            'Result metrics', f'epoch {cb_params.cur_epoch_num}: ',
+            {key: metrics[key] for key in sorted(metrics)}
+        )
+
+        for metric_name, value in metrics.items():
+            self._record.add_value(
+                'scalar', f'Metrics/{metric_name}', ms.Tensor(value)
+            )
+        self._record.record(cb_params.cur_epoch_num)
+        self._record.flush()
+
+    def __enter__(self):
+        """
+        Enter the context manager, making sure the SummaryRecord is created
+        only once.
+        """
+        if self.entered_count == 0:
+            self._record = SummaryRecord(log_dir=self._summary_dir,
+                                         max_file_size=self._max_file_size,
+                                         raise_exception=False,
+                                         export_options=self._export_options)
+            self._first_step, self._dataset_sink_mode = True, True
+        self.entered_count += 1
+        return self
+
+    def __exit__(self, *err):
+        """
+        Exit the context manager, making sure the SummaryRecord is closed
+        correctly.
+        """
+        self.entered_count -= 1
+        if self.entered_count == 0:
+            super().__exit__(*err)
+
+
+class TrainTimeMonitor(Callback):
+    """Monitor the time in the train process.
+
+    Parameters
+    ----------
+    data_size: int
+        Number of steps between prints. If the program gets `batch_num`
+        during training, `data_size` will be set to `batch_num`; otherwise
+        `data_size` will be used. Default: None
+
+    Raises
+    ------
+    ValueError: If data_size is not a positive int.
+    """
+
+    def __init__(self, data_size=None):
+        super().__init__()
+        self.data_size = data_size
+        self.epoch_time = time.time()
+
+    def on_train_epoch_begin(self, run_context):
+        """Record the time at the beginning of an epoch.
+
+        Parameters
+        ----------
+        run_context: RunContext
+            Context of the running process. For more details, please refer to
+            :class:`mindspore.RunContext`
+        """
+        self.epoch_time = time.time()
+
+    def on_train_epoch_end(self, run_context):
+        """Print the elapsed time at the end of an epoch.

+        Parameters
+        ----------
+        run_context: RunContext
+            Context of the running process. For more details, please refer to
+            :class:`mindspore.RunContext`
+        """
+        epoch_seconds = (time.time() - self.epoch_time) * 1000
+        step_size = self.data_size
+        cb_params = run_context.original_args()
+        mode = cb_params.get('mode', '')
+        if hasattr(cb_params, 'batch_num'):
+            batch_num = cb_params.batch_num
+            if isinstance(batch_num, int) and batch_num > 0:
+                step_size = cb_params.batch_num
+        Validator.check_positive_int(step_size)
+
+        step_seconds = epoch_seconds / step_size
+        print('{} epoch time: {:5.3f} ms, per step time: {:5.3f} ms'.format
+              (mode.title(), epoch_seconds, step_seconds), flush=True)
+
+
+class EvalTimeMonitor(Callback):
+    """Monitor the time in the eval process.
+
+    Parameters
+    ----------
+    data_size: int
+        Number of steps between prints. If the program gets `batch_num`
+        during training, `data_size` will be set to `batch_num`; otherwise
+        `data_size` will be used. Default: None
+
+    Raises
+    ------
+    ValueError: If data_size is not a positive int.
+    """
+
+    def __init__(self, data_size=None):
+        super().__init__()
+        self.data_size = data_size
+        self.epoch_time = time.time()
+
+    def on_eval_epoch_begin(self, run_context):
+        """Record the time at the beginning of an epoch.
+
+        Parameters
+        ----------
+        run_context:
+            Context of the running process. For more details, please refer to
+            :class:`mindspore.RunContext`
+        """
+        self.epoch_time = time.time()
+
+    def on_eval_epoch_end(self, run_context):
+        """Print the elapsed time at the end of an epoch.
+
+        Parameters
+        ----------
+        run_context:
+            Context of the running process. For more details, please refer to
+            :class:`mindspore.RunContext`
+        """
+        epoch_seconds = (time.time() - self.epoch_time) * 1000
+        step_size = self.data_size
+        cb_params = run_context.original_args()
+        mode = cb_params.get('mode', '')
+        if hasattr(cb_params, 'batch_num'):
+            batch_num = cb_params.batch_num
+            if isinstance(batch_num, int) and batch_num > 0:
+                step_size = cb_params.batch_num
+        Validator.check_positive_int(step_size)
+
+        step_seconds = epoch_seconds / step_size
+        print('{} epoch time: {:5.3f} ms, per step time: {:5.3f} ms'.format
+              (mode.title(), epoch_seconds, step_seconds), flush=True)
diff --git a/research/cv/repvgg/src/tools/criterion.py b/research/cv/repvgg/src/tools/criterion.py
index 0c3254ec04353dbfb6327cd302d776e1c6c93b38..4516bd0ba8a9f9613f10f7ba2d8fc29f76c9c9f5 100644
--- a/research/cv/repvgg/src/tools/criterion.py
+++ b/research/cv/repvgg/src/tools/criterion.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
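`BestCheckpointSavingCallback` above encodes "did the metric improve?" with `operator.lt`/`operator.gt`: when higher is better, `compare(best, new)` evaluates `best < new`. A self-contained sketch of the trick:

```python
from operator import lt, gt

best_is_max = True  # e.g. target_metric='acc'; use False for loss-like metrics
compare = lt if best_is_max else gt
best = float('-inf') if best_is_max else float('inf')

for new in (0.71, 0.69, 0.74):
    if compare(best, new):  # True exactly when `new` beats `best`
        best = new
print(best)  # 0.74
```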
 # ============================================================================
-"""functions of criterion"""
+"""Optimized criterion functionality."""
 import mindspore.nn as nn
 from mindspore import Tensor
 from mindspore import ops
@@ -41,18 +41,25 @@ class SoftTargetCrossEntropy(LossBase):
 class CrossEntropySmooth(LossBase):
     """CrossEntropy"""

-    def __init__(self, sparse=True, reduction='mean', smooth_factor=0., num_classes=1000):
+    def __init__(
+            self, sparse=True, reduction='mean', smooth_factor=0.,
+            num_classes=1000
+    ):
         super(CrossEntropySmooth, self).__init__()
         self.onehot = P.OneHot()
         self.sparse = sparse
         self.on_value = Tensor(1.0 - smooth_factor, mstype.float32)
-        self.off_value = Tensor(1.0 * smooth_factor / (num_classes - 1), mstype.float32)
+        self.off_value = Tensor(
+            1.0 * smooth_factor / (num_classes - 1), mstype.float32
+        )
         self.ce = nn.SoftmaxCrossEntropyWithLogits(reduction=reduction)
         self.cast = ops.Cast()

     def construct(self, logit, label):
         if self.sparse:
-            label = self.onehot(label, F.shape(logit)[1], self.on_value, self.off_value)
+            label = self.onehot(
+                label, F.shape(logit)[1], self.on_value, self.off_value
+            )
         label = P.Cast()(label, mstype.float32)
         logit = P.Cast()(logit, mstype.float32)
         loss2 = self.ce(logit, label)
@@ -74,15 +81,15 @@ def get_criterion(args):
                                        num_classes=args.num_classes)
     else:
         print(25 * "=" + "Using Simple CE" + 25 * "=")
-        criterion = CrossEntropySmooth(sparse=True, reduction="mean", num_classes=args.num_classes)
+        criterion = CrossEntropySmooth(
+            sparse=True, reduction="mean", num_classes=args.num_classes
+        )

     return criterion


 class NetWithLoss(nn.Cell):
-    """
-    NetWithLoss: Only support Network with Classfication
-    """
+    """NetWithLoss: only supports classification networks."""

     def __init__(self, model, criterion):
         super(NetWithLoss, self).__init__()
diff --git a/research/cv/repvgg/src/tools/optimizer.py b/research/cv/repvgg/src/tools/optimizer.py
index 05f407f579b76358a14d53975dab58e61012f3e7..f2bb2553fe13a5ee570a12374cbc290f49f53b6e 100644
--- a/research/cv/repvgg/src/tools/optimizer.py
+++ b/research/cv/repvgg/src/tools/optimizer.py
@@ -12,9 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ============================================================================
-"""Functions of optimizer"""
-import os
-
+"""Optimizer creation."""
 import numpy as np
 from mindspore.nn.optim import AdamWeightDecay
 from mindspore.nn.optim.momentum import Momentum
@@ -29,30 +27,26 @@ def get_learning_rate(args, batch_num):

 def get_optimizer(args, model, batch_num):
     """Get optimizer for training"""
-    print(f"=> When using train_wrapper, using optimizer {args.optimizer}")
+    print(f'=> When using train_wrapper, using optimizer {args.optimizer}')
     optim_type = args.optimizer.lower()
     params = get_param_groups(model)
     learning_rate = get_learning_rate(args, batch_num)
-    step = int(args.start_epoch * batch_num)
-    accumulation_step = int(args.accumulation_step)
-    learning_rate = learning_rate[step::accumulation_step]
+    step = int(args.start_epoch * batch_num) + args.start_step
     train_step = len(learning_rate)
-    print(f"=> Get LR from epoch: {args.start_epoch}\n"
-          f"=> Start step: {step}\n"
-          f"=> Total step: {train_step}\n"
-          f"=> Accumulation step:{accumulation_step}")
-    if accumulation_step > 1:
-        learning_rate = learning_rate * accumulation_step
-    learning_rate = learning_rate * args.batch_size * int(os.getenv("DEVICE_NUM", "1")) / 256.
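`CrossEntropySmooth` above builds its smoothed one-hot targets from `on_value = 1 - smooth_factor` and `off_value = smooth_factor / (num_classes - 1)`. A NumPy sketch of the target it produces, with illustrative values for `smooth_factor`, `num_classes` and the label:

```python
import numpy as np

s, num_classes, label = 0.1, 5, 2
target = np.full(num_classes, s / (num_classes - 1), dtype=np.float32)  # off_value
target[label] = 1.0 - s                                                 # on_value
print(target)  # [0.025 0.025 0.9 0.025 0.025]
```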
-    print("learning_rate", np.max(learning_rate))
-    if optim_type == "momentum":
+    learning_rate = learning_rate[step:]
+    print(f'=> Get LR from epoch: {args.start_epoch}\n'
+          f'=> Start step: {step}\n'
+          f'=> Total step: {train_step}\n')
+
+    print('learning_rate', np.max(learning_rate))
+    if optim_type == 'momentum':
         optim = Momentum(
             params=params,
             learning_rate=learning_rate,
             momentum=args.momentum,
             weight_decay=args.weight_decay
         )
-    elif optim_type == "adamw":
+    elif optim_type == 'adamw':
         optim = AdamWeightDecay(
             params=params,
             learning_rate=learning_rate,
@@ -62,23 +56,27 @@ def get_optimizer(args, model, batch_num):
             weight_decay=args.weight_decay
         )
     else:
-        raise ValueError(f"optimizer {optim_type} is not supported")
+        raise ValueError(f'optimizer {optim_type} is not supported')

     return optim


 def get_param_groups(network):
-    """ get param groups """
+    """get param groups"""
     decay_params = []
     no_decay_params = []
     for x in network.trainable_params():
         parameter_name = x.name
-        if parameter_name.endswith(".weight"):
+        if parameter_name.endswith('.weight'):
             # Dense or Conv's weight using weight decay
             decay_params.append(x)
         else:
             # all bias not using weight decay
-            # bn weight bias not using weight decay, be carefully for now x not include LN
+            # bn weight and bias not using weight decay; be careful: for now
+            # x does not include LN
             no_decay_params.append(x)

-    return [{'params': no_decay_params, 'weight_decay': 0.0}, {'params': decay_params}]
+    return [
+        {'params': no_decay_params, 'weight_decay': 0.0},
+        {'params': decay_params}
+    ]
diff --git a/research/cv/repvgg/src/tools/schedulers.py b/research/cv/repvgg/src/tools/schedulers.py
index 7b67307679b062b9db822442d7602da7b4dc9618..4ae63b4d65c60bddf82e95ce6aec461281b53c22 100644
--- a/research/cv/repvgg/src/tools/schedulers.py
+++ b/research/cv/repvgg/src/tools/schedulers.py
@@ -12,10 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
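`get_param_groups` above applies weight decay only to parameters whose names end in `.weight` (Dense and Conv kernels) and gives everything else (biases, BN gamma/beta) a group with `weight_decay=0.0`. A tiny sketch of the naming rule in isolation:

```python
def split_by_decay(param_names):
    """Split parameter names into (decayed, not decayed) groups."""
    decay = [n for n in param_names if n.endswith('.weight')]
    no_decay = [n for n in param_names if not n.endswith('.weight')]
    return decay, no_decay

print(split_by_decay(['fc.weight', 'fc.bias', 'bn.gamma', 'bn.beta']))
# (['fc.weight'], ['fc.bias', 'bn.gamma', 'bn.beta'])
```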
# ============================================================================ -"""LearningRate scheduler functions""" +"""LearningRate scheduler functionality.""" import numpy as np -__all__ = ["multistep_lr", "cosine_lr", "constant_lr", "get_policy", "exp_lr"] +__all__ = ['multistep_lr', 'cosine_lr', 'constant_lr', 'get_policy', 'exp_lr'] def get_policy(name): @@ -24,10 +24,10 @@ def get_policy(name): return constant_lr out_dict = { - "constant_lr": constant_lr, - "cosine_lr": cosine_lr, - "multistep_lr": multistep_lr, - "exp_lr": exp_lr, + 'constant_lr': constant_lr, + 'cosine_lr': cosine_lr, + 'multistep_lr': multistep_lr, + 'exp_lr': exp_lr, } return out_dict[name] @@ -39,7 +39,9 @@ def constant_lr(args, batch_num): def _lr_adjuster(epoch): if epoch < args.warmup_length: - lr = _warmup_lr(args.warmup_lr, args.base_lr, args.warmup_length, epoch) + lr = _warmup_lr( + args.warmup_lr, args.base_lr, args.warmup_length, epoch + ) else: lr = args.base_lr @@ -53,12 +55,14 @@ def constant_lr(args, batch_num): def exp_lr(args, batch_num): - """Get exp lr """ + """Get exp lr""" learning_rate = [] def _lr_adjuster(epoch): if epoch < args.warmup_length: - lr = _warmup_lr(args.warmup_lr, args.base_lr, args.warmup_length, epoch) + lr = _warmup_lr( + args.warmup_lr, args.base_lr, args.warmup_length, epoch + ) else: lr = args.base_lr * args.lr_gamma ** epoch @@ -77,7 +81,9 @@ def cosine_lr(args, batch_num): def _lr_adjuster(epoch): if epoch < args.warmup_length: - lr = _warmup_lr(args.warmup_lr, args.base_lr, args.warmup_length, epoch) + lr = _warmup_lr( + args.warmup_lr, args.base_lr, args.warmup_length, epoch + ) else: e = epoch - args.warmup_length es = args.epochs - args.warmup_length @@ -93,7 +99,8 @@ def cosine_lr(args, batch_num): def multistep_lr(args, batch_num): - """Sets the learning rate to the initial LR decayed by 10 every 30 epochs""" + """Sets the learning rate to the initial LR decayed by 10 every 30 epochs + """ learning_rate = [] def _lr_adjuster(epoch): diff --git a/research/cv/repvgg/train.py b/research/cv/repvgg/train.py index 87cba7eca3e0b3a3091611b23abb6a70c322a9f5..c6caf375de9a0147a015fda1aafc213ea64b3d83 100644 --- a/research/cv/repvgg/train.py +++ b/research/cv/repvgg/train.py @@ -12,75 +12,202 @@ # See the License for the specific language governing permissions and # limitations under the License. 
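The schedulers above share one shape: an optional warmup followed by the named decay, evaluated per epoch and repeated `batch_num` times into a flat per-step list. A NumPy sketch of the cosine variant; the linear ramp here is an assumption standing in for the repo's `_warmup_lr` helper, which is not shown in this diff:

```python
import numpy as np

def cosine_schedule(epochs, batch_num, base_lr, warmup_length=5, warmup_lr=0.0):
    lrs = []
    for epoch in range(epochs):
        if epoch < warmup_length:
            # assumed linear ramp from warmup_lr up to base_lr
            lr = warmup_lr + (base_lr - warmup_lr) * (epoch + 1) / warmup_length
        else:
            e, es = epoch - warmup_length, epochs - warmup_length
            lr = 0.5 * (1 + np.cos(np.pi * e / es)) * base_lr
        lrs.extend([lr] * batch_num)  # one entry per training step
    return lrs
```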
 # ============================================================================
-"""train"""
-import os
+"""Train RepVGG on ImageNet."""
+import shutil
+import subprocess
+import sys
+import traceback
+
+from datetime import datetime
+from functools import reduce
+from pathlib import Path

 from mindspore import Model
 from mindspore import context
 from mindspore import nn
+from mindspore import dataset as ds
 from mindspore.common import set_seed
-from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor
+from mindspore.train.callback import (
+    CheckpointConfig, ModelCheckpoint, LossMonitor
+)

-from src.args import args
-from src.tools.callback import EvaluateCallBack
-from src.tools.cell import cast_amp
-from src.tools.criterion import get_criterion, NetWithLoss
-from src.tools.get_misc import get_dataset, set_device, get_model, pretrained, get_train_one_step
+from src.tools.callback import (
+    BestCheckpointSavingCallback, SummaryCallbackWithEval,
+    TrainTimeMonitor,
+    EvalTimeMonitor
+)
 from src.tools.optimizer import get_optimizer
+from src.tools.amp import cast_amp
+from src.tools.criterion import get_criterion, NetWithLoss
+from src.tools.utils import set_device, pretrained, get_train_one_step
+from src.dataset import get_dataset
+from src.repvgg import get_model
+
+
+def get_callbacks(
+    arch, rank, train_data_size, val_data_size, ckpt_dir, best_ckpt_dir,
+    summary_dir, ckpt_save_every_step=0, ckpt_save_every_sec=0,
+    ckpt_keep_num=10, best_ckpt_num=5, print_loss_every=1, collect_freq=1,
+    collect_tensor_freq=10, collect_input_data=False,
+    keep_default_action=False
+):
+    """Get common callbacks."""
+    if ckpt_save_every_step == 0 and ckpt_save_every_sec == 0:
+        ckpt_save_every_step = train_data_size
+    config_ck = CheckpointConfig(
+        save_checkpoint_steps=ckpt_save_every_step,
+        save_checkpoint_seconds=ckpt_save_every_sec,
+        keep_checkpoint_max=ckpt_keep_num,
+        append_info=['epoch_num', 'step_num']
+    )
+    train_time_cb = TrainTimeMonitor(data_size=train_data_size)
+    eval_time_cb = EvalTimeMonitor(data_size=val_data_size)
+
+    best_ckpt_save_cb = BestCheckpointSavingCallback(
+        best_ckpt_dir, prefix=arch, buffer=best_ckpt_num
+    )
+
+    ckpoint_cb = ModelCheckpoint(
+        prefix=f'{arch}_{rank}',
+        directory=ckpt_dir,
+        config=config_ck
+    )
+    loss_cb = LossMonitor(print_loss_every)
+
+    specified = {
+        'collect_metric': True,
+        'collect_train_lineage': True,
+        'collect_eval_lineage': True,
+        'collect_input_data': collect_input_data,
+    }
+    summary_collector_cb = SummaryCallbackWithEval(
+        summary_dir=summary_dir,
+        collect_specified_data=specified,
+        collect_freq=collect_freq,
+        keep_default_action=keep_default_action,
+        collect_tensor_freq=collect_tensor_freq
+    )
+    return [
+        train_time_cb,
+        eval_time_cb,
+        ckpoint_cb,
+        loss_cb,
+        best_ckpt_save_cb,
+        summary_collector_cb
+    ]
+
+
+def dump_env_and_params(ckpt_save_dir, args):
+    """Dump information about the environment and hyperparameters."""
+    shutil.copy(str(args.config), str(ckpt_save_dir))
+    with open(str(ckpt_save_dir / 'cmd.txt'), 'w', encoding='utf-8'
+              ) as file:
+        file.write(' '.join(sys.argv))
+    with open(str(ckpt_save_dir / 'args.txt'), 'w', encoding='utf-8'
+              ) as file:
+        file.write(str(args))
+    try:
+        with open(str(ckpt_save_dir / 'git.txt'), 'w', encoding='utf-8'
+                  ) as file:
+            commit_info = subprocess.check_output(
+                ['git', 'show', '-s'],
+                cwd=Path(__file__).absolute().parents[0],
+            )
+            decoded_commit_info = commit_info.decode('utf-8')
+            decoded_commit_info = decoded_commit_info.replace('\n',
', ') + file.write(decoded_commit_info) + except subprocess.CalledProcessError as git_exception: + print(f'Git dumping error: {git_exception}') + print(traceback.format_exc()) def main(): + """Entry point.""" + from src.config import run_args + args = run_args() + + print(args) set_seed(args.seed) - context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target) - context.set_context(enable_graph_kernel=False) - if args.device_target == "Ascend": + context.set_context( + mode=context.GRAPH_MODE, device_target=args.device_target, + enable_graph_kernel=False + ) + if args.device_target == 'Ascend': context.set_context(enable_auto_mixed_precision=True) - rank = set_device(args) + if args.device_target != 'CPU': + rank = set_device(args) + else: + rank = 0 + + ds.config.set_prefetch_size(args.prefetch) # get model and cast amp_level net = get_model(args) - cast_amp(net) + cast_amp(net, args) criterion = get_criterion(args) net_with_loss = NetWithLoss(net, criterion) - if args.pretrained: - pretrained(args, net) data = get_dataset(args) batch_num = data.train_dataset.get_dataset_size() optimizer = get_optimizer(args, net, batch_num) net_with_loss = get_train_one_step(args, net_with_loss, optimizer) + if args.pretrained: + pretrained(args, net_with_loss, args.exclude_epoch_state) - eval_network = nn.WithEvalCell(net, criterion, args.amp_level in ["O2", "O3", "auto"]) + eval_network = nn.WithEvalCell( + net, criterion, args.amp_level in ['O2', 'O3', 'auto'] + ) eval_indexes = [0, 1, 2] - model = Model(net_with_loss, metrics={"acc", "loss"}, - eval_network=eval_network, - eval_indexes=eval_indexes) - - config_ck = CheckpointConfig(save_checkpoint_steps=data.train_dataset.get_dataset_size(), - keep_checkpoint_max=args.save_every) - time_cb = TimeMonitor(data_size=data.train_dataset.get_dataset_size()) - - ckpt_save_dir = "./ckpt_" + str(rank) - if args.run_modelarts: - ckpt_save_dir = "/cache/ckpt_" + str(rank) - - ckpoint_cb = ModelCheckpoint(prefix=args.arch + str(rank), directory=ckpt_save_dir, - config=config_ck) - loss_cb = LossMonitor() - eval_cb = EvaluateCallBack(model, eval_dataset=data.val_dataset, src_url=ckpt_save_dir, - train_url=os.path.join(args.train_url, "ckpt_" + str(rank)), - total_epochs=args.epochs - args.start_epoch, save_freq=args.save_every) - - print("begin train") - model.train(int(args.epochs - args.start_epoch), data.train_dataset, - callbacks=[time_cb, ckpoint_cb, loss_cb, eval_cb], - dataset_sink_mode=True) - print("train success") - - if args.run_modelarts: - import moxing as mox - mox.file.copy_parallel(src_url=ckpt_save_dir, dst_url=os.path.join(args.train_url, "ckpt_" + str(rank))) + model = Model( + net_with_loss, metrics={'acc', 'loss'}, eval_network=eval_network, + eval_indexes=eval_indexes + ) + + # target folder path + experiment_name = '_'.join([ + datetime.now().strftime('%y%m%d_%H%M%S'), args.arch, str(rank), + ]) + if args.brief is not None: + experiment_name = f'{experiment_name}_{args.brief}' + if args.continues is None: + ckpt_save_dir = args.train_url / experiment_name + else: + ckpt_save_dir = args.continues + + callbacks = [ + TrainTimeMonitor(data_size=data.train_dataset.get_dataset_size()), + LossMonitor(args.save_every) + ] + if rank == 0: + callbacks = get_callbacks( + arch=args.arch, rank=rank, ckpt_dir=str(ckpt_save_dir), + train_data_size=data.train_dataset.get_dataset_size(), + val_data_size=data.val_dataset.get_dataset_size(), + best_ckpt_dir=str(ckpt_save_dir / 'best_acc'), + summary_dir=str(ckpt_save_dir / 'logs'), 
ckpt_save_every_sec=0, + ckpt_save_every_step=args.save_every, + print_loss_every=args.save_every, + ckpt_keep_num=args.keep_checkpoint_max, + best_ckpt_num=args.keep_best_checkpoints_max + ) + + dump_env_and_params(ckpt_save_dir, args) + + print( + 'Number of parameters:', + sum( + reduce(lambda x, y: x * y, params.shape) + for params in net.trainable_params() + ) + ) + print('begin train') + model.fit( + args.epochs, data.train_dataset, valid_dataset=data.val_dataset, + dataset_sink_mode=bool(args.use_data_sink), callbacks=callbacks, + initial_epoch=args.start_epoch + ) + print('train success') if __name__ == '__main__': diff --git a/research/cv/resnetv2_50_frn/README.md b/research/cv/resnetv2_50_frn/README.md index 416a32d79b43b0f3394722b297e4fe2549b07281..5c55a92b42eb5c4dc175adaccd99f551188f7377 100644 --- a/research/cv/resnetv2_50_frn/README.md +++ b/research/cv/resnetv2_50_frn/README.md @@ -143,7 +143,7 @@ bash run_eval_for_ascend.sh 0 /dataset ./ckpt_0/resnetv2-50-frn-rank0-240_5005.c ### Result -Evaluation result will be stored in the scripts path. Under this, you can find result like the followings in log. +Evaluation result will be stored in the scripts path. Under this, you can find result like the following in log. acc=77.4%(TOP1,size:224\*224) acc=78.3%(TOP1,size:299\*299) diff --git a/research/cv/resnext152_64x4d/README.md b/research/cv/resnext152_64x4d/README.md index 0dd9bcd3d6dbf609634f0b5f97d88ddca6fac144..a81c9ecfd7022e503e13297998019c17c06fa5d2 100644 --- a/research/cv/resnext152_64x4d/README.md +++ b/research/cv/resnext152_64x4d/README.md @@ -223,7 +223,7 @@ PLATFORM is Ascend or GPU, default is Ascend. #### Result -Evaluation result will be stored in the scripts path. Under this, you can find result like the followings in log. +Evaluation result will be stored in the scripts path. Under this, you can find result like the following in log. ```log acc=80.08%(TOP1) diff --git a/research/cv/se_resnext50/README.md b/research/cv/se_resnext50/README.md index 8e94159b4a117e1c691baea48c9df4b1a8bc414b..7177cdc050c9b2d387c7ec89f3319469c84277a1 100644 --- a/research/cv/se_resnext50/README.md +++ b/research/cv/se_resnext50/README.md @@ -49,7 +49,7 @@ Dataset used: [imagenet2012](http://www.image-net.org/) ## [Mixed Precision](#contents) -The [mixed precision](https://www.mindspore.cn/tutorials/en/master/advanced/mixed_precision.html) training method accelerates the deep learning neural network training process by using both the single-precision and half-precision data formats, and maintains the network precision achieved by the single-precision training at the same time. Mixed precision training can accelerate the computation process, reduce memory usage, and enable a larger model or batch size to be trained on specific hardware. +The [mixed precision](https://www.mindspore.cn/docs/programming_guide/en/r1.5/enable_mixed_precision.html) training method accelerates the deep learning neural network training process by using both the single-precision and half-precision data formats, and maintains the network precision achieved by the single-precision training at the same time. Mixed precision training can accelerate the computation process, reduce memory usage, and enable a larger model or batch size to be trained on specific hardware. For FP16 operators, if the input data type is FP32, the backend of MindSpore will automatically handle it with reduced precision. Users could check the reduced-precision operators by enabling INFO log and then searching ‘reduce precision’. 
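The mixed-precision paragraph above describes what the `amp_level` knob toggles in practice. A minimal, hedged sketch of enabling it through `Model`; the tiny network, loss and optimizer are placeholders, not the repo's actual training setup:

```python
import mindspore.nn as nn
from mindspore import Model

net = nn.Dense(16, 10)
loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
opt = nn.Momentum(net.trainable_params(), learning_rate=0.01, momentum=0.9)

# "O2"/"O3" run most operators in FP16 and keep numerically sensitive
# pieces (e.g. the loss) in FP32.
model = Model(net, loss_fn=loss, optimizer=opt, amp_level="O2")
```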
@@ -61,7 +61,7 @@ For FP16 operators, if the input data type is FP32, the backend of MindSpore wil - [MindSpore](https://www.mindspore.cn/install/en) - For more information, please check the resources below: - [MindSpore Tutorials](https://www.mindspore.cn/tutorials/en/r1.3/index.html) -- [MindSpore Python API](https://www.mindspore.cn/docs/en/master/api_python/mindspore.html) +- [MindSpore Python API](https://www.mindspore.cn/docs/en/master/index.html) If you want to run in modelarts, please check the official documentation of [modelarts](https://support.huaweicloud.com/modelarts/), and you can start training and evaluation as follows: @@ -230,7 +230,7 @@ sh run_eval.sh 0 /opt/npu/datasets/classification/val /se_resnext50.ckpt Ascend #### Result -Evaluation result will be stored in the scripts path. Under this, you can find result like the followings in log. +Evaluation result will be stored in the scripts path. Under this, you can find result like the following in log. ```log acc=78.81%(TOP1) diff --git a/research/cv/se_resnext50/README_CN.md b/research/cv/se_resnext50/README_CN.md index ee609d9d29ac3283125b21c939eaebee1710ee81..81dc81a2f052f87d6ed8b0ea1a55be181220a426 100644 --- a/research/cv/se_resnext50/README_CN.md +++ b/research/cv/se_resnext50/README_CN.md @@ -56,7 +56,7 @@ SE_ResNeXt整体网络架构如下: ## 混合精度 -采用[混合精度](https://www.mindspore.cn/tutorials/zh-CN/master/advanced/mixed_precision.html)的训练方法使用支持单精度和半精度数据来提高深度学习神经网络的训练速度,同时保持单精度训练所能达到的网络精度。混合精度训练提高计算速度、减少内存使用的同时,支持在特定硬件上训练更大的模型或实现更大批次的训练。 +采用[混合精度](https://www.mindspore.cn/docs/programming_guide/zh-CN/r1.5/enable_mixed_precision.html)的训练方法使用支持单精度和半精度数据来提高深度学习神经网络的训练速度,同时保持单精度训练所能达到的网络精度。混合精度训练提高计算速度、减少内存使用的同时,支持在特定硬件上训练更大的模型或实现更大批次的训练。 以FP16算子为例,如果输入数据类型为FP32,MindSpore后台会自动降低精度来处理数据。用户可打开INFO日志,搜索“reduce precision”查看精度降低的算子。 @@ -68,7 +68,7 @@ SE_ResNeXt整体网络架构如下: - [MindSpore](https://www.mindspore.cn/install) - 如需查看详情,请参见如下资源: - [MindSpore教程](https://www.mindspore.cn/tutorials/zh-CN/r1.3/index.html) - - [MindSpore Python API](https://www.mindspore.cn/docs/en/master/api_python/mindspore.html) + - [MindSpore Python API](https://www.mindspore.cn/docs/en/master/index.html) 如果要在modelarts上进行模型的训练,可以参考modelarts的官方指导文档(https://support.huaweicloud.com/modelarts/) 开始进行模型的训练和推理,具体操作如下: diff --git a/research/cv/squeezenet/README.md b/research/cv/squeezenet/README.md index 4d8cfd80d96d051a5e7384ecfabeabc670e83818..0c87bcdfcec19f30d546b8be3382e4bea9e85d43 100644 --- a/research/cv/squeezenet/README.md +++ b/research/cv/squeezenet/README.md @@ -344,7 +344,7 @@ For distributed training, a hccl configuration file with JSON format needs to be Please follow the instructions in the link [hccl_tools](https://gitee.com/mindspore/models/tree/master/utils/hccl_tools). -Training result will be stored in the example path, whose folder name begins with "train" or "train_parallel". Under this, you can find checkpoint file together with result like the followings in log. +Training result will be stored in the example path, whose folder name begins with "train" or "train_parallel". Under this, you can find checkpoint file together with result like the following in log. ### Result @@ -413,7 +413,7 @@ checkpoint can be produced in training process. ### Result -Evaluation result will be stored in the example path, whose folder name is "eval". Under this, you can find result like the followings in log. +Evaluation result will be stored in the example path, whose folder name is "eval". Under this, you can find result like the following in log. 
- Evaluating SqueezeNet with CIFAR-10 dataset diff --git a/research/cv/squeezenet1_1/README.md b/research/cv/squeezenet1_1/README.md index 6a615c71de17d31a0e905c19db55734aa6dadcc2..63cfd4d24006aa4594148c419a5400166a000500 100644 --- a/research/cv/squeezenet1_1/README.md +++ b/research/cv/squeezenet1_1/README.md @@ -169,7 +169,7 @@ For distributed training, a hccl configuration file with JSON format needs to be Please follow the instructions in the link [hccl_tools](https://gitee.com/mindspore/models/tree/master/utils/hccl_tools). -Training result will be stored in the example path, whose folder name begins with "train" or "train_parallel". Under this, you can find checkpoint file together with result like the followings in log. +Training result will be stored in the example path, whose folder name begins with "train" or "train_parallel". Under this, you can find checkpoint file together with result like the following in log. ### Result @@ -203,7 +203,7 @@ bash scripts/run_eval.sh 0 /data/imagenet/val ./train/ckpt_squeezenet/squeezenet ### Result -Evaluation result will be stored in the example path, whose folder name is "eval". Under this, you can find result like the followings in log. +Evaluation result will be stored in the example path, whose folder name is "eval". Under this, you can find result like the following in log. - Evaluating SqueezeNet with ImageNet dataset diff --git a/research/cv/ssd_ghostnet/README.md b/research/cv/ssd_ghostnet/README.md index 0d75cb806b1582ac34f7cf718d7b1bdf7fc7f427..5ddcd69c917c8ac9b617b6cd6230f28c7f963191 100644 --- a/research/cv/ssd_ghostnet/README.md +++ b/research/cv/ssd_ghostnet/README.md @@ -234,7 +234,7 @@ We need five or seven parameters for this scripts. - `PRE_TRAINED_EPOCH_SIZE :` the epoch num of pretrained. -Training result will be stored in the current path, whose folder name begins with "LOG". Under this, you can find checkpoint file together with result like the followings in LOG4/log.txt. +Training result will be stored in the current path, whose folder name begins with "LOG". Under this, you can find checkpoint file together with result like the following in LOG4/log.txt. ### Training on GPU @@ -246,7 +246,7 @@ For details about the parameters, see [Training on Ascend](#training-on-ascend) bash run_distribute_train_gpu.sh [DEVICE_NUM] [EPOCH_SIZE] [LR] [DATASET] [CONFIG_PATH] [PRE_TRAINED](optional) [PRE_TRAINED_EPOCH_SIZE](optional) ``` -Training result will be stored in the current path, whose folder name begins with "LOG". Under this, you can find checkpoint file together with result like the followings in LOG/log.txt. +Training result will be stored in the current path, whose folder name begins with "LOG". Under this, you can find checkpoint file together with result like the following in LOG/log.txt. ## [Evaluation Process](#contents) diff --git a/research/cv/ssd_inception_v2/README.md b/research/cv/ssd_inception_v2/README.md index cd0916115e99ff7cde2c0d01735d194c48ba1562..c655109b654c5cf9fd278228fa7053e4b310a0e6 100644 --- a/research/cv/ssd_inception_v2/README.md +++ b/research/cv/ssd_inception_v2/README.md @@ -233,7 +233,7 @@ We need five or seven parameters for this scripts. - `PRE_TRAINED :` the path of pretrained checkpoint file, it is better to use absolute path. - `PRE_TRAINED_EPOCH_SIZE :` the epoch num of pretrained. -Training result will be stored in the current path, whose folder name is "LOG". 
Under this, you can find checkpoint files together with result like the followings in log +Training result will be stored in the current path, whose folder name is "LOG". Under this, you can find checkpoint files together with result like the following in log ```shell epoch: 1 step: 320, loss is 4.008658 @@ -274,7 +274,7 @@ We need four parameters for this scripts. > checkpoint can be produced in training process. -Inference result will be stored in the example path, whose folder name begins with "eval". Under this, you can find result like the followings in log. +Inference result will be stored in the example path, whose folder name begins with "eval". Under this, you can find result like the following in log. ```shell Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.224 diff --git a/research/cv/ssd_mobilenetV2/README.md b/research/cv/ssd_mobilenetV2/README.md index 0b3b3d8c674d9ec2e7abb4c2b2f706d9f1c636c9..d2fa97b9c6b1e87012f6517ecfcd22b7cf4a80ae 100644 --- a/research/cv/ssd_mobilenetV2/README.md +++ b/research/cv/ssd_mobilenetV2/README.md @@ -241,7 +241,7 @@ We need five or seven parameters for this scripts. - `PRE_TRAINED :` the path of pretrained checkpoint file, it is better to use absolute path. - `PRE_TRAINED_EPOCH_SIZE :` the epoch num of pretrained. -Training result will be stored in the current path, whose folder name begins with "LOG". Under this, you can find checkpoint file together with result like the followings in log +Training result will be stored in the current path, whose folder name begins with "LOG". Under this, you can find checkpoint file together with result like the following in log ```shell epoch: 1 step: 458, loss is 2.329789 @@ -275,7 +275,7 @@ We need five or seven parameters for this scripts. - `PRE_TRAINED :` the path of pretrained checkpoint file, it is better to use absolute path. - `PRE_TRAINED_EPOCH_SIZE :` the epoch num of pretrained. -Training result will be stored in the current path, whose folder name begins with "LOG". Under this, you can find checkpoint file together with result like the followings in log +Training result will be stored in the current path, whose folder name begins with "LOG". Under this, you can find checkpoint file together with result like the following in log ```shell epoch: 1 step: 3664, loss is 2.1746433 @@ -303,7 +303,7 @@ We need five or seven parameters for this scripts. - `PRE_TRAINED :` the path of pretrained checkpoint file, it is better to use absolute path. - `PRE_TRAINED_EPOCH_SIZE :` the epoch num of pretrained. -Training result will be stored in the current path, whose folder name begins with "LOG". Under this, you can find checkpoint file together with result like the followings in log +Training result will be stored in the current path, whose folder name begins with "LOG". Under this, you can find checkpoint file together with result like the following in log ```shell epoch: 1 step: 916, loss is 2.1025786 @@ -342,7 +342,7 @@ We need five or seven parameters for this scripts. - `PRE_TRAINED :` the path of pretrained checkpoint file, it is better to use absolute path. - `PRE_TRAINED_EPOCH_SIZE :` the epoch num of pretrained. -Training result will be stored in the current path, whose folder name begins with "LOG". Under this, you can find checkpoint file together with result like the followings in log +Training result will be stored in the current path, whose folder name begins with "LOG". 
Under this, you can find checkpoint file together with result like the following in log ```shell epoch: 1 step: 3664, loss is 2.2511892 @@ -366,7 +366,7 @@ We need two parameters for this scripts. > checkpoint can be produced in training process. -Inference result will be stored in the example path, whose folder name begins with "eval". Under this, you can find result like the followings in log. +Inference result will be stored in the example path, whose folder name begins with "eval". Under this, you can find result like the following in log. ```shell Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.253 @@ -403,7 +403,7 @@ We need two parameters for this scripts. > checkpoint can be produced in training process. -Inference result will be stored in the example path, whose folder name begins with "eval". Under this, you can find result like the followings in log. +Inference result will be stored in the example path, whose folder name begins with "eval". Under this, you can find result like the following in log. ```shell Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.258 diff --git a/research/cv/ssd_mobilenetV2_FPNlite/README.md b/research/cv/ssd_mobilenetV2_FPNlite/README.md index 4fe3164b3327267d5d371ff7e06bcd90df2d4251..b42f631808289ce3fbdbed36db53a849aa55da50 100644 --- a/research/cv/ssd_mobilenetV2_FPNlite/README.md +++ b/research/cv/ssd_mobilenetV2_FPNlite/README.md @@ -254,7 +254,7 @@ We need six or eight parameters for this scripts. - `PRE_TRAINED :` the path of pretrained checkpoint file, it is better to use absolute path. - `PRE_TRAINED_EPOCH_SIZE :` the epoch num of pretrained. -Training result will be stored in the current path, whose folder name begins with "LOG". Under this, you can find checkpoint file together with result like the followings in log +Training result will be stored in the current path, whose folder name begins with "LOG". Under this, you can find checkpoint file together with result like the following in log ```shell epoch: 1 step: 458, loss is 2.873479 @@ -289,7 +289,7 @@ We need five or seven parameters for this scripts. - `PRE_TRAINED :` the path of pretrained checkpoint file, it is better to use absolute path. - `PRE_TRAINED_EPOCH_SIZE :` the epoch num of pretrained. -Training result will be stored in the current path, whose folder name begins with "LOG". Under this, you can find checkpoint file together with result like the followings in log +Training result will be stored in the current path, whose folder name begins with "LOG". Under this, you can find checkpoint file together with result like the following in log ```shell epoch: 1 step: 3664, loss is 2.3280334 @@ -375,7 +375,7 @@ We need four parameters for this script: > checkpoint can be produced in training process. -Inference result will be stored in the example path, whose folder name begins with "eval". Under this, you can find result like the followings in log. +Inference result will be stored in the example path, whose folder name begins with "eval". Under this, you can find result like the following in log. ```shell Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.234 @@ -412,7 +412,7 @@ We need four parameters for this script: > checkpoint can be produced in training process. -Inference result will be stored in the example path, whose folder name begins with "eval". Under this, you can find result like the followings in log. +Inference result will be stored in the example path, whose folder name begins with "eval". 
Under this, you can find result like the following in log. ```shell Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.258 diff --git a/research/cv/ssd_resnet34/README.md b/research/cv/ssd_resnet34/README.md index 14a1d1cc47366d92a15c176991b07c1f610f6b3c..ae791499b5ed41fc39d908f2fca7460452d97336 100644 --- a/research/cv/ssd_resnet34/README.md +++ b/research/cv/ssd_resnet34/README.md @@ -231,7 +231,7 @@ We need five or six parameters for this scripts. - `TRAIN_OUT_PATH`:the output path of train for distributed train. - `PRE_TRAINED_PATH :` the path of pretrained checkpoint file, it is better to use absolute path. -Training result will be stored in the train path, whose folder name "log". Under this, you can find checkpoint file together with result like the followings in log +Training result will be stored in the train path, whose folder name "log". Under this, you can find checkpoint file together with result like the following in log ```shell epoch: 1 step: 458, loss is 4.185711 @@ -281,7 +281,7 @@ We need five parameters for this scripts. > checkpoint can be produced in training process. -Inference result will be stored in the eval path, whose folder name "log". Under this, you can find result like the followings in log. +Inference result will be stored in the eval path, whose folder name "log". Under this, you can find result like the following in log. ```shell Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.240 diff --git a/research/cv/ssd_resnet50/README.md b/research/cv/ssd_resnet50/README.md index d379179994212e3183c57c7c43bbe9167723f44b..1a6f762371bfeac2cfbb5c0faca3b22adfd76448 100644 --- a/research/cv/ssd_resnet50/README.md +++ b/research/cv/ssd_resnet50/README.md @@ -224,7 +224,7 @@ We need five or seven parameters for this scripts. - `PRE_TRAINED :` the path of pretrained checkpoint file, it is better to use absolute path. - `PRE_TRAINED_EPOCH_SIZE :` the epoch num of pretrained. -Training result will be stored in the current path, whose folder name begins with "LOG". Under this, you can find checkpoint file together with result like the followings in log +Training result will be stored in the current path, whose folder name begins with "LOG". Under this, you can find checkpoint file together with result like the following in log ```shell epoch: 1 step: 458, loss is 3.1681802 @@ -259,7 +259,7 @@ We need two parameters for this scripts. > checkpoint can be produced in training process. -Inference result will be stored in the example path, whose folder name begins with "eval". Under this, you can find result like the followings in log. +Inference result will be stored in the example path, whose folder name begins with "eval". Under this, you can find result like the following in log. ```shell Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.327 @@ -304,7 +304,7 @@ Current batch_ Size can only be set to 1. bash run_infer_cpp.sh [MINDIR_PATH] [DATA_PATH] [DVPP] [ANNO_FILE] [DEVICE_TYPE] [DEVICE_ID] ``` -Inference result will be stored in the example path, you can find result like the followings in acc.log. +Inference result will be stored in the example path, you can find result like the following in acc.log. 
```shell Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.327 diff --git a/research/cv/ssd_resnet_34/README.md b/research/cv/ssd_resnet_34/README.md index 1704cb8ebba08945d807674dd4aae21d9b4be540..d6e8502f0563e418a89dea74ad244f806ffcf639 100644 --- a/research/cv/ssd_resnet_34/README.md +++ b/research/cv/ssd_resnet_34/README.md @@ -230,7 +230,7 @@ We need five or six parameters for this scripts. - `PRE_TRAINED_PATH`: the path of pretrained checkpoint file, it is better to use absolute path. - `PRE_TRAINED_EPOCH_SIZE`: number of epochs passed by checkpoint. -Training result will be stored in the train path, whose folder name "log". Under this, you can find checkpoint file together with result like the followings in log +Training result will be stored in the train path, whose folder name "log". Under this, you can find checkpoint file together with result like the following in log ```shell Single GPU training: @@ -282,7 +282,7 @@ We need five parameters for this script. > checkpoint can be produced in training process. -Inference result will be stored in the eval path, whose folder name "log". Under this, you can find result like the followings in log. +Inference result will be stored in the eval path, whose folder name "log". Under this, you can find result like the following in log. ```shell Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.254 diff --git a/research/cv/tnt/eval.py b/research/cv/tnt/eval.py index a1a483e33ff585d3426f46fb69f4dee7ea9b505d..f26694e15847a535f5cb222e004b1c1009921b2f 100644 --- a/research/cv/tnt/eval.py +++ b/research/cv/tnt/eval.py @@ -1,4 +1,4 @@ -# Copyright 2022-2023 Huawei Technologies Co., Ltd +# Copyright 2021 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,64 +12,49 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================ -"""Evaluation script. 
Need training config.""" -import os +"""eval""" -from functools import reduce - -import mindspore as ms from mindspore import Model from mindspore import context from mindspore import nn from mindspore.common import set_seed -from mindspore.train.callback import TimeMonitor +from src.args import args from src.tools.cell import cast_amp from src.tools.criterion import get_criterion, NetWithLoss -from src.tools.get_misc import pretrained, get_train_one_step -from src.data.imagenet import create_dataset_imagenet +from src.tools.get_misc import get_dataset, set_device, get_model, pretrained, get_train_one_step from src.tools.optimizer import get_optimizer -from src.tools.get_misc import get_model +set_seed(args.seed) + + +def main(): + mode = { + 0: context.GRAPH_MODE, + 1: context.PYNATIVE_MODE + } + context.set_context(mode=mode[args.graph_mode], device_target=args.device_target) + context.set_context(enable_graph_kernel=False) + if args.device_target == "Ascend": + context.set_context(enable_auto_mixed_precision=True) + set_device(args) -def eval_ckpt(args): - print('=== Use checkpoint ===') + # get model net = get_model(args) - cast_amp(net, args) + cast_amp(net) criterion = get_criterion(args) net_with_loss = NetWithLoss(net, criterion) if args.pretrained: pretrained(args, net) - print( - 'Number of parameters (before deploy):', - sum( - reduce(lambda x, y: x * y, params.shape) - for params in net.trainable_params() - ) - ) - # switch_net_to_deploy(net) - print( - 'Number of parameters (after deploy):', - sum( - reduce(lambda x, y: x * y, params.shape) - for params in net.trainable_params() - ) - ) - cast_amp(net, args) - net.set_train(False) - - data = create_dataset_imagenet( - str(args.ds_val), args, training=False - ) - batch_num = data.get_dataset_size() + data = get_dataset(args, training=False) + batch_num = data.val_dataset.get_dataset_size() optimizer = get_optimizer(args, net, batch_num) + # save a yaml file to read to record parameters net_with_loss = get_train_one_step(args, net_with_loss, optimizer) - eval_network = nn.WithEvalCell( - net, criterion, args.amp_level in ['O2', 'O3', 'auto'] - ) + eval_network = nn.WithEvalCell(net, criterion, args.amp_level in ["O2", "O3", "auto"]) eval_indexes = [0, 1, 2] eval_metrics = {'Loss': nn.Loss(), 'Top1-Acc': nn.Top1CategoricalAccuracy(), @@ -77,57 +62,10 @@ def eval_ckpt(args): model = Model(net_with_loss, metrics=eval_metrics, eval_network=eval_network, eval_indexes=eval_indexes) - - print('=> begin eval') - results = model.eval(data, callbacks=[TimeMonitor()]) - return results - - -def eval_mindir(args): - print('=== Use MINDIR model ===') - data = create_dataset_imagenet( - str(args.dataset_path), args, training=False - ) - iterator = data.create_dict_iterator(num_epochs=1) - - graph = ms.load(str(args.pretrained)) - net = nn.GraphCell(graph) - metrics = { - 'Top1-Acc': nn.Top1CategoricalAccuracy(), - 'Top5-Acc': nn.Top5CategoricalAccuracy(), - } - print('=> begin eval') - for batch in iterator: - y_pred = net(batch['image']) - for metric in metrics.values(): - metric.update(y_pred, batch['label']) - - return {name: metric.eval() for name, metric in metrics.items()} - - -def main(): - """Entry point.""" - from src.args import args - - set_seed(0) - context.set_context(mode=context.GRAPH_MODE, - device_target=args.device_target) - context.set_context(enable_graph_kernel=False) - if args.device_target == 'Ascend': - context.set_context(enable_auto_mixed_precision=True) - - os.environ["RANK_SIZE"] = '0' - - # get model - if 
args.pretrained.endswith('.ckpt'): - results = eval_ckpt(args) - elif args.pretrained.endswith('.mindir'): - results = eval_mindir(args) - else: - raise ValueError('Incorrect format checkpoint') - - print(f'=> eval results:{results}') - print('=> eval success') + print(f"=> begin eval") + results = model.eval(data.val_dataset) + print(f"=> eval results:{results}") + print(f"=> eval success") if __name__ == '__main__': diff --git a/research/cv/tnt/export.py b/research/cv/tnt/export.py index 1d26ea8efd7c83ad7900ed5da876d2423274c2d2..692a104e4aa4b1584e09937e0c333ea820398919 100644 --- a/research/cv/tnt/export.py +++ b/research/cv/tnt/export.py @@ -1,4 +1,4 @@ -# Copyright 2023 Huawei Technologies Co., Ltd +# Copyright 2021 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -29,12 +29,12 @@ from src.tools.get_misc import get_model context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target) if args.device_target in ["Ascend", "GPU"]: - context.set_context(device_id=args.device_id[0]) + context.set_context(device_id=args.device_id) if __name__ == '__main__': net = get_model(args) criterion = get_criterion(args) - cast_amp(net, args) + cast_amp(net) net_with_loss = NetWithLoss(net, criterion) assert args.pretrained is not None, "checkpoint_path is None." diff --git a/research/cv/tnt/src/args.py b/research/cv/tnt/src/args.py index 591d23ee06c4ae4eb63bc8e4579d683f06931889..30f9e28b89d60130c0588bc5556f443a05c6fb3f 100644 --- a/research/cv/tnt/src/args.py +++ b/research/cv/tnt/src/args.py @@ -1,4 +1,4 @@ -# Copyright 2023 Huawei Technologies Co., Ltd +# Copyright 2021 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
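One consequence of the argparse changes in this file (see the hunk below) is that `--device_id` goes back to a single integer, so `export.py` can pass it to `context.set_context` without indexing. A minimal sketch of that pattern, using only the flags defined in this diff; everything outside the two arguments is illustrative scaffolding:

```python
# Minimal sketch: scalar --device_id wired straight into the MindSpore context,
# mirroring the pattern this diff restores in export.py. Not the full script.
import argparse

from mindspore import context

parser = argparse.ArgumentParser(description="MindSpore TNT Training")
parser.add_argument("--device_id", default=0, type=int, help="Device Id")
parser.add_argument("--device_target", default="GPU", choices=["GPU", "Ascend", "CPU"], type=str)
args = parser.parse_args()

context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target)
if args.device_target in ["Ascend", "GPU"]:
    # device_id is a plain int again, so no indexing (args.device_id[0]) is needed
    context.set_context(device_id=args.device_id)
```
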
@@ -28,8 +28,7 @@ args = None def parse_arguments(): """parse_arguments""" global args - parser = argparse.ArgumentParser(description="MindSpore TNT Training", - formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser = argparse.ArgumentParser(description="MindSpore TNT Training") parser.add_argument("-a", "--arch", metavar="ARCH", default="ResNet18", help="model architecture") parser.add_argument("--accumulation_step", default=1, type=int, help="accumulation step") @@ -41,21 +40,19 @@ def parse_arguments(): parser.add_argument("--beta", default=[0.9, 0.999], type=lambda x: [float(a) for a in x.split(",")], help="beta for optimizer") parser.add_argument("--clip_global_norm_value", default=5., type=float, help="Clip grad value") - parser.add_argument('--ds_train', default="./data/train", help='Training dataset') - parser.add_argument('--ds_val', default="./data/val", help='validation dataset') - parser.add_argument("--device_id", default=[0], type=int, nargs='+', help="Device Ids") + parser.add_argument('--data_url', default="./data", help='Location of data.') + parser.add_argument("--device_id", default=0, type=int, help="Device Id") parser.add_argument("--device_num", default=1, type=int, help="device num") parser.add_argument("--device_target", default="GPU", choices=["GPU", "Ascend", "CPU"], type=str) parser.add_argument("--epochs", default=300, type=int, metavar="N", help="number of total epochs to run") parser.add_argument("--eps", default=1e-8, type=float) - parser.add_argument("--file_format", type=str, choices=["AIR", "MINDIR", "ONNX"], - default="MINDIR", help="file format") + parser.add_argument("--file_format", type=str, choices=["AIR", "MINDIR"], default="MINDIR", help="file format") parser.add_argument("--in_channel", default=3, type=int) parser.add_argument("--is_dynamic_loss_scale", default=1, type=int, help="is_dynamic_loss_scale ") parser.add_argument("--keep_checkpoint_max", default=20, type=int, help="keep checkpoint max num") parser.add_argument("--optimizer", help="Which optimizer to use", default="sgd") parser.add_argument("--set", help="name of dataset", type=str, default="ImageNet") - parser.add_argument("--pynative_mode", default=0, type=int, help="graph mode with 0, python with 1") + parser.add_argument("--graph_mode", default=0, type=int, help="graph mode with 0, python with 1") parser.add_argument("--mix_up", default=0., type=float, help="mix up") parser.add_argument("--mlp_ratio", help="mlp ", default=4., type=float) parser.add_argument("-j", "--num_parallel_workers", default=20, type=int, metavar="N", @@ -74,43 +71,13 @@ def parse_arguments(): parser.add_argument("--momentum", default=0.9, type=float, metavar="M", help="momentum") parser.add_argument("--num_classes", default=1000, type=int) parser.add_argument("--pretrained", dest="pretrained", default=None, type=str, help="use pre-trained model") - parser.add_argument("--exclude_epoch_state", action="store_true", help="exclude epoch state and learning rate") parser.add_argument("--tnt_config", help="Config file to use (see configs dir)", default=None, required=True) parser.add_argument("--seed", default=0, type=int, help="seed for initializing training. 
") - parser.add_argument("--save_ckpt_every_step", default=0, type=int, help="Save checkpoint every N batches") - parser.add_argument("--save_ckpt_every_sec", default=1800, type=int, help="Save checkpoint every N seconds") - parser.add_argument("--save_ckpt_keep", default=20, type=int, help="Keep N checkpoints") + parser.add_argument("--save_every", default=2, type=int, help="Save every ___ epochs(default:2)") parser.add_argument("--label_smoothing", type=float, help="Label smoothing to use, default 0.0", default=0.1) parser.add_argument("--image_size", default=224, help="Image Size.", type=int) - parser.add_argument("--img_mean", nargs=3, type=float, default=(0.5, 0.5, 0.5), help="Image mean (model input)") - parser.add_argument("--img_std", nargs=3, type=float, default=(0.5, 0.5, 0.5), help="Image std (model input)") parser.add_argument('--train_url', default="./", help='Location of training outputs.') parser.add_argument("--run_modelarts", type=ast.literal_eval, default=False, help="Whether run on modelarts") - - parser.add_argument("--dir_ckpt", default="ckpt", help="Root directory for checkpoints.") - parser.add_argument("--dir_best_ckpt", default="best_ckpt", help="Root directory for best (acc) checkpoints.") - parser.add_argument("--dir_summary", default="summary", help="Root directory for summary logs.") - parser.add_argument("--dump_graph", action="store_true", - help="Dump model graph to MindInsight") - parser.add_argument("--collect_input_data", action="store_true", - help="Dump input images to MindInsight") - - parser.add_argument( - "--tnt_pt_implementation", - default="/mindspore/Efficient-AI-Backbones/tnt_pytorch", - help="Directory with existing implementation of TNT model (PyTorch)" - " (see https://github.com/huawei-noah/Efficient-AI-Backbones/tree/master/tnt_pytorch)." - ) - parser.add_argument( - "--tnt_pt_pretrained", - default=( - # '/mindspore/pt_weights/tnt_s_81.5.pth.tar' - '/mindspore/pt_weights/tnt_b_82.9.pth.tar' - ), - help="Arguments to PyTorch implementation (JSON-encoded list)." - ) - parser.add_argument("--tnt_ms_export", help="Path to exported weights in MindSpore format (.ckpt).") - parser.add_argument("--pred_output", default="preds.json", help="Path to output predictions (JSON)") args = parser.parse_args() # Allow for use from notebook without config file diff --git a/research/cv/tnt/src/configs/parser.py b/research/cv/tnt/src/configs/parser.py index 7b81b1040a2dd533421f1444cefe2d98d08ca0e1..8d757737efee9a03f6e260083cb6ef3139414998 100644 --- a/research/cv/tnt/src/configs/parser.py +++ b/research/cv/tnt/src/configs/parser.py @@ -1,4 +1,4 @@ -# Copyright 2021-2023 Huawei Technologies Co., Ltd +# Copyright 2021 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/research/cv/tnt/src/data/__init__.py b/research/cv/tnt/src/data/__init__.py index 446545075c07c114ed3817645e10b63c78c2ac7b..bd1c59d54f233397bc32ad1bd809a286e6eb2837 100644 --- a/research/cv/tnt/src/data/__init__.py +++ b/research/cv/tnt/src/data/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2021-2023 Huawei Technologies Co., Ltd +# Copyright 2021 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/research/cv/tnt/src/data/augment/__init__.py b/research/cv/tnt/src/data/augment/__init__.py index 899e22c0d4c923e926b8a20976aa21420711ae72..b4d178fcfa9992815a0c99e17f74de7e76901390 100644 --- a/research/cv/tnt/src/data/augment/__init__.py +++ b/research/cv/tnt/src/data/augment/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2021-2023 Huawei Technologies Co., Ltd +# Copyright 2021 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/research/cv/tnt/src/data/augment/auto_augment.py b/research/cv/tnt/src/data/augment/auto_augment.py index e5c3310ad2f2df09371d883f68929dda3e0bc2ed..51cd1d671534de90cbb5d12343c8118b49341059 100644 --- a/research/cv/tnt/src/data/augment/auto_augment.py +++ b/research/cv/tnt/src/data/augment/auto_augment.py @@ -1,4 +1,4 @@ -# Copyright 2023 Huawei Technologies Co., Ltd +# Copyright 2021 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -25,16 +25,12 @@ AugMix adapted from: Papers: AutoAugment: Learning Augmentation Policies from Data - https://arxiv.org/abs/1805.09501 - Learning Data Augmentation Strategies for Object Detection - https://arxiv.org/abs/1906.11172 - RandAugment: Practical automated data augmentation... - https://arxiv.org/abs/1909.13719 - AugMix: A Simple Data Processing Method to Improve Robustness and Uncertainty - https://arxiv.org/abs/1912.02781 Hacked together by / Copyright 2020 Ross Wightman """ -# pylint: disable=R1707 import math import random import re @@ -229,39 +225,35 @@ def _randomly_negate(v): return -v if random.random() > 0.5 else v -def _rotate_level_to_arg(level, hparams): +def _rotate_level_to_arg(level, _hparams): """_randomly_negate""" # range [-30, 30] - _ = hparams level = (level / _MAX_LEVEL) * 30. level = _randomly_negate(level) - return level, + return (level,) -def _enhance_level_to_arg(level, hparams): +def _enhance_level_to_arg(level, _hparams): """_enhance_level_to_arg""" # range [0.1, 1.9] - _ = hparams - return (level / _MAX_LEVEL) * 1.8 + 0.1, + return ((level / _MAX_LEVEL) * 1.8 + 0.1,) -def _enhance_increasing_level_to_arg(level, hparams): +def _enhance_increasing_level_to_arg(level, _hparams): """_enhance_increasing_level_to_arg""" # the 'no change' level is 1.0, moving away from that towards 0. 
or 2.0 increases the enhancement blend # range [0.1, 1.9] - _ = hparams level = (level / _MAX_LEVEL) * .9 level = 1.0 + _randomly_negate(level) - return level, + return (level,) -def _shear_level_to_arg(level, hparams): +def _shear_level_to_arg(level, _hparams): """_shear_level_to_arg""" # range [-0.3, 0.3] - _ = hparams level = (level / _MAX_LEVEL) * 0.3 level = _randomly_negate(level) - return level, + return (level,) def _translate_abs_level_to_arg(level, hparams): @@ -269,7 +261,7 @@ def _translate_abs_level_to_arg(level, hparams): translate_const = hparams['translate_const'] level = (level / _MAX_LEVEL) * float(translate_const) level = _randomly_negate(level) - return level, + return (level,) def _translate_rel_level_to_arg(level, hparams): @@ -278,16 +270,15 @@ def _translate_rel_level_to_arg(level, hparams): translate_pct = hparams.get('translate_pct', 0.45) level = (level / _MAX_LEVEL) * translate_pct level = _randomly_negate(level) - return level, + return (level,) -def _posterize_level_to_arg(level, hparams): +def _posterize_level_to_arg(level, _hparams): """_posterize_level_to_arg""" # As per Tensorflow TPU EfficientNet impl # range [0, 4], 'keep 0 up to 4 MSB of original image' # intensity/severity of augmentation decreases with level - _ = hparams - return int((level / _MAX_LEVEL) * 4), + return (int((level / _MAX_LEVEL) * 4),) def _posterize_increasing_level_to_arg(level, hparams): @@ -295,38 +286,35 @@ def _posterize_increasing_level_to_arg(level, hparams): # As per Tensorflow models research and UDA impl # range [4, 0], 'keep 4 down to 0 MSB of original image', # intensity/severity of augmentation increases with level - return 4 - _posterize_level_to_arg(level, hparams)[0], + return (4 - _posterize_level_to_arg(level, hparams)[0],) -def _posterize_original_level_to_arg(level, hparams): +def _posterize_original_level_to_arg(level, _hparams): """_posterize_original_level_to_arg""" # As per original AutoAugment paper description # range [4, 8], 'keep 4 up to 8 MSB of image' # intensity/severity of augmentation decreases with level - _ = hparams - return int((level / _MAX_LEVEL) * 4) + 4, + return (int((level / _MAX_LEVEL) * 4) + 4,) -def _solarize_level_to_arg(level, hparams): +def _solarize_level_to_arg(level, _hparams): """_solarize_level_to_arg""" # range [0, 256] # intensity/severity of augmentation decreases with level - _ = hparams - return int((level / _MAX_LEVEL) * 256), + return (int((level / _MAX_LEVEL) * 256),) -def _solarize_increasing_level_to_arg(level, hparams): +def _solarize_increasing_level_to_arg(level, _hparams): """_solarize_increasing_level_to_arg""" # range [0, 256] # intensity/severity of augmentation increases with level - return 256 - _solarize_level_to_arg(level, hparams)[0], + return (256 - _solarize_level_to_arg(level, _hparams)[0],) -def _solarize_add_level_to_arg(level, hparams): +def _solarize_add_level_to_arg(level, _hparams): """_solarize_add_level_to_arg""" # range [0, 110] - _ = hparams - return int((level / _MAX_LEVEL) * 110), + return (int((level / _MAX_LEVEL) * 110),) LEVEL_TO_ARG = { @@ -494,7 +482,6 @@ def auto_augment_policy_v0r(hparams): def auto_augment_policy_original(hparams): """auto_augment_policy_original""" # ImageNet policy from https://arxiv.org/abs/1805.09501 - policy = [ [('PosterizeOriginal', 0.4, 8), ('Rotate', 0.6, 9)], [('Solarize', 0.6, 5), ('AutoContrast', 0.6, 5)], @@ -791,7 +778,6 @@ def augmix_ops(magnitude=10, hparams=None, transforms=None): class AugMixAugment: """ AugMix Transform Adapted and improved from impl 
here: https://github.com/google-research/augmix/blob/master/imagenet.py - From paper: 'AugMix: A Simple Data Processing Method to Improve Robustness and Uncertainty - https://arxiv.org/abs/1912.02781 """ diff --git a/research/cv/tnt/src/data/augment/mixup.py b/research/cv/tnt/src/data/augment/mixup.py index 3cd967bc32f8bd5efddad13eacb9d69867ada38b..f196fccb52790011529664f31d88a45093758c14 100644 --- a/research/cv/tnt/src/data/augment/mixup.py +++ b/research/cv/tnt/src/data/augment/mixup.py @@ -1,4 +1,4 @@ -# Copyright 2023 Huawei Technologies Co., Ltd +# Copyright 2021 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -142,7 +142,7 @@ class Mixup: def _params_per_elem(self, batch_size): """_params_per_elem""" lam = np.ones(batch_size, dtype=np.float32) - use_cutmix = np.zeros(batch_size, dtype=np.bool) + use_cutmix = np.zeros(batch_size, dtype=np.bool_) if self.mixup_enabled: if self.mixup_alpha > 0. and self.cutmix_alpha > 0.: use_cutmix = np.random.rand(batch_size) < self.switch_prob @@ -153,7 +153,7 @@ class Mixup: elif self.mixup_alpha > 0.: lam_mix = np.random.beta(self.mixup_alpha, self.mixup_alpha, size=batch_size) elif self.cutmix_alpha > 0.: - use_cutmix = np.ones(batch_size, dtype=np.bool) + use_cutmix = np.ones(batch_size, dtype=np.bool_) lam_mix = np.random.beta(self.cutmix_alpha, self.cutmix_alpha, size=batch_size) else: assert False, "One of mixup_alpha > 0., cutmix_alpha > 0., cutmix_minmax not None should be true." diff --git a/research/cv/tnt/src/data/augment/random_erasing.py b/research/cv/tnt/src/data/augment/random_erasing.py index eaa263c382c1c347568637f4e1ba57144c468463..15e304aa77ec6a281187aa19af6fed009c2dbe51 100644 --- a/research/cv/tnt/src/data/augment/random_erasing.py +++ b/research/cv/tnt/src/data/augment/random_erasing.py @@ -1,4 +1,4 @@ -# Copyright 2023 Huawei Technologies Co., Ltd +# Copyright 2021 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/research/cv/tnt/src/data/data_utils/moxing_adapter.py b/research/cv/tnt/src/data/data_utils/moxing_adapter.py index 43691b5527849c6624f8f387d7c1a2ad15ea7644..37d2717e8b3e1fa56c025383d7488218b1e125ca 100644 --- a/research/cv/tnt/src/data/data_utils/moxing_adapter.py +++ b/research/cv/tnt/src/data/data_utils/moxing_adapter.py @@ -1,4 +1,4 @@ -# Copyright 2023 Huawei Technologies Co., Ltd +# Copyright 2021 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/research/cv/tnt/src/data/imagenet.py b/research/cv/tnt/src/data/imagenet.py index 94c2e1e6a4a918414171df8864fb0967244094ed..95cb688a720db352887f3cf5a83221a64533a9f5 100644 --- a/research/cv/tnt/src/data/imagenet.py +++ b/research/cv/tnt/src/data/imagenet.py @@ -1,4 +1,4 @@ -# Copyright 2023 Huawei Technologies Co., Ltd +# Copyright 2021-2022 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
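A note on the `mixup.py` hunks just above: `np.bool` was only an alias for Python's builtin `bool`; NumPy deprecated it in 1.20 and removed it in 1.24, so the old spelling raises an `AttributeError` on current NumPy. The `np.bool_` scalar type is the drop-in replacement, as a quick check shows (the `switch_prob` value here is illustrative):

```python
# The dtype fix from Mixup._params_per_elem: np.bool_ is NumPy's boolean
# scalar type; the np.bool alias was removed in NumPy 1.24.
import numpy as np

batch_size = 8
lam = np.ones(batch_size, dtype=np.float32)
use_cutmix = np.zeros(batch_size, dtype=np.bool_)   # was dtype=np.bool

# per-element cutmix selection as in the diff; switch_prob is illustrative
switch_prob = 0.5
use_cutmix = np.random.rand(batch_size) < switch_prob
print(use_cutmix.dtype)  # bool
```
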
@@ -16,10 +16,7 @@ Data operations, will be used in train.py and eval.py """ import os -from dataclasses import dataclass -import math -import numpy as np import mindspore.common.dtype as mstype import mindspore.dataset as ds import mindspore.dataset.transforms as C @@ -46,16 +43,14 @@ class ImageNet: self.train_dataset = create_dataset_imagenet(train_dir, training=True, args=args) self.val_dataset = create_dataset_imagenet(val_ir, training=False, args=args) else: - # train_dir = os.path.join(args.data_url, "train") - # val_ir = os.path.join(args.data_url, "val") + train_dir = os.path.join(args.data_url, "train") + val_ir = os.path.join(args.data_url, "val") if training: - self.train_dataset = create_dataset_imagenet(args.ds_train, training=True, args=args) - self.val_dataset = create_dataset_imagenet(args.ds_val, training=False, args=args) + self.train_dataset = create_dataset_imagenet(train_dir, training=True, args=args) + self.val_dataset = create_dataset_imagenet(val_ir, training=False, args=args) -def create_dataset_imagenet(dataset_dir, args, repeat_num=1, training=True, - preloaded_ds=None - ) -> ds.ImageFolderDataset: +def create_dataset_imagenet(dataset_dir, args, repeat_num=1, training=True): """ create a train or eval imagenet2012 dataset for TNT @@ -69,29 +64,22 @@ def create_dataset_imagenet(dataset_dir, args, repeat_num=1, training=True, """ device_num, rank_id = _get_rank_info() - if device_num is None: - device_num = 1 shuffle = bool(training) - ds.config.set_prefetch_size(args.batch_size) - if preloaded_ds is not None: - data_set = preloaded_ds + if device_num == 1 or not training: + data_set = ds.ImageFolderDataset(dataset_dir, num_parallel_workers=args.num_parallel_workers, + shuffle=shuffle) else: - shard_args = {} - if device_num > 1 and training: - shard_args = {'num_shards': device_num, - 'shard_id': rank_id} - data_set = ds.ImageFolderDataset( - dataset_dir, num_parallel_workers=args.num_parallel_workers, - shuffle=shuffle, **shard_args - ) + data_set = ds.ImageFolderDataset(dataset_dir, num_parallel_workers=args.num_parallel_workers, shuffle=shuffle, + num_shards=device_num, shard_id=rank_id) image_size = args.image_size # define map operations # BICUBIC: 3 - mean, std = args.img_mean, args.img_std # ImageNet: [0.485, 0.456, 0.406], [0.229, 0.224, 0.225] if training: + mean = [0.485, 0.456, 0.406] + std = [0.229, 0.224, 0.225] aa_params = dict( translate_const=int(image_size * 0.45), img_mean=tuple([min(255, round(255 * x)) for x in mean]), @@ -114,24 +102,12 @@ def create_dataset_imagenet(dataset_dir, args, repeat_num=1, training=True, RandomErasing(args.re_prob, mode=args.re_mode, max_count=args.re_count) ] else: - mean = (np.array(mean) * 255).tolist() - std = (np.array(std) * 255).tolist() - - # As in the initial repo. 
- crop_pct = 0.9 - if isinstance(image_size, tuple): - assert len(image_size) == 2 - if image_size[-1] == image_size[-2]: - # fall-back to older behaviour so Resize scales to shortest edge if target is square - scale_size = int(math.floor(image_size[0] / crop_pct)) - else: - scale_size = tuple([int(x / crop_pct) for x in image_size]) - else: - scale_size = int(math.floor(image_size / crop_pct)) - + mean = [0.485 * 255, 0.456 * 255, 0.406 * 255] + std = [0.229 * 255, 0.224 * 255, 0.225 * 255] + # test transform complete transform_img = [ vision.Decode(), - vision.Resize(scale_size, interpolation=Inter.BICUBIC), + vision.Resize(int(256 / 224 * image_size), interpolation=Inter.BICUBIC), vision.CenterCrop(image_size), vision.Normalize(mean=mean, std=std), vision.HWC2CHW() @@ -143,7 +119,7 @@ def create_dataset_imagenet(dataset_dir, args, repeat_num=1, training=True, operations=transform_img) data_set = data_set.map(input_columns="label", num_parallel_workers=args.num_parallel_workers, operations=transform_label) - if (args.mix_up > 0. or args.cutmix > 0.) and not training: + if (args.mix_up > 0. or args.cutmix > 0.) and not training: # if use mixup and not training(False), one hot val data label one_hot = C.OneHot(num_classes=args.num_classes) data_set = data_set.map(input_columns="label", num_parallel_workers=args.num_parallel_workers, @@ -181,156 +157,3 @@ def _get_rank_info(): rank_size = rank_id = None return rank_size, rank_id - - -@dataclass -class DatasetParams: - """Dataset arguments as a namespace""" - batch_size: int - num_parallel_workers: int - image_size: int - img_mean: list - img_std: list - interpolation: str - auto_augment: str - re_prob: float - re_mode: str - re_count: int - num_classes: int - mix_up: float # alpha - mixup_prob: float # prob - mixup_mode: str - switch_prob: float - cutmix: float - label_smoothing: float - - -def init_dataset( - dataset_dir, batch_size: int, - num_parallel_workers: int, - image_size: int, - img_mean: list, - img_std: list, - interpolation: str, - auto_augment: str, - re_prob: float, - re_mode: str, - re_count: int, - num_classes: int, - mix_up: float, - mixup_prob: float, - mixup_mode: str, - switch_prob: float, - cutmix: float, - label_smoothing: float, repeat_num=1, training=True, - preloaded_ds=None, - **kwargs -) -> ds.ImageFolderDataset: - """Initialize dataset with explicit parameter names""" - _ = kwargs - args = DatasetParams( - batch_size, - num_parallel_workers, - image_size, - img_mean, - img_std, - interpolation, - auto_augment, - re_prob, - re_mode, - re_count, - num_classes, - mix_up, - mixup_prob, - mixup_mode, - switch_prob, - cutmix, - label_smoothing - ) - return create_dataset_imagenet( - dataset_dir, args, repeat_num=repeat_num, training=training, - preloaded_ds=preloaded_ds - ) - - -def get_transforms( - image_size: int, training: bool, **aug: dict -): - """Get images preprocessing according mode and augmentations settings. - - Parameters - ---------- - image_size: int - Target image size. - training: bool - Mode. If True augmentations may be applied. - aug: Dict - Augmentation settings (type, auto aug, random erase). - - Returns - ------- - List of transforms. 
- """ - mean = [0.485, 0.456, 0.406] - std = [0.229, 0.224, 0.225] - - aug = {} if aug is None else aug - if training: - if aug['type'] == 'weak': - transform = [ - vision.ToPIL(), - vision.RandomResizedCrop( - image_size, scale=(0.08, 1.0), ratio=(3 / 4, 4 / 3), - interpolation=Inter.BILINEAR - ), - vision.RandomHorizontalFlip(prob=0.5), - vision.ToTensor(), - vision.Normalize(mean, std, is_hwc=False), - ] - elif aug['type'] == 'none': - transform = [ - vision.ToPIL(), - vision.Resize(image_size, interpolation=Inter.BILINEAR), - vision.CenterCrop(image_size), - vision.ToTensor(), - vision.Normalize(mean, std, is_hwc=False), - ] - elif aug['type'] == 'auto': - aa_params = dict( - translate_const=int(image_size * 0.45), - img_mean=tuple([min(255, round(255 * x)) for x in mean]), - interpolation=_pil_interp(aug['interpolation']) - ) - auto_augment = aug['auto_augment'] - - transform = [ - vision.RandomResizedCrop( - image_size, scale=(0.08, 1.0), ratio=(3 / 4, 4 / 3), - interpolation=Inter.BILINEAR - ), - vision.RandomHorizontalFlip(prob=0.5), - vision.ToPIL() - ] - if auto_augment is not None: - transform += [rand_augment_transform(auto_augment, aa_params)] - transform += [ - vision.ToTensor(), - vision.Normalize(mean=mean, std=std, is_hwc=False), - RandomErasing( - aug['re_prob'], mode=aug['re_mode'], - max_count=aug['re_count']), - ] - else: - raise ValueError('???' + aug.get('type', 'Unknown')) - else: - transform = [ - vision.ToPIL(), - vision.Resize( - int((256 / 224) * image_size), interpolation=Inter.BILINEAR - ), - vision.CenterCrop(image_size), - vision.ToTensor(), - vision.Normalize(mean, std, is_hwc=False), - ] - - return transform diff --git a/research/cv/tnt/src/models/__init__.py b/research/cv/tnt/src/models/__init__.py index eae6de6ed943d8c7dca6cfb464fcb88d12573d58..2024ccbd36d4f3f5921251b3fafe92ef1caa0d48 100644 --- a/research/cv/tnt/src/models/__init__.py +++ b/research/cv/tnt/src/models/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2021-2023 Huawei Technologies Co., Ltd +# Copyright 2021 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,9 +13,8 @@ # limitations under the License. # ============================================================================ """init model""" -from .tnt import tnt_b_patch16_224, tnt_s_patch16_224 +from .tnt import tnt_s_patch16_224 __all__ = [ - "tnt_b_patch16_224", "tnt_s_patch16_224", ] diff --git a/research/cv/tnt/src/models/tnt/__init__.py b/research/cv/tnt/src/models/tnt/__init__.py index e2198920809747b1b82e38d94d4a1d40f564b86b..2c78378dd031f465143e95d74938b30baa93d3bc 100644 --- a/research/cv/tnt/src/models/tnt/__init__.py +++ b/research/cv/tnt/src/models/tnt/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2021-2023 Huawei Technologies Co., Ltd +# Copyright 2021 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,5 +13,4 @@ # limitations under the License. # ============================================================================ """import tnt models""" -from .tnt import tnt_b_patch16_224, tnt_s_patch16_224 -from . 
import layers +from .tnt import tnt_s_patch16_224 diff --git a/research/cv/tnt/src/models/tnt/tnt.py b/research/cv/tnt/src/models/tnt/tnt.py index 95771a196db867c7a9b3fff3ae0eb197bd5974ff..25ce3318fc3ec2494c3c98cf6c1fbad20ce50815 100644 --- a/research/cv/tnt/src/models/tnt/tnt.py +++ b/research/cv/tnt/src/models/tnt/tnt.py @@ -1,4 +1,4 @@ -# Copyright 2023 Huawei Technologies Co., Ltd +# Copyright 2021 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,12 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================ -# -# This file has been derived from the https://github.com/huawei-noah/Efficient-AI-Backbones/tree/master/tnt_pytorch -# repository and modified. -# ============================================================================ -"""Transformer in Transformer (TNT)""" -from dataclasses import dataclass +"""Transformer in Transformer(TNT)""" +import math import numpy as np import mindspore.common.initializer as weight_init @@ -27,15 +23,11 @@ from mindspore import Parameter from mindspore import Tensor from mindspore import dtype as mstype -from .layers.misc import DropPath1D, trunc_array -from .layers.patch_embed import PatchEmbed -from .layers.attention import Attention +from .misc import DropPath1D, to_2tuple, Identity, trunc_array def make_divisible(v, divisor=8, min_value=None): - """ - Round number to the multiple of divisor - """ + """make_divisible""" min_value = min_value or divisor new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) # Make sure that round down does not go down by more than 10%. 
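`make_divisible` rounds a channel count to the nearest multiple of `divisor` (clamped below by `min_value`) and bumps the result up one step whenever rounding down would lose more than 10% of the original value. A few worked checks against the definition above; the input values are illustrative:

```python
# Worked checks for make_divisible as defined above.
def make_divisible(v, divisor=8, min_value=None):
    min_value = min_value or divisor
    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
    # Make sure that round down does not go down by more than 10%.
    if new_v < 0.9 * v:
        new_v += divisor
    return new_v

assert make_divisible(23, 8) == 24   # 23 + 4 rounds to the nearest multiple of 8
assert make_divisible(12, 8) == 16   # 12 + 4 rounds up to 16
assert make_divisible(10, 8) == 16   # rounds down to 8 first, but 8 < 0.9 * 10,
                                     # so the 10% guard bumps it back up to 16
assert make_divisible(7, 8) == 8     # never drops below min_value (= divisor)
```
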
@@ -44,31 +36,43 @@ def make_divisible(v, divisor=8, min_value=None): return new_v +class UnfoldKernelEqPatch(nn.Cell): + """UnfoldKernelEqPatch with better performance""" + + def __init__(self, kernel_size, strides): + super(UnfoldKernelEqPatch, self).__init__() + assert kernel_size == strides + self.kernel_size = kernel_size + self.reshape = P.Reshape() + self.transpose = P.Transpose() + + def construct(self, inputs): + B, C, H, W = inputs.shape + inputs = self.reshape(inputs, + (B, C, H // self.kernel_size[0], self.kernel_size[0], W)) + inputs = self.transpose(inputs, (0, 2, 1, 3, 4)) + inputs = self.reshape(inputs, (-1, C, self.kernel_size[0], W // self.kernel_size[1], self.kernel_size[1])) + inputs = self.transpose(inputs, (0, 3, 1, 2, 4)) + inputs = self.reshape(inputs, (-1, C, self.kernel_size[0], self.kernel_size[1])) + + return inputs + + class Mlp(nn.Cell): - """ - Multi-layer perceptron - - Args: - in_features(int): Number of input features - hidden_features(int): Number of hidden features - out_features(int): Number of output features - act_layer(class): Activation layer (base class) - drop(float): Dropout rate - """ + """Mlp""" - def __init__(self, in_features, hidden_features=None, - out_features=None, act_layer=nn.GELU, drop=0.): + def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.): super().__init__() out_features = out_features or in_features hidden_features = hidden_features or in_features - self.fc1 = nn.Dense(in_channels=in_features, out_channels=hidden_features, has_bias=True) + self.fc1 = nn.Dense(in_channels=in_features, out_channels=hidden_features) self.act = act_layer() - self.fc2 = nn.Dense(in_channels=hidden_features, out_channels=out_features, has_bias=True) - self.drop = nn.Dropout(keep_prob=1.0 - drop) # if drop > 0. else Identity() + self.fc2 = nn.Dense(in_channels=hidden_features, out_channels=out_features, has_bias=False) + self.drop = nn.Dropout(p=drop) if drop > 0. 
else Identity() - def construct(self, *inputs, **kwargs): - x = inputs[0] + def construct(self, x): x = self.fc1(x) + x = self.act(x) x = self.drop(x) x = self.fc2(x) @@ -85,71 +89,95 @@ class SE(nn.Cell): self.dim = dim hidden_dim = int(dim * hidden_ratio) self.fc = nn.SequentialCell([ - nn.LayerNorm(normalized_shape=dim, epsilon=1e-5), - nn.Dense(in_channels=dim, out_channels=hidden_dim), + LayerNorm(normalized_shape=dim, eps=1e-05), + nn.Dense(in_channels=dim, out_channels=hidden_dim, has_bias=False), nn.ReLU(), - nn.Dense(in_channels=hidden_dim, out_channels=dim), + nn.Dense(in_channels=hidden_dim, out_channels=dim, has_bias=False), nn.Tanh() ]) - self.reduce_mean = P.ReduceMean() - - def construct(self, *inputs, **kwargs): - x = inputs[0] - a = self.reduce_mean(True, x, 1) # B, 1, C + def construct(self, x): + a = P.ReduceMean()(True, x, 1) # B, 1, C a = self.fc(a) x = a * x return x +class Attention(nn.Cell): + """Attention""" + + def __init__(self, dim, hidden_dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.): + super().__init__() + self.hidden_dim = hidden_dim + self.num_heads = num_heads + head_dim = hidden_dim // num_heads + self.head_dim = head_dim + # NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights + self.scale = qk_scale or head_dim ** -0.5 + + # self.qk = nn.Dense(in_channels=dim, out_channels=hidden_dim * 2, has_bias=qkv_bias) + self.q = nn.Dense(in_channels=dim, out_channels=hidden_dim, has_bias=qkv_bias) + self.k = nn.Dense(in_channels=dim, out_channels=hidden_dim, has_bias=qkv_bias) + self.v = nn.Dense(in_channels=dim, out_channels=dim, has_bias=qkv_bias) + self.attn_drop = nn.Dropout(p=attn_drop) + self.proj = nn.Dense(in_channels=dim, out_channels=dim, has_bias=False) + self.proj_drop = nn.Dropout(p=proj_drop) + self.softmax = nn.Softmax(axis=-1) + self.matmul = P.BatchMatMul() + + def construct(self, x): + """Attention construct""" + B, N, _ = x.shape + q = P.Reshape()(self.q(x), (B, N, self.num_heads, self.head_dim)) + q = P.Transpose()(q, (0, 2, 1, 3)) + + k = P.Reshape()(self.k(x), (B, N, self.num_heads, self.head_dim)) + k = P.Transpose()(k, (0, 2, 1, 3)) + # qk = P.Reshape()(self.qk(x), (B, N, 2, self.num_heads, self.head_dim)) + # qk = P.Transpose()(qk, (2, 0, 3, 1, 4)) + + v = P.Reshape()(self.v(x), (B, N, self.num_heads, -1)) + v = P.Transpose()(v, (0, 2, 1, 3)) + + attn = self.matmul(q, P.Transpose()(k, (0, 1, 3, 2))) * self.scale + attn = self.softmax(attn) + attn = self.attn_drop(attn) + + x = P.Transpose()(self.matmul(attn, v), (0, 2, 1, 3)) + x = P.Reshape()(x, (B, N, -1)) + x = self.proj(x) + x = self.proj_drop(x) + return x + + class Block(nn.Cell): - """ - TNT base block - - Args: - outer_dim(int): Number of output features - inner_dim(int): Number of internal features - outer_num_heads(int): Number of output heads - inner_num_heads(int): Number of internal heads - num_words(int): Number of 'visual words' (feature groups) - mlp_ratio(float): Rate of MLP per hidden features - qkv_bias(bool): Use Qk / v bias - qk_scale(float): Qk scale - drop(float): Dropout rate - attn_drop(float): Dropout rate of attention layer - drop_path(float): Path dropout rate - act_layer(class): Activation layer (class) - norm_layer(class): Normalization layer - se(int): SE parameter - """ + """ TNT Block""" - def __init__(self, outer_dim, inner_dim, outer_num_heads, - inner_num_heads, num_words, mlp_ratio=4., - qkv_bias=False, qk_scale=None, drop=0., - attn_drop=0., drop_path=0., act_layer=nn.GELU, + 
def __init__(self, outer_dim, inner_dim, outer_num_heads, inner_num_heads, num_words, mlp_ratio=4., + qkv_bias=False, qk_scale=None, drop=0., attn_drop=0., drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, se=0): super().__init__() self.has_inner = inner_dim > 0 if self.has_inner: # Inner - self.inner_norm1 = norm_layer((inner_dim,), epsilon=1e-5) + self.inner_norm1 = norm_layer((inner_dim,)) self.inner_attn = Attention( inner_dim, inner_dim, num_heads=inner_num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop) - self.inner_norm2 = norm_layer((inner_dim,), epsilon=1e-5) + self.inner_norm2 = norm_layer((inner_dim,)) self.inner_mlp = Mlp(in_features=inner_dim, hidden_features=int(inner_dim * mlp_ratio), out_features=inner_dim, act_layer=act_layer, drop=drop) - self.proj_norm1 = norm_layer((num_words * inner_dim,), epsilon=1e-5) + self.proj_norm1 = norm_layer((num_words * inner_dim,)) self.proj = nn.Dense(in_channels=num_words * inner_dim, out_channels=outer_dim, has_bias=False) - self.proj_norm2 = norm_layer((outer_dim,), epsilon=1e-5) + self.proj_norm2 = norm_layer((outer_dim,)) # Outer - self.outer_norm1 = norm_layer((outer_dim,), epsilon=1e-5) + self.outer_norm1 = norm_layer((outer_dim,)) self.outer_attn = Attention( outer_dim, outer_dim, num_heads=outer_num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop) self.drop_path = DropPath1D(drop_path) - self.outer_norm2 = norm_layer((outer_dim,), epsilon=1e-5) + self.outer_norm2 = norm_layer((outer_dim,)) self.outer_mlp = Mlp(in_features=outer_dim, hidden_features=int(outer_dim * mlp_ratio), out_features=outer_dim, act_layer=act_layer, drop=drop) # SE @@ -159,79 +187,65 @@ class Block(nn.Cell): self.se_layer = SE(outer_dim, 0.25) self.zeros = Tensor(np.zeros([1, 1, 1]), dtype=mstype.float32) - self.reshape = P.Reshape() - self.cast = P.Cast() - - def construct(self, *inputs, **kwargs): + def construct(self, inner_tokens, outer_tokens): """TNT Block construct""" - inner_tokens, outer_tokens = inputs[0], inputs[1] if self.has_inner: - in1 = self.inner_norm1(inner_tokens) - attn1 = self.inner_attn(in1) - inner_tokens = inner_tokens + self.drop_path(attn1) # B*N, k*k, c - in2 = self.inner_norm2(inner_tokens) - mlp = self.inner_mlp(in2) - inner_tokens = inner_tokens + self.drop_path(mlp) # B*N, k*k, c - b, n, _ = P.Shape()(outer_tokens) - # zeros = P.Tile()(self.zeros, (B, 1, C)) - proj = self.proj_norm2(self.proj(self.proj_norm1( - self.reshape(inner_tokens, (b, n - 1, -1,)) - ))) - proj = self.cast(proj, mstype.float32) - # proj = P.Concat(1)((zeros, proj)) - # outer_tokens = outer_tokens + proj # B, N, C - outer_tokens[:, 1:] = outer_tokens[:, 1:] + proj + inner_tokens = inner_tokens + self.drop_path(self.inner_attn(self.inner_norm1(inner_tokens))) # B*N, k*k, c + inner_tokens = inner_tokens + self.drop_path(self.inner_mlp(self.inner_norm2(inner_tokens))) # B*N, k*k, c + B, N, C = P.Shape()(outer_tokens) + zeros = P.Tile()(self.zeros, (B, 1, C)) + proj = self.proj_norm2(self.proj(self.proj_norm1(P.Reshape()(inner_tokens, (B, N - 1, -1,))))) + proj = P.Cast()(proj, mstype.float32) + proj = P.Concat(1)((zeros, proj)) + outer_tokens = outer_tokens + proj # B, N, C if self.se > 0: - outer_tokens = outer_tokens + self.drop_path( - self.outer_attn(self.outer_norm1(outer_tokens))) + outer_tokens = outer_tokens + self.drop_path(self.outer_attn(self.outer_norm1(outer_tokens))) tmp_ = self.outer_mlp(self.outer_norm2(outer_tokens)) - outer_tokens = outer_tokens + self.drop_path( - tmp_ 
+ self.se_layer(tmp_)) + outer_tokens = outer_tokens + self.drop_path(tmp_ + self.se_layer(tmp_)) else: - outer_tokens = outer_tokens + self.drop_path( - self.outer_attn(self.outer_norm1(outer_tokens))) - outer_tokens = outer_tokens + self.drop_path( - self.outer_mlp(self.outer_norm2(outer_tokens))) + outer_tokens = outer_tokens + self.drop_path(self.outer_attn(self.outer_norm1(outer_tokens))) + outer_tokens = outer_tokens + self.drop_path(self.outer_mlp(self.outer_norm2(outer_tokens))) return inner_tokens, outer_tokens -class TNT(nn.Cell): +class PatchEmbed(nn.Cell): + """ Image to Visual Word Embedding """ - TNT (Transformer in Transformer) for computer vision - - Args: - img_size(int): Image size (side, px) - patch_size(int): Patch size (side, px) - in_chans(int): Number of input channels - num_classes(int): Number of output classes - outer_dim(int): Number of output features - inner_dim(int): Number of internal features - depth(int): Number of TNT base blocks - outer_num_heads(int): Number of output heads - inner_num_heads(int): Number of internal heads - mlp_ratio(float): Rate of MLP per hidden features - qkv_bias(bool): Use Qk / v bias - qk_scale(float): Qk scale - drop_rate(float): Dropout rate - attn_drop_rate(float): Dropout rate for attention layer - drop_path_rate(float): Dropout rate for DropPath layer - norm_layer(class): Normalization layer - inner_stride(int): Number of strides for internal patches - se(int): SE parameter + + def __init__(self, img_size=224, patch_size=16, in_chans=3, outer_dim=768, inner_dim=24, inner_stride=4): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0]) + self.img_size = img_size + self.patch_size = patch_size + self.num_patches = num_patches + self.inner_dim = inner_dim + self.num_words = math.ceil(patch_size[0] / inner_stride) * math.ceil(patch_size[1] / inner_stride) + + self.unfold = UnfoldKernelEqPatch(kernel_size=patch_size, strides=patch_size) + self.proj = nn.Conv2d(in_channels=in_chans, out_channels=inner_dim, kernel_size=7, stride=inner_stride, + pad_mode='pad', padding=3, has_bias=False) + + def construct(self, x): + B = x.shape[0] + x = self.unfold(x) # B, Ck2, N + x = self.proj(x) # B*N, C, 8, 8 + x = P.Reshape()(x, (B * self.num_patches, self.inner_dim, -1,)) # B*N, 8*8, C + x = P.Transpose()(x, (0, 2, 1)) + return x + + +class TNT(nn.Cell): + """ TNT (Transformer in Transformer) for computer vision """ - def __init__(self, img_size=224, patch_size=16, in_chans=3, - num_classes=1000, outer_dim=768, inner_dim=48, - depth=12, outer_num_heads=12, inner_num_heads=4, - mlp_ratio=4., qkv_bias=False, qk_scale=None, - # drop_rate=0., attn_drop_rate=0., drop_path_rate=0., - # norm_layer=LayerNormFixOrder, inner_stride=4, se=0, - drop_rate=0., attn_drop_rate=0., drop_path_rate=0., - norm_layer=nn.LayerNorm, inner_stride=4, se=0, + def __init__(self, img_size=224, patch_size=16, in_chans=3, num_classes=1000, outer_dim=768, inner_dim=48, + depth=12, outer_num_heads=12, inner_num_heads=4, mlp_ratio=4., qkv_bias=False, qk_scale=None, + drop_rate=0., attn_drop_rate=0., drop_path_rate=0., norm_layer=nn.LayerNorm, inner_stride=4, se=0, **kwargs): super().__init__() - _ = kwargs self.num_classes = num_classes self.outer_dim = outer_dim @@ -241,16 +255,16 @@ class TNT(nn.Cell): self.num_patches = num_patches = self.patch_embed.num_patches num_words = self.patch_embed.num_words - self.proj_norm1 = norm_layer((num_words * inner_dim,), 
epsilon=1e-5) - self.proj = nn.Dense(in_channels=num_words * inner_dim, out_channels=outer_dim, has_bias=True) - self.proj_norm2 = norm_layer((outer_dim,), epsilon=1e-5) + self.proj_norm1 = norm_layer((num_words * inner_dim,)) + self.proj = nn.Dense(in_channels=num_words * inner_dim, out_channels=outer_dim, has_bias=False) + self.proj_norm2_tnt = norm_layer((outer_dim,)) self.cls_token = Parameter(Tensor(trunc_array([1, 1, outer_dim]), dtype=mstype.float32), name="cls_token", requires_grad=True) self.outer_pos = Parameter(Tensor(trunc_array([1, num_patches + 1, outer_dim]), dtype=mstype.float32), name="outer_pos") self.inner_pos = Parameter(Tensor(trunc_array([1, num_words, inner_dim]), dtype=mstype.float32)) - self.pos_drop = nn.Dropout(keep_prob=1.0 - drop_rate) + self.pos_drop = nn.Dropout(p=drop_rate) dpr = [x for x in np.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule vanilla_idxs = [] @@ -268,7 +282,6 @@ class TNT(nn.Cell): num_words=num_words, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer, se=se)) self.blocks = nn.CellList(blocks) - # self.norm = norm_layer(outer_dim, eps=1e-5) self.norm = norm_layer((outer_dim,)) # NOTE as per official impl, we could have a pre-logits representation dense layer + tanh here @@ -279,12 +292,7 @@ class TNT(nn.Cell): mask = np.zeros([1, num_patches + 1, 1]) mask[:, 0] = 1 self.mask = Tensor(mask, dtype=mstype.float32) - self.head = nn.Dense(in_channels=outer_dim, out_channels=num_classes, has_bias=True) - - self.reshape = P.Reshape() - self.concat = P.Concat(1) - self.tile = P.Tile() - self.cast = P.Cast() + self.head = nn.Dense(in_channels=outer_dim, out_channels=num_classes, has_bias=False) self.init_weights() print("================================success================================") @@ -310,18 +318,13 @@ class TNT(nn.Cell): def forward_features(self, x): """TNT forward_features""" - b = x.shape[0] + B = x.shape[0] inner_tokens = self.patch_embed(x) + self.inner_pos # B*N, 8*8, C - outer_tokens = self.proj_norm2( - self.proj(self.proj_norm1( - self.reshape(inner_tokens, (b, self.num_patches, -1,)) - )) - ) - outer_tokens = self.cast(outer_tokens, mstype.float32) - outer_tokens = self.concat(( - self.tile(self.cls_token, (b, 1, 1)), outer_tokens - )) + outer_tokens = self.proj_norm2_tnt( + self.proj(self.proj_norm1(P.Reshape()(inner_tokens, (B, self.num_patches, -1,))))) + outer_tokens = P.Cast()(outer_tokens, mstype.float32) + outer_tokens = P.Concat(1)((P.Tile()(self.cls_token, (B, 1, 1)), outer_tokens)) outer_tokens = outer_tokens + self.outer_pos outer_tokens = self.pos_drop(outer_tokens) @@ -332,8 +335,7 @@ class TNT(nn.Cell): outer_tokens = self.norm(outer_tokens) # [batch_size, num_patch+1, outer_dim) return outer_tokens[:, 0] - def construct(self, *inputs, **kwargs): - x = inputs[0] + def construct(self, x): x = self.forward_features(x) x = self.head(x) return x @@ -348,13 +350,12 @@ def tnt_s_patch16_224(args): inner_dim = 24 outer_num_heads = 6 inner_num_heads = 4 - depth = 12 drop_path_rate = args.drop_path_rate drop_out = args.drop_out num_classes = args.num_classes outer_dim = make_divisible(outer_dim, outer_num_heads) inner_dim = make_divisible(inner_dim, inner_num_heads) - model = TNT(img_size=224, patch_size=patch_size, outer_dim=outer_dim, inner_dim=inner_dim, depth=depth, + model = TNT(img_size=224, patch_size=patch_size, outer_dim=outer_dim, inner_dim=inner_dim, depth=12, outer_num_heads=outer_num_heads, 
inner_num_heads=inner_num_heads, qkv_bias=False, inner_stride=inner_stride, drop_path_rate=drop_path_rate, drop_out=drop_out, num_classes=num_classes) return model @@ -369,32 +370,12 @@ def tnt_b_patch16_224(args): inner_dim = 40 outer_num_heads = 10 inner_num_heads = 4 - depth = 12 drop_path_rate = args.drop_path_rate drop_out = args.drop_out num_classes = args.num_classes outer_dim = make_divisible(outer_dim, outer_num_heads) inner_dim = make_divisible(inner_dim, inner_num_heads) - model = TNT(img_size=224, patch_size=patch_size, outer_dim=outer_dim, inner_dim=inner_dim, depth=depth, + model = TNT(img_size=224, patch_size=patch_size, outer_dim=outer_dim, inner_dim=inner_dim, depth=12, outer_num_heads=outer_num_heads, inner_num_heads=inner_num_heads, qkv_bias=False, inner_stride=inner_stride, drop_path_rate=drop_path_rate, drop_out=drop_out, num_classes=num_classes) return model - - -@dataclass -class NetworkParams: - num_classes: int - drop_path_rate: float - drop_out: float - - -def get_model_by_name(arch, num_classes, drop_path_rate, drop_out, - **kwargs) -> TNT: - """get network by name and initialize it""" - _ = kwargs - models = { - 'tnt_s_patch16_224': tnt_s_patch16_224, - 'tnt_b_patch16_224': tnt_b_patch16_224 - } - args = NetworkParams(num_classes, drop_path_rate, drop_out) - return models[arch](args) diff --git a/research/cv/tnt/src/tools/cell.py b/research/cv/tnt/src/tools/cell.py index e506319be9029b62537a3c2c6c124cc9749f2ac2..7e886ad6f23e788345acf593ce2fe5d14d1a9292 100644 --- a/research/cv/tnt/src/tools/cell.py +++ b/research/cv/tnt/src/tools/cell.py @@ -1,4 +1,4 @@ -# Copyright 2023 Huawei Technologies Co., Ltd +# Copyright 2021 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -17,6 +17,8 @@ import mindspore.nn as nn from mindspore import dtype as mstype from mindspore.ops import functional as F +from src.args import args + class OutputTo16(nn.Cell): "Wrap cell for amp. Cast network output back to float16" @@ -25,8 +27,7 @@ class OutputTo16(nn.Cell): super(OutputTo16, self).__init__(auto_prefix=False) self._op = op - def construct(self, *inputs, **kwargs): - x = inputs[0] + def construct(self, x): return F.cast(self._op(x), mstype.float16) @@ -37,7 +38,7 @@ def do_keep_fp16(network, cell_types): cell.to_float(mstype.float16) -def cast_amp(net, args): +def cast_amp(net): """cast network amp_level""" if args.amp_level == "O2": cell_types = (nn.Dense,) diff --git a/research/cv/tnt/src/tools/criterion.py b/research/cv/tnt/src/tools/criterion.py index 7d73a4028950e579b31267f029f52480ace0d2f9..ee963c1fee2724003b422532b2c1cb36ba0e9391 100644 --- a/research/cv/tnt/src/tools/criterion.py +++ b/research/cv/tnt/src/tools/criterion.py @@ -1,4 +1,4 @@ -# Copyright 2023 Huawei Technologies Co., Ltd +# Copyright 2021 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
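The criterion.py hunks below mostly rename `logits`/`labels` to `logit`/`label` and drop a pylint pragma. For reviewers who want to sanity-check the quantity `SoftTargetCrossEntropy` computes, here is a minimal NumPy sketch of the same math; the function name and example values are illustrative, not part of this PR:

```python
import numpy as np

def soft_target_cross_entropy(logit, label):
    """Batch mean of sum(-label * log_softmax(logit), axis=-1),
    mirroring the MindSpore cell edited below."""
    shifted = logit - logit.max(axis=-1, keepdims=True)  # shift for numerical stability
    log_softmax = shifted - np.log(np.exp(shifted).sum(axis=-1, keepdims=True))
    return float((-(label * log_softmax).sum(axis=-1)).mean())

logit = np.array([[2.0, 0.5, -1.0]])
label = np.array([[0.7, 0.3, 0.0]])  # soft (e.g. mixup-style) targets
print(soft_target_cross_entropy(logit, label))
```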
@@ -31,11 +31,10 @@ class SoftTargetCrossEntropy(LossBase): self.sum_ops = P.ReduceSum(keep_dims=False) self.log_softmax = P.LogSoftmax() - def construct(self, logits, labels): - logits = P.Cast()(logits, mstype.float32) - labels = P.Cast()(labels, mstype.float32) - # pylint: disable=invalid-unary-operand-type - loss = self.sum_ops(-labels * self.log_softmax(logits), -1) + def construct(self, logit, label): + logit = P.Cast()(logit, mstype.float32) + label = P.Cast()(label, mstype.float32) + loss = self.sum_ops(-label * self.log_softmax(logit), -1) return self.mean_ops(loss) @@ -51,11 +50,10 @@ class CrossEntropySmooth(LossBase): self.ce = nn.SoftmaxCrossEntropyWithLogits(reduction=reduction) self.cast = ops.Cast() - def construct(self, logits, labels): + def construct(self, logit, label): if self.sparse: - labels = self.onehot(labels, F.shape(logits)[1], - self.on_value, self.off_value) - loss2 = self.ce(logits, labels) + label = self.onehot(label, F.shape(logit)[1], self.on_value, self.off_value) + loss2 = self.ce(logit, label) return loss2 @@ -89,8 +87,7 @@ class NetWithLoss(nn.Cell): self.model = model self.criterion = criterion - def construct(self, *inputs, **kwargs): - data, label = inputs[:2] + def construct(self, data, label): predict = self.model(data) loss = self.criterion(predict, label) return loss diff --git a/research/cv/tnt/src/tools/get_misc.py b/research/cv/tnt/src/tools/get_misc.py index 6c841c97b72afe2f79aae04193e8567af54df085..73ae63120028bca3a5f0acf10182af5861b5b849 100644 --- a/research/cv/tnt/src/tools/get_misc.py +++ b/research/cv/tnt/src/tools/get_misc.py @@ -1,4 +1,4 @@ -# Copyright 2023 Huawei Technologies Co., Ltd +# Copyright 2021 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
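The get_misc.py hunks below revert the `device_id` indexing so single-device runs read `args.device_id` directly. A minimal sketch of the device-setup pattern the file converges on (argument names mirror the script's args; the standalone helper itself is illustrative):

```python
from mindspore import context
from mindspore.communication.management import init, get_rank

def set_device(device_target, device_num, device_id):
    """Return the rank: distributed runs ask the backend, single-device runs pin device_id."""
    rank = 0
    if device_num > 1:
        # hccl for Ascend, nccl for GPU; the backend assigns the rank
        init(backend_name='hccl' if device_target == 'Ascend' else 'nccl')
        rank = get_rank()
    else:
        # args.device_id is a plain int again, as the hunks below restore
        context.set_context(device_id=device_id)
    return rank
```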
@@ -44,7 +44,7 @@ def set_device(args): rank = get_rank() else: - context.set_context(device_id=args.device_id[rank]) + context.set_context(device_id=args.device_id) elif device_target == "GPU": if device_num > 1: init(backend_name='nccl') @@ -53,14 +53,14 @@ def set_device(args): gradients_mean=True) rank = get_rank() else: - context.set_context(device_id=args.device_id[rank]) + context.set_context(device_id=args.device_id) else: raise ValueError("Unsupported platform.") return rank -def get_dataset(args, training=True) -> data.ImageNet: +def get_dataset(args, training=True): """"Get model according to args.set""" print(f"=> Getting {args.set} dataset") dataset = getattr(data, args.set)(args, training) @@ -76,7 +76,7 @@ def get_model(args): return model -def pretrained(args, model, exclude_epoch_state=True): +def pretrained(args, model): """"Load pretrained weights if args.pretrained is given""" if args.run_modelarts: print('Download data.') @@ -101,13 +101,6 @@ def pretrained(args, model, exclude_epoch_state=True): if value.shape[0] != args.num_classes: print(f'==> removing {key} with shape {value.shape}') param_dict.pop(key) - if exclude_epoch_state: - if 'epoch_num' in param_dict: - param_dict.pop('epoch_num') - if 'step_num' in param_dict: - param_dict.pop('step_num') - if 'learning_rate' in param_dict: - param_dict.pop('learning_rate') load_param_into_net(model, param_dict) else: print("=> no pretrained weights found at '{}'".format(args.pretrained)) diff --git a/research/cv/tnt/src/tools/optimizer.py b/research/cv/tnt/src/tools/optimizer.py index 9c42a98f7ab10961815356218d26b8936e73530a..b7d80d52ee0e47a7a1846d6c416fdfeaf58407f7 100644 --- a/research/cv/tnt/src/tools/optimizer.py +++ b/research/cv/tnt/src/tools/optimizer.py @@ -1,4 +1,4 @@ -# Copyright 2023 Huawei Technologies Co., Ltd +# Copyright 2021 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/research/cv/tnt/src/tools/schedulers.py b/research/cv/tnt/src/tools/schedulers.py index c1bbe4b69527c7396a8c92d30a156e4a66ee43e7..dddc77243e4db6b1f05a3608f0547ebc85e1cf72 100644 --- a/research/cv/tnt/src/tools/schedulers.py +++ b/research/cv/tnt/src/tools/schedulers.py @@ -1,4 +1,4 @@ -# Copyright 2023 Huawei Technologies Co., Ltd +# Copyright 2021 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/research/cv/tnt/src/trainers/__init__.py b/research/cv/tnt/src/trainers/__init__.py index 9d38bfada15068e24d67bd5fc96e3923a186548d..077e7628e6bd0913df62661d5bd1cb291db41fcf 100644 --- a/research/cv/tnt/src/trainers/__init__.py +++ b/research/cv/tnt/src/trainers/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2021-2023 Huawei Technologies Co., Ltd +# Copyright 2021 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
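One more note on the criterion.py hunks above: `CrossEntropySmooth` one-hot-encodes sparse labels with `on_value`/`off_value` before the softmax cross entropy. Those values are set outside the visible hunks, so this sketch assumes the label-smoothing convention common in this model zoo (on = 1 - factor, off = factor / (num_classes - 1)):

```python
import numpy as np

def smoothed_one_hot(label, num_classes, smooth_factor=0.1):
    # assumed convention: each row still sums to 1.0
    target = np.full(num_classes, smooth_factor / (num_classes - 1))
    target[label] = 1.0 - smooth_factor
    return target

print(smoothed_one_hot(2, 5))  # [0.025 0.025 0.9   0.025 0.025]
```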
diff --git a/research/cv/tnt/src/trainers/train_one_step_with_scale_and_clip_global_norm.py b/research/cv/tnt/src/trainers/train_one_step_with_scale_and_clip_global_norm.py index 1b5de92b545695bf4b5f5c1be1a3b7ea2a359627..ab85b5248f7b39e378e754d221b4832ab2550578 100644 --- a/research/cv/tnt/src/trainers/train_one_step_with_scale_and_clip_global_norm.py +++ b/research/cv/tnt/src/trainers/train_one_step_with_scale_and_clip_global_norm.py @@ -1,4 +1,4 @@ -# Copyright 2023 Huawei Technologies Co., Ltd +# Copyright 2021-2022 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/research/cv/tnt/train.py b/research/cv/tnt/train.py index a275d1990bd4ded8eb215be459fe1ab017d69328..f0bb7a77f67f02abed939085b86e78d0d2eb496e 100644 --- a/research/cv/tnt/train.py +++ b/research/cv/tnt/train.py @@ -1,4 +1,4 @@ -# Copyright 2023 Huawei Technologies Co., Ltd +# Copyright 2021 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,35 +12,30 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================ -"""Training script for TNT model""" -import time -import datetime -import functools +"""train""" +import os from mindspore import Model from mindspore import context from mindspore import nn from mindspore.common import set_seed +from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor -from src.tools.common import get_callbacks +from src.args import args +from src.tools.callback import EvaluateCallBack from src.tools.cell import cast_amp -# from src.tools.callbacks import StopAtEpoch from src.tools.criterion import get_criterion, NetWithLoss -from src.tools.get_misc import ( - get_dataset, set_device, get_model, pretrained, get_train_one_step -) +from src.tools.get_misc import get_dataset, set_device, get_model, pretrained, get_train_one_step from src.tools.optimizer import get_optimizer def main(): - from src.args import args set_seed(args.seed) mode = { 0: context.GRAPH_MODE, 1: context.PYNATIVE_MODE } - context.set_context(mode=mode[args.pynative_mode], - device_target=args.device_target) + context.set_context(mode=mode[args.graph_mode], device_target=args.device_target) context.set_context(enable_graph_kernel=False) if args.device_target == "Ascend": context.set_context(enable_auto_mixed_precision=True) @@ -48,9 +43,11 @@ def main(): # get model and cast amp_level net = get_model(args) - cast_amp(net, args) + cast_amp(net) criterion = get_criterion(args) net_with_loss = NetWithLoss(net, criterion) + if args.pretrained: + pretrained(args, net) data = get_dataset(args) batch_num = data.train_dataset.get_dataset_size() @@ -58,54 +55,38 @@ def main(): # save a yaml file to read to record parameters net_with_loss = get_train_one_step(args, net_with_loss, optimizer) - if args.pretrained: - pretrained(args, net_with_loss, args.exclude_epoch_state) - eval_network = nn.WithEvalCell(net, criterion, - args.amp_level in ["O2", "O3", "auto"]) + eval_network = nn.WithEvalCell(net, criterion, args.amp_level in ["O2", "O3", "auto"]) eval_indexes = [0, 1, 2] model = Model(net_with_loss, metrics={"acc", "loss"}, eval_network=eval_network, eval_indexes=eval_indexes) - cur_name = datetime.datetime.now().strftime('%y-%m-%d_%H%M%S') 
- ckpt_save_dir = "{}/{}_{}".format(args.dir_ckpt, cur_name, rank) - ckpt_best_save_dir = "{}/{}_{}".format(args.dir_best_ckpt, cur_name, rank) - summary_dir = "{}/{}".format(args.dir_summary, cur_name) - # if args.run_modelarts: - # ckpt_save_dir = "/cache/ckpt_" + str(rank) + config_ck = CheckpointConfig(save_checkpoint_steps=data.train_dataset.get_dataset_size(), + keep_checkpoint_max=args.save_every) + time_cb = TimeMonitor(data_size=data.train_dataset.get_dataset_size()) - cb = get_callbacks( - args.arch, rank, data.train_dataset.get_dataset_size(), - data.val_dataset.get_dataset_size(), ckpt_save_dir, ckpt_best_save_dir, - summary_dir, args.save_ckpt_every_step, args.save_ckpt_every_sec, - args.save_ckpt_keep, print_loss_every=100, - collect_graph=args.dump_graph - ) + ckpt_save_dir = "./ckpt_" + str(rank) + if args.run_modelarts: + ckpt_save_dir = "/cache/ckpt_" + str(rank) - print("begin train") - print('Number of parameters:', - sum(functools.reduce(lambda x, y: x * y, params.shape) - for params in net.trainable_params())) - print('Number of samples in dataset:' - ' train={}, val={}'.format(data.train_dataset.get_dataset_size(), - data.val_dataset.get_dataset_size())) - # cb.append(StopAtEpoch(summary_dir, 1, args.epochs - args.start_epoch)) + ckpoint_cb = ModelCheckpoint(prefix=args.arch + str(rank), directory=ckpt_save_dir, + config=config_ck) + loss_cb = LossMonitor() + eval_cb = EvaluateCallBack(model, eval_dataset=data.val_dataset, src_url=ckpt_save_dir, + train_url=os.path.join(args.train_url, "ckpt_" + str(rank)), + save_freq=args.save_every) - sink_mode = True - t1 = time.time() - model.fit(int(args.epochs - args.start_epoch), data.train_dataset, - data.val_dataset, callbacks=cb, dataset_sink_mode=sink_mode) - t2 = time.time() - dt = 1000 * (t2 - t1) - print('Total training time: {:.3f} ms, time per epoch: {:.3f} ms,' - ' time per batch: {:.3f} ms, time per element: {:.3f} ms' - .format(dt, dt / args.epochs, - dt / args.epochs / data.train_dataset.get_dataset_size(), - dt / args.epochs / - data.train_dataset.get_dataset_size() / args.batch_size)) + print("begin train") + model.train(int(args.epochs - args.start_epoch), data.train_dataset, + callbacks=[time_cb, ckpoint_cb, loss_cb, eval_cb], + dataset_sink_mode=True) print("train success") + if args.run_modelarts: + import moxing as mox + mox.file.copy_parallel(src_url=ckpt_save_dir, dst_url=os.path.join(args.train_url, "ckpt_" + str(rank))) + if __name__ == '__main__': main() diff --git a/research/nlp/tprr/README.md b/research/nlp/tprr/README.md index 3f9ae2daf35e0bf65bfbd91cb612d71566c5de65..646132c503e62d599d685bb9194d98744b292952 100644 --- a/research/nlp/tprr/README.md +++ b/research/nlp/tprr/README.md @@ -147,7 +147,7 @@ Parameters for re-ranker and reader evaluation can be passed directly at executi ``` Evaluation result will be stored in the scripts path, whose folder name begins with "eval_tr". You can find the result like the - followings in log. + following in log. ```python ###step###: 0 @@ -175,7 +175,7 @@ Parameters for re-ranker and reader evaluation can be passed directly at executi ``` Evaluation result will be stored in the scripts path, whose folder name begins with "eval". You can find the result like the - followings in log. + following in log. 
 ```python
 
 total top1 pem: 0.8803511141120864
 
diff --git a/research/nlp/transX/README.md b/research/nlp/transX/README.md
index 04500ec0b97b3ce3b379654ed3bcafb00f97a61c..deb4b43253ae5ee6aa08800a42a83da2e7459c65 100644
--- a/research/nlp/transX/README.md
+++ b/research/nlp/transX/README.md
@@ -297,7 +297,7 @@ bash scripts/run_eval_gpu.sh [DATASET_ROOT] [DATASET_NAME] [MODEL_NAME] [CKPT_PA
 
 #### Result
 
-Evaluation result will be stored in the scripts path. Under this, you can find result like the followings in log.
+Evaluation results will be stored in the scripts path. Under it, you can find results like the following in the log.
 
 The evaluation results will be stored in the **./eval-output** directory. If the shell script is used, the logged
 information will be redirected to the **./eval-logs** directory.
diff --git a/research/recommend/GEHRL/README.MD b/research/recommend/GEHRL/README.MD
index 2a77830ff88f187ff1e3f529792e376b8c97e8eb..5409245a0bc81869a2afaa36f51f011b308dd45f 100644
--- a/research/recommend/GEHRL/README.MD
+++ b/research/recommend/GEHRL/README.MD
@@ -1 +1 @@
-The source code of the paper "Graph Enhanced Hierarchical Reinforcement Learning for Goal-oriented Learning Path Recommendation" will be coming soon...
\ No newline at end of file
+The source code of the paper "Graph Enhanced Hierarchical Reinforcement Learning for Goal-oriented Learning Path Recommendation" is available at [https://github.com/mindspore-lab/models/tree/master/research/huawei-noah/GEHRL](https://github.com/mindspore-lab/models/tree/master/research/huawei-noah/GEHRL)
\ No newline at end of file
diff --git a/research/recommend/ULC/README.md b/research/recommend/ULC/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..8b46f25d05c4bc9a052cc9c0101c6995b49bc6e9
--- /dev/null
+++ b/research/recommend/ULC/README.md
@@ -0,0 +1,90 @@
+
+# Contents
+
+- [Contents](#contents)
+- [ULC Description](#ulc-description)
+- [Dataset](#dataset)
+- [Environment Requirements](#environment-requirements)
+- [Quick Start](#quick-start)
+- [Script Description](#script-description)
+    - [Script and Sample Code](#script-and-sample-code)
+    - [Training Process](#training-process)
+        - [Training](#training)
+- [ModelZoo Homepage](#modelzoo-homepage)
+
+# [ULC Description](#contents)
+
+Conversion rate prediction is critical to many online applications such as digital display advertising. To capture
+the dynamic data distribution, industrial systems often require retraining models on recent data daily or weekly.
+However, the delay of conversion behavior usually leads to incorrect labeling, which is known as the delayed feedback
+problem. Existing work may fail to introduce the correct information about false negative samples due to data
+sparsity and dynamic data distribution. To directly introduce the correct feedback label information, we propose an
+Unbiased delayed feedback Label Correction framework (ULC), which uses an auxiliary model to correct labels for
+observed negative feedback samples. First, we theoretically prove that the label-corrected loss is an unbiased
+estimate of the oracle loss using true labels. Then, as there is no ready-made training data for label correction,
+counterfactual labeling is used to construct artificial training data. Furthermore, since counterfactual labeling
+utilizes only partial training data, we design an embedding-based alternating training method to enhance performance.
+Comparative experiments on both public and private datasets and detailed analyses show that our proposed approach
+effectively alleviates the delayed feedback problem and consistently outperforms the previous state-of-the-art
+methods.
+
+A preprint version of our paper is available at http://arxiv.org/abs/2307.12756.
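As a reading aid for the README above: the core correction step that ULC applies to observed negatives is implemented later in this diff (src/alternate_train.py). A minimal NumPy sketch of that step, with illustrative names and values:

```python
import numpy as np

def correct_labels(observed, aux_probs):
    """Trust observed positives; replace observed negatives (possible
    delayed conversions) with the auxiliary model's probability."""
    return np.where(observed < 1, aux_probs, observed)

observed = np.array([1, 0, 0, 1], dtype=np.float32)    # 0s may be false negatives
aux_probs = np.array([0.9, 0.35, 0.05, 0.8], dtype=np.float32)
print(correct_labels(observed, aux_probs))             # [1.   0.35 0.05 1.  ]
```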
+
+# [Dataset](#contents)
+
+- [Criteo dataset](https://drive.google.com/file/d/1x4KktfZtls9QjNdFYKCjTpfjM4tG2PcK/view?usp=sharing)
+
+# [Environment Requirements](#contents)
+
+- Hardware (CPU)
+    - Prepare a hardware environment with a CPU processor.
+- Framework
+    - [MindSpore-2.0.0](https://www.mindspore.cn/install/en)
+
+- Requirements
+
+```shell
+
+  $ conda create --name <env_name> --file requirements.txt
+
+```
+
+- For more information, please check the resources below:
+    - [MindSpore Tutorials](https://www.mindspore.cn/tutorials/en/r2.0/index.html)
+    - [MindSpore Python API](https://www.mindspore.cn/docs/en/r2.0/index.html)
+
+# [Quick Start](#contents)
+
+After installing MindSpore via the official website, you can start training and evaluation as follows:
+
+- process the Criteo dataset, then launch training with the command shown in [Training](#training)
+
+# [Script Description](#contents)
+
+## [Script and Sample Code](#contents)
+
+```bash
+.
+└─ULC
+  └─src
+    ├─alternate_train.py             # alternating training loop of ULC
+    ├─data.py                        # data processing
+    ├─loss.py                        # losses in ULC
+    ├─main.py                        # train ULC
+    ├─metrics.py                     # metrics in ULC
+    ├─models.py                      # ULC model structures
+    └─utils.py                       # helper modules in ULC
+```
+
+## [Training Process](#contents)
+
+### Training
+
+- running on CPU
+
+  ```bash
+  python ./src/main.py --method ULC --l2_reg 0.00001 --cuda_device 0 --lr 0.0001 --CD 7 --batch_size 1024 --optimizer Adam --seed 0
+  ```
+
+# [ModelZoo Homepage](#contents)
+
+ Please check the official [homepage](https://gitee.com/mindspore/models).
\ No newline at end of file
diff --git a/research/recommend/ULC/src/alternate_train.py b/research/recommend/ULC/src/alternate_train.py
new file mode 100644
index 0000000000000000000000000000000000000000..8a2945c01705461ae72b6e6a58a0b7418b7330bd
--- /dev/null
+++ b/research/recommend/ULC/src/alternate_train.py
@@ -0,0 +1,234 @@
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================ + +from copy import deepcopy + +from models import get_model +from loss import get_loss_fn +from utils import get_optimizer +from metrics import cal_llloss_with_logits, cal_auc, cal_prauc +from data import get_criteo_dataset, RandomAccessDataset +from tqdm import tqdm +import numpy as np +import mindspore.dataset as ds +import mindspore.nn as nn +import mindspore + +def test(model, test_data, params): + all_logits = [] + all_probs = [] + all_labels = [] + model.set_train(False) + + for batch in tqdm(test_data): + batch_x = batch[0] + batch_y = batch[1] + logits = model(batch_x) + all_logits.append(logits.asnumpy()) + all_labels.append(batch_y.asnumpy()) + all_probs.append(nn.Sigmoid()(logits).asnumpy()) + + all_logits = np.reshape(np.concatenate(all_logits, axis=0), (-1,)) + all_labels = np.reshape(np.concatenate(all_labels, axis=0), (-1,)) + all_probs = np.reshape(np.concatenate(all_probs, axis=0), (-1,)) + llloss = cal_llloss_with_logits(all_labels, all_logits) + auc = cal_auc(all_labels, all_probs) + prauc = cal_prauc(all_labels, all_probs) + return auc, prauc, llloss + +def get_valid_llloss_blc(model, test_data, correction_model): + all_logits = [] + all_labels = [] + model.set_train(False) + + for batch in tqdm(test_data): + batch_x = batch[0] + batch_y = batch[1] + logits0 = correction_model(batch_x) + corrections = (nn.Sigmoid()(logits0)).flatten() + corrections = mindspore.numpy.where(batch_y < 1, corrections, batch_y.float()) + logits = model(batch_x) + all_logits.append(logits.asnumpy()) + all_labels.append(corrections.asnumpy()) + + all_logits = np.reshape(np.concatenate(all_logits, axis=0), (-1,)) + all_labels = np.reshape(np.concatenate(all_labels, axis=0), (-1,)) + + + llloss = cal_llloss_with_logits(all_labels, all_logits) + return llloss + +def alternate_run(params, wandb): + cvr_model = None + sub_model = None + dataset = get_criteo_dataset(params) + sub_params = deepcopy(params) + + sub_params["dataset"] = "fsiwsg_cd_"+str(params["CD"])\ + +"_end_"+str(params["training_end_day"])+"_seed_"+str(params["seed"]) + np.random.seed(params["seed"]) + sub_dataset = get_criteo_dataset(sub_params)["train"] + np.random.seed(params["seed"]) + + params["log_step"] = 0 + params["idx"] = 1 + for _ in range(2): + sub_model = sub_train(cvr_model, sub_dataset, params) + cvr_model = cvr_train(sub_model, dataset, params, wandb) + params["idx"] += 1 + +def sub_train(cvr_model, sub_dataset, params): + train_data_x = sub_dataset["x"].to_numpy().astype(np.float32) + train_data_label = sub_dataset["labels"] + train_data_label = 1 - train_data_label + train_data = RandomAccessDataset(train_data_x, train_data_label) + train_data_loader = ds.GeneratorDataset(source=train_data, shuffle=True, column_names=['feature', 'label']) + train_data_loader = train_data_loader.batch(batch_size=params["batch_size"]) + + model = get_model("MLP_FSIW", params) + + if cvr_model is not None: + sd = cvr_model.parameters_dict() + part_sd = {k: v for k, v in sd.items() if ("category_embeddings" in k) or ("numeric_embeddings" in k)} + model_dict = model.parameters_dict() + model_dict.update(part_sd) + mindspore.load_param_into_net(model, model_dict) + + optimizer = nn.Adam(params=model.trainable_params(), learning_rate=0.001, weight_decay=0) + loss_fn = get_loss_fn("cross_entropy_loss") + + def forward_fn(data, label): + outputs = model(data) + targets = {"label": label} + loss_dict = loss_fn(targets, outputs, params) + loss = loss_dict["loss"] 
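+        # the closure returns just the scalar loss: mindspore.value_and_grad
+        # below differentiates it w.r.t. optimizer.parameters, and `label` is
+        # the inverted delay label prepared at the top of sub_train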
+ return loss + + grad_fn = mindspore.value_and_grad(forward_fn, None, optimizer.parameters) + + for _ in range(5): + for batch in train_data_loader: + batch_x = batch[0] + batch_y = batch[1][:, 0] + targets = {"label": batch_y} + + model.set_train(True) + _, grads = grad_fn(batch_x, targets["label"]) + + optimizer(grads) + + return model + +def cvr_train(sub_model, datasets, params, wandb): + model = get_model("MLP_SIG", params) + models = {"model": model, "submodel": sub_model} + + optimizer = get_optimizer(models["model"].trainable_params(), params["optimizer"], params) + + train_dataset = datasets["train"] + train_data_x = train_dataset["x"].to_numpy().astype(np.float32) + train_data_label = train_dataset["labels"] + train_data = RandomAccessDataset(train_data_x, train_data_label) + train_data_loader = ds.GeneratorDataset(source=train_data, shuffle=True, column_names=['feature', 'label']) + train_data_loader = train_data_loader.batch(batch_size=params["batch_size"]) + + valid_dataset = datasets["valid"] + valid_data_x = valid_dataset["x"].to_numpy().astype(np.float32) + valid_data_label = valid_dataset["labels"] + valid_data = RandomAccessDataset(valid_data_x, valid_data_label) + valid_data_loader = ds.GeneratorDataset(source=valid_data, column_names=['feature', 'label']) + valid_data_loader = valid_data_loader.batch(batch_size=params["batch_size"]) + + test_dataset = datasets["test"] + test_data_x = test_dataset["x"].to_numpy().astype(np.float32) + test_data_label = test_dataset["labels"] + test_data = RandomAccessDataset(test_data_x, test_data_label) + test_data_loader = ds.GeneratorDataset(source=test_data, column_names=['feature', 'label']) + test_data_loader = test_data_loader.batch(batch_size=params["batch_size"]) + + data_loaders = { + "train_data": train_data_loader, + "test_data": test_data_loader, + "valid_data": valid_data_loader + } + optimizers = { + "optimizer": optimizer + } + + + return train(models, optimizers, data_loaders, params, wandb) + + +def train(models, optimizers, data_loaders, params, wandb): + train_data = data_loaders["train_data"] + valid_data = data_loaders["valid_data"] + test_data = data_loaders["test_data"] + best_model = None + + optimizer = optimizers["optimizer"] + + loss_fn = get_loss_fn(params["loss"]) + val_llloss = [] + test_auc, test_prauc, test_llloss = [], [], [] + + def forward_fn(data, label): + outputs = models["model"](data) + logits0 = models["submodel"](data) + correction_label = nn.Sigmoid()(logits0).flatten() + label = mindspore.numpy.where(label < 1, correction_label, label.float()) + targets = {"label": label} + loss_dict = loss_fn(targets, outputs, params) + loss = loss_dict["loss"] + + return loss + + grad_fn = mindspore.value_and_grad(forward_fn, None, optimizer.parameters) + + for ep in range(params["train_epoch"]): + vllloss = get_valid_llloss_blc(models["model"], valid_data, models["submodel"]) + print("Val ep{}, llloss {}".format(ep, vllloss)) + tauc, tprauc, tllloss = test(models["model"], test_data, params) + print("Test ep{}, auc {}, prauc {}, llloss {}".format(ep, tauc, tprauc, tllloss)) + + if not val_llloss or vllloss < min(val_llloss): + best_model = models["model"].parameters_dict() + + val_llloss.append(vllloss) + test_auc.append(tauc) + test_prauc.append(tprauc) + test_llloss.append(tllloss) + + if len(val_llloss) - val_llloss.index(min(val_llloss)) > params["early_stop"]: + best_ep = val_llloss.index(min(val_llloss)) + print("Early stop at ep {}. Best ep {}. 
Best val_lloss {}.".format(ep, best_ep, min(val_llloss))) + print("Final test evaluation: auc {}, prauc {}, llloss {}."\ + .format(test_auc[best_ep], test_prauc[best_ep], test_llloss[best_ep])) + break + train_loss = [] + for batch in tqdm(train_data): + batch_x = batch[0] + batch_y = batch[1] + + models["model"].set_train(True) + models["submodel"].set_train(False) + loss, grads = grad_fn(batch_x, batch_y) + + train_loss.append(loss.asnumpy()) + optimizer(grads) + params["log_step"] += 1 + print("Train ep{}, loss {}".format(ep, np.mean(train_loss))) + + mindspore.load_param_into_net(models["model"], best_model) + return models["model"] diff --git a/research/recommend/ULC/src/data.py b/research/recommend/ULC/src/data.py new file mode 100644 index 0000000000000000000000000000000000000000..b15ee49d497c85af4b05a1500e1e552fb9e0b1af --- /dev/null +++ b/research/recommend/ULC/src/data.py @@ -0,0 +1,284 @@ +# Copyright 2023 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +import copy +import os +import pickle +import datetime +import pandas as pd +import numpy as np + +from sklearn.preprocessing import LabelEncoder + +from utils import parse_float_arg + +SECONDS_A_DAY = 60 * 60 * 24 +SECONDS_AN_HOUR = 60 * 60 +SECONDS_DELAY_NORM = 1 +SECONDS_FSIW_NORM = SECONDS_A_DAY * 5 +num_bin_size = (64, 16, 128, 64, 128, 64, 512, 512) + + +class RandomAccessDataset: + def __init__(self, data, label): + self._data = np.array(data) + self._label = np.array(label) + + def __getitem__(self, index): + return (self._data[index], self._label[index]) + + def __len__(self): + return len(self._data) + + +def get_data_df(params): + if params["dataset_source"] in ["criteo"]: + df = pd.read_csv(params["data_path"], sep="\t", header=None) + click_ts = df[df.columns[0]].to_numpy() + pay_ts = df[df.columns[1]].fillna(-1).to_numpy() + + if params["dataset_source"] == "criteo": + df = df[df.columns[2:]] + for c in df.columns[8:]: + df[c] = df[c].fillna("") + df[c] = df[c].astype(str) + + label_encoder = LabelEncoder() + for c in df.columns[8:]: + df[c] = label_encoder.fit_transform(df[c]) + + for i, c in enumerate(df.columns[:8]): + df[c] = df[c].fillna(-1) + df[c] = (df[c] - df[c].min()) / (df[c].max() - df[c].min()) + df[c] = np.floor(df[c] * (num_bin_size[i] - 0.00001)).astype(str) + df.columns = [str(i) for i in range(17)] + return df, click_ts, pay_ts + + +class DataDF: + + def __init__(self, features, click_ts, pay_ts, sample_ts=None, labels=None, delay_label=None): + self.x = features.copy(deep=True) + self.click_ts = copy.deepcopy(click_ts) + self.pay_ts = copy.deepcopy(pay_ts) + self.delay_label = delay_label + if sample_ts is not None: + self.sample_ts = copy.deepcopy(sample_ts) + else: + self.sample_ts = copy.deepcopy(click_ts) + if labels is not None: + self.labels = copy.deepcopy(labels) + else: + self.labels = (pay_ts > 0).astype(np.int32) + + def sub_days(self, start_day, end_day): + start_ts = start_day * 
SECONDS_A_DAY
+        end_ts = end_day * SECONDS_A_DAY
+        mask = np.logical_and(self.sample_ts >= start_ts,
+                              self.sample_ts < end_ts)
+        return DataDF(self.x.iloc[mask],
+                      self.click_ts[mask],
+                      self.pay_ts[mask],
+                      self.sample_ts[mask],
+                      self.labels[mask])
+
+    def to_fsiw_1(self, cd, T):  # build pre-training dataset 1 of FSIW
+        mask = np.logical_and(self.click_ts < T - cd, self.pay_ts > 0)
+        mask = np.logical_and(mask, self.pay_ts < T)
+        x = self.x.iloc[mask].copy(deep=True)
+        pay_ts = self.pay_ts[mask]
+        click_ts = self.click_ts[mask]
+        sample_ts = self.click_ts[mask]
+        label = np.zeros((x.shape[0],))
+        label[pay_ts < T - cd] = 1
+        # FSIW needs elapsed time information
+        x.insert(x.shape[1], column="elapse", value=(
+            T - click_ts - cd) / SECONDS_FSIW_NORM)
+        return DataDF(x,
+                      click_ts,
+                      pay_ts,
+                      sample_ts,
+                      label)
+
+    def to_fsiw_0(self, cd, T):  # build pre-training dataset 0 of FSIW
+        mask = np.logical_or(self.pay_ts >= T - cd, self.pay_ts < 0)
+        mask = np.logical_or(mask, self.pay_ts > T)
+        mask = np.logical_and(self.click_ts < T - cd, mask)
+        x = self.x.iloc[mask].copy(deep=True)
+        pay_ts = self.pay_ts[mask]
+        click_ts = self.click_ts[mask]
+        sample_ts = self.sample_ts[mask]
+        label = np.zeros((x.shape[0],))
+        label[np.logical_or(pay_ts < 0, pay_ts > T)] = 1
+        x.insert(x.shape[1], column="elapse", value=(
+            T - click_ts - cd) / SECONDS_FSIW_NORM)
+        return DataDF(x,
+                      click_ts,
+                      pay_ts,
+                      sample_ts,
+                      label)
+
+    def shuffle(self):
+        idx = list(range(self.x.shape[0]))
+        np.random.shuffle(idx)
+        return DataDF(self.x.iloc[idx],
+                      self.click_ts[idx],
+                      self.pay_ts[idx],
+                      self.sample_ts[idx],
+                      self.labels[idx])
+
+
+def get_criteo_dataset(params):
+    name = params["dataset"]
+    print("loading dataset {}".format(name))
+    cache_path = os.path.join(
+        params["data_cache_path"], "{}.pkl".format(name))
+    if params["data_cache_path"] != "None" and os.path.isfile(cache_path):
+        print("cache_path {}".format(cache_path))
+        print("\nloading from dataset cache")
+        with open(cache_path, "rb") as f:
+            data = pickle.load(f)
+        train_data = data["train"]
+        test_data = data["test"]
+        if "valid" in data:
+            valid_data = data["valid"]
+        if "clean" in data:
+            _ = data["clean"]
+        if "fn" in data:
+            fn_data = data["fn"]
+    else:
+        train_data, test_data, valid_data, fn_data = build_criteo_dataset(params, name, cache_path)
+    result = {
+        "train": {
+            "x": train_data.x,
+            "click_ts": train_data.click_ts,
+            "pay_ts": train_data.pay_ts,
+            "sample_ts": train_data.sample_ts,
+            "labels": train_data.labels,
+        },
+        "test": {
+            "x": test_data.x,
+            "click_ts": test_data.click_ts,
+            "pay_ts": test_data.pay_ts,
+            "sample_ts": test_data.sample_ts,
+            "labels": test_data.labels,
+        }
+    }
+    if ("next" in name) or ("oracle" in name):
+        result["valid"] = {
+            "x": valid_data.x,
+            "click_ts": valid_data.click_ts,
+            "pay_ts": valid_data.pay_ts,
+            "sample_ts": valid_data.sample_ts,
+            "labels": valid_data.labels,
+        }
+        result["fn"] = {
+            "x": fn_data.x,
+            "click_ts": fn_data.click_ts,
+            "pay_ts": fn_data.pay_ts,
+            "sample_ts": fn_data.sample_ts,
+            "labels": fn_data.labels,
+        }
+    return result
+
+
+def build_criteo_dataset(params, name, cache_path):
+    print("\nbuilding dataset")
+
+    starttime = datetime.datetime.now()
+    if params["dataset_source"] == "criteo":
+        source_cache_path = "./cache_data.pkl"
+        if os.path.isfile(source_cache_path):
+            with open(source_cache_path, "rb") as f:
+                data = pickle.load(f)
+        else:
+            df, click_ts, pay_ts = get_data_df(params)
+            data = DataDF(df, click_ts, pay_ts)
+            # cache the parsed criteo dataframe so later runs skip the
+            # expensive text parsing (protocol=4 supports objects over 4GB)
+            with open(source_cache_path, "wb") as f:
pickle.dump(data, f, protocol=4) + endtime = datetime.datetime.now() + print("Time:{}s".format((endtime - starttime).total_seconds())) + + if "fsiwsg" in name: + cd = parse_float_arg(name, "cd") + training_start = params["training_end_day"] - params["training_duration"] + train_data = data.sub_days(training_start, params["training_end_day"]).shuffle() + test_data = data.sub_days(params["training_end_day"], params["training_end_day"] + 1) + train_data = train_data.to_fsiw_0( + cd=cd * SECONDS_A_DAY, T=params["training_end_day"] * SECONDS_A_DAY) + cvrs = np.reshape(train_data.pay_ts > 0, (-1, 1)) + pot_cvr = np.reshape(train_data.pay_ts > params["training_end_day"] * SECONDS_A_DAY, (-1, 1)) + train_data.labels = np.reshape(train_data.labels, (-1, 1)) + train_data.labels = np.concatenate( + [train_data.labels, cvrs, pot_cvr], axis=1) + test_data = test_data.to_fsiw_0( + cd=cd * SECONDS_A_DAY, T=params["training_end_day"] * SECONDS_A_DAY) + elif "fsiw_next" in name: + cd = parse_float_arg(name, "cd") + training_start = params["training_end_day"] - params["training_duration"] + train_data = data.sub_days(training_start, params["training_end_day"]).shuffle() + mask = train_data.pay_ts > (params["training_end_day"] * SECONDS_A_DAY) + train_data.labels[mask] = 0 + train_data.x.insert(train_data.x.shape[1], column="elapse", value=(params[ + "training_end_day"] * SECONDS_A_DAY - train_data.click_ts) / SECONDS_FSIW_NORM) + fn_data = DataDF(train_data.x.iloc[mask], + train_data.click_ts[mask], + train_data.pay_ts[mask], + train_data.sample_ts[mask], + train_data.labels[mask]) + valid_data = data.sub_days(params["training_end_day"], + params["training_end_day"] + 1 * params["valid_test_size"]) + valid_data.x.insert(valid_data.x.shape[1], column="elapse", value=(params[ + "training_end_day"] * SECONDS_A_DAY - valid_data.click_ts) / SECONDS_FSIW_NORM) + val_mask = valid_data.pay_ts > ( + (params["training_end_day"] + 1 * params["valid_test_size"]) * SECONDS_A_DAY) + valid_data.labels[val_mask] = 0 + test_data = data.sub_days(params["training_end_day"] + 1 * params["valid_test_size"], + params["training_end_day"] + 2 * params["valid_test_size"]) + test_data.x.insert(test_data.x.shape[1], column="elapse", value=(params[ + "training_end_day"] * SECONDS_A_DAY - test_data.click_ts) / SECONDS_FSIW_NORM) + elif "oracle" in name: + cd = parse_float_arg(name, "cd") + training_start = params["training_end_day"] - params["training_duration"] + train_data = data.sub_days(training_start, params["training_end_day"]).shuffle() + train_data.x.insert(train_data.x.shape[1], column="elapse", value=(params[ + "training_end_day"] * SECONDS_A_DAY - train_data.click_ts) / SECONDS_FSIW_NORM) + + mask = train_data.pay_ts > (params["training_end_day"] * SECONDS_A_DAY) + fn_data = DataDF(train_data.x.iloc[mask], + train_data.click_ts[mask], + train_data.pay_ts[mask], + train_data.sample_ts[mask], + train_data.labels[mask]) + fn_data.labels[:] = 0 + + valid_data = data.sub_days(params["training_end_day"], + params["training_end_day"] + 1 * params["valid_test_size"]) + valid_data.x.insert(valid_data.x.shape[1], column="elapse", value=(params[ + "training_end_day"] * SECONDS_A_DAY - valid_data.click_ts) / SECONDS_FSIW_NORM) + test_data = data.sub_days(params["training_end_day"] + 1 * params["valid_test_size"], + params["training_end_day"] + 2 * params["valid_test_size"]) + test_data.x.insert(test_data.x.shape[1], column="elapse", value=(params[ + "training_end_day"] * SECONDS_A_DAY - test_data.click_ts) / SECONDS_FSIW_NORM) + else: + 
raise NotImplementedError("{} dataset does not exist".format(name)) + if params["data_cache_path"] != "None": + with open(cache_path, "wb") as f: + if ("next" in name) or ("oracle" in name): + pickle.dump({"train": train_data, "test": test_data, "valid": valid_data, "fn": fn_data}, f) + else: + pickle.dump({"train": train_data, "test": test_data}, f) + + return train_data, test_data, valid_data, fn_data diff --git a/benchmark/ascend/resnet/scripts/cache_util.sh b/research/recommend/ULC/src/loss.py similarity index 41% rename from benchmark/ascend/resnet/scripts/cache_util.sh rename to research/recommend/ULC/src/loss.py index a3aa77e54a8309e5f2e6ed63703a69d41cfc18ee..743d958334e186bec5f9aef14e5f48fd69696ed9 100644 --- a/benchmark/ascend/resnet/scripts/cache_util.sh +++ b/research/recommend/ULC/src/loss.py @@ -1,5 +1,4 @@ -#!/bin/bash -# Copyright 2021 Huawei Technologies Co., Ltd +# Copyright 2023 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,37 +12,25 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================ -bootup_cache_server() -{ - echo "Booting up cache server..." - result=$(cache_admin --start 2>&1) - rc=$? - echo "${result}" - if [ "${rc}" -ne 0 ] && [[ ! ${result} =~ "Cache server is already up and running" ]]; then - echo "cache_admin command failure!" "${result}" - exit 1 - fi -} -generate_cache_session() -{ - result=$(cache_admin -g | awk 'END {print $NF}') - rc=$? - echo "${result}" - if [ "${rc}" -ne 0 ]; then - echo "cache_admin command failure!" "${result}" - exit 1 - fi -} +import mindspore.numpy as np +import mindspore.ops as ops +import mindspore -shutdown_cache_server() -{ - echo "Shutting down cache server..." - result=$(cache_admin --stop 2>&1) - rc=$? - echo "${result}" - if [ "${rc}" -ne 0 ] && [[ ! ${result} =~ "Server on port 50052 is not reachable or has been shutdown already" ]]; then - echo "cache_admin command failure!" "${result}" - exit 1 - fi -} + +def stable_log1pex(x): + return -np.where(x < 0, x, np.zeros_like(x)) + np.log(1 + np.exp(-np.absolute(x))) + +def cross_entropy_loss(targets, outputs, params=None): + z = targets["label"] + x = outputs + x = ops.Reshape()(x, (-1,)) + z = z.float() + loss_value = ops.binary_cross_entropy_with_logits(x, z, mindspore.Tensor([1.0]), mindspore.Tensor([1.0])) + + return {"loss": loss_value} + +def get_loss_fn(name): + if name == "cross_entropy_loss": + return cross_entropy_loss + raise NotImplementedError("{} loss does not implemented".format(name)) diff --git a/research/recommend/ULC/src/main.py b/research/recommend/ULC/src/main.py new file mode 100644 index 0000000000000000000000000000000000000000..406db7bdab98adbfd93d92479ab124eebca845dc --- /dev/null +++ b/research/recommend/ULC/src/main.py @@ -0,0 +1,79 @@ +# Copyright 2023 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +import argparse +import os +import pathlib +from copy import deepcopy + +import numpy as np + +from alternate_train import alternate_run + +wandb = None + + +def run_params(args): + params = deepcopy(vars(args)) + params["model"] = "MLP_SIG" + if args.data_cache_path != "None": + pathlib.Path(args.data_cache_path).mkdir(parents=True, exist_ok=True) + + if args.method == "ULC": + params["loss"] = "cross_entropy_loss" + params["dataset"] = "last_30_train_test_fsiw_next" + "_end_" + str(args.training_end_day) + "_seed_" + str( + args.seed) + + return params + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--method", choices=["ULC"], + type=str, required=True) + parser.add_argument("--seed", type=int, default=0) + parser.add_argument("--dataset_source", type=str, default="criteo", choices=["criteo"]) + parser.add_argument("--CD", type=int, default=7, + help="interval between counterfactual deadline and actual deadline") + parser.add_argument("--lr", type=float, default=1e-3) + parser.add_argument("--data_path", type=str, default="./data/data.txt", + help="path of the data.txt in criteo dataset") + parser.add_argument("--data_cache_path", type=str, default="./data") + parser.add_argument("--batch_size", type=int, + default=1024) + parser.add_argument("--epoch", type=int, default=5, + help="training epoch of pretraining") + parser.add_argument("--l2_reg", type=float, default=0, + help="l2 regularizer strength") + parser.add_argument("--training_end_day", type=int, default=58, + help="deadline for training data") + parser.add_argument("--training_duration", type=int, default=21, + help="duration of training data") + parser.add_argument("--valid_test_size", type=float, default=1, + help="duration of valid/test data") + parser.add_argument("--train_epoch", type=int, default=100, + help="max train epoch") + parser.add_argument("--early_stop", type=int, default=4) + parser.add_argument("--cuda_device", type=str, default="0") + parser.add_argument("--optimizer", type=str, default="Adam") + parser.add_argument("--save_model", type=int, default=0) + parser.add_argument("--base_model", type=str, default="MLP", choices=["MLP"]) + + args = parser.parse_args() + params = run_params(args) + os.environ["CUDA_VISIBLE_DEVICES"] = params["cuda_device"] + np.random.seed(args.seed) + + alternate_run(params, wandb) diff --git a/research/recommend/ULC/src/metrics.py b/research/recommend/ULC/src/metrics.py new file mode 100644 index 0000000000000000000000000000000000000000..274a0e5080840871819d0589ecc9f7132dcb5500 --- /dev/null +++ b/research/recommend/ULC/src/metrics.py @@ -0,0 +1,73 @@ +# Copyright 2023 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================
+
+from sklearn import metrics
+import numpy as np
+
+def sigmoid(x):
+    # clip the exponent so np.exp cannot overflow; sigmoid saturates long before +/-500
+    return 1/(1+np.exp(np.clip(-x, a_min=-500, a_max=500)))
+
+def cal_auc(label, pos_prob):
+    fpr, tpr, _ = metrics.roc_curve(label, pos_prob, pos_label=1)
+    auc = metrics.auc(fpr, tpr)
+    return auc
+
+def stable_log1pex(x):
+    # numerically stable log(1 + exp(-x)), i.e. -log(sigmoid(x))
+    return -np.minimum(x, 0) + np.log(1+np.exp(-np.abs(x)))
+
+def cal_llloss_with_logits(label, logits):
+    ll = -np.mean(label*(-stable_log1pex(logits)) + (1-label)*(-logits - stable_log1pex(logits)))
+    return ll
+
+def cal_llloss_with_logits_and_weight(label, logits, logits0, logits1):
+    x = logits
+
+    pos_loss = stable_log1pex(x)
+    neg_loss = x + stable_log1pex(x)
+
+    pos_weight = 1/(logits1+1e-8)
+    neg_weight = logits0
+
+    clf_loss = np.mean(
+        pos_loss*pos_weight*label + neg_loss*neg_weight*(1-label))
+
+    weight = np.mean(pos_weight*label + neg_weight*(1-label))
+
+    return clf_loss/weight
+
+def prob_clip(x):
+    return np.clip(x, a_min=1e-20, a_max=1)
+
+def cal_llloss_with_neg_log_prob(label, neg_log_prob):
+    ll = -np.mean((1-label)*neg_log_prob + label*(np.log(prob_clip(1 - prob_clip(np.exp(neg_log_prob))))))
+    return ll
+
+def cal_llloss_with_prob(label, prob):
+    ll = -np.mean(label*np.log(prob_clip(prob)) + (1-label)*(np.log(prob_clip(1-prob))))
+    return ll
+
+def cal_prauc(label, pos_prob):
+    precision, recall, _ = metrics.precision_recall_curve(label, pos_prob)
+    area = metrics.auc(recall, precision)
+    return area
+
+def cal_acc(label, prob):
+    label = np.reshape(label, (-1,))
+    prob = np.reshape(prob, (-1,))
+    prob_acc = np.mean(label*prob)
+    return prob_acc
+
+def stable_softplus(x):
+    return np.log(1 + np.exp(-np.abs(x))) + np.maximum(x, 0)
diff --git a/research/recommend/ULC/src/models.py b/research/recommend/ULC/src/models.py
new file mode 100644
index 0000000000000000000000000000000000000000..d8d63ab2575f88533668da2d7b68cb020ebde6c5
--- /dev/null
+++ b/research/recommend/ULC/src/models.py
@@ -0,0 +1,103 @@
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================ + +from mindspore import nn +from mindspore import ops + +class MLP(nn.Cell): + def __init__(self, name, params): + super(MLP, self).__init__() + self.model_name = name + self.params = params + self.dataset_source = params["dataset_source"] + if params["dataset_source"] == "criteo": + self.category_embeddings = nn.CellList([ + nn.Embedding(55824, 64), + nn.Embedding(5443, 64), + nn.Embedding(13073, 64), + nn.Embedding(13170, 64), + nn.Embedding(3145, 64), + nn.Embedding(33843, 64), + nn.Embedding(14304, 64), + nn.Embedding(11, 64), + nn.Embedding(13601, 64) + ]) + + self.numeric_embeddings = nn.CellList([ + nn.Embedding(64, 64), + nn.Embedding(16, 64), + nn.Embedding(128, 64), + nn.Embedding(64, 64), + nn.Embedding(128, 64), + nn.Embedding(64, 64), + nn.Embedding(512, 64), + nn.Embedding(512, 64) + ]) + presize = 1088 + + if name == "MLP_FSIW": + print("using elapse feature") + presize += 1 + + self.mlp = nn.CellList([ + nn.Dense(presize, 256, activation='leakyrelu'), + nn.Dense(256, 256, activation='leakyrelu'), + nn.Dense(256, 128, activation='leakyrelu') + ]) + + if self.model_name in ["MLP_SIG", "MLP_FSIW"]: + self.mlp.append(nn.Dense(128, 1)) + else: + raise ValueError("model name {} not exist".format(name)) + + def construct(self, x): + if self.dataset_source == "criteo": + cate_embeddings = [] + nume_embeddings = [] + if self.model_name == "MLP_FSIW": + for i in range(8): + nume_embeddings.append(self.numeric_embeddings[i](x[:, i].int())) + + for i in range(9): + cate_embeddings.append(self.category_embeddings[8 - i](x[:, -i - 2].int())) + + features = nume_embeddings + cate_embeddings + [x[:, -1:]] + x = ops.Concat(axis=1)(features) + else: + for i in range(8): + nume_embeddings.append(self.numeric_embeddings[i](x[:, i].int())) + + for i in range(9): + cate_embeddings.append(self.category_embeddings[8 - i](x[:, -i - 2].int())) + + features = nume_embeddings + cate_embeddings + x = ops.Concat(axis=1)(features) + + for layer in self.mlp: + x = layer(x) + + if self.model_name in ["MLP_SIG", "MLP_FSIW"]: + return x + raise NotImplementedError() + +def get_model(name, params): + if name in ["MLP_tn_dp", "MLP_FSIW"]: + return MLP(name, params) + if name == "MLP_SIG": + if params["base_model"] == "MLP": + return MLP(name, params) + else: + raise NotImplementedError() + return 0 diff --git a/benchmark/ascend/resnet/create_imagenet2012_label.py b/research/recommend/ULC/src/utils.py similarity index 37% rename from benchmark/ascend/resnet/create_imagenet2012_label.py rename to research/recommend/ULC/src/utils.py index c0c102c92250b4e21634fb3f7f37c69009fa1f95..b5aca4c044eea0cfdeb5077537e9576e121aab8d 100644 --- a/benchmark/ascend/resnet/create_imagenet2012_label.py +++ b/research/recommend/ULC/src/utils.py @@ -1,4 +1,4 @@ -# Copyright 2021 Huawei Technologies Co., Ltd +# Copyright 2023 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,40 +12,23 @@ # See the License for the specific language governing permissions and # limitations under the License. 
 # ============================================================================
-"""create_imagenet2012_label"""
-import os
-import json
-import argparse
-parser = argparse.ArgumentParser(description="resnet imagenet2012 label")
-parser.add_argument("--img_path", type=str, required=True, help="imagenet2012 file path.")
-args = parser.parse_args()
+import re

+import mindspore.nn as nn

-def create_label(file_path):
-    '''
-    Create image_label.json from image files.
-    '''
-    print("[WARNING] Create imagenet label. Currently only use for Imagenet2012!")
-    dirs = os.listdir(file_path)
-    file_list = []
-    for file in dirs:
-        file_list.append(file)
-    file_list = sorted(file_list)
+def get_optimizer(p, name, params):
+    if name == "Adam":
+        return nn.Adam(params=p, learning_rate=params["lr"], weight_decay=params["l2_reg"])
+    raise NotImplementedError("{} optimizer is not implemented".format(name))

-    total = 0
-    img_label = {}
-    for i, file_dir in enumerate(file_list):
-        files = os.listdir(os.path.join(file_path, file_dir))
-        for f in files:
-            img_label[f] = i
-        total += len(files)
-    with open("imagenet_label.json", "w+") as label:
-        json.dump(img_label, label)
-
-    print("[INFO] Completed! Total {} data.".format(total))
-
-
-if __name__ == '__main__':
-    create_label(args.img_path)
+def parse_float_arg(text, prefix):
+    # find "<prefix>_<float>" in a dataset tag, e.g. "cd_7" in "fsiwsg_cd_7_end_58"
+    p = re.compile(prefix + "_[+-]?([0-9]*[.])?[0-9]+")
+    m = p.search(text)
+    if m is None:
+        return None
+    # strip the prefix and keep only the numeric part
+    p = re.compile("[+-]?([0-9]*[.])?[0-9]+")
+    m = p.search(m.group())
+    return float(m.group())
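A quick behavioural check of `parse_float_arg` above, using dataset tags of the form built in src/alternate_train.py and src/main.py (illustrative, and assuming the function is importable from src/utils.py):

```python
from utils import parse_float_arg

print(parse_float_arg("fsiwsg_cd_7_end_58_seed_0", "cd"))  # 7.0
print(parse_float_arg("oracle_cd_0.25_end_58", "cd"))      # 0.25
print(parse_float_arg("no_matching_prefix", "cd"))         # None
```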