From aa6efd857b038c73deb37cf22ae510810fa8c03b Mon Sep 17 00:00:00 2001
From: j00542052
Date: Tue, 2 Jan 2024 17:48:43 +0800
Subject: [PATCH 01/44] KDD2023 ULC

---
 research/recommend/ULC/README.md              |  90 ++++++
 research/recommend/ULC/src/alternate_train.py | 234 +++++++++++++++
 research/recommend/ULC/src/data.py            | 284 ++++++++++++++++++
 research/recommend/ULC/src/loss.py            |  36 +++
 research/recommend/ULC/src/main.py            |  79 +++++
 research/recommend/ULC/src/metrics.py         |  73 +++++
 research/recommend/ULC/src/models.py          | 103 +++++++
 research/recommend/ULC/src/utils.py           |  34 +++
 8 files changed, 933 insertions(+)
 create mode 100644 research/recommend/ULC/README.md
 create mode 100644 research/recommend/ULC/src/alternate_train.py
 create mode 100644 research/recommend/ULC/src/data.py
 create mode 100644 research/recommend/ULC/src/loss.py
 create mode 100644 research/recommend/ULC/src/main.py
 create mode 100644 research/recommend/ULC/src/metrics.py
 create mode 100644 research/recommend/ULC/src/models.py
 create mode 100644 research/recommend/ULC/src/utils.py

diff --git a/research/recommend/ULC/README.md b/research/recommend/ULC/README.md
new file mode 100644
index 000000000..8b46f25d0
--- /dev/null
+++ b/research/recommend/ULC/README.md
@@ -0,0 +1,90 @@
+
+# Contents
+
+- [Contents](#contents)
+- [ULC Description](#ulc-description)
+- [Dataset](#dataset)
+- [Environment Requirements](#environment-requirements)
+- [Quick Start](#quick-start)
+- [Script Description](#script-description)
+    - [Script and Sample Code](#script-and-sample-code)
+    - [Training Process](#training-process)
+        - [Training](#training)
+- [ModelZoo Homepage](#modelzoo-homepage)
+
+# [ULC Description](#contents)
+
+Conversion rate prediction is critical to many online applications such as digital display advertising. To capture
+dynamic data distribution, industrial systems often require retraining models on recent data daily or weekly. However,
+the delay of conversion behavior usually leads to incorrect labeling, which is known as the delayed feedback problem.
+Existing work may fail to introduce the correct information about false negative samples due to data sparsity and
+dynamic data distribution. To directly introduce the correct feedback label information, we propose an Unbiased delayed
+feedback Label Correction framework (ULC), which uses an auxiliary model to correct labels for observed negative
+feedback samples. Firstly, we theoretically prove that the label-corrected loss is an unbiased estimate of the oracle
+loss using true labels. Then, as there are no ready-made training data for label correction, counterfactual labeling is
+used to construct artificial training data. Furthermore, since counterfactual labeling utilizes only partial training
+data, we design an embedding-based alternative training method to enhance performance. Comparative experiments on both
+public and private datasets, together with detailed analyses, show that our proposed approach effectively alleviates
+the delayed feedback problem and consistently outperforms the previous state-of-the-art methods.
+
+A preprint version of our paper is available at http://arxiv.org/abs/2307.12756.
+
+# [Dataset](#contents)
+
+- [Criteo dataset](https://drive.google.com/file/d/1x4KktfZtls9QjNdFYKCjTpfjM4tG2PcK/view?usp=sharing)
+
+# [Environment Requirements](#contents)
+
+- Hardware(CPU)
+    - Prepare hardware environment with a CPU processor.
+- Framework
+    - [MindSpore-2.0.0](https://www.mindspore.cn/install/en)
+
+- Requirements
+
+```shell
+
+  $ conda create --name <env_name> --file requirements.txt
+
+```
+
+- For more information, please check the resources below:
+    - [MindSpore Tutorials](https://www.mindspore.cn/tutorials/en/r2.0/index.html)
+    - [MindSpore Python API](https://www.mindspore.cn/docs/en/r2.0/index.html)
+
+# [Quick Start](#contents)
+
+After installing MindSpore via the official website, you can start training and evaluation as follows:
+
+- process the dataset (see [Dataset](#dataset)), then launch training as described in [Training Process](#training-process)
+
+# [Script Description](#contents)
+
+## [Script and Sample Code](#contents)
+
+```bash
+.
+└─ULC
+  └─src
+    ├─alternate_train.py  # alternating training in ULC
+    ├─data.py             # data processing
+    ├─loss.py             # loss in ULC
+    ├─main.py             # train ULC
+    ├─metrics.py          # metrics in ULC
+    ├─models.py           # ULC structure
+    └─utils.py            # utility functions in ULC
+```
+
+## [Training Process](#contents)
+
+### Training
+
+- running on CPU
+
+  ```bash
+  python ./src/main.py --method ULC --l2_reg 0.00001 --cuda_device 0 --lr 0.0001 --CD 7 --batch_size 1024 --optimizer Adam --seed 0
+  ```
+
+# [ModelZoo Homepage](#contents)
+
+Please check the official [homepage](https://gitee.com/mindspore/models)
\ No newline at end of file
diff --git a/research/recommend/ULC/src/alternate_train.py b/research/recommend/ULC/src/alternate_train.py
new file mode 100644
index 000000000..8a2945c01
--- /dev/null
+++ b/research/recommend/ULC/src/alternate_train.py
@@ -0,0 +1,234 @@
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================ + +from copy import deepcopy + +from models import get_model +from loss import get_loss_fn +from utils import get_optimizer +from metrics import cal_llloss_with_logits, cal_auc, cal_prauc +from data import get_criteo_dataset, RandomAccessDataset +from tqdm import tqdm +import numpy as np +import mindspore.dataset as ds +import mindspore.nn as nn +import mindspore + +def test(model, test_data, params): + all_logits = [] + all_probs = [] + all_labels = [] + model.set_train(False) + + for batch in tqdm(test_data): + batch_x = batch[0] + batch_y = batch[1] + logits = model(batch_x) + all_logits.append(logits.asnumpy()) + all_labels.append(batch_y.asnumpy()) + all_probs.append(nn.Sigmoid()(logits).asnumpy()) + + all_logits = np.reshape(np.concatenate(all_logits, axis=0), (-1,)) + all_labels = np.reshape(np.concatenate(all_labels, axis=0), (-1,)) + all_probs = np.reshape(np.concatenate(all_probs, axis=0), (-1,)) + llloss = cal_llloss_with_logits(all_labels, all_logits) + auc = cal_auc(all_labels, all_probs) + prauc = cal_prauc(all_labels, all_probs) + return auc, prauc, llloss + +def get_valid_llloss_blc(model, test_data, correction_model): + all_logits = [] + all_labels = [] + model.set_train(False) + + for batch in tqdm(test_data): + batch_x = batch[0] + batch_y = batch[1] + logits0 = correction_model(batch_x) + corrections = (nn.Sigmoid()(logits0)).flatten() + corrections = mindspore.numpy.where(batch_y < 1, corrections, batch_y.float()) + logits = model(batch_x) + all_logits.append(logits.asnumpy()) + all_labels.append(corrections.asnumpy()) + + all_logits = np.reshape(np.concatenate(all_logits, axis=0), (-1,)) + all_labels = np.reshape(np.concatenate(all_labels, axis=0), (-1,)) + + + llloss = cal_llloss_with_logits(all_labels, all_logits) + return llloss + +def alternate_run(params, wandb): + cvr_model = None + sub_model = None + dataset = get_criteo_dataset(params) + sub_params = deepcopy(params) + + sub_params["dataset"] = "fsiwsg_cd_"+str(params["CD"])\ + +"_end_"+str(params["training_end_day"])+"_seed_"+str(params["seed"]) + np.random.seed(params["seed"]) + sub_dataset = get_criteo_dataset(sub_params)["train"] + np.random.seed(params["seed"]) + + params["log_step"] = 0 + params["idx"] = 1 + for _ in range(2): + sub_model = sub_train(cvr_model, sub_dataset, params) + cvr_model = cvr_train(sub_model, dataset, params, wandb) + params["idx"] += 1 + +def sub_train(cvr_model, sub_dataset, params): + train_data_x = sub_dataset["x"].to_numpy().astype(np.float32) + train_data_label = sub_dataset["labels"] + train_data_label = 1 - train_data_label + train_data = RandomAccessDataset(train_data_x, train_data_label) + train_data_loader = ds.GeneratorDataset(source=train_data, shuffle=True, column_names=['feature', 'label']) + train_data_loader = train_data_loader.batch(batch_size=params["batch_size"]) + + model = get_model("MLP_FSIW", params) + + if cvr_model is not None: + sd = cvr_model.parameters_dict() + part_sd = {k: v for k, v in sd.items() if ("category_embeddings" in k) or ("numeric_embeddings" in k)} + model_dict = model.parameters_dict() + model_dict.update(part_sd) + mindspore.load_param_into_net(model, model_dict) + + optimizer = nn.Adam(params=model.trainable_params(), learning_rate=0.001, weight_decay=0) + loss_fn = get_loss_fn("cross_entropy_loss") + + def forward_fn(data, label): + outputs = model(data) + targets = {"label": label} + loss_dict = loss_fn(targets, outputs, params) + loss = loss_dict["loss"] 
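+        # the loss function returns a dict of named terms; only the scalar
+        # "loss" entry is returned here and differentiated by value_and_grad below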
+ return loss + + grad_fn = mindspore.value_and_grad(forward_fn, None, optimizer.parameters) + + for _ in range(5): + for batch in train_data_loader: + batch_x = batch[0] + batch_y = batch[1][:, 0] + targets = {"label": batch_y} + + model.set_train(True) + _, grads = grad_fn(batch_x, targets["label"]) + + optimizer(grads) + + return model + +def cvr_train(sub_model, datasets, params, wandb): + model = get_model("MLP_SIG", params) + models = {"model": model, "submodel": sub_model} + + optimizer = get_optimizer(models["model"].trainable_params(), params["optimizer"], params) + + train_dataset = datasets["train"] + train_data_x = train_dataset["x"].to_numpy().astype(np.float32) + train_data_label = train_dataset["labels"] + train_data = RandomAccessDataset(train_data_x, train_data_label) + train_data_loader = ds.GeneratorDataset(source=train_data, shuffle=True, column_names=['feature', 'label']) + train_data_loader = train_data_loader.batch(batch_size=params["batch_size"]) + + valid_dataset = datasets["valid"] + valid_data_x = valid_dataset["x"].to_numpy().astype(np.float32) + valid_data_label = valid_dataset["labels"] + valid_data = RandomAccessDataset(valid_data_x, valid_data_label) + valid_data_loader = ds.GeneratorDataset(source=valid_data, column_names=['feature', 'label']) + valid_data_loader = valid_data_loader.batch(batch_size=params["batch_size"]) + + test_dataset = datasets["test"] + test_data_x = test_dataset["x"].to_numpy().astype(np.float32) + test_data_label = test_dataset["labels"] + test_data = RandomAccessDataset(test_data_x, test_data_label) + test_data_loader = ds.GeneratorDataset(source=test_data, column_names=['feature', 'label']) + test_data_loader = test_data_loader.batch(batch_size=params["batch_size"]) + + data_loaders = { + "train_data": train_data_loader, + "test_data": test_data_loader, + "valid_data": valid_data_loader + } + optimizers = { + "optimizer": optimizer + } + + + return train(models, optimizers, data_loaders, params, wandb) + + +def train(models, optimizers, data_loaders, params, wandb): + train_data = data_loaders["train_data"] + valid_data = data_loaders["valid_data"] + test_data = data_loaders["test_data"] + best_model = None + + optimizer = optimizers["optimizer"] + + loss_fn = get_loss_fn(params["loss"]) + val_llloss = [] + test_auc, test_prauc, test_llloss = [], [], [] + + def forward_fn(data, label): + outputs = models["model"](data) + logits0 = models["submodel"](data) + correction_label = nn.Sigmoid()(logits0).flatten() + label = mindspore.numpy.where(label < 1, correction_label, label.float()) + targets = {"label": label} + loss_dict = loss_fn(targets, outputs, params) + loss = loss_dict["loss"] + + return loss + + grad_fn = mindspore.value_and_grad(forward_fn, None, optimizer.parameters) + + for ep in range(params["train_epoch"]): + vllloss = get_valid_llloss_blc(models["model"], valid_data, models["submodel"]) + print("Val ep{}, llloss {}".format(ep, vllloss)) + tauc, tprauc, tllloss = test(models["model"], test_data, params) + print("Test ep{}, auc {}, prauc {}, llloss {}".format(ep, tauc, tprauc, tllloss)) + + if not val_llloss or vllloss < min(val_llloss): + best_model = models["model"].parameters_dict() + + val_llloss.append(vllloss) + test_auc.append(tauc) + test_prauc.append(tprauc) + test_llloss.append(tllloss) + + if len(val_llloss) - val_llloss.index(min(val_llloss)) > params["early_stop"]: + best_ep = val_llloss.index(min(val_llloss)) + print("Early stop at ep {}. Best ep {}. 
Best val_lloss {}.".format(ep, best_ep, min(val_llloss))) + print("Final test evaluation: auc {}, prauc {}, llloss {}."\ + .format(test_auc[best_ep], test_prauc[best_ep], test_llloss[best_ep])) + break + train_loss = [] + for batch in tqdm(train_data): + batch_x = batch[0] + batch_y = batch[1] + + models["model"].set_train(True) + models["submodel"].set_train(False) + loss, grads = grad_fn(batch_x, batch_y) + + train_loss.append(loss.asnumpy()) + optimizer(grads) + params["log_step"] += 1 + print("Train ep{}, loss {}".format(ep, np.mean(train_loss))) + + mindspore.load_param_into_net(models["model"], best_model) + return models["model"] diff --git a/research/recommend/ULC/src/data.py b/research/recommend/ULC/src/data.py new file mode 100644 index 000000000..b15ee49d4 --- /dev/null +++ b/research/recommend/ULC/src/data.py @@ -0,0 +1,284 @@ +# Copyright 2023 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +import copy +import os +import pickle +import datetime +import pandas as pd +import numpy as np + +from sklearn.preprocessing import LabelEncoder + +from utils import parse_float_arg + +SECONDS_A_DAY = 60 * 60 * 24 +SECONDS_AN_HOUR = 60 * 60 +SECONDS_DELAY_NORM = 1 +SECONDS_FSIW_NORM = SECONDS_A_DAY * 5 +num_bin_size = (64, 16, 128, 64, 128, 64, 512, 512) + + +class RandomAccessDataset: + def __init__(self, data, label): + self._data = np.array(data) + self._label = np.array(label) + + def __getitem__(self, index): + return (self._data[index], self._label[index]) + + def __len__(self): + return len(self._data) + + +def get_data_df(params): + if params["dataset_source"] in ["criteo"]: + df = pd.read_csv(params["data_path"], sep="\t", header=None) + click_ts = df[df.columns[0]].to_numpy() + pay_ts = df[df.columns[1]].fillna(-1).to_numpy() + + if params["dataset_source"] == "criteo": + df = df[df.columns[2:]] + for c in df.columns[8:]: + df[c] = df[c].fillna("") + df[c] = df[c].astype(str) + + label_encoder = LabelEncoder() + for c in df.columns[8:]: + df[c] = label_encoder.fit_transform(df[c]) + + for i, c in enumerate(df.columns[:8]): + df[c] = df[c].fillna(-1) + df[c] = (df[c] - df[c].min()) / (df[c].max() - df[c].min()) + df[c] = np.floor(df[c] * (num_bin_size[i] - 0.00001)).astype(str) + df.columns = [str(i) for i in range(17)] + return df, click_ts, pay_ts + + +class DataDF: + + def __init__(self, features, click_ts, pay_ts, sample_ts=None, labels=None, delay_label=None): + self.x = features.copy(deep=True) + self.click_ts = copy.deepcopy(click_ts) + self.pay_ts = copy.deepcopy(pay_ts) + self.delay_label = delay_label + if sample_ts is not None: + self.sample_ts = copy.deepcopy(sample_ts) + else: + self.sample_ts = copy.deepcopy(click_ts) + if labels is not None: + self.labels = copy.deepcopy(labels) + else: + self.labels = (pay_ts > 0).astype(np.int32) + + def sub_days(self, start_day, end_day): + start_ts = start_day * SECONDS_A_DAY + end_ts = end_day * SECONDS_A_DAY + mask = 
np.logical_and(self.sample_ts >= start_ts, + self.sample_ts < end_ts) + return DataDF(self.x.iloc[mask], + self.click_ts[mask], + self.pay_ts[mask], + self.sample_ts[mask], + self.labels[mask]) + + def to_fsiw_1(self, cd, T): # build pre-training dataset 1 of FSIW + mask = np.logical_and(self.click_ts < T - cd, self.pay_ts > 0) + mask = np.logical_and(mask, self.pay_ts < T) + x = self.x.iloc[mask].copy(deep=True) + pay_ts = self.pay_ts[mask] + click_ts = self.click_ts[mask] + sample_ts = self.click_ts[mask] + label = np.zeros((x.shape[0],)) + label[pay_ts < T - cd] = 1 + # FSIW needs elapsed time information + x.insert(x.shape[1], column="elapse", value=( + T - click_ts - cd) / SECONDS_FSIW_NORM) + return DataDF(x, + click_ts, + pay_ts, + sample_ts, + label) + + def to_fsiw_0(self, cd, T): # build pre-training dataset 0 of FSIW + mask = np.logical_or(self.pay_ts >= T - cd, self.pay_ts < 0) + mask = np.logical_or(mask, self.pay_ts > T) + mask = np.logical_and(self.click_ts < T - cd, mask) + x = self.x.iloc[mask].copy(deep=True) + pay_ts = self.pay_ts[mask] + click_ts = self.click_ts[mask] + sample_ts = self.sample_ts[mask] + label = np.zeros((x.shape[0],)) + label[np.logical_or(pay_ts < 0, pay_ts > T)] = 1 + x.insert(x.shape[1], column="elapse", value=( + T - click_ts - cd) / SECONDS_FSIW_NORM) + return DataDF(x, + click_ts, + pay_ts, + sample_ts, + label) + + def shuffle(self): + idx = list(range(self.x.shape[0])) + np.random.shuffle(idx) + return DataDF(self.x.iloc[idx], + self.click_ts[idx], + self.pay_ts[idx], + self.sample_ts[idx], + self.labels[idx]) + + +def get_criteo_dataset(params): + name = params["dataset"] + print("loading datasest {}".format(name)) + cache_path = os.path.join( + params["data_cache_path"], "{}.pkl".format(name)) + if params["data_cache_path"] != "None" and os.path.isfile(cache_path): + print("cache_path {}".format(cache_path)) + print("\nloading from dataset cache") + with open(cache_path, "rb") as f: + data = pickle.load(f) + train_data = data["train"] + test_data = data["test"] + if "valid" in data: + valid_data = data["valid"] + if "clean" in data: + _ = data["clean"] + if "fn" in data: + fn_data = data["fn"] + else: + train_data, test_data, valid_data, fn_data = build_criteo_dataset(params, name, cache_path) + result = { + "train": { + "x": train_data.x, + "click_ts": train_data.click_ts, + "pay_ts": train_data.pay_ts, + "sample_ts": train_data.sample_ts, + "labels": train_data.labels, + }, + "test": { + "x": test_data.x, + "click_ts": test_data.click_ts, + "pay_ts": test_data.pay_ts, + "sample_ts": train_data.sample_ts, + "labels": test_data.labels, + } + } + if ("next" in name) or ("oracle" in name): + result["valid"] = { + "x": valid_data.x, + "click_ts": valid_data.click_ts, + "pay_ts": valid_data.pay_ts, + "sample_ts": valid_data.sample_ts, + "labels": valid_data.labels, + } + result["fn"] = { + "x": fn_data.x, + "click_ts": fn_data.click_ts, + "pay_ts": fn_data.pay_ts, + "sample_ts": fn_data.sample_ts, + "labels": fn_data.labels, + } + return result + + +def build_criteo_dataset(params, name, cache_path): + print("\nbuilding dataset") + + starttime = datetime.datetime.now() + if params["dataset_source"] == "criteo": + source_cache_path = "./cache_data.pkl" + if os.path.isfile(source_cache_path): + with open(source_cache_path, "rb") as f: + data = pickle.load(f) + else: + df, click_ts, pay_ts = get_data_df(params) + data = DataDF(df, click_ts, pay_ts) + with open(source_cache_path, "wb") as f: + pickle.dump(data, f, protocol=4) + endtime = 
datetime.datetime.now() + print("Time:{}s".format((endtime - starttime).total_seconds())) + + if "fsiwsg" in name: + cd = parse_float_arg(name, "cd") + training_start = params["training_end_day"] - params["training_duration"] + train_data = data.sub_days(training_start, params["training_end_day"]).shuffle() + test_data = data.sub_days(params["training_end_day"], params["training_end_day"] + 1) + train_data = train_data.to_fsiw_0( + cd=cd * SECONDS_A_DAY, T=params["training_end_day"] * SECONDS_A_DAY) + cvrs = np.reshape(train_data.pay_ts > 0, (-1, 1)) + pot_cvr = np.reshape(train_data.pay_ts > params["training_end_day"] * SECONDS_A_DAY, (-1, 1)) + train_data.labels = np.reshape(train_data.labels, (-1, 1)) + train_data.labels = np.concatenate( + [train_data.labels, cvrs, pot_cvr], axis=1) + test_data = test_data.to_fsiw_0( + cd=cd * SECONDS_A_DAY, T=params["training_end_day"] * SECONDS_A_DAY) + elif "fsiw_next" in name: + cd = parse_float_arg(name, "cd") + training_start = params["training_end_day"] - params["training_duration"] + train_data = data.sub_days(training_start, params["training_end_day"]).shuffle() + mask = train_data.pay_ts > (params["training_end_day"] * SECONDS_A_DAY) + train_data.labels[mask] = 0 + train_data.x.insert(train_data.x.shape[1], column="elapse", value=(params[ + "training_end_day"] * SECONDS_A_DAY - train_data.click_ts) / SECONDS_FSIW_NORM) + fn_data = DataDF(train_data.x.iloc[mask], + train_data.click_ts[mask], + train_data.pay_ts[mask], + train_data.sample_ts[mask], + train_data.labels[mask]) + valid_data = data.sub_days(params["training_end_day"], + params["training_end_day"] + 1 * params["valid_test_size"]) + valid_data.x.insert(valid_data.x.shape[1], column="elapse", value=(params[ + "training_end_day"] * SECONDS_A_DAY - valid_data.click_ts) / SECONDS_FSIW_NORM) + val_mask = valid_data.pay_ts > ( + (params["training_end_day"] + 1 * params["valid_test_size"]) * SECONDS_A_DAY) + valid_data.labels[val_mask] = 0 + test_data = data.sub_days(params["training_end_day"] + 1 * params["valid_test_size"], + params["training_end_day"] + 2 * params["valid_test_size"]) + test_data.x.insert(test_data.x.shape[1], column="elapse", value=(params[ + "training_end_day"] * SECONDS_A_DAY - test_data.click_ts) / SECONDS_FSIW_NORM) + elif "oracle" in name: + cd = parse_float_arg(name, "cd") + training_start = params["training_end_day"] - params["training_duration"] + train_data = data.sub_days(training_start, params["training_end_day"]).shuffle() + train_data.x.insert(train_data.x.shape[1], column="elapse", value=(params[ + "training_end_day"] * SECONDS_A_DAY - train_data.click_ts) / SECONDS_FSIW_NORM) + + mask = train_data.pay_ts > (params["training_end_day"] * SECONDS_A_DAY) + fn_data = DataDF(train_data.x.iloc[mask], + train_data.click_ts[mask], + train_data.pay_ts[mask], + train_data.sample_ts[mask], + train_data.labels[mask]) + fn_data.labels[:] = 0 + + valid_data = data.sub_days(params["training_end_day"], + params["training_end_day"] + 1 * params["valid_test_size"]) + valid_data.x.insert(valid_data.x.shape[1], column="elapse", value=(params[ + "training_end_day"] * SECONDS_A_DAY - valid_data.click_ts) / SECONDS_FSIW_NORM) + test_data = data.sub_days(params["training_end_day"] + 1 * params["valid_test_size"], + params["training_end_day"] + 2 * params["valid_test_size"]) + test_data.x.insert(test_data.x.shape[1], column="elapse", value=(params[ + "training_end_day"] * SECONDS_A_DAY - test_data.click_ts) / SECONDS_FSIW_NORM) + else: + raise NotImplementedError("{} dataset does 
not exist".format(name)) + if params["data_cache_path"] != "None": + with open(cache_path, "wb") as f: + if ("next" in name) or ("oracle" in name): + pickle.dump({"train": train_data, "test": test_data, "valid": valid_data, "fn": fn_data}, f) + else: + pickle.dump({"train": train_data, "test": test_data}, f) + + return train_data, test_data, valid_data, fn_data diff --git a/research/recommend/ULC/src/loss.py b/research/recommend/ULC/src/loss.py new file mode 100644 index 000000000..743d95833 --- /dev/null +++ b/research/recommend/ULC/src/loss.py @@ -0,0 +1,36 @@ +# Copyright 2023 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +import mindspore.numpy as np +import mindspore.ops as ops +import mindspore + + +def stable_log1pex(x): + return -np.where(x < 0, x, np.zeros_like(x)) + np.log(1 + np.exp(-np.absolute(x))) + +def cross_entropy_loss(targets, outputs, params=None): + z = targets["label"] + x = outputs + x = ops.Reshape()(x, (-1,)) + z = z.float() + loss_value = ops.binary_cross_entropy_with_logits(x, z, mindspore.Tensor([1.0]), mindspore.Tensor([1.0])) + + return {"loss": loss_value} + +def get_loss_fn(name): + if name == "cross_entropy_loss": + return cross_entropy_loss + raise NotImplementedError("{} loss does not implemented".format(name)) diff --git a/research/recommend/ULC/src/main.py b/research/recommend/ULC/src/main.py new file mode 100644 index 000000000..406db7bda --- /dev/null +++ b/research/recommend/ULC/src/main.py @@ -0,0 +1,79 @@ +# Copyright 2023 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +import argparse +import os +import pathlib +from copy import deepcopy + +import numpy as np + +from alternate_train import alternate_run + +wandb = None + + +def run_params(args): + params = deepcopy(vars(args)) + params["model"] = "MLP_SIG" + if args.data_cache_path != "None": + pathlib.Path(args.data_cache_path).mkdir(parents=True, exist_ok=True) + + if args.method == "ULC": + params["loss"] = "cross_entropy_loss" + params["dataset"] = "last_30_train_test_fsiw_next" + "_end_" + str(args.training_end_day) + "_seed_" + str( + args.seed) + + return params + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--method", choices=["ULC"], + type=str, required=True) + parser.add_argument("--seed", type=int, default=0) + parser.add_argument("--dataset_source", type=str, default="criteo", choices=["criteo"]) + parser.add_argument("--CD", type=int, default=7, + help="interval between counterfactual deadline and actual deadline") + parser.add_argument("--lr", type=float, default=1e-3) + parser.add_argument("--data_path", type=str, default="./data/data.txt", + help="path of the data.txt in criteo dataset") + parser.add_argument("--data_cache_path", type=str, default="./data") + parser.add_argument("--batch_size", type=int, + default=1024) + parser.add_argument("--epoch", type=int, default=5, + help="training epoch of pretraining") + parser.add_argument("--l2_reg", type=float, default=0, + help="l2 regularizer strength") + parser.add_argument("--training_end_day", type=int, default=58, + help="deadline for training data") + parser.add_argument("--training_duration", type=int, default=21, + help="duration of training data") + parser.add_argument("--valid_test_size", type=float, default=1, + help="duration of valid/test data") + parser.add_argument("--train_epoch", type=int, default=100, + help="max train epoch") + parser.add_argument("--early_stop", type=int, default=4) + parser.add_argument("--cuda_device", type=str, default="0") + parser.add_argument("--optimizer", type=str, default="Adam") + parser.add_argument("--save_model", type=int, default=0) + parser.add_argument("--base_model", type=str, default="MLP", choices=["MLP"]) + + args = parser.parse_args() + params = run_params(args) + os.environ["CUDA_VISIBLE_DEVICES"] = params["cuda_device"] + np.random.seed(args.seed) + + alternate_run(params, wandb) diff --git a/research/recommend/ULC/src/metrics.py b/research/recommend/ULC/src/metrics.py new file mode 100644 index 000000000..274a0e508 --- /dev/null +++ b/research/recommend/ULC/src/metrics.py @@ -0,0 +1,73 @@ +# Copyright 2023 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +from sklearn import metrics +import numpy as np + +def sigmoid(x): + return 1/(1+np.exp(np.clip(-x, a_min=-1e50, a_max=1e20))) + +def cal_auc(label, pos_prob): + fpr, tpr, _ = metrics.roc_curve(label, pos_prob, pos_label=1) + auc = metrics.auc(fpr, tpr) + return auc + +def stable_log1pex(x): + return -np.minimum(x, 0) + np.log(1+np.exp(-np.abs(x))) + +def cal_llloss_with_logits(label, logits): + ll = -np.mean(label*(-stable_log1pex(logits)) + (1-label)*(-logits - stable_log1pex(logits))) + return ll + +def cal_llloss_with_logits_and_weight(label, logits, logits0, logits1): + x = logits + + pos_loss = stable_log1pex(x) + neg_loss = x + stable_log1pex(x) + + pos_weight = 1/(logits1+1e-8) + neg_weight = logits0 + + clf_loss = np.mean( + pos_loss*pos_weight*label + neg_loss*neg_weight*(1-label)) + + weight = np.mean(pos_weight*label + neg_weight*(1-label)) + + return clf_loss/weight + +def prob_clip(x): + return np.clip(x, a_min=1e-20, a_max=1) + +def cal_llloss_with_neg_log_prob(label, neg_log_prob): + ll = -np.mean((1-label)*neg_log_prob + label*(np.log(prob_clip(1 - prob_clip(np.exp(neg_log_prob)))))) + return ll + +def cal_llloss_with_prob(label, prob): + ll = -np.mean(label*np.log(prob_clip(prob)) + (1-label)*(np.log(prob_clip(1-prob)))) + return ll + +def cal_prauc(label, pos_prob): + precision, recall, _ = metrics.precision_recall_curve(label, pos_prob) + area = metrics.auc(recall, precision) + return area + +def cal_acc(label, prob): + label = np.reshape(label, (-1,)) + prob = np.reshape(label, (-1,)) + prob_acc = np.mean(label*prob) + return prob_acc + +def stable_softplus(x): + return np.log(1 + np.exp(-np.abs(x))) + np.maximum(x, 0) diff --git a/research/recommend/ULC/src/models.py b/research/recommend/ULC/src/models.py new file mode 100644 index 000000000..d8d63ab25 --- /dev/null +++ b/research/recommend/ULC/src/models.py @@ -0,0 +1,103 @@ +# Copyright 2023 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +from mindspore import nn +from mindspore import ops + +class MLP(nn.Cell): + def __init__(self, name, params): + super(MLP, self).__init__() + self.model_name = name + self.params = params + self.dataset_source = params["dataset_source"] + if params["dataset_source"] == "criteo": + self.category_embeddings = nn.CellList([ + nn.Embedding(55824, 64), + nn.Embedding(5443, 64), + nn.Embedding(13073, 64), + nn.Embedding(13170, 64), + nn.Embedding(3145, 64), + nn.Embedding(33843, 64), + nn.Embedding(14304, 64), + nn.Embedding(11, 64), + nn.Embedding(13601, 64) + ]) + + self.numeric_embeddings = nn.CellList([ + nn.Embedding(64, 64), + nn.Embedding(16, 64), + nn.Embedding(128, 64), + nn.Embedding(64, 64), + nn.Embedding(128, 64), + nn.Embedding(64, 64), + nn.Embedding(512, 64), + nn.Embedding(512, 64) + ]) + presize = 1088 + + if name == "MLP_FSIW": + print("using elapse feature") + presize += 1 + + self.mlp = nn.CellList([ + nn.Dense(presize, 256, activation='leakyrelu'), + nn.Dense(256, 256, activation='leakyrelu'), + nn.Dense(256, 128, activation='leakyrelu') + ]) + + if self.model_name in ["MLP_SIG", "MLP_FSIW"]: + self.mlp.append(nn.Dense(128, 1)) + else: + raise ValueError("model name {} not exist".format(name)) + + def construct(self, x): + if self.dataset_source == "criteo": + cate_embeddings = [] + nume_embeddings = [] + if self.model_name == "MLP_FSIW": + for i in range(8): + nume_embeddings.append(self.numeric_embeddings[i](x[:, i].int())) + + for i in range(9): + cate_embeddings.append(self.category_embeddings[8 - i](x[:, -i - 2].int())) + + features = nume_embeddings + cate_embeddings + [x[:, -1:]] + x = ops.Concat(axis=1)(features) + else: + for i in range(8): + nume_embeddings.append(self.numeric_embeddings[i](x[:, i].int())) + + for i in range(9): + cate_embeddings.append(self.category_embeddings[8 - i](x[:, -i - 2].int())) + + features = nume_embeddings + cate_embeddings + x = ops.Concat(axis=1)(features) + + for layer in self.mlp: + x = layer(x) + + if self.model_name in ["MLP_SIG", "MLP_FSIW"]: + return x + raise NotImplementedError() + +def get_model(name, params): + if name in ["MLP_tn_dp", "MLP_FSIW"]: + return MLP(name, params) + if name == "MLP_SIG": + if params["base_model"] == "MLP": + return MLP(name, params) + else: + raise NotImplementedError() + return 0 diff --git a/research/recommend/ULC/src/utils.py b/research/recommend/ULC/src/utils.py new file mode 100644 index 000000000..b5aca4c04 --- /dev/null +++ b/research/recommend/ULC/src/utils.py @@ -0,0 +1,34 @@ +# Copyright 2023 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +import re +import mindspore.nn as nn + + +def get_optimizer(p, name, params): + if name == "Adam": + return nn.Adam(params=p, learning_rate=params["lr"], weight_decay=params["l2_reg"]) + return 0 + + +def parse_float_arg(Input, prefix): + p = re.compile(prefix+"_[+-]?([0-9]*[.])?[0-9]+") + m = p.search(Input) + if m is None: + return None + Input = m.group() + p = re.compile("[+-]?([0-9]*[.])?[0-9]+") + m = p.search(Input) + return float(m.group()) -- Gitee From e4f3f9a2a8bd7c4f9c7b6a4d92aab947867f9847 Mon Sep 17 00:00:00 2001 From: daiyuxin0511 <455472400@qq.com> Date: Wed, 3 Jan 2024 09:31:57 +0800 Subject: [PATCH 02/44] cast the output of range --- .../src/maskrcnn_mobilenetv1/bbox_assign_sample_stage2.py | 1 + .../maskrcnn_resnet50/src/maskrcnn/bbox_assign_sample_stage2.py | 1 + 2 files changed, 2 insertions(+) diff --git a/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/bbox_assign_sample_stage2.py b/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/bbox_assign_sample_stage2.py index 07c5bc621..470f709d6 100644 --- a/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/bbox_assign_sample_stage2.py +++ b/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/bbox_assign_sample_stage2.py @@ -214,6 +214,7 @@ class BboxAssignSampleForRcnn(nn.Cell): # normalized box coordinate boxes = boxes / self.image_h_w box_ids = F.range(self.start, self.limit, self.delta) + box_ids = self.cast(box_ids, mstype.int32) pos_masks_fb = self.expand_dims(pos_masks_fb, -1) boxes = self.cast(boxes, mstype.float32) pos_masks_fb = self.crop_and_resize(pos_masks_fb, boxes, box_ids, self.mask_shape) diff --git a/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/bbox_assign_sample_stage2.py b/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/bbox_assign_sample_stage2.py index e97ee0a83..681b8a300 100644 --- a/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/bbox_assign_sample_stage2.py +++ b/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/bbox_assign_sample_stage2.py @@ -212,6 +212,7 @@ class BboxAssignSampleForRcnn(nn.Cell): # normalized box coordinate boxes = boxes / self.image_h_w box_ids = F.range(self.start, self.limit, self.delta) + box_ids = self.cast(box_ids, mstype.int32) pos_masks_fb = self.expand_dims(pos_masks_fb, -1) boxes = self.cast(boxes, mstype.float32) pos_masks_fb = self.crop_and_resize(pos_masks_fb, boxes, box_ids, self.mask_shape) -- Gitee From 073b71d00822ff526f85181c9c7644e25a96b34e Mon Sep 17 00:00:00 2001 From: daiyuxin0511 <455472400@qq.com> Date: Thu, 4 Jan 2024 10:00:58 +0800 Subject: [PATCH 03/44] Revert "modify the input of concat from ms.bool_ to ms.int32" This reverts commit da505fa86daecff3255a862e6d523b5efcfe202c. 
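
Below is a minimal, self-contained sketch (not part of this patch) of the
restored pattern in bbox_assign_sample.py -- concatenating the boolean masks
directly and AND-ing the result, with no intermediate int32 casts. It assumes
a MindSpore build whose Concat primitive accepts bool tensors; the tensor
names follow the code, but the shapes and values are invented for
illustration:

    # hypothetical repro of the restored bool-mask path
    import numpy as np
    import mindspore as ms
    from mindspore import Tensor, ops

    check_neg_mask = Tensor(np.array([True, True, False]), ms.bool_)
    unvalid_pos_index = Tensor(np.array([False, True, True]), ms.bool_)
    valid_neg_index = Tensor(np.array([True, False, True, True, True, False]), ms.bool_)

    # concatenate the two bool masks, then AND them elementwise with the
    # sampled negative indices -- exactly the casts removed by this revert
    merged = ops.Concat(axis=0)((check_neg_mask, unvalid_pos_index))
    valid_neg_index = ops.LogicalAnd()(merged, valid_neg_index)
    print(valid_neg_index)  # -> [ True False False False  True False]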
--- .../src/FasterRcnn/bbox_assign_sample.py | 8 +------- .../src/FasterRcnn/bbox_assign_sample_stage2.py | 10 ++-------- .../src/FasterRcnn/proposal_generator.py | 14 +++----------- .../src/maskrcnn_mobilenetv1/fpn_neck.py | 10 ++++++---- .../maskrcnn_resnet50/src/maskrcnn/fpn_neck.py | 11 ++++++----- 5 files changed, 18 insertions(+), 35 deletions(-) diff --git a/official/cv/FasterRCNN/src/FasterRcnn/bbox_assign_sample.py b/official/cv/FasterRCNN/src/FasterRcnn/bbox_assign_sample.py index a49572c6a..57c758a51 100644 --- a/official/cv/FasterRCNN/src/FasterRcnn/bbox_assign_sample.py +++ b/official/cv/FasterRCNN/src/FasterRcnn/bbox_assign_sample.py @@ -144,13 +144,7 @@ class BboxAssignSample(nn.Cell): num_pos = self.cast(self.logicalnot(valid_pos_index), self.ms_type) num_pos = self.sum_inds(num_pos, -1) unvalid_pos_index = self.less(self.range_pos_size, num_pos) - valid_neg_index = self.logicaland( - self.cast(self.concat(( - self.cast(self.check_neg_mask, ms.int32), - self.cast(unvalid_pos_index, ms.int32) - )), ms.bool_), - self.cast(valid_neg_index, ms.bool_) - ) + valid_neg_index = self.logicaland(self.concat((self.check_neg_mask, unvalid_pos_index)), valid_neg_index) pos_bboxes_ = self.gatherND(bboxes, pos_index) pos_gt_bboxes_ = self.gatherND(gt_bboxes_i, pos_assigned_gt_index) diff --git a/official/cv/FasterRCNN/src/FasterRcnn/bbox_assign_sample_stage2.py b/official/cv/FasterRCNN/src/FasterRcnn/bbox_assign_sample_stage2.py index 7602adcc5..5cbc16ee5 100644 --- a/official/cv/FasterRCNN/src/FasterRcnn/bbox_assign_sample_stage2.py +++ b/official/cv/FasterRCNN/src/FasterRcnn/bbox_assign_sample_stage2.py @@ -114,7 +114,7 @@ class BboxAssignSampleForRcnn(nn.Cell): gt_bboxes_i, self.check_gt_one) bboxes = self.select(self.cast(self.tile(self.reshape(self.cast(valid_mask, ms.int32), \ (self.num_bboxes, 1)), (1, 4)), ms.bool_), \ - self.cast(bboxes, ms.float16), self.check_anchor_two) + bboxes, self.check_anchor_two) overlaps = self.iou(bboxes, gt_bboxes_i) @@ -171,13 +171,7 @@ class BboxAssignSampleForRcnn(nn.Cell): neg_index, valid_neg_index = self.random_choice_with_mask_neg(self.equal(assigned_gt_inds5, 0)) unvalid_pos_index = self.less(self.range_pos_size, num_pos) - valid_neg_index = self.logicaland( - self.cast(self.concat(( - self.cast(self.check_neg_mask, ms.int32), - self.cast(unvalid_pos_index, ms.int32) - )), ms.bool_), - self.cast(valid_neg_index, ms.bool_) - ) + valid_neg_index = self.logicaland(self.concat((self.check_neg_mask, unvalid_pos_index)), valid_neg_index) neg_index = self.reshape(neg_index, self.reshape_shape_neg) valid_neg_index = self.cast(valid_neg_index, ms.int32) diff --git a/official/cv/FasterRCNN/src/FasterRcnn/proposal_generator.py b/official/cv/FasterRCNN/src/FasterRcnn/proposal_generator.py index 5317ca51c..16e2b4265 100644 --- a/official/cv/FasterRCNN/src/FasterRcnn/proposal_generator.py +++ b/official/cv/FasterRCNN/src/FasterRcnn/proposal_generator.py @@ -183,21 +183,13 @@ class Proposal(nn.Cell): mlvl_proposals = mlvl_proposals + (proposals,) mlvl_mask = mlvl_mask + (mask_valid,) - proposals = self.concat_axis0( - tuple(self.cast(proposal, ms.int64) for proposal in mlvl_proposals) - ) - masks = self.concat_axis0( - tuple(self.cast(mask, ms.int64) for mask in mlvl_mask) - ) + proposals = self.concat_axis0(mlvl_proposals) + masks = self.concat_axis0(mlvl_mask) _, _, _, _, scores = self.split(proposals) scores = self.squeeze(scores) topk_mask = self.cast(self.topK_mask, self.ms_type) - scores_using = self.cast(self.select( - self.cast(masks, ms.bool_), - 
self.cast(scores, ms.bool_), - self.cast(topk_mask, ms.bool_) - ), ms.int32) + scores_using = self.select(masks, scores, topk_mask) _, topk_inds = self.topKv2(scores_using, self.max_num) diff --git a/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/fpn_neck.py b/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/fpn_neck.py index 2ad832bbd..38ca57bb2 100644 --- a/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/fpn_neck.py +++ b/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/fpn_neck.py @@ -89,7 +89,9 @@ class FeatPyramidNeck(nn.Cell): self.fpn_convs_.append(fpn_conv) self.lateral_convs_list = nn.layer.CellList(self.lateral_convs_list_) self.fpn_convs_list = nn.layer.CellList(self.fpn_convs_) - self.interpolate = P.ResizeBilinearV2() + self.interpolate1 = P.ResizeBilinear((48, 80)) + self.interpolate2 = P.ResizeBilinear((96, 160)) + self.interpolate3 = P.ResizeBilinear((192, 320)) self.cast = P.Cast() self.maxpool = P.MaxPool(kernel_size=1, strides=2, pad_mode="same") @@ -99,9 +101,9 @@ class FeatPyramidNeck(nn.Cell): x += (self.lateral_convs_list[i](inputs[i]),) y = (x[3],) - y = y + (x[2] + self.cast(self.interpolate(y[self.fpn_layer - 4], (48, 80)), self.platform_mstype),) - y = y + (x[1] + self.cast(self.interpolate(y[self.fpn_layer - 3], (96, 160)), self.platform_mstype),) - y = y + (x[0] + self.cast(self.interpolate(y[self.fpn_layer - 2], (192, 320)), self.platform_mstype),) + y = y + (x[2] + self.cast(self.interpolate1(y[self.fpn_layer - 4]), self.platform_mstype),) + y = y + (x[1] + self.cast(self.interpolate2(y[self.fpn_layer - 3]), self.platform_mstype),) + y = y + (x[0] + self.cast(self.interpolate3(y[self.fpn_layer - 2]), self.platform_mstype),) z = () for i in range(self.fpn_layer - 1, -1, -1): diff --git a/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/fpn_neck.py b/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/fpn_neck.py index 68590ec9d..6a98de5d2 100644 --- a/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/fpn_neck.py +++ b/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/fpn_neck.py @@ -92,8 +92,9 @@ class FeatPyramidNeck(nn.Cell): self.fpn_convs_.append(fpn_conv) self.lateral_convs_list = nn.layer.CellList(self.lateral_convs_list_) self.fpn_convs_list = nn.layer.CellList(self.fpn_convs_) - self.interpolate = P.ResizeBilinearV2() - self.feature_shapes = feature_shapes + self.interpolate1 = P.ResizeBilinear(feature_shapes[2]) + self.interpolate2 = P.ResizeBilinear(feature_shapes[1]) + self.interpolate3 = P.ResizeBilinear(feature_shapes[0]) self.cast = P.Cast() self.maxpool = P.MaxPool(kernel_size=1, strides=2, pad_mode="same") @@ -103,9 +104,9 @@ class FeatPyramidNeck(nn.Cell): x += (self.lateral_convs_list[i](inputs[i]),) y = (x[3],) - y = y + (x[2] + self.cast(self.interpolate(y[self.fpn_layer - 4], self.feature_shapes[2]), self.cast_type),) - y = y + (x[1] + self.cast(self.interpolate(y[self.fpn_layer - 3], self.feature_shapes[1]), self.cast_type),) - y = y + (x[0] + self.cast(self.interpolate(y[self.fpn_layer - 2], self.feature_shapes[0]), self.cast_type),) + y = y + (x[2] + self.cast(self.interpolate1(y[self.fpn_layer - 4]), self.cast_type),) + y = y + (x[1] + self.cast(self.interpolate2(y[self.fpn_layer - 3]), self.cast_type),) + y = y + (x[0] + self.cast(self.interpolate3(y[self.fpn_layer - 2]), self.cast_type),) z = () for i in range(self.fpn_layer - 1, -1, -1): -- Gitee From bd2ea20d4bb74bce1875db5c52fffebf70df64ed Mon Sep 17 00:00:00 2001 From: wilsonleehw Date: Wed, 10 Jan 
2024 14:42:14 +0800 Subject: [PATCH 04/44] update README for TinySAM --- research/cv/TinySAM/README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/research/cv/TinySAM/README.md b/research/cv/TinySAM/README.md index bd0e17c07..d01ba5f57 100644 --- a/research/cv/TinySAM/README.md +++ b/research/cv/TinySAM/README.md @@ -54,6 +54,10 @@ SNN-MLP After installing MindSpore via the official website, you can start evaluation as follows: +### Download + +Download ckpts from [modelzoo](https://download-mindspore.osinfra.cn/model_zoo/research/cv/TinySAM/tinysam_mindspore.ckpt). + ### Launch ```bash -- Gitee From d8785b55ecf8faf6459f6d1c615cd5b19df1de85 Mon Sep 17 00:00:00 2001 From: zhangyifan Date: Wed, 17 Jan 2024 14:40:19 +0800 Subject: [PATCH 05/44] add OWNERS --- OWNERS | 1 + 1 file changed, 1 insertion(+) diff --git a/OWNERS b/OWNERS index 0cd9b621a..8d29f4f1f 100644 --- a/OWNERS +++ b/OWNERS @@ -10,3 +10,4 @@ approvers: - baochong - luoyang42 - wang_hua_2019 +- zhangyifan999 -- Gitee From 9f2cf06f9c48d40d7d1006cc60e9adedd3395ec5 Mon Sep 17 00:00:00 2001 From: ash Date: Thu, 18 Jan 2024 22:11:32 +0800 Subject: [PATCH 06/44] fix fasterrcnn loss nan --- official/cv/FasterRCNN/scripts/run_distribute_train_ascend.sh | 1 + official/cv/FasterRCNN/scripts/run_standalone_train_ascend.sh | 1 + 2 files changed, 2 insertions(+) diff --git a/official/cv/FasterRCNN/scripts/run_distribute_train_ascend.sh b/official/cv/FasterRCNN/scripts/run_distribute_train_ascend.sh index 86588bf77..8087e1f88 100644 --- a/official/cv/FasterRCNN/scripts/run_distribute_train_ascend.sh +++ b/official/cv/FasterRCNN/scripts/run_distribute_train_ascend.sh @@ -96,6 +96,7 @@ export HCCL_CONNECT_TIMEOUT=600 export DEVICE_NUM=8 export RANK_SIZE=8 export RANK_TABLE_FILE=$PATH1 +export MS_ASCEND_CHECK_OVERFLOW_MODE="SATURATION_MODE" for((i=0; i<${DEVICE_NUM}; i++)) do diff --git a/official/cv/FasterRCNN/scripts/run_standalone_train_ascend.sh b/official/cv/FasterRCNN/scripts/run_standalone_train_ascend.sh index 565f1c562..828f5133e 100644 --- a/official/cv/FasterRCNN/scripts/run_standalone_train_ascend.sh +++ b/official/cv/FasterRCNN/scripts/run_standalone_train_ascend.sh @@ -88,6 +88,7 @@ export DEVICE_NUM=1 export DEVICE_ID=$4 export RANK_ID=0 export RANK_SIZE=1 +export MS_ASCEND_CHECK_OVERFLOW_MODE="SATURATION_MODE" if [ -d "train" ]; then -- Gitee From 6736ad2212724cac681aea5e5dc85c9a72e370f7 Mon Sep 17 00:00:00 2001 From: daiyuxin0511 <455472400@qq.com> Date: Tue, 23 Jan 2024 17:16:46 +0800 Subject: [PATCH 07/44] remove ResizeBilinear from MaskRCNN --- .../src/maskrcnn_mobilenetv1/fpn_neck.py | 10 ++++------ .../maskrcnn_resnet50/src/maskrcnn/fpn_neck.py | 11 +++++------ 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/fpn_neck.py b/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/fpn_neck.py index 38ca57bb2..2ad832bbd 100644 --- a/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/fpn_neck.py +++ b/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/fpn_neck.py @@ -89,9 +89,7 @@ class FeatPyramidNeck(nn.Cell): self.fpn_convs_.append(fpn_conv) self.lateral_convs_list = nn.layer.CellList(self.lateral_convs_list_) self.fpn_convs_list = nn.layer.CellList(self.fpn_convs_) - self.interpolate1 = P.ResizeBilinear((48, 80)) - self.interpolate2 = P.ResizeBilinear((96, 160)) - self.interpolate3 = P.ResizeBilinear((192, 320)) + self.interpolate = P.ResizeBilinearV2() self.cast = P.Cast() self.maxpool = 
P.MaxPool(kernel_size=1, strides=2, pad_mode="same") @@ -101,9 +99,9 @@ class FeatPyramidNeck(nn.Cell): x += (self.lateral_convs_list[i](inputs[i]),) y = (x[3],) - y = y + (x[2] + self.cast(self.interpolate1(y[self.fpn_layer - 4]), self.platform_mstype),) - y = y + (x[1] + self.cast(self.interpolate2(y[self.fpn_layer - 3]), self.platform_mstype),) - y = y + (x[0] + self.cast(self.interpolate3(y[self.fpn_layer - 2]), self.platform_mstype),) + y = y + (x[2] + self.cast(self.interpolate(y[self.fpn_layer - 4], (48, 80)), self.platform_mstype),) + y = y + (x[1] + self.cast(self.interpolate(y[self.fpn_layer - 3], (96, 160)), self.platform_mstype),) + y = y + (x[0] + self.cast(self.interpolate(y[self.fpn_layer - 2], (192, 320)), self.platform_mstype),) z = () for i in range(self.fpn_layer - 1, -1, -1): diff --git a/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/fpn_neck.py b/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/fpn_neck.py index 6a98de5d2..68590ec9d 100644 --- a/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/fpn_neck.py +++ b/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/fpn_neck.py @@ -92,9 +92,8 @@ class FeatPyramidNeck(nn.Cell): self.fpn_convs_.append(fpn_conv) self.lateral_convs_list = nn.layer.CellList(self.lateral_convs_list_) self.fpn_convs_list = nn.layer.CellList(self.fpn_convs_) - self.interpolate1 = P.ResizeBilinear(feature_shapes[2]) - self.interpolate2 = P.ResizeBilinear(feature_shapes[1]) - self.interpolate3 = P.ResizeBilinear(feature_shapes[0]) + self.interpolate = P.ResizeBilinearV2() + self.feature_shapes = feature_shapes self.cast = P.Cast() self.maxpool = P.MaxPool(kernel_size=1, strides=2, pad_mode="same") @@ -104,9 +103,9 @@ class FeatPyramidNeck(nn.Cell): x += (self.lateral_convs_list[i](inputs[i]),) y = (x[3],) - y = y + (x[2] + self.cast(self.interpolate1(y[self.fpn_layer - 4]), self.cast_type),) - y = y + (x[1] + self.cast(self.interpolate2(y[self.fpn_layer - 3]), self.cast_type),) - y = y + (x[0] + self.cast(self.interpolate3(y[self.fpn_layer - 2]), self.cast_type),) + y = y + (x[2] + self.cast(self.interpolate(y[self.fpn_layer - 4], self.feature_shapes[2]), self.cast_type),) + y = y + (x[1] + self.cast(self.interpolate(y[self.fpn_layer - 3], self.feature_shapes[1]), self.cast_type),) + y = y + (x[0] + self.cast(self.interpolate(y[self.fpn_layer - 2], self.feature_shapes[0]), self.cast_type),) z = () for i in range(self.fpn_layer - 1, -1, -1): -- Gitee From bf11fd9a2af157e8e33e5c88c8b51f01db4629f0 Mon Sep 17 00:00:00 2001 From: r1chardf1d0 Date: Mon, 29 Jan 2024 17:26:14 +0800 Subject: [PATCH 08/44] disable trace for pangu alpha --- official/nlp/Pangu_alpha/src/pangu_alpha.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/official/nlp/Pangu_alpha/src/pangu_alpha.py b/official/nlp/Pangu_alpha/src/pangu_alpha.py index 00f594b3d..02f1570be 100644 --- a/official/nlp/Pangu_alpha/src/pangu_alpha.py +++ b/official/nlp/Pangu_alpha/src/pangu_alpha.py @@ -23,7 +23,6 @@ from mindspore import Tensor, Parameter from mindspore.ops import operations as P from mindspore.ops import functional as F from mindspore.nn import Cell -from mindspore.ops._tracefunc import trace from mindformers.modules.transformer import VocabEmbedding, TransformerEncoder, TransformerEncoderLayer, \ AttentionMask, MoEConfig @@ -306,7 +305,6 @@ class PanguAlpha_Model(Cell): self.load_embedding_from_ckpt(config.load_ckpt_path) self.run_type = config.run_type - @trace def construct_blocks(self, hidden_state, encoder_masks, init_reset, batch_valid_length): if self.blocks is not 
None: for i in range(self.num_layers - 1): -- Gitee From 169176618719aa9393944946dd5946941a62db80 Mon Sep 17 00:00:00 2001 From: PingqiLi Date: Mon, 29 Jan 2024 17:43:00 +0800 Subject: [PATCH 09/44] modify the learning rate of the example in SSD training scripts --- official/cv/SSD/scripts/run_distribute_train.sh | 2 +- official/cv/SSD/scripts/run_distribute_train_gpu.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/official/cv/SSD/scripts/run_distribute_train.sh b/official/cv/SSD/scripts/run_distribute_train.sh index 893f4a76d..37fe46d37 100644 --- a/official/cv/SSD/scripts/run_distribute_train.sh +++ b/official/cv/SSD/scripts/run_distribute_train.sh @@ -17,7 +17,7 @@ echo "==============================================================================================================" echo "Please run the script as: " echo "bash run_distribute_train.sh DEVICE_NUM EPOCH_SIZE LR DATASET RANK_TABLE_FILE CONFIG_PATH PRE_TRAINED PRE_TRAINED_EPOCH_SIZE" -echo "for example: bash run_distribute_train.sh 8 500 0.2 coco /data/hccl.json /config_path /opt/ssd-300.ckpt(optional) 200(optional)" +echo "for example: bash run_distribute_train.sh 8 500 0.05 coco /data/hccl.json /config_path /opt/ssd-300.ckpt(optional) 200(optional)" echo "It is better to use absolute path." echo "=================================================================================================================" diff --git a/official/cv/SSD/scripts/run_distribute_train_gpu.sh b/official/cv/SSD/scripts/run_distribute_train_gpu.sh index 0778ad70f..0ff4b1818 100644 --- a/official/cv/SSD/scripts/run_distribute_train_gpu.sh +++ b/official/cv/SSD/scripts/run_distribute_train_gpu.sh @@ -17,7 +17,7 @@ echo "==============================================================================================================" echo "Please run the script as: " echo "bash run_distribute_train_gpu.sh DEVICE_NUM EPOCH_SIZE LR DATASET CONFIG_PATH PRE_TRAINED PRE_TRAINED_EPOCH_SIZE" -echo "for example: bash run_distribute_train_gpu.sh 8 500 0.2 coco /config_path /opt/ssd-300.ckpt(optional) 200(optional)" +echo "for example: bash run_distribute_train_gpu.sh 8 500 0.05 coco /config_path /opt/ssd-300.ckpt(optional) 200(optional)" echo "It is better to use absolute path." echo "=================================================================================================================" -- Gitee From 77d00581d59e1beefcef041ac8e054018c72de64 Mon Sep 17 00:00:00 2001 From: ash Date: Sun, 4 Feb 2024 11:11:40 +0800 Subject: [PATCH 10/44] fix MELGAN readme file name --- official/audio/MELGAN/README.md | 2 +- official/audio/MELGAN/README_CN.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/official/audio/MELGAN/README.md b/official/audio/MELGAN/README.md index 448fa1bb5..4185a79a5 100644 --- a/official/audio/MELGAN/README.md +++ b/official/audio/MELGAN/README.md @@ -46,7 +46,7 @@ Dataset used: [LJ Speech]() - Dataset size:2.6GB,13,100 short audio clips of a single speaker reading passages from 7 non-fiction books. - Data format:Each audio file is a single-channel 16-bit PCM WAV with a sample rate of 22050 Hz - - The audio data needs to be processed to a mel-spectrum, and you can refer to the script in [mel-spectrogram data creation](https://github.com/seungwonpark/melgan/blob/master/preprocess.py). Non CUDA environment needs to delete `. cuda()` in `utils/stfy.py`. To save data in the `npy` format, `preprocess.py` also needs to be modified. 
As follows: + - The audio data needs to be processed to a mel-spectrum, and you can refer to the script in [mel-spectrogram data creation](https://github.com/seungwonpark/melgan/blob/master/preprocess.py). Non CUDA environment needs to delete `. cuda()` in `utils/stft.py`. To save data in the `npy` format, `preprocess.py` also needs to be modified. As follows: ``` # 37 - 38 lines diff --git a/official/audio/MELGAN/README_CN.md b/official/audio/MELGAN/README_CN.md index 03a3b7bad..5d80d749a 100644 --- a/official/audio/MELGAN/README_CN.md +++ b/official/audio/MELGAN/README_CN.md @@ -46,7 +46,7 @@ MelGAN模型是非自回归全卷积模型。它的参数比同类模型少得 - Dataset size:2.6GB,包含13,100条只有一个说话人的短语音。语音的内容来自7本纪实书籍。 - 数据格式:每条语音文件都是单声道、16-bit以及采样率为22050。 - - 语音需要被处理为Mel谱, 可以参考脚本[Mel谱处理脚本](https://github.com/seungwonpark/melgan/blob/master/preprocess.py)。非CUDA环境需删除`utils/stfy.py`中的`.cuda()`,因为要保存`npy`格式的数据,所以`preproccess.py`也需要修改以下,参考代码如下: + - 语音需要被处理为Mel谱, 可以参考脚本[Mel谱处理脚本](https://github.com/seungwonpark/melgan/blob/master/preprocess.py)。非CUDA环境需删除`utils/stft.py`中的`.cuda()`,因为要保存`npy`格式的数据,所以`preproccess.py`也需要修改以下,参考代码如下: ``` # 37 - 38 行 -- Gitee From e38c7377188d0a6e1f560d85c7e29e4c0c8046ed Mon Sep 17 00:00:00 2001 From: tomzwang11 Date: Mon, 5 Feb 2024 15:07:49 +0800 Subject: [PATCH 11/44] update fasterrcnn --- official/cv/FasterRCNN/README_CN.md | 2 +- official/cv/FasterRCNN/scripts/run_distribute_train_gpu.sh | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/official/cv/FasterRCNN/README_CN.md b/official/cv/FasterRCNN/README_CN.md index 676b27231..ace823dbf 100644 --- a/official/cv/FasterRCNN/README_CN.md +++ b/official/cv/FasterRCNN/README_CN.md @@ -625,7 +625,7 @@ bash run_infer_cpp.sh [MINDIR_PATH] [DATA_PATH] [ANNO_PATH] [DEVICE_TYPE] [IMAGE | 上传日期 | 2020/8/31 | 2021/2/10 |2022/8/10| | MindSpore版本 | 1.0.0 |1.2.0 |1.7.0| | 数据集 | COCO 2017 |COCO 2017 |FaceMaskDetection| -| 训练参数 | epoch=12, batch_size=2 |epoch=12, batch_size=2 |epoch=20,batch_size=2| +| 训练参数 | epoch=12, batch_size=2 |epoch=20, batch_size=2 |epoch=20,batch_size=2| | 优化器 | SGD |SGD |SGD| | 损失函数 | Softmax交叉熵,Sigmoid交叉熵,SmoothL1Loss |Softmax交叉熵,Sigmoid交叉熵,SmoothL1Loss |Softmax交叉熵,Sigmoid交叉熵,SmoothL1Loss| | 速度 | 1卡:190毫秒/步;8卡:200毫秒/步 | 1卡:320毫秒/步;8卡:335毫秒/步 |1卡:7328毫秒/步| diff --git a/official/cv/FasterRCNN/scripts/run_distribute_train_gpu.sh b/official/cv/FasterRCNN/scripts/run_distribute_train_gpu.sh index 8b27d1c67..d7af4ca64 100644 --- a/official/cv/FasterRCNN/scripts/run_distribute_train_gpu.sh +++ b/official/cv/FasterRCNN/scripts/run_distribute_train_gpu.sh @@ -97,4 +97,5 @@ mpirun -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout --all --pre_trained=$PRETRAINED_PATH \ --backbone=$3 \ --coco_root=$PATH3 \ + --base_lr=0.008 \ --mindrecord_dir=$mindrecord_dir > train.log 2>&1 & \ No newline at end of file -- Gitee From 39ec85593dc2d611a679efe15f0f49a8fdfe5ab4 Mon Sep 17 00:00:00 2001 From: ash Date: Sat, 17 Feb 2024 11:04:39 +0800 Subject: [PATCH 12/44] fix ssd error in 910A+GE --- official/cv/SSD/config/ssd_mobilenet_v1_fpn_config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/official/cv/SSD/config/ssd_mobilenet_v1_fpn_config.yaml b/official/cv/SSD/config/ssd_mobilenet_v1_fpn_config.yaml index 099ea3758..6fa43d8a5 100644 --- a/official/cv/SSD/config/ssd_mobilenet_v1_fpn_config.yaml +++ b/official/cv/SSD/config/ssd_mobilenet_v1_fpn_config.yaml @@ -23,7 +23,7 @@ match_threshold: 0.5 nms_threshold: 0.6 min_score: 0.1 max_boxes: 100 -all_reduce_fusion_config: [29, 58, 89] +all_reduce_fusion_config: 
[29, 58, 89, 201] # learning rate settings lr_init: 0.01333 -- Gitee From fcf960d7d002a29aca2411b6b4a53d31caf77e39 Mon Sep 17 00:00:00 2001 From: gaoshuanglong Date: Tue, 20 Feb 2024 19:48:02 +0800 Subject: [PATCH 13/44] Fix Tacotron2 bug and readme --- official/audio/Tacotron2/README.md | 6 +++--- official/audio/Tacotron2/README_CN.md | 6 +++--- official/audio/Tacotron2/src/tacotron2.py | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/official/audio/Tacotron2/README.md b/official/audio/Tacotron2/README.md index 53283ba9b..51c939693 100644 --- a/official/audio/Tacotron2/README.md +++ b/official/audio/Tacotron2/README.md @@ -76,8 +76,8 @@ After installing MindSpore via the official website, you can start training and # example: bash run_standalone_train.sh /path/ljspeech.hdf5 0 # run distributed training - bash run_distributed_train.sh [DATASET_PATH] [RANK_TABLE_PATH] [DATANAME] [RANK_SIZE] [DEVICE_BEGIN] - # example: bash run_distributed_train.sh /path/ljspeech.h5 ../hccl_8p_01234567_127.0.0.1.json 8 0 + bash run_distribute_train.sh [DATASET_PATH] [RANK_TABLE_PATH] [DATANAME] [RANK_SIZE] [DEVICE_BEGIN] + # example: bash run_distribute_train.sh /path/ljspeech.h5 ../hccl_8p_01234567_127.0.0.1.json 8 0 # run evaluation bash run_eval.sh [OUTPUT_PATH] [MODEL_CKPT] [DEVICE_ID] text is set in config.py( can modify text of ljspeech_config.yaml) @@ -246,7 +246,7 @@ Parameters for both training and evaluation can be set in [DATASET]_config.yaml ```bash cd scripts - bash run_distributed_train.sh [DATASET_PATH] [RANK_TABLE_PATH] [DATANAME] [RANK_SIZE] [DEVICE_BEGIN] + bash run_distribute_train.sh [DATASET_PATH] [RANK_TABLE_PATH] [DATANAME] [RANK_SIZE] [DEVICE_BEGIN] ``` Note: `DATASET_PATH` is the directory contains hdf5 file. diff --git a/official/audio/Tacotron2/README_CN.md b/official/audio/Tacotron2/README_CN.md index e104f6e48..aaf135f92 100644 --- a/official/audio/Tacotron2/README_CN.md +++ b/official/audio/Tacotron2/README_CN.md @@ -76,8 +76,8 @@ Tacotron2实质上是一种包含编码器和解码器的序列到序列模型 # 示例:bash run_standalone_train.sh /path/ljspeech.hdf5 0 # 运行分布式训练 - bash run_distributed_train.sh [DATASET_PATH] [RANK_TABLE_PATH] [DATANAME] [RANK_SIZE] [DEVICE_BEGIN] - # 示例:bash run_distributed_train.sh /path/ljspeech.h5 ../hccl_8p_01234567_127.0.0.1.json 8 0 + bash run_distribute_train.sh [DATASET_PATH] [RANK_TABLE_PATH] [DATANAME] [RANK_SIZE] [DEVICE_BEGIN] + # 示例:bash run_distribute_train.sh /path/ljspeech.h5 ../hccl_8p_01234567_127.0.0.1.json 8 0 # 运行评估 bash run_eval.sh [OUTPUT_PATH] [MODEL_CKPT] [DEVICE_ID] text is set in config.py( can modify text of ljspeech_config.yaml) @@ -246,7 +246,7 @@ tacotron2/ ```bash cd scripts - bash run_distributed_train.sh [DATASET_PATH] [RANK_TABLE_PATH] [DATANAME] [RANK_SIZE] [DEVICE_BEGIN] + bash run_distribute_train.sh [DATASET_PATH] [RANK_TABLE_PATH] [DATANAME] [RANK_SIZE] [DEVICE_BEGIN] ``` 注:`DATASET_PATH`是包含HDF5文件的目录。 diff --git a/official/audio/Tacotron2/src/tacotron2.py b/official/audio/Tacotron2/src/tacotron2.py index e685cf0f8..a5b573f31 100644 --- a/official/audio/Tacotron2/src/tacotron2.py +++ b/official/audio/Tacotron2/src/tacotron2.py @@ -1168,7 +1168,7 @@ class TrainStepWrap(nn.Cell): overflow = ops.logical_not(amp.all_finite(grads)) if self.reducer_flag: - overflow = self.allreduce(overflow.to(mstype.float32)) >= self.base + overflow = self.all_reduce(overflow.to(mstype.float32)) >= self.base overflow = self.loss_scaling_manager(self.loss_scale, overflow) -- Gitee From 0b79514ce5098bdf015baacfc46a0b7959d9b858 Mon Sep 17 00:00:00 2001 From: 
The-truthh <821372701@qq.com> Date: Tue, 27 Feb 2024 17:24:37 +0800 Subject: [PATCH 14/44] fix the low evaluation accuracy of fasterrcnn --- official/cv/FasterRCNN/scripts/run_eval_ascend.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/official/cv/FasterRCNN/scripts/run_eval_ascend.sh b/official/cv/FasterRCNN/scripts/run_eval_ascend.sh index d27654617..d9b2be113 100644 --- a/official/cv/FasterRCNN/scripts/run_eval_ascend.sh +++ b/official/cv/FasterRCNN/scripts/run_eval_ascend.sh @@ -95,6 +95,7 @@ export DEVICE_NUM=1 export RANK_SIZE=$DEVICE_NUM export DEVICE_ID=$5 export RANK_ID=0 +export MS_ASCEND_CHECK_OVERFLOW_MODE="SATURATION_MODE" if [ -d "eval" ]; then -- Gitee From c63fb183e748427c2d59d96e5a79f9543f56844d Mon Sep 17 00:00:00 2001 From: ash Date: Thu, 14 Mar 2024 16:48:40 +0800 Subject: [PATCH 15/44] fix resnet 32 card --- official/cv/ResNet/scripts/run_distribute_train.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/official/cv/ResNet/scripts/run_distribute_train.sh b/official/cv/ResNet/scripts/run_distribute_train.sh index 7cb80caaa..a9a464c67 100644 --- a/official/cv/ResNet/scripts/run_distribute_train.sh +++ b/official/cv/ResNet/scripts/run_distribute_train.sh @@ -97,6 +97,7 @@ ulimit -u unlimited export DEVICE_NUM=8 export RANK_SIZE=8 export RANK_TABLE_FILE=$PATH1 +export MS_ASCEND_CHECK_OVERFLOW_MODE="SATURATION_MODE" export SERVER_ID=0 rank_start=$((DEVICE_NUM * SERVER_ID)) -- Gitee From 993b04d83c826d2096bd91a296fe573528799b36 Mon Sep 17 00:00:00 2001 From: yuchaojie Date: Fri, 15 Mar 2024 10:14:12 +0800 Subject: [PATCH 16/44] fix FasterRCNN compile time --- official/cv/FasterRCNN/train.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/official/cv/FasterRCNN/train.py b/official/cv/FasterRCNN/train.py index 0bf819657..5de58ceab 100644 --- a/official/cv/FasterRCNN/train.py +++ b/official/cv/FasterRCNN/train.py @@ -287,7 +287,8 @@ def train_fasterrcnn(): if __name__ == '__main__': set_seed(1) - ms.set_context(mode=ms.GRAPH_MODE, device_target=config.device_target, device_id=get_device_id()) + ms.set_context(mode=ms.GRAPH_MODE, device_target=config.device_target, device_id=get_device_id(), + ascend_config={"ge_options": {"ge.exec.memoryOptimizationPolicy": ""}}) set_ascend_max_device_memory() local_path = '/'.join(os.path.realpath(__file__).split('/')[:-1]) summary_dir = local_path + "/train/summary/" -- Gitee From 7075ca15a546a91be1037843fab948e75ea3a528 Mon Sep 17 00:00:00 2001 From: gaoshuanglong Date: Mon, 18 Mar 2024 11:10:27 +0800 Subject: [PATCH 17/44] Fix FasterRcnn ArgMaxWithValue op data type changes --- official/cv/FasterRCNN/src/FasterRcnn/bbox_assign_sample.py | 3 ++- .../FasterRCNN/src/FasterRcnn/bbox_assign_sample_stage2.py | 5 +++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/official/cv/FasterRCNN/src/FasterRcnn/bbox_assign_sample.py b/official/cv/FasterRCNN/src/FasterRcnn/bbox_assign_sample.py index 57c758a51..0cb30ad1c 100644 --- a/official/cv/FasterRCNN/src/FasterRcnn/bbox_assign_sample.py +++ b/official/cv/FasterRCNN/src/FasterRcnn/bbox_assign_sample.py @@ -115,7 +115,8 @@ class BboxAssignSample(nn.Cell): pos_sample_iou_mask = self.greaterequal(max_overlaps_w_gt, self.pos_iou_thr) assigned_gt_inds3 = self.select(pos_sample_iou_mask, - max_overlaps_w_gt_index + self.assigned_gt_ones, assigned_gt_inds2) + max_overlaps_w_gt_index.astype(ms.int32) + self.assigned_gt_ones, + assigned_gt_inds2) assigned_gt_inds4 = assigned_gt_inds3 for j in range(self.num_gts): max_overlaps_w_ac_j = max_overlaps_w_ac[j:j + 1:1] diff 
--git a/official/cv/FasterRCNN/src/FasterRcnn/bbox_assign_sample_stage2.py b/official/cv/FasterRCNN/src/FasterRcnn/bbox_assign_sample_stage2.py index 5cbc16ee5..63a3355c4 100644 --- a/official/cv/FasterRCNN/src/FasterRcnn/bbox_assign_sample_stage2.py +++ b/official/cv/FasterRCNN/src/FasterRcnn/bbox_assign_sample_stage2.py @@ -129,8 +129,9 @@ class BboxAssignSampleForRcnn(nn.Cell): assigned_gt_inds2 = self.select(neg_sample_iou_mask, self.assigned_gt_zeros, self.assigned_gt_inds) pos_sample_iou_mask = self.greaterequal(max_overlaps_w_gt, self.scalar_pos_iou_thr) - assigned_gt_inds3 = self.select(pos_sample_iou_mask, \ - max_overlaps_w_gt_index + self.assigned_gt_ones, assigned_gt_inds2) + assigned_gt_inds3 = self.select(pos_sample_iou_mask, + max_overlaps_w_gt_index.astype(ms.int32) + self.assigned_gt_ones, + assigned_gt_inds2) for j in range(self.num_gts): max_overlaps_w_ac_j = max_overlaps_w_ac[j:j+1:1] -- Gitee From 8bc109455443788b442897315477d0ede81aa05d Mon Sep 17 00:00:00 2001 From: yuchaojie Date: Mon, 18 Mar 2024 16:03:59 +0800 Subject: [PATCH 18/44] fix fasterrcnn ascend option --- official/cv/FasterRCNN/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/official/cv/FasterRCNN/train.py b/official/cv/FasterRCNN/train.py index 5de58ceab..4511033cb 100644 --- a/official/cv/FasterRCNN/train.py +++ b/official/cv/FasterRCNN/train.py @@ -288,7 +288,7 @@ def train_fasterrcnn(): if __name__ == '__main__': set_seed(1) ms.set_context(mode=ms.GRAPH_MODE, device_target=config.device_target, device_id=get_device_id(), - ascend_config={"ge_options": {"ge.exec.memoryOptimizationPolicy": ""}}) + ascend_config={"ge_options": {"global": {"ge.exec.memoryOptimizationPolicy": ""}}}) set_ascend_max_device_memory() local_path = '/'.join(os.path.realpath(__file__).split('/')[:-1]) summary_dir = local_path + "/train/summary/" -- Gitee From 5e9af039f6e0ed8c705e72859166f031412c46b2 Mon Sep 17 00:00:00 2001 From: ash Date: Wed, 20 Mar 2024 18:04:50 +0800 Subject: [PATCH 19/44] update rank table to msrun --- .../scripts/run_distribute_train_msrun.sh | 138 ++++++++++++++++++ 1 file changed, 138 insertions(+) create mode 100644 official/cv/ResNet/scripts/run_distribute_train_msrun.sh diff --git a/official/cv/ResNet/scripts/run_distribute_train_msrun.sh b/official/cv/ResNet/scripts/run_distribute_train_msrun.sh new file mode 100644 index 000000000..4ca63ed1d --- /dev/null +++ b/official/cv/ResNet/scripts/run_distribute_train_msrun.sh @@ -0,0 +1,138 @@ +#!/bin/bash +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +CURPATH="$(dirname "$0")" +# shellcheck source=/dev/null +. 
${CURPATH}/cache_util.sh + +if [ $# != 2 ] && [ $# != 3 ] && [ $# != 4 ] && [ $# != 5 ] +then + echo "Usage: bash run_distribute_train_msrun.sh [DATASET_PATH] [CONFIG_PATH]" + echo "Usage: bash run_distribute_train_msrun.sh [DATASET_PATH] [CONFIG_PATH] [RESUME_CKPT](optional)" + echo "Usage: bash run_distribute_train_msrun.sh [DATASET_PATH] [CONFIG_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional)" + echo "Usage: bash run_distribute_train_msrun.sh [DATASET_PATH] [CONFIG_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional) [RESUME_CKPT](optional)" + exit 1 +fi + +get_real_path(){ + if [ "${1:0:1}" == "/" ]; then + echo "$1" + else + echo "$(realpath -m $PWD/$1)" + fi +} + +PATH1=$(get_real_path $1) +CONFIG_FILE=$(get_real_path $2) +str="Boost_" +if [[ $CONFIG_FILE =~ $str ]] +then + export MS_DISABLE_REF_MODE=1 + export MS_ENABLE_FORMAT_MODE=0 +fi + +if [ $# == 3 ] +then + RESUME_CKPT=$(get_real_path $3) +fi + +if [ $# == 4 ] +then + RUN_EVAL=$3 + EVAL_DATASET_PATH=$(get_real_path $4) +fi + +if [ $# == 5 ] +then + RUN_EVAL=$3 + EVAL_DATASET_PATH=$(get_real_path $4) + RESUME_CKPT=$(get_real_path $5) +fi + +if [ ! -d $PATH1 ] +then + echo "error: DATASET_PATH=$PATH1 is not a directory" +exit 1 +fi + +if [ $# == 3 ] && [ ! -f $RESUME_CKPT ] +then + echo "error: RESUME_CKPT=$RESUME_CKPT is not a file" +exit 1 +fi + +if [ "x${RUN_EVAL}" == "xTrue" ] && [ ! -d $EVAL_DATASET_PATH ] +then + echo "error: EVAL_DATASET_PATH=$EVAL_DATASET_PATH is not a directory" + exit 1 +fi + +if [ "x${RUN_EVAL}" == "xTrue" ] +then + bootup_cache_server + CACHE_SESSION_ID=$(generate_cache_session) +fi + +ulimit -u unlimited +export DEVICE_NUM=8 +export RANK_SIZE=8 +ulimit -u unlimited +export MS_ASCEND_CHECK_OVERFLOW_MODE="SATURATION_MODE" + +cd .. +env > env.log +echo "start training" +if [ $# == 2 ] +then + msrun --bind_core=True --worker_num=8 --local_worker_num=8 --master_port=8118 \ + --log_dir=msrun_log --join=True --cluster_time_out=300 \ + train.py --run_distribute=True --device_num=$RANK_SIZE --data_path=$PATH1 \ + --config_path=$CONFIG_FILE --output_dir './outputs' +fi + +if [ $# == 3 ] +then + msrun --bind_core=True --worker_num=8 --local_worker_num=8 --master_port=8118 \ + --log_dir=msrun_log --join=True --cluster_time_out=300 \ + train.py --run_distribute=True --device_num=$RANK_SIZE --data_path=$PATH1 --resume_ckpt=$RESUME_CKPT \ + --config_path=$CONFIG_FILE --output_dir './outputs' +fi + +if [ $# == 4 ] +then + msrun --bind_core=True --worker_num=8 --local_worker_num=8 --master_port=8118 \ + --log_dir=msrun_log --join=True --cluster_time_out=300 \ + train.py --run_distribute=True --device_num=$RANK_SIZE --data_path=$PATH1 \ + --run_eval=$RUN_EVAL --eval_dataset_path=$EVAL_DATASET_PATH --enable_cache=True \ + --cache_session_id=$CACHE_SESSION_ID --config_path=$CONFIG_FILE --output_dir './outputs' + if [ "x${RUN_EVAL}" == "xTrue" ] + then + echo -e "\nWhen training run is done, remember to shut down the cache server via \"cache_admin --stop\"" + fi +fi + +if [ $# == 5 ] +then + msrun --bind_core=True --worker_num=8 --local_worker_num=8 --master_port=8118 \ + --log_dir=msrun_log --join=True --cluster_time_out=300 \ + train.py --run_distribute=True --device_num=$RANK_SIZE --data_path=$PATH1 \ + --run_eval=$RUN_EVAL --eval_dataset_path=$EVAL_DATASET_PATH --enable_cache=True --resume_ckpt=$RESUME_CKPT \ + --cache_session_id=$CACHE_SESSION_ID --config_path=$CONFIG_FILE --output_dir './outputs' + if [ "x${RUN_EVAL}" == "xTrue" ] + then + echo -e "\nWhen training run is done, remember to shut down 
the cache server via \"cache_admin --stop\"" + fi +fi -- Gitee From 38844ff20b3aa0ab8fcbb8cc64579b8b914be875 Mon Sep 17 00:00:00 2001 From: gaoshuanglong Date: Thu, 21 Mar 2024 14:34:03 +0800 Subject: [PATCH 20/44] Fix MaskRCNN ArgMaxWithValue op data type changes --- .../src/maskrcnn_mobilenetv1/bbox_assign_sample.py | 5 +++-- .../src/maskrcnn_mobilenetv1/bbox_assign_sample_stage2.py | 5 +++-- .../maskrcnn_resnet50/src/maskrcnn/bbox_assign_sample.py | 5 +++-- .../src/maskrcnn/bbox_assign_sample_stage2.py | 5 +++-- 4 files changed, 12 insertions(+), 8 deletions(-) diff --git a/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/bbox_assign_sample.py b/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/bbox_assign_sample.py index e0345dfc3..7cc702ab8 100644 --- a/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/bbox_assign_sample.py +++ b/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/bbox_assign_sample.py @@ -124,8 +124,9 @@ class BboxAssignSample(nn.Cell): assigned_gt_inds2 = self.select(neg_sample_iou_mask, self.assigned_gt_zeros, self.assigned_gt_inds) pos_sample_iou_mask = self.greaterequal(max_overlaps_w_gt, self.pos_iou_thr) - assigned_gt_inds3 = self.select(pos_sample_iou_mask, \ - max_overlaps_w_gt_index + self.assigned_gt_ones, assigned_gt_inds2) + assigned_gt_inds3 = self.select(pos_sample_iou_mask, + max_overlaps_w_gt_index.astype(mstype.int32) + self.assigned_gt_ones, + assigned_gt_inds2) assigned_gt_inds4 = assigned_gt_inds3 for j in range(self.num_gts): max_overlaps_w_ac_j = max_overlaps_w_ac[j:j+1:1] diff --git a/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/bbox_assign_sample_stage2.py b/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/bbox_assign_sample_stage2.py index 470f709d6..de567eca9 100644 --- a/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/bbox_assign_sample_stage2.py +++ b/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/bbox_assign_sample_stage2.py @@ -148,8 +148,9 @@ class BboxAssignSampleForRcnn(nn.Cell): assigned_gt_inds2 = self.select(neg_sample_iou_mask, self.assigned_gt_zeros, self.assigned_gt_inds) pos_sample_iou_mask = self.greaterequal(max_overlaps_w_gt, self.scalar_pos_iou_thr) - assigned_gt_inds3 = self.select(pos_sample_iou_mask, \ - max_overlaps_w_gt_index + self.assigned_gt_ones, assigned_gt_inds2) + assigned_gt_inds3 = self.select(pos_sample_iou_mask, + max_overlaps_w_gt_index.astype(mstype.int32) + self.assigned_gt_ones, + assigned_gt_inds2) for j in range(self.num_gts): max_overlaps_w_ac_j = max_overlaps_w_ac[j:j+1:1] diff --git a/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/bbox_assign_sample.py b/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/bbox_assign_sample.py index e4421110c..01542f0f9 100644 --- a/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/bbox_assign_sample.py +++ b/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/bbox_assign_sample.py @@ -121,8 +121,9 @@ class BboxAssignSample(nn.Cell): assigned_gt_inds2 = self.select(neg_sample_iou_mask, self.assigned_gt_zeros, self.assigned_gt_inds) pos_sample_iou_mask = self.greaterequal(max_overlaps_w_gt, self.pos_iou_thr) - assigned_gt_inds3 = self.select(pos_sample_iou_mask, \ - max_overlaps_w_gt_index + self.assigned_gt_ones, assigned_gt_inds2) + assigned_gt_inds3 = self.select(pos_sample_iou_mask, + max_overlaps_w_gt_index.astype(mstype.int32) + self.assigned_gt_ones, + assigned_gt_inds2) assigned_gt_inds4 = assigned_gt_inds3 
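# Editor's note, not part of the patch: per the commit subjects in this series,
# the index tensor returned by ArgMaxWithValue stopped matching the int32 dtype
# of self.assigned_gt_ones, so each hunk casts it with astype(mstype.int32)
# before the integer addition. A minimal sketch of the same pattern, assuming a
# recent MindSpore build:
#
#     import mindspore as ms
#     from mindspore import ops, Tensor
#     overlaps = Tensor([[0.1, 0.7], [0.6, 0.2]], ms.float32)
#     index, value = ops.ArgMaxWithValue(axis=0)(overlaps)
#     index = index.astype(ms.int32)        # align dtypes before adding
#     assigned = index + Tensor(1, ms.int32)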
for j in range(self.num_gts): max_overlaps_w_ac_j = max_overlaps_w_ac[j:j+1:1] diff --git a/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/bbox_assign_sample_stage2.py b/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/bbox_assign_sample_stage2.py index 681b8a300..d6d57e00a 100644 --- a/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/bbox_assign_sample_stage2.py +++ b/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/bbox_assign_sample_stage2.py @@ -146,8 +146,9 @@ class BboxAssignSampleForRcnn(nn.Cell): assigned_gt_inds2 = self.select(neg_sample_iou_mask, self.assigned_gt_zeros, self.assigned_gt_inds) pos_sample_iou_mask = self.greaterequal(max_overlaps_w_gt, self.scalar_pos_iou_thr) - assigned_gt_inds3 = self.select(pos_sample_iou_mask, \ - max_overlaps_w_gt_index + self.assigned_gt_ones, assigned_gt_inds2) + assigned_gt_inds3 = self.select(pos_sample_iou_mask, + max_overlaps_w_gt_index.astype(mstype.int32) + self.assigned_gt_ones, + assigned_gt_inds2) for j in range(self.num_gts): max_overlaps_w_ac_j = max_overlaps_w_ac[j:j+1:1] -- Gitee From a80fb3472bd78d01c4fe8dbb0f37153af467510f Mon Sep 17 00:00:00 2001 From: ash Date: Fri, 22 Mar 2024 10:37:16 +0800 Subject: [PATCH 21/44] add msrun of bert for 8p --- .../run_distributed_pretrain_ascend_msrun.sh | 33 +++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 official/nlp/Bert/scripts/run_distributed_pretrain_ascend_msrun.sh diff --git a/official/nlp/Bert/scripts/run_distributed_pretrain_ascend_msrun.sh b/official/nlp/Bert/scripts/run_distributed_pretrain_ascend_msrun.sh new file mode 100644 index 000000000..d429f59d0 --- /dev/null +++ b/official/nlp/Bert/scripts/run_distributed_pretrain_ascend_msrun.sh @@ -0,0 +1,33 @@ +#!/bin/bash +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +echo "==============================================================================================================" +echo "Please run the script as: " +echo "bash run_distributed_pretrain_ascend_msrun.sh DATA_DIR" +echo "for example: bash run_distributed_pretrain_ascend_msrun.sh /path/dataset" +echo "It is better to use absolute path." +echo "==============================================================================================================" +export RANK_SIZE=8 +export DEPLOY_MODE=0 +export GE_USE_STATIC_MEMORY=1 +ulimit -s 302400 +cd .. 
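# Editor's note, not part of the patch: msrun is the launcher that replaces the
# rank-table startup used by the older distributed scripts. The flags below,
# reused by every msrun script in this series, mean:
#   --worker_num=8 / --local_worker_num=8   all 8 ranks run on this one host
#   --master_port=8118                      port for the msrun scheduler
#   --log_dir=msrun_log                     directory collecting per-rank logs
#   --join=True                             block until every worker exits
#   --cluster_time_out=300                  seconds to wait for ranks to join
#   --bind_core=True                        pin each worker process to CPU cores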
+msrun --bind_core=True --worker_num=8 --local_worker_num=8 \ + --master_port=8118 --log_dir=msrun_log --join=True --cluster_time_out=300 \ + run_pretrain.py --data_dir=$1 --distribute=true --epoch_size=40 \ + --enable_save_ckpt=true --do_shuffle=true --enable_data_sink=true \ + --data_sink_steps=100 --accumulation_steps=1 --allreduce_post_accumulation=true \ + --save_checkpoint_path=./ckpt --save_checkpoint_num=1 --config_path=../../pretrain_config.yaml -- Gitee From 257f37288a38296b2d4642f4e5100b9fda68e45a Mon Sep 17 00:00:00 2001 From: The-truthh <821372701@qq.com> Date: Wed, 27 Mar 2024 10:53:27 +0800 Subject: [PATCH 22/44] add msrun of fasterrcnn for 8p --- .../run_distribute_train_ascend_msrun.sh | 96 +++++++++++++++++++ 1 file changed, 96 insertions(+) create mode 100644 official/cv/FasterRCNN/scripts/run_distribute_train_ascend_msrun.sh diff --git a/official/cv/FasterRCNN/scripts/run_distribute_train_ascend_msrun.sh b/official/cv/FasterRCNN/scripts/run_distribute_train_ascend_msrun.sh new file mode 100644 index 000000000..061b967c2 --- /dev/null +++ b/official/cv/FasterRCNN/scripts/run_distribute_train_ascend_msrun.sh @@ -0,0 +1,96 @@ +#!/bin/bash +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +if [ $# -le 2 ] +then + echo "Usage: bash run_distribute_train_ascend_msrun.sh [PRETRAINED_PATH] [BACKBONE] [COCO_ROOT] [MINDRECORD_DIR](option)" +exit 1 +fi + +if [ $2 != "resnet_v1_50" ] && [ $2 != "resnet_v1.5_50" ] && [ $2 != "resnet_v1_101" ] && [ $2 != "resnet_v1_152" ] && [ $2 != "inception_resnet_v2" ] +then + echo "error: the selected backbone must be resnet_v1_50, resnet_v1.5_50, resnet_v1_101, resnet_v1_152, inception_resnet_v2" +exit 1 +fi + +get_real_path(){ + if [ "${1:0:1}" == "/" ]; then + echo "$1" + else + echo "$(realpath -m $PWD/$1)" + fi +} + +PATH1=$(get_real_path $1) +PATH2=$(get_real_path $3) +echo $PATH1 +echo $PATH2 + +if [ ! -f $PATH1 ] +then + echo "error: PRETRAINED_PATH=$PATH1 is not a file" +exit 1 +fi + +if [ ! -d $PATH2 ] +then + echo "error: COCO_ROOT=$PATH2 is not a dir" +exit 1 +fi + +mindrecord_dir=$PATH2/MindRecord_COCO_TRAIN/ +if [ $# -eq 4 ] +then + mindrecord_dir=$(get_real_path $4) + if [ ! 
-d $mindrecord_dir ] + then + echo "error: mindrecord_dir=$mindrecord_dir is not a dir" + exit 1 + fi +fi +echo $mindrecord_dir + +BASE_PATH=$(cd ./"`dirname $0`" || exit; pwd) +if [ $# -ge 1 ]; then + if [ $2 == 'resnet_v1.5_50' ]; then + CONFIG_FILE="${BASE_PATH}/../default_config.yaml" + elif [ $2 == 'resnet_v1_101' ]; then + CONFIG_FILE="${BASE_PATH}/../default_config_101.yaml" + elif [ $2 == 'resnet_v1_152' ]; then + CONFIG_FILE="${BASE_PATH}/../default_config_152.yaml" + elif [ $2 == 'resnet_v1_50' ]; then + CONFIG_FILE="${BASE_PATH}/../default_config.yaml" + elif [ $2 == 'inception_resnet_v2' ]; then + CONFIG_FILE="${BASE_PATH}/../default_config_InceptionResnetV2.yaml" + else + echo "Unrecognized parameter" + exit 1 + fi +else + CONFIG_FILE="${BASE_PATH}/../default_config.yaml" +fi + +ulimit -u unlimited +export HCCL_CONNECT_TIMEOUT=600 +export DEVICE_NUM=8 +export RANK_SIZE=8 +export MS_ASCEND_CHECK_OVERFLOW_MODE="SATURATION_MODE" + +echo "Start training..." +msrun --bind_core=True --worker_num=8 --local_worker_num=8 --master_port=8118 \ + --log_dir=msrun_log --join=True --cluster_time_out=300 \ + ${BASE_PATH}/../train.py --config_path=$CONFIG_FILE --coco_root=$PATH2 --mindrecord_dir=$mindrecord_dir \ + --run_distribute=True --device_num=$DEVICE_NUM --pre_trained=$PATH1 --backbone=$2 & -- Gitee From da1fc7efa83043c88185b3bea84e3a44ff72693c Mon Sep 17 00:00:00 2001 From: PingqiLi Date: Mon, 1 Apr 2024 14:55:43 +0800 Subject: [PATCH 23/44] Add msrun scripts for OpenPose and Unet --- .../scripts/run_distribute_train_msrun.sh | 31 +++++++++++++ .../scripts/run_distribute_train_msrun.sh | 44 +++++++++++++++++++ 2 files changed, 75 insertions(+) create mode 100644 official/cv/OpenPose/scripts/run_distribute_train_msrun.sh create mode 100644 official/cv/Unet/scripts/run_distribute_train_msrun.sh diff --git a/official/cv/OpenPose/scripts/run_distribute_train_msrun.sh b/official/cv/OpenPose/scripts/run_distribute_train_msrun.sh new file mode 100644 index 000000000..e210bfb00 --- /dev/null +++ b/official/cv/OpenPose/scripts/run_distribute_train_msrun.sh @@ -0,0 +1,31 @@ +#!/bin/bash +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +if [ $# != 4 ] +then + echo "Usage: bash scripts/run_distribute_train_msrun.sh [IAMGEPATH_TRAIN] [JSONPATH_TRAIN] [MASKPATH_TRAIN] [VGG_PATH]" +exit 1 +fi + +export DEVICE_NUM=8 +export RANK_SIZE=8 +unlimit -u unlimited +env > env.log + +echo "start training" +msrun --bind_core=True --worker_num=8 --local_worker_num=8 --master_port 8118 \ + --log_dir=msrun_log --join=True --cluster_time_out=300 \ + train.py --imgpath_train=$1 --jsonpath_train=$2 --maskpath_train=$3 --vgg_path=$4 + diff --git a/official/cv/Unet/scripts/run_distribute_train_msrun.sh b/official/cv/Unet/scripts/run_distribute_train_msrun.sh new file mode 100644 index 000000000..e4a62db33 --- /dev/null +++ b/official/cv/Unet/scripts/run_distribute_train_msrun.sh @@ -0,0 +1,44 @@ +#!/bin/bash +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +if [ $# != 2 ] +then + echo "==============================================================================================================" + echo "Usage: bash scripts/run_distribute_train_msrun.sh [DATASET] [CONFIG_PATH]" + echo "Please run the script as: " + echo "bash scripts/run_distribute_train_msrun.sh [DATASET] [CONFIG_PATH]" + echo "for example: bash run_distribute_train_msrun.sh /absolute/path/to/data /absolute/path/to/config" + echo "==============================================================================================================" + exit 1 +fi + +get_real_path() { + if [ "${1:0:1}" == "/" ]; then + echo "$1" + else + echo "$(realpath -m $PWD/$1)" + fi +} + +DATASET=$(get_real_path $1) +CONFIG_PATH=$(get_real_path $2) +ulimit -u unlimited +env > env.log + +echo "start training" +msrun --bind_core=True --worker_num=8 --local_worker_num=8 --master_port 8118 \ + --log_dir=msrun_log --join=True --cluster_time_out=300 \ + train.py --run_distribute_train --data_path=$DATASET --config_path=$CONFIG_PATH --output_path './output' -- Gitee From 75dfb0f1c7191b5811d7b77e8ce380097f646fb8 Mon Sep 17 00:00:00 2001 From: PingqiLi Date: Tue, 9 Apr 2024 16:32:44 +0800 Subject: [PATCH 24/44] Add msrun script for mobilenetv2 --- .../scripts/run_distribute_train_msrun.sh | 45 +++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 official/cv/MobileNet/mobilenetv2/scripts/run_distribute_train_msrun.sh diff --git a/official/cv/MobileNet/mobilenetv2/scripts/run_distribute_train_msrun.sh b/official/cv/MobileNet/mobilenetv2/scripts/run_distribute_train_msrun.sh new file mode 100644 index 000000000..1b2de9149 --- /dev/null +++ b/official/cv/MobileNet/mobilenetv2/scripts/run_distribute_train_msrun.sh @@ -0,0 +1,45 @@ +#!/bin/bash +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +echo "==============================================================================================================" +echo "Please run the script as: " +echo "bash scripts/run_distributed_train_msrun.sh [DATA_PATH] [RANK_SIZE]" +echo "For example: bash scripts/run_distributed_train_msrun.sh /path/dataset 8" +echo "It is better to use the absolute path." +echo "==============================================================================================================" + +DATA_PATH=$1 +RANK_SIZE=$2 +export DATA_PATH=${DATA_PATH} +export RANK_SIZE=${RANK_SIZE} +export HCCL_CONNECT_TIMEOUT=600 +ulimit -s 302400 + +EXEC_PATH=$(pwd) +CONFIG_PATH=${EXEC_PATH}/default_config.yaml + +if [ ! -d "${DATA_PATH}" ] +then + echo "ERROR: ${DATA_PATH} is not a valid path for dataset, please check." + exit 0 +fi + +env > env.log +echo "start training" +msrun --bind_core=True --worker_num=8 --local_worker_num=8 --master_port 8118 \ + --log_dir=msrun_log --join=True --cluster_time_out=300 \ + train.py --run_distribute True --config_path=${CONFIG_PATH} --platform Ascend --dataset_path=${DATA_PATH} --rank_size ${RANK_SIZE} + -- Gitee From 778802f7d4ca90d4e8afaefdc9f89c08f2f27b1d Mon Sep 17 00:00:00 2001 From: zhaoting Date: Fri, 12 Apr 2024 07:06:42 +0800 Subject: [PATCH 25/44] bert 910b --- official/nlp/Bert/pretrain_config_Ascend_Boost.yaml | 1 + official/nlp/Bert/run_pretrain.py | 9 +++++++++ official/nlp/Bert/src/bert_for_pre_training.py | 12 +++++++++--- 3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/official/nlp/Bert/pretrain_config_Ascend_Boost.yaml b/official/nlp/Bert/pretrain_config_Ascend_Boost.yaml index 0a9680235..032ac0db4 100644 --- a/official/nlp/Bert/pretrain_config_Ascend_Boost.yaml +++ b/official/nlp/Bert/pretrain_config_Ascend_Boost.yaml @@ -14,6 +14,7 @@ enable_profiling: False # ============================================================================== description: 'run_pretrain' distribute: 'false' +max_device_memory: "28.5GB" epoch_size: 40 device_id: 0 device_num: 1 diff --git a/official/nlp/Bert/run_pretrain.py b/official/nlp/Bert/run_pretrain.py index 1d1ba0d25..477d2b47b 100644 --- a/official/nlp/Bert/run_pretrain.py +++ b/official/nlp/Bert/run_pretrain.py @@ -17,6 +17,7 @@ python run_pretrain.py """ import os +import mindspore as ms import mindspore.communication.management as D from mindspore.communication.management import get_rank import mindspore.common.dtype as mstype @@ -159,6 +160,13 @@ def modelarts_pre_process(): cfg.save_checkpoint_path = os.path.join(cfg.output_path, cfg.save_checkpoint_path) +def set_ascend_max_device_memory(config): + if ms.get_context("enable_ge") and ms.get_context("mode") == ms.GRAPH_MODE and \ + hasattr(config, "max_device_memory"): + logger.warning("When encountering a memory shortage situation in 1980B, reduce the max_device_memory.") + ms.set_context(max_device_memory=config.max_device_memory) + + def InitNetWithGrads(net_with_loss, optimizer): '''init net with grads''' if cfg.enable_lossscale == "true": @@ -204,6 +212,7 @@ def 
run_pretrain(): device_num = 1 if cfg.distribute == "true": if cfg.device_target == 'Ascend': + set_ascend_max_device_memory(cfg) D.init() device_num = cfg.device_num rank = cfg.device_id % device_num diff --git a/official/nlp/Bert/src/bert_for_pre_training.py b/official/nlp/Bert/src/bert_for_pre_training.py index 16ba3407b..654760201 100644 --- a/official/nlp/Bert/src/bert_for_pre_training.py +++ b/official/nlp/Bert/src/bert_for_pre_training.py @@ -28,6 +28,7 @@ from mindspore.nn.wrap.grad_reducer import DistributedGradReducer from mindspore.context import ParallelMode from mindspore.communication.management import get_group_size from mindspore import context, amp, ops +from mindspore._c_expression import MSContext from .bert_model import BertModel GRADIENT_CLIP_TYPE = 1 @@ -366,6 +367,7 @@ class BertTrainOneStepWithLossScaleCell(nn.TrainOneStepWithLossScaleCell): if scale_update_cell: self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32)) self.enable_tuple_broaden = True + self.ascend_910a_target = (MSContext.get_instance().get_ascend_soc_version() == 'ascend910') @jit def clip_grads(self, grads): @@ -395,6 +397,7 @@ class BertTrainOneStepWithLossScaleCell(nn.TrainOneStepWithLossScaleCell): else: scaling_sens = sens status, scaling_sens = self.start_overflow_check(loss, scaling_sens) + scaling_sens = self.cast(scaling_sens, mstype.float32) grads = self.grad(self.network, weights)(input_ids, input_mask, token_type_id, @@ -402,11 +405,14 @@ class BertTrainOneStepWithLossScaleCell(nn.TrainOneStepWithLossScaleCell): masked_lm_positions, masked_lm_ids, masked_lm_weights, - self.cast(scaling_sens, - mstype.float32)) + scaling_sens) # apply grad reducer on grads grads = self.grad_reducer(grads) - degree_sens = self.cast(scaling_sens * self.degree, mstype.float32) + if not self.ascend_910a_target: + scaling_sens = F.depend(scaling_sens, grads) + degree_sens = self.allreduce(scaling_sens) + else: + degree_sens = scaling_sens * Tensor(self.degree, mstype.float32) grads = self.hyper_map(F.partial(grad_scale, degree_sens), grads) grads = self.clip_grads(grads) -- Gitee From 6b4f7d045c047d97a4115f9c1229498d0d411dbc Mon Sep 17 00:00:00 2001 From: gaoshuanglong Date: Fri, 12 Apr 2024 17:30:06 +0800 Subject: [PATCH 26/44] Add &> log.txt & to msrun.sh --- .../scripts/run_distribute_train_ascend_msrun.sh | 2 +- .../mobilenetv2/scripts/run_distribute_train_msrun.sh | 2 +- .../cv/OpenPose/scripts/run_distribute_train_msrun.sh | 2 +- official/cv/ResNet/scripts/run_distribute_train_msrun.sh | 8 ++++---- official/cv/Unet/scripts/run_distribute_train_msrun.sh | 2 +- .../Bert/scripts/run_distributed_pretrain_ascend_msrun.sh | 2 +- 6 files changed, 9 insertions(+), 9 deletions(-) diff --git a/official/cv/FasterRCNN/scripts/run_distribute_train_ascend_msrun.sh b/official/cv/FasterRCNN/scripts/run_distribute_train_ascend_msrun.sh index 061b967c2..730440d27 100644 --- a/official/cv/FasterRCNN/scripts/run_distribute_train_ascend_msrun.sh +++ b/official/cv/FasterRCNN/scripts/run_distribute_train_ascend_msrun.sh @@ -93,4 +93,4 @@ echo "Start training..." 
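# Editor's note, not part of the patch: each hunk below appends "&> log.txt &"
# to an msrun launch (replacing a bare trailing "&" where one existed), so the
# launcher's stdout and stderr are captured in log.txt before it is sent to the
# background and the shell returns immediately. "&>" is a bash-ism; the
# equivalent POSIX spelling would be:
#
#     msrun ... train.py ... > log.txt 2>&1 &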
msrun --bind_core=True --worker_num=8 --local_worker_num=8 --master_port=8118 \ --log_dir=msrun_log --join=True --cluster_time_out=300 \ ${BASE_PATH}/../train.py --config_path=$CONFIG_FILE --coco_root=$PATH2 --mindrecord_dir=$mindrecord_dir \ - --run_distribute=True --device_num=$DEVICE_NUM --pre_trained=$PATH1 --backbone=$2 & + --run_distribute=True --device_num=$DEVICE_NUM --pre_trained=$PATH1 --backbone=$2 &> log.txt & diff --git a/official/cv/MobileNet/mobilenetv2/scripts/run_distribute_train_msrun.sh b/official/cv/MobileNet/mobilenetv2/scripts/run_distribute_train_msrun.sh index 1b2de9149..636e337f6 100644 --- a/official/cv/MobileNet/mobilenetv2/scripts/run_distribute_train_msrun.sh +++ b/official/cv/MobileNet/mobilenetv2/scripts/run_distribute_train_msrun.sh @@ -41,5 +41,5 @@ env > env.log echo "start training" msrun --bind_core=True --worker_num=8 --local_worker_num=8 --master_port 8118 \ --log_dir=msrun_log --join=True --cluster_time_out=300 \ - train.py --run_distribute True --config_path=${CONFIG_PATH} --platform Ascend --dataset_path=${DATA_PATH} --rank_size ${RANK_SIZE} + train.py --run_distribute True --config_path=${CONFIG_PATH} --platform Ascend --dataset_path=${DATA_PATH} --rank_size ${RANK_SIZE} &> log.txt & diff --git a/official/cv/OpenPose/scripts/run_distribute_train_msrun.sh b/official/cv/OpenPose/scripts/run_distribute_train_msrun.sh index e210bfb00..c4ad264e6 100644 --- a/official/cv/OpenPose/scripts/run_distribute_train_msrun.sh +++ b/official/cv/OpenPose/scripts/run_distribute_train_msrun.sh @@ -27,5 +27,5 @@ env > env.log echo "start training" msrun --bind_core=True --worker_num=8 --local_worker_num=8 --master_port 8118 \ --log_dir=msrun_log --join=True --cluster_time_out=300 \ - train.py --imgpath_train=$1 --jsonpath_train=$2 --maskpath_train=$3 --vgg_path=$4 + train.py --imgpath_train=$1 --jsonpath_train=$2 --maskpath_train=$3 --vgg_path=$4 &> log.txt & diff --git a/official/cv/ResNet/scripts/run_distribute_train_msrun.sh b/official/cv/ResNet/scripts/run_distribute_train_msrun.sh index 4ca63ed1d..156daf70a 100644 --- a/official/cv/ResNet/scripts/run_distribute_train_msrun.sh +++ b/official/cv/ResNet/scripts/run_distribute_train_msrun.sh @@ -100,7 +100,7 @@ then msrun --bind_core=True --worker_num=8 --local_worker_num=8 --master_port=8118 \ --log_dir=msrun_log --join=True --cluster_time_out=300 \ train.py --run_distribute=True --device_num=$RANK_SIZE --data_path=$PATH1 \ - --config_path=$CONFIG_FILE --output_dir './outputs' + --config_path=$CONFIG_FILE --output_dir './outputs' &> log.txt & fi if [ $# == 3 ] @@ -108,7 +108,7 @@ then msrun --bind_core=True --worker_num=8 --local_worker_num=8 --master_port=8118 \ --log_dir=msrun_log --join=True --cluster_time_out=300 \ train.py --run_distribute=True --device_num=$RANK_SIZE --data_path=$PATH1 --resume_ckpt=$RESUME_CKPT \ - --config_path=$CONFIG_FILE --output_dir './outputs' + --config_path=$CONFIG_FILE --output_dir './outputs' &> log.txt & fi if [ $# == 4 ] @@ -117,7 +117,7 @@ then --log_dir=msrun_log --join=True --cluster_time_out=300 \ train.py --run_distribute=True --device_num=$RANK_SIZE --data_path=$PATH1 \ --run_eval=$RUN_EVAL --eval_dataset_path=$EVAL_DATASET_PATH --enable_cache=True \ - --cache_session_id=$CACHE_SESSION_ID --config_path=$CONFIG_FILE --output_dir './outputs' + --cache_session_id=$CACHE_SESSION_ID --config_path=$CONFIG_FILE --output_dir './outputs' &> log.txt & if [ "x${RUN_EVAL}" == "xTrue" ] then echo -e "\nWhen training run is done, remember to shut down the cache server via \"cache_admin 
--stop\"" @@ -130,7 +130,7 @@ then --log_dir=msrun_log --join=True --cluster_time_out=300 \ train.py --run_distribute=True --device_num=$RANK_SIZE --data_path=$PATH1 \ --run_eval=$RUN_EVAL --eval_dataset_path=$EVAL_DATASET_PATH --enable_cache=True --resume_ckpt=$RESUME_CKPT \ - --cache_session_id=$CACHE_SESSION_ID --config_path=$CONFIG_FILE --output_dir './outputs' + --cache_session_id=$CACHE_SESSION_ID --config_path=$CONFIG_FILE --output_dir './outputs' &> log.txt & if [ "x${RUN_EVAL}" == "xTrue" ] then echo -e "\nWhen training run is done, remember to shut down the cache server via \"cache_admin --stop\"" diff --git a/official/cv/Unet/scripts/run_distribute_train_msrun.sh b/official/cv/Unet/scripts/run_distribute_train_msrun.sh index e4a62db33..df54ff886 100644 --- a/official/cv/Unet/scripts/run_distribute_train_msrun.sh +++ b/official/cv/Unet/scripts/run_distribute_train_msrun.sh @@ -41,4 +41,4 @@ env > env.log echo "start training" msrun --bind_core=True --worker_num=8 --local_worker_num=8 --master_port 8118 \ --log_dir=msrun_log --join=True --cluster_time_out=300 \ - train.py --run_distribute_train --data_path=$DATASET --config_path=$CONFIG_PATH --output_path './output' + train.py --run_distribute_train --data_path=$DATASET --config_path=$CONFIG_PATH --output_path './output' &> log.txt & diff --git a/official/nlp/Bert/scripts/run_distributed_pretrain_ascend_msrun.sh b/official/nlp/Bert/scripts/run_distributed_pretrain_ascend_msrun.sh index d429f59d0..f55364d99 100644 --- a/official/nlp/Bert/scripts/run_distributed_pretrain_ascend_msrun.sh +++ b/official/nlp/Bert/scripts/run_distributed_pretrain_ascend_msrun.sh @@ -30,4 +30,4 @@ msrun --bind_core=True --worker_num=8 --local_worker_num=8 \ run_pretrain.py --data_dir=$1 --distribute=true --epoch_size=40 \ --enable_save_ckpt=true --do_shuffle=true --enable_data_sink=true \ --data_sink_steps=100 --accumulation_steps=1 --allreduce_post_accumulation=true \ - --save_checkpoint_path=./ckpt --save_checkpoint_num=1 --config_path=../../pretrain_config.yaml + --save_checkpoint_path=./ckpt --save_checkpoint_num=1 --config_path=../../pretrain_config.yaml &> log.txt & -- Gitee From 34670c91a1821928f030d03c8aef111205c14794 Mon Sep 17 00:00:00 2001 From: zhaoting Date: Mon, 15 Apr 2024 01:38:30 +0800 Subject: [PATCH 27/44] fix a bug in bert 1p --- official/nlp/Bert/run_pretrain.py | 6 ++++-- official/nlp/Bert/src/bert_for_pre_training.py | 8 ++++---- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/official/nlp/Bert/run_pretrain.py b/official/nlp/Bert/run_pretrain.py index 477d2b47b..ce0709b27 100644 --- a/official/nlp/Bert/run_pretrain.py +++ b/official/nlp/Bert/run_pretrain.py @@ -31,6 +31,7 @@ from mindspore.train.train_thor import ConvertModelUtils from mindspore.nn.optim import Lamb, Momentum, AdamWeightDecay, thor from mindspore import log as logger from mindspore.common import set_seed +from mindspore._c_expression import MSContext from src import BertNetworkWithLoss, BertNetworkMatchBucket, \ BertTrainOneStepCell, \ BertTrainOneStepWithLossScaleCell, \ @@ -161,8 +162,9 @@ def modelarts_pre_process(): def set_ascend_max_device_memory(config): - if ms.get_context("enable_ge") and ms.get_context("mode") == ms.GRAPH_MODE and \ - hasattr(config, "max_device_memory"): + is_ascend910b_ge = ms.get_context("enable_ge") and ms.get_context("mode") == ms.GRAPH_MODE and \ + MSContext.get_instance().get_ascend_soc_version() != 'ascend910' + if is_ascend910b_ge and hasattr(config, "max_device_memory"): logger.warning("When encountering a memory 
shortage situation in 1980B, reduce the max_device_memory.") ms.set_context(max_device_memory=config.max_device_memory) diff --git a/official/nlp/Bert/src/bert_for_pre_training.py b/official/nlp/Bert/src/bert_for_pre_training.py index 654760201..e45d95e65 100644 --- a/official/nlp/Bert/src/bert_for_pre_training.py +++ b/official/nlp/Bert/src/bert_for_pre_training.py @@ -397,7 +397,6 @@ class BertTrainOneStepWithLossScaleCell(nn.TrainOneStepWithLossScaleCell): else: scaling_sens = sens status, scaling_sens = self.start_overflow_check(loss, scaling_sens) - scaling_sens = self.cast(scaling_sens, mstype.float32) grads = self.grad(self.network, weights)(input_ids, input_mask, token_type_id, @@ -405,14 +404,15 @@ class BertTrainOneStepWithLossScaleCell(nn.TrainOneStepWithLossScaleCell): masked_lm_positions, masked_lm_ids, masked_lm_weights, - scaling_sens) + self.cast(scaling_sens, mstype.float32)) # apply grad reducer on grads grads = self.grad_reducer(grads) - if not self.ascend_910a_target: + if not self.ascend_910a_target and self.reducer_flag: + scaling_sens = self.cast(scaling_sens, mstype.float32) scaling_sens = F.depend(scaling_sens, grads) degree_sens = self.allreduce(scaling_sens) else: - degree_sens = scaling_sens * Tensor(self.degree, mstype.float32) + degree_sens = self.cast(scaling_sens * self.degree, mstype.float32) grads = self.hyper_map(F.partial(grad_scale, degree_sens), grads) grads = self.clip_grads(grads) -- Gitee From 4304389c3877d2b533a8a49496362344dadb7102 Mon Sep 17 00:00:00 2001 From: PingqiLi Date: Mon, 15 Apr 2024 12:06:41 +0800 Subject: [PATCH 28/44] fix the msrun script of Unet --- official/cv/Unet/scripts/run_distribute_train_msrun.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/official/cv/Unet/scripts/run_distribute_train_msrun.sh b/official/cv/Unet/scripts/run_distribute_train_msrun.sh index df54ff886..b0f9d697d 100644 --- a/official/cv/Unet/scripts/run_distribute_train_msrun.sh +++ b/official/cv/Unet/scripts/run_distribute_train_msrun.sh @@ -41,4 +41,4 @@ env > env.log echo "start training" msrun --bind_core=True --worker_num=8 --local_worker_num=8 --master_port 8118 \ --log_dir=msrun_log --join=True --cluster_time_out=300 \ - train.py --run_distribute_train --data_path=$DATASET --config_path=$CONFIG_PATH --output_path './output' &> log.txt & + train.py --run_distribute=True --data_path=$DATASET --config_path=$CONFIG_PATH --output_path './output' &> log.txt & -- Gitee From 384b55399ac463cfc1241966e297627c51bed72b Mon Sep 17 00:00:00 2001 From: gaoshuanglong Date: Mon, 15 Apr 2024 15:50:59 +0800 Subject: [PATCH 29/44] Fix msrun script ulimit cmd error. 
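An editorial aside, not part of the commit below: `unlimit` is not a shell builtin, so the original line in the OpenPose msrun script failed with "command not found" instead of raising the process limit. A quick way to confirm the corrected form takes effect, assuming bash and a hard limit that permits it:

```bash
ulimit -u unlimited   # lift the max-user-processes limit for this shell
ulimit -u             # should now print "unlimited"
```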
--- official/cv/OpenPose/scripts/run_distribute_train_msrun.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/official/cv/OpenPose/scripts/run_distribute_train_msrun.sh b/official/cv/OpenPose/scripts/run_distribute_train_msrun.sh index e210bfb00..26e65b3d2 100644 --- a/official/cv/OpenPose/scripts/run_distribute_train_msrun.sh +++ b/official/cv/OpenPose/scripts/run_distribute_train_msrun.sh @@ -21,7 +21,7 @@ fi export DEVICE_NUM=8 export RANK_SIZE=8 -unlimit -u unlimited +ulimit -u unlimited env > env.log echo "start training" -- Gitee From 239067e663e08ee44c216b8c70877cb0b68d24c1 Mon Sep 17 00:00:00 2001 From: liuchongming Date: Mon, 15 Apr 2024 19:53:46 +0800 Subject: [PATCH 30/44] Remove add_pipeline_stage --- official/nlp/Pangu_alpha/src/pangu_alpha.py | 1 - 1 file changed, 1 deletion(-) diff --git a/official/nlp/Pangu_alpha/src/pangu_alpha.py b/official/nlp/Pangu_alpha/src/pangu_alpha.py index 02f1570be..244dde1fd 100644 --- a/official/nlp/Pangu_alpha/src/pangu_alpha.py +++ b/official/nlp/Pangu_alpha/src/pangu_alpha.py @@ -402,7 +402,6 @@ class PanguAlphaModel(nn.Cell): parallel_config=copied_parallel_config) self.head.pipeline_stage = config.parallel_config.pipeline_stage - 1 self.backbone = PanguAlpha_Model(config) - self.backbone.embedding.word_embedding.embedding_table.add_pipeline_stage(self.head.pipeline_stage) def construct(self, input_ids, input_position, attention_mask, init_reset=True, batch_valid_length=None): -- Gitee From 8bef075c4792f68edf34793756a0912f1eded811 Mon Sep 17 00:00:00 2001 From: ash Date: Thu, 18 Apr 2024 11:29:47 +0800 Subject: [PATCH 31/44] fix resnet50 boost acc with unset env variables --- official/cv/ResNet/README.md | 58 +++++++++++++++------------ official/cv/ResNet/README_CN.md | 71 +++++++++++++++++---------------- 2 files changed, 69 insertions(+), 60 deletions(-) diff --git a/official/cv/ResNet/README.md b/official/cv/ResNet/README.md index 035dea1f0..a35b0b9e6 100644 --- a/official/cv/ResNet/README.md +++ b/official/cv/ResNet/README.md @@ -17,26 +17,31 @@ - [Usage](#usage) - [Running on Ascend](#running-on-ascend) - [Running on GPU](#running-on-gpu) - - [Running parameter server mode training](#running-parameter-server-mode-training) + - [Running parameter server mode training](#running-parameter-server-mode-training) - [Evaluation while training](#evaluation-while-training) - - [Result](#result) - - [Evaluation Process](#evaluation-process) - - [Usage](#usage-1) - - [Running on Ascend](#running-on-ascend-1) - - [Running on GPU](#running-on-gpu-1) - - [Result](#result-1) - - [Prediction Process](#prediction-process) - - [Prediction](#prediction) - - [Inference Process](#inference-process) - - [Export MindIR](#export-mindir) - - [Infer on Ascend310](#infer-on-ascend310) - - [result](#result-2) + - [Resume Process](#resume-process) + - [Usage](#usage-1) + - [Running on Ascend](#running-on-ascend-1) + - [Result](#result) + - [Evaluation Process](#evaluation-process) + - [Usage](#usage-2) + - [Running on Ascend](#running-on-ascend-2) + - [Running on GPU](#running-on-gpu-1) + - [Result](#result-1) + - [Prediction Process](#prediction-process) + - [Prediction](#prediction) + - [Inference Process](#inference-process) + - [Export MindIR](#export-mindir) + - [Infer on Ascend310](#infer-on-ascend310) + - [result](#result-2) - [Apply algorithm in MindSpore Golden Stick](#apply-algorithm-in-mindspore-golden-stick) - [Training Process](#training-process-1) - [Running on GPU](#running-on-gpu-2) - - [Evaluation Process](#evaluation-process-1) 
- - [Running on GPU](#running-on-gpu-3) - - [Result](#result-3) + - [Running on Ascend](#running-on-ascend-3) + - [Evaluation Process](#evaluation-process-1) + - [Running on GPU](#running-on-gpu-3) + - [Running on Ascend](#running-on-ascend-4) + - [Result](#result-3) - [Model Description](#model-description) - [Performance](#performance) - [Evaluation Performance](#evaluation-performance) @@ -46,17 +51,20 @@ - [ResNet50 on ImageNet2012](#resnet50-on-imagenet2012) - [ResNet34 on ImageNet2012](#resnet34-on-imagenet2012) - [ResNet101 on ImageNet2012](#resnet101-on-imagenet2012) + - [ResNet152 on ImageNet2012](#resnet152-on-imagenet2012) - [SE-ResNet50 on ImageNet2012](#se-resnet50-on-imagenet2012) - - [Inference Performance](#inference-performance) - - [ResNet18 on CIFAR-10](#resnet18-on-cifar-10-1) - - [ResNet18 on ImageNet2012](#resnet18-on-imagenet2012-1) - - [ResNet34 on ImageNet2012](#resnet34-on-imagenet2012-1) - - [ResNet50 on CIFAR-10](#resnet50-on-cifar-10-1) - - [ResNet50 on ImageNet2012](#resnet50-on-imagenet2012-1) - - [ResNet101 on ImageNet2012](#resnet101-on-imagenet2012-1) - - [SE-ResNet50 on ImageNet2012](#se-resnet50-on-imagenet2012-1) + - [Inference Performance](#inference-performance) + - [ResNet18 on CIFAR-10](#resnet18-on-cifar-10-1) + - [ResNet18 on ImageNet2012](#resnet18-on-imagenet2012-1) + - [ResNet34 on ImageNet2012](#resnet34-on-imagenet2012-1) + - [ResNet50 on CIFAR-10](#resnet50-on-cifar-10-1) + - [ResNet50 on ImageNet2012](#resnet50-on-imagenet2012-1) + - [ResNet101 on ImageNet2012](#resnet101-on-imagenet2012-1) + - [ResNet152 on ImageNet2012](#resnet152-on-imagenet2012-1) + - [SE-ResNet50 on ImageNet2012](#se-resnet50-on-imagenet2012-1) - [Description of Random Situation](#description-of-random-situation) - [ModelZoo Homepage](#modelzoo-homepage) +- [FAQ](#faq) # [ResNet Description](#contents) @@ -1466,7 +1474,7 @@ Refer to the [ModelZoo FAQ](https://gitee.com/mindspore/models#FAQ) for some com - **Q: How to use `boost` to get the best performance?** - **A**: We provide the `boost_level` in the `Model` interface, when you set it to `O1` or `O2` mode, the network will automatically speed up. The high-performance mode has been fully verified on resnet50, you can use the `resnet50_imagenet2012_Boost_config.yaml` to experience this mode. Meanwhile, in `O1` or `O2` mode, it is recommended to set the following environment variables: ` export ENV_FUSION_CLEAR=1; export DATASET_ENABLE_NUMA=True; export ENV_SINGLE_EVAL=1; export SKT_ENABLE=1;`. + **A**: We provide the `boost_level` in the `Model` interface, when you set it to `O1` or `O2` mode, the network will automatically speed up. The high-performance mode has been fully verified on resnet50, you can use the `resnet50_imagenet2012_Boost_config.yaml` to experience this mode. 
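An editorial illustration of the answer above, not taken from the repository: `boost_level` is an argument of the `Model` constructor, so enabling the mode looks roughly like the sketch below, where the `resnet50` factory and the `dataset` pipeline are assumed to come from this repo's `src` package.

```python
# Sketch only: enable MindSpore boost while assembling the training Model.
from mindspore import nn, Model

net = resnet50(class_num=1001)                       # assumed repo helper
loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
opt = nn.Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9)
model = Model(net, loss_fn=loss, optimizer=opt, metrics={"acc"},
              amp_level="O2", boost_level="O1")      # "O1"/"O2" turn boost on
model.train(90, dataset, dataset_sink_mode=True)     # dataset: assumed pipeline
```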
- **Q: How to use to preprocess imagenet2012 dataset?** diff --git a/official/cv/ResNet/README_CN.md b/official/cv/ResNet/README_CN.md index 939212592..0de3bba76 100644 --- a/official/cv/ResNet/README_CN.md +++ b/official/cv/ResNet/README_CN.md @@ -16,44 +16,45 @@ - [脚本及样例代码](#脚本及样例代码) - [脚本参数](#脚本参数) - [训练过程](#训练过程) - - [用法](#用法) - - [Ascend处理器环境运行](#ascend处理器环境运行) - - [GPU处理器环境运行](#gpu处理器环境运行) - - [运行参数服务器模式训练](#运行参数服务器模式训练) - - [训练时推理](#训练时推理) - - [迁移训练过程](#迁移训练过程) - - [迁移数据集处理](#迁移数据集处理) - - [迁移训练Ckpt获取](#迁移训练ckpt获取) - - [用法](#用法-1) - - [结果](#结果) - - [迁移训练推理过程](#迁移训练推理过程) - - [用法](#用法-2) - - [续训过程](#续训过程) - - [用法](#用法-3) - - [Ascend处理器环境运行](#ascend处理器环境运行-1) - - [结果](#结果-1) - - [评估过程](#评估过程) - - [用法](#用法-4) - - [Ascend处理器环境运行](#ascend处理器环境运行-2) - - [GPU处理器环境运行](#gpu处理器环境运行-1) - - [结果](#结果-2) - - [预测过程](#预测过程) - - [预测](#预测) - - [推理过程](#推理过程) - - [导出MindIR](#导出mindir) - - [ONNX的导出与推理](#onnx的导出与推理) - - [执行推理](#执行推理) - - [结果](#结果-3) + - [用法](#用法) + - [Ascend处理器环境运行](#ascend处理器环境运行) + - [GPU处理器环境运行](#gpu处理器环境运行) + - [运行参数服务器模式训练](#运行参数服务器模式训练) + - [训练时推理](#训练时推理) + - [迁移训练过程](#迁移训练过程) + - [迁移数据集处理](#迁移数据集处理) + - [迁移训练Ckpt获取](#迁移训练ckpt获取) + - [用法](#用法-1) + - [结果](#结果) + - [迁移训练推理过程](#迁移训练推理过程) + - [用法](#用法-2) + - [续训过程](#续训过程) + - [用法](#用法-3) + - [Ascend处理器环境运行](#ascend处理器环境运行-1) + - [结果](#结果-1) + - [评估过程](#评估过程) + - [用法](#用法-4) + - [Ascend处理器环境运行](#ascend处理器环境运行-2) + - [GPU处理器环境运行](#gpu处理器环境运行-1) + - [结果](#结果-2) + - [预测过程](#预测过程) + - [预测](#预测) + - [推理过程](#推理过程) + - [导出MindIR](#导出mindir) + - [ONNX的导出与推理](#onnx的导出与推理) + - [执行推理](#执行推理) + - [结果](#结果-3) - [应用MindSpore Golden Stick模型压缩算法](#应用mindspore-golden-stick模型压缩算法) + - [mindspore\_gs环境安装参考gloden-stick](#mindspore_gs环境安装参考gloden-stick) - [训练过程](#训练过程-1) - [GPU处理器环境运行](#gpu处理器环境运行-2) - [Ascend处理器环境运行](#ascend处理器环境运行-3) - - [评估过程](#评估过程-1) - - [GPU处理器环境运行](#gpu处理器环境运行-3) - - [Ascend处理器环境运行](#ascend处理器环境运行-4) - - [结果](#结果-4) - - [GPU结果](#gpu结果) - - [Ascend结果](#ascend结果) + - [评估过程](#评估过程-1) + - [GPU处理器环境运行](#gpu处理器环境运行-3) + - [Ascend处理器环境运行](#ascend处理器环境运行-4) + - [结果](#结果-4) + - [GPU结果](#gpu结果) + - [Ascend结果](#ascend结果) - [模型描述](#模型描述) - [性能](#性能) - [评估性能](#评估性能) @@ -1415,7 +1416,7 @@ result:{'top_1_accuracy': 0.928385416666666} prune_rate=0.45 ckpt=~/resnet50_cif - **Q: 如何使用`boost`功能获取最优的性能?** - **A**: 我们在`Model`中提供了`boost_level`的入参,当你将其设置为O1或者O2模式时,框架会自动对网络的性能进行优化。当前这个模式已在resnet50上充分验证,你可以使用`resnet50_imagenet2012_Boost_config.yaml`来体验该模式。同时,在O1或者O2模式下,建议设置以下环境变量:`export ENV_FUSION_CLEAR=1;export DATASET_ENABLE_NUMA=True;export ENV_SINGLE_EVAL=1;export SKT_ENABLE=1;`来获取更好的性能。 + **A**: 我们在`Model`中提供了`boost_level`的入参,当你将其设置为O1或者O2模式时,框架会自动对网络的性能进行优化。当前这个模式已在resnet50上充分验证,你可以使用`resnet50_imagenet2012_Boost_config.yaml`来体验该模式。 - **Q: 如何使用对ImageNet2012数据集进行预处理?** -- Gitee From 4e7c47d77fc3fee3cf28df91c32ccf2d268a0c63 Mon Sep 17 00:00:00 2001 From: ZhihaoLi Date: Fri, 26 Apr 2024 17:04:55 +0800 Subject: [PATCH 32/44] add bert large mlperf 16die --- .../bert/pretrain_config_Ascend_Boost.yaml | 43 ++-- benchmark/ascend/bert/pretrain_eval.py | 5 +- benchmark/ascend/bert/run_pretrain.py | 141 +++++++---- .../get_distribute_pretrain_cmd.py | 19 +- .../run_distributed_pretrain_ascend.sh | 11 +- .../scripts/run_standalone_pretrain_ascend.sh | 2 +- benchmark/ascend/bert/src/adam.py | 5 +- .../ascend/bert/src/bert_for_pre_training.py | 133 ++++++----- benchmark/ascend/bert/src/bert_model.py | 161 ++++++++++--- benchmark/ascend/bert/src/dataset.py | 219 ++++++++++++------ .../ascend/bert/src/model_utils/config.py | 8 +- benchmark/ascend/bert/src/utils.py | 25 
+- 12 files changed, 504 insertions(+), 268 deletions(-) diff --git a/benchmark/ascend/bert/pretrain_config_Ascend_Boost.yaml b/benchmark/ascend/bert/pretrain_config_Ascend_Boost.yaml index 684780dcc..7ccb6a211 100644 --- a/benchmark/ascend/bert/pretrain_config_Ascend_Boost.yaml +++ b/benchmark/ascend/bert/pretrain_config_Ascend_Boost.yaml @@ -13,7 +13,7 @@ enable_profiling: False # ============================================================================== description: 'run_pretrain' -distribute: 'true' +distribute: 'false' epoch_size: 40 device_id: 0 device_num: 1 @@ -21,16 +21,18 @@ enable_save_ckpt: 'false' enable_lossscale: 'true' do_shuffle: 'true' enable_data_sink: 'true' -data_sink_steps: 100 +data_sink_steps: 1 accumulation_steps: 1 allreduce_post_accumulation: 'true' -save_checkpoint_path: './' -load_checkpoint_path: '' +save_checkpoint_path: '' +load_checkpoint_path: '/home/bertlarge/Bert/msdata/new_ckpt.ckpt' save_checkpoint_steps: 10000 -train_steps: 17000 -save_checkpoint_num: 5 -data_dir: '' +train_steps: 7000 +save_checkpoint_num: 1 +data_dir: '/data4/PCL/new_train_data' schema_dir: '' +dataset_format: "tfrecord" +num_samples: None # is the option which could be set by user to specify steps when bert_network is base # ============================================================================== # pretrain related @@ -39,19 +41,17 @@ batch_size: 32 bert_network: 'large_boost' loss_scale_value: 65536 scale_factor: 2 -scale_window: 1000 +scale_window: 6300 optimizer: 'Lamb' enable_global_norm: False # pretrain_eval related -train_with_eval: 'false' -eval_data_dir: "" +train_with_eval: 'true' +eval_data_dir: "/home/bertlarge/Bert/new_eval_data" schema_file: "" eval_ckpt: "" -eval_samples: 300000 +eval_samples: 150000 # bucket list, default: [] -bucket_list: [128, 256, 384, 512] -# use packed dataset and model, which is incompatible with bucket -use_packed: False +bucket_list: [1, 512] # optimizer related AdamWeightDecay: learning_rate: 0.00003 # 3e-5 @@ -63,15 +63,15 @@ AdamWeightDecay: warmup_steps: 10000 Lamb: - learning_rate: 0.00035 # 3.5e-4 + learning_rate: 0.0007 end_learning_rate: 1.0e-9 - power: 1.2 + power: 1.8 warmup_steps: 0 weight_decay: 0.0166629 decay_filter: ['layernorm', 'bias'] - eps: 0.000001 # 1e-6, - beta1: 0.86 - beta2: 0.98 + eps: 0.000001 + beta1: 0.85 + beta2: 0.97 Momentum: learning_rate: 0.00002 # 2e-5 @@ -128,7 +128,7 @@ nezha_net_cfg: dtype: mstype.float32 compute_type: mstype.float16 # large -large_batch_size: 24 +large_batch_size: 25 large_net_cfg: seq_length: 512 vocab_size: 30522 @@ -146,7 +146,7 @@ large_net_cfg: dtype: mstype.float32 compute_type: mstype.float16 # Accelerated large network which is only supported in Ascend yet. 
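# Editor's note, not part of the patch: the hunks in this file retune the
# large_boost recipe for the 16-die MLPerf run named in the commit subject --
# per-device batch size 24 -> 25, Lamb learning rate 0.00035 -> 0.0007 with
# decay power 1.8 and beta1/beta2 0.85/0.97, loss-scale window 1000 -> 6300,
# train_steps capped at 7000, and train_with_eval switched on. The absolute
# /home and /data4 paths written into load_checkpoint_path, data_dir and
# eval_data_dir are cluster-specific and must be replaced to reproduce the run.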
-large_boost_batch_size: 24 +large_boost_batch_size: 25 large_boost_net_cfg: seq_length: 512 vocab_size: 30522 @@ -200,3 +200,4 @@ enable_lossscale: ["true", "false"] do_shuffle: ["true", "false"] enable_data_sink: ["true", "false"] allreduce_post_accumulation: ["true", "false"] +dataset_format: ["tfrecord", "mindrecord"] diff --git a/benchmark/ascend/bert/pretrain_eval.py b/benchmark/ascend/bert/pretrain_eval.py index edb10579c..83c66a4f3 100644 --- a/benchmark/ascend/bert/pretrain_eval.py +++ b/benchmark/ascend/bert/pretrain_eval.py @@ -24,7 +24,7 @@ from mindspore.train.serialization import load_checkpoint, load_param_into_net from src.utils import BertMetric from src.model_utils.config import config as cfg, bert_net_cfg from src.bert_for_pre_training import BertPretrainEval -from src.dataset import create_eval_dataset +from src.dataset import create_eval_dataset, CreateEvalDatasetInput def bert_predict(): @@ -33,7 +33,8 @@ def bert_predict(): ''' devid = int(os.getenv('DEVICE_ID')) context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=devid) - dataset = create_eval_dataset(cfg.batch_size, 1, data_dir=cfg.eval_data_dir) + inputs = CreateEvalDatasetInput(batchsize=cfg.batch_size, device_num=1, data_dir=cfg.eval_data_dir) + dataset = create_eval_dataset(inputs) net_for_pretraining = BertPretrainEval(bert_net_cfg) net_for_pretraining.set_train(False) param_dict = load_checkpoint(cfg.eval_ckpt) diff --git a/benchmark/ascend/bert/run_pretrain.py b/benchmark/ascend/bert/run_pretrain.py index 2bd22fbb6..b0fa3e1d1 100644 --- a/benchmark/ascend/bert/run_pretrain.py +++ b/benchmark/ascend/bert/run_pretrain.py @@ -1,4 +1,4 @@ -# Copyright 2020-2021 Huawei Technologies Co., Ltd +# Copyright 2020-2022 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -16,17 +16,20 @@ #################pre_train bert example on zh-wiki######################## python run_pretrain.py """ +import datetime import os import mindspore.communication.management as D from mindspore.communication.management import get_rank import mindspore.common.dtype as mstype from mindspore import context +from mindspore import ops, Tensor, nn from mindspore.train.model import Model from mindspore.context import ParallelMode from mindspore.nn.wrap.loss_scale import DynamicLossScaleUpdateCell from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor from mindspore.train.serialization import load_checkpoint, load_param_into_net from mindspore.train.train_thor import ConvertModelUtils +from mindspore.communication.management import GlobalComm from mindspore.nn.optim import Lamb, Momentum, AdamWeightDecay, thor from mindspore import log as logger from mindspore.common import set_seed @@ -35,9 +38,10 @@ from src import BertNetworkWithLoss, BertNetworkMatchBucket, \ BertTrainOneStepWithLossScaleCell, \ BertTrainAccumulationAllReduceEachWithLossScaleCell, \ BertTrainAccumulationAllReducePostWithLossScaleCell, \ + BertTrainOneStepWithLossScaleCellForAdam, \ BertPretrainEval, \ AdamWeightDecayForBert, AdamWeightDecayOp -from src.dataset import create_bert_dataset, create_eval_dataset +from src.dataset import create_bert_dataset, create_eval_dataset, CreateEvalDatasetInput from src.utils import LossCallBack, BertLearningRate, EvalCallBack, BertMetric from src.model_utils.config import config as cfg, bert_net_cfg from src.model_utils.moxing_adapter import moxing_wrapper @@ -45,11 +49,27 @@ from src.model_utils.device_adapter import get_device_id, get_device_num _current_dir = os.path.dirname(os.path.realpath(__file__)) +os.environ["GLOG_v"] = "1" +print(os.getenv("GLOG_v")) +print(os.getenv("P128")) + + +class AllreduceSync(nn.Cell): + def __init__(self): + super(AllreduceSync, self).__init__() + self.allreduce = ops.AllReduce(ops.ReduceOp.SUM, GlobalComm.WORLD_COMM_GROUP) + + def construct(self, x): + y = self.allreduce(x) + return y + + def _set_bert_all_reduce_split(): """set bert all_reduce fusion split, support num_hidden_layers is 12 and 24.""" device_target = context.get_context('device_target') enable_graph_kernel = context.get_context('enable_graph_kernel') device_num = context.get_auto_parallel_context('device_num') + print("device_num:", device_num) if bert_net_cfg.num_hidden_layers == 12: if bert_net_cfg.use_relative_positions: context.set_auto_parallel_context(all_reduce_fusion_config=[29, 58, 87, 116, 145, 174, 203, 217]) @@ -150,18 +170,6 @@ def _check_compute_type(args_opt): logger.warning(warning_message) -def _check_accumulation_steps(args_opt): - if args_opt.accumulation_steps > 1: - logger.info("accumulation steps: {}".format(args_opt.accumulation_steps)) - logger.info("global batch size: {}".format(args_opt.batch_size * cfg.accumulation_steps)) - if args_opt.enable_data_sink == "true": - args_opt.data_sink_steps *= cfg.accumulation_steps - logger.info("data sink steps: {}".format(args_opt.data_sink_steps)) - if args_opt.enable_save_ckpt == "true": - args_opt.save_checkpoint_steps *= cfg.accumulation_steps - logger.info("save checkpoint steps: {}".format(args_opt.save_checkpoint_steps)) - - def modelarts_pre_process(): '''modelarts pre process function.''' cfg.device_id = get_device_id() @@ -170,6 +178,40 @@ def modelarts_pre_process(): cfg.save_checkpoint_path = os.path.join(cfg.output_path, cfg.save_checkpoint_path) +def 
InitNetWithGrads(net_with_loss, optimizer): + '''init net with grads''' + if cfg.enable_lossscale == "true": + update_cell = DynamicLossScaleUpdateCell(loss_scale_value=cfg.loss_scale_value, + scale_factor=cfg.scale_factor, + scale_window=cfg.scale_window) + accumulation_steps = cfg.accumulation_steps + enable_global_norm = cfg.enable_global_norm + if accumulation_steps <= 1: + if cfg.optimizer == 'AdamWeightDecay' and cfg.device_target == 'GPU': + net_with_grads = BertTrainOneStepWithLossScaleCellForAdam(net_with_loss, optimizer=optimizer, + scale_update_cell=update_cell) + else: + net_with_grads = BertTrainOneStepWithLossScaleCell(net_with_loss, optimizer=optimizer, + scale_update_cell=update_cell) + else: + allreduce_post = cfg.distribute == "false" or cfg.allreduce_post_accumulation == "true" + net_with_accumulation = (BertTrainAccumulationAllReducePostWithLossScaleCell if allreduce_post else + BertTrainAccumulationAllReduceEachWithLossScaleCell) + net_with_grads = net_with_accumulation(net_with_loss, optimizer=optimizer, + scale_update_cell=update_cell, + accumulation_steps=accumulation_steps, + enable_global_norm=enable_global_norm) + else: + net_with_grads = BertTrainOneStepCell(net_with_loss, optimizer=optimizer, enable_clip_grad=True) + if cfg.optimizer == "Thor": + net_with_grads = BertTrainOneStepCell(net_with_loss, optimizer=optimizer, sens=cfg.Thor.loss_scale, + enable_clip_grad=False) + + if cfg.bucket_list: + net_with_grads = BertNetworkMatchBucket(net_with_grads, bert_net_cfg.seq_length, cfg.bucket_list) + return net_with_grads + + @moxing_wrapper(pre_process=modelarts_pre_process) def run_pretrain(): """pre-train bert_clue""" @@ -177,29 +219,39 @@ def run_pretrain(): context.set_context(reserve_class_name_in_scope=False) _set_graph_kernel_context(cfg.device_target) ckpt_save_dir = cfg.save_checkpoint_path + rank = 0 + device_num = 1 if cfg.distribute == "true": if cfg.device_target == 'Ascend': D.init() device_num = cfg.device_num - rank = cfg.device_id % device_num + rank = int(os.getenv("RANK_ID")) + else: + D.init() + device_num = D.get_group_size() + rank = D.get_rank() ckpt_save_dir = os.path.join(cfg.save_checkpoint_path, 'ckpt_' + str(get_rank())) context.reset_auto_parallel_context() context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, device_num=device_num) _set_bert_all_reduce_split() - else: - rank = 0 - device_num = 1 - _check_compute_type(cfg) - _check_accumulation_steps(cfg) + print(cfg, flush=True) + if cfg.accumulation_steps > 1: + logger.info("accumulation steps: {}".format(cfg.accumulation_steps)) + logger.info("global batch size: {}".format(cfg.batch_size * cfg.accumulation_steps)) + if cfg.enable_data_sink == "true": + cfg.data_sink_steps *= cfg.accumulation_steps + logger.info("data sink steps: {}".format(cfg.data_sink_steps)) + if cfg.enable_save_ckpt == "true": + cfg.save_checkpoint_steps *= cfg.accumulation_steps + logger.info("save checkpoint steps: {}".format(cfg.save_checkpoint_steps)) ds = create_bert_dataset(device_num, rank, cfg.do_shuffle, cfg.data_dir, cfg.schema_dir, cfg.batch_size, - cfg.bucket_list, cfg.use_packed) + cfg.bucket_list, cfg.dataset_format, cfg.num_samples) net_with_loss = BertNetworkWithLoss(bert_net_cfg, True) - - new_repeat_count = cfg.epoch_size * ds.get_dataset_size() // cfg.data_sink_steps + new_repeat_count = cfg.epoch_size * ds.get_dataset_size() // cfg.data_sink_steps # 100w -> 100w/100 if cfg.train_steps > 0: train_steps = cfg.train_steps * cfg.accumulation_steps 
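The repeat-count computation around this hunk drives data-sink mode: each sink call covers `data_sink_steps` optimizer steps, and `train_steps` (scaled by accumulation) caps the total. A self-contained sketch of that arithmetic with illustrative numbers, not values from a real run (and note the later hunk in this patch hard-codes `new_repeat_count = 160` anyway):

```python
# Minimal sketch of the sink-count arithmetic in run_pretrain.py. With
# data_sink_steps=1 every sink iteration is one optimizer step, so
# train_steps=7000 yields 7000 sink iterations.
def sink_repeat_count(epoch_size, dataset_size, data_sink_steps,
                      train_steps=0, accumulation_steps=1):
    repeat = epoch_size * dataset_size // data_sink_steps
    if train_steps > 0:
        repeat = min(repeat, train_steps * accumulation_steps // data_sink_steps)
    return repeat

# illustrative: 40 epochs over a 175-step dataset, capped at 7000 steps
assert sink_repeat_count(40, 175, 1, train_steps=7000) == 7000
```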
new_repeat_count = min(new_repeat_count, train_steps // cfg.data_sink_steps) @@ -220,43 +272,31 @@ def run_pretrain(): param_dict = load_checkpoint(cfg.load_checkpoint_path) load_param_into_net(net_with_loss, param_dict) - if cfg.enable_lossscale == "true": - update_cell = DynamicLossScaleUpdateCell(loss_scale_value=cfg.loss_scale_value, - scale_factor=cfg.scale_factor, - scale_window=cfg.scale_window) - accumulation_steps = cfg.accumulation_steps - enable_global_norm = cfg.enable_global_norm - if accumulation_steps <= 1: - net_with_grads = BertTrainOneStepWithLossScaleCell(net_with_loss, optimizer=optimizer, - scale_update_cell=update_cell) - else: - allreduce_post = cfg.distribute == "false" or cfg.allreduce_post_accumulation == "true" - net_with_accumulation = (BertTrainAccumulationAllReducePostWithLossScaleCell if allreduce_post else - BertTrainAccumulationAllReduceEachWithLossScaleCell) - net_with_grads = net_with_accumulation(net_with_loss, optimizer=optimizer, - scale_update_cell=update_cell, - accumulation_steps=accumulation_steps, - enable_global_norm=enable_global_norm) - else: - net_with_grads = BertTrainOneStepCell(net_with_loss, optimizer=optimizer, enable_clip_grad=True) - if cfg.optimizer == "Thor": - net_with_grads = BertTrainOneStepCell(net_with_loss, optimizer=optimizer, sens=cfg.Thor.loss_scale, - enable_clip_grad=False) - - if cfg.bucket_list: - net_with_grads = BertNetworkMatchBucket(net_with_grads, bert_net_cfg.seq_length, cfg.bucket_list) + net_with_grads = InitNetWithGrads(net_with_loss, optimizer) model = Model(net_with_grads) if cfg.train_with_eval == 'true': net_eval = BertPretrainEval(bert_net_cfg, network=net_with_loss.bert) - eval_ds = create_eval_dataset(cfg.batch_size, device_num, rank, cfg.eval_data_dir, - cfg.schema_dir, cfg.use_packed) + inputs = CreateEvalDatasetInput(batchsize=cfg.batch_size, device_num=device_num, rank=rank, + data_dir=cfg.eval_data_dir, schema_dir=cfg.schema_dir, + dataset_format=cfg.dataset_format, num_samples=cfg.num_samples) + eval_ds = create_eval_dataset(inputs) model = Model(net_with_grads, eval_network=net_eval, metrics={'bert_acc': BertMetric(cfg.batch_size)}) eval_callback = EvalCallBack(model, eval_ds, device_num * cfg.batch_size, cfg.eval_samples) callback.append(eval_callback) model = ConvertModelUtils().convert_to_thor_model(model, network=net_with_grads, optimizer=optimizer) + new_repeat_count = 160 + model.build(ds, eval_ds, cfg.data_sink_steps, new_repeat_count) + sync = AllreduceSync() + import numpy as np + sync(Tensor(np.ones(1), mstype.float32)) + print('using build>>>>>>>>>>>>>>>>>>>>>>>>>>') + + model.eval(eval_ds, dataset_sink_mode=(cfg.enable_data_sink == "true")) + print(f'new_repeat_count: {new_repeat_count}') + print(f'train start: {datetime.datetime.utcnow()}') model.train(new_repeat_count, ds, callbacks=callback, dataset_sink_mode=(cfg.enable_data_sink == "true"), sink_size=cfg.data_sink_steps) @@ -264,3 +304,4 @@ def run_pretrain(): if __name__ == '__main__': set_seed(0) run_pretrain() + print(">>>>>>>>>>>>>>>>>end") diff --git a/benchmark/ascend/bert/scripts/ascend_distributed_launcher/get_distribute_pretrain_cmd.py b/benchmark/ascend/bert/scripts/ascend_distributed_launcher/get_distribute_pretrain_cmd.py index c6c1945f7..db8554950 100644 --- a/benchmark/ascend/bert/scripts/ascend_distributed_launcher/get_distribute_pretrain_cmd.py +++ b/benchmark/ascend/bert/scripts/ascend_distributed_launcher/get_distribute_pretrain_cmd.py @@ -15,6 +15,7 @@ """distribute pretrain script""" import os import json +import 
configparser import multiprocessing from argparse import ArgumentParser @@ -35,12 +36,12 @@ def parse_args(): parser.add_argument("--run_script_dir", type=str, default="", help="Run script path, it is better to use absolute path") + parser.add_argument("--hyper_parameter_config_dir", type=str, default="", + help="Hyper Parameter config path, it is better to use absolute path") parser.add_argument("--data_dir", type=str, default="", help="Data path, it is better to use absolute path") parser.add_argument("--hccl_config_dir", type=str, default="", help="Hccl config path, it is better to use absolute path") - parser.add_argument("--config", type=str, default="", - help="Path to the config yaml file, it is better to use absolute path") parser.add_argument("--cmd_file", type=str, default="distributed_cmd.sh", help="Path of the generated cmd file.") parser.add_argument("--hccl_time_out", type=int, default=120, @@ -90,7 +91,7 @@ def make_dirs(cmd, logic_id): return cmd -def print_info(rank_id, device_id, logic_id, cmdopt, data_dir, cur_dir): +def print_info(rank_id, device_id, logic_id, cmdopt, epoch_size, data_dir, cur_dir): """ Print some information about scripts. """ @@ -99,6 +100,7 @@ def print_info(rank_id, device_id, logic_id, cmdopt, data_dir, cur_dir): print("device_id:", device_id) print("logic_id", logic_id) print("core_nums:", cmdopt) + print("epoch_size:", epoch_size) print("data_dir:", data_dir) print("log_file_dir: " + cur_dir + "/LOG" + str(logic_id) + "/pretraining_log.txt") @@ -113,6 +115,9 @@ def distribute_pretrain(): run_script = args.run_script_dir data_dir = args.data_dir + cf = configparser.ConfigParser() + cf.read(args.hyper_parameter_config_dir) + cfg = dict(cf.items("config")) print("hccl_config_dir:", args.hccl_config_dir) print("hccl_time_out:", args.hccl_time_out) @@ -170,11 +175,15 @@ def distribute_pretrain(): cmd = make_dirs(cmd, logic_id) print_info(rank_id=rank_id, device_id=device_id, logic_id=logic_id, cmdopt=cmdopt, cur_dir=cur_dir, - data_dir=data_dir) + epoch_size=str(cfg['epoch_size']), data_dir=data_dir) run_cmd = 'taskset -c ' + cmdopt + ' nohup python ' + run_script + " " + opt = " ".join(["--" + key + "=" + str(cfg[key]) for key in cfg.keys()]) + if ('device_id' in opt) or ('device_num' in opt) or ('data_dir' in opt): + raise ValueError("hyper_parameter_config.ini can not setting 'device_id'," + " 'device_num' or 'data_dir'! ") + run_cmd += opt run_cmd += " --data_dir=" + data_dir - run_cmd += " --config_path=" + args.config run_cmd += ' --device_id=' + str(logic_id) + ' --device_num=' \ + str(rank_size) + ' >./pretraining_log.txt 2>&1 &' diff --git a/benchmark/ascend/bert/scripts/run_distributed_pretrain_ascend.sh b/benchmark/ascend/bert/scripts/run_distributed_pretrain_ascend.sh index fc8c719c3..937928b45 100644 --- a/benchmark/ascend/bert/scripts/run_distributed_pretrain_ascend.sh +++ b/benchmark/ascend/bert/scripts/run_distributed_pretrain_ascend.sh @@ -17,16 +17,21 @@ echo "==============================================================================================================" echo "Please run the script as: " echo "bash scripts/run_distributed_pretrain_ascend.sh DATA_DIR RANK_TABLE_FILE" -echo "for example: bash scripts/run_distributed_pretrain_ascend.sh /path/dataset /path/hccl.json /path/config.yaml" +echo "for example: bash scripts/run_distributed_pretrain_ascend.sh /path/dataset /path/hccl.json" echo "It is better to use absolute path." 
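The launcher change above reads `hyper_parameter_config.ini` with `configparser` and flattens the `[config]` section into `--key=value` flags appended to the per-device command. A self-contained sketch of that flattening; the ini keys below are hypothetical examples, not the shipped file:

```python
# Sketch of the configparser flattening in get_distribute_pretrain_cmd.py:
# every key in [config] becomes a --key=value CLI flag.
import configparser

ini_text = """
[config]
epoch_size = 40
enable_save_ckpt = false
"""

cf = configparser.ConfigParser()
cf.read_string(ini_text)            # the real script uses cf.read(path)
cfg = dict(cf.items("config"))
opt = " ".join("--{}={}".format(k, v) for k, v in cfg.items())
print(opt)                          # --epoch_size=40 --enable_save_ckpt=false
```

This is also why the script rejects `device_id`, `device_num`, and `data_dir` in the ini: those flags are appended separately per device and would otherwise be passed twice.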
+echo "For hyper parameter, please note that you should customize the scripts: + '{CUR_DIR}/scripts/ascend_distributed_launcher/hyper_parameter_config.ini' " echo "==============================================================================================================" + +export MS_ASCEND_CHECK_OVERFLOW_MODE="INFNAN_MODE" + CUR_DIR=`pwd` -ulimit -s 102400 +ulimit -s 302400 python ${CUR_DIR}/scripts/ascend_distributed_launcher/get_distribute_pretrain_cmd.py \ --run_script_dir=${CUR_DIR}/run_pretrain.py \ + --hyper_parameter_config_dir=${CUR_DIR}/scripts/ascend_distributed_launcher/hyper_parameter_config.ini \ --data_dir=$1 \ --hccl_config_dir=$2 \ - --config=$3 \ --hccl_time_out=600 \ --hccn_config_file='/etc/hccn.conf' \ --cmd_file=distributed_cmd.sh diff --git a/benchmark/ascend/bert/scripts/run_standalone_pretrain_ascend.sh b/benchmark/ascend/bert/scripts/run_standalone_pretrain_ascend.sh index 8d0b9b961..85ef8b2d8 100644 --- a/benchmark/ascend/bert/scripts/run_standalone_pretrain_ascend.sh +++ b/benchmark/ascend/bert/scripts/run_standalone_pretrain_ascend.sh @@ -26,7 +26,7 @@ DATA_DIR=$3 SCHEMA_DIR=$4 ulimit -s 102400 -mkdir -p ms_log +mkdir -p ms_log PROJECT_DIR=$(cd "$(dirname "$0")" || exit; pwd) CUR_DIR=`pwd` export GLOG_log_dir=${CUR_DIR}/ms_log diff --git a/benchmark/ascend/bert/src/adam.py b/benchmark/ascend/bert/src/adam.py index 522c4bf1e..7e47adddd 100644 --- a/benchmark/ascend/bert/src/adam.py +++ b/benchmark/ascend/bert/src/adam.py @@ -191,11 +191,10 @@ def _run_off_load_opt(opt, beta1_power, beta2_power, beta1, beta2, eps, lr, grad def _check_param_value(beta1, beta2, eps, prim_name): """Check the type of inputs.""" - assert isinstance(beta1, float) and 0 <= beta1 <= 1.0, "beta1 should be float and between 0 and 1" - assert isinstance(beta2, float) and 0 <= beta2 <= 1.0, "beta2 should be float and between 0 and 1" + assert isinstance(beta1, float) and 0 <= beta1 <= 1, "beta1 should between 0 and 1" + assert isinstance(beta2, float) and 0 <= beta2 <= 1, "beta2 should between 0 and 1" assert isinstance(eps, float) and eps > 0, "eps should be bigger than 0" - class AdamWeightDecayForBert(Optimizer): """ Implements the Adam algorithm to fix the weight decay. 
diff --git a/benchmark/ascend/bert/src/bert_for_pre_training.py b/benchmark/ascend/bert/src/bert_for_pre_training.py index e098f8683..bb7df924b 100644 --- a/benchmark/ascend/bert/src/bert_for_pre_training.py +++ b/benchmark/ascend/bert/src/bert_for_pre_training.py @@ -14,20 +14,20 @@ # ============================================================================ """Bert for pretraining.""" import numpy as np - +import mindspore as ms import mindspore.nn as nn -from mindspore import amp, ops from mindspore.common.initializer import initializer, TruncatedNormal from mindspore.ops import operations as P from mindspore.ops import functional as F from mindspore.ops import composite as C from mindspore.common.tensor import Tensor from mindspore.common.parameter import Parameter +from mindspore.common.api import jit from mindspore.common import dtype as mstype from mindspore.nn.wrap.grad_reducer import DistributedGradReducer from mindspore.context import ParallelMode from mindspore.communication.management import get_group_size -from mindspore import context +from mindspore import context, amp, ops from .bert_model import BertModel GRADIENT_CLIP_TYPE = 1 @@ -165,9 +165,9 @@ class BertPreTraining(nn.Cell): self.cls2 = GetNextSentenceOutput(config) def construct(self, input_ids, input_mask, token_type_id, - masked_lm_positions, next_sentence_starts=None): + masked_lm_positions): sequence_output, pooled_output, embedding_table = \ - self.bert(input_ids, token_type_id, input_mask, next_sentence_starts) + self.bert(input_ids, token_type_id, input_mask) prediction_scores = self.cls1(sequence_output, embedding_table, masked_lm_positions) @@ -188,7 +188,6 @@ class BertPretrainingLoss(nn.Cell): def __init__(self, config): super(BertPretrainingLoss, self).__init__() - self.use_packed = config.use_packed self.vocab_size = config.vocab_size self.onehot = P.OneHot() self.on_value = Tensor(1.0, mstype.float32) @@ -201,12 +200,10 @@ class BertPretrainingLoss(nn.Cell): self.cast = P.Cast() def construct(self, prediction_scores, seq_relationship_score, masked_lm_ids, - masked_lm_weights, next_sentence_labels, next_sentence_weights=None): + masked_lm_weights, next_sentence_labels): """Defines the computation performed.""" label_ids = self.reshape(masked_lm_ids, self.last_idx) label_weights = self.cast(self.reshape(masked_lm_weights, self.last_idx), mstype.float32) - if self.use_packed: - label_weights = F.minimum(label_weights, 1.0) one_hot_labels = self.onehot(label_ids, self.vocab_size, self.on_value, self.off_value) per_example_loss = self.neg(self.reduce_sum(prediction_scores * one_hot_labels, self.last_idx)) @@ -219,16 +216,10 @@ class BertPretrainingLoss(nn.Cell): one_hot_labels = self.onehot(labels, 2, self.on_value, self.off_value) per_example_loss = self.neg(self.reduce_sum( one_hot_labels * seq_relationship_score, self.last_idx)) - if self.use_packed: - weights = self.cast(self.reshape(next_sentence_weights, self.last_idx), mstype.float32) - numerator = self.reduce_sum(weights * per_example_loss, ()) - denominator = F.maximum(self.reduce_sum(weights, ()), 1e-5) - next_sentence_loss = numerator / denominator - else: - next_sentence_loss = self.reduce_mean(per_example_loss, self.last_idx) + next_sentence_loss = self.reduce_mean(per_example_loss, self.last_idx) # total_loss - total_loss = masked_lm_loss + next_sentence_loss + total_loss = masked_lm_loss + next_sentence_loss #* tmp return total_loss @@ -251,6 +242,7 @@ class BertNetworkWithLoss(nn.Cell): self.bert = BertPreTraining(config, is_training, 
use_one_hot_embeddings) self.loss = BertPretrainingLoss(config) self.cast = P.Cast() + self.print = ops.Print() def construct(self, input_ids, @@ -259,14 +251,13 @@ class BertNetworkWithLoss(nn.Cell): next_sentence_labels, masked_lm_positions, masked_lm_ids, - masked_lm_weights, - next_sentence_starts=None, - next_sentence_weights=None): + masked_lm_weights): """Get pre-training loss""" prediction_scores, seq_relationship_score = \ - self.bert(input_ids, input_mask, token_type_id, masked_lm_positions, next_sentence_starts) + self.bert(input_ids, input_mask, token_type_id, masked_lm_positions) total_loss = self.loss(prediction_scores, seq_relationship_score, - masked_lm_ids, masked_lm_weights, next_sentence_labels, next_sentence_weights) + masked_lm_ids, masked_lm_weights, next_sentence_labels) + return self.cast(total_loss, mstype.float32) @@ -289,10 +280,16 @@ class BertTrainOneStepCell(nn.TrainOneStepCell): self.cast = P.Cast() self.hyper_map = C.HyperMap() self.enable_clip_grad = enable_clip_grad + self.enable_tuple_broaden = True def set_sens(self, value): self.sens = value + @jit + def clip_grads(self, grads): + grads = self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads) + return grads + def construct(self, input_ids, input_mask, @@ -321,7 +318,7 @@ class BertTrainOneStepCell(nn.TrainOneStepCell): self.cast(F.tuple_to_array((self.sens,)), mstype.float32)) if self.enable_clip_grad: - grads = self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads) + grads = self.clip_grads(grads) grads = self.grad_reducer(grads) self.optimizer(grads) return loss @@ -370,6 +367,12 @@ class BertTrainOneStepWithLossScaleCell(nn.TrainOneStepWithLossScaleCell): self.loss_scaling_manager = scale_update_cell if scale_update_cell: self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32)) + self.enable_tuple_broaden = True + + @jit + def clip_grads(self, grads): + grads = self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads) + return grads def construct(self, input_ids, @@ -379,25 +382,27 @@ class BertTrainOneStepWithLossScaleCell(nn.TrainOneStepWithLossScaleCell): masked_lm_positions, masked_lm_ids, masked_lm_weights, - next_sentence_starts=None, - next_sentence_weights=None, sens=None): """Defines the computation performed.""" weights = self.weights + + if sens is None: + scaling_sens = self.loss_scale + else: + scaling_sens = sens + flag = ops.cast(ops.sum(masked_lm_weights), ms.bool_) + flag_float = ops.cast(flag, ms.float32) + if not flag: + return flag_float.unsqueeze(-1), flag, scaling_sens.value() loss = self.network(input_ids, input_mask, token_type_id, next_sentence_labels, masked_lm_positions, masked_lm_ids, - masked_lm_weights, - next_sentence_starts, - next_sentence_weights) - if sens is None: - scaling_sens = self.loss_scale - else: - scaling_sens = sens + masked_lm_weights) status, scaling_sens = self.start_overflow_check(loss, scaling_sens) + grads = self.grad(self.network, weights)(input_ids, input_mask, token_type_id, @@ -405,15 +410,12 @@ class BertTrainOneStepWithLossScaleCell(nn.TrainOneStepWithLossScaleCell): masked_lm_positions, masked_lm_ids, masked_lm_weights, - next_sentence_starts, - next_sentence_weights, self.cast(scaling_sens, mstype.float32)) # apply grad reducer on grads grads = self.grad_reducer(grads) degree_sens = self.cast(scaling_sens * self.degree, mstype.float32) grads = self.hyper_map(F.partial(grad_scale, degree_sens), grads) - grads = 
self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads) cond = self.get_overflow_status(status, grads) overflow = cond @@ -421,7 +423,7 @@ class BertTrainOneStepWithLossScaleCell(nn.TrainOneStepWithLossScaleCell): overflow = self.loss_scaling_manager(self.loss_scale, cond) if not overflow: self.optimizer(grads) - return (loss, cond, scaling_sens.value()) + return loss, cond, scaling_sens.value() class BertTrainOneStepWithLossScaleCellForAdam(nn.TrainOneStepWithLossScaleCell): @@ -449,6 +451,12 @@ class BertTrainOneStepWithLossScaleCellForAdam(nn.TrainOneStepWithLossScaleCell) self.loss_scaling_manager = scale_update_cell if scale_update_cell: self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32)) + self.enable_tuple_broaden = True + + @jit + def clip_grads(self, grads): + grads = self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads) + return grads def construct(self, input_ids, @@ -486,13 +494,13 @@ class BertTrainOneStepWithLossScaleCellForAdam(nn.TrainOneStepWithLossScaleCell) # apply grad reducer on grads grads = self.grad_reducer(grads) grads = self.hyper_map(F.partial(grad_scale, scaling_sens * self.degree), grads) - grads = self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads) + grads = self.clip_grads(grads) cond = self.get_overflow_status(status, grads) overflow = cond if self.loss_scaling_manager is not None: overflow = self.loss_scaling_manager(scaling_sens, cond) self.optimizer(grads, overflow) - return (loss, cond, scaling_sens.value()) + return loss, cond, scaling_sens.value() cast = P.Cast() add_grads = C.MultitypeFuncGraph("add_grads") @@ -612,7 +620,7 @@ class BertTrainAccumulationAllReducePostWithLossScaleCell(nn.Cell): scaling_sens = self.loss_scale else: scaling_sens = sens - + # alloc status and clear should be right before gradoperation # update accumulation parameters is_accu_step = self.not_equal(self.local_step, self.accumulation_steps) self.local_step = self.select(is_accu_step, self.local_step + self.one, self.one) @@ -634,8 +642,6 @@ class BertTrainAccumulationAllReducePostWithLossScaleCell(nn.Cell): mean_loss = F.depend(mean_loss, accu_succ) overflow = ops.logical_not(amp.all_finite(grads)) - - overflow = self.logical_or(self.not_equal(self.accu_overflow, self.zero), overflow) accu_overflow = self.select(overflow, self.one, self.zero) self.accu_overflow = self.select(is_accu_step, accu_overflow, self.zero) @@ -659,7 +665,7 @@ class BertTrainAccumulationAllReducePostWithLossScaleCell(nn.Cell): if not overflow: self.optimizer(grads) - return (mean_loss, overflow, scaling_sens.value()) + return mean_loss, overflow, scaling_sens.value() class BertTrainAccumulationAllReduceEachWithLossScaleCell(nn.Cell): @@ -755,9 +761,6 @@ class BertTrainAccumulationAllReduceEachWithLossScaleCell(nn.Cell): mean_loss = self.accu_loss / self.local_step is_accu_step = self.not_equal(self.local_step, self.accumulation_steps) - # alloc status and clear should be right before gradoperation - init = self.alloc_status() - self.clear_before_grad(init) grads = self.grad(self.network, weights)(input_ids, input_mask, token_type_id, @@ -777,6 +780,7 @@ class BertTrainAccumulationAllReduceEachWithLossScaleCell(nn.Cell): overflow = ops.logical_not(amp.all_finite(grads)) if self.reducer_flag: overflow = self.allreduce(overflow.to(mstype.float32)) >= self.base + overflow = self.logical_or(self.not_equal(self.accu_overflow, self.zero), overflow) accu_overflow = 
self.select(overflow, self.one, self.zero) self.accu_overflow = self.select(is_accu_step, accu_overflow, self.zero) overflow = self.reshape(overflow, (())) @@ -814,7 +818,19 @@ class BertNetworkMatchBucket(nn.Cell): self.network = network if not bucket_list or not isinstance(bucket_list, list): bucket_list = [seq_length] - self.bucket_list = [bucket for bucket in bucket_list if bucket <= seq_length] + self.bucket_list = [bucket for bucket in bucket_list if bucket < seq_length] + + if network.reducer_flag: + reuse_attr = 'reuse_communication_node' + if not network.grad_reducer.split_fusion: + hccl_op = network.grad_reducer.allreduce + network.grad_reducer.allreduce = hccl_op.add_prim_attr(reuse_attr, getattr(hccl_op, 'fusion')) + else: + new_op_list = [] + for hccl_op in network.grad_reducer.op_list: + new_op = hccl_op.add_prim_attr(reuse_attr, getattr(hccl_op, 'fusion')) + new_op_list.append(new_op) + network.grad_reducer.op_list = new_op_list def construct(self, input_ids, @@ -826,20 +842,6 @@ class BertNetworkMatchBucket(nn.Cell): masked_lm_weights, sentence_flag): """Switch network according to sentence length.""" - for bucket in self.bucket_list: - if sentence_flag == bucket: - input_ids = input_ids[:, :bucket] - input_mask = input_mask[:, :bucket] - token_type_id = token_type_id[:, :bucket] - loss = self.network(input_ids, - input_mask, - token_type_id, - next_sentence_labels, - masked_lm_positions, - masked_lm_ids, - masked_lm_weights) - return loss - loss = self.network(input_ids, input_mask, token_type_id, @@ -860,7 +862,6 @@ class BertPretrainEval(nn.Cell): self.network = BertPreTraining(config, False, False) else: self.network = network - self.use_packed = config.use_packed self.argmax = P.Argmax(axis=-1, output_type=mstype.int32) self.equal = P.Equal() self.sum = P.ReduceSum() @@ -880,18 +881,14 @@ class BertPretrainEval(nn.Cell): next_sentence_labels, masked_lm_positions, masked_lm_ids, - masked_lm_weights, - next_sentence_starts=None, - next_sentence_weights=None): + masked_lm_weights): """Calculate prediction scores""" bs, _ = self.shape(input_ids) - mlm, _ = self.network(input_ids, input_mask, token_type_id, masked_lm_positions, next_sentence_starts) - index = self.argmax(mlm) + mlm, _ = self.network(input_ids, input_mask, token_type_id, masked_lm_positions) + _, index = mlm.argmax_with_value(axis=-1) index = self.reshape(index, (bs, -1)) eval_acc = self.equal(index, masked_lm_ids) eval_acc = self.cast(eval_acc, mstype.float32) - if self.use_packed: - masked_lm_weights = F.minimum(masked_lm_weights, 1.0) real_acc = eval_acc * masked_lm_weights acc = self.sum(real_acc) total = self.sum(masked_lm_weights) diff --git a/benchmark/ascend/bert/src/bert_model.py b/benchmark/ascend/bert/src/bert_model.py index 0ce562f1b..23e17f6d6 100644 --- a/benchmark/ascend/bert/src/bert_model.py +++ b/benchmark/ascend/bert/src/bert_model.py @@ -19,13 +19,14 @@ import copy import numpy as np import mindspore.common.dtype as mstype import mindspore.nn as nn +import mindspore.ops as ops import mindspore.ops.functional as F from mindspore.common.initializer import TruncatedNormal, initializer -import mindspore.ops as ops from mindspore.ops import operations as P from mindspore.ops import composite as C from mindspore.common.tensor import Tensor from mindspore.common.parameter import Parameter +from mindspore.ops.operations.nn_ops import FlashAttentionScore class BertConfig: @@ -70,8 +71,7 @@ class BertConfig: initializer_range=0.02, use_relative_positions=False, dtype=mstype.float32, - 
compute_type=mstype.float32, - use_packed=False): + compute_type=mstype.float32): self.seq_length = seq_length self.vocab_size = vocab_size self.hidden_size = hidden_size @@ -87,7 +87,6 @@ class BertConfig: self.use_relative_positions = use_relative_positions self.dtype = dtype self.compute_type = compute_type - self.use_packed = use_packed class EmbeddingLookup(nn.Cell): @@ -357,6 +356,122 @@ class SaturateCast(nn.Cell): return self.cast(out, self.dst_type) +class BertFlashAttention(nn.Cell): + """ + Apply multi-headed attention from "from_tensor" to "to_tensor". + + Args: + from_tensor_width (int): Size of last dim of from_tensor. + to_tensor_width (int): Size of last dim of to_tensor. + num_attention_heads (int): Number of attention heads. Default: 1. + size_per_head (int): Size of each attention head. Default: 512. + query_act (str): Activation function for the query transform. Default: None. + key_act (str): Activation function for the key transform. Default: None. + value_act (str): Activation function for the value transform. Default: None. + has_attention_mask (bool): Specifies whether to use attention mask. Default: False. + attention_probs_dropout_prob (float): The dropout probability for + BertAttention. Default: 0.0. + use_one_hot_embeddings (bool): Specifies whether to use one hot encoding form. Default: False. + initializer_range (float): Initialization value of TruncatedNormal. Default: 0.02. + use_relative_positions (bool): Specifies whether to use relative positions. Default: False. + compute_type (:class:`mindspore.dtype`): Compute type in BertAttention. Default: mstype.float32. + """ + def __init__(self, + from_tensor_width, + to_tensor_width, + num_attention_heads=1, + size_per_head=512, + query_act=None, + key_act=None, + value_act=None, + has_attention_mask=False, + attention_probs_dropout_prob=0.0, + use_one_hot_embeddings=False, + initializer_range=0.02, + use_relative_positions=False, + compute_type=mstype.float32): + + super(BertFlashAttention, self).__init__() + self.num_attention_heads = num_attention_heads + self.size_per_head = size_per_head + self.has_attention_mask = has_attention_mask + self.use_relative_positions = use_relative_positions + + self.scores_mul = 1.0 / math.sqrt(float(self.size_per_head)) + self.reshape = P.Reshape() + self.shape_from_2d = (-1, from_tensor_width) + self.shape_to_2d = (-1, to_tensor_width) + weight = TruncatedNormal(initializer_range) + units = num_attention_heads * size_per_head + self.query_layer = nn.Dense(from_tensor_width, + units, + activation=query_act, + weight_init=weight).to_float(compute_type) + self.key_layer = nn.Dense(to_tensor_width, + units, + activation=key_act, + weight_init=weight).to_float(compute_type) + self.value_layer = nn.Dense(to_tensor_width, + units, + activation=value_act, + weight_init=weight).to_float(compute_type) + + self.matmul_trans_b = P.BatchMatMul(transpose_b=True) + self.multiply = P.Mul() + self.transpose = P.Transpose() + self.trans_shape = (0, 2, 1, 3) + self.trans_shape_relative = (2, 0, 1, 3) + self.trans_shape_position = (1, 2, 0, 3) + self.multiply_data = -10000.0 + self.matmul = P.BatchMatMul() + + self.softmax = nn.Softmax() + self.dropout = nn.Dropout(p=attention_probs_dropout_prob) + + self.shape_return = (-1, num_attention_heads * size_per_head) + + self.cast_compute_type = SaturateCast(dst_type=compute_type) + self.flash_attention = FlashAttentionScore(head_num=num_attention_heads, + input_layout="BNSD", + sparse_mode=0, # lxy + scale_value=1 / math.sqrt(size_per_head)) + if 
self.use_relative_positions: + self._generate_relative_positions_embeddings = \ + RelaPosEmbeddingsGenerator(depth=size_per_head, + max_relative_position=16, + initializer_range=initializer_range, + use_one_hot_embeddings=use_one_hot_embeddings) + + def construct(self, from_tensor, to_tensor, attention_mask): + """reshape 2d/3d input tensors to 2d""" + shape_from = F.shape(attention_mask)[2] # seq length + from_tensor = F.depend(from_tensor, shape_from) + from_tensor_2d = self.reshape(from_tensor, self.shape_from_2d) + to_tensor_2d = self.reshape(to_tensor, self.shape_to_2d) + query_out = self.query_layer(from_tensor_2d) + key_out = self.key_layer(to_tensor_2d) + value_out = self.value_layer(to_tensor_2d) + #b, s, n, d + query_layer = self.reshape(query_out, (-1, shape_from, self.num_attention_heads, self.size_per_head)) + query_layer = self.transpose(query_layer, self.trans_shape) + key_layer = self.reshape(key_out, (-1, shape_from, self.num_attention_heads, self.size_per_head)) + key_layer = self.transpose(key_layer, self.trans_shape) + # 25, 1, 512 + attention_mask = None + # 25, 1, 512, 512 + value_layer = self.reshape(value_out, (-1, shape_from, self.num_attention_heads, self.size_per_head)) + value_layer = self.transpose(value_layer, self.trans_shape) + _, _, _, context_layer = self.flash_attention(query_layer.astype(mstype.float16), + key_layer.astype(mstype.float16), + value_layer.astype(mstype.float16), + None, None, None, attention_mask, None) + + context_layer = self.transpose(context_layer, self.trans_shape) + context_layer = self.reshape(context_layer, self.shape_return) + + return context_layer + + class BertAttention(nn.Cell): """ Apply multi-headed attention from "from_tensor" to "to_tensor". @@ -492,7 +607,9 @@ class BertAttention(nn.Cell): attention_scores = self.multiply(self.scores_mul, attention_scores) if self.has_attention_mask: + # 25, 1, 512 attention_mask = self.expand_dims(attention_mask, 1) + # 25, 1, 1, 512 multiply_out = self.sub(self.cast(F.tuple_to_array((1.0,)), self.get_dtype(attention_scores)), self.cast(attention_mask, self.get_dtype(attention_scores))) @@ -569,8 +686,7 @@ class BertSelfAttention(nn.Cell): "of attention heads (%d)" % (hidden_size, num_attention_heads)) self.size_per_head = int(hidden_size / num_attention_heads) - - self.attention = BertAttention( + self.attention = BertFlashAttention( from_tensor_width=hidden_size, to_tensor_width=hidden_size, num_attention_heads=num_attention_heads, @@ -738,24 +854,14 @@ class CreateAttentionMaskFromInputMask(nn.Cell): """ def __init__(self, config): super(CreateAttentionMaskFromInputMask, self).__init__() - self.use_packed = config.use_packed self.input_mask = None self.cast = P.Cast() self.reshape = P.Reshape() - self.tile = P.Tile() - self.transpose = P.Transpose() def construct(self, input_mask): seq_length = F.shape(input_mask)[1] - if self.use_packed: - mask_tile = self.reshape(self.tile(input_mask, (1, seq_length)), (-1, seq_length)) - reshape_mask = F.broadcast_to(self.reshape(input_mask, (1, -1)), (seq_length, -1)) - transpose_mask = self.transpose(reshape_mask, (1, 0)) - attention_mask = self.reshape(self.cast(mask_tile == transpose_mask, mstype.float32), - (-1, seq_length, seq_length)) - else: - attention_mask = self.cast(self.reshape(input_mask, (-1, 1, seq_length)), mstype.float32) + attention_mask = self.cast(self.reshape(input_mask, (-1, 1, seq_length)), mstype.float32) return attention_mask @@ -781,7 +887,6 @@ class BertModel(nn.Cell): self.hidden_size = config.hidden_size 
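`BertFlashAttention.construct` above feeds `FlashAttentionScore` with `input_layout="BNSD"`, so the dense projections must be reshaped from `(B*S, N*D)` into `(batch, heads, seq, head_dim)` before the fused kernel runs (with the additive mask handled inside the kernel, hence `attention_mask = None`). A NumPy shape walk-through of that layout transform, assuming the standard BERT-large geometry of 16 heads of size 64:

```python
# Shape walk-through (NumPy stand-in, not runnable MindSpore) of the BNSD
# preparation in BertFlashAttention: batch 25 and seq 512 match the
# large_boost config in this patch.
import numpy as np

B, S, N, D = 25, 512, 16, 64          # batch, seq_length, heads, size_per_head
query_out = np.zeros((B * S, N * D))  # output of the dense query projection
q = query_out.reshape(B, S, N, D)     # (-1, shape_from, num_heads, size_per_head)
q = q.transpose(0, 2, 1, 3)           # trans_shape = (0, 2, 1, 3) -> BNSD
assert q.shape == (B, N, S, D)
```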
self.num_hidden_layers = config.num_hidden_layers self.embedding_size = config.hidden_size - self.use_packed = config.use_packed self.token_type_ids = None self.last_idx = self.num_hidden_layers - 1 @@ -822,7 +927,6 @@ class BertModel(nn.Cell): self.dtype = config.dtype self.cast_compute_type = SaturateCast(dst_type=config.compute_type) self.slice = P.StridedSlice() - self.gather = P.Gather() self.squeeze_1 = P.Squeeze(axis=1) self.dense = nn.Dense(self.hidden_size, self.hidden_size, @@ -830,7 +934,7 @@ class BertModel(nn.Cell): weight_init=TruncatedNormal(config.initializer_range)).to_float(config.compute_type) self._create_attention_mask_from_input_mask = CreateAttentionMaskFromInputMask(config) - def construct(self, input_ids, token_type_ids, input_mask, next_sentence_starts=None): + def construct(self, input_ids, token_type_ids, input_mask): """Bidirectional Encoder Representations from Transformers.""" # embedding embedding_tables = self.bert_embedding_lookup.embedding_table @@ -849,18 +953,11 @@ class BertModel(nn.Cell): # pooler batch_size = P.Shape()(input_ids)[0] - if self.use_packed: - slices = [] - for i in range(batch_size): - slices.append(sequence_output[i][next_sentence_starts[i]]) - sequence_slice = F.stack(slices) - first_token = F.reshape(sequence_slice, (-1, self.hidden_size)) - else: - sequence_slice = self.slice(sequence_output, - (0, 0, 0), - (batch_size, 1, self.hidden_size), - (1, 1, 1)) - first_token = self.squeeze_1(sequence_slice) + sequence_slice = self.slice(sequence_output, + (0, 0, 0), + (batch_size, 1, self.hidden_size), + (1, 1, 1)) + first_token = self.squeeze_1(sequence_slice) pooled_output = self.dense(first_token) pooled_output = self.cast(pooled_output, self.dtype) diff --git a/benchmark/ascend/bert/src/dataset.py b/benchmark/ascend/bert/src/dataset.py index a6d70ca67..37aff4e36 100644 --- a/benchmark/ascend/bert/src/dataset.py +++ b/benchmark/ascend/bert/src/dataset.py @@ -1,22 +1,7 @@ -# Copyright 2020-2022 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -""" -Data operations, will be used in run_pretrain.py -""" import os import math +from dataclasses import field, dataclass + import numpy as np import mindspore.common.dtype as mstype import mindspore.dataset as ds @@ -27,42 +12,48 @@ from mindspore import log as logger class BucketDatasetGenerator: """ Provide data distribution of different gears for the bert network. - Args: - dataset (Dataset): The training dataset. + dataset (Dataset): The training dataset batch_size (Int): The training batchsize. - bucket_list (List): List of different sentence lengths, such as [128, 256, 512]. Default: None. - valid_dataset_len (Int): Prevent communication failure at the end of the dataset. Default: 0.35. + bucket_list (List): List of different sentence lengths, such as [128, 256, 512]. 
Default: None """ - def __init__(self, dataset, batch_size, bucket_list=None, valid_dataset_len=0.35): + def __init__(self, dataset, batch_size, bucket_list=None, train_steps=None): self.dataset = dataset self.batch_size = batch_size self.bucket_list = bucket_list - bucket_size = len(bucket_list) - self.random_list = np.random.binomial(n=(bucket_size - 1), p=0.55, size=self.__len__()) - self.random_list = (self.random_list + 2) % bucket_size + self.train_steps = 20000 + self.random_list_strategy3() self.random_list = [bucket_list[i] for i in self.random_list] - valid_dataset_len = int(valid_dataset_len * self.__len__()) - self.random_list = self.random_list[:valid_dataset_len] + [bucket_list[-1]] * self.__len__() self._init_variables() + self.target_clip_length = bucket_list[0] + self.clipped_length_from = bucket_list[-1] def _init_variables(self): self.data_bucket = {bucket: [] for bucket in self.bucket_list} self.iter = 0 self.remaining_data = [] - self.remaining_data_size = 1 self.stage = 0 + def random_list_strategy3(self, p128=0.5): + p128 = os.getenv("P128", p128) + print("p128: ", p128) + p128 = int(float(p128) * 100) + p512 = 100 - p128 + rpt = self.__len__() // 100 + rmd = self.__len__() % 100 + self.random_list = np.array(([0] * p128 + [1] * p512) * rpt + [0] * rmd) + def __next__(self): if self.stage != 0: return self._process_remaining_data() for item in self.iterator: - for seq_length in self.bucket_list: - if np.sum(item[1]) <= seq_length: - self.data_bucket[seq_length].append(item) - break + if np.sum(item[1]) < 384: + item = self.clip_data(item) + self.data_bucket.get(self.bucket_list[0]).append(item) + else: + self.data_bucket.get(self.bucket_list[-1]).append(item) for key in self.data_bucket.keys(): data = self.data_bucket[key] if len(data) >= self.batch_size and self.random_list[self.iter] == key: @@ -72,8 +63,18 @@ class BucketDatasetGenerator: self.stage = 1 return self._process_remaining_data() + @staticmethod + def clip_data(item): + item_clip = item[:3] + item_clip.append(np.array([0], np.int64)) + masked_lm_positions = np.array([0] * 76) + masked_lm_ids = np.array([0] * 76) + masked_lm_weights = np.array([0.0] * 76, np.float32) + item_clip += [masked_lm_positions, masked_lm_ids, masked_lm_weights] + return item_clip + def _package_data(self, data, key): - """package a set of data.""" + """Package a set of data.""" arr = data[0] for i in range(1, self.batch_size): current_data = data[i] @@ -118,22 +119,36 @@ class BucketDatasetGenerator: def create_bert_dataset(device_num=1, rank=0, do_shuffle="true", data_dir=None, schema_dir=None, batch_size=32, - bucket_list=None, use_packed=False): + bucket_list=None, dataset_format="mindrecord", num_samples=None): """create train dataset""" # apply repeat operations files = os.listdir(data_dir) data_files = [] for file_name in files: - if use_packed or "tfrecord" in file_name: + condition1 = dataset_format == "tfrecord" and "tfrecord" in file_name + condition2 = dataset_format == "mindrecord" and "mindrecord" in file_name and "mindrecord.db" not in file_name + if condition1 or condition2: data_files.append(os.path.join(data_dir, file_name)) - columns_list = ["input_ids", "input_mask", "segment_ids", "next_sentence_labels", "masked_lm_positions", - "masked_lm_ids", "masked_lm_weights"] - if use_packed: - columns_list.extend(["next_sentence_positions", "next_sentence_weights"]) - data_set = ds.TFRecordDataset(data_files, schema_dir if schema_dir != "" else None, - columns_list=columns_list, - shuffle=ds.Shuffle.FILES if 
do_shuffle == "true" else False, - num_shards=device_num, shard_id=rank, shard_equal_rows=True) + if dataset_format == "mindrecord": + if str(num_samples).lower() != "none": + data_set = ds.MindDataset(data_files, + columns_list=["input_ids", "input_mask", "segment_ids", "next_sentence_labels", + "masked_lm_positions", "masked_lm_ids", "masked_lm_weights"], + shuffle=False, num_shards=device_num, shard_id=rank, num_samples=num_samples) + else: + data_set = ds.MindDataset(data_files, + columns_list=["input_ids", "input_mask", "segment_ids", "next_sentence_labels", + "masked_lm_positions", "masked_lm_ids", "masked_lm_weights"], + shuffle=ds.Shuffle.FILES if do_shuffle == "true" else False, + num_shards=device_num, shard_id=rank) + elif dataset_format == "tfrecord": + data_set = ds.TFRecordDataset(data_files, schema_dir if schema_dir != "" else None, + columns_list=["input_ids", "input_mask", "segment_ids", "next_sentence_labels", + "masked_lm_positions", "masked_lm_ids", "masked_lm_weights"], + shuffle=ds.Shuffle.FILES if do_shuffle == "true" else False, + num_shards=device_num, shard_id=rank, shard_equal_rows=True) + else: + raise NotImplementedError("Only supported dataset_format for tfrecord or mindrecord.") if bucket_list: bucket_dataset = BucketDatasetGenerator(data_set, batch_size, bucket_list=bucket_list) data_set = ds.GeneratorDataset(bucket_dataset, @@ -152,9 +167,6 @@ def create_bert_dataset(device_num=1, rank=0, do_shuffle="true", data_dir=None, data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids") data_set = data_set.map(operations=type_cast_op, input_columns="input_mask") data_set = data_set.map(operations=type_cast_op, input_columns="input_ids") - if use_packed: - data_set = data_set.map(operations=type_cast_op, input_columns="next_sentence_positions") - data_set = data_set.map(operations=type_cast_op, input_columns="next_sentence_weights") # apply batch operations logger.info("data size: {}".format(data_set.get_dataset_size())) logger.info("repeat count: {}".format(data_set.get_repeat_count())) @@ -169,10 +181,12 @@ def create_ner_dataset(batch_size=1, assessment_method="accuracy", data_file_pat dataset = ds.MindDataset([data_file_path], columns_list=["input_ids", "input_mask", "segment_ids", "label_ids"], shuffle=do_shuffle) - else: + elif dataset_format == "tfrecord": dataset = ds.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None, columns_list=["input_ids", "input_mask", "segment_ids", "label_ids"], shuffle=do_shuffle) + else: + raise NotImplementedError("Only supported dataset_format for tfrecord or mindrecord.") if assessment_method == "Spearman_correlation": type_cast_op_float = C.TypeCast(mstype.float32) dataset = dataset.map(operations=type_cast_op_float, input_columns="label_ids") @@ -186,13 +200,20 @@ def create_ner_dataset(batch_size=1, assessment_method="accuracy", data_file_pat return dataset -def create_classification_dataset(batch_size=1, assessment_method="accuracy", - data_file_path=None, schema_file_path=None, do_shuffle=True): +def create_classification_dataset(batch_size=1, assessment_method="accuracy", data_file_path=None, + schema_file_path=None, dataset_format="mindrecord", do_shuffle=True): """create finetune or evaluation dataset""" type_cast_op = C.TypeCast(mstype.int32) - data_set = ds.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None, + if dataset_format == "mindrecord": + data_set = ds.MindDataset([data_file_path], columns_list=["input_ids", 
"input_mask", "segment_ids", "label_ids"], shuffle=do_shuffle) + elif dataset_format == "tfrecord": + data_set = ds.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None, + columns_list=["input_ids", "input_mask", "segment_ids", "label_ids"], + shuffle=do_shuffle) + else: + raise NotImplementedError("Only supported dataset_format for tfrecord or mindrecord.") if assessment_method == "Spearman_correlation": type_cast_op_float = C.TypeCast(mstype.float32) data_set = data_set.map(operations=type_cast_op_float, input_columns="label_ids") @@ -212,19 +233,34 @@ def generator_squad(data_features): def create_squad_dataset(batch_size=1, data_file_path=None, schema_file_path=None, - is_training=True, do_shuffle=True): + is_training=True, do_shuffle=True, dataset_format="mindrecord"): """create finetune or evaluation dataset""" type_cast_op = C.TypeCast(mstype.int32) if is_training: - data_set = ds.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None, + if dataset_format == "mindrecord": + data_set = ds.MindDataset([data_file_path], columns_list=["input_ids", "input_mask", "segment_ids", "start_positions", "end_positions", "unique_ids", "is_impossible"], shuffle=do_shuffle) + elif dataset_format == "tfrecord": + data_set = ds.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None, + columns_list=["input_ids", "input_mask", "segment_ids", "start_positions", + "end_positions", "unique_ids", "is_impossible"], + shuffle=do_shuffle) + else: + raise NotImplementedError("Only supported dataset_format for tfrecord or mindrecord.") data_set = data_set.map(operations=type_cast_op, input_columns="start_positions") data_set = data_set.map(operations=type_cast_op, input_columns="end_positions") else: - data_set = ds.GeneratorDataset(generator_squad(data_file_path), shuffle=do_shuffle, - column_names=["input_ids", "input_mask", "segment_ids", "unique_ids"]) + if dataset_format == "mindrecord": + data_set = ds.MindDataset([data_file_path], + columns_list=["input_ids", "input_mask", "segment_ids", "unique_ids"], + shuffle=do_shuffle) + elif dataset_format == "tfrecord": + data_set = ds.GeneratorDataset(generator_squad(data_file_path), shuffle=do_shuffle, + column_names=["input_ids", "input_mask", "segment_ids", "unique_ids"]) + else: + raise NotImplementedError("Only supported dataset_format for tfrecord or mindrecord.") data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids") data_set = data_set.map(operations=type_cast_op, input_columns="input_mask") data_set = data_set.map(operations=type_cast_op, input_columns="input_ids") @@ -234,22 +270,54 @@ def create_squad_dataset(batch_size=1, data_file_path=None, schema_file_path=Non return data_set -def create_eval_dataset(batchsize=32, device_num=1, rank=0, data_dir=None, schema_dir=None, use_packed=False): +@dataclass +class CreateEvalDatasetInput: + batchsize: int = field(default=32) + device_num: int = field(default=1) + rank: int = field(default=0) + data_dir: str = field(default=None) + schema_dir: str = field(default=None) + dataset_format: str = field(default="mindrecord") + num_samples: int = field(default=None) + + +def create_eval_dataset(inputs=CreateEvalDatasetInput()): """create evaluation dataset""" + batchsize = inputs.batchsize + device_num = inputs.device_num + rank = inputs.rank + data_dir = inputs.data_dir + schema_dir = inputs.schema_dir + dataset_format = inputs.dataset_format + num_samples = inputs.num_samples data_files = [] if 
os.path.isdir(data_dir): files = os.listdir(data_dir) for file_name in files: - if use_packed or "tfrecord" in file_name: + condition1 = dataset_format == "tfrecord" and "tfrecord" in file_name + condition2 = dataset_format == "mindrecord" and "mindrecord" in file_name \ + and "mindrecord.db" not in file_name + if condition1 or condition2: data_files.append(os.path.join(data_dir, file_name)) else: data_files.append(data_dir) - columns_list = ["input_ids", "input_mask", "segment_ids", "next_sentence_labels", - "masked_lm_positions", "masked_lm_ids", "masked_lm_weights"] - if use_packed: - columns_list.extend(["next_sentence_positions", "next_sentence_weights"]) - data_set = ds.TFRecordDataset(data_files, schema_dir if schema_dir != "" else None, - columns_list=columns_list, shard_equal_rows=True) + if dataset_format == "mindrecord": + if str(num_samples).lower() != "none": + data_set = ds.MindDataset(data_files, + columns_list=["input_ids", "input_mask", "segment_ids", "next_sentence_labels", + "masked_lm_positions", "masked_lm_ids", "masked_lm_weights"], + num_samples=num_samples) + else: + data_set = ds.MindDataset(data_files, + columns_list=["input_ids", "input_mask", "segment_ids", "next_sentence_labels", + "masked_lm_positions", "masked_lm_ids", "masked_lm_weights"]) + elif dataset_format == "tfrecord": + data_set = ds.TFRecordDataset(data_files, schema_dir if schema_dir != "" else None, + columns_list=["input_ids", "input_mask", "segment_ids", "next_sentence_labels", + "masked_lm_positions", "masked_lm_ids", "masked_lm_weights"], + shard_equal_rows=True) + else: + raise NotImplementedError("Only supported dataset_format for tfrecord or mindrecord.") ori_dataset_size = data_set.get_dataset_size() print("origin eval size: ", ori_dataset_size) dtypes = data_set.output_types() @@ -265,18 +333,30 @@ def create_eval_dataset(batchsize=32, device_num=1, rank=0, data_dir=None, schem "masked_lm_positions": np.zeros(shapes[4], dtypes[4]), "masked_lm_ids": np.zeros(shapes[5], dtypes[5]), "masked_lm_weights": np.zeros(shapes[6], dtypes[6])} - if use_packed: - item["next_sentence_positions"] = np.zeros(shapes[7], dtypes[7]) - item["next_sentence_weights"] = np.zeros(shapes[8], dtypes[8]) padded_samples = [item for x in range(padded_num)] padded_ds = ds.PaddedDataset(padded_samples) eval_ds = data_set + padded_ds sampler = ds.DistributedSampler(num_shards=device_num, shard_id=rank, shuffle=False) eval_ds.use_sampler(sampler) else: - eval_ds = ds.TFRecordDataset(data_files, schema_dir if schema_dir != "" else None, - columns_list=columns_list, num_shards=device_num, - shard_id=rank, shard_equal_rows=True) + if dataset_format == "mindrecord": + if str(num_samples).lower() != "none": + eval_ds = ds.MindDataset(data_files, + columns_list=["input_ids", "input_mask", "segment_ids", "next_sentence_labels", + "masked_lm_positions", "masked_lm_ids", "masked_lm_weights"], + num_shards=device_num, shard_id=rank, num_samples=num_samples) + else: + eval_ds = ds.MindDataset(data_files, + columns_list=["input_ids", "input_mask", "segment_ids", "next_sentence_labels", + "masked_lm_positions", "masked_lm_ids", "masked_lm_weights"], + num_shards=device_num, shard_id=rank) + elif dataset_format == "tfrecord": + eval_ds = ds.TFRecordDataset(data_files, schema_dir if schema_dir != "" else None, + columns_list=["input_ids", "input_mask", "segment_ids", "next_sentence_labels", + "masked_lm_positions", "masked_lm_ids", "masked_lm_weights"], + num_shards=device_num, shard_id=rank, shard_equal_rows=True) + else: + raise 
NotImplementedError("Only supported dataset_format for tfrecord or mindrecord.") type_cast_op = C.TypeCast(mstype.int32) eval_ds = eval_ds.map(input_columns="masked_lm_ids", operations=type_cast_op) @@ -285,9 +365,6 @@ def create_eval_dataset(batchsize=32, device_num=1, rank=0, data_dir=None, schem eval_ds = eval_ds.map(input_columns="segment_ids", operations=type_cast_op) eval_ds = eval_ds.map(input_columns="input_mask", operations=type_cast_op) eval_ds = eval_ds.map(input_columns="input_ids", operations=type_cast_op) - if use_packed: - eval_ds = eval_ds.map(input_columns="next_sentence_positions", operations=type_cast_op) - eval_ds = eval_ds.map(input_columns="next_sentence_weights", operations=type_cast_op) eval_ds = eval_ds.batch(batchsize, drop_remainder=True) print("eval data size: {}".format(eval_ds.get_dataset_size())) diff --git a/benchmark/ascend/bert/src/model_utils/config.py b/benchmark/ascend/bert/src/model_utils/config.py index b0687b792..089d3c477 100644 --- a/benchmark/ascend/bert/src/model_utils/config.py +++ b/benchmark/ascend/bert/src/model_utils/config.py @@ -113,13 +113,16 @@ def merge(args, cfg): def parse_dtype(dtype): - if dtype not in ["mstype.float32", "mstype.float16"]: + #if dtype not in ["mstype.float32", "mstype.float16"]: + if dtype not in ["mstype.float32", "mstype.float16", "mstype.bfloat16"]: #773491 raise ValueError("Not supported dtype") if dtype == "mstype.float32": return mstype.float32 if dtype == "mstype.float16": return mstype.float16 + if dtype == "mstype.bfloat16": #773491 + return mstype.bfloat16 return None def extra_operations(cfg): @@ -157,7 +160,6 @@ def extra_operations(cfg): _bert_net_cfg = cfg.large_boost_net_cfg else: pass - _bert_net_cfg.use_packed = cfg.use_packed cfg.bert_net_cfg = BertConfig(**_bert_net_cfg.__dict__) elif cfg.description == 'run_ner': cfg.optimizer_cfg.AdamWeightDecay.decay_filter = \ @@ -194,7 +196,7 @@ def get_config(): current_dir = os.path.dirname(os.path.abspath(__file__)) return os.path.join(current_dir, path_relative) parser = argparse.ArgumentParser(description="default name", add_help=False) - parser.add_argument("--config_path", type=get_abs_path, default="../../pretrain_config_Ascend_Boost.yaml", + parser.add_argument("--config_path", type=get_abs_path, default="../../pretrain_config.yaml", help="Config file path") path_args, _ = parser.parse_known_args() default, helper, choices = parse_yaml(path_args.config_path) diff --git a/benchmark/ascend/bert/src/utils.py b/benchmark/ascend/bert/src/utils.py index 682cbfc18..1eba86895 100644 --- a/benchmark/ascend/bert/src/utils.py +++ b/benchmark/ascend/bert/src/utils.py @@ -20,6 +20,7 @@ Functional Cells used in Bert finetune and evaluation. import os import math import collections +import datetime import numpy as np import mindspore.nn as nn from mindspore import log as logger @@ -84,6 +85,7 @@ def make_directory(path: str): raise TypeError("No write permission on the directory.") return real_path + class LossCallBack(Callback): """ Monitor the loss in training. 
@@ -96,24 +98,24 @@ class LossCallBack(Callback): def __init__(self, dataset_size=-1): super(LossCallBack, self).__init__() self._dataset_size = dataset_size + def step_end(self, run_context): """ Print loss after each step """ cb_params = run_context.original_args() - loss, is_overflow, loss_scale = [output.asnumpy().item() for output in cb_params.net_outputs] if self._dataset_size > 0: percent, epoch_num = math.modf(cb_params.cur_step_num / self._dataset_size) if percent == 0: percent = 1 epoch_num -= 1 - print("epoch: {}, current epoch percent: {}, step: {}, loss: {}, overflow: {}, loss scale: {}" - .format(int(epoch_num), "%.3f" % percent, cb_params.cur_step_num, loss, is_overflow, - int(loss_scale)), flush=True) + print("time: {}, epoch: {}, current epoch percent: {}, step: {}, outputs are {}" + .format(datetime.datetime.utcnow(), int(epoch_num), "%.3f" % percent, cb_params.cur_step_num, + str(cb_params.net_outputs)), + flush=True) else: - print("epoch: {}, step: {}, loss: {}, overflow: {}, loss scale: {}" - .format(cb_params.cur_epoch_num, cb_params.cur_step_num, loss, is_overflow, - int(loss_scale)), flush=True) + print("epoch: {}, step: {}, outputs are {}".format(cb_params.cur_epoch_num, cb_params.cur_step_num, + str(cb_params.net_outputs)), flush=True) def LoadNewestCkpt(load_finetune_checkpoint_dir, prefix): @@ -149,6 +151,8 @@ class BertLearningRate(LearningRateSchedule): self.greater = P.Greater() self.one = Tensor(np.array([1.0]).astype(np.float32)) self.cast = P.Cast() + from mindspore import ops + self.print = ops.Print() def construct(self, global_step): decay_lr = self.decay_lr(global_step) @@ -176,6 +180,7 @@ def convert_labels_to_index(label_list): label2id[sub_label] = index return label2id + def _get_poly_lr(global_step, lr_init, lr_end, lr_max, warmup_steps, total_steps, poly_power): """ generate learning rate array @@ -252,13 +257,15 @@ class EvalCallBack(Callback): if num_samples < self.eval_samples: return self.last_eval_step = cb_params.cur_step_num - total_sumples = cb_params.cur_step_num * self.global_batch + total_samples = cb_params.cur_step_num * self.global_batch res = self.model.eval(self.eval_ds, dataset_sink_mode=True) res = res['bert_acc'] print("====================================", flush=True) - print("Accuracy is: ", "%.6f" % res, ", current samples is: ", total_sumples) + print(f"Time: {datetime.datetime.utcnow()} Accuracy is: ", "%.6f" % res, ", current samples is: ", + total_samples) print("====================================", flush=True) + class BertMetric(Metric): """ The metric of bert network. 
-- Gitee From 6e0189dc6c76d74491b52fa0a010f1243b780592 Mon Sep 17 00:00:00 2001 From: ZhihaoLi Date: Sun, 28 Apr 2024 11:46:26 +0800 Subject: [PATCH 33/44] add bert large mlperf 16die --- .../hyper_parameter_config.ini | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 benchmark/ascend/bert/scripts/ascend_distributed_launcher/hyper_parameter_config.ini diff --git a/benchmark/ascend/bert/scripts/ascend_distributed_launcher/hyper_parameter_config.ini b/benchmark/ascend/bert/scripts/ascend_distributed_launcher/hyper_parameter_config.ini new file mode 100644 index 000000000..c70edb2e3 --- /dev/null +++ b/benchmark/ascend/bert/scripts/ascend_distributed_launcher/hyper_parameter_config.ini @@ -0,0 +1,12 @@ +[config] +distribute=true +epoch_size=40 +enable_save_ckpt=false +do_shuffle=false +enable_data_sink=true +data_sink_steps=100 +accumulation_steps=1 +allreduce_post_accumulation=true +save_checkpoint_path=./ +save_checkpoint_num=1 +config_path=../../pretrain_config_Ascend_Boost.yaml -- Gitee From 86d23d04bd49fd2efc3b73126e1ff66e0cc127cb Mon Sep 17 00:00:00 2001 From: luxingyu2023 Date: Sun, 28 Apr 2024 14:50:23 +0800 Subject: [PATCH 34/44] add mlperf resnet --- .../resnet50_imagenet2012_Boost_config.yaml | 54 ++- ...esnet50_imagenet2012_Boost_config_16p.yaml | 114 ++++++ benchmark/ascend/resnet/eval.py | 113 ------ benchmark/ascend/resnet/export.py | 63 ---- benchmark/ascend/resnet/scripts/cache_util.sh | 49 --- .../resnet/scripts/run_distribute_train.sh | 49 ++- .../scripts/run_distribute_train_2node_16p.sh | 135 +++++++ .../resnet/scripts/run_distribute_train_4p.sh | 154 ++++++++ benchmark/ascend/resnet/scripts/run_eval.sh | 67 ---- benchmark/ascend/resnet/scripts/run_infer.sh | 67 ---- .../ascend/resnet/scripts/run_infer_310.sh | 145 -------- .../resnet/scripts/run_standalone_train.sh | 110 ------ .../scripts/run_standalone_train_gpu.sh | 115 ------ .../ascend/resnet/src/CrossEntropySmooth.py | 2 +- .../src/{eval_callback.py => callback.py} | 113 ++++-- .../data_split.py} | 78 ++-- benchmark/ascend/resnet/src/dataset.py | 337 ++--------------- benchmark/ascend/resnet/src/dataset_infer.py | 14 +- benchmark/ascend/resnet/src/logger.py | 87 +++++ benchmark/ascend/resnet/src/lr_generator.py | 40 +- benchmark/ascend/resnet/src/metric.py | 11 +- .../ascend/resnet/src/model_utils/config.py | 6 +- .../resnet/src/model_utils/device_adapter.py | 2 +- .../resnet/src/model_utils/local_adapter.py | 2 +- .../resnet/src/model_utils/moxing_adapter.py | 14 +- benchmark/ascend/resnet/src/momentum.py | 13 +- benchmark/ascend/resnet/src/resnet.py | 14 +- .../ascend/resnet/src/resnet_gpu_benchmark.py | 2 +- benchmark/ascend/resnet/src/util.py | 144 ++++++++ benchmark/ascend/resnet/train.py | 346 +++++------------- 30 files changed, 1023 insertions(+), 1437 deletions(-) create mode 100644 benchmark/ascend/resnet/config/resnet50_imagenet2012_Boost_config_16p.yaml delete mode 100644 benchmark/ascend/resnet/eval.py delete mode 100644 benchmark/ascend/resnet/export.py delete mode 100644 benchmark/ascend/resnet/scripts/cache_util.sh create mode 100644 benchmark/ascend/resnet/scripts/run_distribute_train_2node_16p.sh create mode 100644 benchmark/ascend/resnet/scripts/run_distribute_train_4p.sh delete mode 100644 benchmark/ascend/resnet/scripts/run_eval.sh delete mode 100644 benchmark/ascend/resnet/scripts/run_infer.sh delete mode 100644 benchmark/ascend/resnet/scripts/run_infer_310.sh delete mode 100644 benchmark/ascend/resnet/scripts/run_standalone_train.sh delete mode 100644 
benchmark/ascend/resnet/scripts/run_standalone_train_gpu.sh rename benchmark/ascend/resnet/src/{eval_callback.py => callback.py} (41%) rename benchmark/ascend/resnet/{create_imagenet2012_label.py => src/data_split.py} (35%) create mode 100644 benchmark/ascend/resnet/src/logger.py create mode 100644 benchmark/ascend/resnet/src/util.py diff --git a/benchmark/ascend/resnet/config/resnet50_imagenet2012_Boost_config.yaml b/benchmark/ascend/resnet/config/resnet50_imagenet2012_Boost_config.yaml index 8a426a6b5..55774fa52 100644 --- a/benchmark/ascend/resnet/config/resnet50_imagenet2012_Boost_config.yaml +++ b/benchmark/ascend/resnet/config/resnet50_imagenet2012_Boost_config.yaml @@ -5,10 +5,10 @@ data_url: "" train_url: "" checkpoint_url: "" # Path for local -run_distribute: False +run_distribute: True enable_profiling: False -data_path: "/cache/data" -output_path: "/cache/train" +data_path: "/data/resnet_tc/Imagenet2012/train" +output_dir: "../outputs" load_path: "/cache/checkpoint_path/" device_target: "Ascend" checkpoint_path: "./checkpoint/" @@ -19,47 +19,50 @@ checkpoint_file_path: "" optimizer: "LARS" infer_label: "" class_num: 1001 -batch_size: 256 +batch_size: 192 +eval_batch_size: 250 loss_scale: 1024 -momentum: 0.85 -weight_decay: 5.0e-5 -epoch_size: 60 -pretrain_epoch_size: 0 +momentum: 0.9 +weight_decay: 5.0e-05 +epoch_size: 36 +start_epoch: 0 +resume_ckpt: "" save_checkpoint: False save_checkpoint_epochs: 5 keep_checkpoint_max: 10 -warmup_epochs: 1 +warmup_epochs: 5 lr_decay_mode: "poly" use_label_smooth: True label_smooth_factor: 0.1 lr_init: 0 -lr_max: 13.01 -lr_end: 0.0 +lr_max: 7.6 +lr_end: 0.0001 lars_epsilon: 0.0 lars_coefficient: 0.001 net_name: "resnet50" dataset: "imagenet2012" -device_num: 1 +device_num: 8 pre_trained: "" -run_eval: False -eval_dataset_path: "" +run_eval: True +eval_dataset_path: "/data/resnet_tc/Imagenet2012/val" parameter_server: False filter_weight: False -save_best_ckpt: True -eval_start_epoch: 30 -eval_interval: 1 +save_best_ckpt: False +eval_start_epoch: 3 +eval_interval: 4 enable_cache: False cache_session_id: "" mode_name: "GRAPH" boost_mode: "O1" conv_init: "TruncatedNormal" -dense_init: "RandomNormal" +dense_init: "TruncatedNormal" all_reduce_fusion_config: - 85 - 160 train_image_size: 192 eval_image_size: 224 +max_device_memory: "30GB" # Export options device_id: 0 @@ -80,6 +83,18 @@ has_trained_step: 0 result_path: '' label_path: '' +# prediction +img_path: '' + +# lite inference +enable_predict: False +enable_predict_lite_backend: False +enable_predict_lite_mindir: False + +# lite mindir inference +mindir_path: 'net.mindir' + + --- # Help description for each configuration enable_modelarts: "Whether training on modelarts, default: False" @@ -92,8 +107,9 @@ device_target: "Target device type, available: [Ascend, GPU, CPU]" enable_profiling: "Whether enable profiling while training, default: False" num_classes: "Class for dataset" batch_size: "Batch size for training and evaluation" -epoch_size: "Total training epochs." +epoch_size: 47 checkpoint_path: "The location of the checkpoint file." checkpoint_file_path: "The location of the checkpoint file." save_graphs: "Whether save graphs during training, default: False." save_graphs_path: "Path to save graphs." +img_path: "image file path." 
diff --git a/benchmark/ascend/resnet/config/resnet50_imagenet2012_Boost_config_16p.yaml b/benchmark/ascend/resnet/config/resnet50_imagenet2012_Boost_config_16p.yaml new file mode 100644 index 000000000..1f11aadef --- /dev/null +++ b/benchmark/ascend/resnet/config/resnet50_imagenet2012_Boost_config_16p.yaml @@ -0,0 +1,114 @@ +# Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing) +enable_modelarts: False +# Url for modelarts +data_url: "" +train_url: "" +checkpoint_url: "" +# Path for local +run_distribute: True +enable_profiling: False +data_path: "/data/resnet_tc/Imagenet2012/train" +output_dir: "../outputs" +load_path: "/cache/checkpoint_path/" +device_target: "Ascend" +checkpoint_path: "./checkpoint/" +checkpoint_file_path: "" + +# ============================================================================== +# Training options +optimizer: "LARS" +infer_label: "" +class_num: 1001 +batch_size: 192 +eval_batch_size: 125 +loss_scale: 1024 +momentum: 0.9 +weight_decay: 5.0e-05 +epoch_size: 37 +start_epoch: 0 +resume_ckpt: "" +save_checkpoint: False +save_checkpoint_epochs: 5 +keep_checkpoint_max: 10 +warmup_epochs: 5 +lr_decay_mode: "poly" +use_label_smooth: True +label_smooth_factor: 0.1 +lr_init: 0 +lr_max: 11.0 +lr_end: 0.0001 +lars_epsilon: 0.0 +lars_coefficient: 0.001 + +net_name: "resnet50" +dataset: "imagenet2012" +device_num: 8 +pre_trained: "" +run_eval: True +eval_dataset_path: "/data/resnet_tc/Imagenet2012/val" +parameter_server: False +filter_weight: False +save_best_ckpt: False +eval_start_epoch: 4 +eval_interval: 4 +enable_cache: False +cache_session_id: "" +mode_name: "GRAPH" +boost_mode: "O1" +conv_init: "TruncatedNormal" +dense_init: "TruncatedNormal" +all_reduce_fusion_config: + - 85 + - 160 +train_image_size: 192 +eval_image_size: 224 +max_device_memory: "30GB" + +# Export options +device_id: 0 +width: 224 +height: 224 +file_name: "resnet50" +file_format: "MINDIR" +ckpt_file: "" +network_dataset: "resnet50_imagenet2012" + +# Retrain options +save_graphs: False +save_graphs_path: "./graphs" +has_trained_epoch: 0 +has_trained_step: 0 + +# postprocess resnet inference +result_path: '' +label_path: '' + +# prediction +img_path: '' + +# lite inference +enable_predict: False +enable_predict_lite_backend: False +enable_predict_lite_mindir: False + +# lite mindir inference +mindir_path: 'net.mindir' + + +# # Help description for each configuration +# enable_modelarts: "Whether training on modelarts, default: False" +# data_url: "Dataset url for obs" +# checkpoint_url: "The location of checkpoint for obs" +# data_path: "Dataset path for local" +# output_path: "Training output path for local" +# load_path: "The location of checkpoint for obs" +# device_target: "Target device type, available: [Ascend, GPU, CPU]" +# enable_profiling: "Whether enable profiling while training, default: False" +# num_classes: "Class for dataset" +# batch_size: "Batch size for training and evaluation" +# epoch_size: 37 +# checkpoint_path: "The location of the checkpoint file." +# checkpoint_file_path: "The location of the checkpoint file." +# save_graphs: "Whether save graphs during training, default: False." +# save_graphs_path: "Path to save graphs." +# img_path: "image file path." 
diff --git a/benchmark/ascend/resnet/eval.py b/benchmark/ascend/resnet/eval.py deleted file mode 100644 index d7be56d47..000000000 --- a/benchmark/ascend/resnet/eval.py +++ /dev/null @@ -1,113 +0,0 @@ -# Copyright 2020-2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -"""eval resnet.""" -import os -import mindspore as ms -from mindspore import Tensor -from mindspore.nn.optim import Momentum -from mindspore.common import set_seed -from mindspore.nn.loss import SoftmaxCrossEntropyWithLogits -from mindspore.train.model import Model -from src.CrossEntropySmooth import CrossEntropySmooth -from src.model_utils.config import config -from src.model_utils.moxing_adapter import moxing_wrapper - -set_seed(1) - -if config.net_name in ("resnet18", "resnet34", "resnet50", "resnet152"): - if config.net_name == "resnet18": - from src.resnet import resnet18 as resnet - elif config.net_name == "resnet34": - from src.resnet import resnet34 as resnet - elif config.net_name == "resnet50": - from src.resnet import resnet50 as resnet - else: - from src.resnet import resnet152 as resnet - if config.dataset == "cifar10": - from src.dataset import create_dataset1 as create_dataset - else: - from src.dataset import create_dataset2 as create_dataset -elif config.net_name == "resnet101": - from src.resnet import resnet101 as resnet - from src.dataset import create_dataset3 as create_dataset -else: - from src.resnet import se_resnet50 as resnet - from src.dataset import create_dataset4 as create_dataset - - -def init_group_params(net): - decayed_params = [] - no_decayed_params = [] - for param in net.trainable_params(): - if 'beta' not in param.name and 'gamma' not in param.name and 'bias' not in param.name: - decayed_params.append(param) - else: - no_decayed_params.append(param) - - group_params = [{'params': decayed_params, 'weight_decay': config.weight_decay}, - {'params': no_decayed_params}, - {'order_params': net.trainable_params()}] - return group_params - - -@moxing_wrapper() -def eval_net(): - """eval net""" - target = config.device_target - - # init context - ms.set_context(mode=ms.GRAPH_MODE, device_target=target, save_graphs=False) - if target == "Ascend": - device_id = int(os.getenv('DEVICE_ID')) - ms.set_context(device_id=device_id) - - # create dataset - dataset = create_dataset(dataset_path=config.data_path, do_train=False, batch_size=config.batch_size, - eval_image_size=config.eval_image_size, - target=target) - - # define net - net = resnet(class_num=config.class_num) - - # load checkpoint - param_dict = ms.load_checkpoint(config.checkpoint_file_path) - ms.load_param_into_net(net, param_dict) - net.set_train(False) - - # define loss, model - if config.dataset == "imagenet2012": - if not config.use_label_smooth: - config.label_smooth_factor = 0.0 - loss = CrossEntropySmooth(sparse=True, reduction='mean', - smooth_factor=config.label_smooth_factor, - num_classes=config.class_num) - else: - loss = 
SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean') - - #Currently, boost inference only supports scenarios with optimizers - #Optimizer waiting for decoupling in boost model - group_params = init_group_params(net) - opt = Momentum(group_params, Tensor(0.0), config.momentum, loss_scale=config.loss_scale) - - # define model, add boostmode for eval scenarios with train.py - model = Model(net, loss_fn=loss, boost_level=config.boost_mode, - optimizer=opt, metrics={'top_1_accuracy', 'top_5_accuracy'}) - - # eval model - res = model.eval(dataset) - print("result:", res, "ckpt=", config.checkpoint_file_path) - -if __name__ == '__main__': - eval_net() diff --git a/benchmark/ascend/resnet/export.py b/benchmark/ascend/resnet/export.py deleted file mode 100644 index 267869836..000000000 --- a/benchmark/ascend/resnet/export.py +++ /dev/null @@ -1,63 +0,0 @@ -# Copyright 2020-2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -""" -##############export checkpoint file into air and onnx models################# -python export.py -""" -import os - -import mindspore as ms -from src.model_utils.config import config -from src.model_utils.moxing_adapter import moxing_wrapper - -ms.set_context(mode=ms.GRAPH_MODE, device_target=config.device_target) -if config.device_target != "GPU": - ms.set_context(device_id=config.device_id) - -def modelarts_pre_process(): - '''modelarts pre process function.''' - config.file_name = os.path.join(config.output_path, config.file_name) - -@moxing_wrapper(pre_process=modelarts_pre_process) -def run_export(): - """run export.""" - if config.network_dataset in ['resnet18_cifar10', 'resnet18_imagenet2012']: - from src.resnet import resnet18 as resnet - elif config.network_dataset == 'resnet34_imagenet2012': - from src.resnet import resnet34 as resnet - elif config.network_dataset in ['resnet50_cifar10', 'resnet50_imagenet2012']: - from src.resnet import resnet50 as resnet - elif config.network_dataset == 'resnet101_imagenet2012': - from src.resnet import resnet101 as resnet - elif config.network_dataset == 'resnet152_imagenet2012': - from src.resnet import resnet152 as resnet - elif config.network_dataset == 'se-resnet50_imagenet2012': - from src.resnet import se_resnet50 as resnet - else: - raise ValueError("network and dataset is not support.") - - net = resnet(config.class_num) - - assert config.checkpoint_file_path is not None, "checkpoint_path is None." 
- - param_dict = ms.load_checkpoint(config.checkpoint_file_path) - ms.load_param_into_net(net, param_dict) - - input_arr = ms.numpy.zeros([config.batch_size, 3, config.height, config.width], ms.float32) - ms.export(net, input_arr, file_name=config.file_name, file_format=config.file_format) - - -if __name__ == '__main__': - run_export() diff --git a/benchmark/ascend/resnet/scripts/cache_util.sh b/benchmark/ascend/resnet/scripts/cache_util.sh deleted file mode 100644 index a3aa77e54..000000000 --- a/benchmark/ascend/resnet/scripts/cache_util.sh +++ /dev/null @@ -1,49 +0,0 @@ -#!/bin/bash -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -bootup_cache_server() -{ - echo "Booting up cache server..." - result=$(cache_admin --start 2>&1) - rc=$? - echo "${result}" - if [ "${rc}" -ne 0 ] && [[ ! ${result} =~ "Cache server is already up and running" ]]; then - echo "cache_admin command failure!" "${result}" - exit 1 - fi -} - -generate_cache_session() -{ - result=$(cache_admin -g | awk 'END {print $NF}') - rc=$? - echo "${result}" - if [ "${rc}" -ne 0 ]; then - echo "cache_admin command failure!" "${result}" - exit 1 - fi -} - -shutdown_cache_server() -{ - echo "Shutting down cache server..." - result=$(cache_admin --stop 2>&1) - rc=$? - echo "${result}" - if [ "${rc}" -ne 0 ] && [[ ! ${result} =~ "Server on port 50052 is not reachable or has been shutdown already" ]]; then - echo "cache_admin command failure!" "${result}" - exit 1 - fi -} diff --git a/benchmark/ascend/resnet/scripts/run_distribute_train.sh b/benchmark/ascend/resnet/scripts/run_distribute_train.sh index c6577b16d..6dc972281 100644 --- a/benchmark/ascend/resnet/scripts/run_distribute_train.sh +++ b/benchmark/ascend/resnet/scripts/run_distribute_train.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright 2020-2021 Huawei Technologies Co., Ltd +# Copyright 2024 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -18,10 +18,12 @@ CURPATH="$(dirname "$0")" # shellcheck source=/dev/null . 
${CURPATH}/cache_util.sh -if [ $# != 3 ] && [ $# != 4 ] && [ $# != 5 ] +if [ $# != 3 ] && [ $# != 4 ] && [ $# != 5 ] && [ $# != 6 ] then - echo "Usage: bash run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](optional)" - echo " bash run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional)" + echo "Usage: bash run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH]" + echo "Usage: bash run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH] [RESUME_CKPT](optional)" + echo "Usage: bash run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional)" + echo "Usage: bash run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional) [RESUME_CKPT](optional)" exit 1 fi @@ -39,13 +41,13 @@ CONFIG_FILE=$(get_real_path $3) str="Boost_" if [[ $CONFIG_FILE =~ $str ]] then - export MS_DISABLE_REF_MODE=1 + export MS_DISABLE_REF_MODE=0 export MS_ENABLE_FORMAT_MODE=0 fi if [ $# == 4 ] then - PATH3=$(get_real_path $4) + RESUME_CKPT=$(get_real_path $4) fi if [ $# == 5 ] @@ -54,6 +56,13 @@ then EVAL_DATASET_PATH=$(get_real_path $5) fi +if [ $# == 6 ] +then + RUN_EVAL=$4 + EVAL_DATASET_PATH=$(get_real_path $5) + RESUME_CKPT=$(get_real_path $6) +fi + if [ ! -f $PATH1 ] then echo "error: RANK_TABLE_FILE=$PATH1 is not a file" @@ -66,9 +75,9 @@ then exit 1 fi -if [ $# == 4 ] && [ ! -f $PATH3 ] +if [ $# == 4 ] && [ ! -f $RESUME_CKPT ] then - echo "error: PRETRAINED_CKPT_PATH=$PATH3 is not a file" + echo "error: RESUME_CKPT=$RESUME_CKPT is not a file" exit 1 fi @@ -101,6 +110,7 @@ do start=`expr $i \* $avg` end=`expr $start \+ $gap` cmdopt=$start"-"$end + echo "773491: $cmdopt" export DEVICE_ID=${i} export RANK_ID=$((rank_start + i)) rm -rf ./train_parallel$i @@ -115,20 +125,31 @@ do if [ $# == 3 ] then taskset -c $cmdopt python train.py --run_distribute=True --device_num=$RANK_SIZE --data_path=$PATH2 \ - --config_path=$CONFIG_FILE --output_path './output' &> log & + --config_path=$CONFIG_FILE --output_dir '../outputs' &> log.txt & fi - + if [ $# == 4 ] then - taskset -c $cmdopt python train.py --run_distribute=True --device_num=$RANK_SIZE --data_path=$PATH2 --pre_trained=$PATH3 \ - --config_path=$CONFIG_FILE --output_path './output' &> log & + taskset -c $cmdopt python train.py --run_distribute=True --device_num=$RANK_SIZE --data_path=$PATH2 --resume_ckpt=$RESUME_CKPT \ + --config_path=$CONFIG_FILE --output_dir '../outputs' &> log.txt & fi if [ $# == 5 ] then taskset -c $cmdopt python train.py --run_distribute=True --device_num=$RANK_SIZE --data_path=$PATH2 \ - --run_eval=$RUN_EVAL --eval_dataset_path=$EVAL_DATASET_PATH --enable_cache=True \ - --cache_session_id=$CACHE_SESSION_ID --config_path=$CONFIG_FILE --output_path './output' &> log & + --run_eval=$RUN_EVAL --eval_dataset_path=$EVAL_DATASET_PATH --enable_cache=False \ + --cache_session_id=$CACHE_SESSION_ID --config_path=$CONFIG_FILE --output_dir '../outputs' &> log.txt & + if [ "x${RUN_EVAL}" == "xTrue" ] + then + echo -e "\nWhen training run is done, remember to shut down the cache server via \"cache_admin --stop\"" + fi + fi + + if [ $# == 6 ] + then + taskset -c $cmdopt python train.py --run_distribute=True --device_num=$RANK_SIZE --data_path=$PATH2 \ + --run_eval=$RUN_EVAL --eval_dataset_path=$EVAL_DATASET_PATH --enable_cache=True --resume_ckpt=$RESUME_CKPT \ + --cache_session_id=$CACHE_SESSION_ID 
--config_path=$CONFIG_FILE --output_dir '../outputs' &> log.txt & if [ "x${RUN_EVAL}" == "xTrue" ] then echo -e "\nWhen training run is done, remember to shut down the cache server via \"cache_admin --stop\"" diff --git a/benchmark/ascend/resnet/scripts/run_distribute_train_2node_16p.sh b/benchmark/ascend/resnet/scripts/run_distribute_train_2node_16p.sh new file mode 100644 index 000000000..6df3a9d69 --- /dev/null +++ b/benchmark/ascend/resnet/scripts/run_distribute_train_2node_16p.sh @@ -0,0 +1,135 @@ +#!/bin/bash +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +CURPATH="$(dirname "$0")" +# shellcheck source=/dev/null +. ${CURPATH}/cache_util.sh + +if [ $# != 3 ] && [ $# != 4 ] && [ $# != 5 ] && [ $# != 6 ] +then + echo "Usage: bash run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH]" + echo "Usage: bash run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH] [RESUME_CKPT](optional)" + echo "Usage: bash run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional)" + echo "Usage: bash run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional) [RESUME_CKPT](optional)" + exit 1 +fi + +get_real_path(){ + if [ "${1:0:1}" == "/" ]; then + echo "$1" + else + echo "$(realpath -m $PWD/$1)" + fi +} + +PATH1=$(get_real_path $1) +PATH2=$(get_real_path $2) +CONFIG_FILE=$(get_real_path $3) +str="Boost_" +if [[ $CONFIG_FILE =~ $str ]] +then + export MS_DISABLE_REF_MODE=0 + export MS_ENABLE_FORMAT_MODE=0 +fi + +if [ $# == 5 ] +then + RUN_EVAL=$4 + EVAL_DATASET_PATH=$(get_real_path $5) +fi + +export SERVER_ID=0 + +if [ $# == 6 ] +then + RUN_EVAL=$4 + EVAL_DATASET_PATH=$(get_real_path $5) + export SERVER_ID=$6 +fi + +if [ ! -f $PATH1 ] +then + echo "error: RANK_TABLE_FILE=$PATH1 is not a file" +exit 1 +fi + +if [ ! -d $PATH2 ] +then + echo "error: DATASET_PATH=$PATH2 is not a directory" +exit 1 +fi + +if [ "x${RUN_EVAL}" == "xTrue" ] && [ ! 
-d $EVAL_DATASET_PATH ] +then + echo "error: EVAL_DATASET_PATH=$EVAL_DATASET_PATH is not a directory" + exit 1 +fi + + +ulimit -u unlimited +export DEVICE_NUM=8 +export RANK_SIZE=16 +export RANK_TABLE_FILE=$PATH1 +offset=0 + +rank_start=$((DEVICE_NUM * SERVER_ID)) + +cpus=`cat /proc/cpuinfo| grep "processor"| wc -l` +avg=`expr $cpus \/ $DEVICE_NUM` +gap=`expr $avg \- 1` + +for((i=0; i<${DEVICE_NUM}; i++)) +do + start=`expr $i \* $avg` + end=`expr $start \+ $gap` + cmdopt=$start"-"$end + echo "773491: $cmdopt" + export DEVICE_ID=$((offset + i)) + export RANK_ID=$((rank_start + i)) + rm -rf ./train_parallel$i + mkdir ./train_parallel$i + cp ../*.py ./train_parallel$i + cp *.sh ./train_parallel$i + cp -r ../config/*.yaml ./train_parallel$i + cp -r ../src ./train_parallel$i + cd ./train_parallel$i || exit + echo "start training for rank $RANK_ID, device $DEVICE_ID" + env > env.log + + if [ $# == 5 ] + then + taskset -c $cmdopt python train.py --run_distribute=True --device_num=$RANK_SIZE --data_path=$PATH2 \ + --run_eval=$RUN_EVAL --eval_dataset_path=$EVAL_DATASET_PATH --enable_cache=False \ + --cache_session_id=$CACHE_SESSION_ID --config_path=$CONFIG_FILE --output_dir '../outputs' &> log.txt & + if [ "x${RUN_EVAL}" == "xTrue" ] + then + echo -e "\nWhen training run is done, remember to shut down the cache server via \"cache_admin --stop\"" + fi + fi + + if [ $# == 6 ] + then + taskset -c $cmdopt python train.py --run_distribute=True --device_num=$RANK_SIZE --data_path=$PATH2 \ + --run_eval=$RUN_EVAL --eval_dataset_path=$EVAL_DATASET_PATH --enable_cache=False \ + --cache_session_id=$CACHE_SESSION_ID --config_path=$CONFIG_FILE --output_dir '../outputs' &> log.txt & + if [ "x${RUN_EVAL}" == "xTrue" ] + then + echo -e "\nWhen training run is done, remember to shut down the cache server via \"cache_admin --stop\"" + fi + fi + cd .. +done + diff --git a/benchmark/ascend/resnet/scripts/run_distribute_train_4p.sh b/benchmark/ascend/resnet/scripts/run_distribute_train_4p.sh new file mode 100644 index 000000000..6c6d1680b --- /dev/null +++ b/benchmark/ascend/resnet/scripts/run_distribute_train_4p.sh @@ -0,0 +1,154 @@ +#!/bin/bash +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +CURPATH="$(dirname "$0")" +# shellcheck source=/dev/null +. 
${CURPATH}/cache_util.sh + +if [ $# != 3 ] && [ $# != 4 ] && [ $# != 5 ] && [ $# != 6 ] +then + echo "Usage: bash run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH]" + echo "Usage: bash run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH] [RESUME_CKPT](optional)" + echo "Usage: bash run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional)" + echo "Usage: bash run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional) [RESUME_CKPT](optional)" + exit 1 +fi + +get_real_path(){ + if [ "${1:0:1}" == "/" ]; then + echo "$1" + else + echo "$(realpath -m $PWD/$1)" + fi +} + +PATH1=$(get_real_path $1) +PATH2=$(get_real_path $2) +CONFIG_FILE=$(get_real_path $3) +str="Boost_" +if [[ $CONFIG_FILE =~ $str ]] +then + export MS_DISABLE_REF_MODE=0 + export MS_ENABLE_FORMAT_MODE=0 +fi + +if [ $# == 4 ] +then + RESUME_CKPT=$(get_real_path $4) +fi + +if [ $# == 5 ] +then + RUN_EVAL=$4 + EVAL_DATASET_PATH=$(get_real_path $5) +fi + +if [ $# == 6 ] +then + RUN_EVAL=$4 + EVAL_DATASET_PATH=$(get_real_path $5) + RESUME_CKPT=$(get_real_path $6) +fi + +if [ ! -f $PATH1 ] +then + echo "error: RANK_TABLE_FILE=$PATH1 is not a file" +exit 1 +fi + +if [ ! -d $PATH2 ] +then + echo "error: DATASET_PATH=$PATH2 is not a directory" +exit 1 +fi + +if [ $# == 4 ] && [ ! -f $RESUME_CKPT ] +then + echo "error: RESUME_CKPT=$RESUME_CKPT is not a file" +exit 1 +fi + +if [ "x${RUN_EVAL}" == "xTrue" ] && [ ! -d $EVAL_DATASET_PATH ] +then + echo "error: EVAL_DATASET_PATH=$EVAL_DATASET_PATH is not a directory" + exit 1 +fi + + +ulimit -u unlimited +export DEVICE_NUM=4 +export RANK_SIZE=4 +export RANK_TABLE_FILE=$PATH1 + +export SERVER_ID=0 +rank_start=$((DEVICE_NUM * SERVER_ID)) + +cpus=`cat /proc/cpuinfo| grep "processor"| wc -l` +avg=`expr $cpus \/ $DEVICE_NUM` +gap=`expr $avg \- 1` + +for((i=0; i<${DEVICE_NUM}; i++)) +do + start=`expr $i \* $avg` + end=`expr $start \+ $gap` + cmdopt=$start"-"$end + echo "773491: $cmdopt" + export DEVICE_ID=$((4 + i)) + export RANK_ID=$((rank_start + i)) + rm -rf ./train_parallel$i + mkdir ./train_parallel$i + cp ../*.py ./train_parallel$i + cp *.sh ./train_parallel$i + cp -r ../config/*.yaml ./train_parallel$i + cp -r ../src ./train_parallel$i + cd ./train_parallel$i || exit + echo "start training for rank $RANK_ID, device $DEVICE_ID" + env > env.log + if [ $# == 3 ] + then + taskset -c $cmdopt python train.py --run_distribute=True --device_num=$RANK_SIZE --data_path=$PATH2 \ + --config_path=$CONFIG_FILE --output_dir '../outputs' &> log.txt & + fi + + if [ $# == 4 ] + then + taskset -c $cmdopt python train.py --run_distribute=True --device_num=$RANK_SIZE --data_path=$PATH2 --resume_ckpt=$RESUME_CKPT \ + --config_path=$CONFIG_FILE --output_dir '../outputs' &> log.txt & + fi + + if [ $# == 5 ] + then + taskset -c $cmdopt python train.py --run_distribute=True --device_num=$RANK_SIZE --data_path=$PATH2 \ + --run_eval=$RUN_EVAL --eval_dataset_path=$EVAL_DATASET_PATH --enable_cache=False \ + --cache_session_id=$CACHE_SESSION_ID --config_path=$CONFIG_FILE --output_dir '../outputs' &> log.txt & + if [ "x${RUN_EVAL}" == "xTrue" ] + then + echo -e "\nWhen training run is done, remember to shut down the cache server via \"cache_admin --stop\"" + fi + fi + + if [ $# == 6 ] + then + taskset -c $cmdopt python train.py --run_distribute=True --device_num=$RANK_SIZE --data_path=$PATH2 \ + --run_eval=$RUN_EVAL --eval_dataset_path=$EVAL_DATASET_PATH 
--enable_cache=True --resume_ckpt=$RESUME_CKPT \ + --cache_session_id=$CACHE_SESSION_ID --config_path=$CONFIG_FILE --output_dir '../outputs' &> log.txt & + if [ "x${RUN_EVAL}" == "xTrue" ] + then + echo -e "\nWhen training run is done, remember to shut down the cache server via \"cache_admin --stop\"" + fi + fi + cd .. +done diff --git a/benchmark/ascend/resnet/scripts/run_eval.sh b/benchmark/ascend/resnet/scripts/run_eval.sh deleted file mode 100644 index 97a7ba85c..000000000 --- a/benchmark/ascend/resnet/scripts/run_eval.sh +++ /dev/null @@ -1,67 +0,0 @@ -#!/bin/bash -# Copyright 2020-2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ - -if [ $# != 3 ] -then - echo "Usage: bash run_eval.sh [DATASET_PATH] [CHECKPOINT_PATH] [CONFIG_PATH]" -exit 1 -fi - -get_real_path(){ - if [ "${1:0:1}" == "/" ]; then - echo "$1" - else - echo "$(realpath -m $PWD/$1)" - fi -} - -PATH1=$(get_real_path $1) -PATH2=$(get_real_path $2) -CONFIG_FILE=$(get_real_path $3) - - -if [ ! -d $PATH1 ] -then - echo "error: DATASET_PATH=$PATH1 is not a directory" -exit 1 -fi - -if [ ! -f $PATH2 ] -then - echo "error: CHECKPOINT_PATH=$PATH2 is not a file" -exit 1 -fi - -ulimit -u unlimited -export DEVICE_NUM=1 -export DEVICE_ID=0 -export RANK_SIZE=$DEVICE_NUM -export RANK_ID=0 - -if [ -d "eval" ]; -then - rm -rf ./eval -fi -mkdir ./eval -cp ../*.py ./eval -cp *.sh ./eval -cp -r ../config/*.yaml ./eval -cp -r ../src ./eval -cd ./eval || exit -env > env.log -echo "start evaluation for device $DEVICE_ID" -python eval.py --data_path=$PATH1 --checkpoint_file_path=$PATH2 --config_path=$CONFIG_FILE &> log & -cd .. diff --git a/benchmark/ascend/resnet/scripts/run_infer.sh b/benchmark/ascend/resnet/scripts/run_infer.sh deleted file mode 100644 index b73e956c1..000000000 --- a/benchmark/ascend/resnet/scripts/run_infer.sh +++ /dev/null @@ -1,67 +0,0 @@ -#!/bin/bash -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ - -if [ $# != 3 ] -then - echo "Usage: bash run_eval.sh [DATASET_PATH] [CHECKPOINT_PATH] [CONFIG_PATH]" -exit 1 -fi - -get_real_path(){ - if [ "${1:0:1}" == "/" ]; then - echo "$1" - else - echo "$(realpath -m $PWD/$1)" - fi -} - -PATH1=$(get_real_path $1) -PATH2=$(get_real_path $2) -CONFIG_FILE=$(get_real_path $3) - - -if [ ! 
-d $PATH1 ] -then - echo "error: DATASET_PATH=$PATH1 is not a directory" -exit 1 -fi - -if [ ! -f $PATH2 ] -then - echo "error: CHECKPOINT_PATH=$PATH2 is not a file" -exit 1 -fi - -ulimit -u unlimited -export DEVICE_NUM=1 -export DEVICE_ID=0 -export RANK_SIZE=$DEVICE_NUM -export RANK_ID=0 - -if [ -d "infer" ]; -then - rm -rf ./infer -fi -mkdir ./infer -cp ../config/*.yaml ./infer -cp ../*.py ./infer -cp *.sh ./infer -cp -r ../src ./infer -cd ./infer || exit -env > env.log -echo "start evaluation for device $DEVICE_ID" -python infer.py --data_path=$PATH1 --checkpoint_file_path=$PATH2 --config_path=$CONFIG_FILE &> log & -cd .. diff --git a/benchmark/ascend/resnet/scripts/run_infer_310.sh b/benchmark/ascend/resnet/scripts/run_infer_310.sh deleted file mode 100644 index a733fd474..000000000 --- a/benchmark/ascend/resnet/scripts/run_infer_310.sh +++ /dev/null @@ -1,145 +0,0 @@ -#!/bin/bash -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ - -if [[ $# -lt 5 || $# -gt 6 ]]; then - echo "Usage: bash run_infer_310.sh [MINDIR_PATH] [NET_TYPE] [DATASET] [DATA_PATH] [CONFIG_PATH] [DEVICE_ID] - NET_TYPE can choose from [resnet18, resnet34, se-resnet50, resnet50, resnet101, resnet152] - DATASET can choose from [cifar10, imagenet] - DEVICE_ID is optional, it can be set by environment variable device_id, otherwise the value is zero" -exit 1 -fi - -get_real_path(){ - if [ "${1:0:1}" == "/" ]; then - echo "$1" - else - echo "$(realpath -m $PWD/$1)" - fi -} -model=$(get_real_path $1) -if [ $2 == 'resnet18' ] || [ $2 == 'resnet34' ] || [ $2 == 'se-resnet50' ] || [ $2 == 'resnet50' ] || [ $2 == 'resnet152' ] || [ $2 == 'resnet101' ]; then - network=$2 -else - echo "NET_TYPE can choose from [resnet18, se-resnet50]" - exit 1 -fi - -if [ $3 == 'cifar10' ] || [ $3 == 'imagenet' ]; then - dataset=$3 -else - echo "DATASET can choose from [cifar10, imagenet]" - exit 1 -fi - -data_path=$(get_real_path $4) -config_path=$(get_real_path $5) - -device_id=0 -if [ $# == 6 ]; then - device_id=$6 -fi - -echo "mindir name: "$model -echo "dataset path: "$data_path -echo "network: "$network -echo "dataset: "$dataset -echo "device id: "$device_id - -export ASCEND_HOME=/usr/local/Ascend/ -if [ -d ${ASCEND_HOME}/ascend-toolkit ]; then - export PATH=$ASCEND_HOME/fwkacllib/bin:$ASCEND_HOME/fwkacllib/ccec_compiler/bin:$ASCEND_HOME/ascend-toolkit/latest/fwkacllib/ccec_compiler/bin:$ASCEND_HOME/ascend-toolkit/latest/atc/bin:$PATH - export LD_LIBRARY_PATH=$ASCEND_HOME/fwkacllib/lib64:/usr/local/lib:$ASCEND_HOME/ascend-toolkit/latest/atc/lib64:$ASCEND_HOME/ascend-toolkit/latest/fwkacllib/lib64:$ASCEND_HOME/driver/lib64:$ASCEND_HOME/add-ons:$LD_LIBRARY_PATH - export TBE_IMPL_PATH=$ASCEND_HOME/ascend-toolkit/latest/opp/op_impl/built-in/ai_core/tbe - export PYTHONPATH=$ASCEND_HOME/fwkacllib/python/site-packages:${TBE_IMPL_PATH}:$ASCEND_HOME/ascend-toolkit/latest/fwkacllib/python/site-packages:$PYTHONPATH - export 
ASCEND_OPP_PATH=$ASCEND_HOME/ascend-toolkit/latest/opp -else - export PATH=$ASCEND_HOME/fwkacllib/bin:$ASCEND_HOME/fwkacllib/ccec_compiler/bin:$ASCEND_HOME/atc/ccec_compiler/bin:$ASCEND_HOME/atc/bin:$PATH - export LD_LIBRARY_PATH=$ASCEND_HOME/fwkacllib/lib64:/usr/local/lib:$ASCEND_HOME/atc/lib64:$ASCEND_HOME/acllib/lib64:$ASCEND_HOME/driver/lib64:$ASCEND_HOME/add-ons:$LD_LIBRARY_PATH - export PYTHONPATH=$ASCEND_HOME/fwkacllib/python/site-packages:$ASCEND_HOME/atc/python/site-packages:$PYTHONPATH - export ASCEND_OPP_PATH=$ASCEND_HOME/opp -fi - -function compile_app() -{ - cd ../ascend310_infer/src/ || exit - if [ -f "Makefile" ]; then - make clean - fi - bash build.sh &> build.log -} - -function preprocess_data() -{ - if [ -d preprocess_Result ]; then - rm -rf ./preprocess_Result - fi - mkdir preprocess_Result - python ../preprocess.py --data_path=$data_path --output_path=./preprocess_Result --config_path=$config_path &> preprocess.log -} - -function infer() -{ - cd - || exit - if [ -d result_Files ]; then - rm -rf ./result_Files - fi - if [ -d time_Result ]; then - rm -rf ./time_Result - fi - mkdir result_Files - mkdir time_Result - ../ascend310_infer/src/main --mindir_path=$model --dataset_path=$data_path --network=$network --dataset=$dataset --device_id=$device_id &> infer.log -} - -function cal_acc() -{ - if [ "x${dataset}" == "xcifar10" ] || [ "x${dataset}" == "xCifar10" ]; then - python ../postprocess.py --dataset=$dataset --label_path=./preprocess_Result/label --result_path=result_Files --config_path=$config_path &> acc.log - else - python ../create_imagenet2012_label.py --img_path=$data_path - python ../postprocess.py --dataset=$dataset --result_path=./result_Files --label_path=./imagenet_label.json --config_path=$config_path &> acc.log - fi - if [ $? -ne 0 ]; then - echo "calculate accuracy failed" - exit 1 - fi -} - -if [ "x${dataset}" == "xcifar10" ] || [ "x${dataset}" == "xCifar10" ]; then - if [ $2 == 'resnet18' ]; then - CONFIG_PATH=resnet18_cifar10_config.yaml - else - CONFIG_PATH=resnet50_cifar10_config.yaml - fi - preprocess_data ${CONFIG_PATH} - data_path=./preprocess_Result/img_data -fi - -compile_app -if [ $? -ne 0 ]; then - echo "compile app code failed" - exit 1 -fi -infer -if [ $? -ne 0 ]; then - echo " execute inference failed" - exit 1 -fi -cal_acc -if [ $? -ne 0 ]; then - echo "calculate accuracy failed" - exit 1 -fi \ No newline at end of file diff --git a/benchmark/ascend/resnet/scripts/run_standalone_train.sh b/benchmark/ascend/resnet/scripts/run_standalone_train.sh deleted file mode 100644 index de5274f5a..000000000 --- a/benchmark/ascend/resnet/scripts/run_standalone_train.sh +++ /dev/null @@ -1,110 +0,0 @@ -#!/bin/bash -# Copyright 2020-2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ - -CURPATH="$(dirname "$0")" -# shellcheck source=/dev/null -. 
${CURPATH}/cache_util.sh - -if [ $# != 2 ] && [ $# != 3 ] && [ $# != 4 ] -then - echo "Usage: bash run_standalone_train.sh [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](optional)" - echo "bash run_standalone_train.sh [DATASET_PATH] [CONFIG_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional)" -exit 1 -fi - -get_real_path(){ - if [ "${1:0:1}" == "/" ]; then - echo "$1" - else - echo "$(realpath -m $PWD/$1)" - fi -} - -PATH1=$(get_real_path $1) -CONFIG_FILE=$(get_real_path $2) -if [ $# == 3 ] -then - PATH2=$(get_real_path $3) -fi - -if [ $# == 4 ] -then - RUN_EVAL=$2 - EVAL_DATASET_PATH=$(get_real_path $4) -fi - -if [ ! -d $PATH1 ] -then - echo "error: DATASET_PATH=$PATH1 is not a directory" -exit 1 -fi - -if [ $# == 3 ] && [ ! -f $PATH2 ] -then - echo "error: PRETRAINED_CKPT_PATH=$PATH2 is not a file" -exit 1 -fi - -if [ "x${RUN_EVAL}" == "xTrue" ] && [ ! -d $EVAL_DATASET_PATH ] -then - echo "error: EVAL_DATASET_PATH=$EVAL_DATASET_PATH is not a directory" - exit 1 -fi - -if [ "x${RUN_EVAL}" == "xTrue" ] -then - bootup_cache_server - CACHE_SESSION_ID=$(generate_cache_session) -fi - -ulimit -u unlimited -export DEVICE_NUM=1 -export RANK_ID=0 -export RANK_SIZE=1 - -if [ -d "train" ]; -then - rm -rf ./train -fi -mkdir ./train -cp ../config/*.yaml ./train -cp ../*.py ./train -cp *.sh ./train -cp -r ../src ./train -cd ./train || exit -echo "start training for device $DEVICE_ID" -env > env.log -if [ $# == 2 ] -then - python train.py --data_path=$PATH1 --config_path=$CONFIG_FILE --output_path './output' &> log & -fi - -if [ $# == 3 ] -then - python train.py --data_path=$PATH1 --pre_trained=$PATH2 --config_path=$CONFIG_FILE --output_path './output' &> log & -fi - -if [ $# == 4 ] -then - python train.py --data_path=$PATH1 --run_eval=$RUN_EVAL --eval_dataset_path=$EVAL_DATASET_PATH \ - --enable_cache=True --cache_session_id=$CACHE_SESSION_ID \ - --config_path=$CONFIG_FILE --output_path './output' &> log & - if [ "x${RUN_EVAL}" == "xTrue" ] - then - echo -e "\nWhen training run is done, remember to shut down the cache server via \"cache_admin --stop\"" - fi -fi -cd .. diff --git a/benchmark/ascend/resnet/scripts/run_standalone_train_gpu.sh b/benchmark/ascend/resnet/scripts/run_standalone_train_gpu.sh deleted file mode 100644 index b65d9df47..000000000 --- a/benchmark/ascend/resnet/scripts/run_standalone_train_gpu.sh +++ /dev/null @@ -1,115 +0,0 @@ -#!/bin/bash -# Copyright 2020-2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ - -CURPATH="$(dirname "$0")" -# shellcheck source=/dev/null -. 
${CURPATH}/cache_util.sh - -if [ $# != 2 ] && [ $# != 3 ] && [ $# != 4 ] -then - echo "Usage: bash run_standalone_train_gpu.sh [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](optional)" - echo " bash run_standalone_train_gpu.sh [DATASET_PATH] [CONFIG_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional)" -exit 1 -fi - -get_real_path(){ - if [ "${1:0:1}" == "/" ]; then - echo "$1" - else - echo "$(realpath -m $PWD/$1)" - fi -} - -PATH1=$(get_real_path $1) -CONFIG_FILE=$(get_real_path $2) - -if [ $# == 3 ] -then - PATH2=$(get_real_path $3) -fi - -if [ $# == 4 ] -then - RUN_EVAL=$3 - EVAL_DATASET_PATH=$(get_real_path $4) -fi - -if [ ! -d $PATH1 ] -then - echo "error: DATASET_PATH=$PATH1 is not a directory" -exit 1 -fi - -if [ $# == 3 ] && [ ! -f $PATH2 ] -then - echo "error: PRETRAINED_CKPT_PATH=$PATH2 is not a file" -exit 1 -fi - - -if [ "x${RUN_EVAL}" == "xTrue" ] && [ ! -d $EVAL_DATASET_PATH ] -then - echo "error: EVAL_DATASET_PATH=$EVAL_DATASET_PATH is not a directory" - exit 1 -fi - -if [ "x${RUN_EVAL}" == "xTrue" ] -then - bootup_cache_server - CACHE_SESSION_ID=$(generate_cache_session) -fi - -ulimit -u unlimited -export DEVICE_NUM=1 -export DEVICE_ID=0 -export RANK_ID=0 -export RANK_SIZE=1 - -if [ -d "train" ]; -then - rm -rf ./train -fi -mkdir ./train -cp ../config/*.yaml ./train -cp ../*.py ./train -cp *.sh ./train -cp -r ../src ./train -cd ./train || exit -echo "start training for device $DEVICE_ID" -env > env.log -if [ $# == 2 ] -then - python train.py --device_target="GPU" --data_path=$PATH1 \ - --config_path=$CONFIG_FILE --output_path './output' &> log & -fi - -if [ $# == 3 ] -then - python train.py --device_target="GPU" --data_path=$PATH1 --pre_trained=$PATH2 \ - --config_path=$CONFIG_FILE --output_path './output' &> log & -fi - -if [ $# == 4 ] -then - python train.py --device_target="GPU" --data_path=$PATH1 --run_eval=$RUN_EVAL \ - --eval_dataset_path=$EVAL_DATASET_PATH --enable_cache=True --cache_session_id=$CACHE_SESSION_ID \ - --config_path=$CONFIG_FILE --output_path './output' &> log & - if [ "x${RUN_EVAL}" == "xTrue" ] - then - echo -e "\nWhen training run is done, remember to shut down the cache server via \"cache_admin --stop\"" - fi -fi -cd .. diff --git a/benchmark/ascend/resnet/src/CrossEntropySmooth.py b/benchmark/ascend/resnet/src/CrossEntropySmooth.py index 1634033c2..2077b4a40 100644 --- a/benchmark/ascend/resnet/src/CrossEntropySmooth.py +++ b/benchmark/ascend/resnet/src/CrossEntropySmooth.py @@ -1,4 +1,4 @@ -# Copyright 2020 Huawei Technologies Co., Ltd +# Copyright 2024 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/benchmark/ascend/resnet/src/eval_callback.py b/benchmark/ascend/resnet/src/callback.py similarity index 41% rename from benchmark/ascend/resnet/src/eval_callback.py rename to benchmark/ascend/resnet/src/callback.py index 5e68632ad..5be6f4674 100644 --- a/benchmark/ascend/resnet/src/eval_callback.py +++ b/benchmark/ascend/resnet/src/callback.py @@ -1,4 +1,4 @@ -# Copyright 2021 Huawei Technologies Co., Ltd +# Copyright 2024 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -17,11 +17,85 @@
 import os
 import stat
 import time
-from mindspore import save_checkpoint
-from mindspore import log as logger
+import numpy as np
+import mindspore as ms
 from mindspore.train.callback import Callback


+class LossCallBack(Callback):
+    """
+    Monitor the loss in training.
+    If the loss is NaN or INF, training is terminated.
+    """
+
+    def __init__(self, epoch_size, logger, lr, per_print_time=1, global_steps=0):
+        super(LossCallBack, self).__init__()
+        self.epoch_size = epoch_size
+        self.logger = logger
+        self.lr = lr
+        self.global_steps = global_steps
+        self.per_print_time = per_print_time
+        self.step_start_time = time.time()
+        self.epoch_start_time = time.time()
+
+    def on_train_step_end(self, run_context):
+        cb_params = run_context.original_args()
+        loss = cb_params.net_outputs
+        data_sink_mode = cb_params.get('dataset_sink_mode', True)
+        if not data_sink_mode:
+            if isinstance(loss, (tuple, list)):
+                if isinstance(loss[0], ms.Tensor) and isinstance(loss[0].asnumpy(), np.ndarray):
+                    loss = loss[0]
+
+            if isinstance(loss, ms.Tensor) and isinstance(loss.asnumpy(), np.ndarray):
+                loss = np.mean(loss.asnumpy())
+
+            cur_step_in_epoch = (cb_params.cur_step_num - 1) % cb_params.batch_num + 1
+            cur_epoch_num = cb_params.cur_epoch_num
+            if isinstance(loss, float) and (np.isnan(loss) or np.isinf(loss)):
+                raise ValueError("epoch: {} step: {}. Invalid loss, terminating training.".format(
+                    cb_params.cur_epoch_num, cur_step_in_epoch))
+
+            if self.per_print_time != 0 and cur_step_in_epoch % self.per_print_time == 0:
+                # pylint: disable=line-too-long
+                per_step_time = 1000 * (time.time() - self.step_start_time) / self.per_print_time
+                log_info = "epoch: [%s/%s] step: [%s/%s], lr: %.6f, loss: %.6f, per step time: %.3f ms" % (
+                    cur_epoch_num, self.epoch_size, cur_step_in_epoch, cb_params.batch_num, self.lr[self.global_steps],
+                    loss, per_step_time)
+                self.logger.info(log_info)
+                self.step_start_time = time.time()
+        self.global_steps += 1
+
+    def on_train_epoch_begin(self, run_context):
+        self.epoch_start_time = time.time()
+        self.step_start_time = time.time()
+
+    def on_train_epoch_end(self, run_context):
+        cb_params = run_context.original_args()
+        loss = cb_params.net_outputs
+        cur_epoch_num = cb_params.cur_epoch_num
+        if isinstance(loss, (tuple, list)):
+            if isinstance(loss[0], ms.Tensor) and isinstance(loss[0].asnumpy(), np.ndarray):
+                loss = loss[0]
+
+        if isinstance(loss, ms.Tensor) and isinstance(loss.asnumpy(), np.ndarray):
+            loss = np.mean(loss.asnumpy())
+
+        epoch_time = time.time() - self.epoch_start_time
+        log_info = 'epoch: [%s/%s] loss: %.6f, epoch time: %.3f s, per step time: %.3f ms' % (
+            cur_epoch_num, self.epoch_size, loss, epoch_time, epoch_time * 1000 / cb_params.batch_num)
+        self.logger.info(log_info)
+
+
+class ResumeCallback(Callback):
+    def __init__(self, start_epoch=0):
+        super(ResumeCallback, self).__init__()
+        self.start_epoch = start_epoch
+
+    def on_train_epoch_begin(self, run_context):
+        run_context.original_args().cur_epoch_num += self.start_epoch
+
+
 class EvalCallBack(Callback):
     """
     Evaluation callback when training.
@@ -42,18 +116,20 @@ class EvalCallBack(Callback): >>> EvalCallBack(eval_function, eval_param_dict) """ - def __init__(self, eval_function, eval_param_dict, interval=1, eval_start_epoch=1, save_best_ckpt=True, - ckpt_directory="./", best_ckpt_name="best.ckpt", metrics_name="acc"): + def __init__(self, eval_function, eval_param_dict, interval=1, eval_start_epoch=1, rank_id=0, save_best_ckpt=True, + ckpt_directory="./", best_ckpt_name="best.ckpt", metrics_name="acc", logger=None): super(EvalCallBack, self).__init__() self.eval_param_dict = eval_param_dict self.eval_function = eval_function self.eval_start_epoch = eval_start_epoch + self.logger = logger if interval < 1: raise ValueError("interval should >= 1.") self.interval = interval self.save_best_ckpt = save_best_ckpt self.best_res = 0 self.best_epoch = 0 + self.rank_id = rank_id if not os.path.isdir(ckpt_directory): os.makedirs(ckpt_directory) self.best_ckpt_path = os.path.join(ckpt_directory, best_ckpt_name) @@ -65,11 +141,11 @@ class EvalCallBack(Callback): os.chmod(file_name, stat.S_IWRITE) os.remove(file_name) except OSError: - logger.warning("OSError, failed to remove the older ckpt file %s.", file_name) + self.logger.warning("OSError, failed to remove the older ckpt file %s.", file_name) except ValueError: - logger.warning("ValueError, failed to remove the older ckpt file %s.", file_name) + self.logger.warning("ValueError, failed to remove the older ckpt file %s.", file_name) - def epoch_end(self, run_context): + def on_train_epoch_end(self, run_context): """Callback when epoch end.""" cb_params = run_context.original_args() cur_epoch = cb_params.cur_epoch_num @@ -77,19 +153,8 @@ class EvalCallBack(Callback): eval_start = time.time() res = self.eval_function(self.eval_param_dict) eval_cost = time.time() - eval_start - print("epoch: {}, {}: {}, eval_cost:{:.2f}".format(cur_epoch, self.metrics_name, res, eval_cost), - flush=True) - if res >= self.best_res: - self.best_res = res - self.best_epoch = cur_epoch - print("update best result: {}".format(res), flush=True) - if self.save_best_ckpt: - if os.path.exists(self.best_ckpt_path): - self.remove_ckpoint_file(self.best_ckpt_path) - save_checkpoint(cb_params.train_network, self.best_ckpt_path) - print("update best checkpoint at: {}".format(self.best_ckpt_path), flush=True) - - def end(self, run_context): - print("End training, the best {0} is: {1}, the best {0} epoch is {2}".format(self.metrics_name, - self.best_res, - self.best_epoch), flush=True) + self.logger.info("epoch: {}, {}: {}, eval_cost:{:.2f}".format(cur_epoch, self.metrics_name, res, eval_cost)) + + def on_train_end(self, run_context): + self.logger.info("End training, the best %s is: %s, the best %s epoch is %s" % ( + self.metrics_name, self.best_res, self.metrics_name, self.best_epoch)) diff --git a/benchmark/ascend/resnet/create_imagenet2012_label.py b/benchmark/ascend/resnet/src/data_split.py similarity index 35% rename from benchmark/ascend/resnet/create_imagenet2012_label.py rename to benchmark/ascend/resnet/src/data_split.py index c0c102c92..dcdf649ae 100644 --- a/benchmark/ascend/resnet/create_imagenet2012_label.py +++ b/benchmark/ascend/resnet/src/data_split.py @@ -1,4 +1,4 @@ -# Copyright 2021 Huawei Technologies Co., Ltd +# Copyright 2024 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
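The reworked callback keeps the `eval_function(eval_param_dict)` contract from its docstring. A sketch of that contract, assuming `model`, `eval_dataset`, and `logger` already exist in the surrounding script; the shape of `apply_eval` mirrors the helper this patch adds in `src/util.py`:

```python
# Sketch only: `model`, `eval_dataset`, and `logger` are assumed to exist.
def apply_eval(eval_param):
    eval_model = eval_param["model"]
    eval_ds = eval_param["dataset"]
    res = eval_model.eval(eval_ds, dataset_sink_mode=True)
    return res[eval_param["metrics_name"]]

eval_param_dict = {"model": model, "dataset": eval_dataset, "metrics_name": "acc"}
eval_cb = EvalCallBack(apply_eval, eval_param_dict,
                       interval=1, eval_start_epoch=1, rank_id=0,
                       ckpt_directory="./ckpt", best_ckpt_name="best_acc.ckpt",
                       metrics_name="acc", logger=logger)
```

In practice `logger` can no longer be omitted: both the checkpoint-removal warnings and the per-epoch results now go through `self.logger` rather than the global MindSpore logger.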
@@ -12,40 +12,45 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ============================================================================
-"""create_imagenet2012_label"""
+"""
+Split an image-folder dataset into train/test subsets on CPU.
+"""
 import os
-import json
-import argparse
-
-parser = argparse.ArgumentParser(description="resnet imagenet2012 label")
-parser.add_argument("--img_path", type=str, required=True, help="imagenet2012 file path.")
-args = parser.parse_args()
-
-
-def create_label(file_path):
-    '''
-    Create image_label.json from image files.
-    '''
-    print("[WARNING] Create imagenet label. Currently only use for Imagenet2012!")
-    dirs = os.listdir(file_path)
-    file_list = []
-    for file in dirs:
-        file_list.append(file)
-    file_list = sorted(file_list)
-
-    total = 0
-    img_label = {}
-    for i, file_dir in enumerate(file_list):
-        files = os.listdir(os.path.join(file_path, file_dir))
-        for f in files:
-            img_label[f] = i
-        total += len(files)
-
-    with open("imagenet_label.json", "w+") as label:
-        json.dump(img_label, label)
-
-    print("[INFO] Completed! Total {} data.".format(total))
+import shutil
+
+
+def generate_data():
+    """Copy the first 3/4 of each class folder to ./train and the rest to ./test."""
+    dirs = []
+    path = "./"
+    for _, subdirs, _ in os.walk(path):
+        if subdirs:
+            dirs.append(subdirs)
+
+    if not os.path.exists("./train"):
+        os.makedirs("./train")
+    if not os.path.exists("./test"):
+        os.makedirs("./test")
+
+    for di in dirs[0]:
+        files = os.listdir(di)
+        train_set = files[: int(len(files) * 3 / 4)]
+        test_set = files[int(len(files) * 3 / 4):]
+        for file in train_set:
+            fname = "./train/" + di + "/"
+            if not os.path.exists(fname):
+                os.makedirs(fname)
+            src_file = "./" + di + "/" + file
+            dst_file = fname + file
+            shutil.copyfile(src_file, dst_file)
+
+        for file in test_set:
+            fname = "./test/" + di + "/"
+            if not os.path.exists(fname):
+                os.makedirs(fname)
+            src_file = "./" + di + "/" + file
+            dst_file = fname + file
+            shutil.copyfile(src_file, dst_file)
 
 
 if __name__ == '__main__':
-    create_label(args.img_path)
+    generate_data()
diff --git a/benchmark/ascend/resnet/src/dataset.py b/benchmark/ascend/resnet/src/dataset.py
index 16f3af464..8376aa899 100644
--- a/benchmark/ascend/resnet/src/dataset.py
+++ b/benchmark/ascend/resnet/src/dataset.py
@@ -1,4 +1,4 @@
-# Copyright 2020-2022 Huawei Technologies Co., Ltd
+# Copyright 2024 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -15,75 +15,15 @@
 """
 create train or eval dataset.
 """
-import multiprocessing
+from io import BytesIO
+import numpy as np
+from PIL import Image
 import mindspore as ms
 import mindspore.dataset as ds
-from mindspore.communication.management import init, get_rank, get_group_size
 
 
-def create_dataset1(dataset_path, do_train, batch_size=32, train_image_size=224, eval_image_size=224,
-                    target="Ascend", distribute=False, enable_cache=False, cache_session_id=None):
-    """
-    create a train or evaluate cifar10 dataset for resnet50
-    Args:
-        dataset_path(string): the path of dataset.
-        do_train(bool): whether dataset is used for train or eval.
-        repeat_num(int): the repeat times of dataset. Default: 1
-        batch_size(int): the batch size of dataset. Default: 32
-        target(str): the device target. Default: Ascend
-        distribute(bool): data for distribute or not. Default: False
-        enable_cache(bool): whether tensor caching service is used for eval.
Default: False - cache_session_id(int): If enable_cache, cache session_id need to be provided. Default: None - Returns: - dataset - """ - device_num, rank_id = _get_rank_info(distribute) - ds.config.set_prefetch_size(64) - if device_num == 1: - data_set = ds.Cifar10Dataset(dataset_path, num_parallel_workers=get_num_parallel_workers(12), shuffle=True) - else: - data_set = ds.Cifar10Dataset(dataset_path, num_parallel_workers=get_num_parallel_workers(12), shuffle=True, - num_shards=device_num, shard_id=rank_id) - - # define map operations - trans = [] - if do_train: - trans += [ - ds.vision.RandomCrop((32, 32), (4, 4, 4, 4)), - ds.vision.RandomHorizontalFlip(prob=0.5) - ] - - trans += [ - ds.vision.Resize((train_image_size, train_image_size)), - ds.vision.Rescale(1.0 / 255.0, 0.0), - ds.vision.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010]), - ds.vision.HWC2CHW() - ] - - type_cast_op = ds.transforms.transforms.TypeCast(ms.int32) - - data_set = data_set.map(operations=type_cast_op, input_columns="label", - num_parallel_workers=get_num_parallel_workers(8)) - # only enable cache for eval - if do_train: - enable_cache = False - if enable_cache: - if not cache_session_id: - raise ValueError("A cache session_id must be provided to use cache.") - eval_cache = ds.DatasetCache(session_id=int(cache_session_id), size=0) - data_set = data_set.map(operations=trans, input_columns="image", - num_parallel_workers=get_num_parallel_workers(8), cache=eval_cache) - else: - data_set = data_set.map(operations=trans, input_columns="image", - num_parallel_workers=get_num_parallel_workers(8)) - - # apply batch operations - data_set = data_set.batch(batch_size, drop_remainder=True) - - return data_set - -def create_dataset2(dataset_path, do_train, batch_size=32, train_image_size=224, eval_image_size=224, - target="Ascend", distribute=False, enable_cache=False, cache_session_id=None): +def create_dataset(dataset_path, do_train, batch_size=32, train_image_size=224, eval_image_size=224, + target="Ascend", distribute=False, enable_cache=False, cache_session_id=None, drop_remainder=True): """ create a train or eval imagenet2012 dataset for resnet50 @@ -104,9 +44,9 @@ def create_dataset2(dataset_path, do_train, batch_size=32, train_image_size=224, ds.config.set_prefetch_size(64) if device_num == 1: - data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=get_num_parallel_workers(12), shuffle=True) + data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=get_num_parallel_workers(24), shuffle=True) else: - data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=get_num_parallel_workers(12), shuffle=True, + data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=get_num_parallel_workers(24), shuffle=True, #12 num_shards=device_num, shard_id=rank_id) # Computed from random subset of ImageNet training images @@ -120,6 +60,31 @@ def create_dataset2(dataset_path, do_train, batch_size=32, train_image_size=224, ds.vision.RandomHorizontalFlip(prob=0.5) ] else: + batch_per_step = batch_size * device_num + print("eval batch per step:{}".format(batch_per_step)) + if batch_per_step < 50000: + if 50000 % batch_per_step == 0: + num_padded = 0 + else: + num_padded = batch_per_step - (50000 % batch_per_step) + else: + num_padded = batch_per_step - 50000 + print("eval padded samples:{}".format(num_padded)) + + if num_padded != 0: + white_io = BytesIO() + Image.new('RGB', (224, 224), (255, 255, 255)).save(white_io, 'JPEG') + padded_sample = { + "image": 
np.array(bytearray(white_io.getvalue()), dtype="uint8"), + "label": np.array(-1, np.int32) + } + sample = [padded_sample for x in range(num_padded)] + ds_pad = ds.PaddedDataset(sample) + ds_imagefolder = ds.ImageFolderDataset(dataset_path, num_parallel_workers=24) + data_set = ds_pad + ds_imagefolder + distributeSampler = ds.DistributedSampler(num_shards=device_num, + shard_id=rank_id, shuffle=False, num_samples=None) + data_set.use_sampler(distributeSampler) trans = [ ds.vision.Decode(), ds.vision.Resize(256), @@ -148,241 +113,9 @@ def create_dataset2(dataset_path, do_train, batch_size=32, train_image_size=224, cache=eval_cache) else: data_set = data_set.map(operations=type_cast_op, input_columns="label", - num_parallel_workers=get_num_parallel_workers(12)) + num_parallel_workers=get_num_parallel_workers(12)) #12 # apply batch operations - data_set = data_set.batch(batch_size, drop_remainder=True) + data_set = data_set.batch(batch_size, drop_remainder=drop_remainder) return data_set - -def create_dataset_pynative(dataset_path, do_train, batch_size=32, train_image_size=224, - eval_image_size=224, target="Ascend", distribute=False, enable_cache=False, - cache_session_id=None): - """ - create a train or eval imagenet2012 dataset for resnet50 benchmark - - Args: - dataset_path(string): the path of dataset. - do_train(bool): whether dataset is used for train or eval. - repeat_num(int): the repeat times of dataset. Default: 1 - batch_size(int): the batch size of dataset. Default: 32 - target(str): the device target. Default: Ascend - distribute(bool): data for distribute or not. Default: False - enable_cache(bool): whether tensor caching service is used for eval. Default: False - cache_session_id(int): If enable_cache, cache session_id need to be provided. 
Default: None - - Returns: - dataset - """ - device_num, rank_id = _get_rank_info(distribute) - - if device_num == 1: - data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=get_num_parallel_workers(8), shuffle=True) - else: - data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=get_num_parallel_workers(2), shuffle=True, - num_shards=device_num, shard_id=rank_id) - - # Computed from random subset of ImageNet training images - mean = [0.485 * 255, 0.456 * 255, 0.406 * 255] - std = [0.229 * 255, 0.224 * 255, 0.225 * 255] - - # define map operations - if do_train: - trans = [ - ds.vision.RandomCropDecodeResize(train_image_size, scale=(0.08, 1.0), ratio=(0.75, 1.333)), - ds.vision.RandomHorizontalFlip(prob=0.5), - ds.vision.Normalize(mean=mean, std=std), - ds.vision.HWC2CHW() - ] - else: - trans = [ - ds.vision.Decode(), - ds.vision.Resize(256), - ds.vision.CenterCrop(eval_image_size), - ds.vision.Normalize(mean=mean, std=std), - ds.vision.HWC2CHW() - ] - - type_cast_op = ds.transforms.transforms.TypeCast(ms.int32) - - data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=4) - # only enable cache for eval - if do_train: - enable_cache = False - if enable_cache: - if not cache_session_id: - raise ValueError("A cache session_id must be provided to use cache.") - eval_cache = ds.DatasetCache(session_id=int(cache_session_id), size=0) - data_set = data_set.map(operations=type_cast_op, input_columns="label", - num_parallel_workers=get_num_parallel_workers(2), - cache=eval_cache) - else: - data_set = data_set.map(operations=type_cast_op, input_columns="label", - num_parallel_workers=get_num_parallel_workers(2)) - - # apply batch operations - data_set = data_set.batch(batch_size, drop_remainder=True) - - return data_set - -def create_dataset3(dataset_path, do_train, batch_size=32, train_image_size=224, eval_image_size=224, - target="Ascend", distribute=False, enable_cache=False, cache_session_id=None): - """ - create a train or eval imagenet2012 dataset for resnet101 - Args: - dataset_path(string): the path of dataset. - do_train(bool): whether dataset is used for train or eval. - repeat_num(int): the repeat times of dataset. Default: 1 - batch_size(int): the batch size of dataset. Default: 32 - target(str): the device target. Default: Ascend - distribute(bool): data for distribute or not. Default: False - enable_cache(bool): whether tensor caching service is used for eval. Default: False - cache_session_id(int): If enable_cache, cache session_id need to be provided. 
Default: None - - Returns: - dataset - """ - device_num, rank_id = _get_rank_info(distribute) - if device_num == 1: - data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=get_num_parallel_workers(8), shuffle=True) - else: - data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=get_num_parallel_workers(8), shuffle=True, - num_shards=device_num, shard_id=rank_id) - - mean = [0.475 * 255, 0.451 * 255, 0.392 * 255] - std = [0.275 * 255, 0.267 * 255, 0.278 * 255] - - # define map operations - if do_train: - trans = [ - ds.vision.RandomCropDecodeResize(train_image_size, scale=(0.08, 1.0), ratio=(0.75, 1.333)), - ds.vision.RandomHorizontalFlip(rank_id / (rank_id + 1)), - ds.vision.Normalize(mean=mean, std=std), - ds.vision.HWC2CHW() - ] - else: - trans = [ - ds.vision.Decode(), - ds.vision.Resize(256), - ds.vision.CenterCrop(eval_image_size), - ds.vision.Normalize(mean=mean, std=std), - ds.vision.HWC2CHW() - ] - - type_cast_op = ds.transforms.transforms.TypeCast(ms.int32) - - data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=get_num_parallel_workers(8)) - # only enable cache for eval - if do_train: - enable_cache = False - if enable_cache: - if not cache_session_id: - raise ValueError("A cache session_id must be provided to use cache.") - eval_cache = ds.DatasetCache(session_id=int(cache_session_id), size=0) - data_set = data_set.map(operations=type_cast_op, input_columns="label", - num_parallel_workers=get_num_parallel_workers(8), - cache=eval_cache) - else: - data_set = data_set.map(operations=type_cast_op, input_columns="label", - num_parallel_workers=get_num_parallel_workers(8)) - - # apply batch operations - data_set = data_set.batch(batch_size, drop_remainder=True) - - return data_set - -def create_dataset4(dataset_path, do_train, batch_size=32, train_image_size=224, eval_image_size=224, - target="Ascend", distribute=False, enable_cache=False, cache_session_id=None): - """ - create a train or eval imagenet2012 dataset for se-resnet50 - - Args: - dataset_path(string): the path of dataset. - do_train(bool): whether dataset is used for train or eval. - repeat_num(int): the repeat times of dataset. Default: 1 - batch_size(int): the batch size of dataset. Default: 32 - target(str): the device target. Default: Ascend - distribute(bool): data for distribute or not. Default: False - enable_cache(bool): whether tensor caching service is used for eval. Default: False - cache_session_id(int): If enable_cache, cache session_id need to be provided. 
Default: None - - Returns: - dataset - """ - device_num, rank_id = _get_rank_info(distribute) - ds.config.set_prefetch_size(64) - if device_num == 1: - data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=get_num_parallel_workers(12), shuffle=True) - else: - data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=get_num_parallel_workers(12), shuffle=True, - num_shards=device_num, shard_id=rank_id) - - # Computed from random subset of ImageNet training images - mean = [123.68, 116.78, 103.94] - std = [1.0, 1.0, 1.0] - - # define map operations - if do_train: - trans = [ - ds.vision.RandomCropDecodeResize(train_image_size, scale=(0.08, 1.0), ratio=(0.75, 1.333)), - ds.vision.RandomHorizontalFlip(prob=0.5), - ds.vision.Normalize(mean=mean, std=std), - ds.vision.HWC2CHW() - ] - else: - trans = [ - ds.vision.Decode(), - ds.vision.Resize(292), - ds.vision.CenterCrop(eval_image_size), - ds.vision.Normalize(mean=mean, std=std), - ds.vision.HWC2CHW() - ] - - type_cast_op = ds.transforms.transforms.TypeCast(ms.int32) - data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=get_num_parallel_workers(12)) - # only enable cache for eval - if do_train: - enable_cache = False - if enable_cache: - if not cache_session_id: - raise ValueError("A cache session_id must be provided to use cache.") - eval_cache = ds.DatasetCache(session_id=int(cache_session_id), size=0) - data_set = data_set.map(operations=type_cast_op, input_columns="label", - num_parallel_workers=get_num_parallel_workers(12), - cache=eval_cache) - else: - data_set = data_set.map(operations=type_cast_op, input_columns="label", - num_parallel_workers=get_num_parallel_workers(12)) - - # apply batch operations - data_set = data_set.batch(batch_size, drop_remainder=True) - - return data_set - -def _get_rank_info(distribute): - """ - get rank size and rank id - """ - if distribute: - init() - rank_id = get_rank() - device_num = get_group_size() - else: - rank_id = 0 - device_num = 1 - return device_num, rank_id - -def get_num_parallel_workers(num_parallel_workers): - """ - Get num_parallel_workers used in dataset operations. - If num_parallel_workers > the real CPU cores number, set num_parallel_workers = the real CPU cores number. - """ - cores = multiprocessing.cpu_count() - if isinstance(num_parallel_workers, int): - if cores < num_parallel_workers: - print("The num_parallel_workers {} is set too large, now set it {}".format(num_parallel_workers, cores)) - num_parallel_workers = cores - else: - print("The num_parallel_workers {} is invalid, now set it {}".format(num_parallel_workers, min(cores, 8))) - num_parallel_workers = min(cores, 8) - return num_parallel_workers diff --git a/benchmark/ascend/resnet/src/dataset_infer.py b/benchmark/ascend/resnet/src/dataset_infer.py index aa26e0a75..9561aedaa 100644 --- a/benchmark/ascend/resnet/src/dataset_infer.py +++ b/benchmark/ascend/resnet/src/dataset_infer.py @@ -1,4 +1,4 @@ -# Copyright 2021-2022 Huawei Technologies Co., Ltd +# Copyright 2024 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
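With the cifar10, pynative, resnet101, and se-resnet50 variants above removed, the single `create_dataset` covers both training and evaluation. A usage sketch with placeholder paths and sizes; the keyword names come from the new signature:

```python
# Placeholder paths; keyword names follow the consolidated create_dataset signature.
from src.dataset import create_dataset

train_ds = create_dataset(dataset_path="/data/imagenet/train", do_train=True,
                          batch_size=256, train_image_size=224,
                          target="Ascend", distribute=True)

# For eval, drop_remainder=False keeps the final partial batch; together with
# the white-image padding above, every one of the 50000 samples is scored once.
eval_ds = create_dataset(dataset_path="/data/imagenet/val", do_train=False,
                         batch_size=32, eval_image_size=224,
                         target="Ascend", distribute=True, drop_remainder=False)
```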
@@ -89,7 +89,8 @@ class ImgDataset: return len(self.data) -def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32, target="Ascend", distribute=False): +def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32, image_size=224, + target="Ascend", distribute=False): """ create a train or eval imagenet2012 dataset for resnet50 @@ -123,7 +124,6 @@ def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32, target=" num_parallel_workers=8, shuffle=True, num_shards=device_num, shard_id=rank_id) - image_size = 224 # Computed from random subset of ImageNet training images mean = [0.485 * 255, 0.456 * 255, 0.406 * 255] std = [0.229 * 255, 0.224 * 255, 0.225 * 255] @@ -164,7 +164,8 @@ def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32, target=" return data_set -def create_dataset2(dataset_path, do_train, repeat_num=1, batch_size=32, target="Ascend", distribute=False): +def create_dataset2(dataset_path, do_train, repeat_num=1, batch_size=32, image_size=224, + target="Ascend", distribute=False): """ create a train or eval imagenet2012 dataset for resnet101 Args: @@ -196,7 +197,6 @@ def create_dataset2(dataset_path, do_train, repeat_num=1, batch_size=32, target= data_set = ds.GeneratorDataset(source=dataset_generator, column_names=["label", "image", "filename"], num_parallel_workers=8, shuffle=True, num_shards=device_num, shard_id=rank_id) - image_size = 224 mean = [0.475 * 255, 0.451 * 255, 0.392 * 255] std = [0.275 * 255, 0.267 * 255, 0.278 * 255] @@ -233,7 +233,8 @@ def create_dataset2(dataset_path, do_train, repeat_num=1, batch_size=32, target= return data_set -def create_dataset3(dataset_path, do_train, repeat_num=1, batch_size=32, target="Ascend", distribute=False): +def create_dataset3(dataset_path, do_train, repeat_num=1, batch_size=32, image_size=224, + target="Ascend", distribute=False): """ create a train or eval imagenet2012 dataset for se-resnet50 @@ -265,7 +266,6 @@ def create_dataset3(dataset_path, do_train, repeat_num=1, batch_size=32, target= data_set = ds.GeneratorDataset(source=dataset_generator, column_names=["label", "image", "filename"], num_parallel_workers=8, shuffle=True, num_shards=device_num, shard_id=rank_id) - image_size = 224 # Computed from random subset of ImageNet training images mean = [123.68, 116.78, 103.94] std = [1.0, 1.0, 1.0] diff --git a/benchmark/ascend/resnet/src/logger.py b/benchmark/ascend/resnet/src/logger.py new file mode 100644 index 000000000..e52a3be6d --- /dev/null +++ b/benchmark/ascend/resnet/src/logger.py @@ -0,0 +1,87 @@ +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ======================================================================================= +"""Custom Logger.""" +import os +import sys +import logging + + +class LOGGER(logging.Logger): + """ + Logger. + + Args: + logger_name: String. Logger name. + rank: Integer. Rank id. 
+ """ + def __init__(self, logger_name, rank=0, param_server=False): + super(LOGGER, self).__init__(logger_name) + self.rank = rank + if rank % 8 == 0 or param_server or self.use_server(): + console = logging.StreamHandler(sys.stdout) + console.setLevel(logging.INFO) + formatter = logging.Formatter('%(asctime)s:%(levelname)s:%(message)s') + console.setFormatter(formatter) + self.addHandler(console) + self.log_fn = None + + @staticmethod + def use_server(): + worked = os.getenv('MS_WORKER_NUM', None) + server = os.getenv('MS_SERVER_NUM', None) + if worked is not None and server is not None: + return True + return False + + def setup_logging_file(self, log_dir): + """Setup logging file.""" + if not os.path.exists(log_dir): + os.makedirs(log_dir, exist_ok=True) + log_name = 'log.txt' + self.log_fn = os.path.join(log_dir, log_name) + fh = logging.FileHandler(self.log_fn) + fh.setLevel(logging.INFO) + formatter = logging.Formatter('%(asctime)s:%(levelname)s:%(message)s') + fh.setFormatter(formatter) + self.addHandler(fh) + + def info(self, msg, *args, **kwargs): + if self.isEnabledFor(logging.INFO): + self._log(logging.INFO, msg, args, **kwargs) + + def save_args(self, args): + self.info('Args:') + args_dict = vars(args) + for key in args_dict.keys(): + self.info('--> %s: %s', key, args_dict[key]) + self.info('') + + def important_info(self, msg, *args, **kwargs): + if self.isEnabledFor(logging.INFO) and self.rank == 0: + line_width = 2 + important_msg = '\n' + important_msg += ('*'*70 + '\n')*line_width + important_msg += ('*'*line_width + '\n')*2 + important_msg += '*'*line_width + ' '*8 + msg + '\n' + important_msg += ('*'*line_width + '\n')*2 + important_msg += ('*'*70 + '\n')*line_width + self.info(important_msg, *args, **kwargs) + + +def get_logger(path, rank, param_server=False): + """Get Logger.""" + logger = LOGGER('resnet', rank, param_server=param_server) + logger.setup_logging_file(os.path.join(path, 'rank_' + str(rank))) + return logger diff --git a/benchmark/ascend/resnet/src/lr_generator.py b/benchmark/ascend/resnet/src/lr_generator.py index d28c2acd0..fee7f7db4 100644 --- a/benchmark/ascend/resnet/src/lr_generator.py +++ b/benchmark/ascend/resnet/src/lr_generator.py @@ -1,4 +1,4 @@ -# Copyright 2020 Huawei Technologies Co., Ltd +# Copyright 2024 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -17,7 +17,7 @@ import math import numpy as np -def _generate_steps_lr(lr_init, lr_max, total_steps, warmup_steps): +def _generate_steps_lr(lr_init, lr_max, total_steps, warmup_steps, start_steps): """ Applies three steps decay to generate learning rate array. @@ -45,10 +45,10 @@ def _generate_steps_lr(lr_init, lr_max, total_steps, warmup_steps): else: lr = lr_max * 0.001 lr_each_step.append(lr) - return lr_each_step + return lr_each_step[start_steps:] -def _generate_step_lr(lr_init, lr_max, total_steps, warmup_steps): +def _generate_step_lr(lr_max, total_steps, start_steps): """ Applies three steps decay to generate learning rate array. @@ -75,10 +75,10 @@ def _generate_step_lr(lr_init, lr_max, total_steps, warmup_steps): else: lr = 0.00005 lr_each_step.append(lr) - return lr_each_step + return lr_each_step[start_steps:] -def _generate_poly_lr(lr_init, lr_end, lr_max, total_steps, warmup_steps): +def _generate_poly_lr(lr_init, lr_end, lr_max, total_steps, warmup_steps, start_steps): """ Applies polynomial decay to generate learning rate array. 
@@ -102,14 +102,14 @@ def _generate_poly_lr(lr_init, lr_end, lr_max, total_steps, warmup_steps): lr = float(lr_init) + inc_each_step * float(i) else: base = (1.0 - (float(i) - float(warmup_steps)) / (float(total_steps) - float(warmup_steps))) - lr = float(lr_max) * base * base + lr = (float(lr_max) - float(lr_end)) * base * base + float(lr_end) # 773491 if lr < 0.0: lr = 0.0 lr_each_step.append(lr) - return lr_each_step + return lr_each_step[start_steps:] -def _generate_cosine_lr(lr_init, lr_end, lr_max, total_steps, warmup_steps): +def _generate_cosine_lr(lr_init, lr_max, total_steps, warmup_steps, start_steps): """ Applies cosine decay to generate learning rate array. @@ -135,10 +135,10 @@ def _generate_cosine_lr(lr_init, lr_end, lr_max, total_steps, warmup_steps): decayed = linear_decay * cosine_decay + 0.00001 lr = lr_max * decayed lr_each_step.append(lr) - return lr_each_step + return lr_each_step[start_steps:] -def _generate_liner_lr(lr_init, lr_end, lr_max, total_steps, warmup_steps): +def _generate_liner_lr(lr_init, lr_end, lr_max, total_steps, warmup_steps, start_steps): """ Applies liner decay to generate learning rate array. @@ -159,11 +159,10 @@ def _generate_liner_lr(lr_init, lr_end, lr_max, total_steps, warmup_steps): else: lr = lr_max - (lr_max - lr_end) * (i - warmup_steps) / (total_steps - warmup_steps) lr_each_step.append(lr) - return lr_each_step - + return lr_each_step[start_steps:] -def get_lr(lr_init, lr_end, lr_max, warmup_epochs, total_epochs, steps_per_epoch, lr_decay_mode): +def get_lr(lr_init, lr_end, lr_max, warmup_epochs, total_epochs, start_epoch, steps_per_epoch, lr_decay_mode): """ generate learning rate array @@ -179,21 +178,20 @@ def get_lr(lr_init, lr_end, lr_max, warmup_epochs, total_epochs, steps_per_epoch Returns: np.array, learning rate array """ - lr_each_step = [] total_steps = steps_per_epoch * total_epochs warmup_steps = steps_per_epoch * warmup_epochs + start_steps = steps_per_epoch * start_epoch if lr_decay_mode == 'steps': - lr_each_step = _generate_steps_lr(lr_init, lr_max, total_steps, warmup_steps) + lr_each_step = _generate_steps_lr(lr_init, lr_max, total_steps, warmup_steps, start_steps) elif lr_decay_mode == 'step': - warmup_steps = warmup_epochs - lr_each_step = _generate_step_lr(lr_init, lr_max, total_steps, warmup_steps) + lr_each_step = _generate_step_lr(lr_max, total_steps, start_steps) elif lr_decay_mode == 'poly': - lr_each_step = _generate_poly_lr(lr_init, lr_end, lr_max, total_steps, warmup_steps) + lr_each_step = _generate_poly_lr(lr_init, lr_end, lr_max, total_steps, warmup_steps, start_steps) elif lr_decay_mode == 'cosine': - lr_each_step = _generate_cosine_lr(lr_init, lr_end, lr_max, total_steps, warmup_steps) + lr_each_step = _generate_cosine_lr(lr_init, lr_max, total_steps, warmup_steps, start_steps) else: - lr_each_step = _generate_liner_lr(lr_init, lr_end, lr_max, total_steps, warmup_steps) + lr_each_step = _generate_liner_lr(lr_init, lr_end, lr_max, total_steps, warmup_steps, start_steps) lr_each_step = np.array(lr_each_step).astype(np.float32) return lr_each_step diff --git a/benchmark/ascend/resnet/src/metric.py b/benchmark/ascend/resnet/src/metric.py index 75472e1b3..b56d3ffd8 100644 --- a/benchmark/ascend/resnet/src/metric.py +++ b/benchmark/ascend/resnet/src/metric.py @@ -1,4 +1,4 @@ -# Copyright 2021 Huawei Technologies Co., Ltd +# Copyright 2024 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
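All of the schedule generators now take a `start_steps` offset and return `lr_each_step[start_steps:]`, so a resumed run sees exactly the tail of the schedule it would have seen uninterrupted. A small self-check of that property, with illustrative hyperparameters only:

```python
# Illustrative values; the slicing property holds for any of them.
import numpy as np
from src.lr_generator import get_lr

steps_per_epoch = 625
full = get_lr(lr_init=0.0, lr_end=0.0, lr_max=0.8, warmup_epochs=5,
              total_epochs=90, start_epoch=0,
              steps_per_epoch=steps_per_epoch, lr_decay_mode='cosine')
resumed = get_lr(lr_init=0.0, lr_end=0.0, lr_max=0.8, warmup_epochs=5,
                 total_epochs=90, start_epoch=57,
                 steps_per_epoch=steps_per_epoch, lr_decay_mode='cosine')
assert np.allclose(resumed, full[57 * steps_per_epoch:])  # resumed run = tail of full run
```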
@@ -18,6 +18,7 @@ import mindspore as ms
 from mindspore.communication.management import GlobalComm
 import mindspore.ops as ops
 import mindspore.nn as nn
+import mindspore.train as train
 
 
 class ClassifyCorrectCell(nn.Cell):
     r"""
@@ -61,7 +62,7 @@
         return (total_correct,)
 
 
-class DistAccuracy(nn.Metric):
+class DistAccuracy(train.Metric):
     r"""
     Calculates the accuracy for classification data in distributed mode.
     The accuracy class creates two local variables, correct number and total number that are used to compute the
@@ -90,11 +91,11 @@
         self.clear()
         self.batch_size = batch_size
         self.device_num = device_num
+        self._total_num = 50000  # ImageNet-1k validation set size; eval data is padded up to full global batches
 
     def clear(self):
         """Clears the internal evaluation result."""
         self._correct_num = 0
-        self._total_num = 0
 
     def update(self, *inputs):
         """
@@ -113,7 +114,7 @@
             raise ValueError('Distribute accuracy needs 1 input (y_correct), but got {}'.format(len(inputs)))
         y_correct = self._convert_data(inputs[0])
         self._correct_num += y_correct
-        self._total_num += self.batch_size * self.device_num
+        # the denominator stays fixed at 50000, so padded eval samples do not inflate it
 
     def eval(self):
         """
@@ -125,8 +126,7 @@
         Raises:
             RuntimeError: If the sample size is 0.
         """
-        if self._total_num == 0:
-            raise RuntimeError('Accuracy can not be calculated, because the number of samples is 0.')
+        print("DistAccuracy total_num:", self._total_num)
         return self._correct_num / self._total_num
diff --git a/benchmark/ascend/resnet/src/model_utils/config.py b/benchmark/ascend/resnet/src/model_utils/config.py
index 7cabf17cc..5d51ec9b7 100644
--- a/benchmark/ascend/resnet/src/model_utils/config.py
+++ b/benchmark/ascend/resnet/src/model_utils/config.py
@@ -1,4 +1,4 @@
-# Copyright 2021 Huawei Technologies Co., Ltd
+# Copyright 2024 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -18,7 +18,7 @@
 import os
 import ast
 import argparse
-from pprint import pprint, pformat
+from pprint import pformat
 import yaml
 
 _config_path = "./config/resnet50_cifar10_config.yaml"
@@ -123,8 +123,8 @@ def get_config():
     default, helper, choices = parse_yaml(path_args.config_path)
     args = parse_cli_to_yaml(parser=parser, cfg=default, helper=helper, choices=choices, cfg_path=path_args.config_path)
     final_config = merge(args, default)
-    pprint(final_config)
     print("Please check the above information for the configurations", flush=True)
     return Config(final_config)
 
+
 config = get_config()
diff --git a/benchmark/ascend/resnet/src/model_utils/device_adapter.py b/benchmark/ascend/resnet/src/model_utils/device_adapter.py
index 9c3d21d5e..1515acc01 100644
--- a/benchmark/ascend/resnet/src/model_utils/device_adapter.py
+++ b/benchmark/ascend/resnet/src/model_utils/device_adapter.py
@@ -1,4 +1,4 @@
-# Copyright 2021 Huawei Technologies Co., Ltd
+# Copyright 2024 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
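The fixed `_total_num = 50000` in `DistAccuracy` above only works because the eval pipeline pads the validation set to a whole number of global batches (the `PaddedDataset` branch in `src/dataset.py` earlier in this patch). The arithmetic, worked through for one plausible configuration:

```python
# Worked example with assumed batch_size=32 and device_num=8.
batch_size, device_num, val_size = 32, 8, 50000
batch_per_step = batch_size * device_num          # 256 samples consumed per step
remainder = val_size % batch_per_step             # 50000 % 256 = 80
num_padded = 0 if remainder == 0 else batch_per_step - remainder   # 176
assert (val_size + num_padded) % batch_per_step == 0               # 50176 = 196 full batches

# Padded images carry label -1, so they can never be counted as correct;
# dividing by the true 50000 rather than 50176 keeps the accuracy unbiased.
```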
diff --git a/benchmark/ascend/resnet/src/model_utils/local_adapter.py b/benchmark/ascend/resnet/src/model_utils/local_adapter.py index 769fa6dc7..98df5674a 100644 --- a/benchmark/ascend/resnet/src/model_utils/local_adapter.py +++ b/benchmark/ascend/resnet/src/model_utils/local_adapter.py @@ -1,4 +1,4 @@ -# Copyright 2021 Huawei Technologies Co., Ltd +# Copyright 2024 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/benchmark/ascend/resnet/src/model_utils/moxing_adapter.py b/benchmark/ascend/resnet/src/model_utils/moxing_adapter.py index e5d77145e..5bc226c97 100644 --- a/benchmark/ascend/resnet/src/model_utils/moxing_adapter.py +++ b/benchmark/ascend/resnet/src/model_utils/moxing_adapter.py @@ -1,4 +1,4 @@ -# Copyright 2021 Huawei Technologies Co., Ltd +# Copyright 2024 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -89,14 +89,14 @@ def moxing_wrapper(pre_process=None, post_process=None): sync_data(config.checkpoint_url, config.load_path) print("Preload downloaded: ", os.listdir(config.load_path)) if config.train_url: - sync_data(config.train_url, config.output_path) - print("Workspace downloaded: ", os.listdir(config.output_path)) + sync_data(config.train_url, config.output_dir) + print("Workspace downloaded: ", os.listdir(config.output_dir)) - ms.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) + ms.set_context(save_graphs_path=os.path.join(config.output_dir, str(get_rank_id()))) config.device_num = get_device_num() config.device_id = get_device_id() - if not os.path.exists(config.output_path): - os.makedirs(config.output_path) + if not os.path.exists(config.output_dir): + os.makedirs(config.output_dir) if pre_process: pre_process() @@ -110,6 +110,6 @@ def moxing_wrapper(pre_process=None, post_process=None): if config.train_url: print("Start to copy output directory") - sync_data(config.output_path, config.train_url) + sync_data(config.output_dir, config.train_url) return wrapped_func return wrapper diff --git a/benchmark/ascend/resnet/src/momentum.py b/benchmark/ascend/resnet/src/momentum.py index 65783bc37..d63a36fde 100644 --- a/benchmark/ascend/resnet/src/momentum.py +++ b/benchmark/ascend/resnet/src/momentum.py @@ -1,4 +1,4 @@ -# Copyright 2021 Huawei Technologies Co., Ltd +# Copyright 2024 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -94,8 +94,8 @@ class Momentum(Optimizer): tuple[bool], all elements are True. Raises: - AssertionError: If the momentum is less than 0.0. - AssertionError: If the momentum is not a float or use_nesterov is not a bool. + ValueError: If the momentum is less than 0.0. + TypeError: If the momentum is not a float or use_nesterov is not a bool. 
    Supported Platforms:
         ``GPU``
 
@@ -121,8 +121,12 @@
     """
     def __init__(self, params, learning_rate, momentum, weight_decay=0.0, loss_scale=1.0, use_nesterov=False):
         super(Momentum, self).__init__(learning_rate, params, weight_decay, loss_scale)
-        assert isinstance(momentum, float) and momentum >= 0.0, "momentum should be equal or bigger than 0"
-        assert isinstance(use_nesterov, bool), "use_nesterov should be bool"
+        if not isinstance(momentum, float):
+            raise TypeError("momentum should be a float")
+        if momentum < 0.0:
+            raise ValueError("momentum should be equal or bigger than 0")
+        if not isinstance(use_nesterov, bool):
+            raise TypeError("use_nesterov should be a bool")
         self.momentum = Parameter(Tensor(momentum, ms.float32), name="momentum")
         self.params = self.parameters
@@ -131,15 +135,12 @@
         self.opt = ops.FusedWeightScaleApplyMomentum()
 
     def construct(self, gradients):
-        '''
-        Momentum construct
-        '''
         params = self.params
         moments = self.moments
         weight_decay = Tensor(0.0, ms.float32)
         scale = Tensor(1.0, ms.float32)
         if self.exec_weight_decay:
-            weight_decay = self.weight_decay_tensor
+            weight_decay = self.weight_decay
         if self.need_scale:
             scale = self.reciprocal_scale
         lr = self.get_lr()
diff --git a/benchmark/ascend/resnet/src/resnet.py b/benchmark/ascend/resnet/src/resnet.py
index 6d14733d4..c4e636395 100644
--- a/benchmark/ascend/resnet/src/resnet.py
+++ b/benchmark/ascend/resnet/src/resnet.py
@@ -1,4 +1,4 @@
-# Copyright 2020-2021 Huawei Technologies Co., Ltd
+# Copyright 2024 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -24,9 +24,6 @@ from src.model_utils.config import config
 
 
 def conv_variance_scaling_initializer(in_channel, out_channel, kernel_size):
-    '''
-    Initializer for conv
-    '''
     fan_in = in_channel * kernel_size * kernel_size
     scale = 1.0
     scale /= max(1., fan_in)
@@ -111,9 +108,6 @@ def kaiming_uniform(inputs_shape, a=0., mode='fan_in', nonlinearity='leaky_relu'
 
 
 def _conv3x3(in_channel, out_channel, stride=1, use_se=False, res_base=False):
-    '''
-    Create Conv2d with 3x3 kernel
-    '''
     if use_se:
         weight = conv_variance_scaling_initializer(in_channel, out_channel, kernel_size=3)
     else:
@@ -129,9 +123,6 @@
 
 
 def _conv1x1(in_channel, out_channel, stride=1, use_se=False, res_base=False):
-    '''
-    Create Conv2d with 1x1 kernel
-    '''
     if use_se:
         weight = conv_variance_scaling_initializer(in_channel, out_channel, kernel_size=1)
     else:
@@ -147,9 +138,6 @@
 
 
 def _conv7x7(in_channel, out_channel, stride=1, use_se=False, res_base=False):
-    '''
-    Create Conv2d with 7x7 kernel
-    '''
     if use_se:
         weight = conv_variance_scaling_initializer(in_channel, out_channel, kernel_size=7)
     else:
diff --git a/benchmark/ascend/resnet/src/resnet_gpu_benchmark.py b/benchmark/ascend/resnet/src/resnet_gpu_benchmark.py
index 67ec6ffa6..869780617 100644
--- a/benchmark/ascend/resnet/src/resnet_gpu_benchmark.py
+++ b/benchmark/ascend/resnet/src/resnet_gpu_benchmark.py
@@ -1,4 +1,4 @@
-# Copyright 2020 Huawei Technologies Co., Ltd
+# Copyright 2024 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
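For readers comparing against the fused kernel, here is a plain-NumPy sketch of the update that `ops.FusedWeightScaleApplyMomentum` performs inside `construct` above. This is an approximation of the op's documented semantics for illustration, not code from this repository:

```python
import numpy as np

def momentum_step(var, accum, grad, lr, momentum, weight_decay=0.0, scale=1.0, use_nesterov=False):
    """Plain-NumPy approximation of one fused momentum update (sketch)."""
    grad = grad * scale + weight_decay * var   # gradient rescaling plus L2-style decay
    accum = momentum * accum + grad            # running average of gradients
    if use_nesterov:
        var = var - (grad + momentum * accum) * lr
    else:
        var = var - lr * accum
    return var, accum
```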
diff --git a/benchmark/ascend/resnet/src/util.py b/benchmark/ascend/resnet/src/util.py new file mode 100644 index 000000000..c072cf59d --- /dev/null +++ b/benchmark/ascend/resnet/src/util.py @@ -0,0 +1,144 @@ +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +import os +import numpy as np +import mindspore as ms +import mindspore.nn as nn +from mindspore.communication.management import GlobalComm +import mindspore.ops as ops +from src.callback import EvalCallBack +from src.resnet import conv_variance_scaling_initializer + + +def filter_checkpoint_parameter_by_list(origin_dict, param_filter, cfg): + """remove useless parameters according to filter_list""" + for key in list(origin_dict.keys()): + for name in param_filter: + if name in key: + cfg.logger.info("Delete parameter from checkpoint: %s", key) + del origin_dict[key] + break + + +def apply_eval(eval_param): + eval_model = eval_param["model"] + eval_ds = eval_param["dataset"] + metrics_name = eval_param["metrics_name"] + res = eval_model.eval(eval_ds, dataset_sink_mode=True) + return res[metrics_name] + + +def init_group_params(net, cfg): + decayed_params = [] + no_decayed_params = [] + for param in net.trainable_params(): + if 'beta' not in param.name and 'gamma' not in param.name and 'bias' not in param.name: + decayed_params.append(param) + else: + no_decayed_params.append(param) + + group_params = [{'params': decayed_params, 'weight_decay': cfg.weight_decay}, + {'params': no_decayed_params}, + {'order_params': net.trainable_params()}] + return group_params + + +def eval_callback(model, cfg, eval_dataset): + eval_param_dict = {"model": model, "dataset": eval_dataset, "metrics_name": "acc"} + eval_cb = EvalCallBack(apply_eval, eval_param_dict, interval=cfg.eval_interval, + eval_start_epoch=cfg.eval_start_epoch, rank_id=cfg.rank_id, + save_best_ckpt=cfg.save_best_ckpt, ckpt_directory=cfg.save_ckpt_dir, + best_ckpt_name="best_acc.ckpt", metrics_name="acc", logger=cfg.logger) + return eval_cb + + +def set_output_dir(cfg): + """set save ckpt dir""" + cfg.output_dir = os.path.realpath(os.path.join(cfg.output_dir, cfg.net_name, cfg.dataset)) + cfg.save_ckpt_dir = os.path.join(cfg.output_dir, 'ckpt') + cfg.log_dir = os.path.join(cfg.output_dir, 'log') + return cfg + + +def set_golden_output_dir(cfg): + """set save ckpt dir""" + cfg.output_dir = os.path.realpath(os.path.join(cfg.output_dir, cfg.net_name, cfg.dataset, cfg.comp_algo)) + cfg.save_ckpt_dir = os.path.join(cfg.output_dir, 'ckpt') + cfg.log_dir = os.path.join(cfg.output_dir, 'log') + return cfg + + +def init_weight(net, cfg): + """init_weight""" + + if cfg.pre_trained: + if not os.path.isfile(cfg.pre_trained): + cfg.logger.warning("There is not ckpt file: %s", cfg.pre_trained) + else: + param_dict = ms.load_checkpoint(cfg.pre_trained) + if cfg.filter_weight: + filter_list = [x.name for x in net.end_point.get_parameters()] + 
filter_checkpoint_parameter_by_list(param_dict, filter_list) + ms.load_param_into_net(net, param_dict) + cfg.logger.info("Pre trained ckpt mode: %s loading", cfg.pre_trained) + else: + for _, cell in net.cells_and_names(): + if isinstance(cell, nn.Conv2d): + if cfg.conv_init == "XavierUniform": + cell.weight.set_data(ms.common.initializer.initializer(ms.common.initializer.XavierUniform(), + cell.weight.shape, + cell.weight.dtype)) + elif cfg.conv_init == "TruncatedNormal": + weight = conv_variance_scaling_initializer(cell.in_channels, + cell.out_channels, + cell.kernel_size[0]) + cell.weight.set_data(weight) + if isinstance(cell, nn.Dense): + if cfg.dense_init == "TruncatedNormal": + cell.weight.set_data(ms.common.initializer.initializer(ms.common.initializer.TruncatedNormal(), + cell.weight.shape, + cell.weight.dtype)) + elif cfg.dense_init == "RandomNormal": + in_channel = cell.in_channels + out_channel = cell.out_channels + weight = np.random.normal(loc=0, scale=0.01, size=out_channel * in_channel) + weight = ms.Tensor(np.reshape(weight, (out_channel, in_channel)), dtype=cell.weight.dtype) + cell.weight.set_data(weight) + + +class AllreduceSync(nn.Cell): + def __init__(self,): + super(AllreduceSync, self).__init__() + self.allreduce = ops.AllReduce(ops.ReduceOp.SUM, GlobalComm.WORLD_COMM_GROUP) + + def construct(self, x): + y = self.allreduce(x) + return y + + +def reset_weight(model, orign_params): + train_parameters = ms.ParameterTuple(model._train_network.get_parameters()) + for idx, params in enumerate(train_parameters): + if "global_step" in params.name: + print("before global_step is", params.name, params.asnumpy(), flush=True) + params.set_data(orign_params[idx]) + if "global_step" in params.name: + print("after global_step is", params.name, params.asnumpy(), flush=True) + + +def pre_build(model, train_dataset, val_dataset, sink_size, epoch): + model.build(train_dataset, val_dataset, sink_size=sink_size, epoch=epoch) + asyn = AllreduceSync() + asyn(ms.Tensor(np.ones(32).astype(np.float32))) diff --git a/benchmark/ascend/resnet/train.py b/benchmark/ascend/resnet/train.py index bfe937f69..6791a1bec 100644 --- a/benchmark/ascend/resnet/train.py +++ b/benchmark/ascend/resnet/train.py @@ -1,4 +1,4 @@ -# Copyright 2020-2021 Huawei Technologies Co., Ltd +# Copyright 2024 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,117 +13,40 @@ # limitations under the License. 
# ============================================================================ """train resnet.""" -import datetime -import glob import os -import numpy as np - import mindspore as ms -from mindspore import Tensor -from mindspore.nn.optim import Momentum, thor, LARS -from mindspore.train.model import Model -from mindspore.context import ParallelMode +import mindspore.nn as nn +import mindspore.log as logger from mindspore.train.train_thor import ConvertModelUtils -from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor -from mindspore.nn.loss import SoftmaxCrossEntropyWithLogits -from mindspore.train.loss_scale_manager import FixedLossScaleManager +from mindspore.train.callback import ModelCheckpoint, CheckpointConfig from mindspore.communication.management import init, get_rank -from mindspore.common import set_seed from mindspore.parallel import set_algo_parameters -import mindspore.nn as nn -import mindspore.log as logger +from src.logger import get_logger from src.lr_generator import get_lr, warmup_cosine_annealing_lr from src.CrossEntropySmooth import CrossEntropySmooth -from src.eval_callback import EvalCallBack +from src.callback import LossCallBack, ResumeCallback +from src.util import eval_callback, init_weight, init_group_params, set_output_dir, pre_build from src.metric import DistAccuracy, ClassifyCorrectCell from src.model_utils.config import config from src.model_utils.moxing_adapter import moxing_wrapper -from src.model_utils.device_adapter import get_rank_id, get_device_num -from src.resnet import conv_variance_scaling_initializer - - -set_seed(1) - - -class LossCallBack(LossMonitor): - """ - Monitor the loss in training. - If the loss in NAN or INF terminating training. - """ - - def __init__(self, has_trained_epoch=0): - super(LossCallBack, self).__init__() - self.has_trained_epoch = has_trained_epoch - - def step_end(self, run_context): - cb_params = run_context.original_args() - loss = cb_params.net_outputs - - if isinstance(loss, (tuple, list)): - if isinstance(loss[0], Tensor) and isinstance(loss[0].asnumpy(), np.ndarray): - loss = loss[0] - - if isinstance(loss, Tensor) and isinstance(loss.asnumpy(), np.ndarray): - loss = np.mean(loss.asnumpy()) - - cur_step_in_epoch = (cb_params.cur_step_num - 1) % cb_params.batch_num + 1 - - if isinstance(loss, float) and (np.isnan(loss) or np.isinf(loss)): - raise ValueError("epoch: {} step: {}. 
Invalid loss, terminating training.".format( - cb_params.cur_epoch_num, cur_step_in_epoch)) - if self._per_print_times != 0 and cb_params.cur_step_num % self._per_print_times == 0: - print("epoch: %s step: %s, loss is %s" % (cb_params.cur_epoch_num + int(self.has_trained_epoch), - cur_step_in_epoch, loss), flush=True) - - -if config.net_name in ("resnet18", "resnet34", "resnet50", "resnet152"): - if config.net_name == "resnet18": - from src.resnet import resnet18 as resnet - elif config.net_name == "resnet34": - from src.resnet import resnet34 as resnet - elif config.net_name == "resnet50": - from src.resnet import resnet50 as resnet - else: - from src.resnet import resnet152 as resnet - if config.dataset == "cifar10": - from src.dataset import create_dataset1 as create_dataset - else: - if config.mode_name == "GRAPH": - from src.dataset import create_dataset2 as create_dataset - else: - from src.dataset import create_dataset_pynative as create_dataset -elif config.net_name == "resnet101": - from src.resnet import resnet101 as resnet - from src.dataset import create_dataset3 as create_dataset +from src.model_utils.device_adapter import get_device_num + +from src.dataset import create_dataset +if config.net_name == "resnet18": + from src.resnet import resnet18 as resnet +elif config.net_name == "resnet34": + from src.resnet import resnet34 as resnet +elif config.net_name == "resnet50": + from src.resnet import resnet50 as resnet else: - from src.resnet import se_resnet50 as resnet - from src.dataset import create_dataset4 as create_dataset - - -def filter_checkpoint_parameter_by_list(origin_dict, param_filter): - """remove useless parameters according to filter_list""" - for key in list(origin_dict.keys()): - for name in param_filter: - if name in key: - print("Delete parameter from checkpoint: ", key) - del origin_dict[key] - break - - -def apply_eval(eval_param): - eval_model = eval_param["model"] - eval_ds = eval_param["dataset"] - metrics_name = eval_param["metrics_name"] - res = eval_model.eval(eval_ds) - return res[metrics_name] - + from src.resnet import resnet152 as resnet def set_graph_kernel_context(run_platform, net_name): if run_platform == "GPU" and net_name == "resnet101": ms.set_context(enable_graph_kernel=True) ms.set_context(graph_kernel_flags="--enable_parallel_fusion --enable_expand_ops=Conv2D") - +ms.set_seed(1) def set_parameter(): """set_parameter""" @@ -134,7 +57,7 @@ def set_parameter(): # init context if config.mode_name == 'GRAPH': if target == "Ascend": - rank_save_graphs_path = os.path.join(config.save_graphs_path, "soma", str(os.getenv('DEVICE_ID'))) + rank_save_graphs_path = os.path.join(config.save_graphs_path, "soma", str(os.getenv('DEVICE_ID', '0'))) ms.set_context(mode=ms.GRAPH_MODE, device_target=target, save_graphs=config.save_graphs, save_graphs_path=rank_save_graphs_path) else: @@ -142,15 +65,15 @@ def set_parameter(): set_graph_kernel_context(target, config.net_name) else: ms.set_context(mode=ms.PYNATIVE_MODE, device_target=target, save_graphs=False) - + set_ascend_max_device_memory() if config.parameter_server: ms.set_ps_context(enable_ps=True) if config.run_distribute: if target == "Ascend": - device_id = int(os.getenv('DEVICE_ID')) + device_id = int(os.getenv('DEVICE_ID', '0')) ms.set_context(device_id=device_id) - ms.set_auto_parallel_context(device_num=config.device_num, parallel_mode=ParallelMode.DATA_PARALLEL, - gradients_mean=True) + ms.set_auto_parallel_context(device_num=config.device_num, parallel_mode=ms.ParallelMode.DATA_PARALLEL, + 
gradients_mean=True, parameter_broadcast=False) set_algo_parameters(elementwise_op_strategy_follow=True) if config.net_name == "resnet50" or config.net_name == "se-resnet50": if config.boost_mode not in ["O1", "O2"]: @@ -162,92 +85,28 @@ def set_parameter(): else: init() ms.set_auto_parallel_context(device_num=get_device_num(), - parallel_mode=ParallelMode.DATA_PARALLEL, + parallel_mode=ms.ParallelMode.DATA_PARALLEL, gradients_mean=True) if config.net_name == "resnet50": ms.set_auto_parallel_context(all_reduce_fusion_config=config.all_reduce_fusion_config) - - -def load_pre_trained_checkpoint(): - """ - Load checkpoint according to pre_trained path. - """ - param_dict = None - if config.pre_trained: - if os.path.isdir(config.pre_trained): - ckpt_save_dir = os.path.join(config.output_path, config.checkpoint_path, "ckpt_0") - ckpt_pattern = os.path.join(ckpt_save_dir, "*.ckpt") - ckpt_files = glob.glob(ckpt_pattern) - if not ckpt_files: - logger.warning(f"There is no ckpt file in {ckpt_save_dir}, " - f"pre_trained is unsupported.") - else: - ckpt_files.sort(key=os.path.getmtime, reverse=True) - time_stamp = datetime.datetime.now() - print(f"time stamp {time_stamp.strftime('%Y.%m.%d-%H:%M:%S')}" - f" pre trained ckpt model {ckpt_files[0]} loading", - flush=True) - param_dict = ms.load_checkpoint(ckpt_files[0]) - elif os.path.isfile(config.pre_trained): - param_dict = ms.load_checkpoint(config.pre_trained) - else: - print(f"Invalid pre_trained {config.pre_trained} parameter.") - return param_dict - - -def init_weight(net, param_dict): - """init_weight""" - if config.pre_trained: - if param_dict: - if param_dict.get("epoch_num") and param_dict.get("step_num"): - config.has_trained_epoch = int(param_dict["epoch_num"].data.asnumpy()) - config.has_trained_step = int(param_dict["step_num"].data.asnumpy()) - else: - config.has_trained_epoch = 0 - config.has_trained_step = 0 - - if config.filter_weight: - filter_list = [x.name for x in net.end_point.get_parameters()] - filter_checkpoint_parameter_by_list(param_dict, filter_list) - ms.load_param_into_net(net, param_dict) - else: - for _, cell in net.cells_and_names(): - if isinstance(cell, nn.Conv2d): - if config.conv_init == "XavierUniform": - cell.weight.set_data(ms.common.initializer.initializer(ms.common.initializer.XavierUniform(), - cell.weight.shape, - cell.weight.dtype)) - elif config.conv_init == "TruncatedNormal": - weight = conv_variance_scaling_initializer(cell.in_channels, - cell.out_channels, - cell.kernel_size[0]) - cell.weight.set_data(weight) - if isinstance(cell, nn.Dense): - if config.dense_init == "TruncatedNormal": - cell.weight.set_data(ms.common.initializer.initializer(ms.common.initializer.TruncatedNormal(), - cell.weight.shape, - cell.weight.dtype)) - elif config.dense_init == "RandomNormal": - in_channel = cell.in_channels - out_channel = cell.out_channels - weight = np.random.normal(loc=0, scale=0.01, size=out_channel * in_channel) - weight = Tensor(np.reshape(weight, (out_channel, in_channel)), dtype=cell.weight.dtype) - cell.weight.set_data(weight) + config.rank_id = get_rank() if config.run_distribute else 0 def init_lr(step_size): """init lr""" if config.optimizer == "Thor": from src.lr_generator import get_thor_lr - lr = get_thor_lr(0, config.lr_init, config.lr_decay, config.lr_end_epoch, step_size, decay_epochs=39) + lr = get_thor_lr(config.start_epoch * step_size, config.lr_init, config.lr_decay, config.lr_end_epoch, + step_size, decay_epochs=39) else: if config.net_name in ("resnet18", "resnet34", "resnet50", 
"resnet152", "se-resnet50"): + config.lr_max = config.lr_max #/ 8 * config.device_num lr = get_lr(lr_init=config.lr_init, lr_end=config.lr_end, lr_max=config.lr_max, - warmup_epochs=config.warmup_epochs, total_epochs=config.epoch_size, steps_per_epoch=step_size, - lr_decay_mode=config.lr_decay_mode) + warmup_epochs=config.warmup_epochs, total_epochs=config.epoch_size, + start_epoch=config.start_epoch, steps_per_epoch=step_size, lr_decay_mode=config.lr_decay_mode) else: lr = warmup_cosine_annealing_lr(config.lr, step_size, config.warmup_epochs, config.epoch_size, - config.pretrain_epoch_size * step_size) + config.start_epoch * step_size) return lr @@ -258,52 +117,15 @@ def init_loss_scale(): loss = CrossEntropySmooth(sparse=True, reduction="mean", smooth_factor=config.label_smooth_factor, num_classes=config.class_num) else: - loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean') + loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean') return loss -def init_group_params(net): - decayed_params = [] - no_decayed_params = [] - for param in net.trainable_params(): - if 'beta' not in param.name and 'gamma' not in param.name and 'bias' not in param.name: - decayed_params.append(param) - else: - no_decayed_params.append(param) - - group_params = [{'params': decayed_params, 'weight_decay': config.weight_decay}, - {'params': no_decayed_params}, - {'order_params': net.trainable_params()}] - return group_params - - -def run_eval(target, model, ckpt_save_dir, cb): - """run_eval""" - if config.run_eval: - if config.eval_dataset_path is None or (not os.path.isdir(config.eval_dataset_path)): - raise ValueError("{} is not a existing path.".format(config.eval_dataset_path)) - eval_dataset = create_dataset(dataset_path=config.eval_dataset_path, do_train=False, - batch_size=config.batch_size, train_image_size=config.train_image_size, - eval_image_size=config.eval_image_size, - target=target, enable_cache=config.enable_cache, - cache_session_id=config.cache_session_id) - eval_param_dict = {"model": model, "dataset": eval_dataset, "metrics_name": "acc"} - eval_cb = EvalCallBack(apply_eval, eval_param_dict, interval=config.eval_interval, - eval_start_epoch=config.eval_start_epoch, save_best_ckpt=config.save_best_ckpt, - ckpt_directory=ckpt_save_dir, best_ckpt_name="best_acc.ckpt", - metrics_name="acc") - cb += [eval_cb] - - -def set_save_ckpt_dir(): - """set save ckpt dir""" - ckpt_save_dir = os.path.join(config.output_path, config.checkpoint_path) - if config.enable_modelarts and config.run_distribute: - ckpt_save_dir = ckpt_save_dir + "ckpt_" + str(get_rank_id()) + "/" - else: - if config.run_distribute: - ckpt_save_dir = ckpt_save_dir + "ckpt_" + str(get_rank()) + "/" - return ckpt_save_dir +def set_ascend_max_device_memory(): + if ms.get_context("enable_ge") and ms.get_context("mode") == ms.GRAPH_MODE and \ + hasattr(config, "max_device_memory"): + logger.warning("When encountering a memory shortage situation in 1980B, reduce the max_device_memory.") + ms.set_context(max_device_memory=config.max_device_memory) @moxing_wrapper() @@ -311,7 +133,8 @@ def train_net(): """train net""" target = config.device_target set_parameter() - ckpt_param_dict = load_pre_trained_checkpoint() + set_output_dir(config) + config.logger = get_logger(config.log_dir, config.rank_id, config.parameter_server) dataset = create_dataset(dataset_path=config.data_path, do_train=True, batch_size=config.batch_size, train_image_size=config.train_image_size, eval_image_size=config.eval_image_size, target=target, @@ 
-320,65 +143,94 @@ def train_net(): net = resnet(class_num=config.class_num) if config.parameter_server: net.set_param_ps() + init_weight(net, config) + + if config.resume_ckpt: + resume_param = ms.load_checkpoint(config.resume_ckpt, + choice_func=lambda x: not x.startswith(('learning_rate', 'global_step'))) + config.start_epoch = int(resume_param.get('epoch_num', ms.Tensor(0, ms.int32)).asnumpy().item()) - init_weight(net=net, param_dict=ckpt_param_dict) - lr = Tensor(init_lr(step_size=step_size)) + lr = ms.Tensor(init_lr(step_size=step_size)) # define opt - group_params = init_group_params(net) - opt = Momentum(group_params, lr, config.momentum, loss_scale=config.loss_scale) + group_params = init_group_params(net, config) + opt = nn.Momentum(group_params, lr, config.momentum, loss_scale=config.loss_scale) if config.optimizer == "LARS": - opt = LARS(opt, epsilon=config.lars_epsilon, coefficient=config.lars_coefficient, - lars_filter=lambda x: 'beta' not in x.name and 'gamma' not in x.name and 'bias' not in x.name) + opt = nn.LARS(opt, epsilon=config.lars_epsilon, coefficient=config.lars_coefficient, + lars_filter=lambda x: 'beta' not in x.name and 'gamma' not in x.name and 'bias' not in x.name) loss = init_loss_scale() - loss_scale = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False) + loss_scale = ms.FixedLossScaleManager(config.loss_scale, drop_overflow_update=False) dist_eval_network = ClassifyCorrectCell(net) if config.run_distribute else None metrics = {"acc"} if config.run_distribute: metrics = {'acc': DistAccuracy(batch_size=config.batch_size, device_num=config.device_num)} if (config.net_name not in ("resnet18", "resnet34", "resnet50", "resnet101", "resnet152", "se-resnet50")) or \ - config.parameter_server or target == "CPU": - ## fp32 training - model = Model(net, loss_fn=loss, optimizer=opt, metrics=metrics, eval_network=dist_eval_network) + config.parameter_server or target == "CPU": + # fp32 training + model = ms.Model(net, loss_fn=loss, optimizer=opt, metrics=metrics, eval_network=dist_eval_network) else: - model = Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics=metrics, - amp_level="O2", boost_level=config.boost_mode, keep_batchnorm_fp32=False, - eval_network=dist_eval_network) + model = ms.Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics=metrics, + amp_level="O3", boost_level=config.boost_mode, + eval_network=dist_eval_network, + boost_config_dict={"boost": {"mode": "manual", "less_bn": True, "grad_freeze": False, + "adasum": False, "grad_accumulation": False, + "dim_reduce": False}}) if config.optimizer == "Thor" and config.dataset == "imagenet2012": from src.lr_generator import get_thor_damping - damping = get_thor_damping(0, config.damping_init, config.damping_decay, 70, step_size) + damping = get_thor_damping(step_size * config.start_epoch, config.damping_init, config.damping_decay, 70, + step_size) split_indices = [26, 53] - opt = thor(net, lr, Tensor(damping), config.momentum, config.weight_decay, config.loss_scale, - config.batch_size, split_indices=split_indices, frequency=config.frequency) + opt = nn.thor(net, lr, ms.Tensor(damping), config.momentum, config.weight_decay, config.loss_scale, + config.batch_size, split_indices=split_indices, frequency=config.frequency) model = ConvertModelUtils().convert_to_thor_model(model=model, network=net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics={'acc'}, - amp_level="O2", keep_batchnorm_fp32=False) + amp_level="O3") 
         config.run_eval = False
-        logger.warning("Thor optimizer not support evaluation while training.")
+        config.logger.warning("Thor optimizer does not support evaluation while training.")
+
+    # load resume param
+    if config.resume_ckpt:
+        ms.load_param_into_net(net, resume_param)
+        ms.load_param_into_net(opt, resume_param)
+        config.logger.info('resume train from epoch: %s', config.start_epoch)

     # define callbacks
-    time_cb = TimeMonitor(data_size=step_size)
-    loss_cb = LossCallBack(config.has_trained_epoch)
-    cb = [time_cb, loss_cb]
-    ckpt_save_dir = set_save_ckpt_dir()
-    if config.save_checkpoint:
-        ckpt_append_info = [{"epoch_num": config.has_trained_epoch, "step_num": config.has_trained_step}]
+    loss_cb = LossCallBack(config.epoch_size, config.logger, lr, per_print_time=1)
+    resume_cb = ResumeCallback(config.start_epoch)
+    cb = [loss_cb, resume_cb]
+    if config.save_checkpoint and config.rank_id == 0:
+        ckpt_append_info = [{"epoch_num": 0, "step_num": 0}]
         config_ck = CheckpointConfig(save_checkpoint_steps=config.save_checkpoint_epochs * step_size,
                                      keep_checkpoint_max=config.keep_checkpoint_max, append_info=ckpt_append_info)
-        ckpt_cb = ModelCheckpoint(prefix="resnet", directory=ckpt_save_dir, config=config_ck)
+        ckpt_cb = ModelCheckpoint(prefix=config.net_name, directory=config.save_ckpt_dir, config=config_ck)
         cb += [ckpt_cb]
-    run_eval(target, model, ckpt_save_dir, cb)
+
+    eval_dataset = None
+    if config.run_eval:
+        eval_dataset = create_dataset(dataset_path=config.eval_dataset_path, do_train=False,
+                                      batch_size=config.eval_batch_size, train_image_size=config.train_image_size,
+                                      eval_image_size=config.eval_image_size,
+                                      target=target, enable_cache=False,
+                                      cache_session_id=config.cache_session_id,
+                                      distribute=config.run_distribute, drop_remainder=False)
+        eval_cb = eval_callback(model, config, eval_dataset)
+        cb.append(eval_cb)
+
     # train model
     if config.net_name == "se-resnet50":
         config.epoch_size = config.train_epoch_size
     dataset_sink_mode = (not config.parameter_server) and target != "CPU"
-    config.pretrain_epoch_size = config.has_trained_epoch
-    model.train(config.epoch_size - config.pretrain_epoch_size, dataset, callbacks=cb,
-                sink_size=dataset.get_dataset_size(), dataset_sink_mode=dataset_sink_mode)
+    config.logger.save_args(config)
+    sink_size = dataset.get_dataset_size()
+    new_repeat_count = config.epoch_size * dataset.get_dataset_size() // sink_size
+    pre_build(model, dataset, eval_dataset, sink_size=sink_size, epoch=new_repeat_count)
+    config.logger.info("Build end, start training!")
+
+    model.train(new_repeat_count, dataset, callbacks=cb,
+                sink_size=sink_size, dataset_sink_mode=dataset_sink_mode)

-    if config.run_eval and config.enable_cache:
-        print("Remember to shut down the cache server via \"cache_admin --stop\"")
+    config.logger.info("If run_eval and enable_cache are set, remember to shut down the cache server via \"cache_admin --stop\"")


 if __name__ == '__main__':
--
Gitee


From 580f60f1e107ef49a1aafa294084cc347c34a640 Mon Sep 17 00:00:00 2001
From: luxingyu2023
Date: Sun, 28 Apr 2024 16:41:40 +0800
Subject: [PATCH 35/44] fix bug

---
 .../bert/pretrain_config_Ascend_Boost.yaml    |  4 +--
 .../resnet50_imagenet2012_Boost_config.yaml   |  2 +-
 benchmark/ascend/resnet/src/dataset.py        | 31 +++++++++++++++++++
 3 files changed, 34 insertions(+), 3 deletions(-)

diff --git a/benchmark/ascend/bert/pretrain_config_Ascend_Boost.yaml b/benchmark/ascend/bert/pretrain_config_Ascend_Boost.yaml
index 7ccb6a211..9e4dbb8dc 100644
--- a/benchmark/ascend/bert/pretrain_config_Ascend_Boost.yaml
+++ b/benchmark/ascend/bert/pretrain_config_Ascend_Boost.yaml
@@ -27,7 +27,7 @@ allreduce_post_accumulation: 'true'
 save_checkpoint_path: ''
 load_checkpoint_path: '/home/bertlarge/Bert/msdata/new_ckpt.ckpt'
 save_checkpoint_steps: 10000
-train_steps: 7000
+train_steps: 6300
 save_checkpoint_num: 1
 data_dir: '/data4/PCL/new_train_data'
 schema_dir: ''
@@ -41,7 +41,7 @@ batch_size: 32
 bert_network: 'large_boost'
 loss_scale_value: 65536
 scale_factor: 2
-scale_window: 6300
+scale_window: 6000
 optimizer: 'Lamb'
 enable_global_norm: False
 # pretrain_eval related
diff --git a/benchmark/ascend/resnet/config/resnet50_imagenet2012_Boost_config.yaml b/benchmark/ascend/resnet/config/resnet50_imagenet2012_Boost_config.yaml
index 55774fa52..21755da87 100644
--- a/benchmark/ascend/resnet/config/resnet50_imagenet2012_Boost_config.yaml
+++ b/benchmark/ascend/resnet/config/resnet50_imagenet2012_Boost_config.yaml
@@ -19,7 +19,7 @@ checkpoint_file_path: ""
 optimizer: "LARS"
 infer_label: ""
 class_num: 1001
-batch_size: 192
+batch_size: 256
 eval_batch_size: 250
 loss_scale: 1024
 momentum: 0.9
diff --git a/benchmark/ascend/resnet/src/dataset.py b/benchmark/ascend/resnet/src/dataset.py
index 8376aa899..8bf586c2e 100644
--- a/benchmark/ascend/resnet/src/dataset.py
+++ b/benchmark/ascend/resnet/src/dataset.py
@@ -15,11 +15,13 @@
 """
 create train or eval dataset.
 """
+import multiprocessing
 from io import BytesIO
 import numpy as np
 from PIL import Image
 import mindspore as ms
 import mindspore.dataset as ds
+from mindspore.communication.management import init, get_rank, get_group_size


 def create_dataset(dataset_path, do_train, batch_size=32, train_image_size=224, eval_image_size=224,
@@ -119,3 +121,32 @@ def create_dataset(dataset_path, do_train, batch_size=32, train_image_size=224,

     data_set = data_set.batch(batch_size, drop_remainder=drop_remainder)
     return data_set
+
+
+def _get_rank_info(distribute):
+    """
+    get rank size and rank id
+    """
+    if distribute:
+        init()
+        rank_id = get_rank()
+        device_num = get_group_size()
+    else:
+        rank_id = 0
+        device_num = 1
+    return device_num, rank_id
+
+
+def get_num_parallel_workers(num_parallel_workers):
+    """
+    Get num_parallel_workers used in dataset operations.
+    If num_parallel_workers > the real CPU cores number, set num_parallel_workers = the real CPU cores number.
+    """
+    cores = multiprocessing.cpu_count()
+    if isinstance(num_parallel_workers, int):
+        if cores < num_parallel_workers:
+            print("The num_parallel_workers {} is larger than the number of CPU cores; setting it to {}".format(num_parallel_workers, cores))
+            num_parallel_workers = cores
+    else:
+        print("The num_parallel_workers {} is invalid; setting it to {}".format(num_parallel_workers, min(cores, 8)))
+        num_parallel_workers = min(cores, 8)
+    return num_parallel_workers
--
Gitee


From 18152b0f9de2316e9f89f7689bf6ba0b2673db08 Mon Sep 17 00:00:00 2001
From: zhaoting
Date: Sat, 4 May 2024 13:47:22 +0000
Subject: [PATCH 36/44] add max memory setting for googlenet and alexnet

Signed-off-by: zhaoting

---
 research/cv/Alexnet/train.py   | 4 +++-
 research/cv/googlenet/train.py | 1 +
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/research/cv/Alexnet/train.py b/research/cv/Alexnet/train.py
index ded98799b..385e92b1f 100644
--- a/research/cv/Alexnet/train.py
+++ b/research/cv/Alexnet/train.py
@@ -55,7 +55,9 @@ def train_alexnet():
     device_target = config.device_target
     context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target)
     context.set_context(save_graphs=False)
-    if device_target == "GPU":
+    if device_target == "Ascend":
+        context.set_context(max_device_memory="56GB")
+    elif device_target == "GPU":
         context.set_context(enable_graph_kernel=True)
         context.set_context(graph_kernel_flags="--enable_cluster_ops=MatMul")

diff --git a/research/cv/googlenet/train.py b/research/cv/googlenet/train.py
index 653cb6db1..f3f8676ed 100644
--- a/research/cv/googlenet/train.py
+++ b/research/cv/googlenet/train.py
@@ -173,6 +173,7 @@ def run_train():
     if cfg.device_target == "Ascend":
         device_id = get_device_id()
         context.set_context(device_id=device_id)
+        context.set_context(max_device_memory="56GB")
     if device_num > 1:
         context.reset_auto_parallel_context()
         context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
--
Gitee


From 0f8dbb9b59d49bcda923a6a878f1c28fb18a5c48 Mon Sep 17 00:00:00 2001
From: tomzwang11
Date: Thu, 23 May 2024 15:28:02 +0800
Subject: [PATCH 37/44] add default ref mode

---
 official/cv/ResNet/src/callback.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/official/cv/ResNet/src/callback.py b/official/cv/ResNet/src/callback.py
index c34fa6fd5..dae793713 100644
--- a/official/cv/ResNet/src/callback.py
+++ b/official/cv/ResNet/src/callback.py
@@ -156,7 +156,7 @@ class EvalCallBack(Callback):
         eval_cost = time.time() - eval_start
         self.logger.info("epoch: {}, {}: {}, eval_cost:{:.2f}".format(cur_epoch, self.metrics_name, res, eval_cost))
         if res >= self.best_res:
-            if ms.context.get_context("enable_ge") and int(os.getenv('MS_DISABLE_REF_MODE')) == 1:
+            if ms.context.get_context("enable_ge") and int(os.getenv('MS_DISABLE_REF_MODE', default="0")) == 1:
                 from mindspore.train.callback import _set_cur_net
                 _set_cur_net(cb_params.train_network)
                 cb_params.train_network.exec_checkpoint_graph()
--
Gitee


From 9b42073c8164519e87a26106e6504c93103bef3c Mon Sep 17 00:00:00 2001
From: luxingyu2023
Date: Thu, 30 May 2024 19:23:38 +0800
Subject: [PATCH 38/44] add config for 32p

---
 ...esnet50_imagenet2012_Boost_config_32p.yaml | 114 +++++++++++++++
 .../run_distribute_train_multi_server.sh      | 134 ++++++++++++++++++
 2 files changed, 248 insertions(+)
 create mode 100644 benchmark/ascend/resnet/config/resnet50_imagenet2012_Boost_config_32p.yaml
 create mode 100644 benchmark/ascend/resnet/scripts/run_distribute_train_multi_server.sh

diff --git 
a/benchmark/ascend/resnet/config/resnet50_imagenet2012_Boost_config_32p.yaml b/benchmark/ascend/resnet/config/resnet50_imagenet2012_Boost_config_32p.yaml new file mode 100644 index 000000000..bc866eee6 --- /dev/null +++ b/benchmark/ascend/resnet/config/resnet50_imagenet2012_Boost_config_32p.yaml @@ -0,0 +1,114 @@ +# Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing) +enable_modelarts: False +# Url for modelarts +data_url: "" +train_url: "" +checkpoint_url: "" +# Path for local +run_distribute: True +enable_profiling: False +data_path: "/data/resnet_tc/Imagenet2012/train" +output_dir: "../outputs" +load_path: "/cache/checkpoint_path/" +device_target: "Ascend" +checkpoint_path: "./checkpoint/" +checkpoint_file_path: "" + +# ============================================================================== +# Training options +optimizer: "LARS" +infer_label: "" +class_num: 1001 +batch_size: 96 +eval_batch_size: 125 +loss_scale: 1024 +momentum: 0.9 +weight_decay: 5.0e-05 +epoch_size: 37 +start_epoch: 0 +resume_ckpt: "" +save_checkpoint: False +save_checkpoint_epochs: 5 +keep_checkpoint_max: 10 +warmup_epochs: 5 +lr_decay_mode: "poly" +use_label_smooth: True +label_smooth_factor: 0.1 +lr_init: 0 +lr_max: 11.0 +lr_end: 0.0001 +lars_epsilon: 0.0 +lars_coefficient: 0.001 + +net_name: "resnet50" +dataset: "imagenet2012" +device_num: 32 +pre_trained: "" +run_eval: True +eval_dataset_path: "/data/resnet_tc/Imagenet2012/val" +parameter_server: False +filter_weight: False +save_best_ckpt: False +eval_start_epoch: 4 +eval_interval: 4 +enable_cache: False +cache_session_id: "" +mode_name: "GRAPH" +boost_mode: "O1" +conv_init: "TruncatedNormal" +dense_init: "TruncatedNormal" +all_reduce_fusion_config: + - 85 + - 160 +train_image_size: 192 +eval_image_size: 224 +max_device_memory: "30GB" + +# Export options +device_id: 0 +width: 224 +height: 224 +file_name: "resnet50" +file_format: "MINDIR" +ckpt_file: "" +network_dataset: "resnet50_imagenet2012" + +# Retrain options +save_graphs: False +save_graphs_path: "./graphs" +has_trained_epoch: 0 +has_trained_step: 0 + +# postprocess resnet inference +result_path: '' +label_path: '' + +# prediction +img_path: '' + +# lite inference +enable_predict: False +enable_predict_lite_backend: False +enable_predict_lite_mindir: False + +# lite mindir inference +mindir_path: 'net.mindir' + + +# # Help description for each configuration +# enable_modelarts: "Whether training on modelarts, default: False" +# data_url: "Dataset url for obs" +# checkpoint_url: "The location of checkpoint for obs" +# data_path: "Dataset path for local" +# output_path: "Training output path for local" +# load_path: "The location of checkpoint for obs" +# device_target: "Target device type, available: [Ascend, GPU, CPU]" +# enable_profiling: "Whether enable profiling while training, default: False" +# num_classes: "Class for dataset" +# batch_size: "Batch size for training and evaluation" +# epoch_size: 37 +# checkpoint_path: "The location of the checkpoint file." +# checkpoint_file_path: "The location of the checkpoint file." +# save_graphs: "Whether save graphs during training, default: False." +# save_graphs_path: "Path to save graphs." +# img_path: "image file path." 
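The 32p configuration above pairs a per-device `batch_size` of 96 with `device_num: 32`, so each optimizer step consumes 96 × 32 = 3072 samples, and the large `lr_max` of 11.0 pairs with the LARS optimizer for large-batch scaling. A minimal pre-flight sketch for checking that relationship before launching a run (it assumes PyYAML is installed and uses the file path this patch creates; it is not part of the patch itself):

```python
# Illustrative sanity check for the 32p Boost config; the path and key names
# follow the YAML added above.
import yaml

with open("benchmark/ascend/resnet/config/resnet50_imagenet2012_Boost_config_32p.yaml") as f:
    cfg = yaml.safe_load(f)

global_batch = cfg["batch_size"] * cfg["device_num"]  # 96 * 32 = 3072
print("global batch per optimizer step:", global_batch)
assert cfg["optimizer"] == "LARS", "lr_max: 11.0 assumes LARS large-batch scaling"
```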
diff --git a/benchmark/ascend/resnet/scripts/run_distribute_train_multi_server.sh b/benchmark/ascend/resnet/scripts/run_distribute_train_multi_server.sh
new file mode 100644
index 000000000..c8f66898c
--- /dev/null
+++ b/benchmark/ascend/resnet/scripts/run_distribute_train_multi_server.sh
@@ -0,0 +1,134 @@
+#!/bin/bash
+# Copyright 2024 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+CURPATH="$(dirname "$0")"
+# shellcheck source=/dev/null
+. ${CURPATH}/cache_util.sh
+
+if [ $# != 3 ] && [ $# != 4 ] && [ $# != 5 ] && [ $# != 6 ] && [ $# != 7 ]
+then
+    echo "Usage: bash run_distribute_train_multi_server.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH]"
+    echo "Usage: bash run_distribute_train_multi_server.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH] [RESUME_CKPT](optional)"
+    echo "Usage: bash run_distribute_train_multi_server.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional)"
+    echo "Usage: bash run_distribute_train_multi_server.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional) [SERVER_ID](optional) [RANK_SIZE](optional)"
+    exit 1
+fi
+
+get_real_path(){
+    if [ "${1:0:1}" == "/" ]; then
+        echo "$1"
+    else
+        echo "$(realpath -m $PWD/$1)"
+    fi
+}
+
+PATH1=$(get_real_path $1)
+PATH2=$(get_real_path $2)
+CONFIG_FILE=$(get_real_path $3)
+str="Boost_"
+if [[ $CONFIG_FILE =~ $str ]]
+then
+    export MS_DISABLE_REF_MODE=0
+    export MS_ENABLE_FORMAT_MODE=0
+fi
+
+if [ $# == 5 ]
+then
+    RUN_EVAL=$4
+    EVAL_DATASET_PATH=$(get_real_path $5)
+fi
+
+export SERVER_ID=0
+
+if [ $# == 7 ]
+then
+    RUN_EVAL=$4
+    EVAL_DATASET_PATH=$(get_real_path $5)
+    export SERVER_ID=$6
+fi
+
+if [ ! -f $PATH1 ]
+then
+    echo "error: RANK_TABLE_FILE=$PATH1 is not a file"
+exit 1
+fi
+
+if [ ! -d $PATH2 ]
+then
+    echo "error: DATASET_PATH=$PATH2 is not a directory"
+exit 1
+fi
+
+if [ "x${RUN_EVAL}" == "xTrue" ] && [ ! -d $EVAL_DATASET_PATH ]
+then
+    echo "error: EVAL_DATASET_PATH=$EVAL_DATASET_PATH is not a directory"
+    exit 1
+fi
+
+
+ulimit -u unlimited
+export DEVICE_NUM=8
+export RANK_SIZE=$7
+export RANK_TABLE_FILE=$PATH1
+
+rank_start=$((DEVICE_NUM * SERVER_ID))
+
+cpus=`cat /proc/cpuinfo| grep "processor"| wc -l`
+avg=`expr $cpus \/ $DEVICE_NUM`
+gap=`expr $avg \- 1`
+
+for((i=0; i<${DEVICE_NUM}; i++))
+do
+    start=`expr $i \* $avg`
+    end=`expr $start \+ $gap`
+    cmdopt=$start"-"$end
+    echo "taskset cpu range: $cmdopt"
+    export DEVICE_ID=${i}
+    export RANK_ID=$((rank_start + i))
+    rm -rf ./train_parallel$i
+    mkdir ./train_parallel$i
+    cp ../*.py ./train_parallel$i
+    cp *.sh ./train_parallel$i
+    cp -r ../config/*.yaml ./train_parallel$i
+    cp -r ../src ./train_parallel$i
+    cd ./train_parallel$i || exit
+    echo "start training for rank $RANK_ID, device $DEVICE_ID"
+    env > env.log
+
+    if [ $# == 5 ]
+    then
+        taskset -c $cmdopt python train.py --run_distribute=True --device_num=$RANK_SIZE --data_path=$PATH2 \
+        --run_eval=$RUN_EVAL --eval_dataset_path=$EVAL_DATASET_PATH --enable_cache=False \
+        --cache_session_id=$CACHE_SESSION_ID --config_path=$CONFIG_FILE --output_dir '../outputs' &> log.txt &
+        if [ "x${RUN_EVAL}" == "xTrue" ]
+        then
+            echo -e "\nWhen training run is done, remember to shut down the cache server via \"cache_admin --stop\""
+        fi
+    fi
+
+    if [ $# == 7 ]
+    then
+        taskset -c $cmdopt python train.py --run_distribute=True --device_num=$RANK_SIZE --data_path=$PATH2 \
+        --run_eval=$RUN_EVAL --eval_dataset_path=$EVAL_DATASET_PATH --enable_cache=False \
+        --cache_session_id=$CACHE_SESSION_ID --config_path=$CONFIG_FILE --output_dir '../outputs' &> log.txt &
+        if [ "x${RUN_EVAL}" == "xTrue" ]
+        then
+            echo -e "\nWhen training run is done, remember to shut down the cache server via \"cache_admin --stop\""
+        fi
+    fi
+    cd ..
+done + -- Gitee From 230e51c9ba4f025ed96a9ac3bed12e9c36ae4134 Mon Sep 17 00:00:00 2001 From: daiyuxin0511 <455472400@qq.com> Date: Fri, 31 May 2024 14:49:48 +0800 Subject: [PATCH 39/44] adapt for set_seed --- official/cv/OpenPose/train.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/official/cv/OpenPose/train.py b/official/cv/OpenPose/train.py index 8b8f56edf..baa538968 100644 --- a/official/cv/OpenPose/train.py +++ b/official/cv/OpenPose/train.py @@ -14,8 +14,7 @@ # ============================================================================ import os from ast import literal_eval as liter -import mindspore -from mindspore import context +from mindspore import context, set_seed from mindspore.context import ParallelMode from mindspore.communication.management import init from mindspore.train import Model @@ -31,7 +30,7 @@ from src.model_utils.config import config from src.model_utils.device_adapter import get_rank_id, get_device_num -mindspore.common.seed.set_seed(1) +set_seed(1) context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", save_graphs=False) -- Gitee From 6ba92f72af79d355a320ecfe692bec4690d3d655 Mon Sep 17 00:00:00 2001 From: ash Date: Thu, 13 Jun 2024 17:51:43 +0800 Subject: [PATCH 40/44] update context API --- .jenkins/check/config/filter_linklint.txt | 9 +- official/audio/DeepSpeech2/eval.py | 4 +- official/audio/DeepSpeech2/export.py | 5 +- official/audio/DeepSpeech2/quick_start.py | 4 +- official/audio/DeepSpeech2/train.py | 13 +- official/audio/EcapaTDNN/eval.py | 7 +- official/audio/EcapaTDNN/export.py | 8 +- .../EcapaTDNN/modelart/ecapatdnn-modelart.py | 27 +- official/audio/EcapaTDNN/src/ecapa_tdnn.py | 50 +-- .../src/model_utils/moxing_adapter.py | 4 +- official/audio/EcapaTDNN/src/util.py | 6 +- official/audio/EcapaTDNN/train.py | 22 +- official/audio/LPCNet/eval.py | 4 +- official/audio/LPCNet/export.py | 4 +- official/audio/LPCNet/src/rnns/rnns.py | 4 +- official/audio/LPCNet/train.py | 4 +- .../audio/LPCNet/train_lpcnet_parallel.py | 9 +- official/audio/MELGAN/eval.py | 6 +- official/audio/MELGAN/export.py | 4 +- .../audio/MELGAN/modelarts/train_modelarts.py | 15 +- .../MELGAN/src/model_utils/moxing_adapter.py | 4 +- official/audio/MELGAN/src/trainonestep.py | 2 +- official/audio/MELGAN/train.py | 13 +- official/audio/Tacotron2/eval.py | 3 +- .../Tacotron2/model_utils/moxing_adapter.py | 4 +- official/audio/Tacotron2/src/tacotron2.py | 5 +- official/audio/Tacotron2/train.py | 10 +- official/cv/Arcface/eval_ijbc.py | 5 +- official/cv/Arcface/export.py | 7 +- official/cv/Arcface/modelarts/start.py | 11 +- official/cv/Arcface/train.py | 11 +- official/cv/Arcface/val.py | 7 +- official/cv/CRNN/eval.py | 6 +- official/cv/CRNN/export.py | 10 +- official/cv/CRNN/modelarts/start.py | 16 +- official/cv/CRNN/src/crnn_for_train.py | 6 +- .../cv/CRNN/src/model_utils/moxing_adapter.py | 4 +- official/cv/CRNN/train.py | 12 +- official/cv/CTPN/README.md | 6 +- official/cv/CTPN/eval.py | 4 +- official/cv/CTPN/export.py | 10 +- .../cv/CTPN/src/CTPN/bbox_assign_sample.py | 4 +- .../cv/CTPN/src/CTPN/proposal_generator.py | 5 +- official/cv/CTPN/src/CTPN/rpn.py | 5 +- official/cv/CTPN/src/ctpn.py | 6 +- .../cv/CTPN/src/model_utils/moxing_adapter.py | 4 +- official/cv/CTPN/train.py | 15 +- official/cv/CycleGAN/eval.py | 10 +- official/cv/CycleGAN/export.py | 10 +- official/cv/CycleGAN/src/models/cycle_gan.py | 21 +- official/cv/CycleGAN/train.py | 14 +- official/cv/DBNet/eval.py | 4 +- official/cv/DBNet/export.py | 10 +- 
.../DBNet/src/model_utils/moxing_adapter.py | 4 +- .../cv/DBNet/src/modules/backbone/__init__.py | 12 +- official/cv/DBNet/src/modules/data_offload.py | 6 +- official/cv/DBNet/src/modules/loss.py | 16 +- official/cv/DBNet/src/utils/callback.py | 12 +- official/cv/DBNet/src/utils/env.py | 22 +- official/cv/DBNet/src/utils/eval_utils.py | 4 +- official/cv/DBNet/src/utils/post_process.py | 4 +- official/cv/DBNet/train.py | 20 +- official/cv/DeepLabV3P/eval.py | 4 +- official/cv/DeepLabV3P/export.py | 5 +- official/cv/DeepLabV3P/train.py | 12 +- official/cv/DeepLabv3/eval.py | 4 +- official/cv/DeepLabv3/export.py | 7 +- .../DeepLabv3/model_utils/moxing_adapter.py | 4 +- .../cv/DeepLabv3/modelarts/train_start.py | 13 +- official/cv/DeepLabv3/train.py | 10 +- official/cv/DeepText/README.md | 4 +- official/cv/DeepText/eval.py | 6 +- official/cv/DeepText/export.py | 10 +- .../cv/DeepText/model_utils/moxing_adapter.py | 4 +- .../src/Deeptext/proposal_generator.py | 7 +- official/cv/DeepText/train.py | 15 +- official/cv/EDSR/export.py | 10 +- .../cv/EDSR/model_utils/moxing_adapter.py | 4 +- official/cv/EDSR/src/utils.py | 18 +- .../cv/Efficientnet/efficientnet-b0/eval.py | 4 +- .../cv/Efficientnet/efficientnet-b0/export.py | 5 +- .../cv/Efficientnet/efficientnet-b0/train.py | 14 +- .../cv/Efficientnet/efficientnet-b1/eval.py | 5 +- .../cv/Efficientnet/efficientnet-b1/export.py | 5 +- .../src/model_utils/moxing_adapter.py | 4 +- .../cv/Efficientnet/efficientnet-b1/train.py | 6 +- .../cv/Efficientnet/efficientnet-b2/eval.py | 9 +- .../cv/Efficientnet/efficientnet-b2/export.py | 5 +- .../cv/Efficientnet/efficientnet-b2/train.py | 22 +- .../cv/Efficientnet/efficientnet-b3/eval.py | 9 +- .../cv/Efficientnet/efficientnet-b3/export.py | 5 +- .../efficientnet-b3/modelarts/train_start.py | 18 +- .../cv/Efficientnet/efficientnet-b3/train.py | 18 +- .../cv/Efficientnet/efficientnetv2/eval.py | 13 +- .../cv/Efficientnet/efficientnetv2/export.py | 8 +- .../efficientnetv2/src/tools/get_misc.py | 15 +- .../cv/Efficientnet/efficientnetv2/train.py | 13 +- official/cv/FasterRCNN/README.md | 2 +- official/cv/FasterRCNN/README_CN.md | 2 +- official/cv/FasterRCNN/eval.py | 14 +- official/cv/FasterRCNN/export.py | 22 +- .../src/FasterRcnn/bbox_assign_sample.py | 28 +- .../FasterRcnn/bbox_assign_sample_stage2.py | 20 +- .../FasterRCNN/src/FasterRcnn/faster_rcnn.py | 47 ++- .../cv/FasterRCNN/src/FasterRcnn/fpn_neck.py | 4 +- .../src/FasterRcnn/proposal_generator.py | 4 +- official/cv/FasterRCNN/src/FasterRcnn/rcnn.py | 32 +- .../cv/FasterRCNN/src/FasterRcnn/roi_align.py | 12 +- official/cv/FasterRCNN/src/FasterRcnn/rpn.py | 34 +- .../cv/FasterRCNN/src/convert_checkpoint.py | 10 +- official/cv/FasterRCNN/src/dataset.py | 4 +- official/cv/FasterRCNN/src/eval_callback.py | 4 +- official/cv/FasterRCNN/src/eval_utils.py | 10 +- .../src/model_utils/moxing_adapter.py | 4 +- official/cv/FasterRCNN/src/network_define.py | 4 +- official/cv/FasterRCNN/src/quick_start.py | 8 +- official/cv/FasterRCNN/train.py | 30 +- official/cv/Inception/inceptionv3/README.md | 4 +- official/cv/Inception/inceptionv3/eval.py | 6 +- official/cv/Inception/inceptionv3/export.py | 10 +- .../inceptionv3/modelarts/train_start.py | 13 +- .../src/model_utils/moxing_adapter.py | 4 +- official/cv/Inception/inceptionv3/train.py | 17 +- official/cv/Inception/inceptionv4/eval.py | 6 +- official/cv/Inception/inceptionv4/export.py | 10 +- .../inceptionv4/modelarts/train_start.py | 10 +- .../src/model_utils/moxing_adapter.py | 4 +- 
official/cv/Inception/inceptionv4/train.py | 12 +- official/cv/Inception/xception/eval.py | 7 +- official/cv/Inception/xception/export.py | 7 +- .../src/model_utils/moxing_adapter.py | 4 +- official/cv/Inception/xception/train.py | 11 +- .../cv/MaskRCNN/maskrcnn_mobilenetv1/eval.py | 9 +- .../MaskRCNN/maskrcnn_mobilenetv1/export.py | 7 +- .../maskrcnn_mobilenetv1/src/dataset.py | 4 +- .../bbox_assign_sample.py | 6 +- .../bbox_assign_sample_stage2.py | 4 +- .../src/maskrcnn_mobilenetv1/fpn_neck.py | 4 +- .../mask_rcnn_mobilenetv1.py | 4 +- .../proposal_generator.py | 6 +- .../src/maskrcnn_mobilenetv1/rcnn_cls.py | 6 +- .../src/maskrcnn_mobilenetv1/rcnn_mask.py | 6 +- .../src/maskrcnn_mobilenetv1/rpn.py | 4 +- .../src/model_utils/moxing_adapter.py | 4 +- .../src/network_define.py | 4 +- .../cv/MaskRCNN/maskrcnn_mobilenetv1/train.py | 17 +- .../cv/MaskRCNN/maskrcnn_resnet50/README.md | 2 +- .../MaskRCNN/maskrcnn_resnet50/README_CN.md | 2 +- .../cv/MaskRCNN/maskrcnn_resnet50/eval.py | 5 +- .../cv/MaskRCNN/maskrcnn_resnet50/export.py | 7 +- .../src/maskrcnn/bbox_assign_sample.py | 4 +- .../src/maskrcnn/bbox_assign_sample_stage2.py | 4 +- .../src/maskrcnn/fpn_neck.py | 4 +- .../src/maskrcnn/mask_rcnn_r50.py | 4 +- .../src/maskrcnn/proposal_generator.py | 4 +- .../src/maskrcnn/rcnn_cls.py | 6 +- .../src/maskrcnn/rcnn_mask.py | 6 +- .../src/maskrcnn/resnet50.py | 4 +- .../maskrcnn_resnet50/src/maskrcnn/rpn.py | 5 +- .../src/model_utils/moxing_adapter.py | 4 +- .../cv/MaskRCNN/maskrcnn_resnet50/train.py | 9 +- official/cv/MobileNet/mobilenetv1/README.md | 2 +- official/cv/MobileNet/mobilenetv1/eval.py | 14 +- official/cv/MobileNet/mobilenetv1/export.py | 12 +- .../mobilenetv1/src/CrossEntropySmooth.py | 6 +- .../cv/MobileNet/mobilenetv1/src/dataset.py | 6 +- .../src/model_utils/moxing_adapter.py | 6 +- official/cv/MobileNet/mobilenetv1/train.py | 32 +- official/cv/MobileNet/mobilenetv2/README.md | 4 +- official/cv/MobileNet/mobilenetv2/eval.py | 6 +- official/cv/MobileNet/mobilenetv2/export.py | 8 +- .../golden_stick/quantization/simqat/eval.py | 8 +- .../golden_stick/quantization/simqat/train.py | 20 +- .../cv/MobileNet/mobilenetv2/src/dataset.py | 10 +- .../cv/MobileNet/mobilenetv2/src/metric.py | 6 +- .../MobileNet/mobilenetv2/src/mobilenetV2.py | 14 +- .../mobilenetv2/src/mobilenetV2_fusion.py | 22 +- .../src/model_utils/moxing_adapter.py | 6 +- .../cv/MobileNet/mobilenetv2/src/models.py | 20 +- official/cv/MobileNet/mobilenetv2/train.py | 24 +- official/cv/MobileNet/mobilenetv3/Readme.md | 4 +- official/cv/MobileNet/mobilenetv3/eval.py | 12 +- official/cv/MobileNet/mobilenetv3/export.py | 16 +- .../cv/MobileNet/mobilenetv3/infer_onnx.py | 4 +- .../cv/MobileNet/mobilenetv3/src/dataset.py | 6 +- .../MobileNet/mobilenetv3/src/mobilenetV3.py | 14 +- official/cv/MobileNet/mobilenetv3/train.py | 36 +- official/cv/OCRNet/eval.py | 5 +- official/cv/OCRNet/export.py | 5 +- .../OCRNet/src/model_utils/moxing_adapter.py | 4 +- official/cv/OCRNet/train.py | 9 +- official/cv/OpenPose/eval.py | 5 +- official/cv/OpenPose/export.py | 6 +- official/cv/OpenPose/modelarts/train_start.py | 8 +- official/cv/OpenPose/src/loss.py | 6 +- .../src/model_utils/moxing_adapter.py | 4 +- official/cv/OpenPose/src/openposenet.py | 4 +- official/cv/OpenPose/train.py | 9 +- official/cv/PVNet/eval.py | 3 +- official/cv/PVNet/export.py | 16 +- official/cv/PVNet/modelarts/start_train.py | 11 +- official/cv/PVNet/src/loss_scale.py | 6 +- official/cv/PVNet/train.py | 9 +- official/cv/PointNet/eval.py | 12 +- 
official/cv/PointNet/preprocess.py | 4 +- official/cv/PointNet/src/export.py | 5 +- official/cv/PointNet/src/preprocess.py | 4 +- official/cv/PointNet/train.py | 25 +- official/cv/PointNet2/eval.py | 8 +- official/cv/PointNet2/export.py | 9 +- official/cv/PointNet2/src/pointnet2_utils.py | 24 +- official/cv/PointNet2/train.py | 18 +- official/cv/ResNet/README.md | 2 +- official/cv/ResNet/README_CN.md | 2 +- official/cv/ResNet/eval.py | 14 +- official/cv/ResNet/export.py | 14 +- official/cv/ResNet/fine_tune.py | 12 +- official/cv/ResNet/golden_stick/ghost/eval.py | 14 +- .../cv/ResNet/golden_stick/ghost/train.py | 34 +- .../ResNet/golden_stick/pruner/scop/eval.py | 14 +- .../ResNet/golden_stick/pruner/scop/infer.py | 8 +- .../ResNet/golden_stick/pruner/scop/train.py | 42 +-- .../golden_stick/pruner/uni_pruning/eval.py | 12 +- .../golden_stick/pruner/uni_pruning/export.py | 10 +- .../golden_stick/pruner/uni_pruning/train.py | 34 +- .../golden_stick/quantization/simqat/eval.py | 14 +- .../golden_stick/quantization/simqat/train.py | 42 +-- .../golden_stick/quantization/slb/eval.py | 14 +- .../golden_stick/quantization/slb/train.py | 34 +- official/cv/ResNet/gpu_resnet_benchmark.py | 46 +-- official/cv/ResNet/infer.py | 12 +- .../ResNet/modelarts/ResNet152/train_start.py | 64 ++-- .../modelarts/ResNet18/modelarts_train.py | 54 +-- official/cv/ResNet/predict.py | 24 +- official/cv/ResNet/src/CrossEntropySmooth.py | 6 +- official/cv/ResNet/src/callback.py | 12 +- official/cv/ResNet/src/dataset.py | 10 +- official/cv/ResNet/src/dataset_infer.py | 8 +- official/cv/ResNet/src/metric.py | 6 +- .../ResNet/src/model_utils/moxing_adapter.py | 4 +- official/cv/ResNet/src/momentum.py | 8 +- .../cv/ResNet/src/resnet_gpu_benchmark.py | 4 +- official/cv/ResNet/src/util.py | 12 +- official/cv/ResNet/train.py | 52 +-- official/cv/RetinaFace_ResNet50/eval.py | 8 +- official/cv/RetinaFace_ResNet50/export.py | 8 +- official/cv/RetinaFace_ResNet50/src/loss.py | 24 +- .../cv/RetinaFace_ResNet50/src/network.py | 12 +- official/cv/RetinaFace_ResNet50/train.py | 26 +- official/cv/RetinaNet/eval.py | 5 +- official/cv/RetinaNet/eval_onnx.py | 5 +- official/cv/RetinaNet/export.py | 5 +- .../cv/RetinaNet/modelarts/train_start.py | 4 +- .../src/model_utils/moxing_adapter.py | 4 +- official/cv/RetinaNet/src/retinanet.py | 14 +- official/cv/RetinaNet/train.py | 19 +- official/cv/SSD/README.md | 12 +- official/cv/SSD/eval.py | 8 +- official/cv/SSD/eval_onnx.py | 5 +- official/cv/SSD/export.py | 14 +- .../cv/SSD/infer_ssd_mobilenet_v1_fpn_onnx.py | 5 +- .../cv/SSD/src/model_utils/moxing_adapter.py | 4 +- official/cv/SSD/src/ssd.py | 42 +-- official/cv/SSD/train.py | 48 +-- official/cv/ShuffleNet/shufflenetv1/eval.py | 5 +- official/cv/ShuffleNet/shufflenetv1/export.py | 10 +- .../shufflenetv1/infer_shufflenetv1_onnx.py | 4 +- .../src/model_utils/moxing_adapter.py | 4 +- official/cv/ShuffleNet/shufflenetv1/train.py | 15 +- .../ShuffleNet/shufflenetv2/cpu_transfer.py | 6 +- official/cv/ShuffleNet/shufflenetv2/eval.py | 8 +- official/cv/ShuffleNet/shufflenetv2/export.py | 10 +- .../shufflenetv2/infer_shufflenetv2_onnx.py | 4 +- .../shufflenetv2/modelarts/train_start.py | 15 +- official/cv/ShuffleNet/shufflenetv2/train.py | 16 +- official/cv/SwinTransformer/eval.py | 13 +- official/cv/SwinTransformer/export.py | 8 +- .../cv/SwinTransformer/src/tools/get_misc.py | 20 +- official/cv/SwinTransformer/train.py | 12 +- official/cv/Unet/eval.py | 7 +- official/cv/Unet/export.py | 7 +- .../golden_stick/pruner/uni_pruning/eval.py | 5 +- 
.../golden_stick/pruner/uni_pruning/train.py | 8 +- .../cv/Unet/src/model_utils/moxing_adapter.py | 4 +- official/cv/Unet/train.py | 10 +- official/cv/VGG/vgg16/eval.py | 7 +- official/cv/VGG/vgg16/export.py | 7 +- official/cv/VGG/vgg16/fine_tune.py | 18 +- .../VGG/vgg16/model_utils/moxing_adapter.py | 4 +- official/cv/VGG/vgg16/modelarts/start.py | 18 +- official/cv/VGG/vgg16/src/data_split.py | 4 +- official/cv/VGG/vgg16/src/dataset.py | 4 +- official/cv/VGG/vgg16/train.py | 14 +- official/cv/VGG/vgg19/eval.py | 7 +- official/cv/VGG/vgg19/export.py | 7 +- .../VGG/vgg19/model_utils/moxing_adapter.py | 4 +- .../cv/VGG/vgg19/modelarts/train_modelarts.py | 14 +- official/cv/VGG/vgg19/train.py | 14 +- official/cv/VIT/README.md | 7 +- official/cv/VIT/README_CN.md | 4 +- official/cv/VIT/eval.py | 12 +- official/cv/VIT/export.py | 14 +- official/cv/VIT/modelarts/train_modelarts.py | 22 +- official/cv/VIT/src/cross_entropy.py | 20 +- official/cv/VIT/src/dataset.py | 4 +- official/cv/VIT/src/eval_engine.py | 6 +- official/cv/VIT/src/metric.py | 18 +- .../cv/VIT/src/model_utils/moxing_adapter.py | 5 +- official/cv/VIT/src/optimizer.py | 22 +- official/cv/VIT/src/vit.py | 38 +- official/cv/VIT/train.py | 12 +- official/cv/WGAN/eval.py | 6 +- official/cv/WGAN/export.py | 7 +- official/cv/WGAN/modelarts/start.py | 8 +- official/cv/WGAN/src/cell.py | 2 +- official/cv/WGAN/train.py | 8 +- official/cv/YOLOX/eval.py | 10 +- official/cv/YOLOX/export.py | 10 +- .../cv/YOLOX/model_utils/moxing_adapter.py | 4 +- official/cv/YOLOX/predict.py | 5 +- official/cv/YOLOX/train.py | 22 +- official/cv/YOLOv3/convert_weight.py | 14 +- official/cv/YOLOv3/eval.py | 14 +- official/cv/YOLOv3/eval_onnx.py | 10 +- official/cv/YOLOv3/export.py | 14 +- .../cv/YOLOv3/model_utils/moxing_adapter.py | 5 +- official/cv/YOLOv3/src/initializer.py | 16 +- official/cv/YOLOv3/src/util.py | 8 +- official/cv/YOLOv3/src/yolo.py | 22 +- official/cv/YOLOv3/train.py | 54 +-- official/cv/YOLOv4/eval.py | 10 +- official/cv/YOLOv4/export.py | 6 +- official/cv/YOLOv4/infer/README.md | 4 +- .../cv/YOLOv4/model_utils/moxing_adapter.py | 4 +- official/cv/YOLOv4/modelarts/modelarts.py | 9 +- official/cv/YOLOv4/src/yolo.py | 33 +- official/cv/YOLOv4/test.py | 10 +- official/cv/YOLOv4/train.py | 15 +- official/cv/YOLOv5/eval.py | 15 +- official/cv/YOLOv5/eval_onnx.py | 8 +- official/cv/YOLOv5/export.py | 14 +- .../cv/YOLOv5/model_utils/moxing_adapter.py | 5 +- official/cv/YOLOv5/modelarts/train_start.py | 18 +- official/cv/YOLOv5/src/initializer.py | 16 +- official/cv/YOLOv5/src/util.py | 8 +- official/cv/YOLOv5/src/yolo.py | 26 +- official/cv/YOLOv5/train.py | 18 +- official/nlp/Bert/export.py | 7 +- official/nlp/Bert/modelarts/train_start.py | 13 +- official/nlp/Bert/pretrain_eval.py | 4 +- official/nlp/Bert/quick_start.py | 8 +- official/nlp/Bert/run_classifier.py | 15 +- official/nlp/Bert/run_ner.py | 17 +- official/nlp/Bert/run_pretrain.py | 43 ++- official/nlp/Bert/run_squad.py | 12 +- .../nlp/Bert/src/bert_for_pre_training.py | 11 +- official/nlp/Bert/src/finetune_eval_model.py | 4 +- .../Bert/src/model_utils/moxing_adapter.py | 4 +- official/nlp/Bert_thor/README.md | 6 +- official/nlp/Bert_thor/pretrain_eval.py | 4 +- official/nlp/Bert_thor/run_pretrain.py | 12 +- .../Bert_thor/src/bert_for_pre_training.py | 8 +- official/nlp/GPT/eval.py | 4 +- official/nlp/GPT/src/gpt_wrapcell.py | 8 +- official/nlp/GPT/train.py | 10 +- official/nlp/LSTM/eval.py | 7 +- official/nlp/LSTM/export.py | 7 +- official/nlp/LSTM/modelarts/data_process.py | 4 +- 
official/nlp/LSTM/modelarts/train_start.py | 19 +- .../LSTM/src/model_utils/device_adapter.py | 1 + .../LSTM/src/model_utils/moxing_adapter.py | 4 +- official/nlp/LSTM/train.py | 13 +- official/nlp/Pangu_alpha/predict.py | 17 +- official/nlp/Pangu_alpha/src/callbacks.py | 14 +- official/nlp/Pangu_alpha/src/dataset.py | 6 +- official/nlp/Pangu_alpha/src/metrics.py | 4 +- .../Pangu_alpha/src/pangu_alpha_wrapcell.py | 10 +- official/nlp/Pangu_alpha/src/utils.py | 10 +- official/nlp/Pangu_alpha/train.py | 41 ++- official/nlp/Transformer/eval.py | 18 +- official/nlp/Transformer/eval_onnx.py | 6 +- official/nlp/Transformer/export.py | 10 +- .../nlp/Transformer/mindspore_hub_conf.py | 6 +- .../Transformer/modelarts/train_modelarts.py | 17 +- official/nlp/Transformer/src/beam_search.py | 46 +-- official/nlp/Transformer/src/dataset.py | 4 +- .../src/model_utils/device_adapter.py | 1 + .../src/model_utils/moxing_adapter.py | 5 +- .../Transformer/src/transformer_for_train.py | 40 +-- .../nlp/Transformer/src/transformer_model.py | 72 ++-- official/nlp/Transformer/train.py | 30 +- official/recommend/DeepFM/eval.py | 4 +- official/recommend/DeepFM/export.py | 7 +- official/recommend/DeepFM/modelart/start.py | 26 +- official/recommend/DeepFM/src/deepfm.py | 2 +- .../DeepFM/src/model_utils/device_adapter.py | 1 + .../DeepFM/src/model_utils/moxing_adapter.py | 4 +- official/recommend/DeepFM/train.py | 28 +- official/recommend/Wide_and_Deep/eval.py | 5 +- official/recommend/Wide_and_Deep/export.py | 8 +- .../recommend/Wide_and_Deep/modelart/start.py | 8 +- .../recommend/Wide_and_Deep/src/callbacks.py | 11 +- .../src/model_utils/device_adapter.py | 1 + .../src/model_utils/moxing_adapter.py | 4 +- .../Wide_and_Deep/src/wide_and_deep.py | 19 +- official/recommend/Wide_and_Deep/train.py | 10 +- .../recommend/Wide_and_Deep/train_and_eval.py | 9 +- .../train_and_eval_auto_parallel.py | 24 +- .../train_and_eval_distribute.py | 18 +- ...in_and_eval_parameter_server_distribute.py | 28 +- ...in_and_eval_parameter_server_standalone.py | 14 +- .../Wide_and_Deep/train_distribute.py | 18 +- .../Wide_and_Deep_Multitable/eval.py | 6 +- .../src/wide_and_deep.py | 11 +- .../train_and_eval.py | 6 +- .../train_and_eval_distribute.py | 10 +- research/cv/RepVGG/README.md | 3 +- research/cv/ResNeXt/README.md | 2 +- research/cv/ResidualAttentionNet/README.md | 2 +- research/cv/east/README.md | 2 +- research/cv/eppmvsnet/README.md | 2 +- research/cv/googlenet/README.md | 2 +- research/cv/llnet/README.md | 2 +- research/cv/nasnet/README.md | 2 +- research/cv/osnet/README.md | 2 +- research/cv/pnasnet/README.md | 2 +- research/cv/proxylessnas/README.md | 2 +- research/cv/repvgg/__init__.py | 14 + research/cv/repvgg/eval.py | 113 ++++-- research/cv/repvgg/export.py | 65 ++-- research/cv/repvgg/infer_onnx.py | 128 +++++-- research/cv/repvgg/requriments.txt | 6 + research/cv/repvgg/scripts/run_infer_onnx.sh | 46 ++- research/cv/repvgg/src/tools/__init__.py | 15 + research/cv/repvgg/src/tools/callback.py | 340 ++++++++++++++++-- research/cv/repvgg/src/tools/criterion.py | 23 +- research/cv/repvgg/src/tools/optimizer.py | 42 ++- research/cv/repvgg/src/tools/schedulers.py | 29 +- research/cv/repvgg/train.py | 215 ++++++++--- research/cv/resnetv2_50_frn/README.md | 2 +- research/cv/resnext152_64x4d/README.md | 2 +- research/cv/se_resnext50/README.md | 6 +- research/cv/se_resnext50/README_CN.md | 4 +- research/cv/squeezenet/README.md | 4 +- research/cv/squeezenet1_1/README.md | 4 +- research/cv/ssd_ghostnet/README.md | 4 +- 
research/cv/ssd_inception_v2/README.md | 4 +- research/cv/ssd_mobilenetV2/README.md | 12 +- research/cv/ssd_mobilenetV2_FPNlite/README.md | 8 +- research/cv/ssd_resnet34/README.md | 4 +- research/cv/ssd_resnet50/README.md | 6 +- research/cv/ssd_resnet_34/README.md | 4 +- research/cv/tnt/eval.py | 116 ++---- research/cv/tnt/export.py | 6 +- research/cv/tnt/src/args.py | 47 +-- research/cv/tnt/src/configs/parser.py | 2 +- research/cv/tnt/src/data/__init__.py | 2 +- research/cv/tnt/src/data/augment/__init__.py | 2 +- .../cv/tnt/src/data/augment/auto_augment.py | 58 ++- research/cv/tnt/src/data/augment/mixup.py | 6 +- .../cv/tnt/src/data/augment/random_erasing.py | 2 +- .../tnt/src/data/data_utils/moxing_adapter.py | 2 +- research/cv/tnt/src/data/imagenet.py | 213 +---------- research/cv/tnt/src/models/__init__.py | 5 +- research/cv/tnt/src/models/tnt/__init__.py | 5 +- research/cv/tnt/src/models/tnt/tnt.py | 325 ++++++++--------- research/cv/tnt/src/tools/cell.py | 9 +- research/cv/tnt/src/tools/criterion.py | 21 +- research/cv/tnt/src/tools/get_misc.py | 17 +- research/cv/tnt/src/tools/optimizer.py | 2 +- research/cv/tnt/src/tools/schedulers.py | 2 +- research/cv/tnt/src/trainers/__init__.py | 2 +- ...ne_step_with_scale_and_clip_global_norm.py | 2 +- research/cv/tnt/train.py | 83 ++--- research/nlp/tprr/README.md | 4 +- research/nlp/transX/README.md | 2 +- 476 files changed, 3591 insertions(+), 3216 deletions(-) diff --git a/.jenkins/check/config/filter_linklint.txt b/.jenkins/check/config/filter_linklint.txt index e46649dcb..8f5359c3c 100644 --- a/.jenkins/check/config/filter_linklint.txt +++ b/.jenkins/check/config/filter_linklint.txt @@ -7,4 +7,11 @@ http://vllab.ucmerced.edu/wlai24/LapSRN/results/* https://github.com/tensorflow/tpu/blob/master/models/official/efficientnet/efficientnet_model.pyPaper https://www.mindspore.cn/install/en https://www.mindspore.cn/resources/hub/details/en?mindspore/1.3/resnest50_imagenet2012 -https://mindspore.cn/resources/hub/details/en?MindSpore/ascend/1.2/mobilenetv2_v1.2_imagenet2012 \ No newline at end of file +https://mindspore.cn/resources/hub/details/en?MindSpore/ascend/1.2/mobilenetv2_v1.2_imagenet2012 +https://arxiv.org/abs/1909.13719AugMix +https://arxiv.org/abs/1805.09501Learning +https://arxiv.org/abs/1805.09501policy +https://arxiv.org/abs/1906.11172RandAugment +https://arxiv.org/abs/2104.00298Acc +https://github.com/google/automl/tree/master/efficientnetv2paper +https://github.com/google-research/augmix/blob/master/imagenet.pyFrom \ No newline at end of file diff --git a/official/audio/DeepSpeech2/eval.py b/official/audio/DeepSpeech2/eval.py index 166f0379a..4f6cbd052 100644 --- a/official/audio/DeepSpeech2/eval.py +++ b/official/audio/DeepSpeech2/eval.py @@ -24,7 +24,7 @@ from src.config import eval_config from src.deepspeech2 import DeepSpeechModel, PredictWithSoftmax from src.dataset import create_dataset from src.greedydecoder import MSGreedyDecoder -from mindspore import context +import mindspore from mindspore.train.serialization import load_checkpoint, load_param_into_net parser = argparse.ArgumentParser(description='DeepSpeech evaluation') @@ -36,7 +36,7 @@ parser.add_argument('--device_target', type=str, default="GPU", choices=("GPU", args = parser.parse_args() if __name__ == '__main__': - context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, save_graphs=False) + mindspore.set_context(mode=0, device_target=args.device_target, save_graphs=False) config = eval_config with open(config.DataConfig.labels_path) as 
label_file: labels = json.load(label_file) diff --git a/official/audio/DeepSpeech2/export.py b/official/audio/DeepSpeech2/export.py index 1b914ba1e..b9b704f1c 100644 --- a/official/audio/DeepSpeech2/export.py +++ b/official/audio/DeepSpeech2/export.py @@ -18,7 +18,8 @@ export checkpoint file to mindir model import json import argparse import numpy as np -from mindspore import context, Tensor +import mindspore +from mindspore import Tensor from mindspore.train.serialization import load_checkpoint, load_param_into_net, export from src.deepspeech2 import DeepSpeechModel from src.config import train_config @@ -33,7 +34,7 @@ args = parser.parse_args() if __name__ == '__main__': config = train_config - context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, save_graphs=False) + mindspore.set_context(mode=0, device_target=args.device_target, save_graphs=False) with open(config.DataConfig.labels_path) as label_file: labels = json.load(label_file) diff --git a/official/audio/DeepSpeech2/quick_start.py b/official/audio/DeepSpeech2/quick_start.py index 00b53b9b7..7bd0585a2 100644 --- a/official/audio/DeepSpeech2/quick_start.py +++ b/official/audio/DeepSpeech2/quick_start.py @@ -20,7 +20,7 @@ from src.qs_config import quickstart_config from src.deepspeech2 import DeepSpeechModel, PredictWithSoftmax from src.dataset import create_dataset from src.greedydecoder import MSGreedyDecoder -from mindspore import context +import mindspore from mindspore.train.serialization import load_checkpoint, load_param_into_net parser = argparse.ArgumentParser(description='DeepSpeech evaluation') @@ -32,7 +32,7 @@ parser.add_argument('--device_target', type=str, default="CPU", choices=("GPU", args = parser.parse_args() if __name__ == '__main__': - context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, save_graphs=False) + mindspore.set_context(mode=0, device_target=args.device_target, save_graphs=False) config = quickstart_config with open(config.DataConfig.labels_path) as label_file: labels = json.load(label_file) diff --git a/official/audio/DeepSpeech2/train.py b/official/audio/DeepSpeech2/train.py index 014a4e089..4c01d3772 100644 --- a/official/audio/DeepSpeech2/train.py +++ b/official/audio/DeepSpeech2/train.py @@ -18,9 +18,10 @@ import argparse import json import os -from mindspore import context, Tensor, ParameterTuple +import mindspore +from mindspore import Tensor, ParameterTuple from mindspore.communication.management import init, get_rank, get_group_size -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.nn import TrainOneStepCell from mindspore.nn.optim import Adam from mindspore.train import Model @@ -47,15 +48,15 @@ if __name__ == '__main__': group_size = 1 config = train_config data_sink = (args.device_target != "CPU") - context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, save_graphs=False) + mindspore.set_context(mode=0, device_target=args.device_target, save_graphs=False) if args.device_target == "GPU": - context.set_context(enable_graph_kernel=True) + mindspore.set_context(enable_graph_kernel=True) if args.is_distributed: init() rank_id = get_rank() group_size = get_group_size() - context.reset_auto_parallel_context() - context.set_auto_parallel_context(device_num=get_group_size(), parallel_mode=ParallelMode.DATA_PARALLEL, + mindspore.reset_auto_parallel_context() + mindspore.set_auto_parallel_context(device_num=get_group_size(), parallel_mode=ParallelMode.DATA_PARALLEL, 
gradients_mean=True) with open(config.DataConfig.labels_path) as label_file: diff --git a/official/audio/EcapaTDNN/eval.py b/official/audio/EcapaTDNN/eval.py index a3d8fe83a..4d24d41e1 100644 --- a/official/audio/EcapaTDNN/eval.py +++ b/official/audio/EcapaTDNN/eval.py @@ -21,14 +21,15 @@ import pickle import numpy as np from scipy.spatial.distance import cosine from sklearn.metrics.pairwise import cosine_similarity +import mindspore from mindspore import Tensor -from mindspore import context, load_checkpoint, load_param_into_net +from mindspore import load_checkpoint, load_param_into_net from src.ecapa_tdnn import ECAPA_TDNN from src.reader import DatasetGenerator from src.metrics import get_EER_from_scores from src.model_utils.config import config as hparams -context.set_context(mode=context.GRAPH_MODE, device_target="Ascend") +mindspore.set_context(mode=0, device_target="Ascend") excluded_set = {2302, 2303, 2304, 2305, 2306, 2307, 2308, 2309, 2310, 2311, 2312, 2313, 2314, 2315, 2316, 2317, 2318, 2319, 2320, 2321, 2322, 2323, 2324, 2325, 2326, 2327, 2328, 2329, @@ -191,7 +192,7 @@ def compute_embeddings(embedder, dataloader, startidx=0, dur=50000, exc_set=None if __name__ == "__main__": - context.set_context(device_id=hparams.device_id) + mindspore.set_context(device_id=hparams.device_id) in_channels = hparams.in_channels channels = hparams.channels emb_size = hparams.emb_size diff --git a/official/audio/EcapaTDNN/export.py b/official/audio/EcapaTDNN/export.py index 8c6002a41..5dba563c0 100644 --- a/official/audio/EcapaTDNN/export.py +++ b/official/audio/EcapaTDNN/export.py @@ -20,8 +20,8 @@ import os import sys from hyperpyyaml import load_hyperpyyaml import numpy as np -import mindspore as ms -from mindspore import Tensor, load_checkpoint, load_param_into_net, export, context +import mindspore +from mindspore import Tensor, load_checkpoint, load_param_into_net, export from src.ecapa_tdnn import ECAPA_TDNN def modelarts_pre_process(): @@ -29,7 +29,7 @@ def modelarts_pre_process(): config.file_name = os.path.join(config.output_path, config.file_name) def run_export(hparams): - context.set_context(mode=context.GRAPH_MODE, device_target="Ascend") + mindspore.set_context(mode=0, device_target="Ascend") in_channels = hparams["in_channels"] channels = hparams["channels"] @@ -43,7 +43,7 @@ def run_export(hparams): load_param_into_net(net, param_dict) file_name = hparams["file_name"] file_format = hparams["file_format"] - input_arr = Tensor(np.ones([1, hparams["length"], hparams["channel"]]), ms.float32) + input_arr = Tensor(np.ones([1, hparams["length"], hparams["channel"]]), mindspore.float32) export(net, input_arr, file_name=file_name, file_format=file_format) if __name__ == '__main__': diff --git a/official/audio/EcapaTDNN/modelart/ecapatdnn-modelart.py b/official/audio/EcapaTDNN/modelart/ecapatdnn-modelart.py index 6901c07eb..a35462e96 100644 --- a/official/audio/EcapaTDNN/modelart/ecapatdnn-modelart.py +++ b/official/audio/EcapaTDNN/modelart/ecapatdnn-modelart.py @@ -22,17 +22,16 @@ import ast from datetime import datetime import math import numpy as np -import mindspore as ms +import mindspore import mindspore.nn as nn -from mindspore import Tensor +from mindspore import Tensor, ParallelMode import mindspore.dataset as ds import mindspore.ops as ops from mindspore.nn import FixedLossScaleUpdateCell -from mindspore import context, load_checkpoint, load_param_into_net, export +from mindspore import load_checkpoint, load_param_into_net, export from mindspore.train.callback import 
ModelCheckpoint from mindspore.train.callback import CheckpointConfig from mindspore.train.callback import RunContext, _InternalCallbackParam -from mindspore.context import ParallelMode from mindspore.communication.management import init, get_rank, get_group_size from src.ecapa_tdnn import ECAPA_TDNN, Classifier from src.reader import DatasetGeneratorBatch as DatasetGenerator @@ -49,7 +48,7 @@ args, unknown = parser.parse_known_args() def save_ckpt_to_air(save_ckpt_path, path): - context.set_context(mode=context.GRAPH_MODE, device_target="Ascend") + mindspore.set_context(mode=0, device_target="Ascend") in_channels = 80 channels = 1024 @@ -60,7 +59,7 @@ def save_ckpt_to_air(save_ckpt_path, path): # assert config.ckpt_file is not None, "config.ckpt_file is None." param_dict = load_checkpoint(path) load_param_into_net(net, param_dict) - input_arr = Tensor(np.ones([1, 301, 80]), ms.float32) + input_arr = Tensor(np.ones([1, 301, 80]), mindspore.float32) export(net, input_arr, file_name=save_ckpt_path+'ecapatdnn', file_format="AIR") @@ -87,12 +86,12 @@ def create_dataset(cfg, data_home, shuffle=False): class CorrectLabelNum(nn.Cell): def __init__(self): super(CorrectLabelNum, self).__init__() - self.argmax = ms.ops.Argmax(axis=1) - self.sum = ms.ops.ReduceSum() + self.argmax = ops.Argmax(axis=1) + self.sum = ops.ReduceSum() def construct(self, output, target): output = self.argmax(output) - correct = self.sum((output == target).astype(ms.dtype.float32)) + correct = self.sum((output == target).astype(mindspore.float32)) return correct @@ -105,7 +104,7 @@ class BuildTrainNetwork(nn.Cell): self.criterion = my_criterion self.lossfunc = lossfunc # Initialize self.output - self.output = ms.Parameter(Tensor(np.ones((train_batch_size, class_num_)), ms.float32), requires_grad=False) + self.output = mindspore.Parameter(Tensor(np.ones((train_batch_size, class_num_)), mindspore.float32), requires_grad=False) self.depth = class_num_ def construct(self, input_data, label): @@ -219,17 +218,17 @@ def train(): # init distributed if hparams.run_distribute: device_id = int(os.getenv('DEVICE_ID', '0')) - context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=device_id) + mindspore.set_context(mode=0, device_target="Ascend", device_id=device_id) init() hparams.rank = get_rank() hparams.group_size = get_group_size() - context.reset_auto_parallel_context() - context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, device_num=8, + mindspore.reset_auto_parallel_context() + mindspore.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, device_num=8, parameter_broadcast=True) else: hparams.rank = 0 hparams.group_size = 1 - context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=hparams.device_id) + mindspore.set_context(mode=0, device_target="Ascend", device_id=hparams.device_id) data_dir = args.data_url in_channels = hparams.in_channels channels = hparams.channels diff --git a/official/audio/EcapaTDNN/src/ecapa_tdnn.py b/official/audio/EcapaTDNN/src/ecapa_tdnn.py index f3f39137c..3a01234e8 100644 --- a/official/audio/EcapaTDNN/src/ecapa_tdnn.py +++ b/official/audio/EcapaTDNN/src/ecapa_tdnn.py @@ -14,13 +14,13 @@ # ============================================================================ import numpy as np -import mindspore as ms +import mindspore import mindspore.nn as nn import mindspore.ops as ops from mindspore import Tensor from mindspore.common.initializer import initializer, XavierUniform 
-ms.set_seed(0) +mindspore.set_seed(0) class MyBatchNorm1d(nn.Cell): def __init__( @@ -94,8 +94,8 @@ class Res2NetBlock(nn.Cell): ] ) self.scale = scale - self.cat = ms.ops.Concat(axis=1) - self.split = ms.ops.Split(1, scale) + self.cat = ops.Concat(axis=1) + self.split = ops.Split(1, scale) self.print = ops.operations.Print() def construct(self, x): y = [] @@ -131,12 +131,12 @@ class SEBlock(nn.Cell): in_channels=in_channels, out_channels=se_channels, kernel_size=1, has_bias=True, weight_init='he_uniform', bias_init='truncatedNormal' ) - self.relu = ms.nn.ReLU() - self.conv2 = ms.nn.Conv1d( + self.relu = nn.ReLU() + self.conv2 = nn.Conv1d( in_channels=se_channels, out_channels=out_channels, kernel_size=1, has_bias=True, weight_init='he_uniform', bias_init='truncatedNormal' ) - self.sigmoid = ms.nn.Sigmoid() + self.sigmoid = nn.Sigmoid() self.print = ops.operations.Print() def construct(self, x, lengths=None): s = x.mean((2), True) @@ -202,7 +202,7 @@ class SERes2NetBlock(nn.Cell): self.shortcut = None if in_channels != out_channels: - self.shortcut = Conv1d( + self.shortcut = nn.Conv1d( in_channels=in_channels, out_channels=out_channels, kernel_size=1, @@ -247,14 +247,14 @@ class AttentiveStatisticsPooling(nn.Cell): in_channels=attention_channels, out_channels=channels, kernel_size=1, has_bias=bias, weight_init='he_uniform', bias_init='truncatedNormal' ) - self.sqrt = ms.ops.Sqrt() - self.pow = ms.ops.Pow() - self.expandDim = ms.ops.ExpandDims() - self.softmax = ms.ops.Softmax(axis=2) - self.cat = ms.ops.Concat(axis=1) + self.sqrt = ops.Sqrt() + self.pow = ops.Pow() + self.expandDim = ops.ExpandDims() + self.softmax = ops.Softmax(axis=2) + self.cat = ops.Concat(axis=1) self.print = ops.operations.Print() - self.ones = ms.ops.Ones() - self.tile = ms.ops.Tile() + self.ones = ops.Ones() + self.tile = ops.Tile() def construct(self, x, lengths=None): def _compute_statistics(x, m, dim=2, eps=self.eps): mean = (m * x).sum(dim) @@ -372,11 +372,11 @@ class ECAPA_TDNN(nn.Cell): weight_init='he_uniform', bias_init='truncatedNormal' ) - self.expandDim = ms.ops.ExpandDims() - self.softmax = ms.ops.Softmax(axis=2) - self.cat = ms.ops.Concat(axis=1) + self.expandDim = ops.ExpandDims() + self.softmax = ops.Softmax(axis=2) + self.cat = ops.Concat(axis=1) self.print = ops.operations.Print() - self.transpose = ms.ops.Transpose() + self.transpose = ops.Transpose() def construct(self, x, lengths=None): # Minimize transpose for efficiency @@ -438,14 +438,14 @@ class Classifier(nn.Cell): input_size = lin_neurons input_size = lin_neurons # Final Layer - tensor1 = initializer(XavierUniform(), [out_neurons, input_size], ms.float32) - self.weight = ms.Parameter( + tensor1 = initializer(XavierUniform(), [out_neurons, input_size], mindspore.float32) + self.weight = mindspore.Parameter( tensor1 ) - self.norm = ms.ops.L2Normalize(axis=1) + self.norm = ops.L2Normalize(axis=1) self.print = ops.operations.Print() - self.matmul = ms.ops.MatMul() - self.expand_dims = ms.ops.ExpandDims() + self.matmul = ops.MatMul() + self.expand_dims = ops.ExpandDims() def construct(self, x): """Returns the output probabilities over speakers. 
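The `Classifier` hunk above keeps a cosine-scoring head: `ops.L2Normalize(axis=1)` is applied before the matmul, so the logits fed to the additive-angular-margin loss are cosine similarities rather than raw dot products. A NumPy sketch of that scoring rule (the 192-dim embedding matches `lin_neurons` in the diff; the class count and the normalize-both-sides detail are assumptions for illustration):

```python
# Illustrative cosine scoring as used by margin-based speaker classifiers.
import numpy as np

emb = np.random.randn(1, 192).astype(np.float32)        # one speaker embedding
weight = np.random.randn(1000, 192).astype(np.float32)  # hypothetical class weights

emb = emb / np.linalg.norm(emb, axis=1, keepdims=True)
weight = weight / np.linalg.norm(weight, axis=1, keepdims=True)
scores = emb @ weight.T  # cosine similarity per class, each in [-1, 1]
```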
@@ -461,7 +461,7 @@ class Classifier(nn.Cell): return output if __name__ == '__main__': - input_feats = Tensor(np.ones([1, 32, 60]), ms.float32) + input_feats = Tensor(np.ones([1, 32, 60]), mindspore.float32) compute_embedding = ECAPA_TDNN(32, channels=[256, 256, 256, 256, 768], lin_neurons=192) outputs = compute_embedding(input_feats) print(outputs.shape_) diff --git a/official/audio/EcapaTDNN/src/model_utils/moxing_adapter.py b/official/audio/EcapaTDNN/src/model_utils/moxing_adapter.py index 7d40450e9..df07d4816 100644 --- a/official/audio/EcapaTDNN/src/model_utils/moxing_adapter.py +++ b/official/audio/EcapaTDNN/src/model_utils/moxing_adapter.py @@ -17,7 +17,7 @@ import os import functools -from mindspore import context +import mindspore from src.model_utils.config import config _global_sync_count = 0 @@ -92,7 +92,7 @@ def moxing_wrapper(pre_process=None, post_process=None): sync_data(config.train_url, config.output_path) print("Workspace downloaded: ", os.listdir(config.output_path)) - context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) + mindspore.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) config.device_num = get_device_num() config.device_id = get_device_id() if not os.path.exists(config.output_path): diff --git a/official/audio/EcapaTDNN/src/util.py b/official/audio/EcapaTDNN/src/util.py index c02e83117..b321b978d 100644 --- a/official/audio/EcapaTDNN/src/util.py +++ b/official/audio/EcapaTDNN/src/util.py @@ -14,7 +14,7 @@ # ============================================================================ import math -import mindspore as ms +import mindspore import mindspore.nn as nn import mindspore.numpy as np @@ -48,8 +48,8 @@ class AdditiveAngularMargin(nn.Cell): self.sin_m = math.sin(self.margin) self.th = math.cos(math.pi - self.margin) self.mm = math.sin(math.pi - self.margin) * self.margin - self.sqrt = ms.ops.Sqrt() - self.pow = ms.ops.Pow() + self.sqrt = mindspore.ops.Sqrt() + self.pow = mindspore.ops.Pow() def construct(self, outputs, targets): """ diff --git a/official/audio/EcapaTDNN/train.py b/official/audio/EcapaTDNN/train.py index 7cd48094b..56117e3a0 100644 --- a/official/audio/EcapaTDNN/train.py +++ b/official/audio/EcapaTDNN/train.py @@ -20,17 +20,17 @@ import time from datetime import datetime import math import numpy as np -import mindspore as ms +import mindspore import mindspore.nn as nn from mindspore import Tensor import mindspore.dataset as ds import mindspore.ops as ops from mindspore.nn import FixedLossScaleUpdateCell -from mindspore import context, load_checkpoint, load_param_into_net +from mindspore import load_checkpoint, load_param_into_net from mindspore.train.callback import ModelCheckpoint from mindspore.train.callback import CheckpointConfig from mindspore.train.callback import RunContext, _InternalCallbackParam -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.communication.management import init, get_rank, get_group_size from src.ecapa_tdnn import ECAPA_TDNN, Classifier from src.reader import DatasetGeneratorBatch as DatasetGenerator @@ -63,12 +63,12 @@ def create_dataset(cfg, data_home, shuffle=False): class CorrectLabelNum(nn.Cell): def __init__(self): super(CorrectLabelNum, self).__init__() - self.argmax = ms.ops.Argmax(axis=1) - self.sum = ms.ops.ReduceSum() + self.argmax = ops.Argmax(axis=1) + self.sum = ops.ReduceSum() def construct(self, output, target): output = self.argmax(output) - correct = self.sum((output == 
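Many of the hunks above only swap the `ms.ops`/`ms.nn` aliases for the already-imported `ops` and `nn` modules; the primitives themselves are unchanged. A self-contained sketch of the corrected-label counter these files define, with illustrative shapes not taken from the patch:

```python
import numpy as np
import mindspore
import mindspore.ops as ops

argmax = ops.Argmax(axis=1)        # was: ms.ops.Argmax(axis=1)
reduce_sum = ops.ReduceSum()       # was: ms.ops.ReduceSum()

logits = mindspore.Tensor(np.eye(4), mindspore.float32)   # 4 samples, 4 classes
labels = mindspore.Tensor(np.arange(4), mindspore.int32)
correct = reduce_sum((argmax(logits) == labels).astype(mindspore.float32))
print(correct)  # 4.0: every row's argmax matches its label
```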
diff --git a/official/audio/EcapaTDNN/train.py b/official/audio/EcapaTDNN/train.py
index 7cd48094b..56117e3a0 100644
--- a/official/audio/EcapaTDNN/train.py
+++ b/official/audio/EcapaTDNN/train.py
@@ -20,17 +20,17 @@ import time
 from datetime import datetime
 import math
 import numpy as np
-import mindspore as ms
+import mindspore
 import mindspore.nn as nn
 from mindspore import Tensor
 import mindspore.dataset as ds
 import mindspore.ops as ops
 from mindspore.nn import FixedLossScaleUpdateCell
-from mindspore import context, load_checkpoint, load_param_into_net
+from mindspore import load_checkpoint, load_param_into_net
 from mindspore.train.callback import ModelCheckpoint
 from mindspore.train.callback import CheckpointConfig
 from mindspore.train.callback import RunContext, _InternalCallbackParam
-from mindspore.context import ParallelMode
+from mindspore import ParallelMode
 from mindspore.communication.management import init, get_rank, get_group_size
 from src.ecapa_tdnn import ECAPA_TDNN, Classifier
 from src.reader import DatasetGeneratorBatch as DatasetGenerator
@@ -63,12 +63,12 @@ def create_dataset(cfg, data_home, shuffle=False):
 class CorrectLabelNum(nn.Cell):
     def __init__(self):
         super(CorrectLabelNum, self).__init__()
-        self.argmax = ms.ops.Argmax(axis=1)
-        self.sum = ms.ops.ReduceSum()
+        self.argmax = ops.Argmax(axis=1)
+        self.sum = ops.ReduceSum()

     def construct(self, output, target):
         output = self.argmax(output)
-        correct = self.sum((output == target).astype(ms.dtype.float32))
+        correct = self.sum((output == target).astype(mindspore.float32))
         return correct
@@ -81,7 +81,7 @@ class BuildTrainNetwork(nn.Cell):
         self.criterion = my_criterion
         self.lossfunc = lossfunc
         # Initialize self.output
-        self.output = ms.Parameter(Tensor(np.ones((train_batch_size, class_num_)), ms.float32), requires_grad=False)
+        self.output = mindspore.Parameter(Tensor(np.ones((train_batch_size, class_num_)), mindspore.float32), requires_grad=False)
         self.depth = class_num_

     def construct(self, input_data, label):
@@ -191,17 +191,17 @@ def train():
     # init distributed
     if hparams.run_distribute:
         device_id = int(os.getenv('DEVICE_ID', '0'))
-        context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=device_id)
+        mindspore.set_context(mode=0, device_target="Ascend", device_id=device_id)
         init()
         hparams.rank = get_rank()
         hparams.group_size = get_group_size()
-        context.reset_auto_parallel_context()
-        context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, device_num=8,
+        mindspore.reset_auto_parallel_context()
+        mindspore.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, device_num=8,
                                             parameter_broadcast=True)
     else:
         hparams.rank = 0
         hparams.group_size = 1
-        context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=hparams.device_id)
+        mindspore.set_context(mode=0, device_target="Ascend", device_id=hparams.device_id)
     data_dir = hparams.train_data_path
     in_channels = hparams.in_channels
     channels = hparams.channels
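The distributed branch above reappears nearly verbatim in the MELGAN, Tacotron2, CRNN, and CTPN training scripts later in this patch. A condensed sketch of the setup it converges on (a hypothetical standalone snippet; `init()` presupposes a properly launched multi-device Ascend job):

```python
# Hedged sketch of the post-migration data-parallel setup, assuming MindSpore 2.x.
import mindspore
from mindspore import ParallelMode  # was: from mindspore.context import ParallelMode
from mindspore.communication.management import init, get_rank, get_group_size

mindspore.set_context(mode=0, device_target="Ascend")  # 0 == GRAPH_MODE
init()                                                 # needs a launched distributed job
rank, group_size = get_rank(), get_group_size()
mindspore.reset_auto_parallel_context()
mindspore.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL,
                                    gradients_mean=True, device_num=group_size)
```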
diff --git a/official/audio/LPCNet/eval.py b/official/audio/LPCNet/eval.py
index 67006a9e7..1ac5a4645 100644
--- a/official/audio/LPCNet/eval.py
+++ b/official/audio/LPCNet/eval.py
@@ -19,7 +19,7 @@ from pathlib import Path
 import numpy as np
 import mindspore
 import mindspore.numpy as mnp
-from mindspore import context, load_checkpoint
+from mindspore import load_checkpoint
 from src import lpcnet
 from src.ulaw import lin2ulaw, ulaw2lin

@@ -92,7 +92,7 @@ if __name__ == "__main__":
     device_id = args.device_id

     # NOTE: fails without max_call_depth due to RNN
-    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend",
+    mindspore.set_context(mode=0, device_target="Ascend",
                         max_call_depth=5000, device_id=device_id)

     _model = lpcnet.WithLossLPCNet()
diff --git a/official/audio/LPCNet/export.py b/official/audio/LPCNet/export.py
index 9fe594d11..d19682a72 100644
--- a/official/audio/LPCNet/export.py
+++ b/official/audio/LPCNet/export.py
@@ -17,7 +17,7 @@ from argparse import ArgumentParser
 import numpy as np
 import mindspore
-from mindspore import context, export, load_checkpoint
+from mindspore import export, load_checkpoint
 from src import lpcnet

@@ -40,7 +40,7 @@ if __name__ == "__main__":
         f.write(f"#define MAXLEN {args.max_len}")

     # NOTE: fails without max_call_depth due to RNN
-    context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target,
+    mindspore.set_context(mode=0, device_target=args.device_target,
                         max_call_depth=30000, device_id=args.device_id)

     model = lpcnet.WithLossLPCNet()
diff --git a/official/audio/LPCNet/src/rnns/rnns.py b/official/audio/LPCNet/src/rnns/rnns.py
index f12b6022e..8b973a269 100644
--- a/official/audio/LPCNet/src/rnns/rnns.py
+++ b/official/audio/LPCNet/src/rnns/rnns.py
@@ -16,6 +16,7 @@
 '''RNN operators module, include RNN, GRU, LSTM'''
 import math
 import numpy as np
+import mindspore
 import mindspore.nn as nn
 import mindspore.ops as P
 import mindspore.numpy as mnp
@@ -23,7 +24,6 @@
 from mindspore.common import dtype as mstype
 from mindspore.ops.primitive import constexpr
 from mindspore import Tensor, Parameter, ParameterTuple
 from mindspore import log as logger
-from mindspore import context
 from .rnn_cells import rnn_relu_cell, rnn_tanh_cell, gru_cell, lstm_cell

 @constexpr
@@ -248,7 +248,7 @@ class _RNNBase(nn.Cell):
     def __init__(self, mode, input_size, hidden_size, num_layers=1, has_bias=True,
                  batch_first=False, dropout=0, bidirectional=False):
         super().__init__()
-        is_ascend = context.get_context("device_target") == "Ascend"
+        is_ascend = mindspore.get_context("device_target") == "Ascend"
         if not 0 <= dropout <= 1:
             raise ValueError("dropout should be a number in range [0, 1] "
                              "representing the probability of an element being "
diff --git a/official/audio/LPCNet/train.py b/official/audio/LPCNet/train.py
index 54503dedb..a23b772dd 100644
--- a/official/audio/LPCNet/train.py
+++ b/official/audio/LPCNet/train.py
@@ -19,7 +19,7 @@ from pathlib import Path
 import mindspore
 import mindspore.dataset as ds
 import mindspore.numpy as np
-from mindspore import Model, context, nn, ops
+from mindspore import Model, nn, ops
 from mindspore.train.callback import (CheckpointConfig, LossMonitor,
                                       ModelCheckpoint, TimeMonitor)

@@ -117,7 +117,7 @@ if __name__ == "__main__":
     if retrain:
         input_model = args.retrain

-    context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target,
+    mindspore.set_context(mode=0, device_target=args.device_target,
                         max_call_depth=5000)  # NOTE: fails without max_call_depth due to RNN

     ds.config.set_prefetch_size(16)
diff --git a/official/audio/LPCNet/train_lpcnet_parallel.py b/official/audio/LPCNet/train_lpcnet_parallel.py
index a0b8f09c7..7cebfc0fa 100644
--- a/official/audio/LPCNet/train_lpcnet_parallel.py
+++ b/official/audio/LPCNet/train_lpcnet_parallel.py
@@ -20,9 +20,8 @@ from pathlib import Path
 import mindspore
 import mindspore.dataset as ds
 import mindspore.numpy as np
-from mindspore import Model, context, nn, ops
+from mindspore import Model, nn, ops, ParallelMode
 from mindspore.communication import get_group_size, init
-from mindspore.context import ParallelMode
 from mindspore.train.callback import (CheckpointConfig, LossMonitor,
                                       ModelCheckpoint, TimeMonitor)

@@ -120,12 +119,12 @@ if __name__ == "__main__":
         input_model = args.retrain

     device_id = int(os.getenv('DEVICE_ID'))
-    context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target,
+    mindspore.set_context(mode=0, device_target=args.device_target,
                         max_call_depth=5000)  # NOTE: fails without max_call_depth due to RNN
-    context.set_context(device_id=device_id)
+    mindspore.set_context(device_id=device_id)
     init()
-    context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL,
+    mindspore.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL,
                                       gradients_mean=False,
                                       parameter_broadcast=True)
     ds.config.set_prefetch_size(16)
diff --git a/official/audio/MELGAN/eval.py b/official/audio/MELGAN/eval.py
index 7ac894034..033bee0df 100644
--- a/official/audio/MELGAN/eval.py
+++ b/official/audio/MELGAN/eval.py
@@ -17,19 +17,19 @@ import os

 import numpy as np
 from scipy.io.wavfile import write
+import mindspore
 from mindspore import Model
 from mindspore.train.serialization import load_checkpoint, load_param_into_net
 from mindspore.common.tensor import Tensor
-import mindspore.context as context

 from src.model import Generator
 from src.model_utils.config import config as cfg

-context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
+mindspore.set_context(mode=0, device_target="Ascend")

 if __name__ == '__main__':
-    context.set_context(device_id=cfg.device_id)
+    mindspore.set_context(device_id=cfg.device_id)

     if not os.path.exists(cfg.output_path):
         os.mkdir(cfg.output_path)
diff --git a/official/audio/MELGAN/export.py b/official/audio/MELGAN/export.py
index 0ae2e1a9f..dc5aea4de 100644
--- a/official/audio/MELGAN/export.py
+++ b/official/audio/MELGAN/export.py
@@ -16,7 +16,7 @@
 import argparse

 import numpy as np
-import mindspore as ms
+import mindspore
 from mindspore import Tensor
 from mindspore.train.serialization import load_checkpoint, load_param_into_net, export

@@ -35,5 +35,5 @@ if __name__ == '__main__':
     param_dict = load_checkpoint(args_opt.checkpoint_path)
     load_param_into_net(net, param_dict)

-    input_arr = Tensor(np.random.uniform(0.0, 1.0, size=[1, 80, 240]), ms.float32)
+    input_arr = Tensor(np.random.uniform(0.0, 1.0, size=[1, 80, 240]), mindspore.float32)
     export(net, input_arr, file_name=args_opt.model_name, file_format=args_opt.format)
diff --git a/official/audio/MELGAN/modelarts/train_modelarts.py b/official/audio/MELGAN/modelarts/train_modelarts.py
index 681db402e..3a13b099e 100644
--- a/official/audio/MELGAN/modelarts/train_modelarts.py
+++ b/official/audio/MELGAN/modelarts/train_modelarts.py
@@ -17,15 +17,14 @@ import os
 import time

 import numpy as np
-import mindspore as ms
+import mindspore
 import mindspore.common.dtype as mstype
-import mindspore.context as context
 import mindspore.dataset as de
 import mindspore.nn as nn
 from mindspore.common import set_seed
 from mindspore.common.tensor import Tensor
 from mindspore.communication.management import init, get_rank, get_group_size
-from mindspore.context import ParallelMode
+from mindspore import ParallelMode
 from mindspore.train.callback import RunContext, ModelCheckpoint, CheckpointConfig, _InternalCallbackParam
 from mindspore.train.loss_scale_manager import DynamicLossScaleManager
 from mindspore.train.serialization import load_checkpoint, load_param_into_net, export
@@ -74,17 +73,17 @@ def train():
     # init distributed
     if cfg.run_distribute:
         device_id = int(os.getenv('DEVICE_ID', '0'))
-        context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=device_id)
+        mindspore.set_context(mode=0, device_target="Ascend", device_id=device_id)
         init()
         cfg.rank = get_rank()
         cfg.group_size = get_group_size()
-        context.reset_auto_parallel_context()
-        context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, device_num=8,
+        mindspore.reset_auto_parallel_context()
+        mindspore.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, device_num=8,
                                             parameter_broadcast=True)
     else:
         cfg.rank = 0
         cfg.group_size = 1
-        context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=cfg.device_id)
+        mindspore.set_context(mode=0, device_target="Ascend", device_id=cfg.device_id)
     # get network and init
     net_D = MultiDiscriminator()
     net_G = Generator(alpha=cfg.leaky_alpha)
@@ -169,7 +168,7 @@ def train():
     duration = time.perf_counter() - epoch_t
     print('finish in {:.2f}mins'.format(duration / 60))

-    input_arr = Tensor(np.random.uniform(0.0, 1.0, size=[1, 80, 240]), ms.float32)
+    input_arr = Tensor(np.random.uniform(0.0, 1.0, size=[1, 80, 240]), mindspore.float32)
     export(net_G, input_arr, file_name=os.path.join(cfg.train_url, 'melgan_final'), file_format="AIR")
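The MELGAN entry points above all end by tracing a dummy input through the network and serializing the graph. A runnable stand-in for that export call (the `nn.Conv1d` here is an illustrative substitute for the real `Generator`, and MINDIR is used because AIR additionally requires an Ascend backend):

```python
import numpy as np
import mindspore
import mindspore.nn as nn
from mindspore import Tensor, export

net = nn.Conv1d(80, 4, kernel_size=3)  # stand-in for the MELGAN Generator
input_arr = Tensor(np.random.uniform(0.0, 1.0, size=[1, 80, 240]), mindspore.float32)
export(net, input_arr, file_name="melgan_demo", file_format="MINDIR")
```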
diff --git a/official/audio/MELGAN/src/model_utils/moxing_adapter.py b/official/audio/MELGAN/src/model_utils/moxing_adapter.py
index aabd5ac6c..32c4e5ab6 100644
--- a/official/audio/MELGAN/src/model_utils/moxing_adapter.py
+++ b/official/audio/MELGAN/src/model_utils/moxing_adapter.py
@@ -17,7 +17,7 @@
 import os
 import functools
-from mindspore import context
+import mindspore
 from src.model_utils.config import config

 _global_sync_count = 0
@@ -92,7 +92,7 @@ def moxing_wrapper(pre_process=None, post_process=None):
                 sync_data(config.train_url, config.output_path)
                 print("Workspace downloaded: ", os.listdir(config.output_path))

-            context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id())))
+            mindspore.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id())))
             config.device_num = get_device_num()
             config.device_id = get_device_id()
             if not os.path.exists(config.output_path):
diff --git a/official/audio/MELGAN/src/trainonestep.py b/official/audio/MELGAN/src/trainonestep.py
index 7f5f1b387..2eaa0bc17 100644
--- a/official/audio/MELGAN/src/trainonestep.py
+++ b/official/audio/MELGAN/src/trainonestep.py
@@ -18,7 +18,7 @@ import mindspore.common.dtype as mstype
 from mindspore.common.tensor import Tensor
 from mindspore.ops import functional as F
 from mindspore.ops import operations as P
-from mindspore.context import ParallelMode
+from mindspore import ParallelMode
 from mindspore.nn.cell import Cell
 from mindspore.common.parameter import ParameterTuple
 from mindspore.ops.operations import ReduceSum, \
diff --git a/official/audio/MELGAN/train.py b/official/audio/MELGAN/train.py
index 0c6f3a9fc..20a4127b4 100644
--- a/official/audio/MELGAN/train.py
+++ b/official/audio/MELGAN/train.py
@@ -16,14 +16,13 @@
 import time
 import os

-import mindspore.nn as nn
+import mindspore
+from mindspore import ParallelMode, nn
 from mindspore.common import set_seed
 import mindspore.common.dtype as mstype
 from mindspore.common.tensor import Tensor
-from mindspore.context import ParallelMode
 from mindspore.communication.management import init, get_rank, get_group_size
 import mindspore.dataset as de
-import mindspore.context as context
 from mindspore.train.loss_scale_manager import DynamicLossScaleManager
 from mindspore.train.callback import RunContext, ModelCheckpoint, CheckpointConfig, _InternalCallbackParam
 from mindspore.train.serialization import load_checkpoint, load_param_into_net
@@ -67,17 +66,17 @@ def train():
     # init distributed
     if cfg.run_distribute:
         device_id = int(os.getenv('DEVICE_ID', '0'))
-        context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=device_id)
+        mindspore.set_context(mode=0, device_target="Ascend", device_id=device_id)
         init()
         cfg.rank = get_rank()
         cfg.group_size = get_group_size()
-        context.reset_auto_parallel_context()
-        context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, device_num=8,
+        mindspore.reset_auto_parallel_context()
+        mindspore.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, device_num=8,
                                             parameter_broadcast=True)
     else:
         cfg.rank = 0
         cfg.group_size = 1
-        context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=cfg.device_id)
+        mindspore.set_context(mode=0, device_target="Ascend", device_id=cfg.device_id)
     # get network and init
     net_D = MultiDiscriminator()
     net_G = Generator(alpha=cfg.leaky_alpha)
diff --git a/official/audio/Tacotron2/eval.py b/official/audio/Tacotron2/eval.py
index e04016a47..c6150ddc3 100644
--- a/official/audio/Tacotron2/eval.py
+++ b/official/audio/Tacotron2/eval.py
@@ -24,7 +24,6 @@ import matplotlib.pylab as plt
 import numpy as np

 import mindspore
-from mindspore import context
 from mindspore.train.serialization import load_checkpoint, load_param_into_net
 from mindspore import Tensor

@@ -39,7 +38,7 @@ from model_utils.device_adapter import get_device_id, get_device_num

 matplotlib.use('Agg')

-context.set_context(mode=context.GRAPH_MODE, save_graphs=False, device_target=config.device_target)
+mindspore.set_context(mode=0, save_graphs=False, device_target=config.device_target)

 def load_model(ckpt_pth):
diff --git a/official/audio/Tacotron2/model_utils/moxing_adapter.py b/official/audio/Tacotron2/model_utils/moxing_adapter.py
index 25838a7da..189ff0667 100644
--- a/official/audio/Tacotron2/model_utils/moxing_adapter.py
+++ b/official/audio/Tacotron2/model_utils/moxing_adapter.py
@@ -17,7 +17,7 @@
 import os
 import functools
-from mindspore import context
+import mindspore
 from .config import config

 _global_sync_count = 0
@@ -92,7 +92,7 @@ def moxing_wrapper(pre_process=None, post_process=None):
                 sync_data(config.train_url, config.output_path)
                 print("Workspace downloaded: ", os.listdir(config.output_path))

-            context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id())))
+            mindspore.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id())))
             config.device_num = get_device_num()
             config.device_id = get_device_id()
             if not os.path.exists(config.output_path):
diff --git a/official/audio/Tacotron2/src/tacotron2.py b/official/audio/Tacotron2/src/tacotron2.py
index a5b573f31..314bb0f30 100644
--- a/official/audio/Tacotron2/src/tacotron2.py
+++ b/official/audio/Tacotron2/src/tacotron2.py
@@ -22,8 +22,7 @@ from mindspore.ops import operations as P
 from mindspore.ops import composite as C
 from mindspore.ops import functional as F
 from mindspore.ops import Argmax as indexArgmax
-from mindspore import context
-from mindspore.context import ParallelMode
+from mindspore import ParallelMode
 from mindspore.communication.management import get_group_size
 from mindspore.nn.wrap.grad_reducer import DistributedGradReducer
 from mindspore.parallel._utils import _get_gradients_mean
@@ -1112,7 +1111,7 @@ class TrainStepWrap(nn.Cell):
                                 dtype=mindspore.float32))

         self.reducer_flag = False
-        self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
+        self.parallel_mode = mindspore.get_auto_parallel_context("parallel_mode")
         if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
             self.reducer_flag = True
             self.is_distributed = True
diff --git a/official/audio/Tacotron2/train.py b/official/audio/Tacotron2/train.py
index 9b6bc2ad6..a47dd3a8c 100644
--- a/official/audio/Tacotron2/train.py
+++ b/official/audio/Tacotron2/train.py
@@ -20,12 +20,12 @@ import numpy as np

 import mindspore
 import mindspore.dataset as ds
-from mindspore.context import ParallelMode
+import mindspore
+from mindspore import ParallelMode
 from mindspore.communication import management as MultiDevice
 from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor
 from mindspore.train.serialization import load_checkpoint, load_param_into_net
 from mindspore.nn.wrap.loss_scale import DynamicLossScaleUpdateCell
-from mindspore import context
 from mindspore import Model
 from mindspore import Tensor
 from mindspore import dtype as mstype
@@ -53,7 +53,7 @@ mindspore.common.set_seed(1024)
 time_stamp_init = False
 time_stamp_first = 0

-context.set_context(mode=context.GRAPH_MODE, save_graphs=False, device_target=config.device_target, max_call_depth=8000)
+mindspore.set_context(mode=0, save_graphs=False, device_target=config.device_target, max_call_depth=8000)

 def prepare_dataloaders(dataset_path, rank_id, group_size):
@@ -195,9 +195,9 @@ def _build_training_pipeline(pre_dataset, run_distribute=False):

 def set_parallel_env():
     '''set parallel context'''
-    context.reset_auto_parallel_context()
+    mindspore.reset_auto_parallel_context()
     MultiDevice.init()
-    context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL,
+    mindspore.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL,
                                       device_num=MultiDevice.get_group_size(),
                                       gradients_mean=True)
diff --git a/official/cv/Arcface/eval_ijbc.py b/official/cv/Arcface/eval_ijbc.py
index 26f5180b1..f4f3c6a55 100644
--- a/official/cv/Arcface/eval_ijbc.py
+++ b/official/cv/Arcface/eval_ijbc.py
@@ -40,8 +40,9 @@ import cv2

 from skimage import transform as trans

+import mindspore
 from mindspore.train.serialization import load_checkpoint, load_param_into_net
-from mindspore import Tensor, context
+from mindspore import Tensor
 from mindspore import dtype as mstype
 import mindspore.ops as ops
 import mindspore.nn as nn
@@ -348,7 +349,7 @@ def read_score(path):

 def main():
-    context.set_context(mode=context.GRAPH_MODE, device_id=0)
+    mindspore.set_context(mode=0, device_id=0)

     # # Step1: Load Meta Data

     # In[ ]:
diff --git a/official/cv/Arcface/export.py b/official/cv/Arcface/export.py
index 52ca1d9b2..f6bd1a0ee 100644
--- a/official/cv/Arcface/export.py
+++ b/official/cv/Arcface/export.py
@@ -19,8 +19,9 @@ python export.py
 import argparse
 import numpy as np

+import mindspore
 from mindspore import dtype as mstype
-from mindspore import Tensor, load_checkpoint, load_param_into_net, export, context
+from mindspore import Tensor, load_checkpoint, load_param_into_net, export

 from src.iresnet import iresnet100

@@ -36,9 +37,9 @@ parser.add_argument('--dataset_name', type=str, default='MS1MV2', choices=['MS1M
                     help='dataset name.')
 args = parser.parse_args()

-context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target)
+mindspore.set_context(mode=0, device_target=args.device_target)
 if args.device_target == "Ascend":
-    context.set_context(device_id=args.device_id)
+    mindspore.set_context(device_id=args.device_id)

 if __name__ == '__main__':
     if args.dataset_name != 'MS1MV2':
diff --git a/official/cv/Arcface/modelarts/start.py b/official/cv/Arcface/modelarts/start.py
index db6b7ce2e..5317ef683 100644
--- a/official/cv/Arcface/modelarts/start.py
+++ b/official/cv/Arcface/modelarts/start.py
@@ -19,10 +19,11 @@ import os
 import glob
 import argparse
 import numpy as np
+import mindspore
 import mindspore.nn as nn
 import mindspore.ops as ops
 from mindspore import export
-from mindspore import context, Tensor
+from mindspore import Tensor
 from mindspore import dtype as mstype
 from mindspore.parallel import set_algo_parameters
 from mindspore.train.model import Model, ParallelMode
@@ -114,15 +115,15 @@ if __name__ == "__main__":
     ckpt_save_path = CKPT_PATH
     train_epoch = args.epochs
     target = args.device_target
-    context.set_context(mode=context.GRAPH_MODE,
+    mindspore.set_context(mode=0,
                         device_target=target, save_graphs=False)
     if args.device_num > 1:
         device_id = int(os.getenv('DEVICE_ID'))
-        context.set_context(device_id=device_id)
+        mindspore.set_context(device_id=device_id)
     else:
-        context.set_context(device_id=args.device_id)
+        mindspore.set_context(device_id=args.device_id)

     if args.device_num > 1:
-        context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL,
+        mindspore.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL,
                                           gradients_mean=True,
                                           )
         cost_model_context.set_cost_model_context(device_memory_capacity=32.0 * 1024.0 * 1024.0 * 1024.0,
diff --git a/official/cv/Arcface/train.py b/official/cv/Arcface/train.py
index 80c2a8584..0dda876a6 100644
--- a/official/cv/Arcface/train.py
+++ b/official/cv/Arcface/train.py
@@ -18,9 +18,10 @@ python train.py
 import argparse
 import os
 import numpy as np
+
 import mindspore
 import mindspore.nn as nn
-from mindspore import context, Tensor
+from mindspore import Tensor
 import mindspore.ops as ops
 from mindspore.train.model import Model, ParallelMode
 from mindspore import dtype as mstype
@@ -99,14 +100,14 @@ class MyNetWithLoss(nn.Cell):
 if __name__ == "__main__":
     train_epoch = args.epochs
     target = args.device_target
-    context.set_context(mode=context.GRAPH_MODE,
+    mindspore.set_context(mode=0,
                         device_target=target, save_graphs=False)
     device_id = args.device_id
     if args.device_num > 1:
         if target == 'Ascend':
             device_id = int(os.getenv('DEVICE_ID'))
-            context.set_context(device_id=device_id)
-            context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL,
+            mindspore.set_context(device_id=device_id)
+            mindspore.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL,
                                               gradients_mean=True,
                                               )
             cost_model_context.set_cost_model_context(device_memory_capacity=32.0 * 1024.0 * 1024.0 * 1024.0,
@@ -116,7 +117,7 @@ if __name__ == "__main__":
             init()
         elif target == 'GPU':
             init()
-            context.set_auto_parallel_context(device_num=args.device_num,
+            mindspore.set_auto_parallel_context(device_num=args.device_num,
                                               parallel_mode=ParallelMode.DATA_PARALLEL,
                                               gradients_mean=True,
                                               auto_parallel_search_mode="recursive_programming")
diff --git a/official/cv/Arcface/val.py b/official/cv/Arcface/val.py
index 1e4243c32..f2d4b010b 100644
--- a/official/cv/Arcface/val.py
+++ b/official/cv/Arcface/val.py
@@ -28,9 +28,8 @@ from sklearn.decomposition import PCA
 from sklearn.model_selection import KFold
 import matplotlib.pyplot as plt
 from scipy import interpolate
-import mindspore as ms
+import mindspore
 from mindspore.train.serialization import load_checkpoint, load_param_into_net
-from mindspore import context

 from src.iresnet import iresnet100

@@ -251,7 +250,7 @@ def test(data_set, backbone, batch_size, nfolds=10):
         time0 = datetime.datetime.now()
         img = ((_data / 255) - 0.5) / 0.5
-        net_out = backbone(ms.Tensor(img, ms.float32))
+        net_out = backbone(mindspore.Tensor(img, mindspore.float32))
         _embeddings = net_out.asnumpy()
         time_now = datetime.datetime.now()
         diff = time_now - time0
@@ -305,7 +304,7 @@ def main():
     parser.add_argument('--max', default='', type=str, help='')
     parser.add_argument('--nfolds', default=10, type=int, help='')
     args = parser.parse_args()
-    context.set_context(device_id=args.device_id, mode=context.GRAPH_MODE,
+    mindspore.set_context(device_id=args.device_id, mode=0,
                         device_target=args.device_target)
     image_size = [112, 112]
     time0 = datetime.datetime.now()
diff --git a/official/cv/CRNN/eval.py b/official/cv/CRNN/eval.py
index 635ae6124..8e50e63a7 100644
--- a/official/cv/CRNN/eval.py
+++ b/official/cv/CRNN/eval.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 # ============================================================================
 """Warpctc evaluation"""
-from mindspore import context
+import mindspore
 from mindspore.common import set_seed
 from mindspore.train.model import Model
 from mindspore.train.serialization import load_checkpoint, load_param_into_net
@@ -29,14 +29,14 @@ from src.model_utils.device_adapter import get_device_id

 set_seed(1)

-context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, save_graphs=False)
+mindspore.set_context(mode=0, device_target=config.device_target, save_graphs=False)


 @moxing_wrapper(pre_process=None)
 def crnn_eval():
     if config.device_target == 'Ascend':
         device_id = get_device_id()
-        context.set_context(device_id=device_id)
+        mindspore.set_context(device_id=device_id)

     config.batch_size = 1
     max_text_length = config.max_text_length
diff --git a/official/cv/CRNN/export.py b/official/cv/CRNN/export.py
index 9a89c30bb..10986006d 100644
--- a/official/cv/CRNN/export.py
+++ b/official/cv/CRNN/export.py
@@ -16,14 +16,14 @@
 """ export model for CRNN """
 import os
 import numpy as np
-import mindspore as ms
-from mindspore import Tensor, context, load_checkpoint, export
+import mindspore
+from mindspore import Tensor, load_checkpoint, export
 from src.crnn import crnn
 from src.model_utils.moxing_adapter import moxing_wrapper
 from src.model_utils.config import config
 from src.model_utils.device_adapter import get_device_id

-context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, save_graphs=False)
+mindspore.set_context(mode=0, device_target=config.device_target, save_graphs=False)

 def modelarts_pre_process():
     config.file_name = os.path.join(config.output_path, config.file_name)
@@ -32,7 +32,7 @@ def modelarts_pre_process():
 @moxing_wrapper(pre_process=modelarts_pre_process)
 def model_export():
     if config.device_target == "Ascend":
-        context.set_context(device_id=get_device_id())
+        mindspore.set_context(device_id=get_device_id())
     config.batch_size = 1
     net = crnn(config, full_precision=config.device_target != 'Ascend')

@@ -40,7 +40,7 @@ def model_export():
     load_checkpoint(config.ckpt_file, net=net)
     net.set_train(False)

-    input_data = Tensor(np.zeros([1, 3, config.image_height, config.image_width]), ms.float32)
+    input_data = Tensor(np.zeros([1, 3, config.image_height, config.image_width]), mindspore.float32)

     export(net, input_data, file_name=config.file_name, file_format=config.file_format)
diff --git a/official/cv/CRNN/modelarts/start.py b/official/cv/CRNN/modelarts/start.py
index 29826dcf5..0c271214d 100644
--- a/official/cv/CRNN/modelarts/start.py
+++ b/official/cv/CRNN/modelarts/start.py
@@ -19,11 +19,11 @@ import glob
 import shutil
 import numpy as np
 import mindspore.nn as nn
-import mindspore as ms
-from mindspore import context, Tensor, export
+import mindspore
+from mindspore import Tensor, export
 from mindspore.common import set_seed
 from mindspore.train.model import Model
-from mindspore.context import ParallelMode
+from mindspore import ParallelMode
 from mindspore.nn.wrap import WithLossCell
 from mindspore.train.callback import TimeMonitor, LossMonitor, CheckpointConfig, ModelCheckpoint
 from mindspore.train.serialization import load_checkpoint
@@ -40,7 +40,7 @@ from src.model_utils.device_adapter import get_rank_id, get_device_num, get_devi

 set_seed(1)

-context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, save_graphs=False)
+mindspore.set_context(mode=0, device_target=config.device_target, save_graphs=False)

 CKPT_OUTPUT_PATH = config.train_url
 CKPT_OUTPUT_FILE_PATH = os.path.join(CKPT_OUTPUT_PATH, 'ckpt_0')
@@ -60,7 +60,7 @@ def modelarts_pre_process():
 def train():
     if config.device_target == 'Ascend':
         device_id = get_device_id()
-        context.set_context(device_id=device_id)
+        mindspore.set_context(device_id=device_id)

     if config.model_version == 'V1' and config.device_target != 'Ascend':
         raise ValueError("model version V1 is only supported on Ascend, pls check the config.")
@@ -75,8 +75,8 @@ def train():
         init()
         device_num = get_group_size()
         rank = get_rank()
-        context.reset_auto_parallel_context()
-        context.set_auto_parallel_context(device_num=device_num,
+        mindspore.reset_auto_parallel_context()
+        mindspore.set_auto_parallel_context(device_num=device_num,
                                           parallel_mode=ParallelMode.DATA_PARALLEL,
                                           gradients_mean=True)
     else:
@@ -152,7 +152,7 @@ def model_trans():
     net.set_train(False)

     input_data = Tensor(
-        np.zeros([1, 3, config.image_height, config.image_width]), ms.float32)
+        np.zeros([1, 3, config.image_height, config.image_width]), mindspore.float32)
     export(net, input_data, file_name='crnn', file_format='AIR')

     shutil.copy('crnn.air', CKPT_OUTPUT_PATH)
diff --git a/official/cv/CRNN/src/crnn_for_train.py b/official/cv/CRNN/src/crnn_for_train.py
index aedf04f2b..cd2ad6629 100644
--- a/official/cv/CRNN/src/crnn_for_train.py
+++ b/official/cv/CRNN/src/crnn_for_train.py
@@ -14,8 +14,8 @@
 # ============================================================================
 """Automatic differentiation with grad clip."""
 import numpy as np
-from mindspore import context
-from mindspore.context import ParallelMode
+import mindspore
+from mindspore import ParallelMode
 from mindspore.common import dtype as mstype
 from mindspore.ops import composite as C
 from mindspore.ops import functional as F
@@ -85,7 +85,7 @@ class TrainOneStepCellWithGradClip(Cell):
         self.cast = P.Cast()
         self.concat = P.Concat(axis=0)
         self.ten = Tensor(np.array([10.0]).astype(np.float32))
-        parallel_mode = context.get_auto_parallel_context("parallel_mode")
+        parallel_mode = mindspore.get_auto_parallel_context("parallel_mode")
         if parallel_mode in (ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL):
             self.reducer_flag = True
         if self.reducer_flag:
diff --git a/official/cv/CRNN/src/model_utils/moxing_adapter.py b/official/cv/CRNN/src/model_utils/moxing_adapter.py
index c2d228240..344dfc034 100644
--- a/official/cv/CRNN/src/model_utils/moxing_adapter.py
+++ b/official/cv/CRNN/src/model_utils/moxing_adapter.py
@@ -17,7 +17,7 @@
 import os
 import functools
-from mindspore import context
+import mindspore
 from .config import config


@@ -101,7 +101,7 @@ def moxing_wrapper(pre_process=None, post_process=None):
                 sync_data(config.train_url, config.output_path)
                 print('Workspace downloaded: ', os.listdir(config.output_path))

-            context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id())))
+            mindspore.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id())))
             config.device_num = get_device_num()
             config.device_id = get_device_id()
             if not os.path.exists(config.output_path):
diff --git a/official/cv/CRNN/train.py b/official/cv/CRNN/train.py
index 5b0808855..a1dc2e0b8 100644
--- a/official/cv/CRNN/train.py
+++ b/official/cv/CRNN/train.py
@@ -14,11 +14,11 @@
 # ============================================================================
 """crnn training"""
 import os
+import mindspore
 import mindspore.nn as nn
-from mindspore import context
 from mindspore.common import set_seed
 from mindspore.train.model import Model
-from mindspore.context import ParallelMode
+from mindspore import ParallelMode
 from mindspore.nn.wrap import WithLossCell
 from mindspore.train.callback import CheckpointConfig, ModelCheckpoint
 from mindspore.communication.management import init, get_group_size, get_rank
@@ -39,7 +39,7 @@ from src.model_utils.lr_scheduler import cosine_decay_lr_with_start_step

 set_seed(1)

-context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, save_graphs=False)
+mindspore.set_context(mode=0, device_target=config.device_target, save_graphs=False)


 def apply_eval(eval_param):
@@ -69,7 +69,7 @@ def train():

     if config.device_target == 'Ascend':
         device_id = get_device_id()
-        context.set_context(device_id=device_id)
+        mindspore.set_context(device_id=device_id)

     if config.model_version == 'V1' and config.device_target != 'Ascend':
         raise ValueError("model version V1 is only supported on Ascend, pls check the config.")
@@ -86,8 +86,8 @@ def train():
         # lr_scale = 1
         device_num = get_group_size()
         rank = get_rank()
-        context.reset_auto_parallel_context()
-        context.set_auto_parallel_context(device_num=device_num,
+        mindspore.reset_auto_parallel_context()
+        mindspore.set_auto_parallel_context(device_num=device_num,
                                           parallel_mode=ParallelMode.DATA_PARALLEL,
                                           gradients_mean=True)
     else:
diff --git a/official/cv/CTPN/README.md b/official/cv/CTPN/README.md
index a4b601bb0..47be063e7 100644
--- a/official/cv/CTPN/README.md
+++ b/official/cv/CTPN/README.md
@@ -282,7 +282,7 @@ ICDAR2013, SCUT-FORU to improve precision and recall, and when doing Finetune, w

 ### Result

-Training result will be stored in the example path. Checkpoints will be stored at `ckpt_path` by default, and training log will be redirected to `./log`, also the loss will be redirected to `./loss_0.log` like followings.
+Training result will be stored in the example path. Checkpoints will be stored at `ckpt_path` by default, and training log will be redirected to `./log`, also the loss will be redirected to `./loss_0.log` like following.

 ```python
 377 epoch: 1 step: 229 ,rpn_loss: 0.00355
@@ -391,7 +391,7 @@ You can add `run_eval` to start shell and set it True, if you want evaluation wh

 ### Result

-Evaluation result will be stored in the example path, you can find result like the followings in `log`.
+Evaluation result will be stored in the example path, you can find result like the following in `log`.

 ```text
 {"precision": 0.90791, "recall": 0.86118, "hmean": 0.88393}
@@ -547,7 +547,7 @@ bash eval_res.sh

 ### Result

-Evaluation result will be stored in the example path, you can find result like the followings in `log`.
+Evaluation result will be stored in the example path, you can find result like the following in `log`.

 ```text
 {"precision": 0.88913, "recall": 0.86082, "hmean": 0.87475}
diff --git a/official/cv/CTPN/eval.py b/official/cv/CTPN/eval.py
index 47af42290..e45ce1840 100644
--- a/official/cv/CTPN/eval.py
+++ b/official/cv/CTPN/eval.py
@@ -15,7 +15,7 @@
 """Evaluation for CTPN"""
 import os
-from mindspore import context
+import mindspore
 from mindspore.train.serialization import load_checkpoint, load_param_into_net
 from mindspore.common import set_seed
 from src.ctpn import CTPN
@@ -28,7 +28,7 @@ from src.model_utils.device_adapter import get_device_id

 set_seed(1)

-context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, device_id=get_device_id())
+mindspore.set_context(mode=0, device_target=config.device_target, device_id=get_device_id())

 def modelarts_pre_process():
     pass
diff --git a/official/cv/CTPN/export.py b/official/cv/CTPN/export.py
index ebd87ce0a..9c42196a3 100644
--- a/official/cv/CTPN/export.py
+++ b/official/cv/CTPN/export.py
@@ -15,18 +15,18 @@
 """export checkpoint file into air, onnx, mindir models"""
 import os
 import numpy as np
-import mindspore as ms
-from mindspore import Tensor, load_checkpoint, load_param_into_net, export, context
+import mindspore
+from mindspore import Tensor, load_checkpoint, load_param_into_net, export

 from src.ctpn import CTPN_Infer

 from src.model_utils.config import config
 from src.model_utils.moxing_adapter import moxing_wrapper

-context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target)
+mindspore.set_context(mode=0, device_target=config.device_target)
 if config.device_target == "Ascend":
-    context.set_context(device_id=config.device_id)
+    mindspore.set_context(device_id=config.device_id)


 def modelarts_pre_process():
@@ -50,7 +50,7 @@ def model_export():

     load_param_into_net(net, param_dict_new)

-    img = Tensor(np.zeros([config.test_batch_size, 3, config.img_height, config.img_width]), ms.float16)
+    img = Tensor(np.zeros([config.test_batch_size, 3, config.img_height, config.img_width]), mindspore.float16)
     export(net, img, file_name=config.file_name, file_format=config.file_format)
diff --git a/official/cv/CTPN/src/CTPN/bbox_assign_sample.py b/official/cv/CTPN/src/CTPN/bbox_assign_sample.py
index 93d348c7b..94f47c12d 100644
--- a/official/cv/CTPN/src/CTPN/bbox_assign_sample.py
+++ b/official/cv/CTPN/src/CTPN/bbox_assign_sample.py
@@ -15,15 +15,15 @@
 """CTPN positive and negative sample screening for RPN."""

 import numpy as np
+import mindspore
 import mindspore.nn as nn
-from mindspore import context
 from mindspore.ops import operations as P
 from mindspore.common.tensor import Tensor
 import mindspore.common.dtype as mstype
 from src.CTPN.BoundingBoxEncode import BoundingBoxEncode

-if context.get_context("device_target") == "Ascend":
+if mindspore.get_context("device_target") == "Ascend":
     mtype = mstype.float16
     nptype = np.float16
 else:
diff --git a/official/cv/CTPN/src/CTPN/proposal_generator.py b/official/cv/CTPN/src/CTPN/proposal_generator.py
index f1dd79548..78a3d9aeb 100644
--- a/official/cv/CTPN/src/CTPN/proposal_generator.py
+++ b/official/cv/CTPN/src/CTPN/proposal_generator.py
@@ -15,14 +15,15 @@
 """CTPN proposal generator."""

 import numpy as np
+import mindspore
 import mindspore.nn as nn
 import mindspore.common.dtype as mstype
 from mindspore.ops import operations as P
-from mindspore import Tensor, context
+from mindspore import Tensor
 from src.CTPN.BoundingBoxDecode import BoundingBoxDecode

-if context.get_context("device_target") == "Ascend":
+if mindspore.get_context("device_target") == "Ascend":
     mtype = mstype.float16
     nptype = np.float16
 else:
diff --git a/official/cv/CTPN/src/CTPN/rpn.py b/official/cv/CTPN/src/CTPN/rpn.py
index 903c18207..1eca50b49 100644
--- a/official/cv/CTPN/src/CTPN/rpn.py
+++ b/official/cv/CTPN/src/CTPN/rpn.py
@@ -14,15 +14,16 @@
 # ============================================================================
 """RPN for fasterRCNN"""
 import numpy as np
+import mindspore
 import mindspore.nn as nn
 import mindspore.common.dtype as mstype
 from mindspore.ops import operations as P
-from mindspore import Tensor, context
+from mindspore import Tensor
 from mindspore.ops import functional as F
 from src.CTPN.bbox_assign_sample import BboxAssignSample

-if context.get_context("device_target") == "Ascend":
+if mindspore.get_context("device_target") == "Ascend":
     mtype = mstype.float16
     nptype = np.float16
 else:
diff --git a/official/cv/CTPN/src/ctpn.py b/official/cv/CTPN/src/ctpn.py
index 5ae25fa22..d0e90977f 100644
--- a/official/cv/CTPN/src/ctpn.py
+++ b/official/cv/CTPN/src/ctpn.py
@@ -15,7 +15,7 @@
 """CPTN network definition."""

 import numpy as np
-from mindspore import context
+import mindspore
 import mindspore.nn as nn
 from mindspore import Tensor, Parameter
 from mindspore.common import dtype as mstype
@@ -26,7 +26,7 @@ from src.CTPN.proposal_generator import Proposal
 from src.CTPN.vgg16 import VGG16FeatureExtraction
 from src.weight_init import lstm_default_state

-if context.get_context("device_target") == "Ascend":
+if mindspore.get_context("device_target") == "Ascend":
     mtype = mstype.float16
     nptype = np.float16
 else:
@@ -113,7 +113,7 @@ class CTPN(nn.Cell):
         self.transpose = P.Transpose()
         self.cast = P.Cast()
         self.is_training = is_training
-        self.device_target = context.get_context("device_target")
+        self.device_target = mindspore.get_context("device_target")

         # rpn block
         self.rpn_with_loss = RPN(config,
diff --git a/official/cv/CTPN/src/model_utils/moxing_adapter.py b/official/cv/CTPN/src/model_utils/moxing_adapter.py
index c2d228240..344dfc034 100644
--- a/official/cv/CTPN/src/model_utils/moxing_adapter.py
+++ b/official/cv/CTPN/src/model_utils/moxing_adapter.py
@@ -17,7 +17,7 @@
 import os
 import functools
-from mindspore import context
+import mindspore
 from .config import config


@@ -101,7 +101,7 @@ def moxing_wrapper(pre_process=None, post_process=None):
                 sync_data(config.train_url, config.output_path)
                 print('Workspace downloaded: ', os.listdir(config.output_path))

-            context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id())))
+            mindspore.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id())))
             config.device_num = get_device_num()
             config.device_id = get_device_id()
             if not os.path.exists(config.output_path):
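The four CTPN source files above share one backend probe, now spelled with the top-level `mindspore.get_context`. A small sketch of the pattern, runnable on any backend (the Ascend branch simply selects float16):

```python
import numpy as np
import mindspore
import mindspore.common.dtype as mstype

# was: if context.get_context("device_target") == "Ascend":
if mindspore.get_context("device_target") == "Ascend":
    mtype, nptype = mstype.float16, np.float16
else:
    mtype, nptype = mstype.float32, np.float32
print(mtype, nptype)
```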
diff --git a/official/cv/CTPN/train.py b/official/cv/CTPN/train.py
index 7e932179f..f52df437b 100644
--- a/official/cv/CTPN/train.py
+++ b/official/cv/CTPN/train.py
@@ -18,12 +18,13 @@ import os
 import ast
 import operator
 import numpy as np
+import mindspore
 import mindspore.common.dtype as mstype
-from mindspore import context, Tensor, Parameter
+from mindspore import Tensor, Parameter
 from mindspore.communication.management import init, get_group_size, get_rank
 from mindspore.train.callback import CheckpointConfig, ModelCheckpoint, TimeMonitor
 from mindspore.train import Model
-from mindspore.context import ParallelMode
+from mindspore import ParallelMode
 from mindspore.train.serialization import load_checkpoint, load_param_into_net
 from mindspore.nn import Momentum
 from mindspore.common import set_seed
@@ -41,11 +42,11 @@ from src.model_utils.device_adapter import get_device_id

 set_seed(1)

-context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, device_id=get_device_id())
+mindspore.set_context(mode=0, device_target=config.device_target, device_id=get_device_id())

 # Set mempool block size in PYNATIVE_MODE for improving memory utilization, which will not take effect in GRAPH_MODE
-if context.get_context("mode") == context.PYNATIVE_MODE:
-    context.set_context(mempool_block_size="20GB")
+if mindspore.get_context("mode") == 1:
+    mindspore.set_context(mempool_block_size="20GB")

 binOps = {
     ast.Add: operator.add,
@@ -95,10 +96,10 @@ def train():
         config.weight_decay = arithmeticeval(config.weight_decay)
     if config.run_distribute:
         init()
-        context.reset_auto_parallel_context()
+        mindspore.reset_auto_parallel_context()
         rank = get_rank()
         device_num = get_group_size()
-        context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
+        mindspore.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
                                           gradients_mean=True)
     else:
         rank = 0
diff --git a/official/cv/CycleGAN/eval.py b/official/cv/CycleGAN/eval.py
index 8135ccdbd..44f3c11d4 100644
--- a/official/cv/CycleGAN/eval.py
+++ b/official/cv/CycleGAN/eval.py
@@ -16,7 +16,7 @@
 """Cycle GAN test."""

 import os
-import mindspore as ms
+import mindspore
 from src.models.cycle_gan import get_generator
 from src.utils.args import get_args
 from src.dataset.cyclegan_dataset import create_dataset
@@ -27,12 +27,12 @@ from src.utils.tools import save_image, load_ckpt
 def predict():
     """Predict function."""
     args = get_args("predict")
-    ms.set_context(mode=ms.GRAPH_MODE, device_target=args.platform,
+    mindspore.set_context(mode=0, device_target=args.platform,
                    save_graphs=args.save_graphs, device_id=args.device_id)
     args.rank = 0
     args.device_num = 1
     if args.platform == "GPU":
-        ms.set_context(enable_graph_kernel=True)
+        mindspore.set_context(enable_graph_kernel=True)
     G_A = get_generator(args)
     G_B = get_generator(args)
     G_A.set_train(True)
@@ -50,7 +50,7 @@ def predict():
     reporter = Reporter(args)
     reporter.start_predict("A to B")
     for data in ds.create_dict_iterator(output_numpy=True):
-        img_A = ms.Tensor(data["image"])
+        img_A = mindspore.Tensor(data["image"])
         path_A = data["image_name"][0]
         path_B = path_A[0:-4] + "_fake_B.jpg"
         fake_B = G_A(img_A)
@@ -63,7 +63,7 @@ def predict():
     reporter.dataset_size = args.dataset_size
     reporter.start_predict("B to A")
     for data in ds.create_dict_iterator(output_numpy=True):
-        img_B = ms.Tensor(data["image"])
+        img_B = mindspore.Tensor(data["image"])
         path_B = data["image_name"][0]
         path_A = path_B[0:-4] + "_fake_A.jpg"
         fake_A = G_B(img_B)
diff --git a/official/cv/CycleGAN/export.py b/official/cv/CycleGAN/export.py
index 66d1c52e7..6da610d1d 100644
--- a/official/cv/CycleGAN/export.py
+++ b/official/cv/CycleGAN/export.py
@@ -16,7 +16,7 @@
 """export file."""

 import numpy as np
-import mindspore as ms
+import mindspore
 from src.models.cycle_gan import get_generator
 from src.utils.args import get_args
 from src.utils.tools import load_ckpt, enable_batch_statistics
@@ -24,7 +24,7 @@ from src.utils.tools import load_ckpt, enable_batch_statistics

 if __name__ == '__main__':
     args = get_args("export")
-    ms.set_context(mode=ms.GRAPH_MODE, device_target=args.platform)
+    mindspore.set_context(mode=0, device_target=args.platform)
     G_A = get_generator(args)
     G_B = get_generator(args)
     # Use BatchNorm2d with batchsize=1, affine=False, use_batch_statistics=True instead of InstanceNorm2d
@@ -34,8 +34,8 @@ if __name__ == '__main__':
     load_ckpt(args, G_A, G_B)

     input_shp = [args.export_batch_size, 3, args.image_size, args.image_size]
-    input_array = ms.Tensor(np.random.uniform(-1.0, 1.0, size=input_shp).astype(np.float32))
+    input_array = mindspore.Tensor(np.random.uniform(-1.0, 1.0, size=input_shp).astype(np.float32))
     G_A_file = f"{args.export_file_name}_AtoB"
-    ms.export(G_A, input_array, file_name=G_A_file, file_format=args.export_file_format)
+    mindspore.export(G_A, input_array, file_name=G_A_file, file_format=args.export_file_format)
     G_B_file = f"{args.export_file_name}_BtoA"
-    ms.export(G_B, input_array, file_name=G_B_file, file_format=args.export_file_format)
+    mindspore.export(G_B, input_array, file_name=G_B_file, file_format=args.export_file_format)
diff --git a/official/cv/CycleGAN/src/models/cycle_gan.py b/official/cv/CycleGAN/src/models/cycle_gan.py
index 521bf800e..762761f1b 100644
--- a/official/cv/CycleGAN/src/models/cycle_gan.py
+++ b/official/cv/CycleGAN/src/models/cycle_gan.py
@@ -15,10 +15,9 @@

 """Cycle GAN network."""

-import mindspore as ms
+import mindspore
 import mindspore.nn as nn
-from mindspore import context
-from mindspore.context import ParallelMode
+from mindspore import ParallelMode
 from mindspore.parallel._auto_parallel_context import auto_parallel_context
 from mindspore.communication.management import get_group_size
 import mindspore.ops as ops
@@ -176,17 +175,17 @@ class TrainOneStepG(nn.Cell):
         self.G.D_B.set_train(False)
         self.grad = ops.GradOperation(get_by_list=True, sens_param=True)
         self.sens = sens
-        self.weights = ms.ParameterTuple(generator.trainable_params())
+        self.weights = mindspore.ParameterTuple(generator.trainable_params())
         self.net = WithLossCell(G)
         self.reducer_flag = False
         self.grad_reducer = None
-        self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
+        self.parallel_mode = mindspore.get_auto_parallel_context("parallel_mode")
         if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
             self.reducer_flag = True
         if self.reducer_flag:
-            mean = context.get_auto_parallel_context("gradients_mean")
+            mean = mindspore.get_auto_parallel_context("gradients_mean")
             if auto_parallel_context().get_device_num_is_set():
-                degree = context.get_auto_parallel_context("device_num")
+                degree = mindspore.get_auto_parallel_context("device_num")
             else:
                 degree = get_group_size()
             self.grad_reducer = nn.DistributedGradReducer(optimizer.parameters, mean, degree)
@@ -224,16 +223,16 @@ class TrainOneStepD(nn.Cell):
         self.D.set_train()
         self.grad = ops.GradOperation(get_by_list=True, sens_param=True)
         self.sens = sens
-        self.weights = ms.ParameterTuple(D.trainable_params())
+        self.weights = mindspore.ParameterTuple(D.trainable_params())
         self.reducer_flag = False
         self.grad_reducer = None
-        self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
+        self.parallel_mode = mindspore.get_auto_parallel_context("parallel_mode")
         if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
             self.reducer_flag = True
         if self.reducer_flag:
-            mean = context.get_auto_parallel_context("gradients_mean")
+            mean = mindspore.get_auto_parallel_context("gradients_mean")
             if auto_parallel_context().get_device_num_is_set():
-                degree = context.get_auto_parallel_context("device_num")
+                degree = mindspore.get_auto_parallel_context("device_num")
             else:
                 degree = get_group_size()
             self.grad_reducer = nn.DistributedGradReducer(optimizer.parameters, mean, degree)
diff --git a/official/cv/CycleGAN/train.py b/official/cv/CycleGAN/train.py
index d777ec9ce..9e0f1d8af 100644
--- a/official/cv/CycleGAN/train.py
+++ b/official/cv/CycleGAN/train.py
@@ -20,7 +20,7 @@ Example:
     python train.py --dataroot ./data/horse2zebra --model ResNet
 """

-import mindspore as ms
+import mindspore
 import mindspore.nn as nn
 from mindspore.communication.management import init, get_rank, get_group_size
 from src.utils.args import get_args
@@ -30,26 +30,26 @@ from src.dataset.cyclegan_dataset import create_dataset
 from src.models.losses import DiscriminatorLoss, GeneratorLoss
 from src.models.cycle_gan import get_generator, get_discriminator, Generator, TrainOneStepG, TrainOneStepD

-ms.set_seed(1)
+mindspore.set_seed(1)


 def train():
     """Train function."""
     args = get_args("train")
     if args.device_num > 1:
-        ms.set_context(mode=ms.GRAPH_MODE, device_target=args.platform, save_graphs=args.save_graphs)
+        mindspore.set_context(mode=0, device_target=args.platform, save_graphs=args.save_graphs)
         init()
-        ms.reset_auto_parallel_context()
-        ms.set_auto_parallel_context(parallel_mode=ms.ParallelMode.DATA_PARALLEL, gradients_mean=True)
+        mindspore.reset_auto_parallel_context()
+        mindspore.set_auto_parallel_context(parallel_mode=mindspore.ParallelMode.DATA_PARALLEL, gradients_mean=True)
         args.rank = get_rank()
         args.group_size = get_group_size()
     else:
-        ms.set_context(mode=ms.GRAPH_MODE, device_target=args.platform,
+        mindspore.set_context(mode=0, device_target=args.platform,
                        save_graphs=args.save_graphs, device_id=args.device_id)
         args.rank = 0
         args.device_num = 1
     if args.platform == "GPU":
-        ms.set_context(enable_graph_kernel=True)
+        mindspore.set_context(enable_graph_kernel=True)
     if args.need_profiler:
         from mindspore.profiler.profiling import Profiler
         profiler = Profiler(output_path=args.outputs_dir, is_detail=True, is_show_op_path=True)
diff --git a/official/cv/DBNet/eval.py b/official/cv/DBNet/eval.py
index 1147f9312..1c8971093 100644
--- a/official/cv/DBNet/eval.py
+++ b/official/cv/DBNet/eval.py
@@ -16,7 +16,7 @@
 import os
 import sys

-import mindspore as ms
+import mindspore

 from src.datasets.load import create_dataset
 from src.utils.eval_utils import WithEval
@@ -59,7 +59,7 @@ def evaluate(cfg, path):
         eval_net = WithEval(eval_net, cfg)
         eval_net.model.set_train(False)
         cfg.logger.info(f"infer {p}")
-        ms.load_checkpoint(p, eval_net.model)
+        mindspore.load_checkpoint(p, eval_net.model)
         metrics, fps = eval_net.eval(val_dataset, show_imgs=cfg.eval.show_images)
         params = sum([param.size for param in eval_net.model.get_parameters()]) / (1024 ** 2)
         cfg.logger.info(f"Param: {params} M")
diff --git a/official/cv/DBNet/export.py b/official/cv/DBNet/export.py
index 926d5c3aa..6bc848e21 100644
--- a/official/cv/DBNet/export.py
+++ b/official/cv/DBNet/export.py
@@ -16,7 +16,7 @@
 import os
 import sys

-import mindspore as ms
+import mindspore

 from src.utils.env import init_env
 from src.modules.model import get_dbnet
@@ -32,14 +32,14 @@ def export():
     init_env(config)
     config.backbone.pretrained = False
     eval_net = get_dbnet(config.net, config, isTrain=False)
-    ms.load_checkpoint(config.ckpt_path, eval_net)
+    mindspore.load_checkpoint(config.ckpt_path, eval_net)
     eval_net.set_train(False)

     if not config.dataset.offload:
-        inp = ms.ops.ones((1, 3, *config.eval.eval_size), ms.float32)
+        inp = mindspore.ops.ones((1, 3, *config.eval.eval_size), mindspore.float32)
     else:
-        inp = ms.ops.ones((1, *config.eval.eval_size, 3), ms.float32)
+        inp = mindspore.ops.ones((1, *config.eval.eval_size, 3), mindspore.float32)
     file_name = config.net + '_' + config.backbone.initializer
-    ms.export(eval_net, inp, file_name=file_name, file_format='MINDIR')
+    mindspore.export(eval_net, inp, file_name=file_name, file_format='MINDIR')
     print("MINDIR saved at", file_name+".mindir")
diff --git a/official/cv/DBNet/src/model_utils/moxing_adapter.py b/official/cv/DBNet/src/model_utils/moxing_adapter.py
index ed75cf910..6e96d5c08 100644
--- a/official/cv/DBNet/src/model_utils/moxing_adapter.py
+++ b/official/cv/DBNet/src/model_utils/moxing_adapter.py
@@ -17,7 +17,7 @@
 import os
 import functools
-from mindspore import context
+import mindspore
 from .config import config

 _global_sync_count = 0
@@ -116,7 +116,7 @@ def moxing_wrapper(pre_process=None, post_process=None):
                 config.eval.gt_dir = os.path.join(config.data_path, "test_gts")
                 config.backbone.backbone_ckpt = os.path.join(config.data_path, config.backbone.backbone_ckpt)

-            context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id())))
+            mindspore.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id())))
             config.device_num = get_device_num()
             config.device_id = get_device_id()
             if not os.path.exists(config.output_dir):
diff --git a/official/cv/DBNet/src/modules/backbone/__init__.py b/official/cv/DBNet/src/modules/backbone/__init__.py
index 415403f76..02eb50367 100644
--- a/official/cv/DBNet/src/modules/backbone/__init__.py
+++ b/official/cv/DBNet/src/modules/backbone/__init__.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 # ============================================================================

-import mindspore as ms
+import mindspore

 from .resnet import ResNet, Bottleneck, BasicBlock
 from .mobilenetv3 import MobileNetV3
@@ -21,21 +21,21 @@ from .mobilenetv3 import MobileNetV3
 def mobilenetv3(pretrained=True, backbone_ckpt=None, **kwargs):
     model = MobileNetV3(**kwargs)
     if pretrained:
-        ms.load_checkpoint(backbone_ckpt, model)
+        mindspore.load_checkpoint(backbone_ckpt, model)
     return model


 def resnet18(pretrained=True, backbone_ckpt=None, **kwargs):
     model = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs)
     if pretrained:
-        ms.load_checkpoint(backbone_ckpt, model)
+        mindspore.load_checkpoint(backbone_ckpt, model)
     return model


 def deformable_resnet18(pretrained=True, backbone_ckpt=None, **kwargs):
     model = ResNet(BasicBlock, [2, 2, 2, 2], dcn=True, **kwargs)
     if pretrained:
-        ms.load_checkpoint(backbone_ckpt, model)
+        mindspore.load_checkpoint(backbone_ckpt, model)
     return model


@@ -46,7 +46,7 @@ def resnet50(pretrained=True, backbone_ckpt=None, **kwargs):
     """
     model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)
     if pretrained:
-        ms.load_checkpoint(backbone_ckpt, model)
+        mindspore.load_checkpoint(backbone_ckpt, model)
     return model


@@ -57,7 +57,7 @@ def deformable_resnet50(pretrained=True, backbone_ckpt=None, **kwargs):
     """
     model = ResNet(Bottleneck, [3, 4, 6, 3], dcn=True, **kwargs)
     if pretrained:
-        ms.load_checkpoint(backbone_ckpt, model)
+        mindspore.load_checkpoint(backbone_ckpt, model)
     return model
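The DBNet backbone factories above all reduce to `mindspore.load_checkpoint(ckpt, net)`. A tiny round-trip sketch with a stand-in network (file name and network are illustrative only, not from the patch):

```python
import mindspore
import mindspore.nn as nn

net = nn.Dense(4, 2)                                   # stand-in for ResNet/MobileNetV3
mindspore.save_checkpoint(net, "backbone_demo.ckpt")   # create a file to load from
mindspore.load_checkpoint("backbone_demo.ckpt", net)   # was: ms.load_checkpoint(...)
```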
# ============================================================================ """Supported dataset operations applied on devices""" -import mindspore as ms +import mindspore from mindspore import ops, nn from mindspore.dataset.engine.offload import RandomColorAdjust @@ -24,8 +24,8 @@ class Normalize(nn.Cell): """ def __init__(self, mean, std): super(Normalize, self).__init__(auto_prefix=False) - self.mean = ms.Tensor(mean, ms.float32) - self.std = ms.Tensor(std, ms.float32) + self.mean = mindspore.Tensor(mean, mindspore.float32) + self.std = mindspore.Tensor(std, mindspore.float32) def construct(self, img): img = (img - self.mean.reshape((1, 1, 1, -1))) / self.std.reshape((1, 1, 1, -1)) diff --git a/official/cv/DBNet/src/modules/loss.py b/official/cv/DBNet/src/modules/loss.py index 7da245734..b6afa69a2 100644 --- a/official/cv/DBNet/src/modules/loss.py +++ b/official/cv/DBNet/src/modules/loss.py @@ -16,7 +16,7 @@ """Loss functions.""" from mindspore import nn, ops -import mindspore as ms +import mindspore import mindspore.numpy as mnp @@ -140,15 +140,15 @@ class BalanceCrossEntropyLoss(nn.LossBase): pred = pred.squeeze(axis=1) gt = gt.squeeze(axis=1) - pos = (gt * mask).astype(ms.float32) - neg = ((1 - gt) * mask).astype(ms.float32) + pos = (gt * mask).astype(mindspore.float32) + neg = ((1 - gt) * mask).astype(mindspore.float32) - positive_count = pos.sum(axis=(1, 2), keepdims=True).astype(ms.int32) - negative_count = neg.sum(axis=(1, 2), keepdims=True).astype(ms.int32) + positive_count = pos.sum(axis=(1, 2), keepdims=True).astype(mindspore.int32) + negative_count = neg.sum(axis=(1, 2), keepdims=True).astype(mindspore.int32) negative_count = self.min(negative_count, positive_count * self.negative_ratio).squeeze(axis=(1, 2)) - loss = self.bceloss(pred.astype(ms.float32), gt.astype(ms.float32)) + loss = self.bceloss(pred.astype(mindspore.float32), gt.astype(mindspore.float32)) positive_loss = loss * pos N = loss.shape[0] @@ -159,13 +159,13 @@ class BalanceCrossEntropyLoss(nn.LossBase): neg_index = self.stack((batch_iter, negative_count)) min_neg_score = self.unsqueeze(self.gather(negative_value, neg_index), 1) - masked_neg_loss = self.cast(negative_loss >= min_neg_score, ms.float32) # filter out losses less than topk loss. + masked_neg_loss = self.cast(negative_loss >= min_neg_score, mindspore.float32) # filter out losses less than topk loss. 
masked_neg_loss = ops.stop_gradient(masked_neg_loss) masked_neg_loss = masked_neg_loss * negative_loss balance_loss = (positive_loss.sum() + masked_neg_loss.sum()) / \ - ((positive_count + negative_count).astype(ms.float32).sum() + self.eps) + ((positive_count + negative_count).astype(mindspore.float32).sum() + self.eps) return balance_loss diff --git a/official/cv/DBNet/src/utils/callback.py b/official/cv/DBNet/src/utils/callback.py index 55c45ade8..5ea153a66 100644 --- a/official/cv/DBNet/src/utils/callback.py +++ b/official/cv/DBNet/src/utils/callback.py @@ -17,7 +17,7 @@ import os import time import numpy as np -import mindspore as ms +import mindspore from mindspore.train.callback import Callback from src.datasets.load import create_dataset @@ -100,10 +100,10 @@ class DBNetMonitor(Callback): def handle_loss(self, net_outputs): """Handle loss""" if isinstance(net_outputs, (tuple, list)): - if isinstance(net_outputs[0], ms.Tensor) and isinstance(net_outputs[0].asnumpy(), np.ndarray): + if isinstance(net_outputs[0], mindspore.Tensor) and isinstance(net_outputs[0].asnumpy(), np.ndarray): loss = net_outputs[0].asnumpy() - elif isinstance(net_outputs, ms.Tensor) and isinstance(net_outputs.asnumpy(), np.ndarray): + elif isinstance(net_outputs, mindspore.Tensor) and isinstance(net_outputs.asnumpy(), np.ndarray): loss = float(np.mean(net_outputs.asumpy())) return loss @@ -196,11 +196,11 @@ class DBNetMonitor(Callback): f'best fmeasure is: {cur_f:.2f}, ' f'e2e cost: {time.time() - self.train_start:.2f} s, ' f'current train time: {sum(self.train_time_list):.2f} s') - if ms.context.get_context("enable_ge"): + if mindspore.get_context("enable_ge"): from mindspore.train.callback import _set_cur_net _set_cur_net(cb_params.train_network) cb_params.train_network.exec_checkpoint_graph() - ms.save_checkpoint(self.eval_net.model, + mindspore.save_checkpoint(self.eval_net.model, os.path.join(self.save_ckpt_dir, f"best_rank{self.config.rank_id}.ckpt")) self.max_f = cur_f if self.early_stop and isinstance(self.stop_value, dict) and self.stop_value: @@ -213,7 +213,7 @@ class DBNetMonitor(Callback): f"best recall: {metrics['recall'].avg:.2f}, " f"precision: {metrics['precision'].avg:.2f}, " f"fmeasure: {metrics['fmeasure'].avg:.2f}") - ms.save_checkpoint(self.eval_net.model, + mindspore.save_checkpoint(self.eval_net.model, os.path.join(self.save_ckpt_dir, f"best_rank{self.config.rank_id}.ckpt")) run_context.request_stop() e2e_time = time.time() - self.epoch_start_time diff --git a/official/cv/DBNet/src/utils/env.py b/official/cv/DBNet/src/utils/env.py index c633c102e..79822ec3d 100644 --- a/official/cv/DBNet/src/utils/env.py +++ b/official/cv/DBNet/src/utils/env.py @@ -15,29 +15,29 @@ """Environ setting.""" import os import cv2 -import mindspore as ms +import mindspore from mindspore.communication.management import init, get_rank, get_group_size def init_env(cfg): os.environ["OPENBLAS_NUM_THREADS"] = "1" cv2.setNumThreads(2) - ms.set_seed(cfg.seed) + mindspore.set_seed(cfg.seed) if cfg.device_target != "None": if cfg.device_target not in ["Ascend", "GPU", "CPU"]: raise ValueError(f"Invalid device_target: {cfg.device_target}, " f"should be in ['None', 'Ascend', 'GPU', 'CPU") - ms.set_context(device_target=cfg.device_target) + mindspore.set_context(device_target=cfg.device_target) if cfg.context_mode not in ["graph", "pynative"]: raise ValueError(f"Invalid context_mode: {cfg.context_mode}, " f"should be in ['graph', 'pynative") - context_mode = ms.GRAPH_MODE if cfg.context_mode == "graph" else ms.PYNATIVE_MODE 
- ms.set_context(mode=context_mode) - ms.set_context(ascend_config={'atomic_clean_policy': 0}) + context_mode = 0 if cfg.context_mode == "graph" else 1 + mindspore.set_context(mode=context_mode) + mindspore.set_context(ascend_config={'atomic_clean_policy': 0}) - cfg.device_target = ms.get_context("device_target") + cfg.device_target = mindspore.get_context("device_target") if cfg.device_target == "CPU": print("run on CPU !!!") cfg.device_id = 0 @@ -45,7 +45,7 @@ def init_env(cfg): cfg.rank_id = 0 if cfg.device_target == 'Ascend': - ms.set_context(device_id=cfg.device_id, ascend_config={"precision_mode": "allow_fp32_to_fp16"}) + mindspore.set_context(device_id=cfg.device_id, ascend_config={"precision_mode": "allow_fp32_to_fp16"}) if cfg.device_num > 1: init() @@ -53,13 +53,13 @@ def init_env(cfg): if cfg.device_num != group_size: raise ValueError(f"the setting device_num: {cfg.device_num} not equal to the real group_size: {group_size}") cfg.rank_id = get_rank() - ms.set_auto_parallel_context(parallel_mode=ms.ParallelMode.DATA_PARALLEL, gradients_mean=True) + mindspore.set_auto_parallel_context(parallel_mode=mindspore.ParallelMode.DATA_PARALLEL, gradients_mean=True) if hasattr(cfg, "all_reduce_fusion_config"): - ms.set_auto_parallel_context(all_reduce_fusion_config=cfg.all_reduce_fusion_config) + mindspore.set_auto_parallel_context(all_reduce_fusion_config=cfg.all_reduce_fusion_config) cpu_affinity(cfg.rank_id, cfg.device_num) else: if hasattr(cfg, "device_id") and isinstance(cfg.device_id, int) and cfg.device_target == 'Ascend': - ms.set_context(device_id=cfg.device_id) + mindspore.set_context(device_id=cfg.device_id) cfg.device_num = 1 cfg.rank_id = 0 diff --git a/official/cv/DBNet/src/utils/eval_utils.py b/official/cv/DBNet/src/utils/eval_utils.py index e516e975e..a5ec3a8e3 100644 --- a/official/cv/DBNet/src/utils/eval_utils.py +++ b/official/cv/DBNet/src/utils/eval_utils.py @@ -21,7 +21,7 @@ import numpy as np import cv2 from tqdm.auto import tqdm -import mindspore as ms +import mindspore from .metric import QuadMetric from .post_process import SegDetectorRepresenter @@ -39,7 +39,7 @@ class WithEval: config.eval.dest) def once_eval(self, batch): start = time.time() - img = ms.Tensor(batch['img']) + img = mindspore.Tensor(batch['img']) preds = self.model(img).asnumpy() boxes, scores = self.post_process({'binary': preds}) cur_time = time.time() - start diff --git a/official/cv/DBNet/src/utils/post_process.py b/official/cv/DBNet/src/utils/post_process.py index f7fe02ad0..aa2f6b4d1 100644 --- a/official/cv/DBNet/src/utils/post_process.py +++ b/official/cv/DBNet/src/utils/post_process.py @@ -19,7 +19,7 @@ import numpy as np from shapely.geometry import Polygon import pyclipper -import mindspore as ms +import mindspore import mindspore.ops as ops @@ -52,7 +52,7 @@ class SegDetectorRepresenter: dest_dict = {'binary': 0, 'thresh': 1, 'thresh_binary': 2} idx = dest_dict[self.dest] pred = pred[idx][:, 0, :, :] - if isinstance(pred, ms.Tensor): + if isinstance(pred, mindspore.Tensor): pred = pred.asnumpy() segmentation = self.binarize(pred) boxes_batch = [] diff --git a/official/cv/DBNet/train.py b/official/cv/DBNet/train.py index 72951125e..30a5f3f7e 100644 --- a/official/cv/DBNet/train.py +++ b/official/cv/DBNet/train.py @@ -16,7 +16,7 @@ import os import sys -import mindspore as ms +import mindspore from mindspore import nn from mindspore.train.callback import CheckpointConfig, ModelCheckpoint import src.modules.loss as loss @@ -71,17 +71,17 @@ def train(): net = get_dbnet(config.net, config, 
isTrain=True) if config.device_num > 1: params_num = len(net.trainable_params()) - ms.set_auto_parallel_context(all_reduce_fusion_config=[params_num // 2, params_num // 3 * 2, params_num - 1]) + mindspore.set_auto_parallel_context(all_reduce_fusion_config=[params_num // 2, params_num // 3 * 2, params_num - 1]) if config.train.pretrained_ckpt: - ms.load_checkpoint(net, config.train.pretrained_ckpt) + mindspore.load_checkpoint(net, config.train.pretrained_ckpt) config.logger.info("load pretrained checkpoint: %s", config.train.pretrained_ckpt) if config.train.resume_ckpt: - resume_param = ms.load_checkpoint(config.train.resume_ckpt, + resume_param = mindspore.load_checkpoint(config.train.resume_ckpt, choice_func=lambda x: not x.startswith(('learning_rate', 'global_step'))) - config.train.start_epoch_num = int(resume_param.get('epoch_num', ms.Tensor(0, ms.int32)).asnumpy().item()) + config.train.start_epoch_num = int(resume_param.get('epoch_num', mindspore.Tensor(0, mindspore.int32)).asnumpy().item()) - lr = ms.Tensor(warmup_polydecay(base_lr=config.optimizer.lr.base_lr, + lr = mindspore.Tensor(warmup_polydecay(base_lr=config.optimizer.lr.base_lr, target_lr=config.optimizer.lr.target_lr, warmup_epoch=config.optimizer.lr.warmup_epoch, total_epoch=config.train.total_epochs, @@ -107,8 +107,8 @@ def train(): bce_scale=config.loss.bce_scale, bce_replace=config.loss.bce_replace) if config.mix_precision: # only resnet run with float16 - net.to_float(ms.float32) - net.backbone.to_float(ms.float16) + net.to_float(mindspore.float32) + net.backbone.to_float(mindspore.float16) net_with_loss = WithLossCell(net, criterion) train_net = TrainOneStepCell(net_with_loss, optimizer=opt, scale_sense=nn.FixedLossScaleUpdateCell(1024.), clip_grad=config.train.clip_grad, force_update=config.train.force_update) @@ -123,11 +123,11 @@ def train(): cb_default.append(ModelCheckpoint(config=ckpt_config, directory=config.save_ckpt_dir, prefix=config.net + '-' + config.backbone.initializer)) if config.train.resume_ckpt: - ms.load_param_into_net(train_net, resume_param) + mindspore.load_param_into_net(train_net, resume_param) cb_default.append(ResumeCallback(config.train.start_epoch_num)) config.logger.info("Resume train from epoch: %s", config.train.start_epoch_num) cb_default.append(DBNetMonitor(config, net, lr.asnumpy(), per_print_times=config.per_print_times)) - model = ms.Model(train_net) + model = mindspore.Model(train_net) config.logger.save_args(config) if config.run_profiler: model.train(3, train_dataset, callbacks=cb_default, sink_size=20, diff --git a/official/cv/DeepLabV3P/eval.py b/official/cv/DeepLabV3P/eval.py index 9bceefc5d..194f3c1b2 100644 --- a/official/cv/DeepLabV3P/eval.py +++ b/official/cv/DeepLabV3P/eval.py @@ -17,10 +17,10 @@ import os import argparse import numpy as np import cv2 +import mindspore from mindspore import Tensor import mindspore.common.dtype as mstype import mindspore.nn as nn -from mindspore import context from mindspore.train.serialization import load_checkpoint, load_param_into_net from src.deeplab_v3plus import DeepLabV3Plus @@ -156,7 +156,7 @@ def eval_batch_scales(args, eval_net, img_lst, scales, def net_eval(): """net_eval""" args = parse_args() - context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, save_graphs=False, + mindspore.set_context(mode=0, device_target=args.device_target, save_graphs=False, device_id=args.device_id) # data list with open(args.data_lst) as f: diff --git a/official/cv/DeepLabV3P/export.py b/official/cv/DeepLabV3P/export.py index 
3ffeafc55..b36ccf42b 100644 --- a/official/cv/DeepLabV3P/export.py +++ b/official/cv/DeepLabV3P/export.py @@ -15,12 +15,13 @@ """export MINDIR file.""" import argparse import numpy as np +import mindspore import mindspore.nn as nn import mindspore.ops as ops -from mindspore import Tensor, context, load_checkpoint, load_param_into_net, export +from mindspore import Tensor, load_checkpoint, load_param_into_net, export from src.deeplab_v3plus import DeepLabV3Plus -context.set_context(mode=context.GRAPH_MODE, device_target='Ascend') +mindspore.set_context(mode=0, device_target='Ascend') class BuildEvalNetwork(nn.Cell): """BuildEvalNetwork""" diff --git a/official/cv/DeepLabV3P/train.py b/official/cv/DeepLabV3P/train.py index 8a57aa73f..49591835f 100644 --- a/official/cv/DeepLabV3P/train.py +++ b/official/cv/DeepLabV3P/train.py @@ -16,9 +16,9 @@ import os import argparse import ast -from mindspore import context +import mindspore from mindspore.train.model import Model -from mindspore.context import ParallelMode +from mindspore import ParallelMode import mindspore.nn as nn from mindspore.train.callback import ModelCheckpoint, CheckpointConfig from mindspore.train.serialization import load_checkpoint, load_param_into_net @@ -100,9 +100,9 @@ def parse_args(): def train(): """train""" args = parse_args() - context.set_context(mode=context.GRAPH_MODE, save_graphs=False, device_target=args.device_target) + mindspore.set_context(mode=0, save_graphs=False, device_target=args.device_target) if args.device_target != "CPU": - context.set_context(device_id=args.device_id) + mindspore.set_context(device_id=args.device_id) # init multicards training if args.modelArts_mode: @@ -116,7 +116,7 @@ def train(): args.rank = get_rank() args.group_size = get_group_size() parallel_mode = ParallelMode.DATA_PARALLEL - context.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, + mindspore.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=args.group_size) local_data_url = os.path.join(local_data_url, str(device_id)) # download dataset from obs to cache @@ -131,7 +131,7 @@ def train(): args.group_size = get_group_size() parallel_mode = ParallelMode.DATA_PARALLEL - context.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, + mindspore.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=args.group_size) data_file = args.data_file ckpt_file = args.ckpt_pre_trained diff --git a/official/cv/DeepLabv3/eval.py b/official/cv/DeepLabv3/eval.py index 5a407a601..a34c290e0 100644 --- a/official/cv/DeepLabv3/eval.py +++ b/official/cv/DeepLabv3/eval.py @@ -18,11 +18,11 @@ import os import time import numpy as np import cv2 +import mindspore from mindspore import Tensor import mindspore.common.dtype as mstype import mindspore.nn as nn import mindspore.ops as ops -from mindspore import context from mindspore.train.serialization import load_checkpoint, load_param_into_net from src.nets import net_factory @@ -187,7 +187,7 @@ def net_eval(): config.scales = config.scales_list[config.scales_type] args = config - context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, save_graphs=False, + mindspore.set_context(mode=0, device_target=args.device_target, save_graphs=False, device_id=get_device_id()) # data list diff --git a/official/cv/DeepLabv3/export.py b/official/cv/DeepLabv3/export.py index 0375e4048..af2433be5 100644 --- a/official/cv/DeepLabv3/export.py +++ b/official/cv/DeepLabv3/export.py @@ 
-16,9 +16,10 @@ import os import numpy as np +import mindspore import mindspore.nn as nn import mindspore.ops as ops -from mindspore import Tensor, context, load_checkpoint, load_param_into_net, export +from mindspore import Tensor, load_checkpoint, load_param_into_net, export from src.nets import net_factory from model_utils.config import config @@ -48,9 +49,9 @@ def modelarts_pre_process(): @moxing_wrapper(pre_process=modelarts_pre_process) def run_export(): '''run export.''' - context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target) + mindspore.set_context(mode=0, device_target=config.device_target) if config.device_target == "Ascend": - context.set_context(device_id=config.device_id) + mindspore.set_context(device_id=config.device_id) if config.export_model == 'deeplab_v3_s16': network = net_factory.nets_map['deeplab_v3_s16']('eval', config.num_classes, 16, config.freeze_bn) diff --git a/official/cv/DeepLabv3/model_utils/moxing_adapter.py b/official/cv/DeepLabv3/model_utils/moxing_adapter.py index 25838a7da..189ff0667 100644 --- a/official/cv/DeepLabv3/model_utils/moxing_adapter.py +++ b/official/cv/DeepLabv3/model_utils/moxing_adapter.py @@ -17,7 +17,7 @@ import os import functools -from mindspore import context +import mindspore from .config import config _global_sync_count = 0 @@ -92,7 +92,7 @@ def moxing_wrapper(pre_process=None, post_process=None): sync_data(config.train_url, config.output_path) print("Workspace downloaded: ", os.listdir(config.output_path)) - context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) + mindspore.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) config.device_num = get_device_num() config.device_id = get_device_id() if not os.path.exists(config.output_path): diff --git a/official/cv/DeepLabv3/modelarts/train_start.py b/official/cv/DeepLabv3/modelarts/train_start.py index 0a0f3f3e4..227230d45 100644 --- a/official/cv/DeepLabv3/modelarts/train_start.py +++ b/official/cv/DeepLabv3/modelarts/train_start.py @@ -19,9 +19,10 @@ import glob import argparse import moxing as mox import numpy as np -from mindspore import context, export, Tensor +import mindspore +from mindspore import export, Tensor from mindspore.train.model import Model -from mindspore.context import ParallelMode +from mindspore import ParallelMode import mindspore.nn as nn import mindspore.ops as ops from mindspore.train.callback import ModelCheckpoint, CheckpointConfig @@ -157,9 +158,9 @@ def get_device_id(): def train(args, train_url, data_file, ckpt_pre_trained): if args.device_target == "CPU": - context.set_context(mode=context.GRAPH_MODE, save_graphs=False, device_target="CPU") + mindspore.set_context(mode=0, save_graphs=False, device_target="CPU") else: - context.set_context(mode=context.GRAPH_MODE, save_graphs=False, + mindspore.set_context(mode=0, save_graphs=False, device_target="Ascend", device_id=get_device_id()) # init multicards training @@ -169,7 +170,7 @@ def train(args, train_url, data_file, ckpt_pre_trained): args.group_size = get_group_size() parallel_mode = ParallelMode.DATA_PARALLEL - context.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=args.group_size) + mindspore.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=args.group_size) # dataset dataset = data_generator.SegDataset(image_mean=args.image_mean, @@ -257,7 +258,7 @@ def train(args, train_url, data_file, ckpt_pre_trained): def export_air(args, 
train_url): '''run export.''' - context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target) + mindspore.set_context(mode=0, device_target=args.device_target) ckpt_list = glob.glob(train_url + "/*.ckpt") ckpt_list.sort(key=os.path.getmtime) ckpt_model = ckpt_list[-1] diff --git a/official/cv/DeepLabv3/train.py b/official/cv/DeepLabv3/train.py index 1a115a66f..0f374d335 100644 --- a/official/cv/DeepLabv3/train.py +++ b/official/cv/DeepLabv3/train.py @@ -16,9 +16,9 @@ import os import time -from mindspore import context +import mindspore from mindspore.train.model import Model -from mindspore.context import ParallelMode +from mindspore import ParallelMode import mindspore.nn as nn from mindspore.train.callback import ModelCheckpoint, CheckpointConfig from mindspore.train.serialization import load_checkpoint, load_param_into_net @@ -107,9 +107,9 @@ def train(): args = config if args.device_target == "CPU": - context.set_context(mode=context.GRAPH_MODE, save_graphs=False, device_target="CPU") + mindspore.set_context(mode=0, save_graphs=False, device_target="CPU") else: - context.set_context(mode=context.GRAPH_MODE, save_graphs=False, + mindspore.set_context(mode=0, save_graphs=False, device_target="Ascend", device_id=get_device_id()) # init multicards training @@ -119,7 +119,7 @@ def train(): args.group_size = get_group_size() parallel_mode = ParallelMode.DATA_PARALLEL - context.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=args.group_size) + mindspore.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=args.group_size) # dataset dataset = data_generator.SegDataset(image_mean=args.image_mean, diff --git a/official/cv/DeepText/README.md b/official/cv/DeepText/README.md index 22867dfc4..73854bf83 100644 --- a/official/cv/DeepText/README.md +++ b/official/cv/DeepText/README.md @@ -273,7 +273,7 @@ Here we used 4 datasets for training, and 1 datasets for Evaluation. ### Result -Training result will be stored in the example path. Checkpoints will be stored at `ckpt_path` by default, and training log will be redirected to `./log`, also the loss will be redirected to `./loss_0.log` like followings. +Training result will be stored in the example path. Checkpoints will be stored at `ckpt_path` by default, and training log will be redirected to `./log`, also the loss will be redirected to `./loss_0.log` like following. ```python 469 epoch: 1 step: 982 ,rpn_loss: 0.03940, rcnn_loss: 0.48169, rpn_cls_loss: 0.02910, rpn_reg_loss: 0.00344, rcnn_cls_loss: 0.41943, rcnn_reg_loss: 0.06223, total_loss: 0.52109 @@ -306,7 +306,7 @@ You can start training using python or shell scripts. The usage of shell scripts ### Result -Evaluation result will be stored in the example path, you can find result like the followings in `log`. +Evaluation result will be stored in the example path, you can find result like the following in `log`. 
```python ======================================== diff --git a/official/cv/DeepText/eval.py b/official/cv/DeepText/eval.py index 9aa93a432..d88905a51 100644 --- a/official/cv/DeepText/eval.py +++ b/official/cv/DeepText/eval.py @@ -18,8 +18,8 @@ import os import time import numpy as np +import mindspore import mindspore.common.dtype as mstype -from mindspore import context from mindspore.common import set_seed from mindspore.train.serialization import load_checkpoint, load_param_into_net @@ -33,7 +33,7 @@ from model_utils.device_adapter import get_device_id, get_device_num set_seed(1) -context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, device_id=get_device_id()) +mindspore.set_context(mode=0, device_target=config.device_target, device_id=get_device_id()) def deeptext_eval_test(dataset_path='', ckpt_path=''): @@ -51,7 +51,7 @@ def deeptext_eval_test(dataset_path='', ckpt_path=''): print("\n========================================\n", flush=True) print("Processing, please wait a moment.", flush=True) - device_type = "Ascend" if context.get_context("device_target") == "Ascend" else "Others" + device_type = "Ascend" if mindspore.get_context("device_target") == "Ascend" else "Others" if device_type == "Ascend": net.to_float(mstype.float16) diff --git a/official/cv/DeepText/export.py b/official/cv/DeepText/export.py index 4f668b707..c74305d62 100644 --- a/official/cv/DeepText/export.py +++ b/official/cv/DeepText/export.py @@ -16,8 +16,8 @@ import os import numpy as np -import mindspore as ms -from mindspore import Tensor, load_checkpoint, load_param_into_net, export, context +import mindspore +from mindspore import Tensor, load_checkpoint, load_param_into_net, export from src.Deeptext.deeptext_vgg16 import Deeptext_VGG16_Infer @@ -36,8 +36,8 @@ def modelarts_pre_process(): def run_export(): '''run export.''' config.test_batch_size = config.export_batch_size - context.set_context(mode=context.GRAPH_MODE, device_target=config.export_device_target) - context.set_context(device_id=get_device_id()) + mindspore.set_context(mode=0, device_target=config.export_device_target) + mindspore.set_context(device_id=get_device_id()) net = Deeptext_VGG16_Infer(config=config) net.set_train(False) @@ -50,7 +50,7 @@ def run_export(): load_param_into_net(net, param_dict_new) - img_data = Tensor(np.zeros([config.test_batch_size, 3, config.img_height, config.img_width]), ms.float32) + img_data = Tensor(np.zeros([config.test_batch_size, 3, config.img_height, config.img_width]), mindspore.float32) export(net, img_data, file_name=config.file_name, file_format=config.file_format) diff --git a/official/cv/DeepText/model_utils/moxing_adapter.py b/official/cv/DeepText/model_utils/moxing_adapter.py index 25838a7da..189ff0667 100644 --- a/official/cv/DeepText/model_utils/moxing_adapter.py +++ b/official/cv/DeepText/model_utils/moxing_adapter.py @@ -17,7 +17,7 @@ import os import functools -from mindspore import context +import mindspore from .config import config _global_sync_count = 0 @@ -92,7 +92,7 @@ def moxing_wrapper(pre_process=None, post_process=None): sync_data(config.train_url, config.output_path) print("Workspace downloaded: ", os.listdir(config.output_path)) - context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) + mindspore.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) config.device_num = get_device_num() config.device_id = get_device_id() if not os.path.exists(config.output_path): diff --git 
a/official/cv/DeepText/src/Deeptext/proposal_generator.py b/official/cv/DeepText/src/Deeptext/proposal_generator.py index 2b484be88..ff7803964 100644 --- a/official/cv/DeepText/src/Deeptext/proposal_generator.py +++ b/official/cv/DeepText/src/Deeptext/proposal_generator.py @@ -15,11 +15,12 @@ """Deeptext proposal generator.""" import numpy as np +import mindspore import mindspore.nn as nn import mindspore.common.dtype as mstype from mindspore.ops import functional as F from mindspore.ops import operations as P -from mindspore import context, Tensor +from mindspore import Tensor class Proposal(nn.Cell): @@ -113,7 +114,7 @@ class Proposal(nn.Cell): cfg = config self.topK_stage1 = () self.topK_shape = () - self.exec_mode = context.get_context("mode") + self.exec_mode = mindspore.get_context("mode") total_max_topk_input = 0 if not self.training_local: self.num_pre = cfg.rpn_nms_pre @@ -148,7 +149,7 @@ class Proposal(nn.Cell): bbox_pred_list = bbox_pred_list + (rpn_bbox_pred_i,) proposals, masks = self.get_bboxes_single(cls_score_list, bbox_pred_list, anchor_list) - if self.exec_mode == context.PYNATIVE_MODE: + if self.exec_mode == 1: proposals = F.stop_gradient(proposals) masks = F.stop_gradient(masks) proposals_tuple += (proposals,) diff --git a/official/cv/DeepText/train.py b/official/cv/DeepText/train.py index ad6b95bff..d00d1f60b 100644 --- a/official/cv/DeepText/train.py +++ b/official/cv/DeepText/train.py @@ -28,11 +28,12 @@ from model_utils.config import config from model_utils.moxing_adapter import moxing_wrapper from model_utils.device_adapter import get_device_id, get_device_num, get_rank_id +import mindspore import mindspore.common.dtype as mstype -from mindspore import context, Tensor, Parameter +from mindspore import Tensor, Parameter from mindspore.common import set_seed from mindspore.communication.management import init, get_group_size, get_rank -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.nn import Momentum, TrainOneStepWithLossScaleCell from mindspore.train import Model from mindspore.train.callback import CheckpointConfig, ModelCheckpoint, TimeMonitor @@ -42,7 +43,7 @@ np.set_printoptions(threshold=np.inf) set_seed(1001) -context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, device_id=get_device_id()) +mindspore.set_context(mode=0, device_target=config.device_target, device_id=get_device_id()) def modelarts_pre_process(): @@ -102,15 +103,15 @@ def modelarts_pre_process(): @moxing_wrapper(pre_process=modelarts_pre_process) def run_train(): - device_type = "Ascend" if context.get_context("device_target") == "Ascend" else "GPU" + device_type = "Ascend" if mindspore.get_context("device_target") == "Ascend" else "GPU" if device_type == "GPU": - context.set_context(enable_graph_kernel=True) + mindspore.set_context(enable_graph_kernel=True) if config.run_distribute: init() - context.reset_auto_parallel_context() + mindspore.reset_auto_parallel_context() rank = get_rank() device_num = get_group_size() - context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, + mindspore.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True) else: rank = get_rank_id() diff --git a/official/cv/EDSR/export.py b/official/cv/EDSR/export.py index 2aadb54c1..1e5458442 100644 --- a/official/cv/EDSR/export.py +++ b/official/cv/EDSR/export.py @@ -19,8 +19,8 @@ python export.py import os import numpy as np -import mindspore as ms -from 
mindspore import Tensor, export, context +import mindspore +from mindspore import Tensor, export from src.utils import init_net from model_utils.config import config @@ -28,9 +28,9 @@ from model_utils.device_adapter import get_device_id from model_utils.moxing_adapter import moxing_wrapper -context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target) +mindspore.set_context(mode=0, device_target=config.device_target) if config.device_target == "Ascend": - context.set_context(device_id=get_device_id()) + mindspore.set_context(device_id=get_device_id()) MAX_HR_SIZE = 2040 @@ -48,7 +48,7 @@ def run_export(): net = init_net(cfg) max_lr_size = MAX_HR_SIZE // cfg.scale - input_arr = Tensor(np.ones([1, cfg.n_colors, max_lr_size, max_lr_size]), ms.float32) + input_arr = Tensor(np.ones([1, cfg.n_colors, max_lr_size, max_lr_size]), mindspore.float32) file_name = os.path.splitext(os.path.basename(cfg.pre_trained))[0] file_name = file_name + f"_InputSize{max_lr_size}" file_path = os.path.join(cfg.output_path, file_name) diff --git a/official/cv/EDSR/model_utils/moxing_adapter.py b/official/cv/EDSR/model_utils/moxing_adapter.py index b9cab7332..60c1b6779 100644 --- a/official/cv/EDSR/model_utils/moxing_adapter.py +++ b/official/cv/EDSR/model_utils/moxing_adapter.py @@ -17,7 +17,7 @@ import os import functools -from mindspore import context +import mindspore from .config import config @@ -95,7 +95,7 @@ def moxing_wrapper(pre_process=None, post_process=None): sync_data(config.train_url, config.output_path) print("Workspace downloaded: ", os.listdir(config.output_path)) - context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) + mindspore.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) config.device_num = get_device_num() config.device_id = get_device_id() if not os.path.exists(config.output_path): diff --git a/official/cv/EDSR/src/utils.py b/official/cv/EDSR/src/utils.py index 65eea98a7..3bb96b4c9 100644 --- a/official/cv/EDSR/src/utils.py +++ b/official/cv/EDSR/src/utils.py @@ -18,9 +18,9 @@ import os import time -from mindspore import context +import mindspore from mindspore.communication.management import init -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.train.serialization import load_checkpoint from model_utils.config import config @@ -33,21 +33,21 @@ def init_env(cfg): """ init env for mindspore """ - context.set_context(mode=context.GRAPH_MODE, device_target=cfg.device_target) + mindspore.set_context(mode=0, device_target=cfg.device_target) device_num = get_device_num() if cfg.device_target == "Ascend": - context.set_context(device_id=get_device_id()) + mindspore.set_context(device_id=get_device_id()) if device_num > 1: init() - context.reset_auto_parallel_context() - context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, + mindspore.reset_auto_parallel_context() + mindspore.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True) elif cfg.device_target == "GPU": - context.set_context(enable_graph_kernel=True) + mindspore.set_context(enable_graph_kernel=True) if device_num > 1: init() - context.reset_auto_parallel_context() - context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, + mindspore.reset_auto_parallel_context() + mindspore.set_auto_parallel_context(device_num=device_num, 
parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True) elif cfg.device_target == "CPU": pass diff --git a/official/cv/Efficientnet/efficientnet-b0/eval.py b/official/cv/Efficientnet/efficientnet-b0/eval.py index 00b068987..173714967 100644 --- a/official/cv/Efficientnet/efficientnet-b0/eval.py +++ b/official/cv/Efficientnet/efficientnet-b0/eval.py @@ -15,8 +15,8 @@ """evaluate imagenet""" import time +import mindspore from mindspore import nn -from mindspore import context from mindspore.train.model import Model from mindspore.train.serialization import load_checkpoint, load_param_into_net @@ -33,7 +33,7 @@ if __name__ == '__main__': else: raise NotImplementedError("This model currently not supported") - context.set_context(mode=context.GRAPH_MODE, device_target=config.platform) + mindspore.set_context(mode=0, device_target=config.platform) if model_name == 'efficientnet_b0': net = efficientnet_b0(num_classes=config.num_classes, diff --git a/official/cv/Efficientnet/efficientnet-b0/export.py b/official/cv/Efficientnet/efficientnet-b0/export.py index d6a5fceef..769745cf4 100644 --- a/official/cv/Efficientnet/efficientnet-b0/export.py +++ b/official/cv/Efficientnet/efficientnet-b0/export.py @@ -15,11 +15,12 @@ """export file""" import numpy as np -from mindspore import Tensor, context, load_checkpoint, load_param_into_net, export +import mindspore +from mindspore import Tensor, load_checkpoint, load_param_into_net, export from src.efficientnet import efficientnet_b0 from src.config import config -context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target) +mindspore.set_context(mode=0, device_target=config.device_target) if __name__ == "__main__": diff --git a/official/cv/Efficientnet/efficientnet-b0/train.py b/official/cv/Efficientnet/efficientnet-b0/train.py index 32c2a93dd..f600e7575 100644 --- a/official/cv/Efficientnet/efficientnet-b0/train.py +++ b/official/cv/Efficientnet/efficientnet-b0/train.py @@ -19,10 +19,10 @@ import os import numpy as np import mindspore from mindspore import nn -from mindspore import Tensor, context +from mindspore import Tensor from mindspore.communication.management import get_group_size, get_rank, init from mindspore.nn import SGD, RMSProp -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.train.callback import (CheckpointConfig, LossMonitor, ModelCheckpoint, TimeMonitor) from mindspore.train.loss_scale_manager import FixedLossScaleManager @@ -107,14 +107,14 @@ if __name__ == '__main__': summary_dir = local_path + "/train/summary/" rank_id, rank_size = 0, 1 - context.set_context(mode=context.GRAPH_MODE) + mindspore.set_context(mode=0) if config.platform == "GPU": dataset_sink_mode = True - context.set_context(device_target='GPU', enable_graph_kernel=True) + mindspore.set_context(device_target='GPU', enable_graph_kernel=True) elif config.platform == "CPU": dataset_sink_mode = False - context.set_context(device_target='CPU') + mindspore.set_context(device_target='CPU') else: raise NotImplementedError("Training only supported for CPU and GPU.") @@ -123,10 +123,10 @@ if __name__ == '__main__': init("nccl") else: raise NotImplementedError("Distributed Training only supported for GPU.") - context.reset_auto_parallel_context() + mindspore.reset_auto_parallel_context() rank_id = get_rank() rank_size = get_group_size() - context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, + mindspore.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, 
device_num=rank_size) summary_dir += "thread_num_" + str(rank_id) + "/" diff --git a/official/cv/Efficientnet/efficientnet-b1/eval.py b/official/cv/Efficientnet/efficientnet-b1/eval.py index 74cd2144d..c9950c006 100644 --- a/official/cv/Efficientnet/efficientnet-b1/eval.py +++ b/official/cv/Efficientnet/efficientnet-b1/eval.py @@ -17,8 +17,9 @@ import ast import timeit import argparse +import mindspore import mindspore.nn as nn -from mindspore import context, Model +from mindspore import Model from mindspore.common import set_seed from mindspore.train.serialization import load_checkpoint, load_param_into_net @@ -62,7 +63,7 @@ def parse_args(): @moxing_wrapper(config) def main(): """Main function for model evaluation.""" - context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, save_graphs=False) + mindspore.set_context(mode=0, device_target=config.device_target, save_graphs=False) dataset = create_imagenet(dataset_path=config.data_path, do_train=False, repeat_num=1, input_size=config.input_size, batch_size=config.batchsize, target=config.device_target, distribute=config.run_distribute) diff --git a/official/cv/Efficientnet/efficientnet-b1/export.py b/official/cv/Efficientnet/efficientnet-b1/export.py index e3ff75791..3aa647033 100644 --- a/official/cv/Efficientnet/efficientnet-b1/export.py +++ b/official/cv/Efficientnet/efficientnet-b1/export.py @@ -15,7 +15,8 @@ """export efficientnet IR.""" import argparse import numpy as np -from mindspore import context, Tensor, load_checkpoint, load_param_into_net, export +import mindspore +from mindspore import Tensor, load_checkpoint, load_param_into_net, export from src.models.effnet import EfficientNet from src.config import efficientnet_b1_config_ascend as config @@ -30,7 +31,7 @@ parser.add_argument("--file_format", type=str, choices=["AIR", "ONNX", "MINDIR"] args_opt = parser.parse_args() if __name__ == "__main__": - context.set_context(mode=context.GRAPH_MODE, device_target="Ascend") + mindspore.set_context(mode=0, device_target="Ascend") net = EfficientNet(width_coeff=config.width_coeff, depth_coeff=config.depth_coeff, dropout_rate=config.dropout_rate, drop_connect_rate=config.drop_connect_rate, diff --git a/official/cv/Efficientnet/efficientnet-b1/src/model_utils/moxing_adapter.py b/official/cv/Efficientnet/efficientnet-b1/src/model_utils/moxing_adapter.py index fe7456991..50b6b969d 100644 --- a/official/cv/Efficientnet/efficientnet-b1/src/model_utils/moxing_adapter.py +++ b/official/cv/Efficientnet/efficientnet-b1/src/model_utils/moxing_adapter.py @@ -16,7 +16,7 @@ import os import time import functools -from mindspore import context +import mindspore from src.config import show_config @@ -106,7 +106,7 @@ def moxing_wrapper(config, pre_process=None, post_process=None): sync_data(config.eval_data_url, config.eval_data_path) print("Workspace downloaded: ", os.listdir(config.eval_data_path), flush=True) - context.set_context(save_graphs_path=os.path.join(config.train_path, str(get_rank_id()))) + mindspore.set_context(save_graphs_path=os.path.join(config.train_path, str(get_rank_id()))) config.device_num = get_device_num() config.device_id = get_device_id() if not os.path.exists(config.train_path): diff --git a/official/cv/Efficientnet/efficientnet-b1/train.py b/official/cv/Efficientnet/efficientnet-b1/train.py index c73fffe62..430467c52 100644 --- a/official/cv/Efficientnet/efficientnet-b1/train.py +++ b/official/cv/Efficientnet/efficientnet-b1/train.py @@ -17,8 +17,8 @@ import os import ast import argparse +import 
mindspore import mindspore.nn as nn -from mindspore import context from mindspore.train.model import Model, ParallelMode from mindspore.train.serialization import load_checkpoint, load_param_into_net from mindspore.communication.management import init @@ -77,13 +77,13 @@ def parse_args(): @moxing_wrapper(config) def main(): - context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target) + mindspore.set_context(mode=0, device_target=config.device_target) if config.run_distribute: init() device_id = int(os.getenv("DEVICE_ID")) device_num = int(os.getenv("RANK_SIZE")) parallel_mode = ParallelMode.DATA_PARALLEL - context.set_auto_parallel_context(parallel_mode=parallel_mode, + mindspore.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=device_num) else: diff --git a/official/cv/Efficientnet/efficientnet-b2/eval.py b/official/cv/Efficientnet/efficientnet-b2/eval.py index 728393eb2..b3f7e2131 100644 --- a/official/cv/Efficientnet/efficientnet-b2/eval.py +++ b/official/cv/Efficientnet/efficientnet-b2/eval.py @@ -16,7 +16,8 @@ import os import ast import argparse -from mindspore import context, nn +import mindspore +from mindspore import nn from mindspore.train.model import Model from mindspore.common import set_seed from mindspore.train.serialization import load_checkpoint, load_param_into_net @@ -42,20 +43,20 @@ if __name__ == '__main__': parser.add_argument('--device_target', type=str, choices=["Ascend", "GPU"], default="Ascend", help='Device target') args_opt = parser.parse_args() - context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.device_target, save_graphs=False) + mindspore.set_context(mode=0, device_target=args_opt.device_target, save_graphs=False) if args_opt.run_modelarts: import moxing as mox device_id = int(os.getenv('DEVICE_ID')) device_num = int(os.getenv('RANK_SIZE')) - context.set_context(device_id=device_id) + mindspore.set_context(device_id=device_id) local_data_url = '/cache/data/' local_train_url = '/cache/ckpt/' mox.file.copy_parallel(args_opt.data_url, local_data_url) mox.file.copy_parallel(args_opt.train_url, local_train_url) else: - context.set_context(device_id=args_opt.device_id) + mindspore.set_context(device_id=args_opt.device_id) if args_opt.device_target == "GPU": config = config_gpu diff --git a/official/cv/Efficientnet/efficientnet-b2/export.py b/official/cv/Efficientnet/efficientnet-b2/export.py index d758e080a..005e05e46 100644 --- a/official/cv/Efficientnet/efficientnet-b2/export.py +++ b/official/cv/Efficientnet/efficientnet-b2/export.py @@ -17,7 +17,8 @@ efficientnet export. 
""" import argparse import numpy as np -from mindspore import context, Tensor, load_checkpoint, load_param_into_net, export +import mindspore +from mindspore import Tensor, load_checkpoint, load_param_into_net, export from src.models.effnet import EfficientNet parser = argparse.ArgumentParser(description='Image classification') @@ -30,7 +31,7 @@ parser.add_argument('--device_target', type=str, choices=["Ascend", "GPU"], defa args_opt = parser.parse_args() if __name__ == '__main__': - context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.device_target) + mindspore.set_context(mode=0, device_target=args_opt.device_target) net = EfficientNet(1.1, 1.2, dropout_rate=0.3) diff --git a/official/cv/Efficientnet/efficientnet-b2/train.py b/official/cv/Efficientnet/efficientnet-b2/train.py index f5c3b8796..2236ff007 100644 --- a/official/cv/Efficientnet/efficientnet-b2/train.py +++ b/official/cv/Efficientnet/efficientnet-b2/train.py @@ -17,7 +17,7 @@ import os import ast import argparse -from mindspore import context +import mindspore from mindspore import Tensor from mindspore.nn import SGD, RMSProp from mindspore.train.model import Model, ParallelMode @@ -53,7 +53,7 @@ if __name__ == '__main__': parser.add_argument('--device_target', type=str, choices=["Ascend", "GPU"], default="Ascend", help='Device target') args_opt = parser.parse_args() - context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.device_target, save_graphs=False) + mindspore.set_context(mode=0, device_target=args_opt.device_target, save_graphs=False) # init distributed if args_opt.run_modelarts: @@ -61,36 +61,36 @@ if __name__ == '__main__': device_id = int(os.getenv('DEVICE_ID')) device_num = int(os.getenv('RANK_SIZE')) - context.set_context(device_id=device_id) + mindspore.set_context(device_id=device_id) local_data_url = '/cache/data' local_train_url = '/cache/ckpt' if device_num > 1: init() - context.set_auto_parallel_context(device_num=device_num, parallel_mode='data_parallel', gradients_mean=True) + mindspore.set_auto_parallel_context(device_num=device_num, parallel_mode='data_parallel', gradients_mean=True) local_data_url = os.path.join(local_data_url, str(device_id)) mox.file.copy_parallel(args_opt.data_url, local_data_url) else: if args_opt.run_distribute: if args_opt.device_target == "GPU": init() - context.reset_auto_parallel_context() + mindspore.reset_auto_parallel_context() device_id = get_rank() device_num = get_group_size() print("run distribute......", "deviceNum:", device_num, ",rank_id:", device_id) - context.set_auto_parallel_context(device_num=device_num, + mindspore.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True) else: device_id = int(os.getenv('DEVICE_ID')) device_num = int(os.getenv('RANK_SIZE')) - context.set_context(device_id=device_id) + mindspore.set_context(device_id=device_id) init() - context.reset_auto_parallel_context() - context.set_auto_parallel_context(device_num=device_num, - parallel_mode=context.ParallelMode.DATA_PARALLEL, + mindspore.reset_auto_parallel_context() + mindspore.set_auto_parallel_context(device_num=device_num, + parallel_mode=mindspore.ParallelMode.DATA_PARALLEL, gradients_mean=True) else: - context.set_context(device_id=args_opt.device_id) + mindspore.set_context(device_id=args_opt.device_id) device_num = 1 device_id = 0 diff --git a/official/cv/Efficientnet/efficientnet-b3/eval.py b/official/cv/Efficientnet/efficientnet-b3/eval.py index de055d85d..d56c3e8b2 100644 --- 
a/official/cv/Efficientnet/efficientnet-b3/eval.py +++ b/official/cv/Efficientnet/efficientnet-b3/eval.py @@ -16,7 +16,8 @@ import os import ast import argparse -from mindspore import context, nn +import mindspore +from mindspore import nn from mindspore.train.model import Model from mindspore.common import set_seed from mindspore.train.serialization import load_checkpoint, load_param_into_net @@ -42,20 +43,20 @@ if __name__ == '__main__': parser.add_argument('--run_modelarts', type=ast.literal_eval, default=False, help='Run distribute') args_opt = parser.parse_args() - context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.device_target, save_graphs=False) + mindspore.set_context(mode=0, device_target=args_opt.device_target, save_graphs=False) if args_opt.run_modelarts: import moxing as mox device_id = int(os.getenv('DEVICE_ID')) device_num = int(os.getenv('RANK_SIZE')) - context.set_context(device_id=device_id) + mindspore.set_context(device_id=device_id) local_data_url = '/cache/data/' local_train_url = '/cache/ckpt/' mox.file.copy_parallel(args_opt.data_url, local_data_url) mox.file.copy_parallel(args_opt.train_url, local_train_url) else: - context.set_context(device_id=args_opt.device_id) + mindspore.set_context(device_id=args_opt.device_id) # create dataset if args_opt.run_modelarts: diff --git a/official/cv/Efficientnet/efficientnet-b3/export.py b/official/cv/Efficientnet/efficientnet-b3/export.py index cfaec24bc..2494270f4 100644 --- a/official/cv/Efficientnet/efficientnet-b3/export.py +++ b/official/cv/Efficientnet/efficientnet-b3/export.py @@ -17,7 +17,8 @@ efficientnet export. """ import argparse import numpy as np -from mindspore import context, Tensor, load_checkpoint, load_param_into_net, export +import mindspore +from mindspore import Tensor, load_checkpoint, load_param_into_net, export from src.models.effnet import EfficientNet @@ -31,7 +32,7 @@ parser.add_argument("--device_target", type=str, choices=["Ascend", "GPU"], defa args_opt = parser.parse_args() if __name__ == '__main__': - context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.device_target) + mindspore.set_context(mode=0, device_target=args_opt.device_target) net = EfficientNet() diff --git a/official/cv/Efficientnet/efficientnet-b3/modelarts/train_start.py b/official/cv/Efficientnet/efficientnet-b3/modelarts/train_start.py index bdc632457..b04303192 100644 --- a/official/cv/Efficientnet/efficientnet-b3/modelarts/train_start.py +++ b/official/cv/Efficientnet/efficientnet-b3/modelarts/train_start.py @@ -18,11 +18,11 @@ import ast import argparse import numpy as np -from mindspore import context +import mindspore from mindspore import Tensor from mindspore.nn import SGD, RMSProp from mindspore.train.model import Model -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.train.callback import ModelCheckpoint, CheckpointConfig from mindspore.communication.management import init from mindspore.train.loss_scale_manager import FixedLossScaleManager @@ -55,33 +55,33 @@ if __name__ == '__main__': parser.add_argument('--resume', type=str, default='', help='resume training with existed checkpoint') args_opt = parser.parse_args() - context.set_context(mode=context.GRAPH_MODE, device_target="Ascend") + mindspore.set_context(mode=0, device_target="Ascend") # init distributed if args_opt.run_modelarts: import moxing as mox device_id = int(os.getenv('DEVICE_ID')) device_num = int(os.getenv('RANK_SIZE')) - context.set_context(device_id=device_id) + 
mindspore.set_context(device_id=device_id) local_data_url = '/cache/data' local_train_url = '/cache/ckpt' if device_num > 1: init() - context.set_auto_parallel_context(device_num=device_num, parallel_mode='data_parallel', gradients_mean=True) + mindspore.set_auto_parallel_context(device_num=device_num, parallel_mode='data_parallel', gradients_mean=True) local_data_url = os.path.join(local_data_url, str(device_id)) mox.file.copy_parallel(args_opt.data_url, local_data_url) else: if args_opt.run_distribute: device_id = int(os.getenv('DEVICE_ID')) device_num = int(os.getenv('RANK_SIZE')) - context.set_context(device_id=device_id) + mindspore.set_context(device_id=device_id) init() - context.reset_auto_parallel_context() - context.set_auto_parallel_context(device_num=device_num, + mindspore.reset_auto_parallel_context() + mindspore.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True) else: - context.set_context(device_id=args_opt.device_id) + mindspore.set_context(device_id=args_opt.device_id) device_num = 1 device_id = 0 diff --git a/official/cv/Efficientnet/efficientnet-b3/train.py b/official/cv/Efficientnet/efficientnet-b3/train.py index dfa9d8d4d..35cbd4d3d 100644 --- a/official/cv/Efficientnet/efficientnet-b3/train.py +++ b/official/cv/Efficientnet/efficientnet-b3/train.py @@ -17,7 +17,7 @@ import os import ast import argparse -from mindspore import context +import mindspore from mindspore import Tensor from mindspore.nn import SGD, RMSProp from mindspore.train.model import Model @@ -53,7 +53,7 @@ if __name__ == '__main__': parser.add_argument('--resume', type=str, default='', help='resume training with existed checkpoint') args_opt = parser.parse_args() - context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.device_target, save_graphs=False) + mindspore.set_context(mode=0, device_target=args_opt.device_target, save_graphs=False) # init distributed if args_opt.run_modelarts: @@ -62,27 +62,27 @@ if __name__ == '__main__': device_id = int(os.getenv('DEVICE_ID')) rank = int(os.getenv('RANK_ID')) device_num = int(os.getenv('RANK_SIZE')) - context.set_context(device_id=device_id) + mindspore.set_context(device_id=device_id) local_data_url = '/cache/data' local_train_url = '/cache/ckpt' if device_num > 1: init() - context.set_auto_parallel_context(device_num=device_num, parallel_mode='data_parallel', gradients_mean=True) + mindspore.set_auto_parallel_context(device_num=device_num, parallel_mode='data_parallel', gradients_mean=True) local_data_url = os.path.join(local_data_url, str(device_id)) mox.file.copy_parallel(args_opt.data_url, local_data_url) else: if args_opt.run_distribute: if os.getenv('DEVICE_ID', "not_set").isdigit(): - context.set_context(device_id=int(os.getenv("DEVICE_ID"))) + mindspore.set_context(device_id=int(os.getenv("DEVICE_ID"))) init() rank = get_rank() device_num = get_group_size() - context.reset_auto_parallel_context() - context.set_auto_parallel_context(device_num=device_num, - parallel_mode=context.ParallelMode.DATA_PARALLEL, + mindspore.reset_auto_parallel_context() + mindspore.set_auto_parallel_context(device_num=device_num, + parallel_mode=mindspore.ParallelMode.DATA_PARALLEL, gradients_mean=True) else: - context.set_context(device_id=args_opt.device_id) + mindspore.set_context(device_id=args_opt.device_id) device_num = 1 rank = 0 diff --git a/official/cv/Efficientnet/efficientnetv2/eval.py b/official/cv/Efficientnet/efficientnetv2/eval.py index 56d6f376f..134c5d91d 100644 --- 
a/official/cv/Efficientnet/efficientnetv2/eval.py +++ b/official/cv/Efficientnet/efficientnetv2/eval.py @@ -13,9 +13,8 @@ # limitations under the License. # ============================================================================ """eval""" - +import mindspore from mindspore import Model -from mindspore import context from mindspore import nn from mindspore.common import set_seed @@ -30,13 +29,13 @@ set_seed(args.seed) def main(): mode = { - 0: context.GRAPH_MODE, - 1: context.PYNATIVE_MODE + 0: 0, + 1: 1 } - context.set_context(mode=mode[args.graph_mode], device_target=args.device_target) - context.set_context(enable_graph_kernel=False) + mindspore.set_context(mode=mode[args.graph_mode], device_target=args.device_target) + mindspore.set_context(enable_graph_kernel=False) if args.device_target == "Ascend": - context.set_context(enable_auto_mixed_precision=True) + mindspore.set_context(enable_auto_mixed_precision=True) set_device(args) # get model diff --git a/official/cv/Efficientnet/efficientnetv2/export.py b/official/cv/Efficientnet/efficientnetv2/export.py index 3d94ed906..1cb6f26d6 100644 --- a/official/cv/Efficientnet/efficientnetv2/export.py +++ b/official/cv/Efficientnet/efficientnetv2/export.py @@ -18,7 +18,9 @@ python export.py """ import numpy as np -from mindspore import Tensor, load_checkpoint, load_param_into_net, export, context + +import mindspore +from mindspore import Tensor, load_checkpoint, load_param_into_net, export from mindspore import dtype as mstype from src.args import args @@ -26,10 +28,10 @@ from src.tools.cell import cast_amp from src.tools.criterion import get_criterion, NetWithLoss from src.tools.get_misc import get_model -context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target) +mindspore.set_context(mode=0, device_target=args.device_target) if args.device_target in ["Ascend", "GPU"]: - context.set_context(device_id=args.device_id) + mindspore.set_context(device_id=args.device_id) if __name__ == '__main__': net = get_model(args) diff --git a/official/cv/Efficientnet/efficientnetv2/src/tools/get_misc.py b/official/cv/Efficientnet/efficientnetv2/src/tools/get_misc.py index f1f4168ae..e3e7b05b4 100644 --- a/official/cv/Efficientnet/efficientnetv2/src/tools/get_misc.py +++ b/official/cv/Efficientnet/efficientnetv2/src/tools/get_misc.py @@ -14,11 +14,10 @@ # ============================================================================ """misc functions for program""" import os - -from mindspore import context +import mindspore from mindspore import nn from mindspore.communication.management import init, get_rank -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.train.serialization import load_checkpoint, load_param_into_net from src import models, data @@ -35,16 +34,16 @@ def set_device(args): if device_target == "Ascend": if device_num > 1: - context.set_context(device_id=int(os.environ["DEVICE_ID"])) + mindspore.set_context(device_id=int(os.environ["DEVICE_ID"])) init(backend_name='hccl') - context.reset_auto_parallel_context() - context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, + mindspore.reset_auto_parallel_context() + mindspore.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True) - # context.set_auto_parallel_context(pipeline_stages=2, full_batch=True) + # mindspore.set_auto_parallel_context(pipeline_stages=2, full_batch=True) rank = get_rank() else: - 
context.set_context(device_id=args.device_id) + mindspore.set_context(device_id=args.device_id) else: raise ValueError("Unsupported platform.") diff --git a/official/cv/Efficientnet/efficientnetv2/train.py b/official/cv/Efficientnet/efficientnetv2/train.py index 59b1e8847..1b2c3762f 100644 --- a/official/cv/Efficientnet/efficientnetv2/train.py +++ b/official/cv/Efficientnet/efficientnetv2/train.py @@ -20,7 +20,8 @@ Acc: ImageNet1k-84.9% (pretrained on ImageNet22k) """ import os -from mindspore import Model, nn, context, set_seed +import mindspore +from mindspore import Model, nn, set_seed from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor from src.args import args @@ -34,13 +35,13 @@ from src.tools.optimizer import get_optimizer def main(): set_seed(args.seed) mode = { - 0: context.GRAPH_MODE, - 1: context.PYNATIVE_MODE + 0: 0, # GRAPH_MODE + 1: 1 # PYNATIVE_MODE } - context.set_context(mode=mode[args.graph_mode], device_target=args.device_target) - context.set_context(enable_graph_kernel=False) + mindspore.set_context(mode=mode[args.graph_mode], device_target=args.device_target) + mindspore.set_context(enable_graph_kernel=False) if args.device_target == "Ascend": - context.set_context(enable_auto_mixed_precision=True) + mindspore.set_context(enable_auto_mixed_precision=True) rank = set_device(args) # get model and cast amp_level diff --git a/official/cv/FasterRCNN/README.md b/official/cv/FasterRCNN/README.md index 58a1897b9..130130c80 100644 --- a/official/cv/FasterRCNN/README.md +++ b/official/cv/FasterRCNN/README.md @@ -82,7 +82,7 @@ Dataset used: [FaceMaskDetection]() - Docker base image - - [Ascend Hub](ascend.huawei.com/ascendhub/#/home) + - [Ascend Hub](https://www.hiascend.com/developer/ascendhub) - Install [MindSpore](https://www.mindspore.cn/install/en).
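The hunks above, and those that follow, all apply the same mechanical migration off the `mindspore.context` submodule. A minimal sketch of the pattern, assuming MindSpore 2.x, where `set_context`, `set_auto_parallel_context`, `GRAPH_MODE`, `PYNATIVE_MODE`, and `ParallelMode` are re-exported at the top level of the `mindspore` package (the bare `mode=0` seen in some hunks is simply the integer value of `GRAPH_MODE`):

```python
import mindspore

# Old 1.x style, removed by this patch:
#   from mindspore import context
#   context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")

# New style: the same entry points live on the top-level package.
# mindspore.GRAPH_MODE == 0 and mindspore.PYNATIVE_MODE == 1, so the named
# constants are an equivalent, more readable spelling of the bare literals.
mindspore.set_context(mode=mindspore.GRAPH_MODE, device_target="Ascend")
mindspore.set_auto_parallel_context(device_num=8,  # illustrative value
                                    parallel_mode=mindspore.ParallelMode.DATA_PARALLEL,
                                    gradients_mean=True)
```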
diff --git a/official/cv/FasterRCNN/README_CN.md b/official/cv/FasterRCNN/README_CN.md index ace823dbf..ed214c0af 100644 --- a/official/cv/FasterRCNN/README_CN.md +++ b/official/cv/FasterRCNN/README_CN.md @@ -116,7 +116,7 @@ Faster R-CNN是一个两阶段目标检测网络，该网络采用RPN，可以 - 获取基础镜像 - - [Ascend Hub](https://ascend.huawei.com/ascendhub/#/home) + - [Ascend Hub](https://www.hiascend.com/developer/ascendhub) - 安装[MindSpore](https://www.mindspore.cn/install)。 diff --git a/official/cv/FasterRCNN/eval.py b/official/cv/FasterRCNN/eval.py index fc60bafb2..87cb9cd91 100644 --- a/official/cv/FasterRCNN/eval.py +++ b/official/cv/FasterRCNN/eval.py @@ -20,7 +20,7 @@ from collections import defaultdict import numpy as np from pycocotools.coco import COCO -import mindspore as ms +import mindspore from mindspore.common import set_seed, Parameter from src.dataset import data_to_mindrecord_byte_image, create_fasterrcnn_dataset, parse_json_annos_from_txt @@ -29,7 +29,7 @@ from src.model_utils.config import config from src.model_utils.moxing_adapter import moxing_wrapper from src.model_utils.device_adapter import get_device_id from src.FasterRcnn.faster_rcnn import Faster_Rcnn -ms.context.set_context(max_call_depth=2000) +mindspore.set_context(max_call_depth=2000) def fasterrcnn_eval(dataset_path, ckpt_path, anno_path): """FasterRcnn evaluation.""" @@ -39,7 +39,7 @@ net = Faster_Rcnn(config) try: - param_dict = ms.load_checkpoint(ckpt_path) + param_dict = mindspore.load_checkpoint(ckpt_path) except RuntimeError as ex: ex = str(ex) print("Traceback:\n", ex, flush=True) @@ -60,12 +60,12 @@ for key, value in param_dict.items(): tensor = value.asnumpy().astype(np.float32) param_dict[key] = Parameter(tensor, key) - ms.load_param_into_net(net, param_dict) + mindspore.load_param_into_net(net, param_dict) net.set_train(False) - device_type = "Ascend" if ms.get_context("device_target") == "Ascend" else "Others" + device_type = "Ascend" if mindspore.get_context("device_target") == "Ascend" else "Others" if device_type == "Ascend": - net.to_float(ms.float16) + net.to_float(mindspore.float16) eval_iter = 0 total = ds.get_dataset_size() @@ -199,6 +199,6 @@ def eval_fasterrcnn(): if __name__ == '__main__': set_seed(1) - ms.set_context(mode=ms.GRAPH_MODE, device_target=config.device_target, device_id=get_device_id()) + mindspore.set_context(mode=0, device_target=config.device_target, device_id=get_device_id()) eval_fasterrcnn() diff --git a/official/cv/FasterRCNN/export.py b/official/cv/FasterRCNN/export.py index e525cc4b7..da6507f11 100644 --- a/official/cv/FasterRCNN/export.py +++ b/official/cv/FasterRCNN/export.py @@ -15,16 +15,16 @@ """export checkpoint file into air, onnx, mindir models""" import numpy as np -import mindspore as ms +import mindspore from mindspore import Tensor from src.model_utils.config import config from src.model_utils.moxing_adapter import moxing_wrapper from src.model_utils.device_adapter import get_device_id from src.FasterRcnn.faster_rcnn import FasterRcnn_Infer -ms.set_context(mode=ms.GRAPH_MODE, device_target=config.device_target, max_call_depth=2000) +mindspore.set_context(mode=0, device_target=config.device_target, max_call_depth=2000) if config.device_target == "Ascend": - ms.set_context(device_id=get_device_id()) + mindspore.set_context(device_id=get_device_id()) def modelarts_pre_process(): @@ -40,7 +40,7 @@ def export_fasterrcnn(): net = FasterRcnn_Infer(config=config) try: - param_dict =
ms.load_checkpoint(config.ckpt_file) + param_dict = mindspore.load_checkpoint(config.ckpt_file) except RuntimeError as ex: ex = str(ex) print("Traceback:\n", ex, flush=True) @@ -53,22 +53,22 @@ def export_fasterrcnn(): key = key.replace("ncek", "neck") param_dict_new["network." + key] = value - ms.load_param_into_net(net, param_dict_new) + mindspore.load_param_into_net(net, param_dict_new) - device_type = "Ascend" if ms.get_context("device_target") == "Ascend" else "Others" + device_type = "Ascend" if mindspore.get_context("device_target") == "Ascend" else "Others" if device_type == "Ascend": - net.to_float(ms.float16) + net.to_float(mindspore.float16) - img = Tensor(np.zeros([config.test_batch_size, 3, config.img_height, config.img_width]), ms.float32) - img_metas = Tensor(np.random.uniform(0.0, 1.0, size=[config.test_batch_size, 4]), ms.float32) + img = Tensor(np.zeros([config.test_batch_size, 3, config.img_height, config.img_width]), mindspore.float32) + img_metas = Tensor(np.random.uniform(0.0, 1.0, size=[config.test_batch_size, 4]), mindspore.float32) if not config.restore_bbox: print("[WARNING] When parameter 'restore_bbox' set to False, " "ascend310_infer of this project provided will not be available " "and need to complete 310 infer function by yourself.") - ms.export(net, img, file_name=config.file_name, file_format=config.file_format) + mindspore.export(net, img, file_name=config.file_name, file_format=config.file_format) else: - ms.export(net, img, img_metas, file_name=config.file_name, file_format=config.file_format) + mindspore.export(net, img, img_metas, file_name=config.file_name, file_format=config.file_format) if __name__ == '__main__': diff --git a/official/cv/FasterRCNN/src/FasterRcnn/bbox_assign_sample.py b/official/cv/FasterRCNN/src/FasterRcnn/bbox_assign_sample.py index 0cb30ad1c..9bf645695 100644 --- a/official/cv/FasterRCNN/src/FasterRcnn/bbox_assign_sample.py +++ b/official/cv/FasterRCNN/src/FasterRcnn/bbox_assign_sample.py @@ -15,7 +15,7 @@ """FasterRcnn positive and negative sample screening for RPN.""" import numpy as np -import mindspore as ms +import mindspore import mindspore.nn as nn import mindspore.ops as ops from mindspore.common.tensor import Tensor @@ -46,7 +46,7 @@ class BboxAssignSample(nn.Cell): super(BboxAssignSample, self).__init__() cfg = config self.dtype = np.float32 - self.ms_type = ms.float32 + self.ms_type = mindspore.float32 self.batch_size = batch_size self.neg_iou_thr = Tensor(cfg.neg_iou_thr, self.ms_type) @@ -99,11 +99,11 @@ class BboxAssignSample(nn.Cell): self.check_anchor_two = Tensor(np.full((self.num_bboxes, 4), -2, dtype=self.dtype)) def construct(self, gt_bboxes_i, gt_labels_i, valid_mask, bboxes, gt_valids): - gt_bboxes_i = self.select(self.cast(self.tile(self.reshape(self.cast(gt_valids, ms.int32), - (self.num_gts, 1)), (1, 4)), ms.bool_), gt_bboxes_i, + gt_bboxes_i = self.select(self.cast(self.tile(self.reshape(self.cast(gt_valids, mindspore.int32), + (self.num_gts, 1)), (1, 4)), mindspore.bool_), gt_bboxes_i, self.check_gt_one) - bboxes = self.select(self.cast(self.tile(self.reshape(self.cast(valid_mask, ms.int32), - (self.num_bboxes, 1)), (1, 4)), ms.bool_), bboxes, + bboxes = self.select(self.cast(self.tile(self.reshape(self.cast(valid_mask, mindspore.int32), + (self.num_bboxes, 1)), (1, 4)), mindspore.bool_), bboxes, self.check_anchor_two) overlaps = self.iou(bboxes, gt_bboxes_i) max_overlaps_w_gt_index, max_overlaps_w_gt = self.max_gt(overlaps) @@ -115,7 +115,7 @@ class BboxAssignSample(nn.Cell): pos_sample_iou_mask = 
self.greaterequal(max_overlaps_w_gt, self.pos_iou_thr) assigned_gt_inds3 = self.select(pos_sample_iou_mask, - max_overlaps_w_gt_index.astype(ms.int32) + self.assigned_gt_ones, + max_overlaps_w_gt_index.astype(mindspore.int32) + self.assigned_gt_ones, assigned_gt_inds2) assigned_gt_inds4 = assigned_gt_inds3 for j in range(self.num_gts): @@ -134,10 +134,10 @@ class BboxAssignSample(nn.Cell): pos_check_valid = self.cast(self.greater(assigned_gt_inds5, 0), self.ms_type) pos_check_valid = self.sum_inds(pos_check_valid, -1) valid_pos_index = self.less(self.range_pos_size, pos_check_valid) - pos_index = pos_index * self.reshape(self.cast(valid_pos_index, ms.int32), (self.num_expected_pos, 1)) + pos_index = pos_index * self.reshape(self.cast(valid_pos_index, mindspore.int32), (self.num_expected_pos, 1)) pos_assigned_gt_index = self.gatherND(assigned_gt_inds5, pos_index) - self.assigned_pos_ones - pos_assigned_gt_index = pos_assigned_gt_index * self.cast(valid_pos_index, ms.int32) + pos_assigned_gt_index = pos_assigned_gt_index * self.cast(valid_pos_index, mindspore.int32) pos_assigned_gt_index = self.reshape(pos_assigned_gt_index, (self.num_expected_pos, 1)) neg_index, valid_neg_index = self.random_choice_with_mask_neg(self.equal(assigned_gt_inds5, 0)) @@ -153,15 +153,15 @@ class BboxAssignSample(nn.Cell): pos_bbox_targets_ = self.bounding_box_encode(pos_bboxes_, pos_gt_bboxes_) - valid_pos_index = self.cast(valid_pos_index, ms.int32) - valid_neg_index = self.cast(valid_neg_index, ms.int32) + valid_pos_index = self.cast(valid_pos_index, mindspore.int32) + valid_neg_index = self.cast(valid_neg_index, mindspore.int32) total_index = self.concat((pos_index, neg_index)) - pos_index = self.cast(pos_index, ms.int64) + pos_index = self.cast(pos_index, mindspore.int64) bbox_targets_total = self.scatterNd(pos_index, pos_bbox_targets_, (self.num_bboxes, 4)) bbox_weights_total = self.scatterNd(pos_index, valid_pos_index, (self.num_bboxes,)) labels_total = self.scatterNd(pos_index, pos_gt_labels, (self.num_bboxes,)) total_valid_index = self.concat((valid_pos_index, valid_neg_index)) label_weights_total = self.scatterNd(total_index, total_valid_index, (self.num_bboxes,)) - return bbox_targets_total, self.cast(bbox_weights_total, ms.bool_), \ - labels_total, self.cast(label_weights_total, ms.bool_) + return bbox_targets_total, self.cast(bbox_weights_total, mindspore.bool_), \ + labels_total, self.cast(label_weights_total, mindspore.bool_) diff --git a/official/cv/FasterRCNN/src/FasterRcnn/bbox_assign_sample_stage2.py b/official/cv/FasterRCNN/src/FasterRcnn/bbox_assign_sample_stage2.py index 63a3355c4..942527051 100644 --- a/official/cv/FasterRCNN/src/FasterRcnn/bbox_assign_sample_stage2.py +++ b/official/cv/FasterRCNN/src/FasterRcnn/bbox_assign_sample_stage2.py @@ -15,7 +15,7 @@ """FasterRcnn tpositive and negative sample screening for Rcnn.""" import numpy as np -import mindspore as ms +import mindspore import mindspore.nn as nn import mindspore.ops as ops from mindspore.common.tensor import Tensor @@ -46,7 +46,7 @@ class BboxAssignSampleForRcnn(nn.Cell): super(BboxAssignSampleForRcnn, self).__init__() cfg = config self.dtype = np.float32 - self.ms_type = ms.float32 + self.ms_type = mindspore.float32 self.batch_size = batch_size self.neg_iou_thr = cfg.neg_iou_thr_stage2 self.pos_iou_thr = cfg.pos_iou_thr_stage2 @@ -109,11 +109,11 @@ class BboxAssignSampleForRcnn(nn.Cell): self.scalar_min_pos_iou = Tensor(self.min_pos_iou, dtype=self.ms_type) def construct(self, gt_bboxes_i, gt_labels_i, valid_mask, bboxes, 
gt_valids): - gt_bboxes_i = self.select(self.cast(self.tile(self.reshape(self.cast(gt_valids, ms.int32), \ - (self.num_gts, 1)), (1, 4)), ms.bool_), \ + gt_bboxes_i = self.select(self.cast(self.tile(self.reshape(self.cast(gt_valids, mindspore.int32), \ + (self.num_gts, 1)), (1, 4)), mindspore.bool_), \ gt_bboxes_i, self.check_gt_one) - bboxes = self.select(self.cast(self.tile(self.reshape(self.cast(valid_mask, ms.int32), \ - (self.num_bboxes, 1)), (1, 4)), ms.bool_), \ + bboxes = self.select(self.cast(self.tile(self.reshape(self.cast(valid_mask, mindspore.int32), \ + (self.num_bboxes, 1)), (1, 4)), mindspore.bool_), \ bboxes, self.check_anchor_two) overlaps = self.iou(bboxes, gt_bboxes_i) @@ -130,7 +130,7 @@ class BboxAssignSampleForRcnn(nn.Cell): pos_sample_iou_mask = self.greaterequal(max_overlaps_w_gt, self.scalar_pos_iou_thr) assigned_gt_inds3 = self.select(pos_sample_iou_mask, - max_overlaps_w_gt_index.astype(ms.int32) + self.assigned_gt_ones, + max_overlaps_w_gt_index.astype(mindspore.int32) + self.assigned_gt_ones, assigned_gt_inds2) for j in range(self.num_gts): @@ -154,10 +154,10 @@ class BboxAssignSampleForRcnn(nn.Cell): pos_check_valid = self.cast(self.greater(assigned_gt_inds5, 0), self.ms_type) pos_check_valid = self.sum_inds(pos_check_valid, -1) valid_pos_index = self.less(self.range_pos_size, pos_check_valid) - pos_index = pos_index * self.reshape(self.cast(valid_pos_index, ms.int32), (self.num_expected_pos, 1)) + pos_index = pos_index * self.reshape(self.cast(valid_pos_index, mindspore.int32), (self.num_expected_pos, 1)) num_pos = self.sum_inds(self.cast(self.logicalnot(valid_pos_index), self.ms_type), -1) - valid_pos_index = self.cast(valid_pos_index, ms.int32) + valid_pos_index = self.cast(valid_pos_index, mindspore.int32) pos_index = self.reshape(pos_index, self.reshape_shape_pos) valid_pos_index = self.reshape(valid_pos_index, self.reshape_shape_pos) pos_index = pos_index * valid_pos_index @@ -175,7 +175,7 @@ class BboxAssignSampleForRcnn(nn.Cell): valid_neg_index = self.logicaland(self.concat((self.check_neg_mask, unvalid_pos_index)), valid_neg_index) neg_index = self.reshape(neg_index, self.reshape_shape_neg) - valid_neg_index = self.cast(valid_neg_index, ms.int32) + valid_neg_index = self.cast(valid_neg_index, mindspore.int32) valid_neg_index = self.reshape(valid_neg_index, self.reshape_shape_neg) neg_index = neg_index * valid_neg_index diff --git a/official/cv/FasterRCNN/src/FasterRcnn/faster_rcnn.py b/official/cv/FasterRCNN/src/FasterRcnn/faster_rcnn.py index 9e78ec349..551eaedd1 100644 --- a/official/cv/FasterRCNN/src/FasterRcnn/faster_rcnn.py +++ b/official/cv/FasterRCNN/src/FasterRcnn/faster_rcnn.py @@ -15,10 +15,9 @@ """FasterRcnn""" import numpy as np -import mindspore as ms +import mindspore import mindspore.ops as ops import mindspore.nn as nn -from mindspore import context from mindspore.ops import functional as F from mindspore.ops.primitive import constexpr from mindspore.common.tensor import Tensor @@ -63,7 +62,7 @@ class Faster_Rcnn(nn.Cell): def __init__(self, config): super(Faster_Rcnn, self).__init__() self.dtype = np.float32 - self.ms_type = ms.float32 + self.ms_type = mindspore.float32 self.train_batch_size = config.batch_size self.without_bg_loss = config.without_bg_loss self.num_classes = config.num_classes @@ -161,7 +160,7 @@ class Faster_Rcnn(nn.Cell): # Init tensor self.init_tensor(config) - self.device_type = "Ascend" if context.get_context("device_target") == "Ascend" else "Others" + self.device_type = "Ascend" if 
mindspore.get_context("device_target") == "Ascend" else "Others" def roi_init(self, config): """ @@ -281,16 +280,16 @@ class Faster_Rcnn(nn.Cell): labels_tuple = () mask_tuple = () if self.training: - gt_labels = self.cast(gt_labels, ms.int32) - gt_valids = self.cast(gt_valids, ms.int32) + gt_labels = self.cast(gt_labels, mindspore.int32) + gt_valids = self.cast(gt_valids, mindspore.int32) for i in range(self.train_batch_size): gt_bboxes_i = self.squeeze(gt_bboxes[i:i + 1:1, ::]) gt_labels_i = self.squeeze(gt_labels[i:i + 1:1, ::]) - gt_labels_i = self.cast(gt_labels_i, ms.uint8) + gt_labels_i = self.cast(gt_labels_i, mindspore.uint8) gt_valids_i = self.squeeze(gt_valids[i:i + 1:1, ::]) - gt_valids_i = self.cast(gt_valids_i, ms.bool_) + gt_valids_i = self.cast(gt_valids_i, mindspore.bool_) bboxes, deltas, labels, mask = self.bbox_assigner_sampler_for_rcnn(gt_bboxes_i, gt_labels_i, @@ -306,7 +305,7 @@ class Faster_Rcnn(nn.Cell): rcnn_labels = self.concat(labels_tuple) bbox_targets = ops.stop_gradient(bbox_targets) rcnn_labels = ops.stop_gradient(rcnn_labels) - rcnn_labels = self.cast(rcnn_labels, ms.int32) + rcnn_labels = self.cast(rcnn_labels, mindspore.int32) else: mask_tuple += proposal_mask bbox_targets = proposal_mask @@ -326,29 +325,29 @@ class Faster_Rcnn(nn.Cell): else: bboxes_all = bboxes_tuple[0] if self.device_type == "Ascend": - bboxes_all = self.cast(bboxes_all, ms.float16) + bboxes_all = self.cast(bboxes_all, mindspore.float16) rois = self.concat_1((self.roi_align_index_test_tensor, bboxes_all)) - rois = self.cast(rois, ms.float32) + rois = self.cast(rois, mindspore.float32) rois = ops.stop_gradient(rois) if self.training: roi_feats = self.roi_align(rois, - self.cast(x[0], ms.float32), - self.cast(x[1], ms.float32), - self.cast(x[2], ms.float32), - self.cast(x[3], ms.float32)) + self.cast(x[0], mindspore.float32), + self.cast(x[1], mindspore.float32), + self.cast(x[2], mindspore.float32), + self.cast(x[3], mindspore.float32)) else: roi_feats = self.roi_align_test(rois, - self.cast(x[0], ms.float32), - self.cast(x[1], ms.float32), - self.cast(x[2], ms.float32), - self.cast(x[3], ms.float32)) + self.cast(x[0], mindspore.float32), + self.cast(x[1], mindspore.float32), + self.cast(x[2], mindspore.float32), + self.cast(x[3], mindspore.float32)) roi_feats = self.cast(roi_feats, self.ms_type) rcnn_masks = self.concat(mask_tuple) rcnn_masks = ops.stop_gradient(rcnn_masks) - rcnn_mask_squeeze = self.squeeze(self.cast(rcnn_masks, ms.bool_)) + rcnn_mask_squeeze = self.squeeze(self.cast(rcnn_masks, mindspore.bool_)) rcnn_loss, rcnn_cls_loss, rcnn_reg_loss, _ = self.rcnn(roi_feats, bbox_targets, rcnn_labels, @@ -375,7 +374,7 @@ class Faster_Rcnn(nn.Cell): img_metas_all = self.split(img_metas) scores_all = self.split(scores) - mask_all = self.split(self.cast(mask_logits, ms.int32)) + mask_all = self.split(self.cast(mask_logits, mindspore.int32)) boxes_all_with_batchsize = () for i in range(self.test_batch_size): @@ -403,7 +402,7 @@ class Faster_Rcnn(nn.Cell): for i in range(self.test_batch_size): bboxes = boxes_all[i] scores = scores_all[i] - masks = self.cast(mask_all[i], ms.bool_) + masks = self.cast(mask_all[i], mindspore.bool_) res_boxes_tuple = () res_labels_tuple = () @@ -421,7 +420,7 @@ class Faster_Rcnn(nn.Cell): cls_mask = self.greater(_cls_scores, self.test_score_thresh) _mask = self.logicand(_mask_o, cls_mask) - _reg_mask = self.cast(self.tile(self.cast(_mask, ms.int32), (1, 4)), ms.bool_) + _reg_mask = self.cast(self.tile(self.cast(_mask, mindspore.int32), (1, 4)), 
mindspore.bool_) _bboxes = self.select(_reg_mask, _bboxes, self.test_box_zeros) _cls_scores = self.select(_mask, _cls_scores, self.test_score_zeros) @@ -506,7 +505,7 @@ def generator_img_meta(n, ori_h, ori_w, in_h, in_w): resize_scale = width_scale if width_scale < height_scale else height_scale img_metas.append([ori_h, ori_w, resize_scale, resize_scale]) - img_metas = Tensor(np.array(img_metas), ms.float32) + img_metas = Tensor(np.array(img_metas), mindspore.float32) return img_metas diff --git a/official/cv/FasterRCNN/src/FasterRcnn/fpn_neck.py b/official/cv/FasterRCNN/src/FasterRcnn/fpn_neck.py index b22da232f..a1dd2fa80 100644 --- a/official/cv/FasterRCNN/src/FasterRcnn/fpn_neck.py +++ b/official/cv/FasterRCNN/src/FasterRcnn/fpn_neck.py @@ -15,7 +15,7 @@ """FasterRcnn feature pyramid network.""" import numpy as np -import mindspore as ms +import mindspore import mindspore.ops as ops import mindspore.nn as nn from mindspore.common.tensor import Tensor @@ -30,7 +30,7 @@ def bias_init_zeros(shape): def _conv(in_channels, out_channels, kernel_size=3, stride=1, padding=0, pad_mode='pad'): """Conv2D wrapper.""" shape = (out_channels, in_channels, kernel_size, kernel_size) - weights = ms.common.initializer.initializer("XavierUniform", shape=shape, dtype=ms.float32).init_data() + weights = mindspore.common.initializer.initializer("XavierUniform", shape=shape, dtype=mindspore.float32).init_data() shape_bias = (out_channels,) biass = bias_init_zeros(shape_bias) return nn.Conv2d(in_channels, out_channels, diff --git a/official/cv/FasterRCNN/src/FasterRcnn/proposal_generator.py b/official/cv/FasterRCNN/src/FasterRcnn/proposal_generator.py index 16e2b4265..9068334fd 100644 --- a/official/cv/FasterRCNN/src/FasterRcnn/proposal_generator.py +++ b/official/cv/FasterRCNN/src/FasterRcnn/proposal_generator.py @@ -15,7 +15,7 @@ """FasterRcnn proposal generator.""" import numpy as np -import mindspore as ms +import mindspore import mindspore.ops as ops import mindspore.nn as nn from mindspore import Tensor @@ -103,7 +103,7 @@ class Proposal(nn.Cell): self.set_train_local(config, training=True) self.dtype = np.float32 - self.ms_type = ms.float32 + self.ms_type = mindspore.float32 self.multi_10 = Tensor(10.0, self.ms_type) diff --git a/official/cv/FasterRCNN/src/FasterRcnn/rcnn.py b/official/cv/FasterRCNN/src/FasterRcnn/rcnn.py index fa02da335..be6f00d95 100644 --- a/official/cv/FasterRCNN/src/FasterRcnn/rcnn.py +++ b/official/cv/FasterRCNN/src/FasterRcnn/rcnn.py @@ -15,7 +15,7 @@ """FasterRcnn Rcnn network.""" import numpy as np -import mindspore as ms +import mindspore import mindspore.ops as ops import mindspore.nn as nn from mindspore.common.tensor import Tensor @@ -27,20 +27,20 @@ class DenseNoTranpose(nn.Cell): def __init__(self, input_channels, output_channels, weight_init): super(DenseNoTranpose, self).__init__() - self.weight = Parameter(ms.common.initializer.initializer(weight_init, \ - [input_channels, output_channels], ms.float32)) - self.bias = Parameter(ms.common.initializer.initializer("zeros", \ - [output_channels], ms.float32)) + self.weight = Parameter(mindspore.common.initializer.initializer(weight_init, \ + [input_channels, output_channels], mindspore.float32)) + self.bias = Parameter(mindspore.common.initializer.initializer("zeros", \ + [output_channels], mindspore.float32)) self.matmul = ops.MatMul(transpose_b=False) self.bias_add = ops.BiasAdd() self.cast = ops.Cast() - self.device_type = "Ascend" if ms.get_context("device_target") == "Ascend" else "Others" + self.device_type = "Ascend" 
if mindspore.get_context("device_target") == "Ascend" else "Others" def construct(self, x): if self.device_type == "Ascend": - x = self.cast(x, ms.float16) - weight = self.cast(self.weight, ms.float16) + x = self.cast(x, mindspore.float16) + weight = self.cast(self.weight, mindspore.float16) output = self.bias_add(self.matmul(x, weight), self.bias) else: output = self.bias_add(self.matmul(x, self.weight), self.bias) @@ -78,7 +78,7 @@ class Rcnn(nn.Cell): super(Rcnn, self).__init__() cfg = config self.dtype = np.float32 - self.ms_type = ms.float32 + self.ms_type = mindspore.float32 self.rcnn_loss_cls_weight = Tensor(np.array(cfg.rcnn_loss_cls_weight).astype(self.dtype)) self.rcnn_loss_reg_weight = Tensor(np.array(cfg.rcnn_loss_reg_weight).astype(self.dtype)) self.rcnn_fc_out_channels = cfg.rcnn_fc_out_channels @@ -94,17 +94,17 @@ class Rcnn(nn.Cell): self.test_batch_size = cfg.test_batch_size shape_0 = (self.rcnn_fc_out_channels, representation_size) - weights_0 = ms.common.initializer.initializer("XavierUniform", shape=shape_0[::-1], \ + weights_0 = mindspore.common.initializer.initializer("XavierUniform", shape=shape_0[::-1], \ dtype=self.ms_type).init_data() shape_1 = (self.rcnn_fc_out_channels, self.rcnn_fc_out_channels) - weights_1 = ms.common.initializer.initializer("XavierUniform", shape=shape_1[::-1], \ + weights_1 = mindspore.common.initializer.initializer("XavierUniform", shape=shape_1[::-1], \ dtype=self.ms_type).init_data() self.shared_fc_0 = DenseNoTranpose(representation_size, self.rcnn_fc_out_channels, weights_0) self.shared_fc_1 = DenseNoTranpose(self.rcnn_fc_out_channels, self.rcnn_fc_out_channels, weights_1) - cls_weight = ms.common.initializer.initializer('Normal', shape=[num_classes, self.rcnn_fc_out_channels][::-1], + cls_weight = mindspore.common.initializer.initializer('Normal', shape=[num_classes, self.rcnn_fc_out_channels][::-1], dtype=self.ms_type).init_data() - reg_weight = ms.common.initializer.initializer('Normal', shape=[self.num_classes_fronted * 4, + reg_weight = mindspore.common.initializer.initializer('Normal', shape=[self.num_classes_fronted * 4, self.rcnn_fc_out_channels][::-1], dtype=self.ms_type).init_data() self.cls_scores = DenseNoTranpose(self.rcnn_fc_out_channels, num_classes, cls_weight) @@ -126,8 +126,8 @@ class Rcnn(nn.Cell): self.gather = ops.GatherNd() self.argmax = ops.ArgMaxWithValue(axis=1) - self.on_value = Tensor(1.0, ms.float32) - self.off_value = Tensor(0.0, ms.float32) + self.on_value = Tensor(1.0, mindspore.float32) + self.off_value = Tensor(0.0, mindspore.float32) self.value = Tensor(1.0, self.ms_type) self.num_bboxes = (cfg.num_expected_pos_stage2 + cfg.num_expected_neg_stage2) * batch_size @@ -151,7 +151,7 @@ class Rcnn(nn.Cell): x_reg = self.reg_scores(x) if self.training: - bbox_weights = self.cast(self.logicaland(self.greater(labels, 0), mask), ms.int32) * labels + bbox_weights = self.cast(self.logicaland(self.greater(labels, 0), mask), mindspore.int32) * labels labels = self.onehot(labels, self.num_classes, self.on_value, self.off_value) bbox_targets = self.tile(self.expandims(bbox_targets, 1), (1, self.num_classes_fronted, 1)) diff --git a/official/cv/FasterRCNN/src/FasterRcnn/roi_align.py b/official/cv/FasterRCNN/src/FasterRcnn/roi_align.py index e3c4c0490..4fd33e287 100644 --- a/official/cv/FasterRCNN/src/FasterRcnn/roi_align.py +++ b/official/cv/FasterRCNN/src/FasterRcnn/roi_align.py @@ -15,7 +15,7 @@ """FasterRcnn ROIAlign module.""" import numpy as np -import mindspore as ms +import mindspore import mindspore.ops as ops 
import mindspore.nn as nn from mindspore.nn import layer as L @@ -99,7 +99,7 @@ class SingleRoIExtractor(nn.Cell): _mode_16 = False self.dtype = np.float16 if _mode_16 else np.float32 - self.ms_dtype = ms.float16 if _mode_16 else ms.float32 + self.ms_dtype = mindspore.float16 if _mode_16 else mindspore.float32 self.set_train_local(cfg, training=True) def set_train_local(self, config, training=True): @@ -158,7 +158,7 @@ class SingleRoIExtractor(nn.Cell): target_lvls = self.log2(scale / self.finest_scale + self.epslion) target_lvls = ops.Floor()(target_lvls) - target_lvls = self.cast(target_lvls, ms.int32) + target_lvls = self.cast(target_lvls, mindspore.int32) target_lvls = self.clamp(target_lvls, self.zeros, self.max_levels) return target_lvls @@ -168,11 +168,11 @@ class SingleRoIExtractor(nn.Cell): res = self.res_ target_lvls = self._c_map_roi_levels(rois) for i in range(self.num_levels): - mask = self.equal(target_lvls, ops.ScalarToTensor()(i, ms.int32)) + mask = self.equal(target_lvls, ops.ScalarToTensor()(i, mindspore.int32)) mask = ops.Reshape()(mask, (-1, 1, 1, 1)) roi_feats_t = self.roi_layers[i](feats[i], rois) - mask = self.cast(ops.Tile()(self.cast(mask, ms.int32),\ - (1, 256, self.out_size, self.out_size)), ms.bool_) + mask = self.cast(ops.Tile()(self.cast(mask, mindspore.int32),\ + (1, 256, self.out_size, self.out_size)), mindspore.bool_) res = self.select(mask, roi_feats_t, res) return res diff --git a/official/cv/FasterRCNN/src/FasterRcnn/rpn.py b/official/cv/FasterRCNN/src/FasterRcnn/rpn.py index eea59e0c3..908ca6512 100644 --- a/official/cv/FasterRCNN/src/FasterRcnn/rpn.py +++ b/official/cv/FasterRCNN/src/FasterRcnn/rpn.py @@ -14,7 +14,7 @@ # ============================================================================ """RPN for fasterRCNN""" import numpy as np -import mindspore as ms +import mindspore import mindspore.ops as ops import mindspore.nn as nn from mindspore import Tensor @@ -99,8 +99,8 @@ class RPN(nn.Cell): super(RPN, self).__init__() cfg_rpn = config self.dtype = np.float32 - self.ms_type = ms.float32 - self.device_type = "Ascend" if ms.get_context("device_target") == "Ascend" else "Others" + self.ms_type = mindspore.float32 + self.device_type = "Ascend" if mindspore.get_context("device_target") == "Ascend" else "Others" self.num_bboxes = cfg_rpn.num_bboxes self.slice_index = () self.feature_anchor_shape = () @@ -115,7 +115,7 @@ class RPN(nn.Cell): self.batch_size = batch_size self.test_batch_size = cfg_rpn.test_batch_size self.num_layers = 5 - self.real_ratio = ms.Tensor(np.ones((1, 1), self.dtype)) + self.real_ratio = mindspore.Tensor(np.ones((1, 1), self.dtype)) self.rpn_convs_list = nn.layer.CellList(self._make_rpn_layer(self.num_layers, in_channels, feat_channels, num_anchors, cls_out_channels)) @@ -124,7 +124,7 @@ class RPN(nn.Cell): self.reshape = ops.Reshape() self.concat = ops.Concat(axis=0) self.fill = ops.Fill() - self.placeh1 = ms.Tensor(np.ones((1,), self.dtype)) + self.placeh1 = mindspore.Tensor(np.ones((1,), self.dtype)) self.trans_shape = (0, 2, 3, 1) @@ -143,9 +143,9 @@ class RPN(nn.Cell): self.cast = ops.Cast() self.tile = ops.Tile() self.zeros_like = ops.ZerosLike() - self.loss = ms.Tensor(np.zeros((1,), self.dtype)) - self.clsloss = ms.Tensor(np.zeros((1,), self.dtype)) - self.regloss = ms.Tensor(np.zeros((1,), self.dtype)) + self.loss = mindspore.Tensor(np.zeros((1,), self.dtype)) + self.clsloss = mindspore.Tensor(np.zeros((1,), self.dtype)) + self.regloss = mindspore.Tensor(np.zeros((1,), self.dtype)) def _make_rpn_layer(self, 
num_layers, in_channels, feat_channels, num_anchors, cls_out_channels): """ @@ -165,25 +165,25 @@ class RPN(nn.Cell): shp_weight_conv = (feat_channels, in_channels, 3, 3) shp_bias_conv = (feat_channels,) - weight_conv = ms.common.initializer.initializer('Normal', shape=shp_weight_conv, dtype=self.ms_type).init_data() - bias_conv = ms.common.initializer.initializer(0, shape=shp_bias_conv, dtype=self.ms_type).init_data() + weight_conv = mindspore.common.initializer.initializer('Normal', shape=shp_weight_conv, dtype=self.ms_type).init_data() + bias_conv = mindspore.common.initializer.initializer(0, shape=shp_bias_conv, dtype=self.ms_type).init_data() shp_weight_cls = (num_anchors * cls_out_channels, feat_channels, 1, 1) shp_bias_cls = (num_anchors * cls_out_channels,) - weight_cls = ms.common.initializer.initializer('Normal', shape=shp_weight_cls, dtype=self.ms_type).init_data() - bias_cls = ms.common.initializer.initializer(0, shape=shp_bias_cls, dtype=self.ms_type).init_data() + weight_cls = mindspore.common.initializer.initializer('Normal', shape=shp_weight_cls, dtype=self.ms_type).init_data() + bias_cls = mindspore.common.initializer.initializer(0, shape=shp_bias_cls, dtype=self.ms_type).init_data() shp_weight_reg = (num_anchors * 4, feat_channels, 1, 1) shp_bias_reg = (num_anchors * 4,) - weight_reg = ms.common.initializer.initializer('Normal', shape=shp_weight_reg, dtype=self.ms_type).init_data() - bias_reg = ms.common.initializer.initializer(0, shape=shp_bias_reg, dtype=self.ms_type).init_data() + weight_reg = mindspore.common.initializer.initializer('Normal', shape=shp_weight_reg, dtype=self.ms_type).init_data() + bias_reg = mindspore.common.initializer.initializer(0, shape=shp_bias_reg, dtype=self.ms_type).init_data() for i in range(num_layers): rpn_reg_cls_block = RpnRegClsBlock(in_channels, feat_channels, num_anchors, cls_out_channels, \ weight_conv, bias_conv, weight_cls, \ bias_cls, weight_reg, bias_reg) if self.device_type == "Ascend": - rpn_reg_cls_block.to_float(ms.float16) + rpn_reg_cls_block.to_float(mindspore.float16) rpn_layer.append(rpn_reg_cls_block) for i in range(1, num_layers): @@ -235,7 +235,7 @@ class RPN(nn.Cell): for j in range(self.num_layers): res = self.cast(self.CheckValid(anchor_list[j], self.squeeze(img_metas[i:i + 1:1, ::])), - ms.int32) + mindspore.int32) multi_level_flags = multi_level_flags + (res,) anchor_list_tuple = anchor_list_tuple + (anchor_list[j],) @@ -249,7 +249,7 @@ class RPN(nn.Cell): bbox_target, bbox_weight, label, label_weight = self.get_targets(gt_bboxes_i, gt_labels_i, self.cast(valid_flag_list, - ms.bool_), + mindspore.bool_), anchor_using_list, gt_valids_i) bbox_target = self.cast(bbox_target, self.ms_type) diff --git a/official/cv/FasterRCNN/src/convert_checkpoint.py b/official/cv/FasterRCNN/src/convert_checkpoint.py index cadff4526..abf0fb8ca 100644 --- a/official/cv/FasterRCNN/src/convert_checkpoint.py +++ b/official/cv/FasterRCNN/src/convert_checkpoint.py @@ -15,7 +15,7 @@ """ convert pretrain model to faster_rcnn backbone pretrain model """ -import mindspore as ms +import mindspore from mindspore.common.parameter import Parameter from mindspore.common.tensor import Tensor from model_utils.config import config @@ -32,7 +32,7 @@ def load_weights(model_path, use_fp16_weight): Returns: parameter list(list): pretrain model weight list. 
""" - ms_ckpt = ms.load_checkpoint(model_path) + ms_ckpt = mindspore.load_checkpoint(model_path) weights = {} for msname in ms_ckpt: if msname.startswith("layer") or msname.startswith("conv1") or msname.startswith("bn"): @@ -45,9 +45,9 @@ def load_weights(model_path, use_fp16_weight): param_name = param_name.replace("down_sample_layer.1", "bn_down_sample") weights[param_name] = ms_ckpt[msname].data.asnumpy() if use_fp16_weight: - dtype = ms.float16 + dtype = mindspore.float16 else: - dtype = ms.float32 + dtype = mindspore.float32 parameter_dict = {} for name in weights: parameter_dict[name] = Parameter(Tensor(weights[name], dtype), name=name) @@ -58,4 +58,4 @@ def load_weights(model_path, use_fp16_weight): if __name__ == "__main__": parameter_list = load_weights(config.ckpt_file, use_fp16_weight=False) - ms.save_checkpoint(parameter_list, "backbone.ckpt") + mindspore.save_checkpoint(parameter_list, "backbone.ckpt") diff --git a/official/cv/FasterRCNN/src/dataset.py b/official/cv/FasterRCNN/src/dataset.py index 21a058d4d..e3acdaf2b 100644 --- a/official/cv/FasterRCNN/src/dataset.py +++ b/official/cv/FasterRCNN/src/dataset.py @@ -21,7 +21,7 @@ import numpy as np from numpy import random import cv2 -import mindspore as ms +import mindspore import mindspore.dataset as de from mindspore.mindrecord import FileWriter @@ -553,7 +553,7 @@ def create_fasterrcnn_dataset(config, mindrecord_file, batch_size=2, device_num= de.config.set_prefetch_size(8) ds = de.MindDataset(mindrecord_file, columns_list=["image", "annotation"], num_shards=device_num, shard_id=rank_id, num_parallel_workers=4, shuffle=is_training) - decode = ms.dataset.vision.Decode() + decode = mindspore.dataset.vision.Decode() ds = ds.map(input_columns=["image"], operations=decode) compose_map_func = (lambda image, annotation: preprocess_fn(image, annotation, is_training, config=config)) diff --git a/official/cv/FasterRCNN/src/eval_callback.py b/official/cv/FasterRCNN/src/eval_callback.py index 00ea880bc..fbf0b1717 100644 --- a/official/cv/FasterRCNN/src/eval_callback.py +++ b/official/cv/FasterRCNN/src/eval_callback.py @@ -15,7 +15,7 @@ import os import shutil -import mindspore as ms +import mindspore from mindspore.train.callback import Callback @@ -71,7 +71,7 @@ class EvalCallBack(Callback): shutil.rmtree(self.best_ckpt_path) os.mkdir(self.best_ckpt_path) - ms.save_checkpoint(cb_params.train_network, os.path.join(self.best_ckpt_path, "best.ckpt")) + mindspore.save_checkpoint(cb_params.train_network, os.path.join(self.best_ckpt_path, "best.ckpt")) print("update best result: {} in the {} th epoch".format(self.best_res, self.best_epoch), flush=True) diff --git a/official/cv/FasterRCNN/src/eval_utils.py b/official/cv/FasterRCNN/src/eval_utils.py index 8c0158098..0705134be 100644 --- a/official/cv/FasterRCNN/src/eval_utils.py +++ b/official/cv/FasterRCNN/src/eval_utils.py @@ -21,7 +21,7 @@ import numpy as np from pycocotools.coco import COCO from pycocotools.cocoeval import COCOeval -import mindspore as ms +import mindspore from mindspore.common import Parameter from src.dataset import data_to_mindrecord_byte_image, create_fasterrcnn_dataset, parse_json_annos_from_txt from src.util import bbox2result_1image, results2json @@ -55,17 +55,17 @@ def apply_eval(net, config, dataset_path, ckpt_path, anno_path): raise RuntimeError("CheckPoint file {} is not valid.".format(ckpt_path)) ds = create_fasterrcnn_dataset(config, dataset_path, batch_size=config.test_batch_size, is_training=False) - param_dict = ms.load_checkpoint(ckpt_path) + param_dict = 
mindspore.load_checkpoint(ckpt_path) if config.device_target == "GPU": for key, value in param_dict.items(): tensor = value.asnumpy().astype(np.float32) param_dict[key] = Parameter(tensor, key) - ms.load_param_into_net(net, param_dict) + mindspore.load_param_into_net(net, param_dict) net.set_train(False) - device_type = "Ascend" if ms.get_context("device_target") == "Ascend" else "Others" + device_type = "Ascend" if mindspore.get_context("device_target") == "Ascend" else "Others" if device_type == "Ascend": - net.to_float(ms.float16) + net.to_float(mindspore.float16) eval_iter = 0 total = ds.get_dataset_size() diff --git a/official/cv/FasterRCNN/src/model_utils/moxing_adapter.py b/official/cv/FasterRCNN/src/model_utils/moxing_adapter.py index 830d19a6f..9c6d88e5d 100644 --- a/official/cv/FasterRCNN/src/model_utils/moxing_adapter.py +++ b/official/cv/FasterRCNN/src/model_utils/moxing_adapter.py @@ -17,7 +17,7 @@ import os import functools -from mindspore import context +import mindspore from mindspore.profiler import Profiler from .config import config @@ -93,7 +93,7 @@ def moxing_wrapper(pre_process=None, post_process=None): sync_data(config.train_url, config.output_path) print("Workspace downloaded: ", os.listdir(config.output_path)) - context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) + mindspore.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) config.device_num = get_device_num() config.device_id = get_device_id() if not os.path.exists(config.output_path): diff --git a/official/cv/FasterRCNN/src/network_define.py b/official/cv/FasterRCNN/src/network_define.py index 9638c6b70..c2fcf55df 100644 --- a/official/cv/FasterRCNN/src/network_define.py +++ b/official/cv/FasterRCNN/src/network_define.py @@ -15,7 +15,7 @@ """FasterRcnn training network wrapper.""" import time -import mindspore as ms +import mindspore import mindspore.ops as ops import mindspore.nn as nn from mindspore.train.callback import Callback @@ -136,7 +136,7 @@ class TrainOneStepCell(nn.TrainOneStepWithLossScaleCell): def __init__(self, network, optimizer, scale_sense=1, grad_clip=False): if isinstance(scale_sense, (int, float)): - scale_sense = ms.Tensor(scale_sense, ms.float32) + scale_sense = mindspore.Tensor(scale_sense, mindspore.float32) super(TrainOneStepCell, self).__init__(network, optimizer, scale_sense) self.grad_clip = grad_clip diff --git a/official/cv/FasterRCNN/src/quick_start.py b/official/cv/FasterRCNN/src/quick_start.py index 9574fbaf9..ca3ce2bd7 100644 --- a/official/cv/FasterRCNN/src/quick_start.py +++ b/official/cv/FasterRCNN/src/quick_start.py @@ -19,7 +19,7 @@ import cv2 import numpy as np from tqdm import tqdm -import mindspore as ms +import mindspore from mindspore.common.tensor import Tensor import mindspore.ops.operations as P @@ -27,7 +27,7 @@ from src.model_utils.config import config from src.maskrcnn.mask_rcnn_r50 import Mask_Rcnn_Resnet50 from src.model_utils.device_adapter import get_device_id random.seed(1) -ms.set_context(mode=ms.GRAPH_MODE, device_target=config.device_target, device_id=get_device_id()) +mindspore.set_context(mode=0, device_target=config.device_target, device_id=get_device_id()) def rescale_with_tuple(img, scale): h, w = img.shape[:2] @@ -118,8 +118,8 @@ def save_result(img, boxes, labels, img_metas_, save_name): def det(): net = Mask_Rcnn_Resnet50(config) - param_dict = ms.load_checkpoint(config.ckpt_path) - ms.load_param_into_net(net, param_dict) + param_dict = 
mindspore.load_checkpoint(config.ckpt_path) + mindspore.load_param_into_net(net, param_dict) net.set_train(False) image_list = os.listdir(config.image_folder) max_num = config.num_gts diff --git a/official/cv/FasterRCNN/train.py b/official/cv/FasterRCNN/train.py index 4511033cb..f5cd9abd4 100644 --- a/official/cv/FasterRCNN/train.py +++ b/official/cv/FasterRCNN/train.py @@ -19,14 +19,14 @@ import os import time from pprint import pprint import numpy as np -import mindspore as ms +import mindspore import mindspore.ops as ops import mindspore.nn as nn from mindspore import Tensor, Parameter, ParameterTuple from mindspore.communication.management import init, get_rank, get_group_size from mindspore.train.callback import CheckpointConfig, ModelCheckpoint, TimeMonitor from mindspore.train import Model -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.nn import SGD, Adam from mindspore.common import set_seed from mindspore.train.callback import SummaryCollector @@ -64,7 +64,7 @@ class TrainOneStepCellCPU(nn.Cell): self.optimizer = optimizer self.grad = ops.GradOperation(get_by_list=True, sens_param=True) - self.sens = Tensor([sens,], ms.float32) + self.sens = Tensor([sens,], mindspore.float32) self.reduce_flag = reduce_flag if reduce_flag: self.grad_reducer = DistributedGradReducer(optimizer.parameters, mean, degree) @@ -146,14 +146,14 @@ def load_ckpt_to_network(net): "rcnn.reg_scores.bias", "accum.rcnn.cls_scores.weight", "accum.rcnn.cls_scores.bias", "accum.rcnn.reg_scores.weight", "accum.rcnn.reg_scores.bias" ] - param_dict = ms.load_checkpoint(load_path, choice_func=lambda x: not x.startswith(tuple(param_not_load))) + param_dict = mindspore.load_checkpoint(load_path, choice_func=lambda x: not x.startswith(tuple(param_not_load))) for key, val in param_dict.items(): # Correct previous misspellings key = key.replace("ncek", "neck") new_param[key] = val else: print(f"\n[{rank}]", "===> Loading from checkpoint:", load_path) - param_dict = ms.load_checkpoint(load_path) + param_dict = mindspore.load_checkpoint(load_path) key_mapping = {'down_sample_layer.1.beta': 'bn_down_sample.beta', 'down_sample_layer.1.gamma': 'bn_down_sample.gamma', 'down_sample_layer.0.weight': 'conv_down_sample.weight', @@ -182,7 +182,7 @@ def load_ckpt_to_network(net): new_param = param_dict try: - ms.load_param_into_net(net, new_param) + mindspore.load_param_into_net(net, new_param) except RuntimeError as ex: ex = str(ex) print("Traceback:\n", ex, flush=True) @@ -196,10 +196,10 @@ def load_ckpt_to_network(net): def set_ascend_max_device_memory(): """Set the maximum memory on 910B used by MindSpore""" - if ms.get_context("enable_ge") and ms.get_context("mode") == ms.GRAPH_MODE and \ + if mindspore.get_context("enable_ge") and mindspore.get_context("mode") == 0 and \ hasattr(config, "max_device_memory"): print("[WARNING] When encountering a memory shortage situation in 910B, reduce the max_device_memory.") - ms.set_context(max_device_memory=config.max_device_memory) + mindspore.set_context(max_device_memory=config.max_device_memory) @moxing_wrapper(pre_process=modelarts_pre_process) @@ -213,10 +213,10 @@ def train_fasterrcnn(): net = net.set_train() net = load_ckpt_to_network(net) - device_type = "Ascend" if ms.get_context("device_target") == "Ascend" else "Others" + device_type = "Ascend" if mindspore.get_context("device_target") == "Ascend" else "Others" print(f"\n[{rank}]", "===> Device type:", device_type, "\n") if device_type == "Ascend": - net.to_float(ms.float16) + 
net.to_float(mindspore.float16) # single card, original base_lr is for 8 cards if not config.run_distribute: @@ -227,9 +227,9 @@ if config.lr_type.lower() not in ("dynamic", "multistep"): raise ValueError("Optimize type should be 'dynamic' or 'dynamic'") if config.lr_type.lower() == "dynamic": - lr = Tensor(dynamic_lr(config, dataset_size), ms.float32) + lr = Tensor(dynamic_lr(config, dataset_size), mindspore.float32) else: - lr = Tensor(multistep_lr(config, dataset_size), ms.float32) + lr = Tensor(multistep_lr(config, dataset_size), mindspore.float32) if config.opt_type.lower() not in ("sgd", "adam"): raise ValueError("Optimize type should be 'SGD' or 'Adam'") @@ -287,19 +287,19 @@ if __name__ == '__main__': set_seed(1) - ms.set_context(mode=ms.GRAPH_MODE, device_target=config.device_target, device_id=get_device_id(), + mindspore.set_context(mode=0, device_target=config.device_target, device_id=get_device_id(), ascend_config={"ge_options": {"global": {"ge.exec.memoryOptimizationPolicy": ""}}}) set_ascend_max_device_memory() local_path = '/'.join(os.path.realpath(__file__).split('/')[:-1]) summary_dir = local_path + "/train/summary/" if config.device_target == "GPU": - ms.set_context(enable_graph_kernel=True) + mindspore.set_context(enable_graph_kernel=True) if config.run_distribute: init() rank = get_rank() device_num = get_group_size() - ms.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, + mindspore.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True) summary_dir += "thread_num_" + str(rank) + "/" else: diff --git a/official/cv/Inception/inceptionv3/README.md b/official/cv/Inception/inceptionv3/README.md index 75542628c..e710f3a29 100644 --- a/official/cv/Inception/inceptionv3/README.md +++ b/official/cv/Inception/inceptionv3/README.md @@ -332,7 +332,7 @@ bash scripts/run_standalone_train_cpu.sh DATA_PATH ### Result -Training result will be stored in the example path. Checkpoints will be stored at `./ckpt` by default, and training log will be redirected to `./log.txt` like followings. +Training result will be stored in the example path. Checkpoints will be stored at `./ckpt` by default, and the training log will be redirected to `./log.txt` like the following. #### Ascend @@ -391,7 +391,7 @@ You can start training using python or shell scripts. The usage of shell scripts ### Result -Evaluation result will be stored in the example path, you can find result like the followings in `eval.log`. +Evaluation results will be stored in the example path; you can find results like the following in `eval.log`.
```python metric: {'Loss': 1.778, 'Top1-Acc':0.788, 'Top5-Acc':0.942} diff --git a/official/cv/Inception/inceptionv3/eval.py b/official/cv/Inception/inceptionv3/eval.py index 35e0a436b..508a0cbf3 100644 --- a/official/cv/Inception/inceptionv3/eval.py +++ b/official/cv/Inception/inceptionv3/eval.py @@ -24,8 +24,8 @@ from src.dataset import create_dataset_imagenet, create_dataset_cifar10 from src.inception_v3 import InceptionV3 from src.loss import CrossEntropy_Val +import mindspore import mindspore.nn as nn -from mindspore import context from mindspore.train.model import Model from mindspore.train.serialization import load_checkpoint, load_param_into_net @@ -98,10 +98,10 @@ def eval_inceptionv3(): if config.platform == 'Ascend': device_id = int(os.getenv('DEVICE_ID')) - context.set_context(device_id=device_id) + mindspore.set_context(device_id=device_id) create_dataset = DS_DICT[config.ds_type] - context.set_context(mode=context.GRAPH_MODE, device_target=config.platform) + mindspore.set_context(mode=0, device_target=config.platform) net = InceptionV3(num_classes=config.num_classes, is_training=False) ckpt = load_checkpoint(config.checkpoint) load_param_into_net(net, ckpt) diff --git a/official/cv/Inception/inceptionv3/export.py b/official/cv/Inception/inceptionv3/export.py index 31f09ec24..622d884d9 100644 --- a/official/cv/Inception/inceptionv3/export.py +++ b/official/cv/Inception/inceptionv3/export.py @@ -20,15 +20,15 @@ from src.model_utils.moxing_adapter import moxing_wrapper from src.model_utils.device_adapter import get_device_id from src.inception_v3 import InceptionV3 -import mindspore as ms -from mindspore import Tensor, load_checkpoint, load_param_into_net, export, context +import mindspore +from mindspore import Tensor, load_checkpoint, load_param_into_net, export config.batch_size = 1 -context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target) +mindspore.set_context(mode=0, device_target=config.device_target) if config.device_target == "Ascend": - context.set_context(device_id=get_device_id()) + mindspore.set_context(device_id=get_device_id()) def modelarts_process(): pass @@ -41,7 +41,7 @@ def export_inceptionv3(): load_param_into_net(net, param_dict) input_arr = Tensor(np.random.uniform(0.0, 1.0, size=[config.batch_size, 3, config.width, \ - config.height]), ms.float32) + config.height]), mindspore.float32) export(net, input_arr, file_name=config.file_name, file_format=config.file_format) if __name__ == '__main__': diff --git a/official/cv/Inception/inceptionv3/modelarts/train_start.py b/official/cv/Inception/inceptionv3/modelarts/train_start.py index b4db0ea47..453519ecf 100644 --- a/official/cv/Inception/inceptionv3/modelarts/train_start.py +++ b/official/cv/Inception/inceptionv3/modelarts/train_start.py @@ -18,10 +18,9 @@ import argparse import glob import moxing as mox import numpy as np -import mindspore as ms +import mindspore from mindspore import Model from mindspore import Tensor -from mindspore import context from mindspore.common import set_seed from mindspore.common.initializer import XavierUniform, initializer from mindspore.communication import init, get_rank, get_group_size @@ -75,7 +74,7 @@ def frozen_to_air(network, args): param_dict_t = load_checkpoint(args.get("ckpt_file")) load_param_into_net(network, param_dict_t) input_arr = Tensor(np.random.uniform(0.0, 1.0, size=[args.get("batch_size"), 3, args.get("width"), \ - args.get("height")]), ms.float32) + args.get("height")]), mindspore.float32) export(network, input_arr, 
file_name=args.get("file_name"), file_format=args.get("file_format")) @@ -95,11 +94,11 @@ if __name__ == '__main__': config.dataset_path = os.path.join(config.dataset_path, "train") if config.platform == "GPU": - context.set_context(enable_graph_kernel=True) + mindspore.set_context(enable_graph_kernel=True) - context.set_context(mode=context.GRAPH_MODE, device_target=config.platform, save_graphs=False) + mindspore.set_context(mode=0, device_target=config.platform, save_graphs=False) if os.getenv('DEVICE_ID', "not_set").isdigit(): - context.set_context(device_id=int(os.getenv('DEVICE_ID'))) + mindspore.set_context(device_id=int(os.getenv('DEVICE_ID'))) # init distributed if config.is_distributed: @@ -107,7 +106,7 @@ if __name__ == '__main__': config.rank = get_rank() config.group_size = get_group_size() parallel_mode = ParallelMode.DATA_PARALLEL - context.set_auto_parallel_context(parallel_mode=parallel_mode, device_num=config.group_size, + mindspore.set_auto_parallel_context(parallel_mode=parallel_mode, device_num=config.group_size, gradients_mean=True) else: config.rank = 0 diff --git a/official/cv/Inception/inceptionv3/src/model_utils/moxing_adapter.py b/official/cv/Inception/inceptionv3/src/model_utils/moxing_adapter.py index 830d19a6f..9c6d88e5d 100644 --- a/official/cv/Inception/inceptionv3/src/model_utils/moxing_adapter.py +++ b/official/cv/Inception/inceptionv3/src/model_utils/moxing_adapter.py @@ -17,7 +17,7 @@ import os import functools -from mindspore import context +import mindspore from mindspore.profiler import Profiler from .config import config @@ -93,7 +93,7 @@ def moxing_wrapper(pre_process=None, post_process=None): sync_data(config.train_url, config.output_path) print("Workspace downloaded: ", os.listdir(config.output_path)) - context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) + mindspore.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) config.device_num = get_device_num() config.device_id = get_device_id() if not os.path.exists(config.output_path): diff --git a/official/cv/Inception/inceptionv3/train.py b/official/cv/Inception/inceptionv3/train.py index 9b2fe4e96..746723251 100644 --- a/official/cv/Inception/inceptionv3/train.py +++ b/official/cv/Inception/inceptionv3/train.py @@ -24,11 +24,10 @@ from src.inception_v3 import InceptionV3 from src.lr_generator import get_lr from src.loss import CrossEntropy -import mindspore as ms +import mindspore import mindspore.log as logger from mindspore import Tensor -from mindspore import context -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.communication.management import init, get_rank, get_group_size from mindspore.nn.optim.rmsprop import RMSProp from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor @@ -103,10 +102,10 @@ def modelarts_pre_process(): def set_ascend_max_device_memory(): - if ms.get_context("enable_ge") and ms.get_context("mode") == ms.GRAPH_MODE and \ + if mindspore.get_context("enable_ge") and mindspore.get_context("mode") == 0 and \ hasattr(config, "max_device_memory"): logger.warning("When encountering a memory shortage situation in 1980B, reduce the max_device_memory.") - ms.set_context(max_device_memory=config.max_device_memory) + mindspore.set_context(max_device_memory=config.max_device_memory) @moxing_wrapper(pre_process=modelarts_pre_process) @@ -114,11 +113,11 @@ def train_inceptionv3(): create_dataset = DS_DICT[config.ds_type] if 
config.platform == "GPU": - context.set_context(enable_graph_kernel=True) + mindspore.set_context(enable_graph_kernel=True) - context.set_context(mode=context.GRAPH_MODE, device_target=config.platform, save_graphs=False) + mindspore.set_context(mode=0, device_target=config.platform, save_graphs=False) if os.getenv('DEVICE_ID', "not_set").isdigit(): - context.set_context(device_id=int(os.getenv('DEVICE_ID'))) + mindspore.set_context(device_id=int(os.getenv('DEVICE_ID'))) set_ascend_max_device_memory() # init distributed if config.is_distributed: @@ -126,7 +125,7 @@ def train_inceptionv3(): config.rank = get_rank() config.group_size = get_group_size() parallel_mode = ParallelMode.DATA_PARALLEL - context.set_auto_parallel_context(parallel_mode=parallel_mode, device_num=config.group_size, + mindspore.set_auto_parallel_context(parallel_mode=parallel_mode, device_num=config.group_size, gradients_mean=True) else: config.rank = 0 diff --git a/official/cv/Inception/inceptionv4/eval.py b/official/cv/Inception/inceptionv4/eval.py index af369ffe2..538744899 100644 --- a/official/cv/Inception/inceptionv4/eval.py +++ b/official/cv/Inception/inceptionv4/eval.py @@ -22,8 +22,8 @@ from src.model_utils.device_adapter import get_device_id, get_device_num from src.dataset import create_dataset_imagenet, create_dataset_cifar10 from src.inceptionv4 import Inceptionv4 +import mindspore import mindspore.nn as nn -from mindspore import context from mindspore.nn.loss import SoftmaxCrossEntropyWithLogits from mindspore.train.model import Model from mindspore.train.serialization import load_checkpoint, load_param_into_net @@ -96,11 +96,11 @@ def inception_v4_eval(): if config.platform == 'Ascend': device_id = int(os.getenv('DEVICE_ID', '0')) - context.set_context(device_id=device_id) + mindspore.set_context(device_id=device_id) create_dataset = DS_DICT[config.ds_type] - context.set_context(mode=context.GRAPH_MODE, device_target=config.platform) + mindspore.set_context(mode=0, device_target=config.platform) net = Inceptionv4(classes=config.num_classes) ckpt = load_checkpoint(config.checkpoint_path) load_param_into_net(net, ckpt) diff --git a/official/cv/Inception/inceptionv4/export.py b/official/cv/Inception/inceptionv4/export.py index 5557e035e..825ea99c3 100644 --- a/official/cv/Inception/inceptionv4/export.py +++ b/official/cv/Inception/inceptionv4/export.py @@ -20,16 +20,16 @@ from src.model_utils.device_adapter import get_device_id from src.model_utils.moxing_adapter import moxing_wrapper from src.inceptionv4 import Inceptionv4 -import mindspore as ms +import mindspore from mindspore import Tensor -from mindspore.train.serialization import load_checkpoint, load_param_into_net, export, context +from mindspore.train.serialization import load_checkpoint, load_param_into_net, export config.batch_size = 1 -context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target) +mindspore.set_context(mode=0, device_target=config.device_target) if config.device_target == "Ascend": - context.set_context(device_id=get_device_id()) + mindspore.set_context(device_id=get_device_id()) def modelarts_process(): pass @@ -41,7 +41,7 @@ def export_inceptionv4(): param_dict = load_checkpoint(config.ckpt_file) load_param_into_net(net, param_dict) - input_arr = Tensor(np.ones([config.batch_size, 3, config.width, config.height]), ms.float32) + input_arr = Tensor(np.ones([config.batch_size, 3, config.width, config.height]), mindspore.float32) export(net, input_arr, file_name=config.file_name, file_format=config.file_format) if 
__name__ == '__main__': diff --git a/official/cv/Inception/inceptionv4/modelarts/train_start.py b/official/cv/Inception/inceptionv4/modelarts/train_start.py index 7cdd84458..dbb6c311f 100644 --- a/official/cv/Inception/inceptionv4/modelarts/train_start.py +++ b/official/cv/Inception/inceptionv4/modelarts/train_start.py @@ -20,9 +20,9 @@ import glob import moxing as mox import numpy as np +import mindspore from mindspore import Model from mindspore import Tensor -from mindspore import context from mindspore.common import set_seed from mindspore.common.initializer import XavierUniform, initializer from mindspore.communication import init, get_rank, get_group_size @@ -162,16 +162,16 @@ if __name__ == '__main__': print('epoch_size: {} batch_size: {} class_num {}'.format(config.epoch_size, config.batch_size, config.num_classes)) - context.set_context(mode=context.GRAPH_MODE, device_target=config.platform) + mindspore.set_context(mode=0, device_target=config.platform) if config.platform == "Ascend": - context.set_context(device_id=get_device_id()) - context.set_context(enable_graph_kernel=False) + mindspore.set_context(device_id=get_device_id()) + mindspore.set_context(enable_graph_kernel=False) if device_num > 1: init() config.rank = get_rank() config.group_size = get_group_size() - context.set_auto_parallel_context(device_num=device_num, + mindspore.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, all_reduce_fusion_config=[200, 400]) diff --git a/official/cv/Inception/inceptionv4/src/model_utils/moxing_adapter.py b/official/cv/Inception/inceptionv4/src/model_utils/moxing_adapter.py index 830d19a6f..9c6d88e5d 100644 --- a/official/cv/Inception/inceptionv4/src/model_utils/moxing_adapter.py +++ b/official/cv/Inception/inceptionv4/src/model_utils/moxing_adapter.py @@ -17,7 +17,7 @@ import os import functools -from mindspore import context +import mindspore from mindspore.profiler import Profiler from .config import config @@ -93,7 +93,7 @@ def moxing_wrapper(pre_process=None, post_process=None): sync_data(config.train_url, config.output_path) print("Workspace downloaded: ", os.listdir(config.output_path)) - context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) + mindspore.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) config.device_num = get_device_num() config.device_id = get_device_id() if not os.path.exists(config.output_path): diff --git a/official/cv/Inception/inceptionv4/train.py b/official/cv/Inception/inceptionv4/train.py index acc1138ee..442052e38 100644 --- a/official/cv/Inception/inceptionv4/train.py +++ b/official/cv/Inception/inceptionv4/train.py @@ -24,9 +24,9 @@ from src.model_utils.device_adapter import get_device_id, get_device_num from src.dataset import create_dataset_imagenet, create_dataset_cifar10 from src.inceptionv4 import Inceptionv4 +import mindspore from mindspore import Model from mindspore import Tensor -from mindspore import context from mindspore.common import set_seed from mindspore.common.initializer import XavierUniform, initializer from mindspore.communication import init, get_rank, get_group_size @@ -35,7 +35,7 @@ from mindspore.nn.loss import SoftmaxCrossEntropyWithLogits from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor, LossMonitor from mindspore.train.loss_scale_manager import FixedLossScaleManager from mindspore.train.serialization import load_checkpoint, load_param_into_net -from 
mindspore.context import ParallelMode +from mindspore import ParallelMode os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'python' @@ -151,16 +151,16 @@ def inception_v4_train(): """ print('epoch_size: {} batch_size: {} class_num {}'.format(config.epoch_size, config.batch_size, config.num_classes)) - context.set_context(mode=context.GRAPH_MODE, device_target=config.platform) + mindspore.set_context(mode=0, device_target=config.platform) if config.platform == "Ascend": - context.set_context(device_id=get_device_id()) - context.set_context(enable_graph_kernel=False) + mindspore.set_context(device_id=get_device_id()) + mindspore.set_context(enable_graph_kernel=False) if device_num > 1: init() config.rank = get_rank() config.group_size = get_group_size() - context.set_auto_parallel_context(device_num=device_num, + mindspore.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, all_reduce_fusion_config=[200, 400]) diff --git a/official/cv/Inception/xception/eval.py b/official/cv/Inception/xception/eval.py index 7eebabc11..15224453b 100644 --- a/official/cv/Inception/xception/eval.py +++ b/official/cv/Inception/xception/eval.py @@ -15,7 +15,8 @@ """eval Xception.""" import time import os -from mindspore import context, nn +import mindspore +from mindspore import nn from mindspore.train.model import Model from mindspore.common import set_seed from mindspore.train.serialization import load_checkpoint, load_param_into_net @@ -95,8 +96,8 @@ def run_eval(): else: raise ValueError("Unsupported device_target.") - context.set_context(device_id=args_opt.device_id) - context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.device_target, save_graphs=False) + mindspore.set_context(device_id=args_opt.device_id) + mindspore.set_context(mode=0, device_target=args_opt.device_target, save_graphs=False) # create dataset dataset = create_dataset(args_opt.test_data_dir, do_train=False, batch_size=config.batch_size, device_num=1, rank=0) diff --git a/official/cv/Inception/xception/export.py b/official/cv/Inception/xception/export.py index 7df6f0403..67efbc223 100644 --- a/official/cv/Inception/xception/export.py +++ b/official/cv/Inception/xception/export.py @@ -16,7 +16,8 @@ import os import numpy as np -from mindspore import Tensor, context, load_checkpoint, load_param_into_net, export +import mindspore +from mindspore import Tensor, load_checkpoint, load_param_into_net, export from src.Xception import xception from src.model_utils.config import config as args, config_gpu, config_ascend @@ -39,8 +40,8 @@ def run_export(): else: raise ValueError("Unsupported device_target.") - context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target) - context.set_context(device_id=args.device_id) + mindspore.set_context(mode=0, device_target=args.device_target) + mindspore.set_context(device_id=args.device_id) net = xception(class_num=config.class_num) # load checkpoint diff --git a/official/cv/Inception/xception/src/model_utils/moxing_adapter.py b/official/cv/Inception/xception/src/model_utils/moxing_adapter.py index 09cb0f0cf..a6d8a3fce 100644 --- a/official/cv/Inception/xception/src/model_utils/moxing_adapter.py +++ b/official/cv/Inception/xception/src/model_utils/moxing_adapter.py @@ -17,7 +17,7 @@ import os import functools -from mindspore import context +import mindspore from mindspore.profiler import Profiler from src.model_utils.config import config @@ -94,7 +94,7 @@ def moxing_wrapper(pre_process=None, post_process=None): 
sync_data(config.train_url, config.output_path) print("Workspace downloaded: ", os.listdir(config.output_path)) - context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) + mindspore.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) config.device_num = get_device_num() config.device_id = get_device_id() if not os.path.exists(config.output_path): diff --git a/official/cv/Inception/xception/train.py b/official/cv/Inception/xception/train.py index b80783c44..7758ff006 100644 --- a/official/cv/Inception/xception/train.py +++ b/official/cv/Inception/xception/train.py @@ -15,11 +15,10 @@ """train Xception.""" import os import time - -from mindspore import context +import mindspore from mindspore import Tensor from mindspore.nn.optim.momentum import Momentum -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.train.model import Model from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor, LossMonitor from mindspore.train.serialization import load_checkpoint, load_param_into_net @@ -106,18 +105,18 @@ def run_train(): # init distributed if args_opt.is_distributed: - context.set_context(device_id=get_device_id(), mode=context.GRAPH_MODE, device_target=args_opt.device_target, + mindspore.set_context(device_id=get_device_id(), mode=0, device_target=args_opt.device_target, save_graphs=False) init() rank = get_rank_id() group_size = get_device_num() parallel_mode = ParallelMode.DATA_PARALLEL - context.set_auto_parallel_context(parallel_mode=parallel_mode, device_num=group_size, gradients_mean=True) + mindspore.set_auto_parallel_context(parallel_mode=parallel_mode, device_num=group_size, gradients_mean=True) else: rank = 0 group_size = 1 device_id = get_device_id() - context.set_context(device_id=device_id, mode=context.GRAPH_MODE, device_target=args_opt.device_target, + mindspore.set_context(device_id=device_id, mode=0, device_target=args_opt.device_target, save_graphs=False) # define network net = xception(class_num=config.class_num) diff --git a/official/cv/MaskRCNN/maskrcnn_mobilenetv1/eval.py b/official/cv/MaskRCNN/maskrcnn_mobilenetv1/eval.py index 11d608a58..3ca708dfa 100644 --- a/official/cv/MaskRCNN/maskrcnn_mobilenetv1/eval.py +++ b/official/cv/MaskRCNN/maskrcnn_mobilenetv1/eval.py @@ -18,7 +18,8 @@ import os import time import numpy as np from pycocotools.coco import COCO -from mindspore import context, Tensor +import mindspore +from mindspore import Tensor from mindspore.train.serialization import load_checkpoint, load_param_into_net from mindspore.common import set_seed @@ -164,12 +165,12 @@ def modelarts_process(): @moxing_wrapper(pre_process=modelarts_process) def eval_(): device_target = config.device_target - context.set_context(mode=context.GRAPH_MODE, device_target=device_target) + mindspore.set_context(mode=0, device_target=device_target) if config.device_target == "Ascend": - context.set_context(device_id=config.device_id) + mindspore.set_context(device_id=config.device_id) else: - context.set_context(device_id=get_device_id()) + mindspore.set_context(device_id=get_device_id()) config.mindrecord_dir = os.path.join(config.coco_root, config.mindrecord_dir) print('\nconfig:\n', config) diff --git a/official/cv/MaskRCNN/maskrcnn_mobilenetv1/export.py b/official/cv/MaskRCNN/maskrcnn_mobilenetv1/export.py index 81f868ed7..3e0459ddd 100644 --- a/official/cv/MaskRCNN/maskrcnn_mobilenetv1/export.py +++ b/official/cv/MaskRCNN/maskrcnn_mobilenetv1/export.py 
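Every hunk in this patch applies the same mechanical rewrite: the `mindspore.context` submodule is dropped in favour of the top-level `mindspore` API, and the execution mode is spelled as an integer. A minimal sketch of the before/after pattern, assuming MindSpore 2.0 (where `GRAPH_MODE` is `0` and `PYNATIVE_MODE` is `1`) and a CPU target so it runs without accelerator hardware:

```python
import mindspore

# Old style, removed by this patch:
#   from mindspore import context
#   context.set_context(mode=context.GRAPH_MODE, device_target="CPU")

# New style: top-level set_context/get_context with integer mode values.
mindspore.set_context(mode=0, device_target="CPU")  # 0 == GRAPH_MODE
assert mindspore.get_context("mode") == 0
assert mindspore.get_context("device_target") == "CPU"
```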
@@ -15,7 +15,8 @@ """export checkpoint file into air, mindir models""" import re import numpy as np -from mindspore import Tensor, context, load_checkpoint, export, load_param_into_net +import mindspore +from mindspore import Tensor, load_checkpoint, export, load_param_into_net from src.model_utils.config import config from src.model_utils.device_adapter import get_device_id from src.model_utils.moxing_adapter import moxing_wrapper @@ -33,9 +34,9 @@ def config_(cfg): config = config_(config) -context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target) +mindspore.set_context(mode=0, device_target=config.device_target) if config.device_target == "Ascend": - context.set_context(device_id=get_device_id()) + mindspore.set_context(device_id=get_device_id()) def modelarts_process(): pass diff --git a/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/dataset.py b/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/dataset.py index f817f7555..96e742f98 100644 --- a/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/dataset.py +++ b/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/dataset.py @@ -22,10 +22,10 @@ import cv2 import numpy as np from numpy import random +import mindspore import mindspore.dataset as de import mindspore.dataset.vision as C from mindspore.mindrecord import FileWriter -from mindspore import context from src.model_utils.config import config @@ -315,7 +315,7 @@ def flip_column(img, img_shape, gt_bboxes, gt_label, gt_num, gt_mask): def transpose_column(img, img_shape, gt_bboxes, gt_label, gt_num, gt_mask): """transpose operation for image""" - if context.get_context("device_target") == "CPU" or context.get_context("device_target") == "GPU": + if mindspore.get_context("device_target") == "CPU" or mindspore.get_context("device_target") == "GPU": platform_dtype = np.float32 else: platform_dtype = np.float16 diff --git a/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/bbox_assign_sample.py b/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/bbox_assign_sample.py index 7cc702ab8..a0c1446e9 100644 --- a/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/bbox_assign_sample.py +++ b/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/bbox_assign_sample.py @@ -15,11 +15,11 @@ """MaskRcnn positive and negative sample screening for RPN.""" import numpy as np +import mindspore import mindspore.nn as nn from mindspore.ops import operations as P from mindspore.common.tensor import Tensor import mindspore.common.dtype as mstype -from mindspore import context class BboxAssignSample(nn.Cell): @@ -48,7 +48,7 @@ class BboxAssignSample(nn.Cell): cfg = config self.batch_size = batch_size - if context.get_context("device_target") == "CPU" or context.get_context("device_target") == "GPU": + if mindspore.get_context("device_target") == "CPU" or mindspore.get_context("device_target") == "GPU": self.cast_type = mstype.float32 else: self.cast_type = mstype.float16 @@ -98,7 +98,7 @@ class BboxAssignSample(nn.Cell): self.check_neg_mask = Tensor(np.array(np.ones(self.num_expected_neg - self.num_expected_pos), dtype=np.bool_)) - if context.get_context("device_target") == "CPU" or context.get_context("device_target") == "GPU": + if mindspore.get_context("device_target") == "CPU" or mindspore.get_context("device_target") == "GPU": self.check_gt_one = Tensor(np.array(-1 * np.ones((self.num_gts, 4)), dtype=np.float32)) self.check_anchor_two = Tensor(np.array(-2 * np.ones((self.num_bboxes, 4)), dtype=np.float32)) self.range_pos_size = 
Tensor(np.arange(self.num_expected_pos).astype(np.float32)) diff --git a/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/bbox_assign_sample_stage2.py b/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/bbox_assign_sample_stage2.py index de567eca9..96c79a9c9 100644 --- a/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/bbox_assign_sample_stage2.py +++ b/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/bbox_assign_sample_stage2.py @@ -15,12 +15,12 @@ """MaskRcnn tpositive and negative sample screening for Rcnn.""" import numpy as np +import mindspore import mindspore.nn as nn import mindspore.common.dtype as mstype from mindspore.ops import operations as P from mindspore.ops import function as F from mindspore.common.tensor import Tensor -from mindspore import context class BboxAssignSampleForRcnn(nn.Cell): @@ -81,7 +81,7 @@ class BboxAssignSampleForRcnn(nn.Cell): self.tile = P.Tile() # Check - if context.get_context("device_target") == "CPU" or context.get_context("device_target") == "GPU": + if mindspore.get_context("device_target") == "CPU" or mindspore.get_context("device_target") == "GPU": self.cast_type = mstype.float32 self.np_cast_type = np.float32 self.int_cast_type = np.int32 diff --git a/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/fpn_neck.py b/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/fpn_neck.py index 2ad832bbd..5e512961f 100644 --- a/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/fpn_neck.py +++ b/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/fpn_neck.py @@ -15,12 +15,12 @@ """MaskRcnn feature pyramid network.""" import numpy as np +import mindspore import mindspore.nn as nn from mindspore.ops import operations as P from mindspore.common.tensor import Tensor from mindspore.common import dtype as mstype from mindspore.common.initializer import initializer -from mindspore import context def bias_init_zeros(shape): @@ -67,7 +67,7 @@ class FeatPyramidNeck(nn.Cell): out_channels, num_outs): super(FeatPyramidNeck, self).__init__() - if context.get_context("device_target") == "CPU" or context.get_context("device_target") == "GPU": + if mindspore.get_context("device_target") == "CPU" or mindspore.get_context("device_target") == "GPU": self.platform_mstype = mstype.float32 else: self.platform_mstype = mstype.float16 diff --git a/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/mask_rcnn_mobilenetv1.py b/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/mask_rcnn_mobilenetv1.py index 1ad4867ca..f1f9ed355 100644 --- a/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/mask_rcnn_mobilenetv1.py +++ b/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/mask_rcnn_mobilenetv1.py @@ -15,12 +15,12 @@ """MaskRcnn based on mobilenetv1.""" import numpy as np +import mindspore import mindspore.nn as nn from mindspore.ops import operations as P from mindspore.common.tensor import Tensor import mindspore.common.dtype as mstype from mindspore.ops import functional as F -from mindspore import context from .mobilenetv1 import MobileNetV1_FeatureSelector from .bbox_assign_sample_stage2 import BboxAssignSampleForRcnn from .fpn_neck import FeatPyramidNeck @@ -484,7 +484,7 @@ class Mask_Rcnn_Mobilenetv1(nn.Cell): return mask_fb_pred_all def init_datatype(self): - self.platform = context.get_context("device_target") + self.platform = mindspore.get_context("device_target") if 
self.platform == "CPU" or self.platform == "GPU": self.platform_dtype = np.float32 self.platform_mstype = mstype.float32 diff --git a/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/proposal_generator.py b/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/proposal_generator.py index 21f77b1cc..718557eb6 100644 --- a/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/proposal_generator.py +++ b/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/proposal_generator.py @@ -15,11 +15,11 @@ """MaskRcnn proposal generator.""" import numpy as np +import mindspore import mindspore.nn as nn import mindspore.common.dtype as mstype from mindspore.ops import operations as P from mindspore import Tensor -from mindspore import context class Proposal(nn.Cell): """ @@ -50,7 +50,7 @@ class Proposal(nn.Cell): ): super(Proposal, self).__init__() - if context.get_context("device_target") == "CPU" or context.get_context("device_target") == "GPU": + if mindspore.get_context("device_target") == "CPU" or mindspore.get_context("device_target") == "GPU": self.platform_dtype = np.float32 self.platform_mstype = mstype.float32 else: @@ -112,7 +112,7 @@ class Proposal(nn.Cell): self.multi_10 = Tensor(10.0, self.platform_mstype) - self.platform = context.get_context("device_target") + self.platform = mindspore.get_context("device_target") def set_train_local(self, config, training=True): """Set training flag.""" diff --git a/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/rcnn_cls.py b/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/rcnn_cls.py index 2f9fbc214..f6885ece5 100644 --- a/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/rcnn_cls.py +++ b/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/rcnn_cls.py @@ -15,13 +15,13 @@ """MaskRcnn Rcnn classification and box regression network.""" import numpy as np +import mindspore import mindspore.common.dtype as mstype import mindspore.nn as nn from mindspore.ops import operations as P from mindspore.common.tensor import Tensor from mindspore.common.initializer import initializer from mindspore.common.parameter import Parameter -from mindspore import context class DenseNoTranpose(nn.Cell): """Dense method""" @@ -41,7 +41,7 @@ class FpnCls(nn.Cell): """dense layer of classification and box head""" def __init__(self, input_channels, output_channels, num_classes, pool_size): super(FpnCls, self).__init__() - if context.get_context("device_target") == "CPU" or context.get_context("device_target") == "GPU": + if mindspore.get_context("device_target") == "CPU" or mindspore.get_context("device_target") == "GPU": self.platform_mstype = mstype.float32 else: self.platform_mstype = mstype.float16 @@ -105,7 +105,7 @@ class RcnnCls(nn.Cell): ): super(RcnnCls, self).__init__() cfg = config - if context.get_context("device_target") == "CPU" or context.get_context("device_target") == "GPU": + if mindspore.get_context("device_target") == "CPU" or mindspore.get_context("device_target") == "GPU": self.platform_mstype = mstype.float32 self.platform_dtype = np.float32 else: diff --git a/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/rcnn_mask.py b/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/rcnn_mask.py index 71fd749b6..e07b8a020 100644 --- a/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/rcnn_mask.py +++ b/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/rcnn_mask.py @@ 
-15,12 +15,12 @@ """MaskRcnn Rcnn for mask network.""" import numpy as np +import mindspore import mindspore.common.dtype as mstype import mindspore.nn as nn from mindspore.ops import operations as P from mindspore.common.tensor import Tensor from mindspore.common.initializer import initializer -from mindspore import context def _conv(in_channels, out_channels, kernel_size=1, stride=1, padding=0, pad_mode='pad'): """Conv2D wrapper.""" @@ -46,7 +46,7 @@ class FpnMask(nn.Cell): """conv layers of mask head""" def __init__(self, input_channels, output_channels, num_classes): super(FpnMask, self).__init__() - self.platform = context.get_context("device_target") + self.platform = mindspore.get_context("device_target") if self.platform == "CPU" or self.platform == "GPU": self.platform_mstype = mstype.float32 else: @@ -120,7 +120,7 @@ class RcnnMask(nn.Cell): ): super(RcnnMask, self).__init__() cfg = config - self.platform = context.get_context("device_target") + self.platform = mindspore.get_context("device_target") if self.platform == "CPU" or self.platform == "GPU": self.platform_dtype = np.float32 self.platform_mstype = mstype.float32 diff --git a/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/rpn.py b/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/rpn.py index 8df441ba0..eb2697697 100644 --- a/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/rpn.py +++ b/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/rpn.py @@ -14,13 +14,13 @@ # ============================================================================ """RPN for MaskRCNN""" import numpy as np +import mindspore import mindspore.nn as nn import mindspore.common.dtype as mstype from mindspore.ops import operations as P from mindspore import Tensor from mindspore.ops import functional as F from mindspore.common.initializer import initializer -from mindspore import context from .bbox_assign_sample import BboxAssignSample @@ -101,7 +101,7 @@ class RPN(nn.Cell): cls_out_channels): super(RPN, self).__init__() cfg_rpn = config - if context.get_context("device_target") == "CPU" or context.get_context("device_target") == "GPU": + if mindspore.get_context("device_target") == "CPU" or mindspore.get_context("device_target") == "GPU": self.platform_dtype = np.float32 self.platform_mstype = mstype.float32 else: diff --git a/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/model_utils/moxing_adapter.py b/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/model_utils/moxing_adapter.py index 830d19a6f..9c6d88e5d 100644 --- a/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/model_utils/moxing_adapter.py +++ b/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/model_utils/moxing_adapter.py @@ -17,7 +17,7 @@ import os import functools -from mindspore import context +import mindspore from mindspore.profiler import Profiler from .config import config @@ -93,7 +93,7 @@ def moxing_wrapper(pre_process=None, post_process=None): sync_data(config.train_url, config.output_path) print("Workspace downloaded: ", os.listdir(config.output_path)) - context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) + mindspore.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) config.device_num = get_device_num() config.device_id = get_device_id() if not os.path.exists(config.output_path): diff --git a/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/network_define.py b/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/network_define.py index 077d42460..c12852e1c 
100644 --- a/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/network_define.py +++ b/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/network_define.py @@ -16,6 +16,7 @@ import time import numpy as np +import mindspore import mindspore.nn as nn from mindspore.common.tensor import Tensor from mindspore.ops import functional as F @@ -23,7 +24,6 @@ from mindspore.ops import composite as C from mindspore import ParameterTuple from mindspore.train.callback import Callback from mindspore.nn.wrap.grad_reducer import DistributedGradReducer -from mindspore import context from src.maskrcnn_mobilenetv1.mask_rcnn_mobilenetv1 import Mask_Rcnn_Mobilenetv1 time_stamp_init = False @@ -167,7 +167,7 @@ class TrainOneStepCell(nn.Cell): self.optimizer = optimizer self.grad = C.GradOperation(get_by_list=True, sens_param=True) - if context.get_context("device_target") == "CPU" or context.get_context("device_target") == "GPU": + if mindspore.get_context("device_target") == "CPU" or mindspore.get_context("device_target") == "GPU": self.sens = Tensor((np.ones((1,)) * sens).astype(np.float32)) else: self.sens = Tensor((np.ones((1,)) * sens).astype(np.float16)) diff --git a/official/cv/MaskRCNN/maskrcnn_mobilenetv1/train.py b/official/cv/MaskRCNN/maskrcnn_mobilenetv1/train.py index 80300601f..a4eeddf2f 100644 --- a/official/cv/MaskRCNN/maskrcnn_mobilenetv1/train.py +++ b/official/cv/MaskRCNN/maskrcnn_mobilenetv1/train.py @@ -18,12 +18,13 @@ import os import time +import mindspore import mindspore.common.dtype as mstype -from mindspore import context, Tensor +from mindspore import Tensor from mindspore.communication.management import init, get_rank from mindspore.train.callback import CheckpointConfig, ModelCheckpoint, TimeMonitor from mindspore.train import Model -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.train.serialization import load_checkpoint, load_param_into_net from mindspore.nn import Momentum from mindspore.common import set_seed @@ -116,13 +117,13 @@ def create_mindrecord_files(rank, mindrecord_file, mindrecord_dir, prefix): while not os.path.exists(mindrecord_file+".db"): time.sleep(5) -context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target) +mindspore.set_context(mode=0, device_target=config.device_target) if config.device_target == "Ascend": - context.set_context(device_id=config.device_id) + mindspore.set_context(device_id=config.device_id) # Set mempool block size for improving memory utilization, which will not take effect in GRAPH_MODE -if context.get_context("mode") == context.PYNATIVE_MODE: - context.set_context(mempool_block_size="28GB") +if mindspore.get_context("mode") == 1: + mindspore.set_context(mempool_block_size="28GB") @moxing_wrapper(pre_process=modelarts_pre_process) def train_maskrcnn_mobilenetv1(): @@ -132,13 +133,13 @@ def train_maskrcnn_mobilenetv1(): device_num = get_device_num() if config.device_target == "Ascend": rank = get_rank_id() - context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, + mindspore.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True) init() elif config.device_target == "GPU": init() rank = get_rank() - context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, + mindspore.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True) else: rank = 0 diff --git 
a/official/cv/MaskRCNN/maskrcnn_resnet50/README.md b/official/cv/MaskRCNN/maskrcnn_resnet50/README.md index 0d53f6878..0de91cf8f 100644 --- a/official/cv/MaskRCNN/maskrcnn_resnet50/README.md +++ b/official/cv/MaskRCNN/maskrcnn_resnet50/README.md @@ -67,7 +67,7 @@ Note that you can run the scripts based on the dataset mentioned in original pap - Framework - [MindSpore](https://gitee.com/mindspore/mindspore) - Docker base image - - [Ascend Hub](https://ascend.huawei.com/ascendhub/#/home) + - [Ascend Hub](https://www.hiascend.com/developer/ascendhub) - For more information, please check the resources below: - [MindSpore Tutorials](https://www.mindspore.cn/tutorials/en/master/index.html) - [MindSpore Python API](https://www.mindspore.cn/docs/en/master/api_python/mindspore.html) diff --git a/official/cv/MaskRCNN/maskrcnn_resnet50/README_CN.md b/official/cv/MaskRCNN/maskrcnn_resnet50/README_CN.md index e033c138e..9dd2156c8 100644 --- a/official/cv/MaskRCNN/maskrcnn_resnet50/README_CN.md +++ b/official/cv/MaskRCNN/maskrcnn_resnet50/README_CN.md @@ -65,7 +65,7 @@ MaskRCNN是一个两级目标检测网络,作为FasterRCNN的扩展模型, - 框架 - [MindSpore](https://gitee.com/mindspore/mindspore) - 获取基础镜像 - - [Ascend Hub](https://ascend.huawei.com/ascendhub/#/home) + - [Ascend Hub](https://www.hiascend.com/developer/ascendhub) - 如需查看详情,请参见如下资源: - [MindSpore教程](https://www.mindspore.cn/tutorials/zh-CN/master/index.html) - [MindSpore Python API](https://www.mindspore.cn/docs/zh-CN/master/api_python/mindspore.html) diff --git a/official/cv/MaskRCNN/maskrcnn_resnet50/eval.py b/official/cv/MaskRCNN/maskrcnn_resnet50/eval.py index 64625732f..532b351ca 100644 --- a/official/cv/MaskRCNN/maskrcnn_resnet50/eval.py +++ b/official/cv/MaskRCNN/maskrcnn_resnet50/eval.py @@ -26,7 +26,8 @@ from src.dataset import data_to_mindrecord_byte_image, create_maskrcnn_dataset from src.util import coco_eval, bbox2result_1image, results2json, get_seg_masks from pycocotools.coco import COCO -from mindspore import context, Tensor +import mindspore +from mindspore import Tensor from mindspore.train.serialization import load_checkpoint, load_param_into_net from mindspore.common import set_seed @@ -161,7 +162,7 @@ def modelarts_process(): @moxing_wrapper(pre_process=modelarts_process) def eval_(): device_target = config.device_target - context.set_context(mode=context.GRAPH_MODE, device_target=device_target, device_id=get_device_id()) + mindspore.set_context(mode=0, device_target=device_target, device_id=get_device_id()) config.mindrecord_dir = os.path.join(config.coco_root, config.mindrecord_dir) print('\neval.py config:\n', config) diff --git a/official/cv/MaskRCNN/maskrcnn_resnet50/export.py b/official/cv/MaskRCNN/maskrcnn_resnet50/export.py index 7bdaa768e..5fefa87bf 100644 --- a/official/cv/MaskRCNN/maskrcnn_resnet50/export.py +++ b/official/cv/MaskRCNN/maskrcnn_resnet50/export.py @@ -19,15 +19,16 @@ from src.model_utils.config import config from src.model_utils.device_adapter import get_device_id from src.model_utils.moxing_adapter import moxing_wrapper from src.maskrcnn.mask_rcnn_r50 import MaskRcnn_Infer -from mindspore import Tensor, context, load_checkpoint, load_param_into_net, export +import mindspore +from mindspore import Tensor, load_checkpoint, load_param_into_net, export if not config.enable_modelarts: config.ckpt_file = config.ckpt_file_local -context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target) +mindspore.set_context(mode=0, device_target=config.device_target) if config.device_target == "Ascend": - 
context.set_context(device_id=get_device_id()) + mindspore.set_context(device_id=get_device_id()) def modelarts_process(): pass diff --git a/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/bbox_assign_sample.py b/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/bbox_assign_sample.py index 01542f0f9..a49fb28f0 100644 --- a/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/bbox_assign_sample.py +++ b/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/bbox_assign_sample.py @@ -15,10 +15,10 @@ """MaskRcnn positive and negative sample screening for RPN.""" import numpy as np +import mindspore import mindspore.nn as nn from mindspore.ops import operations as P from mindspore.common.tensor import Tensor -from mindspore import context import mindspore.common.dtype as mstype @@ -47,7 +47,7 @@ class BboxAssignSample(nn.Cell): super(BboxAssignSample, self).__init__() cfg = config - if context.get_context("device_target") == "Ascend": + if mindspore.get_context("device_target") == "Ascend": self.cast_type = mstype.float16 self.np_cast_type = np.float16 else: diff --git a/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/bbox_assign_sample_stage2.py b/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/bbox_assign_sample_stage2.py index d6d57e00a..599585d23 100644 --- a/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/bbox_assign_sample_stage2.py +++ b/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/bbox_assign_sample_stage2.py @@ -15,12 +15,12 @@ """MaskRcnn tpositive and negative sample screening for Rcnn.""" import numpy as np +import mindspore import mindspore.nn as nn import mindspore.common.dtype as mstype from mindspore.ops import operations as P from mindspore.ops import function as F from mindspore.common.tensor import Tensor -from mindspore import context class BboxAssignSampleForRcnn(nn.Cell): @@ -44,7 +44,7 @@ class BboxAssignSampleForRcnn(nn.Cell): super(BboxAssignSampleForRcnn, self).__init__() cfg = config - if context.get_context("device_target") == "Ascend": + if mindspore.get_context("device_target") == "Ascend": self.cast_type = mstype.float16 self.np_cast_type = np.float16 else: diff --git a/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/fpn_neck.py b/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/fpn_neck.py index 68590ec9d..94b0d7ffa 100644 --- a/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/fpn_neck.py +++ b/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/fpn_neck.py @@ -15,12 +15,12 @@ """MaskRcnn feature pyramid network.""" import numpy as np +import mindspore import mindspore.nn as nn from mindspore.ops import operations as P from mindspore.common.tensor import Tensor from mindspore.common import dtype as mstype from mindspore.common.initializer import initializer -from mindspore import context def bias_init_zeros(shape): @@ -69,7 +69,7 @@ class FeatPyramidNeck(nn.Cell): feature_shapes): super(FeatPyramidNeck, self).__init__() - if context.get_context("device_target") == "Ascend": + if mindspore.get_context("device_target") == "Ascend": self.cast_type = mstype.float16 else: self.cast_type = mstype.float32 diff --git a/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/mask_rcnn_r50.py b/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/mask_rcnn_r50.py index 6c72d8c08..d36b68dd3 100644 --- a/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/mask_rcnn_r50.py +++ b/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/mask_rcnn_r50.py @@ -15,12 +15,12 @@ """MaskRcnn based on ResNet50.""" import numpy as np +import mindspore import 
mindspore.nn as nn from mindspore.ops import operations as P from mindspore.common.tensor import Tensor import mindspore.common.dtype as mstype from mindspore.ops import functional as F -from mindspore import context from .resnet50 import ResNetFea, ResidualBlockUsing from .bbox_assign_sample_stage2 import BboxAssignSampleForRcnn from .fpn_neck import FeatPyramidNeck @@ -54,7 +54,7 @@ class Mask_Rcnn_Resnet50(nn.Cell): def __init__(self, config): super(Mask_Rcnn_Resnet50, self).__init__() - if context.get_context("device_target") == "Ascend": + if mindspore.get_context("device_target") == "Ascend": self.cast_type = mstype.float16 self.np_cast_type = np.float16 else: diff --git a/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/proposal_generator.py b/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/proposal_generator.py index f556d0e17..f3f698de1 100644 --- a/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/proposal_generator.py +++ b/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/proposal_generator.py @@ -15,11 +15,11 @@ """MaskRcnn proposal generator.""" import numpy as np +import mindspore import mindspore.nn as nn import mindspore.common.dtype as mstype from mindspore.ops import operations as P from mindspore import Tensor -from mindspore import context class Proposal(nn.Cell): @@ -52,7 +52,7 @@ class Proposal(nn.Cell): super(Proposal, self).__init__() cfg = config - if context.get_context("device_target") == "Ascend": + if mindspore.get_context("device_target") == "Ascend": self.cast_type = mstype.float16 self.np_cast_type = np.float16 else: diff --git a/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/rcnn_cls.py b/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/rcnn_cls.py index 28a46e70e..ad8ce0de6 100644 --- a/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/rcnn_cls.py +++ b/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/rcnn_cls.py @@ -15,13 +15,13 @@ """MaskRcnn Rcnn classification and box regression network.""" import numpy as np +import mindspore import mindspore.common.dtype as mstype import mindspore.nn as nn from mindspore.ops import operations as P from mindspore.common.tensor import Tensor from mindspore.common.initializer import initializer from mindspore.common.parameter import Parameter -from mindspore import context class DenseNoTranpose(nn.Cell): @@ -45,7 +45,7 @@ class FpnCls(nn.Cell): def __init__(self, input_channels, output_channels, num_classes, pool_size): super(FpnCls, self).__init__() - if context.get_context("device_target") == "Ascend": + if mindspore.get_context("device_target") == "Ascend": self.cast_type = mstype.float16 else: self.cast_type = mstype.float32 @@ -112,7 +112,7 @@ class RcnnCls(nn.Cell): super(RcnnCls, self).__init__() cfg = config - if context.get_context("device_target") == "Ascend": + if mindspore.get_context("device_target") == "Ascend": self.cast_type = mstype.float16 self.np_cast_type = np.float16 else: diff --git a/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/rcnn_mask.py b/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/rcnn_mask.py index c34df040f..e54bd16fa 100644 --- a/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/rcnn_mask.py +++ b/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/rcnn_mask.py @@ -15,11 +15,11 @@ """MaskRcnn Rcnn for mask network.""" import numpy as np +import mindspore import mindspore.common.dtype as mstype import mindspore.nn as nn from mindspore.ops import operations as P from mindspore.common.tensor import Tensor -from mindspore import context def 
_conv(in_channels, out_channels, kernel_size=1, stride=1, padding=0, pad_mode='pad', gain=1): @@ -59,7 +59,7 @@ class FpnMask(nn.Cell): def __init__(self, input_channels, output_channels, num_classes): super(FpnMask, self).__init__() - if context.get_context("device_target") == "Ascend": + if mindspore.get_context("device_target") == "Ascend": self.cast_type = mstype.float16 else: self.cast_type = mstype.float32 @@ -136,7 +136,7 @@ class RcnnMask(nn.Cell): super(RcnnMask, self).__init__() cfg = config - if context.get_context("device_target") == "Ascend": + if mindspore.get_context("device_target") == "Ascend": self.cast_type = mstype.float16 self.np_cast_type = np.float16 else: diff --git a/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/resnet50.py b/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/resnet50.py index cb943c174..358ba0a9a 100644 --- a/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/resnet50.py +++ b/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/resnet50.py @@ -15,14 +15,14 @@ """Resnet50 backbone.""" import numpy as np +import mindspore import mindspore.nn as nn from mindspore.ops import operations as P from mindspore.common.tensor import Tensor from mindspore.ops import functional as F import mindspore.common.dtype as mstype -from mindspore import context -if context.get_context("device_target") == "Ascend": +if mindspore.get_context("device_target") == "Ascend": ms_cast_type = mstype.float16 else: ms_cast_type = mstype.float32 diff --git a/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/rpn.py b/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/rpn.py index edc062581..cbb8dbfb0 100644 --- a/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/rpn.py +++ b/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/rpn.py @@ -14,10 +14,11 @@ # ============================================================================ """RPN for MaskRCNN""" import numpy as np +import mindspore import mindspore.nn as nn import mindspore.common.dtype as mstype from mindspore.ops import operations as P -from mindspore import Tensor, context +from mindspore import Tensor from mindspore.ops import functional as F from mindspore.common.initializer import initializer from .bbox_assign_sample import BboxAssignSample @@ -100,7 +101,7 @@ class RPN(nn.Cell): super(RPN, self).__init__() cfg_rpn = config - if context.get_context("device_target") == "Ascend": + if mindspore.get_context("device_target") == "Ascend": self.cast_type = mstype.float16 self.np_cast_type = np.float16 else: diff --git a/official/cv/MaskRCNN/maskrcnn_resnet50/src/model_utils/moxing_adapter.py b/official/cv/MaskRCNN/maskrcnn_resnet50/src/model_utils/moxing_adapter.py index 830d19a6f..9c6d88e5d 100644 --- a/official/cv/MaskRCNN/maskrcnn_resnet50/src/model_utils/moxing_adapter.py +++ b/official/cv/MaskRCNN/maskrcnn_resnet50/src/model_utils/moxing_adapter.py @@ -17,7 +17,7 @@ import os import functools -from mindspore import context +import mindspore from mindspore.profiler import Profiler from .config import config @@ -93,7 +93,7 @@ def moxing_wrapper(pre_process=None, post_process=None): sync_data(config.train_url, config.output_path) print("Workspace downloaded: ", os.listdir(config.output_path)) - context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) + mindspore.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) config.device_num = get_device_num() config.device_id = get_device_id() if not os.path.exists(config.output_path): diff --git 
a/official/cv/MaskRCNN/maskrcnn_resnet50/train.py b/official/cv/MaskRCNN/maskrcnn_resnet50/train.py index 19728e310..0de706e7f 100644 --- a/official/cv/MaskRCNN/maskrcnn_resnet50/train.py +++ b/official/cv/MaskRCNN/maskrcnn_resnet50/train.py @@ -26,12 +26,13 @@ from src.network_define import LossCallBack, WithLossCell, TrainOneStepCell, Los from src.dataset import data_to_mindrecord_byte_image, create_maskrcnn_dataset from src.lr_schedule import dynamic_lr +import mindspore import mindspore.common.dtype as mstype -from mindspore import context, Tensor, Parameter +from mindspore import Tensor, Parameter from mindspore.communication.management import init, get_rank, get_group_size from mindspore.train.callback import CheckpointConfig, ModelCheckpoint, TimeMonitor from mindspore.train import Model -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.train.serialization import load_checkpoint, load_param_into_net from mindspore.nn import Momentum from mindspore.common import set_seed @@ -152,7 +153,7 @@ def load_pretrained_ckpt(net, load_path, device_target): @moxing_wrapper(pre_process=modelarts_pre_process) def train_maskrcnn(): device_target = config.device_target - context.set_context(mode=context.GRAPH_MODE, device_target=device_target, device_id=get_device_id()) + mindspore.set_context(mode=0, device_target=device_target, device_id=get_device_id()) config.mindrecord_dir = os.path.join(config.coco_root, config.mindrecord_dir) print('\ntrain.py config:\n', config) @@ -164,7 +165,7 @@ def train_maskrcnn(): rank = get_rank() dataset_sink_mode_flag = device_target == 'Ascend' device_num = get_group_size() - context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, + mindspore.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True) else: rank = 0 diff --git a/official/cv/MobileNet/mobilenetv1/README.md b/official/cv/MobileNet/mobilenetv1/README.md index 93b2a871f..685c53126 100644 --- a/official/cv/MobileNet/mobilenetv1/README.md +++ b/official/cv/MobileNet/mobilenetv1/README.md @@ -335,7 +335,7 @@ You can start training using python or shell scripts.If the train method is trai ### Result -Inference result will be stored in the example path, you can find result like the followings in `eval/log`. +Inference results will be stored in the example path; you can find results like the following in `eval/log`.
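The MaskRCNN `train.py` hunks earlier in this patch all converge on the same distributed-training setup after the rewrite. A condensed sketch of that idiom, assuming MindSpore 2.0 on a multi-GPU host where the NCCL backend is available:

```python
import mindspore
from mindspore import ParallelMode
from mindspore.communication.management import init, get_rank, get_group_size

mindspore.set_context(mode=0, device_target="GPU")  # 0 == GRAPH_MODE
init()                                              # initialize the NCCL backend
rank = get_rank()                                   # rank of this process
device_num = get_group_size()                       # total number of devices
mindspore.set_auto_parallel_context(device_num=device_num,
                                    parallel_mode=ParallelMode.DATA_PARALLEL,
                                    gradients_mean=True)
```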
```shell Ascend diff --git a/official/cv/MobileNet/mobilenetv1/eval.py b/official/cv/MobileNet/mobilenetv1/eval.py index 30c8a5f30..ce88276f6 100644 --- a/official/cv/MobileNet/mobilenetv1/eval.py +++ b/official/cv/MobileNet/mobilenetv1/eval.py @@ -14,7 +14,7 @@ # ============================================================================ """eval mobilenet_v1.""" import os -import mindspore as ms +import mindspore from mindspore.nn.loss import SoftmaxCrossEntropyWithLogits from src.CrossEntropySmooth import CrossEntropySmooth from src.mobilenet_v1 import mobilenet_v1 as mobilenet @@ -22,7 +22,7 @@ from src.model_utils.config import config from src.model_utils.moxing_adapter import moxing_wrapper, modelarts_process -ms.set_seed(1) +mindspore.set_seed(1) if config.dataset == 'cifar10': from src.dataset import create_dataset1 as create_dataset @@ -39,10 +39,10 @@ def eval_mobilenetv1(): target = config.device_target # init context - ms.set_context(mode=ms.GRAPH_MODE, device_target=target, save_graphs=False) + mindspore.set_context(mode=0, device_target=target, save_graphs=False) if target == "Ascend": device_id = int(os.getenv('DEVICE_ID', '0')) - ms.set_context(device_id=device_id) + mindspore.set_context(device_id=device_id) # create dataset dataset = create_dataset(dataset_path=config.dataset_path, do_train=False, batch_size=config.batch_size, @@ -53,8 +53,8 @@ def eval_mobilenetv1(): net = mobilenet(class_num=config.class_num) # load checkpoint - param_dict = ms.load_checkpoint(config.checkpoint_path) - ms.load_param_into_net(net, param_dict) + param_dict = mindspore.load_checkpoint(config.checkpoint_path) + mindspore.load_param_into_net(net, param_dict) net.set_train(False) # define loss, model @@ -67,7 +67,7 @@ def eval_mobilenetv1(): loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean') # define model - model = ms.Model(net, loss_fn=loss, metrics={'top_1_accuracy', 'top_5_accuracy'}) + model = mindspore.Model(net, loss_fn=loss, metrics={'top_1_accuracy', 'top_5_accuracy'}) # eval model res = model.eval(dataset) diff --git a/official/cv/MobileNet/mobilenetv1/export.py b/official/cv/MobileNet/mobilenetv1/export.py index 679c4f5d1..bb6a9471a 100644 --- a/official/cv/MobileNet/mobilenetv1/export.py +++ b/official/cv/MobileNet/mobilenetv1/export.py @@ -15,7 +15,7 @@ import numpy as np -import mindspore as ms +import mindspore from src.mobilenet_v1 import mobilenet_v1 as mobilenet from src.model_utils.config import config @@ -23,7 +23,7 @@ from src.model_utils.device_adapter import get_device_id from src.model_utils.moxing_adapter import moxing_wrapper, modelarts_export_preprocess -ms.set_context(mode=ms.GRAPH_MODE, device_target=config.device_target) +mindspore.set_context(mode=0, device_target=config.device_target) @moxing_wrapper(pre_process=modelarts_export_preprocess) @@ -31,13 +31,13 @@ def export_mobilenetv1(): """ export_mobilenetv1 """ target = config.device_target if target != "GPU": - ms.set_context(device_id=get_device_id()) + mindspore.set_context(device_id=get_device_id()) network = mobilenet(class_num=config.class_num) - ms.load_checkpoint(config.ckpt_file, net=network) + mindspore.load_checkpoint(config.ckpt_file, net=network) network.set_train(False) - input_data = ms.numpy.zeros([config.batch_size, 3, config.height, config.width]).astype(np.float32) - ms.export(network, input_data, file_name=config.file_name, file_format=config.file_format) + input_data = mindspore.numpy.zeros([config.batch_size, 3, config.height, config.width]).astype(np.float32) + 
mindspore.export(network, input_data, file_name=config.file_name, file_format=config.file_format) if __name__ == '__main__': diff --git a/official/cv/MobileNet/mobilenetv1/src/CrossEntropySmooth.py b/official/cv/MobileNet/mobilenetv1/src/CrossEntropySmooth.py index f8283eb76..7c2c5d00c 100644 --- a/official/cv/MobileNet/mobilenetv1/src/CrossEntropySmooth.py +++ b/official/cv/MobileNet/mobilenetv1/src/CrossEntropySmooth.py @@ -13,7 +13,7 @@ # limitations under the License. # ============================================================================ """define loss function for network""" -import mindspore as ms +import mindspore import mindspore.nn as nn import mindspore.ops as ops @@ -24,8 +24,8 @@ class CrossEntropySmooth(nn.LossBase): super(CrossEntropySmooth, self).__init__() self.onehot = ops.OneHot() self.sparse = sparse - self.on_value = ms.Tensor(1.0 - smooth_factor, ms.float32) - self.off_value = ms.Tensor(1.0 * smooth_factor / (num_classes - 1), ms.float32) + self.on_value = mindspore.Tensor(1.0 - smooth_factor, mindspore.float32) + self.off_value = mindspore.Tensor(1.0 * smooth_factor / (num_classes - 1), mindspore.float32) self.ce = nn.SoftmaxCrossEntropyWithLogits(reduction=reduction) def construct(self, logit, label): diff --git a/official/cv/MobileNet/mobilenetv1/src/dataset.py b/official/cv/MobileNet/mobilenetv1/src/dataset.py index 476d04b1d..46f33dd90 100644 --- a/official/cv/MobileNet/mobilenetv1/src/dataset.py +++ b/official/cv/MobileNet/mobilenetv1/src/dataset.py @@ -17,7 +17,7 @@ create train or eval dataset. """ import os from multiprocessing import cpu_count -import mindspore as ms +import mindspore import mindspore.dataset as ds import mindspore.communication as comm @@ -58,7 +58,7 @@ def create_dataset1(dataset_path, do_train, device_num=1, batch_size=32, target= ds.vision.HWC2CHW() ] - type_cast_op = ds.transforms.transforms.TypeCast(ms.int32) + type_cast_op = ds.transforms.transforms.TypeCast(mindspore.int32) data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=THREAD_NUM) data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=THREAD_NUM) @@ -112,7 +112,7 @@ def create_dataset2(dataset_path, do_train, device_num=1, batch_size=32, target= ds.vision.HWC2CHW() ] - type_cast_op = ds.transforms.transforms.TypeCast(ms.int32) + type_cast_op = ds.transforms.transforms.TypeCast(mindspore.int32) data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=THREAD_NUM) data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=THREAD_NUM) diff --git a/official/cv/MobileNet/mobilenetv1/src/model_utils/moxing_adapter.py b/official/cv/MobileNet/mobilenetv1/src/model_utils/moxing_adapter.py index 31233d6fc..d8cbe4824 100644 --- a/official/cv/MobileNet/mobilenetv1/src/model_utils/moxing_adapter.py +++ b/official/cv/MobileNet/mobilenetv1/src/model_utils/moxing_adapter.py @@ -20,7 +20,7 @@ import functools import zipfile import time -import mindspore as ms +import mindspore from .config import config _global_sync_count = 0 @@ -95,7 +95,7 @@ def moxing_wrapper(pre_process=None, post_process=None): sync_data(config.train_url, config.output_path) print("Workspace downloaded: ", os.listdir(config.output_path)) - ms.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) + mindspore.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) config.device_num = get_device_num() config.device_id = 
get_device_id() if not os.path.exists(config.output_path): @@ -105,7 +105,7 @@ def moxing_wrapper(pre_process=None, post_process=None): pre_process() if config.enable_profiling: - profiler = ms.profiler.Profiler() + profiler = mindspore.profiler.Profiler() run_func(*args, **kwargs) diff --git a/official/cv/MobileNet/mobilenetv1/train.py b/official/cv/MobileNet/mobilenetv1/train.py index e8737c295..8436b1aac 100644 --- a/official/cv/MobileNet/mobilenetv1/train.py +++ b/official/cv/MobileNet/mobilenetv1/train.py @@ -15,7 +15,7 @@ """train mobilenet_v1.""" import os -import mindspore as ms +import mindspore import mindspore.nn as nn import mindspore.communication as comm import mindspore.common.initializer as weight_init @@ -29,7 +29,7 @@ from src.model_utils.moxing_adapter import moxing_wrapper, modelarts_process from src.model_utils.device_adapter import get_device_num -ms.set_seed(1) +mindspore.set_seed(1) if config.dataset == 'cifar10': from src.dataset import create_dataset1 as create_dataset @@ -40,8 +40,8 @@ else: def init_weigth(net): # init weight if config.pre_trained: - param_dict = ms.load_checkpoint(config.pre_trained) - ms.load_param_into_net(net, param_dict) + param_dict = mindspore.load_checkpoint(config.pre_trained) + mindspore.load_param_into_net(net, param_dict) else: for _, cell in net.cells_and_names(): if isinstance(cell, nn.Conv2d): @@ -63,26 +63,26 @@ def train_mobilenetv1(): ckpt_save_dir = config.save_checkpoint_path # init context - ms.set_context(mode=ms.GRAPH_MODE, device_target=target, save_graphs=False) + mindspore.set_context(mode=0, device_target=target, save_graphs=False) # Set mempool block size in PYNATIVE_MODE for improving memory utilization, which will not take effect in GRAPH_MODE - if ms.get_context("mode") == ms.PYNATIVE_MODE: - ms.set_context(mempool_block_size="31GB") + if mindspore.get_context("mode") == 1: + mindspore.set_context(mempool_block_size="31GB") if config.parameter_server: - ms.set_ps_context(enable_ps=True) + mindspore.set_ps_context(enable_ps=True) device_id = int(os.getenv('DEVICE_ID', '0')) if config.run_distribute: if target == "Ascend": - ms.set_context(device_id=device_id) - ms.set_auto_parallel_context(device_num=get_device_num(), parallel_mode=ms.ParallelMode.DATA_PARALLEL, + mindspore.set_context(device_id=device_id) + mindspore.set_auto_parallel_context(device_num=get_device_num(), parallel_mode=mindspore.ParallelMode.DATA_PARALLEL, gradients_mean=True) comm.init() - ms.set_auto_parallel_context(all_reduce_fusion_config=[75]) + mindspore.set_auto_parallel_context(all_reduce_fusion_config=[75]) # GPU target else: comm.init() - ms.set_auto_parallel_context(device_num=comm.get_group_size(), parallel_mode=ms.ParallelMode.DATA_PARALLEL, + mindspore.set_auto_parallel_context(device_num=comm.get_group_size(), parallel_mode=mindspore.ParallelMode.DATA_PARALLEL, gradients_mean=True) ckpt_save_dir = config.save_checkpoint_path + "ckpt_" + str(comm.get_rank()) + "/" @@ -102,7 +102,7 @@ def train_mobilenetv1(): lr = get_lr(lr_init=config.lr_init, lr_end=config.lr_end, lr_max=config.lr_max, warmup_epochs=config.warmup_epochs, total_epochs=config.epoch_size, steps_per_epoch=step_size, lr_decay_mode=config.lr_decay_mode) - lr = ms.Tensor(lr) + lr = mindspore.Tensor(lr) # define opt decayed_params = [] @@ -129,12 +129,12 @@ def train_mobilenetv1(): smooth_factor=config.label_smooth_factor, num_classes=config.class_num) else: loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean') - loss_scale = 
ms.FixedLossScaleManager(config.loss_scale, drop_overflow_update=False) + loss_scale = mindspore.FixedLossScaleManager(config.loss_scale, drop_overflow_update=False) if target == "Ascend": - model = ms.Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics={'acc'}, + model = mindspore.Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics={'acc'}, amp_level="O2", keep_batchnorm_fp32=False) else: - model = ms.Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'}) + model = mindspore.Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'}) # define callbacks time_cb = TimeMonitor(data_size=step_size) diff --git a/official/cv/MobileNet/mobilenetv2/README.md b/official/cv/MobileNet/mobilenetv2/README.md index 66be81f6e..bf808e417 100644 --- a/official/cv/MobileNet/mobilenetv2/README.md +++ b/official/cv/MobileNet/mobilenetv2/README.md @@ -312,7 +312,7 @@ You can start training using python or shell scripts. The usage of shell scripts ### Result -Training result will be stored in the example path. Checkpoints will be stored at `. /checkpoint` by default, and training log will be redirected to `./train.log` like followings with the platform CPU and GPU. +Training results will be stored in the example path. Checkpoints will be stored at `./checkpoint` by default, and the training log will be redirected to `./train.log` like the following on the CPU and GPU platforms. ```log epoch: [ 0/200], step:[ 624/ 625], loss:[5.258/5.258], time:[140412.236], lr:[0.100] @@ -352,7 +352,7 @@ You can start training using python or shell scripts.If the train method is trai ### Result -Inference result will be stored in the example path, you can find result like the followings in `eval.log`. +Inference results will be stored in the example path; you can find results like the following in `eval.log`. ```log result: {'acc': 0.71976314102564111} ckpt=./ckpt_0/mobilenet-200_625.ckpt diff --git a/official/cv/MobileNet/mobilenetv2/eval.py b/official/cv/MobileNet/mobilenetv2/eval.py index f348ca8ec..bd39c52b0 100644 --- a/official/cv/MobileNet/mobilenetv2/eval.py +++ b/official/cv/MobileNet/mobilenetv2/eval.py @@ -16,7 +16,7 @@ eval. """ import os -import mindspore as ms +import mindspore import mindspore.nn as nn from src.dataset import create_dataset from src.models import define_net, load_ckpt @@ -29,7 +29,7 @@ config.is_training = config.is_training_eval @moxing_wrapper(pre_process=modelarts_process) def eval_mobilenetv2(): - ms.set_context(mode=ms.GRAPH_MODE, device_target=config.platform, save_graphs=False) + mindspore.set_context(mode=0, device_target=config.platform, save_graphs=False) config.dataset_path = os.path.join(config.dataset_path, 'validation_preprocess') print('\nconfig: \n', config) if not config.device_id: @@ -46,7 +46,7 @@ def eval_mobilenetv2(): net.set_train(False) loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean') - model = ms.Model(net, loss_fn=loss, metrics={'acc'}) + model = mindspore.Model(net, loss_fn=loss, metrics={'acc'}) res = model.eval(dataset) print(f"result:{res}\npretrain_ckpt={config.pretrain_ckpt}") diff --git a/official/cv/MobileNet/mobilenetv2/export.py b/official/cv/MobileNet/mobilenetv2/export.py index 095fbc824..66ed700fb 100644 --- a/official/cv/MobileNet/mobilenetv2/export.py +++ b/official/cv/MobileNet/mobilenetv2/export.py @@ -16,7 +16,7 @@ mobilenetv2 export file.
""" import numpy as np -import mindspore as ms +import mindspore from src.models import define_net, load_ckpt from src.model_utils.config import config from src.model_utils.moxing_adapter import moxing_wrapper @@ -29,13 +29,13 @@ config.is_training = config.is_training_export def export_mobilenetv2(): """ export_mobilenetv2 """ print('\nconfig: \n', config) - ms.set_context(mode=ms.GRAPH_MODE, device_target=config.platform, save_graphs=False) + mindspore.set_context(mode=0, device_target=config.platform, save_graphs=False) _, _, net = define_net(config, config.is_training) load_ckpt(net, config.ckpt_file) input_shp = [config.batch_size, 3, config.image_height, config.image_width] - input_array = ms.Tensor(np.random.uniform(-1.0, 1.0, size=input_shp).astype(np.float32)) - ms.export(net, input_array, file_name=config.file_name, file_format=config.file_format) + input_array = mindspore.Tensor(np.random.uniform(-1.0, 1.0, size=input_shp).astype(np.float32)) + mindspore.export(net, input_array, file_name=config.file_name, file_format=config.file_format) if __name__ == '__main__': diff --git a/official/cv/MobileNet/mobilenetv2/golden_stick/quantization/simqat/eval.py b/official/cv/MobileNet/mobilenetv2/golden_stick/quantization/simqat/eval.py index 799308e9d..7787481e2 100644 --- a/official/cv/MobileNet/mobilenetv2/golden_stick/quantization/simqat/eval.py +++ b/official/cv/MobileNet/mobilenetv2/golden_stick/quantization/simqat/eval.py @@ -15,7 +15,7 @@ """ eval. """ -import mindspore as ms +import mindspore import mindspore.nn as nn from mindspore.train.serialization import load_checkpoint, load_param_into_net from simqat import create_simqat @@ -33,9 +33,9 @@ config.is_training = config.is_training_eval def eval_mobilenetv2(): """eval_mobilenetv2 """ if config.mode_name == "GRAPH": - ms.set_context(mode=ms.GRAPH_MODE, device_target=config.platform, save_graphs=False) + mindspore.set_context(mode=0, device_target=config.platform, save_graphs=False) else: - ms.set_context(mode=ms.PYNATIVE_MODE, device_target=config.platform, save_graphs=False) + mindspore.set_context(mode=1, device_target=config.platform, save_graphs=False) config.dataset_path = config.dataset_path print('\nconfig: \n', config) if not config.device_id: @@ -48,7 +48,7 @@ def eval_mobilenetv2(): net.set_train(False) loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean') - model = ms.Model(net, loss_fn=loss, metrics={'acc'}) + model = mindspore.Model(net, loss_fn=loss, metrics={'acc'}) dataset = create_dataset_cifar10(dataset_path=config.dataset_path, do_train=False, config=config) step_size = dataset.get_dataset_size() if step_size == 0: diff --git a/official/cv/MobileNet/mobilenetv2/golden_stick/quantization/simqat/train.py b/official/cv/MobileNet/mobilenetv2/golden_stick/quantization/simqat/train.py index ed1cea70d..047625382 100644 --- a/official/cv/MobileNet/mobilenetv2/golden_stick/quantization/simqat/train.py +++ b/official/cv/MobileNet/mobilenetv2/golden_stick/quantization/simqat/train.py @@ -19,7 +19,7 @@ import time import random import numpy as np -import mindspore as ms +import mindspore import mindspore.communication as comm import mindspore.nn as nn from simqat import create_simqat @@ -33,7 +33,7 @@ from src.model_utils.moxing_adapter import moxing_wrapper, modelarts_process from src.model_utils.device_adapter import get_device_id -ms.set_seed(1) +mindspore.set_seed(1) @moxing_wrapper(pre_process=modelarts_process) def train_mobilenetv2(): @@ -41,16 +41,16 @@ def train_mobilenetv2(): if config.platform 
!= "GPU": raise NotImplementedError("SimQAT only support running on GPU now!") if config.mode_name == "GRAPH": - ms.set_context(mode=ms.GRAPH_MODE, device_target=config.platform, + mindspore.set_context(mode=0, device_target=config.platform, save_graphs=False, enable_graph_kernel=True) else: - ms.set_context(mode=ms.PYNATIVE_MODE, device_target=config.platform, + mindspore.set_context(mode=1, device_target=config.platform, save_graphs=False, enable_graph_kernel=True) if config.run_distribute: comm.init() config.rank_id = comm.get_rank() config.rank_size = comm.get_group_size() - ms.set_auto_parallel_context(parallel_mode=ms.ParallelMode.DATA_PARALLEL, + mindspore.set_auto_parallel_context(parallel_mode=mindspore.ParallelMode.DATA_PARALLEL, gradients_mean=True) config.train_dataset_path = config.dataset_path config.eval_dataset_path = config.dataset_path @@ -88,7 +88,7 @@ def train_mobilenetv2(): epoch_size = config.epoch_size # get learning rate - lr = ms.Tensor(get_lr(global_step=0, + lr = mindspore.Tensor(get_lr(global_step=0, lr_init=config.lr_init, lr_end=config.lr_end, lr_max=config.lr_max, @@ -100,7 +100,7 @@ def train_mobilenetv2(): eval_dataset = None if config.pretrain_ckpt == "" or config.freeze_layer != "backbone": opt = nn.Momentum(net.trainable_params(), lr, config.momentum, config.weight_decay) - model = ms.Model(net, loss_fn=loss, optimizer=opt, metrics=metrics, eval_network=dist_eval_network, + model = mindspore.Model(net, loss_fn=loss, optimizer=opt, metrics=metrics, eval_network=dist_eval_network, boost_level=config.boost_mode) cb = config_ckpoint(config, lr, step_size, model, eval_dataset) print("============== Starting Training ==============") @@ -127,15 +127,15 @@ def train_mobilenetv2(): epoch_start = time.time() losses = [] for j in idx_list: - feature = ms.Tensor(np.load(os.path.join(features_path, "feature_{}.npy".format(j)))) - label = ms.Tensor(np.load(os.path.join(features_path, "label_{}.npy".format(j)))) + feature = mindspore.Tensor(np.load(os.path.join(features_path, "feature_{}.npy".format(j)))) + label = mindspore.Tensor(np.load(os.path.join(features_path, "label_{}.npy".format(j)))) losses.append(network(feature, label).asnumpy()) epoch_mseconds = (time.time()-epoch_start) * 1000 per_step_mseconds = epoch_mseconds / step_size print("epoch[{}/{}], iter[{}] cost: {:5.3f}, per step time: {:5.3f}, avg loss: {:5.3f}"\ .format(epoch + 1, epoch_size, step_size, epoch_mseconds, per_step_mseconds, np.mean(np.array(losses)))) if (epoch + 1) % config.save_checkpoint_epochs == 0: - ms.save_checkpoint(net, os.path.join(save_ckpt_path, "mobilenetv2_{}.ckpt".format(epoch + 1))) + mindspore.save_checkpoint(net, os.path.join(save_ckpt_path, "mobilenetv2_{}.ckpt".format(epoch + 1))) print("total cost {:5.4f} s".format(time.time() - start)) diff --git a/official/cv/MobileNet/mobilenetv2/src/dataset.py b/official/cv/MobileNet/mobilenetv2/src/dataset.py index 3204020f5..c4dec7274 100644 --- a/official/cv/MobileNet/mobilenetv2/src/dataset.py +++ b/official/cv/MobileNet/mobilenetv2/src/dataset.py @@ -18,7 +18,7 @@ create train or eval dataset. 
import os import numpy as np -import mindspore as ms +import mindspore import mindspore.dataset as ds @@ -70,7 +70,7 @@ def create_dataset(dataset_path, do_train, config, enable_cache=False, cache_ses else: trans = [decode_op, resize_op, center_crop, normalize_op, change_swap_op] - type_cast_op = ds.transforms.transforms.TypeCast(ms.int32) + type_cast_op = ds.transforms.transforms.TypeCast(mindspore.int32) data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=num_workers) data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=num_workers) @@ -133,7 +133,7 @@ def create_dataset_cifar10(dataset_path, do_train, config, enable_cache=False, c ds.vision.HWC2CHW() ] - type_cast_op = ds.transforms.transforms.TypeCast(ms.int32) + type_cast_op = ds.transforms.transforms.TypeCast(mindspore.int32) data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=num_workers) @@ -168,7 +168,7 @@ def extract_features(net, dataset_path, config): raise ValueError("The step_size of dataset is zero. Check if the images count of train dataset is more \ than batch_size in config.py") - model = ms.Model(net) + model = mindspore.Model(net) for i, data in enumerate(dataset.create_dict_iterator(output_numpy=True)): features_path = os.path.join(features_folder, f"feature_{i}.npy") @@ -176,7 +176,7 @@ def extract_features(net, dataset_path, config): if not os.path.exists(features_path) or not os.path.exists(label_path): image = data["image"] label = data["label"] - features = model.predict(ms.Tensor(image)) + features = model.predict(mindspore.Tensor(image)) np.save(features_path, features.asnumpy()) np.save(label_path, label) print(f"Complete the batch {i + 1}/{step_size}") diff --git a/official/cv/MobileNet/mobilenetv2/src/metric.py b/official/cv/MobileNet/mobilenetv2/src/metric.py index bf10de06d..3a6ee94d1 100644 --- a/official/cv/MobileNet/mobilenetv2/src/metric.py +++ b/official/cv/MobileNet/mobilenetv2/src/metric.py @@ -14,7 +14,7 @@ # ============================================================================ """evaluation metric.""" -import mindspore as ms +import mindspore import mindspore.communication as comm import mindspore.ops as ops import mindspore.nn as nn @@ -56,9 +56,9 @@ class ClassifyCorrectCell(nn.Cell): def construct(self, data, label): outputs = self._network(data) y_pred = self.argmax(outputs) - y_pred = self.cast(y_pred, ms.int32) + y_pred = self.cast(y_pred, mindspore.int32) y_correct = self.equal(y_pred, label) - y_correct = self.cast(y_correct, ms.float32) + y_correct = self.cast(y_correct, mindspore.float32) y_correct = self.reduce_sum(y_correct) if self.run_distribute: y_correct = self.allreduce(y_correct) diff --git a/official/cv/MobileNet/mobilenetv2/src/mobilenetV2.py b/official/cv/MobileNet/mobilenetv2/src/mobilenetV2.py index c762e8ea9..c4a50ba13 100644 --- a/official/cv/MobileNet/mobilenetv2/src/mobilenetV2.py +++ b/official/cv/MobileNet/mobilenetv2/src/mobilenetV2.py @@ -14,7 +14,7 @@ # ============================================================================ """MobileNetV2 model define""" import numpy as np -import mindspore as ms +import mindspore import mindspore.nn as nn import mindspore.ops as ops @@ -211,17 +211,17 @@ class MobileNetV2Backbone(nn.Cell): for _, m in self.cells_and_names(): if isinstance(m, nn.Conv2d): n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels - m.weight.set_data(ms.Tensor(np.random.normal(0, np.sqrt(2. 
/ n), + m.weight.set_data(mindspore.Tensor(np.random.normal(0, np.sqrt(2. / n), m.weight.data.shape).astype("float32"))) if m.bias is not None: m.bias.set_data( - ms.Tensor(np.zeros(m.bias.data.shape, dtype="float32"), dtype=ms.float32)) + mindspore.Tensor(np.zeros(m.bias.data.shape, dtype="float32"), dtype=mindspore.float32)) elif isinstance(m, nn.BatchNorm2d): m.gamma.set_data( - ms.Tensor(np.ones(m.gamma.data.shape, dtype="float32"))) + mindspore.Tensor(np.ones(m.gamma.data.shape, dtype="float32"))) m.beta.set_data( - ms.Tensor(np.zeros(m.beta.data.shape, dtype="float32"), dtype=ms.float32)) + mindspore.Tensor(np.zeros(m.beta.data.shape, dtype="float32"), dtype=mindspore.float32)) @property def get_features(self): @@ -280,11 +280,11 @@ class MobileNetV2Head(nn.Cell): self.init_parameters_data() for _, m in self.cells_and_names(): if isinstance(m, nn.Dense): - m.weight.set_data(ms.Tensor(np.random.normal( + m.weight.set_data(mindspore.Tensor(np.random.normal( 0, 0.01, m.weight.data.shape).astype("float32"))) if m.bias is not None: m.bias.set_data( - ms.Tensor(np.zeros(m.bias.data.shape, dtype="float32"), dtype=ms.float32)) + mindspore.Tensor(np.zeros(m.bias.data.shape, dtype="float32"), dtype=mindspore.float32)) class MobileNetV2Combine(nn.Cell): diff --git a/official/cv/MobileNet/mobilenetv2/src/mobilenetV2_fusion.py b/official/cv/MobileNet/mobilenetv2/src/mobilenetV2_fusion.py index 6141916cd..acd3788c2 100644 --- a/official/cv/MobileNet/mobilenetv2/src/mobilenetV2_fusion.py +++ b/official/cv/MobileNet/mobilenetv2/src/mobilenetV2_fusion.py @@ -15,7 +15,7 @@ # """MobileNetV2 Quant model define""" import numpy as np -import minspore as ms +import mindspore import mindspore.nn as nn import mindspore.ops as ops @@ -215,25 +215,25 @@ class mobilenetV2(nn.Cell): for _, m in self.cells_and_names(): if isinstance(m, nn.Conv2d): n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels - w = ms.Tensor(np.random.normal(0, np.sqrt(2. / n), m.weight.data.shape).astype("float32")) + w = mindspore.Tensor(np.random.normal(0, np.sqrt(2. / n), m.weight.data.shape).astype("float32")) m.weight.set_data(w) if m.bias is not None: - m.bias.set_data(ms.numpy.zeros(m.bias.data.shape, dtype="float32")) + m.bias.set_data(mindspore.numpy.zeros(m.bias.data.shape, dtype="float32")) elif isinstance(m, nn.Conv2dBnAct): n = m.conv.kernel_size[0] * m.conv.kernel_size[1] * m.conv.out_channels - w = ms.Tensor(np.random.normal(0, np.sqrt(2. / n), m.conv.weight.data.shape).astype("float32")) + w = mindspore.Tensor(np.random.normal(0, np.sqrt(2. 
/ n), m.conv.weight.data.shape).astype("float32")) m.conv.weight.set_data(w) if m.conv.bias is not None: - m.conv.bias.set_data(ms.numpy.zeros(m.conv.bias.data.shape, dtype="float32")) + m.conv.bias.set_data(mindspore.numpy.zeros(m.conv.bias.data.shape, dtype="float32")) elif isinstance(m, nn.BatchNorm2d): - m.gamma.set_data(ms.Tensor(np.ones(m.gamma.data.shape, dtype="float32"))) - m.beta.set_data(ms.numpy.zeros(m.beta.data.shape, dtype="float32")) + m.gamma.set_data(mindspore.Tensor(np.ones(m.gamma.data.shape, dtype="float32"))) + m.beta.set_data(mindspore.numpy.zeros(m.beta.data.shape, dtype="float32")) elif isinstance(m, nn.Dense): - m.weight.set_data(ms.Tensor(np.random.normal(0, 0.01, m.weight.data.shape).astype("float32"))) + m.weight.set_data(mindspore.Tensor(np.random.normal(0, 0.01, m.weight.data.shape).astype("float32"))) if m.bias is not None: - m.bias.set_data(ms.numpy.zeros(m.bias.data.shape, dtype="float32")) + m.bias.set_data(mindspore.numpy.zeros(m.bias.data.shape, dtype="float32")) elif isinstance(m, nn.DenseBnAct): m.dense.weight.set_data( - ms.Tensor(np.random.normal(0, 0.01, m.dense.weight.data.shape).astype("float32"))) + mindspore.Tensor(np.random.normal(0, 0.01, m.dense.weight.data.shape).astype("float32"))) if m.dense.bias is not None: - m.dense.bias.set_data(ms.numpy.zeros(m.dense.bias.data.shape, dtype="float32")) + m.dense.bias.set_data(mindspore.numpy.zeros(m.dense.bias.data.shape, dtype="float32")) diff --git a/official/cv/MobileNet/mobilenetv2/src/model_utils/moxing_adapter.py b/official/cv/MobileNet/mobilenetv2/src/model_utils/moxing_adapter.py index 12a72538d..8f604f142 100644 --- a/official/cv/MobileNet/mobilenetv2/src/model_utils/moxing_adapter.py +++ b/official/cv/MobileNet/mobilenetv2/src/model_utils/moxing_adapter.py @@ -18,7 +18,7 @@ import os import functools import time -import mindspore as ms +import mindspore from .config import config _global_sync_count = 0 @@ -92,7 +92,7 @@ def moxing_wrapper(pre_process=None, post_process=None): sync_data(config.train_url, config.output_path) print("Workspace downloaded: ", os.listdir(config.output_path)) - ms.context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) + mindspore.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) config.device_num = get_device_num() config.device_id = get_device_id() if not os.path.exists(config.output_path): @@ -102,7 +102,7 @@ def moxing_wrapper(pre_process=None, post_process=None): pre_process() if config.enable_profiling: - profiler = ms.profiler.Profiler() + profiler = mindspore.profiler.Profiler() run_func(*args, **kwargs) diff --git a/official/cv/MobileNet/mobilenetv2/src/models.py b/official/cv/MobileNet/mobilenetv2/src/models.py index 9e4324f90..03b778cff 100644 --- a/official/cv/MobileNet/mobilenetv2/src/models.py +++ b/official/cv/MobileNet/mobilenetv2/src/models.py @@ -14,7 +14,7 @@ # ============================================================================ import time import numpy as np -import mindspore as ms +import mindspore import mindspore.nn as nn import mindspore.ops as ops from mindspore.train.callback import Callback @@ -40,15 +40,15 @@ class CrossEntropyWithLabelSmooth(nn.LossBase): def __init__(self, smooth_factor=0., num_classes=1000): super(CrossEntropyWithLabelSmooth, self).__init__() self.onehot = ops.OneHot() - self.on_value = ms.Tensor(1.0 - smooth_factor, ms.float32) - self.off_value = ms.Tensor(1.0 * smooth_factor / - (num_classes - 1), ms.float32) + self.on_value = 
mindspore.Tensor(1.0 - smooth_factor, mindspore.float32) + self.off_value = mindspore.Tensor(1.0 * smooth_factor / + (num_classes - 1), mindspore.float32) self.ce = nn.SoftmaxCrossEntropyWithLogits() self.mean = ops.ReduceMean(False) self.cast = ops.Cast() def construct(self, logit, label): - one_hot_label = self.onehot(self.cast(label, ms.int32), ops.shape(logit)[1], + one_hot_label = self.onehot(self.cast(label, mindspore.int32), ops.shape(logit)[1], self.on_value, self.off_value) out_loss = self.ce(logit, one_hot_label) out_loss = self.mean(out_loss, 0) @@ -66,7 +66,7 @@ class Monitor(Callback): None Examples: - >>> Monitor(100,lr_init=ms.Tensor([0.05]*100).asnumpy()) + >>> Monitor(100,lr_init=mindspore.Tensor([0.05]*100).asnumpy()) """ def __init__(self, lr_init=None, model=None, eval_dataset=None): @@ -107,9 +107,9 @@ class Monitor(Callback): step_mseconds = (time.time() - self.step_time) * 1000 step_loss = cb_params.net_outputs - if isinstance(step_loss, (tuple, list)) and isinstance(step_loss[0], ms.Tensor): + if isinstance(step_loss, (tuple, list)) and isinstance(step_loss[0], mindspore.Tensor): step_loss = step_loss[0] - if isinstance(step_loss, ms.Tensor): + if isinstance(step_loss, mindspore.Tensor): step_loss = np.mean(step_loss.asnumpy()) self.losses.append(step_loss) @@ -123,8 +123,8 @@ class Monitor(Callback): def load_ckpt(network, pretrain_ckpt_path, trainable=True): """load checkpoint into network.""" - param_dict = ms.load_checkpoint(pretrain_ckpt_path) - ms.load_param_into_net(network, param_dict) + param_dict = mindspore.load_checkpoint(pretrain_ckpt_path) + mindspore.load_param_into_net(network, param_dict) if not trainable: for param in network.get_parameters(): param.requires_grad = False diff --git a/official/cv/MobileNet/mobilenetv2/train.py b/official/cv/MobileNet/mobilenetv2/train.py index d6c657445..283c5fcff 100644 --- a/official/cv/MobileNet/mobilenetv2/train.py +++ b/official/cv/MobileNet/mobilenetv2/train.py @@ -19,7 +19,7 @@ import time import random import numpy as np -import mindspore as ms +import mindspore import mindspore.communication as comm import mindspore.nn as nn @@ -33,7 +33,7 @@ from src.model_utils.moxing_adapter import moxing_wrapper, modelarts_process from src.model_utils.device_adapter import get_device_id -ms.set_seed(1) +mindspore.set_seed(1) @moxing_wrapper(pre_process=modelarts_process) @@ -41,12 +41,12 @@ def train_mobilenetv2(): """ train_mobilenetv2 """ if config.platform == "CPU": config.run_distribute = False - ms.set_context(mode=ms.GRAPH_MODE, device_target=config.platform, save_graphs=False) + mindspore.set_context(mode=0, device_target=config.platform, save_graphs=False) if config.run_distribute: comm.init() config.rank_id = comm.get_rank() config.rank_size = comm.get_group_size() - ms.set_auto_parallel_context(parallel_mode=ms.ParallelMode.DATA_PARALLEL, + mindspore.set_auto_parallel_context(parallel_mode=mindspore.ParallelMode.DATA_PARALLEL, gradients_mean=True) config.train_dataset_path = os.path.join(config.dataset_path, 'train') config.eval_dataset_path = os.path.join(config.dataset_path, 'validation_preprocess') @@ -61,7 +61,7 @@ def train_mobilenetv2(): enable_cache=config.enable_cache, cache_session_id=config.cache_session_id) step_size = dataset.get_dataset_size() if config.platform == "GPU": - ms.set_context(enable_graph_kernel=True) + mindspore.set_context(enable_graph_kernel=True) if config.pretrain_ckpt: if config.freeze_layer == "backbone": load_ckpt(backbone_net, config.pretrain_ckpt, trainable=False) @@ -84,7 
+84,7 @@ def train_mobilenetv2(): epoch_size = config.epoch_size # get learning rate - lr = ms.Tensor(get_lr(global_step=0, + lr = mindspore.Tensor(get_lr(global_step=0, lr_init=config.lr_init, lr_end=config.lr_end, lr_max=config.lr_max, @@ -100,10 +100,10 @@ def train_mobilenetv2(): eval_dataset = create_dataset(dataset_path=config.eval_dataset_path, do_train=False, config=config) if config.pretrain_ckpt == "" or config.freeze_layer != "backbone": if config.platform == "Ascend": - loss_scale = ms.FixedLossScaleManager(config.loss_scale, drop_overflow_update=False) + loss_scale = mindspore.FixedLossScaleManager(config.loss_scale, drop_overflow_update=False) group_params = build_params_groups(net) opt = nn.Momentum(group_params, lr, config.momentum, loss_scale=config.loss_scale) - model = ms.Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, + model = mindspore.Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics=metrics, eval_network=dist_eval_network, amp_level="O2", keep_batchnorm_fp32=False, boost_level=config.boost_mode, @@ -111,7 +111,7 @@ def train_mobilenetv2(): else: opt = nn.Momentum(net.trainable_params(), lr, config.momentum, config.weight_decay) - model = ms.Model(net, loss_fn=loss, optimizer=opt, metrics=metrics, eval_network=dist_eval_network, + model = mindspore.Model(net, loss_fn=loss, optimizer=opt, metrics=metrics, eval_network=dist_eval_network, boost_level=config.boost_mode) cb = config_ckpoint(config, lr, step_size, model, eval_dataset) print("============== Starting Training ==============") @@ -138,15 +138,15 @@ def train_mobilenetv2(): epoch_start = time.time() losses = [] for j in idx_list: - feature = ms.Tensor(np.load(os.path.join(features_path, "feature_{}.npy".format(j)))) - label = ms.Tensor(np.load(os.path.join(features_path, "label_{}.npy".format(j)))) + feature = mindspore.Tensor(np.load(os.path.join(features_path, "feature_{}.npy".format(j)))) + label = mindspore.Tensor(np.load(os.path.join(features_path, "label_{}.npy".format(j)))) losses.append(network(feature, label).asnumpy()) epoch_mseconds = (time.time()-epoch_start) * 1000 per_step_mseconds = epoch_mseconds / step_size print("epoch[{}/{}], iter[{}] cost: {:5.3f}, per step time: {:5.3f}, avg loss: {:5.3f}"\ .format(epoch + 1, epoch_size, step_size, epoch_mseconds, per_step_mseconds, np.mean(np.array(losses)))) if (epoch + 1) % config.save_checkpoint_epochs == 0: - ms.save_checkpoint(net, os.path.join(save_ckpt_path, "mobilenetv2_{}.ckpt".format(epoch + 1))) + mindspore.save_checkpoint(net, os.path.join(save_ckpt_path, "mobilenetv2_{}.ckpt".format(epoch + 1))) print("total cost {:5.4f} s".format(time.time() - start)) if config.enable_cache: diff --git a/official/cv/MobileNet/mobilenetv3/Readme.md b/official/cv/MobileNet/mobilenetv3/Readme.md index f0d5116e0..c14f0f44f 100644 --- a/official/cv/MobileNet/mobilenetv3/Readme.md +++ b/official/cv/MobileNet/mobilenetv3/Readme.md @@ -105,7 +105,7 @@ You can start training using python or shell scripts. The usage of shell scripts ### Result -Training result will be stored in the example path. Checkpoints will be stored at `. /checkpoint` by default, and training log will be redirected to `./train/train.log` like followings. +Training result will be stored in the example path. Checkpoints will be stored at `. /checkpoint` by default, and training log will be redirected to `./train/train.log` like following. 
```bash epoch: [ 0/200], step:[ 624/ 625], loss:[5.258/5.258], time:[140412.236], lr:[0.100] @@ -140,7 +140,7 @@ You can start training using python or shell scripts. The usage of shell scripts ### Result -Inference result will be stored in the example path, you can find result like the followings in `val.log`. +Inference result will be stored in the example path, you can find result like the following in `val.log`. ```bash result: {'acc': 0.71976314102564111} ckpt=/path/to/checkpoint/mobilenet-200_625.ckpt diff --git a/official/cv/MobileNet/mobilenetv3/eval.py b/official/cv/MobileNet/mobilenetv3/eval.py index 38657813e..33a85755b 100644 --- a/official/cv/MobileNet/mobilenetv3/eval.py +++ b/official/cv/MobileNet/mobilenetv3/eval.py @@ -16,7 +16,7 @@ eval. """ import argparse -import mindspore as ms +import mindspore from mindspore import nn from src.dataset import create_dataset from src.dataset import create_dataset_cifar @@ -36,7 +36,7 @@ if __name__ == '__main__': config = None if args_opt.device_target == "GPU": config = config_gpu - ms.set_context(mode=ms.GRAPH_MODE, + mindspore.set_context(mode=0, device_target="GPU", save_graphs=False) dataset = create_dataset(dataset_path=args_opt.dataset_path, do_train=False, @@ -45,7 +45,7 @@ if __name__ == '__main__': batch_size=config.batch_size) elif args_opt.device_target == "CPU": config = config_cpu - ms.set_context(mode=ms.GRAPH_MODE, + mindspore.set_context(mode=0, device_target="CPU", save_graphs=False) dataset = create_dataset_cifar(dataset_path=args_opt.dataset_path, do_train=False, @@ -59,10 +59,10 @@ if __name__ == '__main__': step_size = dataset.get_dataset_size() if args_opt.checkpoint_path: - param_dict = ms.load_checkpoint(args_opt.checkpoint_path) - ms.load_param_into_net(net, param_dict) + param_dict = mindspore.load_checkpoint(args_opt.checkpoint_path) + mindspore.load_param_into_net(net, param_dict) net.set_train(False) - model = ms.Model(net, loss_fn=loss, metrics={'acc'}) + model = mindspore.Model(net, loss_fn=loss, metrics={'acc'}) res = model.eval(dataset) print("result:", res, "ckpt=", args_opt.checkpoint_path) diff --git a/official/cv/MobileNet/mobilenetv3/export.py b/official/cv/MobileNet/mobilenetv3/export.py index 6dabadfd7..f40a07900 100644 --- a/official/cv/MobileNet/mobilenetv3/export.py +++ b/official/cv/MobileNet/mobilenetv3/export.py @@ -17,7 +17,7 @@ mobilenetv3 export mindir. 
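
The export scripts in this patch all follow the same load-then-trace flow; only the module prefix changes. A runnable miniature of that flow, using a stand-in network (the real scripts build `mobilenet_v3_large`, load a checkpoint via `mindspore.load_checkpoint` / `mindspore.load_param_into_net`, and use the configured input shape):

```python
import numpy as np
import mindspore
import mindspore.nn as nn

net = nn.Dense(3, 2)  # stand-in for mobilenet_v3_large(...); any Cell works
# In the real script:
#   param_dict = mindspore.load_checkpoint(args_opt.checkpoint_path)
#   mindspore.load_param_into_net(net, param_dict)
input_array = mindspore.Tensor(
    np.random.uniform(-1.0, 1.0, size=[1, 3]).astype(np.float32))
mindspore.export(net, input_array, file_name="demo_net", file_format="MINDIR")
```
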
""" import argparse import numpy as np -import mindspore as ms +import mindspore from src.config import config_gpu from src.config import config_cpu from src.config import config_ascend @@ -35,20 +35,20 @@ if __name__ == '__main__': cfg = None if args_opt.device_target == "GPU": cfg = config_gpu - ms.set_context(mode=ms.GRAPH_MODE, device_target="GPU") + mindspore.set_context(mode=0, device_target="GPU") elif args_opt.device_target == "CPU": cfg = config_cpu - ms.set_context(mode=ms.GRAPH_MODE, device_target="CPU") + mindspore.set_context(mode=0, device_target="CPU") elif args_opt.device_target == "Ascend": cfg = config_ascend - ms.set_context(mode=ms.GRAPH_MODE, device_target="Ascend") + mindspore.set_context(mode=0, device_target="Ascend") else: raise ValueError("Unsupported device_target.") net = mobilenet_v3_large(num_classes=cfg.num_classes, activation="Softmax") - param_dict = ms.load_checkpoint(args_opt.checkpoint_path) - ms.load_param_into_net(net, param_dict) + param_dict = mindspore.load_checkpoint(args_opt.checkpoint_path) + mindspore.load_param_into_net(net, param_dict) input_shp = [1, 3, cfg.image_height, cfg.image_width] - input_array = ms.Tensor(np.random.uniform(-1.0, 1.0, size=input_shp).astype(np.float32)) - ms.export(net, input_array, file_name=args_opt.file_name, file_format=args_opt.file_format) + input_array = mindspore.Tensor(np.random.uniform(-1.0, 1.0, size=input_shp).astype(np.float32)) + mindspore.export(net, input_array, file_name=args_opt.file_name, file_format=args_opt.file_format) diff --git a/official/cv/MobileNet/mobilenetv3/infer_onnx.py b/official/cv/MobileNet/mobilenetv3/infer_onnx.py index b478067af..75fc717cb 100644 --- a/official/cv/MobileNet/mobilenetv3/infer_onnx.py +++ b/official/cv/MobileNet/mobilenetv3/infer_onnx.py @@ -18,7 +18,7 @@ import argparse import onnxruntime import numpy as np -import mindspore as ms +import mindspore from mindspore import Tensor from mindspore import ops from src.dataset import create_dataset @@ -64,7 +64,7 @@ if __name__ == '__main__': model_predict = session.run(None, inputs) model_predict = np.expand_dims(np.squeeze(model_predict), axis=0) - input_x = Tensor(model_predict[0], ms.float16) + input_x = Tensor(model_predict[0], mindspore.float16) _, k_label = topk(input_x, k) if k_label[0] == labels: correct_top1 = correct_top1 + 1 diff --git a/official/cv/MobileNet/mobilenetv3/src/dataset.py b/official/cv/MobileNet/mobilenetv3/src/dataset.py index 43c2b7b63..006d88100 100644 --- a/official/cv/MobileNet/mobilenetv3/src/dataset.py +++ b/official/cv/MobileNet/mobilenetv3/src/dataset.py @@ -15,7 +15,7 @@ """ create train or eval dataset. 
""" -import mindspore as ms +import mindspore import mindspore.dataset as ds @@ -67,7 +67,7 @@ def create_dataset(dataset_path, do_train, config, device_target, batch_size=32, else: trans = [decode_op, resize_op, center_crop, normalize_op, change_swap_op] - type_cast_op = ds.transforms.transforms.TypeCast(ms.int32) + type_cast_op = ds.transforms.transforms.TypeCast(mindspore.int32) data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=8) data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8) @@ -117,7 +117,7 @@ def create_dataset_cifar(dataset_path, ds.vision.HWC2CHW() ] - type_cast_op = ds.transforms.transforms.TypeCast(ms.int32) + type_cast_op = ds.transforms.transforms.TypeCast(mindspore.int32) data_set = data_set.map(operations=type_cast_op, input_columns="label", diff --git a/official/cv/MobileNet/mobilenetv3/src/mobilenetV3.py b/official/cv/MobileNet/mobilenetv3/src/mobilenetV3.py index d243e70e6..58361ded6 100644 --- a/official/cv/MobileNet/mobilenetv3/src/mobilenetV3.py +++ b/official/cv/MobileNet/mobilenetv3/src/mobilenetV3.py @@ -15,7 +15,7 @@ """MobileNetV3 model define""" from functools import partial import numpy as np -import mindspore as ms +import mindspore import mindspore.nn as nn import mindspore.ops as ops @@ -339,21 +339,21 @@ class MobileNetV3(nn.Cell): for _, m in self.cells_and_names(): if isinstance(m, (nn.Conv2d)): n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels - m.weight.set_data(ms.Tensor(np.random.normal(0, np.sqrt(2. / n), + m.weight.set_data(mindspore.Tensor(np.random.normal(0, np.sqrt(2. / n), m.weight.data.shape).astype("float32"))) if m.bias is not None: m.bias.set_data( - ms.numpy.zeros(m.bias.data.shape, dtype="float32")) + mindspore.numpy.zeros(m.bias.data.shape, dtype="float32")) elif isinstance(m, nn.BatchNorm2d): m.gamma.set_data( - ms.Tensor(np.ones(m.gamma.data.shape, dtype="float32"))) + mindspore.Tensor(np.ones(m.gamma.data.shape, dtype="float32"))) m.beta.set_data( - ms.numpy.zeros(m.beta.data.shape, dtype="float32")) + mindspore.numpy.zeros(m.beta.data.shape, dtype="float32")) elif isinstance(m, nn.Dense): - m.weight.set_data(ms.Tensor(np.random.normal( + m.weight.set_data(mindspore.Tensor(np.random.normal( 0, 0.01, m.weight.data.shape).astype("float32"))) if m.bias is not None: - m.bias.set_data(ms.numpy.zeros(m.bias.data.shape, dtype="float32")) + m.bias.set_data(mindspore.numpy.zeros(m.bias.data.shape, dtype="float32")) def mobilenet_v3(model_name, **kwargs): diff --git a/official/cv/MobileNet/mobilenetv3/train.py b/official/cv/MobileNet/mobilenetv3/train.py index 20f51ba9a..32a4dd57f 100644 --- a/official/cv/MobileNet/mobilenetv3/train.py +++ b/official/cv/MobileNet/mobilenetv3/train.py @@ -19,7 +19,7 @@ import argparse import ast import numpy as np -import mindspore as ms +import mindspore import mindspore.nn as nn import mindspore.ops as ops from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, Callback @@ -32,7 +32,7 @@ from src.config import config_gpu from src.config import config_cpu from src.mobilenetV3 import mobilenet_v3_large -ms.set_seed(1) +mindspore.set_seed(1) parser = argparse.ArgumentParser(description='Image classification') parser.add_argument('--dataset_path', type=str, default=None, help='Dataset path') @@ -42,16 +42,16 @@ parser.add_argument('--run_distribute', type=ast.literal_eval, default=False, he args_opt = parser.parse_args() if args_opt.device_target == "GPU": - ms.set_context(mode=ms.GRAPH_MODE, + 
mindspore.set_context(mode=0, device_target="GPU", save_graphs=False) if args_opt.run_distribute: init() - ms.set_auto_parallel_context(device_num=get_group_size(), - parallel_mode=ms.ParallelMode.DATA_PARALLEL, + mindspore.set_auto_parallel_context(device_num=get_group_size(), + parallel_mode=mindspore.ParallelMode.DATA_PARALLEL, gradients_mean=True) elif args_opt.device_target == "CPU": - ms.set_context(mode=ms.GRAPH_MODE, + mindspore.set_context(mode=0, device_target="CPU", save_graphs=False) else: @@ -76,14 +76,14 @@ class CrossEntropyWithLabelSmooth(nn.LossBase): def __init__(self, smooth_factor=0., num_classes=1000): super(CrossEntropyWithLabelSmooth, self).__init__() self.onehot = ops.OneHot() - self.on_value = ms.Tensor(1.0 - smooth_factor, ms.float32) - self.off_value = ms.Tensor(1.0 * smooth_factor / (num_classes - 1), ms.float32) + self.on_value = mindspore.Tensor(1.0 - smooth_factor, mindspore.float32) + self.off_value = mindspore.Tensor(1.0 * smooth_factor / (num_classes - 1), mindspore.float32) self.ce = nn.SoftmaxCrossEntropyWithLogits() self.mean = ops.ReduceMean(False) self.cast = ops.Cast() def construct(self, logit, label): - one_hot_label = self.onehot(self.cast(label, ms.int32), ops.shape(logit)[1], + one_hot_label = self.onehot(self.cast(label, mindspore.int32), ops.shape(logit)[1], self.on_value, self.off_value) out_loss = self.ce(logit, one_hot_label) out_loss = self.mean(out_loss, 0) @@ -101,7 +101,7 @@ class Monitor(Callback): None Examples: - >>> Monitor(100,lr_init=ms.Tensor([0.05]*100).asnumpy()) + >>> Monitor(100,lr_init=mindspore.Tensor([0.05]*100).asnumpy()) """ def __init__(self, lr_init=None): @@ -130,9 +130,9 @@ class Monitor(Callback): step_mseconds = (time.time() - self.step_time) * 1000 step_loss = cb_params.net_outputs - if isinstance(step_loss, (tuple, list)) and isinstance(step_loss[0], ms.Tensor): + if isinstance(step_loss, (tuple, list)) and isinstance(step_loss[0], mindspore.Tensor): step_loss = step_loss[0] - if isinstance(step_loss, ms.Tensor): + if isinstance(step_loss, mindspore.Tensor): step_loss = np.mean(step_loss.asnumpy()) self.losses.append(step_loss) @@ -148,7 +148,7 @@ if __name__ == '__main__': config_ = None if args_opt.device_target == "GPU": config_ = config_gpu - ms.set_context(enable_graph_kernel=True) + mindspore.set_context(enable_graph_kernel=True) elif args_opt.device_target == "CPU": config_ = config_cpu else: @@ -183,12 +183,12 @@ if __name__ == '__main__': step_size = dataset.get_dataset_size() # resume if args_opt.pre_trained: - param_dict = ms.load_checkpoint(args_opt.pre_trained) - ms.load_param_into_net(net, param_dict) + param_dict = mindspore.load_checkpoint(args_opt.pre_trained) + mindspore.load_param_into_net(net, param_dict) # define optimizer - loss_scale = ms.FixedLossScaleManager( + loss_scale = mindspore.FixedLossScaleManager( config_.loss_scale, drop_overflow_update=False) - lr = ms.Tensor(get_lr(global_step=0, + lr = mindspore.Tensor(get_lr(global_step=0, lr_init=0, lr_end=0, lr_max=config_.lr, @@ -198,7 +198,7 @@ if __name__ == '__main__': opt = nn.Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, config_.momentum, config_.weight_decay, config_.loss_scale) # define model - model = ms.Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale) + model = mindspore.Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale) cb = [Monitor(lr_init=lr.asnumpy())] if args_opt.run_distribute and args_opt.device_target != "CPU": diff --git a/official/cv/OCRNet/eval.py 
b/official/cv/OCRNet/eval.py index 08d0f7617..312610065 100644 --- a/official/cv/OCRNet/eval.py +++ b/official/cv/OCRNet/eval.py @@ -18,7 +18,8 @@ import argparse import ast import numpy as np -from mindspore import context, DatasetHelper +import mindspore +from mindspore import DatasetHelper from mindspore import ops as P from mindspore.dataset import engine as de from mindspore.train.serialization import load_param_into_net, load_checkpoint @@ -113,7 +114,7 @@ def testval(dataset, helper, model, num_classes=19, ignore_label=255, scales=Non def main(): """Inference process.""" # Set context - context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target) + mindspore.set_context(mode=0, device_target=config.device_target) # Initialize network net = get_seg_model(config) param_dict = load_checkpoint(ckpt_file_name=config.checkpoint_path) diff --git a/official/cv/OCRNet/export.py b/official/cv/OCRNet/export.py index 7f0279a11..cdd49101c 100644 --- a/official/cv/OCRNet/export.py +++ b/official/cv/OCRNet/export.py @@ -17,7 +17,8 @@ import argparse import numpy as np -from mindspore import Tensor, context, load_checkpoint, load_param_into_net, export +import mindspore +from mindspore import Tensor, load_checkpoint, load_param_into_net, export from src.config import config_hrnetv2_w48 as config from src.seg_hrnet_ocr import get_seg_model @@ -35,7 +36,7 @@ def main(): args = parser.parse_args() - context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, device_id=args.device_id) + mindspore.set_context(mode=0, device_target=args.device_target, device_id=args.device_id) net = get_seg_model(config) params_dict = load_checkpoint(args.checkpoint_file) diff --git a/official/cv/OCRNet/src/model_utils/moxing_adapter.py b/official/cv/OCRNet/src/model_utils/moxing_adapter.py index 08a6797de..33381866d 100644 --- a/official/cv/OCRNet/src/model_utils/moxing_adapter.py +++ b/official/cv/OCRNet/src/model_utils/moxing_adapter.py @@ -16,7 +16,7 @@ """Moxing adapter for ModelArts""" import os import functools -from mindspore import context +import mindspore from src.config import show_config @@ -101,7 +101,7 @@ def moxing_wrapper(config, pre_process=None, post_process=None): sync_data(config.eval_data_url, config.eval_data_path) print("Workspace downloaded: ", os.listdir(config.eval_data_path)) - context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) + mindspore.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) config.device_num = get_device_num() config.device_id = get_device_id() if not os.path.exists(config.output_path): diff --git a/official/cv/OCRNet/train.py b/official/cv/OCRNet/train.py index df6b6f37f..1697afe84 100644 --- a/official/cv/OCRNet/train.py +++ b/official/cv/OCRNet/train.py @@ -19,11 +19,12 @@ import ast import os import numpy as np -from mindspore import context, Model +import mindspore +from mindspore import Model from mindspore import dataset as de from mindspore.common import set_seed from mindspore.communication.management import init, get_rank, get_group_size -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.nn import SGD from mindspore.train.callback import LossMonitor, TimeMonitor, ModelCheckpoint, CheckpointConfig from mindspore.train.loss_scale_manager import FixedLossScaleManager @@ -109,13 +110,13 @@ def parse_args(): @moxing_wrapper(config) def main(): """Training process.""" - context.set_context(mode=context.GRAPH_MODE, 
device_target=config.device_target) + mindspore.set_context(mode=0, device_target=config.device_target) if config.run_distribute: init() device_id = int(os.getenv("DEVICE_ID")) if config.device_target == "Ascend" else get_rank() device_num = int(os.getenv("RANK_SIZE")) if config.device_target == "Ascend" else get_group_size() parallel_mode = ParallelMode.DATA_PARALLEL - context.set_auto_parallel_context(parallel_mode=parallel_mode, + mindspore.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=device_num) else: diff --git a/official/cv/OpenPose/eval.py b/official/cv/OpenPose/eval.py index 081ed9b30..410fc8ac5 100644 --- a/official/cv/OpenPose/eval.py +++ b/official/cv/OpenPose/eval.py @@ -23,7 +23,8 @@ from scipy.ndimage.filters import gaussian_filter from tqdm import tqdm from pycocotools.coco import COCO as LoadAnn from pycocotools.cocoeval import COCOeval as MapEval -from mindspore import context, Tensor +import mindspore +from mindspore import Tensor from mindspore.train.serialization import load_checkpoint, load_param_into_net from mindspore.communication.management import init from mindspore.common import dtype as mstype @@ -36,7 +37,7 @@ from src.model_utils.device_adapter import get_device_id, get_rank_id, get_devic warnings.filterwarnings("ignore") devid = get_device_id() -context.set_context(mode=context.GRAPH_MODE, +mindspore.set_context(mode=0, device_target=config.device_target, save_graphs=False, device_id=devid) show_gt = 0 diff --git a/official/cv/OpenPose/export.py b/official/cv/OpenPose/export.py index 5e83cbe8a..5941341bf 100644 --- a/official/cv/OpenPose/export.py +++ b/official/cv/OpenPose/export.py @@ -15,15 +15,15 @@ """export""" import os import numpy as np +import mindspore from mindspore import Tensor -from mindspore import context from mindspore.train.serialization import load_checkpoint, load_param_into_net, export from src.openposenet import OpenPoseNet from src.model_utils.config import config from src.model_utils.moxing_adapter import moxing_wrapper -context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, device_id=config.device_id) +mindspore.set_context(mode=0, device_target=config.device_target, device_id=config.device_id) def modelarts_pre_process(): @@ -32,7 +32,7 @@ def modelarts_pre_process(): @moxing_wrapper(pre_process=None) def model_export(): - context.set_context(mode=context.GRAPH_MODE, save_graphs=False) + mindspore.set_context(mode=0, save_graphs=False) # define net net = OpenPoseNet() diff --git a/official/cv/OpenPose/modelarts/train_start.py b/official/cv/OpenPose/modelarts/train_start.py index 282d111aa..58bd855a6 100644 --- a/official/cv/OpenPose/modelarts/train_start.py +++ b/official/cv/OpenPose/modelarts/train_start.py @@ -17,11 +17,11 @@ import os import argparse import glob from ast import literal_eval as liter +import mindspore from mindspore import Tensor -from mindspore import context from mindspore import export from mindspore.common import set_seed -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.communication.management import init from mindspore.train import Model from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor @@ -103,7 +103,7 @@ if __name__ == "__main__": if args_opt.vgg_path: config.vgg_path = os.path.join("/cache/data/", args_opt.vgg_path) - context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", save_graphs=False) + mindspore.set_context(mode=0, device_target="Ascend", 
save_graphs=False) config.lr = liter(config.lr) config.outputs_dir = config.save_model_path @@ -111,7 +111,7 @@ if __name__ == "__main__": if device_num > 1: init() - context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, + mindspore.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True) config.rank = get_rank_id() config.outputs_dir = os.path.join(config.outputs_dir, "ckpt_{}/".format(config.rank)) diff --git a/official/cv/OpenPose/src/loss.py b/official/cv/OpenPose/src/loss.py index e10effb0c..8de7bab20 100644 --- a/official/cv/OpenPose/src/loss.py +++ b/official/cv/OpenPose/src/loss.py @@ -12,19 +12,19 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================ +import mindspore import mindspore.nn as nn import mindspore.common.dtype as mstype from mindspore.ops import operations as P from mindspore.nn.cell import Cell from mindspore.ops import functional as F from mindspore.ops import composite as C -from mindspore.context import ParallelMode, get_auto_parallel_context +from mindspore import ParallelMode, get_auto_parallel_context from mindspore.communication.management import get_group_size -from mindspore import context from mindspore.nn.wrap.grad_reducer import DistributedGradReducer from src.model_utils.config import config -context.set_context(mode=context.GRAPH_MODE) +mindspore.set_context(mode=0) time_stamp_init = False time_stamp_first = 0 grad_scale = C.MultitypeFuncGraph("grad_scale") diff --git a/official/cv/OpenPose/src/model_utils/moxing_adapter.py b/official/cv/OpenPose/src/model_utils/moxing_adapter.py index c2d228240..344dfc034 100644 --- a/official/cv/OpenPose/src/model_utils/moxing_adapter.py +++ b/official/cv/OpenPose/src/model_utils/moxing_adapter.py @@ -17,7 +17,7 @@ import os import functools -from mindspore import context +import mindspore from .config import config @@ -101,7 +101,7 @@ def moxing_wrapper(pre_process=None, post_process=None): sync_data(config.train_url, config.output_path) print('Workspace downloaded: ', os.listdir(config.output_path)) - context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) + mindspore.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) config.device_num = get_device_num() config.device_id = get_device_id() if not os.path.exists(config.output_path): diff --git a/official/cv/OpenPose/src/openposenet.py b/official/cv/OpenPose/src/openposenet.py index a3ea966af..396d37e48 100644 --- a/official/cv/OpenPose/src/openposenet.py +++ b/official/cv/OpenPose/src/openposenet.py @@ -12,13 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. 
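
The loss.py hunk above moves `ParallelMode` and `get_auto_parallel_context` to top-level `mindspore` imports. A short sketch of the distribution check that import style supports, assuming the default (non-distributed) context:

```python
import mindspore
from mindspore import ParallelMode

# get_auto_parallel_context returns the current parallel mode as a string,
# and ParallelMode members are string constants, so direct comparison works.
parallel_mode = mindspore.get_auto_parallel_context("parallel_mode")
is_distributed = parallel_mode != ParallelMode.STAND_ALONE
print(parallel_mode, is_distributed)   # "stand_alone" False by default
```
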
# ============================================================================ +import mindspore import mindspore.nn as nn from mindspore.nn import Conv2d, ReLU from mindspore.train.serialization import load_checkpoint, load_param_into_net from mindspore.ops import operations as P -from mindspore import context -context.set_context(mode=context.GRAPH_MODE) +mindspore.set_context(mode=0) time_stamp_init = False time_stamp_first = 0 diff --git a/official/cv/OpenPose/train.py b/official/cv/OpenPose/train.py index baa538968..6830a93b0 100644 --- a/official/cv/OpenPose/train.py +++ b/official/cv/OpenPose/train.py @@ -14,8 +14,9 @@ # ============================================================================ import os from ast import literal_eval as liter -from mindspore import context, set_seed -from mindspore.context import ParallelMode +import mindspore +from mindspore import set_seed +from mindspore import ParallelMode from mindspore.communication.management import init from mindspore.train import Model from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor @@ -31,7 +32,7 @@ from src.model_utils.device_adapter import get_rank_id, get_device_num set_seed(1) -context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", save_graphs=False) +mindspore.set_context(mode=0, device_target="Ascend", save_graphs=False) def modelarts_pre_process(): @@ -47,7 +48,7 @@ def train(): if device_num > 1: init() - context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, + mindspore.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True) config.rank = get_rank_id() config.outputs_dir = os.path.join(config.outputs_dir, "ckpt_{}/".format(config.rank)) diff --git a/official/cv/PVNet/eval.py b/official/cv/PVNet/eval.py index a9b2088f3..5c06ec08a 100644 --- a/official/cv/PVNet/eval.py +++ b/official/cv/PVNet/eval.py @@ -20,7 +20,6 @@ import numpy as np import mindspore import mindspore.dataset.transforms as C import mindspore.dataset.vision as V -from mindspore import context from model_utils.config import config as cfg from model_utils.data_file_utils import read_pickle, read_rgb_np @@ -38,7 +37,7 @@ def test(args): assert seg_dim == 2 # set graph mode and parallel mode - context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, device_id=args.rank) + mindspore.set_context(mode=0, device_target=args.device_target, device_id=args.rank) # load model parameters net = Resnet18_8s(ver_dim=args.vote_num * 2) diff --git a/official/cv/PVNet/export.py b/official/cv/PVNet/export.py index 6b2d40416..732b49a01 100644 --- a/official/cv/PVNet/export.py +++ b/official/cv/PVNet/export.py @@ -14,21 +14,21 @@ # ============================================================================ """export to mindir""" import numpy as np -import mindspore as ms -from mindspore import context, Tensor +import mindspore +from mindspore import Tensor from src.model_reposity import Resnet18_8s from model_utils.config import config as cfg -context.set_context(mode=context.GRAPH_MODE, device_target=cfg.device_target) +mindspore.set_context(mode=0, device_target=cfg.device_target) if cfg.device_target == "Ascend": - context.set_context(device_id=cfg.rank) + mindspore.set_context(device_id=cfg.rank) if __name__ == "__main__": net = Resnet18_8s(ver_dim=cfg.vote_num * 2) - param_dict = ms.load_checkpoint(cfg.ckpt_file) - ms.load_param_into_net(net, param_dict) + param_dict = 
mindspore.load_checkpoint(cfg.ckpt_file) + mindspore.load_param_into_net(net, param_dict) net.set_train(False) - input_data = Tensor(np.zeros([1, 3, cfg.img_height, cfg.img_width]), ms.float32) - ms.export(net, input_data, file_name=cfg.file_name, file_format=cfg.file_format) + input_data = Tensor(np.zeros([1, 3, cfg.img_height, cfg.img_width]), mindspore.float32) + mindspore.export(net, input_data, file_name=cfg.file_name, file_format=cfg.file_format) diff --git a/official/cv/PVNet/modelarts/start_train.py b/official/cv/PVNet/modelarts/start_train.py index 1d18a9b00..77e43e9d1 100644 --- a/official/cv/PVNet/modelarts/start_train.py +++ b/official/cv/PVNet/modelarts/start_train.py @@ -22,7 +22,6 @@ import numpy as np import mindspore -import mindspore.context as context from mindspore import Tensor from mindspore import nn from mindspore.communication import get_rank, init, get_group_size @@ -56,7 +55,7 @@ def export_AIR(args_opt): print("checkpoint path", ckpt_model) # if args.device_target == "Ascend": - # context.set_context(device_id=args.rank) + # mindspore.set_context(device_id=args.rank) net = Resnet18_8s(ver_dim=args.vote_num * 2) param_dict = mindspore.load_checkpoint(ckpt_model) mindspore.load_param_into_net(net, param_dict) @@ -172,7 +171,7 @@ class Train: def network_init(argvs): """ init distribute training """ - context.set_context(mode=context.GRAPH_MODE, + mindspore.set_context(mode=0, device_target=argvs.device_target, save_graphs=False, device_id=int(os.getenv('DEVICE_ID', '0')), @@ -182,9 +181,9 @@ def network_init(argvs): init() argvs.rank = get_rank() argvs.group_size = get_group_size() - context.reset_auto_parallel_context() - parallel_mode = context.ParallelMode.DATA_PARALLEL - context.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=argvs.group_size) + mindspore.reset_auto_parallel_context() + parallel_mode = mindspore.ParallelMode.DATA_PARALLEL + mindspore.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=argvs.group_size) def parse_args(): diff --git a/official/cv/PVNet/src/loss_scale.py b/official/cv/PVNet/src/loss_scale.py index 3e5dfcfe3..b6bba9227 100644 --- a/official/cv/PVNet/src/loss_scale.py +++ b/official/cv/PVNet/src/loss_scale.py @@ -13,8 +13,8 @@ # limitations under the License. 
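
`network_init` in the PVNet hunks above condenses to the following pattern once every context call hangs off the top-level module. A hedged sketch, not a drop-in replacement: it requires an actual distributed launch (mpirun or a rank table), and the device target is illustrative:

```python
import mindspore
from mindspore.communication import init, get_rank, get_group_size

mindspore.set_context(mode=0, device_target="Ascend",  # 0 == GRAPH_MODE
                      save_graphs=False)
init()                                  # needs a launched distributed job
mindspore.reset_auto_parallel_context()
mindspore.set_auto_parallel_context(
    parallel_mode=mindspore.ParallelMode.DATA_PARALLEL,
    gradients_mean=True,
    device_num=get_group_size(),
)
print("rank", get_rank(), "of", get_group_size())
```
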
# ============================================================================ """dynamic loss scale """ -import mindspore.context as context -from mindspore.context import ParallelMode +import mindspore +from mindspore import ParallelMode from mindspore import nn from mindspore.nn import Cell from mindspore import Tensor, RowTensor @@ -248,7 +248,7 @@ class TrainOneStepWithLossScaleCell(nn.TrainOneStepWithLossScaleCell): self.less_equal = P.LessEqual() self.allreduce = P.AllReduce() self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE) - self.gpu_target = (context.get_context("device_target") == "GPU") + self.gpu_target = (mindspore.get_context("device_target") == "GPU") self.loss_scaling_manager = None self.clip_gradients = ClipGradients() if isinstance(scale_sense, Cell): diff --git a/official/cv/PVNet/train.py b/official/cv/PVNet/train.py index 65d0d7c7e..c23fc031d 100644 --- a/official/cv/PVNet/train.py +++ b/official/cv/PVNet/train.py @@ -17,7 +17,6 @@ import os import time import mindspore -import mindspore.context as context from mindspore import Tensor from mindspore import nn from mindspore.communication import get_rank, init, get_group_size @@ -140,7 +139,7 @@ class Train: def network_init(argvs): """ init distribute training """ - context.set_context(mode=context.GRAPH_MODE, + mindspore.set_context(mode=0, device_target=argvs.device_target, save_graphs=False, device_id=int(os.getenv('DEVICE_ID', '0')), @@ -150,9 +149,9 @@ def network_init(argvs): init() argvs.rank = get_rank() argvs.group_size = get_group_size() - context.reset_auto_parallel_context() - parallel_mode = context.ParallelMode.DATA_PARALLEL - context.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=argvs.group_size) + mindspore.reset_auto_parallel_context() + parallel_mode = mindspore.ParallelMode.DATA_PARALLEL + mindspore.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=argvs.group_size) if __name__ == '__main__': diff --git a/official/cv/PointNet/eval.py b/official/cv/PointNet/eval.py index 67eb612be..358d9b086 100644 --- a/official/cv/PointNet/eval.py +++ b/official/cv/PointNet/eval.py @@ -20,7 +20,7 @@ import random import math import numpy as np import mindspore -from mindspore import load_checkpoint, load_param_into_net, context +from mindspore import load_checkpoint, load_param_into_net import mindspore.dataset as ds import mindspore.ops as ops from src.dataset import ShapeNetDataset @@ -100,18 +100,18 @@ if __name__ == "__main__": local_data_url = './cache/data' local_train_url = './cache/ckpt' device_target = args.device_target - context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target) - context.set_context(save_graphs=False) + mindspore.set_context(mode=0, device_target=args.device_target) + mindspore.set_context(save_graphs=False) if device_target == "Ascend": - context.set_context(device_id=device_id) + mindspore.set_context(device_id=device_id) else: raise ValueError("Unsupported platform.") import moxing as mox mox.file.copy_parallel(src_url=args.data_url, dst_url=local_data_url) else: - context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target) - context.set_context(save_graphs=False) + mindspore.set_context(mode=0, device_target=args.device_target) + mindspore.set_context(save_graphs=False) if not os.path.exists(local_train_url): os.makedirs(local_train_url) diff --git a/official/cv/PointNet/preprocess.py b/official/cv/PointNet/preprocess.py index 
a689e2171..be42b4b7e 100644 --- a/official/cv/PointNet/preprocess.py +++ b/official/cv/PointNet/preprocess.py @@ -15,7 +15,7 @@ """pre process for 310 inference""" import os import argparse -from mindspore import context +import mindspore import mindspore.dataset as ds import numpy as np from src.dataset import ShapeNetDataset @@ -31,7 +31,7 @@ parser.add_argument( '--batchSize', type=int, default=1, help='input batch size') args = parser.parse_args() -context.set_context(mode=context.PYNATIVE_MODE, device_target="Ascend", device_id=args.device_id) +mindspore.set_context(mode=1, device_target="Ascend", device_id=args.device_id) if __name__ == '__main__': dataset_generator = ShapeNetDataset( root=args.dataset_path, diff --git a/official/cv/PointNet/src/export.py b/official/cv/PointNet/src/export.py index cd4da5c6e..5a3c39662 100644 --- a/official/cv/PointNet/src/export.py +++ b/official/cv/PointNet/src/export.py @@ -16,7 +16,8 @@ import os import argparse import numpy as np -from mindspore import Tensor, export, load_checkpoint, context +import mindspore +from mindspore import Tensor, export, load_checkpoint from src.network import PointNetDenseCls parser = argparse.ArgumentParser(description='MindSpore Pointnet Segmentation') parser.add_argument( @@ -28,7 +29,7 @@ parser.add_argument('--file_format', type=str, default='MINDIR', help="export fi parser.add_argument('--feature_transform', action='store_true', help="use feature transform") args = parser.parse_args() -context.set_context(mode=context.PYNATIVE_MODE, device_target=args.device_target) +mindspore.set_context(mode=1, device_target=args.device_target) num_classes = 4 classifier = PointNetDenseCls(k=num_classes, feature_transform=args.feature_transform) if not os.path.exists('./mindir'): diff --git a/official/cv/PointNet/src/preprocess.py b/official/cv/PointNet/src/preprocess.py index d7c0ffd30..ab2ca7222 100644 --- a/official/cv/PointNet/src/preprocess.py +++ b/official/cv/PointNet/src/preprocess.py @@ -15,7 +15,7 @@ """pre process for 310 inference""" import os import argparse -from mindspore import context +import mindspore import mindspore.dataset as ds import numpy as np from src.dataset import ShapeNetDataset @@ -32,7 +32,7 @@ parser.add_argument( '--batchSize', type=int, default=1, help='input batch size') args = parser.parse_args() -context.set_context(mode=context.PYNATIVE_MODE, device_target="Ascend", device_id=args.device_id) +mindspore.set_context(mode=1, device_target="Ascend", device_id=args.device_id) if __name__ == '__main__': dataset_generator = ShapeNetDataset( root=args.dataset_path, diff --git a/official/cv/PointNet/train.py b/official/cv/PointNet/train.py index 35472fbe1..9420ab993 100644 --- a/official/cv/PointNet/train.py +++ b/official/cv/PointNet/train.py @@ -21,8 +21,7 @@ import math import numpy as np import mindspore import mindspore.nn as nn -from mindspore import context -from mindspore.context import ParallelMode +from mindspore import ParallelMode import mindspore.dataset as ds from mindspore import save_checkpoint import mindspore.ops as ops @@ -128,14 +127,14 @@ if __name__ == "__main__": device_target = args.device_target num_shards = int(os.getenv("RANK_SIZE")) shard_id = int(os.getenv("DEVICE_ID")) - context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target) - context.set_context(save_graphs=False) + mindspore.set_context(mode=0, device_target=args.device_target) + mindspore.set_context(save_graphs=False) if device_target == "Ascend": - context.set_context(device_id=device_id) + 
diff --git a/official/cv/PointNet2/eval.py b/official/cv/PointNet2/eval.py
index e70bc4897..efaffbf5d 100644
--- a/official/cv/PointNet2/eval.py
+++ b/official/cv/PointNet2/eval.py
@@ -19,8 +19,8 @@ import ast
 import os
 import time
+import mindspore
 import mindspore.dataset as ds
-from mindspore import context
 from mindspore.nn.metrics import Accuracy
 from mindspore.train import Model
 from mindspore.train.serialization import load_checkpoint, load_param_into_net
@@ -70,10 +70,10 @@ def run_eval():
     pretrained_ckpt_path = args.pretrained_ckpt
     if args.platform == "Ascend":
-        context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=device_id)
-        context.set_context(max_call_depth=2048)
+        mindspore.set_context(mode=0, device_target="Ascend", device_id=device_id)
+        mindspore.set_context(max_call_depth=2048)
     else:
-        context.set_context(mode=context.GRAPH_MODE, device_target="GPU", max_call_depth=2000, device_id=device_id)
+        mindspore.set_context(mode=0, device_target="GPU", max_call_depth=2000, device_id=device_id)
     print(args)
diff --git a/official/cv/PointNet2/export.py b/official/cv/PointNet2/export.py
index b1f9d7166..510072c33 100644
--- a/official/cv/PointNet2/export.py
+++ b/official/cv/PointNet2/export.py
@@ -21,8 +21,9 @@ import ast
 import os
 import numpy as np
+import mindspore
 import mindspore.common.dtype as mstype
-from mindspore import Tensor, load_checkpoint, load_param_into_net, export, context
+from mindspore import Tensor, load_checkpoint, load_param_into_net, export
 from src.pointnet2 import PointNet2
@@ -38,9 +39,9 @@ parser.add_argument('--num_category', default=40, type=int, choices=[10, 40], he
 parser.add_argument('--use_normals', action='store_true', default=False, help='use normals')  # channels = 6 if true
 args = parser.parse_args()
-context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
-context.set_context(device_id=int(os.getenv('DEVICE_ID', '0')))
-context.set_context(max_call_depth=2048)
+mindspore.set_context(mode=0, device_target="Ascend")
+mindspore.set_context(device_id=int(os.getenv('DEVICE_ID', '0')))
+mindspore.set_context(max_call_depth=2048)
 if args.enable_modelarts:
     import moxing as mox
diff --git a/official/cv/PointNet2/src/pointnet2_utils.py b/official/cv/PointNet2/src/pointnet2_utils.py
index 828a00abf..6fdddc060 100644
--- a/official/cv/PointNet2/src/pointnet2_utils.py
+++ b/official/cv/PointNet2/src/pointnet2_utils.py
@@ -15,7 +15,7 @@
 """network definition utils"""
 import numpy as np
-import mindspore as ms
+import mindspore
 import mindspore.nn as nn
 import mindspore.numpy as mnp
 import mindspore.ops as P
@@ -28,14 +28,14 @@ from src.layers import Conv2d
 @constexpr
 def generate_tensor_fps(B, N):
     """generate tensor"""
-    farthest = Tensor(np.random.randint(N, size=(B,)), ms.int32)
+    farthest = Tensor(np.random.randint(N, size=(B,)), mindspore.int32)
     return farthest
 
 @constexpr
 def generate_tensor_batch_indices(B):
     """generate tensor"""
-    return Tensor(np.arange(B), ms.int32)
+    return Tensor(np.arange(B), mindspore.int32)
 
 def square_distance(src, dst):
@@ -91,16 +91,16 @@ def farthest_point_sample(xyz, npoint):
         centroids: sampled pointcloud index, [B, npoint]
     """
     B, N, _ = xyz.shape
-    centroids = mnp.zeros((npoint, B), ms.int32)
-    distance = mnp.ones((B, N), ms.int32) * 1e9
+    centroids = mnp.zeros((npoint, B), mindspore.int32)
+    distance = mnp.ones((B, N), mindspore.int32) * 1e9
     farthest = generate_tensor_fps(B, N)
     batch_indices = generate_tensor_batch_indices(B)
     for i in range(npoint):
-        centroids = P.Cast()(centroids, ms.float32)
-        farthest = P.Cast()(farthest, ms.float32)
+        centroids = P.Cast()(centroids, mindspore.float32)
+        farthest = P.Cast()(farthest, mindspore.float32)
         centroids[i] = farthest
-        centroids = P.Cast()(centroids, ms.int32)
-        farthest = P.Cast()(farthest, ms.int32)
+        centroids = P.Cast()(centroids, mindspore.int32)
+        farthest = P.Cast()(farthest, mindspore.int32)
         index = P.Concat(-1)((batch_indices.reshape(batch_indices.shape + (1,)),
                               farthest.reshape(farthest.shape + (1,))))
         centroid = P.GatherNd()(xyz, index).reshape((B, 1, 3))
@@ -122,15 +122,15 @@ def query_ball_point(radius, nsample, xyz, new_xyz):
     """
     B, N, _ = xyz.shape
     _, S, _ = new_xyz.shape
-    group_idx = mnp.arange(0, N, 1, ms.int32).view(1, 1, N)
+    group_idx = mnp.arange(0, N, 1, mindspore.int32).view(1, 1, N)
     group_idx = P.Tile()(group_idx, (B, S, 1))
     sqrdists = square_distance(new_xyz, xyz)
     idx = sqrdists > radius ** 2
     group_idx = P.Select()(idx, -1 * P.OnesLike()(group_idx), group_idx)
-    group_idx = P.Cast()(group_idx, ms.float32)
+    group_idx = P.Cast()(group_idx, mindspore.float32)
     group_idx, _ = P.TopK()(group_idx, nsample)
-    group_idx = P.Cast()(group_idx, ms.int32)
+    group_idx = P.Cast()(group_idx, mindspore.int32)
     group_first = group_idx[:, :, 0].view(B, S, 1)
     group_first = P.Tile()(group_first, (1, 1, nsample))  # [B, S, nsample]
diff --git a/official/cv/PointNet2/train.py b/official/cv/PointNet2/train.py
index 3698219fe..e998e45f6 100644
--- a/official/cv/PointNet2/train.py
+++ b/official/cv/PointNet2/train.py
@@ -23,8 +23,8 @@ import time
 import mindspore
 import mindspore.dataset as ds
 import mindspore.nn as nn
-from mindspore import Model, Tensor, context, load_checkpoint, load_param_into_net
-from mindspore.context import ParallelMode
+from mindspore import Model, Tensor, load_checkpoint, load_param_into_net
+from mindspore import ParallelMode
 from mindspore.profiler import Profiler
 from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor
 from mindspore.communication.management import init, get_rank
@@ -76,29 +76,29 @@ def content_init(args, device_id, device_num):
         raise ValueError("Unsupported platform {}".format(args.platform))
     if _platform == "ascend":
-        context.set_context(mode=context.GRAPH_MODE,
+        mindspore.set_context(mode=0,
                             device_target="Ascend",
                             device_id=device_id)
-        context.set_context(max_call_depth=2048)
+        mindspore.set_context(max_call_depth=2048)
         if device_num > 1:
             init()
-            context.set_auto_parallel_context(
+            mindspore.set_auto_parallel_context(
                 parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True)
     else:
-        context.set_context(mode=context.GRAPH_MODE,
+        mindspore.set_context(mode=0,
                             device_target="GPU",
                             max_call_depth=2048)
         if device_num > 1:
             mindspore.dataset.config.set_enable_shared_mem(False)
-            context.set_auto_parallel_context(
-                parallel_mode=context.ParallelMode.DATA_PARALLEL,
+            mindspore.set_auto_parallel_context(
+                parallel_mode=mindspore.ParallelMode.DATA_PARALLEL,
                 gradients_mean=True,
                 device_num=device_num)
             mindspore.common.set_seed(1234)
             init()
     else:
-        context.set_context(device_id=device_id)
+        mindspore.set_context(device_id=device_id)
 
 def get_data_url(args, rank_id=0):
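For the `ParallelMode` changes above, a short sketch of the import move and the data-parallel setup these training scripts share, assuming MindSpore 2.0 where the enum is re-exported at the package top level (the device count below is a hypothetical value):

```python
# Old location, removed by this patch:
#   from mindspore.context import ParallelMode
# New location:
from mindspore import ParallelMode
import mindspore

# Typical data-parallel configuration as used in the scripts above.
mindspore.set_auto_parallel_context(
    device_num=8,  # hypothetical device count
    parallel_mode=ParallelMode.DATA_PARALLEL,
    gradients_mean=True,
)
```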
diff --git a/official/cv/ResNet/README.md b/official/cv/ResNet/README.md
index a35b0b9e6..c3f919be5 100644
--- a/official/cv/ResNet/README.md
+++ b/official/cv/ResNet/README.md
@@ -1482,7 +1482,7 @@ Refer to the [ModelZoo FAQ](https://gitee.com/mindspore/models#FAQ) for some com
 
 - **Q: How to solve the memory shortage caused by accumulation operators such as ReduceMean and BiasAddGrad on 910B?**
 
-  **A**: Suggested adding `ms.set_context(ascend_config={"atomic_clean_policy": 0})` in `train.py`. If the problem still hasn't been resolved, please go to the [MindSpore community](https://gitee.com/mindspore/mindspore/issues) to submit an issue.
+  **A**: It is suggested to add `mindspore.set_context(ascend_config={"atomic_clean_policy": 0})` in `train.py`. If the problem still hasn't been resolved, please go to the [MindSpore community](https://gitee.com/mindspore/mindspore/issues) to submit an issue.
 
 - **Q: How to solve the problem of `out of memory`?**
 
diff --git a/official/cv/ResNet/README_CN.md b/official/cv/ResNet/README_CN.md
index 0de3bba76..dc59a8b82 100644
--- a/official/cv/ResNet/README_CN.md
+++ b/official/cv/ResNet/README_CN.md
@@ -1424,7 +1424,7 @@ result:{'top_1_accuracy': 0.928385416666666} prune_rate=0.45 ckpt=~/resnet50_cif
 
 - **Q: 如何解决910B硬件上因ReduceMean、BiasAddGrad等累加算子导致的内存不足?**
 
-  **A**: 建议在`train.py`中添加`ms.set_context(ascend_config={"atomic_clean_policy": 0})`,如果还是没有解决问题,请到[MindSpore社区](https://gitee.com/mindspore/mindspore/issues)提issue。
+  **A**: 建议在`train.py`中添加`mindspore.set_context(ascend_config={"atomic_clean_policy": 0})`,如果还是没有解决问题,请到[MindSpore社区](https://gitee.com/mindspore/mindspore/issues)提issue。
 
 - **Q: 遇到`out of memory`如何解决?**
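The FAQ entries above recommend one specific context knob; a minimal sketch of where it would sit in a training script, assuming an Ascend 910B target (the `ascend_config` argument only applies on Ascend back ends, so this snippet presumes that hardware):

```python
import mindspore

# Keep atomic-clean memory from being reclaimed eagerly, which the FAQ
# above suggests for ReduceMean/BiasAddGrad memory shortages on 910B.
mindspore.set_context(device_target="Ascend",
                      ascend_config={"atomic_clean_policy": 0})
```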
diff --git a/official/cv/ResNet/eval.py b/official/cv/ResNet/eval.py
index 239ae7d86..0b65eb324 100644
--- a/official/cv/ResNet/eval.py
+++ b/official/cv/ResNet/eval.py
@@ -14,7 +14,7 @@
 # ============================================================================
 """eval resnet."""
 import os
-import mindspore as ms
+import mindspore
 from mindspore import Tensor
 from mindspore.nn.optim import Momentum
 from mindspore.nn.loss import SoftmaxCrossEntropyWithLogits
@@ -22,7 +22,7 @@ from src.CrossEntropySmooth import CrossEntropySmooth
 from src.model_utils.config import config
 from src.model_utils.moxing_adapter import moxing_wrapper
 
-ms.set_seed(1)
+mindspore.set_seed(1)
 
 if config.net_name in ("resnet18", "resnet34", "resnet50", "resnet152"):
     if config.net_name == "resnet18":
@@ -66,10 +66,10 @@ def eval_net():
     """eval net"""
     target = config.device_target
     # init context
-    ms.set_context(mode=ms.GRAPH_MODE, device_target=target, save_graphs=False)
+    mindspore.set_context(mode=0, device_target=target, save_graphs=False)
     if target == "Ascend":
         device_id = int(os.getenv('DEVICE_ID', '0'))
-        ms.set_context(device_id=device_id)
+        mindspore.set_context(device_id=device_id)
 
     # create dataset
     dataset = create_dataset(dataset_path=config.data_path, do_train=False, batch_size=config.batch_size,
@@ -80,8 +80,8 @@ def eval_net():
     net = resnet(class_num=config.class_num)
 
     # load checkpoint
-    param_dict = ms.load_checkpoint(config.checkpoint_file_path)
-    ms.load_param_into_net(net, param_dict)
+    param_dict = mindspore.load_checkpoint(config.checkpoint_file_path)
+    mindspore.load_param_into_net(net, param_dict)
     net.set_train(False)
 
     # define loss, model
@@ -100,7 +100,7 @@ def eval_net():
         opt = Momentum(group_params, Tensor(0.0), config.momentum, loss_scale=config.loss_scale)
 
     # define model, add boostmode for eval scenarios with train.py
-    model = ms.Model(net, loss_fn=loss, boost_level=config.boost_mode,
+    model = mindspore.Model(net, loss_fn=loss, boost_level=config.boost_mode,
                      optimizer=opt, metrics={'top_1_accuracy', 'top_5_accuracy'})
 
     # eval model
diff --git a/official/cv/ResNet/export.py b/official/cv/ResNet/export.py
index 106992af2..2fe07032d 100644
--- a/official/cv/ResNet/export.py
+++ b/official/cv/ResNet/export.py
@@ -18,13 +18,13 @@ python export.py
 """
 import os
-import mindspore as ms
+import mindspore
 from src.model_utils.config import config
 from src.model_utils.moxing_adapter import moxing_wrapper
 
-ms.set_context(mode=ms.GRAPH_MODE, device_target=config.device_target)
+mindspore.set_context(mode=0, device_target=config.device_target)
 if config.device_target != "GPU":
-    ms.set_context(device_id=config.device_id)
+    mindspore.set_context(device_id=config.device_id)
 
 def modelarts_pre_process():
@@ -54,10 +54,10 @@ def run_export():
     assert config.checkpoint_file_path is not None, "checkpoint_path is None."
 
-    param_dict = ms.load_checkpoint(config.checkpoint_file_path)
-    ms.load_param_into_net(net, param_dict)
-    input_arr = ms.numpy.zeros([config.batch_size, 3, config.height, config.width], ms.float32)
-    ms.export(net, input_arr, file_name=config.file_name, file_format=config.file_format)
+    param_dict = mindspore.load_checkpoint(config.checkpoint_file_path)
+    mindspore.load_param_into_net(net, param_dict)
+    input_arr = mindspore.numpy.zeros([config.batch_size, 3, config.height, config.width], mindspore.float32)
+    mindspore.export(net, input_arr, file_name=config.file_name, file_format=config.file_format)
 
 if __name__ == '__main__':
     run_export()
diff --git a/official/cv/ResNet/fine_tune.py b/official/cv/ResNet/fine_tune.py
index a43f2fe79..0799aa8a7 100644
--- a/official/cv/ResNet/fine_tune.py
+++ b/official/cv/ResNet/fine_tune.py
@@ -15,7 +15,7 @@
 """train resnet34."""
 import os
 
-import mindspore as ms
+import mindspore
 import mindspore.nn as nn
 from mindspore.train.model import Model
@@ -29,7 +29,7 @@ from src.util import eval_callback, set_output_dir
 from src.logger import get_logger
 
-ms.set_context(mode=ms.GRAPH_MODE, device_target=config.device_target, save_graphs=False)
+mindspore.set_context(mode=0, device_target=config.device_target, save_graphs=False)
 
 def import_data():
@@ -77,7 +77,7 @@ def init_weight(net, param_dict):
         if config.filter_weight:
             filter_list = [x.name for x in net.end_point.get_parameters()]
             filter_checkpoint_parameter_by_list(param_dict, filter_list)
-        ms.load_param_into_net(net, param_dict)
+        mindspore.load_param_into_net(net, param_dict)
 
 def eval_net(net, dataset):
@@ -87,7 +87,7 @@ def eval_net(net, dataset):
     loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
 
     # define model
-    model = ms.Model(net, loss_fn=loss, metrics={'top_1_accuracy', 'top_5_accuracy'})
+    model = mindspore.Model(net, loss_fn=loss, metrics={'top_1_accuracy', 'top_5_accuracy'})
 
     # eval model
     res = model.eval(dataset)
@@ -99,7 +99,7 @@ def finetune_train():
     config.logger = get_logger(config.log_dir, 0)
 
     dataset_train, data_val = import_data()
-    ckpt_param_dict = ms.load_checkpoint(config.checkpoint_path)
+    ckpt_param_dict = mindspore.load_checkpoint(config.checkpoint_path)
     net = resnet34(class_num=1001)
     init_weight(net=net, param_dict=ckpt_param_dict)
     config.logger.info("net parameter:")
@@ -133,7 +133,7 @@ def finetune_train():
     # define callbacks
     step_size = dataset_train.get_dataset_size()
     time_cb = TimeMonitor(data_size=step_size)
-    lr = ms.Tensor([config.learning_rate] * step_size * config.epoch_size)
+    lr = mindspore.Tensor([config.learning_rate] * step_size * config.epoch_size)
     loss_cb = LossCallBack(config.epoch_size, config.logger, lr, per_print_time=10)
     cb = [time_cb, loss_cb]
diff --git a/official/cv/ResNet/golden_stick/ghost/eval.py b/official/cv/ResNet/golden_stick/ghost/eval.py
index 916f37e18..c579ee434 100644
--- a/official/cv/ResNet/golden_stick/ghost/eval.py
+++ b/official/cv/ResNet/golden_stick/ghost/eval.py
@@ -15,7 +15,7 @@
 """eval resnet."""
 import os
 import numpy as np
-import mindspore as ms
+import mindspore
 from mindspore.nn.loss import SoftmaxCrossEntropyWithLogits
 from mindspore_gs import GhostAlgo
 from src.CrossEntropySmooth import CrossEntropySmooth
@@ -27,7 +27,7 @@ if config.dataset == "cifar10":
 else:
     from src.dataset import create_dataset2 as create_dataset
 
-ms.set_seed(1)
+mindspore.set_seed(1)
 
 def eval_net():
@@ -35,10 +35,10 @@ def eval_net():
     target = config.device_target
 
     # init context
-    ms.set_context(mode=ms.GRAPH_MODE, device_target=target, save_graphs=False)
+    mindspore.set_context(mode=0, device_target=target, save_graphs=False)
     if target == "Ascend":
         device_id = int(os.getenv('DEVICE_ID'))
-        ms.set_context(device_id=device_id)
+        mindspore.set_context(device_id=device_id)
 
     # create dataset
     dataset = create_dataset(dataset_path=config.data_path, do_train=False, batch_size=config.batch_size,
@@ -48,9 +48,9 @@ def eval_net():
     net = resnet(class_num=config.class_num)
     algo = GhostAlgo({})
     net = algo.apply(net)
-    param_dict = ms.load_checkpoint(config.checkpoint_file_path)
+    param_dict = mindspore.load_checkpoint(config.checkpoint_file_path)
     # load checkpoint
-    ms.load_param_into_net(net, param_dict)
+    mindspore.load_param_into_net(net, param_dict)
     net.set_train(False)
 
     # define loss, model
@@ -64,7 +64,7 @@ def eval_net():
         loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
 
     # define model
-    model = ms.Model(net, loss_fn=loss, metrics={'top_1_accuracy'})
+    model = mindspore.Model(net, loss_fn=loss, metrics={'top_1_accuracy'})
 
     # eval model
     res = model.eval(dataset)
diff --git a/official/cv/ResNet/golden_stick/ghost/train.py b/official/cv/ResNet/golden_stick/ghost/train.py
index b07362708..dcde2bfb0 100644
--- a/official/cv/ResNet/golden_stick/ghost/train.py
+++ b/official/cv/ResNet/golden_stick/ghost/train.py
@@ -15,7 +15,7 @@
 """train resnet."""
 import os
 
-import mindspore as ms
+import mindspore
 import mindspore.nn as nn
 import mindspore.common.initializer as weight_init
 from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor
@@ -33,7 +33,7 @@ if config.dataset == "cifar10":
 else:
     from src.dataset import create_dataset2 as create_dataset
 
-ms.set_seed(1)
+mindspore.set_seed(1)
 
 def filter_checkpoint_parameter_by_list(origin_dict, param_filter):
@@ -54,27 +54,27 @@ def set_parameter():
 
     # init context
     if config.mode_name == "GRAPH":
-        ms.set_context(mode=ms.GRAPH_MODE, device_target=target, save_graphs=False)
+        mindspore.set_context(mode=0, device_target=target, save_graphs=False)
     else:
-        ms.set_context(mode=ms.PYNATIVE_MODE, device_target=target, save_graphs=False)
+        mindspore.set_context(mode=1, device_target=target, save_graphs=False)
     if config.run_distribute:
         if target == "Ascend":
             device_id = int(os.getenv('DEVICE_ID'))
-            ms.set_context(device_id=device_id)
-            ms.set_auto_parallel_context(device_num=config.device_num, parallel_mode=ms.ParallelMode.DATA_PARALLEL,
+            mindspore.set_context(device_id=device_id)
+            mindspore.set_auto_parallel_context(device_num=config.device_num, parallel_mode=mindspore.ParallelMode.DATA_PARALLEL,
                                          gradients_mean=True)
             set_algo_parameters(elementwise_op_strategy_follow=True)
             if config.boost_mode not in ["O1", "O2"]:
-                ms.set_auto_parallel_context(all_reduce_fusion_config=config.all_reduce_fusion_config)
+                mindspore.set_auto_parallel_context(all_reduce_fusion_config=config.all_reduce_fusion_config)
             init()
         else:
             # GPU target
             init()
-            ms.set_auto_parallel_context(device_num=config.device_num,
-                                         parallel_mode=ms.ParallelMode.DATA_PARALLEL,
+            mindspore.set_auto_parallel_context(device_num=config.device_num,
+                                                parallel_mode=mindspore.ParallelMode.DATA_PARALLEL,
                                          gradients_mean=True)
-            ms.set_auto_parallel_context(all_reduce_fusion_config=config.all_reduce_fusion_config)
+            mindspore.set_auto_parallel_context(all_reduce_fusion_config=config.all_reduce_fusion_config)
 
 def init_weight(net, param_dict):
@@ -90,7 +90,7 @@ def init_weight(net, param_dict):
         if config.filter_weight:
             filter_list = [x.name for x in net.end_point.get_parameters()]
             filter_checkpoint_parameter_by_list(param_dict, filter_list)
-        ms.load_param_into_net(net, param_dict)
+        mindspore.load_param_into_net(net, param_dict)
     else:
         for _, cell in net.cells_and_names():
             if isinstance(cell, nn.Conv2d):
@@ -106,11 +106,11 @@ def init_weight(net, param_dict):
 def load_fp32_ckpt(net):
     if config.fp32_ckpt:
         if os.path.isfile(config.fp32_ckpt):
-            ckpt = ms.load_checkpoint(config.fp32_ckpt)
+            ckpt = mindspore.load_checkpoint(config.fp32_ckpt)
             if config.filter_weight:
                 filter_list = [x.name for x in net.end_point.get_parameters()]
                 filter_checkpoint_parameter_by_list(ckpt, filter_list)
-            ms.load_param_into_net(net, ckpt)
+            mindspore.load_param_into_net(net, ckpt)
         else:
             print(f"Invalid fp32_ckpt {config.fp32_ckpt} parameter.")
@@ -118,7 +118,7 @@ def load_fp32_ckpt(net):
 def load_pretrained_ckpt(net):
     if config.pre_trained:
         if os.path.isfile(config.pre_trained):
-            ckpt = ms.load_checkpoint(config.pre_trained)
+            ckpt = mindspore.load_checkpoint(config.pre_trained)
             if ckpt.get("epoch_num") and ckpt.get("step_num"):
                 config.has_trained_epoch = int(ckpt["epoch_num"].data.asnumpy())
                 config.has_trained_step = int(ckpt["step_num"].data.asnumpy())
@@ -134,7 +134,7 @@ def load_pretrained_ckpt(net):
             if config.filter_weight:
                 filter_list = [x.name for x in net.end_point.get_parameters()]
                 filter_checkpoint_parameter_by_list(ckpt, filter_list)
-            not_load_param, _ = ms.load_param_into_net(net, ckpt)
+            not_load_param, _ = mindspore.load_param_into_net(net, ckpt)
             if not_load_param:
                 raise RuntimeError("Load param into net fail.")
         else:
@@ -207,7 +207,7 @@ def train_net():
                        loss_scale=config.loss_scale
                        )
     kf_loss_fn = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
-    loss_scale = ms.FixedLossScaleManager(config.loss_scale, drop_overflow_update=False)
+    loss_scale = mindspore.FixedLossScaleManager(config.loss_scale, drop_overflow_update=False)
     ckpt_save_dir = set_save_ckpt_dir()
     config_ck = CheckpointConfig(save_checkpoint_steps=5 * step_size,
                                  keep_checkpoint_max=config.keep_checkpoint_max)
@@ -217,7 +217,7 @@ def train_net():
     loss_cb = LossMonitor()
     metrics = {"acc"}
     cb = [loss_cb, time_cb, ckpt_cb]
-    model = ms.Model(net, loss_fn=kf_loss_fn, optimizer=optimizer, loss_scale_manager=loss_scale, metrics=metrics,
+    model = mindspore.Model(net, loss_fn=kf_loss_fn, optimizer=optimizer, loss_scale_manager=loss_scale, metrics=metrics,
                      boost_level=config.boost_mode,
                      boost_config_dict={"grad_freeze": {"total_steps": config.epoch_size * step_size}})
     model.train(config.epoch_size, dataset, callbacks=cb, dataset_sink_mode=True)
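The `load_checkpoint`/`load_param_into_net` pattern repeated in the files above, condensed into one self-contained sketch; the `resume.ckpt` path, the tiny stand-in network, and the `end_point` prefix are hypothetical placeholders for the config values these scripts use:

```python
import mindspore
import mindspore.nn as nn

net = nn.Dense(16, 10)  # stand-in network; the real scripts build a ResNet

param_dict = mindspore.load_checkpoint("resume.ckpt")  # hypothetical path

# Mirror filter_checkpoint_parameter_by_list: drop head weights so a
# differently-sized classifier can be retrained from the same backbone.
filtered = {k: v for k, v in param_dict.items() if not k.startswith("end_point")}

# As in the scripts above, the first return value lists parameters that
# could not be loaded; the training scripts raise when it is non-empty.
not_load_param, _ = mindspore.load_param_into_net(net, filtered)
if not_load_param:
    raise RuntimeError("Load param into net fail.")
```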
diff --git a/official/cv/ResNet/golden_stick/pruner/scop/eval.py b/official/cv/ResNet/golden_stick/pruner/scop/eval.py
index cb866278e..ed5d545e8 100644
--- a/official/cv/ResNet/golden_stick/pruner/scop/eval.py
+++ b/official/cv/ResNet/golden_stick/pruner/scop/eval.py
@@ -15,7 +15,7 @@
 """eval resnet."""
 import os
 import numpy as np
-import mindspore as ms
+import mindspore
 from mindspore.nn.loss import SoftmaxCrossEntropyWithLogits
 from mindspore_gs import PrunerKfCompressAlgo, PrunerFtCompressAlgo
 from mindspore_gs.pruner.scop.scop_pruner import KfConv2d, MaskedConv2dbn
@@ -28,7 +28,7 @@ if config.dataset == "cifar10":
 else:
     from src.dataset import create_dataset2 as create_dataset
 
-ms.set_seed(1)
+mindspore.set_seed(1)
 
 def eval_net():
@@ -36,10 +36,10 @@ def eval_net():
     target = config.device_target
 
     # init context
-    ms.set_context(mode=ms.GRAPH_MODE, device_target=target, save_graphs=False)
+    mindspore.set_context(mode=0, device_target=target, save_graphs=False)
     if target == "Ascend":
         device_id = int(os.getenv('DEVICE_ID'))
-        ms.set_context(device_id=device_id)
+        mindspore.set_context(device_id=device_id)
 
     # create dataset
     dataset = create_dataset(dataset_path=config.data_path, do_train=False, batch_size=config.batch_size,
@@ -49,7 +49,7 @@ def eval_net():
     net = resnet(class_num=config.class_num)
     net = PrunerKfCompressAlgo({}).apply(net)
     out_index = []
-    param_dict = ms.load_checkpoint(config.checkpoint_file_path)
+    param_dict = mindspore.load_checkpoint(config.checkpoint_file_path)
     for key in param_dict.keys():
         if 'out_index' in key:
             out_index.append(param_dict[key])
@@ -60,7 +60,7 @@ def eval_net():
     net = ft_algo._recover_conv(net)
 
     # load checkpoint
-    ms.load_param_into_net(net, param_dict)
+    mindspore.load_param_into_net(net, param_dict)
     net.set_train(False)
 
     # define loss, model
@@ -74,7 +74,7 @@ def eval_net():
         loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
 
     # define model
-    model = ms.Model(net, loss_fn=loss, metrics={'top_1_accuracy'})
+    model = mindspore.Model(net, loss_fn=loss, metrics={'top_1_accuracy'})
 
     # eval model
     res = model.eval(dataset)
diff --git a/official/cv/ResNet/golden_stick/pruner/scop/infer.py b/official/cv/ResNet/golden_stick/pruner/scop/infer.py
index 70597796d..f30157e88 100644
--- a/official/cv/ResNet/golden_stick/pruner/scop/infer.py
+++ b/official/cv/ResNet/golden_stick/pruner/scop/infer.py
@@ -14,7 +14,7 @@
 # ============================================================================
 """infer scop_resnet mindir."""
 import datetime
-import mindspore as ms
+import mindspore
 import mindspore.nn as nn
 from src.model_utils.config import config
 from src.model_utils.moxing_adapter import moxing_wrapper
@@ -32,7 +32,7 @@ def infer_net():
         raise ValueError("Currently only support GPU.")
 
     # init context
-    ms.set_context(mode=ms.GRAPH_MODE, device_target=target, save_graphs=False)
+    mindspore.set_context(mode=0, device_target=target, save_graphs=False)
 
     # create dataset
     dataset = create_dataset(dataset_path=config.data_path, do_train=False, batch_size=config.batch_size,
@@ -40,7 +40,7 @@ def infer_net():
     step_size = dataset.get_dataset_size()
 
     # load mindir
-    graph = ms.load(config.mindir_path)
+    graph = mindspore.load(config.mindir_path)
     net = nn.GraphCell(graph)
 
     print("start infer")
@@ -49,7 +49,7 @@ def infer_net():
     for _, data in enumerate(data_loader):
         images = data["image"]
         start_time = datetime.datetime.now()
-        net(ms.Tensor(images))
+        net(mindspore.Tensor(images))
         end_time = datetime.datetime.now()
         total_time += (end_time - start_time).microseconds
diff --git a/official/cv/ResNet/golden_stick/pruner/scop/train.py b/official/cv/ResNet/golden_stick/pruner/scop/train.py
index 6e1cd12c5..13d7d6677 100644
--- a/official/cv/ResNet/golden_stick/pruner/scop/train.py
+++ b/official/cv/ResNet/golden_stick/pruner/scop/train.py
@@ -16,7 +16,7 @@
 import os
 import numpy as np
 
-import mindspore as ms
+import mindspore
 import mindspore.nn as nn
 import mindspore.ops as ops
 from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor
@@ -36,7 +36,7 @@ if config.dataset == "cifar10":
 else:
     from src.dataset import create_dataset2 as create_dataset
 
-ms.set_seed(1)
+mindspore.set_seed(1)
 
 class NetWithLossCell(nn.WithLossCell):
@@ -74,27 +74,27 @@ def set_parameter():
 
     # init context
     if config.mode_name == "GRAPH":
-        ms.set_context(mode=ms.GRAPH_MODE, device_target=target, save_graphs=False)
+        mindspore.set_context(mode=0, device_target=target, save_graphs=False)
     else:
-        ms.set_context(mode=ms.PYNATIVE_MODE, device_target=target, save_graphs=False)
+        mindspore.set_context(mode=1, device_target=target, save_graphs=False)
     if config.run_distribute:
         if target == "Ascend":
             device_id = int(os.getenv('DEVICE_ID'))
-            ms.set_context(device_id=device_id)
-            ms.set_auto_parallel_context(device_num=config.device_num, parallel_mode=ms.ParallelMode.DATA_PARALLEL,
+            mindspore.set_context(device_id=device_id)
+            mindspore.set_auto_parallel_context(device_num=config.device_num, parallel_mode=mindspore.ParallelMode.DATA_PARALLEL,
                                          gradients_mean=True)
             set_algo_parameters(elementwise_op_strategy_follow=True)
             if config.boost_mode not in ["O1", "O2"]:
-                ms.set_auto_parallel_context(all_reduce_fusion_config=config.all_reduce_fusion_config)
+                mindspore.set_auto_parallel_context(all_reduce_fusion_config=config.all_reduce_fusion_config)
             init()
         else:
             # GPU target
             init()
-            ms.set_auto_parallel_context(device_num=config.device_num,
-                                         parallel_mode=ms.ParallelMode.DATA_PARALLEL,
+            mindspore.set_auto_parallel_context(device_num=config.device_num,
+                                                parallel_mode=mindspore.ParallelMode.DATA_PARALLEL,
                                          gradients_mean=True)
-            ms.set_auto_parallel_context(all_reduce_fusion_config=config.all_reduce_fusion_config)
+            mindspore.set_auto_parallel_context(all_reduce_fusion_config=config.all_reduce_fusion_config)
 
 def init_weight(net, param_dict):
@@ -110,12 +110,12 @@ def init_weight(net, param_dict):
         if config.filter_weight:
             filter_list = [x.name for x in net.end_point.get_parameters()]
             filter_checkpoint_parameter_by_list(param_dict, filter_list)
-        ms.load_param_into_net(net, param_dict)
+        mindspore.load_param_into_net(net, param_dict)
     else:
         for _, cell in net.cells_and_names():
             if isinstance(cell, nn.Conv2d):
                 if config.conv_init == "XavierUniform":
-                    cell.weight.set_data(ms.common.initializer.initializer(ms.common.initializer.XavierUniform(),
+                    cell.weight.set_data(mindspore.common.initializer.initializer(mindspore.common.initializer.XavierUniform(),
                                                                            cell.weight.shape,
                                                                            cell.weight.dtype))
                 elif config.conv_init == "TruncatedNormal":
@@ -125,25 +125,25 @@ def init_weight(net, param_dict):
                     cell.weight.set_data(weight)
             if isinstance(cell, nn.Dense):
                 if config.dense_init == "TruncatedNormal":
-                    cell.weight.set_data(ms.common.initializer.initializer(ms.common.initializer.TruncatedNormal(),
+                    cell.weight.set_data(mindspore.common.initializer.initializer(mindspore.common.initializer.TruncatedNormal(),
                                                                            cell.weight.shape,
                                                                            cell.weight.dtype))
                 elif config.dense_init == "RandomNormal":
                     in_channel = cell.in_channels
                     out_channel = cell.out_channels
                     weight = np.random.normal(loc=0, scale=0.01, size=out_channel * in_channel)
-                    weight = ms.Tensor(np.reshape(weight, (out_channel, in_channel)), dtype=cell.weight.dtype)
+                    weight = mindspore.Tensor(np.reshape(weight, (out_channel, in_channel)), dtype=cell.weight.dtype)
                     cell.weight.set_data(weight)
 
 def load_fp32_ckpt(net):
     if config.fp32_ckpt:
         if os.path.isfile(config.fp32_ckpt):
-            ckpt = ms.load_checkpoint(config.fp32_ckpt)
+            ckpt = mindspore.load_checkpoint(config.fp32_ckpt)
             if config.filter_weight:
                 filter_list = [x.name for x in net.end_point.get_parameters()]
                 filter_checkpoint_parameter_by_list(ckpt, filter_list)
-            ms.load_param_into_net(net, ckpt)
+            mindspore.load_param_into_net(net, ckpt)
         else:
             print(f"Invalid fp32_ckpt {config.fp32_ckpt} parameter.")
@@ -151,7 +151,7 @@ def load_fp32_ckpt(net):
 def load_pretrained_ckpt(net):
     if config.pre_trained:
         if os.path.isfile(config.pre_trained):
-            ckpt = ms.load_checkpoint(config.pre_trained)
+            ckpt = mindspore.load_checkpoint(config.pre_trained)
             if ckpt.get("epoch_num") and ckpt.get("step_num"):
                 config.has_trained_epoch = int(ckpt["epoch_num"].data.asnumpy())
                 config.has_trained_step = int(ckpt["step_num"].data.asnumpy())
@@ -167,7 +167,7 @@ def load_pretrained_ckpt(net):
             if config.filter_weight:
                 filter_list = [x.name for x in net.end_point.get_parameters()]
                 filter_checkpoint_parameter_by_list(ckpt, filter_list)
-            not_load_param, _ = ms.load_param_into_net(net, ckpt)
+            not_load_param, _ = mindspore.load_param_into_net(net, ckpt)
             if not_load_param:
                 raise RuntimeError("Load param into net fail.")
         else:
@@ -247,7 +247,7 @@ def train_net():
     if config.pre_trained:
         train_ft(net)
     else:
-        model = ms.Model(net, loss_fn=kf_loss_fn, optimizer=optimizer)
+        model = mindspore.Model(net, loss_fn=kf_loss_fn, optimizer=optimizer)
         model.train(config.epoch_kf, dataset, callbacks=cb, dataset_sink_mode=False)
         train_ft(net)
@@ -261,7 +261,7 @@ def train_ft(net):
     algo_ft = PrunerFtCompressAlgo({'prune_rate': config.prune_rate})
     net = algo_ft.apply(net)
     load_pretrained_ckpt(net)
-    lr_ft_new = ms.Tensor(get_lr(lr_init=config.lr_init,
+    lr_ft_new = mindspore.Tensor(get_lr(lr_init=config.lr_init,
                                  lr_end=config.lr_end_ft,
                                  lr_max=config.lr_max_ft,
                                  warmup_epochs=config.warmup_epochs,
@@ -279,7 +279,7 @@ def train_ft(net):
     metrics = {"acc"}
     loss_scale = FixedLossScaleManager(1024, drop_overflow_update=False)
     ft_loss_fn = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
-    model_ft = ms.Model(net, loss_fn=ft_loss_fn, optimizer=optimizer_ft, loss_scale_manager=loss_scale,
+    model_ft = mindspore.Model(net, loss_fn=ft_loss_fn, optimizer=optimizer_ft, loss_scale_manager=loss_scale,
                         metrics=metrics,
                         amp_level="O2", boost_level="O0", keep_batchnorm_fp32=False)
diff --git a/official/cv/ResNet/golden_stick/pruner/uni_pruning/eval.py b/official/cv/ResNet/golden_stick/pruner/uni_pruning/eval.py
index 256685f0c..989abf9ca 100644
--- a/official/cv/ResNet/golden_stick/pruner/uni_pruning/eval.py
+++ b/official/cv/ResNet/golden_stick/pruner/uni_pruning/eval.py
@@ -16,7 +16,7 @@
 import os
 import json
 import numpy as np
-import mindspore as ms
+import mindspore
 from mindspore.nn.loss import SoftmaxCrossEntropyWithLogits
 from mindspore_gs.pruner.uni_pruning import UniPruner
@@ -29,7 +29,7 @@ if config.dataset == "cifar10":
 else:
     from src.dataset import create_dataset2 as create_dataset
 
-ms.set_seed(1)
+mindspore.set_seed(1)
 
 def eval_net():
@@ -37,12 +37,12 @@ def eval_net():
     target = config.device_target
 
     # init context
-    ms.set_context(mode=ms.GRAPH_MODE, device_target=target, save_graphs=False)
+    mindspore.set_context(mode=0, device_target=target, save_graphs=False)
     if target == "Ascend":
         device_id = int(os.getenv('DEVICE_ID'))
     else:
         device_id = config.device_id
-    ms.set_context(device_id=device_id)
+    mindspore.set_context(device_id=device_id)
 
     # create dataset
     dataset = create_dataset(dataset_path=config.data_path, do_train=False, batch_size=config.batch_size,
@@ -73,7 +73,7 @@ def eval_net():
     else:
         mask = None
         tag = 'original'
-    ms.load_checkpoint(config.checkpoint_file_path, net)
+    mindspore.load_checkpoint(config.checkpoint_file_path, net)
     algo.prune_by_mask(net, mask, config, tag)
 
@@ -88,7 +88,7 @@ def eval_net():
         loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
 
     # define model
-    model = ms.Model(net, loss_fn=loss, metrics={'top_1_accuracy'})
+    model = mindspore.Model(net, loss_fn=loss, metrics={'top_1_accuracy'})
 
     # eval model
     res = model.eval(dataset)
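The many `ms.common.initializer` → `mindspore.common.initializer` renames above all wrap the same re-initialization idiom; a compact, self-contained sketch of it (the single stand-in `Conv2d` replaces the ResNet these scripts actually iterate, and `XavierUniform` matches their `conv_init == "XavierUniform"` branch):

```python
import mindspore.nn as nn
from mindspore.common.initializer import initializer, XavierUniform

net = nn.Conv2d(3, 16, 3)  # stand-in cell; the real scripts walk a ResNet

for _, cell in net.cells_and_names():
    if isinstance(cell, nn.Conv2d):
        # Re-create the weight with Xavier-uniform values, keeping the
        # original shape and dtype, exactly as the diffs above do.
        cell.weight.set_data(initializer(XavierUniform(),
                                         cell.weight.shape,
                                         cell.weight.dtype))
```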
diff --git a/official/cv/ResNet/golden_stick/pruner/uni_pruning/export.py b/official/cv/ResNet/golden_stick/pruner/uni_pruning/export.py
index 60b8e0672..d2e5ee532 100644
--- a/official/cv/ResNet/golden_stick/pruner/uni_pruning/export.py
+++ b/official/cv/ResNet/golden_stick/pruner/uni_pruning/export.py
@@ -17,7 +17,7 @@ pruning masks (.json format) are obtained during training in the experiment
 directory."""
 import os
 import numpy as np
-import mindspore as ms
+import mindspore
 from mindspore_gs.pruner.uni_pruning import UniPruner
 #pylint: disable=ungrouped-imports
 from src.resnet import resnet18, resnet50
@@ -30,12 +30,12 @@ def export():
     target = config.device_target
 
     # init context
-    ms.set_context(mode=ms.GRAPH_MODE, device_target=target, save_graphs=False)
+    mindspore.set_context(mode=0, device_target=target, save_graphs=False)
     if target == "Ascend":
         device_id = int(os.getenv('DEVICE_ID'))
     else:
         device_id = config.device_id
-    ms.set_context(device_id=device_id)
+    mindspore.set_context(device_id=device_id)
 
     # define net
     if config.net_name == 'resnet18':
@@ -60,8 +60,8 @@ def export():
                       ckpt_path=config.checkpoint_file_path,
                       mask_path=config.mask_path)
     inputs = np.random.uniform(0.0, 1.0, size=input_size).astype(np.float32)
-    inputs = ms.Tensor(inputs)
-    ms.export(net_deploy, inputs, file_name=f"{save_path}_pruned.mindir", file_format="MINDIR")
+    inputs = mindspore.Tensor(inputs)
+    mindspore.export(net_deploy, inputs, file_name=f"{save_path}_pruned.mindir", file_format="MINDIR")
 
 if __name__ == '__main__':
diff --git a/official/cv/ResNet/golden_stick/pruner/uni_pruning/train.py b/official/cv/ResNet/golden_stick/pruner/uni_pruning/train.py
index 4bb654423..bf3512640 100644
--- a/official/cv/ResNet/golden_stick/pruner/uni_pruning/train.py
+++ b/official/cv/ResNet/golden_stick/pruner/uni_pruning/train.py
@@ -16,8 +16,8 @@
 import os
 import numpy as np
 
-import mindspore as ms
-from mindspore import context, nn
+import mindspore
+from mindspore import nn
 from mindspore.train.model import ParallelMode
 from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor
 from mindspore.communication.management import init, get_rank
@@ -39,7 +39,7 @@ else:
     else:
         from src.dataset import create_dataset_pynative as create_dataset
 
-ms.set_seed(1)
+mindspore.set_seed(1)
 
 def filter_checkpoint_parameter_by_list(origin_dict, param_filter):
@@ -58,10 +58,10 @@ def init_env(args):
     device_num = 1
     if args.mode_name == 'GRAPH' and args.device_target == "GPU":
         print('GPU GRAPH MODE')
-        context.set_context(mode=context.GRAPH_MODE,
+        mindspore.set_context(mode=0,
                             device_target=args.device_target, device_id=args.device_id)
         if args.device_num > 1:
-            context.set_auto_parallel_context(parallel_mode=ParallelMode.AUTO_PARALLEL,
+            mindspore.set_auto_parallel_context(parallel_mode=ParallelMode.AUTO_PARALLEL,
                                               gradients_mean=True)
             init("nccl")
             rank = get_rank()
@@ -72,19 +72,19 @@ def init_env(args):
         device_num = int(os.getenv('RANK_SIZE'))
         device_id = int(os.getenv('DEVICE_ID'))
         rank = int(os.getenv('RANK_ID'))
-        context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target)
-        context.set_context(max_call_depth=2000)
+        mindspore.set_context(mode=0, device_target=args.device_target)
+        mindspore.set_context(max_call_depth=2000)
         if device_num > 1:
             os.environ['MINDSPORE_HCCL_CONFIG_PATH'] = os.getenv('RANK_TABLE_FILE')
-        context.set_context(device_id=device_id)
+        mindspore.set_context(device_id=device_id)
         if device_num > 1:
-            context.set_auto_parallel_context(device_num=device_num,
+            mindspore.set_auto_parallel_context(device_num=device_num,
                                               parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True)
             init()
     else:
         print(f'Single node pynative mode on {args.device_target}')
-        context.set_context(mode=context.PYNATIVE_MODE, device_target=args.device_target,
+        mindspore.set_context(mode=1, device_target=args.device_target,
                             device_id=args.device_id)
     return rank
@@ -95,7 +95,7 @@ def init_weight(net):
     for _, cell in net.cells_and_names():
         if isinstance(cell, nn.Conv2d):
             if config.conv_init == "XavierUniform":
-                cell.weight.set_data(ms.common.initializer.initializer(ms.common.initializer.XavierUniform(),
+                cell.weight.set_data(mindspore.common.initializer.initializer(mindspore.common.initializer.XavierUniform(),
                                                                        cell.weight.shape,
                                                                        cell.weight.dtype))
             elif config.conv_init == "TruncatedNormal":
@@ -105,14 +105,14 @@ def init_weight(net):
                 cell.weight.set_data(weight)
         if isinstance(cell, nn.Dense):
             if config.dense_init == "TruncatedNormal":
-                cell.weight.set_data(ms.common.initializer.initializer(ms.common.initializer.TruncatedNormal(),
+                cell.weight.set_data(mindspore.common.initializer.initializer(mindspore.common.initializer.TruncatedNormal(),
                                                                        cell.weight.shape,
                                                                        cell.weight.dtype))
             elif config.dense_init == "RandomNormal":
                 in_channel = cell.in_channels
                 out_channel = cell.out_channels
                 weight = np.random.normal(loc=0, scale=0.01, size=out_channel * in_channel)
-                weight = ms.Tensor(np.reshape(weight, (out_channel, in_channel)), dtype=cell.weight.dtype)
+                weight = mindspore.Tensor(np.reshape(weight, (out_channel, in_channel)), dtype=cell.weight.dtype)
                 cell.weight.set_data(weight)
@@ -155,7 +155,7 @@ def load_pretrained_ckpt(net):
     """load checkpoint"""
     if config.pre_trained:
         if os.path.isfile(config.pre_trained):
-            ckpt = ms.load_checkpoint(config.pre_trained)
+            ckpt = mindspore.load_checkpoint(config.pre_trained)
             if ckpt.get("epoch_num") and ckpt.get("step_num"):
                 config.has_trained_epoch = int(ckpt["epoch_num"].data.asnumpy())
                 config.has_trained_step = int(ckpt["step_num"].data.asnumpy())
@@ -166,7 +166,7 @@ def load_pretrained_ckpt(net):
             if config.filter_weight:
                 filter_list = [x.name for x in net.end_point.get_parameters()]
                 filter_checkpoint_parameter_by_list(ckpt, filter_list)
-            ms.load_param_into_net(net, ckpt)
+            mindspore.load_param_into_net(net, ckpt)
         else:
             print(f"Invalid pre_trained {config.pre_trained} parameter.")
@@ -218,7 +218,7 @@ def train_net():
     if config.pre_trained:
         lr = lr[config.has_trained_epoch * step_size:]
 
-    lr = ms.Tensor(lr)
+    lr = mindspore.Tensor(lr)
     # define optimizer
     group_params = init_group_params(net)
     if config.optimizer == 'Momentum':
@@ -230,7 +230,7 @@ def train_net():
     metrics = {"acc"}
     if config.run_distribute:
         metrics = {'acc': DistAccuracy(batch_size=config.batch_size, device_num=config.device_num)}
-    model = ms.Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics=metrics,
+    model = mindspore.Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics=metrics,
                      amp_level="O2", boost_level="O0", keep_batchnorm_fp32=False)
 
     # define callbacks
diff --git a/official/cv/ResNet/golden_stick/quantization/simqat/eval.py b/official/cv/ResNet/golden_stick/quantization/simqat/eval.py
index fcee1eaa1..9910c1cb5 100644
--- a/official/cv/ResNet/golden_stick/quantization/simqat/eval.py
+++ b/official/cv/ResNet/golden_stick/quantization/simqat/eval.py
@@ -14,7 +14,7 @@
 # ============================================================================
 """eval resnet."""
 import os
-import mindspore as ms
+import mindspore
 import mindspore.log as logger
 from mindspore.nn.loss import SoftmaxCrossEntropyWithLogits
 from src.CrossEntropySmooth import CrossEntropySmooth
@@ -26,7 +26,7 @@ if config.dataset == "cifar10":
 else:
     from src.dataset import create_dataset2 as create_dataset
 
-ms.set_seed(1)
+mindspore.set_seed(1)
 
 def get_comp_algo():
@@ -44,10 +44,10 @@ def eval_net():
     target = config.device_target
 
     # init context
-    ms.set_context(mode=ms.GRAPH_MODE, device_target=target, save_graphs=False)
+    mindspore.set_context(mode=0, device_target=target, save_graphs=False)
     if target == "Ascend":
         device_id = int(os.getenv('DEVICE_ID'))
-        ms.set_context(device_id=device_id)
+        mindspore.set_context(device_id=device_id)
 
     # create dataset
     dataset = create_dataset(dataset_path=config.data_path, do_train=False, batch_size=config.batch_size,
@@ -59,8 +59,8 @@ def eval_net():
     net = algo.apply(net)
 
     # load checkpoint
-    param_dict = ms.load_checkpoint(config.checkpoint_file_path)
-    ms.load_param_into_net(net, param_dict)
+    param_dict = mindspore.load_checkpoint(config.checkpoint_file_path)
+    mindspore.load_param_into_net(net, param_dict)
     net.set_train(False)
 
     # define loss, model
@@ -74,7 +74,7 @@ def eval_net():
         loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
 
     # define model
-    model = ms.Model(net, loss_fn=loss, metrics={'top_1_accuracy', 'top_5_accuracy'})
+    model = mindspore.Model(net, loss_fn=loss, metrics={'top_1_accuracy', 'top_5_accuracy'})
 
     # eval model
     res = model.eval(dataset)
diff --git a/official/cv/ResNet/golden_stick/quantization/simqat/train.py b/official/cv/ResNet/golden_stick/quantization/simqat/train.py
index a99243329..853574628 100644
--- a/official/cv/ResNet/golden_stick/quantization/simqat/train.py
+++ b/official/cv/ResNet/golden_stick/quantization/simqat/train.py
@@ -16,7 +16,7 @@
 import os
 import numpy as np
 
-import mindspore as ms
+import mindspore
 import mindspore.nn as nn
 from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor
 from mindspore.communication.management import init, get_rank
@@ -35,7 +35,7 @@ if config.dataset == "cifar10":
 else:
     from src.dataset import create_dataset2 as create_dataset
 
-ms.set_seed(1)
+mindspore.set_seed(1)
 
 class LossCallBack(LossMonitor):
@@ -53,10 +53,10 @@ class LossCallBack(LossMonitor):
         loss = cb_params.net_outputs
 
         if isinstance(loss, (tuple, list)):
-            if isinstance(loss[0], ms.Tensor) and isinstance(loss[0].asnumpy(), np.ndarray):
+            if isinstance(loss[0], mindspore.Tensor) and isinstance(loss[0].asnumpy(), np.ndarray):
                 loss = loss[0]
 
-        if isinstance(loss, ms.Tensor) and isinstance(loss.asnumpy(), np.ndarray):
+        if isinstance(loss, mindspore.Tensor) and isinstance(loss.asnumpy(), np.ndarray):
             loss = np.mean(loss.asnumpy())
 
         cur_step_in_epoch = (cb_params.cur_step_num - 1) % cb_params.batch_num + 1
@@ -88,27 +88,27 @@ def set_parameter():
 
     # init context
     if config.mode_name == "GRAPH":
-        ms.set_context(mode=ms.GRAPH_MODE, device_target=target, save_graphs=False)
+        mindspore.set_context(mode=0, device_target=target, save_graphs=False)
     else:
-        ms.set_context(mode=ms.PYNATIVE_MODE, device_target=target, save_graphs=False)
+        mindspore.set_context(mode=1, device_target=target, save_graphs=False)
     if config.run_distribute:
         if target == "Ascend":
             device_id = int(os.getenv('DEVICE_ID'))
-            ms.set_context(device_id=device_id)
-            ms.set_auto_parallel_context(device_num=config.device_num, parallel_mode=ms.ParallelMode.DATA_PARALLEL,
+            mindspore.set_context(device_id=device_id)
+            mindspore.set_auto_parallel_context(device_num=config.device_num, parallel_mode=mindspore.ParallelMode.DATA_PARALLEL,
                                          gradients_mean=True)
             set_algo_parameters(elementwise_op_strategy_follow=True)
             if config.boost_mode not in ["O1", "O2"]:
-                ms.set_auto_parallel_context(all_reduce_fusion_config=config.all_reduce_fusion_config)
+                mindspore.set_auto_parallel_context(all_reduce_fusion_config=config.all_reduce_fusion_config)
             init()
         # GPU target
         else:
             init()
-            ms.set_auto_parallel_context(device_num=config.device_num,
-                                         parallel_mode=ms.ParallelMode.DATA_PARALLEL,
+            mindspore.set_auto_parallel_context(device_num=config.device_num,
+                                                parallel_mode=mindspore.ParallelMode.DATA_PARALLEL,
                                          gradients_mean=True)
-            ms.set_auto_parallel_context(all_reduce_fusion_config=config.all_reduce_fusion_config)
+            mindspore.set_auto_parallel_context(all_reduce_fusion_config=config.all_reduce_fusion_config)
 
 def init_weight(net):
@@ -116,7 +116,7 @@ def init_weight(net):
     for _, cell in net.cells_and_names():
         if isinstance(cell, nn.Conv2d):
             if config.conv_init == "XavierUniform":
-                cell.weight.set_data(ms.common.initializer.initializer(ms.common.initializer.XavierUniform(),
+                cell.weight.set_data(mindspore.common.initializer.initializer(mindspore.common.initializer.XavierUniform(),
                                                                        cell.weight.shape,
                                                                        cell.weight.dtype))
             elif config.conv_init == "TruncatedNormal":
@@ -126,25 +126,25 @@ def init_weight(net):
                 cell.weight.set_data(weight)
         if isinstance(cell, nn.Dense):
             if config.dense_init == "TruncatedNormal":
-                cell.weight.set_data(ms.common.initializer.initializer(ms.common.initializer.TruncatedNormal(),
+                cell.weight.set_data(mindspore.common.initializer.initializer(mindspore.common.initializer.TruncatedNormal(),
                                                                        cell.weight.shape,
                                                                        cell.weight.dtype))
             elif config.dense_init == "RandomNormal":
                 in_channel = cell.in_channels
                 out_channel = cell.out_channels
                 weight = np.random.normal(loc=0, scale=0.01, size=out_channel * in_channel)
-                weight = ms.Tensor(np.reshape(weight, (out_channel, in_channel)), dtype=cell.weight.dtype)
+                weight = mindspore.Tensor(np.reshape(weight, (out_channel, in_channel)), dtype=cell.weight.dtype)
                 cell.weight.set_data(weight)
 
 def load_fp32_ckpt(net):
     if config.fp32_ckpt:
         if os.path.isfile(config.fp32_ckpt):
-            ckpt = ms.load_checkpoint(config.fp32_ckpt)
+            ckpt = mindspore.load_checkpoint(config.fp32_ckpt)
             if config.filter_weight:
                 filter_list = [x.name for x in net.end_point.get_parameters()]
                 filter_checkpoint_parameter_by_list(ckpt, filter_list)
-            ms.load_param_into_net(net, ckpt)
+            mindspore.load_param_into_net(net, ckpt)
         else:
             print(f"Invalid fp32_ckpt {config.fp32_ckpt} parameter.")
@@ -152,7 +152,7 @@ def load_fp32_ckpt(net):
 def load_pretrained_ckpt(net):
     if config.pre_trained:
         if os.path.isfile(config.pre_trained):
-            ckpt = ms.load_checkpoint(config.pre_trained)
+            ckpt = mindspore.load_checkpoint(config.pre_trained)
             if ckpt.get("epoch_num") and ckpt.get("step_num"):
                 config.has_trained_epoch = int(ckpt["epoch_num"].data.asnumpy())
                 config.has_trained_step = int(ckpt["step_num"].data.asnumpy())
@@ -168,7 +168,7 @@ def load_pretrained_ckpt(net):
             if config.filter_weight:
                 filter_list = [x.name for x in net.end_point.get_parameters()]
                 filter_checkpoint_parameter_by_list(ckpt, filter_list)
-            not_load_param, _ = ms.load_param_into_net(net, ckpt)
+            not_load_param, _ = mindspore.load_param_into_net(net, ckpt)
             if not_load_param:
                 raise RuntimeError("Load param into net fail.")
         else:
@@ -239,7 +239,7 @@ def train_net():
                 lr_decay_mode='cosine')
     if config.pre_trained:
         lr = lr[config.has_trained_epoch * step_size:]
-    lr = ms.Tensor(lr)
+    lr = mindspore.Tensor(lr)
     # define opt
     group_params = init_group_params(net)
     opt = nn.Momentum(group_params, lr, config.momentum, weight_decay=config.weight_decay, loss_scale=config.loss_scale)
@@ -250,7 +250,7 @@ def train_net():
     metrics = {"acc"}
     if config.run_distribute:
         metrics = {'acc': DistAccuracy(batch_size=config.batch_size, device_num=config.device_num)}
-    model = ms.Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics=metrics,
+    model = mindspore.Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics=metrics,
                      amp_level="O0", boost_level=config.boost_mode, keep_batchnorm_fp32=False,
                      boost_config_dict={"grad_freeze": {"total_steps": config.epoch_size * step_size}})
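The `LossCallBack` classes touched above all normalize `cb_params.net_outputs` the same way; a self-contained sketch of that unwrapping logic (the example tensor values are made up):

```python
import numpy as np
import mindspore

def extract_scalar_loss(net_outputs):
    """Mirror the unwrapping done in the LossCallBack classes above."""
    loss = net_outputs
    # Boosted/loss-scaled training can return (loss, overflow, loss_scale).
    if isinstance(loss, (tuple, list)):
        if isinstance(loss[0], mindspore.Tensor) and isinstance(loss[0].asnumpy(), np.ndarray):
            loss = loss[0]
    # Reduce a loss tensor to a scalar for logging.
    if isinstance(loss, mindspore.Tensor) and isinstance(loss.asnumpy(), np.ndarray):
        loss = np.mean(loss.asnumpy())
    return loss

# Hypothetical tuple output as produced by boosted training.
outputs = (mindspore.Tensor(np.array([0.7], np.float32)), False, 1024)
print(extract_scalar_loss(outputs))  # ~0.7
```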
diff --git a/official/cv/ResNet/golden_stick/quantization/slb/eval.py b/official/cv/ResNet/golden_stick/quantization/slb/eval.py
index 1f078f616..bbbb659e8 100644
--- a/official/cv/ResNet/golden_stick/quantization/slb/eval.py
+++ b/official/cv/ResNet/golden_stick/quantization/slb/eval.py
@@ -14,7 +14,7 @@
 # ============================================================================
 """eval resnet."""
 
-import mindspore as ms
+import mindspore
 import mindspore.log as logger
 from mindspore.nn.loss import SoftmaxCrossEntropyWithLogits
 from slb import create_slb
@@ -26,7 +26,7 @@ if config.dataset == "cifar10":
 else:
     from src.dataset import create_dataset2 as create_dataset
 
-ms.set_seed(1)
+mindspore.set_seed(1)
 
 def eval_net():
     """eval net"""
@@ -36,9 +36,9 @@ def eval_net():
 
     # init context
     if config.mode_name == "GRAPH":
-        ms.set_context(mode=ms.GRAPH_MODE, device_target=target, save_graphs=False)
+        mindspore.set_context(mode=0, device_target=target, save_graphs=False)
     else:
-        ms.set_context(mode=ms.PYNATIVE_MODE, device_target=target, save_graphs=False)
+        mindspore.set_context(mode=1, device_target=target, save_graphs=False)
 
     # create dataset
     dataset = create_dataset(dataset_path=config.data_path, do_train=False, batch_size=config.batch_size,
@@ -50,15 +50,15 @@ def eval_net():
     net = algo.apply(net)
 
     # load checkpoint
-    param_dict = ms.load_checkpoint(config.checkpoint_file_path)
-    ms.load_param_into_net(net, param_dict)
+    param_dict = mindspore.load_checkpoint(config.checkpoint_file_path)
+    mindspore.load_param_into_net(net, param_dict)
     net.set_train(False)
 
     # define loss
     loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
 
     # define model
-    model = ms.Model(net, loss_fn=loss, metrics={'top_1_accuracy', 'top_5_accuracy'})
+    model = mindspore.Model(net, loss_fn=loss, metrics={'top_1_accuracy', 'top_5_accuracy'})
 
     # eval model
     res = model.eval(dataset)
diff --git a/official/cv/ResNet/golden_stick/quantization/slb/train.py b/official/cv/ResNet/golden_stick/quantization/slb/train.py
index b7c6a80e9..7efbdda6c 100644
--- a/official/cv/ResNet/golden_stick/quantization/slb/train.py
+++ b/official/cv/ResNet/golden_stick/quantization/slb/train.py
@@ -16,7 +16,7 @@
 import os
 import numpy as np
 
-import mindspore as ms
+import mindspore
 import mindspore.nn as nn
 from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor
 from mindspore.communication.management import init, get_rank
@@ -34,7 +34,7 @@ if config.dataset == "cifar10":
 else:
     from src.dataset import create_dataset2 as create_dataset
 
-ms.set_seed(1)
+mindspore.set_seed(1)
 
 class LossCallBack(LossMonitor):
@@ -52,10 +52,10 @@ class LossCallBack(LossMonitor):
         loss = cb_params.net_outputs
 
         if isinstance(loss, (tuple, list)):
-            if isinstance(loss[0], ms.Tensor) and isinstance(loss[0].asnumpy(), np.ndarray):
+            if isinstance(loss[0], mindspore.Tensor) and isinstance(loss[0].asnumpy(), np.ndarray):
                 loss = loss[0]
 
-        if isinstance(loss, ms.Tensor) and isinstance(loss.asnumpy(), np.ndarray):
+        if isinstance(loss, mindspore.Tensor) and isinstance(loss.asnumpy(), np.ndarray):
             loss = np.mean(loss.asnumpy())
 
         cur_step_in_epoch = (cb_params.cur_step_num - 1) % cb_params.batch_num + 1
@@ -84,18 +84,18 @@ def set_parameter():
 
     # init context
     if config.mode_name == "GRAPH":
-        ms.set_context(mode=ms.GRAPH_MODE, device_target=target, save_graphs=False)
+        mindspore.set_context(mode=0, device_target=target, save_graphs=False)
     else:
-        ms.set_context(mode=ms.PYNATIVE_MODE, device_target=target, save_graphs=False)
+        mindspore.set_context(mode=1, device_target=target, save_graphs=False)
 
     if config.run_distribute:
         # GPU target
         init()
-        ms.set_auto_parallel_context(device_num=config.device_num,
-                                     parallel_mode=ms.ParallelMode.DATA_PARALLEL,
+        mindspore.set_auto_parallel_context(device_num=config.device_num,
+                                            parallel_mode=mindspore.ParallelMode.DATA_PARALLEL,
                                      gradients_mean=True)
         # Allreduce is not supported for network with dynamic control flow.
-        ms.set_auto_parallel_context(comm_fusion={"allreduce": {"mode": "size", "config": 0}})
+        mindspore.set_auto_parallel_context(comm_fusion={"allreduce": {"mode": "size", "config": 0}})
 
 def init_weight(net):
@@ -103,7 +103,7 @@ def init_weight(net):
     for _, cell in net.cells_and_names():
         if isinstance(cell, nn.Conv2d):
             if config.conv_init == "XavierUniform":
-                cell.weight.set_data(ms.common.initializer.initializer(ms.common.initializer.XavierUniform(),
+                cell.weight.set_data(mindspore.common.initializer.initializer(mindspore.common.initializer.XavierUniform(),
                                                                        cell.weight.shape,
                                                                        cell.weight.dtype))
             elif config.conv_init == "TruncatedNormal":
@@ -113,14 +113,14 @@ def init_weight(net):
                 cell.weight.set_data(weight)
         if isinstance(cell, nn.Dense):
             if config.dense_init == "TruncatedNormal":
-                cell.weight.set_data(ms.common.initializer.initializer(ms.common.initializer.TruncatedNormal(),
+                cell.weight.set_data(mindspore.common.initializer.initializer(mindspore.common.initializer.TruncatedNormal(),
                                                                        cell.weight.shape,
                                                                        cell.weight.dtype))
             elif config.dense_init == "RandomNormal":
                 in_channel = cell.in_channels
                 out_channel = cell.out_channels
                 weight = np.random.normal(loc=0, scale=0.01, size=out_channel * in_channel)
-                weight = ms.Tensor(np.reshape(weight, (out_channel, in_channel)), dtype=cell.weight.dtype)
+                weight = mindspore.Tensor(np.reshape(weight, (out_channel, in_channel)), dtype=cell.weight.dtype)
                 cell.weight.set_data(weight)
@@ -128,7 +128,7 @@ def get_pretrained_epoch(net):
     """get_pretrained_epoch"""
     if config.pre_trained:
         if os.path.isfile(config.pre_trained):
-            ckpt = ms.load_checkpoint(config.pre_trained)
+            ckpt = mindspore.load_checkpoint(config.pre_trained)
             if ckpt.get("epoch_num") and ckpt.get("step_num"):
                 config.has_trained_epoch = int(ckpt["epoch_num"].data.asnumpy())
                 config.has_trained_step = int(ckpt["step_num"].data.asnumpy())
@@ -151,7 +151,7 @@ def load_pretrained_ckpt(net):
     """load_pretrained_ckpt"""
     if config.pre_trained:
         if os.path.isfile(config.pre_trained):
-            ckpt = ms.load_checkpoint(config.pre_trained)
+            ckpt = mindspore.load_checkpoint(config.pre_trained)
             if ckpt.get("epoch_num") and ckpt.get("step_num"):
                 config.has_trained_epoch = int(ckpt["epoch_num"].data.asnumpy())
                 config.has_trained_step = int(ckpt["step_num"].data.asnumpy())
@@ -166,7 +166,7 @@ def load_pretrained_ckpt(net):
             if config.filter_weight:
                 filter_list = [x.name for x in net.end_point.get_parameters()]
                 filter_checkpoint_parameter_by_list(ckpt, filter_list)
-            not_load_param, _ = ms.load_param_into_net(net, ckpt)
+            not_load_param, _ = mindspore.load_param_into_net(net, ckpt)
             if not_load_param:
                 raise RuntimeError("Load param into net fail.")
         else:
@@ -237,7 +237,7 @@ def train_net():
                 lr_decay_mode=config.lr_decay_mode)
     if config.pre_trained:
         lr = lr[config.has_trained_epoch * step_size:]
-    lr = ms.Tensor(lr)
+    lr = mindspore.Tensor(lr)
     # define optimizer
     group_params = init_group_params(net)
     if config.optimizer == 'Momentum':
@@ -251,7 +251,7 @@ def train_net():
     metrics = {"acc"}
     if config.run_distribute:
         metrics = {'acc': DistAccuracy(batch_size=config.batch_size, device_num=config.device_num)}
-    model = ms.Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics=metrics,
+    model = mindspore.Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics=metrics,
                      amp_level="O0", boost_level=config.boost_mode, keep_batchnorm_fp32=False,
                      eval_network=dist_eval_network,
                      boost_config_dict={"grad_freeze": {"total_steps": config.epoch_size * step_size}})
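The benchmark script below assembles `Model` with a fixed loss scale and O2 mixed precision; a stripped-down sketch of that pattern under stated assumptions (a tiny stand-in network replaces the ResNet, and the scale value 1024 is taken from the diffs):

```python
import mindspore
import mindspore.nn as nn

net = nn.Dense(32, 10)  # stand-in for the ResNet built in these scripts
loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
opt = nn.Momentum(net.trainable_params(), learning_rate=0.01, momentum=0.9)

# Fixed loss scaling, as in the benchmark/train scripts: the scale stays
# constant at 1024 and overflowed steps are not skipped.
loss_scale = mindspore.FixedLossScaleManager(1024, drop_overflow_update=False)
model = mindspore.Model(net, loss_fn=loss, optimizer=opt,
                        loss_scale_manager=loss_scale, metrics={'acc'},
                        amp_level="O2", keep_batchnorm_fp32=False)
```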
+ cell.weight.set_data(mindspore.common.initializer.initializer(mindspore.common.initializer.XavierUniform(), cell.weight.shape, cell.weight.dtype)) if isinstance(cell, nn.Dense): - cell.weight.set_data(ms.common.initializer.initializer(ms.common.initializer.TruncatedNormal(), + cell.weight.set_data(mindspore.common.initializer.initializer(mindspore.common.initializer.TruncatedNormal(), cell.weight.shape, cell.weight.dtype)) # init lr lr = get_liner_lr(lr_init=0, lr_end=0, lr_max=0.8, warmup_epochs=0, total_epochs=epoch_size, steps_per_epoch=step_size) - lr = ms.Tensor(lr) + lr = mindspore.Tensor(lr) # define opt decayed_params = [] @@ -201,18 +201,18 @@ def train(): # define loss, model loss = CrossEntropySmooth(sparse=True, reduction='mean', smooth_factor=0.1, num_classes=1001) opt = nn.Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, 0.9, 1e-4) - loss_scale = ms.FixedLossScaleManager(1024, drop_overflow_update=False) - model = ms.Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'}) + loss_scale = mindspore.FixedLossScaleManager(1024, drop_overflow_update=False) + model = mindspore.Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'}) # Mixed precision if compute_type == "fp16": - if mode == ms.PYNATIVE_MODE: + if mode == 1: opt = MomentumWeightDecay(filter(lambda x: x.requires_grad, net.get_parameters()), lr, 0.9, 1e-4, 1024) else: opt = nn.Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, 0.9, 1e-4, 1024) - model = ms.Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics={'acc'}, + model = mindspore.Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics={'acc'}, amp_level="O2", keep_batchnorm_fp32=False) # define callbacks - if mode == ms.PYNATIVE_MODE: + if mode == 1: print_per_steps = 1 time_cb = MyTimeMonitor(total_batch, print_per_steps, step_size, mode) cb = [time_cb] @@ -222,7 +222,7 @@ def train(): cb += [ckpt_cb] # train model print("========START RESNET50 GPU BENCHMARK========") - if mode == ms.GRAPH_MODE: + if mode == 0: model.train(int(epoch_size * step_size / print_per_steps), dataset, \ callbacks=cb, sink_size=print_per_steps, dataset_sink_mode=True) else: @@ -237,23 +237,23 @@ def eval_(): total_batch = int(config.batch_size) # init context if config.mode_name == "GRAPH": - mode = ms.GRAPH_MODE + mode = 0 else: - mode = ms.PYNATIVE_MODE - ms.set_context(mode=mode, device_target=dev, save_graphs=False) + mode = 1 + mindspore.set_context(mode=mode, device_target=dev, save_graphs=False) # create dataset dataset = create_dataset(dataset_path=config.data_path, do_train=False, repeat_num=1, batch_size=total_batch, target=dev, dtype=compute_type) # define net net = resnet(class_num=1001, dtype=compute_type) # load checkpoint - param_dict = ms.load_checkpoint(ckpt_dir) - ms.load_param_into_net(net, param_dict) + param_dict = mindspore.load_checkpoint(ckpt_dir) + mindspore.load_param_into_net(net, param_dict) net.set_train(False) # define loss, model loss = CrossEntropySmooth(sparse=True, reduction='mean', smooth_factor=0.1, num_classes=1001) # define model - model = ms.Model(net, loss_fn=loss, metrics={'top_1_accuracy', 'top_5_accuracy'}) + model = mindspore.Model(net, loss_fn=loss, metrics={'top_1_accuracy', 'top_5_accuracy'}) # eval model print("========START EVAL RESNET50 ON GPU ========") res = model.eval(dataset) diff --git a/official/cv/ResNet/infer.py b/official/cv/ResNet/infer.py index 0fde653eb..ae87c7678 100644 --- a/official/cv/ResNet/infer.py +++ 
b/official/cv/ResNet/infer.py @@ -15,7 +15,7 @@ """train resnet.""" import os import numpy as np -import mindspore as ms +import mindspore from src.model_utils.config import config from src.model_utils.moxing_adapter import moxing_wrapper @@ -54,10 +54,10 @@ def infer_net(): target = config.device_target # init context - ms.set_context(mode=ms.GRAPH_MODE, device_target=target, save_graphs=False) + mindspore.set_context(mode=0, device_target=target, save_graphs=False) if target == "Ascend": device_id = int(os.getenv('DEVICE_ID', '0')) - ms.set_context(device_id=device_id) + mindspore.set_context(device_id=device_id) # create dataset dataset = create_dataset(dataset_path=config.data_path, do_train=False, batch_size=config.batch_size, @@ -68,8 +68,8 @@ def infer_net(): net = resnet(class_num=config.class_num) # load checkpoint - param_dict = ms.load_checkpoint(config.checkpoint_file_path) - ms.load_param_into_net(net, param_dict) + param_dict = mindspore.load_checkpoint(config.checkpoint_file_path) + mindspore.load_param_into_net(net, param_dict) net.set_train(False) print("start infer") @@ -81,7 +81,7 @@ def infer_net(): images = data["image"] label = data["label"] file_name = data["filename"] - res = net(ms.Tensor(images)) + res = net(mindspore.Tensor(images)) res = res.asnumpy() predict_id = np.argmax(res, axis=1) predict_negative, only_file = show_predict_info(label.tolist(), predict_id.tolist(), diff --git a/official/cv/ResNet/modelarts/ResNet152/train_start.py b/official/cv/ResNet/modelarts/ResNet152/train_start.py index f8f33ff4c..a1eed503d 100644 --- a/official/cv/ResNet/modelarts/ResNet152/train_start.py +++ b/official/cv/ResNet/modelarts/ResNet152/train_start.py @@ -19,7 +19,7 @@ import os import numpy as np import moxing as mox -import mindspore as ms +import mindspore import mindspore.nn as nn from mindspore.train.train_thor import ConvertModelUtils from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor @@ -39,7 +39,7 @@ from src.model_utils.device_adapter import get_rank_id, get_device_num from src.resnet import conv_variance_scaling_initializer -ms.set_seed(1) +mindspore.set_seed(1) class LossCallBack(LossMonitor): @@ -58,10 +58,10 @@ class LossCallBack(LossMonitor): loss = cb_params.net_outputs if isinstance(loss, (tuple, list)): - if isinstance(loss[0], ms.Tensor) and isinstance(loss[0].asnumpy(), np.ndarray): + if isinstance(loss[0], mindspore.Tensor) and isinstance(loss[0].asnumpy(), np.ndarray): loss = loss[0] - if isinstance(loss, ms.Tensor) and isinstance(loss.asnumpy(), np.ndarray): + if isinstance(loss, mindspore.Tensor) and isinstance(loss.asnumpy(), np.ndarray): loss = np.mean(loss.asnumpy()) cur_step_in_epoch = (cb_params.cur_step_num - 1) % cb_params.batch_num + 1 @@ -118,8 +118,8 @@ def apply_eval(eval_param): def set_graph_kernel_context(run_platform, net_name): if run_platform == "GPU" and net_name == "resnet101": - ms.set_context(enable_graph_kernel=True) - ms.set_context(graph_kernel_flags="--enable_parallel_fusion --enable_expand_ops=Conv2D") + mindspore.set_context(enable_graph_kernel=True) + mindspore.set_context(graph_kernel_flags="--enable_parallel_fusion --enable_expand_ops=Conv2D") def set_parameter(): @@ -134,37 +134,37 @@ def set_parameter(): if config.mode_name == 'GRAPH': if target == "Ascend": rank_save_graphs_path = os.path.join(config.save_graphs_path, "soma", str(os.getenv('DEVICE_ID', '0'))) - ms.set_context(mode=ms.GRAPH_MODE, device_target=target, save_graphs=config.save_graphs, + 
mindspore.set_context(mode=0, device_target=target, save_graphs=config.save_graphs, save_graphs_path=rank_save_graphs_path) else: - ms.set_context(mode=ms.GRAPH_MODE, device_target=target, save_graphs=config.save_graphs) + mindspore.set_context(mode=0, device_target=target, save_graphs=config.save_graphs) set_graph_kernel_context(target, config.net_name) else: - ms.set_context(mode=ms.PYNATIVE_MODE, device_target=target, save_graphs=False) + mindspore.set_context(mode=1, device_target=target, save_graphs=False) if config.parameter_server: - ms.set_ps_context(enable_ps=True) + mindspore.set_ps_context(enable_ps=True) if config.run_distribute: if target == "Ascend": device_id = int(os.getenv('DEVICE_ID', '0')) - ms.set_context(device_id=device_id) - ms.set_auto_parallel_context(device_num=config.device_num, parallel_mode=ms.ParallelMode.DATA_PARALLEL, + mindspore.set_context(device_id=device_id) + mindspore.set_auto_parallel_context(device_num=config.device_num, parallel_mode=mindspore.ParallelMode.DATA_PARALLEL, gradients_mean=True) set_algo_parameters(elementwise_op_strategy_follow=True) if config.net_name == "resnet50" or config.net_name == "se-resnet50": if config.boost_mode not in ["O1", "O2"]: - ms.set_auto_parallel_context(all_reduce_fusion_config=config.all_reduce_fusion_config) + mindspore.set_auto_parallel_context(all_reduce_fusion_config=config.all_reduce_fusion_config) elif config.net_name in ["resnet101", "resnet152"]: - ms.set_auto_parallel_context(all_reduce_fusion_config=config.all_reduce_fusion_config) + mindspore.set_auto_parallel_context(all_reduce_fusion_config=config.all_reduce_fusion_config) init() # GPU target else: init() - ms.set_auto_parallel_context(device_num=get_device_num(), - parallel_mode=ms.ParallelMode.DATA_PARALLEL, + mindspore.set_auto_parallel_context(device_num=get_device_num(), + parallel_mode=mindspore.ParallelMode.DATA_PARALLEL, gradients_mean=True) if config.net_name == "resnet50": - ms.set_auto_parallel_context(all_reduce_fusion_config=config.all_reduce_fusion_config) + mindspore.set_auto_parallel_context(all_reduce_fusion_config=config.all_reduce_fusion_config) def load_pre_trained_checkpoint(): @@ -187,9 +187,9 @@ def load_pre_trained_checkpoint(): print(f"time stamp {time_stamp.strftime('%Y.%m.%d-%H:%M:%S')}" f" pre trained ckpt model {ckpt_files[0]} loading", flush=True) - param_dict = ms.load_checkpoint(ckpt_files[0]) + param_dict = mindspore.load_checkpoint(ckpt_files[0]) elif os.path.isfile(config.pre_trained): - param_dict = ms.load_checkpoint(config.pre_trained) + param_dict = mindspore.load_checkpoint(config.pre_trained) else: print(f"Invalid pre_trained {config.pre_trained} parameter.") return param_dict @@ -209,12 +209,12 @@ def init_weight(net, param_dict): if config.filter_weight: filter_list = [x.name for x in net.end_point.get_parameters()] filter_checkpoint_parameter_by_list(param_dict, filter_list) - ms.load_param_into_net(net, param_dict) + mindspore.load_param_into_net(net, param_dict) else: for _, cell in net.cells_and_names(): if isinstance(cell, nn.Conv2d): if config.conv_init == "XavierUniform": - cell.weight.set_data(ms.common.initializer.initializer(ms.common.initializer.XavierUniform(), + cell.weight.set_data(mindspore.common.initializer.initializer(mindspore.common.initializer.XavierUniform(), cell.weight.shape, cell.weight.dtype)) elif config.conv_init == "TruncatedNormal": @@ -224,14 +224,14 @@ def init_weight(net, param_dict): cell.weight.set_data(weight) if isinstance(cell, nn.Dense): if config.dense_init == 
"TruncatedNormal": - cell.weight.set_data(ms.common.initializer.initializer(ms.common.initializer.TruncatedNormal(), + cell.weight.set_data(mindspore.common.initializer.initializer(mindspore.common.initializer.TruncatedNormal(), cell.weight.shape, cell.weight.dtype)) elif config.dense_init == "RandomNormal": in_channel = cell.in_channels out_channel = cell.out_channels weight = np.random.normal(loc=0, scale=0.01, size=out_channel * in_channel) - weight = ms.Tensor(np.reshape(weight, (out_channel, in_channel)), dtype=cell.weight.dtype) + weight = mindspore.Tensor(np.reshape(weight, (out_channel, in_channel)), dtype=cell.weight.dtype) cell.weight.set_data(weight) @@ -323,13 +323,13 @@ def _export_air(ckpt_dir): if not ckpt_file: return net = resnet(config.class_num) - param_dict = ms.load_checkpoint(ckpt_file) - ms.load_param_into_net(net, param_dict) + param_dict = mindspore.load_checkpoint(ckpt_file) + mindspore.load_param_into_net(net, param_dict) - input_arr = ms.numpy.zeros([1, 3, 304, 304], ms.float32) + input_arr = mindspore.numpy.zeros([1, 3, 304, 304], mindspore.float32) print("Start export air.") - ms.export(net, input_arr, file_name=config.file_name, file_format="AIR") + mindspore.export(net, input_arr, file_name=config.file_name, file_format="AIR") file_name = config.file_name + ".air" mox.file.copy(file_name, os.path.join(config.output_dir, file_name)) @@ -352,7 +352,7 @@ def train_net(): net.set_param_ps() init_weight(net=net, param_dict=ckpt_param_dict) - lr = ms.Tensor(init_lr(step_size=step_size)) + lr = mindspore.Tensor(init_lr(step_size=step_size)) # define opt group_params = init_group_params(net) opt = nn.Momentum(group_params, lr, config.momentum, loss_scale=config.loss_scale) @@ -360,7 +360,7 @@ def train_net(): opt = nn.LARS(opt, epsilon=config.lars_epsilon, coefficient=config.lars_coefficient, lars_filter=lambda x: 'beta' not in x.name and 'gamma' not in x.name and 'bias' not in x.name) loss = init_loss_scale() - loss_scale = ms.FixedLossScaleManager(config.loss_scale, drop_overflow_update=False) + loss_scale = mindspore.FixedLossScaleManager(config.loss_scale, drop_overflow_update=False) dist_eval_network = ClassifyCorrectCell(net) if config.run_distribute else None metrics = {"acc"} if config.run_distribute: @@ -368,9 +368,9 @@ def train_net(): if (config.net_name not in ("resnet18", "resnet34", "resnet50", "resnet101", "resnet152", "se-resnet50")) or \ config.parameter_server or target == "CPU": ## fp32 training - model = ms.Model(net, loss_fn=loss, optimizer=opt, metrics=metrics, eval_network=dist_eval_network) + model = mindspore.Model(net, loss_fn=loss, optimizer=opt, metrics=metrics, eval_network=dist_eval_network) else: - model = ms.Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics=metrics, + model = mindspore.Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics=metrics, amp_level="O2", # boost_level=config.boost_mode, keep_batchnorm_fp32=False, eval_network=dist_eval_network) @@ -379,7 +379,7 @@ def train_net(): from src.lr_generator import get_thor_damping damping = get_thor_damping(0, config.damping_init, config.damping_decay, 70, step_size) split_indices = [26, 53] - opt = nn.thor(net, lr, ms.Tensor(damping), config.momentum, config.weight_decay, config.loss_scale, + opt = nn.thor(net, lr, mindspore.Tensor(damping), config.momentum, config.weight_decay, config.loss_scale, config.batch_size, split_indices=split_indices, frequency=config.frequency) model = 
ConvertModelUtils().convert_to_thor_model(model=model, network=net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics={'acc'}, diff --git a/official/cv/ResNet/modelarts/ResNet18/modelarts_train.py b/official/cv/ResNet/modelarts/ResNet18/modelarts_train.py index 2900c32a0..424875e79 100644 --- a/official/cv/ResNet/modelarts/ResNet18/modelarts_train.py +++ b/official/cv/ResNet/modelarts/ResNet18/modelarts_train.py @@ -17,7 +17,7 @@ import os import argparse import ast import moxing as mox -import mindspore as ms +import mindspore import mindspore.nn as nn import mindspore.log as logger @@ -93,7 +93,7 @@ args_opt = parser.parse_args() CKPT_OUTPUT_PATH = "./output" -ms.set_seed(1) +mindspore.set_seed(1) if config.optimizer == "Thor": if args_opt.device_target == "Ascend": @@ -122,7 +122,7 @@ def apply_eval(eval_param): def set_graph_kernel_context(run_platform, net_name): if run_platform == "GPU" and net_name == "resnet101": - ms.set_context(enable_graph_kernel=True, + mindspore.set_context(enable_graph_kernel=True, graph_kernel_flags="--enable_parallel_fusion") @@ -141,12 +141,12 @@ def _export_air(ckpt_dir): if not ckpt_file: return net = resnet(config.class_num) - param_dict = ms.load_checkpoint(ckpt_file) - ms.load_param_into_net(net, param_dict) + param_dict = mindspore.load_checkpoint(ckpt_file) + mindspore.load_param_into_net(net, param_dict) - input_arr = ms.numpy.zeros([1, 3, 304, 304], ms.float32) + input_arr = mindspore.numpy.zeros([1, 3, 304, 304], mindspore.float32) file_path = os.path.join(args_opt.train_url, "resnet") - ms.export(net, input_arr, file_name=file_path, file_format="AIR") + mindspore.export(net, input_arr, file_name=file_path, file_format="AIR") def set_config(): @@ -156,58 +156,58 @@ def set_config(): def init_context(target): if args_opt.mode == 'GRAPH': - ms.set_context(mode=ms.GRAPH_MODE, device_target=target, save_graphs=False) + mindspore.set_context(mode=0, device_target=target, save_graphs=False) set_graph_kernel_context(target, args_opt.net) else: - ms.set_context(mode=ms.PYNATIVE_MODE, device_target=target, save_graphs=False) + mindspore.set_context(mode=1, device_target=target, save_graphs=False) if args_opt.parameter_server: - ms.set_ps_context(enable_ps=True) + mindspore.set_ps_context(enable_ps=True) if args_opt.run_distribute: if target == "Ascend": device_id = int(os.getenv('DEVICE_ID', '0')) - ms.set_context(device_id=device_id) - ms.set_auto_parallel_context( + mindspore.set_context(device_id=device_id) + mindspore.set_auto_parallel_context( device_num=args_opt.device_num, - parallel_mode=ms.ParallelMode.DATA_PARALLEL, + parallel_mode=mindspore.ParallelMode.DATA_PARALLEL, gradients_mean=True) set_algo_parameters(elementwise_op_strategy_follow=True) if args_opt.net == "resnet50" or args_opt.net == "se-resnet50": - ms.set_auto_parallel_context( + mindspore.set_auto_parallel_context( all_reduce_fusion_config=[85, 160]) elif args_opt.net == "resnet101": - ms.set_auto_parallel_context( + mindspore.set_auto_parallel_context( all_reduce_fusion_config=[80, 210, 313]) init() # GPU target else: init() - ms.set_auto_parallel_context( + mindspore.set_auto_parallel_context( device_num=get_group_size(), - parallel_mode=ms.ParallelMode.DATA_PARALLEL, + parallel_mode=mindspore.ParallelMode.DATA_PARALLEL, gradients_mean=True) if args_opt.net == "resnet50": - ms.set_auto_parallel_context( + mindspore.set_auto_parallel_context( all_reduce_fusion_config=[85, 160]) def init_weight(net): if args_opt.pre_trained: - param_dict = 
ms.load_checkpoint(args_opt.pre_trained) + param_dict = mindspore.load_checkpoint(args_opt.pre_trained) if args_opt.filter_weight: filter_list = [x.name for x in net.end_point.get_parameters()] filter_checkpoint_parameter_by_list(param_dict, filter_list) - ms.load_param_into_net(net, param_dict) + mindspore.load_param_into_net(net, param_dict) else: for _, cell in net.cells_and_names(): if isinstance(cell, nn.Conv2d): cell.weight.set_data( - ms.common.initializer.initializer(ms.common.initializer.XavierUniform(), + mindspore.common.initializer.initializer(mindspore.common.initializer.XavierUniform(), cell.weight.shape, cell.weight.dtype)) if isinstance(cell, nn.Dense): cell.weight.set_data( - ms.common.initializer.initializer(ms.common.initializer.TruncatedNormal(), + mindspore.common.initializer.initializer(mindspore.common.initializer.TruncatedNormal(), cell.weight.shape, cell.weight.dtype)) @@ -229,7 +229,7 @@ def init_lr(step_size): lr = warmup_cosine_annealing_lr( config.lr, step_size, config.warmup_epochs, config.epoch_size, config.pretrain_epoch_size * step_size) - return ms.Tensor(lr) + return mindspore.Tensor(lr) def define_opt(net, lr): @@ -260,7 +260,7 @@ def define_model(net, opt, target): num_classes=config.class_num) else: loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean') - loss_scale = ms.FixedLossScaleManager(config.loss_scale, drop_overflow_update=False) + loss_scale = mindspore.FixedLossScaleManager(config.loss_scale, drop_overflow_update=False) dist_eval_network = ClassifyCorrectCell( net) if args_opt.run_distribute else None @@ -272,10 +272,10 @@ def define_model(net, opt, target): "se-resnet50")) or args_opt.parameter_server \ or target == "CPU": # fp32 training - model = ms.Model(net, loss_fn=loss, optimizer=opt, metrics=metrics, + model = mindspore.Model(net, loss_fn=loss, optimizer=opt, metrics=metrics, eval_network=dist_eval_network) else: - model = ms.Model(net, loss_fn=loss, optimizer=opt, + model = mindspore.Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics=metrics, amp_level="O2", keep_batchnorm_fp32=False, eval_network=dist_eval_network) @@ -345,7 +345,7 @@ def main(): damping = get_thor_damping(0, config.damping_init, config.damping_decay, 70, step_size) split_indices = [26, 53] - opt = nn.thor(net, lr, ms.Tensor(damping), config.momentum, + opt = nn.thor(net, lr, mindspore.Tensor(damping), config.momentum, config.weight_decay, config.loss_scale, config.batch_size, split_indices=split_indices, frequency=config.frequency) diff --git a/official/cv/ResNet/predict.py b/official/cv/ResNet/predict.py index 5a898b733..2ebde600a 100644 --- a/official/cv/ResNet/predict.py +++ b/official/cv/ResNet/predict.py @@ -26,11 +26,11 @@ import time import numpy as np from PIL import Image -import mindspore as ms +import mindspore import mindspore.dataset as ds from src.model_utils.config import config -ms.set_seed(1) +mindspore.set_seed(1) if config.net_name in ("resnet18", "resnet34", "resnet50", "resnet152"): if config.net_name == "resnet18": @@ -56,10 +56,10 @@ def create_model(): # load checkpoint if config.checkpoint_file_path: - param_dict = ms.load_checkpoint(config.checkpoint_file_path) - ms.load_param_into_net(net, param_dict) + param_dict = mindspore.load_checkpoint(config.checkpoint_file_path) + mindspore.load_param_into_net(net, param_dict) net.set_train(False) - ms_model = ms.Model(net) + ms_model = mindspore.Model(net) return ms_model @@ -72,7 +72,7 @@ def read_image(img_path): ds.vision.HWC2CHW()] for transform in 
transform_list: img = transform(img) - img = ms.Tensor(np.expand_dims(img, axis=0), ms.float32) + img = mindspore.Tensor(np.expand_dims(img, axis=0), mindspore.float32) return img @@ -102,7 +102,7 @@ def predict_mindir(data_input): raise RuntimeError("Only support single input in this net.") inputs[0].set_data_from_numpy(data_input.asnumpy()) outputs = lite_mode_input.predict(inputs) - return ms.Tensor(outputs[0].get_data_to_numpy()) + return mindspore.Tensor(outputs[0].get_data_to_numpy()) def _get_lite_context(l_context): lite_context_properties = { @@ -111,18 +111,18 @@ def predict_mindir(data_input): "gpu": ["device_id", "precision_mode"], "ascend": ["device_id", "precision_mode", "provider", "rank_id"] } - lite_device_target = ms.get_context('device_target').lower() + lite_device_target = mindspore.get_context('device_target').lower() if lite_device_target not in ['cpu', 'gpu', 'ascend']: raise RuntimeError(f"Device target should be in ['cpu', 'gpu', 'ascend'], but got {lite_device_target}") l_context.target = [lite_device_target] l_context_device_dict = {'cpu': l_context.cpu, 'gpu': l_context.gpu, 'ascend': l_context.ascend} for single_property in lite_context_properties.get(lite_device_target): try: - context_value = ms.get_context(single_property) + context_value = mindspore.get_context(single_property) if context_value: setattr(l_context_device_dict.get(lite_device_target), single_property, context_value) except ValueError: - print(f'For set lite context, fail to get parameter {single_property} from ms.context.' + print(f'For set lite context, fail to get parameter {single_property} from mindspore.context.' f' Will use default value') return l_context @@ -149,10 +149,10 @@ def predict_net(data_input): """predict net""" target = config.device_target # init context - ms.set_context(mode=ms.GRAPH_MODE, device_target=target, save_graphs=False) + mindspore.set_context(mode=0, device_target=target, save_graphs=False) if target == "Ascend": device_id = int(os.getenv('DEVICE_ID')) - ms.set_context(device_id=device_id) + mindspore.set_context(device_id=device_id) # model predict ms_model = create_model() diff --git a/official/cv/ResNet/src/CrossEntropySmooth.py b/official/cv/ResNet/src/CrossEntropySmooth.py index 1634033c2..e129832d9 100644 --- a/official/cv/ResNet/src/CrossEntropySmooth.py +++ b/official/cv/ResNet/src/CrossEntropySmooth.py @@ -13,7 +13,7 @@ # limitations under the License. 
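The hunks above apply two mechanical rewrites throughout the tree: the `ms` import alias is expanded to the full `mindspore` module name, and the `ms.GRAPH_MODE`/`ms.PYNATIVE_MODE` context constants are replaced by their integer values. A minimal sketch (assuming a local MindSpore install) of why both rewrites are behavior-preserving:

```python
# Sketch only: demonstrates that the rename in the hunks above is purely
# cosmetic. `import mindspore as ms` binds the same module object as
# `import mindspore`, and the execution-mode enums are plain integers.
import mindspore
import mindspore as ms

assert ms is mindspore                 # alias and full name are one module
assert mindspore.GRAPH_MODE == 0       # so set_context(mode=0) ...
assert mindspore.PYNATIVE_MODE == 1    # ... and mode=1 are equivalent

# Equivalent to ms.set_context(mode=ms.GRAPH_MODE, device_target="CPU"):
mindspore.set_context(mode=0, device_target="CPU")
```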
# ============================================================================ """define loss function for network""" -import mindspore as ms +import mindspore import mindspore.nn as nn from mindspore import Tensor from mindspore.nn.loss import LossBase @@ -26,8 +26,8 @@ class CrossEntropySmooth(LossBase): super(CrossEntropySmooth, self).__init__() self.onehot = ops.OneHot() self.sparse = sparse - self.on_value = Tensor(1.0 - smooth_factor, ms.float32) - self.off_value = Tensor(1.0 * smooth_factor / (num_classes - 1), ms.float32) + self.on_value = Tensor(1.0 - smooth_factor, mindspore.float32) + self.off_value = Tensor(1.0 * smooth_factor / (num_classes - 1), mindspore.float32) self.ce = nn.SoftmaxCrossEntropyWithLogits(reduction=reduction) def construct(self, logit, label): diff --git a/official/cv/ResNet/src/callback.py b/official/cv/ResNet/src/callback.py index dae793713..21536bf35 100644 --- a/official/cv/ResNet/src/callback.py +++ b/official/cv/ResNet/src/callback.py @@ -18,7 +18,7 @@ import os import stat import time import numpy as np -import mindspore as ms +import mindspore from mindspore import save_checkpoint from mindspore.train.callback import Callback @@ -45,10 +45,10 @@ class LossCallBack(Callback): data_sink_mode = cb_params.get('dataset_sink_mode', True) if not data_sink_mode: if isinstance(loss, (tuple, list)): - if isinstance(loss[0], ms.Tensor) and isinstance(loss[0].asnumpy(), np.ndarray): + if isinstance(loss[0], mindspore.Tensor) and isinstance(loss[0].asnumpy(), np.ndarray): loss = loss[0] - if isinstance(loss, ms.Tensor) and isinstance(loss.asnumpy(), np.ndarray): + if isinstance(loss, mindspore.Tensor) and isinstance(loss.asnumpy(), np.ndarray): loss = np.mean(loss.asnumpy()) cur_step_in_epoch = (cb_params.cur_step_num - 1) % cb_params.batch_num + 1 @@ -76,10 +76,10 @@ class LossCallBack(Callback): loss = cb_params.net_outputs cur_epoch_num = cb_params.cur_epoch_num if isinstance(loss, (tuple, list)): - if isinstance(loss[0], ms.Tensor) and isinstance(loss[0].asnumpy(), np.ndarray): + if isinstance(loss[0], mindspore.Tensor) and isinstance(loss[0].asnumpy(), np.ndarray): loss = loss[0] - if isinstance(loss, ms.Tensor) and isinstance(loss.asnumpy(), np.ndarray): + if isinstance(loss, mindspore.Tensor) and isinstance(loss.asnumpy(), np.ndarray): loss = np.mean(loss.asnumpy()) epoch_time = time.time() - self.epoch_start_time @@ -156,7 +156,7 @@ class EvalCallBack(Callback): eval_cost = time.time() - eval_start self.logger.info("epoch: {}, {}: {}, eval_cost:{:.2f}".format(cur_epoch, self.metrics_name, res, eval_cost)) if res >= self.best_res: - if ms.context.get_context("enable_ge") and int(os.getenv('MS_DISABLE_REF_MODE', default="0")) == 1: + if mindspore.get_context("enable_ge") and int(os.getenv('MS_DISABLE_REF_MODE', default="0")) == 1: from mindspore.train.callback import _set_cur_net _set_cur_net(cb_params.train_network) cb_params.train_network.exec_checkpoint_graph() diff --git a/official/cv/ResNet/src/dataset.py b/official/cv/ResNet/src/dataset.py index 614c0eb14..1d4ddc87c 100644 --- a/official/cv/ResNet/src/dataset.py +++ b/official/cv/ResNet/src/dataset.py @@ -16,7 +16,7 @@ create train or eval dataset. 
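For reference, the `CrossEntropySmooth` hunk above only renames the dtype module; the label-smoothing arithmetic is unchanged. A small numpy-only sketch of the smoothed target it builds, using the same `on_value`/`off_value` formulas as the code:

```python
# Illustration of the smoothed one-hot target used by CrossEntropySmooth:
# on_value = 1 - smooth_factor for the true class, off_value spreads the
# remaining mass evenly over the other classes. numpy stand-in, so this
# runs without MindSpore installed.
import numpy as np

def smoothed_one_hot(label: int, num_classes: int, smooth_factor: float) -> np.ndarray:
    on_value = 1.0 - smooth_factor
    off_value = smooth_factor / (num_classes - 1)
    target = np.full(num_classes, off_value, dtype=np.float32)
    target[label] = on_value
    return target

print(smoothed_one_hot(2, 5, 0.1))  # [0.025 0.025 0.9 0.025 0.025], sums to 1
```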
""" import multiprocessing -import mindspore as ms +import mindspore import mindspore.dataset as ds from mindspore.communication.management import init, get_rank, get_group_size @@ -61,7 +61,7 @@ def create_dataset1(dataset_path, do_train, batch_size=32, train_image_size=224, ds.vision.HWC2CHW() ] - type_cast_op = ds.transforms.transforms.TypeCast(ms.int32) + type_cast_op = ds.transforms.transforms.TypeCast(mindspore.int32) data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=get_num_parallel_workers(8)) @@ -129,7 +129,7 @@ def create_dataset2(dataset_path, do_train, batch_size=32, train_image_size=224, ] trans_norm = [ds.vision.Normalize(mean=mean, std=std), ds.vision.HWC2CHW()] - type_cast_op = ds.transforms.transforms.TypeCast(ms.int32) + type_cast_op = ds.transforms.transforms.TypeCast(mindspore.int32) if device_num == 1: trans_work_num = 24 else: @@ -202,7 +202,7 @@ def create_dataset3(dataset_path, do_train, batch_size=32, train_image_size=224, ds.vision.HWC2CHW() ] - type_cast_op = ds.transforms.transforms.TypeCast(ms.int32) + type_cast_op = ds.transforms.transforms.TypeCast(mindspore.int32) data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=get_num_parallel_workers(8)) # only enable cache for eval @@ -272,7 +272,7 @@ def create_dataset4(dataset_path, do_train, batch_size=32, train_image_size=224, ds.vision.HWC2CHW() ] - type_cast_op = ds.transforms.transforms.TypeCast(ms.int32) + type_cast_op = ds.transforms.transforms.TypeCast(mindspore.int32) data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=get_num_parallel_workers(12)) # only enable cache for eval if do_train: diff --git a/official/cv/ResNet/src/dataset_infer.py b/official/cv/ResNet/src/dataset_infer.py index 92ff4aab1..75d48204a 100644 --- a/official/cv/ResNet/src/dataset_infer.py +++ b/official/cv/ResNet/src/dataset_infer.py @@ -17,7 +17,7 @@ create train or eval dataset. 
""" import os import numpy as np -import mindspore as ms +import mindspore import mindspore.dataset as ds from mindspore.communication.management import init, get_rank, get_group_size from src.model_utils.config import config @@ -145,7 +145,7 @@ def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32, image_si ds.vision.HWC2CHW() ] - type_cast_op = ds.transforms.transforms.TypeCast(ms.int32) + type_cast_op = ds.transforms.transforms.TypeCast(mindspore.int32) data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=8) data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8) @@ -217,7 +217,7 @@ def create_dataset2(dataset_path, do_train, repeat_num=1, batch_size=32, image_s ds.vision.HWC2CHW() ] - type_cast_op = ds.transforms.transforms.TypeCast(ms.int32) + type_cast_op = ds.transforms.transforms.TypeCast(mindspore.int32) data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=8) data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8) @@ -287,7 +287,7 @@ def create_dataset3(dataset_path, do_train, repeat_num=1, batch_size=32, image_s ds.vision.HWC2CHW() ] - type_cast_op = ds.transforms.transforms.TypeCast(ms.int32) + type_cast_op = ds.transforms.transforms.TypeCast(mindspore.int32) data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=12) data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=12) if do_train: diff --git a/official/cv/ResNet/src/metric.py b/official/cv/ResNet/src/metric.py index 7babb9455..89a82696f 100644 --- a/official/cv/ResNet/src/metric.py +++ b/official/cv/ResNet/src/metric.py @@ -14,7 +14,7 @@ # ============================================================================ """evaluation metric.""" -import mindspore as ms +import mindspore from mindspore.communication.management import GlobalComm import mindspore.ops as ops import mindspore.nn as nn @@ -54,9 +54,9 @@ class ClassifyCorrectCell(nn.Cell): def construct(self, data, label): outputs = self._network(data) y_pred = self.argmax(outputs) - y_pred = self.cast(y_pred, ms.int32) + y_pred = self.cast(y_pred, mindspore.int32) y_correct = self.equal(y_pred, label) - y_correct = self.cast(y_correct, ms.float32) + y_correct = self.cast(y_correct, mindspore.float32) y_correct = self.reduce_sum(y_correct) total_correct = self.allreduce(y_correct) return (total_correct,) diff --git a/official/cv/ResNet/src/model_utils/moxing_adapter.py b/official/cv/ResNet/src/model_utils/moxing_adapter.py index 8ad202e15..81f98ee9f 100644 --- a/official/cv/ResNet/src/model_utils/moxing_adapter.py +++ b/official/cv/ResNet/src/model_utils/moxing_adapter.py @@ -17,7 +17,7 @@ import os import functools -import mindspore as ms +import mindspore from src.model_utils.config import config _global_sync_count = 0 @@ -92,7 +92,7 @@ def moxing_wrapper(pre_process=None, post_process=None): sync_data(config.train_url, config.output_dir) print("Workspace downloaded: ", os.listdir(config.output_dir)) - ms.set_context(save_graphs_path=os.path.join(config.output_dir, str(get_rank_id()))) + mindspore.set_context(save_graphs_path=os.path.join(config.output_dir, str(get_rank_id()))) config.device_num = get_device_num() config.device_id = get_device_id() if not os.path.exists(config.output_dir): diff --git a/official/cv/ResNet/src/momentum.py b/official/cv/ResNet/src/momentum.py index de24e0f48..6d4f7fcfb 100644 --- 
a/official/cv/ResNet/src/momentum.py +++ b/official/cv/ResNet/src/momentum.py @@ -13,7 +13,7 @@ # limitations under the License. # ============================================================================ """momentum""" -import mindspore as ms +import mindspore import mindspore.ops as ops from mindspore.common.parameter import Parameter from mindspore.common.tensor import Tensor @@ -123,7 +123,7 @@ class Momentum(Optimizer): super(Momentum, self).__init__(learning_rate, params, weight_decay, loss_scale) assert isinstance(momentum, float) and momentum >= 0, "momentum should be equal or bigger than 0" assert isinstance(use_nesterov, bool), "use_nesterov should be bool" - self.momentum = Parameter(Tensor(momentum, ms.float32), name="momentum") + self.momentum = Parameter(Tensor(momentum, mindspore.float32), name="momentum") self.params = self.parameters self.moments = self.params.clone(prefix="moments", init='zeros') self.hyper_map = ops.HyperMap() @@ -133,8 +133,8 @@ class Momentum(Optimizer): def construct(self, gradients): params = self.params moments = self.moments - weight_decay = Tensor(0.0, ms.float32) - scale = Tensor(1.0, ms.float32) + weight_decay = Tensor(0.0, mindspore.float32) + scale = Tensor(1.0, mindspore.float32) if self.exec_weight_decay: weight_decay = self.weight_decay if self.need_scale: diff --git a/official/cv/ResNet/src/resnet_gpu_benchmark.py b/official/cv/ResNet/src/resnet_gpu_benchmark.py index 67ec6ffa6..282fe1d16 100644 --- a/official/cv/ResNet/src/resnet_gpu_benchmark.py +++ b/official/cv/ResNet/src/resnet_gpu_benchmark.py @@ -15,7 +15,7 @@ """ResNet.""" import numpy as np from scipy.stats import truncnorm -import mindspore as ms +import mindspore import mindspore.nn as nn from mindspore.ops import operations as P from mindspore.common.tensor import Tensor @@ -35,7 +35,7 @@ def _conv_variance_scaling_initializer(in_channel, out_channel, kernel_size): mu, sigma = 0, stddev weight = truncnorm(-2, 2, loc=mu, scale=sigma).rvs(out_channel * in_channel * kernel_size * kernel_size) weight = np.reshape(weight, (out_channel, kernel_size, kernel_size, in_channel)) - return Tensor(weight, dtype=ms.float32) + return Tensor(weight, dtype=mindspore.float32) def _weight_variable(shape, factor=0.01): init_value = np.random.randn(*shape).astype(np.float32) * factor diff --git a/official/cv/ResNet/src/util.py b/official/cv/ResNet/src/util.py index bef16d139..d4e4ef817 100644 --- a/official/cv/ResNet/src/util.py +++ b/official/cv/ResNet/src/util.py @@ -1,6 +1,6 @@ import os import numpy as np -import mindspore as ms +import mindspore import mindspore.nn as nn from src.callback import EvalCallBack from src.resnet import conv_variance_scaling_initializer @@ -71,17 +71,17 @@ def init_weight(net, cfg): if not os.path.isfile(cfg.pre_trained): cfg.logger.warning("There is not ckpt file: %s", cfg.pre_trained) else: - param_dict = ms.load_checkpoint(cfg.pre_trained) + param_dict = mindspore.load_checkpoint(cfg.pre_trained) if cfg.filter_weight: filter_list = [x.name for x in net.end_point.get_parameters()] filter_checkpoint_parameter_by_list(param_dict, filter_list) - ms.load_param_into_net(net, param_dict) + mindspore.load_param_into_net(net, param_dict) cfg.logger.info("Pre trained ckpt mode: %s loading", cfg.pre_trained) else: for _, cell in net.cells_and_names(): if isinstance(cell, nn.Conv2d): if cfg.conv_init == "XavierUniform": - cell.weight.set_data(ms.common.initializer.initializer(ms.common.initializer.XavierUniform(), + 
cell.weight.set_data(mindspore.common.initializer.initializer(mindspore.common.initializer.XavierUniform(), cell.weight.shape, cell.weight.dtype)) elif cfg.conv_init == "TruncatedNormal": @@ -91,12 +91,12 @@ def init_weight(net, cfg): cell.weight.set_data(weight) if isinstance(cell, nn.Dense): if cfg.dense_init == "TruncatedNormal": - cell.weight.set_data(ms.common.initializer.initializer(ms.common.initializer.TruncatedNormal(), + cell.weight.set_data(mindspore.common.initializer.initializer(mindspore.common.initializer.TruncatedNormal(), cell.weight.shape, cell.weight.dtype)) elif cfg.dense_init == "RandomNormal": in_channel = cell.in_channels out_channel = cell.out_channels weight = np.random.normal(loc=0, scale=0.01, size=out_channel * in_channel) - weight = ms.Tensor(np.reshape(weight, (out_channel, in_channel)), dtype=cell.weight.dtype) + weight = mindspore.Tensor(np.reshape(weight, (out_channel, in_channel)), dtype=cell.weight.dtype) cell.weight.set_data(weight) diff --git a/official/cv/ResNet/train.py b/official/cv/ResNet/train.py index c5d10d2ef..67ba139ea 100644 --- a/official/cv/ResNet/train.py +++ b/official/cv/ResNet/train.py @@ -15,7 +15,7 @@ """train resnet.""" import os -import mindspore as ms +import mindspore import mindspore.nn as nn import mindspore.log as logger from mindspore.train.train_thor import ConvertModelUtils @@ -33,7 +33,7 @@ from src.model_utils.config import config from src.model_utils.moxing_adapter import moxing_wrapper from src.model_utils.device_adapter import get_device_num -ms.set_seed(1) +mindspore.set_seed(1) if config.net_name in ("resnet18", "resnet34", "resnet50", "resnet152"): if config.net_name == "resnet18": @@ -58,8 +58,8 @@ else: def set_graph_kernel_context(run_platform, net_name): if run_platform == "GPU" and net_name == "resnet101": - ms.set_context(enable_graph_kernel=True) - ms.set_context(graph_kernel_flags="--enable_parallel_fusion --enable_expand_ops=Conv2D") + mindspore.set_context(enable_graph_kernel=True) + mindspore.set_context(graph_kernel_flags="--enable_parallel_fusion --enable_expand_ops=Conv2D") def set_parameter(): @@ -72,37 +72,37 @@ def set_parameter(): if config.mode_name == 'GRAPH': if target == "Ascend": rank_save_graphs_path = os.path.join(config.save_graphs_path, "soma", str(os.getenv('DEVICE_ID', '0'))) - ms.set_context(mode=ms.GRAPH_MODE, device_target=target, save_graphs=config.save_graphs, + mindspore.set_context(mode=0, device_target=target, save_graphs=config.save_graphs, save_graphs_path=rank_save_graphs_path) else: - ms.set_context(mode=ms.GRAPH_MODE, device_target=target, save_graphs=config.save_graphs) + mindspore.set_context(mode=0, device_target=target, save_graphs=config.save_graphs) set_graph_kernel_context(target, config.net_name) else: - ms.set_context(mode=ms.PYNATIVE_MODE, device_target=target, save_graphs=False) + mindspore.set_context(mode=1, device_target=target, save_graphs=False) set_ascend_max_device_memory() if config.parameter_server: - ms.set_ps_context(enable_ps=True) + mindspore.set_ps_context(enable_ps=True) if config.run_distribute: if target == "Ascend": device_id = int(os.getenv('DEVICE_ID', '0')) - ms.set_context(device_id=device_id) - ms.set_auto_parallel_context(device_num=config.device_num, parallel_mode=ms.ParallelMode.DATA_PARALLEL, + mindspore.set_context(device_id=device_id) + mindspore.set_auto_parallel_context(device_num=config.device_num, parallel_mode=mindspore.ParallelMode.DATA_PARALLEL, gradients_mean=True) set_algo_parameters(elementwise_op_strategy_follow=True) if 
config.net_name == "resnet50" or config.net_name == "se-resnet50": if config.boost_mode not in ["O1", "O2"]: - ms.set_auto_parallel_context(all_reduce_fusion_config=config.all_reduce_fusion_config) + mindspore.set_auto_parallel_context(all_reduce_fusion_config=config.all_reduce_fusion_config) elif config.net_name in ["resnet101", "resnet152"]: - ms.set_auto_parallel_context(all_reduce_fusion_config=config.all_reduce_fusion_config) + mindspore.set_auto_parallel_context(all_reduce_fusion_config=config.all_reduce_fusion_config) init() # GPU target else: init() - ms.set_auto_parallel_context(device_num=get_device_num(), - parallel_mode=ms.ParallelMode.DATA_PARALLEL, + mindspore.set_auto_parallel_context(device_num=get_device_num(), + parallel_mode=mindspore.ParallelMode.DATA_PARALLEL, gradients_mean=True) if config.net_name == "resnet50": - ms.set_auto_parallel_context(all_reduce_fusion_config=config.all_reduce_fusion_config) + mindspore.set_auto_parallel_context(all_reduce_fusion_config=config.all_reduce_fusion_config) config.rank_id = get_rank() if config.run_distribute else 0 @@ -136,10 +136,10 @@ def init_loss_scale(): def set_ascend_max_device_memory(): - if ms.get_context("enable_ge") and ms.get_context("mode") == ms.GRAPH_MODE and \ + if mindspore.get_context("enable_ge") and mindspore.get_context("mode") == 0 and \ hasattr(config, "max_device_memory"): logger.warning("When encountering a memory shortage situation in 1980B, reduce the max_device_memory.") - ms.set_context(max_device_memory=config.max_device_memory) + mindspore.set_context(max_device_memory=config.max_device_memory) @moxing_wrapper() @@ -160,11 +160,11 @@ def train_net(): init_weight(net, config) if config.resume_ckpt: - resume_param = ms.load_checkpoint(config.resume_ckpt, + resume_param = mindspore.load_checkpoint(config.resume_ckpt, choice_func=lambda x: not x.startswith(('learning_rate', 'global_step'))) - config.start_epoch = int(resume_param.get('epoch_num', ms.Tensor(0, ms.int32)).asnumpy().item()) + config.start_epoch = int(resume_param.get('epoch_num', mindspore.Tensor(0, mindspore.int32)).asnumpy().item()) - lr = ms.Tensor(init_lr(step_size=step_size)) + lr = mindspore.Tensor(init_lr(step_size=step_size)) # define opt group_params = init_group_params(net, config) opt = nn.Momentum(group_params, lr, config.momentum, loss_scale=config.loss_scale) @@ -172,7 +172,7 @@ def train_net(): opt = nn.LARS(opt, epsilon=config.lars_epsilon, coefficient=config.lars_coefficient, lars_filter=lambda x: 'beta' not in x.name and 'gamma' not in x.name and 'bias' not in x.name) loss = init_loss_scale() - loss_scale = ms.FixedLossScaleManager(config.loss_scale, drop_overflow_update=False) + loss_scale = mindspore.FixedLossScaleManager(config.loss_scale, drop_overflow_update=False) dist_eval_network = ClassifyCorrectCell(net) if config.run_distribute else None metrics = {"acc"} if config.run_distribute: @@ -180,9 +180,9 @@ def train_net(): if (config.net_name not in ("resnet18", "resnet34", "resnet50", "resnet101", "resnet152", "se-resnet50")) or \ config.parameter_server or target == "CPU": # fp32 training - model = ms.Model(net, loss_fn=loss, optimizer=opt, metrics=metrics, eval_network=dist_eval_network) + model = mindspore.Model(net, loss_fn=loss, optimizer=opt, metrics=metrics, eval_network=dist_eval_network) else: - model = ms.Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics=metrics, + model = mindspore.Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics=metrics, 
amp_level="O3", boost_level=config.boost_mode, eval_network=dist_eval_network, boost_config_dict={"grad_freeze": {"total_steps": config.epoch_size * step_size}}) @@ -192,7 +192,7 @@ def train_net(): damping = get_thor_damping(step_size * config.start_epoch, config.damping_init, config.damping_decay, 70, step_size) split_indices = [26, 53] - opt = nn.thor(net, lr, ms.Tensor(damping), config.momentum, config.weight_decay, config.loss_scale, + opt = nn.thor(net, lr, mindspore.Tensor(damping), config.momentum, config.weight_decay, config.loss_scale, config.batch_size, split_indices=split_indices, frequency=config.frequency) model = ConvertModelUtils().convert_to_thor_model(model=model, network=net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics={'acc'}, @@ -202,8 +202,8 @@ def train_net(): # load resume param if config.resume_ckpt: - ms.load_param_into_net(net, resume_param) - ms.load_param_into_net(opt, resume_param) + mindspore.load_param_into_net(net, resume_param) + mindspore.load_param_into_net(opt, resume_param) config.logger.info('resume train from epoch: %s', config.start_epoch) # define callbacks diff --git a/official/cv/RetinaFace_ResNet50/eval.py b/official/cv/RetinaFace_ResNet50/eval.py index 6e79c5f29..27559fbf8 100644 --- a/official/cv/RetinaFace_ResNet50/eval.py +++ b/official/cv/RetinaFace_ResNet50/eval.py @@ -20,7 +20,7 @@ import datetime import numpy as np import cv2 -import mindspore as ms +import mindspore from mindspore import Tensor from mindspore.common import set_seed @@ -296,7 +296,7 @@ class DetectionEngine: def val(): - ms.set_context(mode=ms.GRAPH_MODE, device_target='GPU', save_graphs=False) + mindspore.set_context(mode=0, device_target='GPU', save_graphs=False) cfg = cfg_res50 @@ -307,10 +307,10 @@ def val(): # load checkpoint assert cfg['val_model'] is not None, 'val_model is None.' - param_dict = ms.load_checkpoint(cfg['val_model']) + param_dict = mindspore.load_checkpoint(cfg['val_model']) print('Load trained model done. 
{}'.format(cfg['val_model'])) network.init_parameters_data() - ms.load_param_into_net(network, param_dict) + mindspore.load_param_into_net(network, param_dict) # testing dataset testset_folder = cfg['val_dataset_folder'] diff --git a/official/cv/RetinaFace_ResNet50/export.py b/official/cv/RetinaFace_ResNet50/export.py index f2d531895..2a3a54ea0 100644 --- a/official/cv/RetinaFace_ResNet50/export.py +++ b/official/cv/RetinaFace_ResNet50/export.py @@ -15,7 +15,7 @@ """EXPORT ONNX MODEL WITH CKPT MODEL BASED ON MINDSPORE""" from __future__ import print_function import numpy as np -import mindspore as ms +import mindspore from mindspore import Tensor, export from src.network import RetinaFace, resnet50 from src.config import cfg_res50 @@ -24,7 +24,7 @@ from src.config import cfg_res50 def export_ONNX_model(): cfg = cfg_res50 - ms.set_context(mode=ms.GRAPH_MODE, device_target=cfg.get('device')) + mindspore.set_context(mode=0, device_target=cfg.get('device')) # build network backbone = resnet50(1001) @@ -33,9 +33,9 @@ def export_ONNX_model(): network.set_train(False) # load checkpoint into network - param_dict = ms.load_checkpoint(cfg['ckpt_model']) + param_dict = mindspore.load_checkpoint(cfg['ckpt_model']) network.init_parameters_data() - ms.load_param_into_net(network, param_dict) + mindspore.load_param_into_net(network, param_dict) # build input data input_data = Tensor(np.ones([1, 3, 2176, 2176]).astype(np.float32)) diff --git a/official/cv/RetinaFace_ResNet50/src/loss.py b/official/cv/RetinaFace_ResNet50/src/loss.py index 01fc8a34c..6a4b67bab 100644 --- a/official/cv/RetinaFace_ResNet50/src/loss.py +++ b/official/cv/RetinaFace_ResNet50/src/loss.py @@ -14,7 +14,7 @@ # ============================================================================ """Loss.""" import numpy as np -import mindspore as ms +import mindspore import mindspore.nn as nn import mindspore.ops as ops from mindspore import Tensor @@ -26,8 +26,8 @@ class SoftmaxCrossEntropyWithLogits(nn.Cell): self.log_softmax = ops.LogSoftmax() self.neg = ops.Neg() self.one_hot = ops.OneHot() - self.on_value = Tensor(1.0, ms.float32) - self.off_value = Tensor(0.0, ms.float32) + self.on_value = Tensor(1.0, mindspore.float32) + self.off_value = Tensor(0.0, mindspore.float32) self.reduce_sum = ops.ReduceSum() def construct(self, logits, labels): @@ -61,12 +61,12 @@ class MultiBoxLoss(nn.Cell): self.exp = ops.Exp() self.concat = ops.Concat(axis=1) self.reduce_sum2 = ops.ReduceSum(keep_dims=True) - self.idx = Tensor(np.reshape(np.arange(batch_size * num_boxes), (-1, 1)), ms.int32) + self.idx = Tensor(np.reshape(np.arange(batch_size * num_boxes), (-1, 1)), mindspore.int32) def construct(self, loc_data, loc_t, conf_data, conf_t, landm_data, landm_t): # landm loss - mask_pos1 = ops.cast(self.less(0.0, ops.cast(conf_t, ms.float32)), ms.float32) + mask_pos1 = ops.cast(self.less(0.0, ops.cast(conf_t, mindspore.float32)), mindspore.float32) N1 = self.maximum(self.reduce_sum(mask_pos1), 1) mask_pos_idx1 = self.tile(self.expand_dims(mask_pos1, -1), (1, 1, 10)) @@ -74,8 +74,8 @@ class MultiBoxLoss(nn.Cell): loss_landm = loss_landm / N1 # Localization Loss - mask_pos = ops.cast(self.notequal(0, conf_t), ms.float32) - conf_t = ops.cast(mask_pos, ms.int32) + mask_pos = ops.cast(self.notequal(0, conf_t), mindspore.float32) + conf_t = ops.cast(mask_pos, mindspore.int32) N = self.maximum(self.reduce_sum(mask_pos), 1) mask_pos_idx = self.tile(self.expand_dims(mask_pos, -1), (1, 1, 4)) @@ -95,17 +95,17 @@ class MultiBoxLoss(nn.Cell): # hard example mining 
num_matched_boxes = ops.reshape(self.reduce_sum(mask_pos, 1), (-1,)) - neg_masked_cross_entropy = ops.cast(loss_c * (1 - mask_pos), ms.float32) + neg_masked_cross_entropy = ops.cast(loss_c * (1 - mask_pos), mindspore.float32) _, loss_idx = self.sort_descend(neg_masked_cross_entropy, self.num_boxes) - _, relative_position = self.sort(ops.cast(loss_idx, ms.float32), self.num_boxes) - relative_position = ops.cast(relative_position, ms.float32) + _, relative_position = self.sort(ops.cast(loss_idx, mindspore.float32), self.num_boxes) + relative_position = ops.cast(relative_position, mindspore.float32) relative_position = relative_position[:, ::-1] - relative_position = ops.cast(relative_position, ms.int32) + relative_position = ops.cast(relative_position, mindspore.int32) num_neg_boxes = self.minimum(num_matched_boxes * self.neg_pre_positive, self.num_boxes - 1) tile_num_neg_boxes = self.tile(self.expand_dims(num_neg_boxes, -1), (1, self.num_boxes)) - top_k_neg_mask = ops.cast(self.less(relative_position, tile_num_neg_boxes), ms.float32) + top_k_neg_mask = ops.cast(self.less(relative_position, tile_num_neg_boxes), mindspore.float32) cross_entropy = self.cross_entropy(batch_conf, conf_t) cross_entropy = ops.reshape(cross_entropy, conf_t_shape) diff --git a/official/cv/RetinaFace_ResNet50/src/network.py b/official/cv/RetinaFace_ResNet50/src/network.py index b417e615d..56cdc94ef 100644 --- a/official/cv/RetinaFace_ResNet50/src/network.py +++ b/official/cv/RetinaFace_ResNet50/src/network.py @@ -17,7 +17,7 @@ import math from functools import reduce import numpy as np -import mindspore as ms +import mindspore import mindspore.nn as nn import mindspore.ops as ops from mindspore import Tensor @@ -496,20 +496,20 @@ class TrainingWrapper(nn.Cell): def __init__(self, network, optimizer, sens=1.0): super(TrainingWrapper, self).__init__(auto_prefix=False) self.network = network - self.weights = ms.ParameterTuple(network.trainable_params()) + self.weights = mindspore.ParameterTuple(network.trainable_params()) self.optimizer = optimizer self.grad = ops.GradOperation(get_by_list=True, sens_param=True) self.sens = sens self.reducer_flag = False self.grad_reducer = None - self.parallel_mode = ms.get_auto_parallel_context("parallel_mode") - class_list = [ms.ParallelMode.DATA_PARALLEL, ms.ParallelMode.HYBRID_PARALLEL] + self.parallel_mode = mindspore.get_auto_parallel_context("parallel_mode") + class_list = [mindspore.ParallelMode.DATA_PARALLEL, mindspore.ParallelMode.HYBRID_PARALLEL] if self.parallel_mode in class_list: self.reducer_flag = True if self.reducer_flag: - mean = ms.get_auto_parallel_context("gradients_mean") + mean = mindspore.get_auto_parallel_context("gradients_mean") if auto_parallel_context().get_device_num_is_set(): - degree = ms.get_auto_parallel_context("device_num") + degree = mindspore.get_auto_parallel_context("device_num") else: degree = get_group_size() self.grad_reducer = nn.DistributedGradReducer(optimizer.parameters, mean, degree) diff --git a/official/cv/RetinaFace_ResNet50/train.py b/official/cv/RetinaFace_ResNet50/train.py index e318bd377..7e23e50a2 100644 --- a/official/cv/RetinaFace_ResNet50/train.py +++ b/official/cv/RetinaFace_ResNet50/train.py @@ -16,9 +16,9 @@ from __future__ import print_function import math import argparse -import mindspore as ms +import mindspore -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.train import Model from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor 
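The `MultiBoxLoss` hunk above touches only dtype names inside the hard-example-mining step. As a reading aid, here is a numpy sketch of that selection logic (ranking negatives by loss via the two sorts and keeping `neg_pre_positive` negatives per matched box); variable names mirror the MindSpore code, and the ratio of 2 in the usage below is illustrative, not the config value:

```python
# Sketch of the hard-negative selection in MultiBoxLoss: zero out positive
# boxes, rank the rest by descending classification loss, and keep only the
# hardest num_matched * neg_pre_positive negatives. (The real code also caps
# the count at num_boxes - 1.)
import numpy as np

def hard_negative_mask(conf_loss: np.ndarray, pos_mask: np.ndarray,
                       neg_pre_positive: int) -> np.ndarray:
    # conf_loss: per-box classification loss; pos_mask: 1 = matched box.
    neg_loss = conf_loss * (1 - pos_mask)        # ignore positives
    order = np.argsort(-neg_loss)                # boxes by descending loss
    rank = np.empty_like(order)
    rank[order] = np.arange(order.size)          # rank of each box
    num_neg = int(pos_mask.sum()) * neg_pre_positive
    return (rank < num_neg) & (pos_mask == 0)    # hardest negatives only

loss = np.array([0.1, 2.0, 0.5, 3.0, 0.2])
pos = np.array([1, 0, 0, 0, 0])
print(hard_negative_mask(loss, pos, neg_pre_positive=2))
# -> [False  True False  True False]
```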
 from mindspore.communication.management import init, get_rank, get_group_size
@@ -31,13 +31,13 @@ from src.lr_schedule import adjust_learning_rate
 
 
 def train(cfg, args):
-    ms.set_context(mode=ms.GRAPH_MODE, device_target='GPU', save_graphs=False)
-    if ms.get_context("device_target") == "GPU":
+    mindspore.set_context(mode=0, device_target='GPU', save_graphs=False)
+    if mindspore.get_context("device_target") == "GPU":
         # Enable graph kernel
-        ms.set_context(enable_graph_kernel=True, graph_kernel_flags="--enable_parallel_fusion")
+        mindspore.set_context(enable_graph_kernel=True, graph_kernel_flags="--enable_parallel_fusion")
     if args.is_distributed:
         init("nccl")
-        ms.set_auto_parallel_context(device_num=get_group_size(), parallel_mode=ParallelMode.DATA_PARALLEL,
+        mindspore.set_auto_parallel_context(device_num=get_group_size(), parallel_mode=ParallelMode.DATA_PARALLEL,
                                      gradients_mean=True)
         cfg['ckpt_path'] = cfg['ckpt_path'] + "ckpt_" + str(get_rank()) + "/"
@@ -65,8 +65,8 @@ def train(cfg, args):
 
     if cfg['pretrain'] and cfg['resume_net'] is None:
         pretrained_res50 = cfg['pretrain_path']
-        param_dict_res50 = ms.load_checkpoint(pretrained_res50)
-        ms.load_param_into_net(backbone, param_dict_res50)
+        param_dict_res50 = mindspore.load_checkpoint(pretrained_res50)
+        mindspore.load_param_into_net(backbone, param_dict_res50)
         print('Load resnet50 from [{}] done.'.format(pretrained_res50))
 
     net = RetinaFace(phase='train', backbone=backbone)
@@ -74,8 +74,8 @@ def train(cfg, args):
 
     if cfg['resume_net'] is not None:
         pretrain_model_path = cfg['resume_net']
-        param_dict_retinaface = ms.load_checkpoint(pretrain_model_path)
-        ms.load_param_into_net(net, param_dict_retinaface)
+        param_dict_retinaface = mindspore.load_checkpoint(pretrain_model_path)
+        mindspore.load_param_into_net(net, param_dict_retinaface)
         print('Resume Model from [{}] Done.'.format(cfg['resume_net']))
 
     net = RetinaFaceWithLossCell(net, multibox_loss, cfg)
@@ -84,9 +84,9 @@ def train(cfg, args):
                                   warmup_epoch=cfg['warmup_epoch'])
 
     if cfg['optim'] == 'momentum':
-        opt = ms.nn.Momentum(net.trainable_params(), lr, momentum)
+        opt = mindspore.nn.Momentum(net.trainable_params(), lr, momentum)
     elif cfg['optim'] == 'sgd':
-        opt = ms.nn.SGD(params=net.trainable_params(), learning_rate=lr, momentum=momentum,
+        opt = mindspore.nn.SGD(params=net.trainable_params(), learning_rate=lr, momentum=momentum,
                         weight_decay=weight_decay, loss_scale=1)
     else:
         raise ValueError('optim is not define.')
@@ -113,7 +113,7 @@ if __name__ == '__main__':
     arg, _ = parser.parse_known_args()
 
     config = cfg_res50
-    ms.common.seed.set_seed(config['seed'])
+    mindspore.common.seed.set_seed(config['seed'])
     print('train config:\n', config)
 
     train(cfg=config, args=arg)
diff --git a/official/cv/RetinaNet/eval.py b/official/cv/RetinaNet/eval.py
index d18bda3ee..be4d99c29 100644
--- a/official/cv/RetinaNet/eval.py
+++ b/official/cv/RetinaNet/eval.py
@@ -21,7 +21,7 @@ import json
 import numpy as np
 from pycocotools.coco import COCO
 from pycocotools.cocoeval import COCOeval
-from mindspore import context, Tensor
+from mindspore import Tensor
 from mindspore.train.serialization import load_checkpoint, load_param_into_net
 from src.retinanet import retinanet50, resnet50, retinanetInferWithDecoder
 from src.dataset import create_retinanet_dataset, data_to_mindrecord_byte_image, voc_data_to_mindrecord, \
@@ -30,6 +30,7 @@ from src.box_utils import default_boxes
 from src.model_utils.config import config
 from src.model_utils.moxing_adapter import moxing_wrapper
 from src.model_utils.device_adapter import get_device_id, get_device_num
+import mindspore
 
 
 def apply_nms(all_boxes, all_scores, thres, max_boxes):
@@ -161,7 +162,7 @@ def modelarts_pre_process():
 
 @moxing_wrapper(pre_process=modelarts_pre_process)
 def retinanet_eval():
-    context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, device_id=get_device_id())
+    mindspore.set_context(mode=0, device_target=config.device_target, device_id=get_device_id())
     prefix = "retinanet_eval.mindrecord"
     mindrecord_dir = config.mindrecord_dir
     mindrecord_file = os.path.join(mindrecord_dir, prefix + "0")
diff --git a/official/cv/RetinaNet/eval_onnx.py b/official/cv/RetinaNet/eval_onnx.py
index 863f663dd..ded56eb68 100644
--- a/official/cv/RetinaNet/eval_onnx.py
+++ b/official/cv/RetinaNet/eval_onnx.py
@@ -21,13 +21,14 @@ import numpy as np
 import onnxruntime as ort
 from pycocotools.coco import COCO
 from pycocotools.cocoeval import COCOeval
-from mindspore import context
 
 from src.dataset import create_retinanet_dataset, data_to_mindrecord_byte_image, voc_data_to_mindrecord
 from src.model_utils.config import config
 from src.model_utils.moxing_adapter import moxing_wrapper
 from src.model_utils.device_adapter import get_device_id, get_device_num
 
+import mindspore
+
 
 def create_session(onnx_path, target_device):
     """Create onnxruntime session"""
     if target_device == 'GPU':
@@ -162,7 +163,7 @@ def modelarts_pre_process():
 
 @moxing_wrapper(pre_process=modelarts_pre_process)
 def retinanet_eval():
-    context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, device_id=get_device_id())
+    mindspore.set_context(mode=0, device_target=config.device_target, device_id=get_device_id())
     prefix = "retinanet_eval.mindrecord"
     mindrecord_dir = config.mindrecord_dir
     mindrecord_file = os.path.join(mindrecord_dir, prefix + "0")
diff --git a/official/cv/RetinaNet/export.py b/official/cv/RetinaNet/export.py
index 9acd45892..e3506ecc7 100644
--- a/official/cv/RetinaNet/export.py
+++ b/official/cv/RetinaNet/export.py
@@ -15,8 +15,9 @@
 """export for retinanet"""
 import os
 import numpy as np
+import mindspore
 import mindspore.common.dtype as mstype
-from mindspore import context, Tensor
+from mindspore import Tensor
 from mindspore.train.serialization import load_checkpoint, load_param_into_net, export
 from src.retinanet import retinanet50, resnet50, retinanetInferWithDecoder
 from src.model_utils.config import config
@@ -30,7 +31,7 @@ def modelarts_pre_process():
 
 @moxing_wrapper(pre_process=modelarts_pre_process)
 def model_export():
-    context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, device_id=config.device_id)
+    mindspore.set_context(mode=0, device_target=config.device_target, device_id=config.device_id)
     backbone = resnet50(config.num_classes)
 
     net = retinanet50(backbone, config)
diff --git a/official/cv/RetinaNet/modelarts/train_start.py b/official/cv/RetinaNet/modelarts/train_start.py
index a1b74ceb7..9cc3742dd 100644
--- a/official/cv/RetinaNet/modelarts/train_start.py
+++ b/official/cv/RetinaNet/modelarts/train_start.py
@@ -19,8 +19,8 @@ import os
 import subprocess
 import time
 import moxing as mox
-from mindspore import context
 from src.model_utils.device_adapter import get_device_id, get_device_num, get_rank_id
+import mindspore
 
 _CACHE_DATA_URL = "/cache/data_url"
 _CACHE_TRAIN_URL = "/cache/train_url"
@@ -189,7 +189,7 @@ def download_data(args):
             sync_data(args.train_url, args.output_path)
             print("Workspace downloaded: ", os.listdir(args.output_path))
 
-    context.set_context(save_graphs_path=os.path.join(args.output_path, str(get_rank_id())))
+    mindspore.set_context(save_graphs_path=os.path.join(args.output_path, str(get_rank_id())))
     args.device_num = get_device_num()
     args.device_id = get_device_id()
     # create output dir
diff --git a/official/cv/RetinaNet/src/model_utils/moxing_adapter.py b/official/cv/RetinaNet/src/model_utils/moxing_adapter.py
index c2d228240..344dfc034 100644
--- a/official/cv/RetinaNet/src/model_utils/moxing_adapter.py
+++ b/official/cv/RetinaNet/src/model_utils/moxing_adapter.py
@@ -17,7 +17,7 @@
 
 import os
 import functools
-from mindspore import context
+import mindspore
 from .config import config
 
@@ -101,7 +101,7 @@ def moxing_wrapper(pre_process=None, post_process=None):
                     sync_data(config.train_url, config.output_path)
                     print('Workspace downloaded: ', os.listdir(config.output_path))
 
-                context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id())))
+                mindspore.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id())))
                 config.device_num = get_device_num()
                 config.device_id = get_device_id()
                 if not os.path.exists(config.output_path):
diff --git a/official/cv/RetinaNet/src/retinanet.py b/official/cv/RetinaNet/src/retinanet.py
index ee36151b4..247ae0318 100644
--- a/official/cv/RetinaNet/src/retinanet.py
+++ b/official/cv/RetinaNet/src/retinanet.py
@@ -16,10 +16,10 @@
 """retinanet based resnet."""
 
 import mindspore.common.dtype as mstype
-import mindspore as ms
+import mindspore
 import mindspore.nn as nn
-from mindspore import context, Tensor
-from mindspore.context import ParallelMode
+from mindspore import Tensor
+from mindspore import ParallelMode
 from mindspore.parallel._auto_parallel_context import auto_parallel_context
 from mindspore.communication.management import get_group_size
 from mindspore.ops import operations as P
@@ -292,19 +292,19 @@ class TrainingWrapper(nn.Cell):
         super(TrainingWrapper, self).__init__(auto_prefix=False)
         self.network = network
         self.network.set_grad()
-        self.weights = ms.ParameterTuple(network.trainable_params())
+        self.weights = mindspore.ParameterTuple(network.trainable_params())
         self.optimizer = optimizer
         self.grad = C.GradOperation(get_by_list=True, sens_param=True)
         self.sens = sens
         self.reducer_flag = False
         self.grad_reducer = None
-        self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
+        self.parallel_mode = mindspore.get_auto_parallel_context("parallel_mode")
         if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
             self.reducer_flag = True
         if self.reducer_flag:
-            mean = context.get_auto_parallel_context("gradients_mean")
+            mean = mindspore.get_auto_parallel_context("gradients_mean")
             if auto_parallel_context().get_device_num_is_set():
-                degree = context.get_auto_parallel_context("device_num")
+                degree = mindspore.get_auto_parallel_context("device_num")
             else:
                 degree = get_group_size()
             self.grad_reducer = nn.DistributedGradReducer(optimizer.parameters, mean, degree)
diff --git a/official/cv/RetinaNet/train.py b/official/cv/RetinaNet/train.py
index fcb40b81f..7c8090149 100644
--- a/official/cv/RetinaNet/train.py
+++ b/official/cv/RetinaNet/train.py
@@ -18,12 +18,13 @@ import os
 import ast
 import time
 
+import mindspore
 import mindspore.nn as nn
-from mindspore import context, Tensor
+from mindspore import Tensor
 from mindspore.communication.management import init, get_rank
 from mindspore.train.callback import CheckpointConfig, ModelCheckpoint, LossMonitor, TimeMonitor, Callback
 from mindspore.train import Model
-from mindspore.context import ParallelMode
+from mindspore import ParallelMode
 from mindspore.train.serialization import load_checkpoint, load_param_into_net
 from mindspore.common import set_seed
 from src.retinanet import retinanetWithLossCell, TrainingWrapper, retinanet50, resnet50
@@ -117,7 +118,7 @@ def modelarts_pre_process():
 
 def set_graph_kernel_context(device_target):
     if device_target == "GPU":
         # Enable graph kernel for default model ssd300 on GPU back-end.
-        context.set_context(enable_graph_kernel=True,
+        mindspore.set_context(enable_graph_kernel=True,
                             graph_kernel_flags="--enable_parallel_fusion --enable_expand_ops=Conv2D")
 
@@ -126,10 +127,10 @@ def main():
     config.lr_init = ast.literal_eval(config.lr_init)
     config.lr_end_rate = ast.literal_eval(config.lr_end_rate)
     device_id = get_device_id()
-    context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target)
+    mindspore.set_context(mode=0, device_target=config.device_target)
     if config.device_target == "Ascend":
-        if context.get_context("mode") == context.PYNATIVE_MODE:
-            context.set_context(mempool_block_size="31GB")
+        if mindspore.get_context("mode") == 1:
+            mindspore.set_context(mempool_block_size="31GB")
     elif config.device_target == "GPU":
         set_graph_kernel_context(config.device_target)
     elif config.device_target == "CPU":
@@ -141,12 +142,12 @@ def main():
         init()
         device_num = get_device_num()
         rank = get_rank()
-        context.reset_auto_parallel_context()
-        context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True)
+        mindspore.reset_auto_parallel_context()
+        mindspore.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True)
     else:
         rank = 0
         device_num = 1
-        context.set_context(device_id=device_id)
+        mindspore.set_context(device_id=device_id)
 
     mindrecord_file = os.path.join(config.mindrecord_dir, "retinanet.mindrecord0")
diff --git a/official/cv/SSD/README.md b/official/cv/SSD/README.md
index 21b66dbd6..020ad730c 100644
--- a/official/cv/SSD/README.md
+++ b/official/cv/SSD/README.md
@@ -351,7 +351,8 @@ We need five or seven parameters for this scripts.
 - `PRE_TRAINED :` the path of pretrained checkpoint file, it is better to use absolute path.
 - `PRE_TRAINED_EPOCH_SIZE :` the epoch num of pretrained.
 
-Training result will be stored in the current path, whose folder name begins with "LOG". Under this, you can find checkpoint file together with result like the followings in log
+Training result will be stored in the current path, whose folder name begins with "LOG". Under this, you can find checkpoint file together with result like the following
+in log
 
 ```shell
 epoch: 1 step: 458, loss is 3.1681802
@@ -388,7 +389,8 @@ We need five or seven parameters for this scripts.
 - `PRE_TRAINED :` the path of pretrained checkpoint file, it is better to use absolute path.
 - `PRE_TRAINED_EPOCH_SIZE :` the epoch num of pretrained.
 
-Training result will be stored in the current path, whose folder name is "LOG". Under this, you can find checkpoint files together with result like the followings in log
+Training result will be stored in the current path, whose folder name is "LOG". Under this, you can find checkpoint files together with result like the following
+in log
 
 ```shell
 epoch: 1 step: 1, loss is 420.11783
@@ -431,7 +433,8 @@ We need four parameters for this scripts.
 
 > checkpoint can be produced in training process.
 
-Inference result will be stored in the example path, whose folder name begins with "eval". Under this, you can find result like the followings in log.
+Inference result will be stored in the example path, whose folder name begins with "eval". Under this, you can find result like the following
+in log.
 
 ```shell
 Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.238
@@ -467,7 +470,8 @@ We need four parameters for this scripts.
 
 > checkpoint can be produced in training process.
 
-Inference result will be stored in the example path, whose folder name begins with "eval". Under this, you can find result like the followings in log.
+Inference result will be stored in the example path, whose folder name begins with "eval". Under this, you can find result like the following
+in log.
 
 ```shell
 Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.224
diff --git a/official/cv/SSD/eval.py b/official/cv/SSD/eval.py
index 0fe0e6ce4..b6923d7c4 100644
--- a/official/cv/SSD/eval.py
+++ b/official/cv/SSD/eval.py
@@ -16,7 +16,7 @@
 """Evaluation for SSD"""
 
 import os
-import mindspore as ms
+import mindspore
 from mindspore import Tensor
 from src.ssd import SSD300, SsdInferWithDecoder, ssd_mobilenet_v2, ssd_mobilenet_v1_fpn, ssd_mobilenet_v1, ssd_resnet50_fpn, ssd_vgg16
 from src.dataset import create_ssd_dataset, create_mindrecord
@@ -45,9 +45,9 @@ def ssd_eval(dataset_path, ckpt_path, anno_json):
     net = SsdInferWithDecoder(net, Tensor(default_boxes), config)
 
     print("Load Checkpoint!")
-    param_dict = ms.load_checkpoint(ckpt_path)
+    param_dict = mindspore.load_checkpoint(ckpt_path)
     net.init_parameters_data()
-    ms.load_param_into_net(net, param_dict)
+    mindspore.load_param_into_net(net, param_dict)
 
     net.set_train(False)
     total = ds.get_dataset_size() * batch_size
@@ -77,7 +77,7 @@ def eval_net():
     else:
         raise ValueError('SSD eval only support dataset mode is coco and voc!')
 
-    ms.set_context(mode=ms.GRAPH_MODE, device_target=config.device_target, device_id=config.device_id)
+    mindspore.set_context(mode=0, device_target=config.device_target, device_id=config.device_id)
 
     mindrecord_file = create_mindrecord(config.dataset, "ssd_eval.mindrecord", False)
diff --git a/official/cv/SSD/eval_onnx.py b/official/cv/SSD/eval_onnx.py
index a736ab41c..2a39265fc 100644
--- a/official/cv/SSD/eval_onnx.py
+++ b/official/cv/SSD/eval_onnx.py
@@ -17,12 +17,13 @@ import os
 
 import numpy as np
 import onnxruntime as ort
-from mindspore import context
 
 from src.dataset import create_ssd_dataset, create_mindrecord
 from src.eval_utils import COCOMetrics
 from src.model_utils.config import config
 
+import mindspore
+
 
 def create_session(onnx_path, target_device):
     """Create onnxruntime session"""
@@ -95,7 +96,7 @@ def eval_net():
     else:
         raise ValueError('SSD eval only support dataset mode is coco and voc!')
 
-    context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, device_id=config.device_id)
+    mindspore.set_context(mode=0, device_target=config.device_target, device_id=config.device_id)
 
     mindrecord_file = create_mindrecord(config.dataset, "ssd_eval.mindrecord", False)
diff --git a/official/cv/SSD/export.py b/official/cv/SSD/export.py
index 9917d18d9..653867145 100644
--- a/official/cv/SSD/export.py
+++ b/official/cv/SSD/export.py
@@ -16,16 +16,16 @@
 import os
 import numpy as np
-import mindspore as ms
+import mindspore
 from mindspore import Tensor
 from src.ssd import SSD300, SsdInferWithDecoder, ssd_mobilenet_v2, ssd_mobilenet_v1_fpn, ssd_mobilenet_v1, ssd_resnet50_fpn, ssd_vgg16
 from src.model_utils.config import config
 from src.model_utils.moxing_adapter import moxing_wrapper
 from src.box_utils import default_boxes
 
-ms.set_context(mode=ms.GRAPH_MODE, device_target=config.device_target)
+mindspore.set_context(mode=0, device_target=config.device_target)
 if config.device_target == "Ascend":
-    ms.set_context(device_id=config.device_id)
+    mindspore.set_context(device_id=config.device_id)
 
 
 def modelarts_pre_process():
     '''modelarts pre process function.'''
@@ -56,14 +56,14 @@ def run_export():
 
     net = SsdInferWithDecoder(net, Tensor(default_boxes), config)
 
-    param_dict = ms.load_checkpoint(config.checkpoint_file_path)
+    param_dict = mindspore.load_checkpoint(config.checkpoint_file_path)
     net.init_parameters_data()
-    ms.load_param_into_net(net, param_dict)
+    mindspore.load_param_into_net(net, param_dict)
     net.set_train(False)
 
     input_shp = [config.batch_size, 3] + config.img_shape
-    input_array = Tensor(np.random.uniform(-1.0, 1.0, size=input_shp), ms.float32)
-    ms.export(net, input_array, file_name=config.file_name, file_format=config.file_format)
+    input_array = Tensor(np.random.uniform(-1.0, 1.0, size=input_shp), mindspore.float32)
+    mindspore.export(net, input_array, file_name=config.file_name, file_format=config.file_format)
 
 if __name__ == '__main__':
     run_export()
diff --git a/official/cv/SSD/infer_ssd_mobilenet_v1_fpn_onnx.py b/official/cv/SSD/infer_ssd_mobilenet_v1_fpn_onnx.py
index 346d6bf4e..ee181c203 100644
--- a/official/cv/SSD/infer_ssd_mobilenet_v1_fpn_onnx.py
+++ b/official/cv/SSD/infer_ssd_mobilenet_v1_fpn_onnx.py
@@ -17,12 +17,13 @@ import os
 
 import numpy as np
 import onnxruntime as ort
-from mindspore import context
 
 from src.dataset import create_ssd_dataset, create_mindrecord
 from src.eval_utils import COCOMetrics
 from src.model_utils.config import config
 
+import mindspore
+
 
 def create_session(onnx_path, target_device):
     if target_device == 'GPU':
@@ -96,7 +97,7 @@ def eval_net():
     else:
         raise ValueError('SSD eval only support dataset mode is coco and voc!')
 
-    context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, device_id=config.device_id)
+    mindspore.set_context(mode=0, device_target=config.device_target, device_id=config.device_id)
 
     mindrecord_file = create_mindrecord(config.dataset, "ssd_eval.mindrecord", False)
diff --git a/official/cv/SSD/src/model_utils/moxing_adapter.py b/official/cv/SSD/src/model_utils/moxing_adapter.py
index 72b124bd0..c2cadef73 100644
--- a/official/cv/SSD/src/model_utils/moxing_adapter.py
+++ b/official/cv/SSD/src/model_utils/moxing_adapter.py
@@ -17,7 +17,7 @@
 
 import os
 import functools
-import mindspore as ms
+import mindspore
 from src.model_utils.config import config
 
 _global_sync_count = 0
@@ -92,7 +92,7 @@ def moxing_wrapper(pre_process=None, post_process=None):
                     sync_data(config.train_url, config.output_path)
                     print("Workspace downloaded: ", os.listdir(config.output_path))
 
-                ms.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id())))
+                mindspore.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id())))
                 config.device_num = get_device_num()
                 config.device_id = get_device_id()
                 if not os.path.exists(config.output_path):
diff --git a/official/cv/SSD/src/ssd.py b/official/cv/SSD/src/ssd.py
index faf9804dd..a522f12c6 100644
--- a/official/cv/SSD/src/ssd.py
+++ b/official/cv/SSD/src/ssd.py
@@ -15,10 +15,10 @@
 
 """SSD net based MobilenetV2."""
 
-import mindspore as ms
+import mindspore
 import mindspore.nn as nn
 from mindspore import Tensor
-from mindspore.context import ParallelMode
+from mindspore import ParallelMode
 from mindspore.parallel._auto_parallel_context import auto_parallel_context
 from mindspore.communication.management import get_group_size
 import mindspore.ops as ops
@@ -333,8 +333,8 @@ class SSD300(nn.Cell):
         pred_loc, pred_label = self.multi_box(multi_feature)
         if not self.is_training:
             pred_label = self.activation(pred_label)
-        pred_loc = ops.cast(pred_loc, ms.float32)
-        pred_label = ops.cast(pred_label, ms.float32)
+        pred_loc = ops.cast(pred_loc, mindspore.float32)
+        pred_label = ops.cast(pred_label, mindspore.float32)
         return pred_loc, pred_label
 
 
@@ -364,8 +364,8 @@ class SsdMobilenetV1Fpn(nn.Cell):
         pred_loc, pred_label = self.multi_box(features)
         if not self.training:
             pred_label = self.activation(pred_label)
-        pred_loc = ops.cast(pred_loc, ms.float32)
-        pred_label = ops.cast(pred_label, ms.float32)
+        pred_loc = ops.cast(pred_loc, mindspore.float32)
+        pred_label = ops.cast(pred_label, mindspore.float32)
         return pred_loc, pred_label
 
 
@@ -413,8 +413,8 @@ class SsdMobilenetV1Feature(nn.Cell):
         pred_loc, pred_label = self.multi_box(multi_feature)
         if not self.training:
             pred_label = self.activation(pred_label)
-        pred_loc = ops.cast(pred_loc, ms.float32)
-        pred_label = ops.cast(pred_label, ms.float32)
+        pred_loc = ops.cast(pred_loc, mindspore.float32)
+        pred_label = ops.cast(pred_label, mindspore.float32)
         return pred_loc, pred_label
 
 
@@ -443,8 +443,8 @@ class SsdResNet50Fpn(nn.Cell):
         pred_loc, pred_label = self.multi_box(features)
         if not self.training:
             pred_label = self.activation(pred_label)
-        pred_loc = ops.cast(pred_loc, ms.float32)
-        pred_label = ops.cast(pred_label, ms.float32)
+        pred_loc = ops.cast(pred_loc, mindspore.float32)
+        pred_label = ops.cast(pred_label, mindspore.float32)
         return pred_loc, pred_label
 
 
@@ -465,8 +465,8 @@ class SigmoidFocalClassificationLoss(nn.Cell):
         self.sigmoid = ops.Sigmoid()
         self.pow = ops.Pow()
         self.onehot = ops.OneHot()
-        self.on_value = Tensor(1.0, ms.float32)
-        self.off_value = Tensor(0.0, ms.float32)
+        self.on_value = Tensor(1.0, mindspore.float32)
+        self.off_value = Tensor(0.0, mindspore.float32)
         self.gamma = gamma
         self.alpha = alpha
 
@@ -474,7 +474,7 @@
         label = self.onehot(label, ops.shape(logits)[-1], self.on_value, self.off_value)
         sigmiod_cross_entropy = self.sigmiod_cross_entropy(logits, label)
         sigmoid = self.sigmoid(logits)
-        label = ops.cast(label, ms.float32)
+        label = ops.cast(label, mindspore.float32)
         p_t = label * sigmoid + (1 - label) * (1 - sigmoid)
         modulating_factor = self.pow(1 - p_t, self.gamma)
         alpha_weight_factor = label * self.alpha + (1 - label) * (1 - self.alpha)
@@ -505,8 +505,8 @@ class SSDWithLossCell(nn.Cell):
 
     def construct(self, x, gt_loc, gt_label, num_matched_boxes):
         pred_loc, pred_label = self.network(x)
-        mask = ops.cast(self.less(0, gt_label), ms.float32)
-        num_matched_boxes = self.reduce_sum(ops.cast(num_matched_boxes, ms.float32))
+        mask = ops.cast(self.less(0, gt_label), mindspore.float32)
+        num_matched_boxes = self.reduce_sum(ops.cast(num_matched_boxes, mindspore.float32))
 
         # Localization Loss
         mask_loc = self.tile(self.expand_dims(mask, -1), (1, 1, 4))
@@ -543,20 +543,20 @@ class TrainingWrapper(nn.Cell):
         super(TrainingWrapper, self).__init__(auto_prefix=False)
         self.network = network
         self.network.set_grad()
-        self.weights = ms.ParameterTuple(network.trainable_params())
+        self.weights = mindspore.ParameterTuple(network.trainable_params())
         self.optimizer = optimizer
         self.grad = ops.GradOperation(get_by_list=True, sens_param=True)
         self.sens = sens
         self.reducer_flag = False
         self.grad_reducer = None
         self.use_global_norm = use_global_norm
-        self.parallel_mode = ms.get_auto_parallel_context("parallel_mode")
+        self.parallel_mode = mindspore.get_auto_parallel_context("parallel_mode")
         if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
             self.reducer_flag = True
         if self.reducer_flag:
-            mean = ms.get_auto_parallel_context("gradients_mean")
+            mean = mindspore.get_auto_parallel_context("gradients_mean")
             if auto_parallel_context().get_device_num_is_set():
-                degree = ms.get_auto_parallel_context("device_num")
+                degree = mindspore.get_auto_parallel_context("device_num")
             else:
                 degree = get_group_size()
             self.grad_reducer = nn.DistributedGradReducer(optimizer.parameters, mean, degree)
@@ -760,8 +760,8 @@ class SSD300VGG16(nn.Cell):
         pred_loc, pred_label = self.multi_box(multi_feature)
         if not self.training:
             pred_label = self.activation(pred_label)
-        pred_loc = ops.cast(pred_loc, ms.float32)
-        pred_label = ops.cast(pred_label, ms.float32)
+        pred_loc = ops.cast(pred_loc, mindspore.float32)
+        pred_label = ops.cast(pred_label, mindspore.float32)
         return pred_loc, pred_label
diff --git a/official/cv/SSD/train.py b/official/cv/SSD/train.py
index 8e8e0d55f..16bff8c78 100644
--- a/official/cv/SSD/train.py
+++ b/official/cv/SSD/train.py
@@ -16,13 +16,13 @@
 """Train SSD and get checkpoint files."""
 
 import os
-import mindspore as ms
+import mindspore
 import mindspore.nn as nn
 from mindspore import Tensor
 from mindspore.communication.management import init, get_rank
 from mindspore.train.callback import CheckpointConfig, ModelCheckpoint, LossMonitor, TimeMonitor
 from mindspore.train import Model
-from mindspore.context import ParallelMode
+from mindspore import ParallelMode
 from mindspore.common import set_seed, dtype
 import mindspore.log as logger
 from src.ssd import SSD300, SsdInferWithDecoder, SSDWithLossCell, TrainingWrapper, ssd_mobilenet_v2, \
@@ -51,40 +51,40 @@ def ssd_model_build():
         ssd = ssd_mobilenet_v1_fpn(config=config)
         init_net_param(ssd)
         if config.feature_extractor_base_param != "":
-            param_dict = ms.load_checkpoint(config.feature_extractor_base_param)
+            param_dict = mindspore.load_checkpoint(config.feature_extractor_base_param)
             for x in list(param_dict.keys()):
                 param_dict["network.feature_extractor.mobilenet_v1." + x] = param_dict[x]
                 del param_dict[x]
-            ms.load_param_into_net(ssd.feature_extractor.mobilenet_v1.network, param_dict)
+            mindspore.load_param_into_net(ssd.feature_extractor.mobilenet_v1.network, param_dict)
     elif config.model_name == "ssd_mobilenet_v1":
         ssd = ssd_mobilenet_v1(config=config)
         init_net_param(ssd)
         if config.feature_extractor_base_param != "":
-            param_dict = ms.load_checkpoint(config.feature_extractor_base_param)
+            param_dict = mindspore.load_checkpoint(config.feature_extractor_base_param)
             for x in list(param_dict.keys()):
                 param_dict["network.feature_extractor.mobilenet_v1." + x] = param_dict[x]
                 del param_dict[x]
-            ms.load_param_into_net(ssd.feature_extractor.mobilenet_v1.network, param_dict)
+            mindspore.load_param_into_net(ssd.feature_extractor.mobilenet_v1.network, param_dict)
     elif config.model_name == "ssd_resnet50_fpn":
         ssd = ssd_resnet50_fpn(config=config)
         init_net_param(ssd)
         if config.feature_extractor_base_param != "":
-            param_dict = ms.load_checkpoint(config.feature_extractor_base_param)
+            param_dict = mindspore.load_checkpoint(config.feature_extractor_base_param)
             for x in list(param_dict.keys()):
                 param_dict["network.feature_extractor.resnet." + x] = param_dict[x]
                 del param_dict[x]
-            ms.load_param_into_net(ssd.feature_extractor.resnet, param_dict)
+            mindspore.load_param_into_net(ssd.feature_extractor.resnet, param_dict)
     elif config.model_name == "ssd_vgg16":
         ssd = ssd_vgg16(config=config)
         init_net_param(ssd)
         if config.feature_extractor_base_param != "":
-            param_dict = ms.load_checkpoint(config.feature_extractor_base_param)
+            param_dict = mindspore.load_checkpoint(config.feature_extractor_base_param)
             from src.vgg16 import ssd_vgg_key_mapper
             for k in ssd_vgg_key_mapper:
                 v = ssd_vgg_key_mapper[k]
                 param_dict["network.backbone." + v + ".weight"] = param_dict[k + ".weight"]
                 del param_dict[k + ".weight"]
-            ms.load_param_into_net(ssd.backbone, param_dict)
+            mindspore.load_param_into_net(ssd.backbone, param_dict)
     else:
         raise ValueError(f'config.model: {config.model_name} is not supported')
     return ssd
@@ -93,23 +93,23 @@ def ssd_model_build():
 
 def set_graph_kernel_context(device_target, model):
     if device_target == "GPU" and model == "ssd300":
         # Enable graph kernel for default model ssd300 on GPU back-end.
-        ms.set_context(enable_graph_kernel=True,
+        mindspore.set_context(enable_graph_kernel=True,
                        graph_kernel_flags="--enable_parallel_fusion --enable_expand_ops=Conv2D")
     if device_target == "GPU" and model == "ssd_mobilenet_v1":
         # Enable graph kernel for default model ssd300 on GPU back-end.
-        ms.context.set_context(enable_graph_kernel=True,
+        mindspore.set_context(enable_graph_kernel=True,
                                graph_kernel_flags="--enable_parallel_fusion --enable_expand_ops=Conv2D")
 
 
 def set_ascend_pynative_mempool_block_size():
-    if ms.get_context("mode") == ms.PYNATIVE_MODE and config.device_target == "Ascend":
-        ms.set_context(mempool_block_size="31GB")
+    if mindspore.get_context("mode") == 1 and config.device_target == "Ascend":
+        mindspore.set_context(mempool_block_size="31GB")
 
 
 def set_ascend_max_device_memory():
-    if ms.get_context("enable_ge") and ms.get_context("mode") == ms.GRAPH_MODE:
+    if mindspore.get_context("enable_ge") and mindspore.get_context("mode") == 0:
         logger.warning("When encountering a memory shortage situation in 1980B, reduce the max_device_memory.")
-        ms.set_context(max_device_memory="50GB")
+        mindspore.set_context(max_device_memory="50GB")
 
 
 @moxing_wrapper()
@@ -126,22 +126,22 @@ def train_net():
     loss_scale = float(config.loss_scale)
     if config.device_target == "CPU":
         loss_scale = 1.0
-        ms.set_context(mode=ms.GRAPH_MODE, device_target="CPU")
+        mindspore.set_context(mode=0, device_target="CPU")
     else:
-        ms.set_context(mode=ms.GRAPH_MODE, device_target=config.device_target, device_id=config.device_id)
+        mindspore.set_context(mode=0, device_target=config.device_target, device_id=config.device_id)
         # Only works on ascend chip of 1980B
-        ms.set_context(ascend_config={"precision_mode": "allow_fp32_to_fp16"})
+        mindspore.set_context(ascend_config={"precision_mode": "allow_fp32_to_fp16"})
         set_graph_kernel_context(config.device_target, config.model_name)
         set_ascend_pynative_mempool_block_size()
         set_ascend_max_device_memory()
     if config.run_distribute:
         device_num = config.device_num
-        ms.reset_auto_parallel_context()
-        ms.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True,
+        mindspore.reset_auto_parallel_context()
+        mindspore.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True,
                                      device_num=device_num)
         init()
         if config.all_reduce_fusion_config:
-            ms.set_auto_parallel_context(all_reduce_fusion_config=config.all_reduce_fusion_config)
+            mindspore.set_auto_parallel_context(all_reduce_fusion_config=config.all_reduce_fusion_config)
         rank = get_rank()
 
     mindrecord_file = create_mindrecord(config.dataset, "ssd.mindrecord", True)
@@ -167,10 +167,10 @@ def train_net():
     ckpoint_cb = ModelCheckpoint(prefix="ssd", directory=ckpt_save_dir, config=ckpt_config)
 
     if config.pre_trained:
-        param_dict = ms.load_checkpoint(config.pre_trained)
+        param_dict = mindspore.load_checkpoint(config.pre_trained)
         if config.filter_weight:
             filter_checkpoint_parameter_by_list(param_dict, config.checkpoint_filter_list)
-        ms.load_param_into_net(net, param_dict, True)
+        mindspore.load_param_into_net(net, param_dict, True)
 
     lr = Tensor(get_lr(global_step=config.pre_trained_epoch_size * dataset_size,
                        lr_init=config.lr_init, lr_end=config.lr_end_rate * config.lr, lr_max=config.lr,
diff --git a/official/cv/ShuffleNet/shufflenetv1/eval.py b/official/cv/ShuffleNet/shufflenetv1/eval.py
index 83ba8464c..00e50cbf6 100644
--- a/official/cv/ShuffleNet/shufflenetv1/eval.py
+++ b/official/cv/ShuffleNet/shufflenetv1/eval.py
@@ -14,7 +14,8 @@
 # ============================================================================
 """test ShuffleNetV1"""
 import time
-from mindspore import context, nn
+import mindspore
+from mindspore import nn
 from mindspore.train.model import Model
 from mindspore.common import set_seed
 from mindspore.train.serialization import load_checkpoint, load_param_into_net
@@ -30,7 +31,7 @@ set_seed(1)
 
 @moxing_wrapper(pre_process=None)
 def test():
-    context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, save_graphs=False,
+    mindspore.set_context(mode=0, device_target=config.device_target, save_graphs=False,
                         device_id=get_device_id())
 
     # create dataset
diff --git a/official/cv/ShuffleNet/shufflenetv1/export.py b/official/cv/ShuffleNet/shufflenetv1/export.py
index 5f5709d8c..f9618edd7 100644
--- a/official/cv/ShuffleNet/shufflenetv1/export.py
+++ b/official/cv/ShuffleNet/shufflenetv1/export.py
@@ -18,14 +18,14 @@ suggest run as python export.py --file_name [file name] --ckpt_path [ckpt path]
 """
 import os
 import numpy as np
-import mindspore as ms
-from mindspore import Tensor, load_checkpoint, load_param_into_net, export, context
+import mindspore
+from mindspore import Tensor, load_checkpoint, load_param_into_net, export
 from src.model_utils.config import config
 from src.shufflenetv1 import ShuffleNetV1
 from src.model_utils.moxing_adapter import moxing_wrapper
 
-context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target)
+mindspore.set_context(mode=0, device_target=config.device_target)
 
 
 def modelarts_pre_process():
@@ -33,7 +33,7 @@ def modelarts_pre_process():
 
 
 if config.device_target == "Ascend":
-    context.set_context(device_id=config.device_id)
+    mindspore.set_context(device_id=config.device_id)
 
 
 @moxing_wrapper(pre_process=modelarts_pre_process)
@@ -44,7 +44,7 @@ def model_export():
     load_param_into_net(net, param_dict)
 
     image_height, image_width = (224, 224)
-    input_arr = Tensor(np.ones([config.batch_size, 3, image_height, image_width]), ms.float32)
+    input_arr = Tensor(np.ones([config.batch_size, 3, image_height, image_width]), mindspore.float32)
     export(net, input_arr, file_name=config.file_name, file_format=config.file_format)
diff --git a/official/cv/ShuffleNet/shufflenetv1/infer_shufflenetv1_onnx.py b/official/cv/ShuffleNet/shufflenetv1/infer_shufflenetv1_onnx.py
index b561b17fa..6636f6c2b 100644
--- a/official/cv/ShuffleNet/shufflenetv1/infer_shufflenetv1_onnx.py
+++ b/official/cv/ShuffleNet/shufflenetv1/infer_shufflenetv1_onnx.py
@@ -15,7 +15,7 @@
 """test ShuffleNetV1"""
 import onnxruntime
 import numpy as np
-import mindspore as ms
+import mindspore
 from mindspore import Tensor
 from mindspore import ops
 from src.dataset import create_dataset
@@ -51,7 +51,7 @@ def test():
             model_predict = np.expand_dims(np.squeeze(model_predict), axis=0)
             for predict, label in zip(model_predict[0], labels):
                 cnt = cnt + 1
-                input_x = Tensor(predict, ms.float16)
+                input_x = Tensor(predict, mindspore.float16)
                 _, k_label = topk(input_x, k)
                 if k_label[0] == label:
                     correct_top1 = correct_top1 + 1
diff --git a/official/cv/ShuffleNet/shufflenetv1/src/model_utils/moxing_adapter.py b/official/cv/ShuffleNet/shufflenetv1/src/model_utils/moxing_adapter.py
index c2d228240..344dfc034 100644
--- a/official/cv/ShuffleNet/shufflenetv1/src/model_utils/moxing_adapter.py
+++ b/official/cv/ShuffleNet/shufflenetv1/src/model_utils/moxing_adapter.py
@@ -17,7 +17,7 @@
 
 import os
 import functools
-from mindspore import context
+import mindspore
 from .config import config
 
@@ -101,7 +101,7 @@ def moxing_wrapper(pre_process=None, post_process=None):
                     sync_data(config.train_url, config.output_path)
                     print('Workspace downloaded: ', os.listdir(config.output_path))
 
-                context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id())))
+                mindspore.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id())))
                 config.device_num = get_device_num()
                 config.device_id = get_device_id()
                 if not os.path.exists(config.output_path):
diff --git a/official/cv/ShuffleNet/shufflenetv1/train.py b/official/cv/ShuffleNet/shufflenetv1/train.py
index e758ce9c6..4a4b5ef16 100644
--- a/official/cv/ShuffleNet/shufflenetv1/train.py
+++ b/official/cv/ShuffleNet/shufflenetv1/train.py
@@ -15,11 +15,12 @@
 """train ShuffleNetV1"""
 import os
 import time
-from mindspore import context, nn
+import mindspore
+from mindspore import nn
 from mindspore import Tensor
 from mindspore.common import set_seed
 from mindspore.nn.optim.momentum import Momentum
-from mindspore.context import ParallelMode
+from mindspore import ParallelMode
 from mindspore.train.model import Model
 from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor, LossMonitor, Callback
 from mindspore.train.serialization import load_checkpoint, load_param_into_net, save_checkpoint
@@ -110,24 +111,24 @@ def modelarts_pre_process():
 
 @moxing_wrapper(pre_process=modelarts_pre_process)
 def train():
-    context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, save_graphs=False)
+    mindspore.set_context(mode=0, device_target=config.device_target, save_graphs=False)
     # init distributed
     if config.is_distributed:
         if os.getenv('DEVICE_ID', "not_set").isdigit():
-            context.set_context(device_id=get_device_id())
+            mindspore.set_context(device_id=get_device_id())
         init()
         rank = get_rank()
         group_size = get_group_size()
         parallel_mode = ParallelMode.DATA_PARALLEL
-        context.set_auto_parallel_context(parallel_mode=parallel_mode, device_num=group_size, gradients_mean=True)
+        mindspore.set_auto_parallel_context(parallel_mode=parallel_mode, device_num=group_size, gradients_mean=True)
     else:
         rank = 0
         group_size = 1
-        context.set_context(device_id=config.device_id)
+        mindspore.set_context(device_id=config.device_id)
 
     if config.device_target == "GPU":
-        context.set_context(enable_graph_kernel=True)
+        mindspore.set_context(enable_graph_kernel=True)
 
     # define network
     net = ShuffleNetV1(model_size=config.model_size, n_class=config.num_classes)
diff --git a/official/cv/ShuffleNet/shufflenetv2/cpu_transfer.py b/official/cv/ShuffleNet/shufflenetv2/cpu_transfer.py
index e7e5c36ad..43e2960c1 100644
--- a/official/cv/ShuffleNet/shufflenetv2/cpu_transfer.py
+++ b/official/cv/ShuffleNet/shufflenetv2/cpu_transfer.py
@@ -16,7 +16,7 @@
 import argparse
 import ast
 import time
-from mindspore import context
+import mindspore
 from mindspore import Tensor
 from mindspore.common import set_seed
 from mindspore.nn.optim.momentum import Momentum
@@ -58,10 +58,10 @@ if __name__ == '__main__':
                         help='run platform(Default:Ascend)')
     args_opt = parser.parse_args()
     if args_opt.use_pynative_mode:
-        context.set_context(mode=context.PYNATIVE_MODE, device_target=args_opt.platform,
+        mindspore.set_context(mode=1, device_target=args_opt.platform,
                             device_id=config_cpu.device_id)
     else:
-        context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.platform,
+        mindspore.set_context(mode=0, device_target=args_opt.platform,
                             device_id=config_cpu.device_id, save_graphs=False)
 
     # define network
diff --git a/official/cv/ShuffleNet/shufflenetv2/eval.py b/official/cv/ShuffleNet/shufflenetv2/eval.py
index d0e20b91f..71f22a4c5 100644
--- a/official/cv/ShuffleNet/shufflenetv2/eval.py
+++ b/official/cv/ShuffleNet/shufflenetv2/eval.py
@@ -17,10 +17,8 @@ import argparse
 import ast
 import os
 import time
-
+import mindspore
 import mindspore.nn as nn
-
-from mindspore import context
 from mindspore.train.model import Model
 from mindspore.train.serialization import load_checkpoint, load_param_into_net
 
@@ -88,10 +86,10 @@ if __name__ == '__main__':
     print('device_id = ', device_id)
 
     if args_opt.use_pynative_mode:
-        context.set_context(mode=context.PYNATIVE_MODE, device_target=args_opt.platform,
+        mindspore.set_context(mode=1, device_target=args_opt.platform,
                             device_id=device_id)
     else:
-        context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.platform,
+        mindspore.set_context(mode=0, device_target=args_opt.platform,
                             device_id=device_id, save_graphs=False)
 
     dataset_path = args_opt.dataset_path
diff --git a/official/cv/ShuffleNet/shufflenetv2/export.py b/official/cv/ShuffleNet/shufflenetv2/export.py
index 30666f759..e7b0f6019 100644
--- a/official/cv/ShuffleNet/shufflenetv2/export.py
+++ b/official/cv/ShuffleNet/shufflenetv2/export.py
@@ -17,8 +17,8 @@ import argparse
 import ast
 import numpy as np
 
-import mindspore as ms
-from mindspore import context, Tensor, load_checkpoint, load_param_into_net, export
+import mindspore
+from mindspore import Tensor, load_checkpoint, load_param_into_net, export
 
 from src.config import config_gpu as cfg
 from src.shufflenetv2 import ShuffleNetV2
@@ -44,14 +44,14 @@ if __name__ == '__main__':
     if args.overwrite_config:
         cfg.num_classes = args.num_classes
 
-    context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target)
+    mindspore.set_context(mode=0, device_target=args.device_target)
     if args.device_target == "Ascend" or args.device_target == "GPU":
-        context.set_context(device_id=args.device_id)
+        mindspore.set_context(device_id=args.device_id)
 
     net = ShuffleNetV2(n_class=cfg.num_classes)
 
     ckpt = load_checkpoint(args.ckpt_file)
     load_param_into_net(net, ckpt)
 
     net.set_train(False)
 
-    input_data = Tensor(np.ones([args.batch_size, 3, args.height, args.width]), ms.float32)
+    input_data = Tensor(np.ones([args.batch_size, 3, args.height, args.width]), mindspore.float32)
     export(net, input_data, file_name=args.file_name, file_format=args.file_format)
diff --git a/official/cv/ShuffleNet/shufflenetv2/infer_shufflenetv2_onnx.py b/official/cv/ShuffleNet/shufflenetv2/infer_shufflenetv2_onnx.py
index f753c2946..07378afb6 100644
--- a/official/cv/ShuffleNet/shufflenetv2/infer_shufflenetv2_onnx.py
+++ b/official/cv/ShuffleNet/shufflenetv2/infer_shufflenetv2_onnx.py
@@ -17,7 +17,7 @@ import argparse
 
 import numpy as np
 import onnxruntime
-import mindspore as ms
+import mindspore
 from mindspore import Tensor
 from mindspore import ops
 from src.dataset import create_dataset
@@ -54,7 +54,7 @@ def test(onnx_path, onnx_dataset_path, device_target, device_id):
             model_predict = session.run(None, inputs)
             model_predict = np.expand_dims(np.squeeze(model_predict), axis=0)
 
-            input_x = Tensor(model_predict[0], ms.float16)
+            input_x = Tensor(model_predict[0], mindspore.float16)
             _, k_label = topk(input_x, k)
             if k_label[0] == labels:
                 correct_top1 = correct_top1 + 1
diff --git a/official/cv/ShuffleNet/shufflenetv2/modelarts/train_start.py b/official/cv/ShuffleNet/shufflenetv2/modelarts/train_start.py
index 592709420..d90af0291 100644
--- a/official/cv/ShuffleNet/shufflenetv2/modelarts/train_start.py
+++ b/official/cv/ShuffleNet/shufflenetv2/modelarts/train_start.py
@@ -19,10 +19,9 @@ import os
 import time
 import numpy as np
 
+import mindspore
 import mindspore.nn as nn
-
-from mindspore import context
-from mindspore.context import ParallelMode
+from mindspore import ParallelMode
 from mindspore import Tensor
 from mindspore.communication.management import init, get_rank, get_group_size
 from mindspore.nn.optim.momentum import Momentum
@@ -60,7 +59,7 @@ def export_models(checkpoint_path):
     if args_opt.export_mindir_model:
         export(network, input_data, file_name=output_file, file_format="AIR")
-    if args_opt.export_air_model and context.get_context("device_target") == "Ascend":
+    if args_opt.export_air_model and mindspore.get_context("device_target") == "Ascend":
         export(network, input_data, file_name=output_file, file_format="AIR")
     if args_opt.export_onnx_model:
         export(network, input_data, file_name=output_file, file_format="ONNX")
@@ -166,10 +165,10 @@ if __name__ == '__main__':
     set_seed(config.random_seed)
 
     if args_opt.use_pynative_mode:
-        context.set_context(mode=context.PYNATIVE_MODE, device_target=args_opt.platform)
+        mindspore.set_context(mode=1, device_target=args_opt.platform)
         print('mode = PYNATIVE_MODE')
     else:
-        context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.platform, save_graphs=False)
+        mindspore.set_context(mode=0, device_target=args_opt.platform, save_graphs=False)
         print('mode = GRAPH_MODE')
 
     # init distributed
@@ -187,13 +186,13 @@ if __name__ == '__main__':
             device_id = get_rank()
             config.group_size = get_group_size()
-            context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL,
+            mindspore.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL,
                                               device_num=config.group_size, gradients_mean=True)
     else:
         device_id = args_opt.device_id
         config.group_size = 1
-        context.set_context(device_id=device_id)
+        mindspore.set_context(device_id=device_id)
     rank_id = device_id
     config.rank = rank_id
     print('rank_id = ', rank_id, ' group_size = ', config.group_size)
diff --git a/official/cv/ShuffleNet/shufflenetv2/train.py b/official/cv/ShuffleNet/shufflenetv2/train.py
index c132ab257..649410de6 100644
--- a/official/cv/ShuffleNet/shufflenetv2/train.py
+++ b/official/cv/ShuffleNet/shufflenetv2/train.py
@@ -18,9 +18,9 @@ import ast
 import os
 import time
 
+import mindspore
 import mindspore.nn as nn
-from mindspore import context
-from mindspore.context import ParallelMode
+from mindspore import ParallelMode
 from mindspore import Tensor
 from mindspore.communication.management import init, get_rank, get_group_size
 from mindspore.nn.optim.momentum import Momentum
@@ -103,15 +103,15 @@ if __name__ == '__main__':
     set_seed(config.random_seed)
 
     if args_opt.use_pynative_mode:
-        context.set_context(mode=context.PYNATIVE_MODE, device_target=args_opt.platform)
+        mindspore.set_context(mode=1, device_target=args_opt.platform)
         print('mode = PYNATIVE_MODE')
     else:
-        context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.platform, save_graphs=False)
+        mindspore.set_context(mode=0, device_target=args_opt.platform, save_graphs=False)
         print('mode = GRAPH_MODE')
 
     # Set mempool block size in PYNATIVE_MODE for improving memory utilization, which will not take effect in GRAPH_MODE
-    if context.get_context("mode") == context.PYNATIVE_MODE:
-        context.set_context(mempool_block_size="25GB")
+    if mindspore.get_context("mode") == 1:
+        mindspore.set_context(mempool_block_size="25GB")
 
     # init distributed
     if args_opt.is_distributed:
@@ -128,13 +128,13 @@ if __name__ == '__main__':
             device_id = get_rank()
             config.group_size = get_group_size()
-            context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL,
+            mindspore.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL,
                                               device_num=config.group_size, gradients_mean=True)
     else:
         device_id = args_opt.device_id
         config.group_size = 1
-        context.set_context(device_id=device_id)
+        mindspore.set_context(device_id=device_id)
     rank_id = device_id
     config.rank = rank_id
     print('rank_id = ', rank_id, ' group_size = ', config.group_size)
diff --git a/official/cv/SwinTransformer/eval.py b/official/cv/SwinTransformer/eval.py
index f26694e15..9f1eaebcc 100644
--- a/official/cv/SwinTransformer/eval.py
+++ b/official/cv/SwinTransformer/eval.py
@@ -15,7 +15,6 @@
 """eval"""
 
 from mindspore import Model
-from mindspore import context
 from mindspore import nn
 from mindspore.common import set_seed
 
@@ -25,18 +24,20 @@ from src.tools.criterion import get_criterion, NetWithLoss
 from src.tools.get_misc import get_dataset, set_device, get_model, pretrained, get_train_one_step
 from src.tools.optimizer import get_optimizer
 
+import mindspore
+
 set_seed(args.seed)
 
 
 def main():
     mode = {
-        0: context.GRAPH_MODE,
-        1: context.PYNATIVE_MODE
+        0: 0,
+        1: 1
     }
-    context.set_context(mode=mode[args.graph_mode], device_target=args.device_target)
-    context.set_context(enable_graph_kernel=False)
+    mindspore.set_context(mode=mode[args.graph_mode], device_target=args.device_target)
+    mindspore.set_context(enable_graph_kernel=False)
     if args.device_target == "Ascend":
-        context.set_context(enable_auto_mixed_precision=True)
+        mindspore.set_context(enable_auto_mixed_precision=True)
     set_device(args)
 
     # get model
diff --git a/official/cv/SwinTransformer/export.py b/official/cv/SwinTransformer/export.py
index 692a104e4..3aa941a04 100644
--- a/official/cv/SwinTransformer/export.py
+++ b/official/cv/SwinTransformer/export.py
@@ -16,9 +16,9 @@
 ##############export checkpoint file into air, onnx or mindir model#################
 python export.py
 """
-
 import numpy as np
-from mindspore import Tensor, load_checkpoint, load_param_into_net, export, context
+import mindspore
+from mindspore import Tensor, load_checkpoint, load_param_into_net, export
 from mindspore import dtype as mstype
 
 from src.args import args
@@ -26,10 +26,10 @@ from src.tools.cell import cast_amp
 from src.tools.criterion import get_criterion, NetWithLoss
 from src.tools.get_misc import get_model
 
-context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target)
+mindspore.set_context(mode=0, device_target=args.device_target)
 
 if args.device_target in ["Ascend", "GPU"]:
-    context.set_context(device_id=args.device_id)
+    mindspore.set_context(device_id=args.device_id)
 
 if __name__ == '__main__':
     net = get_model(args)
diff --git a/official/cv/SwinTransformer/src/tools/get_misc.py b/official/cv/SwinTransformer/src/tools/get_misc.py
index 73ae63120..abdf60a69 100644
--- a/official/cv/SwinTransformer/src/tools/get_misc.py
+++ b/official/cv/SwinTransformer/src/tools/get_misc.py
@@ -15,10 +15,10 @@
 """misc functions for program"""
 import os
 
-from mindspore import context
+import mindspore
 from mindspore import nn
 from mindspore.communication.management import init, get_rank
-from mindspore.context import ParallelMode
+from mindspore import ParallelMode
 from mindspore.train.serialization import load_checkpoint, load_param_into_net
 
 from src import models, data
@@ -35,25 +35,25 @@ def set_device(args):
 
     if device_target == "Ascend":
         if device_num > 1:
-            context.set_context(device_id=int(os.environ["DEVICE_ID"]))
+            mindspore.set_context(device_id=int(os.environ["DEVICE_ID"]))
             init(backend_name='hccl')
-            context.reset_auto_parallel_context()
-            context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
+            mindspore.reset_auto_parallel_context()
+            mindspore.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
                                               gradients_mean=True)
-            # context.set_auto_parallel_context(pipeline_stages=2, full_batch=True)
+            # mindspore.set_auto_parallel_context(pipeline_stages=2, full_batch=True)
             rank = get_rank()
         else:
-            context.set_context(device_id=args.device_id)
+            mindspore.set_context(device_id=args.device_id)
     elif device_target == "GPU":
         if device_num > 1:
             init(backend_name='nccl')
-            context.reset_auto_parallel_context()
-            context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
+            mindspore.reset_auto_parallel_context()
+            mindspore.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
                                               gradients_mean=True)
             rank = get_rank()
         else:
-            context.set_context(device_id=args.device_id)
+            mindspore.set_context(device_id=args.device_id)
     else:
         raise ValueError("Unsupported platform.")
diff --git a/official/cv/SwinTransformer/train.py b/official/cv/SwinTransformer/train.py
index c4174fc82..feff1525a 100644
--- a/official/cv/SwinTransformer/train.py
+++ b/official/cv/SwinTransformer/train.py
@@ -15,8 +15,8 @@
 """train"""
 import os
 
+import mindspore
 from mindspore import Model
-from mindspore import context
 from mindspore import nn
 from mindspore.common import set_seed
 from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor
@@ -33,14 +33,14 @@ def main():
     assert args.crop, f"{args.arch} is only for evaluation"
     set_seed(args.seed)
     mode = {
-        0: context.GRAPH_MODE,
-        1: context.PYNATIVE_MODE
+        0: 0,
+        1: 1
    }
-    context.set_context(mode=mode[args.graph_mode], device_target=args.device_target)
+    mindspore.set_context(mode=mode[args.graph_mode], device_target=args.device_target)
     if args.device_target == "GPU":
-        context.set_context(enable_graph_kernel=True)
+        mindspore.set_context(enable_graph_kernel=True)
     if args.device_target == "Ascend":
-        context.set_context(enable_auto_mixed_precision=True)
+        mindspore.set_context(enable_auto_mixed_precision=True)
     rank = set_device(args)
 
     # get model and cast amp_level
diff --git a/official/cv/Unet/eval.py b/official/cv/Unet/eval.py
index d34b80a7a..5ab4b111f 100644
--- a/official/cv/Unet/eval.py
+++ b/official/cv/Unet/eval.py
@@ -14,7 +14,8 @@
 # ============================================================================
 
 import logging
-from mindspore import context, Model
+import mindspore
+from mindspore import Model
 from mindspore.train.serialization import load_checkpoint, load_param_into_net
 
 from src.data_loader import create_dataset, create_multi_class_dataset
@@ -59,10 +60,10 @@ def test_net(data_dir,
 
 if __name__ == '__main__':
     logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
-    context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, save_graphs=False)
+    mindspore.set_context(mode=0, device_target=config.device_target, save_graphs=False)
     if config.device_target == "Ascend":
         device_id = get_device_id()
-        context.set_context(device_id=device_id)
+        mindspore.set_context(device_id=device_id)
     test_net(data_dir=config.data_path,
              ckpt_path=config.checkpoint_file_path,
             cross_valid_ind=config.cross_valid_ind)
diff --git a/official/cv/Unet/export.py b/official/cv/Unet/export.py
index 300fe1c02..b32da9333 100644
--- a/official/cv/Unet/export.py
+++ b/official/cv/Unet/export.py
@@ -16,7 +16,8 @@
 import os
 import numpy as np
 
-from mindspore import Tensor, export, load_checkpoint, load_param_into_net, context
+import mindspore
+from mindspore import Tensor, export, load_checkpoint, load_param_into_net
 
 from src.unet_medical.unet_model import UNetMedical
 from src.unet_nested import NestedUNet, UNet
@@ -26,9 +27,9 @@ from src.model_utils.device_adapter import get_device_id
 from src.model_utils.moxing_adapter import moxing_wrapper
 
 
-context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target)
+mindspore.set_context(mode=0, device_target=config.device_target)
 if config.device_target == "Ascend":
-    context.set_context(device_id=get_device_id())
+    mindspore.set_context(device_id=get_device_id())
 
 
 def modelarts_pre_process():
     '''modelarts pre process function.'''
diff --git a/official/cv/Unet/golden_stick/pruner/uni_pruning/eval.py b/official/cv/Unet/golden_stick/pruner/uni_pruning/eval.py
index 4e87deff2..819449cc7 100644
--- a/official/cv/Unet/golden_stick/pruner/uni_pruning/eval.py
+++ b/official/cv/Unet/golden_stick/pruner/uni_pruning/eval.py
@@ -16,7 +16,8 @@
 import logging
 import os
 import json
-from mindspore import context, Model
+import mindspore
+from mindspore import Model
 from mindspore.train.serialization import load_checkpoint, load_param_into_net
 from mindspore_gs.pruner.uni_pruning import UniPruner
 
@@ -83,7 +84,7 @@ def test_net(data_dir,
 
 if __name__ == '__main__':
     logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
-    context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, save_graphs=False)
+    mindspore.set_context(mode=0, device_target=config.device_target, save_graphs=False)
     assert config.device_target == "GPU"
     test_net(data_dir=config.data_path,
              ckpt_path=config.checkpoint_file_path,
diff --git a/official/cv/Unet/golden_stick/pruner/uni_pruning/train.py b/official/cv/Unet/golden_stick/pruner/uni_pruning/train.py
index 0c776aeab..3aa899ee9 100644
--- a/official/cv/Unet/golden_stick/pruner/uni_pruning/train.py
+++ b/official/cv/Unet/golden_stick/pruner/uni_pruning/train.py
@@ -17,10 +17,10 @@ import logging
 
 import mindspore
 import mindspore.nn as nn
-from mindspore import Model, context
+from mindspore import Model
 from mindspore.communication.management import init, get_rank, get_group_size
 from mindspore.train.callback import CheckpointConfig, ModelCheckpoint
-from mindspore.context import ParallelMode
+from mindspore import ParallelMode
 from mindspore.train.serialization import load_checkpoint, load_param_into_net
 from mindspore_gs.pruner.uni_pruning import UniPruner
 
@@ -47,7 +47,7 @@ def train_net(cross_valid_ind=1,
         group_size = get_group_size()
         rank = get_rank()
         parallel_mode = ParallelMode.DATA_PARALLEL
-        context.set_auto_parallel_context(parallel_mode=parallel_mode,
+        mindspore.set_auto_parallel_context(parallel_mode=parallel_mode,
                                           device_num=group_size,
                                           gradients_mean=False)
     net = UNetMedical(n_channels=config.num_channels, n_classes=config.num_classes)
@@ -121,7 +121,7 @@ def train_net(cross_valid_ind=1,
 
 if __name__ == '__main__':
     logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
     # to keep GetNext from timeout, set op_timeout=600
-    context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, save_graphs=False, op_timeout=600)
+    mindspore.set_context(mode=0, device_target=config.device_target, save_graphs=False, op_timeout=600)
     assert config.device_target == "GPU"
     epoch_size = config.epoch_size if not config.run_distribute else config.distribute_epochs
     batchsize = config.batch_size
diff --git a/official/cv/Unet/src/model_utils/moxing_adapter.py b/official/cv/Unet/src/model_utils/moxing_adapter.py
index aabd5ac6c..a5337e688 100644
--- a/official/cv/Unet/src/model_utils/moxing_adapter.py
+++ b/official/cv/Unet/src/model_utils/moxing_adapter.py
@@ -17,8 +17,8 @@
 
 import os
 import functools
-from mindspore import context
 from src.model_utils.config import config
+import mindspore
 
 _global_sync_count = 0
 
@@ -92,7 +92,7 @@ def moxing_wrapper(pre_process=None, post_process=None):
                     sync_data(config.train_url, config.output_path)
                     print("Workspace downloaded: ", os.listdir(config.output_path))
 
-                context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id())))
+                mindspore.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id())))
                 config.device_num = get_device_num()
                 config.device_id = get_device_id()
                 if not os.path.exists(config.output_path):
diff --git a/official/cv/Unet/train.py b/official/cv/Unet/train.py
index 37143d3a4..1ca94e640 100644
--- a/official/cv/Unet/train.py
+++ b/official/cv/Unet/train.py
@@ -17,10 +17,10 @@ import logging
 
 import mindspore
 import mindspore.nn as nn
-from mindspore import Model, context
+from mindspore import Model
 from mindspore.communication.management import init, get_rank, get_group_size
 from mindspore.train.callback import CheckpointConfig, ModelCheckpoint
-from mindspore.context import ParallelMode
+from mindspore import ParallelMode
 from mindspore.train.serialization import load_checkpoint, load_param_into_net
 
 from src.unet_medical import UNetMedical
@@ -50,7 +50,7 @@ def train_net(cross_valid_ind=1,
         group_size = get_group_size()
         rank = get_rank()
         parallel_mode = ParallelMode.DATA_PARALLEL
-        context.set_auto_parallel_context(parallel_mode=parallel_mode,
+        mindspore.set_auto_parallel_context(parallel_mode=parallel_mode,
                                           device_num=group_size,
                                           gradients_mean=False)
     need_slice = False
@@ -130,10 +130,10 @@ def train_net(cross_valid_ind=1,
 
 if __name__ == '__main__':
     logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
     # to keep GetNext from timeout, set op_timeout=600
-    context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, save_graphs=False, op_timeout=600)
+    mindspore.set_context(mode=0, device_target=config.device_target, save_graphs=False, op_timeout=600)
     if config.device_target == "Ascend":
         device_id = get_device_id()
-        context.set_context(device_id=device_id)
+        mindspore.set_context(device_id=device_id)
     epoch_size = config.epochs if not config.run_distribute else config.distribute_epochs
     batchsize = config.batch_size
     if config.device_target == 'GPU' and config.run_distribute:
diff --git a/official/cv/VGG/vgg16/eval.py b/official/cv/VGG/vgg16/eval.py
index f179ad242..8c436f925 100644
--- a/official/cv/VGG/vgg16/eval.py
+++ b/official/cv/VGG/vgg16/eval.py
@@ -20,7 +20,8 @@ import glob
 import numpy as np
 
 import mindspore.nn as nn
-from mindspore import Tensor, context
+import mindspore
+from mindspore import Tensor
 from mindspore.communication.management import init, get_rank, get_group_size
 from mindspore.train.model import Model
 from mindspore.train.serialization import load_checkpoint, load_param_into_net
@@ -139,10 +140,10 @@ def run_eval():
     config.group_size = get_device_num()
 
     _enable_graph_kernel = config.device_target == "GPU"
-    context.set_context(mode=context.GRAPH_MODE, enable_graph_kernel=_enable_graph_kernel,
+    mindspore.set_context(mode=0, enable_graph_kernel=_enable_graph_kernel,
                         device_target=config.device_target, save_graphs=False)
     if os.getenv('DEVICE_ID', "not_set").isdigit() and config.device_target == "Ascend":
-        context.set_context(device_id=int(os.getenv('DEVICE_ID')))
+        mindspore.set_context(device_id=int(os.getenv('DEVICE_ID')))
 
     config.outputs_dir = os.path.join(config.log_path, datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))
diff --git a/official/cv/VGG/vgg16/export.py b/official/cv/VGG/vgg16/export.py
index c0dc7fe0b..2ab51df3f 100644
--- a/official/cv/VGG/vgg16/export.py
+++ b/official/cv/VGG/vgg16/export.py
@@ -16,7 +16,8 @@
 import os
 import numpy as np
 
-from mindspore import Tensor, context
+import mindspore
+from mindspore import Tensor
 import mindspore.common.dtype as mstype
 from mindspore.train.serialization import load_checkpoint, export
 
@@ -35,10 +36,10 @@ def modelarts_pre_process():
 
 @moxing_wrapper(pre_process=modelarts_pre_process)
 def run_export():
-    context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target)
+    mindspore.set_context(mode=0, device_target=config.device_target)
     if config.device_target == "Ascend":
         config.device_id = get_device_id()
-        context.set_context(device_id=config.device_id)
+        mindspore.set_context(device_id=config.device_id)
 
     if config.dataset == "cifar10":
         net = vgg16(num_classes=config.num_classes, args=config)
diff --git a/official/cv/VGG/vgg16/fine_tune.py b/official/cv/VGG/vgg16/fine_tune.py
index 881264fb3..4dce2b341 100644
--- a/official/cv/VGG/vgg16/fine_tune.py
+++ b/official/cv/VGG/vgg16/fine_tune.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 # ============================================================================
 
-import mindspore as ms
+import mindspore
 import mindspore.nn as nn
 from mindspore.train import Model
 from mindspore.train.callback import LossMonitor, TimeMonitor
@@ -21,8 +21,8 @@ from model_utils.config import get_config
 from src.vgg import Vgg
 from src.dataset import create_dataset
 
-ms.set_context(mode=ms.GRAPH_MODE, device_target="CPU", save_graphs=False)
-ms.set_seed(21)
+mindspore.set_context(mode=0, device_target="CPU", save_graphs=False)
+mindspore.set_seed(21)
 
 
 def import_data(train_dataset_path="./datasets/train/", eval_dataset_path="./datasets/test/", batch_size=32):
@@ -78,7 +78,7 @@ def init_weight(net, param_dict):
         has_trained_epoch = int(param_dict["epoch_num"].data.asnumpy())
         has_trained_step = int(param_dict["step_num"].data.asnumpy())
 
-    ms.load_param_into_net(net, param_dict)
+    mindspore.load_param_into_net(net, param_dict)
     print("has_trained_epoch:", has_trained_epoch)
     print("has_trained_step:", has_trained_step)
     return has_trained_epoch, has_trained_step
@@ -114,8 +114,8 @@ def eval_net(model_config, checkpoint_path='./vgg16.ckpt',
     net.classifier[6] = head
 
     # load checkpoint
-    param_dict = ms.load_checkpoint(checkpoint_path)
-    ms.load_param_into_net(net, param_dict)
+    param_dict = mindspore.load_checkpoint(checkpoint_path)
+    mindspore.load_param_into_net(net, param_dict)
     net.set_train(False)
 
     # define loss
@@ -123,7 +123,7 @@ def eval_net(model_config, checkpoint_path='./vgg16.ckpt',
     loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
 
     # define model
-    model = ms.Model(net, loss_fn=loss, metrics={'top_1_accuracy', 'top_5_accuracy'})
+    model = mindspore.Model(net, loss_fn=loss, metrics={'top_1_accuracy', 'top_5_accuracy'})
 
     # eval step
     res = model.eval(data_val)
@@ -176,7 +176,7 @@ def finetune_train(model_config,
                                            eval_dataset_path=eval_dataset_path,
                                            batch_size=batch_size)
 
-    ckpt_param_dict = ms.load_checkpoint(finetune_checkpoint_path)
+    ckpt_param_dict = mindspore.load_checkpoint(finetune_checkpoint_path)
     net = Vgg(cfg['16'], num_classes=1000, args=model_config, batch_norm=True)
     init_weight(net=net, param_dict=ckpt_param_dict)
     print("net parameter:")
@@ -210,7 +210,7 @@ def finetune_train(model_config,
 
     # do training
     model.train(num_epochs, dataset_train, callbacks=callbacks, dataset_sink_mode=True)
-    ms.save_checkpoint(net, save_checkpoint_path)
+    mindspore.save_checkpoint(net, save_checkpoint_path)
 
 
 if __name__ == '__main__':
diff --git a/official/cv/VGG/vgg16/model_utils/moxing_adapter.py b/official/cv/VGG/vgg16/model_utils/moxing_adapter.py
index e6e15074e..953ec5214 100644
--- a/official/cv/VGG/vgg16/model_utils/moxing_adapter.py
+++ b/official/cv/VGG/vgg16/model_utils/moxing_adapter.py
@@ -17,8 +17,8 @@
 
 import os
 import functools
-from mindspore import context
 from .config import get_config
+import mindspore
 
 config = get_config()
 
@@ -94,7 +94,7 @@ def moxing_wrapper(pre_process=None, post_process=None):
                     sync_data(config.train_url, config.output_path)
                     print("Workspace downloaded: ", os.listdir(config.output_path))
 
-                context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id())))
+                mindspore.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id())))
                 config.device_num = get_device_num()
                 config.device_id = get_device_id()
                 if not os.path.exists(config.output_path):
diff --git a/official/cv/VGG/vgg16/modelarts/start.py b/official/cv/VGG/vgg16/modelarts/start.py
index 23f345dc1..56b61d8b0 100644
--- a/official/cv/VGG/vgg16/modelarts/start.py
+++ b/official/cv/VGG/vgg16/modelarts/start.py
@@ -21,15 +21,15 @@ import time
 import numpy as np
 import moxing as mox
 
+import mindspore
 import mindspore.nn as nn
 from mindspore import Tensor
-from mindspore import context
 import mindspore.common.dtype as mstype
 from mindspore.communication.management import init, get_rank, get_group_size
 from mindspore.nn.optim.momentum import Momentum
 from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor
 from mindspore.train.model import Model
-from mindspore.context import ParallelMode
+from mindspore import ParallelMode
 from mindspore.train.serialization import load_param_into_net, load_checkpoint, export
 from mindspore.train.loss_scale_manager import FixedLossScaleManager
 from mindspore.common import set_seed
@@ -125,10 +125,10 @@ def _get_last_ckpt(ckpt_dir):
 
 def run_export(ckpt_dir):
     ckpt_file = _get_last_ckpt(ckpt_dir)
-    context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target)
+    mindspore.set_context(mode=0, device_target=config.device_target)
     if config.device_target == "Ascend":
         config.device_id = get_device_id()
-        context.set_context(device_id=config.device_id)
+        mindspore.set_context(device_id=config.device_id)
 
     if config.dataset == "cifar10":
         net = vgg16(num_classes=config.num_classes, args=config)
@@ -150,7 +150,7 @@ def run_train():
     config.per_batch_size = config.batch_size
 
     _enable_graph_kernel = config.device_target == "GPU"
-    context.set_context(mode=context.GRAPH_MODE,
+    mindspore.set_context(mode=0,
                         enable_graph_kernel=_enable_graph_kernel, device_target=config.device_target)
     config.rank = get_rank_id()
     config.device_id = get_device_id()
@@ -159,7 +159,7 @@ def run_train():
     if config.is_distributed:
         if config.device_target == "Ascend":
             init()
-            context.set_context(device_id=config.device_id)
+            mindspore.set_context(device_id=config.device_id)
         elif config.device_target == "GPU":
             if not config.enable_modelarts:
                 init()
@@ -168,12 +168,12 @@ def run_train():
                 init()
 
         device_num = config.group_size
-        context.reset_auto_parallel_context()
-        context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
+        mindspore.reset_auto_parallel_context()
+        mindspore.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
                                           gradients_mean=True, all_reduce_fusion_config=[2, 18])
     else:
         if config.device_target == "Ascend":
-            context.set_context(device_id=config.device_id)
+            mindspore.set_context(device_id=config.device_id)
 
     # select for master rank save ckpt or all rank save, compatible for model parallel
     config.rank_save_ckpt_flag = 0
diff --git a/official/cv/VGG/vgg16/src/data_split.py b/official/cv/VGG/vgg16/src/data_split.py
index d643942da..d93792e14 100644
--- a/official/cv/VGG/vgg16/src/data_split.py
+++ b/official/cv/VGG/vgg16/src/data_split.py
@@ -16,7 +16,7 @@
 import os
 import shutil
 import multiprocessing
-import mindspore as ms
+import mindspore
 import mindspore.dataset as ds
 
 
@@ -72,7 +72,7 @@ def create_dataset(dataset_path, do_train, batch_size=32, train_image_size=224,
     ]
     trans_norm = [ds.vision.Normalize(mean=mean, std=std), ds.vision.HWC2CHW()]
 
-    type_cast_op = ds.transforms.TypeCast(ms.int32)
+    type_cast_op = ds.transforms.TypeCast(mindspore.int32)
     trans_work_num = 24
     data_set = data_set.map(operations=trans, input_columns="image",
                             num_parallel_workers=get_num_parallel_workers(trans_work_num))
diff --git a/official/cv/VGG/vgg16/src/dataset.py b/official/cv/VGG/vgg16/src/dataset.py
index 6f1be68c4..420e728e9 100644
--- a/official/cv/VGG/vgg16/src/dataset.py
+++ b/official/cv/VGG/vgg16/src/dataset.py
@@ -18,7 +18,7 @@ dataset processing.
 import os
 import multiprocessing
 from PIL import Image, ImageFile
-import mindspore as ms
+import mindspore
 from mindspore.common import dtype as mstype
 import mindspore.dataset as de
 import mindspore.dataset.transforms as C
@@ -214,7 +214,7 @@ def create_dataset(dataset_path, do_train, batch_size=32, train_image_size=224,
     ]
     trans_norm = [de.vision.Normalize(mean=mean, std=std), de.vision.HWC2CHW()]
 
-    type_cast_op = de.transforms.TypeCast(ms.int32)
+    type_cast_op = de.transforms.TypeCast(mindspore.int32)
     trans_work_num = 24
     data_set = data_set.map(operations=trans, input_columns="image",
                             num_parallel_workers=get_num_parallel_workers(trans_work_num))
diff --git a/official/cv/VGG/vgg16/train.py b/official/cv/VGG/vgg16/train.py
index eddb70c3f..abb1ab615 100644
--- a/official/cv/VGG/vgg16/train.py
+++ b/official/cv/VGG/vgg16/train.py
@@ -19,14 +19,14 @@ import datetime
 import os
 import time
 
+import mindspore
 import mindspore.nn as nn
 from mindspore import Tensor
-from mindspore import context
 from mindspore.communication.management import init, get_rank, get_group_size
 from mindspore.nn.optim.momentum import Momentum
 from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor
 from mindspore.train.model import Model
-from mindspore.context import ParallelMode
+from mindspore import ParallelMode
 from mindspore.train.serialization import load_param_into_net, load_checkpoint
 from mindspore.train.loss_scale_manager import FixedLossScaleManager
 from mindspore.common import set_seed
@@ -120,7 +120,7 @@ def run_train():
     config.per_batch_size = config.batch_size
 
     _enable_graph_kernel = config.device_target == "GPU"
-    context.set_context(mode=context.GRAPH_MODE,
+    mindspore.set_context(mode=0,
                         enable_graph_kernel=_enable_graph_kernel, device_target=config.device_target)
     config.rank = get_rank_id()
     config.device_id = get_device_id()
@@ -129,7 +129,7 @@ def run_train():
     if config.is_distributed:
         if config.device_target == "Ascend":
             init()
-            context.set_context(device_id=config.device_id)
+            mindspore.set_context(device_id=config.device_id)
         elif config.device_target == "GPU":
             if not config.enable_modelarts:
                 init()
@@ -138,12 +138,12 @@ def run_train():
                 init()
 
         device_num = config.group_size
-        context.reset_auto_parallel_context()
-        context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
+        mindspore.reset_auto_parallel_context()
+        mindspore.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
                                           gradients_mean=True, all_reduce_fusion_config=[15, 18])
     else:
         if config.device_target == "Ascend":
-            context.set_context(device_id=config.device_id)
+            mindspore.set_context(device_id=config.device_id)
 
     # select for master rank save ckpt or all rank save, compatible for model parallel
     config.rank_save_ckpt_flag = 0
diff --git a/official/cv/VGG/vgg19/eval.py b/official/cv/VGG/vgg19/eval.py
index 0eed89da4..e2a6cba6b 100644
--- a/official/cv/VGG/vgg19/eval.py
+++ b/official/cv/VGG/vgg19/eval.py
@@ -20,7 +20,8 @@ import glob
 import numpy as np
 
 import mindspore.nn as nn
-from mindspore import Tensor, context
+import mindspore
+from mindspore import Tensor
 from mindspore.communication.management import init, get_rank, get_group_size
 from mindspore.nn.optim.momentum import Momentum
 from mindspore.train.model import
Model @@ -135,10 +136,10 @@ def run_eval(): _enable_graph_kernel = config.device_target == "GPU" - context.set_context(mode=context.GRAPH_MODE, enable_graph_kernel=_enable_graph_kernel, + mindspore.set_context(mode=0, enable_graph_kernel=_enable_graph_kernel, device_target=config.device_target, save_graphs=False) if os.getenv('DEVICE_ID', "not_set").isdigit() and config.device_target == "Ascend": - context.set_context(device_id=int(os.getenv('DEVICE_ID'))) + mindspore.set_context(device_id=int(os.getenv('DEVICE_ID'))) config.outputs_dir = os.path.join(config.log_path, datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S')) diff --git a/official/cv/VGG/vgg19/export.py b/official/cv/VGG/vgg19/export.py index f008d9984..3a5d9f014 100644 --- a/official/cv/VGG/vgg19/export.py +++ b/official/cv/VGG/vgg19/export.py @@ -16,7 +16,8 @@ import os import numpy as np -from mindspore import Tensor, context +import mindspore +from mindspore import Tensor import mindspore.common.dtype as mstype from mindspore.train.serialization import load_checkpoint, export @@ -37,10 +38,10 @@ def run_export(): '''run_export function.''' config.image_size = list(map(int, config.image_size.split(','))) - context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target) + mindspore.set_context(mode=0, device_target=config.device_target) if config.device_target == "Ascend": config.device_id = get_device_id() - context.set_context(device_id=config.device_id) + mindspore.set_context(device_id=config.device_id) if config.dataset == "cifar10": net = vgg19(num_classes=config.num_classes, args=config) diff --git a/official/cv/VGG/vgg19/model_utils/moxing_adapter.py b/official/cv/VGG/vgg19/model_utils/moxing_adapter.py index e6e15074e..953ec5214 100644 --- a/official/cv/VGG/vgg19/model_utils/moxing_adapter.py +++ b/official/cv/VGG/vgg19/model_utils/moxing_adapter.py @@ -17,8 +17,8 @@ import os import functools -from mindspore import context from .config import get_config +import mindspore config = get_config() @@ -94,7 +94,7 @@ def moxing_wrapper(pre_process=None, post_process=None): sync_data(config.train_url, config.output_path) print("Workspace downloaded: ", os.listdir(config.output_path)) - context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) + mindspore.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) config.device_num = get_device_num() config.device_id = get_device_id() if not os.path.exists(config.output_path): diff --git a/official/cv/VGG/vgg19/modelarts/train_modelarts.py b/official/cv/VGG/vgg19/modelarts/train_modelarts.py index 32d5e96f8..5084a6081 100644 --- a/official/cv/VGG/vgg19/modelarts/train_modelarts.py +++ b/official/cv/VGG/vgg19/modelarts/train_modelarts.py @@ -19,14 +19,14 @@ import datetime import os import time +import mindspore import mindspore.nn as nn from mindspore import Tensor -from mindspore import context from mindspore.communication.management import init, get_rank, get_group_size from mindspore.nn.optim.momentum import Momentum from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor from mindspore.train.model import Model -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.train.serialization import load_param_into_net, load_checkpoint, export from mindspore.train.loss_scale_manager import FixedLossScaleManager from mindspore.common import set_seed @@ -133,7 +133,7 @@ def run_train(): config.per_batch_size = 
config.batch_size _enable_graph_kernel = config.device_target == "GPU" - context.set_context(mode=context.GRAPH_MODE, + mindspore.set_context(mode=0, enable_graph_kernel=_enable_graph_kernel, device_target=config.device_target) config.device_id = get_device_id() @@ -141,7 +141,7 @@ def run_train(): if config.is_distributed: if config.device_target == "Ascend": init() - context.set_context(device_id=config.device_id) + mindspore.set_context(device_id=config.device_id) elif config.device_target == "GPU": if not config.enable_modelarts: init() @@ -151,12 +151,12 @@ def run_train(): config.rank = get_rank() config.group_size = get_group_size() device_num = config.group_size - context.reset_auto_parallel_context() - context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, + mindspore.reset_auto_parallel_context() + mindspore.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, all_reduce_fusion_config=[2, 18]) else: if config.device_target == "Ascend": - context.set_context(device_id=config.device_id) + mindspore.set_context(device_id=config.device_id) # select for master rank save ckpt or all rank save, compatible for model parallel config.rank_save_ckpt_flag = 0 diff --git a/official/cv/VGG/vgg19/train.py b/official/cv/VGG/vgg19/train.py index 49b8a620f..4384d8a19 100644 --- a/official/cv/VGG/vgg19/train.py +++ b/official/cv/VGG/vgg19/train.py @@ -19,14 +19,14 @@ import datetime import os import time +import mindspore import mindspore.nn as nn from mindspore import Tensor -from mindspore import context from mindspore.communication.management import init, get_rank, get_group_size from mindspore.nn.optim.momentum import Momentum from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor from mindspore.train.model import Model -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.train.serialization import load_param_into_net, load_checkpoint from mindspore.train.loss_scale_manager import FixedLossScaleManager from mindspore.common import set_seed @@ -121,7 +121,7 @@ def run_train(): config.per_batch_size = config.batch_size _enable_graph_kernel = config.device_target == "GPU" - context.set_context(mode=context.GRAPH_MODE, + mindspore.set_context(mode=0, enable_graph_kernel=_enable_graph_kernel, device_target=config.device_target) config.device_id = get_device_id() @@ -129,7 +129,7 @@ def run_train(): if config.is_distributed: if config.device_target == "Ascend": init() - context.set_context(device_id=config.device_id) + mindspore.set_context(device_id=config.device_id) elif config.device_target == "GPU": if not config.enable_modelarts: init() @@ -139,12 +139,12 @@ def run_train(): config.rank = get_rank() config.group_size = get_group_size() device_num = config.group_size - context.reset_auto_parallel_context() - context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, + mindspore.reset_auto_parallel_context() + mindspore.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, all_reduce_fusion_config=[2, 18]) else: if config.device_target == "Ascend": - context.set_context(device_id=config.device_id) + mindspore.set_context(device_id=config.device_id) # select for master rank save ckpt or all rank save, compatible for model parallel config.rank_save_ckpt_flag = 0 diff --git a/official/cv/VIT/README.md 
b/official/cv/VIT/README.md index 0e1cab662..d018a3aa0 100644 --- a/official/cv/VIT/README.md +++ b/official/cv/VIT/README.md @@ -395,7 +395,8 @@ Current batch_ Size can only be set to 1. Before running the command below, you should modify the config file. The items you should modify are batch_size and val_data_path. - Inference result will be stored in the example path, you can find result like the followings in acc.log. + Inference result will be stored in the example path, you can find result like the following +in acc.log. ```shell cd scripts @@ -458,8 +459,8 @@ If you need to use the trained model to perform inference on multiple hardware p lrs = ... ... # Set context - context.set_context(mode=context.GRAPH_HOME, device_target=args.device_target) - context.set_context(device_id=args.device_id) + mindspore.set_context(mode=0, device_target=args.device_target) + mindspore.set_context(device_id=args.device_id) # Load unseen dataset for inference dataset = dataset.create_dataset(args.data_path, 1, False) diff --git a/official/cv/VIT/README_CN.md b/official/cv/VIT/README_CN.md index 06b2267e2..31cfa9afd 100644 --- a/official/cv/VIT/README_CN.md +++ b/official/cv/VIT/README_CN.md @@ -461,8 +461,8 @@ python export.py --config_path=[CONFIG_PATH] lrs = ... ... # 设置上下文 - context.set_context(mode=context.GRAPH_HOME, device_target=args.device_target) - context.set_context(device_id=args.device_id) + mindspore.set_context(mode=0, device_target=args.device_target) + mindspore.set_context(device_id=args.device_id) # 加载未知数据集进行推理 dataset = dataset.create_dataset(args.data_path, 1, False) diff --git a/official/cv/VIT/eval.py b/official/cv/VIT/eval.py index 7dae46f0e..25ba24ddd 100644 --- a/official/cv/VIT/eval.py +++ b/official/cv/VIT/eval.py @@ -17,7 +17,7 @@ import os import numpy as np -import mindspore as ms +import mindspore from mindspore.train.model import Model, ParallelMode from mindspore.communication.management import init from mindspore.profiler.profiling import Profiler @@ -72,17 +72,17 @@ def eval_net(): np.random.seed(args.seed) args.logger = get_logger(args.save_checkpoint_path, rank=local_rank) - ms.set_context(device_id=device_id, - mode=ms.GRAPH_MODE, + mindspore.set_context(device_id=device_id, + mode=0, device_target="Ascend", save_graphs=False) if args.auto_tune: - ms.set_context(auto_tune_mode='GA') + mindspore.set_context(auto_tune_mode='GA') elif args.device_num == 1: pass else: - ms.set_auto_parallel_context(device_num=device_num, + mindspore.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True) @@ -97,7 +97,7 @@ def eval_net(): net = get_network(backbone_name=args.backbone, args=args) if os.path.isfile(args.pretrained): - ms.load_checkpoint(args.pretrained, net, strict_load=False) + mindspore.load_checkpoint(args.pretrained, net, strict_load=False) # evaluation dataset eval_dataset = get_dataset(dataset_name=args.dataset_name, diff --git a/official/cv/VIT/export.py b/official/cv/VIT/export.py index 6d4703c16..632920610 100644 --- a/official/cv/VIT/export.py +++ b/official/cv/VIT/export.py @@ -18,14 +18,14 @@ python export.py """ import os -import mindspore as ms +import mindspore from src.model_utils.config import config from src.model_utils.moxing_adapter import moxing_wrapper from src.vit import get_network -ms.set_context(mode=ms.GRAPH_MODE, device_target=config.device_target) +mindspore.set_context(mode=0, device_target=config.device_target) if config.device_target == "Ascend": -
ms.set_context(device_id=config.device_id) + mindspore.set_context(device_id=config.device_id) def modelarts_pre_process(): '''modelarts pre process function.''' @@ -38,14 +38,14 @@ def run_export(): assert config.pretrained is not None, "checkpoint_path is None." - param_dict = ms.load_checkpoint(config.pretrained) - ms.load_param_into_net(net, param_dict) + param_dict = mindspore.load_checkpoint(config.pretrained) + mindspore.load_param_into_net(net, param_dict) config.height = config.train_image_size config.width = config.train_image_size - input_arr = ms.numpy.zeros([config.batch_size, 3, config.height, config.width], ms.float32) - ms.export(net, input_arr, file_name=config.file_name, file_format=config.file_format) + input_arr = mindspore.numpy.zeros([config.batch_size, 3, config.height, config.width], mindspore.float32) + mindspore.export(net, input_arr, file_name=config.file_name, file_format=config.file_format) if __name__ == '__main__': run_export() diff --git a/official/cv/VIT/modelarts/train_modelarts.py b/official/cv/VIT/modelarts/train_modelarts.py index c3af183f6..d3ecd6d4c 100644 --- a/official/cv/VIT/modelarts/train_modelarts.py +++ b/official/cv/VIT/modelarts/train_modelarts.py @@ -21,7 +21,7 @@ import socket import glob import numpy as np import moxing as mox -import mindspore as ms +import mindspore from mindspore import Tensor from mindspore.train.model import Model, ParallelMode from mindspore.train.callback import ModelCheckpoint, CheckpointConfig @@ -84,11 +84,11 @@ def filter_checkpoint_parameter_by_list(origin_dict, param_filter): def frozen_to_air(network, args): - param_dict_t = ms.load_checkpoint(args.get("ckpt_file")) - ms.load_param_into_net(network, param_dict_t) + param_dict_t = mindspore.load_checkpoint(args.get("ckpt_file")) + mindspore.load_param_into_net(network, param_dict_t) input_arr = Tensor(np.random.uniform(0.0, 1.0, size=[args.get("batch_size"), 3, args.get("width"), \ - args.get("height")]), ms.float32) - ms.export(network, input_arr, file_name=args.get("file_name"), file_format=args.get("file_format")) + args.get("height")]), mindspore.float32) + mindspore.export(network, input_arr, file_name=args.get("file_name"), file_format=args.get("file_format")) if __name__ == '__main__': @@ -113,16 +113,16 @@ if __name__ == '__main__': config.batch_size = config.batch_size config.dataset_path = os.path.join(config.data_path, "train") - ms.set_context(device_id=device_id, - mode=ms.GRAPH_MODE, + mindspore.set_context(device_id=device_id, + mode=0, device_target="Ascend", save_graphs=False) if args_opt.auto_tune: - ms.set_context(auto_tune_mode='GA') + mindspore.set_context(auto_tune_mode='GA') elif args_opt.device_num == 1: pass else: - ms.set_auto_parallel_context(device_num=device_num, + mindspore.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True) @@ -151,10 +151,10 @@ if __name__ == '__main__': print("warning!!!, no split point") if os.path.isfile(config.ckpt_path): - ckpt = ms.load_checkpoint(config.ckpt_path) + ckpt = mindspore.load_checkpoint(config.ckpt_path) filter_list = [x.name for x in net.head.get_parameters()] filter_checkpoint_parameter_by_list(ckpt, filter_list) - ms.load_param_into_net(net, ckpt) + mindspore.load_param_into_net(net, ckpt) # loss if not args_opt.use_label_smooth: diff --git a/official/cv/VIT/src/cross_entropy.py b/official/cv/VIT/src/cross_entropy.py index 8fc5953d7..aa5c29cf1 100644 --- a/official/cv/VIT/src/cross_entropy.py +++ 
b/official/cv/VIT/src/cross_entropy.py @@ -14,7 +14,7 @@ # ============================================================================ """loss functions""" -import mindspore as ms +import mindspore import mindspore.ops as ops from mindspore import nn from mindspore import Tensor @@ -29,8 +29,8 @@ class CrossEntropySmooth(Loss): self.aux_factor = aux_factor self.onehot = ops.OneHot() self.sparse = sparse - self.on_value = Tensor(1.0 - smooth_factor, ms.float32) - self.off_value = Tensor(1.0 * smooth_factor / (num_classes - 1), ms.float32) + self.on_value = Tensor(1.0 - smooth_factor, mindspore.float32) + self.off_value = Tensor(1.0 * smooth_factor / (num_classes - 1), mindspore.float32) self.ce = nn.SoftmaxCrossEntropyWithLogits(reduction=reduction) def construct(self, logits, label): @@ -52,14 +52,14 @@ class CrossEntropySmoothMixup(Loss): """CrossEntropy""" def __init__(self, reduction='mean', smooth_factor=0., num_classes=1000): super().__init__() - self.on_value = Tensor(1.0 - smooth_factor, ms.float32) + self.on_value = Tensor(1.0 - smooth_factor, mindspore.float32) self.off_value = 1.0 * smooth_factor / (num_classes - 2) self.cross_entropy = nn.SoftmaxCrossEntropyWithLogits(reduction=reduction) def construct(self, logit, label): off_label = ops.Select()(ops.Equal()(label, 0.0), \ - ops.Fill()(ms.float32, label.shape, self.off_value), \ - ops.Fill()(ms.float32, label.shape, 0.0)) + ops.Fill()(mindspore.float32, label.shape, self.off_value), \ + ops.Fill()(mindspore.float32, label.shape, 0.0)) label = self.on_value * label + off_label loss = self.cross_entropy(logit, label) @@ -71,8 +71,8 @@ class CrossEntropyIgnore(Loss): def __init__(self, num_classes=21, ignore_label=255): super().__init__() self.one_hot = ops.OneHot(axis=-1) - self.on_value = Tensor(1.0, ms.float32) - self.off_value = Tensor(0.0, ms.float32) + self.on_value = Tensor(1.0, mindspore.float32) + self.off_value = Tensor(0.0, mindspore.float32) self.cast = ops.Cast() self.ce = nn.SoftmaxCrossEntropyWithLogits() self.not_equal = ops.NotEqual() @@ -85,12 +85,12 @@ class CrossEntropyIgnore(Loss): self.reshape = ops.Reshape() def construct(self, logits, labels): - labels_int = self.cast(labels, ms.int32) + labels_int = self.cast(labels, mindspore.int32) labels_int = self.reshape(labels_int, (-1,)) logits_ = self.transpose(logits, (0, 2, 3, 1)) logits_ = self.reshape(logits_, (-1, self.num_cls)) weights = self.not_equal(labels_int, self.ignore_label) - weights = self.cast(weights, ms.float32) + weights = self.cast(weights, mindspore.float32) one_hot_labels = self.one_hot(labels_int, self.num_cls, self.on_value, self.off_value) loss = self.ce(logits_, one_hot_labels) loss = self.mul(weights, loss) diff --git a/official/cv/VIT/src/dataset.py b/official/cv/VIT/src/dataset.py index 4e3c5d766..b882c0fe0 100644 --- a/official/cv/VIT/src/dataset.py +++ b/official/cv/VIT/src/dataset.py @@ -20,7 +20,7 @@ from io import BytesIO from PIL import Image import numpy as np -import mindspore as ms +import mindspore import mindspore.dataset.engine as de import mindspore.dataset.vision as vision import mindspore.dataset.transforms as transforms @@ -134,7 +134,7 @@ def create_dataset(dataset_path, ] ds = ds.map(input_columns="image", num_parallel_workers=num_workers, operations=c_trans) - type_cast_op = transforms.TypeCast(ms.int32) + type_cast_op = transforms.TypeCast(mindspore.int32) ds = ds.map(input_columns="label", num_parallel_workers=1, operations=type_cast_op) if do_train and mixup > 0: diff --git a/official/cv/VIT/src/eval_engine.py 
b/official/cv/VIT/src/eval_engine.py index f69222789..86eb82cca 100644 --- a/official/cv/VIT/src/eval_engine.py +++ b/official/cv/VIT/src/eval_engine.py @@ -14,7 +14,7 @@ # ============================================================================ """eval engine""" -import mindspore as ms +import mindspore from mindspore import Tensor from src.metric import ClassifyCorrectWithCache, ClassifyCorrectCell, DistAccuracy @@ -53,12 +53,12 @@ class ImageNetCacheEvelEngine(BasicEvalEngine): self.args = args def compile(self, sink_size=-1): - index = Tensor(0, ms.int32) + index = Tensor(0, mindspore.int32) self.dist_eval_network.set_train(False) self.dist_eval_network.compile(index) def eval(self): - index = Tensor(0, ms.int32) + index = Tensor(0, mindspore.int32) output = self.dist_eval_network(index) output = output.asnumpy() / 50000 self.outputs = {"acc": output} diff --git a/official/cv/VIT/src/metric.py b/official/cv/VIT/src/metric.py index 2c86926cb..5cb2a7834 100644 --- a/official/cv/VIT/src/metric.py +++ b/official/cv/VIT/src/metric.py @@ -16,7 +16,7 @@ import numpy as np -import mindspore as ms +import mindspore import mindspore.ops as ops from mindspore.communication.management import GlobalComm import mindspore.nn as nn @@ -35,7 +35,7 @@ class ClassifyCorrectWithCache(nn.Cell): self.allreduce = ops.AllReduce(ops.ReduceOp.SUM, GlobalComm.WORLD_COMM_GROUP) self.assign_add = ops.AssignAdd() self.assign = ops.Assign() - self._correct_num = Parameter(Tensor(0.0, ms.float32), name="correct_num", requires_grad=False) + self._correct_num = Parameter(Tensor(0.0, mindspore.float32), name="correct_num", requires_grad=False) # save data to parameter pdata = [] plabel = [] @@ -44,11 +44,11 @@ class ClassifyCorrectWithCache(nn.Cell): pdata.append(batch["image"]) plabel.append(batch["label"]) step_num = step_num + 1 - pdata = Tensor(np.array(pdata), ms.float32) - plabel = Tensor(np.array(plabel), ms.int32) + pdata = Tensor(np.array(pdata), mindspore.float32) + plabel = Tensor(np.array(plabel), mindspore.int32) self._data = Parameter(pdata, name="pdata", requires_grad=False) self._label = Parameter(plabel, name="plabel", requires_grad=False) - self._step_num = Tensor(step_num, ms.int32) + self._step_num = Tensor(step_num, mindspore.int32) def construct(self, index): self._correct_num = 0 @@ -57,9 +57,9 @@ class ClassifyCorrectWithCache(nn.Cell): label = self._label[index] outputs = self._network(data) y_pred = self.argmax(outputs) - y_pred = ops.cast(y_pred, ms.int32) + y_pred = ops.cast(y_pred, mindspore.int32) y_correct = self.equal(y_pred, label) - y_correct = ops.cast(y_correct, ms.float32) + y_correct = ops.cast(y_correct, mindspore.float32) y_correct_sum = self.reduce_sum(y_correct) self._correct_num += y_correct_sum #self.assign(self._correct_num, y_correct_sum) index = index + 1 @@ -80,9 +80,9 @@ class ClassifyCorrectCell(nn.Cell): def construct(self, data, label): outputs = self._network(data) y_pred = self.argmax(outputs) - y_pred = ops.cast(y_pred, ms.int32) + y_pred = ops.cast(y_pred, mindspore.int32) y_correct = self.equal(y_pred, label) - y_correct = ops.cast(y_correct, ms.float32) + y_correct = ops.cast(y_correct, mindspore.float32) y_correct = self.reduce_sum(y_correct) total_correct = self.allreduce(y_correct) return (total_correct,) diff --git a/official/cv/VIT/src/model_utils/moxing_adapter.py b/official/cv/VIT/src/model_utils/moxing_adapter.py index 77f40e59c..88179b79b 100644 --- a/official/cv/VIT/src/model_utils/moxing_adapter.py +++ 
b/official/cv/VIT/src/model_utils/moxing_adapter.py @@ -17,7 +17,7 @@ import os import functools -import mindspore as ms +import mindspore from .config import config _global_sync_count = 0 @@ -92,7 +92,7 @@ def moxing_wrapper(pre_process=None, post_process=None): sync_data(config.train_url, config.output_path) print("Workspace downloaded: ", os.listdir(config.output_path)) - ms.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) + mindspore.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) config.device_num = get_device_num() config.device_id = get_device_id() if not os.path.exists(config.output_path): diff --git a/official/cv/VIT/src/optimizer.py b/official/cv/VIT/src/optimizer.py index 0a04acbdc..ecee958c8 100644 --- a/official/cv/VIT/src/optimizer.py +++ b/official/cv/VIT/src/optimizer.py @@ -16,7 +16,7 @@ import numpy as np -import mindspore as ms +import mindspore import mindspore.ops as ops from mindspore import jit @@ -59,7 +59,7 @@ def scale_grad(gradients, reciprocal_scale): _adam_opt = ops.MultitypeFuncGraph("adam_opt") -_scaler_one = Tensor(1, ms.int32) +_scaler_one = Tensor(1, mindspore.int32) @_adam_opt.register("Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", @@ -91,15 +91,15 @@ def _update_run_op(beta1_power, beta2_power, beta1, beta2, eps, lr, weight_decay op_sqrt = ops.Sqrt() op_reshape = ops.Reshape() - param_fp32 = ops.cast(param, ms.float32) - m_fp32 = ops.cast(m, ms.float32) - v_fp32 = ops.cast(v, ms.float32) - gradient_fp32 = ops.cast(gradient, ms.float32) + param_fp32 = ops.cast(param, mindspore.float32) + m_fp32 = ops.cast(m, mindspore.float32) + v_fp32 = ops.cast(v, mindspore.float32) + gradient_fp32 = ops.cast(gradient, mindspore.float32) - next_m = op_mul(beta1, m_fp32) + op_mul(ops.cast(ops.tuple_to_array((1.0,)), ms.float32) + next_m = op_mul(beta1, m_fp32) + op_mul(ops.cast(ops.tuple_to_array((1.0,)), mindspore.float32) - beta1, gradient_fp32) - next_v = op_mul(beta2, v_fp32) + op_mul(ops.cast(ops.tuple_to_array((1.0,)), ms.float32) + next_v = op_mul(beta2, v_fp32) + op_mul(ops.cast(ops.tuple_to_array((1.0,)), mindspore.float32) - beta2, op_square(gradient_fp32)) regulate_m = next_m / (_scaler_one - beta1_power) @@ -135,10 +135,10 @@ class AdamW(Optimizer): self.moments1 = self.parameters.clone(prefix="adam_m", init='zeros') self.moments2 = self.parameters.clone(prefix="adam_v", init='zeros') self.hyper_map = ops.HyperMap() - self.beta1_power = Parameter(initializer(1, [1], ms.float32), name="beta1_power") - self.beta2_power = Parameter(initializer(1, [1], ms.float32), name="beta2_power") + self.beta1_power = Parameter(initializer(1, [1], mindspore.float32), name="beta1_power") + self.beta2_power = Parameter(initializer(1, [1], mindspore.float32), name="beta2_power") - self.reciprocal_scale = Tensor(1.0 / loss_scale, ms.float32) + self.reciprocal_scale = Tensor(1.0 / loss_scale, mindspore.float32) self.clip = clip @jit diff --git a/official/cv/VIT/src/vit.py b/official/cv/VIT/src/vit.py index f67911bca..9da3e1bb3 100644 --- a/official/cv/VIT/src/vit.py +++ b/official/cv/VIT/src/vit.py @@ -18,7 +18,7 @@ from importlib import import_module from easydict import EasyDict as edict import numpy as np -import mindspore as ms +import mindspore import mindspore.ops as ops from mindspore.common.initializer import initializer from mindspore.common.parameter import Parameter @@ -35,39 +35,39 @@ class VitConfig: self.configs = configs #
network init - self.network_norm = ms.nn.LayerNorm((configs.normalized_shape,)) - self.network_init = ms.common.initializer.Normal(sigma=1.0) + self.network_norm = mindspore.nn.LayerNorm((configs.normalized_shape,)) + self.network_init = mindspore.common.initializer.Normal(sigma=1.0) self.network_dropout_rate = 0.1 self.network_pool = 'cls' self.network = ViT # stem - self.stem_init = ms.common.initializer.XavierUniform() + self.stem_init = mindspore.common.initializer.XavierUniform() self.stem = VitStem # body - self.body_norm = ms.nn.LayerNorm + self.body_norm = mindspore.nn.LayerNorm self.body_drop_path_rate = 0.1 self.body = Transformer # body attention - self.attention_init = ms.common.initializer.XavierUniform() - self.attention_activation = ms.nn.Softmax() + self.attention_init = mindspore.common.initializer.XavierUniform() + self.attention_activation = mindspore.nn.Softmax() self.attention_dropout_rate = 0.1 self.attention = Attention # body feedforward - self.feedforward_init = ms.common.initializer.XavierUniform() - self.feedforward_activation = ms.nn.GELU() + self.feedforward_init = mindspore.common.initializer.XavierUniform() + self.feedforward_activation = mindspore.nn.GELU() self.feedforward_dropout_rate = 0.1 self.feedforward = FeedForward # head self.head = origin_head - self.head_init = ms.common.initializer.XavierUniform() + self.head_init = mindspore.common.initializer.XavierUniform() self.head_dropout_rate = 0.1 - self.head_norm = ms.nn.LayerNorm((configs.normalized_shape,)) - self.head_activation = ms.nn.GELU() + self.head_norm = mindspore.nn.LayerNorm((configs.normalized_shape,)) + self.head_activation = mindspore.nn.GELU() class DropPath(Cell): @@ -86,7 +86,7 @@ class DropPath(Cell): def construct(self, x): if self.training: x_shape = self.shape(x) # B N C - mask = self.ones((x_shape[0], 1, 1), ms.float32) + mask = self.ones((x_shape[0], 1, 1), mindspore.float32) x = self.dropout(mask)*x return x @@ -236,7 +236,7 @@ class ViT(Cell): else: x += self.pos_embedding[:, :seq_len] - y = ops.cast(x, ms.float32) + y = ops.cast(x, mindspore.float32) y = self.dropout(y) x = ops.cast(y, x.dtype) @@ -302,7 +302,7 @@ class Attention(Cell): if self.softmax_nz: q = self.reshape(q, (bs, seq_len, h, d)) q = self.transpose(q, (0, 2, 1, 3)) - q = ops.cast(q, ms.float32) + q = ops.cast(q, mindspore.float32) q = self.mul(q, self.scale) k = self.reshape(k, (bs, seq_len, h, d)) @@ -323,7 +323,7 @@ class Attention(Cell): v = self.transpose(v, (0, 2, 1, 3)) attn_scores = self.q_matmul_k(q, k) #bs x h x seq_len x seq_len - attn_scores = ops.cast(attn_scores, ms.float32) + attn_scores = ops.cast(attn_scores, mindspore.float32) attn_scores = self.mul(attn_scores, self.scale) attn_scores = ops.cast(attn_scores, x.dtype) attn_scores = self.activation(attn_scores) @@ -334,7 +334,7 @@ class Attention(Cell): out = self.to_out(out) out = self.reshape(out, (bs, seq_len, d_model)) #out = self.dropout(out) - y = ops.cast(out, ms.float32) + y = ops.cast(out, mindspore.float32) y = self.dropout(y) out = ops.cast(y, out.dtype) #out = self.reshape(out, (bs, seq_len, d_model)) @@ -361,12 +361,12 @@ class FeedForward(Cell): def construct(self, x): y = self.ff1(x) - y = ops.cast(y, ms.float32) + y = ops.cast(y, mindspore.float32) y = self.activation(y) y = self.dropout(y) y = ops.cast(y, x.dtype) y = self.ff2(y) - y = ops.cast(y, ms.float32) + y = ops.cast(y, mindspore.float32) y = self.dropout(y) y = ops.cast(y, x.dtype) return y diff --git a/official/cv/VIT/train.py b/official/cv/VIT/train.py index 
02cec0302..abf623450 100644 --- a/official/cv/VIT/train.py +++ b/official/cv/VIT/train.py @@ -19,7 +19,7 @@ import time import socket import numpy as np -import mindspore as ms +import mindspore from mindspore import Tensor from mindspore.train.model import Model, ParallelMode from mindspore.train.callback import ModelCheckpoint, CheckpointConfig @@ -105,17 +105,17 @@ def train_setcontext(): np.random.seed(args.seed) args.logger = get_logger(args.save_checkpoint_path, rank=local_rank) - ms.set_context(device_id=device_id, - mode=ms.GRAPH_MODE, + mindspore.set_context(device_id=device_id, + mode=0, device_target="Ascend", save_graphs=False) if args.auto_tune: - ms.set_context(auto_tune_mode='GA') + mindspore.set_context(auto_tune_mode='GA') elif args.device_num == 1: pass else: - ms.set_auto_parallel_context(device_num=device_num, + mindspore.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True) @@ -152,7 +152,7 @@ def train_net(): print("warning!!!, no split point") if os.path.isfile(args.pretrained): - ms.load_checkpoint(args.pretrained, net, strict_load=False) + mindspore.load_checkpoint(args.pretrained, net, strict_load=False) # loss if not args.use_label_smooth: diff --git a/official/cv/WGAN/eval.py b/official/cv/WGAN/eval.py index 97532cd59..61836ab6d 100644 --- a/official/cv/WGAN/eval.py +++ b/official/cv/WGAN/eval.py @@ -15,11 +15,11 @@ """ test WGAN """ import os import json +import mindspore import mindspore.common.dtype as mstype import mindspore.ops as ops from mindspore import Tensor from mindspore.train.serialization import load_checkpoint, load_param_into_net -from mindspore import context import numpy as np from PIL import Image @@ -31,8 +31,8 @@ from src.args import get_args if __name__ == "__main__": args_opt = get_args('eval') - context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.device_target) - context.set_context(device_id=args_opt.device_id) + mindspore.set_context(mode=0, device_target=args_opt.device_target) + mindspore.set_context(device_id=args_opt.device_id) with open(args_opt.config, 'r') as gencfg: generator_config = json.loads(gencfg.read()) diff --git a/official/cv/WGAN/export.py b/official/cv/WGAN/export.py index 1d9e0d007..b4d03851a 100644 --- a/official/cv/WGAN/export.py +++ b/official/cv/WGAN/export.py @@ -19,8 +19,9 @@ python export.py """ import json import numpy as np +import mindspore import mindspore.common.dtype as mstype -from mindspore import Tensor, load_checkpoint, load_param_into_net, export, context +from mindspore import Tensor, load_checkpoint, load_param_into_net, export from src.args import get_args from src.dcgan_model import DcganG @@ -28,8 +29,8 @@ from src.dcgannobn_model import DcgannobnG if __name__ == '__main__': args_opt = get_args('export') - context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.device_target) - context.set_context(device_id=args_opt.device_id) + mindspore.set_context(mode=0, device_target=args_opt.device_target) + mindspore.set_context(device_id=args_opt.device_id) with open(args_opt.config, 'r') as gencfg: generator_config = json.loads(gencfg.read()) diff --git a/official/cv/WGAN/modelarts/start.py b/official/cv/WGAN/modelarts/start.py index f096e114e..968455f6b 100644 --- a/official/cv/WGAN/modelarts/start.py +++ b/official/cv/WGAN/modelarts/start.py @@ -18,13 +18,13 @@ import os import random import json import numpy as np +import mindspore from mindspore import Tensor, export import mindspore.nn as nn import 
mindspore.dataset as ds import mindspore.ops as ops import mindspore.common.dtype as mstype from mindspore.common import initializer as init -from mindspore import context from mindspore.train.serialization import load_checkpoint, load_param_into_net, save_checkpoint from PIL import Image from src.dataset import create_dataset @@ -41,14 +41,14 @@ if __name__ == '__main__': # init context target = args_opt.device_target - context.set_context(mode=context.GRAPH_MODE, device_target=target) + mindspore.set_context(mode=0, device_target=target) # whether train on modelarts or local server if not args_opt.is_modelarts: if args_opt.experiment is None: args_opt.experiment = 'samples' os.system('mkdir {0}'.format(args_opt.experiment)) - context.set_context(device_id=int(args_opt.device_id)) + mindspore.set_context(device_id=int(args_opt.device_id)) dataset = create_dataset(args_opt.dataroot, args_opt.dataset, args_opt.batchSize, args_opt.imageSize, 1, args_opt.workers, target) @@ -58,7 +58,7 @@ if __name__ == '__main__': if args_opt.experiment is None: args_opt.experiment = '/cache/train_output' os.system('mkdir {0}'.format(args_opt.experiment)) - context.set_context(device_id=int(os.getenv('DEVICE_ID'))) + mindspore.set_context(device_id=int(os.getenv('DEVICE_ID'))) data_name = 'LSUN-bedroom.zip' local_data_url = '/cache/data_path/' mox.file.copy_parallel(src_url=args_opt.data_url, dst_url=local_data_url) diff --git a/official/cv/WGAN/src/cell.py b/official/cv/WGAN/src/cell.py index 2cc78f46f..3e53c9ce0 100644 --- a/official/cv/WGAN/src/cell.py +++ b/official/cv/WGAN/src/cell.py @@ -20,7 +20,7 @@ import mindspore.ops.operations as P import mindspore.ops.functional as F from mindspore.parallel._utils import (_get_device_num, _get_gradients_mean, _get_parallel_mode) -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.nn.wrap.grad_reducer import DistributedGradReducer diff --git a/official/cv/WGAN/train.py b/official/cv/WGAN/train.py index ff6ca84ab..803eaff39 100644 --- a/official/cv/WGAN/train.py +++ b/official/cv/WGAN/train.py @@ -17,13 +17,13 @@ import os import random import json import time +import mindspore from mindspore import Tensor import mindspore.nn as nn import mindspore.dataset as ds import mindspore.ops as ops from mindspore.common import initializer as init import mindspore.common.dtype as mstype -from mindspore import context from mindspore.train.serialization import load_checkpoint, load_param_into_net, save_checkpoint from PIL import Image import numpy as np @@ -41,14 +41,14 @@ if __name__ == '__main__': # init context target = args_opt.device_target - context.set_context(mode=context.GRAPH_MODE, device_target=target) + mindspore.set_context(mode=0, device_target=target) # whether train on modelarts or local server if not args_opt.is_modelarts: if args_opt.experiment is None: args_opt.experiment = 'samples' os.system('mkdir {0}'.format(args_opt.experiment)) - context.set_context(device_id=int(args_opt.device_id)) + mindspore.set_context(device_id=int(args_opt.device_id)) dataset = create_dataset(args_opt.dataroot, args_opt.dataset, args_opt.batchSize, args_opt.imageSize, 1, args_opt.workers, target) @@ -57,7 +57,7 @@ if __name__ == '__main__': if args_opt.experiment is None: args_opt.experiment = '/cache/train_output' os.system('mkdir {0}'.format(args_opt.experiment)) - context.set_context(device_id=int(os.getenv('DEVICE_ID'))) + mindspore.set_context(device_id=int(os.getenv('DEVICE_ID'))) data_name = 'LSUN-bedroom.zip' local_data_url = 
'/cache/data_path/' mox.file.copy_parallel(src_url=args_opt.data_url, dst_url=local_data_url) diff --git a/official/cv/YOLOX/eval.py b/official/cv/YOLOX/eval.py index 0c17dbd8e..74f98243f 100644 --- a/official/cv/YOLOX/eval.py +++ b/official/cv/YOLOX/eval.py @@ -19,8 +19,8 @@ import os import datetime import shutil from model_utils.config import config -from mindspore.context import ParallelMode -from mindspore import context +import mindspore +from mindspore import ParallelMode from mindspore.communication.management import init, get_group_size, get_rank from src.logger import get_logger @@ -36,7 +36,7 @@ def run_test(): config.annFile = os.path.join(config.data_dir, 'annotations/instances_val2017.json') devid = int(os.getenv('DEVICE_ID', '0')) - context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, save_graphs=False, device_id=devid) + mindspore.set_context(mode=0, device_target=config.device_target, save_graphs=False, device_id=devid) # logger config.log_dir = os.path.join( @@ -52,8 +52,8 @@ def run_test(): config.rank = get_rank() config.group_size = get_group_size() device_num = get_group_size() - context.reset_auto_parallel_context() - context.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=device_num) + mindspore.reset_auto_parallel_context() + mindspore.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=device_num) # ------------------network create---------------------------------------------------------------------------- config.logger.info('Begin Creating Network....') if config.backbone == "yolox_darknet53": diff --git a/official/cv/YOLOX/export.py b/official/cv/YOLOX/export.py index 956cdc4dc..575a81182 100644 --- a/official/cv/YOLOX/export.py +++ b/official/cv/YOLOX/export.py @@ -19,8 +19,8 @@ python export.py import os import numpy as np -import mindspore as ms -from mindspore import Tensor, export, context +import mindspore +from mindspore import Tensor, export from model_utils.config import config from src.yolox import DetectionBlock @@ -33,10 +33,10 @@ def run_export(): Returns:None """ - context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target) + mindspore.set_context(mode=0, device_target=config.device_target) if config.device_target == "Ascend": device_id = int(os.getenv('DEVICE_ID', '0')) - context.set_context(device_id=device_id) + mindspore.set_context(device_id=device_id) if config.backbone == "yolox_darknet53": backbone = "yolofpn" else: @@ -45,7 +45,7 @@ def run_export(): network.set_train(False) assert config.val_ckpt is not None, "config.ckpt_file is None." 
network = load_weights(network, config.val_ckpt) - input_arr = Tensor(np.ones([config.export_bs, 3, config.input_size[0], config.input_size[1]]), ms.float32) + input_arr = Tensor(np.ones([config.export_bs, 3, config.input_size[0], config.input_size[1]]), mindspore.float32) file_name = backbone export(network, input_arr, file_name=file_name, file_format=config.file_format) diff --git a/official/cv/YOLOX/model_utils/moxing_adapter.py b/official/cv/YOLOX/model_utils/moxing_adapter.py index 7730180dd..5ed81870f 100644 --- a/official/cv/YOLOX/model_utils/moxing_adapter.py +++ b/official/cv/YOLOX/model_utils/moxing_adapter.py @@ -17,7 +17,7 @@ import os import functools -from mindspore import context +import mindspore from .config import config _global_sync_count = 0 @@ -92,7 +92,7 @@ def moxing_wrapper(pre_process=None, post_process=None): sync_data(config.train_url, config.output_path) print("Workspace downloaded: ", os.listdir(config.output_path)) - context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) + mindspore.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) config.device_num = get_device_num() config.device_id = get_device_id() if not os.path.exists(config.output_path): diff --git a/official/cv/YOLOX/predict.py b/official/cv/YOLOX/predict.py index a3364306d..27a8d0664 100644 --- a/official/cv/YOLOX/predict.py +++ b/official/cv/YOLOX/predict.py @@ -16,7 +16,8 @@ import os import cv2 import numpy as np -from mindspore import Tensor, context +import mindspore +from mindspore import Tensor from model_utils.config import config from src.transform import preproc from src.util import load_weights, DetectionEngine @@ -39,7 +40,7 @@ LABELS = ['person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'teddy bear', 'hair drier', 'toothbrush'] -context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target) +mindspore.set_context(mode=0, device_target=config.device_target) class YoloxPredict: diff --git a/official/cv/YOLOX/train.py b/official/cv/YOLOX/train.py index 15fae3a9e..749e7ea96 100644 --- a/official/cv/YOLOX/train.py +++ b/official/cv/YOLOX/train.py @@ -21,11 +21,11 @@ import datetime import mindspore from mindspore import DynamicLossScaleManager -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.common import set_seed from mindspore.train.callback import CheckpointConfig, ModelCheckpoint, SummaryCollector from mindspore.communication.management import init, get_rank, get_group_size -from mindspore import context, Model, load_checkpoint, load_param_into_net +from mindspore import Model, load_checkpoint, load_param_into_net from mindspore.profiler.profiling import Profiler from mindspore.common.tensor import Tensor @@ -71,9 +71,9 @@ def set_default(cfg): def set_graph_kernel_context(): - if context.get_context("device_target") == "GPU": - context.set_context(enable_graph_kernel=True) - context.set_context(graph_kernel_flags="--enable_parallel_fusion " + if mindspore.get_context("device_target") == "GPU": + mindspore.set_context(enable_graph_kernel=True) + mindspore.set_context(graph_kernel_flags="--enable_parallel_fusion " "--enable_trans_op_optimize " "--disable_cluster_ops=ReduceMax,Reshape " "--enable_expand_ops=Conv2D") @@ -82,7 +82,7 @@ def set_graph_kernel_context(): def network_init(cfg): """ Network init """ device_id = int(os.getenv('DEVICE_ID', '0')) - context.set_context(mode=context.GRAPH_MODE, + mindspore.set_context(mode=0, 
device_target=cfg.device_target, save_graphs=cfg.save_graphs, device_id=device_id, save_graphs_path="ir_path", max_call_depth=2000) set_graph_kernel_context() @@ -100,8 +100,8 @@ def network_init(cfg): init() cfg.rank = get_rank() cfg.group_size = get_group_size() - context.reset_auto_parallel_context() - context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, + mindspore.reset_auto_parallel_context() + mindspore.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, device_num=cfg.group_size) # select for master rank save ckpt or all rank save, compatible for model parallel @@ -115,13 +115,13 @@ def network_init(cfg): def parallel_init(args): - context.reset_auto_parallel_context() + mindspore.reset_auto_parallel_context() parallel_mode = ParallelMode.STAND_ALONE degree = 1 if args.is_distributed: parallel_mode = ParallelMode.DATA_PARALLEL degree = get_group_size() - context.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=degree) + mindspore.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=degree) def modelarts_pre_process(cfg): @@ -314,7 +314,7 @@ def run_train(cfg): base_network = DetectionBlock(cfg, backbone=backbone) # syc bn only support distributed training in graph mode - if cfg.use_syc_bn and cfg.is_distributed and context.get_context('mode') == context.GRAPH_MODE: + if cfg.use_syc_bn and cfg.is_distributed and mindspore.get_context('mode') == 0: cfg.logger.info("Using Synchronized batch norm layer...") use_syc_bn(base_network) default_recurisive_init(base_network) diff --git a/official/cv/YOLOv3/convert_weight.py b/official/cv/YOLOv3/convert_weight.py index 98b22996f..6e7abf51e 100644 --- a/official/cv/YOLOv3/convert_weight.py +++ b/official/cv/YOLOv3/convert_weight.py @@ -15,7 +15,7 @@ """Convert weight to mindspore ckpt.""" import os import numpy as np -import mindspore as ms +import mindspore from src.yolo import YOLOV3DarkNet53 from model_utils.config import config @@ -61,14 +61,14 @@ def convert(weights_file, output_file): index += weight.size param_list.append({'name': weight.name, 'type': weight.dtype, 'shape': weight.shape, - 'data': ms.Tensor(weight_data)}) - param_list.append({'name': mean.name, 'type': mean.dtype, 'shape': mean.shape, 'data': ms.Tensor(mean_data)}) - param_list.append({'name': var.name, 'type': var.dtype, 'shape': var.shape, 'data': ms.Tensor(var_data)}) + 'data': mindspore.Tensor(weight_data)}) + param_list.append({'name': mean.name, 'type': mean.dtype, 'shape': mean.shape, 'data': mindspore.Tensor(mean_data)}) + param_list.append({'name': var.name, 'type': var.dtype, 'shape': var.shape, 'data': mindspore.Tensor(var_data)}) param_list.append({'name': gamma.name, 'type': gamma.dtype, 'shape': gamma.shape, - 'data': ms.Tensor(gamma_data)}) - param_list.append({'name': beta.name, 'type': beta.dtype, 'shape': beta.shape, 'data': ms.Tensor(beta_data)}) + 'data': mindspore.Tensor(gamma_data)}) + param_list.append({'name': beta.name, 'type': beta.dtype, 'shape': beta.shape, 'data': mindspore.Tensor(beta_data)}) - ms.save_checkpoint(param_list, output_file) + mindspore.save_checkpoint(param_list, output_file) if __name__ == "__main__": diff --git a/official/cv/YOLOv3/eval.py b/official/cv/YOLOv3/eval.py index 64da549fe..95e2d4737 100644 --- a/official/cv/YOLOv3/eval.py +++ b/official/cv/YOLOv3/eval.py @@ -17,7 +17,7 @@ import os import datetime import time -import mindspore as ms +import mindspore from 
src.yolo import YOLOV3DarkNet53 from src.logger import get_logger @@ -37,7 +37,7 @@ def conver_testing_shape(args): def load_parameters(network, file_name): config.logger.info("yolov3 pretrained network model: %s", file_name) - param_dict = ms.load_checkpoint(file_name) + param_dict = mindspore.load_checkpoint(file_name) param_dict_new = {} for key, values in param_dict.items(): if key.startswith('moments.'): @@ -46,7 +46,7 @@ def load_parameters(network, file_name): param_dict_new[key[13:]] = values else: param_dict_new[key] = values - ms.load_param_into_net(network, param_dict_new) + mindspore.load_param_into_net(network, param_dict_new) config.logger.info('load_model %s success', file_name) @@ -58,7 +58,7 @@ def run_test(): config.annFile = os.path.join(config.data_dir, 'annotations/instances_val2014.json') devid = int(os.getenv('DEVICE_ID')) if os.getenv('DEVICE_ID') else 0 - ms.set_context(mode=ms.GRAPH_MODE, device_target=config.device_target, save_graphs=False, device_id=devid) + mindspore.set_context(mode=0, device_target=config.device_target, save_graphs=False, device_id=devid) # logger config.outputs_dir = os.path.join(config.log_path, @@ -66,9 +66,9 @@ def run_test(): rank_id = int(os.environ.get('RANK_ID')) if os.environ.get('RANK_ID') else 0 config.logger = get_logger(config.outputs_dir, rank_id) - ms.reset_auto_parallel_context() - parallel_mode = ms.ParallelMode.STAND_ALONE - ms.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=1) + mindspore.reset_auto_parallel_context() + parallel_mode = mindspore.ParallelMode.STAND_ALONE + mindspore.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=1) config.logger.info('Creating Network....') network = YOLOV3DarkNet53(is_training=False) diff --git a/official/cv/YOLOv3/eval_onnx.py b/official/cv/YOLOv3/eval_onnx.py index e06ec68e6..dc6d6db2b 100644 --- a/official/cv/YOLOv3/eval_onnx.py +++ b/official/cv/YOLOv3/eval_onnx.py @@ -16,7 +16,7 @@ import os import datetime import time import onnxruntime -import mindspore as ms +import mindspore from src.logger import get_logger from src.yolo_dataset import create_yolo_dataset from src.util import DetectionEngine @@ -35,7 +35,7 @@ def run_test(): config.annFile = os.path.join(config.data_dir, 'annotations/instances_val2014.json') devid = int(os.getenv('DEVICE_ID')) if os.getenv('DEVICE_ID') else 0 - ms.set_context(mode=ms.GRAPH_MODE, device_target=config.device_target, save_graphs=False, device_id=devid) + mindspore.set_context(mode=0, device_target=config.device_target, save_graphs=False, device_id=devid) # logger config.outputs_dir = os.path.join(config.log_path, @@ -43,9 +43,9 @@ def run_test(): rank_id = int(os.environ.get('RANK_ID')) if os.environ.get('RANK_ID') else 0 config.logger = get_logger(config.outputs_dir, rank_id) - ms.reset_auto_parallel_context() - parallel_mode = ms.ParallelMode.STAND_ALONE - ms.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=1) + mindspore.reset_auto_parallel_context() + parallel_mode = mindspore.ParallelMode.STAND_ALONE + mindspore.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=1) print(config.device_target) if config.device_target == 'GPU': providers = ['CUDAExecutionProvider'] diff --git a/official/cv/YOLOv3/export.py b/official/cv/YOLOv3/export.py index 33c0555e7..7a477ef67 100644 --- a/official/cv/YOLOv3/export.py +++ b/official/cv/YOLOv3/export.py @@ -12,7 +12,7 @@ # See the License for the specific 
language governing permissions and # limitations under the License. # ============================================================================ -import mindspore as ms +import mindspore from src.yolo import YOLOV3DarkNet53 from model_utils.config import config @@ -21,20 +21,20 @@ from model_utils.moxing_adapter import moxing_wrapper, modelarts_export_preproce @moxing_wrapper(pre_process=modelarts_export_preprocess) def run_export(): - ms.set_context(mode=ms.GRAPH_MODE, device_target=config.device_target) + mindspore.set_context(mode=0, device_target=config.device_target) if config.device_target == "Ascend": - ms.set_context(device_id=config.device_id) + mindspore.set_context(device_id=config.device_id) network = YOLOV3DarkNet53(is_training=False) - param_dict = ms.load_checkpoint(config.ckpt_file) - ms.load_param_into_net(network, param_dict) + param_dict = mindspore.load_checkpoint(config.ckpt_file) + mindspore.load_param_into_net(network, param_dict) network.set_train(False) shape = [config.batch_size, 3] + config.test_img_shape - input_data = ms.numpy.zeros(shape, ms.float32) + input_data = mindspore.numpy.zeros(shape, mindspore.float32) - ms.export(network, input_data, file_name=config.file_name, file_format=config.file_format) + mindspore.export(network, input_data, file_name=config.file_name, file_format=config.file_format) if __name__ == "__main__": diff --git a/official/cv/YOLOv3/model_utils/moxing_adapter.py b/official/cv/YOLOv3/model_utils/moxing_adapter.py index 24a6d90e8..120e57e89 100644 --- a/official/cv/YOLOv3/model_utils/moxing_adapter.py +++ b/official/cv/YOLOv3/model_utils/moxing_adapter.py @@ -18,7 +18,7 @@ import os import time import functools -import mindspore as ms +import mindspore from .config import config _global_sync_count = 0 @@ -155,7 +155,7 @@ def moxing_wrapper(pre_process=None, post_process=None): sync_data(config.train_url, config.output_path) print("Workspace downloaded: ", os.listdir(config.output_path)) - ms.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) + mindspore.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) config.device_num = get_device_num() config.device_id = get_device_id() if not os.path.exists(config.output_path): diff --git a/official/cv/YOLOv3/src/initializer.py b/official/cv/YOLOv3/src/initializer.py index a5c6c283d..5ba4bfbba 100644 --- a/official/cv/YOLOv3/src/initializer.py +++ b/official/cv/YOLOv3/src/initializer.py @@ -16,7 +16,7 @@ import math from functools import reduce import numpy as np -import mindspore as ms +import mindspore import mindspore.nn as nn from .util import load_backbone @@ -137,7 +137,7 @@ def _calculate_fan_in_and_fan_out(arr): return fan_in, fan_out -class KaimingUniform(ms.common.initializer.Initializer): +class KaimingUniform(mindspore.common.initializer.Initializer): """Kaiming uniform initializer.""" def __init__(self, a=0, mode='fan_in', nonlinearity='leaky_relu'): super(KaimingUniform, self).__init__() @@ -154,20 +154,20 @@ def default_recurisive_init(custom_cell): """Initialize parameter.""" for _, cell in custom_cell.cells_and_names(): if isinstance(cell, nn.Conv2d): - cell.weight.set_data(ms.common.initializer.initializer(KaimingUniform(a=math.sqrt(5)), + cell.weight.set_data(mindspore.common.initializer.initializer(KaimingUniform(a=math.sqrt(5)), cell.weight.shape, cell.weight.dtype)) if cell.bias is not None: fan_in, _ = _calculate_fan_in_and_fan_out(cell.weight) bound = 1 / math.sqrt(fan_in) -
cell.bias.set_data(ms.common.initializer.initializer(ms.common.initializer.Uniform(bound), + cell.bias.set_data(mindspore.common.initializer.initializer(mindspore.common.initializer.Uniform(bound), cell.bias.shape, cell.bias.dtype)) elif isinstance(cell, nn.Dense): - cell.weight.set_data(ms.common.initializer.initializer(KaimingUniform(a=math.sqrt(5)), + cell.weight.set_data(mindspore.common.initializer.initializer(KaimingUniform(a=math.sqrt(5)), cell.weight.shape, cell.weight.dtype)) if cell.bias is not None: fan_in, _ = _calculate_fan_in_and_fan_out(cell.weight) bound = 1 / math.sqrt(fan_in) - cell.bias.set_data(ms.common.initializer.initializer(ms.common.initializer.Uniform(bound), + cell.bias.set_data(mindspore.common.initializer.initializer(mindspore.common.initializer.Uniform(bound), cell.bias.shape, cell.bias.dtype)) elif isinstance(cell, (nn.BatchNorm2d, nn.BatchNorm1d)): pass @@ -182,7 +182,7 @@ def load_yolov3_params(args, network): args.logger.info('Not load pre-trained backbone, please be careful') if args.resume_yolov3: - param_dict = ms.load_checkpoint(args.resume_yolov3) + param_dict = mindspore.load_checkpoint(args.resume_yolov3) param_dict_new = {} for key, values in param_dict.items(): if key.startswith('moments.'): @@ -195,5 +195,5 @@ def load_yolov3_params(args, network): args.logger.info('in resume {}'.format(key)) args.logger.info('resume finished') - ms.load_param_into_net(network, param_dict_new) + mindspore.load_param_into_net(network, param_dict_new) args.logger.info('load_model {} success'.format(args.resume_yolov3)) diff --git a/official/cv/YOLOv3/src/util.py b/official/cv/YOLOv3/src/util.py index 3ac480b37..376f9f703 100644 --- a/official/cv/YOLOv3/src/util.py +++ b/official/cv/YOLOv3/src/util.py @@ -20,7 +20,7 @@ import numpy as np from pycocotools.coco import COCO from pycocotools.cocoeval import COCOeval -import mindspore as ms +import mindspore from .yolo import YoloLossBlock @@ -62,9 +62,9 @@ class AverageMeter: def load_backbone(net, ckpt_path, args): """Load darknet53 backbone checkpoint.""" - param_dict = ms.load_checkpoint(ckpt_path) + param_dict = mindspore.load_checkpoint(ckpt_path) net.init_parameters_data() - ms.load_param_into_net(net, param_dict) + mindspore.load_param_into_net(net, param_dict) param_not_load = [] for _, param in net.parameters_and_names(): @@ -148,7 +148,7 @@ def keep_loss_fp32(network): """Keep loss of network with float32""" for _, cell in network.cells_and_names(): if isinstance(cell, (YoloLossBlock,)): - cell.to_float(ms.float32) + cell.to_float(mindspore.float32) def cpu_affinity(rank_id, device_num): diff --git a/official/cv/YOLOv3/src/yolo.py b/official/cv/YOLOv3/src/yolo.py index 23e7bc085..86c98d41d 100644 --- a/official/cv/YOLOv3/src/yolo.py +++ b/official/cv/YOLOv3/src/yolo.py @@ -13,7 +13,7 @@ # limitations under the License. 
# ============================================================================ """YOLOv3 based on DarkNet.""" -import mindspore as ms +import mindspore import mindspore.nn as nn import mindspore.ops as ops @@ -178,7 +178,7 @@ class DetectionBlock(nn.Cell): idx = (6, 7, 8) else: raise KeyError("Invalid scale value for DetectionBlock") - self.anchors = ms.Tensor([self.config.anchor_scales[i] for i in idx], ms.float32) + self.anchors = mindspore.Tensor([self.config.anchor_scales[i] for i in idx], mindspore.float32) self.num_anchors_per_scale = 3 self.num_attrib = 4+1+self.config.num_classes self.lambda_coord = 1 @@ -200,8 +200,8 @@ class DetectionBlock(nn.Cell): range_x = range(grid_size[1]) range_y = range(grid_size[0]) - grid_x = ops.Cast()(ops.tuple_to_array(range_x), ms.float32) - grid_y = ops.Cast()(ops.tuple_to_array(range_y), ms.float32) + grid_x = ops.Cast()(ops.tuple_to_array(range_x), mindspore.float32) + grid_y = ops.Cast()(ops.tuple_to_array(range_y), mindspore.float32) # Tensor of shape [grid_size[0], grid_size[1], 1, 1] representing the coordinate of x/y axis for each grid # [batch, gridx, gridy, 1, 1] grid_x = self.tile(self.reshape(grid_x, (1, 1, -1, 1, 1)), (1, grid_size[0], 1, 1, 1)) @@ -215,7 +215,7 @@ class DetectionBlock(nn.Cell): # gridsize1 is x # gridsize0 is y box_xy = (self.sigmoid(box_xy) + grid) / ops.Cast()(ops.tuple_to_array((grid_size[1], - grid_size[0])), ms.float32) + grid_size[0])), mindspore.float32) # box_wh is w->h box_wh = ops.Exp()(box_wh) * self.anchors / input_shape @@ -278,8 +278,8 @@ class YoloLossBlock(nn.Cell): idx = (6, 7, 8) else: raise KeyError("Invalid scale value for DetectionBlock") - self.anchors = ms.Tensor([self.config.anchor_scales[i] for i in idx], ms.float32) - self.ignore_threshold = ms.Tensor(self.config.ignore_threshold, ms.float32) + self.anchors = mindspore.Tensor([self.config.anchor_scales[i] for i in idx], mindspore.float32) + self.ignore_threshold = mindspore.Tensor(self.config.ignore_threshold, mindspore.float32) self.concat = ops.Concat(axis=-1) self.iou = Iou() self.reduce_max = ops.ReduceMax(keep_dims=False) @@ -299,7 +299,7 @@ class YoloLossBlock(nn.Cell): class_probs = y_true[:, :, :, :, 5:] grid_shape = ops.Shape()(prediction)[1:3] - grid_shape = ops.Cast()(ops.tuple_to_array(grid_shape[::-1]), ms.float32) + grid_shape = ops.Cast()(ops.tuple_to_array(grid_shape[::-1]), mindspore.float32) pred_boxes = self.concat((pred_xy, pred_wh)) true_xy = y_true[:, :, :, :, :2] * grid_shape - grid @@ -323,7 +323,7 @@ class YoloLossBlock(nn.Cell): # ignore_mask IOU too small ignore_mask = best_iou < self.ignore_threshold - ignore_mask = ops.Cast()(ignore_mask, ms.float32) + ignore_mask = ops.Cast()(ignore_mask, mindspore.float32) ignore_mask = ops.ExpandDims()(ignore_mask, -1) # ignore_mask backpro will cause a lot maximunGrad and minimumGrad time consume. 
# so we turn off its gradient @@ -373,7 +373,7 @@ class YOLOV3DarkNet53(nn.Cell): def construct(self, x): input_shape = ops.shape(x)[2:4] - input_shape = ops.cast(self.tenser_to_array(input_shape), ms.float32) + input_shape = ops.cast(self.tenser_to_array(input_shape), mindspore.float32) big_object_output, medium_object_output, small_object_output = self.feature_map(x) if not self.keep_detect: return big_object_output, medium_object_output, small_object_output @@ -397,7 +397,7 @@ class YoloWithLossCell(nn.Cell): def construct(self, x, y_true_0, y_true_1, y_true_2, gt_0, gt_1, gt_2): input_shape = ops.shape(x)[2:4] - input_shape = ops.cast(self.tenser_to_array(input_shape), ms.float32) + input_shape = ops.cast(self.tenser_to_array(input_shape), mindspore.float32) yolo_out = self.yolo_network(x) loss_l = self.loss_big(*yolo_out[0], y_true_0, gt_0, input_shape) loss_m = self.loss_me(*yolo_out[1], y_true_1, gt_1, input_shape) diff --git a/official/cv/YOLOv3/train.py b/official/cv/YOLOv3/train.py index 6e1351a4c..d762c7ca6 100644 --- a/official/cv/YOLOv3/train.py +++ b/official/cv/YOLOv3/train.py @@ -17,7 +17,7 @@ import os import time import datetime -import mindspore as ms +import mindspore import mindspore.nn as nn import mindspore.communication as comm @@ -34,7 +34,7 @@ from model_utils.config import config # only useful for huawei cloud modelarts. from model_utils.moxing_adapter import moxing_wrapper, modelarts_pre_process -ms.set_seed(1) +mindspore.set_seed(1) def conver_training_shape(args): @@ -43,9 +43,9 @@ def conver_training_shape(args): def set_graph_kernel_context(): - if ms.get_context("device_target") == "GPU": - ms.set_context(enable_graph_kernel=True) - ms.set_context(graph_kernel_flags="--enable_parallel_fusion " + if mindspore.get_context("device_target") == "GPU": + mindspore.set_context(enable_graph_kernel=True) + mindspore.set_context(graph_kernel_flags="--enable_parallel_fusion " "--enable_trans_op_optimize " "--disable_cluster_ops=ReduceMax,Reshape " "--enable_expand_ops=Conv2D") @@ -53,22 +53,22 @@ def set_graph_kernel_context(): def network_init(args): device_id = int(os.getenv('DEVICE_ID', '0')) - ms.set_context(mode=ms.GRAPH_MODE, device_target=args.device_target, save_graphs=False, device_id=device_id) + mindspore.set_context(mode=0, device_target=args.device_target, save_graphs=False, device_id=device_id) set_graph_kernel_context() # Set mempool block size for improving memory utilization, which will not take effect in GRAPH_MODE - if ms.get_context("mode") == ms.PYNATIVE_MODE: - ms.set_context(mempool_block_size="31GB") + if mindspore.get_context("mode") == 1: + mindspore.set_context(mempool_block_size="31GB") # Since the default max memory pool available size on ascend is 30GB, # which does not meet the requirements and needs to be adjusted larger. 
- if ms.get_context("device_target") == "Ascend": - ms.set_context(max_device_memory="31GB") + if mindspore.get_context("device_target") == "Ascend": + mindspore.set_context(max_device_memory="31GB") profiler = None if args.need_profiler: profiling_dir = os.path.join("profiling", datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S')) - profiler = ms.profiler.Profiler(output_path=profiling_dir) + profiler = mindspore.profiler.Profiler(output_path=profiling_dir) # init distributed if args.is_distributed: @@ -94,13 +94,13 @@ def network_init(args): def parallel_init(args): - ms.reset_auto_parallel_context() - parallel_mode = ms.ParallelMode.STAND_ALONE + mindspore.reset_auto_parallel_context() + parallel_mode = mindspore.ParallelMode.STAND_ALONE degree = 1 if args.is_distributed: - parallel_mode = ms.ParallelMode.DATA_PARALLEL + parallel_mode = mindspore.ParallelMode.DATA_PARALLEL degree = comm.get_group_size() - ms.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=degree) + mindspore.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=degree) @moxing_wrapper(pre_process=modelarts_pre_process) @@ -135,13 +135,13 @@ def run_train(): config.steps_per_epoch = ds.get_dataset_size() lr = get_lr(config) - opt = nn.Momentum(params=get_param_groups(network), momentum=config.momentum, learning_rate=ms.Tensor(lr), + opt = nn.Momentum(params=get_param_groups(network), momentum=config.momentum, learning_rate=mindspore.Tensor(lr), weight_decay=config.weight_decay, loss_scale=config.loss_scale) - is_gpu = ms.get_context("device_target") == "GPU" + is_gpu = mindspore.get_context("device_target") == "GPU" if is_gpu: loss_scale_value = 1.0 - loss_scale = ms.FixedLossScaleManager(loss_scale_value, drop_overflow_update=False) - network = ms.build_train_network(network, optimizer=opt, loss_scale_manager=loss_scale, + loss_scale = mindspore.FixedLossScaleManager(loss_scale_value, drop_overflow_update=False) + network = mindspore.build_train_network(network, optimizer=opt, loss_scale_manager=loss_scale, level="O2", keep_batchnorm_fp32=False) keep_loss_fp32(network) else: @@ -158,14 +158,14 @@ def run_train(): images = data["image"] input_shape = images.shape[2:4] config.logger.info('iter[{}], shape{}'.format(step_idx, input_shape[0])) - images = ms.Tensor.from_numpy(images) + images = mindspore.Tensor.from_numpy(images) - batch_y_true_0 = ms.Tensor.from_numpy(data['bbox1']) - batch_y_true_1 = ms.Tensor.from_numpy(data['bbox2']) - batch_y_true_2 = ms.Tensor.from_numpy(data['bbox3']) - batch_gt_box0 = ms.Tensor.from_numpy(data['gt_box1']) - batch_gt_box1 = ms.Tensor.from_numpy(data['gt_box2']) - batch_gt_box2 = ms.Tensor.from_numpy(data['gt_box3']) + batch_y_true_0 = mindspore.Tensor.from_numpy(data['bbox1']) + batch_y_true_1 = mindspore.Tensor.from_numpy(data['bbox2']) + batch_y_true_2 = mindspore.Tensor.from_numpy(data['bbox3']) + batch_gt_box0 = mindspore.Tensor.from_numpy(data['gt_box1']) + batch_gt_box1 = mindspore.Tensor.from_numpy(data['gt_box2']) + batch_gt_box2 = mindspore.Tensor.from_numpy(data['gt_box3']) loss = network(images, batch_y_true_0, batch_y_true_1, batch_y_true_2, batch_gt_box0, batch_gt_box1, batch_gt_box2) @@ -196,7 +196,7 @@ def run_train(): if not os.path.exists(ckpt_path): os.makedirs(ckpt_path, exist_ok=True) ckpt_name = os.path.join(ckpt_path, "yolov3_{}_{}.ckpt".format(epoch_idx + 1, config.steps_per_epoch)) - ms.save_checkpoint(network, ckpt_name) + mindspore.save_checkpoint(network, ckpt_name) ckpt_list = 
[os.path.join(ckpt_path, f) for f in os.listdir(ckpt_path)] ckpt_list = sorted(ckpt_list, key=os.path.getmtime) for i in range(len(ckpt_list) - config.max_checkpoint_num): diff --git a/official/cv/YOLOv4/eval.py b/official/cv/YOLOv4/eval.py index 6a4749f6b..d435fa2a7 100644 --- a/official/cv/YOLOv4/eval.py +++ b/official/cv/YOLOv4/eval.py @@ -17,8 +17,8 @@ import os import datetime import time -from mindspore.context import ParallelMode -from mindspore import context +import mindspore +from mindspore import ParallelMode from mindspore.train.serialization import load_checkpoint, load_param_into_net from src.yolo import YOLOV4CspDarkNet53 @@ -90,7 +90,7 @@ def modelarts_pre_process(): def run_eval(): start_time = time.time() device_id = int(os.getenv('DEVICE_ID')) if os.getenv('DEVICE_ID') else 0 - context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, device_id=device_id) + mindspore.set_context(mode=0, device_target=config.device_target, device_id=device_id) # logger config.outputs_dir = os.path.join(config.log_path, @@ -98,9 +98,9 @@ def run_eval(): rank_id = int(os.environ.get('RANK_ID')) if os.environ.get('RANK_ID') else 0 config.logger = get_logger(config.outputs_dir, rank_id) - context.reset_auto_parallel_context() + mindspore.reset_auto_parallel_context() parallel_mode = ParallelMode.STAND_ALONE - context.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=1) + mindspore.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=1) config.logger.info('Creating Network....') network = YOLOV4CspDarkNet53() diff --git a/official/cv/YOLOv4/export.py b/official/cv/YOLOv4/export.py index 6904fbd53..6173b3b62 100644 --- a/official/cv/YOLOv4/export.py +++ b/official/cv/YOLOv4/export.py @@ -16,7 +16,7 @@ import os import numpy as np import mindspore -from mindspore import context, Tensor +from mindspore import Tensor from mindspore.train.serialization import export, load_checkpoint, load_param_into_net from src.yolo import YOLOV4CspDarkNet53 @@ -32,9 +32,9 @@ def modelarts_pre_process(): @moxing_wrapper(pre_process=modelarts_pre_process) def run_export(): - context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target) + mindspore.set_context(mode=0, device_target=config.device_target) if config.device_target == "Ascend": - context.set_context(device_id=config.device_id) + mindspore.set_context(device_id=config.device_id) ts_shape = config.testing_shape network = YOLOV4CspDarkNet53() diff --git a/official/cv/YOLOv4/infer/README.md b/official/cv/YOLOv4/infer/README.md index 673cfedfe..4ba63230e 100644 --- a/official/cv/YOLOv4/infer/README.md +++ b/official/cv/YOLOv4/infer/README.md @@ -77,7 +77,7 @@ warmup_epochs: 4 MindSpore支持数据并行及自动并行。自动并行是MindSpore融合了数据并行、模型并行及混合并行的一种分布式并行模式,可以自动建立代价模型,为用户选择一种并行模式。相关代码示例。 ```shell -context.set_auto_parallel_context(parallel_mode = ParallelMode.DATA_PARALLEL, device_num = device_num) +mindspore.set_auto_parallel_context(parallel_mode = ParallelMode.DATA_PARALLEL, device_num = device_num) ``` ### 混合精度训练 @@ -92,7 +92,7 @@ context.set_auto_parallel_context(parallel_mode = ParallelMode.DATA_PARALLEL, de 1. 硬件环境准备请参见各硬件产品[“驱动和固件安装升级指南”](https://support.huawei.com/enterprise/zh/category/ai-computing-platform-pid-1557196528909) 。需要在硬件设备上安装与CANN版本配套的固件与驱动。 -2. 宿主机上需要安装Python3和Docker,并登录[Ascend Hub中心](https://ascend.huawei.com/ascendhub/#/home) 获取镜像。 +2. 
宿主机上需要安装Python3和Docker,并登录[Ascend Hub中心](https://www.hiascend.com/developer/ascendhub) 获取镜像。 当前模型支持的镜像列表如下表所示。 **表 1** 镜像列表 diff --git a/official/cv/YOLOv4/model_utils/moxing_adapter.py b/official/cv/YOLOv4/model_utils/moxing_adapter.py index 25838a7da..189ff0667 100644 --- a/official/cv/YOLOv4/model_utils/moxing_adapter.py +++ b/official/cv/YOLOv4/model_utils/moxing_adapter.py @@ -17,7 +17,7 @@ import os import functools -from mindspore import context +import mindspore from .config import config _global_sync_count = 0 @@ -92,7 +92,7 @@ def moxing_wrapper(pre_process=None, post_process=None): sync_data(config.train_url, config.output_path) print("Workspace downloaded: ", os.listdir(config.output_path)) - context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) + mindspore.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) config.device_num = get_device_num() config.device_id = get_device_id() if not os.path.exists(config.output_path): diff --git a/official/cv/YOLOv4/modelarts/modelarts.py b/official/cv/YOLOv4/modelarts/modelarts.py index bb0c13100..4fe575b09 100644 --- a/official/cv/YOLOv4/modelarts/modelarts.py +++ b/official/cv/YOLOv4/modelarts/modelarts.py @@ -20,11 +20,10 @@ import datetime import numpy as np import mindspore -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.nn.optim.momentum import Momentum from mindspore import Tensor import mindspore.nn as nn -from mindspore import context from mindspore.communication.management import init, get_rank, get_group_size from mindspore.train.serialization import export, load_checkpoint, load_param_into_net from mindspore.train.callback import ModelCheckpoint, RunContext @@ -82,7 +81,7 @@ def set_default(): config.ann_val_file = os.path.join(args_opt.data_url, 'annotations/instances_val2017.json') device_id = int(os.getenv('DEVICE_ID', '0')) - context.set_context(mode=context.GRAPH_MODE, + mindspore.set_context(mode=0, device_target=config.device_target, save_graphs=False, device_id=device_id) if config.need_profiler: @@ -208,13 +207,13 @@ def run_train(): profiler = set_default() loss_meter = AverageMeter('loss') - context.reset_auto_parallel_context() + mindspore.reset_auto_parallel_context() parallel_mode = ParallelMode.STAND_ALONE degree = 1 if config.is_distributed: parallel_mode = ParallelMode.DATA_PARALLEL degree = get_group_size() - context.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=degree) + mindspore.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=degree) network = YOLOV4CspDarkNet53() if config.run_eval: diff --git a/official/cv/YOLOv4/src/yolo.py b/official/cv/YOLOv4/src/yolo.py index 357e37e73..40a174eb1 100644 --- a/official/cv/YOLOv4/src/yolo.py +++ b/official/cv/YOLOv4/src/yolo.py @@ -13,11 +13,10 @@ # limitations under the License. 
# ============================================================================ """YOLOv4 based on DarkNet.""" -import mindspore as ms +import mindspore import mindspore.nn as nn from mindspore.common.tensor import Tensor -from mindspore import context -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.parallel._auto_parallel_context import auto_parallel_context from mindspore.communication.management import get_group_size from mindspore.ops import operations as P @@ -237,7 +236,7 @@ class DetectionBlock(nn.Cell): self.offset_x_y = 0.025 else: raise KeyError("Invalid scale value for DetectionBlock") - self.anchors = Tensor([self.config.anchor_scales[i] for i in idx], ms.float32) + self.anchors = Tensor([self.config.anchor_scales[i] for i in idx], mindspore.float32) self.num_anchors_per_scale = 3 self.num_attrib = 4+1+self.config.num_classes self.lambda_coord = 1 @@ -262,8 +261,8 @@ class DetectionBlock(nn.Cell): range_x = range(grid_size[1]) range_y = range(grid_size[0]) - grid_x = P.Cast()(F.tuple_to_array(range_x), ms.float32) - grid_y = P.Cast()(F.tuple_to_array(range_y), ms.float32) + grid_x = P.Cast()(F.tuple_to_array(range_x), mindspore.float32) + grid_y = P.Cast()(F.tuple_to_array(range_y), mindspore.float32) # Tensor of shape [grid_size[0], grid_size[1], 1, 1] representing the coordinate of x/y axis for each grid # [batch, gridx, gridy, 1, 1] grid_x = self.tile(self.reshape(grid_x, (1, 1, -1, 1, 1)), (1, grid_size[0], 1, 1, 1)) @@ -279,7 +278,7 @@ class DetectionBlock(nn.Cell): # gridsize1 is x # gridsize0 is y box_xy = (self.scale_x_y * self.sigmoid(box_xy) - self.offset_x_y + grid) / \ - P.Cast()(F.tuple_to_array((grid_size[1], grid_size[0])), ms.float32) + P.Cast()(F.tuple_to_array((grid_size[1], grid_size[0])), mindspore.float32) # box_wh is w->h box_wh = P.Exp()(box_wh) * self.anchors / input_shape box_confidence = self.sigmoid(box_confidence) @@ -342,8 +341,8 @@ class YoloLossBlock(nn.Cell): idx = (6, 7, 8) else: raise KeyError("Invalid scale value for DetectionBlock") - self.anchors = Tensor([self.config.anchor_scales[i] for i in idx], ms.float32) - self.ignore_threshold = Tensor(self.config.ignore_threshold, ms.float32) + self.anchors = Tensor([self.config.anchor_scales[i] for i in idx], mindspore.float32) + self.ignore_threshold = Tensor(self.config.ignore_threshold, mindspore.float32) self.concat = P.Concat(axis=-1) self.iou = Iou() self.reduce_max = P.ReduceMax(keep_dims=False) @@ -372,7 +371,7 @@ class YoloLossBlock(nn.Cell): true_boxes = y_true[:, :, :, :, :4] grid_shape = P.Shape()(prediction)[1:3] - grid_shape = P.Cast()(F.tuple_to_array(grid_shape[::-1]), ms.float32) + grid_shape = P.Cast()(F.tuple_to_array(grid_shape[::-1]), mindspore.float32) pred_boxes = self.concat((pred_xy, pred_wh)) true_wh = y_true[:, :, :, :, 2:4] @@ -396,7 +395,7 @@ class YoloLossBlock(nn.Cell): # ignore_mask IOU too small ignore_mask = best_iou < self.ignore_threshold - ignore_mask = P.Cast()(ignore_mask, ms.float32) + ignore_mask = P.Cast()(ignore_mask, mindspore.float32) ignore_mask = P.ExpandDims()(ignore_mask, -1) # ignore_mask backpro will cause a lot maximunGrad and minimumGrad time consume. 
# so we turn off its gradient @@ -438,7 +437,7 @@ class YOLOV4CspDarkNet53(nn.Cell): super(YOLOV4CspDarkNet53, self).__init__() self.config = default_config self.keep_detect = self.config.keep_detect - self.test_img_shape = Tensor(tuple(self.config.test_img_shape), ms.float32) + self.test_img_shape = Tensor(tuple(self.config.test_img_shape), mindspore.float32) # YOLOv4 network self.feature_map = YOLOv4(backbone=CspDarkNet53(ResidualBlock, detect=True), @@ -497,13 +496,13 @@ class TrainingWrapper(nn.Cell): self.sens = sens self.reducer_flag = False self.grad_reducer = None - self.parallel_mode = context.get_auto_parallel_context("parallel_mode") + self.parallel_mode = mindspore.get_auto_parallel_context("parallel_mode") if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]: self.reducer_flag = True if self.reducer_flag: - mean = context.get_auto_parallel_context("gradients_mean") + mean = mindspore.get_auto_parallel_context("gradients_mean") if auto_parallel_context().get_device_num_is_set(): - degree = context.get_auto_parallel_context("device_num") + degree = mindspore.get_auto_parallel_context("device_num") else: degree = get_group_size() self.grad_reducer = nn.DistributedGradReducer(optimizer.parameters, mean, degree) @@ -549,9 +548,9 @@ class Giou(nn.Cell): union = box_p_area + box_gt_area - intersection union = union + self.eps c_area = c_area + self.eps - iou = self.div(self.cast(intersection, ms.float32), self.cast(union, ms.float32)) + iou = self.div(self.cast(intersection, mindspore.float32), self.cast(union, mindspore.float32)) res_mid0 = c_area - union - res_mid1 = self.div(self.cast(res_mid0, ms.float32), self.cast(c_area, ms.float32)) + res_mid1 = self.div(self.cast(res_mid0, mindspore.float32), self.cast(c_area, mindspore.float32)) giou = iou - res_mid1 giou = C.clip_by_value(giou, -1.0, 1.0) return giou diff --git a/official/cv/YOLOv4/test.py b/official/cv/YOLOv4/test.py index 26be396ee..1235ab62e 100644 --- a/official/cv/YOLOv4/test.py +++ b/official/cv/YOLOv4/test.py @@ -22,9 +22,9 @@ from collections import defaultdict import json import numpy as np -from mindspore import context +import mindspore from mindspore import Tensor -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.communication.management import init, get_rank, get_group_size from mindspore.train.serialization import load_checkpoint, load_param_into_net @@ -37,7 +37,7 @@ from model_utils.moxing_adapter import moxing_wrapper from model_utils.device_adapter import get_device_id, get_device_num devid = int(os.getenv('DEVICE_ID')) -context.set_context(mode=context.GRAPH_MODE, device_target="Davinci", save_graphs=False, device_id=devid) +mindspore.set_context(mode=0, device_target="Davinci", save_graphs=False, device_id=devid) config.data_root = os.path.join(config.data_dir, 'test2017') config.nms_thresh = config.test_nms_thresh @@ -288,12 +288,12 @@ def run_test(): config.logger = get_logger(config.outputs_dir, config.rank) - context.reset_auto_parallel_context() + mindspore.reset_auto_parallel_context() if config.is_distributed: parallel_mode = ParallelMode.DATA_PARALLEL else: parallel_mode = ParallelMode.STAND_ALONE - context.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=1) + mindspore.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=1) config.logger.info('Creating Network....') network = YOLOV4CspDarkNet53() diff --git a/official/cv/YOLOv4/train.py 
b/official/cv/YOLOv4/train.py index eaf55b6cb..988fb3b95 100644 --- a/official/cv/YOLOv4/train.py +++ b/official/cv/YOLOv4/train.py @@ -17,15 +17,14 @@ import os import time import datetime -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.nn.optim.momentum import Momentum from mindspore import Tensor import mindspore.nn as nn -from mindspore import context from mindspore.communication.management import init, get_rank, get_group_size from mindspore.train.callback import ModelCheckpoint, RunContext from mindspore.train.callback import CheckpointConfig -import mindspore as ms +import mindspore from mindspore.common import set_seed from mindspore.profiler.profiling import Profiler @@ -55,7 +54,7 @@ def set_default(): config.ann_val_file = os.path.join(config.data_dir, 'annotations/instances_val2017.json') device_id = int(os.getenv('DEVICE_ID', '0')) - context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, save_graphs=False, + mindspore.set_context(mode=0, device_target=config.device_target, save_graphs=False, device_id=device_id, ascend_config={"precision_mode": "allow_fp32_to_fp16"}) if config.need_profiler: @@ -180,13 +179,13 @@ def get_network(net, cfg, learning_rate): def run_train(): profiler = set_default() loss_meter = AverageMeter('loss') - context.reset_auto_parallel_context() + mindspore.reset_auto_parallel_context() parallel_mode = ParallelMode.STAND_ALONE degree = 1 if config.is_distributed: parallel_mode = ParallelMode.DATA_PARALLEL degree = get_group_size() - context.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=degree) + mindspore.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=degree) network = YOLOV4CspDarkNet53() if config.run_eval: @@ -234,7 +233,7 @@ def run_train(): data_val_root = config.data_val_root ann_val_file = config.ann_val_file save_ckpt_path = os.path.join(config.outputs_dir, 'ckpt_' + str(config.rank) + '/') - input_val_shape = Tensor(tuple(config.test_img_shape), ms.float32) + input_val_shape = Tensor(tuple(config.test_img_shape), mindspore.float32) # init detection engine eval_dataset, eval_data_size = create_yolo_dataset(data_val_root, ann_val_file, is_training=False, batch_size=config.per_batch_size, max_epoch=1, device_num=1, @@ -263,7 +262,7 @@ def run_train(): batch_gt_box1 = Tensor.from_numpy(data['gt_box2']) batch_gt_box2 = Tensor.from_numpy(data['gt_box3']) - input_shape = Tensor(tuple(input_shape[::-1]), ms.float32) + input_shape = Tensor(tuple(input_shape[::-1]), mindspore.float32) loss = network(images, batch_y_true_0, batch_y_true_1, batch_y_true_2, batch_gt_box0, batch_gt_box1, batch_gt_box2, input_shape) loss_meter.update(loss.asnumpy()) diff --git a/official/cv/YOLOv5/eval.py b/official/cv/YOLOv5/eval.py index 6f451173b..3f4bd88d2 100644 --- a/official/cv/YOLOv5/eval.py +++ b/official/cv/YOLOv5/eval.py @@ -17,9 +17,8 @@ import os import time import shutil -import mindspore as ms -from mindspore import context -from mindspore.context import ParallelMode +import mindspore +from mindspore import ParallelMode from mindspore.communication.management import init, get_group_size, get_rank from src.yolo import YOLOV5 @@ -37,7 +36,7 @@ def eval_preprocess(): config.val_img_dir = os.path.join(config.data_dir, config.val_img_dir) config.val_ann_file = os.path.join(config.data_dir, config.val_ann_file) device_id = int(os.getenv('DEVICE_ID', '0')) - ms.set_context(mode=ms.GRAPH_MODE, 
device_target=config.device_target, device_id=device_id) + mindspore.set_context(mode=0, device_target=config.device_target, device_id=device_id) parallel_mode = ParallelMode.STAND_ALONE config.eval_parallel = config.is_distributed and config.eval_parallel device_num = 1 @@ -47,8 +46,8 @@ def eval_preprocess(): config.group_size = get_group_size() device_num = get_group_size() parallel_mode = ParallelMode.DATA_PARALLEL - context.reset_auto_parallel_context() - context.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=device_num) + mindspore.reset_auto_parallel_context() + mindspore.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=device_num) # logger module is managed by config, it is used in other function. e.x. config.logger.info("xxx") config.logger = get_logger(config.output_dir, device_id) @@ -56,7 +55,7 @@ def eval_preprocess(): def load_parameters(network, filename): config.logger.info("yolov5 pretrained network model: %s", filename) - param_dict = ms.load_checkpoint(filename) + param_dict = mindspore.load_checkpoint(filename) param_dict_new = {} for key, values in param_dict.items(): if key.startswith('moments.'): @@ -65,7 +64,7 @@ def load_parameters(network, filename): param_dict_new[key[13:]] = values else: param_dict_new[key] = values - ms.load_param_into_net(network, param_dict_new) + mindspore.load_param_into_net(network, param_dict_new) config.logger.info('load_model %s success', filename) diff --git a/official/cv/YOLOv5/eval_onnx.py b/official/cv/YOLOv5/eval_onnx.py index 71aa246a4..58edd5789 100644 --- a/official/cv/YOLOv5/eval_onnx.py +++ b/official/cv/YOLOv5/eval_onnx.py @@ -20,8 +20,8 @@ import time import numpy as np import onnxruntime as ort -from mindspore.context import ParallelMode -from mindspore import context +import mindspore +from mindspore import ParallelMode from eval import DetectionEngine from model_utils.config import config @@ -63,9 +63,9 @@ def run_eval(): rank_id = int(os.getenv('DEVICE_ID', '0')) config.logger = get_logger(config.outputs_dir, rank_id) - context.reset_auto_parallel_context() + mindspore.reset_auto_parallel_context() parallel_mode = ParallelMode.STAND_ALONE - context.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=1) + mindspore.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=1) ds = create_yolo_dataset(data_root, ann_file, batch_size=config.per_batch_size, device_num=1, rank=rank_id, config=config, is_training=False, shuffle=False) diff --git a/official/cv/YOLOv5/export.py b/official/cv/YOLOv5/export.py index b11392730..24785c4c8 100644 --- a/official/cv/YOLOv5/export.py +++ b/official/cv/YOLOv5/export.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================ -import mindspore as ms +import mindspore from src.yolo import YOLOV5s_Infer @@ -22,9 +22,9 @@ from model_utils.moxing_adapter import moxing_wrapper, modelarts_export_preproce @moxing_wrapper(pre_process=modelarts_export_preprocess, pre_args=[config]) def run_export(): - ms.set_context(mode=ms.GRAPH_MODE, device_target=config.device_target) + mindspore.set_context(mode=0, device_target=config.device_target) if config.device_target == "Ascend": - ms.set_context(device_id=config.device_id) + mindspore.set_context(device_id=config.device_id) dict_version = {'yolov5s': 0, 'yolov5m': 1, 'yolov5l': 2, 'yolov5x': 3} config.file_name = config.file_name + '_' + config.yolov5_version @@ -32,12 +32,12 @@ def run_export(): network = YOLOV5s_Infer(config.testing_shape[0], version=dict_version[config.yolov5_version]) network.set_train(False) - param_dict = ms.load_checkpoint(config.ckpt_file) - ms.load_param_into_net(network, param_dict) + param_dict = mindspore.load_checkpoint(config.ckpt_file) + mindspore.load_param_into_net(network, param_dict) - input_data = ms.numpy.zeros([config.batch_size, config.testing_shape[0], config.testing_shape[1], 3], ms.int8) + input_data = mindspore.numpy.zeros([config.batch_size, config.testing_shape[0], config.testing_shape[1], 3], mindspore.int8) - ms.export(network, input_data, file_name=config.file_name, file_format=config.file_format) + mindspore.export(network, input_data, file_name=config.file_name, file_format=config.file_format) print('==========success export===============') if __name__ == "__main__": diff --git a/official/cv/YOLOv5/model_utils/moxing_adapter.py b/official/cv/YOLOv5/model_utils/moxing_adapter.py index a2f802f59..3ef209319 100644 --- a/official/cv/YOLOv5/model_utils/moxing_adapter.py +++ b/official/cv/YOLOv5/model_utils/moxing_adapter.py @@ -17,7 +17,7 @@ import os import functools -import mindspore as ms +import mindspore from .config import config _global_sync_count = 0 @@ -151,7 +151,7 @@ def moxing_wrapper(pre_process=None, post_process=None, **kwargs): sync_data(config.train_url, config.output_path) print("Workspace downloaded: ", os.listdir(config.output_path)) - ms.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) + mindspore.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) config.device_num = get_device_num() config.device_id = get_device_id() if not os.path.exists(config.output_path): diff --git a/official/cv/YOLOv5/modelarts/train_start.py b/official/cv/YOLOv5/modelarts/train_start.py index 81f98c8c8..8d510884b 100644 --- a/official/cv/YOLOv5/modelarts/train_start.py +++ b/official/cv/YOLOv5/modelarts/train_start.py @@ -16,11 +16,11 @@ import os import time import numpy as np -import mindspore as ms +import mindspore import mindspore.nn as nn import mindspore.communication as comm from mindspore.train.serialization import export, load_checkpoint, load_param_into_net -from mindspore import context, Tensor +from mindspore import Tensor from src.yolo import YOLOV5, YoloWithLossCell, YOLOV5s_Infer from src.logger import get_logger @@ -36,14 +36,14 @@ from model_utils.device_adapter import get_device_id from model_utils.moxing_adapter import moxing_wrapper, modelarts_pre_process -ms.set_seed(1) +mindspore.set_seed(1) def init_distribute(): comm.init() config.rank = comm.get_rank() config.group_size = comm.get_group_size() -
ms.set_auto_parallel_context(parallel_mode=ms.ParallelMode.DATA_PARALLEL, gradients_mean=True, + mindspore.set_auto_parallel_context(parallel_mode=mindspore.ParallelMode.DATA_PARALLEL, gradients_mean=True, device_num=config.group_size) @@ -57,7 +57,7 @@ def train_preprocess(): if config.pretrained_checkpoint: config.pretrained_checkpoint = os.path.join(config.load_path, config.pretrained_checkpoint) device_id = get_device_id() - context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, device_id=device_id) + mindspore.set_context(mode=0, device_target=config.device_target, device_id=device_id) if config.is_distributed: # init distributed @@ -81,7 +81,7 @@ def export_models(ckpt_path): outputs_path = os.path.join(config.output_dir, 'yolov5') param_dict = load_checkpoint(ckpt_path) load_param_into_net(net, param_dict) - input_arr = Tensor(np.zeros([1, 12, config.testing_shape[0] // 2, config.testing_shape[1] // 2]), ms.float32) + input_arr = Tensor(np.zeros([1, 12, config.testing_shape[0] // 2, config.testing_shape[1] // 2]), mindspore.float32) export(net, input_arr, file_name=outputs_path, file_format=config.file_format) config.logger.info("export best model finished....") @@ -105,7 +105,7 @@ def run_train(): steps_per_epoch = ds.get_dataset_size() lr = get_lr(config, steps_per_epoch) - opt = nn.Momentum(params=get_param_groups(network), momentum=config.momentum, learning_rate=ms.Tensor(lr), + opt = nn.Momentum(params=get_param_groups(network), momentum=config.momentum, learning_rate=mindspore.Tensor(lr), weight_decay=config.weight_decay, loss_scale=config.loss_scale) network = nn.TrainOneStepCell(network, opt, config.loss_scale // 2) network.set_train() @@ -118,7 +118,7 @@ def run_train(): for step_idx, data in enumerate(data_loader): images = data[0] input_shape = images.shape[2:4] - input_shape = ms.Tensor(tuple(input_shape[::-1]), ms.float32) + input_shape = mindspore.Tensor(tuple(input_shape[::-1]), mindspore.float32) loss = network(images, data[2], data[3], data[4], data[5], data[6], data[7], input_shape) loss_meter.update(loss.asnumpy()) @@ -140,7 +140,7 @@ def run_train(): loss_meter.reset() if config.rank == 0: ckpt_name = os.path.join(config.output_dir, "yolov5_{}_{}.ckpt".format(epoch_idx + 1, steps_per_epoch)) - ms.save_checkpoint(network, ckpt_name) + mindspore.save_checkpoint(network, ckpt_name) export_models(ckpt_name) config.logger.info('==========end training===============') diff --git a/official/cv/YOLOv5/src/initializer.py b/official/cv/YOLOv5/src/initializer.py index ff65b133c..e92b97cda 100644 --- a/official/cv/YOLOv5/src/initializer.py +++ b/official/cv/YOLOv5/src/initializer.py @@ -14,7 +14,7 @@ # ============================================================================ """Parameter init.""" import math -import mindspore as ms +import mindspore from mindspore import nn @@ -22,14 +22,14 @@ def default_recurisive_init(custom_cell): """Initialize parameter.""" for _, cell in custom_cell.cells_and_names(): if isinstance(cell, (nn.Conv2d, nn.Dense)): - cell.weight.set_data(ms.common.initializer.initializer(ms.common.initializer.HeUniform(math.sqrt(5)), + cell.weight.set_data(mindspore.common.initializer.initializer(mindspore.common.initializer.HeUniform(math.sqrt(5)), cell.weight.shape, cell.weight.dtype)) def load_yolov5_params(args, network): """Load yolov5 backbone parameter from checkpoint.""" if args.resume_yolov5: - param_dict = load_checkpoint(args.resume_yolov5) + param_dict = mindspore.load_checkpoint(args.resume_yolov5) param_dict_new = 
{} for key, values in param_dict.items(): if key.startswith('moments.'): @@ -42,11 +42,11 @@ def load_yolov5_params(args, network): args.logger.info('in resume {}'.format(key)) args.logger.info('resume finished') - load_param_into_net(network, param_dict_new) + mindspore.load_param_into_net(network, param_dict_new) args.logger.info('load_model {} success'.format(args.resume_yolov5)) if args.pretrained_checkpoint: - param_dict = load_checkpoint(args.pretrained_checkpoint) + param_dict = mindspore.load_checkpoint(args.pretrained_checkpoint) param_dict_new = {} for key, values in param_dict.items(): if key.startswith('moments.'): @@ -62,11 +62,11 @@ def load_yolov5_params(args, network): args.logger.info('in load {}'.format(key)) args.logger.info('pretrained finished') - load_param_into_net(network, param_dict_new) + mindspore.load_param_into_net(network, param_dict_new) args.logger.info('load_model {} success'.format(args.pretrained_backbone)) if args.pretrained_backbone: - param_dict = load_checkpoint(args.pretrained_backbone) + param_dict = mindspore.load_checkpoint(args.pretrained_backbone) param_dict_new = {} for key, values in param_dict.items(): if key.startswith('moments.'): @@ -79,5 +79,5 @@ def load_yolov5_params(args, network): args.logger.info('in resume {}'.format(key)) args.logger.info('pretrained finished') - load_param_into_net(network, param_dict_new) + mindspore.load_param_into_net(network, param_dict_new) args.logger.info('load_model {} success'.format(args.pretrained_backbone)) diff --git a/official/cv/YOLOv5/src/util.py b/official/cv/YOLOv5/src/util.py index 5bc8fd781..cdcf2ea92 100644 --- a/official/cv/YOLOv5/src/util.py +++ b/official/cv/YOLOv5/src/util.py @@ -24,7 +24,7 @@ import numpy as np from pycocotools.coco import COCO from pycocotools.cocoeval import COCOeval -import mindspore as ms +import mindspore import mindspore.nn as nn from mindspore import Tensor, ops @@ -143,7 +143,7 @@ def keep_loss_fp32(network): """Keep loss of network with float32""" for _, cell in network.cells_and_names(): if isinstance(cell, (YoloLossBlock,)): - cell.to_float(ms.float32) + cell.to_float(mindspore.float32) class Redirct: @@ -458,7 +458,7 @@ class EvalWrapper: self.dataset = dataset self.per_batch_size = config.per_batch_size self.device_num = config.group_size - self.input_shape = Tensor(tuple(config.test_img_shape), ms.float32) + self.input_shape = Tensor(tuple(config.test_img_shape), mindspore.float32) self.engine = engine self.eval_parallel = config.eval_parallel if config.eval_parallel: @@ -477,7 +477,7 @@ class EvalWrapper: def inference(self): for index, data in enumerate(self.dataset.create_dict_iterator(output_numpy=True, num_epochs=1)): image = data["image"] - image = ms.Tensor(image) + image = mindspore.Tensor(image) image_shape_ = data["image_shape"] image_id_ = data["img_id"] output_big, output_me, output_small = self.network(image, self.input_shape) diff --git a/official/cv/YOLOv5/src/yolo.py b/official/cv/YOLOv5/src/yolo.py index f9eeec698..143903ffa 100644 --- a/official/cv/YOLOv5/src/yolo.py +++ b/official/cv/YOLOv5/src/yolo.py @@ -14,7 +14,7 @@ # ============================================================================ """YOLOv5 based on DarkNet.""" import numpy as np -import mindspore as ms +import mindspore import mindspore.nn as nn import mindspore.ops as ops @@ -139,7 +139,7 @@ class DetectionBlock(nn.Cell): self.offset_x_y = 0.025 else: raise KeyError("Invalid scale value for DetectionBlock") - self.anchors = ms.Tensor([self.config.anchor_scales[i] for i 
in idx], ms.float32) + self.anchors = mindspore.Tensor([self.config.anchor_scales[i] for i in idx], mindspore.float32) self.num_anchors_per_scale = 3 self.num_attrib = 4+1+self.config.num_classes self.lambda_coord = 1 @@ -166,8 +166,8 @@ class DetectionBlock(nn.Cell): grid_size[1])) prediction = self.transpose(prediction, (0, 3, 4, 1, 2)) - grid_x = ms.numpy.arange(grid_size[1]) - grid_y = ms.numpy.arange(grid_size[0]) + grid_x = mindspore.numpy.arange(grid_size[1]) + grid_y = mindspore.numpy.arange(grid_size[0]) # Tensor of shape [grid_size[0], grid_size[1], 1, 1] representing the coordinate of x/y axis for each grid # [batch, gridx, gridy, 1, 1] grid_x = self.tile(self.reshape(grid_x, (1, 1, -1, 1, 1)), (1, grid_size[0], 1, 1, 1)) @@ -183,7 +183,7 @@ class DetectionBlock(nn.Cell): # gridsize1 is x # gridsize0 is y box_xy = (self.scale_x_y * self.sigmoid(box_xy) - self.offset_x_y + grid) / \ - ops.cast(ops.tuple_to_array((grid_size[1], grid_size[0])), ms.float32) + ops.cast(ops.tuple_to_array((grid_size[1], grid_size[0])), mindspore.float32) # box_wh is w->h box_wh = self.exp(box_wh) * self.anchors / input_shape @@ -250,8 +250,8 @@ class YoloLossBlock(nn.Cell): idx = (6, 7, 8) else: raise KeyError("Invalid scale value for DetectionBlock") - self.anchors = ms.Tensor([self.config.anchor_scales[i] for i in idx], ms.float32) - self.ignore_threshold = ms.Tensor(self.config.ignore_threshold, ms.float32) + self.anchors = mindspore.Tensor([self.config.anchor_scales[i] for i in idx], mindspore.float32) + self.ignore_threshold = mindspore.Tensor(self.config.ignore_threshold, mindspore.float32) self.concat = ops.Concat(axis=-1) self.iou = Iou() self.reduce_max = ops.ReduceMax(keep_dims=False) @@ -281,7 +281,7 @@ class YoloLossBlock(nn.Cell): true_boxes = y_true[:, :, :, :, :4] grid_shape = prediction.shape[1:3] - grid_shape = ops.cast(self.tuple_to_array(grid_shape[::-1]), ms.float32) + grid_shape = ops.cast(self.tuple_to_array(grid_shape[::-1]), mindspore.float32) pred_boxes = self.concat((pred_xy, pred_wh)) true_wh = y_true[:, :, :, :, 2:4] @@ -304,7 +304,7 @@ class YoloLossBlock(nn.Cell): # ignore_mask IOU too small ignore_mask = best_iou < self.ignore_threshold - ignore_mask = ops.cast(ignore_mask, ms.float32) + ignore_mask = ops.cast(ignore_mask, mindspore.float32) ignore_mask = self.expand_dims(ignore_mask, -1) # ignore_mask backpro will cause a lot maximunGrad and minimumGrad time consume. 
# so we turn off its gradient @@ -353,9 +353,9 @@ class YOLOV5(nn.Cell): self.detect_1 = DetectionBlock('l', is_training=is_training) self.detect_2 = DetectionBlock('m', is_training=is_training) self.detect_3 = DetectionBlock('s', is_training=is_training) - self.mean = ms.Tensor(np.array([0.485 * 255, 0.456 * 255, 0.406 * 255], + self.mean = mindspore.Tensor(np.array([0.485 * 255, 0.456 * 255, 0.406 * 255], dtype=np.float32)).reshape((1, 1, 1, 3)) - self.std = ms.Tensor(np.array([0.229 * 255, 0.224 * 255, 0.225 * 255], + self.std = mindspore.Tensor(np.array([0.229 * 255, 0.224 * 255, 0.225 * 255], dtype=np.float32)).reshape((1, 1, 1, 3)) def construct(self, x, input_shape): @@ -432,9 +432,9 @@ class GIou(nn.Cell): union = box_p_area + box_gt_area - intersection union = union + self.eps c_area = c_area + self.eps - iou = self.div(ops.cast(intersection, ms.float32), ops.cast(union, ms.float32)) + iou = self.div(ops.cast(intersection, mindspore.float32), ops.cast(union, mindspore.float32)) res_mid0 = c_area - union - res_mid1 = self.div(ops.cast(res_mid0, ms.float32), ops.cast(c_area, ms.float32)) + res_mid1 = self.div(ops.cast(res_mid0, mindspore.float32), ops.cast(c_area, mindspore.float32)) giou = iou - res_mid1 giou = ops.clip_by_value(giou, -1.0, 1.0) return giou diff --git a/official/cv/YOLOv5/train.py b/official/cv/YOLOv5/train.py index d2318227c..e7af2b662 100644 --- a/official/cv/YOLOv5/train.py +++ b/official/cv/YOLOv5/train.py @@ -16,7 +16,7 @@ import os import time from collections import deque -import mindspore as ms +import mindspore import mindspore.nn as nn import mindspore.communication as comm from mindspore import load_checkpoint, Parameter, save_checkpoint @@ -35,14 +35,14 @@ from model_utils.device_adapter import get_device_id from model_utils.moxing_adapter import moxing_wrapper, modelarts_pre_process, modelarts_post_process -ms.set_seed(1) +mindspore.set_seed(1) def init_distribute(): comm.init() config.rank = comm.get_rank() config.group_size = comm.get_group_size() - ms.set_auto_parallel_context(parallel_mode=ms.ParallelMode.DATA_PARALLEL, gradients_mean=True, + mindspore.set_auto_parallel_context(parallel_mode=mindspore.ParallelMode.DATA_PARALLEL, gradients_mean=True, device_num=config.group_size) @@ -56,9 +56,9 @@ def train_preprocess(): device_id = get_device_id() if config.device_target == "Ascend": device_id = get_device_id() - ms.set_context(mode=ms.GRAPH_MODE, device_target=config.device_target, device_id=device_id) + mindspore.set_context(mode=0, device_target=config.device_target, device_id=device_id) else: - ms.set_context(mode=ms.GRAPH_MODE, device_target=config.device_target) + mindspore.set_context(mode=0, device_target=config.device_target) if config.is_distributed: # init distributed @@ -95,7 +95,7 @@ def load_parameters(val_network, train_network): param_dict_new[key[13:]] = values else: param_dict_new[key] = values - ms.load_param_into_net(val_network, param_dict_new) + mindspore.load_param_into_net(val_network, param_dict_new) config.logger.info('Load train network success') @@ -146,7 +146,7 @@ def run_train(): steps_per_epoch = ds.get_dataset_size() lr = get_lr(config, steps_per_epoch) - opt = nn.Momentum(params=get_param_groups(network), momentum=config.momentum, learning_rate=ms.Tensor(lr), + opt = nn.Momentum(params=get_param_groups(network), momentum=config.momentum, learning_rate=mindspore.Tensor(lr), weight_decay=config.weight_decay, loss_scale=config.loss_scale) network = nn.TrainOneStepCell(network, opt, config.loss_scale // 2) 
network.set_train() @@ -163,7 +163,7 @@ def run_train(): for step_idx, data in enumerate(data_loader): images = data[0] input_shape = images.shape[1:3] - input_shape = ms.Tensor(input_shape, ms.float32) + input_shape = mindspore.Tensor(input_shape, mindspore.float32) loss = network(images, data[2], data[3], data[4], data[5], data[6], data[7], input_shape) loss_meter.update(loss.asnumpy()) @@ -185,7 +185,7 @@ def run_train(): loss_meter.reset() if config.rank == 0 and (epoch_idx % config.save_ckpt_interval == 0): ckpt_name = os.path.join(config.output_dir, "yolov5_{}_{}.ckpt".format(epoch_idx + 1, steps_per_epoch)) - ms.save_checkpoint(network, ckpt_name) + mindspore.save_checkpoint(network, ckpt_name) if len(ckpt_queue) == config.save_ckpt_max_num: ckpt_to_remove = ckpt_queue.popleft() os.remove(ckpt_to_remove) diff --git a/official/nlp/Bert/export.py b/official/nlp/Bert/export.py index 942ffeb2e..65a3233b1 100644 --- a/official/nlp/Bert/export.py +++ b/official/nlp/Bert/export.py @@ -17,8 +17,9 @@ import os import shutil import numpy as np +import mindspore import mindspore.common.dtype as mstype -from mindspore import Tensor, context, load_checkpoint, export +from mindspore import Tensor, load_checkpoint, export from src.finetune_eval_model import BertCLSModel, BertSquadModel, BertNERModel from src.bert_for_finetune import BertNER @@ -40,9 +41,9 @@ def modelarts_pre_process(): @moxing_wrapper(pre_process=modelarts_pre_process) def run_export(): '''export function''' - context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target) + mindspore.set_context(mode=0, device_target=args.device_target) if args.device_target == "Ascend": - context.set_context(device_id=args.device_id) + mindspore.set_context(device_id=args.device_id) if args.description == "run_ner": label_list = [] diff --git a/official/nlp/Bert/modelarts/train_start.py b/official/nlp/Bert/modelarts/train_start.py index 8b9413cde..30149e903 100644 --- a/official/nlp/Bert/modelarts/train_start.py +++ b/official/nlp/Bert/modelarts/train_start.py @@ -19,9 +19,10 @@ Bert finetune and evaluation script. 
import os import collections import shutil +import mindspore import mindspore.common.dtype as mstype from mindspore import log as logger -from mindspore import Tensor, context, load_checkpoint, export +from mindspore import Tensor, load_checkpoint, export from mindspore.nn.wrap.loss_scale import DynamicLossScaleUpdateCell from mindspore.nn.optim import AdamWeightDecay, Lamb, Momentum from mindspore.train.model import Model @@ -153,9 +154,9 @@ def _get_last_ckpt(ckpt_dir): def run_export(ckpt_dir): '''export function''' ckpt_file = _get_last_ckpt(ckpt_dir) - context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.device_target) + mindspore.set_context(mode=0, device_target=args_opt.device_target) if args_opt.device_target == "Ascend": - context.set_context(device_id=args_opt.device_id) + mindspore.set_context(device_id=args_opt.device_id) if args_opt.description == "run_ner": label_list = [] @@ -218,10 +219,10 @@ def run_squad(): load_finetune_checkpoint_path = args_opt.load_finetune_checkpoint_path target = args_opt.device_target if target == "Ascend": - context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=args_opt.device_id) + mindspore.set_context(mode=0, device_target="Ascend", device_id=args_opt.device_id) elif target == "GPU": - context.set_context(mode=context.GRAPH_MODE, device_target="GPU") - context.set_context(enable_graph_kernel=True) + mindspore.set_context(mode=0, device_target="GPU") + mindspore.set_context(enable_graph_kernel=True) if bert_net_cfg.compute_type != mstype.float32: logger.warning('GPU only support fp32 temporarily, run with fp32.') bert_net_cfg.compute_type = mstype.float32 diff --git a/official/nlp/Bert/pretrain_eval.py b/official/nlp/Bert/pretrain_eval.py index 2537b8229..51f804467 100644 --- a/official/nlp/Bert/pretrain_eval.py +++ b/official/nlp/Bert/pretrain_eval.py @@ -18,7 +18,7 @@ Bert evaluation script. """ import os -from mindspore import context +import mindspore from mindspore.train.model import Model from mindspore.train.serialization import load_checkpoint, load_param_into_net from src.utils import BertMetric @@ -32,7 +32,7 @@ def bert_predict(): Predict function ''' devid = int(os.getenv('DEVICE_ID')) - context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=devid) + mindspore.set_context(mode=0, device_target="Ascend", device_id=devid) dataset = create_eval_dataset(cfg.batch_size, 1, data_dir=cfg.eval_data_dir, dataset_format=cfg.dataset_format) net_for_pretraining = BertPretrainEval(bert_net_cfg) net_for_pretraining.set_train(False) diff --git a/official/nlp/Bert/quick_start.py b/official/nlp/Bert/quick_start.py index 6dd08e514..9ca98af7f 100644 --- a/official/nlp/Bert/quick_start.py +++ b/official/nlp/Bert/quick_start.py @@ -17,7 +17,7 @@ Bert quick start script. 
''' -import mindspore as ms +import mindspore from mindspore.train.model import Model from mindspore.ops import operations as P from mindspore.train.serialization import load_checkpoint, load_param_into_net @@ -59,9 +59,9 @@ def convert_single_example(text, max_seq_length, tokenizer): input_mask.append(0) segment_ids.append(0) - input_ids = ms.Tensor([input_ids,], dtype=ms.int32) - input_mask = ms.Tensor([input_mask,], dtype=ms.int32) - segment_ids = ms.Tensor([segment_ids,], dtype=ms.int32) + input_ids = mindspore.Tensor([input_ids,], dtype=mindspore.int32) + input_mask = mindspore.Tensor([input_mask,], dtype=mindspore.int32) + segment_ids = mindspore.Tensor([segment_ids,], dtype=mindspore.int32) return input_ids, input_mask, segment_ids diff --git a/official/nlp/Bert/run_classifier.py b/official/nlp/Bert/run_classifier.py index cebf106eb..e9c0e7b58 100644 --- a/official/nlp/Bert/run_classifier.py +++ b/official/nlp/Bert/run_classifier.py @@ -19,9 +19,8 @@ Bert finetune and evaluation script. import os from tqdm import tqdm -import mindspore as ms +import mindspore import mindspore.common.dtype as mstype -from mindspore import context from mindspore import log as logger from mindspore.nn.wrap.loss_scale import DynamicLossScaleUpdateCell from mindspore.nn.optim import AdamWeightDecay, Lamb, Momentum @@ -81,7 +80,7 @@ def do_train(dataset=None, network=None, load_checkpoint_path="", save_checkpoin param_dict = load_checkpoint(load_checkpoint_path) load_param_into_net(network, param_dict) - if ms.get_context("device_target") == "CPU": + if mindspore.get_context("device_target") == "CPU": netwithgrads = BertFinetuneCellCPU(network, optimizer=optimizer) else: update_cell = DynamicLossScaleUpdateCell(loss_scale_value=2**32, scale_factor=2, scale_window=1000) @@ -171,18 +170,18 @@ def run_classifier(): load_finetune_checkpoint_path = args_opt.load_finetune_checkpoint_path target = args_opt.device_target if target == "Ascend": - context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=args_opt.device_id) + mindspore.set_context(mode=0, device_target="Ascend", device_id=args_opt.device_id) elif target == "GPU": - context.set_context(mode=context.GRAPH_MODE, device_target="GPU") - context.set_context(enable_graph_kernel=True) + mindspore.set_context(mode=0, device_target="GPU") + mindspore.set_context(enable_graph_kernel=True) if bert_net_cfg.compute_type != mstype.float32: logger.warning('GPU only support fp32 temporarily, run with fp32.') bert_net_cfg.compute_type = mstype.float32 elif target == "CPU": if args_opt.use_pynative_mode: - context.set_context(mode=context.PYNATIVE_MODE, device_target="CPU", device_id=args_opt.device_id) + mindspore.set_context(mode=1, device_target="CPU", device_id=args_opt.device_id) else: - context.set_context(mode=context.GRAPH_MODE, device_target="CPU", device_id=args_opt.device_id) + mindspore.set_context(mode=0, device_target="CPU", device_id=args_opt.device_id) else: raise Exception("Target error, CPU or GPU or Ascend is supported.") diff --git a/official/nlp/Bert/run_ner.py b/official/nlp/Bert/run_ner.py index f020586f7..99b69a2df 100644 --- a/official/nlp/Bert/run_ner.py +++ b/official/nlp/Bert/run_ner.py @@ -20,9 +20,8 @@ Bert finetune and evaluation script. 
diff --git a/official/nlp/Bert/run_ner.py b/official/nlp/Bert/run_ner.py
index f020586f7..99b69a2df 100644
--- a/official/nlp/Bert/run_ner.py
+++ b/official/nlp/Bert/run_ner.py
@@ -20,9 +20,8 @@ Bert finetune and evaluation script.
 import os
 import time
 from tqdm import tqdm
-import mindspore as ms
+import mindspore
 import mindspore.common.dtype as mstype
-from mindspore import context
 from mindspore import log as logger
 from mindspore.nn.wrap.loss_scale import DynamicLossScaleUpdateCell
 from mindspore.nn.optim import AdamWeightDecay, Lamb, Momentum
@@ -80,7 +79,7 @@ def do_train(dataset=None, network=None, load_checkpoint_path="", save_checkpoin
     param_dict = load_checkpoint(load_checkpoint_path)
     load_param_into_net(network, param_dict)
 
-    if ms.get_context("device_target") == "CPU":
+    if mindspore.get_context("device_target") == "CPU":
         netwithgrads = BertFinetuneCellCPU(network, optimizer=optimizer)
     else:
         update_cell = DynamicLossScaleUpdateCell(loss_scale_value=2**32, scale_factor=2, scale_window=1000)
@@ -125,7 +124,7 @@ def do_eval(dataset=None, network=None, use_crf="", with_lstm="", num_class=41,
     model = Model(net_for_pretraining)
 
     if assessment_method == "clue_benchmark":
-        if ms.get_context("device_target") == "CPU":
+        if mindspore.get_context("device_target") == "CPU":
             from src.cluener_evaluation_cpu import submit
         else:
             from src.cluener_evaluation import submit
@@ -201,18 +200,18 @@ def run_ner():
     load_finetune_checkpoint_path = args_opt.load_finetune_checkpoint_path
     target = args_opt.device_target
     if target == "Ascend":
-        context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=args_opt.device_id)
+        mindspore.set_context(mode=0, device_target="Ascend", device_id=args_opt.device_id)
     elif target == "GPU":
-        context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
-        context.set_context(enable_graph_kernel=True)
+        mindspore.set_context(mode=0, device_target="GPU")
+        mindspore.set_context(enable_graph_kernel=True)
         if bert_net_cfg.compute_type != mstype.float32:
             logger.warning('GPU only support fp32 temporarily, run with fp32.')
             bert_net_cfg.compute_type = mstype.float32
     elif target == "CPU":
         if args_opt.use_pynative_mode:
-            context.set_context(mode=context.PYNATIVE_MODE, device_target="CPU", device_id=args_opt.device_id)
+            mindspore.set_context(mode=1, device_target="CPU", device_id=args_opt.device_id)
         else:
-            context.set_context(mode=context.GRAPH_MODE, device_target="CPU", device_id=args_opt.device_id)
+            mindspore.set_context(mode=0, device_target="CPU", device_id=args_opt.device_id)
     else:
         raise Exception("Target error, CPU or GPU or Ascend is supported.")
     label_list = []
diff --git a/official/nlp/Bert/run_pretrain.py b/official/nlp/Bert/run_pretrain.py
index ce0709b27..a2971b504 100644
--- a/official/nlp/Bert/run_pretrain.py
+++ b/official/nlp/Bert/run_pretrain.py
@@ -17,13 +17,12 @@ python run_pretrain.py
 """
 import os
-import mindspore as ms
+import mindspore
 import mindspore.communication.management as D
 from mindspore.communication.management import get_rank
 import mindspore.common.dtype as mstype
-from mindspore import context
 from mindspore.train.model import Model
-from mindspore.context import ParallelMode
+from mindspore import ParallelMode
 from mindspore.nn.wrap.loss_scale import DynamicLossScaleUpdateCell
 from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor
 from mindspore.train.serialization import load_checkpoint, load_param_into_net
@@ -50,25 +49,25 @@ _current_dir = os.path.dirname(os.path.realpath(__file__))
 
 def _set_bert_all_reduce_split():
     """set bert all_reduce fusion split, support num_hidden_layers is 12 and 24."""
-    device_target = context.get_context('device_target')
-    enable_graph_kernel = context.get_context('enable_graph_kernel')
-    device_num = context.get_auto_parallel_context('device_num')
+    device_target = mindspore.get_context('device_target')
+    enable_graph_kernel = mindspore.get_context('enable_graph_kernel')
+    device_num = mindspore.get_auto_parallel_context('device_num')
     if bert_net_cfg.num_hidden_layers == 12:
         if bert_net_cfg.use_relative_positions:
-            context.set_auto_parallel_context(all_reduce_fusion_config=[29, 58, 87, 116, 145, 174, 203, 217])
+            mindspore.set_auto_parallel_context(all_reduce_fusion_config=[29, 58, 87, 116, 145, 174, 203, 217])
         else:
-            context.set_auto_parallel_context(all_reduce_fusion_config=[28, 55, 82, 109, 136, 163, 190, 205])
+            mindspore.set_auto_parallel_context(all_reduce_fusion_config=[28, 55, 82, 109, 136, 163, 190, 205])
             if device_target == 'GPU' and enable_graph_kernel and device_num == 8:
-                context.set_auto_parallel_context(all_reduce_fusion_config=[180, 205])
+                mindspore.set_auto_parallel_context(all_reduce_fusion_config=[180, 205])
             elif device_target == 'GPU' and enable_graph_kernel and device_num == 16:
-                context.set_auto_parallel_context(all_reduce_fusion_config=[120, 205])
+                mindspore.set_auto_parallel_context(all_reduce_fusion_config=[120, 205])
     elif bert_net_cfg.num_hidden_layers == 24:
         if bert_net_cfg.use_relative_positions:
-            context.set_auto_parallel_context(all_reduce_fusion_config=[30, 90, 150, 210, 270, 330, 390, 421])
+            mindspore.set_auto_parallel_context(all_reduce_fusion_config=[30, 90, 150, 210, 270, 330, 390, 421])
         else:
-            context.set_auto_parallel_context(all_reduce_fusion_config=[38, 93, 148, 203, 258, 313, 368, 397])
+            mindspore.set_auto_parallel_context(all_reduce_fusion_config=[38, 93, 148, 203, 258, 313, 368, 397])
             if device_target == 'Ascend' and enable_graph_kernel and device_num == 8:
-                context.set_auto_parallel_context(all_reduce_fusion_config=[
+                mindspore.set_auto_parallel_context(all_reduce_fusion_config=[
                     0, 1, 2, 3, 4, 5, 10, 15, 20, 25, 30, 35, 40, 50, 70, 93, 148, 203, 258, 313, 368, 397])
@@ -105,7 +104,7 @@ def _get_optimizer(args_opt, network):
                     {'order_params': params}]
     if args_opt.enable_lossscale == "true" and args_opt.device_target == 'GPU':
         optimizer = AdamWeightDecayForBert(group_params, learning_rate=lr_schedule, eps=cfg.AdamWeightDecay.eps)
-    elif context.get_context("mode") == context.PYNATIVE_MODE and args_opt.device_target == 'GPU':
+    elif mindspore.get_context("mode") == 1 and args_opt.device_target == 'GPU':
         optimizer = AdamWeightDecayOp(group_params, learning_rate=lr_schedule, eps=cfg.AdamWeightDecay.eps)
     else:
         optimizer = AdamWeightDecay(group_params, learning_rate=lr_schedule, eps=cfg.AdamWeightDecay.eps)
@@ -133,12 +132,12 @@ def _set_graph_kernel_context(device_target):
     """Add suitable graph kernel context for different configs."""
     if device_target == 'GPU':
         if cfg.bert_network == 'base':
-            context.set_context(enable_graph_kernel=True,
+            mindspore.set_context(enable_graph_kernel=True,
                                 graph_kernel_flags="--enable_stitch_fusion=true "
                                                    "--enable_parallel_fusion=true "
                                                    "--enable_cluster_ops=BatchMatMul")
         else:
-            context.set_context(enable_graph_kernel=True)
+            mindspore.set_context(enable_graph_kernel=True)
     else:
         logger.warning('Graph kernel only supports GPU back-end now, run with graph kernel off.')
@@ -162,11 +161,11 @@ def modelarts_pre_process():
 
 
 def set_ascend_max_device_memory(config):
-    is_ascend910b_ge = ms.get_context("enable_ge") and ms.get_context("mode") == ms.GRAPH_MODE and \
+    is_ascend910b_ge = mindspore.get_context("enable_ge") and mindspore.get_context("mode") == 0 and \
                        MSContext.get_instance().get_ascend_soc_version() != 'ascend910'
     if is_ascend910b_ge and hasattr(config, "max_device_memory"):
         logger.warning("When encountering a memory shortage situation in 1980B, reduce the max_device_memory.")
-        ms.set_context(max_device_memory=config.max_device_memory)
+        mindspore.set_context(max_device_memory=config.max_device_memory)
 
 
 def InitNetWithGrads(net_with_loss, optimizer):
@@ -206,8 +205,8 @@ def InitNetWithGrads(net_with_loss, optimizer):
 @moxing_wrapper(pre_process=modelarts_pre_process)
 def run_pretrain():
     """pre-train bert_clue"""
-    context.set_context(mode=context.GRAPH_MODE, device_target=cfg.device_target, device_id=cfg.device_id)
-    context.set_context(reserve_class_name_in_scope=False)
+    mindspore.set_context(mode=0, device_target=cfg.device_target, device_id=cfg.device_id)
+    mindspore.set_context(reserve_class_name_in_scope=False)
     _set_graph_kernel_context(cfg.device_target)
     ckpt_save_dir = cfg.save_checkpoint_path
     rank = 0
@@ -224,8 +223,8 @@ def run_pretrain():
         rank = D.get_rank()
         ckpt_save_dir = os.path.join(cfg.save_checkpoint_path, 'ckpt_' + str(get_rank()))
 
-        context.reset_auto_parallel_context()
-        context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True,
+        mindspore.reset_auto_parallel_context()
+        mindspore.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True,
                                           device_num=device_num)
         _set_bert_all_reduce_split()
diff --git a/official/nlp/Bert/run_squad.py b/official/nlp/Bert/run_squad.py
index 47f96eff2..1a73673e2 100644
--- a/official/nlp/Bert/run_squad.py
+++ b/official/nlp/Bert/run_squad.py
@@ -18,8 +18,8 @@ Bert finetune and evaluation script.
 '''
 import os
 import collections
+import mindspore
 import mindspore.common.dtype as mstype
-from mindspore import context
 from mindspore import log as logger
 from mindspore.nn.wrap.loss_scale import DynamicLossScaleUpdateCell
 from mindspore.nn.optim import AdamWeightDecay, Lamb, Momentum
@@ -158,18 +158,18 @@ def run_squad():
     load_finetune_checkpoint_path = args_opt.load_finetune_checkpoint_path
     target = args_opt.device_target
     if target == "Ascend":
-        context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=args_opt.device_id)
+        mindspore.set_context(mode=0, device_target="Ascend", device_id=args_opt.device_id)
     elif target == "GPU":
-        context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
-        context.set_context(enable_graph_kernel=True)
+        mindspore.set_context(mode=0, device_target="GPU")
+        mindspore.set_context(enable_graph_kernel=True)
         if bert_net_cfg.compute_type != mstype.float32:
             logger.warning('GPU only support fp32 temporarily, run with fp32.')
             bert_net_cfg.compute_type = mstype.float32
     elif target == "CPU":
         if args_opt.use_pynative_mode:
-            context.set_context(mode=context.PYNATIVE_MODE, device_target="CPU", device_id=args_opt.device_id)
+            mindspore.set_context(mode=1, device_target="CPU", device_id=args_opt.device_id)
         else:
-            context.set_context(mode=context.GRAPH_MODE, device_target="CPU", device_id=args_opt.device_id)
+            mindspore.set_context(mode=0, device_target="CPU", device_id=args_opt.device_id)
     else:
         raise Exception("Target error, CPU or GPU or Ascend is supported.")
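Editorial note: the fine-tune scripts above all wrap the network with `DynamicLossScaleUpdateCell(loss_scale_value=2**32, scale_factor=2, scale_window=1000)`. A hedged sketch of what those arguments mean (starting scale 2^32; divided by `scale_factor` on overflow, multiplied back after `scale_window` clean steps):

```python
from mindspore.nn.wrap.loss_scale import DynamicLossScaleUpdateCell

# Same constructor arguments as the hunks above; get_loss_scale() returns
# the current dynamic loss-scale value, which starts at 2**32 here.
update_cell = DynamicLossScaleUpdateCell(loss_scale_value=2**32, scale_factor=2, scale_window=1000)
print(update_cell.get_loss_scale())  # 4294967296.0
```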
diff --git a/official/nlp/Bert/src/bert_for_pre_training.py b/official/nlp/Bert/src/bert_for_pre_training.py
index e45d95e65..0a7725d88 100644
--- a/official/nlp/Bert/src/bert_for_pre_training.py
+++ b/official/nlp/Bert/src/bert_for_pre_training.py
@@ -15,6 +15,7 @@
 """Bert for pretraining."""
 import numpy as np
+import mindspore
 import mindspore.nn as nn
 from mindspore.common.initializer import initializer, TruncatedNormal
 from mindspore.ops import operations as P
@@ -25,9 +26,9 @@ from mindspore.common.parameter import Parameter
 from mindspore.common.api import jit
 from mindspore.common import dtype as mstype
 from mindspore.nn.wrap.grad_reducer import DistributedGradReducer
-from mindspore.context import ParallelMode
+from mindspore import ParallelMode
 from mindspore.communication.management import get_group_size
-from mindspore import context, amp, ops
+from mindspore import amp, ops
 from mindspore._c_expression import MSContext
 
 from .bert_model import BertModel
@@ -571,7 +572,7 @@ class BertTrainAccumulationAllReducePostWithLossScaleCell(nn.Cell):
         self.grad = C.GradOperation(get_by_list=True,
                                     sens_param=True)
         self.reducer_flag = False
-        self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
+        self.parallel_mode = mindspore.get_auto_parallel_context("parallel_mode")
         if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
             self.reducer_flag = True
         self.grad_reducer = F.identity
@@ -703,7 +704,7 @@ class BertTrainAccumulationAllReduceEachWithLossScaleCell(nn.Cell):
         self.grad = C.GradOperation(get_by_list=True,
                                     sens_param=True)
         self.reducer_flag = False
-        self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
+        self.parallel_mode = mindspore.get_auto_parallel_context("parallel_mode")
         if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
             self.reducer_flag = True
         self.grad_reducer = F.identity
@@ -883,7 +884,7 @@ class BertPretrainEval(nn.Cell):
         self.cast = P.Cast()
         self.allreduce = P.AllReduce()
         self.reduce_flag = False
-        parallel_mode = context.get_auto_parallel_context("parallel_mode")
+        parallel_mode = mindspore.get_auto_parallel_context("parallel_mode")
         if parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
             self.reduce_flag = True
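Editorial note: the two-line substitution above (`from mindspore.context import ParallelMode` becoming `from mindspore import ParallelMode`, and all `context.*` calls moving to the `mindspore` top-level namespace) recurs through the rest of this patch. A minimal before/after sketch, assuming MindSpore 2.x re-exports these names at the top level as the patch does:

```python
# Before (1.x style, removed by this patch):
#   from mindspore import context
#   from mindspore.context import ParallelMode
#   mode = context.get_auto_parallel_context("parallel_mode")
# After (2.x style used throughout this patch):
import mindspore
from mindspore import ParallelMode

mode = mindspore.get_auto_parallel_context("parallel_mode")
is_distributed = mode in (ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL)
```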
diff --git a/official/nlp/Bert/src/finetune_eval_model.py b/official/nlp/Bert/src/finetune_eval_model.py
index 7b0f62622..5690fbbd1 100644
--- a/official/nlp/Bert/src/finetune_eval_model.py
+++ b/official/nlp/Bert/src/finetune_eval_model.py
@@ -16,10 +16,10 @@
 '''
 Bert finetune and evaluation model script.
 '''
+import mindspore
 import mindspore.nn as nn
 from mindspore.common.initializer import TruncatedNormal
 from mindspore.ops import operations as P
-from mindspore import context
 
 from .bert_model import BertModel
 
@@ -76,7 +76,7 @@ class BertSquadModel(nn.Cell):
         self.dtype = config.dtype
         self.log_softmax = P.LogSoftmax(axis=1)
         self.is_training = is_training
-        self.gpu_target = context.get_context("device_target") == "GPU"
+        self.gpu_target = mindspore.get_context("device_target") == "GPU"
         self.cast = P.Cast()
         self.reshape = P.Reshape()
         self.transpose = P.Transpose()
diff --git a/official/nlp/Bert/src/model_utils/moxing_adapter.py b/official/nlp/Bert/src/model_utils/moxing_adapter.py
index 09cb0f0cf..a6d8a3fce 100644
--- a/official/nlp/Bert/src/model_utils/moxing_adapter.py
+++ b/official/nlp/Bert/src/model_utils/moxing_adapter.py
@@ -17,7 +17,7 @@
 
 import os
 import functools
-from mindspore import context
+import mindspore
 from mindspore.profiler import Profiler
 from src.model_utils.config import config
 
@@ -94,7 +94,7 @@ def moxing_wrapper(pre_process=None, post_process=None):
                     sync_data(config.train_url, config.output_path)
                     print("Workspace downloaded: ", os.listdir(config.output_path))
 
-                context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id())))
+                mindspore.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id())))
                 config.device_num = get_device_num()
                 config.device_id = get_device_id()
                 if not os.path.exists(config.output_path):
diff --git a/official/nlp/Bert_thor/README.md b/official/nlp/Bert_thor/README.md
index cd9178cd3..1280c635f 100644
--- a/official/nlp/Bert_thor/README.md
+++ b/official/nlp/Bert_thor/README.md
@@ -146,7 +146,8 @@ We need five parameters for this scripts.
 - `SCHEMA_DIR`:Schema path, it is better to use absolute path
 - `RANK_TABLE_FILE`: rank table file with JSON format
 
-Training result will be stored in the current path, whose folder name begins with the file name that the user defines. Under this, you can find checkpoint file together with result like the followings in log.
+Training result will be stored in the current path, in a folder whose name begins with the file name that the user defines. Under it, you can find the checkpoint files together with results like the following
+in the log.
@@ -192,7 +193,8 @@ We need two parameters in evaluation_config.py for this scripts.
 
 > checkpoint can be produced in training process.
 
-Inference result will be stored in the example path, you can find result like the followings in log.
+Inference result will be stored in the example path, where you can find results like the following
+in the log.
 
 ```shell
 step: 1000 Accuracy: [0.27491578]
diff --git a/official/nlp/Bert_thor/pretrain_eval.py b/official/nlp/Bert_thor/pretrain_eval.py
index a4f824d73..5d2bcc97f 100644
--- a/official/nlp/Bert_thor/pretrain_eval.py
+++ b/official/nlp/Bert_thor/pretrain_eval.py
@@ -22,11 +22,11 @@ import os
 
 from src import BertModel, GetMaskedLMOutput
 from src.evaluation_config import cfg, bert_net_cfg
+import mindspore
 import mindspore.common.dtype as mstype
 import mindspore.dataset as de
 import mindspore.dataset.transforms as C
 import mindspore.nn as nn
-from mindspore import context
 from mindspore.common.parameter import Parameter
 from mindspore.common.tensor import Tensor
 from mindspore.nn.metrics import Metric
@@ -135,7 +135,7 @@ def bert_predict():
     Predict function
     '''
     devid = int(os.getenv('DEVICE_ID'))
-    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=devid)
+    mindspore.set_context(mode=0, device_target="Ascend", device_id=devid)
     dataset = get_enwiki_512_dataset(cfg.batch_size, 1)
     net_for_pretraining = BertPretrainEva(bert_net_cfg)
     net_for_pretraining.set_train(False)
diff --git a/official/nlp/Bert_thor/run_pretrain.py b/official/nlp/Bert_thor/run_pretrain.py
index a91ce1fe1..34f43e74d 100644
--- a/official/nlp/Bert_thor/run_pretrain.py
+++ b/official/nlp/Bert_thor/run_pretrain.py
@@ -19,13 +19,13 @@ python run_pretrain.py
 import argparse
 import os
+import mindspore
 import mindspore.common.dtype as mstype
 import mindspore.communication.management as D
-from mindspore import context
 from mindspore import log as logger
 from mindspore.nn.wrap.loss_scale import DynamicLossScaleUpdateCell
 from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor
-from mindspore.context import ParallelMode
+from mindspore import ParallelMode
 from mindspore.train.serialization import load_checkpoint, load_param_into_net
 from mindspore.common import set_seed
 from mindspore.train.model import Model
@@ -113,18 +113,18 @@ def run_pretrain():
     parser.add_argument("--schema_dir", type=str, default="", help="Schema path, it is better to use absolute path")
 
     args_opt = parser.parse_args()
-    context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.device_target,
+    mindspore.set_context(mode=0, device_target=args_opt.device_target,
                         device_id=args_opt.device_id, save_graphs=False)
-    context.set_context(reserve_class_name_in_scope=False)
+    mindspore.set_context(reserve_class_name_in_scope=False)
     ckpt_save_dir = args_opt.save_checkpoint_path
     if args_opt.distribute == "true":
         D.init()
         device_num = D.get_group_size()
         rank = D.get_rank()
         ckpt_save_dir = args_opt.save_checkpoint_path + 'ckpt_' + str(rank) + '/'
-        context.reset_auto_parallel_context()
+        mindspore.reset_auto_parallel_context()
         _set_bert_all_reduce_split()
-        context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True,
+        mindspore.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True,
                                           device_num=device_num)
 
     else:
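Editorial note: the distributed setup the Bert_thor hunk above migrates follows a fixed pattern. A hedged sketch of that pattern in the new API (all calls appear in the patch; the snippet is a minimal composite, not a verbatim excerpt):

```python
import mindspore
import mindspore.communication.management as D
from mindspore import ParallelMode

D.init()                          # initialize the HCCL/NCCL process group
device_num = D.get_group_size()
mindspore.reset_auto_parallel_context()
mindspore.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL,
                                    gradients_mean=True, device_num=device_num)
```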
diff --git a/official/nlp/Bert_thor/src/bert_for_pre_training.py b/official/nlp/Bert_thor/src/bert_for_pre_training.py
index ee9c81a67..07d0b1165 100644
--- a/official/nlp/Bert_thor/src/bert_for_pre_training.py
+++ b/official/nlp/Bert_thor/src/bert_for_pre_training.py
@@ -15,6 +15,7 @@
 """Bert for pretraining."""
 import numpy as np
+import mindspore
 import mindspore.nn as nn
 from mindspore import amp, ops
 from mindspore.common.initializer import initializer, TruncatedNormal
@@ -25,9 +26,8 @@ from mindspore.common.tensor import Tensor
 from mindspore.common.parameter import Parameter
 from mindspore.common import dtype as mstype
 from mindspore.nn.wrap.grad_reducer import DistributedGradReducer
-from mindspore.context import ParallelMode
+from mindspore import ParallelMode
 from mindspore.communication.management import get_group_size
-from mindspore import context
 from .bert_model import BertModel
 
 GRADIENT_CLIP_TYPE = 1
@@ -546,7 +546,7 @@ class BertTrainAccumulationAllReducePostWithLossScaleCell(nn.Cell):
         self.grad = C.GradOperation(get_by_list=True,
                                     sens_param=True)
         self.reducer_flag = False
-        self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
+        self.parallel_mode = mindspore.get_auto_parallel_context("parallel_mode")
         if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
             self.reducer_flag = True
         self.grad_reducer = F.identity
@@ -678,7 +678,7 @@ class BertTrainAccumulationAllReduceEachWithLossScaleCell(nn.Cell):
         self.grad = C.GradOperation(get_by_list=True,
                                     sens_param=True)
         self.reducer_flag = False
-        self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
+        self.parallel_mode = mindspore.get_auto_parallel_context("parallel_mode")
         if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
             self.reducer_flag = True
         self.grad_reducer = F.identity
diff --git a/official/nlp/GPT/eval.py b/official/nlp/GPT/eval.py
index a66971c32..fb5ef805c 100644
--- a/official/nlp/GPT/eval.py
+++ b/official/nlp/GPT/eval.py
@@ -20,7 +20,7 @@ GPT evaluation script.
 import math
 import argparse
 import numpy as np
-from mindspore import context
+import mindspore
 import mindspore.common.dtype as mstype
 from mindspore.common.tensor import Tensor
 from mindspore.nn.transformer.loss import CrossEntropyLoss
@@ -31,7 +31,7 @@ from src.dataset import create_dataset
 from src.gpt import GPT, EvalNet, GPTWithLoss
 from src.utils import GPTConfig
 
-context.set_context(mode=context.GRAPH_MODE)
+mindspore.set_context(mode=0)
 
 def ppl_score(probs, length, is_logsoftmax=True):
     """ calculate perplexity with prob or log_prob inputs """
diff --git a/official/nlp/GPT/src/gpt_wrapcell.py b/official/nlp/GPT/src/gpt_wrapcell.py
index b8da50b79..11eb6cb65 100644
--- a/official/nlp/GPT/src/gpt_wrapcell.py
+++ b/official/nlp/GPT/src/gpt_wrapcell.py
@@ -14,13 +14,13 @@
 # ============================================================================
 """GPT training wrapper"""
-
+import mindspore
 import mindspore.nn as nn
 from mindspore.ops import operations as P
 from mindspore.ops import composite as C
 from mindspore.ops import functional as F
-from mindspore import context, amp, ops
-from mindspore.context import ParallelMode
+from mindspore import amp, ops
+from mindspore import ParallelMode
 from mindspore.nn.wrap.grad_reducer import DistributedGradReducer
 from mindspore.communication.management import get_group_size
 from mindspore.common.tensor import Tensor
@@ -85,7 +85,7 @@ class GPTTrainOneStepWithLossScaleCell(nn.Cell):
                                     sens_param=True)
         self.reducer_flag = False
         self.allreduce = P.AllReduce()
-        self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
+        self.parallel_mode = mindspore.get_auto_parallel_context("parallel_mode")
         if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
             self.reducer_flag = True
         self.grad_reducer = F.identity
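Editorial note: GPT's `eval.py` above computes perplexity from token probabilities in `ppl_score`. A worked illustration of the quantity involved (not the verbatim implementation, whose body is not shown in this patch):

```python
import math

def perplexity(log_probs):
    # PPL = exp(-(1/N) * sum(log p_i)) over N token log-probabilities.
    return math.exp(-sum(log_probs) / len(log_probs))

# Uniform probability over 4 tokens gives a perplexity of exactly 4.
print(perplexity([math.log(0.25)] * 4))  # 4.0
```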
diff --git a/official/nlp/GPT/train.py b/official/nlp/GPT/train.py
index cf8097873..f22c514e1 100644
--- a/official/nlp/GPT/train.py
+++ b/official/nlp/GPT/train.py
@@ -20,10 +20,10 @@ GPT train script
 import os
 import argparse
-from mindspore import context
+import mindspore
 from mindspore.train.model import Model
 import mindspore.communication.management as D
-from mindspore.context import ParallelMode
+from mindspore import ParallelMode
 import mindspore.nn as nn
 from mindspore.train.callback import TimeMonitor, LossMonitor, ModelCheckpoint, CheckpointConfig
 from mindspore.nn.wrap.loss_scale import DynamicLossScaleUpdateCell
@@ -56,15 +56,15 @@ def run_train():
     args_opt = parser.parse_args()
     device_id = int(os.getenv("DEVICE_ID", '0'))
-    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=device_id)
+    mindspore.set_context(mode=0, device_target="Ascend", device_id=device_id)
     if args_opt.distribute == "true":
         D.init()
         device_num = args_opt.device_num
         rank = device_id % device_num
         print("device_id is {}, rank_id is {}".format(device_id, rank))
-        context.reset_auto_parallel_context()
-        context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True,
+        mindspore.reset_auto_parallel_context()
+        mindspore.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True,
                                           device_num=device_num)
 
     else:
diff --git a/official/nlp/LSTM/eval.py b/official/nlp/LSTM/eval.py
index b0866c70f..b5422a761 100644
--- a/official/nlp/LSTM/eval.py
+++ b/official/nlp/LSTM/eval.py
@@ -22,7 +22,8 @@ from src.model_utils.config import config
 from src.model_utils.moxing_adapter import moxing_wrapper
 from src.dataset import lstm_create_dataset, convert_to_mindrecord
 from src.lstm import SentimentNet
-from mindspore import Tensor, nn, Model, context
+import mindspore
+from mindspore import Tensor, nn, Model
 from mindspore.nn import Accuracy, Recall, F1
 from mindspore.train.serialization import load_checkpoint, load_param_into_net
 
@@ -34,8 +35,8 @@ def eval_lstm():
     """ eval lstm """
     print('\neval.py config: \n', config)
-    context.set_context(
-        mode=context.GRAPH_MODE,
+    mindspore.set_context(
+        mode=0,
         save_graphs=False,
         device_target=config.device_target)
 
diff --git a/official/nlp/LSTM/export.py b/official/nlp/LSTM/export.py
index 8cd6128d8..c29e1bb16 100644
--- a/official/nlp/LSTM/export.py
+++ b/official/nlp/LSTM/export.py
@@ -19,7 +19,8 @@ python export.py
 """
 import os
 import numpy as np
-from mindspore import Tensor, context
+import mindspore
+from mindspore import Tensor
 from mindspore import export, load_checkpoint, load_param_into_net
 
 from src.lstm import SentimentNet
@@ -33,8 +34,8 @@ def modelarts_process():
 @moxing_wrapper(pre_process=modelarts_process)
 def export_lstm():
     """ export lstm """
-    context.set_context(
-        mode=context.GRAPH_MODE,
+    mindspore.set_context(
+        mode=0,
         save_graphs=False,
         device_target=config.device_target,
         device_id=get_device_id())
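Editorial note: the LSTM eval/export scripts above restore weights with the `load_checkpoint` / `load_param_into_net` pair. A minimal hedged sketch of that pattern (`nn.Dense` stands in for `SentimentNet`, and the checkpoint path is hypothetical):

```python
import mindspore.nn as nn
from mindspore.train.serialization import load_checkpoint, load_param_into_net

net = nn.Dense(4, 2)                       # stand-in for the real network
param_dict = load_checkpoint("lstm.ckpt")  # hypothetical checkpoint path
load_param_into_net(net, param_dict)       # copies matching parameters in place
```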
diff --git a/official/nlp/LSTM/modelarts/data_process.py b/official/nlp/LSTM/modelarts/data_process.py
index f359a770b..534e6be32 100644
--- a/official/nlp/LSTM/modelarts/data_process.py
+++ b/official/nlp/LSTM/modelarts/data_process.py
@@ -11,8 +11,8 @@ import time
 
 import moxing as mox
 import numpy as np
+import mindspore
 import mindspore.dataset as ds
-from mindspore import context
 
 from src.model_utils.device_adapter import get_device_id, get_device_num, get_rank_id
 
@@ -143,7 +143,7 @@ def download_data():
                 sync_data(config.train_url, config.output_path)
                 print("Workspace downloaded: ", os.listdir(config.output_path))
 
-            context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id())))
+            mindspore.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id())))
             config.device_num = get_device_num()
             config.device_id = get_device_id()
             # create output dir
diff --git a/official/nlp/LSTM/modelarts/train_start.py b/official/nlp/LSTM/modelarts/train_start.py
index 5e271f8ec..a9aa47f4b 100644
--- a/official/nlp/LSTM/modelarts/train_start.py
+++ b/official/nlp/LSTM/modelarts/train_start.py
@@ -26,11 +26,12 @@ import time
 
 import moxing as mox
 import numpy as np
+import mindspore
 import mindspore.nn as nn
-from mindspore import Tensor, context, export
+from mindspore import Tensor, export
 from mindspore.common import set_seed
 from mindspore.communication.management import init, get_rank
-from mindspore.context import ParallelMode
+from mindspore import ParallelMode
 from mindspore.nn.metrics import Accuracy
 from mindspore.profiler import Profiler
 from mindspore.train import Model
@@ -202,7 +203,7 @@ def download_data():
                 sync_data(config.train_url, config.output_path)
                 print("Workspace downloaded: ", os.listdir(config.output_path))
 
-            context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id())))
+            mindspore.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id())))
             config.device_num = get_device_num()
             config.device_id = get_device_id()
             # create output dir
@@ -334,8 +335,8 @@ def train_lstm():
     # set context
     device_target = config.device_target
     _enable_graph_kernel = config.enable_graph_kernel and device_target == "GPU"
-    context.set_context(
-        mode=context.GRAPH_MODE,
+    mindspore.set_context(
+        mode=0,
         save_graphs=False,
         enable_graph_kernel=_enable_graph_kernel,
         graph_kernel_flags="--enable_cluster_ops=MatMul",
@@ -345,18 +346,18 @@ def train_lstm():
     device_num = config.device_num
     rank = 0
     if device_num > 1 or config.distribute:
-        context.reset_auto_parallel_context()
-        context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
+        mindspore.reset_auto_parallel_context()
+        mindspore.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
                                           gradients_mean=True)
 
         if device_target == "Ascend":
-            context.set_context(device_id=get_device_id())
+            mindspore.set_context(device_id=get_device_id())
             init()
             rank = get_rank()
         elif device_target == "GPU":
             init()
     else:
-        context.set_context(device_id=get_device_id())
+        mindspore.set_context(device_id=get_device_id())
 
     # dataset preprocess
     if config.preprocess == 'true':
diff --git a/official/nlp/LSTM/src/model_utils/device_adapter.py b/official/nlp/LSTM/src/model_utils/device_adapter.py
index 7c5d7f837..825c667a2 100644
--- a/official/nlp/LSTM/src/model_utils/device_adapter.py
+++ b/official/nlp/LSTM/src/model_utils/device_adapter.py
@@ -15,6 +15,7 @@
 """Device adapter for ModelArts"""
 
+import mindspore
 from .config import config
 
 if config.enable_modelarts:
diff --git a/official/nlp/LSTM/src/model_utils/moxing_adapter.py b/official/nlp/LSTM/src/model_utils/moxing_adapter.py
index 830d19a6f..9c6d88e5d 100644
--- a/official/nlp/LSTM/src/model_utils/moxing_adapter.py
+++ b/official/nlp/LSTM/src/model_utils/moxing_adapter.py
@@ -17,7 +17,7 @@
 
 import os
 import functools
-from mindspore import context
+import mindspore
 from mindspore.profiler import Profiler
 from .config import config
 
@@ -93,7 +93,7 @@ def moxing_wrapper(pre_process=None, post_process=None):
                     sync_data(config.train_url, config.output_path)
                     print("Workspace downloaded: ", os.listdir(config.output_path))
-                    context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id())))
+                    mindspore.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id())))
                     config.device_num = get_device_num()
                     config.device_id = get_device_id()
                     if not os.path.exists(config.output_path):
diff --git a/official/nlp/LSTM/train.py b/official/nlp/LSTM/train.py
index ca85cb94b..22f0b7215 100644
--- a/official/nlp/LSTM/train.py
+++ b/official/nlp/LSTM/train.py
@@ -26,13 +26,14 @@ from src.eval_callback import EvalCallBack, apply_eval
 from src.lr_schedule import get_lr
 from src.lstm import SentimentNet
 
-from mindspore import Tensor, nn, Model, context
+import mindspore
+from mindspore import Tensor, nn, Model
 from mindspore.common import set_seed
 from mindspore.nn import Accuracy
 from mindspore.train.callback import LossMonitor, CheckpointConfig, ModelCheckpoint, TimeMonitor
 from mindspore.train.serialization import load_param_into_net, load_checkpoint
 from mindspore.communication.management import init, get_rank
-from mindspore.context import ParallelMode
+from mindspore import ParallelMode
 from mindspore.common import JitConfig
 
 set_seed(1)
@@ -48,8 +49,8 @@ def train_lstm():
     print('\ntrain.py config: \n', config)
 
     _enable_graph_kernel = config.enable_graph_kernel == "true" and config.device_target == "GPU"
-    context.set_context(
-        mode=context.GRAPH_MODE,
+    mindspore.set_context(
+        mode=0,
         save_graphs=False,
         enable_graph_kernel=_enable_graph_kernel,
         device_target=config.device_target)
@@ -61,8 +62,8 @@ def train_lstm():
         init()
         device_num = config.device_num  # get_device_num()
         rank = get_rank()
-        context.reset_auto_parallel_context()
-        context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, \
+        mindspore.reset_auto_parallel_context()
+        mindspore.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, \
                                           device_num=device_num)
 
     if config.preprocess == "true":
diff --git a/official/nlp/Pangu_alpha/predict.py b/official/nlp/Pangu_alpha/predict.py
index d8098fa59..a5e6c8091 100644
--- a/official/nlp/Pangu_alpha/predict.py
+++ b/official/nlp/Pangu_alpha/predict.py
@@ -22,11 +22,12 @@ import requests
 import numpy as np
 from tqdm import tqdm
 
+import mindspore
 import mindspore.common.dtype as mstype
 import mindspore.communication.management as D
-from mindspore import context, Tensor
+from mindspore import Tensor
 from mindspore import export
-from mindspore.context import ParallelMode
+from mindspore import ParallelMode
 from mindspore.parallel import set_algo_parameters
 from mindspore.parallel._cost_model_context import _set_multi_subgraphs
 from mindspore.train.model import Model
@@ -51,15 +52,15 @@ def set_auto_parallel_context(args_opt):
     """Set the auto parallel context"""
     rank = 0
     device_num = 1
-    context.reset_auto_parallel_context()
-    context.set_auto_parallel_context(
+    mindspore.reset_auto_parallel_context()
+    mindspore.set_auto_parallel_context(
         strategy_ckpt_load_file=args_opt.strategy_load_ckpt_path)
     if args_opt.distribute == "true":
         D.init()
         device_num = D.get_group_size()
         rank = D.get_rank()
         print("rank_id is {}, device_num is {}".format(rank, device_num))
-        context.set_auto_parallel_context(
+        mindspore.set_auto_parallel_context(
             parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL,
             gradients_mean=False,
             full_batch=True,
@@ -76,10 +77,10 @@ def load_model(args_opt):
     The main function for load model
     """
     # Set execution mode
-    context.set_context(save_graphs=False,
-                        mode=context.GRAPH_MODE,
+    mindspore.set_context(save_graphs=False,
+                          mode=0,
                         device_target=args_opt.device_target)
-    context.set_context(max_device_memory="30GB")
+    mindspore.set_context(max_device_memory="30GB")
     # Set parallel context
     rank, device_num = set_auto_parallel_context(args_opt)
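Editorial note: PanGu-α's `predict.py` above configures inference-side semi-auto parallelism from a pre-saved sharding strategy. A hedged sketch of that pattern (every call appears in the hunk above; the strategy file name is illustrative):

```python
import mindspore
import mindspore.communication.management as D
from mindspore import ParallelMode

D.init()
mindspore.reset_auto_parallel_context()
mindspore.set_auto_parallel_context(parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL,
                                    gradients_mean=False, full_batch=True,
                                    strategy_ckpt_load_file="strategy.ckpt")  # illustrative path
```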
diff --git a/official/nlp/Pangu_alpha/src/callbacks.py b/official/nlp/Pangu_alpha/src/callbacks.py
index 448ef8811..e1a3d99ee 100644
--- a/official/nlp/Pangu_alpha/src/callbacks.py
+++ b/official/nlp/Pangu_alpha/src/callbacks.py
@@ -19,9 +19,9 @@ Callbacks
 import time
 import math
 import numpy as np
+import mindspore
 from mindspore.train.callback import Callback
-from mindspore import context
-from mindspore.context import ParallelMode
+from mindspore import ParallelMode
 from mindspore.communication.management import get_rank
 
 class LossCallBack(Callback):
@@ -80,9 +80,9 @@ class EvalCallBack(Callback):
         self.pplMetric = ppl_metric
         self.has_trained_step = has_trained_step
         self.pplMetric.clear()
-        self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
-        self.strategy_ckpt_save_file = context.get_auto_parallel_context("strategy_ckpt_save_file")
-        self.strategy_ckpt_load_file = context.get_auto_parallel_context("strategy_ckpt_load_file")
+        self.parallel_mode = mindspore.get_auto_parallel_context("parallel_mode")
+        self.strategy_ckpt_save_file = mindspore.get_auto_parallel_context("strategy_ckpt_save_file")
+        self.strategy_ckpt_load_file = mindspore.get_auto_parallel_context("strategy_ckpt_load_file")
 
     def step_end(self, run_context):
         """
@@ -94,7 +94,7 @@ class EvalCallBack(Callback):
             return
         self.pplMetric.clear()
         if self.parallel_mode in (ParallelMode.SEMI_AUTO_PARALLEL, ParallelMode.AUTO_PARALLEL):
-            context.set_auto_parallel_context(strategy_ckpt_save_file="",
+            mindspore.set_auto_parallel_context(strategy_ckpt_save_file="",
                                               strategy_ckpt_load_file=self.strategy_ckpt_save_file)
         rank_id = 0
         if self.parallel_mode in (ParallelMode.SEMI_AUTO_PARALLEL,
@@ -109,5 +109,5 @@ class EvalCallBack(Callback):
         out_str = "{} == Rank: {} == EvalCallBack model.eval(): {}; eval_time: {}s". \
             format(time_str, rank_id, out.values(), eval_time)
         print(out_str)
-        context.set_auto_parallel_context(strategy_ckpt_save_file=self.strategy_ckpt_save_file,
+        mindspore.set_auto_parallel_context(strategy_ckpt_save_file=self.strategy_ckpt_save_file,
                                           strategy_ckpt_load_file=self.strategy_ckpt_load_file)
diff --git a/official/nlp/Pangu_alpha/src/dataset.py b/official/nlp/Pangu_alpha/src/dataset.py
index e624b59b3..021995df1 100644
--- a/official/nlp/Pangu_alpha/src/dataset.py
+++ b/official/nlp/Pangu_alpha/src/dataset.py
@@ -18,11 +18,11 @@ Create dataset for training and evaluating
 import os
 import numpy as np
+import mindspore
 import mindspore.dataset as ds
 import mindspore.dataset.transforms as C
 import mindspore.common.dtype as mstype
-from mindspore import context
 
 def get_input_data_batch_slice_map(input_ids, eod_id, rank, dis, eod_reset):
     """
@@ -90,8 +90,8 @@ def create_dataset(batch_size, data_path, device_num=1, rank=0, drop=True, full_
     # Control the size of data queue in the consideration of the memory
     ds.config.set_prefetch_size(1)
 
-    is_data_parallel = context.get_auto_parallel_context(
-        "parallel_mode") == context.ParallelMode.DATA_PARALLEL
+    is_data_parallel = mindspore.get_auto_parallel_context(
+        "parallel_mode") == mindspore.ParallelMode.DATA_PARALLEL
 
     # Get path for source data files
     home_path = os.path.join(os.getcwd(), data_path)
diff --git a/official/nlp/Pangu_alpha/src/metrics.py b/official/nlp/Pangu_alpha/src/metrics.py
index 4d9e8ca5e..ff2f17a97 100644
--- a/official/nlp/Pangu_alpha/src/metrics.py
+++ b/official/nlp/Pangu_alpha/src/metrics.py
@@ -17,8 +17,8 @@ Eval metrics
 """
 import math
+import mindspore
 from mindspore.nn.metrics import Metric
-from mindspore import context
 from mindspore.communication.management import get_rank, get_group_size
 
 class PPLMetric(Metric):
@@ -30,7 +30,7 @@ class PPLMetric(Metric):
         super(PPLMetric, self).__init__()
         self.clear()
         self.data_length = data_length
-        pipeline_stages = context.get_auto_parallel_context("pipeline_stages")
+        pipeline_stages = mindspore.get_auto_parallel_context("pipeline_stages")
         per_stage_device_num = get_group_size() // pipeline_stages
         stage_id = get_rank() // per_stage_device_num
         self.is_last_stage = (stage_id == pipeline_stages - 1)
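Editorial note: a worked example of the stage arithmetic in `PPLMetric` above. With 8 devices and 2 pipeline stages, ranks 0-3 form stage 0 and ranks 4-7 form stage 1; typically only the last stage produces the language-model output, so only it reports perplexity:

```python
group_size, pipeline_stages, rank = 8, 2, 6        # illustrative values
per_stage_device_num = group_size // pipeline_stages  # 4
stage_id = rank // per_stage_device_num                # 6 // 4 == 1
is_last_stage = stage_id == pipeline_stages - 1        # True
print(per_stage_device_num, stage_id, is_last_stage)
```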
diff --git a/official/nlp/Pangu_alpha/src/pangu_alpha_wrapcell.py b/official/nlp/Pangu_alpha/src/pangu_alpha_wrapcell.py
index 72cdec353..d9379b611 100644
--- a/official/nlp/Pangu_alpha/src/pangu_alpha_wrapcell.py
+++ b/official/nlp/Pangu_alpha/src/pangu_alpha_wrapcell.py
@@ -14,7 +14,7 @@
 # ============================================================================
 """GPT training wrapper"""
 
-import mindspore as ms
+import mindspore
 import mindspore.nn as nn
 from mindspore.ops import operations as P
 from mindspore.ops import composite as C
@@ -22,8 +22,8 @@ from mindspore.ops import functional as F
 from mindspore.common.tensor import Tensor
 import mindspore.common.dtype as mstype
 from mindspore.nn.wrap.loss_scale import TrainOneStepWithLossScaleCell
-from mindspore import context, Parameter
-from mindspore.context import ParallelMode
+from mindspore import Parameter
+from mindspore import ParallelMode
 from mindspore.nn.wrap.grad_reducer import DistributedGradReducer
 from mindspore.communication.management import get_group_size
 from mindspore.ops.operations.math_ops import NPUGetFloatStatusV2, NPUClearFloatStatusV2
@@ -189,7 +189,7 @@ class PanguAlphaTrainPipelineWithLossScaleCell(nn.Cell):
                                     sens_param=True)
         self.reducer_flag = False
         self.allreduce = P.AllReduce()
-        self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
+        self.parallel_mode = mindspore.get_auto_parallel_context("parallel_mode")
         if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
             self.reducer_flag = True
         self.grad_reducer = F.identity
@@ -237,7 +237,7 @@ class PanguAlphaTrainPipelineWithLossScaleCell(nn.Cell):
         else:
             scaling_sens = sens
         # alloc status and clear should be right before gradoperation
-        init = Tensor([0]*8, dtype=ms.int32)
+        init = Tensor([0]*8, dtype=mindspore.int32)
         status_clear = self.clear_before_grad(init)
         scaling_sens = F.depend(scaling_sens, status_clear)
         grads = self.grad(self.network, weights)(input_ids,
diff --git a/official/nlp/Pangu_alpha/src/utils.py b/official/nlp/Pangu_alpha/src/utils.py
index 8b20c0428..60fe0741b 100644
--- a/official/nlp/Pangu_alpha/src/utils.py
+++ b/official/nlp/Pangu_alpha/src/utils.py
@@ -21,8 +21,8 @@ import os
 import time
 import hashlib
 import numpy as np
+import mindspore
 import mindspore.nn as nn
-from mindspore import context
 from mindspore.ops import operations as P
 from mindspore.ops import composite as C
 from mindspore.ops import functional as F
@@ -30,7 +30,7 @@ import mindspore.common.dtype as mstype
 from mindspore.common.tensor import Tensor
 from mindspore.nn.learning_rate_schedule import LearningRateSchedule, PolynomialDecayLR, WarmUpLR, CosineDecayLR
 from mindspore.parallel._auto_parallel_context import auto_parallel_context
-from mindspore.context import ParallelMode
+from mindspore import ParallelMode
 from mindspore.communication.management import get_rank, get_group_size, create_group
 from mindspore.nn import AdamWeightDecay
 from mindspore.common import Parameter, ParameterTuple
@@ -145,8 +145,8 @@ class GlobalNorm(nn.Cell):
     def __init__(self, params, config):
         super(GlobalNorm, self).__init__()
         self.hyper_map = C.HyperMap()
-        self.is_pipeline = context.get_auto_parallel_context("pipeline_stages") > 1
-        self.is_data_parallel = context.get_auto_parallel_context("parallel_mode") == ParallelMode.DATA_PARALLEL
+        self.is_pipeline = mindspore.get_auto_parallel_context("pipeline_stages") > 1
+        self.is_data_parallel = mindspore.get_auto_parallel_context("parallel_mode") == ParallelMode.DATA_PARALLEL
         self.config = config
         self.group_size = 1
         if self.is_data_parallel:
@@ -154,7 +154,7 @@ class GlobalNorm(nn.Cell):
         else:
             self.merge_op = P.AllReduce()
         if self.is_pipeline:
-            if context.get_auto_parallel_context("enable_parallel_optimizer"):
+            if mindspore.get_auto_parallel_context("enable_parallel_optimizer"):
                 self.group_size = get_group_size() // config.parallel_config.pipeline_stage
             else:
                 self.group_size = config.parallel_config.model_parallel
diff --git a/official/nlp/Pangu_alpha/train.py b/official/nlp/Pangu_alpha/train.py
index d672a1cac..356c4a832 100644
--- a/official/nlp/Pangu_alpha/train.py
+++ b/official/nlp/Pangu_alpha/train.py
@@ -21,11 +21,10 @@ import json
 import glob
 import os
 import math
-
-from mindspore import context
+import mindspore
 from mindspore.train.model import Model
 import mindspore.communication.management as D
-from mindspore.context import ParallelMode
+from mindspore import ParallelMode
 import mindspore.nn as nn
 from mindspore.train.callback import TimeMonitor
 from mindspore.nn.wrap.loss_scale import DynamicLossScaleUpdateCell
@@ -107,14 +106,14 @@ def set_parallel_context(args_opt):
     device_num = D.get_group_size()
     rank = D.get_rank()
     print("rank_id is {}, device_num is {}".format(rank, device_num))
-    context.reset_auto_parallel_context()
-    context.set_auto_parallel_context(
+    mindspore.reset_auto_parallel_context()
+    mindspore.set_auto_parallel_context(
         parallel_mode=args_opt.parallel_mode, gradients_mean=False, search_mode=args_opt.search_mode,
         full_batch=bool(args_opt.full_batch), strategy_ckpt_load_file=args_opt.strategy_load_ckpt_path,
         enable_parallel_optimizer=bool(args_opt.optimizer_shard), strategy_ckpt_save_file='strategy.ckpt',
         enable_alltoall=bool(args_opt.enable_alltoall))
     set_algo_parameters(elementwise_op_strategy_follow=True)
-    if context.get_auto_parallel_context("parallel_mode") == ParallelMode.AUTO_PARALLEL:
+    if mindspore.get_auto_parallel_context("parallel_mode") == ParallelMode.AUTO_PARALLEL:
         set_algo_parameters(elementwise_op_strategy_follow=False, fully_use_devices=False)
     _set_multi_subgraphs()
     return rank, device_num
@@ -137,9 +136,9 @@ def cal_model_property(args_opt, device_num):
     model_parallel_num = min(args_opt.op_level_model_parallel_num, device_num)
     data_parallel_num = int(device_num / model_parallel_num)
     batch_size = args_opt.per_batch_size * data_parallel_num
-    if (context.get_auto_parallel_context("parallel_mode") == ParallelMode.DATA_PARALLEL or
-            (context.get_auto_parallel_context("parallel_mode") == ParallelMode.AUTO_PARALLEL and
-             context.get_auto_parallel_context("search_mode") == "recursive_programming")):
+    if (mindspore.get_auto_parallel_context("parallel_mode") == ParallelMode.DATA_PARALLEL or
+            (mindspore.get_auto_parallel_context("parallel_mode") == ParallelMode.AUTO_PARALLEL and
+             mindspore.get_auto_parallel_context("search_mode") == "recursive_programming")):
         batch_size = args_opt.per_batch_size
     return model_parallel_num, data_parallel_num, batch_size
 
@@ -147,16 +146,16 @@ def cal_model_property(args_opt, device_num):
 def run_train(args_opt):
     r"""The main training process."""
     # Set execution mode
-    context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.device_target, max_device_memory="30GB")
+    mindspore.set_context(mode=0, device_target=args_opt.device_target, max_device_memory="30GB")
     # Set parallel context
     rank = 0
     device_num = 1
     if args_opt.distribute == "true":
         rank, device_num = set_parallel_context(args_opt)
-    context.set_context(save_graphs=False, save_graphs_path="./graphs_of_device_id_" + str(rank))
+    mindspore.set_context(save_graphs=False, save_graphs_path="./graphs_of_device_id_" + str(rank))
     if args_opt.parallel_mode == "data_parallel":
         # in avoid of the loop call depth
-        context.set_context(max_call_depth=10000)
+        mindspore.set_context(max_call_depth=10000)
 
     # env variable prepare
     group_info_file = os.getenv("GROUP_INFO_FILE")
@@ -414,22 +413,22 @@ def set_pipeline_parallel_context(args_opt):
     device_num = D.get_group_size()
     rank_id = D.get_rank()
     print("rank_id is {}, device_num is {}".format(rank_id, device_num))
-    context.reset_auto_parallel_context()
-    context.set_auto_parallel_context(
+    mindspore.reset_auto_parallel_context()
+    mindspore.set_auto_parallel_context(
         parallel_mode=args_opt.parallel_mode, gradients_mean=False,
         search_mode=args_opt.search_mode, full_batch=bool(args_opt.full_batch), loss_repeated_mean=True,
         device_num=device_num, enable_parallel_optimizer=bool(args_opt.optimizer_shard),
         pipeline_stages=args_opt.stage_num, enable_alltoall=bool(args_opt.enable_alltoall))
     set_algo_parameters(elementwise_op_strategy_follow=True)
-    if context.get_auto_parallel_context("parallel_mode") == ParallelMode.AUTO_PARALLEL:
+    if mindspore.get_auto_parallel_context("parallel_mode") == ParallelMode.AUTO_PARALLEL:
         set_algo_parameters(elementwise_op_strategy_follow=False, fully_use_devices=False)
     _set_multi_subgraphs()
     return rank_id, device_num
 
 
 def cal_model_property_pipeline(args_opt, device_num):
-    is_auto_parallel = (context.get_auto_parallel_context("parallel_mode") == ParallelMode.AUTO_PARALLEL and
-                        context.get_auto_parallel_context("search_mode") == "recursive_programming")
+    is_auto_parallel = (mindspore.get_auto_parallel_context("parallel_mode") == ParallelMode.AUTO_PARALLEL and
+                        mindspore.get_auto_parallel_context("search_mode") == "recursive_programming")
     # in order to make sure data_parallel_num is always non-zero, set model_parallel_num to 1
     model_parallel_num = 1 if is_auto_parallel else args_opt.op_level_model_parallel_num
     stage_device_num = int(device_num / args_opt.stage_num)
@@ -443,8 +442,8 @@ def cal_model_property_pipeline(args_opt, device_num):
 
 def run_train_pipeline(args_opt):
     r"""The main training process in pipeline."""
-    context.set_context(save_graphs=False, mode=context.GRAPH_MODE, device_target=args_opt.device_target)
-    context.set_context(max_device_memory="30GB")
+    mindspore.set_context(save_graphs=False, mode=0, device_target=args_opt.device_target)
+    mindspore.set_context(max_device_memory="30GB")
     rank_id = 0
     device_num = 1
     if args_opt.distribute == "true":
@@ -500,7 +499,7 @@ def run_train_pipeline(args_opt):
     ds = create_dataset(config.batch_size * parallel_config.micro_batch_num * micro_batch_interleaved,
                         data_path=cache_url, device_num=stage_device_num,
                         rank=rank_id % stage_device_num, eod_reset=True, data_start_index=0,
-                        full_batch=context.get_auto_parallel_context("full_batch"),
+                        full_batch=mindspore.get_auto_parallel_context("full_batch"),
                         column_name=args_opt.data_column_name)
     epoch_num = args_opt.epoch_size
     step_per_epoch = ds.get_dataset_size()
@@ -552,7 +551,7 @@ if __name__ == "__main__":
             raise ValueError("The alltoall communication is only effective when applying moe")
         os.environ['HCCL_CONNECT_TIMEOUT'] = str(opt.hccl_connect_time)
         if opt.atomic_clean_policy == 1:
-            context.set_context(ascend_config={"atomic_clean_policy": 1})
+            mindspore.set_context(ascend_config={"atomic_clean_policy": 1})
         if opt.stage_num > 1:
             run_train_pipeline(opt)
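Editorial note: a worked example of the batch arithmetic in `cal_model_property` above. With 16 devices and op-level model parallelism of 4, the data-parallel dimension is 4 and the global batch is the per-device batch times the data-parallel degree:

```python
device_num, op_level_model_parallel_num, per_batch_size = 16, 4, 8  # illustrative
model_parallel_num = min(op_level_model_parallel_num, device_num)  # 4
data_parallel_num = device_num // model_parallel_num               # 4
batch_size = per_batch_size * data_parallel_num                    # 32
print(model_parallel_num, data_parallel_num, batch_size)
```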
diff --git a/official/nlp/Transformer/eval.py b/official/nlp/Transformer/eval.py
index e3e6f367f..3c695a80e 100644
--- a/official/nlp/Transformer/eval.py
+++ b/official/nlp/Transformer/eval.py
@@ -17,7 +17,7 @@
 import os
 
 import numpy as np
-import mindspore as ms
+import mindspore
 import mindspore.nn as nn
 from mindspore.common.parameter import Parameter
 from mindspore.common.tensor import Tensor
@@ -30,8 +30,8 @@ from src.model_utils.config import config
 from src.model_utils.moxing_adapter import moxing_wrapper
 from src.model_utils.device_adapter import get_device_id
 
-config.dtype = ms.float32
-config.compute_type = ms.float16
+config.dtype = mindspore.float32
+config.compute_type = mindspore.float16
 config.batch_size = config.batch_size_ev
 config.hidden_dropout_prob = config.hidden_dropout_prob_ev
 config.attention_probs_dropout_prob = config.attention_probs_dropout_prob_ev
@@ -45,7 +45,7 @@ def load_test_data(batch_size=1, data_file=None):
                               "target_sos_ids", "target_sos_mask",
                               "target_eos_ids", "target_eos_mask"],
         shuffle=False)
-    type_cast_op = deC.TypeCast(ms.int32)
+    type_cast_op = deC.TypeCast(mindspore.int32)
     data_set = data_set.map(operations=type_cast_op, input_columns="source_eos_ids")
     data_set = data_set.map(operations=type_cast_op, input_columns="source_eos_mask")
     data_set = data_set.map(operations=type_cast_op, input_columns="target_sos_ids")
@@ -81,7 +81,7 @@ def load_weights(model_path):
         ms_ckpt = np.load(model_path)
         is_npz = True
     else:
-        ms_ckpt = ms.load_checkpoint(model_path)
+        ms_ckpt = mindspore.load_checkpoint(model_path)
         is_npz = False
 
     weights = {}
@@ -111,14 +111,14 @@ def run_transformer_eval():
     """
     Transformer evaluation.
     """
-    ms.set_context(mode=ms.GRAPH_MODE, device_target=config.device_target, reserve_class_name_in_scope=False,
+    mindspore.set_context(mode=0, device_target=config.device_target, reserve_class_name_in_scope=False,
                    device_id=get_device_id())
 
     dataset = load_test_data(batch_size=config.batch_size, data_file=config.data_file)
     tfm_model = TransformerModel(config=config, is_training=False, use_one_hot_embeddings=False)
 
     parameter_dict = load_weights(config.model_file)
-    ms.load_param_into_net(tfm_model, parameter_dict)
+    mindspore.load_param_into_net(tfm_model, parameter_dict)
 
     tfm_infer = TransformerInferCell(tfm_model)
     model = Model(tfm_infer)
@@ -129,8 +129,8 @@ def run_transformer_eval():
     for batch in dataset.create_dict_iterator(output_numpy=True, num_epochs=1):
         source_sents.append(batch["source_eos_ids"])
         target_sents.append(batch["target_eos_ids"])
-        source_ids = Tensor(batch["source_eos_ids"], ms.int32)
-        source_mask = Tensor(batch["source_eos_mask"], ms.int32)
+        source_ids = Tensor(batch["source_eos_ids"], mindspore.int32)
+        source_mask = Tensor(batch["source_eos_mask"], mindspore.int32)
         predicted_ids = model.predict(source_ids, source_mask)
         predictions.append(predicted_ids.asnumpy())
diff --git a/official/nlp/Transformer/eval_onnx.py b/official/nlp/Transformer/eval_onnx.py
index dd650fde7..c5639ce3c 100644
--- a/official/nlp/Transformer/eval_onnx.py
+++ b/official/nlp/Transformer/eval_onnx.py
@@ -16,7 +16,7 @@
 
 import os
-import mindspore as ms
+import mindspore
 import onnxruntime as ort
 
 from eval import load_test_data
@@ -79,8 +79,8 @@ def run_transformer_eval():
 
 def main():
     """Main function"""
-    config.dtype = ms.float32
-    config.compute_type = ms.float16
+    config.dtype = mindspore.float32
+    config.compute_type = mindspore.float16
     config.batch_size = config.batch_size_ev
     config.hidden_dropout_prob = config.hidden_dropout_prob_ev
     config.attention_probs_dropout_prob = config.attention_probs_dropout_prob_ev
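Editorial note: `load_test_data` above maps a `TypeCast` transform over the id columns before inference. A hedged sketch of that pattern, assuming the dataset is a MindRecord file read through `MindDataset` (the file name is illustrative):

```python
import mindspore
import mindspore.dataset as de
import mindspore.dataset.transforms as deC

type_cast_op = deC.TypeCast(mindspore.int32)
data_set = de.MindDataset("test.mindrecord",  # illustrative file name
                          columns_list=["source_eos_ids", "source_eos_mask"],
                          shuffle=False)
# Cast each integer id column to int32, as in the hunk above.
data_set = data_set.map(operations=type_cast_op, input_columns="source_eos_ids")
```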
diff --git a/official/nlp/Transformer/export.py b/official/nlp/Transformer/export.py
index f76fea6ed..5cd3e4991 100644
--- a/official/nlp/Transformer/export.py
+++ b/official/nlp/Transformer/export.py
@@ -16,7 +16,7 @@
 
 import numpy as np
-import mindspore as ms
+import mindspore
 from mindspore import Tensor
 
 from src.transformer_model import TransformerModel
@@ -30,9 +30,9 @@ config.batch_size = config.batch_size_ev
 config.hidden_dropout_prob = config.hidden_dropout_prob_ev
 config.attention_probs_dropout_prob = config.attention_probs_dropout_prob_ev
 
-ms.set_context(mode=ms.GRAPH_MODE, device_target=config.device_target)
+mindspore.set_context(mode=0, device_target=config.device_target)
 if config.device_target == "Ascend":
-    ms.set_context(device_id=get_device_id())
+    mindspore.set_context(device_id=get_device_id())
 
 def modelarts_pre_process():
     pass
@@ -43,12 +43,12 @@ def export_transformer():
     tfm_model = TransformerModel(config=config, is_training=False, use_one_hot_embeddings=False)
 
     parameter_dict = load_weights(config.model_file)
-    ms.load_param_into_net(tfm_model, parameter_dict)
+    mindspore.load_param_into_net(tfm_model, parameter_dict)
 
     source_ids = Tensor(np.ones((config.batch_size, config.seq_length)).astype(np.int32))
     source_mask = Tensor(np.ones((config.batch_size, config.seq_length)).astype(np.int32))
 
-    ms.export(tfm_model, source_ids, source_mask, file_name=config.file_name, file_format=config.file_format)
+    mindspore.export(tfm_model, source_ids, source_mask, file_name=config.file_name, file_format=config.file_format)
 
 if __name__ == '__main__':
     export_transformer()
diff --git a/official/nlp/Transformer/mindspore_hub_conf.py b/official/nlp/Transformer/mindspore_hub_conf.py
index cf984efc4..98574ae6e 100644
--- a/official/nlp/Transformer/mindspore_hub_conf.py
+++ b/official/nlp/Transformer/mindspore_hub_conf.py
@@ -17,7 +17,7 @@ Transformer hub interface for transformer large
 '''
 from src.transformer_model import TransformerModel
 from src.transformer_model import TransformerConfig
-import mindspore as ms
+import mindspore
 transformer_net_cfg_large = TransformerConfig(
     batch_size=96,
     seq_length=128,
@@ -32,8 +32,8 @@ transformer_net_cfg_large = TransformerConfig(
     max_position_embeddings=128,
     initializer_range=0.02,
     label_smoothing=0.1,
-    dtype=ms.float32,
-    compute_type=ms.float16
+    dtype=mindspore.float32,
+    compute_type=mindspore.float16
 )
 def create_network(name, *args, **kwargs):
     '''
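Editorial note: `export_transformer` above traces the network with dummy inputs and serializes it via `mindspore.export`. A minimal hedged sketch with a toy network in place of `TransformerModel` (shapes, file name, and the MINDIR format choice are illustrative):

```python
import numpy as np
import mindspore
import mindspore.nn as nn
from mindspore import Tensor

net = nn.Dense(16, 4)                                   # stand-in network
dummy_input = Tensor(np.ones((1, 16)).astype(np.float32))
mindspore.export(net, dummy_input, file_name="toy_net", file_format="MINDIR")
```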
""" if config.device_target == "Ascend": - context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, device_id=get_device_id()) + mindspore.set_context(mode=0, device_target=config.device_target, device_id=get_device_id()) else: - context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target) - context.set_context(reserve_class_name_in_scope=False) + mindspore.set_context(mode=0, device_target=config.device_target) + mindspore.set_context(reserve_class_name_in_scope=False) # Set mempool block size in PYNATIVE_MODE for improving memory utilization, which will not take effect in GRAPH_MODE if config.device_target == "GPU": # Enable graph kernel - context.set_context(enable_graph_kernel=True, graph_kernel_flags="--enable_parallel_fusion") + mindspore.set_context(enable_graph_kernel=True, graph_kernel_flags="--enable_parallel_fusion") if config.distribute == "true": if config.device_target == "Ascend": device_num = config.device_num @@ -156,8 +157,8 @@ def run_transformer_train(): device_num = D.get_group_size() rank = get_rank() config.device_id = rank - context.reset_auto_parallel_context() - context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, + mindspore.reset_auto_parallel_context() + mindspore.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, device_num=device_num) rank_id = config.device_id % device_num save_ckpt_path = os.path.join(config.save_checkpoint_path, 'ckpt_' + str(get_rank()) + '/') diff --git a/official/nlp/Transformer/src/beam_search.py b/official/nlp/Transformer/src/beam_search.py index 09e2f6e9a..7d32e4c08 100644 --- a/official/nlp/Transformer/src/beam_search.py +++ b/official/nlp/Transformer/src/beam_search.py @@ -15,7 +15,7 @@ """Transformer beam search module.""" import numpy as np -import mindspore as ms +import mindspore import mindspore.ops as ops import mindspore.nn as nn from mindspore.common.tensor import Tensor @@ -28,22 +28,22 @@ class LengthPenalty(nn.Cell): Args: weight (float): Weight of length penalty. Default: 1.0. - compute_type (:class:`mindspore.dtype`): Compute type in Transformer. Default: ms.float32. + compute_type (:class:`mindspore.dtype`): Compute type in Transformer. Default: mindspore.float32. """ def __init__(self, weight=1.0, - compute_type=ms.float32): + compute_type=mindspore.float32): super(LengthPenalty, self).__init__() self.weight = weight self.add = ops.Add() self.pow = ops.Pow() self.div = ops.RealDiv() self.cast = ops.Cast() - self.five = Tensor(5.0, ms.float32) - self.six = Tensor(6.0, ms.float32) + self.five = Tensor(5.0, mindspore.float32) + self.six = Tensor(6.0, mindspore.float32) def construct(self, length_tensor): - length_tensor = self.cast(length_tensor, ms.float32) + length_tensor = self.cast(length_tensor, mindspore.float32) output = self.add(length_tensor, self.five) output = self.div(output, self.six) output = self.pow(output, self.weight) @@ -56,11 +56,11 @@ class TileBeam(nn.Cell): Args: beam_width (int): beam width setting. Default: 4. - compute_type (:class:`mindspore.dtype`): Compute type in Transformer. Default: ms.float32. + compute_type (:class:`mindspore.dtype`): Compute type in Transformer. Default: mindspore.float32. """ def __init__(self, beam_width, - compute_type=ms.float32): + compute_type=mindspore.float32): super(TileBeam, self).__init__() self.beam_width = beam_width self.expand = ops.ExpandDims() @@ -89,10 +89,10 @@ class Mod(nn.Cell): Mod function. 
 
     Args:
-        compute_type (:class:`mindspore.dtype`): Compute type in Transformer. Default: ms.float32.
+        compute_type (:class:`mindspore.dtype`): Compute type in Transformer. Default: mindspore.float32.
     """
     def __init__(self,
-                 compute_type=ms.float32):
+                 compute_type=mindspore.float32):
         super(Mod, self).__init__()
         self.compute_type = compute_type
         self.floor_div = ops.FloorDiv()
@@ -120,7 +120,7 @@ class BeamSearchDecoder(nn.Cell):
         max_decode_length (int): max decode length. Default: 128.
         sos_id (int): Id of sequence start token. Default: 1.
         eos_id (int): Id of sequence end token. Default: 2.
-        compute_type (:class:`mindspore.dtype`): Compute type in Transformer. Default: ms.float32.
+        compute_type (:class:`mindspore.dtype`): Compute type in Transformer. Default: mindspore.float32.
     """
     def __init__(self,
                  batch_size,
@@ -132,7 +132,7 @@ class BeamSearchDecoder(nn.Cell):
                  max_decode_length=128,
                  sos_id=1,
                  eos_id=2,
-                 compute_type=ms.float32):
+                 compute_type=mindspore.float32):
         super(BeamSearchDecoder, self).__init__(auto_prefix=False)
         self.seq_length = seq_length
         self.batch_size = batch_size
@@ -148,23 +148,23 @@ class BeamSearchDecoder(nn.Cell):
         self.shape_flat = (-1,)
         self.shape = ops.Shape()
 
-        self.zero_tensor = Tensor(np.zeros([batch_size, beam_width]), ms.float32)
-        self.ninf_tensor = Tensor(np.full([batch_size, beam_width], -INF), ms.float32)
+        self.zero_tensor = Tensor(np.zeros([batch_size, beam_width]), mindspore.float32)
+        self.ninf_tensor = Tensor(np.full([batch_size, beam_width], -INF), mindspore.float32)
 
         self.select = ops.Select()
         self.flat_shape = (batch_size, beam_width * vocab_size)
         self.topk = ops.TopK(sorted=True)
         self.floor_div = ops.FloorDiv()
-        self.vocab_size_tensor = Tensor(self.vocab_size, ms.int32)
+        self.vocab_size_tensor = Tensor(self.vocab_size, mindspore.int32)
         self.real_div = ops.RealDiv()
         self.mod = Mod()
         self.equal = ops.Equal()
-        self.eos_ids = Tensor(np.full([batch_size, beam_width], eos_id), ms.int32)
+        self.eos_ids = Tensor(np.full([batch_size, beam_width], eos_id), mindspore.int32)
 
         beam_ids = np.tile(np.arange(beam_width).reshape((1, beam_width)), [batch_size, 1])
-        self.beam_ids = Tensor(beam_ids, ms.int32)
+        self.beam_ids = Tensor(beam_ids, mindspore.int32)
         batch_ids = np.arange(batch_size*beam_width).reshape((batch_size, beam_width)) // beam_width
-        self.batch_ids = Tensor(batch_ids, ms.int32)
+        self.batch_ids = Tensor(batch_ids, mindspore.int32)
         self.concat = ops.Concat(axis=-1)
         self.gather_nd = ops.GatherNd()
@@ -174,14 +174,14 @@ class BeamSearchDecoder(nn.Cell):
         self.zeroslike = ops.ZerosLike()
 
         # init inputs and states
-        self.start_ids = Tensor(np.full([batch_size * beam_width, 1], sos_id), ms.int32)
-        self.init_seq = Tensor(np.full([batch_size, beam_width, 1], sos_id), ms.int32)
+        self.start_ids = Tensor(np.full([batch_size * beam_width, 1], sos_id), mindspore.int32)
+        self.init_seq = Tensor(np.full([batch_size, beam_width, 1], sos_id), mindspore.int32)
         init_scores = np.tile(np.array([[0.] + [-INF]*(beam_width-1)]), [batch_size, 1])
-        self.init_scores = Tensor(init_scores, ms.float32)
+        self.init_scores = Tensor(init_scores, mindspore.float32)
         self.init_finished = Tensor(np.zeros([batch_size, beam_width], dtype=np.bool_))
         self.init_length = Tensor(np.zeros([batch_size, beam_width], dtype=np.int32))
         self.length_penalty = LengthPenalty(weight=length_penalty_weight)
-        self.one = Tensor(1, ms.int32)
+        self.one = Tensor(1, mindspore.int32)
 
     def one_step(self, cur_input_ids, enc_states, enc_attention_mask, state_log_probs,
                  state_seq, state_finished, state_length):
@@ -207,7 +207,7 @@ class BeamSearchDecoder(nn.Cell):
         beam_indices = self.zeroslike(topk_indices)
         for _ in range(self.beam_width - 1):
             temp = self.sub(temp, self.vocab_size_tensor)
-            res = self.cast(self.greater_equal(temp, 0), ms.int32)
+            res = self.cast(self.greater_equal(temp, 0), mindspore.int32)
             beam_indices = beam_indices + res
         word_indices = topk_indices - beam_indices * self.vocab_size_tensor
         #======================================================================
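Editorial note: the `LengthPenalty` cell above computes the GNMT-style normalizer lp(len) = ((len + 5) / 6) ** weight, so longer beam candidates are penalized more strongly when scores are compared. A worked illustration in plain Python:

```python
def length_penalty(length, weight=1.0):
    # Same arithmetic as LengthPenalty.construct: add 5, divide by 6, raise to weight.
    return ((length + 5.0) / 6.0) ** weight

print(length_penalty(1))   # 1.0: a length-1 sequence is unpenalized
print(length_penalty(13))  # 3.0 with weight=1.0
```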
+ [-INF]*(beam_width-1)]), [batch_size, 1]) - self.init_scores = Tensor(init_scores, ms.float32) + self.init_scores = Tensor(init_scores, mindspore.float32) self.init_finished = Tensor(np.zeros([batch_size, beam_width], dtype=np.bool_)) self.init_length = Tensor(np.zeros([batch_size, beam_width], dtype=np.int32)) self.length_penalty = LengthPenalty(weight=length_penalty_weight) - self.one = Tensor(1, ms.int32) + self.one = Tensor(1, mindspore.int32) def one_step(self, cur_input_ids, enc_states, enc_attention_mask, state_log_probs, state_seq, state_finished, state_length): @@ -207,7 +207,7 @@ class BeamSearchDecoder(nn.Cell): beam_indices = self.zeroslike(topk_indices) for _ in range(self.beam_width - 1): temp = self.sub(temp, self.vocab_size_tensor) - res = self.cast(self.greater_equal(temp, 0), ms.int32) + res = self.cast(self.greater_equal(temp, 0), mindspore.int32) beam_indices = beam_indices + res word_indices = topk_indices - beam_indices * self.vocab_size_tensor #====================================================================== diff --git a/official/nlp/Transformer/src/dataset.py b/official/nlp/Transformer/src/dataset.py index 4728db94e..59300331a 100644 --- a/official/nlp/Transformer/src/dataset.py +++ b/official/nlp/Transformer/src/dataset.py @@ -14,7 +14,7 @@ # ============================================================================ """Data operations, will be used in train.py.""" -import mindspore as ms +import mindspore import mindspore.dataset as de from .model_utils.config import config @@ -33,7 +33,7 @@ def create_transformer_dataset(rank_size=1, rank_id=0, do_shuffle="true", datase "target_sos_ids", "target_sos_mask", "target_eos_ids", "target_eos_mask"], shuffle=(do_shuffle == "true"), num_shards=rank_size, shard_id=rank_id) - type_cast_op = de.transforms.transforms.TypeCast(ms.int32) + type_cast_op = de.transforms.transforms.TypeCast(mindspore.int32) ds = ds.map(operations=type_cast_op, input_columns="source_eos_ids") ds = ds.map(operations=type_cast_op, input_columns="source_eos_mask") ds = ds.map(operations=type_cast_op, input_columns="target_sos_ids") diff --git a/official/nlp/Transformer/src/model_utils/device_adapter.py b/official/nlp/Transformer/src/model_utils/device_adapter.py index 7c5d7f837..825c667a2 100644 --- a/official/nlp/Transformer/src/model_utils/device_adapter.py +++ b/official/nlp/Transformer/src/model_utils/device_adapter.py @@ -15,6 +15,7 @@ """Device adapter for ModelArts""" +import mindspore from .config import config if config.enable_modelarts: diff --git a/official/nlp/Transformer/src/model_utils/moxing_adapter.py b/official/nlp/Transformer/src/model_utils/moxing_adapter.py index a35a25900..636fc02c1 100644 --- a/official/nlp/Transformer/src/model_utils/moxing_adapter.py +++ b/official/nlp/Transformer/src/model_utils/moxing_adapter.py @@ -17,7 +17,7 @@ import os import functools -import mindspore as ms +import mindspore from mindspore.profiler import Profiler from .config import config @@ -93,7 +93,7 @@ def moxing_wrapper(pre_process=None, post_process=None): sync_data(config.train_url, config.output_path) print("Workspace downloaded: ", os.listdir(config.output_path)) - ms.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) + mindspore.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) config.device_num = get_device_num() config.device_id = get_device_id() if not os.path.exists(config.output_path): diff --git
a/official/nlp/Transformer/src/transformer_for_train.py b/official/nlp/Transformer/src/transformer_for_train.py index a3019301f..6647350aa 100644 --- a/official/nlp/Transformer/src/transformer_for_train.py +++ b/official/nlp/Transformer/src/transformer_for_train.py @@ -18,14 +18,14 @@ from mindspore import jit from mindspore.common.initializer import initializer from mindspore import amp -import mindspore as ms +import mindspore import mindspore.ops as ops import mindspore.nn as nn from mindspore.common.tensor import Tensor from mindspore.common.parameter import Parameter from mindspore.nn.wrap.grad_reducer import DistributedGradReducer from mindspore.communication.management import get_group_size -from mindspore.context import ParallelMode +from mindspore import ParallelMode from .transformer_model import TransformerModel @@ -73,8 +73,8 @@ class TransformerTrainingLoss(nn.Cell): super(TransformerTrainingLoss, self).__init__(auto_prefix=False) self.vocab_size = config.vocab_size self.onehot = ops.OneHot() - self.on_value = Tensor(float(1 - config.label_smoothing), ms.float32) - self.off_value = Tensor(config.label_smoothing / float(self.vocab_size - 1), ms.float32) + self.on_value = Tensor(float(1 - config.label_smoothing), mindspore.float32) + self.off_value = Tensor(config.label_smoothing / float(self.vocab_size - 1), mindspore.float32) self.reduce_sum = ops.ReduceSum() self.reduce_mean = ops.ReduceMean() self.reshape = ops.Reshape() @@ -88,13 +88,13 @@ class TransformerTrainingLoss(nn.Cell): """Defines the computation performed.""" flat_shape = (self.batch_size * seq_length,) label_ids = self.reshape(label_ids, flat_shape) - label_weights = self.cast(self.reshape(label_weights, flat_shape), ms.float32) + label_weights = self.cast(self.reshape(label_weights, flat_shape), mindspore.float32) one_hot_labels = self.onehot(label_ids, self.vocab_size, self.on_value, self.off_value) per_example_loss = self.neg(self.reduce_sum(prediction_scores * one_hot_labels, self.last_idx)) numerator = self.reduce_sum(label_weights * per_example_loss, ()) denominator = self.reduce_sum(label_weights, ()) + \ - self.cast(ops.tuple_to_array((1e-5,)), ms.float32) + self.cast(ops.tuple_to_array((1e-5,)), mindspore.float32) loss = numerator / denominator return loss @@ -129,7 +129,7 @@ class TransformerNetworkWithLoss(nn.Cell): prediction_scores = self.transformer(source_ids, source_mask, target_ids, target_mask) seq_length = self.shape(source_ids)[1] total_loss = self.loss(prediction_scores, label_ids, label_weights, seq_length) - return self.cast(total_loss, ms.float32) + return self.cast(total_loss, mindspore.float32) class TransformerTrainOneStepCell(nn.TrainOneStepCell): @@ -188,7 +188,7 @@ class TransformerTrainOneStepCell(nn.TrainOneStepCell): label_ids, label_weights, self.cast(ops.tuple_to_array((self.sens,)), - ms.float32)) + mindspore.float32)) grads = self.clip_grads(grads) # apply grad reducer on grads grads = self.grad_reducer(grads) @@ -234,7 +234,7 @@ class TransformerTrainOneStepWithLossScaleCell(nn.TrainOneStepWithLossScaleCell) self.loss_scale = None self.loss_scaling_manager = scale_update_cell if scale_update_cell: - self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=ms.float32)) + self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mindspore.float32)) self.enable_tuple_broaden = True @jit @@ -282,7 +282,7 @@ class TransformerTrainOneStepWithLossScaleCell(nn.TrainOneStepWithLossScaleCell) label_ids, label_weights, self.cast(scaling_sens, - 
ms.float32)) + mindspore.float32)) # apply grad reducer on grads grads = self.grad_reducer(grads) @@ -304,21 +304,21 @@ add_grads = ops.MultitypeFuncGraph("add_grads") @add_grads.register("Tensor", "Tensor") def _add_grads(accu_grad, grad): - return accu_grad + cast(grad, ms.float32) + return accu_grad + cast(grad, mindspore.float32) update_accu_grads = ops.MultitypeFuncGraph("update_accu_grads") @update_accu_grads.register("Tensor", "Tensor") def _update_accu_grads(accu_grad, grad): succ = True - return ops.depend(succ, ops.assign(accu_grad, cast(grad, ms.float32))) + return ops.depend(succ, ops.assign(accu_grad, cast(grad, mindspore.float32))) accumulate_accu_grads = ops.MultitypeFuncGraph("accumulate_accu_grads") @accumulate_accu_grads.register("Tensor", "Tensor") def _accumulate_accu_grads(accu_grad, grad): succ = True - return ops.depend(succ, ops.assign_add(accu_grad, cast(grad, ms.float32))) + return ops.depend(succ, ops.assign_add(accu_grad, cast(grad, mindspore.float32))) zeroslike = ops.ZerosLike() @@ -361,14 +361,14 @@ class TransformerTrainAccumulationAllReducePostWithLossScaleCell(nn.Cell): self.enable_global_norm = enable_global_norm self.one = Tensor(np.array([1]).astype(np.int32)) self.zero = Tensor(np.array([0]).astype(np.int32)) - self.local_step = Parameter(initializer(0, [1], ms.int32)) + self.local_step = Parameter(initializer(0, [1], mindspore.int32)) self.accu_grads = self.weights.clone(prefix="accu_grads", init='zeros') - self.accu_overflow = Parameter(initializer(0, [1], ms.int32)) - self.accu_loss = Parameter(initializer(0, [1], ms.float32)) + self.accu_overflow = Parameter(initializer(0, [1], mindspore.int32)) + self.accu_loss = Parameter(initializer(0, [1], mindspore.float32)) self.grad = ops.GradOperation(get_by_list=True, sens_param=True) self.reducer_flag = False - self.parallel_mode = ms.get_auto_parallel_context("parallel_mode") + self.parallel_mode = mindspore.get_auto_parallel_context("parallel_mode") if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]: self.reducer_flag = True self.grad_reducer = ops.identity @@ -382,7 +382,7 @@ class TransformerTrainAccumulationAllReducePostWithLossScaleCell(nn.Cell): self.overflow_reducer = ops.AllReduce() self.cast = ops.Cast() self.reduce_sum = ops.ReduceSum(keep_dims=False) - self.base = Tensor(1, ms.float32) + self.base = Tensor(1, mindspore.float32) self.less_equal = ops.LessEqual() self.logical_or = ops.LogicalOr() self.not_equal = ops.NotEqual() @@ -392,7 +392,7 @@ class TransformerTrainAccumulationAllReducePostWithLossScaleCell(nn.Cell): self.loss_scale = None self.loss_scaling_manager = scale_update_cell if scale_update_cell: - self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=ms.float32)) + self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mindspore.float32)) self.enable_tuple_broaden = True @jit @@ -455,7 +455,7 @@ class TransformerTrainAccumulationAllReducePostWithLossScaleCell(nn.Cell): label_ids, label_weights, self.cast(scaling_sens, - ms.float32)) + mindspore.float32)) accu_succ = self.clip_accumlate_hyper_map(grads) mean_loss = ops.depend(mean_loss, accu_succ) diff --git a/official/nlp/Transformer/src/transformer_model.py b/official/nlp/Transformer/src/transformer_model.py index c359d7d68..8c99c7002 100644 --- a/official/nlp/Transformer/src/transformer_model.py +++ b/official/nlp/Transformer/src/transformer_model.py @@ -17,7 +17,7 @@ import math import copy import numpy as np -import mindspore as ms +import mindspore 
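As an aside, the `on_value`/`off_value` constants set up in `TransformerTrainingLoss` earlier in this diff implement standard label smoothing; a toy NumPy check (vocabulary size and label chosen purely for illustration):

```python
import numpy as np

vocab_size, label_smoothing = 5, 0.1
on_value = 1.0 - label_smoothing                # 0.9, for the true class
off_value = label_smoothing / (vocab_size - 1)  # 0.025, for every other class

label_id = 2
smoothed = np.full(vocab_size, off_value)
smoothed[label_id] = on_value
print(smoothed)        # [0.025 0.025 0.9   0.025 0.025]
print(smoothed.sum())  # 1.0 -- smoothing still yields a valid distribution
```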
import mindspore.ops as ops import mindspore.nn as nn from mindspore.common.tensor import Tensor @@ -53,8 +53,8 @@ class TransformerConfig: beam_width (int): beam width setting. Default: 4 max_decode_length (int): max decode length in evaluation. Default: 80 length_penalty_weight (float): normalize scores of translations according to their length. Default: 1.0 - dtype (:class:`mindspore.dtype`): Data type of the input. Default: ms.float32. - compute_type (:class:`mindspore.dtype`): Compute type in Transformer. Default: ms.float32. + dtype (:class:`mindspore.dtype`): Data type of the input. Default: mindspore.float32. + compute_type (:class:`mindspore.dtype`): Compute type in Transformer. Default: mindspore.float32. """ def __init__(self, batch_size, @@ -73,8 +73,8 @@ class TransformerConfig: beam_width=4, max_decode_length=80, length_penalty_weight=1.0, - dtype=ms.float32, - compute_type=ms.float32): + dtype=mindspore.float32, + compute_type=mindspore.float32): self.batch_size = batch_size self.seq_length = seq_length self.vocab_size = vocab_size @@ -119,8 +119,8 @@ class EmbeddingLookup(nn.Cell): self.shape_flat = (-1,) self.gather = ops.Gather() self.one_hot = ops.OneHot() - self.on_value = Tensor(1.0, ms.float32) - self.off_value = Tensor(0.0, ms.float32) + self.on_value = Tensor(1.0, mindspore.float32) + self.off_value = Tensor(0.0, mindspore.float32) self.array_mul = ops.MatMul() self.reshape = ops.Reshape() self.shape = ops.Shape() @@ -185,14 +185,14 @@ class EmbeddingPostprocessor(nn.Cell): max_position_embeddings=128, dropout_prob=0.1): super(EmbeddingPostprocessor, self).__init__() - self.scores_mul = Tensor([math.sqrt(float(embedding_size))], dtype=ms.float32) + self.scores_mul = Tensor([math.sqrt(float(embedding_size))], dtype=mindspore.float32) self.multiply = ops.Mul() self.add = ops.Add() self.dropout = nn.Dropout(p=dropout_prob) self.use_dropout = dropout_prob > 0 self.expand_dims = ops.ExpandDims() self.position_embedding_table = Tensor(position_encoding(max_position_embeddings, embedding_size), - ms.float32) + mindspore.float32) self.shape = ops.Shape() def construct(self, word_embeddings): @@ -216,7 +216,7 @@ class CastWrapper(nn.Cell): """ Cast wrapper. """ - def __init__(self, src_type=ms.float32, dst_type=ms.float32): + def __init__(self, src_type=mindspore.float32, dst_type=mindspore.float32): super(CastWrapper, self).__init__() self.cast = ops.Cast() self.dst_type = dst_type @@ -237,7 +237,7 @@ class LayerPreprocess(nn.Cell): self.get_dtype = ops.DType() def construct(self, input_tensor): - output = self.cast(input_tensor, ms.float32) + output = self.cast(input_tensor, mindspore.float32) output = self.layernorm(output) output = self.cast(output, self.get_dtype(input_tensor)) return output @@ -284,7 +284,7 @@ class MultiheadAttention(nn.Cell): initializer_range (float): Initialization value of TruncatedNormal. Default: 0.02. do_return_2d_tensor (bool): True for return 2d tensor. False for return 3d tensor. Default: False. - compute_type (:class:`mindspore.dtype`): Compute type in MultiheadAttention. Default: ms.float32. + compute_type (:class:`mindspore.dtype`): Compute type in MultiheadAttention. Default: mindspore.float32. 
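`EmbeddingLookup` above keeps two equivalent lookup paths, a gather and a one-hot matrix multiply (`use_one_hot_embeddings=True`); a NumPy sketch with toy shapes showing why they agree:

```python
import numpy as np

vocab_size, embedding_size = 6, 4
table = np.random.randn(vocab_size, embedding_size).astype(np.float32)
ids = np.array([1, 3, 3])

gathered = table[ids]                                # ops.Gather path
one_hot = np.eye(vocab_size, dtype=np.float32)[ids]  # on_value=1.0, off_value=0.0
multiplied = one_hot @ table                         # ops.MatMul path

assert np.allclose(gathered, multiplied)  # both paths select the same rows
```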
""" def __init__(self, batch_size, @@ -302,7 +302,7 @@ class MultiheadAttention(nn.Cell): use_one_hot_embeddings=False, initializer_range=0.02, do_return_2d_tensor=True, - compute_type=ms.float32): + compute_type=mindspore.float32): super(MultiheadAttention, self).__init__() self.batch_size = batch_size self.num_attention_heads = num_attention_heads @@ -398,7 +398,7 @@ class MultiheadAttention(nn.Cell): adder = self.multiply(multiply_out, self.multiply_data) attention_scores = self.add(adder, attention_scores) - attention_scores = self.softmax_cast(attention_scores, ms.float32) + attention_scores = self.softmax_cast(attention_scores, mindspore.float32) attention_probs = self.softmax(attention_scores) attention_probs = self.softmax_cast(attention_probs, self.get_dtype(key_layer)) if self.use_dropout: @@ -431,7 +431,7 @@ class SelfAttention(nn.Cell): hidden_dropout_prob (float): The dropout probability for hidden outputs. Default: 0.1. has_attention_mask (bool): Specifies whether has attention mask. Default: True. is_encdec_att (bool): Specifies whether query sequence and memory sequence are different. Default: False. - compute_type (:class:`mindspore.dtype`): Compute type in MultiheadAttention. Default: ms.float32. + compute_type (:class:`mindspore.dtype`): Compute type in MultiheadAttention. Default: mindspore.float32. """ def __init__(self, batch_size, @@ -443,7 +443,7 @@ class SelfAttention(nn.Cell): hidden_dropout_prob=0.1, has_attention_mask=True, is_encdec_att=False, - compute_type=ms.float32): + compute_type=mindspore.float32): super(SelfAttention, self).__init__() if hidden_size % num_attention_heads != 0: raise ValueError("The hidden size (%d) is not a multiple of the number " @@ -496,7 +496,7 @@ class FeedForward(nn.Cell): hidden_act (str): name of the activation function. Default: relu initializer_range (float): Initialization value of TruncatedNormal. Default: 0.02. hidden_dropout_prob (float): The dropout probability for hidden outputs. Default: 0.1. - compute_type (:class:`mindspore.dtype`): Compute type in FeedForward. Default: ms.float32. + compute_type (:class:`mindspore.dtype`): Compute type in FeedForward. Default: mindspore.float32. """ def __init__(self, in_channels, @@ -505,7 +505,7 @@ class FeedForward(nn.Cell): hidden_act="relu", initializer_range=0.02, hidden_dropout_prob=0.1, - compute_type=ms.float32): + compute_type=mindspore.float32): super(FeedForward, self).__init__() self.conv1 = nn.Dense(in_channels, @@ -551,7 +551,7 @@ class EncoderCell(nn.Cell): initializer_range (float): Initialization value of TruncatedNormal. Default: 0.1. hidden_dropout_prob (float): The dropout probability for hidden outputs. Default: 0.1. hidden_act (str): Activation function. Default: "relu". - compute_type (:class:`mindspore.dtype`): Compute type in attention. Default: ms.float32. + compute_type (:class:`mindspore.dtype`): Compute type in attention. Default: mindspore.float32. """ def __init__(self, batch_size, @@ -563,7 +563,7 @@ class EncoderCell(nn.Cell): initializer_range=0.02, hidden_dropout_prob=0.1, hidden_act="relu", - compute_type=ms.float32): + compute_type=mindspore.float32): super(EncoderCell, self).__init__() self.attention = SelfAttention( batch_size=batch_size, @@ -609,7 +609,7 @@ class TransformerEncoder(nn.Cell): initializer_range (float): Initialization value of TruncatedNormal. Default: 0.02. hidden_dropout_prob (float): The dropout probability for hidden outputs. Default: 0.1.. hidden_act (str): Activation function used in the encoder cells. Default: "gelu". 
- compute_type (:class:`mindspore.dtype`): Compute type. Default: ms.float32. + compute_type (:class:`mindspore.dtype`): Compute type. Default: mindspore.float32. """ def __init__(self, batch_size, @@ -622,7 +622,7 @@ class TransformerEncoder(nn.Cell): initializer_range=0.02, hidden_dropout_prob=0.1, hidden_act="relu", - compute_type=ms.float32): + compute_type=mindspore.float32): super(TransformerEncoder, self).__init__() self.num_hidden_layers = num_hidden_layers self.batch_size = batch_size @@ -679,7 +679,7 @@ class DecoderCell(nn.Cell): initializer_range (float): Initialization value of TruncatedNormal. Default: 0.02. hidden_dropout_prob (float): The dropout probability for hidden outputs. Default: 0.1. hidden_act (str): Activation function. Default: "relu". - compute_type (:class:`mindspore.dtype`): Compute type in attention. Default: ms.float32. + compute_type (:class:`mindspore.dtype`): Compute type in attention. Default: mindspore.float32. """ def __init__(self, batch_size, @@ -691,7 +691,7 @@ class DecoderCell(nn.Cell): initializer_range=0.02, hidden_dropout_prob=0.1, hidden_act="relu", - compute_type=ms.float32): + compute_type=mindspore.float32): super(DecoderCell, self).__init__() self.self_attention = SelfAttention( batch_size=batch_size, @@ -751,7 +751,7 @@ class TransformerDecoder(nn.Cell): initializer_range (float): Initialization value of TruncatedNormal. Default: 0.02. hidden_dropout_prob (float): The dropout probability for hidden outputs. Default: 0.1. hidden_act (str): Activation function used in the encoder cells. Default: "gelu". - compute_type (:class:`mindspore.dtype`): Compute type. Default: ms.float32. + compute_type (:class:`mindspore.dtype`): Compute type. Default: mindspore.float32. """ def __init__(self, batch_size, @@ -764,7 +764,7 @@ class TransformerDecoder(nn.Cell): initializer_range=0.02, hidden_dropout_prob=0.1, hidden_act="relu", - compute_type=ms.float32): + compute_type=mindspore.float32): super(TransformerDecoder, self).__init__() self.num_hidden_layers = num_hidden_layers @@ -825,7 +825,7 @@ class CreateAttentionMaskFromInputMask(nn.Cell): shape_right = (input_shape[0], 1, input_shape[1]) shape_left = input_shape + (1,) - input_mask = self.cast(input_mask, ms.float32) + input_mask = self.cast(input_mask, mindspore.float32) mask_left = self.reshape(input_mask, shape_left) mask_right = self.reshape(input_mask, shape_right) attention_mask = self.batch_matmul(mask_left, mask_right) @@ -841,14 +841,14 @@ class PredLogProbs(nn.Cell): batch_size (int): Batch size. seq_length (int): Length of input sequence. width (int): Hidden size. - compute_type (:class:`mindspore.dtype`): Compute type. Default: ms.float32. - dtype (:class:`mindspore.dtype`): Compute type to compute log_softmax. Default: ms.float32. + compute_type (:class:`mindspore.dtype`): Compute type. Default: mindspore.float32. + dtype (:class:`mindspore.dtype`): Compute type to compute log_softmax. Default: mindspore.float32. """ def __init__(self, batch_size, width, - compute_type=ms.float32, - dtype=ms.float32): + compute_type=mindspore.float32, + dtype=mindspore.float32): super(PredLogProbs, self).__init__() self.batch_size = batch_size self.width = width @@ -896,7 +896,7 @@ class TransformerDecoderStep(nn.Cell): initializer_range (float): Initialization value of TruncatedNormal. Default: 0.02. hidden_dropout_prob (float): The dropout probability for hidden outputs. Default: 0.1. hidden_act (str): Activation function used in the encoder cells. Default: "gelu". 
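`CreateAttentionMaskFromInputMask` above turns a per-token padding mask into a `(batch, seq, seq)` attention mask with a batched outer product; a toy NumPy equivalent:

```python
import numpy as np

input_mask = np.array([[1, 1, 1, 0]], dtype=np.float32)  # last position is padding

mask_left = input_mask.reshape(1, 4, 1)
mask_right = input_mask.reshape(1, 1, 4)
attention_mask = mask_left @ mask_right  # ops.BatchMatMul equivalent

# attention_mask[0, i, j] == 1 only when positions i and j are both real
# tokens, so attention to and from padding is zeroed out.
```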
- compute_type (:class:`mindspore.dtype`): Compute type. Default: ms.float32. + compute_type (:class:`mindspore.dtype`): Compute type. Default: mindspore.float32. embedding_lookup (:class:`EmbeddingLookup`): Embedding lookup module. embedding_processor (:class:`EmbeddingPostprocessor`) Embedding postprocessor module. projection (:class:`PredLogProbs`): PredLogProbs module @@ -913,7 +913,7 @@ class TransformerDecoderStep(nn.Cell): initializer_range=0.02, hidden_dropout_prob=0.3, hidden_act="relu", - compute_type=ms.float32, + compute_type=mindspore.float32, embedding_lookup=None, embedding_processor=None, projection=None): @@ -945,7 +945,7 @@ class TransformerDecoderStep(nn.Cell): self.multiply = ops.Mul() ones = np.ones(shape=(max_decode_length, max_decode_length)) - self.future_mask = Tensor(np.tril(ones), dtype=ms.float32) + self.future_mask = Tensor(np.tril(ones), dtype=mindspore.float32) self.cast_compute_type = CastWrapper(dst_type=compute_type) @@ -985,7 +985,7 @@ class TransformerDecoderStep(nn.Cell): @constexpr def convert_np_to_tensor_encoder(seq_length): ones = np.ones(shape=(seq_length, seq_length)) - return Tensor(np.tril(ones), dtype=ms.float32) + return Tensor(np.tril(ones), dtype=mindspore.float32) class TransformerModel(nn.Cell): @@ -1099,7 +1099,7 @@ class TransformerModel(nn.Cell): self.tfm_decoder.add_flags(loop_can_unroll=True) self.tile_beam = TileBeam(beam_width=self.beam_width) ones = np.ones(shape=(self.batch_size, self.max_decode_length)) - self.encdec_mask = Tensor(ones, ms.float32) + self.encdec_mask = Tensor(ones, mindspore.float32) self.cast = ops.Cast() self.dtype = config.dtype diff --git a/official/nlp/Transformer/train.py b/official/nlp/Transformer/train.py index f2057680e..01c55c94f 100644 --- a/official/nlp/Transformer/train.py +++ b/official/nlp/Transformer/train.py @@ -18,7 +18,7 @@ import os import time from easydict import EasyDict as edict -import mindspore as ms +import mindspore from mindspore.common.tensor import Tensor from mindspore.nn.optim import Adam from mindspore.train.model import Model @@ -27,7 +27,7 @@ from mindspore.train.callback import CheckpointConfig, ModelCheckpoint from mindspore.train.callback import Callback, TimeMonitor import mindspore.communication.management as D from mindspore.communication.management import get_rank -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.common import set_seed from src.transformer_for_train import TransformerTrainOneStepCell, TransformerNetworkWithLoss, \ @@ -49,8 +49,8 @@ def get_ms_timestamp(): time_stamp_init = False time_stamp_first = 0 -config.dtype = ms.float32 -config.compute_type = ms.float16 +config.dtype = mindspore.float32 +config.compute_type = mindspore.float16 config.lr_schedule = edict({ 'learning_rate': 2.0, 'warmup_steps': 8000, @@ -114,18 +114,18 @@ def run_transformer_train(): Transformer training. 
""" if config.device_target == "Ascend": - ms.set_context(mode=ms.GRAPH_MODE, device_target=config.device_target, device_id=get_device_id()) + mindspore.set_context(mode=0, device_target=config.device_target, device_id=get_device_id()) else: - ms.set_context(mode=ms.GRAPH_MODE, device_target=config.device_target) - ms.set_context(reserve_class_name_in_scope=False) + mindspore.set_context(mode=0, device_target=config.device_target) + mindspore.set_context(reserve_class_name_in_scope=False) # Set mempool block size in PYNATIVE_MODE for improving memory utilization, which will not take effect in GRAPH_MODE - if ms.get_context("mode") == ms.PYNATIVE_MODE: - ms.set_context(mempool_block_size="31GB") + if mindspore.get_context("mode") == 1: + mindspore.set_context(mempool_block_size="31GB") if config.device_target == "GPU": # Enable graph kernel - ms.set_context(enable_graph_kernel=True, graph_kernel_flags="--enable_parallel_fusion") + mindspore.set_context(enable_graph_kernel=True, graph_kernel_flags="--enable_parallel_fusion") if config.distribute == "true": if config.device_target == "Ascend": device_num = config.device_num @@ -135,8 +135,8 @@ def run_transformer_train(): device_num = D.get_group_size() rank = get_rank() config.device_id = rank - ms.reset_auto_parallel_context() - ms.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, + mindspore.reset_auto_parallel_context() + mindspore.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, device_num=device_num) rank_id = config.device_id % device_num save_ckpt_path = os.path.join(config.save_checkpoint_path, 'ckpt_' + str(get_rank()) + '/') @@ -154,8 +154,8 @@ def run_transformer_train(): netwithloss = TransformerNetworkWithLoss(config, True) if config.checkpoint_path: - parameter_dict = ms.load_checkpoint(config.checkpoint_path) - ms.load_param_into_net(netwithloss, parameter_dict) + parameter_dict = mindspore.load_checkpoint(config.checkpoint_path) + mindspore.load_param_into_net(netwithloss, parameter_dict) hidden_size = config.hidden_size learning_rate = config.lr_schedule.learning_rate if config.device_target == "Ascend" else 1.0 @@ -165,7 +165,7 @@ def run_transformer_train(): warmup_steps=config.lr_schedule.warmup_steps, hidden_size=hidden_size, start_decay_step=config.lr_schedule.start_decay_step, - min_lr=config.lr_schedule.min_lr), ms.float32) + min_lr=config.lr_schedule.min_lr), mindspore.float32) if config.device_target == "GPU" and config.transformer_network == "large": optimizer = Adam(netwithloss.trainable_params(), lr, beta2=config.optimizer_adam_beta2) diff --git a/official/recommend/DeepFM/eval.py b/official/recommend/DeepFM/eval.py index 0f95f1aa2..6856de25f 100644 --- a/official/recommend/DeepFM/eval.py +++ b/official/recommend/DeepFM/eval.py @@ -17,7 +17,7 @@ import os import sys import time -from mindspore import context +import mindspore from mindspore.train.model import Model from mindspore.train.serialization import load_checkpoint, load_param_into_net @@ -30,7 +30,7 @@ from src.model_utils.device_adapter import get_device_id sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) device_id = get_device_id() # int(os.getenv('DEVICE_ID', '0')) -context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, device_id=device_id) +mindspore.set_context(mode=0, device_target=config.device_target, device_id=device_id) def add_write(file_path, print_str): with open(file_path, 'a+', encoding='utf-8') as file_out: 
diff --git a/official/recommend/DeepFM/export.py b/official/recommend/DeepFM/export.py index 88cd2ffa3..916d2cc7d 100644 --- a/official/recommend/DeepFM/export.py +++ b/official/recommend/DeepFM/export.py @@ -15,7 +15,8 @@ """export ckpt to model""" import numpy as np -from mindspore import context, Tensor +import mindspore +from mindspore import Tensor from mindspore.train.serialization import export, load_checkpoint from src.deepfm import ModelBuilder @@ -24,9 +25,9 @@ from src.model_utils.device_adapter import get_device_id from src.model_utils.moxing_adapter import moxing_wrapper -context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target) +mindspore.set_context(mode=0, device_target=config.device_target) if config.device_target == "Ascend": - context.set_context(device_id=get_device_id()) + mindspore.set_context(device_id=get_device_id()) def modelarts_process(): pass diff --git a/official/recommend/DeepFM/modelart/start.py b/official/recommend/DeepFM/modelart/start.py index 69ddc52ed..3ca12c64b 100644 --- a/official/recommend/DeepFM/modelart/start.py +++ b/official/recommend/DeepFM/modelart/start.py @@ -16,8 +16,8 @@ import os import sys -from mindspore import context -from mindspore.context import ParallelMode +import mindspore +from mindspore import ParallelMode from mindspore.communication.management import init, get_rank, get_group_size from mindspore.train.model import Model from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor @@ -49,11 +49,11 @@ def train_deepfm(): if config.rank_size > 1: if config.device_target == "Ascend": device_id = int(os.getenv('DEVICE_ID')) - context.set_context(mode=context.GRAPH_MODE, + mindspore.set_context(mode=0, device_target=config.device_target, device_id=device_id) - context.reset_auto_parallel_context() - context.set_auto_parallel_context( + mindspore.reset_auto_parallel_context() + mindspore.set_auto_parallel_context( parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, all_reduce_fusion_config=[9, 11]) @@ -61,13 +61,13 @@ def train_deepfm(): rank_id = int(os.environ.get('RANK_ID')) elif config.device_target == "GPU": init() - context.set_context(mode=context.GRAPH_MODE, + mindspore.set_context(mode=0, enable_graph_kernel=True, device_target=config.device_target) - context.set_context( + mindspore.set_context( graph_kernel_flags="--enable_cluster_ops=MatMul") - context.reset_auto_parallel_context() - context.set_auto_parallel_context( + mindspore.reset_auto_parallel_context() + mindspore.set_auto_parallel_context( device_num=get_group_size(), parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True) @@ -78,17 +78,17 @@ def train_deepfm(): else: if config.device_target == "Ascend": device_id = int(os.getenv('DEVICE_ID')) - context.set_context(mode=context.GRAPH_MODE, + mindspore.set_context(mode=0, device_target=config.device_target, device_id=device_id) elif config.device_target == "GPU": - context.set_context(mode=context.GRAPH_MODE, + mindspore.set_context(mode=0, enable_graph_kernel=True, device_target=config.device_target) - context.set_context( + mindspore.set_context( graph_kernel_flags="--enable_cluster_ops=MatMul") else: - context.set_context(mode=context.GRAPH_MODE, + mindspore.set_context(mode=0, device_target=config.device_target) config.rank_size = None rank_id = None diff --git a/official/recommend/DeepFM/src/deepfm.py b/official/recommend/DeepFM/src/deepfm.py index 26ac9a5ec..c564ceb6b 100644 --- a/official/recommend/DeepFM/src/deepfm.py +++ 
b/official/recommend/DeepFM/src/deepfm.py @@ -28,7 +28,7 @@ from mindspore.nn.metrics import Metric from mindspore import nn, Tensor, ParameterTuple, Parameter from mindspore.common.initializer import Uniform, initializer from mindspore.train.callback import ModelCheckpoint, CheckpointConfig -from mindspore.context import ParallelMode, get_auto_parallel_context +from mindspore import ParallelMode, get_auto_parallel_context from mindspore.communication.management import get_group_size from mindspore.nn.wrap.grad_reducer import DistributedGradReducer diff --git a/official/recommend/DeepFM/src/model_utils/device_adapter.py b/official/recommend/DeepFM/src/model_utils/device_adapter.py index 7c5d7f837..825c667a2 100644 --- a/official/recommend/DeepFM/src/model_utils/device_adapter.py +++ b/official/recommend/DeepFM/src/model_utils/device_adapter.py @@ -15,6 +15,7 @@ """Device adapter for ModelArts""" +import mindspore from .config import config if config.enable_modelarts: diff --git a/official/recommend/DeepFM/src/model_utils/moxing_adapter.py b/official/recommend/DeepFM/src/model_utils/moxing_adapter.py index 830d19a6f..9c6d88e5d 100644 --- a/official/recommend/DeepFM/src/model_utils/moxing_adapter.py +++ b/official/recommend/DeepFM/src/model_utils/moxing_adapter.py @@ -17,7 +17,7 @@ import os import functools -from mindspore import context +import mindspore from mindspore.profiler import Profiler from .config import config @@ -93,7 +93,7 @@ def moxing_wrapper(pre_process=None, post_process=None): sync_data(config.train_url, config.output_path) print("Workspace downloaded: ", os.listdir(config.output_path)) - context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) + mindspore.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) config.device_num = get_device_num() config.device_id = get_device_id() if not os.path.exists(config.output_path): diff --git a/official/recommend/DeepFM/train.py b/official/recommend/DeepFM/train.py index 2cb40957b..4f57f702e 100644 --- a/official/recommend/DeepFM/train.py +++ b/official/recommend/DeepFM/train.py @@ -16,8 +16,8 @@ import os import sys -from mindspore import context -from mindspore.context import ParallelMode +import mindspore +from mindspore import ParallelMode from mindspore.communication.management import init, get_rank, get_group_size from mindspore.train.model import Model from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor @@ -45,23 +45,23 @@ def modelarts_pre_process(): def train_deepfm(): """ train_deepfm """ if config.device_target == "Ascend": - context.set_context(ascend_config={"op_precision_mode": "op_precision.ini"}) + mindspore.set_context(ascend_config={"op_precision_mode": "op_precision.ini"}) if config.rank_size > 1: if config.device_target == "Ascend": device_id = int(os.getenv('DEVICE_ID')) - context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, device_id=device_id) - context.reset_auto_parallel_context() - context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, + mindspore.set_context(mode=0, device_target=config.device_target, device_id=device_id) + mindspore.reset_auto_parallel_context() + mindspore.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, all_reduce_fusion_config=[9, 11]) init() rank_id = int(os.environ.get('RANK_ID')) elif config.device_target == "GPU": init() - context.set_context(mode=context.GRAPH_MODE, enable_graph_kernel=True, 
device_target=config.device_target) - context.set_context(graph_kernel_flags="--enable_cluster_ops=MatMul") - context.reset_auto_parallel_context() - context.set_auto_parallel_context(device_num=get_group_size(), + mindspore.set_context(mode=0, enable_graph_kernel=True, device_target=config.device_target) + mindspore.set_context(graph_kernel_flags="--enable_cluster_ops=MatMul") + mindspore.reset_auto_parallel_context() + mindspore.set_auto_parallel_context(device_num=get_group_size(), parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True) rank_id = get_rank() @@ -71,12 +71,12 @@ def train_deepfm(): else: if config.device_target == "Ascend": device_id = int(os.getenv('DEVICE_ID')) - context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, device_id=device_id) + mindspore.set_context(mode=0, device_target=config.device_target, device_id=device_id) elif config.device_target == "GPU": - context.set_context(mode=context.GRAPH_MODE, enable_graph_kernel=True, device_target=config.device_target) - context.set_context(graph_kernel_flags="--enable_cluster_ops=MatMul") + mindspore.set_context(mode=0, enable_graph_kernel=True, device_target=config.device_target) + mindspore.set_context(graph_kernel_flags="--enable_cluster_ops=MatMul") else: - context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, enable_graph_kernel=True) + mindspore.set_context(mode=0, device_target=config.device_target, enable_graph_kernel=True) config.rank_size = None rank_id = None diff --git a/official/recommend/Wide_and_Deep/eval.py b/official/recommend/Wide_and_Deep/eval.py index d8c448fcb..b936350f9 100644 --- a/official/recommend/Wide_and_Deep/eval.py +++ b/official/recommend/Wide_and_Deep/eval.py @@ -17,7 +17,8 @@ import os -from mindspore import Model, context +import mindspore +from mindspore import Model from mindspore.train.serialization import load_checkpoint, load_param_into_net,\ build_searched_strategy, merge_sliced_parameter @@ -118,7 +119,7 @@ def modelarts_pre_process(): @moxing_wrapper(pre_process=modelarts_pre_process) def eval_wide_and_deep(): - context.set_context(mode=context.GRAPH_MODE, device_target=cfg.device_target) + mindspore.set_context(mode=0, device_target=cfg.device_target) test_eval(cfg) if __name__ == "__main__": diff --git a/official/recommend/Wide_and_Deep/export.py b/official/recommend/Wide_and_Deep/export.py index 55867a8af..5af232159 100644 --- a/official/recommend/Wide_and_Deep/export.py +++ b/official/recommend/Wide_and_Deep/export.py @@ -16,7 +16,9 @@ ##############export checkpoint file into air, mindir and onnx models################# """ import numpy as np -from mindspore import Tensor, context, load_checkpoint, export, load_param_into_net + +import mindspore +from mindspore import Tensor, load_checkpoint, export, load_param_into_net from eval import ModelBuilder from src.model_utils.device_adapter import get_device_id @@ -24,9 +26,9 @@ from src.model_utils.config import config from src.model_utils.moxing_adapter import moxing_wrapper -context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target) +mindspore.set_context(mode=0, device_target=config.device_target) if config.device_target == "Ascend": - context.set_context(device_id=get_device_id()) + mindspore.set_context(device_id=get_device_id()) def modelarts_pre_process(): pass diff --git a/official/recommend/Wide_and_Deep/modelart/start.py b/official/recommend/Wide_and_Deep/modelart/start.py index dedbb8f81..38299bd3c 100644 --- 
a/official/recommend/Wide_and_Deep/modelart/start.py +++ b/official/recommend/Wide_and_Deep/modelart/start.py @@ -14,7 +14,9 @@ # ============================================================================ """wideanddeep modelarts""" import os -from mindspore import Model, context + +import mindspore +from mindspore import Model from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor import moxing as mox from src.wide_and_deep import PredictWithSigmoid, TrainStepWrap, NetWithLossClass, WideDeepModel @@ -104,11 +106,11 @@ def modelarts_pre_process(): def train_wide_and_deep(): """train wide and deep""" enable_graph_kernel_ = config.device_target == "GPU" - context.set_context(mode=context.GRAPH_MODE, + mindspore.set_context(mode=0, enable_graph_kernel=enable_graph_kernel_, device_target=config.device_target) if enable_graph_kernel_: - context.set_context(graph_kernel_flags="--enable_cluster_ops=MatMul") + mindspore.set_context(graph_kernel_flags="--enable_cluster_ops=MatMul") test_train(config) diff --git a/official/recommend/Wide_and_Deep/src/callbacks.py b/official/recommend/Wide_and_Deep/src/callbacks.py index c10e221ad..90fdfa42f 100644 --- a/official/recommend/Wide_and_Deep/src/callbacks.py +++ b/official/recommend/Wide_and_Deep/src/callbacks.py @@ -15,9 +15,10 @@ callbacks """ import time + +import mindspore from mindspore.train.callback import Callback -from mindspore import context -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.communication.management import get_rank def add_write(file_path, out_str): @@ -57,7 +58,7 @@ class LossCallBack(Callback): cur_step_in_epoch = (cb_params.cur_step_num - 1) % cb_params.batch_num + 1 cur_num = cb_params.cur_step_num rank_id = 0 - parallel_mode = context.get_auto_parallel_context("parallel_mode") + parallel_mode = mindspore.get_auto_parallel_context("parallel_mode") if parallel_mode in (ParallelMode.SEMI_AUTO_PARALLEL, ParallelMode.AUTO_PARALLEL, ParallelMode.DATA_PARALLEL): rank_id = get_rank() @@ -107,9 +108,9 @@ class EvalCallBack(Callback): epoch end """ self.aucMetric.clear() - parallel_mode = context.get_auto_parallel_context("parallel_mode") + parallel_mode = mindspore.get_auto_parallel_context("parallel_mode") if parallel_mode in (ParallelMode.SEMI_AUTO_PARALLEL, ParallelMode.AUTO_PARALLEL): - context.set_auto_parallel_context(strategy_ckpt_save_file="", + mindspore.set_auto_parallel_context(strategy_ckpt_save_file="", strategy_ckpt_load_file=self.config.stra_ckpt) rank_id = 0 if parallel_mode in (ParallelMode.SEMI_AUTO_PARALLEL, ParallelMode.AUTO_PARALLEL, diff --git a/official/recommend/Wide_and_Deep/src/model_utils/device_adapter.py b/official/recommend/Wide_and_Deep/src/model_utils/device_adapter.py index 7c5d7f837..825c667a2 100644 --- a/official/recommend/Wide_and_Deep/src/model_utils/device_adapter.py +++ b/official/recommend/Wide_and_Deep/src/model_utils/device_adapter.py @@ -15,6 +15,7 @@ """Device adapter for ModelArts""" +import mindspore from .config import config if config.enable_modelarts: diff --git a/official/recommend/Wide_and_Deep/src/model_utils/moxing_adapter.py b/official/recommend/Wide_and_Deep/src/model_utils/moxing_adapter.py index 830d19a6f..9c6d88e5d 100644 --- a/official/recommend/Wide_and_Deep/src/model_utils/moxing_adapter.py +++ b/official/recommend/Wide_and_Deep/src/model_utils/moxing_adapter.py @@ -17,7 +17,7 @@ import os import functools -from mindspore import context +import mindspore from mindspore.profiler import Profiler 
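The rank-resolution logic in `LossCallBack` above reduces to a small helper; a sketch assuming a MindSpore 2.x build where `ParallelMode` is exported from the top-level package (the `resolve_rank_id` name is illustrative):

```python
import mindspore
from mindspore import ParallelMode
from mindspore.communication.management import get_rank

def resolve_rank_id():
    parallel_mode = mindspore.get_auto_parallel_context("parallel_mode")
    if parallel_mode in (ParallelMode.SEMI_AUTO_PARALLEL,
                         ParallelMode.AUTO_PARALLEL,
                         ParallelMode.DATA_PARALLEL):
        return get_rank()  # only valid after communication init()
    return 0               # stand-alone execution
```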
from .config import config @@ -93,7 +93,7 @@ def moxing_wrapper(pre_process=None, post_process=None): sync_data(config.train_url, config.output_path) print("Workspace downloaded: ", os.listdir(config.output_path)) - context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) + mindspore.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) config.device_num = get_device_num() config.device_id = get_device_id() if not os.path.exists(config.output_path): diff --git a/official/recommend/Wide_and_Deep/src/wide_and_deep.py b/official/recommend/Wide_and_Deep/src/wide_and_deep.py index 4c3a5c446..5ccab1a0f 100644 --- a/official/recommend/Wide_and_Deep/src/wide_and_deep.py +++ b/official/recommend/Wide_and_Deep/src/wide_and_deep.py @@ -14,14 +14,15 @@ # ============================================================================ """wide and deep model""" import numpy as np -from mindspore import nn, context +import mindspore +from mindspore import nn from mindspore import Parameter, ParameterTuple import mindspore.common.dtype as mstype import mindspore.ops as ops from mindspore.nn import Dropout from mindspore.nn.optim import Adam, FTRL from mindspore.common.initializer import Uniform, initializer -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.nn.wrap.grad_reducer import DistributedGradReducer from mindspore.communication.management import get_group_size @@ -137,7 +138,7 @@ class WideDeepModel(nn.Cell): self.batch_size = config.batch_size host_device_mix = bool(config.host_device_mix) parameter_server = bool(config.parameter_server) - parallel_mode = context.get_auto_parallel_context("parallel_mode") + parallel_mode = mindspore.get_auto_parallel_context("parallel_mode") is_auto_parallel = parallel_mode in (ParallelMode.SEMI_AUTO_PARALLEL, ParallelMode.AUTO_PARALLEL) if is_auto_parallel: self.batch_size = self.batch_size * get_group_size() @@ -275,7 +276,7 @@ class NetWithLossClass(nn.Cell): host_device_mix = bool(config.host_device_mix) parameter_server = bool(config.parameter_server) sparse = config.sparse - parallel_mode = context.get_auto_parallel_context("parallel_mode") + parallel_mode = mindspore.get_auto_parallel_context("parallel_mode") is_auto_parallel = parallel_mode in (ParallelMode.SEMI_AUTO_PARALLEL, ParallelMode.AUTO_PARALLEL) self.no_l2loss = (is_auto_parallel if (host_device_mix or config.field_slice) else parameter_server) @@ -332,7 +333,7 @@ class TrainStepWrap(nn.Cell): def __init__(self, network, sens=1024.0, host_device_mix=False, parameter_server=False, sparse=False, cache_enable=False): super(TrainStepWrap, self).__init__() - parallel_mode = context.get_auto_parallel_context("parallel_mode") + parallel_mode = mindspore.get_auto_parallel_context("parallel_mode") is_auto_parallel = parallel_mode in (ParallelMode.SEMI_AUTO_PARALLEL, ParallelMode.AUTO_PARALLEL) self.network = network self.network.set_train() @@ -377,8 +378,8 @@ class TrainStepWrap(nn.Cell): self.reducer_flag = parallel_mode in (ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL) if self.reducer_flag: - mean = context.get_auto_parallel_context("gradients_mean") - degree = context.get_auto_parallel_context("device_num") + mean = mindspore.get_auto_parallel_context("gradients_mean") + degree = mindspore.get_auto_parallel_context("device_num") self.grad_reducer_w = DistributedGradReducer(self.optimizer_w.parameters, mean, degree) self.grad_reducer_d = DistributedGradReducer(self.optimizer_d.parameters, 
mean, degree) @@ -409,8 +410,8 @@ class PredictWithSigmoid(nn.Cell): super(PredictWithSigmoid, self).__init__() self.network = network self.sigmoid = ops.Sigmoid() - parallel_mode = context.get_auto_parallel_context("parallel_mode") - full_batch = context.get_auto_parallel_context("full_batch") + parallel_mode = mindspore.get_auto_parallel_context("parallel_mode") + full_batch = mindspore.get_auto_parallel_context("full_batch") is_auto_parallel = parallel_mode in (ParallelMode.SEMI_AUTO_PARALLEL, ParallelMode.AUTO_PARALLEL) if is_auto_parallel and full_batch: self.sigmoid.shard(((1, 1),)) diff --git a/official/recommend/Wide_and_Deep/train.py b/official/recommend/Wide_and_Deep/train.py index 5d018831d..3d1d7894d 100644 --- a/official/recommend/Wide_and_Deep/train.py +++ b/official/recommend/Wide_and_Deep/train.py @@ -13,7 +13,9 @@ # limitations under the License. """ test_training """ import os -from mindspore import Model, context + +import mindspore +from mindspore import Model from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor from src.wide_and_deep import PredictWithSigmoid, TrainStepWrap, NetWithLossClass, WideDeepModel from src.callbacks import LossCallBack @@ -93,12 +95,12 @@ def modelarts_pre_process(): @moxing_wrapper(pre_process=modelarts_pre_process) def train_wide_and_deep(): _enable_graph_kernel = config.device_target == "GPU" - context.set_context(mode=context.GRAPH_MODE, + mindspore.set_context(mode=0, enable_graph_kernel=_enable_graph_kernel, device_target=config.device_target) if config.device_target == "Ascend": - context.set_context(ascend_config={"op_precision_mode": "op_precision.ini"}) + mindspore.set_context(ascend_config={"op_precision_mode": "op_precision.ini"}) if _enable_graph_kernel: - context.set_context(graph_kernel_flags="--enable_cluster_ops=MatMul") + mindspore.set_context(graph_kernel_flags="--enable_cluster_ops=MatMul") test_train(config) if __name__ == "__main__": diff --git a/official/recommend/Wide_and_Deep/train_and_eval.py b/official/recommend/Wide_and_Deep/train_and_eval.py index 88b0a16be..61a4689c3 100644 --- a/official/recommend/Wide_and_Deep/train_and_eval.py +++ b/official/recommend/Wide_and_Deep/train_and_eval.py @@ -14,7 +14,8 @@ """ test_training """ import os -from mindspore import Model, context +import mindspore +from mindspore import Model from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor from src.wide_and_deep import PredictWithSigmoid, TrainStepWrap, NetWithLossClass, WideDeepModel @@ -109,12 +110,12 @@ def modelarts_pre_process(): @moxing_wrapper(pre_process=modelarts_pre_process) def train_wide_and_deep(): if cfg.device_target == "Ascend": - context.set_context(ascend_config={"op_precision_mode": "op_precision.ini"}) + mindspore.set_context(ascend_config={"op_precision_mode": "op_precision.ini"}) _enable_graph_kernel = cfg.device_target == "GPU" - context.set_context(mode=context.GRAPH_MODE, + mindspore.set_context(mode=0, enable_graph_kernel=_enable_graph_kernel, device_target=cfg.device_target) if _enable_graph_kernel: - context.set_context(graph_kernel_flags="--enable_cluster_ops=MatMul") + mindspore.set_context(graph_kernel_flags="--enable_cluster_ops=MatMul") test_train_eval(cfg) if __name__ == "__main__": diff --git a/official/recommend/Wide_and_Deep/train_and_eval_auto_parallel.py b/official/recommend/Wide_and_Deep/train_and_eval_auto_parallel.py index e82281710..7c3124ae9 100644 --- a/official/recommend/Wide_and_Deep/train_and_eval_auto_parallel.py +++ 
b/official/recommend/Wide_and_Deep/train_and_eval_auto_parallel.py @@ -17,10 +17,12 @@ import os import sys + +import mindspore import mindspore.dataset as ds -from mindspore import Model, context +from mindspore import Model from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.parallel import set_algo_parameters from mindspore.communication.management import get_rank, get_group_size, init @@ -86,7 +88,7 @@ def train_and_eval(config): sparse = config.sparse print("epochs is {}".format(epochs)) if config.full_batch: - context.set_auto_parallel_context(full_batch=True) + mindspore.set_auto_parallel_context(full_batch=True) ds.config.set_seed(1) if config.field_slice: compute_manual_shape(config, get_group_size()) @@ -132,7 +134,7 @@ def train_and_eval(config): ckpoint_cb = ModelCheckpoint(prefix='widedeep_train', directory=os.path.join(config.ckpt_path, 'ckpt_' + str(get_rank())), config=ckptconfig) - context.set_auto_parallel_context(strategy_ckpt_save_file=config.stra_ckpt) + mindspore.set_auto_parallel_context(strategy_ckpt_save_file=config.stra_ckpt) callback_list = [TimeMonitor( ds_train.get_dataset_size()), eval_callback, callback] if not host_device_mix: @@ -148,25 +150,25 @@ def modelarts_pre_process(): @moxing_wrapper(pre_process=modelarts_pre_process) def train_wide_and_deep(): """ train_wide_and_deep """ - context.set_context(mode=context.GRAPH_MODE, + mindspore.set_context(mode=0, device_target=cfg.device_target) if cfg.device_target == "Ascend": - context.set_context(ascend_config={"op_precision_mode": "op_precision.ini"}) + mindspore.set_context(ascend_config={"op_precision_mode": "op_precision.ini"}) if cfg.device_target == "GPU": - context.set_context(enable_graph_kernel=True) - context.set_context(max_device_memory="24GB") + mindspore.set_context(enable_graph_kernel=True) + mindspore.set_context(max_device_memory="24GB") init() if cfg.sparse: if cfg.use_sp: - context.set_auto_parallel_context(parallel_mode=ParallelMode.AUTO_PARALLEL, enable_alltoall=True, + mindspore.set_auto_parallel_context(parallel_mode=ParallelMode.AUTO_PARALLEL, enable_alltoall=True, search_mode="sharding_propagation", gradients_mean=True, strategy_ckpt_save_file='strategy.ckpt') set_algo_parameters(elementwise_op_strategy_follow=False, fully_use_devices=False) else: - context.set_auto_parallel_context( + mindspore.set_auto_parallel_context( parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL, gradients_mean=True) else: - context.set_auto_parallel_context( + mindspore.set_auto_parallel_context( parallel_mode=ParallelMode.AUTO_PARALLEL, gradients_mean=True, search_mode="dynamic_programming") train_and_eval(cfg) diff --git a/official/recommend/Wide_and_Deep/train_and_eval_distribute.py b/official/recommend/Wide_and_Deep/train_and_eval_distribute.py index faa2ce17d..693079014 100644 --- a/official/recommend/Wide_and_Deep/train_and_eval_distribute.py +++ b/official/recommend/Wide_and_Deep/train_and_eval_distribute.py @@ -17,9 +17,11 @@ import os import sys -from mindspore import Model, context + +import mindspore +from mindspore import Model from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.communication.management import get_rank, get_group_size, init from mindspore.common import set_seed @@ -124,17 +126,17 @@ def modelarts_pre_process(): 
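Condensed, the auto-parallel bootstrap that the hunks above migrate looks roughly like the following; the flags are illustrative, not a drop-in replacement for the script's config handling:

```python
import mindspore
from mindspore import ParallelMode
from mindspore.communication.management import init

mindspore.set_context(mode=mindspore.GRAPH_MODE, device_target="Ascend")
init()  # requires a distributed launcher / rank table
mindspore.set_auto_parallel_context(
    parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL,
    gradients_mean=True,
    strategy_ckpt_save_file="strategy.ckpt")
```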
@moxing_wrapper(pre_process=modelarts_pre_process) def train_wide_and_deep(): """ train_wide_and_deep """ - context.set_context(mode=context.GRAPH_MODE, device_target=cfg.device_target) + mindspore.set_context(mode=0, device_target=cfg.device_target) if cfg.device_target == "Ascend": - context.set_context(ascend_config={"op_precision_mode": "op_precision.ini"}) + mindspore.set_context(ascend_config={"op_precision_mode": "op_precision.ini"}) _enable_graph_kernel = cfg.device_target == "GPU" if _enable_graph_kernel: - context.set_context(enable_graph_kernel=True) - context.set_context(graph_kernel_flags="--enable_cluster_ops=MatMul") + mindspore.set_context(enable_graph_kernel=True) + mindspore.set_context(graph_kernel_flags="--enable_cluster_ops=MatMul") init() - context.set_context(save_graphs_path='./graphs_of_device_id_'+str(get_rank())) - context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, + mindspore.set_context(save_graphs_path='./graphs_of_device_id_'+str(get_rank())) + mindspore.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, device_num=get_group_size(), all_reduce_fusion_config=[6, 12]) train_and_eval(cfg) diff --git a/official/recommend/Wide_and_Deep/train_and_eval_parameter_server_distribute.py b/official/recommend/Wide_and_Deep/train_and_eval_parameter_server_distribute.py index eab55ccc6..293695ad6 100644 --- a/official/recommend/Wide_and_Deep/train_and_eval_parameter_server_distribute.py +++ b/official/recommend/Wide_and_Deep/train_and_eval_parameter_server_distribute.py @@ -17,10 +17,12 @@ import os import sys + +import mindspore import mindspore.dataset as ds -from mindspore import Model, context +from mindspore import Model from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.communication.management import get_rank, get_group_size, init from mindspore.common import set_seed @@ -88,7 +90,7 @@ def train_and_eval(config): config.full_batch = True print("epochs is {}".format(epochs)) if config.full_batch and os.getenv("MS_ROLE") == "MS_WORKER": - context.set_auto_parallel_context(full_batch=True) + mindspore.set_auto_parallel_context(full_batch=True) ds.config.set_seed(1) ds_train = create_dataset(data_path, train_mode=True, batch_size=batch_size*get_group_size(), data_type=dataset_type) @@ -116,7 +118,7 @@ def train_and_eval(config): if cache_enable: config.stra_ckpt = os.path.join( config.stra_ckpt + "-{}".format(get_rank()), "strategy.ckpt") - context.set_auto_parallel_context( + mindspore.set_auto_parallel_context( strategy_ckpt_save_file=config.stra_ckpt) eval_callback = EvalCallBack(model, ds_eval, auc_metric, config) @@ -149,7 +151,7 @@ def modelarts_pre_process(): cfg.ckpt_path = cfg.output_path -context.set_context(mode=context.GRAPH_MODE, device_target=cfg.device_target) +mindspore.set_context(mode=0, device_target=cfg.device_target) cache_enable = cfg.vocab_cache_size > 0 @@ -157,26 +159,26 @@ cache_enable = cfg.vocab_cache_size > 0 def train_wide_and_deep(): """ train_wide_and_deep """ if cache_enable and cfg.device_target != "GPU": - context.set_context(max_device_memory="24GB") - context.set_ps_context(enable_ps=True) + mindspore.set_context(max_device_memory="24GB") + mindspore.set_ps_context(enable_ps=True) if cfg.device_target == "Ascend": - context.set_context(ascend_config={"op_precision_mode": "op_precision.ini"}) + 
mindspore.set_context(ascend_config={"op_precision_mode": "op_precision.ini"}) init() - context.set_context( + mindspore.set_context( save_graphs_path='./graphs_of_device_id_'+str(get_rank())) if cache_enable: if os.getenv("MS_ROLE") == "MS_WORKER": - context.set_auto_parallel_context( + mindspore.set_auto_parallel_context( parallel_mode=ParallelMode.AUTO_PARALLEL, gradients_mean=True, search_mode="dynamic_programming") else: - context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, + mindspore.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, device_num=get_group_size(), search_mode="dynamic_programming") cfg.sparse = True if cfg.device_target == "GPU": - context.set_context(enable_graph_kernel=True) - context.set_context(graph_kernel_flags="--enable_cluster_ops=MatMul") + mindspore.set_context(enable_graph_kernel=True) + mindspore.set_context(graph_kernel_flags="--enable_cluster_ops=MatMul") train_and_eval(cfg) diff --git a/official/recommend/Wide_and_Deep/train_and_eval_parameter_server_standalone.py b/official/recommend/Wide_and_Deep/train_and_eval_parameter_server_standalone.py index 5601767d8..9fe99aa38 100644 --- a/official/recommend/Wide_and_Deep/train_and_eval_parameter_server_standalone.py +++ b/official/recommend/Wide_and_Deep/train_and_eval_parameter_server_standalone.py @@ -17,7 +17,9 @@ import os import sys -from mindspore import Model, context + +import mindspore +from mindspore import Model from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor from mindspore.common import set_seed @@ -120,23 +122,23 @@ def train_and_eval(config): def modelarts_pre_process(): cfg.ckpt_path = cfg.output_path -context.set_context(mode=context.GRAPH_MODE, device_target=cfg.device_target) +mindspore.set_context(mode=0, device_target=cfg.device_target) cache_enable = cfg.vocab_cache_size > 0 @moxing_wrapper(pre_process=modelarts_pre_process) def train_wide_and_deep(): """ train_wide_and_deep """ - context.set_ps_context(enable_ps=True) + mindspore.set_ps_context(enable_ps=True) if cfg.device_target == "Ascend": - context.set_context(ascend_config={"op_precision_mode": "op_precision.ini"}) + mindspore.set_context(ascend_config={"op_precision_mode": "op_precision.ini"}) init() if not cache_enable: cfg.sparse = True if cfg.device_target == "GPU": - context.set_context(enable_graph_kernel=True) - context.set_context(graph_kernel_flags="--enable_cluster_ops=MatMul") + mindspore.set_context(enable_graph_kernel=True) + mindspore.set_context(graph_kernel_flags="--enable_cluster_ops=MatMul") train_and_eval(cfg) diff --git a/official/recommend/Wide_and_Deep/train_distribute.py b/official/recommend/Wide_and_Deep/train_distribute.py index c22d19d46..935e85698 100644 --- a/official/recommend/Wide_and_Deep/train_distribute.py +++ b/official/recommend/Wide_and_Deep/train_distribute.py @@ -17,9 +17,11 @@ import os import sys -from mindspore import Model, context + +import mindspore +from mindspore import Model from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.communication.management import get_rank, get_group_size, init from mindspore.common import set_seed @@ -120,18 +122,18 @@ def modelarts_pre_process(): def train_wide_and_deep(): """ train_wide_and_deep """ - context.set_context(mode=context.GRAPH_MODE, device_target=cfg.device_target, save_graphs=True) + 
mindspore.set_context(mode=0, device_target=cfg.device_target, save_graphs=True) if cfg.device_target == "Ascend": - context.set_context(ascend_config={"op_precision_mode": "op_precision.ini"}) + mindspore.set_context(ascend_config={"op_precision_mode": "op_precision.ini"}) _enable_graph_kernel = cfg.device_target == "GPU" if _enable_graph_kernel: - context.set_context(enable_graph_kernel=True) - context.set_context(graph_kernel_flags="--enable_cluster_ops=MatMul") + mindspore.set_context(enable_graph_kernel=True) + mindspore.set_context(graph_kernel_flags="--enable_cluster_ops=MatMul") init() - context.set_context(save_graphs_path='./graphs_of_device_id_'+str(get_rank())) - context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, + mindspore.set_context(save_graphs_path='./graphs_of_device_id_'+str(get_rank())) + mindspore.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, device_num=get_group_size(), all_reduce_fusion_config=[6, 12]) train_and_eval(cfg) diff --git a/official/recommend/Wide_and_Deep_Multitable/eval.py b/official/recommend/Wide_and_Deep_Multitable/eval.py index b5de832d8..065743c91 100644 --- a/official/recommend/Wide_and_Deep_Multitable/eval.py +++ b/official/recommend/Wide_and_Deep_Multitable/eval.py @@ -16,7 +16,9 @@ import os import sys -from mindspore import Model, context + +import mindspore +from mindspore import Model from mindspore.train.serialization import load_checkpoint, load_param_into_net from src.wide_and_deep import PredictWithSigmoid, TrainStepWrap, NetWithLossClass, WideDeepModel @@ -91,5 +93,5 @@ if __name__ == "__main__": wide_and_deep_config = WideDeepConfig() wide_and_deep_config.argparse_init() compute_emb_dim(wide_and_deep_config) - context.set_context(mode=context.GRAPH_MODE, device_target="Davinci") + mindspore.set_context(mode=0, device_target="Davinci") train_and_eval(wide_and_deep_config) diff --git a/official/recommend/Wide_and_Deep_Multitable/src/wide_and_deep.py b/official/recommend/Wide_and_Deep_Multitable/src/wide_and_deep.py index 5460f1efe..86af6769d 100644 --- a/official/recommend/Wide_and_Deep_Multitable/src/wide_and_deep.py +++ b/official/recommend/Wide_and_Deep_Multitable/src/wide_and_deep.py @@ -15,8 +15,9 @@ """wide and deep model""" import numpy as np +import mindspore import mindspore.common.dtype as mstype -from mindspore import nn, context +from mindspore import nn from mindspore import Tensor, Parameter, ParameterTuple from mindspore.ops import functional as F from mindspore.ops import composite as C @@ -24,7 +25,7 @@ from mindspore.ops import operations as P from mindspore.nn import Dropout, Flatten from mindspore.nn.optim import Adam, FTRL from mindspore.common.initializer import Uniform, initializer -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.nn.wrap.grad_reducer import DistributedGradReducer @@ -550,13 +551,13 @@ class TrainStepWrap(nn.Cell): self.reducer_flag = False self.grad_reducer_w = None self.grad_reducer_d = None - parallel_mode = context.get_auto_parallel_context("parallel_mode") + parallel_mode = mindspore.get_auto_parallel_context("parallel_mode") if parallel_mode in (ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL): self.reducer_flag = True if self.reducer_flag: - mean = context.get_auto_parallel_context("gradients_mean") - degree = context.get_auto_parallel_context("device_num") + mean = mindspore.get_auto_parallel_context("gradients_mean") + degree = 
mindspore.get_auto_parallel_context("device_num") self.grad_reducer_w = DistributedGradReducer( self.optimizer_w.parameters, mean, degree) self.grad_reducer_d = DistributedGradReducer( diff --git a/official/recommend/Wide_and_Deep_Multitable/train_and_eval.py b/official/recommend/Wide_and_Deep_Multitable/train_and_eval.py index 0a8df002b..72b5971fa 100644 --- a/official/recommend/Wide_and_Deep_Multitable/train_and_eval.py +++ b/official/recommend/Wide_and_Deep_Multitable/train_and_eval.py @@ -16,7 +16,9 @@ import os import sys -from mindspore import Model, context + +import mindspore +from mindspore import Model from mindspore.train.callback import ModelCheckpoint, CheckpointConfig from mindspore.train.callback import TimeMonitor @@ -105,5 +107,5 @@ if __name__ == "__main__": wide_and_deep_config = WideDeepConfig() wide_and_deep_config.argparse_init() compute_emb_dim(wide_and_deep_config) - context.set_context(mode=context.GRAPH_MODE, device_target="Davinci") + mindspore.set_context(mode=0, device_target="Davinci") train_and_eval(wide_and_deep_config) diff --git a/official/recommend/Wide_and_Deep_Multitable/train_and_eval_distribute.py b/official/recommend/Wide_and_Deep_Multitable/train_and_eval_distribute.py index 5372c46d7..434b75fb1 100644 --- a/official/recommend/Wide_and_Deep_Multitable/train_and_eval_distribute.py +++ b/official/recommend/Wide_and_Deep_Multitable/train_and_eval_distribute.py @@ -16,10 +16,12 @@ import os import sys -from mindspore import Model, context + +import mindspore +from mindspore import Model from mindspore.train.callback import ModelCheckpoint, CheckpointConfig from mindspore.train.callback import TimeMonitor -from mindspore.context import ParallelMode +from mindspore import ParallelMode from mindspore.communication.management import get_rank, get_group_size, init from mindspore.common import set_seed @@ -113,8 +115,8 @@ if __name__ == "__main__": wide_and_deep_config = WideDeepConfig() wide_and_deep_config.argparse_init() compute_emb_dim(wide_and_deep_config) - context.set_context(mode=context.GRAPH_MODE, device_target="Ascend") + mindspore.set_context(mode=0, device_target="Ascend") init() - context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, + mindspore.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, device_num=get_group_size()) train_and_eval(wide_and_deep_config) diff --git a/research/cv/RepVGG/README.md b/research/cv/RepVGG/README.md index 859fec1c1..37b2b5b69 100644 --- a/research/cv/RepVGG/README.md +++ b/research/cv/RepVGG/README.md @@ -358,7 +358,7 @@ We need several parameters for these scripts. Training result will be stored in the current path, whose folder name is "LOG". Under this, you can find checkpoint files together with result like the -followings in log. +following in log. ```log epoch: 1 step: 1000, loss is 6.1155386 @@ -611,7 +611,6 @@ Typical outputs for folder with image: | Accuracy | 75.05% | | Model for inference | 15M (14.33M after re-parametrization)(.ckpt file) | | configuration | RepVGG-B0_experiment.yaml | -| Scripts | | ## [Description of Random Situation](#contents) diff --git a/research/cv/ResNeXt/README.md b/research/cv/ResNeXt/README.md index 997b4f070..729735d81 100644 --- a/research/cv/ResNeXt/README.md +++ b/research/cv/ResNeXt/README.md @@ -257,7 +257,7 @@ bash scripts/run_eval.sh 0 /opt/npu/datasets/classification/val /resnext_100.ckp #### Result -Evaluation result will be stored in the scripts path. 
Under this, you can find result like the followings in log. +Evaluation result will be stored in the scripts path. Under this, you can find result like the following in log. ```log acc=78.16%(TOP1) diff --git a/research/cv/ResidualAttentionNet/README.md b/research/cv/ResidualAttentionNet/README.md index 070246a6a..8469c9f98 100644 --- a/research/cv/ResidualAttentionNet/README.md +++ b/research/cv/ResidualAttentionNet/README.md @@ -399,7 +399,7 @@ Current batch_ Size can only be set to 1. # example: bash run_infer_310.sh cifar10-300.mindir cifar10 /data/cifar10/ ../config/cifar10_Ascend_1p_config.yaml 0 ``` -- Inference result will be stored in the example path, you can find result like the followings in acc.log. +- Inference result will be stored in the example path, you can find result like the following in acc.log. ```bash Total data:10000, top1 accuracy:0.9514, top5 accuracy:0.9978. diff --git a/research/cv/east/README.md b/research/cv/east/README.md index d73a3aac6..f9b125a59 100644 --- a/research/cv/east/README.md +++ b/research/cv/east/README.md @@ -294,7 +294,7 @@ You can start training using python or shell scripts. The usage of shell scripts ### Result -Evaluation result will be stored in the output file of evaluation script, you can find result like the followings in `log`. +Evaluation result will be stored in the output file of evaluation script, you can find result like the following in `log`. ```python Calculated {"precision": 0.8329088130412634, "recall": 0.7871930669234473, "hmean": 0.8094059405940593, "AP": 0} diff --git a/research/cv/eppmvsnet/README.md b/research/cv/eppmvsnet/README.md index 32df6e627..1d4bf6183 100644 --- a/research/cv/eppmvsnet/README.md +++ b/research/cv/eppmvsnet/README.md @@ -111,7 +111,7 @@ Parameters for EPP-MVSNet evaluation can be set in validate.py. ``` Evaluation result will be stored in "./results/blendedmvs/val/metrics.txt". You can find the result like the - followings in log. + following in log. ```python stage3_l1_loss:1.1738 diff --git a/research/cv/googlenet/README.md b/research/cv/googlenet/README.md index 2366fd16b..9dcd3c773 100644 --- a/research/cv/googlenet/README.md +++ b/research/cv/googlenet/README.md @@ -509,7 +509,7 @@ Current batch_ Size can only be set to 1. Before running the command below, you should modify the cifar10 config file. The items you should modify are batch_size and val_data_path. LABEL_FILE is only useful for imagenet,you can set any value. - Inference result will be stored in the example path, you can find result like the followings in acc.log. + Inference result will be stored in the example path, you can find result like the following in acc.log. ```shell # Ascend310 inference diff --git a/research/cv/llnet/README.md b/research/cv/llnet/README.md index 18fc609e6..8d1f2b342 100644 --- a/research/cv/llnet/README.md +++ b/research/cv/llnet/README.md @@ -156,7 +156,7 @@ bash run_eval.sh 5 ../dataset ./ckpt_5/llnet-rank5-286_408.ckpt ### Result -Evaluation result will be stored in the scripts path. Under this, you can find result like the followings in log. +Evaluation result will be stored in the scripts path. Under this, you can find result like the following in log. 
PSNR=21.593(dB) SSIM=0.617 ## Inference Process diff --git a/research/cv/nasnet/README.md b/research/cv/nasnet/README.md index 2d768697f..de8b2e7bb 100644 --- a/research/cv/nasnet/README.md +++ b/research/cv/nasnet/README.md @@ -182,7 +182,7 @@ bash run_eval_for_gpu.sh 0 /dataset ./ckpt_0/nasnet-a-mobile-rank0-248_10009.ckp ### Result -Evaluation result will be stored in the ./eval path. Under this, you can find result like the followings in `eval.log`. +Evaluation result will be stored in the ./eval path. Under this, you can find result like the following in `eval.log`. acc=74.39%(TOP1,Ascend) acc=73.5%(TOP1,GPU) diff --git a/research/cv/osnet/README.md b/research/cv/osnet/README.md index bc24c8dd0..2bbafb479 100644 --- a/research/cv/osnet/README.md +++ b/research/cv/osnet/README.md @@ -349,7 +349,7 @@ You can start evaluating using python or shell scripts. The usage of shell scrip ### Result -Evaluation result will be stored in the output file of evaluation script, you can find result like the followings in `eval.log`. +Evaluation result will be stored in the output file of evaluation script, you can find result like the following in `eval.log`. ```python ** Results ** diff --git a/research/cv/pnasnet/README.md b/research/cv/pnasnet/README.md index 951a5b352..9f406d147 100644 --- a/research/cv/pnasnet/README.md +++ b/research/cv/pnasnet/README.md @@ -203,7 +203,7 @@ You can find checkpoint file together with result in log. ### Result -Evaluation result will be stored in the scripts path. Under this, you can find result like the followings in log. +Evaluation result will be stored in the scripts path. Under this, you can find result like the following in log. - running on Ascend diff --git a/research/cv/proxylessnas/README.md b/research/cv/proxylessnas/README.md index ddca8cd22..fb55b1cc0 100644 --- a/research/cv/proxylessnas/README.md +++ b/research/cv/proxylessnas/README.md @@ -143,7 +143,7 @@ bash run_eval_for_ascend.sh 0 /dataset ./train_parallel3/ckpt_3/proxylessnas-mob ### Result -Evaluation result will be stored in the scripts path. Under this, you can find result like the followings in log. +Evaluation result will be stored in the scripts path. Under this, you can find result like the following in log. acc=75.04%(TOP1) ## Inference Process diff --git a/research/cv/repvgg/__init__.py b/research/cv/repvgg/__init__.py index e69de29bb..602527cd7 100644 --- a/research/cv/repvgg/__init__.py +++ b/research/cv/repvgg/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ diff --git a/research/cv/repvgg/eval.py b/research/cv/repvgg/eval.py index 5907eea9d..87848ba75 100644 --- a/research/cv/repvgg/eval.py +++ b/research/cv/repvgg/eval.py @@ -12,45 +12,64 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================ -"""eval""" +"""Evaluation script. Need training config.""" +import os +from functools import reduce + +import mindspore as ms from mindspore import Model from mindspore import context from mindspore import nn from mindspore.common import set_seed +from mindspore.train.callback import TimeMonitor -from src.args import args -from src.tools.cell import cast_amp +from src.tools.amp import cast_amp from src.tools.criterion import get_criterion, NetWithLoss -from src.tools.get_misc import get_dataset, set_device, get_model, pretrained, get_train_one_step +from src.tools.utils import pretrained, get_train_one_step +from src.dataset import create_dataset_imagenet from src.tools.optimizer import get_optimizer +from src.repvgg import get_model, switch_net_to_deploy -set_seed(args.seed) - - -def main(): - context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target) - context.set_context(enable_graph_kernel=False) - if args.device_target == "Ascend": - context.set_context(enable_auto_mixed_precision=True) - set_device(args) - # get model +def eval_ckpt(args): + print('=== Use checkpoint ===') net = get_model(args) - cast_amp(net) + cast_amp(net, args) criterion = get_criterion(args) net_with_loss = NetWithLoss(net, criterion) if args.pretrained: pretrained(args, net) - data = get_dataset(args, training=False) - batch_num = data.val_dataset.get_dataset_size() + print( + 'Number of parameters (before deploy):', + sum( + reduce(lambda x, y: x * y, params.shape) + for params in net.trainable_params() + ) + ) + switch_net_to_deploy(net) + print( + 'Number of parameters (after deploy):', + sum( + reduce(lambda x, y: x * y, params.shape) + for params in net.trainable_params() + ) + ) + cast_amp(net, args) + net.set_train(False) + + data = create_dataset_imagenet( + str(args.dataset_path), args, training=False + ) + batch_num = data.get_dataset_size() optimizer = get_optimizer(args, net, batch_num) - # save a yaml file to read to record parameters net_with_loss = get_train_one_step(args, net_with_loss, optimizer) - eval_network = nn.WithEvalCell(net, criterion, args.amp_level in ["O2", "O3", "auto"]) + eval_network = nn.WithEvalCell( + net, criterion, args.amp_level in ['O2', 'O3', 'auto'] + ) eval_indexes = [0, 1, 2] eval_metrics = {'Loss': nn.Loss(), 'Top1-Acc': nn.Top1CategoricalAccuracy(), @@ -58,10 +77,58 @@ def main(): model = Model(net_with_loss, metrics=eval_metrics, eval_network=eval_network, eval_indexes=eval_indexes) - print(f"=> begin eval") - results = model.eval(data.val_dataset) - print(f"=> eval results:{results}") - print(f"=> eval success") + + print('=> begin eval') + results = model.eval(data, callbacks=[TimeMonitor()]) + return results + + +def eval_mindir(args): + print('=== Use MINDIR model ===') + data = create_dataset_imagenet( + str(args.dataset_path), args, training=False + ) + iterator = data.create_dict_iterator(num_epochs=1) + + graph = ms.load(str(args.pretrained)) + net = nn.GraphCell(graph) + metrics = { + 'Top1-Acc': nn.Top1CategoricalAccuracy(), + 'Top5-Acc': nn.Top5CategoricalAccuracy(), + } + print('=> begin eval') + for batch in iterator: + y_pred = net(batch['image']) + for metric in metrics.values(): + metric.update(y_pred, batch['label']) + + return {name: metric.eval() for name, metric in metrics.items()} + + +def main(): + """Entry point.""" + from src.config import run_args + args = run_args() + + set_seed(args.seed) + context.set_context(mode=context.GRAPH_MODE, + 
device_target=args.device_target) + context.set_context(enable_graph_kernel=False) + if args.device_target == 'Ascend': + context.set_context(enable_auto_mixed_precision=True) + + os.environ["RANK_SIZE"] = '0' + + # get model + if args.pretrained.suffix == '.ckpt': + results = eval_ckpt(args) + elif args.pretrained.suffix == '.mindir': + results = eval_mindir(args) + else: + raise ValueError('Incorrect format checkpoint') + + print(f'=> eval results:{results}') + print('=> eval success') if __name__ == '__main__': diff --git a/research/cv/repvgg/export.py b/research/cv/repvgg/export.py index e7b990331..93a589bf8 100644 --- a/research/cv/repvgg/export.py +++ b/research/cv/repvgg/export.py @@ -12,37 +12,58 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================ -""" -##############export checkpoint file into air, onnx or mindir model################# -python export.py -""" - +"""Export model to MINDIR, AIR or ONNX format. Need training config.""" import numpy as np -from mindspore import Tensor, load_checkpoint, load_param_into_net, export, context +from mindspore import ( + Tensor, load_checkpoint, load_param_into_net, export, context +) from mindspore import dtype as mstype +from mindspore import nn -from src.args import args -from src.tools.cell import cast_amp -from src.tools.criterion import get_criterion, NetWithLoss -from src.tools.get_misc import get_model +from src.config import run_args +from src.tools.amp import cast_amp +from src.repvgg import get_model, switch_net_to_deploy -context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target) -if args.device_target in ["Ascend", "GPU"]: - context.set_context(device_id=args.device_id) +class NetWithSoftmax(nn.Cell): + """Network with softmax at the end.""" -if __name__ == '__main__': + def __init__(self, net): + super().__init__() + self.net = net + self.softmax = nn.Softmax() + + def construct(self, x): + return self.softmax(self.net(x)) + + +def main(): + """Entry point.""" + args = run_args() + context.set_context(mode=context.GRAPH_MODE, + device_target=args.device_target) + if args.device_target in ['Ascend', 'GPU']: + context.set_context(device_id=args.device_id) net = get_model(args) - criterion = get_criterion(args) - cast_amp(net) - net_with_loss = NetWithLoss(net, criterion) - assert args.pretrained is not None, "checkpoint_path is None." + cast_amp(net, args) + assert args.pretrained is not None, 'checkpoint_path is None.' 
-    param_dict = load_checkpoint(args.pretrained)
+    param_dict = load_checkpoint(str(args.pretrained))
     load_param_into_net(net, param_dict)
-
+    switch_net_to_deploy(net)
+    cast_amp(net, args)
+    net = NetWithSoftmax(net)
     net.set_train(False)
     net.to_float(mstype.float32)
-    input_arr = Tensor(np.zeros([1, 3, args.image_size, args.image_size], np.float32))
-    export(net, input_arr, file_name=args.arch, file_format=args.file_format)
+    input_arr = Tensor(
+        np.zeros([1, 3, args.image_size, args.image_size], np.float32)
+    )
+    export(
+        net, input_arr, file_name=str(args.export_path),
+        file_format=args.file_format
+    )
+
+
+if __name__ == '__main__':
+    main()
diff --git a/research/cv/repvgg/infer_onnx.py b/research/cv/repvgg/infer_onnx.py
index d01128fb6..e6e9ae127 100644
--- a/research/cv/repvgg/infer_onnx.py
+++ b/research/cv/repvgg/infer_onnx.py
@@ -1,3 +1,5 @@
+#!/usr/bin/env python
+# -*- coding: UTF-8 -*-
 # Copyright 2022 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -12,21 +14,95 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ============================================================================
-"""eval"""
+"""
+Run prediction on a folder or a single image, print the results and save
+them to a JSON file.
+"""
+import argparse
+import json
+from pathlib import Path
+
+from PIL import Image
 import onnxruntime as ort
-from mindspore import nn
-from src.args import args
-from src.data.imagenet import create_dataset_imagenet
+from src.dataset import get_transforms
+
+
+def parse_args():
+    """
+    Create and parse command-line arguments.
+
+    Returns
+    -------
+    argparse.Namespace
+        Parsed command-line arguments.
+    """
+    parser = argparse.ArgumentParser(
+        description=__doc__, add_help=False,
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+
+    parser.add_argument('-h', '--help', action='help',
+                        default=argparse.SUPPRESS,
+                        help='Show this help message and exit.')
+    parser.add_argument('data', type=Path,
+                        help='Path to dataset for prediction.')
+    parser.add_argument('-c', '--onnx_path', type=Path,
+                        help='Path to ONNX file.')
+    parser.add_argument(
+        '-o', '--output', type=Path, default=Path('predictions.json'),
+        help='Path to output JSON file.'
+    )
+    parser.add_argument('--image_size', type=int, default=224,
+                        help='Image size.')
+    parser.add_argument(
+        '--device_target', default='CPU', choices=['GPU', 'CPU'],
+        help='Target computation platform.'
+ ) -def create_session(onnx_path, target_device): - if target_device == 'GPU': + return parser.parse_args() + + +def data_loader(path: Path, image_size: int): + """Load image or images from folder in generator.""" + preprocess = get_transforms(image_size=image_size, + training=False) + + def apply(img): + for p in preprocess: + img = p(img) + return img + extensions = ('.png', '.jpg', '.jpeg') + if path.is_dir(): + print('=' * 5, ' Load directory ', '=' * 5) + for item in path.iterdir(): + if item.is_dir(): + continue + if item.suffix.lower() not in extensions: + continue + image = Image.open(str(item)) + image = image.convert('RGB') + image = apply(image) + yield str(item), image[None] + else: + print('=' * 5, ' Load single image ', '=' * 5) + assert path.suffix.lower() in extensions + + image = Image.open(str(path)) + image = image.convert('RGB') + image = apply(image) + yield str(path), image[None] + + +def create_session(onnx_path, device_target): + """Create ONNX inference session.""" + if device_target == 'GPU': providers = ['CUDAExecutionProvider'] - elif target_device == 'CPU': + elif device_target == 'CPU': providers = ['CPUExecutionProvider'] else: raise ValueError( - f'Unsupported target device {target_device}, ' + f'Unsupported target device {device_target}, ' f'Expected one of: "CPU", "GPU"' ) session = ort.InferenceSession(onnx_path, providers=providers) @@ -34,23 +110,25 @@ def create_session(onnx_path, target_device): return session, input_name -def run_eval(onnx_path, data_dir, target_device): - session, input_name = create_session(onnx_path, target_device) - args.batch_size = 1 - dataset = create_dataset_imagenet(data_dir, args, training=False) - metrics = { - 'top-1 accuracy': nn.Top1CategoricalAccuracy(), - 'top-5 accuracy': nn.Top5CategoricalAccuracy(), - } - for batch in dataset.create_dict_iterator(num_epochs=1, output_numpy=True): - y_pred = session.run(None, {input_name: batch['image']})[0] - for metric in metrics.values(): - metric.update(y_pred, batch['label']) - return {name: metric.eval() for name, metric in metrics.items()} +def main(): + """Entry point.""" + args = parse_args() + loader = data_loader(args.data, args.image_size) + session, input_name = create_session( + str(args.onnx_path), args.device_target + ) -if __name__ == '__main__': + d = {} - results = run_eval(args.onnx_path, args.dataset_path, args.device_target) - for name, value in results.items(): - print(f'{name}: {value:.5f}') + for (name, img) in loader: + res = session.run(None, {input_name: img})[0].argmax() + print(name, f'(class: {res})') + d[name] = int(res) + + with args.output.open(mode='w') as f: + json.dump(d, f, indent=1) + + +if __name__ == '__main__': + main() diff --git a/research/cv/repvgg/requriments.txt b/research/cv/repvgg/requriments.txt index e69de29bb..b209b4fd4 100644 --- a/research/cv/repvgg/requriments.txt +++ b/research/cv/repvgg/requriments.txt @@ -0,0 +1,6 @@ +numpy==1.21.6 +onnxruntime-gpu==1.13.1 +PyYAML==6.0 +matplotlib==3.5.3 +Pillow==9.2.0 +tqdm==4.64.1 diff --git a/research/cv/repvgg/scripts/run_infer_onnx.sh b/research/cv/repvgg/scripts/run_infer_onnx.sh index f1695ae29..d8ac18ee3 100644 --- a/research/cv/repvgg/scripts/run_infer_onnx.sh +++ b/research/cv/repvgg/scripts/run_infer_onnx.sh @@ -13,9 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
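Stripped of argument parsing and file walking, the rewritten infer_onnx.py reduces to a small ONNX Runtime loop. A minimal sketch of the same flow, assuming a hypothetical export named `repvgg.onnx` and an already-preprocessed `1x3x224x224` float32 batch in place of the script's PIL pipeline:

```python
import numpy as np
import onnxruntime as ort

# Sketch only: 'repvgg.onnx' and the zero batch are illustrative
# stand-ins for a real export and a real preprocessed image.
session = ort.InferenceSession('repvgg.onnx',
                               providers=['CPUExecutionProvider'])
input_name = session.get_inputs()[0].name
batch = np.zeros((1, 3, 224, 224), np.float32)
logits = session.run(None, {input_name: batch})[0]
print('predicted class:', int(logits.argmax()))
```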
# ============================================================================ - -if [[ $# -lt 2 || $# -gt 4 ]]; then - echo "Usage: bash run_infer_onnx.sh [ONNX_PATH] [DATASET_PATH] [DEVICE_TARGET(optional)] [DEVICE_ID(optional)]" +if [[ $# -lt 3 || $# -gt 4 ]]; then + echo "Usage: bash scripts/run_infer_onnx.sh [ONNX_PATH] [DATA_PATH] [OUTPUT_PATH] [DEVICE_TARGET(optional)]" exit 1 fi @@ -23,32 +22,29 @@ get_real_path(){ if [ "${1:0:1}" == "/" ]; then echo "$1" else - echo "$(realpath -m $PWD/$1)" + realpath -m "$PWD"/"$1" fi } -onnx_path=$(get_real_path $1) -dataset_path=$(get_real_path $2) -if [ $# -eq 3 ]; then - device_target=$3 + +ONNX_PATH=$(get_real_path "$1") +DATA_PATH=$(get_real_path "$2") +OUTPUT_PATH=$(get_real_path "$3") + +if [ ! -f "$ONNX_PATH" ] +then + echo "error: CHECKPOINT_PATH=$ONNX_PATH is not a file" +exit 1 fi + if [ $# -eq 4 ]; then - device_id=$4 + DEVICE_TARGET="$4" +else + DEVICE_TARGET=CPU fi -echo "onnx_path: "$onnx_path -echo "dataset_path: "$dataset_path -echo "device_target: "$device_target -echo "device_id: "$device_id +echo "onnx_path: $ONNX_PATH" +echo "dataset_path: $DATA_PATH" +echo "output_path: $OUTPUT_PATH" +echo "device_target: $DEVICE_TARGET" -function infer() -{ - python ./infer_onnx.py --onnx_path=$onnx_path \ - --dataset_path=$dataset_path \ - --device_target=$device_target \ - --device_id=$device_id &> infer_onnx.log -} -infer -if [ $? -ne 0 ]; then - echo " execute inference failed" - exit 1 -fi \ No newline at end of file +python ./infer_onnx.py "$DATA_PATH" --onnx_path "$ONNX_PATH" --device_target "$DEVICE_TARGET" --output "$OUTPUT_PATH" &> infer_onnx.log & \ No newline at end of file diff --git a/research/cv/repvgg/src/tools/__init__.py b/research/cv/repvgg/src/tools/__init__.py index e69de29bb..69dbde146 100644 --- a/research/cv/repvgg/src/tools/__init__.py +++ b/research/cv/repvgg/src/tools/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""init training tools""" diff --git a/research/cv/repvgg/src/tools/callback.py b/research/cv/repvgg/src/tools/callback.py index 2b6cb4bed..eeb059180 100644 --- a/research/cv/repvgg/src/tools/callback.py +++ b/research/cv/repvgg/src/tools/callback.py @@ -12,39 +12,325 @@ # See the License for the specific language governing permissions and # limitations under the License. 
 # ============================================================================
-"""callback function"""
+"""Custom training and evaluation callbacks."""
+import time
 
-from mindspore.train.callback import Callback
+from pathlib import Path
+from operator import lt, gt
 
-from src.args import args
+import mindspore as ms
+from mindspore._checkparam import Validator
+from mindspore import Callback, SummaryCollector, SummaryRecord, RunContext
 
 
-class EvaluateCallBack(Callback):
-    """EvaluateCallBack"""
+class BestCheckpointSavingCallback(Callback):
+    """Callback to save best model checkpoints during training."""
 
-    def __init__(self, model, eval_dataset, src_url, train_url, total_epochs, save_freq=50):
-        super(EvaluateCallBack, self).__init__()
-        self.model = model
-        self.eval_dataset = eval_dataset
-        self.src_url = src_url
-        self.train_url = train_url
-        self.total_epochs = total_epochs
-        self.save_freq = save_freq
-        self.best_acc = 0.
+    def __init__(
+            self,
+            ckpt_dir,
+            target_metric='acc',
+            best_is_max=True,
+            prefix='',
+            buffer=5
+    ):
+        """
+        Initialize ckpt saving callback.
+
+        Parameters
+        ----------
+        ckpt_dir: str
+            Directory to save checkpoints to.
+        target_metric: str
+            Name of the metric listed in `metrics` parameter of Model.
+        best_is_max: bool
+            Flag choosing whether a higher or a lower metric value is better.
+            For example:
+            - if `target_metric=loss` then `best_is_max` should be False
+            - if `target_metric=acc` then `best_is_max` should be True
+        prefix: str
+            Prefix of saved checkpoint file.
+        buffer: int
+            Max number of saved checkpoints.
+        """
+        self.ckpt_dir = Path(ckpt_dir)
+        self._make_dir()
+        self.target_metric = target_metric
+        self.best_is_max = best_is_max
+        self.prefix = prefix
+        if best_is_max:
+            self.best_metric = float('-inf')
+            self.compare = lt
+        else:
+            self.best_metric = float('inf')
+            self.compare = gt
+
+        self.current_ckpt = []
+        self.buffer_size = buffer
+
+    def _make_dir(self):
+        """Create a checkpoint directory."""
+        if not self.ckpt_dir.exists():
+            self.ckpt_dir.mkdir(parents=True)
+            print(f'Directory created: {self.ckpt_dir}')
+        else:
+            print(f'Warning! Directory already exists: {self.ckpt_dir}')
+
+    def _save_checkpoint(self, network, epoch):
+        """
+        Save checkpoint.
+
+        Parameters
+        ----------
+        network
+            Network to save checkpoint for.
+        """
+        # TODO: May not work with model arts or distributed training.
+        if not float('-inf') < self.best_metric < float('inf'):
+            return
+        ckpt_name = f'epoch={epoch}_' \
+                    f'{self.target_metric}={self.best_metric:.3f}.ckpt'
+        if self.prefix:
+            ckpt_name = f'{self.prefix}_{ckpt_name}'
+        ms.save_checkpoint(network, str(self.ckpt_dir / ckpt_name))
+        self.current_ckpt.append(self.ckpt_dir / ckpt_name)
+        if len(self.current_ckpt) > self.buffer_size:
+            removed = self.current_ckpt[0]
+            removed.unlink()
+            del self.current_ckpt[0]
+
+    def on_eval_end(self, run_context: RunContext):
+        """
+        Check metrics and save a checkpoint if needed after evaluation
+        completes.
+
+        Parameters
+        ----------
+        run_context: RunContext
+
+        """
+        cb_params = run_context.original_args()
+        metrics = {k: v for k, v in cb_params.eval_results.items() if v != 0}
+        if self.target_metric not in metrics:
+            raise KeyError(
+                f'Target metric {self.target_metric} is not in '
+                'cb_params.metrics.'
+ ) + # If the new metric is better the previous "best" + if self.compare(self.best_metric, metrics[self.target_metric]): + self.best_metric = metrics[self.target_metric] + self._save_checkpoint( + cb_params.network, epoch=cb_params.cur_epoch_num + ) - def epoch_end(self, run_context): + +class SummaryCallbackWithEval(SummaryCollector): + """ + Callback that can collect a common information like SummaryCollector. + + Additionally, this callback collects: + - learning rate + - validation loss + - validation accuracy + """ + + def __init__( + self, + summary_dir, + collect_freq=10, + collect_specified_data=None, + keep_default_action=True, + custom_lineage_data=None, + collect_tensor_freq=None, + max_file_size=None, + export_options=None + ): + super().__init__( + summary_dir, + collect_freq, + collect_specified_data, + keep_default_action, + custom_lineage_data, + collect_tensor_freq, + max_file_size, + export_options + ) + self.entered_count = 0 + + def on_train_epoch_end(self, run_context: RunContext): """ - Test when epoch end, save best model with best.ckpt. + Collect learning rate after train epoch. + + Parameters + ---------- + run_context: RunContext """ cb_params = run_context.original_args() - if cb_params.cur_epoch_num > self.total_epochs * 0.8: - cur_epoch_num = cb_params.cur_epoch_num - result = self.model.eval(self.eval_dataset) - if result["acc"] > self.best_acc: - self.best_acc = result["acc"] - print("epoch: %s acc: %s, best acc is %s" % - (cb_params.cur_epoch_num, result["acc"], self.best_acc), flush=True) - if args.run_modelarts: - import moxing as mox - if cur_epoch_num % self.save_freq == 0: - mox.file.copy_parallel(src_url=self.src_url, dst_url=self.train_url) + optimizer = cb_params.get('optimizer') + if optimizer is None: + optimizer = getattr(cb_params.network, 'optimizer') + if optimizer is None: + print('Warning: There is no optimizer found!') + else: + global_step = optimizer.global_step + lr = optimizer.learning_rate(global_step) + self._record.add_value('scalar', 'Train/learning_rate', + ms.Tensor(lr)) + self._record.record(cb_params.cur_epoch_num) + super().on_train_epoch_end(run_context) + + def on_eval_end(self, run_context: RunContext): + """ + Collect metrics after evaluation complete. + + Parameters + ---------- + run_context: RunContext + """ + cb_params = run_context.original_args() + metrics = {k: v for k, v in cb_params.eval_results.items() if v != 0} + print( + 'Result metrics', f'epoch {cb_params.cur_epoch_num}: ', + {key: metrics[key] for key in sorted(metrics)} + ) + + for metric_name, value in metrics.items(): + self._record.add_value( + 'scalar', f'Metrics/{metric_name}', ms.Tensor(value) + ) + self._record.record(cb_params.cur_epoch_num) + self._record.flush() + + def __enter__(self): + """ + Enter in context manager and control that SummaryRecord created once. + """ + if self.entered_count == 0: + self._record = SummaryRecord(log_dir=self._summary_dir, + max_file_size=self._max_file_size, + raise_exception=False, + export_options=self._export_options) + self._first_step, self._dataset_sink_mode = True, True + self.entered_count += 1 + return self + + def __exit__(self, *err): + """ + Exit from context manager and control SummaryRecord correct closing. + """ + self.entered_count -= 1 + if self.entered_count == 0: + super().__exit__(err) + + +class TrainTimeMonitor(Callback): + """Monitor the time in train process. + + Parameters + ---------- + data_size: int + How many steps are the intervals between print + information each time. 
+ if the program get `batch_num` during training, `data_size` + will be set to `batch_num`, otherwise `data_size` will be used. + Default: None + + Raises + ------ + ValueError: If data_size is not positive int. + """ + + def __init__(self, data_size=None): + super().__init__() + self.data_size = data_size + self.epoch_time = time.time() + + def on_train_epoch_begin(self, run_context): + """Record time at the beginning of epoch. + + Parameters + ---------- + run_context: RunContext + Context of the process running. For more details, please refer to + :class:`mindspore.RunContext` + """ + self.epoch_time = time.time() + + def on_train_epoch_end(self, run_context): + """Print process cost time at the end of epoch. + + Parameters + ---------- + run_context: RunContext + Context of the process running. For more details, please refer to + :class:`mindspore.RunContext` + """ + epoch_seconds = (time.time() - self.epoch_time) * 1000 + step_size = self.data_size + cb_params = run_context.original_args() + mode = cb_params.get('mode', '') + if hasattr(cb_params, 'batch_num'): + batch_num = cb_params.batch_num + if isinstance(batch_num, int) and batch_num > 0: + step_size = cb_params.batch_num + Validator.check_positive_int(step_size) + + step_seconds = epoch_seconds / step_size + print('{} epoch time: {:5.3f} ms, per step time: {:5.3f} ms'.format + (mode.title(), epoch_seconds, step_seconds), flush=True) + + +class EvalTimeMonitor(Callback): + """Monitor the time in eval process. + + Parameters + ---------- + data_size: int + How many steps are the intervals between print information each + time. If the program get `batch_num` during + training, `data_size` will be set to `batch_num`, otherwise + `data_size` will be used. Default: None + + + Raises + ------ + ValueError: If data_size is not positive int. + """ + + def __init__(self, data_size=None): + super().__init__() + self.data_size = data_size + self.epoch_time = time.time() + + def on_eval_epoch_begin(self, run_context): + """Record time at the beginning of epoch. + + Parameters + ---------- + run_context: + Context of the process running. For more details, please refer to + :class:`mindspore.RunContext` + """ + self.epoch_time = time.time() + + def on_eval_epoch_end(self, run_context): + """Print process cost time at the end of epoch. + + Parameters + ---------- + run_context: + Context of the process running. For more details, please refer to + :class:`mindspore.RunContext` + """ + epoch_seconds = (time.time() - self.epoch_time) * 1000 + step_size = self.data_size + cb_params = run_context.original_args() + mode = cb_params.get('mode', '') + if hasattr(cb_params, 'batch_num'): + batch_num = cb_params.batch_num + if isinstance(batch_num, int) and batch_num > 0: + step_size = cb_params.batch_num + Validator.check_positive_int(step_size) + + step_seconds = epoch_seconds / step_size + print('{} epoch time: {:5.3f} ms, per step time: {:5.3f} ms'.format + (mode.title(), epoch_seconds, step_seconds), flush=True) diff --git a/research/cv/repvgg/src/tools/criterion.py b/research/cv/repvgg/src/tools/criterion.py index 0c3254ec0..4516bd0ba 100644 --- a/research/cv/repvgg/src/tools/criterion.py +++ b/research/cv/repvgg/src/tools/criterion.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
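Since these callbacks replace the old `EvaluateCallBack`, a usage sketch may help. The wiring below is illustrative only: it assumes a `model`, `train_dataset` and `val_dataset` constructed as in the train.py rewrite further down, and the directory names are examples, not fixed paths.

```python
from src.tools.callback import (
    BestCheckpointSavingCallback, SummaryCallbackWithEval,
    TrainTimeMonitor, EvalTimeMonitor
)

# Sketch only: `model`, `train_dataset` and `val_dataset` are assumed
# to exist; './ckpt/best' and './logs' are placeholder directories.
callbacks = [
    TrainTimeMonitor(data_size=train_dataset.get_dataset_size()),
    EvalTimeMonitor(data_size=val_dataset.get_dataset_size()),
    BestCheckpointSavingCallback('./ckpt/best', target_metric='acc',
                                 best_is_max=True, prefix='repvgg'),
    SummaryCallbackWithEval(summary_dir='./logs', collect_freq=1),
]
model.fit(100, train_dataset, valid_dataset=val_dataset,
          callbacks=callbacks)
```

`model.fit` with a `valid_dataset` runs evaluation each epoch, so `on_eval_end` fires with `cb_params.eval_results` populated, which is what both `BestCheckpointSavingCallback` and `SummaryCallbackWithEval` rely on.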
 # ============================================================================
-"""functions of criterion"""
+"""Criterion functions."""
 import mindspore.nn as nn
 from mindspore import Tensor
 from mindspore import ops
@@ -41,18 +41,25 @@ class SoftTargetCrossEntropy(LossBase):
 
 class CrossEntropySmooth(LossBase):
     """CrossEntropy"""
 
-    def __init__(self, sparse=True, reduction='mean', smooth_factor=0., num_classes=1000):
+    def __init__(
+            self, sparse=True, reduction='mean', smooth_factor=0.,
+            num_classes=1000
+    ):
         super(CrossEntropySmooth, self).__init__()
         self.onehot = P.OneHot()
         self.sparse = sparse
         self.on_value = Tensor(1.0 - smooth_factor, mstype.float32)
-        self.off_value = Tensor(1.0 * smooth_factor / (num_classes - 1), mstype.float32)
+        self.off_value = Tensor(
+            1.0 * smooth_factor / (num_classes - 1), mstype.float32
+        )
         self.ce = nn.SoftmaxCrossEntropyWithLogits(reduction=reduction)
         self.cast = ops.Cast()
 
     def construct(self, logit, label):
         if self.sparse:
-            label = self.onehot(label, F.shape(logit)[1], self.on_value, self.off_value)
+            label = self.onehot(
+                label, F.shape(logit)[1], self.on_value, self.off_value
+            )
         label = P.Cast()(label, mstype.float32)
         logit = P.Cast()(logit, mstype.float32)
         loss2 = self.ce(logit, label)
@@ -74,15 +81,15 @@ def get_criterion(args):
                                       num_classes=args.num_classes)
     else:
         print(25 * "=" + "Using Simple CE" + 25 * "=")
-        criterion = CrossEntropySmooth(sparse=True, reduction="mean", num_classes=args.num_classes)
+        criterion = CrossEntropySmooth(
+            sparse=True, reduction="mean", num_classes=args.num_classes
+        )
 
     return criterion
 
 
 class NetWithLoss(nn.Cell):
-    """
-    NetWithLoss: Only support Network with Classfication
-    """
+    """NetWithLoss: only supports classification networks."""
 
     def __init__(self, model, criterion):
         super(NetWithLoss, self).__init__()
diff --git a/research/cv/repvgg/src/tools/optimizer.py b/research/cv/repvgg/src/tools/optimizer.py
index 05f407f57..f2bb2553f 100644
--- a/research/cv/repvgg/src/tools/optimizer.py
+++ b/research/cv/repvgg/src/tools/optimizer.py
@@ -12,9 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ============================================================================
-"""Functions of optimizer"""
-import os
-
+"""Optimizer creation."""
 import numpy as np
 from mindspore.nn.optim import AdamWeightDecay
 from mindspore.nn.optim.momentum import Momentum
@@ -29,30 +27,26 @@ def get_learning_rate(args, batch_num):
 
 def get_optimizer(args, model, batch_num):
     """Get optimizer for training"""
-    print(f"=> When using train_wrapper, using optimizer {args.optimizer}")
+    print(f'=> When using train_wrapper, using optimizer {args.optimizer}')
     optim_type = args.optimizer.lower()
     params = get_param_groups(model)
     learning_rate = get_learning_rate(args, batch_num)
-    step = int(args.start_epoch * batch_num)
-    accumulation_step = int(args.accumulation_step)
-    learning_rate = learning_rate[step::accumulation_step]
+    step = int(args.start_epoch * batch_num) + args.start_step
     train_step = len(learning_rate)
-    print(f"=> Get LR from epoch: {args.start_epoch}\n"
-          f"=> Start step: {step}\n"
-          f"=> Total step: {train_step}\n"
-          f"=> Accumulation step:{accumulation_step}")
-    if accumulation_step > 1:
-        learning_rate = learning_rate * accumulation_step
-    learning_rate = learning_rate * args.batch_size * int(os.getenv("DEVICE_NUM", "1")) / 256.
- print("learning_rate", np.max(learning_rate)) - if optim_type == "momentum": + learning_rate = learning_rate[step:] + print(f'=> Get LR from epoch: {args.start_epoch}\n' + f'=> Start step: {step}\n' + f'=> Total step: {train_step}\n') + + print('learning_rate', np.max(learning_rate)) + if optim_type == 'momentum': optim = Momentum( params=params, learning_rate=learning_rate, momentum=args.momentum, weight_decay=args.weight_decay ) - elif optim_type == "adamw": + elif optim_type == 'adamw': optim = AdamWeightDecay( params=params, learning_rate=learning_rate, @@ -62,23 +56,27 @@ def get_optimizer(args, model, batch_num): weight_decay=args.weight_decay ) else: - raise ValueError(f"optimizer {optim_type} is not supported") + raise ValueError(f'optimizer {optim_type} is not supported') return optim def get_param_groups(network): - """ get param groups """ + """get param groups""" decay_params = [] no_decay_params = [] for x in network.trainable_params(): parameter_name = x.name - if parameter_name.endswith(".weight"): + if parameter_name.endswith('.weight'): # Dense or Conv's weight using weight decay decay_params.append(x) else: # all bias not using weight decay - # bn weight bias not using weight decay, be carefully for now x not include LN + # bn weight bias not using weight decay, be carefully for now x + # not include LN no_decay_params.append(x) - return [{'params': no_decay_params, 'weight_decay': 0.0}, {'params': decay_params}] + return [ + {'params': no_decay_params, 'weight_decay': 0.0}, + {'params': decay_params} + ] diff --git a/research/cv/repvgg/src/tools/schedulers.py b/research/cv/repvgg/src/tools/schedulers.py index 7b6730767..4ae63b4d6 100644 --- a/research/cv/repvgg/src/tools/schedulers.py +++ b/research/cv/repvgg/src/tools/schedulers.py @@ -12,10 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================ -"""LearningRate scheduler functions""" +"""LearningRate scheduler functionality.""" import numpy as np -__all__ = ["multistep_lr", "cosine_lr", "constant_lr", "get_policy", "exp_lr"] +__all__ = ['multistep_lr', 'cosine_lr', 'constant_lr', 'get_policy', 'exp_lr'] def get_policy(name): @@ -24,10 +24,10 @@ def get_policy(name): return constant_lr out_dict = { - "constant_lr": constant_lr, - "cosine_lr": cosine_lr, - "multistep_lr": multistep_lr, - "exp_lr": exp_lr, + 'constant_lr': constant_lr, + 'cosine_lr': cosine_lr, + 'multistep_lr': multistep_lr, + 'exp_lr': exp_lr, } return out_dict[name] @@ -39,7 +39,9 @@ def constant_lr(args, batch_num): def _lr_adjuster(epoch): if epoch < args.warmup_length: - lr = _warmup_lr(args.warmup_lr, args.base_lr, args.warmup_length, epoch) + lr = _warmup_lr( + args.warmup_lr, args.base_lr, args.warmup_length, epoch + ) else: lr = args.base_lr @@ -53,12 +55,14 @@ def constant_lr(args, batch_num): def exp_lr(args, batch_num): - """Get exp lr """ + """Get exp lr""" learning_rate = [] def _lr_adjuster(epoch): if epoch < args.warmup_length: - lr = _warmup_lr(args.warmup_lr, args.base_lr, args.warmup_length, epoch) + lr = _warmup_lr( + args.warmup_lr, args.base_lr, args.warmup_length, epoch + ) else: lr = args.base_lr * args.lr_gamma ** epoch @@ -77,7 +81,9 @@ def cosine_lr(args, batch_num): def _lr_adjuster(epoch): if epoch < args.warmup_length: - lr = _warmup_lr(args.warmup_lr, args.base_lr, args.warmup_length, epoch) + lr = _warmup_lr( + args.warmup_lr, args.base_lr, args.warmup_length, epoch + ) else: e = epoch - args.warmup_length es = args.epochs - args.warmup_length @@ -93,7 +99,8 @@ def cosine_lr(args, batch_num): def multistep_lr(args, batch_num): - """Sets the learning rate to the initial LR decayed by 10 every 30 epochs""" + """Sets the learning rate to the initial LR decayed by 10 every 30 epochs + """ learning_rate = [] def _lr_adjuster(epoch): diff --git a/research/cv/repvgg/train.py b/research/cv/repvgg/train.py index 87cba7eca..c6caf375d 100644 --- a/research/cv/repvgg/train.py +++ b/research/cv/repvgg/train.py @@ -12,75 +12,202 @@ # See the License for the specific language governing permissions and # limitations under the License. 
 # ============================================================================
-"""train"""
-import os
+"""Train RepVGG on ImageNet."""
+import shutil
+import subprocess
+import sys
+import traceback
+
+from datetime import datetime
+from functools import reduce
+from pathlib import Path
 
 from mindspore import Model
 from mindspore import context
 from mindspore import nn
+from mindspore import dataset as ds
 from mindspore.common import set_seed
-from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor
+from mindspore.train.callback import (
+    CheckpointConfig, ModelCheckpoint, LossMonitor
+)
 
-from src.args import args
-from src.tools.callback import EvaluateCallBack
-from src.tools.cell import cast_amp
-from src.tools.criterion import get_criterion, NetWithLoss
-from src.tools.get_misc import get_dataset, set_device, get_model, pretrained, get_train_one_step
+from src.tools.callback import (
+    BestCheckpointSavingCallback, SummaryCallbackWithEval,
+    TrainTimeMonitor,
+    EvalTimeMonitor
+)
 from src.tools.optimizer import get_optimizer
+from src.tools.amp import cast_amp
+from src.tools.criterion import get_criterion, NetWithLoss
+from src.tools.utils import set_device, pretrained, get_train_one_step
+from src.dataset import get_dataset
+from src.repvgg import get_model
+
+
+def get_callbacks(
+        arch, rank, train_data_size, val_data_size, ckpt_dir, best_ckpt_dir,
+        summary_dir, ckpt_save_every_step=0, ckpt_save_every_sec=0,
+        ckpt_keep_num=10, best_ckpt_num=5, print_loss_every=1, collect_freq=1,
+        collect_tensor_freq=10, collect_input_data=False,
+        keep_default_action=False
+):
+    """Get common callbacks."""
+    if ckpt_save_every_step == 0 and ckpt_save_every_sec == 0:
+        ckpt_save_every_step = train_data_size
+    config_ck = CheckpointConfig(
+        save_checkpoint_steps=ckpt_save_every_step,
+        save_checkpoint_seconds=ckpt_save_every_sec,
+        keep_checkpoint_max=ckpt_keep_num,
+        append_info=['epoch_num', 'step_num']
+    )
+    train_time_cb = TrainTimeMonitor(data_size=train_data_size)
+    eval_time_cb = EvalTimeMonitor(data_size=val_data_size)
+
+    best_ckpt_save_cb = BestCheckpointSavingCallback(
+        best_ckpt_dir, prefix=arch, buffer=best_ckpt_num
+    )
+
+    ckpoint_cb = ModelCheckpoint(
+        prefix=f'{arch}_{rank}',
+        directory=ckpt_dir,
+        config=config_ck
+    )
+    loss_cb = LossMonitor(print_loss_every)
+
+    specified = {
+        'collect_metric': True,
+        'collect_train_lineage': True,
+        'collect_eval_lineage': True,
+        'collect_input_data': collect_input_data,
+    }
+    summary_collector_cb = SummaryCallbackWithEval(
+        summary_dir=summary_dir,
+        collect_specified_data=specified,
+        collect_freq=collect_freq,
+        keep_default_action=keep_default_action,
+        collect_tensor_freq=collect_tensor_freq
+    )
+    return [
+        train_time_cb,
+        eval_time_cb,
+        ckpoint_cb,
+        loss_cb,
+        best_ckpt_save_cb,
+        summary_collector_cb
+    ]
+
+
+def dump_env_and_params(ckpt_save_dir, args):
+    """Dump information about the environment and hyperparameters."""
+    shutil.copy(str(args.config), str(ckpt_save_dir))
+    with open(str(ckpt_save_dir / 'cmd.txt'), 'w', encoding='utf-8'
+              ) as file:
+        file.write(' '.join(sys.argv))
+    with open(str(ckpt_save_dir / 'args.txt'), 'w', encoding='utf-8'
+              ) as file:
+        file.write(str(args))
+    try:
+        with open(str(ckpt_save_dir / 'git.txt'), 'w', encoding='utf-8'
+                  ) as file:
+            commit_info = subprocess.check_output(
+                ['git', 'show', '-s'],
+                cwd=Path(__file__).absolute().parents[0],
+            )
+            decoded_commit_info = commit_info.decode('utf-8')
+            decoded_commit_info = decoded_commit_info.replace('\n',
', ') + file.write(decoded_commit_info) + except subprocess.CalledProcessError as git_exception: + print(f'Git dumping error: {git_exception}') + print(traceback.format_exc()) def main(): + """Entry point.""" + from src.config import run_args + args = run_args() + + print(args) set_seed(args.seed) - context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target) - context.set_context(enable_graph_kernel=False) - if args.device_target == "Ascend": + context.set_context( + mode=context.GRAPH_MODE, device_target=args.device_target, + enable_graph_kernel=False + ) + if args.device_target == 'Ascend': context.set_context(enable_auto_mixed_precision=True) - rank = set_device(args) + if args.device_target != 'CPU': + rank = set_device(args) + else: + rank = 0 + + ds.config.set_prefetch_size(args.prefetch) # get model and cast amp_level net = get_model(args) - cast_amp(net) + cast_amp(net, args) criterion = get_criterion(args) net_with_loss = NetWithLoss(net, criterion) - if args.pretrained: - pretrained(args, net) data = get_dataset(args) batch_num = data.train_dataset.get_dataset_size() optimizer = get_optimizer(args, net, batch_num) net_with_loss = get_train_one_step(args, net_with_loss, optimizer) + if args.pretrained: + pretrained(args, net_with_loss, args.exclude_epoch_state) - eval_network = nn.WithEvalCell(net, criterion, args.amp_level in ["O2", "O3", "auto"]) + eval_network = nn.WithEvalCell( + net, criterion, args.amp_level in ['O2', 'O3', 'auto'] + ) eval_indexes = [0, 1, 2] - model = Model(net_with_loss, metrics={"acc", "loss"}, - eval_network=eval_network, - eval_indexes=eval_indexes) - - config_ck = CheckpointConfig(save_checkpoint_steps=data.train_dataset.get_dataset_size(), - keep_checkpoint_max=args.save_every) - time_cb = TimeMonitor(data_size=data.train_dataset.get_dataset_size()) - - ckpt_save_dir = "./ckpt_" + str(rank) - if args.run_modelarts: - ckpt_save_dir = "/cache/ckpt_" + str(rank) - - ckpoint_cb = ModelCheckpoint(prefix=args.arch + str(rank), directory=ckpt_save_dir, - config=config_ck) - loss_cb = LossMonitor() - eval_cb = EvaluateCallBack(model, eval_dataset=data.val_dataset, src_url=ckpt_save_dir, - train_url=os.path.join(args.train_url, "ckpt_" + str(rank)), - total_epochs=args.epochs - args.start_epoch, save_freq=args.save_every) - - print("begin train") - model.train(int(args.epochs - args.start_epoch), data.train_dataset, - callbacks=[time_cb, ckpoint_cb, loss_cb, eval_cb], - dataset_sink_mode=True) - print("train success") - - if args.run_modelarts: - import moxing as mox - mox.file.copy_parallel(src_url=ckpt_save_dir, dst_url=os.path.join(args.train_url, "ckpt_" + str(rank))) + model = Model( + net_with_loss, metrics={'acc', 'loss'}, eval_network=eval_network, + eval_indexes=eval_indexes + ) + + # target folder path + experiment_name = '_'.join([ + datetime.now().strftime('%y%m%d_%H%M%S'), args.arch, str(rank), + ]) + if args.brief is not None: + experiment_name = f'{experiment_name}_{args.brief}' + if args.continues is None: + ckpt_save_dir = args.train_url / experiment_name + else: + ckpt_save_dir = args.continues + + callbacks = [ + TrainTimeMonitor(data_size=data.train_dataset.get_dataset_size()), + LossMonitor(args.save_every) + ] + if rank == 0: + callbacks = get_callbacks( + arch=args.arch, rank=rank, ckpt_dir=str(ckpt_save_dir), + train_data_size=data.train_dataset.get_dataset_size(), + val_data_size=data.val_dataset.get_dataset_size(), + best_ckpt_dir=str(ckpt_save_dir / 'best_acc'), + summary_dir=str(ckpt_save_dir / 'logs'), 
ckpt_save_every_sec=0, + ckpt_save_every_step=args.save_every, + print_loss_every=args.save_every, + ckpt_keep_num=args.keep_checkpoint_max, + best_ckpt_num=args.keep_best_checkpoints_max + ) + + dump_env_and_params(ckpt_save_dir, args) + + print( + 'Number of parameters:', + sum( + reduce(lambda x, y: x * y, params.shape) + for params in net.trainable_params() + ) + ) + print('begin train') + model.fit( + args.epochs, data.train_dataset, valid_dataset=data.val_dataset, + dataset_sink_mode=bool(args.use_data_sink), callbacks=callbacks, + initial_epoch=args.start_epoch + ) + print('train success') if __name__ == '__main__': diff --git a/research/cv/resnetv2_50_frn/README.md b/research/cv/resnetv2_50_frn/README.md index 416a32d79..5c55a92b4 100644 --- a/research/cv/resnetv2_50_frn/README.md +++ b/research/cv/resnetv2_50_frn/README.md @@ -143,7 +143,7 @@ bash run_eval_for_ascend.sh 0 /dataset ./ckpt_0/resnetv2-50-frn-rank0-240_5005.c ### Result -Evaluation result will be stored in the scripts path. Under this, you can find result like the followings in log. +Evaluation result will be stored in the scripts path. Under this, you can find result like the following in log. acc=77.4%(TOP1,size:224\*224) acc=78.3%(TOP1,size:299\*299) diff --git a/research/cv/resnext152_64x4d/README.md b/research/cv/resnext152_64x4d/README.md index 0dd9bcd3d..a81c9ecfd 100644 --- a/research/cv/resnext152_64x4d/README.md +++ b/research/cv/resnext152_64x4d/README.md @@ -223,7 +223,7 @@ PLATFORM is Ascend or GPU, default is Ascend. #### Result -Evaluation result will be stored in the scripts path. Under this, you can find result like the followings in log. +Evaluation result will be stored in the scripts path. Under this, you can find result like the following in log. ```log acc=80.08%(TOP1) diff --git a/research/cv/se_resnext50/README.md b/research/cv/se_resnext50/README.md index 8e94159b4..7177cdc05 100644 --- a/research/cv/se_resnext50/README.md +++ b/research/cv/se_resnext50/README.md @@ -49,7 +49,7 @@ Dataset used: [imagenet2012](http://www.image-net.org/) ## [Mixed Precision](#contents) -The [mixed precision](https://www.mindspore.cn/tutorials/en/master/advanced/mixed_precision.html) training method accelerates the deep learning neural network training process by using both the single-precision and half-precision data formats, and maintains the network precision achieved by the single-precision training at the same time. Mixed precision training can accelerate the computation process, reduce memory usage, and enable a larger model or batch size to be trained on specific hardware. +The [mixed precision](https://www.mindspore.cn/docs/programming_guide/en/r1.5/enable_mixed_precision.html) training method accelerates the deep learning neural network training process by using both the single-precision and half-precision data formats, and maintains the network precision achieved by the single-precision training at the same time. Mixed precision training can accelerate the computation process, reduce memory usage, and enable a larger model or batch size to be trained on specific hardware. For FP16 operators, if the input data type is FP32, the backend of MindSpore will automatically handle it with reduced precision. Users could check the reduced-precision operators by enabling INFO log and then searching ‘reduce precision’. 
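As a concrete illustration of the paragraph above, mixed precision in MindSpore is typically enabled through the `amp_level` argument of `Model`. A minimal sketch, assuming a toy network (the READMEs drive this through their training scripts rather than inline code):

```python
from mindspore import Model, nn

# Sketch only: a toy net. 'O2' casts the network to FP16 while keeping
# BatchNorm and the loss computation in FP32; 'O0' keeps everything FP32.
net = nn.Dense(16, 10)
loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
opt = nn.Momentum(net.trainable_params(), learning_rate=0.01, momentum=0.9)
model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'},
              amp_level='O2')
```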
@@ -61,7 +61,7 @@ For FP16 operators, if the input data type is FP32, the backend of MindSpore wil - [MindSpore](https://www.mindspore.cn/install/en) - For more information, please check the resources below: - [MindSpore Tutorials](https://www.mindspore.cn/tutorials/en/r1.3/index.html) -- [MindSpore Python API](https://www.mindspore.cn/docs/en/master/api_python/mindspore.html) +- [MindSpore Python API](https://www.mindspore.cn/docs/en/master/index.html) If you want to run in modelarts, please check the official documentation of [modelarts](https://support.huaweicloud.com/modelarts/), and you can start training and evaluation as follows: @@ -230,7 +230,7 @@ sh run_eval.sh 0 /opt/npu/datasets/classification/val /se_resnext50.ckpt Ascend #### Result -Evaluation result will be stored in the scripts path. Under this, you can find result like the followings in log. +Evaluation result will be stored in the scripts path. Under this, you can find result like the following in log. ```log acc=78.81%(TOP1) diff --git a/research/cv/se_resnext50/README_CN.md b/research/cv/se_resnext50/README_CN.md index ee609d9d2..81dc81a2f 100644 --- a/research/cv/se_resnext50/README_CN.md +++ b/research/cv/se_resnext50/README_CN.md @@ -56,7 +56,7 @@ SE_ResNeXt整体网络架构如下: ## 混合精度 -采用[混合精度](https://www.mindspore.cn/tutorials/zh-CN/master/advanced/mixed_precision.html)的训练方法使用支持单精度和半精度数据来提高深度学习神经网络的训练速度,同时保持单精度训练所能达到的网络精度。混合精度训练提高计算速度、减少内存使用的同时,支持在特定硬件上训练更大的模型或实现更大批次的训练。 +采用[混合精度](https://www.mindspore.cn/docs/programming_guide/zh-CN/r1.5/enable_mixed_precision.html)的训练方法使用支持单精度和半精度数据来提高深度学习神经网络的训练速度,同时保持单精度训练所能达到的网络精度。混合精度训练提高计算速度、减少内存使用的同时,支持在特定硬件上训练更大的模型或实现更大批次的训练。 以FP16算子为例,如果输入数据类型为FP32,MindSpore后台会自动降低精度来处理数据。用户可打开INFO日志,搜索“reduce precision”查看精度降低的算子。 @@ -68,7 +68,7 @@ SE_ResNeXt整体网络架构如下: - [MindSpore](https://www.mindspore.cn/install) - 如需查看详情,请参见如下资源: - [MindSpore教程](https://www.mindspore.cn/tutorials/zh-CN/r1.3/index.html) - - [MindSpore Python API](https://www.mindspore.cn/docs/en/master/api_python/mindspore.html) + - [MindSpore Python API](https://www.mindspore.cn/docs/en/master/index.html) 如果要在modelarts上进行模型的训练,可以参考modelarts的官方指导文档(https://support.huaweicloud.com/modelarts/) 开始进行模型的训练和推理,具体操作如下: diff --git a/research/cv/squeezenet/README.md b/research/cv/squeezenet/README.md index 4d8cfd80d..0c87bcdfc 100644 --- a/research/cv/squeezenet/README.md +++ b/research/cv/squeezenet/README.md @@ -344,7 +344,7 @@ For distributed training, a hccl configuration file with JSON format needs to be Please follow the instructions in the link [hccl_tools](https://gitee.com/mindspore/models/tree/master/utils/hccl_tools). -Training result will be stored in the example path, whose folder name begins with "train" or "train_parallel". Under this, you can find checkpoint file together with result like the followings in log. +Training result will be stored in the example path, whose folder name begins with "train" or "train_parallel". Under this, you can find checkpoint file together with result like the following in log. ### Result @@ -413,7 +413,7 @@ checkpoint can be produced in training process. ### Result -Evaluation result will be stored in the example path, whose folder name is "eval". Under this, you can find result like the followings in log. +Evaluation result will be stored in the example path, whose folder name is "eval". Under this, you can find result like the following in log. 
- Evaluating SqueezeNet with CIFAR-10 dataset diff --git a/research/cv/squeezenet1_1/README.md b/research/cv/squeezenet1_1/README.md index 6a615c71d..63cfd4d24 100644 --- a/research/cv/squeezenet1_1/README.md +++ b/research/cv/squeezenet1_1/README.md @@ -169,7 +169,7 @@ For distributed training, a hccl configuration file with JSON format needs to be Please follow the instructions in the link [hccl_tools](https://gitee.com/mindspore/models/tree/master/utils/hccl_tools). -Training result will be stored in the example path, whose folder name begins with "train" or "train_parallel". Under this, you can find checkpoint file together with result like the followings in log. +Training result will be stored in the example path, whose folder name begins with "train" or "train_parallel". Under this, you can find checkpoint file together with result like the following in log. ### Result @@ -203,7 +203,7 @@ bash scripts/run_eval.sh 0 /data/imagenet/val ./train/ckpt_squeezenet/squeezenet ### Result -Evaluation result will be stored in the example path, whose folder name is "eval". Under this, you can find result like the followings in log. +Evaluation result will be stored in the example path, whose folder name is "eval". Under this, you can find result like the following in log. - Evaluating SqueezeNet with ImageNet dataset diff --git a/research/cv/ssd_ghostnet/README.md b/research/cv/ssd_ghostnet/README.md index 0d75cb806..5ddcd69c9 100644 --- a/research/cv/ssd_ghostnet/README.md +++ b/research/cv/ssd_ghostnet/README.md @@ -234,7 +234,7 @@ We need five or seven parameters for this scripts. - `PRE_TRAINED_EPOCH_SIZE :` the epoch num of pretrained. -Training result will be stored in the current path, whose folder name begins with "LOG". Under this, you can find checkpoint file together with result like the followings in LOG4/log.txt. +Training result will be stored in the current path, whose folder name begins with "LOG". Under this, you can find checkpoint file together with result like the following in LOG4/log.txt. ### Training on GPU @@ -246,7 +246,7 @@ For details about the parameters, see [Training on Ascend](#training-on-ascend) bash run_distribute_train_gpu.sh [DEVICE_NUM] [EPOCH_SIZE] [LR] [DATASET] [CONFIG_PATH] [PRE_TRAINED](optional) [PRE_TRAINED_EPOCH_SIZE](optional) ``` -Training result will be stored in the current path, whose folder name begins with "LOG". Under this, you can find checkpoint file together with result like the followings in LOG/log.txt. +Training result will be stored in the current path, whose folder name begins with "LOG". Under this, you can find checkpoint file together with result like the following in LOG/log.txt. ## [Evaluation Process](#contents) diff --git a/research/cv/ssd_inception_v2/README.md b/research/cv/ssd_inception_v2/README.md index cd0916115..c655109b6 100644 --- a/research/cv/ssd_inception_v2/README.md +++ b/research/cv/ssd_inception_v2/README.md @@ -233,7 +233,7 @@ We need five or seven parameters for this scripts. - `PRE_TRAINED :` the path of pretrained checkpoint file, it is better to use absolute path. - `PRE_TRAINED_EPOCH_SIZE :` the epoch num of pretrained. -Training result will be stored in the current path, whose folder name is "LOG". Under this, you can find checkpoint files together with result like the followings in log +Training result will be stored in the current path, whose folder name is "LOG". 
Under this, you can find checkpoint files together with result like the following in log ```shell epoch: 1 step: 320, loss is 4.008658 @@ -274,7 +274,7 @@ We need four parameters for this scripts. > checkpoint can be produced in training process. -Inference result will be stored in the example path, whose folder name begins with "eval". Under this, you can find result like the followings in log. +Inference result will be stored in the example path, whose folder name begins with "eval". Under this, you can find result like the following in log. ```shell Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.224 diff --git a/research/cv/ssd_mobilenetV2/README.md b/research/cv/ssd_mobilenetV2/README.md index 0b3b3d8c6..d2fa97b9c 100644 --- a/research/cv/ssd_mobilenetV2/README.md +++ b/research/cv/ssd_mobilenetV2/README.md @@ -241,7 +241,7 @@ We need five or seven parameters for this scripts. - `PRE_TRAINED :` the path of pretrained checkpoint file, it is better to use absolute path. - `PRE_TRAINED_EPOCH_SIZE :` the epoch num of pretrained. -Training result will be stored in the current path, whose folder name begins with "LOG". Under this, you can find checkpoint file together with result like the followings in log +Training result will be stored in the current path, whose folder name begins with "LOG". Under this, you can find checkpoint file together with result like the following in log ```shell epoch: 1 step: 458, loss is 2.329789 @@ -275,7 +275,7 @@ We need five or seven parameters for this scripts. - `PRE_TRAINED :` the path of pretrained checkpoint file, it is better to use absolute path. - `PRE_TRAINED_EPOCH_SIZE :` the epoch num of pretrained. -Training result will be stored in the current path, whose folder name begins with "LOG". Under this, you can find checkpoint file together with result like the followings in log +Training result will be stored in the current path, whose folder name begins with "LOG". Under this, you can find checkpoint file together with result like the following in log ```shell epoch: 1 step: 3664, loss is 2.1746433 @@ -303,7 +303,7 @@ We need five or seven parameters for this scripts. - `PRE_TRAINED :` the path of pretrained checkpoint file, it is better to use absolute path. - `PRE_TRAINED_EPOCH_SIZE :` the epoch num of pretrained. -Training result will be stored in the current path, whose folder name begins with "LOG". Under this, you can find checkpoint file together with result like the followings in log +Training result will be stored in the current path, whose folder name begins with "LOG". Under this, you can find checkpoint file together with result like the following in log ```shell epoch: 1 step: 916, loss is 2.1025786 @@ -342,7 +342,7 @@ We need five or seven parameters for this scripts. - `PRE_TRAINED :` the path of pretrained checkpoint file, it is better to use absolute path. - `PRE_TRAINED_EPOCH_SIZE :` the epoch num of pretrained. -Training result will be stored in the current path, whose folder name begins with "LOG". Under this, you can find checkpoint file together with result like the followings in log +Training result will be stored in the current path, whose folder name begins with "LOG". Under this, you can find checkpoint file together with result like the following in log ```shell epoch: 1 step: 3664, loss is 2.2511892 @@ -366,7 +366,7 @@ We need two parameters for this scripts. > checkpoint can be produced in training process. 
-Inference result will be stored in the example path, whose folder name begins with "eval". Under this, you can find result like the followings in log. +Inference result will be stored in the example path, whose folder name begins with "eval". Under this, you can find result like the following in log. ```shell Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.253 @@ -403,7 +403,7 @@ We need two parameters for this scripts. > checkpoint can be produced in training process. -Inference result will be stored in the example path, whose folder name begins with "eval". Under this, you can find result like the followings in log. +Inference result will be stored in the example path, whose folder name begins with "eval". Under this, you can find result like the following in log. ```shell Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.258 diff --git a/research/cv/ssd_mobilenetV2_FPNlite/README.md b/research/cv/ssd_mobilenetV2_FPNlite/README.md index 4fe3164b3..b42f63180 100644 --- a/research/cv/ssd_mobilenetV2_FPNlite/README.md +++ b/research/cv/ssd_mobilenetV2_FPNlite/README.md @@ -254,7 +254,7 @@ We need six or eight parameters for this scripts. - `PRE_TRAINED :` the path of pretrained checkpoint file, it is better to use absolute path. - `PRE_TRAINED_EPOCH_SIZE :` the epoch num of pretrained. -Training result will be stored in the current path, whose folder name begins with "LOG". Under this, you can find checkpoint file together with result like the followings in log +Training result will be stored in the current path, whose folder name begins with "LOG". Under this, you can find checkpoint file together with result like the following in log ```shell epoch: 1 step: 458, loss is 2.873479 @@ -289,7 +289,7 @@ We need five or seven parameters for this scripts. - `PRE_TRAINED :` the path of pretrained checkpoint file, it is better to use absolute path. - `PRE_TRAINED_EPOCH_SIZE :` the epoch num of pretrained. -Training result will be stored in the current path, whose folder name begins with "LOG". Under this, you can find checkpoint file together with result like the followings in log +Training result will be stored in the current path, whose folder name begins with "LOG". Under this, you can find checkpoint file together with result like the following in log ```shell epoch: 1 step: 3664, loss is 2.3280334 @@ -375,7 +375,7 @@ We need four parameters for this script: > checkpoint can be produced in training process. -Inference result will be stored in the example path, whose folder name begins with "eval". Under this, you can find result like the followings in log. +Inference result will be stored in the example path, whose folder name begins with "eval". Under this, you can find result like the following in log. ```shell Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.234 @@ -412,7 +412,7 @@ We need four parameters for this script: > checkpoint can be produced in training process. -Inference result will be stored in the example path, whose folder name begins with "eval". Under this, you can find result like the followings in log. +Inference result will be stored in the example path, whose folder name begins with "eval". Under this, you can find result like the following in log. 
```shell Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.258 diff --git a/research/cv/ssd_resnet34/README.md b/research/cv/ssd_resnet34/README.md index 14a1d1cc4..ae791499b 100644 --- a/research/cv/ssd_resnet34/README.md +++ b/research/cv/ssd_resnet34/README.md @@ -231,7 +231,7 @@ We need five or six parameters for this scripts. - `TRAIN_OUT_PATH`:the output path of train for distributed train. - `PRE_TRAINED_PATH :` the path of pretrained checkpoint file, it is better to use absolute path. -Training result will be stored in the train path, whose folder name "log". Under this, you can find checkpoint file together with result like the followings in log +Training result will be stored in the train path, whose folder name "log". Under this, you can find checkpoint file together with result like the following in log ```shell epoch: 1 step: 458, loss is 4.185711 @@ -281,7 +281,7 @@ We need five parameters for this scripts. > checkpoint can be produced in training process. -Inference result will be stored in the eval path, whose folder name "log". Under this, you can find result like the followings in log. +Inference result will be stored in the eval path, whose folder name "log". Under this, you can find result like the following in log. ```shell Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.240 diff --git a/research/cv/ssd_resnet50/README.md b/research/cv/ssd_resnet50/README.md index d37917999..1a6f76237 100644 --- a/research/cv/ssd_resnet50/README.md +++ b/research/cv/ssd_resnet50/README.md @@ -224,7 +224,7 @@ We need five or seven parameters for this scripts. - `PRE_TRAINED :` the path of pretrained checkpoint file, it is better to use absolute path. - `PRE_TRAINED_EPOCH_SIZE :` the epoch num of pretrained. -Training result will be stored in the current path, whose folder name begins with "LOG". Under this, you can find checkpoint file together with result like the followings in log +Training result will be stored in the current path, whose folder name begins with "LOG". Under this, you can find checkpoint file together with result like the following in log ```shell epoch: 1 step: 458, loss is 3.1681802 @@ -259,7 +259,7 @@ We need two parameters for this scripts. > checkpoint can be produced in training process. -Inference result will be stored in the example path, whose folder name begins with "eval". Under this, you can find result like the followings in log. +Inference result will be stored in the example path, whose folder name begins with "eval". Under this, you can find result like the following in log. ```shell Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.327 @@ -304,7 +304,7 @@ Current batch_ Size can only be set to 1. bash run_infer_cpp.sh [MINDIR_PATH] [DATA_PATH] [DVPP] [ANNO_FILE] [DEVICE_TYPE] [DEVICE_ID] ``` -Inference result will be stored in the example path, you can find result like the followings in acc.log. +Inference result will be stored in the example path, you can find result like the following in acc.log. ```shell Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.327 diff --git a/research/cv/ssd_resnet_34/README.md b/research/cv/ssd_resnet_34/README.md index 1704cb8eb..d6e8502f0 100644 --- a/research/cv/ssd_resnet_34/README.md +++ b/research/cv/ssd_resnet_34/README.md @@ -230,7 +230,7 @@ We need five or six parameters for this scripts. - `PRE_TRAINED_PATH`: the path of pretrained checkpoint file, it is better to use absolute path. 
- `PRE_TRAINED_EPOCH_SIZE`: number of epochs passed by checkpoint. -Training result will be stored in the train path, whose folder name "log". Under this, you can find checkpoint file together with result like the followings in log +Training result will be stored in the train path, whose folder name "log". Under this, you can find checkpoint file together with result like the following in log ```shell Single GPU training: @@ -282,7 +282,7 @@ We need five parameters for this script. > checkpoint can be produced in training process. -Inference result will be stored in the eval path, whose folder name "log". Under this, you can find result like the followings in log. +Inference result will be stored in the eval path, whose folder name "log". Under this, you can find result like the following in log. ```shell Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.254 diff --git a/research/cv/tnt/eval.py b/research/cv/tnt/eval.py index a1a483e33..f26694e15 100644 --- a/research/cv/tnt/eval.py +++ b/research/cv/tnt/eval.py @@ -1,4 +1,4 @@ -# Copyright 2022-2023 Huawei Technologies Co., Ltd +# Copyright 2021 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,64 +12,49 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================ -"""Evaluation script. Need training config.""" -import os +"""eval""" -from functools import reduce - -import mindspore as ms from mindspore import Model from mindspore import context from mindspore import nn from mindspore.common import set_seed -from mindspore.train.callback import TimeMonitor +from src.args import args from src.tools.cell import cast_amp from src.tools.criterion import get_criterion, NetWithLoss -from src.tools.get_misc import pretrained, get_train_one_step -from src.data.imagenet import create_dataset_imagenet +from src.tools.get_misc import get_dataset, set_device, get_model, pretrained, get_train_one_step from src.tools.optimizer import get_optimizer -from src.tools.get_misc import get_model +set_seed(args.seed) + + +def main(): + mode = { + 0: context.GRAPH_MODE, + 1: context.PYNATIVE_MODE + } + context.set_context(mode=mode[args.graph_mode], device_target=args.device_target) + context.set_context(enable_graph_kernel=False) + if args.device_target == "Ascend": + context.set_context(enable_auto_mixed_precision=True) + set_device(args) -def eval_ckpt(args): - print('=== Use checkpoint ===') + # get model net = get_model(args) - cast_amp(net, args) + cast_amp(net) criterion = get_criterion(args) net_with_loss = NetWithLoss(net, criterion) if args.pretrained: pretrained(args, net) - print( - 'Number of parameters (before deploy):', - sum( - reduce(lambda x, y: x * y, params.shape) - for params in net.trainable_params() - ) - ) - # switch_net_to_deploy(net) - print( - 'Number of parameters (after deploy):', - sum( - reduce(lambda x, y: x * y, params.shape) - for params in net.trainable_params() - ) - ) - cast_amp(net, args) - net.set_train(False) - - data = create_dataset_imagenet( - str(args.ds_val), args, training=False - ) - batch_num = data.get_dataset_size() + data = get_dataset(args, training=False) + batch_num = data.val_dataset.get_dataset_size() optimizer = get_optimizer(args, net, batch_num) + # save a yaml file to read to record parameters net_with_loss = get_train_one_step(args, 
net_with_loss, optimizer) - eval_network = nn.WithEvalCell( - net, criterion, args.amp_level in ['O2', 'O3', 'auto'] - ) + eval_network = nn.WithEvalCell(net, criterion, args.amp_level in ["O2", "O3", "auto"]) eval_indexes = [0, 1, 2] eval_metrics = {'Loss': nn.Loss(), 'Top1-Acc': nn.Top1CategoricalAccuracy(), @@ -77,57 +62,10 @@ def eval_ckpt(args): model = Model(net_with_loss, metrics=eval_metrics, eval_network=eval_network, eval_indexes=eval_indexes) - - print('=> begin eval') - results = model.eval(data, callbacks=[TimeMonitor()]) - return results - - -def eval_mindir(args): - print('=== Use MINDIR model ===') - data = create_dataset_imagenet( - str(args.dataset_path), args, training=False - ) - iterator = data.create_dict_iterator(num_epochs=1) - - graph = ms.load(str(args.pretrained)) - net = nn.GraphCell(graph) - metrics = { - 'Top1-Acc': nn.Top1CategoricalAccuracy(), - 'Top5-Acc': nn.Top5CategoricalAccuracy(), - } - print('=> begin eval') - for batch in iterator: - y_pred = net(batch['image']) - for metric in metrics.values(): - metric.update(y_pred, batch['label']) - - return {name: metric.eval() for name, metric in metrics.items()} - - -def main(): - """Entry point.""" - from src.args import args - - set_seed(0) - context.set_context(mode=context.GRAPH_MODE, - device_target=args.device_target) - context.set_context(enable_graph_kernel=False) - if args.device_target == 'Ascend': - context.set_context(enable_auto_mixed_precision=True) - - os.environ["RANK_SIZE"] = '0' - - # get model - if args.pretrained.endswith('.ckpt'): - results = eval_ckpt(args) - elif args.pretrained.endswith('.mindir'): - results = eval_mindir(args) - else: - raise ValueError('Incorrect format checkpoint') - - print(f'=> eval results:{results}') - print('=> eval success') + print(f"=> begin eval") + results = model.eval(data.val_dataset) + print(f"=> eval results:{results}") + print(f"=> eval success") if __name__ == '__main__': diff --git a/research/cv/tnt/export.py b/research/cv/tnt/export.py index 1d26ea8ef..692a104e4 100644 --- a/research/cv/tnt/export.py +++ b/research/cv/tnt/export.py @@ -1,4 +1,4 @@ -# Copyright 2023 Huawei Technologies Co., Ltd +# Copyright 2021 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -29,12 +29,12 @@ from src.tools.get_misc import get_model context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target) if args.device_target in ["Ascend", "GPU"]: - context.set_context(device_id=args.device_id[0]) + context.set_context(device_id=args.device_id) if __name__ == '__main__': net = get_model(args) criterion = get_criterion(args) - cast_amp(net, args) + cast_amp(net) net_with_loss = NetWithLoss(net, criterion) assert args.pretrained is not None, "checkpoint_path is None." diff --git a/research/cv/tnt/src/args.py b/research/cv/tnt/src/args.py index 591d23ee0..30f9e28b8 100644 --- a/research/cv/tnt/src/args.py +++ b/research/cv/tnt/src/args.py @@ -1,4 +1,4 @@ -# Copyright 2023 Huawei Technologies Co., Ltd +# Copyright 2021 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -28,8 +28,7 @@ args = None def parse_arguments(): """parse_arguments""" global args - parser = argparse.ArgumentParser(description="MindSpore TNT Training", - formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser = argparse.ArgumentParser(description="MindSpore TNT Training") parser.add_argument("-a", "--arch", metavar="ARCH", default="ResNet18", help="model architecture") parser.add_argument("--accumulation_step", default=1, type=int, help="accumulation step") @@ -41,21 +40,19 @@ def parse_arguments(): parser.add_argument("--beta", default=[0.9, 0.999], type=lambda x: [float(a) for a in x.split(",")], help="beta for optimizer") parser.add_argument("--clip_global_norm_value", default=5., type=float, help="Clip grad value") - parser.add_argument('--ds_train', default="./data/train", help='Training dataset') - parser.add_argument('--ds_val', default="./data/val", help='validation dataset') - parser.add_argument("--device_id", default=[0], type=int, nargs='+', help="Device Ids") + parser.add_argument('--data_url', default="./data", help='Location of data.') + parser.add_argument("--device_id", default=0, type=int, help="Device Id") parser.add_argument("--device_num", default=1, type=int, help="device num") parser.add_argument("--device_target", default="GPU", choices=["GPU", "Ascend", "CPU"], type=str) parser.add_argument("--epochs", default=300, type=int, metavar="N", help="number of total epochs to run") parser.add_argument("--eps", default=1e-8, type=float) - parser.add_argument("--file_format", type=str, choices=["AIR", "MINDIR", "ONNX"], - default="MINDIR", help="file format") + parser.add_argument("--file_format", type=str, choices=["AIR", "MINDIR"], default="MINDIR", help="file format") parser.add_argument("--in_channel", default=3, type=int) parser.add_argument("--is_dynamic_loss_scale", default=1, type=int, help="is_dynamic_loss_scale ") parser.add_argument("--keep_checkpoint_max", default=20, type=int, help="keep checkpoint max num") parser.add_argument("--optimizer", help="Which optimizer to use", default="sgd") parser.add_argument("--set", help="name of dataset", type=str, default="ImageNet") - parser.add_argument("--pynative_mode", default=0, type=int, help="graph mode with 0, python with 1") + parser.add_argument("--graph_mode", default=0, type=int, help="graph mode with 0, python with 1") parser.add_argument("--mix_up", default=0., type=float, help="mix up") parser.add_argument("--mlp_ratio", help="mlp ", default=4., type=float) parser.add_argument("-j", "--num_parallel_workers", default=20, type=int, metavar="N", @@ -74,43 +71,13 @@ def parse_arguments(): parser.add_argument("--momentum", default=0.9, type=float, metavar="M", help="momentum") parser.add_argument("--num_classes", default=1000, type=int) parser.add_argument("--pretrained", dest="pretrained", default=None, type=str, help="use pre-trained model") - parser.add_argument("--exclude_epoch_state", action="store_true", help="exclude epoch state and learning rate") parser.add_argument("--tnt_config", help="Config file to use (see configs dir)", default=None, required=True) parser.add_argument("--seed", default=0, type=int, help="seed for initializing training. 
") - parser.add_argument("--save_ckpt_every_step", default=0, type=int, help="Save checkpoint every N batches") - parser.add_argument("--save_ckpt_every_sec", default=1800, type=int, help="Save checkpoint every N seconds") - parser.add_argument("--save_ckpt_keep", default=20, type=int, help="Keep N checkpoints") + parser.add_argument("--save_every", default=2, type=int, help="Save every ___ epochs(default:2)") parser.add_argument("--label_smoothing", type=float, help="Label smoothing to use, default 0.0", default=0.1) parser.add_argument("--image_size", default=224, help="Image Size.", type=int) - parser.add_argument("--img_mean", nargs=3, type=float, default=(0.5, 0.5, 0.5), help="Image mean (model input)") - parser.add_argument("--img_std", nargs=3, type=float, default=(0.5, 0.5, 0.5), help="Image std (model input)") parser.add_argument('--train_url', default="./", help='Location of training outputs.') parser.add_argument("--run_modelarts", type=ast.literal_eval, default=False, help="Whether run on modelarts") - - parser.add_argument("--dir_ckpt", default="ckpt", help="Root directory for checkpoints.") - parser.add_argument("--dir_best_ckpt", default="best_ckpt", help="Root directory for best (acc) checkpoints.") - parser.add_argument("--dir_summary", default="summary", help="Root directory for summary logs.") - parser.add_argument("--dump_graph", action="store_true", - help="Dump model graph to MindInsight") - parser.add_argument("--collect_input_data", action="store_true", - help="Dump input images to MindInsight") - - parser.add_argument( - "--tnt_pt_implementation", - default="/mindspore/Efficient-AI-Backbones/tnt_pytorch", - help="Directory with existing implementation of TNT model (PyTorch)" - " (see https://github.com/huawei-noah/Efficient-AI-Backbones/tree/master/tnt_pytorch)." - ) - parser.add_argument( - "--tnt_pt_pretrained", - default=( - # '/mindspore/pt_weights/tnt_s_81.5.pth.tar' - '/mindspore/pt_weights/tnt_b_82.9.pth.tar' - ), - help="Arguments to PyTorch implementation (JSON-encoded list)." - ) - parser.add_argument("--tnt_ms_export", help="Path to exported weights in MindSpore format (.ckpt).") - parser.add_argument("--pred_output", default="preds.json", help="Path to output predictions (JSON)") args = parser.parse_args() # Allow for use from notebook without config file diff --git a/research/cv/tnt/src/configs/parser.py b/research/cv/tnt/src/configs/parser.py index 7b81b1040..8d757737e 100644 --- a/research/cv/tnt/src/configs/parser.py +++ b/research/cv/tnt/src/configs/parser.py @@ -1,4 +1,4 @@ -# Copyright 2021-2023 Huawei Technologies Co., Ltd +# Copyright 2021 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/research/cv/tnt/src/data/__init__.py b/research/cv/tnt/src/data/__init__.py index 446545075..bd1c59d54 100644 --- a/research/cv/tnt/src/data/__init__.py +++ b/research/cv/tnt/src/data/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2021-2023 Huawei Technologies Co., Ltd +# Copyright 2021 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/research/cv/tnt/src/data/augment/__init__.py b/research/cv/tnt/src/data/augment/__init__.py index 899e22c0d..b4d178fcf 100644 --- a/research/cv/tnt/src/data/augment/__init__.py +++ b/research/cv/tnt/src/data/augment/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2021-2023 Huawei Technologies Co., Ltd +# Copyright 2021 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/research/cv/tnt/src/data/augment/auto_augment.py b/research/cv/tnt/src/data/augment/auto_augment.py index e5c3310ad..51cd1d671 100644 --- a/research/cv/tnt/src/data/augment/auto_augment.py +++ b/research/cv/tnt/src/data/augment/auto_augment.py @@ -1,4 +1,4 @@ -# Copyright 2023 Huawei Technologies Co., Ltd +# Copyright 2021 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -25,16 +25,12 @@ AugMix adapted from: Papers: AutoAugment: Learning Augmentation Policies from Data - https://arxiv.org/abs/1805.09501 - Learning Data Augmentation Strategies for Object Detection - https://arxiv.org/abs/1906.11172 - RandAugment: Practical automated data augmentation... - https://arxiv.org/abs/1909.13719 - AugMix: A Simple Data Processing Method to Improve Robustness and Uncertainty - https://arxiv.org/abs/1912.02781 Hacked together by / Copyright 2020 Ross Wightman """ -# pylint: disable=R1707 import math import random import re @@ -229,39 +225,35 @@ def _randomly_negate(v): return -v if random.random() > 0.5 else v -def _rotate_level_to_arg(level, hparams): +def _rotate_level_to_arg(level, _hparams): """_randomly_negate""" # range [-30, 30] - _ = hparams level = (level / _MAX_LEVEL) * 30. level = _randomly_negate(level) - return level, + return (level,) -def _enhance_level_to_arg(level, hparams): +def _enhance_level_to_arg(level, _hparams): """_enhance_level_to_arg""" # range [0.1, 1.9] - _ = hparams - return (level / _MAX_LEVEL) * 1.8 + 0.1, + return ((level / _MAX_LEVEL) * 1.8 + 0.1,) -def _enhance_increasing_level_to_arg(level, hparams): +def _enhance_increasing_level_to_arg(level, _hparams): """_enhance_increasing_level_to_arg""" # the 'no change' level is 1.0, moving away from that towards 0. 
or 2.0 increases the enhancement blend # range [0.1, 1.9] - _ = hparams level = (level / _MAX_LEVEL) * .9 level = 1.0 + _randomly_negate(level) - return level, + return (level,) -def _shear_level_to_arg(level, hparams): +def _shear_level_to_arg(level, _hparams): """_shear_level_to_arg""" # range [-0.3, 0.3] - _ = hparams level = (level / _MAX_LEVEL) * 0.3 level = _randomly_negate(level) - return level, + return (level,) def _translate_abs_level_to_arg(level, hparams): @@ -269,7 +261,7 @@ def _translate_abs_level_to_arg(level, hparams): translate_const = hparams['translate_const'] level = (level / _MAX_LEVEL) * float(translate_const) level = _randomly_negate(level) - return level, + return (level,) def _translate_rel_level_to_arg(level, hparams): @@ -278,16 +270,15 @@ def _translate_rel_level_to_arg(level, hparams): translate_pct = hparams.get('translate_pct', 0.45) level = (level / _MAX_LEVEL) * translate_pct level = _randomly_negate(level) - return level, + return (level,) -def _posterize_level_to_arg(level, hparams): +def _posterize_level_to_arg(level, _hparams): """_posterize_level_to_arg""" # As per Tensorflow TPU EfficientNet impl # range [0, 4], 'keep 0 up to 4 MSB of original image' # intensity/severity of augmentation decreases with level - _ = hparams - return int((level / _MAX_LEVEL) * 4), + return (int((level / _MAX_LEVEL) * 4),) def _posterize_increasing_level_to_arg(level, hparams): @@ -295,38 +286,35 @@ def _posterize_increasing_level_to_arg(level, hparams): # As per Tensorflow models research and UDA impl # range [4, 0], 'keep 4 down to 0 MSB of original image', # intensity/severity of augmentation increases with level - return 4 - _posterize_level_to_arg(level, hparams)[0], + return (4 - _posterize_level_to_arg(level, hparams)[0],) -def _posterize_original_level_to_arg(level, hparams): +def _posterize_original_level_to_arg(level, _hparams): """_posterize_original_level_to_arg""" # As per original AutoAugment paper description # range [4, 8], 'keep 4 up to 8 MSB of image' # intensity/severity of augmentation decreases with level - _ = hparams - return int((level / _MAX_LEVEL) * 4) + 4, + return (int((level / _MAX_LEVEL) * 4) + 4,) -def _solarize_level_to_arg(level, hparams): +def _solarize_level_to_arg(level, _hparams): """_solarize_level_to_arg""" # range [0, 256] # intensity/severity of augmentation decreases with level - _ = hparams - return int((level / _MAX_LEVEL) * 256), + return (int((level / _MAX_LEVEL) * 256),) -def _solarize_increasing_level_to_arg(level, hparams): +def _solarize_increasing_level_to_arg(level, _hparams): """_solarize_increasing_level_to_arg""" # range [0, 256] # intensity/severity of augmentation increases with level - return 256 - _solarize_level_to_arg(level, hparams)[0], + return (256 - _solarize_level_to_arg(level, _hparams)[0],) -def _solarize_add_level_to_arg(level, hparams): +def _solarize_add_level_to_arg(level, _hparams): """_solarize_add_level_to_arg""" # range [0, 110] - _ = hparams - return int((level / _MAX_LEVEL) * 110), + return (int((level / _MAX_LEVEL) * 110),) LEVEL_TO_ARG = { @@ -494,7 +482,6 @@ def auto_augment_policy_v0r(hparams): def auto_augment_policy_original(hparams): """auto_augment_policy_original""" # ImageNet policy from https://arxiv.org/abs/1805.09501 - policy = [ [('PosterizeOriginal', 0.4, 8), ('Rotate', 0.6, 9)], [('Solarize', 0.6, 5), ('AutoContrast', 0.6, 5)], @@ -791,7 +778,6 @@ def augmix_ops(magnitude=10, hparams=None, transforms=None): class AugMixAugment: """ AugMix Transform Adapted and improved from impl 
here: https://github.com/google-research/augmix/blob/master/imagenet.py - From paper: 'AugMix: A Simple Data Processing Method to Improve Robustness and Uncertainty - https://arxiv.org/abs/1912.02781 """ diff --git a/research/cv/tnt/src/data/augment/mixup.py b/research/cv/tnt/src/data/augment/mixup.py index 3cd967bc3..f196fccb5 100644 --- a/research/cv/tnt/src/data/augment/mixup.py +++ b/research/cv/tnt/src/data/augment/mixup.py @@ -1,4 +1,4 @@ -# Copyright 2023 Huawei Technologies Co., Ltd +# Copyright 2021 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -142,7 +142,7 @@ class Mixup: def _params_per_elem(self, batch_size): """_params_per_elem""" lam = np.ones(batch_size, dtype=np.float32) - use_cutmix = np.zeros(batch_size, dtype=np.bool) + use_cutmix = np.zeros(batch_size, dtype=np.bool_) if self.mixup_enabled: if self.mixup_alpha > 0. and self.cutmix_alpha > 0.: use_cutmix = np.random.rand(batch_size) < self.switch_prob @@ -153,7 +153,7 @@ class Mixup: elif self.mixup_alpha > 0.: lam_mix = np.random.beta(self.mixup_alpha, self.mixup_alpha, size=batch_size) elif self.cutmix_alpha > 0.: - use_cutmix = np.ones(batch_size, dtype=np.bool) + use_cutmix = np.ones(batch_size, dtype=np.bool_) lam_mix = np.random.beta(self.cutmix_alpha, self.cutmix_alpha, size=batch_size) else: assert False, "One of mixup_alpha > 0., cutmix_alpha > 0., cutmix_minmax not None should be true." diff --git a/research/cv/tnt/src/data/augment/random_erasing.py b/research/cv/tnt/src/data/augment/random_erasing.py index eaa263c38..15e304aa7 100644 --- a/research/cv/tnt/src/data/augment/random_erasing.py +++ b/research/cv/tnt/src/data/augment/random_erasing.py @@ -1,4 +1,4 @@ -# Copyright 2023 Huawei Technologies Co., Ltd +# Copyright 2021 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/research/cv/tnt/src/data/data_utils/moxing_adapter.py b/research/cv/tnt/src/data/data_utils/moxing_adapter.py index 43691b552..37d2717e8 100644 --- a/research/cv/tnt/src/data/data_utils/moxing_adapter.py +++ b/research/cv/tnt/src/data/data_utils/moxing_adapter.py @@ -1,4 +1,4 @@ -# Copyright 2023 Huawei Technologies Co., Ltd +# Copyright 2021 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/research/cv/tnt/src/data/imagenet.py b/research/cv/tnt/src/data/imagenet.py index 94c2e1e6a..95cb688a7 100644 --- a/research/cv/tnt/src/data/imagenet.py +++ b/research/cv/tnt/src/data/imagenet.py @@ -1,4 +1,4 @@ -# Copyright 2023 Huawei Technologies Co., Ltd +# Copyright 2021-2022 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -16,10 +16,7 @@ Data operations, will be used in train.py and eval.py """ import os -from dataclasses import dataclass -import math -import numpy as np import mindspore.common.dtype as mstype import mindspore.dataset as ds import mindspore.dataset.transforms as C @@ -46,16 +43,14 @@ class ImageNet: self.train_dataset = create_dataset_imagenet(train_dir, training=True, args=args) self.val_dataset = create_dataset_imagenet(val_ir, training=False, args=args) else: - # train_dir = os.path.join(args.data_url, "train") - # val_ir = os.path.join(args.data_url, "val") + train_dir = os.path.join(args.data_url, "train") + val_ir = os.path.join(args.data_url, "val") if training: - self.train_dataset = create_dataset_imagenet(args.ds_train, training=True, args=args) - self.val_dataset = create_dataset_imagenet(args.ds_val, training=False, args=args) + self.train_dataset = create_dataset_imagenet(train_dir, training=True, args=args) + self.val_dataset = create_dataset_imagenet(val_ir, training=False, args=args) -def create_dataset_imagenet(dataset_dir, args, repeat_num=1, training=True, - preloaded_ds=None - ) -> ds.ImageFolderDataset: +def create_dataset_imagenet(dataset_dir, args, repeat_num=1, training=True): """ create a train or eval imagenet2012 dataset for TNT @@ -69,29 +64,22 @@ def create_dataset_imagenet(dataset_dir, args, repeat_num=1, training=True, """ device_num, rank_id = _get_rank_info() - if device_num is None: - device_num = 1 shuffle = bool(training) - ds.config.set_prefetch_size(args.batch_size) - if preloaded_ds is not None: - data_set = preloaded_ds + if device_num == 1 or not training: + data_set = ds.ImageFolderDataset(dataset_dir, num_parallel_workers=args.num_parallel_workers, + shuffle=shuffle) else: - shard_args = {} - if device_num > 1 and training: - shard_args = {'num_shards': device_num, - 'shard_id': rank_id} - data_set = ds.ImageFolderDataset( - dataset_dir, num_parallel_workers=args.num_parallel_workers, - shuffle=shuffle, **shard_args - ) + data_set = ds.ImageFolderDataset(dataset_dir, num_parallel_workers=args.num_parallel_workers, shuffle=shuffle, + num_shards=device_num, shard_id=rank_id) image_size = args.image_size # define map operations # BICUBIC: 3 - mean, std = args.img_mean, args.img_std # ImageNet: [0.485, 0.456, 0.406], [0.229, 0.224, 0.225] if training: + mean = [0.485, 0.456, 0.406] + std = [0.229, 0.224, 0.225] aa_params = dict( translate_const=int(image_size * 0.45), img_mean=tuple([min(255, round(255 * x)) for x in mean]), @@ -114,24 +102,12 @@ def create_dataset_imagenet(dataset_dir, args, repeat_num=1, training=True, RandomErasing(args.re_prob, mode=args.re_mode, max_count=args.re_count) ] else: - mean = (np.array(mean) * 255).tolist() - std = (np.array(std) * 255).tolist() - - # As in the initial repo. 
- crop_pct = 0.9 - if isinstance(image_size, tuple): - assert len(image_size) == 2 - if image_size[-1] == image_size[-2]: - # fall-back to older behaviour so Resize scales to shortest edge if target is square - scale_size = int(math.floor(image_size[0] / crop_pct)) - else: - scale_size = tuple([int(x / crop_pct) for x in image_size]) - else: - scale_size = int(math.floor(image_size / crop_pct)) - + mean = [0.485 * 255, 0.456 * 255, 0.406 * 255] + std = [0.229 * 255, 0.224 * 255, 0.225 * 255] + # test transform complete transform_img = [ vision.Decode(), - vision.Resize(scale_size, interpolation=Inter.BICUBIC), + vision.Resize(int(256 / 224 * image_size), interpolation=Inter.BICUBIC), vision.CenterCrop(image_size), vision.Normalize(mean=mean, std=std), vision.HWC2CHW() @@ -143,7 +119,7 @@ def create_dataset_imagenet(dataset_dir, args, repeat_num=1, training=True, operations=transform_img) data_set = data_set.map(input_columns="label", num_parallel_workers=args.num_parallel_workers, operations=transform_label) - if (args.mix_up > 0. or args.cutmix > 0.) and not training: + if (args.mix_up > 0. or args.cutmix > 0.) and not training: # if use mixup and not training(False), one hot val data label one_hot = C.OneHot(num_classes=args.num_classes) data_set = data_set.map(input_columns="label", num_parallel_workers=args.num_parallel_workers, @@ -181,156 +157,3 @@ def _get_rank_info(): rank_size = rank_id = None return rank_size, rank_id - - -@dataclass -class DatasetParams: - """Dataset arguments as a namespace""" - batch_size: int - num_parallel_workers: int - image_size: int - img_mean: list - img_std: list - interpolation: str - auto_augment: str - re_prob: float - re_mode: str - re_count: int - num_classes: int - mix_up: float # alpha - mixup_prob: float # prob - mixup_mode: str - switch_prob: float - cutmix: float - label_smoothing: float - - -def init_dataset( - dataset_dir, batch_size: int, - num_parallel_workers: int, - image_size: int, - img_mean: list, - img_std: list, - interpolation: str, - auto_augment: str, - re_prob: float, - re_mode: str, - re_count: int, - num_classes: int, - mix_up: float, - mixup_prob: float, - mixup_mode: str, - switch_prob: float, - cutmix: float, - label_smoothing: float, repeat_num=1, training=True, - preloaded_ds=None, - **kwargs -) -> ds.ImageFolderDataset: - """Initialize dataset with explicit parameter names""" - _ = kwargs - args = DatasetParams( - batch_size, - num_parallel_workers, - image_size, - img_mean, - img_std, - interpolation, - auto_augment, - re_prob, - re_mode, - re_count, - num_classes, - mix_up, - mixup_prob, - mixup_mode, - switch_prob, - cutmix, - label_smoothing - ) - return create_dataset_imagenet( - dataset_dir, args, repeat_num=repeat_num, training=training, - preloaded_ds=preloaded_ds - ) - - -def get_transforms( - image_size: int, training: bool, **aug: dict -): - """Get images preprocessing according mode and augmentations settings. - - Parameters - ---------- - image_size: int - Target image size. - training: bool - Mode. If True augmentations may be applied. - aug: Dict - Augmentation settings (type, auto aug, random erase). - - Returns - ------- - List of transforms. 
- """ - mean = [0.485, 0.456, 0.406] - std = [0.229, 0.224, 0.225] - - aug = {} if aug is None else aug - if training: - if aug['type'] == 'weak': - transform = [ - vision.ToPIL(), - vision.RandomResizedCrop( - image_size, scale=(0.08, 1.0), ratio=(3 / 4, 4 / 3), - interpolation=Inter.BILINEAR - ), - vision.RandomHorizontalFlip(prob=0.5), - vision.ToTensor(), - vision.Normalize(mean, std, is_hwc=False), - ] - elif aug['type'] == 'none': - transform = [ - vision.ToPIL(), - vision.Resize(image_size, interpolation=Inter.BILINEAR), - vision.CenterCrop(image_size), - vision.ToTensor(), - vision.Normalize(mean, std, is_hwc=False), - ] - elif aug['type'] == 'auto': - aa_params = dict( - translate_const=int(image_size * 0.45), - img_mean=tuple([min(255, round(255 * x)) for x in mean]), - interpolation=_pil_interp(aug['interpolation']) - ) - auto_augment = aug['auto_augment'] - - transform = [ - vision.RandomResizedCrop( - image_size, scale=(0.08, 1.0), ratio=(3 / 4, 4 / 3), - interpolation=Inter.BILINEAR - ), - vision.RandomHorizontalFlip(prob=0.5), - vision.ToPIL() - ] - if auto_augment is not None: - transform += [rand_augment_transform(auto_augment, aa_params)] - transform += [ - vision.ToTensor(), - vision.Normalize(mean=mean, std=std, is_hwc=False), - RandomErasing( - aug['re_prob'], mode=aug['re_mode'], - max_count=aug['re_count']), - ] - else: - raise ValueError('???' + aug.get('type', 'Unknown')) - else: - transform = [ - vision.ToPIL(), - vision.Resize( - int((256 / 224) * image_size), interpolation=Inter.BILINEAR - ), - vision.CenterCrop(image_size), - vision.ToTensor(), - vision.Normalize(mean, std, is_hwc=False), - ] - - return transform diff --git a/research/cv/tnt/src/models/__init__.py b/research/cv/tnt/src/models/__init__.py index eae6de6ed..2024ccbd3 100644 --- a/research/cv/tnt/src/models/__init__.py +++ b/research/cv/tnt/src/models/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2021-2023 Huawei Technologies Co., Ltd +# Copyright 2021 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,9 +13,8 @@ # limitations under the License. # ============================================================================ """init model""" -from .tnt import tnt_b_patch16_224, tnt_s_patch16_224 +from .tnt import tnt_s_patch16_224 __all__ = [ - "tnt_b_patch16_224", "tnt_s_patch16_224", ] diff --git a/research/cv/tnt/src/models/tnt/__init__.py b/research/cv/tnt/src/models/tnt/__init__.py index e21989208..2c78378dd 100644 --- a/research/cv/tnt/src/models/tnt/__init__.py +++ b/research/cv/tnt/src/models/tnt/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2021-2023 Huawei Technologies Co., Ltd +# Copyright 2021 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,5 +13,4 @@ # limitations under the License. # ============================================================================ """import tnt models""" -from .tnt import tnt_b_patch16_224, tnt_s_patch16_224 -from . 
import layers +from .tnt import tnt_s_patch16_224 diff --git a/research/cv/tnt/src/models/tnt/tnt.py b/research/cv/tnt/src/models/tnt/tnt.py index 95771a196..25ce3318f 100644 --- a/research/cv/tnt/src/models/tnt/tnt.py +++ b/research/cv/tnt/src/models/tnt/tnt.py @@ -1,4 +1,4 @@ -# Copyright 2023 Huawei Technologies Co., Ltd +# Copyright 2021 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,12 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================ -# -# This file has been derived from the https://github.com/huawei-noah/Efficient-AI-Backbones/tree/master/tnt_pytorch -# repository and modified. -# ============================================================================ -"""Transformer in Transformer (TNT)""" -from dataclasses import dataclass +"""Transformer in Transformer(TNT)""" +import math import numpy as np import mindspore.common.initializer as weight_init @@ -27,15 +23,11 @@ from mindspore import Parameter from mindspore import Tensor from mindspore import dtype as mstype -from .layers.misc import DropPath1D, trunc_array -from .layers.patch_embed import PatchEmbed -from .layers.attention import Attention +from .misc import DropPath1D, to_2tuple, Identity, trunc_array def make_divisible(v, divisor=8, min_value=None): - """ - Round number to the multiple of divisor - """ + """make_divisible""" min_value = min_value or divisor new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) # Make sure that round down does not go down by more than 10%. @@ -44,31 +36,43 @@ def make_divisible(v, divisor=8, min_value=None): return new_v +class UnfoldKernelEqPatch(nn.Cell): + """UnfoldKernelEqPatch with better performance""" + + def __init__(self, kernel_size, strides): + super(UnfoldKernelEqPatch, self).__init__() + assert kernel_size == strides + self.kernel_size = kernel_size + self.reshape = P.Reshape() + self.transpose = P.Transpose() + + def construct(self, inputs): + B, C, H, W = inputs.shape + inputs = self.reshape(inputs, + (B, C, H // self.kernel_size[0], self.kernel_size[0], W)) + inputs = self.transpose(inputs, (0, 2, 1, 3, 4)) + inputs = self.reshape(inputs, (-1, C, self.kernel_size[0], W // self.kernel_size[1], self.kernel_size[1])) + inputs = self.transpose(inputs, (0, 3, 1, 2, 4)) + inputs = self.reshape(inputs, (-1, C, self.kernel_size[0], self.kernel_size[1])) + + return inputs + + class Mlp(nn.Cell): - """ - Multi-layer perceptron - - Args: - in_features(int): Number of input features - hidden_features(int): Number of hidden features - out_features(int): Number of output features - act_layer(class): Activation layer (base class) - drop(float): Dropout rate - """ + """Mlp""" - def __init__(self, in_features, hidden_features=None, - out_features=None, act_layer=nn.GELU, drop=0.): + def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.): super().__init__() out_features = out_features or in_features hidden_features = hidden_features or in_features - self.fc1 = nn.Dense(in_channels=in_features, out_channels=hidden_features, has_bias=True) + self.fc1 = nn.Dense(in_channels=in_features, out_channels=hidden_features) self.act = act_layer() - self.fc2 = nn.Dense(in_channels=hidden_features, out_channels=out_features, has_bias=True) - self.drop = nn.Dropout(keep_prob=1.0 - drop) # 
if drop > 0. else Identity() + self.fc2 = nn.Dense(in_channels=hidden_features, out_channels=out_features, has_bias=False) + self.drop = nn.Dropout(p=drop) if drop > 0. else Identity() - def construct(self, *inputs, **kwargs): - x = inputs[0] + def construct(self, x): x = self.fc1(x) + x = self.act(x) x = self.drop(x) x = self.fc2(x) @@ -85,71 +89,95 @@ class SE(nn.Cell): self.dim = dim hidden_dim = int(dim * hidden_ratio) self.fc = nn.SequentialCell([ - nn.LayerNorm(normalized_shape=dim, epsilon=1e-5), - nn.Dense(in_channels=dim, out_channels=hidden_dim), + LayerNorm(normalized_shape=dim, eps=1e-05), + nn.Dense(in_channels=dim, out_channels=hidden_dim, has_bias=False), nn.ReLU(), - nn.Dense(in_channels=hidden_dim, out_channels=dim), + nn.Dense(in_channels=hidden_dim, out_channels=dim, has_bias=False), nn.Tanh() ]) - self.reduce_mean = P.ReduceMean() - - def construct(self, *inputs, **kwargs): - x = inputs[0] - a = self.reduce_mean(True, x, 1) # B, 1, C + def construct(self, x): + a = P.ReduceMean()(True, x, 1) # B, 1, C a = self.fc(a) x = a * x return x +class Attention(nn.Cell): + """Attention""" + + def __init__(self, dim, hidden_dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.): + super().__init__() + self.hidden_dim = hidden_dim + self.num_heads = num_heads + head_dim = hidden_dim // num_heads + self.head_dim = head_dim + # NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights + self.scale = qk_scale or head_dim ** -0.5 + + # self.qk = nn.Dense(in_channels=dim, out_channels=hidden_dim * 2, has_bias=qkv_bias) + self.q = nn.Dense(in_channels=dim, out_channels=hidden_dim, has_bias=qkv_bias) + self.k = nn.Dense(in_channels=dim, out_channels=hidden_dim, has_bias=qkv_bias) + self.v = nn.Dense(in_channels=dim, out_channels=dim, has_bias=qkv_bias) + self.attn_drop = nn.Dropout(p=attn_drop) + self.proj = nn.Dense(in_channels=dim, out_channels=dim, has_bias=False) + self.proj_drop = nn.Dropout(p=proj_drop) + self.softmax = nn.Softmax(axis=-1) + self.matmul = P.BatchMatMul() + + def construct(self, x): + """Attention construct""" + B, N, _ = x.shape + q = P.Reshape()(self.q(x), (B, N, self.num_heads, self.head_dim)) + q = P.Transpose()(q, (0, 2, 1, 3)) + + k = P.Reshape()(self.k(x), (B, N, self.num_heads, self.head_dim)) + k = P.Transpose()(k, (0, 2, 1, 3)) + # qk = P.Reshape()(self.qk(x), (B, N, 2, self.num_heads, self.head_dim)) + # qk = P.Transpose()(qk, (2, 0, 3, 1, 4)) + + v = P.Reshape()(self.v(x), (B, N, self.num_heads, -1)) + v = P.Transpose()(v, (0, 2, 1, 3)) + + attn = self.matmul(q, P.Transpose()(k, (0, 1, 3, 2))) * self.scale + attn = self.softmax(attn) + attn = self.attn_drop(attn) + + x = P.Transpose()(self.matmul(attn, v), (0, 2, 1, 3)) + x = P.Reshape()(x, (B, N, -1)) + x = self.proj(x) + x = self.proj_drop(x) + return x + + class Block(nn.Cell): - """ - TNT base block - - Args: - outer_dim(int): Number of output features - inner_dim(int): Number of internal features - outer_num_heads(int): Number of output heads - inner_num_heads(int): Number of internal heads - num_words(int): Number of 'visual words' (feature groups) - mlp_ratio(float): Rate of MLP per hidden features - qkv_bias(bool): Use Qk / v bias - qk_scale(float): Qk scale - drop(float): Dropout rate - attn_drop(float): Dropout rate of attention layer - drop_path(float): Path dropout rate - act_layer(class): Activation layer (class) - norm_layer(class): Normalization layer - se(int): SE parameter - """ + """ TNT Block""" - def __init__(self, 
outer_dim, inner_dim, outer_num_heads, - inner_num_heads, num_words, mlp_ratio=4., - qkv_bias=False, qk_scale=None, drop=0., - attn_drop=0., drop_path=0., act_layer=nn.GELU, + def __init__(self, outer_dim, inner_dim, outer_num_heads, inner_num_heads, num_words, mlp_ratio=4., + qkv_bias=False, qk_scale=None, drop=0., attn_drop=0., drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, se=0): super().__init__() self.has_inner = inner_dim > 0 if self.has_inner: # Inner - self.inner_norm1 = norm_layer((inner_dim,), epsilon=1e-5) + self.inner_norm1 = norm_layer((inner_dim,)) self.inner_attn = Attention( inner_dim, inner_dim, num_heads=inner_num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop) - self.inner_norm2 = norm_layer((inner_dim,), epsilon=1e-5) + self.inner_norm2 = norm_layer((inner_dim,)) self.inner_mlp = Mlp(in_features=inner_dim, hidden_features=int(inner_dim * mlp_ratio), out_features=inner_dim, act_layer=act_layer, drop=drop) - self.proj_norm1 = norm_layer((num_words * inner_dim,), epsilon=1e-5) + self.proj_norm1 = norm_layer((num_words * inner_dim,)) self.proj = nn.Dense(in_channels=num_words * inner_dim, out_channels=outer_dim, has_bias=False) - self.proj_norm2 = norm_layer((outer_dim,), epsilon=1e-5) + self.proj_norm2 = norm_layer((outer_dim,)) # Outer - self.outer_norm1 = norm_layer((outer_dim,), epsilon=1e-5) + self.outer_norm1 = norm_layer((outer_dim,)) self.outer_attn = Attention( outer_dim, outer_dim, num_heads=outer_num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop) self.drop_path = DropPath1D(drop_path) - self.outer_norm2 = norm_layer((outer_dim,), epsilon=1e-5) + self.outer_norm2 = norm_layer((outer_dim,)) self.outer_mlp = Mlp(in_features=outer_dim, hidden_features=int(outer_dim * mlp_ratio), out_features=outer_dim, act_layer=act_layer, drop=drop) # SE @@ -159,79 +187,65 @@ class Block(nn.Cell): self.se_layer = SE(outer_dim, 0.25) self.zeros = Tensor(np.zeros([1, 1, 1]), dtype=mstype.float32) - self.reshape = P.Reshape() - self.cast = P.Cast() - - def construct(self, *inputs, **kwargs): + def construct(self, inner_tokens, outer_tokens): """TNT Block construct""" - inner_tokens, outer_tokens = inputs[0], inputs[1] if self.has_inner: - in1 = self.inner_norm1(inner_tokens) - attn1 = self.inner_attn(in1) - inner_tokens = inner_tokens + self.drop_path(attn1) # B*N, k*k, c - in2 = self.inner_norm2(inner_tokens) - mlp = self.inner_mlp(in2) - inner_tokens = inner_tokens + self.drop_path(mlp) # B*N, k*k, c - b, n, _ = P.Shape()(outer_tokens) - # zeros = P.Tile()(self.zeros, (B, 1, C)) - proj = self.proj_norm2(self.proj(self.proj_norm1( - self.reshape(inner_tokens, (b, n - 1, -1,)) - ))) - proj = self.cast(proj, mstype.float32) - # proj = P.Concat(1)((zeros, proj)) - # outer_tokens = outer_tokens + proj # B, N, C - outer_tokens[:, 1:] = outer_tokens[:, 1:] + proj + inner_tokens = inner_tokens + self.drop_path(self.inner_attn(self.inner_norm1(inner_tokens))) # B*N, k*k, c + inner_tokens = inner_tokens + self.drop_path(self.inner_mlp(self.inner_norm2(inner_tokens))) # B*N, k*k, c + B, N, C = P.Shape()(outer_tokens) + zeros = P.Tile()(self.zeros, (B, 1, C)) + proj = self.proj_norm2(self.proj(self.proj_norm1(P.Reshape()(inner_tokens, (B, N - 1, -1,))))) + proj = P.Cast()(proj, mstype.float32) + proj = P.Concat(1)((zeros, proj)) + outer_tokens = outer_tokens + proj # B, N, C if self.se > 0: - outer_tokens = outer_tokens + self.drop_path( - self.outer_attn(self.outer_norm1(outer_tokens))) + outer_tokens = outer_tokens 
+ self.drop_path(self.outer_attn(self.outer_norm1(outer_tokens))) tmp_ = self.outer_mlp(self.outer_norm2(outer_tokens)) - outer_tokens = outer_tokens + self.drop_path( - tmp_ + self.se_layer(tmp_)) + outer_tokens = outer_tokens + self.drop_path(tmp_ + self.se_layer(tmp_)) else: - outer_tokens = outer_tokens + self.drop_path( - self.outer_attn(self.outer_norm1(outer_tokens))) - outer_tokens = outer_tokens + self.drop_path( - self.outer_mlp(self.outer_norm2(outer_tokens))) + outer_tokens = outer_tokens + self.drop_path(self.outer_attn(self.outer_norm1(outer_tokens))) + outer_tokens = outer_tokens + self.drop_path(self.outer_mlp(self.outer_norm2(outer_tokens))) return inner_tokens, outer_tokens -class TNT(nn.Cell): +class PatchEmbed(nn.Cell): + """ Image to Visual Word Embedding """ - TNT (Transformer in Transformer) for computer vision - - Args: - img_size(int): Image size (side, px) - patch_size(int): Patch size (side, px) - in_chans(int): Number of input channels - num_classes(int): Number of output classes - outer_dim(int): Number of output features - inner_dim(int): Number of internal features - depth(int): Number of TNT base blocks - outer_num_heads(int): Number of output heads - inner_num_heads(int): Number of internal heads - mlp_ratio(float): Rate of MLP per hidden features - qkv_bias(bool): Use Qk / v bias - qk_scale(float): Qk scale - drop_rate(float): Dropout rate - attn_drop_rate(float): Dropout rate for attention layer - drop_path_rate(float): Dropout rate for DropPath layer - norm_layer(class): Normalization layer - inner_stride(int): Number of strides for internal patches - se(int): SE parameter + + def __init__(self, img_size=224, patch_size=16, in_chans=3, outer_dim=768, inner_dim=24, inner_stride=4): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0]) + self.img_size = img_size + self.patch_size = patch_size + self.num_patches = num_patches + self.inner_dim = inner_dim + self.num_words = math.ceil(patch_size[0] / inner_stride) * math.ceil(patch_size[1] / inner_stride) + + self.unfold = UnfoldKernelEqPatch(kernel_size=patch_size, strides=patch_size) + self.proj = nn.Conv2d(in_channels=in_chans, out_channels=inner_dim, kernel_size=7, stride=inner_stride, + pad_mode='pad', padding=3, has_bias=False) + + def construct(self, x): + B = x.shape[0] + x = self.unfold(x) # B, Ck2, N + x = self.proj(x) # B*N, C, 8, 8 + x = P.Reshape()(x, (B * self.num_patches, self.inner_dim, -1,)) # B*N, 8*8, C + x = P.Transpose()(x, (0, 2, 1)) + return x + + +class TNT(nn.Cell): + """ TNT (Transformer in Transformer) for computer vision """ - def __init__(self, img_size=224, patch_size=16, in_chans=3, - num_classes=1000, outer_dim=768, inner_dim=48, - depth=12, outer_num_heads=12, inner_num_heads=4, - mlp_ratio=4., qkv_bias=False, qk_scale=None, - # drop_rate=0., attn_drop_rate=0., drop_path_rate=0., - # norm_layer=LayerNormFixOrder, inner_stride=4, se=0, - drop_rate=0., attn_drop_rate=0., drop_path_rate=0., - norm_layer=nn.LayerNorm, inner_stride=4, se=0, + def __init__(self, img_size=224, patch_size=16, in_chans=3, num_classes=1000, outer_dim=768, inner_dim=48, + depth=12, outer_num_heads=12, inner_num_heads=4, mlp_ratio=4., qkv_bias=False, qk_scale=None, + drop_rate=0., attn_drop_rate=0., drop_path_rate=0., norm_layer=nn.LayerNorm, inner_stride=4, se=0, **kwargs): super().__init__() - _ = kwargs self.num_classes = num_classes self.outer_dim = outer_dim @@ -241,16 +255,16 @@ class 
TNT(nn.Cell): self.num_patches = num_patches = self.patch_embed.num_patches num_words = self.patch_embed.num_words - self.proj_norm1 = norm_layer((num_words * inner_dim,), epsilon=1e-5) - self.proj = nn.Dense(in_channels=num_words * inner_dim, out_channels=outer_dim, has_bias=True) - self.proj_norm2 = norm_layer((outer_dim,), epsilon=1e-5) + self.proj_norm1 = norm_layer((num_words * inner_dim,)) + self.proj = nn.Dense(in_channels=num_words * inner_dim, out_channels=outer_dim, has_bias=False) + self.proj_norm2_tnt = norm_layer((outer_dim,)) self.cls_token = Parameter(Tensor(trunc_array([1, 1, outer_dim]), dtype=mstype.float32), name="cls_token", requires_grad=True) self.outer_pos = Parameter(Tensor(trunc_array([1, num_patches + 1, outer_dim]), dtype=mstype.float32), name="outer_pos") self.inner_pos = Parameter(Tensor(trunc_array([1, num_words, inner_dim]), dtype=mstype.float32)) - self.pos_drop = nn.Dropout(keep_prob=1.0 - drop_rate) + self.pos_drop = nn.Dropout(p=drop_rate) dpr = [x for x in np.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule vanilla_idxs = [] @@ -268,7 +282,6 @@ class TNT(nn.Cell): num_words=num_words, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer, se=se)) self.blocks = nn.CellList(blocks) - # self.norm = norm_layer(outer_dim, eps=1e-5) self.norm = norm_layer((outer_dim,)) # NOTE as per official impl, we could have a pre-logits representation dense layer + tanh here @@ -279,12 +292,7 @@ class TNT(nn.Cell): mask = np.zeros([1, num_patches + 1, 1]) mask[:, 0] = 1 self.mask = Tensor(mask, dtype=mstype.float32) - self.head = nn.Dense(in_channels=outer_dim, out_channels=num_classes, has_bias=True) - - self.reshape = P.Reshape() - self.concat = P.Concat(1) - self.tile = P.Tile() - self.cast = P.Cast() + self.head = nn.Dense(in_channels=outer_dim, out_channels=num_classes, has_bias=False) self.init_weights() print("================================success================================") @@ -310,18 +318,13 @@ class TNT(nn.Cell): def forward_features(self, x): """TNT forward_features""" - b = x.shape[0] + B = x.shape[0] inner_tokens = self.patch_embed(x) + self.inner_pos # B*N, 8*8, C - outer_tokens = self.proj_norm2( - self.proj(self.proj_norm1( - self.reshape(inner_tokens, (b, self.num_patches, -1,)) - )) - ) - outer_tokens = self.cast(outer_tokens, mstype.float32) - outer_tokens = self.concat(( - self.tile(self.cls_token, (b, 1, 1)), outer_tokens - )) + outer_tokens = self.proj_norm2_tnt( + self.proj(self.proj_norm1(P.Reshape()(inner_tokens, (B, self.num_patches, -1,))))) + outer_tokens = P.Cast()(outer_tokens, mstype.float32) + outer_tokens = P.Concat(1)((P.Tile()(self.cls_token, (B, 1, 1)), outer_tokens)) outer_tokens = outer_tokens + self.outer_pos outer_tokens = self.pos_drop(outer_tokens) @@ -332,8 +335,7 @@ class TNT(nn.Cell): outer_tokens = self.norm(outer_tokens) # [batch_size, num_patch+1, outer_dim) return outer_tokens[:, 0] - def construct(self, *inputs, **kwargs): - x = inputs[0] + def construct(self, x): x = self.forward_features(x) x = self.head(x) return x @@ -348,13 +350,12 @@ def tnt_s_patch16_224(args): inner_dim = 24 outer_num_heads = 6 inner_num_heads = 4 - depth = 12 drop_path_rate = args.drop_path_rate drop_out = args.drop_out num_classes = args.num_classes outer_dim = make_divisible(outer_dim, outer_num_heads) inner_dim = make_divisible(inner_dim, inner_num_heads) - model = TNT(img_size=224, patch_size=patch_size, outer_dim=outer_dim, 
inner_dim=inner_dim, depth=depth, + model = TNT(img_size=224, patch_size=patch_size, outer_dim=outer_dim, inner_dim=inner_dim, depth=12, outer_num_heads=outer_num_heads, inner_num_heads=inner_num_heads, qkv_bias=False, inner_stride=inner_stride, drop_path_rate=drop_path_rate, drop_out=drop_out, num_classes=num_classes) return model @@ -369,32 +370,12 @@ def tnt_b_patch16_224(args): inner_dim = 40 outer_num_heads = 10 inner_num_heads = 4 - depth = 12 drop_path_rate = args.drop_path_rate drop_out = args.drop_out num_classes = args.num_classes outer_dim = make_divisible(outer_dim, outer_num_heads) inner_dim = make_divisible(inner_dim, inner_num_heads) - model = TNT(img_size=224, patch_size=patch_size, outer_dim=outer_dim, inner_dim=inner_dim, depth=depth, + model = TNT(img_size=224, patch_size=patch_size, outer_dim=outer_dim, inner_dim=inner_dim, depth=12, outer_num_heads=outer_num_heads, inner_num_heads=inner_num_heads, qkv_bias=False, inner_stride=inner_stride, drop_path_rate=drop_path_rate, drop_out=drop_out, num_classes=num_classes) return model - - -@dataclass -class NetworkParams: - num_classes: int - drop_path_rate: float - drop_out: float - - -def get_model_by_name(arch, num_classes, drop_path_rate, drop_out, - **kwargs) -> TNT: - """get network by name and initialize it""" - _ = kwargs - models = { - 'tnt_s_patch16_224': tnt_s_patch16_224, - 'tnt_b_patch16_224': tnt_b_patch16_224 - } - args = NetworkParams(num_classes, drop_path_rate, drop_out) - return models[arch](args) diff --git a/research/cv/tnt/src/tools/cell.py b/research/cv/tnt/src/tools/cell.py index e506319be..7e886ad6f 100644 --- a/research/cv/tnt/src/tools/cell.py +++ b/research/cv/tnt/src/tools/cell.py @@ -1,4 +1,4 @@ -# Copyright 2023 Huawei Technologies Co., Ltd +# Copyright 2021 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -17,6 +17,8 @@ import mindspore.nn as nn from mindspore import dtype as mstype from mindspore.ops import functional as F +from src.args import args + class OutputTo16(nn.Cell): "Wrap cell for amp. Cast network output back to float16" @@ -25,8 +27,7 @@ class OutputTo16(nn.Cell): super(OutputTo16, self).__init__(auto_prefix=False) self._op = op - def construct(self, *inputs, **kwargs): - x = inputs[0] + def construct(self, x): return F.cast(self._op(x), mstype.float16) @@ -37,7 +38,7 @@ def do_keep_fp16(network, cell_types): cell.to_float(mstype.float16) -def cast_amp(net, args): +def cast_amp(net): """cast network amp_level""" if args.amp_level == "O2": cell_types = (nn.Dense,) diff --git a/research/cv/tnt/src/tools/criterion.py b/research/cv/tnt/src/tools/criterion.py index 7d73a4028..ee963c1fe 100644 --- a/research/cv/tnt/src/tools/criterion.py +++ b/research/cv/tnt/src/tools/criterion.py @@ -1,4 +1,4 @@ -# Copyright 2023 Huawei Technologies Co., Ltd +# Copyright 2021 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -31,11 +31,10 @@ class SoftTargetCrossEntropy(LossBase): self.sum_ops = P.ReduceSum(keep_dims=False) self.log_softmax = P.LogSoftmax() - def construct(self, logits, labels): - logits = P.Cast()(logits, mstype.float32) - labels = P.Cast()(labels, mstype.float32) - # pylint: disable=invalid-unary-operand-type - loss = self.sum_ops(-labels * self.log_softmax(logits), -1) + def construct(self, logit, label): + logit = P.Cast()(logit, mstype.float32) + label = P.Cast()(label, mstype.float32) + loss = self.sum_ops(-label * self.log_softmax(logit), -1) return self.mean_ops(loss) @@ -51,11 +50,10 @@ class CrossEntropySmooth(LossBase): self.ce = nn.SoftmaxCrossEntropyWithLogits(reduction=reduction) self.cast = ops.Cast() - def construct(self, logits, labels): + def construct(self, logit, label): if self.sparse: - labels = self.onehot(labels, F.shape(logits)[1], - self.on_value, self.off_value) - loss2 = self.ce(logits, labels) + label = self.onehot(label, F.shape(logit)[1], self.on_value, self.off_value) + loss2 = self.ce(logit, label) return loss2 @@ -89,8 +87,7 @@ class NetWithLoss(nn.Cell): self.model = model self.criterion = criterion - def construct(self, *inputs, **kwargs): - data, label = inputs[:2] + def construct(self, data, label): predict = self.model(data) loss = self.criterion(predict, label) return loss diff --git a/research/cv/tnt/src/tools/get_misc.py b/research/cv/tnt/src/tools/get_misc.py index 6c841c97b..73ae63120 100644 --- a/research/cv/tnt/src/tools/get_misc.py +++ b/research/cv/tnt/src/tools/get_misc.py @@ -1,4 +1,4 @@ -# Copyright 2023 Huawei Technologies Co., Ltd +# Copyright 2021 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -44,7 +44,7 @@ def set_device(args): rank = get_rank() else: - context.set_context(device_id=args.device_id[rank]) + context.set_context(device_id=args.device_id) elif device_target == "GPU": if device_num > 1: init(backend_name='nccl') @@ -53,14 +53,14 @@ def set_device(args): gradients_mean=True) rank = get_rank() else: - context.set_context(device_id=args.device_id[rank]) + context.set_context(device_id=args.device_id) else: raise ValueError("Unsupported platform.") return rank -def get_dataset(args, training=True) -> data.ImageNet: +def get_dataset(args, training=True): """"Get model according to args.set""" print(f"=> Getting {args.set} dataset") dataset = getattr(data, args.set)(args, training) @@ -76,7 +76,7 @@ def get_model(args): return model -def pretrained(args, model, exclude_epoch_state=True): +def pretrained(args, model): """"Load pretrained weights if args.pretrained is given""" if args.run_modelarts: print('Download data.') @@ -101,13 +101,6 @@ def pretrained(args, model, exclude_epoch_state=True): if value.shape[0] != args.num_classes: print(f'==> removing {key} with shape {value.shape}') param_dict.pop(key) - if exclude_epoch_state: - if 'epoch_num' in param_dict: - param_dict.pop('epoch_num') - if 'step_num' in param_dict: - param_dict.pop('step_num') - if 'learning_rate' in param_dict: - param_dict.pop('learning_rate') load_param_into_net(model, param_dict) else: print("=> no pretrained weights found at '{}'".format(args.pretrained)) diff --git a/research/cv/tnt/src/tools/optimizer.py b/research/cv/tnt/src/tools/optimizer.py index 9c42a98f7..b7d80d52e 100644 --- a/research/cv/tnt/src/tools/optimizer.py +++ b/research/cv/tnt/src/tools/optimizer.py @@ -1,4 +1,4 @@ -# Copyright 2023 Huawei Technologies Co., Ltd +# Copyright 2021 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/research/cv/tnt/src/tools/schedulers.py b/research/cv/tnt/src/tools/schedulers.py index c1bbe4b69..dddc77243 100644 --- a/research/cv/tnt/src/tools/schedulers.py +++ b/research/cv/tnt/src/tools/schedulers.py @@ -1,4 +1,4 @@ -# Copyright 2023 Huawei Technologies Co., Ltd +# Copyright 2021 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/research/cv/tnt/src/trainers/__init__.py b/research/cv/tnt/src/trainers/__init__.py index 9d38bfada..077e7628e 100644 --- a/research/cv/tnt/src/trainers/__init__.py +++ b/research/cv/tnt/src/trainers/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2021-2023 Huawei Technologies Co., Ltd +# Copyright 2021 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/research/cv/tnt/src/trainers/train_one_step_with_scale_and_clip_global_norm.py b/research/cv/tnt/src/trainers/train_one_step_with_scale_and_clip_global_norm.py index 1b5de92b5..ab85b5248 100644 --- a/research/cv/tnt/src/trainers/train_one_step_with_scale_and_clip_global_norm.py +++ b/research/cv/tnt/src/trainers/train_one_step_with_scale_and_clip_global_norm.py @@ -1,4 +1,4 @@ -# Copyright 2023 Huawei Technologies Co., Ltd +# Copyright 2021-2022 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
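
A note on the `pretrained` helper simplified in the hunk above: it follows a common MindSpore fine-tuning pattern in which checkpoint entries whose output dimension disagrees with the current `num_classes` are dropped before loading. The sketch below is a minimal, hypothetical reconstruction of that pattern, not the repository's exact helper; the function name `load_pretrained` and the `'head'` prefix test are illustrative assumptions.

```python
import mindspore

def load_pretrained(model, ckpt_path, num_classes):
    # Load a checkpoint and drop classifier weights whose first dimension
    # no longer matches num_classes (e.g. a 1000-way ImageNet head reused
    # on a smaller label set), so load_param_into_net does not fail on a
    # shape mismatch. The 'head' prefix is an assumption for illustration.
    param_dict = mindspore.load_checkpoint(ckpt_path)
    for key in list(param_dict.keys()):
        value = param_dict[key]
        if key.startswith('head') and value.shape[0] != num_classes:
            print(f'==> removing {key} with shape {value.shape}')
            param_dict.pop(key)
    mindspore.load_param_into_net(model, param_dict)
```
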
diff --git a/research/cv/tnt/train.py b/research/cv/tnt/train.py index a275d1990..f0bb7a77f 100644 --- a/research/cv/tnt/train.py +++ b/research/cv/tnt/train.py @@ -1,4 +1,4 @@ -# Copyright 2023 Huawei Technologies Co., Ltd +# Copyright 2021 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,35 +12,30 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================ -"""Training script for TNT model""" -import time -import datetime -import functools +"""train""" +import os from mindspore import Model from mindspore import context from mindspore import nn from mindspore.common import set_seed +from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor -from src.tools.common import get_callbacks +from src.args import args +from src.tools.callback import EvaluateCallBack from src.tools.cell import cast_amp -# from src.tools.callbacks import StopAtEpoch from src.tools.criterion import get_criterion, NetWithLoss -from src.tools.get_misc import ( - get_dataset, set_device, get_model, pretrained, get_train_one_step -) +from src.tools.get_misc import get_dataset, set_device, get_model, pretrained, get_train_one_step from src.tools.optimizer import get_optimizer def main(): - from src.args import args set_seed(args.seed) mode = { 0: context.GRAPH_MODE, 1: context.PYNATIVE_MODE } - context.set_context(mode=mode[args.pynative_mode], - device_target=args.device_target) + context.set_context(mode=mode[args.graph_mode], device_target=args.device_target) context.set_context(enable_graph_kernel=False) if args.device_target == "Ascend": context.set_context(enable_auto_mixed_precision=True) @@ -48,9 +43,11 @@ def main(): # get model and cast amp_level net = get_model(args) - cast_amp(net, args) + cast_amp(net) criterion = get_criterion(args) net_with_loss = NetWithLoss(net, criterion) + if args.pretrained: + pretrained(args, net) data = get_dataset(args) batch_num = data.train_dataset.get_dataset_size() @@ -58,54 +55,38 @@ def main(): # save a yaml file to read to record parameters net_with_loss = get_train_one_step(args, net_with_loss, optimizer) - if args.pretrained: - pretrained(args, net_with_loss, args.exclude_epoch_state) - eval_network = nn.WithEvalCell(net, criterion, - args.amp_level in ["O2", "O3", "auto"]) + eval_network = nn.WithEvalCell(net, criterion, args.amp_level in ["O2", "O3", "auto"]) eval_indexes = [0, 1, 2] model = Model(net_with_loss, metrics={"acc", "loss"}, eval_network=eval_network, eval_indexes=eval_indexes) - cur_name = datetime.datetime.now().strftime('%y-%m-%d_%H%M%S') - ckpt_save_dir = "{}/{}_{}".format(args.dir_ckpt, cur_name, rank) - ckpt_best_save_dir = "{}/{}_{}".format(args.dir_best_ckpt, cur_name, rank) - summary_dir = "{}/{}".format(args.dir_summary, cur_name) - # if args.run_modelarts: - # ckpt_save_dir = "/cache/ckpt_" + str(rank) + config_ck = CheckpointConfig(save_checkpoint_steps=data.train_dataset.get_dataset_size(), + keep_checkpoint_max=args.save_every) + time_cb = TimeMonitor(data_size=data.train_dataset.get_dataset_size()) - cb = get_callbacks( - args.arch, rank, data.train_dataset.get_dataset_size(), - data.val_dataset.get_dataset_size(), ckpt_save_dir, ckpt_best_save_dir, - summary_dir, args.save_ckpt_every_step, args.save_ckpt_every_sec, - args.save_ckpt_keep, print_loss_every=100, - 
collect_graph=args.dump_graph - ) + ckpt_save_dir = "./ckpt_" + str(rank) + if args.run_modelarts: + ckpt_save_dir = "/cache/ckpt_" + str(rank) - print("begin train") - print('Number of parameters:', - sum(functools.reduce(lambda x, y: x * y, params.shape) - for params in net.trainable_params())) - print('Number of samples in dataset:' - ' train={}, val={}'.format(data.train_dataset.get_dataset_size(), - data.val_dataset.get_dataset_size())) - # cb.append(StopAtEpoch(summary_dir, 1, args.epochs - args.start_epoch)) + ckpoint_cb = ModelCheckpoint(prefix=args.arch + str(rank), directory=ckpt_save_dir, + config=config_ck) + loss_cb = LossMonitor() + eval_cb = EvaluateCallBack(model, eval_dataset=data.val_dataset, src_url=ckpt_save_dir, + train_url=os.path.join(args.train_url, "ckpt_" + str(rank)), + save_freq=args.save_every) - sink_mode = True - t1 = time.time() - model.fit(int(args.epochs - args.start_epoch), data.train_dataset, - data.val_dataset, callbacks=cb, dataset_sink_mode=sink_mode) - t2 = time.time() - dt = 1000 * (t2 - t1) - print('Total training time: {:.3f} ms, time per epoch: {:.3f} ms,' - ' time per batch: {:.3f} ms, time per element: {:.3f} ms' - .format(dt, dt / args.epochs, - dt / args.epochs / data.train_dataset.get_dataset_size(), - dt / args.epochs / - data.train_dataset.get_dataset_size() / args.batch_size)) + print("begin train") + model.train(int(args.epochs - args.start_epoch), data.train_dataset, + callbacks=[time_cb, ckpoint_cb, loss_cb, eval_cb], + dataset_sink_mode=True) print("train success") + if args.run_modelarts: + import moxing as mox + mox.file.copy_parallel(src_url=ckpt_save_dir, dst_url=os.path.join(args.train_url, "ckpt_" + str(rank))) + if __name__ == '__main__': main() diff --git a/research/nlp/tprr/README.md b/research/nlp/tprr/README.md index 3f9ae2daf..646132c50 100644 --- a/research/nlp/tprr/README.md +++ b/research/nlp/tprr/README.md @@ -147,7 +147,7 @@ Parameters for re-ranker and reader evaluation can be passed directly at executi ``` Evaluation result will be stored in the scripts path, whose folder name begins with "eval_tr". You can find the result like the - followings in log. + following in log. ```python ###step###: 0 @@ -175,7 +175,7 @@ Parameters for re-ranker and reader evaluation can be passed directly at executi ``` Evaluation result will be stored in the scripts path, whose folder name begins with "eval". You can find the result like the - followings in log. + following in log. ```python total top1 pem: 0.8803511141120864 diff --git a/research/nlp/transX/README.md b/research/nlp/transX/README.md index 04500ec0b..deb4b4325 100644 --- a/research/nlp/transX/README.md +++ b/research/nlp/transX/README.md @@ -297,7 +297,7 @@ bash scripts/run_eval_gpu.sh [DATASET_ROOT] [DATASET_NAME] [MODEL_NAME] [CKPT_PA #### Result -Evaluation result will be stored in the scripts path. Under this, you can find result like the followings in log. +Evaluation result will be stored in the scripts path. Under this, you can find result like the following in log. The evaluation results will be stored in the **./eval-output** directory. If the shell script is used, the logged information will be redirected to the **./eval-logs** directory. 
-- Gitee From 461655253b7280080e53ea429eba531c8cf30157 Mon Sep 17 00:00:00 2001 From: daiyuxin0511 <455472400@qq.com> Date: Mon, 17 Jun 2024 14:41:38 +0800 Subject: [PATCH 41/44] update jit level --- official/cv/CycleGAN/eval.py | 4 ++-- official/cv/CycleGAN/train.py | 7 ++++--- official/cv/FasterRCNN/eval.py | 3 ++- official/cv/FasterRCNN/train.py | 5 +++-- official/cv/OpenPose/eval.py | 4 ++-- official/cv/OpenPose/train.py | 4 ++-- official/cv/ResNet/eval.py | 4 ++-- official/cv/ResNet/train.py | 23 +++++++++++------------ official/cv/Unet/eval.py | 2 +- official/cv/Unet/train.py | 7 +++---- official/nlp/Bert/pretrain_eval.py | 2 +- official/nlp/Bert/run_classifier.py | 3 ++- official/nlp/Bert/run_ner.py | 3 ++- official/nlp/Bert/run_pretrain.py | 11 ++++++----- official/nlp/Bert/run_squad.py | 3 ++- 15 files changed, 45 insertions(+), 40 deletions(-) diff --git a/official/cv/CycleGAN/eval.py b/official/cv/CycleGAN/eval.py index 44f3c11d4..5a0dcc1c1 100644 --- a/official/cv/CycleGAN/eval.py +++ b/official/cv/CycleGAN/eval.py @@ -27,8 +27,8 @@ from src.utils.tools import save_image, load_ckpt def predict(): """Predict function.""" args = get_args("predict") - mindspore.set_context(mode=0, device_target=args.platform, - save_graphs=args.save_graphs, device_id=args.device_id) + mindspore.set_context(mode=0, device_target=args.platform, save_graphs=args.save_graphs, device_id=args.device_id, + jit_config={"jit_level": "O2"}) args.rank = 0 args.device_num = 1 if args.platform == "GPU": diff --git a/official/cv/CycleGAN/train.py b/official/cv/CycleGAN/train.py index 9e0f1d8af..a81ae1604 100644 --- a/official/cv/CycleGAN/train.py +++ b/official/cv/CycleGAN/train.py @@ -36,15 +36,16 @@ def train(): """Train function.""" args = get_args("train") if args.device_num > 1: - mindspore.set_context(mode=0, device_target=args.platform, save_graphs=args.save_graphs) + mindspore.set_context(mode=0, device_target=args.platform, save_graphs=args.save_graphs, + jit_config={"jit_level": "O2"}) init() mindspore.reset_auto_parallel_context() mindspore.set_auto_parallel_context(parallel_mode=mindspore.ParallelMode.DATA_PARALLEL, gradients_mean=True) args.rank = get_rank() args.group_size = get_group_size() else: - mindspore.set_context(mode=0, device_target=args.platform, - save_graphs=args.save_graphs, device_id=args.device_id) + mindspore.set_context(mode=0, device_target=args.platform, save_graphs=args.save_graphs, + device_id=args.device_id, jit_config={"jit_level": "O2"}) args.rank = 0 args.device_num = 1 diff --git a/official/cv/FasterRCNN/eval.py b/official/cv/FasterRCNN/eval.py index 87cb9cd91..80d237846 100644 --- a/official/cv/FasterRCNN/eval.py +++ b/official/cv/FasterRCNN/eval.py @@ -199,6 +199,7 @@ def eval_fasterrcnn(): if __name__ == '__main__': set_seed(1) - mindspore.set_context(mode=0, device_target=config.device_target, device_id=get_device_id()) + mindspore.set_context(mode=0, device_target=config.device_target, device_id=get_device_id(), + jit_config={"jit_level": "O2"}) eval_fasterrcnn() diff --git a/official/cv/FasterRCNN/train.py b/official/cv/FasterRCNN/train.py index f5cd9abd4..7556a7a74 100644 --- a/official/cv/FasterRCNN/train.py +++ b/official/cv/FasterRCNN/train.py @@ -288,7 +288,8 @@ def train_fasterrcnn(): if __name__ == '__main__': set_seed(1) mindspore.set_context(mode=0, device_target=config.device_target, device_id=get_device_id(), - ascend_config={"ge_options": {"global": {"ge.exec.memoryOptimizationPolicy": ""}}}) + ascend_config={"ge_options": {"global": 
{"ge.exec.memoryOptimizationPolicy": ""}}}, + jit_config={"jit_level": "O2"}) set_ascend_max_device_memory() local_path = '/'.join(os.path.realpath(__file__).split('/')[:-1]) summary_dir = local_path + "/train/summary/" @@ -300,7 +301,7 @@ if __name__ == '__main__': rank = get_rank() device_num = get_group_size() mindspore.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, - gradients_mean=True) + gradients_mean=True) summary_dir += "thread_num_" + str(rank) + "/" else: rank = 0 diff --git a/official/cv/OpenPose/eval.py b/official/cv/OpenPose/eval.py index 410fc8ac5..2226d720e 100644 --- a/official/cv/OpenPose/eval.py +++ b/official/cv/OpenPose/eval.py @@ -37,8 +37,8 @@ from src.model_utils.device_adapter import get_device_id, get_rank_id, get_devic warnings.filterwarnings("ignore") devid = get_device_id() -mindspore.set_context(mode=0, - device_target=config.device_target, save_graphs=False, device_id=devid) +mindspore.set_context(mode=0, device_target=config.device_target, save_graphs=False, device_id=devid, + jit_config={"jit_level": "O2"}) show_gt = 0 diff --git a/official/cv/OpenPose/train.py b/official/cv/OpenPose/train.py index 6830a93b0..59b4c257e 100644 --- a/official/cv/OpenPose/train.py +++ b/official/cv/OpenPose/train.py @@ -32,7 +32,7 @@ from src.model_utils.device_adapter import get_rank_id, get_device_num set_seed(1) -mindspore.set_context(mode=0, device_target="Ascend", save_graphs=False) +mindspore.set_context(mode=0, device_target="Ascend", save_graphs=False, jit_config={"jit_level": "O2"}) def modelarts_pre_process(): @@ -49,7 +49,7 @@ def train(): if device_num > 1: init() mindspore.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, - gradients_mean=True) + gradients_mean=True) config.rank = get_rank_id() config.outputs_dir = os.path.join(config.outputs_dir, "ckpt_{}/".format(config.rank)) else: diff --git a/official/cv/ResNet/eval.py b/official/cv/ResNet/eval.py index 0b65eb324..582b25334 100644 --- a/official/cv/ResNet/eval.py +++ b/official/cv/ResNet/eval.py @@ -66,7 +66,7 @@ def eval_net(): """eval net""" target = config.device_target # init context - mindspore.set_context(mode=0, device_target=target, save_graphs=False) + mindspore.set_context(mode=0, device_target=target, save_graphs=False, jit_config={"jit_level": "O2"}) if target == "Ascend": device_id = int(os.getenv('DEVICE_ID', '0')) mindspore.set_context(device_id=device_id) @@ -101,7 +101,7 @@ def eval_net(): # define model, add boostmode for eval scenarios with train.py model = mindspore.Model(net, loss_fn=loss, boost_level=config.boost_mode, - optimizer=opt, metrics={'top_1_accuracy', 'top_5_accuracy'}) + optimizer=opt, metrics={'top_1_accuracy', 'top_5_accuracy'}) # eval model res = model.eval(dataset) diff --git a/official/cv/ResNet/train.py b/official/cv/ResNet/train.py index 67ba139ea..1e5315ba1 100644 --- a/official/cv/ResNet/train.py +++ b/official/cv/ResNet/train.py @@ -73,9 +73,10 @@ def set_parameter(): if target == "Ascend": rank_save_graphs_path = os.path.join(config.save_graphs_path, "soma", str(os.getenv('DEVICE_ID', '0'))) mindspore.set_context(mode=0, device_target=target, save_graphs=config.save_graphs, - save_graphs_path=rank_save_graphs_path) + save_graphs_path=rank_save_graphs_path, jit_config={"jit_level": "O2"}) else: - mindspore.set_context(mode=0, device_target=target, save_graphs=config.save_graphs) + mindspore.set_context(mode=0, device_target=target, save_graphs=config.save_graphs, + 
jit_config={"jit_level": "O2"}) set_graph_kernel_context(target, config.net_name) else: mindspore.set_context(mode=1, device_target=target, save_graphs=False) @@ -86,8 +87,8 @@ def set_parameter(): if target == "Ascend": device_id = int(os.getenv('DEVICE_ID', '0')) mindspore.set_context(device_id=device_id) - mindspore.set_auto_parallel_context(device_num=config.device_num, parallel_mode=mindspore.ParallelMode.DATA_PARALLEL, - gradients_mean=True) + mindspore.set_auto_parallel_context(device_num=config.device_num, gradients_mean=True, + parallel_mode=mindspore.ParallelMode.DATA_PARALLEL) set_algo_parameters(elementwise_op_strategy_follow=True) if config.net_name == "resnet50" or config.net_name == "se-resnet50": if config.boost_mode not in ["O1", "O2"]: @@ -98,9 +99,8 @@ def set_parameter(): # GPU target else: init() - mindspore.set_auto_parallel_context(device_num=get_device_num(), - parallel_mode=mindspore.ParallelMode.DATA_PARALLEL, - gradients_mean=True) + mindspore.set_auto_parallel_context(device_num=get_device_num(), gradients_mean=True, + parallel_mode=mindspore.ParallelMode.DATA_PARALLEL) if config.net_name == "resnet50": mindspore.set_auto_parallel_context(all_reduce_fusion_config=config.all_reduce_fusion_config) config.rank_id = get_rank() if config.run_distribute else 0 @@ -160,8 +160,8 @@ def train_net(): init_weight(net, config) if config.resume_ckpt: - resume_param = mindspore.load_checkpoint(config.resume_ckpt, - choice_func=lambda x: not x.startswith(('learning_rate', 'global_step'))) + resume_param = mindspore.load_checkpoint(config.resume_ckpt, choice_func=\ + lambda x: not x.startswith(('learning_rate', 'global_step'))) config.start_epoch = int(resume_param.get('epoch_num', mindspore.Tensor(0, mindspore.int32)).asnumpy().item()) lr = mindspore.Tensor(init_lr(step_size=step_size)) @@ -183,9 +183,8 @@ def train_net(): model = mindspore.Model(net, loss_fn=loss, optimizer=opt, metrics=metrics, eval_network=dist_eval_network) else: model = mindspore.Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics=metrics, - amp_level="O3", boost_level=config.boost_mode, - eval_network=dist_eval_network, - boost_config_dict={"grad_freeze": {"total_steps": config.epoch_size * step_size}}) + amp_level="O3", boost_level=config.boost_mode, eval_network=dist_eval_network, + boost_config_dict={"grad_freeze": {"total_steps": config.epoch_size * step_size}}) if config.optimizer == "Thor" and config.dataset == "imagenet2012": from src.lr_generator import get_thor_damping diff --git a/official/cv/Unet/eval.py b/official/cv/Unet/eval.py index 5ab4b111f..b5ee1ae8b 100644 --- a/official/cv/Unet/eval.py +++ b/official/cv/Unet/eval.py @@ -60,7 +60,7 @@ def test_net(data_dir, if __name__ == '__main__': logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s') - mindspore.set_context(mode=0, device_target=config.device_target, save_graphs=False) + mindspore.set_context(mode=0, device_target=config.device_target, save_graphs=False, jit_config={"jit_level": "O2"}) if config.device_target == "Ascend": device_id = get_device_id() mindspore.set_context(device_id=device_id) diff --git a/official/cv/Unet/train.py b/official/cv/Unet/train.py index 1ca94e640..ecfeb72c3 100644 --- a/official/cv/Unet/train.py +++ b/official/cv/Unet/train.py @@ -50,9 +50,7 @@ def train_net(cross_valid_ind=1, group_size = get_group_size() rank = get_rank() parallel_mode = ParallelMode.DATA_PARALLEL - mindspore.set_auto_parallel_context(parallel_mode=parallel_mode, - device_num=group_size, 
- gradients_mean=False) + mindspore.set_auto_parallel_context(parallel_mode=parallel_mode, device_num=group_size, gradients_mean=False) need_slice = False if config.model_name == 'unet_medical': net = UNetMedical(n_channels=config.num_channels, n_classes=config.num_classes) @@ -130,7 +128,8 @@ def train_net(cross_valid_ind=1, if __name__ == '__main__': logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s') # to keep GetNext from timeout, set op_timeout=600 - mindspore.set_context(mode=0, device_target=config.device_target, save_graphs=False, op_timeout=600) + mindspore.set_context(mode=0, device_target=config.device_target, save_graphs=False, op_timeout=600, + jit_config={"jit_level": "O2"}) if config.device_target == "Ascend": device_id = get_device_id() mindspore.set_context(device_id=device_id) diff --git a/official/nlp/Bert/pretrain_eval.py b/official/nlp/Bert/pretrain_eval.py index 51f804467..84c87a9a4 100644 --- a/official/nlp/Bert/pretrain_eval.py +++ b/official/nlp/Bert/pretrain_eval.py @@ -32,7 +32,7 @@ def bert_predict(): Predict function ''' devid = int(os.getenv('DEVICE_ID')) - mindspore.set_context(mode=0, device_target="Ascend", device_id=devid) + mindspore.set_context(mode=0, device_target="Ascend", device_id=devid, jit_config={"jit_level": "O2"}) dataset = create_eval_dataset(cfg.batch_size, 1, data_dir=cfg.eval_data_dir, dataset_format=cfg.dataset_format) net_for_pretraining = BertPretrainEval(bert_net_cfg) net_for_pretraining.set_train(False) diff --git a/official/nlp/Bert/run_classifier.py b/official/nlp/Bert/run_classifier.py index e9c0e7b58..e72dcfb67 100644 --- a/official/nlp/Bert/run_classifier.py +++ b/official/nlp/Bert/run_classifier.py @@ -170,7 +170,8 @@ def run_classifier(): load_finetune_checkpoint_path = args_opt.load_finetune_checkpoint_path target = args_opt.device_target if target == "Ascend": - mindspore.set_context(mode=0, device_target="Ascend", device_id=args_opt.device_id) + mindspore.set_context(mode=0, device_target="Ascend", device_id=args_opt.device_id, + jit_config={"jit_level": "O2"}) elif target == "GPU": mindspore.set_context(mode=0, device_target="GPU") mindspore.set_context(enable_graph_kernel=True) diff --git a/official/nlp/Bert/run_ner.py b/official/nlp/Bert/run_ner.py index 99b69a2df..ec7acf369 100644 --- a/official/nlp/Bert/run_ner.py +++ b/official/nlp/Bert/run_ner.py @@ -200,7 +200,8 @@ def run_ner(): load_finetune_checkpoint_path = args_opt.load_finetune_checkpoint_path target = args_opt.device_target if target == "Ascend": - mindspore.set_context(mode=0, device_target="Ascend", device_id=args_opt.device_id) + mindspore.set_context(mode=0, device_target="Ascend", device_id=args_opt.device_id, + jit_config={"jit_level": "O2"}) elif target == "GPU": mindspore.set_context(mode=0, device_target="GPU") mindspore.set_context(enable_graph_kernel=True) diff --git a/official/nlp/Bert/run_pretrain.py b/official/nlp/Bert/run_pretrain.py index a2971b504..6042d8d5c 100644 --- a/official/nlp/Bert/run_pretrain.py +++ b/official/nlp/Bert/run_pretrain.py @@ -133,9 +133,9 @@ def _set_graph_kernel_context(device_target): if device_target == 'GPU': if cfg.bert_network == 'base': mindspore.set_context(enable_graph_kernel=True, - graph_kernel_flags="--enable_stitch_fusion=true " - "--enable_parallel_fusion=true " - "--enable_cluster_ops=BatchMatMul") + graph_kernel_flags="--enable_stitch_fusion=true " + "--enable_parallel_fusion=true " + "--enable_cluster_ops=BatchMatMul") else: mindspore.set_context(enable_graph_kernel=True) else: @@ 
-205,7 +205,8 @@ def InitNetWithGrads(net_with_loss, optimizer): @moxing_wrapper(pre_process=modelarts_pre_process) def run_pretrain(): """pre-train bert_clue""" - mindspore.set_context(mode=0, device_target=cfg.device_target, device_id=cfg.device_id) + mindspore.set_context(mode=0, device_target=cfg.device_target, device_id=cfg.device_id, + jit_config={"jit_level": "O2"}) mindspore.set_context(reserve_class_name_in_scope=False) _set_graph_kernel_context(cfg.device_target) ckpt_save_dir = cfg.save_checkpoint_path @@ -225,7 +226,7 @@ def run_pretrain(): mindspore.reset_auto_parallel_context() mindspore.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, - device_num=device_num) + device_num=device_num) _set_bert_all_reduce_split() _check_compute_type(cfg) diff --git a/official/nlp/Bert/run_squad.py b/official/nlp/Bert/run_squad.py index 1a73673e2..b1cb2c22a 100644 --- a/official/nlp/Bert/run_squad.py +++ b/official/nlp/Bert/run_squad.py @@ -158,7 +158,8 @@ def run_squad(): load_finetune_checkpoint_path = args_opt.load_finetune_checkpoint_path target = args_opt.device_target if target == "Ascend": - mindspore.set_context(mode=0, device_target="Ascend", device_id=args_opt.device_id) + mindspore.set_context(mode=0, device_target="Ascend", device_id=args_opt.device_id, + jit_config={"jit_level": "O2"}) elif target == "GPU": mindspore.set_context(mode=0, device_target="GPU") mindspore.set_context(enable_graph_kernel=True) -- Gitee From 0f56a6f51db6ae68eddfb77fefaadd8f08dacb8d Mon Sep 17 00:00:00 2001 From: gaoshuanglong Date: Tue, 18 Jun 2024 16:31:11 +0800 Subject: [PATCH 42/44] Fix YOLOv5 scripts error. --- official/cv/YOLOv5/scripts/run_distribute_eval.sh | 1 - official/cv/YOLOv5/scripts/run_distribute_train.sh | 1 - official/cv/YOLOv5/scripts/run_distribute_train_gpu.sh | 1 - official/cv/YOLOv5/scripts/run_eval.sh | 1 - official/cv/YOLOv5/scripts/run_standalone_train.sh | 1 - official/cv/YOLOv5/scripts/run_standalone_train_gpu.sh | 1 - 6 files changed, 6 deletions(-) diff --git a/official/cv/YOLOv5/scripts/run_distribute_eval.sh b/official/cv/YOLOv5/scripts/run_distribute_eval.sh index 1d9245d80..26551aae0 100644 --- a/official/cv/YOLOv5/scripts/run_distribute_eval.sh +++ b/official/cv/YOLOv5/scripts/run_distribute_eval.sh @@ -84,7 +84,6 @@ do cp ../*.yaml $dir_path cp -r ../src $dir_path cp -r ../model_utils $dir_path - cp -r ../third_party $dir_path cd $dir_path || exit env > env.log echo "start inferring for rank $RANK_ID, device $DEVICE_ID" diff --git a/official/cv/YOLOv5/scripts/run_distribute_train.sh b/official/cv/YOLOv5/scripts/run_distribute_train.sh index e476e06b1..cb38aae52 100644 --- a/official/cv/YOLOv5/scripts/run_distribute_train.sh +++ b/official/cv/YOLOv5/scripts/run_distribute_train.sh @@ -66,7 +66,6 @@ do cp ../*.yaml ./train_parallel$i cp -r ../src ./train_parallel$i cp -r ../model_utils ./train_parallel$i - cp -r ../third_party ./train_parallel$i cd ./train_parallel$i || exit echo "start training for rank $RANK_ID, device $DEVICE_ID" env > env.log diff --git a/official/cv/YOLOv5/scripts/run_distribute_train_gpu.sh b/official/cv/YOLOv5/scripts/run_distribute_train_gpu.sh index 26f81ca7d..b92fe1b65 100644 --- a/official/cv/YOLOv5/scripts/run_distribute_train_gpu.sh +++ b/official/cv/YOLOv5/scripts/run_distribute_train_gpu.sh @@ -48,7 +48,6 @@ cp ../*.py ./distribute_train cp ../*.yaml ./distribute_train cp -r ../src ./distribute_train cp -r ../model_utils ./distribute_train -cp -r ../third_party ./distribute_train cd ./distribute_train 
|| exit mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \ diff --git a/official/cv/YOLOv5/scripts/run_eval.sh b/official/cv/YOLOv5/scripts/run_eval.sh index 1a5651383..e43870ccf 100644 --- a/official/cv/YOLOv5/scripts/run_eval.sh +++ b/official/cv/YOLOv5/scripts/run_eval.sh @@ -58,7 +58,6 @@ cp ../*.py ./eval cp ../*.yaml ./eval cp -r ../src ./eval cp -r ../model_utils ./eval -cp -r ../third_party ./eval cd ./eval || exit env > env.log echo "start inferring for device $DEVICE_ID" diff --git a/official/cv/YOLOv5/scripts/run_standalone_train.sh b/official/cv/YOLOv5/scripts/run_standalone_train.sh index 260028a43..2f014e3bb 100644 --- a/official/cv/YOLOv5/scripts/run_standalone_train.sh +++ b/official/cv/YOLOv5/scripts/run_standalone_train.sh @@ -53,7 +53,6 @@ cp ../*.py ./train cp ../*.yaml ./train cp -r ../src ./train cp -r ../model_utils ./train -cp -r ../third_party ./train cd ./train || exit echo "start training for device $DEVICE_ID" env > env.log diff --git a/official/cv/YOLOv5/scripts/run_standalone_train_gpu.sh b/official/cv/YOLOv5/scripts/run_standalone_train_gpu.sh index 0e70ab182..c8839c18b 100644 --- a/official/cv/YOLOv5/scripts/run_standalone_train_gpu.sh +++ b/official/cv/YOLOv5/scripts/run_standalone_train_gpu.sh @@ -55,7 +55,6 @@ cp ../*.py ./train cp ../*.yaml ./train cp -r ../src ./train cp -r ../model_utils ./train -cp -r ../third_party ./train cd ./train || exit echo "======start training======" env > env.log -- Gitee From dacfd9060b6a203655dddc58b30bd0ef145db3db Mon Sep 17 00:00:00 2001 From: weel2020 <573352706@qq.com> Date: Wed, 19 Jun 2024 06:24:05 +0000 Subject: [PATCH 43/44] update research/recommend/GEHRL/README.MD. Signed-off-by: weel2020 <573352706@qq.com> --- research/recommend/GEHRL/README.MD | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/research/recommend/GEHRL/README.MD b/research/recommend/GEHRL/README.MD index 2a77830ff..5409245a0 100644 --- a/research/recommend/GEHRL/README.MD +++ b/research/recommend/GEHRL/README.MD @@ -1 +1 @@ -The source code of the paper "Graph Enhanced Hierarchical Reinforcement Learning for Goal-oriented Learning Path Recommendation" will be coming soon... 
\ No newline at end of file +The source code of the paper "Graph Enhanced Hierarchical Reinforcement Learning for Goal-oriented Learning Path Recommendation" refer to [https://github.com/mindspore-lab/models/tree/master/research/huawei-noah/GEHRL](https://github.com/mindspore-lab/models/tree/master/research/huawei-noah/GEHRL) \ No newline at end of file -- Gitee From d03c9871569b5bd557b0c3551d5cbc0ee91ebd01 Mon Sep 17 00:00:00 2001 From: ash Date: Thu, 20 Jun 2024 10:40:34 +0800 Subject: [PATCH 44/44] update numpy API --- official/cv/Arcface/eval_ijbc.py | 10 +++++----- official/cv/Arcface/eval_ijbc_onnx.py | 10 +++++----- .../src/text_connector/connect_text_lines.py | 2 +- .../cv/CTPN/src/text_connector/detector.py | 6 +++--- .../efficientnet-b0/src/transform_utils.py | 2 +- .../infer/util/classification_task_metric.py | 2 +- official/cv/FasterRCNN/src/detecteval.py | 4 ++-- .../infer/sdk/classification_task_metric.py | 2 +- official/cv/OCRNet/src/basedataset.py | 18 +++++++++--------- official/cv/OCRNet/src/cityscapes.py | 12 ++++++------ official/cv/OCRNet/src/seg_hrnet.py | 2 +- official/cv/OCRNet/src/seg_hrnet_ocr.py | 2 +- .../sdk/classification_task_metric.py | 2 +- official/cv/RetinaFace_ResNet50/eval.py | 16 ++++++++-------- official/cv/Unet/postprocess.py | 2 +- official/cv/Unet/src/data_loader.py | 8 ++++---- official/cv/VIT/src/autoaugment.py | 2 +- official/cv/YOLOv4/src/coco_visual.py | 4 ++-- 18 files changed, 53 insertions(+), 53 deletions(-) diff --git a/official/cv/Arcface/eval_ijbc.py b/official/cv/Arcface/eval_ijbc.py index f4f3c6a55..493da95e9 100644 --- a/official/cv/Arcface/eval_ijbc.py +++ b/official/cv/Arcface/eval_ijbc.py @@ -157,8 +157,8 @@ def divideIntoNstrand(listTemp, n): def read_template_media_list(path): ijb_meta = pd.read_csv(path, sep=' ', header=None).values - templates = ijb_meta[:, 1].astype(np.int) - media = ijb_meta[:, 2].astype(np.int) + templates = ijb_meta[:, 1].astype(np.int_) + media = ijb_meta[:, 2].astype(np.int_) return templates, media @@ -167,9 +167,9 @@ def read_template_media_list(path): def read_template_pair_list(path): pairs = pd.read_csv(path, sep=' ', header=None).values - t1 = pairs[:, 0].astype(np.int) - t2 = pairs[:, 1].astype(np.int) - label = pairs[:, 2].astype(np.int) + t1 = pairs[:, 0].astype(np.int_) + t2 = pairs[:, 1].astype(np.int_) + label = pairs[:, 2].astype(np.int_) return t1, t2, label diff --git a/official/cv/Arcface/eval_ijbc_onnx.py b/official/cv/Arcface/eval_ijbc_onnx.py index 317c51f4b..b6ba4fbb1 100644 --- a/official/cv/Arcface/eval_ijbc_onnx.py +++ b/official/cv/Arcface/eval_ijbc_onnx.py @@ -148,16 +148,16 @@ def divideIntoNstrand(listTemp, n): def read_template_media_list(path): ijb_meta = pd.read_csv(path, sep=' ', header=None).values - templates = ijb_meta[:, 1].astype(np.int) - media = ijb_meta[:, 2].astype(np.int) + templates = ijb_meta[:, 1].astype(np.int_) + media = ijb_meta[:, 2].astype(np.int_) return templates, media def read_template_pair_list(path): pairs = pd.read_csv(path, sep=' ', header=None).values - t1 = pairs[:, 0].astype(np.int) - t2 = pairs[:, 1].astype(np.int) - label = pairs[:, 2].astype(np.int) + t1 = pairs[:, 0].astype(np.int_) + t2 = pairs[:, 1].astype(np.int_) + label = pairs[:, 2].astype(np.int_) return t1, t2, label diff --git a/official/cv/CTPN/src/text_connector/connect_text_lines.py b/official/cv/CTPN/src/text_connector/connect_text_lines.py index 171beca9a..87fa23892 100644 --- a/official/cv/CTPN/src/text_connector/connect_text_lines.py +++ 
b/official/cv/CTPN/src/text_connector/connect_text_lines.py @@ -52,7 +52,7 @@ def connect_text_lines(text_proposals, scores, size): text_lines = clip_boxes(text_lines, size) - text_recs = np.zeros((len(text_lines), 9), np.float) + text_recs = np.zeros((len(text_lines), 9), np.float_) index = 0 for line in text_lines: xmin, ymin, xmax, ymax = line[0], line[1], line[2], line[3] diff --git a/official/cv/CTPN/src/text_connector/detector.py b/official/cv/CTPN/src/text_connector/detector.py index 7e5d724d4..707876a67 100644 --- a/official/cv/CTPN/src/text_connector/detector.py +++ b/official/cv/CTPN/src/text_connector/detector.py @@ -44,9 +44,9 @@ def filter_boxes(boxes): Returns: boxes(numpy.array): Text boxes after filter. """ - heights = np.zeros((len(boxes), 1), np.float) - widths = np.zeros((len(boxes), 1), np.float) - scores = np.zeros((len(boxes), 1), np.float) + heights = np.zeros((len(boxes), 1), np.float_) + widths = np.zeros((len(boxes), 1), np.float_) + scores = np.zeros((len(boxes), 1), np.float_) index = 0 for box in boxes: widths[index] = abs(box[2] - box[0]) diff --git a/official/cv/Efficientnet/efficientnet-b0/src/transform_utils.py b/official/cv/Efficientnet/efficientnet-b0/src/transform_utils.py index 4e86cf9fb..ef3092cf9 100644 --- a/official/cv/Efficientnet/efficientnet-b0/src/transform_utils.py +++ b/official/cv/Efficientnet/efficientnet-b0/src/transform_utils.py @@ -421,7 +421,7 @@ def skew(img, v, **__): matrix.append([p1[0], p1[1], 1, 0, 0, 0, -p2[0] * p1[0], -p2[0] * p1[1]]) matrix.append([0, 0, 0, p1[0], p1[1], 1, -p2[1] * p1[0], -p2[1] * p1[1]]) - A = np.matrix(matrix, dtype=np.float) + A = np.matrix(matrix, dtype=np.float_) B = np.array(original_plane).reshape(8) perspective_skew_coefficients_matrix = np.dot(np.linalg.pinv(A), B) perspective_skew_coefficients_matrix = np.array(perspective_skew_coefficients_matrix).reshape(8) diff --git a/official/cv/Efficientnet/efficientnet-b3/infer/util/classification_task_metric.py b/official/cv/Efficientnet/efficientnet-b3/infer/util/classification_task_metric.py index 9e689cde7..ce60a7353 100644 --- a/official/cv/Efficientnet/efficientnet-b3/infer/util/classification_task_metric.py +++ b/official/cv/Efficientnet/efficientnet-b3/infer/util/classification_task_metric.py @@ -55,7 +55,7 @@ def load_statistical_predict_result(filepath): data_vec = np.zeros((len(label_list)), dtype=np.float32) if n_label != 0: for ind, cls_ind in enumerate(label_list): - data_vec[ind] = np.int(cls_ind) + data_vec[ind] = np.int_(cls_ind) return data_vec, n_label diff --git a/official/cv/FasterRCNN/src/detecteval.py b/official/cv/FasterRCNN/src/detecteval.py index a6766af97..63d9b21a3 100644 --- a/official/cv/FasterRCNN/src/detecteval.py +++ b/official/cv/FasterRCNN/src/detecteval.py @@ -499,8 +499,8 @@ class DetectEval(COCOeval): assert (tps.shape[0]) == 1 assert (fps.shape[0]) == 1 - tp_sum = np.cumsum(tps, axis=1).astype(dtype=np.float) - fp_sum = np.cumsum(fps, axis=1).astype(dtype=np.float) + tp_sum = np.cumsum(tps, axis=1).astype(dtype=np.float_) + fp_sum = np.cumsum(fps, axis=1).astype(dtype=np.float_) ids = catIds[k0] label = labels[ids] diff --git a/official/cv/Inception/inceptionv4/infer/sdk/classification_task_metric.py b/official/cv/Inception/inceptionv4/infer/sdk/classification_task_metric.py index 09d0184a8..4fdf1ec2b 100644 --- a/official/cv/Inception/inceptionv4/infer/sdk/classification_task_metric.py +++ b/official/cv/Inception/inceptionv4/infer/sdk/classification_task_metric.py @@ -48,7 +48,7 @@ def 
load_statistical_predict_result(filepath): data_vec = np.zeros((n_label), dtype=np.float32) if n_label != 0: for ind, cls_ind in enumerate(temp): - data_vec[ind] = np.int(cls_ind) + data_vec[ind] = np.int_(cls_ind) return data_vec, n_label diff --git a/official/cv/OCRNet/src/basedataset.py b/official/cv/OCRNet/src/basedataset.py index 104ee830f..cb676d793 100644 --- a/official/cv/OCRNet/src/basedataset.py +++ b/official/cv/OCRNet/src/basedataset.py @@ -89,14 +89,14 @@ class BaseDataset: def multi_scale_aug(self, image, label=None, rand_scale=1, rand_crop=True): """Augment feature into different scales.""" - long_size = np.int(self.base_size * rand_scale + 0.5) + long_size = np.int_(self.base_size * rand_scale + 0.5) h, w, _ = image.shape if h > w: new_h = long_size - new_w = np.int(w * long_size / h + 0.5) + new_w = np.int_(w * long_size / h + 0.5) else: new_w = long_size - new_h = np.int(h * long_size / w + 0.5) + new_h = np.int_(h * long_size / w + 0.5) image = cv2.resize(image, (new_w, new_h), interpolation=cv2.INTER_LINEAR) # image = cv2.resize(image, (new_w, new_h), interpolation=cv2.INTER_NEAREST) @@ -156,8 +156,8 @@ class BaseDataset: batch, _, ori_height, ori_width = image.shape assert batch == 1, "only supporting batchsize 1." image = image.asnumpy()[0].transpose((1, 2, 0)).copy() - stride_h = np.int(self.crop_size[0] * 2.0 / 3.0) - stride_w = np.int(self.crop_size[1] * 2.0 / 3.0) + stride_h = np.int_(self.crop_size[0] * 2.0 / 3.0) + stride_w = np.int_(self.crop_size[1] * 2.0 / 3.0) final_pred = Tensor(np.zeros([1, self.num_classes, ori_height, ori_width]), dtype=dtype.float32) padvalue = -1.0 * np.array(self.mean) / np.array(self.std) @@ -178,10 +178,10 @@ class BaseDataset: new_img = self.pad_image(new_img, height, width, self.crop_size, padvalue) new_h, new_w = new_img.shape[:-1] - rows = np.int(np.ceil(1.0 * (new_h - - self.crop_size[0]) / stride_h)) + 1 - cols = np.int(np.ceil(1.0 * (new_w - - self.crop_size[1]) / stride_w)) + 1 + rows = np.int_(np.ceil(1.0 * (new_h - + self.crop_size[0]) / stride_h)) + 1 + cols = np.int_(np.ceil(1.0 * (new_w - + self.crop_size[1]) / stride_w)) + 1 preds = Tensor(np.zeros([1, self.num_classes, new_h, new_w]), dtype=dtype.float32) count = Tensor(np.zeros([1, 1, new_h, new_w]), dtype=dtype.float32) diff --git a/official/cv/OCRNet/src/cityscapes.py b/official/cv/OCRNet/src/cityscapes.py index 3dbe7a519..18b3a335d 100644 --- a/official/cv/OCRNet/src/cityscapes.py +++ b/official/cv/OCRNet/src/cityscapes.py @@ -118,8 +118,8 @@ class Cityscapes(BaseDataset): batch, _, ori_height, ori_width = image.shape assert batch == 1, "only supporting batchsize 1." 
image = image.asnumpy()[0].transpose((1, 2, 0)).copy() - stride_h = np.int(self.crop_size[0] * 1.0) - stride_w = np.int(self.crop_size[1] * 1.0) + stride_h = np.int_(self.crop_size[0] * 1.0) + stride_w = np.int_(self.crop_size[1] * 1.0) final_pred = Tensor(np.zeros([1, self.num_classes, ori_height, ori_width]), dtype=dtype.float32) for scale in scales: @@ -137,10 +137,10 @@ class Cityscapes(BaseDataset): preds = preds[:, :, 0:height, 0:width] else: new_h, new_w = new_img.shape[:-1] - rows = np.int(np.ceil(1.0 * (new_h - - self.crop_size[0]) / stride_h)) + 1 - cols = np.int(np.ceil(1.0 * (new_w - - self.crop_size[1]) / stride_w)) + 1 + rows = np.int_(np.ceil(1.0 * (new_h - + self.crop_size[0]) / stride_h)) + 1 + cols = np.int_(np.ceil(1.0 * (new_w - + self.crop_size[1]) / stride_w)) + 1 preds = np.zeros([1, self.num_classes, new_h, new_w]).astype(np.float32) count = np.zeros([1, 1, new_h, new_w]).astype(np.float32) diff --git a/official/cv/OCRNet/src/seg_hrnet.py b/official/cv/OCRNet/src/seg_hrnet.py index 49d3740ab..3fbcc0632 100644 --- a/official/cv/OCRNet/src/seg_hrnet.py +++ b/official/cv/OCRNet/src/seg_hrnet.py @@ -340,7 +340,7 @@ class HighResolutionNet(nn.Cell): self.stage4, pre_stage_channels = self._make_stage( self.stage4_cfg, num_channels, multi_scale_output=True) - last_inp_channels = np.int(np.sum(pre_stage_channels)) + last_inp_channels = np.int_(np.sum(pre_stage_channels)) self.last_layer = nn.SequentialCell([ diff --git a/official/cv/OCRNet/src/seg_hrnet_ocr.py b/official/cv/OCRNet/src/seg_hrnet_ocr.py index 6cbd664a7..275e623f9 100644 --- a/official/cv/OCRNet/src/seg_hrnet_ocr.py +++ b/official/cv/OCRNet/src/seg_hrnet_ocr.py @@ -565,7 +565,7 @@ class HighResolutionNet(nn.Cell): self.stage4, pre_stage_channels = self._make_stage( self.stage4_cfg, num_channels, multi_scale_output=True) - last_inp_channels = np.int(np.sum(pre_stage_channels)) + last_inp_channels = np.int_(np.sum(pre_stage_channels)) ocr_mid_channels = config.model.ocr.mid_channels ocr_key_channels = config.model.ocr.key_channels diff --git a/official/cv/ResNet/infer/ResNet152/sdk/classification_task_metric.py b/official/cv/ResNet/infer/ResNet152/sdk/classification_task_metric.py index 4c82151c1..a830f7d2c 100644 --- a/official/cv/ResNet/infer/ResNet152/sdk/classification_task_metric.py +++ b/official/cv/ResNet/infer/ResNet152/sdk/classification_task_metric.py @@ -69,7 +69,7 @@ def load_statistical_predict_result(filepath): data_vec = np.zeros((len(temp)), dtype=np.float32) if n_label != 0: for ind, cls_ind in enumerate(temp): - data_vec[ind] = np.int(cls_ind) + data_vec[ind] = np.int_(cls_ind) return data_vec, n_label diff --git a/official/cv/RetinaFace_ResNet50/eval.py b/official/cv/RetinaFace_ResNet50/eval.py index 27559fbf8..677df9516 100644 --- a/official/cv/RetinaFace_ResNet50/eval.py +++ b/official/cv/RetinaFace_ResNet50/eval.py @@ -148,8 +148,8 @@ class DetectionEngine: keep = self._nms(dets, self.nms_thresh) dets = dets[keep, :] - dets[:, 2:4] = (dets[:, 2:4].astype(np.int) - dets[:, 0:2].astype(np.int)).astype(np.float) # int - dets[:, 0:4] = dets[:, 0:4].astype(np.int).astype(np.float) # int + dets[:, 2:4] = (dets[:, 2:4].astype(np.int_) - dets[:, 0:2].astype(np.int_)).astype(np.float_) # int + dets[:, 0:4] = dets[:, 0:4].astype(np.int_).astype(np.float_) # int # add to result @@ -157,7 +157,7 @@ class DetectionEngine: if event_name not in self.results.keys(): self.results[event_name] = {} self.results[event_name][img_name[:-4]] = {'img_path': image_path, - 'bboxes': dets[:, 
:5].astype(np.float).tolist()} + 'bboxes': dets[:, :5].astype(np.float_).tolist()} def _get_gt_boxes(self): from scipy.io import loadmat @@ -182,7 +182,7 @@ class DetectionEngine: for event in self.results: for name in self.results[event].keys(): - bbox = np.array(self.results[event][name]['bboxes']).astype(np.float) + bbox = np.array(self.results[event][name]['bboxes']).astype(np.float_) if bbox.shape[0] <= 0: continue max_score = max(max_score, np.max(bbox[:, -1])) @@ -191,7 +191,7 @@ class DetectionEngine: length = max_score - min_score for event in self.results: for name in self.results[event].keys(): - bbox = np.array(self.results[event][name]['bboxes']).astype(np.float) + bbox = np.array(self.results[event][name]['bboxes']).astype(np.float_) if bbox.shape[0] <= 0: continue bbox[:, -1] -= min_score @@ -227,7 +227,7 @@ class DetectionEngine: - image_pr = np.zeros((section_num, 2), dtype=np.float) + image_pr = np.zeros((section_num, 2), dtype=np.float_) for section in range(section_num): _thresh = 1 - (section + 1)/section_num over_score_index = np.where(predict[:, 4] >= _thresh)[0] @@ -254,7 +254,7 @@ class DetectionEngine: for _set in range(len(sets)): gt_list = set_gts[_set] count_gt = 0 - pr_curve = np.zeros((section_num, 2), dtype=np.float) + pr_curve = np.zeros((section_num, 2), dtype=np.float_) for i, _ in enumerate(event_list): event = str(event_list[i][0][0]) image_list = file_list[i][0] @@ -263,7 +263,7 @@ class DetectionEngine: event_gt_box_list = facebox_list[i][0] for j, _ in enumerate(image_list): - predict = np.array(event_predict_dict[str(image_list[j][0][0])]['bboxes']).astype(np.float) + predict = np.array(event_predict_dict[str(image_list[j][0][0])]['bboxes']).astype(np.float_) gt_boxes = event_gt_box_list[j][0].astype('float') keep_index = event_gt_index_list[j][0] count_gt += len(keep_index) diff --git a/official/cv/Unet/postprocess.py b/official/cv/Unet/postprocess.py index 67def617c..8fd6d2da3 100644 --- a/official/cv/Unet/postprocess.py +++ b/official/cv/Unet/postprocess.py @@ -39,7 +39,7 @@ if __name__ == '__main__': mask = cv2.imread(os.path.join(config.data_path, f, "mask.png"), cv2.IMREAD_GRAYSCALE) mask = cv2.resize(mask, img_size) mask = mask.astype(np.float32) / 255 - mask = (mask > 0.5).astype(np.int) + mask = (mask > 0.5).astype(np.int_) mask = (np.arange(2) == mask[..., None]).astype(int) mask = mask.transpose(2, 0, 1).astype(np.float32) label = mask.reshape(1, 2, 96, 96) diff --git a/official/cv/Unet/src/data_loader.py b/official/cv/Unet/src/data_loader.py index f2fc3eb15..e90863f66 100644 --- a/official/cv/Unet/src/data_loader.py +++ b/official/cv/Unet/src/data_loader.py @@ -32,7 +32,7 @@ def _load_multipage_tiff(path): def _get_val_train_indices(length, fold, ratio=0.8): assert 0 < ratio <= 1, "Train/total data ratio must be in range (0.0, 1.0]" np.random.seed(0) - indices = np.arange(0, length, 1, dtype=np.int) + indices = np.arange(0, length, 1, dtype=np.int_) np.random.shuffle(indices) if fold is not None: @@ -49,7 +49,7 @@ def _get_val_train_indices(length, fold, ratio=0.8): def data_post_process(img, mask): img = np.expand_dims(img, axis=0) - mask = (mask > 0.5).astype(np.int) + mask = (mask > 0.5).astype(np.int_) mask = (np.arange(mask.max() + 1) == mask[..., None]).astype(int) mask = mask.transpose(2, 0, 1).astype(np.float32) return img, mask @@ -238,9 +238,9 @@ def preprocess_img_mask(img, mask, num_classes, img_size, augment=False, eval_re img = img.transpose(2, 0, 1) if num_classes == 2: mask = mask.astype(np.float32) / mask.max() - mask = 
(mask > 0.5).astype(np.int) + mask = (mask > 0.5).astype(np.int_) else: - mask = mask.astype(np.int) + mask = mask.astype(np.int_) mask = (np.arange(num_classes) == mask[..., None]).astype(int) mask = mask.transpose(2, 0, 1).astype(np.float32) return img, mask diff --git a/official/cv/VIT/src/autoaugment.py b/official/cv/VIT/src/autoaugment.py index 737e1945a..fc5426f4f 100644 --- a/official/cv/VIT/src/autoaugment.py +++ b/official/cv/VIT/src/autoaugment.py @@ -207,7 +207,7 @@ class SubPolicy(): "translateY": np.linspace(0, 150 / 331, 10), "rotate": np.linspace(0, 30, 10), "color": np.linspace(0.0, 0.9, 10), - "posterize": np.round(np.linspace(8, 4, 10), 0).astype(np.int), + "posterize": np.round(np.linspace(8, 4, 10), 0).astype(np.int_), "solarize": np.linspace(256, 0, 10), "contrast": np.linspace(0.0, 0.9, 10), "sharpness": np.linspace(0.0, 0.9, 10), diff --git a/official/cv/YOLOv4/src/coco_visual.py b/official/cv/YOLOv4/src/coco_visual.py index ea9459295..a15a70ee2 100644 --- a/official/cv/YOLOv4/src/coco_visual.py +++ b/official/cv/YOLOv4/src/coco_visual.py @@ -551,8 +551,8 @@ class DetectEval(COCOeval): assert (tps.shape[0]) == 1 assert (fps.shape[0]) == 1 - tp_sum = np.cumsum(tps, axis=1).astype(dtype=np.float) - fp_sum = np.cumsum(fps, axis=1).astype(dtype=np.float) + tp_sum = np.cumsum(tps, axis=1).astype(dtype=np.float_) + fp_sum = np.cumsum(fps, axis=1).astype(dtype=np.float_) ids = catIds[k0] label = labels[ids] -- Gitee
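
A closing note on PATCH 44/44: `np.int` and `np.float` were thin aliases of the Python builtins, deprecated in NumPy 1.20 and removed in NumPy 1.24, which is why these scripts break on recent NumPy releases. The patch swaps in the NumPy scalar types `np.int_` and `np.float_` (the `int64`/`float64`-style scalar types). A minimal standalone illustration of the substitution, assuming NumPy < 2.0 (where `np.float_` still exists), not code taken from the patch:

```python
import numpy as np

# Before (raises AttributeError on NumPy >= 1.24):
#     mask = (x > 0.5).astype(np.int); buf = np.zeros((8, 1), np.float)
x = np.random.rand(4, 4)
mask = (x > 0.5).astype(np.int_)   # NumPy integer scalar type
buf = np.zeros((8, 1), np.float_)  # alias of np.float64 (pre-NumPy-2.0)
```
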