From aa6efd857b038c73deb37cf22ae510810fa8c03b Mon Sep 17 00:00:00 2001
From: j00542052
Date: Tue, 2 Jan 2024 17:48:43 +0800
Subject: [PATCH 01/14] KDD2023 ULC

---
 research/recommend/ULC/README.md              |  90 ++++
 research/recommend/ULC/src/alternate_train.py | 234 +++++++++++++++
 research/recommend/ULC/src/data.py            | 284 ++++++++++++++++++
 research/recommend/ULC/src/loss.py            |  36 +++
 research/recommend/ULC/src/main.py            |  79 +++++
 research/recommend/ULC/src/metrics.py         |  73 +++++
 research/recommend/ULC/src/models.py          | 103 +++++++
 research/recommend/ULC/src/utils.py           |  34 +++
 8 files changed, 933 insertions(+)
 create mode 100644 research/recommend/ULC/README.md
 create mode 100644 research/recommend/ULC/src/alternate_train.py
 create mode 100644 research/recommend/ULC/src/data.py
 create mode 100644 research/recommend/ULC/src/loss.py
 create mode 100644 research/recommend/ULC/src/main.py
 create mode 100644 research/recommend/ULC/src/metrics.py
 create mode 100644 research/recommend/ULC/src/models.py
 create mode 100644 research/recommend/ULC/src/utils.py

diff --git a/research/recommend/ULC/README.md b/research/recommend/ULC/README.md
new file mode 100644
index 000000000..8b46f25d0
--- /dev/null
+++ b/research/recommend/ULC/README.md
@@ -0,0 +1,90 @@
+
# Contents

- [Contents](#contents)
- [ULC Description](#ulc-description)
- [Dataset](#dataset)
- [Environment Requirements](#environment-requirements)
- [Quick Start](#quick-start)
- [Script Description](#script-description)
    - [Script and Sample Code](#script-and-sample-code)
    - [Training Process](#training-process)
        - [Training](#training)
- [ModelZoo Homepage](#modelzoo-homepage)

# [ULC Description](#contents)

Conversion rate prediction is critical to many online applications such as digital display advertising. To capture
dynamic data distribution, industrial systems often require retraining models on recent data daily or weekly. However,
the delay of conversion behavior usually leads to incorrect labeling, which is known as the delayed feedback problem.
Existing work may fail to introduce the correct information about false negative samples due to data sparsity and
dynamic data distribution. To directly introduce the correct feedback label information, we propose an Unbiased delayed
feedback Label Correction framework (ULC), which uses an auxiliary model to correct labels for observed negative
feedback samples. Firstly, we theoretically prove that the label-corrected loss is an unbiased estimate of the oracle
loss using true labels. Then, as there are no readily available training data for label correction, counterfactual
labeling is used to construct artificial training data. Furthermore, since counterfactual labeling utilizes only part
of the training data, we design an embedding-based alternating training method to enhance performance. Comparative
experiments on both public and private datasets and detailed analyses show that our proposed approach effectively
alleviates the delayed feedback problem and consistently outperforms the previous state-of-the-art methods.

A preprint version of our paper is available at http://arxiv.org/abs/2307.12756.

# [Dataset](#contents)

- [Criteo dataset](https://drive.google.com/file/d/1x4KktfZtls9QjNdFYKCjTpfjM4tG2PcK/view?usp=sharing)

# [Environment Requirements](#contents)

- Hardware (CPU/GPU)
    - Prepare the hardware environment with a CPU or GPU processor.
- Framework
    - [MindSpore-2.0.0](https://www.mindspore.cn/install/en)

- Requirements

```shell

  $ conda create --name <env_name> --file requirements.txt

```

- For more information, please check the resources below:
    - [MindSpore Tutorials](https://www.mindspore.cn/tutorials/en/r2.0/index.html)
    - [MindSpore Python API](https://www.mindspore.cn/docs/en/r2.0/index.html)

# [Quick Start](#contents)

After installing MindSpore via the official website, you can start training and evaluation as follows:

- Process the dataset described in [Dataset](#dataset), then launch training as shown in [Training](#training).

# [Script Description](#contents)

## [Script and Sample Code](#contents)

```bash
.
└─ULC
  └─src
    ├─alternate_train.py         # alternating training loop of ULC
    ├─data.py                    # data processing
    ├─loss.py                    # loss in ULC
    ├─main.py                    # train ULC
    ├─metrics.py                 # metrics in ULC
    ├─models.py                  # ULC structure
    └─utils.py                   # optimizer and argument-parsing helpers
```

## [Training Process](#contents)

### Training

- running on CPU or GPU

  ```bash
  python ./src/main.py --method ULC --l2_reg 0.00001 --cuda_device 0 --lr 0.0001 --CD 7 --batch_size 1024 --optimizer Adam --seed 0
  ```

# [ModelZoo Homepage](#contents)

 Please check the official [homepage](https://gitee.com/mindspore/models)
\ No newline at end of file
diff --git a/research/recommend/ULC/src/alternate_train.py b/research/recommend/ULC/src/alternate_train.py
new file mode 100644
index 000000000..8a2945c01
--- /dev/null
+++ b/research/recommend/ULC/src/alternate_train.py
@@ -0,0 +1,234 @@
+# Copyright 2023 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+# ============================================================================ + +from copy import deepcopy + +from models import get_model +from loss import get_loss_fn +from utils import get_optimizer +from metrics import cal_llloss_with_logits, cal_auc, cal_prauc +from data import get_criteo_dataset, RandomAccessDataset +from tqdm import tqdm +import numpy as np +import mindspore.dataset as ds +import mindspore.nn as nn +import mindspore + +def test(model, test_data, params): + all_logits = [] + all_probs = [] + all_labels = [] + model.set_train(False) + + for batch in tqdm(test_data): + batch_x = batch[0] + batch_y = batch[1] + logits = model(batch_x) + all_logits.append(logits.asnumpy()) + all_labels.append(batch_y.asnumpy()) + all_probs.append(nn.Sigmoid()(logits).asnumpy()) + + all_logits = np.reshape(np.concatenate(all_logits, axis=0), (-1,)) + all_labels = np.reshape(np.concatenate(all_labels, axis=0), (-1,)) + all_probs = np.reshape(np.concatenate(all_probs, axis=0), (-1,)) + llloss = cal_llloss_with_logits(all_labels, all_logits) + auc = cal_auc(all_labels, all_probs) + prauc = cal_prauc(all_labels, all_probs) + return auc, prauc, llloss + +def get_valid_llloss_blc(model, test_data, correction_model): + all_logits = [] + all_labels = [] + model.set_train(False) + + for batch in tqdm(test_data): + batch_x = batch[0] + batch_y = batch[1] + logits0 = correction_model(batch_x) + corrections = (nn.Sigmoid()(logits0)).flatten() + corrections = mindspore.numpy.where(batch_y < 1, corrections, batch_y.float()) + logits = model(batch_x) + all_logits.append(logits.asnumpy()) + all_labels.append(corrections.asnumpy()) + + all_logits = np.reshape(np.concatenate(all_logits, axis=0), (-1,)) + all_labels = np.reshape(np.concatenate(all_labels, axis=0), (-1,)) + + + llloss = cal_llloss_with_logits(all_labels, all_logits) + return llloss + +def alternate_run(params, wandb): + cvr_model = None + sub_model = None + dataset = get_criteo_dataset(params) + sub_params = deepcopy(params) + + sub_params["dataset"] = "fsiwsg_cd_"+str(params["CD"])\ + +"_end_"+str(params["training_end_day"])+"_seed_"+str(params["seed"]) + np.random.seed(params["seed"]) + sub_dataset = get_criteo_dataset(sub_params)["train"] + np.random.seed(params["seed"]) + + params["log_step"] = 0 + params["idx"] = 1 + for _ in range(2): + sub_model = sub_train(cvr_model, sub_dataset, params) + cvr_model = cvr_train(sub_model, dataset, params, wandb) + params["idx"] += 1 + +def sub_train(cvr_model, sub_dataset, params): + train_data_x = sub_dataset["x"].to_numpy().astype(np.float32) + train_data_label = sub_dataset["labels"] + train_data_label = 1 - train_data_label + train_data = RandomAccessDataset(train_data_x, train_data_label) + train_data_loader = ds.GeneratorDataset(source=train_data, shuffle=True, column_names=['feature', 'label']) + train_data_loader = train_data_loader.batch(batch_size=params["batch_size"]) + + model = get_model("MLP_FSIW", params) + + if cvr_model is not None: + sd = cvr_model.parameters_dict() + part_sd = {k: v for k, v in sd.items() if ("category_embeddings" in k) or ("numeric_embeddings" in k)} + model_dict = model.parameters_dict() + model_dict.update(part_sd) + mindspore.load_param_into_net(model, model_dict) + + optimizer = nn.Adam(params=model.trainable_params(), learning_rate=0.001, weight_decay=0) + loss_fn = get_loss_fn("cross_entropy_loss") + + def forward_fn(data, label): + outputs = model(data) + targets = {"label": label} + loss_dict = loss_fn(targets, outputs, params) + loss = loss_dict["loss"] 
+ return loss + + grad_fn = mindspore.value_and_grad(forward_fn, None, optimizer.parameters) + + for _ in range(5): + for batch in train_data_loader: + batch_x = batch[0] + batch_y = batch[1][:, 0] + targets = {"label": batch_y} + + model.set_train(True) + _, grads = grad_fn(batch_x, targets["label"]) + + optimizer(grads) + + return model + +def cvr_train(sub_model, datasets, params, wandb): + model = get_model("MLP_SIG", params) + models = {"model": model, "submodel": sub_model} + + optimizer = get_optimizer(models["model"].trainable_params(), params["optimizer"], params) + + train_dataset = datasets["train"] + train_data_x = train_dataset["x"].to_numpy().astype(np.float32) + train_data_label = train_dataset["labels"] + train_data = RandomAccessDataset(train_data_x, train_data_label) + train_data_loader = ds.GeneratorDataset(source=train_data, shuffle=True, column_names=['feature', 'label']) + train_data_loader = train_data_loader.batch(batch_size=params["batch_size"]) + + valid_dataset = datasets["valid"] + valid_data_x = valid_dataset["x"].to_numpy().astype(np.float32) + valid_data_label = valid_dataset["labels"] + valid_data = RandomAccessDataset(valid_data_x, valid_data_label) + valid_data_loader = ds.GeneratorDataset(source=valid_data, column_names=['feature', 'label']) + valid_data_loader = valid_data_loader.batch(batch_size=params["batch_size"]) + + test_dataset = datasets["test"] + test_data_x = test_dataset["x"].to_numpy().astype(np.float32) + test_data_label = test_dataset["labels"] + test_data = RandomAccessDataset(test_data_x, test_data_label) + test_data_loader = ds.GeneratorDataset(source=test_data, column_names=['feature', 'label']) + test_data_loader = test_data_loader.batch(batch_size=params["batch_size"]) + + data_loaders = { + "train_data": train_data_loader, + "test_data": test_data_loader, + "valid_data": valid_data_loader + } + optimizers = { + "optimizer": optimizer + } + + + return train(models, optimizers, data_loaders, params, wandb) + + +def train(models, optimizers, data_loaders, params, wandb): + train_data = data_loaders["train_data"] + valid_data = data_loaders["valid_data"] + test_data = data_loaders["test_data"] + best_model = None + + optimizer = optimizers["optimizer"] + + loss_fn = get_loss_fn(params["loss"]) + val_llloss = [] + test_auc, test_prauc, test_llloss = [], [], [] + + def forward_fn(data, label): + outputs = models["model"](data) + logits0 = models["submodel"](data) + correction_label = nn.Sigmoid()(logits0).flatten() + label = mindspore.numpy.where(label < 1, correction_label, label.float()) + targets = {"label": label} + loss_dict = loss_fn(targets, outputs, params) + loss = loss_dict["loss"] + + return loss + + grad_fn = mindspore.value_and_grad(forward_fn, None, optimizer.parameters) + + for ep in range(params["train_epoch"]): + vllloss = get_valid_llloss_blc(models["model"], valid_data, models["submodel"]) + print("Val ep{}, llloss {}".format(ep, vllloss)) + tauc, tprauc, tllloss = test(models["model"], test_data, params) + print("Test ep{}, auc {}, prauc {}, llloss {}".format(ep, tauc, tprauc, tllloss)) + + if not val_llloss or vllloss < min(val_llloss): + best_model = models["model"].parameters_dict() + + val_llloss.append(vllloss) + test_auc.append(tauc) + test_prauc.append(tprauc) + test_llloss.append(tllloss) + + if len(val_llloss) - val_llloss.index(min(val_llloss)) > params["early_stop"]: + best_ep = val_llloss.index(min(val_llloss)) + print("Early stop at ep {}. Best ep {}. 
Best val_lloss {}.".format(ep, best_ep, min(val_llloss))) + print("Final test evaluation: auc {}, prauc {}, llloss {}."\ + .format(test_auc[best_ep], test_prauc[best_ep], test_llloss[best_ep])) + break + train_loss = [] + for batch in tqdm(train_data): + batch_x = batch[0] + batch_y = batch[1] + + models["model"].set_train(True) + models["submodel"].set_train(False) + loss, grads = grad_fn(batch_x, batch_y) + + train_loss.append(loss.asnumpy()) + optimizer(grads) + params["log_step"] += 1 + print("Train ep{}, loss {}".format(ep, np.mean(train_loss))) + + mindspore.load_param_into_net(models["model"], best_model) + return models["model"] diff --git a/research/recommend/ULC/src/data.py b/research/recommend/ULC/src/data.py new file mode 100644 index 000000000..b15ee49d4 --- /dev/null +++ b/research/recommend/ULC/src/data.py @@ -0,0 +1,284 @@ +# Copyright 2023 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +import copy +import os +import pickle +import datetime +import pandas as pd +import numpy as np + +from sklearn.preprocessing import LabelEncoder + +from utils import parse_float_arg + +SECONDS_A_DAY = 60 * 60 * 24 +SECONDS_AN_HOUR = 60 * 60 +SECONDS_DELAY_NORM = 1 +SECONDS_FSIW_NORM = SECONDS_A_DAY * 5 +num_bin_size = (64, 16, 128, 64, 128, 64, 512, 512) + + +class RandomAccessDataset: + def __init__(self, data, label): + self._data = np.array(data) + self._label = np.array(label) + + def __getitem__(self, index): + return (self._data[index], self._label[index]) + + def __len__(self): + return len(self._data) + + +def get_data_df(params): + if params["dataset_source"] in ["criteo"]: + df = pd.read_csv(params["data_path"], sep="\t", header=None) + click_ts = df[df.columns[0]].to_numpy() + pay_ts = df[df.columns[1]].fillna(-1).to_numpy() + + if params["dataset_source"] == "criteo": + df = df[df.columns[2:]] + for c in df.columns[8:]: + df[c] = df[c].fillna("") + df[c] = df[c].astype(str) + + label_encoder = LabelEncoder() + for c in df.columns[8:]: + df[c] = label_encoder.fit_transform(df[c]) + + for i, c in enumerate(df.columns[:8]): + df[c] = df[c].fillna(-1) + df[c] = (df[c] - df[c].min()) / (df[c].max() - df[c].min()) + df[c] = np.floor(df[c] * (num_bin_size[i] - 0.00001)).astype(str) + df.columns = [str(i) for i in range(17)] + return df, click_ts, pay_ts + + +class DataDF: + + def __init__(self, features, click_ts, pay_ts, sample_ts=None, labels=None, delay_label=None): + self.x = features.copy(deep=True) + self.click_ts = copy.deepcopy(click_ts) + self.pay_ts = copy.deepcopy(pay_ts) + self.delay_label = delay_label + if sample_ts is not None: + self.sample_ts = copy.deepcopy(sample_ts) + else: + self.sample_ts = copy.deepcopy(click_ts) + if labels is not None: + self.labels = copy.deepcopy(labels) + else: + self.labels = (pay_ts > 0).astype(np.int32) + + def sub_days(self, start_day, end_day): + start_ts = start_day * SECONDS_A_DAY + end_ts = end_day * SECONDS_A_DAY + mask = 
np.logical_and(self.sample_ts >= start_ts,
                               self.sample_ts < end_ts)
        return DataDF(self.x.iloc[mask],
                      self.click_ts[mask],
                      self.pay_ts[mask],
                      self.sample_ts[mask],
                      self.labels[mask])

    def to_fsiw_1(self, cd, T):  # build pre-training dataset 1 of FSIW
        mask = np.logical_and(self.click_ts < T - cd, self.pay_ts > 0)
        mask = np.logical_and(mask, self.pay_ts < T)
        x = self.x.iloc[mask].copy(deep=True)
        pay_ts = self.pay_ts[mask]
        click_ts = self.click_ts[mask]
        sample_ts = self.click_ts[mask]
        label = np.zeros((x.shape[0],))
        label[pay_ts < T - cd] = 1
        # FSIW needs elapsed time information
        x.insert(x.shape[1], column="elapse", value=(
            T - click_ts - cd) / SECONDS_FSIW_NORM)
        return DataDF(x,
                      click_ts,
                      pay_ts,
                      sample_ts,
                      label)

    def to_fsiw_0(self, cd, T):  # build pre-training dataset 0 of FSIW
        mask = np.logical_or(self.pay_ts >= T - cd, self.pay_ts < 0)
        mask = np.logical_or(mask, self.pay_ts > T)
        mask = np.logical_and(self.click_ts < T - cd, mask)
        x = self.x.iloc[mask].copy(deep=True)
        pay_ts = self.pay_ts[mask]
        click_ts = self.click_ts[mask]
        sample_ts = self.sample_ts[mask]
        label = np.zeros((x.shape[0],))
        label[np.logical_or(pay_ts < 0, pay_ts > T)] = 1
        x.insert(x.shape[1], column="elapse", value=(
            T - click_ts - cd) / SECONDS_FSIW_NORM)
        return DataDF(x,
                      click_ts,
                      pay_ts,
                      sample_ts,
                      label)

    def shuffle(self):
        idx = list(range(self.x.shape[0]))
        np.random.shuffle(idx)
        return DataDF(self.x.iloc[idx],
                      self.click_ts[idx],
                      self.pay_ts[idx],
                      self.sample_ts[idx],
                      self.labels[idx])


def get_criteo_dataset(params):
    name = params["dataset"]
    print("loading dataset {}".format(name))
    cache_path = os.path.join(
        params["data_cache_path"], "{}.pkl".format(name))
    if params["data_cache_path"] != "None" and os.path.isfile(cache_path):
        print("cache_path {}".format(cache_path))
        print("\nloading from dataset cache")
        with open(cache_path, "rb") as f:
            data = pickle.load(f)
        train_data = data["train"]
        test_data = data["test"]
        if "valid" in data:
            valid_data = data["valid"]
        if "clean" in data:
            _ = data["clean"]
        if "fn" in data:
            fn_data = data["fn"]
    else:
        train_data, test_data, valid_data, fn_data = build_criteo_dataset(params, name, cache_path)
    result = {
        "train": {
            "x": train_data.x,
            "click_ts": train_data.click_ts,
            "pay_ts": train_data.pay_ts,
            "sample_ts": train_data.sample_ts,
            "labels": train_data.labels,
        },
        "test": {
            "x": test_data.x,
            "click_ts": test_data.click_ts,
            "pay_ts": test_data.pay_ts,
            "sample_ts": test_data.sample_ts,
            "labels": test_data.labels,
        }
    }
    if ("next" in name) or ("oracle" in name):
        result["valid"] = {
            "x": valid_data.x,
            "click_ts": valid_data.click_ts,
            "pay_ts": valid_data.pay_ts,
            "sample_ts": valid_data.sample_ts,
            "labels": valid_data.labels,
        }
        result["fn"] = {
            "x": fn_data.x,
            "click_ts": fn_data.click_ts,
            "pay_ts": fn_data.pay_ts,
            "sample_ts": fn_data.sample_ts,
            "labels": fn_data.labels,
        }
    return result


def build_criteo_dataset(params, name, cache_path):
    print("\nbuilding dataset")

    starttime = datetime.datetime.now()
    if params["dataset_source"] == "criteo":
        source_cache_path = "./cache_data.pkl"
        if os.path.isfile(source_cache_path):
            with open(source_cache_path, "rb") as f:
                data = pickle.load(f)
        else:
            df, click_ts, pay_ts = get_data_df(params)
            data = DataDF(df, click_ts, pay_ts)
            with open(source_cache_path, "wb") as f:
                pickle.dump(data, f, protocol=4)
    endtime = 
datetime.datetime.now() + print("Time:{}s".format((endtime - starttime).total_seconds())) + + if "fsiwsg" in name: + cd = parse_float_arg(name, "cd") + training_start = params["training_end_day"] - params["training_duration"] + train_data = data.sub_days(training_start, params["training_end_day"]).shuffle() + test_data = data.sub_days(params["training_end_day"], params["training_end_day"] + 1) + train_data = train_data.to_fsiw_0( + cd=cd * SECONDS_A_DAY, T=params["training_end_day"] * SECONDS_A_DAY) + cvrs = np.reshape(train_data.pay_ts > 0, (-1, 1)) + pot_cvr = np.reshape(train_data.pay_ts > params["training_end_day"] * SECONDS_A_DAY, (-1, 1)) + train_data.labels = np.reshape(train_data.labels, (-1, 1)) + train_data.labels = np.concatenate( + [train_data.labels, cvrs, pot_cvr], axis=1) + test_data = test_data.to_fsiw_0( + cd=cd * SECONDS_A_DAY, T=params["training_end_day"] * SECONDS_A_DAY) + elif "fsiw_next" in name: + cd = parse_float_arg(name, "cd") + training_start = params["training_end_day"] - params["training_duration"] + train_data = data.sub_days(training_start, params["training_end_day"]).shuffle() + mask = train_data.pay_ts > (params["training_end_day"] * SECONDS_A_DAY) + train_data.labels[mask] = 0 + train_data.x.insert(train_data.x.shape[1], column="elapse", value=(params[ + "training_end_day"] * SECONDS_A_DAY - train_data.click_ts) / SECONDS_FSIW_NORM) + fn_data = DataDF(train_data.x.iloc[mask], + train_data.click_ts[mask], + train_data.pay_ts[mask], + train_data.sample_ts[mask], + train_data.labels[mask]) + valid_data = data.sub_days(params["training_end_day"], + params["training_end_day"] + 1 * params["valid_test_size"]) + valid_data.x.insert(valid_data.x.shape[1], column="elapse", value=(params[ + "training_end_day"] * SECONDS_A_DAY - valid_data.click_ts) / SECONDS_FSIW_NORM) + val_mask = valid_data.pay_ts > ( + (params["training_end_day"] + 1 * params["valid_test_size"]) * SECONDS_A_DAY) + valid_data.labels[val_mask] = 0 + test_data = data.sub_days(params["training_end_day"] + 1 * params["valid_test_size"], + params["training_end_day"] + 2 * params["valid_test_size"]) + test_data.x.insert(test_data.x.shape[1], column="elapse", value=(params[ + "training_end_day"] * SECONDS_A_DAY - test_data.click_ts) / SECONDS_FSIW_NORM) + elif "oracle" in name: + cd = parse_float_arg(name, "cd") + training_start = params["training_end_day"] - params["training_duration"] + train_data = data.sub_days(training_start, params["training_end_day"]).shuffle() + train_data.x.insert(train_data.x.shape[1], column="elapse", value=(params[ + "training_end_day"] * SECONDS_A_DAY - train_data.click_ts) / SECONDS_FSIW_NORM) + + mask = train_data.pay_ts > (params["training_end_day"] * SECONDS_A_DAY) + fn_data = DataDF(train_data.x.iloc[mask], + train_data.click_ts[mask], + train_data.pay_ts[mask], + train_data.sample_ts[mask], + train_data.labels[mask]) + fn_data.labels[:] = 0 + + valid_data = data.sub_days(params["training_end_day"], + params["training_end_day"] + 1 * params["valid_test_size"]) + valid_data.x.insert(valid_data.x.shape[1], column="elapse", value=(params[ + "training_end_day"] * SECONDS_A_DAY - valid_data.click_ts) / SECONDS_FSIW_NORM) + test_data = data.sub_days(params["training_end_day"] + 1 * params["valid_test_size"], + params["training_end_day"] + 2 * params["valid_test_size"]) + test_data.x.insert(test_data.x.shape[1], column="elapse", value=(params[ + "training_end_day"] * SECONDS_A_DAY - test_data.click_ts) / SECONDS_FSIW_NORM) + else: + raise NotImplementedError("{} dataset does 
not exist".format(name)) + if params["data_cache_path"] != "None": + with open(cache_path, "wb") as f: + if ("next" in name) or ("oracle" in name): + pickle.dump({"train": train_data, "test": test_data, "valid": valid_data, "fn": fn_data}, f) + else: + pickle.dump({"train": train_data, "test": test_data}, f) + + return train_data, test_data, valid_data, fn_data diff --git a/research/recommend/ULC/src/loss.py b/research/recommend/ULC/src/loss.py new file mode 100644 index 000000000..743d95833 --- /dev/null +++ b/research/recommend/ULC/src/loss.py @@ -0,0 +1,36 @@ +# Copyright 2023 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +import mindspore.numpy as np +import mindspore.ops as ops +import mindspore + + +def stable_log1pex(x): + return -np.where(x < 0, x, np.zeros_like(x)) + np.log(1 + np.exp(-np.absolute(x))) + +def cross_entropy_loss(targets, outputs, params=None): + z = targets["label"] + x = outputs + x = ops.Reshape()(x, (-1,)) + z = z.float() + loss_value = ops.binary_cross_entropy_with_logits(x, z, mindspore.Tensor([1.0]), mindspore.Tensor([1.0])) + + return {"loss": loss_value} + +def get_loss_fn(name): + if name == "cross_entropy_loss": + return cross_entropy_loss + raise NotImplementedError("{} loss does not implemented".format(name)) diff --git a/research/recommend/ULC/src/main.py b/research/recommend/ULC/src/main.py new file mode 100644 index 000000000..406db7bda --- /dev/null +++ b/research/recommend/ULC/src/main.py @@ -0,0 +1,79 @@ +# Copyright 2023 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +import argparse +import os +import pathlib +from copy import deepcopy + +import numpy as np + +from alternate_train import alternate_run + +wandb = None + + +def run_params(args): + params = deepcopy(vars(args)) + params["model"] = "MLP_SIG" + if args.data_cache_path != "None": + pathlib.Path(args.data_cache_path).mkdir(parents=True, exist_ok=True) + + if args.method == "ULC": + params["loss"] = "cross_entropy_loss" + params["dataset"] = "last_30_train_test_fsiw_next" + "_end_" + str(args.training_end_day) + "_seed_" + str( + args.seed) + + return params + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--method", choices=["ULC"], + type=str, required=True) + parser.add_argument("--seed", type=int, default=0) + parser.add_argument("--dataset_source", type=str, default="criteo", choices=["criteo"]) + parser.add_argument("--CD", type=int, default=7, + help="interval between counterfactual deadline and actual deadline") + parser.add_argument("--lr", type=float, default=1e-3) + parser.add_argument("--data_path", type=str, default="./data/data.txt", + help="path of the data.txt in criteo dataset") + parser.add_argument("--data_cache_path", type=str, default="./data") + parser.add_argument("--batch_size", type=int, + default=1024) + parser.add_argument("--epoch", type=int, default=5, + help="training epoch of pretraining") + parser.add_argument("--l2_reg", type=float, default=0, + help="l2 regularizer strength") + parser.add_argument("--training_end_day", type=int, default=58, + help="deadline for training data") + parser.add_argument("--training_duration", type=int, default=21, + help="duration of training data") + parser.add_argument("--valid_test_size", type=float, default=1, + help="duration of valid/test data") + parser.add_argument("--train_epoch", type=int, default=100, + help="max train epoch") + parser.add_argument("--early_stop", type=int, default=4) + parser.add_argument("--cuda_device", type=str, default="0") + parser.add_argument("--optimizer", type=str, default="Adam") + parser.add_argument("--save_model", type=int, default=0) + parser.add_argument("--base_model", type=str, default="MLP", choices=["MLP"]) + + args = parser.parse_args() + params = run_params(args) + os.environ["CUDA_VISIBLE_DEVICES"] = params["cuda_device"] + np.random.seed(args.seed) + + alternate_run(params, wandb) diff --git a/research/recommend/ULC/src/metrics.py b/research/recommend/ULC/src/metrics.py new file mode 100644 index 000000000..274a0e508 --- /dev/null +++ b/research/recommend/ULC/src/metrics.py @@ -0,0 +1,73 @@ +# Copyright 2023 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +from sklearn import metrics +import numpy as np + +def sigmoid(x): + return 1/(1+np.exp(np.clip(-x, a_min=-1e50, a_max=1e20))) + +def cal_auc(label, pos_prob): + fpr, tpr, _ = metrics.roc_curve(label, pos_prob, pos_label=1) + auc = metrics.auc(fpr, tpr) + return auc + +def stable_log1pex(x): + return -np.minimum(x, 0) + np.log(1+np.exp(-np.abs(x))) + +def cal_llloss_with_logits(label, logits): + ll = -np.mean(label*(-stable_log1pex(logits)) + (1-label)*(-logits - stable_log1pex(logits))) + return ll + +def cal_llloss_with_logits_and_weight(label, logits, logits0, logits1): + x = logits + + pos_loss = stable_log1pex(x) + neg_loss = x + stable_log1pex(x) + + pos_weight = 1/(logits1+1e-8) + neg_weight = logits0 + + clf_loss = np.mean( + pos_loss*pos_weight*label + neg_loss*neg_weight*(1-label)) + + weight = np.mean(pos_weight*label + neg_weight*(1-label)) + + return clf_loss/weight + +def prob_clip(x): + return np.clip(x, a_min=1e-20, a_max=1) + +def cal_llloss_with_neg_log_prob(label, neg_log_prob): + ll = -np.mean((1-label)*neg_log_prob + label*(np.log(prob_clip(1 - prob_clip(np.exp(neg_log_prob)))))) + return ll + +def cal_llloss_with_prob(label, prob): + ll = -np.mean(label*np.log(prob_clip(prob)) + (1-label)*(np.log(prob_clip(1-prob)))) + return ll + +def cal_prauc(label, pos_prob): + precision, recall, _ = metrics.precision_recall_curve(label, pos_prob) + area = metrics.auc(recall, precision) + return area + +def cal_acc(label, prob): + label = np.reshape(label, (-1,)) + prob = np.reshape(label, (-1,)) + prob_acc = np.mean(label*prob) + return prob_acc + +def stable_softplus(x): + return np.log(1 + np.exp(-np.abs(x))) + np.maximum(x, 0) diff --git a/research/recommend/ULC/src/models.py b/research/recommend/ULC/src/models.py new file mode 100644 index 000000000..d8d63ab25 --- /dev/null +++ b/research/recommend/ULC/src/models.py @@ -0,0 +1,103 @@ +# Copyright 2023 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +from mindspore import nn +from mindspore import ops + +class MLP(nn.Cell): + def __init__(self, name, params): + super(MLP, self).__init__() + self.model_name = name + self.params = params + self.dataset_source = params["dataset_source"] + if params["dataset_source"] == "criteo": + self.category_embeddings = nn.CellList([ + nn.Embedding(55824, 64), + nn.Embedding(5443, 64), + nn.Embedding(13073, 64), + nn.Embedding(13170, 64), + nn.Embedding(3145, 64), + nn.Embedding(33843, 64), + nn.Embedding(14304, 64), + nn.Embedding(11, 64), + nn.Embedding(13601, 64) + ]) + + self.numeric_embeddings = nn.CellList([ + nn.Embedding(64, 64), + nn.Embedding(16, 64), + nn.Embedding(128, 64), + nn.Embedding(64, 64), + nn.Embedding(128, 64), + nn.Embedding(64, 64), + nn.Embedding(512, 64), + nn.Embedding(512, 64) + ]) + presize = 1088 + + if name == "MLP_FSIW": + print("using elapse feature") + presize += 1 + + self.mlp = nn.CellList([ + nn.Dense(presize, 256, activation='leakyrelu'), + nn.Dense(256, 256, activation='leakyrelu'), + nn.Dense(256, 128, activation='leakyrelu') + ]) + + if self.model_name in ["MLP_SIG", "MLP_FSIW"]: + self.mlp.append(nn.Dense(128, 1)) + else: + raise ValueError("model name {} not exist".format(name)) + + def construct(self, x): + if self.dataset_source == "criteo": + cate_embeddings = [] + nume_embeddings = [] + if self.model_name == "MLP_FSIW": + for i in range(8): + nume_embeddings.append(self.numeric_embeddings[i](x[:, i].int())) + + for i in range(9): + cate_embeddings.append(self.category_embeddings[8 - i](x[:, -i - 2].int())) + + features = nume_embeddings + cate_embeddings + [x[:, -1:]] + x = ops.Concat(axis=1)(features) + else: + for i in range(8): + nume_embeddings.append(self.numeric_embeddings[i](x[:, i].int())) + + for i in range(9): + cate_embeddings.append(self.category_embeddings[8 - i](x[:, -i - 2].int())) + + features = nume_embeddings + cate_embeddings + x = ops.Concat(axis=1)(features) + + for layer in self.mlp: + x = layer(x) + + if self.model_name in ["MLP_SIG", "MLP_FSIW"]: + return x + raise NotImplementedError() + +def get_model(name, params): + if name in ["MLP_tn_dp", "MLP_FSIW"]: + return MLP(name, params) + if name == "MLP_SIG": + if params["base_model"] == "MLP": + return MLP(name, params) + else: + raise NotImplementedError() + return 0 diff --git a/research/recommend/ULC/src/utils.py b/research/recommend/ULC/src/utils.py new file mode 100644 index 000000000..b5aca4c04 --- /dev/null +++ b/research/recommend/ULC/src/utils.py @@ -0,0 +1,34 @@ +# Copyright 2023 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +import re +import mindspore.nn as nn + + +def get_optimizer(p, name, params): + if name == "Adam": + return nn.Adam(params=p, learning_rate=params["lr"], weight_decay=params["l2_reg"]) + return 0 + + +def parse_float_arg(Input, prefix): + p = re.compile(prefix+"_[+-]?([0-9]*[.])?[0-9]+") + m = p.search(Input) + if m is None: + return None + Input = m.group() + p = re.compile("[+-]?([0-9]*[.])?[0-9]+") + m = p.search(Input) + return float(m.group()) -- Gitee From e4f3f9a2a8bd7c4f9c7b6a4d92aab947867f9847 Mon Sep 17 00:00:00 2001 From: daiyuxin0511 <455472400@qq.com> Date: Wed, 3 Jan 2024 09:31:57 +0800 Subject: [PATCH 02/14] cast the output of range --- .../src/maskrcnn_mobilenetv1/bbox_assign_sample_stage2.py | 1 + .../maskrcnn_resnet50/src/maskrcnn/bbox_assign_sample_stage2.py | 1 + 2 files changed, 2 insertions(+) diff --git a/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/bbox_assign_sample_stage2.py b/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/bbox_assign_sample_stage2.py index 07c5bc621..470f709d6 100644 --- a/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/bbox_assign_sample_stage2.py +++ b/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/bbox_assign_sample_stage2.py @@ -214,6 +214,7 @@ class BboxAssignSampleForRcnn(nn.Cell): # normalized box coordinate boxes = boxes / self.image_h_w box_ids = F.range(self.start, self.limit, self.delta) + box_ids = self.cast(box_ids, mstype.int32) pos_masks_fb = self.expand_dims(pos_masks_fb, -1) boxes = self.cast(boxes, mstype.float32) pos_masks_fb = self.crop_and_resize(pos_masks_fb, boxes, box_ids, self.mask_shape) diff --git a/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/bbox_assign_sample_stage2.py b/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/bbox_assign_sample_stage2.py index e97ee0a83..681b8a300 100644 --- a/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/bbox_assign_sample_stage2.py +++ b/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/bbox_assign_sample_stage2.py @@ -212,6 +212,7 @@ class BboxAssignSampleForRcnn(nn.Cell): # normalized box coordinate boxes = boxes / self.image_h_w box_ids = F.range(self.start, self.limit, self.delta) + box_ids = self.cast(box_ids, mstype.int32) pos_masks_fb = self.expand_dims(pos_masks_fb, -1) boxes = self.cast(boxes, mstype.float32) pos_masks_fb = self.crop_and_resize(pos_masks_fb, boxes, box_ids, self.mask_shape) -- Gitee From 073b71d00822ff526f85181c9c7644e25a96b34e Mon Sep 17 00:00:00 2001 From: daiyuxin0511 <455472400@qq.com> Date: Thu, 4 Jan 2024 10:00:58 +0800 Subject: [PATCH 03/14] Revert "modify the input of concat from ms.bool_ to ms.int32" This reverts commit da505fa86daecff3255a862e6d523b5efcfe202c. 
--- .../src/FasterRcnn/bbox_assign_sample.py | 8 +------- .../src/FasterRcnn/bbox_assign_sample_stage2.py | 10 ++-------- .../src/FasterRcnn/proposal_generator.py | 14 +++----------- .../src/maskrcnn_mobilenetv1/fpn_neck.py | 10 ++++++---- .../maskrcnn_resnet50/src/maskrcnn/fpn_neck.py | 11 ++++++----- 5 files changed, 18 insertions(+), 35 deletions(-) diff --git a/official/cv/FasterRCNN/src/FasterRcnn/bbox_assign_sample.py b/official/cv/FasterRCNN/src/FasterRcnn/bbox_assign_sample.py index a49572c6a..57c758a51 100644 --- a/official/cv/FasterRCNN/src/FasterRcnn/bbox_assign_sample.py +++ b/official/cv/FasterRCNN/src/FasterRcnn/bbox_assign_sample.py @@ -144,13 +144,7 @@ class BboxAssignSample(nn.Cell): num_pos = self.cast(self.logicalnot(valid_pos_index), self.ms_type) num_pos = self.sum_inds(num_pos, -1) unvalid_pos_index = self.less(self.range_pos_size, num_pos) - valid_neg_index = self.logicaland( - self.cast(self.concat(( - self.cast(self.check_neg_mask, ms.int32), - self.cast(unvalid_pos_index, ms.int32) - )), ms.bool_), - self.cast(valid_neg_index, ms.bool_) - ) + valid_neg_index = self.logicaland(self.concat((self.check_neg_mask, unvalid_pos_index)), valid_neg_index) pos_bboxes_ = self.gatherND(bboxes, pos_index) pos_gt_bboxes_ = self.gatherND(gt_bboxes_i, pos_assigned_gt_index) diff --git a/official/cv/FasterRCNN/src/FasterRcnn/bbox_assign_sample_stage2.py b/official/cv/FasterRCNN/src/FasterRcnn/bbox_assign_sample_stage2.py index 7602adcc5..5cbc16ee5 100644 --- a/official/cv/FasterRCNN/src/FasterRcnn/bbox_assign_sample_stage2.py +++ b/official/cv/FasterRCNN/src/FasterRcnn/bbox_assign_sample_stage2.py @@ -114,7 +114,7 @@ class BboxAssignSampleForRcnn(nn.Cell): gt_bboxes_i, self.check_gt_one) bboxes = self.select(self.cast(self.tile(self.reshape(self.cast(valid_mask, ms.int32), \ (self.num_bboxes, 1)), (1, 4)), ms.bool_), \ - self.cast(bboxes, ms.float16), self.check_anchor_two) + bboxes, self.check_anchor_two) overlaps = self.iou(bboxes, gt_bboxes_i) @@ -171,13 +171,7 @@ class BboxAssignSampleForRcnn(nn.Cell): neg_index, valid_neg_index = self.random_choice_with_mask_neg(self.equal(assigned_gt_inds5, 0)) unvalid_pos_index = self.less(self.range_pos_size, num_pos) - valid_neg_index = self.logicaland( - self.cast(self.concat(( - self.cast(self.check_neg_mask, ms.int32), - self.cast(unvalid_pos_index, ms.int32) - )), ms.bool_), - self.cast(valid_neg_index, ms.bool_) - ) + valid_neg_index = self.logicaland(self.concat((self.check_neg_mask, unvalid_pos_index)), valid_neg_index) neg_index = self.reshape(neg_index, self.reshape_shape_neg) valid_neg_index = self.cast(valid_neg_index, ms.int32) diff --git a/official/cv/FasterRCNN/src/FasterRcnn/proposal_generator.py b/official/cv/FasterRCNN/src/FasterRcnn/proposal_generator.py index 5317ca51c..16e2b4265 100644 --- a/official/cv/FasterRCNN/src/FasterRcnn/proposal_generator.py +++ b/official/cv/FasterRCNN/src/FasterRcnn/proposal_generator.py @@ -183,21 +183,13 @@ class Proposal(nn.Cell): mlvl_proposals = mlvl_proposals + (proposals,) mlvl_mask = mlvl_mask + (mask_valid,) - proposals = self.concat_axis0( - tuple(self.cast(proposal, ms.int64) for proposal in mlvl_proposals) - ) - masks = self.concat_axis0( - tuple(self.cast(mask, ms.int64) for mask in mlvl_mask) - ) + proposals = self.concat_axis0(mlvl_proposals) + masks = self.concat_axis0(mlvl_mask) _, _, _, _, scores = self.split(proposals) scores = self.squeeze(scores) topk_mask = self.cast(self.topK_mask, self.ms_type) - scores_using = self.cast(self.select( - self.cast(masks, ms.bool_), - 
self.cast(scores, ms.bool_), - self.cast(topk_mask, ms.bool_) - ), ms.int32) + scores_using = self.select(masks, scores, topk_mask) _, topk_inds = self.topKv2(scores_using, self.max_num) diff --git a/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/fpn_neck.py b/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/fpn_neck.py index 2ad832bbd..38ca57bb2 100644 --- a/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/fpn_neck.py +++ b/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/fpn_neck.py @@ -89,7 +89,9 @@ class FeatPyramidNeck(nn.Cell): self.fpn_convs_.append(fpn_conv) self.lateral_convs_list = nn.layer.CellList(self.lateral_convs_list_) self.fpn_convs_list = nn.layer.CellList(self.fpn_convs_) - self.interpolate = P.ResizeBilinearV2() + self.interpolate1 = P.ResizeBilinear((48, 80)) + self.interpolate2 = P.ResizeBilinear((96, 160)) + self.interpolate3 = P.ResizeBilinear((192, 320)) self.cast = P.Cast() self.maxpool = P.MaxPool(kernel_size=1, strides=2, pad_mode="same") @@ -99,9 +101,9 @@ class FeatPyramidNeck(nn.Cell): x += (self.lateral_convs_list[i](inputs[i]),) y = (x[3],) - y = y + (x[2] + self.cast(self.interpolate(y[self.fpn_layer - 4], (48, 80)), self.platform_mstype),) - y = y + (x[1] + self.cast(self.interpolate(y[self.fpn_layer - 3], (96, 160)), self.platform_mstype),) - y = y + (x[0] + self.cast(self.interpolate(y[self.fpn_layer - 2], (192, 320)), self.platform_mstype),) + y = y + (x[2] + self.cast(self.interpolate1(y[self.fpn_layer - 4]), self.platform_mstype),) + y = y + (x[1] + self.cast(self.interpolate2(y[self.fpn_layer - 3]), self.platform_mstype),) + y = y + (x[0] + self.cast(self.interpolate3(y[self.fpn_layer - 2]), self.platform_mstype),) z = () for i in range(self.fpn_layer - 1, -1, -1): diff --git a/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/fpn_neck.py b/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/fpn_neck.py index 68590ec9d..6a98de5d2 100644 --- a/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/fpn_neck.py +++ b/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/fpn_neck.py @@ -92,8 +92,9 @@ class FeatPyramidNeck(nn.Cell): self.fpn_convs_.append(fpn_conv) self.lateral_convs_list = nn.layer.CellList(self.lateral_convs_list_) self.fpn_convs_list = nn.layer.CellList(self.fpn_convs_) - self.interpolate = P.ResizeBilinearV2() - self.feature_shapes = feature_shapes + self.interpolate1 = P.ResizeBilinear(feature_shapes[2]) + self.interpolate2 = P.ResizeBilinear(feature_shapes[1]) + self.interpolate3 = P.ResizeBilinear(feature_shapes[0]) self.cast = P.Cast() self.maxpool = P.MaxPool(kernel_size=1, strides=2, pad_mode="same") @@ -103,9 +104,9 @@ class FeatPyramidNeck(nn.Cell): x += (self.lateral_convs_list[i](inputs[i]),) y = (x[3],) - y = y + (x[2] + self.cast(self.interpolate(y[self.fpn_layer - 4], self.feature_shapes[2]), self.cast_type),) - y = y + (x[1] + self.cast(self.interpolate(y[self.fpn_layer - 3], self.feature_shapes[1]), self.cast_type),) - y = y + (x[0] + self.cast(self.interpolate(y[self.fpn_layer - 2], self.feature_shapes[0]), self.cast_type),) + y = y + (x[2] + self.cast(self.interpolate1(y[self.fpn_layer - 4]), self.cast_type),) + y = y + (x[1] + self.cast(self.interpolate2(y[self.fpn_layer - 3]), self.cast_type),) + y = y + (x[0] + self.cast(self.interpolate3(y[self.fpn_layer - 2]), self.cast_type),) z = () for i in range(self.fpn_layer - 1, -1, -1): -- Gitee From bd2ea20d4bb74bce1875db5c52fffebf70df64ed Mon Sep 17 00:00:00 2001 From: wilsonleehw Date: Wed, 10 Jan 
2024 14:42:14 +0800 Subject: [PATCH 04/14] update README for TinySAM --- research/cv/TinySAM/README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/research/cv/TinySAM/README.md b/research/cv/TinySAM/README.md index bd0e17c07..d01ba5f57 100644 --- a/research/cv/TinySAM/README.md +++ b/research/cv/TinySAM/README.md @@ -54,6 +54,10 @@ SNN-MLP After installing MindSpore via the official website, you can start evaluation as follows: +### Download + +Download ckpts from [modelzoo](https://download-mindspore.osinfra.cn/model_zoo/research/cv/TinySAM/tinysam_mindspore.ckpt). + ### Launch ```bash -- Gitee From d8785b55ecf8faf6459f6d1c615cd5b19df1de85 Mon Sep 17 00:00:00 2001 From: zhangyifan Date: Wed, 17 Jan 2024 14:40:19 +0800 Subject: [PATCH 05/14] add OWNERS --- OWNERS | 1 + 1 file changed, 1 insertion(+) diff --git a/OWNERS b/OWNERS index 0cd9b621a..8d29f4f1f 100644 --- a/OWNERS +++ b/OWNERS @@ -10,3 +10,4 @@ approvers: - baochong - luoyang42 - wang_hua_2019 +- zhangyifan999 -- Gitee From 9f2cf06f9c48d40d7d1006cc60e9adedd3395ec5 Mon Sep 17 00:00:00 2001 From: ash Date: Thu, 18 Jan 2024 22:11:32 +0800 Subject: [PATCH 06/14] fix fasterrcnn loss nan --- official/cv/FasterRCNN/scripts/run_distribute_train_ascend.sh | 1 + official/cv/FasterRCNN/scripts/run_standalone_train_ascend.sh | 1 + 2 files changed, 2 insertions(+) diff --git a/official/cv/FasterRCNN/scripts/run_distribute_train_ascend.sh b/official/cv/FasterRCNN/scripts/run_distribute_train_ascend.sh index 86588bf77..8087e1f88 100644 --- a/official/cv/FasterRCNN/scripts/run_distribute_train_ascend.sh +++ b/official/cv/FasterRCNN/scripts/run_distribute_train_ascend.sh @@ -96,6 +96,7 @@ export HCCL_CONNECT_TIMEOUT=600 export DEVICE_NUM=8 export RANK_SIZE=8 export RANK_TABLE_FILE=$PATH1 +export MS_ASCEND_CHECK_OVERFLOW_MODE="SATURATION_MODE" for((i=0; i<${DEVICE_NUM}; i++)) do diff --git a/official/cv/FasterRCNN/scripts/run_standalone_train_ascend.sh b/official/cv/FasterRCNN/scripts/run_standalone_train_ascend.sh index 565f1c562..828f5133e 100644 --- a/official/cv/FasterRCNN/scripts/run_standalone_train_ascend.sh +++ b/official/cv/FasterRCNN/scripts/run_standalone_train_ascend.sh @@ -88,6 +88,7 @@ export DEVICE_NUM=1 export DEVICE_ID=$4 export RANK_ID=0 export RANK_SIZE=1 +export MS_ASCEND_CHECK_OVERFLOW_MODE="SATURATION_MODE" if [ -d "train" ]; then -- Gitee From 6736ad2212724cac681aea5e5dc85c9a72e370f7 Mon Sep 17 00:00:00 2001 From: daiyuxin0511 <455472400@qq.com> Date: Tue, 23 Jan 2024 17:16:46 +0800 Subject: [PATCH 07/14] remove ResizeBilinear from MaskRCNN --- .../src/maskrcnn_mobilenetv1/fpn_neck.py | 10 ++++------ .../maskrcnn_resnet50/src/maskrcnn/fpn_neck.py | 11 +++++------ 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/fpn_neck.py b/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/fpn_neck.py index 38ca57bb2..2ad832bbd 100644 --- a/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/fpn_neck.py +++ b/official/cv/MaskRCNN/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/fpn_neck.py @@ -89,9 +89,7 @@ class FeatPyramidNeck(nn.Cell): self.fpn_convs_.append(fpn_conv) self.lateral_convs_list = nn.layer.CellList(self.lateral_convs_list_) self.fpn_convs_list = nn.layer.CellList(self.fpn_convs_) - self.interpolate1 = P.ResizeBilinear((48, 80)) - self.interpolate2 = P.ResizeBilinear((96, 160)) - self.interpolate3 = P.ResizeBilinear((192, 320)) + self.interpolate = P.ResizeBilinearV2() self.cast = P.Cast() self.maxpool = 
P.MaxPool(kernel_size=1, strides=2, pad_mode="same") @@ -101,9 +99,9 @@ class FeatPyramidNeck(nn.Cell): x += (self.lateral_convs_list[i](inputs[i]),) y = (x[3],) - y = y + (x[2] + self.cast(self.interpolate1(y[self.fpn_layer - 4]), self.platform_mstype),) - y = y + (x[1] + self.cast(self.interpolate2(y[self.fpn_layer - 3]), self.platform_mstype),) - y = y + (x[0] + self.cast(self.interpolate3(y[self.fpn_layer - 2]), self.platform_mstype),) + y = y + (x[2] + self.cast(self.interpolate(y[self.fpn_layer - 4], (48, 80)), self.platform_mstype),) + y = y + (x[1] + self.cast(self.interpolate(y[self.fpn_layer - 3], (96, 160)), self.platform_mstype),) + y = y + (x[0] + self.cast(self.interpolate(y[self.fpn_layer - 2], (192, 320)), self.platform_mstype),) z = () for i in range(self.fpn_layer - 1, -1, -1): diff --git a/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/fpn_neck.py b/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/fpn_neck.py index 6a98de5d2..68590ec9d 100644 --- a/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/fpn_neck.py +++ b/official/cv/MaskRCNN/maskrcnn_resnet50/src/maskrcnn/fpn_neck.py @@ -92,9 +92,8 @@ class FeatPyramidNeck(nn.Cell): self.fpn_convs_.append(fpn_conv) self.lateral_convs_list = nn.layer.CellList(self.lateral_convs_list_) self.fpn_convs_list = nn.layer.CellList(self.fpn_convs_) - self.interpolate1 = P.ResizeBilinear(feature_shapes[2]) - self.interpolate2 = P.ResizeBilinear(feature_shapes[1]) - self.interpolate3 = P.ResizeBilinear(feature_shapes[0]) + self.interpolate = P.ResizeBilinearV2() + self.feature_shapes = feature_shapes self.cast = P.Cast() self.maxpool = P.MaxPool(kernel_size=1, strides=2, pad_mode="same") @@ -104,9 +103,9 @@ class FeatPyramidNeck(nn.Cell): x += (self.lateral_convs_list[i](inputs[i]),) y = (x[3],) - y = y + (x[2] + self.cast(self.interpolate1(y[self.fpn_layer - 4]), self.cast_type),) - y = y + (x[1] + self.cast(self.interpolate2(y[self.fpn_layer - 3]), self.cast_type),) - y = y + (x[0] + self.cast(self.interpolate3(y[self.fpn_layer - 2]), self.cast_type),) + y = y + (x[2] + self.cast(self.interpolate(y[self.fpn_layer - 4], self.feature_shapes[2]), self.cast_type),) + y = y + (x[1] + self.cast(self.interpolate(y[self.fpn_layer - 3], self.feature_shapes[1]), self.cast_type),) + y = y + (x[0] + self.cast(self.interpolate(y[self.fpn_layer - 2], self.feature_shapes[0]), self.cast_type),) z = () for i in range(self.fpn_layer - 1, -1, -1): -- Gitee From bf11fd9a2af157e8e33e5c88c8b51f01db4629f0 Mon Sep 17 00:00:00 2001 From: r1chardf1d0 Date: Mon, 29 Jan 2024 17:26:14 +0800 Subject: [PATCH 08/14] disable trace for pangu alpha --- official/nlp/Pangu_alpha/src/pangu_alpha.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/official/nlp/Pangu_alpha/src/pangu_alpha.py b/official/nlp/Pangu_alpha/src/pangu_alpha.py index 00f594b3d..02f1570be 100644 --- a/official/nlp/Pangu_alpha/src/pangu_alpha.py +++ b/official/nlp/Pangu_alpha/src/pangu_alpha.py @@ -23,7 +23,6 @@ from mindspore import Tensor, Parameter from mindspore.ops import operations as P from mindspore.ops import functional as F from mindspore.nn import Cell -from mindspore.ops._tracefunc import trace from mindformers.modules.transformer import VocabEmbedding, TransformerEncoder, TransformerEncoderLayer, \ AttentionMask, MoEConfig @@ -306,7 +305,6 @@ class PanguAlpha_Model(Cell): self.load_embedding_from_ckpt(config.load_ckpt_path) self.run_type = config.run_type - @trace def construct_blocks(self, hidden_state, encoder_masks, init_reset, batch_valid_length): if self.blocks is not 
None: for i in range(self.num_layers - 1): -- Gitee From 169176618719aa9393944946dd5946941a62db80 Mon Sep 17 00:00:00 2001 From: PingqiLi Date: Mon, 29 Jan 2024 17:43:00 +0800 Subject: [PATCH 09/14] modify the learning rate of the example in SSD training scripts --- official/cv/SSD/scripts/run_distribute_train.sh | 2 +- official/cv/SSD/scripts/run_distribute_train_gpu.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/official/cv/SSD/scripts/run_distribute_train.sh b/official/cv/SSD/scripts/run_distribute_train.sh index 893f4a76d..37fe46d37 100644 --- a/official/cv/SSD/scripts/run_distribute_train.sh +++ b/official/cv/SSD/scripts/run_distribute_train.sh @@ -17,7 +17,7 @@ echo "==============================================================================================================" echo "Please run the script as: " echo "bash run_distribute_train.sh DEVICE_NUM EPOCH_SIZE LR DATASET RANK_TABLE_FILE CONFIG_PATH PRE_TRAINED PRE_TRAINED_EPOCH_SIZE" -echo "for example: bash run_distribute_train.sh 8 500 0.2 coco /data/hccl.json /config_path /opt/ssd-300.ckpt(optional) 200(optional)" +echo "for example: bash run_distribute_train.sh 8 500 0.05 coco /data/hccl.json /config_path /opt/ssd-300.ckpt(optional) 200(optional)" echo "It is better to use absolute path." echo "=================================================================================================================" diff --git a/official/cv/SSD/scripts/run_distribute_train_gpu.sh b/official/cv/SSD/scripts/run_distribute_train_gpu.sh index 0778ad70f..0ff4b1818 100644 --- a/official/cv/SSD/scripts/run_distribute_train_gpu.sh +++ b/official/cv/SSD/scripts/run_distribute_train_gpu.sh @@ -17,7 +17,7 @@ echo "==============================================================================================================" echo "Please run the script as: " echo "bash run_distribute_train_gpu.sh DEVICE_NUM EPOCH_SIZE LR DATASET CONFIG_PATH PRE_TRAINED PRE_TRAINED_EPOCH_SIZE" -echo "for example: bash run_distribute_train_gpu.sh 8 500 0.2 coco /config_path /opt/ssd-300.ckpt(optional) 200(optional)" +echo "for example: bash run_distribute_train_gpu.sh 8 500 0.05 coco /config_path /opt/ssd-300.ckpt(optional) 200(optional)" echo "It is better to use absolute path." echo "=================================================================================================================" -- Gitee From 77d00581d59e1beefcef041ac8e054018c72de64 Mon Sep 17 00:00:00 2001 From: ash Date: Sun, 4 Feb 2024 11:11:40 +0800 Subject: [PATCH 10/14] fix MELGAN readme file name --- official/audio/MELGAN/README.md | 2 +- official/audio/MELGAN/README_CN.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/official/audio/MELGAN/README.md b/official/audio/MELGAN/README.md index 448fa1bb5..4185a79a5 100644 --- a/official/audio/MELGAN/README.md +++ b/official/audio/MELGAN/README.md @@ -46,7 +46,7 @@ Dataset used: [LJ Speech]() - Dataset size:2.6GB,13,100 short audio clips of a single speaker reading passages from 7 non-fiction books. - Data format:Each audio file is a single-channel 16-bit PCM WAV with a sample rate of 22050 Hz - - The audio data needs to be processed to a mel-spectrum, and you can refer to the script in [mel-spectrogram data creation](https://github.com/seungwonpark/melgan/blob/master/preprocess.py). Non CUDA environment needs to delete `. cuda()` in `utils/stfy.py`. To save data in the `npy` format, `preprocess.py` also needs to be modified. 
As follows: + - The audio data needs to be processed to a mel-spectrum, and you can refer to the script in [mel-spectrogram data creation](https://github.com/seungwonpark/melgan/blob/master/preprocess.py). Non CUDA environment needs to delete `. cuda()` in `utils/stft.py`. To save data in the `npy` format, `preprocess.py` also needs to be modified. As follows: ``` # 37 - 38 lines diff --git a/official/audio/MELGAN/README_CN.md b/official/audio/MELGAN/README_CN.md index 03a3b7bad..5d80d749a 100644 --- a/official/audio/MELGAN/README_CN.md +++ b/official/audio/MELGAN/README_CN.md @@ -46,7 +46,7 @@ MelGAN模型是非自回归全卷积模型。它的参数比同类模型少得 - Dataset size:2.6GB,包含13,100条只有一个说话人的短语音。语音的内容来自7本纪实书籍。 - 数据格式:每条语音文件都是单声道、16-bit以及采样率为22050。 - - 语音需要被处理为Mel谱, 可以参考脚本[Mel谱处理脚本](https://github.com/seungwonpark/melgan/blob/master/preprocess.py)。非CUDA环境需删除`utils/stfy.py`中的`.cuda()`,因为要保存`npy`格式的数据,所以`preproccess.py`也需要修改以下,参考代码如下: + - 语音需要被处理为Mel谱, 可以参考脚本[Mel谱处理脚本](https://github.com/seungwonpark/melgan/blob/master/preprocess.py)。非CUDA环境需删除`utils/stft.py`中的`.cuda()`,因为要保存`npy`格式的数据,所以`preproccess.py`也需要修改以下,参考代码如下: ``` # 37 - 38 行 -- Gitee From e38c7377188d0a6e1f560d85c7e29e4c0c8046ed Mon Sep 17 00:00:00 2001 From: tomzwang11 Date: Mon, 5 Feb 2024 15:07:49 +0800 Subject: [PATCH 11/14] update fasterrcnn --- official/cv/FasterRCNN/README_CN.md | 2 +- official/cv/FasterRCNN/scripts/run_distribute_train_gpu.sh | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/official/cv/FasterRCNN/README_CN.md b/official/cv/FasterRCNN/README_CN.md index 676b27231..ace823dbf 100644 --- a/official/cv/FasterRCNN/README_CN.md +++ b/official/cv/FasterRCNN/README_CN.md @@ -625,7 +625,7 @@ bash run_infer_cpp.sh [MINDIR_PATH] [DATA_PATH] [ANNO_PATH] [DEVICE_TYPE] [IMAGE | 上传日期 | 2020/8/31 | 2021/2/10 |2022/8/10| | MindSpore版本 | 1.0.0 |1.2.0 |1.7.0| | 数据集 | COCO 2017 |COCO 2017 |FaceMaskDetection| -| 训练参数 | epoch=12, batch_size=2 |epoch=12, batch_size=2 |epoch=20,batch_size=2| +| 训练参数 | epoch=12, batch_size=2 |epoch=20, batch_size=2 |epoch=20,batch_size=2| | 优化器 | SGD |SGD |SGD| | 损失函数 | Softmax交叉熵,Sigmoid交叉熵,SmoothL1Loss |Softmax交叉熵,Sigmoid交叉熵,SmoothL1Loss |Softmax交叉熵,Sigmoid交叉熵,SmoothL1Loss| | 速度 | 1卡:190毫秒/步;8卡:200毫秒/步 | 1卡:320毫秒/步;8卡:335毫秒/步 |1卡:7328毫秒/步| diff --git a/official/cv/FasterRCNN/scripts/run_distribute_train_gpu.sh b/official/cv/FasterRCNN/scripts/run_distribute_train_gpu.sh index 8b27d1c67..d7af4ca64 100644 --- a/official/cv/FasterRCNN/scripts/run_distribute_train_gpu.sh +++ b/official/cv/FasterRCNN/scripts/run_distribute_train_gpu.sh @@ -97,4 +97,5 @@ mpirun -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout --all --pre_trained=$PRETRAINED_PATH \ --backbone=$3 \ --coco_root=$PATH3 \ + --base_lr=0.008 \ --mindrecord_dir=$mindrecord_dir > train.log 2>&1 & \ No newline at end of file -- Gitee From 39ec85593dc2d611a679efe15f0f49a8fdfe5ab4 Mon Sep 17 00:00:00 2001 From: ash Date: Sat, 17 Feb 2024 11:04:39 +0800 Subject: [PATCH 12/14] fix ssd error in 910A+GE --- official/cv/SSD/config/ssd_mobilenet_v1_fpn_config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/official/cv/SSD/config/ssd_mobilenet_v1_fpn_config.yaml b/official/cv/SSD/config/ssd_mobilenet_v1_fpn_config.yaml index 099ea3758..6fa43d8a5 100644 --- a/official/cv/SSD/config/ssd_mobilenet_v1_fpn_config.yaml +++ b/official/cv/SSD/config/ssd_mobilenet_v1_fpn_config.yaml @@ -23,7 +23,7 @@ match_threshold: 0.5 nms_threshold: 0.6 min_score: 0.1 max_boxes: 100 -all_reduce_fusion_config: [29, 58, 89] +all_reduce_fusion_config: 
[29, 58, 89, 201] # learning rate settings lr_init: 0.01333 -- Gitee From fcf960d7d002a29aca2411b6b4a53d31caf77e39 Mon Sep 17 00:00:00 2001 From: gaoshuanglong Date: Tue, 20 Feb 2024 19:48:02 +0800 Subject: [PATCH 13/14] Fix Tacotron2 bug and readme --- official/audio/Tacotron2/README.md | 6 +++--- official/audio/Tacotron2/README_CN.md | 6 +++--- official/audio/Tacotron2/src/tacotron2.py | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/official/audio/Tacotron2/README.md b/official/audio/Tacotron2/README.md index 53283ba9b..51c939693 100644 --- a/official/audio/Tacotron2/README.md +++ b/official/audio/Tacotron2/README.md @@ -76,8 +76,8 @@ After installing MindSpore via the official website, you can start training and # example: bash run_standalone_train.sh /path/ljspeech.hdf5 0 # run distributed training - bash run_distributed_train.sh [DATASET_PATH] [RANK_TABLE_PATH] [DATANAME] [RANK_SIZE] [DEVICE_BEGIN] - # example: bash run_distributed_train.sh /path/ljspeech.h5 ../hccl_8p_01234567_127.0.0.1.json 8 0 + bash run_distribute_train.sh [DATASET_PATH] [RANK_TABLE_PATH] [DATANAME] [RANK_SIZE] [DEVICE_BEGIN] + # example: bash run_distribute_train.sh /path/ljspeech.h5 ../hccl_8p_01234567_127.0.0.1.json 8 0 # run evaluation bash run_eval.sh [OUTPUT_PATH] [MODEL_CKPT] [DEVICE_ID] text is set in config.py( can modify text of ljspeech_config.yaml) @@ -246,7 +246,7 @@ Parameters for both training and evaluation can be set in [DATASET]_config.yaml ```bash cd scripts - bash run_distributed_train.sh [DATASET_PATH] [RANK_TABLE_PATH] [DATANAME] [RANK_SIZE] [DEVICE_BEGIN] + bash run_distribute_train.sh [DATASET_PATH] [RANK_TABLE_PATH] [DATANAME] [RANK_SIZE] [DEVICE_BEGIN] ``` Note: `DATASET_PATH` is the directory contains hdf5 file. diff --git a/official/audio/Tacotron2/README_CN.md b/official/audio/Tacotron2/README_CN.md index e104f6e48..aaf135f92 100644 --- a/official/audio/Tacotron2/README_CN.md +++ b/official/audio/Tacotron2/README_CN.md @@ -76,8 +76,8 @@ Tacotron2实质上是一种包含编码器和解码器的序列到序列模型 # 示例:bash run_standalone_train.sh /path/ljspeech.hdf5 0 # 运行分布式训练 - bash run_distributed_train.sh [DATASET_PATH] [RANK_TABLE_PATH] [DATANAME] [RANK_SIZE] [DEVICE_BEGIN] - # 示例:bash run_distributed_train.sh /path/ljspeech.h5 ../hccl_8p_01234567_127.0.0.1.json 8 0 + bash run_distribute_train.sh [DATASET_PATH] [RANK_TABLE_PATH] [DATANAME] [RANK_SIZE] [DEVICE_BEGIN] + # 示例:bash run_distribute_train.sh /path/ljspeech.h5 ../hccl_8p_01234567_127.0.0.1.json 8 0 # 运行评估 bash run_eval.sh [OUTPUT_PATH] [MODEL_CKPT] [DEVICE_ID] text is set in config.py( can modify text of ljspeech_config.yaml) @@ -246,7 +246,7 @@ tacotron2/ ```bash cd scripts - bash run_distributed_train.sh [DATASET_PATH] [RANK_TABLE_PATH] [DATANAME] [RANK_SIZE] [DEVICE_BEGIN] + bash run_distribute_train.sh [DATASET_PATH] [RANK_TABLE_PATH] [DATANAME] [RANK_SIZE] [DEVICE_BEGIN] ``` 注:`DATASET_PATH`是包含HDF5文件的目录。 diff --git a/official/audio/Tacotron2/src/tacotron2.py b/official/audio/Tacotron2/src/tacotron2.py index e685cf0f8..a5b573f31 100644 --- a/official/audio/Tacotron2/src/tacotron2.py +++ b/official/audio/Tacotron2/src/tacotron2.py @@ -1168,7 +1168,7 @@ class TrainStepWrap(nn.Cell): overflow = ops.logical_not(amp.all_finite(grads)) if self.reducer_flag: - overflow = self.allreduce(overflow.to(mstype.float32)) >= self.base + overflow = self.all_reduce(overflow.to(mstype.float32)) >= self.base overflow = self.loss_scaling_manager(self.loss_scale, overflow) -- Gitee From 0b79514ce5098bdf015baacfc46a0b7959d9b858 Mon Sep 17 00:00:00 2001 From: 
The-truthh <821372701@qq.com> Date: Tue, 27 Feb 2024 17:24:37 +0800 Subject: [PATCH 14/14] fix the low evaluation accuracy of fasterrcnn --- official/cv/FasterRCNN/scripts/run_eval_ascend.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/official/cv/FasterRCNN/scripts/run_eval_ascend.sh b/official/cv/FasterRCNN/scripts/run_eval_ascend.sh index d27654617..d9b2be113 100644 --- a/official/cv/FasterRCNN/scripts/run_eval_ascend.sh +++ b/official/cv/FasterRCNN/scripts/run_eval_ascend.sh @@ -95,6 +95,7 @@ export DEVICE_NUM=1 export RANK_SIZE=$DEVICE_NUM export DEVICE_ID=$5 export RANK_ID=0 +export MS_ASCEND_CHECK_OVERFLOW_MODE="SATURATION_MODE" if [ -d "eval" ]; then -- Gitee
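
The core of ULC (PATCH 01) is the label-correction step inside `train()` in `src/alternate_train.py`: observed positives keep their hard label, while observed negatives are re-labeled with the auxiliary correction model's predicted conversion probability before the cross-entropy loss is computed. Below is a minimal NumPy sketch of just that step; the function names and toy inputs are illustrative assumptions, not part of the patch.

```python
import numpy as np

def sigmoid(x):
    # numerically stable logistic, mirroring nn.Sigmoid() in the patch
    z = np.exp(-np.abs(x))
    return np.where(x >= 0, 1.0 / (1.0 + z), z / (1.0 + z))

def correct_labels(observed, correction_logits):
    # Observed positives (label 1) are true conversions and stay 1.
    # Observed negatives (label 0) may be conversions that have not arrived
    # yet, so they are replaced by the correction model's predicted
    # probability, as in `where(label < 1, correction_label, label)`
    # inside forward_fn.
    return np.where(observed < 1, sigmoid(correction_logits), observed)

def bce_with_logits(logits, targets):
    # stable binary cross-entropy with logits, mean reduction
    return np.mean(np.maximum(logits, 0) - logits * targets
                   + np.log1p(np.exp(-np.abs(logits))))

# toy batch: the second and third samples are observed negatives
observed = np.array([1.0, 0.0, 0.0])
cvr_logits = np.array([2.0, -1.0, 0.5])    # main CVR model outputs
corr_logits = np.array([3.0, 1.2, -2.0])   # auxiliary correction model outputs

print(bce_with_logits(cvr_logits, correct_labels(observed, corr_logits)))
```

In the patch itself this step runs on MindSpore tensors via `ops.binary_cross_entropy_with_logits`, and the correction model is refreshed each round of `alternate_run` from counterfactually labeled data, which is what makes the corrected loss an unbiased estimate of the oracle loss.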