diff --git a/application_example/maskrcnn/src/dataset/__init__.py b/application_example/maskrcnn/src/dataset/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/application_example/maskrcnn/src/datasets.md b/application_example/maskrcnn/src/datasets.md new file mode 100644 index 0000000000000000000000000000000000000000..bc300a2a659d274da454f15fa4f1f365f49eb2cf --- /dev/null +++ b/application_example/maskrcnn/src/datasets.md @@ -0,0 +1,7 @@ +. +└─cocodataset + ├─annotations + ├─instance_train2017.json + └─instance_val2017.json + ├─val2017 + └─train2017 diff --git a/application_example/maskrcnn/src/eval.py b/application_example/maskrcnn/src/eval.py index e5a7f9a7b0e53e8f28c53e5f3fe4d9b013de0b07..8afaead1229b35d4ce39345fa9780e6a4551e635 100644 --- a/application_example/maskrcnn/src/eval.py +++ b/application_example/maskrcnn/src/eval.py @@ -23,10 +23,10 @@ from mindspore import context, Tensor from mindspore.train.serialization import load_checkpoint, load_param_into_net from mindspore.common import set_seed +from utils.config import config from model.mask_rcnn_r50 import MaskRcnnResnet50 # when use maskrcnn mobilenetv1, just change the following backbone # from mask_rcnn_mobilenetv1 -from utils.config import config from utils.util import coco_eval, bbox2result_1image, results2json, get_seg_masks from dataset.dataset import data_to_mindrecord_byte_image, create_coco_dataset diff --git a/application_example/maskrcnn/src/images/framework.png b/application_example/maskrcnn/src/images/framework.png new file mode 100644 index 0000000000000000000000000000000000000000..c3cd10ba7b68be5a85d6fe16059c4df733ecb204 Binary files /dev/null and b/application_example/maskrcnn/src/images/framework.png differ diff --git a/application_example/maskrcnn/src/images/infer.png b/application_example/maskrcnn/src/images/infer.png new file mode 100644 index 
0000000000000000000000000000000000000000..6d4a7e758ca9e84366f1e68eca8730c9d70d5e0d Binary files /dev/null and b/application_example/maskrcnn/src/images/infer.png differ diff --git a/application_example/maskrcnn/src/images/mobilenetv1.png b/application_example/maskrcnn/src/images/mobilenetv1.png new file mode 100644 index 0000000000000000000000000000000000000000..dccdafac99d892149bcf481f1d204f0e9daafefa Binary files /dev/null and b/application_example/maskrcnn/src/images/mobilenetv1.png differ diff --git a/application_example/maskrcnn/src/images/resnet_block.png b/application_example/maskrcnn/src/images/resnet_block.png new file mode 100644 index 0000000000000000000000000000000000000000..35d8ba8bdac9431e7f78e252979c6e822eea4522 Binary files /dev/null and b/application_example/maskrcnn/src/images/resnet_block.png differ diff --git a/application_example/maskrcnn/src/images/roi_align.png b/application_example/maskrcnn/src/images/roi_align.png new file mode 100644 index 0000000000000000000000000000000000000000..9a1ae73bb2feac22c9dc5c43c9392f4a8e4492ff Binary files /dev/null and b/application_example/maskrcnn/src/images/roi_align.png differ diff --git a/application_example/maskrcnn/src/infer.py b/application_example/maskrcnn/src/infer.py index 3f4fc70660e8edadf5f099a83b70512636b02294..e916f62bb528aece3b57787174de6a8cd2b8dbc8 100644 --- a/application_example/maskrcnn/src/infer.py +++ b/application_example/maskrcnn/src/infer.py @@ -27,10 +27,10 @@ from mindspore import context, Tensor from mindspore.train.serialization import load_checkpoint, load_param_into_net from mindspore.common import set_seed +from utils.config import config # when use maskrcnn mobilenetv1, just change the following backbone # from mask_rcnn_mobilenetv1 from model.mask_rcnn_r50 import MaskRcnnResnet50 -from utils.config import config from dataset.dataset import create_coco_dataset set_seed(1) diff --git a/application_example/maskrcnn/src/maskrcnn.ipynb 
b/application_example/maskrcnn/src/maskrcnn.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..cceabf2cb109d9c2326e3619492c3df5699dbad5 --- /dev/null +++ b/application_example/maskrcnn/src/maskrcnn.ipynb @@ -0,0 +1,3788 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Mask R-CNN\n", + "\n", + "MaskRCNN是一种概念简单、灵活、通用的目标实例分割框架,在检测出图像中目标的同时,还为每一个实例生成高质量掩码。这种称为Mask R-CNN的方法,通过添加与现有边框检测分支平行的预测目标掩码分支,达到扩展Faster R-CNN的目的。Mask R-CNN训练简单,运行速度达5fps,与Faster R-CNN相比,开销只有小幅上涨。此外,Mask R-CNN易于推广到其他任务。例如,允许在同一框架中预测人体姿势。 Mask R-CNN在COCO挑战赛的三个关键难点上都表现不俗,包括实例分割、边框目标检测和人物关键点检测。Mask R-CNN没有什么华而不实的附加功能,各任务的表现都优于现存所有单模型,包括COCO 2016挑战赛的胜出模型。\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 模型简介\n", + "\n", + "MaskRCNN是一个两级目标检测网络,作为FasterRCNN的扩展模型,在现有的边框检测分支的基础上增加了一个预测目标掩码的分支。该网络采用区域候选网络(RPN),可与检测网络共享整个图像\n", + "的卷积特征,无需任何代价就可轻松计算候选区域。整个网络通过共享卷积特征,将RPN和掩码分支合并为一个网络。其模型骨干还可以选择轻量级网络Mobilenet。\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 官方库和第三方库的导入\n", + "\n", + "我们首先导入案例依赖的官方库和第三方库。" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "import time\n", + "import os\n", + "\n", + "import numpy as np\n", + "import mindspore.nn as nn\n", + "import mindspore.common.dtype as mstype\n", + "from mindspore.ops import operations as P\n", + "from mindspore.ops import functional as F\n", + "from mindspore.ops import composite as C\n", + "from mindspore.nn import layer as L\n", + "from mindspore.common.initializer import initializer\n", + "from mindspore import context, Tensor, Parameter\n", + "from mindspore import ParameterTuple\n", + "from mindspore.train.callback import Callback\n", + "from mindspore.nn.wrap.grad_reducer import DistributedGradReducer\n", + "from mindspore.train.callback import CheckpointConfig, ModelCheckpoint, TimeMonitor\n", + "from mindspore.train import Model\n", + "from 
mindspore.train.serialization import load_checkpoint, load_param_into_net\n", + "from mindspore.nn import Momentum\n", + "from mindspore.common import set_seed\n", + "\n", + "from utils.config import config" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 数据处理\n", + "\n", + "开始实验之前,请确保本地已经安装了Python环境并安装了MindSpore Vision套件。\n", + "\n", + "### 数据准备\n", + "\n", + "COCO2017是一个广泛应用的数据集,带有边框和像素级背景注释。这些注释可用于场景理解任务,如语义分割,目标检测和图像字幕制作。训练和评估的图像数量分别为118K和5K。\n", + "\n", + "数据集大小:19G\n", + "\n", + "训练:18G,118,000个图像\n", + "\n", + "评估:1G,5000个图像\n", + "\n", + "注释:241M;包括实例、字幕、人物关键点等\n", + "\n", + "数据格式:图像及JSON文件\n", + "\n", + "注:数据在dataset.py中处理。\n", + "\n", + "首先,你需要下载 coco2017 数据集。\n", + "\n", + "下载完成后,确保你的数据集存放符合如下路径。" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ".\n", + "└─cocodataset\n", + " ├─annotations\n", + " ├─instance_train2017.json\n", + " └─instance_val2017.json\n", + " ├─val2017\n", + " └─train2017" + ] + } + ], + "source": [ + "!cat datasets.md" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 数据预处理\n", + "\n", + "原始数据集中图像大小不一致,不方便统一读取和检测。我们首先统一图像大小。数据的注释信息保存在json文件中,我们需要读取出来给图像数据加label。" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 数据增强\n", + "\n", + "在你开始训练模型之前,数据增强对于你的数据集以及创建训练数据和测试数据是必要的。对于coco数据集,你可以使用dataset.py为图像添加label,并将它们转换到MindRecord。MindRecord是一种MindSpore指定的数据格式,可以在某些场景下优化MindSpore的性能。" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "首先,我们创建MindRecord数据集保存和读取的地址。" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "from dataset.dataset import create_coco_dataset, data_to_mindrecord_byte_image\n", + "\n", + "def create_mindrecord_dir(prefix, mindrecord_dir):\n", + " \"\"\"Create MindRecord Direction.\"\"\"\n", + " if not os.path.isdir(mindrecord_dir):\n", + " 
os.makedirs(mindrecord_dir)\n", + " if config.dataset == \"coco\":\n", + " if os.path.isdir(config.data_root):\n", + " print(\"Create Mindrecord.\")\n", + " data_to_mindrecord_byte_image(\"coco\", True, prefix)\n", + " print(\"Create Mindrecord Done, at {}\".format(mindrecord_dir))\n", + " else:\n", + " raise Exception(\"coco_root not exits.\")\n", + " else:\n", + " if os.path.isdir(config.IMAGE_DIR) and os.path.exists(config.ANNO_PATH):\n", + " print(\"Create Mindrecord.\")\n", + " data_to_mindrecord_byte_image(\"other\", True, prefix)\n", + " print(\"Create Mindrecord Done, at {}\".format(mindrecord_dir))\n", + " else:\n", + " raise Exception(\"IMAGE_DIR or ANNO_PATH not exits.\")\n", + " while not os.path.exists(mindrecord_file+\".db\"):\n", + " time.sleep(5)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "然后,加载数据集,调用dataset.py中的create_coco_dataset函数完成数据预处理和数据增强。" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Start create dataset!\n", + "total images num: 51790\n", + "Create dataset done!\n" + ] + } + ], + "source": [ + "# Allocating memory Environment\n", + "device_target = config.device_target\n", + "rank = 0\n", + "device_num = 1\n", + "context.set_context(mode=context.GRAPH_MODE, device_target=device_target)\n", + "\n", + "print(\"Start create dataset!\")\n", + "# Call the interface for data processing\n", + "# It will generate mindrecord file in config.mindrecord_dir,\n", + "# and the file name is MaskRcnn.mindrecord0, 1, ... 
file_num.\n", + "prefix = \"MaskRcnn.mindrecord\"\n", + "mindrecord_dir = config.mindrecord_dir\n", + "mindrecord_file = os.path.join(mindrecord_dir, prefix + \"0\")\n", + "if rank == 0 and not os.path.exists(mindrecord_file):\n", + " create_mindrecord_dir(prefix, mindrecord_dir)\n", + "# When create MindDataset, using the fitst mindrecord file,\n", + "# such as MaskRcnn.mindrecord0.\n", + "dataset = create_coco_dataset(mindrecord_file, batch_size=config.batch_size, device_num=device_num, rank_id=rank)\n", + "dataset_size = dataset.get_dataset_size()\n", + "print(\"total images num: \", dataset_size)\n", + "print(\"Create dataset done!\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 数据集可视化\n", + "\n", + "运行以下代码观察数据增强后的图片。可以发现图片经过了旋转处理,并且图片的shape也已经转换为待输入网络的(N,C,H,W)格式,其中N代表样本数量,C代表图片通道,H和W代表图片的高和宽。" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Image shape: (2, 3, 768, 1280)\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "\n", + "show_data = next(dataset.create_dict_iterator())\n", + "\n", + "show_images = show_data[\"image\"].asnumpy()\n", + "print(f'Image shape: {show_images.shape}')\n", + "\n", + "plt.figure()\n", + "\n", + "# 展示2张图片供参考\n", + "for i in range(1, 3):\n", + " plt.subplot(1, 2, i)\n", + "\n", + " # 将图片转换HWC格式\n", + " image_trans = np.transpose(show_images[i - 1], (1, 2, 0))\n", + " image_trans = np.clip(image_trans, 0, 1)\n", + "\n", + " plt.imshow(image_trans[:, :], cmap=None)\n", + " plt.xticks(rotation=180)\n", + " plt.axis(\"off\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 构建网络\n", + "\n", + "![image1](images/framework.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "前文提到Mask RCNN的模型骨干采用ResNet50(原文),通过添加与现有边框检测分支平行的预测目标掩模分支实现扩展Faster R-CNN,完成目标检测。" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 骨干网络\n", + "\n", + "Mask R-CNN骨干网络的选择:ResNet, VGG, Mobilenet等。本项目中,使用了对ResNet为骨干的Mask RCNN进行了框架迁移。以及扩展了Mobilenet这种轻量级网络。\n", + "\n", + "骨干网络:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "1. Resnet(Deep residual network, ResNet),深度残差神经网络,卷积神经网络历史在具有划时代意义的神经网络。与Alexnet和VGG不同的是,网络结构上就有很大的改变,在大家为了提升卷积神经网络的性能在不断提升网络深度的时候,大家发现随着网络深度的提升,网络的效果变得越来越差,甚至出现了网络的退化问题,80层的网络比30层的效果还差,深度网络存在的梯度消失和爆炸问题越来越严重,这使得训练一个优异的深度学习模型变得更加艰难,在这种情况下,网络残差模块可以有效消除梯度消失和梯度爆炸问题。\n", + "\n", + "![image2](images/resnet_block.png)\n", + "\n", + "2. 
Mobilenetv1是一种轻量级的深度卷积网络,MobileNet的基本单元是深度级可分离卷积(depthwise separable convolution),将标准卷积分成两步。第一步 Depthwise convolution(DW),也即逐通道的卷积,一个卷积核负责一个通道,一个通道只被一个卷积核“滤波”,则卷积核个数和通道数个数相同;第二步,Pointwise convolution(PW),将depthwise convolution得到的结果通过1x1卷积,再“串”起来。这样其实整体效果和一个标准卷积是差不多的,但是会大大减少计算量和模型参数量。其网络结构如下。\n", + "\n", + "![image3](images/mobilenetv1.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "原文中,使用Resnet为骨干网络。这里,我们也选择Resnet50作为骨干网络执行案例。" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import mindspore.nn as nn\n", + "import mindspore.common.dtype as mstype\n", + "from mindspore.ops import operations as P\n", + "from mindspore.common.tensor import Tensor\n", + "from mindspore.ops import functional as F\n", + "\n", + "\n", + "ms_cast_type = mstype.float32\n", + "\n", + "\n", + "def weight_init_ones(shape):\n", + " \"\"\"\n", + " Weight init.\n", + "\n", + " Args:\n", + " shape(List): weights shape.\n", + "\n", + " Returns:\n", + " Tensor, weights, default float32.\n", + " \"\"\"\n", + " return Tensor(np.array(np.ones(shape).astype(np.float32) * 0.01).astype(np.float32))\n", + "\n", + "\n", + "def _conv(in_channels, out_channels, kernel_size=3, stride=1, padding=0, pad_mode='pad'):\n", + " \"\"\"\n", + " Conv2D wrapper.\n", + "\n", + " Args:\n", + " in_channels (int): The channel number of the input tensor of the Conv2d layer.\n", + " out_channels (int): The channel number of the output tensor of the Conv2d layer.\n", + " kernel_size (Union[int, tuple[int]]): Specifies the height and width of the 2D convolution kernel.\n", + " The data type is an integer or a tuple of two integers. An integer represents the height\n", + " and width of the convolution kernel. A tuple of two integers represents the height\n", + " and width of the convolution kernel respectively. 
Default: 3.\n", + " stride (Union[int, tuple[int]]): The movement stride of the 2D convolution kernel.\n", + " The data type is an integer or a tuple of two integers. An integer represents the movement step size\n", + " in both height and width directions. A tuple of two integers represents the movement step size in the height\n", + " and width directions respectively. Default: 1.\n", + " padding (Union[int, tuple[int]]): The number of padding on the height and width directions of the input.\n", + " The data type is an integer or a tuple of four integers. If `padding` is an integer,\n", + " then the top, bottom, left, and right padding are all equal to `padding`.\n", + " If `padding` is a tuple of 4 integers, then the top, bottom, left, and right padding\n", + " is equal to `padding[0]`, `padding[1]`, `padding[2]`, and `padding[3]` respectively.\n", + " The value should be greater than or equal to 0. Default: 0.\n", + " pad_mode (str): Specifies padding mode. The optional values are\n", + " \"same\", \"valid\", \"pad\". Default: \"pad\".\n", + "\n", + " Outputs:\n", + " Tensor, math '(N, C_{out}, H_{out}, W_{out})' or math '(N, H_{out}, W_{out}, C_{out})'.\n", + " \"\"\"\n", + " shape = (out_channels, in_channels, kernel_size, kernel_size)\n", + " weights = weight_init_ones(shape)\n", + " return nn.Conv2d(in_channels, out_channels,\n", + " kernel_size=kernel_size, stride=stride, padding=padding,\n", + " pad_mode=pad_mode, weight_init=weights, has_bias=False).to_float(ms_cast_type)\n", + "\n", + "\n", + "def _batch_norm2d_init(out_chls, momentum=0.1, affine=True, use_batch_statistics=True):\n", + " \"\"\"\n", + " Batchnorm2D wrapper.\n", + "\n", + " Args:\n", + " out_cls (int): The number of channels of the input tensor. Expected input size is (N, C, H, W),\n", + " `C` represents the number of channels\n", + " momentum (float): A floating hyperparameter of the momentum for the\n", + " running_mean and running_var computation. 
Default: 0.1.\n", + " affine (bool): A bool value. When set to True, gamma and beta can be learned. Default: True.\n", + " use_batch_statistics (bool):\n", + "\n", + " - If true, use the mean value and variance value of current batch data and track running mean\n", + " and running variance. Default: True.\n", + " - If false, use the mean value and variance value of specified value, and not track statistical value.\n", + " - If None, the use_batch_statistics is automatically set to true or false according to the training\n", + " and evaluation mode. During training, the parameter is set to true, and during evaluation, the\n", + " parameter is set to false.\n", + " Outputs:\n", + " Tensor, the normalized, scaled, offset tensor, of shape :math:'(N, C_{out}, H_{out}, W_{out})'.\n", + " \"\"\"\n", + " gamma_init = Tensor(np.array(np.ones(out_chls)).astype(np.float32))\n", + " beta_init = Tensor(np.array(np.ones(out_chls) * 0).astype(np.float32))\n", + " moving_mean_init = Tensor(np.array(np.ones(out_chls) * 0).astype(np.float32))\n", + " moving_var_init = Tensor(np.array(np.ones(out_chls)).astype(np.float32))\n", + "\n", + " return nn.BatchNorm2d(out_chls, momentum=momentum, affine=affine, gamma_init=gamma_init,\n", + " beta_init=beta_init, moving_mean_init=moving_mean_init,\n", + " moving_var_init=moving_var_init,\n", + " use_batch_statistics=use_batch_statistics)\n", + "\n", + "\n", + "class ResNetFea(nn.Cell):\n", + " \"\"\"\n", + " ResNet architecture.\n", + "\n", + " Args:\n", + " block (Cell): Block for network.\n", + " layer_nums (list): Numbers of block in different layers.\n", + " in_channels (list): Input channel in each layer.\n", + " out_channels (list): Output channel in each layer.\n", + " weights_update (bool): Weight update flag.\n", + "\n", + " Inputs:\n", + " - **x** (Cell) - Input block.\n", + "\n", + " Outputs:\n", + " Cell, output block.\n", + "\n", + " Support Plarforms:\n", + " ``Ascend`` ``CPU`` ``GPU``\n", + "\n", + " Examples:\n", + " >>> 
ResNetFea(ResidualBlockUsing, [3, 4, 6, 3], [64, 256, 512, 1024], [256, 512, 1024, 2048], False)\n", + " \"\"\"\n", + " def __init__(self, block, layer_nums, in_channels, out_channels, weights_update=False):\n", + " super(ResNetFea, self).__init__()\n", + "\n", + " if not len(layer_nums) == len(in_channels) == len(out_channels) == 4:\n", + " raise ValueError(\"the length of \"\n", + " \"layer_num, inchannel, outchannel list must be 4!\")\n", + "\n", + " bn_training = False\n", + " self.conv1 = _conv(3, 64, kernel_size=7, stride=2, padding=3, pad_mode='pad')\n", + " self.bn1 = _batch_norm2d_init(64, affine=bn_training, use_batch_statistics=bn_training)\n", + " self.relu = P.ReLU()\n", + " self.maxpool = P.MaxPool(kernel_size=3, strides=2, pad_mode=\"SAME\")\n", + " self.weights_update = weights_update\n", + "\n", + " if not self.weights_update:\n", + " self.conv1.weight.requires_grad = False\n", + "\n", + " self.layer1 = self._make_layer(block, layer_nums[0], in_channel=in_channels[0],\n", + " out_channel=out_channels[0], stride=1, training=bn_training,\n", + " weights_update=self.weights_update)\n", + " self.layer2 = self._make_layer(block, layer_nums[1], in_channel=in_channels[1],\n", + " out_channel=out_channels[1], stride=2,\n", + " training=bn_training, weights_update=True)\n", + " self.layer3 = self._make_layer(block, layer_nums[2], in_channel=in_channels[2],\n", + " out_channel=out_channels[2], stride=2,\n", + " training=bn_training, weights_update=True)\n", + " self.layer4 = self._make_layer(block, layer_nums[3], in_channel=in_channels[3],\n", + " out_channel=out_channels[3], stride=2,\n", + " training=bn_training, weights_update=True)\n", + "\n", + " def _make_layer(self, block, layer_num, in_channel, out_channel, stride, training=False, weights_update=False):\n", + " \"\"\"\n", + " Make layer for resnet backbone.\n", + "\n", + " Args:\n", + " block (Cell): ResNet block.\n", + " layer_num (int): Layer number.\n", + " in_channel (int): Input channel.\n", + " 
out_channel (int): Output channel.\n", + " stride (int): Stride size for convolutional layer.\n", + " training(bool): Whether to do training. Default: False.\n", + " weights_update(bool): Whether to update weights. Default: False.\n", + "\n", + " Returns:\n", + " SequentialCell, Combine several layers toghter.\n", + "\n", + " Examples:\n", + " >>> _make_layer(InvertedResidual, 4, 64, 64, 1)\n", + " \"\"\"\n", + " layers = []\n", + " down_sample = False\n", + " if stride != 1 or in_channel != out_channel:\n", + " down_sample = True\n", + " resblk = block(in_channel, out_channel, stride=stride, down_sample=down_sample,\n", + " training=training, weights_update=weights_update)\n", + " layers.append(resblk)\n", + "\n", + " for _ in range(1, layer_num):\n", + " resblk = block(out_channel, out_channel, stride=1, training=training, weights_update=weights_update)\n", + " layers.append(resblk)\n", + "\n", + " return nn.SequentialCell(layers)\n", + "\n", + " def construct(self, x):\n", + " \"\"\"Construct ResNet architecture.\"\"\"\n", + " x = self.conv1(x)\n", + " x = self.bn1(x)\n", + " x = self.relu(x)\n", + " c1 = self.maxpool(x)\n", + "\n", + " c2 = self.layer1(c1)\n", + " identity = c2\n", + " if not self.weights_update:\n", + " identity = F.stop_gradient(c2)\n", + " c3 = self.layer2(identity)\n", + " c4 = self.layer3(c3)\n", + " c5 = self.layer4(c4)\n", + "\n", + " return identity, c3, c4, c5\n", + "\n", + "\n", + "class ResidualBlockUsing(nn.Cell):\n", + " \"\"\"\n", + " ResNet V1 residual block definition.\n", + "\n", + " Args:\n", + " in_channels (int): Input channel.\n", + " out_channels (int): Output channel.\n", + " stride (int): Stride size for the initial convolutional layer. Default: 1.\n", + " down_sample (bool): If to do the downsample in block. Default: False.\n", + " momentum (float): Momentum for batchnorm layer. Default: 0.1.\n", + " training (bool): Training flag. Default: False.\n", + " weights_updata (bool): Weights update flag. 
Default: False.\n", + "\n", + " Inputs:\n", + " - **x** (Cell) - Input block.\n", + "\n", + " Outputs:\n", + " Cell, output block.\n", + "\n", + " Support Plarforms:\n", + " ``Ascend`` ``CPU`` ``GPU``\n", + "\n", + " Examples:\n", + " ResidualBlockUsing(3, 256, stride=2, down_sample=True)\n", + " \"\"\"\n", + " expansion = 4\n", + "\n", + " def __init__(self, in_channels, out_channels, stride=1, down_sample=False,\n", + " momentum=0.1, training=False, weights_update=False):\n", + " super(ResidualBlockUsing, self).__init__()\n", + "\n", + " self.affine = weights_update\n", + "\n", + " out_chls = out_channels // self.expansion\n", + " self.conv1 = _conv(in_channels, out_chls, kernel_size=1, stride=1, padding=0)\n", + " self.bn1 = _batch_norm2d_init(out_chls, momentum=momentum, affine=self.affine, use_batch_statistics=training)\n", + "\n", + " self.conv2 = _conv(out_chls, out_chls, kernel_size=3, stride=stride, padding=1)\n", + " self.bn2 = _batch_norm2d_init(out_chls, momentum=momentum, affine=self.affine, use_batch_statistics=training)\n", + "\n", + " self.conv3 = _conv(out_chls, out_channels, kernel_size=1, stride=1, padding=0)\n", + " self.bn3 = _batch_norm2d_init(out_channels, momentum=momentum, affine=self.affine,\n", + " use_batch_statistics=training)\n", + "\n", + " if training:\n", + " self.bn1 = self.bn1.set_train()\n", + " self.bn2 = self.bn2.set_train()\n", + " self.bn3 = self.bn3.set_train()\n", + "\n", + " if not weights_update:\n", + " self.conv1.weight.requires_grad = False\n", + " self.conv2.weight.requires_grad = False\n", + " self.conv3.weight.requires_grad = False\n", + "\n", + " self.relu = P.ReLU()\n", + " self.downsample = down_sample\n", + " if self.downsample:\n", + " self.conv_down_sample = _conv(in_channels, out_channels, kernel_size=1, stride=stride, padding=0)\n", + " self.bn_down_sample = _batch_norm2d_init(out_channels, momentum=momentum, affine=self.affine,\n", + " use_batch_statistics=training)\n", + " if training:\n", + " 
self.bn_down_sample = self.bn_down_sample.set_train()\n", + " if not weights_update:\n", + " self.conv_down_sample.weight.requires_grad = False\n", + " self.add = P.Add()\n", + "\n", + " def construct(self, x):\n", + " \"\"\"Construct ResNet V1 residual block.\"\"\"\n", + " identity = x\n", + "\n", + " out = self.conv1(x)\n", + " out = self.bn1(out)\n", + " out = self.relu(out)\n", + "\n", + " out = self.conv2(out)\n", + " out = self.bn2(out)\n", + " out = self.relu(out)\n", + "\n", + " out = self.conv3(out)\n", + " out = self.bn3(out)\n", + "\n", + " if self.downsample:\n", + " identity = self.conv_down_sample(identity)\n", + " identity = self.bn_down_sample(identity)\n", + "\n", + " out = self.add(out, identity)\n", + " out = self.relu(out)\n", + "\n", + " return out" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### FPN网络\n", + "\n", + "FPN网络(Feature Pyramid Network)同时利用低层特征高分辨率和高层特征的高语义信息,通过融合这些不同层的特征达到预测的效果。并且预测是在每个融合后的特征层上单独进行的,这和常规的特征融合方式不同。\n", + "\n", + "骨干网络和FPN网络结合构成了Mask RCNN网络的卷积层。" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "def bias_init_zeros(shape):\n", + " \"\"\"Bias init method.\"\"\"\n", + " result = Tensor(np.array(np.zeros(shape).astype(np.float32)), dtype=mstype.float32)\n", + " return result\n", + "\n", + "\n", + "def _conv(in_channels, out_channels, kernel_size=3, stride=1, padding=0, pad_mode='pad'):\n", + " \"\"\"\n", + " Conv2D wrapper.\n", + "\n", + " Args:\n", + " in_channels(int): Input channel num.\n", + " out_channels(int): Output channel num.\n", + " kernel_size(int): Kernel size. Default: 1.\n", + " stride(int): Stride. Default: 1.\n", + " padding(int): Padding range. Default: 0.\n", + " pad_mode(bool): Padding model. Default: 'pad'.\n", + " gain(int): Gain. 
Default: 1.\n", + "\n", + " Returns:\n", + " Tensor, Convoluted result.\n", + " \"\"\"\n", + " shape = (out_channels, in_channels, kernel_size, kernel_size)\n", + " weights = initializer(\"XavierUniform\", shape=shape, dtype=mstype.float32)\n", + " shape_bias = (out_channels,)\n", + " biass = bias_init_zeros(shape_bias)\n", + " return nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding,\n", + " pad_mode=pad_mode, weight_init=weights, has_bias=True, bias_init=biass)\n", + "\n", + "\n", + "class FeatPyramidNeck(nn.Cell):\n", + " \"\"\"\n", + " Feature pyramid network cell, usually uses as network neck.\n", + "\n", + " Applies the convolution on multiple, input feature maps\n", + " and output feature map with same channel size. if required num of\n", + " output larger then num of inputs, add extra maxpooling for further\n", + " downsampling;\n", + "\n", + " Args:\n", + " in_channels (tuple): Channel size of input feature maps.\n", + " out_channels (int): Channel size output.\n", + " num_outs (int): Num of output features.\n", + " Inputs:\n", + " - **x** (Tensor) - Input variant\n", + "\n", + " Outputs:\n", + " Tuple, with tensors of same channel size.\n", + "\n", + " Support Platform:\n", + " ``Ascend`` ``CPU`` ``GPU``\n", + "\n", + " Examples:\n", + " >>> neck = FeatPyramidNeck([100,200,300], 50, 4)\n", + " >>> input_data = (normal(0,0.1,(1,c,1280//(4*2**i), 768//(4*2**i)),\n", + " ... 
dtype=np.float32) for i, c in enumerate(config.fpn_in_channels))\n", + " >>> out = neck(input_data)\n", + " \"\"\"\n", + "\n", + " def __init__(self,\n", + " in_channels,\n", + " out_channels,\n", + " num_outs):\n", + " super(FeatPyramidNeck, self).__init__()\n", + "\n", + " self.cast_type = mstype.float32\n", + "\n", + " self.num_outs = num_outs\n", + " self.in_channels = in_channels\n", + " self.fpn_layer = len(self.in_channels)\n", + "\n", + " assert not self.num_outs < len(in_channels)\n", + "\n", + " self.lateral_convs_list_ = []\n", + " self.fpn_convs_ = []\n", + "\n", + " for _, channel in enumerate(in_channels):\n", + " l_conv = _conv(channel, out_channels, kernel_size=1, stride=1, padding=0,\n", + " pad_mode='valid').to_float(self.cast_type)\n", + " fpn_conv = _conv(out_channels, out_channels, kernel_size=3, stride=1, padding=0,\n", + " pad_mode='same').to_float(self.cast_type)\n", + " self.lateral_convs_list_.append(l_conv)\n", + " self.fpn_convs_.append(fpn_conv)\n", + " self.lateral_convs_list = nn.layer.CellList(self.lateral_convs_list_)\n", + " self.fpn_convs_list = nn.layer.CellList(self.fpn_convs_)\n", + " self.interpolate1 = P.ResizeBilinear((48, 80))\n", + " self.interpolate2 = P.ResizeBilinear((96, 160))\n", + " self.interpolate3 = P.ResizeBilinear((192, 320))\n", + " self.cast = P.Cast()\n", + " self.maxpool = P.MaxPool(kernel_size=1, strides=2, pad_mode=\"same\")\n", + "\n", + " def construct(self, inputs):\n", + " \"\"\"construction of Feature Pyramid Neck.\"\"\"\n", + " layers = ()\n", + " for i in range(self.fpn_layer):\n", + " layers += (self.lateral_convs_list[i](inputs[i]),)\n", + "\n", + " cast_layers = (layers[3],)\n", + " cast_layers = \\\n", + " cast_layers + (layers[2] + self.cast(self.interpolate1(cast_layers[self.fpn_layer - 4]), self.cast_type),)\n", + " cast_layers = \\\n", + " cast_layers + (layers[1] + self.cast(self.interpolate2(cast_layers[self.fpn_layer - 3]), self.cast_type),)\n", + " cast_layers = \\\n", + " cast_layers + 
(layers[0] + self.cast(self.interpolate3(cast_layers[self.fpn_layer - 2]), self.cast_type),)\n", + "\n", + " layers_arranged = ()\n", + " for i in range(self.fpn_layer - 1, -1, -1):\n", + " layers_arranged = layers_arranged + (cast_layers[i],)\n", + "\n", + " outs = ()\n", + " for i in range(self.fpn_layer):\n", + " outs = outs + (self.fpn_convs_list[i](layers_arranged[i]),)\n", + "\n", + " for i in range(self.num_outs - self.fpn_layer):\n", + " outs = outs + (self.maxpool(outs[3]),)\n", + " return outs\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### RPN网络\n", + "\n", + "RPN第一次出现在世人眼中是在Faster RCNN这个结构中,专门用来提取候选框,在RCNN和Fast RCNN等物体检测架构中,用来提取候选框的方法通常是Selective Search,是比较传统的方法,而且比较耗时,在CPU上要2s一张图。所以作者提出RPN,专门用来提取候选框,一方面RPN耗时少,另一方面RPN可以很容易结合到Fast RCNN中,成为一个整体。\n", + "\n", + "RPN网络主要输出项:\n", + "\n", + "1. ROI:对应在特征层每个特征点产生4k个变量,其中4表示[dy, dx, dh, dw]四个边框平移缩放量。其中k表示4个边框,k=4。\n", + "\n", + "2. scores:对应在特征层每个特征点产生2k个变量,其中2表示前景和背景概率。其中k表示3个边框,k=3。" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "from model.bbox_assign_sample import BboxAssignSample\n", + "\n", + "\n", + "class RpnRegClsBlock(nn.Cell):\n", + " \"\"\"\n", + " Rpn reg cls block for rpn layer\n", + "\n", + " Args:\n", + " in_channels (int): Input channels of shared convolution.\n", + " feat_channels (int): Output channels of shared convolution.\n", + " num_anchors (int): The anchor number.\n", + " cls_out_channels (int): Output channels of classification convolution.\n", + " weight_conv (Tensor): Weight init for rpn conv.\n", + " bias_conv (Tensor): Bias init for rpn conv.\n", + " weight_cls (Tensor): Weight init for rpn cls conv.\n", + " bias_cls (Tensor): Bias init for rpn cls conv.\n", + " weight_reg (Tensor): Weight init for rpn reg conv.\n", + " bias_reg (Tensor): Bias init for rpn reg conv.\n", + "\n", + " Inputs:\n", + " - **x** (Tensor) - input variant\n", + "\n", + " Outputs:\n", + " Tensor, output 
tensor.\n", + "\n", + " Support Platform:\n", + " ``Ascend`` ``CPU`` ``GPU``\n", + "\n", + " Examples:\n", + " >>> x = Tensor(np.array([[[[1., 2.], [3., 4.]]]]), mindspore.float32)\n", + " >>> weight_conv = Tensor(np.array([[[[0.2, 0.3], [0.4, 0.1]]]]), mindspore.float32)\n", + " >>> bias_conv = Tensor(np.array([[[[0., 0.], [0., 0.]]]]), mindspore.float32)\n", + " >>> weight_cls = Tensor(np.array([[[[0.2, 0.3], [0.4, 0.1]]]]), mindspore.float32)\n", + " >>> bias_cls = Tensor(np.array([[[[0., 0.], [0., 0.]]]]), mindspore.float32)\n", + " >>> weight_reg = Tensor(np.array([[[[0.2, 0.3], [0.4, 0.1]]]]), mindspore.float32)\n", + " >>> bias_reg = Tensor(np.array([[[[0., 0.], [0., 0.]]]]), mindspore.float32)\n", + " >>> rpn = RpnRegClsBlock(2, 2, 4, 4, )\n", + " >>> rpn = ops.SingleRoIExtractor(2, 2, 0.5, 2, weight_conv, bias_conv,\n", + " ... weight_cls, bias_cls, weight_reg, bias_reg)\n", + " >>> output = rpn(x)\n", + " \"\"\"\n", + " def __init__(self, in_channels, feat_channels, num_anchors, cls_out_channels, weight_conv,\n", + " bias_conv, weight_cls, bias_cls, weight_reg, bias_reg):\n", + " super(RpnRegClsBlock, self).__init__()\n", + " self.rpn_conv = nn.Conv2d(in_channels, feat_channels, kernel_size=3,\n", + " stride=1, pad_mode='same',\n", + " has_bias=True, weight_init=weight_conv,\n", + " bias_init=bias_conv)\n", + " self.relu = nn.ReLU()\n", + "\n", + " self.rpn_cls = nn.Conv2d(feat_channels, num_anchors * cls_out_channels,\n", + " kernel_size=1, pad_mode='valid',\n", + " has_bias=True, weight_init=weight_cls,\n", + " bias_init=bias_cls)\n", + " self.rpn_reg = nn.Conv2d(feat_channels, num_anchors * 4,\n", + " kernel_size=1, pad_mode='valid',\n", + " has_bias=True, weight_init=weight_reg,\n", + " bias_init=bias_reg)\n", + "\n", + " def construct(self, x):\n", + " \"\"\"Construct Rpn reg cls block for rpn layer.\"\"\"\n", + " x = self.relu(self.rpn_conv(x))\n", + "\n", + " x1 = self.rpn_cls(x)\n", + " x2 = self.rpn_reg(x)\n", + "\n", + " return x1, x2\n", + 
"\n", + "\n", + "class RPN(nn.Cell):\n", + " \"\"\"\n", + " ROI proposal network..\n", + "\n", + " Args:\n", + " config (dict): Config.\n", + " batch_size (int): Batchsize.\n", + " in_channels (int): Input channels of shared convolution.\n", + " feat_channels (int): Output channels of shared convolution.\n", + " num_anchors (int): The anchor number.\n", + " cls_out_channels (int): Output channels of classification convolution.\n", + "\n", + " Inputs:\n", + " - **inputs** (Tensor) - Input variant.\n", + " - **img_metas** (Tensor) - Img shape.\n", + " - **anchor_list** (Tensor) - A list of anchors.\n", + " - **gt_bboxes** (Tensor) - Ground truth bounding boxes.\n", + " - **gt_labels** (Tensor) - Ground truth labels.\n", + " - **gt_valids** (Tensor) - Ground truth validations.\n", + "\n", + " Outputs:\n", + " Tuple, tuple of output tensor.\n", + "\n", + " Support Platform:\n", + " ``Ascend`` ``CPU`` ``GPU``\n", + "\n", + " Examples:\n", + " >>> RPN(config=config, batch_size=2, in_channels=256, feat_channels=1024,\n", + " ... 
num_anchors=3, cls_out_channels=512)\n", + " \"\"\"\n", + " def __init__(self, config, batch_size, in_channels, feat_channels, num_anchors, cls_out_channels):\n", + " super(RPN, self).__init__()\n", + " cfg_rpn = config\n", + "\n", + " self.cast_type = mstype.float32\n", + " self.np_cast_type = np.float32\n", + "\n", + " self.num_bboxes = cfg_rpn.num_bboxes\n", + " self.slice_index = ()\n", + " self.feature_anchor_shape = ()\n", + " self.slice_index += (0,)\n", + " index = 0\n", + " for shape in cfg_rpn.feature_shapes:\n", + " self.slice_index += (self.slice_index[index] + shape[0] * shape[1] * num_anchors,)\n", + " self.feature_anchor_shape += (shape[0] * shape[1] * num_anchors * batch_size,)\n", + " index += 1\n", + "\n", + " self.num_anchors = num_anchors\n", + " self.batch_size = batch_size\n", + " self.test_batch_size = cfg_rpn.test_batch_size\n", + " self.num_layers = 5\n", + " self.real_ratio = Tensor(np.ones((1, 1)).astype(self.np_cast_type))\n", + "\n", + " self.rpn_convs_list = nn.layer.CellList(self._make_rpn_layer(self.num_layers, in_channels, feat_channels,\n", + " num_anchors, cls_out_channels))\n", + "\n", + " self.transpose = P.Transpose()\n", + " self.reshape = P.Reshape()\n", + " self.concat = P.Concat(axis=0)\n", + " self.fill = P.Fill()\n", + " self.placeh1 = Tensor(np.ones((1,)).astype(self.np_cast_type))\n", + "\n", + " self.trans_shape = (0, 2, 3, 1)\n", + "\n", + " self.reshape_shape_reg = (-1, 4)\n", + " self.reshape_shape_cls = (-1,)\n", + " self.rpn_loss_reg_weight = Tensor(np.array(cfg_rpn.rpn_loss_reg_weight).astype(self.np_cast_type))\n", + " self.rpn_loss_cls_weight = Tensor(np.array(cfg_rpn.rpn_loss_cls_weight).astype(self.np_cast_type))\n", + " expected_total_size = cfg_rpn.num_expected_neg * self.batch_size\n", + " self.num_expected_total = Tensor(np.array(expected_total_size).astype(self.np_cast_type))\n", + " self.num_bboxes = cfg_rpn.num_bboxes\n", + " self.get_targets = BboxAssignSample(cfg_rpn, self.batch_size, 
self.num_bboxes, False)\n", + " self.check_valid = P.CheckValid()\n", + " self.sum_loss = P.ReduceSum()\n", + " self.loss_cls = P.SigmoidCrossEntropyWithLogits()\n", + " self.loss_bbox = P.SmoothL1Loss(beta=1.0/9.0)\n", + " self.squeeze = P.Squeeze()\n", + " self.cast = P.Cast()\n", + " self.tile = P.Tile()\n", + " self.zeros_like = P.ZerosLike()\n", + " self.loss = Tensor(np.zeros((1,)).astype(self.np_cast_type))\n", + " self.clsloss = Tensor(np.zeros((1,)).astype(self.np_cast_type))\n", + " self.regloss = Tensor(np.zeros((1,)).astype(self.np_cast_type))\n", + "\n", + " def _make_rpn_layer(self, num_layers, in_channels,\n", + " feat_channels, num_anchors, cls_out_channels):\n", + " \"\"\"\n", + " Make rpn layer for rpn proposal network\n", + "\n", + " Args:\n", + " num_layers (int): layer num.\n", + " in_channels (int): Input channels of shared convolution.\n", + " feat_channels (int): Output channels of shared convolution.\n", + " num_anchors (int): The anchor number.\n", + " cls_out_channels (int): Output channels of classification convolution.\n", + "\n", + " Returns:\n", + " List, list of RpnRegClsBlock cells.\n", + " \"\"\"\n", + " rpn_layer = []\n", + "\n", + " shp_weight_conv = (feat_channels, in_channels, 3, 3)\n", + " shp_bias_conv = (feat_channels,)\n", + " weight_conv = initializer('Normal', shape=shp_weight_conv, dtype=mstype.float32)\n", + " bias_conv = initializer(0, shape=shp_bias_conv, dtype=mstype.float32)\n", + "\n", + " shp_weight_cls = (num_anchors * cls_out_channels, feat_channels, 1, 1)\n", + " shp_bias_cls = (num_anchors * cls_out_channels,)\n", + " weight_cls = initializer('Normal', shape=shp_weight_cls, dtype=mstype.float32)\n", + " bias_cls = initializer(0, shape=shp_bias_cls, dtype=mstype.float32)\n", + "\n", + " shp_weight_reg = (num_anchors * 4, feat_channels, 1, 1)\n", + " shp_bias_reg = (num_anchors * 4,)\n", + " weight_reg = initializer('Normal', shape=shp_weight_reg, dtype=mstype.float32)\n", + " bias_reg = initializer(0, 
shape=shp_bias_reg, dtype=mstype.float32)\n", + "\n", + " for i in range(num_layers):\n", + " rpn_layer.append(RpnRegClsBlock(in_channels, feat_channels, num_anchors, cls_out_channels, weight_conv,\n", + " bias_conv, weight_cls, bias_cls, weight_reg,\n", + " bias_reg).to_float(self.cast_type))\n", + "\n", + " for i in range(1, num_layers):\n", + " rpn_layer[i].rpn_conv.weight = rpn_layer[0].rpn_conv.weight\n", + " rpn_layer[i].rpn_cls.weight = rpn_layer[0].rpn_cls.weight\n", + " rpn_layer[i].rpn_reg.weight = rpn_layer[0].rpn_reg.weight\n", + "\n", + " rpn_layer[i].rpn_conv.bias = rpn_layer[0].rpn_conv.bias\n", + " rpn_layer[i].rpn_cls.bias = rpn_layer[0].rpn_cls.bias\n", + " rpn_layer[i].rpn_reg.bias = rpn_layer[0].rpn_reg.bias\n", + "\n", + " return rpn_layer\n", + "\n", + " def construct(self, inputs, img_metas, anchor_list, gt_bboxes, gt_labels, gt_valids):\n", + " \"\"\"Construct ROI Proposal Network.\"\"\"\n", + " loss_print = ()\n", + " rpn_cls_score = ()\n", + " rpn_bbox_pred = ()\n", + " rpn_cls_score_total = ()\n", + " rpn_bbox_pred_total = ()\n", + "\n", + " for i in range(self.num_layers):\n", + " x1, x2 = self.rpn_convs_list[i](inputs[i])\n", + "\n", + " rpn_cls_score_total = rpn_cls_score_total + (x1,)\n", + " rpn_bbox_pred_total = rpn_bbox_pred_total + (x2,)\n", + "\n", + " x1 = self.transpose(x1, self.trans_shape)\n", + " x1 = self.reshape(x1, self.reshape_shape_cls)\n", + "\n", + " x2 = self.transpose(x2, self.trans_shape)\n", + " x2 = self.reshape(x2, self.reshape_shape_reg)\n", + "\n", + " rpn_cls_score = rpn_cls_score + (x1,)\n", + " rpn_bbox_pred = rpn_bbox_pred + (x2,)\n", + "\n", + " loss = self.loss\n", + " clsloss = self.clsloss\n", + " regloss = self.regloss\n", + " bbox_targets = ()\n", + " bbox_weights = ()\n", + " labels = ()\n", + " label_weights = ()\n", + "\n", + " output = ()\n", + " if self.training:\n", + " for i in range(self.batch_size):\n", + " multi_level_flags = ()\n", + " anchor_list_tuple = ()\n", + "\n", + " for j in 
range(self.num_layers):\n", + " res = self.cast(self.check_valid(anchor_list[j], self.squeeze(img_metas[i:i + 1:1, ::])),\n", + " mstype.int32)\n", + " multi_level_flags = multi_level_flags + (res,)\n", + " anchor_list_tuple = anchor_list_tuple + (anchor_list[j],)\n", + "\n", + " valid_flag_list = self.concat(multi_level_flags)\n", + " anchor_using_list = self.concat(anchor_list_tuple)\n", + "\n", + " gt_bboxes_i = self.squeeze(gt_bboxes[i:i + 1:1, ::])\n", + " gt_labels_i = self.squeeze(gt_labels[i:i + 1:1, ::])\n", + " gt_valids_i = self.squeeze(gt_valids[i:i + 1:1, ::])\n", + "\n", + " bbox_target, bbox_weight, label, label_weight = \\\n", + " self.get_targets(gt_bboxes_i, gt_labels_i, self.cast(valid_flag_list, mstype.bool_),\n", + " anchor_using_list, gt_valids_i)\n", + "\n", + " bbox_weight = self.cast(bbox_weight, self.cast_type)\n", + " label = self.cast(label, self.cast_type)\n", + " label_weight = self.cast(label_weight, self.cast_type)\n", + "\n", + " for j in range(self.num_layers):\n", + " begin = self.slice_index[j]\n", + " end = self.slice_index[j + 1]\n", + " stride = 1\n", + " bbox_targets += (bbox_target[begin:end:stride, ::],)\n", + " bbox_weights += (bbox_weight[begin:end:stride],)\n", + " labels += (label[begin:end:stride],)\n", + " label_weights += (label_weight[begin:end:stride],)\n", + "\n", + " for i in range(self.num_layers):\n", + " bbox_target_using = ()\n", + " bbox_weight_using = ()\n", + " label_using = ()\n", + " label_weight_using = ()\n", + "\n", + " for j in range(self.batch_size):\n", + " bbox_target_using += (bbox_targets[i + (self.num_layers * j)],)\n", + " bbox_weight_using += (bbox_weights[i + (self.num_layers * j)],)\n", + " label_using += (labels[i + (self.num_layers * j)],)\n", + " label_weight_using += (label_weights[i + (self.num_layers * j)],)\n", + "\n", + " bbox_target_with_batchsize = self.concat(bbox_target_using)\n", + " bbox_weight_with_batchsize = self.concat(bbox_weight_using)\n", + " label_with_batchsize = 
self.concat(label_using)\n", + " label_weight_with_batchsize = self.concat(label_weight_using)\n", + "\n", + " # stop\n", + " bbox_target_ = F.stop_gradient(bbox_target_with_batchsize)\n", + " bbox_weight_ = F.stop_gradient(bbox_weight_with_batchsize)\n", + " label_ = F.stop_gradient(label_with_batchsize)\n", + " label_weight_ = F.stop_gradient(label_weight_with_batchsize)\n", + "\n", + " cls_score_i = rpn_cls_score[i]\n", + " reg_score_i = rpn_bbox_pred[i]\n", + "\n", + " loss_cls = self.loss_cls(cls_score_i, label_)\n", + " loss_cls_item = loss_cls * label_weight_\n", + " loss_cls_item = self.sum_loss(loss_cls_item, (0,)) / self.num_expected_total\n", + "\n", + " loss_reg = self.loss_bbox(reg_score_i, bbox_target_)\n", + " bbox_weight_ = self.tile(self.reshape(bbox_weight_, (self.feature_anchor_shape[i], 1)), (1, 4))\n", + " loss_reg = loss_reg * bbox_weight_\n", + " loss_reg_item = self.sum_loss(loss_reg, (1,))\n", + " loss_reg_item = self.sum_loss(loss_reg_item, (0,)) / self.num_expected_total\n", + "\n", + " loss_total = self.rpn_loss_cls_weight * loss_cls_item + self.rpn_loss_reg_weight * loss_reg_item\n", + "\n", + " loss += loss_total\n", + " loss_print += (loss_total, loss_cls_item, loss_reg_item)\n", + " clsloss += loss_cls_item\n", + " regloss += loss_reg_item\n", + "\n", + " output = (loss, rpn_cls_score_total, rpn_bbox_pred_total,\n", + " clsloss, regloss, loss_print)\n", + " else:\n", + " output = (self.placeh1, rpn_cls_score_total, rpn_bbox_pred_total,\n", + " self.placeh1, self.placeh1, self.placeh1)\n", + " return output\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### ROI Align\n", + "\n", + "ROI Align可以计算不同proposal对应到不同尺度下的特征,利用proposal对该特征进行剪裁、resize、pooling提取特征。\n", + "\n", + "Mask-RCNN中使用的ROI Level校准:\n", + "\n", + "$$\n", + "k=[k_0+\\log_2{(\\frac{\\sqrt{wh}}{224/\\sqrt{image\\; area}})}]\n", + "$$\n", + "\n", + "#### 解释\n", + "\n", + "1. 
由于Mask R-CNN训练数据的box和anchor都做了调整,所以ROI Level的计算部分也需要 $224/\\sqrt{image\\; area}$。其中,224应为输入图像尺寸的一半。\n", + "\n", + "2. 计算得到的k即为ROI对应的level,level一共4个:\n", + "\n", + " 1. $level=2$表示映射回特征 $P_{2}$,大小为原输入图像的 $1/4$。\n", + "\n", + " 2. $level=3$表示映射回特征 $P_{3}$,大小为原输入图像的 $1/8$。\n", + "\n", + " 3. $level=4$表示映射回特征 $P_{4}$,大小为原输入图像的 $1/16$。\n", + "\n", + " 4. $level=5$表示映射回特征 $P_{5}$,大小为原输入图像的 $1/32$。\n", + "\n", + "![image4](images/roi_align.png)\n", + "\n", + "虚线网格表示特征图,实线表示RoI(在本例中为2×2个bin),点表示每个容器中的4个采样点。RoIAlign通过双线性插值从特征图上附近的网格点(最近的4个)计算每个采样点的值。在ROI、4个bin或采样点中涉及的任何坐标上都不进行量化。" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "class ROIAlign(nn.Cell):\n", + " \"\"\"\n", + " Extract RoI features from mulitiple feature map.\n", + "\n", + " Args:\n", + " out_size_h (int): RoI height.\n", + " out_size_w (int): RoI width.\n", + " spatial_scale (int): RoI spatial scale.\n", + " sample_num (int): RoI sample number. Default: 0.\n", + " roi_align_mode (int): RoI align mode. Default: 1.\n", + "\n", + " Inputs:\n", + " - **features** (Tensor) - The input features, whose shape must be :math:'(N, C, H, W)'.\n", + " - **rois** (Tensor) - The shape is :math:'(rois_n, 5)'. 
With data type of float16 or float32.\n", + "\n", + " Outputs:\n", + " Tensor, the shape is :math: '(rois_n, C, pooled_height, pooled_width)'.\n", + "\n", + " Support Platform:\n", + " ``Ascend`` ``CPU`` ``GPU``\n", + "\n", + " Examples:\n", + " >>> features = Tensor(np.array([[[[1., 2.], [3., 4.]]]]), mindspore.float32)\n", + " >>> rois = Tensor(np.array([[0, 0.2, 0.3, 0.2, 0.3]]), mindspore.float32)\n", + " >>> roi_align = ops.ROIAlign(2, 2, 0.5, 2)\n", + " >>> output = roi_align(features, rois)\n", + " >>> print(output)\n", + " [[[[1.775 2.025]\n", + " [2.275 2.525]]]]\n", + " \"\"\"\n", + " def __init__(self, out_size_h, out_size_w, spatial_scale, sample_num=0, roi_align_mode=1):\n", + " super(ROIAlign, self).__init__()\n", + "\n", + " self.out_size = (out_size_h, out_size_w)\n", + " self.spatial_scale = float(spatial_scale)\n", + " self.sample_num = int(sample_num)\n", + " self.align_op = P.ROIAlign(self.out_size[0], self.out_size[1],\n", + " self.spatial_scale, self.sample_num,\n", + " roi_align_mode)\n", + "\n", + " def construct(self, features, rois):\n", + " \"\"\"Construct ROI Align\"\"\"\n", + " return self.align_op(features, rois)\n", + "\n", + " def __repr__(self):\n", + " format_str = self.__class__.__name__\n", + " format_str += \\\n", + " '(out_size={}, spatial_scale={}, sample_num={}'.format(self.out_size, self.spatial_scale, self.sample_num)\n", + " return format_str\n", + "\n", + "\n", + "class SingleRoIExtractor(nn.Cell):\n", + " \"\"\"\n", + " Extract RoI features from a single level feature map.\n", + "\n", + " If there are multiple input feature levels, each RoI is mapped to a level according to its scale.\n", + "\n", + " Args:\n", + " config (dict): Config\n", + " out_channels (int): Output channels of RoI layers.\n", + " featmap_strides (int): Strides of input feature maps.\n", + " batch_size (int): Batchsize. Default: 1.\n", + " finest_scale (int): Scale threshold of mapping to level 0. 
Default: 56.\n", + " mask (bool): Specify ROIAlign for cls or mask branch. Default: False.\n", + "\n", + " Inputs:\n", + " - **rois** (Tensor) - The shape is :math:'(rois_n, 5)'. With data type of float16 or float32.\n", + " - **feat1** (Tensor) - The input features, whose shape must be :math:'(N, C, H, W)'.\n", + " - **feat2** (Tensor) - The input features, whose shape must be :math:'(N, C, H, W)'.\n", + " - **feat3** (Tensor) - The input features, whose shape must be :math:'(N, C, H, W)'.\n", + " - **feat4** (Tensor) - The input features, whose shape must be :math:'(N, C, H, W)'.\n", + "\n", + " Outputs:\n", + " Tensor, the shape is :math:'(rois_n, C, pooled_height, pooled_width)'.\n", + "\n", + " Support Platform:\n", + " ``Ascend`` ``CPU`` ``GPU``\n", + "\n", + " Examples:\n", + " >>> fea1 = Tensor(np.array([[[[1., 2.], [3., 4.]]]]), mindspore.float32)\n", + " >>> fea2 = Tensor(np.array([[[[1., 2.], [3., 4.]]]]), mindspore.float32)\n", + " >>> fea3 = Tensor(np.array([[[[1., 2.], [3., 4.]]]]), mindspore.float32)\n", + " >>> fea4 = Tensor(np.array([[[[1., 2.], [3., 4.]]]]), mindspore.float32)\n", + " >>> rois = Tensor(np.array([[0, 0.2, 0.3, 0.2, 0.3]]), mindspore.float32)\n", + " >>> single_roi = ops.SingleRoIExtractor(conifg, 2, 1, 2, 2, mask)\n", + " >>> output = single_roi(rois, fea1, fea2, fea3, fea4)\n", + " \"\"\"\n", + "\n", + " def __init__(self, config, roi_layer, out_channels, featmap_strides, batch_size=1, finest_scale=56, mask=False):\n", + " super(SingleRoIExtractor, self).__init__()\n", + " cfg = config\n", + " self.train_batch_size = batch_size\n", + " self.out_channels = out_channels\n", + " self.featmap_strides = featmap_strides\n", + " self.num_levels = len(self.featmap_strides)\n", + " self.out_size = roi_layer.mask_out_size if mask else roi_layer.out_size\n", + " self.mask = mask\n", + " self.sample_num = roi_layer.sample_num\n", + " self.roi_layers = self.build_roi_layers(self.featmap_strides)\n", + " self.roi_layers = 
L.CellList(self.roi_layers)\n", + "\n", + " self.sqrt = P.Sqrt()\n", + " self.log = P.Log()\n", + " self.finest_scale_ = finest_scale\n", + " self.clamp = C.clip_by_value\n", + "\n", + " self.cast = P.Cast()\n", + " self.equal = P.Equal()\n", + " self.select = P.Select()\n", + "\n", + " in_mode_16 = False\n", + " self.dtype = np.float16 if in_mode_16 else np.float32\n", + " self.ms_dtype = mstype.float16 if in_mode_16 else mstype.float32\n", + " self.set_train_local(cfg, training=True)\n", + "\n", + " def set_train_local(self, config, training=True):\n", + " \"\"\"Set training flag.\"\"\"\n", + " self.training_local = training\n", + "\n", + " cfg = config\n", + " # Init tensor\n", + " roi_sample_num = cfg.num_expected_pos_stage2 if self.mask else cfg.roi_sample_num\n", + " self.batch_size = roi_sample_num if self.training_local else cfg.rpn_max_num\n", + " self.batch_size = self.train_batch_size*self.batch_size \\\n", + " if self.training_local else cfg.test_batch_size*self.batch_size\n", + " self.ones = Tensor(np.array(np.ones((self.batch_size, 1)), dtype=self.dtype))\n", + " finest_scale = np.array(np.ones((self.batch_size, 1)), dtype=self.dtype) * self.finest_scale_\n", + " self.finest_scale = Tensor(finest_scale)\n", + " self.epslion = Tensor(np.array(np.ones((self.batch_size, 1)), dtype=self.dtype)*self.dtype(1e-6))\n", + " self.zeros = Tensor(np.array(np.zeros((self.batch_size, 1)), dtype=np.int32))\n", + " self.max_levels = Tensor(np.array(np.ones((self.batch_size, 1)), dtype=np.int32)*(self.num_levels-1))\n", + " self.twos = Tensor(np.array(np.ones((self.batch_size, 1)), dtype=self.dtype) * 2)\n", + " self.res_ = Tensor(np.array(np.zeros((self.batch_size, self.out_channels, self.out_size, self.out_size)),\n", + " dtype=self.dtype))\n", + "\n", + " def num_inputs(self):\n", + " \"\"\"input number.\"\"\"\n", + " return len(self.featmap_strides)\n", + "\n", + " def log2(self, value):\n", + " \"\"\"calculate log2.\"\"\"\n", + " return self.log(value) / 
self.log(self.twos)\n", + "\n", + " def build_roi_layers(self, featmap_strides):\n", + " \"\"\"build ROI layers.\"\"\"\n", + " roi_layers = []\n", + " for s in featmap_strides:\n", + " layer_cls = ROIAlign(self.out_size, self.out_size, spatial_scale=1 / s,\n", + " sample_num=self.sample_num, roi_align_mode=0)\n", + " roi_layers.append(layer_cls)\n", + " return roi_layers\n", + "\n", + " def _c_map_roi_levels(self, rois):\n", + " \"\"\"Map rois to corresponding feature levels by scales.\n", + "\n", + " - scale < finest_scale * 2: level 0\n", + " - finest_scale * 2 <= scale < finest_scale * 4: level 1\n", + " - finest_scale * 4 <= scale < finest_scale * 8: level 2\n", + " - scale >= finest_scale * 8: level 3\n", + "\n", + " Args:\n", + " rois (Tensor): Input RoIs, shape (k, 5).\n", + " num_levels (int): Total level number.\n", + "\n", + " Returns:\n", + " Tensor, Level index (0-based) of each RoI, shape (k, )\n", + " \"\"\"\n", + " scale = self.sqrt(rois[::, 3:4:1] - rois[::, 1:2:1] + self.ones) * \\\n", + " self.sqrt(rois[::, 4:5:1] - rois[::, 2:3:1] + self.ones)\n", + "\n", + " target_lvls = self.log2(scale / self.finest_scale + self.epslion)\n", + " target_lvls = P.Floor()(target_lvls)\n", + " target_lvls = self.cast(target_lvls, mstype.int32)\n", + " target_lvls = self.clamp(target_lvls, self.zeros, self.max_levels)\n", + "\n", + " return target_lvls\n", + "\n", + " def construct(self, rois, feat1, feat2, feat3, feat4):\n", + " \"\"\"Construct Single RoI Extractor\"\"\"\n", + " feats = (feat1, feat2, feat3, feat4)\n", + " res = self.res_\n", + " target_lvls = self._c_map_roi_levels(rois)\n", + " for i in range(self.num_levels):\n", + " mask = self.equal(target_lvls, P.ScalarToArray()(i))\n", + " mask = P.Reshape()(mask, (-1, 1, 1, 1))\n", + " roi_feats_t = self.roi_layers[i](feats[i], rois)\n", + " mask = \\\n", + " self.cast(P.Tile()(self.cast(mask, mstype.int32), (1, 256, self.out_size, self.out_size)), mstype.bool_)\n", + " res = self.select(mask, roi_feats_t, 
res)\n", + " return res\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Class/Bounding Box预测\n", + "\n", + "Class/bounding box预测时,RPN输出一系列ROI,RoIAlign将ROI逐个对应回Resnet输出的5个特征层中的一个。再对该特征做相应的裁剪,resize操作得到对应的特征。再对该特征做进一步卷积,全连接最终输出预测。" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "class DenseNoTranpose(nn.Cell):\n", + " \"\"\"\n", + " Dense method\n", + "\n", + " Args:\n", + " input_channels (int): Channel size of input feature maps.\n", + " output_channels (int): Channel size output.\n", + " weight_init (tuple): Initialized values of weights.\n", + "\n", + " Inputs:\n", + " - **x** (Tensor) - Input from the upper layer.\n", + "\n", + " Outputs:\n", + " Tensor, dense result.\n", + "\n", + " Support Platforms:\n", + " ``Ascend`` ``CPU`` ``GPU``\n", + "\n", + " Examples:\n", + " >>> out_channels = 128\n", + " >>> dense_notranspose = DenseNoTranpose(input_channels, output_channels, weights)\n", + " \"\"\"\n", + " def __init__(self, input_channels, output_channels, weight_init):\n", + " super(DenseNoTranpose, self).__init__()\n", + " self.weight = Parameter(initializer(weight_init, [input_channels, output_channels], mstype.float32))\n", + " self.bias = Parameter(initializer(\"zeros\", [output_channels], mstype.float32))\n", + " self.matmul = P.MatMul(transpose_b=False)\n", + " self.bias_add = P.BiasAdd()\n", + "\n", + " def construct(self, x):\n", + " \"\"\"Construct Dense No Transpose.\"\"\"\n", + " output = self.bias_add(self.matmul(x, self.weight), self.bias)\n", + " return output\n", + "\n", + "\n", + "class FpnCls(nn.Cell):\n", + " \"\"\"\n", + " Dense layer of classification and box head\n", + "\n", + " Args:\n", + " input_channels (int): Channel size of input feature maps.\n", + " output_channels (int): Channel size output\n", + " num_classes (int): Number of classes.\n", + " pool_size (int): Pooling size.\n", + "\n", + " Inputs:\n", + " - **x** (Tensor) - Input from
the upper layer.\n", + "\n", + " Outputs:\n", + " Tensor, dense result.\n", + "\n", + " Support Platforms:\n", + " ``Ascend`` ``CPU`` ``GPU``\n", + "\n", + " Examples:\n", + " >>> fpn_cls = FpnCls(256,128,81,2)\n", + " \"\"\"\n", + " def __init__(self, input_channels, output_channels, num_classes, pool_size):\n", + " super(FpnCls, self).__init__()\n", + "\n", + " self.cast_type = mstype.float32\n", + "\n", + " representation_size = input_channels * pool_size * pool_size\n", + " shape_0 = (output_channels, representation_size)\n", + " weights_0 = initializer(\"XavierUniform\", shape=shape_0[::-1], dtype=mstype.float32)\n", + " shape_1 = (output_channels, output_channels)\n", + " weights_1 = initializer(\"XavierUniform\", shape=shape_1[::-1], dtype=mstype.float32)\n", + " self.shared_fc_0 = DenseNoTranpose(representation_size, output_channels, weights_0).to_float(self.cast_type)\n", + " self.shared_fc_1 = DenseNoTranpose(output_channels, output_channels, weights_1).to_float(self.cast_type)\n", + "\n", + " cls_weight = initializer('Normal', shape=[num_classes, output_channels][::-1], dtype=mstype.float32)\n", + " reg_weight = initializer('Normal', shape=[num_classes * 4, output_channels][::-1], dtype=mstype.float32)\n", + " self.cls_scores = DenseNoTranpose(output_channels, num_classes, cls_weight).to_float(self.cast_type)\n", + " self.reg_scores = DenseNoTranpose(output_channels, num_classes * 4, reg_weight).to_float(self.cast_type)\n", + "\n", + " self.relu = P.ReLU()\n", + " self.flatten = P.Flatten()\n", + "\n", + " def construct(self, x):\n", + " \"\"\"Construct FPNCls\"\"\"\n", + " # two share fc layer\n", + " x = self.flatten(x)\n", + "\n", + " x = self.relu(self.shared_fc_0(x))\n", + " x = self.relu(self.shared_fc_1(x))\n", + "\n", + " # classifier head\n", + " cls_scores = self.cls_scores(x)\n", + " # bbox head\n", + " reg_scores = self.reg_scores(x)\n", + "\n", + " return cls_scores, reg_scores\n", + "\n", + "\n", + "class RcnnCls(nn.Cell):\n", + " 
\"\"\"\n", + " Rcnn for classification and box regression subnet.\n", + "\n", + " Args:\n", + " config (dict): Config.\n", + " batch_size (int): Batchsize.\n", + " num_classes (int): Class number.\n", + " target_means (list): Means for encode function. Default: (.0, .0, .0, .0]).\n", + " target_stds (list): Stds for encode function. Default: (0.1, 0.1, 0.2, 0.2).\n", + "\n", + " Inputs:\n", + " - **featuremap** (tuple) - Feature map.\n", + " - **bbox_targets** (tuple) - A set of bounding box targets.\n", + " - **labels** (tuple) - Ground truth labels.\n", + " - **mask** (tuple) - Mask array.\n", + "\n", + " Outputs:\n", + " Tuple, tuple of output tensor.\n", + "\n", + " Support Platforms:\n", + " ``Ascend`` ``CPU`` ``GPU``\n", + "\n", + " Examples:\n", + " >>> RcnnCls(config=config, representation_size = 1024,\n", + " ... batch_size=2, num_classes = 81,\n", + " ... target_means=(0., 0., 0., 0.),\n", + " ... target_stds=(0.1, 0.1, 0.2, 0.2))\n", + " \"\"\"\n", + "\n", + " def __init__(self, config, batch_size, num_classes, target_means=(0., 0., 0., 0.),\n", + " target_stds=(0.1, 0.1, 0.2, 0.2)):\n", + " super(RcnnCls, self).__init__()\n", + " cfg = config\n", + "\n", + " self.cast_type = mstype.float32\n", + " self.np_cast_type = np.float32\n", + "\n", + " self.rcnn_loss_cls_weight = Tensor(np.array(cfg.rcnn_loss_cls_weight).astype(self.np_cast_type))\n", + " self.rcnn_loss_reg_weight = Tensor(np.array(cfg.rcnn_loss_reg_weight).astype(self.np_cast_type))\n", + " self.rcnn_fc_out_channels = cfg.rcnn_fc_out_channels\n", + " self.target_means = target_means\n", + " self.target_stds = target_stds\n", + " self.num_classes = num_classes\n", + " self.in_channels = cfg.rcnn_in_channels\n", + " self.train_batch_size = batch_size\n", + " self.test_batch_size = cfg.test_batch_size\n", + "\n", + " self.fpn_cls = FpnCls(self.in_channels, self.rcnn_fc_out_channels, self.num_classes, cfg.roi_layer.out_size)\n", + " self.relu = P.ReLU()\n", + " self.logicaland = P.LogicalAnd()\n", 
+ " self.loss_cls = P.SoftmaxCrossEntropyWithLogits()\n", + " self.loss_bbox = P.SmoothL1Loss(beta=1.0)\n", + " self.loss_mask = P.SigmoidCrossEntropyWithLogits()\n", + " self.reshape = P.Reshape()\n", + " self.onehot = P.OneHot()\n", + " self.greater = P.Greater()\n", + " self.cast = P.Cast()\n", + " self.sum_loss = P.ReduceSum()\n", + " self.tile = P.Tile()\n", + " self.expandims = P.ExpandDims()\n", + "\n", + " self.gather = P.GatherNd()\n", + " self.argmax = P.ArgMaxWithValue(axis=1)\n", + "\n", + " self.on_value = Tensor(1.0, mstype.float32)\n", + " self.off_value = Tensor(0.0, mstype.float32)\n", + " self.value = Tensor(1.0, self.cast_type)\n", + "\n", + " self.num_bboxes = (cfg.num_expected_pos_stage2 + cfg.num_expected_neg_stage2) * batch_size\n", + "\n", + " rmv_first = np.ones((self.num_bboxes, self.num_classes))\n", + " rmv_first[:, 0] = np.zeros((self.num_bboxes,))\n", + " self.rmv_first_tensor = Tensor(rmv_first.astype(self.np_cast_type))\n", + "\n", + " self.num_bboxes_test = cfg.rpn_max_num * cfg.test_batch_size\n", + "\n", + " def construct(self, featuremap, bbox_targets, labels, mask):\n", + " \"\"\"Construct Rcnn for classification\"\"\"\n", + " x_cls, x_reg = self.fpn_cls(featuremap)\n", + "\n", + " if self.training:\n", + " bbox_weights = self.cast(self.logicaland(self.greater(labels, 0), mask), mstype.int32) * labels\n", + " labels = self.cast(self.onehot(labels, self.num_classes, self.on_value, self.off_value), self.cast_type)\n", + " bbox_targets = self.tile(self.expandims(bbox_targets, 1), (1, self.num_classes, 1))\n", + "\n", + " loss_cls, loss_reg = self.loss(x_cls, x_reg, bbox_targets, bbox_weights, labels, mask)\n", + " out = (loss_cls, loss_reg)\n", + " else:\n", + " out = (x_cls, x_reg)\n", + "\n", + " return out\n", + "\n", + " def loss(self, cls_score, bbox_pred, bbox_targets, bbox_weights, labels, weights):\n", + " \"\"\"\n", + " Loss method.\n", + " Args:\n", + " cls_score(Array): Classificaiton scores.\n", + " bbox_pred(Array): 
Bounding box prediction.\n", + " bbox_targets(Array): Bounding box GT target.\n", + " bbox_weights(Array): Bounding box weights.\n", + " labels(Array): GT labels.\n", + " weights(Array): GT weights.\n", + "\n", + " Returns:\n", + " loss_cls, float, classification loss.\n", + " loss_reg, float, regression loss.\n", + " \"\"\"\n", + " # loss_cls\n", + " loss_cls, _ = self.loss_cls(cls_score, labels)\n", + " weights = self.cast(weights, self.cast_type)\n", + " loss_cls = loss_cls * weights\n", + " loss_cls = self.sum_loss(loss_cls, (0,)) / (self.sum_loss(weights, (0,)) + 1e-5)\n", + "\n", + " # loss_reg\n", + " bbox_weights = self.cast(self.onehot(bbox_weights, self.num_classes, self.on_value, self.off_value),\n", + " self.cast_type)\n", + " bbox_weights = bbox_weights * self.rmv_first_tensor\n", + " pos_bbox_pred = self.reshape(bbox_pred, (self.num_bboxes, -1, 4))\n", + " loss_reg = self.loss_bbox(pos_bbox_pred, bbox_targets)\n", + " loss_reg = self.sum_loss(loss_reg, (2,))\n", + " loss_reg = loss_reg * bbox_weights\n", + " loss_reg = loss_reg / (self.sum_loss(weights, (0,)) + 1e-5)\n", + " loss_reg = self.sum_loss(loss_reg, (0, 1))\n", + "\n", + " return loss_cls, loss_reg\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Mask预测\n", + "\n", + "对RoIAlign输出的特征进行一系列的卷积,转置卷积,最后输出mask的预测结果。" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [], + "source": [ + "def _conv(in_channels, out_channels, kernel_size=1, stride=1, padding=0, pad_mode='pad', gain=1):\n", + " \"\"\"\n", + " Conv2D wrapper.\n", + "\n", + " Args:\n", + " in_channels(int): Input channel num.\n", + " out_channels(int): Output channel num.\n", + " kernel_size(int): Kernel size. Default: 1\n", + " stride(int): Stride. Default: 1\n", + " padding(int): Padding range. Default: 0\n", + " pad_mode(bool): Padding model. Default: 'pad'\n", + " gain(int): Gain.
Default: 1\n", + "\n", + " Returns:\n", + " Tensor, Convoluted result.\n", + " \"\"\"\n", + " shape = (out_channels, in_channels, kernel_size, kernel_size)\n", + " # xavier_normal\n", + " fan_in = in_channels * kernel_size * kernel_size\n", + " fan_out = out_channels * kernel_size * kernel_size\n", + " std = gain * (2 / (fan_in + fan_out)) ** 0.5\n", + " weights = Tensor(np.random.normal(loc=0.0, scale=std, size=shape).astype(np.float32))\n", + " shape_bias = (out_channels,)\n", + " bias = Tensor(np.array(np.zeros(shape_bias)).astype(np.float32))\n", + " return nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding,\n", + " pad_mode=pad_mode, weight_init=weights, has_bias=True, bias_init=bias)\n", + "\n", + "\n", + "def _conv_transpose(in_channels, out_channels, kernel_size=1, stride=1, padding=0, pad_mode='pad', gain=1):\n", + " \"\"\"\n", + " ConvTranspose wrapper.\n", + "\n", + " Args:\n", + " in_channels(int): Input channel num.\n", + " out_channels(int): Output channel num.\n", + " kernel_size(int): Kernel size. Default: 1\n", + " stride(int): Stride. Default: 1\n", + " padding(int): Padding range. Default: 0\n", + " pad_mode(bool): Padding model. Default: 'pad'\n", + " gain(int): Gain. 
Default: 1\n", + "\n", + " Returns:\n", + " Tensor, Convoluted Transposed result.\n", + " \"\"\"\n", + " shape = (out_channels, in_channels, kernel_size, kernel_size)\n", + " # xavier_normal\n", + " fan_in = in_channels * kernel_size * kernel_size\n", + " fan_out = out_channels * kernel_size * kernel_size\n", + " std = gain * (2 / (fan_in + fan_out)) ** 0.5\n", + " weights = Tensor(np.random.normal(loc=0.0, scale=std, size=shape).astype(np.float32))\n", + " shape_bias = (out_channels,)\n", + " bias = Tensor(np.array(np.zeros(shape_bias)).astype(np.float32))\n", + " return nn.Conv2dTranspose(in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding,\n", + " pad_mode=pad_mode, weight_init=weights, has_bias=True, bias_init=bias)\n", + "\n", + "\n", + "class FpnMask(nn.Cell):\n", + " \"\"\"\n", + " Conv layers of mask head\n", + "\n", + " Args:\n", + " input_channels (int): Channel size of input feature maps.\n", + " output_channels (int): Channel size output\n", + " num_classes (int): Number of classes.\n", + "\n", + " Inputs:\n", + " - **x** (Tensor) - Input from the upper layer.\n", + "\n", + " Outputs:\n", + " Tuple, tuple of output tensor.\n", + "\n", + " Support Platforms:\n", + " ``Ascend`` ``CPU`` ``GPU``\n", + "\n", + " Examples:\n", + " >>> FpnMask(input_channels=256, output_channels=256, num_classes=81)\n", + " \"\"\"\n", + " def __init__(self, input_channels, output_channels, num_classes):\n", + " super(FpnMask, self).__init__()\n", + "\n", + " self.cast_type = mstype.float32\n", + "\n", + " self.mask_conv1 = _conv(input_channels, output_channels,\n", + " kernel_size=3, gain=2 ** 0.5,\n", + " pad_mode=\"same\").to_float(self.cast_type)\n", + " self.mask_relu1 = P.ReLU()\n", + "\n", + " self.mask_conv2 = _conv(output_channels, output_channels,\n", + " kernel_size=3, gain=2 ** 0.5,\n", + " pad_mode=\"same\").to_float(self.cast_type)\n", + " self.mask_relu2 = P.ReLU()\n", + "\n", + " self.mask_conv3 = _conv(output_channels, 
output_channels,\n", + " kernel_size=3, gain=2 ** 0.5,\n", + " pad_mode=\"same\").to_float(self.cast_type)\n", + " self.mask_relu3 = P.ReLU()\n", + "\n", + " self.mask_conv4 = _conv(output_channels, output_channels,\n", + " kernel_size=3, gain=2 ** 0.5,\n", + " pad_mode=\"same\").to_float(self.cast_type)\n", + " self.mask_relu4 = P.ReLU()\n", + "\n", + " self.mask_deconv5 = _conv_transpose(output_channels, output_channels, kernel_size=2, gain=2 ** 0.5,\n", + " stride=2, pad_mode=\"valid\").to_float(self.cast_type)\n", + " self.mask_relu5 = P.ReLU()\n", + " self.mask_conv6 = _conv(output_channels, num_classes, kernel_size=1, stride=1, gain=2,\n", + " pad_mode=\"valid\").to_float(self.cast_type)\n", + "\n", + " def construct(self, x):\n", + " \"\"\"Construct convolutional layers of mask heads. \"\"\"\n", + " x = self.mask_conv1(x)\n", + " x = self.mask_relu1(x)\n", + "\n", + " x = self.mask_conv2(x)\n", + " x = self.mask_relu2(x)\n", + "\n", + " x = self.mask_conv3(x)\n", + " x = self.mask_relu3(x)\n", + "\n", + " x = self.mask_conv4(x)\n", + " x = self.mask_relu4(x)\n", + "\n", + " x = self.mask_deconv5(x)\n", + " x = self.mask_relu5(x)\n", + "\n", + " x = self.mask_conv6(x)\n", + "\n", + " return x\n", + "\n", + "\n", + "class RcnnMask(nn.Cell):\n", + " \"\"\"\n", + " Rcnn for mask subnet.\n", + "\n", + " Args:\n", + " config (dict): Config.\n", + " batch_size (int): Batchsize.\n", + " num_classes (int): Class number.\n", + " target_means (list): Means for encode function. Default: (.0, .0, .0, .0]).\n", + " target_stds (list): Stds for encode function. Default: (0.1, 0.1, 0.2, 0.2).\n", + "\n", + " Inputs:\n", + " - **mask_featuremap** (tuple) - Masked feature map\n", + " - **labels** (tuple) - Ground truth labels. Default: None\n", + " - **mask** (tuple) - Mask map. Default: None\n", + " - **mask_fb_targets** (tuple) - Masked targets. 
Default: None\n", + "\n", + " Outputs:\n", + " Tuple, tuple of output tensor.\n", + "\n", + " Examples:\n", + " >>> RcnnMask(config=config, representation_size = 1024,\n", + " ... batch_size=2, num_classes = 81,\n", + " ... target_means=(0., 0., 0., 0.),\n", + " ... target_stds=(0.1, 0.1, 0.2, 0.2))\n", + " \"\"\"\n", + "\n", + " def __init__(self, config, batch_size, num_classes, target_means=(0., 0., 0., 0.),\n", + " target_stds=(0.1, 0.1, 0.2, 0.2)):\n", + " super(RcnnMask, self).__init__()\n", + " cfg = config\n", + "\n", + " self.cast_type = mstype.float32\n", + " self.np_cast_type = np.float32\n", + "\n", + " self.rcnn_loss_mask_fb_weight = Tensor(np.array(cfg.rcnn_loss_mask_fb_weight).astype(self.np_cast_type))\n", + " self.rcnn_mask_out_channels = cfg.rcnn_mask_out_channels\n", + " self.target_means = target_means\n", + " self.target_stds = target_stds\n", + " self.num_classes = num_classes\n", + " self.in_channels = cfg.rcnn_in_channels\n", + "\n", + " self.fpn_mask = FpnMask(self.in_channels, self.rcnn_mask_out_channels, self.num_classes)\n", + "\n", + " self.logicaland = P.LogicalAnd()\n", + " self.loss_mask = P.SigmoidCrossEntropyWithLogits()\n", + " self.onehot = P.OneHot()\n", + " self.greater = P.Greater()\n", + " self.cast = P.Cast()\n", + " self.sum_loss = P.ReduceSum()\n", + " self.tile = P.Tile()\n", + " self.expandims = P.ExpandDims()\n", + "\n", + " self.on_value = Tensor(1.0, mstype.float32)\n", + " self.off_value = Tensor(0.0, mstype.float32)\n", + "\n", + " self.num_bboxes = cfg.num_expected_pos_stage2 * batch_size\n", + " rmv_first = np.ones((self.num_bboxes, self.num_classes))\n", + " rmv_first[:, 0] = np.zeros((self.num_bboxes,))\n", + " self.rmv_first_tensor = Tensor(rmv_first.astype(self.np_cast_type))\n", + " self.mean_loss = P.ReduceMean()\n", + "\n", + " def construct(self, mask_featuremap, labels=None, mask=None, mask_fb_targets=None):\n", + " \"\"\"Construct Rcnn Mask.\"\"\"\n", + " x_mask_fb = self.fpn_mask(mask_featuremap)\n", + 
"\n", + " if self.training:\n", + " bbox_weights = self.cast(self.logicaland(self.greater(labels, 0), mask), mstype.int32) * labels\n", + " mask_fb_targets = self.tile(self.expandims(mask_fb_targets, 1), (1, self.num_classes, 1, 1))\n", + "\n", + " loss_mask_fb = self.loss(x_mask_fb, bbox_weights, mask, mask_fb_targets)\n", + " out = loss_mask_fb\n", + " else:\n", + " out = x_mask_fb\n", + "\n", + " return out\n", + "\n", + " def loss(self, masks_fb_pred, bbox_weights, weights, masks_fb_targets):\n", + " \"\"\"\n", + " Loss method.\n", + "\n", + " Args:\n", + " mask_fb_pred (Tensor): Mask feedback prediction.\n", + " bbox_weights (Tensor): Bounding box weights.\n", + " weights (Tensor): GT weights.\n", + " masks_fb_targets (Tensor): Mask feedback targets.\n", + "\n", + " Returns:\n", + " Tensor, loss mask feedback result.\n", + " \"\"\"\n", + " weights = self.cast(weights, self.cast_type)\n", + " bbox_weights = \\\n", + " self.cast(self.onehot(bbox_weights, self.num_classes, self.on_value, self.off_value), self.cast_type)\n", + " bbox_weights = bbox_weights * self.rmv_first_tensor\n", + "\n", + " # loss_mask_fb\n", + " masks_fb_targets = self.cast(masks_fb_targets, self.cast_type)\n", + " loss_mask_fb = self.loss_mask(masks_fb_pred, masks_fb_targets)\n", + " loss_mask_fb = self.mean_loss(loss_mask_fb, (2, 3))\n", + " loss_mask_fb = loss_mask_fb * bbox_weights\n", + " loss_mask_fb = loss_mask_fb / (self.sum_loss(weights, (0,)) + 1e-5)\n", + " loss_mask_fb = self.sum_loss(loss_mask_fb, (0, 1))\n", + "\n", + " return loss_mask_fb\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Mask RCNN模型\n", + "\n", + "我们将卷积层,RPN层,RoIAlign层,Bbox预测层和Mask预测层连接起来,构建Mask RCNN网络。" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "from model.bbox_assign_sample_stage2 import BboxAssignSampleForRcnn\n", + "from model.proposal_generator import Proposal\n", + "from model.anchor_generator import 
AnchorGenerator\n", + "\n", + "\n", + "class MaskRcnnResnet50(nn.Cell):\n", + " \"\"\"\n", + " MaskRcnn Network.\n", + "\n", + " Note:\n", + " backbone = resnet50\n", + "\n", + " Args:\n", + " config (dict): Config.\n", + "\n", + " Inputs:\n", + " - **img_data** (Tensor) - Image data.\n", + " - **img_metas** (List) - Image shapes.\n", + " - **gt_bboxes** (List) - GT bounding boxes.\n", + " - **gt_labels** (List) - GT labels.\n", + " - **gt_valids** (List) - GT validations.\n", + " - **gt_masks** (List) - GT masks.\n", + "\n", + " Outputs:\n", + " Function, return a tuple of output tensor.\n", + "\n", + " Support Platforms:\n", + " ``Ascend`` ``CPU`` ``GPU``\n", + "\n", + " Examples:\n", + " >>> net = MaskRcnnResnet50(config)\n", + " \"\"\"\n", + " def __init__(self, config):\n", + " super(MaskRcnnResnet50, self).__init__()\n", + "\n", + " self.cast_type = mstype.float32\n", + " self.np_cast_type = np.float32\n", + "\n", + " self.train_batch_size = config.batch_size\n", + " self.num_classes = config.num_classes\n", + " self.anchor_scales = config.anchor_scales\n", + " self.anchor_ratios = config.anchor_ratios\n", + " self.anchor_strides = config.anchor_strides\n", + " self.target_means = tuple(config.rcnn_target_means)\n", + " self.target_stds = tuple(config.rcnn_target_stds)\n", + "\n", + " # Anchor generator\n", + " anchor_base_sizes = None\n", + " self.anchor_base_sizes = list(\n", + " self.anchor_strides) if anchor_base_sizes is None else anchor_base_sizes\n", + "\n", + " self.anchor_generators = []\n", + " for anchor_base in self.anchor_base_sizes:\n", + " self.anchor_generators.append(AnchorGenerator(anchor_base, self.anchor_scales, self.anchor_ratios))\n", + "\n", + " self.num_anchors = len(self.anchor_ratios) * len(self.anchor_scales)\n", + "\n", + " featmap_sizes = config.feature_shapes\n", + " assert len(featmap_sizes) == len(self.anchor_generators)\n", + "\n", + " self.anchor_list = self.get_anchors(featmap_sizes)\n", + "\n", + " # Backbone resnet50\n", +
" self.backbone = ResNetFea(ResidualBlockUsing, config.resnet_block, config.resnet_in_channels,\n", + " config.resnet_out_channels, False)\n", + " # Fpn\n", + " self.fpn_ncek = FeatPyramidNeck(config.fpn_in_channels, config.fpn_out_channels, config.fpn_num_outs)\n", + "\n", + " # Rpn and rpn loss\n", + " self.gt_labels_stage1 = Tensor(np.ones((self.train_batch_size, config.num_gts)).astype(np.int32))\n", + " self.rpn_with_loss = RPN(config, self.train_batch_size, config.rpn_in_channels,\n", + " config.rpn_feat_channels, config.num_anchors, config.rpn_cls_out_channels)\n", + "\n", + " # Proposal\n", + " self.proposal_generator = Proposal(config, self.train_batch_size,\n", + " config.activate_num_classes,\n", + " config.use_sigmoid_cls)\n", + " self.proposal_generator.set_train_local(config, True)\n", + " self.proposal_generator_test = Proposal(config, config.test_batch_size,\n", + " config.activate_num_classes,\n", + " config.use_sigmoid_cls)\n", + " self.proposal_generator_test.set_train_local(config, False)\n", + "\n", + " # Assign and sampler stage two\n", + " self.bbox_assigner_sampler_for_rcnn = \\\n", + " BboxAssignSampleForRcnn(config, self.train_batch_size, config.num_bboxes_stage2, True)\n", + " self.decode = P.BoundingBoxDecode(max_shape=(768, 1280), means=self.target_means, stds=self.target_stds)\n", + "\n", + " # Roi\n", + " self.init_roi(config)\n", + "\n", + " # Rcnn\n", + " self.rcnn_cls = RcnnCls(config, self.train_batch_size, self.num_classes)\n", + " self.rcnn_mask = RcnnMask(config, self.train_batch_size, self.num_classes)\n", + "\n", + " # Op declare\n", + " self.squeeze = P.Squeeze()\n", + " self.cast = P.Cast()\n", + "\n", + " self.concat = P.Concat(axis=0)\n", + " self.concat_1 = P.Concat(axis=1)\n", + " self.concat_2 = P.Concat(axis=2)\n", + " self.reshape = P.Reshape()\n", + " self.select = P.Select()\n", + " self.greater = P.Greater()\n", + " self.transpose = P.Transpose()\n", + "\n", + " # Test mode\n", + " self.init_test_mode(config)\n", 
+ "\n", + " # Improve speed\n", + " self.concat_start = min(self.num_classes - 2, 55)\n", + " self.concat_end = (self.num_classes - 1)\n", + "\n", + " # Init tensor\n", + " self.init_tensor(config)\n", + "\n", + " def init_roi(self, config):\n", + " \"\"\"initialize roi aligners.\"\"\"\n", + " self.roi_align = SingleRoIExtractor(config, config.roi_layer, config.roi_align_out_channels,\n", + " config.roi_align_featmap_strides, self.train_batch_size,\n", + " config.roi_align_finest_scale, mask=False)\n", + " self.roi_align.set_train_local(config, True)\n", + "\n", + " self.roi_align_mask = SingleRoIExtractor(config, config.roi_layer, config.roi_align_out_channels,\n", + " config.roi_align_featmap_strides, self.train_batch_size,\n", + " config.roi_align_finest_scale, mask=True)\n", + " self.roi_align_mask.set_train_local(config, True)\n", + "\n", + " self.roi_align_test = SingleRoIExtractor(config, config.roi_layer, config.roi_align_out_channels,\n", + " config.roi_align_featmap_strides, 1,\n", + " config.roi_align_finest_scale, mask=False)\n", + " self.roi_align_test.set_train_local(config, False)\n", + "\n", + " self.roi_align_mask_test = SingleRoIExtractor(config, config.roi_layer, config.roi_align_out_channels,\n", + " config.roi_align_featmap_strides, 1,\n", + " config.roi_align_finest_scale, mask=True)\n", + " self.roi_align_mask_test.set_train_local(config, False)\n", + "\n", + " def init_test_mode(self, config):\n", + " \"\"\"\"initialize the test mode.\"\"\"\n", + " self.test_batch_size = config.test_batch_size\n", + " self.split = P.Split(axis=0, output_num=self.test_batch_size)\n", + " self.split_shape = P.Split(axis=0, output_num=4)\n", + " self.split_scores = P.Split(axis=1, output_num=self.num_classes)\n", + " self.split_fb_mask = P.Split(axis=1, output_num=self.num_classes)\n", + " self.split_cls = P.Split(axis=0, output_num=self.num_classes-1)\n", + " self.tile = P.Tile()\n", + " self.gather = P.GatherNd()\n", + "\n", + " self.rpn_max_num = 
config.rpn_max_num\n", + "\n", + " self.zeros_for_nms = Tensor(np.zeros((self.rpn_max_num, 3)).astype(self.np_cast_type))\n", + " self.ones_mask = np.ones((self.rpn_max_num, 1)).astype(np.bool)\n", + " self.zeros_mask = np.zeros((self.rpn_max_num, 1)).astype(np.bool)\n", + " self.bbox_mask = Tensor(np.concatenate((self.ones_mask, self.zeros_mask,\n", + " self.ones_mask, self.zeros_mask), axis=1))\n", + " self.nms_pad_mask = Tensor(np.concatenate((self.ones_mask, self.ones_mask,\n", + " self.ones_mask, self.ones_mask,\n", + " self.zeros_mask), axis=1))\n", + "\n", + " self.test_score_thresh = Tensor(np.ones((self.rpn_max_num, 1)).astype(self.np_cast_type) * \\\n", + " config.test_score_thr)\n", + " self.test_score_zeros = Tensor(np.ones((self.rpn_max_num, 1)).astype(self.np_cast_type) * 0)\n", + " self.test_box_zeros = Tensor(np.ones((self.rpn_max_num, 4)).astype(self.np_cast_type) * -1)\n", + " self.test_iou_thr = Tensor(np.ones((self.rpn_max_num, 1)).astype(self.np_cast_type) * config.test_iou_thr)\n", + " self.test_max_per_img = config.test_max_per_img\n", + " self.nms_test = P.NMSWithMask(config.test_iou_thr)\n", + " self.softmax = P.Softmax(axis=1)\n", + " self.logicand = P.LogicalAnd()\n", + " self.oneslike = P.OnesLike()\n", + " self.test_topk = P.TopK(sorted=True)\n", + " self.test_num_proposal = self.test_batch_size * self.rpn_max_num\n", + "\n", + " def init_tensor(self, config):\n", + " \"\"\"initialize the tensors.\"\"\"\n", + " roi_align_index = [np.array(np.ones((config.num_expected_pos_stage2 + \\\n", + " config.num_expected_neg_stage2, 1)) * i,\n", + " dtype=self.np_cast_type) for i in range(self.train_batch_size)]\n", + "\n", + " roi_align_index_test = [np.array(np.ones((config.rpn_max_num, 1)) * i,\n", + " dtype=self.np_cast_type) for i in range(self.test_batch_size)]\n", + "\n", + " self.roi_align_index_tensor = Tensor(np.concatenate(roi_align_index))\n", + " self.roi_align_index_test_tensor = Tensor(np.concatenate(roi_align_index_test))\n", + 
"\n", + " roi_align_index_pos = [np.array(np.ones((config.num_expected_pos_stage2, 1)) * i,\n", + " dtype=self.np_cast_type) for i in range(self.train_batch_size)]\n", + " self.roi_align_index_tensor_pos = Tensor(np.concatenate(roi_align_index_pos))\n", + "\n", + " self.rcnn_loss_cls_weight = Tensor(np.array(config.rcnn_loss_cls_weight).astype(self.np_cast_type))\n", + " self.rcnn_loss_reg_weight = Tensor(np.array(config.rcnn_loss_reg_weight).astype(self.np_cast_type))\n", + " self.rcnn_loss_mask_fb_weight = Tensor(np.array(config.rcnn_loss_mask_fb_weight).astype(self.np_cast_type))\n", + "\n", + " self.argmax_with_value = P.ArgMaxWithValue(axis=1)\n", + " self.on_value = Tensor(1.0, mstype.float32)\n", + " self.off_value = Tensor(0.0, mstype.float32)\n", + " self.onehot = P.OneHot()\n", + " self.reducesum = P.ReduceSum()\n", + " self.sigmoid = P.Sigmoid()\n", + " self.expand_dims = P.ExpandDims()\n", + " self.test_mask_fb_zeros = Tensor(np.zeros((self.rpn_max_num, 28, 28)).astype(self.np_cast_type))\n", + " self.value = Tensor(1.0, self.cast_type)\n", + "\n", + " def construct(self, img_data, img_metas, gt_bboxes, gt_labels, gt_valids, gt_masks):\n", + " \"\"\"Construct for Mask R-CNN net.\"\"\"\n", + " x = self.backbone(img_data)\n", + " x = self.fpn_ncek(x)\n", + "\n", + " rpn_loss, cls_score, bbox_pred, rpn_cls_loss, rpn_reg_loss, _ = self.rpn_with_loss(x, img_metas,\n", + " self.anchor_list,\n", + " gt_bboxes,\n", + " self.gt_labels_stage1,\n", + " gt_valids)\n", + "\n", + " if self.training:\n", + " proposal, proposal_mask = self.proposal_generator(cls_score, bbox_pred, self.anchor_list)\n", + " else:\n", + " proposal, proposal_mask = self.proposal_generator_test(cls_score, bbox_pred, self.anchor_list)\n", + "\n", + " gt_labels = self.cast(gt_labels, mstype.int32)\n", + " gt_valids = self.cast(gt_valids, mstype.int32)\n", + " bboxes_tuple = ()\n", + " deltas_tuple = ()\n", + " labels_tuple = ()\n", + " mask_tuple = ()\n", + "\n", + " pos_bboxes_tuple = ()\n", 
+ " pos_mask_fb_tuple = ()\n", + " pos_labels_tuple = ()\n", + " pos_mask_tuple = ()\n", + "\n", + " if self.training:\n", + " for i in range(self.train_batch_size):\n", + " gt_bboxes_i = self.squeeze(gt_bboxes[i:i + 1:1, ::])\n", + "\n", + " gt_labels_i = self.squeeze(gt_labels[i:i + 1:1, ::])\n", + " gt_labels_i = self.cast(gt_labels_i, mstype.int32)\n", + "\n", + " gt_valids_i = self.squeeze(gt_valids[i:i + 1:1, ::])\n", + " gt_valids_i = self.cast(gt_valids_i, mstype.bool_)\n", + "\n", + " gt_masks_i = self.squeeze(gt_masks[i:i + 1:1, ::])\n", + " gt_masks_i = self.cast(gt_masks_i, mstype.bool_)\n", + "\n", + " bboxes, deltas, labels, mask, pos_bboxes, pos_mask_fb, pos_labels, pos_mask = \\\n", + " self.bbox_assigner_sampler_for_rcnn(gt_bboxes_i, gt_labels_i, proposal_mask[i],\n", + " proposal[i][::, 0:4:1], gt_valids_i, gt_masks_i)\n", + " bboxes_tuple += (bboxes,)\n", + " deltas_tuple += (deltas,)\n", + " labels_tuple += (labels,)\n", + " mask_tuple += (mask,)\n", + "\n", + " pos_bboxes_tuple += (pos_bboxes,)\n", + " pos_mask_fb_tuple += (pos_mask_fb,)\n", + " pos_labels_tuple += (pos_labels,)\n", + " pos_mask_tuple += (pos_mask,)\n", + "\n", + " bbox_targets = self.concat(deltas_tuple)\n", + " rcnn_labels = self.concat(labels_tuple)\n", + " bbox_targets = F.stop_gradient(bbox_targets)\n", + " rcnn_labels = F.stop_gradient(rcnn_labels)\n", + " rcnn_labels = self.cast(rcnn_labels, mstype.int32)\n", + "\n", + " rcnn_pos_masks_fb = self.concat(pos_mask_fb_tuple)\n", + " rcnn_pos_masks_fb = F.stop_gradient(rcnn_pos_masks_fb)\n", + " rcnn_pos_labels = self.concat(pos_labels_tuple)\n", + " rcnn_pos_labels = F.stop_gradient(rcnn_pos_labels)\n", + " rcnn_pos_labels = self.cast(rcnn_pos_labels, mstype.int32)\n", + " else:\n", + " mask_tuple += proposal_mask\n", + " bbox_targets = proposal_mask\n", + " rcnn_labels = proposal_mask\n", + "\n", + " rcnn_pos_masks_fb = proposal_mask\n", + " rcnn_pos_labels = proposal_mask\n", + " for p_i in proposal:\n", + " bboxes_tuple 
+= (p_i[::, 0:4:1],)\n", + "\n", + " bboxes_all, rois, pos_rois = self.rois(bboxes_tuple, pos_bboxes_tuple)\n", + "\n", + " if self.training:\n", + " roi_feats = self.roi_align(rois,\n", + " self.cast(x[0], mstype.float32),\n", + " self.cast(x[1], mstype.float32),\n", + " self.cast(x[2], mstype.float32),\n", + " self.cast(x[3], mstype.float32))\n", + " else:\n", + " roi_feats = self.roi_align_test(rois,\n", + " self.cast(x[0], mstype.float32),\n", + " self.cast(x[1], mstype.float32),\n", + " self.cast(x[2], mstype.float32),\n", + " self.cast(x[3], mstype.float32))\n", + "\n", + "\n", + " roi_feats = self.cast(roi_feats, self.cast_type)\n", + " rcnn_masks = self.concat(mask_tuple)\n", + " rcnn_masks = F.stop_gradient(rcnn_masks)\n", + " rcnn_mask_squeeze = self.squeeze(self.cast(rcnn_masks, mstype.bool_))\n", + "\n", + " rcnn_pos_masks = self.concat(pos_mask_tuple)\n", + " rcnn_pos_masks = F.stop_gradient(rcnn_pos_masks)\n", + " rcnn_pos_mask_squeeze = self.squeeze(self.cast(rcnn_pos_masks, mstype.bool_))\n", + "\n", + " rcnn_cls_loss, rcnn_reg_loss = self.rcnn_cls(roi_feats, bbox_targets, rcnn_labels, rcnn_mask_squeeze)\n", + "\n", + " if self.training:\n", + " return self.get_output_train(pos_rois, x, rcnn_pos_labels, rcnn_pos_mask_squeeze, rcnn_pos_masks_fb,\n", + " rpn_loss, rpn_cls_loss, rpn_reg_loss, rcnn_cls_loss, rcnn_reg_loss)\n", + "\n", + " return self.get_output_eval(x, bboxes_all, rcnn_cls_loss, rcnn_reg_loss, rcnn_masks, img_metas)\n", + "\n", + " def rois(self, bboxes_tuple, pos_bboxes_tuple):\n", + " \"\"\"\"initialize the rois.\"\"\"\n", + " pos_rois = None\n", + " if self.training:\n", + " if self.train_batch_size > 1:\n", + " bboxes_all = self.concat(bboxes_tuple)\n", + " pos_bboxes_all = self.concat(pos_bboxes_tuple)\n", + " else:\n", + " bboxes_all = bboxes_tuple[0]\n", + " pos_bboxes_all = pos_bboxes_tuple[0]\n", + " rois = self.concat_1((self.roi_align_index_tensor, bboxes_all))\n", + " pos_rois = 
self.concat_1((self.roi_align_index_tensor_pos, pos_bboxes_all))\n", + " pos_rois = self.cast(pos_rois, mstype.float32)\n", + " pos_rois = F.stop_gradient(pos_rois)\n", + " else:\n", + " if self.test_batch_size > 1:\n", + " bboxes_all = self.concat(bboxes_tuple)\n", + " else:\n", + " bboxes_all = bboxes_tuple[0]\n", + " rois = self.concat_1((self.roi_align_index_test_tensor, bboxes_all))\n", + "\n", + " rois = self.cast(rois, mstype.float32)\n", + " rois = F.stop_gradient(rois)\n", + "\n", + " return bboxes_all, rois, pos_rois\n", + "\n", + " def get_output_train(self, pos_rois, x, rcnn_pos_labels, rcnn_pos_mask_squeeze, rcnn_pos_masks_fb,\n", + " rpn_loss, rpn_cls_loss, rpn_reg_loss, rcnn_cls_loss, rcnn_reg_loss):\n", + " \"\"\"get the training outputs.\"\"\"\n", + " output = ()\n", + " roi_feats_mask = self.roi_align_mask(pos_rois,\n", + " self.cast(x[0], mstype.float32),\n", + " self.cast(x[1], mstype.float32),\n", + " self.cast(x[2], mstype.float32),\n", + " self.cast(x[3], mstype.float32))\n", + " roi_feats_mask = self.cast(roi_feats_mask, self.cast_type)\n", + " rcnn_mask_fb_loss = self.rcnn_mask(roi_feats_mask, rcnn_pos_labels, rcnn_pos_mask_squeeze, rcnn_pos_masks_fb)\n", + "\n", + " rcnn_loss = self.rcnn_loss_cls_weight * rcnn_cls_loss + self.rcnn_loss_reg_weight * rcnn_reg_loss + \\\n", + " self.rcnn_loss_mask_fb_weight * rcnn_mask_fb_loss\n", + " output += (rpn_loss, rcnn_loss, rpn_cls_loss, rpn_reg_loss, rcnn_cls_loss, rcnn_reg_loss, rcnn_mask_fb_loss)\n", + " return output\n", + "\n", + " def get_output_eval(self, x, bboxes_all, rcnn_cls_loss, rcnn_reg_loss, rcnn_masks, img_metas):\n", + " \"\"\"get the evaluation results.\"\"\"\n", + " mask_fb_pred_all = self.rcnn_mask_test(x, bboxes_all, rcnn_cls_loss, rcnn_reg_loss)\n", + " output = self.get_det_bboxes(rcnn_cls_loss, rcnn_reg_loss, rcnn_masks, bboxes_all, img_metas, mask_fb_pred_all)\n", + " return output\n", + "\n", + " def get_det_bboxes(self, cls_logits, reg_logits, mask_logits, rois, img_metas, 
mask_fb_pred_all):\n", + " \"\"\"Get the actual detection box.\"\"\"\n", + " scores = self.softmax(cls_logits / self.value)\n", + " mask_fb_logits = self.sigmoid(mask_fb_pred_all)\n", + "\n", + " boxes_all = ()\n", + " for i in range(self.num_classes):\n", + " k = i * 4\n", + " reg_logits_i = self.squeeze(reg_logits[::, k:k+4:1])\n", + " out_boxes_i = self.decode(rois, reg_logits_i)\n", + " boxes_all += (out_boxes_i,)\n", + "\n", + " img_metas_all = self.split(img_metas)\n", + " scores_all = self.split(scores)\n", + " mask_all = self.split(self.cast(mask_logits, mstype.int32))\n", + " mask_fb_all = self.split(mask_fb_logits)\n", + "\n", + " boxes_all_with_batchsize = ()\n", + " for i in range(self.test_batch_size):\n", + " scale = self.split_shape(self.squeeze(img_metas_all[i]))\n", + " scale_h = scale[2]\n", + " scale_w = scale[3]\n", + " boxes_tuple = ()\n", + " for j in range(self.num_classes):\n", + " boxes_tmp = self.split(boxes_all[j])\n", + " out_boxes_h = boxes_tmp[i] / scale_h\n", + " out_boxes_w = boxes_tmp[i] / scale_w\n", + " boxes_tuple += (self.select(self.bbox_mask, out_boxes_w, out_boxes_h),)\n", + " boxes_all_with_batchsize += (boxes_tuple,)\n", + "\n", + " output = self.multiclass_nms(boxes_all_with_batchsize, scores_all, mask_all, mask_fb_all)\n", + "\n", + " return output\n", + "\n", + " def multiclass_nms(self, boxes_all, scores_all, mask_all, mask_fb_all):\n", + " \"\"\"\n", + " Multiscale postprocessing.\n", + "\n", + " Args:\n", + " boxes_all (tuple): All bounding boxes.\n", + " scores_all (tuple): All scores.\n", + " mask_all (tuple): All masks.\n", + " mask_fb_all (tuple): All feedback masks.\n", + "\n", + " Returns:\n", + " - all_bboxes, tuple, output bounding boxes with the same shape of boxes_all.\n", + " - all_labels, tuple, output labels with the same shape of scores_all.\n", + " - all_masks, tuple, output masks with the same shape of mask_all.\n", + " - all_masks_fb, tuple, output feedback masks with the same shape of 
mask_fb_all.\n", + " \"\"\"\n", + " all_bboxes = ()\n", + " all_labels = ()\n", + " all_masks = ()\n", + " all_masks_fb = ()\n", + "\n", + " for i in range(self.test_batch_size):\n", + " bboxes = boxes_all[i]\n", + " scores = scores_all[i]\n", + " masks = self.cast(mask_all[i], mstype.bool_)\n", + " masks_fb = mask_fb_all[i]\n", + " mask_fb_all_x = self.split_fb_mask(masks_fb)\n", + "\n", + " res_boxes_tuple = ()\n", + " res_labels_tuple = ()\n", + " res_masks_tuple = ()\n", + " res_masks_fb_tuple = ()\n", + "\n", + " for j in range(self.num_classes - 1):\n", + " k = j + 1\n", + " cls_scores_x = scores[::, k:k + 1:1]\n", + " bboxes_x = self.squeeze(bboxes[k])\n", + " mask_ox = self.reshape(masks, (self.rpn_max_num, 1))\n", + " masks_fb_x = self.squeeze(mask_fb_all_x[k])\n", + "\n", + " cls_mask = self.greater(cls_scores_x, self.test_score_thresh)\n", + " mask_x = self.logicand(mask_ox, cls_mask)\n", + "\n", + " reg_mask_x = self.cast(self.tile(self.cast(mask_x, mstype.int32), (1, 4)), mstype.bool_)\n", + "\n", + " bboxes_x = self.select(reg_mask_x, bboxes_x, self.test_box_zeros)\n", + " fb_mask_x = self.expand_dims(mask_x, -1)\n", + " mask_fb_mask_x = self.cast(self.tile(self.cast(fb_mask_x, mstype.int32), (1, 28, 28)), mstype.bool_)\n", + " masks_fb_x = self.select(mask_fb_mask_x, masks_fb_x, self.test_mask_fb_zeros)\n", + " cls_scores_x = self.select(mask_x, cls_scores_x, self.test_score_zeros)\n", + " cls_scores_x_next = self.squeeze(cls_scores_x)\n", + " scores_sorted, topk_inds = self.test_topk(cls_scores_x_next, self.rpn_max_num)\n", + " topk_inds = self.reshape(topk_inds, (self.rpn_max_num, 1))\n", + " scores_sorted = self.reshape(scores_sorted, (self.rpn_max_num, 1))\n", + " bboxes_x_sorted = self.gather(bboxes_x, topk_inds)\n", + " mask_fb_sorted_x = self.gather(masks_fb_x, topk_inds)\n", + " mask_sorted_x = self.gather(mask_x, topk_inds)\n", + "\n", + " scores_sorted = self.tile(scores_sorted, (1, 4))\n", + " cls_dets = self.concat_1((bboxes_x_sorted, 
scores_sorted))\n", + " cls_dets = P.Slice()(cls_dets, (0, 0), (self.rpn_max_num, 5))\n", + "\n", + " cls_dets, index_x, mask_nms_x = self.nms_test(cls_dets)\n", + " index_x = self.reshape(index_x, (self.rpn_max_num, 1))\n", + " mask_nms_x = self.reshape(mask_nms_x, (self.rpn_max_num, 1))\n", + "\n", + " mask_n_x = self.gather(mask_sorted_x, index_x)\n", + " mask_n_x = self.logicand(mask_n_x, mask_nms_x)\n", + "\n", + " mask_fb_x = self.gather(mask_fb_sorted_x, index_x)\n", + "\n", + " cls_labels = self.oneslike(index_x) * j\n", + " res_boxes_tuple += (cls_dets,)\n", + " res_labels_tuple += (cls_labels,)\n", + " res_masks_tuple += (mask_n_x,)\n", + " res_masks_fb_tuple += (mask_fb_x,)\n", + "\n", + " res_boxes_start = self.concat(res_boxes_tuple[:self.concat_start])\n", + " res_labels_start = self.concat(res_labels_tuple[:self.concat_start])\n", + " res_masks_start = self.concat(res_masks_tuple[:self.concat_start])\n", + " res_masks_fb_start = self.concat(res_masks_fb_tuple[:self.concat_start])\n", + "\n", + " res_boxes_end = self.concat(res_boxes_tuple[self.concat_start:self.concat_end])\n", + " res_labels_end = self.concat(res_labels_tuple[self.concat_start:self.concat_end])\n", + " res_masks_end = self.concat(res_masks_tuple[self.concat_start:self.concat_end])\n", + " res_masks_fb_end = self.concat(res_masks_fb_tuple[self.concat_start:self.concat_end])\n", + "\n", + " res_boxes = self.concat((res_boxes_start, res_boxes_end))\n", + " res_labels = self.concat((res_labels_start, res_labels_end))\n", + " res_masks = self.concat((res_masks_start, res_masks_end))\n", + " res_masks_fb = self.concat((res_masks_fb_start, res_masks_fb_end))\n", + "\n", + " reshape_size = (self.num_classes - 1) * self.rpn_max_num\n", + " res_boxes = self.reshape(res_boxes, (1, reshape_size, 5))\n", + " res_labels = self.reshape(res_labels, (1, reshape_size, 1))\n", + " res_masks = self.reshape(res_masks, (1, reshape_size, 1))\n", + " res_masks_fb = self.reshape(res_masks_fb, (1, 
reshape_size, 28, 28))\n", + "\n", + " all_bboxes += (res_boxes,)\n", + " all_labels += (res_labels,)\n", + " all_masks += (res_masks,)\n", + " all_masks_fb += (res_masks_fb,)\n", + "\n", + " all_bboxes = self.concat(all_bboxes)\n", + " all_labels = self.concat(all_labels)\n", + " all_masks = self.concat(all_masks)\n", + " all_masks_fb = self.concat(all_masks_fb)\n", + " return all_bboxes, all_labels, all_masks, all_masks_fb\n", + "\n", + " def get_anchors(self, featmap_sizes):\n", + " \"\"\"Get anchors according to feature map sizes.\n", + "\n", + " Args:\n", + " featmap_sizes (list[tuple]): Multi-level feature map sizes.\n", + " img_metas (list[dict]): Image meta info.\n", + "\n", + " Returns:\n", + " Tuple, anchors of each image, valid flags of each image\n", + " \"\"\"\n", + " num_levels = len(featmap_sizes)\n", + "\n", + " # since feature map sizes of all images are the same, we only compute\n", + " # anchors for one time\n", + " multi_level_anchors = ()\n", + " for i in range(num_levels):\n", + " anchors = self.anchor_generators[i].grid_anchors(featmap_sizes[i], self.anchor_strides[i])\n", + " multi_level_anchors += (Tensor(anchors.astype(self.np_cast_type)),)\n", + "\n", + " return multi_level_anchors\n", + "\n", + " def rcnn_mask_test(self, x, rois, cls_pred, reg_pred):\n", + " \"\"\"\n", + " Prediction masks in an images by the bounding boxes.\n", + "\n", + " Args:\n", + " x (Cell): Input layer.\n", + " rois (List): Region of Interest.\n", + " cls_pred (float): Classification prediction.\n", + " reg_pred (float): Regression prediction.\n", + "\n", + " Returns:\n", + " Cell, masked rcnn layer.\n", + " \"\"\"\n", + " cls_scores = self.softmax(cls_pred / self.value)\n", + "\n", + " cls_scores_all = self.split(cls_scores)\n", + " reg_pred = self.reshape(reg_pred, (-1, self.num_classes, 4))\n", + " reg_pred_all = self.split(reg_pred)\n", + " rois_all = self.split(rois)\n", + " boxes_tuple = ()\n", + " for i in range(self.test_batch_size):\n", + " 
cls_score_max_index, _ = self.argmax_with_value(cls_scores_all[i])\n", + " cls_score_max_index = self.cast(self.onehot(cls_score_max_index, self.num_classes,\n", + " self.on_value, self.off_value), self.cast_type)\n", + " cls_score_max_index = self.expand_dims(cls_score_max_index, -1)\n", + " cls_score_max_index = self.tile(cls_score_max_index, (1, 1, 4))\n", + " reg_pred_max = reg_pred_all[i] * cls_score_max_index\n", + " reg_pred_max = self.reducesum(reg_pred_max, 1)\n", + " out_boxes_i = self.decode(rois_all[i], reg_pred_max)\n", + " boxes_tuple += (out_boxes_i,)\n", + "\n", + " boxes_all = self.concat(boxes_tuple)\n", + " boxes_rois = self.concat_1((self.roi_align_index_test_tensor, boxes_all))\n", + " boxes_rois = self.cast(boxes_rois, self.cast_type)\n", + " roi_feats_mask_test = self.roi_align_mask_test(boxes_rois,\n", + " self.cast(x[0], mstype.float32),\n", + " self.cast(x[1], mstype.float32),\n", + " self.cast(x[2], mstype.float32),\n", + " self.cast(x[3], mstype.float32))\n", + " roi_feats_mask_test = self.cast(roi_feats_mask_test, self.cast_type)\n", + " mask_fb_pred_all = self.rcnn_mask(roi_feats_mask_test)\n", + " return mask_fb_pred_all\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 连接网络和损失函数\n", + "\n", + "MindSpore将损失函数、优化器等操作都封装到了Cell中,我们需要自定义WithLossCell类,将网络和Loss连接起来。\n", + "\n", + "Mask RCNN的损失函数被定义为:\n", + "\n", + "$$\n", + "L=L_{cls}+L_{box}+L_{mask}\n", + "$$\n", + "\n", + "$L_{cls}$类别损失:rpn class和rcnn_cls的类别损失都是交叉熵损失。\n", + "\n", + "$L_{box}$边框损失(Smooth L1):\n", + "\n", + "$$\n", + "\\operatorname{smooth}_{L_{1}}(x)= \\begin{cases}0.5 x^{2} & \\text { if }|x|<1 \\\\ |x|-0.5 & \\text { otherwise }\\end{cases}\n", + "$$\n", + "\n", + "$L_{mask}$掩膜损失:\n", + "\n", + "只对rcnn_mask计算二元(0/1)交叉熵损失。" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "TIME_STAMP_INIT = False\n", + "TIME_STAMP_FIRST = 0\n", + "\n", + "GRADIENT_CLIP_TYPE = 1\n", +
"GRADIENT_CLIP_VALUE = 1.0\n", + "\n", + "clip_grad = C.MultitypeFuncGraph(\"clip_grad\")\n", + "\n", + "@clip_grad.register(\"Number\", \"Number\", \"Tensor\")\n", + "def _clip_grad(clip_type, clip_value, grad):\n", + " \"\"\"\n", + " Clip gradients.\n", + "\n", + " Args:\n", + " clip_type (int): The way to clip, 0 for 'value', 1 for 'norm'.\n", + " clip_value (float): Specifies how much to clip.\n", + " grad (tuple[Tensor]): Gradients.\n", + "\n", + " Returns:\n", + " tuple[Tensor], clipped gradients.\n", + " \"\"\"\n", + " if clip_type not in (0, 1):\n", + " return grad\n", + " dt = F.dtype(grad)\n", + " mf_cast = F.cast(F.tuple_to_array((-clip_value,)), dt)\n", + " pf_cast = F.cast(F.tuple_to_array((clip_value,)), dt)\n", + " if clip_type == 0:\n", + " new_grad = C.clip_by_value(grad, mf_cast, pf_cast)\n", + " else:\n", + " new_grad = nn.ClipByNorm()(grad, pf_cast)\n", + " return F.cast(new_grad, dt)\n", + "\n", + "class LossCallBack(Callback):\n", + " \"\"\"\n", + " Monitor the loss in training.\n", + "\n", + " If the loss is NAN or INF terminating training.\n", + "\n", + " Note:\n", + " If per_print_times is 0 do not print loss.\n", + "\n", + " Args:\n", + " per_print_times (int): Print loss every times. 
Default: 1.\n", + " \"\"\"\n", + "\n", + " def __init__(self, per_print_times=1, rank_id=0):\n", + " super(LossCallBack, self).__init__()\n", + " if not isinstance(per_print_times, int) or per_print_times < 0:\n", + " raise ValueError(\"print_step must be int and >= 0.\")\n", + " self._per_print_times = per_print_times\n", + " self.count = 0\n", + " self.loss_sum = 0\n", + " self.rank_id = rank_id\n", + "\n", + " global TIME_STAMP_INIT, TIME_STAMP_FIRST\n", + " if not TIME_STAMP_INIT:\n", + " TIME_STAMP_FIRST = time.time()\n", + " TIME_STAMP_INIT = True\n", + "\n", + " def step_end(self, run_context):\n", + " \"\"\"set the end of step\"\"\"\n", + " cb_params = run_context.original_args()\n", + " loss = cb_params.net_outputs.asnumpy()\n", + " cur_step_in_epoch = (cb_params.cur_step_num - 1) % cb_params.batch_num + 1\n", + " cur_time = time.time()\n", + " self.count += 1\n", + " self.loss_sum += float(loss)\n", + "\n", + " if self.count >= 1:\n", + " global TIME_STAMP_FIRST\n", + " time_stamp_current = time.time()\n", + " total_loss = self.loss_sum/self.count\n", + "\n", + " print(\"%lu epoch: %s step: %s total_loss: %.5f\" %\n", + " (time_stamp_current - TIME_STAMP_FIRST,\n", + " cb_params.cur_epoch_num, cur_step_in_epoch, total_loss))\n", + " loss_file = open(\"./loss_{}.log\".format(self.rank_id), \"a+\")\n", + " loss_file.write(\"%lu epoch: %s step: %s total_loss: %.5f\" %\n", + " (time_stamp_current - TIME_STAMP_FIRST,\n", + " cb_params.cur_epoch_num, cur_step_in_epoch,\n", + " total_loss))\n", + " loss_file.write(\"\\n\")\n", + " loss_file.close()\n", + "\n", + " self.count = 0\n", + " self.loss_sum = 0\n", + "\n", + " if cur_step_in_epoch > 100 and total_loss < 1:\n", + " print(\"End training, time:\", cur_time, \",epoch:\", cb_params.cur_epoch_num,\n", + " \",step:\", cur_step_in_epoch, \",loss:\", total_loss)\n", + " run_context.request_stop()\n", + "\n", + "\n", + "class LossNet(nn.Cell):\n", + " \"\"\"MaskRcnn loss sum\"\"\"\n", + " def construct(self, x1, 
x2, x3, x4, x5, x6, x7):\n", + " return x1 + x2\n", + "\n", + "\n", + "class WithLossCell(nn.Cell):\n", + " \"\"\"\n", + " Wrap the network with loss function to compute loss.\n", + "\n", + " Args:\n", + " backbone (Cell): The target network to wrap.\n", + " loss_fn (Cell): The loss function used to compute loss.\n", + "\n", + " Inputs:\n", + " - **x** (Tensor) - Input variant.\n", + " - **img_shape** (Tensor) - Img shape.\n", + " - **gt_bboxe** (Tensor) - Ground truth bounding boxes.\n", + " - **gt_label** (Tensor) - Ground truth labels.\n", + " - **gt_num** (int) - The number of ground truth.\n", + " - **gt_mask** (Tensor) - Ground truth mask.\n", + "\n", + " Outputs:\n", + " Loss network, Cell\n", + "\n", + " Support Platform:\n", + " \"Ascend\" \"CPU\" \"GPU\"\n", + "\n", + " Examples:\n", + " >>> net = MaskRcnnMobilenetV1(config=config)\n", + " >>> loss = LossNet()\n", + " >>> net_with_loss = WithLossCell(network, loss)\n", + " \"\"\"\n", + " def __init__(self, backbone, loss_fn):\n", + " super(WithLossCell, self).__init__(auto_prefix=False)\n", + " self._backbone = backbone\n", + " self._loss_fn = loss_fn\n", + "\n", + " def construct(self, x, img_shape, gt_bboxe, gt_label, gt_num, gt_mask):\n", + " loss1, loss2, loss3, loss4, loss5, loss6, loss7 = \\\n", + " self._backbone(x, img_shape, gt_bboxe, gt_label, gt_num, gt_mask)\n", + " return self._loss_fn(loss1, loss2, loss3, loss4, loss5, loss6, loss7)\n", + "\n", + " @property\n", + " def backbone_network(self):\n", + " \"\"\"\n", + " Get the backbone network.\n", + "\n", + " Returns:\n", + " Cell, return backbone network.\n", + " \"\"\"\n", + " return self._backbone\n", + "\n", + "class TrainOneStepCell(nn.Cell):\n", + " \"\"\"\n", + " Network training package class.\n", + "\n", + " Append an optimizer to the training network\n", + " after that the construct function.\n", + " can be called to create the backward graph.\n", + "\n", + " Args:\n", + " network (Cell): The training network.\n", + " optimizer 
(Cell): Optimizer for updating the weights.\n", + " sens (Number): The adjust parameter. Default: 1.0.\n", + " reduce_flag (bool): Whether to reduce gradients with DistributedGradReducer. Default: False.\n", + " mean (bool): Allreduce mean flag forwarded to DistributedGradReducer. Default: True.\n", + " degree (int): Device number forwarded to DistributedGradReducer. Default: None.\n", + "\n", + " Inputs:\n", + " - **x** (Tensor) - Input variant.\n", + " - **img_shape** (Tensor) - Img shape.\n", + " - **gt_bboxe** (Tensor) - Ground truth bounding boxes.\n", + " - **gt_label** (Tensor) - Ground truth labels.\n", + " - **gt_num** (int) - The number of ground truth.\n", + " - **gt_mask** (Tensor) - Ground truth mask.\n", + "\n", + " Outputs:\n", + " Float, loss result.\n", + "\n", + " Support Platform:\n", + " ``Ascend`` ``CPU`` ``GPU``\n", + "\n", + " Examples:\n", + " >>> from utils.config import config\n", + " >>> from model.mask_rcnn_r50 import MaskRcnnResnet50\n", + " >>> net = MaskRcnnResnet50(config=config)\n", + " >>> loss = LossNet()\n", + " >>> net_with_loss = WithLossCell(net, loss)\n", + " >>> lr = Tensor(dynamic_lr(config, rank_size=1, start_steps=0), mstype.float32)\n", + " >>> opt = Momentum(params=net.trainable_params(), learning_rate=lr, momentum=0.91,\n", + " ... 
weight_decay=1e-4, loss_scale=1)\n", + " >>> net = TrainOneStepCell(net_with_loss, opt, sens=config.loss_scale)\n", + " \"\"\"\n", + " def __init__(self, network, optimizer, sens=1.0, reduce_flag=False, mean=True, degree=None):\n", + " super(TrainOneStepCell, self).__init__(auto_prefix=False)\n", + " self.network = network\n", + " self.network.set_grad()\n", + " self.weights = ParameterTuple(network.trainable_params())\n", + " self.optimizer = optimizer\n", + " self.grad = C.GradOperation(get_by_list=True, sens_param=True)\n", + "\n", + " self.sens = Tensor((np.ones((1,)) * sens).astype(np.float32))\n", + " self.reduce_flag = reduce_flag\n", + " self.hyper_map = C.HyperMap()\n", + " if reduce_flag:\n", + " self.grad_reducer = DistributedGradReducer(optimizer.parameters, mean, degree)\n", + "\n", + " def construct(self, x, img_shape, gt_bboxe, gt_label, gt_num, gt_mask):\n", + " \"\"\"Construct Network training package class.\"\"\"\n", + " weights = self.weights\n", + " loss = self.network(x, img_shape, gt_bboxe, gt_label, gt_num, gt_mask)\n", + " grads = self.grad(self.network, weights)(x, img_shape, gt_bboxe, gt_label, gt_num, gt_mask, self.sens)\n", + " if self.reduce_flag:\n", + " grads = self.grad_reducer(grads)\n", + " grads = self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads)\n", + " self.optimizer(grads)\n", + " return loss\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 训练\n", + "\n", + "### 模型训练参数\n", + "\n", + "在这里,我们列出了一些重要的训练参数。此外,您可以查看配置文件config.py的详细信息。\n", + "\n", + "| Parameter | Default | Description |\n", + "| ---- | ---- | ---- |\n", + "| workers | 1 | Number of parallel workers |\n", + "| device_target | GPU | Device type |\n", + "| learning_rate | 0.002 | learning rate |\n", + "| weight_decay | 1e-4 | Control weight decay speed |\n", + "| total_epoch | 13 | Number of epoch |\n", + "| batch_size | 2 | Batch size |\n", + "| dataset | coco | Dataset name |\n", + "| pre_trained | 
./checkpoint | The path of pretrained model |\n", + "| checkpoint_path | ./ckpt_0 | The path to save checkpoints |" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 训练模型\n", + "\n", + "模型训练需要定义好优化器、损失函数等。同时,可以加载预训练模型以加快模型训练。\n", + "\n", + "因此,我们定义权重文件加载函数。" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [], + "source": [ + "def load_pretrained_ckpt(net, load_path, device_target):\n", + " \"\"\"\n", + " Load pretrained checkpoint.\n", + "\n", + " Args:\n", + " net(Cell): Used Network\n", + " load_path(string): The path of checkpoint.\n", + " device_target(string): device target.\n", + "\n", + " Returns:\n", + " Cell, the network with pretrained weights.\n", + " \"\"\"\n", + " param_dict = load_checkpoint(load_path)\n", + " if config.pretrain_epoch_size == 0:\n", + " for item in list(param_dict.keys()):\n", + " if not (item.startswith('backbone') or item.startswith('rcnn_mask')):\n", + " param_dict.pop(item)\n", + "\n", + " if device_target == 'GPU':\n", + " for key, value in param_dict.items():\n", + " tensor = Tensor(value, mstype.float32)\n", + " param_dict[key] = Parameter(tensor, key)\n", + "\n", + " load_param_into_net(net, param_dict)\n", + " return net" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "本案例中,为了方便展示效果,选取了数据集中的部分数据进行了1个epoch的训练。由于加载了预训练模型,loss值快速趋于稳定,并在1附近波动,这可以作为判断模型收敛的一个标准。\n", + "\n", + "训练得到的ckpt文件被保存在checkpoint文件夹内,可以作为后续fine-tune以及推理的加载模型使用。" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Start create dataset!\n", + "total images num: 51790\n", + "Create dataset done!\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/maskrcnn/miniconda3/envs/mrcnn17/lib/python3.7/site-packages/ipykernel_launcher.py:155: DeprecationWarning: `np.bool` is a deprecated alias for the builtin `bool`. 
To silence this warning, use `bool` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.bool_` here.\n", + "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n", + "/home/maskrcnn/miniconda3/envs/mrcnn17/lib/python3.7/site-packages/ipykernel_launcher.py:156: DeprecationWarning: `np.bool` is a deprecated alias for the builtin `bool`. To silence this warning, use `bool` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.bool_` here.\n", + "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loading pretrained resnet50 checkpoint\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.385.002 [mindspore/train/serialization.py:648] For 'load_param_into_net', 83 parameters in the 'net' are not loaded, because they are not in the 'parameter_dict', please check whether the network structure is consistent when training and loading checkpoint.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.386.026 [mindspore/train/serialization.py:650] backbone.conv1.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.386.719 [mindspore/train/serialization.py:650] backbone.layer1.0.conv1.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.387.262 [mindspore/train/serialization.py:650] backbone.layer1.0.conv2.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.387.842 [mindspore/train/serialization.py:650] backbone.layer1.0.conv3.bias is not loaded.\n", + "[WARNING] 
ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.389.278 [mindspore/train/serialization.py:650] backbone.layer1.0.conv_down_sample.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.389.868 [mindspore/train/serialization.py:650] backbone.layer1.1.conv1.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.390.374 [mindspore/train/serialization.py:650] backbone.layer1.1.conv2.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.390.927 [mindspore/train/serialization.py:650] backbone.layer1.1.conv3.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.392.133 [mindspore/train/serialization.py:650] backbone.layer1.2.conv1.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.392.697 [mindspore/train/serialization.py:650] backbone.layer1.2.conv2.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.393.600 [mindspore/train/serialization.py:650] backbone.layer1.2.conv3.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.394.189 [mindspore/train/serialization.py:650] backbone.layer2.0.conv1.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.394.694 [mindspore/train/serialization.py:650] backbone.layer2.0.conv2.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.395.240 [mindspore/train/serialization.py:650] backbone.layer2.0.conv3.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.395.775 [mindspore/train/serialization.py:650] backbone.layer2.0.conv_down_sample.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.396.353 [mindspore/train/serialization.py:650] 
backbone.layer2.1.conv1.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.396.779 [mindspore/train/serialization.py:650] backbone.layer2.1.conv2.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.397.027 [mindspore/train/serialization.py:650] backbone.layer2.1.conv3.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.397.270 [mindspore/train/serialization.py:650] backbone.layer2.2.conv1.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.397.534 [mindspore/train/serialization.py:650] backbone.layer2.2.conv2.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.397.771 [mindspore/train/serialization.py:650] backbone.layer2.2.conv3.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.398.016 [mindspore/train/serialization.py:650] backbone.layer2.3.conv1.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.398.247 [mindspore/train/serialization.py:650] backbone.layer2.3.conv2.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.398.476 [mindspore/train/serialization.py:650] backbone.layer2.3.conv3.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.398.708 [mindspore/train/serialization.py:650] backbone.layer3.0.conv1.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.398.942 [mindspore/train/serialization.py:650] backbone.layer3.0.conv2.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.399.169 [mindspore/train/serialization.py:650] backbone.layer3.0.conv3.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.399.406 
[mindspore/train/serialization.py:650] backbone.layer3.0.conv_down_sample.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.399.636 [mindspore/train/serialization.py:650] backbone.layer3.1.conv1.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.399.865 [mindspore/train/serialization.py:650] backbone.layer3.1.conv2.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.400.095 [mindspore/train/serialization.py:650] backbone.layer3.1.conv3.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.400.335 [mindspore/train/serialization.py:650] backbone.layer3.2.conv1.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.400.562 [mindspore/train/serialization.py:650] backbone.layer3.2.conv2.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.400.793 [mindspore/train/serialization.py:650] backbone.layer3.2.conv3.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.401.032 [mindspore/train/serialization.py:650] backbone.layer3.3.conv1.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.401.276 [mindspore/train/serialization.py:650] backbone.layer3.3.conv2.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.401.515 [mindspore/train/serialization.py:650] backbone.layer3.3.conv3.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.401.741 [mindspore/train/serialization.py:650] backbone.layer3.4.conv1.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.401.968 [mindspore/train/serialization.py:650] backbone.layer3.4.conv2.bias is not loaded.\n", + "[WARNING] 
ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.402.211 [mindspore/train/serialization.py:650] backbone.layer3.4.conv3.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.402.446 [mindspore/train/serialization.py:650] backbone.layer3.5.conv1.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.402.673 [mindspore/train/serialization.py:650] backbone.layer3.5.conv2.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.402.916 [mindspore/train/serialization.py:650] backbone.layer3.5.conv3.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.405.019 [mindspore/train/serialization.py:650] backbone.layer4.0.conv1.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.405.285 [mindspore/train/serialization.py:650] backbone.layer4.0.conv2.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.405.528 [mindspore/train/serialization.py:650] backbone.layer4.0.conv3.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.405.785 [mindspore/train/serialization.py:650] backbone.layer4.0.conv_down_sample.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.406.027 [mindspore/train/serialization.py:650] backbone.layer4.1.conv1.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.406.279 [mindspore/train/serialization.py:650] backbone.layer4.1.conv2.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.406.533 [mindspore/train/serialization.py:650] backbone.layer4.1.conv3.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.406.770 [mindspore/train/serialization.py:650] backbone.layer4.2.conv1.bias is 
not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.407.016 [mindspore/train/serialization.py:650] backbone.layer4.2.conv2.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.407.259 [mindspore/train/serialization.py:650] backbone.layer4.2.conv3.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.407.502 [mindspore/train/serialization.py:650] fpn_ncek.lateral_convs_list.0.weight is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.407.749 [mindspore/train/serialization.py:650] fpn_ncek.lateral_convs_list.0.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.407.994 [mindspore/train/serialization.py:650] fpn_ncek.lateral_convs_list.1.weight is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.408.244 [mindspore/train/serialization.py:650] fpn_ncek.lateral_convs_list.1.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.408.494 [mindspore/train/serialization.py:650] fpn_ncek.lateral_convs_list.2.weight is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.408.787 [mindspore/train/serialization.py:650] fpn_ncek.lateral_convs_list.2.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.409.080 [mindspore/train/serialization.py:650] fpn_ncek.lateral_convs_list.3.weight is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.409.370 [mindspore/train/serialization.py:650] fpn_ncek.lateral_convs_list.3.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.409.689 [mindspore/train/serialization.py:650] fpn_ncek.fpn_convs_list.0.weight is not loaded.\n", + "[WARNING] 
ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.409.987 [mindspore/train/serialization.py:650] fpn_ncek.fpn_convs_list.0.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.410.280 [mindspore/train/serialization.py:650] fpn_ncek.fpn_convs_list.1.weight is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.410.584 [mindspore/train/serialization.py:650] fpn_ncek.fpn_convs_list.1.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.412.125 [mindspore/train/serialization.py:650] fpn_ncek.fpn_convs_list.2.weight is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.412.393 [mindspore/train/serialization.py:650] fpn_ncek.fpn_convs_list.2.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.412.655 [mindspore/train/serialization.py:650] fpn_ncek.fpn_convs_list.3.weight is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.412.897 [mindspore/train/serialization.py:650] fpn_ncek.fpn_convs_list.3.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.413.181 [mindspore/train/serialization.py:650] rpn_with_loss.rpn_convs_list.0.rpn_conv.weight is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.413.429 [mindspore/train/serialization.py:650] rpn_with_loss.rpn_convs_list.0.rpn_conv.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.413.698 [mindspore/train/serialization.py:650] rpn_with_loss.rpn_convs_list.0.rpn_cls.weight is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.413.936 [mindspore/train/serialization.py:650] rpn_with_loss.rpn_convs_list.0.rpn_cls.bias is not loaded.\n", + "[WARNING] 
ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.414.199 [mindspore/train/serialization.py:650] rpn_with_loss.rpn_convs_list.0.rpn_reg.weight is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.414.440 [mindspore/train/serialization.py:650] rpn_with_loss.rpn_convs_list.0.rpn_reg.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.414.693 [mindspore/train/serialization.py:650] rcnn_cls.fpn_cls.shared_fc_0.weight is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.414.942 [mindspore/train/serialization.py:650] rcnn_cls.fpn_cls.shared_fc_0.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.415.197 [mindspore/train/serialization.py:650] rcnn_cls.fpn_cls.shared_fc_1.weight is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.415.458 [mindspore/train/serialization.py:650] rcnn_cls.fpn_cls.shared_fc_1.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.415.708 [mindspore/train/serialization.py:650] rcnn_cls.fpn_cls.cls_scores.weight is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.415.969 [mindspore/train/serialization.py:650] rcnn_cls.fpn_cls.cls_scores.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.416.216 [mindspore/train/serialization.py:650] rcnn_cls.fpn_cls.reg_scores.weight is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:30:32.416.476 [mindspore/train/serialization.py:650] rcnn_cls.fpn_cls.reg_scores.bias is not loaded.\n", + "[WARNING] PIPELINE(1289646,7fca63db83c0,python):2022-07-27-23:30:35.375.397 [mindspore/ccsrc/pipeline/jit/pipeline.cc:173] CheckArgValid] The data types of Tensor:[[ True True True False False False False False False False False False\n", + " False 
False False False False False False False False False False False\n", + " False False False False False False False False False False False False\n", + " False False False False False False False False False False False False\n", + " False False False False False False False False False False False False\n", + " False False False False False False False False False False False False\n", + " False False False False False False False False False False False False\n", + " False False False False False False False False False False False False\n", + " False False False False False False False False False False False False\n", + " False False False False False False False False False False False False\n", + " False False False False False False False False]\n", + " [ True True True True True True True True False False False False\n", + " False False False False False False False False False False False False\n", + " False False False False False False False False False False False False\n", + " False False False False False False False False False False False False\n", + " False False False False False False False False False False False False\n", + " False False False False False False False False False False False False\n", + " False False False False False False False False False False False False\n", + " False False False False False False False False False False False False\n", + " False False False False False False False False False False False False\n", + " False False False False False False False False False False False False\n", + " False False False False False False False False]] is bool, which may cause SelectKernelInfo failure for operator [AddN]. For more details, please refer to the FAQ at https://www.mindspore.cn.\n", + "[WARNING] PIPELINE(1289646,7fca63db83c0,python):2022-07-27-23:30:35.377.424 [mindspore/ccsrc/pipeline/jit/pipeline.cc:173] CheckArgValid] The data types of Tensor:[[[[False False False ... 
False False False]\n", + " [False False False ... False False False]\n", + " [False False False ... False False False]\n", + " ...\n", + " [False False False ... False False False]\n", + " [False False False ... False False False]\n", + " [False False False ... False False False]]\n", + "\n", + " [[False False False ... False False False]\n", + " [False False False ... False False False]\n", + " [False False False ... False False False]\n", + " ...\n", + " [False False False ... False False False]\n", + " [False False False ... False False False]\n", + " [False False False ... False False False]]\n", + "\n", + " [[False False False ... False False False]\n", + " [False False False ... False False False]\n", + " [False False False ... False False False]\n", + " ...\n", + " [False False False ... False False False]\n", + " [False False False ... False False False]\n", + " [False False False ... False False False]]\n", + "\n", + " ...\n", + "\n", + " [[False False False ... False False False]\n", + " [False False False ... False False False]\n", + " [False False False ... False False False]\n", + " ...\n", + " [False False False ... False False False]\n", + " [False False False ... False False False]\n", + " [False False False ... False False False]]\n", + "\n", + " [[False False False ... False False False]\n", + " [False False False ... False False False]\n", + " [False False False ... False False False]\n", + " ...\n", + " [False False False ... False False False]\n", + " [False False False ... False False False]\n", + " [False False False ... False False False]]\n", + "\n", + " [[False False False ... False False False]\n", + " [False False False ... False False False]\n", + " [False False False ... False False False]\n", + " ...\n", + " [False False False ... False False False]\n", + " [False False False ... False False False]\n", + " [False False False ... False False False]]]\n", + "\n", + "\n", + " [[[False False False ... 
False False False]\n", + " [False False False ... False False False]\n", + " [False False False ... False False False]\n", + " ...\n", + " [False False False ... False False False]\n", + " [False False False ... False False False]\n", + " [False False False ... False False False]]\n", + "\n", + " [[False False False ... False False False]\n", + " [False False False ... False False False]\n", + " [False False False ... False False False]\n", + " ...\n", + " [False False False ... False False False]\n", + " [False False False ... False False False]\n", + " [False False False ... False False False]]\n", + "\n", + " [[False False False ... False False False]\n", + " [False False False ... False False False]\n", + " [False False False ... False False False]\n", + " ...\n", + " [False False False ... False False False]\n", + " [False False False ... False False False]\n", + " [False False False ... False False False]]\n", + "\n", + " ...\n", + "\n", + " [[False False False ... False False False]\n", + " [False False False ... False False False]\n", + " [False False False ... False False False]\n", + " ...\n", + " [False False False ... False False False]\n", + " [False False False ... False False False]\n", + " [False False False ... False False False]]\n", + "\n", + " [[False False False ... False False False]\n", + " [False False False ... False False False]\n", + " [False False False ... False False False]\n", + " ...\n", + " [False False False ... False False False]\n", + " [False False False ... False False False]\n", + " [False False False ... False False False]]\n", + "\n", + " [[False False False ... False False False]\n", + " [False False False ... False False False]\n", + " [False False False ... False False False]\n", + " ...\n", + " [False False False ... False False False]\n", + " [False False False ... False False False]\n", + " [False False False ... False False False]]]] is bool, which may cause SelectKernelInfo failure for operator [AddN]. 
For more details, please refer to the FAQ at https://www.mindspore.cn.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "58 epoch: 1 step: 1 total_loss: 5.75554\n", + "59 epoch: 1 step: 2 total_loss: 5.62021\n", + "60 epoch: 1 step: 3 total_loss: 4.39285\n", + "60 epoch: 1 step: 4 total_loss: 3.12360\n", + "61 epoch: 1 step: 5 total_loss: 2.20772\n", + "62 epoch: 1 step: 6 total_loss: 2.08234\n", + "62 epoch: 1 step: 7 total_loss: 1.85937\n", + "63 epoch: 1 step: 8 total_loss: 1.48294\n", + "64 epoch: 1 step: 9 total_loss: 2.45977\n", + "64 epoch: 1 step: 10 total_loss: 4.78331\n", + "65 epoch: 1 step: 11 total_loss: 2.60320\n", + "66 epoch: 1 step: 12 total_loss: 2.66626\n", + "67 epoch: 1 step: 13 total_loss: 2.92546\n", + "68 epoch: 1 step: 14 total_loss: 1.05098\n", + "69 epoch: 1 step: 15 total_loss: 2.44125\n", + "70 epoch: 1 step: 16 total_loss: 1.25769\n", + "71 epoch: 1 step: 17 total_loss: 2.13439\n", + "72 epoch: 1 step: 18 total_loss: 1.85752\n", + "73 epoch: 1 step: 19 total_loss: 1.57048\n", + "73 epoch: 1 step: 20 total_loss: 1.16632\n", + "74 epoch: 1 step: 21 total_loss: 0.95452\n", + "75 epoch: 1 step: 22 total_loss: 1.38052\n", + "75 epoch: 1 step: 23 total_loss: 0.91722\n", + "76 epoch: 1 step: 24 total_loss: 0.92058\n", + "77 epoch: 1 step: 25 total_loss: 1.44209\n", + "78 epoch: 1 step: 26 total_loss: 1.62278\n", + "78 epoch: 1 step: 27 total_loss: 0.97781\n", + "79 epoch: 1 step: 28 total_loss: 1.98220\n", + "80 epoch: 1 step: 29 total_loss: 1.54636\n", + "80 epoch: 1 step: 30 total_loss: 1.13240\n", + "81 epoch: 1 step: 31 total_loss: 1.45014\n", + "82 epoch: 1 step: 32 total_loss: 0.87475\n", + "82 epoch: 1 step: 33 total_loss: 1.08100\n", + "83 epoch: 1 step: 34 total_loss: 1.36666\n", + "84 epoch: 1 step: 35 total_loss: 1.28859\n", + "84 epoch: 1 step: 36 total_loss: 1.48311\n", + "85 epoch: 1 step: 37 total_loss: 1.17995\n", + "86 epoch: 1 step: 38 total_loss: 1.42331\n", + "86 epoch: 1 step: 39 total_loss: 
1.17482\n", + "87 epoch: 1 step: 40 total_loss: 1.84335\n", + "88 epoch: 1 step: 41 total_loss: 1.44425\n", + "89 epoch: 1 step: 42 total_loss: 1.15063\n", + "89 epoch: 1 step: 43 total_loss: 0.96332\n", + "90 epoch: 1 step: 44 total_loss: 1.46840\n", + "91 epoch: 1 step: 45 total_loss: 1.93449\n", + "91 epoch: 1 step: 46 total_loss: 1.37041\n", + "92 epoch: 1 step: 47 total_loss: 1.96878\n", + "93 epoch: 1 step: 48 total_loss: 1.11222\n", + "93 epoch: 1 step: 49 total_loss: 1.23973\n", + "94 epoch: 1 step: 50 total_loss: 1.93399\n", + "95 epoch: 1 step: 51 total_loss: 0.91994\n", + "95 epoch: 1 step: 52 total_loss: 1.58976\n", + "96 epoch: 1 step: 53 total_loss: 1.69277\n", + "97 epoch: 1 step: 54 total_loss: 1.47368\n", + "97 epoch: 1 step: 55 total_loss: 1.21262\n", + "98 epoch: 1 step: 56 total_loss: 1.52950\n", + "99 epoch: 1 step: 57 total_loss: 0.98201\n", + "99 epoch: 1 step: 58 total_loss: 1.20535\n", + "100 epoch: 1 step: 59 total_loss: 1.22178\n", + "101 epoch: 1 step: 60 total_loss: 1.29352\n", + "102 epoch: 1 step: 61 total_loss: 1.34618\n", + "102 epoch: 1 step: 62 total_loss: 2.33590\n", + "103 epoch: 1 step: 63 total_loss: 1.24699\n", + "104 epoch: 1 step: 64 total_loss: 1.82364\n", + "104 epoch: 1 step: 65 total_loss: 1.18940\n", + "105 epoch: 1 step: 66 total_loss: 1.08232\n", + "106 epoch: 1 step: 67 total_loss: 1.29253\n", + "106 epoch: 1 step: 68 total_loss: 1.34224\n", + "107 epoch: 1 step: 69 total_loss: 1.15712\n", + "108 epoch: 1 step: 70 total_loss: 1.37997\n", + "108 epoch: 1 step: 71 total_loss: 1.35490\n", + "109 epoch: 1 step: 72 total_loss: 1.45236\n", + "110 epoch: 1 step: 73 total_loss: 1.71247\n", + "110 epoch: 1 step: 74 total_loss: 1.17064\n", + "111 epoch: 1 step: 75 total_loss: 1.47019\n", + "112 epoch: 1 step: 76 total_loss: 2.02627\n", + "112 epoch: 1 step: 77 total_loss: 0.79998\n", + "113 epoch: 1 step: 78 total_loss: 0.98788\n", + "114 epoch: 1 step: 79 total_loss: 1.17783\n", + "114 epoch: 1 step: 80 total_loss: 
0.96300\n", + "115 epoch: 1 step: 81 total_loss: 2.99116\n", + "116 epoch: 1 step: 82 total_loss: 1.02599\n", + "117 epoch: 1 step: 83 total_loss: 1.96789\n", + "117 epoch: 1 step: 84 total_loss: 1.10882\n", + "118 epoch: 1 step: 85 total_loss: 1.30341\n", + "119 epoch: 1 step: 86 total_loss: 1.18809\n", + "119 epoch: 1 step: 87 total_loss: 2.02210\n", + "120 epoch: 1 step: 88 total_loss: 0.93199\n", + "121 epoch: 1 step: 89 total_loss: 1.32749\n", + "121 epoch: 1 step: 90 total_loss: 1.17464\n", + "122 epoch: 1 step: 91 total_loss: 0.74877\n", + "123 epoch: 1 step: 92 total_loss: 1.13211\n", + "123 epoch: 1 step: 93 total_loss: 1.60181\n", + "124 epoch: 1 step: 94 total_loss: 1.62449\n", + "125 epoch: 1 step: 95 total_loss: 2.71518\n", + "125 epoch: 1 step: 96 total_loss: 1.31592\n", + "126 epoch: 1 step: 97 total_loss: 1.58154\n", + "127 epoch: 1 step: 98 total_loss: 0.87627\n", + "127 epoch: 1 step: 99 total_loss: 0.95769\n", + "128 epoch: 1 step: 100 total_loss: 1.19945\n", + "129 epoch: 1 step: 101 total_loss: 2.51535\n", + "129 epoch: 1 step: 102 total_loss: 0.93991\n", + "End training, time: 1658964762.9842348 ,epoch: 1 ,step: 102 ,loss: 0.9399106502532959\n", + "epoch time: 129033.264 ms, per step time: 2.491 ms\n" + ] + } + ], + "source": [ + "from utils.lr_schedule import dynamic_lr\n", + "\n", + "set_seed(1)\n", + "\n", + "def train_maskrcnn():\n", + " \"\"\"Construct the traning function\"\"\"\n", + " # Allocating memory Environment\n", + " device_target = config.device_target\n", + " rank = 0\n", + " device_num = 1\n", + " context.set_context(mode=context.GRAPH_MODE, device_target=device_target)\n", + "\n", + " print(\"Start create dataset!\")\n", + " # Call the interface for data processing\n", + " # It will generate mindrecord file in config.mindrecord_dir,\n", + " # and the file name is MaskRcnn.mindrecord0, 1, ... 
file_num.\n", + " prefix = \"MaskRcnn.mindrecord\"\n", + " mindrecord_dir = config.mindrecord_dir\n", + " mindrecord_file = os.path.join(mindrecord_dir, prefix + \"0\")\n", + " if rank == 0 and not os.path.exists(mindrecord_file):\n", + " create_mindrecord_dir(prefix, mindrecord_dir)\n", + " # When creating the MindDataset, use the first mindrecord file,\n", + " # such as MaskRcnn.mindrecord0.\n", + " dataset = create_coco_dataset(mindrecord_file, batch_size=config.batch_size, device_num=device_num, rank_id=rank)\n", + " dataset_size = dataset.get_dataset_size()\n", + " print(\"total images num: \", dataset_size)\n", + " print(\"Create dataset done!\")\n", + " # Net Instance\n", + " net = MaskRcnnResnet50(config=config)\n", + "\n", + " net = net.set_train()\n", + " # load pretrained model\n", + " load_path = config.pre_trained\n", + " if load_path != \"\":\n", + " print(\"Loading pretrained resnet50 checkpoint\")\n", + " net = load_pretrained_ckpt(net=net, load_path=load_path, device_target=device_target)\n", + "\n", + " loss = LossNet()\n", + " lr = Tensor(dynamic_lr(config, rank_size=device_num, start_steps=config.pretrain_epoch_size * dataset_size),\n", + " mstype.float32)\n", + " opt = Momentum(params=net.trainable_params(), learning_rate=lr, momentum=config.momentum,\n", + " weight_decay=config.weight_decay, loss_scale=config.loss_scale)\n", + " # wrap the loss function\n", + " net_with_loss = WithLossCell(net, loss)\n", + " # Use TrainOneStepCell to set up the training pipeline.\n", + " net = TrainOneStepCell(net_with_loss, opt, sens=config.loss_scale)\n", + " # Monitor the training process.\n", + " time_cb = TimeMonitor(data_size=dataset_size)\n", + " loss_cb = LossCallBack(rank_id=rank)\n", + " cb = [time_cb, loss_cb]\n", + " # save the trained model\n", + " if config.save_checkpoint:\n", + " # set saved weights.\n", + " ckpt_step = config.save_checkpoint_epochs * dataset_size\n", + " ckptconfig = CheckpointConfig(save_checkpoint_steps=ckpt_step, 
keep_checkpoint_max=config.keep_checkpoint_max)\n", + " save_checkpoint_path = os.path.join(config.save_checkpoint_path, 'ckpt_' + str(rank) + '/')\n", + " # apply saved weights.\n", + " ckpoint_cb = ModelCheckpoint(prefix='mask_rcnn', directory=save_checkpoint_path, config=ckptconfig)\n", + " cb += [ckpoint_cb]\n", + " # start training.\n", + " model = Model(net)\n", + " model.train(config.epoch_size, dataset, callbacks=cb, dataset_sink_mode=False)\n", + "\n", + "if __name__ == '__main__':\n", + " train_maskrcnn()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 评估\n", + "\n", + "完成训练后,我们可以将我们训练的模型保存在checkpoint目录下。\n", + "\n", + "在COCO的validation数据集上,可以评估我们训练好的模型的准确性。" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Start Eval!\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/maskrcnn/miniconda3/envs/mrcnn17/lib/python3.7/site-packages/ipykernel_launcher.py:155: DeprecationWarning: `np.bool` is a deprecated alias for the builtin `bool`. To silence this warning, use `bool` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.bool_` here.\n", + "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n", + "/home/maskrcnn/miniconda3/envs/mrcnn17/lib/python3.7/site-packages/ipykernel_launcher.py:156: DeprecationWarning: `np.bool` is a deprecated alias for the builtin `bool`. To silence this warning, use `bool` by itself. Doing this will not modify any behavior and is safe. 
If you specifically wanted the numpy scalar type, use `np.bool_` here.\n", + "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:32:48.595.02 [mindspore/train/serialization.py:648] For 'load_param_into_net', 53 parameters in the 'net' are not loaded, because they are not in the 'parameter_dict', please check whether the network structure is consistent when training and loading checkpoint.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:32:48.601.40 [mindspore/train/serialization.py:650] backbone.conv1.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:32:48.605.69 [mindspore/train/serialization.py:650] backbone.layer1.0.conv1.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:32:48.608.74 [mindspore/train/serialization.py:650] backbone.layer1.0.conv2.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:32:48.611.73 [mindspore/train/serialization.py:650] backbone.layer1.0.conv3.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:32:48.614.65 [mindspore/train/serialization.py:650] backbone.layer1.0.conv_down_sample.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:32:48.617.53 [mindspore/train/serialization.py:650] backbone.layer1.1.conv1.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:32:48.620.48 [mindspore/train/serialization.py:650] backbone.layer1.1.conv2.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:32:48.628.26 [mindspore/train/serialization.py:650] backbone.layer1.1.conv3.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:32:48.631.16 [mindspore/train/serialization.py:650] 
backbone.layer1.2.conv1.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:32:48.634.01 [mindspore/train/serialization.py:650] backbone.layer1.2.conv2.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:32:48.636.95 [mindspore/train/serialization.py:650] backbone.layer1.2.conv3.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:32:48.639.68 [mindspore/train/serialization.py:650] backbone.layer2.0.conv1.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:32:48.642.69 [mindspore/train/serialization.py:650] backbone.layer2.0.conv2.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:32:48.645.46 [mindspore/train/serialization.py:650] backbone.layer2.0.conv3.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:32:48.648.30 [mindspore/train/serialization.py:650] backbone.layer2.0.conv_down_sample.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:32:48.651.13 [mindspore/train/serialization.py:650] backbone.layer2.1.conv1.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:32:48.654.17 [mindspore/train/serialization.py:650] backbone.layer2.1.conv2.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:32:48.656.95 [mindspore/train/serialization.py:650] backbone.layer2.1.conv3.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:32:48.659.91 [mindspore/train/serialization.py:650] backbone.layer2.2.conv1.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:32:48.662.58 [mindspore/train/serialization.py:650] backbone.layer2.2.conv2.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:32:48.665.53 
[mindspore/train/serialization.py:650] backbone.layer2.2.conv3.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:32:48.668.37 [mindspore/train/serialization.py:650] backbone.layer2.3.conv1.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:32:48.671.12 [mindspore/train/serialization.py:650] backbone.layer2.3.conv2.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:32:48.673.90 [mindspore/train/serialization.py:650] backbone.layer2.3.conv3.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:32:48.676.95 [mindspore/train/serialization.py:650] backbone.layer3.0.conv1.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:32:48.679.81 [mindspore/train/serialization.py:650] backbone.layer3.0.conv2.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:32:48.682.63 [mindspore/train/serialization.py:650] backbone.layer3.0.conv3.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:32:48.685.38 [mindspore/train/serialization.py:650] backbone.layer3.0.conv_down_sample.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:32:48.688.34 [mindspore/train/serialization.py:650] backbone.layer3.1.conv1.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:32:48.691.18 [mindspore/train/serialization.py:650] backbone.layer3.1.conv2.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:32:48.694.14 [mindspore/train/serialization.py:650] backbone.layer3.1.conv3.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:32:48.696.78 [mindspore/train/serialization.py:650] backbone.layer3.2.conv1.bias is not loaded.\n", + "[WARNING] 
ME(1289646:140507235451840,MainProcess):2022-07-27-23:32:48.699.74 [mindspore/train/serialization.py:650] backbone.layer3.2.conv2.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:32:48.702.42 [mindspore/train/serialization.py:650] backbone.layer3.2.conv3.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:32:48.705.18 [mindspore/train/serialization.py:650] backbone.layer3.3.conv1.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:32:48.707.91 [mindspore/train/serialization.py:650] backbone.layer3.3.conv2.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:32:48.710.83 [mindspore/train/serialization.py:650] backbone.layer3.3.conv3.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:32:48.713.68 [mindspore/train/serialization.py:650] backbone.layer3.4.conv1.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:32:48.716.59 [mindspore/train/serialization.py:650] backbone.layer3.4.conv2.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:32:48.719.24 [mindspore/train/serialization.py:650] backbone.layer3.4.conv3.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:32:48.722.19 [mindspore/train/serialization.py:650] backbone.layer3.5.conv1.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:32:48.724.90 [mindspore/train/serialization.py:650] backbone.layer3.5.conv2.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:32:48.727.85 [mindspore/train/serialization.py:650] backbone.layer3.5.conv3.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:32:48.730.67 [mindspore/train/serialization.py:650] backbone.layer4.0.conv1.bias is not loaded.\n", + 
"[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:32:48.733.62 [mindspore/train/serialization.py:650] backbone.layer4.0.conv2.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:32:48.736.40 [mindspore/train/serialization.py:650] backbone.layer4.0.conv3.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:32:48.739.31 [mindspore/train/serialization.py:650] backbone.layer4.0.conv_down_sample.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:32:48.741.95 [mindspore/train/serialization.py:650] backbone.layer4.1.conv1.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:32:48.744.82 [mindspore/train/serialization.py:650] backbone.layer4.1.conv2.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:32:48.747.67 [mindspore/train/serialization.py:650] backbone.layer4.1.conv3.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:32:48.750.58 [mindspore/train/serialization.py:650] backbone.layer4.2.conv1.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:32:48.753.23 [mindspore/train/serialization.py:650] backbone.layer4.2.conv2.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-27-23:32:48.756.11 [mindspore/train/serialization.py:650] backbone.layer4.2.conv3.bias is not loaded.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "loading annotations into memory...\n", + "Done (t=0.54s)\n", + "creating index...\n", + "index created!\n", + "total images num: 2500\n", + "Processing, please wait a moment.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[WARNING] PIPELINE(1289646,7fca63db83c0,python):2022-07-27-23:32:49.023.161 [mindspore/ccsrc/pipeline/jit/pipeline.cc:173] CheckArgValid] The data types of 
Tensor:[[False False False False False False False False False False False False\n", + " False False False False False False False False False False False False\n", + " False False False False False False False False False False False False\n", + " False False False False False False False False False False False False\n", + " False False False False False False False False False False False False\n", + " False False False False False False False False False False False False\n", + " False False False False False False False False False False False False\n", + " False False False False False False False False False False False False\n", + " False False False False False False False False False False False False\n", + " False False False False False False False False False False False False\n", + " False False False False False False False False]\n", + " [False False False False False False False False False False False False\n", + " False False False False False False False False False False False False\n", + " False False False False False False False False False False False False\n", + " False False False False False False False False False False False False\n", + " False False False False False False False False False False False False\n", + " False False False False False False False False False False False False\n", + " False False False False False False False False False False False False\n", + " False False False False False False False False False False False False\n", + " False False False False False False False False False False False False\n", + " False False False False False False False False False False False False\n", + " False False False False False False False False]] is bool, which may cause SelectKernelInfo failure for operator [AddN]. 
For more details, please refer to the FAQ at https://www.mindspore.cn.\n", + "[WARNING] PIPELINE(1289646,7fca63db83c0,python):2022-07-27-23:32:49.023.271 [mindspore/ccsrc/pipeline/jit/pipeline.cc:173] CheckArgValid] The data types of Tensor:[[[[False]]]\n", + "\n", + "\n", + " [[[False]]]] is bool, which may cause SelectKernelInfo failure for operator [AddN]. For more details, please refer to the FAQ at https://www.mindspore.cn.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Evaluation cost time 1896.9504017829895\n", + "Loading and preparing results...\n", + "DONE (t=1.67s)\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=27.94s).\n", + "Accumulating evaluation results...\n", + "DONE (t=5.42s).\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.374\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.599\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.403\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.235\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.415\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.474\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.312\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.501\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.530\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.363\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.571\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.656\n", + "Loading and preparing results...\n", + "DONE (t=4.10s)\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type 
*segm*\n", + "DONE (t=32.34s).\n", + "Accumulating evaluation results...\n", + "DONE (t=5.36s).\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.329\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.555\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.344\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.165\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.357\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.477\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.284\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.436\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.455\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.283\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.490\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.592\n", + "ckpt_path= ./checkpoint/maskrcnn_coco2017_acc32.9.ckpt\n" + ] + } + ], + "source": [ + "from pycocotools.coco import COCO\n", + "\n", + "from utils.util import coco_eval, bbox2result_1image, results2json, get_seg_masks\n", + "\n", + "set_seed(1)\n", + "\n", + "\n", + "def maskrcnn_eval(dataset_path, ckpt_path, ann_file):\n", + " \"\"\"\n", + " MaskRcnn evaluation.\n", + "\n", + " Args:\n", + " dataset_path(str): Dataset file path.\n", + " ckpt_path(str): Checkpoint file path.\n", + " ann_file(str): Annotations file path.\n", + " \"\"\"\n", + " ds = create_coco_dataset(dataset_path, batch_size=config.test_batch_size, is_training=False)\n", + "\n", + " net = MaskRcnnResnet50(config)\n", + " param_dict = load_checkpoint(ckpt_path)\n", + " load_param_into_net(net, param_dict)\n", + " net.set_train(False)\n", + "\n", + " eval_iter = 0\n", + " total = ds.get_dataset_size()\n", + " outputs = []\n", 
+ " dataset_coco = COCO(ann_file)\n", + "\n", + " print(\"total images num: \", total)\n", + " print(\"Processing, please wait a moment.\")\n", + " max_num = 128\n", + " start = time.time()\n", + " for data in ds.create_dict_iterator(output_numpy=True, num_epochs=1):\n", + " eval_iter = eval_iter + 1\n", + "\n", + " img_data = data['image']\n", + " img_metas = data['image_shape']\n", + " gt_bboxes = data['box']\n", + " gt_labels = data['label']\n", + " gt_num = data['valid_num']\n", + " gt_mask = data[\"mask\"]\n", + "\n", + " # run net\n", + " output = net(Tensor(img_data), Tensor(img_metas), Tensor(gt_bboxes),\n", + " Tensor(gt_labels), Tensor(gt_num), Tensor(gt_mask))\n", + "\n", + " # output\n", + " all_bbox = output[0]\n", + " all_label = output[1]\n", + " all_mask = output[2]\n", + " all_mask_fb = output[3]\n", + "\n", + " for j in range(config.test_batch_size):\n", + " all_bbox_squee = np.squeeze(all_bbox.asnumpy()[j, :, :])\n", + " all_label_squee = np.squeeze(all_label.asnumpy()[j, :, :])\n", + " all_mask_squee = np.squeeze(all_mask.asnumpy()[j, :, :])\n", + " all_mask_fb_squee = np.squeeze(all_mask_fb.asnumpy()[j, :, :, :])\n", + "\n", + " all_bboxes_tmp_mask = all_bbox_squee[all_mask_squee, :]\n", + " all_labels_tmp_mask = all_label_squee[all_mask_squee]\n", + " all_mask_fb_tmp_mask = all_mask_fb_squee[all_mask_squee, :, :]\n", + "\n", + " if all_bboxes_tmp_mask.shape[0] > max_num:\n", + " inds = np.argsort(-all_bboxes_tmp_mask[:, -1])\n", + " inds = inds[:max_num]\n", + " all_bboxes_tmp_mask = all_bboxes_tmp_mask[inds]\n", + " all_labels_tmp_mask = all_labels_tmp_mask[inds]\n", + " all_mask_fb_tmp_mask = all_mask_fb_tmp_mask[inds]\n", + "\n", + " bbox_results = bbox2result_1image(all_bboxes_tmp_mask, all_labels_tmp_mask, config.num_classes)\n", + " segm_results = get_seg_masks(all_mask_fb_tmp_mask, all_bboxes_tmp_mask, all_labels_tmp_mask,\n", + " img_metas[j], True, config.num_classes)\n", + " outputs.append((bbox_results, segm_results))\n", + "\n", + 
" end = time.time()\n", + " print(\"Evaluation cost time {}\".format(end - start))\n", + " eval_types = [\"bbox\", \"segm\"]\n", + " result_files = results2json(dataset_coco, outputs, \"./results.pkl\")\n", + " coco_eval(result_files, eval_types, dataset_coco, single_result=False)\n", + "\n", + "def eval_():\n", + " \"\"\"Execute the Evaluation.\"\"\"\n", + " device_target = config.device_target\n", + " context.set_context(mode=context.GRAPH_MODE, device_target=device_target)\n", + "\n", + " config.mindrecord_dir = os.path.join(config.data_root, config.mindrecord_dir)\n", + "\n", + " prefix = \"MaskRcnn_eval.mindrecord\"\n", + " mindrecord_dir = config.mindrecord_dir\n", + " mindrecord_file = os.path.join(mindrecord_dir, prefix)\n", + "\n", + " if not os.path.exists(mindrecord_file):\n", + " if not os.path.isdir(mindrecord_dir):\n", + " os.makedirs(mindrecord_dir)\n", + " if config.dataset == \"coco\":\n", + " if os.path.isdir(config.data_root):\n", + " print(\"Create Mindrecord.\")\n", + " data_to_mindrecord_byte_image(\"coco\", False, prefix, file_num=1)\n", + " print(\"Create Mindrecord Done, at {}\".format(mindrecord_dir))\n", + " else:\n", + " print(\"data_root not exits.\")\n", + " else:\n", + " if os.path.isdir(config.IMAGE_DIR) and os.path.exists(config.ANNO_PATH):\n", + " print(\"Create Mindrecord.\")\n", + " data_to_mindrecord_byte_image(\"other\", False, prefix, file_num=1)\n", + " print(\"Create Mindrecord Done, at {}\".format(mindrecord_dir))\n", + " else:\n", + " print(\"IMAGE_DIR or ANNO_PATH not exits.\")\n", + "\n", + " print(\"Start Eval!\")\n", + " maskrcnn_eval(mindrecord_file, config.checkpoint_path, config.ann_file)\n", + " print(\"ckpt_path=\", config.checkpoint_path)\n", + "\n", + "if __name__ == '__main__':\n", + " eval_()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 推理\n", + "\n", + "最后,可以使用自己的数据集来测试训练后的模型,完成目标检测。" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": 
[ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/maskrcnn/miniconda3/envs/mrcnn17/lib/python3.7/site-packages/ipykernel_launcher.py:155: DeprecationWarning: `np.bool` is a deprecated alias for the builtin `bool`. To silence this warning, use `bool` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.bool_` here.\n", + "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n", + "/home/maskrcnn/miniconda3/envs/mrcnn17/lib/python3.7/site-packages/ipykernel_launcher.py:156: DeprecationWarning: `np.bool` is a deprecated alias for the builtin `bool`. To silence this warning, use `bool` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.bool_` here.\n", + "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-28-00:20:38.325.709 [mindspore/train/serialization.py:648] For 'load_param_into_net', 53 parameters in the 'net' are not loaded, because they are not in the 'parameter_dict', please check whether the network structure is consistent when training and loading checkpoint.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-28-00:20:38.326.820 [mindspore/train/serialization.py:650] backbone.conv1.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-28-00:20:38.327.509 [mindspore/train/serialization.py:650] backbone.layer1.0.conv1.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-28-00:20:38.328.057 [mindspore/train/serialization.py:650] backbone.layer1.0.conv2.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-28-00:20:38.328.627 [mindspore/train/serialization.py:650] backbone.layer1.0.conv3.bias 
is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-28-00:20:38.329.161 [mindspore/train/serialization.py:650] backbone.layer1.0.conv_down_sample.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-28-00:20:38.329.722 [mindspore/train/serialization.py:650] backbone.layer1.1.conv1.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-28-00:20:38.330.237 [mindspore/train/serialization.py:650] backbone.layer1.1.conv2.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-28-00:20:38.330.765 [mindspore/train/serialization.py:650] backbone.layer1.1.conv3.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-28-00:20:38.331.291 [mindspore/train/serialization.py:650] backbone.layer1.2.conv1.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-28-00:20:38.331.863 [mindspore/train/serialization.py:650] backbone.layer1.2.conv2.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-28-00:20:38.334.193 [mindspore/train/serialization.py:650] backbone.layer1.2.conv3.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-28-00:20:38.334.770 [mindspore/train/serialization.py:650] backbone.layer2.0.conv1.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-28-00:20:38.335.286 [mindspore/train/serialization.py:650] backbone.layer2.0.conv2.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-28-00:20:38.335.863 [mindspore/train/serialization.py:650] backbone.layer2.0.conv3.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-28-00:20:38.336.367 [mindspore/train/serialization.py:650] backbone.layer2.0.conv_down_sample.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-28-00:20:38.336.922 
[mindspore/train/serialization.py:650] backbone.layer2.1.conv1.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-28-00:20:38.337.450 [mindspore/train/serialization.py:650] backbone.layer2.1.conv2.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-28-00:20:38.338.010 [mindspore/train/serialization.py:650] backbone.layer2.1.conv3.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-28-00:20:38.338.512 [mindspore/train/serialization.py:650] backbone.layer2.2.conv1.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-28-00:20:38.339.082 [mindspore/train/serialization.py:650] backbone.layer2.2.conv2.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-28-00:20:38.339.586 [mindspore/train/serialization.py:650] backbone.layer2.2.conv3.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-28-00:20:38.340.157 [mindspore/train/serialization.py:650] backbone.layer2.3.conv1.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-28-00:20:38.340.659 [mindspore/train/serialization.py:650] backbone.layer2.3.conv2.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-28-00:20:38.340.959 [mindspore/train/serialization.py:650] backbone.layer2.3.conv3.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-28-00:20:38.341.191 [mindspore/train/serialization.py:650] backbone.layer3.0.conv1.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-28-00:20:38.341.427 [mindspore/train/serialization.py:650] backbone.layer3.0.conv2.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-28-00:20:38.341.664 [mindspore/train/serialization.py:650] backbone.layer3.0.conv3.bias is not loaded.\n", + "[WARNING] 
ME(1289646:140507235451840,MainProcess):2022-07-28-00:20:38.341.898 [mindspore/train/serialization.py:650] backbone.layer3.0.conv_down_sample.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-28-00:20:38.342.142 [mindspore/train/serialization.py:650] backbone.layer3.1.conv1.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-28-00:20:38.342.369 [mindspore/train/serialization.py:650] backbone.layer3.1.conv2.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-28-00:20:38.342.598 [mindspore/train/serialization.py:650] backbone.layer3.1.conv3.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-28-00:20:38.342.832 [mindspore/train/serialization.py:650] backbone.layer3.2.conv1.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-28-00:20:38.343.072 [mindspore/train/serialization.py:650] backbone.layer3.2.conv2.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-28-00:20:38.343.297 [mindspore/train/serialization.py:650] backbone.layer3.2.conv3.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-28-00:20:38.343.526 [mindspore/train/serialization.py:650] backbone.layer3.3.conv1.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-28-00:20:38.343.755 [mindspore/train/serialization.py:650] backbone.layer3.3.conv2.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-28-00:20:38.343.991 [mindspore/train/serialization.py:650] backbone.layer3.3.conv3.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-28-00:20:38.344.235 [mindspore/train/serialization.py:650] backbone.layer3.4.conv1.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-28-00:20:38.344.489 [mindspore/train/serialization.py:650] backbone.layer3.4.conv2.bias is 
not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-28-00:20:38.344.737 [mindspore/train/serialization.py:650] backbone.layer3.4.conv3.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-28-00:20:38.344.995 [mindspore/train/serialization.py:650] backbone.layer3.5.conv1.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-28-00:20:38.345.244 [mindspore/train/serialization.py:650] backbone.layer3.5.conv2.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-28-00:20:38.345.505 [mindspore/train/serialization.py:650] backbone.layer3.5.conv3.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-28-00:20:38.345.768 [mindspore/train/serialization.py:650] backbone.layer4.0.conv1.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-28-00:20:38.347.967 [mindspore/train/serialization.py:650] backbone.layer4.0.conv2.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-28-00:20:38.348.233 [mindspore/train/serialization.py:650] backbone.layer4.0.conv3.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-28-00:20:38.348.494 [mindspore/train/serialization.py:650] backbone.layer4.0.conv_down_sample.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-28-00:20:38.348.758 [mindspore/train/serialization.py:650] backbone.layer4.1.conv1.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-28-00:20:38.349.290 [mindspore/train/serialization.py:650] backbone.layer4.1.conv2.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-28-00:20:38.349.560 [mindspore/train/serialization.py:650] backbone.layer4.1.conv3.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-28-00:20:38.349.817 [mindspore/train/serialization.py:650] 
backbone.layer4.2.conv1.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-28-00:20:38.350.074 [mindspore/train/serialization.py:650] backbone.layer4.2.conv2.bias is not loaded.\n", + "[WARNING] ME(1289646:140507235451840,MainProcess):2022-07-28-00:20:38.350.332 [mindspore/train/serialization.py:650] backbone.layer4.2.conv3.bias is not loaded.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Image ID: 1061\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[WARNING] PIPELINE(1289646,7fca63db83c0,python):2022-07-28-00:21:29.239.761 [mindspore/ccsrc/pipeline/jit/pipeline.cc:173] CheckArgValid] The data types of Tensor:[[False False False False False False False False False False False False\n", + " False False False False False False False False False False False False\n", + " False False False False False False False False False False False False\n", + " False False False False False False False False False False False False\n", + " False False False False False False False False False False False False\n", + " False False False False False False False False False False False False\n", + " False False False False False False False False False False False False\n", + " False False False False False False False False False False False False\n", + " False False False False False False False False False False False False\n", + " False False False False False False False False False False False False\n", + " False False False False False False False False]\n", + " [False False False False False False False False False False False False\n", + " False False False False False False False False False False False False\n", + " False False False False False False False False False False False False\n", + " False False False False False False False False False False False False\n", + " False False False False False False False False False False False False\n", + " False False False False 
False False False False False False False False\n", + " False False False False False False False False False False False False\n", + " False False False False False False False False False False False False\n", + " False False False False False False False False False False False False\n", + " False False False False False False False False False False False False\n", + " False False False False False False False False]] is bool, which may cause SelectKernelInfo failure for operator [AddN]. For more details, please refer to the FAQ at https://www.mindspore.cn.\n", + "[WARNING] PIPELINE(1289646,7fca63db83c0,python):2022-07-28-00:21:29.239.874 [mindspore/ccsrc/pipeline/jit/pipeline.cc:173] CheckArgValid] The data types of Tensor:[[[[False]]]\n", + "\n", + "\n", + " [[[False]]]] is bool, which may cause SelectKernelInfo failure for operator [AddN]. For more details, please refer to the FAQ at https://www.mindspore.cn.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Cost time of detection: 34.88\n", + "Class Num: 4\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import random\n", + "import colorsys\n", + "\n", + "import matplotlib.pyplot as plt\n", + "import matplotlib.patches as patches\n", + "\n", + "\n", + "set_seed(1)\n", + "\n", + "def get_ax(rows=1, cols=1, size=16):\n", + " \"\"\"\n", + " Set axis\n", + "\n", + " Return a Matplotlib Axes array to be used in all visualizations in the notebook. Provide a central\n", + " point to control graph sizes.\n", + " Adjust the size attribute to control how big to render images.\n", + "\n", + " Args:\n", + " rows(int): Row size. Default: 1.\n", + " cols(int): Column size. Default: 1.\n", + " size(int): Pixel size. Default: 16.\n", + "\n", + " Returns:\n", + " Array, array of Axes\n", + " \"\"\"\n", + " _, axis = plt.subplots(rows, cols, figsize=(size*cols, size*rows))\n", + " return axis\n", + "\n", + "def mindrecord_to_rgb(img_data):\n", + " \"\"\"\n", + " Returns a RGB image from evaluated results.\n", + " Args:\n", + " rows(Array): An image.\n", + "\n", + " Returns:\n", + " Array, a RGB image.\n", + " \"\"\"\n", + " index = 0\n", + " convert_img = (-np.min(img_data[index, :, :, :])+img_data[index, :, :, :]) *\\\n", + " 255/(np.max(img_data[index, :, :, :])-np.min(img_data[index, :, :, :]))\n", + " temp_img = convert_img.astype(np.uint8)\n", + " image = np.zeros([config.img_height, config.img_width, 3])\n", + " image[:, :, 0] = temp_img[0, :, :]\n", + " image[:, :, 1] = temp_img[1, :, :]\n", + " image[:, :, 2] = temp_img[2, :, :]\n", + " return image\n", + "\n", + "def random_colors(num, bright=True):\n", + " \"\"\"\n", + " Generate random colors.\n", + "\n", + " To get visually distinct colors, generate them in HSV space then\n", + " convert to RGB.\n", + "\n", + " Args:\n", + " num(int): The color number.\n", + "\n", + " Returns:\n", + " List, a list of different colors.\n", + " \"\"\"\n", + " brightness = 1.0 if bright else 0.7\n", + " hsv = [(i / num, 1, brightness) for i in 
range(num)]\n", + " colors = list(map(lambda c: colorsys.hsv_to_rgb(*c), hsv))\n", + " random.shuffle(colors)\n", + " return colors\n", + "\n", + "def infer():\n", + " \"\"\"\n", + " Return Mask RCNN evaluated results.\n", + "\n", + " Returns:\n", + " - output, tensor, Mask RCNN evaluated result.\n", + " [Tensor[2,80000,5], Tensor[2,80000,1], Tensor[2,80000,1], Tensor[2,80000,28,28]]\n", + " - img, tensor, RGB image.\n", + " - img_metas, list, shape (height, width, 3).\n", + " \"\"\"\n", + " # load image\n", + " device_target = config.device_target\n", + " context.set_context(mode=context.GRAPH_MODE, device_target=device_target)\n", + "\n", + " mindrecord_dir = os.path.join(config.data_root, config.mindrecord_dir)\n", + "\n", + " prefix = \"MaskRcnn_eval.mindrecord\"\n", + "\n", + " mindrecord_file = os.path.join(mindrecord_dir, prefix)\n", + "\n", + " dataset = create_coco_dataset(mindrecord_file, batch_size=config.test_batch_size, is_training=False)\n", + "\n", + " total = dataset.get_dataset_size()\n", + " image_id = np.random.choice(total, 1)\n", + "\n", + " # load model\n", + " ckpt_path = config.checkpoint_path\n", + " net = MaskRcnnResnet50(config)\n", + " param_dict = load_checkpoint(ckpt_path)\n", + " load_param_into_net(net, param_dict)\n", + " net.set_train(False)\n", + "\n", + " data = list(dataset.create_dict_iterator(output_numpy=True, num_epochs=1))[image_id[0]]\n", + " print(\"Image ID: \", image_id[0])\n", + " img_data = data['image']\n", + " img_metas = data['image_shape']\n", + " gt_bboxes = data['box']\n", + " gt_labels = data['label']\n", + " gt_num = data['valid_num']\n", + " gt_mask = data[\"mask\"]\n", + "\n", + " img = mindrecord_to_rgb(img_data)\n", + "\n", + " start = time.time()\n", + " # run net\n", + " output = net(Tensor(img_data), Tensor(img_metas), Tensor(gt_bboxes),\n", + " Tensor(gt_labels), Tensor(gt_num), Tensor(gt_mask))\n", + " end = time.time()\n", + " print(\"Cost time of detection: {:.2f}\".format(end - start))\n", + " 
return output, img, img_metas\n", + "\n", + "def detection(output, img, img_metas):\n", + " \"\"\"Mask RCNN Detection.\n", + " Arg:\n", + " output(Tensor): evaluated results by Mask RCNN.\n", + " [Tensor[2,80000,5], Tensor[2,80000,1], Tensor[2,80000,1], Tensor[2,80000,28,28]]\n", + " img(Tensor): RGB image.\n", + " img_metas(List): image shape.\n", + " \"\"\"\n", + " # scaling ratio\n", + " ratio = img_metas[0, 2]\n", + "\n", + " # output\n", + " all_bbox = output[0][0].asnumpy()\n", + " all_label = output[1][0].asnumpy()\n", + " all_mask = output[2][0].asnumpy()\n", + "\n", + " num = 0\n", + " mask_id = -1\n", + " type_ids = []\n", + " for bool_ in all_mask:\n", + " mask_id += 1\n", + " if np.equal(bool_, True) and all_bbox[mask_id, 4] > 0.8:\n", + " type_ids.append(mask_id)\n", + " num += 1\n", + " print(\"Class Num:\", num)\n", + "\n", + " # Generate random colors\n", + " colors = random_colors(num)\n", + "\n", + " # Show area outside image boundaries.\n", + " height = config.img_height\n", + " width = config.img_width\n", + " ax = get_ax(1)\n", + " ax.set_ylim(height + 10, -10)\n", + " ax.set_xlim(-10, width + 10)\n", + " ax.axis('off')\n", + " ax.set_title(\"Precision\")\n", + "\n", + " masked_image = img.astype(np.uint32).copy()\n", + " for j in range(num):\n", + " color = colors[j]\n", + " i = type_ids[j]\n", + " # Bounding box\n", + " x1, y1, x2, y2, _ = all_bbox[i]*ratio\n", + " score = all_bbox[i, 4]\n", + "\n", + " p = patches.Rectangle((x1, y1), x2 - x1, y2 - y1, linewidth=2, alpha=0.7,\n", + " linestyle=\"dashed\", edgecolor=color, facecolor='none')\n", + " ax.add_patch(p)\n", + "\n", + " # Label\n", + " class_names = config.data_classes\n", + " class_id = all_label[i, 0].astype(np.uint8)+1\n", + " score = all_bbox[i, 4]\n", + " label = class_names[class_id]\n", + "\n", + " caption = \"{} {:.3f}\".format(label, score)\n", + " ax.text(x1, y1 + 8, caption, color='w', size=11, backgroundcolor=\"none\")\n", + "\n", + " 
ax.imshow(masked_image.astype(np.uint8))\n", + " plt.show()\n", + "\n", + "if __name__ == '__main__':\n", + " out, img_rgb, img_shape = infer()\n", + " detection(out, img_rgb, img_shape)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 参考文献\n", + "\n", + "[1] He K, Gkioxari G, Dollár P, et al. Mask r-cnn[C]//Proceedings of the IEEE international conference on computer vision. 2017: 2961-2969." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.10.6 64-bit", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.6" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/application_example/maskrcnn/src/model/roi_align.py b/application_example/maskrcnn/src/model/roi_align.py index b13ecd16d25cd3d9647dbb49616131e04854ce7b..42579a23256d47ed25b4ec112d525101eca813fd 100644 --- a/application_example/maskrcnn/src/model/roi_align.py +++ b/application_example/maskrcnn/src/model/roi_align.py @@ -1,4 +1,4 @@ -# Copyright 2022 Huawei Technologies Co., Ltd +# Copyright 2020-2021 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,6 +13,7 @@ # limitations under the License. # ============================================================================ """MaskRcnn ROIAlign module.""" + import numpy as np import mindspore.nn as nn import mindspore.common.dtype as mstype @@ -27,32 +28,18 @@ class ROIAlign(nn.Cell): Extract RoI features from mulitiple feature map. Args: - out_size_h (int): RoI height. 
- out_size_w (int): RoI width. - spatial_scale (int): RoI spatial scale. - sample_num (int): RoI sample number. Default: 0. - roi_align_mode (int): RoI align mode. Default: 1. - - Inputs: - - **features** (Tensor) - The input features, whose shape must be :math:'(N, C, H, W)'. - - **rois** (Tensor) - The shape is :math:'(rois_n, 5)'. With data type of float16 or float32. - - Outputs: - Tensor, the shape is :math: '(rois_n, C, pooled_height, pooled_width)'. - - Support Platform: - ``Ascend`` ``CPU`` ``GPU`` - - Examples: - >>> features = Tensor(np.array([[[[1., 2.], [3., 4.]]]]), mindspore.float32) - >>> rois = Tensor(np.array([[0, 0.2, 0.3, 0.2, 0.3]]), mindspore.float32) - >>> roi_align = ops.ROIAlign(2, 2, 0.5, 2) - >>> output = roi_align(features, rois) - >>> print(output) - [[[[1.775 2.025] - [2.275 2.525]]]] + out_size_h (int) - RoI height. + out_size_w (int) - RoI width. + spatial_scale (int) - RoI spatial scale. + sample_num (int) - RoI sample number. + roi_align_mode (int)- RoI align mode """ - def __init__(self, out_size_h, out_size_w, spatial_scale, sample_num=0, roi_align_mode=1): + def __init__(self, + out_size_h, + out_size_w, + spatial_scale, + sample_num=0, + roi_align_mode=1): super(ROIAlign, self).__init__() self.out_size = (out_size_h, out_size_w) @@ -68,8 +55,8 @@ class ROIAlign(nn.Cell): def __repr__(self): format_str = self.__class__.__name__ - format_str += \ - '(out_size={}, spatial_scale={}, sample_num={}'.format(self.out_size, self.spatial_scale, self.sample_num) + format_str += '(out_size={}, spatial_scale={}, sample_num={}'.format( + self.out_size, self.spatial_scale, self.sample_num) return format_str @@ -77,49 +64,37 @@ class SingleRoIExtractor(nn.Cell): """ Extract RoI features from a single level feature map. - If there are multiple input feature levels, each RoI is mapped to a level according to its scale. + If there are multiple input feature levels, each RoI is mapped to a level + according to its scale. 
Args: config (dict): Config + roi_layer (dict): Specify RoI layer type and arguments. out_channels (int): Output channels of RoI layers. featmap_strides (int): Strides of input feature maps. - batch_size (int): Batchsize. Default: 1. - finest_scale (int): Scale threshold of mapping to level 0. Default: 56. - mask (bool): Specify ROIAlign for cls or mask branch. Default: False. - - Inputs: - - **rois** (Tensor) - The shape is :math:'(rois_n, 5)'. With data type of float16 or float32. - - **feat1** (Tensor) - The input features, whose shape must be :math:'(N, C, H, W)'. - - **feat2** (Tensor) - The input features, whose shape must be :math:'(N, C, H, W)'. - - **feat3** (Tensor) - The input features, whose shape must be :math:'(N, C, H, W)'. - - **feat4** (Tensor) - The input features, whose shape must be :math:'(N, C, H, W)'. - - Outputs: - Tensor, the shape is :math:'(rois_n, C, pooled_height, pooled_width)'. - - Support Platform: - ``Ascend`` ``CPU`` ``GPU`` - - Examples: - >>> fea1 = Tensor(np.array([[[[1., 2.], [3., 4.]]]]), mindspore.float32) - >>> fea2 = Tensor(np.array([[[[1., 2.], [3., 4.]]]]), mindspore.float32) - >>> fea3 = Tensor(np.array([[[[1., 2.], [3., 4.]]]]), mindspore.float32) - >>> fea4 = Tensor(np.array([[[[1., 2.], [3., 4.]]]]), mindspore.float32) - >>> rois = Tensor(np.array([[0, 0.2, 0.3, 0.2, 0.3]]), mindspore.float32) - >>> single_roi = ops.SingleRoIExtractor(conifg, 2, 1, 2, 2, mask) - >>> output = single_roi(rois, fea1, fea2, fea3, fea4) + batch_size (int): Batchsize. + finest_scale (int): Scale threshold of mapping to level 0. 
+ mask (bool): Specify ROIAlign for cls or mask branch """ - def __init__(self, config, roi_layer, out_channels, featmap_strides, batch_size=1, finest_scale=56, mask=False): + def __init__(self, + config, + roi_layer, + out_channels, + featmap_strides, + batch_size=1, + finest_scale=56, + mask=False): super(SingleRoIExtractor, self).__init__() cfg = config self.train_batch_size = batch_size self.out_channels = out_channels self.featmap_strides = featmap_strides self.num_levels = len(self.featmap_strides) - self.out_size = roi_layer.mask_out_size if mask else roi_layer.out_size + + self.out_size = config.roi_layer.mask_out_size if mask else config.roi_layer.out_size self.mask = mask - self.sample_num = roi_layer.sample_num + self.sample_num = config.roi_layer.sample_num self.roi_layers = self.build_roi_layers(self.featmap_strides) self.roi_layers = L.CellList(self.roi_layers) @@ -132,9 +107,9 @@ class SingleRoIExtractor(nn.Cell): self.equal = P.Equal() self.select = P.Select() - in_mode_16 = False - self.dtype = np.float16 if in_mode_16 else np.float32 - self.ms_dtype = mstype.float16 if in_mode_16 else mstype.float32 +# _mode_16 = False + self.dtype = np.float32 #np.float16 if _mode_16 else np.float32 + self.ms_dtype = mstype.float32 #mstype.float16 if _mode_16 else mstype.float32 self.set_train_local(cfg, training=True) def set_train_local(self, config, training=True): @@ -143,24 +118,43 @@ class SingleRoIExtractor(nn.Cell): cfg = config # Init tensor - roi_sample_num = cfg.num_expected_pos_stage2 if self.mask else cfg.roi_sample_num - self.batch_size = roi_sample_num if self.training_local else cfg.rpn_max_num + roi_sample_num = \ + cfg.num_expected_pos_stage2 if self.mask else cfg.roi_sample_num + self.batch_size = \ + roi_sample_num if self.training_local else cfg.rpn_max_num self.batch_size = self.train_batch_size*self.batch_size \ if self.training_local else cfg.test_batch_size*self.batch_size - self.ones = Tensor(np.array(np.ones((self.batch_size, 1)), 
dtype=self.dtype)) - finest_scale = np.array(np.ones((self.batch_size, 1)), dtype=self.dtype) * self.finest_scale_ + self.ones = \ + Tensor(np.array(np.ones((self.batch_size, 1)), dtype=self.dtype)) + finest_scale = \ + np.array(np.ones((self.batch_size, 1)), + dtype=self.dtype) * self.finest_scale_ self.finest_scale = Tensor(finest_scale) - self.epslion = Tensor(np.array(np.ones((self.batch_size, 1)), dtype=self.dtype)*self.dtype(1e-6)) - self.zeros = Tensor(np.array(np.zeros((self.batch_size, 1)), dtype=np.int32)) - self.max_levels = Tensor(np.array(np.ones((self.batch_size, 1)), dtype=np.int32)*(self.num_levels-1)) - self.twos = Tensor(np.array(np.ones((self.batch_size, 1)), dtype=self.dtype) * 2) - self.res_ = Tensor(np.array(np.zeros((self.batch_size, self.out_channels, self.out_size, self.out_size)), - dtype=self.dtype)) + self.epslion = \ + Tensor(np.array(np.ones((self.batch_size, 1)), + dtype=self.dtype)*self.dtype(1e-6)) + self.zeros = \ + Tensor(np.array(np.zeros((self.batch_size, 1)), + dtype=np.int32)) + self.max_levels = \ + Tensor(np.array(np.ones((self.batch_size, 1)), + dtype=np.int32)*(self.num_levels-1)) + self.twos = \ + Tensor(np.array(np.ones((self.batch_size, 1)), + dtype=self.dtype) * 2) + self.res_ = \ + Tensor(np.array(np.zeros((self.batch_size, self.out_channels, + self.out_size, self.out_size)), + dtype=self.dtype)) def num_inputs(self): """input number.""" return len(self.featmap_strides) + def init_weights(self): + """initialize weights.""" + pass + def log2(self, value): """calculate log2.""" return self.log(value) / self.log(self.twos) @@ -169,8 +163,10 @@ class SingleRoIExtractor(nn.Cell): """build ROI layers.""" roi_layers = [] for s in featmap_strides: - layer_cls = ROIAlign(self.out_size, self.out_size, spatial_scale=1 / s, - sample_num=self.sample_num, roi_align_mode=0) + layer_cls = ROIAlign(self.out_size, self.out_size, + spatial_scale=1 / s, + sample_num=self.sample_num, + roi_align_mode=0) roi_layers.append(layer_cls) return 
roi_layers @@ -187,7 +183,7 @@ class SingleRoIExtractor(nn.Cell): num_levels (int): Total level number. Returns: - Tensor, Level index (0-based) of each RoI, shape (k, ) + Tensor: Level index (0-based) of each RoI, shape (k, ) """ scale = self.sqrt(rois[::, 3:4:1] - rois[::, 1:2:1] + self.ones) * \ self.sqrt(rois[::, 4:5:1] - rois[::, 2:3:1] + self.ones) @@ -208,8 +204,9 @@ class SingleRoIExtractor(nn.Cell): mask = self.equal(target_lvls, P.ScalarToArray()(i)) mask = P.Reshape()(mask, (-1, 1, 1, 1)) roi_feats_t = self.roi_layers[i](feats[i], rois) - mask = \ - self.cast(P.Tile()(self.cast(mask, mstype.int32), (1, 256, self.out_size, self.out_size)), mstype.bool_) + mask = self.cast(P.Tile()(self.cast(mask, mstype.int32), + (1, 256, self.out_size, self.out_size)), + mstype.bool_) res = self.select(mask, roi_feats_t, res) return res diff --git a/application_example/maskrcnn/src/train.py b/application_example/maskrcnn/src/train.py index beead122f1a2b02e4014953b61ecb83b9baef9df..64bcb315246a2f26b27eff3c70b490d76df6094d 100644 --- a/application_example/maskrcnn/src/train.py +++ b/application_example/maskrcnn/src/train.py @@ -25,12 +25,12 @@ from mindspore.train.serialization import load_checkpoint, load_param_into_net from mindspore.nn import Momentum from mindspore.common import set_seed +from utils.config import config # when use maskrcnn mobilenetv1, just change the following backbone and defined network # from mask_rcnn_mobilenetv1 and network_define_maskrcnnmobilenetv1 from model.mask_rcnn_r50 import MaskRcnnResnet50 from utils.network_define import LossCallBack, WithLossCell, TrainOneStepCell, LossNet from utils.lr_schedule import dynamic_lr -from utils.config import config from dataset.dataset import create_coco_dataset, data_to_mindrecord_byte_image diff --git a/application_example/maskrcnn/src/utils/__init__.py b/application_example/maskrcnn/src/utils/__init__.py deleted file mode 100644 index 
e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/application_example/maskrcnn/src/utils/config.py b/application_example/maskrcnn/src/utils/config.py index e2daf39f5aefd94eddb93b768e11a2a29474d9d2..57f67d9508ca8a51a8d7844f7c95397a814c1139 100644 --- a/application_example/maskrcnn/src/utils/config.py +++ b/application_example/maskrcnn/src/utils/config.py @@ -40,13 +40,12 @@ def parse_args(): help="File path of dataset in training.") # MaskRcnn training - parser.add_argument('--only_create_dataset', default=False, type=ast.literal_eval, - help="Whether to create dataset.") + parser.add_argument('--only_create_dataset', default=False, type=ast.literal_eval, help="Whether to create dataset.") parser.add_argument('--run_distribute', default=False, type=ast.literal_eval, help="Whether to run distribute.") parser.add_argument('--do_train', default=True, type=ast.literal_eval, help="Whether to do train.") parser.add_argument('--do_eval', default=False, type=ast.literal_eval, help="Whether to do eval.") parser.add_argument('--dataset', default='coco', type=str, help="Dataset name") - parser.add_argument('--pre_trained', default='../../maskrcnnr5/checkpoint/resnet50.ckpt', + parser.add_argument('--pre_trained', default='../../maskrcnnr5/checkpoint/resnet50_ascend_v180_imagenet2012_official_cv_top1acc76.97_top5acc93.44.ckpt', type=str, help="File path of pretrained checkpoint in training.") parser.add_argument('--device_id', default=0, type=int, help="Target device id.") parser.add_argument('--device_num', default=1, type=int, help="Target device number.") @@ -80,8 +79,7 @@ def parse_args(): parser.add_argument('--img_width', default=1280, type=int, help="The input image width.") parser.add_argument('--img_height', default=768, type=int, help="The input image height.") - parser.add_argument('--keep_ratio', default=True, type=ast.literal_eval, - help="Whether to keep the same image scaling ratio.") + 
parser.add_argument('--keep_ratio', default=True, type=ast.literal_eval, help="Whether to keep the same image scaling ratio.") parser.add_argument('--flip_ratio', default=0.5, type=float, help="The flip ratio.") parser.add_argument('--expand_ratio', default=1.0, type=float, help="The expand ratio.") @@ -130,8 +128,7 @@ def parse_args(): # proposal parser.add_argument('--activate_num_classes', default=256, type=int, help="The activate number of classes.") - parser.add_argument('--use_sigmoid_cls', default=True, type=ast.literal_eval, - help="Whether to use sigmoid for classification.") + parser.add_argument('--use_sigmoid_cls', default=True, type=ast.literal_eval, help="Whether to use sigmoid for classification.") # roi_align parser.add_argument('--roi_layer', default=ed(type='RoIAlign', out_size=7, mask_out_size=14, sample_num=2), diff --git a/application_example/maskrcnn/src/utils/network_define.py b/application_example/maskrcnn/src/utils/network_define.py index fc398688f647235ec760cc68d457732435c85a8e..4a7ebe473f36a03eeb22b730ee968f02b42269d9 100644 --- a/application_example/maskrcnn/src/utils/network_define.py +++ b/application_example/maskrcnn/src/utils/network_define.py @@ -156,7 +156,8 @@ class WithLossCell(nn.Cell): self._loss_fn = loss_fn def construct(self, x, img_shape, gt_bboxe, gt_label, gt_num, gt_mask): - loss1, loss2, _, _, _, _, _ = self._backbone(x, img_shape, gt_bboxe, gt_label, gt_num, gt_mask) + loss1, loss2, loss3, loss4, loss5, loss6, loss7 = \ + self._backbone(x, img_shape, gt_bboxe, gt_label, gt_num, gt_mask) return self._loss_fn(loss1, loss2) @property