diff --git a/application_example/vipnas/README.md b/application_example/vipnas/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..6bb7c7b7514b4a84487766ef9c13054611b555fc
--- /dev/null
+++ b/application_example/vipnas/README.md
@@ -0,0 +1,106 @@
+# ViPNAS
+
+***
+
+Human pose estimation has achieved significant progress in recent years. However, most recent methods focus on improving accuracy with complicated models while ignoring real-time efficiency. To achieve a better trade-off between accuracy and efficiency, a novel neural architecture search (NAS) method, termed ViPNAS, is proposed to search networks at both the spatial and temporal levels for fast online video pose estimation. The discovered models, S-ViPNAS and T-ViPNAS, achieve significantly higher inference speed (real-time on CPU) without sacrificing accuracy compared with previous state-of-the-art methods.
+
+[Paper](https://arxiv.org/pdf/2105.10154.pdf): Xu, L., et al. "ViPNAS: Efficient Video Pose Estimation via Neural Architecture Search." 2021. doi:10.48550/arXiv.2105.10154.
+
+## Prepare datasets
+
+ViPNAS uses the [COCO dataset](https://cocodataset.org/#download) for training and testing. The 2017 Train/Val images and annotations are needed for COCO keypoint training and validation. HRNet-Human-Pose-Estimation provides person detection results on COCO val2017 to reproduce the multi-person pose estimation results; please download them from [OneDrive Download](https://onedrive.live.com/?authkey=%21ANejPkF4WXyxYz4&id=56B9F9C97F261712%2110160&cid=56B9F9C97F261712). Arrange the files as follows:
+
+```text
+coco/coco2017
+│-- annotations
+│   │-- person_keypoints_train2017.json
+│   │-- person_keypoints_val2017.json
+│-- person_detection_results
+│   │-- COCO_val2017_detections_AP_H_56_person.json
+│-- train2017
+│   │-- 000000000009.jpg
+│   │-- 000000000025.jpg
+│   │-- 000000000030.jpg
+│   │-- ...
+`-- val2017
+    │-- 000000000139.jpg
+    │-- 000000000285.jpg
+    │-- 000000000632.jpg
+    │-- ...
+```
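+
+If the COCO archives are not already on disk, the snippet below is a minimal sketch of one way to fetch and unpack them into the layout above (the image and annotation URLs are the official COCO download links; the detection-result JSON still has to be downloaded manually from the OneDrive link):
+
+```shell
+mkdir -p coco/coco2017 && cd coco/coco2017
+wget http://images.cocodataset.org/zips/train2017.zip
+wget http://images.cocodataset.org/zips/val2017.zip
+wget http://images.cocodataset.org/annotations/annotations_trainval2017.zip
+unzip train2017.zip && unzip val2017.zip && unzip annotations_trainval2017.zip
+# place COCO_val2017_detections_AP_H_56_person.json (from OneDrive) here:
+mkdir -p person_detection_results
+```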
+
+## Train
+
+```shell
+ nohup mpirun -n {GPU_NUMS} python train.py &
+```
+
+## Infer
+
+```shell
+ python eval.py
+```
+
+## Examples
+
+***
+
+### Train
+
+- The following command uses 6 GPUs for training. Output is saved in loss.log.
+
+  ```shell
+  nohup mpirun -n 6 python train.py &
+  ```
+
+  output:
+
+  ```text
+  epoch:0, step:50, loss:0.002301577078178525
+  epoch:0, step:50, loss:0.002285936907865107
+  epoch:0, step:50, loss:0.0023103518085554244
+  epoch:0, step:50, loss:0.0023135159723460673
+  epoch:0, step:50, loss:0.002274846043437719
+  epoch:0, step:50, loss:0.0023004490323364733
+  epoch:0, step:100, loss:0.002259711856022477
+  epoch:0, step:100, loss:0.0022529929876327513
+  epoch:0, step:100, loss:0.0022121677175164223
+  epoch:0, step:100, loss:0.0022342068213038148
+  epoch:0, step:100, loss:0.002201517731882632
+  epoch:0, step:100, loss:0.002213634729851037
+  ...
+  ```
+
+### Infer
+
+- The following command runs inference and evaluates on COCO val2017.
+
+  ```shell
+  python eval.py
+  ```
+
+  output:
+
+  ```text
+  Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets= 20 ] = 0.711
+  Average Precision (AP) @[ IoU=0.50 | area= all | maxDets= 20 ] = 0.893
+  Average Precision (AP) @[ IoU=0.75 | area= all | maxDets= 20 ] = 0.790
+  Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets= 20 ] = 0.678
+  Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets= 20 ] = 0.777
+  Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 20 ] = 0.769
+  Average Recall (AR) @[ IoU=0.50 | area= all | maxDets= 20 ] = 0.934
+  Average Recall (AR) @[ IoU=0.75 | area= all | maxDets= 20 ] = 0.839
+  Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets= 20 ] = 0.727
+  Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets= 20 ] = 0.829
+  AP:0.7110389643479526
+  AP (L):0.7770362190493157
+  AP (M):0.678388706244732
+  AP .5:0.8927431916469015
+  AP .75:0.7896810636572867
+  AR:0.7689389168765743
+  AR (L):0.8293942772203641
+  AR (M):0.7273149412728763
+  AR .5:0.9338790931989924
+  AR .75:0.8389483627204031
+  ```
+
diff --git a/application_example/vipnas/ViPNAS.ipynb b/application_example/vipnas/ViPNAS.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..3d59670c8eeac34a61de544f2028095e6dc4b923
--- /dev/null
+++ b/application_example/vipnas/ViPNAS.ipynb
@@ -0,0 +1,322 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "ceef9d74",
+   "metadata": {},
+   "source": [
+    "1. Prepare the dataset"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "cc923262",
+   "metadata": {},
+   "source": [
+    "ViPNAS uses the COCO dataset for training and testing. The COCO 2017 Train/Val split is needed for COCO keypoint training and validation. HRNet-Human-Pose-Estimation provides person detection results on COCO val2017, which are used to obtain the multi-person pose estimation results. The dataset is laid out in the following directory structure:"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "68dd2970",
+   "metadata": {},
+   "source": [
+    "coco/coco2017\n",
+    "│-- annotations\n",
+    "│   │-- person_keypoints_train2017.json\n",
+    "│   │-- person_keypoints_val2017.json\n",
+    "│-- person_detection_results\n",
+    "│   │-- COCO_val2017_detections_AP_H_56_person.json\n",
+    "│-- train2017\n",
+    "│   │-- 000000000009.jpg\n",
+    "│   │-- 000000000025.jpg\n",
+    "│   │-- 000000000030.jpg\n",
+    "│   │-- ...\n",
+    "│-- val2017\n",
+    "    │-- 000000000139.jpg\n",
+    "    │-- 000000000285.jpg\n",
+    "    │-- 000000000632.jpg\n",
+    "    │-- ..."
+   ]
+  },
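+  {
+   "cell_type": "markdown",
+   "id": "0f3a9d21",
+   "metadata": {},
+   "source": [
+    "Optional sanity check (a sketch that assumes pycocotools is installed): load the keypoint annotation files from the layout above and count the images, to confirm the paths are correct before training."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0f3a9d22",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Hypothetical helper, not part of the original pipeline: verify the COCO annotations are readable.\n",
+    "from pycocotools.coco import COCO\n",
+    "\n",
+    "train_ann = COCO('coco/coco2017/annotations/person_keypoints_train2017.json')\n",
+    "val_ann = COCO('coco/coco2017/annotations/person_keypoints_val2017.json')\n",
+    "print(len(train_ann.getImgIds()), 'train images /', len(val_ann.getImgIds()), 'val images')"
+   ]
+  },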
+ ] + }, + { + "cell_type": "markdown", + "id": "c480c472", + "metadata": {}, + "source": [ + "2.训练脚本" + ] + }, + { + "cell_type": "markdown", + "id": "cb1ab210", + "metadata": {}, + "source": [ + "训练使用GPU来完成,采用静态图模式,使用混合精度模型。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9088152c", + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "\n", + "import mindspore\n", + "import mindspore.dataset as ds\n", + "import mindspore.nn as nn\n", + "from mindspore import save_checkpoint\n", + "\n", + "from src.utils.loss import JointsMSELoss, CustomWithLossCell\n", + "from src.model.top_down import create_net\n", + "import src.process_dataset.vipnas_image_load as ld\n", + "\n", + "\n", + "mindspore.set_context(mode=mindspore.GRAPH_MODE, device_target=\"GPU\")\n", + "\n", + "channel_cfg = dict(\n", + " num_output_channels=17,\n", + " dataset_joints=17,\n", + " dataset_channel=[\n", + " [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],\n", + " ],\n", + " inference_channel=[\n", + " 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16\n", + " ])\n", + "\n", + "# model settings\n", + "data_cfg = dict(\n", + " image_size=[192, 256],\n", + " heatmap_size=[48, 64],\n", + " num_output_channels=channel_cfg['num_output_channels'],\n", + " num_joints=channel_cfg['dataset_joints'],\n", + " dataset_channel=channel_cfg['dataset_channel'],\n", + " inference_channel=channel_cfg['inference_channel'],\n", + " soft_nms=False,\n", + " nms_thr=1.0,\n", + " oks_thr=0.9,\n", + " vis_thr=0.2,\n", + " use_gt_bbox=False,\n", + " det_bbox_thr=0.0,\n", + " bbox_file='coco/coco2017/person_detection_results/'\n", + " 'COCO_val2017_detections_AP_H_56_person.json',\n", + ")\n", + "\n", + "train_ds = ld.TopDownCocoDataset(\n", + " ann_file='coco/coco2017/annotations/person_keypoints_train2017.json',\n", + " img_prefix='coco/coco2017/train2017/',\n", + " pipeline=[ld.LoadImageFromFile(),\n", + " ld.TopDownRandomFlip(flip_prob=0.5),\n", + " ld.TopDownHalfBodyTransform(num_joints_half_body=8,\n", + " prob_half_body=0.3),\n", + " ld.TopDownGetRandomScaleRotation(rot_factor=30,\n", + " scale_factor=0.25),\n", + " ld.TopDownAffine(),\n", + " ld.ToTensor(),\n", + " ld.NormalizeTensor(mean=[0.485, 0.456, 0.406],\n", + " std=[0.229, 0.224, 0.225]),\n", + " ld.TopDownGenerateTarget(sigma=2),\n", + " ld.Collect(keys=['img', 'target', 'target_weight'],\n", + " meta_keys=['image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',\n", + " 'rotation', 'bbox_score', 'flip_pairs'])\n", + " ],\n", + " data_cfg=data_cfg,\n", + " test_mode=False\n", + " )\n", + "\n", + "dataset = ds.GeneratorDataset(train_ds, [\"img\", \"target\", \"target_weight\"])\n", + "\n", + "train_loaders = dataset.batch(64)\n", + "\n", + "network = create_net(backbone='ViPNAS_ResNet')\n", + "net_opt = nn.Adam(network.trainable_params(), learning_rate=5e-4)\n", + "loss = JointsMSELoss()\n", + "\n", + "loss_net = CustomWithLossCell(network, loss)\n", + "\n", + "manager = nn.DynamicLossScaleUpdateCell(loss_scale_value=2**24, scale_factor=2, scale_window=1000)\n", + "train_model = nn.TrainOneStepWithLossScaleCell(loss_net, net_opt, scale_sense=manager)\n", + "train_model.set_train()\n", + "\n", + "filename = 'loss.log'\n", + "logger = logging.getLogger(filename)\n", + "logger.setLevel(logging.INFO)\n", + "fmt = logging.Formatter('%(message)s')\n", + "file_handler = logging.handlers.RotatingFileHandler(\n", + " filename=filename, maxBytes=1*1024*1024*1024, backupCount=1, encoding='utf-8')\n", + 
"file_handler.setFormatter(fmt)\n", + "logger.addHandler(file_handler)\n", + "\n", + "step = 0\n", + "epochs = 210\n", + "steps = train_loaders.get_dataset_size()\n", + "for epoch in range(epochs):\n", + " step_loss = 0\n", + " step = 0\n", + " for d in train_loaders.create_dict_iterator():\n", + " step_loss += train_model(d['img'], d['target'], d['target_weight'])[0].asnumpy()\n", + " step = step + 1\n", + " if step % 50 == 0:\n", + " loss = step_loss / 50\n", + " logger.info(\"epoch:%s, step:%s, loss:%s\", epoch, step, loss)\n", + " step_loss = 0\n", + " if mindspore.get_context(\"device_target\") == 2:\n", + " file_name = \"./checkpoints\" + str(epoch) + \".ckpt\"\n", + " save_checkpoint(save_obj=train_model, ckpt_file_name=file_name)" + ] + }, + { + "cell_type": "markdown", + "id": "344bcc0f", + "metadata": {}, + "source": [ + "训练过程将存入loss.log中。" + ] + }, + { + "cell_type": "markdown", + "id": "ab72bc3f", + "metadata": {}, + "source": [ + "3.推理脚本" + ] + }, + { + "cell_type": "markdown", + "id": "b40cdfdf", + "metadata": {}, + "source": [ + "将需要的checkpoint文件导入模型中" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1c981079", + "metadata": {}, + "outputs": [], + "source": [ + "import mindspore\n", + "from mindspore import Tensor, load_param_into_net, load_checkpoint\n", + "import mindspore.ops as ops\n", + "\n", + "import src.process_dataset.vipnas_image_load as ld\n", + "from src.model.top_down import create_net\n", + "\n", + "\n", + "mindspore.context.set_context(device_target='GPU', device_id=0)\n", + "network = create_net(backbone='ViPNAS_ResNet')\n", + "\n", + "param_dict = load_checkpoint(\"checkpoints205.ckpt\")\n", + "para = load_param_into_net(network, param_dict)" + ] + }, + { + "cell_type": "markdown", + "id": "dd2ed00b", + "metadata": {}, + "source": [ + "进行推理,获得输出结果" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c1179590", + "metadata": {}, + "outputs": [], + "source": [ + "channel_cfg = dict(\n", + " num_output_channels=17,\n", + " dataset_joints=17,\n", + " dataset_channel=[\n", + " [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],\n", + " ],\n", + " inference_channel=[\n", + " 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16\n", + " ])\n", + "data_cfg = dict(\n", + " image_size=[192, 256],\n", + " heatmap_size=[48, 64],\n", + " num_output_channels=channel_cfg['num_output_channels'],\n", + " num_joints=channel_cfg['dataset_joints'],\n", + " dataset_channel=channel_cfg['dataset_channel'],\n", + " inference_channel=channel_cfg['inference_channel'],\n", + " soft_nms=False,\n", + " nms_thr=1.0,\n", + " oks_thr=0.9,\n", + " vis_thr=0.2,\n", + " use_gt_bbox=False,\n", + " det_bbox_thr=0.0,\n", + " bbox_file='coco/coco2017/person_detection_results/'\n", + " 'COCO_val2017_detections_AP_H_56_person.json',\n", + ")\n", + "d_s = ld.TopDownCocoDataset(\n", + " ann_file='coco/coco2017/annotations/person_keypoints_val2017.json',\n", + " img_prefix='coco/coco2017/val2017/',\n", + " pipeline=[ld.LoadImageFromFile(),\n", + " ld.TopDownAffine(),\n", + " ld.ToTensor(),\n", + " ld.NormalizeTensor(mean=[0.485, 0.456, 0.406],\n", + " std=[0.229, 0.224, 0.225]),\n", + " ld.Collect(keys=['img'],\n", + " meta_keys=['image_file', 'center', 'scale',\n", + " 'rotation', 'bbox_score', 'flip_pairs'])\n", + " ],\n", + " data_cfg=data_cfg,\n", + " test_mode=True\n", + " )\n", + "\n", + "output = []\n", + "expand_dims = ops.ExpandDims()\n", + "op = ops.Concat()\n", + "i = 0\n", + "while i < len(d_s.db):\n", + " if i % 32 == 0:\n", + " if i 
+    "            ds_output = network.construct(img=img,\n",
+    "                                          img_metas=img_metas,\n",
+    "                                          return_loss=False)\n",
+    "\n",
+    "            output.append(ds_output)\n",
+    "        img = None\n",
+    "        img_metas = []\n",
+    "    ds = d_s[i]\n",
+    "    img_expand = expand_dims(Tensor(ds['img'], mindspore.float32), 0)\n",
+    "    if img is None:\n",
+    "        img = img_expand\n",
+    "    else:\n",
+    "        img = op((img, img_expand))\n",
+    "    img_metas.append(ds['img_metas'])\n",
+    "    i += 1\n",
+    "\n",
+    "# run the final, possibly partial, batch accumulated by the loop above\n",
+    "if img is not None:\n",
+    "    ds_output = network.construct(img=img,\n",
+    "                                  img_metas=img_metas,\n",
+    "                                  return_loss=False)\n",
+    "    output.append(ds_output)\n",
+    "\n",
+    "results = d_s.evaluate(output, 'result/')\n",
+    "for k, v in sorted(results.items()):\n",
+    "    print(f'{k}:{v}')"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}