
2025-01-05 23:13:54

NCCL Source Code Analysis 4: Channel Search

——lvyilong316
The previous article computed the optimal paths from a GPU node to every other node. This one looks at how NCCL searches for channels.

In NCCL, a channel represents a communication "channel". Note that a channel is not the same thing as a path. For example, between A and B there may be two paths, A-B and A-C-B, yet A and B can also have multiple channels that all ride on a single path. It is a bit like making phone calls: underneath there is one phone line (the path), but every conversation between two people is its own channel.

Channels exist to make better use of bandwidth and NICs, and to let the same block of data be transferred concurrently over several channels. As we will see later, each channel maps to one GPU stream and to one block of the launched kernel, which makes the picture clearer. For these reasons NCCL uses multiple channels, and the search process described below produces a set of them.
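To make the path-versus-channel distinction concrete, here is a toy sketch. These structs and names are invented for illustration and are not NCCL's real data structures: several logical channels ride on one physical path, each taking a slice of its bandwidth.

// Toy model for illustration only -- not NCCL's actual types.
struct ToyPath {            // one physical route between two GPUs (the "phone line")
  float bw;                 // total bandwidth of the route, e.g. 240 GB/s
};
struct ToyChannel {         // one logical connection riding on that route (a "call")
  struct ToyPath* path;     // the shared underlying path
  float bw;                 // bandwidth this channel consumes, e.g. 40 GB/s
};
// A 240 GB/s path can host 240/40 = 6 such 40 GB/s channels concurrently.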

Now back to initTransportsRank to follow the channel-search logic. First it determines whether CollNet and NVLS are supported, then calls ncclTopoCompute to build the RING, TREE, and other topology graphs, dumping them to a file if requested. For background on NVLink SHARP (NVLS), see the NVLS issue.


// Determine local CollNet support
if (collNetSupport(comm)) {
  const char *collNetEnable = ncclGetEnv("NCCL_COLLNET_ENABLE");
  if (collNetEnable != NULL) {
    INFO(NCCL_ALL, "NCCL_COLLNET_ENABLE set by environment to %s.", collNetEnable);
    if (strcmp(collNetEnable, "1") == 0) {
      comm->collNetSupport = 1;
    }
  }
}

// Determine local Nvls support
NCCLCHECK(ncclNvlsInit(comm));

timers[TIMER_INIT_GRAPHS] = clockNano();
// Get rings and trees
memset(ringGraph, 0, sizeof(struct ncclTopoGraph));
ringGraph->id = 0;
ringGraph->pattern = NCCL_TOPO_PATTERN_RING;
ringGraph->minChannels = 1;
ringGraph->maxChannels = MAXCHANNELS/2;
NCCLCHECKGOTO(ncclTopoCompute(comm->topo, ringGraph), ret, fail);
NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, ringGraph), ret, fail);

memset(treeGraph, 0, sizeof(struct ncclTopoGraph));
treeGraph->id = 1;
treeGraph->pattern = NCCL_TOPO_PATTERN_BALANCED_TREE;
treeGraph->minChannels = ringGraph->nChannels;
treeGraph->maxChannels = ringGraph->nChannels;
NCCLCHECKGOTO(ncclTopoCompute(comm->topo, treeGraph), ret, fail);
NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, treeGraph), ret, fail);

memset(collNetChainGraph, 0, sizeof(struct ncclTopoGraph));
collNetChainGraph->id = 2;
collNetChainGraph->pattern = NCCL_TOPO_PATTERN_TREE;
collNetChainGraph->collNet = 1;
collNetChainGraph->minChannels = ringGraph->nChannels;
collNetChainGraph->maxChannels = ringGraph->nChannels;

memset(collNetDirectGraph, 0, sizeof(struct ncclTopoGraph));
collNetDirectGraph->id = 2;
collNetDirectGraph->pattern = NCCL_TOPO_PATTERN_COLLNET_DIRECT;
collNetDirectGraph->collNet = 1;
collNetDirectGraph->minChannels = 1;
collNetDirectGraph->maxChannels = MAXCHANNELS;
if (comm->collNetSupport) {
  NCCLCHECKGOTO(ncclTopoCompute(comm->topo, collNetChainGraph), ret, fail);
  NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, collNetChainGraph), ret, fail);
  NCCLCHECKGOTO(ncclTopoCompute(comm->topo, collNetDirectGraph), ret, fail);
  NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, collNetDirectGraph), ret, fail);
}

memset(nvlsGraph, 0, sizeof(struct ncclTopoGraph));
nvlsGraph->id = 3;
nvlsGraph->pattern = NCCL_TOPO_PATTERN_NVLS;
nvlsGraph->minChannels = 1;
nvlsGraph->maxChannels = MAXCHANNELS;
if (comm->nvlsSupport) {
  NCCLCHECKGOTO(ncclTopoCompute(comm->topo, nvlsGraph), ret, fail);
  NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, nvlsGraph), ret, fail);
}
timers[TIMER_INIT_GRAPHS] = clockNano() - timers[TIMER_INIT_GRAPHS];

// Initialize num P2P LL buffers for this communicator
comm->allocP2pNetLLBuffers = ncclParamAllocP2pNetLLBuffers() == 1;

if (comm->rank == ncclParamGraphDumpFileRank()) {
  struct ncclTopoGraph* dumpGraphs[5] = { ringGraph, treeGraph, collNetDirectGraph, collNetChainGraph, nvlsGraph };
  NCCLCHECKGOTO(ncclTopoDumpGraphs(comm->topo, 5, dumpGraphs), ret, fail);
}

Now let's look at the ncclTopoCompute function in detail. It is the core of channel search: based on the graph->pattern passed in, it builds the channel information for the Tree, Ring, and other topologies. For simplicity, the analysis below uses Ring (NCCL_TOPO_PATTERN_RING) as the example.

Running ncclTopoCompute is the channel search itself. The goal is to find as many channels as possible, each with as much bandwidth as possible. It is essentially a brute-force search: set up a strict list of conditions and search for an answer; if nothing is found, relax the conditions and search again. Once a solution is found, the conditions are tightened again to see whether a better one exists.
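Before reading the real function, here is a heavily simplified sketch of that two-pass control flow. All types and helpers in it (Graph, set_strictest_conditions, search, optimal, relax_one_condition, try_higher_speed) are invented for this sketch; only the shape of the loop mirrors the code below.

// Heavily simplified sketch of ncclTopoCompute's two-pass control flow.
typedef struct { int nChannels; float bw; /* ...search conditions... */ } Graph;

void set_strictest_conditions(Graph*);      // hypothetical helpers, for illustration
void search(Graph* conditions, Graph* best);
int  optimal(const Graph*);
int  relax_one_condition(Graph*);
int  try_higher_speed(Graph*);

void channel_search(Graph* best) {
  Graph tmp;
  set_strictest_conditions(&tmp);           // PATH_NVL intra, PATH_PIX inter, top speed
  int pass = 1;
  for (;;) {
    search(&tmp, best);                     // one call to ncclTopoSearchRec
    if (optimal(best)) return;              // perfect result, or total bandwidth reached
    if (pass == 1) {
      // relax sameChannels, pattern, typeIntra, typeInter, speed, crossNic in turn
      if (relax_one_condition(&tmp)) continue;
      tmp = *best;                          // nothing left to relax: lock in the best found
      pass = 2;
    } else {
      if (try_higher_speed(&tmp)) continue; // move speedIndex back toward faster values
      return;
    }
  }
}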


ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph) {
  int ngpus = system->nodes[GPU].count;
  int crossNic = (system->nodes[NET].count > 1) &&
    (graph->pattern == NCCL_TOPO_PATTERN_RING ||
     graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE ||
     graph->pattern == NCCL_TOPO_PATTERN_SPLIT_TREE) ? ncclParamCrossNic() : 0;
  // In a multi-node setup system->nodes[NET].count > 1, so crossNic takes the value of
  // ncclParamCrossNic(); if that value is 2 rather than 1, graph->crossNic still starts at 0 here.
  graph->crossNic = crossNic == 1 ? 1 : 0;
  graph->bwIntra = graph->bwInter = 0;
  graph->latencyInter = 0;
  graph->typeIntra = ngpus == 1 ? PATH_LOC : PATH_NVL;
  graph->typeInter = PATH_PIX;
  graph->nChannels = 0;
  int trySameChannels = graph->pattern == NCCL_TOPO_PATTERN_NVLS ? 0 : 1;
  graph->sameChannels = trySameChannels;

  int cpuArch, cpuVendor, cpuModel;
  NCCLCHECK(ncclTopoCpuType(system, &cpuArch, &cpuVendor, &cpuModel));

  const char* str = ncclGetEnv("NCCL_GRAPH_FILE");
  if (str) {
    INFO(NCCL_ENV, "NCCL_GRAPH_FILE set by environment to %s", str);
    struct ncclXml* xml;
    NCCLCHECK(xmlAlloc(&xml, NCCL_GRAPH_XML_MAX_NODES));
    NCCLCHECK(ncclTopoGetXmlGraphFromFile(str, xml));
    int nChannels;
    NCCLCHECK(ncclTopoGetGraphFromXml(xml->nodes, system, graph, &nChannels));
    INFO(NCCL_GRAPH, "Search %d : %d channels loaded from XML graph", graph->id, nChannels);
    free(xml);
    if (graph->nChannels > 0) return ncclSuccess;
  }

  int ccMin;
  NCCLCHECK(ncclTopoGetCompCap(system, &ccMin, NULL));
  if (graph->pattern == NCCL_TOPO_PATTERN_NVLS && (system->nodes[NVS].count == 0 || ccMin < 90)) return ncclSuccess;
  // NVLS and COLLNET_DIRECT search must have ngpus heads at most.
  if (graph->pattern == NCCL_TOPO_PATTERN_NVLS || graph->pattern == NCCL_TOPO_PATTERN_COLLNET_DIRECT)
    graph->maxChannels = system->nodes[GPU].count;

  if (ngpus == 1) if (graph->pattern != NCCL_TOPO_PATTERN_RING) graph->pattern = NCCL_TOPO_PATTERN_TREE;

  if (system->nodes[NET].count == 0 && graph->pattern == NCCL_TOPO_PATTERN_NVLS) {
    // Force intra-node NVLS algorithm to pull evenly from all GPUs.
    graph->minChannels = graph->maxChannels = system->nodes[GPU].count;
  }

  struct ncclTopoGraph tmpGraph;
  memcpy(&tmpGraph, graph, sizeof(struct ncclTopoGraph));

  // First try crossnic, then decrease bw and finally increase bwIntra.
  int nspeeds = 0;
  float* speedArray = NULL;
  if (system->nodes[NET].count == 0) {
    nspeeds = ccMin >= 90 ? NSPEEDSINTRA_SM90 : NSPEEDSINTRA;
    speedArray = ccMin >= 90 ? sm90SpeedArrayIntra : speedArrayIntra;
  } else {
    nspeeds = ccMin >= 90 ? NSPEEDSINTER_SM90 : NSPEEDSINTER;
    speedArray = ccMin >= 90 ? sm90SpeedArrayInter : speedArrayInter;
  }
  int pass = 1;
  int speedIndex = 0;
  float maxBw = system->maxBw;
  float totalBw = system->totalBw;
  if (ngpus > 1 && graph->pattern != NCCL_TOPO_PATTERN_RING) totalBw *= ngpus*1.0/(ngpus-1);
  while ((speedArray[speedIndex] > maxBw || speedArray[speedIndex]*graph->minChannels > totalBw) && speedIndex < nspeeds-1) speedIndex++;
  tmpGraph.bwIntra = tmpGraph.bwInter = speedArray[speedIndex];
  int64_t globalTimeout = NCCL_SEARCH_GLOBAL_TIMEOUT;

search:
  int time = tmpGraph.sameChannels ? NCCL_SEARCH_TIMEOUT_SAMECHANNELS :
    tmpGraph.pattern == NCCL_TOPO_PATTERN_TREE ? NCCL_SEARCH_TIMEOUT_TREE : NCCL_SEARCH_TIMEOUT;
  tmpGraph.nChannels = 0;
  globalTimeout -= time;

  NCCLCHECK(ncclTopoSearchRec(system, &tmpGraph, graph, &time));
#if 0
  printf("Id %d Pattern %d, crossNic %d, Bw %g/%g, type %d/%d, channels %d-%d sameChannels %d -> nChannels %dx%g/%g %s\n", tmpGraph.id, tmpGraph.pattern, tmpGraph.crossNic, tmpGraph.bwInter, tmpGraph.bwIntra, tmpGraph.typeInter, tmpGraph.typeIntra, tmpGraph.minChannels, tmpGraph.maxChannels, tmpGraph.sameChannels, graph->nChannels, graph->bwInter, graph->bwIntra, time == 0 ? "TIMEOUT" : time == -1 ? "PERFECT" : "");
  for (int c=0; c<graph->nChannels; c++) {
    printf("%2d : ", c);
    for (int g=0; g<ngpus; g++) {
      printf("%d ", graph->intra[c*ngpus+g]);
    }
    printf("[%lx %lx]", graph->inter[c*2+0], graph->inter[c*2+1]);
    printf("\n");
  }
#endif
  // Optimal solution, stop here
  if (time == -1) goto done;
  if (graph->nChannels*graph->bwInter >= system->totalBw) goto done;

  if (pass == 1) {
    // First pass, we don't have a solution yet ; try other options

    // Try having different channels (except when going through AMD CPUs)
    if (tmpGraph.sameChannels == 1 &&
        !(cpuArch == NCCL_TOPO_CPU_ARCH_X86 && cpuVendor == NCCL_TOPO_CPU_VENDOR_AMD && tmpGraph.typeIntra == PATH_SYS)) {
      tmpGraph.sameChannels = 0;
      goto search;
    }
    tmpGraph.sameChannels = trySameChannels;

    if (time != -1) globalTimeout += time;
    else globalTimeout = NCCL_SEARCH_GLOBAL_TIMEOUT;
    if (globalTimeout < 0 && graph->nChannels) goto done;

    // Try a simpler tree
    if (ccMin >= 90 && tmpGraph.pattern == NCCL_TOPO_PATTERN_BALANCED_TREE) {
      tmpGraph.pattern = NCCL_TOPO_PATTERN_TREE;
      goto search;
    }
    tmpGraph.pattern = graph->pattern;

    int maxTypeIntra = system->nodes[NET].count > 0 ? tmpGraph.typeInter : PATH_SYS;
    if (tmpGraph.typeIntra < maxTypeIntra && (graph->nChannels == 0 || tmpGraph.typeIntra < graph->typeIntra)) {
      tmpGraph.typeIntra += 1; // starts at PATH_NVL; relax the constraint step by step
      goto search;
    }
    tmpGraph.typeIntra = ngpus == 1 ? PATH_LOC : PATH_NVL;

    if (system->nodes[NET].count > 0 && tmpGraph.typeInter < PATH_SYS && (graph->nChannels == 0 || tmpGraph.typeInter < graph->typeInter || tmpGraph.typeInter < PATH_PXN)) {
      tmpGraph.typeInter += 1; // starts at PATH_PIX; relax the constraint step by step
      goto search;
    }
    tmpGraph.typeInter = PATH_PIX;

    if (crossNic == 2 && tmpGraph.crossNic == 0) {
      // Try again with crossNic if permitted
      tmpGraph.crossNic = 1;
      goto search;
    }
    tmpGraph.crossNic = crossNic == 1 ? 1 : 0;

    // Decrease bw until we find a solution
    if ((speedIndex < nspeeds-1) && (graph->nChannels == 0 || (speedArray[speedIndex+1]/graph->bwInter > .49))) {
      tmpGraph.bwInter = tmpGraph.bwIntra = speedArray[++speedIndex];
      goto search;
    }
    speedIndex = 0;
    while (speedArray[speedIndex] > maxBw && speedIndex < nspeeds-1) speedIndex++;
    tmpGraph.bwIntra = tmpGraph.bwInter = speedArray[speedIndex];

  }

done:
  // We have a solution. Start from that solution and move to pass 2.
  if (pass == 1) {
    time = -1;
    NCCLCHECK(ncclTopoDupChannels(graph, ccMin, ngpus));
    memcpy(&tmpGraph, graph, sizeof(tmpGraph));
    speedIndex = 0;
    while (speedArray[speedIndex] > graph->bwInter && speedIndex < nspeeds-1) speedIndex++;
    tmpGraph.bwIntra = tmpGraph.bwInter = speedArray[speedIndex];
    tmpGraph.minChannels = graph->nChannels;
    pass = 2;
  }

  if (pass == 2) {
    // See if we can increase bw
    if (time != 0 && speedIndex > 0) {
      if (graph->pattern == NCCL_TOPO_PATTERN_RING) {
        // increase bw for Ring
        tmpGraph.bwIntra = tmpGraph.bwInter = speedArray[--speedIndex];
        goto search;
      } else if (graph->pattern == NCCL_TOPO_PATTERN_NVLS && tmpGraph.bwInter == graph->bwInter && tmpGraph.bwInter < tmpGraph.bwIntra*2) {
        tmpGraph.minChannels = tmpGraph.maxChannels = graph->nChannels;
        tmpGraph.bwInter = speedArray[--speedIndex];
        goto search;
      } else if (tmpGraph.bwIntra == graph->bwIntra && tmpGraph.bwIntra < tmpGraph.bwInter*2) {
        // increase bwIntra for trees (2 nodes or collnet)
        tmpGraph.bwIntra = speedArray[--speedIndex];
        goto search;
      }
    }
    time = -1;
    memcpy(&tmpGraph, graph, sizeof(tmpGraph));
  }

  if (graph->nChannels == 0 && graph->collNet == 0 && graph->pattern != NCCL_TOPO_PATTERN_NVLS) {
    WARN("Could not find a path for pattern %d, falling back to simple order", graph->pattern);
    for (int i=0; i<ngpus; i++) graph->intra[i] = system->nodes[GPU].nodes[i].gpu.rank;
    graph->inter[0] = graph->inter[1] = 0;
    graph->bwIntra = graph->bwInter = 0.1;
    graph->typeIntra = graph->typeInter = PATH_SYS;
    graph->nChannels = 1;
  }
  return ncclSuccess;
}

On one of my test machines the topology is as follows: two CPU nodes (sockets), each connected to two GPUs through a PCIe switch, with the two GPUs under each socket linked by NVLink.

Here we test all_reduce with two ranks using the command below; note that host_ip is the IP of one of the NICs on my machine.


/usr/local/openmpi/bin/mpirun --allow-run-as-root -np 2 -H $host_ip:2 \
        -x NCCL_IB_GID_INDEX=3 \
        -x NCCL_DEBUG=TRACE \
        -x NCCL_IB_GID_INDEX=1 \
        -x NCCL_NET_GDR_LEVEL=2 \
        -x NCCL_IB_QPS_PER_CONNECTION=4 \
        -x LD_LIBRARY_PATH -x PATH \
        /root/nccl-tests/build/all_reduce_perf -b 32M -e 1G -i 1000 -f 2 -g 1

Since two ranks are specified, the program picks the two NVLink-connected GPUs on the left side of the topology. With the printf block in ncclTopoCompute uncommented, we get the final channel result. Pattern 4 is NCCL_TOPO_PATTERN_RING, and in the end six channels were found (all GPU0->GPU1), each with 40 GB/s of bandwidth. Let's walk through the code to see how the search gets there.


First the search conditions are initialized. This is a single-machine search, so there are no NET nodes and crossNic is 0. The graph is then initialized with the strictest conditions: intra-node paths (typeIntra) may be no worse than PATH_NVL, and inter-node paths (typeInter) no worse than PATH_PIX. Then, depending on whether the search is intra-node or inter-node, the bandwidth array is set to speedArrayIntra or speedArrayInter, defined as follows:


float speedArrayIntra[] = { 40.0, 30.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0 };
float speedArrayInter[] = { 48.0, 30.0, 28.0, 24.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12 };

The point of these arrays is to try only a limited set of discrete values that are known to be useful, which speeds up the search. For a more detailed explanation of the two arrays, see the reference.
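As a concrete illustration of how the starting index into the array is chosen, here is a standalone sketch. The while loop is copied from ncclTopoCompute above; the values maxBw = totalBw = 240 are assumptions matching this machine's NVLink pair, and minChannels = 1 for Ring.

#include <stdio.h>

int main(void) {
  // speedArrayIntra as defined above; maxBw/totalBw are assumed values for this machine
  float speedArrayIntra[] = { 40.0, 30.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0 };
  int nspeeds = 13, speedIndex = 0, minChannels = 1;   // minChannels = 1 for Ring
  float maxBw = 240.0f, totalBw = 240.0f;              // assumed: 12 NVLink lanes x 20 GB/s
  while ((speedArrayIntra[speedIndex] > maxBw ||
          speedArrayIntra[speedIndex]*minChannels > totalBw) && speedIndex < nspeeds-1)
    speedIndex++;
  printf("start at speedArrayIntra[%d] = %g GB/s\n", speedIndex, speedArrayIntra[speedIndex]);
  // prints: start at speedArrayIntra[0] = 40 GB/s
  return 0;
}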

Next, ncclTopoSearchRec is executed to search for a solution based on the temporary tmpGraph. Note the pass variable, which starts at 1.

After the first call to ncclTopoSearchRec, the code checks whether the stop condition holds (graph->nChannels*graph->bwInter >= system->totalBw, or the search returned an optimal result or timed out). If not, it enters the if (pass == 1) branch, whose job is to relax the search conditions step by step: lowering typeIntra (intra-node) or typeInter (inter-node), lowering the bandwidth by advancing the index into speedArrayIntra (intra-node) or speedArrayInter (inter-node), and setting crossNic to try cross-NIC search.

After this series of relaxations, speedIndex (the index into the bandwidth array) is reset according to the channels found so far, pass is set to 2, and execution enters the if (pass == 2) branch. Starting from the current channels' bandwidth, this branch raises the bandwidth step by step and searches again, trying to find channels better than those just found.

That is the channel search in outline. Each call to ncclTopoSearchRec is one search attempt, and the function also calls itself recursively; the recursion is sketched below and not expanded further here.
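Based on the calls visible in the code in this article (possibly via intermediate helpers), the recursion looks roughly like this:

ncclTopoSearchRec
  -> ncclTopoSearchRecGpu            // place the next GPU of the current channel
       -> ncclTopoSearchRecGpu       // recurse, one GPU (step) at a time
       -> ncclTopoSearchRec          // step == ngpus: channel complete, search for one more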

Now let's look at a few concrete scenarios to understand the channel search more clearly. First, why does our two-rank case above search out the six channels we saw? To answer that, we need to find where channels are added during the search. I found the following logic:

ncclTopoSearchRec -> ncclTopoSearchRecGpu -> ncclTopoSearchRecGpu. This once puzzled me quite a bit: in the code below, the graph->nChannels++ is always paired with a later graph->nChannels--, so how does the channel count ever change? The key is the saveGraph parameter, which is where the final search result is saved, while graph is a scratch variable. Each time a channel is completed, graph->nChannels++ runs, then ncclTopoCompareGraphs checks whether the current solution is better than the saved one; if so, the memcpy copies the whole graph into saveGraph, and that is how saveGraph->nChannels grows. The graph->nChannels-- afterward merely rolls the scratch variable back.


ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, struct ncclTopoNode* gpu, int step, int backToNet, int backToFirstRank, int forcedOrder, int *time) {
  if ((*time) <= 0) return ncclSuccess;
  (*time)--;

  int ngpus = system->nodes[GPU].count;
  if (step == ngpus) {
    // Determine whether we found a better solution or not
    int copy = 0;
    graph->nChannels++;
    NCCLCHECK(ncclTopoCompareGraphs(system, graph, saveGraph, &copy));
    if (copy) {
      memcpy(saveGraph, graph, sizeof(struct ncclTopoGraph));
      if (graph->nChannels == graph->maxChannels) *time = -1;
    }
    if (graph->nChannels < graph->maxChannels) {
      NCCLCHECK(ncclTopoSearchRec(system, graph, saveGraph, time));
    }
    graph->nChannels--;
    return ncclSuccess;
  }
  ……
}

Next let's focus on ncclTopoCompareGraphs to see which conditions allow a channel to be accepted. A candidate solution is copied if adding the current channel increases total bandwidth, or if it keeps the bandwidth while reducing the hop count.


ncclResult_t ncclTopoCompareGraphs(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* refGraph, int* copy) {
  // 1. Try to get the same nChannels between Rings and Trees
  if (graph->nChannels < graph->minChannels) return ncclSuccess;

  if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) { // NVLS channels correspond to GPUs pulling from NVLS. So the more the better.
    if (graph->nChannels > refGraph->nChannels && graph->nChannels <= system->nodes[GPU].count) *copy = 1;
    if (graph->nChannels*graph->bwInter > refGraph->nChannels*refGraph->bwInter) *copy = 1;
    return ncclSuccess;
  }
  // 2. Try to get better bandwidth
  if (graph->nChannels*graph->bwIntra > refGraph->nChannels*refGraph->bwIntra) {
    *copy = 1;
    return ncclSuccess;
  }
  if (graph->nChannels*graph->bwIntra < refGraph->nChannels*refGraph->bwIntra) return ncclSuccess;

  // 3. Less hops
  if (graph->pattern == refGraph->pattern && graph->crossNic == refGraph->crossNic && graph->nHops < refGraph->nHops) *copy = 1;
  return ncclSuccess;
}

Finally, one more question: why does the scenario above end up with exactly six channels? The channel count is determined by path->bw versus graph->bwIntra (graph->bwInter when a NIC is involved), i.e. how many channels' worth of bandwidth graph->bwIntra fit into the path bandwidth path->bw. The relevant logic is in ncclTopoFollowPath -> followPath: every channel consumes part of the path's bandwidth via SUB_ROUND(link->bw, fwBw), as shown in the code below.


static ncclResult_t followPath(struct ncclTopoLinkList* path, struct ncclTopoNode* start, int maxSteps, float bw, int* steps) {
  float pciBw = bw;
  for (int step=0; step<path->count; step++) {
    struct ncclTopoNode* node = path->list[step]->remNode;
    if (node->type == CPU) {
      // Account for P2P inefficiency through Intel CPU RC
      if (path->type == PATH_PHB && start->type == GPU &&
          node->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 &&
          node->cpu.vendor == NCCL_TOPO_CPU_VENDOR_INTEL) {
        pciBw = INTEL_P2P_OVERHEAD(bw);
      }
    }
  }

  struct ncclTopoNode* node = start;
  for (int step=0; step<maxSteps; step++) {
    struct ncclTopoLink* link = path->list[step];
    struct ncclTopoLink* revLink = NULL;
    float fwBw = link->type == LINK_PCI ? pciBw : bw;
    float revBw = 0;
    if (link->remNode->type == GPU && link->remNode->gpu.cudaCompCap < 80 && start->type != GPU) {
      if (revLink == NULL) NCCLCHECK(findRevLink(node, link->remNode, link->type, &revLink));
      revBw += fwBw/8;
    }
    if (link->remNode->type == CPU && link->remNode->cpu.arch == NCCL_TOPO_CPU_ARCH_POWER && link->type == LINK_NVL) {
      if (revLink == NULL) NCCLCHECK(findRevLink(node, link->remNode, link->type, &revLink));
      revBw += fwBw;
    }
    if (link->bw < fwBw || (revBw && revLink->bw < revBw)) { *steps = step; return ncclSuccess; }
    SUB_ROUND(link->bw, fwBw);
    if (revBw) SUB_ROUND(revLink->bw, revBw);
    node = link->remNode;
  }
  *steps = maxSteps;
  return ncclSuccess;
}

On my test machine the GPUs are A100s. The NVLink path->bw is filled in while the topology is built, specifically in ncclTopoAddNvLinks: 12 NVLink lanes at 20 GB/s each, 240 GB/s in total:

nvlBw: 20.000000 (#define SM80_NVLINK_BW 20.0), gpu->gpu.cudaCompCap: 80, count: 12

So path->bw is 240, and since the initial search bandwidth graph->bwIntra is the first element of speedArrayIntra, i.e. 40, the search produces exactly six channels (240/40 = 6).
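A quick sanity check of that arithmetic as a standalone sketch (SUB_ROUND is modeled here as plain subtraction, which is what it amounts to for these values):

#include <stdio.h>

int main(void) {
  float pathBw = 12 * 20.0f;  // 12 NVLink lanes x 20 GB/s (SM80_NVLINK_BW) = 240 GB/s
  float chanBw = 40.0f;       // speedArrayIntra[0], the initial graph->bwIntra
  int channels = 0;
  while (pathBw >= chanBw) {  // each accepted channel consumes chanBw of the path,
    pathBw -= chanBw;         // like SUB_ROUND(link->bw, fwBw) in followPath
    channels++;
  }
  printf("%d channels\n", channels);  // prints "6 channels"
  return 0;
}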

When we instead specify four ranks with the same command, the search finds only one channel, with just 10 GB/s of bandwidth.

That is because with four ranks the GPU ring in this topology has to cross PCIe and UPI; the PCIe bandwidth then caps path->bw, which in turn limits the number of channels to one.

