2025-03-09 00:14:22

NCCL Source Code Analysis 5: Merging Channels Across Machines

——lvyilong316
      In the previous installment we saw how the ncclTopoCompute function searches for and builds the channels within a single machine (a single node). We now return to initTransportsRank to see how, in the multi-machine case, the topologies and channels of the individual nodes are merged. Before going further, let's first review the meaning of the intra and inter arrays in the graph structure: intra records each channel's intra-node endpoints, i.e. the GPUs, while inter records the endpoints that connect outward, i.e. the NET nodes. Suppose the current node has two channels, GPU0 -> GPU1 -> GPU2 -> GPU3 with an external NET, and GPU1 -> GPU0 -> GPU3 -> GPU2 with an external NET; the corresponding intra and inter arrays then look as follows.
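Since the original figure is not reproduced here, below is a minimal sketch of what those arrays would contain for this example; the NET device indices are assumptions made purely for illustration:

// Assumed graph arrays for the two example channels (localRanks = 4).
// intra holds one localRanks-long segment per channel (GPU ranks in ring order):
int intra[2*4] = { 0, 1, 2, 3,    // channel 0: GPU0 -> GPU1 -> GPU2 -> GPU3
                   1, 0, 3, 2 };  // channel 1: GPU1 -> GPU0 -> GPU3 -> GPU2
// inter holds the NET endpoints each channel uses to enter/leave the node
// (two entries per channel; NET 0 and NET 1 are assumed here):
int inter[2*2] = { 0, 0,          // channel 0: enters and exits via NET 0
                   1, 1 };        // channel 1: enters and exits via NET 1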

Returning to the code, we now prepare for the second AllGather. First, allGather3Data is allocated to aggregate channel information across ranks; its structure is defined as follows.


struct allGatherInfo *allGather3Data;
struct allGatherInfo {
  struct graphInfo graphInfo[NCCL_NUM_ALGORITHMS];
  struct ncclTopoRanks topoRanks;
  int cpuArch;
  int cpuVendor;
};

The allGatherInfo structure holds each algorithm's graph and channel information.


// AllGather3 - begin
NCCLCHECKGOTO(ncclCalloc(&allGather3Data, nranks), ret, fail);
// graphs here is comm->graphs
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
  allGather3Data[rank].graphInfo[a].pattern = graphs[a]->pattern;
  allGather3Data[rank].graphInfo[a].nChannels = graphs[a]->nChannels;
  allGather3Data[rank].graphInfo[a].sameChannels = graphs[a]->sameChannels;
  allGather3Data[rank].graphInfo[a].bwIntra = graphs[a]->bwIntra;
  allGather3Data[rank].graphInfo[a].bwInter = graphs[a]->bwInter;
  allGather3Data[rank].graphInfo[a].typeIntra = graphs[a]->typeIntra;
  allGather3Data[rank].graphInfo[a].typeInter = graphs[a]->typeInter;
  allGather3Data[rank].graphInfo[a].crossNic = graphs[a]->crossNic;
}

allGather3Data[rank].cpuArch = comm->cpuArch;
allGather3Data[rank].cpuVendor = comm->cpuVendor;

comm->nChannels = std::min(treeGraph->nChannels, ringGraph->nChannels);
NCCLCHECKGOTO(ncclTopoPreset(comm, graphs, &allGather3Data[rank].topoRanks), ret, fail);

The for loop above initializes allGather3Data[rank].graphInfo, and ncclTopoPreset initializes allGather3Data[rank].topoRanks. First, the definition of the ncclTopoRanks structure:


struct ncclTopoRanks {
  int ringRecv[MAXCHANNELS];
  int ringSend[MAXCHANNELS];
  int ringPrev[MAXCHANNELS];
  int ringNext[MAXCHANNELS];
  int treeToParent[MAXCHANNELS];
  int treeToChild0[MAXCHANNELS];
  int treeToChild1[MAXCHANNELS];
  int nvlsHeads[MAXCHANNELS];
  int nvlsHeadNum;
};

It carries the following information:

• ringRecv[c]: the head rank of this rank's portion ("sub-ring") of channel c, i.e. the first rank of the channel on this machine.

• ringSend[c]: the tail rank of this rank's portion of channel c, i.e. the last rank of the channel on this machine.

• ringPrev[c]: the rank preceding this rank within its portion of channel c.

• ringNext[c]: the rank following this rank within its portion of channel c.

The prev of a portion's first rank and the next of its last rank are set to -1. ncclTopoPreset also initializes comm->channels; we focus on the Ring-algorithm part only and omit the rest of the code. Note that at the end the rings found by the search are copied once more, so the number of channels doubles here; the explanation given in an official issue is that this enables further parallelism to make fuller use of the bandwidth.
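To make this concrete, here is a hypothetical view of the values ncclTopoPreset would fill in for channel 0 of the earlier example (local ring GPU0 -> GPU1 -> GPU2 -> GPU3, with rank i on GPU i), as seen from rank 1:

// Hypothetical topoRanks contents on rank 1 for channel 0:
topoRanks->ringRecv[0] = 0;  // head (first rank) of this machine's sub-ring
topoRanks->ringSend[0] = 3;  // tail (last rank) of this machine's sub-ring
topoRanks->ringPrev[0] = 0;  // rank preceding rank 1 within the sub-ring
topoRanks->ringNext[0] = 2;  // rank following rank 1 within the sub-ring
// On rank 0, ringPrev[0] would be -1; on rank 3, ringNext[0] would be -1.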


ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph** graphs, struct ncclTopoRanks* topoRanks) {
  int rank = comm->rank;
  int localRanks = comm->topo->nodes[GPU].count;
  int nvlsRanks = comm->MNNVL ? comm->clique.size : localRanks;
  int nChannels = comm->nChannels;

  topoRanks->nvlsHeadNum = 0;
  for (int c=0; c<nChannels; c++) {
    struct ncclChannel* channel = comm->channels+c;
    channel->ring.prev = channel->ring.next = -1;
    channel->tree.up = -1;
    channel->collnetChain.up = -1;
    for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->tree.down[i] = -1;
    for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->collnetChain.down[i] = -1;
    channel->collnetDirect.out = -1;
    channel->collnetDirect.headRank = -1;
    channel->collnetDirect.nHeads = 0;
    channel->collnetDirect.shift = 0;
    for (int i=0; i<NCCL_MAX_DIRECT_ARITY+1; i++) channel->collnetDirect.heads[i] = -1;
    for (int i=0; i<NCCL_MAX_DIRECT_ARITY; i++) channel->collnetDirect.up[i] = -1;
    for (int i=0; i<NCCL_MAX_DIRECT_ARITY; i++) channel->collnetDirect.down[i] = -1;

    int* ringIntra = graphs[NCCL_ALGO_RING]->intra+c*localRanks;
    int* treeIntra = graphs[NCCL_ALGO_TREE]->intra+c*localRanks;
    int* collNetIntra = graphs[NCCL_ALGO_COLLNET_CHAIN]->intra+c*localRanks;

    for (int i=0; i<localRanks; i++) {
      if (ringIntra[i] == rank) {
        topoRanks->ringRecv[c] = ringIntra[0];
        topoRanks->ringSend[c] = ringIntra[localRanks-1];
        topoRanks->ringPrev[c] = (i == 0) ? -1 : ringIntra[i-1];
        topoRanks->ringNext[c] = (i == localRanks-1) ? -1 : ringIntra[i+1];
      }
      if (collNetIntra[i] == rank) {
        channel->collnetChain.up = i == 0 ? comm->nRanks : collNetIntra[i-1];
        channel->collnetChain.down[0] = i == localRanks-1 ? -1 : collNetIntra[i+1];
      }
    }
  }
  // Duplicate channels trees
  struct ncclChannel* channel0 = comm->channels;
  struct ncclChannel* channel1 = channel0+nChannels;
  memcpy(channel1, channel0, nChannels*sizeof(struct ncclChannel));

  // Get nvls heads and the number of heads. Duplicate head is not allowed.
  for (int c = 0; c < graphs[NCCL_ALGO_NVLS]->nChannels; ++c) {
    bool addHead = true;
    int* nvlsIntra = graphs[NCCL_ALGO_NVLS]->intra + c * nvlsRanks;

    for (int dup = 0; dup < topoRanks->nvlsHeadNum; dup++) {
      if (topoRanks->nvlsHeads[dup] == nvlsIntra[0]) {
        addHead = false;
        break;
      }
    }
    if (addHead) {
      topoRanks->nvlsHeads[topoRanks->nvlsHeadNum++] = nvlsIntra[0];
    }
  }
  memcpy(comm->nvlsHeads, topoRanks->nvlsHeads, sizeof(int) * topoRanks->nvlsHeadNum);

  return ncclSuccess;
}

Back in initTransportsRank, bootstrapAllGather then collects the graph information from every process, and the following data structures are computed:

• nodesFirstRank: the first rank on each machine (node).

• nodesTreePatterns: the tree-topology pattern of each node.

• rankToNode: the node each rank belongs to.

• localRankToRank: the mapping from local_rank to global rank.

• localRank: this process's local_rank.

• localRanks: the number of local ranks on the node.
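As a hypothetical illustration, for two machines with 4 GPUs each and ranks 0-7 assigned in order, these structures would come out as:

// Assumed values for 2 nodes x 4 GPUs, ranks 0..7 in order:
// nodesFirstRank = { 0, 4 }             first rank of node 0 and node 1
// rankToNode     = { 0,0,0,0, 1,1,1,1 }
// node 0: localRankToRank = { 0, 1, 2, 3 }; node 1: { 4, 5, 6, 7 }
// on rank 5: localRank = 1, localRanks = 4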

If the processes run on CPUs of different models or vendors, a warning is emitted to help with troubleshooting, because in past implementations such configurations could deadlock collective communication.


NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, allGather3Data, sizeof(*allGather3Data)), ret, fail);

// Determine nNodes, firstRanks, ...
NCCLCHECKGOTO(ncclCalloc(&nodesFirstRank, nranks), ret, fail);
NCCLCHECKGOTO(ncclCalloc(&nodesTreePatterns, nranks), ret, fail);
NCCLCHECKGOTO(ncclCalloc(&comm->rankToNode, comm->nRanks), ret, fail);
for (int r=0; r<nranks; r++) {
  int node;
  int firstRank = allGather3Data[r].topoRanks.ringRecv[0];
  for (node=0; node<comm->nNodes && nodesFirstRank[node] != firstRank; node++);
  if (node == comm->nNodes) {
    comm->nNodes++;
    nodesFirstRank[node] = firstRank;
    // Record tree pattern of each node as they can be different depending on sm arch
    nodesTreePatterns[node] = allGather3Data[r].graphInfo[NCCL_ALGO_TREE].pattern;
  }
  comm->rankToNode[r] = node;

  if (comm->cpuArch != allGather3Data[r].cpuArch &&
      comm->cpuArch != NCCL_TOPO_CPU_ARCH_MIXED) {
    comm->cpuArch = NCCL_TOPO_CPU_ARCH_MIXED;
  }
  if (comm->cpuVendor != allGather3Data[r].cpuVendor &&
      comm->cpuVendor != NCCL_TOPO_CPU_VENDOR_MIXED) {
    comm->cpuVendor = NCCL_TOPO_CPU_VENDOR_MIXED;
  }
}

// Alert the user to the presence of mixed CPUs. In the past this has caused
// locks in some collective routines. This may help debug issues in the future.
if (rank==0) {
  if (comm->cpuArch == NCCL_TOPO_CPU_ARCH_MIXED) {
    INFO(NCCL_GRAPH, "CPUs with mixed architecture were detected.");
  }
  if (comm->cpuVendor == NCCL_TOPO_CPU_VENDOR_MIXED) {
    INFO(NCCL_GRAPH, "CPUs with mixed vendors were detected.");
  }
}

// Now that we know nNodes, alloc nodeRanks and compute localRanks for each node
NCCLCHECKGOTO(ncclCalloc(&comm->nodeRanks, comm->nNodes), ret, fail);
NCCLCHECKGOTO(ncclCalloc(&comm->rankToLocalRank, comm->nRanks), ret, fail);
for (int r=0; r<comm->nRanks; r++) {
  int node = comm->rankToNode[r];
  comm->rankToLocalRank[r] = comm->nodeRanks[node].localRanks;
  comm->nodeRanks[node].localRanks++;
}
// Allocate ranks arrays for each node
for (int n=0; n<comm->nNodes; n++) {
  NCCLCHECKGOTO(ncclCalloc(&comm->nodeRanks[n].localRankToRank, comm->nodeRanks[n].localRanks), ret, fail);
  comm->maxLocalRanks = std::max(comm->maxLocalRanks, comm->nodeRanks[n].localRanks);
  comm->nodeRanks[n].localRanks = 0;
}
// And fill the ranks arrays
for (int r=0; r<comm->nRanks; r++) {
  int node = comm->rankToNode[r];
  comm->nodeRanks[node].localRankToRank[comm->nodeRanks[node].localRanks++] = r;
}
comm->node = comm->rankToNode[rank];
comm->localRankToRank = comm->nodeRanks[comm->node].localRankToRank;
comm->localRank = comm->rankToLocalRank[rank];
comm->localRanks = comm->nodeRanks[comm->node].localRanks;

TRACE(NCCL_INIT,"hostHash[%d] %lx localRank %d localRanks %d localRank0 %d",
      rank, comm->peerInfo[rank].hostHash, comm->localRank, comm->localRanks, comm->localRankToRank[0]);
if (comm->localRank == -1 || comm->localRankToRank[0] == -1 || comm->localRanks == 0) {
  WARN("Failed to determine local ranks rank %d hostHash %lx pidHash %lx localRank %d localRanks %d localRank0 %d",
       rank, comm->peerInfo[rank].hostHash, comm->peerInfo[rank].pidHash,
       comm->localRank, comm->localRanks, comm->localRankToRank[0]);
  ret = ncclInternalError;
  goto fail;
}

INFO(NCCL_INIT, "comm %p rank %d nRanks %d nNodes %d localRanks %d localRank %d MNNVL %d",
     comm, rank, comm->nRanks, comm->nNodes, comm->localRanks, comm->localRank, comm->MNNVL);

Next, we align the configuration across all ranks: for each algorithm, every rank takes the minimum of the reported channel counts and bandwidths and the maximum of the link types and crossNic flags, so that tuning decisions are consistent everywhere. For example, if one rank found 4 ring channels and another found only 2, all ranks proceed with 2.


nChannelsOrig = comm->nChannels;
NCCLCHECKGOTO(ncclCalloc(&allTopoRanks, comm->nRanks), ret, fail);
for (int i=0; i<nranks; i++) {
  allTopoRanks[i] = &allGather3Data[i].topoRanks;
  // Make sure we align all ranks so that the tuning is consistent across ranks
  for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
    graphs[a]->nChannels = std::min(allGather3Data[i].graphInfo[a].nChannels, graphs[a]->nChannels);
    graphs[a]->sameChannels = std::min(allGather3Data[i].graphInfo[a].sameChannels, graphs[a]->sameChannels);
    graphs[a]->bwIntra = std::min(allGather3Data[i].graphInfo[a].bwIntra, graphs[a]->bwIntra);
    graphs[a]->bwInter = std::min(allGather3Data[i].graphInfo[a].bwInter, graphs[a]->bwInter);
    graphs[a]->typeIntra = std::max(allGather3Data[i].graphInfo[a].typeIntra, graphs[a]->typeIntra);
    graphs[a]->typeInter = std::max(allGather3Data[i].graphInfo[a].typeInter, graphs[a]->typeInter);
    graphs[a]->crossNic = std::max(allGather3Data[i].graphInfo[a].crossNic, graphs[a]->crossNic);
  }
}

Because this alignment may have dropped some channels, the duplicate channels created during Preset() must be moved up so that they again sit directly after the surviving originals.


if (comm->nChannels < nChannelsOrig) {
  // We started duplicating channels during Preset(), so we need to move the
  // duplicated channels since we have removed some.
  for (int i=0; i<comm->nChannels; i++) memcpy(comm->channels+comm->nChannels+i, comm->channels+nChannelsOrig+i, sizeof(struct ncclChannel));
}
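For instance, with hypothetical numbers, if Preset() ran with nChannelsOrig = 4 but alignment reduced comm->nChannels to 2, the loop compacts the channel table like this:

// before: [ ch0 ch1 ch2 ch3 | ch0' ch1' ch2' ch3' ]  (originals | duplicates)
// after : [ ch0 ch1 ch0' ch1' ... ]                  (surviving duplicates moved up)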

Next, we determine whether CollNet is supported.


// Determine CollNet support after all-gather now that we know nNodes and each node localRanks
if (comm->collNetSupport == 1) {
  int collNetNodeThreshold = ncclParamCollNetNodeThreshold();
  if (comm->nNodes < collNetNodeThreshold) {
    INFO(NCCL_INIT, "Communicator has %d nodes which is less than CollNet node threshold %d, disabling CollNet", comm->nNodes, collNetNodeThreshold);
    comm->collNetSupport = 0;
  }
  comm->collNetRegSupport = true;
  for (int n=0; n<comm->nNodes; n++) {
    if (comm->nodeRanks[n].localRanks > NCCL_MAX_DIRECT_ARITY+1) {
      WARN("CollNet currently only supports up to %d GPUs per node, disabling CollNet", NCCL_MAX_DIRECT_ARITY+1);
      comm->collNetSupport = 0;
      break;
    }
    if (comm->nodeRanks[n].localRanks > 1) {
      // As long as there is more than 1 rank on any node, we need to disable collnet reg
      comm->collNetRegSupport = false;
    }
  }
}

With all of the above complete, we call ncclTopoPostset to merge the Ring, Tree, and other topologies. For a Ring topology, for example, the individual sub-rings are linked one after another to form the complete ring. This concludes the second AllGather.


NCCLCHECKGOTO(ncclCalloc(&rings, nranks*MAXCHANNELS), ret, fail);
NCCLCHECKGOTO(ncclTopoPostset(comm, nodesFirstRank, nodesTreePatterns, allTopoRanks, rings, graphs, parent), ret, fail);
// AllGather3 - end

Consider two machines, each with 4 GPUs, the ranks numbered 0 through 7. The ranks on each machine build that machine's partial ring, and ncclTopoPostset then connects the two partial rings head to tail to form the complete ring.

ncclTopoPostset flattens the prev, next, send, and recv information of all channels into arrays; for example, recv[0] indicates which rank acts as the receive end of the first ring. It then computes the prev of each machine's first rank and the next of its last rank.
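Continuing the hypothetical 2-node, 8-rank example (node 0's sub-ring is 0 -> 1 -> 2 -> 3, node 1's is 4 -> 5 -> 6 -> 7), the flattened arrays for channel 0 would hold:

// Assumed layout for channel 0, with nNodes = 2 and nranks = 8:
ringRecv[0*nNodes + 0] = 0;  // head of node 0's sub-ring
ringRecv[0*nNodes + 1] = 4;  // head of node 1's sub-ring
ringSend[0*nNodes + 0] = 3;  // tail of node 0's sub-ring
ringSend[0*nNodes + 1] = 7;  // tail of node 1's sub-ring
// Before the rings are connected, prev of ranks 0 and 4 and next of ranks 3
// and 7 are still -1; connectRings fills them in, e.g. next[3] = 4,
// prev[4] = 3, next[7] = 0, prev[0] = 7.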


ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns, struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph** graphs, struct ncclComm* parent) {
  // Gather data from all ranks
  int *ringRecv, *ringSend, *ringPrev, *ringNext, *treeToParent, *treeToChild0, *treeToChild1, *nvlsHeads;
  int nranks = comm->nRanks;
  int nNodes = comm->nNodes;
  int nChannels = comm->nChannels;
  int minHeadNum = INT_MAX;
  int shared = parent && parent->nvlsSupport && parent->config.splitShare;
  NCCLCHECK(ncclCalloc(&ringRecv, nNodes*MAXCHANNELS));
  NCCLCHECK(ncclCalloc(&ringSend, nNodes*MAXCHANNELS));
  NCCLCHECK(ncclCalloc(&ringPrev, nranks*MAXCHANNELS));
  NCCLCHECK(ncclCalloc(&ringNext, nranks*MAXCHANNELS));
  NCCLCHECK(ncclCalloc(&treeToParent, nNodes*MAXCHANNELS));
  NCCLCHECK(ncclCalloc(&treeToChild0, nNodes*MAXCHANNELS));
  NCCLCHECK(ncclCalloc(&treeToChild1, nNodes*MAXCHANNELS));
  NCCLCHECK(ncclCalloc(&nvlsHeads, nNodes*MAXCHANNELS));

  // Alternate rings to avoid crossing rails
  if (graphs[NCCL_ALGO_RING]->crossNic && (nChannels % 2) == 0) {
    for (int r=0; r<comm->nRanks; r++) {
      if (comm->rankToNode[r] % 2 == 1) {
        // Exchange rings
        for (int c=0; c<nChannels; c+=2) {
          exchangeValues(allTopoRanks[r]->ringRecv+c, allTopoRanks[r]->ringRecv+(c^1));
          exchangeValues(allTopoRanks[r]->ringSend+c, allTopoRanks[r]->ringSend+(c^1));
          exchangeValues(allTopoRanks[r]->ringPrev+c, allTopoRanks[r]->ringPrev+(c^1));
          exchangeValues(allTopoRanks[r]->ringNext+c, allTopoRanks[r]->ringNext+(c^1));
        }
      }
    }
  }

  for (int c=0; c<nChannels; c++) {
    for (int n=0; n<nNodes; n++) {
      int r = firstRanks[n];
      ringRecv[c*nNodes+n] = allTopoRanks[r]->ringRecv[c];
      ringSend[c*nNodes+n] = allTopoRanks[r]->ringSend[c];
      treeToParent[c*nNodes+n] = allTopoRanks[r]->treeToParent[c];
      treeToChild0[c*nNodes+n] = allTopoRanks[r]->treeToChild0[c];
      treeToChild1[c*nNodes+n] = allTopoRanks[r]->treeToChild1[c];
    }
    for (int r=0; r<nranks; r++) {
      ringPrev[c*nranks+r] = allTopoRanks[r]->ringPrev[c];
      ringNext[c*nranks+r] = allTopoRanks[r]->ringNext[c];
    }
  }

  for (int n = 0; n < nNodes; n++) {
    int r = firstRanks[n];
    if (minHeadNum > allTopoRanks[r]->nvlsHeadNum)
      minHeadNum = allTopoRanks[r]->nvlsHeadNum;
  }

  for (int c = 0; c < minHeadNum; c++) {
    for (int n = 0; n < nNodes; n++) {
      int r = firstRanks[n];
      nvlsHeads[c * nNodes + n] = allTopoRanks[r]->nvlsHeads[c];
    }
  }

  // Connect rings and trees. This should also duplicate the channels.
  NCCLCHECK(connectRings(comm, ringRecv, ringSend, ringPrev, ringNext));
  NCCLCHECK(connectTrees(comm, treeToParent, treeToChild0, treeToChild1, treePatterns));

  // Duplicate ringPrev/ringNext for ncclBuildRing
  memcpy(ringPrev+nChannels*nranks, ringPrev, nChannels*nranks*sizeof(int));
  memcpy(ringNext+nChannels*nranks, ringNext, nChannels*nranks*sizeof(int));

  // Set ring prev/next for my rank
  for (int c=0; c<nChannels; c++) {
    struct ncclChannel* channel0 = comm->channels+c;
    struct ncclChannel* channel1 = channel0+nChannels;
    channel0->ring.prev = channel1->ring.prev = ringPrev[c*nranks+comm->rank];
    channel0->ring.next = channel1->ring.next = ringNext[c*nranks+comm->rank];
  }

  // Duplication should be complete now
  nChannels = comm->nChannels = std::min(MAXCHANNELS,nChannels*2);

  // Setup CollNet
  if (comm->collNetSupport == 1) {
    struct ncclTopoGraph* collNetChainGraph = graphs[NCCL_ALGO_COLLNET_CHAIN];
    // Add more channels to saturate intra-node bandwidth, except the 1 PPN case
    if (collNetChainGraph->bwIntra > collNetChainGraph->bwInter && comm->nRanks > comm->nNodes) {
      int collNetNchannels = std::min(MAXCHANNELS, nChannels+nChannels/2);
      nChannels = comm->nChannels = copyChannels(comm, nChannels, collNetNchannels, ringPrev, ringNext);
    }
    NCCLCHECK(connectCollNet(comm, graphs[NCCL_ALGO_COLLNET_DIRECT]));
  }

  // Use 4 compute channels per search channel to reach peak BW on <8 PPN
  if (comm->minCompCap == 90 && comm->nNodes > 1 && graphs[NCCL_ALGO_RING]->bwIntra > 45.0 && nChannels < 16) {
     nChannels = comm->nChannels = copyChannels(comm, nChannels, 2*nChannels, ringPrev, ringNext);
  }

  // Double the number of channels when using unpack networking (greater than 1 node)
  // We won't automatically double past 16 channels, users can specify 32 if they want
  if (comm->netDeviceType == NCCL_NET_DEVICE_UNPACK && comm->nNodes > 1 && nChannels < 16 && ncclParamUnpackDoubleNChannels()) {
     nChannels = comm->nChannels = copyChannels(comm, nChannels, 2*nChannels, ringPrev, ringNext);
  }

  // Honor NCCL_MIN_NRINGS/NCCL_MAX_NRINGS.
  // We permit combining max, then min, to only use the first channels, then duplicate them.
  if (comm->sharedRes->owner != comm) {
    /* child comm #channels cannot exceed top parent #channels. */
    nChannels = comm->nChannels = std::min(std::min(std::min(ncclMaxNchannels(), nChannels), comm->config.maxCTAs), comm->sharedRes->tpNChannels);
    nChannels = comm->nChannels = copyChannels(comm, nChannels, std::min(std::max(ncclMinNchannels(), comm->config.minCTAs), comm->sharedRes->tpNChannels), ringPrev, ringNext);
  } else {
    nChannels = comm->nChannels = std::min(std::min(ncclMaxNchannels(), nChannels), comm->config.maxCTAs);
    nChannels = comm->nChannels = copyChannels(comm, nChannels, std::max(ncclMinNchannels(), comm->config.minCTAs), ringPrev, ringNext);
  }

  comm->collChannels = comm->nChannels;
#if CUDART_VERSION >= 12010
  // Support maximal channel usage for aggregation
  if (shared && comm->nvlsChannels > parent->nvlsResources->nChannels) {
    comm->nvlsChannels = parent->nvlsResources->nChannels;
  }
  if (comm->nChannels < comm->nvlsChannels) {
    nChannels = comm->nChannels = copyChannels(comm, comm->nChannels, comm->nvlsChannels, ringPrev, ringNext);
  }
  NCCLCHECK(connectNvls(comm, nvlsHeads, minHeadNum));
#endif
  if (shared && comm->nChannels > parent->sharedRes->tpNChannels) {
    nChannels = comm->nChannels = parent->sharedRes->tpNChannels;
    comm->collChannels = std::min(comm->collChannels, comm->nChannels);
  }

  // Create rings array and check all is fine
  NCCLCHECK(ncclBuildRings(nChannels, rings, comm->rank, comm->nRanks, ringPrev, ringNext));

  free(ringRecv);
  free(ringSend);
  free(ringPrev);
  free(ringNext);
  free(treeToParent);
  free(treeToChild0);
  free(treeToChild1);
  free(nvlsHeads);

  return ncclSuccess;
}
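The ring stitching itself happens inside connectRings, whose body is not shown above. Here is a minimal sketch of its core logic given the flattened arrays prepared in this function; treat it as an illustration of the idea rather than a verbatim copy of the NCCL source:

// Illustrative sketch: stitch per-node sub-rings into one global ring per channel.
static ncclResult_t connectRings(struct ncclComm* comm, int* ringRecv, int* ringSend,
                                 int* ringPrev, int* ringNext) {
  int nChannels = comm->nChannels;
  int nNodes = comm->nNodes;
  int nRanks = comm->nRanks;
  for (int c = 0; c < nChannels; c++) {
    int* recv = ringRecv + c*nNodes;   // head rank of each node's sub-ring
    int* send = ringSend + c*nNodes;   // tail rank of each node's sub-ring
    int* prev = ringPrev + c*nRanks;
    int* next = ringNext + c*nRanks;
    for (int n = 0; n < nNodes; n++) {
      // The head of node n's sub-ring receives from the tail of node n-1
      // (wrapping around), and the tail of node n sends to the head of node n+1.
      prev[recv[n]] = send[(n - 1 + nNodes) % nNodes];
      next[send[n]] = recv[(n + 1) % nNodes];
    }
  }
  return ncclSuccess;
}

With the 2-node example above, this is exactly the step that turns the two open chains 0 -> 1 -> 2 -> 3 and 4 -> 5 -> 6 -> 7 into the closed ring 0 -> 1 -> 2 -> 3 -> 4 -> 5 -> 6 -> 7 -> 0.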
