NCCL源码解析5——跨机channel合并
——lvyilong316
上节中我们知道通过
ncclTopoCompute函数完成单机(单
node)内的
channel搜索和构建。接下来回到
initTransportsRank中,看下如果是多机器情况,是如何合并各个
node的拓扑和
channel。在后续介绍之前,需要先再回顾一下
graph结构的
intra和
inter数组含义,其中
intra记录的是所有
channel的
node内节点,即
GPU,
inter记录的是向外互联的节点,即
NET节点。如下图所示,假如当前
Node,分别有
GPU0-> GPU1-> GPU2-> GPU3和对外的
NET,以及
GPU1-> GPU0-> GPU3-> GPU2和对外的
NET两个
channel,则对应的
intra和
inter数组如下所示。
接下来我们回到代码,开始第二次AllGather的准备工作。首先分配allGather3Data用于rank间聚合channel的信息,其中allGather3Data对应的结构定义如下
-
struct allGatherInfo *allGather3Data
-
// Per-rank record exchanged between all ranks in the "AllGather3" step of
// initTransportsRank. One entry per rank is allocated in allGather3Data and
// each rank fills its own slot before the bootstrap all-gather.
struct allGatherInfo {
-
// Summary of the graph search result for each algorithm (pattern, nChannels,
// sameChannels, bwIntra/bwInter, typeIntra/typeInter, crossNic), copied from
// graphs[a] of the local rank.
struct graphInfo graphInfo[NCCL_NUM_ALGORITHMS];
-
// Where this rank sits in the intra-node ring/tree/NVLS layout; filled by
// ncclTopoPreset.
struct ncclTopoRanks topoRanks;
-
// Copied from comm->cpuArch; compared across ranks to detect mixed architectures.
int cpuArch;
-
// Copied from comm->cpuVendor; compared across ranks to detect mixed vendors.
int cpuVendor;
-
};
allGatherInfo结构包含所有算法的graph和channel信息。
-
// AllGather3 - begin
-
NCCLCHECKGOTO(ncclCalloc(&allGather3Data, nranks), ret, fail);
-
//graphs即comm->graphs
-
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
-
allGather3Data[rank].graphInfo[a].pattern = graphs[a]->pattern;
-
allGather3Data[rank].graphInfo[a].nChannels = graphs[a]->nChannels;
-
allGather3Data[rank].graphInfo[a].sameChannels = graphs[a]->sameChannels;
-
allGather3Data[rank].graphInfo[a].bwIntra = graphs[a]->bwIntra;
-
allGather3Data[rank].graphInfo[a].bwInter = graphs[a]->bwInter;
-
allGather3Data[rank].graphInfo[a].typeIntra = graphs[a]->typeIntra;
-
allGather3Data[rank].graphInfo[a].typeInter = graphs[a]->typeInter;
-
allGather3Data[rank].graphInfo[a].crossNic = graphs[a]->crossNic;
-
}
-
-
allGather3Data[rank].cpuArch = comm->cpuArch;
-
allGather3Data[rank].cpuVendor = comm->cpuVendor;
-
-
comm->nChannels = std::min(treeGraph->nChannels, ringGraph->nChannels);
-
NCCLCHECKGOTO(ncclTopoPreset(comm, graphs, &allGather3Data[rank].topoRanks), ret, fail);
上述代码的for循环用于初始化allGather3Data[rank].graphInfo,而ncclTopoPreset函数则用于初始化allGather3Data[rank].topoRanks。首先看一下topoRanks的结构体定义:
-
// Per-rank description of the intra-node topology, produced by ncclTopoPreset
// and gathered from every rank so that ncclTopoPostset can stitch the per-node
// pieces together. The ring*/tree* arrays are indexed by channel id.
struct ncclTopoRanks {
-
// Head of this node's ring segment for the channel (ringIntra[0]).
int ringRecv[MAXCHANNELS];
-
// Tail of this node's ring segment for the channel (ringIntra[localRanks-1]).
int ringSend[MAXCHANNELS];
-
// Rank preceding this rank inside the node's ring segment, or -1 if this
// rank is the head of the segment.
int ringPrev[MAXCHANNELS];
-
// Rank following this rank inside the node's ring segment, or -1 if this
// rank is the tail of the segment.
int ringNext[MAXCHANNELS];
-
// Tree endpoints for the channel. NOTE(review): the code that fills the three
// tree arrays is not part of the ncclTopoPreset excerpt shown in this
// article; presumably they come from the tree graph's intra order - confirm
// against upstream.
int treeToParent[MAXCHANNELS];
-
int treeToChild0[MAXCHANNELS];
-
int treeToChild1[MAXCHANNELS];
-
// Distinct NVLS head ranks (first entry of each NVLS channel's intra row).
// Only the first nvlsHeadNum entries are valid; indexed by head, not channel.
int nvlsHeads[MAXCHANNELS];
-
// Number of valid entries in nvlsHeads.
int nvlsHeadNum;
-
};
涉及以下信息:
• ringRecv[c]:当前rank所在的“部分 channel c”的头节点,也可以理解为所在机器的头节点rank;
• ringSend[c]:当前rank所在的“部分 channel c”的尾节点,也可以理解为所在机器的尾节点rank;
• ringPrev[c]:当前rank所在的“部分 channel c”中的前一个rank;
• ringNext[c]:当前rank所在的“部分 channel c”中的后一个rank。
其中第一个rank的prev和最后一个rank的next为-1。ncclTopoPreset同时还会将comm->channels进行初始化。我们只关注Ring算法的部分,其他代码省去。注意这里最后将搜索到的环复制了一遍,即channel数量在这里会翻倍,官方issue中对此的解释是为了进一步并行以充分利用带宽。
-
// Prepares this rank's channels and topology record before the cross-rank
// all-gather.
//
// For each of the comm->nChannels channels it (1) resets the ring, tree and
// collnet links of comm->channels[c] to "not connected" values, and (2) looks
// up this rank in the intra-node order found by the graph search and records
// the ring head/tail/prev/next in topoRanks. Afterwards it copies the first
// nChannels channel structs into the following nChannels slots and collects
// the distinct NVLS head ranks.
//
// comm      - communicator; reads rank, topo, MNNVL, clique, nChannels and
//             nRanks, writes channels[] and nvlsHeads.
// graphs    - search results indexed by NCCL_ALGO_*; each graph's `intra`
//             array is read as one row per channel.
// topoRanks - output record for this rank.
//
// Returns ncclSuccess; the visible code has no failure path.
//
// NOTE(review): this is an excerpt; the article states that the non-ring
// parts were left out, so treeIntra is computed but never read here and the
// treeTo* fields of topoRanks are not filled.
ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph** graphs, struct ncclTopoRanks* topoRanks) {
-
int rank = comm->rank;
-
// Number of GPU nodes in the local topology, used as the row length of the
// ring/tree/collnet intra arrays.
int localRanks = comm->topo->nodes[GPU].count;
-
// Row length of the NVLS intra array: the clique size when MNNVL is set,
// otherwise the same as localRanks.
int nvlsRanks = comm->MNNVL ? comm->clique.size : localRanks;
-
int nChannels = comm->nChannels;
-
-
topoRanks->nvlsHeadNum = 0;
-
for (int c=0; c<nChannels; c++) {
-
struct ncclChannel* channel = comm->channels+c;
-
// Reset every link of the channel; -1 marks "no peer".
channel->ring.prev = channel->ring.next = -1;
-
channel->tree.up = -1;
-
channel->collnetChain.up = -1;
-
for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->tree.down[i] = -1;
-
for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->collnetChain.down[i] = -1;
-
channel->collnetDirect.out = -1;
-
channel->collnetDirect.headRank = -1;
-
channel->collnetDirect.nHeads = 0;
-
channel->collnetDirect.shift = 0;
-
for (int i=0; i<NCCL_MAX_DIRECT_ARITY+1; i++) channel->collnetDirect.heads[i] = -1;
-
for (int i=0; i<NCCL_MAX_DIRECT_ARITY; i++) channel->collnetDirect.up[i] = -1;
-
for (int i=0; i<NCCL_MAX_DIRECT_ARITY; i++) channel->collnetDirect.down[i] = -1;
-
-
// Row c of each graph's intra array: the ordered ranks of this node for
// channel c.
int* ringIntra = graphs[NCCL_ALGO_RING]->intra+c*localRanks;
-
int* treeIntra = graphs[NCCL_ALGO_TREE]->intra+c*localRanks;
-
int* collNetIntra = graphs[NCCL_ALGO_COLLNET_CHAIN]->intra+c*localRanks;
-
-
// Find this rank's position i in each row.
for (int i=0; i<localRanks; i++) {
-
if (ringIntra[i] == rank) {
-
// Head and tail of the node-local ring segment are the same for every
// rank of the node; prev/next are -1 at the segment boundaries.
topoRanks->ringRecv[c] = ringIntra[0];
-
topoRanks->ringSend[c] = ringIntra[localRanks-1];
-
topoRanks->ringPrev[c] = (i == 0) ? -1 : ringIntra[i-1];
-
topoRanks->ringNext[c] = (i == localRanks-1) ? -1 : ringIntra[i+1];
-
}
-
if (collNetIntra[i] == rank) {
-
// The first rank of the chain gets comm->nRanks as its "up" peer, a value
// outside the valid rank range (presumably standing for the network
// side - TODO confirm); the last rank has no "down" peer.
channel->collnetChain.up = i == 0 ? comm->nRanks : collNetIntra[i-1];
-
channel->collnetChain.down[0] = i == localRanks-1 ? -1 : collNetIntra[i+1];
-
}
-
}
-
}
-
// Duplicate channels trees
-
// Copies channels [0, nChannels) into [nChannels, 2*nChannels).
// NOTE(review): assumes comm->channels has room for 2*nChannels entries.
struct ncclChannel* channel0 = comm->channels;
-
struct ncclChannel* channel1 = channel0+nChannels;
-
memcpy(channel1, channel0, nChannels*sizeof(struct ncclChannel));
-
-
// Get nvls heads and the number of heads. Duplicate head is not allowed.
-
for (int c = 0; c < graphs[NCCL_ALGO_NVLS]->nChannels; ++c) {
-
bool addHead = true;
-
int* nvlsIntra = graphs[NCCL_ALGO_NVLS]->intra + c * nvlsRanks;
-
-
// Linear scan over the heads collected so far; skip this channel's head if
// it is already present.
for (int dup = 0; dup < topoRanks->nvlsHeadNum; dup++) {
-
if (topoRanks->nvlsHeads[dup] == nvlsIntra[0]) {
-
addHead = false;
-
break;
-
}
-
}
-
if (addHead) {
-
topoRanks->nvlsHeads[topoRanks->nvlsHeadNum++] = nvlsIntra[0];
-
}
-
}
-
// Keep a local copy of the distinct heads on the communicator.
memcpy(comm->nvlsHeads, topoRanks->nvlsHeads, sizeof(int) * topoRanks->nvlsHeadNum);
-
-
return ncclSuccess;
-
}
回到initTransportsRank,接着通过bootstrapAllGather获取所有进程的图信息,并计算下面的数据结构。
• nodesFirstRank: 记录每个机器 node 上的第一个 rank 编号。
• nodesTreePatterns: 记录每个机器 node 上的树形拓扑的模式。
• rankToNode: 记录 rank 所属的机器 node。
• localRankToRank: 记录 local_rank 到全局 rank 的映射。
• localRank: 记录本进程的 local_rank。
• localRanks: 记录 local_rank 的数量。
如果各进程使用了不同架构或者不同厂商的 CPU,则(在 rank 0 上)打印提示信息以便于排查错误,因为在过去的实现中这种配置可能导致集合通信死锁。
-
NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, allGather3Data, sizeof(*allGather3Data)), ret, fail);
-
-
// Determine nNodes, firstRanks, ...
-
NCCLCHECKGOTO(ncclCalloc(&nodesFirstRank, nranks), ret, fail);
-
NCCLCHECKGOTO(ncclCalloc(&nodesTreePatterns, nranks), ret, fail);
-
NCCLCHECKGOTO(ncclCalloc(&comm->rankToNode, comm->nRanks), ret, fail);
-
for (int r=0; r<nranks; r++) {
-
int node;
-
int firstRank = allGather3Data[r].topoRanks.ringRecv[0];
-
for (node=0; node<comm->nNodes && nodesFirstRank[node] != firstRank; node++);
-
if (node == comm->nNodes) {
-
comm->nNodes++;
-
nodesFirstRank[node] = firstRank;
-
// Record tree pattern of each node as they can be different depending on sm arch
-
nodesTreePatterns[node] = allGather3Data[r].graphInfo[NCCL_ALGO_TREE].pattern;
-
}
-
comm->rankToNode[r] = node;
-
-
if (comm->cpuArch != allGather3Data[r].cpuArch &&
-
comm->cpuArch != NCCL_TOPO_CPU_ARCH_MIXED) {
-
comm->cpuArch = NCCL_TOPO_CPU_ARCH_MIXED;
-
}
-
if (comm->cpuVendor != allGather3Data[r].cpuVendor &&
-
comm->cpuVendor != NCCL_TOPO_CPU_VENDOR_MIXED) {
-
comm->cpuVendor = NCCL_TOPO_CPU_VENDOR_MIXED;
-
}
-
}
-
-
// Alert the user to the presence of mixed CPUs. In the past this has caused
-
// locks in some collective routines. This may help debug issues in the future.
-
if (rank==0) {
-
if (comm->cpuArch == NCCL_TOPO_CPU_ARCH_MIXED) {
-
INFO(NCCL_GRAPH, "CPUs with mixed architecture were detected.");
-
}
-
if (comm->cpuVendor == NCCL_TOPO_CPU_VENDOR_MIXED) {
-
INFO(NCCL_GRAPH, "CPUs with mixed vendors were detected.");
-
}
-
}
-
-
// Now that we know nNodes, alloc nodeRanks and compute localRanks for each node
-
NCCLCHECKGOTO(ncclCalloc(&comm->nodeRanks, comm->nNodes), ret, fail);
-
NCCLCHECKGOTO(ncclCalloc(&comm->rankToLocalRank, comm->nRanks), ret, fail);
-
for (int r=0; r<comm->nRanks; r++) {
-
int node = comm->rankToNode[r];
-
comm->rankToLocalRank[r] = comm->nodeRanks[node].localRanks;
-
comm->nodeRanks[node].localRanks++;
-
}
-
// Allocate ranks arrays for each node
-
for (int n=0; n<comm->nNodes; n++) {
-
NCCLCHECKGOTO(ncclCalloc(&comm->nodeRanks[n].localRankToRank, comm->nodeRanks[n].localRanks), ret, fail);
-
comm->maxLocalRanks = std::max(comm->maxLocalRanks, comm->nodeRanks[n].localRanks);
-
comm->nodeRanks[n].localRanks = 0;
-
}
-
// And fill the ranks arrays
-
for (int r=0; r<comm->nRanks; r++) {
-
int node = comm->rankToNode[r];
-
comm->nodeRanks[node].localRankToRank[comm->nodeRanks[node].localRanks++] = r;
-
}
-
comm->node = comm->rankToNode[rank];
-
comm->localRankToRank = comm->nodeRanks[comm->node].localRankToRank;
-
comm->localRank = comm->rankToLocalRank[rank];
-
comm->localRanks = comm->nodeRanks[comm->node].localRanks;
-
-
TRACE(NCCL_INIT,"hostHash[%d] %lx localRank %d localRanks %d localRank0 %d",
-
rank, comm->peerInfo[rank].hostHash, comm->localRank, comm->localRanks, comm->localRankToRank[0]);
-
if (comm->localRank == -1 || comm->localRankToRank[0] == -1 || comm->localRanks == 0) {
-
WARN("Failed to determine local ranks rank %d hostHash %lx pidHash %lx localRank %d localRanks %d localRank0 %d",
-
rank, comm->peerInfo[rank].hostHash, comm->peerInfo[rank].pidHash,
-
comm->localRank, comm->localRanks, comm->localRankToRank[0]);
-
ret = ncclInternalError;
-
goto fail;
-
}
-
-
INFO(NCCL_INIT, "comm %p rank %d nRanks %d nNodes %d localRanks %d localRank %d MNNVL %d",
-
comm, rank, comm->nRanks, comm->nNodes, comm->localRanks, comm->localRank, comm->MNNVL);
接着,我们对齐所有 rank 间的配置信息。
-
nChannelsOrig = comm->nChannels;
-
NCCLCHECKGOTO(ncclCalloc(&allTopoRanks, comm->nRanks), ret, fail);
-
for (int i=0; i<nranks; i++) {
-
allTopoRanks[i] = &allGather3Data[i].topoRanks;
-
// Make sure we align all ranks so that the tuning is consistent across ranks
-
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
-
graphs[a]->nChannels = std::min(allGather3Data[i].graphInfo[a].nChannels, graphs[a]->nChannels);
-
graphs[a]->sameChannels = std::min(allGather3Data[i].graphInfo[a].sameChannels, graphs[a]->sameChannels);
-
graphs[a]->bwIntra = std::min(allGather3Data[i].graphInfo[a].bwIntra, graphs[a]->bwIntra);
-
graphs[a]->bwInter = std::min(allGather3Data[i].graphInfo[a].bwInter, graphs[a]->bwInter);
-
graphs[a]->typeIntra = std::max(allGather3Data[i].graphInfo[a].typeIntra, graphs[a]->typeIntra);
-
graphs[a]->typeInter = std::max(allGather3Data[i].graphInfo[a].typeInter, graphs[a]->typeInter);
-
graphs[a]->crossNic = std::max(allGather3Data[i].graphInfo[a].crossNic, graphs[a]->crossNic);
-
}
-
}
随后,由于删除了一些 channels,因此需要重新设置副本 channels。
-
if (comm->nChannels < nChannelsOrig) {
-
// We started duplicating channels during Preset(), so we need to move the
-
// duplicated channels since we have removed some.
-
for (int i=0; i<comm->nChannels; i++) memcpy(comm->channels+comm->nChannels+i, comm->channels+nChannelsOrig+i, sizeof(struct ncclChannel));
-
}
之后,设置是否支持 CollNet。
-
// Determine CollNet support after all-gather now that we know nNodes and each node localRanks
-
if (comm->collNetSupport == 1) {
-
int collNetNodeThreshold = ncclParamCollNetNodeThreshold();
-
if (comm->nNodes < collNetNodeThreshold) {
-
INFO(NCCL_INIT, "Communicator has %d nodes which is less than CollNet node threshold %d, disabling CollNet", comm->nNodes, collNetNodeThreshold);
-
comm->collNetSupport = 0;
-
}
-
comm->collNetRegSupport = true;
-
for (int n=0; n<comm->nNodes; n++) {
-
if (comm->nodeRanks[n].localRanks > NCCL_MAX_DIRECT_ARITY+1) {
-
WARN("CollNet currently only supports up to %d GPUs per node, disabling CollNet", NCCL_MAX_DIRECT_ARITY+1);
-
comm->collNetSupport = 0;
-
break;
-
}
-
if (comm->nodeRanks[n].localRanks > 1) {
-
// As long as there is more than 1 rank on any node, we need to disable collnet reg
-
comm->collNetRegSupport = false;
-
}
-
}
-
}
完成上述操作后,我们调用 ncclTopoPostset 函数合并所有的 Ring、Tree 等拓扑。例如,对于 Ring 形的拓扑,依次连接各个子环,构成完整的环。至此第二次AllGather结束。
-
NCCLCHECKGOTO(ncclCalloc(&rings, nranks*MAXCHANNELS), ret, fail);
-
NCCLCHECKGOTO(ncclTopoPostset(comm, nodesFirstRank, nodesTreePatterns, allTopoRanks, rings, graphs, parent), ret, fail);
-
// AllGather3 - end
如下图所示,有两台机器,每台机器上有 4 张 GPU,rank 编号为 0~7。每台机器上的 rank 构建本机器上的部分环,然后通过 ncclTopoPostset 使两个部分环首尾相连,构建完整的环。
ncclTopoPostset将所有channel的prev,next,send,recv信息打平到数组中,例如ringRecv[0]表示第一个ring中node 0的recv是哪个rank(ringRecv/ringSend按node索引,ringPrev/ringNext按rank索引),然后开始计算当前机器第一个rank的prev和最后一个rank的next。
-
// Merges the per-rank topology records gathered from all ranks into complete
// rings and trees, duplicates the channels, applies the channel-count
// adjustments and limits, and finally builds the `rings` array.
//
// comm         - communicator; channels[], nChannels, collChannels and
//                nvlsChannels are updated.
// firstRanks   - indexed by node; the rank whose record represents that node.
// treePatterns - forwarded to connectTrees.
// allTopoRanks - indexed by rank; records produced by ncclTopoPreset on each
//                rank. May be modified in place (ring swap below).
// rings        - output, filled by ncclBuildRings.
// graphs       - search results indexed by NCCL_ALGO_*.
// parent       - parent communicator, may be null (it is null-checked).
//
// Returns ncclSuccess, or whatever NCCLCHECK propagates from a failing helper.
// NOTE(review): the temporary arrays are only freed at the end of the
// function; if NCCLCHECK returns early on error (presumably it does - confirm)
// they are not released on that path.
ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns, struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph** graphs, struct ncclComm* parent) {
-
// Gather data from all ranks
-
int *ringRecv, *ringSend, *ringPrev, *ringNext, *treeToParent, *treeToChild0, *treeToChild1, *nvlsHeads;
-
int nranks = comm->nRanks;
-
int nNodes = comm->nNodes;
-
int nChannels = comm->nChannels;
-
int minHeadNum = INT_MAX;
-
// Non-zero only when a parent exists with NVLS support and splitShare set.
int shared = parent && parent->nvlsSupport && parent->config.splitShare;
-
// Flattened tables: ringRecv/ringSend/treeTo*/nvlsHeads are laid out as
// [channel][node], ringPrev/ringNext as [channel][rank].
NCCLCHECK(ncclCalloc(&ringRecv, nNodes*MAXCHANNELS));
-
NCCLCHECK(ncclCalloc(&ringSend, nNodes*MAXCHANNELS));
-
NCCLCHECK(ncclCalloc(&ringPrev, nranks*MAXCHANNELS));
-
NCCLCHECK(ncclCalloc(&ringNext, nranks*MAXCHANNELS));
-
NCCLCHECK(ncclCalloc(&treeToParent, nNodes*MAXCHANNELS));
-
NCCLCHECK(ncclCalloc(&treeToChild0, nNodes*MAXCHANNELS));
-
NCCLCHECK(ncclCalloc(&treeToChild1, nNodes*MAXCHANNELS));
-
NCCLCHECK(ncclCalloc(&nvlsHeads, nNodes*MAXCHANNELS));
-
-
// Alternate rings to avoid crossing rails
-
// Only when the ring graph uses crossNic and the channel count is even: for
// every rank on an odd-numbered node, swap the records of channel pairs
// (c, c^1).
if (graphs[NCCL_ALGO_RING]->crossNic && (nChannels % 2) == 0) {
-
for (int r=0; r<comm->nRanks; r++) {
-
if (comm->rankToNode[r] % 2 == 1) {
-
// Exchange rings
-
for (int c=0; c<nChannels; c+=2) {
-
exchangeValues(allTopoRanks[r]->ringRecv+c, allTopoRanks[r]->ringRecv+(c^1));
-
exchangeValues(allTopoRanks[r]->ringSend+c, allTopoRanks[r]->ringSend+(c^1));
-
exchangeValues(allTopoRanks[r]->ringPrev+c, allTopoRanks[r]->ringPrev+(c^1));
-
exchangeValues(allTopoRanks[r]->ringNext+c, allTopoRanks[r]->ringNext+(c^1));
-
}
-
}
-
}
-
}
-
-
// Flatten the gathered records into the tables.
for (int c=0; c<nChannels;c++) {
-
// Per-node values are taken from the node's representative rank.
for (int n=0; n<nNodes; n++) {
-
int r = firstRanks[n];
-
ringRecv[c*nNodes+n] = allTopoRanks[r]->ringRecv[c];
-
ringSend[c*nNodes+n] = allTopoRanks[r]->ringSend[c];
-
treeToParent[c*nNodes+n] = allTopoRanks[r]->treeToParent[c];
-
treeToChild0[c*nNodes+n] = allTopoRanks[r]->treeToChild0[c];
-
treeToChild1[c*nNodes+n] = allTopoRanks[r]->treeToChild1[c];
-
}
-
// Per-rank values: intra-node prev/next (-1 at segment boundaries).
for (int r=0; r<nranks; r++) {
-
ringPrev[c*nranks+r] = allTopoRanks[r]->ringPrev[c];
-
ringNext[c*nranks+r] = allTopoRanks[r]->ringNext[c];
-
}
-
}
-
-
// Smallest number of NVLS heads reported by any node's representative rank.
for (int n = 0; n < nNodes; n++) {
-
int r = firstRanks[n];
-
if (minHeadNum > allTopoRanks[r]->nvlsHeadNum)
-
minHeadNum = allTopoRanks[r]->nvlsHeadNum;
-
}
-
-
// Collect only the first minHeadNum heads of every node, laid out as
// [head][node].
for (int c = 0; c < minHeadNum; c++) {
-
for (int n = 0; n < nNodes; n++) {
-
int r = firstRanks[n];
-
nvlsHeads[c * nNodes + n] = allTopoRanks[r]->nvlsHeads[c];
-
}
-
}
-
-
// Connect rings and trees. This should also duplicate the channels.
-
NCCLCHECK(connectRings(comm, ringRecv, ringSend, ringPrev, ringNext));
-
NCCLCHECK(connectTrees(comm, treeToParent, treeToChild0, treeToChild1, treePatterns));
-
-
// Duplicate ringPrev/ringNext for ncclBuildRing
-
// Copies rows [0, nChannels) to rows [nChannels, 2*nChannels).
// NOTE(review): the buffers hold MAXCHANNELS rows, so this assumes
// 2*nChannels <= MAXCHANNELS - confirm against callers.
memcpy(ringPrev+nChannels*nranks, ringPrev, nChannels*nranks*sizeof(int));
-
memcpy(ringNext+nChannels*nranks, ringNext, nChannels*nranks*sizeof(int));
-
-
// Set ring prev/next for my rank
-
// The same prev/next is written to channel c and to its duplicate
// c+nChannels.
for (int c=0; c<nChannels; c++) {
-
struct ncclChannel* channel0 = comm->channels+c;
-
struct ncclChannel* channel1 = channel0+nChannels;
-
channel0->ring.prev = channel1->ring.prev = ringPrev[c*nranks+comm->rank];
-
channel0->ring.next = channel1->ring.next = ringNext[c*nranks+comm->rank];
-
}
-
-
// Duplication should be complete now
-
// Channel count doubles, capped at MAXCHANNELS.
nChannels = comm->nChannels = std::min(MAXCHANNELS,nChannels*2);
-
-
// Setup CollNet
-
if (comm->collNetSupport == 1) {
-
struct ncclTopoGraph* collNetChainGraph = graphs[NCCL_ALGO_COLLNET_CHAIN];
-
// Add more channels to saturate intra-node bandwidth, except the 1 PPN case
-
// (nRanks > nNodes means at least one node hosts more than one rank.)
if (collNetChainGraph->bwIntra > collNetChainGraph->bwInter && comm->nRanks > comm->nNodes) {
-
// Request 1.5x the current channel count, capped at MAXCHANNELS.
// copyChannels is defined elsewhere; its return value is used as the new
// channel count.
int collNetNchannels = std::min(MAXCHANNELS, nChannels+nChannels/2);
-
nChannels = comm->nChannels = copyChannels(comm, nChannels, collNetNchannels, ringPrev, ringNext);
-
}
-
NCCLCHECK(connectCollNet(comm, graphs[NCCL_ALGO_COLLNET_DIRECT]));
-
}
-
-
// Use 4 compute channels per search channel to reach peak BW on <8 PPN
-
// Applies only when minCompCap is 90, there is more than one node, the ring
// graph's intra bandwidth exceeds 45.0 and fewer than 16 channels exist.
if (comm->minCompCap == 90 && comm->nNodes > 1 && graphs[NCCL_ALGO_RING]->bwIntra > 45.0 && nChannels < 16) {
-
nChannels = comm->nChannels = copyChannels(comm, nChannels, 2*nChannels, ringPrev, ringNext);
-
}
-
-
// Double the number of channels when using unpack networking (greater than 1 node)
-
// We won't automatically double past 16 channels, users can specify 32 if they want
-
if (comm->netDeviceType == NCCL_NET_DEVICE_UNPACK && comm->nNodes > 1 && nChannels < 16 && ncclParamUnpackDoubleNChannels()) {
-
nChannels = comm->nChannels = copyChannels(comm, nChannels, 2*nChannels, ringPrev, ringNext);
-
}
-
-
// Honor NCCL_MIN_NRINGS/NCCL_MAX_NRINGS.
-
// We permit combining max, then min, to only use the first channels, then duplicate them.
-
// In both branches the count is first clamped down to the maximum and then
// raised to the minimum through copyChannels.
if (comm->sharedRes->owner != comm) {
-
/* child comm #channels cannot exceed top parent #channels. */
-
nChannels = comm->nChannels = std::min(std::min(std::min(ncclMaxNchannels(), nChannels), comm->config.maxCTAs), comm->sharedRes->tpNChannels);
-
nChannels = comm->nChannels = copyChannels(comm, nChannels, std::min(std::max(ncclMinNchannels(), comm->config.minCTAs), comm->sharedRes->tpNChannels), ringPrev, ringNext);
-
} else {
-
nChannels = comm->nChannels = std::min(std::min(ncclMaxNchannels(), nChannels), comm->config.maxCTAs);
-
nChannels = comm->nChannels = copyChannels(comm, nChannels, std::max(ncclMinNchannels(), comm->config.minCTAs), ringPrev, ringNext);
-
}
-
-
// Remember the channel count before any NVLS-driven extension below.
comm->collChannels = comm->nChannels;
-
#if CUDART_VERSION >= 12010
-
// Support maximal channel usage for aggregation
-
// When sharing with the parent, do not use more NVLS channels than the
// parent's NVLS resources provide.
if (shared && comm->nvlsChannels > parent->nvlsResources->nChannels) {
-
comm->nvlsChannels = parent->nvlsResources->nChannels;
-
}
-
// Extend the channel set so that there are at least nvlsChannels channels.
if (comm->nChannels < comm->nvlsChannels) {
-
nChannels = comm->nChannels = copyChannels(comm, comm->nChannels, comm->nvlsChannels, ringPrev, ringNext);
-
}
-
NCCLCHECK(connectNvls(comm, nvlsHeads, minHeadNum));
-
#endif
-
// A sharing child never ends up with more channels than the parent's
// tpNChannels; collChannels is lowered accordingly.
if (shared && comm->nChannels > parent->sharedRes->tpNChannels) {
-
nChannels = comm->nChannels = parent->sharedRes->tpNChannels;
-
comm->collChannels = std::min(comm->collChannels, comm->nChannels);
-
}
-
-
// Create rings array and check all is fine
-
NCCLCHECK(ncclBuildRings(nChannels, rings, comm->rank, comm->nRanks, ringPrev, ringNext));
-
-
// Release the temporary tables.
free(ringRecv);
-
free(ringSend);
-
free(ringPrev);
-
free(ringNext);
-
free(treeToParent);
-
free(treeToChild0);
-
free(treeToChild1);
-
free(nvlsHeads);
-
-
return ncclSuccess;
-
}