NCCL Source Code Analysis 4: Channel Search
by lvyilong316
The previous article computed the optimal paths from each GPU and NIC node to every other node; this article looks at how NCCL searches for channels.
In NCCL, a channel represents a communication "channel". Note that a channel is not the same thing as a path. For example, we can say A has two paths to B, A-B and A-C-B, but A can have multiple channels to B that all run over the same path. It is a bit like making phone calls: the underlying wire (the path) is shared, but each conversation between two people is its own channel.
Channels exist to make better use of bandwidth and NICs, and to let the same block of data be transferred concurrently over multiple channels. Later we will see that each channel corresponds to a GPU stream and to one thread block of the kernel launch, which will make the picture clearer. For these reasons NCCL uses multiple channels, and the search process produces a set of channels.
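As a rough intuition only (a hypothetical helper, not NCCL's actual code), multi-channel transfer amounts to slicing one buffer so that each channel moves its own chunk concurrently:

#include <algorithm>
#include <cstddef>
#include <vector>

// Hypothetical illustration: split one buffer across nChannels so each
// channel (one stream / one kernel block in NCCL) moves its own slice.
struct Slice { size_t offset, bytes; };

std::vector<Slice> splitAcrossChannels(size_t totalBytes, int nChannels) {
  std::vector<Slice> slices(nChannels);
  size_t chunk = (totalBytes + nChannels - 1) / nChannels;
  for (int c = 0; c < nChannels; c++) {
    size_t off = (size_t)c * chunk;
    slices[c].offset = off;
    slices[c].bytes = off < totalBytes ? std::min(chunk, totalBytes - off) : 0;
  }
  return slices;
}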
Now let's return to initTransportsRank and continue with the channel-search logic. It first determines whether CollNet and NVLS are supported, then calls ncclTopoCompute to compute the Ring, Tree, and other topology graphs, optionally dumping them to a file. For background on NVLink SHARP (NVLS), see the NVLS issue.
  // Determine local CollNet support
  if (collNetSupport(comm)) {
    const char *collNetEnable = ncclGetEnv("NCCL_COLLNET_ENABLE");
    if (collNetEnable != NULL) {
      INFO(NCCL_ALL, "NCCL_COLLNET_ENABLE set by environment to %s.", collNetEnable);
      if (strcmp(collNetEnable, "1") == 0) {
        comm->collNetSupport = 1;
      }
    }
  }

  // Determine local Nvls support
  NCCLCHECK(ncclNvlsInit(comm));

  timers[TIMER_INIT_GRAPHS] = clockNano();
  // Get rings and trees
  memset(ringGraph, 0, sizeof(struct ncclTopoGraph));
  ringGraph->id = 0;
  ringGraph->pattern = NCCL_TOPO_PATTERN_RING;
  ringGraph->minChannels = 1;
  ringGraph->maxChannels = MAXCHANNELS/2;
  NCCLCHECKGOTO(ncclTopoCompute(comm->topo, ringGraph), ret, fail);
  NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, ringGraph), ret, fail);

  memset(treeGraph, 0, sizeof(struct ncclTopoGraph));
  treeGraph->id = 1;
  treeGraph->pattern = NCCL_TOPO_PATTERN_BALANCED_TREE;
  treeGraph->minChannels = ringGraph->nChannels;
  treeGraph->maxChannels = ringGraph->nChannels;
  NCCLCHECKGOTO(ncclTopoCompute(comm->topo, treeGraph), ret, fail);
  NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, treeGraph), ret, fail);

  memset(collNetChainGraph, 0, sizeof(struct ncclTopoGraph));
  collNetChainGraph->id = 2;
  collNetChainGraph->pattern = NCCL_TOPO_PATTERN_TREE;
  collNetChainGraph->collNet = 1;
  collNetChainGraph->minChannels = ringGraph->nChannels;
  collNetChainGraph->maxChannels = ringGraph->nChannels;

  memset(collNetDirectGraph, 0, sizeof(struct ncclTopoGraph));
  collNetDirectGraph->id = 2;
  collNetDirectGraph->pattern = NCCL_TOPO_PATTERN_COLLNET_DIRECT;
  collNetDirectGraph->collNet = 1;
  collNetDirectGraph->minChannels = 1;
  collNetDirectGraph->maxChannels = MAXCHANNELS;
  if (comm->collNetSupport) {
    NCCLCHECKGOTO(ncclTopoCompute(comm->topo, collNetChainGraph), ret, fail);
    NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, collNetChainGraph), ret, fail);
    NCCLCHECKGOTO(ncclTopoCompute(comm->topo, collNetDirectGraph), ret, fail);
    NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, collNetDirectGraph), ret, fail);
  }

  memset(nvlsGraph, 0, sizeof(struct ncclTopoGraph));
  nvlsGraph->id = 3;
  nvlsGraph->pattern = NCCL_TOPO_PATTERN_NVLS;
  nvlsGraph->minChannels = 1;
  nvlsGraph->maxChannels = MAXCHANNELS;
  if (comm->nvlsSupport) {
    NCCLCHECKGOTO(ncclTopoCompute(comm->topo, nvlsGraph), ret, fail);
    NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, nvlsGraph), ret, fail);
  }
  timers[TIMER_INIT_GRAPHS] = clockNano() - timers[TIMER_INIT_GRAPHS];

  // Initialize num P2P LL buffers for this communicator
  comm->allocP2pNetLLBuffers = ncclParamAllocP2pNetLLBuffers() == 1;

  if (comm->rank == ncclParamGraphDumpFileRank()) {
    struct ncclTopoGraph* dumpGraphs[5] = { ringGraph, treeGraph, collNetDirectGraph, collNetChainGraph, nvlsGraph };
    NCCLCHECKGOTO(ncclTopoDumpGraphs(comm->topo, 5, dumpGraphs), ret, fail);
  }
Now let's discuss ncclTopoCompute in detail. This function is the core of channel search: based on the graph->pattern passed in, it builds the channel information for Tree, Ring, and other topologies. For simplicity, the analysis below uses Ring (NCCL_TOPO_PATTERN_RING) as the example.
Running ncclTopoCompute is the actual channel-search process. The goal is to find as many channels as possible, each with as much bandwidth as possible. It is essentially a brute-force search: it first sets a strict set of conditions and searches for a solution; if none is found, it relaxes the conditions and searches again. Once a solution is found, it tightens the conditions again to see whether a better one can be found.
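The overall control flow can be summarized by this runnable toy model, which collapses all the conditions into just the per-channel speed (the pathBw value and the feasible() stand-in are assumptions for illustration, not NCCL code):

#include <cstdio>

// Toy model of ncclTopoCompute's two-pass strategy: pass 1 steps *down* a
// discrete speed ladder until something fits; pass 2 steps back *up* to
// check whether a better value also fits.
static bool feasible(float bwPerChannel, float pathBw, int minChannels) {
  return bwPerChannel * minChannels <= pathBw;  // stand-in for the real search
}

int main() {
  const float speedArray[] = { 40.0, 30.0, 20.0, 18.0, 15.0, 12.0, 10.0 };
  const int nspeeds = sizeof(speedArray)/sizeof(speedArray[0]);
  const float pathBw = 24.0;  // pretend the path only sustains 24 GB/s
  int speedIndex = 0;
  while (speedIndex < nspeeds-1 && !feasible(speedArray[speedIndex], pathBw, 1))
    speedIndex++;                       // pass 1: relax until something fits
  while (speedIndex > 0 && feasible(speedArray[speedIndex-1], pathBw, 1))
    speedIndex--;                       // pass 2: tighten again if possible
  printf("settled on %.1f GB/s per channel\n", speedArray[speedIndex]);
  return 0;
}

With that picture in mind, here is the real function: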
ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph) {
  int ngpus = system->nodes[GPU].count;
  int crossNic = (system->nodes[NET].count > 1) &&
    (graph->pattern == NCCL_TOPO_PATTERN_RING ||
     graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE ||
     graph->pattern == NCCL_TOPO_PATTERN_SPLIT_TREE) ? ncclParamCrossNic() : 0;
  // In a multi-node run system->nodes[NET].count > 1, so crossNic comes from
  // ncclParamCrossNic() (greater than 1 by default); since that value is not
  // equal to 1, graph->crossNic is nevertheless initialized to 0 here.
  graph->crossNic = crossNic == 1 ? 1 : 0;
  graph->bwIntra = graph->bwInter = 0;
  graph->latencyInter = 0;
  graph->typeIntra = ngpus == 1 ? PATH_LOC : PATH_NVL;
  graph->typeInter = PATH_PIX;
  graph->nChannels = 0;
  int trySameChannels = graph->pattern == NCCL_TOPO_PATTERN_NVLS ? 0 : 1;
  graph->sameChannels = trySameChannels;

  int cpuArch, cpuVendor, cpuModel;
  NCCLCHECK(ncclTopoCpuType(system, &cpuArch, &cpuVendor, &cpuModel));

  const char* str = ncclGetEnv("NCCL_GRAPH_FILE");
  if (str) {
    INFO(NCCL_ENV, "NCCL_GRAPH_FILE set by environment to %s", str);
    struct ncclXml* xml;
    NCCLCHECK(xmlAlloc(&xml, NCCL_GRAPH_XML_MAX_NODES));
    NCCLCHECK(ncclTopoGetXmlGraphFromFile(str, xml));
    int nChannels;
    NCCLCHECK(ncclTopoGetGraphFromXml(xml->nodes, system, graph, &nChannels));
    INFO(NCCL_GRAPH, "Search %d : %d channels loaded from XML graph", graph->id, nChannels);
    free(xml);
    if (graph->nChannels > 0) return ncclSuccess;
  }

  int ccMin;
  NCCLCHECK(ncclTopoGetCompCap(system, &ccMin, NULL));
  if (graph->pattern == NCCL_TOPO_PATTERN_NVLS && (system->nodes[NVS].count == 0 || ccMin < 90)) return ncclSuccess;
  // NVLS and COLLNET_DIRECT search must have ngpus heads at most.
  if (graph->pattern == NCCL_TOPO_PATTERN_NVLS || graph->pattern == NCCL_TOPO_PATTERN_COLLNET_DIRECT)
    graph->maxChannels = system->nodes[GPU].count;

  if (ngpus == 1) if (graph->pattern != NCCL_TOPO_PATTERN_RING) graph->pattern = NCCL_TOPO_PATTERN_TREE;

  if (system->nodes[NET].count == 0 && graph->pattern == NCCL_TOPO_PATTERN_NVLS) {
    // Force intra-node NVLS algorithm to pull evenly from all GPUs.
    graph->minChannels = graph->maxChannels = system->nodes[GPU].count;
  }

  struct ncclTopoGraph tmpGraph;
  memcpy(&tmpGraph, graph, sizeof(struct ncclTopoGraph));

  // First try crossnic, then decrease bw and finally increase bwIntra.
  int nspeeds = 0;
  float* speedArray = NULL;
  if (system->nodes[NET].count == 0) {
    nspeeds = ccMin >= 90 ? NSPEEDSINTRA_SM90 : NSPEEDSINTRA;
    speedArray = ccMin >= 90 ? sm90SpeedArrayIntra : speedArrayIntra;
  } else {
    nspeeds = ccMin >= 90 ? NSPEEDSINTER_SM90 : NSPEEDSINTER;
    speedArray = ccMin >= 90 ? sm90SpeedArrayInter : speedArrayInter;
  }
  int pass = 1;
  int speedIndex = 0;
  float maxBw = system->maxBw;
  float totalBw = system->totalBw;
  if (ngpus > 1 && graph->pattern != NCCL_TOPO_PATTERN_RING) totalBw *= ngpus*1.0/(ngpus-1);
  while ((speedArray[speedIndex] > maxBw || speedArray[speedIndex]*graph->minChannels > totalBw) && speedIndex < nspeeds-1) speedIndex++;
  tmpGraph.bwIntra = tmpGraph.bwInter = speedArray[speedIndex];
  int64_t globalTimeout = NCCL_SEARCH_GLOBAL_TIMEOUT;

search:
  int time = tmpGraph.sameChannels ? NCCL_SEARCH_TIMEOUT_SAMECHANNELS :
    tmpGraph.pattern == NCCL_TOPO_PATTERN_TREE ? NCCL_SEARCH_TIMEOUT_TREE : NCCL_SEARCH_TIMEOUT;
  tmpGraph.nChannels = 0;
  globalTimeout -= time;

  NCCLCHECK(ncclTopoSearchRec(system, &tmpGraph, graph, &time));
#if 0
  printf("Id %d Pattern %d, crossNic %d, Bw %g/%g, type %d/%d, channels %d-%d sameChannels %d -> nChannels %dx%g/%g %s\n", tmpGraph.id, tmpGraph.pattern, tmpGraph.crossNic, tmpGraph.bwInter, tmpGraph.bwIntra, tmpGraph.typeInter, tmpGraph.typeIntra, tmpGraph.minChannels, tmpGraph.maxChannels, tmpGraph.sameChannels, graph->nChannels, graph->bwInter, graph->bwIntra, time == 0 ? "TIMEOUT" : time == -1 ? "PERFECT" : "");
  for (int c=0; c<graph->nChannels; c++) {
    printf("%2d : ", c);
    for (int g=0; g<ngpus; g++) {
      printf("%d ", graph->intra[c*ngpus+g]);
    }
    printf("[%lx %lx]", graph->inter[c*2+0], graph->inter[c*2+1]);
    printf("\n");
  }
#endif
  // Optimal solution, stop here
  if (time == -1) goto done;
  if (graph->nChannels*graph->bwInter >= system->totalBw) goto done;

  if (pass == 1) {
    // First pass, we don't have a solution yet ; try other options

    // Try having different channels (except when going through AMD CPUs)
    if (tmpGraph.sameChannels == 1 &&
        !(cpuArch == NCCL_TOPO_CPU_ARCH_X86 && cpuVendor == NCCL_TOPO_CPU_VENDOR_AMD && tmpGraph.typeIntra == PATH_SYS)) {
      tmpGraph.sameChannels = 0;
      goto search;
    }
    tmpGraph.sameChannels = trySameChannels;

    if (time != -1) globalTimeout += time;
    else globalTimeout = NCCL_SEARCH_GLOBAL_TIMEOUT;
    if (globalTimeout < 0 && graph->nChannels) goto done;

    // Try a simpler tree
    if (ccMin >= 90 && tmpGraph.pattern == NCCL_TOPO_PATTERN_BALANCED_TREE) {
      tmpGraph.pattern = NCCL_TOPO_PATTERN_TREE;
      goto search;
    }
    tmpGraph.pattern = graph->pattern;

    int maxTypeIntra = system->nodes[NET].count > 0 ? tmpGraph.typeInter : PATH_SYS;
    if (tmpGraph.typeIntra < maxTypeIntra && (graph->nChannels == 0 || tmpGraph.typeIntra < graph->typeIntra)) {
      tmpGraph.typeIntra += 1; // starts at PATH_NVL; relax the constraint step by step
      goto search;
    }
    tmpGraph.typeIntra = ngpus == 1 ? PATH_LOC : PATH_NVL;

    if (system->nodes[NET].count > 0 && tmpGraph.typeInter < PATH_SYS && (graph->nChannels == 0 || tmpGraph.typeInter < graph->typeInter || tmpGraph.typeInter < PATH_PXN)) {
      tmpGraph.typeInter += 1; // starts at PATH_PIX; relax the constraint step by step
      goto search;
    }
    tmpGraph.typeInter = PATH_PIX;

    if (crossNic == 2 && tmpGraph.crossNic == 0) {
      // Try again with crossNic if permitted
      tmpGraph.crossNic = 1;
      goto search;
    }
    tmpGraph.crossNic = crossNic == 1 ? 1 : 0;

    // Decrease bw until we find a solution
    if ((speedIndex < nspeeds-1) && (graph->nChannels == 0 || (speedArray[speedIndex+1]/graph->bwInter > .49))) {
      tmpGraph.bwInter = tmpGraph.bwIntra = speedArray[++speedIndex];
      goto search;
    }
    speedIndex = 0;
    while (speedArray[speedIndex] > maxBw && speedIndex < nspeeds-1) speedIndex++;
    tmpGraph.bwIntra = tmpGraph.bwInter = speedArray[speedIndex];

  }

done:
  // We have a solution. Start from that solution and move to pass 2.
  if (pass == 1) {
    time = -1;
    NCCLCHECK(ncclTopoDupChannels(graph, ccMin, ngpus));
    memcpy(&tmpGraph, graph, sizeof(tmpGraph));
    speedIndex = 0;
    while (speedArray[speedIndex] > graph->bwInter && speedIndex < nspeeds-1) speedIndex++;
    tmpGraph.bwIntra = tmpGraph.bwInter = speedArray[speedIndex];
    tmpGraph.minChannels = graph->nChannels;
    pass = 2;
  }

  if (pass == 2) {
    // See if we can increase bw
    if (time != 0 && speedIndex > 0) {
      if (graph->pattern == NCCL_TOPO_PATTERN_RING) {
        // increase bw for Ring
        tmpGraph.bwIntra = tmpGraph.bwInter = speedArray[--speedIndex];
        goto search;
      } else if (graph->pattern == NCCL_TOPO_PATTERN_NVLS && tmpGraph.bwInter == graph->bwInter && tmpGraph.bwInter < tmpGraph.bwIntra*2) {
        tmpGraph.minChannels = tmpGraph.maxChannels = graph->nChannels;
        tmpGraph.bwInter = speedArray[--speedIndex];
        goto search;
      } else if (tmpGraph.bwIntra == graph->bwIntra && tmpGraph.bwIntra < tmpGraph.bwInter*2) {
        // increase bwIntra for trees (2 nodes or collnet)
        tmpGraph.bwIntra = speedArray[--speedIndex];
        goto search;
      }
    }
    time = -1;
    memcpy(&tmpGraph, graph, sizeof(tmpGraph));
  }

  if (graph->nChannels == 0 && graph->collNet == 0 && graph->pattern != NCCL_TOPO_PATTERN_NVLS) {
    WARN("Could not find a path for pattern %d, falling back to simple order", graph->pattern);
    for (int i=0; i<ngpus; i++) graph->intra[i] = system->nodes[GPU].nodes[i].gpu.rank;
    graph->inter[0] = graph->inter[1] = 0;
    graph->bwIntra = graph->bwInter = 0.1;
    graph->typeIntra = graph->typeInter = PATH_SYS;
    graph->nChannels = 1;
  }
  return ncclSuccess;
}
On one of my test machines, the topology is as follows: two CPU nodes (sockets), each connecting two GPUs through a PCIe switch, with the two GPUs under each socket linked to each other by NVLink.
We run a 2-rank all_reduce test with the following command; host_ip here is the IP of a NIC on my machine.
/usr/local/openmpi/bin/mpirun --allow-run-as-root -np 2 -H $host_ip:2 \
    -x NCCL_IB_GID_INDEX=3 \
    -x NCCL_DEBUG=TRACE \
    -x NCCL_IB_GID_INDEX=1 \
    -x NCCL_NET_GDR_LEVEL=2 \
    -x NCCL_IB_QPS_PER_CONNECTION=4 \
    -x LD_LIBRARY_PATH -x PATH \
    /root/nccl-tests/build/all_reduce_perf -b 32M -e 1G -i 1000 -f 2 -g 1
Since only two ranks are requested, the program picks the two NVLink-connected GPUs on the left side of the topology. By enabling the commented-out printf in ncclTopoCompute, we can see the final channel result: Pattern is 4, i.e. NCCL_TOPO_PATTERN_RING, and 6 channels are found in total (all GPU0->GPU1), each with 40 GB/s of bandwidth. Let's walk through the code to see how this search happens.
First, the search conditions are initialized. Since this is a single-node search, there are no NET nodes and crossNic is 0. The graph is then initialized with the strictest conditions: intra-node traffic (typeIntra) may only use paths no worse than PATH_NVL, and inter-node traffic (typeInter) only paths no worse than PATH_PIX. Then, depending on whether the search is intra-node or inter-node, the bandwidth array is set to speedArrayIntra or speedArrayInter, defined as follows:
float speedArrayIntra[] = { 40.0, 30.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0 };
float speedArrayInter[] = { 48.0, 30.0, 28.0, 24.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
The point of these arrays is to try only a limited set of discrete values that are known to matter in practice, which speeds up the search considerably. For more detail on the two arrays, see src/graph/search.cc in the NCCL source, where they are defined.
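Before the first search, ncclTopoCompute picks the starting index into the speed array by skipping entries that exceed the system's maxBw or whose total (speed x minChannels) exceeds totalBw. A minimal standalone rendition of that loop (the maxBw/totalBw values are assumptions for the 2-GPU NVLink case):

#include <cstdio>

int main() {
  // Same selection loop as in ncclTopoCompute, extracted for illustration.
  float speedArrayIntra[] = { 40.0, 30.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0 };
  int nspeeds = sizeof(speedArrayIntra)/sizeof(float);
  float maxBw = 240.0, totalBw = 240.0;  // assumed values for this machine
  int minChannels = 1, speedIndex = 0;
  while ((speedArrayIntra[speedIndex] > maxBw ||
          speedArrayIntra[speedIndex]*minChannels > totalBw) && speedIndex < nspeeds-1)
    speedIndex++;
  // 40.0 <= 240, so the search starts at 40 GB/s per channel.
  printf("starting speed: %.1f GB/s\n", speedArrayIntra[speedIndex]);
  return 0;
}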
Next, ncclTopoSearchRec is called to search for an answer, which is stored in a temporary tmpGraph. Note the pass variable, which starts at 1.
After the first call to ncclTopoSearchRec, the function checks whether the search can stop (graph->nChannels*graph->bwInter >= system->totalBw, or the search timed out). If not, it enters the if (pass == 1) branch, which progressively relaxes the search conditions: lowering typeIntra (intra-node) or typeInter (inter-node), lowering the bandwidth by advancing the index into speedArrayIntra / speedArrayInter, and enabling crossNic so a ring may enter and leave a node through different NICs.
Once all of these relaxations have been tried, speedIndex is reset according to the channels found so far, pass is set to 2, and execution enters the if (pass == 2) branch. This branch starts from the current solution and tries to raise the bandwidth step by step, looking for channels better than those found so far.
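Concretely, in the 2-rank NVLink case above (my reading of the trace): pass 1 starts at bwIntra = 40, the first entry of speedArrayIntra, with typeIntra limited to PATH_NVL, and immediately finds 6 channels x 40 GB/s. Since 6 x 40 = 240 GB/s already reaches system->totalBw, the loop exits through the nChannels*bwInter >= totalBw check; and because speedIndex is already 0, pass 2 has no higher speed to try, so the search settles on 6 channels at 40 GB/s.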
That is the channel search process. Each call to ncclTopoSearchRec is one search attempt; the function then calls itself recursively, but we will not expand the recursion details here.
Let's now approach the search from a few concrete scenarios to understand it better. First, why did our 2-rank case above produce the 6 channels we saw? To answer that, we need to see where channels get added during the search. I found the relevant logic here:
ncclTopoSearchRec -> ncclTopoSearchRecGpu -> ncclTopoSearchRecGpu. This code once puzzled me: in the snippet below, graph->nChannels++ is later followed by graph->nChannels--, so doesn't nChannels always end up unchanged? The key is the saveGraph parameter, which is where the final search result is saved, while graph is a temporary working copy. Every time a channel is completed, graph->nChannels++ runs, then ncclTopoCompareGraphs checks whether the current solution is better than the previous one; if so, the current channel is kept, and the memcpy copies the incremented nChannels into saveGraph. The graph->nChannels-- afterwards merely backtracks the temporary variable.
ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, struct ncclTopoNode* gpu, int step, int backToNet, int backToFirstRank, int forcedOrder, int *time) {
  if ((*time) <= 0) return ncclSuccess;
  (*time)--;

  int ngpus = system->nodes[GPU].count;
  if (step == ngpus) {
    // Determine whether we found a better solution or not
    int copy = 0;
    graph->nChannels++;
    NCCLCHECK(ncclTopoCompareGraphs(system, graph, saveGraph, &copy));
    if (copy) {
      memcpy(saveGraph, graph, sizeof(struct ncclTopoGraph));
      if (graph->nChannels == graph->maxChannels) *time = -1;
    }
    if (graph->nChannels < graph->maxChannels) {
      NCCLCHECK(ncclTopoSearchRec(system, graph, saveGraph, time));
    }
    graph->nChannels--;
    return ncclSuccess;
  }
  ……
}
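This is the classic depth-first backtracking idiom: extend the candidate, record it if it beats the best-so-far, recurse deeper, then undo the extension. A minimal generic sketch of the same pattern (not NCCL code):

#include <cstdio>
#include <vector>

// Generic backtracking skeleton mirroring the structure above: 'state' plays
// the role of graph (a working copy), 'best' the role of saveGraph.
static void search(std::vector<int>& state, std::vector<int>& best, int depth, int maxDepth) {
  if (depth == maxDepth) return;                 // analogous to the time/maxChannels cutoffs
  state.push_back(depth);                        // tentatively commit one more "channel"
  if (state.size() > best.size()) best = state;  // copy out only if it improves on the best
  search(state, best, depth + 1, maxDepth);      // try to stack another channel on top
  state.pop_back();                              // backtrack the working copy (the nChannels--)
}

int main() {
  std::vector<int> state, best;
  search(state, best, 0, 6);
  printf("best found: %zu channels\n", best.size());  // prints 6
  return 0;
}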
Now let's focus on the logic of ncclTopoCompareGraphs to see under what conditions a channel is accepted. A candidate solution is copied out when adding the current channel increases the total bandwidth, or keeps it equal while reducing the hop count.
ncclResult_t ncclTopoCompareGraphs(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* refGraph, int* copy) {
  // 1. Try to get the same nChannels between Rings and Trees
  if (graph->nChannels < graph->minChannels) return ncclSuccess;

  if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) { // NVLS channels correspond to GPUs pulling from NVLS. So the more the better.
    if (graph->nChannels > refGraph->nChannels && graph->nChannels <= system->nodes[GPU].count) *copy = 1;
    if (graph->nChannels*graph->bwInter > refGraph->nChannels*refGraph->bwInter) *copy = 1;
    return ncclSuccess;
  }
  // 2. Try to get better bandwidth
  if (graph->nChannels*graph->bwIntra > refGraph->nChannels*refGraph->bwIntra) {
    *copy = 1;
    return ncclSuccess;
  }
  if (graph->nChannels*graph->bwIntra < refGraph->nChannels*refGraph->bwIntra) return ncclSuccess;

  // 3. Less hops
  if (graph->pattern == refGraph->pattern && graph->crossNic == refGraph->crossNic && graph->nHops < refGraph->nHops) *copy = 1;
  return ncclSuccess;
}
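As a worked example with assumed numbers: suppose refGraph holds 5 channels at 40 GB/s (5 x 40 = 200) and the candidate graph now has 6 channels at 40 GB/s (6 x 40 = 240). Condition 2 fires (240 > 200), copy is set to 1, and the caller's memcpy makes the 6-channel solution the new reference. Had the aggregate bandwidth been equal, the candidate could still win under condition 3 by using fewer hops with the same pattern and crossNic setting.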
Finally, let's consider another question: why did the scenario above end up with exactly 6 channels? The channel count is determined by path->bw and graph->bwIntra (or graph->bwInter when NICs are involved): that is, how many channels of bandwidth graph->bwIntra can fit into the path's bandwidth (path->bw). The relevant logic is in ncclTopoFollowPath -> followPath: each channel consumes part of the path's bandwidth via SUB_ROUND(link->bw, fwBw), as shown in the code below.
static ncclResult_t followPath(struct ncclTopoLinkList* path, struct ncclTopoNode* start, int maxSteps, float bw, int* steps) {
  float pciBw = bw;
  for (int step=0; step<path->count; step++) {
    struct ncclTopoNode* node = path->list[step]->remNode;
    if (node->type == CPU) {
      // Account for P2P inefficiency through Intel CPU RC
      if (path->type == PATH_PHB && start->type == GPU &&
          node->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 &&
          node->cpu.vendor == NCCL_TOPO_CPU_VENDOR_INTEL) {
        pciBw = INTEL_P2P_OVERHEAD(bw);
      }
    }
  }

  struct ncclTopoNode* node = start;
  for (int step=0; step<maxSteps; step++) {
    struct ncclTopoLink* link = path->list[step];
    struct ncclTopoLink* revLink = NULL;
    float fwBw = link->type == LINK_PCI ? pciBw : bw;
    float revBw = 0;
    if (link->remNode->type == GPU && link->remNode->gpu.cudaCompCap < 80 && start->type != GPU) {
      if (revLink == NULL) NCCLCHECK(findRevLink(node, link->remNode, link->type, &revLink));
      revBw += fwBw/8;
    }
    if (link->remNode->type == CPU && link->remNode->cpu.arch == NCCL_TOPO_CPU_ARCH_POWER && link->type == LINK_NVL) {
      if (revLink == NULL) NCCLCHECK(findRevLink(node, link->remNode, link->type, &revLink));
      revBw += fwBw;
    }
    if (link->bw < fwBw || (revBw && revLink->bw < revBw)) { *steps = step; return ncclSuccess; }
    SUB_ROUND(link->bw, fwBw);
    if (revBw) SUB_ROUND(revLink->bw, revBw);
    node = link->remNode;
  }
  *steps = maxSteps;
  return ncclSuccess;
}
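Each completed channel deducts its bandwidth from every link along the path via SUB_ROUND, so a later channel fails as soon as a link has less than fwBw left. To the best of my knowledge SUB_ROUND is defined in NCCL's search code roughly as below (treat the exact form as an assumption); the tiny demo replays the 2-GPU example:

#include <cmath>
#include <cstdio>

// SUB_ROUND as defined in NCCL's search code (quoted from memory): subtract b
// from a, rounding to three decimals so repeated float subtraction doesn't drift.
#define SUB_ROUND(a, b) (a = roundf((a-b)*1000)/1000)

int main() {
  float linkBw = 240.0f;                                 // total NVLink bandwidth here
  for (int c = 0; c < 6; c++) SUB_ROUND(linkBw, 40.0f);  // six channels at 40 GB/s each
  printf("remaining: %g\n", linkBw);                     // 0: the path is fully consumed
  return 0;
}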
On my test machine the GPUs are A100s, and the path->bw for NVLink is read while building the topology, specifically in ncclTopoAddNvLinks: 12 NVLink links at 20 GB/s each, 240 GB/s in total:
nvlBw: 20.000000 (#define SM80_NVLINK_BW 20.0), gpu->gpu.cudaCompCap: 80, count: 12
Therefore path->bw is 240, and since the initial search bandwidth graph->bwIntra is set to the first element of speedArrayIntra, i.e. 40, the search ends up with 6 channels in total (240 / 40 = 6).
When we instead specify 4 ranks with a similar command, the search produces only a single channel, with just 10 GB/s of bandwidth.
That is because with 4 ranks, the GPU ring in our topology has to cross PCIe and UPI, and the PCIe bandwidth caps path->bw, which in turn limits the channel count to just one.
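Walking the same arithmetic as before (my interpretation; the exact PCIe numbers depend on the machine): pass 1 keeps stepping down speedArrayIntra (40, 30, 20, 18, 15, 12, ...) because the PCIe/UPI links cannot sustain a ring at those speeds, and first succeeds at 10 GB/s; the remaining path bandwidth cannot fit a second 10 GB/s ring, so the result is 1 channel x 10 GB/s.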