最近开发一个驱动时,测试中发现应用程序阻塞了,进一步排查后发现是阻塞在了驱动里。如何找出具体阻塞的位置呢?传统的方法是在驱动里加printk一步一步确认。但是这种方法有一些很明显的缺点:一是太麻烦,每次都得重新编译驱动、重新加载驱动,甚至要重启系统;二是如果这个bug很难复现或者需要很长时间才能复现,这种方法就不太可取了。接下来介绍一种方法来快速地定位问题的根源。
首先用strace -p tid确认一下该进程/线程是否确实阻塞了:如果阻塞了,就不会有连续的输出。
然后用ps加-o wchan看一下阻塞在了哪个函数:
#ps H -eo tid,pid,ppid,comm,wchan
1418 1396 1 captureThrFxn vpfe_private_handler
再加上n选项,以数值形式看一下阻塞位置的地址:
#ps nH -eo tid,pid,ppid,comm,wchan
1418 1396 1 captureThrFxn 73b54
73b54这个地址看起来很奇怪,应该不是一个完整的绝对地址。这个地址代表什么意思呢?接下来看看vpfe_private_handler这个函数在内核中的加载地址:
# grep vpfe_private_handler /proc/kallsyms
bf073ad0 t vpfe_private_handler [dm365_vpfe]
由此可以猜测,73b54其实是绝对地址bf073b54的低位部分(高位在ps的输出中应该是被截断了),也就是vpfe_private_handler的起始地址bf073ad0再偏移0x84的位置(0xbf073b54 - 0xbf073ad0 = 0x84)。
最后我们用arm-none-linux-gnueabi-objdump将相应的模块反汇编:
# arm-none-linux-gnueabi-objdump -d /workspace/DM36x_IPCAM/driver/720p_hdr/vpfe/dm365_vpfe.ko //最好不要strip模块,否则看不出来函数的分界,不过不会影响偏移
...........
00000ad0 <vpfe_private_handler>:
ad0: e1a0c00d mov ip, sp
ad4: e92dd9f0 push {r4, r5, r6, r7, r8, fp, ip, lr, pc}
ad8: e24cb004 sub fp, ip, #4 ; 0x4
adc: e24dd00c sub sp, sp, #12 ; 0xc
ae0: e1a04002 mov r4, r2
ae4: e1a07003 mov r7, r3
ae8: ebfffffe bl 0
aec: e2800008 add r0, r0, #8 ; 0x8
af0: ebfffffe bl 0
af4: e59f3200 ldr r3, [pc, #512] ; cfc
af8: e1a05000 mov r5, r0
afc: e1540003 cmp r4, r3
b00: 0a000023 beq b94
b04: ca000003 bgt b18
b08: e59f31f0 ldr r3, [pc, #496] ; d00
b0c: e1540003 cmp r4, r3
b10: 1a000075 bne cec
b14: ea00006f b cd8
b18: e59f31e4 ldr r3, [pc, #484] ; d04
b1c: e1540003 cmp r4, r3
b20: 0a000003 beq b34
b24: e59f31dc ldr r3, [pc, #476] ; d08
b28: e1540003 cmp r4, r3
b2c: 1a00006e bne cec
b30: ea000027 b bd4
b34: e2804e35 add r4, r0, #848 ; 0x350
b38: e1a00004 mov r0, r4
b3c: ebfffffe bl 0
b40: e2506000 subs r6, r0, #0 ; 0x0
b44: 1a00000f bne b88
b48: e1a00005 mov r0, r5
b4c: e5a06368 str r6, [r0, #872]!
b50: ebfffffe bl 0 <wait_for_completion_interruptible> // actually block here
b54: e3500000 cmp r0, #0 ; 0x0 // block here
b58: 0a000003 beq b6c
b5c: e1a01000 mov r1, r0
b60: e59f01a4 ldr r0, [pc, #420] ; d0c
b68: ea000004 b b80
b6c: e1a01007 mov r1, r7
b70: e5950044 ldr r0, [r5, #68]
b74: e2872010 add r2, r7, #16 ; 0x10
b78: ebfffffe bl 0
b7c: e1a06000 mov r6, r0
b80: e1a00004 mov r0, r4
b84: ebfffffe bl 0
b88: e1a00005 mov r0, r5
b8c: ebfffd1b bl 0
b90: ea000056 b cf0
b94: e5972000 ldr r2, [r7]
b98: e5903978 ldr r3, [r0, #2424]
b9c: e1520003 cmp r2, r3
ba0: 2a000051 bcs cec
ba4: e3a03030 mov r3, #48 ; 0x30
ba8: e02c0293 mla ip, r3, r2, r0
bac: e1a04007 mov r4, r7
bb0: e28ccfdd add ip, ip, #884 ; 0x374
bb4: e8bc000f ldm ip!, {r0, r1, r2, r3}
bb8: e8a4000f stmia r4!, {r0, r1, r2, r3}
bbc: e8bc000f ldm ip!, {r0, r1, r2, r3}
bc0: e8a4000f stmia r4!, {r0, r1, r2, r3}
bc4: e3a06000 mov r6, #0 ; 0x0
bc8: e89c000f ldm ip, {r0, r1, r2, r3}
bcc: e884000f stm r4, {r0, r1, r2, r3}
bd0: ea000046 b cf0
bd4: e10f8000 mrs r8, CPSR
bd8: e3883080 orr r3, r8, #128 ; 0x80
bdc: e121f003 msr CPSR_c, r3
be0: e5976000 ldr r6, [r7]
be4: e59039a4 ldr r3, [r0, #2468]
be8: e3560000 cmp r6, #0 ; 0x0
bec: 0a000014 beq c44
bf0: e3530000 cmp r3, #0 ; 0x0
bf4: 1a00000c bne c2c
bf8: e590312c ldr r3, [r0, #300]
bfc: e1a00007 mov r0, r7
c00: e58539a8 str r3, [r5, #2472]
c04: ebfffd8e bl 244
c08: e5973000 ldr r3, [r7]
c0c: e59f10fc ldr r1, [pc, #252] ; d10
c10: e58539a4 str r3, [r5, #2468]
c14: e59f00f8 ldr r0, [pc, #248] ; d14
c18: ea00002a b cc8
c1c: e59f00f4 ldr r0, [pc, #244] ; d18
c24: e59f00f0 ldr r0, [pc, #240] ; d1c
c28: ebfffffe bl 0 <__const_udelay>
c2c: ebfffd3c bl 124
c30: e3500000 cmp r0, #0 ; 0x0
c34: 1afffff8 bne c1c
c38: e1a00007 mov r0, r7
c3c: ebfffd80 bl 244
c40: ea000021 b ccc
c44: e3530000 cmp r3, #0 ; 0x0
c48: 0a00001f beq ccc
c4c: e59009a8 ldr r0, [r0, #2472]
c50: ebfffffe bl 0
c54: e59539a8 ldr r3, [r5, #2472]
c58: e1a01000 mov r1, r0
c5c: e5932000 ldr r2, [r3]
c60: e24b402c sub r4, fp, #44 ; 0x2c
c64: e59f00b4 ldr r0, [pc, #180] ; d20
c6c: e1a00004 mov r0, r4
c70: ebfffffe bl 0
c74: e8940003 ldm r4, {r0, r1}
c78: e59539a8 ldr r3, [r5, #2472]
c7c: e2833040 add r3, r3, #64 ; 0x40
c80: e8830003 stm r3, {r0, r1}
c84: e59529a8 ldr r2, [r5, #2472]
c88: e3a03004 mov r3, #4 ; 0x4
c8c: e5823020 str r3, [r2, #32]
c90: e59509a8 ldr r0, [r5, #2472]
c94: e5953150 ldr r3, [r5, #336]
c98: e3a01001 mov r1, #1 ; 0x1
c9c: e5803014 str r3, [r0, #20]
ca0: e59509a8 ldr r0, [r5, #2472]
ca4: e1a02001 mov r2, r1
ca8: e1a03006 mov r3, r6
cac: e2800034 add r0, r0, #52 ; 0x34
cb0: ebfffffe bl 0 <__wake_up>
cb4: e58569a8 str r6, [r5, #2472]
cb8: e5973000 ldr r3, [r7]
cbc: e59f0060 ldr r0, [pc, #96] ; d24
cc0: e59f1048 ldr r1, [pc, #72] ; d10
cc4: e58539a4 str r3, [r5, #2468]
ccc: e2180080 ands r0, r8, #128 ; 0x80
cd0: 0a000002 beq ce0
cd4: e121f008 msr CPSR_c, r8
cd8: e3a06000 mov r6, #0 ; 0x0
cdc: ea000003 b cf0
ce0: e121f008 msr CPSR_c, r8
ce4: e1a06000 mov r6, r0
ce8: ea000000 b cf0
cec: e3e06015 mvn r6, #21 ; 0x15
cf0: e1a00006 mov r0, r6
cf4: e24bd020 sub sp, fp, #32 ; 0x20
cf8: e89da9f0 ldm sp, {r4, r5, r6, r7, r8, fp, sp, pc}
cfc: c03056c1 .word 0xc03056c1
d00: 843056c6 .word 0x843056c6
d04: 402056c0 .word 0x402056c0
d08: 443056c5 .word 0x443056c5
d0c: 000001d4 .word 0x000001d4
d10: 00000340 .word 0x00000340
d14: 00000201 .word 0x00000201
d18: 0000022c .word 0x0000022c
d1c: 00a3d6f8 .word 0x00a3d6f8
d20: 0000024c .word 0x0000024c
d24: 0000026e .word 0x0000026e
..........
b54这个偏移(0xad0 + 0x84)应该是pc的值,也就是b50处bl调用返回后将要执行的下一条指令的地址,而实际正在执行(阻塞)的代码是b50处的那次调用。可以看出这里调用了wait_for_completion_interruptible,在等待其他地方来唤醒它。好了,阻塞的根源找到了,剩下的任务就是去看看到底是谁、在哪里唤醒它,并分析一下为什么这次没能唤醒。