go func...会被编译成newproc方法,我们可以随便写一个测试文件编译后用gdb打开
(gdb) b runtime.newproc
Breakpoint 5 at 0x10303c0: file /usr/local/homebrew/Cellar/go@1.9/1.9.6/libexec/src/runtime/proc.go, line 2929.
// newproc is the runtime function that a `go` statement is compiled into
// (see the surrounding notes). siz is the size in bytes of the arguments
// passed to fn, and fn is the function value the new goroutine will run.
func newproc(siz int32, fn *funcval) {
// argp is the address one pointer-size past the fn parameter slot, i.e. where
// the arguments for fn are expected to start (assumes add is pointer+offset).
argp := add(unsafe.Pointer(&fn), sys.PtrSize)
// Capture the caller's pc; it is forwarded to newproc1 as callerpc.
pc := getcallerpc(unsafe.Pointer(&siz))
// Do the actual creation inside systemstack; nret is passed as 0.
systemstack(func() {
newproc1(fn, (*uint8)(argp), siz, 0, pc)
})
}
// funcval is the header of a function value. fn holds the code address;
// newproc1 records it as the new goroutine's startpc and gostartcallfn
// installs it as the pc to resume at.
type funcval struct {
fn uintptr
// variable-size, fn-specific data here
}
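golang(1.9)的参数调用方式和C差不多,都是按参数从右到左的顺序放到栈上(第一个参数在最低地址),所以siz是第一个参数,fn是第二个参数;栈上紧跟在fn之后的是要传给fn的实参,它们是变长的,总字节数由siz确定,这也是newproc里用 add(unsafe.Pointer(&fn), sys.PtrSize) 取argp的原因。所以我们的代码 go func... 最终会被编译器转换成 call newproc,并且会把参数拷贝到调用栈上,fn加上其后的参数就包含了g执行所需的全部上下文(方法指针[IP]和方法参数)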
还有一个有意思的地方是取调用方的pc:汇编中执行call指令时,cpu会把下一条指令的地址(也就是方法的返回地址)压入栈中。
systemstack是让M使用g0堆栈,我们已经知道M执行的时候会从G中恢复出堆栈执行,但做一些系统任务比如这儿的新建proc的时候如果再用G的堆栈就不合适了,所以golang设计的时候每个M都分配一个g0堆栈,这一点我觉得应该是模仿操作系统的:x86 cpu有4个特权级(ring 0~3),linux只使用0和3,用户态在3级,当系统调用陷入内核的时候会切换到0级,并且把堆栈也切换到内核堆栈。
继续newproc1
// newproc1 builds a runnable goroutine for fn and queues it on the current P.
// It copies narg bytes of arguments from argp onto the new goroutine's stack,
// records callerpc as the goroutine's gopc, and returns the new g.
// Author's note: the return value is ignored, since a goroutine never returns.
func newproc1(fn *funcval, argp *uint8, narg int32, nret int32, callerpc uintptr) *g {
_g_ := getg()
if fn == nil {
_g_.m.throwing = -1 // do not dump full stacks
throw("go of nil func value")
}
_g_.m.locks++ // disable preemption because it can be holding p in a local var
// Round the combined argument/result size up to a multiple of 8 bytes.
siz := narg + nret
siz = (siz + 7) &^ 7
// We could allocate a larger initial stack if necessary.
// Not worth it: this is almost always an error.
// 4*sizeof(uintreg): extra space added below
// sizeof(uintreg): caller's LR (arm) or return address (x86, in gostartcall).
if siz >= _StackMin-4*sys.RegSize-sys.RegSize {
throw("newproc: function arguments too large for new goroutine")
}
// Take a free G from the P; if there is none, allocate a new one.
_p_ := _g_.m.p.ptr()
newg := gfget(_p_)
if newg == nil {
newg = malg(_StackMin)
casgstatus(newg, _Gidle, _Gdead)
allgadd(newg) // publishes with a g->status of Gdead so GC scanner doesn't look at uninitialized stack.
}
// Sanity checks: the G must have a stack and must be in the _Gdead state.
if newg.stack.hi == 0 {
throw("newproc1: newg missing stack")
}
if readgstatus(newg) != _Gdead {
throw("newproc1: new g is not Gdead")
}
totalSize := 4*sys.RegSize + uintptr(siz) + sys.MinFrameSize // extra space in case of reads slightly beyond frame
totalSize += -totalSize & (sys.SpAlign - 1) // align to spAlign
// Carve the initial frame out of the top of the new G's stack.
sp := newg.stack.hi - totalSize
spArg := sp
if usesLR {
// caller's LR
*(*uintptr)(unsafe.Pointer(sp)) = 0
prepGoExitFrame(sp)
spArg += sys.MinFrameSize
}
if narg > 0 {
// Copy the arguments onto the new G's stack.
memmove(unsafe.Pointer(spArg), unsafe.Pointer(argp), uintptr(narg))
// This is a stack-to-stack copy. If write barriers
// are enabled and the source stack is grey (the
// destination is always black), then perform a
// barrier copy. We do this *after* the memmove
// because the destination stack may have garbage on
// it.
if writeBarrier.needed && !_g_.m.curg.gcscandone {
f := findfunc(fn.fn)
stkmap := (*stackmap)(funcdata(f, _FUNCDATA_ArgsPointerMaps))
// We're in the prologue, so it's always stack map index 0.
bv := stackmapdata(stkmap, 0)
bulkBarrierBitmap(spArg, spArg, uintptr(narg), 0, bv.bytedata)
}
}
// Reset the saved scheduling context, then fill it in so that the G starts
// at fn and "returns" into goexit.
memclrNoHeapPointers(unsafe.Pointer(&newg.sched), unsafe.Sizeof(newg.sched))
newg.sched.sp = sp
newg.stktopsp = sp
newg.sched.pc = funcPC(goexit) + sys.PCQuantum // +PCQuantum so that previous instruction is in same function
newg.sched.g = guintptr(unsafe.Pointer(newg))
// gostartcallfn swaps sched.pc for fn.fn, the address where the function
// starts executing, and stashes the old pc (the goexit address) in lr.
// Author's note: the reason for this was unclear when this was written; it
// is revisited in the follow-up discussion of gostartcall later in the notes.
gostartcallfn(&newg.sched, fn)
newg.gopc = callerpc
newg.startpc = fn.fn
// Inherit the labels of the creating goroutine, if there is one.
if _g_.m.curg != nil {
newg.labels = _g_.m.curg.labels
}
if isSystemGoroutine(newg) {
atomic.Xadd(&sched.ngsys, +1)
}
newg.gcscanvalid = false
casgstatus(newg, _Gdead, _Grunnable)
// Refill this P's cache of goroutine ids when it is exhausted.
if _p_.goidcache == _p_.goidcacheend {
// Sched.goidgen is the last allocated id,
// this batch must be [sched.goidgen+1, sched.goidgen+GoidCacheBatch].
// At startup sched.goidgen=0, so main goroutine receives goid=1.
_p_.goidcache = atomic.Xadd64(&sched.goidgen, _GoidCacheBatch)
_p_.goidcache -= _GoidCacheBatch - 1
_p_.goidcacheend = _p_.goidcache + _GoidCacheBatch
}
newg.goid = int64(_p_.goidcache)
_p_.goidcache++
if raceenabled {
newg.racectx = racegostart(callerpc)
}
if trace.enabled {
traceGoCreate(newg, newg.startpc)
}
// The G is fully set up; put it on the P's run queue.
runqput(_p_, newg, true)
// If there is an idle P (and no spinning M, and main has started), try to wake an M.
if atomic.Load(&sched.npidle) != 0 && atomic.Load(&sched.nmspinning) == 0 && mainStarted {
wakep()
}
_g_.m.locks--
if _g_.m.locks == 0 && _g_.preempt { // restore the preemption request in case we've cleared it in newstack
_g_.stackguard0 = stackPreempt
}
return newg
}
// gostartcallfn is the place mentioned above where pc is replaced with the
// real fn.fn (author's note: the reason for this design was not yet understood
// at this point). It picks the entry address for fv — fv.fn, or the address of
// nilfunc when fv is nil — and passes it to gostartcall with fv as the context.
func gostartcallfn(gobuf *gobuf, fv *funcval) {
var fn unsafe.Pointer
if fv != nil {
fn = unsafe.Pointer(fv.fn)
} else {
fn = unsafe.Pointer(funcPC(nilfunc))
}
gostartcall(gobuf, fn, unsafe.Pointer(fv))
}
// gostartcall (link-register variant) arranges for buf to resume at fn with
// the previous buf.pc acting as the return address: the old pc is moved into
// buf.lr, buf.pc becomes fn, and buf.ctxt becomes ctxt.
// It throws if buf.lr is already set.
func gostartcall(buf *gobuf, fn, ctxt unsafe.Pointer) {
if buf.lr != 0 {
throw("invalid use of gostartcall")
}
// The old pc becomes the "return address" held in the link register.
buf.lr = buf.pc
buf.pc = uintptr(fn)
buf.ctxt = ctxt
}
总结下go在go func中干了啥
- 编译器会转成newproc
- newproc中从调用栈中取出执行方法地址,调用参数,调用者pc
- 然后从P中取一个空闲G没有就创建一个newG
- 对newG赋值,主要是赋值G的sched,它保存了G的调用上下文,有方法地址,参数,栈地址,下次M从P中取到这个G的时候运行就会从上下文中恢复到寄存器中从而完成用户态协程(goroutine)的切换
- 最后结束的时候会尝试唤醒M
M绑定一个P后就会一直从P的G队列中取出G来执行,那我们就来看看是怎么实现G的切换的,这也是用户态协程的核心了
(gdb) b runtime.execute
Breakpoint 1 at 0x102d7a0: file /usr/local/homebrew/Cellar/go@1.9/1.9.6/libexec/src/runtime/proc.go, line 1896.
// execute switches the current M to run gp: it moves gp from _Grunnable to
// _Grunning, binds gp and the M to each other, and finally jumps into gp via
// gogo(&gp.sched). When inheritTime is false the P's schedtick is incremented.
func execute(gp *g, inheritTime bool) {
_g_ := getg()
casgstatus(gp, _Grunnable, _Grunning)
// Reset per-run state on gp before it starts running.
gp.waitsince = 0
gp.preempt = false
gp.stackguard0 = gp.stack.lo + _StackGuard
if !inheritTime {
_g_.m.p.ptr().schedtick++
}
// Link the M and the goroutine to each other.
_g_.m.curg = gp
gp.m = _g_.m
// Check whether the profiler needs to be turned on or off.
hz := sched.profilehz
if _g_.m.profilehz != hz {
setThreadCPUProfiler(hz)
}
if trace.enabled {
// GoSysExit has to happen when we have a P, but before GoStart.
// So we emit it here.
if gp.syscallsp != 0 && gp.sysblocktraced {
traceGoSysExit(gp.sysexitticks)
}
traceGoStart()
}
// Restore gp's saved context and jump to it.
gogo(&gp.sched)
}
(gdb) b runtime.gogo
Breakpoint 2 at 0x104e150: file /usr/local/homebrew/Cellar/go@1.9/1.9.6/libexec/src/runtime/asm_amd64.s, line 228.
// gogo restores the execution state saved in a gobuf (g, SP, ret, ctxt, BP)
// and jumps to the gobuf's saved pc. It takes a single argument: the gobuf pointer.
TEXT runtime·gogo(SB), NOSPLIT, $16-8
MOVQ buf+0(FP), BX // gobuf
// If ctxt is not nil, invoke deletion barrier before overwriting.
MOVQ gobuf_ctxt(BX), AX
TESTQ AX, AX
JZ nilctxt
// Call writebarrierptr_prewrite(&gobuf.ctxt, 0).
LEAQ gobuf_ctxt(BX), AX
MOVQ AX, 0(SP)
MOVQ $0, 8(SP)
CALL runtime·writebarrierptr_prewrite(SB)
// Reload the gobuf pointer after the call.
MOVQ buf+0(FP), BX
nilctxt:
MOVQ gobuf_g(BX), DX
MOVQ 0(DX), CX // make sure g != nil
// Install gobuf.g as the current g in thread-local storage.
get_tls(CX)
MOVQ DX, g(CX)
MOVQ gobuf_sp(BX), SP // restore SP
MOVQ gobuf_ret(BX), AX
MOVQ gobuf_ctxt(BX), DX
MOVQ gobuf_bp(BX), BP
MOVQ $0, gobuf_sp(BX) // clear to help garbage collector
MOVQ $0, gobuf_ret(BX)
MOVQ $0, gobuf_ctxt(BX)
MOVQ $0, gobuf_bp(BX)
// Jump (not call) to the saved pc, so no return address is pushed.
MOVQ gobuf_pc(BX), BX
JMP BX
(gdb) b runtime.writebarrierptr_prewrite
Breakpoint 3 at 0x100f8d0: file /usr/local/homebrew/Cellar/go@1.9/1.9.6/libexec/src/runtime/mbarrier.go, line 225.
// writebarrierptr_prewrite is called before the pointer slot *dst is
// overwritten with src (gogo calls it on gobuf.ctxt with src = 0).
// It runs the cgo check when writeBarrier.cgo is set, returns early when no
// write barrier is needed, throws on a non-zero src below minPhysPageSize,
// and otherwise delegates to writebarrierptr_prewrite1.
func writebarrierptr_prewrite(dst *uintptr, src uintptr) {
if writeBarrier.cgo {
cgoCheckWriteBarrier(dst, src)
}
if !writeBarrier.needed {
return
}
// A non-zero value smaller than the minimum physical page size is treated as a bad pointer.
if src != 0 && src < minPhysPageSize {
systemstack(func() { throw("bad pointer in write barrier") })
}
writebarrierptr_prewrite1(dst, src)
}
其实关键就在gogo这个汇编编写的方法了,这种直接操作寄存器的应该只有用汇编才能实现了,其实懂的话很简单,就是从g的sched中恢复出对应的上下文到sp,ax,dx,bp寄存器,然后跳转到sched中保存的pc处继续执行
补充:
前面说初始化G的时候gostartcall这个方法有点莫名其妙,先是在外面把pc赋值为G的退出方法,然后在这个方法内部又把pc重新指向对应的fn也就是真正执行的方法地址。今天在linux机器上操作的时候,用gdb查出来gostartcall是指向sys_x86.go中的
// gostartcall (x86 variant, from sys_x86.go per the notes) arranges for buf
// to resume at fn with the previous buf.pc acting as the return address:
// the old pc is written onto buf's stack, buf.sp is lowered to match,
// buf.pc becomes fn, and buf.ctxt becomes ctxt.
func gostartcall(buf *gobuf, fn, ctxt unsafe.Pointer) {
sp := buf.sp
// When registers are wider than pointers, first write one zeroed
// pointer-sized word (presumably padding for the return-address slot).
if sys.RegSize > sys.PtrSize {
sp -= sys.PtrSize
*(*uintptr)(unsafe.Pointer(sp)) = 0
}
// Push the old pc as if a CALL instruction had stored a return address.
sp -= sys.PtrSize
*(*uintptr)(unsafe.Pointer(sp)) = buf.pc
buf.sp = sp
buf.pc = uintptr(fn)
buf.ctxt = ctxt
}
仔细看了下跟arm的区别,并且搜索了下差异才明白这个方法的真正含义。其实这是处理统一的返回,首先这里有个大前提,就是当M取出一个G然后从G的sched中恢复出上下文后,是不能直接call fn这样调用进入的,因为这样直接call,cpu会把下一条指令压入栈中,但这样是错的,M希望G执行后ret指令返回到继续调度的地方重新执行,所以go也使用了内核类似的jmp指令的方式跳转到fn的地方,并手动往栈中压入了退出点pc,也就是funcPC(goexit) + sys.PCQuantum。但只是x86规定把返回地址放在栈中,像别的平台就不太一样了,比如arm中就规定放在lr寄存器中,所以go给每个平台都定义了gostartcall,这样就明白了为什么需要一个gostartcall方法了!
那么就趁热打铁看看goexit到底干了啥
// funcPC returns the uintptr reached by double-dereferencing the second word
// of the interface value f. For a func value this is presumably its entry PC
// (NOTE(review): relies on the interface and func value memory layout — confirm).
func funcPC(f interface{}) uintptr {
return **(**uintptr)(add(unsafe.Pointer(&f), sys.PtrSize))
}
// goexit is declared without a body; its implementation is the assembly
// routine runtime·goexit. The neverCallThisFunction parameter type presumably
// exists to stop Go code from calling it directly; its address is only taken
// through funcPC(goexit).
func goexit(neverCallThisFunction)
// 用gdb查找goexit的代码位置
(gdb) b runtime.goexit
Breakpoint 4 at 0x1050890: file /usr/local/homebrew/Cellar/go@1.9/1.9.6/libexec/src/runtime/asm_amd64.s, line 2337.
// goexit is the return target installed for every new goroutine: newproc1
// sets sched.pc to funcPC(goexit)+PCQuantum, so execution resumes just past
// the first NOP and reaches the CALL to goexit1.
TEXT runtime·goexit(SB),NOSPLIT,$0-0
BYTE $0x90 // NOP
CALL runtime·goexit1(SB) // does not return
// traceback from goexit1 must hit code range of goexit
BYTE $0x90 // NOP
// 用gdb查找goexit1的代码位置
(gdb) b runtime.goexit1
Breakpoint 5 at 0x102f0a0: file /usr/local/homebrew/Cellar/go@1.9/1.9.6/libexec/src/runtime/proc.go, line 2365.
// goexit1 finishes the current goroutine: it notifies the race detector and
// the tracer when they are enabled, then uses mcall to run goexit0.
func goexit1() {
if raceenabled {
racegoend()
}
if trace.enabled {
traceGoEnd()
}
// Switch to the g0 stack and run goexit0 there.
mcall(goexit0)
}
// 用gdb查找mcall的代码位置
(gdb) b runtime.mcall
Breakpoint 6 at 0x104e1d0: file /usr/local/homebrew/Cellar/go@1.9/1.9.6/libexec/src/runtime/asm_amd64.s, line 262.
// mcall saves the calling goroutine's pc, sp, g and bp into g->sched,
// switches to the m->g0 stack, and calls fn with the old g pushed as its
// argument. If the caller is already g0 it jumps to badmcall; if fn returns
// it jumps to badmcall2.
TEXT runtime·mcall(SB), NOSPLIT, $0-8
MOVQ fn+0(FP), DI
get_tls(CX)
MOVQ g(CX), AX // save state in g->sched
MOVQ 0(SP), BX // caller's PC
MOVQ BX, (g_sched+gobuf_pc)(AX)
LEAQ fn+0(FP), BX // caller's SP
MOVQ BX, (g_sched+gobuf_sp)(AX)
MOVQ AX, (g_sched+gobuf_g)(AX)
MOVQ BP, (g_sched+gobuf_bp)(AX)
// switch to m->g0 & its stack, call fn
MOVQ g(CX), BX
MOVQ g_m(BX), BX
MOVQ m_g0(BX), SI
CMPQ SI, AX // if g == m->g0 call badmcall
// Not g0: skip over the two-instruction jump to badmcall.
JNE 3(PC)
MOVQ $runtime·badmcall(SB), AX
JMP AX
MOVQ SI, g(CX) // g = m->g0
MOVQ (g_sched+gobuf_sp)(SI), SP // sp = m->g0->sched.sp
// Push the old g as the argument for fn.
PUSHQ AX
// DX carries the funcval pointer; DI is loaded with its code address.
MOVQ DI, DX
MOVQ 0(DI), DI
CALL DI
// Reaching here means fn returned; jump to badmcall2.
POPQ AX
MOVQ $runtime·badmcall2(SB), AX
JMP AX
RET
// goexit0 is run through mcall (see goexit1) to finish goroutine gp: it moves
// gp from _Grunning to _Gdead, clears its per-run fields, detaches it with
// dropg, hands it back to the current P with gfput, and calls schedule() to
// pick the next goroutine.
func goexit0(gp *g) {
_g_ := getg()
casgstatus(gp, _Grunning, _Gdead)
// Undo the ngsys increment made in newproc1 for system goroutines.
if isSystemGoroutine(gp) {
atomic.Xadd(&sched.ngsys, -1)
}
// Clear gp's per-run fields (and the M's lockedg).
gp.m = nil
gp.lockedm = nil
_g_.m.lockedg = nil
gp.paniconfault = false
gp._defer = nil // should be true already but just in case.
gp._panic = nil // non-nil for Goexit during panic. points at stack-allocated data.
gp.writebuf = nil
gp.waitreason = ""
gp.param = nil
gp.labels = nil
gp.timer = nil
// Note that gp's stack scan is now "valid" because it has no
// stack.
gp.gcscanvalid = true
dropg()
// Any lock bits other than _LockExternal at this point indicate an internal error.
if _g_.m.locked&^_LockExternal != 0 {
print("invalid m->locked = ", _g_.m.locked, "\n")
throw("internal lockOSThread error")
}
_g_.m.locked = 0
// Hand gp back to the P (newproc1 obtains Gs from the P via gfget), then schedule again.
gfput(_g_.m.p.ptr(), gp)
schedule()
}
最终发现是到了mcall(goexit0),mcall是一个用汇编实现的方法,它主要是先把当前的上下文(pc,sp,g,bp)保存到G的sched中,然后把栈切换到g0,再在g0栈上调用传入的方法,这样就可以在M中执行非G的其他操作而不会污染G了。
goexit0则是一个用go实现的,主要就是给G收尾,把资源都回收了,回收工作做完以后又重新执行了schedule(),这样就会循环了,找到下一个G重复一遍