summaryrefslogtreecommitdiffstats
path: root/drivers/gpu/drm/nouveau/nvc0_grhub.fuc
diff options
context:
space:
mode:
authorBen Skeggs <bskeggs@redhat.com>2011-05-25 18:32:44 +1000
committerBen Skeggs <bskeggs@redhat.com>2011-06-23 15:57:38 +1000
commit0411de854898a2402cf4bd915bed7ec9a6b76f9a (patch)
tree52678d0557377674a78b95211adb36dba1ecde9b /drivers/gpu/drm/nouveau/nvc0_grhub.fuc
parentf8522fc80f2e0392fc44b069f61721bd25907270 (diff)
drm/nvc0/gr: import and use our own fuc by default
The ability to use NVIDIA's fuc has been retained *temporarily* in order to better debug any issues that may be lingering in our initial attempt at writing this ucode. Once I'm fairly confident we're okay, it'll be removed. There's a number of things not implemented by this fuc currently, but most of it is sets of state that our context setup would not have used anyway. No doubt we'll find out what they're for at some point, and implement it if required. This has been tested on 0xc0/0xc4 thus far, and from what I could tell it worked as well as NVIDIA's. It's also been tested on 0xc1, but even with NVIDIA's fuc that chipset doesn't work correctly with nouveau yet. 0xc3/0xc8/0xce should in theory be supported too, but I don't have the hardware to check that. There's no doubt numerous bugs to squash yet, please report any! Signed-off-by: Ben Skeggs <bskeggs@redhat.com>
Diffstat (limited to 'drivers/gpu/drm/nouveau/nvc0_grhub.fuc')
-rw-r--r--drivers/gpu/drm/nouveau/nvc0_grhub.fuc808
1 files changed, 808 insertions, 0 deletions
diff --git a/drivers/gpu/drm/nouveau/nvc0_grhub.fuc b/drivers/gpu/drm/nouveau/nvc0_grhub.fuc
new file mode 100644
index 00000000000..a1a599124cf
--- /dev/null
+++ b/drivers/gpu/drm/nouveau/nvc0_grhub.fuc
@@ -0,0 +1,808 @@
+/* fuc microcode for nvc0 PGRAPH/HUB
+ *
+ * Copyright 2011 Red Hat Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors: Ben Skeggs
+ */
+
+/* To build:
+ * m4 nvc0_grhub.fuc | envyas -a -w -m fuc -V nva3 -o nvc0_grhub.fuc.h
+ */
+
+.section nvc0_grhub_data
+include(`nvc0_graph.fuc')
+gpc_count: .b32 0
+rop_count: .b32 0
+cmd_queue: queue_init
+hub_mmio_list_head: .b32 0
+hub_mmio_list_tail: .b32 0
+
+ctx_current: .b32 0
+
+chipsets:
+.b8 0xc0 0 0 0
+.b16 nvc0_hub_mmio_head
+.b16 nvc0_hub_mmio_tail
+.b8 0xc1 0 0 0
+.b16 nvc0_hub_mmio_head
+.b16 nvc1_hub_mmio_tail
+.b8 0xc3 0 0 0
+.b16 nvc0_hub_mmio_head
+.b16 nvc0_hub_mmio_tail
+.b8 0xc4 0 0 0
+.b16 nvc0_hub_mmio_head
+.b16 nvc0_hub_mmio_tail
+.b8 0xc8 0 0 0
+.b16 nvc0_hub_mmio_head
+.b16 nvc0_hub_mmio_tail
+.b8 0xce 0 0 0
+.b16 nvc0_hub_mmio_head
+.b16 nvc0_hub_mmio_tail
+.b8 0 0 0 0
+
+nvc0_hub_mmio_head:
+mmctx_data(0x17e91c, 2)
+mmctx_data(0x400204, 2)
+mmctx_data(0x404004, 11)
+mmctx_data(0x404044, 1)
+mmctx_data(0x404094, 14)
+mmctx_data(0x4040d0, 7)
+mmctx_data(0x4040f8, 1)
+mmctx_data(0x404130, 3)
+mmctx_data(0x404150, 3)
+mmctx_data(0x404164, 2)
+mmctx_data(0x404174, 3)
+mmctx_data(0x404200, 8)
+mmctx_data(0x404404, 14)
+mmctx_data(0x404460, 4)
+mmctx_data(0x404480, 1)
+mmctx_data(0x404498, 1)
+mmctx_data(0x404604, 4)
+mmctx_data(0x404618, 32)
+mmctx_data(0x404698, 21)
+mmctx_data(0x4046f0, 2)
+mmctx_data(0x404700, 22)
+mmctx_data(0x405800, 1)
+mmctx_data(0x405830, 3)
+mmctx_data(0x405854, 1)
+mmctx_data(0x405870, 4)
+mmctx_data(0x405a00, 2)
+mmctx_data(0x405a18, 1)
+mmctx_data(0x406020, 1)
+mmctx_data(0x406028, 4)
+mmctx_data(0x4064a8, 2)
+mmctx_data(0x4064b4, 2)
+mmctx_data(0x407804, 1)
+mmctx_data(0x40780c, 6)
+mmctx_data(0x4078bc, 1)
+mmctx_data(0x408000, 7)
+mmctx_data(0x408064, 1)
+mmctx_data(0x408800, 3)
+mmctx_data(0x408900, 4)
+mmctx_data(0x408980, 1)
+nvc0_hub_mmio_tail:
+mmctx_data(0x4064c0, 2)
+nvc1_hub_mmio_tail:
+
+.align 256
+chan_data:
+chan_mmio_count: .b32 0
+chan_mmio_address: .b32 0
+
+.align 256
+xfer_data: .b32 0
+
+.section nvc0_grhub_code
+bra init
+define(`include_code')
+include(`nvc0_graph.fuc')
+
+// reports an exception to the host
+//
+// In: $r15 error code (see nvc0_graph.fuc)
+//
+error:
+ push $r14
+ mov $r14 0x814
+ shl b32 $r14 6
+ iowr I[$r14 + 0x000] $r15 // CC_SCRATCH[5] = error code
+ mov $r14 0xc1c
+ shl b32 $r14 6
+ mov $r15 1
+ iowr I[$r14 + 0x000] $r15 // INTR_UP_SET
+ pop $r14
+ ret
+
+// HUB fuc initialisation, executed by triggering ucode start, will
+// fall through to main loop after completion.
+//
+// Input:
+// CC_SCRATCH[0]: chipset (PMC_BOOT_0 read returns 0x0bad0bad... sigh)
+//
+// Output:
+// CC_SCRATCH[0]:
+// 31:31: set to signal completion
+// CC_SCRATCH[1]:
+// 31:0: total PGRAPH context size
+//
+init:
+ clear b32 $r0
+ mov $sp $r0
+ mov $xdbase $r0
+
+ // enable fifo access
+ mov $r1 0x1200
+ mov $r2 2
+ iowr I[$r1 + 0x000] $r2 // FIFO_ENABLE
+
+ // setup i0 handler, and route all interrupts to it
+ mov $r1 ih
+ mov $iv0 $r1
+ mov $r1 0x400
+ iowr I[$r1 + 0x300] $r0 // INTR_DISPATCH
+
+ // route HUB_CHANNEL_SWITCH to fuc interrupt 8
+ mov $r3 0x404
+ shl b32 $r3 6
+ mov $r2 0x2003 // { HUB_CHANNEL_SWITCH, ZERO } -> intr 8
+ iowr I[$r3 + 0x000] $r2
+
+ // not sure what these are, route them because NVIDIA does, and
+ // the IRQ handler will signal the host if we ever get one.. we
+ // may find out if/why we need to handle these if so..
+ //
+ mov $r2 0x2004
+ iowr I[$r3 + 0x004] $r2 // { 0x04, ZERO } -> intr 9
+ mov $r2 0x200b
+ iowr I[$r3 + 0x008] $r2 // { 0x0b, ZERO } -> intr 10
+ mov $r2 0x200c
+ iowr I[$r3 + 0x01c] $r2 // { 0x0c, ZERO } -> intr 15
+
+ // enable all INTR_UP interrupts
+ mov $r2 0xc24
+ shl b32 $r2 6
+ not b32 $r3 $r0
+ iowr I[$r2] $r3
+
+ // enable fifo, ctxsw, 9, 10, 15 interrupts
+ mov $r2 -0x78fc // 0x8704
+ sethi $r2 0
+ iowr I[$r1 + 0x000] $r2 // INTR_EN_SET
+
+ // fifo level triggered, rest edge
+ sub b32 $r1 0x100
+ mov $r2 4
+ iowr I[$r1] $r2
+
+ // enable interrupts
+ bset $flags ie0
+
+ // fetch enabled GPC/ROP counts
+ mov $r14 -0x69fc // 0x409604
+ sethi $r14 0x400000
+ call nv_rd32
+ extr $r1 $r15 16:20
+ st b32 D[$r0 + rop_count] $r1
+ and $r15 0x1f
+ st b32 D[$r0 + gpc_count] $r15
+
+ // set BAR_REQMASK to GPC mask
+ mov $r1 1
+ shl b32 $r1 $r15
+ sub b32 $r1 1
+ mov $r2 0x40c
+ shl b32 $r2 6
+ iowr I[$r2 + 0x000] $r1
+ iowr I[$r2 + 0x100] $r1
+
+ // find context data for this chipset
+ mov $r2 0x800
+ shl b32 $r2 6
+ iord $r2 I[$r2 + 0x000] // CC_SCRATCH[0]
+ mov $r15 chipsets - 8
+ init_find_chipset:
+ add b32 $r15 8
+ ld b32 $r3 D[$r15 + 0x00]
+ cmpu b32 $r3 $r2
+ bra e init_context
+ cmpu b32 $r3 0
+ bra ne init_find_chipset
+ // unknown chipset
+ ret
+
+ // context size calculation, reserve first 256 bytes for use by fuc
+ init_context:
+ mov $r1 256
+
+ // calculate size of mmio context data
+ ld b16 $r14 D[$r15 + 4]
+ ld b16 $r15 D[$r15 + 6]
+ sethi $r14 0
+ st b32 D[$r0 + hub_mmio_list_head] $r14
+ st b32 D[$r0 + hub_mmio_list_tail] $r15
+ call mmctx_size
+
+ // set mmctx base addresses now so we don't have to do it later,
+ // they don't (currently) ever change
+ mov $r3 0x700
+ shl b32 $r3 6
+ shr b32 $r4 $r1 8
+ iowr I[$r3 + 0x000] $r4 // MMCTX_SAVE_SWBASE
+ iowr I[$r3 + 0x100] $r4 // MMCTX_LOAD_SWBASE
+ add b32 $r3 0x1300
+ add b32 $r1 $r15
+ shr b32 $r15 2
+ iowr I[$r3 + 0x000] $r15 // MMCTX_LOAD_COUNT, wtf for?!?
+
+ // strands, base offset needs to be aligned to 256 bytes
+ shr b32 $r1 8
+ add b32 $r1 1
+ shl b32 $r1 8
+ mov b32 $r15 $r1
+ call strand_ctx_init
+ add b32 $r1 $r15
+
+ // initialise each GPC in sequence by passing in the offset of its
+ // context data in GPCn_CC_SCRATCH[1], and starting its FUC (which
+ // has previously been uploaded by the host) running.
+ //
+ // the GPC fuc init sequence will set GPCn_CC_SCRATCH[0] bit 31
+ // when it has completed, and return the size of its context data
+ // in GPCn_CC_SCRATCH[1]
+ //
+ ld b32 $r3 D[$r0 + gpc_count]
+ mov $r4 0x2000
+ sethi $r4 0x500000
+ init_gpc:
+ // setup, and start GPC ucode running
+ add b32 $r14 $r4 0x804
+ mov b32 $r15 $r1
+ call nv_wr32 // CC_SCRATCH[1] = ctx offset
+ add b32 $r14 $r4 0x800
+ mov b32 $r15 $r2
+ call nv_wr32 // CC_SCRATCH[0] = chipset
+ add b32 $r14 $r4 0x10c
+ clear b32 $r15
+ call nv_wr32
+ add b32 $r14 $r4 0x104
+ call nv_wr32 // ENTRY
+ add b32 $r14 $r4 0x100
+ mov $r15 2 // CTRL_START_TRIGGER
+ call nv_wr32 // CTRL
+
+ // wait for it to complete, and adjust context size
+ add b32 $r14 $r4 0x800
+ init_gpc_wait:
+ call nv_rd32
+ xbit $r15 $r15 31
+ bra e init_gpc_wait
+ add b32 $r14 $r4 0x804
+ call nv_rd32
+ add b32 $r1 $r15
+
+ // next!
+ add b32 $r4 0x8000
+ sub b32 $r3 1
+ bra ne init_gpc
+
+ // save context size, and tell host we're ready
+ mov $r2 0x800
+ shl b32 $r2 6
+ iowr I[$r2 + 0x100] $r1 // CC_SCRATCH[1] = context size
+ add b32 $r2 0x800
+ clear b32 $r1
+ bset $r1 31
+ iowr I[$r2 + 0x000] $r1 // CC_SCRATCH[0] |= 0x80000000
+
+// Main program loop, very simple, sleeps until woken up by the interrupt
+// handler, pulls a command from the queue and executes its handler
+//
+main:
+ // sleep until we have something to do
+ bset $flags $p0
+ sleep $p0
+ mov $r13 cmd_queue
+ call queue_get
+ bra $p1 main
+
+ // context switch, requested by GPU?
+ cmpu b32 $r14 0x4001
+ bra ne main_not_ctx_switch
+ trace_set(T_AUTO)
+ mov $r1 0xb00
+ shl b32 $r1 6
+ iord $r2 I[$r1 + 0x100] // CHAN_NEXT
+ iord $r1 I[$r1 + 0x000] // CHAN_CUR
+
+ xbit $r3 $r1 31
+ bra e chsw_no_prev
+ xbit $r3 $r2 31
+ bra e chsw_prev_no_next
+ push $r2
+ mov b32 $r2 $r1
+ trace_set(T_SAVE)
+ bclr $flags $p1
+ bset $flags $p2
+ call ctx_xfer
+ trace_clr(T_SAVE);
+ pop $r2
+ trace_set(T_LOAD);
+ bset $flags $p1
+ call ctx_xfer
+ trace_clr(T_LOAD);
+ bra chsw_done
+ chsw_prev_no_next:
+ push $r2
+ mov b32 $r2 $r1
+ bclr $flags $p1
+ bclr $flags $p2
+ call ctx_xfer
+ pop $r2
+ mov $r1 0xb00
+ shl b32 $r1 6
+ iowr I[$r1] $r2
+ bra chsw_done
+ chsw_no_prev:
+ xbit $r3 $r2 31
+ bra e chsw_done
+ bset $flags $p1
+ bclr $flags $p2
+ call ctx_xfer
+
+ // ack the context switch request
+ chsw_done:
+ mov $r1 0xb0c
+ shl b32 $r1 6
+ mov $r2 1
+ iowr I[$r1 + 0x000] $r2 // 0x409b0c
+ trace_clr(T_AUTO)
+ bra main
+
+ // request to set current channel? (*not* a context switch)
+ main_not_ctx_switch:
+ cmpu b32 $r14 0x0001
+ bra ne main_not_ctx_chan
+ mov b32 $r2 $r15
+ call ctx_chan
+ bra main_done
+
+ // request to store current channel context?
+ main_not_ctx_chan:
+ cmpu b32 $r14 0x0002
+ bra ne main_not_ctx_save
+ trace_set(T_SAVE)
+ bclr $flags $p1
+ bclr $flags $p2
+ call ctx_xfer
+ trace_clr(T_SAVE)
+ bra main_done
+
+ main_not_ctx_save:
+ shl b32 $r15 $r14 16
+ or $r15 E_BAD_COMMAND
+ call error
+ bra main
+
+ main_done:
+ mov $r1 0x820
+ shl b32 $r1 6
+ clear b32 $r2
+ bset $r2 31
+ iowr I[$r1 + 0x000] $r2 // CC_SCRATCH[0] |= 0x80000000
+ bra main
+
+// interrupt handler
+ih:
+ push $r8
+ mov $r8 $flags
+ push $r8
+ push $r9
+ push $r10
+ push $r11
+ push $r13
+ push $r14
+ push $r15
+
+ // incoming fifo command?
+ iord $r10 I[$r0 + 0x200] // INTR
+ and $r11 $r10 0x00000004
+ bra e ih_no_fifo
+ // queue incoming fifo command for later processing
+ mov $r11 0x1900
+ mov $r13 cmd_queue
+ iord $r14 I[$r11 + 0x100] // FIFO_CMD
+ iord $r15 I[$r11 + 0x000] // FIFO_DATA
+ call queue_put
+ add b32 $r11 0x400
+ mov $r14 1
+ iowr I[$r11 + 0x000] $r14 // FIFO_ACK
+
+ // context switch request?
+ ih_no_fifo:
+ and $r11 $r10 0x00000100
+ bra e ih_no_ctxsw
+ // enqueue a context switch for later processing
+ mov $r13 cmd_queue
+ mov $r14 0x4001
+ call queue_put
+
+ // anything we didn't handle, bring it to the host's attention
+ ih_no_ctxsw:
+ mov $r11 0x104
+ not b32 $r11
+ and $r11 $r10 $r11
+ bra e ih_no_other
+ mov $r10 0xc1c
+ shl b32 $r10 6
+ iowr I[$r10] $r11 // INTR_UP_SET
+
+ // ack, and wake up main()
+ ih_no_other:
+ iowr I[$r0 + 0x100] $r10 // INTR_ACK
+
+ pop $r15
+ pop $r14
+ pop $r13
+ pop $r11
+ pop $r10
+ pop $r9
+ pop $r8
+ mov $flags $r8
+ pop $r8
+ bclr $flags $p0
+ iret
+
+// Not real sure, but, MEM_CMD 7 will hang forever if this isn't done
+ctx_4160s:
+ mov $r14 0x4160
+ sethi $r14 0x400000
+ mov $r15 1
+ call nv_wr32
+ ctx_4160s_wait:
+ call nv_rd32
+ xbit $r15 $r15 4
+ bra e ctx_4160s_wait
+ ret
+
+// Without clearing again at end of xfer, some things cause PGRAPH
+// to hang with STATUS=0x00000007 until it's cleared.. fbcon can
+// still function with it set however...
+ctx_4160c:
+ mov $r14 0x4160
+ sethi $r14 0x400000
+ clear b32 $r15
+ call nv_wr32
+ ret
+
+// Again, not real sure
+//
+// In: $r15 value to set 0x404170 to
+//
+ctx_4170s:
+ mov $r14 0x4170
+ sethi $r14 0x400000
+ or $r15 0x10
+ call nv_wr32
+ ret
+
+// Waits for a ctx_4170s() call to complete
+//
+ctx_4170w:
+ mov $r14 0x4170
+ sethi $r14 0x400000
+ call nv_rd32
+ and $r15 0x10
+ bra ne ctx_4170w
+ ret
+
+// Disables various things, waits a bit, and re-enables them..
+//
+// Not sure how exactly this helps, perhaps "ENABLE" is not such a
+// good description for the bits we turn off? Anyways, without this,
+// funny things happen.
+//
+ctx_redswitch:
+ mov $r14 0x614
+ shl b32 $r14 6
+ mov $r15 0x270
+ iowr I[$r14] $r15 // HUB_RED_SWITCH = ENABLE_GPC, POWER_ALL
+ mov $r15 8
+ ctx_redswitch_delay:
+ sub b32 $r15 1
+ bra ne ctx_redswitch_delay
+ mov $r15 0x770
+ iowr I[$r14] $r15 // HUB_RED_SWITCH = ENABLE_ALL, POWER_ALL
+ ret
+
+// Not a clue what this is for, except that unless the value is 0x10, the
+// strand context is saved (and presumably restored) incorrectly..
+//
+// In: $r15 value to set to (0x00/0x10 are used)
+//
+ctx_86c:
+ mov $r14 0x86c
+ shl b32 $r14 6
+ iowr I[$r14] $r15 // HUB(0x86c) = val
+ mov $r14 -0x75ec
+ sethi $r14 0x400000
+ call nv_wr32 // ROP(0xa14) = val
+ mov $r14 -0x5794
+ sethi $r14 0x410000
+ call nv_wr32 // GPC(0x86c) = val
+ ret
+
+// ctx_load - load's a channel's ctxctl data, and selects its vm
+//
+// In: $r2 channel address
+//
+ctx_load:
+ trace_set(T_CHAN)
+
+ // switch to channel, somewhat magic in parts..
+ mov $r10 12 // DONE_UNK12
+ call wait_donez
+ mov $r1 0xa24
+ shl b32 $r1 6
+ iowr I[$r1 + 0x000] $r0 // 0x409a24
+ mov $r3 0xb00
+ shl b32 $r3 6
+ iowr I[$r3 + 0x100] $r2 // CHAN_NEXT
+ mov $r1 0xa0c
+ shl b32 $r1 6
+ mov $r4 7
+ iowr I[$r1 + 0x000] $r2 // MEM_CHAN
+ iowr I[$r1 + 0x100] $r4 // MEM_CMD
+ ctx_chan_wait_0:
+ iord $r4 I[$r1 + 0x100]
+ and $r4 0x1f
+ bra ne ctx_chan_wait_0
+ iowr I[$r3 + 0x000] $r2 // CHAN_CUR
+
+ // load channel header, fetch PGRAPH context pointer
+ mov $xtargets $r0
+ bclr $r2 31
+ shl b32 $r2 4
+ add b32 $r2 2
+
+ trace_set(T_LCHAN)
+ mov $r1 0xa04
+ shl b32 $r1 6
+ iowr I[$r1 + 0x000] $r2 // MEM_BASE
+ mov $r1 0xa20
+ shl b32 $r1 6
+ mov $r2 0x0002
+ sethi $r2 0x80000000
+ iowr I[$r1 + 0x000] $r2 // MEM_TARGET = vram
+ mov $r1 0x10 // chan + 0x0210
+ mov $r2 xfer_data
+ sethi $r2 0x00020000 // 16 bytes
+ xdld $r1 $r2
+ xdwait
+ trace_clr(T_LCHAN)
+
+ // update current context
+ ld b32 $r1 D[$r0 + xfer_data + 4]
+ shl b32 $r1 24
+ ld b32 $r2 D[$r0 + xfer_data + 0]
+ shr b32 $r2 8
+ or $r1 $r2
+ st b32 D[$r0 + ctx_current] $r1
+
+ // set transfer base to start of context, and fetch context header
+ trace_set(T_LCTXH)
+ mov $r2 0xa04
+ shl b32 $r2 6
+ iowr I[$r2 + 0x000] $r1 // MEM_BASE
+ mov $r2 1
+ mov $r1 0xa20
+ shl b32 $r1 6
+ iowr I[$r1 + 0x000] $r2 // MEM_TARGET = vm
+ mov $r1 chan_data
+ sethi $r1 0x00060000 // 256 bytes
+ xdld $r0 $r1
+ xdwait
+ trace_clr(T_LCTXH)
+
+ trace_clr(T_CHAN)
+ ret
+
+// ctx_chan - handler for HUB_SET_CHAN command, will set a channel as
+// the active channel for ctxctl, but not actually transfer
+// any context data. intended for use only during initial
+// context construction.
+//
+// In: $r2 channel address
+//
+ctx_chan:
+ call ctx_4160s
+ call ctx_load
+ mov $r10 12 // DONE_UNK12
+ call wait_donez
+ mov $r1 0xa10
+ shl b32 $r1 6
+ mov $r2 5
+ iowr I[$r1 + 0x000] $r2 // MEM_CMD = 5 (???)
+ ctx_chan_wait:
+ iord $r2 I[$r1 + 0x000]
+ or $r2 $r2
+ bra ne ctx_chan_wait
+ call ctx_4160c
+ ret
+
+// Execute per-context state overrides list
+//
+// Only executed on the first load of a channel. Might want to look into
+// removing this and having the host directly modify the channel's context
+// to change this state... The nouveau DRM already builds this list as
+// it's definitely needed for NVIDIA's, so we may as well use it for now
+//
+// Input: $r1 mmio list length
+//
+ctx_mmio_exec:
+ // set transfer base to be the mmio list
+ ld b32 $r3 D[$r0 + chan_mmio_address]
+ mov $r2 0xa04
+ shl b32 $r2 6
+ iowr I[$r2 + 0x000] $r3 // MEM_BASE
+
+ clear b32 $r3
+ ctx_mmio_loop:
+ // fetch next 256 bytes of mmio list if necessary
+ and $r4 $r3 0xff
+ bra ne ctx_mmio_pull
+ mov $r5 xfer_data
+ sethi $r5 0x00060000 // 256 bytes
+ xdld $r3 $r5
+ xdwait
+
+ // execute a single list entry
+ ctx_mmio_pull:
+ ld b32 $r14 D[$r4 + xfer_data + 0x00]
+ ld b32 $r15 D[$r4 + xfer_data + 0x04]
+ call nv_wr32
+
+ // next!
+ add b32 $r3 8
+ sub b32 $r1 1
+ bra ne ctx_mmio_loop
+
+ // set transfer base back to the current context
+ ctx_mmio_done:
+ ld b32 $r3 D[$r0 + ctx_current]
+ iowr I[$r2 + 0x000] $r3 // MEM_BASE
+
+ // disable the mmio list now, we don't need/want to execute it again
+ st b32 D[$r0 + chan_mmio_count] $r0
+ mov $r1 chan_data
+ sethi $r1 0x00060000 // 256 bytes
+ xdst $r0 $r1
+ xdwait
+ ret
+
+// Transfer HUB context data between GPU and storage area
+//
+// In: $r2 channel address
+// $p1 clear on save, set on load
+// $p2 set if opposite direction done/will be done, so:
+// on save it means: "a load will follow this save"
+// on load it means: "a save preceeded this load"
+//
+ctx_xfer:
+ bra not $p1 ctx_xfer_pre
+ bra $p2 ctx_xfer_pre_load
+ ctx_xfer_pre:
+ mov $r15 0x10
+ call ctx_86c
+ call ctx_4160s
+ bra not $p1 ctx_xfer_exec
+
+ ctx_xfer_pre_load:
+ mov $r15 2
+ call ctx_4170s
+ call ctx_4170w
+ call ctx_redswitch
+ clear b32 $r15
+ call ctx_4170s
+ call ctx_load
+
+ // fetch context pointer, and initiate xfer on all GPCs
+ ctx_xfer_exec:
+ ld b32 $r1 D[$r0 + ctx_current]
+ mov $r2 0x414
+ shl b32 $r2 6
+ iowr I[$r2 + 0x000] $r0 // BAR_STATUS = reset
+ mov $r14 -0x5b00
+ sethi $r14 0x410000
+ mov b32 $r15 $r1
+ call nv_wr32 // GPC_BCAST_WRCMD_DATA = ctx pointer
+ add b32 $r14 4
+ xbit $r15 $flags $p1
+ xbit $r2 $flags $p2
+ shl b32 $r2 1
+ or $r15 $r2
+ call nv_wr32 // GPC_BCAST_WRCMD_CMD = GPC_XFER(type)
+
+ // strands
+ mov $r1 0x4afc
+ sethi $r1 0x20000
+ mov $r2 0xc
+ iowr I[$r1] $r2 // STRAND_CMD(0x3f) = 0x0c
+ call strand_wait
+ mov $r2 0x47fc
+ sethi $r2 0x20000
+ iowr I[$r2] $r0 // STRAND_FIRST_GENE(0x3f) = 0x00
+ xbit $r2 $flags $p1
+ add b32 $r2 3
+ iowr I[$r1] $r2 // STRAND_CMD(0x3f) = 0x03/0x04 (SAVE/LOAD)
+
+ // mmio context
+ xbit $r10 $flags $p1 // direction
+ or $r10 6 // first, last
+ mov $r11 0 // base = 0
+ ld b32 $r12 D[$r0 + hub_mmio_list_head]
+ ld b32 $r13 D[$r0 + hub_mmio_list_tail]
+ mov $r14 0 // not multi
+ call mmctx_xfer
+
+ // wait for GPCs to all complete
+ mov $r10 8 // DONE_BAR
+ call wait_doneo
+
+ // wait for strand xfer to complete
+ call strand_wait
+
+ // post-op
+ bra $p1 ctx_xfer_post
+ mov $r10 12 // DONE_UNK12
+ call wait_donez
+ mov $r1 0xa10
+ shl b32 $r1 6
+ mov $r2 5
+ iowr I[$r1] $r2 // MEM_CMD
+ ctx_xfer_post_save_wait:
+ iord $r2 I[$r1]
+ or $r2 $r2
+ bra ne ctx_xfer_post_save_wait
+
+ bra $p2 ctx_xfer_done
+ ctx_xfer_post:
+ mov $r15 2
+ call ctx_4170s
+ clear b32 $r15
+ call ctx_86c
+ call strand_post
+ call ctx_4170w
+ clear b32 $r15
+ call ctx_4170s
+
+ bra not $p1 ctx_xfer_no_post_mmio
+ ld b32 $r1 D[$r0 + chan_mmio_count]
+ or $r1 $r1
+ bra e ctx_xfer_no_post_mmio
+ call ctx_mmio_exec
+
+ ctx_xfer_no_post_mmio:
+ call ctx_4160c
+
+ ctx_xfer_done:
+ ret
+
+.align 256