1 files changed, 424 insertions, 241 deletions
diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgf100.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgf100.c
index bc77eea351a5..cb390e0134a2 100644
--- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgf100.c
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgf100.c
@@ -26,6 +26,7 @@
 #include <subdev/fb.h>
 #include <subdev/mc.h>
 #include <subdev/timer.h>
+#include <engine/fifo.h>
 
 /*******************************************************************************
  * PGRAPH context register lists
@@ -850,12 +851,17 @@ gf100_grctx_init_gcc_0[] = {
 };
 
 const struct gf100_gr_pack
-gf100_grctx_pack_gpc[] = {
+gf100_grctx_pack_gpc_0[] = {
 	{ gf100_grctx_init_gpc_unk_0 },
 	{ gf100_grctx_init_prop_0 },
 	{ gf100_grctx_init_gpc_unk_1 },
 	{ gf100_grctx_init_setup_0 },
 	{ gf100_grctx_init_zcull_0 },
+	{}
+};
+
+const struct gf100_gr_pack
+gf100_grctx_pack_gpc_1[] = {
 	{ gf100_grctx_init_crstr_0 },
 	{ gf100_grctx_init_gpm_0 },
 	{ gf100_grctx_init_gcc_0 },
@@ -985,187 +991,117 @@ gf100_grctx_pack_tpc[] = {
  * PGRAPH context implementation
  ******************************************************************************/
 
-int
-gf100_grctx_mmio_data(struct gf100_grctx *info, u32 size, u32 align, u32 access)
+void
+gf100_grctx_patch_wr32(struct gf100_gr_chan *chan, u32 addr, u32 data)
 {
-	if (info->data) {
-		info->buffer[info->buffer_nr] = round_up(info->addr, align);
-		info->addr = info->buffer[info->buffer_nr] + size;
-		info->data->size = size;
-		info->data->align = align;
-		info->data->access = access;
-		info->data++;
-		return info->buffer_nr++;
+	if (unlikely(!chan->mmio)) {
+		nvkm_wr32(chan->gr->base.engine.subdev.device, addr, data);
+		return;
 	}
-	return -1;
+
+	nvkm_wo32(chan->mmio, chan->mmio_nr++ * 4, addr);
+	nvkm_wo32(chan->mmio, chan->mmio_nr++ * 4, data);
 }
 
 void
-gf100_grctx_mmio_item(struct gf100_grctx *info, u32 addr, u32 data,
-		      int shift, int buffer)
+gf100_grctx_generate_r419cb8(struct gf100_gr *gr)
 {
-	struct nvkm_device *device = info->gr->base.engine.subdev.device;
-	if (info->data) {
-		if (shift >= 0) {
-			info->mmio->addr = addr;
-			info->mmio->data = data;
-			info->mmio->shift = shift;
-			info->mmio->buffer = buffer;
-			if (buffer >= 0)
-				data |= info->buffer[buffer] >> shift;
-			info->mmio++;
-		} else
-			return;
-	} else {
-		if (buffer >= 0)
-			return;
-	}
-
-	nvkm_wr32(device, addr, data);
+	struct nvkm_device *device = gr->base.engine.subdev.device;
+	nvkm_mask(device, 0x419cb8, 0x00007c00, 0x00000000);
 }
 
 void
-gf100_grctx_generate_bundle(struct gf100_grctx *info)
+gf100_grctx_generate_bundle(struct gf100_gr_chan *chan, u64 addr, u32 size)
 {
-	const struct gf100_grctx_func *grctx = info->gr->func->grctx;
-	const u32 access = NV_MEM_ACCESS_RW | NV_MEM_ACCESS_SYS;
-	const int s = 8;
-	const int b = mmio_vram(info, grctx->bundle_size, (1 << s), access);
-	mmio_refn(info, 0x408004, 0x00000000, s, b);
-	mmio_wr32(info, 0x408008, 0x80000000 | (grctx->bundle_size >> s));
-	mmio_refn(info, 0x418808, 0x00000000, s, b);
-	mmio_wr32(info, 0x41880c, 0x80000000 | (grctx->bundle_size >> s));
+	gf100_grctx_patch_wr32(chan, 0x408004, addr >> 8);
+	gf100_grctx_patch_wr32(chan, 0x408008, 0x80000000 | (size >> 8));
+	gf100_grctx_patch_wr32(chan, 0x418808, addr >> 8);
+	gf100_grctx_patch_wr32(chan, 0x41880c, 0x80000000 | (size >> 8));
 }
 
 void
-gf100_grctx_generate_pagepool(struct gf100_grctx *info)
+gf100_grctx_generate_pagepool(struct gf100_gr_chan *chan, u64 addr)
 {
-	const struct gf100_grctx_func *grctx = info->gr->func->grctx;
-	const u32 access = NV_MEM_ACCESS_RW | NV_MEM_ACCESS_SYS;
-	const int s = 8;
-	const int b = mmio_vram(info, grctx->pagepool_size, (1 << s), access);
-	mmio_refn(info, 0x40800c, 0x00000000, s, b);
-	mmio_wr32(info, 0x408010, 0x80000000);
-	mmio_refn(info, 0x419004, 0x00000000, s, b);
-	mmio_wr32(info, 0x419008, 0x00000000);
+	gf100_grctx_patch_wr32(chan, 0x40800c, addr >> 8);
+	gf100_grctx_patch_wr32(chan, 0x408010, 0x80000000);
+	gf100_grctx_patch_wr32(chan, 0x419004, addr >> 8);
+	gf100_grctx_patch_wr32(chan, 0x419008, 0x00000000);
 }
 
 void
-gf100_grctx_generate_attrib(struct gf100_grctx *info)
+gf100_grctx_generate_attrib(struct gf100_gr_chan *chan)
 {
-	struct gf100_gr *gr = info->gr;
+	struct gf100_gr *gr = chan->gr;
 	const struct gf100_grctx_func *grctx = gr->func->grctx;
 	const u32 attrib = grctx->attrib_nr;
-	const u32   size = 0x20 * (grctx->attrib_nr_max + grctx->alpha_nr_max);
-	const u32 access = NV_MEM_ACCESS_RW;
-	const int s = 12;
-	const int b = mmio_vram(info, size * gr->tpc_total, (1 << s), access);
 	int gpc, tpc;
 	u32 bo = 0;
 
-	mmio_refn(info, 0x418810, 0x80000000, s, b);
-	mmio_refn(info, 0x419848, 0x10000000, s, b);
-	mmio_wr32(info, 0x405830, (attrib << 16));
+	gf100_grctx_patch_wr32(chan, 0x405830, (attrib << 16));
 
 	for (gpc = 0; gpc < gr->gpc_nr; gpc++) {
 		for (tpc = 0; tpc < gr->tpc_nr[gpc]; tpc++) {
 			const u32 o = TPC_UNIT(gpc, tpc, 0x0520);
-			mmio_skip(info, o, (attrib << 16) | ++bo);
-			mmio_wr32(info, o, (attrib << 16) | --bo);
+
+			gf100_grctx_patch_wr32(chan, o, (attrib << 16) | bo);
 			bo += grctx->attrib_nr_max;
 		}
 	}
 }
 
 void
-gf100_grctx_generate_unkn(struct gf100_gr *gr)
+gf100_grctx_generate_attrib_cb(struct gf100_gr_chan *chan, u64 addr, u32 size)
 {
+	gf100_grctx_patch_wr32(chan, 0x418810, 0x80000000 | addr >> 12);
+	gf100_grctx_patch_wr32(chan, 0x419848, 0x10000000 | addr >> 12);
 }
 
-void
-gf100_grctx_generate_tpcid(struct gf100_gr *gr)
+u32
+gf100_grctx_generate_attrib_cb_size(struct gf100_gr *gr)
 {
-	struct nvkm_device *device = gr->base.engine.subdev.device;
-	int gpc, tpc, id;
-
-	for (tpc = 0, id = 0; tpc < 4; tpc++) {
-		for (gpc = 0; gpc < gr->gpc_nr; gpc++) {
-			if (tpc < gr->tpc_nr[gpc]) {
-				nvkm_wr32(device, TPC_UNIT(gpc, tpc, 0x698), id);
-				nvkm_wr32(device, TPC_UNIT(gpc, tpc, 0x4e8), id);
-				nvkm_wr32(device, GPC_UNIT(gpc, 0x0c10 + tpc * 4), id);
-				nvkm_wr32(device, TPC_UNIT(gpc, tpc, 0x088), id);
-				id++;
-			}
+	const struct gf100_grctx_func *grctx = gr->func->grctx;
 
-			nvkm_wr32(device, GPC_UNIT(gpc, 0x0c08), gr->tpc_nr[gpc]);
-			nvkm_wr32(device, GPC_UNIT(gpc, 0x0c8c), gr->tpc_nr[gpc]);
-		}
-	}
+	return 0x20 * (grctx->attrib_nr_max + grctx->alpha_nr_max) * gr->tpc_total;
 }
 
 void
-gf100_grctx_generate_r406028(struct gf100_gr *gr)
+gf100_grctx_generate_unkn(struct gf100_gr *gr)
 {
-	struct nvkm_device *device = gr->base.engine.subdev.device;
-	u32 tmp[GPC_MAX / 8] = {}, i = 0;
-	for (i = 0; i < gr->gpc_nr; i++)
-		tmp[i / 8] |= gr->tpc_nr[i] << ((i % 8) * 4);
-	for (i = 0; i < 4; i++) {
-		nvkm_wr32(device, 0x406028 + (i * 4), tmp[i]);
-		nvkm_wr32(device, 0x405870 + (i * 4), tmp[i]);
-	}
 }
 
 void
 gf100_grctx_generate_r4060a8(struct gf100_gr *gr)
 {
 	struct nvkm_device *device = gr->base.engine.subdev.device;
-	u8  tpcnr[GPC_MAX], data[TPC_MAX];
-	int gpc, tpc, i;
-
-	memcpy(tpcnr, gr->tpc_nr, sizeof(gr->tpc_nr));
-	memset(data, 0x1f, sizeof(data));
-
-	gpc = -1;
-	for (tpc = 0; tpc < gr->tpc_total; tpc++) {
-		do {
-			gpc = (gpc + 1) % gr->gpc_nr;
-		} while (!tpcnr[gpc]);
-		tpcnr[gpc]--;
-		data[tpc] = gpc;
+	const u8 gpcmax = nvkm_rd32(device, 0x022430);
+	const u8 tpcmax = nvkm_rd32(device, 0x022434) * gpcmax;
+	int i, j, sm = 0;
+	u32 data;
+
+	for (i = 0; i < DIV_ROUND_UP(tpcmax, 4); i++) {
+		for (data = 0, j = 0; j < 4; j++) {
+			if (sm < gr->sm_nr)
+				data |= gr->sm[sm++].gpc << (j * 8);
+			else
+				data |= 0x1f << (j * 8);
+		}
+		nvkm_wr32(device, 0x4060a8 + (i * 4), data);
 	}
-
-	for (i = 0; i < 4; i++)
-		nvkm_wr32(device, 0x4060a8 + (i * 4), ((u32 *)data)[i]);
 }
 
 void
-gf100_grctx_generate_r418bb8(struct gf100_gr *gr)
+gf100_grctx_generate_rop_mapping(struct gf100_gr *gr)
 {
 	struct nvkm_device *device = gr->base.engine.subdev.device;
 	u32 data[6] = {}, data2[2] = {};
-	u8  tpcnr[GPC_MAX];
 	u8  shift, ntpcv;
-	int gpc, tpc, i;
-
-	/* calculate first set of magics */
-	memcpy(tpcnr, gr->tpc_nr, sizeof(gr->tpc_nr));
+	int i;
 
-	gpc = -1;
-	for (tpc = 0; tpc < gr->tpc_total; tpc++) {
-		do {
-			gpc = (gpc + 1) % gr->gpc_nr;
-		} while (!tpcnr[gpc]);
-		tpcnr[gpc]--;
-
-		data[tpc / 6] |= gpc << ((tpc % 6) * 5);
-	}
+	/* Pack tile map into register format. */
+	for (i = 0; i < 32; i++)
+		data[i / 6] |= (gr->tile[i] & 0x07) << ((i % 6) * 5);
 
-	for (; tpc < 32; tpc++)
-		data[tpc / 6] |= 7 << ((tpc % 6) * 5);
-
-	/* and the second... */
+	/* Magic. */
 	shift = 0;
 	ntpcv = gr->tpc_total;
 	while (!(ntpcv & (1 << 4))) {
@@ -1200,152 +1136,375 @@ gf100_grctx_generate_r418bb8(struct gf100_gr *gr)
 }
 
 void
-gf100_grctx_generate_r406800(struct gf100_gr *gr)
+gf100_grctx_generate_max_ways_evict(struct gf100_gr *gr)
 {
 	struct nvkm_device *device = gr->base.engine.subdev.device;
-	u64 tpc_mask = 0, tpc_set = 0;
-	u8  tpcnr[GPC_MAX];
-	int gpc, tpc;
-	int i, a, b;
-
-	memcpy(tpcnr, gr->tpc_nr, sizeof(gr->tpc_nr));
-	for (gpc = 0; gpc < gr->gpc_nr; gpc++)
-		tpc_mask |= ((1ULL << gr->tpc_nr[gpc]) - 1) << (gpc * 8);
-
-	for (i = 0, gpc = -1, b = -1; i < 32; i++) {
-		a = (i * (gr->tpc_total - 1)) / 32;
-		if (a != b) {
-			b = a;
-			do {
-				gpc = (gpc + 1) % gr->gpc_nr;
-			} while (!tpcnr[gpc]);
-			tpc = gr->tpc_nr[gpc] - tpcnr[gpc]--;
-
-			tpc_set |= 1ULL << ((gpc * 8) + tpc);
+	u32 fbps = nvkm_rd32(device, 0x121c74);
+	if (fbps == 1)
+		nvkm_mask(device, 0x17e91c, 0x001f0000, 0x00090000);
+}
+
+static const u32
+gf100_grctx_alpha_beta_map[17][32] = {
+	[1] = {
+		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	},
+	[2] = {
+		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	},
+	//XXX: 3
+	[4] = {
+		1, 1, 1, 1, 1, 1, 1, 1,
+		2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+		3, 3, 3, 3, 3, 3, 3, 3,
+	},
+	//XXX: 5
+	//XXX: 6
+	[7] = {
+		1, 1, 1, 1,
+		2, 2, 2, 2, 2, 2,
+		3, 3, 3, 3, 3, 3,
+		4, 4, 4, 4, 4, 4,
+		5, 5, 5, 5, 5, 5,
+		6, 6, 6, 6,
+	},
+	[8] = {
+		1, 1, 1,
+		2, 2, 2, 2, 2,
+		3, 3, 3, 3, 3,
+		4, 4, 4, 4, 4, 4,
+		5, 5, 5, 5, 5,
+		6, 6, 6, 6, 6,
+		7, 7, 7,
+	},
+	//XXX: 9
+	//XXX: 10
+	[11] = {
+		1, 1,
+		2, 2, 2, 2,
+		3, 3, 3,
+		4, 4, 4, 4,
+		5, 5, 5,
+		6, 6, 6,
+		7, 7, 7, 7,
+		8, 8, 8,
+		9, 9, 9, 9,
+		10, 10,
+	},
+	//XXX: 12
+	//XXX: 13
+	[14] = {
+		1, 1,
+		2, 2,
+		3, 3, 3,
+		4, 4, 4,
+		5, 5,
+		6, 6, 6,
+		7, 7,
+		8, 8, 8,
+		9, 9,
+		10, 10, 10,
+		11, 11, 11,
+		12, 12,
+		13, 13,
+	},
+	[15] = {
+		1, 1,
+		2, 2,
+		3, 3,
+		4, 4, 4,
+		5, 5,
+		6, 6, 6,
+		7, 7,
+		8, 8,
+		9, 9, 9,
+		10, 10,
+		11, 11, 11,
+		12, 12,
+		13, 13,
+		14, 14,
+	},
+	[16] = {
+		1, 1,
+		2, 2,
+		3, 3,
+		4, 4,
+		5, 5,
+		6, 6, 6,
+		7, 7,
+		8, 8,
+		9, 9,
+		10, 10, 10,
+		11, 11,
+		12, 12,
+		13, 13,
+		14, 14,
+		15, 15,
+	},
+};
+
+void
+gf100_grctx_generate_alpha_beta_tables(struct gf100_gr *gr)
+{
+	struct nvkm_subdev *subdev = &gr->base.engine.subdev;
+	struct nvkm_device *device = subdev->device;
+	int i, gpc;
+
+	for (i = 0; i < 32; i++) {
+		u32 atarget = gf100_grctx_alpha_beta_map[gr->tpc_total][i];
+		u32 abits[GPC_MAX] = {}, amask = 0, bmask = 0;
+
+		if (!atarget) {
+			nvkm_warn(subdev, "missing alpha/beta mapping table\n");
+			atarget = max_t(u32, gr->tpc_total * i / 32, 1);
+		}
+
+		while (atarget) {
+			for (gpc = 0; atarget && gpc < gr->gpc_nr; gpc++) {
+				if (abits[gpc] < gr->tpc_nr[gpc]) {
+					abits[gpc]++;
+					atarget--;
+				}
+			}
 		}
 
-		nvkm_wr32(device, 0x406800 + (i * 0x20), lower_32_bits(tpc_set));
-		nvkm_wr32(device, 0x406c00 + (i * 0x20), lower_32_bits(tpc_set ^ tpc_mask));
-		if (gr->gpc_nr > 4) {
-			nvkm_wr32(device, 0x406804 + (i * 0x20), upper_32_bits(tpc_set));
-			nvkm_wr32(device, 0x406c04 + (i * 0x20), upper_32_bits(tpc_set ^ tpc_mask));
+		for (gpc = 0; gpc < gr->gpc_nr; gpc++) {
+			u32 bbits = gr->tpc_nr[gpc] - abits[gpc];
+			amask |= ((1 << abits[gpc]) - 1) << (gpc * 8);
+			bmask |= ((1 << bbits) - 1) << abits[gpc] << (gpc * 8);
 		}
+
+		nvkm_wr32(device, 0x406800 + (i * 0x20), amask);
+		nvkm_wr32(device, 0x406c00 + (i * 0x20), bmask);
+	}
+}
+
+void
+gf100_grctx_generate_tpc_nr(struct gf100_gr *gr, int gpc)
+{
+	struct nvkm_device *device = gr->base.engine.subdev.device;
+	nvkm_wr32(device, GPC_UNIT(gpc, 0x0c08), gr->tpc_nr[gpc]);
+	nvkm_wr32(device, GPC_UNIT(gpc, 0x0c8c), gr->tpc_nr[gpc]);
+}
+
+void
+gf100_grctx_generate_sm_id(struct gf100_gr *gr, int gpc, int tpc, int sm)
+{
+	struct nvkm_device *device = gr->base.engine.subdev.device;
+	nvkm_wr32(device, TPC_UNIT(gpc, tpc, 0x698), sm);
+	nvkm_wr32(device, TPC_UNIT(gpc, tpc, 0x4e8), sm);
+	nvkm_wr32(device, GPC_UNIT(gpc, 0x0c10 + tpc * 4), sm);
+	nvkm_wr32(device, TPC_UNIT(gpc, tpc, 0x088), sm);
+}
+
+void
+gf100_grctx_generate_floorsweep(struct gf100_gr *gr)
+{
+	const struct gf100_grctx_func *func = gr->func->grctx;
+	int sm;
+
+	for (sm = 0; sm < gr->sm_nr; sm++) {
+		func->sm_id(gr, gr->sm[sm].gpc, gr->sm[sm].tpc, sm);
+		if (func->tpc_nr)
+			func->tpc_nr(gr, gr->sm[sm].gpc);
 	}
+
+	gf100_gr_init_num_tpc_per_gpc(gr, false, true);
+	if (!func->skip_pd_num_tpc_per_gpc)
+		gf100_gr_init_num_tpc_per_gpc(gr, true, false);
+
+	if (func->r4060a8)
+		func->r4060a8(gr);
+
+	func->rop_mapping(gr);
+
+	if (func->alpha_beta_tables)
+		func->alpha_beta_tables(gr);
+	if (func->max_ways_evict)
+		func->max_ways_evict(gr);
+	if (func->dist_skip_table)
+		func->dist_skip_table(gr);
+	if (func->r406500)
+		func->r406500(gr);
+	if (func->gpc_tpc_nr)
+		func->gpc_tpc_nr(gr);
+	if (func->r419f78)
+		func->r419f78(gr);
+	if (func->tpc_mask)
+		func->tpc_mask(gr);
+	if (func->smid_config)
+		func->smid_config(gr);
 }
 
 void
-gf100_grctx_generate_main(struct gf100_gr *gr, struct gf100_grctx *info)
+gf100_grctx_generate_main(struct gf100_gr_chan *chan)
 {
+	struct gf100_gr *gr = chan->gr;
 	struct nvkm_device *device = gr->base.engine.subdev.device;
 	const struct gf100_grctx_func *grctx = gr->func->grctx;
 	u32 idle_timeout;
 
 	nvkm_mc_unk260(device, 0);
 
-	gf100_gr_mmio(gr, grctx->hub);
-	gf100_gr_mmio(gr, grctx->gpc);
-	gf100_gr_mmio(gr, grctx->zcull);
-	gf100_gr_mmio(gr, grctx->tpc);
-	gf100_gr_mmio(gr, grctx->ppc);
+	if (!gr->sw_ctx) {
+		gf100_gr_mmio(gr, grctx->hub);
+		gf100_gr_mmio(gr, grctx->gpc_0);
+		gf100_gr_mmio(gr, grctx->zcull);
+		gf100_gr_mmio(gr, grctx->gpc_1);
+		gf100_gr_mmio(gr, grctx->tpc);
+		gf100_gr_mmio(gr, grctx->ppc);
+	} else {
+		gf100_gr_mmio(gr, gr->sw_ctx);
+	}
+
+	if (gr->func->init_419bd8)
+		gr->func->init_419bd8(gr);
+	if (grctx->r419ea8)
+		grctx->r419ea8(gr);
+
+	gf100_gr_wait_idle(gr);
 
 	idle_timeout = nvkm_mask(device, 0x404154, 0xffffffff, 0x00000000);
 
-	grctx->bundle(info);
-	grctx->pagepool(info);
-	grctx->attrib(info);
+	grctx->pagepool(chan, chan->pagepool->addr);
+	grctx->bundle(chan, chan->bundle_cb->addr, grctx->bundle_size);
+	grctx->attrib_cb(chan, chan->attrib_cb->addr, grctx->attrib_cb_size(gr));
+	grctx->attrib(chan);
+	if (grctx->patch_ltc)
+		grctx->patch_ltc(chan);
+	if (grctx->unknown_size)
+		grctx->unknown(chan, chan->unknown->addr, grctx->unknown_size);
 	grctx->unkn(gr);
 
-	gf100_grctx_generate_tpcid(gr);
-	gf100_grctx_generate_r406028(gr);
-	gf100_grctx_generate_r4060a8(gr);
-	gf100_grctx_generate_r418bb8(gr);
-	gf100_grctx_generate_r406800(gr);
+	gf100_grctx_generate_floorsweep(gr);
+
+	gf100_gr_wait_idle(gr);
+
+	if (grctx->r400088) grctx->r400088(gr, false);
+
+	if (gr->bundle)
+		gf100_gr_icmd(gr, gr->bundle);
+	else
+		gf100_gr_icmd(gr, grctx->icmd);
+
+	if (gr->bundle_veid)
+		gf100_gr_icmd(gr, gr->bundle_veid);
+	else
+		gf100_gr_icmd(gr, grctx->sw_veid_bundle_init);
+
+	if (gr->bundle64)
+		gf100_gr_icmd(gr, gr->bundle64);
+	else
+	if (grctx->sw_bundle64_init)
+		gf100_gr_icmd(gr, grctx->sw_bundle64_init);
+
+	if (grctx->r400088) grctx->r400088(gr, true);
 
-	gf100_gr_icmd(gr, grctx->icmd);
 	nvkm_wr32(device, 0x404154, idle_timeout);
-	gf100_gr_mthd(gr, grctx->mthd);
+
+	if (gr->method)
+		gf100_gr_mthd(gr, gr->method);
+	else
+		gf100_gr_mthd(gr, grctx->mthd);
 	nvkm_mc_unk260(device, 1);
+
+	if (grctx->r419cb8)
+		grctx->r419cb8(gr);
+	if (grctx->r418800)
+		grctx->r418800(gr);
+	if (grctx->r419eb0)
+		grctx->r419eb0(gr);
+	if (grctx->r419e00)
+		grctx->r419e00(gr);
+	if (grctx->r418e94)
+		grctx->r418e94(gr);
+	if (grctx->r419a3c)
+		grctx->r419a3c(gr);
+	if (grctx->r408840)
+		grctx->r408840(gr);
+	if (grctx->r419c0c)
+		grctx->r419c0c(gr);
+
+	gf100_gr_wait_idle(gr);
 }
 
+#define CB_RESERVED 0x80000
+
 int
-gf100_grctx_generate(struct gf100_gr *gr)
+gf100_grctx_generate(struct gf100_gr *gr, struct gf100_gr_chan *chan, struct nvkm_gpuobj *inst)
 {
 	const struct gf100_grctx_func *grctx = gr->func->grctx;
 	struct nvkm_subdev *subdev = &gr->base.engine.subdev;
 	struct nvkm_device *device = subdev->device;
-	struct nvkm_memory *chan;
-	struct gf100_grctx info;
+	struct nvkm_memory *data = NULL;
+	struct nvkm_vma *ctx = NULL;
 	int ret, i;
 	u64 addr;
 
-	/* allocate memory to for a "channel", which we'll use to generate
-	 * the default context values
-	 */
-	ret = nvkm_memory_new(device, NVKM_MEM_TARGET_INST, 0x80000 + gr->size,
-			      0x1000, true, &chan);
-	if (ret) {
-		nvkm_error(subdev, "failed to allocate chan memory, %d\n", ret);
-		return ret;
-	}
-
-	addr = nvkm_memory_addr(chan);
+	/* NV_PGRAPH_FE_PWR_MODE_FORCE_ON. */
+	nvkm_wr32(device, 0x404170, 0x00000012);
+	nvkm_msec(device, 2000,
+		if (!(nvkm_rd32(device, 0x404170) & 0x00000010))
+			break;
+	);
 
-	/* PGD pointer */
-	nvkm_kmap(chan);
-	nvkm_wo32(chan, 0x0200, lower_32_bits(addr + 0x1000));
-	nvkm_wo32(chan, 0x0204, upper_32_bits(addr + 0x1000));
-	nvkm_wo32(chan, 0x0208, 0xffffffff);
-	nvkm_wo32(chan, 0x020c, 0x000000ff);
+	if (grctx->unkn88c)
+		grctx->unkn88c(gr, true);
 
-	/* PGT[0] pointer */
-	nvkm_wo32(chan, 0x1000, 0x00000000);
-	nvkm_wo32(chan, 0x1004, 0x00000001 | (addr + 0x2000) >> 8);
+	/* Reset FECS. */
+	gr->func->fecs.reset(gr);
 
-	/* identity-map the whole "channel" into its own vm */
-	for (i = 0; i < nvkm_memory_size(chan) / 4096; i++) {
-		u64 addr = ((nvkm_memory_addr(chan) + (i * 4096)) >> 8) | 1;
-		nvkm_wo32(chan, 0x2000 + (i * 8), lower_32_bits(addr));
-		nvkm_wo32(chan, 0x2004 + (i * 8), upper_32_bits(addr));
-	}
+	if (grctx->unkn88c)
+		grctx->unkn88c(gr, false);
 
-	/* context pointer (virt) */
-	nvkm_wo32(chan, 0x0210, 0x00080004);
-	nvkm_wo32(chan, 0x0214, 0x00000000);
-	nvkm_done(chan);
-
-	nvkm_wr32(device, 0x100cb8, (addr + 0x1000) >> 8);
-	nvkm_wr32(device, 0x100cbc, 0x80000001);
+	/* NV_PGRAPH_FE_PWR_MODE_AUTO. */
+	nvkm_wr32(device, 0x404170, 0x00000010);
 	nvkm_msec(device, 2000,
-		if (nvkm_rd32(device, 0x100c80) & 0x00008000)
+		if (!(nvkm_rd32(device, 0x404170) & 0x00000010))
 			break;
 	);
 
-	/* setup default state for mmio list construction */
-	info.gr = gr;
-	info.data = gr->mmio_data;
-	info.mmio = gr->mmio_list;
-	info.addr = 0x2000 + (i * 8);
-	info.buffer_nr = 0;
+	/* Init SCC RAM. */
+	nvkm_wr32(device, 0x40802c, 0x00000001);
 
-	/* make channel current */
-	if (gr->firmware) {
-		nvkm_wr32(device, 0x409840, 0x00000030);
-		nvkm_wr32(device, 0x409500, 0x80000000 | addr >> 12);
-		nvkm_wr32(device, 0x409504, 0x00000003);
-		nvkm_msec(device, 2000,
-			if (nvkm_rd32(device, 0x409800) & 0x00000010)
-				break;
-		);
+	/* Allocate memory to store context, and dummy global context buffers. */
+	ret = nvkm_memory_new(device, NVKM_MEM_TARGET_INST,
+			      CB_RESERVED + gr->size, 0, true, &data);
+	if (ret)
+		goto done;
 
-		nvkm_kmap(chan);
-		nvkm_wo32(chan, 0x8001c, 1);
-		nvkm_wo32(chan, 0x80020, 0);
-		nvkm_wo32(chan, 0x80028, 0);
-		nvkm_wo32(chan, 0x8002c, 0);
-		nvkm_done(chan);
+	ret = nvkm_vmm_get(chan->vmm, 0, nvkm_memory_size(data), &ctx);
+	if (ret)
+		goto done;
+
+	ret = nvkm_memory_map(data, 0, chan->vmm, ctx, NULL, 0);
+	if (ret)
+		goto done;
+
+	/* Setup context pointer. */
+	nvkm_kmap(inst);
+	nvkm_wo32(inst, 0x0210, lower_32_bits(ctx->addr + CB_RESERVED) | 4);
+	nvkm_wo32(inst, 0x0214, upper_32_bits(ctx->addr + CB_RESERVED));
+	nvkm_done(inst);
+
+	/* Make channel current. */
+	addr = inst->addr >> 12;
+	if (gr->firmware) {
+		ret = gf100_gr_fecs_bind_pointer(gr, 0x80000000 | addr);
+		if (ret)
+			goto done_inst;
+
+		nvkm_kmap(data);
+		nvkm_wo32(data, 0x1c, 1);
+		nvkm_wo32(data, 0x20, 0);
+		nvkm_wo32(data, 0x28, 0);
+		nvkm_wo32(data, 0x2c, 0);
+		nvkm_done(data);
 	} else {
 		nvkm_wr32(device, 0x409840, 0x80000000);
-		nvkm_wr32(device, 0x409500, 0x80000000 | addr >> 12);
+		nvkm_wr32(device, 0x409500, 0x80000000 | addr);
 		nvkm_wr32(device, 0x409504, 0x00000001);
 		nvkm_msec(device, 2000,
 			if (nvkm_rd32(device, 0x409800) & 0x80000000)
@@ -1353,34 +1512,48 @@ gf100_grctx_generate(struct gf100_gr *gr)
 		);
 	}
 
-	grctx->main(gr, &info);
+	grctx->main(chan);
 
-	/* trigger a context unload by unsetting the "next channel valid" bit
-	 * and faking a context switch interrupt
-	 */
-	nvkm_mask(device, 0x409b04, 0x80000000, 0x00000000);
-	nvkm_wr32(device, 0x409000, 0x00000100);
-	if (nvkm_msec(device, 2000,
-		if (!(nvkm_rd32(device, 0x409b00) & 0x80000000))
-			break;
-	) < 0) {
-		ret = -EBUSY;
-		goto done;
+	if (!gr->firmware) {
+		/* Trigger a context unload by unsetting the "next channel valid" bit
+		 * and faking a context switch interrupt.
+		 */
+		nvkm_mask(device, 0x409b04, 0x80000000, 0x00000000);
+		nvkm_wr32(device, 0x409000, 0x00000100);
+		if (nvkm_msec(device, 2000,
+			if (!(nvkm_rd32(device, 0x409b00) & 0x80000000))
+				break;
+		) < 0) {
+			ret = -EBUSY;
+			goto done_inst;
+		}
+	} else {
+		ret = gf100_gr_fecs_wfi_golden_save(gr, 0x80000000 | addr);
+		if (ret)
+			goto done_inst;
+
+		nvkm_mask(device, 0x409b00, 0x80000000, 0x00000000);
 	}
 
 	gr->data = kmalloc(gr->size, GFP_KERNEL);
 	if (gr->data) {
-		nvkm_kmap(chan);
+		nvkm_kmap(data);
 		for (i = 0; i < gr->size; i += 4)
-			gr->data[i / 4] = nvkm_ro32(chan, 0x80000 + i);
-		nvkm_done(chan);
+			gr->data[i / 4] = nvkm_ro32(data, CB_RESERVED + i);
+		nvkm_done(data);
 		ret = 0;
 	} else {
 		ret = -ENOMEM;
 	}
 
+done_inst:
+	nvkm_kmap(inst);
+	nvkm_wo32(inst, 0x0210, 0);
+	nvkm_wo32(inst, 0x0214, 0);
+	nvkm_done(inst);
 done:
-	nvkm_memory_del(&chan);
+	nvkm_vmm_put(chan->vmm, &ctx);
+	nvkm_memory_unref(&data);
 	return ret;
 }
 
@@ -1389,7 +1562,8 @@ gf100_grctx = {
 	.main  = gf100_grctx_generate_main,
 	.unkn  = gf100_grctx_generate_unkn,
 	.hub   = gf100_grctx_pack_hub,
-	.gpc   = gf100_grctx_pack_gpc,
+	.gpc_0 = gf100_grctx_pack_gpc_0,
+	.gpc_1 = gf100_grctx_pack_gpc_1,
 	.zcull = gf100_grctx_pack_zcull,
 	.tpc   = gf100_grctx_pack_tpc,
 	.icmd  = gf100_grctx_pack_icmd,
@@ -1398,7 +1572,16 @@ gf100_grctx = {
 	.bundle_size = 0x1800,
 	.pagepool = gf100_grctx_generate_pagepool,
 	.pagepool_size = 0x8000,
+	.attrib_cb_size = gf100_grctx_generate_attrib_cb_size,
+	.attrib_cb = gf100_grctx_generate_attrib_cb,
 	.attrib = gf100_grctx_generate_attrib,
 	.attrib_nr_max = 0x324,
 	.attrib_nr = 0x218,
+	.sm_id = gf100_grctx_generate_sm_id,
+	.tpc_nr = gf100_grctx_generate_tpc_nr,
+	.r4060a8 = gf100_grctx_generate_r4060a8,
+	.rop_mapping = gf100_grctx_generate_rop_mapping,
+	.alpha_beta_tables = gf100_grctx_generate_alpha_beta_tables,
+	.max_ways_evict = gf100_grctx_generate_max_ways_evict,
+	.r419cb8 = gf100_grctx_generate_r419cb8,
 };