Skip to content

Commit

Permalink
drm/i915: Flush TLBs before releasing backing store
Browse files Browse the repository at this point in the history
We need to flush TLBs before releasing backing store, otherwise userspace
is able to encounter stale entries if a) it is not declaring access to
certain buffers and b) it races with the backing store release from
such an undeclared execution already executing on the GPU in parallel.

The approach taken is to mark any buffer objects which were ever bound
to the GPU and to trigger a serialized TLB flush when their backing
store is released.

Alternatively the flushing could be done on VMA unbind, at which point
we would be able to ascertain whether there is potentially a parallel GPU
execution (which could race), but essentially it boils down to paying
the cost of TLB flushes potentially needlessly at VMA unbind time (when
the backing store is not known to be going away so not needed for
safety), versus potentially needlessly at backing store release time
(since we at that point cannot tell whether there is anything executing
on the GPU which uses that object).

Therefore simplicity of implementation has been chosen for now with
scope to benchmark and refine later as required.

Signed-off-by: Tvrtko Ursulin <[email protected]>
Reported-by: Sushma Venkatesh Reddy <[email protected]>
Reviewed-by: Daniel Vetter <[email protected]>
Acked-by: Dave Airlie <[email protected]>
Cc: Daniel Vetter <[email protected]>
Cc: Jon Bloomfield <[email protected]>
Cc: Joonas Lahtinen <[email protected]>
Cc: Jani Nikula <[email protected]>
Cc: [email protected]
Signed-off-by: Linus Torvalds <[email protected]>
  • Loading branch information
tursulin authored and torvalds committed Jan 25, 2022
1 parent 49d766f commit 7938d61
Show file tree
Hide file tree
Showing 9 changed files with 161 additions and 4 deletions.
1 change: 1 addition & 0 deletions drivers/gpu/drm/i915/gem/i915_gem_object_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -311,6 +311,7 @@ struct drm_i915_gem_object {
#define I915_BO_READONLY BIT(6)
#define I915_TILING_QUIRK_BIT 7 /* unknown swizzling; do not release! */
#define I915_BO_PROTECTED BIT(8)
#define I915_BO_WAS_BOUND_BIT 9
/**
* @mem_flags - Mutable placement-related flags
*
Expand Down
10 changes: 10 additions & 0 deletions drivers/gpu/drm/i915/gem/i915_gem_pages.c
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
#include "i915_gem_lmem.h"
#include "i915_gem_mman.h"

#include "gt/intel_gt.h"

void __i915_gem_object_set_pages(struct drm_i915_gem_object *obj,
struct sg_table *pages,
unsigned int sg_page_sizes)
Expand Down Expand Up @@ -221,6 +223,14 @@ __i915_gem_object_unset_pages(struct drm_i915_gem_object *obj)
__i915_gem_object_reset_page_iter(obj);
obj->mm.page_sizes.phys = obj->mm.page_sizes.sg = 0;

if (test_and_clear_bit(I915_BO_WAS_BOUND_BIT, &obj->flags)) {
struct drm_i915_private *i915 = to_i915(obj->base.dev);
intel_wakeref_t wakeref;

with_intel_runtime_pm_if_active(&i915->runtime_pm, wakeref)
intel_gt_invalidate_tlbs(to_gt(i915));
}

return pages;
}

Expand Down
108 changes: 108 additions & 0 deletions drivers/gpu/drm/i915/gt/intel_gt.c
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ void __intel_gt_init_early(struct intel_gt *gt, struct drm_i915_private *i915)
{
spin_lock_init(&gt->irq_lock);

mutex_init(&gt->tlb_invalidate_lock);

INIT_LIST_HEAD(&gt->closed_vma);
spin_lock_init(&gt->closed_lock);

Expand Down Expand Up @@ -912,3 +914,109 @@ void intel_gt_info_print(const struct intel_gt_info *info,

intel_sseu_dump(&info->sseu, p);
}

/*
 * Result of get_reg_and_bit(): the register to poke for one engine together
 * with the mask to write into it. A zeroed instance (reg offset 0) is what
 * get_reg_and_bit() hands back when it has no register for the engine.
 */
struct reg_and_bit {
i915_reg_t reg; /* register the caller writes to and then polls */
u32 bit; /* single-bit mask, already expanded with BIT() */
};

static struct reg_and_bit
get_reg_and_bit(const struct intel_engine_cs *engine, const bool gen8,
const i915_reg_t *regs, const unsigned int num)
{
const unsigned int class = engine->class;
struct reg_and_bit rb = { };

if (drm_WARN_ON_ONCE(&engine->i915->drm,
class >= num || !regs[class].reg))
return rb;

rb.reg = regs[class];
if (gen8 && class == VIDEO_DECODE_CLASS)
rb.reg.reg += 4 * engine->instance; /* GEN8_M2TCR */
else
rb.bit = engine->instance;

rb.bit = BIT(rb.bit);

return rb;
}

void intel_gt_invalidate_tlbs(struct intel_gt *gt)
{
static const i915_reg_t gen8_regs[] = {
[RENDER_CLASS] = GEN8_RTCR,
[VIDEO_DECODE_CLASS] = GEN8_M1TCR, /* , GEN8_M2TCR */
[VIDEO_ENHANCEMENT_CLASS] = GEN8_VTCR,
[COPY_ENGINE_CLASS] = GEN8_BTCR,
};
static const i915_reg_t gen12_regs[] = {
[RENDER_CLASS] = GEN12_GFX_TLB_INV_CR,
[VIDEO_DECODE_CLASS] = GEN12_VD_TLB_INV_CR,
[VIDEO_ENHANCEMENT_CLASS] = GEN12_VE_TLB_INV_CR,
[COPY_ENGINE_CLASS] = GEN12_BLT_TLB_INV_CR,
};
struct drm_i915_private *i915 = gt->i915;
struct intel_uncore *uncore = gt->uncore;
struct intel_engine_cs *engine;
enum intel_engine_id id;
const i915_reg_t *regs;
unsigned int num = 0;

if (I915_SELFTEST_ONLY(gt->awake == -ENODEV))
return;

if (GRAPHICS_VER(i915) == 12) {
regs = gen12_regs;
num = ARRAY_SIZE(gen12_regs);
} else if (GRAPHICS_VER(i915) >= 8 && GRAPHICS_VER(i915) <= 11) {
regs = gen8_regs;
num = ARRAY_SIZE(gen8_regs);
} else if (GRAPHICS_VER(i915) < 8) {
return;
}

if (drm_WARN_ONCE(&i915->drm, !num,
"Platform does not implement TLB invalidation!"))
return;

GEM_TRACE("\n");

assert_rpm_wakelock_held(&i915->runtime_pm);

mutex_lock(&gt->tlb_invalidate_lock);
intel_uncore_forcewake_get(uncore, FORCEWAKE_ALL);

for_each_engine(engine, gt, id) {
/*
* HW architecture suggest typical invalidation time at 40us,
* with pessimistic cases up to 100us and a recommendation to
* cap at 1ms. We go a bit higher just in case.
*/
const unsigned int timeout_us = 100;
const unsigned int timeout_ms = 4;
struct reg_and_bit rb;

rb = get_reg_and_bit(engine, regs == gen8_regs, regs, num);
if (!i915_mmio_reg_offset(rb.reg))
continue;

intel_uncore_write_fw(uncore, rb.reg, rb.bit);
if (__intel_wait_for_register_fw(uncore,
rb.reg, rb.bit, 0,
timeout_us, timeout_ms,
NULL))
drm_err_ratelimited(&gt->i915->drm,
"%s TLB invalidation did not complete in %ums!\n",
engine->name, timeout_ms);
}

/*
* Use delayed put since a) we mostly expect a flurry of TLB
* invalidations so it is good to avoid paying the forcewake cost and
* b) it works around a bug in Icelake which cannot cope with too rapid
* transitions.
*/
intel_uncore_forcewake_put_delayed(uncore, FORCEWAKE_ALL);
mutex_unlock(&gt->tlb_invalidate_lock);
}
2 changes: 2 additions & 0 deletions drivers/gpu/drm/i915/gt/intel_gt.h
Original file line number Diff line number Diff line change
Expand Up @@ -91,4 +91,6 @@ void intel_gt_info_print(const struct intel_gt_info *info,

void intel_gt_watchdog_work(struct work_struct *work);

void intel_gt_invalidate_tlbs(struct intel_gt *gt);

#endif /* __INTEL_GT_H__ */
2 changes: 2 additions & 0 deletions drivers/gpu/drm/i915/gt/intel_gt_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,8 @@ struct intel_gt {

struct intel_uc uc;

struct mutex tlb_invalidate_lock;

struct i915_wa_list wa_list;

struct intel_gt_timelines {
Expand Down
11 changes: 11 additions & 0 deletions drivers/gpu/drm/i915/i915_reg.h
Original file line number Diff line number Diff line change
Expand Up @@ -2721,6 +2721,12 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
#define GAMT_CHKN_DISABLE_DYNAMIC_CREDIT_SHARING (1 << 28)
#define GAMT_CHKN_DISABLE_I2M_CYCLE_ON_WR_PORT (1 << 24)

/*
 * Per-engine TLB control registers used by intel_gt_invalidate_tlbs() on
 * graphics versions 8 to 11: render (RTCR), video decode (M1TCR, with M2TCR
 * for the next decode instance), copy (BTCR) and video enhancement (VTCR).
 */
#define GEN8_RTCR _MMIO(0x4260)
#define GEN8_M1TCR _MMIO(0x4264)
#define GEN8_M2TCR _MMIO(0x4268)
#define GEN8_BTCR _MMIO(0x426c)
#define GEN8_VTCR _MMIO(0x4270)

#if 0
#define PRB0_TAIL _MMIO(0x2030)
#define PRB0_HEAD _MMIO(0x2034)
Expand Down Expand Up @@ -2819,6 +2825,11 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
#define FAULT_VA_HIGH_BITS (0xf << 0)
#define FAULT_GTT_SEL (1 << 4)

/*
 * TLB invalidation registers used by intel_gt_invalidate_tlbs() on graphics
 * version 12, one per engine class: render (GFX), video decode (VD), video
 * enhancement (VE) and copy (BLT).
 */
#define GEN12_GFX_TLB_INV_CR _MMIO(0xced8)
#define GEN12_VD_TLB_INV_CR _MMIO(0xcedc)
#define GEN12_VE_TLB_INV_CR _MMIO(0xcee0)
#define GEN12_BLT_TLB_INV_CR _MMIO(0xcee4)

#define GEN12_AUX_ERR_DBG _MMIO(0x43f4)

#define FPGA_DBG _MMIO(0x42300)
Expand Down
3 changes: 3 additions & 0 deletions drivers/gpu/drm/i915/i915_vma.c
Original file line number Diff line number Diff line change
Expand Up @@ -457,6 +457,9 @@ int i915_vma_bind(struct i915_vma *vma,
vma->ops->bind_vma(vma->vm, NULL, vma, cache_level, bind_flags);
}

if (vma->obj)
set_bit(I915_BO_WAS_BOUND_BIT, &vma->obj->flags);

atomic_or(bind_flags, &vma->flags);
return 0;
}
Expand Down
26 changes: 22 additions & 4 deletions drivers/gpu/drm/i915/intel_uncore.c
Original file line number Diff line number Diff line change
Expand Up @@ -724,7 +724,8 @@ void intel_uncore_forcewake_get__locked(struct intel_uncore *uncore,
}

static void __intel_uncore_forcewake_put(struct intel_uncore *uncore,
enum forcewake_domains fw_domains)
enum forcewake_domains fw_domains,
bool delayed)
{
struct intel_uncore_forcewake_domain *domain;
unsigned int tmp;
Expand All @@ -739,7 +740,11 @@ static void __intel_uncore_forcewake_put(struct intel_uncore *uncore,
continue;
}

fw_domains_put(uncore, domain->mask);
if (delayed &&
!(domain->uncore->fw_domains_timer & domain->mask))
fw_domain_arm_timer(domain);
else
fw_domains_put(uncore, domain->mask);
}
}

Expand All @@ -760,7 +765,20 @@ void intel_uncore_forcewake_put(struct intel_uncore *uncore,
return;

spin_lock_irqsave(&uncore->lock, irqflags);
__intel_uncore_forcewake_put(uncore, fw_domains);
__intel_uncore_forcewake_put(uncore, fw_domains, false);
spin_unlock_irqrestore(&uncore->lock, irqflags);
}

/*
 * intel_uncore_forcewake_put_delayed - drop forcewake references lazily
 * @uncore: the uncore the references were taken on
 * @fw_domains: forcewake domains to release
 *
 * Same as intel_uncore_forcewake_put(), except that it passes delayed=true
 * to __intel_uncore_forcewake_put(). Takes uncore->lock with interrupts
 * saved and disabled. Does nothing when @uncore has no fw_get_funcs.
 */
void intel_uncore_forcewake_put_delayed(struct intel_uncore *uncore,
					enum forcewake_domains fw_domains)
{
	unsigned long flags;

	if (uncore->fw_get_funcs) {
		spin_lock_irqsave(&uncore->lock, flags);
		__intel_uncore_forcewake_put(uncore, fw_domains, true);
		spin_unlock_irqrestore(&uncore->lock, flags);
	}
}

Expand Down Expand Up @@ -802,7 +820,7 @@ void intel_uncore_forcewake_put__locked(struct intel_uncore *uncore,
if (!uncore->fw_get_funcs)
return;

__intel_uncore_forcewake_put(uncore, fw_domains);
__intel_uncore_forcewake_put(uncore, fw_domains, false);
}

void assert_forcewakes_inactive(struct intel_uncore *uncore)
Expand Down
2 changes: 2 additions & 0 deletions drivers/gpu/drm/i915/intel_uncore.h
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,8 @@ void intel_uncore_forcewake_get(struct intel_uncore *uncore,
enum forcewake_domains domains);
void intel_uncore_forcewake_put(struct intel_uncore *uncore,
enum forcewake_domains domains);
void intel_uncore_forcewake_put_delayed(struct intel_uncore *uncore,
enum forcewake_domains domains);
void intel_uncore_forcewake_flush(struct intel_uncore *uncore,
enum forcewake_domains fw_domains);

Expand Down

0 comments on commit 7938d61

Please sign in to comment.