lazy stack: do nothing in kernel threads and on populated stack
This patch annotates the relevant call sites with invariant assert
expressions to validate the assumptions that let us do "nothing" in all these
cases. We also reorganize some code in the scheduler to help differentiate
between the cases when a given function/method is called with interrupts or
preemption disabled, from a kernel thread, or by an interrupt handler.

The following methods are added to the scheduler code, with names describing
the state of interrupts or preemption, or a kernel caller (see the
illustration right after this list):
- timer_base::set_with_irq_disabled(osv::clock::uptime::time_point time)
- timer_base::set_with_irq_disabled(std::chrono::duration<Rep, Period> duration)
- thread::wake_with_irq_disabled()
- thread::wake_with_irq_or_preemption_disabled(Action action)
- thread_handle::wake_from_kernel_or_with_irq_disabled()
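
As a rough illustration of the convention, each variant mirrors its existing
counterpart but carries an invariant assertion documenting the caller's state;
for example, the new thread::wake_with_irq_disabled() (copied, abridged, from
the core/sched.cc diff below):

    void thread::wake_with_irq_disabled()
    {
    #if CONF_lazy_stack_invariant
        assert(!arch::irq_enabled()); // caller is, e.g., an interrupt handler
    #endif
        WITH_LOCK(rcu_read_lock) {
            wake_impl(_detached_state.get());
        }
    }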

In general:
- we modify all interrupt handlers (those executed on the interrupt stack)
  to call one of the 3 new wake_...() methods (mostly wake_with_irq_disabled())
  to indicate that we do not need to, and should not, pre-fault the stack; most
  of these are in device driver code
- we modify all code executed on kernel threads that disables preemption or
  interrupts by adding the relevant invariant - assert(!sched::thread::current()->is_app());
  we do not need to pre-fault because the stack is already populated
- we also modify code which is indirectly called from kernel threads, like
  classifier::post_packet() in net channels
- finally, we also modify the scheduler code to use timer_base::set_with_irq_disabled(),
  mostly around preemption_timer, to indicate that we should not pre-fault the
  stack downstream; the sketch after this list illustrates the underlying rule
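
For orientation, here is a minimal standalone sketch of the rule these
invariants encode (a hypothetical helper - the name and the exact probe
instruction are illustrative assumptions, not this commit's code): only an app
thread that still has interrupts and preemption enabled needs to pre-fault the
next stack page before entering a non-preemptible section; in every other case
we do nothing.

    #include <osv/sched.hh>   // sched::thread, sched::preemptable()
    #include "arch.hh"        // arch::irq_enabled()

    static inline void pre_fault_stack_if_needed()
    {
        if (sched::thread::current()->is_app() &&
            arch::irq_enabled() && sched::preemptable()) {
            // Touch the next stack page now, while a page fault can still
            // be handled safely; after this, interrupts/preemption may be
            // disabled without risking a fault on the demand-paged stack.
            asm volatile("movq $0, -4096(%%rsp)" : : : "memory");
        }
        // Otherwise do nothing: kernel threads run on pre-populated stacks,
        // and interrupt handlers run on a dedicated, pre-mapped stack.
    }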

Signed-off-by: Waldemar Kozaczuk <[email protected]>
wkozaczuk committed Oct 16, 2022
1 parent f5684d9 commit fabacc4
Showing 32 changed files with 157 additions and 39 deletions.
3 changes: 3 additions & 0 deletions arch/aarch64/exceptions.cc
@@ -122,6 +122,9 @@ void interrupt_table::unregister_interrupt(interrupt *interrupt)

bool interrupt_table::invoke_interrupt(unsigned int id)
{
#if CONF_lazy_stack_invariant
assert(!arch::irq_enabled());
#endif
WITH_LOCK(osv::rcu_read_lock) {
assert(id < this->nr_irqs);
interrupt_desc *desc = this->irq_desc[id].read();
3 changes: 3 additions & 0 deletions arch/aarch64/interrupt.cc
@@ -22,6 +22,9 @@ sgi_interrupt::~sgi_interrupt()

void sgi_interrupt::send(sched::cpu* cpu)
{
#if CONF_lazy_stack_invariant
assert(!arch::irq_enabled() || !sched::preemptable());
#endif
gic::gic->send_sgi(gic::sgi_filter::SGI_TARGET_LIST,
cpu->arch.smp_idx, get_id());
}
3 changes: 3 additions & 0 deletions arch/x64/exceptions.cc
@@ -220,6 +220,9 @@ void interrupt_descriptor_table::unregister_interrupt(gsi_level_interrupt *inter

void interrupt_descriptor_table::invoke_interrupt(unsigned vector)
{
#if CONF_lazy_stack_invariant
assert(!arch::irq_enabled());
#endif
WITH_LOCK(osv::rcu_read_lock) {
unsigned i, nr_shared;
bool handled = false;
2 changes: 1 addition & 1 deletion arch/x64/mmu.cc
@@ -64,7 +64,7 @@ std::atomic<int> tlb_flush_pendingconfirms;
inter_processor_interrupt tlb_flush_ipi{IPI_TLB_FLUSH, [] {
mmu::flush_tlb_local();
if (tlb_flush_pendingconfirms.fetch_add(-1) == 1) {
tlb_flush_waiter.wake();
tlb_flush_waiter.wake_from_kernel_or_with_irq_disabled();
}
}};

2 changes: 1 addition & 1 deletion arch/x64/msi.cc
@@ -126,7 +126,7 @@ static inline void set_affinity_and_wake(
v->msix_unmask_entries();
}

t->wake();
t->wake_with_irq_disabled();
}

bool interrupt_manager::easy_register(std::initializer_list<msix_binding> bindings)
6 changes: 6 additions & 0 deletions core/async.cc
@@ -160,6 +160,9 @@ class async_worker {
return _timer.expired() || !_queue.empty();
});

#if CONF_lazy_stack_invariant
assert(!sched::thread::current()->is_app());
#endif
WITH_LOCK(preempt_lock) {
_timer.cancel();

@@ -224,6 +227,9 @@
}

auto& master = *task.master;
#if CONF_lazy_stack_invariant
assert(!sched::thread::current()->is_app());
#endif
DROP_LOCK(preempt_lock) {
master.fire(task);
}
2 changes: 1 addition & 1 deletion core/epoll.cc
@@ -245,7 +245,7 @@ class epoll_file final : public special_file {
if (!_activity_ring.push(key)) {
_activity_ring_overflow.store(true, std::memory_order_relaxed);
}
_activity_ring_owner.wake();
_activity_ring_owner.wake_from_kernel_or_with_irq_disabled();
}
};

12 changes: 9 additions & 3 deletions core/mempool.cc
@@ -154,6 +154,9 @@ void pool::collect_garbage()

static void garbage_collector_fn()
{
#if CONF_lazy_stack_invariant
assert(!sched::thread::current()->is_app());
#endif
WITH_LOCK(preempt_lock) {
pool::collect_garbage();
}
@@ -1352,9 +1355,12 @@ void l1::fill_thread()
auto& pbuf = get_l1();
for (;;) {
sched::thread::wait_until([&] {
WITH_LOCK(preempt_lock) {
return pbuf.nr < pbuf.watermark_lo || pbuf.nr > pbuf.watermark_hi;
}
#if CONF_lazy_stack_invariant
assert(!sched::thread::current()->is_app());
#endif
WITH_LOCK(preempt_lock) {
return pbuf.nr < pbuf.watermark_lo || pbuf.nr > pbuf.watermark_hi;
}
});
if (pbuf.nr < pbuf.watermark_lo) {
while (pbuf.nr + page_batch::nr_pages < pbuf.max / 2) {
8 changes: 7 additions & 1 deletion core/net_channel.cc
@@ -40,13 +40,16 @@ void net_channel::process_queue()

void net_channel::wake_pollers()
{
#if CONF_lazy_stack_invariant
assert(!sched::thread::current()->is_app());
#endif
WITH_LOCK(osv::rcu_read_lock) {
auto pl = _pollers.read();
if (pl) {
for (pollreq* pr : *pl) {
// net_channel is self synchronizing
pr->_awake.store(true, std::memory_order_relaxed);
pr->_poll_thread.wake();
pr->_poll_thread.wake_from_kernel_or_with_irq_disabled();
}
}
// can't call epoll_wake from rcu, so copy the data
@@ -128,6 +131,9 @@ void classifier::remove(ipv4_tcp_conn_id id)

bool classifier::post_packet(mbuf* m)
{
#if CONF_lazy_stack_invariant
assert(!sched::thread::current()->is_app());
#endif
WITH_LOCK(osv::rcu_read_lock) {
if (auto nc = classify_ipv4_tcp(m)) {
log_packet_in(m, NETISR_ETHER);
5 changes: 4 additions & 1 deletion core/rcu.cc
@@ -118,6 +118,9 @@ void cpu_quiescent_state_thread::do_work()
{
while (true) {
bool toclean = false;
#if CONF_lazy_stack_invariant
assert(!sched::thread::current()->is_app());
#endif
WITH_LOCK(preempt_lock) {
auto p = &*percpu_callbacks;
if (p->ncallbacks[p->buf]) {
@@ -242,7 +245,7 @@ void rcu_flush()
rcu_defer([&] { s.post(); });
// rcu_defer() might not wake the cleanup thread until enough deferred
// callbacks have accumulated, so wake it up now.
percpu_quiescent_state_thread->wake();
percpu_quiescent_state_thread->wake_from_kernel_or_with_irq_disabled();
}, sched::thread::attr().pin(c)));
t->start();
t->join();
79 changes: 71 additions & 8 deletions core/sched.cc
@@ -291,7 +291,7 @@ void cpu::reschedule_from_interrupt(bool called_from_yield,
preemption_timer.cancel();
auto delta = p->_runtime.time_until(t._runtime.get_local());
if (delta > 0) {
preemption_timer.set(now + delta);
preemption_timer.set_with_irq_disabled(now + delta);
}
#ifdef __aarch64__
return switch_data;
@@ -352,11 +352,11 @@
auto& t = *runqueue.begin();
auto delta = n->_runtime.time_until(t._runtime.get_local());
if (delta > 0) {
preemption_timer.set(now + delta);
preemption_timer.set_with_irq_disabled(now + delta);
}
}
} else {
preemption_timer.set(now + preempt_after);
preemption_timer.set_with_irq_disabled(now + preempt_after);
}

if (app_thread.load(std::memory_order_relaxed) != n->_app) { // don't write into a cache line if it can be avoided
@@ -444,6 +444,9 @@ void cpu::do_idle()
}
}
}
#if CONF_lazy_stack_invariant
assert(!thread::current()->is_app());
#endif
std::unique_lock<irq_lock_type> guard(irq_lock);
handle_incoming_wakeups();
if (!runqueue.empty()) {
@@ -462,6 +465,9 @@ void cpu::idle()
// The idle thread must not sleep, because the whole point is that the
// scheduler can always find at least one runnable thread.
// We set preempt_disable just to help us verify this.
#if CONF_lazy_stack_invariant
assert(!thread::current()->is_app());
#endif
preempt_disable();

if (id == 0) {
@@ -477,6 +483,9 @@

void cpu::handle_incoming_wakeups()
{
#if CONF_lazy_stack_invariant
assert(!arch::irq_enabled() || !thread::current()->is_app());
#endif
cpu_set queues_with_wakes{incoming_wakeups_mask.fetch_clear()};
if (!queues_with_wakes) {
return;
@@ -569,7 +578,7 @@ void thread::pin(cpu *target_cpu)
t._detached_state->st.store(thread::status::waiting);
// Note that wakeme is on the same CPU, and irq is disabled,
// so it will not actually run until we stop running.
wakeme->wake_with([&] { do_wakeme = true; });
wakeme->wake_with_irq_or_preemption_disabled([&] { do_wakeme = true; });
#ifdef __aarch64__
reschedule_from_interrupt(source_cpu, false, thyst);
#else
@@ -591,6 +600,9 @@ void thread::pin(thread *t, cpu *target_cpu)
// helper thread to follow the target thread's CPU. We could have also
// re-used an existing thread (e.g., the load balancer thread).
thread_unique_ptr helper(thread::make_unique([&] {
#if CONF_lazy_stack_invariant
assert(!thread::current()->is_app());
#endif
WITH_LOCK(irq_lock) {
// This thread started on the same CPU as t, but by now t might
// have moved. If that happened, we need to move too.
@@ -658,7 +670,7 @@ void thread::pin(thread *t, cpu *target_cpu)
// comment above).
if (t->_detached_state->st.load(std::memory_order_relaxed) == status::waking) {
t->_detached_state->st.store(status::waiting);
t->wake();
t->wake_with_irq_disabled();
}
break;
case status::queued:
Expand All @@ -672,7 +684,7 @@ void thread::pin(thread *t, cpu *target_cpu)
t->remote_thread_local_var(current_cpu) = target_cpu;
// pretend the thread was waiting, so we can wake it
t->_detached_state->st.store(status::waiting);
t->wake();
t->wake_with_irq_disabled();
break;
default:
// Thread is in an unexpected state (for example, already
Expand Down Expand Up @@ -702,6 +714,9 @@ void thread::unpin()
return;
}
thread_unique_ptr helper(thread::make_unique([this] {
#if CONF_lazy_stack_invariant
assert(!thread::current()->is_app());
#endif
WITH_LOCK(preempt_lock) {
// helper thread started on the same CPU as "this", but by now
// "this" might migrated. If that happened helper need to migrate.
@@ -741,6 +756,9 @@ void cpu::load_balance()
if (min->load() >= (load() - 1)) {
continue;
}
#if CONF_lazy_stack_invariant
assert(!thread::current()->is_app());
#endif
WITH_LOCK(irq_lock) {
auto i = std::find_if(runqueue.rbegin(), runqueue.rend(),
[](thread& t) { return t._migration_lock_counter == 0; });
@@ -1175,7 +1193,7 @@ void thread::destroy()
ds->st.store(status::terminated);
} else {
// The joiner won the race, and will wait. We need to wake it.
joiner->wake_with([&] { ds->st.store(status::terminated); });
joiner->wake_with_irq_or_preemption_disabled([&] { ds->st.store(status::terminated); });
}
}
}
@@ -1200,6 +1218,9 @@ void thread::wake_impl(detached_state* st, unsigned allowed_initial_states_mask)
unsigned c = cpu::current()->id;
// we can now use st->t here, since the thread cannot terminate while
// it's waking, but not afterwards, when it may be running
#if CONF_lazy_stack_invariant
assert(!sched::preemptable());
#endif
irq_save_lock_type irq_lock;
WITH_LOCK(irq_lock) {
tcpu->incoming_wakeups[c].push_back(*st->t);
@@ -1223,6 +1244,16 @@ void thread::wake()
}
}

void thread::wake_with_irq_disabled()
{
#if CONF_lazy_stack_invariant
assert(!arch::irq_enabled());
#endif
WITH_LOCK(rcu_read_lock) {
wake_impl(_detached_state.get());
}
}

void thread::wake_lock(mutex* mtx, wait_record* wr)
{
// must be called with mtx held
@@ -1395,7 +1426,7 @@ void thread::set_cleanup(std::function<void ()> cleanup)

void thread::timer_fired()
{
wake();
wake_with_irq_disabled();
}

unsigned int thread::id() const
@@ -1439,6 +1470,19 @@ void thread_handle::wake()
}
}

void thread_handle::wake_from_kernel_or_with_irq_disabled()
{
#if CONF_lazy_stack_invariant
assert(!sched::thread::current()->is_app() || !arch::irq_enabled());
#endif
WITH_LOCK(rcu_read_lock) {
thread::detached_state* ds = _t.read();
if (ds) {
thread::wake_impl(ds);
}
}
}

timer_list::callback_dispatch::callback_dispatch()
{
clock_event->set_callback(this);
@@ -1526,6 +1570,22 @@ void timer_base::expire()
_t.timer_fired();
}

void timer_base::set_with_irq_disabled(osv::clock::uptime::time_point time)
{
#if CONF_lazy_stack_invariant
assert(!arch::irq_enabled());
#endif
trace_timer_set(this, time.time_since_epoch().count());
_state = state::armed;
_time = time;

auto& timers = cpu::current()->timers;
_t._active_timers.push_back(*this);
if (timers._list.insert(*this)) {
timers.rearm();
}
};

void timer_base::set(osv::clock::uptime::time_point time)
{
trace_timer_set(this, time.time_since_epoch().count());
@@ -1566,6 +1626,9 @@ void timer_base::reset(osv::clock::uptime::time_point time)

auto& timers = cpu::current()->timers;

#if CONF_lazy_stack_invariant
assert(!thread::current()->is_app() || !sched::preemptable());
#endif
irq_save_lock_type irq_lock;
WITH_LOCK(irq_lock) {
if (_state == state::armed) {
2 changes: 1 addition & 1 deletion drivers/acpi.cc
@@ -234,7 +234,7 @@ class acpi_interrupt {
, _stopped(false)
, _counter(0)
, _thread(sched::thread::make([this] { process_interrupts(); }))
, _intr(gsi, [this] { _counter.fetch_add(1); _thread->wake(); })
, _intr(gsi, [this] { _counter.fetch_add(1); _thread->wake_with_irq_disabled(); })
{
_thread->start();
}
2 changes: 1 addition & 1 deletion drivers/ahci.cc
@@ -276,7 +276,7 @@ void port::req_done()
_slot_free++;

// Wakeup the thread waiting for a free slot
_cmd_send_waiter.wake();
_cmd_send_waiter.wake_from_kernel_or_with_irq_disabled();
}
}
}
2 changes: 1 addition & 1 deletion drivers/ahci.hh
@@ -242,7 +242,7 @@ public:
void enable_irq();
void wait_device_ready();
void wait_ci_ready(u8 slot);
void wakeup() { _irq_thread->wake(); }
void wakeup() { _irq_thread->wake_with_irq_disabled(); }
bool linkup() { return _linkup; }

u32 port2hba(u32 port_reg)
2 changes: 1 addition & 1 deletion drivers/cadence-uart.cc
@@ -109,7 +109,7 @@ void Cadence_Console::irq_handler()
// IRQ must be cleared after character is read from FIFO
uart->cisr = cisr;

_thread->wake();
_thread->wake_with_irq_disabled();
}

void Cadence_Console::dev_start() {