arch/x86/xen/spinlock.c

   1 /*
   2  * Split spinlock implementation out into its own file, so it can be
   3  * compiled in a FTRACE-compatible way.
   4  */
   5 #include <linux/kernel_stat.h>
   6 #include <linux/spinlock.h>
   7 #include <linux/debugfs.h>
   8 #include <linux/log2.h>
   9 #include <linux/gfp.h>
  10 #include <linux/slab.h>
  11
  12 #include <asm/paravirt.h>
  13
  14 #include <xen/interface/xen.h>
  15 #include <xen/events.h>
  16
  17 #include "xen-ops.h"
  18 #include "debugfs.h"
  19
  20 static DEFINE_PER_CPU(int, lock_kicker_irq) = -1;
  21 static DEFINE_PER_CPU(char *, irq_name);
  22 static bool xen_pvspin = true;
  23
  24 #ifdef CONFIG_QUEUED_SPINLOCKS
  25
  26 #include <asm/qspinlock.h>
  27
  28 static void xen_qlock_kick(int cpu)
  29 {
  30         xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR);
  31 }
  32
  33 /*
  34  * Halt the current CPU & release it back to the host
  35  */
  36 static void xen_qlock_wait(u8 *byte, u8 val)
  37 {
  38         int irq = __this_cpu_read(lock_kicker_irq);
  39
  40         /* If kicker interrupts not initialized yet, just spin */
  41         if (irq == -1)
  42                 return;
  43
  44         /* clear pending */
  45         xen_clear_irq_pending(irq);
  46         barrier();
  47
  48         /*
  49          * We check the byte value after clearing pending IRQ to make sure
  50          * that we won't miss a wakeup event because of the clearing.
  51          *
  52          * The sync_clear_bit() call in xen_clear_irq_pending() is atomic.
  53          * So it is effectively a memory barrier for x86.
  54          */
  55         if (READ_ONCE(*byte) != val)
  56                 return;
  57
  58         /*
  59          * If an interrupt happens here, it will leave the wakeup irq
  60          * pending, which will cause xen_poll_irq() to return
  61          * immediately.
  62          */
  63
  64         /* Block until irq becomes pending (or perhaps a spurious wakeup) */
  65         xen_poll_irq(irq);
  66 }
  67
  68 #else /* CONFIG_QUEUED_SPINLOCKS */
  69
  70 enum xen_contention_stat {
  71         TAKEN_SLOW,
  72         TAKEN_SLOW_PICKUP,
  73         TAKEN_SLOW_SPURIOUS,
  74         RELEASED_SLOW,
  75         RELEASED_SLOW_KICKED,
  76         NR_CONTENTION_STATS
  77 };
  78
  79
  80 #ifdef CONFIG_XEN_DEBUG_FS
  81 #define HISTO_BUCKETS   30
  82 static struct xen_spinlock_stats
  83 {
  84         u32 contention_stats[NR_CONTENTION_STATS];
  85         u32 histo_spin_blocked[HISTO_BUCKETS+1];
  86         u64 time_blocked;
  87 } spinlock_stats;
  88
  89 static u8 zero_stats;
  90
  91 static inline void check_zero(void)
  92 {
  93         u8 ret;
  94         u8 old = READ_ONCE(zero_stats);
  95         if (unlikely(old)) {
  96                 ret = cmpxchg(&zero_stats, old, 0);
  97                 /* This ensures only one fellow resets the stat */
  98                 if (ret == old)
  99                         memset(&spinlock_stats, 0, sizeof(spinlock_stats));
 100         }
 101 }
 102
 103 static inline void add_stats(enum xen_contention_stat var, u32 val)
 104 {
 105         check_zero();
 106         spinlock_stats.contention_stats[var] += val;
 107 }
 108
 109 static inline u64 spin_time_start(void)
 110 {
 111         return xen_clocksource_read();
 112 }
 113
 114 static void __spin_time_accum(u64 delta, u32 *array)
 115 {
 116         unsigned index = ilog2(delta);
 117
 118         check_zero();
 119
 120         if (index < HISTO_BUCKETS)
 121                 array[index]++;
 122         else
 123                 array[HISTO_BUCKETS]++;
 124 }
 125
 126 static inline void spin_time_accum_blocked(u64 start)
 127 {
 128         u32 delta = xen_clocksource_read() - start;
 129
 130         __spin_time_accum(delta, spinlock_stats.histo_spin_blocked);
 131         spinlock_stats.time_blocked += delta;
 132 }
 133 #else  /* !CONFIG_XEN_DEBUG_FS */
 134 static inline void add_stats(enum xen_contention_stat var, u32 val)
 135 {
 136 }
 137
 138 static inline u64 spin_time_start(void)
 139 {
 140         return 0;
 141 }
 142
 143 static inline void spin_time_accum_blocked(u64 start)
 144 {
 145 }
 146 #endif  /* CONFIG_XEN_DEBUG_FS */
 147
 148 struct xen_lock_waiting {
 149         struct arch_spinlock *lock;
 150         __ticket_t want;
 151 };
 152
 153 static DEFINE_PER_CPU(struct xen_lock_waiting, lock_waiting);
 154 static cpumask_t waiting_cpus;
 155
 156 __visible void xen_lock_spinning(struct arch_spinlock *lock, __ticket_t want)
 157 {
 158         int irq = __this_cpu_read(lock_kicker_irq);
 159         struct xen_lock_waiting *w = this_cpu_ptr(&lock_waiting);
 160         int cpu = smp_processor_id();
 161         u64 start;
 162         __ticket_t head;
 163         unsigned long flags;
 164
 165         /* If kicker interrupts not initialized yet, just spin */
 166         if (irq == -1)
 167                 return;
 168
 169         start = spin_time_start();
 170
 171         /*
 172          * Make sure an interrupt handler can't upset things in a
 173          * partially setup state.
 174          */
 175         local_irq_save(flags);
 176         /*
 177          * We don't really care if we're overwriting some other
 178          * (lock,want) pair, as that would mean that we're currently
 179          * in an interrupt context, and the outer context had
 180          * interrupts enabled.  That has already kicked the VCPU out
 181          * of xen_poll_irq(), so it will just return spuriously and
 182          * retry with newly setup (lock,want).
 183          *
 184          * The ordering protocol on this is that the "lock" pointer
 185          * may only be set non-NULL if the "want" ticket is correct.
 186          * If we're updating "want", we must first clear "lock".
 187          */
 188         w->lock = NULL;
 189         smp_wmb();
 190         w->want = want;
 191         smp_wmb();
 192         w->lock = lock;
 193
 194         /* This uses set_bit, which atomic and therefore a barrier */
 195         cpumask_set_cpu(cpu, &waiting_cpus);
 196         add_stats(TAKEN_SLOW, 1);
 197
 198         /* clear pending */
 199         xen_clear_irq_pending(irq);
 200
 201         /* Only check lock once pending cleared */
 202         barrier();
 203
 204         /*
 205          * Mark entry to slowpath before doing the pickup test to make
 206          * sure we don't deadlock with an unlocker.
 207          */
 208         __ticket_enter_slowpath(lock);
 209
 210         /* make sure enter_slowpath, which is atomic does not cross the read */
 211         smp_mb__after_atomic();
 212
 213         /*
 214          * check again make sure it didn't become free while
 215          * we weren't looking
 216          */
 217         head = READ_ONCE(lock->tickets.head);
 218         if (__tickets_equal(head, want)) {
 219                 add_stats(TAKEN_SLOW_PICKUP, 1);
 220                 goto out;
 221         }
 222
 223         /* Allow interrupts while blocked */
 224         local_irq_restore(flags);
 225
 226         /*
 227          * If an interrupt happens here, it will leave the wakeup irq
 228          * pending, which will cause xen_poll_irq() to return
 229          * immediately.
 230          */
 231
 232         /* Block until irq becomes pending (or perhaps a spurious wakeup) */
 233         xen_poll_irq(irq);
 234         add_stats(TAKEN_SLOW_SPURIOUS, !xen_test_irq_pending(irq));
 235
 236         local_irq_save(flags);
 237
 238         kstat_incr_irq_this_cpu(irq);
 239 out:
 240         cpumask_clear_cpu(cpu, &waiting_cpus);
 241         w->lock = NULL;
 242
 243         local_irq_restore(flags);
 244
 245         spin_time_accum_blocked(start);
 246 }
 247 PV_CALLEE_SAVE_REGS_THUNK(xen_lock_spinning);
 248
 249 static void xen_unlock_kick(struct arch_spinlock *lock, __ticket_t next)
 250 {
 251         int cpu;
 252
 253         add_stats(RELEASED_SLOW, 1);
 254
 255         for_each_cpu(cpu, &waiting_cpus) {
 256                 const struct xen_lock_waiting *w = &per_cpu(lock_waiting, cpu);
 257
 258                 /* Make sure we read lock before want */
 259                 if (READ_ONCE(w->lock) == lock &&
 260                     READ_ONCE(w->want) == next) {
 261                         add_stats(RELEASED_SLOW_KICKED, 1);
 262                         xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR);
 263                         break;
 264                 }
 265         }
 266 }
 267 #endif /* CONFIG_QUEUED_SPINLOCKS */
 268
 269 static irqreturn_t dummy_handler(int irq, void *dev_id)
 270 {
 271         BUG();
 272         return IRQ_HANDLED;
 273 }
 274
 275 void xen_init_lock_cpu(int cpu)
 276 {
 277         int irq;
 278         char *name;
 279
 280         if (!xen_pvspin)
 281                 return;
 282
 283         WARN(per_cpu(lock_kicker_irq, cpu) >= 0, "spinlock on CPU%d exists on IRQ%d!\n",
 284              cpu, per_cpu(lock_kicker_irq, cpu));
 285
 286         name = kasprintf(GFP_KERNEL, "spinlock%d", cpu);
 287         irq = bind_ipi_to_irqhandler(XEN_SPIN_UNLOCK_VECTOR,
 288                                      cpu,
 289                                      dummy_handler,
 290                                      IRQF_PERCPU|IRQF_NOBALANCING,
 291                                      name,
 292                                      NULL);
 293
 294         if (irq >= 0) {
 295                 disable_irq(irq); /* make sure it's never delivered */
 296                 per_cpu(lock_kicker_irq, cpu) = irq;
 297                 per_cpu(irq_name, cpu) = name;
 298         }
 299
 300         printk("cpu %d spinlock event irq %d\n", cpu, irq);
 301 }
 302
 303 void xen_uninit_lock_cpu(int cpu)
 304 {
 305         if (!xen_pvspin)
 306                 return;
 307
 308         unbind_from_irqhandler(per_cpu(lock_kicker_irq, cpu), NULL);
 309         per_cpu(lock_kicker_irq, cpu) = -1;
 310         kfree(per_cpu(irq_name, cpu));
 311         per_cpu(irq_name, cpu) = NULL;
 312 }
 313
 314
 315 /*
 316  * Our init of PV spinlocks is split in two init functions due to us
 317  * using paravirt patching and jump labels patching and having to do
 318  * all of this before SMP code is invoked.
 319  *
 320  * The paravirt patching needs to be done _before_ the alternative asm code
 321  * is started, otherwise we would not patch the core kernel code.
 322  */
 323 void __init xen_init_spinlocks(void)
 324 {
 325
 326         if (!xen_pvspin) {
 327                 printk(KERN_DEBUG "xen: PV spinlocks disabled\n");
 328                 return;
 329         }
 330         printk(KERN_DEBUG "xen: PV spinlocks enabled\n");
 331 #ifdef CONFIG_QUEUED_SPINLOCKS
 332         __pv_init_lock_hash();
 333         pv_lock_ops.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath;
 334         pv_lock_ops.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock);
 335         pv_lock_ops.wait = xen_qlock_wait;
 336         pv_lock_ops.kick = xen_qlock_kick;
 337 #else
 338         pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(xen_lock_spinning);
 339         pv_lock_ops.unlock_kick = xen_unlock_kick;
 340 #endif
 341 }
 342
 343 /*
 344  * While the jump_label init code needs to happend _after_ the jump labels are
 345  * enabled and before SMP is started. Hence we use pre-SMP initcall level
 346  * init. We cannot do it in xen_init_spinlocks as that is done before
 347  * jump labels are activated.
 348  */
 349 static __init int xen_init_spinlocks_jump(void)
 350 {
 351         if (!xen_pvspin)
 352                 return 0;
 353
 354         if (!xen_domain())
 355                 return 0;
 356
 357         static_key_slow_inc(&paravirt_ticketlocks_enabled);
 358         return 0;
 359 }
 360 early_initcall(xen_init_spinlocks_jump);
 361
 362 static __init int xen_parse_nopvspin(char *arg)
 363 {
 364         xen_pvspin = false;
 365         return 0;
 366 }
 367 early_param("xen_nopvspin", xen_parse_nopvspin);
 368
 369 #if defined(CONFIG_XEN_DEBUG_FS) && !defined(CONFIG_QUEUED_SPINLOCKS)
 370
 371 static struct dentry *d_spin_debug;
 372
 373 static int __init xen_spinlock_debugfs(void)
 374 {
 375         struct dentry *d_xen = xen_init_debugfs();
 376
 377         if (d_xen == NULL)
 378                 return -ENOMEM;
 379
 380         if (!xen_pvspin)
 381                 return 0;
 382
 383         d_spin_debug = debugfs_create_dir("spinlocks", d_xen);
 384
 385         debugfs_create_u8("zero_stats", 0644, d_spin_debug, &zero_stats);
 386
 387         debugfs_create_u32("taken_slow", 0444, d_spin_debug,
 388                            &spinlock_stats.contention_stats[TAKEN_SLOW]);
 389         debugfs_create_u32("taken_slow_pickup", 0444, d_spin_debug,
 390                            &spinlock_stats.contention_stats[TAKEN_SLOW_PICKUP]);
 391         debugfs_create_u32("taken_slow_spurious", 0444, d_spin_debug,
 392                            &spinlock_stats.contention_stats[TAKEN_SLOW_SPURIOUS]);
 393
 394         debugfs_create_u32("released_slow", 0444, d_spin_debug,
 395                            &spinlock_stats.contention_stats[RELEASED_SLOW]);
 396         debugfs_create_u32("released_slow_kicked", 0444, d_spin_debug,
 397                            &spinlock_stats.contention_stats[RELEASED_SLOW_KICKED]);
 398
 399         debugfs_create_u64("time_blocked", 0444, d_spin_debug,
 400                            &spinlock_stats.time_blocked);
 401
 402         debugfs_create_u32_array("histo_blocked", 0444, d_spin_debug,
 403                                 spinlock_stats.histo_spin_blocked, HISTO_BUCKETS + 1);
 404
 405         return 0;
 406 }
 407 fs_initcall(xen_spinlock_debugfs);
 408
 409 #endif  /* CONFIG_XEN_DEBUG_FS */