5042 stop using deprecated atomic functions


  34 #include <sys/cpuvar.h>
  35 #include <sys/x_call.h>
  36 #include <sys/xc_levels.h>
  37 #include <sys/cpu.h>
  38 #include <sys/psw.h>
  39 #include <sys/sunddi.h>
  40 #include <sys/debug.h>
  41 #include <sys/systm.h>
  42 #include <sys/archsystm.h>
  43 #include <sys/machsystm.h>
  44 #include <sys/mutex_impl.h>
  45 #include <sys/stack.h>
  46 #include <sys/promif.h>
  47 #include <sys/x86_archext.h>
  48 
  49 /*
  50  * Implementation for cross-processor calls via interprocessor interrupts
  51  *
  52  * This implementation uses a message passing architecture to allow multiple
  53  * concurrent cross calls to be in flight at any given time. We use the cmpxchg
  54  * instruction, aka casptr(), to implement simple efficient work queues for
  55  * message passing between CPUs with almost no need for regular locking.
  56  * See xc_extract() and xc_insert() below.
  57  *
  58  * The general idea is that initiating a cross call means putting a message
  59  * on a target(s) CPU's work queue. Any synchronization is handled by passing
  60  * the message back and forth between initiator and target(s).
  61  *
  62  * Every CPU has xc_work_cnt, which indicates it has messages to process.
  63  * This value is incremented as message traffic is initiated and decremented
  64  * with every message that finishes all processing.
  65  *
  66  * The code needs no mfence or other membar_*() calls. The uses of
  67  * casptr(), cas32() and atomic_dec_32() for the message passing are
  68  * implemented with LOCK prefix instructions which are equivalent to mfence.
  69  *
  70  * One interesting aspect of this implementation is that it allows 2 or more
  71  * CPUs to initiate cross calls to intersecting sets of CPUs at the same time.
  72  * The cross call processing by the CPUs will happen in any order with only
  73  * a guarantee, for xc_call() and xc_sync(), that an initiator won't return
  74  * from cross calls before all slaves have invoked the function.
  75  *
  76  * The reason for this asynchronous approach is to allow for fast global
  77  * TLB shootdowns. If all CPUs, say N, tried to do a global TLB invalidation,
  78  * each on a different Virtual Address, at the same time, the old code required
  79  * N squared IPIs. With this method, depending on timing, it could happen
  80  * with just N IPIs.
  81  */
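
For reference, a hedged summary (not part of the changeset) of how the deprecated wrappers named in the comment above map onto the stable atomic_ops(9F) interfaces used in the new version of this file:

#include <sys/atomic.h>

/*
 * Stable interfaces replacing the deprecated wrappers:
 *
 *	void *atomic_cas_ptr(volatile void *target, void *cmp, void *newval);
 *	uint32_t atomic_cas_32(volatile uint32_t *target, uint32_t cmp,
 *	    uint32_t newval);
 *	void atomic_dec_32(volatile uint32_t *target);
 *
 * Call-site mapping as seen in the diff:
 *
 *	casptr(queue, old_head, msg)	->  atomic_cas_ptr(queue, old_head, msg)
 *	cas32(p, old, old + 1)		->  atomic_cas_32(p, old, old + 1)
 *	atomic_dec_32(p)		->  unchanged
 */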
  82 
  83 /*
  84  * The default is to not enable collecting counts of IPI information, since
  85  * the updating of shared cachelines could cause excess bus traffic.
  86  */
  87 uint_t xc_collect_enable = 0;
  88 uint64_t xc_total_cnt = 0;      /* total #IPIs sent for cross calls */


 127 #define XC_BT_CLEAR(vector, b)  BT_ATOMIC_CLEAR((ulong_t *)(vector), (b))
 128 
 129 /*
 130  * Decrement a CPU's work count
 131  */
 132 static void
 133 xc_decrement(struct machcpu *mcpu)
 134 {
 135         atomic_dec_32(&mcpu->xc_work_cnt);
 136 }
 137 
 138 /*
 139  * Increment a CPU's work count and return the old value
 140  */
 141 static int
 142 xc_increment(struct machcpu *mcpu)
 143 {
 144         int old;
 145         do {
 146                 old = mcpu->xc_work_cnt;
 147         } while (cas32((uint32_t *)&mcpu->xc_work_cnt, old, old + 1) != old);
 148         return (old);
 149 }
 150 
 151 /*
 152  * Put a message into a queue. The insertion is atomic no matter
 153  * how many different inserts/extracts to the same queue happen.
 154  */
 155 static void
 156 xc_insert(void *queue, xc_msg_t *msg)
 157 {
 158         xc_msg_t *old_head;
 159 
 160         /*
 161          * FREE messages should only ever be getting inserted into
 162          * the xc_master CPU's xc_free queue.
 163          */
 164         ASSERT(msg->xc_command != XC_MSG_FREE ||
 165             cpu[msg->xc_master] == NULL || /* possible only during init */
 166             queue == &cpu[msg->xc_master]->cpu_m.xc_free);
 167 
 168         do {
 169                 old_head = (xc_msg_t *)*(volatile xc_msg_t **)queue;
 170                 msg->xc_next = old_head;
 171         } while (casptr(queue, old_head, msg) != old_head);
 172 }
 173 
 174 /*
 175  * Extract a message from a queue. The extraction is atomic only
 176  * when just one thread does extractions from the queue.
 177  * If the queue is empty, NULL is returned.
 178  */
 179 static xc_msg_t *
 180 xc_extract(xc_msg_t **queue)
 181 {
 182         xc_msg_t *old_head;
 183 
 184         do {
 185                 old_head = (xc_msg_t *)*(volatile xc_msg_t **)queue;
 186                 if (old_head == NULL)
 187                         return (old_head);
 188         } while (casptr(queue, old_head, old_head->xc_next) != old_head);
 189         old_head->xc_next = NULL;
 190         return (old_head);
 191 }
 192 
 193 /*
 194  * Initialize the machcpu fields used for cross calls
 195  */
 196 static uint_t xc_initialized = 0;
 197 
 198 void
 199 xc_init_cpu(struct cpu *cpup)
 200 {
 201         xc_msg_t *msg;
 202         int c;
 203 
 204         /*
 205          * Allocate message buffers for the new CPU.
 206          */
 207         for (c = 0; c < max_ncpus; ++c) {
 208                 if (plat_dr_support_cpu()) {


 591         xc_priority_data.xc_func = func;
 592         xc_priority_data.xc_a1 = arg1;
 593         xc_priority_data.xc_a2 = arg2;
 594         xc_priority_data.xc_a3 = arg3;
 595 
 596         /*
 597          * Post messages to all CPUs involved that are CPU_READY
 598          * We'll always IPI, plus bang on the xc_msgbox for i86_mwait()
 599          */
 600         for (c = 0; c < max_ncpus; ++c) {
 601                 if (!BT_TEST(set, c))
 602                         continue;
 603                 cpup = cpu[c];
 604                 if (cpup == NULL || !(cpup->cpu_flags & CPU_READY) ||
 605                     cpup == CPU)
 606                         continue;
 607                 (void) xc_increment(&cpup->cpu_m);
 608                 XC_BT_SET(xc_priority_set, c);
 609                 send_dirint(c, XC_HI_PIL);
 610                 for (i = 0; i < 10; ++i) {
 611                         (void) casptr(&cpup->cpu_m.xc_msgbox,
 612                             cpup->cpu_m.xc_msgbox, cpup->cpu_m.xc_msgbox);
 613                 }
 614         }
 615 }
 616 
 617 /*
 618  * Do cross call to all other CPUs with absolutely no waiting or handshaking.
 619  * This should only be used for extraordinary operations, like panic(), which
 620  * need to work, in some fashion, in a not completely functional system.
 621  * All other uses that want minimal waiting should use xc_call_nowait().
 622  */
 623 void
 624 xc_priority(
 625         xc_arg_t arg1,
 626         xc_arg_t arg2,
 627         xc_arg_t arg3,
 628         ulong_t *set,
 629         xc_func_t func)
 630 {
 631         extern int IGNORE_KERNEL_PREEMPTION;




  34 #include <sys/cpuvar.h>
  35 #include <sys/x_call.h>
  36 #include <sys/xc_levels.h>
  37 #include <sys/cpu.h>
  38 #include <sys/psw.h>
  39 #include <sys/sunddi.h>
  40 #include <sys/debug.h>
  41 #include <sys/systm.h>
  42 #include <sys/archsystm.h>
  43 #include <sys/machsystm.h>
  44 #include <sys/mutex_impl.h>
  45 #include <sys/stack.h>
  46 #include <sys/promif.h>
  47 #include <sys/x86_archext.h>
  48 
  49 /*
  50  * Implementation for cross-processor calls via interprocessor interrupts
  51  *
  52  * This implementation uses a message passing architecture to allow multiple
  53  * concurrent cross calls to be in flight at any given time. We use the cmpxchg
  54  * instruction, aka atomic_cas_ptr(), to implement simple efficient work
  55  * queues for message passing between CPUs with almost no need for regular
  56  * locking.  See xc_extract() and xc_insert() below.
  57  *
  58  * The general idea is that initiating a cross call means putting a message
  59  * on a target(s) CPU's work queue. Any synchronization is handled by passing
  60  * the message back and forth between initiator and target(s).
  61  *
  62  * Every CPU has xc_work_cnt, which indicates it has messages to process.
  63  * This value is incremented as message traffic is initiated and decremented
  64  * with every message that finishes all processing.
  65  *
  66  * The code needs no mfence or other membar_*() calls. The uses of
  67  * atomic_cas_ptr(), atomic_cas_32() and atomic_dec_32() for the message
  68  * passing are implemented with LOCK prefix instructions which are
  69  * equivalent to mfence.
  70  *
  71  * One interesting aspect of this implementation is that it allows 2 or more
  72  * CPUs to initiate cross calls to intersecting sets of CPUs at the same time.
  73  * The cross call processing by the CPUs will happen in any order with only
  74  * a guarantee, for xc_call() and xc_sync(), that an initiator won't return
  75  * from cross calls before all slaves have invoked the function.
  76  *
  77  * The reason for this asynchronous approach is to allow for fast global
  78  * TLB shootdowns. If all CPUs, say N, tried to do a global TLB invalidation,
  79  * each on a different Virtual Address, at the same time, the old code required
  80  * N squared IPIs. With this method, depending on timing, it could happen
  81  * with just N IPIs.
  82  */
  83 
  84 /*
  85  * The default is to not enable collecting counts of IPI information, since
  86  * the updating of shared cachelines could cause excess bus traffic.
  87  */
  88 uint_t xc_collect_enable = 0;
  89 uint64_t xc_total_cnt = 0;      /* total #IPIs sent for cross calls */


 128 #define XC_BT_CLEAR(vector, b)  BT_ATOMIC_CLEAR((ulong_t *)(vector), (b))
 129 
 130 /*
 131  * Decrement a CPU's work count
 132  */
 133 static void
 134 xc_decrement(struct machcpu *mcpu)
 135 {
 136         atomic_dec_32(&mcpu->xc_work_cnt);
 137 }
 138 
 139 /*
 140  * Increment a CPU's work count and return the old value
 141  */
 142 static int
 143 xc_increment(struct machcpu *mcpu)
 144 {
 145         int old;
 146         do {
 147                 old = mcpu->xc_work_cnt;
 148         } while (atomic_cas_32(&mcpu->xc_work_cnt, old, old + 1) != old);
 149         return (old);
 150 }
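
As a hedged aside (hypothetical, not part of the changeset): because xc_increment() only needs the value before the increment, the same effect is available from the stable API without an explicit retry loop, since the *_nv variants of atomic_ops(9F) return the updated value:

#include <sys/atomic.h>

/*
 * Hypothetical alternative to xc_increment(): atomic_inc_32_nv() returns
 * the post-increment value, so subtracting one yields the old count.
 */
static int
xc_increment_alt(struct machcpu *mcpu)
{
        return ((int)(atomic_inc_32_nv(&mcpu->xc_work_cnt) - 1));
}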
 151 
 152 /*
 153  * Put a message into a queue. The insertion is atomic no matter
 154  * how many different inserts/extracts to the same queue happen.
 155  */
 156 static void
 157 xc_insert(void *queue, xc_msg_t *msg)
 158 {
 159         xc_msg_t *old_head;
 160 
 161         /*
 162          * FREE messages should only ever be getting inserted into
 163          * the xc_master CPU's xc_free queue.
 164          */
 165         ASSERT(msg->xc_command != XC_MSG_FREE ||
 166             cpu[msg->xc_master] == NULL || /* possible only during init */
 167             queue == &cpu[msg->xc_master]->cpu_m.xc_free);
 168 
 169         do {
 170                 old_head = (xc_msg_t *)*(volatile xc_msg_t **)queue;
 171                 msg->xc_next = old_head;
 172         } while (atomic_cas_ptr(queue, old_head, msg) != old_head);
 173 }
 174 
 175 /*
 176  * Extract a message from a queue. The extraction is atomic only
 177  * when just one thread does extractions from the queue.
 178  * If the queue is empty, NULL is returned.
 179  */
 180 static xc_msg_t *
 181 xc_extract(xc_msg_t **queue)
 182 {
 183         xc_msg_t *old_head;
 184 
 185         do {
 186                 old_head = (xc_msg_t *)*(volatile xc_msg_t **)queue;
 187                 if (old_head == NULL)
 188                         return (old_head);
 189         } while (atomic_cas_ptr(queue, old_head, old_head->xc_next) !=
 190             old_head);
 191         old_head->xc_next = NULL;
 192         return (old_head);
 193 }
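
The single-consumer restriction on xc_extract() is what makes a plain compare-and-swap safe here: with two concurrent extractors, the classic ABA interleaving (thread 1 reads head A and A->xc_next == B; thread 2 pops A and B and later re-inserts A; thread 1's CAS swapping A for B still succeeds and links in a node it no longer owns) could corrupt the list. With one extractor per queue, a head it has observed cannot be removed and re-inserted underneath it. Below is a minimal user-space sketch of the same pattern, built on the atomic_ops(3C) counterpart of the kernel routine; the stack_* names and node_t type are hypothetical:

#include <atomic.h>     /* userland home of atomic_cas_ptr(), see atomic_ops(3C) */
#include <stddef.h>

typedef struct node {
        struct node *n_next;
} node_t;

/* Any number of threads may push concurrently. */
static void
stack_push(node_t **queue, node_t *n)
{
        node_t *old_head;

        do {
                old_head = *(node_t *volatile *)queue;
                n->n_next = old_head;
        } while (atomic_cas_ptr(queue, old_head, n) != old_head);
}

/* Only one thread at a time may pop, mirroring xc_extract(). */
static node_t *
stack_pop(node_t **queue)
{
        node_t *old_head;

        do {
                old_head = *(node_t *volatile *)queue;
                if (old_head == NULL)
                        return (NULL);
        } while (atomic_cas_ptr(queue, old_head, old_head->n_next) != old_head);
        old_head->n_next = NULL;
        return (old_head);
}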
 194 
 195 /*
 196  * Initialize the machcpu fields used for cross calls
 197  */
 198 static uint_t xc_initialized = 0;
 199 
 200 void
 201 xc_init_cpu(struct cpu *cpup)
 202 {
 203         xc_msg_t *msg;
 204         int c;
 205 
 206         /*
 207          * Allocate message buffers for the new CPU.
 208          */
 209         for (c = 0; c < max_ncpus; ++c) {
 210                 if (plat_dr_support_cpu()) {


 593         xc_priority_data.xc_func = func;
 594         xc_priority_data.xc_a1 = arg1;
 595         xc_priority_data.xc_a2 = arg2;
 596         xc_priority_data.xc_a3 = arg3;
 597 
 598         /*
 599          * Post messages to all CPUs involved that are CPU_READY
 600          * We'll always IPI, plus bang on the xc_msgbox for i86_mwait()
 601          */
 602         for (c = 0; c < max_ncpus; ++c) {
 603                 if (!BT_TEST(set, c))
 604                         continue;
 605                 cpup = cpu[c];
 606                 if (cpup == NULL || !(cpup->cpu_flags & CPU_READY) ||
 607                     cpup == CPU)
 608                         continue;
 609                 (void) xc_increment(&cpup->cpu_m);
 610                 XC_BT_SET(xc_priority_set, c);
 611                 send_dirint(c, XC_HI_PIL);
 612                 for (i = 0; i < 10; ++i) {
 613                         (void) atomic_cas_ptr(&cpup->cpu_m.xc_msgbox,
 614                             cpup->cpu_m.xc_msgbox, cpup->cpu_m.xc_msgbox);
 615                 }
 616         }
 617 }
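
A note on the ten value-preserving atomic_cas_ptr() calls in the loop above ("bang on the xc_msgbox for i86_mwait()"): even when the comparison succeeds and the value does not change, the LOCK-prefixed cmpxchg still performs a store to the target cache line, which is what a CPU parked in MONITOR/MWAIT on its xc_msgbox is waiting to observe. Written out as a hypothetical helper (illustrative only, not in the changeset):

/*
 * Poke a remote CPU's message box without changing its contents.  The
 * value-preserving compare-and-swap is still a locked write to the
 * monitored cache line, waking an MWAIT-parked CPU.
 */
static void
xc_poke_msgbox(struct machcpu *mcpu)
{
        xc_msg_t *cur = mcpu->xc_msgbox;

        (void) atomic_cas_ptr(&mcpu->xc_msgbox, cur, cur);
}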
 618 
 619 /*
 620  * Do cross call to all other CPUs with absolutely no waiting or handshaking.
 621  * This should only be used for extraordinary operations, like panic(), which
 622  * need to work, in some fashion, in a not completely functional system.
 623  * All other uses that want minimal waiting should use xc_call_nowait().
 624  */
 625 void
 626 xc_priority(
 627         xc_arg_t arg1,
 628         xc_arg_t arg2,
 629         xc_arg_t arg3,
 630         ulong_t *set,
 631         xc_func_t func)
 632 {
 633         extern int IGNORE_KERNEL_PREEMPTION;
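
For context, a hedged caller's-eye sketch of xc_priority() as described in the block comment above. The stop_handler and stop_other_cpus_sketch names are hypothetical; the CPU set is built with the same BT_* bitmap macros this file already uses, and CPUs that are not CPU_READY, as well as the calling CPU, are skipped by the posting loop shown earlier:

#include <sys/bitmap.h>
#include <sys/cpuvar.h>
#include <sys/systm.h>
#include <sys/x_call.h>

/* Hypothetical handler run on each target CPU at XC_HI_PIL. */
/*ARGSUSED*/
static int
stop_handler(xc_arg_t a1, xc_arg_t a2, xc_arg_t a3)
{
        /* ... spin or otherwise park this CPU ... */
        return (0);
}

/* Ask every other ready CPU to run stop_handler(), with no handshaking. */
static void
stop_other_cpus_sketch(void)
{
        ulong_t set[BT_BITOUL(NCPU)];
        int c;

        bzero(set, sizeof (set));
        for (c = 0; c < max_ncpus; ++c)
                BT_SET(set, c);

        xc_priority(0, 0, 0, set, stop_handler);
}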