XXXX pass in cpu_pause_func via pause_cpus
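The hunks in this file replace pause_cpus(NULL) with pause_cpus(NULL, NULL) at the three places where lgrp.c quiesces CPUs around lgroup topology updates. The change tracks a pause_cpus() interface that takes the pause handler as an explicit argument rather than publishing it through the global cpu_pause_func hook. The sketch below only illustrates the assumed shape of the old and new call sites; the actual prototypes are defined in the companion header/cpu code changes, which are not part of this file's diff.

	/*
	 * Illustration only -- assumed interface change, not taken from this
	 * webrev:
	 *
	 *   old:  void pause_cpus(cpu_t *off_cp);
	 *   new:  void pause_cpus(cpu_t *off_cp, void *(*func)(void *));
	 *
	 * Callers with no per-pause handler, such as the lgrp.c sites below,
	 * simply pass NULL for the new argument.
	 */
	if (need_synch)
		pause_cpus(NULL, NULL);		/* no CPU to skip, no handler */
	count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1, &changed);
	if (need_synch)
		start_cpus();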
--- old/usr/src/uts/common/os/lgrp.c
+++ new/usr/src/uts/common/os/lgrp.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 23 * Use is subject to license terms.
24 24 */
25 25
26 26 /*
27 27 * Basic NUMA support in terms of locality groups
28 28 *
29 29 * Solaris needs to know which CPUs, memory, etc. are near each other to
30 30 * provide good performance on NUMA machines by optimizing for locality.
31 31 * In order to do this, a new abstraction called a "locality group (lgroup)"
32 32 * has been introduced to keep track of which CPU-like and memory-like hardware
33 33 * resources are close to each other. Currently, latency is the only measure
34 34 * used to determine how to group hardware resources into lgroups, but this
35 35 * does not limit the groupings to be based solely on latency. Other factors
36 36 * may be used to determine the groupings in the future.
37 37 *
38 38 * Lgroups are organized into a hierarchy or topology that represents the
39 39 * latency topology of the machine. There is always at least a root lgroup in
40 40 * the system. It represents all the hardware resources in the machine at a
41 41 * latency big enough that any hardware resource can at least access any other
42 42 * hardware resource within that latency. A Uniform Memory Access (UMA)
43 43 * machine is represented with one lgroup (the root). In contrast, a NUMA
44 44 * machine is represented at least by the root lgroup and some number of leaf
45 45 * lgroups where the leaf lgroups contain the hardware resources within the
46 46 * least latency of each other and the root lgroup still contains all the
47 47 * resources in the machine. Some number of intermediate lgroups may exist
48 48 * which represent more levels of locality than just the local latency of the
49 49 * leaf lgroups and the system latency of the root lgroup. Non-leaf lgroups
50 50 * (eg. root and intermediate lgroups) contain the next nearest resources to
51 51 * their child lgroups. Thus, the lgroup hierarchy from a given leaf lgroup
52 52 * to the root lgroup shows the hardware resources from closest to farthest
53 53 * from the leaf lgroup such that each successive ancestor lgroup contains
54 54 * the next nearest resources at the next level of locality from the previous.
55 55 *
56 56 * The kernel uses the lgroup abstraction to know how to allocate resources
57 57 * near a given process/thread. At fork() and lwp/thread_create() time, a
58 58 * "home" lgroup is chosen for a thread. This is done by picking the lgroup
59 59 * with the lowest load average. Binding to a processor or processor set will
60 60 * change the home lgroup for a thread. The scheduler has been modified to try
61 61 * to dispatch a thread on a CPU in its home lgroup. Physical memory
62 62 * allocation is lgroup aware too, so memory will be allocated from the current
63 63 * thread's home lgroup if possible. If the desired resources are not
64 64 * available, the kernel traverses the lgroup hierarchy going to the parent
65 65 * lgroup to find resources at the next level of locality until it reaches the
66 66 * root lgroup.
67 67 */
68 68
69 69 #include <sys/lgrp.h>
70 70 #include <sys/lgrp_user.h>
71 71 #include <sys/types.h>
72 72 #include <sys/mman.h>
73 73 #include <sys/param.h>
74 74 #include <sys/var.h>
75 75 #include <sys/thread.h>
76 76 #include <sys/cpuvar.h>
77 77 #include <sys/cpupart.h>
78 78 #include <sys/kmem.h>
79 79 #include <vm/seg.h>
80 80 #include <vm/seg_kmem.h>
81 81 #include <vm/seg_spt.h>
82 82 #include <vm/seg_vn.h>
83 83 #include <vm/as.h>
84 84 #include <sys/atomic.h>
85 85 #include <sys/systm.h>
86 86 #include <sys/errno.h>
87 87 #include <sys/cmn_err.h>
88 88 #include <sys/kstat.h>
89 89 #include <sys/sysmacros.h>
90 90 #include <sys/pg.h>
91 91 #include <sys/promif.h>
92 92 #include <sys/sdt.h>
93 93
94 94 lgrp_gen_t lgrp_gen = 0; /* generation of lgroup hierarchy */
95 95 lgrp_t *lgrp_table[NLGRPS_MAX]; /* table of all initialized lgrp_t structs */
96 96 /* indexed by lgrp_id */
97 97 int nlgrps; /* number of lgroups in machine */
98 98 int lgrp_alloc_hint = -1; /* hint for where to try to allocate next */
99 99 int lgrp_alloc_max = 0; /* max lgroup ID allocated so far */
100 100
101 101 /*
102 102 * Kstat data for lgroups.
103 103 *
104 104 * Actual kstat data is collected in lgrp_stats array.
105 105 * The lgrp_kstat_data array of named kstats is used to extract data from
106 106 * lgrp_stats and present it to the kstat framework. It is protected from parallel
107 107 * modifications by lgrp_kstat_mutex. This may cause some contention when
108 108 * several kstat commands run in parallel but this is not the
109 109 * performance-critical path.
110 110 */
111 111 extern struct lgrp_stats lgrp_stats[]; /* table of per-lgrp stats */
112 112
113 113 /*
114 114 * Declare kstat names statically for enums as defined in the header file.
115 115 */
116 116 LGRP_KSTAT_NAMES;
117 117
118 118 static void lgrp_kstat_init(void);
119 119 static int lgrp_kstat_extract(kstat_t *, int);
120 120 static void lgrp_kstat_reset(lgrp_id_t);
121 121
122 122 static struct kstat_named lgrp_kstat_data[LGRP_NUM_STATS];
123 123 static kmutex_t lgrp_kstat_mutex;
124 124
125 125
126 126 /*
127 127 * max number of lgroups supported by the platform
128 128 */
129 129 int nlgrpsmax = 0;
130 130
131 131 /*
132 132 * The root lgroup. Represents the set of resources at the system wide
133 133 * level of locality.
134 134 */
135 135 lgrp_t *lgrp_root = NULL;
136 136
137 137 /*
138 138 * During system bootstrap cp_default does not contain the list of lgrp load
139 139 * averages (cp_lgrploads). The list is allocated after the first CPU is brought
140 140 * on-line when cp_default is initialized by cpupart_initialize_default().
141 141 * Configuring CPU0 may create a two-level topology with root and one leaf node
142 142 * containing CPU0. This topology is initially constructed in a special
143 143 * statically allocated 2-element lpl list lpl_bootstrap_list and later cloned
144 144 * to cp_default when cp_default is initialized. The lpl_bootstrap_list is used
145 145 * for all lpl operations until cp_default is fully constructed.
146 146 *
147 147 * The lpl_bootstrap_list is maintained by the code in lgrp.c. Every other
148 148 * consumer that needs a default lpl should use lpl_bootstrap, which is a pointer to
149 149 * the first element of lpl_bootstrap_list.
150 150 *
151 151 * CPUs that are added to the system, but have not yet been assigned to an
152 152 * lgrp will use lpl_bootstrap as a default lpl. This is necessary because
153 153 * on some architectures (x86) it's possible for the slave CPU startup thread
154 154 * to enter the dispatcher or allocate memory before calling lgrp_cpu_init().
155 155 */
156 156 #define LPL_BOOTSTRAP_SIZE 2
157 157 static lpl_t lpl_bootstrap_list[LPL_BOOTSTRAP_SIZE];
158 158 lpl_t *lpl_bootstrap;
159 159 static lpl_t *lpl_bootstrap_rset[LPL_BOOTSTRAP_SIZE];
160 160 static int lpl_bootstrap_id2rset[LPL_BOOTSTRAP_SIZE];
161 161
162 162 /*
163 163 * If cp still references the bootstrap lpl, it has not yet been added to
164 164 * an lgrp. lgrp_mem_choose() uses this macro to detect the case where
165 165 * a thread is trying to allocate memory close to a CPU that has no lgrp.
166 166 */
167 167 #define LGRP_CPU_HAS_NO_LGRP(cp) ((cp)->cpu_lpl == lpl_bootstrap)
168 168
169 169 static lgrp_t lroot;
170 170
171 171 /*
172 172 * Size, in bytes, beyond which random memory allocation policy is applied
173 173 * to non-shared memory. Default is the maximum size, so random memory
174 174 * allocation won't be used for non-shared memory by default.
175 175 */
176 176 size_t lgrp_privm_random_thresh = (size_t)(-1);
177 177
178 178 /* the maximum effect that a single thread can have on its lgroup's load */
179 179 #define LGRP_LOADAVG_MAX_EFFECT(ncpu) \
180 180 ((lgrp_loadavg_max_effect) / (ncpu))
181 181 uint32_t lgrp_loadavg_max_effect = LGRP_LOADAVG_THREAD_MAX;
182 182
183 183
184 184 /*
185 185 * Size, in bytes, beyond which random memory allocation policy is applied to
186 186 * shared memory. Default is 8MB (2 ISM pages).
187 187 */
188 188 size_t lgrp_shm_random_thresh = 8*1024*1024;
189 189
190 190 /*
191 191 * Whether to do processor set aware memory allocation by default
192 192 */
193 193 int lgrp_mem_pset_aware = 0;
194 194
195 195 /*
196 196 * Set the default memory allocation policy for root lgroup
197 197 */
198 198 lgrp_mem_policy_t lgrp_mem_policy_root = LGRP_MEM_POLICY_RANDOM;
199 199
200 200 /*
201 201 * Set the default memory allocation policy. For most platforms,
202 202 * next touch is sufficient, but some platforms may wish to override
203 203 * this.
204 204 */
205 205 lgrp_mem_policy_t lgrp_mem_default_policy = LGRP_MEM_POLICY_NEXT;
206 206
207 207
208 208 /*
209 209 * lgroup CPU event handlers
210 210 */
211 211 static void lgrp_cpu_init(struct cpu *);
212 212 static void lgrp_cpu_fini(struct cpu *, lgrp_id_t);
213 213 static lgrp_t *lgrp_cpu_to_lgrp(struct cpu *);
214 214
215 215 /*
216 216 * lgroup memory event handlers
217 217 */
218 218 static void lgrp_mem_init(int, lgrp_handle_t, boolean_t);
219 219 static void lgrp_mem_fini(int, lgrp_handle_t, boolean_t);
220 220 static void lgrp_mem_rename(int, lgrp_handle_t, lgrp_handle_t);
221 221
222 222 /*
223 223 * lgroup CPU partition event handlers
224 224 */
225 225 static void lgrp_part_add_cpu(struct cpu *, lgrp_id_t);
226 226 static void lgrp_part_del_cpu(struct cpu *);
227 227
228 228 /*
229 229 * lgroup framework initialization
230 230 */
231 231 static void lgrp_main_init(void);
232 232 static void lgrp_main_mp_init(void);
233 233 static void lgrp_root_init(void);
234 234 static void lgrp_setup(void);
235 235
236 236 /*
237 237 * lpl topology
238 238 */
239 239 static void lpl_init(lpl_t *, lpl_t *, lgrp_t *);
240 240 static void lpl_clear(lpl_t *);
241 241 static void lpl_leaf_insert(lpl_t *, struct cpupart *);
242 242 static void lpl_leaf_remove(lpl_t *, struct cpupart *);
243 243 static void lpl_rset_add(lpl_t *, lpl_t *);
244 244 static void lpl_rset_del(lpl_t *, lpl_t *);
245 245 static int lpl_rset_contains(lpl_t *, lpl_t *);
246 246 static void lpl_cpu_adjcnt(lpl_act_t, struct cpu *);
247 247 static void lpl_child_update(lpl_t *, struct cpupart *);
248 248 static int lpl_pick(lpl_t *, lpl_t *);
249 249 static void lpl_verify_wrapper(struct cpupart *);
250 250
251 251 /*
252 252 * defines for lpl topology verifier return codes
253 253 */
254 254
255 255 #define LPL_TOPO_CORRECT 0
256 256 #define LPL_TOPO_PART_HAS_NO_LPL -1
257 257 #define LPL_TOPO_CPUS_NOT_EMPTY -2
258 258 #define LPL_TOPO_LGRP_MISMATCH -3
259 259 #define LPL_TOPO_MISSING_PARENT -4
260 260 #define LPL_TOPO_PARENT_MISMATCH -5
261 261 #define LPL_TOPO_BAD_CPUCNT -6
262 262 #define LPL_TOPO_RSET_MISMATCH -7
263 263 #define LPL_TOPO_LPL_ORPHANED -8
264 264 #define LPL_TOPO_LPL_BAD_NCPU -9
265 265 #define LPL_TOPO_RSET_MSSNG_LF -10
266 266 #define LPL_TOPO_CPU_HAS_BAD_LPL -11
267 267 #define LPL_TOPO_NONLEAF_HAS_CPUS -12
268 268 #define LPL_TOPO_LGRP_NOT_LEAF -13
269 269 #define LPL_TOPO_BAD_RSETCNT -14
270 270
271 271 /*
272 272 * Return whether lgroup optimizations should be enabled on this system
273 273 */
274 274 int
275 275 lgrp_optimizations(void)
276 276 {
277 277 /*
278 278 * System must have more than 2 lgroups to enable lgroup optimizations
279 279 *
280 280 * XXX This assumes that a 2 lgroup system has an empty root lgroup
281 281 * with one child lgroup containing all the resources. A 2 lgroup
282 282 * system with a root lgroup directly containing CPUs or memory might
283 283 * need lgroup optimizations with its child lgroup, but there
284 284 * isn't such a machine for now....
285 285 */
286 286 if (nlgrps > 2)
287 287 return (1);
288 288
289 289 return (0);
290 290 }
291 291
292 292 /*
293 293 * Setup root lgroup
294 294 */
295 295 static void
296 296 lgrp_root_init(void)
297 297 {
298 298 lgrp_handle_t hand;
299 299 int i;
300 300 lgrp_id_t id;
301 301
302 302 /*
303 303 * Create the "root" lgroup
304 304 */
305 305 ASSERT(nlgrps == 0);
306 306 id = nlgrps++;
307 307
308 308 lgrp_root = &lroot;
309 309
310 310 lgrp_root->lgrp_cpu = NULL;
311 311 lgrp_root->lgrp_mnodes = 0;
312 312 lgrp_root->lgrp_nmnodes = 0;
313 313 hand = lgrp_plat_root_hand();
314 314 lgrp_root->lgrp_plathand = hand;
315 315
316 316 lgrp_root->lgrp_id = id;
317 317 lgrp_root->lgrp_cpucnt = 0;
318 318 lgrp_root->lgrp_childcnt = 0;
319 319 klgrpset_clear(lgrp_root->lgrp_children);
320 320 klgrpset_clear(lgrp_root->lgrp_leaves);
321 321 lgrp_root->lgrp_parent = NULL;
322 322 lgrp_root->lgrp_latency = lgrp_plat_latency(hand, hand);
323 323
324 324 for (i = 0; i < LGRP_RSRC_COUNT; i++)
325 325 klgrpset_clear(lgrp_root->lgrp_set[i]);
326 326
327 327 lgrp_root->lgrp_kstat = NULL;
328 328
329 329 lgrp_table[id] = lgrp_root;
330 330
331 331 /*
332 332 * Setup initial lpl list for CPU0 and initial t0 home.
333 333 * The only lpl space we have so far is lpl_bootstrap. It is used for
334 334 * all topology operations until cp_default is initialized at which
335 335 * point t0.t_lpl will be updated.
336 336 */
337 337 lpl_bootstrap = lpl_bootstrap_list;
338 338 t0.t_lpl = lpl_bootstrap;
339 339 cp_default.cp_nlgrploads = LPL_BOOTSTRAP_SIZE;
340 340 lpl_bootstrap_list[1].lpl_lgrpid = 1;
341 341
342 342 /*
343 343 * Set up the bootstrap rset
344 344 * Since the bootstrap topology has just the root and a leaf,
345 345 * the rset contains just the leaf, and both lpls can use the same rset
346 346 */
347 347 lpl_bootstrap_rset[0] = &lpl_bootstrap_list[1];
348 348 lpl_bootstrap_list[0].lpl_rset_sz = 1;
349 349 lpl_bootstrap_list[0].lpl_rset = lpl_bootstrap_rset;
350 350 lpl_bootstrap_list[0].lpl_id2rset = lpl_bootstrap_id2rset;
351 351
352 352 lpl_bootstrap_list[1].lpl_rset_sz = 1;
353 353 lpl_bootstrap_list[1].lpl_rset = lpl_bootstrap_rset;
354 354 lpl_bootstrap_list[1].lpl_id2rset = lpl_bootstrap_id2rset;
355 355
356 356 cp_default.cp_lgrploads = lpl_bootstrap;
357 357 }
358 358
359 359 /*
360 360 * Initialize the lgroup framework and allow the platform to do the same
361 361 *
362 362 * This happens in stages during boot and is all funnelled through this routine
363 363 * (see definition of lgrp_init_stages_t to see what happens at each stage and
364 364 * when)
365 365 */
366 366 void
367 367 lgrp_init(lgrp_init_stages_t stage)
368 368 {
369 369 /*
370 370 * Initialize the platform
371 371 */
372 372 lgrp_plat_init(stage);
373 373
374 374 switch (stage) {
375 375 case LGRP_INIT_STAGE1:
376 376 /*
377 377 * Set max number of lgroups supported on this platform which
378 378 * must be no greater than the max number of lgroups supported by the
379 379 * common lgroup framework (eg. NLGRPS_MAX is max elements in
380 380 * lgrp_table[], etc.)
381 381 */
382 382 nlgrpsmax = lgrp_plat_max_lgrps();
383 383 ASSERT(nlgrpsmax <= NLGRPS_MAX);
384 384 break;
385 385
386 386 case LGRP_INIT_STAGE2:
387 387 lgrp_setup();
388 388 break;
389 389
390 390 case LGRP_INIT_STAGE4:
391 391 lgrp_main_init();
392 392 break;
393 393
394 394 case LGRP_INIT_STAGE5:
395 395 lgrp_main_mp_init();
396 396 break;
397 397
398 398 default:
399 399 break;
400 400 }
401 401 }
402 402
403 403 /*
404 404 * Create the root and cpu0's lgroup, and set t0's home.
405 405 */
406 406 static void
407 407 lgrp_setup(void)
408 408 {
409 409 /*
410 410 * Setup the root lgroup
411 411 */
412 412 lgrp_root_init();
413 413
414 414 /*
415 415 * Add cpu0 to an lgroup
416 416 */
417 417 lgrp_config(LGRP_CONFIG_CPU_ADD, (uintptr_t)CPU, 0);
418 418 lgrp_config(LGRP_CONFIG_CPU_ONLINE, (uintptr_t)CPU, 0);
419 419 }
420 420
421 421 /*
422 422 * true when lgrp initialization has been completed.
423 423 */
424 424 int lgrp_initialized = 0;
425 425
426 426 /*
427 427 * True when lgrp topology is constructed.
428 428 */
429 429 int lgrp_topo_initialized = 0;
430 430
431 431 /*
432 432 * Init routine called after startup(), /etc/system has been processed,
433 433 * and cpu0 has been added to an lgroup.
434 434 */
435 435 static void
436 436 lgrp_main_init(void)
437 437 {
438 438 cpu_t *cp = CPU;
439 439 lgrp_id_t lgrpid;
440 440 int i;
441 441 extern void pg_cpu0_reinit();
442 442
443 443 /*
444 444 * Enforce a valid lgrp_mem_default_policy
445 445 */
446 446 if ((lgrp_mem_default_policy <= LGRP_MEM_POLICY_DEFAULT) ||
447 447 (lgrp_mem_default_policy >= LGRP_NUM_MEM_POLICIES) ||
448 448 (lgrp_mem_default_policy == LGRP_MEM_POLICY_NEXT_SEG))
449 449 lgrp_mem_default_policy = LGRP_MEM_POLICY_NEXT;
450 450
451 451 /*
452 452 * See if mpo should be disabled.
453 453 * This may happen in the case of null proc LPA on Starcat.
454 454 * The platform won't be able to detect null proc LPA until after
455 455 * cpu0 and memory have already been added to lgroups.
456 456 * When and if it is detected, the Starcat platform will return
457 457 * a different platform handle for cpu0 which is what we check for
458 458 * here. If mpo should be disabled, move cpu0 to its rightful place
459 459 * (the root), and destroy the remaining lgroups. This effectively
460 460 * provides a UMA lgroup topology.
461 461 */
462 462 lgrpid = cp->cpu_lpl->lpl_lgrpid;
463 463 if (lgrp_table[lgrpid]->lgrp_plathand !=
464 464 lgrp_plat_cpu_to_hand(cp->cpu_id)) {
465 465 lgrp_part_del_cpu(cp);
466 466 lgrp_cpu_fini(cp, lgrpid);
467 467
468 468 lgrp_cpu_init(cp);
469 469 lgrp_part_add_cpu(cp, cp->cpu_lpl->lpl_lgrpid);
470 470
471 471 ASSERT(cp->cpu_lpl->lpl_lgrpid == LGRP_ROOTID);
472 472
473 473 /*
474 474 * Notify the PG subsystem that the CPU's lgrp
475 475 * association has changed
476 476 */
477 477 pg_cpu0_reinit();
478 478
479 479 /*
480 480 * Destroy all lgroups except for root
481 481 */
482 482 for (i = 0; i <= lgrp_alloc_max; i++) {
483 483 if (LGRP_EXISTS(lgrp_table[i]) &&
484 484 lgrp_table[i] != lgrp_root)
485 485 lgrp_destroy(lgrp_table[i]);
486 486 }
487 487
488 488 /*
489 489 * Fix up root to point at itself for leaves and resources
490 490 * and not have any children
491 491 */
492 492 lgrp_root->lgrp_childcnt = 0;
493 493 klgrpset_clear(lgrp_root->lgrp_children);
494 494 klgrpset_clear(lgrp_root->lgrp_leaves);
495 495 klgrpset_add(lgrp_root->lgrp_leaves, LGRP_ROOTID);
496 496 klgrpset_clear(lgrp_root->lgrp_set[LGRP_RSRC_MEM]);
497 497 klgrpset_add(lgrp_root->lgrp_set[LGRP_RSRC_MEM], LGRP_ROOTID);
498 498 }
499 499
500 500 /*
501 501 * Initialize kstats framework.
502 502 */
503 503 lgrp_kstat_init();
504 504 /*
505 505 * cpu0 is finally where it should be, so create its lgroup's kstats
506 506 */
507 507 mutex_enter(&cpu_lock);
508 508 lgrp_kstat_create(cp);
509 509 mutex_exit(&cpu_lock);
510 510
511 511 lgrp_initialized = 1;
512 512 }
513 513
514 514 /*
515 515 * Finish lgrp initialization after all CPUS are brought on-line.
516 516 * This routine is called after start_other_cpus().
517 517 */
518 518 static void
519 519 lgrp_main_mp_init(void)
520 520 {
521 521 klgrpset_t changed;
522 522
523 523 /*
524 524 * Update lgroup topology (if necessary)
525 525 */
526 526 klgrpset_clear(changed);
527 527 (void) lgrp_topo_update(lgrp_table, lgrp_alloc_max + 1, &changed);
528 528 lgrp_topo_initialized = 1;
529 529 }
530 530
531 531 /*
532 532 * Change latency of lgroup with specified lgroup platform handle (if one is
533 533 * given) or change all lgroups with old latency to new latency
534 534 */
535 535 void
536 536 lgrp_latency_change(lgrp_handle_t hand, u_longlong_t oldtime,
537 537 u_longlong_t newtime)
538 538 {
539 539 lgrp_t *lgrp;
540 540 int i;
541 541
542 542 for (i = 0; i <= lgrp_alloc_max; i++) {
543 543 lgrp = lgrp_table[i];
544 544
545 545 if (!LGRP_EXISTS(lgrp))
546 546 continue;
547 547
548 548 if ((hand == LGRP_NULL_HANDLE &&
549 549 lgrp->lgrp_latency == oldtime) ||
550 550 (hand != LGRP_NULL_HANDLE && lgrp->lgrp_plathand == hand))
551 551 lgrp->lgrp_latency = (int)newtime;
552 552 }
553 553 }
554 554
555 555 /*
556 556 * Handle lgroup (re)configuration events (eg. addition of CPU, etc.)
557 557 */
558 558 void
559 559 lgrp_config(lgrp_config_flag_t event, uintptr_t resource, uintptr_t where)
560 560 {
561 561 klgrpset_t changed;
562 562 cpu_t *cp;
563 563 lgrp_id_t id;
564 564 int rc;
565 565
566 566 switch (event) {
567 567 /*
568 568 * The following (re)configuration events are common code
569 569 * initiated. lgrp_plat_config() is called here to inform the
570 570 * platform of the reconfiguration event.
571 571 */
572 572 case LGRP_CONFIG_CPU_ADD:
573 573 cp = (cpu_t *)resource;
574 574
575 575 /*
576 576 * Initialize the new CPU's lgrp related next/prev
577 577 * links, and give it a bootstrap lpl so that it can
578 578 * survive should it need to enter the dispatcher.
579 579 */
580 580 cp->cpu_next_lpl = cp;
581 581 cp->cpu_prev_lpl = cp;
582 582 cp->cpu_next_lgrp = cp;
583 583 cp->cpu_prev_lgrp = cp;
584 584 cp->cpu_lpl = lpl_bootstrap;
585 585
586 586 lgrp_plat_config(event, resource);
587 587 atomic_add_32(&lgrp_gen, 1);
588 588
589 589 break;
590 590 case LGRP_CONFIG_CPU_DEL:
591 591 lgrp_plat_config(event, resource);
592 592 atomic_add_32(&lgrp_gen, 1);
593 593
594 594 break;
595 595 case LGRP_CONFIG_CPU_ONLINE:
596 596 cp = (cpu_t *)resource;
597 597 lgrp_cpu_init(cp);
598 598 lgrp_part_add_cpu(cp, cp->cpu_lpl->lpl_lgrpid);
599 599 rc = lpl_topo_verify(cp->cpu_part);
600 600 if (rc != LPL_TOPO_CORRECT) {
601 601 panic("lpl_topo_verify failed: %d", rc);
602 602 }
603 603 lgrp_plat_config(event, resource);
604 604 atomic_add_32(&lgrp_gen, 1);
605 605
606 606 break;
607 607 case LGRP_CONFIG_CPU_OFFLINE:
608 608 cp = (cpu_t *)resource;
609 609 id = cp->cpu_lpl->lpl_lgrpid;
610 610 lgrp_part_del_cpu(cp);
611 611 lgrp_cpu_fini(cp, id);
612 612 rc = lpl_topo_verify(cp->cpu_part);
613 613 if (rc != LPL_TOPO_CORRECT) {
614 614 panic("lpl_topo_verify failed: %d", rc);
615 615 }
616 616 lgrp_plat_config(event, resource);
617 617 atomic_add_32(&lgrp_gen, 1);
618 618
619 619 break;
620 620 case LGRP_CONFIG_CPUPART_ADD:
621 621 cp = (cpu_t *)resource;
622 622 lgrp_part_add_cpu((cpu_t *)resource, (lgrp_id_t)where);
623 623 rc = lpl_topo_verify(cp->cpu_part);
624 624 if (rc != LPL_TOPO_CORRECT) {
625 625 panic("lpl_topo_verify failed: %d", rc);
626 626 }
627 627 lgrp_plat_config(event, resource);
628 628
629 629 break;
630 630 case LGRP_CONFIG_CPUPART_DEL:
631 631 cp = (cpu_t *)resource;
632 632 lgrp_part_del_cpu((cpu_t *)resource);
633 633 rc = lpl_topo_verify(cp->cpu_part);
634 634 if (rc != LPL_TOPO_CORRECT) {
635 635 panic("lpl_topo_verify failed: %d", rc);
636 636 }
637 637 lgrp_plat_config(event, resource);
638 638
639 639 break;
640 640 /*
641 641 * The following events are initiated by the memnode
642 642 * subsystem.
643 643 */
644 644 case LGRP_CONFIG_MEM_ADD:
645 645 lgrp_mem_init((int)resource, where, B_FALSE);
646 646 atomic_add_32(&lgrp_gen, 1);
647 647
648 648 break;
649 649 case LGRP_CONFIG_MEM_DEL:
650 650 lgrp_mem_fini((int)resource, where, B_FALSE);
651 651 atomic_add_32(&lgrp_gen, 1);
652 652
653 653 break;
654 654 case LGRP_CONFIG_MEM_RENAME: {
655 655 lgrp_config_mem_rename_t *ren_arg =
656 656 (lgrp_config_mem_rename_t *)where;
657 657
658 658 lgrp_mem_rename((int)resource,
659 659 ren_arg->lmem_rename_from,
660 660 ren_arg->lmem_rename_to);
661 661 atomic_add_32(&lgrp_gen, 1);
662 662
663 663 break;
664 664 }
665 665 case LGRP_CONFIG_GEN_UPDATE:
666 666 atomic_add_32(&lgrp_gen, 1);
667 667
668 668 break;
669 669 case LGRP_CONFIG_FLATTEN:
670 670 if (where == 0)
671 671 lgrp_topo_levels = (int)resource;
672 672 else
673 673 (void) lgrp_topo_flatten(resource,
674 674 lgrp_table, lgrp_alloc_max, &changed);
675 675
676 676 break;
677 677 /*
678 678 * Update any lgroups with old latency to new latency
679 679 */
680 680 case LGRP_CONFIG_LAT_CHANGE_ALL:
681 681 lgrp_latency_change(LGRP_NULL_HANDLE, (u_longlong_t)resource,
682 682 (u_longlong_t)where);
683 683
684 684 break;
685 685 /*
686 686 * Update lgroup with specified lgroup platform handle to have
687 687 * new latency
688 688 */
689 689 case LGRP_CONFIG_LAT_CHANGE:
690 690 lgrp_latency_change((lgrp_handle_t)resource, 0,
691 691 (u_longlong_t)where);
692 692
693 693 break;
694 694 case LGRP_CONFIG_NOP:
695 695
696 696 break;
697 697 default:
698 698 break;
699 699 }
700 700
701 701 }
702 702
703 703 /*
704 704 * Called to add lgrp info into cpu structure from cpu_add_unit;
705 705 * do not assume cpu is in cpu[] yet!
706 706 *
707 707 * CPUs are brought online with all other CPUs paused so we can't
708 708 * allocate memory or we could deadlock the system, so we rely on
709 709 * the platform to statically allocate as much space as we need
710 710 * for the lgrp structs and stats.
711 711 */
712 712 static void
713 713 lgrp_cpu_init(struct cpu *cp)
714 714 {
715 715 klgrpset_t changed;
716 716 int count;
717 717 lgrp_handle_t hand;
718 718 int first_cpu;
719 719 lgrp_t *my_lgrp;
720 720 lgrp_id_t lgrpid;
721 721 struct cpu *cptr;
722 722
723 723 /*
724 724 * This is the first time through if the resource set
725 725 * for the root lgroup is empty. After cpu0 has been
726 726 * initially added to an lgroup, the root's CPU resource
727 727 * set can never be empty, since the system's last CPU
728 728 * cannot be offlined.
729 729 */
730 730 if (klgrpset_isempty(lgrp_root->lgrp_set[LGRP_RSRC_CPU])) {
731 731 /*
732 732 * First time through.
733 733 */
734 734 first_cpu = 1;
735 735 } else {
736 736 /*
737 737 * If cpu0 needs to move lgroups, we may come
738 738 * through here again, at which time cpu_lock won't
739 739 * be held, and lgrp_initialized will be false.
740 740 */
741 741 ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
742 742 ASSERT(cp->cpu_part != NULL);
743 743 first_cpu = 0;
744 744 }
745 745
746 746 hand = lgrp_plat_cpu_to_hand(cp->cpu_id);
747 747 my_lgrp = lgrp_hand_to_lgrp(hand);
748 748
749 749 if (my_lgrp == NULL) {
750 750 /*
751 751 * Create new lgrp and add it to lgroup topology
752 752 */
753 753 my_lgrp = lgrp_create();
754 754 my_lgrp->lgrp_plathand = hand;
755 755 my_lgrp->lgrp_latency = lgrp_plat_latency(hand, hand);
756 756 lgrpid = my_lgrp->lgrp_id;
757 757 klgrpset_add(my_lgrp->lgrp_leaves, lgrpid);
758 758 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
759 759
760 760 count = 0;
761 761 klgrpset_clear(changed);
762 762 count += lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
763 763 &changed);
764 764 /*
765 765 * May have added new intermediate lgroups, so need to add
766 766 * resources other than CPUs which are added below
767 767 */
768 768 (void) lgrp_mnode_update(changed, NULL);
769 769 } else if (my_lgrp->lgrp_latency == 0 && lgrp_plat_latency(hand, hand)
770 770 > 0) {
771 771 /*
772 772 * Leaf lgroup was created, but latency wasn't available
773 773 * then. So, set latency for it and fill in rest of lgroup
774 774 * topology now that we know how far it is from other leaf
775 775 * lgroups.
776 776 */
777 777 lgrpid = my_lgrp->lgrp_id;
778 778 klgrpset_clear(changed);
779 779 if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_CPU],
780 780 lgrpid))
781 781 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
782 782 count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
783 783 &changed);
784 784
785 785 /*
786 786 * May have added new intermediate lgroups, so need to add
787 787 * resources other than CPUs which are added below
788 788 */
789 789 (void) lgrp_mnode_update(changed, NULL);
790 790 } else if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_CPU],
791 791 my_lgrp->lgrp_id)) {
792 792 int i;
793 793
794 794 /*
795 795 * Update existing lgroup and lgroups containing it with CPU
796 796 * resource
797 797 */
798 798 lgrpid = my_lgrp->lgrp_id;
799 799 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
800 800 for (i = 0; i <= lgrp_alloc_max; i++) {
801 801 lgrp_t *lgrp;
802 802
803 803 lgrp = lgrp_table[i];
804 804 if (!LGRP_EXISTS(lgrp) ||
805 805 !lgrp_rsets_member(lgrp->lgrp_set, lgrpid))
806 806 continue;
807 807
808 808 klgrpset_add(lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
809 809 }
810 810 }
811 811
812 812 lgrpid = my_lgrp->lgrp_id;
813 813 cp->cpu_lpl = &cp->cpu_part->cp_lgrploads[lgrpid];
814 814
815 815 /*
816 816 * For multi-lgroup systems, need to setup lpl for CPU0 or CPU0 will
817 817 * end up in lpl for lgroup 0 whether it is supposed to be in there or
818 818 * not since none of the lgroup IDs in the lpls have been set yet.
819 819 */
820 820 if (first_cpu && nlgrpsmax > 1 && lgrpid != cp->cpu_lpl->lpl_lgrpid)
821 821 cp->cpu_lpl->lpl_lgrpid = lgrpid;
822 822
823 823 /*
824 824 * link the CPU into the lgrp's CPU list
825 825 */
826 826 if (my_lgrp->lgrp_cpucnt == 0) {
827 827 my_lgrp->lgrp_cpu = cp;
828 828 cp->cpu_next_lgrp = cp->cpu_prev_lgrp = cp;
829 829 } else {
830 830 cptr = my_lgrp->lgrp_cpu;
831 831 cp->cpu_next_lgrp = cptr;
832 832 cp->cpu_prev_lgrp = cptr->cpu_prev_lgrp;
833 833 cptr->cpu_prev_lgrp->cpu_next_lgrp = cp;
834 834 cptr->cpu_prev_lgrp = cp;
835 835 }
836 836 my_lgrp->lgrp_cpucnt++;
837 837 }
838 838
839 839 lgrp_t *
840 840 lgrp_create(void)
841 841 {
842 842 lgrp_t *my_lgrp;
843 843 lgrp_id_t lgrpid;
844 844 int i;
845 845
846 846 ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock));
847 847
848 848 /*
849 849 * Find an open slot in the lgroup table and recycle unused lgroup
850 850 * left there if any
851 851 */
852 852 my_lgrp = NULL;
853 853 if (lgrp_alloc_hint == -1)
854 854 /*
855 855 * Allocate from end when hint not set yet because no lgroups
856 856 * have been deleted yet
857 857 */
858 858 lgrpid = nlgrps++;
859 859 else {
860 860 /*
861 861 * Start looking for next open slot from hint and leave hint
862 862 * at slot allocated
863 863 */
864 864 for (i = lgrp_alloc_hint; i < nlgrpsmax; i++) {
865 865 my_lgrp = lgrp_table[i];
866 866 if (!LGRP_EXISTS(my_lgrp)) {
867 867 lgrpid = i;
868 868 nlgrps++;
869 869 break;
870 870 }
871 871 }
872 872 lgrp_alloc_hint = lgrpid;
873 873 }
874 874
875 875 /*
876 876 * Keep track of max lgroup ID allocated so far to cut down on searches
877 877 */
878 878 if (lgrpid > lgrp_alloc_max)
879 879 lgrp_alloc_max = lgrpid;
880 880
881 881 /*
882 882 * Need to allocate new lgroup if next open slot didn't have one
883 883 * for recycling
884 884 */
885 885 if (my_lgrp == NULL)
886 886 my_lgrp = lgrp_plat_alloc(lgrpid);
887 887
888 888 if (nlgrps > nlgrpsmax || my_lgrp == NULL)
889 889 panic("Too many lgrps for platform (%d)", nlgrps);
890 890
891 891 my_lgrp->lgrp_id = lgrpid;
892 892 my_lgrp->lgrp_latency = 0;
893 893 my_lgrp->lgrp_plathand = LGRP_NULL_HANDLE;
894 894 my_lgrp->lgrp_parent = NULL;
895 895 my_lgrp->lgrp_childcnt = 0;
896 896 my_lgrp->lgrp_mnodes = (mnodeset_t)0;
897 897 my_lgrp->lgrp_nmnodes = 0;
898 898 klgrpset_clear(my_lgrp->lgrp_children);
899 899 klgrpset_clear(my_lgrp->lgrp_leaves);
900 900 for (i = 0; i < LGRP_RSRC_COUNT; i++)
901 901 klgrpset_clear(my_lgrp->lgrp_set[i]);
902 902
903 903 my_lgrp->lgrp_cpu = NULL;
904 904 my_lgrp->lgrp_cpucnt = 0;
905 905
906 906 if (my_lgrp->lgrp_kstat != NULL)
907 907 lgrp_kstat_reset(lgrpid);
908 908
909 909 lgrp_table[my_lgrp->lgrp_id] = my_lgrp;
910 910
911 911 return (my_lgrp);
912 912 }
913 913
914 914 void
915 915 lgrp_destroy(lgrp_t *lgrp)
916 916 {
917 917 int i;
918 918
919 919 /*
920 920 * Unless this lgroup is being destroyed on behalf of
921 921 * the boot CPU, cpu_lock must be held
922 922 */
923 923 ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock));
924 924
925 925 if (nlgrps == 1)
926 926 cmn_err(CE_PANIC, "Can't destroy only lgroup!");
927 927
928 928 if (!LGRP_EXISTS(lgrp))
929 929 return;
930 930
931 931 /*
932 932 * Set hint to lgroup being deleted and try to keep lower numbered
933 933 * hints to facilitate finding empty slots
934 934 */
935 935 if (lgrp_alloc_hint == -1 || lgrp->lgrp_id < lgrp_alloc_hint)
936 936 lgrp_alloc_hint = lgrp->lgrp_id;
937 937
938 938 /*
939 939 * Mark this lgroup to be recycled by setting its lgroup ID to
940 940 * LGRP_NONE and clear relevant fields
941 941 */
942 942 lgrp->lgrp_id = LGRP_NONE;
943 943 lgrp->lgrp_latency = 0;
944 944 lgrp->lgrp_plathand = LGRP_NULL_HANDLE;
945 945 lgrp->lgrp_parent = NULL;
946 946 lgrp->lgrp_childcnt = 0;
947 947
948 948 klgrpset_clear(lgrp->lgrp_children);
949 949 klgrpset_clear(lgrp->lgrp_leaves);
950 950 for (i = 0; i < LGRP_RSRC_COUNT; i++)
951 951 klgrpset_clear(lgrp->lgrp_set[i]);
952 952
953 953 lgrp->lgrp_mnodes = (mnodeset_t)0;
954 954 lgrp->lgrp_nmnodes = 0;
955 955
956 956 lgrp->lgrp_cpu = NULL;
957 957 lgrp->lgrp_cpucnt = 0;
958 958
959 959 nlgrps--;
960 960 }
961 961
962 962 /*
963 963 * Initialize kstat data. Called from lgrp initialization code.
964 964 */
965 965 static void
966 966 lgrp_kstat_init(void)
967 967 {
968 968 lgrp_stat_t stat;
969 969
970 970 mutex_init(&lgrp_kstat_mutex, NULL, MUTEX_DEFAULT, NULL);
971 971
972 972 for (stat = 0; stat < LGRP_NUM_STATS; stat++)
973 973 kstat_named_init(&lgrp_kstat_data[stat],
974 974 lgrp_kstat_names[stat], KSTAT_DATA_INT64);
975 975 }
976 976
977 977 /*
978 978 * initialize an lgrp's kstats if needed
979 979 * called with cpu_lock held but not with cpus paused.
980 980 * we don't tear these down now because we don't know about
981 981 * memory leaving the lgrp yet...
982 982 */
983 983
984 984 void
985 985 lgrp_kstat_create(cpu_t *cp)
986 986 {
987 987 kstat_t *lgrp_kstat;
988 988 lgrp_id_t lgrpid;
989 989 lgrp_t *my_lgrp;
990 990
991 991 ASSERT(MUTEX_HELD(&cpu_lock));
992 992
993 993 lgrpid = cp->cpu_lpl->lpl_lgrpid;
994 994 my_lgrp = lgrp_table[lgrpid];
995 995
996 996 if (my_lgrp->lgrp_kstat != NULL)
997 997 return; /* already initialized */
998 998
999 999 lgrp_kstat = kstat_create("lgrp", lgrpid, NULL, "misc",
1000 1000 KSTAT_TYPE_NAMED, LGRP_NUM_STATS,
1001 1001 KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE);
1002 1002
1003 1003 if (lgrp_kstat != NULL) {
1004 1004 lgrp_kstat->ks_lock = &lgrp_kstat_mutex;
1005 1005 lgrp_kstat->ks_private = my_lgrp;
1006 1006 lgrp_kstat->ks_data = &lgrp_kstat_data;
1007 1007 lgrp_kstat->ks_update = lgrp_kstat_extract;
1008 1008 my_lgrp->lgrp_kstat = lgrp_kstat;
1009 1009 kstat_install(lgrp_kstat);
1010 1010 }
1011 1011 }
1012 1012
1013 1013 /*
1014 1014 * this will do something when we manage to remove now unused lgrps
1015 1015 */
1016 1016
1017 1017 /* ARGSUSED */
1018 1018 void
1019 1019 lgrp_kstat_destroy(cpu_t *cp)
1020 1020 {
1021 1021 ASSERT(MUTEX_HELD(&cpu_lock));
1022 1022 }
1023 1023
1024 1024 /*
1025 1025 * Called when a CPU is off-lined.
1026 1026 */
1027 1027 static void
1028 1028 lgrp_cpu_fini(struct cpu *cp, lgrp_id_t lgrpid)
1029 1029 {
1030 1030 lgrp_t *my_lgrp;
1031 1031 struct cpu *prev;
1032 1032 struct cpu *next;
1033 1033
1034 1034 ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
1035 1035
1036 1036 prev = cp->cpu_prev_lgrp;
1037 1037 next = cp->cpu_next_lgrp;
1038 1038
1039 1039 prev->cpu_next_lgrp = next;
1040 1040 next->cpu_prev_lgrp = prev;
1041 1041
1042 1042 /*
1043 1043 * just because I'm paranoid doesn't mean...
1044 1044 */
1045 1045
1046 1046 cp->cpu_next_lgrp = cp->cpu_prev_lgrp = NULL;
1047 1047
1048 1048 my_lgrp = lgrp_table[lgrpid];
1049 1049 my_lgrp->lgrp_cpucnt--;
1050 1050
1051 1051 /*
1052 1052 * Removing last CPU in lgroup, so update lgroup topology
1053 1053 */
1054 1054 if (my_lgrp->lgrp_cpucnt == 0) {
1055 1055 klgrpset_t changed;
1056 1056 int count;
1057 1057 int i;
1058 1058
1059 1059 my_lgrp->lgrp_cpu = NULL;
1060 1060
1061 1061 /*
1062 1062 * Remove this lgroup from its lgroup CPU resources and remove
1063 1063 * lgroup from lgroup topology if it doesn't have any more
1064 1064 * resources in it now
1065 1065 */
1066 1066 klgrpset_del(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
1067 1067 if (lgrp_rsets_empty(my_lgrp->lgrp_set)) {
1068 1068 count = 0;
1069 1069 klgrpset_clear(changed);
1070 1070 count += lgrp_leaf_delete(my_lgrp, lgrp_table,
1071 1071 lgrp_alloc_max + 1, &changed);
1072 1072 return;
1073 1073 }
1074 1074
1075 1075 /*
1076 1076 * This lgroup isn't empty, so just remove it from CPU
1077 1077 * resources of any lgroups that contain it as such
1078 1078 */
1079 1079 for (i = 0; i <= lgrp_alloc_max; i++) {
1080 1080 lgrp_t *lgrp;
1081 1081
1082 1082 lgrp = lgrp_table[i];
1083 1083 if (!LGRP_EXISTS(lgrp) ||
1084 1084 !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_CPU],
1085 1085 lgrpid))
1086 1086 continue;
1087 1087
1088 1088 klgrpset_del(lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
1089 1089 }
1090 1090 return;
1091 1091 }
1092 1092
1093 1093 if (my_lgrp->lgrp_cpu == cp)
1094 1094 my_lgrp->lgrp_cpu = next;
1095 1095
1096 1096 }
1097 1097
1098 1098 /*
1099 1099 * Update memory nodes in target lgroups and return ones that get changed
1100 1100 */
1101 1101 int
1102 1102 lgrp_mnode_update(klgrpset_t target, klgrpset_t *changed)
1103 1103 {
1104 1104 int count;
1105 1105 int i;
1106 1106 int j;
1107 1107 lgrp_t *lgrp;
1108 1108 lgrp_t *lgrp_rsrc;
1109 1109
1110 1110 count = 0;
1111 1111 if (changed)
1112 1112 klgrpset_clear(*changed);
1113 1113
1114 1114 if (klgrpset_isempty(target))
1115 1115 return (0);
1116 1116
1117 1117 /*
1118 1118 * Find each lgroup in target lgroups
1119 1119 */
1120 1120 for (i = 0; i <= lgrp_alloc_max; i++) {
1121 1121 /*
1122 1122 * Skip any lgroups that don't exist or aren't in target group
1123 1123 */
1124 1124 lgrp = lgrp_table[i];
1125 1125 if (!klgrpset_ismember(target, i) || !LGRP_EXISTS(lgrp)) {
1126 1126 continue;
1127 1127 }
1128 1128
1129 1129 /*
1130 1130 * Initialize memnodes for intermediate lgroups to 0
1131 1131 * and update them from scratch since they may have completely
1132 1132 * changed
1133 1133 */
1134 1134 if (lgrp->lgrp_childcnt && lgrp != lgrp_root) {
1135 1135 lgrp->lgrp_mnodes = (mnodeset_t)0;
1136 1136 lgrp->lgrp_nmnodes = 0;
1137 1137 }
1138 1138
1139 1139 /*
1140 1140 * Update memory nodes of target lgroup with memory nodes
1141 1141 * from each lgroup in its lgroup memory resource set
1142 1142 */
1143 1143 for (j = 0; j <= lgrp_alloc_max; j++) {
1144 1144 int k;
1145 1145
1146 1146 /*
1147 1147 * Skip any lgroups that don't exist or aren't in
1148 1148 * memory resources of target lgroup
1149 1149 */
1150 1150 lgrp_rsrc = lgrp_table[j];
1151 1151 if (!LGRP_EXISTS(lgrp_rsrc) ||
1152 1152 !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM],
1153 1153 j))
1154 1154 continue;
1155 1155
1156 1156 /*
1157 1157 * Update target lgroup's memnodes to include memnodes
1158 1158 * of this lgroup
1159 1159 */
1160 1160 for (k = 0; k < sizeof (mnodeset_t) * NBBY; k++) {
1161 1161 mnodeset_t mnode_mask;
1162 1162
1163 1163 mnode_mask = (mnodeset_t)1 << k;
1164 1164 if ((lgrp_rsrc->lgrp_mnodes & mnode_mask) &&
1165 1165 !(lgrp->lgrp_mnodes & mnode_mask)) {
1166 1166 lgrp->lgrp_mnodes |= mnode_mask;
1167 1167 lgrp->lgrp_nmnodes++;
1168 1168 }
1169 1169 }
1170 1170 count++;
1171 1171 if (changed)
1172 1172 klgrpset_add(*changed, lgrp->lgrp_id);
1173 1173 }
1174 1174 }
1175 1175
1176 1176 return (count);
1177 1177 }
1178 1178
1179 1179 /*
1180 1180 * Memory copy-rename. Called when the "mnode" containing the kernel cage memory
1181 1181 * is moved from one board to another. The "from" and "to" arguments specify the
1182 1182 * source and the destination of the move.
1183 1183 *
1184 1184 * See plat_lgrp_config() for a detailed description of the copy-rename
1185 1185 * semantics.
1186 1186 *
1187 1187 * lgrp_mem_rename() is called by the platform copy-rename code to update
1188 1188 * the lgroup topology which is changing as memory moves from one lgroup to
1189 1189 * another. It removes the mnode from the source lgroup and re-inserts it in the
1190 1190 * target lgroup.
1191 1191 *
1192 1192 * The lgrp_mem_rename() function passes a flag to lgrp_mem_init() and
1193 1193 * lgrp_mem_fini() indicating that the insertion and deletion are part of a DR
1194 1194 * copy-rename operation.
1195 1195 *
1196 1196 * There is one case which requires special handling. If the system contains
1197 1197 * only two boards (mnodes), lgrp_mem_fini() removes the only mnode from the
1198 1198 * lgroup hierarchy. This mnode is soon re-inserted back in the hierarchy by
1199 1199 * lgrp_mem_init(), but there is a window when the system has no memory in the
1200 1200 * lgroup hierarchy. If another thread tries to allocate memory during this
1201 1201 * window, the allocation will fail, although the system has physical memory.
1202 1202 * This may cause a system panic or a deadlock (some sleeping memory allocations
1203 1203 * happen with cpu_lock held which prevents lgrp_mem_init() from re-inserting
1204 1204 * the mnode back).
1205 1205 *
1206 1206 * The lgrp_memnode_choose() function walks the lgroup hierarchy looking for the
1207 1207 * lgrp with non-empty lgrp_mnodes. To deal with the special case above,
1208 1208 * lgrp_mem_fini() does not remove the last mnode from the lroot->lgrp_mnodes,
1209 1209 * but it updates the rest of the lgroup topology as if the mnode was actually
1210 1210 * removed. The lgrp_mem_init() function recognizes that the mnode being
1211 1211 * inserted represents such a special case and updates the topology
1212 1212 * appropriately.
1213 1213 */
1214 1214 void
1215 1215 lgrp_mem_rename(int mnode, lgrp_handle_t from, lgrp_handle_t to)
1216 1216 {
1217 1217 /*
1218 1218 * Remove the memory from the source node and add it to the destination
1219 1219 * node.
1220 1220 */
1221 1221 lgrp_mem_fini(mnode, from, B_TRUE);
1222 1222 lgrp_mem_init(mnode, to, B_TRUE);
1223 1223 }
1224 1224
1225 1225 /*
1226 1226 * Called to indicate that the lgrp with platform handle "hand" now
1227 1227 * contains the memory identified by "mnode".
1228 1228 *
1229 1229 * LOCKING for this routine is a bit tricky. Usually it is called without
1230 1230 * cpu_lock and it must grab cpu_lock here to prevent racing with other
1231 1231 * callers. During DR of the board containing the caged memory it may be called
1232 1232 * with cpu_lock already held and CPUs paused.
1233 1233 *
1234 1234 * If the insertion is part of the DR copy-rename and the inserted mnode (and
1235 1235 * only this mnode) is already present in the lgrp_root->lgrp_mnodes set, we are
1236 1236 * dealing with the special case of DR copy-rename described in
1237 1237 * lgrp_mem_rename().
1238 1238 */
1239 1239 void
1240 1240 lgrp_mem_init(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename)
1241 1241 {
1242 1242 klgrpset_t changed;
1243 1243 int count;
1244 1244 int i;
1245 1245 lgrp_t *my_lgrp;
1246 1246 lgrp_id_t lgrpid;
1247 1247 mnodeset_t mnodes_mask = ((mnodeset_t)1 << mnode);
1248 1248 boolean_t drop_lock = B_FALSE;
1249 1249 boolean_t need_synch = B_FALSE;
1250 1250
1251 1251 /*
1252 1252 * Grab CPU lock (if we haven't already)
1253 1253 */
1254 1254 if (!MUTEX_HELD(&cpu_lock)) {
1255 1255 mutex_enter(&cpu_lock);
1256 1256 drop_lock = B_TRUE;
1257 1257 }
1258 1258
1259 1259 /*
1260 1260 * This routine may be called from a context where we already
1261 1261 * hold cpu_lock, and have already paused cpus.
1262 1262 */
1263 1263 if (!cpus_paused())
1264 1264 need_synch = B_TRUE;
1265 1265
1266 1266 /*
1267 1267 * Check if this mnode is already configured and return immediately if
1268 1268 * it is.
1269 1269 *
1270 1270 * NOTE: in special case of copy-rename of the only remaining mnode,
1271 1271 * lgrp_mem_fini() refuses to remove the last mnode from the root, so we
1272 1272 * recognize this case and continue as usual, but skip the update to
1273 1273 * the lgrp_mnodes and the lgrp_nmnodes. This resolves the inconsistency
1274 1274 * in topology, temporarily introduced by lgrp_mem_fini().
1275 1275 */
1276 1276 if (! (is_copy_rename && (lgrp_root->lgrp_mnodes == mnodes_mask)) &&
1277 1277 lgrp_root->lgrp_mnodes & mnodes_mask) {
1278 1278 if (drop_lock)
1279 1279 mutex_exit(&cpu_lock);
1280 1280 return;
1281 1281 }
1282 1282
1283 1283 /*
1284 1284 * Update lgroup topology with new memory resources, keeping track of
1285 1285 * which lgroups change
1286 1286 */
1287 1287 count = 0;
1288 1288 klgrpset_clear(changed);
1289 1289 my_lgrp = lgrp_hand_to_lgrp(hand);
1290 1290 if (my_lgrp == NULL) {
1291 1291 /* new lgrp */
1292 1292 my_lgrp = lgrp_create();
1293 1293 lgrpid = my_lgrp->lgrp_id;
1294 1294 my_lgrp->lgrp_plathand = hand;
1295 1295 my_lgrp->lgrp_latency = lgrp_plat_latency(hand, hand);
1296 1296 klgrpset_add(my_lgrp->lgrp_leaves, lgrpid);
1297 1297 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1298 1298
1299 1299 if (need_synch)
1300 - pause_cpus(NULL);
1300 + pause_cpus(NULL, NULL);
1301 1301 count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
1302 1302 &changed);
1303 1303 if (need_synch)
1304 1304 start_cpus();
1305 1305 } else if (my_lgrp->lgrp_latency == 0 && lgrp_plat_latency(hand, hand)
1306 1306 > 0) {
1307 1307 /*
1308 1308 * Leaf lgroup was created, but latency wasn't available
1309 1309 * then. So, set latency for it and fill in rest of lgroup
1310 1310 * topology now that we know how far it is from other leaf
1311 1311 * lgroups.
1312 1312 */
1313 1313 klgrpset_clear(changed);
1314 1314 lgrpid = my_lgrp->lgrp_id;
1315 1315 if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_MEM],
1316 1316 lgrpid))
1317 1317 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1318 1318 if (need_synch)
1319 - pause_cpus(NULL);
1319 + pause_cpus(NULL, NULL);
1320 1320 count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
1321 1321 &changed);
1322 1322 if (need_synch)
1323 1323 start_cpus();
1324 1324 } else if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_MEM],
1325 1325 my_lgrp->lgrp_id)) {
1326 1326 /*
1327 1327 * Add new lgroup memory resource to existing lgroup
1328 1328 */
1329 1329 lgrpid = my_lgrp->lgrp_id;
1330 1330 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1331 1331 klgrpset_add(changed, lgrpid);
1332 1332 count++;
1333 1333 for (i = 0; i <= lgrp_alloc_max; i++) {
1334 1334 lgrp_t *lgrp;
1335 1335
1336 1336 lgrp = lgrp_table[i];
1337 1337 if (!LGRP_EXISTS(lgrp) ||
1338 1338 !lgrp_rsets_member(lgrp->lgrp_set, lgrpid))
1339 1339 continue;
1340 1340
1341 1341 klgrpset_add(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1342 1342 klgrpset_add(changed, lgrp->lgrp_id);
1343 1343 count++;
1344 1344 }
1345 1345 }
1346 1346
1347 1347 /*
1348 1348 * Add memory node to lgroup and remove lgroup from ones that need
1349 1349 * to be updated
1350 1350 */
1351 1351 if (!(my_lgrp->lgrp_mnodes & mnodes_mask)) {
1352 1352 my_lgrp->lgrp_mnodes |= mnodes_mask;
1353 1353 my_lgrp->lgrp_nmnodes++;
1354 1354 }
1355 1355 klgrpset_del(changed, lgrpid);
1356 1356
1357 1357 /*
1358 1358 * Update memory node information for all lgroups that changed and
1359 1359 * contain new memory node as a resource
1360 1360 */
1361 1361 if (count)
1362 1362 (void) lgrp_mnode_update(changed, NULL);
1363 1363
1364 1364 if (drop_lock)
1365 1365 mutex_exit(&cpu_lock);
1366 1366 }
1367 1367
1368 1368 /*
1369 1369 * Called to indicate that the lgroup associated with the platform
1370 1370 * handle "hand" no longer contains given memory node
1371 1371 *
1372 1372 * LOCKING for this routine is a bit tricky. Usually it is called without
1373 1373 * cpu_lock and it must grab cpu_lock here to prevent racing with other
1374 1374 * callers. During DR of the board containing the caged memory it may be called
1375 1375 * with cpu_lock already held and CPUs paused.
1376 1376 *
1377 1377 * If the deletion is part of the DR copy-rename and the deleted mnode is the
1378 1378 * only one present in the lgrp_root->lgrp_mnodes, all the topology is updated,
1379 1379 * but lgrp_root->lgrp_mnodes is left intact. Later, lgrp_mem_init() will insert
1380 1380 * the same mnode back into the topology. See lgrp_mem_rename() and
1381 1381 * lgrp_mem_init() for additional details.
1382 1382 */
1383 1383 void
1384 1384 lgrp_mem_fini(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename)
1385 1385 {
1386 1386 klgrpset_t changed;
1387 1387 int count;
1388 1388 int i;
1389 1389 lgrp_t *my_lgrp;
1390 1390 lgrp_id_t lgrpid;
1391 1391 mnodeset_t mnodes_mask;
1392 1392 boolean_t drop_lock = B_FALSE;
1393 1393 boolean_t need_synch = B_FALSE;
1394 1394
1395 1395 /*
1396 1396 * Grab CPU lock (if we haven't already)
1397 1397 */
1398 1398 if (!MUTEX_HELD(&cpu_lock)) {
1399 1399 mutex_enter(&cpu_lock);
1400 1400 drop_lock = B_TRUE;
1401 1401 }
1402 1402
1403 1403 /*
1404 1404 * This routine may be called from a context where we already
1405 1405 * hold cpu_lock and have already paused cpus.
1406 1406 */
1407 1407 if (!cpus_paused())
1408 1408 need_synch = B_TRUE;
1409 1409
1410 1410 my_lgrp = lgrp_hand_to_lgrp(hand);
1411 1411
1412 1412 /*
1413 1413 * The lgrp *must* be pre-existing
1414 1414 */
1415 1415 ASSERT(my_lgrp != NULL);
1416 1416
1417 1417 /*
1418 1418 * Delete memory node from lgroups which contain it
1419 1419 */
1420 1420 mnodes_mask = ((mnodeset_t)1 << mnode);
1421 1421 for (i = 0; i <= lgrp_alloc_max; i++) {
1422 1422 lgrp_t *lgrp = lgrp_table[i];
1423 1423 /*
1424 1424 * Skip any non-existent lgroups and any lgroups that don't
1425 1425 * contain leaf lgroup of memory as a memory resource
1426 1426 */
1427 1427 if (!LGRP_EXISTS(lgrp) ||
1428 1428 !(lgrp->lgrp_mnodes & mnodes_mask))
1429 1429 continue;
1430 1430
1431 1431 /*
1432 1432 * Avoid removing the last mnode from the root in the DR
1433 1433 * copy-rename case. See lgrp_mem_rename() for details.
1434 1434 */
1435 1435 if (is_copy_rename &&
1436 1436 (lgrp == lgrp_root) && (lgrp->lgrp_mnodes == mnodes_mask))
1437 1437 continue;
1438 1438
1439 1439 /*
1440 1440 * Remove memory node from lgroup.
1441 1441 */
1442 1442 lgrp->lgrp_mnodes &= ~mnodes_mask;
1443 1443 lgrp->lgrp_nmnodes--;
1444 1444 ASSERT(lgrp->lgrp_nmnodes >= 0);
1445 1445 }
1446 1446 ASSERT(lgrp_root->lgrp_nmnodes > 0);
1447 1447
1448 1448 /*
1449 1449 * Don't need to update lgroup topology if this lgroup still has memory.
1450 1450 *
1451 1451 * In the special case of DR copy-rename with the only mnode being
1452 1452 * removed, the lgrp_mnodes for the root is always non-zero, but we
1453 1453 * still need to update the lgroup topology.
1454 1454 */
1455 1455 if ((my_lgrp->lgrp_nmnodes > 0) &&
1456 1456 !(is_copy_rename && (my_lgrp == lgrp_root) &&
1457 1457 (my_lgrp->lgrp_mnodes == mnodes_mask))) {
1458 1458 if (drop_lock)
1459 1459 mutex_exit(&cpu_lock);
1460 1460 return;
1461 1461 }
1462 1462
1463 1463 /*
1464 1464 * This lgroup does not contain any memory now
1465 1465 */
1466 1466 klgrpset_clear(my_lgrp->lgrp_set[LGRP_RSRC_MEM]);
1467 1467
1468 1468 /*
1469 1469 * Remove this lgroup from lgroup topology if it does not contain any
1470 1470 * resources now
1471 1471 */
1472 1472 lgrpid = my_lgrp->lgrp_id;
1473 1473 count = 0;
1474 1474 klgrpset_clear(changed);
1475 1475 if (lgrp_rsets_empty(my_lgrp->lgrp_set)) {
1476 1476 /*
1477 1477 * Delete lgroup when no more resources
1478 1478 */
1479 1479 if (need_synch)
1480 - pause_cpus(NULL);
1480 + pause_cpus(NULL, NULL);
1481 1481 count = lgrp_leaf_delete(my_lgrp, lgrp_table,
1482 1482 lgrp_alloc_max + 1, &changed);
1483 1483 ASSERT(count > 0);
1484 1484 if (need_synch)
1485 1485 start_cpus();
1486 1486 } else {
1487 1487 /*
1488 1488 * Remove lgroup from memory resources of any lgroups that
1489 1489 * contain it as such
1490 1490 */
1491 1491 for (i = 0; i <= lgrp_alloc_max; i++) {
1492 1492 lgrp_t *lgrp;
1493 1493
1494 1494 lgrp = lgrp_table[i];
1495 1495 if (!LGRP_EXISTS(lgrp) ||
1496 1496 !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM],
1497 1497 lgrpid))
1498 1498 continue;
1499 1499
1500 1500 klgrpset_del(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1501 1501 }
1502 1502 }
1503 1503 if (drop_lock)
1504 1504 mutex_exit(&cpu_lock);
1505 1505 }
1506 1506
1507 1507 /*
1508 1508 * Return lgroup with given platform handle
1509 1509 */
1510 1510 lgrp_t *
1511 1511 lgrp_hand_to_lgrp(lgrp_handle_t hand)
1512 1512 {
1513 1513 int i;
1514 1514 lgrp_t *lgrp;
1515 1515
1516 1516 if (hand == LGRP_NULL_HANDLE)
1517 1517 return (NULL);
1518 1518
1519 1519 for (i = 0; i <= lgrp_alloc_max; i++) {
1520 1520 lgrp = lgrp_table[i];
1521 1521 if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
1522 1522 return (lgrp);
1523 1523 }
1524 1524 return (NULL);
1525 1525 }
1526 1526
1527 1527 /*
1528 1528 * Return the home lgroup of the current thread.
1529 1529 * We must do this with kernel preemption disabled, since we don't want our
1530 1530 * thread to be re-homed while we're poking around with its lpl, and the lpl
1531 1531 * should never be NULL.
1532 1532 *
1533 1533 * NOTE: Can't guarantee that lgroup will be valid once kernel preemption
1534 1534 * is enabled because of DR. Callers can use disable kernel preemption
1535 1535 * around this call to guarantee that the lgroup will be valid beyond this
1536 1536 * routine, since kernel preemption can be recursive.
1537 1537 */
1538 1538 lgrp_t *
1539 1539 lgrp_home_lgrp(void)
1540 1540 {
1541 1541 lgrp_t *lgrp;
1542 1542 lpl_t *lpl;
1543 1543
1544 1544 kpreempt_disable();
1545 1545
1546 1546 lpl = curthread->t_lpl;
1547 1547 ASSERT(lpl != NULL);
1548 1548 ASSERT(lpl->lpl_lgrpid >= 0 && lpl->lpl_lgrpid <= lgrp_alloc_max);
1549 1549 ASSERT(LGRP_EXISTS(lgrp_table[lpl->lpl_lgrpid]));
1550 1550 lgrp = lgrp_table[lpl->lpl_lgrpid];
1551 1551
1552 1552 kpreempt_enable();
1553 1553
1554 1554 return (lgrp);
1555 1555 }
1556 1556
1557 1557 /*
1558 1558 * Return ID of home lgroup for given thread
1559 1559 * (See comments for lgrp_home_lgrp() for special care and handling
1560 1560 * instructions)
1561 1561 */
1562 1562 lgrp_id_t
1563 1563 lgrp_home_id(kthread_t *t)
1564 1564 {
1565 1565 lgrp_id_t lgrp;
1566 1566 lpl_t *lpl;
1567 1567
1568 1568 ASSERT(t != NULL);
1569 1569 /*
1570 1570 * We'd like to ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock)), but we
1571 1571 * cannot since the HAT layer can call into this routine to
1572 1572 * determine the locality for its data structures in the context
1573 1573 * of a page fault.
1574 1574 */
1575 1575
1576 1576 kpreempt_disable();
1577 1577
1578 1578 lpl = t->t_lpl;
1579 1579 ASSERT(lpl != NULL);
1580 1580 ASSERT(lpl->lpl_lgrpid >= 0 && lpl->lpl_lgrpid <= lgrp_alloc_max);
1581 1581 lgrp = lpl->lpl_lgrpid;
1582 1582
1583 1583 kpreempt_enable();
1584 1584
1585 1585 return (lgrp);
1586 1586 }
1587 1587
1588 1588 /*
1589 1589 * Return lgroup containing the physical memory for the given page frame number
1590 1590 */
1591 1591 lgrp_t *
1592 1592 lgrp_pfn_to_lgrp(pfn_t pfn)
1593 1593 {
1594 1594 lgrp_handle_t hand;
1595 1595 int i;
1596 1596 lgrp_t *lgrp;
1597 1597
1598 1598 hand = lgrp_plat_pfn_to_hand(pfn);
1599 1599 if (hand != LGRP_NULL_HANDLE)
1600 1600 for (i = 0; i <= lgrp_alloc_max; i++) {
1601 1601 lgrp = lgrp_table[i];
1602 1602 if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
1603 1603 return (lgrp);
1604 1604 }
1605 1605 return (NULL);
1606 1606 }
1607 1607
1608 1608 /*
1609 1609 * Return lgroup containing the physical memory for the given page frame number
1610 1610 */
1611 1611 lgrp_t *
1612 1612 lgrp_phys_to_lgrp(u_longlong_t physaddr)
1613 1613 {
1614 1614 lgrp_handle_t hand;
1615 1615 int i;
1616 1616 lgrp_t *lgrp;
1617 1617 pfn_t pfn;
1618 1618
1619 1619 pfn = btop(physaddr);
1620 1620 hand = lgrp_plat_pfn_to_hand(pfn);
1621 1621 if (hand != LGRP_NULL_HANDLE)
1622 1622 for (i = 0; i <= lgrp_alloc_max; i++) {
1623 1623 lgrp = lgrp_table[i];
1624 1624 if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
1625 1625 return (lgrp);
1626 1626 }
1627 1627 return (NULL);
1628 1628 }
1629 1629
1630 1630 /*
1631 1631 * Return the leaf lgroup containing the given CPU
1632 1632 *
1633 1633 * The caller needs to take precautions necessary to prevent
1634 1634 * "cpu" and its lpl from going away across a call to this function.
1635 1635 * hint: kpreempt_disable()/kpreempt_enable()
1636 1636 */
1637 1637 static lgrp_t *
1638 1638 lgrp_cpu_to_lgrp(cpu_t *cpu)
1639 1639 {
1640 1640 return (cpu->cpu_lpl->lpl_lgrp);
1641 1641 }
1642 1642
1643 1643 /*
1644 1644 * Return the sum of the partition loads in an lgrp divided by
1645 1645 * the number of CPUs in the lgrp. This is our best approximation
1646 1646 * of an 'lgroup load average' for a useful per-lgroup kstat.
1647 1647 */
1648 1648 static uint64_t
1649 1649 lgrp_sum_loadavgs(lgrp_t *lgrp)
1650 1650 {
1651 1651 cpu_t *cpu;
1652 1652 int ncpu;
1653 1653 uint64_t loads = 0;
1654 1654
1655 1655 mutex_enter(&cpu_lock);
1656 1656
1657 1657 cpu = lgrp->lgrp_cpu;
1658 1658 ncpu = lgrp->lgrp_cpucnt;
1659 1659
1660 1660 if (cpu == NULL || ncpu == 0) {
1661 1661 mutex_exit(&cpu_lock);
1662 1662 return (0ull);
1663 1663 }
1664 1664
1665 1665 do {
1666 1666 loads += cpu->cpu_lpl->lpl_loadavg;
1667 1667 cpu = cpu->cpu_next_lgrp;
1668 1668 } while (cpu != lgrp->lgrp_cpu);
1669 1669
1670 1670 mutex_exit(&cpu_lock);
1671 1671
1672 1672 return (loads / ncpu);
1673 1673 }
1674 1674
1675 1675 void
1676 1676 lgrp_stat_add(lgrp_id_t lgrpid, lgrp_stat_t stat, int64_t val)
1677 1677 {
1678 1678 struct lgrp_stats *pstats;
1679 1679
1680 1680 /*
1681 1681 * Verify that the caller isn't trying to add to
1682 1682 * a statistic for an lgroup that has gone away
1683 1683 */
1684 1684 if (lgrpid < 0 || lgrpid > lgrp_alloc_max)
1685 1685 return;
1686 1686
1687 1687 pstats = &lgrp_stats[lgrpid];
1688 1688 atomic_add_64((uint64_t *)LGRP_STAT_WRITE_PTR(pstats, stat), val);
1689 1689 }
1690 1690
1691 1691 int64_t
1692 1692 lgrp_stat_read(lgrp_id_t lgrpid, lgrp_stat_t stat)
1693 1693 {
1694 1694 uint64_t val;
1695 1695 struct lgrp_stats *pstats;
1696 1696
1697 1697 if (lgrpid < 0 || lgrpid > lgrp_alloc_max)
1698 1698 return ((int64_t)0);
1699 1699
1700 1700 pstats = &lgrp_stats[lgrpid];
1701 1701 LGRP_STAT_READ(pstats, stat, val);
1702 1702 return (val);
1703 1703 }
1704 1704
1705 1705 /*
1706 1706 * Reset all kstats for lgrp specified by its lgrpid.
1707 1707 */
1708 1708 static void
1709 1709 lgrp_kstat_reset(lgrp_id_t lgrpid)
1710 1710 {
1711 1711 lgrp_stat_t stat;
1712 1712
1713 1713 if (lgrpid < 0 || lgrpid > lgrp_alloc_max)
1714 1714 return;
1715 1715
1716 1716 for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) {
1717 1717 LGRP_STAT_RESET(&lgrp_stats[lgrpid], stat);
1718 1718 }
1719 1719 }
1720 1720
1721 1721 /*
1722 1722 * Collect all per-lgrp statistics for the lgrp associated with this
1723 1723 * kstat, and store them in the ks_data array.
1724 1724 *
1725 1725 * The superuser can reset all the running counter statistics for an
1726 1726 * lgrp by writing to any of the lgrp's stats.
1727 1727 */
1728 1728 static int
1729 1729 lgrp_kstat_extract(kstat_t *ksp, int rw)
1730 1730 {
1731 1731 lgrp_stat_t stat;
1732 1732 struct kstat_named *ksd;
1733 1733 lgrp_t *lgrp;
1734 1734 lgrp_id_t lgrpid;
1735 1735
1736 1736 lgrp = (lgrp_t *)ksp->ks_private;
1737 1737
1738 1738 ksd = (struct kstat_named *)ksp->ks_data;
1739 1739 ASSERT(ksd == (struct kstat_named *)&lgrp_kstat_data);
1740 1740
1741 1741 lgrpid = lgrp->lgrp_id;
1742 1742
1743 1743 if (lgrpid == LGRP_NONE) {
1744 1744 /*
1745 1745 * Return all zeroes as stats for freed lgrp.
1746 1746 */
1747 1747 for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) {
1748 1748 ksd[stat].value.i64 = 0;
1749 1749 }
1750 1750 ksd[stat + LGRP_NUM_CPUS].value.i64 = 0;
1751 1751 ksd[stat + LGRP_NUM_PG_INSTALL].value.i64 = 0;
1752 1752 ksd[stat + LGRP_NUM_PG_AVAIL].value.i64 = 0;
1753 1753 ksd[stat + LGRP_NUM_PG_FREE].value.i64 = 0;
1754 1754 ksd[stat + LGRP_LOADAVG].value.i64 = 0;
1755 1755 } else if (rw != KSTAT_WRITE) {
1756 1756 /*
1757 1757 * Handle counter stats
1758 1758 */
1759 1759 for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) {
1760 1760 ksd[stat].value.i64 = lgrp_stat_read(lgrpid, stat);
1761 1761 }
1762 1762
1763 1763 /*
1764 1764 * Handle kernel data snapshot stats
1765 1765 */
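		/*
		 * After the loop above, stat == LGRP_NUM_COUNTER_STATS, so
		 * the stat + LGRP_NUM_CPUS, etc. entries below land in the
		 * ks_data slots that follow the counter stats.
		 */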
1766 1766 ksd[stat + LGRP_NUM_CPUS].value.i64 = lgrp->lgrp_cpucnt;
1767 1767 ksd[stat + LGRP_NUM_PG_INSTALL].value.i64 =
1768 1768 lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_INSTALL);
1769 1769 ksd[stat + LGRP_NUM_PG_AVAIL].value.i64 =
1770 1770 lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_AVAIL);
1771 1771 ksd[stat + LGRP_NUM_PG_FREE].value.i64 =
1772 1772 lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE);
1773 1773 ksd[stat + LGRP_LOADAVG].value.i64 = lgrp_sum_loadavgs(lgrp);
1774 1774 ksd[stat + LGRP_LOADAVG_SCALE].value.i64 =
1775 1775 lgrp_loadavg_max_effect;
1776 1776 } else {
1777 1777 lgrp_kstat_reset(lgrpid);
1778 1778 }
1779 1779
1780 1780 return (0);
1781 1781 }
1782 1782
1783 1783 int
1784 1784 lgrp_query_cpu(processorid_t id, lgrp_id_t *lp)
1785 1785 {
1786 1786 cpu_t *cp;
1787 1787
1788 1788 mutex_enter(&cpu_lock);
1789 1789
1790 1790 if ((cp = cpu_get(id)) == NULL) {
1791 1791 mutex_exit(&cpu_lock);
1792 1792 return (EINVAL);
1793 1793 }
1794 1794
1795 1795 if (cpu_is_offline(cp) || cpu_is_poweredoff(cp)) {
1796 1796 mutex_exit(&cpu_lock);
1797 1797 return (EINVAL);
1798 1798 }
1799 1799
1800 1800 ASSERT(cp->cpu_lpl != NULL);
1801 1801
1802 1802 *lp = cp->cpu_lpl->lpl_lgrpid;
1803 1803
1804 1804 mutex_exit(&cpu_lock);
1805 1805
1806 1806 return (0);
1807 1807 }
1808 1808
1809 1809 int
1810 1810 lgrp_query_load(processorid_t id, lgrp_load_t *lp)
1811 1811 {
1812 1812 cpu_t *cp;
1813 1813
1814 1814 mutex_enter(&cpu_lock);
1815 1815
1816 1816 if ((cp = cpu_get(id)) == NULL) {
1817 1817 mutex_exit(&cpu_lock);
1818 1818 return (EINVAL);
1819 1819 }
1820 1820
1821 1821 ASSERT(cp->cpu_lpl != NULL);
1822 1822
1823 1823 *lp = cp->cpu_lpl->lpl_loadavg;
1824 1824
1825 1825 mutex_exit(&cpu_lock);
1826 1826
1827 1827 return (0);
1828 1828 }
1829 1829
1830 1830 /*
1831 1831 * Add a resource named by lpl_leaf to rset of lpl_target
1832 1832 *
1833 1833 * This routine also adjusts ncpu and nrset if the call succeeds in adding a
1834 1834 * resource. It is adjusted here, as this is presently the only place that we
1835 1835 * can be certain a resource addition has succeeded.
1836 1836 *
1837 1837 * We keep the list of rsets sorted so that the dispatcher can quickly walk the
1838 1838 * list in order until it reaches a NULL. (This list is required to be NULL
1839 1839 * terminated, too). This is done so that we can mark start pos + 1, so that
1840 1840 * each lpl is traversed sequentially, but in a different order. We hope this
1841 1841 * will improve performance a bit. (Hopefully, less read-to-own traffic...)
1842 1842 */
1843 1843
1844 1844 void
1845 1845 lpl_rset_add(lpl_t *lpl_target, lpl_t *lpl_leaf)
1846 1846 {
1847 1847 int i;
1848 1848 int entry_slot = 0;
1849 1849
1850 1850 /* return if leaf is already present */
1851 1851 for (i = 0; i < lpl_target->lpl_nrset; i++) {
1852 1852 if (lpl_target->lpl_rset[i] == lpl_leaf) {
1853 1853 return;
1854 1854 }
1855 1855
1856 1856 if (lpl_target->lpl_rset[i]->lpl_lgrpid >
1857 1857 lpl_leaf->lpl_lgrpid) {
1858 1858 break;
1859 1859 }
1860 1860 }
1861 1861
1862 1862 /* insert leaf, update counts */
1863 1863 entry_slot = i;
1864 1864 i = lpl_target->lpl_nrset++;
1865 1865
1866 1866 /*
1867 1867 * Start at the end of the rset array and work backwards towards the
1868 1868 * slot into which the new lpl will be inserted. This effectively
1869 1869 * preserves the current ordering by scooting everybody over one entry,
1870 1870 * and placing the new entry into the space created.
1871 1871 */
1872 1872 while (i-- > entry_slot) {
1873 1873 lpl_target->lpl_rset[i + 1] = lpl_target->lpl_rset[i];
1874 1874 lpl_target->lpl_id2rset[lpl_target->lpl_rset[i]->lpl_lgrpid] =
1875 1875 i + 1;
1876 1876 }
1877 1877
1878 1878 lpl_target->lpl_rset[entry_slot] = lpl_leaf;
1879 1879 lpl_target->lpl_id2rset[lpl_leaf->lpl_lgrpid] = entry_slot;
1880 1880
1881 1881 lpl_target->lpl_ncpu += lpl_leaf->lpl_ncpu;
1882 1882 }
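
/*
 * A minimal sketch of the walk that the block comment above describes (the
 * "start" offset and "visit" callback are purely illustrative): because
 * lpl_rset is kept sorted and NULL terminated, a consumer can simply run
 * forward from some starting offset until it hits the terminator.
 *
 *	int	i;
 *
 *	for (i = start; lpl->lpl_rset[i] != NULL; i++)
 *		visit(lpl->lpl_rset[i]);
 */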
1883 1883
1884 1884 /*
1885 1885 * Update each of lpl_parent's children with a reference to their parent.
1886 1886 * The lgrp topology is used as the reference since it is fully
1887 1887 * consistent and correct at this point.
1888 1888 * This should be called after any potential change in lpl_parent's
1889 1889 * rset.
1890 1890 */
1891 1891 static void
1892 1892 lpl_child_update(lpl_t *lpl_parent, struct cpupart *cp)
1893 1893 {
1894 1894 klgrpset_t children;
1895 1895 int i;
1896 1896
1897 1897 children = lgrp_table[lpl_parent->lpl_lgrpid]->lgrp_children;
1898 1898 if (klgrpset_isempty(children))
1899 1899 return; /* nothing to do */
1900 1900
1901 1901 for (i = 0; i <= lgrp_alloc_max; i++) {
1902 1902 if (klgrpset_ismember(children, i)) {
1903 1903 /*
1904 1904 * (Re)set the parent. It may be incorrect if
1905 1905 * lpl_parent is new in the topology.
1906 1906 */
1907 1907 cp->cp_lgrploads[i].lpl_parent = lpl_parent;
1908 1908 }
1909 1909 }
1910 1910 }
1911 1911
1912 1912 /*
1913 1913 * Delete resource lpl_leaf from rset of lpl_target, assuming it's there.
1914 1914 *
1915 1915 * This routine also adjusts ncpu and nrset if the call succeeds in deleting a
1916 1916 * resource. The values are adjusted here, as this is the only place that we can
1917 1917 * be certain a resource was successfully deleted.
1918 1918 */
1919 1919 void
1920 1920 lpl_rset_del(lpl_t *lpl_target, lpl_t *lpl_leaf)
1921 1921 {
1922 1922 int i;
1923 1923 lpl_t *leaf;
1924 1924
1925 1925 if (lpl_target->lpl_nrset == 0)
1926 1926 return;
1927 1927
1928 1928 /* find leaf in intermediate node */
1929 1929 for (i = 0; i < lpl_target->lpl_nrset; i++) {
1930 1930 if (lpl_target->lpl_rset[i] == lpl_leaf)
1931 1931 break;
1932 1932 }
1933 1933
1934 1934 /* return if leaf not found */
1935 1935 if (lpl_target->lpl_rset[i] != lpl_leaf)
1936 1936 return;
1937 1937
1938 1938 /* prune leaf, compress array */
1939 1939 lpl_target->lpl_rset[lpl_target->lpl_nrset--] = NULL;
1940 1940 lpl_target->lpl_id2rset[lpl_leaf->lpl_lgrpid] = -1;
1941 1941 lpl_target->lpl_ncpu--;
1942 1942 do {
1943 1943 lpl_target->lpl_rset[i] = lpl_target->lpl_rset[i + 1];
1944 1944 /*
1945 1945 * Update the lgrp id <=> rset mapping
1946 1946 */
1947 1947 if ((leaf = lpl_target->lpl_rset[i]) != NULL) {
1948 1948 lpl_target->lpl_id2rset[leaf->lpl_lgrpid] = i;
1949 1949 }
1950 1950 } while (i++ < lpl_target->lpl_nrset);
1951 1951 }
1952 1952
1953 1953 /*
1954 1954 * Check to see if the resource set of the target lpl contains the
1955 1955 * supplied leaf lpl. This returns 1 if the lpl is found, 0 if it is not.
1956 1956 */
1957 1957
1958 1958 int
1959 1959 lpl_rset_contains(lpl_t *lpl_target, lpl_t *lpl_leaf)
1960 1960 {
1961 1961 int i;
1962 1962
1963 1963 for (i = 0; i < lpl_target->lpl_nrset; i++) {
1964 1964 if (lpl_target->lpl_rset[i] == lpl_leaf)
1965 1965 return (1);
1966 1966 }
1967 1967
1968 1968 return (0);
1969 1969 }
1970 1970
1971 1971 /*
1972 1972 * Called when we change cpu lpl membership. This increments or decrements the
1973 1973 * per-cpu counter in every lpl in which our leaf appears.
1974 1974 */
1975 1975 void
1976 1976 lpl_cpu_adjcnt(lpl_act_t act, cpu_t *cp)
1977 1977 {
1978 1978 cpupart_t *cpupart;
1979 1979 lgrp_t *lgrp_leaf;
1980 1980 lgrp_t *lgrp_cur;
1981 1981 lpl_t *lpl_leaf;
1982 1982 lpl_t *lpl_cur;
1983 1983 int i;
1984 1984
1985 1985 ASSERT(act == LPL_DECREMENT || act == LPL_INCREMENT);
1986 1986
1987 1987 cpupart = cp->cpu_part;
1988 1988 lpl_leaf = cp->cpu_lpl;
1989 1989 lgrp_leaf = lgrp_table[lpl_leaf->lpl_lgrpid];
1990 1990
1991 1991 for (i = 0; i <= lgrp_alloc_max; i++) {
1992 1992 lgrp_cur = lgrp_table[i];
1993 1993
1994 1994 /*
1995 1995 * Don't adjust if the lgrp isn't there, if we're the leaf lpl
1996 1996 * for the cpu in question, or if the current lgrp and leaf
1997 1997 * don't share the same resources.
1998 1998 */
1999 1999
2000 2000 if (!LGRP_EXISTS(lgrp_cur) || (lgrp_cur == lgrp_leaf) ||
2001 2001 !klgrpset_intersects(lgrp_leaf->lgrp_set[LGRP_RSRC_CPU],
2002 2002 lgrp_cur->lgrp_set[LGRP_RSRC_CPU]))
2003 2003 continue;
2004 2004
2005 2005
2006 2006 lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];
2007 2007
2008 2008 if (lpl_cur->lpl_nrset > 0) {
2009 2009 if (act == LPL_INCREMENT) {
2010 2010 lpl_cur->lpl_ncpu++;
2011 2011 } else if (act == LPL_DECREMENT) {
2012 2012 lpl_cur->lpl_ncpu--;
2013 2013 }
2014 2014 }
2015 2015 }
2016 2016 }
2017 2017
2018 2018 /*
2019 2019 * Initialize lpl with given resources and specified lgrp
2020 2020 */
2021 2021 void
2022 2022 lpl_init(lpl_t *lpl, lpl_t *lpl_leaf, lgrp_t *lgrp)
2023 2023 {
2024 2024 lpl->lpl_lgrpid = lgrp->lgrp_id;
2025 2025 lpl->lpl_loadavg = 0;
2026 2026 if (lpl == lpl_leaf)
2027 2027 lpl->lpl_ncpu = 1;
2028 2028 else
2029 2029 lpl->lpl_ncpu = lpl_leaf->lpl_ncpu;
2030 2030 lpl->lpl_nrset = 1;
2031 2031 lpl->lpl_rset[0] = lpl_leaf;
2032 2032 lpl->lpl_id2rset[lpl_leaf->lpl_lgrpid] = 0;
2033 2033 lpl->lpl_lgrp = lgrp;
2034 2034 lpl->lpl_parent = NULL; /* set by lpl_leaf_insert() */
2035 2035 lpl->lpl_cpus = NULL; /* set by lgrp_part_add_cpu() */
2036 2036 }
2037 2037
2038 2038 /*
2039 2039 * Clear an unused lpl
2040 2040 */
2041 2041 void
2042 2042 lpl_clear(lpl_t *lpl)
2043 2043 {
2044 2044 /*
2045 2045 * Clear out all fields in the lpl except:
2046 2046 * lpl_lgrpid - to facilitate debugging
2047 2047 * lpl_rset, lpl_rset_sz, lpl_id2rset - rset array references / size
2048 2048 *
2049 2049 * Note that the lpl's rset and id2rset mapping are cleared as well.
2050 2050 */
2051 2051 lpl->lpl_loadavg = 0;
2052 2052 lpl->lpl_ncpu = 0;
2053 2053 lpl->lpl_lgrp = NULL;
2054 2054 lpl->lpl_parent = NULL;
2055 2055 lpl->lpl_cpus = NULL;
2056 2056 lpl->lpl_nrset = 0;
2057 2057 lpl->lpl_homed_time = 0;
2058 2058 bzero(lpl->lpl_rset, sizeof (lpl->lpl_rset[0]) * lpl->lpl_rset_sz);
2059 2059 bzero(lpl->lpl_id2rset,
2060 2060 sizeof (lpl->lpl_id2rset[0]) * lpl->lpl_rset_sz);
2061 2061 }
2062 2062
2063 2063 /*
2064 2064 * Given a CPU-partition, verify that the lpl topology in the CPU-partition
2065 2065  * is in sync with the lgroup topology in the system. The lpl topology may not
2066 2066 * make full use of all of the lgroup topology, but this checks to make sure
2067 2067 * that for the parts that it does use, it has correctly understood the
2068 2068 * relationships that exist. This function returns
2069 2069  * 0 if the topology is correct, or a non-zero error code on non-DEBUG
2070 2070  * kernels if it is incorrect. Asserts are spread throughout the code to aid in
2071 2071 * debugging on a DEBUG kernel.
2072 2072 */
2073 2073 int
2074 2074 lpl_topo_verify(cpupart_t *cpupart)
2075 2075 {
2076 2076 lgrp_t *lgrp;
2077 2077 lpl_t *lpl;
2078 2078 klgrpset_t rset;
2079 2079 klgrpset_t cset;
2080 2080 cpu_t *cpu;
2081 2081 cpu_t *cp_start;
2082 2082 int i;
2083 2083 int j;
2084 2084 int sum;
2085 2085
2086 2086 /* topology can't be incorrect if it doesn't exist */
2087 2087 if (!lgrp_topo_initialized || !lgrp_initialized)
2088 2088 return (LPL_TOPO_CORRECT);
2089 2089
2090 2090 ASSERT(cpupart != NULL);
2091 2091
2092 2092 for (i = 0; i <= lgrp_alloc_max; i++) {
2093 2093 lgrp = lgrp_table[i];
2094 2094 lpl = NULL;
2095 2095 /* make sure lpls are allocated */
2096 2096 ASSERT(cpupart->cp_lgrploads);
2097 2097 if (!cpupart->cp_lgrploads)
2098 2098 return (LPL_TOPO_PART_HAS_NO_LPL);
2099 2099
2100 2100 lpl = &cpupart->cp_lgrploads[i];
2101 2101 /* make sure our index is good */
2102 2102 ASSERT(i < cpupart->cp_nlgrploads);
2103 2103
2104 2104 /* if lgroup doesn't exist, make sure lpl is empty */
2105 2105 if (!LGRP_EXISTS(lgrp)) {
2106 2106 ASSERT(lpl->lpl_ncpu == 0);
2107 2107 if (lpl->lpl_ncpu > 0) {
2108 2108 return (LPL_TOPO_CPUS_NOT_EMPTY);
2109 2109 } else {
2110 2110 continue;
2111 2111 }
2112 2112 }
2113 2113
2114 2114 /* verify that lgroup and lpl are identically numbered */
2115 2115 ASSERT(lgrp->lgrp_id == lpl->lpl_lgrpid);
2116 2116
2117 2117 /* if lgroup isn't in our partition, make sure lpl is empty */
2118 2118 if (!klgrpset_intersects(lgrp->lgrp_leaves,
2119 2119 cpupart->cp_lgrpset)) {
2120 2120 ASSERT(lpl->lpl_ncpu == 0);
2121 2121 if (lpl->lpl_ncpu > 0) {
2122 2122 return (LPL_TOPO_CPUS_NOT_EMPTY);
2123 2123 }
2124 2124 /*
2125 2125 * lpl is empty, and lgroup isn't in partition. verify
2126 2126 * that lpl doesn't show up in anyone else's rsets (in
2127 2127 * this partition, anyway)
2128 2128 */
2129 2129 for (j = 0; j < cpupart->cp_nlgrploads; j++) {
2130 2130 lpl_t *i_lpl; /* lpl we're iterating over */
2131 2131
2132 2132 i_lpl = &cpupart->cp_lgrploads[j];
2133 2133
2134 2134 ASSERT(!lpl_rset_contains(i_lpl, lpl));
2135 2135 if (lpl_rset_contains(i_lpl, lpl)) {
2136 2136 return (LPL_TOPO_LPL_ORPHANED);
2137 2137 }
2138 2138 }
2139 2139 /* lgroup is empty, and everything is ok. continue */
2140 2140 continue;
2141 2141 }
2142 2142
2143 2143
2144 2144 /* lgroup is in this partition, now check it against lpl */
2145 2145
2146 2146 /* do both have matching lgrps? */
2147 2147 ASSERT(lgrp == lpl->lpl_lgrp);
2148 2148 if (lgrp != lpl->lpl_lgrp) {
2149 2149 return (LPL_TOPO_LGRP_MISMATCH);
2150 2150 }
2151 2151
2152 2152 /* do the parent lgroups exist and do they match? */
2153 2153 if (lgrp->lgrp_parent) {
2154 2154 ASSERT(lpl->lpl_parent);
2155 2155 ASSERT(lgrp->lgrp_parent->lgrp_id ==
2156 2156 lpl->lpl_parent->lpl_lgrpid);
2157 2157
2158 2158 if (!lpl->lpl_parent) {
2159 2159 return (LPL_TOPO_MISSING_PARENT);
2160 2160 } else if (lgrp->lgrp_parent->lgrp_id !=
2161 2161 lpl->lpl_parent->lpl_lgrpid) {
2162 2162 return (LPL_TOPO_PARENT_MISMATCH);
2163 2163 }
2164 2164 }
2165 2165
2166 2166 /* only leaf lgroups keep a cpucnt, only check leaves */
2167 2167 if ((lpl->lpl_nrset == 1) && (lpl == lpl->lpl_rset[0])) {
2168 2168
2169 2169 /* verify that lgrp is also a leaf */
2170 2170 ASSERT((lgrp->lgrp_childcnt == 0) &&
2171 2171 (klgrpset_ismember(lgrp->lgrp_leaves,
2172 2172 lpl->lpl_lgrpid)));
2173 2173
2174 2174 if ((lgrp->lgrp_childcnt > 0) ||
2175 2175 (!klgrpset_ismember(lgrp->lgrp_leaves,
2176 2176 lpl->lpl_lgrpid))) {
2177 2177 return (LPL_TOPO_LGRP_NOT_LEAF);
2178 2178 }
2179 2179
2180 2180 ASSERT((lgrp->lgrp_cpucnt >= lpl->lpl_ncpu) &&
2181 2181 (lpl->lpl_ncpu > 0));
2182 2182 if ((lgrp->lgrp_cpucnt < lpl->lpl_ncpu) ||
2183 2183 (lpl->lpl_ncpu <= 0)) {
2184 2184 return (LPL_TOPO_BAD_CPUCNT);
2185 2185 }
2186 2186
2187 2187 /*
2188 2188 * Check that lpl_ncpu also matches the number of
2189 2189 * cpus in the lpl's linked list. This only exists in
2190 2190 * leaves, but they should always match.
2191 2191 */
2192 2192 j = 0;
2193 2193 cpu = cp_start = lpl->lpl_cpus;
2194 2194 while (cpu != NULL) {
2195 2195 j++;
2196 2196
2197 2197 /* check to make sure cpu's lpl is leaf lpl */
2198 2198 ASSERT(cpu->cpu_lpl == lpl);
2199 2199 if (cpu->cpu_lpl != lpl) {
2200 2200 return (LPL_TOPO_CPU_HAS_BAD_LPL);
2201 2201 }
2202 2202
2203 2203 /* check next cpu */
2204 2204 if ((cpu = cpu->cpu_next_lpl) != cp_start) {
2205 2205 continue;
2206 2206 } else {
2207 2207 cpu = NULL;
2208 2208 }
2209 2209 }
2210 2210
2211 2211 ASSERT(j == lpl->lpl_ncpu);
2212 2212 if (j != lpl->lpl_ncpu) {
2213 2213 return (LPL_TOPO_LPL_BAD_NCPU);
2214 2214 }
2215 2215
2216 2216 /*
2217 2217 * Also, check that leaf lpl is contained in all
2218 2218 * intermediate lpls that name the leaf as a descendant
2219 2219 */
2220 2220 for (j = 0; j <= lgrp_alloc_max; j++) {
2221 2221 klgrpset_t intersect;
2222 2222 lgrp_t *lgrp_cand;
2223 2223 lpl_t *lpl_cand;
2224 2224
2225 2225 lgrp_cand = lgrp_table[j];
2226 2226 intersect = klgrpset_intersects(
2227 2227 lgrp_cand->lgrp_set[LGRP_RSRC_CPU],
2228 2228 cpupart->cp_lgrpset);
2229 2229
2230 2230 if (!LGRP_EXISTS(lgrp_cand) ||
2231 2231 !klgrpset_intersects(lgrp_cand->lgrp_leaves,
2232 2232 cpupart->cp_lgrpset) ||
2233 2233 (intersect == 0))
2234 2234 continue;
2235 2235
2236 2236 lpl_cand =
2237 2237 &cpupart->cp_lgrploads[lgrp_cand->lgrp_id];
2238 2238
2239 2239 if (klgrpset_ismember(intersect,
2240 2240 lgrp->lgrp_id)) {
2241 2241 ASSERT(lpl_rset_contains(lpl_cand,
2242 2242 lpl));
2243 2243
2244 2244 if (!lpl_rset_contains(lpl_cand, lpl)) {
2245 2245 return (LPL_TOPO_RSET_MSSNG_LF);
2246 2246 }
2247 2247 }
2248 2248 }
2249 2249
2250 2250 } else { /* non-leaf specific checks */
2251 2251
2252 2252 /*
2253 2253 * Non-leaf lpls should have lpl_cpus == NULL
2254 2254 * verify that this is so
2255 2255 */
2256 2256 ASSERT(lpl->lpl_cpus == NULL);
2257 2257 if (lpl->lpl_cpus != NULL) {
2258 2258 return (LPL_TOPO_NONLEAF_HAS_CPUS);
2259 2259 }
2260 2260
2261 2261 /*
2262 2262 * verify that the sum of the cpus in the leaf resources
2263 2263 * is equal to the total ncpu in the intermediate
2264 2264 */
2265 2265 for (j = sum = 0; j < lpl->lpl_nrset; j++) {
2266 2266 sum += lpl->lpl_rset[j]->lpl_ncpu;
2267 2267 }
2268 2268
2269 2269 ASSERT(sum == lpl->lpl_ncpu);
2270 2270 if (sum != lpl->lpl_ncpu) {
2271 2271 return (LPL_TOPO_LPL_BAD_NCPU);
2272 2272 }
2273 2273 }
2274 2274
2275 2275 /*
2276 2276 * Check the rset of the lpl in question. Make sure that each
2277 2277 * rset contains a subset of the resources in
2278 2278 * lgrp_set[LGRP_RSRC_CPU] and in cp_lgrpset. This also makes
2279 2279 * sure that each rset doesn't include resources that are
2280 2280 * outside of that set. (Which would be resources somehow not
2281 2281 * accounted for).
2282 2282 */
2283 2283 klgrpset_clear(rset);
2284 2284 for (j = 0; j < lpl->lpl_nrset; j++) {
2285 2285 klgrpset_add(rset, lpl->lpl_rset[j]->lpl_lgrpid);
2286 2286 }
2287 2287 klgrpset_copy(cset, rset);
2288 2288 /* make sure lpl rset matches lgrp rset */
2289 2289 klgrpset_diff(rset, lgrp->lgrp_set[LGRP_RSRC_CPU]);
2290 2290 /* make sure rset is contained with in partition, too */
2291 2291 klgrpset_diff(cset, cpupart->cp_lgrpset);
2292 2292
2293 2293 ASSERT(klgrpset_isempty(rset) && klgrpset_isempty(cset));
2294 2294 if (!klgrpset_isempty(rset) || !klgrpset_isempty(cset)) {
2295 2295 return (LPL_TOPO_RSET_MISMATCH);
2296 2296 }
2297 2297
2298 2298 /*
2299 2299 * check to make sure lpl_nrset matches the number of rsets
2300 2300 * contained in the lpl
2301 2301 */
2302 2302 for (j = 0; j < lpl->lpl_nrset; j++) {
2303 2303 if (lpl->lpl_rset[j] == NULL)
2304 2304 break;
2305 2305 }
2306 2306
2307 2307 ASSERT(j == lpl->lpl_nrset);
2308 2308 if (j != lpl->lpl_nrset) {
2309 2309 return (LPL_TOPO_BAD_RSETCNT);
2310 2310 }
2311 2311
2312 2312 }
2313 2313 return (LPL_TOPO_CORRECT);
2314 2314 }
2315 2315
2316 2316 /*
2317 2317 * Flatten lpl topology to given number of levels. This is presently only
2318 2318  * implemented for flattening to 2 levels, which will prune out the intermediates
2319 2319 * and home the leaf lpls to the root lpl.
2320 2320 */
2321 2321 int
2322 2322 lpl_topo_flatten(int levels)
2323 2323 {
2324 2324 int i;
2325 2325 uint_t sum;
2326 2326 lgrp_t *lgrp_cur;
2327 2327 lpl_t *lpl_cur;
2328 2328 lpl_t *lpl_root;
2329 2329 cpupart_t *cp;
2330 2330
2331 2331 if (levels != 2)
2332 2332 return (0);
2333 2333
2334 2334 /* called w/ cpus paused - grab no locks! */
2335 2335 ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 ||
2336 2336 !lgrp_initialized);
2337 2337
2338 2338 cp = cp_list_head;
2339 2339 do {
2340 2340 lpl_root = &cp->cp_lgrploads[lgrp_root->lgrp_id];
2341 2341 ASSERT(LGRP_EXISTS(lgrp_root) && (lpl_root->lpl_ncpu > 0));
2342 2342
2343 2343 for (i = 0; i <= lgrp_alloc_max; i++) {
2344 2344 lgrp_cur = lgrp_table[i];
2345 2345 lpl_cur = &cp->cp_lgrploads[i];
2346 2346
2347 2347 if ((lgrp_cur == lgrp_root) ||
2348 2348 (!LGRP_EXISTS(lgrp_cur) &&
2349 2349 (lpl_cur->lpl_ncpu == 0)))
2350 2350 continue;
2351 2351
2352 2352 if (!LGRP_EXISTS(lgrp_cur) && (lpl_cur->lpl_ncpu > 0)) {
2353 2353 /*
2354 2354 * this should be a deleted intermediate, so
2355 2355 * clear it
2356 2356 */
2357 2357 lpl_clear(lpl_cur);
2358 2358 } else if ((lpl_cur->lpl_nrset == 1) &&
2359 2359 (lpl_cur->lpl_rset[0] == lpl_cur) &&
2360 2360 ((lpl_cur->lpl_parent->lpl_ncpu == 0) ||
2361 2361 (!LGRP_EXISTS(lpl_cur->lpl_parent->lpl_lgrp)))) {
2362 2362 /*
2363 2363 * this is a leaf whose parent was deleted, or
2364 2364 				 * whose parent had its lgrp deleted. (And
2365 2365 * whose parent will soon be deleted). Point
2366 2366 * this guy back to the root lpl.
2367 2367 */
2368 2368 lpl_cur->lpl_parent = lpl_root;
2369 2369 lpl_rset_add(lpl_root, lpl_cur);
2370 2370 }
2371 2371
2372 2372 }
2373 2373
2374 2374 /*
2375 2375 * Now that we're done, make sure the count on the root lpl is
2376 2376 * correct, and update the hints of the children for the sake of
2377 2377 * thoroughness
2378 2378 */
2379 2379 for (i = sum = 0; i < lpl_root->lpl_nrset; i++) {
2380 2380 sum += lpl_root->lpl_rset[i]->lpl_ncpu;
2381 2381 }
2382 2382 lpl_root->lpl_ncpu = sum;
2383 2383 lpl_child_update(lpl_root, cp);
2384 2384
2385 2385 cp = cp->cp_next;
2386 2386 } while (cp != cp_list_head);
2387 2387
2388 2388 return (levels);
2389 2389 }
2390 2390
2391 2391 /*
2392 2392 * Insert a lpl into the resource hierarchy and create any additional lpls that
2393 2393 * are necessary to represent the varying states of locality for the cpu
2394 2394  * resources newly added to the partition.
2395 2395 *
2396 2396 * This routine is clever enough that it can correctly add resources from the
2397 2397 * new leaf into both direct and indirect resource sets in the hierarchy. (Ie,
2398 2398 * those for which the lpl is a leaf as opposed to simply a named equally local
2399 2399 * resource). The one special case that needs additional processing is when a
2400 2400 * new intermediate lpl is introduced. Since the main loop only traverses
2401 2401 * looking to add the leaf resource where it does not yet exist, additional work
2402 2402 * is necessary to add other leaf resources that may need to exist in the newly
2403 2403 * created intermediate. This is performed by the second inner loop, and is
2404 2404 * only done when the check for more than one overlapping resource succeeds.
2405 2405 */
2406 2406
2407 2407 void
2408 2408 lpl_leaf_insert(lpl_t *lpl_leaf, cpupart_t *cpupart)
2409 2409 {
2410 2410 int i;
2411 2411 int j;
2412 2412 int rset_num_intersect;
2413 2413 lgrp_t *lgrp_cur;
2414 2414 lpl_t *lpl_cur;
2415 2415 lpl_t *lpl_parent;
2416 2416 lgrp_id_t parent_id;
2417 2417 klgrpset_t rset_intersect; /* resources in cpupart and lgrp */
2418 2418
2419 2419 for (i = 0; i <= lgrp_alloc_max; i++) {
2420 2420 lgrp_cur = lgrp_table[i];
2421 2421
2422 2422 /*
2423 2423 * Don't insert if the lgrp isn't there, if the leaf isn't
2424 2424 * contained within the current lgrp, or if the current lgrp has
2425 2425 * no leaves in this partition
2426 2426 */
2427 2427
2428 2428 if (!LGRP_EXISTS(lgrp_cur) ||
2429 2429 !klgrpset_ismember(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
2430 2430 lpl_leaf->lpl_lgrpid) ||
2431 2431 !klgrpset_intersects(lgrp_cur->lgrp_leaves,
2432 2432 cpupart->cp_lgrpset))
2433 2433 continue;
2434 2434
2435 2435 lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];
2436 2436 if (lgrp_cur->lgrp_parent != NULL) {
2437 2437 /* if lgrp has a parent, assign it properly */
2438 2438 parent_id = lgrp_cur->lgrp_parent->lgrp_id;
2439 2439 lpl_parent = &cpupart->cp_lgrploads[parent_id];
2440 2440 } else {
2441 2441 /* if not, make sure parent ptr gets set to null */
2442 2442 lpl_parent = NULL;
2443 2443 }
2444 2444
2445 2445 if (lpl_cur == lpl_leaf) {
2446 2446 /*
2447 2447 * Almost all leaf state was initialized elsewhere. The
2448 2448 * only thing left to do is to set the parent.
2449 2449 */
2450 2450 lpl_cur->lpl_parent = lpl_parent;
2451 2451 continue;
2452 2452 }
2453 2453
2454 2454 lpl_clear(lpl_cur);
2455 2455 lpl_init(lpl_cur, lpl_leaf, lgrp_cur);
2456 2456
2457 2457 lpl_cur->lpl_parent = lpl_parent;
2458 2458
2459 2459 /* does new lpl need to be populated with other resources? */
2460 2460 rset_intersect =
2461 2461 klgrpset_intersects(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
2462 2462 cpupart->cp_lgrpset);
2463 2463 klgrpset_nlgrps(rset_intersect, rset_num_intersect);
2464 2464
2465 2465 if (rset_num_intersect > 1) {
2466 2466 /*
2467 2467 * If so, figure out what lpls have resources that
2468 2468 * intersect this one, and add them.
2469 2469 */
2470 2470 for (j = 0; j <= lgrp_alloc_max; j++) {
2471 2471 lgrp_t *lgrp_cand; /* candidate lgrp */
2472 2472 lpl_t *lpl_cand; /* candidate lpl */
2473 2473
2474 2474 lgrp_cand = lgrp_table[j];
2475 2475 if (!LGRP_EXISTS(lgrp_cand) ||
2476 2476 !klgrpset_ismember(rset_intersect,
2477 2477 lgrp_cand->lgrp_id))
2478 2478 continue;
2479 2479 lpl_cand =
2480 2480 &cpupart->cp_lgrploads[lgrp_cand->lgrp_id];
2481 2481 lpl_rset_add(lpl_cur, lpl_cand);
2482 2482 }
2483 2483 }
2484 2484 /*
2485 2485 		 * This lpl's rset has changed. Update the hint in its
2486 2486 * children.
2487 2487 */
2488 2488 lpl_child_update(lpl_cur, cpupart);
2489 2489 }
2490 2490 }
2491 2491
2492 2492 /*
2493 2493 * remove a lpl from the hierarchy of resources, clearing its state when
2494 2494 * finished. If the lpls at the intermediate levels of the hierarchy have no
2495 2495 * remaining resources, or no longer name a leaf resource in the cpu-partition,
2496 2496 * delete them as well.
2497 2497 */
2498 2498
2499 2499 void
2500 2500 lpl_leaf_remove(lpl_t *lpl_leaf, cpupart_t *cpupart)
2501 2501 {
2502 2502 int i;
2503 2503 lgrp_t *lgrp_cur;
2504 2504 lpl_t *lpl_cur;
2505 2505 klgrpset_t leaf_intersect; /* intersection of leaves */
2506 2506
2507 2507 for (i = 0; i <= lgrp_alloc_max; i++) {
2508 2508 lgrp_cur = lgrp_table[i];
2509 2509
2510 2510 /*
2511 2511 * Don't attempt to remove from lgrps that aren't there, that
2512 2512 * don't contain our leaf, or from the leaf itself. (We do that
2513 2513 * later)
2514 2514 */
2515 2515
2516 2516 if (!LGRP_EXISTS(lgrp_cur))
2517 2517 continue;
2518 2518
2519 2519 lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];
2520 2520
2521 2521 if (!klgrpset_ismember(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
2522 2522 lpl_leaf->lpl_lgrpid) ||
2523 2523 (lpl_cur == lpl_leaf)) {
2524 2524 continue;
2525 2525 }
2526 2526
2527 2527 /*
2528 2528 * This is a slightly sleazy simplification in that we have
2529 2529 * already marked the cp_lgrpset as no longer containing the
2530 2530 * leaf we've deleted. Any lpls that pass the above checks
2531 2531 * based upon lgrp membership but not necessarily cpu-part
2532 2532 * membership also get cleared by the checks below. Currently
2533 2533 * this is harmless, as the lpls should be empty anyway.
2534 2534 *
2535 2535 * In particular, we want to preserve lpls that have additional
2536 2536 * leaf resources, even though we don't yet have a processor
2537 2537 * architecture that represents resources this way.
2538 2538 */
2539 2539
2540 2540 leaf_intersect = klgrpset_intersects(lgrp_cur->lgrp_leaves,
2541 2541 cpupart->cp_lgrpset);
2542 2542
2543 2543 lpl_rset_del(lpl_cur, lpl_leaf);
2544 2544 if ((lpl_cur->lpl_nrset == 0) || (!leaf_intersect)) {
2545 2545 lpl_clear(lpl_cur);
2546 2546 } else {
2547 2547 /*
2548 2548 * Update this lpl's children
2549 2549 */
2550 2550 lpl_child_update(lpl_cur, cpupart);
2551 2551 }
2552 2552 }
2553 2553 lpl_clear(lpl_leaf);
2554 2554 }
2555 2555
2556 2556 /*
2557 2557  * add a cpu to a partition in terms of lgrp load avg bookkeeping
2558 2558 *
2559 2559 * The lpl (cpu partition load average information) is now arranged in a
2560 2560 * hierarchical fashion whereby resources that are closest, ie. most local, to
2561 2561 * the cpu in question are considered to be leaves in a tree of resources.
2562 2562  * There are two general cases for cpu addition:
2563 2563 *
2564 2564 * 1. A lpl structure that contains resources already in the hierarchy tree.
2565 2565 * In this case, all of the associated lpl relationships have been defined, and
2566 2566 * all that is necessary is that we link the new cpu into the per-lpl list of
2567 2567 * cpus, and increment the ncpu count of all places where this cpu resource will
2568 2568 * be accounted for. lpl_cpu_adjcnt updates the cpu count, and the cpu pointer
2569 2569 * pushing is accomplished by this routine.
2570 2570 *
2571 2571 * 2. The lpl to contain the resources in this cpu-partition for this lgrp does
2572 2572 * not exist yet. In this case, it is necessary to build the leaf lpl, and
2573 2573  * construct the hierarchy of state necessary to name its more distant
2574 2574 * resources, if they should exist. The leaf structure is initialized by this
2575 2575 * routine, as is the cpu-partition state for the lgrp membership. This routine
2576 2576 * also calls lpl_leaf_insert() which inserts the named lpl into the hierarchy
2577 2577  * and builds all of the "ancestral" state necessary to identify resources at
2578 2578 * differing levels of locality.
2579 2579 */
2580 2580 void
2581 2581 lgrp_part_add_cpu(cpu_t *cp, lgrp_id_t lgrpid)
2582 2582 {
2583 2583 cpupart_t *cpupart;
2584 2584 lgrp_t *lgrp_leaf;
2585 2585 lpl_t *lpl_leaf;
2586 2586
2587 2587 /* called sometimes w/ cpus paused - grab no locks */
2588 2588 ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
2589 2589
2590 2590 cpupart = cp->cpu_part;
2591 2591 lgrp_leaf = lgrp_table[lgrpid];
2592 2592
2593 2593 /* don't add non-existent lgrp */
2594 2594 ASSERT(LGRP_EXISTS(lgrp_leaf));
2595 2595 lpl_leaf = &cpupart->cp_lgrploads[lgrpid];
2596 2596 cp->cpu_lpl = lpl_leaf;
2597 2597
2598 2598 /* only leaf lpls contain cpus */
2599 2599
2600 2600 if (lpl_leaf->lpl_ncpu++ == 0) {
2601 2601 lpl_init(lpl_leaf, lpl_leaf, lgrp_leaf);
2602 2602 klgrpset_add(cpupart->cp_lgrpset, lgrpid);
2603 2603 lpl_leaf_insert(lpl_leaf, cpupart);
2604 2604 } else {
2605 2605 /*
2606 2606 * the lpl should already exist in the parent, so just update
2607 2607 * the count of available CPUs
2608 2608 */
2609 2609 lpl_cpu_adjcnt(LPL_INCREMENT, cp);
2610 2610 }
2611 2611
2612 2612 /* link cpu into list of cpus in lpl */
2613 2613
2614 2614 if (lpl_leaf->lpl_cpus) {
2615 2615 cp->cpu_next_lpl = lpl_leaf->lpl_cpus;
2616 2616 cp->cpu_prev_lpl = lpl_leaf->lpl_cpus->cpu_prev_lpl;
2617 2617 lpl_leaf->lpl_cpus->cpu_prev_lpl->cpu_next_lpl = cp;
2618 2618 lpl_leaf->lpl_cpus->cpu_prev_lpl = cp;
2619 2619 } else {
2620 2620 /*
2621 2621 * We increment ncpu immediately after we create a new leaf
2622 2622 * lpl, so assert that ncpu == 1 for the case where we don't
2623 2623 * have any cpu pointers yet.
2624 2624 */
2625 2625 ASSERT(lpl_leaf->lpl_ncpu == 1);
2626 2626 lpl_leaf->lpl_cpus = cp->cpu_next_lpl = cp->cpu_prev_lpl = cp;
2627 2627 }
2628 2628
2629 2629 }
2630 2630
2631 2631
2632 2632 /*
2633 2633  * remove a cpu from a partition in terms of lgrp load avg bookkeeping
2634 2634 *
2635 2635 * The lpl (cpu partition load average information) is now arranged in a
2636 2636 * hierarchical fashion whereby resources that are closest, ie. most local, to
2637 2637 * the cpu in question are considered to be leaves in a tree of resources.
2638 2638 * There are two removal cases in question:
2639 2639 *
2640 2640 * 1. Removal of the resource in the leaf leaves other resources remaining in
2641 2641 * that leaf. (Another cpu still exists at this level of locality). In this
2642 2642  * case, the count of available cpus is decremented in all associated lpls by
2643 2643  * calling lpl_cpu_adjcnt(), and the pointer to the removed cpu is pruned
2644 2644 * from the per-cpu lpl list.
2645 2645 *
2646 2646 * 2. Removal of the resource results in the lpl containing no resources. (It's
2647 2647 * empty) In this case, all of what has occurred for the first step must take
2648 2648 * place; however, additionally we must remove the lpl structure itself, prune
2649 2649 * out any stranded lpls that do not directly name a leaf resource, and mark the
2650 2650 * cpu partition in question as no longer containing resources from the lgrp of
2651 2651  * the lpl that has been deleted. Cpu-partition changes are handled by this
2652 2652 * method, but the lpl_leaf_remove function deals with the details of pruning
2653 2653 * out the empty lpl and any of its orphaned direct ancestors.
2654 2654 */
2655 2655 void
2656 2656 lgrp_part_del_cpu(cpu_t *cp)
2657 2657 {
2658 2658 lpl_t *lpl;
2659 2659 lpl_t *leaf_lpl;
2660 2660 lgrp_t *lgrp_leaf;
2661 2661
2662 2662 /* called sometimes w/ cpus paused - grab no locks */
2663 2663
2664 2664 ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
2665 2665
2666 2666 lpl = leaf_lpl = cp->cpu_lpl;
2667 2667 lgrp_leaf = leaf_lpl->lpl_lgrp;
2668 2668
2669 2669 /* don't delete a leaf that isn't there */
2670 2670 ASSERT(LGRP_EXISTS(lgrp_leaf));
2671 2671
2672 2672 /* no double-deletes */
2673 2673 ASSERT(lpl->lpl_ncpu);
2674 2674 if (--lpl->lpl_ncpu == 0) {
2675 2675 /*
2676 2676 * This was the last cpu in this lgroup for this partition,
2677 2677 * clear its bit in the partition's lgroup bitmask
2678 2678 */
2679 2679 klgrpset_del(cp->cpu_part->cp_lgrpset, lpl->lpl_lgrpid);
2680 2680
2681 2681 		/* eliminate remaining lpl link pointers in cpu, lpl */
2682 2682 lpl->lpl_cpus = cp->cpu_next_lpl = cp->cpu_prev_lpl = NULL;
2683 2683
2684 2684 lpl_leaf_remove(leaf_lpl, cp->cpu_part);
2685 2685 } else {
2686 2686
2687 2687 /* unlink cpu from lists of cpus in lpl */
2688 2688 cp->cpu_prev_lpl->cpu_next_lpl = cp->cpu_next_lpl;
2689 2689 cp->cpu_next_lpl->cpu_prev_lpl = cp->cpu_prev_lpl;
2690 2690 if (lpl->lpl_cpus == cp) {
2691 2691 lpl->lpl_cpus = cp->cpu_next_lpl;
2692 2692 }
2693 2693
2694 2694 /*
2695 2695 * Update the cpu count in the lpls associated with parent
2696 2696 * lgroups.
2697 2697 */
2698 2698 lpl_cpu_adjcnt(LPL_DECREMENT, cp);
2699 2699
2700 2700 }
2701 2701 /* clear cpu's lpl ptr when we're all done */
2702 2702 cp->cpu_lpl = NULL;
2703 2703 }
2704 2704
2705 2705 /*
2706 2706 * Recompute load average for the specified partition/lgrp fragment.
2707 2707 *
2708 2708 * We rely on the fact that this routine is called from the clock thread
2709 2709 * at a point before the clock thread can block (i.e. before its first
2710 2710 * lock request). Since the clock thread can not be preempted (since it
2711 2711 * runs at highest priority), we know that cpu partitions can not change
2712 2712 * (since doing so would require either the repartition requester or the
2713 2713 * cpu_pause thread to run on this cpu), so we can update the cpu's load
2714 2714 * without grabbing cpu_lock.
2715 2715 */
2716 2716 void
2717 2717 lgrp_loadavg(lpl_t *lpl, uint_t nrcpus, int ageflag)
2718 2718 {
2719 2719 uint_t ncpu;
2720 2720 int64_t old, new, f;
2721 2721
2722 2722 /*
2723 2723 * 1 - exp(-1/(20 * ncpu)) << 13 = 400 for 1 cpu...
2724 2724 */
2725 2725 static short expval[] = {
2726 2726 0, 3196, 1618, 1083,
2727 2727 814, 652, 543, 466,
2728 2728 408, 363, 326, 297,
2729 2729 272, 251, 233, 218,
2730 2730 204, 192, 181, 172,
2731 2731 163, 155, 148, 142,
2732 2732 136, 130, 125, 121,
2733 2733 116, 112, 109, 105
2734 2734 };
2735 2735
2736 2736 /* ASSERT (called from clock level) */
2737 2737
2738 2738 if ((lpl == NULL) || /* we're booting - this is easiest for now */
2739 2739 ((ncpu = lpl->lpl_ncpu) == 0)) {
2740 2740 return;
2741 2741 }
2742 2742
2743 2743 for (;;) {
2744 2744
2745 2745 if (ncpu >= sizeof (expval) / sizeof (expval[0]))
2746 2746 f = expval[1]/ncpu; /* good approx. for large ncpu */
2747 2747 else
2748 2748 f = expval[ncpu];
2749 2749
2750 2750 /*
2751 2751 * Modify the load average atomically to avoid losing
2752 2752 * anticipatory load updates (see lgrp_move_thread()).
2753 2753 */
2754 2754 if (ageflag) {
2755 2755 /*
2756 2756 * We're supposed to both update and age the load.
2757 2757 * This happens 10 times/sec. per cpu. We do a
2758 2758 * little hoop-jumping to avoid integer overflow.
2759 2759 */
2760 2760 int64_t q, r;
2761 2761
2762 2762 do {
2763 2763 old = new = lpl->lpl_loadavg;
2764 2764 q = (old >> 16) << 7;
2765 2765 r = (old & 0xffff) << 7;
2766 2766 new += ((long long)(nrcpus - q) * f -
2767 2767 ((r * f) >> 16)) >> 7;
2768 2768
2769 2769 /*
2770 2770 * Check for overflow
2771 2771 */
2772 2772 if (new > LGRP_LOADAVG_MAX)
2773 2773 new = LGRP_LOADAVG_MAX;
2774 2774 else if (new < 0)
2775 2775 new = 0;
2776 2776 } while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old,
2777 2777 new) != old);
2778 2778 } else {
2779 2779 /*
2780 2780 * We're supposed to update the load, but not age it.
2781 2781 * This option is used to update the load (which either
2782 2782 * has already been aged in this 1/10 sec. interval or
2783 2783 * soon will be) to account for a remotely executing
2784 2784 * thread.
2785 2785 */
2786 2786 do {
2787 2787 old = new = lpl->lpl_loadavg;
2788 2788 new += f;
2789 2789 /*
2790 2790 * Check for overflow
2791 2791 * Underflow not possible here
2792 2792 */
2793 2793 if (new < old)
2794 2794 new = LGRP_LOADAVG_MAX;
2795 2795 } while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old,
2796 2796 new) != old);
2797 2797 }
2798 2798
2799 2799 /*
2800 2800 * Do the same for this lpl's parent
2801 2801 */
2802 2802 if ((lpl = lpl->lpl_parent) == NULL)
2803 2803 break;
2804 2804 ncpu = lpl->lpl_ncpu;
2805 2805 }
2806 2806 }
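
/*
 * For reference: ignoring integer truncation, the aging branch above is
 * algebraically equivalent to
 *
 *	new = old * (1 - f/2^16) + nrcpus * (f/2^7)
 *
 * (write old = (old >> 16) * 2^16 + (old & 0xffff) and expand q and r).
 * That is, the previous load decays by the per-update factor f/2^16 drawn
 * from expval[], while the caller-supplied nrcpus term is mixed in scaled
 * by f/2^7.  The non-aging branch simply adds f as an anticipatory bump.
 */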
2807 2807
2808 2808 /*
2809 2809 * Initialize lpl topology in the target based on topology currently present in
2810 2810 * lpl_bootstrap.
2811 2811 *
2812 2812 * lpl_topo_bootstrap is only called once from cpupart_initialize_default() to
2813 2813 * initialize cp_default list of lpls. Up to this point all topology operations
2814 2814 * were performed using lpl_bootstrap. Now cp_default has its own list of lpls
2815 2815 * and all subsequent lpl operations should use it instead of lpl_bootstrap. The
2816 2816 * `target' points to the list of lpls in cp_default and `size' is the size of
2817 2817 * this list.
2818 2818 *
2819 2819  * This function walks the lpl topology in lpl_bootstrap and does four things:
2820 2820 *
2821 2821 * 1) Copies all fields from lpl_bootstrap to the target.
2822 2822 *
2823 2823 * 2) Sets CPU0 lpl pointer to the correct element of the target list.
2824 2824 *
2825 2825 * 3) Updates lpl_parent pointers to point to the lpls in the target list
2826 2826 * instead of lpl_bootstrap.
2827 2827 *
2828 2828 * 4) Updates pointers in the resource list of the target to point to the lpls
2829 2829 * in the target list instead of lpl_bootstrap.
2830 2830 *
2831 2831 * After lpl_topo_bootstrap() completes, target contains the same information
2832 2832 * that would be present there if it were used during boot instead of
2833 2833  * lpl_bootstrap. The information in lpl_bootstrap is not needed after this,
2834 2834  * so it is bzeroed.
2835 2835 */
2836 2836 void
2837 2837 lpl_topo_bootstrap(lpl_t *target, int size)
2838 2838 {
2839 2839 lpl_t *lpl = lpl_bootstrap;
2840 2840 lpl_t *target_lpl = target;
2841 2841 lpl_t **rset;
2842 2842 int *id2rset;
2843 2843 int sz;
2844 2844 int howmany;
2845 2845 int id;
2846 2846 int i;
2847 2847
2848 2848 /*
2849 2849 * The only target that should be passed here is cp_default lpl list.
2850 2850 */
2851 2851 ASSERT(target == cp_default.cp_lgrploads);
2852 2852 ASSERT(size == cp_default.cp_nlgrploads);
2853 2853 ASSERT(!lgrp_topo_initialized);
2854 2854 ASSERT(ncpus == 1);
2855 2855
2856 2856 howmany = MIN(LPL_BOOTSTRAP_SIZE, size);
2857 2857 for (i = 0; i < howmany; i++, lpl++, target_lpl++) {
2858 2858 /*
2859 2859 * Copy all fields from lpl, except for the rset,
2860 2860 * lgrp id <=> rset mapping storage,
2861 2861 * and amount of storage
2862 2862 */
2863 2863 rset = target_lpl->lpl_rset;
2864 2864 id2rset = target_lpl->lpl_id2rset;
2865 2865 sz = target_lpl->lpl_rset_sz;
2866 2866
2867 2867 *target_lpl = *lpl;
2868 2868
2869 2869 target_lpl->lpl_rset_sz = sz;
2870 2870 target_lpl->lpl_rset = rset;
2871 2871 target_lpl->lpl_id2rset = id2rset;
2872 2872
2873 2873 /*
2874 2874 * Substitute CPU0 lpl pointer with one relative to target.
2875 2875 */
2876 2876 if (lpl->lpl_cpus == CPU) {
2877 2877 ASSERT(CPU->cpu_lpl == lpl);
2878 2878 CPU->cpu_lpl = target_lpl;
2879 2879 }
2880 2880
2881 2881 /*
2882 2882 * Substitute parent information with parent relative to target.
2883 2883 */
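		/*
		 * The byte offset from lpl_bootstrap is preserved, so a
		 * parent pointer at &lpl_bootstrap[k] is rebased to
		 * &target[k].
		 */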
2884 2884 if (lpl->lpl_parent != NULL)
2885 2885 target_lpl->lpl_parent = (lpl_t *)
2886 2886 (((uintptr_t)lpl->lpl_parent -
2887 2887 (uintptr_t)lpl_bootstrap) +
2888 2888 (uintptr_t)target);
2889 2889
2890 2890 /*
2891 2891 * Walk over resource set substituting pointers relative to
2892 2892 * lpl_bootstrap's rset to pointers relative to target's
2893 2893 */
2894 2894 ASSERT(lpl->lpl_nrset <= 1);
2895 2895
2896 2896 for (id = 0; id < lpl->lpl_nrset; id++) {
2897 2897 if (lpl->lpl_rset[id] != NULL) {
2898 2898 target_lpl->lpl_rset[id] = (lpl_t *)
2899 2899 (((uintptr_t)lpl->lpl_rset[id] -
2900 2900 (uintptr_t)lpl_bootstrap) +
2901 2901 (uintptr_t)target);
2902 2902 }
2903 2903 target_lpl->lpl_id2rset[id] =
2904 2904 lpl->lpl_id2rset[id];
2905 2905 }
2906 2906 }
2907 2907
2908 2908 /*
2909 2909 * Clean up the bootstrap lpls since we have switched over to the
2910 2910 * actual lpl array in the default cpu partition.
2911 2911 *
2912 2912 * We still need to keep one empty lpl around for newly starting
2913 2913 * slave CPUs to reference should they need to make it through the
2914 2914 * dispatcher prior to their lgrp/lpl initialization.
2915 2915 *
2916 2916 * The lpl related dispatcher code has been designed to work properly
2917 2917 * (and without extra checks) for this special case of a zero'ed
2918 2918 * bootstrap lpl. Such an lpl appears to the dispatcher as an lpl
2919 2919 * with lgrpid 0 and an empty resource set. Iteration over the rset
2920 2920 * array by the dispatcher is also NULL terminated for this reason.
2921 2921 *
2922 2922 * This provides the desired behaviour for an uninitialized CPU.
2923 2923 * It shouldn't see any other CPU to either dispatch to or steal
2924 2924 * from until it is properly initialized.
2925 2925 */
2926 2926 bzero(lpl_bootstrap_list, sizeof (lpl_bootstrap_list));
2927 2927 bzero(lpl_bootstrap_id2rset, sizeof (lpl_bootstrap_id2rset));
2928 2928 bzero(lpl_bootstrap_rset, sizeof (lpl_bootstrap_rset));
2929 2929
2930 2930 lpl_bootstrap_list[0].lpl_rset = lpl_bootstrap_rset;
2931 2931 lpl_bootstrap_list[0].lpl_id2rset = lpl_bootstrap_id2rset;
2932 2932 }
2933 2933
2934 2934 /*
2935 2935 * If the lowest load among the lgroups a process' threads are currently
2936 2936 * spread across is greater than lgrp_expand_proc_thresh, we'll consider
2937 2937 * expanding the process to a new lgroup.
2938 2938 */
2939 2939 #define LGRP_EXPAND_PROC_THRESH_DEFAULT 62250
2940 2940 lgrp_load_t lgrp_expand_proc_thresh = LGRP_EXPAND_PROC_THRESH_DEFAULT;
2941 2941
2942 2942 #define LGRP_EXPAND_PROC_THRESH(ncpu) \
2943 2943 ((lgrp_expand_proc_thresh) / (ncpu))
2944 2944
2945 2945 /*
2946 2946 * A process will be expanded to a new lgroup only if the difference between
2947 2947  * the lowest load on the lgroups the process' threads are currently spread
2948 2948 * across and the lowest load on the other lgroups in the process' partition
2949 2949 * is greater than lgrp_expand_proc_diff.
2950 2950 */
2951 2951 #define LGRP_EXPAND_PROC_DIFF_DEFAULT 60000
2952 2952 lgrp_load_t lgrp_expand_proc_diff = LGRP_EXPAND_PROC_DIFF_DEFAULT;
2953 2953
2954 2954 #define LGRP_EXPAND_PROC_DIFF(ncpu) \
2955 2955 ((lgrp_expand_proc_diff) / (ncpu))
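
/*
 * A worked example with the defaults above, assuming both the currently
 * used lgroup and the remote candidate have 4 CPUs each:
 * LGRP_EXPAND_PROC_THRESH(4) = 62250 / 4 = 15562 and
 * LGRP_EXPAND_PROC_DIFF(4) = 60000 / 4 = 15000, so lgrp_choose() below will
 * only spill the process onto the remote lgroup when the best load among
 * its current lgroups exceeds 15562 and the remote lgroup's load is more
 * than 15000 below that.
 */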
2956 2956
2957 2957 /*
2958 2958 * The loadavg tolerance accounts for "noise" inherent in the load, which may
2959 2959 * be present due to impreciseness of the load average decay algorithm.
2960 2960 *
2961 2961 * The default tolerance is lgrp_loadavg_max_effect. Note that the tunable
2962 2962 * tolerance is scaled by the number of cpus in the lgroup just like
2963 2963 * lgrp_loadavg_max_effect. For example, if lgrp_loadavg_tolerance = 0x10000,
2964 2964 * and ncpu = 4, then lgrp_choose will consider differences in lgroup loads
2965 2965 * of: 0x10000 / 4 => 0x4000 or greater to be significant.
2966 2966 */
2967 2967 uint32_t lgrp_loadavg_tolerance = LGRP_LOADAVG_THREAD_MAX;
2968 2968 #define LGRP_LOADAVG_TOLERANCE(ncpu) \
2969 2969 ((lgrp_loadavg_tolerance) / ncpu)
2970 2970
2971 2971 /*
2972 2972 * lgrp_choose() will choose root lgroup as home when lowest lgroup load
2973 2973 * average is above this threshold
2974 2974 */
2975 2975 uint32_t lgrp_load_thresh = UINT32_MAX;
2976 2976
2977 2977 /*
2978 2978 * lgrp_choose() will try to skip any lgroups with less memory
2979 2979 * than this free when choosing a home lgroup
2980 2980 */
2981 2981 pgcnt_t lgrp_mem_free_thresh = 0;
2982 2982
2983 2983 /*
2984 2984 * When choosing between similarly loaded lgroups, lgrp_choose() will pick
2985 2985 * one based on one of the following policies:
2986 2986 * - Random selection
2987 2987 * - Pseudo round robin placement
2988 2988 * - Longest time since a thread was last placed
2989 2989 */
2990 2990 #define LGRP_CHOOSE_RANDOM 1
2991 2991 #define LGRP_CHOOSE_RR 2
2992 2992 #define LGRP_CHOOSE_TIME 3
2993 2993
2994 2994 int lgrp_choose_policy = LGRP_CHOOSE_TIME;
2995 2995
2996 2996 /*
2997 2997 * Choose a suitable leaf lgroup for a kthread. The kthread is assumed not to
2998 2998 * be bound to a CPU or processor set.
2999 2999 *
3000 3000 * Arguments:
3001 3001 * t The thread
3002 3002 * cpupart The partition the thread belongs to.
3003 3003 *
3004 3004 * NOTE: Should at least be called with the cpu_lock held, kernel preemption
3005 3005 * disabled, or thread_lock held (at splhigh) to protect against the CPU
3006 3006 * partitions changing out from under us and assumes that given thread is
3007 3007 * protected. Also, called sometimes w/ cpus paused or kernel preemption
3008 3008 * disabled, so don't grab any locks because we should never block under
3009 3009 * those conditions.
3010 3010 */
3011 3011 lpl_t *
3012 3012 lgrp_choose(kthread_t *t, cpupart_t *cpupart)
3013 3013 {
3014 3014 lgrp_load_t bestload, bestrload;
3015 3015 int lgrpid_offset, lgrp_count;
3016 3016 lgrp_id_t lgrpid, lgrpid_start;
3017 3017 lpl_t *lpl, *bestlpl, *bestrlpl;
3018 3018 klgrpset_t lgrpset;
3019 3019 proc_t *p;
3020 3020
3021 3021 ASSERT(t != NULL);
3022 3022 ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 ||
3023 3023 THREAD_LOCK_HELD(t));
3024 3024 ASSERT(cpupart != NULL);
3025 3025
3026 3026 p = t->t_procp;
3027 3027
3028 3028 /* A process should always be in an active partition */
3029 3029 ASSERT(!klgrpset_isempty(cpupart->cp_lgrpset));
3030 3030
3031 3031 bestlpl = bestrlpl = NULL;
3032 3032 bestload = bestrload = LGRP_LOADAVG_MAX;
3033 3033 lgrpset = cpupart->cp_lgrpset;
3034 3034
3035 3035 switch (lgrp_choose_policy) {
3036 3036 case LGRP_CHOOSE_RR:
3037 3037 lgrpid = cpupart->cp_lgrp_hint;
3038 3038 do {
3039 3039 if (++lgrpid > lgrp_alloc_max)
3040 3040 lgrpid = 0;
3041 3041 } while (!klgrpset_ismember(lgrpset, lgrpid));
3042 3042
3043 3043 break;
3044 3044 default:
3045 3045 case LGRP_CHOOSE_TIME:
3046 3046 case LGRP_CHOOSE_RANDOM:
3047 3047 klgrpset_nlgrps(lgrpset, lgrp_count);
3048 3048 lgrpid_offset =
3049 3049 (((ushort_t)(gethrtime() >> 4)) % lgrp_count) + 1;
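		/*
		 * lgrpid_offset is a 1-based index into the members of
		 * lgrpset; the loop below stops at the lgrpid_offset'th
		 * lgroup that is actually present in this partition.
		 */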
3050 3050 for (lgrpid = 0; ; lgrpid++) {
3051 3051 if (klgrpset_ismember(lgrpset, lgrpid)) {
3052 3052 if (--lgrpid_offset == 0)
3053 3053 break;
3054 3054 }
3055 3055 }
3056 3056 break;
3057 3057 }
3058 3058
3059 3059 lgrpid_start = lgrpid;
3060 3060
3061 3061 DTRACE_PROBE2(lgrp_choose_start, lgrp_id_t, lgrpid_start,
3062 3062 lgrp_id_t, cpupart->cp_lgrp_hint);
3063 3063
3064 3064 /*
3065 3065 * Use lgroup affinities (if any) to choose best lgroup
3066 3066 *
3067 3067 * NOTE: Assumes that thread is protected from going away and its
3068 3068 * lgroup affinities won't change (ie. p_lock, or
3069 3069 * thread_lock() being held and/or CPUs paused)
3070 3070 */
3071 3071 if (t->t_lgrp_affinity) {
3072 3072 lpl = lgrp_affinity_best(t, cpupart, lgrpid_start, B_FALSE);
3073 3073 if (lpl != NULL)
3074 3074 return (lpl);
3075 3075 }
3076 3076
3077 3077 ASSERT(klgrpset_ismember(lgrpset, lgrpid_start));
3078 3078
3079 3079 do {
3080 3080 pgcnt_t npgs;
3081 3081
3082 3082 /*
3083 3083 * Skip any lgroups outside of thread's pset
3084 3084 */
3085 3085 if (!klgrpset_ismember(lgrpset, lgrpid)) {
3086 3086 if (++lgrpid > lgrp_alloc_max)
3087 3087 lgrpid = 0; /* wrap the search */
3088 3088 continue;
3089 3089 }
3090 3090
3091 3091 /*
3092 3092 * Skip any non-leaf lgroups
3093 3093 */
3094 3094 if (lgrp_table[lgrpid]->lgrp_childcnt != 0)
3095 3095 continue;
3096 3096
3097 3097 /*
3098 3098 * Skip any lgroups without enough free memory
3099 3099 * (when threshold set to nonzero positive value)
3100 3100 */
3101 3101 if (lgrp_mem_free_thresh > 0) {
3102 3102 npgs = lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE);
3103 3103 if (npgs < lgrp_mem_free_thresh) {
3104 3104 if (++lgrpid > lgrp_alloc_max)
3105 3105 lgrpid = 0; /* wrap the search */
3106 3106 continue;
3107 3107 }
3108 3108 }
3109 3109
3110 3110 lpl = &cpupart->cp_lgrploads[lgrpid];
3111 3111 if (klgrpset_isempty(p->p_lgrpset) ||
3112 3112 klgrpset_ismember(p->p_lgrpset, lgrpid)) {
3113 3113 /*
3114 3114 * Either this is a new process or the process already
3115 3115 * has threads on this lgrp, so this is a preferred
3116 3116 * lgroup for the thread.
3117 3117 */
3118 3118 if (bestlpl == NULL ||
3119 3119 lpl_pick(lpl, bestlpl)) {
3120 3120 bestload = lpl->lpl_loadavg;
3121 3121 bestlpl = lpl;
3122 3122 }
3123 3123 } else {
3124 3124 /*
3125 3125 * The process doesn't have any threads on this lgrp,
3126 3126 * but we're willing to consider this lgrp if the load
3127 3127 * difference is big enough to justify splitting up
3128 3128 * the process' threads.
3129 3129 */
3130 3130 if (bestrlpl == NULL ||
3131 3131 lpl_pick(lpl, bestrlpl)) {
3132 3132 bestrload = lpl->lpl_loadavg;
3133 3133 bestrlpl = lpl;
3134 3134 }
3135 3135 }
3136 3136 if (++lgrpid > lgrp_alloc_max)
3137 3137 lgrpid = 0; /* wrap the search */
3138 3138 } while (lgrpid != lgrpid_start);
3139 3139
3140 3140 /*
3141 3141 * Return root lgroup if threshold isn't set to maximum value and
3142 3142  * the lowest lgroup load average is more than a certain threshold
3143 3143 */
3144 3144 if (lgrp_load_thresh != UINT32_MAX &&
3145 3145 bestload >= lgrp_load_thresh && bestrload >= lgrp_load_thresh)
3146 3146 return (&cpupart->cp_lgrploads[lgrp_root->lgrp_id]);
3147 3147
3148 3148 /*
3149 3149 * If all the lgroups over which the thread's process is spread are
3150 3150 * heavily loaded, or otherwise undesirable, we'll consider placing
3151 3151 * the thread on one of the other leaf lgroups in the thread's
3152 3152 * partition.
3153 3153 */
3154 3154 if ((bestlpl == NULL) ||
3155 3155 ((bestload > LGRP_EXPAND_PROC_THRESH(bestlpl->lpl_ncpu)) &&
3156 3156 (bestrload < bestload) && /* paranoid about wraparound */
3157 3157 (bestrload + LGRP_EXPAND_PROC_DIFF(bestrlpl->lpl_ncpu) <
3158 3158 bestload))) {
3159 3159 bestlpl = bestrlpl;
3160 3160 }
3161 3161
3162 3162 if (bestlpl == NULL) {
3163 3163 /*
3164 3164 * No lgroup looked particularly good, but we still
3165 3165 * have to pick something. Go with the randomly selected
3166 3166 * legal lgroup we started with above.
3167 3167 */
3168 3168 bestlpl = &cpupart->cp_lgrploads[lgrpid_start];
3169 3169 }
3170 3170
3171 3171 cpupart->cp_lgrp_hint = bestlpl->lpl_lgrpid;
3172 3172 bestlpl->lpl_homed_time = gethrtime_unscaled();
3173 3173
3174 3174 ASSERT(bestlpl->lpl_ncpu > 0);
3175 3175 return (bestlpl);
3176 3176 }
3177 3177
3178 3178 /*
3179 3179 * Decide if lpl1 is a better candidate than lpl2 for lgrp homing.
3180 3180 * Returns non-zero if lpl1 is a better candidate, and 0 otherwise.
3181 3181 */
3182 3182 static int
3183 3183 lpl_pick(lpl_t *lpl1, lpl_t *lpl2)
3184 3184 {
3185 3185 lgrp_load_t l1, l2;
3186 3186 lgrp_load_t tolerance = LGRP_LOADAVG_TOLERANCE(lpl1->lpl_ncpu);
3187 3187
3188 3188 l1 = lpl1->lpl_loadavg;
3189 3189 l2 = lpl2->lpl_loadavg;
3190 3190
3191 3191 if ((l1 + tolerance < l2) && (l1 < l2)) {
3192 3192 /* lpl1 is significantly less loaded than lpl2 */
3193 3193 return (1);
3194 3194 }
3195 3195
3196 3196 if (lgrp_choose_policy == LGRP_CHOOSE_TIME &&
3197 3197 l1 + tolerance >= l2 && l1 < l2 &&
3198 3198 lpl1->lpl_homed_time < lpl2->lpl_homed_time) {
3199 3199 /*
3200 3200 * lpl1's load is within the tolerance of lpl2. We're
3201 3201 		 * willing to consider it to be better, however, if
3202 3202 * it has been longer since we last homed a thread there
3203 3203 */
3204 3204 return (1);
3205 3205 }
3206 3206
3207 3207 return (0);
3208 3208 }
3209 3209
3210 3210 /*
3211 3211  * lgrp_trthr_moves counts the number of times the main thread (t_tid = 1) of a
3212 3212 * process that uses text replication changed home lgrp. This info is used by
3213 3213  * the segvn asynchronous thread to detect if it needs to recheck what lgrps
3214 3214 * should be used for text replication.
3215 3215 */
3216 3216 static uint64_t lgrp_trthr_moves = 0;
3217 3217
3218 3218 uint64_t
3219 3219 lgrp_get_trthr_migrations(void)
3220 3220 {
3221 3221 return (lgrp_trthr_moves);
3222 3222 }
3223 3223
3224 3224 void
3225 3225 lgrp_update_trthr_migrations(uint64_t incr)
3226 3226 {
3227 3227 atomic_add_64(&lgrp_trthr_moves, incr);
3228 3228 }
3229 3229
3230 3230 /*
3231 3231 * An LWP is expected to be assigned to an lgroup for at least this long
3232 3232 * for its anticipatory load to be justified. NOTE that this value should
3233 3233 * not be set extremely huge (say, larger than 100 years), to avoid problems
3234 3234 * with overflow in the calculation that uses it.
3235 3235 */
3236 3236 #define LGRP_MIN_NSEC (NANOSEC / 10) /* 1/10 of a second */
3237 3237 hrtime_t lgrp_min_nsec = LGRP_MIN_NSEC;
3238 3238
3239 3239 /*
3240 3240 * Routine to change a thread's lgroup affiliation. This routine updates
3241 3241 * the thread's kthread_t struct and its process' proc_t struct to note the
3242 3242 * thread's new lgroup affiliation, and its lgroup affinities.
3243 3243 *
3244 3244 * Note that this is the only routine that modifies a thread's t_lpl field,
3245 3245 * and that adds in or removes anticipatory load.
3246 3246 *
3247 3247 * If the thread is exiting, newlpl is NULL.
3248 3248 *
3249 3249 * Locking:
3250 3250 * The following lock must be held on entry:
3251 3251 * cpu_lock, kpreempt_disable(), or thread_lock -- to assure t's new lgrp
3252 3252 * doesn't get removed from t's partition
3253 3253 *
3254 3254 * This routine is not allowed to grab any locks, since it may be called
3255 3255 * with cpus paused (such as from cpu_offline).
3256 3256 */
3257 3257 void
3258 3258 lgrp_move_thread(kthread_t *t, lpl_t *newlpl, int do_lgrpset_delete)
3259 3259 {
3260 3260 proc_t *p;
3261 3261 lpl_t *lpl, *oldlpl;
3262 3262 lgrp_id_t oldid;
3263 3263 kthread_t *tp;
3264 3264 uint_t ncpu;
3265 3265 lgrp_load_t old, new;
3266 3266
3267 3267 ASSERT(t);
3268 3268 ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 ||
3269 3269 THREAD_LOCK_HELD(t));
3270 3270
3271 3271 /*
3272 3272 * If not changing lpls, just return
3273 3273 */
3274 3274 if ((oldlpl = t->t_lpl) == newlpl)
3275 3275 return;
3276 3276
3277 3277 /*
3278 3278 * Make sure the thread's lwp hasn't exited (if so, this thread is now
3279 3279 * associated with process 0 rather than with its original process).
3280 3280 */
3281 3281 if (t->t_proc_flag & TP_LWPEXIT) {
3282 3282 if (newlpl != NULL) {
3283 3283 t->t_lpl = newlpl;
3284 3284 }
3285 3285 return;
3286 3286 }
3287 3287
3288 3288 p = ttoproc(t);
3289 3289
3290 3290 /*
3291 3291 * If the thread had a previous lgroup, update its process' p_lgrpset
3292 3292 * to account for it being moved from its old lgroup.
3293 3293 */
3294 3294 if ((oldlpl != NULL) && /* thread had a previous lgroup */
3295 3295 (p->p_tlist != NULL)) {
3296 3296 oldid = oldlpl->lpl_lgrpid;
3297 3297
3298 3298 if (newlpl != NULL)
3299 3299 lgrp_stat_add(oldid, LGRP_NUM_MIGR, 1);
3300 3300
3301 3301 if ((do_lgrpset_delete) &&
3302 3302 (klgrpset_ismember(p->p_lgrpset, oldid))) {
3303 3303 for (tp = p->p_tlist->t_forw; ; tp = tp->t_forw) {
3304 3304 /*
3305 3305 * Check if a thread other than the thread
3306 3306 * that's moving is assigned to the same
3307 3307 * lgroup as the thread that's moving. Note
3308 3308 * that we have to compare lgroup IDs, rather
3309 3309 * than simply comparing t_lpl's, since the
3310 3310 * threads may belong to different partitions
3311 3311 * but be assigned to the same lgroup.
3312 3312 */
3313 3313 ASSERT(tp->t_lpl != NULL);
3314 3314
3315 3315 if ((tp != t) &&
3316 3316 (tp->t_lpl->lpl_lgrpid == oldid)) {
3317 3317 /*
3318 3318 * Another thread is assigned to the
3319 3319 * same lgroup as the thread that's
3320 3320 * moving, p_lgrpset doesn't change.
3321 3321 */
3322 3322 break;
3323 3323 } else if (tp == p->p_tlist) {
3324 3324 /*
3325 3325 * No other thread is assigned to the
3326 3326 * same lgroup as the exiting thread,
3327 3327 * clear the lgroup's bit in p_lgrpset.
3328 3328 */
3329 3329 klgrpset_del(p->p_lgrpset, oldid);
3330 3330 break;
3331 3331 }
3332 3332 }
3333 3333 }
3334 3334
3335 3335 /*
3336 3336 * If this thread was assigned to its old lgroup for such a
3337 3337 * short amount of time that the anticipatory load that was
3338 3338 * added on its behalf has aged very little, remove that
3339 3339 * anticipatory load.
3340 3340 */
3341 3341 if ((t->t_anttime + lgrp_min_nsec > gethrtime()) &&
3342 3342 ((ncpu = oldlpl->lpl_ncpu) > 0)) {
3343 3343 lpl = oldlpl;
3344 3344 for (;;) {
3345 3345 do {
3346 3346 old = new = lpl->lpl_loadavg;
3347 3347 new -= LGRP_LOADAVG_MAX_EFFECT(ncpu);
3348 3348 if (new > old) {
3349 3349 /*
3350 3350 * this can happen if the load
3351 3351 * average was aged since we
3352 3352 * added in the anticipatory
3353 3353 * load
3354 3354 */
3355 3355 new = 0;
3356 3356 }
3357 3357 } while (cas32(
3358 3358 (lgrp_load_t *)&lpl->lpl_loadavg, old,
3359 3359 new) != old);
3360 3360
3361 3361 lpl = lpl->lpl_parent;
3362 3362 if (lpl == NULL)
3363 3363 break;
3364 3364
3365 3365 ncpu = lpl->lpl_ncpu;
3366 3366 ASSERT(ncpu > 0);
3367 3367 }
3368 3368 }
3369 3369 }
3370 3370 /*
3371 3371 * If the thread has a new lgroup (i.e. it's not exiting), update its
3372 3372 * t_lpl and its process' p_lgrpset, and apply an anticipatory load
3373 3373 * to its new lgroup to account for its move to its new lgroup.
3374 3374 */
3375 3375 if (newlpl != NULL) {
3376 3376 /*
3377 3377 * This thread is moving to a new lgroup
3378 3378 */
3379 3379 t->t_lpl = newlpl;
3380 3380 if (t->t_tid == 1 && p->p_t1_lgrpid != newlpl->lpl_lgrpid) {
3381 3381 p->p_t1_lgrpid = newlpl->lpl_lgrpid;
3382 3382 membar_producer();
3383 3383 if (p->p_tr_lgrpid != LGRP_NONE &&
3384 3384 p->p_tr_lgrpid != p->p_t1_lgrpid) {
3385 3385 lgrp_update_trthr_migrations(1);
3386 3386 }
3387 3387 }
3388 3388
3389 3389 /*
3390 3390 * Reflect move in load average of new lgroup
3391 3391 * unless it is root lgroup
3392 3392 */
3393 3393 if (lgrp_table[newlpl->lpl_lgrpid] == lgrp_root)
3394 3394 return;
3395 3395
3396 3396 if (!klgrpset_ismember(p->p_lgrpset, newlpl->lpl_lgrpid)) {
3397 3397 klgrpset_add(p->p_lgrpset, newlpl->lpl_lgrpid);
3398 3398 }
3399 3399
3400 3400 		/*
3401 3401 		 * It'll take some time for the load on the new lgroup
3402 3402 		 * to reflect this thread's placement on it. We would
3403 3403 		 * not, however, like all threads between now and then
3404 3404 		 * to also pile on to this lgroup. To avoid this
3405 3405 		 * pileup, we anticipate the load this thread will
3406 3406 		 * generate on its new lgroup. The goal is to make the
3407 3407 		 * lgroup's load appear as though the thread had been
3408 3408 		 * there all along. We're very conservative in
3409 3409 		 * calculating this anticipatory load; we assume the
3410 3410 		 * worst case (100% CPU-bound thread). This may be
3411 3411 		 * modified in the future to be more accurate.
3412 3412 		 */
3413 3413 lpl = newlpl;
3414 3414 for (;;) {
3415 3415 ncpu = lpl->lpl_ncpu;
3416 3416 ASSERT(ncpu > 0);
3417 3417 do {
3418 3418 old = new = lpl->lpl_loadavg;
3419 3419 new += LGRP_LOADAVG_MAX_EFFECT(ncpu);
3420 3420 /*
3421 3421 * Check for overflow
3422 3422 * Underflow not possible here
3423 3423 */
3424 3424 if (new < old)
3425 3425 new = UINT32_MAX;
3426 3426 } while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old,
3427 3427 new) != old);
3428 3428
3429 3429 lpl = lpl->lpl_parent;
3430 3430 if (lpl == NULL)
3431 3431 break;
3432 3432 }
3433 3433 t->t_anttime = gethrtime();
3434 3434 }
3435 3435 }
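
/*
 * A standalone userland sketch (not part of this file) of the
 * anticipatory-load updates above, which use a cas32() retry loop that
 * saturates instead of wrapping.  The sketch substitutes C11 atomics for
 * the kernel's cas32(); the toy_* names are hypothetical.
 */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static _Atomic uint32_t toy_loadavg;

/* Add "delta" to the load average, clamping at UINT32_MAX on overflow. */
static void
toy_load_add(uint32_t delta)
{
	uint32_t old, new;

	old = atomic_load(&toy_loadavg);
	do {
		new = old + delta;
		if (new < old)			/* wrapped around */
			new = UINT32_MAX;	/* saturate instead */
	} while (!atomic_compare_exchange_weak(&toy_loadavg, &old, new));
}

/* Subtract "delta", clamping at 0 if the value has already aged below it. */
static void
toy_load_sub(uint32_t delta)
{
	uint32_t old, new;

	old = atomic_load(&toy_loadavg);
	do {
		new = old - delta;
		if (new > old)			/* wrapped below zero */
			new = 0;
	} while (!atomic_compare_exchange_weak(&toy_loadavg, &old, new));
}

int
main(void)
{
	toy_load_add(UINT32_MAX - 5);
	toy_load_add(100);			/* saturates at UINT32_MAX */
	toy_load_sub(50);
	printf("%u\n", (unsigned)atomic_load(&toy_loadavg));
	return (0);
}
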
3436 3436
3437 3437 /*
3438 3438 * Return lgroup memory allocation policy given advice from madvise(3C)
3439 3439 */
3440 3440 lgrp_mem_policy_t
3441 3441 lgrp_madv_to_policy(uchar_t advice, size_t size, int type)
3442 3442 {
3443 3443 switch (advice) {
3444 3444 case MADV_ACCESS_LWP:
3445 3445 return (LGRP_MEM_POLICY_NEXT);
3446 3446 case MADV_ACCESS_MANY:
3447 3447 return (LGRP_MEM_POLICY_RANDOM);
3448 3448 default:
3449 3449 return (lgrp_mem_policy_default(size, type));
3450 3450 }
3451 3451 }
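
/*
 * A standalone, Solaris-specific userland sketch (not part of this file)
 * showing where the advice above comes from: madvise(3C) on a private
 * mapping with MADV_ACCESS_LWP asks that pages be placed near the LWP that
 * next touches them, which the kernel maps to LGRP_MEM_POLICY_NEXT.
 */
#include <sys/types.h>
#include <sys/mman.h>
#include <stdio.h>

int
main(void)
{
	size_t len = 1024 * 1024;
	void *buf;

	buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
	    MAP_PRIVATE | MAP_ANON, -1, 0);
	if (buf == MAP_FAILED) {
		perror("mmap");
		return (1);
	}

	/* Ask that pages be placed near the LWP that next touches them. */
	if (madvise((caddr_t)buf, len, MADV_ACCESS_LWP) != 0)
		perror("madvise(MADV_ACCESS_LWP)");

	(void) munmap(buf, len);
	return (0);
}
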
3452 3452
3453 3453 /*
3454 3454 * Figure out default policy
3455 3455 */
3456 3456 lgrp_mem_policy_t
3457 3457 lgrp_mem_policy_default(size_t size, int type)
3458 3458 {
3459 3459 cpupart_t *cp;
3460 3460 lgrp_mem_policy_t policy;
3461 3461 size_t pset_mem_size;
3462 3462
3463 3463 /*
3464 3464 	 * Randomly allocate memory across lgroups for private and shared
3465 3465 	 * memory beyond their respective random thresholds
3466 3466 */
3467 3467 if ((type != MAP_SHARED && size > lgrp_privm_random_thresh) ||
3468 3468 (type == MAP_SHARED && size > lgrp_shm_random_thresh)) {
3469 3469 /*
3470 3470 * Get total memory size of current thread's pset
3471 3471 */
3472 3472 kpreempt_disable();
3473 3473 cp = curthread->t_cpupart;
3474 3474 klgrpset_totalsize(cp->cp_lgrpset, pset_mem_size);
3475 3475 kpreempt_enable();
3476 3476
3477 3477 /*
3478 3478 * Choose policy to randomly allocate memory across
3479 3479 * lgroups in pset if it will fit and is not default
3480 3480 * partition. Otherwise, allocate memory randomly
3481 3481 * across machine.
3482 3482 */
3483 3483 if (lgrp_mem_pset_aware && size < pset_mem_size)
3484 3484 policy = LGRP_MEM_POLICY_RANDOM_PSET;
3485 3485 else
3486 3486 policy = LGRP_MEM_POLICY_RANDOM;
3487 3487 } else
3488 3488 /*
3489 3489 * Apply default policy for private memory and
3490 3490 * shared memory under the respective random
3491 3491 * threshold.
3492 3492 */
3493 3493 policy = lgrp_mem_default_policy;
3494 3494
3495 3495 return (policy);
3496 3496 }
3497 3497
3498 3498 /*
3499 3499 * Get memory allocation policy for this segment
3500 3500 */
3501 3501 lgrp_mem_policy_info_t *
3502 3502 lgrp_mem_policy_get(struct seg *seg, caddr_t vaddr)
3503 3503 {
3504 3504 lgrp_mem_policy_info_t *policy_info;
3505 3505 extern struct seg_ops segspt_ops;
3506 3506 extern struct seg_ops segspt_shmops;
3507 3507
3508 3508 /*
3509 3509 * This is for binary compatibility to protect against third party
3510 3510  * segment drivers which haven't been recompiled to allow for
3511 3511 * SEGOP_GETPOLICY()
3512 3512 */
3513 3513 if (seg->s_ops != &segvn_ops && seg->s_ops != &segspt_ops &&
3514 3514 seg->s_ops != &segspt_shmops)
3515 3515 return (NULL);
3516 3516
3517 3517 policy_info = NULL;
3518 3518 if (seg->s_ops->getpolicy != NULL)
3519 3519 policy_info = SEGOP_GETPOLICY(seg, vaddr);
3520 3520
3521 3521 return (policy_info);
3522 3522 }
3523 3523
3524 3524 /*
3525 3525 * Set policy for allocating private memory given desired policy, policy info,
3526 3526  * and size in bytes of memory that the policy is being applied to.
3527 3527 * Return 0 if policy wasn't set already and 1 if policy was set already
3528 3528 */
3529 3529 int
3530 3530 lgrp_privm_policy_set(lgrp_mem_policy_t policy,
3531 3531 lgrp_mem_policy_info_t *policy_info, size_t size)
3532 3532 {
3533 3533
3534 3534 ASSERT(policy_info != NULL);
3535 3535
3536 3536 if (policy == LGRP_MEM_POLICY_DEFAULT)
3537 3537 policy = lgrp_mem_policy_default(size, MAP_PRIVATE);
3538 3538
3539 3539 /*
3540 3540 * Policy set already?
3541 3541 */
3542 3542 if (policy == policy_info->mem_policy)
3543 3543 return (1);
3544 3544
3545 3545 /*
3546 3546 * Set policy
3547 3547 */
3548 3548 policy_info->mem_policy = policy;
3549 3549 policy_info->mem_lgrpid = LGRP_NONE;
3550 3550
3551 3551 return (0);
3552 3552 }
3553 3553
3554 3554
3555 3555 /*
3556 3556  * Get shared memory allocation policy for the given shared object and offset
3557 3557 */
3558 3558 lgrp_mem_policy_info_t *
3559 3559 lgrp_shm_policy_get(struct anon_map *amp, ulong_t anon_index, vnode_t *vp,
3560 3560 u_offset_t vn_off)
3561 3561 {
3562 3562 u_offset_t off;
3563 3563 lgrp_mem_policy_info_t *policy_info;
3564 3564 lgrp_shm_policy_seg_t *policy_seg;
3565 3565 lgrp_shm_locality_t *shm_locality;
3566 3566 avl_tree_t *tree;
3567 3567 avl_index_t where;
3568 3568
3569 3569 /*
3570 3570 * Get policy segment tree from anon_map or vnode and use specified
3571 3571 * anon index or vnode offset as offset
3572 3572 *
3573 3573 * Assume that no lock needs to be held on anon_map or vnode, since
3574 3574 * they should be protected by their reference count which must be
3575 3575 * nonzero for an existing segment
3576 3576 */
3577 3577 if (amp) {
3578 3578 ASSERT(amp->refcnt != 0);
3579 3579 shm_locality = amp->locality;
3580 3580 if (shm_locality == NULL)
3581 3581 return (NULL);
3582 3582 tree = shm_locality->loc_tree;
3583 3583 off = ptob(anon_index);
3584 3584 } else if (vp) {
3585 3585 shm_locality = vp->v_locality;
3586 3586 if (shm_locality == NULL)
3587 3587 return (NULL);
3588 3588 ASSERT(shm_locality->loc_count != 0);
3589 3589 tree = shm_locality->loc_tree;
3590 3590 off = vn_off;
3591 3591 }
3592 3592
3593 3593 if (tree == NULL)
3594 3594 return (NULL);
3595 3595
3596 3596 /*
3597 3597 * Lookup policy segment for offset into shared object and return
3598 3598 * policy info
3599 3599 */
3600 3600 rw_enter(&shm_locality->loc_lock, RW_READER);
3601 3601 policy_info = NULL;
3602 3602 policy_seg = avl_find(tree, &off, &where);
3603 3603 if (policy_seg)
3604 3604 policy_info = &policy_seg->shm_policy;
3605 3605 rw_exit(&shm_locality->loc_lock);
3606 3606
3607 3607 return (policy_info);
3608 3608 }
3609 3609
3610 3610 /*
3611 3611 * Default memory allocation policy for kernel segmap pages
3612 3612 */
3613 3613 lgrp_mem_policy_t lgrp_segmap_default_policy = LGRP_MEM_POLICY_RANDOM;
3614 3614
3615 3615 /*
3616 3616 * Return lgroup to use for allocating memory
3617 3617 * given the segment and address
3618 3618 *
3619 3619 * There isn't any mutual exclusion that exists between calls
3620 3620  * to this routine and DR, so this routine and whoever calls it
3621 3621 * should be mindful of the possibility that the lgrp returned
3622 3622 * may be deleted. If this happens, dereferences of the lgrp
3623 3623 * pointer will still be safe, but the resources in the lgrp will
3624 3624 * be gone, and LGRP_EXISTS() will no longer be true.
3625 3625 */
3626 3626 lgrp_t *
3627 3627 lgrp_mem_choose(struct seg *seg, caddr_t vaddr, size_t pgsz)
3628 3628 {
3629 3629 int i;
3630 3630 lgrp_t *lgrp;
3631 3631 klgrpset_t lgrpset;
3632 3632 int lgrps_spanned;
3633 3633 unsigned long off;
3634 3634 lgrp_mem_policy_t policy;
3635 3635 lgrp_mem_policy_info_t *policy_info;
3636 3636 ushort_t random;
3637 3637 int stat = 0;
3638 3638 extern struct seg *segkmap;
3639 3639
3640 3640 /*
3641 3641 	 * Just return the root lgroup if the lgrp framework hasn't finished
3642 3642 * initializing or if this is a UMA machine.
3643 3643 */
3644 3644 if (nlgrps == 1 || !lgrp_initialized)
3645 3645 return (lgrp_root);
3646 3646
3647 3647 /*
3648 3648 * Get memory allocation policy for this segment
3649 3649 */
3650 3650 policy = lgrp_mem_default_policy;
3651 3651 if (seg != NULL) {
3652 3652 if (seg->s_as == &kas) {
3653 3653 if (seg == segkmap)
3654 3654 policy = lgrp_segmap_default_policy;
3655 3655 if (policy == LGRP_MEM_POLICY_RANDOM_PROC ||
3656 3656 policy == LGRP_MEM_POLICY_RANDOM_PSET)
3657 3657 policy = LGRP_MEM_POLICY_RANDOM;
3658 3658 } else {
3659 3659 policy_info = lgrp_mem_policy_get(seg, vaddr);
3660 3660 if (policy_info != NULL) {
3661 3661 policy = policy_info->mem_policy;
3662 3662 if (policy == LGRP_MEM_POLICY_NEXT_SEG) {
3663 3663 lgrp_id_t id = policy_info->mem_lgrpid;
3664 3664 ASSERT(id != LGRP_NONE);
3665 3665 ASSERT(id < NLGRPS_MAX);
3666 3666 lgrp = lgrp_table[id];
3667 3667 if (!LGRP_EXISTS(lgrp)) {
3668 3668 policy = LGRP_MEM_POLICY_NEXT;
3669 3669 } else {
3670 3670 lgrp_stat_add(id,
3671 3671 LGRP_NUM_NEXT_SEG, 1);
3672 3672 return (lgrp);
3673 3673 }
3674 3674 }
3675 3675 }
3676 3676 }
3677 3677 }
3678 3678 lgrpset = 0;
3679 3679
3680 3680 /*
3681 3681 * Initialize lgroup to home by default
3682 3682 */
3683 3683 lgrp = lgrp_home_lgrp();
3684 3684
3685 3685 /*
3686 3686 * When homing threads on root lgrp, override default memory
3687 3687 * allocation policies with root lgroup memory allocation policy
3688 3688 */
3689 3689 if (lgrp == lgrp_root)
3690 3690 policy = lgrp_mem_policy_root;
3691 3691
3692 3692 /*
3693 3693 * Implement policy
3694 3694 */
3695 3695 switch (policy) {
3696 3696 case LGRP_MEM_POLICY_NEXT_CPU:
3697 3697
3698 3698 /*
3699 3699 * Return lgroup of current CPU which faulted on memory
3700 3700 * If the CPU isn't currently in an lgrp, then opt to
3701 3701 * allocate from the root.
3702 3702 *
3703 3703 * Kernel preemption needs to be disabled here to prevent
3704 3704 * the current CPU from going away before lgrp is found.
3705 3705 */
3706 3706 if (LGRP_CPU_HAS_NO_LGRP(CPU)) {
3707 3707 lgrp = lgrp_root;
3708 3708 } else {
3709 3709 kpreempt_disable();
3710 3710 lgrp = lgrp_cpu_to_lgrp(CPU);
3711 3711 kpreempt_enable();
3712 3712 }
3713 3713 break;
3714 3714
3715 3715 case LGRP_MEM_POLICY_NEXT:
3716 3716 case LGRP_MEM_POLICY_DEFAULT:
3717 3717 default:
3718 3718
3719 3719 /*
3720 3720 * Just return current thread's home lgroup
3721 3721 * for default policy (next touch)
3722 3722 * If the thread is homed to the root,
3723 3723 * then the default policy is random across lgroups.
3724 3724 * Fallthrough to the random case.
3725 3725 */
3726 3726 if (lgrp != lgrp_root) {
3727 3727 if (policy == LGRP_MEM_POLICY_NEXT)
3728 3728 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_NEXT, 1);
3729 3729 else
3730 3730 lgrp_stat_add(lgrp->lgrp_id,
3731 3731 LGRP_NUM_DEFAULT, 1);
3732 3732 break;
3733 3733 }
3734 3734 /* LINTED fallthrough on case statement */
3735 3735 case LGRP_MEM_POLICY_RANDOM:
3736 3736
3737 3737 /*
3738 3738 * Return a random leaf lgroup with memory
3739 3739 */
3740 3740 lgrpset = lgrp_root->lgrp_set[LGRP_RSRC_MEM];
3741 3741 /*
3742 3742 * Count how many lgroups are spanned
3743 3743 */
3744 3744 klgrpset_nlgrps(lgrpset, lgrps_spanned);
3745 3745
3746 3746 /*
3747 3747 * There may be no memnodes in the root lgroup during DR copy
3748 3748 * rename on a system with only two boards (memnodes)
3749 3749 * configured. In this case just return the root lgrp.
3750 3750 */
3751 3751 if (lgrps_spanned == 0) {
3752 3752 lgrp = lgrp_root;
3753 3753 break;
3754 3754 }
3755 3755
3756 3756 /*
3757 3757 * Pick a random offset within lgroups spanned
3758 3758 * and return lgroup at that offset
3759 3759 */
3760 3760 random = (ushort_t)gethrtime() >> 4;
3761 3761 off = random % lgrps_spanned;
3762 3762 ASSERT(off <= lgrp_alloc_max);
3763 3763
3764 3764 for (i = 0; i <= lgrp_alloc_max; i++) {
3765 3765 if (!klgrpset_ismember(lgrpset, i))
3766 3766 continue;
3767 3767 if (off)
3768 3768 off--;
3769 3769 else {
3770 3770 lgrp = lgrp_table[i];
3771 3771 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_RANDOM,
3772 3772 1);
3773 3773 break;
3774 3774 }
3775 3775 }
3776 3776 break;
3777 3777
3778 3778 case LGRP_MEM_POLICY_RANDOM_PROC:
3779 3779
3780 3780 /*
3781 3781 * Grab copy of bitmask of lgroups spanned by
3782 3782 * this process
3783 3783 */
3784 3784 klgrpset_copy(lgrpset, curproc->p_lgrpset);
3785 3785 stat = LGRP_NUM_RANDOM_PROC;
3786 3786
3787 3787 /* LINTED fallthrough on case statement */
3788 3788 case LGRP_MEM_POLICY_RANDOM_PSET:
3789 3789
3790 3790 if (!stat)
3791 3791 stat = LGRP_NUM_RANDOM_PSET;
3792 3792
3793 3793 if (klgrpset_isempty(lgrpset)) {
3794 3794 /*
3795 3795 * Grab copy of bitmask of lgroups spanned by
3796 3796 * this processor set
3797 3797 */
3798 3798 kpreempt_disable();
3799 3799 klgrpset_copy(lgrpset,
3800 3800 curthread->t_cpupart->cp_lgrpset);
3801 3801 kpreempt_enable();
3802 3802 }
3803 3803
3804 3804 /*
3805 3805 * Count how many lgroups are spanned
3806 3806 */
3807 3807 klgrpset_nlgrps(lgrpset, lgrps_spanned);
3808 3808 ASSERT(lgrps_spanned <= nlgrps);
3809 3809
3810 3810 /*
3811 3811 		 * lgrps_spanned should probably always be non-zero, but to be
3812 3812 		 * on the safe side we return lgrp_root if it is zero.
3813 3813 */
3814 3814 if (lgrps_spanned == 0) {
3815 3815 lgrp = lgrp_root;
3816 3816 break;
3817 3817 }
3818 3818
3819 3819 /*
3820 3820 * Pick a random offset within lgroups spanned
3821 3821 * and return lgroup at that offset
3822 3822 */
3823 3823 random = (ushort_t)gethrtime() >> 4;
3824 3824 off = random % lgrps_spanned;
3825 3825 ASSERT(off <= lgrp_alloc_max);
3826 3826
3827 3827 for (i = 0; i <= lgrp_alloc_max; i++) {
3828 3828 if (!klgrpset_ismember(lgrpset, i))
3829 3829 continue;
3830 3830 if (off)
3831 3831 off--;
3832 3832 else {
3833 3833 lgrp = lgrp_table[i];
3834 3834 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_RANDOM,
3835 3835 1);
3836 3836 break;
3837 3837 }
3838 3838 }
3839 3839 break;
3840 3840
3841 3841 case LGRP_MEM_POLICY_ROUNDROBIN:
3842 3842
3843 3843 /*
3844 3844 * Use offset within segment to determine
3845 3845 * offset from home lgroup to choose for
3846 3846 * next lgroup to allocate memory from
3847 3847 */
3848 3848 off = ((unsigned long)(vaddr - seg->s_base) / pgsz) %
3849 3849 (lgrp_alloc_max + 1);
3850 3850
3851 3851 kpreempt_disable();
3852 3852 lgrpset = lgrp_root->lgrp_set[LGRP_RSRC_MEM];
3853 3853 i = lgrp->lgrp_id;
3854 3854 kpreempt_enable();
3855 3855
3856 3856 while (off > 0) {
3857 3857 i = (i + 1) % (lgrp_alloc_max + 1);
3858 3858 lgrp = lgrp_table[i];
3859 3859 if (klgrpset_ismember(lgrpset, i))
3860 3860 off--;
3861 3861 }
3862 3862 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ROUNDROBIN, 1);
3863 3863
3864 3864 break;
3865 3865 }
3866 3866
3867 3867 ASSERT(lgrp != NULL);
3868 3868 return (lgrp);
3869 3869 }
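
/*
 * A standalone userland sketch (not part of this file) of the random
 * policies above, which pick "the off-th lgroup with memory" by walking the
 * lgroup bitmask and counting down.  The example shows the same
 * pick-the-k-th-set-bit walk on an ordinary 64-bit mask; the names and the
 * sample mask are made up.
 */
#include <stdint.h>
#include <stdio.h>

/*
 * Return the bit position of the k-th (0-based) set bit in "mask",
 * or -1 if mask has fewer than k + 1 bits set.
 */
static int
pick_kth_member(uint64_t mask, unsigned int k)
{
	int i;

	for (i = 0; i < 64; i++) {
		if (!(mask & ((uint64_t)1 << i)))
			continue;
		if (k == 0)
			return (i);
		k--;
	}
	return (-1);
}

int
main(void)
{
	uint64_t spanned = 0x2D;	/* members: bits 0, 2, 3, 5 */

	printf("%d\n", pick_kth_member(spanned, 0));	/* 0 */
	printf("%d\n", pick_kth_member(spanned, 2));	/* 3 */
	printf("%d\n", pick_kth_member(spanned, 4));	/* -1 */
	return (0);
}
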
3870 3870
3871 3871 /*
3872 3872 * Return the number of pages in an lgroup
3873 3873 *
3874 3874 * NOTE: NUMA test (numat) driver uses this, so changing arguments or semantics
3875 3875 * could cause tests that rely on the numat driver to fail....
3876 3876 */
3877 3877 pgcnt_t
3878 3878 lgrp_mem_size(lgrp_id_t lgrpid, lgrp_mem_query_t query)
3879 3879 {
3880 3880 lgrp_t *lgrp;
3881 3881
3882 3882 lgrp = lgrp_table[lgrpid];
3883 3883 if (!LGRP_EXISTS(lgrp) ||
3884 3884 klgrpset_isempty(lgrp->lgrp_set[LGRP_RSRC_MEM]) ||
3885 3885 !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid))
3886 3886 return (0);
3887 3887
3888 3888 return (lgrp_plat_mem_size(lgrp->lgrp_plathand, query));
3889 3889 }
3890 3890
3891 3891 /*
3892 3892 * Initialize lgroup shared memory allocation policy support
3893 3893 */
3894 3894 void
3895 3895 lgrp_shm_policy_init(struct anon_map *amp, vnode_t *vp)
3896 3896 {
3897 3897 lgrp_shm_locality_t *shm_locality;
3898 3898
3899 3899 /*
3900 3900 * Initialize locality field in anon_map
3901 3901 * Don't need any locks because this is called when anon_map is
3902 3902 * allocated, but not used anywhere yet.
3903 3903 */
3904 3904 if (amp) {
3905 3905 		ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
3906 3906 if (amp->locality == NULL) {
3907 3907 /*
3908 3908 * Allocate and initialize shared memory locality info
3909 3909 * and set anon_map locality pointer to it
3910 3910 * Drop lock across kmem_alloc(KM_SLEEP)
3911 3911 */
3912 3912 			ANON_LOCK_EXIT(&amp->a_rwlock);
3913 3913 shm_locality = kmem_alloc(sizeof (*shm_locality),
3914 3914 KM_SLEEP);
3915 3915 rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT,
3916 3916 NULL);
3917 3917 shm_locality->loc_count = 1; /* not used for amp */
3918 3918 shm_locality->loc_tree = NULL;
3919 3919
3920 3920 /*
3921 3921 * Reacquire lock and check to see whether anyone beat
3922 3922 * us to initializing the locality info
3923 3923 */
3924 3924 			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
3925 3925 if (amp->locality != NULL) {
3926 3926 rw_destroy(&shm_locality->loc_lock);
3927 3927 kmem_free(shm_locality,
3928 3928 sizeof (*shm_locality));
3929 3929 } else
3930 3930 amp->locality = shm_locality;
3931 3931 }
3932 3932 		ANON_LOCK_EXIT(&amp->a_rwlock);
3933 3933 return;
3934 3934 }
3935 3935
3936 3936 /*
3937 3937 * Allocate shared vnode policy info if vnode is not locality aware yet
3938 3938 */
3939 3939 mutex_enter(&vp->v_lock);
3940 3940 if ((vp->v_flag & V_LOCALITY) == 0) {
3941 3941 /*
3942 3942 * Allocate and initialize shared memory locality info
3943 3943 */
3944 3944 mutex_exit(&vp->v_lock);
3945 3945 shm_locality = kmem_alloc(sizeof (*shm_locality), KM_SLEEP);
3946 3946 rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT, NULL);
3947 3947 shm_locality->loc_count = 1;
3948 3948 shm_locality->loc_tree = NULL;
3949 3949
3950 3950 /*
3951 3951 * Point vnode locality field at shared vnode policy info
3952 3952 * and set locality aware flag in vnode
3953 3953 */
3954 3954 mutex_enter(&vp->v_lock);
3955 3955 if ((vp->v_flag & V_LOCALITY) == 0) {
3956 3956 vp->v_locality = shm_locality;
3957 3957 vp->v_flag |= V_LOCALITY;
3958 3958 } else {
3959 3959 /*
3960 3960 * Lost race so free locality info and increment count.
3961 3961 */
3962 3962 rw_destroy(&shm_locality->loc_lock);
3963 3963 kmem_free(shm_locality, sizeof (*shm_locality));
3964 3964 shm_locality = vp->v_locality;
3965 3965 shm_locality->loc_count++;
3966 3966 }
3967 3967 mutex_exit(&vp->v_lock);
3968 3968
3969 3969 return;
3970 3970 }
3971 3971
3972 3972 /*
3973 3973 * Increment reference count of number of segments mapping this vnode
3974 3974 * shared
3975 3975 */
3976 3976 shm_locality = vp->v_locality;
3977 3977 shm_locality->loc_count++;
3978 3978 mutex_exit(&vp->v_lock);
3979 3979 }
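
/*
 * A standalone userland sketch (not part of this file) of the
 * drop-allocate-recheck pattern used by lgrp_shm_policy_init() above: the
 * lock is dropped around the sleeping allocation, then reacquired to check
 * whether another thread initialized the field first.  Pthreads stand in
 * for the kernel locks and the toy_* names are hypothetical.
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct toy_locality {
	int	count;
} toy_locality_t;

static pthread_mutex_t	toy_lock = PTHREAD_MUTEX_INITIALIZER;
static toy_locality_t	*toy_field;	/* models amp->locality / v_locality */

static void
toy_locality_init(void)
{
	toy_locality_t *loc;

	pthread_mutex_lock(&toy_lock);
	if (toy_field == NULL) {
		/* Drop the lock across the (possibly sleeping) allocation. */
		pthread_mutex_unlock(&toy_lock);
		loc = malloc(sizeof (*loc));
		if (loc == NULL)
			return;			/* lock is not held here */
		loc->count = 1;

		/* Reacquire and check whether someone beat us to it. */
		pthread_mutex_lock(&toy_lock);
		if (toy_field != NULL)
			free(loc);		/* lost the race */
		else
			toy_field = loc;	/* won the race */
	}
	pthread_mutex_unlock(&toy_lock);
}

int
main(void)
{
	toy_locality_init();
	toy_locality_init();	/* second call sees the existing field */
	printf("count = %d\n", toy_field->count);
	return (0);
}
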
3980 3980
3981 3981 /*
3982 3982 * Destroy the given shared memory policy segment tree
3983 3983 */
3984 3984 void
3985 3985 lgrp_shm_policy_tree_destroy(avl_tree_t *tree)
3986 3986 {
3987 3987 lgrp_shm_policy_seg_t *cur;
3988 3988 lgrp_shm_policy_seg_t *next;
3989 3989
3990 3990 if (tree == NULL)
3991 3991 return;
3992 3992
3993 3993 cur = (lgrp_shm_policy_seg_t *)avl_first(tree);
3994 3994 while (cur != NULL) {
3995 3995 next = AVL_NEXT(tree, cur);
3996 3996 avl_remove(tree, cur);
3997 3997 kmem_free(cur, sizeof (*cur));
3998 3998 cur = next;
3999 3999 }
4000 4000 kmem_free(tree, sizeof (avl_tree_t));
4001 4001 }
4002 4002
4003 4003 /*
4004 4004 * Uninitialize lgroup shared memory allocation policy support
4005 4005 */
4006 4006 void
4007 4007 lgrp_shm_policy_fini(struct anon_map *amp, vnode_t *vp)
4008 4008 {
4009 4009 lgrp_shm_locality_t *shm_locality;
4010 4010
4011 4011 /*
4012 4012 * For anon_map, deallocate shared memory policy tree and
4013 4013 * zero locality field
4014 4014 * Don't need any locks because anon_map is being freed
4015 4015 */
4016 4016 if (amp) {
4017 4017 if (amp->locality == NULL)
4018 4018 return;
4019 4019 shm_locality = amp->locality;
4020 4020 shm_locality->loc_count = 0; /* not really used for amp */
4021 4021 rw_destroy(&shm_locality->loc_lock);
4022 4022 lgrp_shm_policy_tree_destroy(shm_locality->loc_tree);
4023 4023 kmem_free(shm_locality, sizeof (*shm_locality));
4024 4024 amp->locality = 0;
4025 4025 return;
4026 4026 }
4027 4027
4028 4028 /*
4029 4029 * For vnode, decrement reference count of segments mapping this vnode
4030 4030 * shared and delete locality info if reference count drops to 0
4031 4031 */
4032 4032 mutex_enter(&vp->v_lock);
4033 4033 shm_locality = vp->v_locality;
4034 4034 shm_locality->loc_count--;
4035 4035
4036 4036 if (shm_locality->loc_count == 0) {
4037 4037 rw_destroy(&shm_locality->loc_lock);
4038 4038 lgrp_shm_policy_tree_destroy(shm_locality->loc_tree);
4039 4039 kmem_free(shm_locality, sizeof (*shm_locality));
4040 4040 vp->v_locality = 0;
4041 4041 vp->v_flag &= ~V_LOCALITY;
4042 4042 }
4043 4043 mutex_exit(&vp->v_lock);
4044 4044 }
4045 4045
4046 4046 /*
4047 4047 * Compare two shared memory policy segments
4048 4048 * Used by AVL tree code for searching
4049 4049 */
4050 4050 int
4051 4051 lgrp_shm_policy_compar(const void *x, const void *y)
4052 4052 {
4053 4053 lgrp_shm_policy_seg_t *a = (lgrp_shm_policy_seg_t *)x;
4054 4054 lgrp_shm_policy_seg_t *b = (lgrp_shm_policy_seg_t *)y;
4055 4055
4056 4056 if (a->shm_off < b->shm_off)
4057 4057 return (-1);
4058 4058 if (a->shm_off >= b->shm_off + b->shm_size)
4059 4059 return (1);
4060 4060 return (0);
4061 4061 }
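
/*
 * A standalone userland sketch (not part of this file) of how the comparator
 * above behaves: a policy segment is treated as the half-open range
 * [shm_off, shm_off + shm_size), so a lookup keyed by a bare offset finds
 * the segment containing it.  The example applies the same three-way compare
 * to a binary search over a sorted array of non-overlapping ranges; the
 * toy_* types and data are made up.
 */
#include <stdint.h>
#include <stdio.h>

typedef struct toy_seg {
	uint64_t	off;
	uint64_t	size;
} toy_seg_t;

static int
toy_seg_compar(uint64_t key, const toy_seg_t *s)
{
	if (key < s->off)
		return (-1);
	if (key >= s->off + s->size)
		return (1);
	return (0);			/* key falls inside the segment */
}

static const toy_seg_t *
toy_seg_find(const toy_seg_t *segs, int nsegs, uint64_t key)
{
	int lo = 0, hi = nsegs - 1;

	while (lo <= hi) {
		int mid = (lo + hi) / 2;
		int c = toy_seg_compar(key, &segs[mid]);

		if (c == 0)
			return (&segs[mid]);
		if (c < 0)
			hi = mid - 1;
		else
			lo = mid + 1;
	}
	return (NULL);
}

int
main(void)
{
	toy_seg_t segs[] = { { 0, 0x2000 }, { 0x4000, 0x1000 } };

	printf("%s\n", toy_seg_find(segs, 2, 0x4800) ? "found" : "hole");
	printf("%s\n", toy_seg_find(segs, 2, 0x3000) ? "found" : "hole");
	return (0);
}
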
4062 4062
4063 4063 /*
4064 4064 * Concatenate seg1 with seg2 and remove seg2
4065 4065 */
4066 4066 static int
4067 4067 lgrp_shm_policy_concat(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg1,
4068 4068 lgrp_shm_policy_seg_t *seg2)
4069 4069 {
4070 4070 if (!seg1 || !seg2 ||
4071 4071 seg1->shm_off + seg1->shm_size != seg2->shm_off ||
4072 4072 seg1->shm_policy.mem_policy != seg2->shm_policy.mem_policy)
4073 4073 return (-1);
4074 4074
4075 4075 seg1->shm_size += seg2->shm_size;
4076 4076 avl_remove(tree, seg2);
4077 4077 kmem_free(seg2, sizeof (*seg2));
4078 4078 return (0);
4079 4079 }
4080 4080
4081 4081 /*
4082 4082 * Split segment at given offset and return rightmost (uppermost) segment
4083 4083 * Assumes that there are no overlapping segments
4084 4084 */
4085 4085 static lgrp_shm_policy_seg_t *
4086 4086 lgrp_shm_policy_split(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg,
4087 4087 u_offset_t off)
4088 4088 {
4089 4089 lgrp_shm_policy_seg_t *newseg;
4090 4090 avl_index_t where;
4091 4091
4092 4092 ASSERT(seg != NULL);
4093 4093 ASSERT(off >= seg->shm_off && off <= seg->shm_off + seg->shm_size);
4094 4094
4095 4095 if (!seg || off < seg->shm_off || off > seg->shm_off +
4096 4096 seg->shm_size)
4097 4097 return (NULL);
4098 4098
4099 4099 if (off == seg->shm_off || off == seg->shm_off + seg->shm_size)
4100 4100 return (seg);
4101 4101
4102 4102 /*
4103 4103 * Adjust size of left segment and allocate new (right) segment
4104 4104 */
4105 4105 newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t), KM_SLEEP);
4106 4106 newseg->shm_policy = seg->shm_policy;
4107 4107 newseg->shm_off = off;
4108 4108 newseg->shm_size = seg->shm_size - (off - seg->shm_off);
4109 4109 seg->shm_size = off - seg->shm_off;
4110 4110
4111 4111 /*
4112 4112 * Find where to insert new segment in AVL tree and insert it
4113 4113 */
4114 4114 (void) avl_find(tree, &off, &where);
4115 4115 avl_insert(tree, newseg, where);
4116 4116
4117 4117 return (newseg);
4118 4118 }
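
/*
 * A standalone userland sketch (not part of this file) of the size
 * arithmetic used by lgrp_shm_policy_split() above.  Splitting the range
 * [0x1000, 0x4000) at offset 0x1800 leaves a left piece of 0x800 bytes and
 * a right piece of 0x2800 bytes; the toy_* type is made up.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

typedef struct toy_range {
	uint64_t	off;
	uint64_t	size;
} toy_range_t;

/* Split "left" at "off"; return the new right-hand piece. */
static toy_range_t
toy_range_split(toy_range_t *left, uint64_t off)
{
	toy_range_t right;

	assert(off > left->off && off < left->off + left->size);

	right.off = off;
	right.size = left->size - (off - left->off);	/* upper remainder */
	left->size = off - left->off;			/* shrink the left */
	return (right);
}

int
main(void)
{
	toy_range_t left = { 0x1000, 0x3000 };
	toy_range_t right = toy_range_split(&left, 0x1800);

	printf("left:  off 0x%llx size 0x%llx\n",
	    (unsigned long long)left.off, (unsigned long long)left.size);
	printf("right: off 0x%llx size 0x%llx\n",
	    (unsigned long long)right.off, (unsigned long long)right.size);
	return (0);
}
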
4119 4119
4120 4120 /*
4121 4121 * Set shared memory allocation policy on specified shared object at given
4122 4122 * offset and length
4123 4123 *
4124 4124 * Return 0 if policy wasn't set already, 1 if policy was set already, and
4125 4125 * -1 if can't set policy.
4126 4126 */
4127 4127 int
4128 4128 lgrp_shm_policy_set(lgrp_mem_policy_t policy, struct anon_map *amp,
4129 4129 ulong_t anon_index, vnode_t *vp, u_offset_t vn_off, size_t len)
4130 4130 {
4131 4131 u_offset_t eoff;
4132 4132 lgrp_shm_policy_seg_t *next;
4133 4133 lgrp_shm_policy_seg_t *newseg;
4134 4134 u_offset_t off;
4135 4135 u_offset_t oldeoff;
4136 4136 lgrp_shm_policy_seg_t *prev;
4137 4137 int retval;
4138 4138 lgrp_shm_policy_seg_t *seg;
4139 4139 lgrp_shm_locality_t *shm_locality;
4140 4140 avl_tree_t *tree;
4141 4141 avl_index_t where;
4142 4142
4143 4143 ASSERT(amp || vp);
4144 4144 ASSERT((len & PAGEOFFSET) == 0);
4145 4145
4146 4146 if (len == 0)
4147 4147 return (-1);
4148 4148
4149 4149 retval = 0;
4150 4150
4151 4151 /*
4152 4152 * Get locality info and starting offset into shared object
4153 4153 * Try anon map first and then vnode
4154 4154 * Assume that no locks need to be held on anon_map or vnode, since
4155 4155 * it should be protected by its reference count which must be nonzero
4156 4156 * for an existing segment.
4157 4157 */
4158 4158 if (amp) {
4159 4159 /*
4160 4160 * Get policy info from anon_map
4161 4161 *
4162 4162 */
4163 4163 ASSERT(amp->refcnt != 0);
4164 4164 if (amp->locality == NULL)
4165 4165 lgrp_shm_policy_init(amp, NULL);
4166 4166 shm_locality = amp->locality;
4167 4167 off = ptob(anon_index);
4168 4168 } else if (vp) {
4169 4169 /*
4170 4170 * Get policy info from vnode
4171 4171 */
4172 4172 if ((vp->v_flag & V_LOCALITY) == 0 || vp->v_locality == NULL)
4173 4173 lgrp_shm_policy_init(NULL, vp);
4174 4174 shm_locality = vp->v_locality;
4175 4175 ASSERT(shm_locality->loc_count != 0);
4176 4176 off = vn_off;
4177 4177 } else
4178 4178 return (-1);
4179 4179
4180 4180 ASSERT((off & PAGEOFFSET) == 0);
4181 4181
4182 4182 /*
4183 4183 * Figure out default policy
4184 4184 */
4185 4185 if (policy == LGRP_MEM_POLICY_DEFAULT)
4186 4186 policy = lgrp_mem_policy_default(len, MAP_SHARED);
4187 4187
4188 4188 /*
4189 4189 * Create AVL tree if there isn't one yet
4190 4190 * and set locality field to point at it
4191 4191 */
4192 4192 rw_enter(&shm_locality->loc_lock, RW_WRITER);
4193 4193 tree = shm_locality->loc_tree;
4194 4194 if (!tree) {
4195 4195 rw_exit(&shm_locality->loc_lock);
4196 4196
4197 4197 tree = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
4198 4198
4199 4199 rw_enter(&shm_locality->loc_lock, RW_WRITER);
4200 4200 if (shm_locality->loc_tree == NULL) {
4201 4201 avl_create(tree, lgrp_shm_policy_compar,
4202 4202 sizeof (lgrp_shm_policy_seg_t),
4203 4203 offsetof(lgrp_shm_policy_seg_t, shm_tree));
4204 4204 shm_locality->loc_tree = tree;
4205 4205 } else {
4206 4206 /*
4207 4207 * Another thread managed to set up the tree
4208 4208 * before we could. Free the tree we allocated
4209 4209 * and use the one that's already there.
4210 4210 */
4211 4211 kmem_free(tree, sizeof (*tree));
4212 4212 tree = shm_locality->loc_tree;
4213 4213 }
4214 4214 }
4215 4215
4216 4216 /*
4217 4217 * Set policy
4218 4218 *
4219 4219 * Need to maintain hold on writer's lock to keep tree from
4220 4220 * changing out from under us
4221 4221 */
4222 4222 while (len != 0) {
4223 4223 /*
4224 4224 * Find policy segment for specified offset into shared object
4225 4225 */
4226 4226 seg = avl_find(tree, &off, &where);
4227 4227
4228 4228 /*
4229 4229 * Didn't find any existing segment that contains specified
4230 4230 * offset, so allocate new segment, insert it, and concatenate
4231 4231 * with adjacent segments if possible
4232 4232 */
4233 4233 if (seg == NULL) {
4234 4234 newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t),
4235 4235 KM_SLEEP);
4236 4236 newseg->shm_policy.mem_policy = policy;
4237 4237 newseg->shm_policy.mem_lgrpid = LGRP_NONE;
4238 4238 newseg->shm_off = off;
4239 4239 avl_insert(tree, newseg, where);
4240 4240
4241 4241 /*
4242 4242 * Check to see whether new segment overlaps with next
4243 4243 * one, set length of new segment accordingly, and
4244 4244 * calculate remaining length and next offset
4245 4245 */
4246 4246 seg = AVL_NEXT(tree, newseg);
4247 4247 if (seg == NULL || off + len <= seg->shm_off) {
4248 4248 newseg->shm_size = len;
4249 4249 len = 0;
4250 4250 } else {
4251 4251 newseg->shm_size = seg->shm_off - off;
4252 4252 off = seg->shm_off;
4253 4253 len -= newseg->shm_size;
4254 4254 }
4255 4255
4256 4256 /*
4257 4257 * Try to concatenate new segment with next and
4258 4258 * previous ones, since they might have the same policy
4259 4259 * now. Grab previous and next segments first because
4260 4260 * they will change on concatenation.
4261 4261 */
4262 4262 prev = AVL_PREV(tree, newseg);
4263 4263 next = AVL_NEXT(tree, newseg);
4264 4264 (void) lgrp_shm_policy_concat(tree, newseg, next);
4265 4265 (void) lgrp_shm_policy_concat(tree, prev, newseg);
4266 4266
4267 4267 continue;
4268 4268 }
4269 4269
4270 4270 eoff = off + len;
4271 4271 oldeoff = seg->shm_off + seg->shm_size;
4272 4272
4273 4273 /*
4274 4274 * Policy set already?
4275 4275 */
4276 4276 if (policy == seg->shm_policy.mem_policy) {
4277 4277 /*
4278 4278 * Nothing left to do if offset and length
4279 4279 * fall within this segment
4280 4280 */
4281 4281 if (eoff <= oldeoff) {
4282 4282 retval = 1;
4283 4283 break;
4284 4284 } else {
4285 4285 len = eoff - oldeoff;
4286 4286 off = oldeoff;
4287 4287 continue;
4288 4288 }
4289 4289 }
4290 4290
4291 4291 /*
4292 4292 * Specified offset and length match existing segment exactly
4293 4293 */
4294 4294 if (off == seg->shm_off && len == seg->shm_size) {
4295 4295 /*
4296 4296 * Set policy and update current length
4297 4297 */
4298 4298 seg->shm_policy.mem_policy = policy;
4299 4299 seg->shm_policy.mem_lgrpid = LGRP_NONE;
4300 4300 len = 0;
4301 4301
4302 4302 /*
4303 4303 * Try concatenating new segment with previous and next
4304 4304 * segments, since they might have the same policy now.
4305 4305 * Grab previous and next segments first because they
4306 4306 * will change on concatenation.
4307 4307 */
4308 4308 prev = AVL_PREV(tree, seg);
4309 4309 next = AVL_NEXT(tree, seg);
4310 4310 (void) lgrp_shm_policy_concat(tree, seg, next);
4311 4311 (void) lgrp_shm_policy_concat(tree, prev, seg);
4312 4312 } else {
4313 4313 /*
4314 4314 * Specified offset and length only apply to part of
4315 4315 * existing segment
4316 4316 */
4317 4317
4318 4318 /*
4319 4319 * New segment starts in middle of old one, so split
4320 4320 * new one off near beginning of old one
4321 4321 */
4322 4322 newseg = NULL;
4323 4323 if (off > seg->shm_off) {
4324 4324 newseg = lgrp_shm_policy_split(tree, seg, off);
4325 4325
4326 4326 /*
4327 4327 * New segment ends where old one did, so try
4328 4328 * to concatenate with next segment
4329 4329 */
4330 4330 if (eoff == oldeoff) {
4331 4331 newseg->shm_policy.mem_policy = policy;
4332 4332 newseg->shm_policy.mem_lgrpid =
4333 4333 LGRP_NONE;
4334 4334 (void) lgrp_shm_policy_concat(tree,
4335 4335 newseg, AVL_NEXT(tree, newseg));
4336 4336 break;
4337 4337 }
4338 4338 }
4339 4339
4340 4340 /*
4341 4341 * New segment ends before old one, so split off end of
4342 4342 * old one
4343 4343 */
4344 4344 if (eoff < oldeoff) {
4345 4345 if (newseg) {
4346 4346 (void) lgrp_shm_policy_split(tree,
4347 4347 newseg, eoff);
4348 4348 newseg->shm_policy.mem_policy = policy;
4349 4349 newseg->shm_policy.mem_lgrpid =
4350 4350 LGRP_NONE;
4351 4351 } else {
4352 4352 (void) lgrp_shm_policy_split(tree, seg,
4353 4353 eoff);
4354 4354 seg->shm_policy.mem_policy = policy;
4355 4355 seg->shm_policy.mem_lgrpid = LGRP_NONE;
4356 4356 }
4357 4357
4358 4358 if (off == seg->shm_off)
4359 4359 (void) lgrp_shm_policy_concat(tree,
4360 4360 AVL_PREV(tree, seg), seg);
4361 4361 break;
4362 4362 }
4363 4363
4364 4364 /*
4365 4365 * Calculate remaining length and next offset
4366 4366 */
4367 4367 len = eoff - oldeoff;
4368 4368 off = oldeoff;
4369 4369 }
4370 4370 }
4371 4371
4372 4372 rw_exit(&shm_locality->loc_lock);
4373 4373 return (retval);
4374 4374 }
4375 4375
4376 4376 /*
4377 4377 * Return the best memnode from which to allocate memory given
4378 4378 * an lgroup.
4379 4379 *
4380 4380 * "c" is for cookie, which is good enough for me.
4381 4381 * It references a cookie struct that should be zero'ed to initialize.
4382 4382 * The cookie should live on the caller's stack.
4383 4383 *
4384 4384 * The routine returns -1 when:
4385 4385 * - traverse is 0, and all the memnodes in "lgrp" have been returned.
4386 4386 * - traverse is 1, and all the memnodes in the system have been
4387 4387 * returned.
4388 4388 */
4389 4389 int
4390 4390 lgrp_memnode_choose(lgrp_mnode_cookie_t *c)
4391 4391 {
4392 4392 lgrp_t *lp = c->lmc_lgrp;
4393 4393 mnodeset_t nodes = c->lmc_nodes;
4394 4394 int cnt = c->lmc_cnt;
4395 4395 int offset, mnode;
4396 4396
4397 4397 extern int max_mem_nodes;
4398 4398
4399 4399 /*
4400 4400 * If the set is empty, and the caller is willing, traverse
4401 4401 * up the hierarchy until we find a non-empty set.
4402 4402 */
4403 4403 while (nodes == (mnodeset_t)0 || cnt <= 0) {
4404 4404 if (c->lmc_scope == LGRP_SRCH_LOCAL ||
4405 4405 ((lp = lp->lgrp_parent) == NULL))
4406 4406 return (-1);
4407 4407
4408 4408 nodes = lp->lgrp_mnodes & ~(c->lmc_tried);
4409 4409 cnt = lp->lgrp_nmnodes - c->lmc_ntried;
4410 4410 }
4411 4411
4412 4412 /*
4413 4413 * Select a memnode by picking one at a "random" offset.
4414 4414 * Because of DR, memnodes can come and go at any time.
4415 4415 * This code must be able to cope with the possibility
4416 4416 * that the nodes count "cnt" is inconsistent with respect
4417 4417 * to the number of elements actually in "nodes", and
4418 4418 * therefore that the offset chosen could be greater than
4419 4419 * the number of elements in the set (some memnodes may
4420 4420 * have dissapeared just before cnt was read).
4421 4421 * If this happens, the search simply wraps back to the
4422 4422 * beginning of the set.
4423 4423 */
4424 4424 ASSERT(nodes != (mnodeset_t)0 && cnt > 0);
4425 4425 offset = c->lmc_rand % cnt;
4426 4426 do {
4427 4427 for (mnode = 0; mnode < max_mem_nodes; mnode++)
4428 4428 if (nodes & ((mnodeset_t)1 << mnode))
4429 4429 if (!offset--)
4430 4430 break;
4431 4431 } while (mnode >= max_mem_nodes);
4432 4432
4433 4433 /* Found a node. Store state before returning. */
4434 4434 c->lmc_lgrp = lp;
4435 4435 c->lmc_nodes = (nodes & ~((mnodeset_t)1 << mnode));
4436 4436 c->lmc_cnt = cnt - 1;
4437 4437 c->lmc_tried = (c->lmc_tried | ((mnodeset_t)1 << mnode));
4438 4438 c->lmc_ntried++;
4439 4439
4440 4440 return (mnode);
4441 4441 }
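
/*
 * A standalone userland sketch (not part of this file) of the caller
 * contract described above: zero a cookie on the stack, then call the
 * chooser repeatedly until it returns -1.  The toy_* cookie and chooser
 * below are hypothetical stand-ins that hand out the set bits of a memnode
 * mask and then report exhaustion.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

typedef struct toy_mnode_cookie {
	uint64_t	nodes;		/* memnodes not yet returned */
} toy_mnode_cookie_t;

static void
toy_cookie_init(toy_mnode_cookie_t *c, uint64_t nodes)
{
	memset(c, 0, sizeof (*c));	/* callers start from a zeroed cookie */
	c->nodes = nodes;
}

/* Return the next memnode in the cookie's set, or -1 when exhausted. */
static int
toy_memnode_choose(toy_mnode_cookie_t *c)
{
	int mnode;

	for (mnode = 0; mnode < 64; mnode++) {
		if (c->nodes & ((uint64_t)1 << mnode)) {
			c->nodes &= ~((uint64_t)1 << mnode);
			return (mnode);
		}
	}
	return (-1);
}

int
main(void)
{
	toy_mnode_cookie_t c;
	int mnode;

	toy_cookie_init(&c, 0x15);	/* memnodes 0, 2, 4 */
	while ((mnode = toy_memnode_choose(&c)) != -1)
		printf("allocate from memnode %d\n", mnode);
	return (0);
}
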
2951 lines elided