XXXX pass in cpu_pause_func via pause_cpus
--- old/usr/src/uts/sun4v/os/mpo.c
+++ new/usr/src/uts/sun4v/os/mpo.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
24 24 * Use is subject to license terms.
25 25 */
26 26
27 27 #include <sys/types.h>
28 28 #include <sys/sysmacros.h>
29 29 #include <sys/machsystm.h>
30 30 #include <sys/machparam.h>
31 31 #include <sys/cmn_err.h>
32 32 #include <sys/stat.h>
33 33 #include <sys/mach_descrip.h>
34 34 #include <sys/memnode.h>
35 35 #include <sys/mdesc.h>
36 36 #include <sys/mpo.h>
37 37 #include <vm/page.h>
38 38 #include <vm/vm_dep.h>
39 39 #include <vm/hat_sfmmu.h>
40 40 #include <sys/promif.h>
41 41
42 42 /*
43 43 * MPO and the sun4v memory representation
44 44 * ---------------------------------------
45 45 *
46 46 * Latency groups are defined in the sun4v architecture by memory-latency-group
47 47 * nodes in the Machine Description, as specified in FWARC/2007/260. These
48 48 * tie together cpu nodes and mblock nodes, and contain mask and match
49 49 * properties that identify the portion of an mblock that belongs to the
50 50 * lgroup. Mask and match are defined in the Physical Address (PA) space,
51 51 * but an mblock defines Real Addresses (RA). To translate, the mblock
52 52 * includes the property address-congruence-offset, hereafter referred to as
53 53 * ra_to_pa. A real address ra is a member of an lgroup if
54 54 *
55 55 * (ra + mblock.ra_to_pa) & lgroup.mask == lgroup.match
56 56 *
57 57 * The MD is traversed, and information on all mblocks is kept in the array
58 58 * mpo_mblock[]. Information on all CPUs, including which lgroup they map
59 59 * to, is kept in the array mpo_cpu[].
60 60 *
61 61 * This implementation makes (and verifies) the simplifying assumption that
62 62 * the mask bits are the same for all defined lgroups, and that all 1 bits in
63 63 * the mask are contiguous. Thus the number of lgroups is bounded by the
64 64 * number of possible mask values, and the lgrp_handle_t is defined as the
65 65 * mask value, shifted right to eliminate the 0 bit positions in mask. The
66 66 * masks and values are also referred to as "home bits" in the code.
67 67 *
68 68 * A mem_node is defined to be 1:1 with an lgrp_handle_t, thus each lgroup
69 69 * has exactly 1 mem_node, and plat_pfn_to_mem_node() must find the mblock
70 70 * containing a pfn, apply the mblock's ra_to_pa adjustment, and extract the
71 71 * home bits. This yields the mem_node.
72 72 *
73 73 * Interfaces
74 74 * ----------
75 75 *
76 76 * This file exports the following entry points:
77 77 *
78 78 * plat_lgrp_init()
79 79 * plat_build_mem_nodes()
80 80 * plat_lgrp_cpu_to_hand()
81 81 * plat_lgrp_latency()
82 82 * plat_pfn_to_mem_node()
83 83 * These implement the usual platform lgroup interfaces.
84 84 *
85 85 * plat_rapfn_to_papfn()
86 86 * Recover the PA page coloring bits from an RA.
87 87 *
88 88 * plat_mem_node_iterator_init()
89 89 * Initialize an iterator to efficiently step through pages in a mem_node.
90 90 *
91 91 * plat_mem_node_intersect_range()
92 92 * Find the intersection with a mem_node.
93 93 *
94 94 * plat_slice_add()
95 95 * plat_slice_del()
96 96 * Platform hooks to add/delete a pfn range.
97 97 *
98 98 * Internal Organization
99 99 * ---------------------
100 100 *
101 101 * A number of routines are used by both boot and DR code to (re)build the
102 102 * appropriate MPO structures.
103 103 *
104 104 * mblock_alloc()
105 105 * Allocate memory for mblocks and stripes as
106 106 * appropriate for boot or memory DR.
107 107 *
108 108 * mblock_free()
109 109 * Free memory allocated by mblock_alloc.
110 110 *
111 111 * mblock_update()
112 112 * Build mblocks based on mblock nodes read from the MD.
113 113 *
114 114 * mblock_update_add()
115 115 * Rebuild mblocks after a memory DR add operation.
116 116 *
117 117 * mblock_update_del()
118 118 * Rebuild mblocks after a memory DR delete operation.
119 119 *
120 120 * mblock_install()
121 121 * Install mblocks as the new configuration.
122 122 *
123 123 * mstripe_update()
124 124 * Build stripes based on mblocks.
125 125 *
126 126 * mnode_update()
127 127 * Call memnode layer to add/del a pfn range, based on stripes.
128 128 *
129 129 * The platform interfaces allocate all memory required for the
130 130 * particular update first, block access to the MPO structures
131 131 * while they are updated, and free old structures after the update.
132 132 */
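
The membership test and home-bit extraction described in the comment above can be restated in a few lines of standalone C. This is an illustrative sketch only, not part of the diffed file: the ex_* names are invented, and it assumes a nonzero mask with contiguous 1 bits (as the comment requires) and a GCC-style __builtin_ctzll for counting the low zero bits.

    #include <stdint.h>

    struct ex_mblock {
            uint64_t base;          /* RA base of the mblock */
            uint64_t size;
            uint64_t ra_to_pa;      /* address-congruence-offset */
    };

    /* Does real address ra belong to the lgroup with (mask, match)? */
    static int
    ex_ra_in_lgroup(uint64_t ra, const struct ex_mblock *mb,
        uint64_t mask, uint64_t match)
    {
            return (((ra + mb->ra_to_pa) & mask) == match);
    }

    /* Extract the home bits: the lgrp_handle / mem_node for ra. */
    static uint64_t
    ex_ra_to_home(uint64_t ra, const struct ex_mblock *mb, uint64_t mask)
    {
            int shift = __builtin_ctzll(mask);  /* drop the 0 bits below the mask */

            return (((ra + mb->ra_to_pa) & mask) >> shift);
    }
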
133 133
134 134 int sun4v_mpo_enable = 1;
135 135 int sun4v_mpo_debug = 0;
136 136 char sun4v_mpo_status[256] = "";
137 137
138 138 /* Save CPU info from the MD and associate CPUs with lgroups */
139 139 static struct cpu_md mpo_cpu[NCPU];
140 140
141 141 /* Save lgroup info from the MD */
142 142 #define MAX_MD_LGROUPS 32
143 143 static struct lgrp_md mpo_lgroup[MAX_MD_LGROUPS];
144 144 static int n_lgrpnodes = 0;
145 145 static int n_locality_groups = 0;
146 146 static int max_locality_groups = 0;
147 147 static int szc_mask0 = 0;
148 148
149 149 /* Save mblocks from the MD */
150 150 #define SMALL_MBLOCKS_COUNT 8
151 151 static struct mblock_md *mpo_mblock;
152 152 static struct mblock_md small_mpo_mblocks[SMALL_MBLOCKS_COUNT];
153 153 static int n_mblocks = 0;
154 154
155 155 /* Save mem_node stripes calculated from mblocks and lgroups. */
156 156 static mem_stripe_t *mem_stripes;
157 157 static mem_stripe_t small_mem_stripes[SMALL_MBLOCKS_COUNT * MAX_MEM_NODES];
158 158 static int n_mem_stripes = 0;
159 159 static pfn_t mnode_stride; /* distance between stripes, start to start */
160 160 static int stripe_shift; /* stride/stripes expressed as a shift */
161 161 static pfn_t mnode_pages; /* mem_node stripe width */
162 162
163 163 /* Save home mask and shift used to calculate lgrp_handle_t values */
164 164 static uint64_t home_mask = 0;
165 165 static pfn_t home_mask_pfn = 0;
166 166 static int home_mask_shift = 0;
167 167 static uint_t home_mask_pfn_shift = 0;
168 168
169 169 /* Save lowest and highest latencies found across all lgroups */
170 170 static int lower_latency = 0;
171 171 static int higher_latency = 0;
172 172
173 173 static pfn_t base_ra_to_pa_pfn = 0; /* ra_to_pa for single mblock memory */
174 174 static int mpo_genid; /* config gen; updated by mem DR */
175 175 static mpo_config_t mpo_config; /* current mblocks and stripes */
176 176
177 177 typedef enum { U_ADD, U_ADD_ALL, U_DEL } update_t;
178 178
179 179 static int valid_pages(md_t *md, mde_cookie_t cpu0);
180 180 static int unique_home_mem_lg_count(uint64_t mem_lg_homeset);
181 181 static int fix_interleave(void);
182 182
183 183 static int mblock_alloc(mpo_config_t *, update_t, int nmblocks);
184 184 static void mblock_install(mpo_config_t *);
185 185 static void mblock_free(mpo_config_t *);
186 186 static void mblock_update(mpo_config_t *, md_t, mde_cookie_t *mblocknodes);
187 187 static void mblock_update_add(mpo_config_t *);
188 188 static void mblock_update_del(mpo_config_t *, mpo_config_t *, pfn_t, pfn_t);
189 189 static void mstripe_update(mpo_config_t *);
190 190 static void mnode_update(mpo_config_t *, pfn_t, pfn_t, update_t);
191 191
192 192 /* Debug support */
193 193 #if defined(DEBUG) && !defined(lint)
194 194 #define VALIDATE_SLICE(base, end) { \
195 195 ASSERT(IS_P2ALIGNED(ptob(base), TTEBYTES(TTE256M))); \
196 196 ASSERT(IS_P2ALIGNED(ptob(end - base + 1), TTEBYTES(TTE256M))); \
197 197 }
198 198 #define MPO_DEBUG(args...) if (sun4v_mpo_debug) printf(args)
199 199 #else
200 200 #define VALIDATE_SLICE(base, end)
201 201 #define MPO_DEBUG(...)
202 202 #endif /* DEBUG */
203 203
204 204 /* Record status message, viewable from mdb */
205 205 #define MPO_STATUS(args...) { \
206 206 (void) snprintf(sun4v_mpo_status, sizeof (sun4v_mpo_status), args); \
207 207 MPO_DEBUG(sun4v_mpo_status); \
208 208 }
209 209
210 210 /*
211 211 * The MPO locks are to protect the MPO metadata while that
212 212 * information is updated as a result of a memory DR operation.
213 213 * The read lock must be acquired to read the metadata and the
214 214 * write locks must be acquired to update it.
215 215 */
216 216 #define mpo_rd_lock kpreempt_disable
217 217 #define mpo_rd_unlock kpreempt_enable
218 218
219 219 static void
220 220 mpo_wr_lock()
221 221 {
222 222 mutex_enter(&cpu_lock);
223 - pause_cpus(NULL);
223 + pause_cpus(NULL, NULL);
224 224 mutex_exit(&cpu_lock);
225 225 }
226 226
227 227 static void
228 228 mpo_wr_unlock()
229 229 {
230 230 mutex_enter(&cpu_lock);
231 231 start_cpus();
232 232 mutex_exit(&cpu_lock);
233 233 }
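
A minimal sketch of how these primitives are meant to be used, per the comment above: readers disable preemption around their accesses, while the DR path pauses all other CPUs before swapping in a new configuration. This is editorial illustration only; ex_read_n_mblocks is a hypothetical caller, not code from the file.

    static int
    ex_read_n_mblocks(void)
    {
            int n;

            mpo_rd_lock();          /* kpreempt_disable(): a writer cannot   */
            n = n_mblocks;          /* complete pause_cpus() while we run,   */
            mpo_rd_unlock();        /* so the arrays stay consistent here    */
            return (n);
    }
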
234 234
235 235 /*
236 236 * Routine to read a uint64_t from a given md
237 237 */
238 238 static int64_t
239 239 get_int(md_t md, mde_cookie_t node, char *propname, uint64_t *val)
240 240 {
241 241 int err = md_get_prop_val(md, node, propname, val);
242 242 return (err);
243 243 }
244 244
245 245 static int
246 246 mblock_cmp(const void *a, const void *b)
247 247 {
248 248 struct mblock_md *m1 = (struct mblock_md *)a;
249 249 struct mblock_md *m2 = (struct mblock_md *)b;
250 250
251 251 if (m1->base < m2->base)
252 252 return (-1);
253 253 else if (m1->base == m2->base)
254 254 return (0);
255 255 else
256 256 return (1);
257 257 }
258 258
259 259 static void
260 260 mblock_sort(struct mblock_md *mblocks, int n)
261 261 {
262 262 extern void qsort(void *, size_t, size_t,
263 263 int (*)(const void *, const void *));
264 264
265 265 qsort(mblocks, n, sizeof (mblocks[0]), mblock_cmp);
266 266 }
267 267
268 268 static void
269 269 mpo_update_tunables(void)
270 270 {
271 271 int i, ncpu_min;
272 272
273 273 /*
274 274 * lgrp_expand_proc_thresh is the minimum load on the lgroups
275 275 * this process is currently running on before considering
276 276 * expanding threads to another lgroup.
277 277 *
278 278 * lgrp_expand_proc_diff determines how much less the remote lgroup
279 279 * must be loaded before expanding to it.
280 280 *
281 281 * On sun4v CMT processors, threads share a core pipeline, and
282 282 * at less than 100% utilization, best throughput is obtained by
283 283 * spreading threads across more cores, even if some are in a
284 284 * different lgroup. Spread threads to a new lgroup if the
285 285 * current group is more than 50% loaded. Because of virtualization,
286 286 * lgroups may have different numbers of CPUs, but the tunables
287 287 * apply to all lgroups, so find the smallest lgroup and compute
288 288 * 50% loading.
289 289 */
290 290
291 291 ncpu_min = NCPU;
292 292 for (i = 0; i < n_lgrpnodes; i++) {
293 293 int ncpu = mpo_lgroup[i].ncpu;
294 294 if (ncpu != 0 && ncpu < ncpu_min)
295 295 ncpu_min = ncpu;
296 296 }
297 297 lgrp_expand_proc_thresh = ncpu_min * lgrp_loadavg_max_effect / 2;
298 298
299 299 /* new home may only be half as loaded as the existing home to use it */
300 300 lgrp_expand_proc_diff = lgrp_expand_proc_thresh / 2;
301 301
302 302 lgrp_loadavg_tolerance = lgrp_loadavg_max_effect;
303 303 }
304 304
305 305 static mde_cookie_t
306 306 cpuid_to_cpunode(md_t *md, int cpuid)
307 307 {
308 308 mde_cookie_t rootnode, foundnode, *cpunodes;
309 309 uint64_t cpuid_prop;
310 310 int n_cpunodes, i;
311 311
312 312 if (md == NULL)
313 313 return (MDE_INVAL_ELEM_COOKIE);
314 314
315 315 rootnode = md_root_node(md);
316 316 if (rootnode == MDE_INVAL_ELEM_COOKIE)
317 317 return (MDE_INVAL_ELEM_COOKIE);
318 318
319 319 n_cpunodes = md_alloc_scan_dag(md, rootnode, PROP_LG_CPU,
320 320 "fwd", &cpunodes);
321 321 if (n_cpunodes <= 0 || n_cpunodes > NCPU)
322 322 goto cpuid_fail;
323 323
324 324 for (i = 0; i < n_cpunodes; i++) {
325 325 if (md_get_prop_val(md, cpunodes[i], PROP_LG_CPU_ID,
326 326 &cpuid_prop))
327 327 break;
328 328 if (cpuid_prop == (uint64_t)cpuid) {
329 329 foundnode = cpunodes[i];
330 330 md_free_scan_dag(md, &cpunodes);
331 331 return (foundnode);
332 332 }
333 333 }
334 334 cpuid_fail:
335 335 if (n_cpunodes > 0)
336 336 md_free_scan_dag(md, &cpunodes);
337 337 return (MDE_INVAL_ELEM_COOKIE);
338 338 }
339 339
340 340 static int
341 341 mpo_cpu_to_lgroup(md_t *md, mde_cookie_t cpunode)
342 342 {
343 343 mde_cookie_t *nodes;
344 344 uint64_t latency, lowest_latency;
345 345 uint64_t address_match, lowest_address_match;
346 346 int n_lgroups, j, result = 0;
347 347
348 348 /* Find lgroup nodes reachable from this cpu */
349 349 n_lgroups = md_alloc_scan_dag(md, cpunode, PROP_LG_MEM_LG,
350 350 "fwd", &nodes);
351 351
352 352 lowest_latency = ~(0UL);
353 353
354 354 /* Find the lgroup node with the smallest latency */
355 355 for (j = 0; j < n_lgroups; j++) {
356 356 result = get_int(md, nodes[j], PROP_LG_LATENCY,
357 357 &latency);
358 358 result |= get_int(md, nodes[j], PROP_LG_MATCH,
359 359 &address_match);
360 360 if (result != 0) {
361 361 j = -1;
362 362 goto to_lgrp_done;
363 363 }
364 364 if (latency < lowest_latency) {
365 365 lowest_latency = latency;
366 366 lowest_address_match = address_match;
367 367 }
368 368 }
369 369 for (j = 0; j < n_lgrpnodes; j++) {
370 370 if ((mpo_lgroup[j].latency == lowest_latency) &&
371 371 (mpo_lgroup[j].addr_match == lowest_address_match))
372 372 break;
373 373 }
374 374 if (j == n_lgrpnodes)
375 375 j = -1;
376 376
377 377 to_lgrp_done:
378 378 if (n_lgroups > 0)
379 379 md_free_scan_dag(md, &nodes);
380 380 return (j);
381 381 }
382 382
383 383 /* Called when DR'ing in a CPU */
384 384 void
385 385 mpo_cpu_add(md_t *md, int cpuid)
386 386 {
387 387 mde_cookie_t cpunode;
388 388
389 389 int i;
390 390
391 391 if (n_lgrpnodes <= 0)
392 392 return;
393 393
394 394 if (md == NULL)
395 395 goto add_fail;
396 396
397 397 cpunode = cpuid_to_cpunode(md, cpuid);
398 398 if (cpunode == MDE_INVAL_ELEM_COOKIE)
399 399 goto add_fail;
400 400
401 401 i = mpo_cpu_to_lgroup(md, cpunode);
402 402 if (i == -1)
403 403 goto add_fail;
404 404
405 405 mpo_cpu[cpuid].lgrp_index = i;
406 406 mpo_cpu[cpuid].home = mpo_lgroup[i].addr_match >> home_mask_shift;
407 407 mpo_lgroup[i].ncpu++;
408 408 mpo_update_tunables();
409 409 return;
410 410 add_fail:
411 411 panic("mpo_cpu_add: Cannot read MD");
412 412 }
413 413
414 414 /* Called when DR'ing out a CPU */
415 415 void
416 416 mpo_cpu_remove(int cpuid)
417 417 {
418 418 int i;
419 419
420 420 if (n_lgrpnodes <= 0)
421 421 return;
422 422
423 423 i = mpo_cpu[cpuid].lgrp_index;
424 424 mpo_lgroup[i].ncpu--;
425 425 mpo_cpu[cpuid].home = 0;
426 426 mpo_cpu[cpuid].lgrp_index = -1;
427 427 mpo_update_tunables();
428 428 }
429 429
430 430 static mde_cookie_t
431 431 md_get_root(md_t *md)
432 432 {
433 433 mde_cookie_t root = MDE_INVAL_ELEM_COOKIE;
434 434 int n_nodes;
435 435
436 436 n_nodes = md_node_count(md);
437 437
438 438 if (n_nodes <= 0) {
439 439 MPO_STATUS("md_get_root: No nodes in node count\n");
440 440 return (root);
441 441 }
442 442
443 443 root = md_root_node(md);
444 444
445 445 if (root == MDE_INVAL_ELEM_COOKIE) {
446 446 MPO_STATUS("md_get_root: Root node is missing\n");
447 447 return (root);
448 448 }
449 449
450 450 MPO_DEBUG("md_get_root: Node Count: %d\n", n_nodes);
451 451 MPO_DEBUG("md_get_root: md: %p\n", md);
452 452 MPO_DEBUG("md_get_root: root: %lx\n", root);
453 453 done:
454 454 return (root);
455 455 }
456 456
457 457 static int
458 458 lgrp_update(md_t *md, mde_cookie_t root)
459 459 {
460 460 int i, j, result;
461 461 int ret_val = 0;
462 462 int sub_page_fix;
463 463 mde_cookie_t *nodes, *lgrpnodes;
464 464
465 465 n_lgrpnodes = md_alloc_scan_dag(md, root, PROP_LG_MEM_LG,
466 466 "fwd", &lgrpnodes);
467 467
468 468 if (n_lgrpnodes <= 0 || n_lgrpnodes >= MAX_MD_LGROUPS) {
469 469 MPO_STATUS("lgrp_update: No Lgroups\n");
470 470 ret_val = -1;
471 471 goto fail;
472 472 }
473 473
474 474 MPO_DEBUG("lgrp_update: mem_lgs: %d\n", n_lgrpnodes);
475 475
476 476 for (i = 0; i < n_lgrpnodes; i++) {
477 477 mpo_lgroup[i].node = lgrpnodes[i];
478 478 mpo_lgroup[i].id = i;
479 479 mpo_lgroup[i].ncpu = 0;
480 480 result = get_int(md, lgrpnodes[i], PROP_LG_MASK,
481 481 &mpo_lgroup[i].addr_mask);
482 482 result |= get_int(md, lgrpnodes[i], PROP_LG_MATCH,
483 483 &mpo_lgroup[i].addr_match);
484 484
485 485 /*
486 486 * If either the mask or match properties are missing, set to 0
487 487 */
488 488 if (result < 0) {
489 489 mpo_lgroup[i].addr_mask = 0;
490 490 mpo_lgroup[i].addr_match = 0;
491 491 }
492 492
493 493 /* Set latency to 0 if property not present */
494 494
495 495 result = get_int(md, lgrpnodes[i], PROP_LG_LATENCY,
496 496 &mpo_lgroup[i].latency);
497 497 if (result < 0)
498 498 mpo_lgroup[i].latency = 0;
499 499 }
500 500
501 501 /*
502 502 * Sub-page level interleave is not yet supported. Check for it,
503 503 * and remove sub-page interleaved lgroups from mpo_lgroup and
504 504 * n_lgrpnodes. If no lgroups are left, return.
505 505 */
506 506
507 507 sub_page_fix = fix_interleave();
508 508 if (n_lgrpnodes == 0) {
509 509 ret_val = -1;
510 510 goto fail;
511 511 }
512 512
513 513 /* Ensure that all of the addr_mask values are the same */
514 514
515 515 for (i = 0; i < n_lgrpnodes; i++) {
516 516 if (mpo_lgroup[0].addr_mask != mpo_lgroup[i].addr_mask) {
517 517 MPO_STATUS("lgrp_update: "
518 518 "addr_mask values are not the same\n");
519 519 ret_val = -1;
520 520 goto fail;
521 521 }
522 522 }
523 523
524 524 /*
525 525 * Ensure that all lgrp nodes see all the mblocks. However, if
526 526 * sub-page interleave is being fixed, they do not, so skip
527 527 * the check.
528 528 */
529 529
530 530 if (sub_page_fix == 0) {
531 531 for (i = 0; i < n_lgrpnodes; i++) {
532 532 j = md_alloc_scan_dag(md, mpo_lgroup[i].node,
533 533 PROP_LG_MBLOCK, "fwd", &nodes);
534 534 md_free_scan_dag(md, &nodes);
535 535 if (j != n_mblocks) {
536 536 MPO_STATUS("lgrp_update: "
537 537 "sub-page interleave is being fixed\n");
538 538 ret_val = -1;
539 539 goto fail;
540 540 }
541 541 }
542 542 }
543 543 fail:
544 544 if (n_lgrpnodes > 0) {
545 545 md_free_scan_dag(md, &lgrpnodes);
546 546 for (i = 0; i < n_lgrpnodes; i++)
547 547 mpo_lgroup[i].node = MDE_INVAL_ELEM_COOKIE;
548 548 }
549 549
550 550 return (ret_val);
551 551 }
552 552
553 553 /*
554 554 *
555 555 * Traverse the MD to determine:
556 556 *
557 557 * Number of CPU nodes, lgrp_nodes, and mblocks
558 558 * Then for each lgrp_node, obtain the appropriate data.
559 559 * For each CPU, determine its home locality and store it.
560 560 * For each mblock, retrieve its data and store it.
561 561 */
562 562 static int
563 563 lgrp_traverse(md_t *md)
564 564 {
565 565 mde_cookie_t root, *cpunodes, *mblocknodes;
566 566 int o;
567 567 uint64_t i, k, stripe, stride;
568 568 uint64_t mem_lg_homeset = 0;
569 569 int ret_val = 0;
570 570 int result = 0;
571 571 int n_cpunodes = 0;
572 572 mpo_config_t new_config;
573 573
574 574 if ((root = md_get_root(md)) == MDE_INVAL_ELEM_COOKIE) {
575 575 ret_val = -1;
576 576 goto fail;
577 577 }
578 578
579 579 n_mblocks = md_alloc_scan_dag(md, root, PROP_LG_MBLOCK, "fwd",
580 580 &mblocknodes);
581 581 if (n_mblocks <= 0) {
582 582 MPO_STATUS("lgrp_traverse: No mblock nodes detected in Machine "
583 583 "Descriptor\n");
584 584 ret_val = -1;
585 585 goto fail;
586 586 }
587 587
588 588 /*
589 589 * Build the Memory Nodes. Do this before any possibility of
590 590 * bailing from this routine so we obtain ra_to_pa (needed for page
591 591 * coloring) even when there are no lgroups defined.
592 592 */
593 593 if (mblock_alloc(&new_config, U_ADD_ALL, n_mblocks) < 0) {
594 594 ret_val = -1;
595 595 goto fail;
596 596 }
597 597
598 598 mblock_update(&new_config, md, mblocknodes);
599 599 mblock_install(&new_config);
600 600
601 601 /* Page coloring hook is required so we can iterate through mnodes */
602 602 if (&page_next_pfn_for_color_cpu == NULL) {
603 603 MPO_STATUS("lgrp_traverse: No page coloring support\n");
604 604 ret_val = -1;
605 605 goto fail;
606 606 }
607 607
608 608 /* Global enable for mpo */
609 609 if (sun4v_mpo_enable == 0) {
610 610 MPO_STATUS("lgrp_traverse: MPO feature is not enabled\n");
611 611 ret_val = -1;
612 612 goto fail;
613 613 }
614 614
615 615 n_cpunodes = md_alloc_scan_dag(md, root, PROP_LG_CPU, "fwd", &cpunodes);
616 616
617 617 if (n_cpunodes <= 0 || n_cpunodes > NCPU) {
618 618 MPO_STATUS("lgrp_traverse: No CPU nodes detected "
619 619 "in MD\n");
620 620 ret_val = -1;
621 621 goto fail;
622 622 }
623 623
624 624 MPO_DEBUG("lgrp_traverse: cpus: %d\n", n_cpunodes);
625 625
626 626 if ((ret_val = lgrp_update(md, root)) == -1)
627 627 goto fail;
628 628
629 629 /*
630 630 * Use the address mask from the first lgroup node
631 631 * to establish our home_mask.
632 632 */
633 633 home_mask = mpo_lgroup[0].addr_mask;
634 634 home_mask_pfn = btop(home_mask);
635 635 home_mask_shift = lowbit(home_mask) - 1;
636 636 home_mask_pfn_shift = home_mask_shift - PAGESHIFT;
637 637 mnode_pages = btop(1ULL << home_mask_shift);
638 638
639 639 /*
640 640 * How many values are possible in home mask? Assume the mask
641 641 * bits are contiguous.
642 642 */
643 643 max_locality_groups =
644 644 1 << highbit(home_mask_pfn >> home_mask_pfn_shift);
645 645
646 646 stripe_shift = highbit(max_locality_groups) - 1;
647 647 stripe = ptob(mnode_pages);
648 648 stride = max_locality_groups * stripe;
649 649 mnode_stride = btop(stride);
650 650
651 651 /* Now verify the home mask bits are contiguous */
652 652
653 653 if (max_locality_groups - 1 != home_mask_pfn >> home_mask_pfn_shift) {
654 654 MPO_STATUS("lgrp_traverse: "
655 655 "home mask bits are not contiguous\n");
656 656 ret_val = -1;
657 657 goto fail;
658 658 }
659 659
660 660 /* Record all of the home bits */
661 661
662 662 for (i = 0; i < n_lgrpnodes; i++) {
663 663 HOMESET_ADD(mem_lg_homeset,
664 664 mpo_lgroup[i].addr_match >> home_mask_shift);
665 665 }
666 666
667 667 /* Count the number of different "home" mem_lg's we've discovered */
668 668
669 669 n_locality_groups = unique_home_mem_lg_count(mem_lg_homeset);
670 670
671 671 /* If we have only 1 locality group then we can exit */
672 672 if (n_locality_groups == 1) {
673 673 MPO_STATUS("lgrp_traverse: n_locality_groups == 1\n");
674 674 ret_val = -1;
675 675 goto fail;
676 676 }
677 677
678 678 /*
679 679 * Set the latencies. A CPU's lgroup is defined by the lowest
680 680 * latency found. All other memory is considered remote, and the
681 681 * remote latency is represented by the highest latency found.
682 682 * Thus hierarchical lgroups, if any, are approximated by a
683 683 * two level scheme.
684 684 *
685 685 * The Solaris MPO framework by convention wants to see latencies
686 686 * in units of nano-sec/10. In the MD, the units are defined to be
687 687 * pico-seconds.
688 688 */
689 689
690 690 lower_latency = mpo_lgroup[0].latency;
691 691 higher_latency = mpo_lgroup[0].latency;
692 692
693 693 for (i = 1; i < n_lgrpnodes; i++) {
694 694 if (mpo_lgroup[i].latency < lower_latency) {
695 695 lower_latency = mpo_lgroup[i].latency;
696 696 }
697 697 if (mpo_lgroup[i].latency > higher_latency) {
698 698 higher_latency = mpo_lgroup[i].latency;
699 699 }
700 700 }
701 701 lower_latency /= 10000;
702 702 higher_latency /= 10000;
703 703
704 704 /* Clear our CPU data */
705 705
706 706 for (i = 0; i < NCPU; i++) {
707 707 mpo_cpu[i].home = 0;
708 708 mpo_cpu[i].lgrp_index = -1;
709 709 }
710 710
711 711 /* Build the CPU nodes */
712 712 for (i = 0; i < n_cpunodes; i++) {
713 713
714 714 /* Read in the lgroup nodes */
715 715 result = get_int(md, cpunodes[i], PROP_LG_CPU_ID, &k);
716 716 if (result < 0) {
717 717 MPO_STATUS("lgrp_traverse: PROP_LG_CPU_ID missing\n");
718 718 ret_val = -1;
719 719 goto fail;
720 720 }
721 721
722 722 o = mpo_cpu_to_lgroup(md, cpunodes[i]);
723 723 if (o == -1) {
724 724 ret_val = -1;
725 725 goto fail;
726 726 }
727 727 mpo_cpu[k].lgrp_index = o;
728 728 mpo_cpu[k].home = mpo_lgroup[o].addr_match >> home_mask_shift;
729 729 mpo_lgroup[o].ncpu++;
730 730 }
731 731 /* Validate that no large pages cross mnode boundaries. */
732 732 if (valid_pages(md, cpunodes[0]) == 0) {
733 733 ret_val = -1;
734 734 goto fail;
735 735 }
736 736
737 737 fail:
738 738 if (n_cpunodes > 0)
739 739 md_free_scan_dag(md, &cpunodes);
740 740 if (n_mblocks > 0)
741 741 md_free_scan_dag(md, &mblocknodes);
742 742 else
743 743 panic("lgrp_traverse: No memory blocks found");
744 744
745 745 if (ret_val == 0) {
746 746 MPO_STATUS("MPO feature is enabled.\n");
747 747 } else
748 748 sun4v_mpo_enable = 0; /* set this for DR */
749 749
750 750 return (ret_val);
751 751 }
752 752
753 753 /*
754 754 * Determine the number of unique mem_lg's present in our system
755 755 */
756 756 static int
757 757 unique_home_mem_lg_count(uint64_t mem_lg_homeset)
758 758 {
759 759 int homeid;
760 760 int count = 0;
761 761
762 762 /*
763 763 * Scan the "home" bits of the mem_lgs, count
764 764 * the number that are unique.
765 765 */
766 766
767 767 for (homeid = 0; homeid < NLGRPS_MAX; homeid++) {
768 768 if (MEM_LG_ISMEMBER(mem_lg_homeset, homeid)) {
769 769 count++;
770 770 }
771 771 }
772 772
773 773 MPO_DEBUG("unique_home_mem_lg_count: homeset %lx\n",
774 774 mem_lg_homeset);
775 775 MPO_DEBUG("unique_home_mem_lg_count: count: %d\n", count);
776 776
777 777 /* Default must be at least one */
778 778 if (count == 0)
779 779 count = 1;
780 780
781 781 return (count);
782 782 }
783 783
784 784 /*
785 785 * Platform specific lgroup initialization
786 786 */
787 787 void
788 788 plat_lgrp_init(void)
789 789 {
790 790 md_t *md;
791 791 int rc;
792 792
793 793 /* Get the Machine Descriptor handle */
794 794
795 795 md = md_get_handle();
796 796
797 797 /* If not, we cannot continue */
798 798
799 799 if (md == NULL) {
800 800 panic("cannot access machine descriptor\n");
801 801 } else {
802 802 rc = lgrp_traverse(md);
803 803 (void) md_fini_handle(md);
804 804 }
805 805
806 806 /*
807 807 * If we can't process the MD for lgroups then at least let the
808 808 * system try to boot. Assume we have one lgroup so that
809 809 * when plat_build_mem_nodes is called, it will attempt to init
810 810 * an mnode based on the supplied memory segment.
811 811 */
812 812
813 813 if (rc == -1) {
814 814 home_mask_pfn = 0;
815 815 max_locality_groups = 1;
816 816 n_locality_groups = 1;
817 817 return;
818 818 }
819 819
820 820 mem_node_pfn_shift = 0;
821 821 mem_node_physalign = 0;
822 822
823 823 /* Use lgroup-aware TSB allocations */
824 824 tsb_lgrp_affinity = 1;
825 825
826 826 /* Require that a home lgroup have some memory to be chosen */
827 827 lgrp_mem_free_thresh = 1;
828 828
829 829 /* Standard home-on-next-touch policy */
830 830 lgrp_mem_policy_root = LGRP_MEM_POLICY_NEXT;
831 831
832 832 /* Disable option to choose root lgroup if all leaf lgroups are busy */
833 833 lgrp_load_thresh = UINT32_MAX;
834 834
835 835 mpo_update_tunables();
836 836 }
837 837
838 838 /*
839 839 * Helper routine for debugging calls to mem_node_add_slice()
840 840 */
841 841 static void
842 842 mpo_mem_node_add_slice(pfn_t basepfn, pfn_t endpfn)
843 843 {
844 844 #if defined(DEBUG) && !defined(lint)
845 845 static int slice_count = 0;
846 846
847 847 slice_count++;
848 848 MPO_DEBUG("mem_add_slice(%d): basepfn: %lx endpfn: %lx\n",
849 849 slice_count, basepfn, endpfn);
850 850 #endif
851 851 mem_node_add_slice(basepfn, endpfn);
852 852 }
853 853
854 854 static void
855 855 mpo_mem_node_del_slice(pfn_t basepfn, pfn_t endpfn)
856 856 {
857 857 #if defined(DEBUG) && !defined(lint)
858 858 static int slice_count = 0;
859 859
860 860 slice_count++;
861 861 MPO_DEBUG("mem_del_slice(%d): basepfn: %lx endpfn: %lx\n",
862 862 slice_count, basepfn, endpfn);
863 863 #endif
864 864 mem_node_del_slice(basepfn, endpfn);
865 865 }
866 866
867 867 /*
868 868 * Helper routine for debugging calls to plat_assign_lgrphand_to_mem_node()
869 869 */
870 870 static void
871 871 mpo_plat_assign_lgrphand_to_mem_node(lgrp_handle_t plathand, int mnode)
872 872 {
873 873 MPO_DEBUG("plat_assign_to_mem_nodes: lgroup home %ld, "
874 874 "mnode index: %d\n", plathand, mnode);
875 875 plat_assign_lgrphand_to_mem_node(plathand, mnode);
876 876 }
877 877
878 878 /*
879 879 * plat_build_mem_nodes()
880 880 *
881 881 * Define the mem_nodes based on the modified boot memory list,
882 882 * or based on info read from the MD in plat_lgrp_init().
883 883 *
884 884 * When the home mask lies in the middle of the address bits (as it does on
885 885 * Victoria Falls), then the memory in one mem_node is no longer contiguous;
886 886 * it is striped across an mblock in a repeating pattern of contiguous memory
887 887 * followed by a gap. The stripe width is the size of the contiguous piece.
888 888 * The stride is the distance from the start of one contiguous piece to the
889 889 * start of the next. The gap is thus stride - stripe_width.
890 890 *
891 891 * The stripe of an mnode that falls within an mblock is described by the type
892 892 * mem_stripe_t, and there is one mem_stripe_t per mnode per mblock. The
893 893 * mem_stripe_t's are kept in a global array mem_stripes[]. The index into
894 894 * this array is predetermined. The mem_stripe_t that describes mnode m
895 895 * within mpo_mblock[i] is stored at
896 896 * mem_stripes[ m + i * max_locality_groups ]
897 897 *
898 898 * max_locality_groups is the total number of possible locality groups,
899 899 * as defined by the size of the home mask, even if the memory assigned
900 900 * to the domain is small and does not cover all the lgroups. Thus some
901 901 * mem_stripe_t's may be empty.
902 902 *
903 903 * The members of mem_stripe_t are:
904 904 * physbase: First valid page in mem_node in the corresponding mblock
905 905 * physmax: Last valid page in mem_node in mblock
906 906 * offset: The full stripe width starts at physbase - offset.
907 907 * Thus if offset is non-zero, this mem_node starts in the middle
908 908 * of a stripe width, and the second full stripe starts at
909 909 * physbase - offset + stride. (even though physmax may fall in the
910 910 * middle of a stripe width, we do not save the ending fragment size
911 911 * in this data structure.)
912 912 * exists: Set to 1 if the mblock has memory in this mem_node stripe.
913 913 *
914 914 * The stripe width is kept in the global mnode_pages.
915 915 * The stride is kept in the global mnode_stride.
916 916 * All the above use pfn's as the unit.
917 917 *
918 918 * As an example, the memory layout for a domain with 2 mblocks and 4
919 919 * mem_nodes 0,1,2,3 could look like this:
920 920 *
921 921 * 123012301230 ... 012301230123 ...
922 922 * mblock 0 mblock 1
923 923 */
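
The indexing rule above can be stated directly in code. The sketch below is illustrative only (the ex_* names are invented); it simply restates mem_stripes[m + i * max_locality_groups] and the start of the full stripe implied by the offset field.

    /* Stripe of mem_node "mnode" within mpo_mblock[i]. */
    static mem_stripe_t *
    ex_stripe(int i, int mnode)
    {
            return (&mem_stripes[mnode + i * max_locality_groups]);
    }

    /* First pfn of the full stripe width containing ms->physbase. */
    #define EX_STRIPE_START(ms)     ((ms)->physbase - (ms)->offset)
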
924 924
925 925 /*ARGSUSED*/
926 926 void
927 927 plat_build_mem_nodes(prom_memlist_t *list, size_t nelems)
928 928 {
929 929 int elem;
930 930 uint64_t base, len;
931 931
932 932 /* Pre-reserve space for plat_assign_lgrphand_to_mem_node */
933 933 max_mem_nodes = max_locality_groups;
934 934
935 935 mstripe_update(&mpo_config);
936 936
937 937 /* Check for non-MPO sun4v platforms */
938 938 if (n_locality_groups <= 1) {
939 939 mpo_plat_assign_lgrphand_to_mem_node(LGRP_DEFAULT_HANDLE, 0);
940 940 for (elem = 0; elem < nelems; list++, elem++) {
941 941 base = list->addr;
942 942 len = list->size;
943 943
944 944 mpo_mem_node_add_slice(btop(base),
945 945 btop(base + len - 1));
946 946 }
947 947 mem_node_pfn_shift = 0;
948 948 mem_node_physalign = 0;
949 949 } else
950 950 mnode_update(&mpo_config, 0, 0, U_ADD_ALL);
951 951
952 952 /*
953 953 * Indicate to vm_pagelist that the hpm_counters array
954 954 * should be shared because the ranges overlap.
955 955 */
956 956 if (max_mem_nodes > 1) {
957 957 interleaved_mnodes = 1;
958 958 }
959 959 }
960 960
961 961 /*
962 962 * Return the locality group value for the supplied processor
963 963 */
964 964 lgrp_handle_t
965 965 plat_lgrp_cpu_to_hand(processorid_t id)
966 966 {
967 967 lgrp_handle_t lgrphand;
968 968
969 969 mpo_rd_lock();
970 970 if (n_locality_groups > 1) {
971 971 lgrphand = (lgrp_handle_t)mpo_cpu[(int)id].home;
972 972 } else {
973 973 lgrphand = (lgrp_handle_t)LGRP_DEFAULT_HANDLE; /* Default */
974 974 }
975 975 mpo_rd_unlock();
976 976
977 977 return (lgrphand);
978 978 }
979 979
980 980 int
981 981 plat_lgrp_latency(lgrp_handle_t from, lgrp_handle_t to)
982 982 {
983 983 /*
984 984 * Return min remote latency when there are more than two lgroups
985 985 * (root and child) and getting latency between two different lgroups
986 986 * or root is involved.
987 987 */
988 988 if (lgrp_optimizations() && (from != to ||
989 989 from == LGRP_DEFAULT_HANDLE || to == LGRP_DEFAULT_HANDLE)) {
990 990 return ((int)higher_latency);
991 991 } else {
992 992 return ((int)lower_latency);
993 993 }
994 994 }
995 995
996 996 int
997 997 plat_pfn_to_mem_node(pfn_t pfn)
998 998 {
999 999 int i, mnode;
1000 1000 pfn_t ra_to_pa_pfn;
1001 1001 struct mblock_md *mb;
1002 1002
1003 1003 if (n_locality_groups <= 1)
1004 1004 return (0);
1005 1005
1006 1006 /*
1007 1007 * The mnode is defined to be 1:1 with the lgroup handle, which
1008 1008 * is taken from the home bits. Find the mblock in which
1009 1009 * the pfn falls to get the ra_to_pa adjustment, and extract
1010 1010 * the home bits.
1011 1011 */
1012 1012 mpo_rd_lock();
1013 1013 mb = &mpo_mblock[0];
1014 1014 for (i = 0; i < n_mblocks; i++) {
1015 1015 if (pfn >= mb->base_pfn && pfn <= mb->end_pfn) {
1016 1016 ra_to_pa_pfn = btop(mb->ra_to_pa);
1017 1017 mnode = (((pfn + ra_to_pa_pfn) & home_mask_pfn) >>
1018 1018 home_mask_pfn_shift);
1019 1019 ASSERT(mnode < max_mem_nodes);
1020 1020 mpo_rd_unlock();
1021 1021 return (mnode);
1022 1022 }
1023 1023 mb++;
1024 1024 }
1025 1025
1026 1026 panic("plat_pfn_to_mem_node() failed to find mblock: pfn=%lx\n", pfn);
1027 1027 return (pfn);
1028 1028 }
1029 1029
1030 1030 /*
1031 1031 * plat_rapfn_to_papfn
1032 1032 *
1033 1033 * Convert a pfn in RA space to a pfn in PA space, in which the page coloring
1034 1034 * and home mask bits are correct. The upper bits do not necessarily
1035 1035 * match the actual PA, however.
1036 1036 */
1037 1037 pfn_t
1038 1038 plat_rapfn_to_papfn(pfn_t pfn)
1039 1039 {
1040 1040 int i;
1041 1041 pfn_t ra_to_pa_pfn;
1042 1042 struct mblock_md *mb;
1043 1043
1044 1044 ASSERT(n_mblocks > 0);
1045 1045 if (n_mblocks == 1)
1046 1046 return (pfn + base_ra_to_pa_pfn);
1047 1047
1048 1048 /*
1049 1049 * Find the mblock in which the pfn falls
1050 1050 * in order to get the ra_to_pa adjustment.
1051 1051 */
1052 1052 mpo_rd_lock();
1053 1053 for (mb = &mpo_mblock[0], i = 0; i < n_mblocks; i++, mb++) {
1054 1054 if (pfn <= mb->end_pfn && pfn >= mb->base_pfn) {
1055 1055 ra_to_pa_pfn = btop(mb->ra_to_pa);
1056 1056 mpo_rd_unlock();
1057 1057 return (pfn + ra_to_pa_pfn);
1058 1058 }
1059 1059 }
1060 1060
1061 1061 panic("plat_rapfn_to_papfn() failed to find mblock: pfn=%lx\n", pfn);
1062 1062 return (pfn);
1063 1063 }
1064 1064
1065 1065 /*
1066 1066 * plat_mem_node_iterator_init()
1067 1067 * Initialize cookie "it" to iterate over pfn's in an mnode. There is
1068 1068 * no additional iterator function. The caller uses the info from
1069 1069 * the iterator structure directly.
1070 1070 *
1071 1071 * pfn: starting pfn.
1072 1072 * mnode: desired mnode.
1073 1073 * szc: desired page size.
1074 1074 * init:
1075 1075 * if 1, start a new traversal, initialize "it", find first
1076 1076 * mblock containing pfn, and return its starting pfn
1077 1077 * within the mnode.
1078 1078 * if 0, continue the previous traversal using passed-in data
1079 1079 * from "it", advance to the next mblock, and return its
1080 1080 * starting pfn within the mnode.
1081 1081 * it: returns readonly data to the caller; see below.
1082 1082 *
1083 1083 * The input pfn must be aligned for the page size szc.
1084 1084 *
1085 1085 * Returns: starting pfn for the iteration for the mnode/mblock,
1086 1086 * which is aligned according to the page size,
1087 1087 * or returns (pfn_t)(-1) if the input pfn lies past the last
1088 1088 * valid pfn of the mnode.
1089 1089 * Returns misc values in the "it" struct that allows the caller
1090 1090 * to advance the pfn within an mblock using address arithmetic;
1091 1091 * see definition of mem_node_iterator_t in vm_dep.h.
1092 1092 * When the caller calculates a pfn that is greater than the
1093 1093 * returned value it->mi_mblock_end, the caller should again
1094 1094 * call plat_mem_node_iterator_init, passing init=0.
1095 1095 *
1096 1096 * The last mblock in continuation case may be invalid because
1097 1097 * of memory DR. To detect this situation mi_genid is checked
1098 1098 * against mpo_genid which is incremented after a memory DR
1099 1099 * operation. See also plat_slice_add()/plat_slice_del().
1100 1100 */
1101 1101 pfn_t
1102 1102 plat_mem_node_iterator_init(pfn_t pfn, int mnode, uchar_t szc,
1103 1103 mem_node_iterator_t *it, int init)
1104 1104 {
1105 1105 int i;
1106 1106 pgcnt_t szcpgcnt = PNUM_SIZE(szc);
1107 1107 struct mblock_md *mblock;
1108 1108 pfn_t base, end;
1109 1109 mem_stripe_t *ms;
1110 1110 uint64_t szcpagesize;
1111 1111
1112 1112 ASSERT(it != NULL);
1113 1113 ASSERT(mnode >= 0 && mnode < max_mem_nodes);
1114 1114 ASSERT(n_mblocks > 0);
1115 1115 ASSERT(P2PHASE(pfn, szcpgcnt) == 0);
1116 1116
1117 1117 mpo_rd_lock();
1118 1118
1119 1119 if (init || (it->mi_genid != mpo_genid)) {
1120 1120 it->mi_genid = mpo_genid;
1121 1121 it->mi_last_mblock = 0;
1122 1122 it->mi_init = 1;
1123 1123 }
1124 1124
1125 1125 /* Check if mpo is not enabled and we only have one mblock */
1126 1126 if (n_locality_groups == 1 && n_mblocks == 1) {
1127 1127 if (P2PHASE(base_ra_to_pa_pfn, szcpgcnt)) {
1128 1128 pfn = (pfn_t)-1;
1129 1129 goto done;
1130 1130 }
1131 1131 it->mi_mnode = mnode;
1132 1132 it->mi_ra_to_pa = base_ra_to_pa_pfn;
1133 1133 it->mi_mnode_pfn_mask = 0;
1134 1134 it->mi_mnode_pfn_shift = 0;
1135 1135 it->mi_mnode_mask = 0;
1136 1136 it->mi_mblock_base = mem_node_config[mnode].physbase;
1137 1137 it->mi_mblock_end = mem_node_config[mnode].physmax;
1138 1138 if (pfn < it->mi_mblock_base)
1139 1139 pfn = P2ROUNDUP(it->mi_mblock_base, szcpgcnt);
1140 1140 if ((pfn + szcpgcnt - 1) > it->mi_mblock_end)
1141 1141 pfn = (pfn_t)-1;
1142 1142 goto done;
1143 1143 }
1144 1144
1145 1145 /* init=1 means begin iterator, init=0 means continue */
1146 1146 if (init == 1) {
1147 1147 i = 0;
1148 1148 } else {
1149 1149 ASSERT(it->mi_last_mblock < n_mblocks);
1150 1150 i = it->mi_last_mblock;
1151 1151 ASSERT(pfn >
1152 1152 mem_stripes[i * max_locality_groups + mnode].physmax);
1153 1153 if (++i == n_mblocks) {
1154 1154 pfn = (pfn_t)-1;
1155 1155 goto done;
1156 1156 }
1157 1157 }
1158 1158
1159 1159 /*
1160 1160 * Find the mblock that contains pfn for mnode's stripe, or the first such
1161 1161 * mblock after pfn; otherwise pfn is out of bounds and we'll return -1.
1162 1162 * mblocks and stripes are sorted in ascending address order.
1163 1163 */
1164 1164 szcpagesize = szcpgcnt << PAGESHIFT;
1165 1165 for (; i < n_mblocks; i++) {
1166 1166 if (P2PHASE(mpo_mblock[i].ra_to_pa, szcpagesize))
1167 1167 continue;
1168 1168 ms = &mem_stripes[i * max_locality_groups + mnode];
1169 1169 if (ms->exists && (pfn + szcpgcnt - 1) <= ms->physmax &&
1170 1170 (P2ROUNDUP(ms->physbase, szcpgcnt) + szcpgcnt - 1) <=
1171 1171 ms->physmax)
1172 1172 break;
1173 1173 }
1174 1174 if (i == n_mblocks) {
1175 1175 it->mi_last_mblock = i - 1;
1176 1176 pfn = (pfn_t)-1;
1177 1177 goto done;
1178 1178 }
1179 1179
1180 1180 it->mi_last_mblock = i;
1181 1181
1182 1182 mblock = &mpo_mblock[i];
1183 1183 base = ms->physbase;
1184 1184 end = ms->physmax;
1185 1185
1186 1186 it->mi_mnode = mnode;
1187 1187 it->mi_ra_to_pa = btop(mblock->ra_to_pa);
1188 1188 it->mi_mblock_base = base;
1189 1189 it->mi_mblock_end = end;
1190 1190 it->mi_mnode_pfn_mask = home_mask_pfn; /* is 0 for non-MPO case */
1191 1191 it->mi_mnode_pfn_shift = home_mask_pfn_shift;
1192 1192 it->mi_mnode_mask = max_locality_groups - 1;
1193 1193 if (pfn < base) {
1194 1194 pfn = P2ROUNDUP(base, szcpgcnt);
1195 1195 ASSERT(pfn + szcpgcnt - 1 <= end);
1196 1196 }
1197 1197 ASSERT((pfn + szcpgcnt - 1) <= mpo_mblock[i].end_pfn);
1198 1198 done:
1199 1199 mpo_rd_unlock();
1200 1200 return (pfn);
1201 1201 }
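
The calling protocol documented above (start with init=1, advance the pfn by the page count, re-call with init=0 once the pfn passes mi_mblock_end, stop on (pfn_t)-1) might look like the following. This is a hedged usage sketch, not a caller that exists in the tree; ex_walk_mnode is an invented name and error handling is omitted.

    static void
    ex_walk_mnode(int mnode, uchar_t szc)
    {
            mem_node_iterator_t it;
            pgcnt_t szcpgcnt = PNUM_SIZE(szc);
            pfn_t pfn;

            pfn = plat_mem_node_iterator_init(0, mnode, szc, &it, 1);
            while (pfn != (pfn_t)-1) {
                    /* ... use the szc-sized page starting at pfn ... */
                    pfn += szcpgcnt;
                    if (pfn > it.mi_mblock_end)
                            pfn = plat_mem_node_iterator_init(pfn, mnode,
                                szc, &it, 0);
            }
    }
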
1202 1202
1203 1203 /*
1204 1204 * plat_mem_node_intersect_range()
1205 1205 *
1206 1206 * Find the intersection between a memnode and a range of pfn's.
1207 1207 */
1208 1208 void
1209 1209 plat_mem_node_intersect_range(pfn_t test_base, pgcnt_t test_len,
1210 1210 int mnode, pgcnt_t *npages_out)
1211 1211 {
1212 1212 pfn_t offset, len, hole, base, end, test_end, frag;
1213 1213 pfn_t nearest;
1214 1214 mem_stripe_t *ms;
1215 1215 int i, npages;
1216 1216
1217 1217 *npages_out = 0;
1218 1218
1219 1219 if (!mem_node_config[mnode].exists || test_len == 0)
1220 1220 return;
1221 1221
1222 1222 base = mem_node_config[mnode].physbase;
1223 1223 end = mem_node_config[mnode].physmax;
1224 1224
1225 1225 test_end = test_base + test_len - 1;
1226 1226 if (end < test_base || base > test_end)
1227 1227 return;
1228 1228
1229 1229 if (n_locality_groups == 1) {
1230 1230 *npages_out = MIN(test_end, end) - MAX(test_base, base) + 1;
1231 1231 return;
1232 1232 }
1233 1233
1234 1234 hole = mnode_stride - mnode_pages;
1235 1235 npages = 0;
1236 1236
1237 1237 /*
1238 1238 * Iterate over all the stripes for this mnode (one per mblock),
1239 1239 * find the intersection with each, and accumulate the intersections.
1240 1240 *
1241 1241 * Determining the intersection with a stripe is tricky. If base or end
1242 1242 * fall outside the mem_node bounds, round them to physbase/physmax of
1243 1243 * mem_node. If base or end fall in a gap, round them to start of
1244 1244 * nearest stripe. If they fall within a stripe, keep base or end,
1245 1245 * but calculate the fragment size that should be excluded from the
1246 1246 * stripe. Calculate how many strides fall in the adjusted range,
1247 1247 * multiply by stripe width, and add the start and end fragments.
1248 1248 */
1249 1249
1250 1250 mpo_rd_lock();
1251 1251 for (i = mnode; i < n_mem_stripes; i += max_locality_groups) {
1252 1252 ms = &mem_stripes[i];
1253 1253 if (ms->exists &&
1254 1254 test_base <= (end = ms->physmax) &&
1255 1255 test_end >= (base = ms->physbase)) {
1256 1256
1257 1257 offset = ms->offset;
1258 1258
1259 1259 if (test_base > base) {
1260 1260 /* Round test_base to next multiple of stride */
1261 1261 len = P2ROUNDUP(test_base - (base - offset),
1262 1262 mnode_stride);
1263 1263 nearest = base - offset + len;
1264 1264 /*
1265 1265 * Compute distance from test_base to the
1266 1266 * stride boundary to see if test_base falls
1267 1267 * in the stripe or in the hole.
1268 1268 */
1269 1269 if (nearest - test_base > hole) {
1270 1270 /*
1271 1271 * test_base lies in stripe,
1272 1272 * and offset should be excluded.
1273 1273 */
1274 1274 offset = test_base -
1275 1275 (nearest - mnode_stride);
1276 1276 base = test_base;
1277 1277 } else {
1278 1278 /* round up to next stripe start */
1279 1279 offset = 0;
1280 1280 base = nearest;
1281 1281 if (base > end)
1282 1282 continue;
1283 1283 }
1284 1284
1285 1285 }
1286 1286
1287 1287 if (test_end < end)
1288 1288 end = test_end;
1289 1289 end++; /* adjust to an exclusive bound */
1290 1290
1291 1291 /* Round end to next multiple of stride */
1292 1292 len = P2ROUNDUP(end - (base - offset), mnode_stride);
1293 1293 nearest = (base - offset) + len;
1294 1294 if (nearest - end <= hole) {
1295 1295 /* end falls in hole, use entire last stripe */
1296 1296 frag = 0;
1297 1297 } else {
1298 1298 /* end falls in stripe, compute fragment */
1299 1299 frag = nearest - hole - end;
1300 1300 }
1301 1301
1302 1302 len = (len >> stripe_shift) - offset - frag;
1303 1303 npages += len;
1304 1304 }
1305 1305 }
1306 1306
1307 1307 *npages_out = npages;
1308 1308 mpo_rd_unlock();
1309 1309 }
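
A small worked example of the stripe geometry used above, with assumed numbers that do not come from the file: with max_locality_groups = 4 and a stripe width of mnode_pages = 0x1000 pfns, the stride is mnode_stride = 4 * 0x1000 = 0x4000 pfns, and the hole between consecutive stripes of one mnode is mnode_stride - mnode_pages = 0x3000 pfns. A test range that exactly covers two strides of mnode 0 therefore intersects it in 2 * 0x1000 = 0x2000 pages.
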
1310 1310
1311 1311 /*
1312 1312 * valid_pages()
1313 1313 *
1314 1314 * Return 1 if pages are valid and do not cross mnode boundaries
1315 1315 * (which would break page free list assumptions), and 0 otherwise.
1316 1316 */
1317 1317
1318 1318 #define MNODE(pa) \
1319 1319 ((btop(pa) & home_mask_pfn) >> home_mask_pfn_shift)
1320 1320
1321 1321 static int
1322 1322 valid_pages(md_t *md, mde_cookie_t cpu0)
1323 1323 {
1324 1324 int i, max_szc;
1325 1325 uint64_t last_page_base, szc_mask;
1326 1326 uint64_t max_page_len, max_coalesce_len;
1327 1327 struct mblock_md *mb = mpo_mblock;
1328 1328
1329 1329 /*
1330 1330 * Find the smaller of the largest page possible and supported.
1331 1331 * mmu_exported_pagesize_mask is not yet initialized, so read
1332 1332 * it from the MD. Apply minimal fixups in case of broken MDs
1333 1333 * to get a sane mask.
1334 1334 */
1335 1335
1336 1336 if (cpu0 == NULL)
1337 1337 szc_mask = szc_mask0;
1338 1338 else {
1339 1339 if (md_get_prop_val(md, cpu0, "mmu-page-size-list", &szc_mask))
1340 1340 szc_mask = 0;
1341 1341 /* largest in sun4v default support */
1342 1342 szc_mask |= (1 << TTE4M);
1343 1343 szc_mask0 = szc_mask;
1344 1344 }
1345 1345 max_szc = highbit(szc_mask) - 1;
1346 1346 if (max_szc > TTE256M)
1347 1347 max_szc = TTE256M;
1348 1348 max_page_len = TTEBYTES(max_szc);
1349 1349
1350 1350 /*
1351 1351 * Page coalescing code coalesces all sizes up to 256M on sun4v, even
1352 1352 * if mmu-page-size-list does not contain it, so 256M pages must fall
1353 1353 * within one mnode to use MPO.
1354 1354 */
1355 1355 max_coalesce_len = TTEBYTES(TTE256M);
1356 1356 ASSERT(max_coalesce_len >= max_page_len);
1357 1357
1358 1358 if (ptob(mnode_pages) < max_coalesce_len) {
1359 1359 MPO_STATUS("Page too large; MPO disabled: page = %lx, "
1360 1360 "mnode slice = %lx\n", max_coalesce_len, ptob(mnode_pages));
1361 1361 return (0);
1362 1362 }
1363 1363
1364 1364 for (i = 0; i < n_mblocks; i++) {
1365 1365 uint64_t base = mb->base;
1366 1366 uint64_t end = mb->base + mb->size - 1;
1367 1367 uint64_t ra_to_pa = mb->ra_to_pa;
1368 1368
1369 1369 /*
1370 1370 * If mblock is smaller than the max page size, then
1371 1371 * RA = PA mod MAXPAGE is not guaranteed, but it must
1372 1372 * not span mnodes.
1373 1373 */
1374 1374 if (mb->size < max_page_len) {
1375 1375 if (MNODE(base + ra_to_pa) != MNODE(end + ra_to_pa)) {
1376 1376 MPO_STATUS("Small mblock spans mnodes; "
1377 1377 "MPO disabled: base = %lx, end = %lx, "
1378 1378 "ra2pa = %lx\n", base, end, ra_to_pa);
1379 1379 return (0);
1380 1380 }
1381 1381 } else {
1382 1382 /* Verify RA = PA mod MAXPAGE, using coalesce size */
1383 1383 uint64_t pa_base = base + ra_to_pa;
1384 1384 if ((base & (max_coalesce_len - 1)) !=
1385 1385 (pa_base & (max_coalesce_len - 1))) {
1386 1386 MPO_STATUS("bad page alignment; MPO disabled: "
1387 1387 "ra = %lx, pa = %lx, pagelen = %lx\n",
1388 1388 base, pa_base, max_coalesce_len);
1389 1389 return (0);
1390 1390 }
1391 1391 }
1392 1392
1393 1393 /*
1394 1394 * Find start of last large page in mblock in RA space.
1395 1395 * If page extends into the next mblock, verify the
1396 1396 * mnode does not change.
1397 1397 */
1398 1398 last_page_base = P2ALIGN(end, max_coalesce_len);
1399 1399 if (i + 1 < n_mblocks &&
1400 1400 last_page_base + max_coalesce_len > mb[1].base &&
1401 1401 MNODE(last_page_base + ra_to_pa) !=
1402 1402 MNODE(mb[1].base + mb[1].ra_to_pa)) {
1403 1403 MPO_STATUS("Large page spans mblocks; MPO disabled: "
1404 1404 "end = %lx, ra2pa = %lx, base = %lx, ra2pa = %lx, "
1405 1405 "pagelen = %lx\n", end, ra_to_pa, mb[1].base,
1406 1406 mb[1].ra_to_pa, max_coalesce_len);
1407 1407 return (0);
1408 1408 }
1409 1409
1410 1410 mb++;
1411 1411 }
1412 1412 return (1);
1413 1413 }
1414 1414
1415 1415
1416 1416 /*
1417 1417 * fix_interleave() - Find lgroups with sub-page sized memory interleave,
1418 1418 * if any, and remove them. This yields a config where the "coarse
1419 1419 * grained" lgroups cover all of memory, even though part of that memory
1420 1420 * is fine grain interleaved and does not deliver a purely local memory
1421 1421 * latency.
1422 1422 *
1423 1423 * This function reads and modifies the globals:
1424 1424 * mpo_lgroup[], n_lgrpnodes
1425 1425 *
1426 1426 * Returns 1 if lgroup nodes were removed, 0 otherwise.
1427 1427 */
1428 1428
1429 1429 static int
1430 1430 fix_interleave(void)
1431 1431 {
1432 1432 int i, j;
1433 1433 uint64_t mask = 0;
1434 1434
1435 1435 j = 0;
1436 1436 for (i = 0; i < n_lgrpnodes; i++) {
1437 1437 if ((mpo_lgroup[i].addr_mask & PAGEOFFSET) != 0) {
1438 1438 /* remove this lgroup */
1439 1439 mask = mpo_lgroup[i].addr_mask;
1440 1440 } else {
1441 1441 mpo_lgroup[j++] = mpo_lgroup[i];
1442 1442 }
1443 1443 }
1444 1444 n_lgrpnodes = j;
1445 1445
1446 1446 if (mask != 0)
1447 1447 MPO_STATUS("sub-page interleave %lx found; "
1448 1448 "removing lgroup.\n", mask);
1449 1449
1450 1450 return (mask != 0);
1451 1451 }
1452 1452
1453 1453 /*
1454 1454 * mblock_alloc
1455 1455 *
1456 1456 * Allocate memory for the mblock and stripe arrays from either static or
1457 1457 * dynamic space depending on utype, and return the result in mc.
1458 1458 * Returns 0 on success and -1 on error.
1459 1459 */
1460 1460
1461 1461 static int
1462 1462 mblock_alloc(mpo_config_t *mc, update_t utype, int nmblocks)
1463 1463 {
1464 1464 mblock_md_t *mb = NULL;
1465 1465 mem_stripe_t *ms = NULL;
1466 1466 int nstripes = MAX_MEM_NODES * nmblocks;
1467 1467 size_t mblocksz = nmblocks * sizeof (struct mblock_md);
1468 1468 size_t mstripesz = nstripes * sizeof (mem_stripe_t);
1469 1469 size_t allocsz = mmu_ptob(mmu_btopr(mblocksz + mstripesz));
1470 1470
1471 1471 /*
1472 1472 * Allocate space for mblocks and mstripes.
1473 1473 *
1474 1474 * For DR allocations, just use kmem_alloc(), and set
1475 1475 * mc_alloc_sz to indicate it was used.
1476 1476 *
1477 1477 * For boot allocation:
1478 1478 * If we have a small number of mblocks we will use the space
1479 1479 * that we preallocated. Otherwise, we will dynamically
1480 1480 * allocate the space from the prom and map it to the
1481 1481 * reserved VA at MPOBUF_BASE.
1482 1482 */
1483 1483
1484 1484 if (utype == U_ADD || utype == U_DEL) {
1485 1485 mb = (struct mblock_md *)kmem_zalloc(allocsz, KM_SLEEP);
1486 1486 ms = (mem_stripe_t *)(mb + nmblocks);
1487 1487 mc->mc_alloc_sz = allocsz;
1488 1488 } else if (nmblocks <= SMALL_MBLOCKS_COUNT) {
1489 1489 mb = &small_mpo_mblocks[0];
1490 1490 ms = &small_mem_stripes[0];
1491 1491 mc->mc_alloc_sz = 0;
1492 1492 } else {
1493 1493 /* Ensure that we don't request more space than reserved */
1494 1494 if (allocsz > MPOBUF_SIZE) {
1495 1495 MPO_STATUS("mblock_alloc: Insufficient space "
1496 1496 "for mblock structures \n");
1497 1497 return (-1);
1498 1498 }
1499 1499 mb = (struct mblock_md *)
1500 1500 prom_alloc((caddr_t)MPOBUF_BASE, allocsz, PAGESIZE);
1501 1501 if (mb != (struct mblock_md *)MPOBUF_BASE) {
1502 1502 MPO_STATUS("mblock_alloc: Cannot allocate space "
1503 1503 "for mblocks \n");
1504 1504 return (-1);
1505 1505 }
1506 1506 mpo_heap32_buf = (caddr_t)MPOBUF_BASE;
1507 1507 mpo_heap32_bufsz = MPOBUF_SIZE;
1508 1508 ms = (mem_stripe_t *)(mb + nmblocks);
1509 1509 mc->mc_alloc_sz = 0;
1510 1510 }
1511 1511 mc->mc_mblocks = mb;
1512 1512 mc->mc_stripes = ms;
1513 1513 mc->mc_nmblocks = nmblocks;
1514 1514 mc->mc_nstripes = nstripes;
1515 1515 MPO_DEBUG("mblock_alloc: mblocks: %d\n", nmblocks);
1516 1516 return (0);
1517 1517 }
1518 1518
1519 1519 /*
1520 1520 * mblock_free
1521 1521 *
1522 1522 * Free memory in mc that was allocated by mblock_alloc.
1523 1523 */
1524 1524
1525 1525 static void
1526 1526 mblock_free(mpo_config_t *mc)
1527 1527 {
1528 1528 if (mc->mc_alloc_sz > 0) {
1529 1529 ASSERT(mc->mc_mblocks != mpo_mblock);
1530 1530 kmem_free((caddr_t)mc->mc_mblocks, mc->mc_alloc_sz);
1531 1531 }
1532 1532 bzero(mc, sizeof (*mc));
1533 1533 }
1534 1534
1535 1535 /*
1536 1536 * mblock_install
1537 1537 *
1538 1538 * Install mblock config passed in mc as the global configuration.
1539 1539 * May only be called at boot or while holding mpo_wr_lock.
1540 1540 */
1541 1541
1542 1542 static void
1543 1543 mblock_install(mpo_config_t *mc)
1544 1544 {
1545 1545 mpo_mblock = mc->mc_mblocks;
1546 1546 n_mblocks = mc->mc_nmblocks;
1547 1547 mem_stripes = mc->mc_stripes;
1548 1548 n_mem_stripes = mc->mc_nstripes;
1549 1549 base_ra_to_pa_pfn = btop(mc->mc_mblocks[0].ra_to_pa);
1550 1550 mpo_config = *mc;
1551 1551 }
1552 1552
1553 1553 /*
1554 1554 * mblock_update
1555 1555 *
1556 1556 * Traverse mblocknodes, read the mblock properties from the MD, and
1557 1557 * save the mblocks in mc.
1558 1558 */
1559 1559
1560 1560 static void
1561 1561 mblock_update(mpo_config_t *mc, md_t md, mde_cookie_t *mblocknodes)
1562 1562 {
1563 1563 uint64_t i, j;
1564 1564 int result = 0;
1565 1565 mblock_md_t *mblock = mc->mc_mblocks;
1566 1566
1567 1567 for (i = 0, j = 0; j < mc->mc_nmblocks; j++) {
1568 1568
1569 1569 /* Without a base or size value we will fail */
1570 1570 result = get_int(md, mblocknodes[j], PROP_LG_BASE,
1571 1571 &mblock[i].base);
1572 1572 if (result < 0) {
1573 1573 MPO_STATUS("mblock_update: "
1574 1574 "PROP_LG_BASE is missing\n");
1575 1575 mc->mc_nmblocks = 0;
1576 1576 return;
1577 1577 }
1578 1578
1579 1579 result = get_int(md, mblocknodes[j], PROP_LG_SIZE,
1580 1580 &mblock[i].size);
1581 1581 if (result < 0) {
1582 1582 MPO_STATUS("mblock_update: "
1583 1583 "PROP_LG_SIZE is missing\n");
1584 1584 mc->mc_nmblocks = 0;
1585 1585 return;
1586 1586 }
1587 1587
1588 1588 result = get_int(md, mblocknodes[j],
1589 1589 PROP_LG_RA_PA_OFFSET, &mblock[i].ra_to_pa);
1590 1590
1591 1591 /* If we don't have an ra_pa_offset, just set it to 0 */
1592 1592 if (result < 0)
1593 1593 mblock[i].ra_to_pa = 0;
1594 1594
1595 1595 MPO_DEBUG("mblock[%ld]: base = %lx, size = %lx, "
1596 1596 "ra_to_pa = %lx\n", i,
1597 1597 mblock[i].base,
1598 1598 mblock[i].size,
1599 1599 mblock[i].ra_to_pa);
1600 1600
1601 1601 /* check for unsupportable values of base and size */
1602 1602 if (mblock[i].base > mblock[i].base + mblock[i].size) {
1603 1603 MPO_STATUS("mblock_update: "
1604 1604 "PROP_LG_BASE+PROP_LG_SIZE is invalid: "
1605 1605 "base = %lx, size = %lx\n",
1606 1606 mblock[i].base, mblock[i].size);
1607 1607 mc->mc_nmblocks = 0;
1608 1608 return;
1609 1609 }
1610 1610
1611 1611 /* eliminate size==0 blocks */
1612 1612 if (mblock[i].size != 0) {
1613 1613 uint64_t base = mblock[i].base;
1614 1614 uint64_t end = base + mblock[i].size;
1615 1615 ASSERT(end > base);
1616 1616 mblock[i].base_pfn = btop(base);
1617 1617 mblock[i].end_pfn = btop(end - 1);
1618 1618 i++;
1619 1619 }
1620 1620 }
1621 1621
1622 1622 if (i == 0) {
1623 1623 MPO_STATUS("mblock_update: "
1624 1624 "No non-empty mblock nodes were found "
1625 1625 "in the Machine Descriptor\n");
1626 1626 mc->mc_nmblocks = 0;
1627 1627 return;
1628 1628 }
1629 1629 ASSERT(i <= mc->mc_nmblocks);
1630 1630 mc->mc_nmblocks = i;
1631 1631
1632 1632 /* Must sort mblocks by address for mem_node_iterator_init() */
1633 1633 mblock_sort(mblock, mc->mc_nmblocks);
1634 1634 }
1635 1635
1636 1636 /*
1637 1637 * mblock_update_add
1638 1638 *
1639 1639 * Update mblock config after a memory DR add. The added range is not
1640 1640 * needed, as we read *all* mblock nodes from the MD. Save the mblocks
1641 1641 * in mc.
1642 1642 */
1643 1643
1644 1644 static void
1645 1645 mblock_update_add(mpo_config_t *mc)
1646 1646 {
1647 1647 md_t *md;
1648 1648 mde_cookie_t root, *mblocknodes;
1649 1649 int nmblocks = 0;
1650 1650
1651 1651 if ((md = md_get_handle()) == NULL) {
1652 1652 MPO_STATUS("Cannot access Machine Descriptor\n");
1653 1653 goto error;
1654 1654 }
1655 1655
1656 1656 if ((root = md_get_root(md)) == MDE_INVAL_ELEM_COOKIE)
1657 1657 goto error;
1658 1658
1659 1659 nmblocks = md_alloc_scan_dag(md, root, PROP_LG_MBLOCK, "fwd",
1660 1660 &mblocknodes);
1661 1661 if (nmblocks <= 0) {
1662 1662 MPO_STATUS("No mblock nodes detected in Machine Descriptor\n");
1663 1663 goto error;
1664 1664 }
1665 1665
1666 1666 if (mblock_alloc(mc, U_ADD, nmblocks) < 0)
1667 1667 goto error;
1668 1668
1669 1669 mblock_update(mc, md, mblocknodes);
1670 1670 md_free_scan_dag(md, &mblocknodes);
1671 1671 (void) md_fini_handle(md);
1672 1672 return;
1673 1673 error:
1674 1674 panic("mblock_update_add: cannot process mblocks from MD.\n");
1675 1675 }
1676 1676
1677 1677 /*
1678 1678 * mblock_update_del
1679 1679 *
1680 1680 * Update mblocks after a memory DR deletion of the range (ubase, uend).
1681 1681 * Allocate a new mblock config, copy old config to the new, modify the new
1682 1682 * mblocks to reflect the deletion. The new mblocks are returned in
1683 1683 * mc_new and are not yet installed as the active config.
1684 1684 */
1685 1685
1686 1686 static void
1687 1687 mblock_update_del(mpo_config_t *mc_new, mpo_config_t *mc_old, pfn_t ubase,
1688 1688 pfn_t uend)
1689 1689 {
1690 1690 int i, j;
1691 1691 pfn_t base, end;
1692 1692 mblock_md_t *mblock;
1693 1693 int nmblocks = mc_old->mc_nmblocks;
1694 1694
1695 1695 MPO_DEBUG("mblock_update_del(0x%lx, 0x%lx)\n", ubase, uend);
1696 1696
1697 1697 /*
1698 1698 * Allocate mblocks in mc_new and copy the old to the new.
1699 1699 * Allocate one extra in case the deletion splits an mblock.
1700 1700 */
1701 1701 if (mblock_alloc(mc_new, U_DEL, nmblocks + 1) < 0)
1702 1702 return;
1703 1703 mblock = mc_new->mc_mblocks;
1704 1704 bcopy(mc_old->mc_mblocks, mblock, nmblocks * sizeof (mblock_md_t));
1705 1705
1706 1706 /*
1707 1707 * Find the mblock containing the deleted range and adjust it in
1708 1708 * the new config.
1709 1709 */
1710 1710 for (i = 0; i < nmblocks; i++) {
1711 1711
1712 1712 base = btop(mblock[i].base);
1713 1713 end = base + btop(mblock[i].size) - 1;
1714 1714
1715 1715 /*
1716 1716 * Adjust the mblock based on the subset that was deleted.
1717 1717 *
1718 1718 * If the entire mblk was deleted, compact the table.
1719 1719 *
1720 1720 * If the middle of the mblk was deleted, extend
1721 1721 * the table. Space for the new slot was already
1722 1722 * allocated.
1723 1723 *
1724 1724 * The memory to be deleted is an entire mblock or a
1725 1725 * subset of one, and does not span multiple mblocks.
1726 1726 */
1727 1727 if (base == ubase && end == uend) {
1728 1728 for (j = i; j < nmblocks - 1; j++)
1729 1729 mblock[j] = mblock[j + 1];
1730 1730 nmblocks--;
1731 1731 bzero(&mblock[nmblocks], sizeof (*mblock));
1732 1732 break;
1733 1733 } else if (base < ubase && end > uend) {
1734 1734 for (j = nmblocks - 1; j >= i; j--)
1735 1735 mblock[j + 1] = mblock[j];
1736 1736 mblock[i].size = ptob(ubase - base);
1737 1737 mblock[i].end_pfn = ubase - 1;
1738 1738 mblock[i + 1].base = ptob(uend + 1);
1739 1739 mblock[i + 1].size = ptob(end - uend);
1740 1740 mblock[i + 1].base_pfn = uend + 1;
1741 1741 nmblocks++;
1742 1742 break;
1743 1743 } else if (base == ubase) {
1744 1744 MPO_DEBUG("mblock_update_del: shrink>"
1745 1745 " i=%d base=0x%lx end=0x%lx", i, base, end);
1746 1746 mblock[i].base = ptob(uend + 1);
1747 1747 mblock[i].size -= ptob(uend - ubase + 1);
1748 1748 base = uend + 1;
1749 1749 mblock[i].base_pfn = base;
1750 1750 mblock[i].end_pfn = end;
1751 1751 MPO_DEBUG(" nbase=0x%lx nend=0x%lx\n", base, end);
1752 1752 break;
1753 1753 } else if (end == uend) {
1754 1754 MPO_DEBUG("mblock_update_del: shrink<"
1755 1755 " i=%d base=0x%lx end=0x%lx", i, base, end);
1756 1756 mblock[i].size -= ptob(uend - ubase + 1);
1757 1757 end = ubase - 1;
1758 1758 mblock[i].base_pfn = base;
1759 1759 mblock[i].end_pfn = end;
1760 1760 MPO_DEBUG(" nbase=0x%lx nend=0x%lx\n", base, end);
1761 1761 break;
1762 1762 }
1763 1763 }
1764 1764 mc_new->mc_nmblocks = nmblocks;
1765 1765 ASSERT(end > base);
1766 1766 }
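
/*
 * Illustrative example of the split case above (values assumed for
 * clarity, 8K pages): deleting pfns [0x48000, 0x4ffff] from an mblock
 * with base = 0x80000000, size = 0x40000000 (pfns [0x40000, 0x5ffff])
 * leaves two mblocks in the new config:
 *
 *	mblock[i]:     base = 0x80000000, size = 0x10000000,
 *	               base_pfn = 0x40000, end_pfn = 0x47fff
 *	mblock[i + 1]: base = 0xa0000000, size = 0x20000000,
 *	               base_pfn = 0x50000, end_pfn = 0x5ffff
 *
 * mblock[i + 1].end_pfn is inherited from the copied original entry and
 * remains correct after the split.
 */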
1767 1767
1768 1768 /*
1769 1769 * mstripe_update
1770 1770 *
1771 1771 * Read mblocks from mc and update mstripes in mc
1772 1772 */
1773 1773
1774 1774 static void
1775 1775 mstripe_update(mpo_config_t *mc)
1776 1776 {
1777 1777 lgrp_handle_t lgrphand, lgrp_start;
1778 1778 int i, mnode;
1779 1779 uint64_t offset, stripe_end, base, end, ra_to_pa, stride;
1780 1780 uint64_t stripe, frag, remove;
1781 1781 mem_stripe_t *ms;
1782 1782 mblock_md_t *mblock = mc->mc_mblocks;
1783 1783 int nmblocks = mc->mc_nmblocks;
1784 1784 int mstripesz = MAX_MEM_NODES * nmblocks * sizeof (mem_stripe_t);
1785 1785
1786 1786 /* Check for non-MPO sun4v platforms or memory DR removal */
1787 1787 if (n_locality_groups <= 1) {
1788 1788 ASSERT(n_locality_groups == 1);
1789 1789 ASSERT(max_locality_groups == 1 && max_mem_nodes == 1);
1790 1790
1791 1791 if (nmblocks == 1) {
1792 1792 mc->mc_nstripes = 0;
1793 1793 } else {
1794 1794 mc->mc_nstripes = nmblocks;
1795 1795 bzero(mc->mc_stripes, mstripesz);
1796 1796 for (i = 0; i < nmblocks; i++) {
1797 1797 mc->mc_stripes[i].exists = 1;
1798 1798 mc->mc_stripes[i].physbase = mblock[i].base_pfn;
1799 1799 mc->mc_stripes[i].physmax = mblock[i].end_pfn;
1800 1800 }
1801 1801 }
1802 1802 return;
1803 1803 }
1804 1804
1805 1805 bzero(mc->mc_stripes, mstripesz);
1806 1806 mc->mc_nstripes = max_locality_groups * nmblocks;
1807 1807 stripe = ptob(mnode_pages);
1808 1808 stride = max_locality_groups * stripe;
1809 1809
1810 1810 for (i = 0; i < nmblocks; i++) {
1811 1811 base = mblock[i].base;
1812 1812 end = base + mblock[i].size;
1813 1813 ra_to_pa = mblock[i].ra_to_pa;
1814 1814
1815 1815 /* Find the offset from the prev stripe boundary in PA space. */
1816 1816 offset = (base + ra_to_pa) & (stripe - 1);
1817 1817
1818 1818 /* Set the next stripe boundary. */
1819 1819 stripe_end = base - offset + stripe;
1820 1820
1821 1821 lgrp_start = (((base + ra_to_pa) & home_mask) >>
1822 1822 home_mask_shift);
1823 1823 lgrphand = lgrp_start;
1824 1824
1825 1825 /*
1826 1826 * Loop over all lgroups covered by the mblock, creating a
1827 1827 * stripe for each. Stop when lgrp_start is visited again.
1828 1828 */
1829 1829 do {
1830 1830 /* mblock may not span all lgroups */
1831 1831 if (base >= end)
1832 1832 break;
1833 1833
1834 1834 mnode = lgrphand;
1835 1835 ASSERT(mnode < max_mem_nodes);
1836 1836
1837 1837 /*
1838 1838 * Calculate the size of the fragment that does not
1839 1839 * belong to the mnode in the last partial stride.
1840 1840 */
1841 1841 frag = (end - (base - offset)) & (stride - 1);
1842 1842 if (frag == 0) {
1843 1843 /* remove the gap */
1844 1844 remove = stride - stripe;
1845 1845 } else if (frag < stripe) {
1846 1846 /* fragment fits in stripe; keep it all */
1847 1847 remove = 0;
1848 1848 } else {
1849 1849 /* fragment is large; trim after whole stripe */
1850 1850 remove = frag - stripe;
1851 1851 }
1852 1852
1853 1853 ms = &mc->mc_stripes[i * max_locality_groups + mnode];
1854 1854 ms->physbase = btop(base);
1855 1855 ms->physmax = btop(end - 1 - remove);
1856 1856 ms->offset = btop(offset);
1857 1857 ms->exists = 1;
1858 1858
1859 1859 base = stripe_end;
1860 1860 stripe_end += stripe;
1861 1861 offset = 0;
1862 1862 lgrphand = (((base + ra_to_pa) & home_mask) >>
1863 1863 home_mask_shift);
1864 1864 } while (lgrphand != lgrp_start);
1865 1865 }
1866 1866 }
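
/*
 * Illustrative example of the striping above (values assumed for
 * clarity, 8K pages): with max_locality_groups = 2, stripe = 0x10000000
 * (256MB), stride = 0x20000000, home_mask = 0x10000000,
 * home_mask_shift = 28, and a single mblock with base = 0x80000000,
 * size = 0x30000000, ra_to_pa = 0:
 *
 *	mnode 0 owns [0x80000000, 0x8fffffff] and [0xa0000000, 0xafffffff]
 *	    mem_stripe: physbase = 0x40000, physmax = 0x57fff, offset = 0
 *	mnode 1 owns [0x90000000, 0x9fffffff]
 *	    mem_stripe: physbase = 0x48000, physmax = 0x4ffff, offset = 0
 *
 * Each mem_stripe_t records only the first and last pfn the mnode owns
 * within the mblock; the interleaving inside that range is implied by
 * stripe and stride.
 */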
1867 1867
1868 1868 #define INTERSECT(a, b, c, d) \
1869 1869 if (((a) >= (c) && (a) <= (d)) || \
1870 1870 ((c) >= (a) && (c) <= (b))) { \
1871 1871 (c) = MAX((a), (c)); \
1872 1872 (d) = MIN((b), (d)); \
1873 1873 } else { \
1874 1874 ASSERT((a) >= (d) || (b) <= (c)); \
1875 1875 continue; \
1876 1876 } \
1877 1877
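/*
 * INTERSECT(a, b, c, d) clamps the range [c, d] to its overlap with
 * [a, b]; if the two ranges are disjoint it executes "continue", so the
 * macro may only be used inside a loop, as in mnode_update() below.
 */
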
1878 1878 /*
1879 1879 * mnode_update
1880 1880 *
1881 1881 * Read stripes from mc and update mnode extents. The mnode extents are
1882 1882 * part of the live configuration, so this can only be done at boot time
1883 1883 * or while holding the mpo_wr_lock.
1884 1884 */
1885 1885
1886 1886 static void
1887 1887 mnode_update(mpo_config_t *mc, pfn_t ubase, pfn_t uend, update_t utype)
1888 1888 {
1889 1889 int i, j, mnode, found;
1890 1890 pfn_t base, end;
1891 1891 mem_stripe_t *ms;
1892 1892
1893 1893 MPO_DEBUG("mnode_update: basepfn: %lx endpfn: %lx\n", ubase, uend);
1894 1894
1895 1895 if (n_locality_groups <= 1 && mc->mc_nmblocks == 1) {
1896 1896 if (utype == U_ADD)
1897 1897 mpo_mem_node_add_slice(ubase, uend);
1898 1898 else if (utype == U_DEL)
1899 1899 mpo_mem_node_del_slice(ubase, uend);
1900 1900 else
1901 1901 panic("mnode update: %d: invalid\n", utype);
1902 1902 return;
1903 1903 }
1904 1904
1905 1905 found = 0;
1906 1906 for (i = 0; i < mc->mc_nmblocks; i++) {
1907 1907 for (mnode = 0; mnode < max_locality_groups; mnode++) {
1908 1908
1909 1909 j = i * max_locality_groups + mnode;
1910 1910 ms = &mc->mc_stripes[j];
1911 1911 if (!ms->exists)
1912 1912 continue;
1913 1913
1914 1914 base = ms->physbase;
1915 1915 end = ms->physmax;
1916 1916
1917 1917 /*
1918 1918 * Look for the mstripes intersecting this slice.
1919 1919 *
1920 1920 * The mstripe and slice pairs may not be equal
1921 1921 * if a subset of an mblock is added or deleted.
1922 1922 */
1923 1923 switch (utype) {
1924 1924 case U_ADD:
1925 1925 INTERSECT(ubase, uend, base, end);
1926 1926 /*FALLTHROUGH*/
1927 1927 case U_ADD_ALL:
1928 1928 if (n_locality_groups > 1)
1929 1929 mpo_plat_assign_lgrphand_to_mem_node(
1930 1930 mnode, mnode);
1931 1931 mpo_mem_node_add_slice(base, end);
1932 1932 break;
1933 1933 case U_DEL:
1934 1934 INTERSECT(ubase, uend, base, end);
1935 1935 mpo_mem_node_del_slice(base, end);
1936 1936 break;
1937 1937 default:
1938 1938 panic("mnode_update: %d: invalid\n", utype);
1939 1939 break;
1940 1940 }
1941 1941
1942 1942 found++;
1943 1943 }
1944 1944 }
1945 1945
1946 1946 if (!found)
1947 1947 panic("mnode_update: mstripe not found");
1948 1948
1949 1949 #ifdef DEBUG
1950 1950 if (utype == U_ADD_ALL || utype == U_DEL)
1951 1951 return;
1952 1952 found = 0;
1953 1953 for (i = 0; i < max_mem_nodes; i++) {
1954 1954 if (!mem_node_config[i].exists)
1955 1955 continue;
1956 1956 if (ubase >= mem_node_config[i].physbase &&
1957 1957 ubase <= mem_node_config[i].physmax)
1958 1958 found |= 1;
1959 1959 if (uend >= mem_node_config[i].physbase &&
1960 1960 uend <= mem_node_config[i].physmax)
1961 1961 found |= 2;
1962 1962 }
1963 1963 ASSERT(found == 3);
1964 1964 {
1965 1965 pfn_t minpfn, maxpfn;
1966 1966
1967 1967 mem_node_max_range(&minpfn, &maxpfn);
1968 1968 ASSERT(minpfn <= ubase);
1969 1969 ASSERT(maxpfn >= uend);
1970 1970 }
1971 1971 #endif
1972 1972 }
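
/*
 * Illustrative example (values assumed for clarity, matching the
 * striping example above): a DR add of pfns [0x44000, 0x47fff] falls
 * entirely inside the mnode-0 stripe [0x40000, 0x57fff].  INTERSECT
 * clamps that stripe to [0x44000, 0x47fff] and mpo_mem_node_add_slice()
 * is called with the clamped range for mnode 0; the mnode-1 stripe
 * [0x48000, 0x4ffff] does not overlap the added range, so its loop
 * iteration is skipped by the "continue" in INTERSECT.
 */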
1973 1973
1974 1974 /*
1975 1975 * Plat_slice_add()/plat_slice_del() are the platform hooks
1976 1976 * for adding/deleting a pfn range to/from the system.
1977 1977 *
1978 1978 * plat_slice_add() is used for both the boot and DR cases.
1979 1979 *
1980 1980 * - Zeus has already added the mblocks to the MD, so read the updated
1981 1981 * MD and allocate all data structures required to manage the new memory
1982 1982 * configuration.
1983 1983 *
1984 1984 * - Recompute the stripes which are derived from the mblocks.
1985 1985 *
1986 1986 * - Update (expand) the mnode extents and install the modified mblocks as
1987 1987 * the new mpo config. This must be done while holding the mpo_wr_lock
1988 1988 * to guarantee that no other threads access the mpo meta-data.
1989 1989 *
1990 1990 * - Unlock MPO data structures; the new config is live. Free the old config.
1991 1991 *
1992 1992 * Plat_slice_del() is used for DR only.
1993 1993 *
1994 1994 * - Zeus has not yet modified the MD to reflect the deletion, so copy
1995 1995 * the old mpo mblocks and delete the range from the copy.
1996 1996 *
1997 1997 * - Recompute the stripes which are derived from the mblocks.
1998 1998 *
1999 1999 * - Update (shrink) the mnode extents and install the modified mblocks as
2000 2000 * the new mpo config. This must be done while holding the mpo_wr_lock
2001 2001 * to guarantee that no other threads access the mpo meta-data.
2002 2002 *
2003 2003 * - Unlock MPO data structures; the new config is live. Free the old config.
2004 2004 */
2005 2005
2006 2006 void
2007 2007 plat_slice_add(pfn_t base, pfn_t end)
2008 2008 {
2009 2009 mpo_config_t old_config = mpo_config;
2010 2010 mpo_config_t new_config;
2011 2011
2012 2012 VALIDATE_SLICE(base, end);
2013 2013 mblock_update_add(&new_config);
2014 2014 mstripe_update(&new_config);
2015 2015 mpo_wr_lock();
2016 2016 mblock_install(&new_config);
2017 2017 /* Use new config to add all ranges for mnode_update */
2018 2018 mnode_update(&new_config, base, end, U_ADD);
2019 2019 mpo_genid++;
2020 2020 mpo_wr_unlock();
2021 2021 mblock_free(&old_config);
2022 2022 }
2023 2023
2024 2024 void
2025 2025 plat_slice_del(pfn_t base, pfn_t end)
2026 2026 {
2027 2027 mpo_config_t old_config = mpo_config;
2028 2028 mpo_config_t new_config;
2029 2029
2030 2030 VALIDATE_SLICE(base, end);
2031 2031 mblock_update_del(&new_config, &old_config, base, end);
2032 2032 mstripe_update(&new_config);
2033 2033 mpo_wr_lock();
2034 2034 /* Use old config to find deleted range for mnode_update */
2035 2035 mnode_update(&old_config, base, end, U_DEL);
2036 2036 mblock_install(&new_config);
2037 2037 mpo_genid++;
2038 2038 mpo_wr_unlock();
2039 2039 mblock_free(&old_config);
2040 2040 }