XXXX-pass-in-cpu_pause_func-via-pause_cpus Wdiff usr/src/uts/common/os/mem_config.c

Print this page

XXXX pass in cpu_pause_func via pause_cpus

Split	Close
Expand all
Collapse all

          --- old/usr/src/uts/common/os/mem_config.c
          +++ new/usr/src/uts/common/os/mem_config.c

   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  23   23   * Use is subject to license terms.
  24   24   */
  25   25  
  26   26  #include <sys/types.h>
  27   27  #include <sys/cmn_err.h>
  28   28  #include <sys/vmem.h>
  29   29  #include <sys/kmem.h>
  30   30  #include <sys/systm.h>
  31   31  #include <sys/machsystm.h>      /* for page_freelist_coalesce() */
  32   32  #include <sys/errno.h>
  33   33  #include <sys/memnode.h>
  34   34  #include <sys/memlist.h>
  35   35  #include <sys/memlist_impl.h>
  36   36  #include <sys/tuneable.h>
  37   37  #include <sys/proc.h>
  38   38  #include <sys/disp.h>
  39   39  #include <sys/debug.h>
  40   40  #include <sys/vm.h>
  41   41  #include <sys/callb.h>
  42   42  #include <sys/memlist_plat.h>   /* for installed_top_size() */
  43   43  #include <sys/condvar_impl.h>   /* for CV_HAS_WAITERS() */
  44   44  #include <sys/dumphdr.h>        /* for dump_resize() */
  45   45  #include <sys/atomic.h>         /* for use in stats collection */
  46   46  #include <sys/rwlock.h>
  47   47  #include <sys/cpuvar.h>
  48   48  #include <vm/seg_kmem.h>
  49   49  #include <vm/seg_kpm.h>
  50   50  #include <vm/page.h>
  51   51  #include <vm/vm_dep.h>
  52   52  #define SUNDDI_IMPL             /* so sunddi.h will not redefine splx() et al */
  53   53  #include <sys/sunddi.h>
  54   54  #include <sys/mem_config.h>
  55   55  #include <sys/mem_cage.h>
  56   56  #include <sys/lgrp.h>
  57   57  #include <sys/ddi.h>
  58   58  #include <sys/modctl.h>
  59   59  
           /*
            * phys_avail is extended by kphysm_add_memory_dynamic() while holding
            * the memlist write lock.
            */
  60   60  extern struct memlist *phys_avail;
  61   61  
           /*
            * Page counter helpers.  page_ctrs_cleanup() is called on the error
            * path of kphysm_add_memory_dynamic() when PAGE_CTRS_ADJUST() fails.
            */
  62   62  extern uint_t page_ctrs_adjust(int);
  63   63  void page_ctrs_cleanup(void);
           /* kphysm_setup_post_add() is called at the end of a successful add. */
  64   64  static void kphysm_setup_post_add(pgcnt_t);
  65   65  static int kphysm_setup_pre_del(pgcnt_t);
  66   66  static void kphysm_setup_post_del(pgcnt_t, int);
  67   67  
  68   68  static int kphysm_split_memseg(pfn_t base, pgcnt_t npgs);
  69   69  
           /*
            * A span is reserved on the delete list for the duration of an add to
            * prevent interactions; delspan_reserve() returns zero on failure.
            */
  70   70  static int delspan_reserve(pfn_t, pgcnt_t);
  71   71  static void delspan_unreserve(pfn_t, pgcnt_t);
  72   72  
           /*
            * memseg_lists_lock is held by memseg_reuse() while it searches and
            * unlinks entries from memseg_va_avail, a list of memsegs linked
            * through their lnext field that are available for re-use.
            * NOTE(review): the roles of memseg_delete_junk and memseg_edit_junk
            * are not visible in this part of the file.
            */
  73   73  kmutex_t memseg_lists_lock;
  74   74  struct memseg *memseg_va_avail;
  75   75  struct memseg *memseg_alloc(void);
  76   76  static struct memseg *memseg_delete_junk;
  77   77  static struct memseg *memseg_edit_junk;
  78   78  void memseg_remap_init(void);
  79   79  static void memseg_remap_to_dummy(struct memseg *);
           /* Rollback helper for the failure paths of kphysm_add_memory_dynamic(). */
  80   80  static void kphysm_addmem_error_undospan(pfn_t, pgcnt_t);
  81   81  static struct memseg *memseg_reuse(pgcnt_t);
  82   82  
           /* NOTE(review): presumably backs memseg_alloc(); not used in view. */
  83   83  static struct kmem_cache *memseg_cache;
  84   84  
  85   85  /*
  86   86   * Interfaces to manage externally allocated
  87   87   * page_t memory (metadata) for a memseg.
            *
            * The four functions are declared weak.  In kphysm_add_memory_dynamic()
            * memseg_alloc_meta() is only called when meta_alloc_enable is non-zero,
            * and memseg_free_meta()/memseg_get_metapfn() only after that call
            * returned KPHYSM_OK.
  88   88   */
  89   89  #pragma weak    memseg_alloc_meta
  90   90  #pragma weak    memseg_free_meta
  91   91  #pragma weak    memseg_get_metapfn
  92   92  #pragma weak    memseg_remap_meta
  93   93  
  94   94  extern int ppvm_enable;
           /*
            * page_t table indexed by pfn; used to derive the page_t base address
            * of a memseg whose metadata was allocated externally.
            */
  95   95  extern page_t *ppvm_base;
  96   96  extern int memseg_alloc_meta(pfn_t, pgcnt_t, void **, pgcnt_t *);
  97   97  extern void memseg_free_meta(void *, pgcnt_t);
  98   98  extern pfn_t memseg_get_metapfn(void *, pgcnt_t);
  99   99  extern void memseg_remap_meta(struct memseg *);
 100  100  static int memseg_is_dynamic(struct memseg *);
 101  101  static int memseg_includes_meta(struct memseg *);
 102  102  pfn_t memseg_get_start(struct memseg *);
           /*
            * Called before the page pointer of a re-used memseg is changed, to
            * invalidate memseg pointers in all cpu vm caches.
            */
 103  103  static void memseg_cpu_vm_flush(void);
 104  104  
           /*
            * Non-zero: kphysm_add_memory_dynamic() first tries to obtain the page_t
            * metadata through memseg_alloc_meta() before falling back to carving
            * it out of the incoming memory.
            */
 105  105  int meta_alloc_enable;
 106  106  
 107  107  #ifdef  DEBUG
 108  108  static int memseg_debug;
 109  109  #define MEMSEG_DEBUG(args...) if (memseg_debug) printf(args)
 110  110  #else
 111  111  #define MEMSEG_DEBUG(...)
 112  112  #endif
 113  113  
 114  114  /*
 115  115   * Add a chunk of memory to the system.
 116  116   * base: starting PAGESIZE page of new memory.
 117  117   * npgs: length in PAGESIZE pages.
 118  118   *
 119  119   * Adding mem this way doesn't increase the size of the hash tables;
 120  120   * growing them would be too hard.  This should be OK, but adding memory
 121  121   * dynamically most likely means more hash misses, since the tables will
 122  122   * be smaller than they otherwise would be.
            *
            * Returns:
            *   KPHYSM_OK          the span was added.
            *   KPHYSM_ESPAN       the span could not be reserved on the delete
            *                      list, or memlist_add_span() on phys_install
            *                      returned MEML_SPANOP_ESPAN.
            *   KPHYSM_ERESOURCE   any other memlist_add_span() failure, the span
            *                      is too small to hold its own metadata, no VA
            *                      could be allocated for the page_ts, or the page
            *                      counter adjustment failed.
            *   KPHYSM_ENOTVIABLE  kpm is enabled and the span violates the
            *                      kpm_pgsz size/alignment checks.
            *   KPHYSM_EFAULT      the mapped page_t array could not be accessed.
            *
            * Every failure return first drops the delete-list reservation and,
            * once the span has been added to phys_install, removes it again.
 123  123   */
 124  124  int
 125  125  kphysm_add_memory_dynamic(pfn_t base, pgcnt_t npgs)
 126  126  {
 127  127          page_t *pp;
 128  128          page_t          *opp, *oepp, *segpp;
 129  129          struct memseg   *seg;
 130  130          uint64_t        avmem;
 131  131          pfn_t           pfn;
 132  132          pfn_t           pt_base = base;
 133  133          pgcnt_t         tpgs = npgs;
 134  134          pgcnt_t         metapgs = 0;
 135  135          int             exhausted;
 136  136          pfn_t           pnum;
 137  137          int             mnode;
 138  138          caddr_t         vaddr;
 139  139          int             reuse;
 140  140          int             mlret;
 141  141          int             rv;
 142  142          int             flags;
 143  143          int             meta_alloc = 0;
 144  144          void            *mapva;
 145  145          void            *metabase = (void *)base;
 146  146          pgcnt_t         nkpmpgs = 0;
 147  147          offset_t        kpm_pages_off = 0;      /* set on kpm path only */
 148  148  
 149  149          cmn_err(CE_CONT,
 150  150              "?kphysm_add_memory_dynamic: adding %ldK at 0x%" PRIx64 "\n",
 151  151              npgs << (PAGESHIFT - 10), (uint64_t)base << PAGESHIFT);
 152  152  
 153  153          /*
 154  154           * Add this span in the delete list to prevent interactions.
 155  155           */
 156  156          if (!delspan_reserve(base, npgs)) {
 157  157                  return (KPHYSM_ESPAN);
 158  158          }
 159  159          /*
 160  160           * Check to see if any of the memory span has been added
 161  161           * by trying an add to the installed memory list. This
 162  162           * forms the interlocking process for add.
 163  163           */
 164  164  
 165  165          memlist_write_lock();
 166  166  
 167  167          mlret = memlist_add_span((uint64_t)(pt_base) << PAGESHIFT,
 168  168              (uint64_t)(tpgs) << PAGESHIFT, &phys_install);
 169  169  
 170  170          if (mlret == MEML_SPANOP_OK)
 171  171                  installed_top_size(phys_install, &physmax, &physinstalled);
 172  172  
 173  173          memlist_write_unlock();
 174  174  
 175  175          if (mlret != MEML_SPANOP_OK) {
 176  176                  /*
 177  177                   * Every failure drops the delete-list reservation taken
 178  178                   * above.  Only MEML_SPANOP_ESPAN maps to KPHYSM_ESPAN;
 179  179                   * MEML_SPANOP_EALLOC and any other value map to
 180  180                   * KPHYSM_ERESOURCE.
 181  181                   */
 182  182                  delspan_unreserve(pt_base, tpgs);
 183  183                  if (mlret == MEML_SPANOP_ESPAN)
 184  184                          return (KPHYSM_ESPAN);
 185  185                  return (KPHYSM_ERESOURCE);
 186  186          }
 187  187  
 188  188          if (meta_alloc_enable) {
 189  189                  /*
 190  190                   * Allocate the page_t's from existing memory;
 191  191                   * if that fails, allocate from the incoming memory.
 192  192                   */
 193  193                  rv = memseg_alloc_meta(base, npgs, &metabase, &metapgs);
 194  194                  if (rv == KPHYSM_OK) {
 195  195                          ASSERT(metapgs);
 196  196                          ASSERT(btopr(npgs * sizeof (page_t)) <= metapgs);
 197  197                          meta_alloc = 1;
 198  198                          goto mapalloc;
 199  199                  }
 200  200          }
 201  201  
 202  202          /*
 203  203           * We store the page_t's for this new memory in the first
 204  204           * few pages of the chunk. Here, we go and get'em ...
 205  205           */
 206  206  
 207  207          /*
 208  208           * The expression after the '-' gives the number of pages
 209  209           * that will fit in the new memory based on a requirement
 210  210           * of (PAGESIZE + sizeof (page_t)) bytes per page.
 211  211           */
 212  212          metapgs = npgs - (((uint64_t)(npgs) << PAGESHIFT) /
 213  213              (PAGESIZE + sizeof (page_t)));
 214  214  
 215  215          npgs -= metapgs;
 216  216          base += metapgs;
 217  217  
 218  218          ASSERT(btopr(npgs * sizeof (page_t)) <= metapgs);
 219  219  
 220  220          exhausted = (metapgs == 0 || npgs == 0);
 221  221  
 222  222          if (kpm_enable && !exhausted) {
 223  223                  pgcnt_t start, end, nkpmpgs_prelim;
 224  224                  size_t  ptsz;
 225  225  
 226  226                  /*
 227  227                   * A viable kpm large page mapping must not overlap two
 228  228                   * dynamic memsegs. Therefore the total size is checked
 229  229                   * to be at least kpm_pgsz and also whether start and end
 230  230                   * points are at least kpm_pgsz aligned.
 231  231                   */
 232  232                  if (ptokpmp(tpgs) < 1 || pmodkpmp(pt_base) ||
 233  233                      pmodkpmp(base + npgs)) {
 234  234  
 235  235                          kphysm_addmem_error_undospan(pt_base, tpgs);
 236  236  
 237  237                          /*
 238  238                           * There is no specific error code for violating
 239  239                           * kpm granularity constraints.
 240  240                           */
 241  241                          return (KPHYSM_ENOTVIABLE);
 242  242                  }
 243  243  
 244  244                  start = kpmptop(ptokpmp(base));
 245  245                  end = kpmptop(ptokpmp(base + npgs));
 246  246                  nkpmpgs_prelim = ptokpmp(end - start);
 247  247                  ptsz = npgs * sizeof (page_t);
 248  248                  metapgs = btopr(ptsz + nkpmpgs_prelim * KPMPAGE_T_SZ);
 249  249                  exhausted = (tpgs <= metapgs);
 250  250                  if (!exhausted) {
 251  251                          npgs = tpgs - metapgs;
 252  252                          base = pt_base + metapgs;
 253  253  
 254  254                          /* final nkpmpgs */
 255  255                          start = kpmptop(ptokpmp(base));
 256  256                          nkpmpgs = ptokpmp(end - start);
 257  257                          kpm_pages_off = ptsz +
 258  258                              (nkpmpgs_prelim - nkpmpgs) * KPMPAGE_T_SZ;
 259  259                  }
 260  260          }
 261  261  
 262  262          /*
 263  263           * Is memory area supplied too small?
 264  264           */
 265  265          if (exhausted) {
 266  266                  kphysm_addmem_error_undospan(pt_base, tpgs);
 267  267                  /*
 268  268                   * There is no specific error code for 'too small'.
 269  269                   */
 270  270                  return (KPHYSM_ERESOURCE);
 271  271          }
 272  272  
 273  273  mapalloc:
 274  274          /*
 275  275           * We may re-use a previously allocated VA space for the page_ts
 276  276           * eventually, but we need to initialize and lock the pages first.
 277  277           */
 278  278  
 279  279          /*
 280  280           * Get an address in the kernel address map, map
 281  281           * the page_t pages and see if we can touch them.
 282  282           */
 283  283  
 284  284          mapva = vmem_alloc(heap_arena, ptob(metapgs), VM_NOSLEEP);
 285  285          if (mapva == NULL) {
 286  286                  cmn_err(CE_WARN, "kphysm_add_memory_dynamic:"
 287  287                      " Can't allocate VA for page_ts");
 288  288  
 289  289                  if (meta_alloc)
 290  290                          memseg_free_meta(metabase, metapgs);
 291  291                  kphysm_addmem_error_undospan(pt_base, tpgs);
 292  292  
 293  293                  return (KPHYSM_ERESOURCE);
 294  294          }
 295  295          pp = mapva;
 296  296  
 297  297          if (physmax < (pt_base + tpgs))
 298  298                  physmax = (pt_base + tpgs);
 299  299  
 300  300          /*
 301  301           * In the remapping code we map one page at a time so we must do
 302  302           * the same here to match mapping sizes.
 303  303           */
 304  304          pfn = pt_base;
 305  305          vaddr = (caddr_t)pp;
 306  306          for (pnum = 0; pnum < metapgs; pnum++) {
 307  307                  if (meta_alloc)
 308  308                          pfn = memseg_get_metapfn(metabase, (pgcnt_t)pnum);
 309  309                  hat_devload(kas.a_hat, vaddr, ptob(1), pfn,
 310  310                      PROT_READ | PROT_WRITE,
 311  311                      HAT_LOAD | HAT_LOAD_LOCK | HAT_LOAD_NOCONSIST);
 312  312                  pfn++;
 313  313                  vaddr += ptob(1);
 314  314          }
 315  315  
 316  316          if (ddi_peek32((dev_info_t *)NULL,
 317  317              (int32_t *)pp, (int32_t *)0) == DDI_FAILURE) {
 318  318  
 319  319                  cmn_err(CE_WARN, "kphysm_add_memory_dynamic:"
 320  320                      " Can't access pp array at 0x%p [phys 0x%lx]",
 321  321                      (void *)pp, pt_base);
 322  322  
 323  323                  hat_unload(kas.a_hat, (caddr_t)pp, ptob(metapgs),
 324  324                      HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);
 325  325  
 326  326                  vmem_free(heap_arena, mapva, ptob(metapgs));
 327  327                  if (meta_alloc)
 328  328                          memseg_free_meta(metabase, metapgs);
 329  329                  kphysm_addmem_error_undospan(pt_base, tpgs);
 330  330  
 331  331                  return (KPHYSM_EFAULT);
 332  332          }
 333  333  
 334  334          /*
 335  335           * Add this memory slice to its memory node translation.
 336  336           *
 337  337           * Note that right now, each node may have only one slice;
 338  338           * this may change with COD or in larger SSM systems with
 339  339           * nested latency groups, so we must not assume that the
 340  340           * node does not yet exist.
 341  341           *
 342  342           * Note that there may be multiple memory nodes associated with
 343  343           * a single lgrp node on x86 systems.
 344  344           */
 345  345          pnum = pt_base + tpgs - 1;
 346  346          mem_node_add_range(pt_base, pnum);
 347  347  
 348  348          /*
 349  349           * Allocate or resize page counters as necessary to accommodate
 350  350           * the increase in memory pages.
 351  351           */
 352  352          mnode = PFN_2_MEM_NODE(pnum);
 353  353          PAGE_CTRS_ADJUST(base, npgs, rv);
 354  354          if (rv) {
 355  355  
 356  356                  mem_node_del_range(pt_base, pnum);
 357  357  
 358  358                  /* cleanup the  page counters */
 359  359                  page_ctrs_cleanup();
 360  360  
 361  361                  hat_unload(kas.a_hat, (caddr_t)pp, ptob(metapgs),
 362  362                      HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);
 363  363  
 364  364                  vmem_free(heap_arena, mapva, ptob(metapgs));
 365  365                  if (meta_alloc)
 366  366                          memseg_free_meta(metabase, metapgs);
 367  367                  kphysm_addmem_error_undospan(pt_base, tpgs);
 368  368  
 369  369                  return (KPHYSM_ERESOURCE);
 370  370          }
 371  371  
 372  372          /*
 373  373           * Update the phys_avail memory list.
 374  374           * The phys_install list was done at the start.
 375  375           */
 376  376  
 377  377          memlist_write_lock();
 378  378  
 379  379          mlret = memlist_add_span((uint64_t)(base) << PAGESHIFT,
 380  380              (uint64_t)(npgs) << PAGESHIFT, &phys_avail);
 381  381          ASSERT(mlret == MEML_SPANOP_OK);
 382  382  
 383  383          memlist_write_unlock();
 384  384  
 385  385          /* See if we can find a memseg to re-use. */
 386  386          if (meta_alloc) {
 387  387                  seg = memseg_reuse(0);
 388  388                  reuse = 1;      /* force unmapping of temp mapva */
 389  389                  flags = MEMSEG_DYNAMIC | MEMSEG_META_ALLOC;
 390  390                  /*
 391  391                   * There is a 1:1 fixed relationship between a pfn
 392  392                   * and a page_t VA.  The pfn is used as an index into
 393  393                   * the ppvm_base page_t table in order to calculate
 394  394                   * the page_t base address for a given pfn range.
 395  395                   */
 396  396                  segpp = ppvm_base + base;
 397  397          } else {
 398  398                  seg = memseg_reuse(metapgs);
 399  399                  reuse = (seg != NULL);
 400  400                  flags = MEMSEG_DYNAMIC | MEMSEG_META_INCL;
 401  401                  segpp = pp;
 402  402          }
 403  403  
 404  404          /*
 405  405           * Initialize the memseg structure representing this memory
 406  406           * and add it to the existing list of memsegs. Do some basic
 407  407           * initialization and add the memory to the system.
 408  408           * In order to prevent lock deadlocks, the add_physmem()
 409  409           * code is repeated here, but split into several stages.
 410  410           *
 411  411           * If a memseg is reused, invalidate memseg pointers in
 412  412           * all cpu vm caches.  We need to do this since the check
 413  413           *      pp >= seg->pages && pp < seg->epages
 414  414           * used in various places is not atomic and so the first compare
 415  415           * can happen before reuse and the second compare after reuse.
 416  416           * The invalidation ensures that a memseg is not dereferenced while
 417  417           * its page/pfn pointers are changing.
 418  418           */
 419  419          if (seg == NULL) {
 420  420                  seg = memseg_alloc();
 421  421                  ASSERT(seg != NULL);
 422  422                  seg->msegflags = flags;
                           /* Set pages first so the trace below prints a real value. */
 423  423                  seg->pages = segpp;
 424  424                  MEMSEG_DEBUG("memseg_get: alloc seg=0x%p, pages=0x%p",
 425  425                      (void *)seg, (void *)(seg->pages));
 426  426          } else {
 427  427                  ASSERT(seg->msegflags == flags);
 428  428                  ASSERT(seg->pages_base == seg->pages_end);
 429  429                  MEMSEG_DEBUG("memseg_get: reuse seg=0x%p, pages=0x%p",
 430  430                      (void *)seg, (void *)(seg->pages));
 431  431                  if (meta_alloc) {
 432  432                          memseg_cpu_vm_flush();
 433  433                          seg->pages = segpp;
 434  434                  }
 435  435          }
 436  436  
 437  437          seg->epages = seg->pages + npgs;
 438  438          seg->pages_base = base;
 439  439          seg->pages_end = base + npgs;
 440  440  
 441  441          /*
 442  442           * Initialize metadata. The page_ts are set to locked state
 443  443           * ready to be freed.
 444  444           */
 445  445          bzero((caddr_t)pp, ptob(metapgs));
 446  446  
 447  447          pfn = seg->pages_base;
 448  448          /* Save the original pp base in case we reuse a memseg. */
 449  449          opp = pp;
 450  450          oepp = opp + npgs;
 451  451          for (pp = opp; pp < oepp; pp++) {
 452  452                  pp->p_pagenum = pfn;
 453  453                  pfn++;
 454  454                  page_iolock_init(pp);
 455  455                  while (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_RECLAIM))
 456  456                          continue;
 457  457                  pp->p_offset = (u_offset_t)-1;
 458  458          }
 459  459  
 460  460          if (reuse) {
 461  461                  /* Remap our page_ts to the re-used memseg VA space. */
 462  462                  pfn = pt_base;
 463  463                  vaddr = (caddr_t)seg->pages;
 464  464                  for (pnum = 0; pnum < metapgs; pnum++) {
 465  465                          if (meta_alloc)
 466  466                                  pfn = memseg_get_metapfn(metabase,
 467  467                                      (pgcnt_t)pnum);
 468  468                          hat_devload(kas.a_hat, vaddr, ptob(1), pfn,
 469  469                              PROT_READ | PROT_WRITE,
 470  470                              HAT_LOAD_REMAP | HAT_LOAD | HAT_LOAD_NOCONSIST);
 471  471                          pfn++;
 472  472                          vaddr += ptob(1);
 473  473                  }
 474  474  
 475  475                  hat_unload(kas.a_hat, (caddr_t)opp, ptob(metapgs),
 476  476                      HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);
 477  477  
 478  478                  vmem_free(heap_arena, mapva, ptob(metapgs));
 479  479          }
 480  480  
                   /*
                    * nkpmpgs and kpm_pages_off are both zero unless the kpm sizing
                    * block above computed them.
                    */
 481  481          hat_kpm_addmem_mseg_update(seg, nkpmpgs, kpm_pages_off);
 482  482  
 483  483          memsegs_lock(1);
 484  484  
 485  485          /*
 486  486           * The new memseg is inserted at the beginning of the list.
 487  487           * Not only does this save searching for the tail, but in the
 488  488           * case of a re-used memseg, it solves the problem of what
 489  489           * happens if some process has still got a pointer to the
 490  490           * memseg and follows the next pointer to continue traversing
 491  491           * the memsegs list.
 492  492           */
 493  493  
 494  494          hat_kpm_addmem_mseg_insert(seg);
 495  495  
 496  496          seg->next = memsegs;
 497  497          membar_producer();
 498  498  
 499  499          hat_kpm_addmem_memsegs_update(seg);
 500  500  
 501  501          memsegs = seg;
 502  502  
 503  503          build_pfn_hash();
 504  504  
 505  505          total_pages += npgs;
 506  506  
 507  507          /*
 508  508           * Recalculate the paging parameters now total_pages has changed.
 509  509           * This will also cause the clock hands to be reset before next use.
 510  510           */
 511  511          setupclock(1);
 512  512  
 513  513          memsegs_unlock(1);
 514  514  
 515  515          PLCNT_MODIFY_MAX(seg->pages_base, (long)npgs);
 516  516  
 517  517          /*
 518  518           * Free the pages outside the lock to avoid locking loops.
 519  519           */
 520  520          for (pp = seg->pages; pp < seg->epages; pp++) {
 521  521                  page_free(pp, 1);
 522  522          }
 523  523  
 524  524          /*
 525  525           * Now that we've updated the appropriate memory lists we
 526  526           * need to reset a number of globals, since we've increased memory.
 527  527           * Several have already been updated for us as noted above. The
 528  528           * globals we're interested in at this point are:
 529  529           *   physmax - highest page frame number.
 530  530           *   physinstalled - number of pages currently installed (done earlier)
 531  531           *   maxmem - max free pages in the system
 532  532           *   physmem - physical memory pages available
 533  533           *   availrmem - real memory available
 534  534           */
 535  535  
 536  536          mutex_enter(&freemem_lock);
 537  537          maxmem += npgs;
 538  538          physmem += npgs;
 539  539          availrmem += npgs;
 540  540          availrmem_initial += npgs;
 541  541  
 542  542          mutex_exit(&freemem_lock);
 543  543  
 544  544          dump_resize();
 545  545  
 546  546          page_freelist_coalesce_all(mnode);
 547  547  
 548  548          kphysm_setup_post_add(npgs);
 549  549  
 550  550          cmn_err(CE_CONT, "?kphysm_add_memory_dynamic: mem = %ldK "
 551  551              "(0x%" PRIx64 ")\n",
 552  552              physinstalled << (PAGESHIFT - 10),
 553  553              (uint64_t)physinstalled << PAGESHIFT);
 554  554  
 555  555          avmem = (uint64_t)freemem << PAGESHIFT;
 556  556          cmn_err(CE_CONT, "?kphysm_add_memory_dynamic: "
 557  557              "avail mem = %" PRId64 "\n", avmem);
 558  558  
 559  559          /*
 560  560           * Update lgroup generation number on single lgroup systems
 561  561           */
 562  562          if (nlgrps == 1)
 563  563                  lgrp_config(LGRP_CONFIG_GEN_UPDATE, 0, 0);
 564  564  
 565  565          /*
 566  566           * Inform DDI of update
 567  567           */
 568  568          ddi_mem_update((uint64_t)(pt_base) << PAGESHIFT,
 569  569              (uint64_t)(tpgs) << PAGESHIFT);
 570  570  
 571  571          delspan_unreserve(pt_base, tpgs);
 572  572  
 573  573          return (KPHYSM_OK);             /* Successfully added system memory */
 574  574  }
 575  575  
 576  576  /*
 577  577   * Rollback helper for the failure paths of kphysm_add_memory_dynamic():
 578  578   * takes the span [pt_base, pt_base + tpgs) out of phys_install again,
 579  579   * recomputes physmax/physinstalled and drops the delete-list reservation.
 580  580   */
 581  581  static void
 582  582  kphysm_addmem_error_undospan(pfn_t pt_base, pgcnt_t tpgs)
 583  583  {
 584  584          uint64_t span_addr = (uint64_t)(pt_base) << PAGESHIFT;
 585  585          uint64_t span_len = (uint64_t)(tpgs) << PAGESHIFT;
 586  586          int rc;
 587  587  
 588  588          /* Remove the span from the installed list under the write lock. */
 589  589          memlist_write_lock();
 590  590          rc = memlist_delete_span(span_addr, span_len, &phys_install);
 591  591          ASSERT(rc == MEML_SPANOP_OK);
 592  592  
 593  593          phys_install_has_changed();
 594  594          installed_top_size(phys_install, &physmax, &physinstalled);
 595  595          memlist_write_unlock();
 596  596  
 597  597          delspan_unreserve(pt_base, tpgs);
 598  598  }
 599  599  
 600  600  /*
 601  601   * Only return an available memseg of exactly the right size
 602  602   * if size is required.
 603  603   * When the meta data area has its own virtual address space
 604  604   * we will need to manage this more carefully and do best fit
 605  605   * allocations, possibly splitting an available area.
            *
            * metapgs: number of metadata pages the caller needs.  Non-zero selects
            *          a MEMSEG_META_INCL memseg whose metadata area rounds to
            *          exactly metapgs pages; zero selects the first
            *          MEMSEG_META_ALLOC memseg regardless of size.
            *
            * Returns the matching memseg, unlinked from memseg_va_avail with its
            * lnext cleared, or NULL if the list holds no match.  The list is
            * searched and modified with memseg_lists_lock held.
 606  606   */
 607  607  static struct memseg *
 608  608  memseg_reuse(pgcnt_t metapgs)
 609  609  {
 610  610          int type;
 611  611          struct memseg **segpp, *seg;
 612  612  
 613  613          /*
 614  614           * Make sure we are reusing the right segment type.
                    * This depends only on metapgs, so work it out once up front.
 615  615           */
 616  616          type = metapgs ? MEMSEG_META_INCL : MEMSEG_META_ALLOC;
 617  617  
 618  618          mutex_enter(&memseg_lists_lock);
 619  619  
 620  620          segpp = &memseg_va_avail;
 621  621          for (; (seg = *segpp) != NULL; segpp = &seg->lnext) {
 622  622                  caddr_t end;
 623  623  
 624  624                  if ((seg->msegflags & (MEMSEG_META_INCL | MEMSEG_META_ALLOC))
 625  625                      != type)
 626  626                          continue;
 627  627  
 628  628                  if (kpm_enable)
 629  629                          end = hat_kpm_mseg_reuse(seg);
 630  630                  else
 631  631                          end = (caddr_t)seg->epages;
 632  632  
 633  633                  /*
 634  634                   * Check for the right size if it is provided.
 635  635                   */
 636  636                  if (!metapgs || btopr(end - (caddr_t)seg->pages) == metapgs) {
 637  637                          *segpp = seg->lnext;
 638  638                          seg->lnext = NULL;
 639  639                          break;
 640  640                  }
 641  641          }
 642  642          mutex_exit(&memseg_lists_lock);
 643  643  
 644  644          return (seg);
 645  645  }
 646  646  
           /*
            * NOTE(review): not referenced in the visible part of the file;
            * presumably a generation counter for delete handles -- confirm.
            */
 647  647  static uint_t handle_gen;
 648  648  
           /*
            * One span of pages in a singly linked list (mds_next).
            * NOTE(review): from the field types, mds_base looks like the first pfn
            * and mds_npgs the page count of the span; the two bitmaps are
            * presumably sized with MDS_BITMAPBYTES() -- confirm against the users.
            */
 649  649  struct memdelspan {
 650  650          struct memdelspan *mds_next;
 651  651          pfn_t           mds_base;
 652  652          pgcnt_t         mds_npgs;
 653  653          uint_t          *mds_bitmap;
 654  654          uint_t          *mds_bitmap_retired;
 655  655  };
 656  656  
           /*
            * NBPBMW is the number of bits in one uint_t bitmap word.
            * MDS_BITMAPBYTES() gives the size in bytes of a bitmap holding one bit
            * for each of the span's mds_npgs pages, rounded up to whole words.
            */
 657  657  #define NBPBMW          (sizeof (uint_t) * NBBY)
 658  658  #define MDS_BITMAPBYTES(MDSP) \
 659  659          ((((MDSP)->mds_npgs + NBPBMW - 1) / NBPBMW) * sizeof (uint_t))
 660  660  
           /*
            * Entry on the transit list; struct mem_handle embeds one of these as
            * mh_transit.  trl_spans points at a list of memdelspans.
            * NOTE(review): the meaning of trl_collect is not visible here.
            */
 661  661  struct transit_list {
 662  662          struct transit_list     *trl_next;
 663  663          struct memdelspan       *trl_spans;
 664  664          int                     trl_collect;
 665  665  };
 666  666  
           /*
            * Head of the transit list.
            * NOTE(review): trh_lock presumably protects trh_head -- confirm.
            */
 667  667  struct transit_list_head {
 668  668          kmutex_t                trh_lock;
 669  669          struct transit_list     *trh_head;
 670  670  };
 671  671  
 672  672  static struct transit_list_head transit_list_head;
 673  673  
           /* struct mem_handle is defined further down; declared here for the prototypes. */
 674  674  struct mem_handle;
 675  675  static void transit_list_collect(struct mem_handle *, int);
 676  676  static void transit_list_insert(struct transit_list *);
 677  677  static void transit_list_remove(struct transit_list *);
 678  678  
           /* Delete statistics are compiled in only for DEBUG builds. */
 679  679  #ifdef DEBUG
 680  680  #define MEM_DEL_STATS
 681  681  #endif /* DEBUG */
 682  682  
 683  683  #ifdef MEM_DEL_STATS
           /* NOTE(review): presumably enables printing of the stats -- confirm. */
 684  684  static int mem_del_stat_print = 0;
           /*
            * Per-handle delete statistics; struct mem_handle embeds one of these
            * as mh_delstat, which is what the MDSTAT_* macros below update.
            */
 685  685  struct mem_del_stat {
 686  686          uint_t  nloop;
 687  687          uint_t  need_free;
 688  688          uint_t  free_loop;
 689  689          uint_t  free_low;
 690  690          uint_t  free_failed;
 691  691          uint_t  ncheck;
 692  692          uint_t  nopaget;
 693  693          uint_t  lockfail;
 694  694          uint_t  nfree;
 695  695          uint_t  nreloc;
 696  696          uint_t  nrelocfail;
 697  697          uint_t  already_done;
 698  698          uint_t  first_notfree;
 699  699          uint_t  npplocked;
 700  700          uint_t  nlockreloc;
 701  701          uint_t  nnorepl;
 702  702          uint_t  nmodreloc;
 703  703          uint_t  ndestroy;
 704  704          uint_t  nputpage;
 705  705          uint_t  nnoreclaim;
 706  706          uint_t  ndelay;
 707  707          uint_t  demotefail;
 708  708          uint64_t nticks_total;
 709  709          uint64_t nticks_pgrp;
 710  710          uint_t  retired;
 711  711          uint_t  toxic;
 712  712          uint_t  failing;
 713  713          uint_t  modtoxic;
 714  714          uint_t  npplkdtoxic;
 715  715          uint_t  gptlmodfail;
 716  716          uint_t  gptllckfail;
 717  717  };
 718  718  /*
 719  719   * The stat values are only incremented in the delete thread
 720  720   * so no locking or atomic required.
            *
            * MDSTAT_INCR bumps one counter field by one; MDSTAT_TOTAL and
            * MDSTAT_PGRP add a tick count to nticks_total and nticks_pgrp.
            * Without MEM_DEL_STATS all four macros expand to nothing.
 721  721   */
 722  722  #define MDSTAT_INCR(MHP, FLD)   (MHP)->mh_delstat.FLD++
 723  723  #define MDSTAT_TOTAL(MHP, ntck) ((MHP)->mh_delstat.nticks_total += (ntck))
 724  724  #define MDSTAT_PGRP(MHP, ntck)  ((MHP)->mh_delstat.nticks_pgrp += (ntck))
 725  725  static void mem_del_stat_print_func(struct mem_handle *);
 726  726  #define MDSTAT_PRINT(MHP)       mem_del_stat_print_func((MHP))
 727  727  #else /* MEM_DEL_STATS */
 728  728  #define MDSTAT_INCR(MHP, FLD)
 729  729  #define MDSTAT_TOTAL(MHP, ntck)
 730  730  #define MDSTAT_PGRP(MHP, ntck)
 731  731  #define MDSTAT_PRINT(MHP)
 732  732  #endif /* MEM_DEL_STATS */
 733  733  
           /*
            * State of a mem_handle.  A freshly allocated (zeroed) handle is
            * MHND_FREE; kphysm_del_gethandle() moves it to MHND_INIT, which is the
            * only state in which kphysm_del_span() accepts spans.  A handle must be
            * back in MHND_FREE before kphysm_free_mem_handle() will free it, and
            * kphysm_lookup_mem_handle() never returns a handle in MHND_FREE.
            */
 734  734  typedef enum mhnd_state {MHND_FREE = 0, MHND_INIT, MHND_STARTING,
 735  735          MHND_RUNNING, MHND_DONE, MHND_RELEASE} mhnd_state_t;
 736  736  
 737  737  /*
 738  738   * mh_mutex must be taken to examine or change mh_exthandle and mh_state.
 739  739   * The mutex may not be required for other fields, dependent on mh_state.
            *
            * Fields whose use is visible near this definition:
            *   mh_next        links the handle on the list headed by
            *                  mem_handle_head.
            *   mh_exthandle   value handed out to callers; generated from
            *                  handle_gen when the handle is allocated.
            *   mh_transit     the spans added to this handle; passed to
            *                  delspan_insert() and delspan_remove().
            *   mh_phys_pages, mh_vm_pages
            *                  running totals accumulated by kphysm_del_span().
            *   mh_delstat     statistics, present only with MEM_DEL_STATS.
 740  740   */
 741  741  struct mem_handle {
 742  742          kmutex_t        mh_mutex;
 743  743          struct mem_handle *mh_next;
 744  744          memhandle_t     mh_exthandle;
 745  745          mhnd_state_t    mh_state;
 746  746          struct transit_list mh_transit;
 747  747          pgcnt_t         mh_phys_pages;
 748  748          pgcnt_t         mh_vm_pages;
 749  749          pgcnt_t         mh_hold_todo;
 750  750          void            (*mh_delete_complete)(void *, int error);
 751  751          void            *mh_delete_complete_arg;
 752  752          volatile uint_t mh_cancel;
 753  753          volatile uint_t mh_dr_aio_cleanup_cancel;
 754  754          volatile uint_t mh_aio_cleanup_done;
 755  755          kcondvar_t      mh_cv;
 756  756          kthread_id_t    mh_thread_id;
 757  757          page_t          *mh_deleted;    /* link through p_next */
 758  758  #ifdef MEM_DEL_STATS
 759  759          struct mem_del_stat mh_delstat;
 760  760  #endif /* MEM_DEL_STATS */
 761  761  };
 762  762  
           /*
            * List of all allocated handles, linked through mh_next, and the mutex
            * that protects the list.  Where both locks are held, the list mutex is
            * taken before a handle's mh_mutex (see kphysm_allocate_mem_handle() and
            * kphysm_lookup_mem_handle()).
            */
 763  763  static struct mem_handle *mem_handle_head;
 764  764  static kmutex_t mem_handle_list_mutex;
 765  765  
 766  766  static struct mem_handle *
 767  767  kphysm_allocate_mem_handle()
 768  768  {
 769  769          struct mem_handle *mhp;
 770  770  
 771  771          mhp = kmem_zalloc(sizeof (struct mem_handle), KM_SLEEP);
 772  772          mutex_init(&mhp->mh_mutex, NULL, MUTEX_DEFAULT, NULL);
 773  773          mutex_enter(&mem_handle_list_mutex);
 774  774          mutex_enter(&mhp->mh_mutex);
 775  775          /* handle_gen is protected by list mutex. */
 776  776          mhp->mh_exthandle = (memhandle_t)(uintptr_t)(++handle_gen);
 777  777          mhp->mh_next = mem_handle_head;
 778  778          mem_handle_head = mhp;
 779  779          mutex_exit(&mem_handle_list_mutex);
 780  780  
 781  781          return (mhp);
 782  782  }
 783  783  
           /*
            * Unlink a handle from the global handle list and free it.
            *
            * The caller must hold mhp->mh_mutex and the handle must already be in
            * the MHND_FREE state (both are asserted).  The mutex is released and
            * destroyed here and the memory is freed, so mhp must not be used after
            * this function returns.
            */
 784  784  static void
 785  785  kphysm_free_mem_handle(struct mem_handle *mhp)
 786  786  {
 787  787          struct mem_handle **mhpp;
 788  788  
 789  789          ASSERT(mutex_owned(&mhp->mh_mutex));
 790  790          ASSERT(mhp->mh_state == MHND_FREE);
 791  791          /*
 792  792           * Exit the mutex to preserve locking order. This is OK
 793  793           * here as once in the FREE state, the handle cannot
 794  794           * be found by a lookup.
 795  795           */
 796  796          mutex_exit(&mhp->mh_mutex);
 797  797  
 798  798          mutex_enter(&mem_handle_list_mutex);
                   /*
                    * Find the link (the list head or some handle's mh_next) that
                    * points at mhp, so it can be redirected past mhp.
                    */
 799  799          mhpp = &mem_handle_head;
 800  800          while (*mhpp != NULL && *mhpp != mhp)
 801  801                  mhpp = &(*mhpp)->mh_next;
 802  802          ASSERT(*mhpp == mhp);
 803  803          /*
 804  804           * No need to lock the handle (mh_mutex) as only
 805  805           * mh_next changing and this is the only thread that
 806  806           * can be referencing mhp.
 807  807           */
 808  808          *mhpp = mhp->mh_next;
 809  809          mutex_exit(&mem_handle_list_mutex);
 810  810  
 811  811          mutex_destroy(&mhp->mh_mutex);
 812  812          kmem_free(mhp, sizeof (struct mem_handle));
 813  813  }
 814  814  
 815  815  /*
 816  816   * This function finds the internal mem_handle corresponding to an
 817  817   * external handle and returns it with the mh_mutex held.
            * It returns NULL, with no handle mutex held, if no handle that is
            * outside the MHND_FREE state carries the given external handle.
 818  818   */
 819  819  static struct mem_handle *
 820  820  kphysm_lookup_mem_handle(memhandle_t handle)
 821  821  {
 822  822          struct mem_handle *mhp;
 823  823  
 824  824          mutex_enter(&mem_handle_list_mutex);
 825  825          for (mhp = mem_handle_head; mhp != NULL; mhp = mhp->mh_next) {
 826  826                  if (mhp->mh_exthandle == handle) {
                                   /* Taken while still holding the list mutex. */
 827  827                          mutex_enter(&mhp->mh_mutex);
 828  828                          /*
 829  829                           * The state of the handle could have been changed
 830  830                           * by kphysm_del_release() while waiting for mh_mutex.
 831  831                           */
 832  832                          if (mhp->mh_state == MHND_FREE) {
 833  833                                  mutex_exit(&mhp->mh_mutex);
 834  834                                  continue;
 835  835                          }
                                   /* Found: leave the loop with mh_mutex still held. */
 836  836                          break;
 837  837                  }
 838  838          }
 839  839          mutex_exit(&mem_handle_list_mutex);
 840  840          return (mhp);
 841  841  }
 842  842  
           /*
            * Create a new memory delete handle.
            *
            * A handle is allocated, moved from MHND_FREE to MHND_INIT and its
            * external handle value is stored through xmhp.  The handle's mutex,
            * which kphysm_allocate_mem_handle() returns held, is released before
            * returning.  Always returns KPHYSM_OK.
            */
 843  843  int
 844  844  kphysm_del_gethandle(memhandle_t *xmhp)
 845  845  {
 846  846          struct mem_handle *mhp;
 847  847  
 848  848          mhp = kphysm_allocate_mem_handle();
 849  849          /*
 850  850           * The handle is allocated using KM_SLEEP, so cannot fail.
 851  851           * If the implementation is changed, the correct error to return
 852  852           * here would be KPHYSM_ENOHANDLES.
 853  853           */
 854  854          ASSERT(mhp->mh_state == MHND_FREE);
 855  855          mhp->mh_state = MHND_INIT;
 856  856          *xmhp = mhp->mh_exthandle;
 857  857          mutex_exit(&mhp->mh_mutex);
 858  858          return (KPHYSM_OK);
 859  859  }
 860  860  
 861  861  static int
 862  862  overlapping(pfn_t b1, pgcnt_t l1, pfn_t b2, pgcnt_t l2)
 863  863  {
 864  864          pfn_t e1, e2;
 865  865  
 866  866          e1 = b1 + l1;
 867  867          e2 = b2 + l2;
 868  868  
 869  869          return (!(b2 >= e1 || b1 >= e2));
 870  870  }
 871  871  
 872  872  static int can_remove_pgs(pgcnt_t);
 873  873  
           /*
            * Intersect the page range [base, base + npgs) with the installed
            * memory list (phys_install).
            *
            * Returns a newly allocated list of memdelspans, linked through
            * mds_next, with one entry for each piece of the range that lies inside
            * a phys_install entry.  Entries are pushed on the front of the list as
            * they are found.  Returns NULL if no part of the range is installed.
            * The spans are allocated with KM_SLEEP; callers release the list with
            * free_delspans() when it is not handed on to delspan_insert().
            */
 874  874  static struct memdelspan *
 875  875  span_to_install(pfn_t base, pgcnt_t npgs)
 876  876  {
 877  877          struct memdelspan *mdsp;
 878  878          struct memdelspan *mdsp_new;
 879  879          uint64_t address, size, thislen;
 880  880          struct memlist *mlp;
 881  881  
 882  882          mdsp_new = NULL;
 883  883  
                   /* Work in byte addresses; address/size is the part still to do. */
 884  884          address = (uint64_t)base << PAGESHIFT;
 885  885          size = (uint64_t)npgs << PAGESHIFT;
 886  886          while (size != 0) {
 887  887                  memlist_read_lock();
                           /*
                            * Skip entries that end at or below address and stop at the
                            * first remaining entry that starts before the end of the
                            * range.
                            * NOTE(review): this reads as if phys_install is sorted by
                            * ascending address -- confirm.
                            */
 888  888                  for (mlp = phys_install; mlp != NULL; mlp = mlp->ml_next) {
 889  889                          if (address >= (mlp->ml_address + mlp->ml_size))
 890  890                                  continue;
 891  891                          if ((address + size) > mlp->ml_address)
 892  892                                  break;
 893  893                  }
 894  894                  if (mlp == NULL) {
                                   /* Nothing installed in what is left: finish. */
 895  895                          address += size;
 896  896                          size = 0;
 897  897                          thislen = 0;
 898  898                  } else {
                                   /* Drop any gap in front of the memlist entry. */
 899  899                          if (address < mlp->ml_address) {
 900  900                                  size -= (mlp->ml_address - address);
 901  901                                  address = mlp->ml_address;
 902  902                          }
 903  903                          ASSERT(address >= mlp->ml_address);
                                   /* Clip the piece to the end of the memlist entry. */
 904  904                          if ((address + size) >
 905  905                              (mlp->ml_address + mlp->ml_size)) {
 906  906                                  thislen =
 907  907                                      mlp->ml_size - (address - mlp->ml_address);
 908  908                          } else {
 909  909                                  thislen = size;
 910  910                          }
 911  911                  }
                           /* The memlist lock is dropped before the sleeping allocation. */
 912  912                  memlist_read_unlock();
 913  913                  /* TODO: phys_install could change now */
 914  914                  if (thislen == 0)
 915  915                          continue;
 916  916                  mdsp = kmem_zalloc(sizeof (struct memdelspan), KM_SLEEP);
 917  917                  mdsp->mds_base = btop(address);
 918  918                  mdsp->mds_npgs = btop(thislen);
 919  919                  mdsp->mds_next = mdsp_new;
 920  920                  mdsp_new = mdsp;
 921  921                  address += thislen;
 922  922                  size -= thislen;
 923  923          }
 924  924          return (mdsp_new);
 925  925  }
 926  926  
 927  927  static void
 928  928  free_delspans(struct memdelspan *mdsp)
 929  929  {
 930  930          struct memdelspan *amdsp;
 931  931  
 932  932          while ((amdsp = mdsp) != NULL) {
 933  933                  mdsp = amdsp->mds_next;
 934  934                  kmem_free(amdsp, sizeof (struct memdelspan));
 935  935          }
 936  936  }
 937  937  
 938  938  /*
 939  939   * Concatenate lists. No list ordering is required.
 940  940   */
 941  941  
 942  942  static void
 943  943  delspan_concat(struct memdelspan **mdspp, struct memdelspan *mdsp)
 944  944  {
 945  945          while (*mdspp != NULL)
 946  946                  mdspp = &(*mdspp)->mds_next;
 947  947  
 948  948          *mdspp = mdsp;
 949  949  }
 950  950  
 951  951  /*
 952  952   * Given a new list of delspans, check there is no overlap with
 953  953   * all existing span activity (add or delete) and then concatenate
 954  954   * the new spans to the given list.
 955  955   * Return 1 for OK, 0 if overlapping.
            *
            * The check covers the spans of every transit list on the global
            * list.  On success my_tlp is put on the global list if it had no
            * spans before.  On failure nothing is changed and mdsp_new is left
            * for the caller to free.  The whole operation runs under trh_lock.
 956  956   */
 957  957  static int
 958  958  delspan_insert(
 959  959          struct transit_list *my_tlp,
 960  960          struct memdelspan *mdsp_new)
 961  961  {
 962  962          struct transit_list_head *trh;
 963  963          struct transit_list *tlp;
 964  964          int ret;
 965  965  
 966  966          trh = &transit_list_head;
 967  967  
 968  968          ASSERT(my_tlp != NULL);
 969  969          ASSERT(mdsp_new != NULL);
 970  970  
 971  971          ret = 1;
 972  972          mutex_enter(&trh->trh_lock);
 973  973          /* ASSERT(my_tlp->trl_spans == NULL || tlp_in_list(trh, my_tlp)); */
                   /* Compare every existing span against every new span. */
 974  974          for (tlp = trh->trh_head; tlp != NULL; tlp = tlp->trl_next) {
 975  975                  struct memdelspan *mdsp;
 976  976  
 977  977                  for (mdsp = tlp->trl_spans; mdsp != NULL;
 978  978                      mdsp = mdsp->mds_next) {
 979  979                          struct memdelspan *nmdsp;
 980  980  
 981  981                          for (nmdsp = mdsp_new; nmdsp != NULL;
 982  982                              nmdsp = nmdsp->mds_next) {
 983  983                                  if (overlapping(mdsp->mds_base, mdsp->mds_npgs,
 984  984                                      nmdsp->mds_base, nmdsp->mds_npgs)) {
 985  985                                          ret = 0;
 986  986                                          goto done;
 987  987                                  }
 988  988                          }
 989  989                  }
 990  990          }
 991  991  done:
 992  992          if (ret != 0) {
                           /* First spans for this list: make it globally visible. */
 993  993                  if (my_tlp->trl_spans == NULL)
 994  994                          transit_list_insert(my_tlp);
 995  995                  delspan_concat(&my_tlp->trl_spans, mdsp_new);
 996  996          }
 997  997          mutex_exit(&trh->trh_lock);
 998  998          return (ret);
 999  999  }
1000 1000  
           /*
            * Remove spans from the transit list my_tlp.
            *
            * If npgs is 0, all spans on the list are freed.  Otherwise only the
            * spans that lie wholly within [base, base + npgs) are unlinked and
            * freed.  If the list ends up with no spans it is taken off the global
            * transit list.  A list that already has no spans is left alone.
            * The whole operation runs under trh_lock.
            */
1001 1001  static void
1002 1002  delspan_remove(
1003 1003          struct transit_list *my_tlp,
1004 1004          pfn_t base,
1005 1005          pgcnt_t npgs)
1006 1006  {
1007 1007          struct transit_list_head *trh;
1008 1008          struct memdelspan *mdsp;
1009 1009  
1010 1010          trh = &transit_list_head;
1011 1011  
1012 1012          ASSERT(my_tlp != NULL);
1013 1013  
1014 1014          mutex_enter(&trh->trh_lock);
1015 1015          if ((mdsp = my_tlp->trl_spans) != NULL) {
1016 1016                  if (npgs == 0) {
1017 1017                          my_tlp->trl_spans = NULL;
1018 1018                          free_delspans(mdsp);
1019 1019                          transit_list_remove(my_tlp);
1020 1020                  } else {
1021 1021                          struct memdelspan **prv;
1022 1022  
                                   /* prv is the link that currently points at mdsp. */
1023 1023                          prv = &my_tlp->trl_spans;
1024 1024                          while (mdsp != NULL) {
1025 1025                                  pfn_t p_end;
1026 1026  
1027 1027                                  p_end = mdsp->mds_base + mdsp->mds_npgs;
1028 1028                                  if (mdsp->mds_base >= base &&
1029 1029                                      p_end <= (base + npgs)) {
                                                   /*
                                                    * Unlink this span and detach it
                                                    * so that free_delspans() frees
                                                    * only this one element.
                                                    */
1030 1030                                          *prv = mdsp->mds_next;
1031 1031                                          mdsp->mds_next = NULL;
1032 1032                                          free_delspans(mdsp);
1033 1033                                  } else {
1034 1034                                          prv = &mdsp->mds_next;
1035 1035                                  }
1036 1036                                  mdsp = *prv;
1037 1037                          }
1038 1038                          if (my_tlp->trl_spans == NULL)
1039 1039                                  transit_list_remove(my_tlp);
1040 1040                  }
1041 1041          }
1042 1042          mutex_exit(&trh->trh_lock);
1043 1043  }
1044 1044  
1045 1045  /*
1046 1046   * Reserve interface for add to stop delete before add finished.
1047 1047   * This list is only accessed through the delspan_insert/remove
1048 1048   * functions and so is fully protected by the trh_lock mutex in
           * struct transit_list_head, which both of those functions hold.
1049 1049   */
1050 1050  
1051 1051  static struct transit_list reserve_transit;
1052 1052  
1053 1053  static int
1054 1054  delspan_reserve(pfn_t base, pgcnt_t npgs)
1055 1055  {
1056 1056          struct memdelspan *mdsp;
1057 1057          int ret;
1058 1058  
1059 1059          mdsp = kmem_zalloc(sizeof (struct memdelspan), KM_SLEEP);
1060 1060          mdsp->mds_base = base;
1061 1061          mdsp->mds_npgs = npgs;
1062 1062          if ((ret = delspan_insert(&reserve_transit, mdsp)) == 0) {
1063 1063                  free_delspans(mdsp);
1064 1064          }
1065 1065          return (ret);
1066 1066  }
1067 1067  
           /*
            * Undo delspan_reserve(): remove from the reserve_transit list the
            * spans that delspan_remove() selects for base and npgs (those lying
            * wholly within [base, base + npgs), or all of them if npgs is 0).
            */
1068 1068  static void
1069 1069  delspan_unreserve(pfn_t base, pgcnt_t npgs)
1070 1070  {
1071 1071          delspan_remove(&reserve_transit, base, npgs);
1072 1072  }
1073 1073  
1074 1074  /*
1075 1075   * Return whether memseg was created by kphysm_add_memory_dynamic().
1076 1076   */
1077 1077  static int
1078 1078  memseg_is_dynamic(struct memseg *seg)
1079 1079  {
1080 1080          return (seg->msegflags & MEMSEG_DYNAMIC);
1081 1081  }
1082 1082  
           /*
            * Add the page range [base, base + npgs) to the set of spans to be
            * deleted under the given handle.
            *
            * The range is first cut down to the installed memory it covers, the
            * resulting spans are registered on the handle's transit list, and
            * every memseg they touch is then checked for suitability.
            *
            * Returns:
            *   KPHYSM_OK         the spans were accepted and the handle's page
            *                     totals updated; also returned when the range
            *                     contains no installed memory at all.
            *   KPHYSM_EHANDLE    no such handle.
            *   KPHYSM_ESEQUENCE  the handle is not in the MHND_INIT state.
            *   KPHYSM_EDUP       the range overlaps a span already on this handle.
            *   KPHYSM_EBUSY      the range overlaps span activity elsewhere, or a
            *                     memseg that includes its own metadata is not
            *                     wholly inside a span.
            *   KPHYSM_ERESOURCE  a memseg could not be split.
            *   KPHYSM_ENONRELOC  a page is non-relocatable, or memsegs do not
            *                     account for every page of a span.
            *
            * On any error detected after the spans were registered they are
            * removed from the handle again.
            */
1083 1083  int
1084 1084  kphysm_del_span(
1085 1085          memhandle_t handle,
1086 1086          pfn_t base,
1087 1087          pgcnt_t npgs)
1088 1088  {
1089 1089          struct mem_handle *mhp;
1090 1090          struct memseg *seg;
1091 1091          struct memdelspan *mdsp;
1092 1092          struct memdelspan *mdsp_new;
1093 1093          pgcnt_t phys_pages, vm_pages;
1094 1094          pfn_t p_end;
1095 1095          page_t *pp;
1096 1096          int ret;
1097 1097  
                   /* On success the lookup returns with mhp->mh_mutex held. */
1098 1098          mhp = kphysm_lookup_mem_handle(handle);
1099 1099          if (mhp == NULL) {
1100 1100                  return (KPHYSM_EHANDLE);
1101 1101          }
1102 1102          if (mhp->mh_state != MHND_INIT) {
1103 1103                  mutex_exit(&mhp->mh_mutex);
1104 1104                  return (KPHYSM_ESEQUENCE);
1105 1105          }
1106 1106  
1107 1107          /*
1108 1108           * Intersect the span with the installed memory list (phys_install).
1109 1109           */
1110 1110          mdsp_new = span_to_install(base, npgs);
1111 1111          if (mdsp_new == NULL) {
1112 1112                  /*
1113 1113                   * No physical memory in this range. Is this an
1114 1114                   * error? If an attempt to start the delete is made
1115 1115                   * for OK returns from del_span such as this, start will
1116 1116                   * return an error.
1117 1117                   * Could return KPHYSM_ENOWORK.
1118 1118                   */
1119 1119                  /*
1120 1120                   * It is assumed that there are no error returns
1121 1121                   * from span_to_install() due to kmem_alloc failure.
1122 1122                   */
1123 1123                  mutex_exit(&mhp->mh_mutex);
1124 1124                  return (KPHYSM_OK);
1125 1125          }
1126 1126          /*
1127 1127           * Does this span overlap an existing span?
1128 1128           */
1129 1129          if (delspan_insert(&mhp->mh_transit, mdsp_new) == 0) {
1130 1130                  /*
1131 1131                   * Differentiate between already on list for this handle
1132 1132                   * (KPHYSM_EDUP) and busy elsewhere (KPHYSM_EBUSY).
1133 1133                   */
1134 1134                  ret = KPHYSM_EBUSY;
1135 1135                  for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
1136 1136                      mdsp = mdsp->mds_next) {
1137 1137                          if (overlapping(mdsp->mds_base, mdsp->mds_npgs,
1138 1138                              base, npgs)) {
1139 1139                                  ret = KPHYSM_EDUP;
1140 1140                                  break;
1141 1141                          }
1142 1142                  }
1143 1143                  mutex_exit(&mhp->mh_mutex);
                           /* The insert was refused, so the new spans are still ours. */
1144 1144                  free_delspans(mdsp_new);
1145 1145                  return (ret);
1146 1146          }
1147 1147          /*
1148 1148           * At this point the spans in mdsp_new have been inserted into the
1149 1149           * list of spans for this handle and thereby to the global list of
1150 1150           * spans being processed. Each of these spans must now be checked
1151 1151           * for relocatability. As a side-effect segments in the memseg list
1152 1152           * may be split.
1153 1153           *
1154 1154           * Note that mdsp_new can no longer be used as it is now part of
1155 1155           * a larger list. Select elements of this larger list based
1156 1156           * on base and npgs.
1157 1157           */
1158 1158  restart:
                   /* The totals are recomputed from scratch on every pass. */
1159 1159          phys_pages = 0;
1160 1160          vm_pages = 0;
1161 1161          ret = KPHYSM_OK;
1162 1162          for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
1163 1163              mdsp = mdsp->mds_next) {
1164 1164                  pgcnt_t pages_checked;
1165 1165  
                           /* Only the spans belonging to this request are of interest. */
1166 1166                  if (!overlapping(mdsp->mds_base, mdsp->mds_npgs, base, npgs)) {
1167 1167                          continue;
1168 1168                  }
1169 1169                  p_end = mdsp->mds_base + mdsp->mds_npgs;
1170 1170                  /*
1171 1171                   * The pages_checked count is a hack. All pages should be
1172 1172                   * checked for relocatability. Those not covered by memsegs
1173 1173                   * should be tested with arch_kphysm_del_span_ok().
1174 1174                   */
1175 1175                  pages_checked = 0;
1176 1176                  for (seg = memsegs; seg; seg = seg->next) {
1177 1177                          pfn_t mseg_start;
1178 1178  
1179 1179                          if (seg->pages_base >= p_end ||
1180 1180                              seg->pages_end <= mdsp->mds_base) {
1181 1181                                  /* Span and memseg don't overlap. */
1182 1182                                  continue;
1183 1183                          }
1184 1184                          mseg_start = memseg_get_start(seg);
1185 1185                          /* Check that segment is suitable for delete. */
1186 1186                          if (memseg_includes_meta(seg)) {
1187 1187                                  /*
1188 1188                                   * Check that this segment is completely
1189 1189                                   * within the span.
1190 1190                                   */
1191 1191                                  if (mseg_start < mdsp->mds_base ||
1192 1192                                      seg->pages_end > p_end) {
1193 1193                                          ret = KPHYSM_EBUSY;
1194 1194                                          break;
1195 1195                                  }
1196 1196                                  pages_checked += seg->pages_end - mseg_start;
1197 1197                          } else {
1198 1198                                  /*
1199 1199                                   * If this segment is larger than the span,
1200 1200                                   * try to split it. After the split, it
1201 1201                                   * is necessary to restart.
1202 1202                                   */
1203 1203                                  if (seg->pages_base < mdsp->mds_base ||
1204 1204                                      seg->pages_end > p_end) {
1205 1205                                          pfn_t abase;
1206 1206                                          pgcnt_t anpgs;
1207 1207                                          int s_ret;
1208 1208  
1209 1209                                          /* Split required.  */
                                                   /*
                                                    * abase/anpgs describe the part of
                                                    * the memseg that lies inside the
                                                    * span.
                                                    */
1210 1210                                          if (mdsp->mds_base < seg->pages_base)
1211 1211                                                  abase = seg->pages_base;
1212 1212                                          else
1213 1213                                                  abase = mdsp->mds_base;
1214 1214                                          if (p_end > seg->pages_end)
1215 1215                                                  anpgs = seg->pages_end - abase;
1216 1216                                          else
1217 1217                                                  anpgs = p_end - abase;
1218 1218                                          s_ret = kphysm_split_memseg(abase,
1219 1219                                              anpgs);
1220 1220                                          if (s_ret == 0) {
1221 1221                                                  /* Split failed. */
1222 1222                                                  ret = KPHYSM_ERESOURCE;
1223 1223                                                  break;
1224 1224                                          }
1225 1225                                          goto restart;
1226 1226                                  }
1227 1227                                  pages_checked +=
1228 1228                                      seg->pages_end - seg->pages_base;
1229 1229                          }
1230 1230                          /*
1231 1231                           * The memseg is wholly within the delete span.
1232 1232                           * The individual pages can now be checked.
1233 1233                           */
1234 1234                          /* Cage test. */
1235 1235                          for (pp = seg->pages; pp < seg->epages; pp++) {
1236 1236                                  if (PP_ISNORELOC(pp)) {
1237 1237                                          ret = KPHYSM_ENONRELOC;
1238 1238                                          break;
1239 1239                                  }
1240 1240                          }
1241 1241                          if (ret != KPHYSM_OK) {
1242 1242                                  break;
1243 1243                          }
1244 1244                          phys_pages += (seg->pages_end - mseg_start);
1245 1245                          vm_pages += MSEG_NPAGES(seg);
1246 1246                  }
1247 1247                  if (ret != KPHYSM_OK)
1248 1248                          break;
                           /* Memsegs must have accounted for every page of the span. */
1249 1249                  if (pages_checked != mdsp->mds_npgs) {
1250 1250                          ret = KPHYSM_ENONRELOC;
1251 1251                          break;
1252 1252                  }
1253 1253          }
1254 1254  
1255 1255          if (ret == KPHYSM_OK) {
1256 1256                  mhp->mh_phys_pages += phys_pages;
1257 1257                  mhp->mh_vm_pages += vm_pages;
1258 1258          } else {
1259 1259                  /*
1260 1260                   * Keep holding the mh_mutex to prevent it going away.
1261 1261                   */
1262 1262                  delspan_remove(&mhp->mh_transit, base, npgs);
1263 1263          }
1264 1264          mutex_exit(&mhp->mh_mutex);
1265 1265          return (ret);
1266 1266  }
1267 1267  
1268 1268  int
1269 1269  kphysm_del_span_query(
1270 1270          pfn_t base,
1271 1271          pgcnt_t npgs,
1272 1272          memquery_t *mqp)
1273 1273  {
1274 1274          struct memdelspan *mdsp;
1275 1275          struct memdelspan *mdsp_new;
1276 1276          int done_first_nonreloc;
1277 1277  
1278 1278          mqp->phys_pages = 0;
1279 1279          mqp->managed = 0;
1280 1280          mqp->nonrelocatable = 0;
1281 1281          mqp->first_nonrelocatable = 0;
1282 1282          mqp->last_nonrelocatable = 0;
1283 1283  
1284 1284          mdsp_new = span_to_install(base, npgs);
1285 1285          /*
1286 1286           * It is OK to proceed here if mdsp_new == NULL.
1287 1287           */
1288 1288          done_first_nonreloc = 0;
1289 1289          for (mdsp = mdsp_new; mdsp != NULL; mdsp = mdsp->mds_next) {
1290 1290                  pfn_t sbase;
1291 1291                  pgcnt_t snpgs;
1292 1292  
1293 1293                  mqp->phys_pages += mdsp->mds_npgs;
1294 1294                  sbase = mdsp->mds_base;
1295 1295                  snpgs = mdsp->mds_npgs;
1296 1296                  while (snpgs != 0) {
1297 1297                          struct memseg *lseg, *seg;
1298 1298                          pfn_t p_end;
1299 1299                          page_t *pp;
1300 1300                          pfn_t mseg_start;
1301 1301  
1302 1302                          p_end = sbase + snpgs;
1303 1303                          /*
1304 1304                           * Find the lowest addressed memseg that starts
1305 1305                           * after sbase and account for it.
1306 1306                           * This is to catch dynamic memsegs whose start
1307 1307                           * is hidden.
1308 1308                           */
1309 1309                          seg = NULL;
1310 1310                          for (lseg = memsegs; lseg != NULL; lseg = lseg->next) {
1311 1311                                  if ((lseg->pages_base >= sbase) ||
1312 1312                                      (lseg->pages_base < p_end &&
1313 1313                                      lseg->pages_end > sbase)) {
1314 1314                                          if (seg == NULL ||
1315 1315                                              seg->pages_base > lseg->pages_base)
1316 1316                                                  seg = lseg;
1317 1317                                  }
1318 1318                          }
1319 1319                          if (seg != NULL) {
1320 1320                                  mseg_start = memseg_get_start(seg);
1321 1321                                  /*
1322 1322                                   * Now have the full extent of the memseg so
1323 1323                                   * do the range check.
1324 1324                                   */
1325 1325                                  if (mseg_start >= p_end ||
1326 1326                                      seg->pages_end <= sbase) {
1327 1327                                          /* Span does not overlap memseg. */
1328 1328                                          seg = NULL;
1329 1329                                  }
1330 1330                          }
1331 1331                          /*
1332 1332                           * Account for gap either before the segment if
1333 1333                           * there is one or to the end of the span.
1334 1334                           */
1335 1335                          if (seg == NULL || mseg_start > sbase) {
1336 1336                                  pfn_t a_end;
1337 1337  
1338 1338                                  a_end = (seg == NULL) ? p_end : mseg_start;
1339 1339                                  /*
1340 1340                                   * Check with arch layer for relocatability.
1341 1341                                   */
1342 1342                                  if (arch_kphysm_del_span_ok(sbase,
1343 1343                                      (a_end - sbase))) {
1344 1344                                          /*
1345 1345                                           * No non-relocatable pages in this
1346 1346                                           * area, avoid the fine-grained
1347 1347                                           * test.
1348 1348                                           */
1349 1349                                          snpgs -= (a_end - sbase);
1350 1350                                          sbase = a_end;
1351 1351                                  }
1352 1352                                  while (sbase < a_end) {
1353 1353                                          if (!arch_kphysm_del_span_ok(sbase,
1354 1354                                              1)) {
1355 1355                                                  mqp->nonrelocatable++;
1356 1356                                                  if (!done_first_nonreloc) {
1357 1357                                                          mqp->
1358 1358                                                              first_nonrelocatable
1359 1359                                                              = sbase;
1360 1360                                                          done_first_nonreloc = 1;
1361 1361                                                  }
1362 1362                                                  mqp->last_nonrelocatable =
1363 1363                                                      sbase;
1364 1364                                          }
1365 1365                                          sbase++;
1366 1366                                          snpgs--;
1367 1367                                  }
1368 1368                          }
1369 1369                          if (seg != NULL) {
1370 1370                                  ASSERT(mseg_start <= sbase);
1371 1371                                  if (seg->pages_base != mseg_start &&
1372 1372                                      seg->pages_base > sbase) {
1373 1373                                          pgcnt_t skip_pgs;
1374 1374  
1375 1375                                          /*
1376 1376                                           * Skip the page_t area of a
1377 1377                                           * dynamic memseg.
1378 1378                                           */
1379 1379                                          skip_pgs = seg->pages_base - sbase;
1380 1380                                          if (snpgs <= skip_pgs) {
1381 1381                                                  sbase += snpgs;
1382 1382                                                  snpgs = 0;
1383 1383                                                  continue;
1384 1384                                          }
1385 1385                                          snpgs -= skip_pgs;
1386 1386                                          sbase += skip_pgs;
1387 1387                                  }
1388 1388                                  ASSERT(snpgs != 0);
1389 1389                                  ASSERT(seg->pages_base <= sbase);
1390 1390                                  /*
1391 1391                                   * The individual pages can now be checked.
1392 1392                                   */
1393 1393                                  for (pp = seg->pages +
1394 1394                                      (sbase - seg->pages_base);
1395 1395                                      snpgs != 0 && pp < seg->epages; pp++) {
1396 1396                                          mqp->managed++;
1397 1397                                          if (PP_ISNORELOC(pp)) {
1398 1398                                                  mqp->nonrelocatable++;
1399 1399                                                  if (!done_first_nonreloc) {
1400 1400                                                          mqp->
1401 1401                                                              first_nonrelocatable
1402 1402                                                              = sbase;
1403 1403                                                          done_first_nonreloc = 1;
1404 1404                                                  }
1405 1405                                                  mqp->last_nonrelocatable =
1406 1406                                                      sbase;
1407 1407                                          }
1408 1408                                          sbase++;
1409 1409                                          snpgs--;
1410 1410                                  }
1411 1411                          }
1412 1412                  }
1413 1413          }
1414 1414  
1415 1415          free_delspans(mdsp_new);
1416 1416  
1417 1417          return (KPHYSM_OK);
1418 1418  }
1419 1419  
1420 1420  /*
1421 1421   * This release function can be called at any stage as follows:
1422 1422   *      _gethandle only called
1423 1423   *      _span(s) only called
1424 1424   *      _start called but failed
1425 1425   *      delete thread exited
1426 1426   */
1427 1427  int
1428 1428  kphysm_del_release(memhandle_t handle)
1429 1429  {
1430 1430          struct mem_handle *mhp;
1431 1431  
1432 1432          mhp = kphysm_lookup_mem_handle(handle);
1433 1433          if (mhp == NULL) {
1434 1434                  return (KPHYSM_EHANDLE);
1435 1435          }
1436 1436          switch (mhp->mh_state) {
1437 1437          case MHND_STARTING:
1438 1438          case MHND_RUNNING:
1439 1439                  mutex_exit(&mhp->mh_mutex);
1440 1440                  return (KPHYSM_ENOTFINISHED);
1441 1441          case MHND_FREE:
1442 1442                  ASSERT(mhp->mh_state != MHND_FREE);
1443 1443                  mutex_exit(&mhp->mh_mutex);
1444 1444                  return (KPHYSM_EHANDLE);
1445 1445          case MHND_INIT:
1446 1446                  break;
1447 1447          case MHND_DONE:
1448 1448                  break;
1449 1449          case MHND_RELEASE:
1450 1450                  mutex_exit(&mhp->mh_mutex);
1451 1451                  return (KPHYSM_ESEQUENCE);
1452 1452          default:
1453 1453  #ifdef DEBUG
1454 1454                  cmn_err(CE_WARN, "kphysm_del_release(0x%p) state corrupt %d",
1455 1455                      (void *)mhp, mhp->mh_state);
1456 1456  #endif /* DEBUG */
1457 1457                  mutex_exit(&mhp->mh_mutex);
1458 1458                  return (KPHYSM_EHANDLE);
1459 1459          }
1460 1460          /*
1461 1461           * Set state so that we can wait if necessary.
1462 1462           * Also this means that we have read/write access to all
1463 1463           * fields except mh_exthandle and mh_state.
1464 1464           */
1465 1465          mhp->mh_state = MHND_RELEASE;
1466 1466          /*
1467 1467           * The mem_handle cannot be de-allocated by any other operation
1468 1468           * now, so no need to hold mh_mutex.
1469 1469           */
1470 1470          mutex_exit(&mhp->mh_mutex);
1471 1471  
1472 1472          delspan_remove(&mhp->mh_transit, 0, 0);
1473 1473          mhp->mh_phys_pages = 0;
1474 1474          mhp->mh_vm_pages = 0;
1475 1475          mhp->mh_hold_todo = 0;
1476 1476          mhp->mh_delete_complete = NULL;
1477 1477          mhp->mh_delete_complete_arg = NULL;
1478 1478          mhp->mh_cancel = 0;
1479 1479  
1480 1480          mutex_enter(&mhp->mh_mutex);
1481 1481          ASSERT(mhp->mh_state == MHND_RELEASE);
1482 1482          mhp->mh_state = MHND_FREE;
1483 1483  
1484 1484          kphysm_free_mem_handle(mhp);
1485 1485  
1486 1486          return (KPHYSM_OK);
1487 1487  }
1488 1488  
1489 1489  /*
1490 1490   * This cancel function can only be called with the thread running.
1491 1491   */
1492 1492  int
1493 1493  kphysm_del_cancel(memhandle_t handle)
1494 1494  {
1495 1495          struct mem_handle *mhp;
1496 1496  
1497 1497          mhp = kphysm_lookup_mem_handle(handle);
1498 1498          if (mhp == NULL) {
1499 1499                  return (KPHYSM_EHANDLE);
1500 1500          }
1501 1501          if (mhp->mh_state != MHND_STARTING && mhp->mh_state != MHND_RUNNING) {
1502 1502                  mutex_exit(&mhp->mh_mutex);
1503 1503                  return (KPHYSM_ENOTRUNNING);
1504 1504          }
1505 1505          /*
1506 1506           * Set the cancel flag and wake the delete thread up.
1507 1507           * The thread may be waiting on I/O, so the effect of the cancel
1508 1508           * may be delayed.
1509 1509           */
1510 1510          if (mhp->mh_cancel == 0) {
1511 1511                  mhp->mh_cancel = KPHYSM_ECANCELLED;
1512 1512                  cv_signal(&mhp->mh_cv);
1513 1513          }
1514 1514          mutex_exit(&mhp->mh_mutex);
1515 1515          return (KPHYSM_OK);
1516 1516  }
1517 1517  
1518 1518  int
1519 1519  kphysm_del_status(
1520 1520          memhandle_t handle,
1521 1521          memdelstat_t *mdstp)
1522 1522  {
1523 1523          struct mem_handle *mhp;
1524 1524  
1525 1525          mhp = kphysm_lookup_mem_handle(handle);
1526 1526          if (mhp == NULL) {
1527 1527                  return (KPHYSM_EHANDLE);
1528 1528          }
1529 1529          /*
1530 1530           * Calling kphysm_del_status() is allowed before the delete
1531 1531           * is started to allow for status display.
1532 1532           */
1533 1533          if (mhp->mh_state != MHND_INIT && mhp->mh_state != MHND_STARTING &&
1534 1534              mhp->mh_state != MHND_RUNNING) {
1535 1535                  mutex_exit(&mhp->mh_mutex);
1536 1536                  return (KPHYSM_ENOTRUNNING);
1537 1537          }
1538 1538          mdstp->phys_pages = mhp->mh_phys_pages;
1539 1539          mdstp->managed = mhp->mh_vm_pages;
1540 1540          mdstp->collected = mhp->mh_vm_pages - mhp->mh_hold_todo;
1541 1541          mutex_exit(&mhp->mh_mutex);
1542 1542          return (KPHYSM_OK);
1543 1543  }
1544 1544  
1545 1545  static int mem_delete_additional_pages = 100;
1546 1546  
1547 1547  static int
1548 1548  can_remove_pgs(pgcnt_t npgs)
1549 1549  {
1550 1550          /*
1551 1551           * If all pageable pages were paged out, freemem would
1552 1552           * equal availrmem.  There is a minimum requirement for
1553 1553           * availrmem.
1554 1554           */
1555 1555          if ((availrmem - (tune.t_minarmem + mem_delete_additional_pages))
1556 1556              < npgs)
1557 1557                  return (0);
1558 1558          /* TODO: check swap space, etc. */
1559 1559          return (1);
1560 1560  }
1561 1561  
1562 1562  static int
1563 1563  get_availrmem(pgcnt_t npgs)
1564 1564  {
1565 1565          int ret;
1566 1566  
1567 1567          mutex_enter(&freemem_lock);
1568 1568          ret = can_remove_pgs(npgs);
1569 1569          if (ret != 0)
1570 1570                  availrmem -= npgs;
1571 1571          mutex_exit(&freemem_lock);
1572 1572          return (ret);
1573 1573  }
1574 1574  
1575 1575  static void
1576 1576  put_availrmem(pgcnt_t npgs)
1577 1577  {
1578 1578          mutex_enter(&freemem_lock);
1579 1579          availrmem += npgs;
1580 1580          mutex_exit(&freemem_lock);
1581 1581  }
1582 1582  
1583 1583  #define FREEMEM_INCR    100
1584 1584  static pgcnt_t freemem_incr = FREEMEM_INCR;
1585 1585  #define DEL_FREE_WAIT_FRAC      4
1586 1586  #define DEL_FREE_WAIT_TICKS     ((hz+DEL_FREE_WAIT_FRAC-1)/DEL_FREE_WAIT_FRAC)
1587 1587  
1588 1588  #define DEL_BUSY_WAIT_FRAC      20
1589 1589  #define DEL_BUSY_WAIT_TICKS     ((hz+DEL_BUSY_WAIT_FRAC-1)/DEL_BUSY_WAIT_FRAC)
1590 1590  
1591 1591  static void kphysm_del_cleanup(struct mem_handle *);
1592 1592  
1593 1593  static void page_delete_collect(page_t *, struct mem_handle *);
1594 1594  
1595 1595  static pgcnt_t
1596 1596  delthr_get_freemem(struct mem_handle *mhp)
1597 1597  {
1598 1598          pgcnt_t free_get;
1599 1599          int ret;
1600 1600  
1601 1601          ASSERT(MUTEX_HELD(&mhp->mh_mutex));
1602 1602  
1603 1603          MDSTAT_INCR(mhp, need_free);
1604 1604          /*
1605 1605           * Get up to freemem_incr pages.
1606 1606           */
1607 1607          free_get = freemem_incr;
1608 1608          if (free_get > mhp->mh_hold_todo)
1609 1609                  free_get = mhp->mh_hold_todo;
1610 1610          /*
1611 1611           * Take free_get pages away from freemem,
1612 1612           * waiting if necessary.
1613 1613           */
1614 1614  
1615 1615          while (!mhp->mh_cancel) {
1616 1616                  mutex_exit(&mhp->mh_mutex);
1617 1617                  MDSTAT_INCR(mhp, free_loop);
1618 1618                  /*
1619 1619                   * Duplicate test from page_create_throttle()
1620 1620                   * but don't override with !PG_WAIT.
1621 1621                   */
1622 1622                  if (freemem < (free_get + throttlefree)) {
1623 1623                          MDSTAT_INCR(mhp, free_low);
1624 1624                          ret = 0;
1625 1625                  } else {
1626 1626                          ret = page_create_wait(free_get, 0);
1627 1627                          if (ret == 0) {
1628 1628                                  /* EMPTY */
1629 1629                                  MDSTAT_INCR(mhp, free_failed);
1630 1630                          }
1631 1631                  }
1632 1632                  if (ret != 0) {
1633 1633                          mutex_enter(&mhp->mh_mutex);
1634 1634                          return (free_get);
1635 1635                  }
1636 1636  
1637 1637                  /*
1638 1638                   * Put pressure on pageout.
1639 1639                   */
1640 1640                  page_needfree(free_get);
1641 1641                  cv_signal(&proc_pageout->p_cv);
1642 1642  
1643 1643                  mutex_enter(&mhp->mh_mutex);
1644 1644                  (void) cv_reltimedwait(&mhp->mh_cv, &mhp->mh_mutex,
1645 1645                      DEL_FREE_WAIT_TICKS, TR_CLOCK_TICK);
1646 1646                  mutex_exit(&mhp->mh_mutex);
1647 1647                  page_needfree(-(spgcnt_t)free_get);
1648 1648  
1649 1649                  mutex_enter(&mhp->mh_mutex);
1650 1650          }
1651 1651          return (0);
1652 1652  }
1653 1653  
1654 1654  #define DR_AIO_CLEANUP_DELAY    25000   /* 0.025secs, in usec */
1655 1655  #define DR_AIO_CLEANUP_MAXLOOPS_NODELAY 100
1656 1656  /*
1657 1657   * This function is run as a helper thread for delete_memory_thread.
1658 1658   * It is needed in order to force kaio cleanup, so that pages used in kaio
1659 1659   * will be unlocked and subsequently relocated by delete_memory_thread.
1660 1660   * The address of the delete_memory_threads's mem_handle is passed in to
1661 1661   * this thread function, and is used to set the mh_aio_cleanup_done member
1662 1662   * prior to calling thread_exit().
1663 1663   */
1664 1664  static void
1665 1665  dr_aio_cleanup_thread(caddr_t amhp)
1666 1666  {
1667 1667          proc_t *procp;
1668 1668          int (*aio_cleanup_dr_delete_memory)(proc_t *);
1669 1669          int cleaned;
1670 1670          int n = 0;
1671 1671          struct mem_handle *mhp;
1672 1672          volatile uint_t *pcancel;
1673 1673  
1674 1674          mhp = (struct mem_handle *)amhp;
1675 1675          ASSERT(mhp != NULL);
1676 1676          pcancel = &mhp->mh_dr_aio_cleanup_cancel;
1677 1677          if (modload("sys", "kaio") == -1) {
1678 1678                  mhp->mh_aio_cleanup_done = 1;
1679 1679                  cmn_err(CE_WARN, "dr_aio_cleanup_thread: cannot load kaio");
1680 1680                  thread_exit();
1681 1681          }
1682 1682          aio_cleanup_dr_delete_memory = (int (*)(proc_t *))
1683 1683              modgetsymvalue("aio_cleanup_dr_delete_memory", 0);
1684 1684          if (aio_cleanup_dr_delete_memory == NULL) {
1685 1685                  mhp->mh_aio_cleanup_done = 1;
1686 1686                  cmn_err(CE_WARN,
1687 1687              "aio_cleanup_dr_delete_memory not found in kaio");
1688 1688                  thread_exit();
1689 1689          }
1690 1690          do {
1691 1691                  cleaned = 0;
1692 1692                  mutex_enter(&pidlock);
1693 1693                  for (procp = practive; (*pcancel == 0) && (procp != NULL);
1694 1694                      procp = procp->p_next) {
1695 1695                          mutex_enter(&procp->p_lock);
1696 1696                          if (procp->p_aio != NULL) {
1697 1697                                  /* cleanup proc's outstanding kaio */
1698 1698                                  cleaned +=
1699 1699                                      (*aio_cleanup_dr_delete_memory)(procp);
1700 1700                          }
1701 1701                          mutex_exit(&procp->p_lock);
1702 1702                  }
1703 1703                  mutex_exit(&pidlock);
1704 1704                  if ((*pcancel == 0) &&
1705 1705                      (!cleaned || (++n == DR_AIO_CLEANUP_MAXLOOPS_NODELAY))) {
1706 1706                          /* delay a bit before retrying all procs again */
1707 1707                          delay(drv_usectohz(DR_AIO_CLEANUP_DELAY));
1708 1708                          n = 0;
1709 1709                  }
1710 1710          } while (*pcancel == 0);
1711 1711          mhp->mh_aio_cleanup_done = 1;
1712 1712          thread_exit();
1713 1713  }
1714 1714  
1715 1715  static void
1716 1716  delete_memory_thread(caddr_t amhp)
1717 1717  {
1718 1718          struct mem_handle *mhp;
1719 1719          struct memdelspan *mdsp;
1720 1720          callb_cpr_t cprinfo;
1721 1721          page_t *pp_targ;
1722 1722          spgcnt_t freemem_left;
1723 1723          void (*del_complete_funcp)(void *, int error);
1724 1724          void *del_complete_arg;
1725 1725          int comp_code;
1726 1726          int ret;
1727 1727          int first_scan;
1728 1728          uint_t szc;
1729 1729  #ifdef MEM_DEL_STATS
1730 1730          uint64_t start_total, ntick_total;
1731 1731          uint64_t start_pgrp, ntick_pgrp;
1732 1732  #endif /* MEM_DEL_STATS */
1733 1733  
1734 1734          mhp = (struct mem_handle *)amhp;
1735 1735  
1736 1736  #ifdef MEM_DEL_STATS
1737 1737          start_total = ddi_get_lbolt();
1738 1738  #endif /* MEM_DEL_STATS */
1739 1739  
1740 1740          CALLB_CPR_INIT(&cprinfo, &mhp->mh_mutex,
1741 1741              callb_generic_cpr, "memdel");
1742 1742  
1743 1743          mutex_enter(&mhp->mh_mutex);
1744 1744          ASSERT(mhp->mh_state == MHND_STARTING);
1745 1745  
1746 1746          mhp->mh_state = MHND_RUNNING;
1747 1747          mhp->mh_thread_id = curthread;
1748 1748  
1749 1749          mhp->mh_hold_todo = mhp->mh_vm_pages;
1750 1750          mutex_exit(&mhp->mh_mutex);
1751 1751  
1752 1752          /* Allocate the remap pages now, if necessary. */
1753 1753          memseg_remap_init();
1754 1754  
1755 1755          /*
1756 1756           * Subtract from availrmem now if possible as availrmem
1757 1757           * may not be available by the end of the delete.
1758 1758           */
1759 1759          if (!get_availrmem(mhp->mh_vm_pages)) {
1760 1760                  comp_code = KPHYSM_ENOTVIABLE;
1761 1761                  mutex_enter(&mhp->mh_mutex);
1762 1762                  goto early_exit;
1763 1763          }
1764 1764  
1765 1765          ret = kphysm_setup_pre_del(mhp->mh_vm_pages);
1766 1766  
1767 1767          mutex_enter(&mhp->mh_mutex);
1768 1768  
1769 1769          if (ret != 0) {
1770 1770                  mhp->mh_cancel = KPHYSM_EREFUSED;
1771 1771                  goto refused;
1772 1772          }
1773 1773  
1774 1774          transit_list_collect(mhp, 1);
1775 1775  
1776 1776          for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
1777 1777              mdsp = mdsp->mds_next) {
1778 1778                  ASSERT(mdsp->mds_bitmap == NULL);
1779 1779                  mdsp->mds_bitmap = kmem_zalloc(MDS_BITMAPBYTES(mdsp), KM_SLEEP);
1780 1780                  mdsp->mds_bitmap_retired = kmem_zalloc(MDS_BITMAPBYTES(mdsp),
1781 1781                      KM_SLEEP);
1782 1782          }
1783 1783  
1784 1784          first_scan = 1;
1785 1785          freemem_left = 0;
1786 1786          /*
1787 1787           * Start dr_aio_cleanup_thread, which periodically iterates
1788 1788           * through the process list and invokes aio cleanup.  This
1789 1789           * is needed in order to avoid a deadly embrace between the
1790 1790           * delete_memory_thread (waiting on writer lock for page, with the
1791 1791           * exclusive-wanted bit set), kaio read request threads (waiting for a
1792 1792           * reader lock on the same page that is wanted by the
1793 1793           * delete_memory_thread), and threads waiting for kaio completion
1794 1794           * (blocked on spt_amp->lock).
1795 1795           */
1796 1796          mhp->mh_dr_aio_cleanup_cancel = 0;
1797 1797          mhp->mh_aio_cleanup_done = 0;
1798 1798          (void) thread_create(NULL, 0, dr_aio_cleanup_thread,
1799 1799              (caddr_t)mhp, 0, &p0, TS_RUN, maxclsyspri - 1);
1800 1800          while ((mhp->mh_hold_todo != 0) && (mhp->mh_cancel == 0)) {
1801 1801                  pgcnt_t collected;
1802 1802  
1803 1803                  MDSTAT_INCR(mhp, nloop);
1804 1804                  collected = 0;
1805 1805                  for (mdsp = mhp->mh_transit.trl_spans; (mdsp != NULL) &&
1806 1806                      (mhp->mh_cancel == 0); mdsp = mdsp->mds_next) {
1807 1807                          pfn_t pfn, p_end;
1808 1808  
1809 1809                          p_end = mdsp->mds_base + mdsp->mds_npgs;
1810 1810                          for (pfn = mdsp->mds_base; (pfn < p_end) &&
1811 1811                              (mhp->mh_cancel == 0); pfn++) {
1812 1812                                  page_t *pp, *tpp, *tpp_targ;
1813 1813                                  pgcnt_t bit;
1814 1814                                  struct vnode *vp;
1815 1815                                  u_offset_t offset;
1816 1816                                  int mod, result;
1817 1817                                  spgcnt_t pgcnt;
1818 1818  
1819 1819                                  bit = pfn - mdsp->mds_base;
1820 1820                                  if ((mdsp->mds_bitmap[bit / NBPBMW] &
1821 1821                                      (1 << (bit % NBPBMW))) != 0) {
1822 1822                                          MDSTAT_INCR(mhp, already_done);
1823 1823                                          continue;
1824 1824                                  }
1825 1825                                  if (freemem_left == 0) {
1826 1826                                          freemem_left += delthr_get_freemem(mhp);
1827 1827                                          if (freemem_left == 0)
1828 1828                                                  break;
1829 1829                                  }
1830 1830  
1831 1831                                  /*
1832 1832                                   * Release mh_mutex - some of this
1833 1833                                   * stuff takes some time (eg PUTPAGE).
1834 1834                                   */
1835 1835  
1836 1836                                  mutex_exit(&mhp->mh_mutex);
1837 1837                                  MDSTAT_INCR(mhp, ncheck);
1838 1838  
1839 1839                                  pp = page_numtopp_nolock(pfn);
1840 1840                                  if (pp == NULL) {
1841 1841                                          /*
1842 1842                                           * Not covered by a page_t - will
1843 1843                                           * be dealt with elsewhere.
1844 1844                                           */
1845 1845                                          MDSTAT_INCR(mhp, nopaget);
1846 1846                                          mutex_enter(&mhp->mh_mutex);
1847 1847                                          mdsp->mds_bitmap[bit / NBPBMW] |=
1848 1848                                              (1 << (bit % NBPBMW));
1849 1849                                          continue;
1850 1850                                  }
1851 1851  
1852 1852                                  if (!page_try_reclaim_lock(pp, SE_EXCL,
1853 1853                                      SE_EXCL_WANTED | SE_RETIRED)) {
1854 1854                                          /*
1855 1855                                           * Page in use elsewhere.  Skip it.
1856 1856                                           */
1857 1857                                          MDSTAT_INCR(mhp, lockfail);
1858 1858                                          mutex_enter(&mhp->mh_mutex);
1859 1859                                          continue;
1860 1860                                  }
1861 1861                                  /*
1862 1862                                   * See if the cage expanded into the delete.
1863 1863                                   * This can happen as we have to allow the
1864 1864                                   * cage to expand.
1865 1865                                   */
1866 1866                                  if (PP_ISNORELOC(pp)) {
1867 1867                                          page_unlock(pp);
1868 1868                                          mutex_enter(&mhp->mh_mutex);
1869 1869                                          mhp->mh_cancel = KPHYSM_ENONRELOC;
1870 1870                                          break;
1871 1871                                  }
1872 1872                                  if (PP_RETIRED(pp)) {
1873 1873                                          /*
1874 1874                                           * Page has been retired and is
1875 1875                                           * not part of the cage so we
1876 1876                                           * can now do the accounting for
1877 1877                                           * it.
1878 1878                                           */
1879 1879                                          MDSTAT_INCR(mhp, retired);
1880 1880                                          mutex_enter(&mhp->mh_mutex);
1881 1881                                          mdsp->mds_bitmap[bit / NBPBMW]
1882 1882                                              |= (1 << (bit % NBPBMW));
1883 1883                                          mdsp->mds_bitmap_retired[bit /
1884 1884                                              NBPBMW] |=
1885 1885                                              (1 << (bit % NBPBMW));
1886 1886                                          mhp->mh_hold_todo--;
1887 1887                                          continue;
1888 1888                                  }
1889 1889                                  ASSERT(freemem_left != 0);
1890 1890                                  if (PP_ISFREE(pp)) {
1891 1891                                          /*
1892 1892                                           * Like page_reclaim() only 'freemem'
1893 1893                                           * processing is already done.
1894 1894                                           */
1895 1895                                          MDSTAT_INCR(mhp, nfree);
1896 1896                                  free_page_collect:
1897 1897                                          if (PP_ISAGED(pp)) {
1898 1898                                                  page_list_sub(pp,
1899 1899                                                      PG_FREE_LIST);
1900 1900                                          } else {
1901 1901                                                  page_list_sub(pp,
1902 1902                                                      PG_CACHE_LIST);
1903 1903                                          }
1904 1904                                          PP_CLRFREE(pp);
1905 1905                                          PP_CLRAGED(pp);
1906 1906                                          collected++;
1907 1907                                          mutex_enter(&mhp->mh_mutex);
1908 1908                                          page_delete_collect(pp, mhp);
1909 1909                                          mdsp->mds_bitmap[bit / NBPBMW] |=
1910 1910                                              (1 << (bit % NBPBMW));
1911 1911                                          freemem_left--;
1912 1912                                          continue;
1913 1913                                  }
1914 1914                                  ASSERT(pp->p_vnode != NULL);
1915 1915                                  if (first_scan) {
1916 1916                                          MDSTAT_INCR(mhp, first_notfree);
1917 1917                                          page_unlock(pp);
1918 1918                                          mutex_enter(&mhp->mh_mutex);
1919 1919                                          continue;
1920 1920                                  }
1921 1921                                  /*
1922 1922                                   * Keep stats on pages encountered that
1923 1923                                   * are marked for retirement.
1924 1924                                   */
1925 1925                                  if (PP_TOXIC(pp)) {
1926 1926                                          MDSTAT_INCR(mhp, toxic);
1927 1927                                  } else if (PP_PR_REQ(pp)) {
1928 1928                                          MDSTAT_INCR(mhp, failing);
1929 1929                                  }
1930 1930                                  /*
1931 1931                                   * In certain cases below, special exceptions
1932 1932                                   * are made for pages that are toxic.  This
1933 1933                                   * is because the current meaning of toxic
1934 1934                                   * is that an uncorrectable error has been
1935 1935                                   * previously associated with the page.
1936 1936                                   */
1937 1937                                  if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
1938 1938                                          if (!PP_TOXIC(pp)) {
1939 1939                                                  /*
1940 1940                                                   * Must relocate locked in
1941 1941                                                   * memory pages.
1942 1942                                                   */
1943 1943  #ifdef MEM_DEL_STATS
1944 1944                                                  start_pgrp = ddi_get_lbolt();
1945 1945  #endif /* MEM_DEL_STATS */
1946 1946                                                  /*
1947 1947                                                   * Lock all constituent pages
1948 1948                                                   * of a large page to ensure
1949 1949                                                   * that p_szc won't change.
1950 1950                                                   */
1951 1951                                                  if (!group_page_trylock(pp,
1952 1952                                                      SE_EXCL)) {
1953 1953                                                          MDSTAT_INCR(mhp,
1954 1954                                                              gptllckfail);
1955 1955                                                          page_unlock(pp);
1956 1956                                                          mutex_enter(
1957 1957                                                              &mhp->mh_mutex);
1958 1958                                                          continue;
1959 1959                                                  }
1960 1960                                                  MDSTAT_INCR(mhp, npplocked);
1961 1961                                                  pp_targ =
1962 1962                                                      page_get_replacement_page(
1963 1963                                                      pp, NULL, 0);
1964 1964                                                  if (pp_targ != NULL) {
1965 1965  #ifdef MEM_DEL_STATS
1966 1966                                                          ntick_pgrp =
1967 1967                                                              (uint64_t)
1968 1968                                                              ddi_get_lbolt() -
1969 1969                                                              start_pgrp;
1970 1970  #endif /* MEM_DEL_STATS */
1971 1971                                                          MDSTAT_PGRP(mhp,
1972 1972                                                              ntick_pgrp);
1973 1973                                                          MDSTAT_INCR(mhp,
1974 1974                                                              nlockreloc);
1975 1975                                                          goto reloc;
1976 1976                                                  }
1977 1977                                                  group_page_unlock(pp);
1978 1978                                                  page_unlock(pp);
1979 1979  #ifdef MEM_DEL_STATS
1980 1980                                                  ntick_pgrp =
1981 1981                                                      (uint64_t)ddi_get_lbolt() -
1982 1982                                                      start_pgrp;
1983 1983  #endif /* MEM_DEL_STATS */
1984 1984                                                  MDSTAT_PGRP(mhp, ntick_pgrp);
1985 1985                                                  MDSTAT_INCR(mhp, nnorepl);
1986 1986                                                  mutex_enter(&mhp->mh_mutex);
1987 1987                                                  continue;
1988 1988                                          } else {
1989 1989                                                  /*
1990 1990                                                   * Cannot do anything about
1991 1991                                                   * this page because it is
1992 1992                                                   * toxic.
1993 1993                                                   */
1994 1994                                                  MDSTAT_INCR(mhp, npplkdtoxic);
1995 1995                                                  page_unlock(pp);
1996 1996                                                  mutex_enter(&mhp->mh_mutex);
1997 1997                                                  continue;
1998 1998                                          }
1999 1999                                  }
2000 2000                                  /*
2001 2001                                   * Unload the mappings and check if mod bit
2002 2002                                   * is set.
2003 2003                                   */
2004 2004                                  ASSERT(!PP_ISKAS(pp));
2005 2005                                  (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
2006 2006                                  mod = hat_ismod(pp);
2007 2007  
2008 2008  #ifdef MEM_DEL_STATS
2009 2009                                  start_pgrp = ddi_get_lbolt();
2010 2010  #endif /* MEM_DEL_STATS */
2011 2011                                  if (mod && !PP_TOXIC(pp)) {
2012 2012                                          /*
2013 2013                                           * Lock all constituent pages
2014 2014                                           * of a large page to ensure
2015 2015                                           * that p_szc won't change.
2016 2016                                           */
2017 2017                                          if (!group_page_trylock(pp, SE_EXCL)) {
2018 2018                                                  MDSTAT_INCR(mhp, gptlmodfail);
2019 2019                                                  page_unlock(pp);
2020 2020                                                  mutex_enter(&mhp->mh_mutex);
2021 2021                                                  continue;
2022 2022                                          }
2023 2023                                          pp_targ = page_get_replacement_page(pp,
2024 2024                                              NULL, 0);
2025 2025                                          if (pp_targ != NULL) {
2026 2026                                                  MDSTAT_INCR(mhp, nmodreloc);
2027 2027  #ifdef MEM_DEL_STATS
2028 2028                                                  ntick_pgrp =
2029 2029                                                      (uint64_t)ddi_get_lbolt() -
2030 2030                                                      start_pgrp;
2031 2031  #endif /* MEM_DEL_STATS */
2032 2032                                                  MDSTAT_PGRP(mhp, ntick_pgrp);
2033 2033                                                  goto reloc;
2034 2034                                          }
2035 2035                                          group_page_unlock(pp);
2036 2036                                  }
2037 2037  
2038 2038                                  if (!page_try_demote_pages(pp)) {
2039 2039                                          MDSTAT_INCR(mhp, demotefail);
2040 2040                                          page_unlock(pp);
2041 2041  #ifdef MEM_DEL_STATS
2042 2042                                          ntick_pgrp = (uint64_t)ddi_get_lbolt() -
2043 2043                                              start_pgrp;
2044 2044  #endif /* MEM_DEL_STATS */
2045 2045                                          MDSTAT_PGRP(mhp, ntick_pgrp);
2046 2046                                          mutex_enter(&mhp->mh_mutex);
2047 2047                                          continue;
2048 2048                                  }
2049 2049  
2050 2050                                  /*
2051 2051                                   * Regular 'page-out'.
2052 2052                                   */
2053 2053                                  if (!mod) {
2054 2054                                          MDSTAT_INCR(mhp, ndestroy);
2055 2055                                          page_destroy(pp, 1);
2056 2056                                          /*
2057 2057                                           * page_destroy was called with
2058 2058                                           * dontfree. As long as p_lckcnt
2059 2059                                           * and p_cowcnt are both zero, the
2060 2060                                           * only additional action of
2061 2061                                           * page_destroy with !dontfree is to
2062 2062                                           * call page_free, so we can collect
2063 2063                                           * the page here.
2064 2064                                           */
2065 2065                                          collected++;
2066 2066  #ifdef MEM_DEL_STATS
2067 2067                                          ntick_pgrp = (uint64_t)ddi_get_lbolt() -
2068 2068                                              start_pgrp;
2069 2069  #endif /* MEM_DEL_STATS */
2070 2070                                          MDSTAT_PGRP(mhp, ntick_pgrp);
2071 2071                                          mutex_enter(&mhp->mh_mutex);
2072 2072                                          page_delete_collect(pp, mhp);
2073 2073                                          mdsp->mds_bitmap[bit / NBPBMW] |=
2074 2074                                              (1 << (bit % NBPBMW));
2075 2075                                          continue;
2076 2076                                  }
2077 2077                                  /*
2078 2078                                   * The page is toxic and the mod bit is
2079 2079                                   * set, we cannot do anything here to deal
2080 2080                                   * with it.
2081 2081                                   */
2082 2082                                  if (PP_TOXIC(pp)) {
2083 2083                                          page_unlock(pp);
2084 2084  #ifdef MEM_DEL_STATS
2085 2085                                          ntick_pgrp = (uint64_t)ddi_get_lbolt() -
2086 2086                                              start_pgrp;
2087 2087  #endif /* MEM_DEL_STATS */
2088 2088                                          MDSTAT_PGRP(mhp, ntick_pgrp);
2089 2089                                          MDSTAT_INCR(mhp, modtoxic);
2090 2090                                          mutex_enter(&mhp->mh_mutex);
2091 2091                                          continue;
2092 2092                                  }
2093 2093                                  MDSTAT_INCR(mhp, nputpage);
2094 2094                                  vp = pp->p_vnode;
2095 2095                                  offset = pp->p_offset;
2096 2096                                  VN_HOLD(vp);
2097 2097                                  page_unlock(pp);
2098 2098                                  (void) VOP_PUTPAGE(vp, offset, PAGESIZE,
2099 2099                                      B_INVAL|B_FORCE, kcred, NULL);
2100 2100                                  VN_RELE(vp);
2101 2101  #ifdef MEM_DEL_STATS
2102 2102                                  ntick_pgrp = (uint64_t)ddi_get_lbolt() -
2103 2103                                      start_pgrp;
2104 2104  #endif /* MEM_DEL_STATS */
2105 2105                                  MDSTAT_PGRP(mhp, ntick_pgrp);
2106 2106                                  /*
2107 2107                                   * Try to get the page back immediately
2108 2108                                   * so that it can be collected.
2109 2109                                   */
2110 2110                                  pp = page_numtopp_nolock(pfn);
2111 2111                                  if (pp == NULL) {
2112 2112                                          MDSTAT_INCR(mhp, nnoreclaim);
2113 2113                                          /*
2114 2114                                           * This should not happen as this
2115 2115                                           * thread is deleting the page.
2116 2116                                           * If this code is generalized, this
2117 2117                                           * becomes a reality.
2118 2118                                           */
2119 2119  #ifdef DEBUG
2120 2120                                          cmn_err(CE_WARN,
2121 2121                                              "delete_memory_thread(0x%p) "
2122 2122                                              "pfn 0x%lx has no page_t",
2123 2123                                              (void *)mhp, pfn);
2124 2124  #endif /* DEBUG */
2125 2125                                          mutex_enter(&mhp->mh_mutex);
2126 2126                                          continue;
2127 2127                                  }
2128 2128                                  if (page_try_reclaim_lock(pp, SE_EXCL,
2129 2129                                      SE_EXCL_WANTED | SE_RETIRED)) {
2130 2130                                          if (PP_ISFREE(pp)) {
2131 2131                                                  goto free_page_collect;
2132 2132                                          }
2133 2133                                          page_unlock(pp);
2134 2134                                  }
2135 2135                                  MDSTAT_INCR(mhp, nnoreclaim);
2136 2136                                  mutex_enter(&mhp->mh_mutex);
2137 2137                                  continue;
2138 2138  
2139 2139                          reloc:
2140 2140                                  /*
2141 2141                                   * Got some freemem and a target
2142 2142                                   * page, so move the data to avoid
2143 2143                                   * I/O and lock problems.
2144 2144                                   */
2145 2145                                  ASSERT(!page_iolock_assert(pp));
2146 2146                                  MDSTAT_INCR(mhp, nreloc);
2147 2147                                  /*
2148 2148                                   * page_relocate() will return pgcnt: the
2149 2149                                   * number of consecutive pages relocated.
2150 2150                                   * If it is successful, pp will be a
2151 2151                                   * linked list of the page structs that
2152 2152                                   * were relocated. If page_relocate() is
2153 2153                                   * unsuccessful, pp will be unmodified.
2154 2154                                   */
2155 2155  #ifdef MEM_DEL_STATS
2156 2156                                  start_pgrp = ddi_get_lbolt();
2157 2157  #endif /* MEM_DEL_STATS */
2158 2158                                  result = page_relocate(&pp, &pp_targ, 0, 0,
2159 2159                                      &pgcnt, NULL);
2160 2160  #ifdef MEM_DEL_STATS
2161 2161                                  ntick_pgrp = (uint64_t)ddi_get_lbolt() -
2162 2162                                      start_pgrp;
2163 2163  #endif /* MEM_DEL_STATS */
2164 2164                                  MDSTAT_PGRP(mhp, ntick_pgrp);
2165 2165                                  if (result != 0) {
2166 2166                                          MDSTAT_INCR(mhp, nrelocfail);
2167 2167                                          /*
2168 2168                                           * We did not succeed. We need
2169 2169                                           * to give the pp_targ pages back.
2170 2170                                           * page_free(pp_targ, 1) without
2171 2171                                           * the freemem accounting.
2172 2172                                           */
2173 2173                                          group_page_unlock(pp);
2174 2174                                          page_free_replacement_page(pp_targ);
2175 2175                                          page_unlock(pp);
2176 2176                                          mutex_enter(&mhp->mh_mutex);
2177 2177                                          continue;
2178 2178                                  }
2179 2179  
2180 2180                                  /*
2181 2181                                   * We will then collect pgcnt pages.
2182 2182                                   */
2183 2183                                  ASSERT(pgcnt > 0);
2184 2184                                  mutex_enter(&mhp->mh_mutex);
2185 2185                                  /*
2186 2186                                   * We need to make sure freemem_left is
2187 2187                                   * large enough.
2188 2188                                   */
2189 2189                                  while ((freemem_left < pgcnt) &&
2190 2190                                      (!mhp->mh_cancel)) {
2191 2191                                          freemem_left +=
2192 2192                                              delthr_get_freemem(mhp);
2193 2193                                  }
2194 2194  
2195 2195                                  /*
2196 2196                                   * Do not proceed if mh_cancel is set.
2197 2197                                   */
2198 2198                                  if (mhp->mh_cancel) {
2199 2199                                          while (pp_targ != NULL) {
2200 2200                                                  /*
2201 2201                                                   * Unlink and unlock each page.
2202 2202                                                   */
2203 2203                                                  tpp_targ = pp_targ;
2204 2204                                                  page_sub(&pp_targ, tpp_targ);
2205 2205                                                  page_unlock(tpp_targ);
2206 2206                                          }
2207 2207                                          /*
2208 2208                                           * We need to give the pp pages back.
2209 2209                                           * page_free(pp, 1) without the
2210 2210                                           * freemem accounting.
2211 2211                                           */
2212 2212                                          page_free_replacement_page(pp);
2213 2213                                          break;
2214 2214                                  }
2215 2215  
2216 2216                                  /* Now remove pgcnt from freemem_left */
2217 2217                                  freemem_left -= pgcnt;
2218 2218                                  ASSERT(freemem_left >= 0);
2219 2219                                  szc = pp->p_szc;
2220 2220                                  while (pp != NULL) {
2221 2221                                          /*
2222 2222                                           * pp and pp_targ were passed back as
2223 2223                                           * a linked list of pages.
2224 2224                                           * Unlink and unlock each page.
2225 2225                                           */
2226 2226                                          tpp_targ = pp_targ;
2227 2227                                          page_sub(&pp_targ, tpp_targ);
2228 2228                                          page_unlock(tpp_targ);
2229 2229                                          /*
2230 2230                                           * The original page is now free
2231 2231                                           * so remove it from the linked
2232 2232                                           * list and collect it.
2233 2233                                           */
2234 2234                                          tpp = pp;
2235 2235                                          page_sub(&pp, tpp);
2236 2236                                          pfn = page_pptonum(tpp);
2237 2237                                          collected++;
2238 2238                                          ASSERT(PAGE_EXCL(tpp));
2239 2239                                          ASSERT(tpp->p_vnode == NULL);
2240 2240                                          ASSERT(!hat_page_is_mapped(tpp));
2241 2241                                          ASSERT(tpp->p_szc == szc);
2242 2242                                          tpp->p_szc = 0;
2243 2243                                          page_delete_collect(tpp, mhp);
2244 2244                                          bit = pfn - mdsp->mds_base;
2245 2245                                          mdsp->mds_bitmap[bit / NBPBMW] |=
2246 2246                                              (1 << (bit % NBPBMW));
2247 2247                                  }
2248 2248                                  ASSERT(pp_targ == NULL);
2249 2249                          }
2250 2250                  }
2251 2251                  first_scan = 0;
2252 2252                  if ((mhp->mh_cancel == 0) && (mhp->mh_hold_todo != 0) &&
2253 2253                      (collected == 0)) {
2254 2254                          /*
2255 2255                           * This code is needed as we cannot wait
2256 2256                           * for a page to be locked OR the delete to
2257 2257                           * be cancelled.  Also, we must delay so
2258 2258                           * that other threads get a chance to run
2259 2259                           * on our cpu, otherwise page locks may be
2260 2260                           * held indefinitely by those threads.
2261 2261                           */
2262 2262                          MDSTAT_INCR(mhp, ndelay);
2263 2263                          CALLB_CPR_SAFE_BEGIN(&cprinfo);
2264 2264                          (void) cv_reltimedwait(&mhp->mh_cv, &mhp->mh_mutex,
2265 2265                              DEL_BUSY_WAIT_TICKS, TR_CLOCK_TICK);
2266 2266                          CALLB_CPR_SAFE_END(&cprinfo, &mhp->mh_mutex);
2267 2267                  }
2268 2268          }
2269 2269          /* stop the dr aio cleanup thread */
2270 2270          mhp->mh_dr_aio_cleanup_cancel = 1;
2271 2271          transit_list_collect(mhp, 0);
2272 2272          if (freemem_left != 0) {
2273 2273                  /* Return any surplus. */
2274 2274                  page_create_putback(freemem_left);
2275 2275                  freemem_left = 0;
2276 2276          }
2277 2277  #ifdef MEM_DEL_STATS
2278 2278          ntick_total = (uint64_t)ddi_get_lbolt() - start_total;
2279 2279  #endif /* MEM_DEL_STATS */
2280 2280          MDSTAT_TOTAL(mhp, ntick_total);
2281 2281          MDSTAT_PRINT(mhp);
2282 2282  
2283 2283          /*
2284 2284           * If the memory delete was cancelled, exclusive-wanted bits must
2285 2285           * be cleared. If there are retired pages being deleted, they need
2286 2286           * to be unretired.
2287 2287           */
2288 2288          for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
2289 2289              mdsp = mdsp->mds_next) {
2290 2290                  pfn_t pfn, p_end;
2291 2291  
2292 2292                  p_end = mdsp->mds_base + mdsp->mds_npgs;
2293 2293                  for (pfn = mdsp->mds_base; pfn < p_end; pfn++) {
2294 2294                          page_t *pp;
2295 2295                          pgcnt_t bit;
2296 2296  
2297 2297                          bit = pfn - mdsp->mds_base;
2298 2298                          if (mhp->mh_cancel) {
2299 2299                                  pp = page_numtopp_nolock(pfn);
2300 2300                                  if (pp != NULL) {
2301 2301                                          if ((mdsp->mds_bitmap[bit / NBPBMW] &
2302 2302                                              (1 << (bit % NBPBMW))) == 0) {
2303 2303                                                  page_lock_clr_exclwanted(pp);
2304 2304                                          }
2305 2305                                  }
2306 2306                          } else {
2307 2307                                  pp = NULL;
2308 2308                          }
2309 2309                          if ((mdsp->mds_bitmap_retired[bit / NBPBMW] &
2310 2310                              (1 << (bit % NBPBMW))) != 0) {
2311 2311                                  /* do we already have pp? */
2312 2312                                  if (pp == NULL) {
2313 2313                                          pp = page_numtopp_nolock(pfn);
2314 2314                                  }
2315 2315                                  ASSERT(pp != NULL);
2316 2316                                  ASSERT(PP_RETIRED(pp));
2317 2317                                  if (mhp->mh_cancel != 0) {
2318 2318                                          page_unlock(pp);
2319 2319                                          /*
2320 2320                                           * To satisfy ASSERT below in
2321 2321                                           * cancel code.
2322 2322                                           */
2323 2323                                          mhp->mh_hold_todo++;
2324 2324                                  } else {
2325 2325                                          (void) page_unretire_pp(pp,
2326 2326                                              PR_UNR_CLEAN);
2327 2327                                  }
2328 2328                          }
2329 2329                  }
2330 2330          }
2331 2331          /*
2332 2332           * Free retired page bitmap and collected page bitmap
2333 2333           */
2334 2334          for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
2335 2335              mdsp = mdsp->mds_next) {
2336 2336                  ASSERT(mdsp->mds_bitmap_retired != NULL);
2337 2337                  kmem_free(mdsp->mds_bitmap_retired, MDS_BITMAPBYTES(mdsp));
2338 2338                  mdsp->mds_bitmap_retired = NULL;        /* Paranoia. */
2339 2339                  ASSERT(mdsp->mds_bitmap != NULL);
2340 2340                  kmem_free(mdsp->mds_bitmap, MDS_BITMAPBYTES(mdsp));
2341 2341                  mdsp->mds_bitmap = NULL;        /* Paranoia. */
2342 2342          }
2343 2343  
2344 2344          /* wait for our dr aio cancel thread to exit */
2345 2345          while (!(mhp->mh_aio_cleanup_done)) {
2346 2346                  CALLB_CPR_SAFE_BEGIN(&cprinfo);
2347 2347                  delay(drv_usectohz(DR_AIO_CLEANUP_DELAY));
2348 2348                  CALLB_CPR_SAFE_END(&cprinfo, &mhp->mh_mutex);
2349 2349          }
2350 2350  refused:
2351 2351          if (mhp->mh_cancel != 0) {
2352 2352                  page_t *pp;
2353 2353  
2354 2354                  comp_code = mhp->mh_cancel;
2355 2355                  /*
2356 2356                   * Go through list of deleted pages (mh_deleted) freeing
2357 2357                   * them.
2358 2358                   */
2359 2359                  while ((pp = mhp->mh_deleted) != NULL) {
2360 2360                          mhp->mh_deleted = pp->p_next;
2361 2361                          mhp->mh_hold_todo++;
2362 2362                          mutex_exit(&mhp->mh_mutex);
2363 2363                          /* Restore p_next. */
2364 2364                          pp->p_next = pp->p_prev;
2365 2365                          if (PP_ISFREE(pp)) {
2366 2366                                  cmn_err(CE_PANIC,
2367 2367                                      "page %p is free",
2368 2368                                      (void *)pp);
2369 2369                          }
2370 2370                          page_free(pp, 1);
2371 2371                          mutex_enter(&mhp->mh_mutex);
2372 2372                  }
2373 2373                  ASSERT(mhp->mh_hold_todo == mhp->mh_vm_pages);
2374 2374  
2375 2375                  mutex_exit(&mhp->mh_mutex);
2376 2376                  put_availrmem(mhp->mh_vm_pages);
2377 2377                  mutex_enter(&mhp->mh_mutex);
2378 2378  
2379 2379                  goto t_exit;
2380 2380          }
2381 2381  
2382 2382          /*
2383 2383           * All the pages are no longer in use and are exclusively locked.
2384 2384           */
2385 2385  
2386 2386          mhp->mh_deleted = NULL;
2387 2387  
2388 2388          kphysm_del_cleanup(mhp);
2389 2389  
2390 2390          /*
2391 2391           * mem_node_del_range needs to be after kphysm_del_cleanup so
2392 2392           * that the mem_node_config[] will remain intact for the cleanup.
2393 2393           */
2394 2394          for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
2395 2395              mdsp = mdsp->mds_next) {
2396 2396                  mem_node_del_range(mdsp->mds_base,
2397 2397                      mdsp->mds_base + mdsp->mds_npgs - 1);
2398 2398          }
2399 2399          /* cleanup the page counters */
2400 2400          page_ctrs_cleanup();
2401 2401  
2402 2402          comp_code = KPHYSM_OK;
2403 2403  
2404 2404  t_exit:
2405 2405          mutex_exit(&mhp->mh_mutex);
2406 2406          kphysm_setup_post_del(mhp->mh_vm_pages,
2407 2407              (comp_code == KPHYSM_OK) ? 0 : 1);
2408 2408          mutex_enter(&mhp->mh_mutex);
2409 2409  
2410 2410  early_exit:
2411 2411          /* mhp->mh_mutex exited by CALLB_CPR_EXIT() */
2412 2412          mhp->mh_state = MHND_DONE;
2413 2413          del_complete_funcp = mhp->mh_delete_complete;
2414 2414          del_complete_arg = mhp->mh_delete_complete_arg;
2415 2415          CALLB_CPR_EXIT(&cprinfo);
2416 2416          (*del_complete_funcp)(del_complete_arg, comp_code);
2417 2417          thread_exit();
2418 2418          /*NOTREACHED*/
2419 2419  }
2420 2420  
2421 2421  /*
2422 2422   * Start the delete of the memory from the system.
2423 2423   */
2424 2424  int
2425 2425  kphysm_del_start(
2426 2426          memhandle_t handle,
2427 2427          void (*complete)(void *, int),
2428 2428          void *complete_arg)
2429 2429  {
2430 2430          struct mem_handle *mhp;
2431 2431  
2432 2432          mhp = kphysm_lookup_mem_handle(handle);
2433 2433          if (mhp == NULL) {
2434 2434                  return (KPHYSM_EHANDLE);
2435 2435          }
2436 2436          switch (mhp->mh_state) {
2437 2437          case MHND_FREE:
2438 2438                  ASSERT(mhp->mh_state != MHND_FREE);
2439 2439                  mutex_exit(&mhp->mh_mutex);
2440 2440                  return (KPHYSM_EHANDLE);
2441 2441          case MHND_INIT:
2442 2442                  break;
2443 2443          case MHND_STARTING:
2444 2444          case MHND_RUNNING:
2445 2445                  mutex_exit(&mhp->mh_mutex);
2446 2446                  return (KPHYSM_ESEQUENCE);
2447 2447          case MHND_DONE:
2448 2448                  mutex_exit(&mhp->mh_mutex);
2449 2449                  return (KPHYSM_ESEQUENCE);
2450 2450          case MHND_RELEASE:
2451 2451                  mutex_exit(&mhp->mh_mutex);
2452 2452                  return (KPHYSM_ESEQUENCE);
2453 2453          default:
2454 2454  #ifdef DEBUG
2455 2455                  cmn_err(CE_WARN, "kphysm_del_start(0x%p) state corrupt %d",
2456 2456                      (void *)mhp, mhp->mh_state);
2457 2457  #endif /* DEBUG */
2458 2458                  mutex_exit(&mhp->mh_mutex);
2459 2459                  return (KPHYSM_EHANDLE);
2460 2460          }
2461 2461  
2462 2462          if (mhp->mh_transit.trl_spans == NULL) {
2463 2463                  mutex_exit(&mhp->mh_mutex);
2464 2464                  return (KPHYSM_ENOWORK);
2465 2465          }
2466 2466  
2467 2467          ASSERT(complete != NULL);
2468 2468          mhp->mh_delete_complete = complete;
2469 2469          mhp->mh_delete_complete_arg = complete_arg;
2470 2470          mhp->mh_state = MHND_STARTING;
2471 2471          /*
2472 2472           * Release the mutex in case thread_create sleeps.
2473 2473           */
2474 2474          mutex_exit(&mhp->mh_mutex);
2475 2475  
2476 2476          /*
2477 2477           * The "obvious" process for this thread is pageout (proc_pageout)
2478 2478           * but this gives the thread too much power over freemem
2479 2479           * which results in freemem starvation.
2480 2480           */
2481 2481          (void) thread_create(NULL, 0, delete_memory_thread, mhp, 0, &p0,
2482 2482              TS_RUN, maxclsyspri - 1);
2483 2483  
2484 2484          return (KPHYSM_OK);
2485 2485  }
2486 2486  
2487 2487  static kmutex_t pp_dummy_lock;          /* Protects init. of pp_dummy. */
2488 2488  static caddr_t pp_dummy;
2489 2489  static pgcnt_t pp_dummy_npages;
2490 2490  static pfn_t *pp_dummy_pfn;     /* Array of dummy pfns. */
2491 2491  
2492 2492  static void
2493 2493  memseg_remap_init_pages(page_t *pages, page_t *epages)
2494 2494  {
2495 2495          page_t *pp;
2496 2496  
2497 2497          for (pp = pages; pp < epages; pp++) {
2498 2498                  pp->p_pagenum = PFN_INVALID;    /* XXXX */
2499 2499                  pp->p_offset = (u_offset_t)-1;
2500 2500                  page_iolock_init(pp);
2501 2501                  while (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_RECLAIM))
2502 2502                          continue;
2503 2503                  page_lock_delete(pp);
2504 2504          }
2505 2505  }
2506 2506  
2507 2507  void
2508 2508  memseg_remap_init()
2509 2509  {
2510 2510          mutex_enter(&pp_dummy_lock);
2511 2511          if (pp_dummy == NULL) {
2512 2512                  uint_t dpages;
2513 2513                  int i;
2514 2514  
2515 2515                  /*
2516 2516                   * dpages starts off as the size of the structure and
2517 2517                   * ends up as the minimum number of pages that will
2518 2518                   * hold a whole number of page_t structures.
2519 2519                   */
2520 2520                  dpages = sizeof (page_t);
2521 2521                  ASSERT(dpages != 0);
2522 2522                  ASSERT(dpages <= MMU_PAGESIZE);
2523 2523  
2524 2524                  while ((dpages & 1) == 0)
2525 2525                          dpages >>= 1;
2526 2526  
2527 2527                  pp_dummy_npages = dpages;
2528 2528                  /*
2529 2529                   * Allocate pp_dummy pages directly from static_arena,
2530 2530                   * since these are whole page allocations and are
2531 2531                   * referenced by physical address.  This also has the
2532 2532                   * nice fringe benefit of hiding the memory from
2533 2533                   * ::findleaks since it doesn't deal well with allocated
2534 2534                   * kernel heap memory that doesn't have any mappings.
2535 2535                   */
2536 2536                  pp_dummy = vmem_xalloc(static_arena, ptob(pp_dummy_npages),
2537 2537                      PAGESIZE, 0, 0, NULL, NULL, VM_SLEEP);
2538 2538                  bzero(pp_dummy, ptob(pp_dummy_npages));
2539 2539                  ASSERT(((uintptr_t)pp_dummy & MMU_PAGEOFFSET) == 0);
2540 2540                  pp_dummy_pfn = kmem_alloc(sizeof (*pp_dummy_pfn) *
2541 2541                      pp_dummy_npages, KM_SLEEP);
2542 2542                  for (i = 0; i < pp_dummy_npages; i++) {
2543 2543                          pp_dummy_pfn[i] = hat_getpfnum(kas.a_hat,
2544 2544                              &pp_dummy[MMU_PAGESIZE * i]);
2545 2545                          ASSERT(pp_dummy_pfn[i] != PFN_INVALID);
2546 2546                  }
2547 2547                  /*
2548 2548                   * Initialize the page_t's to a known 'deleted' state
2549 2549                   * that matches the state of deleted pages.
2550 2550                   */
2551 2551                  memseg_remap_init_pages((page_t *)pp_dummy,
2552 2552                      (page_t *)(pp_dummy + ptob(pp_dummy_npages)));
2553 2553                  /* Remove kmem mappings for the pages for safety. */
2554 2554                  hat_unload(kas.a_hat, pp_dummy, ptob(pp_dummy_npages),
2555 2555                      HAT_UNLOAD_UNLOCK);
2556 2556                  /* Leave pp_dummy pointer set as flag that init is done. */
2557 2557          }
2558 2558          mutex_exit(&pp_dummy_lock);
2559 2559  }
2560 2560  
2561 2561  /*
2562 2562   * Remap a page-aglined range of page_t's to dummy pages.
2563 2563   */
2564 2564  void
2565 2565  remap_to_dummy(caddr_t va, pgcnt_t metapgs)
2566 2566  {
2567 2567          int phase;
2568 2568  
2569 2569          ASSERT(IS_P2ALIGNED((uint64_t)(uintptr_t)va, PAGESIZE));
2570 2570  
2571 2571          /*
2572 2572           * We may start remapping at a non-zero page offset
2573 2573           * within the dummy pages since the low/high ends
2574 2574           * of the outgoing pp's could be shared by other
2575 2575           * memsegs (see memseg_remap_meta).
2576 2576           */
2577 2577          phase = btop((uint64_t)(uintptr_t)va) % pp_dummy_npages;
2578 2578          /*CONSTCOND*/
2579 2579          ASSERT(PAGESIZE % sizeof (page_t) || phase == 0);
2580 2580  
2581 2581          while (metapgs != 0) {
2582 2582                  pgcnt_t n;
2583 2583                  int i, j;
2584 2584  
2585 2585                  n = pp_dummy_npages;
2586 2586                  if (n > metapgs)
2587 2587                          n = metapgs;
2588 2588                  for (i = 0; i < n; i++) {
2589 2589                          j = (i + phase) % pp_dummy_npages;
2590 2590                          hat_devload(kas.a_hat, va, ptob(1), pp_dummy_pfn[j],
2591 2591                              PROT_READ,
2592 2592                              HAT_LOAD | HAT_LOAD_NOCONSIST |
2593 2593                              HAT_LOAD_REMAP);
2594 2594                          va += ptob(1);
2595 2595                  }
2596 2596                  metapgs -= n;
2597 2597          }
2598 2598  }
2599 2599  
2600 2600  static void
2601 2601  memseg_remap_to_dummy(struct memseg *seg)
2602 2602  {
2603 2603          caddr_t pp;
2604 2604          pgcnt_t metapgs;
2605 2605  
2606 2606          ASSERT(memseg_is_dynamic(seg));
2607 2607          ASSERT(pp_dummy != NULL);
2608 2608  
2609 2609  
2610 2610          if (!memseg_includes_meta(seg)) {
2611 2611                  memseg_remap_meta(seg);
2612 2612                  return;
2613 2613          }
2614 2614  
2615 2615          pp = (caddr_t)seg->pages;
2616 2616          metapgs = seg->pages_base - memseg_get_start(seg);
2617 2617          ASSERT(metapgs != 0);
2618 2618  
2619 2619          seg->pages_end = seg->pages_base;
2620 2620  
2621 2621          remap_to_dummy(pp, metapgs);
2622 2622  }
2623 2623  
2624 2624  /*
2625 2625   * Transition all the deleted pages to the deleted state so that
2626 2626   * page_lock will not wait. The page_lock_delete call will
2627 2627   * also wake up any waiters.
2628 2628   */
2629 2629  static void
2630 2630  memseg_lock_delete_all(struct memseg *seg)
2631 2631  {
2632 2632          page_t *pp;
2633 2633  
2634 2634          for (pp = seg->pages; pp < seg->epages; pp++) {
2635 2635                  pp->p_pagenum = PFN_INVALID;    /* XXXX */
2636 2636                  page_lock_delete(pp);
2637 2637          }
2638 2638  }
2639 2639  
/*
 * Final bookkeeping for the memory delete described by mhp.
 *
 * The memsegs overlapped by the handle's delete spans are unlinked from
 * the global memsegs list, their page_t's are put into the deleted
 * state, the corresponding ranges are removed from phys_avail and
 * phys_install, and the global memory counters are reduced.
 *
 * mhp->mh_mutex is released once the memsegs list has been updated and
 * is taken again just before returning.
 */
static void
kphysm_del_cleanup(struct mem_handle *mhp)
{
        struct memdelspan       *mdsp;
        struct memseg           *seg;
        struct memseg           **segpp;
        struct memseg           *seglist;
        pfn_t                   p_end;
        uint64_t                avmem;
        pgcnt_t                 avpgs;
        pgcnt_t                 npgs;

        /* Amount by which maxmem, physmem and availrmem_initial drop. */
        avpgs = mhp->mh_vm_pages;

        memsegs_lock(1);

        /*
         * remove from main segment list.
         */
        npgs = 0;
        seglist = NULL;
        for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
            mdsp = mdsp->mds_next) {
                p_end = mdsp->mds_base + mdsp->mds_npgs;
                /*
                 * segpp always addresses the link that points at seg, so
                 * an unlink below needs no separate 'previous' pointer.
                 */
                for (segpp = &memsegs; (seg = *segpp) != NULL; ) {
                        if (seg->pages_base >= p_end ||
                            seg->pages_end <= mdsp->mds_base) {
                                /* Span and memseg don't overlap. */
                                segpp = &((*segpp)->next);
                                continue;
                        }
                        /* An overlapping memseg must lie wholly in the span. */
                        ASSERT(seg->pages_base >= mdsp->mds_base);
                        ASSERT(seg->pages_end <= p_end);

                        /* Note: the count passed is base - end. */
                        PLCNT_MODIFY_MAX(seg->pages_base,
                            seg->pages_base - seg->pages_end);

                        /* Hide the memseg from future scans. */
                        hat_kpm_delmem_mseg_update(seg, segpp);
                        *segpp = seg->next;
                        membar_producer();      /* TODO: Needed? */
                        npgs += MSEG_NPAGES(seg);

                        /*
                         * Leave the deleted segment's next pointer intact
                         * in case a memsegs scanning loop is walking this
                         * segment concurrently.
                         */
                        /* Chain the removed memsegs through lnext. */
                        seg->lnext = seglist;
                        seglist = seg;
                }
        }

        build_pfn_hash();

        ASSERT(npgs < total_pages);
        total_pages -= npgs;

        /*
         * Recalculate the paging parameters now total_pages has changed.
         * This will also cause the clock hands to be reset before next use.
         */
        setupclock(1);

        memsegs_unlock(1);

        /* Dropped here; re-acquired at the end of the function. */
        mutex_exit(&mhp->mh_mutex);

        /* Dispose of each memseg collected on the private list above. */
        while ((seg = seglist) != NULL) {
                pfn_t mseg_start;
                pfn_t mseg_base, mseg_end;
                pgcnt_t mseg_npgs;
                int mlret;

                seglist = seg->lnext;

                /*
                 * Put the page_t's into the deleted state to stop
                 * cv_wait()s on the pages. When we remap, the dummy
                 * page_t's will be in the same state.
                 */
                memseg_lock_delete_all(seg);
                /*
                 * Collect up information based on pages_base and pages_end
                 * early so that we can flag early that the memseg has been
                 * deleted by setting pages_end == pages_base.
                 */
                mseg_base = seg->pages_base;
                mseg_end = seg->pages_end;
                mseg_npgs = MSEG_NPAGES(seg);
                mseg_start = memseg_get_start(seg);

                if (memseg_is_dynamic(seg)) {
                        /* Remap the meta data to our special dummy area. */
                        memseg_remap_to_dummy(seg);

                        /* Park the memseg on the memseg_va_avail list. */
                        mutex_enter(&memseg_lists_lock);
                        seg->lnext = memseg_va_avail;
                        memseg_va_avail = seg;
                        mutex_exit(&memseg_lists_lock);
                } else {
                        /*
                         * For memory whose page_ts were allocated
                         * at boot, we need to find a new use for
                         * the page_t memory.
                         * For the moment, just leak it.
                         * (It is held in the memseg_delete_junk list.)
                         */
                        seg->pages_end = seg->pages_base;

                        mutex_enter(&memseg_lists_lock);
                        seg->lnext = memseg_delete_junk;
                        memseg_delete_junk = seg;
                        mutex_exit(&memseg_lists_lock);
                }

                /* Must not use seg now as it could be re-used. */

                memlist_write_lock();

                /* phys_avail loses the pages_base..pages_end range. */
                mlret = memlist_delete_span(
                    (uint64_t)(mseg_base) << PAGESHIFT,
                    (uint64_t)(mseg_npgs) << PAGESHIFT,
                    &phys_avail);
                ASSERT(mlret == MEML_SPANOP_OK);

                /*
                 * phys_install loses the range starting at mseg_start,
                 * the value memseg_get_start() returned for the memseg.
                 */
                mlret = memlist_delete_span(
                    (uint64_t)(mseg_start) << PAGESHIFT,
                    (uint64_t)(mseg_end - mseg_start) <<
                    PAGESHIFT,
                    &phys_install);
                ASSERT(mlret == MEML_SPANOP_OK);
                phys_install_has_changed();

                memlist_write_unlock();
        }

        /* Refresh physmax and physinstalled from the trimmed list. */
        memlist_read_lock();
        installed_top_size(phys_install, &physmax, &physinstalled);
        memlist_read_unlock();

        mutex_enter(&freemem_lock);
        maxmem -= avpgs;
        physmem -= avpgs;
        /* availrmem is adjusted during the delete. */
        availrmem_initial -= avpgs;

        mutex_exit(&freemem_lock);

        dump_resize();

        cmn_err(CE_CONT, "?kphysm_delete: mem = %ldK "
            "(0x%" PRIx64 ")\n",
            physinstalled << (PAGESHIFT - 10),
            (uint64_t)physinstalled << PAGESHIFT);

        avmem = (uint64_t)freemem << PAGESHIFT;
        cmn_err(CE_CONT, "?kphysm_delete: "
            "avail mem = %" PRId64 "\n", avmem);

        /*
         * Update lgroup generation number on single lgroup systems
         */
        if (nlgrps == 1)
                lgrp_config(LGRP_CONFIG_GEN_UPDATE, 0, 0);

        /* Successfully deleted system memory */
        mutex_enter(&mhp->mh_mutex);
}
2809 2809  
2810 2810  static uint_t mdel_nullvp_waiter;
2811 2811  
2812 2812  static void
2813 2813  page_delete_collect(
2814 2814          page_t *pp,
2815 2815          struct mem_handle *mhp)
2816 2816  {
2817 2817          if (pp->p_vnode) {
2818 2818                  page_hashout(pp, (kmutex_t *)NULL);
2819 2819                  /* do not do PP_SETAGED(pp); */
2820 2820          } else {
2821 2821                  kmutex_t *sep;
2822 2822  
2823 2823                  sep = page_se_mutex(pp);
2824 2824                  mutex_enter(sep);
2825 2825                  if (CV_HAS_WAITERS(&pp->p_cv)) {
2826 2826                          mdel_nullvp_waiter++;
2827 2827                          cv_broadcast(&pp->p_cv);
2828 2828                  }
2829 2829                  mutex_exit(sep);
2830 2830          }
2831 2831          ASSERT(pp->p_next == pp->p_prev);
2832 2832          ASSERT(pp->p_next == NULL || pp->p_next == pp);
2833 2833          pp->p_next = mhp->mh_deleted;
2834 2834          mhp->mh_deleted = pp;
2835 2835          ASSERT(mhp->mh_hold_todo != 0);
2836 2836          mhp->mh_hold_todo--;
2837 2837  }
2838 2838  
2839 2839  static void
2840 2840  transit_list_collect(struct mem_handle *mhp, int v)
2841 2841  {
2842 2842          struct transit_list_head *trh;
2843 2843  
2844 2844          trh = &transit_list_head;
2845 2845          mutex_enter(&trh->trh_lock);
2846 2846          mhp->mh_transit.trl_collect = v;
2847 2847          mutex_exit(&trh->trh_lock);
2848 2848  }
2849 2849  
2850 2850  static void
2851 2851  transit_list_insert(struct transit_list *tlp)
2852 2852  {
2853 2853          struct transit_list_head *trh;
2854 2854  
2855 2855          trh = &transit_list_head;
2856 2856          ASSERT(MUTEX_HELD(&trh->trh_lock));
2857 2857          tlp->trl_next = trh->trh_head;
2858 2858          trh->trh_head = tlp;
2859 2859  }
2860 2860  
2861 2861  static void
2862 2862  transit_list_remove(struct transit_list *tlp)
2863 2863  {
2864 2864          struct transit_list_head *trh;
2865 2865          struct transit_list **tlpp;
2866 2866  
2867 2867          trh = &transit_list_head;
2868 2868          tlpp = &trh->trh_head;
2869 2869          ASSERT(MUTEX_HELD(&trh->trh_lock));
2870 2870          while (*tlpp != NULL && *tlpp != tlp)
2871 2871                  tlpp = &(*tlpp)->trl_next;
2872 2872          ASSERT(*tlpp != NULL);
2873 2873          if (*tlpp == tlp)
2874 2874                  *tlpp = tlp->trl_next;
2875 2875          tlp->trl_next = NULL;
2876 2876  }
2877 2877  
2878 2878  static struct transit_list *
2879 2879  pfnum_to_transit_list(struct transit_list_head *trh, pfn_t pfnum)
2880 2880  {
2881 2881          struct transit_list *tlp;
2882 2882  
2883 2883          for (tlp = trh->trh_head; tlp != NULL; tlp = tlp->trl_next) {
2884 2884                  struct memdelspan *mdsp;
2885 2885  
2886 2886                  for (mdsp = tlp->trl_spans; mdsp != NULL;
2887 2887                      mdsp = mdsp->mds_next) {
2888 2888                          if (pfnum >= mdsp->mds_base &&
2889 2889                              pfnum < (mdsp->mds_base + mdsp->mds_npgs)) {
2890 2890                                  return (tlp);
2891 2891                          }
2892 2892                  }
2893 2893          }
2894 2894          return (NULL);
2895 2895  }
2896 2896  
2897 2897  int
2898 2898  pfn_is_being_deleted(pfn_t pfnum)
2899 2899  {
2900 2900          struct transit_list_head *trh;
2901 2901          struct transit_list *tlp;
2902 2902          int ret;
2903 2903  
2904 2904          trh = &transit_list_head;
2905 2905          if (trh->trh_head == NULL)
2906 2906                  return (0);
2907 2907  
2908 2908          mutex_enter(&trh->trh_lock);
2909 2909          tlp = pfnum_to_transit_list(trh, pfnum);
2910 2910          ret = (tlp != NULL && tlp->trl_collect);
2911 2911          mutex_exit(&trh->trh_lock);
2912 2912  
2913 2913          return (ret);
2914 2914  }
2915 2915  
#ifdef MEM_DEL_STATS
extern int hz;

/*
 * Print the delete statistics gathered in mhp->mh_delstat.
 * Nothing is printed unless mem_del_stat_print is non-zero.
 */
static void
mem_del_stat_print_func(struct mem_handle *mhp)
{
        uint64_t secs;

        if (!mem_del_stat_print)
                return;

        /* Header: base/npgs of the first span, plus cancel marker. */
        printf("memory delete loop %x/%x, statistics%s\n",
            (uint_t)mhp->mh_transit.trl_spans->mds_base,
            (uint_t)mhp->mh_transit.trl_spans->mds_npgs,
            (mhp->mh_cancel ? " (cancelled)" : ""));

        /* One line per counter. */
        printf("\t%8u nloop\n", mhp->mh_delstat.nloop);
        printf("\t%8u need_free\n", mhp->mh_delstat.need_free);
        printf("\t%8u free_loop\n", mhp->mh_delstat.free_loop);
        printf("\t%8u free_low\n", mhp->mh_delstat.free_low);
        printf("\t%8u free_failed\n", mhp->mh_delstat.free_failed);
        printf("\t%8u ncheck\n", mhp->mh_delstat.ncheck);
        printf("\t%8u nopaget\n", mhp->mh_delstat.nopaget);
        printf("\t%8u lockfail\n", mhp->mh_delstat.lockfail);
        printf("\t%8u nfree\n", mhp->mh_delstat.nfree);
        printf("\t%8u nreloc\n", mhp->mh_delstat.nreloc);
        printf("\t%8u nrelocfail\n", mhp->mh_delstat.nrelocfail);
        printf("\t%8u already_done\n", mhp->mh_delstat.already_done);
        printf("\t%8u first_notfree\n", mhp->mh_delstat.first_notfree);
        printf("\t%8u npplocked\n", mhp->mh_delstat.npplocked);
        printf("\t%8u nlockreloc\n", mhp->mh_delstat.nlockreloc);
        printf("\t%8u nnorepl\n", mhp->mh_delstat.nnorepl);
        printf("\t%8u nmodreloc\n", mhp->mh_delstat.nmodreloc);
        printf("\t%8u ndestroy\n", mhp->mh_delstat.ndestroy);
        printf("\t%8u nputpage\n", mhp->mh_delstat.nputpage);
        printf("\t%8u nnoreclaim\n", mhp->mh_delstat.nnoreclaim);
        printf("\t%8u ndelay\n", mhp->mh_delstat.ndelay);
        printf("\t%8u demotefail\n", mhp->mh_delstat.demotefail);
        printf("\t%8u retired\n", mhp->mh_delstat.retired);
        printf("\t%8u toxic\n", mhp->mh_delstat.toxic);
        printf("\t%8u failing\n", mhp->mh_delstat.failing);
        printf("\t%8u modtoxic\n", mhp->mh_delstat.modtoxic);
        printf("\t%8u npplkdtoxic\n", mhp->mh_delstat.npplkdtoxic);
        printf("\t%8u gptlmodfail\n", mhp->mh_delstat.gptlmodfail);
        printf("\t%8u gptllckfail\n", mhp->mh_delstat.gptllckfail);

        /* Tick totals, also shown as minutes and seconds. */
        secs = mhp->mh_delstat.nticks_total / hz;  /* seconds */
        printf(
            "\t%"PRIu64" nticks_total - %"PRIu64" min %"PRIu64" sec\n",
            mhp->mh_delstat.nticks_total, secs / 60, secs % 60);

        secs = mhp->mh_delstat.nticks_pgrp / hz;  /* seconds */
        printf(
            "\t%"PRIu64" nticks_pgrp - %"PRIu64" min %"PRIu64" sec\n",
            mhp->mh_delstat.nticks_pgrp, secs / 60, secs % 60);
}
#endif /* MEM_DEL_STATS */
2969 2969  
/*
 * One registered set of memory configuration callbacks together with
 * the argument handed back to them.
 */
struct mem_callback {
        kphysm_setup_vector_t   *vec;   /* callbacks; NULL marks a free slot */
        void                    *arg;   /* first argument of every callback */
};

/* Capacity of the mem_callbacks[] table. */
#define NMEMCALLBACKS           100

/* Registration table; slots at index >= nmemcallbacks are never scanned. */
static struct mem_callback mem_callbacks[NMEMCALLBACKS];
/* Upper bound on the slots in use; freed slots may remain below it. */
static uint_t nmemcallbacks;
/* Taken as writer to edit the table and as reader to run callbacks. */
static krwlock_t mem_callback_rwlock;
2980 2980  
2981 2981  int
2982 2982  kphysm_setup_func_register(kphysm_setup_vector_t *vec, void *arg)
2983 2983  {
2984 2984          uint_t i, found;
2985 2985  
2986 2986          /*
2987 2987           * This test will become more complicated when the version must
2988 2988           * change.
2989 2989           */
2990 2990          if (vec->version != KPHYSM_SETUP_VECTOR_VERSION)
2991 2991                  return (EINVAL);
2992 2992  
2993 2993          if (vec->post_add == NULL || vec->pre_del == NULL ||
2994 2994              vec->post_del == NULL)
2995 2995                  return (EINVAL);
2996 2996  
2997 2997          rw_enter(&mem_callback_rwlock, RW_WRITER);
2998 2998          for (i = 0, found = 0; i < nmemcallbacks; i++) {
2999 2999                  if (mem_callbacks[i].vec == NULL && found == 0)
3000 3000                          found = i + 1;
3001 3001                  if (mem_callbacks[i].vec == vec &&
3002 3002                      mem_callbacks[i].arg == arg) {
3003 3003  #ifdef DEBUG
3004 3004                          /* Catch this in DEBUG kernels. */
3005 3005                          cmn_err(CE_WARN, "kphysm_setup_func_register"
3006 3006                              "(0x%p, 0x%p) duplicate registration from 0x%p",
3007 3007                              (void *)vec, arg, (void *)caller());
3008 3008  #endif /* DEBUG */
3009 3009                          rw_exit(&mem_callback_rwlock);
3010 3010                          return (EEXIST);
3011 3011                  }
3012 3012          }
3013 3013          if (found != 0) {
3014 3014                  i = found - 1;
3015 3015          } else {
3016 3016                  ASSERT(nmemcallbacks < NMEMCALLBACKS);
3017 3017                  if (nmemcallbacks == NMEMCALLBACKS) {
3018 3018                          rw_exit(&mem_callback_rwlock);
3019 3019                          return (ENOMEM);
3020 3020                  }
3021 3021                  i = nmemcallbacks++;
3022 3022          }
3023 3023          mem_callbacks[i].vec = vec;
3024 3024          mem_callbacks[i].arg = arg;
3025 3025          rw_exit(&mem_callback_rwlock);
3026 3026          return (0);
3027 3027  }
3028 3028  
3029 3029  void
3030 3030  kphysm_setup_func_unregister(kphysm_setup_vector_t *vec, void *arg)
3031 3031  {
3032 3032          uint_t i;
3033 3033  
3034 3034          rw_enter(&mem_callback_rwlock, RW_WRITER);
3035 3035          for (i = 0; i < nmemcallbacks; i++) {
3036 3036                  if (mem_callbacks[i].vec == vec &&
3037 3037                      mem_callbacks[i].arg == arg) {
3038 3038                          mem_callbacks[i].vec = NULL;
3039 3039                          mem_callbacks[i].arg = NULL;
3040 3040                          if (i == (nmemcallbacks - 1))
3041 3041                                  nmemcallbacks--;
3042 3042                          break;
3043 3043                  }
3044 3044          }
3045 3045          rw_exit(&mem_callback_rwlock);
3046 3046  }
3047 3047  
3048 3048  static void
3049 3049  kphysm_setup_post_add(pgcnt_t delta_pages)
3050 3050  {
3051 3051          uint_t i;
3052 3052  
3053 3053          rw_enter(&mem_callback_rwlock, RW_READER);
3054 3054          for (i = 0; i < nmemcallbacks; i++) {
3055 3055                  if (mem_callbacks[i].vec != NULL) {
3056 3056                          (*mem_callbacks[i].vec->post_add)
3057 3057                              (mem_callbacks[i].arg, delta_pages);
3058 3058                  }
3059 3059          }
3060 3060          rw_exit(&mem_callback_rwlock);
3061 3061  }
3062 3062  
3063 3063  /*
3064 3064   * Note the locking between pre_del and post_del: The reader lock is held
3065 3065   * between the two calls to stop the set of functions from changing.
3066 3066   */
3067 3067  
3068 3068  static int
3069 3069  kphysm_setup_pre_del(pgcnt_t delta_pages)
3070 3070  {
3071 3071          uint_t i;
3072 3072          int ret;
3073 3073          int aret;
3074 3074  
3075 3075          ret = 0;
3076 3076          rw_enter(&mem_callback_rwlock, RW_READER);
3077 3077          for (i = 0; i < nmemcallbacks; i++) {
3078 3078                  if (mem_callbacks[i].vec != NULL) {
3079 3079                          aret = (*mem_callbacks[i].vec->pre_del)
3080 3080                              (mem_callbacks[i].arg, delta_pages);
3081 3081                          ret |= aret;
3082 3082                  }
3083 3083          }
3084 3084  
3085 3085          return (ret);
3086 3086  }
3087 3087  
3088 3088  static void
3089 3089  kphysm_setup_post_del(pgcnt_t delta_pages, int cancelled)
3090 3090  {
3091 3091          uint_t i;
3092 3092  
3093 3093          for (i = 0; i < nmemcallbacks; i++) {
3094 3094                  if (mem_callbacks[i].vec != NULL) {
3095 3095                          (*mem_callbacks[i].vec->post_del)
3096 3096                              (mem_callbacks[i].arg, delta_pages, cancelled);
3097 3097                  }
3098 3098          }
3099 3099          rw_exit(&mem_callback_rwlock);
3100 3100  }
3101 3101  
3102 3102  static int
3103 3103  kphysm_split_memseg(
3104 3104          pfn_t base,
3105 3105          pgcnt_t npgs)
3106 3106  {
3107 3107          struct memseg *seg;
3108 3108          struct memseg **segpp;
3109 3109          pgcnt_t size_low, size_high;
3110 3110          struct memseg *seg_low, *seg_mid, *seg_high;
3111 3111  
3112 3112          /*
3113 3113           * Lock the memsegs list against other updates now
3114 3114           */
3115 3115          memsegs_lock(1);
3116 3116  
3117 3117          /*
3118 3118           * Find boot time memseg that wholly covers this area.
3119 3119           */
3120 3120  
3121 3121          /* First find the memseg with page 'base' in it. */
3122 3122          for (segpp = &memsegs; (seg = *segpp) != NULL;
3123 3123              segpp = &((*segpp)->next)) {
3124 3124                  if (base >= seg->pages_base && base < seg->pages_end)
3125 3125                          break;
3126 3126          }
3127 3127          if (seg == NULL) {
3128 3128                  memsegs_unlock(1);
3129 3129                  return (0);
3130 3130          }
3131 3131          if (memseg_includes_meta(seg)) {
3132 3132                  memsegs_unlock(1);
3133 3133                  return (0);
3134 3134          }
3135 3135          if ((base + npgs) > seg->pages_end) {
3136 3136                  memsegs_unlock(1);
3137 3137                  return (0);
3138 3138          }
3139 3139  
3140 3140          /*
3141 3141           * Work out the size of the two segments that will
3142 3142           * surround the new segment, one for low address
3143 3143           * and one for high.
3144 3144           */
3145 3145          ASSERT(base >= seg->pages_base);
3146 3146          size_low = base - seg->pages_base;
3147 3147          ASSERT(seg->pages_end >= (base + npgs));
3148 3148          size_high = seg->pages_end - (base + npgs);
3149 3149  
3150 3150          /*
3151 3151           * Sanity check.
3152 3152           */
3153 3153          if ((size_low + size_high) == 0) {
3154 3154                  memsegs_unlock(1);
3155 3155                  return (0);
3156 3156          }
3157 3157  
3158 3158          /*
3159 3159           * Allocate the new structures. The old memseg will not be freed
3160 3160           * as there may be a reference to it.
3161 3161           */
3162 3162          seg_low = NULL;
3163 3163          seg_high = NULL;
3164 3164  
3165 3165          if (size_low != 0)
3166 3166                  seg_low = memseg_alloc();
3167 3167  
3168 3168          seg_mid = memseg_alloc();
3169 3169  
3170 3170          if (size_high != 0)
3171 3171                  seg_high = memseg_alloc();
3172 3172  
3173 3173          /*
3174 3174           * All allocation done now.
3175 3175           */
3176 3176          if (size_low != 0) {
3177 3177                  seg_low->pages = seg->pages;
3178 3178                  seg_low->epages = seg_low->pages + size_low;
3179 3179                  seg_low->pages_base = seg->pages_base;
3180 3180                  seg_low->pages_end = seg_low->pages_base + size_low;
3181 3181                  seg_low->next = seg_mid;
3182 3182                  seg_low->msegflags = seg->msegflags;
3183 3183          }
3184 3184          if (size_high != 0) {
3185 3185                  seg_high->pages = seg->epages - size_high;
3186 3186                  seg_high->epages = seg_high->pages + size_high;
3187 3187                  seg_high->pages_base = seg->pages_end - size_high;
3188 3188                  seg_high->pages_end = seg_high->pages_base + size_high;
3189 3189                  seg_high->next = seg->next;
3190 3190                  seg_high->msegflags = seg->msegflags;
3191 3191          }
3192 3192  
3193 3193          seg_mid->pages = seg->pages + size_low;
3194 3194          seg_mid->pages_base = seg->pages_base + size_low;
3195 3195          seg_mid->epages = seg->epages - size_high;
3196 3196          seg_mid->pages_end = seg->pages_end - size_high;
3197 3197          seg_mid->next = (seg_high != NULL) ? seg_high : seg->next;
3198 3198          seg_mid->msegflags = seg->msegflags;
3199 3199  
3200 3200          /*
3201 3201           * Update hat_kpm specific info of all involved memsegs and
3202 3202           * allow hat_kpm specific global chain updates.
3203 3203           */
3204 3204          hat_kpm_split_mseg_update(seg, segpp, seg_low, seg_mid, seg_high);
3205 3205  
3206 3206          /*
3207 3207           * At this point we have two equivalent memseg sub-chains,
3208 3208           * seg and seg_low/seg_mid/seg_high, which both chain on to
3209 3209           * the same place in the global chain. By re-writing the pointer
3210 3210           * in the previous element we switch atomically from using the old
3211 3211           * (seg) to the new.
3212 3212           */
3213 3213          *segpp = (seg_low != NULL) ? seg_low : seg_mid;
3214 3214  
3215 3215          membar_enter();
3216 3216  
3217 3217          build_pfn_hash();
3218 3218          memsegs_unlock(1);
3219 3219  
3220 3220          /*
3221 3221           * We leave the old segment, 'seg', intact as there may be
3222 3222           * references to it. Also, as the value of total_pages has not
3223 3223           * changed and the memsegs list is effectively the same when
3224 3224           * accessed via the old or the new pointer, we do not have to
3225 3225           * cause pageout_scanner() to re-evaluate its hand pointers.
3226 3226           *
3227 3227           * We currently do not re-use or reclaim the page_t memory.
3228 3228           * If we do, then this may have to change.
3229 3229           */
3230 3230  
3231 3231          mutex_enter(&memseg_lists_lock);
3232 3232          seg->lnext = memseg_edit_junk;
3233 3233          memseg_edit_junk = seg;
3234 3234          mutex_exit(&memseg_lists_lock);
3235 3235  
3236 3236          return (1);
3237 3237  }
3238 3238  
3239 3239  /*
3240 3240   * The sfmmu hat layer (e.g.) accesses some parts of the memseg
3241 3241   * structure using physical addresses. Therefore a kmem_cache is
3242 3242   * used with KMC_NOHASH to avoid page crossings within a memseg
3243 3243   * structure. KMC_NOHASH requires that no external (outside of
3244 3244   * slab) information is allowed. This, in turn, implies that the
3245 3245   * cache's slabsize must be exactly a single page, since per-slab
3246 3246   * information (e.g. the freelist for the slab) is kept at the
3247 3247   * end of the slab, where it is easy to locate. Should be changed
3248 3248   * when a more obvious kmem_cache interface/flag will become
3249 3249   * available.
3250 3250   */
3251 3251  void
3252 3252  mem_config_init()
3253 3253  {
3254 3254          memseg_cache = kmem_cache_create("memseg_cache", sizeof (struct memseg),
3255 3255              0, NULL, NULL, NULL, NULL, static_arena, KMC_NOHASH);
3256 3256  }
3257 3257  
3258 3258  struct memseg *
3259 3259  memseg_alloc()
3260 3260  {
3261 3261          struct memseg *seg;
3262 3262  
3263 3263          seg = kmem_cache_alloc(memseg_cache, KM_SLEEP);
3264 3264          bzero(seg, sizeof (struct memseg));
3265 3265  
3266 3266          return (seg);
3267 3267  }
3268 3268  
3269 3269  /*
3270 3270   * Return whether the page_t memory for this memseg
3271 3271   * is included in the memseg itself.
3272 3272   */
3273 3273  static int
3274 3274  memseg_includes_meta(struct memseg *seg)
3275 3275  {
3276 3276          return (seg->msegflags & MEMSEG_META_INCL);
3277 3277  }
3278 3278  
3279 3279  pfn_t
3280 3280  memseg_get_start(struct memseg *seg)
3281 3281  {
3282 3282          pfn_t           pt_start;
3283 3283  
3284 3284          if (memseg_includes_meta(seg)) {
3285 3285                  pt_start = hat_getpfnum(kas.a_hat, (caddr_t)seg->pages);
3286 3286  
3287 3287                  /* Meta data is required to be at the beginning */
3288 3288                  ASSERT(pt_start < seg->pages_base);
3289 3289          } else
3290 3290                  pt_start = seg->pages_base;
3291 3291  
3292 3292          return (pt_start);
3293 3293  }
3294 3294

↓ open down ↓

3294 lines elided

↑ open up ↑

3295 3295  /*
3296 3296   * Invalidate memseg pointers in cpu private vm data caches.
3297 3297   */
3298 3298  static void
3299 3299  memseg_cpu_vm_flush()
3300 3300  {
3301 3301          cpu_t *cp;
3302 3302          vm_cpu_data_t *vc;
3303 3303  
3304 3304          mutex_enter(&cpu_lock);
3305      -        pause_cpus(NULL);
     3305 +        pause_cpus(NULL, NULL);
3306 3306  
3307 3307          cp = cpu_list;
3308 3308          do {
3309 3309                  vc = cp->cpu_vm_data;
3310 3310                  vc->vc_pnum_memseg = NULL;
3311 3311                  vc->vc_pnext_memseg = NULL;
3312 3312  
3313 3313          } while ((cp = cp->cpu_next) != cpu_list);
3314 3314  
3315 3315          start_cpus();
3316 3316          mutex_exit(&cpu_lock);
3317 3317  }

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX