1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
  23  */
  24 
  25 /*
  26  * Copyright 2012 Joyent, Inc.  All rights reserved.
  27  */
  28 
  29 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
  30 /*      All Rights Reserved   */
  31 
  32 /*
  33  * Portions of this source code were derived from Berkeley 4.3 BSD
  34  * under license from the Regents of the University of California.
  35  */
  36 
  37 
  38 /*
  39  * This file contains common functions to access and manage the page lists.
 * Many of these routines originated from platform-dependent modules
 * (sun4/vm/vm_dep.c, i86pc/vm/vm_machdep.c) and were modified to function
 * in a platform-independent manner.
  43  *
  44  * vm/vm_dep.h provides for platform specific support.
  45  */
  46 
  47 #include <sys/types.h>
  48 #include <sys/debug.h>
  49 #include <sys/cmn_err.h>
  50 #include <sys/systm.h>
  51 #include <sys/atomic.h>
  52 #include <sys/sysmacros.h>
  53 #include <vm/as.h>
  54 #include <vm/page.h>
  55 #include <vm/seg_kmem.h>
  56 #include <vm/seg_vn.h>
  57 #include <sys/vmsystm.h>
  58 #include <sys/memnode.h>
  59 #include <vm/vm_dep.h>
  60 #include <sys/lgrp.h>
  61 #include <sys/mem_config.h>
  62 #include <sys/callb.h>
  63 #include <sys/mem_cage.h>
  64 #include <sys/sdt.h>
  65 #include <sys/dumphdr.h>
  66 #include <sys/swap.h>
  67 
  68 extern uint_t   vac_colors;
  69 
  70 #define MAX_PRAGMA_ALIGN        128
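/*
 * 128 is assumed to be the largest alignment the compiler's #pragma align
 * directive accepts, so larger L2 cache alignments are clamped to it below.
 */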
  71 
  72 /* vm_cpu_data0 for the boot cpu before kmem is initialized */
  73 
  74 #if L2CACHE_ALIGN_MAX <= MAX_PRAGMA_ALIGN
  75 #pragma align   L2CACHE_ALIGN_MAX(vm_cpu_data0)
  76 #else
  77 #pragma align   MAX_PRAGMA_ALIGN(vm_cpu_data0)
  78 #endif
  79 char            vm_cpu_data0[VM_CPU_DATA_PADSIZE];
  80 
  81 /*
 * Number of page colors equivalent to requested color in page_get routines.
  83  * If set, keeps large pages intact longer and keeps MPO allocation
  84  * from the local mnode in favor of acquiring the 'correct' page color from
  85  * a demoted large page or from a remote mnode.
  86  */
  87 uint_t  colorequiv;
  88 
  89 /*
  90  * color equivalency mask for each page size.
  91  * Mask is computed based on cpu L2$ way sizes and colorequiv global.
  92  * High 4 bits determine the number of high order bits of the color to ignore.
  93  * Low 4 bits determines number of low order bits of color to ignore (it's only
  94  * relevant for hashed index based page coloring).
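 * For example, a (hypothetical) value of 0x12 ignores 1 high-order and
 * 2 low-order bits of the color, treating colors that differ only in those
 * bits as equivalent for that page size.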
  95  */
  96 uchar_t colorequivszc[MMU_PAGE_SIZES];
  97 
  98 /*
 * If set, specifies the percentage of pages within a large page region that
 * must be free before attempting to lock those pages for
 * page_get_contig_pages processing.
 *
 * Should be turned on when kpr is available, since page_trylock_contig_pages
 * can then be more selective.
 105  */
 106 
 107 int     ptcpthreshold;
 108 
 109 /*
 110  * Limit page get contig page search based on failure cnts in pgcpfailcnt[].
 111  * Enabled by default via pgcplimitsearch.
 112  *
 113  * pgcpfailcnt[] is bounded by PGCPFAILMAX (>= 1/2 of installed
 114  * memory). When reached, pgcpfailcnt[] is reset to 1/2 of this upper
 115  * bound. This upper bound range guarantees:
 116  *    - all large page 'slots' will be searched over time
 117  *    - the minimum (1) large page candidates considered on each pgcp call
 118  *    - count doesn't wrap around to 0
 119  */
 120 pgcnt_t pgcpfailcnt[MMU_PAGE_SIZES];
 121 int     pgcplimitsearch = 1;
 122 
 123 #define PGCPFAILMAX             (1 << (highbit(physinstalled) - 1))
 124 #define SETPGCPFAILCNT(szc)                                             \
 125         if (++pgcpfailcnt[szc] >= PGCPFAILMAX)                               \
 126                 pgcpfailcnt[szc] = PGCPFAILMAX / 2;
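/*
 * Worked example (hypothetical numbers): with physinstalled == 0x180000
 * pages, highbit() returns 21, so PGCPFAILMAX is 1 << 20 == 0x100000 pages,
 * at least half of installed memory as noted above.  SETPGCPFAILCNT resets
 * pgcpfailcnt[szc] to PGCPFAILMAX / 2 once it reaches PGCPFAILMAX.
 */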
 127 
 128 #ifdef VM_STATS
 129 struct vmm_vmstats_str  vmm_vmstats;
 130 
 131 #endif /* VM_STATS */
 132 
 133 #if defined(__sparc)
 134 #define LPGCREATE       0
 135 #else
 136 /* enable page_get_contig_pages */
 137 #define LPGCREATE       1
 138 #endif
 139 
 140 int pg_contig_disable;
 141 int pg_lpgcreate_nocage = LPGCREATE;
 142 
 143 /*
 144  * page_freelist_split pfn flag to signify no lo or hi pfn requirement.
 145  */
 146 #define PFNNULL         0
 147 
 148 /* Flags involved in promotion and demotion routines */
 149 #define PC_FREE         0x1     /* put page on freelist */
 150 #define PC_ALLOC        0x2     /* return page for allocation */
 151 
 152 /*
 153  * Flag for page_demote to be used with PC_FREE to denote that we don't care
 154  * what the color is as the color parameter to the function is ignored.
 155  */
 156 #define PC_NO_COLOR     (-1)
 157 
 158 /* mtype value for page_promote to use when mtype does not matter */
 159 #define PC_MTYPE_ANY    (-1)
 160 
 161 /*
 162  * page counters candidates info
 163  * See page_ctrs_cands comment below for more details.
 164  * fields are as follows:
 165  *      pcc_pages_free:         # pages which freelist coalesce can create
 166  *      pcc_color_free:         pointer to page free counts per color
 167  */
 168 typedef struct pcc_info {
 169         pgcnt_t pcc_pages_free;
 170         pgcnt_t *pcc_color_free;
 171         uint_t  pad[12];
 172 } pcc_info_t;
 173 
 174 /*
 175  * On big machines it can take a long time to check page_counters
 176  * arrays. page_ctrs_cands is a summary array whose elements are a dynamically
 177  * updated sum of all elements of the corresponding page_counters arrays.
 178  * page_freelist_coalesce() searches page_counters only if an appropriate
 179  * element of page_ctrs_cands array is greater than 0.
 180  *
 181  * page_ctrs_cands is indexed by mutex (i), region (r), mnode (m), mrange (g)
 182  */
 183 pcc_info_t **page_ctrs_cands[NPC_MUTEX][MMU_PAGE_SIZES];
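/*
 * Only the mutex and region-size dimensions are static; the per-mnode and
 * per-mrange levels are allocated by page_ctrs_alloc() (and extended by
 * page_ctrs_adjust() on memory attach), hence the double indirection above.
 */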
 184 
 185 /*
 186  * Return in val the total number of free pages which can be created
 187  * for the given mnode (m), mrange (g), and region size (r)
 188  */
 189 #define PGCTRS_CANDS_GETVALUE(m, g, r, val) {                           \
 190         int i;                                                          \
 191         val = 0;                                                        \
 192         for (i = 0; i < NPC_MUTEX; i++) {                            \
 193             val += page_ctrs_cands[i][(r)][(m)][(g)].pcc_pages_free;    \
 194         }                                                               \
 195 }
 196 
 197 /*
 198  * Return in val the total number of free pages which can be created
 199  * for the given mnode (m), mrange (g), region size (r), and color (c)
 200  */
 201 #define PGCTRS_CANDS_GETVALUECOLOR(m, g, r, c, val) {                   \
 202         int i;                                                          \
 203         val = 0;                                                        \
 204         ASSERT((c) < PAGE_GET_PAGECOLORS(r));                                \
 205         for (i = 0; i < NPC_MUTEX; i++) {                            \
 206             val +=                                                      \
 207                 page_ctrs_cands[i][(r)][(m)][(g)].pcc_color_free[(c)];  \
 208         }                                                               \
 209 }
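/*
 * Usage sketch (local names hypothetical): a caller such as
 * page_freelist_coalesce() can take a cheap summary reading before walking
 * the full page_counters array:
 *
 *	pgcnt_t pgfree;
 *
 *	PGCTRS_CANDS_GETVALUE(mnode, mrange, r, pgfree);
 *	if (pgfree == 0)
 *		return (NULL);
 */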
 210 
 211 /*
 212  * We can only allow a single thread to update a counter within the physical
 213  * range of the largest supported page size. That is the finest granularity
 214  * possible since the counter values are dependent on each other
 * as you move across region sizes. PP_CTR_LOCK_INDX is used to determine the
 216  * ctr_mutex lock index for a particular physical range.
 217  */
 218 static kmutex_t *ctr_mutex[NPC_MUTEX];
 219 
 220 #define PP_CTR_LOCK_INDX(pp)                                            \
 221         (((pp)->p_pagenum >>                                           \
 222             (PAGE_BSZS_SHIFT(mmu_page_sizes - 1))) & (NPC_MUTEX - 1))
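/*
 * All pages within one largest-size region thus share a single ctr_mutex
 * slot, with successive regions striped across the NPC_MUTEX slots; the
 * & (NPC_MUTEX - 1) mask assumes NPC_MUTEX is a power of two.
 */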
 223 
 224 #define INVALID_COLOR 0xffffffff
 225 #define INVALID_MASK  0xffffffff
 226 
 227 /*
 228  * Local functions prototypes.
 229  */
 230 
 231 void page_ctr_add(int, int, page_t *, int);
 232 void page_ctr_add_internal(int, int, page_t *, int);
 233 void page_ctr_sub(int, int, page_t *, int);
 234 void page_ctr_sub_internal(int, int, page_t *, int);
 235 void page_freelist_lock(int);
 236 void page_freelist_unlock(int);
 237 page_t *page_promote(int, pfn_t, uchar_t, int, int);
 238 page_t *page_demote(int, pfn_t, pfn_t, uchar_t, uchar_t, int, int);
 239 page_t *page_freelist_split(uchar_t,
 240     uint_t, int, int, pfn_t, pfn_t, page_list_walker_t *);
 241 page_t *page_get_mnode_cachelist(uint_t, uint_t, int, int);
 242 static int page_trylock_cons(page_t *pp, se_t se);
 243 
 244 /*
 245  * The page_counters array below is used to keep track of free contiguous
 246  * physical memory.  A hw_page_map_t will be allocated per mnode per szc.
 247  * This contains an array of counters, the size of the array, a shift value
 248  * used to convert a pagenum into a counter array index or vice versa, as
 249  * well as a cache of the last successful index to be promoted to a larger
 250  * page size.  As an optimization, we keep track of the last successful index
 251  * to be promoted per page color for the given size region, and this is
 252  * allocated dynamically based upon the number of colors for a given
 253  * region size.
 254  *
 255  * Conceptually, the page counters are represented as:
 256  *
 257  *      page_counters[region_size][mnode]
 258  *
 259  *      region_size:    size code of a candidate larger page made up
 260  *                      of contiguous free smaller pages.
 261  *
 262  *      page_counters[region_size][mnode].hpm_counters[index]:
 263  *              represents how many (region_size - 1) pages either
 264  *              exist or can be created within the given index range.
 265  *
 266  * Let's look at a sparc example:
 267  *      If we want to create a free 512k page, we look at region_size 2
 268  *      for the mnode we want.  We calculate the index and look at a specific
 269  *      hpm_counters location.  If we see 8 (FULL_REGION_CNT on sparc) at
 270  *      this location, it means that 8 64k pages either exist or can be created
 271  *      from 8K pages in order to make a single free 512k page at the given
 272  *      index.  Note that when a region is full, it will contribute to the
 273  *      counts in the region above it.  Thus we will not know what page
 274  *      size the free pages will be which can be promoted to this new free
 275  *      page unless we look at all regions below the current region.
 276  */
 277 
 278 /*
 279  * Note: hpmctr_t is defined in platform vm_dep.h
 280  * hw_page_map_t contains all the information needed for the page_counters
 281  * logic. The fields are as follows:
 282  *
 283  *      hpm_counters:   dynamically allocated array to hold counter data
 284  *      hpm_entries:    entries in hpm_counters
 285  *      hpm_shift:      shift for pnum/array index conv
 286  *      hpm_base:       PFN mapped to counter index 0
 287  *      hpm_color_current:      last index in counter array for this color at
 288  *                              which we successfully created a large page
 289  */
 290 typedef struct hw_page_map {
 291         hpmctr_t        *hpm_counters;
 292         size_t          hpm_entries;
 293         int             hpm_shift;
 294         pfn_t           hpm_base;
 295         size_t          *hpm_color_current[MAX_MNODE_MRANGES];
 296 #if defined(__sparc)
 297         uint_t          pad[4];
 298 #endif
 299 } hw_page_map_t;
 300 
 301 /*
 302  * Element zero is not used, but is allocated for convenience.
 303  */
 304 static hw_page_map_t *page_counters[MMU_PAGE_SIZES];
 305 
 306 /*
 307  * Cached value of MNODE_RANGE_CNT(mnode).
 * This is a function call on x86.
 309  */
 310 static int mnode_nranges[MAX_MEM_NODES];
 311 static int mnode_maxmrange[MAX_MEM_NODES];
 312 
 313 /*
 314  * The following macros are convenient ways to get access to the individual
 315  * elements of the page_counters arrays.  They can be used on both
 316  * the left side and right side of equations.
 317  */
 318 #define PAGE_COUNTERS(mnode, rg_szc, idx)                       \
 319         (page_counters[(rg_szc)][(mnode)].hpm_counters[(idx)])
 320 
 321 #define PAGE_COUNTERS_COUNTERS(mnode, rg_szc)                   \
 322         (page_counters[(rg_szc)][(mnode)].hpm_counters)
 323 
 324 #define PAGE_COUNTERS_SHIFT(mnode, rg_szc)                      \
 325         (page_counters[(rg_szc)][(mnode)].hpm_shift)
 326 
 327 #define PAGE_COUNTERS_ENTRIES(mnode, rg_szc)                    \
 328         (page_counters[(rg_szc)][(mnode)].hpm_entries)
 329 
 330 #define PAGE_COUNTERS_BASE(mnode, rg_szc)                       \
 331         (page_counters[(rg_szc)][(mnode)].hpm_base)
 332 
 333 #define PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, rg_szc, g)             \
 334         (page_counters[(rg_szc)][(mnode)].hpm_color_current[(g)])
 335 
 336 #define PAGE_COUNTERS_CURRENT_COLOR(mnode, rg_szc, color, mrange)       \
 337         (page_counters[(rg_szc)][(mnode)].                              \
 338         hpm_color_current[(mrange)][(color)])
 339 
 340 #define PNUM_TO_IDX(mnode, rg_szc, pnum)                        \
 341         (((pnum) - PAGE_COUNTERS_BASE((mnode), (rg_szc))) >>      \
 342                 PAGE_COUNTERS_SHIFT((mnode), (rg_szc)))
 343 
 344 #define IDX_TO_PNUM(mnode, rg_szc, index)                       \
 345         (PAGE_COUNTERS_BASE((mnode), (rg_szc)) +                \
 346                 ((index) << PAGE_COUNTERS_SHIFT((mnode), (rg_szc))))
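/*
 * Worked example (hypothetical values): with hpm_base == 0x1000 and
 * hpm_shift == 3, PNUM_TO_IDX maps pfn 0x1047 to (0x1047 - 0x1000) >> 3 == 8,
 * and IDX_TO_PNUM maps index 8 back to 0x1000 + (8 << 3) == 0x1040, the
 * first pfn of that counter's region rather than the original pfn.
 */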
 347 
 348 /*
 349  * Protects the hpm_counters and hpm_color_current memory from changing while
 350  * looking at page counters information.
 351  * Grab the write lock to modify what these fields point at.
 352  * Grab the read lock to prevent any pointers from changing.
 * The write lock cannot be held during memory allocation: the allocation
 * path may recursively try to grab the read lock while the write lock is
 * already held, which would deadlock.
 356  */
 357 krwlock_t page_ctrs_rwlock[MAX_MEM_NODES];
 358 
 359 
 360 /*
 361  * initialize cpu_vm_data to point at cache aligned vm_cpu_data_t.
 362  */
 363 void
 364 cpu_vm_data_init(struct cpu *cp)
 365 {
 366         if (cp == CPU0) {
 367                 cp->cpu_vm_data = (void *)&vm_cpu_data0;
 368         } else {
 369                 void    *kmptr;
 370                 int     align;
 371                 size_t  sz;
 372 
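                /*
                 * kmem_zalloc() makes no L2-cache alignment guarantee, so
                 * over-allocate by one alignment unit and round the returned
                 * pointer up; the rounded pointer still leaves room for a
                 * full vm_cpu_data_t, and vc_kmptr/vc_kmsize remember the
                 * original allocation for cpu_vm_data_destroy().
                 */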
 373                 align = (L2CACHE_ALIGN) ? L2CACHE_ALIGN : L2CACHE_ALIGN_MAX;
 374                 sz = P2ROUNDUP(sizeof (vm_cpu_data_t), align) + align;
 375                 kmptr = kmem_zalloc(sz, KM_SLEEP);
 376                 cp->cpu_vm_data = (void *) P2ROUNDUP((uintptr_t)kmptr, align);
 377                 ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr = kmptr;
 378                 ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize = sz;
 379         }
 380 }
 381 
 382 /*
 383  * free cpu_vm_data
 384  */
 385 void
 386 cpu_vm_data_destroy(struct cpu *cp)
 387 {
 388         if (cp->cpu_seqid && cp->cpu_vm_data) {
 389                 ASSERT(cp != CPU0);
 390                 kmem_free(((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr,
 391                     ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize);
 392         }
 393         cp->cpu_vm_data = NULL;
 394 }
 395 
 396 
 397 /*
 398  * page size to page size code
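 * For example, page_szc(MMU_PAGESIZE) is expected to return 0, since
 * hw_page_array[0] describes the base page size; an unsupported size
 * returns -1.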
 399  */
 400 int
 401 page_szc(size_t pagesize)
 402 {
 403         int     i = 0;
 404 
 405         while (hw_page_array[i].hp_size) {
 406                 if (pagesize == hw_page_array[i].hp_size)
 407                         return (i);
 408                 i++;
 409         }
 410         return (-1);
 411 }
 412 
 413 /*
 414  * page size to page size code with the restriction that it be a supported
 415  * user page size.  If it's not a supported user page size, -1 will be returned.
 416  */
 417 int
 418 page_szc_user_filtered(size_t pagesize)
 419 {
 420         int szc = page_szc(pagesize);
 421         if ((szc != -1) && (SZC_2_USERSZC(szc) != -1)) {
 422                 return (szc);
 423         }
 424         return (-1);
 425 }
 426 
 427 /*
 428  * Return how many page sizes are available for the user to use.  This is
 429  * what the hardware supports and not based upon how the OS implements the
 430  * support of different page sizes.
 431  *
 432  * If legacy is non-zero, return the number of pagesizes available to legacy
 433  * applications. The number of legacy page sizes might be less than the
 434  * exported user page sizes. This is to prevent legacy applications that
 * use the largest page size returned from getpagesizes(3C) from inadvertently
 436  * using the 'new' large pagesizes.
 437  */
 438 uint_t
 439 page_num_user_pagesizes(int legacy)
 440 {
 441         if (legacy)
 442                 return (mmu_legacy_page_sizes);
 443         return (mmu_exported_page_sizes);
 444 }
 445 
 446 uint_t
 447 page_num_pagesizes(void)
 448 {
 449         return (mmu_page_sizes);
 450 }
 451 
 452 /*
 * returns the number of base pagesize pages associated with szc
 454  */
 455 pgcnt_t
 456 page_get_pagecnt(uint_t szc)
 457 {
 458         if (szc >= mmu_page_sizes)
 459                 panic("page_get_pagecnt: out of range %d", szc);
 460         return (hw_page_array[szc].hp_pgcnt);
 461 }
 462 
 463 size_t
 464 page_get_pagesize(uint_t szc)
 465 {
 466         if (szc >= mmu_page_sizes)
 467                 panic("page_get_pagesize: out of range %d", szc);
 468         return (hw_page_array[szc].hp_size);
 469 }
 470 
 471 /*
 472  * Return the size of a page based upon the index passed in.  An index of
 473  * zero refers to the smallest page size in the system, and as index increases
 474  * it refers to the next larger supported page size in the system.
 475  * Note that szc and userszc may not be the same due to unsupported szc's on
 476  * some systems.
 477  */
 478 size_t
 479 page_get_user_pagesize(uint_t userszc)
 480 {
 481         uint_t szc = USERSZC_2_SZC(userszc);
 482 
 483         if (szc >= mmu_page_sizes)
 484                 panic("page_get_user_pagesize: out of range %d", szc);
 485         return (hw_page_array[szc].hp_size);
 486 }
 487 
 488 uint_t
 489 page_get_shift(uint_t szc)
 490 {
 491         if (szc >= mmu_page_sizes)
 492                 panic("page_get_shift: out of range %d", szc);
 493         return (PAGE_GET_SHIFT(szc));
 494 }
 495 
 496 uint_t
 497 page_get_pagecolors(uint_t szc)
 498 {
 499         if (szc >= mmu_page_sizes)
 500                 panic("page_get_pagecolors: out of range %d", szc);
 501         return (PAGE_GET_PAGECOLORS(szc));
 502 }
 503 
 504 /*
 * Compute the desired equivalent color after a split.
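 *
 * Illustration (hypothetical values): with ceq_mask == 0x0f, a requested
 * color of 0x13 and a converted ncolor of 0x2c, the result is
 * (0x13 & 0x0f) | (0x2c & ~0x0f) == 0x23: the equivalency bits come from the
 * requested color and the remaining bits from the larger page's color.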
 506  */
 507 uint_t
 508 page_correct_color(uchar_t szc, uchar_t nszc, uint_t color,
 509     uint_t ncolor, uint_t ceq_mask)
 510 {
 511         ASSERT(nszc > szc);
 512         ASSERT(szc < mmu_page_sizes);
 513         ASSERT(color < PAGE_GET_PAGECOLORS(szc));
 514         ASSERT(ncolor < PAGE_GET_PAGECOLORS(nszc));
 515 
 516         color &= ceq_mask;
 517         ncolor = PAGE_CONVERT_COLOR(ncolor, szc, nszc);
 518         return (color | (ncolor & ~ceq_mask));
 519 }
 520 
 521 /*
 522  * The interleaved_mnodes flag is set when mnodes overlap in
 523  * the physbase..physmax range, but have disjoint slices.
 524  * In this case hpm_counters is shared by all mnodes.
 525  * This flag is set dynamically by the platform.
 526  */
 527 int interleaved_mnodes = 0;
 528 
 529 /*
 530  * Called by startup().
 531  * Size up the per page size free list counters based on physmax
 532  * of each node and max_mem_nodes.
 533  *
 534  * If interleaved_mnodes is set we need to find the first mnode that
 535  * exists. hpm_counters for the first mnode will then be shared by
 536  * all other mnodes. If interleaved_mnodes is not set, just set
 537  * first=mnode each time. That means there will be no sharing.
 538  */
 539 size_t
 540 page_ctrs_sz(void)
 541 {
 542         int     r;              /* region size */
 543         int     mnode;
 544         int     firstmn;        /* first mnode that exists */
 545         int     nranges;
 546         pfn_t   physbase;
 547         pfn_t   physmax;
 548         uint_t  ctrs_sz = 0;
 549         int     i;
 550         pgcnt_t colors_per_szc[MMU_PAGE_SIZES];
 551 
 552         /*
 553          * We need to determine how many page colors there are for each
 554          * page size in order to allocate memory for any color specific
 555          * arrays.
 556          */
 557         for (i = 0; i < mmu_page_sizes; i++) {
 558                 colors_per_szc[i] = PAGE_GET_PAGECOLORS(i);
 559         }
 560 
 561         for (firstmn = -1, mnode = 0; mnode < max_mem_nodes; mnode++) {
 562 
 563                 pgcnt_t r_pgcnt;
 564                 pfn_t   r_base;
 565                 pgcnt_t r_align;
 566 
 567                 if (mem_node_config[mnode].exists == 0)
 568                         continue;
 569 
 570                 HPM_COUNTERS_LIMITS(mnode, physbase, physmax, firstmn);
 571                 nranges = MNODE_RANGE_CNT(mnode);
 572                 mnode_nranges[mnode] = nranges;
 573                 mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode);
 574 
 575                 /*
 576                  * determine size needed for page counter arrays with
 577                  * base aligned to large page size.
 578                  */
 579                 for (r = 1; r < mmu_page_sizes; r++) {
 580                         /* add in space for hpm_color_current */
 581                         ctrs_sz += sizeof (size_t) *
 582                             colors_per_szc[r] * nranges;
 583 
 584                         if (firstmn != mnode)
 585                                 continue;
 586 
 587                         /* add in space for hpm_counters */
 588                         r_align = page_get_pagecnt(r);
 589                         r_base = physbase;
 590                         r_base &= ~(r_align - 1);
 591                         r_pgcnt = howmany(physmax - r_base + 1, r_align);
 592 
 593                         /*
 594                          * Round up to always allocate on pointer sized
 595                          * boundaries.
 596                          */
 597                         ctrs_sz += P2ROUNDUP((r_pgcnt * sizeof (hpmctr_t)),
 598                             sizeof (hpmctr_t *));
 599                 }
 600         }
 601 
 602         for (r = 1; r < mmu_page_sizes; r++) {
 603                 ctrs_sz += (max_mem_nodes * sizeof (hw_page_map_t));
 604         }
 605 
 606         /* add in space for page_ctrs_cands and pcc_color_free */
 607         ctrs_sz += sizeof (pcc_info_t *) * max_mem_nodes *
 608             mmu_page_sizes * NPC_MUTEX;
 609 
 610         for (mnode = 0; mnode < max_mem_nodes; mnode++) {
 611 
 612                 if (mem_node_config[mnode].exists == 0)
 613                         continue;
 614 
 615                 nranges = mnode_nranges[mnode];
 616                 ctrs_sz += sizeof (pcc_info_t) * nranges *
 617                     mmu_page_sizes * NPC_MUTEX;
 618                 for (r = 1; r < mmu_page_sizes; r++) {
 619                         ctrs_sz += sizeof (pgcnt_t) * nranges *
 620                             colors_per_szc[r] * NPC_MUTEX;
 621                 }
 622         }
 623 
 624         /* ctr_mutex */
 625         ctrs_sz += (max_mem_nodes * NPC_MUTEX * sizeof (kmutex_t));
 626 
 627         /* size for page list counts */
 628         PLCNT_SZ(ctrs_sz);
 629 
 630         /*
 * add some slop for roundups. page_ctrs_alloc will round up the start
 632          * address of the counters to ecache_alignsize boundary for every
 633          * memory node.
 634          */
 635         return (ctrs_sz + max_mem_nodes * L2CACHE_ALIGN);
 636 }
 637 
 638 caddr_t
 639 page_ctrs_alloc(caddr_t alloc_base)
 640 {
 641         int     mnode;
 642         int     mrange, nranges;
 643         int     r;              /* region size */
 644         int     i;
 645         int     firstmn;        /* first mnode that exists */
 646         pfn_t   physbase;
 647         pfn_t   physmax;
 648         pgcnt_t colors_per_szc[MMU_PAGE_SIZES];
 649 
 650         /*
 651          * We need to determine how many page colors there are for each
 652          * page size in order to allocate memory for any color specific
 653          * arrays.
 654          */
 655         for (i = 0; i < mmu_page_sizes; i++) {
 656                 colors_per_szc[i] = PAGE_GET_PAGECOLORS(i);
 657         }
 658 
 659         for (r = 1; r < mmu_page_sizes; r++) {
 660                 page_counters[r] = (hw_page_map_t *)alloc_base;
 661                 alloc_base += (max_mem_nodes * sizeof (hw_page_map_t));
 662         }
 663 
 664         /* page_ctrs_cands and pcc_color_free array */
 665         for (i = 0; i < NPC_MUTEX; i++) {
 666                 for (r = 1; r < mmu_page_sizes; r++) {
 667 
 668                         page_ctrs_cands[i][r] = (pcc_info_t **)alloc_base;
 669                         alloc_base += sizeof (pcc_info_t *) * max_mem_nodes;
 670 
 671                         for (mnode = 0; mnode < max_mem_nodes; mnode++) {
 672                                 pcc_info_t *pi;
 673 
 674                                 if (mem_node_config[mnode].exists == 0)
 675                                         continue;
 676 
 677                                 nranges = mnode_nranges[mnode];
 678 
 679                                 pi = (pcc_info_t *)alloc_base;
 680                                 alloc_base += sizeof (pcc_info_t) * nranges;
 681                                 page_ctrs_cands[i][r][mnode] = pi;
 682 
 683                                 for (mrange = 0; mrange < nranges; mrange++) {
 684                                         pi->pcc_color_free =
 685                                             (pgcnt_t *)alloc_base;
 686                                         alloc_base += sizeof (pgcnt_t) *
 687                                             colors_per_szc[r];
 688                                         pi++;
 689                                 }
 690                         }
 691                 }
 692         }
 693 
 694         /* ctr_mutex */
 695         for (i = 0; i < NPC_MUTEX; i++) {
 696                 ctr_mutex[i] = (kmutex_t *)alloc_base;
 697                 alloc_base += (max_mem_nodes * sizeof (kmutex_t));
 698         }
 699 
 700         /* initialize page list counts */
 701         PLCNT_INIT(alloc_base);
 702 
 703         for (firstmn = -1, mnode = 0; mnode < max_mem_nodes; mnode++) {
 704 
 705                 pgcnt_t r_pgcnt;
 706                 pfn_t   r_base;
 707                 pgcnt_t r_align;
 708                 int     r_shift;
 709                 int     nranges = mnode_nranges[mnode];
 710 
 711                 if (mem_node_config[mnode].exists == 0)
 712                         continue;
 713 
 714                 HPM_COUNTERS_LIMITS(mnode, physbase, physmax, firstmn);
 715 
 716                 for (r = 1; r < mmu_page_sizes; r++) {
 717                         /*
 718                          * the page_counters base has to be aligned to the
 719                          * page count of page size code r otherwise the counts
 720                          * will cross large page boundaries.
 721                          */
 722                         r_align = page_get_pagecnt(r);
 723                         r_base = physbase;
 724                         /* base needs to be aligned - lower to aligned value */
 725                         r_base &= ~(r_align - 1);
 726                         r_pgcnt = howmany(physmax - r_base + 1, r_align);
 727                         r_shift = PAGE_BSZS_SHIFT(r);
 728 
 729                         PAGE_COUNTERS_SHIFT(mnode, r) = r_shift;
 730                         PAGE_COUNTERS_ENTRIES(mnode, r) = r_pgcnt;
 731                         PAGE_COUNTERS_BASE(mnode, r) = r_base;
 732                         for (mrange = 0; mrange < nranges; mrange++) {
 733                                 PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode,
 734                                     r, mrange) = (size_t *)alloc_base;
 735                                 alloc_base += sizeof (size_t) *
 736                                     colors_per_szc[r];
 737                         }
 738                         for (i = 0; i < colors_per_szc[r]; i++) {
 739                                 uint_t color_mask = colors_per_szc[r] - 1;
 740                                 pfn_t  pfnum = r_base;
 741                                 size_t idx;
 742                                 int mrange;
 743                                 MEM_NODE_ITERATOR_DECL(it);
 744 
 745                                 MEM_NODE_ITERATOR_INIT(pfnum, mnode, r, &it);
 746                                 if (pfnum == (pfn_t)-1) {
 747                                         idx = 0;
 748                                 } else {
 749                                         PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i,
 750                                             color_mask, color_mask, &it);
 751                                         idx = PNUM_TO_IDX(mnode, r, pfnum);
 752                                         idx = (idx >= r_pgcnt) ? 0 : idx;
 753                                 }
 754                                 for (mrange = 0; mrange < nranges; mrange++) {
 755                                         PAGE_COUNTERS_CURRENT_COLOR(mnode,
 756                                             r, i, mrange) = idx;
 757                                 }
 758                         }
 759 
 760                         /* hpm_counters may be shared by all mnodes */
 761                         if (firstmn == mnode) {
 762                                 PAGE_COUNTERS_COUNTERS(mnode, r) =
 763                                     (hpmctr_t *)alloc_base;
 764                                 alloc_base +=
 765                                     P2ROUNDUP((sizeof (hpmctr_t) * r_pgcnt),
 766                                     sizeof (hpmctr_t *));
 767                         } else {
 768                                 PAGE_COUNTERS_COUNTERS(mnode, r) =
 769                                     PAGE_COUNTERS_COUNTERS(firstmn, r);
 770                         }
 771 
 772                         /*
 773                          * Verify that PNUM_TO_IDX and IDX_TO_PNUM
 774                          * satisfy the identity requirement.
 775                          * We should be able to go from one to the other
 776                          * and get consistent values.
 777                          */
 778                         ASSERT(PNUM_TO_IDX(mnode, r,
 779                             (IDX_TO_PNUM(mnode, r, 0))) == 0);
 780                         ASSERT(IDX_TO_PNUM(mnode, r,
 781                             (PNUM_TO_IDX(mnode, r, r_base))) == r_base);
 782                 }
 783                 /*
                 * Round up the start address of the page_counters to
 785                  * cache aligned boundary for every memory node.
 786                  * page_ctrs_sz() has added some slop for these roundups.
 787                  */
 788                 alloc_base = (caddr_t)P2ROUNDUP((uintptr_t)alloc_base,
 789                     L2CACHE_ALIGN);
 790         }
 791 
 792         /* Initialize other page counter specific data structures. */
 793         for (mnode = 0; mnode < MAX_MEM_NODES; mnode++) {
 794                 rw_init(&page_ctrs_rwlock[mnode], NULL, RW_DEFAULT, NULL);
 795         }
 796 
 797         return (alloc_base);
 798 }
 799 
 800 /*
 801  * Functions to adjust region counters for each size free list.
 * The caller is responsible for acquiring the ctr_mutex lock if necessary;
 * this allows these routines to be called during startup without locks.
 804  */
 805 /* ARGSUSED */
 806 void
 807 page_ctr_add_internal(int mnode, int mtype, page_t *pp, int flags)
 808 {
 809         ssize_t         r;      /* region size */
 810         ssize_t         idx;
 811         pfn_t           pfnum;
 812         int             lckidx;
 813 
 814         ASSERT(mnode == PP_2_MEM_NODE(pp));
 815         ASSERT(mtype == PP_2_MTYPE(pp));
 816 
 817         ASSERT(pp->p_szc < mmu_page_sizes);
 818 
 819         PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags);
 820 
 821         /* no counter update needed for largest page size */
 822         if (pp->p_szc >= mmu_page_sizes - 1) {
 823                 return;
 824         }
 825 
 826         r = pp->p_szc + 1;
 827         pfnum = pp->p_pagenum;
 828         lckidx = PP_CTR_LOCK_INDX(pp);
 829 
 830         /*
 831          * Increment the count of free pages for the current
 832          * region. Continue looping up in region size incrementing
         * count if the preceding region is full.
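         * For example, using the sparc sizes from the 512k example earlier
         * in this file: adding the 64k page that makes all eight 64k
         * sub-regions of a 512k region free drives that counter to
         * FULL_REGION_CNT, so the region is recorded as a coalesce candidate
         * and the loop continues up to the 4M region.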
 834          */
 835         while (r < mmu_page_sizes) {
 836                 idx = PNUM_TO_IDX(mnode, r, pfnum);
 837 
 838                 ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
 839                 ASSERT(PAGE_COUNTERS(mnode, r, idx) < FULL_REGION_CNT(r));
 840 
 841                 if (++PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r)) {
 842                         break;
 843                 } else {
 844                         int root_mtype = PP_2_MTYPE(PP_GROUPLEADER(pp, r));
 845                         pcc_info_t *cand = &page_ctrs_cands[lckidx][r][mnode]
 846                             [MTYPE_2_MRANGE(mnode, root_mtype)];
 847 
 848                         cand->pcc_pages_free++;
 849                         cand->pcc_color_free[PP_2_BIN_SZC(pp, r)]++;
 850                 }
 851                 r++;
 852         }
 853 }
 854 
 855 void
 856 page_ctr_add(int mnode, int mtype, page_t *pp, int flags)
 857 {
 858         int             lckidx = PP_CTR_LOCK_INDX(pp);
 859         kmutex_t        *lock = &ctr_mutex[lckidx][mnode];
 860 
 861         mutex_enter(lock);
 862         page_ctr_add_internal(mnode, mtype, pp, flags);
 863         mutex_exit(lock);
 864 }
 865 
 866 void
 867 page_ctr_sub_internal(int mnode, int mtype, page_t *pp, int flags)
 868 {
 869         int             lckidx;
 870         ssize_t         r;      /* region size */
 871         ssize_t         idx;
 872         pfn_t           pfnum;
 873 
 874         ASSERT(mnode == PP_2_MEM_NODE(pp));
 875         ASSERT(mtype == PP_2_MTYPE(pp));
 876 
 877         ASSERT(pp->p_szc < mmu_page_sizes);
 878 
 879         PLCNT_DECR(pp, mnode, mtype, pp->p_szc, flags);
 880 
 881         /* no counter update needed for largest page size */
 882         if (pp->p_szc >= mmu_page_sizes - 1) {
 883                 return;
 884         }
 885 
 886         r = pp->p_szc + 1;
 887         pfnum = pp->p_pagenum;
 888         lckidx = PP_CTR_LOCK_INDX(pp);
 889 
 890         /*
 891          * Decrement the count of free pages for the current
 892          * region. Continue looping up in region size decrementing
         * count if the preceding region was full.
 894          */
 895         while (r < mmu_page_sizes) {
 896                 idx = PNUM_TO_IDX(mnode, r, pfnum);
 897 
 898                 ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
 899                 ASSERT(PAGE_COUNTERS(mnode, r, idx) > 0);
 900 
 901                 if (--PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r) - 1) {
 902                         break;
 903                 } else {
 904                         int root_mtype = PP_2_MTYPE(PP_GROUPLEADER(pp, r));
 905                         pcc_info_t *cand = &page_ctrs_cands[lckidx][r][mnode]
 906                             [MTYPE_2_MRANGE(mnode, root_mtype)];
 907 
 908                         ASSERT(cand->pcc_pages_free != 0);
 909                         ASSERT(cand->pcc_color_free[PP_2_BIN_SZC(pp, r)] != 0);
 910 
 911                         cand->pcc_pages_free--;
 912                         cand->pcc_color_free[PP_2_BIN_SZC(pp, r)]--;
 913                 }
 914                 r++;
 915         }
 916 }
 917 
 918 void
 919 page_ctr_sub(int mnode, int mtype, page_t *pp, int flags)
 920 {
 921         int             lckidx = PP_CTR_LOCK_INDX(pp);
 922         kmutex_t        *lock = &ctr_mutex[lckidx][mnode];
 923 
 924         mutex_enter(lock);
 925         page_ctr_sub_internal(mnode, mtype, pp, flags);
 926         mutex_exit(lock);
 927 }
 928 
 929 /*
 930  * Adjust page counters following a memory attach, since typically the
 931  * size of the array needs to change, and the PFN to counter index
 932  * mapping needs to change.
 933  *
 934  * It is possible this mnode did not exist at startup. In that case
 935  * allocate pcc_info_t and pcc_color_free arrays. Also, allow for nranges
 936  * to change (a theoretical possibility on x86), which means pcc_color_free
 937  * arrays must be extended.
 938  */
 939 uint_t
 940 page_ctrs_adjust(int mnode)
 941 {
 942         pgcnt_t npgs;
 943         int     r;              /* region size */
 944         int     i;
 945         size_t  pcsz, old_csz;
 946         hpmctr_t *new_ctr, *old_ctr;
 947         pfn_t   oldbase, newbase;
 948         pfn_t   physbase, physmax;
 949         size_t  old_npgs;
 950         hpmctr_t *ctr_cache[MMU_PAGE_SIZES];
 951         size_t  size_cache[MMU_PAGE_SIZES];
 952         size_t  *color_cache[MMU_PAGE_SIZES][MAX_MNODE_MRANGES];
 953         size_t  *old_color_array[MAX_MNODE_MRANGES];
 954         pgcnt_t colors_per_szc[MMU_PAGE_SIZES];
 955         pcc_info_t **cands_cache;
 956         pcc_info_t *old_pi, *pi;
 957         pgcnt_t *pgcntp;
 958         int nr, old_nranges, mrange, nranges = MNODE_RANGE_CNT(mnode);
 959         int cands_cache_nranges;
 960         int old_maxmrange, new_maxmrange;
 961         int rc = 0;
 962         int oldmnode;
 963 
 964         cands_cache = kmem_zalloc(sizeof (pcc_info_t *) * NPC_MUTEX *
 965             MMU_PAGE_SIZES, KM_NOSLEEP);
 966         if (cands_cache == NULL)
 967                 return (ENOMEM);
 968 
 969         i = -1;
 970         HPM_COUNTERS_LIMITS(mnode, physbase, physmax, i);
 971 
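        /*
         * Round the new counter base down and the new limit up to
         * PC_BASE_ALIGN so the per-size counter indices derived from them
         * stay aligned to large page boundaries (see the alignment note in
         * page_ctrs_alloc()).
         */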
 972         newbase = physbase & ~PC_BASE_ALIGN_MASK;
 973         npgs = roundup(physmax, PC_BASE_ALIGN) - newbase;
 974 
 975         /* prepare to free non-null pointers on the way out */
 976         cands_cache_nranges = nranges;
 977         bzero(ctr_cache, sizeof (ctr_cache));
 978         bzero(color_cache, sizeof (color_cache));
 979 
 980         /*
 981          * We need to determine how many page colors there are for each
 982          * page size in order to allocate memory for any color specific
 983          * arrays.
 984          */
 985         for (r = 0; r < mmu_page_sizes; r++) {
 986                 colors_per_szc[r] = PAGE_GET_PAGECOLORS(r);
 987         }
 988 
 989         /*
 990          * Preallocate all of the new hpm_counters arrays as we can't
 991          * hold the page_ctrs_rwlock as a writer and allocate memory.
 992          * If we can't allocate all of the arrays, undo our work so far
 993          * and return failure.
 994          */
 995         for (r = 1; r < mmu_page_sizes; r++) {
 996                 pcsz = npgs >> PAGE_BSZS_SHIFT(r);
 997                 size_cache[r] = pcsz;
 998                 ctr_cache[r] = kmem_zalloc(pcsz *
 999                     sizeof (hpmctr_t), KM_NOSLEEP);
1000                 if (ctr_cache[r] == NULL) {
1001                         rc = ENOMEM;
1002                         goto cleanup;
1003                 }
1004         }
1005 
1006         /*
1007          * Preallocate all of the new color current arrays as we can't
1008          * hold the page_ctrs_rwlock as a writer and allocate memory.
1009          * If we can't allocate all of the arrays, undo our work so far
1010          * and return failure.
1011          */
1012         for (r = 1; r < mmu_page_sizes; r++) {
1013                 for (mrange = 0; mrange < nranges; mrange++) {
1014                         color_cache[r][mrange] = kmem_zalloc(sizeof (size_t) *
1015                             colors_per_szc[r], KM_NOSLEEP);
1016                         if (color_cache[r][mrange] == NULL) {
1017                                 rc = ENOMEM;
1018                                 goto cleanup;
1019                         }
1020                 }
1021         }
1022 
1023         /*
1024          * Preallocate all of the new pcc_info_t arrays as we can't
1025          * hold the page_ctrs_rwlock as a writer and allocate memory.
1026          * If we can't allocate all of the arrays, undo our work so far
1027          * and return failure.
1028          */
1029         for (r = 1; r < mmu_page_sizes; r++) {
1030                 for (i = 0; i < NPC_MUTEX; i++) {
1031                         pi = kmem_zalloc(nranges * sizeof (pcc_info_t),
1032                             KM_NOSLEEP);
1033                         if (pi == NULL) {
1034                                 rc = ENOMEM;
1035                                 goto cleanup;
1036                         }
1037                         cands_cache[i * MMU_PAGE_SIZES + r] = pi;
1038 
1039                         for (mrange = 0; mrange < nranges; mrange++, pi++) {
1040                                 pgcntp = kmem_zalloc(colors_per_szc[r] *
1041                                     sizeof (pgcnt_t), KM_NOSLEEP);
1042                                 if (pgcntp == NULL) {
1043                                         rc = ENOMEM;
1044                                         goto cleanup;
1045                                 }
1046                                 pi->pcc_color_free = pgcntp;
1047                         }
1048                 }
1049         }
1050 
1051         /*
1052          * Grab the write lock to prevent others from walking these arrays
1053          * while we are modifying them.
1054          */
1055         PAGE_CTRS_WRITE_LOCK(mnode);
1056 
1057         /*
1058          * For interleaved mnodes, find the first mnode
1059          * with valid page counters since the current
1060          * mnode may have just been added and not have
1061          * valid page counters.
1062          */
1063         if (interleaved_mnodes) {
1064                 for (i = 0; i < max_mem_nodes; i++)
1065                         if (PAGE_COUNTERS_COUNTERS(i, 1) != NULL)
1066                                 break;
1067                 ASSERT(i < max_mem_nodes);
1068                 oldmnode = i;
1069         } else
1070                 oldmnode = mnode;
1071 
1072         old_nranges = mnode_nranges[mnode];
1073         cands_cache_nranges = old_nranges;
1074         mnode_nranges[mnode] = nranges;
1075         old_maxmrange = mnode_maxmrange[mnode];
1076         mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode);
1077         new_maxmrange = mnode_maxmrange[mnode];
1078 
1079         for (r = 1; r < mmu_page_sizes; r++) {
1080                 PAGE_COUNTERS_SHIFT(mnode, r) = PAGE_BSZS_SHIFT(r);
1081                 old_ctr = PAGE_COUNTERS_COUNTERS(oldmnode, r);
1082                 old_csz = PAGE_COUNTERS_ENTRIES(oldmnode, r);
1083                 oldbase = PAGE_COUNTERS_BASE(oldmnode, r);
1084                 old_npgs = old_csz << PAGE_COUNTERS_SHIFT(oldmnode, r);
1085                 for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
1086                         old_color_array[mrange] =
1087                             PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode,
1088                             r, mrange);
1089                 }
1090 
1091                 pcsz = npgs >> PAGE_COUNTERS_SHIFT(mnode, r);
1092                 new_ctr = ctr_cache[r];
1093                 ctr_cache[r] = NULL;
1094                 if (old_ctr != NULL &&
1095                     (oldbase + old_npgs > newbase) &&
1096                     (newbase + npgs > oldbase)) {
1097                         /*
1098                          * Map the intersection of the old and new
1099                          * counters into the new array.
1100                          */
1101                         size_t offset;
1102                         if (newbase > oldbase) {
1103                                 offset = (newbase - oldbase) >>
1104                                     PAGE_COUNTERS_SHIFT(mnode, r);
1105                                 bcopy(old_ctr + offset, new_ctr,
1106                                     MIN(pcsz, (old_csz - offset)) *
1107                                     sizeof (hpmctr_t));
1108                         } else {
1109                                 offset = (oldbase - newbase) >>
1110                                     PAGE_COUNTERS_SHIFT(mnode, r);
1111                                 bcopy(old_ctr, new_ctr + offset,
1112                                     MIN(pcsz - offset, old_csz) *
1113                                     sizeof (hpmctr_t));
1114                         }
1115                 }
1116 
1117                 PAGE_COUNTERS_COUNTERS(mnode, r) = new_ctr;
1118                 PAGE_COUNTERS_ENTRIES(mnode, r) = pcsz;
1119                 PAGE_COUNTERS_BASE(mnode, r) = newbase;
1120 
1121                 /* update shared hpm_counters in other mnodes */
1122                 if (interleaved_mnodes) {
1123                         for (i = 0; i < max_mem_nodes; i++) {
1124                                 if ((i == mnode) ||
1125                                     (mem_node_config[i].exists == 0))
1126                                         continue;
1127                                 ASSERT(
1128                                     PAGE_COUNTERS_COUNTERS(i, r) == old_ctr ||
1129                                     PAGE_COUNTERS_COUNTERS(i, r) == NULL);
1130                                 PAGE_COUNTERS_COUNTERS(i, r) = new_ctr;
1131                                 PAGE_COUNTERS_ENTRIES(i, r) = pcsz;
1132                                 PAGE_COUNTERS_BASE(i, r) = newbase;
1133                         }
1134                 }
1135 
1136                 for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
1137                         PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r, mrange) =
1138                             color_cache[r][mrange];
1139                         color_cache[r][mrange] = NULL;
1140                 }
1141                 /*
1142                  * for now, just reset on these events as it's probably
                 * not worthwhile to try to optimize this.
1144                  */
1145                 for (i = 0; i < colors_per_szc[r]; i++) {
1146                         uint_t color_mask = colors_per_szc[r] - 1;
1147                         int mlo = interleaved_mnodes ? 0 : mnode;
1148                         int mhi = interleaved_mnodes ? max_mem_nodes :
1149                             (mnode + 1);
1150                         int m;
1151                         pfn_t  pfnum;
1152                         size_t idx;
1153                         MEM_NODE_ITERATOR_DECL(it);
1154 
1155                         for (m = mlo; m < mhi; m++) {
1156                                 if (mem_node_config[m].exists == 0)
1157                                         continue;
1158                                 pfnum = newbase;
1159                                 MEM_NODE_ITERATOR_INIT(pfnum, m, r, &it);
1160                                 if (pfnum == (pfn_t)-1) {
1161                                         idx = 0;
1162                                 } else {
1163                                         PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i,
1164                                             color_mask, color_mask, &it);
1165                                         idx = PNUM_TO_IDX(m, r, pfnum);
1166                                         idx = (idx < pcsz) ? idx : 0;
1167                                 }
1168                                 for (mrange = 0; mrange < nranges; mrange++) {
1169                                         if (PAGE_COUNTERS_CURRENT_COLOR_ARRAY(m,
1170                                             r, mrange) != NULL)
1171                                                 PAGE_COUNTERS_CURRENT_COLOR(m,
1172                                                     r, i, mrange) = idx;
1173                                 }
1174                         }
1175                 }
1176 
1177                 /* cache info for freeing out of the critical path */
1178                 if ((caddr_t)old_ctr >= kernelheap &&
1179                     (caddr_t)old_ctr < ekernelheap) {
1180                         ctr_cache[r] = old_ctr;
1181                         size_cache[r] = old_csz;
1182                 }
1183                 for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
1184                         size_t *tmp = old_color_array[mrange];
1185                         if ((caddr_t)tmp >= kernelheap &&
1186                             (caddr_t)tmp < ekernelheap) {
1187                                 color_cache[r][mrange] = tmp;
1188                         }
1189                 }
1190                 /*
1191                  * Verify that PNUM_TO_IDX and IDX_TO_PNUM
1192                  * satisfy the identity requirement.
1193                  * We should be able to go from one to the other
1194                  * and get consistent values.
1195                  */
1196                 ASSERT(PNUM_TO_IDX(mnode, r,
1197                     (IDX_TO_PNUM(mnode, r, 0))) == 0);
1198                 ASSERT(IDX_TO_PNUM(mnode, r,
1199                     (PNUM_TO_IDX(mnode, r, newbase))) == newbase);
1200 
1201                 /* pcc_info_t and pcc_color_free */
1202                 for (i = 0; i < NPC_MUTEX; i++) {
1203                         pcc_info_t *epi;
1204                         pcc_info_t *eold_pi;
1205 
1206                         pi = cands_cache[i * MMU_PAGE_SIZES + r];
1207                         old_pi = page_ctrs_cands[i][r][mnode];
1208                         page_ctrs_cands[i][r][mnode] = pi;
1209                         cands_cache[i * MMU_PAGE_SIZES + r] = old_pi;
1210 
1211                         /* preserve old pcc_color_free values, if any */
1212                         if (old_pi == NULL)
1213                                 continue;
1214 
1215                         /*
1216                          * when/if x86 does DR, must account for
1217                          * possible change in range index when
1218                          * preserving pcc_info
1219                          */
1220                         epi = &pi[nranges];
1221                         eold_pi = &old_pi[old_nranges];
1222                         if (new_maxmrange > old_maxmrange) {
1223                                 pi += new_maxmrange - old_maxmrange;
1224                         } else if (new_maxmrange < old_maxmrange) {
1225                                 old_pi += old_maxmrange - new_maxmrange;
1226                         }
1227                         for (; pi < epi && old_pi < eold_pi; pi++, old_pi++) {
1228                                 pcc_info_t tmp = *pi;
1229                                 *pi = *old_pi;
1230                                 *old_pi = tmp;
1231                         }
1232                 }
1233         }
1234         PAGE_CTRS_WRITE_UNLOCK(mnode);
1235 
1236         /*
1237          * Now that we have dropped the write lock, it is safe to free all
1238          * of the memory we have cached above.
 * We come through here to free memory when pre-alloc fails, and also to
1240          * free old pointers which were recorded while locked.
1241          */
1242 cleanup:
1243         for (r = 1; r < mmu_page_sizes; r++) {
1244                 if (ctr_cache[r] != NULL) {
1245                         kmem_free(ctr_cache[r],
1246                             size_cache[r] * sizeof (hpmctr_t));
1247                 }
1248                 for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
1249                         if (color_cache[r][mrange] != NULL) {
1250                                 kmem_free(color_cache[r][mrange],
1251                                     colors_per_szc[r] * sizeof (size_t));
1252                         }
1253                 }
1254                 for (i = 0; i < NPC_MUTEX; i++) {
1255                         pi = cands_cache[i * MMU_PAGE_SIZES + r];
1256                         if (pi == NULL)
1257                                 continue;
1258                         nr = cands_cache_nranges;
1259                         for (mrange = 0; mrange < nr; mrange++, pi++) {
1260                                 pgcntp = pi->pcc_color_free;
1261                                 if (pgcntp == NULL)
1262                                         continue;
1263                                 if ((caddr_t)pgcntp >= kernelheap &&
1264                                     (caddr_t)pgcntp < ekernelheap) {
1265                                         kmem_free(pgcntp,
1266                                             colors_per_szc[r] *
1267                                             sizeof (pgcnt_t));
1268                                 }
1269                         }
1270                         pi = cands_cache[i * MMU_PAGE_SIZES + r];
1271                         if ((caddr_t)pi >= kernelheap &&
1272                             (caddr_t)pi < ekernelheap) {
1273                                 kmem_free(pi, nr * sizeof (pcc_info_t));
1274                         }
1275                 }
1276         }
1277 
1278         kmem_free(cands_cache,
1279             sizeof (pcc_info_t *) * NPC_MUTEX * MMU_PAGE_SIZES);
1280         return (rc);
1281 }
1282 
1283 /*
1284  * Cleanup the hpm_counters field in the page counters
1285  * array.
1286  */
1287 void
1288 page_ctrs_cleanup(void)
1289 {
1290         int r;  /* region size */
1291         int i;  /* mnode index */
1292 
1293         /*
1294          * Get the page counters write lock while we are
1295          * setting the page hpm_counters field to NULL
1296          * for non-existent mnodes.
1297          */
1298         for (i = 0; i < max_mem_nodes; i++) {
1299                 PAGE_CTRS_WRITE_LOCK(i);
1300                 if (mem_node_config[i].exists) {
1301                         PAGE_CTRS_WRITE_UNLOCK(i);
1302                         continue;
1303                 }
1304                 for (r = 1; r < mmu_page_sizes; r++) {
1305                         PAGE_COUNTERS_COUNTERS(i, r) = NULL;
1306                 }
1307                 PAGE_CTRS_WRITE_UNLOCK(i);
1308         }
1309 }
1310 
1311 #ifdef DEBUG
1312 
1313 /*
1314  * confirm pp is a large page corresponding to szc
1315  */
1316 void
1317 chk_lpg(page_t *pp, uchar_t szc)
1318 {
1319         spgcnt_t npgs = page_get_pagecnt(pp->p_szc);
1320         uint_t noreloc;
1321 
1322         if (npgs == 1) {
1323                 ASSERT(pp->p_szc == 0);
1324                 ASSERT(pp->p_next == pp);
1325                 ASSERT(pp->p_prev == pp);
1326                 return;
1327         }
1328 
1329         ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL);
1330         ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL);
1331 
1332         ASSERT(IS_P2ALIGNED(pp->p_pagenum, npgs));
1333         ASSERT(pp->p_pagenum == (pp->p_next->p_pagenum - 1));
1334         ASSERT(pp->p_prev->p_pagenum == (pp->p_pagenum + (npgs - 1)));
1335         ASSERT(pp->p_prev == (pp + (npgs - 1)));
1336 
1337         /*
1338          * Check list of pages.
1339          */
1340         noreloc = PP_ISNORELOC(pp);
1341         while (npgs--) {
1342                 if (npgs != 0) {
1343                         ASSERT(pp->p_pagenum == pp->p_next->p_pagenum - 1);
1344                         ASSERT(pp->p_next == (pp + 1));
1345                 }
1346                 ASSERT(pp->p_szc == szc);
1347                 ASSERT(PP_ISFREE(pp));
1348                 ASSERT(PP_ISAGED(pp));
1349                 ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL);
1350                 ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL);
1351                 ASSERT(pp->p_vnode  == NULL);
1352                 ASSERT(PP_ISNORELOC(pp) == noreloc);
1353 
1354                 pp = pp->p_next;
1355         }
1356 }
1357 #endif /* DEBUG */
1358 
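/*
 * Acquire/release every freelist and cachelist mutex for the given mnode.
 * Holding all NPC_MUTEX locks freezes the freelists, which the page
 * promotion/demotion code relies on to keep p_szc from changing.
 */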
1359 void
1360 page_freelist_lock(int mnode)
1361 {
1362         int i;
1363         for (i = 0; i < NPC_MUTEX; i++) {
1364                 mutex_enter(FPC_MUTEX(mnode, i));
1365                 mutex_enter(CPC_MUTEX(mnode, i));
1366         }
1367 }
1368 
1369 void
1370 page_freelist_unlock(int mnode)
1371 {
1372         int i;
1373         for (i = 0; i < NPC_MUTEX; i++) {
1374                 mutex_exit(FPC_MUTEX(mnode, i));
1375                 mutex_exit(CPC_MUTEX(mnode, i));
1376         }
1377 }
1378 
1379 /*
1380  * add pp to the specified page list. Defaults to head of the page list
1381  * unless PG_LIST_TAIL is specified.
1382  */
1383 void
1384 page_list_add(page_t *pp, int flags)
1385 {
1386         page_t          **ppp;
1387         kmutex_t        *pcm;
1388         uint_t          bin, mtype;
1389         int             mnode;
1390 
1391         ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT));
1392         ASSERT(PP_ISFREE(pp));
1393         ASSERT(!hat_page_is_mapped(pp));
1394         ASSERT(hat_page_getshare(pp) == 0);
1395 
1396         /*
1397          * Large pages should be freed via page_list_add_pages().
1398          */
1399         ASSERT(pp->p_szc == 0);
1400 
1401         /*
1402          * Don't need to lock the freelist first here
1403          * because the page isn't on the freelist yet.
1404          * This means p_szc can't change on us.
1405          */
1406 
1407         bin = PP_2_BIN(pp);
1408         mnode = PP_2_MEM_NODE(pp);
1409         mtype = PP_2_MTYPE(pp);
1410 
1411         if (flags & PG_LIST_ISINIT) {
1412                 /*
1413                  * PG_LIST_ISINIT is set during system startup (i.e. single
1414                  * threaded), so add the page to the free list and update the
1415                  * free region counters without any locking
1416                  */
1417                 ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
1418 
1419                 /* inline version of page_add() */
1420                 if (*ppp != NULL) {
1421                         pp->p_next = *ppp;
1422                         pp->p_prev = (*ppp)->p_prev;
1423                         (*ppp)->p_prev = pp;
1424                         pp->p_prev->p_next = pp;
1425                 } else
1426                         *ppp = pp;
1427 
1428                 page_ctr_add_internal(mnode, mtype, pp, flags);
1429                 VM_STAT_ADD(vmm_vmstats.pladd_free[0]);
1430         } else {
1431                 pcm = PC_BIN_MUTEX(mnode, bin, flags);
1432 
1433                 if (flags & PG_FREE_LIST) {
1434                         VM_STAT_ADD(vmm_vmstats.pladd_free[0]);
1435                         ASSERT(PP_ISAGED(pp));
1436                         ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
1437 
1438                 } else {
1439                         VM_STAT_ADD(vmm_vmstats.pladd_cache);
1440                         ASSERT(pp->p_vnode);
1441                         ASSERT((pp->p_offset & PAGEOFFSET) == 0);
1442                         ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
1443                 }
1444                 mutex_enter(pcm);
1445                 page_add(ppp, pp);
1446 
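                /*
                 * page_add() inserted pp at the head of the circular list;
                 * advancing the head pointer past it leaves pp at the tail.
                 */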
1447                 if (flags & PG_LIST_TAIL)
1448                         *ppp = (*ppp)->p_next;
1449                 /*
1450                  * Add counters before releasing pcm mutex to avoid a race with
1451                  * page_freelist_coalesce and page_freelist_split.
1452                  */
1453                 page_ctr_add(mnode, mtype, pp, flags);
1454                 mutex_exit(pcm);
1455         }
1456 
1457 
1458 #if defined(__sparc)
1459         if (PP_ISNORELOC(pp)) {
1460                 kcage_freemem_add(1);
1461         }
1462 #endif
1463         /*
1464          * It is up to the caller to unlock the page!
1465          */
1466         ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT));
1467 }
1468 
1469 
1470 #ifdef __sparc
1471 /*
1472  * This routine is only used by kcage_init during system startup.
1473  * It performs the function of page_list_sub/PP_SETNORELOC/page_list_add
1474  * without the overhead of taking locks and updating counters.
1475  */
1476 void
1477 page_list_noreloc_startup(page_t *pp)
1478 {
1479         page_t          **ppp;
1480         uint_t          bin;
1481         int             mnode;
1482         int             mtype;
1483         int             flags = 0;
1484 
1485         /*
1486          * If this is a large page on the freelist then
1487          * break it up into smaller pages.
1488          */
1489         if (pp->p_szc != 0)
1490                 page_boot_demote(pp);
1491 
1492         /*
1493          * Get the list the page is currently on.
1494          */
1495         bin = PP_2_BIN(pp);
1496         mnode = PP_2_MEM_NODE(pp);
1497         mtype = PP_2_MTYPE(pp);
1498         ASSERT(mtype == MTYPE_RELOC);
1499         ASSERT(pp->p_szc == 0);
1500 
1501         if (PP_ISAGED(pp)) {
1502                 ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
1503                 flags |= PG_FREE_LIST;
1504         } else {
1505                 ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
1506                 flags |= PG_CACHE_LIST;
1507         }
1508 
1509         ASSERT(*ppp != NULL);
1510 
1511         /*
1512          * Delete page from current list.
1513          */
1514         if (*ppp == pp)
1515                 *ppp = pp->p_next;           /* go to next page */
1516         if (*ppp == pp) {
1517                 *ppp = NULL;                    /* page list is gone */
1518         } else {
1519                 pp->p_prev->p_next = pp->p_next;
1520                 pp->p_next->p_prev = pp->p_prev;
1521         }
1522 
1523         /*
1524          * Decrement page counters
1525          */
1526         page_ctr_sub_internal(mnode, mtype, pp, flags);
1527 
1528         /*
1529          * Set no reloc for cage initted pages.
1530          */
1531         PP_SETNORELOC(pp);
1532 
1533         mtype = PP_2_MTYPE(pp);
1534         ASSERT(mtype == MTYPE_NORELOC);
1535 
1536         /*
1537          * Get new list for page.
1538          */
1539         if (PP_ISAGED(pp)) {
1540                 ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
1541         } else {
1542                 ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
1543         }
1544 
1545         /*
1546          * Insert page on new list.
1547          */
1548         if (*ppp == NULL) {
1549                 *ppp = pp;
1550                 pp->p_next = pp->p_prev = pp;
1551         } else {
1552                 pp->p_next = *ppp;
1553                 pp->p_prev = (*ppp)->p_prev;
1554                 (*ppp)->p_prev = pp;
1555                 pp->p_prev->p_next = pp;
1556         }
1557 
1558         /*
1559          * Increment page counters
1560          */
1561         page_ctr_add_internal(mnode, mtype, pp, flags);
1562 
1563         /*
1564          * Update cage freemem counter
1565          */
1566         atomic_add_long(&kcage_freemem, 1);
1567 }
1568 #else   /* __sparc */
1569 
1570 /* ARGSUSED */
1571 void
1572 page_list_noreloc_startup(page_t *pp)
1573 {
1574         panic("page_list_noreloc_startup: should be here only for sparc");
1575 }
1576 #endif
1577 
1578 void
1579 page_list_add_pages(page_t *pp, int flags)
1580 {
1581         kmutex_t *pcm;
1582         pgcnt_t pgcnt;
1583         uint_t  bin, mtype, i;
1584         int     mnode;
1585 
1586         /* default to freelist/head */
1587         ASSERT((flags & (PG_CACHE_LIST | PG_LIST_TAIL)) == 0);
1588 
1589         CHK_LPG(pp, pp->p_szc);
1590         VM_STAT_ADD(vmm_vmstats.pladd_free[pp->p_szc]);
1591 
1592         bin = PP_2_BIN(pp);
1593         mnode = PP_2_MEM_NODE(pp);
1594         mtype = PP_2_MTYPE(pp);
1595 
1596         if (flags & PG_LIST_ISINIT) {
1597                 ASSERT(pp->p_szc == mmu_page_sizes - 1);
1598                 page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1599                 ASSERT(!PP_ISNORELOC(pp));
1600                 PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags);
1601         } else {
1602 
1603                 ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes);
1604 
1605                 pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
1606 
1607                 mutex_enter(pcm);
1608                 page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1609                 page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
1610                 mutex_exit(pcm);
1611 
1612                 pgcnt = page_get_pagecnt(pp->p_szc);
1613 #if defined(__sparc)
1614                 if (PP_ISNORELOC(pp))
1615                         kcage_freemem_add(pgcnt);
1616 #endif
1617                 for (i = 0; i < pgcnt; i++, pp++)
1618                         page_unlock_nocapture(pp);
1619         }
1620 }
1621 
1622 /*
1623  * During boot we need to demote a large page to base
1624  * pagesize pages for seg_kmem's use in boot_alloc()
1625  */
1626 void
1627 page_boot_demote(page_t *pp)
1628 {
1629         ASSERT(pp->p_szc != 0);
1630         ASSERT(PP_ISFREE(pp));
1631         ASSERT(PP_ISAGED(pp));
1632 
1633         (void) page_demote(PP_2_MEM_NODE(pp),
1634             PFN_BASE(pp->p_pagenum, pp->p_szc), 0, pp->p_szc, 0, PC_NO_COLOR,
1635             PC_FREE);
1636 
1637         ASSERT(PP_ISFREE(pp));
1638         ASSERT(PP_ISAGED(pp));
1639         ASSERT(pp->p_szc == 0);
1640 }
1641 
1642 /*
1643  * Take a particular page off whatever freelist the page
1644  * is claimed to be on.
1645  *
1646  * NOTE: Only used for PAGESIZE pages.
1647  */
1648 void
1649 page_list_sub(page_t *pp, int flags)
1650 {
1651         int             bin;
1652         uint_t          mtype;
1653         int             mnode;
1654         kmutex_t        *pcm;
1655         page_t          **ppp;
1656 
1657         ASSERT(PAGE_EXCL(pp));
1658         ASSERT(PP_ISFREE(pp));
1659 
1660         /*
1661          * The p_szc field can only be changed by page_promote()
1662          * and page_demote(). Only free pages can be promoted and
1663          * demoted and the free list MUST be locked during these
1664          * operations. So to prevent a race in page_list_sub()
1665          * between computing which bin of the freelist lock to
1666          * grab and actually grabbing the lock, we check again that
1667          * the bin we locked is still the correct one. Notice that
1668          * the p_szc field could have actually changed on us but
1669          * if the bin happens to still be the same we are safe.
1670          */
1671 try_again:
1672         bin = PP_2_BIN(pp);
1673         mnode = PP_2_MEM_NODE(pp);
1674         pcm = PC_BIN_MUTEX(mnode, bin, flags);
1675         mutex_enter(pcm);
1676         if (PP_2_BIN(pp) != bin) {
1677                 mutex_exit(pcm);
1678                 goto try_again;
1679         }
1680         mtype = PP_2_MTYPE(pp);
1681 
1682         if (flags & PG_FREE_LIST) {
1683                 VM_STAT_ADD(vmm_vmstats.plsub_free[0]);
1684                 ASSERT(PP_ISAGED(pp));
1685                 ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype);
1686         } else {
1687                 VM_STAT_ADD(vmm_vmstats.plsub_cache);
1688                 ASSERT(!PP_ISAGED(pp));
1689                 ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
1690         }
1691 
1692         /*
1693          * Common PAGESIZE case.
1694          *
1695          * Note that we locked the freelist. This prevents
1696          * any page promotion/demotion operations. Therefore
1697          * the p_szc will not change until we drop pcm mutex.
1698          */
1699         if (pp->p_szc == 0) {
1700                 page_sub(ppp, pp);
1701                 /*
1702                  * Subtract counters before releasing pcm mutex
1703                  * to avoid race with page_freelist_coalesce.
1704                  */
1705                 page_ctr_sub(mnode, mtype, pp, flags);
1706                 mutex_exit(pcm);
1707 
1708 #if defined(__sparc)
1709                 if (PP_ISNORELOC(pp)) {
1710                         kcage_freemem_sub(1);
1711                 }
1712 #endif
1713                 return;
1714         }
1715 
1716         /*
1717          * Large pages on the cache list are not supported.
1718          */
1719         if (flags & PG_CACHE_LIST)
1720                 panic("page_list_sub: large page on cachelist");
1721 
1722         /*
1723          * Slow but rare.
1724          *
1725          * Somebody wants this particular page which is part
1726          * of a large page. In this case we just demote the page
1727          * if it's on the freelist.
1728          *
1729          * We have to drop pcm before locking the entire freelist.
1730          * Once we have re-locked the freelist check to make sure
1731          * the page hasn't already been demoted or completely
1732          * freed.
1733          */
1734         mutex_exit(pcm);
1735         page_freelist_lock(mnode);
1736         if (pp->p_szc != 0) {
1737                 /*
1738                  * Large page is on freelist.
1739                  */
1740                 (void) page_demote(mnode, PFN_BASE(pp->p_pagenum, pp->p_szc),
1741                     0, pp->p_szc, 0, PC_NO_COLOR, PC_FREE);
1742         }
1743         ASSERT(PP_ISFREE(pp));
1744         ASSERT(PP_ISAGED(pp));
1745         ASSERT(pp->p_szc == 0);
1746 
1747         /*
1748          * Subtract counters before releasing pcm mutex
1749          * to avoid race with page_freelist_coalesce.
1750          */
1751         bin = PP_2_BIN(pp);
1752         mtype = PP_2_MTYPE(pp);
1753         ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype);
1754 
1755         page_sub(ppp, pp);
1756         page_ctr_sub(mnode, mtype, pp, flags);
1757         page_freelist_unlock(mnode);
1758 
1759 #if defined(__sparc)
1760         if (PP_ISNORELOC(pp)) {
1761                 kcage_freemem_sub(1);
1762         }
1763 #endif
1764 }
1765 
1766 void
1767 page_list_sub_pages(page_t *pp, uint_t szc)
1768 {
1769         kmutex_t *pcm;
1770         uint_t  bin, mtype;
1771         int     mnode;
1772 
1773         ASSERT(PAGE_EXCL(pp));
1774         ASSERT(PP_ISFREE(pp));
1775         ASSERT(PP_ISAGED(pp));
1776 
1777         /*
1778          * See comment in page_list_sub().
1779          */
1780 try_again:
1781         bin = PP_2_BIN(pp);
1782         mnode = PP_2_MEM_NODE(pp);
1783         pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
1784         mutex_enter(pcm);
1785         if (PP_2_BIN(pp) != bin) {
1786                 mutex_exit(pcm);
1787                 goto    try_again;
1788         }
1789 
1790         /*
1791          * If we're called with a page larger than szc or it got
1792          * promoted above szc before we locked the freelist then
1793          * drop pcm and re-lock entire freelist. If page still larger
1794          * than szc then demote it.
1795          */
1796         if (pp->p_szc > szc) {
1797                 mutex_exit(pcm);
1798                 pcm = NULL;
1799                 page_freelist_lock(mnode);
1800                 if (pp->p_szc > szc) {
1801                         VM_STAT_ADD(vmm_vmstats.plsubpages_szcbig);
1802                         (void) page_demote(mnode,
1803                             PFN_BASE(pp->p_pagenum, pp->p_szc), 0,
1804                             pp->p_szc, szc, PC_NO_COLOR, PC_FREE);
1805                 }
1806                 bin = PP_2_BIN(pp);
1807         }
1808         ASSERT(PP_ISFREE(pp));
1809         ASSERT(PP_ISAGED(pp));
1810         ASSERT(pp->p_szc <= szc);
1811         ASSERT(pp == PP_PAGEROOT(pp));
1812 
1813         VM_STAT_ADD(vmm_vmstats.plsub_free[pp->p_szc]);
1814 
1815         mtype = PP_2_MTYPE(pp);
1816         if (pp->p_szc != 0) {
1817                 page_vpsub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1818                 CHK_LPG(pp, pp->p_szc);
1819         } else {
1820                 VM_STAT_ADD(vmm_vmstats.plsubpages_szc0);
1821                 page_sub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1822         }
1823         page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);
1824 
1825         if (pcm != NULL) {
1826                 mutex_exit(pcm);
1827         } else {
1828                 page_freelist_unlock(mnode);
1829         }
1830 
1831 #if defined(__sparc)
1832         if (PP_ISNORELOC(pp)) {
1833                 pgcnt_t pgcnt;
1834 
1835                 pgcnt = page_get_pagecnt(pp->p_szc);
1836                 kcage_freemem_sub(pgcnt);
1837         }
1838 #endif
1839 }
1840 
1841 /*
1842  * Add the page to the front of a linked list of pages
1843  * using the p_next & p_prev pointers for the list.
1844  * The caller is responsible for protecting the list pointers.
1845  */
1846 void
1847 mach_page_add(page_t **ppp, page_t *pp)
1848 {
1849         if (*ppp == NULL) {
1850                 pp->p_next = pp->p_prev = pp;
1851         } else {
1852                 pp->p_next = *ppp;
1853                 pp->p_prev = (*ppp)->p_prev;
1854                 (*ppp)->p_prev = pp;
1855                 pp->p_prev->p_next = pp;
1856         }
1857         *ppp = pp;
1858 }
1859 
1860 /*
1861  * Remove this page from a linked list of pages
1862  * using the p_next & p_prev pointers for the list.
1863  *
1864  * The caller is responsible for protecting the list pointers.
1865  */
1866 void
1867 mach_page_sub(page_t **ppp, page_t *pp)
1868 {
1869         ASSERT(PP_ISFREE(pp));
1870 
1871         if (*ppp == NULL || pp == NULL)
1872                 panic("mach_page_sub");
1873 
1874         if (*ppp == pp)
1875                 *ppp = pp->p_next;           /* go to next page */
1876 
1877         if (*ppp == pp)
1878                 *ppp = NULL;                    /* page list is gone */
1879         else {
1880                 pp->p_prev->p_next = pp->p_next;
1881                 pp->p_next->p_prev = pp->p_prev;
1882         }
1883         pp->p_prev = pp->p_next = pp;             /* make pp a list of one */
1884 }
1885 
1886 /*
1887  * Routine fsflush uses to gradually coalesce the free list into larger pages.
1888  */
1889 void
1890 page_promote_size(page_t *pp, uint_t cur_szc)
1891 {
1892         pfn_t pfn;
1893         int mnode;
1894         int idx;
1895         int new_szc = cur_szc + 1;
1896         int full = FULL_REGION_CNT(new_szc);
1897 
1898         pfn = page_pptonum(pp);
1899         mnode = PFN_2_MEM_NODE(pfn);
1900 
1901         page_freelist_lock(mnode);
1902 
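        /*
         * Only promote when the region counter shows that every constituent
         * page of the next larger region is currently free.
         */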
1903         idx = PNUM_TO_IDX(mnode, new_szc, pfn);
1904         if (PAGE_COUNTERS(mnode, new_szc, idx) == full)
1905                 (void) page_promote(mnode, pfn, new_szc, PC_FREE, PC_MTYPE_ANY);
1906 
1907         page_freelist_unlock(mnode);
1908 }
1909 
1910 static uint_t page_promote_err;
1911 static uint_t page_promote_noreloc_err;
1912 
1913 /*
1914  * Create a single larger page (of szc new_szc) from smaller contiguous pages
1915  * for the given mnode starting at pfnum. Pages involved are on the freelist
1916  * before the call and may be returned to the caller if requested, otherwise
1917  * they will be placed back on the freelist.
1918  * If flags is PC_ALLOC, then the large page will be returned to the user in
1919  * a state which is consistent with a page being taken off the freelist.  If
1920  * we failed to lock the new large page, then we will return NULL to the
1921  * caller and put the large page on the freelist instead.
1922  * If flags is PC_FREE, then the large page will be placed on the freelist,
1923  * and NULL will be returned.
1924  * The caller is responsible for locking the freelist as well as any other
1925  * accounting which needs to be done for a returned page.
1926  *
1927  * RFE: For performance pass in pp instead of pfnum so
1928  *      we can avoid excessive calls to page_numtopp_nolock().
1929  *      This would depend on an assumption that all contiguous
1930  *      pages are in the same memseg so we can just add/dec
1931  *      our pp.
1932  *
1933  * Lock ordering:
1934  *
1935  *      There is a potential but rare deadlock situation
1936  *      for page promotion and demotion operations. The problem
1937  *      is that there are two paths into the freelist manager and
1938  *      they have different lock orders:
1939  *
1940  *      page_create()
1941  *              lock freelist
1942  *              page_lock(EXCL)
1943  *              unlock freelist
1944  *              return
1945  *              caller drops page_lock
1946  *
1947  *      page_free() and page_reclaim()
1948  *              caller grabs page_lock(EXCL)
1949  *
1950  *              lock freelist
1951  *              unlock freelist
1952  *              drop page_lock
1953  *
1954  *      What prevents a thread in page_create() from deadlocking
1955  *      with a thread freeing or reclaiming the same page is the
1956  *      page_trylock() in page_get_freelist(). If the trylock fails
1957  *      it skips the page.
1958  *
1959  *      The lock ordering for promotion and demotion is the same as
1960  *      for page_create(). Since the same deadlock could occur during
1961  *      page promotion and freeing or reclaiming of a page on the
1962  *      cache list we might have to fail the operation and undo what
1963  *      we have done so far. Again, this is rare.
1964  */
1965 page_t *
1966 page_promote(int mnode, pfn_t pfnum, uchar_t new_szc, int flags, int mtype)
1967 {
1968         page_t          *pp, *pplist, *tpp, *start_pp;
1969         pgcnt_t         new_npgs, npgs;
1970         uint_t          bin;
1971         pgcnt_t         tmpnpgs, pages_left;
1972         uint_t          noreloc;
1973         int             which_list;
1974         ulong_t         index;
1975         kmutex_t        *phm;
1976 
1977         /*
1978          * General algorithm:
1979          * Find the starting page
1980          * Walk each page struct removing it from the freelist,
1981          * and linking it to all the other pages removed.
1982          * Once all pages are off the freelist,
1983          * walk the list, modifying p_szc to new_szc and updating
1984          * whatever else is needed to create a large free page.
1985          * According to the flags, either return the page or put it
1986          * on the freelist.
1987          */
1988 
1989         start_pp = page_numtopp_nolock(pfnum);
1990         ASSERT(start_pp && (start_pp->p_pagenum == pfnum));
1991         new_npgs = page_get_pagecnt(new_szc);
1992         ASSERT(IS_P2ALIGNED(pfnum, new_npgs));
1993 
1994         /* don't return page of the wrong mtype */
1995         if (mtype != PC_MTYPE_ANY && mtype != PP_2_MTYPE(start_pp))
1996                 return (NULL);
1997 
1998         /*
1999          * Loop through smaller pages to confirm that all pages
2000          * give the same result for PP_ISNORELOC().
2001          * We can check this reliably here as the protocol for setting
2002          * P_NORELOC requires pages to be taken off the free list first.
2003          */
2004         noreloc = PP_ISNORELOC(start_pp);
2005         for (pp = start_pp + new_npgs; --pp > start_pp; ) {
2006                 if (noreloc != PP_ISNORELOC(pp)) {
2007                         page_promote_noreloc_err++;
2008                         page_promote_err++;
2009                         return (NULL);
2010                 }
2011         }
2012 
2013         pages_left = new_npgs;
2014         pplist = NULL;
2015         pp = start_pp;
2016 
2017         /* Loop around coalescing the smaller pages into a big page. */
2018         while (pages_left) {
2019                 /*
2020                  * Remove from the freelist.
2021                  */
2022                 ASSERT(PP_ISFREE(pp));
2023                 bin = PP_2_BIN(pp);
2024                 ASSERT(mnode == PP_2_MEM_NODE(pp));
2025                 mtype = PP_2_MTYPE(pp);
2026                 if (PP_ISAGED(pp)) {
2027 
2028                         /*
2029                          * PG_FREE_LIST
2030                          */
2031                         if (pp->p_szc) {
2032                                 page_vpsub(&PAGE_FREELISTS(mnode,
2033                                     pp->p_szc, bin, mtype), pp);
2034                         } else {
2035                                 mach_page_sub(&PAGE_FREELISTS(mnode, 0,
2036                                     bin, mtype), pp);
2037                         }
2038                         which_list = PG_FREE_LIST;
2039                 } else {
2040                         ASSERT(pp->p_szc == 0);
2041 
2042                         /*
2043                          * PG_CACHE_LIST
2044                          *
2045                          * Since this page comes from the
2046                          * cachelist, we must destroy the
2047                          * vnode association.
2048                          */
2049                         if (!page_trylock(pp, SE_EXCL)) {
2050                                 goto fail_promote;
2051                         }
2052 
2053                         /*
2054                          * We need to be careful not to deadlock
2055                          * with another thread in page_lookup().
2056                          * The page_lookup() thread could be holding
2057                          * the same phm that we need if the two
2058                          * pages happen to hash to the same phm lock.
2059                          * At this point we have locked the entire
2060                          * freelist and page_lookup() could be trying
2061                          * to grab a freelist lock.
2062                          */
2063                         index = PAGE_HASH_FUNC(pp->p_vnode, pp->p_offset);
2064                         phm = PAGE_HASH_MUTEX(index);
2065                         if (!mutex_tryenter(phm)) {
2066                                 page_unlock_nocapture(pp);
2067                                 goto fail_promote;
2068                         }
2069 
2070                         mach_page_sub(&PAGE_CACHELISTS(mnode, bin, mtype), pp);
2071                         page_hashout(pp, phm);
2072                         mutex_exit(phm);
2073                         PP_SETAGED(pp);
2074                         page_unlock_nocapture(pp);
2075                         which_list = PG_CACHE_LIST;
2076                 }
2077                 page_ctr_sub(mnode, mtype, pp, which_list);
2078 
2079                 /*
2080                  * Concatenate the smaller page(s) onto
2081                  * the large page list.
2082                  */
2083                 tmpnpgs = npgs = page_get_pagecnt(pp->p_szc);
2084                 pages_left -= npgs;
2085                 tpp = pp;
2086                 while (npgs--) {
2087                         tpp->p_szc = new_szc;
2088                         tpp = tpp->p_next;
2089                 }
2090                 page_list_concat(&pplist, &pp);
2091                 pp += tmpnpgs;
2092         }
2093         CHK_LPG(pplist, new_szc);
2094 
2095         /*
2096          * return the page to the user if requested
2097          * in the properly locked state.
2098          */
2099         if (flags == PC_ALLOC && (page_trylock_cons(pplist, SE_EXCL))) {
2100                 return (pplist);
2101         }
2102 
2103         /*
2104          * Otherwise place the new large page on the freelist
2105          */
2106         bin = PP_2_BIN(pplist);
2107         mnode = PP_2_MEM_NODE(pplist);
2108         mtype = PP_2_MTYPE(pplist);
2109         page_vpadd(&PAGE_FREELISTS(mnode, new_szc, bin, mtype), pplist);
2110 
2111         page_ctr_add(mnode, mtype, pplist, PG_FREE_LIST);
2112         return (NULL);
2113 
2114 fail_promote:
2115         /*
2116          * A thread must have still been freeing or
2117          * reclaiming the page on the cachelist.
2118          * To prevent a deadlock undo what we have
2119          * done so far and return failure. This
2120          * situation can only happen while promoting
2121          * PAGESIZE pages.
2122          */
2123         page_promote_err++;
2124         while (pplist) {
2125                 pp = pplist;
2126                 mach_page_sub(&pplist, pp);
2127                 pp->p_szc = 0;
2128                 bin = PP_2_BIN(pp);
2129                 mtype = PP_2_MTYPE(pp);
2130                 mach_page_add(&PAGE_FREELISTS(mnode, 0, bin, mtype), pp);
2131                 page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
2132         }
2133         return (NULL);
2134 
2135 }
2136 
2137 /*
2138  * Break up a large page into smaller size pages.
2139  * Pages involved are on the freelist before the call and may
2140  * be returned to the caller if requested, otherwise they will
2141  * be placed back on the freelist.
2142  * The caller is responsible for locking the freelist as well as any other
2143  * accounting which needs to be done for a returned page.
2144  * If flags is not PC_ALLOC, the color argument is ignored; technically any
2145  * value may be passed in, but PC_NO_COLOR is the standard and should be
2146  * used for clarity's sake.
2147  * If pfnmax is non-zero, the returned page has a pfn below pfnmax.
2148  */
2149 page_t *
2150 page_demote(int mnode, pfn_t pfnum, pfn_t pfnmax, uchar_t cur_szc,
2151     uchar_t new_szc, int color, int flags)
2152 {
2153         page_t  *pp, *pplist, *npplist;
2154         pgcnt_t npgs, n;
2155         uint_t  bin;
2156         uint_t  mtype;
2157         page_t  *ret_pp = NULL;
2158 
2159         ASSERT(cur_szc != 0);
2160         ASSERT(new_szc < cur_szc);
2161 
2162         pplist = page_numtopp_nolock(pfnum);
2163         ASSERT(pplist != NULL);
2164 
2165         ASSERT(pplist->p_szc == cur_szc);
2166 
2167         bin = PP_2_BIN(pplist);
2168         ASSERT(mnode == PP_2_MEM_NODE(pplist));
2169         mtype = PP_2_MTYPE(pplist);
2170         page_vpsub(&PAGE_FREELISTS(mnode, cur_szc, bin, mtype), pplist);
2171 
2172         CHK_LPG(pplist, cur_szc);
2173         page_ctr_sub(mnode, mtype, pplist, PG_FREE_LIST);
2174 
2175         /*
2176          * Number of PAGESIZE pages for smaller new_szc
2177          * page.
2178          */
2179         npgs = page_get_pagecnt(new_szc);
2180 
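        /*
         * Walk the constituent pages of the original large page, carving off
         * new_szc sized chunks (single PAGESIZE pages when new_szc is 0) and
         * returning them to the freelists, except possibly for one chunk
         * handed back to the caller when flags is PC_ALLOC.
         */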
2181         while (pplist) {
2182                 pp = pplist;
2183 
2184                 ASSERT(pp->p_szc == cur_szc);
2185 
2186                 /*
2187                  * We either break it up into PAGESIZE pages or larger.
2188                  */
2189                 if (npgs == 1) {        /* PAGESIZE case */
2190                         mach_page_sub(&pplist, pp);
2191                         ASSERT(pp->p_szc == cur_szc);
2192                         ASSERT(new_szc == 0);
2193                         ASSERT(mnode == PP_2_MEM_NODE(pp));
2194                         pp->p_szc = new_szc;
2195                         bin = PP_2_BIN(pp);
2196                         if ((bin == color) && (flags == PC_ALLOC) &&
2197                             (ret_pp == NULL) && (pfnmax == 0 ||
2198                             pp->p_pagenum < pfnmax) &&
2199                             page_trylock_cons(pp, SE_EXCL)) {
2200                                 ret_pp = pp;
2201                         } else {
2202                                 mtype = PP_2_MTYPE(pp);
2203                                 mach_page_add(&PAGE_FREELISTS(mnode, 0, bin,
2204                                     mtype), pp);
2205                                 page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
2206                         }
2207                 } else {
2208                         page_t *try_to_return_this_page = NULL;
2209                         int count = 0;
2210 
2211                         /*
2212                          * Break down into smaller lists of pages.
2213                          */
2214                         page_list_break(&pplist, &npplist, npgs);
2215 
2216                         pp = pplist;
2217                         n = npgs;
2218                         while (n--) {
2219                                 ASSERT(pp->p_szc == cur_szc);
2220                                 /*
2221                                  * Check whether all the pages in this list
2222                                  * fit the request criteria.
2223                                  */
2224                                 if (pfnmax == 0 || pp->p_pagenum < pfnmax) {
2225                                         count++;
2226                                 }
2227                                 pp->p_szc = new_szc;
2228                                 pp = pp->p_next;
2229                         }
2230 
2231                         if (count == npgs &&
2232                             (pfnmax == 0 || pp->p_pagenum < pfnmax)) {
2233                                 try_to_return_this_page = pp;
2234                         }
2235 
2236                         CHK_LPG(pplist, new_szc);
2237 
2238                         bin = PP_2_BIN(pplist);
2239                         if (try_to_return_this_page)
2240                                 ASSERT(mnode ==
2241                                     PP_2_MEM_NODE(try_to_return_this_page));
2242                         if ((bin == color) && (flags == PC_ALLOC) &&
2243                             (ret_pp == NULL) && try_to_return_this_page &&
2244                             page_trylock_cons(try_to_return_this_page,
2245                             SE_EXCL)) {
2246                                 ret_pp = try_to_return_this_page;
2247                         } else {
2248                                 mtype = PP_2_MTYPE(pp);
2249                                 page_vpadd(&PAGE_FREELISTS(mnode, new_szc,
2250                                     bin, mtype), pplist);
2251 
2252                                 page_ctr_add(mnode, mtype, pplist,
2253                                     PG_FREE_LIST);
2254                         }
2255                         pplist = npplist;
2256                 }
2257         }
2258         return (ret_pp);
2259 }
2260 
2261 int mpss_coalesce_disable = 0;
2262 
2263 /*
2264  * Coalesce free pages into a page of the given szc and color if possible.
2265  * Return a pointer to the page created, otherwise return NULL.
2266  *
2267  * If pfnhi is non-zero, search for a large page whose pfn range is below pfnhi.
2268  */
2269 page_t *
2270 page_freelist_coalesce(int mnode, uchar_t szc, uint_t color, uint_t ceq_mask,
2271     int mtype, pfn_t pfnhi)
2272 {
2273         int     r = szc;                /* region size */
2274         int     mrange;
2275         uint_t  full, bin, color_mask, wrap = 0;
2276         pfn_t   pfnum, lo, hi;
2277         size_t  len, idx, idx0;
2278         pgcnt_t cands = 0, szcpgcnt = page_get_pagecnt(szc);
2279         page_t  *ret_pp;
2280         MEM_NODE_ITERATOR_DECL(it);
2281 #if defined(__sparc)
2282         pfn_t pfnum0, nlo, nhi;
2283 #endif
2284 
2285         if (mpss_coalesce_disable) {
2286                 ASSERT(szc < MMU_PAGE_SIZES);
2287                 VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce[szc][0]);
2288                 return (NULL);
2289         }
2290 
2291         ASSERT(szc < mmu_page_sizes);
2292         color_mask = PAGE_GET_PAGECOLORS(szc) - 1;
2293         ASSERT(ceq_mask <= color_mask);
2294         ASSERT(color <= color_mask);
2295         color &= ceq_mask;
2296 
2297         /* Prevent page_counters dynamic memory from being freed */
2298         rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
2299 
2300         mrange = MTYPE_2_MRANGE(mnode, mtype);
2301         ASSERT(mrange < mnode_nranges[mnode]);
2302         VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce[r][mrange]);
2303 
2304         /* get pfn range for mtype */
2305         len = PAGE_COUNTERS_ENTRIES(mnode, r);
2306         MNODETYPE_2_PFN(mnode, mtype, lo, hi);
2307         hi++;
2308 
2309         /* clamp the upper bound to pfnhi if a limit was given */
2310         if (pfnhi != PFNNULL && pfnhi < hi)
2311                 hi = pfnhi;
2312 
2313         /* round to szcpgcnt boundaries */
2314         lo = P2ROUNDUP(lo, szcpgcnt);
2315         MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it);
2316         if (lo == (pfn_t)-1) {
2317                 rw_exit(&page_ctrs_rwlock[mnode]);
2318                 return (NULL);
2319         }
2320         hi = hi & ~(szcpgcnt - 1);
2321 
2322         /* set lo to the closest pfn of the right color */
2323         if (((PFN_2_COLOR(lo, szc, &it) ^ color) & ceq_mask) ||
2324             (interleaved_mnodes && PFN_2_MEM_NODE(lo) != mnode)) {
2325                 PAGE_NEXT_PFN_FOR_COLOR(lo, szc, color, ceq_mask, color_mask,
2326                     &it);
2327         }
2328 
2329         if (hi <= lo) {
2330                 rw_exit(&page_ctrs_rwlock[mnode]);
2331                 return (NULL);
2332         }
2333 
2334         full = FULL_REGION_CNT(r);
2335 
2336         /* calculate the number of page candidates and initial search index */
2337         bin = color;
2338         idx0 = (size_t)(-1);
2339         do {
2340                 pgcnt_t acand;
2341 
2342                 PGCTRS_CANDS_GETVALUECOLOR(mnode, mrange, r, bin, acand);
2343                 if (acand) {
2344                         idx = PAGE_COUNTERS_CURRENT_COLOR(mnode,
2345                             r, bin, mrange);
2346                         idx0 = MIN(idx0, idx);
2347                         cands += acand;
2348                 }
2349                 bin = ADD_MASKED(bin, 1, ceq_mask, color_mask);
2350         } while (bin != color);
2351 
2352         if (cands == 0) {
2353                 VM_STAT_ADD(vmm_vmstats.page_ctrs_cands_skip[r][mrange]);
2354                 rw_exit(&page_ctrs_rwlock[mnode]);
2355                 return (NULL);
2356         }
2357 
2358         pfnum = IDX_TO_PNUM(mnode, r, idx0);
2359         if (pfnum < lo || pfnum >= hi) {
2360                 pfnum = lo;
2361         } else {
2362                 MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
2363                 if (pfnum == (pfn_t)-1) {
2364                         pfnum = lo;
2365                         MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
2366                         ASSERT(pfnum != (pfn_t)-1);
2367                 } else if ((PFN_2_COLOR(pfnum, szc, &it) ^ color) & ceq_mask ||
2368                     (interleaved_mnodes && PFN_2_MEM_NODE(pfnum) != mnode)) {
2369                         /* invalid color, get the closest correct pfn */
2370                         PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask,
2371                             color_mask, &it);
2372                         if (pfnum >= hi) {
2373                                 pfnum = lo;
2374                                 MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
2375                         }
2376                 }
2377         }
2378 
2379         /* set starting index */
2380         idx0 = PNUM_TO_IDX(mnode, r, pfnum);
2381         ASSERT(idx0 < len);
2382 
2383 #if defined(__sparc)
2384         pfnum0 = pfnum;         /* page corresponding to idx0 */
2385         nhi = 0;                /* search kcage ranges */
2386 #endif
2387 
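        /*
         * Search the page counters starting at idx0 for a region that is
         * completely free and of the requested color.  If the end of the
         * pfn range is reached, wrap around to lo and stop once the search
         * is back at (or past) the starting index.
         */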
2388         for (idx = idx0; wrap == 0 || (idx < idx0 && wrap < 2); ) {
2389 
2390 #if defined(__sparc)
2391                 /*
2392                  * Find lowest intersection of kcage ranges and mnode.
2393                  * MTYPE_NORELOC means look in the cage, otherwise outside.
2394                  */
2395                 if (nhi <= pfnum) {
2396                         if (kcage_next_range(mtype == MTYPE_NORELOC, pfnum,
2397                             (wrap == 0 ? hi : pfnum0), &nlo, &nhi))
2398                                 goto wrapit;
2399 
2400                         /* jump to the next page in the range */
2401                         if (pfnum < nlo) {
2402                                 pfnum = P2ROUNDUP(nlo, szcpgcnt);
2403                                 MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
2404                                 idx = PNUM_TO_IDX(mnode, r, pfnum);
2405                                 if (idx >= len || pfnum >= hi)
2406                                         goto wrapit;
2407                                 if ((PFN_2_COLOR(pfnum, szc, &it) ^ color) &
2408                                     ceq_mask)
2409                                         goto next;
2410                                 if (interleaved_mnodes &&
2411                                     PFN_2_MEM_NODE(pfnum) != mnode)
2412                                         goto next;
2413                         }
2414                 }
2415 #endif
2416 
2417                 if (PAGE_COUNTERS(mnode, r, idx) != full)
2418                         goto next;
2419 
2420                 /*
2421                  * RFE: For performance maybe we can do something less
2422                  *      brutal than locking the entire freelist. So far
2423                  *      this doesn't seem to be a performance problem?
2424                  */
2425                 page_freelist_lock(mnode);
2426                 if (PAGE_COUNTERS(mnode, r, idx) == full) {
2427                         ret_pp =
2428                             page_promote(mnode, pfnum, r, PC_ALLOC, mtype);
2429                         if (ret_pp != NULL) {
2430                                 VM_STAT_ADD(vmm_vmstats.pfc_coalok[r][mrange]);
2431                                 PAGE_COUNTERS_CURRENT_COLOR(mnode, r,
2432                                     PFN_2_COLOR(pfnum, szc, &it), mrange) = idx;
2433                                 page_freelist_unlock(mnode);
2434                                 rw_exit(&page_ctrs_rwlock[mnode]);
2435 #if defined(__sparc)
2436                                 if (PP_ISNORELOC(ret_pp)) {
2437                                         pgcnt_t npgs;
2438 
2439                                         npgs = page_get_pagecnt(ret_pp->p_szc);
2440                                         kcage_freemem_sub(npgs);
2441                                 }
2442 #endif
2443                                 return (ret_pp);
2444                         }
2445                 } else {
2446                         VM_STAT_ADD(vmm_vmstats.page_ctrs_changed[r][mrange]);
2447                 }
2448 
2449                 page_freelist_unlock(mnode);
2450                 /*
2451                  * No point looking for another page if we've
2452                  * already tried all of the ones that
2453                  * page_ctr_cands indicated.  Stash off where we left
2454                  * off.
2455                  * Note: this is not exact since we don't hold the
2456                  * page_freelist_locks before we initially get the
2457                  * value of cands for performance reasons, but should
2458                  * be a decent approximation.
2459                  */
2460                 if (--cands == 0) {
2461                         PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color, mrange) =
2462                             idx;
2463                         break;
2464                 }
2465 next:
2466                 PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask,
2467                     color_mask, &it);
2468                 idx = PNUM_TO_IDX(mnode, r, pfnum);
2469                 if (idx >= len || pfnum >= hi) {
2470 wrapit:
2471                         pfnum = lo;
2472                         MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
2473                         idx = PNUM_TO_IDX(mnode, r, pfnum);
2474                         wrap++;
2475 #if defined(__sparc)
2476                         nhi = 0;        /* search kcage ranges */
2477 #endif
2478                 }
2479         }
2480 
2481         rw_exit(&page_ctrs_rwlock[mnode]);
2482         VM_STAT_ADD(vmm_vmstats.page_ctrs_failed[r][mrange]);
2483         return (NULL);
2484 }
2485 
2486 /*
2487  * For the given mnode, promote as many small pages to large pages as possible.
2488  * mnode can be -1, which means do all mnodes.
2489  */
2490 void
2491 page_freelist_coalesce_all(int mnode)
2492 {
2493         int     r;              /* region size */
2494         int     idx, full;
2495         size_t  len;
2496         int doall = interleaved_mnodes || mnode < 0;
2497         int mlo = doall ? 0 : mnode;
2498         int mhi = doall ? max_mem_nodes : (mnode + 1);
2499 
2500         VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce_all);
2501 
2502         if (mpss_coalesce_disable) {
2503                 return;
2504         }
2505 
2506         /*
2507          * Lock the entire freelist and coalesce what we can.
2508          *
2509          * Always promote to the largest page possible
2510          * first to reduce the number of page promotions.
2511          */
2512         for (mnode = mlo; mnode < mhi; mnode++) {
2513                 rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
2514                 page_freelist_lock(mnode);
2515         }
2516         for (r = mmu_page_sizes - 1; r > 0; r--) {
2517                 for (mnode = mlo; mnode < mhi; mnode++) {
2518                         pgcnt_t cands = 0;
2519                         int mrange, nranges = mnode_nranges[mnode];
2520 
2521                         for (mrange = 0; mrange < nranges; mrange++) {
2522                                 PGCTRS_CANDS_GETVALUE(mnode, mrange, r, cands);
2523                                 if (cands != 0)
2524                                         break;
2525                         }
2526                         if (cands == 0) {
2527                                 VM_STAT_ADD(vmm_vmstats.
2528                                     page_ctrs_cands_skip_all);
2529                                 continue;
2530                         }
2531 
2532                         full = FULL_REGION_CNT(r);
2533                         len  = PAGE_COUNTERS_ENTRIES(mnode, r);
2534 
2535                         for (idx = 0; idx < len; idx++) {
2536                                 if (PAGE_COUNTERS(mnode, r, idx) == full) {
2537                                         pfn_t pfnum =
2538                                             IDX_TO_PNUM(mnode, r, idx);
2539                                         int tmnode = interleaved_mnodes ?
2540                                             PFN_2_MEM_NODE(pfnum) : mnode;
2541 
2542                                         ASSERT(pfnum >=
2543                                             mem_node_config[tmnode].physbase &&
2544                                             pfnum <
2545                                             mem_node_config[tmnode].physmax);
2546 
2547                                         (void) page_promote(tmnode,
2548                                             pfnum, r, PC_FREE, PC_MTYPE_ANY);
2549                                 }
2550                         }
2551                         /* shared hpm_counters covers all mnodes, so we quit */
2552                         if (interleaved_mnodes)
2553                                 break;
2554                 }
2555         }
2556         for (mnode = mlo; mnode < mhi; mnode++) {
2557                 page_freelist_unlock(mnode);
2558                 rw_exit(&page_ctrs_rwlock[mnode]);
2559         }
2560 }
2561 
2562 /*
2563  * This is where all policies for moving pages around
2564  * to different page size free lists are implemented.
2565  * Returns the page obtained, or NULL on failure.
2566  *
2567  * So far these are the priorities for this algorithm in descending
2568  * order:
2569  *
2570  *      1) When servicing a request try to do so with a free page
2571  *         from next size up. Helps defer fragmentation as long
2572  *         as possible.
2573  *
2574  *      2) Page coalesce on demand. Only when a freelist
2575  *         larger than PAGESIZE is empty and step 1
2576  *         will not work since all larger size lists are
2577  *         also empty.
2578  *
2579  * If pfnhi is non-zero, search for large page with pfn range less than pfnhi.
2580  */
2581 
2582 page_t *
2583 page_freelist_split(uchar_t szc, uint_t color, int mnode, int mtype,
2584     pfn_t pfnlo, pfn_t pfnhi, page_list_walker_t *plw)
2585 {
2586         uchar_t nszc = szc + 1;
2587         uint_t  bin, sbin, bin_prev;
2588         page_t  *pp, *firstpp;
2589         page_t  *ret_pp = NULL;
2590         uint_t  color_mask;
2591 
2592         if (nszc == mmu_page_sizes)
2593                 return (NULL);
2594 
2595         ASSERT(nszc < mmu_page_sizes);
2596         color_mask = PAGE_GET_PAGECOLORS(nszc) - 1;
2597         bin = sbin = PAGE_GET_NSZ_COLOR(szc, color);
2598         bin_prev = (plw->plw_bin_split_prev == color) ? INVALID_COLOR :
2599             PAGE_GET_NSZ_COLOR(szc, plw->plw_bin_split_prev);
2600 
2601         VM_STAT_ADD(vmm_vmstats.pfs_req[szc]);
2602         /*
2603          * First try to break up a larger page to fill current size freelist.
2604          */
2605         while (plw->plw_bins[nszc] != 0) {
2606 
2607                 ASSERT(nszc < mmu_page_sizes);
2608 
2609                 /*
2610                          * If a page is found then demote it.
2611                  */
2612                 if (PAGE_FREELISTS(mnode, nszc, bin, mtype)) {
2613                         page_freelist_lock(mnode);
2614                         firstpp = pp = PAGE_FREELISTS(mnode, nszc, bin, mtype);
2615 
2616                         /*
2617                          * If pfnhi (pfnlo) is not PFNNULL, look for a large page
2618                          * below pfnhi (at or above pfnlo); PFNNULL means no limit.
2619                          */
2620                         if (pp &&
2621                             ((pfnhi != PFNNULL && pp->p_pagenum >= pfnhi) ||
2622                             (pfnlo != PFNNULL && pp->p_pagenum < pfnlo))) {
2623                                 do {
2624                                         pp = pp->p_vpnext;
2625                                         if (pp == firstpp) {
2626                                                 pp = NULL;
2627                                                 break;
2628                                         }
2629                                 } while ((pfnhi != PFNNULL &&
2630                                     pp->p_pagenum >= pfnhi) ||
2631                                     (pfnlo != PFNNULL &&
2632                                     pp->p_pagenum < pfnlo));
2633 
2634                                 if (pfnhi != PFNNULL && pp != NULL)
2635                                         ASSERT(pp->p_pagenum < pfnhi);
2636 
2637                                 if (pfnlo != PFNNULL && pp != NULL)
2638                                         ASSERT(pp->p_pagenum >= pfnlo);
2639                         }
2640                         if (pp) {
2641                                 uint_t ccolor = page_correct_color(szc, nszc,
2642                                     color, bin, plw->plw_ceq_mask[szc]);
2643 
2644                                 ASSERT(pp->p_szc == nszc);
2645                                 VM_STAT_ADD(vmm_vmstats.pfs_demote[nszc]);
2646                                 ret_pp = page_demote(mnode, pp->p_pagenum,
2647                                     pfnhi, pp->p_szc, szc, ccolor, PC_ALLOC);
2648                                 if (ret_pp) {
2649                                         page_freelist_unlock(mnode);
2650 #if defined(__sparc)
2651                                         if (PP_ISNORELOC(ret_pp)) {
2652                                                 pgcnt_t npgs;
2653 
2654                                                 npgs = page_get_pagecnt(
2655                                                     ret_pp->p_szc);
2656                                                 kcage_freemem_sub(npgs);
2657                                         }
2658 #endif
2659                                         return (ret_pp);
2660                                 }
2661                         }
2662                         page_freelist_unlock(mnode);
2663                 }
2664 
2665                 /* loop through next size bins */
2666                 bin = ADD_MASKED(bin, 1, plw->plw_ceq_mask[nszc], color_mask);
2667                 plw->plw_bins[nszc]--;
2668 
2669                 if (bin == sbin) {
2670                         uchar_t nnszc = nszc + 1;
2671 
2672                         /* we are done with this page size - check next */
2673                         if (plw->plw_bins[nnszc] == 0)
2674                                 /* we have already checked next size bins */
2675                                 break;
2676 
2677                         bin = sbin = PAGE_GET_NSZ_COLOR(nszc, bin);
2678                         if (bin_prev != INVALID_COLOR) {
2679                                 bin_prev = PAGE_GET_NSZ_COLOR(nszc, bin_prev);
2680                                 if (!((bin ^ bin_prev) &
2681                                     plw->plw_ceq_mask[nnszc]))
2682                                         break;
2683                         }
2684                         ASSERT(nnszc < mmu_page_sizes);
2685                         color_mask = PAGE_GET_PAGECOLORS(nnszc) - 1;
2686                         nszc = nnszc;
2687                         ASSERT(nszc < mmu_page_sizes);
2688                 }
2689         }
2690 
2691         return (ret_pp);
2692 }
2693 
2694 /*
2695  * Helper routine used only by the freelist code to lock
2696  * a page. If the page is a large page then it succeeds in
2697  * locking all the constituent pages or none at all.
2698  * Returns 1 on success, 0 on failure.
2699  */
2700 static int
2701 page_trylock_cons(page_t *pp, se_t se)
2702 {
2703         page_t  *tpp, *first_pp = pp;
2704 
2705         /*
2706          * Fail if can't lock first or only page.
2707          */
2708         if (!page_trylock(pp, se)) {
2709                 return (0);
2710         }
2711 
2712         /*
2713          * PAGESIZE: common case.
2714          */
2715         if (pp->p_szc == 0) {
2716                 return (1);
2717         }
2718 
2719         /*
2720          * Large page case.
2721          */
2722         tpp = pp->p_next;
2723         while (tpp != pp) {
2724                 if (!page_trylock(tpp, se)) {
2725                         /*
2726                          * On failure unlock what we have locked so far.
2727                          * We want to avoid attempting to capture these
2728                          * pages as the pcm mutex may be held which could
2729                          * lead to a recursive mutex panic.
2730                          */
2731                         while (first_pp != tpp) {
2732                                 page_unlock_nocapture(first_pp);
2733                                 first_pp = first_pp->p_next;
2734                         }
2735                         return (0);
2736                 }
2737                 tpp = tpp->p_next;
2738         }
2739         return (1);
2740 }
2741 
2742 /*
2743  * Initialize the context for walking page lists.
2744  * Called when a page of the given szc is unavailable. Sets markers
2745  * for the beginning of the search to detect when search has
2746  * completed a full cycle. Sets flags for splitting larger pages
2747  * and coalescing smaller pages. Page walking proceeds until a page
2748  * of the desired equivalent color is found.
2749  */
2750 void
2751 page_list_walk_init(uchar_t szc, uint_t flags, uint_t bin, int can_split,
2752     int use_ceq, page_list_walker_t *plw)
2753 {
2754         uint_t  nszc, ceq_mask, colors;
2755         uchar_t ceq = use_ceq ? colorequivszc[szc] : 0;
2756 
2757         ASSERT(szc < mmu_page_sizes);
2758         colors = PAGE_GET_PAGECOLORS(szc);
2759 
2760         plw->plw_colors = colors;
2761         plw->plw_color_mask = colors - 1;
2762         plw->plw_bin_marker = plw->plw_bin0 = bin;
2763         plw->plw_bin_split_prev = bin;
2764         plw->plw_bin_step = (szc == 0) ? vac_colors : 1;
2765 
2766         /*
2767          * if vac aliasing is possible make sure lower order color
2768          * bits are never ignored
2769          */
2770         if (vac_colors > 1)
2771                 ceq &= 0xf0;
2772 
2773         /*
2774          * calculate the number of non-equivalent colors and
2775          * color equivalency mask
2776          */
2777         plw->plw_ceq_dif = colors >> ((ceq >> 4) + (ceq & 0xf));
2778         ASSERT(szc > 0 || plw->plw_ceq_dif >= vac_colors);
2779         ASSERT(plw->plw_ceq_dif > 0);
2780         plw->plw_ceq_mask[szc] = (plw->plw_ceq_dif - 1) << (ceq & 0xf);
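        /*
         * Illustrative example (hypothetical values): with colors == 32 and
         * ceq == 0x12, plw_ceq_dif = 32 >> (1 + 2) = 4 equivalence classes
         * and plw_ceq_mask[szc] = (4 - 1) << 2 = 0xc, i.e. only color bits
         * 2 and 3 distinguish equivalent colors.
         */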
2781 
2782         if (flags & PG_MATCH_COLOR) {
2783                 if (cpu_page_colors < 0) {
2784                         /*
2785                          * this is a heterogeneous machine with different CPUs
2786                          * having different size e$ (not supported for ni2/rock).
2787                          */
2788                         uint_t cpucolors = CPUSETSIZE() >> PAGE_GET_SHIFT(szc);
2789                         cpucolors = MAX(cpucolors, 1);
2790                         ceq_mask = plw->plw_color_mask & (cpucolors - 1);
2791                         plw->plw_ceq_mask[szc] =
2792                             MIN(ceq_mask, plw->plw_ceq_mask[szc]);
2793                 }
2794                 plw->plw_ceq_dif = 1;
2795         }
2796 
2797         /* we can split pages in the freelist, but not the cachelist */
2798         if (can_split) {
2799                 plw->plw_do_split = (szc + 1 < mmu_page_sizes) ? 1 : 0;
2800 
2801                 /* set next szc color masks and number of free list bins */
2802                 for (nszc = szc + 1; nszc < mmu_page_sizes; nszc++, szc++) {
2803                         plw->plw_ceq_mask[nszc] = PAGE_GET_NSZ_MASK(szc,
2804                             plw->plw_ceq_mask[szc]);
2805                         plw->plw_bins[nszc] = PAGE_GET_PAGECOLORS(nszc);
2806                 }
2807                 plw->plw_ceq_mask[nszc] = INVALID_MASK;
2808                 plw->plw_bins[nszc] = 0;
2809 
2810         } else {
2811                 ASSERT(szc == 0);
2812                 plw->plw_do_split = 0;
2813                 plw->plw_bins[1] = 0;
2814                 plw->plw_ceq_mask[1] = INVALID_MASK;
2815         }
2816 }
2817 
2818 /*
2819  * set mark to flag where next split should occur
2820  */
2821 #define PAGE_SET_NEXT_SPLIT_MARKER(szc, nszc, bin, plw) {                    \
2822         uint_t bin_nsz = PAGE_GET_NSZ_COLOR(szc, bin);                       \
2823         uint_t bin0_nsz = PAGE_GET_NSZ_COLOR(szc, plw->plw_bin0);         \
2824         uint_t neq_mask = ~plw->plw_ceq_mask[nszc] & plw->plw_color_mask;    \
2825         plw->plw_split_next =                                                     \
2826                 INC_MASKED(bin_nsz, neq_mask, plw->plw_color_mask);       \
2827         if (!((plw->plw_split_next ^ bin0_nsz) & plw->plw_ceq_mask[nszc])) { \
2828                 plw->plw_split_next =                                             \
2829                 INC_MASKED(plw->plw_split_next,                                   \
2830                     neq_mask, plw->plw_color_mask);                       \
2831         }                                                                    \
2832 }
2833 
2834 uint_t
2835 page_list_walk_next_bin(uchar_t szc, uint_t bin, page_list_walker_t *plw)
2836 {
2837         uint_t  neq_mask = ~plw->plw_ceq_mask[szc] & plw->plw_color_mask;
2838         uint_t  bin0_nsz, nbin_nsz, nbin0, nbin;
2839         uchar_t nszc = szc + 1;
2840 
2841         nbin = ADD_MASKED(bin,
2842             plw->plw_bin_step, neq_mask, plw->plw_color_mask);
2843 
2844         if (plw->plw_do_split) {
2845                 plw->plw_bin_split_prev = bin;
2846                 PAGE_SET_NEXT_SPLIT_MARKER(szc, nszc, bin, plw);
2847                 plw->plw_do_split = 0;
2848         }
2849 
2850         if (szc == 0) {
2851                 if (plw->plw_count != 0 || plw->plw_ceq_dif == vac_colors) {
2852                         if (nbin == plw->plw_bin0 &&
2853                             (vac_colors == 1 || nbin != plw->plw_bin_marker)) {
2854                                 nbin = ADD_MASKED(nbin, plw->plw_bin_step,
2855                                     neq_mask, plw->plw_color_mask);
2856                                 plw->plw_bin_split_prev = plw->plw_bin0;
2857                         }
2858 
2859                         if (vac_colors > 1 && nbin == plw->plw_bin_marker) {
2860                                 plw->plw_bin_marker =
2861                                     nbin = INC_MASKED(nbin, neq_mask,
2862                                     plw->plw_color_mask);
2863                                 plw->plw_bin_split_prev = plw->plw_bin0;
2864                                 /*
2865                                  * large pages all have the same vac color
2866                                  * so by now we should be done with next
2867                                  * size page splitting process
2868                                  */
2869                                 ASSERT(plw->plw_bins[1] == 0);
2870                                 plw->plw_do_split = 0;
2871                                 return (nbin);
2872                         }
2873 
2874                 } else {
2875                         uint_t bin_jump = (vac_colors == 1) ?
2876                             (BIN_STEP & ~3) - (plw->plw_bin0 & 3) : BIN_STEP;
2877 
2878                         bin_jump &= ~(vac_colors - 1);
2879 
2880                         nbin0 = ADD_MASKED(plw->plw_bin0, bin_jump, neq_mask,
2881                             plw->plw_color_mask);
2882 
2883                         if ((nbin0 ^ plw->plw_bin0) & plw->plw_ceq_mask[szc]) {
2884 
2885                                 plw->plw_bin_marker = nbin = nbin0;
2886 
2887                                 if (plw->plw_bins[nszc] != 0) {
2888                                         /*
2889                                          * check if next page size bin is the
2890                                          * same as the next page size bin for
2891                                          * bin0
2892                                          */
2893                                         nbin_nsz = PAGE_GET_NSZ_COLOR(szc,
2894                                             nbin);
2895                                         bin0_nsz = PAGE_GET_NSZ_COLOR(szc,
2896                                             plw->plw_bin0);
2897 
2898                                         if ((bin0_nsz ^ nbin_nsz) &
2899                                             plw->plw_ceq_mask[nszc])
2900                                                 plw->plw_do_split = 1;
2901                                 }
2902                                 return (nbin);
2903                         }
2904                 }
2905         }
2906 
2907         if (plw->plw_bins[nszc] != 0) {
2908                 nbin_nsz = PAGE_GET_NSZ_COLOR(szc, nbin);
2909                 if (!((plw->plw_split_next ^ nbin_nsz) &
2910                     plw->plw_ceq_mask[nszc]))
2911                         plw->plw_do_split = 1;
2912         }
2913 
2914         return (nbin);
2915 }
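/*
 * Simplified sketch of how the walker is typically driven; this is the
 * pattern used by page_get_mnode_freelist() and page_get_mnode_cachelist()
 * below (in the real code page_list_walk_init() is called lazily, on the
 * first empty bin):
 *
 *        page_list_walk_init(szc, flags, bin, can_split, use_ceq, &plw);
 *        for (plw.plw_count = 0; plw.plw_count < plw.plw_ceq_dif;
 *            plw.plw_count++) {
 *                sbin = bin;
 *                do {
 *                        (try the list for (mnode, szc, bin, mtype))
 *                        bin = ADD_MASKED(bin, plw.plw_bin_step,
 *                            plw.plw_ceq_mask[szc], plw.plw_color_mask);
 *                } while (sbin != bin);
 *                if (plw.plw_ceq_dif > 1)
 *                        bin = page_list_walk_next_bin(szc, bin, &plw);
 *        }
 */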
2916 
2917 page_t *
2918 page_get_mnode_freelist(int mnode, uint_t bin, int mtype, uchar_t szc,
2919     uint_t flags)
2920 {
2921         kmutex_t                *pcm;
2922         page_t                  *pp, *first_pp;
2923         uint_t                  sbin;
2924         int                     plw_initialized;
2925         page_list_walker_t      plw;
2926 
2927         ASSERT(szc < mmu_page_sizes);
2928 
2929         VM_STAT_ADD(vmm_vmstats.pgmf_alloc[szc]);
2930 
2931         MTYPE_START(mnode, mtype, flags);
2932         if (mtype < 0) {     /* mnode does not have memory in mtype range */
2933                 VM_STAT_ADD(vmm_vmstats.pgmf_allocempty[szc]);
2934                 return (NULL);
2935         }
2936 try_again:
2937 
2938         plw_initialized = 0;
2939         plw.plw_ceq_dif = 1;
2940 
2941         /*
2942          * Only hold one freelist lock at a time, that way we
2943          * can start anywhere and not have to worry about lock
2944          * ordering.
2945          */
2946         for (plw.plw_count = 0;
2947             plw.plw_count < plw.plw_ceq_dif; plw.plw_count++) {
2948                 sbin = bin;
2949                 do {
2950                         if (!PAGE_FREELISTS(mnode, szc, bin, mtype))
2951                                 goto bin_empty_1;
2952 
2953                         pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
2954                         mutex_enter(pcm);
2955                         pp = PAGE_FREELISTS(mnode, szc, bin, mtype);
2956                         if (pp == NULL)
2957                                 goto bin_empty_0;
2958 
2959                         /*
2960                          * These were set before the page
2961                          * was put on the free list,
2962                          * they must still be set.
2963                          */
2964                         ASSERT(PP_ISFREE(pp));
2965                         ASSERT(PP_ISAGED(pp));
2966                         ASSERT(pp->p_vnode == NULL);
2967                         ASSERT(pp->p_hash == NULL);
2968                         ASSERT(pp->p_offset == (u_offset_t)-1);
2969                         ASSERT(pp->p_szc == szc);
2970                         ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
2971 
2972                         /*
2973                          * Walk down the hash chain.
2974                          * 8k pages are linked on p_next
2975                          * and p_prev fields. Large pages
2976                          * are a contiguous group of
2977                          * constituent pages linked together
2978                          * on their p_next and p_prev fields.
2979                          * The large pages are linked together
2980                          * on the hash chain using the p_vpnext and
2981                          * p_vpprev fields of the base constituent
2982                          * page of each large page.
2983                          */
2984                         first_pp = pp;
2985                         while (!page_trylock_cons(pp, SE_EXCL)) {
2986                                 if (szc == 0) {
2987                                         pp = pp->p_next;
2988                                 } else {
2989                                         pp = pp->p_vpnext;
2990                                 }
2991 
2992                                 ASSERT(PP_ISFREE(pp));
2993                                 ASSERT(PP_ISAGED(pp));
2994                                 ASSERT(pp->p_vnode == NULL);
2995                                 ASSERT(pp->p_hash == NULL);
2996                                 ASSERT(pp->p_offset == (u_offset_t)-1);
2997                                 ASSERT(pp->p_szc == szc);
2998                                 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
2999 
3000                                 if (pp == first_pp)
3001                                         goto bin_empty_0;
3002                         }
3003 
3004                         ASSERT(pp != NULL);
3005                         ASSERT(mtype == PP_2_MTYPE(pp));
3006                         ASSERT(pp->p_szc == szc);
3007                         if (szc == 0) {
3008                                 page_sub(&PAGE_FREELISTS(mnode,
3009                                     szc, bin, mtype), pp);
3010                         } else {
3011                                 page_vpsub(&PAGE_FREELISTS(mnode,
3012                                     szc, bin, mtype), pp);
3013                                 CHK_LPG(pp, szc);
3014                         }
3015                         page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);
3016 
3017                         if ((PP_ISFREE(pp) == 0) || (PP_ISAGED(pp) == 0))
3018                                 panic("free page is not. pp %p", (void *)pp);
3019                         mutex_exit(pcm);
3020 
3021 #if defined(__sparc)
3022                         ASSERT(!kcage_on || PP_ISNORELOC(pp) ||
3023                             (flags & PG_NORELOC) == 0);
3024 
3025                         if (PP_ISNORELOC(pp))
3026                                 kcage_freemem_sub(page_get_pagecnt(szc));
3027 #endif
3028                         VM_STAT_ADD(vmm_vmstats.pgmf_allocok[szc]);
3029                         return (pp);
3030 
3031 bin_empty_0:
3032                         mutex_exit(pcm);
3033 bin_empty_1:
3034                         if (plw_initialized == 0) {
3035                                 page_list_walk_init(szc, flags, bin, 1, 1,
3036                                     &plw);
3037                                 plw_initialized = 1;
3038                                 ASSERT(plw.plw_colors <=
3039                                     PAGE_GET_PAGECOLORS(szc));
3040                                 ASSERT(plw.plw_colors > 0);
3041                                 ASSERT((plw.plw_colors &
3042                                     (plw.plw_colors - 1)) == 0);
3043                                 ASSERT(bin < plw.plw_colors);
3044                                 ASSERT(plw.plw_ceq_mask[szc] < plw.plw_colors);
3045                         }
3046                         /* calculate the next bin with equivalent color */
3047                         bin = ADD_MASKED(bin, plw.plw_bin_step,
3048                             plw.plw_ceq_mask[szc], plw.plw_color_mask);
3049                 } while (sbin != bin);
3050 
3051                 /*
3052                  * All bins of the equivalent color are empty. Try to
3053                  * satisfy the request by breaking up or coalescing
3054                  * pages from a different size freelist of the correct
3055                  * color that satisfies the ORIGINAL color requested.
3056                  * If that fails, then try pages of the same size but
3057                  * different colors, assuming we are not called with
3058                  * PG_MATCH_COLOR.
3059                  */
3060                 if (plw.plw_do_split &&
3061                     (pp = page_freelist_split(szc, bin, mnode,
3062                     mtype, PFNNULL, PFNNULL, &plw)) != NULL)
3063                         return (pp);
3064 
3065                 if (szc > 0 && (pp = page_freelist_coalesce(mnode, szc,
3066                     bin, plw.plw_ceq_mask[szc], mtype, PFNNULL)) !=  NULL)
3067                         return (pp);
3068 
3069                 if (plw.plw_ceq_dif > 1)
3070                         bin = page_list_walk_next_bin(szc, bin, &plw);
3071         }
3072 
3073         /* if allowed, cycle through additional mtypes */
3074         MTYPE_NEXT(mnode, mtype, flags);
3075         if (mtype >= 0)
3076                 goto try_again;
3077 
3078         VM_STAT_ADD(vmm_vmstats.pgmf_allocfailed[szc]);
3079 
3080         return (NULL);
3081 }
3082 
3083 /*
3084  * Returns the count of free pages for 'pp' with size code 'szc'.
3085  * Note: This function does not return an exact value as the page freelist
3086  * locks are not held and thus the values in the page_counters may be
3087  * changing as we walk through the data.
3088  */
3089 static int
3090 page_freecnt(int mnode, page_t *pp, uchar_t szc)
3091 {
3092         pgcnt_t pgfree;
3093         pgcnt_t cnt;
3094         ssize_t r = szc;        /* region size */
3095         ssize_t idx;
3096         int     i;
3097         int     full, range;
3098 
3099         /* Make sure pagenum passed in is aligned properly */
3100         ASSERT((pp->p_pagenum & (PNUM_SIZE(szc) - 1)) == 0);
3101         ASSERT(szc > 0);
3102 
3103         /* Prevent page_counters dynamic memory from being freed */
3104         rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
3105         idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum);
3106         cnt = PAGE_COUNTERS(mnode, r, idx);
3107         pgfree = cnt << PNUM_SHIFT(r - 1);
3108         range = FULL_REGION_CNT(szc);
3109 
3110         /* Check for completely full region */
3111         if (cnt == range) {
3112                 rw_exit(&page_ctrs_rwlock[mnode]);
3113                 return (pgfree);
3114         }
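        /*
         * Descriptive note: the counter read above is the number of fully
         * free (r - 1) sized regions within this size 'r' region, hence the
         * cnt << PNUM_SHIFT(r - 1) conversion to a page count.  The loop
         * below walks the smaller region sizes, adding in free pages from
         * regions that were not already counted as full at a larger size.
         */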
3115 
3116         while (--r > 0) {
3117                 idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum);
3118                 full = FULL_REGION_CNT(r);
3119                 for (i = 0; i < range; i++, idx++) {
3120                         cnt = PAGE_COUNTERS(mnode, r, idx);
3121                         /*
3122                          * If cnt here is full, that means we have already
3123                          * accounted for these pages earlier.
3124                          */
3125                         if (cnt != full) {
3126                                 pgfree += (cnt << PNUM_SHIFT(r - 1));
3127                         }
3128                 }
3129                 range *= full;
3130         }
3131         rw_exit(&page_ctrs_rwlock[mnode]);
3132         return (pgfree);
3133 }
3134 
3135 /*
3136  * Called from page_geti_contig_pages to exclusively lock constituent pages
3137  * starting from 'spp' for page size code 'szc'.
3138  *
3139  * If 'ptcpthreshold' is set, the 'szc' region must already contain at least
3140  * pgcnt / ptcpthreshold free pages before the trylock is attempted.
3141  */
3142 static int
3143 page_trylock_contig_pages(int mnode, page_t *spp, uchar_t szc, int flags)
3144 {
3145         pgcnt_t pgcnt = PNUM_SIZE(szc);
3146         pgcnt_t pgfree, i;
3147         page_t *pp;
3148 
3149         VM_STAT_ADD(vmm_vmstats.ptcp[szc]);
3150 
3151 
3152         if ((ptcpthreshold == 0) || (flags & PGI_PGCPHIPRI))
3153                 goto skipptcpcheck;
3154         /*
3155          * check if there are sufficient free pages available before attempting
3156          * to trylock. Count is approximate as page counters can change.
3157          */
3158         pgfree = page_freecnt(mnode, spp, szc);
3159 
3160         /* attempt to trylock if there are sufficient already free pages */
3161         if (pgfree < pgcnt/ptcpthreshold) {
3162                 VM_STAT_ADD(vmm_vmstats.ptcpfreethresh[szc]);
3163                 return (0);
3164         }
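        /*
         * Illustrative example (hypothetical values): with ptcpthreshold == 2
         * and a 512 page 'szc' region, at least 512 / 2 = 256 of the
         * constituent pages must already be free before the trylock loop
         * below is attempted.
         */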
3165 
3166 skipptcpcheck:
3167 
3168         for (i = 0; i < pgcnt; i++) {
3169                 pp = &spp[i];
3170                 if (!page_trylock(pp, SE_EXCL)) {
3171                         VM_STAT_ADD(vmm_vmstats.ptcpfailexcl[szc]);
3172                         while (--i != (pgcnt_t)-1) {
3173                                 pp = &spp[i];
3174                                 ASSERT(PAGE_EXCL(pp));
3175                                 page_unlock_nocapture(pp);
3176                         }
3177                         return (0);
3178                 }
3179                 ASSERT(spp[i].p_pagenum == spp->p_pagenum + i);
3180                 if ((pp->p_szc > szc || (szc && pp->p_szc == szc)) &&
3181                     !PP_ISFREE(pp)) {
3182                         VM_STAT_ADD(vmm_vmstats.ptcpfailszc[szc]);
3183                         ASSERT(i == 0);
3184                         page_unlock_nocapture(pp);
3185                         return (0);
3186                 }
3187 
3188                 /*
3189                  * If a page has been marked non-relocatable or has been
3190                  * explicitly locked in memory, we don't want to relocate it;
3191                  * unlock the pages and fail the operation.
3192                  */
3193                 if (PP_ISNORELOC(pp) ||
3194                     pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
3195                         VM_STAT_ADD(vmm_vmstats.ptcpfailcage[szc]);
3196                         while (i != (pgcnt_t)-1) {
3197                                 pp = &spp[i];
3198                                 ASSERT(PAGE_EXCL(pp));
3199                                 page_unlock_nocapture(pp);
3200                                 i--;
3201                         }
3202                         return (0);
3203                 }
3204         }
3205         VM_STAT_ADD(vmm_vmstats.ptcpok[szc]);
3206         return (1);
3207 }
3208 
3209 /*
3210  * Claim large page pointed to by 'pp'. 'pp' is the starting set
3211  * of 'szc' constituent pages that had been locked exclusively previously.
3212  * Will attempt to relocate constituent pages in use.
3213  */
3214 static page_t *
3215 page_claim_contig_pages(page_t *pp, uchar_t szc, int flags)
3216 {
3217         spgcnt_t pgcnt, npgs, i;
3218         page_t *targpp, *rpp, *hpp;
3219         page_t *replpp = NULL;
3220         page_t *pplist = NULL;
3221 
3222         ASSERT(pp != NULL);
3223 
3224         pgcnt = page_get_pagecnt(szc);
3225         while (pgcnt) {
3226                 ASSERT(PAGE_EXCL(pp));
3227                 ASSERT(!PP_ISNORELOC(pp));
3228                 if (PP_ISFREE(pp)) {
3229                         /*
3230                          * If this is a PG_FREE_LIST page then its
3231                          * size code can change underneath us due to
3232                          * page promotion or demotion. As an optimization
3233                          * use page_list_sub_pages() instead of
3234                          * page_list_sub().
3235                          */
3236                         if (PP_ISAGED(pp)) {
3237                                 page_list_sub_pages(pp, szc);
3238                                 if (pp->p_szc == szc) {
3239                                         return (pp);
3240                                 }
3241                                 ASSERT(pp->p_szc < szc);
3242                                 npgs = page_get_pagecnt(pp->p_szc);
3243                                 hpp = pp;
3244                                 for (i = 0; i < npgs; i++, pp++) {
3245                                         pp->p_szc = szc;
3246                                 }
3247                                 page_list_concat(&pplist, &hpp);
3248                                 pgcnt -= npgs;
3249                                 continue;
3250                         }
3251                         ASSERT(!PP_ISAGED(pp));
3252                         ASSERT(pp->p_szc == 0);
3253                         page_list_sub(pp, PG_CACHE_LIST);
3254                         page_hashout(pp, NULL);
3255                         PP_SETAGED(pp);
3256                         pp->p_szc = szc;
3257                         page_list_concat(&pplist, &pp);
3258                         pp++;
3259                         pgcnt--;
3260                         continue;
3261                 }
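                /*
                 * The constituent page is still in use (not free).  The rest
                 * of this loop tries to find a replacement page and relocate
                 * the in-use contents so that this pfn range can still become
                 * part of the large page being assembled.
                 */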
3262                 npgs = page_get_pagecnt(pp->p_szc);
3263 
3264                 /*
3265                  * page_create_wait freemem accounting done by caller of
3266                  * page_get_freelist and not necessary to call it prior to
3267                  * calling page_get_replacement_page.
3268                  *
3269                  * page_get_replacement_page can call page_get_contig_pages
3270                  * to acquire a large page (szc > 0); the replacement must be
3271                  * smaller than the contig page size to avoid looping or
3272                  * szc == 0 and PGI_PGCPSZC0 is set.
3273                  */
3274                 if (pp->p_szc < szc || (szc == 0 && (flags & PGI_PGCPSZC0))) {
3275                         replpp = page_get_replacement_page(pp, NULL, 0);
3276                         if (replpp) {
3277                                 npgs = page_get_pagecnt(pp->p_szc);
3278                                 ASSERT(npgs <= pgcnt);
3279                                 targpp = pp;
3280                         }
3281                 }
3282 
3283                 /*
3284                  * If replacement is NULL or do_page_relocate fails, fail
3285                  * coalescing of pages.
3286                  */
3287                 if (replpp == NULL || (do_page_relocate(&targpp, &replpp, 0,
3288                     &npgs, NULL) != 0)) {
3289                         /*
3290                          * Unlock un-processed target list
3291                          */
3292                         while (pgcnt--) {
3293                                 ASSERT(PAGE_EXCL(pp));
3294                                 page_unlock_nocapture(pp);
3295                                 pp++;
3296                         }
3297                         /*
3298                          * Free the processed target list.
3299                          */
3300                         while (pplist) {
3301                                 pp = pplist;
3302                                 page_sub(&pplist, pp);
3303                                 ASSERT(PAGE_EXCL(pp));
3304                                 ASSERT(pp->p_szc == szc);
3305                                 ASSERT(PP_ISFREE(pp));
3306                                 ASSERT(PP_ISAGED(pp));
3307                                 pp->p_szc = 0;
3308                                 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
3309                                 page_unlock_nocapture(pp);
3310                         }
3311 
3312                         if (replpp != NULL)
3313                                 page_free_replacement_page(replpp);
3314 
3315                         return (NULL);
3316                 }
3317                 ASSERT(pp == targpp);
3318 
3319                 /* LINTED */
3320                 ASSERT(hpp = pp); /* That's right, it's an assignment */
3321 
3322                 pp += npgs;
3323                 pgcnt -= npgs;
3324 
3325                 while (npgs--) {
3326                         ASSERT(PAGE_EXCL(targpp));
3327                         ASSERT(!PP_ISFREE(targpp));
3328                         ASSERT(!PP_ISNORELOC(targpp));
3329                         PP_SETFREE(targpp);
3330                         ASSERT(PP_ISAGED(targpp));
3331                         ASSERT(targpp->p_szc < szc || (szc == 0 &&
3332                             (flags & PGI_PGCPSZC0)));
3333                         targpp->p_szc = szc;
3334                         targpp = targpp->p_next;
3335 
3336                         rpp = replpp;
3337                         ASSERT(rpp != NULL);
3338                         page_sub(&replpp, rpp);
3339                         ASSERT(PAGE_EXCL(rpp));
3340                         ASSERT(!PP_ISFREE(rpp));
3341                         page_unlock_nocapture(rpp);
3342                 }
3343                 ASSERT(targpp == hpp);
3344                 ASSERT(replpp == NULL);
3345                 page_list_concat(&pplist, &targpp);
3346         }
3347         CHK_LPG(pplist, szc);
3348         return (pplist);
3349 }
3350 
3351 /*
3352  * Trim kernel cage from pfnlo-pfnhi and store result in lo-hi. Return code
3353  * of 0 means nothing left after trim.
3354  */
3355 int
3356 trimkcage(struct memseg *mseg, pfn_t *lo, pfn_t *hi, pfn_t pfnlo, pfn_t pfnhi)
3357 {
3358         pfn_t   kcagepfn;
3359         int     decr;
3360         int     rc = 0;
3361 
3362         if (PP_ISNORELOC(mseg->pages)) {
3363                 if (PP_ISNORELOC(mseg->epages - 1) == 0) {
3364 
3365                         /* lower part of this mseg inside kernel cage */
3366                         decr = kcage_current_pfn(&kcagepfn);
3367 
3368                         /* kernel cage may have transitioned past mseg */
3369                         if (kcagepfn >= mseg->pages_base &&
3370                             kcagepfn < mseg->pages_end) {
3371                                 ASSERT(decr == 0);
3372                                 *lo = MAX(kcagepfn, pfnlo);
3373                                 *hi = MIN(pfnhi, (mseg->pages_end - 1));
3374                                 rc = 1;
3375                         }
3376                 }
3377                 /* else entire mseg in the cage */
3378         } else {
3379                 if (PP_ISNORELOC(mseg->epages - 1)) {
3380 
3381                         /* upper part of this mseg inside kernel cage */
3382                         decr = kcage_current_pfn(&kcagepfn);
3383 
3384                         /* kernel cage may have transitioned past mseg */
3385                         if (kcagepfn >= mseg->pages_base &&
3386                             kcagepfn < mseg->pages_end) {
3387                                 ASSERT(decr);
3388                                 *hi = MIN(kcagepfn, pfnhi);
3389                                 *lo = MAX(pfnlo, mseg->pages_base);
3390                                 rc = 1;
3391                         }
3392                 } else {
3393                         /* entire mseg outside of kernel cage */
3394                         *lo = MAX(pfnlo, mseg->pages_base);
3395                         *hi = MIN(pfnhi, (mseg->pages_end - 1));
3396                         rc = 1;
3397                 }
3398         }
3399         return (rc);
3400 }
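/*
 * Rough illustration (hypothetical pfn values): assume the cage occupies the
 * low end of a memseg spanning pfns [0x800, 0x2000) and grows upward with its
 * current frontier at pfn 0x1000.  trimkcage() then returns
 * lo = MAX(0x1000, pfnlo) and hi = MIN(pfnhi, 0x1fff), i.e. only the uncaged
 * upper portion of the memseg remains in the search range.
 */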
3401 
3402 /*
3403  * called from page_get_contig_pages to search 'pfnlo' thru 'pfnhi' to claim a
3404  * page with size code 'szc'. Claiming such a page requires acquiring
3405  * exclusive locks on all constituent pages (page_trylock_contig_pages),
3406  * relocating pages in use and concatenating these constituent pages into a
3407  * large page.
3408  *
3409  * The page lists do not have such a large page and page_freelist_split has
3410  * already failed to demote larger pages and/or coalesce smaller free pages.
3411  *
3412  * 'flags' may specify PG_MATCH_COLOR which would limit the search to large
3413  * pages with the same color as 'bin'.
3414  *
3415  * 'pfnflag' specifies the subset of the pfn range to search.
3416  */
3417 
3418 static page_t *
3419 page_geti_contig_pages(int mnode, uint_t bin, uchar_t szc, int flags,
3420     pfn_t pfnlo, pfn_t pfnhi, pgcnt_t pfnflag)
3421 {
3422         struct memseg *mseg;
3423         pgcnt_t szcpgcnt = page_get_pagecnt(szc);
3424         pgcnt_t szcpgmask = szcpgcnt - 1;
3425         pfn_t   randpfn;
3426         page_t *pp, *randpp, *endpp;
3427         uint_t colors, ceq_mask;
3428         /* LINTED : set but not used in function */
3429         uint_t color_mask;
3430         pfn_t hi, lo;
3431         uint_t skip;
3432         MEM_NODE_ITERATOR_DECL(it);
3433 
3434         ASSERT(szc != 0 || (flags & PGI_PGCPSZC0));
3435 
3436         pfnlo = P2ROUNDUP(pfnlo, szcpgcnt);
3437 
3438         if ((pfnhi - pfnlo) + 1 < szcpgcnt || pfnlo >= pfnhi)
3439                 return (NULL);
3440 
3441         ASSERT(szc < mmu_page_sizes);
3442 
3443         colors = PAGE_GET_PAGECOLORS(szc);
3444         color_mask = colors - 1;
3445         if ((colors > 1) && (flags & PG_MATCH_COLOR)) {
3446                 uchar_t ceq = colorequivszc[szc];
3447                 uint_t  ceq_dif = colors >> ((ceq >> 4) + (ceq & 0xf));
3448 
3449                 ASSERT(ceq_dif > 0);
3450                 ceq_mask = (ceq_dif - 1) << (ceq & 0xf);
3451         } else {
3452                 ceq_mask = 0;
3453         }
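        /*
         * This mirrors the color equivalency mask computation in
         * page_list_walk_init() above; only the color bits covered by
         * ceq_mask are significant when matching against 'bin'.
         */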
3454 
3455         ASSERT(bin < colors);
3456 
3457         /* clear "non-significant" color bits */
3458         bin &= ceq_mask;
3459 
3460         /*
3461          * trim the pfn range to search based on pfnflag. pfnflag is set
3462          * when there have been previous page_get_contig_page failures to
3463          * limit the search.
3464          *
3465          * The high bit in pfnflag specifies the number of 'slots' in the
3466          * pfn range and the remainder of pfnflag specifies which slot.
3467          * For example, a value of 1010b selects slot 2 of a pfn range that
3468          * has been divided into 8 slots; see the worked example below.
3469          */
3470         if (pfnflag > 1) {
3471                 int     slots = 1 << (highbit(pfnflag) - 1);
3472                 int     slotid = pfnflag & (slots - 1);
3473                 pgcnt_t szcpages;
3474                 int     slotlen;
3475 
3476                 pfnhi = P2ALIGN((pfnhi + 1), szcpgcnt) - 1;
3477                 szcpages = ((pfnhi - pfnlo) + 1) / szcpgcnt;
3478                 slotlen = howmany(szcpages, slots);
3479                 /* skip if 'slotid' slot is empty */
3480                 if (slotid * slotlen >= szcpages)
3481                         return (NULL);
3482                 pfnlo = pfnlo + (((slotid * slotlen) % szcpages) * szcpgcnt);
3483                 ASSERT(pfnlo < pfnhi);
3484                 if (pfnhi > pfnlo + (slotlen * szcpgcnt))
3485                         pfnhi = pfnlo + (slotlen * szcpgcnt) - 1;
3486         }
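        /*
         * Worked example for the slot arithmetic above (hypothetical value):
         * pfnflag == 0xa (1010b) gives slots = 1 << (highbit(0xa) - 1) = 8
         * and slotid = 0xa & 7 = 2, so only slot 2 of the 8 equal length
         * slots of the pfn range is searched.
         */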
3487 
3488         /*
3489          * This routine can be called recursively so we shouldn't
3490          * acquire a reader lock if a write request is pending. This
3491          * could lead to a deadlock with the DR thread.
3492          *
3493          * Returning NULL informs the caller that we could not get
3494          * a contig page with the required characteristics.
3495          */
3496 
3497         if (!memsegs_trylock(0))
3498                 return (NULL);
3499 
3500         /*
3501          * loop through memsegs to look for contig page candidates
3502          */
3503 
3504         for (mseg = memsegs; mseg != NULL; mseg = mseg->next) {
3505                 if (pfnhi < mseg->pages_base || pfnlo >= mseg->pages_end) {
3506                         /* no overlap */
3507                         continue;
3508                 }
3509 
3510                 if (mseg->pages_end - mseg->pages_base < szcpgcnt)
3511                         /* mseg too small */
3512                         continue;
3513 
3514                 /*
3515                  * trim off kernel cage pages from pfn range and check for
3516                  * a trimmed pfn range returned that does not span the
3517                  * desired large page size.
3518                  */
3519                 if (kcage_on) {
3520                         if (trimkcage(mseg, &lo, &hi, pfnlo, pfnhi) == 0 ||
3521                             lo >= hi || ((hi - lo) + 1) < szcpgcnt)
3522                                 continue;
3523                 } else {
3524                         lo = MAX(pfnlo, mseg->pages_base);
3525                         hi = MIN(pfnhi, (mseg->pages_end - 1));
3526                 }
3527 
3528                 /* round to szcpgcnt boundaries */
3529                 lo = P2ROUNDUP(lo, szcpgcnt);
3530 
3531                 MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it);
3532                 hi = P2ALIGN((hi + 1), szcpgcnt) - 1;
3533 
3534                 if (hi <= lo)
3535                         continue;
3536 
3537                 /*
3538                  * set lo to point to the pfn for the desired bin. Large
3539                  * page sizes may only have a single page color
3540                  */
3541                 skip = szcpgcnt;
3542                 if (ceq_mask > 0 || interleaved_mnodes) {
3543                         /* set lo to point at appropriate color */
3544                         if (((PFN_2_COLOR(lo, szc, &it) ^ bin) & ceq_mask) ||
3545                             (interleaved_mnodes &&
3546                             PFN_2_MEM_NODE(lo) != mnode)) {
3547                                 PAGE_NEXT_PFN_FOR_COLOR(lo, szc, bin, ceq_mask,
3548                                     color_mask, &it);
3549                         }
3550                         if (hi <= lo)
3551                                 /* mseg cannot satisfy color request */
3552                                 continue;
3553                 }
3554 
3555                 /* randomly choose a point between lo and hi to begin search */
3556 
3557                 randpfn = (pfn_t)GETTICK();
3558                 randpfn = ((randpfn % (hi - lo)) + lo) & ~(skip - 1);
3559                 MEM_NODE_ITERATOR_INIT(randpfn, mnode, szc, &it);
3560                 if (ceq_mask || interleaved_mnodes || randpfn == (pfn_t)-1) {
3561                         if (randpfn != (pfn_t)-1) {
3562                                 PAGE_NEXT_PFN_FOR_COLOR(randpfn, szc, bin,
3563                                     ceq_mask, color_mask, &it);
3564                         }
3565                         if (randpfn >= hi) {
3566                                 randpfn = lo;
3567                                 MEM_NODE_ITERATOR_INIT(randpfn, mnode, szc,
3568                                     &it);
3569                         }
3570                 }
3571                 randpp = mseg->pages + (randpfn - mseg->pages_base);
3572 
3573                 ASSERT(randpp->p_pagenum == randpfn);
3574 
3575                 pp = randpp;
3576                 endpp =  mseg->pages + (hi - mseg->pages_base) + 1;
3577 
3578                 ASSERT(randpp + szcpgcnt <= endpp);
3579 
3580                 do {
3581                         ASSERT(!(pp->p_pagenum & szcpgmask));
3582                         ASSERT(((PP_2_BIN(pp) ^ bin) & ceq_mask) == 0);
3583 
3584                         if (page_trylock_contig_pages(mnode, pp, szc, flags)) {
3585                                 /* pages unlocked by page_claim on failure */
3586                                 if (page_claim_contig_pages(pp, szc, flags)) {
3587                                         memsegs_unlock(0);
3588                                         return (pp);
3589                                 }
3590                         }
3591 
3592                         if (ceq_mask == 0 && !interleaved_mnodes) {
3593                                 pp += skip;
3594                         } else {
3595                                 pfn_t pfn = pp->p_pagenum;
3596 
3597                                 PAGE_NEXT_PFN_FOR_COLOR(pfn, szc, bin,
3598                                     ceq_mask, color_mask, &it);
3599                                 if (pfn == (pfn_t)-1) {
3600                                         pp = endpp;
3601                                 } else {
3602                                         pp = mseg->pages +
3603                                             (pfn - mseg->pages_base);
3604                                 }
3605                         }
3606                         if (pp >= endpp) {
3607                                 /* start from the beginning */
3608                                 MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it);
3609                                 pp = mseg->pages + (lo - mseg->pages_base);
3610                                 ASSERT(pp->p_pagenum == lo);
3611                                 ASSERT(pp + szcpgcnt <= endpp);
3612                         }
3613                 } while (pp != randpp);
3614         }
3615         memsegs_unlock(0);
3616         return (NULL);
3617 }
3618 
3619 
3620 /*
3621  * controlling routine that searches through physical memory in an attempt to
3622  * claim a large page, based on the input parameters, when one is not
3623  * available on the page free lists.
3624  *
3625  * calls page_geti_contig_pages with an initial pfn range from the mnode
3626  * and mtype. page_geti_contig_pages will trim off the parts of the pfn range
3627  * that overlaps with the kernel cage or does not match the requested page
3628  * color if PG_MATCH_COLOR is set.  Since this search is very expensive,
3629  * page_geti_contig_pages may further limit the search range based on
3630  * previous failure counts (pgcpfailcnt[]).
3631  *
3632  * for PGI_PGCPSZC0 requests, page_get_contig_pages will relocate a base
3633  * pagesize page that satisfies mtype.
3634  */
3635 page_t *
3636 page_get_contig_pages(int mnode, uint_t bin, int mtype, uchar_t szc,
3637     uint_t flags)
3638 {
3639         pfn_t           pfnlo, pfnhi;   /* contig pages pfn range */
3640         page_t          *pp;
3641         pgcnt_t         pfnflag = 0;    /* no limit on search if 0 */
3642 
3643         VM_STAT_ADD(vmm_vmstats.pgcp_alloc[szc]);
3644 
3645         /* no allocations from cage */
3646         flags |= PGI_NOCAGE;
3647 
3648         /* LINTED */
3649         MTYPE_START(mnode, mtype, flags);
3650         if (mtype < 0) {     /* mnode does not have memory in mtype range */
3651                 VM_STAT_ADD(vmm_vmstats.pgcp_allocempty[szc]);
3652                 return (NULL);
3653         }
3654 
3655         ASSERT(szc > 0 || (flags & PGI_PGCPSZC0));
3656 
3657         /* do not limit search and ignore color if hi pri */
3658 
3659         if (pgcplimitsearch && ((flags & PGI_PGCPHIPRI) == 0))
3660                 pfnflag = pgcpfailcnt[szc];
3661 
3662         /* remove color match to improve chances */
3663 
3664         if (flags & PGI_PGCPHIPRI || pfnflag)
3665                 flags &= ~PG_MATCH_COLOR;
3666 
3667         do {
3668                 /* get pfn range based on mnode and mtype */
3669                 MNODETYPE_2_PFN(mnode, mtype, pfnlo, pfnhi);
3670 
3671                 ASSERT(pfnhi >= pfnlo);
3672 
3673                 pp = page_geti_contig_pages(mnode, bin, szc, flags,
3674                     pfnlo, pfnhi, pfnflag);
3675 
3676                 if (pp != NULL) {
3677                         pfnflag = pgcpfailcnt[szc];
3678                         if (pfnflag) {
3679                                 /* halve fail count to double next search size */
3680                                 pgcpfailcnt[szc] = pfnflag >> 1;
3681                         }
3682                         VM_STAT_ADD(vmm_vmstats.pgcp_allocok[szc]);
3683                         return (pp);
3684                 }
3685                 MTYPE_NEXT(mnode, mtype, flags);
3686         } while (mtype >= 0);
3687 
3688         VM_STAT_ADD(vmm_vmstats.pgcp_allocfailed[szc]);
3689         return (NULL);
3690 }
3691 
3692 #if defined(__i386) || defined(__amd64)
3693 /*
3694  * Determine the likelihood of finding/coalescing a szc page.
3695  * Return 0 if the likelihood is small otherwise return 1.
3696  *
3697  * For now, be conservative and check only 1g pages and return 0
3698  * if there have been previous coalescing failures and the szc pages
3699  * needed to satisfy the request would exhaust most of freemem.
3700  */
3701 int
3702 page_chk_freelist(uint_t szc)
3703 {
3704         pgcnt_t         pgcnt;
3705 
3706         if (szc <= 1)
3707                 return (1);
3708 
3709         pgcnt = page_get_pagecnt(szc);
3710         if (pgcpfailcnt[szc] && pgcnt + throttlefree >= freemem) {
3711                 VM_STAT_ADD(vmm_vmstats.pcf_deny[szc]);
3712                 return (0);
3713         }
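        /*
         * Illustrative numbers: with 4k base pages a 1g page consists of
         * 262144 base pages, so the request is denied above whenever a prior
         * coalescing failure was recorded and 262144 + throttlefree >=
         * freemem, i.e. assembling the page would consume most of freemem.
         */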
3714         VM_STAT_ADD(vmm_vmstats.pcf_allow[szc]);
3715         return (1);
3716 }
3717 #endif
3718 
3719 /*
3720  * Find the `best' page on the freelist for this (vp,off) (as,vaddr) pair.
3721  *
3722  * Does its own locking and accounting.
3723  * If PG_MATCH_COLOR is set, then NULL will be returned if there are no
3724  * pages of the proper color even if there are pages of a different color.
3725  *
3726  * Finds a page, removes it, THEN locks it.
3727  */
3728 
3729 /*ARGSUSED*/
3730 page_t *
3731 page_get_freelist(struct vnode *vp, u_offset_t off, struct seg *seg,
3732         caddr_t vaddr, size_t size, uint_t flags, struct lgrp *lgrp)
3733 {
3734         struct as       *as = seg->s_as;
3735         page_t          *pp = NULL;
3736         ulong_t         bin;
3737         uchar_t         szc;
3738         int             mnode;
3739         int             mtype;
3740         page_t          *(*page_get_func)(int, uint_t, int, uchar_t, uint_t);
3741         lgrp_mnode_cookie_t     lgrp_cookie;
3742 
3743         page_get_func = page_get_mnode_freelist;
3744 
3745         /*
3746          * If we aren't passed a specific lgroup, or are passed a freed lgrp,
3747          * assume we wish to allocate near to the current thread's home.
3748          */
3749         if (!LGRP_EXISTS(lgrp))
3750                 lgrp = lgrp_home_lgrp();
3751 
3752         if (kcage_on) {
3753                 if ((flags & (PG_NORELOC | PG_PANIC)) == PG_NORELOC &&
3754                     kcage_freemem < kcage_throttlefree + btop(size) &&
3755                     curthread != kcage_cageout_thread) {
3756                         /*
3757                          * Set a "reserve" of kcage_throttlefree pages for
3758                          * PG_PANIC and cageout thread allocations.
3759                          *
3760                          * Everybody else has to serialize in
3761                          * page_create_get_something() to get a cage page, so
3762                          * that we don't deadlock cageout!
3763                          */
3764                         return (NULL);
3765                 }
3766         } else {
3767                 flags &= ~PG_NORELOC;
3768                 flags |= PGI_NOCAGE;
3769         }
3770 
3771         /* LINTED */
3772         MTYPE_INIT(mtype, vp, vaddr, flags, size);
3773 
3774         /*
3775          * Convert size to page size code.
3776          */
3777         if ((szc = page_szc(size)) == (uchar_t)-1)
3778                 panic("page_get_freelist: illegal page size request");
3779         ASSERT(szc < mmu_page_sizes);
3780 
3781         VM_STAT_ADD(vmm_vmstats.pgf_alloc[szc]);
3782 
3783         /* LINTED */
3784         AS_2_BIN(as, seg, vp, vaddr, bin, szc);
3785 
3786         ASSERT(bin < PAGE_GET_PAGECOLORS(szc));
3787 
3788         /*
3789          * Try to get a local page first, but try remote if we can't
3790          * get a page of the right color.
3791          */
3792 pgretry:
3793         LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL);
3794         while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3795                 pp = page_get_func(mnode, bin, mtype, szc, flags);
3796                 if (pp != NULL) {
3797                         VM_STAT_ADD(vmm_vmstats.pgf_allocok[szc]);
3798                         DTRACE_PROBE4(page__get,
3799                             lgrp_t *, lgrp,
3800                             int, mnode,
3801                             ulong_t, bin,
3802                             uint_t, flags);
3803                         return (pp);
3804                 }
3805         }
3806         ASSERT(pp == NULL);
3807 
3808         /*
3809          * for non-SZC0 PAGESIZE requests, check cachelist before checking
3810          * remote free lists.  The caller is expected to call page_get_cachelist,
3811          * which will check local cache lists and remote free lists.
3812          */
3813         if (szc == 0 && ((flags & PGI_PGCPSZC0) == 0)) {
3814                 VM_STAT_ADD(vmm_vmstats.pgf_allocdeferred);
3815                 return (NULL);
3816         }
3817 
3818         ASSERT(szc > 0 || (flags & PGI_PGCPSZC0));
3819 
3820         lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
3821 
3822         if (!(flags & PG_LOCAL)) {
3823                 /*
3824                  * Try to get a non-local freelist page.
3825                  */
3826                 LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie);
3827                 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3828                         pp = page_get_func(mnode, bin, mtype, szc, flags);
3829                         if (pp != NULL) {
3830                                 DTRACE_PROBE4(page__get,
3831                                     lgrp_t *, lgrp,
3832                                     int, mnode,
3833                                     ulong_t, bin,
3834                                     uint_t, flags);
3835                                 VM_STAT_ADD(vmm_vmstats.pgf_allocokrem[szc]);
3836                                 return (pp);
3837                         }
3838                 }
3839                 ASSERT(pp == NULL);
3840         }
3841 
3842         /*
3843          * When the cage is off, chances are page_get_contig_pages() will fail
3844          * to lock a large page chunk, so in that case it is not called by
3845          * default.  This can be changed via /etc/system.
3846          *
3847          * page_get_contig_pages() also called to acquire a base pagesize page
3848          * for page_create_get_something().
3849          */
3850         if (!(flags & PG_NORELOC) && (pg_contig_disable == 0) &&
3851             (kcage_on || pg_lpgcreate_nocage || szc == 0) &&
3852             (page_get_func != page_get_contig_pages)) {
3853 
3854                 VM_STAT_ADD(vmm_vmstats.pgf_allocretry[szc]);
3855                 page_get_func = page_get_contig_pages;
3856                 goto pgretry;
3857         }
3858 
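        /*
         * Record this failure (SETPGCPFAILCNT) so that subsequent
         * page_get_contig_pages() calls limit their pfn search range based
         * on pgcpfailcnt[]; see the pfnflag handling in
         * page_geti_contig_pages() above.
         */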
3859         if (!(flags & PG_LOCAL) && pgcplimitsearch &&
3860             page_get_func == page_get_contig_pages)
3861                 SETPGCPFAILCNT(szc);
3862 
3863         VM_STAT_ADD(vmm_vmstats.pgf_allocfailed[szc]);
3864         return (NULL);
3865 }
3866 
3867 /*
3868  * Find the `best' page on the cachelist for this (vp,off) (as,vaddr) pair.
3869  *
3870  * Does its own locking.
3871  * If PG_MATCH_COLOR is set, then NULL will be returned if there are no
3872  * pages of the proper color even if there are pages of a different color.
3873  * Otherwise, scan the bins for ones with pages.  For each bin with pages,
3874  * try to lock one of them.  If no page can be locked, try the
3875  * next bin.  Return NULL if a page cannot be found and locked.
3876  *
3877  * Finds a page, tries to lock it, then removes it.
3878  */
3879 
3880 /*ARGSUSED*/
3881 page_t *
3882 page_get_cachelist(struct vnode *vp, u_offset_t off, struct seg *seg,
3883     caddr_t vaddr, uint_t flags, struct lgrp *lgrp)
3884 {
3885         page_t          *pp;
3886         struct as       *as = seg->s_as;
3887         ulong_t         bin;
3888         /*LINTED*/
3889         int             mnode;
3890         int             mtype;
3891         lgrp_mnode_cookie_t     lgrp_cookie;
3892 
3893         /*
3894          * If we aren't passed a specific lgroup, or are passed a freed lgrp,
3895          * assume we wish to allocate near to the current thread's home.
3896          */
3897         if (!LGRP_EXISTS(lgrp))
3898                 lgrp = lgrp_home_lgrp();
3899 
3900         if (!kcage_on) {
3901                 flags &= ~PG_NORELOC;
3902                 flags |= PGI_NOCAGE;
3903         }
3904 
3905         if ((flags & (PG_NORELOC | PG_PANIC | PG_PUSHPAGE)) == PG_NORELOC &&
3906             kcage_freemem <= kcage_throttlefree) {
3907                 /*
3908                  * Reserve kcage_throttlefree pages for critical kernel
3909                  * threads.
3910                  *
3911                  * Everybody else has to go to page_create_get_something()
3912                  * to get a cage page, so we don't deadlock cageout.
3913                  */
3914                 return (NULL);
3915         }
3916 
3917         /* LINTED */
3918         AS_2_BIN(as, seg, vp, vaddr, bin, 0);
3919 
3920         ASSERT(bin < PAGE_GET_PAGECOLORS(0));
3921 
3922         /* LINTED */
3923         MTYPE_INIT(mtype, vp, vaddr, flags, MMU_PAGESIZE);
3924 
3925         VM_STAT_ADD(vmm_vmstats.pgc_alloc);
3926 
3927         /*
3928          * Try local cachelists first
3929          */
3930         LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL);
3931         while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3932                 pp = page_get_mnode_cachelist(bin, flags, mnode, mtype);
3933                 if (pp != NULL) {
3934                         VM_STAT_ADD(vmm_vmstats.pgc_allocok);
3935                         DTRACE_PROBE4(page__get,
3936                             lgrp_t *, lgrp,
3937                             int, mnode,
3938                             ulong_t, bin,
3939                             uint_t, flags);
3940                         return (pp);
3941                 }
3942         }
3943 
3944         lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
3945 
3946         /*
3947          * Try freelists/cachelists that are farther away
3948          * This is our only chance to allocate remote pages for PAGESIZE
3949          * requests.
3950          */
3951         LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie);
3952         while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3953                 pp = page_get_mnode_freelist(mnode, bin, mtype,
3954                     0, flags);
3955                 if (pp != NULL) {
3956                         VM_STAT_ADD(vmm_vmstats.pgc_allocokdeferred);
3957                         DTRACE_PROBE4(page__get,
3958                             lgrp_t *, lgrp,
3959                             int, mnode,
3960                             ulong_t, bin,
3961                             uint_t, flags);
3962                         return (pp);
3963                 }
3964                 pp = page_get_mnode_cachelist(bin, flags, mnode, mtype);
3965                 if (pp != NULL) {
3966                         VM_STAT_ADD(vmm_vmstats.pgc_allocokrem);
3967                         DTRACE_PROBE4(page__get,
3968                             lgrp_t *, lgrp,
3969                             int, mnode,
3970                             ulong_t, bin,
3971                             uint_t, flags);
3972                         return (pp);
3973                 }
3974         }
3975 
3976         VM_STAT_ADD(vmm_vmstats.pgc_allocfailed);
3977         return (NULL);
3978 }
3979 
3980 page_t *
3981 page_get_mnode_cachelist(uint_t bin, uint_t flags, int mnode, int mtype)
3982 {
3983         kmutex_t                *pcm;
3984         page_t                  *pp, *first_pp;
3985         uint_t                  sbin;
3986         int                     plw_initialized;
3987         page_list_walker_t      plw;
3988 
3989         VM_STAT_ADD(vmm_vmstats.pgmc_alloc);
3990 
3991         /* LINTED */
3992         MTYPE_START(mnode, mtype, flags);
3993         if (mtype < 0) {     /* mnode does not have memory in mtype range */
3994                 VM_STAT_ADD(vmm_vmstats.pgmc_allocempty);
3995                 return (NULL);
3996         }
3997 
3998 try_again:
3999 
4000         plw_initialized = 0;
4001         plw.plw_ceq_dif = 1;
4002 
4003         /*
4004          * Only hold one cachelist lock at a time, that way we
4005          * can start anywhere and not have to worry about lock
4006          * ordering.
4007          */
4008 
4009         for (plw.plw_count = 0;
4010             plw.plw_count < plw.plw_ceq_dif; plw.plw_count++) {
4011                 sbin = bin;
4012                 do {
4013 
4014                         if (!PAGE_CACHELISTS(mnode, bin, mtype))
4015                                 goto bin_empty_1;
4016                         pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST);
4017                         mutex_enter(pcm);
4018                         pp = PAGE_CACHELISTS(mnode, bin, mtype);
4019                         if (pp == NULL)
4020                                 goto bin_empty_0;
4021 
4022                         first_pp = pp;
4023                         ASSERT(pp->p_vnode);
4024                         ASSERT(PP_ISAGED(pp) == 0);
4025                         ASSERT(pp->p_szc == 0);
4026                         ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
4027                         while (!page_trylock(pp, SE_EXCL)) {
4028                                 pp = pp->p_next;
4029                                 ASSERT(pp->p_szc == 0);
4030                                 if (pp == first_pp) {
4031                                         /*
4032                                          * We have searched the complete list!
4033                                          * And all of them (might only be one)
4034                                          * are locked. This can happen since
4035                                          * these pages can also be found via
4036                                          * the hash list. When found via the
4037                                          * hash list, they are locked first,
4038                                          * then removed. We give up to let the
4039                                          * other thread run.
4040                                          */
4041                                         pp = NULL;
4042                                         break;
4043                                 }
4044                                 ASSERT(pp->p_vnode);
4045                                 ASSERT(PP_ISFREE(pp));
4046                                 ASSERT(PP_ISAGED(pp) == 0);
4047                                 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) ==
4048                                     mnode);
4049                         }
4050 
4051                         if (pp) {
4052                                 page_t  **ppp;
4053                                 /*
4054                                  * Found and locked a page.
4055                                  * Pull it off the list.
4056                                  */
4057                                 ASSERT(mtype == PP_2_MTYPE(pp));
4058                                 ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
4059                                 page_sub(ppp, pp);
4060                                 /*
4061                                  * Subtract counters before releasing pcm mutex
4062                                  * to avoid a race with page_freelist_coalesce
4063                                  * and page_freelist_split.
4064                                  */
4065                                 page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST);
4066                                 mutex_exit(pcm);
4067                                 ASSERT(pp->p_vnode);
4068                                 ASSERT(PP_ISAGED(pp) == 0);
4069 #if defined(__sparc)
4070                                 ASSERT(!kcage_on ||
4071                                     (flags & PG_NORELOC) == 0 ||
4072                                     PP_ISNORELOC(pp));
4073                                 if (PP_ISNORELOC(pp)) {
4074                                         kcage_freemem_sub(1);
4075                                 }
4076 #endif
4077                                 VM_STAT_ADD(vmm_vmstats.pgmc_allocok);
4078                                 return (pp);
4079                         }
4080 bin_empty_0:
4081                         mutex_exit(pcm);
4082 bin_empty_1:
4083                         if (plw_initialized == 0) {
4084                                 page_list_walk_init(0, flags, bin, 0, 1, &plw);
4085                                 plw_initialized = 1;
4086                         }
4087                         /* calculate the next bin with equivalent color */
4088                         bin = ADD_MASKED(bin, plw.plw_bin_step,
4089                             plw.plw_ceq_mask[0], plw.plw_color_mask);
4090                 } while (sbin != bin);
4091 
4092                 if (plw.plw_ceq_dif > 1)
4093                         bin = page_list_walk_next_bin(0, bin, &plw);
4094         }
4095 
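     /*
      * The walk over this mtype found nothing; move on to the next
      * memory type range, if any, and retry from try_again.
      */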
4096         MTYPE_NEXT(mnode, mtype, flags);
4097         if (mtype >= 0)
4098                 goto try_again;
4099 
4100         VM_STAT_ADD(vmm_vmstats.pgmc_allocfailed);
4101         return (NULL);
4102 }
4103 
4104 #ifdef DEBUG
4105 #define REPL_PAGE_STATS
4106 #endif /* DEBUG */
4107 
4108 #ifdef REPL_PAGE_STATS
4109 struct repl_page_stats {
4110         uint_t  ngets;
4111         uint_t  ngets_noreloc;
4112         uint_t  npgr_noreloc;
4113         uint_t  nnopage_first;
4114         uint_t  nnopage;
4115         uint_t  nhashout;
4116         uint_t  nnofree;
4117         uint_t  nnext_pp;
4118 } repl_page_stats;
4119 #define REPL_STAT_INCR(v)       atomic_add_32(&repl_page_stats.v, 1)
4120 #else /* REPL_PAGE_STATS */
4121 #define REPL_STAT_INCR(v)
4122 #endif /* REPL_PAGE_STATS */
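     /*
      * When REPL_PAGE_STATS is not defined, REPL_STAT_INCR() expands to
      * nothing, so the counters above are only maintained in DEBUG kernels.
      */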
4123 
4124 int     pgrppgcp;
4125 
4126 /*
4127  * The freemem accounting must be done by the caller.
4128  * First we try to get a replacement page of the same size as like_pp;
4129  * if that is not possible, we just get a set of discontiguous
4130  * PAGESIZE pages.
4131  */
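     /*
      * Typical use (a sketch, not taken from any particular caller): the
      * caller holds the original page SE_EXCL and asks for replacements
      * covering the whole large page, e.g.
      *
      *         repl_pp = page_get_replacement_page(targ_pp, NULL, 0);
      *         if (repl_pp == NULL)
      *                 (fall back or fail the relocation)
      *
      * The returned pages are linked through p_next/p_prev and the caller
      * is responsible for the freemem accounting, as noted above.
      */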
4132 page_t *
4133 page_get_replacement_page(page_t *orig_like_pp, struct lgrp *lgrp_target,
4134     uint_t pgrflags)
4135 {
4136         page_t          *like_pp;
4137         page_t          *pp, *pplist;
4138         page_t          *pl = NULL;
4139         ulong_t         bin;
4140         int             mnode, page_mnode;
4141         int             szc;
4142         spgcnt_t        npgs, pg_cnt;
4143         pfn_t           pfnum;
4144         int             mtype;
4145         int             flags = 0;
4146         lgrp_mnode_cookie_t     lgrp_cookie;
4147         lgrp_t          *lgrp;
4148 
4149         REPL_STAT_INCR(ngets);
4150         like_pp = orig_like_pp;
4151         ASSERT(PAGE_EXCL(like_pp));
4152 
4153         szc = like_pp->p_szc;
4154         npgs = page_get_pagecnt(szc);
4155         /*
4156          * Now we reset like_pp to the base page_t.
4157          * That way, we won't walk past the end of this 'szc' page.
4158          */
4159         pfnum = PFN_BASE(like_pp->p_pagenum, szc);
4160         like_pp = page_numtopp_nolock(pfnum);
4161         ASSERT(like_pp->p_szc == szc);
4162 
4163         if (PP_ISNORELOC(like_pp)) {
4164                 ASSERT(kcage_on);
4165                 REPL_STAT_INCR(ngets_noreloc);
4166                 flags = PGI_RELOCONLY;
4167         } else if (pgrflags & PGR_NORELOC) {
4168                 ASSERT(kcage_on);
4169                 REPL_STAT_INCR(npgr_noreloc);
4170                 flags = PG_NORELOC;
4171         }
4172 
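             /*
              * At this point flags selects where a replacement page may come
              * from: PGI_RELOCONLY when the original page sits in the kernel
              * cage (the replacement must be relocatable), or PG_NORELOC when
              * the caller explicitly asked for a cage page.
              */
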
4173         /*
4174          * Kernel pages must always be replaced with the same size
4175          * pages, since we cannot properly handle demotion of kernel
4176          * pages.
4177          */
4178         if (PP_ISKAS(like_pp))
4179                 pgrflags |= PGR_SAMESZC;
4180 
4181         /* LINTED */
4182         MTYPE_PGR_INIT(mtype, flags, like_pp, page_mnode, npgs);
4183 
4184         while (npgs) {
4185                 pplist = NULL;
4186                 for (;;) {
4187                         pg_cnt = page_get_pagecnt(szc);
4188                         bin = PP_2_BIN(like_pp);
4189                         ASSERT(like_pp->p_szc == orig_like_pp->p_szc);
4190                         ASSERT(pg_cnt <= npgs);
4191 
4192                         /*
4193                          * If an lgroup was specified, try to get the
4194                          * page from that lgroup.
4195                          * NOTE: Must be careful with code below because
4196                          *       lgroup may disappear and reappear since there
4197                          *       is no locking for lgroup here.
4198                          */
4199                         if (LGRP_EXISTS(lgrp_target)) {
4200                                 /*
4201                                  * Keep local variable for lgroup separate
4202                                  * from lgroup argument since this code should
4203                                  * only be exercised when lgroup argument
4204                                  * exists....
4205                                  */
4206                                 lgrp = lgrp_target;
4207 
4208                                 /* Try the lgroup's freelists first */
4209                                 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4210                                     LGRP_SRCH_LOCAL);
4211                                 while ((pplist == NULL) &&
4212                                     (mnode = lgrp_memnode_choose(&lgrp_cookie))
4213                                     != -1) {
4214                                         pplist =
4215                                             page_get_mnode_freelist(mnode, bin,
4216                                             mtype, szc, flags);
4217                                 }
4218 
4219                                 /*
4220                                  * Now try its cachelists if this is a
4221                                  * small page. Don't need to do it for
4222                                  * larger ones since page_freelist_coalesce()
4223                                  * already failed.
4224                                  */
4225                                 if (pplist != NULL || szc != 0)
4226                                         break;
4227 
4228                                 /* Now try its cachelists */
4229                                 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4230                                     LGRP_SRCH_LOCAL);
4231 
4232                                 while ((pplist == NULL) &&
4233                                     (mnode = lgrp_memnode_choose(&lgrp_cookie))
4234                                     != -1) {
4235                                         pplist =
4236                                             page_get_mnode_cachelist(bin, flags,
4237                                             mnode, mtype);
4238                                 }
4239                                 if (pplist != NULL) {
4240                                         page_hashout(pplist, NULL);
4241                                         PP_SETAGED(pplist);
4242                                         REPL_STAT_INCR(nhashout);
4243                                         break;
4244                                 }
4245                                 /* Done looking in this lgroup. Bail out. */
4246                                 break;
4247                         }
4248 
4249                         /*
4250                          * No lgroup was specified (or the lgroup was
4251                          * removed by DR), so just try to get the page as
4252                          * close to like_pp's mnode as possible.
4253                          * First try the local freelist...
4254                          */
4255                         mnode = PP_2_MEM_NODE(like_pp);
4256                         pplist = page_get_mnode_freelist(mnode, bin,
4257                             mtype, szc, flags);
4258                         if (pplist != NULL)
4259                                 break;
4260 
4261                         REPL_STAT_INCR(nnofree);
4262 
4263                         /*
4264                          * ...then the local cachelist. Don't need to do it for
4265                          * larger pages because page_freelist_coalesce() already
4266                          * failed there anyway.
4267                          */
4268                         if (szc == 0) {
4269                                 pplist = page_get_mnode_cachelist(bin, flags,
4270                                     mnode, mtype);
4271                                 if (pplist != NULL) {
4272                                         page_hashout(pplist, NULL);
4273                                         PP_SETAGED(pplist);
4274                                         REPL_STAT_INCR(nhashout);
4275                                         break;
4276                                 }
4277                         }
4278 
4279                         /* Now try remote freelists */
4280                         page_mnode = mnode;
4281                         lgrp =
4282                             lgrp_hand_to_lgrp(MEM_NODE_2_LGRPHAND(page_mnode));
4283                         LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4284                             LGRP_SRCH_HIER);
4285                         while (pplist == NULL &&
4286                             (mnode = lgrp_memnode_choose(&lgrp_cookie))
4287                             != -1) {
4288                                 /*
4289                                  * Skip local mnode.
4290                                  */
4291                                 if ((mnode == page_mnode) ||
4292                                     (mem_node_config[mnode].exists == 0))
4293                                         continue;
4294 
4295                                 pplist = page_get_mnode_freelist(mnode,
4296                                     bin, mtype, szc, flags);
4297                         }
4298 
4299                         if (pplist != NULL)
4300                                 break;
4301 
4302 
4303                         /* Now try remote cachelists */
4304                         LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4305                             LGRP_SRCH_HIER);
4306                         while (pplist == NULL && szc == 0) {
4307                                 mnode = lgrp_memnode_choose(&lgrp_cookie);
4308                                 if (mnode == -1)
4309                                         break;
4310                                 /*
4311                                  * Skip local mnode.
4312                                  */
4313                                 if ((mnode == page_mnode) ||
4314                                     (mem_node_config[mnode].exists == 0))
4315                                         continue;
4316 
4317                                 pplist = page_get_mnode_cachelist(bin,
4318                                     flags, mnode, mtype);
4319 
4320                                 if (pplist != NULL) {
4321                                         page_hashout(pplist, NULL);
4322                                         PP_SETAGED(pplist);
4323                                         REPL_STAT_INCR(nhashout);
4324                                         break;
4325                                 }
4326                         }
4327 
4328                         /*
4329                          * Break out of the loop under the following cases:
4330                          * - If we successfully got a page.
4331                          * - If pgrflags specified only returning a specific
4332                          *   page size and we could not find that page size.
4333                          * - If we could not satisfy the request with PAGESIZE
4334                          *   or larger pages.
4335                          */
4336                         if (pplist != NULL || szc == 0)
4337                                 break;
4338 
4339                         if ((pgrflags & PGR_SAMESZC) || pgrppgcp) {
4340                                 /* try to find contig page */
4341 
4342                                 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4343                                     LGRP_SRCH_HIER);
4344 
4345                                 while ((pplist == NULL) &&
4346                                     (mnode =
4347                                     lgrp_memnode_choose(&lgrp_cookie))
4348                                     != -1) {
4349                                         pplist = page_get_contig_pages(
4350                                             mnode, bin, mtype, szc,
4351                                             flags | PGI_PGCPHIPRI);
4352                                 }
4353                                 break;
4354                         }
4355 
4356                         /*
4357                          * The correct thing to do here is try the next
4358                          * page size down using szc--. Due to a bug
4359                          * with the processing of HAT_RELOAD_SHARE
4360                          * where the sfmmu_ttecnt arrays of all
4361                          * hats sharing an ISM segment don't get updated,
4362                          * using intermediate size pages for relocation
4363                          * can lead to continuous page faults.
4364                          */
4365                         szc = 0;
4366                 }
4367 
4368                 if (pplist != NULL) {
4369                         DTRACE_PROBE4(page__get,
4370                             lgrp_t *, lgrp,
4371                             int, mnode,
4372                             ulong_t, bin,
4373                             uint_t, flags);
4374 
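                             /*
                              * Transfer the constituent pages of the chunk we
                              * just obtained from pplist onto the caller's list
                              * pl, clearing their free/aged state and stepping
                              * like_pp past them.
                              */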
4375                         while (pplist != NULL && pg_cnt--) {
4376                                 ASSERT(pplist != NULL);
4377                                 pp = pplist;
4378                                 page_sub(&pplist, pp);
4379                                 PP_CLRFREE(pp);
4380                                 PP_CLRAGED(pp);
4381                                 page_list_concat(&pl, &pp);
4382                                 npgs--;
4383                                 like_pp = like_pp + 1;
4384                                 REPL_STAT_INCR(nnext_pp);
4385                         }
4386                         ASSERT(pg_cnt == 0);
4387                 } else {
4388                         break;
4389                 }
4390         }
4391 
4392         if (npgs) {
4393                 /*
4394                  * We were unable to allocate the necessary number
4395                  * of pages.
4396                  * We need to free up anything collected on pl.
4397                  */
4398                 REPL_STAT_INCR(nnopage);
4399                 page_free_replacement_page(pl);
4400                 return (NULL);
4401         } else {
4402                 return (pl);
4403         }
4404 }
4405 
4406 /*
4407  * demote a free large page to its constituent pages
4408  */
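     /*
      * The page must be locked and free when this is called; the demotion
      * itself runs under the mnode freelist lock and leaves pp with a size
      * code of 0.
      */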
4409 void
4410 page_demote_free_pages(page_t *pp)
4411 {
4412 
4413         int mnode;
4414 
4415         ASSERT(pp != NULL);
4416         ASSERT(PAGE_LOCKED(pp));
4417         ASSERT(PP_ISFREE(pp));
4418         ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes);
4419 
4420         mnode = PP_2_MEM_NODE(pp);
4421         page_freelist_lock(mnode);
4422         if (pp->p_szc != 0) {
4423                 (void) page_demote(mnode, PFN_BASE(pp->p_pagenum,
4424                     pp->p_szc), 0, pp->p_szc, 0, PC_NO_COLOR, PC_FREE);
4425         }
4426         page_freelist_unlock(mnode);
4427         ASSERT(pp->p_szc == 0);
4428 }
4429 
4430 /*
4431  * Factor in colorequiv to check additional 'equivalent' bins.
4432  * colorequiv may be set in /etc/system
4433  */
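     /*
      * Example (illustrative numbers): with colorequiv set to 4 and a page
      * size that has 32 colors, sv_a = lowbit(4) - 1 = 2, so
      * colorequivszc[szc] becomes 0x20: the top two bits of the color are
      * ignored and groups of four colors are treated as equivalent.
      */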
4434 void
4435 page_set_colorequiv_arr(void)
4436 {
4437         if (colorequiv > 1) {
4438                 int i;
4439                 uint_t sv_a = lowbit(colorequiv) - 1;
4440 
4441                 if (sv_a > 15)
4442                         sv_a = 15;
4443 
4444                 for (i = 0; i < MMU_PAGE_SIZES; i++) {
4445                         uint_t colors;
4446                         uint_t a = sv_a;
4447 
4448                         if ((colors = hw_page_array[i].hp_colors) <= 1) {
4449                                 continue;
4450                         }
4451                         while ((colors >> a) == 0)
4452                                 a--;
4453                         if ((a << 4) > colorequivszc[i]) {
4454                                 colorequivszc[i] = (a << 4);
4455                         }
4456                 }
4457         }
4458 }