1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
  23  */
  24 
  25 /*      Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989  AT&T        */
  26 /*        All Rights Reserved   */
  27 
  28 /*
  29  * University Copyright- Copyright (c) 1982, 1986, 1988
  30  * The Regents of the University of California
  31  * All Rights Reserved
  32  *
  33  * University Acknowledgment- Portions of this document are derived from
  34  * software developed by the University of California, Berkeley, and its
  35  * contributors.
  36  */
  37 
  38 /*
  39  * VM - physical page management.
  40  */
  41 
  42 #include <sys/types.h>
  43 #include <sys/t_lock.h>
  44 #include <sys/param.h>
  45 #include <sys/systm.h>
  46 #include <sys/errno.h>
  47 #include <sys/time.h>
  48 #include <sys/vnode.h>
  49 #include <sys/vm.h>
  50 #include <sys/vtrace.h>
  51 #include <sys/swap.h>
  52 #include <sys/cmn_err.h>
  53 #include <sys/tuneable.h>
  54 #include <sys/sysmacros.h>
  55 #include <sys/cpuvar.h>
  56 #include <sys/callb.h>
  57 #include <sys/debug.h>
  58 #include <sys/tnf_probe.h>
  59 #include <sys/condvar_impl.h>
  60 #include <sys/mem_config.h>
  61 #include <sys/mem_cage.h>
  62 #include <sys/kmem.h>
  63 #include <sys/atomic.h>
  64 #include <sys/strlog.h>
  65 #include <sys/mman.h>
  66 #include <sys/ontrap.h>
  67 #include <sys/lgrp.h>
  68 #include <sys/vfs.h>
  69 
  70 #include <vm/hat.h>
  71 #include <vm/anon.h>
  72 #include <vm/page.h>
  73 #include <vm/seg.h>
  74 #include <vm/pvn.h>
  75 #include <vm/seg_kmem.h>
  76 #include <vm/vm_dep.h>
  77 #include <sys/vm_usage.h>
  78 #include <fs/fs_subr.h>
  79 #include <sys/ddi.h>
  80 #include <sys/modctl.h>
  81 
  82 static pgcnt_t max_page_get;    /* max page_get request size in pages */
  83 pgcnt_t total_pages = 0;        /* total number of pages (used by /proc) */
  84 
  85 /*
  86  * freemem_lock protects all freemem variables:
  87  * availrmem. This lock also protects the globals that track
  88  * availrmem changes for accurate kernel footprint calculation.
  89  * See below for an explanation of these
  90  * globals.
  91  */
  92 kmutex_t freemem_lock;
  93 pgcnt_t availrmem;
  94 pgcnt_t availrmem_initial;
  95 
  96 /*
  97  * These globals track availrmem changes to get a more accurate
  98  * estimate of the kernel size. Historically pp_kernel is used for
  99  * kernel size and is based on availrmem. But availrmem is adjusted for
 100  * locked pages in the system not just for kernel locked pages.
 101  * These new counters will track the pages locked through segvn and
 102  * by explicit user locking.
 103  *
 104  * pages_locked : How many pages are locked because of user specified
 105  * locking through mlock or plock.
 106  *
 107  * pages_useclaim, pages_claimed : These two variables track the
 108  * claim adjustments because of the protection changes on a segvn segment.
 109  *
 110  * All these globals are protected by the same lock which protects availrmem.
 111  */
 112 pgcnt_t pages_locked = 0;
 113 pgcnt_t pages_useclaim = 0;
 114 pgcnt_t pages_claimed = 0;
 115 
 116 
 117 /*
 118  * new_freemem_lock protects freemem, freemem_wait & freemem_cv.
 119  */
 120 static kmutex_t new_freemem_lock;
 121 static uint_t   freemem_wait;   /* someone waiting for freemem */
 122 static kcondvar_t freemem_cv;
 123 
 124 /*
 125  * The logical page free list is maintained as two lists, the 'free'
 126  * and the 'cache' lists.
 127  * The free list contains those pages that should be reused first.
 128  *
 129  * The implementation of the lists is machine dependent.
 130  * page_get_freelist(), page_get_cachelist(),
 131  * page_list_sub(), and page_list_add()
 132  * form the interface to the machine dependent implementation.
 133  *
 134  * Pages with p_free set are on the cache list.
 135  * Pages with p_free and p_age set are on the free list,
 136  * Pages with p_free and p_age set are on the free list.
 137  * A page may be locked while on either list.
 138  */
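
/*
 * Illustrative sketch only: page_example_is_on_freelist() is a
 * hypothetical helper, not part of the original file.  It simply
 * restates the flag convention above using the existing PP_ISFREE()
 * and PP_ISAGED() macros (used elsewhere in this file).
 */
static int
page_example_is_on_freelist(page_t *pp)
{
        /* p_free alone means cache list; p_free plus p_age means free list */
        return (PP_ISFREE(pp) && PP_ISAGED(pp));
}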
 139 
 140 /*
 141  * free list accounting stuff.
 142  *
 143  *
 144  * Spread out the value for the number of pages on the
 145  * page free and page cache lists.  If there is just one
 146  * value, then it must be under just one lock.
 147  * The lock contention and cache traffic are a real bother.
 148  *
 149  * When we acquire and then drop a single pcf lock
 150  * we can start in the middle of the array of pcf structures.
 151  * If we acquire more than one pcf lock at a time, we need to
 152  * start at the front to avoid deadlocking.
 153  *
 154  * pcf_count holds the number of pages in each pool.
 155  *
 156  * pcf_block is set when page_create_get_something() has asked the
 157  * PSM page freelist and page cachelist routines without specifying
 158  * a color and nothing came back.  This is used to block anything
 159  * else from moving pages from one list to the other while the
 160  * lists are searched again.  If a page is freed while pcf_block is
 161  * set, then pcf_reserve is incremented.  pcgs_unblock() takes care
 162  * of clearing pcf_block, doing the wakeups, etc.
 163  */
 164 
 165 #define MAX_PCF_FANOUT NCPU
 166 static uint_t pcf_fanout = 1; /* Will get changed at boot time */
 167 static uint_t pcf_fanout_mask = 0;
 168 
 169 struct pcf {
 170         kmutex_t        pcf_lock;       /* protects the structure */
 171         uint_t          pcf_count;      /* page count */
 172         uint_t          pcf_wait;       /* number of waiters */
 173         uint_t          pcf_block;      /* pcgs flag to page_free() */
 174         uint_t          pcf_reserve;    /* pages freed after pcf_block set */
 175         uint_t          pcf_fill[10];   /* to line up on the caches */
 176 };
 177 
 178 /*
 179  * The PCF_INDEX hash needs to be dynamic (every so often the hash changes
 180  * which bucket a cpu hashes to).  This is done to prevent a drain condition
 181  * from happening.  This drain condition will occur when pcf_count decrement
 182  * occurs on cpu A and the increment of pcf_count always occurs on cpu B.  An
 183  * example of this shows up with device interrupts.  The dma buffer is allocated
 184  * by the cpu requesting the IO thus the pcf_count is decremented based on that.
 185  * When the memory is returned by the interrupt thread, the pcf_count will be
 186  * incremented based on the cpu servicing the interrupt.
 187  */
 188 static struct pcf pcf[MAX_PCF_FANOUT];
 189 #define PCF_INDEX() ((int)(((long)CPU->cpu_seqid) + \
 190         (randtick() >> 24)) & (pcf_fanout_mask))
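
/*
 * Illustrative sketch only: pcf_example_return_one() is a hypothetical
 * helper, not an existing routine.  It shows the usual pattern for
 * touching a single bucket: pick one with PCF_INDEX(), take its
 * pcf_lock, and adjust pcf_count under that lock.
 */
static void
pcf_example_return_one(void)
{
        struct pcf      *p;

        p = &pcf[PCF_INDEX()];
        mutex_enter(&p->pcf_lock);
        p->pcf_count++;         /* give one page back to this bucket */
        /* real callers also honor pcf_block/pcf_wait; see page_free() */
        mutex_exit(&p->pcf_lock);
}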
 191 
 192 static int pcf_decrement_bucket(pgcnt_t);
 193 static int pcf_decrement_multiple(pgcnt_t *, pgcnt_t, int);
 194 
 195 kmutex_t        pcgs_lock;              /* serializes page_create_get_ */
 196 kmutex_t        pcgs_cagelock;          /* serializes NOSLEEP cage allocs */
 197 kmutex_t        pcgs_wait_lock;         /* used for delay in pcgs */
 198 static kcondvar_t       pcgs_cv;        /* cv for delay in pcgs */
 199 
 200 #ifdef VM_STATS
 201 
 202 /*
 203  * No locks, but so what, they are only statistics.
 204  */
 205 
 206 static struct page_tcnt {
 207         int     pc_free_cache;          /* free's into cache list */
 208         int     pc_free_dontneed;       /* free's with dontneed */
 209         int     pc_free_pageout;        /* free's from pageout */
 210         int     pc_free_free;           /* free's into free list */
 211         int     pc_free_pages;          /* free's into large page free list */
 212         int     pc_destroy_pages;       /* large page destroy's */
 213         int     pc_get_cache;           /* get's from cache list */
 214         int     pc_get_free;            /* get's from free list */
 215         int     pc_reclaim;             /* reclaim's */
 216         int     pc_abortfree;           /* abort's of free pages */
 217         int     pc_find_hit;            /* find's that find page */
 218         int     pc_find_miss;           /* find's that don't find page */
 219         int     pc_destroy_free;        /* # of free pages destroyed */
 220 #define PC_HASH_CNT     (4*PAGE_HASHAVELEN)
 221         int     pc_find_hashlen[PC_HASH_CNT+1];
 222         int     pc_addclaim_pages;
 223         int     pc_subclaim_pages;
 224         int     pc_free_replacement_page[2];
 225         int     pc_try_demote_pages[6];
 226         int     pc_demote_pages[2];
 227 } pagecnt;
 228 
 229 uint_t  hashin_count;
 230 uint_t  hashin_not_held;
 231 uint_t  hashin_already;
 232 
 233 uint_t  hashout_count;
 234 uint_t  hashout_not_held;
 235 
 236 uint_t  page_create_count;
 237 uint_t  page_create_not_enough;
 238 uint_t  page_create_not_enough_again;
 239 uint_t  page_create_zero;
 240 uint_t  page_create_hashout;
 241 uint_t  page_create_page_lock_failed;
 242 uint_t  page_create_trylock_failed;
 243 uint_t  page_create_found_one;
 244 uint_t  page_create_hashin_failed;
 245 uint_t  page_create_dropped_phm;
 246 
 247 uint_t  page_create_new;
 248 uint_t  page_create_exists;
 249 uint_t  page_create_putbacks;
 250 uint_t  page_create_overshoot;
 251 
 252 uint_t  page_reclaim_zero;
 253 uint_t  page_reclaim_zero_locked;
 254 
 255 uint_t  page_rename_exists;
 256 uint_t  page_rename_count;
 257 
 258 uint_t  page_lookup_cnt[20];
 259 uint_t  page_lookup_nowait_cnt[10];
 260 uint_t  page_find_cnt;
 261 uint_t  page_exists_cnt;
 262 uint_t  page_exists_forreal_cnt;
 263 uint_t  page_lookup_dev_cnt;
 264 uint_t  get_cachelist_cnt;
 265 uint_t  page_create_cnt[10];
 266 uint_t  alloc_pages[9];
 267 uint_t  page_exphcontg[19];
 268 uint_t  page_create_large_cnt[10];
 269 
 270 /*
 271  * Collects statistics.
 272  */
 273 #define PAGE_HASH_SEARCH(index, pp, vp, off) { \
 274         uint_t  mylen = 0; \
 275                         \
 276         for ((pp) = page_hash[(index)]; (pp); (pp) = (pp)->p_hash, mylen++) { \
 277                 if ((pp)->p_vnode == (vp) && (pp)->p_offset == (off)) \
 278                         break; \
 279         } \
 280         if ((pp) != NULL) \
 281                 pagecnt.pc_find_hit++; \
 282         else \
 283                 pagecnt.pc_find_miss++; \
 284         if (mylen > PC_HASH_CNT) \
 285                 mylen = PC_HASH_CNT; \
 286         pagecnt.pc_find_hashlen[mylen]++; \
 287 }
 288 
 289 #else   /* VM_STATS */
 290 
 291 /*
 292  * Don't collect statistics
 293  */
 294 #define PAGE_HASH_SEARCH(index, pp, vp, off) { \
 295         for ((pp) = page_hash[(index)]; (pp); (pp) = (pp)->p_hash) { \
 296                 if ((pp)->p_vnode == (vp) && (pp)->p_offset == (off)) \
 297                         break; \
 298         } \
 299 }
 300 
 301 #endif  /* VM_STATS */
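
/*
 * Illustrative sketch only: page_hash_lookup_example() is a hypothetical
 * helper, not an existing routine.  The common pattern is to hash
 * [vp, off] to a bucket, take that bucket's hash mutex, and walk the
 * chain with PAGE_HASH_SEARCH(); page_find() below does exactly this.
 */
static page_t *
page_hash_lookup_example(vnode_t *vp, u_offset_t off)
{
        page_t          *pp;
        kmutex_t        *phm;
        ulong_t         index;

        index = PAGE_HASH_FUNC(vp, off);
        phm = PAGE_HASH_MUTEX(index);

        mutex_enter(phm);
        PAGE_HASH_SEARCH(index, pp, vp, off);
        mutex_exit(phm);

        return (pp);
}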
 302 
 303 
 304 
 305 #ifdef DEBUG
 306 #define MEMSEG_SEARCH_STATS
 307 #endif
 308 
 309 #ifdef MEMSEG_SEARCH_STATS
 310 struct memseg_stats {
 311     uint_t nsearch;
 312     uint_t nlastwon;
 313     uint_t nhashwon;
 314     uint_t nnotfound;
 315 } memseg_stats;
 316 
 317 #define MEMSEG_STAT_INCR(v) \
 318         atomic_inc_32(&memseg_stats.v)
 319 #else
 320 #define MEMSEG_STAT_INCR(x)
 321 #endif
 322 
 323 struct memseg *memsegs;         /* list of memory segments */
 324 
 325 /*
 326  * /etc/system tunable to control the large page allocation heuristic.
 327  *
 328  * Setting to LPAP_LOCAL will heavily prefer the local lgroup over remote lgroup
 329  * for large page allocation requests.  If a large page is not readily
 330  * available on the local freelists, we will go through additional effort
 331  * to create a large page, potentially moving smaller pages around to coalesce
 332  * larger pages in the local lgroup.
 333  * Default value of LPAP_DEFAULT will go to remote freelists if large pages
 334  * are not readily available in the local lgroup.
 335  */
 336 enum lpap {
 337         LPAP_DEFAULT,   /* default large page allocation policy */
 338         LPAP_LOCAL      /* local large page allocation policy */
 339 };
 340 
 341 enum lpap lpg_alloc_prefer = LPAP_DEFAULT;
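
/*
 * For illustration only: an administrator would select the local policy
 * with an /etc/system entry such as
 *
 *      set lpg_alloc_prefer = 1
 *
 * where 1 corresponds to LPAP_LOCAL above (the default, 0, is LPAP_DEFAULT).
 */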
 342 
 343 static void page_init_mem_config(void);
 344 static int page_do_hashin(page_t *, vnode_t *, u_offset_t);
 345 static void page_do_hashout(page_t *);
 346 static void page_capture_init();
 347 int page_capture_take_action(page_t *, uint_t, void *);
 348 
 349 static void page_demote_vp_pages(page_t *);
 350 
 351 
 352 void
 353 pcf_init(void)
 354 
 355 {
 356         if (boot_ncpus != -1) {
 357                 pcf_fanout = boot_ncpus;
 358         } else {
 359                 pcf_fanout = max_ncpus;
 360         }
 361 #ifdef sun4v
 362         /*
 363          * Force at least 4 buckets if possible for sun4v.
 364          */
 365         pcf_fanout = MAX(pcf_fanout, 4);
 366 #endif /* sun4v */
 367 
 368         /*
 369          * Clamp to MAX_PCF_FANOUT and round up to a power of 2.
 370          */
 371         pcf_fanout = MIN(pcf_fanout, MAX_PCF_FANOUT);
 372         if (!ISP2(pcf_fanout)) {
 373                 pcf_fanout = 1 << highbit(pcf_fanout);
 374 
 375                 if (pcf_fanout > MAX_PCF_FANOUT) {
 376                         pcf_fanout = 1 << (highbit(MAX_PCF_FANOUT) - 1);
 377                 }
 378         }
 379         pcf_fanout_mask = pcf_fanout - 1;
 380 }
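
/*
 * For example (illustrative numbers only): with boot_ncpus == 6 and
 * MAX_PCF_FANOUT at least 8, the fanout is rounded up to 8 and
 * pcf_fanout_mask becomes 7, so PCF_INDEX() spreads callers across
 * buckets 0 through 7.
 */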
 381 
 382 /*
 383  * vm subsystem related initialization
 384  */
 385 void
 386 vm_init(void)
 387 {
 388         boolean_t callb_vm_cpr(void *, int);
 389 
 390         (void) callb_add(callb_vm_cpr, 0, CB_CL_CPR_VM, "vm");
 391         page_init_mem_config();
 392         page_retire_init();
 393         vm_usage_init();
 394         page_capture_init();
 395 }
 396 
 397 /*
 398  * This function is called at startup and when memory is added or deleted.
 399  */
 400 void
 401 init_pages_pp_maximum()
 402 {
 403         static pgcnt_t p_min;
 404         static pgcnt_t pages_pp_maximum_startup;
 405         static pgcnt_t avrmem_delta;
 406         static int init_done;
 407         static int user_set;    /* true if set in /etc/system */
 408 
 409         if (init_done == 0) {
 410 
 411                 /* If the user specified a value, save it */
 412                 if (pages_pp_maximum != 0) {
 413                         user_set = 1;
 414                         pages_pp_maximum_startup = pages_pp_maximum;
 415                 }
 416 
 417                 /*
 418                  * The first-time setting of pages_pp_maximum is based
 419                  * on the value of availrmem just after the start-up
 420                  * allocations. To preserve this relationship at run
 421                  * time, use a delta from availrmem_initial.
 422                  */
 423                 ASSERT(availrmem_initial >= availrmem);
 424                 avrmem_delta = availrmem_initial - availrmem;
 425 
 426                 /* The allowable floor of pages_pp_maximum */
 427                 p_min = tune.t_minarmem + 100;
 428 
 429                 /* Make sure we don't come through here again. */
 430                 init_done = 1;
 431         }
 432         /*
 433          * Determine pages_pp_maximum, the number of currently available
 434          * pages (availrmem) that can't be `locked'. If not set by
 435          * the user, we set it to 4% of the currently available memory
 436          * plus 4MB.
 437          * But we also insist that it be greater than tune.t_minarmem;
 438          * otherwise a process could lock down a lot of memory, get swapped
 439          * out, and never have enough to get swapped back in.
 440          */
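        /*
         * Illustrative numbers only: with 4K pages and roughly 8GB of
         * available memory (about 2097152 pages), the default works out
         * to 2097152 / 25 + btop(4MB) = 83886 + 1024, i.e. about 84910
         * pages, or roughly 331MB that may not be locked.
         */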
 441         if (user_set)
 442                 pages_pp_maximum = pages_pp_maximum_startup;
 443         else
 444                 pages_pp_maximum = ((availrmem_initial - avrmem_delta) / 25)
 445                     + btop(4 * 1024 * 1024);
 446 
 447         if (pages_pp_maximum <= p_min) {
 448                 pages_pp_maximum = p_min;
 449         }
 450 }
 451 
 452 void
 453 set_max_page_get(pgcnt_t target_total_pages)
 454 {
 455         max_page_get = target_total_pages / 2;
 456 }
 457 
 458 static pgcnt_t pending_delete;
 459 
 460 /*ARGSUSED*/
 461 static void
 462 page_mem_config_post_add(
 463         void *arg,
 464         pgcnt_t delta_pages)
 465 {
 466         set_max_page_get(total_pages - pending_delete);
 467         init_pages_pp_maximum();
 468 }
 469 
 470 /*ARGSUSED*/
 471 static int
 472 page_mem_config_pre_del(
 473         void *arg,
 474         pgcnt_t delta_pages)
 475 {
 476         pgcnt_t nv;
 477 
 478         nv = atomic_add_long_nv(&pending_delete, (spgcnt_t)delta_pages);
 479         set_max_page_get(total_pages - nv);
 480         return (0);
 481 }
 482 
 483 /*ARGSUSED*/
 484 static void
 485 page_mem_config_post_del(
 486         void *arg,
 487         pgcnt_t delta_pages,
 488         int cancelled)
 489 {
 490         pgcnt_t nv;
 491 
 492         nv = atomic_add_long_nv(&pending_delete, -(spgcnt_t)delta_pages);
 493         set_max_page_get(total_pages - nv);
 494         if (!cancelled)
 495                 init_pages_pp_maximum();
 496 }
 497 
 498 static kphysm_setup_vector_t page_mem_config_vec = {
 499         KPHYSM_SETUP_VECTOR_VERSION,
 500         page_mem_config_post_add,
 501         page_mem_config_pre_del,
 502         page_mem_config_post_del,
 503 };
 504 
 505 static void
 506 page_init_mem_config(void)
 507 {
 508         int ret;
 509 
 510         ret = kphysm_setup_func_register(&page_mem_config_vec, (void *)NULL);
 511         ASSERT(ret == 0);
 512 }
 513 
 514 /*
 515  * Evenly spread out the PCF counters for large free pages
 516  */
 517 static void
 518 page_free_large_ctr(pgcnt_t npages)
 519 {
 520         static struct pcf       *p = pcf;
 521         pgcnt_t                 lump;
 522 
 523         freemem += npages;
 524 
 525         lump = roundup(npages, pcf_fanout) / pcf_fanout;
 526 
 527         while (npages > 0) {
 528 
 529                 ASSERT(!p->pcf_block);
 530 
 531                 if (lump < npages) {
 532                         p->pcf_count += (uint_t)lump;
 533                         npages -= lump;
 534                 } else {
 535                         p->pcf_count += (uint_t)npages;
 536                         npages = 0;
 537                 }
 538 
 539                 ASSERT(!p->pcf_wait);
 540 
 541                 if (++p > &pcf[pcf_fanout - 1])
 542                         p = pcf;
 543         }
 544 
 545         ASSERT(npages == 0);
 546 }
 547 
 548 /*
 549  * Add a physical chunk of memory to the system free lists during startup.
 550  * Platform specific startup() allocates the memory for the page structs.
 551  *
 552  * num  - number of page structures
 553  * base - page number (pfn) to be associated with the first page.
 554  *
 555  * Since we are doing this during startup (i.e. single threaded), we will
 556  * use shortcut routines to avoid any locking overhead while putting all
 557  * these pages on the freelists.
 558  *
 559  * NOTE: Any changes performed to page_free(), must also be performed to
 560  *       add_physmem() since this is how we initialize all page_t's at
 561  *       boot time.
 562  */
 563 void
 564 add_physmem(
 565         page_t  *pp,
 566         pgcnt_t num,
 567         pfn_t   pnum)
 568 {
 569         page_t  *root = NULL;
 570         uint_t  szc = page_num_pagesizes() - 1;
 571         pgcnt_t large = page_get_pagecnt(szc);
 572         pgcnt_t cnt = 0;
 573 
 574         TRACE_2(TR_FAC_VM, TR_PAGE_INIT,
 575             "add_physmem:pp %p num %lu", pp, num);
 576 
 577         /*
 578          * Arbitrarily limit the max page_get request
 579          * to 1/2 of the page structs we have.
 580          */
 581         total_pages += num;
 582         set_max_page_get(total_pages);
 583 
 584         PLCNT_MODIFY_MAX(pnum, (long)num);
 585 
 586         /*
 587          * The physical space for the pages array
 588          * representing ram pages has already been
 589          * allocated.  Here we initialize each lock
 590          * in the page structure, and put each on
 591          * the free list
 592          */
 593         for (; num; pp++, pnum++, num--) {
 594 
 595                 /*
 596                  * this needs to fill in the page number
 597                  * and do any other arch specific initialization
 598                  */
 599                 add_physmem_cb(pp, pnum);
 600 
 601                 pp->p_lckcnt = 0;
 602                 pp->p_cowcnt = 0;
 603                 pp->p_slckcnt = 0;
 604 
 605                 /*
 606                  * Initialize the page lock as unlocked, since nobody
 607                  * can see or access this page yet.
 608                  */
 609                 pp->p_selock = 0;
 610 
 611                 /*
 612                  * Initialize IO lock
 613                  */
 614                 page_iolock_init(pp);
 615 
 616                 /*
 617                  * initialize other fields in the page_t
 618                  */
 619                 PP_SETFREE(pp);
 620                 page_clr_all_props(pp);
 621                 PP_SETAGED(pp);
 622                 pp->p_offset = (u_offset_t)-1;
 623                 pp->p_next = pp;
 624                 pp->p_prev = pp;
 625 
 626                 /*
 627                  * Simple case: System doesn't support large pages.
 628                  */
 629                 if (szc == 0) {
 630                         pp->p_szc = 0;
 631                         page_free_at_startup(pp);
 632                         continue;
 633                 }
 634 
 635                 /*
 636                  * Handle unaligned pages: we collect them up onto
 637                  * the root page until we have a full large page.
 638                  */
 639                 if (!IS_P2ALIGNED(pnum, large)) {
 640 
 641                         /*
 642                          * If not in a large page,
 643                          * just free as small page.
 644                          */
 645                         if (root == NULL) {
 646                                 pp->p_szc = 0;
 647                                 page_free_at_startup(pp);
 648                                 continue;
 649                         }
 650 
 651                         /*
 652                          * Link a constituent page into the large page.
 653                          */
 654                         pp->p_szc = szc;
 655                         page_list_concat(&root, &pp);
 656 
 657                         /*
 658                          * When large page is fully formed, free it.
 659                          */
 660                         if (++cnt == large) {
 661                                 page_free_large_ctr(cnt);
 662                                 page_list_add_pages(root, PG_LIST_ISINIT);
 663                                 root = NULL;
 664                                 cnt = 0;
 665                         }
 666                         continue;
 667                 }
 668 
 669                 /*
 670                  * At this point we have a page number which
 671                  * is aligned. We assert that we aren't already
 672                  * in a different large page.
 673                  */
 674                 ASSERT(IS_P2ALIGNED(pnum, large));
 675                 ASSERT(root == NULL && cnt == 0);
 676 
 677                 /*
 678                  * If there are not enough pages left to form
 679                  * a large page, just free the small page.
 680                  */
 681                 if (num < large) {
 682                         pp->p_szc = 0;
 683                         page_free_at_startup(pp);
 684                         continue;
 685                 }
 686 
 687                 /*
 688                  * Otherwise start a new large page.
 689                  */
 690                 pp->p_szc = szc;
 691                 cnt++;
 692                 root = pp;
 693         }
 694         ASSERT(root == NULL && cnt == 0);
 695 }
 696 
 697 /*
 698  * Find a page representing the specified [vp, offset].
 699  * If we find the page but it is in transit coming in,
 700  * it will have an "exclusive" lock and we wait for
 701  * the i/o to complete.  A page found on the free list
 702  * is always reclaimed and then locked.  On success, the page
 703  * is locked, its data is valid and it isn't on the free
 704  * list, while a NULL is returned if the page doesn't exist.
 705  */
 706 page_t *
 707 page_lookup(vnode_t *vp, u_offset_t off, se_t se)
 708 {
 709         return (page_lookup_create(vp, off, se, NULL, NULL, 0));
 710 }
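
/*
 * Illustrative sketch only: page_lookup_example() is a hypothetical
 * caller, not part of the original file.  A typical consumer looks the
 * page up SE_SHARED, relies on the lock to keep the page's identity
 * stable and off the free list while it is used, and then drops the
 * lock with page_unlock().
 */
static void
page_lookup_example(vnode_t *vp, u_offset_t off)
{
        page_t  *pp;

        pp = page_lookup(vp, off, SE_SHARED);
        if (pp != NULL) {
                /* pp is locked shared, its data is valid, it is not free */
                page_unlock(pp);
        }
}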
 711 
 712 /*
 713  * Find a page representing the specified [vp, offset].
 714  * We either return the one we found or, if passed in,
 715  * create one with identity of [vp, offset] of the
 716  * pre-allocated page. If we find an existing page but it is
 717  * in transit coming in, it will have an "exclusive" lock
 718  * and we wait for the i/o to complete.  A page found on
 719  * the free list is always reclaimed and then locked.
 720  * On success, the page is locked, its data is valid and
 721  * it isn't on the free list, while a NULL is returned
 722  * if the page doesn't exist and newpp is NULL.
 723  */
 724 page_t *
 725 page_lookup_create(
 726         vnode_t *vp,
 727         u_offset_t off,
 728         se_t se,
 729         page_t *newpp,
 730         spgcnt_t *nrelocp,
 731         int flags)
 732 {
 733         page_t          *pp;
 734         kmutex_t        *phm;
 735         ulong_t         index;
 736         uint_t          hash_locked;
 737         uint_t          es;
 738 
 739         ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
 740         VM_STAT_ADD(page_lookup_cnt[0]);
 741         ASSERT(newpp ? PAGE_EXCL(newpp) : 1);
 742 
 743         /*
 744          * Acquire the appropriate page hash lock since
 745          * we have to search the hash list.  Pages that
 746          * hash to this list can't change identity while
 747          * this lock is held.
 748          */
 749         hash_locked = 0;
 750         index = PAGE_HASH_FUNC(vp, off);
 751         phm = NULL;
 752 top:
 753         PAGE_HASH_SEARCH(index, pp, vp, off);
 754         if (pp != NULL) {
 755                 VM_STAT_ADD(page_lookup_cnt[1]);
 756                 es = (newpp != NULL) ? 1 : 0;
 757                 es |= flags;
 758                 if (!hash_locked) {
 759                         VM_STAT_ADD(page_lookup_cnt[2]);
 760                         if (!page_try_reclaim_lock(pp, se, es)) {
 761                                 /*
 762                                  * On a miss, acquire the phm.  Then
 763                                  * next time, page_lock() will be called,
 764                                  * causing a wait if the page is busy.
 765                                  * just looping with page_trylock() would
 766                                  * get pretty boring.
 767                                  */
 768                                 VM_STAT_ADD(page_lookup_cnt[3]);
 769                                 phm = PAGE_HASH_MUTEX(index);
 770                                 mutex_enter(phm);
 771                                 hash_locked = 1;
 772                                 goto top;
 773                         }
 774                 } else {
 775                         VM_STAT_ADD(page_lookup_cnt[4]);
 776                         if (!page_lock_es(pp, se, phm, P_RECLAIM, es)) {
 777                                 VM_STAT_ADD(page_lookup_cnt[5]);
 778                                 goto top;
 779                         }
 780                 }
 781 
 782                 /*
 783                  * Since `pp' is locked it can not change identity now.
 784                  * Reconfirm we locked the correct page.
 785                  *
 786                  * Both the p_vnode and p_offset *must* be cast volatile
 787                  * to force a reload of their values: The PAGE_HASH_SEARCH
 788                  * macro will have stuffed p_vnode and p_offset into
 789                  * registers before calling page_trylock(); another thread,
 790                  * actually holding the hash lock, could have changed the
 791                  * page's identity in memory, but our registers would not
 792                  * be changed, fooling the reconfirmation.  If the hash
 793                  * lock was held during the search, the casting would
 794                  * not be needed.
 795                  */
 796                 VM_STAT_ADD(page_lookup_cnt[6]);
 797                 if (((volatile struct vnode *)(pp->p_vnode) != vp) ||
 798                     ((volatile u_offset_t)(pp->p_offset) != off)) {
 799                         VM_STAT_ADD(page_lookup_cnt[7]);
 800                         if (hash_locked) {
 801                                 panic("page_lookup_create: lost page %p",
 802                                     (void *)pp);
 803                                 /*NOTREACHED*/
 804                         }
 805                         page_unlock(pp);
 806                         phm = PAGE_HASH_MUTEX(index);
 807                         mutex_enter(phm);
 808                         hash_locked = 1;
 809                         goto top;
 810                 }
 811 
 812                 /*
 813                  * If page_trylock() was called, then pp may still be on
 814                  * the cachelist (can't be on the free list, it would not
 815                  * have been found in the search).  If it is on the
 816                  * cachelist it must be pulled now. To pull the page from
 817                  * the cachelist, it must be exclusively locked.
 818                  *
 819                  * The other big difference between page_trylock() and
 820                  * page_lock(), is that page_lock() will pull the
 821                  * page from whatever free list (the cache list in this
 822                  * case) the page is on.  If page_trylock() was used
 823                  * above, then we have to do the reclaim ourselves.
 824                  */
 825                 if ((!hash_locked) && (PP_ISFREE(pp))) {
 826                         ASSERT(PP_ISAGED(pp) == 0);
 827                         VM_STAT_ADD(page_lookup_cnt[8]);
 828 
 829                         /*
 830                          * page_reclaim will ensure that we
 831                          * have this page exclusively
 832                          */
 833 
 834                         if (!page_reclaim(pp, NULL)) {
 835                                 /*
 836                                  * Page_reclaim dropped whatever lock
 837                                  * we held.
 838                                  */
 839                                 VM_STAT_ADD(page_lookup_cnt[9]);
 840                                 phm = PAGE_HASH_MUTEX(index);
 841                                 mutex_enter(phm);
 842                                 hash_locked = 1;
 843                                 goto top;
 844                         } else if (se == SE_SHARED && newpp == NULL) {
 845                                 VM_STAT_ADD(page_lookup_cnt[10]);
 846                                 page_downgrade(pp);
 847                         }
 848                 }
 849 
 850                 if (hash_locked) {
 851                         mutex_exit(phm);
 852                 }
 853 
 854                 if (newpp != NULL && pp->p_szc < newpp->p_szc &&
 855                     PAGE_EXCL(pp) && nrelocp != NULL) {
 856                         ASSERT(nrelocp != NULL);
 857                         (void) page_relocate(&pp, &newpp, 1, 1, nrelocp,
 858                             NULL);
 859                         if (*nrelocp > 0) {
 860                                 VM_STAT_COND_ADD(*nrelocp == 1,
 861                                     page_lookup_cnt[11]);
 862                                 VM_STAT_COND_ADD(*nrelocp > 1,
 863                                     page_lookup_cnt[12]);
 864                                 pp = newpp;
 865                                 se = SE_EXCL;
 866                         } else {
 867                                 if (se == SE_SHARED) {
 868                                         page_downgrade(pp);
 869                                 }
 870                                 VM_STAT_ADD(page_lookup_cnt[13]);
 871                         }
 872                 } else if (newpp != NULL && nrelocp != NULL) {
 873                         if (PAGE_EXCL(pp) && se == SE_SHARED) {
 874                                 page_downgrade(pp);
 875                         }
 876                         VM_STAT_COND_ADD(pp->p_szc < newpp->p_szc,
 877                             page_lookup_cnt[14]);
 878                         VM_STAT_COND_ADD(pp->p_szc == newpp->p_szc,
 879                             page_lookup_cnt[15]);
 880                         VM_STAT_COND_ADD(pp->p_szc > newpp->p_szc,
 881                             page_lookup_cnt[16]);
 882                 } else if (newpp != NULL && PAGE_EXCL(pp)) {
 883                         se = SE_EXCL;
 884                 }
 885         } else if (!hash_locked) {
 886                 VM_STAT_ADD(page_lookup_cnt[17]);
 887                 phm = PAGE_HASH_MUTEX(index);
 888                 mutex_enter(phm);
 889                 hash_locked = 1;
 890                 goto top;
 891         } else if (newpp != NULL) {
 892                 /*
 893                  * If we have a preallocated page then
 894                  * insert it now and basically behave like
 895                  * page_create.
 896                  */
 897                 VM_STAT_ADD(page_lookup_cnt[18]);
 898                 /*
 899                  * Since we hold the page hash mutex and
 900                  * just searched for this page, page_hashin
 901                  * had better not fail.  If it does, that
 902                  * means some thread did not follow the
 903                  * page hash mutex rules.  Panic now and
 904                  * get it over with.  As usual, go down
 905                  * holding all the locks.
 906                  */
 907                 ASSERT(MUTEX_HELD(phm));
 908                 if (!page_hashin(newpp, vp, off, phm)) {
 909                         ASSERT(MUTEX_HELD(phm));
 910                         panic("page_lookup_create: hashin failed %p %p %llx %p",
 911                             (void *)newpp, (void *)vp, off, (void *)phm);
 912                         /*NOTREACHED*/
 913                 }
 914                 ASSERT(MUTEX_HELD(phm));
 915                 mutex_exit(phm);
 916                 phm = NULL;
 917                 page_set_props(newpp, P_REF);
 918                 page_io_lock(newpp);
 919                 pp = newpp;
 920                 se = SE_EXCL;
 921         } else {
 922                 VM_STAT_ADD(page_lookup_cnt[19]);
 923                 mutex_exit(phm);
 924         }
 925 
 926         ASSERT(pp ? PAGE_LOCKED_SE(pp, se) : 1);
 927 
 928         ASSERT(pp ? ((PP_ISFREE(pp) == 0) && (PP_ISAGED(pp) == 0)) : 1);
 929 
 930         return (pp);
 931 }
 932 
 933 /*
 934  * Search the hash list for the page representing the
 935  * specified [vp, offset] and return it locked.  Skip
 936  * free pages and pages that cannot be locked as requested.
 937  * Used while attempting to kluster pages.
 938  */
 939 page_t *
 940 page_lookup_nowait(vnode_t *vp, u_offset_t off, se_t se)
 941 {
 942         page_t          *pp;
 943         kmutex_t        *phm;
 944         ulong_t         index;
 945         uint_t          locked;
 946 
 947         ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
 948         VM_STAT_ADD(page_lookup_nowait_cnt[0]);
 949 
 950         index = PAGE_HASH_FUNC(vp, off);
 951         PAGE_HASH_SEARCH(index, pp, vp, off);
 952         locked = 0;
 953         if (pp == NULL) {
 954 top:
 955                 VM_STAT_ADD(page_lookup_nowait_cnt[1]);
 956                 locked = 1;
 957                 phm = PAGE_HASH_MUTEX(index);
 958                 mutex_enter(phm);
 959                 PAGE_HASH_SEARCH(index, pp, vp, off);
 960         }
 961 
 962         if (pp == NULL || PP_ISFREE(pp)) {
 963                 VM_STAT_ADD(page_lookup_nowait_cnt[2]);
 964                 pp = NULL;
 965         } else {
 966                 if (!page_trylock(pp, se)) {
 967                         VM_STAT_ADD(page_lookup_nowait_cnt[3]);
 968                         pp = NULL;
 969                 } else {
 970                         VM_STAT_ADD(page_lookup_nowait_cnt[4]);
 971                         /*
 972                          * See the comment in page_lookup()
 973                          */
 974                         if (((volatile struct vnode *)(pp->p_vnode) != vp) ||
 975                             ((u_offset_t)(pp->p_offset) != off)) {
 976                                 VM_STAT_ADD(page_lookup_nowait_cnt[5]);
 977                                 if (locked) {
 978                                         panic("page_lookup_nowait %p",
 979                                             (void *)pp);
 980                                         /*NOTREACHED*/
 981                                 }
 982                                 page_unlock(pp);
 983                                 goto top;
 984                         }
 985                         if (PP_ISFREE(pp)) {
 986                                 VM_STAT_ADD(page_lookup_nowait_cnt[6]);
 987                                 page_unlock(pp);
 988                                 pp = NULL;
 989                         }
 990                 }
 991         }
 992         if (locked) {
 993                 VM_STAT_ADD(page_lookup_nowait_cnt[7]);
 994                 mutex_exit(phm);
 995         }
 996 
 997         ASSERT(pp ? PAGE_LOCKED_SE(pp, se) : 1);
 998 
 999         return (pp);
1000 }
1001 
1002 /*
1003  * Search the hash list for a page with the specified [vp, off]
1004  * that is known to exist and is already locked.  This routine
1005  * is typically used by segment SOFTUNLOCK routines.
1006  */
1007 page_t *
1008 page_find(vnode_t *vp, u_offset_t off)
1009 {
1010         page_t          *pp;
1011         kmutex_t        *phm;
1012         ulong_t         index;
1013 
1014         ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
1015         VM_STAT_ADD(page_find_cnt);
1016 
1017         index = PAGE_HASH_FUNC(vp, off);
1018         phm = PAGE_HASH_MUTEX(index);
1019 
1020         mutex_enter(phm);
1021         PAGE_HASH_SEARCH(index, pp, vp, off);
1022         mutex_exit(phm);
1023 
1024         ASSERT(pp == NULL || PAGE_LOCKED(pp) || panicstr);
1025         return (pp);
1026 }
1027 
1028 /*
1029  * Determine whether a page with the specified [vp, off]
1030  * currently exists in the system.  Obviously this should
1031  * only be considered as a hint since nothing prevents the
1032  * page from disappearing or appearing immediately after
1033  * the return from this routine. Consequently, we don't
1034  * even bother to lock the list.
1035  */
1036 page_t *
1037 page_exists(vnode_t *vp, u_offset_t off)
1038 {
1039         page_t  *pp;
1040         ulong_t         index;
1041 
1042         ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
1043         VM_STAT_ADD(page_exists_cnt);
1044 
1045         index = PAGE_HASH_FUNC(vp, off);
1046         PAGE_HASH_SEARCH(index, pp, vp, off);
1047 
1048         return (pp);
1049 }
1050 
1051 /*
1052  * Determine if physically contiguous pages exist for [vp, off] - [vp, off +
1053  * page_size(szc)) range.  If they exist and ppa is not NULL, fill the
1054  * ppa array with these pages locked SHARED.  If necessary, reclaim pages
1055  * from the freelist.  Return 1 if contiguous pages exist and 0 otherwise.
1056  *
1057  * If we fail to lock the pages, we still return 1 if the pages exist and
1058  * are contiguous, but then the return value is just a hint and the ppa
1059  * array won't be filled.  Initialize ppa[0] to NULL to distinguish this.
1060  *
1061  * Returns 0 if the pages don't exist or are not physically contiguous.
1062  *
1063  * This routine doesn't work for anonymous(swapfs) pages.
1064  */
1065 int
1066 page_exists_physcontig(vnode_t *vp, u_offset_t off, uint_t szc, page_t *ppa[])
1067 {
1068         pgcnt_t pages;
1069         pfn_t pfn;
1070         page_t *rootpp;
1071         pgcnt_t i;
1072         pgcnt_t j;
1073         u_offset_t save_off = off;
1074         ulong_t index;
1075         kmutex_t *phm;
1076         page_t *pp;
1077         uint_t pszc;
1078         int loopcnt = 0;
1079 
1080         ASSERT(szc != 0);
1081         ASSERT(vp != NULL);
1082         ASSERT(!IS_SWAPFSVP(vp));
1083         ASSERT(!VN_ISKAS(vp));
1084 
1085 again:
1086         if (++loopcnt > 3) {
1087                 VM_STAT_ADD(page_exphcontg[0]);
1088                 return (0);
1089         }
1090 
1091         index = PAGE_HASH_FUNC(vp, off);
1092         phm = PAGE_HASH_MUTEX(index);
1093 
1094         mutex_enter(phm);
1095         PAGE_HASH_SEARCH(index, pp, vp, off);
1096         mutex_exit(phm);
1097 
1098         VM_STAT_ADD(page_exphcontg[1]);
1099 
1100         if (pp == NULL) {
1101                 VM_STAT_ADD(page_exphcontg[2]);
1102                 return (0);
1103         }
1104 
1105         pages = page_get_pagecnt(szc);
1106         rootpp = pp;
1107         pfn = rootpp->p_pagenum;
1108 
1109         if ((pszc = pp->p_szc) >= szc && ppa != NULL) {
1110                 VM_STAT_ADD(page_exphcontg[3]);
1111                 if (!page_trylock(pp, SE_SHARED)) {
1112                         VM_STAT_ADD(page_exphcontg[4]);
1113                         return (1);
1114                 }
1115                 /*
1116                  * Also check whether p_pagenum was modified by DR.
1117                  */
1118                 if (pp->p_szc != pszc || pp->p_vnode != vp ||
1119                     pp->p_offset != off || pp->p_pagenum != pfn) {
1120                         VM_STAT_ADD(page_exphcontg[5]);
1121                         page_unlock(pp);
1122                         off = save_off;
1123                         goto again;
1124                 }
1125                 /*
1126                  * Since szc was non-zero and the vnode and offset matched
1127                  * after we locked the page, it can't become free on us.
1128                  */
1129                 ASSERT(!PP_ISFREE(pp));
1130                 if (!IS_P2ALIGNED(pfn, pages)) {
1131                         page_unlock(pp);
1132                         return (0);
1133                 }
1134                 ppa[0] = pp;
1135                 pp++;
1136                 off += PAGESIZE;
1137                 pfn++;
1138                 for (i = 1; i < pages; i++, pp++, off += PAGESIZE, pfn++) {
1139                         if (!page_trylock(pp, SE_SHARED)) {
1140                                 VM_STAT_ADD(page_exphcontg[6]);
1141                                 pp--;
1142                                 while (i-- > 0) {
1143                                         page_unlock(pp);
1144                                         pp--;
1145                                 }
1146                                 ppa[0] = NULL;
1147                                 return (1);
1148                         }
1149                         if (pp->p_szc != pszc) {
1150                                 VM_STAT_ADD(page_exphcontg[7]);
1151                                 page_unlock(pp);
1152                                 pp--;
1153                                 while (i-- > 0) {
1154                                         page_unlock(pp);
1155                                         pp--;
1156                                 }
1157                                 ppa[0] = NULL;
1158                                 off = save_off;
1159                                 goto again;
1160                         }
1161                         /*
1162                          * The szc matches the previously locked pages with the
1163                          * right identity.  Since this page had the correct szc
1164                          * after we locked it, it can't get freed or destroyed
1165                          * and therefore must have the expected identity.
1166                          */
1167                         ASSERT(!PP_ISFREE(pp));
1168                         if (pp->p_vnode != vp ||
1169                             pp->p_offset != off) {
1170                                 panic("page_exists_physcontig: "
1171                                     "large page identity doesn't match");
1172                         }
1173                         ppa[i] = pp;
1174                         ASSERT(pp->p_pagenum == pfn);
1175                 }
1176                 VM_STAT_ADD(page_exphcontg[8]);
1177                 ppa[pages] = NULL;
1178                 return (1);
1179         } else if (pszc >= szc) {
1180                 VM_STAT_ADD(page_exphcontg[9]);
1181                 if (!IS_P2ALIGNED(pfn, pages)) {
1182                         return (0);
1183                 }
1184                 return (1);
1185         }
1186 
1187         if (!IS_P2ALIGNED(pfn, pages)) {
1188                 VM_STAT_ADD(page_exphcontg[10]);
1189                 return (0);
1190         }
1191 
1192         if (page_numtomemseg_nolock(pfn) !=
1193             page_numtomemseg_nolock(pfn + pages - 1)) {
1194                 VM_STAT_ADD(page_exphcontg[11]);
1195                 return (0);
1196         }
1197 
1198         /*
1199          * We loop over the pages up to 4 times to promote the page size.
1200          * We're extra cautious to promote page size atomically with respect
1201          * to everybody else.  But we can probably optimize into 1 loop if
1202          * this becomes an issue.
1203          */
1204 
1205         for (i = 0; i < pages; i++, pp++, off += PAGESIZE, pfn++) {
1206                 if (!page_trylock(pp, SE_EXCL)) {
1207                         VM_STAT_ADD(page_exphcontg[12]);
1208                         break;
1209                 }
1210                 /*
1211                  * Check whether p_pagenum was modified by DR.
1212                  */
1213                 if (pp->p_pagenum != pfn) {
1214                         page_unlock(pp);
1215                         break;
1216                 }
1217                 if (pp->p_vnode != vp ||
1218                     pp->p_offset != off) {
1219                         VM_STAT_ADD(page_exphcontg[13]);
1220                         page_unlock(pp);
1221                         break;
1222                 }
1223                 if (pp->p_szc >= szc) {
1224                         ASSERT(i == 0);
1225                         page_unlock(pp);
1226                         off = save_off;
1227                         goto again;
1228                 }
1229         }
1230 
1231         if (i != pages) {
1232                 VM_STAT_ADD(page_exphcontg[14]);
1233                 --pp;
1234                 while (i-- > 0) {
1235                         page_unlock(pp);
1236                         --pp;
1237                 }
1238                 return (0);
1239         }
1240 
1241         pp = rootpp;
1242         for (i = 0; i < pages; i++, pp++) {
1243                 if (PP_ISFREE(pp)) {
1244                         VM_STAT_ADD(page_exphcontg[15]);
1245                         ASSERT(!PP_ISAGED(pp));
1246                         ASSERT(pp->p_szc == 0);
1247                         if (!page_reclaim(pp, NULL)) {
1248                                 break;
1249                         }
1250                 } else {
1251                         ASSERT(pp->p_szc < szc);
1252                         VM_STAT_ADD(page_exphcontg[16]);
1253                         (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
1254                 }
1255         }
1256         if (i < pages) {
1257                 VM_STAT_ADD(page_exphcontg[17]);
1258                 /*
1259                  * page_reclaim failed because we were out of memory.
1260                  * Drop the rest of the locks and return because this page
1261                  * must have already been reallocated anyway.
1262                  */
1263                 pp = rootpp;
1264                 for (j = 0; j < pages; j++, pp++) {
1265                         if (j != i) {
1266                                 page_unlock(pp);
1267                         }
1268                 }
1269                 return (0);
1270         }
1271 
1272         off = save_off;
1273         pp = rootpp;
1274         for (i = 0; i < pages; i++, pp++, off += PAGESIZE) {
1275                 ASSERT(PAGE_EXCL(pp));
1276                 ASSERT(!PP_ISFREE(pp));
1277                 ASSERT(!hat_page_is_mapped(pp));
1278                 ASSERT(pp->p_vnode == vp);
1279                 ASSERT(pp->p_offset == off);
1280                 pp->p_szc = szc;
1281         }
1282         pp = rootpp;
1283         for (i = 0; i < pages; i++, pp++) {
1284                 if (ppa == NULL) {
1285                         page_unlock(pp);
1286                 } else {
1287                         ppa[i] = pp;
1288                         page_downgrade(ppa[i]);
1289                 }
1290         }
1291         if (ppa != NULL) {
1292                 ppa[pages] = NULL;
1293         }
1294         VM_STAT_ADD(page_exphcontg[18]);
1295         ASSERT(vp->v_pages != NULL);
1296         return (1);
1297 }
1298 
1299 /*
1300  * Determine whether a page with the specified [vp, off]
1301  * currently exists in the system and if so return its
1302  * size code. Obviously this should only be considered as
1303  * a hint since nothing prevents the page from disappearing
1304  * or appearing immediately after the return from this routine.
1305  */
1306 int
1307 page_exists_forreal(vnode_t *vp, u_offset_t off, uint_t *szc)
1308 {
1309         page_t          *pp;
1310         kmutex_t        *phm;
1311         ulong_t         index;
1312         int             rc = 0;
1313 
1314         ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
1315         ASSERT(szc != NULL);
1316         VM_STAT_ADD(page_exists_forreal_cnt);
1317 
1318         index = PAGE_HASH_FUNC(vp, off);
1319         phm = PAGE_HASH_MUTEX(index);
1320 
1321         mutex_enter(phm);
1322         PAGE_HASH_SEARCH(index, pp, vp, off);
1323         if (pp != NULL) {
1324                 *szc = pp->p_szc;
1325                 rc = 1;
1326         }
1327         mutex_exit(phm);
1328         return (rc);
1329 }
1330 
1331 /* wakeup threads waiting for pages in page_create_get_something() */
1332 void
1333 wakeup_pcgs(void)
1334 {
1335         if (!CV_HAS_WAITERS(&pcgs_cv))
1336                 return;
1337         cv_broadcast(&pcgs_cv);
1338 }
1339 
1340 /*
1341  * 'freemem' is used all over the kernel as an indication of how many
1342  * pages are free (either on the cache list or on the free page list)
1343  * in the system.  In very few places is a really accurate 'freemem'
1344  * needed.  To avoid contention on the lock protecting the
1345  * single freemem, it was spread out into NCPU buckets.  set_freemem()
1346  * sets freemem to the total of all NCPU buckets.  It is called from
1347  * clock() on each TICK.
1348  */
1349 void
1350 set_freemem()
1351 {
1352         struct pcf      *p;
1353         ulong_t         t;
1354         uint_t          i;
1355 
1356         t = 0;
1357         p = pcf;
1358         for (i = 0;  i < pcf_fanout; i++) {
1359                 t += p->pcf_count;
1360                 p++;
1361         }
1362         freemem = t;
1363 
1364         /*
1365          * Don't worry about grabbing mutex.  It's not that
1366          * critical if we miss a tick or two.  This is
1367          * where we wakeup possible delayers in
1368          * page_create_get_something().
1369          */
1370         wakeup_pcgs();
1371 }
1372 
1373 ulong_t
1374 get_freemem()
1375 {
1376         struct pcf      *p;
1377         ulong_t         t;
1378         uint_t          i;
1379 
1380         t = 0;
1381         p = pcf;
1382         for (i = 0; i < pcf_fanout; i++) {
1383                 t += p->pcf_count;
1384                 p++;
1385         }
1386         /*
1387          * We just calculated it, might as well set it.
1388          */
1389         freemem = t;
1390         return (t);
1391 }
1392 
1393 /*
1394  * Acquire all of the page cache & free (pcf) locks.
1395  */
1396 void
1397 pcf_acquire_all()
1398 {
1399         struct pcf      *p;
1400         uint_t          i;
1401 
1402         p = pcf;
1403         for (i = 0; i < pcf_fanout; i++) {
1404                 mutex_enter(&p->pcf_lock);
1405                 p++;
1406         }
1407 }
1408 
1409 /*
1410  * Release all the pcf_locks.
1411  */
1412 void
1413 pcf_release_all()
1414 {
1415         struct pcf      *p;
1416         uint_t          i;
1417 
1418         p = pcf;
1419         for (i = 0; i < pcf_fanout; i++) {
1420                 mutex_exit(&p->pcf_lock);
1421                 p++;
1422         }
1423 }
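
/*
 * Illustrative sketch only: pcf_example_stable_sum() is a hypothetical
 * helper, not an existing routine.  The two routines above bracket work
 * that needs a stable view of every bucket, e.g. summing the counters
 * while no other thread can change them.
 */
static pgcnt_t
pcf_example_stable_sum(void)
{
        pgcnt_t         t = 0;
        uint_t          i;

        pcf_acquire_all();
        for (i = 0; i < pcf_fanout; i++)
                t += pcf[i].pcf_count;
        pcf_release_all();

        return (t);
}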
1424 
1425 /*
1426  * Inform the VM system that we need some pages freed up.
1427  * Calls must be symmetric, e.g.:
1428  *
1429  *      page_needfree(100);
1430  *      wait a bit;
1431  *      page_needfree(-100);
1432  */
1433 void
1434 page_needfree(spgcnt_t npages)
1435 {
1436         mutex_enter(&new_freemem_lock);
1437         needfree += npages;
1438         mutex_exit(&new_freemem_lock);
1439 }
1440 
1441 /*
1442  * Throttle for page_create(): try to prevent freemem from dropping
1443  * below throttlefree.  We can't provide a 100% guarantee because
1444  * KM_NOSLEEP allocations, page_reclaim(), and various other things
1445  * nibble away at the freelist.  However, we can block all PG_WAIT
1446  * allocations until memory becomes available.  The motivation is
1447  * that several things can fall apart when there's no free memory:
1448  *
1449  * (1) If pageout() needs memory to push a page, the system deadlocks.
1450  *
1451  * (2) By (broken) specification, timeout(9F) can neither fail nor
1452  *     block, so it has no choice but to panic the system if it
1453  *     cannot allocate a callout structure.
1454  *
1455  * (3) Like timeout(), ddi_set_callback() cannot fail and cannot block;
1456  *     it panics if it cannot allocate a callback structure.
1457  *
1458  * (4) Untold numbers of third-party drivers have not yet been hardened
1459  *     against KM_NOSLEEP and/or allocb() failures; they simply assume
1460  *     success and panic the system with a data fault on failure.
1461  *     (The long-term solution to this particular problem is to ship
1462  *     hostile fault-injecting DEBUG kernels with the DDK.)
1463  *
1464  * It is theoretically impossible to guarantee success of non-blocking
1465  * allocations, but in practice, this throttle is very hard to break.
1466  */
1467 static int
1468 page_create_throttle(pgcnt_t npages, int flags)
1469 {
1470         ulong_t fm;
1471         uint_t  i;
1472         pgcnt_t tf;     /* effective value of throttlefree */
1473 
1474         /*
1475          * Normal priority allocations.
1476          */
1477         if ((flags & (PG_WAIT | PG_NORMALPRI)) == PG_NORMALPRI) {
1478                 ASSERT(!(flags & (PG_PANIC | PG_PUSHPAGE)));
1479                 return (freemem >= npages + throttlefree);
1480         }
1481 
1482         /*
1483          * Never deny pages when:
1484          * - it's a thread that cannot block [NOMEMWAIT()]
1485          * - the allocation cannot block and must not fail
1486  * - the allocation cannot block and has the pageout dispensation
1487          */
1488         if (NOMEMWAIT() ||
1489             ((flags & (PG_WAIT | PG_PANIC)) == PG_PANIC) ||
1490             ((flags & (PG_WAIT | PG_PUSHPAGE)) == PG_PUSHPAGE))
1491                 return (1);
1492 
1493         /*
1494          * If the allocation can't block, we look favorably upon it
1495          * unless we're below pageout_reserve.  In that case we fail
1496          * the allocation because we want to make sure there are a few
1497          * pages available for pageout.
1498          */
1499         if ((flags & PG_WAIT) == 0)
1500                 return (freemem >= npages + pageout_reserve);
1501 
1502         /* Calculate the effective throttlefree value */
1503         tf = throttlefree -
1504             ((flags & PG_PUSHPAGE) ? pageout_reserve : 0);
1505 
1506         cv_signal(&proc_pageout->p_cv);
1507 
1508         for (;;) {
1509                 fm = 0;
1510                 pcf_acquire_all();
1511                 mutex_enter(&new_freemem_lock);
1512                 for (i = 0; i < pcf_fanout; i++) {
1513                         fm += pcf[i].pcf_count;
1514                         pcf[i].pcf_wait++;
1515                         mutex_exit(&pcf[i].pcf_lock);
1516                 }
1517                 freemem = fm;
1518                 if (freemem >= npages + tf) {
1519                         mutex_exit(&new_freemem_lock);
1520                         break;
1521                 }
1522                 needfree += npages;
1523                 freemem_wait++;
1524                 cv_wait(&freemem_cv, &new_freemem_lock);
1525                 freemem_wait--;
1526                 needfree -= npages;
1527                 mutex_exit(&new_freemem_lock);
1528         }
1529         return (1);
1530 }
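
/*
 * Illustrative summary (not compiled) of the decisions made above, in
 * the order page_create_throttle() applies them:
 *
 *      PG_NORMALPRI without PG_WAIT:   freemem >= npages + throttlefree
 *      NOMEMWAIT(), or PG_PANIC or
 *      PG_PUSHPAGE without PG_WAIT:    always granted
 *      no PG_WAIT:                     freemem >= npages + pageout_reserve
 *      PG_WAIT:                        block until freemem >= npages + tf,
 *                                      where tf is throttlefree, reduced
 *                                      by pageout_reserve for PG_PUSHPAGE
 */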
1531 
1532 /*
1533  * page_create_wait() is called to either coalesce pages from the
1534  * different pcf buckets or to wait because there simply are not
1535  * enough pages to satisfy the caller's request.
1536  *
1537  * Sadly, this is called from platform/vm/vm_machdep.c
1538  */
1539 int
1540 page_create_wait(pgcnt_t npages, uint_t flags)
1541 {
1542         pgcnt_t         total;
1543         uint_t          i;
1544         struct pcf      *p;
1545 
1546         /*
1547          * Wait until there are enough free pages to satisfy our
1548          * entire request.
1549          * We set needfree += npages before prodding pageout, to make sure
1550          * it does real work when npages > lotsfree > freemem.
1551          */
1552         VM_STAT_ADD(page_create_not_enough);
1553 
1554         ASSERT(!kcage_on ? !(flags & PG_NORELOC) : 1);
1555 checkagain:
1556         if ((flags & PG_NORELOC) &&
1557             kcage_freemem < kcage_throttlefree + npages)
1558                 (void) kcage_create_throttle(npages, flags);
1559 
1560         if (freemem < npages + throttlefree)
1561                 if (!page_create_throttle(npages, flags))
1562                         return (0);
1563 
1564         if (pcf_decrement_bucket(npages) ||
1565             pcf_decrement_multiple(&total, npages, 0))
1566                 return (1);
1567 
1568         /*
1569          * All of the pcf locks are held, there are not enough pages
         * to satisfy the request (npages > total).
1571          * Be sure to acquire the new_freemem_lock before dropping
1572          * the pcf locks.  This prevents dropping wakeups in page_free().
1573          * The order is always pcf_lock then new_freemem_lock.
1574          *
1575          * Since we hold all the pcf locks, it is a good time to set freemem.
1576          *
1577          * If the caller does not want to wait, return now.
1578          * Else turn the pageout daemon loose to find something
1579          * and wait till it does.
1580          *
1581          */
1582         freemem = total;
1583 
1584         if ((flags & PG_WAIT) == 0) {
1585                 pcf_release_all();
1586 
1587                 TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_NOMEM,
                    "page_create_nomem:npages %ld freemem %ld",
                    npages, freemem);
1589                 return (0);
1590         }
1591 
1592         ASSERT(proc_pageout != NULL);
1593         cv_signal(&proc_pageout->p_cv);
1594 
1595         TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SLEEP_START,
1596             "page_create_sleep_start: freemem %ld needfree %ld",
1597             freemem, needfree);
1598 
1599         /*
1600          * We are going to wait.
1601          * We currently hold all of the pcf_locks,
1602          * get the new_freemem_lock (it protects freemem_wait),
1603          * before dropping the pcf_locks.
1604          */
1605         mutex_enter(&new_freemem_lock);
1606 
1607         p = pcf;
1608         for (i = 0; i < pcf_fanout; i++) {
1609                 p->pcf_wait++;
1610                 mutex_exit(&p->pcf_lock);
1611                 p++;
1612         }
1613 
1614         needfree += npages;
1615         freemem_wait++;
1616 
1617         cv_wait(&freemem_cv, &new_freemem_lock);
1618 
1619         freemem_wait--;
1620         needfree -= npages;
1621 
1622         mutex_exit(&new_freemem_lock);
1623 
1624         TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SLEEP_END,
1625             "page_create_sleep_end: freemem %ld needfree %ld",
1626             freemem, needfree);
1627 
1628         VM_STAT_ADD(page_create_not_enough_again);
1629         goto checkagain;
1630 }
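
/*
 * Illustrative sketch (not compiled): callers reserve their pages here
 * before actually pulling them off the freelists, following the pattern
 * used by page_alloc_pages() below:
 *
 *      (void) page_create_wait(npages, PG_WAIT);
 *      pp = page_get_freelist(vp, off, seg, vaddr, PAGESIZE, flags, lgrp);
 *
 * Without PG_WAIT, a zero return from page_create_wait() means the pages
 * could not be accounted for and the caller must back off.
 */
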
1631 /*
1632  * A routine to do the opposite of page_create_wait().
1633  */
1634 void
1635 page_create_putback(spgcnt_t npages)
1636 {
1637         struct pcf      *p;
1638         pgcnt_t         lump;
1639         uint_t          *which;
1640 
1641         /*
1642          * When a contiguous lump is broken up, we have to
         * deal with lots of pages (min 64) so let's spread
1644          * the wealth around.
1645          */
1646         lump = roundup(npages, pcf_fanout) / pcf_fanout;
1647         freemem += npages;
1648 
1649         for (p = pcf; (npages > 0) && (p < &pcf[pcf_fanout]); p++) {
1650                 which = &p->pcf_count;
1651 
1652                 mutex_enter(&p->pcf_lock);
1653 
1654                 if (p->pcf_block) {
1655                         which = &p->pcf_reserve;
1656                 }
1657 
1658                 if (lump < npages) {
1659                         *which += (uint_t)lump;
1660                         npages -= lump;
1661                 } else {
1662                         *which += (uint_t)npages;
1663                         npages = 0;
1664                 }
1665 
1666                 if (p->pcf_wait) {
1667                         mutex_enter(&new_freemem_lock);
1668                         /*
1669                          * Check to see if some other thread
1670                          * is actually waiting.  Another bucket
1671                          * may have woken it up by now.  If there
1672                          * are no waiters, then set our pcf_wait
1673                          * count to zero to avoid coming in here
1674                          * next time.
1675                          */
1676                         if (freemem_wait) {
1677                                 if (npages > 1) {
1678                                         cv_broadcast(&freemem_cv);
1679                                 } else {
1680                                         cv_signal(&freemem_cv);
1681                                 }
1682                                 p->pcf_wait--;
1683                         } else {
1684                                 p->pcf_wait = 0;
1685                         }
1686                         mutex_exit(&new_freemem_lock);
1687                 }
1688                 mutex_exit(&p->pcf_lock);
1689         }
1690         ASSERT(npages == 0);
1691 }
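
/*
 * Illustrative sketch (not compiled): when pages that were accounted for
 * by page_create_wait() cannot all be obtained or used, the unused part
 * of the reservation must be handed back, as page_alloc_pages() below
 * does on failure ("allocation_failed" is only a placeholder):
 *
 *      (void) page_create_wait(npages, PG_WAIT);
 *      ...
 *      if (allocation_failed) {
 *              page_create_putback(npages);
 *              return (ENOMEM);
 *      }
 */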
1692 
1693 /*
1694  * A helper routine for page_create_get_something.
 * The indenting got too deep down there.
1696  * Unblock the pcf counters.  Any pages freed after
1697  * pcf_block got set are moved to pcf_count and
1698  * wakeups (cv_broadcast() or cv_signal()) are done as needed.
1699  */
1700 static void
1701 pcgs_unblock(void)
1702 {
1703         int             i;
1704         struct pcf      *p;
1705 
1706         /* Update freemem while we're here. */
1707         freemem = 0;
1708         p = pcf;
1709         for (i = 0; i < pcf_fanout; i++) {
1710                 mutex_enter(&p->pcf_lock);
1711                 ASSERT(p->pcf_count == 0);
1712                 p->pcf_count = p->pcf_reserve;
1713                 p->pcf_block = 0;
1714                 freemem += p->pcf_count;
1715                 if (p->pcf_wait) {
1716                         mutex_enter(&new_freemem_lock);
1717                         if (freemem_wait) {
1718                                 if (p->pcf_reserve > 1) {
1719                                         cv_broadcast(&freemem_cv);
1720                                         p->pcf_wait = 0;
1721                                 } else {
1722                                         cv_signal(&freemem_cv);
1723                                         p->pcf_wait--;
1724                                 }
1725                         } else {
1726                                 p->pcf_wait = 0;
1727                         }
1728                         mutex_exit(&new_freemem_lock);
1729                 }
1730                 p->pcf_reserve = 0;
1731                 mutex_exit(&p->pcf_lock);
1732                 p++;
1733         }
1734 }
1735 
1736 /*
1737  * Called from page_create_va() when both the cache and free lists
1738  * have been checked once.
1739  *
1740  * Either returns a page or panics since the accounting was done
1741  * way before we got here.
1742  *
1743  * We don't come here often, so leave the accounting on permanently.
1744  */
1745 
1746 #define MAX_PCGS        100
1747 
1748 #ifdef  DEBUG
1749 #define PCGS_TRIES      100
1750 #else   /* DEBUG */
1751 #define PCGS_TRIES      10
1752 #endif  /* DEBUG */
1753 
1754 #ifdef  VM_STATS
1755 uint_t  pcgs_counts[PCGS_TRIES];
1756 uint_t  pcgs_too_many;
1757 uint_t  pcgs_entered;
1758 uint_t  pcgs_entered_noreloc;
1759 uint_t  pcgs_locked;
1760 uint_t  pcgs_cagelocked;
1761 #endif  /* VM_STATS */
1762 
1763 static page_t *
1764 page_create_get_something(vnode_t *vp, u_offset_t off, struct seg *seg,
1765     caddr_t vaddr, uint_t flags)
1766 {
1767         uint_t          count;
1768         page_t          *pp;
1769         uint_t          locked, i;
1770         struct  pcf     *p;
1771         lgrp_t          *lgrp;
1772         int             cagelocked = 0;
1773 
1774         VM_STAT_ADD(pcgs_entered);
1775 
1776         /*
1777          * Tap any reserve freelists: if we fail now, we'll die
1778          * since the page(s) we're looking for have already been
1779          * accounted for.
1780          */
1781         flags |= PG_PANIC;
1782 
1783         if ((flags & PG_NORELOC) != 0) {
1784                 VM_STAT_ADD(pcgs_entered_noreloc);
1785                 /*
1786                  * Requests for free pages from critical threads
1787                  * such as pageout still won't throttle here, but
1788                  * we must try again, to give the cageout thread
1789                  * another chance to catch up. Since we already
1790                  * accounted for the pages, we had better get them
1791                  * this time.
1792                  *
1793                  * N.B. All non-critical threads acquire the pcgs_cagelock
1794                  * to serialize access to the freelists. This implements a
                 * turnstile-type synchronization to avoid starvation of
1796                  * critical requests for PG_NORELOC memory by non-critical
1797                  * threads: all non-critical threads must acquire a 'ticket'
1798                  * before passing through, which entails making sure
1799                  * kcage_freemem won't fall below minfree prior to grabbing
1800                  * pages from the freelists.
1801                  */
1802                 if (kcage_create_throttle(1, flags) == KCT_NONCRIT) {
1803                         mutex_enter(&pcgs_cagelock);
1804                         cagelocked = 1;
1805                         VM_STAT_ADD(pcgs_cagelocked);
1806                 }
1807         }
1808 
1809         /*
1810          * Time to get serious.
1811          * We failed to get a `correctly colored' page from both the
1812          * free and cache lists.
1813          * We escalate in stage.
1814          *
         * First try both lists without worrying about color.
         *
         * Then, grab all page accounting locks (i.e., pcf[]) and
1818          * steal any pages that they have and set the pcf_block flag to
1819          * stop deletions from the lists.  This will help because
1820          * a page can get added to the free list while we are looking
1821          * at the cache list, then another page could be added to the cache
1822          * list allowing the page on the free list to be removed as we
1823          * move from looking at the cache list to the free list. This
1824          * could happen over and over. We would never find the page
1825          * we have accounted for.
1826          *
1827          * Noreloc pages are a subset of the global (relocatable) page pool.
1828          * They are not tracked separately in the pcf bins, so it is
1829          * impossible to know when doing pcf accounting if the available
1830          * page(s) are noreloc pages or not. When looking for a noreloc page
1831          * it is quite easy to end up here even if the global (relocatable)
1832          * page pool has plenty of free pages but the noreloc pool is empty.
1833          *
1834          * When the noreloc pool is empty (or low), additional noreloc pages
1835          * are created by converting pages from the global page pool. This
1836          * process will stall during pcf accounting if the pcf bins are
1837          * already locked. Such is the case when a noreloc allocation is
1838          * looping here in page_create_get_something waiting for more noreloc
1839          * pages to appear.
1840          *
1841          * Short of adding a new field to the pcf bins to accurately track
1842          * the number of free noreloc pages, we instead do not grab the
1843          * pcgs_lock, do not set the pcf blocks and do not timeout when
1844          * allocating a noreloc page. This allows noreloc allocations to
1845          * loop without blocking global page pool allocations.
1846          *
1847          * NOTE: the behaviour of page_create_get_something has not changed
1848          * for the case of global page pool allocations.
1849          */
1850 
1851         flags &= ~PG_MATCH_COLOR;
1852         locked = 0;
1853 #if defined(__i386) || defined(__amd64)
1854         flags = page_create_update_flags_x86(flags);
1855 #endif
1856 
1857         lgrp = lgrp_mem_choose(seg, vaddr, PAGESIZE);
1858 
1859         for (count = 0; kcage_on || count < MAX_PCGS; count++) {
1860                 pp = page_get_freelist(vp, off, seg, vaddr, PAGESIZE,
1861                     flags, lgrp);
1862                 if (pp == NULL) {
1863                         pp = page_get_cachelist(vp, off, seg, vaddr,
1864                             flags, lgrp);
1865                 }
1866                 if (pp == NULL) {
1867                         /*
1868                          * Serialize.  Don't fight with other pcgs().
1869                          */
1870                         if (!locked && (!kcage_on || !(flags & PG_NORELOC))) {
1871                                 mutex_enter(&pcgs_lock);
1872                                 VM_STAT_ADD(pcgs_locked);
1873                                 locked = 1;
1874                                 p = pcf;
1875                                 for (i = 0; i < pcf_fanout; i++) {
1876                                         mutex_enter(&p->pcf_lock);
1877                                         ASSERT(p->pcf_block == 0);
1878                                         p->pcf_block = 1;
1879                                         p->pcf_reserve = p->pcf_count;
1880                                         p->pcf_count = 0;
1881                                         mutex_exit(&p->pcf_lock);
1882                                         p++;
1883                                 }
1884                                 freemem = 0;
1885                         }
1886 
1887                         if (count) {
1888                                 /*
1889                                  * Since page_free() puts pages on
1890                                  * a list then accounts for it, we
1891                                  * just have to wait for page_free()
1892                                  * to unlock any page it was working
1893                                  * with. The page_lock()-page_reclaim()
1894                                  * path falls in the same boat.
1895                                  *
1896                                  * We don't need to check on the
1897                                  * PG_WAIT flag, we have already
1898                                  * accounted for the page we are
1899                                  * looking for in page_create_va().
1900                                  *
1901                                  * We just wait a moment to let any
1902                                  * locked pages on the lists free up,
1903                                  * then continue around and try again.
1904                                  *
1905                                  * Will be awakened by set_freemem().
1906                                  */
1907                                 mutex_enter(&pcgs_wait_lock);
1908                                 cv_wait(&pcgs_cv, &pcgs_wait_lock);
1909                                 mutex_exit(&pcgs_wait_lock);
1910                         }
1911                 } else {
1912 #ifdef VM_STATS
1913                         if (count >= PCGS_TRIES) {
1914                                 VM_STAT_ADD(pcgs_too_many);
1915                         } else {
1916                                 VM_STAT_ADD(pcgs_counts[count]);
1917                         }
1918 #endif
1919                         if (locked) {
1920                                 pcgs_unblock();
1921                                 mutex_exit(&pcgs_lock);
1922                         }
1923                         if (cagelocked)
1924                                 mutex_exit(&pcgs_cagelock);
1925                         return (pp);
1926                 }
1927         }
1928         /*
1929          * we go down holding the pcf locks.
1930          */
1931         panic("no %spage found %d",
1932             ((flags & PG_NORELOC) ? "non-reloc " : ""), count);
1933         /*NOTREACHED*/
1934 }
1935 
1936 /*
1937  * Create enough pages for "bytes" worth of data starting at
1938  * "off" in "vp".
1939  *
1940  *      Where flag must be one of:
1941  *
1942  *              PG_EXCL:        Exclusive create (fail if any page already
1943  *                              exists in the page cache) which does not
1944  *                              wait for memory to become available.
1945  *
1946  *              PG_WAIT:        Non-exclusive create which can wait for
1947  *                              memory to become available.
1948  *
1949  *              PG_PHYSCONTIG:  Allocate physically contiguous pages.
1950  *                              (Not Supported)
1951  *
1952  * A doubly linked list of pages is returned to the caller.  Each page
1953  * on the list has the "exclusive" (p_selock) lock and "iolock" (p_iolock)
1954  * lock.
1955  *
1956  * Unable to change the parameters to page_create() in a minor release,
1957  * we renamed page_create() to page_create_va(), changed all known calls
1958  * from page_create() to page_create_va(), and created this wrapper.
1959  *
1960  * Upon a major release, we should break compatibility by deleting this
1961  * wrapper, and replacing all the strings "page_create_va", with "page_create".
1962  *
1963  * NOTE: There is a copy of this interface as page_create_io() in
1964  *       i86/vm/vm_machdep.c. Any bugs fixed here should be applied
1965  *       there.
1966  */
1967 page_t *
1968 page_create(vnode_t *vp, u_offset_t off, size_t bytes, uint_t flags)
1969 {
1970         caddr_t random_vaddr;
1971         struct seg kseg;
1972 
1973 #ifdef DEBUG
1974         cmn_err(CE_WARN, "Using deprecated interface page_create: caller %p",
1975             (void *)caller());
1976 #endif
1977 
1978         random_vaddr = (caddr_t)(((uintptr_t)vp >> 7) ^
1979             (uintptr_t)(off >> PAGESHIFT));
1980         kseg.s_as = &kas;
1981 
1982         return (page_create_va(vp, off, bytes, flags, &kseg, random_vaddr));
1983 }
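
/*
 * Illustrative sketch (not compiled): new callers should use
 * page_create_va() directly and pass the segment and virtual address the
 * pages will be mapped at, so lgroup placement can be done properly:
 *
 *      pp = page_create_va(vp, off, len, PG_WAIT | PG_EXCL, seg, addr);
 */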
1984 
1985 #ifdef DEBUG
1986 uint32_t pg_alloc_pgs_mtbf = 0;
1987 #endif
1988 
1989 /*
1990  * Used for large page support. It will attempt to allocate
1991  * a large page(s) off the freelist.
1992  *
1993  * Returns non zero on failure.
1994  */
1995 int
1996 page_alloc_pages(struct vnode *vp, struct seg *seg, caddr_t addr,
1997     page_t **basepp, page_t *ppa[], uint_t szc, int anypgsz, int pgflags)
1998 {
1999         pgcnt_t         npgs, curnpgs, totpgs;
2000         size_t          pgsz;
2001         page_t          *pplist = NULL, *pp;
2002         int             err = 0;
2003         lgrp_t          *lgrp;
2004 
2005         ASSERT(szc != 0 && szc <= (page_num_pagesizes() - 1));
2006         ASSERT(pgflags == 0 || pgflags == PG_LOCAL);
2007 
2008         /*
2009          * Check if system heavily prefers local large pages over remote
2010          * on systems with multiple lgroups.
2011          */
2012         if (lpg_alloc_prefer == LPAP_LOCAL && nlgrps > 1) {
2013                 pgflags = PG_LOCAL;
2014         }
2015 
2016         VM_STAT_ADD(alloc_pages[0]);
2017 
2018 #ifdef DEBUG
2019         if (pg_alloc_pgs_mtbf && !(gethrtime() % pg_alloc_pgs_mtbf)) {
2020                 return (ENOMEM);
2021         }
2022 #endif
2023 
2024         /*
         * Exactly one of basepp and ppa must be non-NULL;
         * the other must be NULL.
2027          */
2028         ASSERT(basepp != NULL || ppa != NULL);
2029         ASSERT(basepp == NULL || ppa == NULL);
2030 
2031 #if defined(__i386) || defined(__amd64)
2032         while (page_chk_freelist(szc) == 0) {
2033                 VM_STAT_ADD(alloc_pages[8]);
2034                 if (anypgsz == 0 || --szc == 0)
2035                         return (ENOMEM);
2036         }
2037 #endif
2038 
2039         pgsz = page_get_pagesize(szc);
2040         totpgs = curnpgs = npgs = pgsz >> PAGESHIFT;
2041 
2042         ASSERT(((uintptr_t)addr & (pgsz - 1)) == 0);
2043 
2044         (void) page_create_wait(npgs, PG_WAIT);
2045 
2046         while (npgs && szc) {
2047                 lgrp = lgrp_mem_choose(seg, addr, pgsz);
2048                 if (pgflags == PG_LOCAL) {
2049                         pp = page_get_freelist(vp, 0, seg, addr, pgsz,
2050                             pgflags, lgrp);
2051                         if (pp == NULL) {
2052                                 pp = page_get_freelist(vp, 0, seg, addr, pgsz,
2053                                     0, lgrp);
2054                         }
2055                 } else {
2056                         pp = page_get_freelist(vp, 0, seg, addr, pgsz,
2057                             0, lgrp);
2058                 }
2059                 if (pp != NULL) {
2060                         VM_STAT_ADD(alloc_pages[1]);
2061                         page_list_concat(&pplist, &pp);
2062                         ASSERT(npgs >= curnpgs);
2063                         npgs -= curnpgs;
2064                 } else if (anypgsz) {
2065                         VM_STAT_ADD(alloc_pages[2]);
2066                         szc--;
2067                         pgsz = page_get_pagesize(szc);
2068                         curnpgs = pgsz >> PAGESHIFT;
2069                 } else {
2070                         VM_STAT_ADD(alloc_pages[3]);
2071                         ASSERT(npgs == totpgs);
2072                         page_create_putback(npgs);
2073                         return (ENOMEM);
2074                 }
2075         }
2076         if (szc == 0) {
2077                 VM_STAT_ADD(alloc_pages[4]);
2078                 ASSERT(npgs != 0);
2079                 page_create_putback(npgs);
2080                 err = ENOMEM;
2081         } else if (basepp != NULL) {
2082                 ASSERT(npgs == 0);
2083                 ASSERT(ppa == NULL);
2084                 *basepp = pplist;
2085         }
2086 
2087         npgs = totpgs - npgs;
2088         pp = pplist;
2089 
2090         /*
2091          * Clear the free and age bits. Also if we were passed in a ppa then
2092          * fill it in with all the constituent pages from the large page. But
2093          * if we failed to allocate all the pages just free what we got.
2094          */
2095         while (npgs != 0) {
2096                 ASSERT(PP_ISFREE(pp));
2097                 ASSERT(PP_ISAGED(pp));
2098                 if (ppa != NULL || err != 0) {
2099                         if (err == 0) {
2100                                 VM_STAT_ADD(alloc_pages[5]);
2101                                 PP_CLRFREE(pp);
2102                                 PP_CLRAGED(pp);
2103                                 page_sub(&pplist, pp);
2104                                 *ppa++ = pp;
2105                                 npgs--;
2106                         } else {
2107                                 VM_STAT_ADD(alloc_pages[6]);
2108                                 ASSERT(pp->p_szc != 0);
2109                                 curnpgs = page_get_pagecnt(pp->p_szc);
2110                                 page_list_break(&pp, &pplist, curnpgs);
2111                                 page_list_add_pages(pp, 0);
2112                                 page_create_putback(curnpgs);
2113                                 ASSERT(npgs >= curnpgs);
2114                                 npgs -= curnpgs;
2115                         }
2116                         pp = pplist;
2117                 } else {
2118                         VM_STAT_ADD(alloc_pages[7]);
2119                         PP_CLRFREE(pp);
2120                         PP_CLRAGED(pp);
2121                         pp = pp->p_next;
2122                         npgs--;
2123                 }
2124         }
2125         return (err);
2126 }
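
/*
 * Illustrative sketch (not compiled): filling a caller-supplied array
 * with the constituent pages of one large page of size code szc (the
 * array bookkeeping here is only a placeholder):
 *
 *      pgcnt_t npgs = page_get_pagecnt(szc);
 *      page_t  **ppa = kmem_zalloc(npgs * sizeof (page_t *), KM_SLEEP);
 *
 *      if (page_alloc_pages(vp, seg, addr, NULL, ppa, szc, 0, 0) != 0) {
 *              kmem_free(ppa, npgs * sizeof (page_t *));
 *              return (ENOMEM);
 *      }
 */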
2127 
2128 /*
2129  * Get a single large page off of the freelists, and set it up for use.
2130  * Number of bytes requested must be a supported page size.
2131  *
2132  * Note that this call may fail even if there is sufficient
2133  * memory available or PG_WAIT is set, so the caller must
2134  * be willing to fallback on page_create_va(), block and retry,
2135  * or fail the requester.
2136  */
2137 page_t *
2138 page_create_va_large(vnode_t *vp, u_offset_t off, size_t bytes, uint_t flags,
2139     struct seg *seg, caddr_t vaddr, void *arg)
2140 {
2141         pgcnt_t         npages;
2142         page_t          *pp;
2143         page_t          *rootpp;
2144         lgrp_t          *lgrp;
2145         lgrp_id_t       *lgrpid = (lgrp_id_t *)arg;
2146 
2147         ASSERT(vp != NULL);
2148 
2149         ASSERT((flags & ~(PG_EXCL | PG_WAIT |
2150             PG_NORELOC | PG_PANIC | PG_PUSHPAGE | PG_NORMALPRI)) == 0);
2151         /* but no others */
2152 
2153         ASSERT((flags & PG_EXCL) == PG_EXCL);
2154 
2155         npages = btop(bytes);
2156 
2157         if (!kcage_on || panicstr) {
2158                 /*
2159                  * Cage is OFF, or we are single threaded in
2160                  * panic, so make everything a RELOC request.
2161                  */
2162                 flags &= ~PG_NORELOC;
2163         }
2164 
2165         /*
2166          * Make sure there's adequate physical memory available.
2167          * Note: PG_WAIT is ignored here.
2168          */
2169         if (freemem <= throttlefree + npages) {
2170                 VM_STAT_ADD(page_create_large_cnt[1]);
2171                 return (NULL);
2172         }
2173 
2174         /*
2175          * If cage is on, dampen draw from cage when available
2176          * cage space is low.
2177          */
2178         if ((flags & (PG_NORELOC | PG_WAIT)) ==  (PG_NORELOC | PG_WAIT) &&
2179             kcage_freemem < kcage_throttlefree + npages) {
2180 
2181                 /*
2182                  * The cage is on, the caller wants PG_NORELOC
2183                  * pages and available cage memory is very low.
2184                  * Call kcage_create_throttle() to attempt to
2185                  * control demand on the cage.
2186                  */
2187                 if (kcage_create_throttle(npages, flags) == KCT_FAILURE) {
2188                         VM_STAT_ADD(page_create_large_cnt[2]);
2189                         return (NULL);
2190                 }
2191         }
2192 
2193         if (!pcf_decrement_bucket(npages) &&
2194             !pcf_decrement_multiple(NULL, npages, 1)) {
2195                 VM_STAT_ADD(page_create_large_cnt[4]);
2196                 return (NULL);
2197         }
2198 
2199         /*
2200          * This is where this function behaves fundamentally differently
2201          * than page_create_va(); since we're intending to map the page
2202          * with a single TTE, we have to get it as a physically contiguous
2203          * hardware pagesize chunk.  If we can't, we fail.
2204          */
2205         if (lgrpid != NULL && *lgrpid >= 0 && *lgrpid <= lgrp_alloc_max &&
2206             LGRP_EXISTS(lgrp_table[*lgrpid]))
2207                 lgrp = lgrp_table[*lgrpid];
2208         else
2209                 lgrp = lgrp_mem_choose(seg, vaddr, bytes);
2210 
2211         if ((rootpp = page_get_freelist(&kvp, off, seg, vaddr,
2212             bytes, flags & ~PG_MATCH_COLOR, lgrp)) == NULL) {
2213                 page_create_putback(npages);
2214                 VM_STAT_ADD(page_create_large_cnt[5]);
2215                 return (NULL);
2216         }
2217 
2218         /*
         * If we got the page with the wrong mtype, give it back; this is a
         * workaround for CR 6249718. Once CR 6249718 is fixed we will never
         * get inside the "if" and the workaround becomes just a nop.
2222          */
2223         if (kcage_on && (flags & PG_NORELOC) && !PP_ISNORELOC(rootpp)) {
2224                 page_list_add_pages(rootpp, 0);
2225                 page_create_putback(npages);
2226                 VM_STAT_ADD(page_create_large_cnt[6]);
2227                 return (NULL);
2228         }
2229 
2230         /*
2231          * If satisfying this request has left us with too little
2232          * memory, start the wheels turning to get some back.  The
2233          * first clause of the test prevents waking up the pageout
2234          * daemon in situations where it would decide that there's
2235          * nothing to do.
2236          */
2237         if (nscan < desscan && freemem < minfree) {
2238                 TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
2239                     "pageout_cv_signal:freemem %ld", freemem);
2240                 cv_signal(&proc_pageout->p_cv);
2241         }
2242 
2243         pp = rootpp;
2244         while (npages--) {
2245                 ASSERT(PAGE_EXCL(pp));
2246                 ASSERT(pp->p_vnode == NULL);
2247                 ASSERT(!hat_page_is_mapped(pp));
2248                 PP_CLRFREE(pp);
2249                 PP_CLRAGED(pp);
2250                 if (!page_hashin(pp, vp, off, NULL))
2251                         panic("page_create_large: hashin failed: page %p",
2252                             (void *)pp);
2253                 page_io_lock(pp);
2254                 off += PAGESIZE;
2255                 pp = pp->p_next;
2256         }
2257 
2258         VM_STAT_ADD(page_create_large_cnt[0]);
2259         return (rootpp);
2260 }
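
/*
 * Illustrative sketch (not compiled): as the comment above notes, a
 * failure here is not necessarily fatal; a caller prepared to use base
 * pages can simply fall back to page_create_va():
 *
 *      pp = page_create_va_large(vp, off, pgsz, PG_EXCL | PG_WAIT,
 *          seg, addr, NULL);
 *      if (pp == NULL)
 *              pp = page_create_va(vp, off, pgsz, PG_EXCL | PG_WAIT,
 *                  seg, addr);
 */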
2261 
2262 page_t *
2263 page_create_va(vnode_t *vp, u_offset_t off, size_t bytes, uint_t flags,
2264     struct seg *seg, caddr_t vaddr)
2265 {
2266         page_t          *plist = NULL;
2267         pgcnt_t         npages;
2268         pgcnt_t         found_on_free = 0;
2269         pgcnt_t         pages_req;
2270         page_t          *npp = NULL;
2271         struct pcf      *p;
2272         lgrp_t          *lgrp;
2273 
2274         TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_START,
2275             "page_create_start:vp %p off %llx bytes %lu flags %x",
2276             vp, off, bytes, flags);
2277 
2278         ASSERT(bytes != 0 && vp != NULL);
2279 
2280         if ((flags & PG_EXCL) == 0 && (flags & PG_WAIT) == 0) {
2281                 panic("page_create: invalid flags");
2282                 /*NOTREACHED*/
2283         }
2284         ASSERT((flags & ~(PG_EXCL | PG_WAIT |
2285             PG_NORELOC | PG_PANIC | PG_PUSHPAGE | PG_NORMALPRI)) == 0);
2286             /* but no others */
2287 
2288         pages_req = npages = btopr(bytes);
2289         /*
2290          * Try to see whether request is too large to *ever* be
2291          * satisfied, in order to prevent deadlock.  We arbitrarily
2292          * decide to limit maximum size requests to max_page_get.
2293          */
2294         if (npages >= max_page_get) {
2295                 if ((flags & PG_WAIT) == 0) {
2296                         TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_TOOBIG,
2297                             "page_create_toobig:vp %p off %llx npages "
2298                             "%lu max_page_get %lu",
2299                             vp, off, npages, max_page_get);
2300                         return (NULL);
2301                 } else {
2302                         cmn_err(CE_WARN,
2303                             "Request for too much kernel memory "
2304                             "(%lu bytes), will hang forever", bytes);
2305                         for (;;)
2306                                 delay(1000000000);
2307                 }
2308         }
2309 
2310         if (!kcage_on || panicstr) {
2311                 /*
2312                  * Cage is OFF, or we are single threaded in
2313                  * panic, so make everything a RELOC request.
2314                  */
2315                 flags &= ~PG_NORELOC;
2316         }
2317 
2318         if (freemem <= throttlefree + npages)
2319                 if (!page_create_throttle(npages, flags))
2320                         return (NULL);
2321 
2322         /*
2323          * If cage is on, dampen draw from cage when available
2324          * cage space is low.
2325          */
2326         if ((flags & PG_NORELOC) &&
2327             kcage_freemem < kcage_throttlefree + npages) {
2328 
2329                 /*
2330                  * The cage is on, the caller wants PG_NORELOC
2331                  * pages and available cage memory is very low.
2332                  * Call kcage_create_throttle() to attempt to
2333                  * control demand on the cage.
2334                  */
2335                 if (kcage_create_throttle(npages, flags) == KCT_FAILURE)
2336                         return (NULL);
2337         }
2338 
2339         VM_STAT_ADD(page_create_cnt[0]);
2340 
2341         if (!pcf_decrement_bucket(npages)) {
2342                 /*
2343                  * Have to look harder.  If npages is greater than
2344                  * one, then we might have to coalesce the counters.
2345                  *
2346                  * Go wait.  We come back having accounted
2347                  * for the memory.
2348                  */
2349                 VM_STAT_ADD(page_create_cnt[1]);
2350                 if (!page_create_wait(npages, flags)) {
2351                         VM_STAT_ADD(page_create_cnt[2]);
2352                         return (NULL);
2353                 }
2354         }
2355 
2356         TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SUCCESS,
2357             "page_create_success:vp %p off %llx", vp, off);
2358 
2359         /*
2360          * If satisfying this request has left us with too little
2361          * memory, start the wheels turning to get some back.  The
2362          * first clause of the test prevents waking up the pageout
2363          * daemon in situations where it would decide that there's
2364          * nothing to do.
2365          */
2366         if (nscan < desscan && freemem < minfree) {
2367                 TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
2368                     "pageout_cv_signal:freemem %ld", freemem);
2369                 cv_signal(&proc_pageout->p_cv);
2370         }
2371 
2372         /*
2373          * Loop around collecting the requested number of pages.
2374          * Most of the time, we have to `create' a new page. With
2375          * this in mind, pull the page off the free list before
2376          * getting the hash lock.  This will minimize the hash
2377          * lock hold time, nesting, and the like.  If it turns
2378          * out we don't need the page, we put it back at the end.
2379          */
2380         while (npages--) {
2381                 page_t          *pp;
2382                 kmutex_t        *phm = NULL;
2383                 ulong_t         index;
2384 
2385                 index = PAGE_HASH_FUNC(vp, off);
2386 top:
2387                 ASSERT(phm == NULL);
2388                 ASSERT(index == PAGE_HASH_FUNC(vp, off));
2389                 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
2390 
2391                 if (npp == NULL) {
2392                         /*
2393                          * Try to get a page from the freelist (ie,
2394                          * a page with no [vp, off] tag).  If that
2395                          * fails, use the cachelist.
2396                          *
2397                          * During the first attempt at both the free
2398                          * and cache lists we try for the correct color.
2399                          */
2400                         /*
2401                          * XXXX-how do we deal with virtual indexed
                         * caches and colors?
2403                          */
2404                         VM_STAT_ADD(page_create_cnt[4]);
2405                         /*
2406                          * Get lgroup to allocate next page of shared memory
2407                          * from and use it to specify where to allocate
2408                          * the physical memory
2409                          */
2410                         lgrp = lgrp_mem_choose(seg, vaddr, PAGESIZE);
2411                         npp = page_get_freelist(vp, off, seg, vaddr, PAGESIZE,
2412                             flags | PG_MATCH_COLOR, lgrp);
2413                         if (npp == NULL) {
2414                                 npp = page_get_cachelist(vp, off, seg,
2415                                     vaddr, flags | PG_MATCH_COLOR, lgrp);
2416                                 if (npp == NULL) {
2417                                         npp = page_create_get_something(vp,
2418                                             off, seg, vaddr,
2419                                             flags & ~PG_MATCH_COLOR);
2420                                 }
2421 
2422                                 if (PP_ISAGED(npp) == 0) {
2423                                         /*
2424                                          * Since this page came from the
2425                                          * cachelist, we must destroy the
2426                                          * old vnode association.
2427                                          */
2428                                         page_hashout(npp, NULL);
2429                                 }
2430                         }
2431                 }
2432 
2433                 /*
2434                  * We own this page!
2435                  */
2436                 ASSERT(PAGE_EXCL(npp));
2437                 ASSERT(npp->p_vnode == NULL);
2438                 ASSERT(!hat_page_is_mapped(npp));
2439                 PP_CLRFREE(npp);
2440                 PP_CLRAGED(npp);
2441 
2442                 /*
                 * Here we have a page in our hot little mitts and are
2444                  * just waiting to stuff it on the appropriate lists.
2445                  * Get the mutex and check to see if it really does
2446                  * not exist.
2447                  */
2448                 phm = PAGE_HASH_MUTEX(index);
2449                 mutex_enter(phm);
2450                 PAGE_HASH_SEARCH(index, pp, vp, off);
2451                 if (pp == NULL) {
2452                         VM_STAT_ADD(page_create_new);
2453                         pp = npp;
2454                         npp = NULL;
2455                         if (!page_hashin(pp, vp, off, phm)) {
2456                                 /*
2457                                  * Since we hold the page hash mutex and
2458                                  * just searched for this page, page_hashin
2459                                  * had better not fail.  If it does, that
                                 * means some thread did not follow the
2461                                  * page hash mutex rules.  Panic now and
2462                                  * get it over with.  As usual, go down
2463                                  * holding all the locks.
2464                                  */
2465                                 ASSERT(MUTEX_HELD(phm));
2466                                 panic("page_create: "
2467                                     "hashin failed %p %p %llx %p",
2468                                     (void *)pp, (void *)vp, off, (void *)phm);
2469                                 /*NOTREACHED*/
2470                         }
2471                         ASSERT(MUTEX_HELD(phm));
2472                         mutex_exit(phm);
2473                         phm = NULL;
2474 
2475                         /*
2476                          * Hat layer locking need not be done to set
2477                          * the following bits since the page is not hashed
2478                          * and was on the free list (i.e., had no mappings).
2479                          *
2480                          * Set the reference bit to protect
2481                          * against immediate pageout
2482                          *
2483                          * XXXmh modify freelist code to set reference
2484                          * bit so we don't have to do it here.
2485                          */
2486                         page_set_props(pp, P_REF);
2487                         found_on_free++;
2488                 } else {
2489                         VM_STAT_ADD(page_create_exists);
2490                         if (flags & PG_EXCL) {
2491                                 /*
2492                                  * Found an existing page, and the caller
2493                                  * wanted all new pages.  Undo all of the work
2494                                  * we have done.
2495                                  */
2496                                 mutex_exit(phm);
2497                                 phm = NULL;
2498                                 while (plist != NULL) {
2499                                         pp = plist;
2500                                         page_sub(&plist, pp);
2501                                         page_io_unlock(pp);
2502                                         /* large pages should not end up here */
2503                                         ASSERT(pp->p_szc == 0);
2504                                         /*LINTED: constant in conditional ctx*/
2505                                         VN_DISPOSE(pp, B_INVAL, 0, kcred);
2506                                 }
2507                                 VM_STAT_ADD(page_create_found_one);
2508                                 goto fail;
2509                         }
2510                         ASSERT(flags & PG_WAIT);
2511                         if (!page_lock(pp, SE_EXCL, phm, P_NO_RECLAIM)) {
2512                                 /*
2513                                  * Start all over again if we blocked trying
2514                                  * to lock the page.
2515                                  */
2516                                 mutex_exit(phm);
2517                                 VM_STAT_ADD(page_create_page_lock_failed);
2518                                 phm = NULL;
2519                                 goto top;
2520                         }
2521                         mutex_exit(phm);
2522                         phm = NULL;
2523 
2524                         if (PP_ISFREE(pp)) {
2525                                 ASSERT(PP_ISAGED(pp) == 0);
2526                                 VM_STAT_ADD(pagecnt.pc_get_cache);
2527                                 page_list_sub(pp, PG_CACHE_LIST);
2528                                 PP_CLRFREE(pp);
2529                                 found_on_free++;
2530                         }
2531                 }
2532 
2533                 /*
2534                  * Got a page!  It is locked.  Acquire the i/o
2535                  * lock since we are going to use the p_next and
2536                  * p_prev fields to link the requested pages together.
2537                  */
2538                 page_io_lock(pp);
2539                 page_add(&plist, pp);
2540                 plist = plist->p_next;
2541                 off += PAGESIZE;
2542                 vaddr += PAGESIZE;
2543         }
2544 
2545         ASSERT((flags & PG_EXCL) ? (found_on_free == pages_req) : 1);
2546 fail:
2547         if (npp != NULL) {
2548                 /*
2549                  * Did not need this page after all.
2550                  * Put it back on the free list.
2551                  */
2552                 VM_STAT_ADD(page_create_putbacks);
2553                 PP_SETFREE(npp);
2554                 PP_SETAGED(npp);
2555                 npp->p_offset = (u_offset_t)-1;
2556                 page_list_add(npp, PG_FREE_LIST | PG_LIST_TAIL);
2557                 page_unlock(npp);
2558 
2559         }
2560 
2561         ASSERT(pages_req >= found_on_free);
2562 
2563         {
2564                 uint_t overshoot = (uint_t)(pages_req - found_on_free);
2565 
2566                 if (overshoot) {
2567                         VM_STAT_ADD(page_create_overshoot);
2568                         p = &pcf[PCF_INDEX()];
2569                         mutex_enter(&p->pcf_lock);
2570                         if (p->pcf_block) {
2571                                 p->pcf_reserve += overshoot;
2572                         } else {
2573                                 p->pcf_count += overshoot;
2574                                 if (p->pcf_wait) {
2575                                         mutex_enter(&new_freemem_lock);
2576                                         if (freemem_wait) {
2577                                                 cv_signal(&freemem_cv);
2578                                                 p->pcf_wait--;
2579                                         } else {
2580                                                 p->pcf_wait = 0;
2581                                         }
2582                                         mutex_exit(&new_freemem_lock);
2583                                 }
2584                         }
2585                         mutex_exit(&p->pcf_lock);
2586                         /* freemem is approximate, so this test OK */
2587                         if (!p->pcf_block)
2588                                 freemem += overshoot;
2589                 }
2590         }
2591 
2592         return (plist);
2593 }
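
/*
 * Illustrative sketch (not compiled): every page on the list returned by
 * page_create_va() is held "exclusive" and i/o locked, so a typical
 * consumer detaches the pages one at a time and drops both locks as it
 * finishes with each one:
 *
 *      plist = page_create_va(vp, off, len, PG_WAIT | PG_EXCL, seg, addr);
 *      while (plist != NULL) {
 *              page_t *pp = plist;
 *
 *              page_sub(&plist, pp);
 *              pagezero(pp, 0, PAGESIZE);
 *              page_io_unlock(pp);
 *              page_unlock(pp);
 *      }
 */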
2594 
2595 /*
 * One or more constituent pages of this large page have been marked
 * toxic. Simply demote the large page to PAGESIZE pages and let
 * page_free() handle it. This routine should only be called by
 * large page free routines (page_free_pages() and page_destroy_pages()).
2600  * All pages are locked SE_EXCL and have already been marked free.
2601  */
2602 static void
2603 page_free_toxic_pages(page_t *rootpp)
2604 {
2605         page_t  *tpp;
2606         pgcnt_t i, pgcnt = page_get_pagecnt(rootpp->p_szc);
2607         uint_t  szc = rootpp->p_szc;
2608 
2609         for (i = 0, tpp = rootpp; i < pgcnt; i++, tpp = tpp->p_next) {
2610                 ASSERT(tpp->p_szc == szc);
2611                 ASSERT((PAGE_EXCL(tpp) &&
2612                     !page_iolock_assert(tpp)) || panicstr);
2613                 tpp->p_szc = 0;
2614         }
2615 
2616         while (rootpp != NULL) {
2617                 tpp = rootpp;
2618                 page_sub(&rootpp, tpp);
2619                 ASSERT(PP_ISFREE(tpp));
2620                 PP_CLRFREE(tpp);
2621                 page_free(tpp, 1);
2622         }
2623 }
2624 
2625 /*
2626  * Put page on the "free" list.
2627  * The free list is really two lists maintained by
2628  * the PSM of whatever machine we happen to be on.
2629  */
2630 void
2631 page_free(page_t *pp, int dontneed)
2632 {
2633         struct pcf      *p;
2634         uint_t          pcf_index;
2635 
2636         ASSERT((PAGE_EXCL(pp) &&
2637             !page_iolock_assert(pp)) || panicstr);
2638 
2639         if (PP_ISFREE(pp)) {
2640                 panic("page_free: page %p is free", (void *)pp);
2641         }
2642 
2643         if (pp->p_szc != 0) {
2644                 if (pp->p_vnode == NULL || IS_SWAPFSVP(pp->p_vnode) ||
2645                     PP_ISKAS(pp)) {
2646                         panic("page_free: anon or kernel "
2647                             "or no vnode large page %p", (void *)pp);
2648                 }
2649                 page_demote_vp_pages(pp);
2650                 ASSERT(pp->p_szc == 0);
2651         }
2652 
2653         /*
2654          * The page_struct_lock need not be acquired to examine these
2655          * fields since the page has an "exclusive" lock.
2656          */
2657         if (hat_page_is_mapped(pp) || pp->p_lckcnt != 0 || pp->p_cowcnt != 0 ||
2658             pp->p_slckcnt != 0) {
2659                 panic("page_free pp=%p, pfn=%lx, lckcnt=%d, cowcnt=%d "
2660                     "slckcnt = %d", (void *)pp, page_pptonum(pp), pp->p_lckcnt,
2661                     pp->p_cowcnt, pp->p_slckcnt);
2662                 /*NOTREACHED*/
2663         }
2664 
2665         ASSERT(!hat_page_getshare(pp));
2666 
2667         PP_SETFREE(pp);
2668         ASSERT(pp->p_vnode == NULL || !IS_VMODSORT(pp->p_vnode) ||
2669             !hat_ismod(pp));
2670         page_clr_all_props(pp);
2671         ASSERT(!hat_page_getshare(pp));
2672 
2673         /*
2674          * Now we add the page to the head of the free list.
2675          * But if this page is associated with a paged vnode
2676          * then we adjust the head forward so that the page is
2677          * effectively at the end of the list.
2678          */
2679         if (pp->p_vnode == NULL) {
2680                 /*
2681                  * Page has no identity, put it on the free list.
2682                  */
2683                 PP_SETAGED(pp);
2684                 pp->p_offset = (u_offset_t)-1;
2685                 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
2686                 VM_STAT_ADD(pagecnt.pc_free_free);
2687                 TRACE_1(TR_FAC_VM, TR_PAGE_FREE_FREE,
2688                     "page_free_free:pp %p", pp);
2689         } else {
2690                 PP_CLRAGED(pp);
2691 
2692                 if (!dontneed) {
2693                         /* move it to the tail of the list */
2694                         page_list_add(pp, PG_CACHE_LIST | PG_LIST_TAIL);
2695 
2696                         VM_STAT_ADD(pagecnt.pc_free_cache);
2697                         TRACE_1(TR_FAC_VM, TR_PAGE_FREE_CACHE_TAIL,
2698                             "page_free_cache_tail:pp %p", pp);
2699                 } else {
2700                         page_list_add(pp, PG_CACHE_LIST | PG_LIST_HEAD);
2701 
2702                         VM_STAT_ADD(pagecnt.pc_free_dontneed);
2703                         TRACE_1(TR_FAC_VM, TR_PAGE_FREE_CACHE_HEAD,
2704                             "page_free_cache_head:pp %p", pp);
2705                 }
2706         }
2707         page_unlock(pp);
2708 
2709         /*
2710          * Now do the `freemem' accounting.
2711          */
2712         pcf_index = PCF_INDEX();
2713         p = &pcf[pcf_index];
2714 
2715         mutex_enter(&p->pcf_lock);
2716         if (p->pcf_block) {
2717                 p->pcf_reserve += 1;
2718         } else {
2719                 p->pcf_count += 1;
2720                 if (p->pcf_wait) {
2721                         mutex_enter(&new_freemem_lock);
2722                         /*
2723                          * Check to see if some other thread
2724                          * is actually waiting.  Another bucket
2725                          * may have woken it up by now.  If there
2726                          * are no waiters, then set our pcf_wait
2727                          * count to zero to avoid coming in here
2728                          * next time.  Also, since only one page
2729                          * was put on the free list, just wake
2730                          * up one waiter.
2731                          */
2732                         if (freemem_wait) {
2733                                 cv_signal(&freemem_cv);
2734                                 p->pcf_wait--;
2735                         } else {
2736                                 p->pcf_wait = 0;
2737                         }
2738                         mutex_exit(&new_freemem_lock);
2739                 }
2740         }
2741         mutex_exit(&p->pcf_lock);
2742 
2743         /* freemem is approximate, so this test OK */
2744         if (!p->pcf_block)
2745                 freemem += 1;
2746 }
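
/*
 * Illustrative sketch (not compiled): page_free() expects a page that is
 * held "exclusive", not i/o locked, unmapped and not otherwise held; it
 * drops the page lock itself, so the caller must not reference pp after
 * the call:
 *
 *      ASSERT(PAGE_EXCL(pp) && !page_iolock_assert(pp));
 *      (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
 *      page_free(pp, 0);
 */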
2747 
2748 /*
 * Put page on the "free" list during initial startup.
2750  * This happens during initial single threaded execution.
2751  */
2752 void
2753 page_free_at_startup(page_t *pp)
2754 {
2755         struct pcf      *p;
2756         uint_t          pcf_index;
2757 
2758         page_list_add(pp, PG_FREE_LIST | PG_LIST_HEAD | PG_LIST_ISINIT);
2759         VM_STAT_ADD(pagecnt.pc_free_free);
2760 
2761         /*
2762          * Now do the `freemem' accounting.
2763          */
2764         pcf_index = PCF_INDEX();
2765         p = &pcf[pcf_index];
2766 
2767         ASSERT(p->pcf_block == 0);
2768         ASSERT(p->pcf_wait == 0);
2769         p->pcf_count += 1;
2770 
2771         /* freemem is approximate, so this is OK */
2772         freemem += 1;
2773 }
2774 
2775 void
2776 page_free_pages(page_t *pp)
2777 {
2778         page_t  *tpp, *rootpp = NULL;
2779         pgcnt_t pgcnt = page_get_pagecnt(pp->p_szc);
2780         pgcnt_t i;
2781         uint_t  szc = pp->p_szc;
2782 
2783         VM_STAT_ADD(pagecnt.pc_free_pages);
2784         TRACE_1(TR_FAC_VM, TR_PAGE_FREE_FREE,
2785             "page_free_free:pp %p", pp);
2786 
2787         ASSERT(pp->p_szc != 0 && pp->p_szc < page_num_pagesizes());
2788         if ((page_pptonum(pp) & (pgcnt - 1)) != 0) {
2789                 panic("page_free_pages: not root page %p", (void *)pp);
2790                 /*NOTREACHED*/
2791         }
2792 
2793         for (i = 0, tpp = pp; i < pgcnt; i++, tpp++) {
2794                 ASSERT((PAGE_EXCL(tpp) &&
2795                     !page_iolock_assert(tpp)) || panicstr);
2796                 if (PP_ISFREE(tpp)) {
2797                         panic("page_free_pages: page %p is free", (void *)tpp);
2798                         /*NOTREACHED*/
2799                 }
2800                 if (hat_page_is_mapped(tpp) || tpp->p_lckcnt != 0 ||
2801                     tpp->p_cowcnt != 0 || tpp->p_slckcnt != 0) {
2802                         panic("page_free_pages %p", (void *)tpp);
2803                         /*NOTREACHED*/
2804                 }
2805 
2806                 ASSERT(!hat_page_getshare(tpp));
2807                 ASSERT(tpp->p_vnode == NULL);
2808                 ASSERT(tpp->p_szc == szc);
2809 
2810                 PP_SETFREE(tpp);
2811                 page_clr_all_props(tpp);
2812                 PP_SETAGED(tpp);
2813                 tpp->p_offset = (u_offset_t)-1;
2814                 ASSERT(tpp->p_next == tpp);
2815                 ASSERT(tpp->p_prev == tpp);
2816                 page_list_concat(&rootpp, &tpp);
2817         }
2818         ASSERT(rootpp == pp);
2819 
2820         page_list_add_pages(rootpp, 0);
2821         page_create_putback(pgcnt);
2822 }
2823 
2824 int free_pages = 1;
2825 
2826 /*
2827  * This routine attempts to return pages to the cachelist via page_release().
2828  * It does not *have* to be successful in all cases, since the pageout scanner
2829  * will catch any pages it misses.  It does need to be fast and not introduce
2830  * too much overhead.
2831  *
2832  * If a page isn't found on the unlocked sweep of the page_hash bucket, we
2833  * don't lock and retry.  This is ok, since the page scanner will eventually
2834  * find any page we miss in free_vp_pages().
2835  */
2836 void
2837 free_vp_pages(vnode_t *vp, u_offset_t off, size_t len)
2838 {
2839         page_t *pp;
2840         u_offset_t eoff;
2841         extern int swap_in_range(vnode_t *, u_offset_t, size_t);
2842 
2843         eoff = off + len;
2844 
2845         if (free_pages == 0)
2846                 return;
2847         if (swap_in_range(vp, off, len))
2848                 return;
2849 
2850         for (; off < eoff; off += PAGESIZE) {
2851 
2852                 /*
2853                  * find the page using a fast, but inexact search. It'll be OK
2854                  * if a few pages slip through the cracks here.
2855                  */
2856                 pp = page_exists(vp, off);
2857 
2858                 /*
2859                  * If we didn't find the page (it may not exist), if the page
2860                  * is free, if it still looks in use (shared), or if we can't
2861                  * lock it, just give up.
2862                  */
2863                 if (pp == NULL ||
2864                     PP_ISFREE(pp) ||
2865                     page_share_cnt(pp) > 0 ||
2866                     !page_trylock(pp, SE_EXCL))
2867                         continue;
2868 
2869                 /*
2870                  * Once we have locked pp, verify that it's still the
2871                  * correct page and not already free
2872                  */
2873                 ASSERT(PAGE_LOCKED_SE(pp, SE_EXCL));
2874                 if (pp->p_vnode != vp || pp->p_offset != off || PP_ISFREE(pp)) {
2875                         page_unlock(pp);
2876                         continue;
2877                 }
2878 
2879                 /*
2880                  * try to release the page...
2881                  */
2882                 (void) page_release(pp, 1);
2883         }
2884 }
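/*
 * Illustrative caller sketch (not from the original source): a caller that
 * knows a range [off, off + len) of vp's pages is no longer needed could
 * hand the pages back with
 *
 *        free_vp_pages(vp, off, len);
 *
 * accepting that any page the unlocked search misses will be caught later
 * by the pageout scanner.
 */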
2885 
2886 /*
2887  * Reclaim the given page from the free list.
2888  * If pp is part of a large page, only the given constituent page is reclaimed
2889  * and the large page it belonged to will be demoted.  This can only happen
2890  * if the page is not on the cachelist.
2891  *
2892  * Returns 1 on success or 0 on failure.
2893  *
2894  * The page is unlocked if it can't be reclaimed (when freemem == 0).
2895  * If `lock' is non-null, it will be dropped and re-acquired if
2896  * the routine must wait while freemem is 0.
2897  *
2898  * As it turns out, boot_getpages() does this.  It picks a page,
2899  * based on where OBP mapped in some address, gets its pfn, searches
2900  * the memsegs, locks the page, then pulls it off the free list!
2901  */
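/*
 * Assumed caller pattern (an illustration, not from the original source):
 * after finding a free page and taking its exclusive lock, a caller
 * typically does
 *
 *        if (!page_reclaim(pp, phm))
 *                goto retry;
 *
 * since a return of 0 means pp was unlocked (and `phm', if supplied, was
 * dropped and re-acquired) while this routine waited for memory.
 */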
2902 int
2903 page_reclaim(page_t *pp, kmutex_t *lock)
2904 {
2905         struct pcf      *p;
2906         struct cpu      *cpup;
2907         int             enough;
2908         uint_t          i;
2909 
2910         ASSERT(lock != NULL ? MUTEX_HELD(lock) : 1);
2911         ASSERT(PAGE_EXCL(pp) && PP_ISFREE(pp));
2912 
2913         /*
2914          * If `freemem' is 0, we cannot reclaim this page from the
2915          * freelist, so release every lock we might hold: the page,
2916          * and the `lock' before blocking.
2917          *
2918          * The only way `freemem' can become 0 while there are pages
2919          * marked free (have their p->p_free bit set) is when the
2920          * system is low on memory and doing a page_create().
2921          * page_create() decreases `freemem' by the requested amount up
2922          * front to guarantee that, once it starts acquiring pages, it
2923          * will be able to get everything it needs.  So we need to
2924          * release this page and let page_create() have it.
2925          *
2926          * Since `freemem' being zero is not supposed to happen, just
2927          * use the usual hash stuff as a starting point.  If that bucket
2928          * is empty, then assume the worst, and start at the beginning
2929          * of the pcf array.  If we always start at the beginning
2930          * when acquiring more than one pcf lock, there won't be any
2931          * deadlock problems.
2932          */
2933 
2934         /* TODO: Do we need to test kcage_freemem if PG_NORELOC(pp)? */
2935 
2936         if (freemem <= throttlefree && !page_create_throttle(1l, 0)) {
2937                 pcf_acquire_all();
2938                 goto page_reclaim_nomem;
2939         }
2940 
2941         enough = pcf_decrement_bucket(1);
2942 
2943         if (!enough) {
2944                 VM_STAT_ADD(page_reclaim_zero);
2945                 /*
2946                  * Check again.  It's possible that some other thread
2947                  * could have been right behind us, and added one
2948                  * to a list somewhere.  Acquire each of the pcf locks
2949                  * until we find a page.
2950                  */
2951                 p = pcf;
2952                 for (i = 0; i < pcf_fanout; i++) {
2953                         mutex_enter(&p->pcf_lock);
2954                         if (p->pcf_count >= 1) {
2955                                 p->pcf_count -= 1;
2956                                 /*
2957                                  * freemem is not protected by any lock. Thus,
2958                                  * we cannot have any assertion containing
2959                                  * freemem here.
2960                                  */
2961                                 freemem -= 1;
2962                                 enough = 1;
2963                                 break;
2964                         }
2965                         p++;
2966                 }
2967 
2968                 if (!enough) {
2969 page_reclaim_nomem:
2970                         /*
2971                          * We really can't have page `pp'.
2972                          * Time for the no-memory dance with
2973                          * page_free().  This is just like
2974                          * page_create_wait().  Plus the added
2975                          * attraction of releasing whatever mutex
2976                          * we were called with in `lock'.
2977                          * page_unlock() will wake up any thread
2978                          * waiting around for this page.
2979                          */
2980                         if (lock) {
2981                                 VM_STAT_ADD(page_reclaim_zero_locked);
2982                                 mutex_exit(lock);
2983                         }
2984                         page_unlock(pp);
2985 
2986                         /*
2987                          * get this before we drop all the pcf locks.
2988                          */
2989                         mutex_enter(&new_freemem_lock);
2990 
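                        /*
                         * Mark every pcf bucket as having a waiter, so that
                         * whichever bucket the next page free lands in will
                         * signal freemem_cv and wake us.
                         */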
2991                         p = pcf;
2992                         for (i = 0; i < pcf_fanout; i++) {
2993                                 p->pcf_wait++;
2994                                 mutex_exit(&p->pcf_lock);
2995                                 p++;
2996                         }
2997 
2998                         freemem_wait++;
2999                         cv_wait(&freemem_cv, &new_freemem_lock);
3000                         freemem_wait--;
3001 
3002                         mutex_exit(&new_freemem_lock);
3003 
3004                         if (lock) {
3005                                 mutex_enter(lock);
3006                         }
3007                         return (0);
3008                 }
3009 
3010                 /*
3011                  * The pcf accounting has been done and none of the
3012                  * pcf_wait flags have been set, so drop the locks
3013                  * and continue on.
3014                  */
3015                 while (p >= pcf) {
3016                         mutex_exit(&p->pcf_lock);
3017                         p--;
3018                 }
3019         }
3020 
3022         VM_STAT_ADD(pagecnt.pc_reclaim);
3023 
3024         /*
3025          * page_list_sub will handle the case where pp is a large page.
3026          * It's possible that the page was promoted while on the freelist.
3027          */
3028         if (PP_ISAGED(pp)) {
3029                 page_list_sub(pp, PG_FREE_LIST);
3030                 TRACE_1(TR_FAC_VM, TR_PAGE_UNFREE_FREE,
3031                     "page_reclaim_free:pp %p", pp);
3032         } else {
3033                 page_list_sub(pp, PG_CACHE_LIST);
3034                 TRACE_1(TR_FAC_VM, TR_PAGE_UNFREE_CACHE,
3035                     "page_reclaim_cache:pp %p", pp);
3036         }
3037 
3038         /*
3039          * clear the p_free & p_age bits since this page is no longer
3040          * on the free list.  Notice that there is a brief window during
3041          * which a page is marked as free but is not on the list.
3042          *
3043          * Set the reference bit to protect against immediate pageout.
3044          */
3045         PP_CLRFREE(pp);
3046         PP_CLRAGED(pp);
3047         page_set_props(pp, P_REF);
3048 
3049         CPU_STATS_ENTER_K();
3050         cpup = CPU;     /* get cpup now that CPU cannot change */
3051         CPU_STATS_ADDQ(cpup, vm, pgrec, 1);
3052         CPU_STATS_ADDQ(cpup, vm, pgfrec, 1);
3053         CPU_STATS_EXIT_K();
3054         ASSERT(pp->p_szc == 0);
3055 
3056         return (1);
3057 }
3058 
3059 /*
3060  * Destroy identity of the page and put it back on
3061  * the page free list.  Assumes that the caller has
3062  * acquired the "exclusive" lock on the page.
3063  */
3064 void
3065 page_destroy(page_t *pp, int dontfree)
3066 {
3067         ASSERT((PAGE_EXCL(pp) &&
3068             !page_iolock_assert(pp)) || panicstr);
3069         ASSERT(pp->p_slckcnt == 0 || panicstr);
3070 
3071         if (pp->p_szc != 0) {
3072                 if (pp->p_vnode == NULL || IS_SWAPFSVP(pp->p_vnode) ||
3073                     PP_ISKAS(pp)) {
3074                         panic("page_destroy: anon or kernel or no vnode "
3075                             "large page %p", (void *)pp);
3076                 }
3077                 page_demote_vp_pages(pp);
3078                 ASSERT(pp->p_szc == 0);
3079         }
3080 
3081         TRACE_1(TR_FAC_VM, TR_PAGE_DESTROY, "page_destroy:pp %p", pp);
3082 
3083         /*
3084          * Unload translations, if any, then hash out the
3085          * page to erase its identity.
3086          */
3087         (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
3088         page_hashout(pp, NULL);
3089 
3090         if (!dontfree) {
3091                 /*
3092                  * Acquire the "freemem_lock" for availrmem.
3093                  * The page_struct_lock need not be acquired for lckcnt
3094                  * and cowcnt since the page has an "exclusive" lock.
3095                  * We are doing a modified version of page_pp_unlock here.
3096                  */
3097                 if ((pp->p_lckcnt != 0) || (pp->p_cowcnt != 0)) {
3098                         mutex_enter(&freemem_lock);
3099                         if (pp->p_lckcnt != 0) {
3100                                 availrmem++;
3101                                 pages_locked--;
3102                                 pp->p_lckcnt = 0;
3103                         }
3104                         if (pp->p_cowcnt != 0) {
3105                                 availrmem += pp->p_cowcnt;
3106                                 pages_locked -= pp->p_cowcnt;
3107                                 pp->p_cowcnt = 0;
3108                         }
3109                         mutex_exit(&freemem_lock);
3110                 }
3111                 /*
3112                  * Put the page on the "free" list.
3113                  */
3114                 page_free(pp, 0);
3115         }
3116 }
3117 
3118 void
3119 page_destroy_pages(page_t *pp)
3120 {
3121 
3122         page_t  *tpp, *rootpp = NULL;
3123         pgcnt_t pgcnt = page_get_pagecnt(pp->p_szc);
3124         pgcnt_t i, pglcks = 0;
3125         uint_t  szc = pp->p_szc;
3126 
3127         ASSERT(pp->p_szc != 0 && pp->p_szc < page_num_pagesizes());
3128 
3129         VM_STAT_ADD(pagecnt.pc_destroy_pages);
3130 
3131         TRACE_1(TR_FAC_VM, TR_PAGE_DESTROY, "page_destroy_pages:pp %p", pp);
3132 
3133         if ((page_pptonum(pp) & (pgcnt - 1)) != 0) {
3134                 panic("page_destroy_pages: not root page %p", (void *)pp);
3135                 /*NOTREACHED*/
3136         }
3137 
3138         for (i = 0, tpp = pp; i < pgcnt; i++, tpp++) {
3139                 ASSERT((PAGE_EXCL(tpp) &&
3140                     !page_iolock_assert(tpp)) || panicstr);
3141                 ASSERT(tpp->p_slckcnt == 0 || panicstr);
3142                 (void) hat_pageunload(tpp, HAT_FORCE_PGUNLOAD);
3143                 page_hashout(tpp, NULL);
3144                 ASSERT(tpp->p_offset == (u_offset_t)-1);
3145                 if (tpp->p_lckcnt != 0) {
3146                         pglcks++;
3147                         tpp->p_lckcnt = 0;
3148                 } else if (tpp->p_cowcnt != 0) {
3149                         pglcks += tpp->p_cowcnt;
3150                         tpp->p_cowcnt = 0;
3151                 }
3152                 ASSERT(!hat_page_getshare(tpp));
3153                 ASSERT(tpp->p_vnode == NULL);
3154                 ASSERT(tpp->p_szc == szc);
3155 
3156                 PP_SETFREE(tpp);
3157                 page_clr_all_props(tpp);
3158                 PP_SETAGED(tpp);
3159                 ASSERT(tpp->p_next == tpp);
3160                 ASSERT(tpp->p_prev == tpp);
3161                 page_list_concat(&rootpp, &tpp);
3162         }
3163 
3164         ASSERT(rootpp == pp);
3165         if (pglcks != 0) {
3166                 mutex_enter(&freemem_lock);
3167                 availrmem += pglcks;
3168                 mutex_exit(&freemem_lock);
3169         }
3170 
3171         page_list_add_pages(rootpp, 0);
3172         page_create_putback(pgcnt);
3173 }
3174 
3175 /*
3176  * Similar to page_destroy(), but destroys pages which are
3177  * locked and known to be on the page free list.  Since
3178  * the page is known to be free and locked, no one can access
3179  * it.
3180  *
3181  * Also, the number of free pages does not change.
3182  */
3183 void
3184 page_destroy_free(page_t *pp)
3185 {
3186         ASSERT(PAGE_EXCL(pp));
3187         ASSERT(PP_ISFREE(pp));
3188         ASSERT(pp->p_vnode);
3189         ASSERT(hat_page_getattr(pp, P_MOD | P_REF | P_RO) == 0);
3190         ASSERT(!hat_page_is_mapped(pp));
3191         ASSERT(PP_ISAGED(pp) == 0);
3192         ASSERT(pp->p_szc == 0);
3193 
3194         VM_STAT_ADD(pagecnt.pc_destroy_free);
3195         page_list_sub(pp, PG_CACHE_LIST);
3196 
3197         page_hashout(pp, NULL);
3198         ASSERT(pp->p_vnode == NULL);
3199         ASSERT(pp->p_offset == (u_offset_t)-1);
3200         ASSERT(pp->p_hash == NULL);
3201 
3202         PP_SETAGED(pp);
3203         page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
3204         page_unlock(pp);
3205 
3206         mutex_enter(&new_freemem_lock);
3207         if (freemem_wait) {
3208                 cv_signal(&freemem_cv);
3209         }
3210         mutex_exit(&new_freemem_lock);
3211 }
3212 
3213 /*
3214  * Rename the page "opp" to have an identity specified
3215  * by [vp, off].  If a page already exists with this name
3216  * it is locked and destroyed.  Note that the page's
3217  * translations are not unloaded during the rename.
3218  *
3219  * This routine is used by the anon layer to "steal" the
3220  * original page and is not unlike destroying a page and
3221  * creating a new page using the same page frame.
3222  *
3223  * XXX -- Could deadlock if caller 1 tries to rename A to B while
3224  * caller 2 tries to rename B to A.
3225  */
3226 void
3227 page_rename(page_t *opp, vnode_t *vp, u_offset_t off)
3228 {
3229         page_t          *pp;
3230         int             olckcnt = 0;
3231         int             ocowcnt = 0;
3232         kmutex_t        *phm;
3233         ulong_t         index;
3234 
3235         ASSERT(PAGE_EXCL(opp) && !page_iolock_assert(opp));
3236         ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
3237         ASSERT(PP_ISFREE(opp) == 0);
3238 
3239         VM_STAT_ADD(page_rename_count);
3240 
3241         TRACE_3(TR_FAC_VM, TR_PAGE_RENAME,
3242             "page rename:pp %p vp %p off %llx", opp, vp, off);
3243 
3244         /*
3245          * CacheFS may call page_rename for a large NFS page
3246          * when both CacheFS and NFS mount points are used
3247          * by applications. Demote this large page before
3248          * renaming it, to ensure that there are no "partial"
3249          * large pages left lying around.
3250          */
3251         if (opp->p_szc != 0) {
3252                 vnode_t *ovp = opp->p_vnode;
3253                 ASSERT(ovp != NULL);
3254                 ASSERT(!IS_SWAPFSVP(ovp));
3255                 ASSERT(!VN_ISKAS(ovp));
3256                 page_demote_vp_pages(opp);
3257                 ASSERT(opp->p_szc == 0);
3258         }
3259 
3260         page_hashout(opp, NULL);
3261         PP_CLRAGED(opp);
3262 
3263         /*
3264          * Acquire the appropriate page hash lock, since
3265          * we're going to rename the page.
3266          */
3267         index = PAGE_HASH_FUNC(vp, off);
3268         phm = PAGE_HASH_MUTEX(index);
3269         mutex_enter(phm);
3270 top:
3271         /*
3272          * Look for an existing page with this name and destroy it if found.
3273          * By holding the page hash lock all the way to the page_hashin()
3274          * call, we are assured that no page can be created with this
3275          * identity.  In the case when the phm lock is dropped to undo any
3276          * hat layer mappings, the existing page is held with an "exclusive"
3277          * lock, again preventing another page from being created with
3278          * this identity.
3279          */
3280         PAGE_HASH_SEARCH(index, pp, vp, off);
3281         if (pp != NULL) {
3282                 VM_STAT_ADD(page_rename_exists);
3283 
3284                 /*
3285                  * As it turns out, this is one of only two places where
3286                  * page_lock() needs to hold the passed in lock in the
3287                  * successful case.  In all of the others, the lock could
3288                  * be dropped as soon as the attempt is made to lock
3289                  * the page.  It is tempting to add yet another argument,
3290                  * PL_KEEP or PL_DROP, to let page_lock know what to do.
3291                  */
3292                 if (!page_lock(pp, SE_EXCL, phm, P_RECLAIM)) {
3293                         /*
3294                          * Went to sleep because the page could not
3295                          * be locked.  We were woken up when the page
3296                          * was unlocked, or when the page was destroyed.
3297                          * In either case, `phm' was dropped while we
3298                          * slept.  Hence we should not just roar through
3299                          * this loop.
3300                          */
3301                         goto top;
3302                 }
3303 
3304                 /*
3305                  * If an existing page is a large page, then demote
3306                  * it to ensure that no "partial" large pages are
3307                  * "created" after page_rename. An existing page
3308                  * can be a CacheFS page, and can't belong to swapfs.
3309                  */
3310                 if (hat_page_is_mapped(pp)) {
3311                         /*
3312                          * Unload translations.  Since we hold the
3313                          * exclusive lock on this page, the page
3314                          * cannot be changed while we drop phm.
3315                          * This is also not a lock protocol violation,
3316                          * but rather the proper way to do things.
3317                          */
3318                         mutex_exit(phm);
3319                         (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
3320                         if (pp->p_szc != 0) {
3321                                 ASSERT(!IS_SWAPFSVP(vp));
3322                                 ASSERT(!VN_ISKAS(vp));
3323                                 page_demote_vp_pages(pp);
3324                                 ASSERT(pp->p_szc == 0);
3325                         }
3326                         mutex_enter(phm);
3327                 } else if (pp->p_szc != 0) {
3328                         ASSERT(!IS_SWAPFSVP(vp));
3329                         ASSERT(!VN_ISKAS(vp));
3330                         mutex_exit(phm);
3331                         page_demote_vp_pages(pp);
3332                         ASSERT(pp->p_szc == 0);
3333                         mutex_enter(phm);
3334                 }
3335                 page_hashout(pp, phm);
3336         }
3337         /*
3338          * Hash in the page with the new identity.
3339          */
3340         if (!page_hashin(opp, vp, off, phm)) {
3341                 /*
3342                  * We were holding phm while we searched for [vp, off]
3343                  * and only dropped phm if we found and locked a page.
3344                  * If we can't create this page now, then something
3345                  * is really broken.
3346                  */
3347                 panic("page_rename: Can't hash in page: %p", (void *)pp);
3348                 /*NOTREACHED*/
3349         }
3350 
3351         ASSERT(MUTEX_HELD(phm));
3352         mutex_exit(phm);
3353 
3354         /*
3355          * Now that we have dropped phm, let's get around to finishing up
3356          * with pp.
3357          */
3358         if (pp != NULL) {
3359                 ASSERT(!hat_page_is_mapped(pp));
3360                 /* for now large pages should not end up here */
3361                 ASSERT(pp->p_szc == 0);
3362                 /*
3363                  * Save the locks for transfer to the new page and then
3364                  * clear them so page_free doesn't think they're important.
3365                  * The page_struct_lock need not be acquired for lckcnt and
3366                  * cowcnt since the page has an "exclusive" lock.
3367                  */
3368                 olckcnt = pp->p_lckcnt;
3369                 ocowcnt = pp->p_cowcnt;
3370                 pp->p_lckcnt = pp->p_cowcnt = 0;
3371 
3372                 /*
3373                  * Put the page on the "free" list after we drop
3374                  * the lock.  The less work under the lock the better.
3375                  */
3376                 /*LINTED: constant in conditional context*/
3377                 VN_DISPOSE(pp, B_FREE, 0, kcred);
3378         }
3379 
3380         /*
3381          * Transfer the lock count from the old page (if any).
3382          * The page_struct_lock need not be acquired for lckcnt and
3383          * cowcnt since the page has an "exclusive" lock.
3384          */
3385         opp->p_lckcnt += olckcnt;
3386         opp->p_cowcnt += ocowcnt;
3387 }
3388 
3389 /*
3390  * Low-level routine to add page `pp' to the hash and vp chains for [vp, offset].
3391  *
3392  * Pages are normally inserted at the start of a vnode's v_pages list.
3393  * If the vnode is VMODSORT and the page is modified, it goes at the end.
3394  * This can happen when a modified page is relocated for DR.
3395  *
3396  * Returns 1 on success and 0 on failure.
3397  */
3398 static int
3399 page_do_hashin(page_t *pp, vnode_t *vp, u_offset_t offset)
3400 {
3401         page_t          **listp;
3402         page_t          *tp;
3403         ulong_t         index;
3404 
3405         ASSERT(PAGE_EXCL(pp));
3406         ASSERT(vp != NULL);
3407         ASSERT(MUTEX_HELD(page_vnode_mutex(vp)));
3408 
3409         /*
3410          * Be sure to set these up before the page is inserted on the hash
3411          * list.  As soon as the page is placed on the list some other
3412          * thread might get confused and wonder how this page could
3413          * possibly hash to this list.
3414          */
3415         pp->p_vnode = vp;
3416         pp->p_offset = offset;
3417 
3418         /*
3419          * record if this page is on a swap vnode
3420          */
3421         if ((vp->v_flag & VISSWAP) != 0)
3422                 PP_SETSWAP(pp);
3423 
3424         index = PAGE_HASH_FUNC(vp, offset);
3425         ASSERT(MUTEX_HELD(PAGE_HASH_MUTEX(index)));
3426         listp = &page_hash[index];
3427 
3428         /*
3429          * If this page is already hashed in, fail this attempt to add it.
3430          */
3431         for (tp = *listp; tp != NULL; tp = tp->p_hash) {
3432                 if (tp->p_vnode == vp && tp->p_offset == offset) {
3433                         pp->p_vnode = NULL;
3434                         pp->p_offset = (u_offset_t)(-1);
3435                         return (0);
3436                 }
3437         }
3438         pp->p_hash = *listp;
3439         *listp = pp;
3440 
3441         /*
3442          * Add the page to the vnode's list of pages
3443          */
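        /*
         * For a VMODSORT vnode, a modified page goes at the tail: v_pages
         * is a circular list, so v_pages->p_vpprev is the last page, and
         * passing the address of its p_vpnext link to page_vpadd() splices
         * the new page in just before the head, leaving clean pages at the
         * front of the list.
         */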
3444         if (vp->v_pages != NULL && IS_VMODSORT(vp) && hat_ismod(pp))
3445                 listp = &vp->v_pages->p_vpprev->p_vpnext;
3446         else
3447                 listp = &vp->v_pages;
3448 
3449         page_vpadd(listp, pp);
3450 
3451         return (1);
3452 }
3453 
3454 /*
3455  * Add page `pp' to both the hash and vp chains for [vp, offset].
3456  *
3457  * Returns 1 on success and 0 on failure.
3458  * If hold is passed in, it is not dropped.
3459  */
3460 int
3461 page_hashin(page_t *pp, vnode_t *vp, u_offset_t offset, kmutex_t *hold)
3462 {
3463         kmutex_t        *phm = NULL;
3464         kmutex_t        *vphm;
3465         int             rc;
3466 
3467         ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
3468         ASSERT(pp->p_fsdata == 0 || panicstr);
3469 
3470         TRACE_3(TR_FAC_VM, TR_PAGE_HASHIN,
3471             "page_hashin:pp %p vp %p offset %llx",
3472             pp, vp, offset);
3473 
3474         VM_STAT_ADD(hashin_count);
3475 
3476         if (hold != NULL)
3477                 phm = hold;
3478         else {
3479                 VM_STAT_ADD(hashin_not_held);
3480                 phm = PAGE_HASH_MUTEX(PAGE_HASH_FUNC(vp, offset));
3481                 mutex_enter(phm);
3482         }
3483 
3484         vphm = page_vnode_mutex(vp);
3485         mutex_enter(vphm);
3486         rc = page_do_hashin(pp, vp, offset);
3487         mutex_exit(vphm);
3488         if (hold == NULL)
3489                 mutex_exit(phm);
3490         if (rc == 0)
3491                 VM_STAT_ADD(hashin_already);
3492         return (rc);
3493 }
3494 
3495 /*
3496  * Remove page ``pp'' from the hash and vp chains and remove vp association.
3497  * Both the page hash mutex and the vnode page mutex must be held.
3498  */
3499 static void
3500 page_do_hashout(page_t *pp)
3501 {
3502         page_t  **hpp;
3503         page_t  *hp;
3504         vnode_t *vp = pp->p_vnode;
3505 
3506         ASSERT(vp != NULL);
3507         ASSERT(MUTEX_HELD(page_vnode_mutex(vp)));
3508 
3509         /*
3510          * First, take pp off of its hash chain.
3511          */
3512         hpp = &page_hash[PAGE_HASH_FUNC(vp, pp->p_offset)];
3513 
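        /*
         * Walk the chain keeping a pointer to the previous link (hpp) so
         * that pp can be unlinked in place once it is found.
         */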
3514         for (;;) {
3515                 hp = *hpp;
3516                 if (hp == pp)
3517                         break;
3518                 if (hp == NULL) {
3519                         panic("page_do_hashout");
3520                         /*NOTREACHED*/
3521                 }
3522                 hpp = &hp->p_hash;
3523         }
3524         *hpp = pp->p_hash;
3525 
3526         /*
3527          * Now remove it from its associated vnode.
3528          */
3529         if (vp->v_pages)
3530                 page_vpsub(&vp->v_pages, pp);
3531 
3532         pp->p_hash = NULL;
3533         page_clr_all_props(pp);
3534         PP_CLRSWAP(pp);
3535         pp->p_vnode = NULL;
3536         pp->p_offset = (u_offset_t)-1;
3537         pp->p_fsdata = 0;
3538 }
3539 
3540 /*
3541  * Remove page ``pp'' from the hash and vp chains and remove vp association.
3542  *
3543  * When `phm' is non-NULL it contains the address of the mutex protecting the
3544  * hash list pp is on.  It is not dropped.
3545  */
3546 void
3547 page_hashout(page_t *pp, kmutex_t *phm)
3548 {
3549         vnode_t         *vp;
3550         ulong_t         index;
3551         kmutex_t        *nphm;
3552         kmutex_t        *vphm;
3553         kmutex_t        *sep;
3554 
3555         ASSERT(phm != NULL ? MUTEX_HELD(phm) : 1);
3556         ASSERT(pp->p_vnode != NULL);
3557         ASSERT((PAGE_EXCL(pp) && !page_iolock_assert(pp)) || panicstr);
3558         ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(pp->p_vnode)));
3559 
3560         vp = pp->p_vnode;
3561 
3562         TRACE_2(TR_FAC_VM, TR_PAGE_HASHOUT,
3563             "page_hashout:pp %p vp %p", pp, vp);
3564 
3565         /* Kernel probe */
3566         TNF_PROBE_2(page_unmap, "vm pagefault", /* CSTYLED */,
3567             tnf_opaque, vnode, vp,
3568             tnf_offset, offset, pp->p_offset);
3569 
3573         VM_STAT_ADD(hashout_count);
3574         index = PAGE_HASH_FUNC(vp, pp->p_offset);
3575         if (phm == NULL) {
3576                 VM_STAT_ADD(hashout_not_held);
3577                 nphm = PAGE_HASH_MUTEX(index);
3578                 mutex_enter(nphm);
3579         }
3580         ASSERT(phm ? phm == PAGE_HASH_MUTEX(index) : 1);
3581 
3583         /*
3584          * Grab the page vnode mutex and hash the page out.
3585          */
3586         vphm = page_vnode_mutex(vp);
3587         mutex_enter(vphm);
3588 
3589         page_do_hashout(pp);
3590 
3591         mutex_exit(vphm);
3592         if (phm == NULL)
3593                 mutex_exit(nphm);
3594 
3595         /*
3596          * Wake up processes waiting for this page.  The page's
3597          * identity has been changed, and is probably not the
3598          * desired page any longer.
3599          */
3600         sep = page_se_mutex(pp);
3601         mutex_enter(sep);
3602         pp->p_selock &= ~SE_EWANTED;
3603         if (CV_HAS_WAITERS(&pp->p_cv))
3604                 cv_broadcast(&pp->p_cv);
3605         mutex_exit(sep);
3606 }
3607 
3608 /*
3609  * Add the page to the front of a linked list of pages
3610  * using the p_next & p_prev pointers for the list.
3611  * The caller is responsible for protecting the list pointers.
3612  */
3613 void
3614 page_add(page_t **ppp, page_t *pp)
3615 {
3616         ASSERT(PAGE_EXCL(pp) || (PAGE_SHARED(pp) && page_iolock_assert(pp)));
3617 
3618         page_add_common(ppp, pp);
3619 }
3620 
3623 /*
3624  *  Common code for page_add() and mach_page_add()
3625  */
3626 void
3627 page_add_common(page_t **ppp, page_t *pp)
3628 {
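        /*
         * The list is circular and doubly linked; an empty list is a NULL
         * head pointer.  The new page is inserted in front of the current
         * head and then becomes the head itself.
         */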
3629         if (*ppp == NULL) {
3630                 pp->p_next = pp->p_prev = pp;
3631         } else {
3632                 pp->p_next = *ppp;
3633                 pp->p_prev = (*ppp)->p_prev;
3634                 (*ppp)->p_prev = pp;
3635                 pp->p_prev->p_next = pp;
3636         }
3637         *ppp = pp;
3638 }
3639 
3641 /*
3642  * Remove this page from a linked list of pages
3643  * using the p_next & p_prev pointers for the list.
3644  *
3645  * The caller is responsible for protecting the list pointers.
3646  */
3647 void
3648 page_sub(page_t **ppp, page_t *pp)
3649 {
3650         ASSERT((PP_ISFREE(pp)) ? 1 :
3651             (PAGE_EXCL(pp)) || (PAGE_SHARED(pp) && page_iolock_assert(pp)));
3652 
3653         if (*ppp == NULL || pp == NULL) {
3654                 panic("page_sub: bad arg(s): pp %p, *ppp %p",
3655                     (void *)pp, (void *)(*ppp));
3656                 /*NOTREACHED*/
3657         }
3658 
3659         page_sub_common(ppp, pp);
3660 }
3661 
3663 /*
3664  *  Common code for page_sub() and mach_page_sub()
3665  */
3666 void
3667 page_sub_common(page_t **ppp, page_t *pp)
3668 {
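        /*
         * If pp is the head, advance the head to the next page; if the
         * head is still pp afterwards, pp was the only element and the
         * list becomes empty.
         */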
3669         if (*ppp == pp)
3670                 *ppp = pp->p_next;           /* go to next page */
3671 
3672         if (*ppp == pp)
3673                 *ppp = NULL;                    /* page list is gone */
3674         else {
3675                 pp->p_prev->p_next = pp->p_next;
3676                 pp->p_next->p_prev = pp->p_prev;
3677         }
3678         pp->p_prev = pp->p_next = pp;             /* make pp a list of one */
3679 }
3680 
3682 /*
3683  * Break page list oppp into two lists with npages in the first list.
3684  * The tail is returned in nppp.
3685  */
3686 void
3687 page_list_break(page_t **oppp, page_t **nppp, pgcnt_t npages)
3688 {
3689         page_t *s1pp = *oppp;
3690         page_t *s2pp;
3691         page_t *e1pp, *e2pp;
3692         long n = 0;
3693 
3694         if (s1pp == NULL) {
3695                 *nppp = NULL;
3696                 return;
3697         }
3698         if (npages == 0) {
3699                 *nppp = s1pp;
3700                 *oppp = NULL;
3701                 return;
3702         }
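        /*
         * Walk npages forward from the head; s2pp ends up as the first
         * page of the tail list that is returned in *nppp.
         */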
3703         for (n = 0, s2pp = *oppp; n < npages; n++) {
3704                 s2pp = s2pp->p_next;
3705         }
3706         /* Fix head and tail of new lists */
3707         e1pp = s2pp->p_prev;
3708         e2pp = s1pp->p_prev;
3709         s1pp->p_prev = e1pp;
3710         e1pp->p_next = s1pp;
3711         s2pp->p_prev = e2pp;
3712         e2pp->p_next = s2pp;
3713 
3714         /* second list empty */
3715         if (s2pp == s1pp) {
3716                 *oppp = s1pp;
3717                 *nppp = NULL;
3718         } else {
3719                 *oppp = s1pp;
3720                 *nppp = s2pp;
3721         }
3722 }
3723 
3724 /*
3725  * Concatenate page list nppp onto the end of list ppp.
3726  */
3727 void
3728 page_list_concat(page_t **ppp, page_t **nppp)
3729 {
3730         page_t *s1pp, *s2pp, *e1pp, *e2pp;
3731 
3732         if (*nppp == NULL) {
3733                 return;
3734         }
3735         if (*ppp == NULL) {
3736                 *ppp = *nppp;
3737                 return;
3738         }
3739         s1pp = *ppp;
3740         e1pp =  s1pp->p_prev;
3741         s2pp = *nppp;
3742         e2pp = s2pp->p_prev;
3743         s1pp->p_prev = e2pp;
3744         e2pp->p_next = s1pp;
3745         e1pp->p_next = s2pp;
3746         s2pp->p_prev = e1pp;
3747 }
3748 
3749 /*
3750  * return the next page in the page list
3751  */
3752 page_t *
3753 page_list_next(page_t *pp)
3754 {
3755         return (pp->p_next);
3756 }
3757 
3758 
3759 /*
3760  * Add the page to the front of the linked list of pages
3761  * using p_vpnext/p_vpprev pointers for the list.
3762  *
3763  * The caller is responsible for protecting the lists.
3764  */
3765 void
3766 page_vpadd(page_t **ppp, page_t *pp)
3767 {
3768         if (*ppp == NULL) {
3769                 pp->p_vpnext = pp->p_vpprev = pp;
3770         } else {
3771                 pp->p_vpnext = *ppp;
3772                 pp->p_vpprev = (*ppp)->p_vpprev;
3773                 (*ppp)->p_vpprev = pp;
3774                 pp->p_vpprev->p_vpnext = pp;
3775         }
3776         *ppp = pp;
3777 }
3778 
3779 /*
3780  * Remove this page from the linked list of pages
3781  * using p_vpnext/p_vpprev pointers for the list.
3782  *
3783  * The caller is responsible for protecting the lists.
3784  */
3785 void
3786 page_vpsub(page_t **ppp, page_t *pp)
3787 {
3788         if (*ppp == NULL || pp == NULL) {
3789                 panic("page_vpsub: bad arg(s): pp %p, *ppp %p",
3790                     (void *)pp, (void *)(*ppp));
3791                 /*NOTREACHED*/
3792         }
3793 
3794         if (*ppp == pp)
3795                 *ppp = pp->p_vpnext;         /* go to next page */
3796 
3797         if (*ppp == pp)
3798                 *ppp = NULL;                    /* page list is gone */
3799         else {
3800                 pp->p_vpprev->p_vpnext = pp->p_vpnext;
3801                 pp->p_vpnext->p_vpprev = pp->p_vpprev;
3802         }
3803         pp->p_vpprev = pp->p_vpnext = pp; /* make pp a list of one */
3804 }
3805 
3806 /*
3807  * Lock a physical page into memory "long term".  Used to support "lock
3808  * in memory" functions.  Accepts the page to be locked, and a cow variable
3809  * to indicate whether the lock will travel to the new page during
3810  * a potential copy-on-write.
3811  */
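/*
 * Assumed usage (an illustration, not from the original source): with pp
 * already locked, an ordinary user memory-lock request might look like
 *
 *        if (page_pp_lock(pp, 0, 0) == 0)
 *                fail the request (availrmem or the lock count is exhausted);
 *        ...
 *        page_pp_unlock(pp, 0, 0);
 *
 * where both `cow' and `kernel' are 0.
 */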
3812 int
3813 page_pp_lock(
3814         page_t *pp,                     /* page to be locked */
3815         int cow,                        /* cow lock */
3816         int kernel)                     /* must succeed -- ignore checking */
3817 {
3818         int r = 0;                      /* result -- assume failure */
3819 
3820         ASSERT(PAGE_LOCKED(pp));
3821 
3822         page_struct_lock(pp);
3823         /*
3824          * Acquire the "freemem_lock" for availrmem.
3825          */
3826         if (cow) {
3827                 mutex_enter(&freemem_lock);
3828                 if ((availrmem > pages_pp_maximum) &&
3829                     (pp->p_cowcnt < (ushort_t)PAGE_LOCK_MAXIMUM)) {
3830                         availrmem--;
3831                         pages_locked++;
3832                         mutex_exit(&freemem_lock);
3833                         r = 1;
3834                         if (++pp->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
3835                                 cmn_err(CE_WARN,
3836                                     "COW lock limit reached on pfn 0x%lx",
3837                                     page_pptonum(pp));
3838                         }
3839                 } else
3840                         mutex_exit(&freemem_lock);
3841         } else {
3842                 if (pp->p_lckcnt) {
3843                         if (pp->p_lckcnt < (ushort_t)PAGE_LOCK_MAXIMUM) {
3844                                 r = 1;
3845                                 if (++pp->p_lckcnt ==
3846                                     (ushort_t)PAGE_LOCK_MAXIMUM) {
3847                                         cmn_err(CE_WARN, "Page lock limit "
3848                                             "reached on pfn 0x%lx",
3849                                             page_pptonum(pp));
3850                                 }
3851                         }
3852                 } else {
3853                         if (kernel) {
3854                                 /* availrmem accounting done by caller */
3855                                 ++pp->p_lckcnt;
3856                                 r = 1;
3857                         } else {
3858                                 mutex_enter(&freemem_lock);
3859                                 if (availrmem > pages_pp_maximum) {
3860                                         availrmem--;
3861                                         pages_locked++;
3862                                         ++pp->p_lckcnt;
3863                                         r = 1;
3864                                 }
3865                                 mutex_exit(&freemem_lock);
3866                         }
3867                 }
3868         }
3869         page_struct_unlock(pp);
3870         return (r);
3871 }
3872 
3873 /*
3874  * Decommit a lock on a physical page frame.  Account for cow locks if
3875  * appropriate.
3876  */
3877 void
3878 page_pp_unlock(
3879         page_t *pp,                     /* page to be unlocked */
3880         int cow,                        /* expect cow lock */
3881         int kernel)                     /* this was a kernel lock */
3882 {
3883         ASSERT(PAGE_LOCKED(pp));
3884 
3885         page_struct_lock(pp);
3886         /*
3887          * Acquire the "freemem_lock" for availrmem.
3888          * If cowcnt or lckcnt is already 0, do nothing; i.e., we
3889          * could be called to unlock even if nothing is locked. This could
3890          * happen if locked file pages were truncated (removing the lock)
3891          * and the file was grown again and new pages faulted in; the new
3892          * pages are unlocked but the segment still thinks they're locked.
3893          */
3894         if (cow) {
3895                 if (pp->p_cowcnt) {
3896                         mutex_enter(&freemem_lock);
3897                         pp->p_cowcnt--;
3898                         availrmem++;
3899                         pages_locked--;
3900                         mutex_exit(&freemem_lock);
3901                 }
3902         } else {
3903                 if (pp->p_lckcnt && --pp->p_lckcnt == 0) {
3904                         if (!kernel) {
3905                                 mutex_enter(&freemem_lock);
3906                                 availrmem++;
3907                                 pages_locked--;
3908                                 mutex_exit(&freemem_lock);
3909                         }
3910                 }
3911         }
3912         page_struct_unlock(pp);
3913 }
3914 
3915 /*
3916  * This routine reserves availrmem for npages;
3917  *      flags: KM_NOSLEEP or KM_SLEEP
3918  *      returns 1 on success or 0 on failure
3919  */
3920 int
3921 page_resv(pgcnt_t npages, uint_t flags)
3922 {
3923         mutex_enter(&freemem_lock);
3924         while (availrmem < tune.t_minarmem + npages) {
3925                 if (flags & KM_NOSLEEP) {
3926                         mutex_exit(&freemem_lock);
3927                         return (0);
3928                 }
3929                 mutex_exit(&freemem_lock);
3930                 page_needfree(npages);
3931                 kmem_reap();
3932                 delay(hz >> 2);
3933                 page_needfree(-(spgcnt_t)npages);
3934                 mutex_enter(&freemem_lock);
3935         }
3936         availrmem -= npages;
3937         mutex_exit(&freemem_lock);
3938         return (1);
3939 }
3940 
3941 /*
3942  * This routine unreserves availrmem for npages;
3943  */
3944 void
3945 page_unresv(pgcnt_t npages)
3946 {
3947         mutex_enter(&freemem_lock);
3948         availrmem += npages;
3949         mutex_exit(&freemem_lock);
3950 }
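/*
 * Assumed usage (an illustration, not from the original source): a
 * subsystem that must account for npages up front might do
 *
 *        if (!page_resv(npages, KM_NOSLEEP))
 *                return (ENOMEM);
 *        ...
 *        page_unresv(npages);
 *
 * With KM_SLEEP, page_resv() instead loops, reaping kmem caches and
 * waiting, until availrmem can cover the request.
 */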
3951 
3952 /*
3953  * See Statement at the beginning of segvn_lockop() regarding
3954  * the way we handle cowcnts and lckcnts.
3955  *
3956  * Transfer cowcnt on 'opp' to cowcnt on 'npp' if the vpage
3957  * that breaks COW has PROT_WRITE.
3958  *
3959  * Note that, we may also break COW in case we are softlocking
3960  * on read access during physio;
3961  * in this softlock case, the vpage may not have PROT_WRITE.
3962  * So, we need to transfer lckcnt on 'opp' to lckcnt on 'npp'
3963  * if the vpage doesn't have PROT_WRITE.
3964  *
3965  * This routine is never called if we are stealing a page
3966  * in anon_private.
3967  *
3968  * The caller subtracted from availrmem for a read-only mapping;
3969  * if lckcnt is 1, increment availrmem.
3970  */
3971 void
3972 page_pp_useclaim(
3973         page_t *opp,            /* original page frame losing lock */
3974         page_t *npp,            /* new page frame gaining lock */
3975         uint_t  write_perm)     /* set if vpage has PROT_WRITE */
3976 {
3977         int payback = 0;
3978         int nidx, oidx;
3979 
3980         ASSERT(PAGE_LOCKED(opp));
3981         ASSERT(PAGE_LOCKED(npp));
3982 
3983         /*
3984          * Since we have two pages we probably have two locks.  We need to take
3985          * them in a defined order to avoid deadlocks.  It's also possible they
3986          * both hash to the same lock in which case this is a non-issue.
3987          */
3988         nidx = PAGE_LLOCK_HASH(PP_PAGEROOT(npp));
3989         oidx = PAGE_LLOCK_HASH(PP_PAGEROOT(opp));
3990         if (nidx < oidx) {
3991                 page_struct_lock(npp);
3992                 page_struct_lock(opp);
3993         } else if (oidx < nidx) {
3994                 page_struct_lock(opp);
3995                 page_struct_lock(npp);
3996         } else {        /* The pages hash to the same lock */
3997                 page_struct_lock(npp);
3998         }
3999 
4000         ASSERT(npp->p_cowcnt == 0);
4001         ASSERT(npp->p_lckcnt == 0);
4002 
4003         /* Don't use claim if nothing is locked (see page_pp_unlock above) */
4004         if ((write_perm && opp->p_cowcnt != 0) ||
4005             (!write_perm && opp->p_lckcnt != 0)) {
4006 
4007                 if (write_perm) {
4008                         npp->p_cowcnt++;
4009                         ASSERT(opp->p_cowcnt != 0);
4010                         opp->p_cowcnt--;
4011                 } else {
4012 
4013                         ASSERT(opp->p_lckcnt != 0);
4014 
4015                         /*
4016                          * We didn't need availrmem decremented if p_lckcnt on
4017                          * original page is 1. Here, we are unlocking
4018                          * read-only copy belonging to original page and
4019                          * are locking a copy belonging to new page.
4020                          */
4021                         if (opp->p_lckcnt == 1)
4022                                 payback = 1;
4023 
4024                         npp->p_lckcnt++;
4025                         opp->p_lckcnt--;
4026                 }
4027         }
4028         if (payback) {
4029                 mutex_enter(&freemem_lock);
4030                 availrmem++;
4031                 pages_useclaim--;
4032                 mutex_exit(&freemem_lock);
4033         }
4034 
4035         if (nidx < oidx) {
4036                 page_struct_unlock(opp);
4037                 page_struct_unlock(npp);
4038         } else if (oidx < nidx) {
4039                 page_struct_unlock(npp);
4040                 page_struct_unlock(opp);
4041         } else {        /* The pages hash to the same lock */
4042                 page_struct_unlock(npp);
4043         }
4044 }
4045 
4046 /*
4047  * Simple claim adjust functions -- used to support changes in
4048  * claims due to changes in access permissions.  Used by segvn_setprot().
4049  */
4050 int
4051 page_addclaim(page_t *pp)
4052 {
4053         int r = 0;                      /* result */
4054 
4055         ASSERT(PAGE_LOCKED(pp));
4056 
4057         page_struct_lock(pp);
4058         ASSERT(pp->p_lckcnt != 0);
4059 
4060         if (pp->p_lckcnt == 1) {
4061                 if (pp->p_cowcnt < (ushort_t)PAGE_LOCK_MAXIMUM) {
4062                         --pp->p_lckcnt;
4063                         r = 1;
4064                         if (++pp->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
4065                                 cmn_err(CE_WARN,
4066                                     "COW lock limit reached on pfn 0x%lx",
4067                                     page_pptonum(pp));
4068                         }
4069                 }
4070         } else {
4071                 mutex_enter(&freemem_lock);
4072                 if ((availrmem > pages_pp_maximum) &&
4073                     (pp->p_cowcnt < (ushort_t)PAGE_LOCK_MAXIMUM)) {
4074                         --availrmem;
4075                         ++pages_claimed;
4076                         mutex_exit(&freemem_lock);
4077                         --pp->p_lckcnt;
4078                         r = 1;
4079                         if (++pp->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
4080                                 cmn_err(CE_WARN,
4081                                     "COW lock limit reached on pfn 0x%lx",
4082                                     page_pptonum(pp));
4083                         }
4084                 } else
4085                         mutex_exit(&freemem_lock);
4086         }
4087         page_struct_unlock(pp);
4088         return (r);
4089 }
4090 
4091 int
4092 page_subclaim(page_t *pp)
4093 {
4094         int r = 0;
4095 
4096         ASSERT(PAGE_LOCKED(pp));
4097 
4098         page_struct_lock(pp);
4099         ASSERT(pp->p_cowcnt != 0);
4100 
4101         if (pp->p_lckcnt) {
4102                 if (pp->p_lckcnt < (ushort_t)PAGE_LOCK_MAXIMUM) {
4103                         r = 1;
4104                         /*
4105                          * for availrmem
4106                          */
4107                         mutex_enter(&freemem_lock);
4108                         availrmem++;
4109                         pages_claimed--;
4110                         mutex_exit(&freemem_lock);
4111 
4112                         pp->p_cowcnt--;
4113 
4114                         if (++pp->p_lckcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
4115                                 cmn_err(CE_WARN,
4116                                     "Page lock limit reached on pfn 0x%lx",
4117                                     page_pptonum(pp));
4118                         }
4119                 }
4120         } else {
4121                 r = 1;
4122                 pp->p_cowcnt--;
4123                 pp->p_lckcnt++;
4124         }
4125         page_struct_unlock(pp);
4126         return (r);
4127 }
4128 
4129 /*
4130  * Variant of page_addclaim(), where ppa[] contains the pages of a single large
4131  * page.
4132  */
4133 int
4134 page_addclaim_pages(page_t  **ppa)
4135 {
4136         pgcnt_t lckpgs = 0, pg_idx;
4137 
4138         VM_STAT_ADD(pagecnt.pc_addclaim_pages);
4139 
4140         /*
4141          * Only need to take the page struct lock on the large page root.
4142          */
4143         page_struct_lock(ppa[0]);
4144         for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) {
4145 
4146                 ASSERT(PAGE_LOCKED(ppa[pg_idx]));
4147                 ASSERT(ppa[pg_idx]->p_lckcnt != 0);
4148                 if (ppa[pg_idx]->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
4149                         page_struct_unlock(ppa[0]);
4150                         return (0);
4151                 }
4152                 if (ppa[pg_idx]->p_lckcnt > 1)
4153                         lckpgs++;
4154         }
4155 
4156         if (lckpgs != 0) {
4157                 mutex_enter(&freemem_lock);
4158                 if (availrmem >= pages_pp_maximum + lckpgs) {
4159                         availrmem -= lckpgs;
4160                         pages_claimed += lckpgs;
4161                 } else {
4162                         mutex_exit(&freemem_lock);
4163                         page_struct_unlock(ppa[0]);
4164                         return (0);
4165                 }
4166                 mutex_exit(&freemem_lock);
4167         }
4168 
4169         for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) {
4170                 ppa[pg_idx]->p_lckcnt--;
4171                 ppa[pg_idx]->p_cowcnt++;
4172         }
4173         page_struct_unlock(ppa[0]);
4174         return (1);
4175 }
4176 
4177 /*
4178  * Variant of page_subclaim(), where ppa[] contains the pages of a single large
4179  * page.
4180  */
4181 int
4182 page_subclaim_pages(page_t  **ppa)
4183 {
4184         pgcnt_t ulckpgs = 0, pg_idx;
4185 
4186         VM_STAT_ADD(pagecnt.pc_subclaim_pages);
4187 
4188         /*
4189          * Only need to take the page struct lock on the large page root.
4190          */
4191         page_struct_lock(ppa[0]);
4192         for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) {
4193 
4194                 ASSERT(PAGE_LOCKED(ppa[pg_idx]));
4195                 ASSERT(ppa[pg_idx]->p_cowcnt != 0);
4196                 if (ppa[pg_idx]->p_lckcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
4197                         page_struct_unlock(ppa[0]);
4198                         return (0);
4199                 }
4200                 if (ppa[pg_idx]->p_lckcnt != 0)
4201                         ulckpgs++;
4202         }
4203 
4204         if (ulckpgs != 0) {
4205                 mutex_enter(&freemem_lock);
4206                 availrmem += ulckpgs;
4207                 pages_claimed -= ulckpgs;
4208                 mutex_exit(&freemem_lock);
4209         }
4210 
4211         for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) {
4212                 ppa[pg_idx]->p_cowcnt--;
4213                 ppa[pg_idx]->p_lckcnt++;
4215         }
4216         page_struct_unlock(ppa[0]);
4217         return (1);
4218 }
4219 
4220 page_t *
4221 page_numtopp(pfn_t pfnum, se_t se)
4222 {
4223         page_t *pp;
4224 
4225 retry:
4226         pp = page_numtopp_nolock(pfnum);
4227         if (pp == NULL) {
4228                 return ((page_t *)NULL);
4229         }
4230 
4231         /*
4232          * Acquire the appropriate lock on the page.
4233          */
4234         while (!page_lock(pp, se, (kmutex_t *)NULL, P_RECLAIM)) {
4235                 if (page_pptonum(pp) != pfnum)
4236                         goto retry;
4237                 continue;
4238         }
4239 
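        /*
         * page_lock() may have blocked; by the time it returns, this
         * page_t may no longer correspond to pfnum, so verify the mapping
         * again now that the lock is held.
         */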
4240         if (page_pptonum(pp) != pfnum) {
4241                 page_unlock(pp);
4242                 goto retry;
4243         }
4244 
4245         return (pp);
4246 }
4247 
4248 page_t *
4249 page_numtopp_noreclaim(pfn_t pfnum, se_t se)
4250 {
4251         page_t *pp;
4252 
4253 retry:
4254         pp = page_numtopp_nolock(pfnum);
4255         if (pp == NULL) {
4256                 return ((page_t *)NULL);
4257         }
4258 
4259         /*
4260          * Acquire the appropriate lock on the page.
4261          */
4262         while (!page_lock(pp, se, (kmutex_t *)NULL, P_NO_RECLAIM)) {
4263                 if (page_pptonum(pp) != pfnum)
4264                         goto retry;
4265                 continue;
4266         }
4267 
4268         if (page_pptonum(pp) != pfnum) {
4269                 page_unlock(pp);
4270                 goto retry;
4271         }
4272 
4273         return (pp);
4274 }
4275 
4276 /*
4277  * This routine is like page_numtopp, but it never blocks: it only returns
4278  * pages that are not free and whose lock can be acquired immediately.
4279  */
4280 page_t *
4281 page_numtopp_nowait(pfn_t pfnum, se_t se)
4282 {
4283         page_t *pp;
4284 
4285 retry:
4286         pp = page_numtopp_nolock(pfnum);
4287         if (pp == NULL) {
4288                 return ((page_t *)NULL);
4289         }
4290 
4291         /*
4292          * Try to acquire the appropriate lock on the page.
4293          */
4294         if (PP_ISFREE(pp))
4295                 pp = NULL;
4296         else {
4297                 if (!page_trylock(pp, se))
4298                         pp = NULL;
4299                 else {
4300                         if (page_pptonum(pp) != pfnum) {
4301                                 page_unlock(pp);
4302                                 goto retry;
4303                         }
4304                         if (PP_ISFREE(pp)) {
4305                                 page_unlock(pp);
4306                                 pp = NULL;
4307                         }
4308                 }
4309         }
4310         return (pp);
4311 }
4312 
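/*
 * Call vfs_syncprogress() at least once per this many pages examined in
 * page_busy(), so that a sync does not appear hung while walking a very
 * long page list.
 */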
4313 #define SYNC_PROGRESS_NPAGES    1000
4314 
4315 /*
4316  * Returns a count of dirty pages that are in the process
4317  * of being written out.  If 'cleanit' is set, try to push the page.
4318  */
4319 pgcnt_t
4320 page_busy(int cleanit)
4321 {
4322         page_t *page0 = page_first();
4323         page_t *pp = page0;
4324         pgcnt_t nppbusy = 0;
4325         int counter = 0;
4326         u_offset_t off;
4327 
4328         do {
4329                 vnode_t *vp = pp->p_vnode;
4330 
4331                 /*
4332                  * Reset the sync timeout. The page list is very long
4333                  * on large memory systems.
4334                  */
4335                 if (++counter > SYNC_PROGRESS_NPAGES) {
4336                         counter = 0;
4337                         vfs_syncprogress();
4338                 }
4339 
4340                 /*
4341                  * A page is a candidate for syncing if it is:
4342                  *
4343                  * (a)  On neither the freelist nor the cachelist
4344                  * (b)  Hashed onto a vnode
4345                  * (c)  Not a kernel page
4346                  * (d)  Dirty
4347                  * (e)  Not part of a swapfile
4348                  * (f)  Owned by a real vnode, i.e., one with a non-null
4349                  *      v_vfsp pointer
4350                  * (g)  Backed by a filesystem which doesn't have a
4351                  *      stubbed-out sync operation
4352                  */
4353                 if (!PP_ISFREE(pp) && vp != NULL && !VN_ISKAS(vp) &&
4354                     hat_ismod(pp) && !IS_SWAPVP(vp) && vp->v_vfsp != NULL &&
4355                     vfs_can_sync(vp->v_vfsp)) {
4356                         nppbusy++;
4357 
4358                         if (!cleanit)
4359                                 continue;
4360                         if (!page_trylock(pp, SE_EXCL))
4361                                 continue;
4362 
4363                         if (PP_ISFREE(pp) || vp == NULL || IS_SWAPVP(vp) ||
4364                             pp->p_lckcnt != 0 || pp->p_cowcnt != 0 ||
4365                             !(hat_pagesync(pp,
4366                             HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD) & P_MOD)) {
4367                                 page_unlock(pp);
4368                                 continue;
4369                         }
4370                         off = pp->p_offset;
4371                         VN_HOLD(vp);
4372                         page_unlock(pp);
4373                         (void) VOP_PUTPAGE(vp, off, PAGESIZE,
4374                             B_ASYNC | B_FREE, kcred, NULL);
4375                         VN_RELE(vp);
4376                 }
4377         } while ((pp = page_next(pp)) != page0);
4378 
4379         vfs_syncprogress();
4380         return (nppbusy);
4381 }
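
/*
 * Illustrative sketch, not part of the original source (hence the "notdef"
 * guard): one way a sync path might use page_busy().  A first pass with
 * 'cleanit' set starts asynchronous pushes of dirty pages; the caller then
 * polls with 'cleanit' clear until the count of in-flight pages drains or it
 * gives up.  The helper name and retry policy are assumptions.
 */
#ifdef notdef
static void
example_flush_dirty_pages(void)
{
        int tries = 0;

        (void) page_busy(1);                    /* start async pushes */
        while (page_busy(0) != 0 && tries++ < 10)
                delay(hz);                      /* let the I/O drain */
}
#endif  /* notdef */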
4382 
4383 void page_invalidate_pages(void);
4384 
4385 /*
 * Callback handler for the VM subsystem.
 *
 * Callers must ensure this function is not entered recursively.
4389  */
4390 /*ARGSUSED*/
4391 boolean_t
4392 callb_vm_cpr(void *arg, int code)
4393 {
4394         if (code == CB_CODE_CPR_CHKPT)
4395                 page_invalidate_pages();
4396         return (B_TRUE);
4397 }
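
/*
 * Illustrative sketch, not part of the original source (hence the "notdef"
 * guard): roughly how callb_vm_cpr() would be registered with the callback
 * framework so that it runs at CPR checkpoint time.  The actual registration
 * lives elsewhere in the kernel; the name string here is an assumption.
 */
#ifdef notdef
static void
example_register_vm_cpr_callback(void)
{
        (void) callb_add(callb_vm_cpr, NULL, CB_CL_CPR_VM, "vm");
}
#endif  /* notdef */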
4398 
4399 /*
 * Invalidate all pages in the system.
 * This routine should not be called until all user page activity has stopped.
4402  */
4403 void
4404 page_invalidate_pages()
4405 {
4406         page_t *pp;
4407         page_t *page0;
4408         pgcnt_t nbusypages;
4409         int retry = 0;
4410         const int MAXRETRIES = 4;
4411 top:
4412         /*
4413          * Flush dirty pages and destroy the clean ones.
4414          */
4415         nbusypages = 0;
4416 
4417         pp = page0 = page_first();
4418         do {
4419                 struct vnode    *vp;
4420                 u_offset_t      offset;
4421                 int             mod;
4422 
4423                 /*
                 * Skip the page if it has no vnode, or if it is associated
                 * with the kernel vnode or with PROM-allocated kernel memory.
4426                  */
4427                 if ((vp = pp->p_vnode) == NULL || VN_ISKAS(vp))
4428                         continue;
4429 
4430                 /*
                 * Skip the page if it has already been freed and invalidated.
4432                  */
4433                 if (PP_ISFREE(pp) && PP_ISAGED(pp))
4434                         continue;
4435 
4436                 /*
4437                  * skip pages that are already locked or can't be "exclusively"
4438                  * locked or are already free.  After we lock the page, check
4439                  * the free and age bits again to be sure it's not destroyed
4440                  * yet.
                 * To achieve maximum parallelism, we use page_trylock instead
                 * of page_lock so that we don't get blocked on individual
                 * pages while we have thousands of other pages to process.
4444                  */
4445                 if (!page_trylock(pp, SE_EXCL)) {
4446                         nbusypages++;
4447                         continue;
4448                 } else if (PP_ISFREE(pp)) {
4449                         if (!PP_ISAGED(pp)) {
4450                                 page_destroy_free(pp);
4451                         } else {
4452                                 page_unlock(pp);
4453                         }
4454                         continue;
4455                 }
4456                 /*
4457                  * Is this page involved in some I/O? shared?
4458                  *
4459                  * The page_struct_lock need not be acquired to
4460                  * examine these fields since the page has an
4461                  * "exclusive" lock.
4462                  */
4463                 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
4464                         page_unlock(pp);
4465                         continue;
4466                 }
4467 
4468                 if (vp->v_type == VCHR) {
4469                         panic("vp->v_type == VCHR");
4470                         /*NOTREACHED*/
4471                 }
4472 
4473                 if (!page_try_demote_pages(pp)) {
4474                         page_unlock(pp);
4475                         continue;
4476                 }
4477 
4478                 /*
4479                  * Check the modified bit. Leave the bits alone in hardware
4480                  * (they will be modified if we do the putpage).
4481                  */
4482                 mod = (hat_pagesync(pp, HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD)
4483                     & P_MOD);
4484                 if (mod) {
4485                         offset = pp->p_offset;
4486                         /*
4487                          * Hold the vnode before releasing the page lock
4488                          * to prevent it from being freed and re-used by
4489                          * some other thread.
4490                          */
4491                         VN_HOLD(vp);
4492                         page_unlock(pp);
4493                         /*
                         * The error return is not checked here.  Callers such
                         * as cpr deal with any remaining dirty pages at dump
                         * time if this putpage fails.
4497                          */
4498                         (void) VOP_PUTPAGE(vp, offset, PAGESIZE, B_INVAL,
4499                             kcred, NULL);
4500                         VN_RELE(vp);
4501                 } else {
4502                         /*LINTED: constant in conditional context*/
4503                         VN_DISPOSE(pp, B_INVAL, 0, kcred);
4504                 }
4505         } while ((pp = page_next(pp)) != page0);
4506         if (nbusypages && retry++ < MAXRETRIES) {
4507                 delay(1);
4508                 goto top;
4509         }
4510 }
4511 
4512 /*
 * Replace the page "old" with the page "new" on the page hash and vnode
 * lists.
 *
 * The replacement must be done in place, i.e. the equivalent sequence:
4516  *
4517  *      vp = old->p_vnode;
4518  *      off = old->p_offset;
4519  *      page_do_hashout(old)
4520  *      page_do_hashin(new, vp, off)
4521  *
4522  * doesn't work, since
4523  *  1) if old is the only page on the vnode, the v_pages list has a window
4524  *     where it looks empty. This will break file system assumptions.
4525  * and
4526  *  2) pvn_vplist_dirty() can't deal with pages moving on the v_pages list.
4527  */
4528 static void
4529 page_do_relocate_hash(page_t *new, page_t *old)
4530 {
4531         page_t  **hash_list;
4532         vnode_t *vp = old->p_vnode;
4533         kmutex_t *sep;
4534 
4535         ASSERT(PAGE_EXCL(old));
4536         ASSERT(PAGE_EXCL(new));
4537         ASSERT(vp != NULL);
4538         ASSERT(MUTEX_HELD(page_vnode_mutex(vp)));
4539         ASSERT(MUTEX_HELD(PAGE_HASH_MUTEX(PAGE_HASH_FUNC(vp, old->p_offset))));
4540 
4541         /*
4542          * First find old page on the page hash list
4543          */
4544         hash_list = &page_hash[PAGE_HASH_FUNC(vp, old->p_offset)];
4545 
4546         for (;;) {
4547                 if (*hash_list == old)
4548                         break;
4549                 if (*hash_list == NULL) {
4550                         panic("page_do_hashout");
4551                         /*NOTREACHED*/
4552                 }
4553                 hash_list = &(*hash_list)->p_hash;
4554         }
4555 
4556         /*
4557          * update new and replace old with new on the page hash list
4558          */
4559         new->p_vnode = old->p_vnode;
4560         new->p_offset = old->p_offset;
4561         new->p_hash = old->p_hash;
4562         *hash_list = new;
4563 
4564         if ((new->p_vnode->v_flag & VISSWAP) != 0)
4565                 PP_SETSWAP(new);
4566 
4567         /*
4568          * replace old with new on the vnode's page list
4569          */
4570         if (old->p_vpnext == old) {
4571                 new->p_vpnext = new;
4572                 new->p_vpprev = new;
4573         } else {
4574                 new->p_vpnext = old->p_vpnext;
4575                 new->p_vpprev = old->p_vpprev;
4576                 new->p_vpnext->p_vpprev = new;
4577                 new->p_vpprev->p_vpnext = new;
4578         }
4579         if (vp->v_pages == old)
4580                 vp->v_pages = new;
4581 
4582         /*
4583          * clear out the old page
4584          */
4585         old->p_hash = NULL;
4586         old->p_vpnext = NULL;
4587         old->p_vpprev = NULL;
4588         old->p_vnode = NULL;
4589         PP_CLRSWAP(old);
4590         old->p_offset = (u_offset_t)-1;
4591         page_clr_all_props(old);
4592 
4593         /*
4594          * Wake up processes waiting for this page.  The page's
4595          * identity has been changed, and is probably not the
4596          * desired page any longer.
4597          */
4598         sep = page_se_mutex(old);
4599         mutex_enter(sep);
4600         old->p_selock &= ~SE_EWANTED;
4601         if (CV_HAS_WAITERS(&old->p_cv))
4602                 cv_broadcast(&old->p_cv);
4603         mutex_exit(sep);
4604 }
4605 
4606 /*
4607  * This function moves the identity of page "pp_old" to page "pp_new".
4608  * Both pages must be locked on entry.  "pp_new" is free, has no identity,
4609  * and need not be hashed out from anywhere.
4610  */
4611 void
4612 page_relocate_hash(page_t *pp_new, page_t *pp_old)
4613 {
4614         vnode_t *vp = pp_old->p_vnode;
4615         u_offset_t off = pp_old->p_offset;
4616         kmutex_t *phm, *vphm;
4617 
4618         /*
4619          * Rehash two pages
4620          */
4621         ASSERT(PAGE_EXCL(pp_old));
4622         ASSERT(PAGE_EXCL(pp_new));
4623         ASSERT(vp != NULL);
4624         ASSERT(pp_new->p_vnode == NULL);
4625 
4626         /*
4627          * hashout then hashin while holding the mutexes
4628          */
4629         phm = PAGE_HASH_MUTEX(PAGE_HASH_FUNC(vp, off));
4630         mutex_enter(phm);
4631         vphm = page_vnode_mutex(vp);
4632         mutex_enter(vphm);
4633 
4634         page_do_relocate_hash(pp_new, pp_old);
4635 
4636         /* The following comment preserved from page_flip(). */
4637         pp_new->p_fsdata = pp_old->p_fsdata;
4638         pp_old->p_fsdata = 0;
4639         mutex_exit(vphm);
4640         mutex_exit(phm);
4641 
4642         /*
4643          * The page_struct_lock need not be acquired for lckcnt and
4644          * cowcnt since the page has an "exclusive" lock.
4645          */
4646         ASSERT(pp_new->p_lckcnt == 0);
4647         ASSERT(pp_new->p_cowcnt == 0);
4648         pp_new->p_lckcnt = pp_old->p_lckcnt;
4649         pp_new->p_cowcnt = pp_old->p_cowcnt;
4650         pp_old->p_lckcnt = pp_old->p_cowcnt = 0;
4651 
4652 }
4653 
4654 /*
4655  * Helper routine used to lock all remaining members of a
4656  * large page. The caller is responsible for passing in a locked
4657  * pp. If pp is a large page, then it succeeds in locking all the
4658  * remaining constituent pages or it returns with only the
4659  * original page locked.
4660  *
4661  * Returns 1 on success, 0 on failure.
4662  *
 * If success is returned, this routine guarantees that p_szc cannot change
 * for any constituent page of the large page that pp belongs to.  To achieve
 * this we recheck the szc of pp after locking all constituent pages and retry
 * if the szc changed (it can only decrease).  Since hat_page_demote() needs
 * an EXCL lock on one of the constituent pages, it can't be running once all
 * constituent pages are locked.  A hat_page_demote() holding a lock on a
 * constituent page outside of this large page (i.e. pp belonged to an even
 * larger large page) is already done with all constituent pages of pp, since
 * the root's p_szc is changed last.  Therefore there is no need to
 * synchronize with a hat_page_demote() that locked a constituent page outside
 * of pp's current large page.
4673  */
4674 #ifdef DEBUG
4675 uint32_t gpg_trylock_mtbf = 0;
4676 #endif
4677 
4678 int
4679 group_page_trylock(page_t *pp, se_t se)
4680 {
4681         page_t  *tpp;
4682         pgcnt_t npgs, i, j;
4683         uint_t pszc = pp->p_szc;
4684 
4685 #ifdef DEBUG
4686         if (gpg_trylock_mtbf && !(gethrtime() % gpg_trylock_mtbf)) {
4687                 return (0);
4688         }
4689 #endif
4690 
4691         if (pp != PP_GROUPLEADER(pp, pszc)) {
4692                 return (0);
4693         }
4694 
4695 retry:
4696         ASSERT(PAGE_LOCKED_SE(pp, se));
4697         ASSERT(!PP_ISFREE(pp));
4698         if (pszc == 0) {
4699                 return (1);
4700         }
4701         npgs = page_get_pagecnt(pszc);
4702         tpp = pp + 1;
4703         for (i = 1; i < npgs; i++, tpp++) {
4704                 if (!page_trylock(tpp, se)) {
4705                         tpp = pp + 1;
4706                         for (j = 1; j < i; j++, tpp++) {
4707                                 page_unlock(tpp);
4708                         }
4709                         return (0);
4710                 }
4711         }
4712         if (pp->p_szc != pszc) {
4713                 ASSERT(pp->p_szc < pszc);
4714                 ASSERT(pp->p_vnode != NULL && !PP_ISKAS(pp) &&
4715                     !IS_SWAPFSVP(pp->p_vnode));
4716                 tpp = pp + 1;
4717                 for (i = 1; i < npgs; i++, tpp++) {
4718                         page_unlock(tpp);
4719                 }
4720                 pszc = pp->p_szc;
4721                 goto retry;
4722         }
4723         return (1);
4724 }
4725 
4726 void
4727 group_page_unlock(page_t *pp)
4728 {
4729         page_t *tpp;
4730         pgcnt_t npgs, i;
4731 
4732         ASSERT(PAGE_LOCKED(pp));
4733         ASSERT(!PP_ISFREE(pp));
4734         ASSERT(pp == PP_PAGEROOT(pp));
4735         npgs = page_get_pagecnt(pp->p_szc);
4736         for (i = 1, tpp = pp + 1; i < npgs; i++, tpp++) {
4737                 page_unlock(tpp);
4738         }
4739 }
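
/*
 * Illustrative sketch, not part of the original source (hence the "notdef"
 * guard): the expected pairing of group_page_trylock() and
 * group_page_unlock().  The caller must already hold the root page at the
 * requested lock level; on success every other constituent page is locked as
 * well and p_szc is stable until group_page_unlock() drops them (the root
 * itself stays locked).  The helper name is hypothetical.
 */
#ifdef notdef
static int
example_with_large_page_locked(page_t *rootpp)
{
        if (!group_page_trylock(rootpp, SE_EXCL))
                return (0);     /* couldn't lock all constituent pages */

        /* ... operate on the whole large page; p_szc can't change ... */

        group_page_unlock(rootpp);      /* drops all but rootpp itself */
        return (1);
}
#endif  /* notdef */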
4740 
4741 /*
4742  * returns
4743  * 0            : on success and *nrelocp is number of relocated PAGESIZE pages
4744  * ERANGE       : this is not a base page
4745  * EBUSY        : failure to get locks on the page/pages
4746  * ENOMEM       : failure to obtain replacement pages
4747  * EAGAIN       : OBP has not yet completed its boot-time handoff to the kernel
4748  * EIO          : An error occurred while trying to copy the page data
4749  *
4750  * Return with all constituent members of target and replacement
 * SE_EXCL locked.  It is the caller's responsibility to drop the
 * locks.
4753  */
4754 int
4755 do_page_relocate(
4756         page_t **target,
4757         page_t **replacement,
4758         int grouplock,
4759         spgcnt_t *nrelocp,
4760         lgrp_t *lgrp)
4761 {
4762         page_t *first_repl;
4763         page_t *repl;
4764         page_t *targ;
4765         page_t *pl = NULL;
4766         uint_t ppattr;
4767         pfn_t   pfn, repl_pfn;
4768         uint_t  szc;
4769         spgcnt_t npgs, i;
4770         int repl_contig = 0;
4771         uint_t flags = 0;
4772         spgcnt_t dofree = 0;
4773 
4774         *nrelocp = 0;
4775 
4776 #if defined(__sparc)
4777         /*
         * We need to wait until OBP has completed
         * its boot-time handoff of its resources to the kernel
         * before we allow page relocation.
4781          */
4782         if (page_relocate_ready == 0) {
4783                 return (EAGAIN);
4784         }
4785 #endif
4786 
4787         /*
4788          * If this is not a base page,
         * just return with no pages relocated.
4790          */
4791         targ = *target;
4792         ASSERT(PAGE_EXCL(targ));
4793         ASSERT(!PP_ISFREE(targ));
4794         szc = targ->p_szc;
4795         ASSERT(szc < mmu_page_sizes);
4796         VM_STAT_ADD(vmm_vmstats.ppr_reloc[szc]);
4797         pfn = targ->p_pagenum;
4798         if (pfn != PFN_BASE(pfn, szc)) {
4799                 VM_STAT_ADD(vmm_vmstats.ppr_relocnoroot[szc]);
4800                 return (ERANGE);
4801         }
4802 
4803         if ((repl = *replacement) != NULL && repl->p_szc >= szc) {
4804                 repl_pfn = repl->p_pagenum;
4805                 if (repl_pfn != PFN_BASE(repl_pfn, szc)) {
4806                         VM_STAT_ADD(vmm_vmstats.ppr_reloc_replnoroot[szc]);
4807                         return (ERANGE);
4808                 }
4809                 repl_contig = 1;
4810         }
4811 
4812         /*
4813          * We must lock all members of this large page or we cannot
4814          * relocate any part of it.
4815          */
4816         if (grouplock != 0 && !group_page_trylock(targ, SE_EXCL)) {
4817                 VM_STAT_ADD(vmm_vmstats.ppr_relocnolock[targ->p_szc]);
4818                 return (EBUSY);
4819         }
4820 
4821         /*
         * Reread szc; it could have been decreased before
         * group_page_trylock() was done.
4824          */
4825         szc = targ->p_szc;
4826         ASSERT(szc < mmu_page_sizes);
4827         VM_STAT_ADD(vmm_vmstats.ppr_reloc[szc]);
4828         ASSERT(pfn == PFN_BASE(pfn, szc));
4829 
4830         npgs = page_get_pagecnt(targ->p_szc);
4831 
4832         if (repl == NULL) {
4833                 dofree = npgs;          /* Size of target page in MMU pages */
4834                 if (!page_create_wait(dofree, 0)) {
4835                         if (grouplock != 0) {
4836                                 group_page_unlock(targ);
4837                         }
4838                         VM_STAT_ADD(vmm_vmstats.ppr_relocnomem[szc]);
4839                         return (ENOMEM);
4840                 }
4841 
4842                 /*
4843                  * seg kmem pages require that the target and replacement
4844                  * page be the same pagesize.
4845                  */
4846                 flags = (VN_ISKAS(targ->p_vnode)) ? PGR_SAMESZC : 0;
4847                 repl = page_get_replacement_page(targ, lgrp, flags);
4848                 if (repl == NULL) {
4849                         if (grouplock != 0) {
4850                                 group_page_unlock(targ);
4851                         }
4852                         page_create_putback(dofree);
4853                         VM_STAT_ADD(vmm_vmstats.ppr_relocnomem[szc]);
4854                         return (ENOMEM);
4855                 }
4856         }
4857 #ifdef DEBUG
4858         else {
4859                 ASSERT(PAGE_LOCKED(repl));
4860         }
4861 #endif /* DEBUG */
4862 
4863 #if defined(__sparc)
4864         /*
         * Let hat_page_relocate() complete the relocation if it's a kernel page.
4866          */
4867         if (VN_ISKAS(targ->p_vnode)) {
4868                 *replacement = repl;
4869                 if (hat_page_relocate(target, replacement, nrelocp) != 0) {
4870                         if (grouplock != 0) {
4871                                 group_page_unlock(targ);
4872                         }
4873                         if (dofree) {
4874                                 *replacement = NULL;
4875                                 page_free_replacement_page(repl);
4876                                 page_create_putback(dofree);
4877                         }
4878                         VM_STAT_ADD(vmm_vmstats.ppr_krelocfail[szc]);
4879                         return (EAGAIN);
4880                 }
4881                 VM_STAT_ADD(vmm_vmstats.ppr_relocok[szc]);
4882                 return (0);
4883         }
4884 #else
4885 #if defined(lint)
4886         dofree = dofree;
4887 #endif
4888 #endif
4889 
4890         first_repl = repl;
4891 
4892         for (i = 0; i < npgs; i++) {
4893                 ASSERT(PAGE_EXCL(targ));
4894                 ASSERT(targ->p_slckcnt == 0);
4895                 ASSERT(repl->p_slckcnt == 0);
4896 
4897                 (void) hat_pageunload(targ, HAT_FORCE_PGUNLOAD);
4898 
4899                 ASSERT(hat_page_getshare(targ) == 0);
4900                 ASSERT(!PP_ISFREE(targ));
4901                 ASSERT(targ->p_pagenum == (pfn + i));
4902                 ASSERT(repl_contig == 0 ||
4903                     repl->p_pagenum == (repl_pfn + i));
4904 
4905                 /*
4906                  * Copy the page contents and attributes then
4907                  * relocate the page in the page hash.
4908                  */
4909                 if (ppcopy(targ, repl) == 0) {
4910                         targ = *target;
4911                         repl = first_repl;
4912                         VM_STAT_ADD(vmm_vmstats.ppr_copyfail);
4913                         if (grouplock != 0) {
4914                                 group_page_unlock(targ);
4915                         }
4916                         if (dofree) {
4917                                 *replacement = NULL;
4918                                 page_free_replacement_page(repl);
4919                                 page_create_putback(dofree);
4920                         }
4921                         return (EIO);
4922                 }
4923 
4924                 targ++;
4925                 if (repl_contig != 0) {
4926                         repl++;
4927                 } else {
4928                         repl = repl->p_next;
4929                 }
4930         }
4931 
4932         repl = first_repl;
4933         targ = *target;
4934 
4935         for (i = 0; i < npgs; i++) {
4936                 ppattr = hat_page_getattr(targ, (P_MOD | P_REF | P_RO));
4937                 page_clr_all_props(repl);
4938                 page_set_props(repl, ppattr);
4939                 page_relocate_hash(repl, targ);
4940 
4941                 ASSERT(hat_page_getshare(targ) == 0);
4942                 ASSERT(hat_page_getshare(repl) == 0);
4943                 /*
4944                  * Now clear the props on targ, after the
4945                  * page_relocate_hash(), they no longer
4946                  * have any meaning.
4947                  */
4948                 page_clr_all_props(targ);
4949                 ASSERT(targ->p_next == targ);
4950                 ASSERT(targ->p_prev == targ);
4951                 page_list_concat(&pl, &targ);
4952 
4953                 targ++;
4954                 if (repl_contig != 0) {
4955                         repl++;
4956                 } else {
4957                         repl = repl->p_next;
4958                 }
4959         }
4960         /* assert that we have come full circle with repl */
4961         ASSERT(repl_contig == 1 || first_repl == repl);
4962 
4963         *target = pl;
4964         if (*replacement == NULL) {
4965                 ASSERT(first_repl == repl);
4966                 *replacement = repl;
4967         }
4968         VM_STAT_ADD(vmm_vmstats.ppr_relocok[szc]);
4969         *nrelocp = npgs;
4970         return (0);
4971 }
4972 /*
 * On success returns 0 and *nrelocp is the number of PAGESIZE pages
 * relocated.
4974  */
4975 int
4976 page_relocate(
4977         page_t **target,
4978         page_t **replacement,
4979         int grouplock,
4980         int freetarget,
4981         spgcnt_t *nrelocp,
4982         lgrp_t *lgrp)
4983 {
4984         spgcnt_t ret;
4985 
4986         /* do_page_relocate returns 0 on success or errno value */
4987         ret = do_page_relocate(target, replacement, grouplock, nrelocp, lgrp);
4988 
4989         if (ret != 0 || freetarget == 0) {
4990                 return (ret);
4991         }
4992         if (*nrelocp == 1) {
4993                 ASSERT(*target != NULL);
4994                 page_free(*target, 1);
4995         } else {
4996                 page_t *tpp = *target;
4997                 uint_t szc = tpp->p_szc;
4998                 pgcnt_t npgs = page_get_pagecnt(szc);
4999                 ASSERT(npgs > 1);
5000                 ASSERT(szc != 0);
5001                 do {
5002                         ASSERT(PAGE_EXCL(tpp));
5003                         ASSERT(!hat_page_is_mapped(tpp));
5004                         ASSERT(tpp->p_szc == szc);
5005                         PP_SETFREE(tpp);
5006                         PP_SETAGED(tpp);
5007                         npgs--;
5008                 } while ((tpp = tpp->p_next) != *target);
5009                 ASSERT(npgs == 0);
5010                 page_list_add_pages(*target, 0);
5011                 npgs = page_get_pagecnt(szc);
5012                 page_create_putback(npgs);
5013         }
5014         return (ret);
5015 }
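
/*
 * Illustrative sketch, not part of the original source (hence the "notdef"
 * guard): the common calling pattern for page_relocate() when no replacement
 * page is supplied.  On success the replacement page(s) come back SE_EXCL
 * locked and linked through p_next, and dropping them is the caller's
 * responsibility, as noted above do_page_relocate().  The helper name is
 * hypothetical.
 */
#ifdef notdef
static int
example_relocate_page(page_t *targ, lgrp_t *lgrp)
{
        page_t *repl = NULL;
        spgcnt_t nreloc;
        int err;

        ASSERT(PAGE_EXCL(targ) && !PP_ISFREE(targ));

        /* grouplock and freetarget set: lock constituents, free old pages */
        err = page_relocate(&targ, &repl, 1, 1, &nreloc, lgrp);
        if (err != 0)
                return (err);   /* ERANGE, EBUSY, ENOMEM, EAGAIN or EIO */

        while (nreloc-- > 0) {
                page_t *pp = repl;

                page_sub(&repl, pp);    /* unlink from replacement list */
                page_unlock(pp);
        }
        return (0);
}
#endif  /* notdef */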
5016 
5017 /*
5018  * it is up to the caller to deal with pcf accounting.
5019  */
5020 void
5021 page_free_replacement_page(page_t *pplist)
5022 {
5023         page_t *pp;
5024 
5025         while (pplist != NULL) {
5026                 /*
5027                  * pp_targ is a linked list.
5028                  */
5029                 pp = pplist;
5030                 if (pp->p_szc == 0) {
5031                         page_sub(&pplist, pp);
5032                         page_clr_all_props(pp);
5033                         PP_SETFREE(pp);
5034                         PP_SETAGED(pp);
5035                         page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
5036                         page_unlock(pp);
5037                         VM_STAT_ADD(pagecnt.pc_free_replacement_page[0]);
5038                 } else {
5039                         spgcnt_t curnpgs = page_get_pagecnt(pp->p_szc);
5040                         page_t *tpp;
5041                         page_list_break(&pp, &pplist, curnpgs);
5042                         tpp = pp;
5043                         do {
5044                                 ASSERT(PAGE_EXCL(tpp));
5045                                 ASSERT(!hat_page_is_mapped(tpp));
5046                                 page_clr_all_props(tpp);
5047                                 PP_SETFREE(tpp);
5048                                 PP_SETAGED(tpp);
5049                         } while ((tpp = tpp->p_next) != pp);
5050                         page_list_add_pages(pp, 0);
5051                         VM_STAT_ADD(pagecnt.pc_free_replacement_page[1]);
5052                 }
5053         }
5054 }
5055 
5056 /*
5057  * Relocate target to non-relocatable replacement page.
5058  */
5059 int
5060 page_relocate_cage(page_t **target, page_t **replacement)
5061 {
5062         page_t *tpp, *rpp;
5063         spgcnt_t pgcnt, npgs;
5064         int result;
5065 
5066         tpp = *target;
5067 
5068         ASSERT(PAGE_EXCL(tpp));
5069         ASSERT(tpp->p_szc == 0);
5070 
5071         pgcnt = btop(page_get_pagesize(tpp->p_szc));
5072 
5073         do {
5074                 (void) page_create_wait(pgcnt, PG_WAIT | PG_NORELOC);
5075                 rpp = page_get_replacement_page(tpp, NULL, PGR_NORELOC);
5076                 if (rpp == NULL) {
5077                         page_create_putback(pgcnt);
5078                         kcage_cageout_wakeup();
5079                 }
5080         } while (rpp == NULL);
5081 
5082         ASSERT(PP_ISNORELOC(rpp));
5083 
5084         result = page_relocate(&tpp, &rpp, 0, 1, &npgs, NULL);
5085 
5086         if (result == 0) {
5087                 *replacement = rpp;
5088                 if (pgcnt != npgs)
5089                         panic("page_relocate_cage: partial relocation");
5090         }
5091 
5092         return (result);
5093 }
5094 
5095 /*
5096  * Release the page lock on a page, place on cachelist
5097  * tail if no longer mapped. Caller can let us know if
5098  * the page is known to be clean.
5099  */
5100 int
5101 page_release(page_t *pp, int checkmod)
5102 {
5103         int status;
5104 
5105         ASSERT(PAGE_LOCKED(pp) && !PP_ISFREE(pp) &&
5106             (pp->p_vnode != NULL));
5107 
5108         if (!hat_page_is_mapped(pp) && !IS_SWAPVP(pp->p_vnode) &&
5109             ((PAGE_SHARED(pp) && page_tryupgrade(pp)) || PAGE_EXCL(pp)) &&
5110             pp->p_lckcnt == 0 && pp->p_cowcnt == 0 &&
5111             !hat_page_is_mapped(pp)) {
5112 
5113                 /*
                 * If the page is modified, just unlock it.
                 *
                 * The (p_nrm & P_MOD) bit is up to date because:
5117                  * (1) We found that this page doesn't have any mappings
5118                  *      _after_ holding SE_EXCL and
5119                  * (2) We didn't drop SE_EXCL lock after the check in (1)
5120                  */
5121                 if (checkmod && hat_ismod(pp)) {
5122                         page_unlock(pp);
5123                         status = PGREL_MOD;
5124                 } else {
5125                         /*LINTED: constant in conditional context*/
5126                         VN_DISPOSE(pp, B_FREE, 0, kcred);
5127                         status = PGREL_CLEAN;
5128                 }
5129         } else {
5130                 page_unlock(pp);
5131                 status = PGREL_NOTREL;
5132         }
5133         return (status);
5134 }
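
/*
 * Illustrative sketch, not part of the original source (hence the "notdef"
 * guard): typical handling of the page_release() return values when a caller
 * is done with a locked vnode page.  The helper name is hypothetical.
 */
#ifdef notdef
static void
example_done_with_page(page_t *pp)
{
        switch (page_release(pp, 1)) {
        case PGREL_CLEAN:
                /* clean and unmapped: the page went to the cachelist */
                break;
        case PGREL_MOD:
                /* dirty: the page was only unlocked, not freed */
                break;
        case PGREL_NOTREL:
        default:
                /* still mapped, locked or shared: only unlocked */
                break;
        }
}
#endif  /* notdef */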
5135 
5136 /*
5137  * Given a constituent page, try to demote the large page on the freelist.
5138  *
5139  * Returns nonzero if the page could be demoted successfully. Returns with
5140  * the constituent page still locked.
5141  */
5142 int
5143 page_try_demote_free_pages(page_t *pp)
5144 {
5145         page_t *rootpp = pp;
5146         pfn_t   pfn = page_pptonum(pp);
5147         spgcnt_t npgs;
5148         uint_t  szc = pp->p_szc;
5149 
5150         ASSERT(PP_ISFREE(pp));
5151         ASSERT(PAGE_EXCL(pp));
5152 
5153         /*
5154          * Adjust rootpp and lock it, if `pp' is not the base
5155          * constituent page.
5156          */
5157         npgs = page_get_pagecnt(pp->p_szc);
5158         if (npgs == 1) {
5159                 return (0);
5160         }
5161 
5162         if (!IS_P2ALIGNED(pfn, npgs)) {
5163                 pfn = P2ALIGN(pfn, npgs);
5164                 rootpp = page_numtopp_nolock(pfn);
5165         }
5166 
5167         if (pp != rootpp && !page_trylock(rootpp, SE_EXCL)) {
5168                 return (0);
5169         }
5170 
5171         if (rootpp->p_szc != szc) {
5172                 if (pp != rootpp)
5173                         page_unlock(rootpp);
5174                 return (0);
5175         }
5176 
5177         page_demote_free_pages(rootpp);
5178 
5179         if (pp != rootpp)
5180                 page_unlock(rootpp);
5181 
5182         ASSERT(PP_ISFREE(pp));
5183         ASSERT(PAGE_EXCL(pp));
5184         return (1);
5185 }
5186 
5187 /*
5188  * Given a constituent page, try to demote the large page.
5189  *
5190  * Returns nonzero if the page could be demoted successfully. Returns with
5191  * the constituent page still locked.
5192  */
5193 int
5194 page_try_demote_pages(page_t *pp)
5195 {
5196         page_t *tpp, *rootpp = pp;
5197         pfn_t   pfn = page_pptonum(pp);
5198         spgcnt_t i, npgs;
5199         uint_t  szc = pp->p_szc;
5200         vnode_t *vp = pp->p_vnode;
5201 
5202         ASSERT(PAGE_EXCL(pp));
5203 
5204         VM_STAT_ADD(pagecnt.pc_try_demote_pages[0]);
5205 
5206         if (pp->p_szc == 0) {
5207                 VM_STAT_ADD(pagecnt.pc_try_demote_pages[1]);
5208                 return (1);
5209         }
5210 
5211         if (vp != NULL && !IS_SWAPFSVP(vp) && !VN_ISKAS(vp)) {
5212                 VM_STAT_ADD(pagecnt.pc_try_demote_pages[2]);
5213                 page_demote_vp_pages(pp);
5214                 ASSERT(pp->p_szc == 0);
5215                 return (1);
5216         }
5217 
5218         /*
         * Adjust rootpp if the page passed in is not the base
         * constituent page.
5221          */
5222         npgs = page_get_pagecnt(pp->p_szc);
5223         ASSERT(npgs > 1);
5224         if (!IS_P2ALIGNED(pfn, npgs)) {
5225                 pfn = P2ALIGN(pfn, npgs);
5226                 rootpp = page_numtopp_nolock(pfn);
5227                 VM_STAT_ADD(pagecnt.pc_try_demote_pages[3]);
5228                 ASSERT(rootpp->p_vnode != NULL);
5229                 ASSERT(rootpp->p_szc == szc);
5230         }
5231 
5232         /*
5233          * We can't demote kernel pages since we can't hat_unload()
5234          * the mappings.
5235          */
5236         if (VN_ISKAS(rootpp->p_vnode))
5237                 return (0);
5238 
5239         /*
5240          * Attempt to lock all constituent pages except the page passed
5241          * in since it's already locked.
5242          */
5243         for (tpp = rootpp, i = 0; i < npgs; i++, tpp++) {
5244                 ASSERT(!PP_ISFREE(tpp));
5245                 ASSERT(tpp->p_vnode != NULL);
5246 
5247                 if (tpp != pp && !page_trylock(tpp, SE_EXCL))
5248                         break;
5249                 ASSERT(tpp->p_szc == rootpp->p_szc);
5250                 ASSERT(page_pptonum(tpp) == page_pptonum(rootpp) + i);
5251         }
5252 
5253         /*
5254          * If we failed to lock them all then unlock what we have
5255          * locked so far and bail.
5256          */
5257         if (i < npgs) {
5258                 tpp = rootpp;
5259                 while (i-- > 0) {
5260                         if (tpp != pp)
5261                                 page_unlock(tpp);
5262                         tpp++;
5263                 }
5264                 VM_STAT_ADD(pagecnt.pc_try_demote_pages[4]);
5265                 return (0);
5266         }
5267 
5268         for (tpp = rootpp, i = 0; i < npgs; i++, tpp++) {
5269                 ASSERT(PAGE_EXCL(tpp));
5270                 ASSERT(tpp->p_slckcnt == 0);
5271                 (void) hat_pageunload(tpp, HAT_FORCE_PGUNLOAD);
5272                 tpp->p_szc = 0;
5273         }
5274 
5275         /*
5276          * Unlock all pages except the page passed in.
5277          */
5278         for (tpp = rootpp, i = 0; i < npgs; i++, tpp++) {
5279                 ASSERT(!hat_page_is_mapped(tpp));
5280                 if (tpp != pp)
5281                         page_unlock(tpp);
5282         }
5283 
5284         VM_STAT_ADD(pagecnt.pc_try_demote_pages[5]);
5285         return (1);
5286 }
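
/*
 * Illustrative sketch, not part of the original source (hence the "notdef"
 * guard): the contract callers such as page_free()/page_destroy() rely on.
 * On success the passed-in constituent page is a PAGESIZE (p_szc == 0) page
 * and is still SE_EXCL locked.  The helper name is hypothetical.
 */
#ifdef notdef
static int
example_demote_before_free(page_t *pp)
{
        ASSERT(PAGE_EXCL(pp));

        if (!page_try_demote_pages(pp))
                return (0);     /* constituents busy, or a kernel page */

        ASSERT(pp->p_szc == 0);
        ASSERT(PAGE_EXCL(pp));
        return (1);
}
#endif  /* notdef */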
5287 
5288 /*
 * Called by page_free() and page_destroy() to demote the page size code
 * (p_szc) to 0 (since we can't just put a single PAGESIZE page with a
 * non-zero p_szc on the free list, nor can we just clear p_szc of a single
 * page_t within a large page, since that would break other code that relies
 * on p_szc being the same for all page_t's of a large page).  Anonymous pages
 * should never end up here because anon_map_getpages() cannot deal with p_szc
 * changes after a single constituent page is locked.  While anonymous and
 * kernel large pages are demoted or freed an entire large page at a time,
 * with all constituent pages locked EXCL, for file system pages we have to be
 * able to demote a large page (i.e. decrease the p_szc of all constituent
 * pages) with just an EXCL lock on one of the constituent pages.  The reason
 * anonymous page demotion can easily be done an entire large page at a time
 * is that those operations originate at the address space level and concern
 * the entire large page region, with the actual demotion only done when the
 * pages are not shared with any other processes (therefore we can always get
 * an EXCL lock on all anonymous constituent pages after clearing the segment
 * page cache).  However, file system pages can be truncated or invalidated at
 * a PAGESIZE level from the file system side and end up in page_free() or
 * page_destroy() (we also allow only part of a large page to be SOFTLOCKed,
 * and therefore pageout should be able to demote a large page by EXCL locking
 * any constituent page that is not under SOFTLOCK).  In those cases we cannot
 * rely on being able to lock all constituent pages EXCL.
5311  *
 * To prevent szc changes on file system pages one has to lock all constituent
 * pages at least SHARED (or call page_szc_lock()).  The only subsystem that
 * doesn't rely on locking all constituent pages (or on page_szc_lock()) to
 * prevent szc changes is the hat layer, which uses its own page level mlist
 * locks.  The hat assumes that szc doesn't change after the mlist lock for a
 * page is taken.  Therefore we need to change szc under hat level locks if we
 * only have an EXCL lock on a single constituent page and the hat still
 * references any of the constituent pages.  (Note that we can't "ignore" the
 * hat layer by simply calling hat_pageunload() on all constituent pages
 * without holding EXCL locks on all of them.)  We use hat_page_demote() to
 * safely demote the szc of all constituent pages under hat locks when we only
 * have an EXCL lock on one of the constituent pages.
5324  *
5325  * This routine calls page_szc_lock() before calling hat_page_demote() to
5326  * allow segvn in one special case not to lock all constituent pages SHARED
5327  * before calling hat_memload_array() that relies on p_szc not changing even
5328  * before hat level mlist lock is taken.  In that case segvn uses
5329  * page_szc_lock() to prevent hat_page_demote() changing p_szc values.
5330  *
5331  * Anonymous or kernel page demotion still has to lock all pages exclusively
5332  * and do hat_pageunload() on all constituent pages before demoting the page
5333  * therefore there's no need for anonymous or kernel page demotion to use
5334  * hat_page_demote() mechanism.
5335  *
 * hat_page_demote() removes all large mappings that map pp and then decreases
 * p_szc starting from the last constituent page of the large page.  Working
 * from the tail of a large page in decreasing pfn order allows anyone looking
 * at the root page to know that hat_page_demote() is done for the root's szc
 * area.  E.g. if a root page has szc 1, one knows one only has to lock all
 * constituent pages within the szc 1 area to prevent szc changes, because a
 * hat_page_demote() that started on this page when it had szc > 1 is done for
 * this szc 1 area.
5343  *
 * We are guaranteed that all constituent pages of pp's large page belong to
 * the same vnode, with consecutive offsets increasing in the direction of
 * increasing pfn, i.e. the identity of the constituent pages can't change
 * until their p_szc is decreased.  Therefore it's safe for hat_page_demote()
 * to remove large mappings to pp even though we don't lock any constituent
 * page except pp (i.e. we won't unload e.g. a kernel locked page).
5350  */
5351 static void
5352 page_demote_vp_pages(page_t *pp)
5353 {
5354         kmutex_t *mtx;
5355 
5356         ASSERT(PAGE_EXCL(pp));
5357         ASSERT(!PP_ISFREE(pp));
5358         ASSERT(pp->p_vnode != NULL);
5359         ASSERT(!IS_SWAPFSVP(pp->p_vnode));
5360         ASSERT(!PP_ISKAS(pp));
5361 
5362         VM_STAT_ADD(pagecnt.pc_demote_pages[0]);
5363 
5364         mtx = page_szc_lock(pp);
5365         if (mtx != NULL) {
5366                 hat_page_demote(pp);
5367                 mutex_exit(mtx);
5368         }
5369         ASSERT(pp->p_szc == 0);
5370 }
5371 
5372 /*
5373  * Mark any existing pages for migration in the given range
5374  */
5375 void
5376 page_mark_migrate(struct seg *seg, caddr_t addr, size_t len,
5377     struct anon_map *amp, ulong_t anon_index, vnode_t *vp,
5378     u_offset_t vnoff, int rflag)
5379 {
5380         struct anon     *ap;
5381         vnode_t         *curvp;
5382         lgrp_t          *from;
5383         pgcnt_t         nlocked;
5384         u_offset_t      off;
5385         pfn_t           pfn;
5386         size_t          pgsz;
5387         size_t          segpgsz;
5388         pgcnt_t         pages;
5389         uint_t          pszc;
5390         page_t          *pp0, *pp;
5391         caddr_t         va;
5392         ulong_t         an_idx;
5393         anon_sync_obj_t cookie;
5394 
5395         ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
5396 
5397         /*
         * Don't do anything if lgroup optimizations aren't needed
         * on this system.
5400          */
5401         if (!lgrp_optimizations())
5402                 return;
5403 
5404         /*
5405          * Align address and length to (potentially large) page boundary
5406          */
5407         segpgsz = page_get_pagesize(seg->s_szc);
5408         addr = (caddr_t)P2ALIGN((uintptr_t)addr, segpgsz);
5409         if (rflag)
5410                 len = P2ROUNDUP(len, segpgsz);
5411 
5412         /*
5413          * Do one (large) page at a time
5414          */
5415         va = addr;
5416         while (va < addr + len) {
5417                 /*
                 * Look up the (root) page for the vnode and offset
                 * corresponding to this virtual address.
                 * Try the anon map first since there may be copy-on-write
                 * pages, but initialize the vnode pointer and offset using
                 * the vnode arguments just in case there isn't an amp.
5423                  */
5424                 curvp = vp;
5425                 off = vnoff + va - seg->s_base;
5426                 if (amp) {
5427                         ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
5428                         an_idx = anon_index + seg_page(seg, va);
5429                         anon_array_enter(amp, an_idx, &cookie);
5430                         ap = anon_get_ptr(amp->ahp, an_idx);
5431                         if (ap)
5432                                 swap_xlate(ap, &curvp, &off);
5433                         anon_array_exit(&cookie);
5434                         ANON_LOCK_EXIT(&amp->a_rwlock);
5435                 }
5436 
5437                 pp = NULL;
5438                 if (curvp)
5439                         pp = page_lookup(curvp, off, SE_SHARED);
5440 
5441                 /*
5442                  * If there isn't a page at this virtual address,
5443                  * skip to next page
5444                  */
5445                 if (pp == NULL) {
5446                         va += PAGESIZE;
5447                         continue;
5448                 }
5449 
5450                 /*
5451                  * Figure out which lgroup this page is in for kstats
5452                  */
5453                 pfn = page_pptonum(pp);
5454                 from = lgrp_pfn_to_lgrp(pfn);
5455 
5456                 /*
                 * Get the page size, and round up and skip to the next page
                 * boundary if the address is unaligned.
5459                  */
5460                 pszc = pp->p_szc;
5461                 pgsz = page_get_pagesize(pszc);
5462                 pages = btop(pgsz);
5463                 if (!IS_P2ALIGNED(va, pgsz) ||
5464                     !IS_P2ALIGNED(pfn, pages) ||
5465                     pgsz > segpgsz) {
5466                         pgsz = MIN(pgsz, segpgsz);
5467                         page_unlock(pp);
5468                         pages = btop(P2END((uintptr_t)va, pgsz) -
5469                             (uintptr_t)va);
5470                         va = (caddr_t)P2END((uintptr_t)va, pgsz);
5471                         lgrp_stat_add(from->lgrp_id, LGRP_PMM_FAIL_PGS, pages);
5472                         continue;
5473                 }
5474 
5475                 /*
5476                  * Upgrade to exclusive lock on page
5477                  */
5478                 if (!page_tryupgrade(pp)) {
5479                         page_unlock(pp);
5480                         va += pgsz;
5481                         lgrp_stat_add(from->lgrp_id, LGRP_PMM_FAIL_PGS,
5482                             btop(pgsz));
5483                         continue;
5484                 }
5485 
5486                 pp0 = pp++;
5487                 nlocked = 1;
5488 
5489                 /*
                 * Lock the constituent pages if this is a large page.
5491                  */
5492                 if (pages > 1) {
5493                         /*
5494                          * Lock all constituents except root page, since it
5495                          * should be locked already.
5496                          */
5497                         for (; nlocked < pages; nlocked++) {
5498                                 if (!page_trylock(pp, SE_EXCL)) {
5499                                         break;
5500                                 }
5501                                 if (PP_ISFREE(pp) ||
5502                                     pp->p_szc != pszc) {
5503                                         /*
5504                                          * hat_page_demote() raced in with us.
5505                                          */
5506                                         ASSERT(!IS_SWAPFSVP(curvp));
5507                                         page_unlock(pp);
5508                                         break;
5509                                 }
5510                                 pp++;
5511                         }
5512                 }
5513 
5514                 /*
5515                  * If all constituent pages couldn't be locked,
5516                  * unlock pages locked so far and skip to next page.
5517                  */
5518                 if (nlocked < pages) {
5519                         while (pp0 < pp) {
5520                                 page_unlock(pp0++);
5521                         }
5522                         va += pgsz;
5523                         lgrp_stat_add(from->lgrp_id, LGRP_PMM_FAIL_PGS,
5524                             btop(pgsz));
5525                         continue;
5526                 }
5527 
5528                 /*
                 * hat_page_demote() can no longer happen
                 * since the last constituent page had the right p_szc after
                 * all constituent pages were locked.  All constituent pages
                 * should now have the same p_szc.
5533                  */
5534 
5535                 /*
5536                  * All constituent pages locked successfully, so mark
5537                  * large page for migration and unload the mappings of
5538                  * constituent pages, so a fault will occur on any part of the
5539                  * large page
5540                  */
5541                 PP_SETMIGRATE(pp0);
5542                 while (pp0 < pp) {
5543                         (void) hat_pageunload(pp0, HAT_FORCE_PGUNLOAD);
5544                         ASSERT(hat_page_getshare(pp0) == 0);
5545                         page_unlock(pp0++);
5546                 }
5547                 lgrp_stat_add(from->lgrp_id, LGRP_PMM_PGS, nlocked);
5548 
5549                 va += pgsz;
5550         }
5551 }
5552 
5553 /*
5554  * Migrate any pages that have been marked for migration in the given range
5555  */
5556 void
5557 page_migrate(
5558         struct seg      *seg,
5559         caddr_t         addr,
5560         page_t          **ppa,
5561         pgcnt_t         npages)
5562 {
5563         lgrp_t          *from;
5564         lgrp_t          *to;
5565         page_t          *newpp;
5566         page_t          *pp;
5567         pfn_t           pfn;
5568         size_t          pgsz;
5569         spgcnt_t        page_cnt;
5570         spgcnt_t        i;
5571         uint_t          pszc;
5572 
5573         ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
5574 
5575         while (npages > 0) {
5576                 pp = *ppa;
5577                 pszc = pp->p_szc;
5578                 pgsz = page_get_pagesize(pszc);
5579                 page_cnt = btop(pgsz);
5580 
5581                 /*
5582                  * Check to see whether this page is marked for migration
5583                  *
                 * Assume that the root page of a large page is marked for
5585                  * migration and none of the other constituent pages
5586                  * are marked.  This really simplifies clearing the
5587                  * migrate bit by not having to clear it from each
5588                  * constituent page.
5589                  *
                 * Note that we don't want to relocate an entire large page if
5591                  * someone is only using one subpage.
5592                  */
5593                 if (npages < page_cnt)
5594                         break;
5595 
5596                 /*
5597                  * Is it marked for migration?
5598                  */
5599                 if (!PP_ISMIGRATE(pp))
5600                         goto next;
5601 
5602                 /*
5603                  * Determine lgroups that page is being migrated between
5604                  */
5605                 pfn = page_pptonum(pp);
5606                 if (!IS_P2ALIGNED(pfn, page_cnt)) {
5607                         break;
5608                 }
5609                 from = lgrp_pfn_to_lgrp(pfn);
5610                 to = lgrp_mem_choose(seg, addr, pgsz);
5611 
5612                 /*
                 * Need to get exclusive locks to migrate.
5614                  */
5615                 for (i = 0; i < page_cnt; i++) {
5616                         ASSERT(PAGE_LOCKED(ppa[i]));
5617                         if (page_pptonum(ppa[i]) != pfn + i ||
5618                             ppa[i]->p_szc != pszc) {
5619                                 break;
5620                         }
5621                         if (!page_tryupgrade(ppa[i])) {
5622                                 lgrp_stat_add(from->lgrp_id,
5623                                     LGRP_PM_FAIL_LOCK_PGS,
5624                                     page_cnt);
5625                                 break;
5626                         }
5627 
5628                         /*
5629                          * Check to see whether we are trying to migrate
5630                          * page to lgroup where it is allocated already.
5631                          * If so, clear the migrate bit and skip to next
5632                          * page.
5633                          */
5634                         if (i == 0 && to == from) {
5635                                 PP_CLRMIGRATE(ppa[0]);
5636                                 page_downgrade(ppa[0]);
5637                                 goto next;
5638                         }
5639                 }
5640 
5641                 /*
5642                  * If all constituent pages couldn't be locked,
5643                  * unlock pages locked so far and skip to next page.
5644                  */
5645                 if (i != page_cnt) {
5646                         while (--i != -1) {
5647                                 page_downgrade(ppa[i]);
5648                         }
5649                         goto next;
5650                 }
5651 
5652                 (void) page_create_wait(page_cnt, PG_WAIT);
5653                 newpp = page_get_replacement_page(pp, to, PGR_SAMESZC);
5654                 if (newpp == NULL) {
5655                         page_create_putback(page_cnt);
5656                         for (i = 0; i < page_cnt; i++) {
5657                                 page_downgrade(ppa[i]);
5658                         }
5659                         lgrp_stat_add(to->lgrp_id, LGRP_PM_FAIL_ALLOC_PGS,
5660                             page_cnt);
5661                         goto next;
5662                 }
5663                 ASSERT(newpp->p_szc == pszc);
5664                 /*
5665                  * Clear migrate bit and relocate page
5666                  */
5667                 PP_CLRMIGRATE(pp);
5668                 if (page_relocate(&pp, &newpp, 0, 1, &page_cnt, to)) {
5669                         panic("page_migrate: page_relocate failed");
5670                 }
5671                 ASSERT(page_cnt * PAGESIZE == pgsz);
5672 
5673                 /*
5674                  * Keep stats for number of pages migrated from and to
5675                  * each lgroup
5676                  */
5677                 lgrp_stat_add(from->lgrp_id, LGRP_PM_SRC_PGS, page_cnt);
5678                 lgrp_stat_add(to->lgrp_id, LGRP_PM_DEST_PGS, page_cnt);
5679                 /*
5680                  * update the page_t array we were passed in and
5681                  * unlink constituent pages of a large page.
5682                  */
5683                 for (i = 0; i < page_cnt; ++i, ++pp) {
5684                         ASSERT(PAGE_EXCL(newpp));
5685                         ASSERT(newpp->p_szc == pszc);
5686                         ppa[i] = newpp;
5687                         pp = newpp;
5688                         page_sub(&newpp, pp);
5689                         page_downgrade(pp);
5690                 }
5691                 ASSERT(newpp == NULL);
5692 next:
5693                 addr += pgsz;
5694                 ppa += page_cnt;
5695                 npages -= page_cnt;
5696         }
5697 }
5698 
5699 #define MAX_CNT 60      /* max num of iterations */
5700 /*
5701  * Reclaim/reserve availrmem for npages.
5702  * If there is not enough memory start reaping seg, kmem caches.
5703  * Start pageout scanner (via page_needfree()).
 * Exit after ~MAX_CNT seconds regardless of how much memory has been
 * released.
 * Note: There is no guarantee that any availrmem will be freed, as
 * this memory typically is locked (kernel heap) or reserved for swap.
 * Also, due to memory fragmentation the kmem allocator may not be able
 * to free any memory (a single user-allocated buffer will prevent a
 * slab or a page from being freed).
5710  */
5711 int
5712 page_reclaim_mem(pgcnt_t npages, pgcnt_t epages, int adjust)
5713 {
5714         int     i = 0;
5715         int     ret = 0;
5716         pgcnt_t deficit;
5717         pgcnt_t old_availrmem;
5718 
5719         mutex_enter(&freemem_lock);
5720         old_availrmem = availrmem - 1;
5721         while ((availrmem < tune.t_minarmem + npages + epages) &&
5722             (old_availrmem < availrmem) && (i++ < MAX_CNT)) {
5723                 old_availrmem = availrmem;
5724                 deficit = tune.t_minarmem + npages + epages - availrmem;
5725                 mutex_exit(&freemem_lock);
5726                 page_needfree(deficit);
5727                 kmem_reap();
5728                 delay(hz);
5729                 page_needfree(-(spgcnt_t)deficit);
5730                 mutex_enter(&freemem_lock);
5731         }
5732 
5733         if (adjust && (availrmem >= tune.t_minarmem + npages + epages)) {
5734                 availrmem -= npages;
5735                 ret = 1;
5736         }
5737 
5738         mutex_exit(&freemem_lock);
5739 
5740         return (ret);
5741 }
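
/*
 * Illustrative sketch, not part of the original source (hence the "notdef"
 * guard): how a subsystem might reserve availrmem before locking memory
 * down, using page_reclaim_mem() with 'adjust' set and no extra headroom
 * (epages == 0).  The helper name, and returning the reservation by adding
 * back to availrmem under freemem_lock, are assumptions shown for context.
 */
#ifdef notdef
static int
example_reserve_locked_memory(pgcnt_t npages)
{
        /* Reap if needed and, on success, debit availrmem by npages. */
        if (!page_reclaim_mem(npages, 0, 1))
                return (ENOMEM);

        /* ... lock the memory down and use it ... */

        /* Return the reservation when done. */
        mutex_enter(&freemem_lock);
        availrmem += npages;
        mutex_exit(&freemem_lock);
        return (0);
}
#endif  /* notdef */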
5742 
5743 /*
5744  * Search the memory segments to locate the desired page.  Within a
5745  * segment, pages increase linearly with one page structure per
5746  * physical page frame (size PAGESIZE).  The search begins
5747  * with the segment that was accessed last, to take advantage of locality.
 * If the hint misses, we start from the beginning of the sorted memseg list.
5749  */
5750 
5751 
5752 /*
5753  * Some data structures for pfn to pp lookup.
5754  */
5755 ulong_t mhash_per_slot;
5756 struct memseg *memseg_hash[N_MEM_SLOTS];
5757 
5758 page_t *
5759 page_numtopp_nolock(pfn_t pfnum)
5760 {
5761         struct memseg *seg;
5762         page_t *pp;
5763         vm_cpu_data_t *vc;
5764 
5765         /*
5766          * We need to disable kernel preemption while referencing the
5767          * cpu_vm_data field in order to prevent us from being switched to
5768          * another cpu and trying to reference it after it has been freed.
5769          * This will keep us on cpu and prevent it from being removed while
5770          * we are still on it.
5771          *
5772          * We may be caching a memseg in vc_pnum_memseg/vc_pnext_memseg
         * which is being reused by DR, which will flush those references
5774          * before modifying the reused memseg.  See memseg_cpu_vm_flush().
5775          */
5776         kpreempt_disable();
5777         vc = CPU->cpu_vm_data;
5778         ASSERT(vc != NULL);
5779 
5780         MEMSEG_STAT_INCR(nsearch);
5781 
5782         /* Try last winner first */
5783         if (((seg = vc->vc_pnum_memseg) != NULL) &&
5784             (pfnum >= seg->pages_base) && (pfnum < seg->pages_end)) {
5785                 MEMSEG_STAT_INCR(nlastwon);
5786                 pp = seg->pages + (pfnum - seg->pages_base);
5787                 if (pp->p_pagenum == pfnum) {
5788                         kpreempt_enable();
5789                         return ((page_t *)pp);
5790                 }
5791         }
5792 
5793         /* Else Try hash */
5794         if (((seg = memseg_hash[MEMSEG_PFN_HASH(pfnum)]) != NULL) &&
5795             (pfnum >= seg->pages_base) && (pfnum < seg->pages_end)) {
5796                 MEMSEG_STAT_INCR(nhashwon);
5797                 vc->vc_pnum_memseg = seg;
5798                 pp = seg->pages + (pfnum - seg->pages_base);
5799                 if (pp->p_pagenum == pfnum) {
5800                         kpreempt_enable();
5801                         return ((page_t *)pp);
5802                 }
5803         }
5804 
5805         /* Else Brute force */
5806         for (seg = memsegs; seg != NULL; seg = seg->next) {
5807                 if (pfnum >= seg->pages_base && pfnum < seg->pages_end) {
5808                         vc->vc_pnum_memseg = seg;
5809                         pp = seg->pages + (pfnum - seg->pages_base);
5810                         if (pp->p_pagenum == pfnum) {
5811                                 kpreempt_enable();
5812                                 return ((page_t *)pp);
5813                         }
5814                 }
5815         }
5816         vc->vc_pnum_memseg = NULL;
5817         kpreempt_enable();
5818         MEMSEG_STAT_INCR(nnotfound);
        return ((page_t *)NULL);
5821 }
5822 
5823 struct memseg *
5824 page_numtomemseg_nolock(pfn_t pfnum)
5825 {
5826         struct memseg *seg;
5827         page_t *pp;
5828 
5829         /*
5830          * We may be caching a memseg in vc_pnum_memseg/vc_pnext_memseg
 * which is being reused by DR, which will flush those references
5832          * before modifying the reused memseg.  See memseg_cpu_vm_flush().
5833          */
5834         kpreempt_disable();
5835         /* Try hash */
5836         if (((seg = memseg_hash[MEMSEG_PFN_HASH(pfnum)]) != NULL) &&
5837             (pfnum >= seg->pages_base) && (pfnum < seg->pages_end)) {
5838                 pp = seg->pages + (pfnum - seg->pages_base);
5839                 if (pp->p_pagenum == pfnum) {
5840                         kpreempt_enable();
5841                         return (seg);
5842                 }
5843         }
5844 
5845         /* Else Brute force */
5846         for (seg = memsegs; seg != NULL; seg = seg->next) {
5847                 if (pfnum >= seg->pages_base && pfnum < seg->pages_end) {
5848                         pp = seg->pages + (pfnum - seg->pages_base);
5849                         if (pp->p_pagenum == pfnum) {
5850                                 kpreempt_enable();
5851                                 return (seg);
5852                         }
5853                 }
5854         }
5855         kpreempt_enable();
5856         return ((struct memseg *)NULL);
5857 }
5858 
5859 /*
 * Given a page and a count, return the page struct that is
 * n structs away from the current one in the global page
 * list.
5863  *
5864  * This function wraps to the first page upon
5865  * reaching the end of the memseg list.
5866  */
5867 page_t *
5868 page_nextn(page_t *pp, ulong_t n)
5869 {
5870         struct memseg *seg;
5871         page_t *ppn;
5872         vm_cpu_data_t *vc;
5873 
5874         /*
5875          * We need to disable kernel preemption while referencing the
5876          * cpu_vm_data field in order to prevent us from being switched to
5877          * another cpu and trying to reference it after it has been freed.
5878          * This will keep us on cpu and prevent it from being removed while
5879          * we are still on it.
5880          *
5881          * We may be caching a memseg in vc_pnum_memseg/vc_pnext_memseg
5882          * which is being resued by DR who will flush those references
5883          * before modifying the reused memseg.  See memseg_cpu_vm_flush().
5884          */
5885         kpreempt_disable();
5886         vc = (vm_cpu_data_t *)CPU->cpu_vm_data;
5887 
5888         ASSERT(vc != NULL);
5889 
5890         if (((seg = vc->vc_pnext_memseg) == NULL) ||
5891             (seg->pages_base == seg->pages_end) ||
5892             !(pp >= seg->pages && pp < seg->epages)) {
5893 
5894                 for (seg = memsegs; seg; seg = seg->next) {
5895                         if (pp >= seg->pages && pp < seg->epages)
5896                                 break;
5897                 }
5898 
5899                 if (seg == NULL) {
5900                         /* Memory delete got in, return something valid. */
5901                         /* TODO: fix me. */
5902                         seg = memsegs;
5903                         pp = seg->pages;
5904                 }
5905         }
5906 
5907         /* check for wraparound - possible if n is large */
5908         while ((ppn = (pp + n)) >= seg->epages || ppn < pp) {
5909                 n -= seg->epages - pp;
5910                 seg = seg->next;
5911                 if (seg == NULL)
5912                         seg = memsegs;
5913                 pp = seg->pages;
5914         }
5915         vc->vc_pnext_memseg = seg;
5916         kpreempt_enable();
5917         return (ppn);
5918 }
5919 
5920 /*
5921  * Initialize for a loop using page_next_scan_large().
5922  */
5923 page_t *
5924 page_next_scan_init(void **cookie)
5925 {
5926         ASSERT(cookie != NULL);
5927         *cookie = (void *)memsegs;
5928         return ((page_t *)memsegs->pages);
5929 }
5930 
5931 /*
5932  * Return the next page in a scan of page_t's, assuming we want
5933  * to skip over sub-pages within larger page sizes.
5934  *
5935  * The cookie is used to keep track of the current memseg.
5936  */
5937 page_t *
5938 page_next_scan_large(
5939         page_t          *pp,
5940         ulong_t         *n,
5941         void            **cookie)
5942 {
5943         struct memseg   *seg = (struct memseg *)*cookie;
5944         page_t          *new_pp;
5945         ulong_t         cnt;
        pfn_t           pfn;

        /*
         * Get the count of page_t's to skip based on the page size.
         */
5952         ASSERT(pp != NULL);
5953         if (pp->p_szc == 0) {
5954                 cnt = 1;
5955         } else {
5956                 pfn = page_pptonum(pp);
5957                 cnt = page_get_pagecnt(pp->p_szc);
5958                 cnt -= pfn & (cnt - 1);
5959         }
5960         *n += cnt;
5961         new_pp = pp + cnt;
5962 
5963         /*
5964          * Catch if we went past the end of the current memory segment. If so,
5965          * just move to the next segment with pages.
5966          */
5967         if (new_pp >= seg->epages || seg->pages_base == seg->pages_end) {
5968                 do {
5969                         seg = seg->next;
5970                         if (seg == NULL)
5971                                 seg = memsegs;
5972                 } while (seg->pages_base == seg->pages_end);
5973                 new_pp = seg->pages;
5974                 *cookie = (void *)seg;
5975         }
5976 
5977         return (new_pp);
5978 }
5979 
5980 
5981 /*
5982  * Returns next page in list. Note: this function wraps
5983  * to the first page in the list upon reaching the end
5984  * of the list. Callers should be aware of this fact.
5985  */
5986 
/* We should change this to be a #define */
5988 
5989 page_t *
5990 page_next(page_t *pp)
5991 {
5992         return (page_nextn(pp, 1));
5993 }
5994 
5995 page_t *
5996 page_first()
5997 {
5998         return ((page_t *)memsegs->pages);
5999 }
6000 
6001 
6002 /*
6003  * This routine is called at boot with the initial memory configuration
6004  * and when memory is added or removed.
6005  */
6006 void
6007 build_pfn_hash()
6008 {
6009         pfn_t cur;
6010         pgcnt_t index;
6011         struct memseg *pseg;
6012         int     i;
6013 
6014         /*
6015          * Clear memseg_hash array.
6016          * Since memory add/delete is designed to operate concurrently
6017          * with normal operation, the hash rebuild must be able to run
6018          * concurrently with page_numtopp_nolock(). To support this
6019          * functionality, assignments to memseg_hash array members must
6020          * be done atomically.
6021          *
6022          * NOTE: bzero() does not currently guarantee this for kernel
6023          * threads, and cannot be used here.
6024          */
6025         for (i = 0; i < N_MEM_SLOTS; i++)
6026                 memseg_hash[i] = NULL;
6027 
6028         hat_kpm_mseghash_clear(N_MEM_SLOTS);
6029 
6030         /*
6031          * Physmax is the last valid pfn.
6032          */
6033         mhash_per_slot = (physmax + 1) >> MEM_HASH_SHIFT;
6034         for (pseg = memsegs; pseg != NULL; pseg = pseg->next) {
6035                 index = MEMSEG_PFN_HASH(pseg->pages_base);
6036                 cur = pseg->pages_base;
6037                 do {
6038                         if (index >= N_MEM_SLOTS)
6039                                 index = MEMSEG_PFN_HASH(cur);
6040 
6041                         if (memseg_hash[index] == NULL ||
6042                             memseg_hash[index]->pages_base > pseg->pages_base) {
6043                                 memseg_hash[index] = pseg;
6044                                 hat_kpm_mseghash_update(index, pseg);
6045                         }
6046                         cur += mhash_per_slot;
6047                         index++;
6048                 } while (cur < pseg->pages_end);
6049         }
6050 }
6051 
6052 /*
6053  * Return the pagenum for the pp
6054  */
6055 pfn_t
6056 page_pptonum(page_t *pp)
6057 {
6058         return (pp->p_pagenum);
6059 }
6060 
6061 /*
 * Interface to the referenced, modified, etc. bits in the PSM
 * part of the page struct when no locking is desired.
6065  */
6066 void
6067 page_set_props(page_t *pp, uint_t flags)
6068 {
6069         ASSERT((flags & ~(P_MOD | P_REF | P_RO)) == 0);
6070         pp->p_nrm |= (uchar_t)flags;
6071 }
6072 
6073 void
6074 page_clr_all_props(page_t *pp)
6075 {
6076         pp->p_nrm = 0;
6077 }
6078 
6079 /*
6080  * Clear p_lckcnt and p_cowcnt, adjusting freemem if required.
6081  */
6082 int
6083 page_clear_lck_cow(page_t *pp, int adjust)
6084 {
6085         int     f_amount;
6086 
6087         ASSERT(PAGE_EXCL(pp));
6088 
6089         /*
6090          * The page_struct_lock need not be acquired here since
 * we require that the caller hold the page exclusively locked.
6092          */
6093         f_amount = 0;
6094         if (pp->p_lckcnt) {
6095                 f_amount = 1;
6096                 pp->p_lckcnt = 0;
6097         }
6098         if (pp->p_cowcnt) {
6099                 f_amount += pp->p_cowcnt;
6100                 pp->p_cowcnt = 0;
6101         }
6102 
6103         if (adjust && f_amount) {
6104                 mutex_enter(&freemem_lock);
6105                 availrmem += f_amount;
6106                 mutex_exit(&freemem_lock);
6107         }
6108 
6109         return (f_amount);
6110 }
6111 
6112 /*
 * The following function is called from free_vp_pages()
 * for an inexact estimate of a newly freed page...
6115  */
6116 ulong_t
6117 page_share_cnt(page_t *pp)
6118 {
6119         return (hat_page_getshare(pp));
6120 }
6121 
6122 int
6123 page_isshared(page_t *pp)
6124 {
6125         return (hat_page_checkshare(pp, 1));
6126 }
6127 
6128 int
6129 page_isfree(page_t *pp)
6130 {
6131         return (PP_ISFREE(pp));
6132 }
6133 
6134 int
6135 page_isref(page_t *pp)
6136 {
6137         return (hat_page_getattr(pp, P_REF));
6138 }
6139 
6140 int
6141 page_ismod(page_t *pp)
6142 {
6143         return (hat_page_getattr(pp, P_MOD));
6144 }
6145 
6146 /*
6147  * The following code all currently relates to the page capture logic:
6148  *
6149  * This logic is used for cases where there is a desire to claim a certain
6150  * physical page in the system for the caller.  As it may not be possible
6151  * to capture the page immediately, the p_toxic bits are used in the page
6152  * structure to indicate that someone wants to capture this page.  When the
6153  * page gets unlocked, the toxic flag will be noted and an attempt to capture
 * the page will be made.  If it is successful, the original caller's callback
 * will be called with the page so they can do with it what they please.
6156  *
 * There is also an async thread which wakes up occasionally to attempt to
 * capture pages which have the capture bit set.  All of the pages which
6159  * need to be captured asynchronously have been inserted into the
6160  * page_capture_hash and thus this thread walks that hash list.  Items in the
6161  * hash have an expiration time so this thread handles that as well by removing
6162  * the item from the hash if it has expired.
6163  *
6164  * Some important things to note are:
6165  * - if the PR_CAPTURE bit is set on a page, then the page is in the
 *   page_capture_hash.  The page_capture_hash_head.pchh_mutex is needed
 *   to set and clear this bit, and entries can be added to or removed
 *   from the hash only while that lock is held.
6169  * - the PR_CAPTURE bit can only be set and cleared while holding the
6170  *   page_capture_hash_head.pchh_mutex
6171  * - the t_flag field of the thread struct is used with the T_CAPTURING
6172  *   flag to prevent recursion while dealing with large pages.
6173  * - pages which need to be retired never expire on the page_capture_hash.
6174  */
6175 
6176 static void page_capture_thread(void);
6177 static kthread_t *pc_thread_id;
6178 kcondvar_t pc_cv;
6179 static kmutex_t pc_thread_mutex;
6180 static clock_t pc_thread_shortwait;
6181 static clock_t pc_thread_longwait;
6182 static int pc_thread_retry;
6183 
6184 struct page_capture_callback pc_cb[PC_NUM_CALLBACKS];
6185 
6186 /* Note that this is a circular linked list */
6187 typedef struct page_capture_hash_bucket {
6188         page_t *pp;
6189         uchar_t szc;
6190         uchar_t pri;
6191         uint_t flags;
6192         clock_t expires;        /* lbolt at which this request expires. */
6193         void *datap;            /* Cached data passed in for callback */
6194         struct page_capture_hash_bucket *next;
6195         struct page_capture_hash_bucket *prev;
6196 } page_capture_hash_bucket_t;
6197 
6198 #define PC_PRI_HI       0       /* capture now */
6199 #define PC_PRI_LO       1       /* capture later */
6200 #define PC_NUM_PRI      2
6201 
6202 #define PAGE_CAPTURE_PRIO(pp) (PP_ISRAF(pp) ? PC_PRI_LO : PC_PRI_HI)
6203 
6204 
6205 /*
 * Each hash bucket will have its own mutex and two lists which are:
6207  * active (0):  represents requests which have not been processed by
6208  *              the page_capture async thread yet.
6209  * walked (1):  represents requests which have been processed by the
 *              page_capture async thread within its given walk of this bucket.
6211  *
6212  * These are all needed so that we can synchronize all async page_capture
6213  * events.  When the async thread moves to a new bucket, it will append the
6214  * walked list to the active list and walk each item one at a time, moving it
6215  * from the active list to the walked list.  Thus if there is an async request
6216  * outstanding for a given page, it will always be in one of the two lists.
6217  * New requests will always be added to the active list.
 * If we are not able to capture a page before the request expires, we free
 * up the request structure, which indicates to page_capture that there is
 * no longer a need for the given page, and clear the PR_CAPTURE flag if
 * possible.
6222  */
6223 typedef struct page_capture_hash_head {
6224         kmutex_t pchh_mutex;
6225         uint_t num_pages[PC_NUM_PRI];
6226         page_capture_hash_bucket_t lists[2]; /* sentinel nodes */
6227 } page_capture_hash_head_t;
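
/*
 * Illustrative layout (sketch only): the two sentinel nodes in lists[]
 * anchor circular, doubly linked lists of outstanding requests:
 *
 *    lists[0] (active) <-> req <-> req <-> ... <-> back to lists[0]
 *    lists[1] (walked) <-> req <-> ...         <-> back to lists[1]
 *
 * New requests are inserted at the head of the active list.  The async
 * thread moves each request it examines onto the walked list, and at the
 * start of its next pass over the bucket appends the walked list back
 * onto the active list.
 */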
6228 
6229 #ifdef DEBUG
6230 #define NUM_PAGE_CAPTURE_BUCKETS 4
6231 #else
6232 #define NUM_PAGE_CAPTURE_BUCKETS 64
6233 #endif
6234 
6235 page_capture_hash_head_t page_capture_hash[NUM_PAGE_CAPTURE_BUCKETS];
6236 
6237 /* for now use a very simple hash based upon the size of a page struct */
6238 #define PAGE_CAPTURE_HASH(pp)   \
6239         ((int)(((uintptr_t)pp >> 7) & (NUM_PAGE_CAPTURE_BUCKETS - 1)))
6240 
6241 extern pgcnt_t swapfs_minfree;
6242 
6243 int page_trycapture(page_t *pp, uint_t szc, uint_t flags, void *datap);
6244 
6245 /*
 * A callback function is required for page capture requests.
6247  */
6248 void
6249 page_capture_register_callback(uint_t index, clock_t duration,
6250     int (*cb_func)(page_t *, void *, uint_t))
6251 {
6252         ASSERT(pc_cb[index].cb_active == 0);
6253         ASSERT(cb_func != NULL);
6254         rw_enter(&pc_cb[index].cb_rwlock, RW_WRITER);
6255         pc_cb[index].duration = duration;
6256         pc_cb[index].cb_func = cb_func;
6257         pc_cb[index].cb_active = 1;
6258         rw_exit(&pc_cb[index].cb_rwlock);
6259 }
6260 
6261 void
6262 page_capture_unregister_callback(uint_t index)
6263 {
6264         int i, j;
6265         struct page_capture_hash_bucket *bp1;
6266         struct page_capture_hash_bucket *bp2;
6267         struct page_capture_hash_bucket *head = NULL;
6268         uint_t flags = (1 << index);
6269 
6270         rw_enter(&pc_cb[index].cb_rwlock, RW_WRITER);
6271         ASSERT(pc_cb[index].cb_active == 1);
6272         pc_cb[index].duration = 0;      /* Paranoia */
6273         pc_cb[index].cb_func = NULL;    /* Paranoia */
6274         pc_cb[index].cb_active = 0;
6275         rw_exit(&pc_cb[index].cb_rwlock);
6276 
6277         /*
6278          * Just move all the entries to a private list which we can walk
6279          * through without the need to hold any locks.
6280          * No more requests can get added to the hash lists for this consumer
6281          * as the cb_active field for the callback has been cleared.
6282          */
6283         for (i = 0; i < NUM_PAGE_CAPTURE_BUCKETS; i++) {
6284                 mutex_enter(&page_capture_hash[i].pchh_mutex);
6285                 for (j = 0; j < 2; j++) {
6286                         bp1 = page_capture_hash[i].lists[j].next;
6287                         /* walk through all but first (sentinel) element */
6288                         while (bp1 != &page_capture_hash[i].lists[j]) {
6289                                 bp2 = bp1;
6290                                 if (bp2->flags & flags) {
6291                                         bp1 = bp2->next;
6292                                         bp1->prev = bp2->prev;
6293                                         bp2->prev->next = bp1;
6294                                         bp2->next = head;
6295                                         head = bp2;
6296                                         /*
6297                                          * Clear the PR_CAPTURE bit as we
6298                                          * hold appropriate locks here.
6299                                          */
6300                                         page_clrtoxic(head->pp, PR_CAPTURE);
6301                                         page_capture_hash[i].
6302                                             num_pages[bp2->pri]--;
6303                                         continue;
6304                                 }
6305                                 bp1 = bp1->next;
6306                         }
6307                 }
6308                 mutex_exit(&page_capture_hash[i].pchh_mutex);
6309         }
6310 
6311         while (head != NULL) {
6312                 bp1 = head;
6313                 head = head->next;
6314                 kmem_free(bp1, sizeof (*bp1));
6315         }
6316 }
6317 
6318 
6319 /*
6320  * Find pp in the active list and move it to the walked list if it
6321  * exists.
 * Note that most often pp should be at the front of the active list
 * as it is currently in use, so no other optimization is done here
 * on this linked list data structure.
6325  * Returns 1 on successful move or 0 if page could not be found.
6326  */
6327 static int
6328 page_capture_move_to_walked(page_t *pp)
6329 {
6330         page_capture_hash_bucket_t *bp;
6331         int index;
6332 
6333         index = PAGE_CAPTURE_HASH(pp);
6334 
6335         mutex_enter(&page_capture_hash[index].pchh_mutex);
6336         bp = page_capture_hash[index].lists[0].next;
6337         while (bp != &page_capture_hash[index].lists[0]) {
6338                 if (bp->pp == pp) {
6339                         /* Remove from old list */
6340                         bp->next->prev = bp->prev;
6341                         bp->prev->next = bp->next;
6342 
6343                         /* Add to new list */
6344                         bp->next = page_capture_hash[index].lists[1].next;
6345                         bp->prev = &page_capture_hash[index].lists[1];
6346                         page_capture_hash[index].lists[1].next = bp;
6347                         bp->next->prev = bp;
6348 
6349                         /*
6350                          * There is a small probability of page on a free
6351                          * list being retired while being allocated
6352                          * and before P_RAF is set on it. The page may
6353                          * end up marked as high priority request instead
6354                          * of low priority request.
6355                          * If P_RAF page is not marked as low priority request
6356                          * change it to low priority request.
6357                          */
6358                         page_capture_hash[index].num_pages[bp->pri]--;
6359                         bp->pri = PAGE_CAPTURE_PRIO(pp);
6360                         page_capture_hash[index].num_pages[bp->pri]++;
6361                         mutex_exit(&page_capture_hash[index].pchh_mutex);
6362                         return (1);
6363                 }
6364                 bp = bp->next;
6365         }
6366         mutex_exit(&page_capture_hash[index].pchh_mutex);
6367         return (0);
6368 }
6369 
6370 /*
6371  * Add a new entry to the page capture hash.  The only case where a new
6372  * entry is not added is when the page capture consumer is no longer registered.
6373  * In this case, we'll silently not add the page to the hash.  We know that
6374  * page retire will always be registered for the case where we are currently
6375  * unretiring a page and thus there are no conflicts.
6376  */
6377 static void
6378 page_capture_add_hash(page_t *pp, uint_t szc, uint_t flags, void *datap)
6379 {
6380         page_capture_hash_bucket_t *bp1;
6381         page_capture_hash_bucket_t *bp2;
6382         int index;
6383         int cb_index;
6384         int i;
6385         uchar_t pri;
6386 #ifdef DEBUG
6387         page_capture_hash_bucket_t *tp1;
6388         int l;
6389 #endif
6390 
6391         ASSERT(!(flags & CAPTURE_ASYNC));
6392 
6393         bp1 = kmem_alloc(sizeof (struct page_capture_hash_bucket), KM_SLEEP);
6394 
6395         bp1->pp = pp;
6396         bp1->szc = szc;
6397         bp1->flags = flags;
6398         bp1->datap = datap;
6399 
6400         for (cb_index = 0; cb_index < PC_NUM_CALLBACKS; cb_index++) {
6401                 if ((flags >> cb_index) & 1) {
6402                         break;
6403                 }
6404         }
6405 
6406         ASSERT(cb_index != PC_NUM_CALLBACKS);
6407 
6408         rw_enter(&pc_cb[cb_index].cb_rwlock, RW_READER);
6409         if (pc_cb[cb_index].cb_active) {
6410                 if (pc_cb[cb_index].duration == -1) {
6411                         bp1->expires = (clock_t)-1;
6412                 } else {
6413                         bp1->expires = ddi_get_lbolt() +
6414                             pc_cb[cb_index].duration;
6415                 }
6416         } else {
6417                 /* There's no callback registered so don't add to the hash */
6418                 rw_exit(&pc_cb[cb_index].cb_rwlock);
6419                 kmem_free(bp1, sizeof (*bp1));
6420                 return;
6421         }
6422 
6423         index = PAGE_CAPTURE_HASH(pp);
6424 
6425         /*
6426          * Only allow capture flag to be modified under this mutex.
         * Prevents multiple entries for the same page from being added.
6428          */
6429         mutex_enter(&page_capture_hash[index].pchh_mutex);
6430 
6431         /*
         * If not already on the hash, set the capture bit and add to the hash.
6433          */
6434         if (!(pp->p_toxic & PR_CAPTURE)) {
6435 #ifdef DEBUG
6436                 /* Check for duplicate entries */
6437                 for (l = 0; l < 2; l++) {
6438                         tp1 = page_capture_hash[index].lists[l].next;
6439                         while (tp1 != &page_capture_hash[index].lists[l]) {
6440                                 if (tp1->pp == pp) {
6441                                         panic("page pp 0x%p already on hash "
6442                                             "at 0x%p\n",
6443                                             (void *)pp, (void *)tp1);
6444                                 }
6445                                 tp1 = tp1->next;
6446                         }
6447                 }
6448 
6449 #endif
6450                 page_settoxic(pp, PR_CAPTURE);
6451                 pri = PAGE_CAPTURE_PRIO(pp);
6452                 bp1->pri = pri;
6453                 bp1->next = page_capture_hash[index].lists[0].next;
6454                 bp1->prev = &page_capture_hash[index].lists[0];
6455                 bp1->next->prev = bp1;
6456                 page_capture_hash[index].lists[0].next = bp1;
6457                 page_capture_hash[index].num_pages[pri]++;
6458                 if (flags & CAPTURE_RETIRE) {
6459                         page_retire_incr_pend_count(datap);
6460                 }
6461                 mutex_exit(&page_capture_hash[index].pchh_mutex);
6462                 rw_exit(&pc_cb[cb_index].cb_rwlock);
6463                 cv_signal(&pc_cv);
6464                 return;
6465         }
6466 
6467         /*
6468          * A page retire request will replace any other request.
6469          * A second physmem request which is for a different process than
6470          * the currently registered one will be dropped as there is
6471          * no way to hold the private data for both calls.
6472          * In the future, once there are more callers, this will have to
6473          * be worked out better as there needs to be private storage for
6474          * at least each type of caller (maybe have datap be an array of
6475          * *void's so that we can index based upon callers index).
6476          */
6477 
6478         /* walk hash list to update expire time */
6479         for (i = 0; i < 2; i++) {
6480                 bp2 = page_capture_hash[index].lists[i].next;
6481                 while (bp2 != &page_capture_hash[index].lists[i]) {
6482                         if (bp2->pp == pp) {
6483                                 if (flags & CAPTURE_RETIRE) {
6484                                         if (!(bp2->flags & CAPTURE_RETIRE)) {
6485                                                 page_retire_incr_pend_count(
6486                                                     datap);
6487                                                 bp2->flags = flags;
6488                                                 bp2->expires = bp1->expires;
6489                                                 bp2->datap = datap;
6490                                         }
6491                                 } else {
6492                                         ASSERT(flags & CAPTURE_PHYSMEM);
6493                                         if (!(bp2->flags & CAPTURE_RETIRE) &&
6494                                             (datap == bp2->datap)) {
6495                                                 bp2->expires = bp1->expires;
6496                                         }
6497                                 }
6498                                 mutex_exit(&page_capture_hash[index].
6499                                     pchh_mutex);
6500                                 rw_exit(&pc_cb[cb_index].cb_rwlock);
6501                                 kmem_free(bp1, sizeof (*bp1));
6502                                 return;
6503                         }
6504                         bp2 = bp2->next;
6505                 }
6506         }
6507 
6508         /*
6509          * the PR_CAPTURE flag is protected by the page_capture_hash mutexes
6510          * and thus it either has to be set or not set and can't change
6511          * while holding the mutex above.
6512          */
6513         panic("page_capture_add_hash, PR_CAPTURE flag set on pp %p\n",
6514             (void *)pp);
6515 }
6516 
6517 /*
 * We have a page in our hands, so let's try to make it ours by turning
 * it into a clean page, as if it had just come off the freelists.
 *
 * Returns 0 on success, with the page still EXCL locked.
 * On failure, the page will be unlocked and EAGAIN will be returned.
6523  */
6524 static int
6525 page_capture_clean_page(page_t *pp)
6526 {
6527         page_t *newpp;
6528         int skip_unlock = 0;
6529         spgcnt_t count;
6530         page_t *tpp;
6531         int ret = 0;
6532         int extra;
6533 
6534         ASSERT(PAGE_EXCL(pp));
6535         ASSERT(!PP_RETIRED(pp));
6536         ASSERT(curthread->t_flag & T_CAPTURING);
6537 
6538         if (PP_ISFREE(pp)) {
6539                 if (!page_reclaim(pp, NULL)) {
6540                         skip_unlock = 1;
6541                         ret = EAGAIN;
6542                         goto cleanup;
6543                 }
6544                 ASSERT(pp->p_szc == 0);
6545                 if (pp->p_vnode != NULL) {
6546                         /*
6547                          * Since this page came from the
6548                          * cachelist, we must destroy the
6549                          * old vnode association.
6550                          */
6551                         page_hashout(pp, NULL);
6552                 }
6553                 goto cleanup;
6554         }
6555 
6556         /*
         * If we know page_relocate will fail, skip it.
6558          * It could still fail due to a UE on another page but we
6559          * can't do anything about that.
6560          */
6561         if (pp->p_toxic & PR_UE) {
6562                 goto skip_relocate;
6563         }
6564 
6565         /*
         * It's possible that pages do not have a vnode, as fsflush comes
         * through and cleans up these pages.  It's ugly but that's how it is.
6568          */
6569         if (pp->p_vnode == NULL) {
6570                 goto skip_relocate;
6571         }
6572 
6573         /*
         * Page was not free, so let's try to relocate it.
         * page_relocate only works with root pages, so if this is not a root
         * page, we need to demote it to try to relocate it.
6577          * Unfortunately this is the best we can do right now.
6578          */
6579         newpp = NULL;
6580         if ((pp->p_szc > 0) && (pp != PP_PAGEROOT(pp))) {
6581                 if (page_try_demote_pages(pp) == 0) {
6582                         ret = EAGAIN;
6583                         goto cleanup;
6584                 }
6585         }
6586         ret = page_relocate(&pp, &newpp, 1, 0, &count, NULL);
6587         if (ret == 0) {
6588                 page_t *npp;
6589                 /* unlock the new page(s) */
6590                 while (count-- > 0) {
6591                         ASSERT(newpp != NULL);
6592                         npp = newpp;
6593                         page_sub(&newpp, npp);
6594                         page_unlock(npp);
6595                 }
6596                 ASSERT(newpp == NULL);
6597                 /*
6598                  * Check to see if the page we have is too large.
                 * If so, demote it, freeing up the extra pages.
6600                  */
6601                 if (pp->p_szc > 0) {
6602                         /* For now demote extra pages to szc == 0 */
6603                         extra = page_get_pagecnt(pp->p_szc) - 1;
6604                         while (extra > 0) {
6605                                 tpp = pp->p_next;
6606                                 page_sub(&pp, tpp);
6607                                 tpp->p_szc = 0;
6608                                 page_free(tpp, 1);
6609                                 extra--;
6610                         }
6611                         /* Make sure to set our page to szc 0 as well */
6612                         ASSERT(pp->p_next == pp && pp->p_prev == pp);
6613                         pp->p_szc = 0;
6614                 }
6615                 goto cleanup;
6616         } else if (ret == EIO) {
6617                 ret = EAGAIN;
6618                 goto cleanup;
6619         } else {
6620                 /*
6621                  * Need to reset return type as we failed to relocate the page
6622                  * but that does not mean that some of the next steps will not
6623                  * work.
6624                  */
6625                 ret = 0;
6626         }
6627 
6628 skip_relocate:
6629 
6630         if (pp->p_szc > 0) {
6631                 if (page_try_demote_pages(pp) == 0) {
6632                         ret = EAGAIN;
6633                         goto cleanup;
6634                 }
6635         }
6636 
6637         ASSERT(pp->p_szc == 0);
6638 
6639         if (hat_ismod(pp)) {
6640                 ret = EAGAIN;
6641                 goto cleanup;
6642         }
6643         if (PP_ISKAS(pp)) {
6644                 ret = EAGAIN;
6645                 goto cleanup;
6646         }
6647         if (pp->p_lckcnt || pp->p_cowcnt) {
6648                 ret = EAGAIN;
6649                 goto cleanup;
6650         }
6651 
6652         (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
6653         ASSERT(!hat_page_is_mapped(pp));
6654 
6655         if (hat_ismod(pp)) {
6656                 /*
6657                  * This is a semi-odd case as the page is now modified but not
6658                  * mapped as we just unloaded the mappings above.
6659                  */
6660                 ret = EAGAIN;
6661                 goto cleanup;
6662         }
6663         if (pp->p_vnode != NULL) {
6664                 page_hashout(pp, NULL);
6665         }
6666 
6667         /*
6668          * At this point, the page should be in a clean state and
6669          * we can do whatever we want with it.
6670          */
6671 
6672 cleanup:
6673         if (ret != 0) {
6674                 if (!skip_unlock) {
6675                         page_unlock(pp);
6676                 }
6677         } else {
6678                 ASSERT(pp->p_szc == 0);
6679                 ASSERT(PAGE_EXCL(pp));
6680 
6681                 pp->p_next = pp;
6682                 pp->p_prev = pp;
6683         }
6684         return (ret);
6685 }
6686 
6687 /*
6688  * Various callers of page_trycapture() can have different restrictions upon
6689  * what memory they have access to.
6690  * Returns 0 on success, with the following error codes on failure:
6691  *      EPERM - The requested page is long term locked, and thus repeated
6692  *              requests to capture this page will likely fail.
6693  *      ENOMEM - There was not enough free memory in the system to safely
6694  *              map the requested page.
6695  *      ENOENT - The requested page was inside the kernel cage, and the
 *              CAPTURE_GET_CAGE flag was not set.
6697  */
6698 int
6699 page_capture_pre_checks(page_t *pp, uint_t flags)
6700 {
6701         ASSERT(pp != NULL);
6702 
6703 #if defined(__sparc)
6704         if (pp->p_vnode == &promvp) {
6705                 return (EPERM);
6706         }
6707 
6708         if (PP_ISNORELOC(pp) && !(flags & CAPTURE_GET_CAGE) &&
6709             (flags & CAPTURE_PHYSMEM)) {
6710                 return (ENOENT);
6711         }
6712 
6713         if (PP_ISNORELOCKERNEL(pp)) {
6714                 return (EPERM);
6715         }
6716 #else
6717         if (PP_ISKAS(pp)) {
6718                 return (EPERM);
6719         }
6720 #endif /* __sparc */
6721 
6722         /* only physmem currently has the restrictions checked below */
6723         if (!(flags & CAPTURE_PHYSMEM)) {
6724                 return (0);
6725         }
6726 
6727         if (availrmem < swapfs_minfree) {
6728                 /*
6729                  * We won't try to capture this page as we are
6730                  * running low on memory.
6731                  */
6732                 return (ENOMEM);
6733         }
6734         return (0);
6735 }
6736 
6737 /*
 * Once we have a page in our mitts, go ahead and complete the capture
 * operation.
 * Returns 1 on failure where the page is no longer needed.
 * Returns 0 on success.
 * Returns -1 if there was a transient failure.
6743  * Failure cases must release the SE_EXCL lock on pp (usually via page_free).
6744  */
6745 int
6746 page_capture_take_action(page_t *pp, uint_t flags, void *datap)
6747 {
6748         int cb_index;
6749         int ret = 0;
6750         page_capture_hash_bucket_t *bp1;
6751         page_capture_hash_bucket_t *bp2;
6752         int index;
6753         int found = 0;
6754         int i;
6755 
6756         ASSERT(PAGE_EXCL(pp));
6757         ASSERT(curthread->t_flag & T_CAPTURING);
6758 
6759         for (cb_index = 0; cb_index < PC_NUM_CALLBACKS; cb_index++) {
6760                 if ((flags >> cb_index) & 1) {
6761                         break;
6762                 }
6763         }
6764         ASSERT(cb_index < PC_NUM_CALLBACKS);
6765 
6766         /*
6767          * Remove the entry from the page_capture hash, but don't free it yet
6768          * as we may need to put it back.
6769          * Since we own the page at this point in time, we should find it
         * in the hash if this is an ASYNC call.  If we don't, it's likely
6771          * that the page_capture_async() thread decided that this request
6772          * had expired, in which case we just continue on.
6773          */
6774         if (flags & CAPTURE_ASYNC) {
6775 
6776                 index = PAGE_CAPTURE_HASH(pp);
6777 
6778                 mutex_enter(&page_capture_hash[index].pchh_mutex);
6779                 for (i = 0; i < 2 && !found; i++) {
6780                         bp1 = page_capture_hash[index].lists[i].next;
6781                         while (bp1 != &page_capture_hash[index].lists[i]) {
6782                                 if (bp1->pp == pp) {
6783                                         bp1->next->prev = bp1->prev;
6784                                         bp1->prev->next = bp1->next;
6785                                         page_capture_hash[index].
6786                                             num_pages[bp1->pri]--;
6787                                         page_clrtoxic(pp, PR_CAPTURE);
6788                                         found = 1;
6789                                         break;
6790                                 }
6791                                 bp1 = bp1->next;
6792                         }
6793                 }
6794                 mutex_exit(&page_capture_hash[index].pchh_mutex);
6795         }
6796 
6797         /* Synchronize with the unregister func. */
6798         rw_enter(&pc_cb[cb_index].cb_rwlock, RW_READER);
6799         if (!pc_cb[cb_index].cb_active) {
6800                 page_free(pp, 1);
6801                 rw_exit(&pc_cb[cb_index].cb_rwlock);
6802                 if (found) {
6803                         kmem_free(bp1, sizeof (*bp1));
6804                 }
6805                 return (1);
6806         }
6807 
6808         /*
6809          * We need to remove the entry from the page capture hash and turn off
6810          * the PR_CAPTURE bit before calling the callback.  We'll need to cache
6811          * the entry here, and then based upon the return value, cleanup
6812          * appropriately or re-add it to the hash, making sure that someone else
6813          * hasn't already done so.
6814          * It should be rare for the callback to fail and thus it's ok for
6815          * the failure path to be a bit complicated as the success path is
6816          * cleaner and the locking rules are easier to follow.
6817          */
6818 
6819         ret = pc_cb[cb_index].cb_func(pp, datap, flags);
6820 
6821         rw_exit(&pc_cb[cb_index].cb_rwlock);
6822 
6823         /*
6824          * If this was an ASYNC request, we need to cleanup the hash if the
6825          * callback was successful or if the request was no longer valid.
6826          * For non-ASYNC requests, we return failure to map and the caller
6827          * will take care of adding the request to the hash.
6828          * Note also that the callback itself is responsible for the page
6829          * at this point in time in terms of locking ...  The most common
6830          * case for the failure path should just be a page_free.
6831          */
6832         if (ret >= 0) {
6833                 if (found) {
6834                         if (bp1->flags & CAPTURE_RETIRE) {
6835                                 page_retire_decr_pend_count(datap);
6836                         }
6837                         kmem_free(bp1, sizeof (*bp1));
6838                 }
6839                 return (ret);
6840         }
6841         if (!found) {
6842                 return (ret);
6843         }
6844 
6845         ASSERT(flags & CAPTURE_ASYNC);
6846 
6847         /*
6848          * Check for expiration time first as we can just free it up if it's
6849          * expired.
6850          */
6851         if (ddi_get_lbolt() > bp1->expires && bp1->expires != -1) {
6852                 kmem_free(bp1, sizeof (*bp1));
6853                 return (ret);
6854         }
6855 
6856         /*
6857          * The callback failed and there used to be an entry in the hash for
6858          * this page, so we need to add it back to the hash.
6859          */
6860         mutex_enter(&page_capture_hash[index].pchh_mutex);
6861         if (!(pp->p_toxic & PR_CAPTURE)) {
6862                 /* just add bp1 back to head of walked list */
6863                 page_settoxic(pp, PR_CAPTURE);
6864                 bp1->next = page_capture_hash[index].lists[1].next;
6865                 bp1->prev = &page_capture_hash[index].lists[1];
6866                 bp1->next->prev = bp1;
6867                 bp1->pri = PAGE_CAPTURE_PRIO(pp);
6868                 page_capture_hash[index].lists[1].next = bp1;
6869                 page_capture_hash[index].num_pages[bp1->pri]++;
6870                 mutex_exit(&page_capture_hash[index].pchh_mutex);
6871                 return (ret);
6872         }
6873 
6874         /*
         * Otherwise there was a new capture request added to the list.
         * We need to make sure that our original data is represented if
         * appropriate.
6878          */
6879         for (i = 0; i < 2; i++) {
6880                 bp2 = page_capture_hash[index].lists[i].next;
6881                 while (bp2 != &page_capture_hash[index].lists[i]) {
6882                         if (bp2->pp == pp) {
6883                                 if (bp1->flags & CAPTURE_RETIRE) {
6884                                         if (!(bp2->flags & CAPTURE_RETIRE)) {
6885                                                 bp2->szc = bp1->szc;
6886                                                 bp2->flags = bp1->flags;
6887                                                 bp2->expires = bp1->expires;
6888                                                 bp2->datap = bp1->datap;
6889                                         }
6890                                 } else {
6891                                         ASSERT(bp1->flags & CAPTURE_PHYSMEM);
6892                                         if (!(bp2->flags & CAPTURE_RETIRE)) {
6893                                                 bp2->szc = bp1->szc;
6894                                                 bp2->flags = bp1->flags;
6895                                                 bp2->expires = bp1->expires;
6896                                                 bp2->datap = bp1->datap;
6897                                         }
6898                                 }
6899                                 page_capture_hash[index].num_pages[bp2->pri]--;
6900                                 bp2->pri = PAGE_CAPTURE_PRIO(pp);
6901                                 page_capture_hash[index].num_pages[bp2->pri]++;
6902                                 mutex_exit(&page_capture_hash[index].
6903                                     pchh_mutex);
6904                                 kmem_free(bp1, sizeof (*bp1));
6905                                 return (ret);
6906                         }
6907                         bp2 = bp2->next;
6908                 }
6909         }
6910         panic("PR_CAPTURE set but not on hash for pp 0x%p\n", (void *)pp);
6911         /*NOTREACHED*/
6912 }
6913 
6914 /*
6915  * Try to capture the given page for the caller specified in the flags
6916  * parameter.  The page will either be captured and handed over to the
6917  * appropriate callback, or will be queued up in the page capture hash
6918  * to be captured asynchronously.
6919  * If the current request is due to an async capture, the page must be
6920  * exclusively locked before calling this function.
6921  * Currently szc must be 0 but in the future this should be expandable to
6922  * other page sizes.
6923  * Returns 0 on success, with the following error codes on failure:
6924  *      EPERM - The requested page is long term locked, and thus repeated
6925  *              requests to capture this page will likely fail.
6926  *      ENOMEM - There was not enough free memory in the system to safely
6927  *              map the requested page.
6928  *      ENOENT - The requested page was inside the kernel cage, and the
6929  *              CAPTURE_GET_CAGE flag was not set.
 *      EAGAIN - The requested page could not be captured at this point in
6931  *              time but future requests will likely work.
6932  *      EBUSY - The requested page is retired and the CAPTURE_GET_RETIRED flag
6933  *              was not set.
6934  */
6935 int
6936 page_itrycapture(page_t *pp, uint_t szc, uint_t flags, void *datap)
6937 {
6938         int ret;
6939         int cb_index;
6940 
6941         if (flags & CAPTURE_ASYNC) {
6942                 ASSERT(PAGE_EXCL(pp));
6943                 goto async;
6944         }
6945 
6946         /* Make sure there's enough availrmem ... */
6947         ret = page_capture_pre_checks(pp, flags);
6948         if (ret != 0) {
6949                 return (ret);
6950         }
6951 
6952         if (!page_trylock(pp, SE_EXCL)) {
6953                 for (cb_index = 0; cb_index < PC_NUM_CALLBACKS; cb_index++) {
6954                         if ((flags >> cb_index) & 1) {
6955                                 break;
6956                         }
6957                 }
6958                 ASSERT(cb_index < PC_NUM_CALLBACKS);
6959                 ret = EAGAIN;
6960                 /* Special case for retired pages */
6961                 if (PP_RETIRED(pp)) {
6962                         if (flags & CAPTURE_GET_RETIRED) {
6963                                 if (!page_unretire_pp(pp, PR_UNR_TEMP)) {
6964                                         /*
6965                                          * Need to set capture bit and add to
6966                                          * hash so that the page will be
6967                                          * retired when freed.
6968                                          */
6969                                         page_capture_add_hash(pp, szc,
6970                                             CAPTURE_RETIRE, NULL);
6971                                         ret = 0;
6972                                         goto own_page;
6973                                 }
6974                         } else {
6975                                 return (EBUSY);
6976                         }
6977                 }
6978                 page_capture_add_hash(pp, szc, flags, datap);
6979                 return (ret);
6980         }
6981 
6982 async:
6983         ASSERT(PAGE_EXCL(pp));
6984 
6985         /* Need to check for physmem async requests that availrmem is sane */
6986         if ((flags & (CAPTURE_ASYNC | CAPTURE_PHYSMEM)) ==
6987             (CAPTURE_ASYNC | CAPTURE_PHYSMEM) &&
6988             (availrmem < swapfs_minfree)) {
6989                 page_unlock(pp);
6990                 return (ENOMEM);
6991         }
6992 
6993         ret = page_capture_clean_page(pp);
6994 
6995         if (ret != 0) {
6996                 /* We failed to get the page, so lets add it to the hash */
6997                 if (!(flags & CAPTURE_ASYNC)) {
6998                         page_capture_add_hash(pp, szc, flags, datap);
6999                 }
7000                 return (ret);
7001         }
7002 
7003 own_page:
7004         ASSERT(PAGE_EXCL(pp));
7005         ASSERT(pp->p_szc == 0);
7006 
7007         /* Call the callback */
7008         ret = page_capture_take_action(pp, flags, datap);
7009 
7010         if (ret == 0) {
7011                 return (0);
7012         }
7013 
7014         /*
7015          * Note that in the failure cases from page_capture_take_action, the
7016          * EXCL lock will have already been dropped.
7017          */
7018         if ((ret == -1) && (!(flags & CAPTURE_ASYNC))) {
7019                 page_capture_add_hash(pp, szc, flags, datap);
7020         }
7021         return (EAGAIN);
7022 }
7023 
7024 int
7025 page_trycapture(page_t *pp, uint_t szc, uint_t flags, void *datap)
7026 {
7027         int ret;
7028 
7029         curthread->t_flag |= T_CAPTURING;
7030         ret = page_itrycapture(pp, szc, flags, datap);
        curthread->t_flag &= ~T_CAPTURING; /* safe to clear; we know it's set */
7032         return (ret);
7033 }
7034 
7035 /*
7036  * When unlocking a page which has the PR_CAPTURE bit set, this routine
7037  * gets called to try and capture the page.
7038  */
7039 void
7040 page_unlock_capture(page_t *pp)
7041 {
7042         page_capture_hash_bucket_t *bp;
7043         int index;
7044         int i;
7045         uint_t szc;
7046         uint_t flags = 0;
7047         void *datap;
7048         kmutex_t *mp;
7049         extern vnode_t retired_pages;
7050 
7051         /*
         * We need to protect against a possible deadlock here where we own
         * the vnode page hash mutex and want to acquire it again, as there
         * are locations in the code where we unlock a page while holding
         * the mutex, which can lead to the page being captured and eventually
         * ending up here.  As we may be hashing out the old page and hashing
         * into the retire vnode, we need to make sure we don't own them.
         * Other callbacks that do hash operations also need to make sure
         * that, before they hash in to a vnode, they do not currently own
         * the vphm mutex; otherwise there will be a panic.
7061          */
7062         if (mutex_owned(page_vnode_mutex(&retired_pages))) {
7063                 page_unlock_nocapture(pp);
7064                 return;
7065         }
7066         if (pp->p_vnode != NULL && mutex_owned(page_vnode_mutex(pp->p_vnode))) {
7067                 page_unlock_nocapture(pp);
7068                 return;
7069         }
7070 
7071         index = PAGE_CAPTURE_HASH(pp);
7072 
7073         mp = &page_capture_hash[index].pchh_mutex;
7074         mutex_enter(mp);
7075         for (i = 0; i < 2; i++) {
7076                 bp = page_capture_hash[index].lists[i].next;
7077                 while (bp != &page_capture_hash[index].lists[i]) {
7078                         if (bp->pp == pp) {
7079                                 szc = bp->szc;
7080                                 flags = bp->flags | CAPTURE_ASYNC;
7081                                 datap = bp->datap;
7082                                 mutex_exit(mp);
7083                                 (void) page_trycapture(pp, szc, flags, datap);
7084                                 return;
7085                         }
7086                         bp = bp->next;
7087                 }
7088         }
7089 
7090         /* Failed to find page in hash so clear flags and unlock it. */
7091         page_clrtoxic(pp, PR_CAPTURE);
7092         page_unlock(pp);
7093 
7094         mutex_exit(mp);
7095 }
7096 
7097 void
7098 page_capture_init()
7099 {
7100         int i;
7101         for (i = 0; i < NUM_PAGE_CAPTURE_BUCKETS; i++) {
7102                 page_capture_hash[i].lists[0].next =
7103                     &page_capture_hash[i].lists[0];
7104                 page_capture_hash[i].lists[0].prev =
7105                     &page_capture_hash[i].lists[0];
7106                 page_capture_hash[i].lists[1].next =
7107                     &page_capture_hash[i].lists[1];
7108                 page_capture_hash[i].lists[1].prev =
7109                     &page_capture_hash[i].lists[1];
7110         }
7111 
7112         pc_thread_shortwait = 23 * hz;
7113         pc_thread_longwait = 1201 * hz;
7114         pc_thread_retry = 3;
7115         mutex_init(&pc_thread_mutex, NULL, MUTEX_DEFAULT, NULL);
7116         cv_init(&pc_cv, NULL, CV_DEFAULT, NULL);
7117         pc_thread_id = thread_create(NULL, 0, page_capture_thread, NULL, 0, &p0,
7118             TS_RUN, minclsyspri);
7119 }
7120 
7121 /*
7122  * It is necessary to scrub any failing pages prior to reboot in order to
7123  * prevent a latent error trap from occurring on the next boot.
7124  */
7125 void
7126 page_retire_mdboot()
7127 {
7128         page_t *pp;
7129         int i, j;
7130         page_capture_hash_bucket_t *bp;
7131         uchar_t pri;
7132 
7133         /* walk lists looking for pages to scrub */
7134         for (i = 0; i < NUM_PAGE_CAPTURE_BUCKETS; i++) {
7135                 for (pri = 0; pri < PC_NUM_PRI; pri++) {
7136                         if (page_capture_hash[i].num_pages[pri] != 0) {
7137                                 break;
7138                         }
7139                 }
7140                 if (pri == PC_NUM_PRI)
7141                         continue;
7142 
7143                 mutex_enter(&page_capture_hash[i].pchh_mutex);
7144 
7145                 for (j = 0; j < 2; j++) {
7146                         bp = page_capture_hash[i].lists[j].next;
7147                         while (bp != &page_capture_hash[i].lists[j]) {
7148                                 pp = bp->pp;
7149                                 if (PP_TOXIC(pp)) {
7150                                         if (page_trylock(pp, SE_EXCL)) {
7151                                                 PP_CLRFREE(pp);
7152                                                 pagescrub(pp, 0, PAGESIZE);
7153                                                 page_unlock(pp);
7154                                         }
7155                                 }
7156                                 bp = bp->next;
7157                         }
7158                 }
7159                 mutex_exit(&page_capture_hash[i].pchh_mutex);
7160         }
7161 }
7162 
7163 /*
7164  * Walk the page_capture_hash trying to capture pages and also cleanup old
7165  * entries which have expired.
7166  */
7167 void
7168 page_capture_async()
7169 {
7170         page_t *pp;
7171         int i;
7172         int ret;
7173         page_capture_hash_bucket_t *bp1, *bp2;
7174         uint_t szc;
7175         uint_t flags;
7176         void *datap;
7177         uchar_t pri;
7178 
7179         /* If there are outstanding pages to be captured, get to work */
7180         for (i = 0; i < NUM_PAGE_CAPTURE_BUCKETS; i++) {
7181                 for (pri = 0; pri < PC_NUM_PRI; pri++) {
7182                         if (page_capture_hash[i].num_pages[pri] != 0)
7183                                 break;
7184                 }
7185                 if (pri == PC_NUM_PRI)
7186                         continue;
7187 
7188                 /* Append list 1 to list 0 and then walk through list 0 */
7189                 mutex_enter(&page_capture_hash[i].pchh_mutex);
7190                 bp1 = &page_capture_hash[i].lists[1];
7191                 bp2 = bp1->next;
7192                 if (bp1 != bp2) {
7193                         bp1->prev->next = page_capture_hash[i].lists[0].next;
7194                         bp2->prev = &page_capture_hash[i].lists[0];
7195                         page_capture_hash[i].lists[0].next->prev = bp1->prev;
7196                         page_capture_hash[i].lists[0].next = bp2;
7197                         bp1->next = bp1;
7198                         bp1->prev = bp1;
7199                 }
7200 
7201                 /* list[1] will be empty now */
7202 
7203                 bp1 = page_capture_hash[i].lists[0].next;
7204                 while (bp1 != &page_capture_hash[i].lists[0]) {
7205                         /* Check expiration time */
7206                         if ((ddi_get_lbolt() > bp1->expires &&
7207                             bp1->expires != -1) ||
7208                             page_deleted(bp1->pp)) {
7209                                 page_capture_hash[i].lists[0].next = bp1->next;
7210                                 bp1->next->prev =
7211                                     &page_capture_hash[i].lists[0];
7212                                 page_capture_hash[i].num_pages[bp1->pri]--;
7213 
7214                                 /*
7215                                  * We can safely remove the PR_CAPTURE bit
7216                                  * without holding the EXCL lock on the page
                                 * as the PR_CAPTURE bit requires that the
7218                                  * page_capture_hash[].pchh_mutex be held
7219                                  * to modify it.
7220                                  */
7221                                 page_clrtoxic(bp1->pp, PR_CAPTURE);
7222                                 mutex_exit(&page_capture_hash[i].pchh_mutex);
7223                                 kmem_free(bp1, sizeof (*bp1));
7224                                 mutex_enter(&page_capture_hash[i].pchh_mutex);
7225                                 bp1 = page_capture_hash[i].lists[0].next;
7226                                 continue;
7227                         }
7228                         pp = bp1->pp;
7229                         szc = bp1->szc;
7230                         flags = bp1->flags;
7231                         datap = bp1->datap;
7232                         mutex_exit(&page_capture_hash[i].pchh_mutex);
7233                         if (page_trylock(pp, SE_EXCL)) {
7234                                 ret = page_trycapture(pp, szc,
7235                                     flags | CAPTURE_ASYNC, datap);
7236                         } else {
7237                                 ret = 1;        /* move to walked hash */
7238                         }
7239 
7240                         if (ret != 0) {
7241                                 /* Move to walked hash */
7242                                 (void) page_capture_move_to_walked(pp);
7243                         }
7244                         mutex_enter(&page_capture_hash[i].pchh_mutex);
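                        /*
                         * The bucket may have changed while pchh_mutex
                         * was dropped, so restart from the head of
                         * list[0].  The entry just processed should no
                         * longer be on list[0] (it is freed on a
                         * successful capture or moved to the walked
                         * list otherwise), so the walk still makes
                         * forward progress.
                         */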
7245                         bp1 = page_capture_hash[i].lists[0].next;
7246                 }
7247 
7248                 mutex_exit(&page_capture_hash[i].pchh_mutex);
7249         }
7250 }
7251 
7252 /*
 * This function is called by the page_capture_thread, and is needed
 * in order to initiate aio cleanup, so that pages used in aio
7255  * will be unlocked and subsequently retired by page_capture_thread.
7256  */
7257 static int
7258 do_aio_cleanup(void)
7259 {
7260         proc_t *procp;
7261         int (*aio_cleanup_dr_delete_memory)(proc_t *);
7262         int cleaned = 0;
7263 
7264         if (modload("sys", "kaio") == -1) {
7265                 cmn_err(CE_WARN, "do_aio_cleanup: cannot load kaio");
7266                 return (0);
7267         }
7268         /*
7269          * We use the aio_cleanup_dr_delete_memory function to
7270          * initiate the actual clean up; this function will wake
7271          * up the per-process aio_cleanup_thread.
7272          */
7273         aio_cleanup_dr_delete_memory = (int (*)(proc_t *))
7274             modgetsymvalue("aio_cleanup_dr_delete_memory", 0);
7275         if (aio_cleanup_dr_delete_memory == NULL) {
7276                 cmn_err(CE_WARN,
7277             "aio_cleanup_dr_delete_memory not found in kaio");
7278                 return (0);
7279         }
7280         mutex_enter(&pidlock);
7281         for (procp = practive; (procp != NULL); procp = procp->p_next) {
7282                 mutex_enter(&procp->p_lock);
7283                 if (procp->p_aio != NULL) {
7284                         /* cleanup proc's outstanding kaio */
7285                         cleaned += (*aio_cleanup_dr_delete_memory)(procp);
7286                 }
7287                 mutex_exit(&procp->p_lock);
7288         }
7289         mutex_exit(&pidlock);
7290         return (cleaned);
7291 }
7292 
7293 /*
7294  * helper function for page_capture_thread
7295  */
7296 static void
7297 page_capture_handle_outstanding(void)
7298 {
7299         int ntry;
7300 
        /* Reap pages before attempting to capture them */
7302         kmem_reap();
7303 
7304         if ((page_retire_pend_count() > page_retire_pend_kas_count()) &&
7305             hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) {
7306                 /*
                 * Note: Purging is only done for platforms that support
                 * ISM hat_pageunload() - mainly SPARC. On x86/x64
                 * platforms, ISM pages are SE_SHARED locked until destroyed.
7310                  */
7311 
7312                 /* disable and purge seg_pcache */
7313                 (void) seg_p_disable();
7314                 for (ntry = 0; ntry < pc_thread_retry; ntry++) {
7315                         if (!page_retire_pend_count())
7316                                 break;
7317                         if (do_aio_cleanup()) {
7318                                 /*
                                 * allow the processes' aio cleanup
                                 * threads to run
7321                                  */
7322                                 delay(pc_thread_shortwait);
7323                         }
7324                         page_capture_async();
7325                 }
7326                 /* reenable seg_pcache */
7327                 seg_p_enable();
7328 
7329                 /* completed what can be done.  break out */
7330                 return;
7331         }
7332 
7333         /*
7334          * For kernel pages and/or unsupported HAT_DYNAMIC_ISM_UNMAP, reap
7335          * and then attempt to capture.
7336          */
7337         seg_preap();
7338         page_capture_async();
7339 }
7340 
7341 /*
7342  * The page_capture_thread loops forever, looking to see if there are
7343  * pages still waiting to be captured.
7344  */
7345 static void
7346 page_capture_thread(void)
7347 {
7348         callb_cpr_t c;
7349         int i;
7350         int high_pri_pages;
7351         int low_pri_pages;
7352         clock_t timeout;
7353 
7354         CALLB_CPR_INIT(&c, &pc_thread_mutex, callb_generic_cpr, "page_capture");
7355 
7356         mutex_enter(&pc_thread_mutex);
7357         for (;;) {
7358                 high_pri_pages = 0;
7359                 low_pri_pages = 0;
7360                 for (i = 0; i < NUM_PAGE_CAPTURE_BUCKETS; i++) {
7361                         high_pri_pages +=
7362                             page_capture_hash[i].num_pages[PC_PRI_HI];
7363                         low_pri_pages +=
7364                             page_capture_hash[i].num_pages[PC_PRI_LO];
7365                 }
7366 
7367                 timeout = pc_thread_longwait;
7368                 if (high_pri_pages != 0) {
7369                         timeout = pc_thread_shortwait;
7370                         page_capture_handle_outstanding();
7371                 } else if (low_pri_pages != 0) {
7372                         page_capture_async();
7373                 }
7374                 CALLB_CPR_SAFE_BEGIN(&c);
7375                 (void) cv_reltimedwait(&pc_cv, &pc_thread_mutex,
7376                     timeout, TR_CLOCK_TICK);
7377                 CALLB_CPR_SAFE_END(&c, &pc_thread_mutex);
7378         }
7379         /*NOTREACHED*/
7380 }
7381 /*
7382  * Attempt to locate a bucket that has enough pages to satisfy the request.
7383  * The initial check is done without the lock to avoid unneeded contention.
 * The function returns 1 if enough pages were found, or 0 if no single
 * bucket contained enough pages.
7386  */
7387 static int
7388 pcf_decrement_bucket(pgcnt_t npages)
7389 {
7390         struct pcf      *p;
7391         struct pcf      *q;
7392         int i;
7393 
7394         p = &pcf[PCF_INDEX()];
7395         q = &pcf[pcf_fanout];
7396         for (i = 0; i < pcf_fanout; i++) {
7397                 if (p->pcf_count > npages) {
7398                         /*
7399                          * a good one to try.
7400                          */
7401                         mutex_enter(&p->pcf_lock);
7402                         if (p->pcf_count > npages) {
7403                                 p->pcf_count -= (uint_t)npages;
7404                                 /*
7405                                  * freemem is not protected by any lock.
7406                                  * Thus, we cannot have any assertion
7407                                  * containing freemem here.
7408                                  */
7409                                 freemem -= npages;
7410                                 mutex_exit(&p->pcf_lock);
7411                                 return (1);
7412                         }
7413                         mutex_exit(&p->pcf_lock);
7414                 }
7415                 p++;
7416                 if (p >= q) {
7417                         p = pcf;
7418                 }
7419         }
7420         return (0);
7421 }
7422 
7423 /*
7424  * Arguments:
 *      pcftotal_ret:   If not NULL and we have walked all the buckets
 *                      without finding enough pages, this is set to the
 *                      total number of pages in all the pcf buckets.
 *      npages:         The number of pages we have been requested to
 *                      find.
 *      unlock:         If set to 0, the buckets are left locked when the
 *                      requested number of pages is not found.
 *
 * Go and try to satisfy the page request from any number of buckets.
7435  * This can be a very expensive operation as we have to lock the buckets
7436  * we are checking (and keep them locked), starting at bucket 0.
7437  *
7438  * The function returns 1 if enough pages were found, else 0 if it could not
7439  * find enough pages in the buckets.
7440  *
7441  */
7442 static int
7443 pcf_decrement_multiple(pgcnt_t *pcftotal_ret, pgcnt_t npages, int unlock)
7444 {
7445         struct pcf      *p;
7446         pgcnt_t pcftotal;
7447         int i;
7448 
7449         p = pcf;
7450         /* try to collect pages from several pcf bins */
7451         for (pcftotal = 0, i = 0; i < pcf_fanout; i++) {
7452                 mutex_enter(&p->pcf_lock);
7453                 pcftotal += p->pcf_count;
7454                 if (pcftotal >= npages) {
7455                         /*
                         * Wow!  There are enough pages lying around
7457                          * to satisfy the request.  Do the accounting,
7458                          * drop the locks we acquired, and go back.
7459                          *
7460                          * freemem is not protected by any lock. So,
7461                          * we cannot have any assertion containing
7462                          * freemem.
7463                          */
7464                         freemem -= npages;
7465                         while (p >= pcf) {
7466                                 if (p->pcf_count <= npages) {
7467                                         npages -= p->pcf_count;
7468                                         p->pcf_count = 0;
7469                                 } else {
7470                                         p->pcf_count -= (uint_t)npages;
7471                                         npages = 0;
7472                                 }
7473                                 mutex_exit(&p->pcf_lock);
7474                                 p--;
7475                         }
7476                         ASSERT(npages == 0);
7477                         return (1);
7478                 }
7479                 p++;
7480         }
7481         if (unlock) {
7482                 /* failed to collect pages - release the locks */
7483                 while (--p >= pcf) {
7484                         mutex_exit(&p->pcf_lock);
7485                 }
7486         }
7487         if (pcftotal_ret != NULL)
7488                 *pcftotal_ret = pcftotal;
7489         return (0);
7490 }